diff --git a/qcd/part_cpu/LICENCE b/qcd/part_cpu/LICENCE new file mode 100644 index 0000000000000000000000000000000000000000..3eff5c81f79472018c6e4406ee1510fbada7c20e --- /dev/null +++ b/qcd/part_cpu/LICENCE @@ -0,0 +1,42 @@ +Copyright and Disclaimer +Copyright (C) 2008, Forschungszentrum Juelich GmbH, Federal Republic of Germany. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Any publications that result from the use of this software shall + reasonably refer to the Research Centre's development. + + * All advertising materials mentioning features or use of this + software must display the following acknowledgement: + + This product includes software developed by Forschungszentrum + Juelich GmbH, Federal Republic of Germany. + + * Forschungszentrum Juelich GmbH is not obligated to provide the + user with any support, consulting, training or assistance of any + kind with regard to the use, operation and performance of this + software or to provide the user with any updates, revisions or + new versions. + +THIS SOFTWARE IS PROVIDED BY FORSCHUNGSZENTRUM JUELICH GMBH "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL FORSCHUNGSZENTRUM JUELICH +GMBH BE LIABLE FOR ANY SPECIAL, DIRECT OR CONSEQUENTIAL DAMAGES OR ANY +DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, +WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, +ARISING OUT OF OR IN CONNECTION WITH THE ACCESS, USE OR PERFORMANCE OF +THIS SOFTWARE. diff --git a/qcd/part_cpu/applications/QCD/QUICK_GUIDE_UEABS_QCD_BENCHMARKSUITE b/qcd/part_cpu/applications/QCD/QUICK_GUIDE_UEABS_QCD_BENCHMARKSUITE new file mode 100644 index 0000000000000000000000000000000000000000..138b332d8f10d1b0b554247e1f155edc79a8f609 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/QUICK_GUIDE_UEABS_QCD_BENCHMARKSUITE @@ -0,0 +1,108 @@ +################# +################# UEABS - QCD - BENCHMARKSUITE -- QUICK-USERGUIDE +################# + +This is a very short summary of the general steps which have +to be performed to run the UEABS QCD Benchmarksuite on a new +machine. More information can be found in the documentation of +the UEABS-QCD BENCHMARKSUITE which is located in the folder +./PABS/doc/* +or under the web-link + +http://www.prace-ri.eu/UEABS/QCD/QCD_Build_README.txt +http://www.prace-ri.eu/UEABS/QCD/QCD_Run_README.txt + +The suite works with Jube, which will handle the compilation, +the submission and the analysis of the Benchmarksuite. On a new +machine several xml-files have to be added or created. +This guide will give a short and very quick overview of +the different steps. + +The FIRST STEP on a new machine is to add information about the +system to the platform-folder located in: +./PABS/platform +Here, the new platform has to be added to the xml-file "platform.xml" +similar to the already existing xml-templates: + +.. 
+ + + + +The SECOND STEP is to provide a dummy-submit script which has to be +added to a new subdirectory given by: + +./PABS/platform/"NEW-PLATFORM" + +In the THIRD STEP: Go to the home-directory of the UEABS-QCD-Benchmarksuite +located in: +./PABS/applications/QCD/ +Note that the source-files of the kernels are located in "./PABS/applications/QCD/src". +Here, similar to STEP ONE the xml-files: + +compile.xml, execute.xml and analyse.xml + +have to be edited, i.e. new xml-templates with the new platform-information +have to be added. + +In the FOURTH STEP the runs will be set up by creating run-scripts similar to +"prace-functional-NEW-PLATFORM.xml" for a functional test +and +"prace-scaling-NEW-PLATFORM.xml" for a scaling run. +Here, several limits of the different codes have to be taken into account; for +this, see the section "Limitation" at the end of this quick-userguide. + +In the FIFTH STEP the benchmark can be compiled and run by using the command: + +perl ../../bench/jube prace-functional-"NEW-PLATFORM".xml + +in the directory: +"./PABS/applications/QCD/". +This will generate a folder "tmp" with subfolders in "./PABS/applications/QCD/" +where the source-files will be compiled and executed. If the compilation or the submission +fails, more information can be found in the subdirectories of "tmp". In any case, +after the generation of the folder "tmp", compilation and submission can be done, +in principle, without Jube. + +In the LAST STEP, the scaling results can be analyzed, by using +perl ../../bench/jube analyse.xml + +LIMITATION: + +The different kernels consist of lattice QCD production codes and have several limitations +in parallelization and lattice volume. Kernels A, C, D and E use a four dimensional +lattice while in case of kernel B a three dimensional lattice is used. All kernels +can be parallelized in all directions. 
The different lattice sizes and parallelization +have to be declared in the scripts: 'prace-functional-"NEW-PLATFORM".xml' or +'prace-scaling-NEW-PLATFORM.xml'. The limitations for the different kernels are given by: + +"pt * px * py * pz = task" + +and additionally for the kernels A, D and E + +" nt / pt modulo 2 = 0 " and " nt >= 4 " + +and the same condition for the other pairs +"{nx,px}, {ny,py}, {nz,pz}". Moreover +the lattice extents nt, nx, ny and nz have to be even and larger +than 4. + +####### +####### Please see for further information the Readme-files +####### which are provided under +####### +####### http://www.prace-ri.eu/UEABS/QCD/QCD_Build_README.txt +####### http://www.prace-ri.eu/UEABS/QCD/QCD_Run_README.txt +####### or in +####### ./PABS/doc/* +####### +####### Jacob Finkenrath, 2017 +####### \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/README b/qcd/part_cpu/applications/QCD/README new file mode 100644 index 0000000000000000000000000000000000000000..e5ca95368bcab8545334c73216e2b6619d114301 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/README @@ -0,0 +1,1007 @@ +The lattice quantum chromodynamics (LQCD) benchmark is a compilation +of up to five (three at the moment) LQCD kernels. 
The kernels are: + +label : kernel_A +short label : KA +kernel origin : Berlin Quantum ChromoDynamics program (BQCD), + DEISA benchmark suite +kernel contact person : Hinnerk Stueben +kernel code status : 2008/08/25 +problem size parameter : KA_N{X,Y,Z,T}, 4D lattice +problem run time parameter : KA_MAXITER, iteration steps +other needed parameter : KA_P{X,Y,Z,T}, distribution of processes in 4D + KA_LIBCOMM, see readme section + KA_LIBCLOVER, see readme section + KA_LIBD, see readme section +notes : + + +label : kernel_B +short label : KB +kernel origin : University of Oulu, Finland + DEISA benchmark suite +kernel contact person : Kari Rummukainen +kernel code status : 2008/08/22 +problem size parameter : KB_NX, x component of the 3D grid + KB_NY, y component of the 3D grid + KB_NZ, z component of the 3D grid +problem run time parameter : KB_MAXITER, iteration steps +other needed parameter : +notes : number of processes needs to be a power of 2 + + +label : kernel_C +short label : KC +kernel origin : privat communication + +kernel contact person : Jacob Finkenrath +kernel code status : 2016/08/24 +problem size parameter : KC_L{X,Y,Z,T}, local size of the 4D grid in {x,y,z,t}-direction +problem run time parameter : +other needed parameter : KC_P{X,Y,Z,T}, number of processes in {x,y,z,t}-direction +notes : + + +label : kernel_D +short label : KD +kernel origin : privat communication + +kernel contact person : Jacob Finkenrath +kernel code status : 2016/08/24 +problem size parameter : K_L{X,Y,Z,T}, size of the 4D grid in {x,y,z,t}-direction +problem run time parameter : +other needed parameter : K_N{X,Y,Z,T}, number of processes in {x,y,z,t}-direction +notes : + + +label : kernel_E +short label : KE +kernel origin : privat communication + +kernel contact person : Stefan Krieg +kernel code status : 2008/11/10 +problem size parameter : K_L{X,Y,Z,T}, size of the 4D grid in {x,y,z,t}-direction +problem run time parameter : +other needed parameter : K_N{X,Y,Z,T}, number 
of processes in {x,y,z,t}-direction +notes : + + +###################################################################### +kernel_A README + +----------- +BQCD readme +----------- + + + +Note: all base information taken from the +BQCD document; updated with JuBE and new ported platforms + +Subdirectories in src: +~~~~~~~~~~~~~~~~~~~~ + +clover routines for the clover improvement + +comm communication routines + +d multiplication of a vector with "D-slash" + +modules (some) Fortran90 modules + +platform Makefiles and service routines for various platforms + + +General remarks +~~~~~~~~~~~~~~ + +BQCD has been ported to various platforms (see platform/Makefile-*.var): + +# Makefile-altix.var - settings on SGI-Altix 3700 and SGI-Altix 4700 +# Makefile-bgl.var - settings on BlueGene/L +# Makefile-cray.var - settings on Cray T3E and Cray XT4 +# Makefile-hitachi-omp.var - settings on Hitachi SR8000 +# Makefile-hitachi.var - settings on Hitachi SR8000 (pure MPI version) +# Makefile-hp.var - settings for HP-UX Fortran Compiler +# Makefile-ibm.var - settings on IBM +# Makefile-intel.var - settings for Intel Fortran Compiler +# Makefile-nec.var - settings on NEC SX-8 +# Makefile-sun.var - settings on Sun + +The corresponding files + + platform/service-*.F90 + +contain interfaces to service routines / system calls. + +Not all of these files have been used recently. There are kept as a +starting point. + +A "Makefile.var" and a "service.F90" have to be provided in the source +directory that work correctly with your system. 
+The contents of these files are explained in: + + platform/Makefile-EXPLAINED.var + platform/service-EXPLAINED.var + +"gmake prep-PLATFORM" will create symbolic links accordingly: + +berni1> gmake prep-ibm +gmake prep PLATFORM=ibm +rm -f Makefile.var service.F90 +ln -s platform/Makefile-ibm.var Makefile.var +ln -s platform/service-ibm.F90 service.F90 + + + +Resource requirements +~~~~~~~~~~~~~~~~~~~~ + +The resource requirements are approximately: + +benchmark lattice total memory size of output execution time +--------------------------------------------------------------------------- +MPP 48*48*48*96 497 GByte 4 GByte 268.2 s at 758.52 GFlop/s +SMP 24*24*24*48 37 GByte 0.25 GByte 44.4 s at 608.96 GFlop/s + + +Standard porting +~~~~~~~~~~~~~~~ + +*** make + +The Makefiles use the macro $(MAKE) and the "include" statement. Some +of Makefile-*.var are quite standard, some require GNU-make. + +"make fast" can be used for a parallel "make". + +"make fast" builds the binary "bqcd." + +Without "make fast" one has to enter: + + make Modules + make libs + make bqcd + +JuBE porting +~~~~~~~~~~~ +For Altix: + Change the following lines in the execution file bensh: + the first line: #!/usr/local/bin/perl -w + to + #!/usr/bin/perl -w + the line 1235: $cmd="cp -rp $srcdir/$file $dir/src/"; +to + $cmd="cp -rp $srcdir/* $dir/src/"; + + + + + + +*** ANSI C preprocessor + +The C preprocessor is needed for building the source. The C +preprocessor must be able to handle the string concatenation macro "##". + +Recent versions of the GNU C Preprocessor do not work because they +refuse to process the Fortran90 separator "%". + + +*** Service routines and "stderr" + +Service routines are needed for aborting, measuring CPU-time, to get +arguments from the command line, etc. The corresponding routines have +to be inserted in the file service.F90. + +It is assumed that Fortran unit 0 is pre-connected to stderr. If this +is not the case on your machine you should re-#define STDERR in "defs.h". 
+ +For the time measurements it is important to use a time function with +high resolution in the function "sekunden". + + + +*** Message passing / Communication library + +Originally the communication was programmed with the shmem library on +a Cray T3E. + +Now MPI is mainly used. There is also a single CPU version (that +needs no communication library) and a combination of shmem for the +most time consuming part and MPI. + +See $(LIBCOMM) in platform/Makefile-EXPLAINED.var and "Hints for +optimisation" below. + + +*** OpenMP + +In addition to setting your compiler's OpenMP option you have to add +"-D_OPENMP" in "Makefile.var": + + MYFLAGS = ... -D_OPENMP + + + +Verification +~~~~~~~~~~~ + +*** Random numbers + +Correctness of random numbers can be checked by: + + make the_ranf_test + +The test is done by comparison with reference output. On most +platforms there is no difference. However, on Intel "diff" +usually reports differences in the last digit of the floating point +representation of the random numbers; the integer representations +match exactly, eg: + +< 1 4711 0.5499394951912783 +--- +> 1 4711 0.5499394951912784 + + +*** Argments from the command line + +Try option -V: + +berni1> ./bqcd -V + This is bqcd benchmark2 + input format: 4 + conf info format: 3 + MAX_TEMPER: 50 + real kind: 8 + version of D: 2 + D3: buffer vol: 0 + communication: single_pe + OpenMP + + + +*** BQCD + +To check that the BQCD works correctly execute the following sequence +of commands: + +berni1> cd work +berni1> ../bqcd input.TEST > out.TEST +berni1> grep ' %[fim][atc]' out.TEST > out.tmp +berni1> grep ' %[fim][atc]' out.TEST.reference | diff - out.tmp +18c18 +< %fa -1 1 0.4319366404 1.0173348431 43 407 38 +--- +> %fa -1 1 0.4319366404 1.0173348433 43 407 38 + +The test can be run for any domain decomposition and any number of +threads. In any case result should agree. Floating point numbers +might differ in the last digit as shown above. 
+(In total 20 lines containing floating point numbers are compared.) + + +*** Check sums + +BQCD writes restart files in the working directory. The extension of +the file containing information on the run is ".info". It contains +check sums of the binary data files (the example was run after the +test run): + +berni1> tail -6 bqcd.000.1.info + >BeginCheckSum + bqcd.000.1.00.u 286125633 24576 + bqcd.000.1.01.u 804770858 24576 + bqcd.000.1.02.u 657813015 24576 + bqcd.000.1.03.u 3802083338 24576 + >EndCheckSum + +These check sums should be identical to check sums calculated by the +"cksum" command: + +berni1> cksum bqcd.000.1.*.u | awk '{print $3, $1, $2}' +bqcd.000.1.00.u 286125633 24576 +bqcd.000.1.01.u 804770858 24576 +bqcd.000.1.02.u 657813015 24576 +bqcd.000.1.03.u 3802083338 24576 + + + +Structure of the input +~~~~~~~~~~~~~~~~~~~~~ + +run 204 names of restart files will contain "run" + can be set to 0 + +lattice 24 24 24 48 lattice size, can e.g. be modified for + weak scaling analysis + +processes 1 2 2 4 number of MPI-proceses per direction + (1 1 1 1 in the pure OpenMP case) + +boundary_conditions_fermions 1 1 1 -1 do not change + +beta 5 do not change +kappa 0.13 do not change +csw 2.3327 do not change + +hmc_test 0 do not change +hmc_model C do not change +hmc_rho 0.1 do not change +hmc_trajectory_length 0.2 do not change +hmc_steps 10 can be lowered -> shorter execution time +hmc_accept_first 1 do not change +hmc_m_scale 3 do not change + +start_configuration cold do not change +start_random default do not change + +mc_steps 1 do not change +mc_total_steps 100 do not change + +solver_rest 1e-99 do not change +solver_maxiter 100 can be lowered -> shorter execution time +solver_ignore_no_convergence 2 do not change (CG will not converge, + the numbers of iterations per call + will be exactly "solver_maxiter") +solver_mre_vectors 7 do not change + + + + + +Hints on optimisation +~~~~~~~~~~~~~~~~~~~~ + +Before starting any optimisation one should find the 
fastest variant +in the existing code. There are two libraries to look at: $(LIBD) and +$(LIBCOMM). + + + +*** LIBCOMM ("communication", directory: comm) + +There are the following variants: + +lib_single_pe.a: Single CPU version (PE: "processing element"). + +lib_mpi.a: MPI version. + +lib_shmempi.a: shmem for nearest neighbour communication, MPI for the rest. + + +*** Caveat + +Not all combinations of LIBD and LIBCOMM have been implemented. + +The following combinations should work (lib_mpi.a always works): + +LIBD LIBCOMM +-------------------------------------------------- +libd.a lib_single_pe.a lib_mpi.a +libd2.a lib_single_pe.a lib_mpi.a lib_shmempi.a +libd3.a lib_mpi.a +libd21.a lib_single_pe.a lib_mpi.a lib_shmempi.a + + + +Rules for time measurements +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In "Makefile.var" "-DTIMING" must always be set: + + MYFLAGS = -DTIMING ... + +All time measurements (TIMING_START() ... TIMING_STOP()) must be kept. +There is one exception: If you restructure routines d() and d_dag() +it might occur that the current regions of time measurements (which +are per direction) do not make sense. (For example, this would occur +when combining loops from more than one direction.) + +In that case, please report in addition the best measurement obtained +with the existing code. + + + +###################################################################### +kernel_B README + +This is the README file for the SU3_AHiggs application benchmark, +distributed with the DEISA Benchmark Suite: +http://www.deisa.eu/science/benchmarking/ + +Last modified by the DEISA Benchmark Team on 2008-08-22. 
+ + + +----------------- +SU3_AHiggs readme +----------------- + + + +Contents +-------- + +1 General description +2 Code structure +3 Parallelisation +4 Building +5 Execution +6 Verification +7 Input data +8 Output data + + +1 General description +===================== + +SU3_AHiggs is a lattice quantum chromodynamics (QCD) code intended for +computing the conditions of the Early Universe. Instead of the "full QCD", the +code applies an effective field theory, which is valid at high +temperatures. In the effective theory, the lattice is 3D. For this reason, +SU3_AHiggs stresses different parts of the architecture than the conventional +QCD applications using 4D lattices. + +SU3_AHiggs has roots in the MILC code, but it is heavily rewritten by +Prof. Kari Rummukainen (University of Oulu, Finland). The code is written +solely in C and it uses MPI communications. No external libraries are needed +to run the program. + +The directory SU3/src contains several closely related QCD programs: + + * SU3_4D + * SU3_AHiggs + * SU3_Gauge + +In the DEISA benchmarks, only the code SU3_AHiggs is used. If you find errors +in any of the files in the SU3 package, please contact benchmarking@deisa.eu. + + +2 Code structure +================ + +In SU3_AHiggs, the spacetime is discretised and replaced with a 3D cubic +lattice. Every lattice vertex contains a 3 x 3 traceless Hermitian +matrix. From each vertex, in turn, there are six edges to nearest-neighbour +vertices. Edges are 3 x 3 unitary matrices. + +The aim of the SU3_AHiggs computation is to generate lattice configurations +from the microcanonical distribution, which is the statistical equilibrium +state of the system. The program uses heat-bath and over-relaxation algorithms +to update lattice vertices and links. The computation starts from a random +initial configuration. + +The main function of SU3_AHiggs is in the file su3h_n/control.c. 
After the +initial setup, main calls the function runthis, which in turn calls other +functions in the SU3 package. If the dataset is sufficiently large, most of +the computing time is spent on lattice updates (functions updategauge and +updatehiggs in files su3h_n/updategauge.c and su3h_n/updatehiggs.c). If the +dataset is too small, in turn, the computation becomes communication +bound. MPI routines are not called directly but with customised communication +functions defined in generic/comdefs.h and generic/com_mpi.c. + + +3 Parallelisation +================= + +SU3_AHiggs uses a 3D domain decomposition method for parallelisation. Each MPI +task communicates with six neighbouring tasks only. The communication routines +are defined in the files generic/comdefs.h and generic/com_mpi.c. The most +important routines are: + + * start_get() + + This function starts asynchronous sends and receives required to gather + neighbouring lattice vertices and links. The call graph looks like this: + + start_get() --> start_gather() --> MPI_Irecv(), MPI_Isend() + + * wait_get() + + This function waits for receives to finish, ensuring that the data has + actually arrived. The call graph looks like this: + + wait_get() --> wait_gather() --> MPI_Wait() + +With a 32^3 lattice, the program performs well up to 256 processes. With a +256^3 lattice, the speedup is almost linear with the number of processes. The +highest processor number tested so far is 2048. The lattice size and the +number of iterations are controlled by four user-adjustable parameters. + + +4 Building +========== + +To build SU3_AHiggs with the JUBE tool on a new architecture (NEWARCH), do the +following steps: + + 1) Create a new top-level XML file for the new architecture + (bench-NEWARCH.xml). In this task, you can use the already existing + files as a starting point: bench-Cray-XT4-Louhi.xml, + bench-IBM-SP4-Jump.xml, and bench-SGI-Altix-HLRB2.xml. 
Normally you have + to change the values of $nodes and $taskspernode only. + + 2) Edit compile.xml: Create a new section , where + NEWARCH is the same as in the file + DEISA_BENCH/platform/platform.xml. Substitute values in the new compile + section with those proper for the new architecture. Normally you need to + change #CFLAGS# and #LDFLAGS#. Possibly you want to change #CC# and + #MPI_CC# also. + + 3) Run the compile step within the benchmark "test": Edit bench-NEWARCH.xml + and make sure that you have: + + + + ... + + + Then run: perl ../../bench/jube -debug bench-NEWARCH.xml + +If the compile step fails, go to the directory where JUBE has run the compile +step: + + tmp/SU3_NEWARCH_test_i000001/.../src + +Then try to run the command make manually. Analyze the error and try to fix it +modifying the file Makefile.defs. After the problem is solved, edit the file +compile.xml accordingly. If you cannot solve the problem just by editing +compile.xml, please contact benchmarking@deisa.eu. + + +5 Execution +=========== + +To run SU3_AHiggs with the JUBE tool, do the following steps: + + 1) Before running the benchmarks you need an execute script template, such + as: + + DEISA_BENCH/platform/Cray-XT4-Louhi/cray_qsub.job.in + + 2) Edit execute.xml: Create a new section , and + match the values in the new section with the execute script template. + + 3) Run a benchmark: Select a benchmark by setting active="1" in the file + bench-NEWARCH.xml. Then run: perl ../../bench/jube -submit + bench-NEWARCH.xml + +To run SU3_AHiggs manually (without JUBE), do the following steps: + + 1) Copy the SU3_AHiggs executable to a directory that is accessible from + compute nodes. The name of the SU3_AHiggs executable is: + + src/su3h_n/su3_ahiggs + + 2) Copy the input files beta, parameter, and status to the same + directory. 
In the directory input, there are several sets of input files + available: + + input/lat_256/* (256^3 lattice, 100 iterations) + input/lat_32/* (32^3 lattice, 10000 iterations) + input/test/* (32^3 lattice, 100 iterations) + + 3) Start the program with an MPI launcher available in your system, for + example: + + aprun -n 8 ./su3_ahiggs + + The test benchmark takes approximately 10 seconds with 8 processor + cores. Other benchmarks run longer: approximately 1 minute with 1024 + cores. + +Important: The number of tasks in su3_ahiggs must be a power of 2. Otherwise +the program cannot lay out the lattice, and the execution stops. + + +6 Verification +============== + +JUBE verifies benchmark results automatically as part of the result analysis +step. In SU3_AHiggs, the verification cannot be done directly by comparing +benchmark results with some reference results. The reason for this is that the +results are very sensitive to compiler optimizations and the number of MPI +tasks as well. This can make results appear very different if compared with +the reference results. Everything can still be all right, as long as the +results are statistically the same. + +Therefore SU3_AHiggs uses a statistical comparison test to verify benchmark +results (Student's t-test). The significance level is chosen to be 1e-4 (correct +results are rejected once every 10000 times). The first 50 iterations are not +included in the comparison. The reference results are found at: + + reference/lat_256/higgs.out (256^3 lattice, 100 iterations) + reference/lat_32/higgs.out (32^3 lattice, 10000 iterations) + reference/test/higgs.out (32^3 lattice, 100 iterations) + +These files contain the Higgs field at each iteration for a given lattice +size. + +To verify benchmark results manually (without JUBE), do the following steps: + + 1) Copy the executable src/aa/aa to the directory SU3/run. 
+ + 2) Run the following command in the directory SU3: + + perl run/check_results_su3.pl output.xml stdout.log stderr.log \ + $RUNDIR reference/lat_256 + + The environment variable $RUNDIR should point to the directory where + SU3_AHiggs has been executed. + + 3) If the benchmark results are correct, the file output.xml includes the + following lines: + + + + + If not, the same lines look like this: + + + + + +7 Input data +============ + +Input data for SU3_AHiggs consist of three short ASCII files containing +simulation parameters related to temperature, lattice size, iterations, etc. + +For example, the files related to the test benchmark look like this: + +input/test/beta: + + betag 12 + x 0.06 + y 0.69025056 + +input/test/parameters: + + nx 32 + ny 32 + nz 32 + micro steps 4 + n_measurement 1 + n_correlation 10000 + w_correlation 100000 + n_save -1000 + blocking levels 1 + level 0 1 + level 1 1 + +input/test/status: + + restart 0 + n_iteration 100 + n_thermal 0 + seed 479817384 + run status + iteration + time: gauge + time: higgs + time: rest + +It is easy to create new datasets by changing the lattice size (variables nx, +ny, and nz), number of iterations (n_iteration), and seed number for the +random number generator (seed). The duration of a simulation is roughly +proportional to: + + nx * ny * nz * n_iteration + +SU3_AHiggs has currently three datasets: + + test 32^3 lattice, 100 iterations + small 32^3 lattice, 10000 iterations (artificial dataset) + large 256^3 lattice, 100 iterations (real research dataset) + +The test dataset is designed to help porting to new architectures. The small +dataset, in turn, is designed for benchmarking purposes. With it, benchmark +timings depend strongly on the interconnect speed. 
+ + +8 Output data +============= + +During the benchmarks, SU3_AHiggs writes out its results to the following files: + + correl + measure + status + +Note that the file named status is both an input and an output file; SU3_AHiggs +modifies it during the computation. The file measure is a binary file that +contains simulation results at each iteration. Its contents can be read with +the tool named aa available in the directory src/aa. + +The benchmark timings are written to the standard output. JUBE reads them +automatically as part of the analysis step. To get benchmark timings manually, +grep for "total time in seconds" in the standard output. + + + +###################################################################### +kernel_C README + + +This document is a short guide to get started and run the speed tests. For +more detailed information see the README.extended. + + +PROGRAMS + +The benchmark programs are provided in source form and must be +compiled by the user on the machine that is to be tested. + +In addition the openQCD-1.4 package is needed. A tar-file of the +source code can be obtained from + +http://luscher.web.cern.ch/luscher/openQCD/ + +and should be extracted at the same directory level as this package. + +PROGRAM FEATURES + +All programs parallelize in 0,1,2,3 or 4 dimensions, depending on what is +specified at compilation time. They are highly optimized for machines with +current Intel or AMD processors, but will run correctly on any system that +complies with the ISO C89 (formerly ANSI C) and the MPI 1.2 standards. + +For the purpose of testing and code development, the programs can also +be run on a desktop or laptop computer. All that is needed for this is +a compliant C compiler and a local MPI installation such as Open MPI. + + +DOCUMENTATION + +The simulation program has a modular form, with strict prototyping and a +minimal use of external variables. 
Each program file contains a small number +of externally accessible functions whose functionality is described at the top +of the file. + +The data layout is explained in various README files and detailed instructions +are given on how to run the main programs. A set of further documentation +files is included in the doc directory, where the normalization conventions, +the chosen algorithms and other important program elements are described. + + +COMPILATION + +The compilation of the programs requires an ISO C89 compliant compiler and a +compatible MPI installation that complies with the MPI standard 1.2 (or later). + +In the main and devel directories, a GNU-style Makefile is included which +compiles and links the programs (just type "make" to compile everything; "make +clean" removes the files generated by "make"). The compiler options can be set +by editing the CFLAGS line in the Makefiles. + +The Makefiles assume that the following environment variables are set: + + GCC GNU C compiler command [Example: /usr/bin/gcc]. + + MPI_HOME MPI home directory [Example: /usr/lib64/mpi/gcc/openmpi]. + The mpicc command used is the one in $MPI_HOME/bin/mpicc and + the MPI libraries are expected in $MPI_HOME/lib. + + MPI_INCLUDE Directory where the mpi.h file is to be found. + +All programs are then compiled using the $MPI_HOME/bin/mpicc command. The +compiler options that can be set in the CFLAGS line depend on which C compiler +the mpicc command invokes (the GCC compiler command is only used to resolve +the dependencies on the include files). + + +SSE/AVX ACCELERATION + +Current Intel and AMD processors are able to perform arithmetic operations on +short vectors of floating-point numbers in just one or two machine cycles, +using SSE and/or AVX instructions. The arithmetic performed by these +instructions fully complies with the IEEE 754 standard. + +Many programs in the module directories include SSE and AVX inline-assembly +code. 
On 64bit systems, and if the GNU or Intel C compiler is used, the code +can be activated by setting the compiler flags -Dx64 and -DAVX, respectively. +In addition, SSE prefetch instructions will be used if one of the following +options is specified: + + -DP4 Assume that prefetch instructions fetch 128 bytes at a time + (Pentium 4 and related Xeons). + + -DPM Assume that prefetch instructions fetch 64 bytes at a time + (Athlon, Opteron, Pentium M, Core, Core 2 and related Xeons). + + -DP3 Assume that prefetch instructions fetch 32 bytes at a time + (Pentium III). + +These options have an effect only if -Dx64 or -DAVX is set. The option +-DAVX implies -Dx64. + +On recent x86-64 machines with AMD Opteron or Intel Xeon processors, for +example, the recommended compiler flags are + + -std=c89 -O -mno-avx -DAVX -DPM + +For older machines that do not support the AVX instruction set, the +recommended flags are + + -std=c89 -O -mno-avx -Dx64 -DPM + +More aggressive optimization levels such as -O2 and -O3 tend to have little +effect on the execution speed of the programs, but the risk of generating +wrong code is higher. + +AVX instructions and the option -mno-avx may not be known to old versions +of the compilers, in which case one is limited to SSE accelerations with +option string -std=c89 -O -Dx64 -DPM. + + +DEBUGGING FLAGS + +For troubleshooting and parameter tuning, it may be helpful to switch on some +debugging flags at compilation time. The simulation program then prints a +detailed report to the log file on the progress made in the specified subprogram. + +The available flags are: + +-DCGNE_DBG CGNE solver. + +-DFGCR_DBG GCR solver. + +-DFGCR4VD_DBG GCR solver for the little Dirac equation. + +-DMSCG_DBG MSCG solver. + +-DDFL_MODES_DBG Deflation subspace generation. + +-DMDINT_DBG Integration of the molecular-dynamics equations. + +-DRWRAT_DBG Computation of the rational function reweighting + factor. 
+ + +RUNNING A SIMULATION + +The simulation programs reside in the directory "main". For each program, +there is a README file in this directory which describes the program +functionality and its parameters. + +Running a simulation for the first time requires its parameters to be chosen, +which tends to be a non-trivial task. The syntax of the input parameter files +and the meaning of the various parameters is described in some detail in +main/README.infiles and doc/parms.pdf. Examples of valid parameter files are +contained in the directory main/examples. + + +EXPORTED FIELD FORMAT + +The field configurations generated in the course of a simulation are written +to disk in a machine-independent format (see modules/misc/archive.c). +Independently of the machine endianness, the fields are written in little +endian format. A byte-reordering is therefore not required when machines with +different endianness are used for the simulation and the physics analysis. + + +AUTHORS + +The initial release of the openQCD package was written by Martin Lüscher and +Stefan Schaefer. Support for Schrödinger functional boundary conditions was +added by John Bulava. Several modules were taken over from the DD-HMC program +tree, which includes contributions from Luigi Del Debbio, Leonardo Giusti, +Björn Leder and Filippo Palombi. + + +ACKNOWLEDGEMENTS + +In the course of the development of the openQCD code, many people suggested +corrections and improvements or tested preliminary versions of the programs. +The authors are particularly grateful to Isabel Campos, Dalibor Djukanovic, +Georg Engel, Leonardo Giusti, Björn Leder, Carlos Pena and Hubert Simma for +their communications and help. + + +LICENSE + +The software may be used under the terms of the GNU General Public Licence +(GPL). + + +BUG REPORTS + +If a bug is discovered, please send a report to . 
+ + +ALTERNATIVE PACKAGES AND COMPLEMENTARY PROGRAMS + +There is a publicly available BG/Q version of openQCD that takes advantage of +the machine-specific features of IBM BlueGene/Q computers. The version is +available at . + +The openQCD programs currently do not support reweighting in the quark +masses, but a module providing this functionality can be downloaded from +. + +Previously generated gauge-field configurations are often used as initial +configuration for a new run. If the old and new lattices or boundary +conditions are not the same, the old configuration may however need to be +adapted, using a field conversion tool such as the one available at +, before the new run is started. + +###################################################################### +kernel_D README + +Important compiler defines XXX are (-DXXX) +MPI -> switch on parallelisation +PARALLELXYZT -> 4-dimensional parallelisation +PARALLELXYT -> 3-dim +PARALLELXT -> 2-dim +PARALLELT -> 1-dim +SSE2 -> SSE2 inline assembly (to be used with one of the two following) +P4 -> pentium 4 +OPTERON -> opteron +_GAUGE_COPY -> non-strided memory access for gauge fields, but more memory required +BGL -> Blue Gene /L +BGP -> Blue Gene /P, to be used on top of BGL + +If none of them are used, you will get a serial version of the program. + +The local lattice size in the case of the one dimensional +prallelisation is controlled by the parameters in the file +benchmark.input: + +T = 32 +L = 16 + +which will give a 32 x 16^3 global lattice. + +NrXProcs = 2 + +needs only to be set in case of a parallel compilation and sets +the number of processes in x-direction. The same holds for NrYProcs and NrZProcs. +The number of processes in +t-direction is computed from NrX|Y|ZProcs and the total number of processes. +You should only take care that all this fits with the lattice size. + +the package size of the data that are send and recieved is +192 * (1/2) * L^3 Byte in case of the one dimensional parallelisation. 
+In case of the two dimensional parallelisation it is +192 * (1/2) * ((L*L*L/N_PROC_X)+(T*L*L)) Byte. + +A run of the benchmark takes about one minute. + +The output of the program is something like this: (T=2,L=16)
-o benchmark -DSSE2 -DP4 -march=pentium4 -malign-double -fomit-frame-pointer -ffloat-store -D_GAUGE_COPY -O Hopping_Matrix.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c -lm + +mpicc (gcc) general, four dimensional parallelisation: +mpicc -std=c99 -I. -I./ -I.. -o benchmark -O3 -DMPI -DPARALLELXYZT -D_GAUGE_COPY -O Hopping_Matrix.c Hopping_Matrix_nocom.c xchange_deri.c xchange_field.c xchange_gauge.c xchange_halffield.c xchange_lexicfield.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c init_dirac_halfspinor.c -lm + + +###################################################################### +kernel_E README + +none \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/analyse.xml b/qcd/part_cpu/applications/QCD/analyse.xml new file mode 100644 index 0000000000000000000000000000000000000000..e3d204ce271e43f7ca6b105360161edfffcf5c6b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/analyse.xml @@ -0,0 +1,97 @@ + + + + (cd $outdir; bash collectData.sh) + + + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + + + (cd $outdir; bash collectData.sh) + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/compile.xml b/qcd/part_cpu/applications/QCD/compile.xml new file mode 100644 index 
0000000000000000000000000000000000000000..045ca1891c60d323ca71994f1e12c4fd02f92506 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/compile.xml @@ -0,0 +1,2071 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ( cd kernel_A; sh fixLouhiPP.sh; cd ..; source /opt/modules/default/init/sh; `index('$CRAYPAT','on')==0 ? 'module load xt-craypat;' : ' '` make `index('$CRAYPAT','on')==0 ? '; pat_build -v -u -g $CRAYPAT_GROUP -o ${execname}+pat $execname; cp ${execname}+pat $execname' : ' '`) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ( cd kernel_A; sh fixLouhiPP.sh; cd ..; source /opt/modules/default/init/sh; `index('$CRAYPAT','on')==0 ? 'module load xt-craypat;' : ' '` make `index('$CRAYPAT','on')==0 ? '; pat_build -v -u -g $CRAYPAT_GROUP -o ${execname}+pat $execname; cp ${execname}+pat $execname' : ' '`) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + ( cd kernel_A; sh fixLouhiPP.sh; cd ..; source /opt/modules/default/init/sh; `index('$CRAYPAT','on')==0 ? 'module load xt-craypat;' : ' '` make `index('$CRAYPAT','on')==0 ? 
'; pat_build -v -u -g $CRAYPAT_GROUP -o ${execname}+pat $execname; cp ${execname}+pat $execname' : ' '`) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (`index('$PAPI','on')==0 ? 'module load papi;' : ' '` `index('$IHPCT_HWC','on')==0 || index('$IHPCT_MPITR','on')==0 ? 'module load hpct;' : ' '` make) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (make) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (make) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (cd kernel_A; sh fixLouhiPP.sh; cd ..; make) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (`index('$IHPCT_HWC','on')==0 || index('$IHPCT_MPITR','on')==0 ? 
'module load ihpct;' : ' '` make) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (cd kernel_A; sh fixLouhiPP.sh; cd ..; make) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (cd kernel_A; sh fixLouhiPP.sh; cd ..; make) + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + (cd kernel_A; sh fixLouhiPP.sh; cd ..; make) + + + + diff --git a/qcd/part_cpu/applications/QCD/execute.xml b/qcd/part_cpu/applications/QCD/execute.xml new file mode 100644 index 0000000000000000000000000000000000000000..deae4afe9d42603614e9d4e928d3fa823ac46d9e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/execute.xml @@ -0,0 +1,450 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + llsubmit ibm_llsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + qsub -q prace cray_qsub.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + qsub cray_PBSsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + qsub intel_PBSsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + msub intel_PBSsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + llsubmit ibm_llsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + llsubmit ibm_llsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + llsubmit ibm_llsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + llsubmit ibm_llsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + sbatch intel_SLURMsubmit.job + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + qsub intel_PBSsubmit.job + + + + + + diff --git a/qcd/part_cpu/applications/QCD/input/kernel_A.input b/qcd/part_cpu/applications/QCD/input/kernel_A.input new file mode 100644 index 0000000000000000000000000000000000000000..9a8e24d12eae37c64bc6b35e6e1cb316a54ca851 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/input/kernel_A.input @@ -0,0 +1,29 @@ +run 0 + +lattice 2 2 2 2 +processes 1 1 1 1 +boundary_conditions_fermions 1 1 1 -1 + +beta 5 +kappa 0.13 +csw 2.3327 +h 0 + +hmc_test 0 +hmc_model C +hmc_rho 0.1 +hmc_trajectory_length 0.2 +hmc_steps 10 +hmc_accept_first 1 +hmc_m_scale 3 + +start_configuration cold +start_random default + +mc_steps 1 +mc_total_steps 100 + +solver_rest 1e-99 +solver_maxiter 50 +solver_ignore_no_convergence 2 +solver_mre_vectors 7 diff --git a/qcd/part_cpu/applications/QCD/input/kernel_A.input.in b/qcd/part_cpu/applications/QCD/input/kernel_A.input.in new file mode 100644 index 0000000000000000000000000000000000000000..b5ef4d8645bb9e63ef1cf08e31fe14dab6476069 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/input/kernel_A.input.in @@ -0,0 +1,29 @@ +run 0 + +lattice #KA_LATTICE# +processes #KA_PROCESSES# +boundary_conditions_fermions 1 1 1 -1 + +beta 5 +kappa 0.13 +csw 2.3327 +h 0 + +hmc_test 0 +hmc_model C +hmc_rho 0.1 +hmc_trajectory_length 0.2 +hmc_steps 10 +hmc_accept_first 1 +hmc_m_scale 3 + +start_configuration cold +start_random default + +mc_steps 1 +mc_total_steps 
100 + +solver_rest 1e-99 +solver_maxiter #KA_MAXITER# +solver_ignore_no_convergence 2 +solver_mre_vectors 7 diff --git a/qcd/part_cpu/applications/QCD/input/kernel_B.input.beta.in b/qcd/part_cpu/applications/QCD/input/kernel_B.input.beta.in new file mode 100644 index 0000000000000000000000000000000000000000..084fefd6335861e219b2a507ebfb35672efc9fe1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/input/kernel_B.input.beta.in @@ -0,0 +1,3 @@ +betag 12 +x 0.06 +y 0.69025056 diff --git a/qcd/part_cpu/applications/QCD/input/kernel_B.input.parameters.in b/qcd/part_cpu/applications/QCD/input/kernel_B.input.parameters.in new file mode 100644 index 0000000000000000000000000000000000000000..9e892b7d77d7a02a143c855e464b52ace48c2350 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/input/kernel_B.input.parameters.in @@ -0,0 +1,11 @@ +nx #KB_NX# +ny #KB_NY# +nz #KB_NZ# +micro steps 4 +n_measurement 1 +n_correlation 10000 +w_correlation 100000 +n_save -1000 +blocking levels 1 +level 0 1 +level 1 1 diff --git a/qcd/part_cpu/applications/QCD/input/kernel_B.input.status.in b/qcd/part_cpu/applications/QCD/input/kernel_B.input.status.in new file mode 100644 index 0000000000000000000000000000000000000000..9234430ec16df17bf40e6eb2c9646d2075b70c7f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/input/kernel_B.input.status.in @@ -0,0 +1,9 @@ +restart 0 +n_iteration #KB_MAXITER# +n_thermal 0 +seed 989357013 +run status +iteration +time: gauge +time: higgs +time: rest diff --git a/qcd/part_cpu/applications/QCD/input/kernel_D.input.in b/qcd/part_cpu/applications/QCD/input/kernel_D.input.in new file mode 100644 index 0000000000000000000000000000000000000000..585b4875301ed2e9f00d8e232ea2b39e451a006b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/input/kernel_D.input.in @@ -0,0 +1,7 @@ +L=#KD_L# +T=#KD_T# + +# no of processors per direction, time direction chosen automatically +NrXProcs = #KD_NP_X# +NrYProcs = #KD_NP_Y# +NrZProcs = #KD_NP_Z# diff --git 
a/qcd/part_cpu/applications/QCD/input/kernel_E.input.in b/qcd/part_cpu/applications/QCD/input/kernel_E.input.in new file mode 100644 index 0000000000000000000000000000000000000000..461ca9e2b921991e17fd4cbf83026041d917c179 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/input/kernel_E.input.in @@ -0,0 +1,15 @@ +#lattice +nx #KE_NX# +ny #KE_NY# +nz #KE_NZ# +nt #KE_NT# +totnodes #KE_PROCS# + +#wilson +mass_wilson #KE_WILSON_MASS# + +#max iterations +max_cg_iters #KE_MAXITER# + +#etc +verbose 1 diff --git a/qcd/part_cpu/applications/QCD/patterns-gprof-qcd.xml b/qcd/part_cpu/applications/QCD/patterns-gprof-qcd.xml new file mode 100644 index 0000000000000000000000000000000000000000..e4f83d1f3925fb72b10ac27384b05832ca771b8e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/patterns-gprof-qcd.xml @@ -0,0 +1,20 @@ + + + JuBE: gprof: proc 1:\s*$patwrd\s*$patnfp + JuBE: gprof: proc 1:\s*$patnwrd\s*$patfp + + JuBE: gprof: proc 2:\s*$patwrd\s*$patnfp + JuBE: gprof: proc 2:\s*$patnwrd\s*$patfp + + JuBE: gprof: proc 3:\s*$patwrd\s*$patnfp + JuBE: gprof: proc 3:\s*$patnwrd\s*$patfp + + JuBE: gprof: proc 4:\s*$patwrd\s*$patnfp + JuBE: gprof: proc 4:\s*$patnwrd\s*$patfp + + JuBE: gprof: proc 5:\s*$patwrd\s*$patnfp + JuBE: gprof: proc 5:\s*$patnwrd\s*$patfp + + JuBE: gprof: proc $patint:\s+$patwrd\s+$patfp + + \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/patterns-ihpct-qcd.xml b/qcd/part_cpu/applications/QCD/patterns-ihpct-qcd.xml new file mode 100644 index 0000000000000000000000000000000000000000..9b0d9be02118c3150871797ecba74dead0efa088 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/patterns-ihpct-qcd.xml @@ -0,0 +1,31 @@ + + + IHPCT: libHPM: in section kernel_A: PM_FPU_FLOP:\s*$patint + IHPCT: libHPM: in section kernel_B: PM_FPU_FLOP:\s*$patint + IHPCT: libHPM: in section kernel_C: PM_FPU_FLOP:\s*$patint + IHPCT: libHPM: in section kernel_D: PM_FPU_FLOP:\s*$patint + IHPCT: libHPM: in section kernel_E: PM_FPU_FLOP:\s*$patint + + IHPCT: libHPM: in 
section kernel_A: \% of peak performance:\s*$patint + IHPCT: libHPM: in section kernel_B: \% of peak performance:\s*$patint + IHPCT: libHPM: in section kernel_C: \% of peak performance:\s*$patint + IHPCT: libHPM: in section kernel_D: \% of peak performance:\s*$patint + IHPCT: libHPM: in section kernel_E: \% of peak performance:\s*$patint + + IHPCT: libHPM: in section kernel_A: number of load/stores per L1 miss:\s*$patint + IHPCT: libHPM: in section kernel_B: number of load/stores per L1 miss:\s*$patint + IHPCT: libHPM: in section kernel_C: number of load/stores per L1 miss:\s*$patint + IHPCT: libHPM: in section kernel_D: number of load/stores per L1 miss:\s*$patint + IHPCT: libHPM: in section kernel_E: number of load/stores per L1 miss:\s*$patint + + IHPCT: libHPM: in section kernel_A: $patwrd:\s+$patint + IHPCT: libHPM: in section kernel_B: $patwrd:\s+$patint + IHPCT: libHPM: in section kernel_C: $patwrd:\s+$patint + IHPCT: libHPM: in section kernel_D: $patwrd:\s+$patint + IHPCT: libHPM: in section kernel_E: $patwrd:\s+$patint + + IHPCT: libHPM: in section QCD: $patwrd:\s+$patint + + IHPCT: MPITracer: median communication time =\s*$patfp\s*sec + + diff --git a/qcd/part_cpu/applications/QCD/patterns-jube-qcd.xml b/qcd/part_cpu/applications/QCD/patterns-jube-qcd.xml new file mode 100644 index 0000000000000000000000000000000000000000..80081f279e56ed788d6fedb72ebcb21021f214d1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/patterns-jube-qcd.xml @@ -0,0 +1,45 @@ + + + JuBE: total mean run time: $patfp + JuBE: total mean run time: $patfp + + JuBE global mean timing statistics:\s*kernel_A\s*$patnfp\s*$patnfp\s*$patnfp\s*$patfp + JuBE global mean timing statistics:\s*kernel_B\s*$patnfp\s*$patnfp\s*$patnfp\s*$patfp + JuBE global mean timing statistics:\s*kernel_C\s*$patnfp\s*$patnfp\s*$patnfp\s*$patfp + JuBE global mean timing statistics:\s*kernel_D\s*$patnfp\s*$patnfp\s*$patnfp\s*$patfp + JuBE global mean timing 
statistics:\s*kernel_E\s*$patnfp\s*$patnfp\s*$patnfp\s*$patfp + + JuBE global mean timing statistics:\s*kernel_A\s*$patfp\s*$patnfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_B\s*$patfp\s*$patnfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_C\s*$patfp\s*$patnfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_D\s*$patfp\s*$patnfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_E\s*$patfp\s*$patnfp\s*$patnfp\s*$patnfp + + JuBE global mean timing statistics:\s*kernel_A\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_B\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_C\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_D\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_E\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + + JuBE global mean timing statistics:\s*kernel_A\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_B\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_C\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_D\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_E\s*$patnfp\s*$patfp\s*$patnfp\s*$patnfp + + JuBE global mean timing statistics:\s*kernel_A\s*$patnfp\s*$patnfp\s*$patfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_B\s*$patnfp\s*$patnfp\s*$patfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_C\s*$patnfp\s*$patnfp\s*$patfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_D\s*$patnfp\s*$patnfp\s*$patfp\s*$patnfp + JuBE global mean timing statistics:\s*kernel_E\s*$patnfp\s*$patnfp\s*$patfp\s*$patnfp + + + JuBE: total max mem:\s*$patint + + JuBE: max mem for kernel_A:\s*$patint + JuBE: max mem for kernel_B:\s*$patint + JuBE: max mem for kernel_C:\s*$patint + JuBE: 
max mem for kernel_D:\s*$patint + JuBE: max mem for kernel_E:\s*$patint + + \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/patterns-papi-qcd.xml b/qcd/part_cpu/applications/QCD/patterns-papi-qcd.xml new file mode 100644 index 0000000000000000000000000000000000000000..caf68d1e3ac62256b676feb1501593a6bfbdb8a6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/patterns-papi-qcd.xml @@ -0,0 +1,17 @@ + + + JuBE: PAPI counter for kernel_A: PAPI_TOT_CYC:\s*$patint + JuBE: PAPI counter for kernel_B: PAPI_TOT_CYC:\s*$patint + JuBE: PAPI counter for kernel_C: PAPI_TOT_CYC:\s*$patint + JuBE: PAPI counter for kernel_D: PAPI_TOT_CYC:\s*$patint + JuBE: PAPI counter for kernel_E: PAPI_TOT_CYC:\s*$patint + + JuBE: PAPI counter for kernel_A: PAPI_FP_OPS:\s*$patint + JuBE: PAPI counter for kernel_B: PAPI_FP_OPS:\s*$patint + JuBE: PAPI counter for kernel_C: PAPI_FP_OPS:\s*$patint + JuBE: PAPI counter for kernel_D: PAPI_FP_OPS:\s*$patint + JuBE: PAPI counter for kernel_E: PAPI_FP_OPS:\s*$patint + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-cartesius.xml b/qcd/part_cpu/applications/QCD/prace-functional-cartesius.xml new file mode 100644 index 0000000000000000000000000000000000000000..bdd7b2cafe16fa5351e7f62f010f744948b84e10 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-cartesius.xml @@ -0,0 +1,188 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-cartesius_24.xml b/qcd/part_cpu/applications/QCD/prace-functional-cartesius_24.xml new file mode 100644 index 0000000000000000000000000000000000000000..f7546843fd11e6de8d12fcfabc9214a01301037c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-cartesius_24.xml @@ -0,0 +1,188 @@ + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-hector.xml b/qcd/part_cpu/applications/QCD/prace-functional-hector.xml new file mode 100644 index 0000000000000000000000000000000000000000..e600a4b7adf4db00a4366d4adf9f69a550778f0a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-hector.xml @@ -0,0 +1,182 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-hermit.xml b/qcd/part_cpu/applications/QCD/prace-functional-hermit.xml new file mode 100644 index 0000000000000000000000000000000000000000..80fc4c95f3310dc60b8e409f0bd9f0a2b326d375 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-hermit.xml @@ -0,0 +1,186 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-huygens.xml b/qcd/part_cpu/applications/QCD/prace-functional-huygens.xml new file mode 100644 index 0000000000000000000000000000000000000000..b9228ca77c72197f8317b3a22f6ecbdba831e3e4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-huygens.xml @@ -0,0 +1,197 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/qcd/part_cpu/applications/QCD/prace-functional-jugene.xml b/qcd/part_cpu/applications/QCD/prace-functional-jugene.xml new file mode 100644 index 0000000000000000000000000000000000000000..489fe4fb1ba0c007dfaf509918051739af4ea8db --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-jugene.xml @@ -0,0 +1,194 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-juqueen.xml b/qcd/part_cpu/applications/QCD/prace-functional-juqueen.xml new file mode 100644 index 0000000000000000000000000000000000000000..68af37a2c7eacd6d68adab42e56870cdeaa1406b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-juqueen.xml @@ -0,0 +1,198 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-juropa.xml b/qcd/part_cpu/applications/QCD/prace-functional-juropa.xml new file mode 100644 index 0000000000000000000000000000000000000000..0b83b7abb4a504f6da2eb7cbe08e53d32bfb9085 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-juropa.xml @@ -0,0 +1,188 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-louhi.xml b/qcd/part_cpu/applications/QCD/prace-functional-louhi.xml new file mode 100644 index 0000000000000000000000000000000000000000..8544acda0cc3ad228d6546257290205998c82ec7 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/prace-functional-louhi.xml @@ -0,0 +1,184 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-marconi.xml b/qcd/part_cpu/applications/QCD/prace-functional-marconi.xml new file mode 100644 index 0000000000000000000000000000000000000000..0557883ef7a1f527fcb10b0f9eefc9dd67222771 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-marconi.xml @@ -0,0 +1,188 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-marconi2.xml b/qcd/part_cpu/applications/QCD/prace-functional-marconi2.xml new file mode 100644 index 0000000000000000000000000000000000000000..19b2d1f8b82bf1585eca082c89626c31c52d8664 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-marconi2.xml @@ -0,0 +1,188 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-functional-supermuc.xml b/qcd/part_cpu/applications/QCD/prace-functional-supermuc.xml new file mode 100644 index 0000000000000000000000000000000000000000..b3aedd187111f9acf057d0fe77e951fff67cc656 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-functional-supermuc.xml @@ -0,0 +1,189 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-cartesius.xml b/qcd/part_cpu/applications/QCD/prace-scaling-cartesius.xml new file mode 100644 index 0000000000000000000000000000000000000000..1a12d10dfcb2a9c6b59660711767e9d8d2b2543a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-cartesius.xml @@ -0,0 +1,223 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-cartesius_24.xml b/qcd/part_cpu/applications/QCD/prace-scaling-cartesius_24.xml new file mode 100644 index 0000000000000000000000000000000000000000..a6f7c403a116a204d4a4968b01ce0c6b78df6d1e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-cartesius_24.xml @@ -0,0 +1,192 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-curie.xml b/qcd/part_cpu/applications/QCD/prace-scaling-curie.xml new file mode 100644 index 0000000000000000000000000000000000000000..ff6563a561f25773b5a8a3e4c113bd518c9699d2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-curie.xml @@ -0,0 +1,193 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-hector-medium.xml 
b/qcd/part_cpu/applications/QCD/prace-scaling-hector-medium.xml new file mode 100644 index 0000000000000000000000000000000000000000..57ae9c7365f668fcf76eb308be554a183de1fae0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-hector-medium.xml @@ -0,0 +1,192 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-hector-small.xml b/qcd/part_cpu/applications/QCD/prace-scaling-hector-small.xml new file mode 100644 index 0000000000000000000000000000000000000000..8d90296be638349041bcbadbaeecf422d8346cd8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-hector-small.xml @@ -0,0 +1,192 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-hermit.xml b/qcd/part_cpu/applications/QCD/prace-scaling-hermit.xml new file mode 100644 index 0000000000000000000000000000000000000000..94908820eb67a5172cabf59e860daa3bf309b734 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-hermit.xml @@ -0,0 +1,192 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-huygens.xml b/qcd/part_cpu/applications/QCD/prace-scaling-huygens.xml new file mode 100644 index 0000000000000000000000000000000000000000..adcf6b83f2bc6269f13acd8b1e211dc8427fa317 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/prace-scaling-huygens.xml @@ -0,0 +1,202 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-jugene.xml b/qcd/part_cpu/applications/QCD/prace-scaling-jugene.xml new file mode 100644 index 0000000000000000000000000000000000000000..09ad25ce3c09ad9ec355b017df99d0e294f3a546 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-jugene.xml @@ -0,0 +1,204 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-juqueen.xml b/qcd/part_cpu/applications/QCD/prace-scaling-juqueen.xml new file mode 100644 index 0000000000000000000000000000000000000000..e09e5907dc13f70738410f9e83376ad04eae2a16 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-juqueen.xml @@ -0,0 +1,204 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-juropa.xml b/qcd/part_cpu/applications/QCD/prace-scaling-juropa.xml new file mode 100644 index 0000000000000000000000000000000000000000..9a64dd3625e4b706d7eae32bc0ed07115a9a6830 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-juropa.xml @@ -0,0 +1,193 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-louhi.xml b/qcd/part_cpu/applications/QCD/prace-scaling-louhi.xml new file mode 100644 index 0000000000000000000000000000000000000000..519be3d8fd83344f08f2d54c4d9586b73d3131e8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-louhi.xml @@ -0,0 +1,185 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-marconi.xml b/qcd/part_cpu/applications/QCD/prace-scaling-marconi.xml new file mode 100644 index 0000000000000000000000000000000000000000..b4bdd614eda85b7501d5e1435a094257bf5b18ce --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-marconi.xml @@ -0,0 +1,194 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prace-scaling-supermuc.xml b/qcd/part_cpu/applications/QCD/prace-scaling-supermuc.xml new file mode 100644 index 0000000000000000000000000000000000000000..908f78c65c1c224dd9093f4184fd8d43b5432555 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prace-scaling-supermuc.xml @@ -0,0 +1,194 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/prepare.xml b/qcd/part_cpu/applications/QCD/prepare.xml new file mode 100644 index 
0000000000000000000000000000000000000000..04c33977e48bc8455a0097425df15d4bacc0af8e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/prepare.xml @@ -0,0 +1,85 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/result.gprof.xml b/qcd/part_cpu/applications/QCD/result.gprof.xml new file mode 100644 index 0000000000000000000000000000000000000000..b1486a3fd9a10889a58be28b26eddebf9c34205b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/result.gprof.xml @@ -0,0 +1,9 @@ + + + KERNELS, GPROF_01_NAME, GPROF_01_PART, GPROF_02_NAME, GPROF_02_PART, GPROF_03_NAME, GPROF_03_PART + + + + name + + diff --git a/qcd/part_cpu/applications/QCD/result.hwc.xml b/qcd/part_cpu/applications/QCD/result.hwc.xml new file mode 100644 index 0000000000000000000000000000000000000000..b965eae3c00291053b4bdc0e3a506e17728c0170 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/result.hwc.xml @@ -0,0 +1,13 @@ + + + KERNELS, walltime, HWC_FLOP_KA_avg, HWC_FLOP_KB_avg, HWC_FLOP_KC_avg + + + + name + + + + diff --git a/qcd/part_cpu/applications/QCD/result.mpi.xml b/qcd/part_cpu/applications/QCD/result.mpi.xml new file mode 100644 index 0000000000000000000000000000000000000000..085cb5e36095059f84d6a558de5b6c66ddc5258e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/result.mpi.xml @@ -0,0 +1,9 @@ + + + KERNELS, walltime, MPI_COMM_TIME_avg, MPI_COMM_TIME_std + + + + name + + diff --git a/qcd/part_cpu/applications/QCD/result.wct.xml b/qcd/part_cpu/applications/QCD/result.wct.xml new file mode 100644 index 0000000000000000000000000000000000000000..faf08f81146ee3bd87fb054851fb30d3352d7015 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/result.wct.xml @@ -0,0 +1,9 @@ + + + KERNELS, walltime, KA_wct, KB_wct, KC_wct, KD_wct + + + + name + + diff --git a/qcd/part_cpu/applications/QCD/result.xml b/qcd/part_cpu/applications/QCD/result.xml new file mode 100644 
index 0000000000000000000000000000000000000000..429be125b68e6e3317ef42d14de491715737cd24 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/result.xml @@ -0,0 +1,64 @@ + + + + ncpus, time, time_KA, time_KB, time_KC, time_KD, time_KE + + + + + + ncpus, KERNELS, optflags, walltime, WCT_KA, WCT_KB, WCT_KC, WCT_KD, WCT_KE, COMMENT + + + + ncpus, KERNELS, walltime, TIME_INIT_KA, TIME_RUN_KA, TIME_FINALIZE_KA, KA_NX, KA_NY, KA_NZ, KA_NT + + + ncpus, KERNELS, walltime, TIME_INIT_KB, TIME_RUN_KB, TIME_FINALIZE_KB, KB_NX, KB_NY, KB_NZ + + + ncpus, KERNELS, walltime, TIME_INIT_KC, TIME_RUN_KC, TIME_FINALIZE_KC, KC_NX, KC_NY, KC_NZ, KC_NT + + + ncpus, KERNELS, walltime, TIME_INIT_KD, TIME_RUN_KD, TIME_FINALIZE_KD, KD_L, KD_T + + + ncpus, KERNELS, walltime, TIME_INIT_KE, TIME_RUN_KE, TIME_FINALIZE_KE, KE_NX, KE_NY, KE_NZ, KE_NT + + + + + KERNELS, walltime, MEM_MAX, MEM_KA, MEM_KB, MEM_KC, MEM_KD, MEM_KE, COMMENT + + + + KERNELS, GPROF_01_NAME, GPROF_01_PART, GPROF_02_NAME, GPROF_02_PART, GPROF_03_NAME, GPROF_03_PART + + + + KERNELS, walltime, HWC_FLOP_KA_avg, HWC_FLOP_KB_avg, HWC_FLOP_KC_avg, HWC_FLOP_KD_avg, HWC_FLOP_KE_avg + + + + KERNELS, walltime, MPI_COMM_TIME + + + + + ncpus, bgconn, optflags, KA_NPROC0, KA_NPROC1, KA_NPROC2, KA_NPROC3 + + + + ncpus, bgconn, walltime, KA_LATTICE0, KA_LATTICE1, KA_LATTICE2, KA_LATTICE3, KA_LIBCOMM, KA_LIBCLOVER, KA_LIBD + + + + jobenddate + + + + + KERNELS, ncpus, subid + + + diff --git a/qcd/part_cpu/applications/QCD/run/collectData.sh.in b/qcd/part_cpu/applications/QCD/run/collectData.sh.in new file mode 100644 index 0000000000000000000000000000000000000000..eb4483efcef2739e4964915f0b7bb394ff1687dd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/run/collectData.sh.in @@ -0,0 +1,24 @@ +rm -f IHPCT.log GPROF.log CRAYPAT.log + +##COLLECT_IHPCT_HWC# #JUGENE# sed '$d' < QCD.viz > tmp.viz; mv tmp.viz QCD.viz + #COLLECT_IHPCT_HWC# #PERL# #BENCHHOME#/../../utils/ihpct/parseHWC.pl QCD*.viz >> IHPCT.log + +#COLLECT_IHPCT_MPITR# #PERL# 
#BENCHHOME#/../../utils/ihpct/parseMPITR.pl mpi_profile.* >> IHPCT.log + +#COLLECT_GPROF# #HUYGENS# gprof #EXECUTABLE# profdir*/gmon.out > GPROF.dat; #PERL# #BENCHHOME#/../../utils/gprof/parseGPROF.pl -1 1 GPROF.dat > GPROF.log +#COLLECT_GPROF# #JUGENE# gprof #EXECUTABLE# gmon.out.* > GPROF.dat; #PERL# #BENCHHOME#/../../utils/gprof/parseGPROF.pl -1 1 GPROF.dat > GPROF.log +#COLLECT_GPROF# #JUMP# gprof #EXECUTABLE# gmon.*.out > GPROF.dat; #PERL# #BENCHHOME#/../../utils/gprof/parseGPROF.pl -2 1 GPROF.dat > GPROF.log + +#COLLECT_CRAYPAT# source /opt/modules/default/init/sh; module load xt-craypat + +#COLLECT_CRAYPAT# pat_report -d P -b totals *.xf > CRAYPAT.HWC.dat; pat_report -d P -b totals ./*/*.xf > CRAYPAT.HWC.dat; +#COLLECT_CRAYPAT# #PERL# #BENCHHOME#/../../utils/craypat/parseCRAYPAT.pl HWC CRAYPAT.HWC.dat >> CRAYPAT.log + +#COLLECT_CRAYPAT# pat_report -d flops *.xf > CRAYPAT.FLOPS.dat; pat_report -d flops ./*/*.xf > CRAYPAT.FLOPS.dat; +#COLLECT_CRAYPAT# #PERL# #BENCHHOME#/../../utils/craypat/parseCRAYPAT.pl FLOPS CRAYPAT.FLOPS.dat >> CRAYPAT.log + +#COLLECT_CRAYPAT# pat_report -d am *.xf > CRAYPAT.HEAP.dat; pat_report -d am ./*/*.xf > CRAYPAT.HEAP.dat; +#COLLECT_CRAYPAT# #PERL# #BENCHHOME#/../../utils/craypat/parseCRAYPAT.pl HEAP CRAYPAT.HEAP.dat >> CRAYPAT.log + +#COLLECT_CRAYPAT# pat_report -d time@%0.1 *.xf > CRAYPAT.TIME.dat; pat_report -d time@%0.1 ./*/*.xf > CRAYPAT.TIME.dat; +#COLLECT_CRAYPAT# #PERL# #BENCHHOME#/../../utils/craypat/parseCRAYPAT.pl TIME CRAYPAT.TIME.dat >> CRAYPAT.log diff --git a/qcd/part_cpu/applications/QCD/run/verify_qcd.pl b/qcd/part_cpu/applications/QCD/run/verify_qcd.pl new file mode 100644 index 0000000000000000000000000000000000000000..4626ce67b54506c73fed26614c559401d85c55f0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/run/verify_qcd.pl @@ -0,0 +1,32 @@ +#!/usr/local/bin/perl -w + +use strict; +use Carp; + +my $patint="([\\+\\-\\d]+)"; # Pattern for Integer number +my $patfp ="([\\+\\-\\d.Ee]+)"; # Pattern for Floating Point 
number +my $patwrd="([\^\\s]+)"; # Pattern for Word (all nonblank characters) +my $patnint="[\\+\\-\\d]+"; # Pattern for Integer number, no () +my $patnfp ="[\\+\\-\\d.Ee]+"; # Pattern for Floating Point number, no () +my $patnwrd="[\^\\s]+"; # Pattern for Word (all nonblank characters), no () +my $patbl ="\\s+"; # Pattern for blank space (variable length) + +if((scalar @ARGV) != 1) { # exactly one argument is expected: the XML output file + printf(STDERR "incorrect number of parameters (%d) of $0 (1 required)\n",scalar @ARGV); + exit(-1); +} + +my $xmloutfile = $ARGV[0]; # path of the XML result file that is (over)written below +my $vcheck=0; +my $vcomment="not implemented"; + +open(XMLOUT, '>', $xmloutfile) || die "cannot open file $xmloutfile: $!"; # three-argument open: the file name is never parsed as a mode +print XMLOUT "\n"; +print XMLOUT " \n"; +print XMLOUT " \n"; +print XMLOUT "\n"; +print XMLOUT "\n"; +close(XMLOUT); + + +exit(0); diff --git a/qcd/part_cpu/applications/QCD/sizes.pp b/qcd/part_cpu/applications/QCD/sizes.pp new file mode 100644 index 0000000000000000000000000000000000000000..d0e69102317db6ed0c92dc78234647091a8fbe81 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/sizes.pp @@ -0,0 +1,7 @@ +medium problem size + + * KA_NX=KA_NY=32, KA_NZ=KA_NT=64 + * KB_NX=KB_NY=KB_NC=256 + * KC_NX=KC_NY=KC_NZ=KC_NT=8 + * KD_L=KD_T=64 + * KE_NX=KE_NY=KE_NZ=64, KE_NT=32 \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/Makefile.in b/qcd/part_cpu/applications/QCD/src/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..f846472788e0196aee61ca5fad45b646474647d0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/Makefile.in @@ -0,0 +1,35 @@ +KERNEL_ARCHS = #KERNEL_ARCHS# + +qcd-bench: qcd-bench.o $(KERNEL_ARCHS) + #LD# #LDFLAGS# -o #EXECNAME# qcd-bench.o $(KERNEL_ARCHS) #LDLIBS# + +kernel_A.a: + cd kernel_A && gmake prep-#KA_PLATFORM# && gmake kernel + +kernel_B.a: + cd kernel_B && gmake kernel + +kernel_C.a: + cd kernel_C && make kernel + +kernel_D.a: + cd kernel_D && make kernel + +kernel_E.a: + cd kernel_E && gmake kernel + +qcd-bench.o: qcd-bench.c + #MPI_CC# #CFLAGS# -c -o 
qcd-bench.o qcd-bench.c + +clean: + cd kernel_A && gmake clobber + cd kernel_B && make clean + cd kernel_C && make clean + cd kernel_D && make clean + cd kernel_E && gmake clean + #RM# -f qcd-bench.o qcd-bench + #RM# -f $(KERNEL_ARCHS) + +#nm -X64 *.a | grep ' T ' | cut -f 1 -d ' ' | sort > all.sort.dat +#nm -X64 *.a | grep ' T ' | cut -f 1 -d ' ' | sort | sort -u > all.sort.unique.dat +#diff all.sort.dat all.sort.unique.dat | grep '<' > doublicates.dat \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/exchangeNames.sh b/qcd/part_cpu/applications/QCD/src/exchangeNames.sh new file mode 100644 index 0000000000000000000000000000000000000000..b1ab0f9efbf8d47b6e9ffd8d90d7856a31cb71c7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/exchangeNames.sh @@ -0,0 +1,44 @@ +replaceInFile () # usage: replaceInFile OLD NEW FILE - rewrites every "OLD(" to "NEW(" in FILE via tmp.dat +{ + sed "s/${1}(/${2}(/g" $3 > tmp.dat; cp tmp.dat $3 +} + +case $1 in + "doublicates") echo "generate doublicate list"; + echo "parse files:\n`ls *.a`"; + nm -X64 *.a | grep ' T ' | cut -f 1 -d ' ' | sort > all.sort.dat + nm -X64 *.a | grep ' T ' | cut -f 1 -d ' ' | sort | sort -u > all.sort.unique.dat + diff all.sort.dat all.sort.unique.dat | grep '<' | cut -f 2- -d '.'> doublicates.dat + echo "found `wc -l doublicates.dat` doublicates" ;; + + "find") echo "looking for $2"; + grep -r $2 `find . \( -name '*.[chfF]' -o -name '*.f90' -o -name '*.F90' \)` | cut -f 2- -d '<' > find.dat; # a bracket glob matches one character only, so f90/F90 need their own -name tests + cat find.dat;; + + "replace") echo "replace $2 by $3 in $4"; + replaceInFile $2 $3 $4;; + + "replaceAllFiles") echo "replace all $2 by $3"; + grep -r $2 `find . \( -name '*.[chfF]' -o -name '*.f90' -o -name '*.F90' \)` | cut -f 2- -d '<' | cut -f 1 -d ':' | sort -u > find.dat; + for i in `cat find.dat` + do + echo "replacing $2 by $3 in $i"; + replaceInFile $2 $3 $i + done + ;; + + "replaceAll") + echo "replace all doublicates in this directory, using doublicate list $2 and postfix $3"; + for dn in `cat $2` + do + echo "replace $dn" + grep -r $dn `find . 
\( -name '*.[chfF]' -o -name '*.f90' -o -name '*.F90' \)` | cut -f 2- -d '<' | cut -f 1 -d ':' | sort -u > find.dat; + for fn in `cat find.dat` + do + echo "replacing $dn by ${dn}_$3 in $fn"; + replaceInFile ${dn} ${dn}_${3} ${fn} + done + done + ;; + +esac \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..7aab461bd39a9afb5243cf59d7563fbf804fe00e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile @@ -0,0 +1,185 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile +# +#=============================================================================== + +include Makefile.defs + +MODULES_DIR = modules + +.SUFFIXES: +.SUFFIXES: .o .F90 .c + + +.F90.f90: + $(FPP) $(FPPFLAGS) $< > $@ + +.F90.o: + $(FPP) $(FPPFLAGS) $< > $*.f90 + $(F90) -c $(FFLAGS) -I$(MODULES_DIR) $*.f90 + + +MODULES = $(MODULES_DIR)/*.o + +OBJS = \ + action.o \ + cg.o \ + checks.o \ + $(CKSUM_O) \ + conf.o \ + conf_info.o \ + cooling.o \ + dsd.o \ + dsg.o \ + dsf.o \ + dsf1.o \ + dsf2.o \ + files.o \ + flip_bc.o \ + hmc.o \ + hmc_init_p.o \ + hmc_init_phi.o \ + hmc_integrator.o \ + hmc_forces.o \ + hmc_leap_frog.o \ + hmc_test.o \ + hmc_u.o \ + h_mult.o \ + index.o \ + index2.o \ + init_common.o \ + init_modules.o \ + iteration_count.o \ + mc.o \ + misc.o \ + mre.o \ + mtdagmt.o \ + m_tilde.o \ + polyakov_loop.o \ + $(RANDOM_O) \ + sc.o \ + service.o \ + staple.o \ + su3.o \ + swap.o \ + timing.o \ + traces.o \ + $(UUU_O) \ + w_mult.o \ + xyzt2i.o + +LIBS = d/$(LIBD) comm/$(LIBCOMM) clover/$(LIBCLOVER) + +kernel: + cd modules && $(MAKE) + cd d && $(MAKE) fast + cd comm && 
$(MAKE) fast + cd clover && $(MAKE) fast + $(MAKE) ../kernel_A.a + +../kernel_A.a: bqcd.o $(MODULES) $(OBJS) $(LIBS) + $(AR) $(ARFLAGS) ../kernel_A.a *.o d/*.o modules/*.o comm/*.o clover/*.o + +bqcd: bqcd.o $(MODULES) $(OBJS) $(LIBS) + $(F90) -o $@ $(LDFLAGS) bqcd.o $(MODULES) $(OBJS) $(LIBS) $(SYSLIBS) + + +fast: + cd modules && $(MAKE) + cd d && $(MAKE) fast + cd comm && $(MAKE) fast + cd clover && $(MAKE) fast + $(FAST_MAKE) bqcd + +clean: # also removes the archive written by the ../kernel_A.a rule (plus the older kernel-bqcd.a name) + rm -f bqcd.[0-9][0-9][0-9].* diag.[0-9][0-9] core app.rif + rm -f random_test random_test.dump random_test.out + rm -f test_echo + rm -f a.out out out1 out2 + rm -f ../kernel_A.a ../kernel-bqcd.a + +tidy: clean + rm -f *.[Toid] *.f90 *.mod work.pc work.pcl + +clobber: tidy + rm -f bqcd + $(MAKE) clobber_libs + cd modules && $(MAKE) clean + +Modules: + cd modules && $(MAKE) + +libd: + cd d && $(MAKE) + +libclover: $(MODULES) + cd clover && $(MAKE) + +libs: + cd d && $(MAKE) + cd comm && $(MAKE) + cd clover && $(MAKE) + +clean_libs: + cd d && $(MAKE) clean + cd comm && $(MAKE) clean + cd clover && $(MAKE) clean + +clobber_libs: + cd d && $(MAKE) clobber + cd comm && $(MAKE) clobber + cd clover && $(MAKE) clobber + +the_ranf_test: ranf.o + $(FPP) $(FPPFLAGS) ranf_test.F90 ranf_test.f90 + $(F90) ranf_test.f90 ranf.o + ./a.out | diff - ranf_test.reference + +test_echo: test_echo.o service.o + $(F90) -o $@ $(LDFLAGS) test_echo.o service.o + +prep: + rm -f Makefile.var service.F90 + ln -s platform/Makefile-$(PLATFORM).var Makefile.var + ln -s platform/service-$(PLATFORM).F90 service.F90 + +prep-altix: + $(MAKE) prep PLATFORM=altix + +prep-bgl: + $(MAKE) prep PLATFORM=bgl + +prep-cray: + $(MAKE) prep PLATFORM=cray + +prep-hitachi: + $(MAKE) prep PLATFORM=hitachi + +prep-hitachi-omp: + $(MAKE) prep PLATFORM=hitachi-omp + +prep-ibm: + $(MAKE) prep PLATFORM=ibm + +prep-hp: + $(MAKE) prep PLATFORM=hp + +prep-intel: + $(MAKE) prep PLATFORM=intel + +prep-nec: + $(MAKE) prep PLATFORM=nec + +prep-sun: + $(MAKE) prep PLATFORM=sun + 
+#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.defs.in b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.defs.in new file mode 100644 index 0000000000000000000000000000000000000000..574fc5d616c56d9a63511edd4fdc39102cead3f5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.defs.in @@ -0,0 +1,32 @@ +d3_buffer_vol = #d3_buffer_vol# + +SHELL = #SHELL# + +FPP = #FPP# +FPPFLAGS = #FPPFLAGS# + +F90 = #MPI_F90# +FFLAGS = #F90FLAGS# + +CC = #MPI_CC# +CFLAGS = #CFLAGS# + +AR = #AR# +ARFLAGS = #ARFLAGS# + +RANLIB = #RANLIB# + +LDFLAGS = #LDFLAGS# +SYSLIBS = #SYSLIBS# + +FAST_MAKE = #FAST_MAKE# + +RM = #RM# + +CKSUM_O = #CKSUM_O# +RANDOM_O = #RANDOM_O# +UUU_O = #UUU_O# + +LIBD = #KA_LIBD# +LIBCOMM = #KA_LIBCOMM# +LIBCLOVER = #KA_LIBCLOVER# \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..b1ee85d3623d75eb8c53b99f22757e8de8c6e937 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.in @@ -0,0 +1,176 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile +# +#=============================================================================== + +#include Makefile.var +include Makefile.defs + +.SUFFIXES: +.SUFFIXES: .o .F90 .c + + +.F90.f90: + $(FPP) $(FPPFLAGS) $< > $@ + +.F90.o: + $(FPP) $(FPPFLAGS) $< > $*.f90 + $(F90) -c $(FFLAGS) $*.f90 + +MODULES_DIR = modules + +MODULES = modules/*.o + +OBJS = \ + action.o \ + cg.o \ + checks.o \ + $(CKSUM_O) \ + conf.o \ + conf_info.o \ + cooling.o \ + 
dsd.o \ + dsg.o \ + dsf.o \ + dsf1.o \ + dsf2.o \ + files.o \ + flip_bc.o \ + hmc.o \ + hmc_init_p.o \ + hmc_init_phi.o \ + hmc_integrator.o \ + hmc_forces.o \ + hmc_leap_frog.o \ + hmc_test.o \ + hmc_u.o \ + h_mult.o \ + index.o \ + index2.o \ + init_common.o \ + init_modules.o \ + iteration_count.o \ + mc.o \ + misc.o \ + mre.o \ + mtdagmt.o \ + m_tilde.o \ + polyakov_loop.o \ + $(RANDOM_O) \ + sc.o \ + service.o \ + staple.o \ + su3.o \ + swap.o \ + timing.o \ + traces.o \ + $(UUU_O) \ + w_mult.o \ + xyzt2i.o + +LIBS = d/$(LIBD) comm/$(LIBCOMM) clover/$(LIBCLOVER) + +bqcd: bqcd.o $(MODULES) $(OBJS) $(LIBS) + $(F90) -o $@ $(LDFLAGS) bqcd.o $(MODULES) $(OBJS) $(LIBS) $(SYSLIBS) + + +fast: + cd modules && $(MAKE) + cd d && $(MAKE) fast + cd comm && $(MAKE) fast + cd clover && $(MAKE) fast + $(FAST_MAKE) bqcd + mv bqcd #EXECNAME# +# mv bqcd ../BQCD_SGI-ALTIX_cname_SGI-ALTIX.exe + +clean: + rm -f bqcd.[0-9][0-9][0-9].* diag.[0-9][0-9] core app.rif + rm -f random_test random_test.dump random_test.out + rm -f test_echo + rm -f a.out out out1 out2 + +tidy: clean + rm -f *.[Toid] *.f90 *.mod work.pc work.pcl + +clobber: tidy + rm -f bqcd + $(MAKE) clobber_libs + cd modules && $(MAKE) clean + +Modules: + cd modules && $(MAKE) + +libd: + cd d && $(MAKE) + +libclover: $(MODULES) + cd clover && $(MAKE) + +libs: + cd d && $(MAKE) + cd comm && $(MAKE) + cd clover && $(MAKE) + +clean_libs: + cd d && $(MAKE) clean + cd comm && $(MAKE) clean + cd clover && $(MAKE) clean + +clobber_libs: + cd d && $(MAKE) clobber + cd comm && $(MAKE) clobber + cd clover && $(MAKE) clobber + +the_ranf_test: ranf.o + $(FPP) $(FPPFLAGS) ranf_test.F90 ranf_test.f90 + $(F90) ranf_test.f90 ranf.o + ./a.out | diff - ranf_test.reference + +test_echo: test_echo.o service.o + $(F90) -o $@ $(LDFLAGS) test_echo.o service.o + +prep: + rm -f Makefile.var service.F90 + ln -s platform/Makefile-$(PLATFORM).var Makefile.var + ln -s platform/service-$(PLATFORM).F90 service.F90 + +prep-altix: + $(MAKE) prep 
PLATFORM=altix + +prep-bgl: + $(MAKE) prep PLATFORM=bgl + +prep-cray: + $(MAKE) prep PLATFORM=cray + +prep-hitachi: + $(MAKE) prep PLATFORM=hitachi + +prep-hitachi-omp: + $(MAKE) prep PLATFORM=hitachi-omp + +prep-ibm: + $(MAKE) prep PLATFORM=ibm + +prep-hp: + $(MAKE) prep PLATFORM=hp + +prep-intel: + $(MAKE) prep PLATFORM=intel + +prep-nec: + $(MAKE) prep PLATFORM=nec + +prep-sun: + $(MAKE) prep PLATFORM=sun + +#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.var b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.var new file mode 100644 index 0000000000000000000000000000000000000000..4058c3904be9f1b63ec9cc2a5939e6a3ac34f158 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/Makefile.var @@ -0,0 +1,108 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 2005, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# Makefile-altix.var - settings on SGI-Altix +# +#------------------------------------------------------------------------------- + +timing = 1 +mpi = 1 +omp = 1 +shmem = +shmempi = +debug = +libd = 2 +d3_buffer_vol = 32*32*16*16 + +#------------------------------------------------------------------------------- + +SHELL = /bin/ksh + +FPP = mpif90 -g -E +FPP2 = icc -E -C -P +F90 = mpif90 +CC = mpicc +AR = ar +RANLIB = echo + +MODULES_FLAG = -I$(MODULES_DIR) + +MYFLAGS = -DINTEL -DALTIX +FFLAGS_STD= $(MODULES_FLAG) +CFLAGS_STD= -DNamesToLower_ +ARFLAGS = rv + +LDFLAGS = -Vaxlib +SYSLIBS = + +FAST_MAKE = gmake -j 8 + +CKSUM_O = cksum.o +RANDOM_O = ran.o ranf.o +UUU_O = uuu_f90.o + +LIBD = +LIBCOMM = lib_single_pe.a +LIBCLOVER = libclover.a + +#------------------------------------------------------------------------------- + +ifdef timing + 
MYFLAGS += -DTIMING +endif + +ifdef mpi + LIBCOMM = lib_mpi.a +endif + +ifdef omp + F90 += -openmp + MYFLAGS += -D_OPENMP +endif + +ifdef shmem + LDFLAGS += -lsma + LIBCOMM = lib_shmem.a +endif + +ifdef shmempi + LDFLAGS += -lsma + LIBCOMM = lib_shmempi.a +endif + +ifdef debug + FFLAGS = -g -O0 $(FFLAGS_STD) + CFLAGS = -g -O0 $(CFLAGS_STD) +else + FFLAGS = -O2 $(FFLAGS_STD) + CFLAGS = -O2 $(CFLAGS_STD) +endif + +ifeq ($(libd),1) + LIBD = libd.a + MYFLAGS += -DD3_BUFFER_VOL=1 +endif + +ifeq ($(libd),2) + LIBD = libd2.a + MYFLAGS += -DD3_BUFFER_VOL=1 +endif + +ifeq ($(libd),21) + LIBD = libd21.a + MYFLAGS += -DD3_BUFFER_VOL=1 +endif + +ifeq ($(libd),3) + LIBD = libd3.a + MYFLAGS += -DD3_BUFFER_VOL='$(d3_buffer_vol)' +endif + +#=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/action.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/action.F90 new file mode 100644 index 0000000000000000000000000000000000000000..24ad7a1c5268a8a0adc7042d49432513986f4769 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/action.F90 @@ -0,0 +1,130 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! action.F90 - calculation of actions +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +REAL function sf(para, conf) ! 
returns S_f + + use typedef_hmc + use module_p_interface + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + + P_SPINCOL_FIELD, save :: a, b + + REAL, external :: dotprod, clover_action + integer :: iterations + external :: mtdagmt + + if (para%kappa == ZERO) then + sf = ZERO + else + ALLOCATE_SC_FIELD(a) + ALLOCATE_SC_FIELD(b) + + call flip_bc(conf%u) + + call sc_copy(a, conf%phi) ! A = phi + + call cg(mtdagmt, a, conf%phi, para, conf, iterations) ! A = inv(M~+ M~) Phi + call mtil(b, a, para, conf) ! B = M~ A + + sf = dotprod(b, b, SIZE_SC_FIELD) + + call flip_bc(conf%u) + endif + + if (para%csw_kappa /= ZERO) sf = sf + clover_action(conf%b(1,1,ODD)) +end + +!------------------------------------------------------------------------------- +REAL function sg(u) ! returns S_g + + use module_nn + use module_vol + implicit none + + GAUGE_FIELD :: u + REAL :: plaq, global_sum, p + SU3 :: uuu + integer :: i, e, o, mu, nu, j1, j2 + REAL, external :: Re_Tr_uu + + TIMING_START(timing_bin_plaq) + + plaq = 0 + + do mu = 1, DIM + do e = EVEN, ODD + o = EVEN + ODD - e + do nu = mu + 1, DIM + p = ZERO + !$omp parallel do reduction(+: p) private(j1, j2, uuu) + do i = 1, VOLH + + ! (j2,o) --<-- x nu + ! | | + ! v ^ ^ + ! | | | + ! (i,e) -->-- (j1,o) x--> mu + + + j1 = nn(i, e, mu, FWD) + j2 = nn(i, e, nu, FWD) + + uuu = 0 + call uuu_fwd(uuu, u(1, 1, j1, o, nu), & + u(1, 1, j2, o, mu), & + u(1, 1, i, e, nu)) + + p = p + Re_Tr_uu(uuu, u(1, 1, i, e, mu)) + enddo + !$omp end parallel do + plaq = plaq + p + enddo + enddo + enddo + + plaq = global_sum(plaq) + + sg = (6 * volume) - plaq / THREE + + TIMING_STOP(timing_bin_plaq) + +end + +!------------------------------------------------------------------------------- +REAL function sp(p) ! 
returns action of momenta p + + use module_vol + implicit none + REAL, external :: dotprod + GENERATOR_FIELD :: p + + integer :: mu, eo + + sp = ZERO + do mu = 1, DIM + do eo = EVEN, ODD + sp = sp + dotprod(p(1, 1, eo, mu), p(1, 1, eo, mu), NGEN * volh) + enddo + enddo + sp = sp * HALF + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.F90 new file mode 100644 index 0000000000000000000000000000000000000000..9141c4c2f7c6ad61a5a654ffd8f896df441d13aa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.F90 @@ -0,0 +1,534 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! bqcd.F90 - main program and read/write of parameters +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- + +! JuBE +! use kernel_a as a subroutine in the qcd-bench, this was the main function +! in the original code +subroutine kernel_a() + + use typedef_flags + use typedef_para + use module_input + use module_function_decl + implicit none + + type(type_para) :: para + type(hmc_conf), dimension(MAX_TEMPER) :: conf + type(type_flags) :: flags + SECONDS :: time0, sekunden + integer :: kernel_number + + kernel_number = 0 + +! JuBE +! call jube initial function + call jube_kernel_init(kernel_number) + +! JuBE +! set flags%input to the input file name of this kernel: kernel_A.input + flags%input = "kernel_A.input" + + time0 = sekunden() ! start/initialize timer + + TIMING_START(timing_bin_total) + + call comm_init() + +! JuBE +! 
there is no need for the following call, we ignore all cmd line arguments, none +! of them but the input file name (set above) is relevant for the benchmark +! call get_flags(flags) + + call begin(UREC, "Job") + call input_read(flags%input) + call init_para(para, flags) + call init_counter(para, flags) + call init_ran(para, flags) + call init_cooling(input%measure_cooling_list) + + call set_fmt_ensemble(para%n_temper) + call check_fmt(para%run, para%n_temper, para%maxtraj, para%L(4) - 1) + + call init_common(para) + call init_modules() + + call write_header(para) + + call init_flip_bc() + call init_cg_para(para%cg_rest, para%cg_maxiter, para%cg_log) + call init_cg_stat() + call init_xbound() + call init_confs(para, conf) + + call check_former(para%n_temper, conf) + + +! JuBE +! call jube kernel run function + call jube_kernel_run() + + call mc(para, conf) + !!call xbound_test() + +! JuBE +! call jube kernel finalize function + call jube_kernel_finalize() + + + call conf_write(.true., para, conf) + + call write_counter(para%maxtraj) + call write_ran() + + TIMING_STOP(timing_bin_total) + + call write_footer(time0) + call end_A(UREC, "Job") + + call comm_finalize() + +! JuBE +! call jube kernel end function + call jube_kernel_end() + +end subroutine kernel_a + +!------------------------------------------------------------------------------- +subroutine init_para(para, flags) + + ! initialises module_para, module_switches and module_mre + + use typedef_flags + use typedef_para + use module_bqcd + use module_input + use module_mre + use module_switches + implicit none + + type(type_para) :: para + type(type_flags) :: flags + integer :: i + logical :: quenched, dynamical, clover, h_ext + + quenched = .false. + dynamical = .false. + clover = .false. + h_ext = .false. 
+ + para%run = input%run + para%L = input%lattice + para%NPE = input%processes + para%bc_fermions = input%boundary_conditions_fermions + para%gamma_index = input%gamma_index + para%n_temper = input%ensembles + para%nstd = input%tempering_steps_without + para%nforce = input%hmc_accept_first + para%ntraj = input%mc_steps + para%maxtraj = input%mc_total_steps + para%nsave = input%mc_save_frequency + para%c_cg_rest = input%solver_rest + para%cg_maxiter = input%solver_maxiter + para%cg_log = input%solver_ignore_no_convergence + mre_n_vec = input%solver_mre_vectors + + call check_bc_fermions(para%bc_fermions, para%gamma_index) + + read(para%c_cg_rest, *) para%cg_rest + + if (para%n_temper <= 0) call die("init_para(): n_temper <= 0") + if (para%n_temper > MAX_TEMPER) call die("init_para(): n_temper > max_temper") + + do i = 1, para%n_temper + para%c_hmc(i)%beta = input%beta(i) + para%c_hmc(i)%kappa = input%kappa(i) + para%c_hmc(i)%csw = input%csw(i) + para%c_hmc(i)%h = input%h(i) + para%c_hmc(i)%traj_length = input%hmc_trajectory_length(i) + para%c_hmc(i)%ntau = input%hmc_steps(i) + para%c_hmc(i)%rho = input%hmc_rho(i) + para%c_hmc(i)%m_scale = input%hmc_m_scale(i) + para%info_file(i) = input%start_info_file(i) + + read(para%c_hmc(i)%beta, *) para%hmc(i)%beta + read(para%c_hmc(i)%kappa, *) para%hmc(i)%kappa + read(para%c_hmc(i)%csw, *) para%hmc(i)%csw + read(para%c_hmc(i)%h, *) para%hmc(i)%h + read(para%c_hmc(i)%traj_length,*) para%hmc(i)%traj_length + read(para%c_hmc(i)%ntau, *) para%hmc(i)%ntau + read(para%c_hmc(i)%rho, *) para%hmc(i)%rho + read(para%c_hmc(i)%m_scale, *) para%hmc(i)%m_scale + + if (para%hmc(i)%kappa == ZERO .and. 
para%hmc(i)%csw /= ZERO) then + para%hmc(i)%csw_kappa = para%hmc(i)%csw + para%c_hmc(i)%csw = "-1 (infinity)" + para%hmc(i)%csw = -1 + else + para%hmc(i)%csw_kappa = para%hmc(i)%csw * para%hmc(i)%kappa + call check_csw(para%hmc(i)%beta, para%hmc(i)%csw) + endif + + para%hmc(i)%tau = para%hmc(i)%traj_length / para%hmc(i)%ntau + + write(para%c_hmc(i)%csw_kappa, "(f20.15)") para%hmc(i)%csw_kappa + write(para%c_hmc(i)%tau, "(f20.15)") para%hmc(i)%tau + + if (para%hmc(i)%kappa == ZERO .and. para%hmc(i)%csw == ZERO) then + quenched = .true. + else + dynamical = .true. + endif + + if (para%hmc(i)%csw /= ZERO) clover = .true. + if (para%hmc(i)%h /= ZERO) h_ext = .true. + + para%hmc(i)%model = input%hmc_model + + if (para%hmc(i)%model == "A" .and. para%hmc(i)%rho /= ZERO) then + call warn("init_para(): model == A but rho /= 0") + endif + + if (para%hmc(i)%model /= "A" .and. para%hmc(i)%rho == ZERO) then + call warn("init_para(): model /= A but rho == 0") + endif + enddo + + select case (input%start_configuration) + case ("hot"); para%start = START_HOT + case ("cold"); para%start = START_COLD + case ("file"); para%start = START_FILE + case default + call die("init_para(): start_configuration must be {hot|cold|file}") + end select + + select case (input%start_random) + case ("random"); para%seed = -1 + case ("default"); para%seed = 0 + case default; read(input%start_random, *) para%seed + end select + + select case (input%tempering_swap_sequence) + case ("up"); para%swap_seq = SWAP_UP + case ("down"); para%swap_seq = SWAP_DOWN + case ("random"); para%swap_seq = SWAP_RANDOM + case default + call die("init_para(): tempering_swap_sequence must be {up|down|random}") + end select + + if (quenched .and. 
dynamical) call die("init_para(): quenched/dynamical mixed") + + if (para%nforce < 0) call die("init_para(): nforce < 0") + + if (flags%continuation_job) para%start = START_CONT + + + switches%quenched = quenched + switches%dynamical = dynamical + switches%clover = clover + switches%h_ext = h_ext + switches%hasenbusch = (input%hmc_model /= "A") + + if (quenched) switches%hasenbusch = .false. + + switches%tempering = .false. + switches%measure_polyakov_loop = .false. + switches%measure_traces = .false. + + if (input%ensembles > 1) switches%tempering = .true. + if (input%measure_polyakov_loop /= 0) switches%measure_polyakov_loop = .true. + if (input%measure_traces /= 0) switches%measure_traces = .true. + + if (input%hmc_test == 0) then + switches%hmc_test = .false. + else + switches%hmc_test = .true. + endif + +end subroutine init_para + +!------------------------------------------------------------------------------- +subroutine init_counter(para, flags) + + use typedef_flags + use typedef_para + use module_counter + use module_function_decl + implicit none + + type(type_para) :: para + type(type_flags) :: flags + FILENAME, external :: count_file, stop_file + + if (f_exist(stop_file())) then + call die("init_counter(): found stop file " // stop_file()) + endif + + counter%run = para%run + counter%j_traj = 0 + + if (flags%continuation_job) then + open(UCOUNT, file = count_file(), action = "read", status = "old") + read(UCOUNT, *) counter%run + read(UCOUNT, *) counter%job + read(UCOUNT, *) counter%traj + close(UCOUNT) + + if (counter%run /= para%run) call die("init_counter(): RUN inconsistent") + counter%job = counter%job + 1 + else + counter%run = para%run + counter%job = 1 + counter%traj = -para%nforce + endif + +end subroutine init_counter + +!------------------------------------------------------------------------------- +subroutine write_counter(maxtraj) + + use module_counter + use module_function_decl + implicit none + + integer :: maxtraj + FILENAME, external 
:: count_file, stop_file + + if (my_pe() /= 0) return + + open(UCOUNT, file = count_file(), action = "write") + write(UCOUNT, *) counter%run, " run" + write(UCOUNT, *) counter%job, " job" + write(UCOUNT, *) counter%traj, " traj" + close(UCOUNT) + + if (counter%traj >= maxtraj) then + open(UCOUNT, file = stop_file(), status = "unknown") + close(UCOUNT) + endif + +end subroutine write_counter + +!------------------------------------------------------------------------------- +subroutine write_header(para) + + use typedef_para + use module_bqcd + use module_counter + use module_function_decl + use module_input + use module_mre + use module_thread + implicit none + + type(type_para) :: para + integer :: i + character(len = 50) :: fmt + character(len = 4), external :: format_ensemble + + if (my_pe() == 0) then + + fmt = "(1x,a," // format_ensemble() // ",2a)" + + call begin(UREC, "Header") + + if (input%comment /= "") then + write(UREC, 405) "Comment", trim(input%comment) + endif + + write(UREC, 400) "Program", prog_name, prog_version + write(UREC, *) "Version_of_D ", version_of_d() + write(UREC, *) "Communication ", trim(comm_method()) + write(UREC, *) "Run ", para%run + write(UREC, *) "Job ", counter%job + write(UREC, 405) "Host", rechner() + write(UREC, 400) "Date", datum(), uhrzeit() + write(UREC, 410) "L ", para%L + write(UREC, 410) "NPE ", para%NPE + write(UREC, 410) "bc_fermions", para%bc_fermions + write(UREC, 410) "gamma_index", para%gamma_index + + + write(UREC, *) "Threads ", n_thread + write(UREC, *) "Start ", para%start + + if (para%start == START_FILE) then + do i = 1, para%n_temper + write(UREC, fmt) "StartConf_", i, " ", trim(para%info_file(i)) + enddo + endif + + write(UREC, *) "Seed ", para%seed + write(UREC, *) "Swap_seq", para%swap_seq + write(UREC, *) "N_force ", para%nforce + write(UREC, *) "N_traj ", para%ntraj + write(UREC, *) "N_save ", para%nsave + write(UREC, *) "N_temper", para%n_temper + + do i = 1, para%n_temper + write(UREC, fmt) "beta_", 
i, " ", trim(para%c_hmc(i)%beta) + write(UREC, fmt) "kappa_", i, " ", trim(para%c_hmc(i)%kappa) + write(UREC, fmt) "csw_", i, " ", trim(para%c_hmc(i)%csw) + write(UREC, fmt) "csw_kappa_", i, " ", trim(para%c_hmc(i)%csw_kappa) + write(UREC, fmt) "h_", i, " ", trim(para%c_hmc(i)%h) + write(UREC, fmt) "tau_", i, " ", trim(para%c_hmc(i)%tau) + write(UREC, fmt) "N_tau_", i, " ", trim(para%c_hmc(i)%ntau) + write(UREC, fmt) "traj_length_", i, " ", trim(para%c_hmc(i)%traj_length) + write(UREC, fmt) "rho_", i, " ", trim(para%c_hmc(i)%rho) + write(UREC, fmt) "m_scale_", i, " ", trim(para%c_hmc(i)%m_scale) + enddo + + write(UREC, *) "HMC_model ", para%hmc(1)%model + write(UREC, *) "REAL_kind ", RKIND + write(UREC, 405) "CG_rest ", trim(para%c_cg_rest) + write(UREC, *) "MRE_vectors ", mre_n_vec + + call end_A(UREC, "Header") + +400 format (3(1x,a)) +405 format (2(1x,a)) +410 format (1x,a,4i3) + + endif + +end subroutine write_header + +!------------------------------------------------------------------------------- +subroutine write_footer(time0) + + use module_function_decl + use module_thread + implicit none + + SEED :: seed + SECONDS :: time0, sekunden + + call ranget(seed) + + call begin(UREC, "Footer") + + if (my_pe() == 0) then + write(UREC, 400) "Date", datum(), uhrzeit() + write(UREC, *) "Seed", seed + write(UREC, 410) "CPU-Time", & + sekunden() - time0, "s on", num_pes() * n_thread, "CPUs" + endif + +400 format (3(1x,a)) +410 format (1x,a,1x,f8.1,1x,a,1x,i5,1x,a) + + TIMING_WRITE(UREC) + + call end_A(UREC, "Footer") + +end subroutine write_footer + +!------------------------------------------------------------------------------- +subroutine get_flags(flags) + + use typedef_cksum + use typedef_flags + use module_bqcd + use module_function_decl + use module_input + implicit none + + type(type_flags), intent(out) :: flags + + integer :: iarg, length, stat, narg + integer, external :: ipxfargc + character(len = 2) :: opt + + flags%continuation_job = .false. 
+ flags%show_version = .false. + + narg = ipxfargc() + + iarg = 1 + do while (iarg <= narg) + call pxfgetarg(iarg, opt, length, stat) + + if (opt(1:1) == "-") then + if (length > 2) call usage() + + select case (opt(2:2)) + case ("c") + flags%continuation_job = .true. + iarg = iarg + 1 + case ("I") + call input_dump(6) + call comm_finalize() + stop + case ("V") + flags%show_version = .true. + iarg = iarg + 1 + case default + call usage + end select + else + exit + endif + enddo + + if (flags%show_version) then + call version() + call comm_finalize() + stop + endif + + call take_arg(iarg, flags%input, narg) + if (narg >= iarg) call usage + +CONTAINS + + subroutine usage() + implicit none + call die("Usage: " // prog_name // " [-c] [-I] [-V] input") + end subroutine usage + + subroutine version() + implicit none + + if (my_pe() == 0) then + write(6,*) "This is ", prog_name, " ", prog_version + write(6,*) " input format: ", input_version + write(6,*) " conf info format:", conf_info_version + write(6,*) " MAX_TEMPER: ", MAX_TEMPER + write(6,*) " real kind: ", RKIND + write(6,*) " version of D: ", version_of_d() + write(6,*) " D3: buffer vol: ", get_d3_buffer_vol() + write(6,*) " communication: ", trim(comm_method()) + endif + end subroutine version + + subroutine take_arg(iarg, arg, narg) + implicit none + integer, intent(inout) :: iarg + character(len = *), intent(out) :: arg + integer, intent(in) :: narg + integer :: length, stat + + if (iarg > narg) call usage + call pxfgetarg(iarg, arg, length, stat) + if (length > len(arg)) then + call die("get_flags(): " // arg // ": argument too long") + endif + iarg = iarg + 1 + end subroutine take_arg + +end subroutine get_flags + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.pcl b/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.pcl new file mode 100644 index 
0000000000000000000000000000000000000000..10905e9f70bd1b3bd9c17624efe4251d574c8400 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/bqcd.pcl @@ -0,0 +1,2 @@ +work.pc +modules/work.pc diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/cg.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/cg.F90 new file mode 100644 index 0000000000000000000000000000000000000000..8ab5d17ce98c328687c12ea6631da68d1e2e4837 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/cg.F90 @@ -0,0 +1,167 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! cg.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_cg + + type type_cg_para + real :: rest + integer :: maxiter + integer :: log + end type type_cg_para + + type(type_cg_para), save :: cg_para + + type type_cg_stat + integer :: niter + integer :: niter_max + integer :: niter_tot + integer :: ncall + end type type_cg_stat + + type(type_cg_stat), save :: cg_stat + + integer, save :: cg_iterations_total = 0 ! used in timing.F90 +end + +!------------------------------------------------------------------------------- +subroutine cg(matrix_mult, x, b, para, conf, iterations) + + !
solves "matrix_mult * x = b" and returns number of iterations + + use module_cg + use module_function_decl + use module_p_interface + use module_vol + use typedef_hmc + implicit none + + external :: matrix_mult + SPINCOL_OVERINDEXED, intent(inout) :: x ! initial guess on entry, solution on exit + SPINCOL_OVERINDEXED, intent(in) :: b + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + integer, intent(out) :: iterations + + P_SPINCOL_OVERINDEXED, save :: r, p, aap + + REAL :: ak, bk, rtr, rtrold, paap + integer :: i, niter + character(72) :: msg + + TIMING_START(timing_bin_cg) + + ALLOCATE_SC_OVERINDEXED(r) + ALLOCATE_SC_OVERINDEXED(p) + ALLOCATE_SC_OVERINDEXED(aap) + + call matrix_mult(r, x, para, conf) ! r = A * x for the incoming x + + rtrold = ZERO + !$omp parallel do reduction(+: rtrold) + do i = 1, size_sc_field + r(i) = b(i) - r(i) + p(i) = r(i) + rtrold = rtrold + r(i)**2 + enddo + + rtrold = global_sum(rtrold) + + do niter = 1, cg_para%maxiter + call matrix_mult(aap, p, para, conf) + + paap = sc_dot(p, aap) + paap = global_sum(paap) + + ak = rtrold / paap + + rtr = ZERO + !$omp parallel do reduction(+: rtr) + do i = 1, size_sc_field + x(i) = x(i) + ak * p(i) + r(i) = r(i) - ak * aap(i) + rtr = rtr + r(i)**2 + enddo + + rtr = global_sum(rtr) + + if (rtr <= cg_para%rest) goto 9999 + + bk = rtr / rtrold + rtrold = rtr + + call sc_xpby(p, r, bk) !
p = r + bk * p + enddo + + niter = niter - 1 + + if (cg_para%log /= 2) then + write(msg, *) "cg(): no convergence; rtr = ", rtr + call die(msg) + endif + +9999 continue + + cg_stat%ncall = cg_stat%ncall + 1 + cg_stat%niter = niter + cg_stat%niter_max = max(cg_stat%niter_max, niter) + cg_stat%niter_tot = cg_stat%niter_tot + niter + cg_iterations_total = cg_iterations_total + niter + + iterations = niter + + TIMING_STOP(timing_bin_cg) +end + +!------------------------------------------------------------------------------- +subroutine init_cg_para(rest, maxiter, log) + + use module_cg + implicit none + real rest + integer maxiter, log + + cg_para%rest = rest + cg_para%maxiter = maxiter + cg_para%log = log + +end + +!------------------------------------------------------------------------------- +subroutine init_cg_stat() + + use module_cg + implicit none + + cg_stat%ncall = 0 + cg_stat%niter_max = 0 + cg_stat%niter_tot = 0 + +end + +!------------------------------------------------------------------------------- +subroutine get_cg_stat(ncall, niter_max, niter_tot) + + use module_cg + implicit none + integer ncall, niter_max, niter_tot + + ncall = cg_stat%ncall + niter_max = cg_stat%niter_max + niter_tot = cg_stat%niter_tot + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/checks.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/checks.F90 new file mode 100644 index 0000000000000000000000000000000000000000..8c8cd815dd6e1089ba95ef3aa3d80833755bc086 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/checks.F90 @@ -0,0 +1,60 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! checks.F90 +!
+!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine check_csw(beta, csw) + + implicit none + REAL, intent(in) :: beta, csw + REAL :: g, c + + if (beta == ZERO) return + if (csw == ZERO) return + + g = SIX / beta + c = ONE - 0.454 * g - 0.175 * g**2 + 0.012 * g**3 + 0.045 * g**4 + c = c / (ONE - 0.720 * g) + + if (abs(c - csw) > 0.00005) then + call warn("check_csw(): c_sw differs more than 0.00005 from ALPHA value") + endif + +end + +!------------------------------------------------------------------------------- +subroutine check_bc_fermions(bc_fermions, gamma_index) + + ! warns if the number of anti-periodic fermionic b.c. is 1 and + ! the anti-periodic direction is not the gamma_4 direction + + implicit none + integer, dimension(DIM), intent(in) :: bc_fermions, gamma_index + + integer :: i, i_anti, count + + count = 0 + do i = 1, DIM + if (bc_fermions(i) < 0) then + count = count + 1 + i_anti = i ! last anti-periodic direction seen + endif + enddo + + if (count == 1) then ! nested on purpose: i_anti is undefined unless count > 0 + if (gamma_index(i_anti) /= 4) call warn("check_bc_fermions(): anti-periodic b.c. not in gamma_4 direction") + endif + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/cksum.c b/qcd/part_cpu/applications/QCD/src/kernel_A/cksum.c new file mode 100644 index 0000000000000000000000000000000000000000..0b3c77470db5537b5b750e9a784a0474568b16ea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/cksum.c @@ -0,0 +1,154 @@ +/* +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! Adopted from: +! +! ================== +!
QCD SF/T3E PROGRAM +! ================== +! +! Calculate a modified cyclic redundancy check (CRC is specified by the +! POSIX.2 standard). Modification is necessary since I do not know how +! to handle a unsigned long in FORTRAN. Solution: CKSUM returns negative +! number if result is > LONG_MAX. +! +! CKSUM_GET() returns always positive numbers < LONG_MAX. (H.S.) +! +! +! Parts of the source come from cksum.c of the GNU text utilities (version +! 1.19) written by Q. Frank Xia. +! +! $Log: cksum.c,v $ +! Revision 1.1 2007/11/14 13:10:15 mallalen +! *** empty log message *** +! +! Revision 1.1 1997/12/04 10:31:09 pleiter +! Initial writing attempt +! +!----------------------------------------------------------------------------- +*/ + +#ifdef NamesToLower_ +# define CKSUM_INIT cksum_init_ +# define CKSUM_ADD cksum_add_ +# define CKSUM_GET cksum_get_ +#endif + +#ifdef NamesToLower +# define CKSUM_INIT cksum_init +# define CKSUM_ADD cksum_add +# define CKSUM_GET cksum_get +#endif + +#ifdef LongLong +# define INT8 long long +#else +# define INT8 long +#endif + +void CKSUM_INIT(void); +void CKSUM_ADD(void *, INT8 *); +void CKSUM_GET(INT8 *, INT8 *); + +static unsigned INT8 the_crc = 0; +static INT8 the_bytes = 0; + +static unsigned INT8 const crctab[256] = +{ + 0x0, + 0x04C11DB7, 0x09823B6E, 0x0D4326D9, 0x130476DC, 0x17C56B6B, + 0x1A864DB2, 0x1E475005, 0x2608EDB8, 0x22C9F00F, 0x2F8AD6D6, + 0x2B4BCB61, 0x350C9B64, 0x31CD86D3, 0x3C8EA00A, 0x384FBDBD, + 0x4C11DB70, 0x48D0C6C7, 0x4593E01E, 0x4152FDA9, 0x5F15ADAC, + 0x5BD4B01B, 0x569796C2, 0x52568B75, 0x6A1936C8, 0x6ED82B7F, + 0x639B0DA6, 0x675A1011, 0x791D4014, 0x7DDC5DA3, 0x709F7B7A, + 0x745E66CD, 0x9823B6E0, 0x9CE2AB57, 0x91A18D8E, 0x95609039, + 0x8B27C03C, 0x8FE6DD8B, 0x82A5FB52, 0x8664E6E5, 0xBE2B5B58, + 0xBAEA46EF, 0xB7A96036, 0xB3687D81, 0xAD2F2D84, 0xA9EE3033, + 0xA4AD16EA, 0xA06C0B5D, 0xD4326D90, 0xD0F37027, 0xDDB056FE, + 0xD9714B49, 0xC7361B4C, 0xC3F706FB, 0xCEB42022, 0xCA753D95, + 0xF23A8028, 0xF6FB9D9F, 
0xFBB8BB46, 0xFF79A6F1, 0xE13EF6F4, + 0xE5FFEB43, 0xE8BCCD9A, 0xEC7DD02D, 0x34867077, 0x30476DC0, + 0x3D044B19, 0x39C556AE, 0x278206AB, 0x23431B1C, 0x2E003DC5, + 0x2AC12072, 0x128E9DCF, 0x164F8078, 0x1B0CA6A1, 0x1FCDBB16, + 0x018AEB13, 0x054BF6A4, 0x0808D07D, 0x0CC9CDCA, 0x7897AB07, + 0x7C56B6B0, 0x71159069, 0x75D48DDE, 0x6B93DDDB, 0x6F52C06C, + 0x6211E6B5, 0x66D0FB02, 0x5E9F46BF, 0x5A5E5B08, 0x571D7DD1, + 0x53DC6066, 0x4D9B3063, 0x495A2DD4, 0x44190B0D, 0x40D816BA, + 0xACA5C697, 0xA864DB20, 0xA527FDF9, 0xA1E6E04E, 0xBFA1B04B, + 0xBB60ADFC, 0xB6238B25, 0xB2E29692, 0x8AAD2B2F, 0x8E6C3698, + 0x832F1041, 0x87EE0DF6, 0x99A95DF3, 0x9D684044, 0x902B669D, + 0x94EA7B2A, 0xE0B41DE7, 0xE4750050, 0xE9362689, 0xEDF73B3E, + 0xF3B06B3B, 0xF771768C, 0xFA325055, 0xFEF34DE2, 0xC6BCF05F, + 0xC27DEDE8, 0xCF3ECB31, 0xCBFFD686, 0xD5B88683, 0xD1799B34, + 0xDC3ABDED, 0xD8FBA05A, 0x690CE0EE, 0x6DCDFD59, 0x608EDB80, + 0x644FC637, 0x7A089632, 0x7EC98B85, 0x738AAD5C, 0x774BB0EB, + 0x4F040D56, 0x4BC510E1, 0x46863638, 0x42472B8F, 0x5C007B8A, + 0x58C1663D, 0x558240E4, 0x51435D53, 0x251D3B9E, 0x21DC2629, + 0x2C9F00F0, 0x285E1D47, 0x36194D42, 0x32D850F5, 0x3F9B762C, + 0x3B5A6B9B, 0x0315D626, 0x07D4CB91, 0x0A97ED48, 0x0E56F0FF, + 0x1011A0FA, 0x14D0BD4D, 0x19939B94, 0x1D528623, 0xF12F560E, + 0xF5EE4BB9, 0xF8AD6D60, 0xFC6C70D7, 0xE22B20D2, 0xE6EA3D65, + 0xEBA91BBC, 0xEF68060B, 0xD727BBB6, 0xD3E6A601, 0xDEA580D8, + 0xDA649D6F, 0xC423CD6A, 0xC0E2D0DD, 0xCDA1F604, 0xC960EBB3, + 0xBD3E8D7E, 0xB9FF90C9, 0xB4BCB610, 0xB07DABA7, 0xAE3AFBA2, + 0xAAFBE615, 0xA7B8C0CC, 0xA379DD7B, 0x9B3660C6, 0x9FF77D71, + 0x92B45BA8, 0x9675461F, 0x8832161A, 0x8CF30BAD, 0x81B02D74, + 0x857130C3, 0x5D8A9099, 0x594B8D2E, 0x5408ABF7, 0x50C9B640, + 0x4E8EE645, 0x4A4FFBF2, 0x470CDD2B, 0x43CDC09C, 0x7B827D21, + 0x7F436096, 0x7200464F, 0x76C15BF8, 0x68860BFD, 0x6C47164A, + 0x61043093, 0x65C52D24, 0x119B4BE9, 0x155A565E, 0x18197087, + 0x1CD86D30, 0x029F3D35, 0x065E2082, 0x0B1D065B, 0x0FDC1BEC, + 0x3793A651, 0x3352BBE6, 0x3E119D3F, 
0x3AD08088, 0x2497D08D, + 0x2056CD3A, 0x2D15EBE3, 0x29D4F654, 0xC5A92679, 0xC1683BCE, + 0xCC2B1D17, 0xC8EA00A0, 0xD6AD50A5, 0xD26C4D12, 0xDF2F6BCB, + 0xDBEE767C, 0xE3A1CBC1, 0xE760D676, 0xEA23F0AF, 0xEEE2ED18, + 0xF0A5BD1D, 0xF464A0AA, 0xF9278673, 0xFDE69BC4, 0x89B8FD09, + 0x8D79E0BE, 0x803AC667, 0x84FBDBD0, 0x9ABC8BD5, 0x9E7D9662, + 0x933EB0BB, 0x97FFAD0C, 0xAFB010B1, 0xAB710D06, 0xA6322BDF, + 0xA2F33668, 0xBCB4666D, 0xB8757BDA, 0xB5365D03, 0xB1F740B4 +}; + +void CKSUM_INIT(void) +{ + the_crc = 0; + the_bytes = 0; +} + +void CKSUM_ADD(void *memptr, INT8 *nbytes) +{ + register unsigned INT8 crc; + register INT8 i; + register unsigned char *cp; + + crc = the_crc; + cp = (unsigned char *) memptr; + for (i=0; i < *nbytes; i++) + crc = (crc << 8) ^ crctab[((crc >> 24) ^ *(cp++)) & 0xFF]; + + the_crc = crc; + the_bytes += *nbytes; +} + +void CKSUM_GET(INT8 *total_crc, INT8 *total_bytes) +{ + register unsigned INT8 crc; + register INT8 i; + + crc = the_crc; + + for (i = the_bytes; i > 0; i >>= 8) + crc = (crc << 8) ^ crctab[((crc >> 24) ^ i) & 0xFF]; + crc = (~crc & 0xFFFFFFFF); + + *total_crc = (INT8) crc; + *total_bytes = the_bytes; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/cksum_dummy.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/cksum_dummy.F90 new file mode 100644 index 0000000000000000000000000000000000000000..97fb8fda1b92d63dae060068a61611037a64fb39 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/cksum_dummy.F90 @@ -0,0 +1,35 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! cksum_dummy.F90 - dummy routines that can replace the real C routines +! 
+!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine cksum_init() + return +end + +!------------------------------------------------------------------------------- +subroutine cksum_add(i, j) + integer i(*) + CHECK_SUM j + return +end + +!------------------------------------------------------------------------------- +subroutine cksum_get(sum, bytes) + CHECK_SUM sum, bytes + sum = 0 + bytes = 0 +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..445b413cf94079e033efb4c609177f59e8c69ea8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/Makefile @@ -0,0 +1,67 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# clover/Makefile +# +#=============================================================================== + +include ../Makefile.defs + +fpp = $(FPP) -I.. 
$(FPPFLAGS) + +MODULES_DIR = ../modules + +.SUFFIXES: +.SUFFIXES: .a .o .F90 + +.F90.o: + $(fpp) $< > $*.f90 + $(F90) -c $(FFLAGS) -I$(MODULES_DIR) $*.f90 + +OBJS = \ + clover_action.o \ + clover_allocate.o \ + clover_bsa.o \ + clover_d.o \ + clover_f_mu_nu.o \ + clover_init.o \ + clover_inv.o \ + clover_mult_a.o \ + clover_mult_ao.o \ + clover_mult_b.o \ + clover_t_init.o \ + clover_ts.o \ + clover_uuu.o \ + clover_uuuu.o + +OBJS_CTEST = \ + ctest.o \ + clover_inv.o \ + clover_mult_a.o \ + clover_mult_ao.o \ + clover_mult_b.o + +$(LIBCLOVER): + +libclover.a: $(OBJS) + $(AR) $(ARFLAGS) $@ $(OBJS) + $(RANLIB) $@ + +fast: + $(FAST_MAKE) + +ctest: $(OBJS_CTEST) + $(F90) -o $@ $(OBJS_CTEST) + +clean: + rm -f *.[Tiod] *.f90 *.mod core work.pc work.pcl + +clobber: clean + rm -f libclover.a ctest diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/bqcd.pcl b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/bqcd.pcl new file mode 100644 index 0000000000000000000000000000000000000000..906244500b31700684482c3dcfd32f6cec4279db --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/bqcd.pcl @@ -0,0 +1,2 @@ +work.pc +../modules/work.pc diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover.h b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover.h new file mode 100644 index 0000000000000000000000000000000000000000..9f22a2453f2aaceb37ffb33d9d8f03b4e08f0ac5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover.h @@ -0,0 +1,166 @@ +#ifdef CLOVER_AS_COMPLEX_ARRAY + +# define A11 Re(a(1,J,i)) +# define A22 Im(a(1,J,i)) +# define A33 Re(a(11,J,i)) +# define A44 Im(a(11,J,i)) +# define A55 Re(a(17,J,i)) +# define A66 Im(a(17,J,i)) + +# define A12 a(2,J,i) +# define A13 a(3,J,i) +# define A14 a(4,J,i) +# define A15 a(5,J,i) +# define A16 a(6,J,i) + +# define A23 a(7,J,i) +# define A24 a(8,J,i) +# define A25 a(9,J,i) +# define A26 a(10,J,i) + +# define A34 a(12,J,i) +# define A35 a(13,J,i) +# define A36 a(14,J,i) + +#
define A45 a(15,J,i) +# define A46 a(16,J,i) + +# define A56 a(18,J,i) + +# define B11 Re(b(16,J,i)) +# define B22 Im(b(16,J,i)) +# define B33 Re(b(17,J,i)) +# define B44 Im(b(17,J,i)) +# define B55 Re(b(18,J,i)) +# define B66 Im(b(18,J,i)) + +# define B21 b(1,J,i) + +# define B31 b(2,J,i) +# define B32 b(3,J,i) + +# define B41 b(4,J,i) +# define B42 b(5,J,i) +# define B43 b(6,J,i) + +# define B51 b(7,J,i) +# define B52 b(8,J,i) +# define B53 b(9,J,i) +# define B54 b(10,J,i) + +# define B61 b(11,J,i) +# define B62 b(12,J,i) +# define B63 b(13,J,i) +# define B64 b(14,J,i) +# define B65 b(15,J,i) + +#else + +# define A11 a%i11 +# define A22 a%i22 +# define A33 a%i33 +# define A44 a%i44 +# define A55 a%i55 +# define A66 a%i66 + +# define A12 a%i12 +# define A13 a%i13 +# define A14 a%i14 +# define A15 a%i15 +# define A16 a%i16 + +# define A23 a%i23 +# define A24 a%i24 +# define A25 a%i25 +# define A26 a%i26 + +# define A34 a%i34 +# define A35 a%i35 +# define A36 a%i36 + +# define A45 a%i45 +# define A46 a%i46 + +# define A56 a%i56 + +# define B11 b%i11 +# define B22 b%i22 +# define B33 b%i33 +# define B44 b%i44 +# define B55 b%i55 +# define B66 b%i66 + +# define B21 b%i21 + +# define B31 b%i31 +# define B32 b%i32 + +# define B41 b%i41 +# define B42 b%i42 +# define B43 b%i43 + +# define B51 b%i51 +# define B52 b%i52 +# define B53 b%i53 +# define B54 b%i54 + +# define B61 b%i61 +# define B62 b%i62 +# define B63 b%i63 +# define B64 b%i64 +# define B65 b%i65 + +#endif + +# define A21 conjg(A12) +# define A31 conjg(A13) +# define A41 conjg(A14) +# define A51 conjg(A15) +# define A61 conjg(A16) + +# define A32 conjg(A23) +# define A42 conjg(A24) +# define A52 conjg(A25) +# define A62 conjg(A26) + +# define A43 conjg(A34) +# define A53 conjg(A35) +# define A63 conjg(A36) + +# define A54 conjg(A45) +# define A64 conjg(A46) + +# define A65 conjg(A56) + +# define B12 conjg(B21) + +# define B13 conjg(B31) +# define B23 conjg(B32) + +# define B14 conjg(B41) +# define B24 
conjg(B42) +# define B34 conjg(B43) + +# define B15 conjg(B51) +# define B25 conjg(B52) +# define B35 conjg(B53) +# define B45 conjg(B54) + +# define B16 conjg(B61) +# define B26 conjg(B62) +# define B36 conjg(B63) +# define B46 conjg(B64) +# define B56 conjg(B65) + +# define SC1 1, 1 +# define SC2 1, 2 +# define SC3 1, 3 +# define SC4 2, 1 +# define SC5 2, 2 +# define SC6 2, 3 +# define SC7 3, 1 +# define SC8 3, 2 +# define SC9 3, 3 +# define SC10 4, 1 +# define SC11 4, 2 +# define SC12 4, 3 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_action.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_action.F90 new file mode 100644 index 0000000000000000000000000000000000000000..a9fbd0450cbba703fd61728ca1a91fd003fead18 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_action.F90 @@ -0,0 +1,51 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! clover_action.F90 - calculates: -2 Tr(log(T_oo)) +! +!------------------------------------------------------------------------------- +# include "defs.h" +# include "clover.h" + +!------------------------------------------------------------------------------- +REAL function clover_action(b) + + use typedef_clover + use module_vol + implicit none + + type(type_clover_b) :: b(2, volh) + integer :: i + REAL :: s, global_sum + + + s = ZERO + + !$omp parallel do reduction(+: s) + do i = 1, volh + s = s + log(det(b(1, i)) * det(b(2, i))) + enddo + + clover_action = TWO * global_sum(s) + + +CONTAINS + + REAL function det(b) ! 
returns (1 / det) + + type(type_clover_b) :: b + + det = B11 * B22 * B33 * B44 * B55 * B66 + + end function det + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_allocate.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_allocate.F90 new file mode 100644 index 0000000000000000000000000000000000000000..60475de54ce8caf5937248dac933d897ec09de9a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_allocate.F90 @@ -0,0 +1,40 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! clover_allocate.F90 - allocation of clover arrays +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine allocate_clover_field_a(a) + + use typedef_clover + use module_vol + implicit none + P_CLOVER_FIELD_A :: a + + allocate(a(2, volh, EVEN:ODD)) + +end + +!------------------------------------------------------------------------------- +subroutine allocate_clover_field_b(b) + + use typedef_clover + use module_vol + implicit none + P_CLOVER_FIELD_B :: b + + allocate(b(2, volh, EVEN:ODD)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa.F90 new file mode 100644 index 0000000000000000000000000000000000000000..499abb10c889fd6d8c6b673a8fdd676a0e6a5f6c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa.F90 @@ -0,0 +1,165 @@ 
!-------------------------------------------------------------------------------
! clover_bsa(mu, nu, w, b, a)
!
!   Dispatcher: w = transposed(conjg(b) sigma_mu_nu a).
!   Forwards to the specialised routine clover_bsa_<mu><nu> for every ordered
!   pair mu /= nu with mu, nu in 1..4.  Any other (mu, nu) is silently
!   ignored and w is left untouched.
!-------------------------------------------------------------------------------
subroutine clover_bsa(mu, nu, w, b, a)

  use module_vol
  implicit none

  integer       :: mu, nu
  SU3_FIELD     :: w
  SPINCOL_FIELD :: b, a

  select case (mu)

  case (1)
     select case (nu)
     case (2)
        call clover_bsa_12(w, b, a)
     case (3)
        call clover_bsa_13(w, b, a)
     case (4)
        call clover_bsa_14(w, b, a)
     end select

  case (2)
     select case (nu)
     case (1)
        call clover_bsa_21(w, b, a)
     case (3)
        call clover_bsa_23(w, b, a)
     case (4)
        call clover_bsa_24(w, b, a)
     end select

  case (3)
     select case (nu)
     case (1)
        call clover_bsa_31(w, b, a)
     case (2)
        call clover_bsa_32(w, b, a)
     case (4)
        call clover_bsa_34(w, b, a)
     end select

  case (4)
     select case (nu)
     case (1)
        call clover_bsa_41(w, b, a)
     case (2)
        call clover_bsa_42(w, b, a)
     case (3)
        call clover_bsa_43(w, b, a)
     end select

  end select

end
!-------------------------------------------------------------------------------
! clover_bsa_13(w, b, a) - the (mu, nu) = (1, 3) case of clover_bsa.
!
!   clover_bsa_head.h90 supplies the declarations, the statement function
!   i_times(c) = cmplx(-aimag(c), real(c)) and opens the loops over the site
!   index i (1..volh, OpenMP parallel) and the colour index ca (1..NCOL).
!   The four assignments below build a1..a4 from a(:, ca, i).
!   clover_bsa_tail.h90 then loops over cb and stores
!       w(ca, cb, i) = sum_k a<k> * conjg(b(k, cb, i)),
!   closes the loops and ends the subroutine.
!-------------------------------------------------------------------------------
subroutine clover_bsa_13(w, b, a)

# include "clover_bsa_head.h90"
        ! components swapped pairwise (1 <-> 2, 3 <-> 4) and multiplied by -i / +i
        a1 = -i_times(a(2, ca, i))
        a2 =  i_times(a(1, ca, i))
        a3 = -i_times(a(4, ca, i))
        a4 =  i_times(a(3, ca, i))
# include "clover_bsa_tail.h90"
!-------------------------------------------------------------------------------
! clover_bsa_42(w, b, a) - the (mu, nu) = (4, 2) case of clover_bsa.
!
!   Same head/tail scheme as the other clover_bsa_<mu><nu> routines:
!   clover_bsa_head.h90 opens the loops over i and ca, and
!   clover_bsa_tail.h90 accumulates
!       w(ca, cb, i) = sum_k a<k> * conjg(b(k, cb, i))
!   and closes the loops and the subroutine.
!   Every sign is the opposite of the corresponding line in clover_bsa_24.
!-------------------------------------------------------------------------------
subroutine clover_bsa_42(w, b, a)

# include "clover_bsa_head.h90"
        ! component order reversed (1 <-> 4, 2 <-> 3), multiplied by +i / -i
        a1 =  i_times(a(4, ca, i))
        a2 = -i_times(a(3, ca, i))
        a3 =  i_times(a(2, ca, i))
        a4 = -i_times(a(1, ca, i))
# include "clover_bsa_tail.h90"
statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + !$omp parallel do private(ca, cb, a1, a2, a3, a4) + do i = 1, volh + do ca = 1, NCOL diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa_tail.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa_tail.h90 new file mode 100644 index 0000000000000000000000000000000000000000..66092341dae7a1266365740e4405af306c5c8abc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_bsa_tail.h90 @@ -0,0 +1,10 @@ + do cb = 1, NCOL + w(ca, cb, i) = a1 * conjg(b(1, cb, i)) & + + a2 * conjg(b(2, cb, i)) & + + a3 * conjg(b(3, cb, i)) & + + a4 * conjg(b(4, cb, i)) + enddo + enddo + enddo + +end diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_d.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_d.F90 new file mode 100644 index 0000000000000000000000000000000000000000..71611a1c43b24aebdaa31bc1668f3b33ecaf1587 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_d.F90 @@ -0,0 +1,255 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! clover_d.F90 - derivative of clover term +! +!------------------------------------------------------------------------------- +! +! E -- 2 -- B +! | | A = x +! 3 1 ^ nu B = x + mu^ + nu^ +! | | | C = x + mu^ - nu^ +! A -- 0 -- D x --> mu +! | | D = x + mu^ +! 4 6 E = x + nu^ +! | | F = x - nu^ +! F -- 5 -- C +! 
!-------------------------------------------------------------------------------
! clover_dsd(eo, p, b, s, u)
!
!   Driver for the clover derivative built from the clover field b.
!
!   eo : EVEN/ODD property of "b"; selects the slice b(1, 1, eo)
!   p  : generator field, updated by clover_d
!   b  : clover field (type B)
!   s  : real factor handed on to clover_d
!   u  : gauge field
!
!   Steps: clover_t_init() fills the work field t from b(1, 1, eo).  Then
!   clover_ts() is called once for each of the six planes (mu < nu), each
!   writing one (EVEN|ODD, 1..3) slice of the saved work array w, followed
!   directly by xbound_g() on that slice.  Finally clover_d() is called
!   with the completed w.
!-------------------------------------------------------------------------------
subroutine clover_dsd(eo, p, b, s, u)

  use typedef_clover
  use module_p_interface
  use module_vol
  implicit none

  integer :: eo          ! EVEN/ODD property of "b"
  GENERATOR_FIELD :: p
  CLOVER_FIELD_B :: b
  REAL :: s
  GAUGE_FIELD :: u

  P_GAUGE_FIELD, save :: w     ! use existing data structure

  CLOVER_FIELD_C :: t
!dir$ cache_align t


  TIMING_START(timing_bin_clover_dsd)

  ! macro from defs.h; presumably allocates w only on first use (w is
  ! "save") - TODO confirm against the macro definition
  ALLOCATE_G_FIELD(w)

  call clover_t_init(t, b(1, 1, eo))

  ! plane (mu, nu) -> slice of w; the same slice layout is read in clover_d
  call clover_ts(1, 2, w(1, 1, 1, EVEN, 1), t) ; call xbound_g(w, EVEN, 1)
  call clover_ts(1, 3, w(1, 1, 1, ODD, 1), t) ; call xbound_g(w, ODD, 1)
  call clover_ts(1, 4, w(1, 1, 1, EVEN, 2), t) ; call xbound_g(w, EVEN, 2)
  call clover_ts(2, 3, w(1, 1, 1, ODD, 2), t) ; call xbound_g(w, ODD, 2)
  call clover_ts(2, 4, w(1, 1, 1, EVEN, 3), t) ; call xbound_g(w, EVEN, 3)
  call clover_ts(3, 4, w(1, 1, 1, ODD, 3), t) ; call xbound_g(w, ODD, 3)

  call clover_d(eo, p, s, u, w)

  TIMING_STOP(timing_bin_clover_dsd)
end
!-------------------------------------------------------------------------------
! clover_d(eo, p, s, u, w)
!
!   Calls clover_d_mu_nu once for each of the six planes (mu < nu).  The
!   work field for plane number k is the slice w(1, 1, 1, <eo slot>, <dir>)
!   in the order
!       (1,2)->(EVEN,1)  (1,3)->(ODD,1)  (1,4)->(EVEN,2)
!       (2,3)->(ODD,2)   (2,4)->(EVEN,3) (3,4)->(ODD,3)
!-------------------------------------------------------------------------------
subroutine clover_d(eo, p, s, u, w)

  use module_vol
  implicit none

  integer         :: eo
  GENERATOR_FIELD :: p
  REAL            :: s
  GAUGE_FIELD     :: u, w

  integer, parameter :: n_planes = 6
  integer, parameter :: plane_mu(n_planes) = (/ 1, 1, 1, 2, 2, 3 /)
  integer, parameter :: plane_nu(n_planes) = (/ 2, 3, 4, 3, 4, 4 /)

  integer :: k, slot_eo, slot_dir

  do k = 1, n_planes

     ! odd k -> EVEN slot, even k -> ODD slot; two planes per direction slot
     if (mod(k, 2) == 1) then
        slot_eo = EVEN
     else
        slot_eo = ODD
     endif
     slot_dir = (k + 1) / 2

     call clover_d_mu_nu(eo, plane_mu(k), plane_nu(k), p, s, u, &
                         w(1, 1, 1, slot_eo, slot_dir))
  enddo

end
!-------------------------------------------------------------------------------
! clover_d_same_eo(p, s, u0..u6, wa, wb, wc, wd, we, wf)
!
!   Resets p to ZERO and then accumulates eight contributions into it via
!   re_tr_j: four with weight s and four with weight -s.
!   Only wa, wb and wc are read here; wd, we and wf are accepted but unused,
!   so the argument list matches clover_d_diff_eo (both are passed to
!   clover_d_loop as the same procedure argument).
!-------------------------------------------------------------------------------
subroutine clover_d_same_eo(p, s, u0,u1,u2,u3,u4,u5,u6, wa,wb,wc, wd,we,wf)

  implicit none

  GENERATOR :: p
  REAL      :: s
  SU3       :: u0, u1, u2, u3, u4, u5, u6, wa, wb, wc, wd, we, wf

  SU3  :: m1, m2, prod
  REAL :: minus_s

  minus_s = -s

  p = ZERO

  ! --- first group: products of (u0, u1) and (u3, u2), weight +s ---

  call uu(m1, u0, u1)
  call uu(m2, u3, u2)

  call clover_uuu_udu(prod, m1, m2, wa)
  call re_tr_j(p, prod, s)

  call clover_uuu_uud(prod, wa, m2, m1)
  call re_tr_j(p, prod, s)

  call clover_uuu_uud(prod, m1, wb, m2)
  call re_tr_j(p, prod, s)

  call clover_uuu_uud(prod, m2, wb, m1)
  call re_tr_j(p, prod, s)

  ! --- second group: products of (u0, u6) and (u4, u5), weight -s ---

  call uud(m1, u0, u6)
  call udu(m2, u4, u5)

  call clover_uuu_uud(prod, wa, m2, m1)
  call re_tr_j(p, prod, minus_s)

  call clover_uuu_udu(prod, m1, m2, wa)
  call re_tr_j(p, prod, minus_s)

  call clover_uuu_uud(prod, m2, wc, m1)
  call re_tr_j(p, prod, minus_s)

  call clover_uuu_uud(prod, m1, wc, m2)
  call re_tr_j(p, prod, minus_s)

end
v + + p = ZERO + + u = ZERO + v = ZERO + call uuu_fwd(u, u1, u2, u3) + call uuu_fwd(v, u2, u1, u0) + + call clover_uuu_uuu(r, u0, wd, u) ; call re_tr_j(p, r, s) + call clover_uuu_dud(r, u, wd, u0) ; call re_tr_j(p, r, s) + call clover_uuu_dud(r, v, we, u3) ; call re_tr_j(p, r, s) + call clover_uuu_uuu(r, u3, we, v) ; call re_tr_j(p, r, s) + + u = ZERO + v = ZERO + call uuu_bwd(u, u6, u5, u4) + call uuu_fwd(v, u0, u6, u5) + + call clover_uuu_dud(r, u, wd, u0) ; call re_tr_j(p, r, -s) + call clover_uuu_uuu(r, u0, wd, u) ; call re_tr_j(p, r, -s) + call clover_uuu_dud(r, u4, wf, v) ; call re_tr_j(p, r, -s) + call clover_uuu_uuu(r, v, wf, u4) ; call re_tr_j(p, r, -s) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_dummy.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_dummy.F90 new file mode 100644 index 0000000000000000000000000000000000000000..bcbe8b7de98fe3e0c379bef7bfaf82cfdfc86fa8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_dummy.F90 @@ -0,0 +1,136 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2000-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! clover_dummy.F90 +! 
!-------------------------------------------------------------------------------
! clover_init(a, ainv, b, u, csw_kappa) - stub for builds without clover.
!
!   Aborts through die() and never touches its arguments.
!
!   The argument list mirrors the real routine in clover_init.F90, which
!   takes (a, ainv, b, u, csw_kappa).  The stub previously lacked "ainv",
!   so a caller written against the real routine passed one argument more
!   than the stub declared.
!-------------------------------------------------------------------------------
subroutine clover_init(a, ainv, b, u, csw_kappa)

  use typedef_clover
  use module_vol
  implicit none

  CLOVER_FIELD_A, intent(out) :: a, ainv
  CLOVER_FIELD_B, intent(out) :: b
  GAUGE_FIELD,    intent(in)  :: u
  REAL,           intent(in)  :: csw_kappa

  call die("clover_init(): must not be called.")
end
!-------------------------------------------------------------------------------
! clover_mult_b(b, x, volh) - stub from clover_dummy.F90.
!
!   Same argument list as the dummy clover_mult_ao in this file.  The only
!   action is to abort through die(); the arguments are never accessed.
!-------------------------------------------------------------------------------
subroutine clover_mult_b(b, x, volh)

  implicit none

  COMPLEX, dimension(18, 2, *) :: b
  COMPLEX, dimension(NDIRAC, NCOL, *) :: x
  integer :: volh

  call die("clover_mult_b(): must not be called.")
end
!-------------------------------------------------------------------------------
!
!                                               ^ nu
!        xmp     x_p    (xpp)                   |
!                                               |
!        xm_      x      xp_                    x --> mu
!
!        xmm     x_m     xpm
!
!-------------------------------------------------------------------------------
# include "defs.h"

!-------------------------------------------------------------------------------
! clover_f_mu_nu(f, mu, nu, x, e, u)
!
!   Computes F_mu_nu = (Q_mu_nu - h.c.) / i at site x, without the factor
!   1/8 (see file header).
!
!   f  (out) : 3x3 result; Hermitian by construction (real diagonal, lower
!              triangle set to the conjugate of the upper triangle)
!   mu, nu   : directions spanning the plane
!   x        : site index within the sub-lattice of parity e
!   e        : EVEN/ODD parity of x; o = EVEN + ODD - e is the other parity
!   u        : gauge field
!
!   The neighbour names follow the diagram above.  xmp, xmm and xpm are
!   reached by stepping in mu first and then nu; each is cross-checked
!   against the nu-then-mu route and the routine aborts via die() on a
!   mismatch.
!-------------------------------------------------------------------------------
subroutine clover_f_mu_nu(f, mu, nu, x, e, u)

  use module_vol
  use module_nn
  implicit none

  SU3, intent(out) :: f
  integer, intent(in) :: mu, nu, x, e
  GAUGE_FIELD, intent(in) :: u

  integer :: xmp, x_p, xm_, xp_, xmm, x_m, xpm, o

  ! statement function: multiplication by the imaginary unit
  ! NOTE(review): cmplx() without a KIND argument yields default-kind complex
  ! in standard Fortran; confirm that defs.h / the build flags keep this at
  ! the precision of COMPLEX.

  COMPLEX :: i_times, c
  i_times(c) = cmplx(-aimag(c), real(c))

  o = EVEN + ODD - e

  ! nearest neighbours of x (parity o)
  xp_ = nn(x, e, mu, FWD)
  xm_ = nn(x, e, mu, BWD)
  x_p = nn(x, e, nu, FWD)
  x_m = nn(x, e, nu, BWD)

  ! diagonal neighbours (parity e), reached via the mu-neighbours
  xmp = nn(xm_, o, nu, FWD)
  xmm = nn(xm_, o, nu, BWD)
  xpm = nn(xp_, o, nu, BWD)

  ! consistency check of the neighbour table: both routes must agree
  if (xmp /= nn(x_p, o, mu, BWD)) call die("clover_f_mu_nu(): xmp")
  if (xmm /= nn(x_m, o, mu, BWD)) call die("clover_f_mu_nu(): xmm")
  if (xpm /= nn(x_m, o, mu, FWD)) call die("clover_f_mu_nu(): xpm")

  f = ZERO

  ! the four clover_uuuuN calls each take f plus four links of one plaquette
  ! around x; f is zeroed above, so they presumably add into it - TODO confirm

  call clover_uuuu1(f, u(1, 1, x, e, mu), &
                       u(1, 1, xp_, o, nu), &
                       u(1, 1, x_p, o, mu), &
                       u(1, 1, x, e, nu))

  call clover_uuuu2(f, u(1, 1, x, e, nu), &
                       u(1, 1, xmp, e, mu), &
                       u(1, 1, xm_, o, nu), &
                       u(1, 1, xm_, o, mu))

  call clover_uuuu3(f, u(1, 1, xm_, o, mu), &
                       u(1, 1, xmm, e, nu), &
                       u(1, 1, xmm, e, mu), &
                       u(1, 1, x_m, o, nu))

  call clover_uuuu4(f, u(1, 1, x_m, o, nu), &
                       u(1, 1, x_m, o, mu), &
                       u(1, 1, xpm, e, nu), &
                       u(1, 1, x, e, mu))

  ! (Q - Q^+) / i on the diagonal:  2 * Im(Q_kk), purely real
  f(1, 1) = cmplx(TWO * Im(f(1, 1)), ZERO)
  f(2, 2) = cmplx(TWO * Im(f(2, 2)), ZERO)
  f(3, 3) = cmplx(TWO * Im(f(3, 3)), ZERO)

  ! upper triangle:  (Q_jk - conjg(Q_kj)) / i  =  i * (conjg(Q_kj) - Q_jk).
  ! These must run before the lower triangle is overwritten below, because
  ! they still read the original f(2, 1), f(3, 1), f(3, 2).
  f(1, 2) = i_times(conjg(f(2, 1)) - f(1, 2))
  f(1, 3) = i_times(conjg(f(3, 1)) - f(1, 3))
  f(2, 3) = i_times(conjg(f(3, 2)) - f(2, 3))

  ! lower triangle from hermiticity
  f(2, 1) = conjg(f(1, 2))
  f(3, 1) = conjg(f(1, 3))
  f(3, 2) = conjg(f(2, 3))

end

!-------------------------------------------------------------------------------
! clover_init(a, ainv, b, u, csw_kappa)
!
!   Builds the clover matrix and its inverse on every site of both parities.
!
!   a         (out) : clover blocks a(1, i, eo), a(2, i, eo).  clover_inv
!                     halves them in place, so on return they hold HALF
!                     times what clover_init3 produced.
!   ainv      (out) : written by clover_inv
!   b         (out) : written by clover_inv
!   u         (in)  : gauge field
!   csw_kappa (in)  : enters only through factor = -csw_kappa / EIGHT
!
!   Per site: F_mu_nu for the planes (2,1), (3,2), (3,1) is packed into p by
!   clover_init1/2.  F_mu_nu for (3,4), (1,4), (4,2) is packed into q the
!   same way.  clover_init3 then forms a1 = s*(p + q) and a2 = s*(p - q),
!   each with ONE added on the diagonal, and clover_inv processes each block.
!-------------------------------------------------------------------------------
subroutine clover_init(a, ainv, b, u, csw_kappa)

  use typedef_clover
  use module_vol
  implicit none

  CLOVER_FIELD_A, intent(out) :: a, ainv
  CLOVER_FIELD_B, intent(out) :: b
  GAUGE_FIELD, intent(in) :: u
  REAL, intent(in) :: csw_kappa

  integer :: i, eo
  SU3 :: f, g
  type(type_clover_a) :: p, q
  REAL :: factor

  TIMING_START(timing_bin_clover_init)

  ! the division by EIGHT presumably supplies the factor 1/8 that
  ! clover_f_mu_nu leaves out - TODO confirm
  factor = -csw_kappa / EIGHT

  do eo = EVEN, ODD
     ! f, g, p, q are per-site scratch and therefore thread-private
     !$omp parallel do private(f, g, p, q)
     do i = 1, VOLH
        call clover_f_mu_nu(f, 2, 1, i, eo, u)

        call clover_init1(p, f)

        call clover_f_mu_nu(f, 3, 2, i, eo, u)
        call clover_f_mu_nu(g, 3, 1, i, eo, u)

        call clover_init2(p, f, g)

        call clover_f_mu_nu(f, 3, 4, i, eo, u)

        call clover_init1(q, f)

        call clover_f_mu_nu(f, 1, 4, i, eo, u)
        call clover_f_mu_nu(g, 4, 2, i, eo, u)

        call clover_init2(q, f, g)

        call clover_init3(a(1, i, eo), a(2, i, eo), p, q, factor)

        ! clover_inv(b, ainv, a): writes b and ainv, and halves a in place
        call clover_inv(b(1, i, eo), ainv(1, i, eo), a(1, i, eo))
        call clover_inv(b(2, i, eo), ainv(2, i, eo), a(2, i, eo))
     enddo
  enddo

  TIMING_STOP(timing_bin_clover_init)

end
a1%iIJ = s * (p%iIJ + q%iIJ) ; a2%iIJ = s * (p%iIJ - q%iIJ) + + CLOVER_INIT_3(1, 1) + CLOVER_INIT_3(1, 2) + CLOVER_INIT_3(1, 3) + CLOVER_INIT_3(1, 4) + CLOVER_INIT_3(1, 5) + CLOVER_INIT_3(1, 6) + + CLOVER_INIT_3(2, 2) + CLOVER_INIT_3(2, 3) + CLOVER_INIT_3(2, 4) + CLOVER_INIT_3(2, 5) + CLOVER_INIT_3(2, 6) + + CLOVER_INIT_3(3, 3) + CLOVER_INIT_3(3, 4) + CLOVER_INIT_3(3, 5) + CLOVER_INIT_3(3, 6) + + CLOVER_INIT_3(4, 4) + CLOVER_INIT_3(4, 5) + CLOVER_INIT_3(4, 6) + + CLOVER_INIT_3(5, 5) + CLOVER_INIT_3(5, 6) + + CLOVER_INIT_3(6, 6) + + a1%i11 = a1%i11 + ONE + a1%i22 = a1%i22 + ONE + a1%i33 = a1%i33 + ONE + a1%i44 = a1%i44 + ONE + a1%i55 = a1%i55 + ONE + a1%i66 = a1%i66 + ONE + + a2%i11 = a2%i11 + ONE + a2%i22 = a2%i22 + ONE + a2%i33 = a2%i33 + ONE + a2%i44 = a2%i44 + ONE + a2%i55 = a2%i55 + ONE + a2%i66 = a2%i66 + ONE + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_inv.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_inv.F90 new file mode 100644 index 0000000000000000000000000000000000000000..6ec62f61115dd60d37fbdca26171f2a2bcba9888 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_inv.F90 @@ -0,0 +1,211 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! clover_inv.F90 - calculates inverse of clover matrix +! 
!-------------------------------------------------------------------------------
! clover_inv(b, ainv, a)
!
!   Factorises one 6x6 clover block "a" row by row (an L D L^+ type
!   decomposition, see the D<k> / L<jk> markers) and derives its inverse.
!
!   a    (inout) : input block; on return every stored entry A11..A66 has
!                  been multiplied by HALF
!   ainv (out)   : filled by clover_inv2(ainv, b)
!   b    (out)   : below the diagonal: the factors L<jk>; on the diagonal:
!                  1 / D<k>, multiplied by HALF after clover_inv2 has run
!
!   A<jk> / B<jk> are macros from clover.h.  The upper-triangle B names
!   expand to conjg() of the lower-triangle ones (B12 = conjg(B21), ...).
!   Presumably the A names follow the same scheme - TODO confirm in clover.h.
!
!   The pivots d1..d5 and the final B66 are inverted without a zero check.
!-------------------------------------------------------------------------------
subroutine clover_inv(b, ainv, a)

  use typedef_clover
  implicit none
  type(type_clover_a), intent(inout) :: a
  type(type_clover_a), intent(out) :: ainv
  type(type_clover_b), intent(out) :: b

  REAL :: d1, d2, d3, d4, d5

  ! statement function: squared modulus |z|**2

  COMPLEX :: z
  REAL :: sq
  sq(z) = (Re(z)**2 + Im(z)**2)

  ! --- row 1 ---
  d1 = A11                                                 ! D1
  B11 = ONE / d1                                           ! 1 / D1

  ! --- row 2 ---
  B21 = A21 * B11                                          ! L21

  d2 = A22 - d1 * sq(B21)                                  ! D2
  B22 = ONE / d2                                           ! 1 / D2

  ! --- row 3: first the products L D, then scale by 1 / D ---
  B31 = A31                                                ! L31 D1
  B32 = A32 - B31 * B12                                    ! L32 D2

  B31 = B31 * B11                                          ! L31
  B32 = B32 * B22                                          ! L32

  d3 = A33 - d1 * sq(B31) - d2 * sq(B32)                   ! D3
  B33 = ONE / d3                                           ! 1 / D3

  ! --- row 4 ---
  B41 = A41                                                ! L41 D1
  B42 = A42 - B41 * B12                                    ! L42 D2
  B43 = A43 - B41 * B13 - B42 * B23                        ! L43 D3

  B41 = B41 * B11                                          ! L41
  B42 = B42 * B22                                          ! L42
  B43 = B43 * B33                                          ! L43

  d4 = A44 - d1 * sq(B41) - d2 * sq(B42) - d3 * sq(B43)    ! D4
  B44 = ONE / d4                                           ! 1 / D4

  ! --- row 5 (same pattern as row 4) ---
  B51 = A51
  B52 = A52 - B51 * B12
  B53 = A53 - B51 * B13 - B52 * B23
  B54 = A54 - B51 * B14 - B52 * B24 - B53 * B34

  B51 = B51 * B11
  B52 = B52 * B22
  B53 = B53 * B33
  B54 = B54 * B44

  d5 = A55 - d1 * sq(B51) - d2 * sq(B52) - d3 * sq(B53) - d4 * sq(B54)
  B55 = ONE / d5

  ! --- row 6 ---
  B61 = A61
  B62 = A62 - B61 * B12
  B63 = A63 - B61 * B13 - B62 * B23
  B64 = A64 - B61 * B14 - B62 * B24 - B63 * B34
  B65 = A65 - B61 * B15 - B62 * B25 - B63 * B35 - B64 * B45

  B61 = B61 * B11
  B62 = B62 * B22
  B63 = B63 * B33
  B64 = B64 * B44
  B65 = B65 * B55

  ! D6 is held in B66 itself before being inverted
  B66 = A66 - d1 * sq(B61) - d2 * sq(B62) - d3 * sq(B63) &
            - d4 * sq(B64) - d5 * sq(B65)

  B66 = ONE / B66

  ! must run before the diagonal of b is halved: clover_inv2 reads the
  ! unscaled 1 / D<k> values
  call clover_inv2(ainv, b)

  ! scale the diagonal of b by HALF
  B11 = HALF * B11
  B22 = HALF * B22
  B33 = HALF * B33
  B44 = HALF * B44
  B55 = HALF * B55
  B66 = HALF * B66

  ! scale the stored entries of a by HALF (in place)
  A11 = HALF * A11
  A12 = HALF * A12
  A13 = HALF * A13
  A14 = HALF * A14
  A15 = HALF * A15
  A16 = HALF * A16

  A22 = HALF * A22
  A23 = HALF * A23
  A24 = HALF * A24
  A25 = HALF * A25
  A26 = HALF * A26

  A33 = HALF * A33
  A34 = HALF * A34
  A35 = HALF * A35
  A36 = HALF * A36

  A44 = HALF * A44
  A45 = HALF * A45
  A46 = HALF * A46

  A55 = HALF * A55
  A56 = HALF * A56

  A66 = HALF * A66

end
!-------------------------------------------------------------------------------
! clover_mult_a(out, a, in, volh)                                  out := A in
!
!   out  : result spin-colour field, (NDIRAC, NCOL, *)
!   a    : clover field as a plain complex array (18, 2, *); this file
!          defines CLOVER_AS_COMPLEX_ARRAY before including clover.h
!   in   : input spin-colour field, (NDIRAC, NCOL, *)
!   volh : number of sites to process
!
!   Per site the 12 components are split into the sum and the difference of
!   the Dirac pairs (1,3) and (2,4); SCn expands to a (dirac, colour) index
!   pair.  clover_mult_a.h90 multiplies a 6-vector x1..x6 by the 6x6 matrix
!   A11..A66 into y1..y6; it is included twice, with J = 1 for the sums and
!   J = 2 for the differences.  J presumably selects the block a(:, J, i)
!   inside the A<jk> macros - TODO confirm in clover.h.
!
!   "out" is written before "in" is read for the second time, so the two
!   must not be the same array; clover_mult_ao is the in-place variant.
!-------------------------------------------------------------------------------
subroutine clover_mult_a(out, a, in, volh)  ! out := A in

  implicit none

  COMPLEX, dimension(18, 2, *) :: a
  COMPLEX, dimension(NDIRAC, NCOL, *) :: out, in
  integer :: volh

  integer :: i
  COMPLEX :: x1, x2, x3, x4, x5, x6
  COMPLEX :: y1, y2, y3, y4, y5, y6

  TIMING_START(timing_bin_clover_mult_a)

  !$omp parallel do private(x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6)
  do i = 1, volh
     ! sums of Dirac components (1 + 3) and (2 + 4), per colour
     x1 = in(SC1, i) + in(SC7, i)
     x2 = in(SC2, i) + in(SC8, i)
     x3 = in(SC3, i) + in(SC9, i)
     x4 = in(SC4, i) + in(SC10, i)
     x5 = in(SC5, i) + in(SC11, i)
     x6 = in(SC6, i) + in(SC12, i)

# define J 1
# include "clover_mult_a.h90"

     ! the first result goes to both halves with the same sign
     out(SC1, i) = y1
     out(SC2, i) = y2
     out(SC3, i) = y3
     out(SC4, i) = y4
     out(SC5, i) = y5
     out(SC6, i) = y6
     out(SC7, i) = y1
     out(SC8, i) = y2
     out(SC9, i) = y3
     out(SC10, i) = y4
     out(SC11, i) = y5
     out(SC12, i) = y6

     ! differences of Dirac components (1 - 3) and (2 - 4), per colour
     x1 = in(SC1, i) - in(SC7, i)
     x2 = in(SC2, i) - in(SC8, i)
     x3 = in(SC3, i) - in(SC9, i)
     x4 = in(SC4, i) - in(SC10, i)
     x5 = in(SC5, i) - in(SC11, i)
     x6 = in(SC6, i) - in(SC12, i)

# undef J
# define J 2
# include "clover_mult_a.h90"

     ! the second result is added to the upper half, subtracted from the lower
     out(SC1, i) = out(SC1, i) + y1
     out(SC2, i) = out(SC2, i) + y2
     out(SC3, i) = out(SC3, i) + y3
     out(SC4, i) = out(SC4, i) + y4
     out(SC5, i) = out(SC5, i) + y5
     out(SC6, i) = out(SC6, i) + y6
     out(SC7, i) = out(SC7, i) - y1
     out(SC8, i) = out(SC8, i) - y2
     out(SC9, i) = out(SC9, i) - y3
     out(SC10, i) = out(SC10, i) - y4
     out(SC11, i) = out(SC11, i) - y5
     out(SC12, i) = out(SC12, i) - y6

  enddo

  TIMING_STOP(timing_bin_clover_mult_a)
end
A11 * x1 + A12 * x2 + A13 * x3 & + + A14 * x4 + A15 * x5 + A16 * x6 + y2 = A21 * x1 + A22 * x2 + A23 * x3 & + + A24 * x4 + A25 * x5 + A26 * x6 + y3 = A31 * x1 + A32 * x2 + A33 * x3 & + + A34 * x4 + A35 * x5 + A36 * x6 + y4 = A41 * x1 + A42 * x2 + A43 * x3 & + + A44 * x4 + A45 * x5 + A46 * x6 + y5 = A51 * x1 + A52 * x2 + A53 * x3 & + + A54 * x4 + A55 * x5 + A56 * x6 + y6 = A61 * x1 + A62 * x2 + A63 * x3 & + + A64 * x4 + A65 * x5 + A66 * x6 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_ao.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_ao.F90 new file mode 100644 index 0000000000000000000000000000000000000000..c35d1656cbb4dcbfe1568bd8e2b59c6361d5ab4a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_ao.F90 @@ -0,0 +1,87 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! clover_mult_ao.F90 - ao: "A overwrite" +! +!------------------------------------------------------------------------------- +# define CLOVER_AS_COMPLEX_ARRAY +# include "defs.h" +# include "clover.h" + +!------------------------------------------------------------------------------- +subroutine clover_mult_ao(a, x, volh) ! 
x := A x + + implicit none + + COMPLEX, dimension(18, 2, *) :: a + COMPLEX, dimension(NDIRAC, NCOL, *) :: x + integer :: volh + + integer :: i + COMPLEX :: x1, x2, x3, x4, x5, x6 + COMPLEX :: y1, y2, y3, y4, y5, y6 + + TIMING_START(timing_bin_clover_mult_ao) + + !$omp parallel do private(x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6) + do i = 1, volh + x1 = x(SC1, i) + x(SC7, i) + x2 = x(SC2, i) + x(SC8, i) + x3 = x(SC3, i) + x(SC9, i) + x4 = x(SC4, i) + x(SC10, i) + x5 = x(SC5, i) + x(SC11, i) + x6 = x(SC6, i) + x(SC12, i) + +# define J 1 +# include "clover_mult_a.h90" + + x1 = x(SC1, i) - x(SC7, i) + x2 = x(SC2, i) - x(SC8, i) + x3 = x(SC3, i) - x(SC9, i) + x4 = x(SC4, i) - x(SC10, i) + x5 = x(SC5, i) - x(SC11, i) + x6 = x(SC6, i) - x(SC12, i) + + x(SC1, i) = y1 + x(SC2, i) = y2 + x(SC3, i) = y3 + x(SC4, i) = y4 + x(SC5, i) = y5 + x(SC6, i) = y6 + x(SC7, i) = y1 + x(SC8, i) = y2 + x(SC9, i) = y3 + x(SC10, i) = y4 + x(SC11, i) = y5 + x(SC12, i) = y6 + +# undef J +# define J 2 +# include "clover_mult_a.h90" + + x(SC1, i) = x(SC1, i) + y1 + x(SC2, i) = x(SC2, i) + y2 + x(SC3, i) = x(SC3, i) + y3 + x(SC4, i) = x(SC4, i) + y4 + x(SC5, i) = x(SC5, i) + y5 + x(SC6, i) = x(SC6, i) + y6 + x(SC7, i) = x(SC7, i) - y1 + x(SC8, i) = x(SC8, i) - y2 + x(SC9, i) = x(SC9, i) - y3 + x(SC10, i) = x(SC10, i) - y4 + x(SC11, i) = x(SC11, i) - y5 + x(SC12, i) = x(SC12, i) - y6 + + enddo + + TIMING_STOP(timing_bin_clover_mult_ao) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.F90 new file mode 100644 index 0000000000000000000000000000000000000000..b0d30d053b882b1cdbc68c9ab3b09d30956e096f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.F90 @@ -0,0 +1,88 @@ +!=============================================================================== +! +! 
BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_mult_b.F90
+!
+!-------------------------------------------------------------------------------
+# define CLOVER_AS_COMPLEX_ARRAY
+# include "defs.h"
+# include "clover.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_mult_b(b, x, volh)  ! x := B x
+  ! In place, per site: the included clover_mult_b.h90 turns y1..y6 into x1..x6 by a forward
+  implicit none
+  ! and a back substitution; it is applied once to the sum and once to the difference of the halves.
+  COMPLEX, dimension(18, 2, *) :: b          ! 18 stored entries, 2 blocks, one pair per site
+  COMPLEX, dimension(NDIRAC, NCOL, *) :: x   ! read and overwritten
+  integer :: volh                            ! number of sites processed
+  ! NOTE(review): the coefficient macros come from clover.h; presumably they index b(:, J, i) - confirm.
+  integer :: i
+  COMPLEX :: x1, x2, x3, x4, x5, x6   ! output of the included substitution
+  COMPLEX :: y1, y2, y3, y4, y5, y6   ! input of the included substitution (modified by it)
+
+  TIMING_START(timing_bin_clover_mult_b)
+
+  !$omp parallel do private(x1, x2, x3, x4, x5, x6, y1, y2, y3, y4, y5, y6)
+  do i = 1, volh
+
+    y1 = x(SC1, i) + x(SC7, i)      ! sum of component k and component k + 6
+    y2 = x(SC2, i) + x(SC8, i)
+    y3 = x(SC3, i) + x(SC9, i)
+    y4 = x(SC4, i) + x(SC10, i)
+    y5 = x(SC5, i) + x(SC11, i)
+    y6 = x(SC6, i) + x(SC12, i)
+
+# define J 1
+# include "clover_mult_b.h90"
+
+    y1 = x(SC1, i) - x(SC7, i)      ! differences must be taken before x is overwritten below
+    y2 = x(SC2, i) - x(SC8, i)
+    y3 = x(SC3, i) - x(SC9, i)
+    y4 = x(SC4, i) - x(SC10, i)
+    y5 = x(SC5, i) - x(SC11, i)
+    y6 = x(SC6, i) - x(SC12, i)
+
+    x(SC1, i) = x1                  ! first pass result stored into both halves
+    x(SC2, i) = x2
+    x(SC3, i) = x3
+    x(SC4, i) = x4
+    x(SC5, i) = x5
+    x(SC6, i) = x6
+    x(SC7, i) = x1
+    x(SC8, i) = x2
+    x(SC9, i) = x3
+    x(SC10, i) = x4
+    x(SC11, i) = x5
+    x(SC12, i) = x6
+
+# undef J
+# define J 2
+# include "clover_mult_b.h90"
+
+    x(SC1, i) = x(SC1, i) + x1      ! second pass: added to the first half ...
+    x(SC2, i) = x(SC2, i) + x2
+    x(SC3, i) = x(SC3, i) + x3
+    x(SC4, i) = x(SC4, i) + x4
+    x(SC5, i) = x(SC5, i) + x5
+    x(SC6, i) = x(SC6, i) + x6
+    x(SC7, i) = x(SC7, i) - x1      ! ... and subtracted from the second half
+    x(SC8, i) = x(SC8, i) - x2
+    x(SC9, i) = x(SC9, i) - x3
+    x(SC10, i) = x(SC10, i) - x4
+    x(SC11, i) = x(SC11, i) - x5
+    x(SC12, i) = x(SC12, i) - x6
+
+  enddo
+
+  TIMING_STOP(timing_bin_clover_mult_b)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.h90
new file mode 100644
index 0000000000000000000000000000000000000000..f38943b10b35aaf30abdd18030e26f65e67a6930
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_mult_b.h90
@@ -0,0 +1,15 @@
+    y2 = y2 - B21 * y1                                           ! forward substitution, in place:
+    y3 = y3 - B31 * y1 - B32 * y2                                ! each y_k uses the already updated
+    y4 = y4 - B41 * y1 - B42 * y2 - B43 * y3                     ! y_1 .. y_(k-1); y1 is unchanged
+    y5 = y5 - B51 * y1 - B52 * y2 - B53 * y3 - B54 * y4
+    y6 = y6 - B61 * y1 - B62 * y2 - B63 * y3 - B64 * y4 - B65 * y5
+    ! back substitution x6 .. x1; diagonal entries multiply (presumably stored inverted - confirm)
+    x6 = y6 * B66
+    x5 = y5 * B55 - B56 * x6
+    x4 = y4 * B44 - B45 * x5 - B46 * x6
+    x3 = y3 * B33 - B34 * x4 - B35 * x5 &
+       - B36 * x6
+    x2 = y2 * B22 - B23 * x3 - B24 * x4 &
+       - B25 * x5 - B26 * x6
+    x1 = y1 * B11 - B12 * x2 - B13 * x3 &
+       - B14 * x4 - B15 * x5 - B16 * x6
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_t_init.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_t_init.F90
new file mode 100644
index 0000000000000000000000000000000000000000..1c98847df50f08a5779ce46a72390f5d3b48243d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_t_init.F90
@@ -0,0 +1,56 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_t_init.F90 - calculates T
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_t_init(t, b)
+  ! Builds t column by column: column (s2, c2) is clover_mult_b applied to the unit vector (s2, c2).
+  use typedef_clover
+  use module_vol
+  implicit none
+
+  CLOVER_FIELD_C :: t                 ! output, written as t(s1, c1, s2, c2, i)
+  type(type_clover_b) :: b(2, volh)   ! passed unchanged to clover_mult_b
+
+  SPINCOL_FIELD :: x                  ! work vector, reset for every column
+!dir$ cache_align x
+
+  integer :: c1, c2, s1, s2, i
+
+  do c2 = 1, NCOL
+    do s2 = 1, NDIRAC
+
+      x = ZERO                        ! whole-array reset, outside the parallel loop
+      !$omp parallel do
+      do i = 1, volh
+        x(s2, c2, i) = ONE            ! unit vector in component (s2, c2) on every site
+      enddo
+
+      call clover_mult_b(b, x, volh)  ! x is overwritten with the result
+
+      !$omp parallel do private(c1, s1)
+      do i = 1, volh
+        do c1 = 1, NCOL
+          do s1 = 1, NDIRAC
+            t(s1, c1, s2, c2, i) = x(s1, c1, i)
+          enddo
+        enddo
+      enddo
+
+    enddo
+  enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts.F90
new file mode 100644
index 0000000000000000000000000000000000000000..c0fb1a8f81f1bf8612951f5bb451f6d157736db8
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts.F90
@@ -0,0 +1,165 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_ts.F90 - calculates T * sigma
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts(mu, nu, w, t)  ! w = t sigma_mu_nu
+  ! Dispatcher: calls the specialised routine for the pair (mu, nu).
+  use module_vol
+  implicit none
+  ! If mu equals nu, or either lies outside 1..4, no routine is called and w is left untouched.
+  integer :: mu, nu
+  SU3_FIELD :: w        ! output
+  CLOVER_FIELD_C :: t   ! input
+
+  if (mu == 1) then
+    if (nu == 2) then ; call clover_ts_12(w, t)
+    elseif (nu == 3) then ; call clover_ts_13(w, t)
+    elseif (nu == 4) then ; call clover_ts_14(w, t) ; endif
+  elseif (mu == 2) then
+    if (nu == 1) then ; call clover_ts_21(w, t)
+    elseif (nu == 3) then ; call clover_ts_23(w, t)
+    elseif (nu == 4) then ; call clover_ts_24(w, t) ; endif
+  elseif (mu == 3) then
+    if (nu == 1) then ; call clover_ts_31(w, t)
+    elseif (nu == 2) then ; call clover_ts_32(w, t)
+    elseif (nu == 4) then ; call clover_ts_34(w, t) ; endif
+  elseif (mu == 4) then
+    if (nu == 1) then ; call clover_ts_41(w, t)
+    elseif (nu == 2) then ; call clover_ts_42(w, t)
+    elseif (nu == 3) then ; call clover_ts_43(w, t) ; endif
+  endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_12(w, t)
+  ! Declarations and the open site and colour loops come from the head include; the tail closes them.
+# include "clover_ts_head.h90"
+        w(c1, c2, i) = -t(1, c1, 1, c2, i) &
+                       + t(2, c1, 2, c2, i) &
+                       - t(3, c1, 3, c2, i) &
+                       + t(4, c1, 4, c2, i)
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_21(w, t)
+  ! Same terms as clover_ts_12 with every sign flipped.
+# include "clover_ts_head.h90"
+        w(c1, c2, i) =  t(1, c1, 1, c2, i) &
+                       - t(2, c1, 2, c2, i) &
+                       + t(3, c1, 3, c2, i) &
+                       - t(4, c1, 4, c2, i)
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_13(w, t)
+  ! i_times is the statement function declared in the head include.
+# include "clover_ts_head.h90"
+        w(c1, c2, i) =  i_times(t(1, c1, 2, c2, i)) &
+                       - i_times(t(2, c1, 1, c2, i)) &
+                       + i_times(t(3, c1, 4, c2, i)) &
+                       - i_times(t(4, c1, 3, c2, i))
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_31(w, t)
+  ! Same terms as clover_ts_13 with every sign flipped.
+# include "clover_ts_head.h90"
+        w(c1, c2, i) = -i_times(t(1, c1, 2, c2, i)) &
+                       + i_times(t(2, c1, 1, c2, i)) &
+                       - i_times(t(3, c1, 4, c2, i)) &
+                       + i_times(t(4, c1, 3, c2, i))
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_14(w, t)
+  ! Sums the four entries with spin index pairs (1,4), (2,3), (3,2), (4,1).
+# include "clover_ts_head.h90"
+        w(c1, c2, i) =  t(1, c1, 4, c2, i) &
+                       + t(2, c1, 3, c2, i) &
+                       + t(3, c1, 2, c2, i) &
+                       + t(4, c1, 1, c2, i)
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_41(w, t)
+  ! Same terms as clover_ts_14 with every sign flipped.
+# include "clover_ts_head.h90"
+        w(c1, c2, i) = -t(1, c1, 4, c2, i) &
+                       - t(2, c1, 3, c2, i) &
+                       - t(3, c1, 2, c2, i) &
+                       - t(4, c1, 1, c2, i)
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_23(w, t)
+  ! Negated sum of the entries with spin index pairs (1,2), (2,1), (3,4), (4,3).
+# include "clover_ts_head.h90"
+        w(c1, c2, i) = -t(1, c1, 2, c2, i) &
+                       - t(2, c1, 1, c2, i) &
+                       - t(3, c1, 4, c2, i) &
+                       - t(4, c1, 3, c2, i)
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_32(w, t)
+  ! Same terms as clover_ts_23 with every sign flipped.
+# include "clover_ts_head.h90"
+        w(c1, c2, i) =  t(1, c1, 2, c2, i) &
+                       + t(2, c1, 1, c2, i) &
+                       + t(3, c1, 4, c2, i) &
+                       + t(4, c1, 3, c2, i)
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_24(w, t)
+  ! i_times is the statement function declared in the head include.
+# include "clover_ts_head.h90"
+        w(c1, c2, i) =  i_times(t(1, c1, 4, c2, i)) &
+                       - i_times(t(2, c1, 3, c2, i)) &
+                       + i_times(t(3, c1, 2, c2, i)) &
+                       - i_times(t(4, c1, 1, c2, i))
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_42(w, t)
+  ! Same terms as clover_ts_24 with every sign flipped.
+# include "clover_ts_head.h90"
+        w(c1, c2, i) = -i_times(t(1, c1, 4, c2, i)) &
+                       + i_times(t(2, c1, 3, c2, i)) &
+                       - i_times(t(3, c1, 2, c2, i)) &
+                       + i_times(t(4, c1, 1, c2, i))
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_34(w, t)
+  ! Declarations and the open site and colour loops come from the head include; the tail closes them.
+# include "clover_ts_head.h90"
+        w(c1, c2, i) =  t(1, c1, 3, c2, i) &
+                       - t(2, c1, 4, c2, i) &
+                       + t(3, c1, 1, c2, i) &
+                       - t(4, c1, 2, c2, i)
+# include "clover_ts_tail.h90"
+
+!-------------------------------------------------------------------------------
+subroutine clover_ts_43(w, t)
+  ! Same terms as clover_ts_34 with every sign flipped.
+# include "clover_ts_head.h90"
+        w(c1, c2, i) = -t(1, c1, 3, c2, i) &
+                       + t(2, c1, 4, c2, i) &
+                       - t(3, c1, 1, c2, i) &
+                       + t(4, c1, 2, c2, i)
+# include "clover_ts_tail.h90"
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_head.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_head.h90
new file mode 100644
index 0000000000000000000000000000000000000000..fb209369c093d42e29f54e163f8612a271a45387
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_head.h90
@@ -0,0 +1,17 @@
+  use module_vol
+  implicit none
+
+  SU3_FIELD :: w        ! output, written as w(c1, c2, i)
+  CLOVER_FIELD_C :: t   ! input, read as t(s1, c1, s2, c2, i)
+
+  integer :: i, c1, c2
+
+  ! statement function:
+  ! i_times(c) = i * c.  cmplx without a kind argument returns default complex, so pass kind(c)
+  COMPLEX :: i_times, c
+  i_times(c) = cmplx(-aimag(c), real(c), kind(c))   ! keeps the precision of the argument
+
+  !$omp parallel do private(c1, c2)
+  do i = 1, volh
+    do c2 = 1, NCOL
+      do c1 = 1, NCOL
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_tail.h90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_tail.h90
new file mode 100644
index 0000000000000000000000000000000000000000..46655ebcdf9f049df5ea6453a96851d927592acb
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_ts_tail.h90
@@ -0,0 +1,5 @@
+      enddo
+    enddo
+  enddo
+  ! closes the three loops opened in clover_ts_head.h90 and ends the including subroutine
+end
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuu.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuu.F90
new file mode 100644
index 0000000000000000000000000000000000000000..0b513a2c746dba130f2208c72cad2dfb932439f3
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuu.F90
@@ -0,0 +1,116 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_uuu.F90 - multiplications of three SU(3) matrices
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuu_uuu(r, u, v, w)  ! r = u * v * w
+  ! r is zeroed element by element and then accumulated, so it must not alias u, v or w.
+  implicit none
+  SU3 :: r, u, v, w
+  integer :: i, j, k, l
+
+  do i = 1, NCOL
+    do l = 1, NCOL
+      r(i, l) = ZERO
+      do j = 1, NCOL
+        do k = 1, NCOL
+          r(i, l) = r(i, l) + u(i, j) * v(j, k) * w(k, l)
+        enddo
+      enddo
+    enddo
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuu_duu(r, u, v, w)  ! r = u+ * v * w
+  ! The dagger of u is taken on the fly as conjg(u(j, i)).
+  implicit none
+  SU3 :: r, u, v, w
+  integer :: i, j, k, l
+
+  do i = 1, NCOL
+    do l = 1, NCOL
+      r(i, l) = ZERO
+      do j = 1, NCOL
+        do k = 1, NCOL
+          r(i, l) = r(i, l) + conjg(u(j, i)) * v(j, k) * w(k, l)
+        enddo
+      enddo
+    enddo
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuu_udu(r, u, v, w)  ! r = u * v+ * w
+  ! The dagger of v is taken on the fly as conjg(v(k, j)).
+  implicit none
+  SU3 :: r, u, v, w
+  integer :: i, j, k, l
+
+  do i = 1, NCOL
+    do l = 1, NCOL
+      r(i, l) = ZERO
+      do j = 1, NCOL
+        do k = 1, NCOL
+          r(i, l) = r(i, l) + u(i, j) * conjg(v(k, j)) * w(k, l)
+        enddo
+      enddo
+    enddo
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuu_uud(r, u, v, w)  ! r = u * v * w+
+  ! The dagger of w is taken on the fly as conjg(w(l, k)).
+  implicit none
+  SU3 :: r, u, v, w
+  integer :: i, j, k, l
+
+  do i = 1, NCOL
+    do l = 1, NCOL
+      r(i, l) = ZERO
+      do j = 1, NCOL
+        do k = 1, NCOL
+          r(i, l) = r(i, l) + u(i, j) * v(j, k) * conjg(w(l, k))
+        enddo
+      enddo
+    enddo
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuu_dud(r, u, v, w)  ! r = u+ * v * w+
+  ! Daggers of u and w are taken on the fly as conjg(u(j, i)) and conjg(w(l, k)).
+  implicit none
+  SU3 :: r, u, v, w
+  integer :: i, j, k, l
+
+  do i = 1, NCOL
+    do l = 1, NCOL
+      r(i, l) = ZERO
+      do j = 1, NCOL
+        do k = 1, NCOL
+          r(i, l) = r(i, l) + conjg(u(j, i)) * v(j, k) * conjg(w(l, k))
+        enddo
+      enddo
+    enddo
+  enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuuu.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuuu.F90
new file mode 100644
index 0000000000000000000000000000000000000000..a9994b8dc041702c348dea588210dc4e4162e017
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/clover_uuuu.F90
@@ -0,0 +1,117 @@
+!===============================================================================
+!
+! 
BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! clover_uuuu.F90 - multiplications of four SU(3) matrices
+!
+!-------------------------------------------------------------------------------
+!
+!     --<--   --<--
+!    |     | |     |
+!    v  2  ^ v  1  ^
+!    |     | |     |
+!     -->--   -->--
+!           x
+!     --<--   --<--
+!    |     | |     |
+!    v  3  ^ v  4  ^
+!    |     | |     |
+!     -->--   -->--
+!
+! uuuu1: uuuu += u1  u2  u3+ u4+    ! + = dagger
+! uuuu2: uuuu += u1  u2+ u3+ u4
+! uuuu3: uuuu += u1+ u2+ u3  u4
+! uuuu4: uuuu += u1+ u2  u3  u4+
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuuu1(uuuu, u1, u2, u3, u4)
+  ! Accumulates u1 * u2 * u3+ * u4+ into uuuu; uuuu is not zeroed here, the caller initialises it.
+  implicit none
+  SU3 :: uuuu, u1, u2, u3, u4
+  integer :: i, j, k, l, m
+
+  do i = 1, NCOL
+    do m = 1, NCOL
+      do j = 1, NCOL
+        do k = 1, NCOL
+          do l = 1, NCOL
+            uuuu(i,m)= uuuu(i,m)+ u1(i,j) * u2(j,k) * conjg(u3(l,k)) * conjg(u4(m,l))
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuuu2(uuuu, u1, u2, u3, u4)
+  ! Accumulates u1 * u2+ * u3+ * u4 into uuuu; uuuu is not zeroed here.
+  implicit none
+  SU3 :: uuuu, u1, u2, u3, u4
+  integer :: i, j, k, l, m
+
+  do i = 1, NCOL
+    do m = 1, NCOL
+      do j = 1, NCOL
+        do k = 1, NCOL
+          do l = 1, NCOL
+            uuuu(i,m)= uuuu(i,m)+ u1(i,j) * conjg(u2(k,j)) * conjg(u3(l,k)) * u4(l,m)
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+end
+
+
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuuu3(uuuu, u1, u2, u3, u4)
+  ! Accumulates u1+ * u2+ * u3 * u4 into uuuu; uuuu is not zeroed here.
+  implicit none
+  SU3 :: uuuu, u1, u2, u3, u4
+  integer :: i, j, k, l, m
+
+  do i = 1, NCOL
+    do m = 1, NCOL
+      do j = 1, NCOL
+        do k = 1, NCOL
+          do l = 1, NCOL
+            uuuu(i,m)= uuuu(i,m)+ conjg(u1(j,i)) * conjg(u2(k,j)) * u3(k,l) * u4(l,m)
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine clover_uuuu4(uuuu, u1, u2, u3, u4)
+  ! Accumulates u1+ * u2 * u3 * u4+ into uuuu; uuuu is not zeroed here.
+  implicit none
+  SU3 :: uuuu, u1, u2, u3, u4
+  integer :: i, j, k, l, m
+
+  do i = 1, NCOL
+    do m = 1, NCOL
+      do j = 1, NCOL
+        do k = 1, NCOL
+          do l = 1, NCOL
+            uuuu(i,m)= uuuu(i,m)+ conjg(u1(j,i)) * u2(j,k) * u3(k,l) * conjg(u4(m,l))
+          enddo
+        enddo
+      enddo
+    enddo
+  enddo
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/clover/ctest.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/ctest.F90
new file mode 100644
index 0000000000000000000000000000000000000000..942ab53a936812d9bdec4f106ad51c82df549b8f
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/clover/ctest.F90
@@ -0,0 +1,151 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! ctest.F90 - test of clover matrix multiplications: is (A * inv(A) = 1) ?
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+# include "clover.h"
+
+!-------------------------------------------------------------------------------
+program ctest
+  ! For each of the 12 unit vectors z: z := ainv z, r := a z, then print r (expected: the unit vector).
+  use typedef_clover
+  implicit none
+
+  integer, parameter :: volh = 1   ! single site
+  integer, parameter :: nz = 1     ! not referenced below
+
+  type(type_clover_a) :: a(2, volh), ainv(2, volh)   ! random matrix and the inverse from clover_inv
+  type(type_clover_b) :: b(2, volh)                  ! second output of clover_inv; only used in disabled code
+
+  COMPLEX, dimension(NDIRAC, NCOL, volh) :: z, r
+
+  integer :: i, j, s, c            ! s and c are not referenced below
+
+  do i = 1, volh
+    do j = 1, 2
+      call cinit(a(j, i))
+      call clover_inv(b(j, i), ainv(j, i), a(j, i))
+    enddo
+  enddo
+
+
+  do j = 1,12
+    call zinit(z, j, volh)
+    !!call clover_mult_b(b, z, volh)
+    call clover_mult_ao(ainv, z, volh)   ! z := ainv z, in place
+    call clover_mult_a(r, a, z, volh)    ! r := a z
+    call zwrite(r, j, volh)
+
+!     call zinit(z, j+6, volh)
+!     call clover_mult_b(b, z, volh)
+!     call zwrite(z, j+6, volh)
+
+!     call zinit(z, j+6, volh)
+!     call clover_mult_a(r, a, z, volh)
+!     call zwrite(r, j+6, volh)
+
+
+!!    call clover_mult_b(b, r, volh)
+    !!call zwrite(r, j, volh)
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine cinit(a)
+  ! Fills one clover block with random numbers: real diagonal, complex entries above the diagonal.
+  use typedef_clover
+  implicit none
+  type(type_clover_a) :: a
+  real, intrinsic :: ranf   ! NOTE(review): presumably a vendor-specific intrinsic - confirm availability
+
+  A11 = ranf()
+  A22 = ranf()
+  A33 = ranf()
+  A44 = ranf()
+  A55 = ranf()
+  A66 = ranf()
+
+  A12 = cmplx(ranf(), ranf())
+  A13 = cmplx(ranf(), ranf())
+  A14 = cmplx(ranf(), ranf())
+  A15 = cmplx(ranf(), ranf())
+  A16 = cmplx(ranf(), ranf())
+
+  A23 = cmplx(ranf(), ranf())
+  A24 = cmplx(ranf(), ranf())
+  A25 = cmplx(ranf(), ranf())
+  A26 = cmplx(ranf(), ranf())
+
+  A34 = cmplx(ranf(), ranf())
+  A35 = cmplx(ranf(), ranf())
+  A36 = cmplx(ranf(), ranf())
+
+  A45 = cmplx(ranf(), ranf())
+  A46 = cmplx(ranf(), ranf())
+
+  A56 = cmplx(ranf(), ranf())
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine zinit(z, j, volh)
+  ! Sets z to the j-th unit vector (j = 1..12) on every site; any other j leaves z all zero.
+  implicit none
+  integer :: volh, i, j
+  COMPLEX, dimension(NDIRAC, NCOL, volh) :: z
+
+  z = 0
+  do i = 1, volh
+    if (j == 1) z(SC1, i) = 1
+    if (j == 2) z(SC2, i) = 1
+    if (j == 3) z(SC3, i) = 1
+    if (j == 4) z(SC4, i) = 1
+    if (j == 5) z(SC5, i) = 1
+    if (j == 6) z(SC6, i) = 1
+    if (j == 7) z(SC7, i) = 1
+    if (j == 8) z(SC8, i) = 1
+    if (j == 9) z(SC9, i) = 1
+    if (j == 10) z(SC10, i) = 1
+    if (j == 11) z(SC11, i) = 1
+    if (j == 12) z(SC12, i) = 1
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine zwrite(z, j, volh)
+  ! Prints all 12 components of z per site on unit 6, with a blank record between the two halves.
+  implicit none
+  integer :: volh, i, j
+  COMPLEX, dimension(NDIRAC, NCOL, volh) :: z
+
+  write(6,*) "-----------------------------------------------"
+  do i = 1, volh
+    write(6, "(4i4,2f16.8)") j, i, SC1, z(SC1, i)   ! 4i4 assumes each index macro yields two integers
+    write(6, "(4i4,2f16.8)") j, i, SC2, z(SC2, i)
+    write(6, "(4i4,2f16.8)") j, i, SC3, z(SC3, i)
+    write(6, "(4i4,2f16.8)") j, i, SC4, z(SC4, i)
+    write(6, "(4i4,2f16.8)") j, i, SC5, z(SC5, i)
+    write(6, "(4i4,2f16.8)") j, i, SC6, z(SC6, i)
+    write(6,*)
+    write(6, "(4i4,2f16.8)") j, i, SC7, z(SC7, i)
+    write(6, "(4i4,2f16.8)") j, i, SC8, z(SC8, i)
+    write(6, "(4i4,2f16.8)") j, i, SC9, z(SC9, i)
+    write(6, "(4i4,2f16.8)") j, i, SC10, z(SC10, i)
+    write(6, "(4i4,2f16.8)") j, i, SC11, z(SC11, i)
+    write(6, "(4i4,2f16.8)") j, i, SC12, z(SC12, i)
+  enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..8dd716e619545bfc4323d61b9c446881b58769d1
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/Makefile
@@ -0,0 +1,89 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# comm/Makefile
+#
+#===============================================================================
+
+include ../Makefile.defs
+
+MODULES_DIR = ../modules
+
+.SUFFIXES:
+.SUFFIXES: .a .o .F90
+# Suffix rule: preprocess X.F90 into X.f90 with $(FPP), then compile X.f90 against the module directory.
+.F90.o:
+	$(FPP) -I.. $(FPPFLAGS) $< > $*.f90
+	$(F90) -c $(FFLAGS) -I$(MODULES_DIR) $*.f90
+
+# One object list per communication back end; each list is archived into its own library below.
+OBJS_MPI = \
+	dotprod.o \
+	comm_mpi.o \
+	allocate.o \
+	field_io_mpi.o \
+	pes_mpi.o \
+	reduction_mpi.o \
+	seed_mpi.o \
+	xbound_mpi.o
+
+OBJS_SHMEM = \
+	dotprod.o \
+	comm_shmem.o \
+	allocate_shmem.o \
+	field_io_shmem.o \
+	reduction_shmem.o \
+	seed_shmem.o \
+	xbound_shmem.o
+
+OBJS_SHMEMPI = \
+	dotprod.o \
+	comm_shmempi.o \
+	allocate_shmem.o \
+	field_io_mpi.o \
+	pes_mpi.o \
+	reduction_mpi.o \
+	seed_mpi.o \
+	xbound_shmem.o
+
+OBJS_SINGLE_PE = \
+	dotprod.o \
+	allocate.o \
+	comm_single_pe.o \
+	field_io_single_pe.o \
+	pes_single_pe.o \
+	reduction_single_pe.o \
+	seed_single_pe.o \
+	xbound_single_pe.o
+# Default goal. NOTE(review): LIBCOMM is presumably set in ../Makefile.defs to one of the lib_*.a below.
+$(LIBCOMM):
+
+fast:
+	$(FAST_MAKE)
+
+lib_mpi.a: $(OBJS_MPI)
+	$(AR) $(ARFLAGS) $@ $(OBJS_MPI)
+	$(RANLIB) $@
+
+lib_shmem.a: $(OBJS_SHMEM)
+	$(AR) $(ARFLAGS) $@ $(OBJS_SHMEM)
+	$(RANLIB) $@
+
+lib_shmempi.a: $(OBJS_SHMEMPI)
+	$(AR) $(ARFLAGS) $@ $(OBJS_SHMEMPI)
+	$(RANLIB) $@
+
+lib_single_pe.a: $(OBJS_SINGLE_PE)
+	$(AR) $(ARFLAGS) $@ $(OBJS_SINGLE_PE)
+	$(RANLIB) $@
+# Removes objects, the generated lower-case .f90 files, module files and all four libraries.
+clobber:
+	rm -f *.[Tiod] *.f90 *.mod work.pc work.pcl
+	rm -f lib_mpi.a lib_shmem.a lib_shmempi.a lib_single_pe.a
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate.F90
new file mode 100644
index 0000000000000000000000000000000000000000..6993128034183d80f1efac6cb00aa360b7a8bfb2
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate.F90
@@ -0,0 +1,133 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! 
Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! allocate.F90 - allocation of gauge and pseudo fermion fields
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine allocate_g_field(u)
+  ! Allocates the pointer u and zeroes it with conf_zero; dies if u is already associated.
+  use module_vol
+  implicit none
+  P_GAUGE_FIELD :: u
+
+  if (associated(u)) then
+    call die("allocate_g_field(): memory leak")
+  else
+    allocate(u(NCOL, NCOL, volh_tot, EVEN:ODD, DIM))
+    call conf_zero(u)
+  endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_g_field_io(u)
+  ! Allocates the I/O layout of u; dies if u is already associated. The array is not initialised.
+  use module_lattice_io
+  implicit none
+  P_GAUGE_FIELD_IO :: u
+
+  if (associated(u)) then
+    call die("allocate_g_field_io(): memory leak")
+  else
+    allocate(u(NCOL, NCOL-1, DIM, 0:NX-1, 0:NY-1, 0:NZ-1, 0:NT-1))   ! second extent is one less than the first
+  endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_gen_field(x)
+  ! Allocates x and zeroes its first eight components; dies if x is already associated.
+  use module_vol
+  implicit none
+  P_GENERATOR_FIELD :: x
+
+  integer :: i, eo, mu
+
+  if (associated(x)) then
+    call die("allocate_gen_field(): memory leak")
+  else
+    allocate(x(NGEN, volh_tot, EVEN:ODD, DIM))
+    do mu = 1, DIM
+      do eo = EVEN, ODD
+        !$omp parallel do
+        do i = 1, volh                 ! only sites 1..volh are zeroed, not the full volh_tot extent
+          x(1, i, eo, mu) = ZERO
+          x(2, i, eo, mu) = ZERO
+          x(3, i, eo, mu) = ZERO
+          x(4, i, eo, mu) = ZERO
+          x(5, i, eo, mu) = ZERO
+          x(6, i, eo, mu) = ZERO
+          x(7, i, eo, mu) = ZERO
+          x(8, i, eo, mu) = ZERO
+        enddo
+      enddo
+    enddo
+  endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc_field(x)
+  ! Allocates x and zeroes it with sc_zero; dies if x is already associated.
+  use module_vol
+  implicit none
+  P_SPINCOL_FIELD :: x
+
+  if (associated(x)) then
+    call die("allocate_sc_field(): memory leak")
+  else
+    allocate(x(NDIRAC, NCOL, volh_tot))
+    call sc_zero(x)
+  endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc_field_io(x)
+  ! Allocates the I/O layout of x; dies if x is already associated. The array is not initialised.
+  use module_lattice_io
+  implicit none
+  P_SPINCOL_FIELD_IO :: x
+
+  if (associated(x)) then
+    call die("allocate_sc_field_io(): memory leak")
+  else
+    allocate(x(NDIRAC, NCOL, 0:NXH-1, 0:NY-1, 0:NZ-1, 0:NT-1))
+  endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc_overindexed(x)
+  ! Allocates x as a rank-1 array; dies if x is already associated. The array is not initialised.
+  use module_vol
+  implicit none
+  P_SPINCOL_OVERINDEXED :: x
+
+  if (associated(x)) then
+    call die("allocate_sc_overindexed(): memory leak")
+  else
+    allocate(x(SIZE_COMPLEX*NDIRAC*NCOL*volh_tot))
+  endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc2_field(x)
+  ! Allocates x; dies if x is already associated. The array is not initialised.
+  use module_vol
+  implicit none
+  P_SC2_FIELD :: x
+
+  if (associated(x)) then
+    call die("allocate_sc2_field(): memory leak")
+  else
+    allocate(x(2, NCOL, volh_tot, DIM, FWD:BWD))
+  endif
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate_shmem.F90
new file mode 100644
index 0000000000000000000000000000000000000000..8e79bcea19c76dca26bf02917496e3b60f04f2cb
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/allocate_shmem.F90
@@ -0,0 +1,229 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! allocate_shmem.F90 - allocation of gauge and pseudo fermion fields using shmem
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+# include "shmem.h"
+
+!-------------------------------------------------------------------------------
+subroutine allocate_g_field(u)
+  ! shmem variant: memory comes from shpalloc through a Cray pointer and u is then pointed at it.
+  use module_vol
+  implicit none
+  P_GAUGE_FIELD :: u
+
+  GAUGE_FIELD :: uu       ! pointee: storage is whatever p_uu addresses
+  pointer (p_uu, uu)      ! Cray pointer
+
+  integer :: ierr
+  ! NOTE(review): unlike allocate.F90 there is no conf_zero call here - confirm callers initialise u.
+  if (associated(u)) call die("allocate_g_field(): memory leak")
+
+  call barrier()
+  call shpalloc(p_uu, SIZE_COMPLEX * NCOL * NCOL * volh_tot * 2 * DIM, ierr, 1)
+  call cray_pointer_to_f90_pointer(uu)
+  ! NOTE(review): ierr is never inspected; the last shpalloc argument presumably requests abort on error.
+CONTAINS
+
+  subroutine cray_pointer_to_f90_pointer(uu)
+    ! Receives the pointee as a target dummy so the host pointer u can be associated with it.
+    implicit none
+    GAUGE_FIELD, target :: uu
+
+    u => uu
+  end subroutine cray_pointer_to_f90_pointer
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_g_field_io(u)
+  ! shmem variant for the I/O layout; same Cray pointer scheme as allocate_g_field above.
+  use module_lattice_io
+  use module_vol
+  implicit none
+  P_GAUGE_FIELD_IO :: u
+
+  GAUGE_FIELD_IO :: uu    ! pointee
+  pointer (p_uu, uu)      ! Cray pointer
+
+  integer :: ierr
+
+  if (associated(u)) call die("allocate_g_field_io(): memory leak")
+
+  call barrier()
+  call shpalloc(p_uu, SIZE_COMPLEX * NCOL * (NCOL-1) * DIM * vol, ierr, 1)
+  call cray_pointer_to_f90_pointer(uu)
+
+CONTAINS
+
+  subroutine cray_pointer_to_f90_pointer(uu)
+    ! Receives the pointee as a target dummy so the host pointer u can be associated with it.
+    implicit none
+    GAUGE_FIELD_IO, target :: uu
+
+    u => uu
+  end subroutine cray_pointer_to_f90_pointer
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_gen_field(u)
+  ! shmem variant; the requested length has no SIZE_COMPLEX factor, unlike the other allocators here.
+  use module_vol
+  implicit none
+  P_GENERATOR_FIELD :: u
+
+  GENERATOR_FIELD :: uu   ! pointee
+  pointer (p_uu, uu)      ! Cray pointer
+
+  integer :: ierr
+  ! NOTE(review): the field is not zeroed here, unlike allocate_gen_field in allocate.F90 - confirm.
+  if (associated(u)) call die("allocate_gen_field(): memory leak")
+
+  call barrier()
+  call shpalloc(p_uu, NGEN * volh_tot * 2 * DIM, ierr, 1)
+  call cray_pointer_to_f90_pointer(uu)
+
+CONTAINS
+
+  subroutine cray_pointer_to_f90_pointer(uu)
+    ! Receives the pointee as a target dummy so the host pointer u can be associated with it.
+    implicit none
+    GENERATOR_FIELD, target :: uu
+
+    u => uu
+  end subroutine cray_pointer_to_f90_pointer
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc_field(x)
+! Obtain memory for a spin-colour field with shpalloc and associate x with it; die if already associated.
+  use module_vol
+  implicit none
+  P_SPINCOL_FIELD :: x   ! F90 pointer handed back to the caller
+
+  SPINCOL_FIELD :: xx
+  pointer (p_xx, xx)     ! Cray pointer: p_xx holds the address of pointee xx
+
+  integer :: ierr        ! error code from shpalloc; not inspected below
+
+  if (associated(x)) call die("allocate_sc_field(): memory leak")
+
+  call barrier()
+  call shpalloc(p_xx, SIZE_COMPLEX * NDIRAC * NCOL * volh_tot, ierr, 1)   ! final 1: presumably abort on error - confirm
+  call cray_pointer_to_f90_pointer(xx)
+
+CONTAINS
+! Internal helper: the dummy carries the target attribute so that the host pointer x can be associated with it.
+  subroutine cray_pointer_to_f90_pointer(xx)
+
+    implicit none
+    SPINCOL_FIELD, target :: xx
+
+    x => xx
+  end subroutine cray_pointer_to_f90_pointer
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc2_field(x)
+! Same scheme; the requested length is SIZE_COMPLEX * 2 * NCOL * volh_tot * DIM * 2.
+  use module_vol
+  implicit none
+  P_SC2_FIELD :: x     ! F90 pointer handed back to the caller
+
+  SC2_FIELD :: xx
+  pointer (p_xx, xx)   ! Cray pointer / pointee pair
+
+  integer :: ierr      ! error code from shpalloc; not inspected below
+
+  if (associated(x)) call die("allocate_sc2_field(): memory leak")
+
+  call barrier()
+  call shpalloc(p_xx, SIZE_COMPLEX * 2 * NCOL * volh_tot * DIM * 2, ierr, 1)
+  call cray_pointer_to_f90_pointer(xx)
+
+CONTAINS
+! Internal helper: associates the host pointer x with the shpalloc-ed pointee.
+  subroutine cray_pointer_to_f90_pointer(xx)
+
+    implicit none
+    SC2_FIELD, target :: xx
+
+    x => xx
+  end subroutine cray_pointer_to_f90_pointer
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc_field_io(x)
+! Same scheme for the I/O layout; note the length uses volh, not volh_tot.
+  use module_lattice_io
+  use module_vol
+  implicit none
+  P_SPINCOL_FIELD_IO :: x   ! F90 pointer handed back to the caller
+
+  SPINCOL_FIELD_IO :: xx
+  pointer (p_xx, xx)        ! Cray pointer / pointee pair
+
+  integer :: ierr           ! error code from shpalloc; not inspected below
+
+  if (associated(x)) call die("allocate_sc_field_io(): memory leak")
+
+  call barrier()
+  call shpalloc(p_xx, SIZE_COMPLEX * NDIRAC * NCOL * volh, ierr, 1)
+  call cray_pointer_to_f90_pointer(xx)
+
+CONTAINS
+! Internal helper: associates the host pointer x with the shpalloc-ed pointee.
+  subroutine cray_pointer_to_f90_pointer(xx)
+
+    implicit none
+    SPINCOL_FIELD_IO, target :: xx
+
+    x => xx
+  end subroutine cray_pointer_to_f90_pointer
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine allocate_sc_overindexed(x)
+! Same scheme for the over-indexed view; the length is SIZE_COMPLEX * NDIRAC * NCOL * volh_tot.
+  use module_vol
+  implicit none
+  P_SPINCOL_OVERINDEXED :: x   ! F90 pointer handed back to the caller
+
+  SPINCOL_OVERINDEXED :: xx
+  pointer (p_xx, xx)           ! Cray pointer / pointee pair
+
+  integer :: ierr              ! error code from shpalloc; not inspected below
+
+  if (associated(x)) call die("allocate_sc_overindexed(): memory leak")
+
+  call barrier()
+  call shpalloc(p_xx, SIZE_COMPLEX * NDIRAC * NCOL * volh_tot, ierr, 1)
+  call cray_pointer_to_f90_pointer(xx)
+
+CONTAINS
+! Internal helper: associates the host pointer x with the shpalloc-ed pointee.
+  subroutine cray_pointer_to_f90_pointer(xx)
+
+    implicit none
+    SPINCOL_OVERINDEXED, target :: xx
+
+    x => xx
+  end subroutine cray_pointer_to_f90_pointer
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/bqcd.pcl b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/bqcd.pcl
new file mode 100644
index 0000000000000000000000000000000000000000..906244500b31700684482c3dcfd32f6cec4279db
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/bqcd.pcl
@@ -0,0 +1,2 @@
+work.pc
+../modules/work.pc
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_mpi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..cf4de03f19d6fd754f7db9491642aab5330eb56c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_mpi.F90
@@ -0,0 +1,46 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! comm_mpi.F90 - wrapper for MPI routines
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine comm_init()
+! Communication start-up hook of the MPI build; currently a no-op.
+  implicit none
+  include 'mpif.h'
+  integer ierror
+! NOTE(review): mpi_init is disabled here; presumably the program embedding this kernel initialises MPI - confirm.
+!  call mpi_init(ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine comm_finalize()
+! Communication shut-down hook of the MPI build; currently a no-op.
+  implicit none
+  include 'mpif.h'
+  integer ierror
+! NOTE(review): mpi_finalize is disabled as well; presumably left to the embedding program - confirm.
+!  call mpi_finalize(ierror)
+end
+
+!-------------------------------------------------------------------------------
+COMM_METHOD function comm_method()
+! Return the name of the communication layer, with an OpenMP suffix when _OPENMP is defined.
+#ifdef _OPENMP
+  comm_method = "MPI + OpenMP"
+#else
+  comm_method = "MPI"
+#endif
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmem.F90
new file mode 100644
index 0000000000000000000000000000000000000000..84a096d709fea1ae7a90083fce27df6ff44cd4da
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmem.F90
@@ -0,0 +1,44 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! comm_shmem.F90 - routines for shmem versions
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine comm_init()   ! start-up hook of the shmem build; only the ALTIX build calls start_pes
+#ifdef ALTIX
+  call start_pes(0)
+#endif
+  return
+end
+
+!-------------------------------------------------------------------------------
+subroutine comm_finalize()   ! shut-down hook of the shmem build; nothing to do
+  return
+end
+
+!-------------------------------------------------------------------------------
+COMM_METHOD function comm_method()
+! Return the name of the communication layer, with an OpenMP suffix when _OPENMP is defined.
+#ifdef _OPENMP
+  comm_method = "shmem + OpenMP"
+#else
+  comm_method = "shmem"
+#endif
+end
+
+!-------------------------------------------------------------------------------
+integer function get_d3_buffer_vol()   ! always returns 0 in this build
+  get_d3_buffer_vol = 0
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmempi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmempi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..d592215714ab6e724bcec00ad5138d5601589336
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_shmempi.F90
@@ -0,0 +1,51 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! comm_shmempi.F90 - MPI + shmem on Altix
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine comm_init()
+! Start-up hook of the shmem/MPI build: initialises MPI. The returned ierror is not checked.
+  implicit none
+  include 'mpif.h'
+  integer ierror
+
+  call mpi_init(ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine comm_finalize()
+! Shut-down hook of the shmem/MPI build: finalises MPI. The returned ierror is not checked.
+  implicit none
+  include 'mpif.h'
+  integer ierror
+
+  call mpi_finalize(ierror)
+end
+
+!-------------------------------------------------------------------------------
+COMM_METHOD function comm_method()
+! Return the name of the communication layer, with an OpenMP suffix when _OPENMP is defined.
+#ifdef _OPENMP
+  comm_method = "shmem/MPI + OpenMP"
+#else
+  comm_method = "shmem/MPI"
+#endif
+end
+
+!-------------------------------------------------------------------------------
+integer function get_d3_buffer_vol()   ! always returns 0 in this build
+  get_d3_buffer_vol = 0
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_single_pe.F90
new file mode 100644
index 0000000000000000000000000000000000000000..a977e50363d06772a61c3176c267543584f705f3
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/comm_single_pe.F90
@@ -0,0 +1,41 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! comm_single_pe.F90 - (dummy) routines for single CPU version
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine comm_init()   ! dummy: the single PE build needs no communication start-up
+  return
+end
+
+!-------------------------------------------------------------------------------
+subroutine comm_finalize()   ! dummy: the single PE build needs no communication shut-down
+  return
+end
+
+!-------------------------------------------------------------------------------
+COMM_METHOD function comm_method()
+! Return the name of the communication layer, with an OpenMP suffix when _OPENMP is defined.
+#ifdef _OPENMP
+  comm_method = "single_pe + OpenMP"
+#else
+  comm_method = "single_pe"
+#endif
+end
+
+!-------------------------------------------------------------------------------
+integer function get_d3_buffer_vol()   ! always returns 0 in this build
+  get_d3_buffer_vol = 0
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/dotprod.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/dotprod.F90
new file mode 100644
index 0000000000000000000000000000000000000000..3600c9039f33867db6ec3cee765797bc041d0fe4
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/dotprod.F90
@@ -0,0 +1,33 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! dotprod.F90 - dot product for parallel computers
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+REAL function dotprod(a, b, n)
+! Sum a(i) * b(i) over the n local elements (OpenMP reduction) and pass the result through global_sum.
+  implicit none
+  integer i, n
+  REAL a(n), b(n), s, global_sum
+
+  s = ZERO
+  !$omp parallel do reduction(+: s)
+  do i = 1, n
+     s = s + a(i) * b(i)
+  enddo
+
+  dotprod = global_sum(s)
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_mpi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..83843827fafee23e802b7692cb5bab6482e1ef2a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_mpi.F90
@@ -0,0 +1,240 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! field_io_mpi.F90 - I/O routine for gauge and pseudo fermion fields using MPI
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+#ifndef INCLUDE_MPIF_H
+#define INCLUDE_MPIF_H include 'mpif.h'
+#endif
+
+!-------------------------------------------------------------------------------
+subroutine field_io(action, m, mx, field, cksum) ! read or write g- or sc-field
+! One direct-access file per global time slice; the PE with i_pe(1:3) == 0 of each slice does the file I/O.
+  use typedef_cksum
+  use module_decomp
+  use module_function_decl
+  use module_lattice_io
+  use module_vol
+  implicit none
+  INCLUDE_MPIF_H
+! action is compared against read and write; cksum(t) supplies the file name and holds the check sum.
+  character(len = *) :: action
+  integer :: m, mx
+  type(type_cksum) :: cksum(0:LT - 1)
+
+  REAL :: field(SIZE_COMPLEX * m, 0:mx - 1, 0:NY - 1, 0:NZ - 1, 0:NT - 1)
+! buffer holds one record: a full x-y plane assembled from all NPE(1) * NPE(2) PEs.
+  REAL :: buffer(0:(SIZE_COMPLEX * m * mx * NPE(1))-1, 0:(NY * NPE(2))-1)
+
+  FILENAME :: file
+  integer :: i_pe(DIM)
+  integer, external :: i_global, ilex
+  integer :: x, y, z, t, t_global, me, pe, size, rec, recl
+  integer :: size_field
+  integer :: pe_x, pe_y, pe_z, pe_t
+  CHECK_SUM :: check_sum(2), n_bytes
+  integer :: tt, pe_tt, pe_io
+  integer :: status(MPI_STATUS_SIZE), ierror
+  integer :: count, block_length, stride, buf_type
+  logical :: io_pe
+
+! Geometry of one local x-y tile inside buffer: count rows of block_length reals, stride reals apart.
+  count = NY
+  block_length = SIZE_COMPLEX * m * mx
+  stride = block_length * NPE(1)
+
+  size = block_length * NY ! words in send/recv
+  size_field = block_length * NY * NZ * NT ! words in "field"
+  n_bytes = size * NPE(1) * NPE(2) * RKIND ! bytes in "buffer"
+  recl = n_bytes ! cast to standard integer
+
+  ASSERT(mod(recl, RECL_UNIT) == 0)
+  recl = recl / RECL_UNIT
+
+  i_pe = decomp%std%i_pe
+
+  call mpi_type_vector(count, block_length, stride, BQCD_REAL, buf_type, ierror)
+  call mpi_type_commit(buf_type, ierror)
+! Every PE byte-swaps its field before a write; the unconditional swap at the end is the counterpart.
+  if (action == "write") call swap_endian8(size_field, field)
+
+  if (i_pe(1) == 0 .and. i_pe(2) == 0 .and. i_pe(3) == 0) then
+     io_pe = .true.
+  else
+     io_pe = .false.
+  endif
+
+  pe_t = i_pe(4)
+  do t = 0, NT - 1
+     t_global = i_global(t, NT, i_pe(4))
+
+     file = cksum(t_global)%file
+
+     if (io_pe) then
+        open(UCONF, file = file, action = action, form = "unformatted", &
+             access = "direct", recl = recl)
+     endif
+
+     rec = 0
+     call cksum_init()
+! One record per global z: outer loop over PEs in z, inner loop over local z.
+     do pe_z = 0, NPE(3) - 1
+     do z = 0, NZ - 1
+        rec = rec + 1
+
+        if (io_pe .and. action == "read") then
+           read(UCONF, rec = rec) buffer
+           call cksum_add(buffer, n_bytes)
+        endif
+
+        do pe_y = 0, NPE(2) - 1
+        do pe_x = 0, NPE(1) - 1
+
+           y = count * pe_y
+           x = block_length * pe_x
+
+           call field_io_pes(pe, pe_io, (/pe_x, pe_y, pe_z, pe_t/))
+
+           if (io_pe) then
+              if (my_pe() /= pe_io) call die("my_pe() /= pe_io")
+           endif
+! Local copy when the tile owner is the I/O PE itself, otherwise a synchronous send / receive pair.
+           if (my_pe() == pe .and. my_pe() == pe_io) then
+              call field_io_seq(action, count, block_length, stride, &
+                                field(1,0,0,z,t), buffer(x,y))
+           else
+              if (action == "read") then
+                 if (my_pe() == pe_io) then
+                    call mpi_ssend(buffer(x,y), 1, buf_type, &
+                                   pe, 0, MPI_COMM_WORLD, ierror)
+                 endif
+                 if (my_pe() == pe) then
+                    call mpi_recv(field(1,0,0,z,t), size, BQCD_REAL, &
+                                  pe_io, 0, MPI_COMM_WORLD, status, ierror)
+                 endif
+              else
+                 if (my_pe() == pe_io) then
+                    call mpi_recv(buffer(x,y), 1, buf_type, &
+                                  pe, 0, MPI_COMM_WORLD, status, ierror)
+                 endif
+                 if (my_pe() == pe) then
+                    call mpi_ssend(field(1,0,0,z,t), size, BQCD_REAL, &
+                                   pe_io, 0, MPI_COMM_WORLD, ierror)
+                 endif
+              endif
+           endif
+
+        enddo
+        enddo
+
+        if (io_pe .and. action == "write") then
+           write(UCONF, rec = rec) buffer
+           call cksum_add(buffer, n_bytes)
+        endif
+     enddo
+     enddo
+
+     if (io_pe) then
+        close(UCONF)
+        call cksum_get(check_sum(1), check_sum(2))
+
+        if (action == "read") then
+
+           if (check_sum(1) /= cksum(t_global)%sum) then
+              call die("field_io(): check sum error in file " // file)
+           endif
+
+        else
+! After a write PE 0 collects the check sums of the other I/O PEs; the message tag is the global t.
+           if (my_pe() == 0) then
+              cksum(t_global)%sum = check_sum(1)
+              cksum(t_global)%bytes = check_sum(2)
+              do pe_tt = 1, NPE(4) - 1
+                 tt = i_global(t, NT, pe_tt)
+                 call mpi_recv(check_sum, 2, BQCD_CHECK_SUM, &
+                               MPI_ANY_SOURCE, tt, MPI_COMM_WORLD, status, ierror)
+                 cksum(tt)%sum = check_sum(1)
+                 cksum(tt)%bytes = check_sum(2)
+              enddo
+           else
+              call mpi_ssend(check_sum, 2, BQCD_CHECK_SUM, 0, t_global, &
+                             MPI_COMM_WORLD, ierror)
+           endif
+        endif
+     endif
+  enddo
+
+  call swap_endian8(size_field, field)
+  call mpi_type_free(buf_type, ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine field_io_seq(action, count, block_length, stride, field, buffer)
+! Copy count rows of block_length words between contiguous field and strided buffer (row pitch stride).
+  use module_lattice_io
+  use module_vol
+  implicit none
+! action read copies buffer into field; any other value copies field into buffer.
+  character(len = *) :: action
+  integer :: count, block_length, stride
+  REAL :: field(*)
+  REAL :: buffer(*)
+  integer :: i, j, x, y   ! i: running index in field, j: start offset of the current row in buffer
+
+  i = 0
+  j = 0
+  do y = 1, count
+     do x = 1, block_length
+        i = i + 1
+        if (action == "read") then
+           field(i) = buffer(j + x)
+        else
+           buffer(j + x) = field(i)
+        endif
+     enddo
+     j = j + stride
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine field_io_pes(pe, pe_io, x_std)
+! Map PE coordinates x_std to the rank pe, and to the rank pe_io of the I/O PE of the same time slice.
+  use module_lattice ! in contrast to the calling routine !!
+  implicit none
+  integer, intent(out) :: pe, pe_io
+  integer, intent(in) :: x_std(DIM)
+  integer :: x_act(DIM), x_std_io(DIM), x_act_io(DIM)
+  integer, external :: ilex
+! The I/O PE has coordinates 0 in directions 1 to 3 and the same coordinate in direction 4.
+  x_std_io(1) = 0
+  x_std_io(2) = 0
+  x_std_io(3) = 0
+  x_std_io(4) = x_std(4)
+! Permute both coordinate sets with gamma_index before computing the lexicographic rank.
+  x_act(1) = x_std(gamma_index(1))
+  x_act(2) = x_std(gamma_index(2))
+  x_act(3) = x_std(gamma_index(3))
+  x_act(4) = x_std(gamma_index(4))
+
+  x_act_io(1) = x_std_io(gamma_index(1))
+  x_act_io(2) = x_std_io(gamma_index(2))
+  x_act_io(3) = x_std_io(gamma_index(3))
+  x_act_io(4) = x_std_io(gamma_index(4))
+
+  pe = ilex(DIM, x_act, NPE)
+  pe_io = ilex(DIM, x_act_io, NPE)
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_shmem.F90
new file mode 100644
index 0000000000000000000000000000000000000000..00da858897bd42b72f8acc299cb36ed6c6481034
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_shmem.F90
@@ -0,0 +1,136 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! field_io_shmem.F90 - I/O routine for gauge and pseudo fermion fields (shmem)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+# include "shmem.h"
+
+!-------------------------------------------------------------------------------
+subroutine field_io(action, m, mx, field, cksum) ! read or write g- or sc-field
+! One direct-access file per global time slice; PEs with i_pe(2) == 0 and i_pe(3) == 0 do the file I/O.
+  use typedef_cksum
+  use module_function_decl
+  use module_lattice
+  use module_vol
+  implicit none
+! action is compared against read and write; cksum(t) supplies the file name and holds the check sum.
+  character(len = *) :: action
+  integer :: m, mx
+  COMPLEX :: field(m, 0:mx-1, 0:NY-1, 0:NZ-1, 0:NT-1)
+  type(type_cksum) :: cksum(0:LT-1)
+
+!!! COMPLEX :: buffer(m * mx * NY, 0:NPE(2)-1)
+!!!!dir$ symmetric buffer
+  FILENAME :: file
+  integer :: i_pe(DIM)
+  integer, external :: i_global, ilex
+  integer :: t, t_global, z, me, pe, size, rec, recl
+  integer :: size_field
+  integer :: pe_x, pe_y, pe_z, pe_t
+  CHECK_SUM :: check_sum(2), n_bytes
+! buffer and check_sum_master are pointees of Cray pointers that are allocated once with shpalloc.
+  COMPLEX :: buffer(m * mx * NY, 0:NPE(2)-1)
+  CHECK_SUM :: check_sum_master(2, 0:LT-1)
+
+  pointer(p_buffer, buffer)
+  pointer(p_check_sum_master, check_sum_master)
+
+  save p_buffer
+  save p_check_sum_master
+
+  logical, save :: initialized = .false.
+  integer :: ierr   ! error code from shpalloc; not inspected below
+! NOTE(review): the buffer is sized from m and mx of the first call only - confirm later calls never need more.
+  if (.not. initialized) then
+     call barrier()
+     call shpalloc(p_buffer, SIZE_COMPLEX * m * mx * NY * NPE(2), ierr, 1)
+     call barrier()
+     call shpalloc(p_check_sum_master, 2 * LT, ierr, 1)
+     initialized = .true.
+  endif
+! Sizes and the pre-write byte swap are needed on EVERY PE: remote fields are fetched with shmem_get and
+  size = SIZE_COMPLEX * m * mx * NY ! size in shmem
+  size_field = size * NZ * NT ! size of "field"
+  if (action == "write") call swap_endian8(size_field, field)
+! all PEs call swap_endian8 again at the end. The barrier below orders the swap before any remote get.
+  call barrier()
+
+  call unlex(my_pe(), DIM, i_pe, NPE)
+
+  if (i_pe(2) == 0 .and. i_pe(3) == 0) then
+     n_bytes = size * RKIND * NPE(2) ! no. of bytes of u_buf
+     recl = n_bytes ! cast to standard integer
+
+     ASSERT(mod(recl, RECL_UNIT) == 0)
+     recl = recl / RECL_UNIT
+
+     pe_t = i_pe(4)
+     do t = 0, NT - 1
+        t_global = i_global(t, NT, i_pe(4))
+
+        file = cksum(t_global)%file
+
+        open(UCONF, file = file, action = action, form = "unformatted", &
+             access = "direct", recl = recl)
+        rec = 0
+        call cksum_init()
+! One record per global z: outer loop over PEs in z, inner loop over local z.
+        do pe_z = 0, NPE(3) - 1
+        do z = 0, NZ - 1
+           rec = rec + 1
+
+           if (action == "read") then
+              read(UCONF, rec = rec) buffer
+              call cksum_add(buffer, n_bytes)
+           endif
+! Only pe_x = 0 is addressed; NOTE(review): this looks like it assumes NPE(1) == 1 - confirm.
+           do pe_y = 0, NPE(2) - 1
+              pe_x = 0
+              pe = ilex(DIM, (/pe_x, pe_y, pe_z, pe_t/), NPE)
+
+              if (action == "read") then
+                 call shmem_put(field(1,0,0,z,t), buffer(1, pe_y), size, pe)
+              else
+                 call shmem_get(buffer(1, pe_y), field(1,0,0,z,t), size, pe)
+              endif
+
+           enddo
+           if (action == "write") then
+              write(UCONF, rec = rec) buffer
+              call cksum_add(buffer, n_bytes)
+           endif
+        enddo
+        enddo
+        close(UCONF)
+        call cksum_get(check_sum(1), check_sum(2))
+        if (action == "read") then
+           if (check_sum(1) /= cksum(t_global)%sum) then
+              call die("field_io(): check sum error in file " // file)
+           endif
+        else
+           call shmem_put(check_sum_master(1, t_global), check_sum, 2, 0)
+        endif
+     enddo
+  endif
+  call barrier()
+! After a write the check sums deposited on PE 0 are copied into cksum; they are only put to PE 0.
+  if (action == "write") then
+     do t_global = 0, LT - 1
+        cksum(t_global)%sum = check_sum_master(1, t_global)
+        cksum(t_global)%bytes = check_sum_master(2, t_global)
+     enddo
+  endif
+
+  call swap_endian8(size_field, field)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_single_pe.F90
new file mode 100644
index 0000000000000000000000000000000000000000..e72e1030314a7218443002bf5e15447eba370efb
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/field_io_single_pe.F90
@@ -0,0 +1,52 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! field_io_single_pe.F90 - I/O routine for gauge and pseudo fermion fields
+! (single processor version)
+!
+!-------------------------------------------------------------------------------
+# define INCLUDE_MPIF_H
+! INCLUDE_MPIF_H is defined empty and the MPI names used by field_io_mpi.F90 are replaced by constants.
+# define MPI_STATUS_SIZE 2
+# define MPI_REAL8 0
+# define mpi_real8 0
+# define MPI_COMM_WORLD 0
+# define MPI_INTEGER8 0
+# define mpi_integer8 0
+# define MPI_ANY_SOURCE 0
+! The MPI implementation of field_io is reused verbatim; the stubs below satisfy its MPI calls.
+# include "field_io_mpi.F90"
+
+!-------------------------------------------------------------------------------
+subroutine mpi_type_vector(a, b, c, d, e, f)   ! stub: does nothing
+  return
+end
+
+!-------------------------------------------------------------------------------
+subroutine mpi_type_commit(a, b)   ! stub: does nothing
+  return
+end
+
+!-------------------------------------------------------------------------------
+subroutine mpi_type_free(a, b)   ! stub: does nothing
+  return
+end
+
+!-------------------------------------------------------------------------------
+subroutine mpi_ssend(a, b, c, d, e, f, g)   ! stub: calling it is treated as a fatal error
+  call die("mpi_ssend(): MPI must not be called in single PE version")
+end
+
+!-------------------------------------------------------------------------------
+subroutine mpi_recv(a, b, c, d, e, f, g, h)   ! stub: calling it is treated as a fatal error
+  call die("mpi_recv(): MPI must not be called in single PE version")
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_mpi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..88576ead59b4565ca3aa1703df73539b4617d109
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_mpi.F90
@@ -0,0 +1,33 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! pes_mpi.F90 - MPI version of shmem functions
+!
+!-------------------------------------------------------------------------------
+integer function my_pe()
+! Return the rank of the calling process in MPI_COMM_WORLD. The returned ierror is not checked.
+  implicit none
+  include 'mpif.h'
+  integer ierror
+
+  call mpi_comm_rank(MPI_COMM_WORLD, my_pe, ierror)
+end
+
+!-------------------------------------------------------------------------------
+integer function num_pes()
+! Return the number of processes in MPI_COMM_WORLD. The returned ierror is not checked.
+  implicit none
+  include 'mpif.h'
+  integer ierror
+
+  call mpi_comm_size(MPI_COMM_WORLD, num_pes, ierror)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_single_pe.F90
new file mode 100644
index 0000000000000000000000000000000000000000..ad919b808f4465cd4244abbbebf8c9987ae48195
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/pes_single_pe.F90
@@ -0,0 +1,28 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! pes_single_pe.F90 - dummy routines for shmem functions
+!
+!-------------------------------------------------------------------------------
+integer function my_pe()   ! single PE build: the only PE has number 0
+  my_pe = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function num_pes()   ! single PE build: exactly one PE
+  num_pes = 1
+end
+
+!-------------------------------------------------------------------------------
+subroutine barrier()   ! single PE build: nothing to synchronise
+  return
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_mpi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..3cd8fe7adb0870c3854ae66ca2d43fd1d2884dca
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_mpi.F90
@@ -0,0 +1,74 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! reduction_mpi.F90 - reduction operations in MPI
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+function global_sum(local_sum)
+! Sum local_sum over all processes of MPI_COMM_WORLD; every process receives the result.
+  implicit none
+  include 'mpif.h'
+  REAL global_sum, local_sum
+  integer ierror   ! returned by MPI, not checked
+
+  TIMING_START(timing_bin_global_sum)
+
+  call mpi_allreduce(local_sum, global_sum, 1, &
+                     BQCD_REAL, MPI_SUM, MPI_COMM_WORLD, ierror)
+
+  TIMING_STOP(timing_bin_global_sum)
+end
+
+!-------------------------------------------------------------------------------
+function global_min(local_min)
+! Minimum of local_min over all processes; operates on default real with MPI_REAL (not the REAL macro type).
+  implicit none
+  include 'mpif.h'
+  real global_min, local_min
+  integer ierror   ! returned by MPI, not checked
+
+  call mpi_allreduce(local_min, global_min, 1, &
+                     MPI_REAL, MPI_MIN, MPI_COMM_WORLD, ierror)
+end
+
+!-------------------------------------------------------------------------------
+function global_max(local_max)
+! Maximum of local_max over all processes; operates on default real with MPI_REAL (not the REAL macro type).
+  implicit none
+  include 'mpif.h'
+  real global_max, local_max
+  integer ierror   ! returned by MPI, not checked
+
+  call mpi_allreduce(local_max, global_max, 1, &
+                     MPI_REAL, MPI_MAX, MPI_COMM_WORLD, ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine global_sum_vec(n, sum)
+! Element-wise sum of sum(1:n) over all processes, returned in place on every process.
+  implicit none
+  include 'mpif.h'
+  integer, intent(in) :: n
+  REAL, intent(inout) :: sum(n)
+  REAL :: tmp(n)   ! copy of the input, used as send buffer so that sum can receive the result
+  integer ierror   ! returned by MPI, not checked
+
+  TIMING_START(timing_bin_global_sum_vec)
+
+  tmp = sum
+  call mpi_allreduce(tmp, sum, n, BQCD_REAL, MPI_SUM, MPI_COMM_WORLD, ierror)
+
+  TIMING_STOP(timing_bin_global_sum_vec)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_shmem.F90
new file mode 100644
index 0000000000000000000000000000000000000000..6fb261cd1ddcc41a6a3616a437b6b19d0cba2227
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_shmem.F90
@@ -0,0 +1,118 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! reduction_shmem.F90 - reduction operations in shmem
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+# include "shmem.h"
+
+!-------------------------------------------------------------------------------
+function global_sum(local_sum)
+! Sum local_sum over all PEs with a shmem reduction; with a single PE the input is returned directly.
+  implicit none
+  include 'mpp/shmem.fh'
+
+  REAL :: global_sum, local_sum
+  REAL, save :: source, target   ! saved so that they are remotely accessible
+  integer :: n_pes
+  integer, external :: num_pes
+  REAL, save :: pWrk(2 + shmem_reduce_min_wrkdata_size)
+  integer, save :: pSync(shmem_reduce_sync_size) = shmem_sync_value
+! pSync must hold the sync value before the first reduction; it is initialised once, at load time.
+
+  TIMING_START(timing_bin_global_sum)
+
+  n_pes = num_pes()
+! No early return here: the timer started above has to be stopped on every path.
+  if (n_pes == 1) then
+     global_sum = local_sum
+  else
+     source = local_sum
+
+     call shmem_real8_sum_to_all(target, source, 1, 0, 0, n_pes, pWrk, pSync)
+
+     global_sum = target
+  endif
+
+  TIMING_STOP(timing_bin_global_sum)
+end
+
+!-------------------------------------------------------------------------------
+function global_min(local_min)
+! Minimum of local_min over all PEs; with a single PE the input is returned directly.
+  implicit none
+  include 'mpp/shmem.fh'
+! The interface stays default real; the buffers use the REAL macro type, as global_sum does for real8 calls.
+  real :: global_min, local_min
+  REAL, save :: source, target
+  integer :: n_pes
+  integer, external :: num_pes
+  REAL, save :: pWrk(2 + shmem_reduce_min_wrkdata_size)
+  integer, save :: pSync(shmem_reduce_sync_size) = shmem_sync_value
+
+  n_pes = num_pes()
+
+  if (n_pes == 1) then
+     global_min = local_min
+     return
+  endif
+
+  source = local_min
+
+  call shmem_real8_min_to_all(target, source, 1, 0, 0, n_pes, pWrk, pSync)
+
+  global_min = target
+end
+
+!-------------------------------------------------------------------------------
+function global_max(local_max)
+! Maximum of local_max over all PEs; with a single PE the input is returned directly.
+  implicit none
+  include 'mpp/shmem.fh'
+! The interface stays default real; the buffers use the REAL macro type, as global_sum does for real8 calls.
+  real :: global_max, local_max
+  REAL, save :: source, target
+  integer :: n_pes
+  integer, external :: num_pes
+  REAL, save :: pWrk(2 + shmem_reduce_min_wrkdata_size)
+  integer, save :: pSync(shmem_reduce_sync_size) = shmem_sync_value
+
+  n_pes = num_pes()
+
+  if (n_pes == 1) then
+     global_max = local_max
+     return
+  endif
+
+  source = local_max
+
+  call shmem_real8_max_to_all(target, source, 1, 0, 0, n_pes, pWrk, pSync)
+
+  global_max = target
+end
+
+!-------------------------------------------------------------------------------
+subroutine global_sum_vec(n, sum)
+! Vector reduction is not available in the shmem build: the routine only reports a fatal error.
+  implicit none
+
+  integer, intent(in) :: n
+  REAL, intent(inout) :: sum(n)
+
+  TIMING_START(timing_bin_global_sum_vec)
+
+  call die("global_sum_vec(): shmem version not implemented yet.")
+
+  TIMING_STOP(timing_bin_global_sum_vec)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_single_pe.F90
new file mode 100644
index 0000000000000000000000000000000000000000..67bf6f99beb81aa5615da8aaf705be1931fd099d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/reduction_single_pe.F90
@@ -0,0 +1,52 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! reduction_single_pe.F90 - reduction operations on a single processor
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+function global_sum(local_sum)
+! Single PE build: the global sum is the local value.
+  implicit none
+  REAL :: global_sum, local_sum
+
+  global_sum = local_sum
+end
+
+!-------------------------------------------------------------------------------
+function global_min(local_min)
+! Single PE build: the global minimum is the local value (default real, not the REAL macro type).
+  implicit none
+  real :: global_min, local_min
+
+  global_min = local_min
+end
+
+!-------------------------------------------------------------------------------
+function global_max(local_max)
+! Single PE build: the global maximum is the local value (default real, not the REAL macro type).
+  implicit none
+  real :: global_max, local_max
+
+  global_max = local_max
+end
+
+!-------------------------------------------------------------------------------
+subroutine global_sum_vec(n, sum)
+! Single PE build: sum already holds the global result, so it is left untouched.
+  implicit none
+  integer, intent(in) :: n
+  REAL, intent(inout) :: sum(n)
+
+  return
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_mpi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..762a65cb26987313de6193127f10f4c021f619be
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_mpi.F90
@@ -0,0 +1,48 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! seed_mpi.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! seed_broadcast: overwrite seed on every PE with the value held by rank 0.
+subroutine seed_broadcast(seed)
+  use module_function_decl
+  implicit none
+  include 'mpif.h'
+  SEED :: seed
+  integer :: ierror
+
+  call mpi_bcast(seed, 1, BQCD_SEED, 0, MPI_COMM_WORLD, ierror)
+end subroutine seed_broadcast
+
+!-------------------------------------------------------------------------------
+! seed_compare: every PE with my_pe() /= 0 sends its seed to rank 0; the PE
+! with my_pe() == 0 receives them in rank order and calls die() on a mismatch.
+subroutine seed_compare(seed)
+  use module_function_decl
+  implicit none
+  include 'mpif.h'
+  SEED :: seed, remote_seed
+  integer :: ierror, sender, status(MPI_STATUS_SIZE)
+
+  if (my_pe() == 0) then
+     do sender = 1, num_pes() - 1
+        call mpi_recv(remote_seed, 1, BQCD_SEED, sender, 0, MPI_COMM_WORLD, status, ierror)
+        if (remote_seed /= seed) call die('rancheck(): seeds differ')
+     enddo
+  else
+     call mpi_ssend(seed, 1, BQCD_SEED, 0, 0, MPI_COMM_WORLD, ierror)
+  endif
+end subroutine seed_compare
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_shmem.F90
new file mode 100644
index 0000000000000000000000000000000000000000..8d76b5cf36f131a7e1c7c1370fba0306471fe53b
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_shmem.F90
@@ -0,0 +1,61 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! seed_shmem.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+# include "shmem.h"
+
+!-------------------------------------------------------------------------------
+subroutine seed_broadcast(seed)
+  ! Copies seed into the saved buffer s, broadcasts s over all PEs, copies s back.
+  use module_function_decl
+  implicit none
+  include "mpp/shmem.fh"
+
+  SEED :: seed
+  SEED, save :: s   ! used as both target and source of the broadcast
+  integer, save :: psync(SHMEM_BCAST_SYNC_SIZE)
+
+  psync = SHMEM_SYNC_VALUE
+  s = seed
+  ! barrier on both sides of the broadcast; s and psync are assigned before it
+  call barrier()
+  call shmem_broadcast(s, s, 1, 0, 0, 0, num_pes(), psync)
+  call barrier()
+  ! NOTE(review): 4th argument 0 is presumably the root PE - confirm against the shmem API
+  seed = s
+end
+
+!-------------------------------------------------------------------------------
+subroutine seed_compare(seed)
+  ! PE 0 fetches s from each other PE and calls die() if it differs from its own seed.
+  use module_function_decl
+  implicit none
+
+  SEED :: seed
+  SEED, save :: s   ! holds this PE seed; read remotely by PE 0 via shmem_get
+  integer :: pe
+
+  s = seed
+  call barrier()
+  ! only the PE with my_pe() == 0 compares; all other PEs just pass the barriers
+  if (my_pe() == 0) then
+     do pe = 1, num_pes() - 1
+        call shmem_get(s, s, 1, pe)   ! overwrites the local s with the copy fetched from pe
+        if (s /= seed) call die('rancheck(): seeds differ')
+     enddo
+  endif
+  ! presumably keeps the other PEs from moving on while PE 0 still reads their s
+  call barrier()
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_single_pe.F90
new file mode 100644
index 0000000000000000000000000000000000000000..188f7e7af78b1d2fb0c776784216f8459837456a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/seed_single_pe.F90
@@ -0,0 +1,34 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! seed_single_pe.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! seed_broadcast: single-PE dummy, seed is left unchanged.
+subroutine seed_broadcast(seed)
+  implicit none
+  SEED :: seed
+
+  ! nothing to do
+end subroutine seed_broadcast
+
+!-------------------------------------------------------------------------------
+! seed_compare: single-PE dummy, there is no other seed to compare with.
+subroutine seed_compare(seed)
+  implicit none
+  SEED :: seed
+
+  ! nothing to do
+end subroutine seed_compare
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/shmem.h b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/shmem.h
new file mode 100644
index 0000000000000000000000000000000000000000..f6289299e7c9bd847cbb0e3f48eb7f8083dcedbd
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/shmem.h
@@ -0,0 +1,7 @@
+#ifdef ALTIX
+# define barrier shmem_barrier_all
+# define shmem_broadcast shmem_broadcast8
+# define shmem_get shmem_get8
+# define shmem_put shmem_put8
+# define shpalloc(addr, length, errcode, abort) shpalloc(addr, 2 * (length), errcode, abort)
+#endif
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_mpi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_mpi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..51ca5e2eb0e696aa5e62fd8aceae61334a2de5f4
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_mpi.F90
@@ -0,0 +1,816 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! xbound_mpi.F90 - boundary exchange with MPI
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module module_xbound
+  ! Shared definitions: one exchange descriptor type and the list capacity.
+  implicit none
+  integer, parameter :: max_bound = 3 * 3 * 3 * 3
+  ! One descriptor per neighbour offset; all fields are filled in init_xch_bound.
+  type type_xbound
+     integer :: i_source      ! xyzt2i(source): first site that is sent
+     integer :: i_target      ! xyzt2i(target): first site that is received into
+     integer :: pe_source     ! nnpe(xx, yy, zz, tt): rank passed as receive source
+     integer :: pe_target     ! nnpe(-xx, -yy, -zz, -tt): rank passed as send destination
+     integer :: size ! total size (used as receive count)
+     integer :: vector_type   ! committed MPI datatype used on the send side
+     integer :: block_count
+     integer :: block_size
+     integer :: block_stride
+  end type type_xbound
+end
+
+!-------------------------------------------------------------------------------
+module module_xbound_g
+  ! Descriptor list used by xbound_g (filled by init_xbound_g).
+  use module_xbound
+  implicit none
+  integer, save :: n_bound = 0
+  type (type_xbound), save :: b(max_bound)
+end
+
+!-------------------------------------------------------------------------------
+module module_xbound_sc
+  ! Descriptor list used by xbound_sc; i_bound maps (direction, fwd/bwd) to an index in b.
+  use module_xbound
+  implicit none
+  integer, save :: n_bound = 0
+  integer, save :: i_bound(DIM, FWD:BWD)
+  type (type_xbound), save :: b(max_bound)
+end
+
+!-------------------------------------------------------------------------------
+module module_xbound_sc2
+  ! Descriptor list used by xbound_sc2 and xbound_sc2_field_i; same layout as above.
+  use module_xbound
+  implicit none
+  integer, save :: n_bound = 0
+  integer, save :: i_bound(DIM, FWD:BWD)
+  type (type_xbound), save :: b(max_bound)
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_xbound()
+  ! Builds all descriptor lists; the d3/d4 setup runs only for version_of_d() 3, 31 or 4.
+  implicit none
+  integer, external :: version_of_d
+
+  call init_xbound_g()
+  call init_xbound_sc()
+  call init_xbound_sc2()
+  if (version_of_d() == 3) call init_xbound_d3()
+  if (version_of_d() == 31) call init_xbound_d3()
+  if (version_of_d() == 4) call init_xbound_d4()
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_xbound_g()
+  ! Registers a descriptor for every offset (x,y,z,t) with components in -1..1.
+  use module_xbound_g
+  implicit none
+
+  integer :: x, y, z, t
+  ! init_xch_bound itself skips offsets whose neighbour is this PE
+  do t = -1, 1
+     do z = -1, 1
+        do y = -1, 1
+           do x = -1, 1
+              call init_xch_bound(n_bound, b, NCOL * NCOL * SIZE_COMPLEX, x, y, z, t)
+           enddo
+        enddo
+     enddo
+  enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_xbound_sc()
+  ! Registers the eight axis-aligned offsets and records their indices in i_bound.
+  use module_xbound_sc
+  use module_lattice
+  implicit none
+
+  integer :: mu, block
+
+  ! block: number of reals per site passed to init_xch_bound
+  block = NDIRAC * NCOL * SIZE_COMPLEX
+  ! offset -1 is stored under BWD, offset +1 under FWD
+  call init_xch_bound(n_bound, b, block, -1,0,0,0); i_bound(1, BWD) = n_bound
+  call init_xch_bound(n_bound, b, block, +1,0,0,0); i_bound(1, FWD) = n_bound
+  call init_xch_bound(n_bound, b, block, 0,-1,0,0); i_bound(2, BWD) = n_bound
+  call init_xch_bound(n_bound, b, block, 0,+1,0,0); i_bound(2, FWD) = n_bound
+  call init_xch_bound(n_bound, b, block, 0,0,-1,0); i_bound(3, BWD) = n_bound
+  call init_xch_bound(n_bound, b, block, 0,0,+1,0); i_bound(3, FWD) = n_bound
+  call init_xch_bound(n_bound, b, block, 0,0,0,-1); i_bound(4, BWD) = n_bound
+  call init_xch_bound(n_bound, b, block, 0,0,0,+1); i_bound(4, FWD) = n_bound
+  ! index 0 marks "no exchange" for directions with npe(mu) == 1
+  do mu = 1, DIM
+     if (npe(mu) == 1) then
+        i_bound(mu, FWD) = 0
+        i_bound(mu, BWD) = 0
+     endif
+  enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_xbound_sc2()
+  ! Same as init_xbound_sc, but with a per-site block of 2 * NCOL * SIZE_COMPLEX reals.
+  use module_xbound_sc2
+  use module_lattice
+  implicit none
+
+  integer :: mu, block
+
+  ! block: number of reals per site passed to init_xch_bound
+  block = 2 * NCOL * SIZE_COMPLEX
+  ! offset -1 is stored under BWD, offset +1 under FWD
+  call init_xch_bound(n_bound, b, block, -1,0,0,0); i_bound(1, BWD) = n_bound
+  call init_xch_bound(n_bound, b, block, +1,0,0,0); i_bound(1, FWD) = n_bound
+  call init_xch_bound(n_bound, b, block, 0,-1,0,0); i_bound(2, BWD) = n_bound
+  call init_xch_bound(n_bound, b, block, 0,+1,0,0); i_bound(2, FWD) = n_bound
+  call init_xch_bound(n_bound, b, block, 0,0,-1,0); i_bound(3, BWD) = n_bound
+  call init_xch_bound(n_bound, b, block, 0,0,+1,0); i_bound(3, FWD) = n_bound
+  call init_xch_bound(n_bound, b, block, 0,0,0,-1); i_bound(4, BWD) = n_bound
+  call init_xch_bound(n_bound, b, block, 0,0,0,+1); i_bound(4, FWD) = n_bound
+  ! index 0 marks "no exchange" for directions with npe(mu) == 1
+  do mu = 1, DIM
+     if (npe(mu) == 1) then
+        i_bound(mu, FWD) = 0
+        i_bound(mu, BWD) = 0
+     endif
+  enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_xch_bound(n_bound, b, block_size, xx, yy, zz, tt)
+  ! Appends the descriptor for neighbour offset (xx,yy,zz,tt) to b and increments n_bound.
+  use module_xbound
+  use module_function_decl
+  use module_nnpe
+  use module_offset
+  use module_lattice
+  use module_vol
+  implicit none
+  include 'mpif.h'
+
+  integer, intent(inout) :: n_bound
+  type (type_xbound), intent(inout) :: b(max_bound)
+  integer, intent(in) :: block_size, xx, yy, zz, tt
+
+  integer, dimension (DIM) :: dir, m, i, target, source
+  integer, external :: xyzt2i, n_sites, e_o
+  integer :: x, y, z, t, size, mu, stride, block_count, ierror
+  integer :: tmp_type1, tmp_type2, the_type
+  integer(MPI_ADDRESS_KIND):: true_lb, true_extent
+  integer :: extent
+
+  logical :: special
+
+  ! nothing is registered (n_bound unchanged) when the neighbour is this PE itself
+  if (nnpe(xx, yy, zz, tt) == my_pe()) return
+  ! special: offsets in x and z only; needs the nested datatype built further down
+  if (xx /= 0 .and. yy == 0 .and. zz /= 0 .and. tt == 0) then
+     special = .true.
+  else
+     special = .false.
+  endif
+
+
+  dir = (/ xx, yy, zz, tt /)
+  ! m: loop extent per direction; source/target: corner coordinates of the slices
+  do mu = 1, DIM
+     if (dir(mu) /= 0) then
+        m(mu) = 1
+     else
+        m(mu) = NH(mu)
+     endif
+
+     if (dir(mu) == -1) then
+        target(mu) = -1
+        source(mu) = N(mu) - 1
+     elseif (dir(mu) == +1) then
+        target(mu) = N(mu)
+        source(mu) = 0
+     else
+        target(mu) = 0
+        source(mu) = 0
+     endif
+  enddo
+
+  ! size: block_size times the extents of the leading directions with dir == 0
+  size = block_size
+  do mu = 1, DIM
+     if (dir(mu) == 0) then
+        size = size * NH(mu)
+        m(mu) = 1
+     else
+        exit
+     endif
+  enddo
+  ! stride: block_size times the extents of the leading directions with m == 1
+  stride = block_size
+  do mu = 1, DIM
+     if (m(mu) == 1) then
+        stride = stride * NH(mu)
+     else
+        exit
+     endif
+  enddo
+  ! block_count: product of the remaining extents
+  block_count = 1
+  do mu = 1, DIM
+     block_count = block_count * m(mu)
+  enddo
+
+  n_bound = n_bound + 1
+  ASSERT(n_bound <= max_bound)
+
+  if (special) then ! (y,t)-plane
+
+     ! MPI type for y-line:
+
+     block_count = NY
+     size = block_size
+     stride = block_size * NXH
+
+     call mpi_type_vector(block_count, size, stride, BQCD_REAL, tmp_type1, ierror)
+     call mpi_type_commit(tmp_type1, ierror)
+
+     ! tmp_type2 is tmp_type1 with its extent set to that of one real
+#ifdef ALTIX
+     ! use MPI-1
+     call mpi_type_extent(BQCD_REAL, extent, ierror)
+     call mpi_type_struct(2, (/1, 1/), (/0, extent/), (/tmp_type1, MPI_UB/), &
+          tmp_type2, ierror)
+#else
+     ! use MPI-2
+     call mpi_type_get_true_extent(BQCD_REAL, true_lb, true_extent, ierror)
+     call mpi_type_create_resized(tmp_type1, true_lb, true_extent, tmp_type2, ierror)
+#endif
+     call mpi_type_commit(tmp_type2, ierror)
+
+     ! MPI-parameters for (y,t)-plane:
+
+     block_count = NT
+     size = 1
+     stride = block_size * NXH * NY * NZ
+     the_type = tmp_type2
+     b(n_bound)%size = block_size * NY * NT
+
+  else
+
+     the_type = BQCD_REAL
+     b(n_bound)%size = block_count * size
+
+  endif
+
+  ! fill the descriptor; pe_source uses the offset itself, pe_target its negation
+  b(n_bound)%i_source = xyzt2i(source)
+  b(n_bound)%i_target = xyzt2i(target)
+  b(n_bound)%pe_source = nnpe(xx, yy, zz, tt)
+  b(n_bound)%pe_target = nnpe(-xx, -yy, -zz, -tt)
+
+  b(n_bound)%block_count = block_count
+  b(n_bound)%block_size = size
+  b(n_bound)%block_stride= stride
+  ! send-side datatype: block_count blocks of size elements of the_type, stride apart
+  call mpi_type_vector(block_count, size, stride, the_type, &
+       b(n_bound)%vector_type, ierror)
+  call mpi_type_commit(b(n_bound)%vector_type, ierror)
+
+  !!if ( my_pe() == 0) write(6,*) xx,yy,zz,tt, block_count, size, stride
+  !!if ( my_pe() == 0) write(6,*) xx,yy,zz,tt, b(n_bound)%i_source, b(n_bound)%i_target, nnpe(xx,yy,zz,tt), my_pe()
+
+  !!ASSERT(b(n_bound)%size == block_size * n_sites(DIM, dir, NH, NPE))
+  ! the temporary types are released once vector_type has been committed
+  if (special) then
+     call mpi_type_free(tmp_type1, ierror)
+     call mpi_type_free(tmp_type2, ierror)
+  endif
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_g_field(u)
+  ! Calls xbound_g for every direction mu and both values of eo.
+  use module_function_decl
+  use module_vol
+  implicit none
+
+  GAUGE_FIELD :: u
+  integer :: mu, eo, x, y, z, t
+
+  if (num_pes() == 1) return
+
+  do mu = 1, DIM
+     do eo = EVEN, ODD
+        call xbound_g(u, eo, mu)
+     enddo
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_g(u, eo, mu)
+  ! One mpi_sendrecv per descriptor in module_xbound_g for the (eo, mu) part of u.
+  use module_xbound_g
+  use module_function_decl
+  use module_vol
+  implicit none
+  include 'mpif.h'
+
+  integer :: eo, mu, i, status(MPI_STATUS_SIZE), ierror
+  GAUGE_FIELD :: u
+
+  if (num_pes() == 1) return
+  ! send: one element of vector_type; receive: b(i)%size contiguous reals
+  do i = 1, n_bound
+     call mpi_sendrecv( &
+          u(1,1, b(i)%i_source, eo,mu), 1, b(i)%vector_type, b(i)%pe_target, 0,&
+          u(1,1, b(i)%i_target, eo,mu), b(i)%size, BQCD_REAL, b(i)%pe_source, 0,&
+          MPI_COMM_WORLD, status, ierror)
+  enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc_field(a)
+  ! Calls xbound_sc for every direction mu.
+  use module_function_decl
+  use module_vol
+  implicit none
+  include 'mpif.h'
+
+  integer :: i, status(MPI_STATUS_SIZE), ierror
+  integer :: mu, fb
+  SPINCOL_FIELD :: a
+
+  if (num_pes() == 1) return
+
+  do mu = 1, DIM
+     call xbound_sc(a, mu)
+  enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc(a, direction)
+  ! Exchanges both boundaries of a in the given direction; index 0 means no exchange.
+  use module_xbound_sc
+  use module_function_decl
+  use module_vol
+  implicit none
+  include 'mpif.h'
+
+  integer :: i, status(MPI_STATUS_SIZE), ierror
+  integer :: direction, mu, fb
+  SPINCOL_FIELD :: a
+
+  if (num_pes() == 1) return
+
+  mu = direction
+  do fb = FWD, BWD
+     i = i_bound(mu, fb)
+     if (i /= 0) then
+        call mpi_sendrecv( &
+             a(1,1, b(i)%i_source), 1, b(i)%vector_type, b(i)%pe_target, 0,&
+             a(1,1, b(i)%i_target), b(i)%size, BQCD_REAL, b(i)%pe_source, 0,&
+             MPI_COMM_WORLD, status, ierror)
+     endif
+  enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc2_field(a)
+  ! Calls xbound_sc2 for every direction mu.
+  use module_function_decl
+  use module_vol
+  implicit none
+  include 'mpif.h'
+
+  SC2_FIELD :: a
+  integer :: mu
+
+  if (num_pes() == 1) return
+
+  do mu = 1, DIM
+     call xbound_sc2(a, mu)
+  enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc2_field_i(a) ! "i"mmediate MPI calls
+  ! Posts all receives, then all sends, and waits for all requests at the end.
+  use module_xbound_sc2
+  use module_function_decl
+  use module_vol
+  implicit none
+  include 'mpif.h'
+
+  SC2_FIELD :: a
+  ! capacity: one receive and one send per direction and per fwd/bwd
+  integer, parameter :: max_request = 2 * 2 * DIM
+
+  integer :: request(max_request), status(MPI_STATUS_SIZE, max_request), ierror
+  integer :: mu, fb, i, n_request
+
+  if (num_pes() == 1) return
+
+  n_request = 0
+  ! first pass: non-blocking receives into the target sites
+  do mu = 1, DIM
+     do fb = FWD, BWD
+        i = i_bound(mu, fb)
+        if (i /= 0) then
+           n_request = n_request + 1
+           call mpi_irecv( &
+                a(1,1, b(i)%i_target, mu,fb), b(i)%size, BQCD_REAL, b(i)%pe_source, 0,&
+                MPI_COMM_WORLD, request(n_request), ierror)
+        endif
+     enddo
+  enddo
+  ! second pass: non-blocking sends from the source sites
+  do mu = 1, DIM
+     do fb = FWD, BWD
+        i = i_bound(mu, fb)
+        if (i /= 0) then
+           n_request = n_request + 1
+           call mpi_isend( &
+                a(1,1, b(i)%i_source, mu,fb), 1, b(i)%vector_type, b(i)%pe_target, 0,&
+                MPI_COMM_WORLD, request(n_request), ierror)
+        endif
+     enddo
+  enddo
+
+  call mpi_waitall(n_request, request, status, ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc2(a, direction)
+  ! Blocking exchange of the (direction, fwd/bwd) parts of a; index 0 means no exchange.
+  use module_xbound_sc2
+  use module_function_decl
+  use module_vol
+  implicit none
+  include 'mpif.h'
+
+  integer :: i, status(MPI_STATUS_SIZE), ierror
+  integer :: direction, mu, fb
+  SC2_FIELD :: a
+
+  if (num_pes() == 1) return
+
+  mu = direction
+  do fb = FWD, BWD
+     i = i_bound(mu, fb)
+     if (i /= 0) then
+        call mpi_sendrecv( &
+             a(1,1, b(i)%i_source, mu,fb), 1, b(i)%vector_type, b(i)%pe_target, 0,&
+             a(1,1, b(i)%i_target, mu,fb), b(i)%size, BQCD_REAL, b(i)%pe_source, 0,&
+             MPI_COMM_WORLD, status, ierror)
+     endif
+  enddo
+end
+
+!===============================================================================
+!
+! new stuff for libd3
+!
+!-------------------------------------------------------------------------------
+module module_xbound_d3
+  ! Descriptors, exchange flags and static pack buffers for the d3/d4 routines.
+  use module_xbound
+  implicit none
+  integer, save :: n_bound(DIM)
+  type (type_xbound), save :: b(2, DIM)
+  ! xch_*: used as the counter argument of init_xch_bound, so 0 means no exchange
+  integer, save :: xch_yf = 0
+  integer, save :: xch_yb = 0
+  integer, save :: xch_zf = 0
+  integer, save :: xch_zb = 0
+  integer, save :: xch_tf = 0
+  integer, save :: xch_tb = 0
+#ifdef D3_BUFFER_VOL
+  integer, parameter :: d3_buffer_vol = D3_BUFFER_VOL
+#else
+  integer, parameter :: d3_buffer_vol = 0
+#endif
+  ! one descriptor per direction (y, z, t) and orientation (f, b)
+  type (type_xbound), save :: byf, byb, bzf, bzb, btf, btb
+
+  ! allocate buffer for MPI in static memory to speed-up communication on SR8000
+  integer, parameter :: max_buffer = NDIRAC*NCOL*SIZE_COMPLEX*d3_buffer_vol
+
+  REAL, dimension (max_buffer), save :: buffer_yf, buffer_yb, &
+                                        buffer_zf, buffer_zb
+end
+
+!-------------------------------------------------------------------------------
+integer function get_d3_buffer_vol()
+  ! Returns the compile-time constant d3_buffer_vol.
+  use module_xbound_d3
+  implicit none
+
+  get_d3_buffer_vol = d3_buffer_vol
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_xbound_d3()
+  ! Fills b/n_bound and the six named descriptors; dies on inconsistent setups.
+  use module_lattice
+  use module_xbound_d3
+  implicit none
+
+  if (npe(1) /= 1) call die("init_xbound_d3(): npe(1) /= 1")
+
+  n_bound = 0
+  ! b(1,mu) is passed as the start of the list, so the two calls fill b(1,mu) and b(2,mu)
+  call init_xch_bound(n_bound(2), b(1,2), NDIRAC*NCOL*SIZE_COMPLEX, 0, 1,0,0)
+  call init_xch_bound(n_bound(2), b(1,2), NDIRAC*NCOL*SIZE_COMPLEX, 0,-1,0,0)
+
+  call init_xch_bound(n_bound(3), b(1,3), NDIRAC*NCOL*SIZE_COMPLEX, 0,0, 1,0)
+  call init_xch_bound(n_bound(3), b(1,3), NDIRAC*NCOL*SIZE_COMPLEX, 0,0,-1,0)
+
+  call init_xch_bound(n_bound(4), b(1,4), NDIRAC*NCOL*SIZE_COMPLEX, 0,0,0, 1)
+  call init_xch_bound(n_bound(4), b(1,4), NDIRAC*NCOL*SIZE_COMPLEX, 0,0,0,-1)
+  ! same offsets again, stored in the individually named descriptors
+  call init_xch_bound(xch_yf, byf, NDIRAC*NCOL*SIZE_COMPLEX, 0, 1,0,0)
+  call init_xch_bound(xch_yb, byb, NDIRAC*NCOL*SIZE_COMPLEX, 0,-1,0,0)
+
+  call init_xch_bound(xch_zf, bzf, NDIRAC*NCOL*SIZE_COMPLEX, 0,0, 1,0)
+  call init_xch_bound(xch_zb, bzb, NDIRAC*NCOL*SIZE_COMPLEX, 0,0,-1,0)
+
+  call init_xch_bound(xch_tf, btf, NDIRAC*NCOL*SIZE_COMPLEX, 0,0,0, 1)
+  call init_xch_bound(xch_tb, btb, NDIRAC*NCOL*SIZE_COMPLEX, 0,0,0,-1)
+
+  !!if (xch_yf /= 0) allocate(buffer_yf(byf%size))
+  !!if (xch_yb /= 0) allocate(buffer_yb(byb%size))
+  !!if (xch_zf /= 0) allocate(buffer_zf(bzf%size))
+  !!if (xch_zb /= 0) allocate(buffer_zb(bzb%size))
+  ! NOTE(review): these read %size even when the descriptor was never filled (xch_* == 0)
+  if (byf%size > max_buffer) call die("init_xbound_d3(): byf%size")
+  if (byb%size > max_buffer) call die("init_xbound_d3(): byb%size")
+  if (bzf%size > max_buffer) call die("init_xbound_d3(): bzf%size")
+  if (bzb%size > max_buffer) call die("init_xbound_d3(): bzb%size")
+  ! the buffer fill routines use the forward layout for both orientations
+  if (xch_yf /= xch_yb) call die("init_xbound_d3(): xch_y")
+  if (xch_zf /= xch_zb) call die("init_xbound_d3(): xch_z")
+  if (byf%block_size /= byb%block_size) call die("init_xbound_d3(): size")
+  if (byf%block_count /= byb%block_count) call die("init_xbound_d3(): count")
+  if (byf%block_stride /= byb%block_stride) call die("init_xbound_d3(): stride")
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_d3(a, direction)
+  ! One mpi_sendrecv per descriptor registered for the given direction.
+  use module_xbound_d3
+  use module_function_decl
+  use module_vol
+  implicit none
+  include 'mpif.h'
+
+  integer :: i, status(MPI_STATUS_SIZE), ierror, direction, d
+  SPINCOL_FIELD :: a
+
+  if (num_pes() == 1) return
+
+  d = direction
+
+  do i = 1, n_bound(d)
+     call mpi_sendrecv( &
+          a(1, 1, b(i,d)%i_source), 1, b(i,d)%vector_type, b(i,d)%pe_target,0,&
+          a(1, 1, b(i,d)%i_target), b(i,d)%size, BQCD_REAL, b(i,d)%pe_source,0,&
+          MPI_COMM_WORLD, status, ierror)
+  enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_yf(a)
+  ! Exchange described by byf (message tag 1); no-op if xch_yf == 0.
+  use module_xbound_d3
+  use module_function_decl
+  use module_vol
+  implicit none
+  include 'mpif.h'
+
+  integer :: i, status(MPI_STATUS_SIZE), ierror
+  SPINCOL_FIELD :: a
+
+  if (num_pes() == 1 .or. xch_yf == 0) return
+
+  call mpi_sendrecv( &
+       a(1, 1, byf%i_source), 1, byf%vector_type, byf%pe_target,1,&
+       a(1, 1, byf%i_target), byf%size, BQCD_REAL, byf%pe_source,1,&
+       MPI_COMM_WORLD, status, ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_yb(a)
+  ! Exchange described by byb (message tag 2); no-op if xch_yb == 0.
+  use module_xbound_d3
+  use module_function_decl
+  use module_vol
+  implicit none
+  include 'mpif.h'
+
+  integer :: i, status(MPI_STATUS_SIZE), ierror
+  SPINCOL_FIELD :: a
+
+  if (num_pes() == 1 .or. xch_yb == 0) return
+
+  call mpi_sendrecv( &
+       a(1, 1, byb%i_source), 1, byb%vector_type, byb%pe_target,2,&
+       a(1, 1, byb%i_target), byb%size, BQCD_REAL, byb%pe_source,2,&
+       MPI_COMM_WORLD, status, ierror)
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_fill_buffer_y(a)
+  ! Packs the strided y boundary blocks of a into buffer_yf and buffer_yb.
+  use module_xbound_d3
+  use module_function_decl
+  use module_vol
+  implicit none
+
+  REAL :: a(*)
+  integer :: i, j, off_af, off_ab, off_b
+  integer :: count, size, stride, start_af, start_ab
+
+  if (num_pes() == 1 .or. xch_yf == 0) return
+
+  !$omp parallel private(i, j, off_af, off_ab, off_b, start_af, start_ab, &
+  !$omp count, size, stride)
+  start_af = (byf%i_source - 1) * NDIRAC * NCOL * SIZE_COMPLEX + 1
+  start_ab = (byb%i_source - 1) * NDIRAC * NCOL * SIZE_COMPLEX + 1
+  ! the forward layout is used for both buffers
+  count = byf%block_count
+  size = byf%block_size
+  stride = byf%block_stride
+
+  !$omp do
+  do i = 0, count - 1
+     off_af = start_af + i * stride
+     off_ab = start_ab + i * stride
+     off_b = i * size + 1
+     do j = 0, size - 1
+        buffer_yf(off_b + j) = a(off_af + j)
+        buffer_yb(off_b + j) = a(off_ab + j)
+     enddo
+  enddo
+  !$omp end parallel
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_copy_buffer_y(a)
+  ! Sends buffer_yf and buffer_yb and receives into the y boundary sites of a.
+  use module_xbound_d3
+  use module_function_decl
+  use module_vol
+  implicit none
+  include 'mpif.h'
+
+  integer :: status(MPI_STATUS_SIZE), ierror
+  SPINCOL_FIELD :: a
+
+  if (num_pes() == 1 .or. xch_yf == 0) return
+
+  call mpi_sendrecv( &
+       buffer_yf(1), byf%size, BQCD_REAL, byf%pe_target, 0,&
+       a(1, 1, byf%i_target), byf%size, BQCD_REAL, byf%pe_source, 0,&
+       MPI_COMM_WORLD, status, ierror)
+
+  call mpi_sendrecv( &
+       buffer_yb(1), byb%size, BQCD_REAL, byb%pe_target, 0,&
+       a(1, 1, byb%i_target), byb%size, BQCD_REAL, byb%pe_source, 0,&
+       MPI_COMM_WORLD, status, ierror)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_fill_buffer_z(a)
+  ! Packs the strided z boundary blocks of a into buffer_zf and buffer_zb.
+  use module_xbound_d3
+  use module_function_decl
+  use module_vol
+  implicit none
+
+  REAL :: a(*)
+  integer :: i, j, off_af, off_ab, off_b
+  integer :: count, size, stride, start_af, start_ab
+
+  if (num_pes() == 1 .or. xch_zf == 0) return
+
+  !$omp parallel private(i, j, off_af, off_ab, off_b, start_af, start_ab, &
+  !$omp count, size, stride)
+  start_af = (bzf%i_source - 1) * NDIRAC * NCOL * SIZE_COMPLEX + 1
+  start_ab = (bzb%i_source - 1) * NDIRAC * NCOL * SIZE_COMPLEX + 1
+  ! the forward layout is used for both buffers
+  count = bzf%block_count
+  size = bzf%block_size
+  stride = bzf%block_stride
+
+  !$omp do
+  do i = 0, count - 1
+     off_af = start_af + i * stride
+     off_ab = start_ab + i * stride
+     off_b = i * size + 1
+     do j = 0, size - 1
+        buffer_zf(off_b + j) = a(off_af + j)
+        buffer_zb(off_b + j) = a(off_ab + j)
+     enddo
+  enddo
+  !$omp end parallel
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_copy_buffer_z(a)
+  ! Sends buffer_zf and buffer_zb and receives into the z boundary sites of a.
+  use module_xbound_d3
+  use module_function_decl
+  use module_vol
+  implicit none
+  include 'mpif.h'
+
+  integer :: status(MPI_STATUS_SIZE), ierror
+  SPINCOL_FIELD :: a
+
+  if (num_pes() == 1 .or. xch_zf == 0) return
+
+  call mpi_sendrecv( &
+       buffer_zf(1), bzf%size, BQCD_REAL, bzf%pe_target, 0,&
+       a(1, 1, bzf%i_target), bzf%size, BQCD_REAL, bzf%pe_source, 0,&
+       MPI_COMM_WORLD, status, ierror)
+
+  call mpi_sendrecv( &
+       buffer_zb(1), bzb%size, BQCD_REAL, bzb%pe_target, 0,&
+       a(1, 1, bzb%i_target), bzb%size, BQCD_REAL, bzb%pe_source, 0,&
+       MPI_COMM_WORLD, status, ierror)
+
+end
+
+!===============================================================================
+!
+! stuff for lib_d4
+!
+!-------------------------------------------------------------------------------
+! init_xbound_d4: calls die() unless npe(1), npe(2) and npe(3) are all 1,
+! then reuses the d3 boundary setup.
+subroutine init_xbound_d4()
+  use module_lattice
+  implicit none
+  if (npe(1) /= 1) call die("init_xbound_d4(): npe(1) /= 1")
+  if (npe(2) /= 1) call die("init_xbound_d4(): npe(2) /= 1")
+  if (npe(3) /= 1) call die("init_xbound_d4(): npe(3) /= 1")
+
+  call init_xbound_d3()
+end subroutine init_xbound_d4
+
+!-------------------------------------------------------------------------------
+! xbound_tf: exchange described by btf (message tag 1), skipped if xch_tf == 0.
+subroutine xbound_tf(a)
+  use module_xbound_d3
+  use module_function_decl
+  use module_vol
+  implicit none
+  include 'mpif.h'
+
+  SPINCOL_FIELD :: a
+  integer :: ierror, status(MPI_STATUS_SIZE)
+
+  if (num_pes() /= 1 .and. xch_tf /= 0) then
+     call mpi_sendrecv( &
+          a(1, 1, btf%i_source), 1, btf%vector_type, btf%pe_target,1,&
+          a(1, 1, btf%i_target), btf%size, BQCD_REAL, btf%pe_source,1,&
+          MPI_COMM_WORLD, status, ierror)
+  endif
+end subroutine xbound_tf
+
+!-------------------------------------------------------------------------------
+! xbound_tb: exchange described by btb (message tag 2), skipped if xch_tb == 0.
+subroutine xbound_tb(a)
+  use module_xbound_d3
+  use module_function_decl
+  use module_vol
+  implicit none
+  include 'mpif.h'
+
+  SPINCOL_FIELD :: a
+  integer :: ierror, status(MPI_STATUS_SIZE)
+
+  if (num_pes() /= 1 .and. xch_tb /= 0) then
+     call mpi_sendrecv( &
+          a(1, 1, btb%i_source), 1, btb%vector_type, btb%pe_target,2,&
+          a(1, 1, btb%i_target), btb%size, BQCD_REAL, btb%pe_source,2,&
+          MPI_COMM_WORLD, status, ierror)
+  endif
+end subroutine xbound_tb
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_shmem.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_shmem.F90
new file mode 100644
index 0000000000000000000000000000000000000000..3cff78a725a096e8d19e47bd531e6789c347f6bd
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_shmem.F90
@@ -0,0 +1,208 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! 
Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! xbound_shmem.F90 - boundary exchange with shmem
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+# include "shmem.h"
+
+!-------------------------------------------------------------------------------
+subroutine init_xbound()
+  ! Nothing to set up in the shmem version.
+  return
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_g(u, eo, mu)
+  ! Calls xch_bound for every offset with components in -1..1, with a barrier after each.
+  use module_function_decl
+  use module_vol
+  implicit none
+
+  integer :: eo, mu, x, y, z, t
+  GAUGE_FIELD :: u
+
+  if (num_pes() == 1) return
+
+  call barrier()
+  ! xch_bound returns at once for offsets whose neighbour is this PE
+  do t = -1, 1
+     do z = -1, 1
+        do y = -1, 1
+           do x = -1, 1
+              call xch_bound(NCOL * NCOL * SIZE_COMPLEX, u(1, 1, 1, eo, mu), x, y, z, t)
+              call barrier()
+           enddo
+        enddo
+     enddo
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_g_field(u)
+  ! Calls xbound_g for every direction mu and both values of eo.
+  use module_function_decl
+  use module_vol
+  implicit none
+
+  GAUGE_FIELD :: u
+  integer :: mu, eo, x, y, z, t
+
+  if (num_pes() == 1) return
+
+  call barrier()
+
+  do mu = 1, DIM
+     do eo = EVEN, ODD
+        call xbound_g(u, eo, mu)
+     enddo
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc_field(array)
+  ! Calls xch_bound for the eight axis-aligned offsets, with a barrier after each.
+  use module_function_decl
+  use module_vol
+  implicit none
+
+  SPINCOL_FIELD :: array
+  integer :: x, y, z, t
+
+  if (num_pes() == 1) return
+
+  call barrier()
+  ! the abs() sum selects offsets with exactly one non-zero component
+  do t = -1, 1
+     do z = -1, 1
+        do y = -1, 1
+           do x = -1, 1
+              if ((abs(x) + abs(y) + abs(z) + abs(t)) == 1) then
+                 call xch_bound(NDIRAC * NCOL * SIZE_COMPLEX, array, x, y, z, t)
+                 call barrier()
+              endif
+           enddo
+        enddo
+     enddo
+  enddo
+end
+
+!-------------------------------------------------------------------------------
+subroutine xbound_sc2_field_i(a)
+  ! Eight xch_bound calls, one per (direction, fwd/bwd) part of a, between two barriers.
+  use module_function_decl
+  use module_lattice
+  use module_vol
+  implicit none
+
+  SC2_FIELD :: a
+  integer :: x, y, z, t
+
+  if (num_pes() == 1) return
+
+  call barrier()
+  ! the FWD part is exchanged with offset +1, the BWD part with offset -1
+  call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 1, FWD), +1, 0, 0, 0)
+  call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 1, BWD), -1, 0, 0, 0)
+  call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 2, FWD), 0, +1, 0, 0)
+  call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 2, BWD), 0, -1, 0, 0)
+  call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 3, FWD), 0, 0, +1, 0)
+  call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 3, BWD), 0, 0, -1, 0)
+  call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 4, FWD), 0, 0, 0, +1)
+  call xch_bound(2 * NCOL * SIZE_COMPLEX, a(1, 1, 1, 4, BWD), 0, 0, 0, -1)
+
+  call barrier()
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine xch_bound(mm, array, xx, yy, zz, tt)
+  ! Puts the boundary slice of array for offset (xx,yy,zz,tt) onto PE nnpe(-xx,-yy,-zz,-tt).
+  use module_function_decl
+  use module_nnpe
+  use module_offset
+  use module_lattice
+  use module_vol
+  implicit none
+  include 'mpif.h'
+
+  integer :: mm
+  REAL, dimension (mm, volh_tot) :: array
+  integer, dimension (DIM) :: dir, m, i, target, source
+  integer, external :: xyzt2i
+  integer :: xx, yy, zz, tt, x, y, z, t, pe, size, mu
+  ! nothing to do when the neighbour in this direction is this PE itself
+  pe = nnpe(xx, yy, zz, tt)
+  if (pe == my_pe()) return
+
+  dir = (/ xx, yy, zz, tt /)
+  ! m: loop extent per direction (1 where the offset is non-zero)
+  do mu = 1, DIM
+     if (dir(mu) /= 0) then
+        m(mu) = 1
+     else
+        m(mu) = NH(mu)
+     endif
+  enddo
+  ! leading directions with zero offset are merged into one contiguous put of size reals
+  size = mm
+  if (dir(1) == 0) then
+     size = size * NXH
+     m(1) = 1
+     if (dir(2) == 0) then
+        size = size * N(2)
+        m(2) = 1
+        if (dir(3) == 0) then
+           size = size * N(3)
+           m(3) = 1
+           if (dir(4) == 0) then
+              size = size * N(4)
+              m(4) = 1
+           endif
+        endif
+     endif
+  endif
+  ! one put per remaining site combination
+  do t = 0, m(4) - 1
+     do z = 0, m(3) - 1
+        do y = 0, m(2) - 1
+           do x = 0, m(1) - 1
+
+              i = (/ x, y, z, t /)
+
+              do mu = 1, DIM
+                 if (dir(mu) == -1) then
+                    target(mu) = -1
+                    source(mu) = N(mu) - 1
+                 elseif (dir(mu) == +1) then
+                    target(mu) = N(mu)
+                    source(mu) = 0
+                 else
+                    target(mu) = i(mu)
+                    source(mu) = i(mu)
+                 endif
+              enddo
+
+!!! call shmem_get(array(1, xyzt2i(target)), &
+!!! array(1, xyzt2i(source)), size, pe)
+              call shmem_put(array(1, xyzt2i(target)), &
+                   array(1, xyzt2i(source)), size, nnpe(-xx,-yy,-zz,-tt))
+
+           enddo
+        enddo
+     enddo
+  enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_single_pe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_single_pe.F90
new file mode 100644
index 0000000000000000000000000000000000000000..5fab9e1eeb57377b0ef2d1ecbd9de848a5a7fc3d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_single_pe.F90
@@ -0,0 +1,72 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! xbound_single_pe.F90 - dummy routines for boundary exchange
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine init_xbound()
+  ! dummy: no boundary exchange to set up on a single PE
+end subroutine init_xbound
+
+!-------------------------------------------------------------------------------
+! xbound_g: dummy, u is left unchanged.
+subroutine xbound_g(u, eo, mu)
+  use module_vol
+  implicit none
+  GAUGE_FIELD :: u
+  integer :: eo, mu
+
+  ! nothing to do
+end subroutine xbound_g
+
+!-------------------------------------------------------------------------------
+! xbound_g_field: dummy, u is left unchanged.
+subroutine xbound_g_field(u)
+  use module_vol
+  implicit none
+  GAUGE_FIELD :: u
+
+  ! nothing to do
+end subroutine xbound_g_field
+
+!-------------------------------------------------------------------------------
+! xbound_sc_field: dummy, array is left unchanged.
+subroutine xbound_sc_field(array)
+  use module_vol
+  implicit none
+  SPINCOL_FIELD :: array
+
+  ! nothing to do
+end subroutine xbound_sc_field
+
+!-------------------------------------------------------------------------------
+! xbound_sc2_field: dummy, array is left unchanged.
+subroutine xbound_sc2_field(array)
+  use module_vol
+  implicit none
+  SC2_FIELD :: array
+
+  ! nothing to do
+end subroutine xbound_sc2_field
+
+!-------------------------------------------------------------------------------
+! xbound_sc2_field_i: dummy, array is left unchanged.
+subroutine xbound_sc2_field_i(array)
+  use module_vol
+  implicit none
+  SC2_FIELD :: array
+
+  ! nothing to do
+end subroutine xbound_sc2_field_i
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_test.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_test.F90
new file mode 100644
index 0000000000000000000000000000000000000000..f76c76cbc1fb12d5daa003f5e98b97ac16425a47
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/comm/xbound_test.F90
@@ -0,0 +1,177 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! xbound_test.F90 - test of xbound_g()
+! 
all possible dimensions must be decomposed
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! xbound_test: self-check of xbound_g() for direction 1.
+!
+! Every local site gets its global coordinates written into the matrix
+! elements (1,1), (2,2), (3,3) and (1,2) of u(..., 1); all other entries hold
+! the marker value (12345, 67890).  After xbound_g() has been called for both
+! parities, the routine dumps the field to UDIAG and checks, for every site of
+! the local lattice extended by one layer in each direction (skipping sites
+! that are boundary in more than two directions), that the stored values
+! equal the global coordinates returned by local2global().
+!-------------------------------------------------------------------------------
+subroutine xbound_test()
+
+  use module_function_decl
+  use module_lattice
+  use module_vol
+  implicit none
+  GAUGE_FIELD :: u
+  integer :: x, y, z, t, j(4), i, eo, global(4)
+  integer :: is_bound_x, is_bound_y, is_bound_z, is_bound_t
+  integer :: dx, dy, dz, dt
+  integer, external :: xyzt2i, e_o
+  character(16) :: status
+
+!!  call conf_zero(u)
+!!
+!!  do i = 1, volh
+!!     u(1, 1, i, EVEN, 1) = 123
+!!     u(1, 1, i, ODD,  1) = 789
+!!  enddo
+!!
+!!  call xbound_g(u, EVEN, 1)
+!!  call xbound_g(u, ODD,  1)
+!!
+!!  do i = 1, volh_tot
+!!     ASSERT(u(1, 1, i, EVEN, 1) == 123)
+!!     ASSERT(u(1, 1, i, ODD,  1) == 789)
+!!  enddo
+!!
+!!!-----------------------------------------------
+  call conf_zero(u)
+
+  ! marker value: anything still carrying it afterwards was never written
+  u = cmplx(12345.0, 67890.0)
+
+  call open_diag()
+
+  ! tag every local site with its global coordinates
+  do t = 0, NT - 1
+  do z = 0, NZ - 1
+  do y = 0, NY - 1
+  do x = 0, NX - 1
+     j = (/x, y, z, t/)
+
+     i = xyzt2i(j)
+     eo = e_o(j)
+
+     call local2global(my_pe(), j, global)
+
+     u(1, 1, i, eo, 1) = global(1)
+     u(2, 2, i, eo, 1) = global(2)
+     u(3, 3, i, eo, 1) = global(3)
+     u(1, 2, i, eo, 1) = global(4)
+
+     !!write(UDIAG, "(4i6, 2i8)") j, i, eo
+
+     !!write(UDIAG, "(10i6)") j, global, i, eo
+  enddo
+  enddo
+  enddo
+  enddo
+
+  call xbound_g(u, EVEN, 1)
+  call xbound_g(u, ODD, 1)
+
+  write(UDIAG,*)
+  write(UDIAG,*)
+
+  !!ASSERT(e_o((/0,0,0,0/)) == 0)
+
+
+  ! raw dump of the tagged components, boundary sites included
+  do i = 1, volh_tot
+     do eo = EVEN, ODD
+        write(UDIAG, "(a,2i6,4f8.1)") "alles: ", i, eo, &
+             real(u(1, 1, i, eo, 1)), &
+             real(u(2, 2, i, eo, 1)), &
+             real(u(3, 3, i, eo, 1)), &
+             real(u(1, 2, i, eo, 1))
+     enddo
+  enddo
+
+  write(UDIAG,*)
+  write(UDIAG,*)
+
+  ! check the local lattice extended by one layer in each direction
+  do t = -1, NT
+     is_bound_t = 0
+     if (t == -1) is_bound_t = 1
+     if (t == NT) is_bound_t = 1
+  do z = -1, NZ
+     is_bound_z = 0
+     if (z == -1) is_bound_z = 1
+     if (z == NZ) is_bound_z = 1
+  do y = -1, NY
+     is_bound_y = 0
+     if (y == -1) is_bound_y = 1
+     if (y == NY) is_bound_y = 1
+  do x = -1, NX
+     is_bound_x = 0
+     if (x == -1) is_bound_x = 1
+     if (x == NX) is_bound_x = 1
+
+     ! sites that are boundary in more than two directions are not tested
+     if (is_bound_x + is_bound_y + is_bound_z + is_bound_t <= 2) then
+
+
+!!  do x = NX, NX
+
+        j = (/x, y, z, t/)
+
+        i = xyzt2i(j)
+        eo = e_o(j)
+
+        call local2global(my_pe(), j, global)
+
+        ! offsets used only by the disabled "okay2" branch below
+        dx = -is_bound_x; if (x == NX) dx = 1
+        dy = -is_bound_y; if (y == NY) dy = 1
+        dz = -is_bound_z; if (z == NZ) dz = 1
+        dt = -is_bound_t; if (t == NT) dt = 1
+
+        if (u(1, 1, i, eo, 1) == global(1) .and. &
+            u(2, 2, i, eo, 1) == global(2) .and. &
+            u(3, 3, i, eo, 1) == global(3) .and. &
+            u(1, 2, i, eo, 1) == global(4)) then
+           status = " okay"
+        !!elseif (u(1, 1, i, eo, 1) == global(1) + dx .and. &
+        !!        u(2, 2, i, eo, 1) == global(2) + dy .and. &
+        !!        u(3, 3, i, eo, 1) == global(3) + dz .and. &
+        !!        u(1, 2, i, eo, 1) == global(4) + dt) then
+        !!   status = " okay2"
+        else
+           status = ""
+
+           ! report the coordinate mismatch in the diagnostic line
+           dx = int(u(1, 1, i, eo, 1)) - global(1)
+           dy = int(u(2, 2, i, eo, 1)) - global(2)
+           dz = int(u(3, 3, i, eo, 1)) - global(3)
+           dt = int(u(1, 2, i, eo, 1)) - global(4)
+
+           write(status,"(a,4i3,a)") " (", dx, dy, dz, dt, ")"
+        endif
+
+
+        !!if (eo == 0) write(UDIAG, "(4i6, 2x, 4i6, i8, 2i3)") j, i, eo
+        !write(UDIAG, "(10i6)") j, global, i, eo
+        !!ASSERT(eo == mod(4+x+y+z+t, 2))
+
+        write(UDIAG, "(10i6,4f8.1,a)") j, global, i, eo, &
+             real(u(1, 1, i, eo, 1)), &
+             real(u(2, 2, i, eo, 1)), &
+             real(u(3, 3, i, eo, 1)), &
+             real(u(1, 2, i, eo, 1)), status
+
+        ASSERT(u(1, 1, i, eo, 1) == global(1))
+        ASSERT(u(2, 2, i, eo, 1) == global(2))
+        ASSERT(u(3, 3, i, eo, 1) == global(3))
+        ASSERT(u(1, 2, i, eo, 1) == global(4))
+
+     endif
+  enddo
+  enddo
+  enddo
+  enddo
+
+end
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/conf.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/conf.F90
new file mode 100644
index 0000000000000000000000000000000000000000..94f937766b843e369c832555630c8e04fe1cb717
--- /dev/null
+++ 
b/qcd/part_cpu/applications/QCD/src/kernel_A/conf.F90
@@ -0,0 +1,461 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! conf.F90 - operations on gauge field and pseudo fermion field configurations
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! init_confs: allocates and initializes the configurations of all
+! para%n_temper ensembles.
+!
+!   - allocates u and phi (and phi2 if switches%hasenbusch is set)
+!   - START_HOT / START_COLD: fills u via init_u() and sets former = i
+!   - START_CONT / START_FILE: reads the fields via conf_read()
+!   - for ensembles with csw_kappa /= ZERO allocates the clover fields
+!     a, i, b and calls clover_init()
+!-------------------------------------------------------------------------------
+subroutine init_confs(para, conf)
+
+  use typedef_para
+  use module_p_interface
+  use module_switches
+  implicit none
+
+  type(type_para)                       :: para
+  type(hmc_conf), dimension(MAX_TEMPER) :: conf
+  integer                               :: i
+
+  do i = 1, para%n_temper
+     call allocate_g_field(conf(i)%u)
+     call allocate_sc_field(conf(i)%phi)
+     if (switches%hasenbusch) call allocate_sc_field(conf(i)%phi2)
+     if (para%start == START_HOT .or. para%start == START_COLD) then
+        call init_u(conf(i)%u, para%start)
+        conf(i)%former = i
+     endif
+  enddo
+
+  if (para%start == START_CONT) call conf_read(.true., para, conf)
+  if (para%start == START_FILE) call conf_read(.false., para, conf)
+
+  do i = 1, para%n_temper
+     if (para%hmc(i)%csw_kappa /= ZERO) then
+        call allocate_clover_field_a(conf(i)%a)
+        call allocate_clover_field_a(conf(i)%i)
+        call allocate_clover_field_b(conf(i)%b)
+        call clover_init(conf(i)%a, conf(i)%i, conf(i)%b, &
+                         conf(i)%u, para%hmc(i)%csw_kappa)
+     endif
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+! Dispatches on "start" (START_HOT or START_COLD, anything else aborts via
+! die()) and then exchanges the boundaries of u.
+!-------------------------------------------------------------------------------
+subroutine init_u(u, start)  ! initialization of u-field (at trajectory 0)
+
+  use module_vol
+  implicit none
+
+  GAUGE_FIELD :: u
+  integer     :: start
+
+  select case (start)
+     case (START_HOT)
+        call conf_hot(u)
+     case (START_COLD)
+        call conf_cold(u)
+     case default
+        call die("init_u(): don't know how to start")
+  end select
+
+  call xbound_g_field(u)
+end
+
+!-------------------------------------------------------------------------------
+! For every link of the local volume: calls su3_check_det() and compares the
+! matrix v returned by uud(v, u, u) with the unit matrix (presumably
+! v = u * u^dagger -- confirm against uud()).  dev is the largest deviation
+! |Re| + |Im| over ALL matrix elements of all links; the run is aborted if it
+! exceeds 1e-13.
+!
+! Note: the maximum is taken inside the element loops.  Taking it after the
+! loops would test only element (NCOL, NCOL), because d is overwritten on
+! every iteration.
+!-------------------------------------------------------------------------------
+subroutine conf_check(u)  ! checks if u-field is SU(3)
+
+  use module_vol
+  implicit none
+
+  GAUGE_FIELD, intent(in) :: u
+  SU3                     :: v
+  SU3, parameter          :: su3_one = reshape( &
+                                       (/ ONE,ZERO,ZERO, &
+                                          ZERO,ONE,ZERO, &
+                                          ZERO,ZERO,ONE /), &
+                                       (/ NCOL, NCOL /))
+  REAL                    :: dev, d
+  integer                 :: i, j, k, eo, mu
+
+
+  dev = ZERO
+  do mu = 1, DIM
+     do eo = EVEN, ODD
+        do k = 1, VOLH
+           call su3_check_det(u(1, 1, k, eo, mu))
+           call uud(v, u(1, 1, k, eo, mu), u(1, 1, k, eo, mu))
+           do i = 1, NCOL
+              do j = 1, NCOL
+                 d = abs(Re(v(i, j)) - Re(su3_one(i, j))) &
+                   + abs(Im(v(i, j)) - Im(su3_one(i, j)))
+                 dev = max(dev, d)
+              enddo
+           enddo
+        enddo
+     enddo
+  enddo
+
+  if (dev > 1e-13) call die('conf_check(): dev > 1e-13')
+!!write(0,'(x,a,e10.2)') 'conf_check(): max deviation is ', dev
+
+end
+
+!-------------------------------------------------------------------------------
+! Calls u_normalize() on every link of the local volume (1..volh).
+!-------------------------------------------------------------------------------
+subroutine conf_normalize(u)  ! normalizes u-field to SU(3)
+
+  use module_vol
+  implicit none
+
+  GAUGE_FIELD, intent(inout) :: u
+  integer                    :: i, eo, mu
+
+  do mu = 1, DIM
+     do eo = EVEN, ODD
+        do i = 1, volh
+           call u_normalize(u(1, 1, i, eo, mu))
+        enddo
+     enddo
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+! Sets all nine matrix elements of every local link to ZERO; the site loop
+! runs inside an OpenMP parallel do.
+!-------------------------------------------------------------------------------
+subroutine conf_zero(u)  ! init ("OpenMP first touch")
+
+  use module_vol
+  implicit none
+
+  GAUGE_FIELD :: u
+  integer     :: i, eo, mu
+
+  do mu = 1, DIM
+     do eo = EVEN, ODD
+        !$omp parallel do
+        do i = 1, volh
+           u(1, 1, i, eo, mu) = ZERO
+           u(2, 1, i, eo, mu) = ZERO
+           u(3, 1, i, eo, mu) = ZERO
+           u(1, 2, i, eo, mu) = ZERO
+           u(2, 2, i, eo, mu) = ZERO
+           u(3, 2, i, eo, mu) = ZERO
+           u(1, 3, i, eo, mu) = ZERO
+           u(2, 3, i, eo, mu) = ZERO
+           u(3, 3, i, eo, mu) = ZERO
+        enddo
+     enddo
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+! Sets every local link to the 3x3 unit matrix.
+!-------------------------------------------------------------------------------
+subroutine conf_cold(u)  ! cold start
+
+  use module_vol
+  implicit none
+
+  GAUGE_FIELD :: u
+  integer     :: i, eo, mu
+
+  do mu = 1, DIM
+     do eo = EVEN, ODD
+        do i = 1, volh
+           u(1, 1, i, eo, mu) = ONE
+           u(2, 1, i, eo, mu) = ZERO
+           u(3, 1, i, eo, mu) = ZERO
+           u(1, 2, i, eo, mu) = ZERO
+           u(2, 2, i, eo, mu) = ONE
+           u(3, 2, i, eo, mu) = ZERO
+           u(1, 3, i, eo, mu) = ZERO
+           u(2, 3, i, eo, mu) = ZERO
+           u(3, 3, i, eo, mu) = ONE
+        enddo
+     enddo
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+! Fills each (eo, mu) slice with NCOL * NCOL values per site from
+! ran_gauss_volh() and then applies u_normalize() to every link.
+!-------------------------------------------------------------------------------
+subroutine conf_hot(u)  ! hot start
+
+  use module_vol
+  implicit none
+
+  GAUGE_FIELD :: u
+  integer     :: i, eo, mu
+
+  do mu = 1, DIM
+     do eo = EVEN, ODD
+        call ran_gauss_volh(NCOL * NCOL, u(1, 1, 1, eo, mu), ONE, eo)
+        do i = 1, volh
+           call u_normalize(u(1, 1, i, eo, mu))
+        enddo
+     enddo
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+! action == "read": copies u_io -> u and calls u_complete() on each link;
+! any other value:  copies u -> u_io.
+! Only the columns c2 = 1 .. NCOL-1 are transferred.
+!-------------------------------------------------------------------------------
+subroutine conf_seq(action, u, u_io)  ! arranges u-field for i/o
+
+  use module_lattice_io
+  use module_decomp
+  use module_vol
+  implicit none
+
+  character(*)   :: action
+  GAUGE_FIELD    :: u
+  GAUGE_FIELD_IO :: u_io
+
+  integer, dimension(DIM) :: j
+  integer                 :: x, y, z, t, i, eo, mu, c1, c2
+  integer, external       :: std_xyzt2i, e_o
+
+  do t = 0, NT - 1
+  do z = 0, NZ - 1
+  do y = 0, NY - 1
+  do x = 0, NX - 1
+
+     j = (/x, y, z, t/)
+
+     i = std_xyzt2i(j)
+     eo = e_o(j)
+
+     do mu = 1, DIM
+        do c2 = 1, NCOL - 1
+           do c1 = 1, NCOL
+              if (action == "read") then
+                 u(c1, c2, i, eo, mu) = u_io(c1, c2, mu, x, y, z, t)
+              else
+                 u_io(c1, c2, mu, x, y, z, t) = u(c1, c2, i, eo, mu)
+              endif
+           enddo
+        enddo
+        if (action == "read") call u_complete(u(1, 1, i, eo ,mu))
+     enddo
+  enddo
+  enddo
+  enddo
+  enddo
+end
+
+!-------------------------------------------------------------------------------
+! Same copy direction convention as conf_seq(); only sites with
+! e_o(j) == EVEN are transferred.
+!-------------------------------------------------------------------------------
+subroutine phi_seq(action, phi, phi_io)  ! arranges phi-field for i/o
+
+  use module_lattice_io
+  use module_decomp
+  use module_vol
+  implicit none
+
+  character(*)     :: action
+  SPINCOL_FIELD    :: phi
+  SPINCOL_FIELD_IO :: phi_io
+
+  integer, dimension(DIM) :: j
+  integer                 :: x, y, z, t, d, c, i
+  integer, external       :: std_xyzt2i, e_o
+
+  do t = 0, NT - 1
+  do z = 0, NZ - 1
+  do y = 0, NY - 1
+  do x = 0, NX - 1
+
+     j = (/x, y, z, t/)
+
+     if (e_o(j) == EVEN) then
+        i = std_xyzt2i(j)
+        do c = 1, NCOL
+           do d = 1, NDIRAC
+              if (action == "read") then
+                 phi(d, c, i) = phi_io(d, c, x, y, z, t)
+              else
+                 phi_io(d, c, x, y, z, t) = phi(d, c, i)
+              endif
+           enddo
+        enddo
+     endif
+
+  enddo
+  enddo
+  enddo
+  enddo
+end
+
+!-------------------------------------------------------------------------------
+! conf_read: reads the configurations of all para%n_temper ensembles.
+!
+!   restart = .true.  : info file name from info_file(i); former is taken
+!                       from the header (info%ensemble(2))
+!   restart = .false. : info file name from para%info_file(i); former = i
+!
+! For each ensemble the info header is read and checked, then the gauge
+! field is read via field_io(), rearranged with conf_seq() and its
+! boundaries are exchanged.  The phi field is read only if both
+! switches%tempering and switches%dynamical are set.
+!-------------------------------------------------------------------------------
+subroutine conf_read(restart, para, conf)
+
+  use typedef_cksum
+  use typedef_para
+  use module_conf_info
+  use module_lattice_io
+  use module_p_interface
+  use module_switches
+  use module_vol
+  implicit none
+
+  character(len = *), parameter :: READ = "read"
+
+  logical                               :: restart
+  type(type_para)                       :: para
+  type(hmc_conf), dimension(MAX_TEMPER) :: conf
+  type(type_conf_info)                  :: info
+  type(type_cksum), dimension(0:para%L(4)-1) :: cksum
+  P_GAUGE_FIELD_IO, save                :: u_io
+  P_SPINCOL_FIELD_IO, save              :: phi_io
+  FILENAME, external                    :: u_file, phi_file, info_file
+  FILENAME                              :: file
+  integer                               :: i, t
+  integer                               :: u_m, u_mx, phi_m, phi_mx
+  integer                               :: n_u_io, n_phi
+
+  ALLOCATE_G_FIELD_IO(u_io)
+  ALLOCATE_SC_FIELD_IO(phi_io)
+
+  u_m    = NCOL * (NCOL - 1) * DIM
+  u_mx   = NX
+  phi_m  = NDIRAC * NCOL
+  phi_mx = NXH
+  n_u_io = u_m * vol * SIZE_COMPLEX
+  n_phi  = size_sc_field
+
+  do i = 1, para%n_temper
+
+     if (restart) then
+        file = info_file(i)
+     else
+        file = para%info_file(i)
+     endif
+
+     open(UINFO, file = file, action = READ, status = "old")
+     call read_conf_info_header(UINFO, info)
+     call check_conf_info_header(restart, info, para)
+
+     if (restart) then
+        conf(i)%former = info%ensemble(2)
+     else
+        conf(i)%former = i
+     endif
+
+     ! read U
+
+     call read_cksum(restart, UINFO, cksum, para%L(4), i, u_file)
+
+     TIMING_START(timing_bin_u_read)
+     call field_io(READ, u_m, u_mx, u_io, cksum)
+     TIMING_STOP(timing_bin_u_read)
+
+     call conf_seq(READ, conf(i)%u, u_io)
+     call xbound_g_field(conf(i)%u)
+
+     if (switches%tempering .and. switches%dynamical) then
+        ! read PHI
+        call read_cksum(restart, UINFO, cksum, para%L(4), i, phi_file)
+        call field_io(READ, phi_m, phi_mx, phi_io, cksum)
+        call phi_seq(READ, conf(i)%phi, phi_io)
+        call xbound_sc_field(conf(i)%phi)
+     endif
+
+     close(UINFO)
+  enddo
+end
+
+!-------------------------------------------------------------------------------
+! conf_write: writes the configurations of all para%n_temper ensembles.
+!
+!   restart = .true.  : file names from info_file(i) / u_file(i, t);
+!                       phi is written too if tempering and dynamical are set
+!   restart = .false. : file names from conf_info_file(i, j) /
+!                       conf_file(i, j, t) with j = conf(i)%former
+!
+! The info file is opened and closed on PE 0 only; the header contains the
+! plaquette value sg(u) / (SIX * volume).
+!-------------------------------------------------------------------------------
+subroutine conf_write(restart, para, conf)
+
+  use typedef_cksum
+  use typedef_para
+  use module_function_decl
+  use module_lattice_io
+  use module_p_interface
+  use module_switches
+  use module_vol
+  implicit none
+
+  character(len = *), parameter :: WRITE = "write"
+
+  logical                               :: restart
+  type(type_para)                       :: para
+  type(hmc_conf), dimension(MAX_TEMPER) :: conf
+
+  type(type_cksum), dimension(0:para%L(4)-1) :: cksum
+  P_GAUGE_FIELD_IO, save                :: u_io
+  P_SPINCOL_FIELD_IO, save              :: phi_io
+  FILENAME, external                    :: u_file, phi_file, info_file
+  FILENAME, external                    :: conf_file, conf_info_file
+  FILENAME                              :: f_info
+  integer                               :: i, j, t
+  integer                               :: u_m, u_mx, phi_m, phi_mx
+  integer                               :: n_u_io, n_phi
+  REAL                                  :: plaq
+  REAL, external                        :: sg
+
+
+  ALLOCATE_G_FIELD_IO(u_io)
+  ALLOCATE_SC_FIELD_IO(phi_io)
+
+  u_m    = NCOL * (NCOL - 1) * DIM
+  u_mx   = NX
+  phi_m  = NDIRAC * NCOL
+  phi_mx = NXH
+  n_u_io = u_m * vol * SIZE_COMPLEX
+  n_phi  = size_sc_field
+
+  call check_former(para%n_temper, conf)
+
+  do i = 1, para%n_temper
+
+     j = conf(i)%former
+
+     if (restart) then
+        f_info = info_file(i)
+     else
+        f_info = conf_info_file(i, j)
+     endif
+
+     if (my_pe() == 0) open(UINFO, file = f_info, action = WRITE)
+     plaq = sg(conf(i)%u) / (SIX * volume)
+     call write_conf_info_header(para, i, j, plaq)
+
+     ! write U
+
+     do t = 0, para%L(4) - 1
+        if (restart) then
+           cksum(t)%file = u_file(i, t)
+        else
+           cksum(t)%file = conf_file(i, j, t)
+        endif
+     enddo
+
+     call conf_seq(WRITE, conf(i)%u, u_io)
+
+     TIMING_START(timing_bin_u_write)
+     call field_io(WRITE, u_m, u_mx, u_io, cksum)
+     TIMING_STOP(timing_bin_u_write)
+
+     call write_cksum(UINFO, cksum, para%L(4))
+
+     if (switches%tempering .and. switches%dynamical .and. restart) then
+        ! write PHI
+        do t = 0, para%L(4) - 1
+           cksum(t)%file = phi_file(i, t)
+        enddo
+        call phi_seq(WRITE, conf(i)%phi, phi_io)
+        call field_io(WRITE, phi_m, phi_mx, phi_io, cksum)
+        call write_cksum(UINFO, cksum, para%L(4))
+     endif
+
+     if (my_pe() == 0) close(UINFO)
+  enddo
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/conf_info.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/conf_info.F90
new file mode 100644
index 0000000000000000000000000000000000000000..a2b227a4b3b5b5753920c2ec33064f99a3640cb7
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/conf_info.F90
@@ -0,0 +1,200 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! conf_info.F90 - read/write/check file containing configuration parameters
+! 
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! write_conf_info_header: writes the "ConfInfoHeader" section to unit UINFO.
+!
+! Only PE 0 writes.  The section holds format version, program name/version,
+! run, trajectory counter, host, date/time, lattice size, fermion boundary
+! conditions, RKIND and the plaquette, followed by the ensemble index and the
+! beta, kappa, csw, csw_kappa and h strings (para%c_hmc) of the two ensembles
+! i_ensemble1 and i_ensemble2.
+!-------------------------------------------------------------------------------
+subroutine write_conf_info_header(para, i_ensemble1, i_ensemble2, plaq)
+
+  use module_conf_info
+  use typedef_para
+  use module_bqcd
+  use module_counter
+  use module_decomp
+  use module_function_decl
+  implicit none
+
+  type(type_para) :: para
+  REAL            :: plaq
+  integer         :: i_ensemble1, i_ensemble2
+  integer         :: i, e(2)
+
+  e(1) = i_ensemble1
+  e(2) = i_ensemble2
+
+  if (my_pe() == 0) then
+     call begin(UINFO, "ConfInfoHeader")
+     write(UINFO,   *) k_format, conf_info_version
+     write(UINFO, 400) k_prog, prog_name, prog_version
+     write(UINFO,   *) k_run, para%run
+     write(UINFO,   *) k_traj, counter%traj
+     write(UINFO, 405) k_host, rechner()
+     write(UINFO, 400) k_date, datum(), uhrzeit()
+     write(UINFO, 410) k_L, decomp%std%L
+     write(UINFO, 410) k_bc, decomp%std%bc_fermions
+     write(UINFO,   *) k_rkind, RKIND
+     write(UINFO, 420) k_plaq, plaq
+
+     do i = 1, 2
+        write(UINFO,   *) trim(k_ensemble(i)), e(i)
+        write(UINFO, 405) trim(k_beta(i)), trim(para%c_hmc(e(i))%beta)
+        write(UINFO, 405) trim(k_kappa(i)), trim(para%c_hmc(e(i))%kappa)
+        write(UINFO, 405) trim(k_csw(i)), trim(para%c_hmc(e(i))%csw)
+        write(UINFO, 405) trim(k_csw_kappa(i)),trim(para%c_hmc(e(i))%csw_kappa)
+        write(UINFO, 405) trim(k_h(i)), trim(para%c_hmc(e(i))%h)
+     enddo
+     call end_A(UINFO, "ConfInfoHeader")
+  endif
+
+400 format (3(1x,a))
+405 format (2(1x,a))
+410 format (1x,a,4i3)
+420 format (1x,a,1x,e25.14)
+end
+
+!-------------------------------------------------------------------------------
+! read_conf_info_header: reads the header values from "unit" into "info".
+!
+! Aborts via die() if the stored format version differs from
+! conf_info_version.  Reads L, bc_fermions, rkind and, for both ensembles,
+! the ensemble index, beta, kappa, csw, csw_kappa and h.
+!-------------------------------------------------------------------------------
+subroutine read_conf_info_header(unit, info)
+
+  use module_bqcd
+  use module_conf_info
+  implicit none
+
+  type(type_conf_info) :: info
+  integer              :: unit, v, i
+
+  call read_keyword_int(unit, k_format, v, 1)
+
+  if (v /= conf_info_version) then
+     call die("read_conf_info_header(): wrong file format")
+  endif
+
+  call read_keyword_int(unit, k_L, info%L, DIM)
+  call read_keyword_int(unit, k_bc, info%bc_fermions, DIM)
+  call read_keyword_int(unit, k_rkind, info%rkind, 1)
+
+  do i = 1, 2
+     call read_keyword_int (unit, k_ensemble(i), info%ensemble(i), 1)
+     call read_keyword_REAL(unit, k_beta(i), info%beta(i), 1)
+     call read_keyword_REAL(unit, k_kappa(i), info%kappa(i), 1)
+     call read_keyword_REAL(unit, k_csw(i), info%csw(i), 1)
+     call read_keyword_REAL(unit, k_csw_kappa(i),info%csw_kappa(i),1)
+     call read_keyword_REAL(unit, k_h(i), info%h(i), 1)
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+! check_conf_info_header: compares a header that was read from file with the
+! parameters of the current run and aborts via die() on any mismatch.
+!
+! Always checked: RKIND and the lattice size L.
+! Checked only if restart is set: bc_fermions, the range of both ensemble
+! indices, and beta, kappa, csw, h (exact comparison) as well as csw_kappa
+! (tolerance 1e-13) of the referenced ensembles.
+!-------------------------------------------------------------------------------
+subroutine check_conf_info_header(restart, info, para)
+
+  use module_conf_info
+  use module_decomp
+  use typedef_para
+  implicit none
+
+  logical              :: restart
+  type(type_conf_info) :: info
+  type(type_para)      :: para
+  integer              :: mu, i
+
+  if (info%rkind /= RKIND) call die("check_conf_info_header(): RKIND wrong")
+
+  do mu = 1, DIM
+     if (info%L(mu) /= decomp%std%L(mu)) then
+        call die("check_conf_info_header(): L inconsistent")
+     endif
+  enddo
+
+  if (restart) then
+     do mu = 1, DIM
+        if (info%bc_fermions(mu) /= decomp%std%bc_fermions(mu)) then
+           call die("check_conf_info_header(): bc_fermions inconsistent")
+        endif
+     enddo
+
+     do i = 1, 2
+        ! the range check must come first: the index is used below
+        if (info%ensemble(i) < 1 .or. info%ensemble(i) > para%n_temper) then
+           call die("check_conf_info_header(): i_ensemble out of range")
+        endif
+
+        if (info%beta(i) /= para%hmc(info%ensemble(i))%beta) then
+           call die("check_conf_info_header(): beta inconsistent")
+        endif
+
+        if (info%kappa(i) /= para%hmc(info%ensemble(i))%kappa) then
+           call die("check_conf_info_header(): kappa inconsistent")
+        endif
+
+        if (info%csw(i) /= para%hmc(info%ensemble(i))%csw) then
+           call die("check_conf_info_header(): csw inconsistent")
+        endif
+
+        if (abs(info%csw_kappa(i) - &
+                para%hmc(info%ensemble(i))%csw_kappa) > 1e-13 ) then
+           call die("check_conf_info_header(): csw_kappa inconsistent")
+        endif
+
+        if (info%h(i) /= para%hmc(info%ensemble(i))%h) then
+           call die("check_conf_info_header(): h inconsistent")
+        endif
+     enddo
+  endif
+
+end
+
+!-------------------------------------------------------------------------------
+! read_cksum: positions "unit" at the ">BeginCheckSum" keyword, skips one
+! line and reads file name and check sum for the time slices 0 .. LT-1.
+! If restart is set, every file name must equal file_name(i_temper, t),
+! otherwise the run is aborted via die().
+!-------------------------------------------------------------------------------
+subroutine read_cksum(restart, unit, cksum, LT, i_temper, file_name)
+
+  use typedef_cksum
+  implicit none
+  logical                             :: restart
+  integer                             :: unit, LT, i_temper, t
+  FILENAME, external                  :: file_name
+  type(type_cksum), dimension(0:LT-1) :: cksum
+
+  call pos_keyword(unit, ">BeginCheckSum")
+  read(unit,*)
+
+  do t = 0, LT - 1
+     read(unit,*) cksum(t)%file, cksum(t)%sum
+
+     if (restart) then
+        if (cksum(t)%file /= file_name(i_temper, t)) then
+           call die("read_cksum(): file names inconsistent")
+        endif
+     endif
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+! write_cksum: on PE 0 writes the "CheckSum" section with file name, check
+! sum and byte count of each of the LT entries.
+!
+! cksum is indexed 1 .. LT here; an actual argument declared as 0:LT-1 is
+! remapped, so entry i in this routine is element i-1 of such an array.
+!-------------------------------------------------------------------------------
+subroutine write_cksum(unit, cksum, LT)
+
+  use typedef_cksum
+  use module_function_decl
+  implicit none
+
+  integer                         :: unit, LT, i
+  type(type_cksum), dimension(LT) :: cksum
+
+  if (my_pe() == 0) then
+     call begin(unit, "CheckSum")
+     do i = 1, LT
+        write(unit, *) trim(cksum(i)%file), cksum(i)%sum, cksum(i)%bytes
+     enddo
+     call end_A(unit, "CheckSum")
+  endif
+
+end
+
+!===============================================================================
diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_A/cooling.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/cooling.F90
new file mode 100644
index 0000000000000000000000000000000000000000..fba73b1a8947cfb6f10f85cd709722a15f0e5a40
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/cooling.F90
@@ -0,0 +1,287 @@
+!-------------------------------------------------------------------------------
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! cooling.F90 - measurement of the topological charge using standard cooling
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! module_cooling: state shared by init_cooling() and cooling().
+!
+!   n_cool    - largest cooling step in the list; -1 if no list was given
+!   measure_q - measure_q(i) is .true. if step i appears in the list
+!-------------------------------------------------------------------------------
+module module_cooling
+
+  integer, save                        :: n_cool
+  logical, dimension(:), pointer, save :: measure_q
+
+end
+
+!-------------------------------------------------------------------------------
+! init_cooling: reads the list of cooling steps at which Q is measured.
+!
+!   list - name of a file with one non-negative integer per record;
+!          a blank name disables cooling (n_cool = -1)
+!
+! The file is read twice: first to find the largest entry (n_cool), then to
+! mark the listed steps in measure_q(0:n_cool).  Each loop stops at the
+! first record that cannot be read (end of file or error), and the value of
+! i is used only after a successful read.  Negative entries abort via die().
+!-------------------------------------------------------------------------------
+subroutine init_cooling(list)
+
+  use module_cooling
+  implicit none
+  character(*), intent(in) :: list
+  integer                  :: i, iostat
+
+  if (list /= " ") then
+     open(ULIST, file = list, action = "read", status = "old")
+
+     n_cool = 0
+     do
+        read(ULIST, *, iostat = iostat) i
+        if (iostat /= 0) exit   ! i is not defined after a failed read
+        if (i < 0) then
+           call die("init_cooling(): list has negative entries")
+        else
+           n_cool = max(n_cool, i)
+        endif
+     enddo
+
+     allocate(measure_q(0:n_cool))
+
+     do i = 0, n_cool
+        measure_q(i) = .false.
+     enddo
+
+     rewind(ULIST)
+
+     do
+        read(ULIST, *, iostat = iostat) i
+        if (iostat /= 0) exit
+        measure_q(i) = .true.
+     enddo
+
+     close(ULIST)
+
+  else
+     n_cool = -1
+  endif
+
+end
+
+!-------------------------------------------------------------------------------
+! cooling: cools a copy of u_in for n_cool steps and, at every step i with
+! measure_q(i) set, checks the configuration, measures the topological
+! charge and plaquette energy and writes them to UREC (PE 0 only).
+! Returns immediately if n_cool < 0.  u_in itself is not modified.
+!-------------------------------------------------------------------------------
+subroutine cooling(u_in, traj, i_ensemble1, i_ensemble2)
+
+  use module_cooling
+  use module_function_decl
+  use module_p_interface
+  use module_vol
+  implicit none
+
+  integer, intent(in)           :: traj, i_ensemble1, i_ensemble2
+  GAUGE_FIELD, intent(in)       :: u_in
+  P_GAUGE_FIELD, save           :: u
+  integer                       :: i
+  character(len = *), parameter :: key = "%Qc"
+  REAL                          :: q, plaq
+
+  if (n_cool < 0) return
+
+  TIMING_START(timing_bin_cooling)
+
+  ALLOCATE_G_FIELD(u)
+
+  u = u_in
+
+  call begin(UREC, "Cooling")
+
+  if (my_pe() == 0) then
+     write(UREC, 400) "T", key, "traj", "e", "f", "i_cool", &
+                      "Q_cool", "PlaqEnergy"
+  endif
+
+  do i = 0, n_cool
+     if (measure_q(i)) then
+        call conf_check(u)
+        call top_charge(q, u, plaq)
+        if (my_pe() == 0) then
+           write(UREC, 410) key, traj, i_ensemble1, i_ensemble2, i, q, plaq
+        endif
+     endif
+     if (i < n_cool) call conf_relax(u)   ! no relaxation after the last step
+  enddo
+
+  call end_A(UREC, "Cooling")
+
+400 format (1x, 2a, a6, 2a3, a8, a15, a15)
+410 format (1x, a4, i6, 2i3, i8, f15.6, f15.10)
+
+  TIMING_STOP(timing_bin_cooling)
+
+end
+
+!-------------------------------------------------------------------------------
+! conf_relax: one relaxation sweep over the gauge field.
+!
+! For each direction and parity, every link is updated NCOL times, once per
+! index pair (c1, c2) = (1,2), (1,3), (2,3): from w = u * staple the four
+! real numbers p0..p3 are formed and normalized, the matrix a is set to the
+! unit matrix with the (c1, c2) 2x2 block replaced, and u_update() is
+! applied.  Boundaries are exchanged after each (eo, mu) pass.
+!-------------------------------------------------------------------------------
+subroutine conf_relax(u)
+
+  use module_vol
+  implicit none
+
+  GAUGE_FIELD, intent(inout) :: u
+  SU3                        :: uuu, w, a
+  SU3, parameter             :: su3_one = reshape( &
+                                          (/ ONE,ZERO,ZERO, &
+                                             ZERO,ONE,ZERO, &
+                                             ZERO,ZERO,ONE /), &
+                                          (/ NCOL, NCOL /))
+  REAL                       :: p0, p1, p2, p3, fac
+  REAL                       :: a0, a1, a2, a3
+  integer                    :: i, eo, mu, k, c1, c2
+
+  do mu = 1, DIM
+     do eo = EVEN, ODD
+        !$omp parallel do private(uuu, w, a, p0, p1, p2, p3, fac, &
+        !$omp                     a0, a1, a2, a3, k, c1, c2)
+        do i = 1, VOLH
+           call staple(uuu, u, i, eo, mu)
+           do k = 1, NCOL
+              if (k == 1) then
+                 c1 = 1
+                 c2 = 2
+              else if (k == 2) then
+                 c1 = 1
+                 c2 = 3
+              else if (k == 3) then
+                 c1 = 2
+                 c2 = 3
+              endif
+
+              call uu(w, u(1, 1, i, eo, mu), uuu)  ! w = u * uuu
+
+              p0 =   Re(w(c1, c1)) + Re(w(c2, c2))
+              p1 = -(Im(w(c1, c2)) + Im(w(c2, c1)))
+              p2 = -(Re(w(c1, c2)) - Re(w(c2, c1)))
+              p3 = -(Im(w(c1, c1)) - Im(w(c2, c2)))
+
+              fac = ONE / sqrt(p0**2 + p1**2 + p2**2 + p3**2)
+
+              a0 = fac * p0
+              a1 = fac * p1
+              a2 = fac * p2
+              a3 = fac * p3
+
+              a = su3_one
+
+              a(c1, c1) = cmplx( a0,  a3)
+              a(c1, c2) = cmplx( a2,  a1)
+              a(c2, c1) = cmplx(-a2,  a1)
+              a(c2, c2) = cmplx( a0, -a3)
+
+              call u_update(u(1, 1, i, eo, mu), a)  ! u = a * u
+
+           enddo  ! k
+        enddo  ! i
+        call xbound_g(u, eo, mu)
+     enddo  ! eo
+  enddo  ! mu
+end
+
+!-------------------------------------------------------------------------------
+! top_charge: computes the topological charge and the plaquette energy.
+!
+!   qq          - (out) -q / (256 * PI**2), q being the global sum below
+!   u           - (in)  gauge field
+!   plaq_energy - (out) ONE - plaq / (THREE * SIX * volume)
+!
+! For every site and every plane mu < nu the matrix ut(:, :, mu, nu) is
+! built from the products "right" and "left" minus their hermitian
+! conjugates; q accumulates the three Re_Tr_uu() terms (12,34), -(13,24)
+! and (14,23).  q and plaq are summed over all PEs with global_sum().
+!-------------------------------------------------------------------------------
+subroutine top_charge(qq, u, plaq_energy)
+
+  use module_vol
+  use module_nn
+  implicit none
+
+  REAL, intent(out)       :: qq, plaq_energy
+  GAUGE_FIELD, intent(in) :: u
+
+  integer :: e, o, mu, nu, i, j1, j2, j3, j4, j5, j6, j7, c1, c2
+  SU3     :: uuu, left, right
+  COMPLEX, dimension(NCOL, NCOL, DIM - 1, DIM) :: ut  ! U~(x,mu,nu) - h.c.
+  REAL    :: q, plaq
+  REAL, external :: global_sum, Re_Tr_uu
+
+  !----------------------------------------------------------------------
+  !
+  !     (j3, e) --->--- (j2, o) ---<---    x
+  !        |               |               |
+  !        |               |               |
+  !        ^               v               ^           nu
+  !        |               |               |           ^
+  !        |               |               |           |
+  !     (j4, o) ---<---  (i,e)  --->--- (j1, o)        x--> mu
+  !        |               |               |
+  !        |               |               |
+  !        v               ^               v
+  !        |               |               |
+  !        |               |               |
+  !     (j5, e) --->--- (j6, o) ---<--- (j7, e)
+  !
+  !----------------------------------------------------------------------
+
+  q = 0
+  plaq = 0
+
+  do e = EVEN, ODD
+     o = EVEN + ODD - e
+     !$omp parallel do reduction(+: q, plaq) private(uuu, left, right, ut, &
+     !$omp mu, nu, i, j1, j2, j3, j4, j5, j6, j7, c1, c2)
+     do i = 1, VOLH
+        do mu = 1, DIM - 1
+           do nu = mu + 1, DIM
+
+              j1 = nn(i, e, mu, FWD)
+              j2 = nn(i, e, nu, FWD)
+              j3 = nn(j2, o, mu, BWD)
+              j4 = nn(j3, e, nu, BWD)
+              j5 = nn(j4, o, nu, BWD)
+              j6 = nn(j5, e, mu, FWD)
+              j7 = nn(j6, o, mu, FWD)
+
+              uuu = 0
+
+              call uuu_fwd(uuu, u(1, 1, j1, o, nu), &
+                                u(1, 1, j2, o, mu), &
+                                u(1, 1, i, e, nu))
+
+              ! plaquette is taken before the second staple is added
+              plaq = plaq + Re_Tr_uu(uuu, u(1, 1, i, e, mu))
+
+              call uuu_bwd_m(uuu, u(1, 1, j7, e, nu), &
+                                  u(1, 1, j6, o, mu), &
+                                  u(1, 1, j6, o, nu))
+
+              call uu(right, u(1, 1, i, e, mu), uuu)
+
+              uuu = 0
+
+              call uuu_fwd(uuu, u(1, 1, i, e, nu), &
+                                u(1, 1, j3, e, mu), &
+                                u(1, 1, j4, o, nu))
+
+              call uuu_bwd_m(uuu, u(1, 1, j6, o, nu), &
+                                  u(1, 1, j5, e, mu), &
+                                  u(1, 1, j5, e, nu))
+
+              call uu(left, uuu, u(1, 1, j4, o, mu))
+
+              do c2 = 1, NCOL
+                 do c1 = 1, NCOL
+                    ut(c1, c2, mu, nu) = right(c1, c2) - conjg(right(c2, c1)) &
+                                       + left(c1, c2) - conjg(left(c2, c1))
+                 enddo
+              enddo
+           enddo  ! nu
+        enddo  ! mu
+
+        q = q + Re_Tr_uu(ut(1, 1, 1, 2), ut(1, 1, 3, 4)) &
+              - Re_Tr_uu(ut(1, 1, 1, 3), ut(1, 1, 2, 4)) &
+              + Re_Tr_uu(ut(1, 1, 1, 4), ut(1, 1, 2, 3))
+
+     enddo  ! i
+  enddo  ! e/o
+
+  q = global_sum(q)
+  plaq = global_sum(plaq)
+
+  q = -q / (256 * PI**2)
+  qq = q
+  plaq_energy = ONE - plaq / (THREE * SIX * volume)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D.F90
new file mode 100644
index 0000000000000000000000000000000000000000..dc1887d20347cadf207611ee826a99426962eb8d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D.F90
@@ -0,0 +1,52 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! D.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger)
+!         (optimization for Cray T3E)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine NAME(e, o, out, in, u)
+
+! out := NAME in
+!
+! NAME = d or d_dag
+!
+! out is of type "e" = EVEN or ODD
+! 
in is of type "o" = ODD or EVEN
+
+! (D.F90 variant) The boundary of "in" is exchanged first; then one kernel
+! is called for the t direction and separate backward/forward kernels for
+! z, y and x.  Backward kernels receive the links of parity "o", forward
+! kernels those of parity "e".
+
+  use module_nn
+  use module_vol
+  implicit none
+
+  integer :: e, o
+  SPINCOL_FIELD :: out, in
+  GAUGE_FIELD :: u
+
+  TIMING_START(STRCAT(timing_bin_, NAME))
+
+  call xbound_sc_field(in)
+
+  call STRCAT(NAME, _t )(out, in, u(1, 1, 1, e, 4), u(1, 1, 1, o, 4), &
+                         nn(1, e, 4, FWD), nn(1, e, 4, BWD), VOLH)
+  call STRCAT(NAME, _zb)(out, in, u(1, 1, 1, o, 3), nn(1, e, 3, BWD), VOLH)
+  call STRCAT(NAME, _zf)(out, in, u(1, 1, 1, e, 3), nn(1, e, 3, FWD), VOLH)
+  call STRCAT(NAME, _yb)(out, in, u(1, 1, 1, o, 2), nn(1, e, 2, BWD), VOLH)
+  call STRCAT(NAME, _yf)(out, in, u(1, 1, 1, e, 2), nn(1, e, 2, FWD), VOLH)
+  call STRCAT(NAME, _xb)(out, in, u(1, 1, 1, o, 1), nn(1, e, 1, BWD), VOLH)
+  call STRCAT(NAME, _xf)(out, in, u(1, 1, 1, e, 1), nn(1, e, 1, FWD), VOLH)
+
+  TIMING_STOP(STRCAT(timing_bin_, NAME))
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2.F90
new file mode 100644
index 0000000000000000000000000000000000000000..fe9f9fc5618e274f62ee710478b11d8a2145bb59
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2.F90
@@ -0,0 +1,52 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! D2.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger)
+!          (optimization for Hitachi SR8000)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine NAME(e, o, out, in, u)
+
+! out := NAME in
+!
+! NAME = d or d_dag
+!
+! out is of type "e" = EVEN or ODD
+!  in is of type "o" = ODD or EVEN
+
+! (D2.F90 variant) The boundary of "in" is exchanged first; then one kernel
+! per direction (t, z, y, x) is called, each receiving the links of both
+! parities and both the forward and backward neighbour tables.
+
+  use module_nn
+  use module_vol
+  implicit none
+
+  integer :: e, o
+  SPINCOL_FIELD :: out, in
+  GAUGE_FIELD :: u
+
+  TIMING_START(STRCAT(timing_bin_, NAME))
+
+  call xbound_sc_field(in)
+
+  call STRCAT(NAME, _t )(out, in, u(1, 1, 1, e, 4), u(1, 1, 1, o, 4), &
+                         nn(1, e, 4, FWD), nn(1, e, 4, BWD), VOLH)
+  call STRCAT(NAME, _zf)(out, in, u(1, 1, 1, e, 3), u(1, 1, 1, o, 3), &
+                         nn(1, e, 3, FWD), nn(1, e, 3, BWD), VOLH)
+  call STRCAT(NAME, _yf)(out, in, u(1, 1, 1, e, 2), u(1, 1, 1, o, 2), &
+                         nn(1, e, 2, FWD), nn(1, e, 2, BWD), VOLH)
+  call STRCAT(NAME, _xf)(out, in, u(1, 1, 1, e, 1), u(1, 1, 1, o, 1), &
+                         nn(1, e, 1, FWD), nn(1, e, 1, BWD), VOLH)
+
+  TIMING_STOP(STRCAT(timing_bin_, NAME))
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21.F90
new file mode 100644
index 0000000000000000000000000000000000000000..0ce1150ece84f5ace8a67a3871a369565b63d570
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21.F90
@@ -0,0 +1,173 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! D21.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger)
+!           projection onto 2 spincol components
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine NAME(e, o, out, in, u)
+
+! out := NAME in
+!
+! NAME = d or d_dag
+!
+! out is of type "e" = EVEN or ODD
+!  in is of type "o" = ODD or EVEN
+
+! (D21.F90 variant) "in" is first projected into the work field a (taken
+! from module_d21 -- presumably declared there, confirm), the boundary of a
+! is exchanged with xbound_sc2_field_i(), and the direction kernels then
+! read the FWD and BWD parts of a instead of "in".
+
+  use module_d21
+  use module_nn
+  use module_vol
+  use module_p_interface
+  implicit none
+
+  integer :: e, o
+  SPINCOL_FIELD :: out, in
+  GAUGE_FIELD :: u
+
+
+  TIMING_START(STRCAT(timing_bin_, NAME))
+
+  ALLOCATE_SC2_FIELD(a)
+
+  call STRCAT(NAME, _projection)(a, in)
+
+!!call xbound_sc2_field(a)
+  call xbound_sc2_field_i(a)
+
+  call STRCAT(NAME, _t )(out, a(1, 1, 1, 4, FWD), a(1, 1, 1, 4, BWD), &
+                         u(1, 1, 1, e, 4), u(1, 1, 1, o, 4), &
+                         nn(1, e, 4, FWD), nn(1, e, 4, BWD), VOLH)
+
+  call STRCAT(NAME, _zf)(out, a(1, 1, 1, 3, FWD), a(1, 1, 1, 3, BWD), &
+                         u(1, 1, 1, e, 3), u(1, 1, 1, o, 3), &
+                         nn(1, e, 3, FWD), nn(1, e, 3, BWD), VOLH)
+
+  call STRCAT(NAME, _yf)(out, a(1, 1, 1, 2, FWD), a(1, 1, 1, 2, BWD), &
+                         u(1, 1, 1, e, 2), u(1, 1, 1, o, 2), &
+                         nn(1, e, 2, FWD), nn(1, e, 2, BWD), VOLH)
+
+  call STRCAT(NAME, _xf)(out, a(1, 1, 1, 1, FWD), a(1, 1, 1, 1, BWD), &
+                         u(1, 1, 1, e, 1), u(1, 1, 1, o, 1), &
+                         nn(1, e, 1, FWD), nn(1, e, 1, BWD), VOLH)
+
+  TIMING_STOP(STRCAT(timing_bin_, NAME))
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine STRCAT(NAME, _projection)(out, in)
+
+  use module_vol
+  implicit none
+
+  SC2_FIELD, intent(out) :: out
+  SPINCOL_FIELD, intent(in) :: in
+  integer :: i
+
+  ! 
statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + TIMING_START(timing_bin_sc2_projection) + +#ifdef DAGGER +# define PLUS - +# define MINUS + +# define D_T_ONE 3 +# define D_T_TWO 4 +# define D_T_THREE 1 +# define D_T_FOUR 2 +#else +# define PLUS + +# define MINUS - +# define D_T_ONE 1 +# define D_T_TWO 2 +# define D_T_THREE 3 +# define D_T_FOUR 4 +#endif + + + !$omp parallel do + do i = 1, volh + + out(1, 1, i, 1, FWD) = in(1, 1, i) MINUS i_times(in(4, 1, i)) + out(2, 1, i, 1, FWD) = in(2, 1, i) MINUS i_times(in(3, 1, i)) + out(1, 1, i, 1, BWD) = in(1, 1, i) PLUS i_times(in(4, 1, i)) + out(2, 1, i, 1, BWD) = in(2, 1, i) PLUS i_times(in(3, 1, i)) + + out(1, 2, i, 1, FWD) = in(1, 2, i) MINUS i_times(in(4, 2, i)) + out(2, 2, i, 1, FWD) = in(2, 2, i) MINUS i_times(in(3, 2, i)) + out(1, 2, i, 1, BWD) = in(1, 2, i) PLUS i_times(in(4, 2, i)) + out(2, 2, i, 1, BWD) = in(2, 2, i) PLUS i_times(in(3, 2, i)) + + out(1, 3, i, 1, FWD) = in(1, 3, i) MINUS i_times(in(4, 3, i)) + out(2, 3, i, 1, FWD) = in(2, 3, i) MINUS i_times(in(3, 3, i)) + out(1, 3, i, 1, BWD) = in(1, 3, i) PLUS i_times(in(4, 3, i)) + out(2, 3, i, 1, BWD) = in(2, 3, i) PLUS i_times(in(3, 3, i)) + + + out(1, 1, i, 2, FWD) = in(1, 1, i) MINUS in(4, 1, i) + out(2, 1, i, 2, FWD) = in(2, 1, i) PLUS in(3, 1, i) + out(1, 1, i, 2, BWD) = in(1, 1, i) PLUS in(4, 1, i) + out(2, 1, i, 2, BWD) = in(2, 1, i) MINUS in(3, 1, i) + + out(1, 2, i, 2, FWD) = in(1, 2, i) MINUS in(4, 2, i) + out(2, 2, i, 2, FWD) = in(2, 2, i) PLUS in(3, 2, i) + out(1, 2, i, 2, BWD) = in(1, 2, i) PLUS in(4, 2, i) + out(2, 2, i, 2, BWD) = in(2, 2, i) MINUS in(3, 2, i) + + out(1, 3, i, 2, FWD) = in(1, 3, i) MINUS in(4, 3, i) + out(2, 3, i, 2, FWD) = in(2, 3, i) PLUS in(3, 3, i) + out(1, 3, i, 2, BWD) = in(1, 3, i) PLUS in(4, 3, i) + out(2, 3, i, 2, BWD) = in(2, 3, i) MINUS in(3, 3, i) + + + out(1, 1, i, 3, FWD) = in(1, 1, i) MINUS i_times(in(3, 1, i)) + out(2, 1, i, 3, FWD) = in(2, 1, i) PLUS i_times(in(4, 1, 
i)) + out(1, 1, i, 3, BWD) = in(1, 1, i) PLUS i_times(in(3, 1, i)) + out(2, 1, i, 3, BWD) = in(2, 1, i) MINUS i_times(in(4, 1, i)) + + out(1, 2, i, 3, FWD) = in(1, 2, i) MINUS i_times(in(3, 2, i)) + out(2, 2, i, 3, FWD) = in(2, 2, i) PLUS i_times(in(4, 2, i)) + out(1, 2, i, 3, BWD) = in(1, 2, i) PLUS i_times(in(3, 2, i)) + out(2, 2, i, 3, BWD) = in(2, 2, i) MINUS i_times(in(4, 2, i)) + + out(1, 3, i, 3, FWD) = in(1, 3, i) MINUS i_times(in(3, 3, i)) + out(2, 3, i, 3, FWD) = in(2, 3, i) PLUS i_times(in(4, 3, i)) + out(1, 3, i, 3, BWD) = in(1, 3, i) PLUS i_times(in(3, 3, i)) + out(2, 3, i, 3, BWD) = in(2, 3, i) MINUS i_times(in(4, 3, i)) + + + out(1, 1, i, 4, FWD) = in(D_T_THREE, 1, i) + out(2, 1, i, 4, FWD) = in(D_T_FOUR, 1, i) + out(1, 1, i, 4, BWD) = in(D_T_ONE, 1, i) + out(2, 1, i, 4, BWD) = in(D_T_TWO, 1, i) + + out(1, 2, i, 4, FWD) = in(D_T_THREE, 2, i) + out(2, 2, i, 4, FWD) = in(D_T_FOUR, 2, i) + out(1, 2, i, 4, BWD) = in(D_T_ONE, 2, i) + out(2, 2, i, 4, BWD) = in(D_T_TWO, 2, i) + + out(1, 3, i, 4, FWD) = in(D_T_THREE, 3, i) + out(2, 3, i, 4, FWD) = in(D_T_FOUR, 3, i) + out(1, 3, i, 4, BWD) = in(D_T_ONE, 3, i) + out(2, 3, i, 4, BWD) = in(D_T_TWO, 3, i) + + enddo + + TIMING_STOP(timing_bin_sc2_projection) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21xyzt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21xyzt.F90 new file mode 100644 index 0000000000000000000000000000000000000000..db8ef9618b9be3f0f02b2d7ba2385734a231d43e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D21xyzt.F90 @@ -0,0 +1,188 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! 
D21xyzt.F90 - routines needed in D21.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +# define GAMMA_AB1(C) a_bwd(1, C, jb) +# define GAMMA_AB2(C) a_bwd(2, C, jb) +# define GAMMA_AF1(C) a_fwd(1, C, jf) +# define GAMMA_AF2(C) a_fwd(2, C, jf) + +#ifdef DIR_T + +#ifdef DAGGER +# define GAMMA_B1(C) bf1_ ## C +# define GAMMA_B2(C) bf2_ ## C +# define GAMMA_B3(C) bb1_ ## C +# define GAMMA_B4(C) bb2_ ## C +#else +# define GAMMA_B1(C) bb1_ ## C +# define GAMMA_B2(C) bb2_ ## C +# define GAMMA_B3(C) bf1_ ## C +# define GAMMA_B4(C) bf2_ ## C +#endif + +# define UPDATE_B(S, C) b(S, C, i) = TWO * GAMMA_B ## S ## (C) + +#else + +#ifdef DAGGER +# define PLUS - +# define MINUS + +#else +# define PLUS + +# define MINUS - +#endif + +#ifdef DIR_X +# define GAMMA_B3(C) MINUS i_times(bb2_ ## C) PLUS i_times(bf2_ ## C) +# define GAMMA_B4(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) +#endif + +#ifdef DIR_Y +# define GAMMA_B3(C) MINUS bb2_ ## C PLUS bf2_ ## C +# define GAMMA_B4(C) PLUS bb1_ ## C MINUS bf1_ ## C +#endif + +#ifdef DIR_Z +# define GAMMA_B3(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) +# define GAMMA_B4(C) PLUS i_times(bb2_ ## C) MINUS i_times(bf2_ ## C) +#endif + +# define GAMMA_B1(C) + bb1_ ## C + bf1_ ## C +# define GAMMA_B2(C) + bb2_ ## C + bf2_ ## C + +# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) GAMMA_B ## S ## (C) + +#endif + +!------------------------------------------------------------------------------- +subroutine NAME(b, a_fwd, a_bwd, u_e, u_o, nn_fwd, nn_bwd, volh) + + implicit none + + COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b + COMPLEX, dimension (2, NCOL, *), intent(in) :: a_fwd, a_bwd + COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u_e, u_o + INTEGER, dimension (*), intent(in) :: nn_fwd, nn_bwd + integer :: volh + + integer :: i, jf, jb + + COMPLEX :: ab1, ab2, af1, af2 + COMPLEX :: bf1_1, bf2_1 + COMPLEX :: bf1_2, bf2_2 + COMPLEX :: bf1_3, bf2_3 + COMPLEX :: 
bb1_1, bb2_1 + COMPLEX :: bb1_2, bb2_2 + COMPLEX :: bb1_3, bb2_3 + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + + TIMING_START(STRCAT(timing_bin_, NAME)) + + !$omp parallel do private(jf, jb, ab1, ab2, af1, af2, & + !$omp bf1_1, bf2_1, bf1_2, bf2_2, bf1_3, bf2_3, & + !$omp bb1_1, bb2_1, bb1_2, bb2_2, bb1_3, bb2_3) + do i = 1, volh + jb = nn_bwd(i) + + ab1 = GAMMA_AB1(1) + ab2 = GAMMA_AB2(1) + + bb1_1 = ab1 * conjg(u_o(1, 1, jb)) + bb2_1 = ab2 * conjg(u_o(1, 1, jb)) + bb1_2 = ab1 * conjg(u_o(1, 2, jb)) + bb2_2 = ab2 * conjg(u_o(1, 2, jb)) + bb1_3 = ab1 * conjg(u_o(1, 3, jb)) + bb2_3 = ab2 * conjg(u_o(1, 3, jb)) + + jf = nn_fwd(i) + + af1 = GAMMA_AF1(1) + af2 = GAMMA_AF2(1) + + bf1_1 = af1 * u_e(1, 1, i) + bf2_1 = af2 * u_e(1, 1, i) + bf1_2 = af1 * u_e(2, 1, i) + bf2_2 = af2 * u_e(2, 1, i) + bf1_3 = af1 * u_e(3, 1, i) + bf2_3 = af2 * u_e(3, 1, i) + + ab1 = GAMMA_AB1(2) + ab2 = GAMMA_AB2(2) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(2, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(2, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(2, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(2, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(2, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(2, 3, jb)) + + af1 = GAMMA_AF1(2) + af2 = GAMMA_AF2(2) + + bf1_1 = bf1_1 + af1 * u_e(1, 2, i) + bf2_1 = bf2_1 + af2 * u_e(1, 2, i) + bf1_2 = bf1_2 + af1 * u_e(2, 2, i) + bf2_2 = bf2_2 + af2 * u_e(2, 2, i) + bf1_3 = bf1_3 + af1 * u_e(3, 2, i) + bf2_3 = bf2_3 + af2 * u_e(3, 2, i) + + ab1 = GAMMA_AB1(3) + ab2 = GAMMA_AB2(3) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(3, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(3, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(3, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(3, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(3, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(3, 3, jb)) + + af1 = GAMMA_AF1(3) + af2 = GAMMA_AF2(3) + + bf1_1 = bf1_1 + af1 * u_e(1, 3, i) + bf2_1 = bf2_1 + af2 * u_e(1, 3, i) + bf1_2 = bf1_2 + af1 * u_e(2, 3, i) + bf2_2 = bf2_2 + af2 * u_e(2, 3, 
i) + bf1_3 = bf1_3 + af1 * u_e(3, 3, i) + bf2_3 = bf2_3 + af2 * u_e(3, 3, i) + + + UPDATE_B(1, 1) + UPDATE_B(2, 1) + UPDATE_B(3, 1) + UPDATE_B(4, 1) + + UPDATE_B(1, 2) + UPDATE_B(2, 2) + UPDATE_B(3, 2) + UPDATE_B(4, 2) + + UPDATE_B(1, 3) + UPDATE_B(2, 3) + UPDATE_B(3, 3) + UPDATE_B(4, 3) + + enddo + + TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2xyzt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2xyzt.F90 new file mode 100644 index 0000000000000000000000000000000000000000..01e0aa6e50b47ce1419a946d11e0ceb11fe02867 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D2xyzt.F90 @@ -0,0 +1,202 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! D2xyzt.F90 - routines needed in D2.F90 +! 
+!------------------------------------------------------------------------------- +# include "defs.h" + +#ifdef DIR_T + +#ifdef DAGGER +# define GAMMA_AB1(C) a(3, C, jb) +# define GAMMA_AB2(C) a(4, C, jb) +# define GAMMA_AF1(C) a(1, C, jf) +# define GAMMA_AF2(C) a(2, C, jf) +# define GAMMA_B1(C) bf1_ ## C +# define GAMMA_B2(C) bf2_ ## C +# define GAMMA_B3(C) bb1_ ## C +# define GAMMA_B4(C) bb2_ ## C +#else +# define GAMMA_AB1(C) a(1, C, jb) +# define GAMMA_AB2(C) a(2, C, jb) +# define GAMMA_AF1(C) a(3, C, jf) +# define GAMMA_AF2(C) a(4, C, jf) +# define GAMMA_B1(C) bb1_ ## C +# define GAMMA_B2(C) bb2_ ## C +# define GAMMA_B3(C) bf1_ ## C +# define GAMMA_B4(C) bf2_ ## C +#endif + +# define UPDATE_B(S, C) b(S, C, i) = TWO * GAMMA_B ## S ## (C) + +#else + +#ifdef DAGGER +# define PLUS - +# define MINUS + +#else +# define PLUS + +# define MINUS - +#endif + +#ifdef DIR_X +# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(4, C, jb)) +# define GAMMA_AB2(C) a(2, C, jb) PLUS i_times(a(3, C, jb)) +# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(4, C, jf)) +# define GAMMA_AF2(C) a(2, C, jf) MINUS i_times(a(3, C, jf)) +# define GAMMA_B3(C) MINUS i_times(bb2_ ## C) PLUS i_times(bf2_ ## C) +# define GAMMA_B4(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) +#endif + +#ifdef DIR_Y +# define GAMMA_AB1(C) a(1, C, jb) PLUS a(4, C, jb) +# define GAMMA_AB2(C) a(2, C, jb) MINUS a(3, C, jb) +# define GAMMA_AF1(C) a(1, C, jf) MINUS a(4, C, jf) +# define GAMMA_AF2(C) a(2, C, jf) PLUS a(3, C, jf) +# define GAMMA_B3(C) MINUS bb2_ ## C PLUS bf2_ ## C +# define GAMMA_B4(C) PLUS bb1_ ## C MINUS bf1_ ## C +#endif + +#ifdef DIR_Z +# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(3, C, jb)) +# define GAMMA_AB2(C) a(2, C, jb) MINUS i_times(a(4, C, jb)) +# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(3, C, jf)) +# define GAMMA_AF2(C) a(2, C, jf) PLUS i_times(a(4, C, jf)) +# define GAMMA_B3(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) +# define GAMMA_B4(C) PLUS i_times(bb2_ ## C) MINUS 
i_times(bf2_ ## C) +#endif + +# define GAMMA_B1(C) + bb1_ ## C + bf1_ ## C +# define GAMMA_B2(C) + bb2_ ## C + bf2_ ## C + +# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) GAMMA_B ## S ## (C) + +#endif + +!------------------------------------------------------------------------------- +subroutine NAME(b, a, u_e, u_o, nn_fwd, nn_bwd, volh) + + implicit none + + COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b + COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: a + COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u_e, u_o + INTEGER, dimension (*), intent(in) :: nn_fwd, nn_bwd + integer :: volh + + integer :: i, jf, jb + + COMPLEX :: ab1, ab2, af1, af2 + COMPLEX :: bf1_1, bf2_1 + COMPLEX :: bf1_2, bf2_2 + COMPLEX :: bf1_3, bf2_3 + COMPLEX :: bb1_1, bb2_1 + COMPLEX :: bb1_2, bb2_2 + COMPLEX :: bb1_3, bb2_3 + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + TIMING_START(STRCAT(timing_bin_, NAME)) + + !$omp parallel do private(jf, jb, ab1, ab2, af1, af2, & + !$omp bf1_1, bf2_1, bf1_2, bf2_2, bf1_3, bf2_3, & + !$omp bb1_1, bb2_1, bb1_2, bb2_2, bb1_3, bb2_3) + do i = 1, volh + jb = nn_bwd(i) + + ab1 = GAMMA_AB1(1) + ab2 = GAMMA_AB2(1) + + bb1_1 = ab1 * conjg(u_o(1, 1, jb)) + bb2_1 = ab2 * conjg(u_o(1, 1, jb)) + bb1_2 = ab1 * conjg(u_o(1, 2, jb)) + bb2_2 = ab2 * conjg(u_o(1, 2, jb)) + bb1_3 = ab1 * conjg(u_o(1, 3, jb)) + bb2_3 = ab2 * conjg(u_o(1, 3, jb)) + + jf = nn_fwd(i) + + af1 = GAMMA_AF1(1) + af2 = GAMMA_AF2(1) + + bf1_1 = af1 * u_e(1, 1, i) + bf2_1 = af2 * u_e(1, 1, i) + bf1_2 = af1 * u_e(2, 1, i) + bf2_2 = af2 * u_e(2, 1, i) + bf1_3 = af1 * u_e(3, 1, i) + bf2_3 = af2 * u_e(3, 1, i) + + ab1 = GAMMA_AB1(2) + ab2 = GAMMA_AB2(2) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(2, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(2, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(2, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(2, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(2, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(2, 3, jb)) + + af1 = 
GAMMA_AF1(2) + af2 = GAMMA_AF2(2) + + bf1_1 = bf1_1 + af1 * u_e(1, 2, i) + bf2_1 = bf2_1 + af2 * u_e(1, 2, i) + bf1_2 = bf1_2 + af1 * u_e(2, 2, i) + bf2_2 = bf2_2 + af2 * u_e(2, 2, i) + bf1_3 = bf1_3 + af1 * u_e(3, 2, i) + bf2_3 = bf2_3 + af2 * u_e(3, 2, i) + + ab1 = GAMMA_AB1(3) + ab2 = GAMMA_AB2(3) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(3, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(3, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(3, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(3, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(3, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(3, 3, jb)) + + af1 = GAMMA_AF1(3) + af2 = GAMMA_AF2(3) + + bf1_1 = bf1_1 + af1 * u_e(1, 3, i) + bf2_1 = bf2_1 + af2 * u_e(1, 3, i) + bf1_2 = bf1_2 + af1 * u_e(2, 3, i) + bf2_2 = bf2_2 + af2 * u_e(2, 3, i) + bf1_3 = bf1_3 + af1 * u_e(3, 3, i) + bf2_3 = bf2_3 + af2 * u_e(3, 3, i) + + + UPDATE_B(1, 1) + UPDATE_B(2, 1) + UPDATE_B(3, 1) + UPDATE_B(4, 1) + + UPDATE_B(1, 2) + UPDATE_B(2, 2) + UPDATE_B(3, 2) + UPDATE_B(4, 2) + + UPDATE_B(1, 3) + UPDATE_B(2, 3) + UPDATE_B(3, 3) + UPDATE_B(4, 3) + + enddo + + TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3.F90 new file mode 100644 index 0000000000000000000000000000000000000000..f2916b5d3d041ca90c915f51913ecbfb90c60b1f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3.F90 @@ -0,0 +1,116 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! D3.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger) +! (optimization for Hitachi SR8000: hybrid programming model, +! 
MPI + OpenMP + overlapping communication and computation) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine NAME(ee, oo, out, in, u) + +! out := NAME in +! Thread 0 makes the xbound_* calls while the other threads run the _xf, _yf, _zf kernels; all threads run the _t kernel. +! NAME = d or d_dag +! +! out is of type "e" = EVEN or ODD +! in is of type "o" = ODD or EVEN + + use module_nn + use module_vol + use module_thread + implicit none + + integer :: ee, oo + SPINCOL_FIELD :: out, in + GAUGE_FIELD :: u + + integer :: thread, i1, i2, omp_get_thread_num, e, o + + TIMING_START(STRCAT(timing_bin_, NAME)) + + call xbound_fill_buffer_y(in) + call xbound_fill_buffer_z(in) + + !$omp parallel private(thread, i1, i2, e, o) + + thread = omp_get_thread_num() + e = ee + o = oo + + !$omp barrier + + i1 = xyz_start(thread) + i2 = xyz_end(thread) + + if (thread == 0) then + TIMING_START(timing_bin_d_xf) + call xbound_copy_buffer_y(in) + else + call STRCAT(NAME, _xf)(out, in, u(1, 1, 1, e, 1), & + u(1, 1, 1, o, 1), & + nn(1, e, 1, FWD), & + nn(1, e, 1, BWD), i1, i2) + endif + + !$omp barrier + + if (thread == 0) then + TIMING_STOP(timing_bin_d_xf) + TIMING_START(timing_bin_d_yf) + call xbound_copy_buffer_z(in) + !!call xbound_d3(in, 3) + else + call STRCAT(NAME, _yf)(out, in, u(1, 1, 1, e, 2), & + u(1, 1, 1, o, 2), & + nn(1, e, 2, FWD), & + nn(1, e, 2, BWD), i1, i2) + endif + + !$omp barrier + + if (thread == 0) then + TIMING_STOP(timing_bin_d_yf) + TIMING_START(timing_bin_d_zf) + call xbound_d3(in, 4) + else + call STRCAT(NAME, _zf)(out, in, u(1, 1, 1, e, 3), & + u(1, 1, 1, o, 3), & + nn(1, e, 3, FWD), & + nn(1, e, 3, BWD), i1, i2) + endif + + !$omp barrier + +#ifdef TIMING + if (thread == 0) then + TIMING_STOP(timing_bin_d_zf) + TIMING_START(timing_bin_d_t) + endif +#endif + + i1 = t_start(thread) + i2 = t_end(thread) + + call STRCAT(NAME, _t )(out, in, u(1, 1, 1, e, 4), & + u(1, 1, 1, o, 4), & + nn(1, e, 4, FWD), & + nn(1, e, 4, BWD), i1, i2) + + 
!$omp end parallel + + TIMING_STOP(timing_bin_d_t) + TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31.F90 new file mode 100644 index 0000000000000000000000000000000000000000..81da092c7102ce3c7954fe6c420110c38cdc368a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31.F90 @@ -0,0 +1,104 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! D31.F90 - multiplication with the Wilson hopping matrix D (or D^\dagger) +! (optimization for Hitachi SR8000: hybrid programming model, +! MPI + OpenMP + overlapping communication and computation) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine NAME(ee, oo, out, in, u) + +! out := NAME in +! Thread 0 makes the xbound_* calls; other threads call the _switch_0 (mu=1) and _switch (mu=2,3) layers; all do mu=4. +! NAME = d or d_dag +! +! out is of type "e" = EVEN or ODD +! 
in is of type "o" = ODD or EVEN + + use module_nn + use module_vol + use module_thread + implicit none + + integer :: ee, oo + SPINCOL_FIELD :: out, in + GAUGE_FIELD :: u + + integer :: thread, i1, i2, omp_get_thread_num, e, o + + TIMING_START(STRCAT(timing_bin_, NAME)) + + call xbound_fill_buffer_y(in) + call xbound_fill_buffer_z(in) + + !$omp parallel private(thread, i1, i2, e, o) + + thread = omp_get_thread_num() + e = ee + o = oo + + !$omp barrier + + i1 = xyz_start(thread) + i2 = xyz_end(thread) + + if (thread == 0) then + TIMING_START(timing_bin_d_xf) + call xbound_copy_buffer_y(in) + else + call STRCAT(NAME, _switch_0)(e, o, out, in, u, i1, i2, 1) + endif + + !$omp barrier + + if (thread == 0) then + TIMING_STOP(timing_bin_d_xf) + TIMING_START(timing_bin_d_yf) + call xbound_copy_buffer_z(in) + !!call xbound_d3(in, 3) + else + call STRCAT(NAME, _switch)(e, o, out, in, u, i1, i2, 2) + endif + + !$omp barrier + + if (thread == 0) then + TIMING_STOP(timing_bin_d_yf) + TIMING_START(timing_bin_d_zf) + call xbound_d3(in, 4) + else + call STRCAT(NAME, _switch)(e, o, out, in, u, i1, i2, 3) + endif + + !$omp barrier + +#ifdef TIMING + if (thread == 0) then + TIMING_STOP(timing_bin_d_zf) + TIMING_START(timing_bin_d_t) + endif +#endif + + i1 = t_start(thread) + i2 = t_end(thread) + + call STRCAT(NAME, _switch)(e, o, out, in, u, i1, i2, 4) + + !$omp end parallel + + TIMING_STOP(timing_bin_d_t) + TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31_switch.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31_switch.F90 new file mode 100644 index 0000000000000000000000000000000000000000..bf61467401f0815012d1aac01c2ff3ee15e2fc79 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31_switch.F90 @@ -0,0 +1,59 @@ +!=============================================================================== +! +! 
BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! D31_switch.F90 - switch layer for arbitrary choice of "gamma_index" +! +!------------------------------------------------------------------------------- +# include "defs.h" + +#ifdef INIT +# define THE_NAME(a) STRCAT3(NAME, a, _0) +#else +# define THE_NAME(a) STRCAT(NAME, a) +#endif + +! NAME = d or d_dag; gamma_index(mu) = 1, 2, 3, 4 selects the _xf, _yf, _zf, _t routine (built with _0 via STRCAT3 if INIT) +!------------------------------------------------------------------------------- +subroutine THE_NAME(_switch)(e, o, out, in, u, i1, i2, mu) + + use module_lattice + use module_nn + use module_vol + implicit none + + integer :: e, o + SPINCOL_FIELD :: out, in + GAUGE_FIELD :: u + integer :: i1, i2, mu + + select case (gamma_index(mu)) + case (1) + call THE_NAME(_xf)(out, in, u(1, 1, 1, e, 1), & + u(1, 1, 1, o, 1), & + nn(1, e, 1, FWD), & + nn(1, e, 1, BWD), i1, i2) + case (2) + call THE_NAME(_yf)(out, in, u(1, 1, 1, e, 2), & + u(1, 1, 1, o, 2), & + nn(1, e, 2, FWD), & + nn(1, e, 2, BWD), i1, i2) + case (3) + call THE_NAME(_zf)(out, in, u(1, 1, 1, e, 3), & + u(1, 1, 1, o, 3), & + nn(1, e, 3, FWD), & + nn(1, e, 3, BWD), i1, i2) + case (4) + call THE_NAME(_t )(out, in, u(1, 1, 1, e, 4), & + u(1, 1, 1, o, 4), & + nn(1, e, 4, FWD), & + nn(1, e, 4, BWD), i1, i2) + end select +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31xyzt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31xyzt.F90 new file mode 100644 index 0000000000000000000000000000000000000000..534277d9eb8aba4c5de9085dea78aafc7f572640 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D31xyzt.F90 @@ -0,0 +1,213 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! 
Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! D31xyzt.F90 - routines needed in D31.F90 (called through D31_switch.F90) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +#ifdef DIR_T + +#ifdef DAGGER +# define GAMMA_AB1(C) a(3, C, jb) +# define GAMMA_AB2(C) a(4, C, jb) +# define GAMMA_AF1(C) a(1, C, jf) +# define GAMMA_AF2(C) a(2, C, jf) +# define GAMMA_B1(C) bf1_ ## C +# define GAMMA_B2(C) bf2_ ## C +# define GAMMA_B3(C) bb1_ ## C +# define GAMMA_B4(C) bb2_ ## C +#else +# define GAMMA_AB1(C) a(1, C, jb) +# define GAMMA_AB2(C) a(2, C, jb) +# define GAMMA_AF1(C) a(3, C, jf) +# define GAMMA_AF2(C) a(4, C, jf) +# define GAMMA_B1(C) bb1_ ## C +# define GAMMA_B2(C) bb2_ ## C +# define GAMMA_B3(C) bf1_ ## C +# define GAMMA_B4(C) bf2_ ## C +#endif + +#ifdef INIT +# define UPDATE_B(S, C) b(S, C, i) = TWO * GAMMA_B ## S ## (C) +#else +# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) + TWO * GAMMA_B ## S ## (C) +#endif + +#else + +#ifdef DAGGER +# define PLUS - +# define MINUS + +#else +# define PLUS + +# define MINUS - +#endif + +#ifdef DIR_X +# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(4, C, jb)) +# define GAMMA_AB2(C) a(2, C, jb) PLUS i_times(a(3, C, jb)) +# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(4, C, jf)) +# define GAMMA_AF2(C) a(2, C, jf) MINUS i_times(a(3, C, jf)) +# define GAMMA_B3(C) MINUS i_times(bb2_ ## C) PLUS i_times(bf2_ ## C) +# define GAMMA_B4(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) +#endif + +#ifdef DIR_Y +# define GAMMA_AB1(C) a(1, C, jb) PLUS a(4, C, jb) +# define GAMMA_AB2(C) a(2, C, jb) MINUS a(3, C, jb) +# define GAMMA_AF1(C) a(1, C, jf) MINUS a(4, C, jf) +# define GAMMA_AF2(C) a(2, C, jf) PLUS a(3, C, jf) +# define GAMMA_B3(C) MINUS bb2_ ## C PLUS bf2_ ## C +# define GAMMA_B4(C) PLUS bb1_ ## C MINUS bf1_ ## C +#endif + +#ifdef DIR_Z +# define GAMMA_AB1(C) a(1, C, jb) 
PLUS i_times(a(3, C, jb)) +# define GAMMA_AB2(C) a(2, C, jb) MINUS i_times(a(4, C, jb)) +# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(3, C, jf)) +# define GAMMA_AF2(C) a(2, C, jf) PLUS i_times(a(4, C, jf)) +# define GAMMA_B3(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) +# define GAMMA_B4(C) PLUS i_times(bb2_ ## C) MINUS i_times(bf2_ ## C) +#endif + +# define GAMMA_B1(C) + bb1_ ## C + bf1_ ## C +# define GAMMA_B2(C) + bb2_ ## C + bf2_ ## C + +#ifdef INIT +# define UPDATE_B(S, C) b(S, C, i) = GAMMA_B ## S ## (C) +#else +# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) GAMMA_B ## S ## (C) +#endif + +#endif + +#ifdef INIT +# define THE_NAME STRCAT(NAME, _0) +#else +# define THE_NAME NAME +#endif + +!------------------------------------------------------------------------------- +subroutine THE_NAME(b, a, u_e, u_o, nn_fwd, nn_bwd, i1, i2) + + implicit none + + COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b + COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: a + COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u_e, u_o + INTEGER, dimension (*), intent(in) :: nn_fwd, nn_bwd + integer :: i1, i2 + + integer :: i, jf, jb + + COMPLEX :: ab1, ab2, af1, af2 + COMPLEX :: bf1_1, bf2_1 + COMPLEX :: bf1_2, bf2_2 + COMPLEX :: bf1_3, bf2_3 + COMPLEX :: bb1_1, bb2_1 + COMPLEX :: bb1_2, bb2_2 + COMPLEX :: bb1_3, bb2_3 + + ! 
statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + !!TIMING_START(STRCAT(timing_bin_, NAME)) + + do i = i1, i2 + jb = nn_bwd(i) + + ab1 = GAMMA_AB1(1) + ab2 = GAMMA_AB2(1) + + bb1_1 = ab1 * conjg(u_o(1, 1, jb)) + bb2_1 = ab2 * conjg(u_o(1, 1, jb)) + bb1_2 = ab1 * conjg(u_o(1, 2, jb)) + bb2_2 = ab2 * conjg(u_o(1, 2, jb)) + bb1_3 = ab1 * conjg(u_o(1, 3, jb)) + bb2_3 = ab2 * conjg(u_o(1, 3, jb)) + + jf = nn_fwd(i) + + af1 = GAMMA_AF1(1) + af2 = GAMMA_AF2(1) + + bf1_1 = af1 * u_e(1, 1, i) + bf2_1 = af2 * u_e(1, 1, i) + bf1_2 = af1 * u_e(2, 1, i) + bf2_2 = af2 * u_e(2, 1, i) + bf1_3 = af1 * u_e(3, 1, i) + bf2_3 = af2 * u_e(3, 1, i) + + ab1 = GAMMA_AB1(2) + ab2 = GAMMA_AB2(2) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(2, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(2, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(2, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(2, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(2, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(2, 3, jb)) + + af1 = GAMMA_AF1(2) + af2 = GAMMA_AF2(2) + + bf1_1 = bf1_1 + af1 * u_e(1, 2, i) + bf2_1 = bf2_1 + af2 * u_e(1, 2, i) + bf1_2 = bf1_2 + af1 * u_e(2, 2, i) + bf2_2 = bf2_2 + af2 * u_e(2, 2, i) + bf1_3 = bf1_3 + af1 * u_e(3, 2, i) + bf2_3 = bf2_3 + af2 * u_e(3, 2, i) + + ab1 = GAMMA_AB1(3) + ab2 = GAMMA_AB2(3) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(3, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(3, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(3, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(3, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(3, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(3, 3, jb)) + + af1 = GAMMA_AF1(3) + af2 = GAMMA_AF2(3) + + bf1_1 = bf1_1 + af1 * u_e(1, 3, i) + bf2_1 = bf2_1 + af2 * u_e(1, 3, i) + bf1_2 = bf1_2 + af1 * u_e(2, 3, i) + bf2_2 = bf2_2 + af2 * u_e(2, 3, i) + bf1_3 = bf1_3 + af1 * u_e(3, 3, i) + bf2_3 = bf2_3 + af2 * u_e(3, 3, i) + + + UPDATE_B(1, 1) + UPDATE_B(2, 1) + UPDATE_B(3, 1) + UPDATE_B(4, 1) + + UPDATE_B(1, 2) + UPDATE_B(2, 2) + UPDATE_B(3, 2) + UPDATE_B(4, 2) + + 
UPDATE_B(1, 3) + UPDATE_B(2, 3) + UPDATE_B(3, 3) + UPDATE_B(4, 3) + + enddo + + !!TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3xyzt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3xyzt.F90 new file mode 100644 index 0000000000000000000000000000000000000000..391e3189644197b1eeb0d29c3db87ed3d60d579f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/D3xyzt.F90 @@ -0,0 +1,203 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! D3xyzt.F90 - routines needed in D3.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +#ifdef DIR_T + +#ifdef DAGGER +# define GAMMA_AB1(C) a(3, C, jb) +# define GAMMA_AB2(C) a(4, C, jb) +# define GAMMA_AF1(C) a(1, C, jf) +# define GAMMA_AF2(C) a(2, C, jf) +# define GAMMA_B1(C) bf1_ ## C +# define GAMMA_B2(C) bf2_ ## C +# define GAMMA_B3(C) bb1_ ## C +# define GAMMA_B4(C) bb2_ ## C +#else +# define GAMMA_AB1(C) a(1, C, jb) +# define GAMMA_AB2(C) a(2, C, jb) +# define GAMMA_AF1(C) a(3, C, jf) +# define GAMMA_AF2(C) a(4, C, jf) +# define GAMMA_B1(C) bb1_ ## C +# define GAMMA_B2(C) bb2_ ## C +# define GAMMA_B3(C) bf1_ ## C +# define GAMMA_B4(C) bf2_ ## C +#endif + +# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) + TWO * GAMMA_B ## S ## (C) + +#else + +#ifdef DAGGER +# define PLUS - +# define MINUS + +#else +# define PLUS + +# define MINUS - +#endif + +#ifdef DIR_X +# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(4, C, jb)) +# define GAMMA_AB2(C) a(2, C, jb) PLUS i_times(a(3, C, jb)) +# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(4, C, jf)) +# define 
GAMMA_AF2(C) a(2, C, jf) MINUS i_times(a(3, C, jf)) +# define GAMMA_B3(C) MINUS i_times(bb2_ ## C) PLUS i_times(bf2_ ## C) +# define GAMMA_B4(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) + +# define UPDATE_B(S, C) b(S, C, i) = GAMMA_B ## S ## (C) +#endif + +#ifdef DIR_Y +# define GAMMA_AB1(C) a(1, C, jb) PLUS a(4, C, jb) +# define GAMMA_AB2(C) a(2, C, jb) MINUS a(3, C, jb) +# define GAMMA_AF1(C) a(1, C, jf) MINUS a(4, C, jf) +# define GAMMA_AF2(C) a(2, C, jf) PLUS a(3, C, jf) +# define GAMMA_B3(C) MINUS bb2_ ## C PLUS bf2_ ## C +# define GAMMA_B4(C) PLUS bb1_ ## C MINUS bf1_ ## C + +# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) GAMMA_B ## S ## (C) +#endif + +#ifdef DIR_Z +# define GAMMA_AB1(C) a(1, C, jb) PLUS i_times(a(3, C, jb)) +# define GAMMA_AB2(C) a(2, C, jb) MINUS i_times(a(4, C, jb)) +# define GAMMA_AF1(C) a(1, C, jf) MINUS i_times(a(3, C, jf)) +# define GAMMA_AF2(C) a(2, C, jf) PLUS i_times(a(4, C, jf)) +# define GAMMA_B3(C) MINUS i_times(bb1_ ## C) PLUS i_times(bf1_ ## C) +# define GAMMA_B4(C) PLUS i_times(bb2_ ## C) MINUS i_times(bf2_ ## C) + +# define UPDATE_B(S, C) b(S, C, i) = b(S, C, i) GAMMA_B ## S ## (C) +#endif + +# define GAMMA_B1(C) + bb1_ ## C + bf1_ ## C +# define GAMMA_B2(C) + bb2_ ## C + bf2_ ## C + +#endif + +!------------------------------------------------------------------------------- +subroutine NAME(b, a, u_e, u_o, nn_fwd, nn_bwd, i1, i2) + + implicit none + + COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b + COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: a + COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u_e, u_o + INTEGER, dimension (*), intent(in) :: nn_fwd, nn_bwd + integer :: i1, i2 + + integer :: i, jf, jb + + COMPLEX :: ab1, ab2, af1, af2 + COMPLEX :: bf1_1, bf2_1 + COMPLEX :: bf1_2, bf2_2 + COMPLEX :: bf1_3, bf2_3 + COMPLEX :: bb1_1, bb2_1 + COMPLEX :: bb1_2, bb2_2 + COMPLEX :: bb1_3, bb2_3 + + ! 
statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + !!TIMING_START(STRCAT(timing_bin_, NAME)) + + do i = i1, i2 + jb = nn_bwd(i) + + ab1 = GAMMA_AB1(1) + ab2 = GAMMA_AB2(1) + + bb1_1 = ab1 * conjg(u_o(1, 1, jb)) + bb2_1 = ab2 * conjg(u_o(1, 1, jb)) + bb1_2 = ab1 * conjg(u_o(1, 2, jb)) + bb2_2 = ab2 * conjg(u_o(1, 2, jb)) + bb1_3 = ab1 * conjg(u_o(1, 3, jb)) + bb2_3 = ab2 * conjg(u_o(1, 3, jb)) + + jf = nn_fwd(i) + + af1 = GAMMA_AF1(1) + af2 = GAMMA_AF2(1) + + bf1_1 = af1 * u_e(1, 1, i) + bf2_1 = af2 * u_e(1, 1, i) + bf1_2 = af1 * u_e(2, 1, i) + bf2_2 = af2 * u_e(2, 1, i) + bf1_3 = af1 * u_e(3, 1, i) + bf2_3 = af2 * u_e(3, 1, i) + + ab1 = GAMMA_AB1(2) + ab2 = GAMMA_AB2(2) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(2, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(2, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(2, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(2, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(2, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(2, 3, jb)) + + af1 = GAMMA_AF1(2) + af2 = GAMMA_AF2(2) + + bf1_1 = bf1_1 + af1 * u_e(1, 2, i) + bf2_1 = bf2_1 + af2 * u_e(1, 2, i) + bf1_2 = bf1_2 + af1 * u_e(2, 2, i) + bf2_2 = bf2_2 + af2 * u_e(2, 2, i) + bf1_3 = bf1_3 + af1 * u_e(3, 2, i) + bf2_3 = bf2_3 + af2 * u_e(3, 2, i) + + ab1 = GAMMA_AB1(3) + ab2 = GAMMA_AB2(3) + + bb1_1 = bb1_1 + ab1 * conjg(u_o(3, 1, jb)) + bb2_1 = bb2_1 + ab2 * conjg(u_o(3, 1, jb)) + bb1_2 = bb1_2 + ab1 * conjg(u_o(3, 2, jb)) + bb2_2 = bb2_2 + ab2 * conjg(u_o(3, 2, jb)) + bb1_3 = bb1_3 + ab1 * conjg(u_o(3, 3, jb)) + bb2_3 = bb2_3 + ab2 * conjg(u_o(3, 3, jb)) + + af1 = GAMMA_AF1(3) + af2 = GAMMA_AF2(3) + + bf1_1 = bf1_1 + af1 * u_e(1, 3, i) + bf2_1 = bf2_1 + af2 * u_e(1, 3, i) + bf1_2 = bf1_2 + af1 * u_e(2, 3, i) + bf2_2 = bf2_2 + af2 * u_e(2, 3, i) + bf1_3 = bf1_3 + af1 * u_e(3, 3, i) + bf2_3 = bf2_3 + af2 * u_e(3, 3, i) + + + UPDATE_B(1, 1) + UPDATE_B(2, 1) + UPDATE_B(3, 1) + UPDATE_B(4, 1) + + UPDATE_B(1, 2) + UPDATE_B(2, 2) + UPDATE_B(3, 2) + UPDATE_B(4, 2) + + 
UPDATE_B(1, 3) + UPDATE_B(2, 3) + UPDATE_B(3, 3) + UPDATE_B(4, 3) + + enddo + + !!TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/DSFxyzt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/DSFxyzt.F90 new file mode 100644 index 0000000000000000000000000000000000000000..23b0d6fb9600d4b9f4411d95e119babf591e61ea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/DSFxyzt.F90 @@ -0,0 +1,108 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! DSFxyzt.F90 - routines (for standard Wilson fermions) needed in dsf.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +#ifdef DIR_X +# define GAMMA_A1(C) a(1, C, J) minus i_times(a(4, C, J)) +# define GAMMA_A2(C) a(2, C, J) minus i_times(a(3, C, J)) +# define GAMMA_A3(C) plus i_times(a2) +# define GAMMA_A4(C) plus i_times(a1) +#endif + +#ifdef DIR_Y +# define GAMMA_A1(C) a(1, C, J) minus a(4, C, J) +# define GAMMA_A2(C) a(2, C, J) plus a(3, C, J) +# define GAMMA_A3(C) plus a2 +# define GAMMA_A4(C) minus a1 +#endif + +#ifdef DIR_Z +# define GAMMA_A1(C) a(1, C, J) minus i_times(a(3, C, J)) +# define GAMMA_A2(C) a(2, C, J) plus i_times(a(4, C, J)) +# define GAMMA_A3(C) plus i_times(a1) +# define GAMMA_A4(C) minus i_times(a2) +#endif + +#ifdef DIR_T +#ifdef FORWARD +# define GAMMA_A1(C) ZERO +# define GAMMA_A2(C) ZERO +# define GAMMA_A3(C) TWO * a(3, C, J) +# define GAMMA_A4(C) TWO * a(4, C, J) +#else +# define GAMMA_A1(C) TWO * a(1, C, J) +# define GAMMA_A2(C) TWO * a(2, C, J) +# define GAMMA_A3(C) ZERO +# define GAMMA_A4(C) ZERO +#endif +#endif + 
+#ifdef FORWARD +# define UU(R, A, B) uu(R, A, B) +# define plus + +# define minus - +# define I i +# define J j +#else +# define UU(R, A, B) uud(R, B, A) +# define plus - +# define minus + +# define I j +# define J i +#endif + +!------------------------------------------------------------------------------- +subroutine NAME(p, b, a, s, u, nn, volh) + + implicit none + + REAL, dimension(NGEN, *), intent(inout) :: p + REAL, intent(in) :: s + COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: b, a + COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u + INTEGER, intent(in) :: nn(*) + integer :: volh + + integer :: i, j, ca, cb + COMPLEX :: a1, a2, a3, a4 + SU3 :: v, w + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + + !$omp parallel do private(j,ca,cb,a1,a2,a3,a4,w,v) + do i = 1, volh + j = nn(i) + do ca = 1, NCOL + a1 = GAMMA_A1(ca) + a2 = GAMMA_A2(ca) + a3 = GAMMA_A3(ca) + a4 = GAMMA_A4(ca) + do cb = 1, NCOL + w(ca, cb) = a1 * conjg(b(1, cb, I)) & + + a2 * conjg(b(2, cb, I)) & + + a3 * conjg(b(3, cb, I)) & + + a4 * conjg(b(4, cb, I)) + enddo + enddo + call UU(v, u(1, 1, i), w) + call im_tr_j(p(1, i), v, minus s) + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/DVersion.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/DVersion.F90 new file mode 100644 index 0000000000000000000000000000000000000000..7748291d4c1e1e9861b486f687425a94c53a4ef2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/DVersion.F90 @@ -0,0 +1,20 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! DVersion.F90 +! 
+!------------------------------------------------------------------------------- + +integer function version_of_d() + implicit none + version_of_d = VERSION +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dt.F90 new file mode 100644 index 0000000000000000000000000000000000000000..28f6232e7ad1083562c3dca27fac0b0c49445739 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dt.F90 @@ -0,0 +1,154 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! Dt.F90 - routines needed in D.F90 (t-direction) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +#ifdef DAGGER +# define GAMMA_A1(C) a(3, C, jb) +# define GAMMA_A2(C) a(4, C, jb) +# define GAMMA_A3(C) a(1, C, jf) +# define GAMMA_A4(C) a(2, C, jf) +# define GAMMA_B1(C) f1_ ## C +# define GAMMA_B2(C) f2_ ## C +# define GAMMA_B3(C) b1_ ## C +# define GAMMA_B4(C) b2_ ## C +#else +# define GAMMA_A1(C) a(1, C, jb) +# define GAMMA_A2(C) a(2, C, jb) +# define GAMMA_A3(C) a(3, C, jf) +# define GAMMA_A4(C) a(4, C, jf) +# define GAMMA_B1(C) b1_ ## C +# define GAMMA_B2(C) b2_ ## C +# define GAMMA_B3(C) f1_ ## C +# define GAMMA_B4(C) f2_ ## C +#endif + +!------------------------------------------------------------------------------- +subroutine NAME(b, a, u_e, u_o, nn_fwd, nn_bwd, volh) + + implicit none + + COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b + COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: a + COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u_e, u_o + INTEGER, dimension (*), intent(in) :: nn_fwd, nn_bwd + integer :: 
volh + + integer :: i, jf, jb + + COMPLEX :: a1, a2, a3, a4 + COMPLEX :: f1_1, f2_1 + COMPLEX :: f1_2, f2_2 + COMPLEX :: f1_3, f2_3 + COMPLEX :: b1_1, b2_1 + COMPLEX :: b1_2, b2_2 + COMPLEX :: b1_3, b2_3 + + ! statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + TIMING_START(STRCAT(timing_bin_, NAME)) + + !$omp parallel do private(jf, jb, a1, a2, a3, a4, & + !$omp f1_1, f2_1, f1_2, f2_2, f1_3, f2_3, & + !$omp b1_1, b2_1, b1_2, b2_2, b1_3, b2_3) + do i = 1, volh + jb = nn_bwd(i) + + a1 = GAMMA_A1(1) + a2 = GAMMA_A2(1) + + b1_1 = a1 * conjg(u_o(1, 1, jb)) + b2_1 = a2 * conjg(u_o(1, 1, jb)) + b1_2 = a1 * conjg(u_o(1, 2, jb)) + b2_2 = a2 * conjg(u_o(1, 2, jb)) + b1_3 = a1 * conjg(u_o(1, 3, jb)) + b2_3 = a2 * conjg(u_o(1, 3, jb)) + + jf = nn_fwd(i) + + a3 = GAMMA_A3(1) + a4 = GAMMA_A4(1) + + f1_1 = a3 * u_e(1, 1, i) + f2_1 = a4 * u_e(1, 1, i) + f1_2 = a3 * u_e(2, 1, i) + f2_2 = a4 * u_e(2, 1, i) + f1_3 = a3 * u_e(3, 1, i) + f2_3 = a4 * u_e(3, 1, i) + + a1 = GAMMA_A1(2) + a2 = GAMMA_A2(2) + + b1_1 = b1_1 + a1 * conjg(u_o(2, 1, jb)) + b2_1 = b2_1 + a2 * conjg(u_o(2, 1, jb)) + b1_2 = b1_2 + a1 * conjg(u_o(2, 2, jb)) + b2_2 = b2_2 + a2 * conjg(u_o(2, 2, jb)) + b1_3 = b1_3 + a1 * conjg(u_o(2, 3, jb)) + b2_3 = b2_3 + a2 * conjg(u_o(2, 3, jb)) + + a3 = GAMMA_A3(2) + a4 = GAMMA_A4(2) + + f1_1 = f1_1 + a3 * u_e(1, 2, i) + f2_1 = f2_1 + a4 * u_e(1, 2, i) + f1_2 = f1_2 + a3 * u_e(2, 2, i) + f2_2 = f2_2 + a4 * u_e(2, 2, i) + f1_3 = f1_3 + a3 * u_e(3, 2, i) + f2_3 = f2_3 + a4 * u_e(3, 2, i) + + a1 = GAMMA_A1(3) + a2 = GAMMA_A2(3) + + b1_1 = b1_1 + a1 * conjg(u_o(3, 1, jb)) + b2_1 = b2_1 + a2 * conjg(u_o(3, 1, jb)) + b1_2 = b1_2 + a1 * conjg(u_o(3, 2, jb)) + b2_2 = b2_2 + a2 * conjg(u_o(3, 2, jb)) + b1_3 = b1_3 + a1 * conjg(u_o(3, 3, jb)) + b2_3 = b2_3 + a2 * conjg(u_o(3, 3, jb)) + + a3 = GAMMA_A3(3) + a4 = GAMMA_A4(3) + + f1_1 = f1_1 + a3 * u_e(1, 3, i) + f2_1 = f2_1 + a4 * u_e(1, 3, i) + f1_2 = f1_2 + a3 * u_e(2, 3, i) + f2_2 = f2_2 + a4 * 
u_e(2, 3, i) + f1_3 = f1_3 + a3 * u_e(3, 3, i) + f2_3 = f2_3 + a4 * u_e(3, 3, i) + + + b(1, 1, i) = TWO * GAMMA_B1(1) + b(2, 1, i) = TWO * GAMMA_B2(1) + b(3, 1, i) = TWO * GAMMA_B3(1) + b(4, 1, i) = TWO * GAMMA_B4(1) + + b(1, 2, i) = TWO * GAMMA_B1(2) + b(2, 2, i) = TWO * GAMMA_B2(2) + b(3, 2, i) = TWO * GAMMA_B3(2) + b(4, 2, i) = TWO * GAMMA_B4(2) + + b(1, 3, i) = TWO * GAMMA_B1(3) + b(2, 3, i) = TWO * GAMMA_B2(3) + b(3, 3, i) = TWO * GAMMA_B3(3) + b(4, 3, i) = TWO * GAMMA_B4(3) + + enddo + + TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dxyz.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dxyz.F90 new file mode 100644 index 0000000000000000000000000000000000000000..a60d023a9327832b816e1a5eb2b0b7f6b282acd7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Dxyz.F90 @@ -0,0 +1,137 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! Dxyz.F90 - routines needed in D.F90 (x/y/z-directions) +! 
+!------------------------------------------------------------------------------- +# include "defs.h" + +#ifdef DIR_X +# define GAMMA_A1(C) a(1, C, j) minus i_times(a(4, C, j)) +# define GAMMA_A2(C) a(2, C, j) minus i_times(a(3, C, j)) +# define GAMMA_B1(C) b(3, C, i) plus i_times(b2_ ## C) +# define GAMMA_B2(C) b(4, C, i) plus i_times(b1_ ## C) +#endif + +#ifdef DIR_Y +# define GAMMA_A1(C) a(1, C, j) minus a(4, C, j) +# define GAMMA_A2(C) a(2, C, j) plus a(3, C, j) +# define GAMMA_B1(C) b(3, C, i) plus b2_ ## C +# define GAMMA_B2(C) b(4, C, i) minus b1_ ## C +#endif + +#ifdef DIR_Z +# define GAMMA_A1(C) a(1, C, j) minus i_times(a(3, C, j)) +# define GAMMA_A2(C) a(2, C, j) plus i_times(a(4, C, j)) +# define GAMMA_B1(C) b(3, C, i) plus i_times(b1_ ## C) +# define GAMMA_B2(C) b(4, C, i) minus i_times(b2_ ## C) +#endif + +#ifdef FORWARD +# define U(A, B) u(A, B, i) +# define minus MINUS +# define plus PLUS +#else +# define U(A, B) conjg(u(B, A, j)) +# define minus PLUS +# define plus MINUS +#endif + +#ifdef DAGGER +# define PLUS - +# define MINUS + +#else +# define PLUS + +# define MINUS - +#endif + +!------------------------------------------------------------------------------- +subroutine NAME(b, a, u, nn, volh) + + implicit none + + COMPLEX, dimension (NDIRAC, NCOL, *), intent(inout) :: b + COMPLEX, dimension (NDIRAC, NCOL, *), intent(in) :: a + COMPLEX, dimension (NCOL, NCOL, *), intent(in) :: u + INTEGER, dimension (*), intent(in) :: nn + integer :: volh + + integer :: i, j + + COMPLEX :: a1, a2 + COMPLEX :: b1_1, b2_1 + COMPLEX :: b1_2, b2_2 + COMPLEX :: b1_3, b2_3 + + ! 
statement function: + + COMPLEX :: i_times, c + i_times(c) = cmplx(-aimag(c), real(c)) + + TIMING_START(STRCAT(timing_bin_, NAME)) + + !$omp parallel do private(j, a1, a2, b1_1, b2_1, b1_2, b2_2, b1_3, b2_3) + do i = 1, volh + j = nn(i) + + a1 = GAMMA_A1(1) + a2 = GAMMA_A2(1) + + b1_1 = a1 * U(1, 1) + b2_1 = a2 * U(1, 1) + b1_2 = a1 * U(2, 1) + b2_2 = a2 * U(2, 1) + b1_3 = a1 * U(3, 1) + b2_3 = a2 * U(3, 1) + + a1 = GAMMA_A1(2) + a2 = GAMMA_A2(2) + + b1_1 = b1_1 + a1 * U(1, 2) + b2_1 = b2_1 + a2 * U(1, 2) + b1_2 = b1_2 + a1 * U(2, 2) + b2_2 = b2_2 + a2 * U(2, 2) + b1_3 = b1_3 + a1 * U(3, 2) + b2_3 = b2_3 + a2 * U(3, 2) + + a1 = GAMMA_A1(3) + a2 = GAMMA_A2(3) + + b1_1 = b1_1 + a1 * U(1, 3) + b2_1 = b2_1 + a2 * U(1, 3) + + b(1, 1, i) = b(1, 1, i) + b1_1 + b(2, 1, i) = b(2, 1, i) + b2_1 + b(3, 1, i) = GAMMA_B1(1) + b(4, 1, i) = GAMMA_B2(1) + + b1_2 = b1_2 + a1 * U(2, 3) + b2_2 = b2_2 + a2 * U(2, 3) + + b(1, 2, i) = b(1, 2, i) + b1_2 + b(2, 2, i) = b(2, 2, i) + b2_2 + b(3, 2, i) = GAMMA_B1(2) + b(4, 2, i) = GAMMA_B2(2) + + b1_3 = b1_3 + a1 * U(3, 3) + b2_3 = b2_3 + a2 * U(3, 3) + + b(1, 3, i) = b(1, 3, i) + b1_3 + b(2, 3, i) = b(2, 3, i) + b2_3 + b(3, 3, i) = GAMMA_B1(3) + b(4, 3, i) = GAMMA_B2(3) + + enddo + + TIMING_STOP(STRCAT(timing_bin_, NAME)) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..64a19ab5e5d6b7c1cd34db7eab15e53845cb71be --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/d/Makefile @@ -0,0 +1,335 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +# 
+#------------------------------------------------------------------------------- +# +# d/Makefile +# +#=============================================================================== + +include ../Makefile.defs + +fpp = $(FPP) -I.. $(FPPFLAGS) + +MODULES_DIR = ../modules + +.SUFFIXES: +.SUFFIXES: .a .o .f90 .F90 + +.f90.o: + $(F90) -c $(FFLAGS) -I$(MODULES_DIR) $< + +OBJS_D = \ + d.o \ + d_t.o \ + d_zb.o \ + d_zf.o \ + d_yb.o \ + d_yf.o \ + d_xb.o \ + d_xf.o \ + d_dag.o \ + d_dag_t.o \ + d_dag_zb.o \ + d_dag_zf.o \ + d_dag_yb.o \ + d_dag_yf.o \ + d_dag_xb.o \ + d_dag_xf.o \ + d_version.o + +OBJS_D2 = \ + d2.o \ + d2_t.o \ + d2_zf.o \ + d2_yf.o \ + d2_xf.o \ + d2_dag.o \ + d2_dag_t.o \ + d2_dag_zf.o \ + d2_dag_yf.o \ + d2_dag_xf.o \ + d2_version.o + +OBJS_D21 = \ + d21.o \ + d21_t.o \ + d21_zf.o \ + d21_yf.o \ + d21_xf.o \ + d21_dag.o \ + d21_dag_t.o \ + d21_dag_zf.o \ + d21_dag_yf.o \ + d21_dag_xf.o \ + d21_version.o + +OBJS_D3 = \ + d3.o \ + d3_t.o \ + d3_zf.o \ + d3_yf.o \ + d3_xf.o \ + d3_dag.o \ + d3_dag_t.o \ + d3_dag_zf.o \ + d3_dag_yf.o \ + d3_dag_xf.o \ + d3_version.o + +OBJS_D31 = \ + d31.o \ + d31_switch.o \ + d31_switch_0.o \ + d31_t.o \ + d31_t_0.o \ + d31_zf.o \ + d31_zf_0.o \ + d31_yf.o \ + d31_yf_0.o \ + d31_xf.o \ + d31_xf_0.o \ + d31_dag.o \ + d31_dag_switch.o \ + d31_dag_switch_0.o \ + d31_dag_t.o \ + d31_dag_t_0.o \ + d31_dag_zf.o \ + d31_dag_zf_0.o \ + d31_dag_yf.o \ + d31_dag_yf_0.o \ + d31_dag_xf.o \ + d31_dag_xf_0.o \ + d31_version.o + +OBJS_DSF = \ + dsf_xf.o \ + dsf_xb.o \ + dsf_yf.o \ + dsf_yb.o \ + dsf_zf.o \ + dsf_zb.o \ + dsf_tf.o \ + dsf_tb.o + + +#------------------------------------------------------------------------------- +$(LIBD): + +libd.a: $(OBJS_D) $(OBJS_DSF) + $(AR) $(ARFLAGS) $@ $(OBJS_D) $(OBJS_DSF) + $(RANLIB) $@ + +libd2.a: $(OBJS_D2) $(OBJS_DSF) + $(AR) $(ARFLAGS) $@ $(OBJS_D2) $(OBJS_DSF) + $(RANLIB) $@ + +libd21.a: $(OBJS_D21) $(OBJS_DSF) + $(AR) $(ARFLAGS) $@ $(OBJS_D21) $(OBJS_DSF) + $(RANLIB) $@ + +libd3.a: 
$(OBJS_D3) $(OBJS_DSF) + $(AR) $(ARFLAGS) $@ $(OBJS_D3) $(OBJS_DSF) + $(RANLIB) $@ + +libd31.a: $(OBJS_D31) $(OBJS_DSF) + $(AR) $(ARFLAGS) $@ $(OBJS_D31) $(OBJS_DSF) + $(RANLIB) $@ + +fast: + $(FAST_MAKE) + +#------------------------------------------------------------------------------- +d.f90: D.F90 $(DEPENDENCIES_D) + $(fpp) -DNAME=d D.F90 > $@ + +d_dag.f90: D.F90 $(DEPENDENCIES_D) + $(fpp) -DNAME=d_dag D.F90 > $@ + +d_xf.f90: Dxyz.F90 + $(fpp) -DNAME=d_xf -DDIR_X -DFORWARD -UDAGGER Dxyz.F90 > $@ + +d_yf.f90: Dxyz.F90 + $(fpp) -DNAME=d_yf -DDIR_Y -DFORWARD -UDAGGER Dxyz.F90 > $@ + +d_zf.f90: Dxyz.F90 + $(fpp) -DNAME=d_zf -DDIR_Z -DFORWARD -UDAGGER Dxyz.F90 > $@ + +d_dag_xf.f90: Dxyz.F90 + $(fpp) -DNAME=d_dag_xf -DDIR_X -DFORWARD -DDAGGER Dxyz.F90 > $@ + +d_dag_yf.f90: Dxyz.F90 + $(fpp) -DNAME=d_dag_yf -DDIR_Y -DFORWARD -DDAGGER Dxyz.F90 > $@ + +d_dag_zf.f90: Dxyz.F90 + $(fpp) -DNAME=d_dag_zf -DDIR_Z -DFORWARD -DDAGGER Dxyz.F90 > $@ + +d_xb.f90: Dxyz.F90 + $(fpp) -DNAME=d_xb -DDIR_X -UFORWARD -UDAGGER Dxyz.F90 > $@ + +d_yb.f90: Dxyz.F90 + $(fpp) -DNAME=d_yb -DDIR_Y -UFORWARD -UDAGGER Dxyz.F90 > $@ + +d_zb.f90: Dxyz.F90 + $(fpp) -DNAME=d_zb -DDIR_Z -UFORWARD -UDAGGER Dxyz.F90 > $@ + +d_dag_xb.f90: Dxyz.F90 + $(fpp) -DNAME=d_dag_xb -DDIR_X -UFORWARD -DDAGGER Dxyz.F90 > $@ + +d_dag_yb.f90: Dxyz.F90 + $(fpp) -DNAME=d_dag_yb -DDIR_Y -UFORWARD -DDAGGER Dxyz.F90 > $@ + +d_dag_zb.f90: Dxyz.F90 + $(fpp) -DNAME=d_dag_zb -DDIR_Z -UFORWARD -DDAGGER Dxyz.F90 > $@ + +d_t.f90: Dt.F90 + $(fpp) -DNAME=d_t -UDAGGER Dt.F90 > $@ + +d_dag_t.f90: Dt.F90 + $(fpp) -DNAME=d_dag_t -DDAGGER Dt.F90 > $@ + +d_version.f90: DVersion.F90 + $(fpp) -DVERSION=1 DVersion.F90 > $@ + + +#------------------------------------------------------------------------------- +d2.f90: D2.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d D2.F90 > $@ + +d2_dag.f90: D2.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d_dag D2.F90 > $@ + +d2_xf.f90: D2xyzt.F90; $(fpp) -DNAME=d_xf -DDIR_X -UDAGGER D2xyzt.F90 > $@ + +d2_yf.f90: 
D2xyzt.F90; $(fpp) -DNAME=d_yf -DDIR_Y -UDAGGER D2xyzt.F90 > $@ + +d2_zf.f90: D2xyzt.F90; $(fpp) -DNAME=d_zf -DDIR_Z -UDAGGER D2xyzt.F90 > $@ + +d2_dag_xf.f90: D2xyzt.F90; $(fpp) -DNAME=d_dag_xf -DDIR_X -DDAGGER D2xyzt.F90 > $@ + +d2_dag_yf.f90: D2xyzt.F90; $(fpp) -DNAME=d_dag_yf -DDIR_Y -DDAGGER D2xyzt.F90 > $@ + +d2_dag_zf.f90: D2xyzt.F90; $(fpp) -DNAME=d_dag_zf -DDIR_Z -DDAGGER D2xyzt.F90 > $@ + +d2_t.f90: D2xyzt.F90; $(fpp) -DNAME=d_t -DDIR_T -UDAGGER D2xyzt.F90 > $@ + +d2_dag_t.f90: D2xyzt.F90; $(fpp) -DNAME=d_dag_t -DDIR_T -DDAGGER D2xyzt.F90 > $@ + +d2_version.f90: DVersion.F90; $(fpp) -DVERSION=2 DVersion.F90 > $@ + + +#------------------------------------------------------------------------------- +d21.f90: D21.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d -UDAGGER D21.F90 > $@ + +d21_dag.f90: D21.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d_dag -DDAGGER D21.F90 > $@ + +d21_xf.f90: D21xyzt.F90; $(fpp) -DNAME=d_xf -DDIR_X -UDAGGER D21xyzt.F90 > $@ + +d21_yf.f90: D21xyzt.F90; $(fpp) -DNAME=d_yf -DDIR_Y -UDAGGER D21xyzt.F90 > $@ + +d21_zf.f90: D21xyzt.F90; $(fpp) -DNAME=d_zf -DDIR_Z -UDAGGER D21xyzt.F90 > $@ + +d21_dag_xf.f90: D21xyzt.F90; $(fpp) -DNAME=d_dag_xf -DDIR_X -DDAGGER D21xyzt.F90 > $@ + +d21_dag_yf.f90: D21xyzt.F90; $(fpp) -DNAME=d_dag_yf -DDIR_Y -DDAGGER D21xyzt.F90 > $@ + +d21_dag_zf.f90: D21xyzt.F90; $(fpp) -DNAME=d_dag_zf -DDIR_Z -DDAGGER D21xyzt.F90 > $@ + +d21_t.f90: D21xyzt.F90; $(fpp) -DNAME=d_t -DDIR_T -UDAGGER D21xyzt.F90 > $@ + +d21_dag_t.f90: D21xyzt.F90; $(fpp) -DNAME=d_dag_t -DDIR_T -DDAGGER D21xyzt.F90 > $@ + +d21_version.f90: DVersion.F90; $(fpp) -DVERSION=21 DVersion.F90 > $@ + + +#------------------------------------------------------------------------------- +d3.f90: D3.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d D3.F90 > $@ + +d3_dag.f90: D3.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d_dag D3.F90 > $@ + +d3_xf.f90: D3xyzt.F90; $(fpp) -DNAME=d_xf -DDIR_X -UDAGGER D3xyzt.F90 > $@ + +d3_yf.f90: D3xyzt.F90; $(fpp) -DNAME=d_yf -DDIR_Y -UDAGGER D3xyzt.F90 > 
$@ + +d3_zf.f90: D3xyzt.F90; $(fpp) -DNAME=d_zf -DDIR_Z -UDAGGER D3xyzt.F90 > $@ + +d3_dag_xf.f90: D3xyzt.F90; $(fpp) -DNAME=d_dag_xf -DDIR_X -DDAGGER D3xyzt.F90 > $@ + +d3_dag_yf.f90: D3xyzt.F90; $(fpp) -DNAME=d_dag_yf -DDIR_Y -DDAGGER D3xyzt.F90 > $@ + +d3_dag_zf.f90: D3xyzt.F90; $(fpp) -DNAME=d_dag_zf -DDIR_Z -DDAGGER D3xyzt.F90 > $@ + +d3_t.f90: D3xyzt.F90; $(fpp) -DNAME=d_t -DDIR_T -UDAGGER D3xyzt.F90 > $@ + +d3_dag_t.f90: D3xyzt.F90; $(fpp) -DNAME=d_dag_t -DDIR_T -DDAGGER D3xyzt.F90 > $@ + +d3_version.f90: DVersion.F90; $(fpp) -DVERSION=3 DVersion.F90 > $@ + + +#------------------------------------------------------------------------------- +d31.f90: D31.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d D31.F90 > $@ + +d31_dag.f90: D31.F90 $(DEPENDENCIES_D); $(fpp) -DNAME=d_dag D31.F90 > $@ + +d31_xf.f90: D31xyzt.F90; $(fpp) -DNAME=d_xf -DDIR_X -UDAGGER -UINIT D31xyzt.F90 > $@ +d31_xf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_xf -DDIR_X -UDAGGER -DINIT D31xyzt.F90 > $@ + +d31_yf.f90: D31xyzt.F90; $(fpp) -DNAME=d_yf -DDIR_Y -UDAGGER -UINIT D31xyzt.F90 > $@ +d31_yf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_yf -DDIR_Y -UDAGGER -DINIT D31xyzt.F90 > $@ + +d31_zf.f90: D31xyzt.F90; $(fpp) -DNAME=d_zf -DDIR_Z -UDAGGER -UINIT D31xyzt.F90 > $@ +d31_zf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_zf -DDIR_Z -UDAGGER -DINIT D31xyzt.F90 > $@ + +d31_dag_xf.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_xf -DDIR_X -DDAGGER -UINIT D31xyzt.F90 > $@ +d31_dag_xf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_xf -DDIR_X -DDAGGER -DINIT D31xyzt.F90 > $@ + +d31_dag_yf.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_yf -DDIR_Y -DDAGGER -UINIT D31xyzt.F90 > $@ +d31_dag_yf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_yf -DDIR_Y -DDAGGER -DINIT D31xyzt.F90 > $@ + +d31_dag_zf.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_zf -DDIR_Z -DDAGGER -UINIT D31xyzt.F90 > $@ +d31_dag_zf_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_zf -DDIR_Z -DDAGGER -DINIT D31xyzt.F90 > $@ + +d31_t.f90: D31xyzt.F90; $(fpp) -DNAME=d_t -DDIR_T -UDAGGER -UINIT D31xyzt.F90 > $@ 
+d31_t_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_t -DDIR_T -UDAGGER -DINIT D31xyzt.F90 > $@ + +d31_dag_t.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_t -DDIR_T -DDAGGER -UINIT D31xyzt.F90 > $@ +d31_dag_t_0.f90: D31xyzt.F90; $(fpp) -DNAME=d_dag_t -DDIR_T -DDAGGER -DINIT D31xyzt.F90 > $@ + +d31_version.f90: DVersion.F90; $(fpp) -DVERSION=31 DVersion.F90 > $@ + +d31_switch.f90: D31_switch.F90; $(fpp) -DNAME=d -UINIT D31_switch.F90 > $@ +d31_switch_0.f90: D31_switch.F90; $(fpp) -DNAME=d -DINIT D31_switch.F90 > $@ + +d31_dag_switch.f90: D31_switch.F90; $(fpp) -DNAME=d_dag -UINIT D31_switch.F90 > $@ +d31_dag_switch_0.f90: D31_switch.F90; $(fpp) -DNAME=d_dag -DINIT D31_switch.F90 > $@ + + +#------------------------------------------------------------------------------- +dsf.f90: dsf.F90 $(DEPENDENCIES_DSF); $(fpp) dsf.F90 > $@ + +dsf_xf.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_xf -DDIR_X -DFORWARD DSFxyzt.F90 > $@ + +dsf_yf.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_yf -DDIR_Y -DFORWARD DSFxyzt.F90 > $@ + +dsf_zf.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_zf -DDIR_Z -DFORWARD DSFxyzt.F90 > $@ + +dsf_tf.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_tf -DDIR_T -DFORWARD DSFxyzt.F90 > $@ + +dsf_xb.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_xb -DDIR_X -UFORWARD DSFxyzt.F90 > $@ + +dsf_yb.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_yb -DDIR_Y -UFORWARD DSFxyzt.F90 > $@ + +dsf_zb.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_zb -DDIR_Z -UFORWARD DSFxyzt.F90 > $@ + +dsf_tb.f90: DSFxyzt.F90; $(fpp) -DNAME=dsf_tb -DDIR_T -UFORWARD DSFxyzt.F90 > $@ + + +#------------------------------------------------------------------------------- +clean: + rm -f *.[Tiod] *.f90 *.mod work.pc work.pcl + +clobber: clean + rm -f libd.a libd2.a libd21.a libd3.a libd31.a diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/d/bqcd.pcl b/qcd/part_cpu/applications/QCD/src/kernel_A/d/bqcd.pcl new file mode 100644 index 0000000000000000000000000000000000000000..906244500b31700684482c3dcfd32f6cec4279db --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_A/d/bqcd.pcl @@ -0,0 +1,2 @@ +work.pc +../modules/work.pc diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/defs.h b/qcd/part_cpu/applications/QCD/src/kernel_A/defs.h new file mode 100644 index 0000000000000000000000000000000000000000..05654d6e3e338d85e7aa8fc50ccbf893b7e1d222 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/defs.h @@ -0,0 +1,213 @@ +#ifndef BQCD_DEFS_H +#define BQCD_DEFS_H + +# define MAX_TEMPER 50 + +# define RKIND 8 + +# define BQCD_REAL mpi_real8 + +# define BQCD_CHECK_SUM mpi_integer8 +# define BQCD_SEED mpi_integer8 + +# define CHECK_SUM integer(8) +# define SEED integer(8) +# define SECONDS real(8) +# define COMM_METHOD character(40) + +#ifdef INTEL +# define RECL_UNIT 4 +#else +# define RECL_UNIT 1 +#endif + +# define DIM 4 +# define NCOL 3 +# define NDIRAC 4 +# define NGEN 8 +# define EVEN 0 +# define ODD 1 +# define FWD 0 +# define BWD 1 + +# define SIZE_COMPLEX 2 + +# define REAL real(RKIND) +# define INTEGER integer(4) +# define COMPLEX complex(RKIND) + +# define SU3 COMPLEX, dimension (NCOL, NCOL) +# define GENERATOR REAL, dimension (NGEN) + +# define GAUGE_FIELD_IO COMPLEX, dimension(NCOL, NCOL-1, DIM, 0:NX-1, 0:NY-1, 0:NZ-1, 0:NT-1) +# define SPINCOL_FIELD_IO COMPLEX, dimension(NDIRAC, NCOL, 0:NXH-1, 0:NY-1, 0:NZ-1, 0:NT-1) + +# define SU3_FIELD COMPLEX, dimension (NCOL, NCOL, volh_tot) +# define GAUGE_FIELD COMPLEX, dimension (NCOL, NCOL, volh_tot, EVEN:ODD, DIM) +# define GENERATOR_FIELD REAL, dimension (NGEN, volh_tot, EVEN:ODD, DIM) +# define SPINCOL_FIELD COMPLEX, dimension (NDIRAC, NCOL, volh_tot) +# define SC2_FIELD COMPLEX, dimension(2, NCOL, volh_tot, DIM, FWD:BWD) +# define CLOVER_FIELD_A type(type_clover_a), dimension(2, volh, EVEN:ODD) +# define CLOVER_FIELD_B type(type_clover_b), dimension(2, volh, EVEN:ODD) +# define CLOVER_FIELD_C COMPLEX, dimension(NDIRAC, NCOL, NDIRAC, NCOL, volh) + +# define P_GAUGE_FIELD COMPLEX, dimension(:, :, :, :, :), pointer +# 
define P_GAUGE_FIELD_IO COMPLEX, dimension(:, :, :, :, :, :, :), pointer +# define P_GENERATOR_FIELD REAL, dimension(:, :, :, :), pointer +# define P_SPINCOL_FIELD COMPLEX, dimension(:, :, :), pointer +# define P_SPINCOL_FIELD_IO COMPLEX, dimension(:, :, :, :, :, :), pointer +# define P_SC2_FIELD COMPLEX, dimension(:, :, :, :, :), pointer +# define P_CLOVER_FIELD_A type(type_clover_a), dimension(:, :, :), pointer +# define P_CLOVER_FIELD_B type(type_clover_b), dimension(:, :, :), pointer + +# define SPINCOL_OVERINDEXED REAL, dimension(SIZE_COMPLEX*NDIRAC*NCOL*volh_tot) +# define P_SPINCOL_OVERINDEXED REAL, dimension(:), pointer + +# define FILENAME character(len=80) +# define FILENAME_FORMAT character(len=80) + +# define Re(z) real(z) +# define Im(z) aimag(z) + +# define CAT(A, B) A ## B +# define STRCAT(A, B) CAT(A, B) +# define STRCAT3(A, B, C) STRCAT(STRCAT(A, B), C) + +# define PI STRCAT(3.1415926535897931_, RKIND) +# define TWOPI STRCAT(6.2831853071795862_, RKIND) +# define SQRT3 STRCAT(1.7320508075688772_, RKIND) + +# define ZERO STRCAT(0.0_, RKIND) +# define ONE STRCAT(1.0_, RKIND) +# define TWO STRCAT(2.0_, RKIND) +# define THREE STRCAT(3.0_, RKIND) +# define FOUR STRCAT(4.0_, RKIND) +# define SIX STRCAT(6.0_, RKIND) +# define EIGHT STRCAT(8.0_, RKIND) + +# define HALF STRCAT(0.5_, RKIND) +# define EIGHTH STRCAT(0.125_, RKIND) + +# define timing_bin_d_xf 1 +# define timing_bin_d_xb 2 +# define timing_bin_d_yf 3 +# define timing_bin_d_yb 4 +# define timing_bin_d_zf 5 +# define timing_bin_d_zb 6 +# define timing_bin_d_t 7 +# define timing_bin_d 8 +# define timing_bin_mtdagmt 9 +# define timing_bin_global_sum 10 +# define timing_bin_global_sum_vec 11 +# define timing_bin_sc_zero 12 +# define timing_bin_sc_copy 13 +# define timing_bin_sc_scale 14 +# define timing_bin_sc_norm2 15 +# define timing_bin_sc_dot 16 +# define timing_bin_sc_axpy 17 +# define timing_bin_sc_xpby 18 +# define timing_bin_sc_axpby 19 +# define timing_bin_sc_cdotc 20 +# define 
timing_bin_sc_caxpy 21 +# define timing_bin_sc_caxpy2 22 +# define timing_bin_sc_cax2 23 +# define timing_bin_cg 24 +# define timing_bin_hmc_init_p 25 +# define timing_bin_hmc_u 26 +# define timing_bin_dsg 27 +# define timing_bin_dsf 28 +# define timing_bin_clover_init 29 +# define timing_bin_clover_mult_a 30 +# define timing_bin_clover_mult_ao 31 +# define timing_bin_clover_mult_b 32 +# define timing_bin_clover_dsd 33 +# define timing_bin_clover_dsf 34 +# define timing_bin_hmc 35 +# define timing_bin_plaq 36 +# define timing_bin_cooling 37 +# define timing_bin_u_read 38 +# define timing_bin_u_write 39 +# define timing_bin_total 40 + +# define timing_bin_hmc_init 41 +# define timing_bin_hmc_momenta 42 +# define timing_bin_hmc_init_phi 43 +# define timing_bin_hmc_h_old 44 +# define timing_bin_hmc_backup 45 +# define timing_bin_hmc_half_step0 46 +# define timing_bin_hmc_half_step1 47 +# define timing_bin_hmc_xbound_g 48 +# define timing_bin_hmc_steps 49 +# define timing_bin_hmc_h_new 50 +# define timing_bin_hmc_rest 51 + +# define timing_bin_h_mult_a 52 +# define timing_bin_h_mult_b 53 +# define timing_bin_h_mult_c 54 + +# define timing_bin_sc2_projection 55 + +# define timing_bin_d_dag_xf timing_bin_d_xf +# define timing_bin_d_dag_xb timing_bin_d_xb +# define timing_bin_d_dag_yf timing_bin_d_yf +# define timing_bin_d_dag_yb timing_bin_d_yb +# define timing_bin_d_dag_zf timing_bin_d_zf +# define timing_bin_d_dag_zb timing_bin_d_zb +# define timing_bin_d_dag_t timing_bin_d_t +# define timing_bin_d_dag timing_bin_d + +#ifdef TIMING + +# define TIMING_START(bin) call timing_start(bin) +# define TIMING_STOP(bin) call timing_stop(bin) +# define TIMING_WRITE(unit) call timing_write(unit) + +#else + +# define TIMING_START(bin) +# define TIMING_STOP(bin) +# define TIMING_WRITE(unit) + +#endif + + +# define STDERR 0 +# define UINPUT 1 +# define UCONF 2 +# define URAN 3 +# define UCOUNT 4 +# define UREC 6 +# define UINFO 7 +# define ULIST 8 +# define UDIAG 99 + +# define 
START_HOT 0 +# define START_COLD 1 +# define START_CONT 2 +# define START_FILE 3 + +# define SWAP_DOWN -1 +# define SWAP_RANDOM 0 +# define SWAP_UP 1 + +# define HMC_TEST_FORWARDS 1 +# define HMC_TEST_NONE 0 +# define HMC_TEST_BACKWARDS -1 + +# define PUTSTR(unit, str) if (my_pe() == 0) write(unit,*) str +# define PUTVAL(unit, val) if (my_pe() == 0) write(unit,*) #val, ": ", val + +# define DIAGSTR(str) write(UDIAG,*) str +# define DIAGVAL(val) write(UDIAG,*) #val, ": ", val + +# define ALLOCATE_G_FIELD(x) if (.not. associated(x)) call allocate_g_field(x) +# define ALLOCATE_G_FIELD_IO(x) if (.not. associated(x)) call allocate_g_field_io(x) +# define ALLOCATE_GEN_FIELD(x) if (.not. associated(x)) call allocate_gen_field(x) +# define ALLOCATE_SC_FIELD(x) if (.not. associated(x)) call allocate_sc_field(x) +# define ALLOCATE_SC_FIELD_IO(x) if (.not. associated(x)) call allocate_sc_field_io(x) +# define ALLOCATE_SC_OVERINDEXED(x) if (.not. associated(x)) call allocate_sc_overindexed(x) +# define ALLOCATE_SC2_FIELD(x) if (.not. associated(x)) call allocate_sc2_field(x) + +# define ASSERT(condition) if (.not. (condition)) call assertion_failed(__FILE__, __LINE__, #condition) + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/dsd.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/dsd.F90 new file mode 100644 index 0000000000000000000000000000000000000000..0a5b9069a3cc78918cbbc1e6307cf0a25f94731c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/dsd.F90 @@ -0,0 +1,40 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! dsd.F90 --- p(j,x,mu) := p(j,x,mu) - step * D_{x,mu,j} S_det +! 
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! dsd: momentum update p := p - step * D S_det (formula from the file header).
+!
+!   p     generator field, updated in place
+!   conf  HMC configuration; conf%b and conf%u are handed to clover_dsd()
+!   step  integration step size
+!   para  HMC parameters; only para%csw_kappa is read here
+!
+! Nothing is done when the prefactor s is zero.
+!-------------------------------------------------------------------------------
+subroutine dsd(p, conf, step, para)
+
+  use typedef_hmc
+  use module_hmc_forces
+  use module_vol
+  implicit none
+
+  GENERATOR_FIELD, intent(inout) :: p
+  type(hmc_conf),  intent(in)    :: conf
+  REAL,            intent(in)    :: step
+  type(hmc_para),  intent(in)    :: para
+  REAL                           :: s
+
+  ! prefactor passed on to clover_dsd()
+  s = -step * TWO * (para%csw_kappa / EIGHT)
+
+  ! guard clause: no force contribution, nothing to record
+  if (s == ZERO) return
+
+  ! presumably hmc_forces_old/new bracket the update so that the force of
+  ! this action term (i_sd) can be measured -- confirm in hmc_forces.F90
+  call hmc_forces_old(p)
+  call clover_dsd(ODD, p, conf%b, s, conf%u)
+  call hmc_forces_new(p, step, i_sd)
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/dsf.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf.F90
new file mode 100644
index 0000000000000000000000000000000000000000..f06ff31df11deaa345d7c5a8895e3eb1fa9f8378
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf.F90
@@ -0,0 +1,101 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! dsf.F90 - kernel of: p(j,x,mu) := p(j,x,mu) - step * D_{x,mu,j} S_{f 1|2}
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! dsf: shared kernel of the fermionic momentum updates (called by dsf1/dsf2).
+!
+!   p     generator field, updated in place
+!   conf  HMC configuration (conf%u gauge field, conf%b clover field)
+!   step  integration step size
+!   para  HMC parameters (kappa, csw_kappa, h are read)
+!   a, b  spin-colour fields prepared by the caller
+!
+! The caller must already have applied flip_bc() to the gauge field; this
+! routine calls flip_bc(u) once more after the hopping-term forces.
+! at/bt are saved work fields that are allocated on the first call only.
+!-------------------------------------------------------------------------------
+subroutine dsf(p, conf, step, para, a, b)
+
+  use typedef_hmc
+  use module_nn
+  use module_p_interface
+  use module_vol
+  implicit none
+
+  type(hmc_para), intent(in) :: para
+  type(hmc_conf), intent(in) :: conf
+  GENERATOR_FIELD, intent(inout) :: p
+  REAL, intent(in) :: step
+  SPINCOL_FIELD, intent(in) :: a
+  SPINCOL_FIELD, intent(in) :: b
+
+  P_GAUGE_FIELD :: u
+  P_SPINCOL_FIELD, save :: at, bt
+  REAL :: s, s1, s2
+
+
+  !! call flip_bc(u) <- done in calling routine
+
+  ALLOCATE_SC_FIELD(at)
+  ALLOCATE_SC_FIELD(bt)
+
+  u => conf%u
+
+  ! build the auxiliary fields A~ and B~ (only needed for kappa /= 0)
+  if (para%kappa /= ZERO) then
+     call d(ODD, EVEN, at, a, u) ! A~ = Doe A
+     call d_dag(ODD, EVEN, bt, b, u) ! B~ = Deo+ B
+
+     if (para%csw_kappa /= ZERO) then
+        call clover_mult_b(conf%b(1,1,ODD), at, volh) ! A~ = inv(Too) A~
+        call clover_mult_b(conf%b(1,1,ODD), bt, volh) ! B~ = inv(Too) B~
+     endif
+
+     if (para%h /= ZERO) then
+        call h_mult_b(-para%h, at, volh) ! A~ ~ inv(H) A~
+        call h_mult_b( para%h, bt, volh) ! B~ ~ inv(H+) B~
+     endif
+
+     ! presumably a boundary (halo) exchange of all four fields -- confirm
+     call xbound_sc_field(a)
+     call xbound_sc_field(b)
+     call xbound_sc_field(at)
+     call xbound_sc_field(bt)
+  endif
+
+  TIMING_START(timing_bin_dsf)
+
+  ! the 1/(1 + h**2) factor belongs to the "~ inv(H)" steps above, which
+  ! multiply by (1 -/+ i h gamma_5) only
+  s = -step * TWO * para%kappa**2 / (ONE + para%h**2)
+
+  ! hopping-term forces: per direction a forward and a backward routine,
+  ! each called for the EVEN and the ODD sublattice of p
+  if (s /= ZERO) then
+     call dsf_xf(p(1,1,EVEN,1), b, at, s, u(1,1,1,EVEN,1), nn(1,EVEN,1,FWD), VOLH)
+     call dsf_xf(p(1,1,ODD ,1), bt, a, s, u(1,1,1,ODD ,1), nn(1,ODD ,1,FWD), VOLH)
+     call dsf_xb(p(1,1,EVEN,1), bt, a, s, u(1,1,1,EVEN,1), nn(1,EVEN,1,FWD), VOLH)
+     call dsf_xb(p(1,1,ODD ,1), b, at, s, u(1,1,1,ODD ,1), nn(1,ODD ,1,FWD), VOLH)
+
+     call dsf_yf(p(1,1,EVEN,2), b, at, s, u(1,1,1,EVEN,2), nn(1,EVEN,2,FWD), VOLH)
+     call dsf_yf(p(1,1,ODD ,2), bt, a, s, u(1,1,1,ODD ,2), nn(1,ODD ,2,FWD), VOLH)
+     call dsf_yb(p(1,1,EVEN,2), bt, a, s, u(1,1,1,EVEN,2), nn(1,EVEN,2,FWD), VOLH)
+     call dsf_yb(p(1,1,ODD ,2), b, at, s, u(1,1,1,ODD ,2), nn(1,ODD ,2,FWD), VOLH)
+
+     call dsf_zf(p(1,1,EVEN,3), b, at, s, u(1,1,1,EVEN,3), nn(1,EVEN,3,FWD), VOLH)
+     call dsf_zf(p(1,1,ODD ,3), bt, a, s, u(1,1,1,ODD ,3), nn(1,ODD ,3,FWD), VOLH)
+     call dsf_zb(p(1,1,EVEN,3), bt, a, s, u(1,1,1,EVEN,3), nn(1,EVEN,3,FWD), VOLH)
+     call dsf_zb(p(1,1,ODD ,3), b, at, s, u(1,1,1,ODD ,3), nn(1,ODD ,3,FWD), VOLH)
+
+     call dsf_tf(p(1,1,EVEN,4), b, at, s, u(1,1,1,EVEN,4), nn(1,EVEN,4,FWD), VOLH)
+     call dsf_tf(p(1,1,ODD ,4), bt, a, s, u(1,1,1,ODD ,4), nn(1,ODD ,4,FWD), VOLH)
+     call dsf_tb(p(1,1,EVEN,4), bt, a, s, u(1,1,1,EVEN,4), nn(1,EVEN,4,FWD), VOLH)
+     call dsf_tb(p(1,1,ODD ,4), b, at, s, u(1,1,1,ODD ,4), nn(1,ODD ,4,FWD), VOLH)
+  endif
+
+  TIMING_STOP(timing_bin_dsf)
+
+  ! second flip of the pair started by the caller; the clover terms below
+  ! therefore see the gauge field after this flip
+  call flip_bc(u)
+
+  s1 = -step * TWO * (para%csw_kappa / EIGHT)
+  s2 = -step * TWO * (para%csw_kappa / EIGHT) * para%kappa**2
+
+  ! clover contributions: EVEN part from (b, a), ODD part from (bt, at)
+  if (s1 /= ZERO) call clover_dsf(EVEN, p, b, a, s1, u)
+  if (s2 /= ZERO) call clover_dsf(ODD, p, bt, at, s2, u)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/dsf1.F90
b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf1.F90
new file mode 100644
index 0000000000000000000000000000000000000000..9796d065359348494c2d70385a5c3f5caa63ae69
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf1.F90
@@ -0,0 +1,66 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! dsf1.F90 -- p(j,x,mu) := p(j,x,mu) - step * D_{x,mu,j} S_{f1}
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! dsf1: momentum update from the fermion action S_f1.
+!
+!   p        generator field, updated in place (through dsf)
+!   conf     HMC configuration; conf%phi is the source of the CG solve
+!   step     integration step size
+!   calc_sf  if /= 0, sf is computed; also forwarded to mre_put()
+!   sf       out: dotprod(b, b) if calc_sf /= 0, otherwise ZERO
+!   para     HMC parameters
+!
+! Returns immediately (sf = ZERO) when switches%quenched is set.
+! a, b and the MRE solution store are saved between calls.
+!-------------------------------------------------------------------------------
+subroutine dsf1(p, conf, step, calc_sf, sf, para)
+
+  use typedef_hmc
+  use module_hmc_forces
+  use module_mre
+  use module_p_interface
+  use module_switches
+  use module_vol
+  implicit none
+
+  GENERATOR_FIELD, intent(inout) :: p
+  type(hmc_conf), intent(in) :: conf
+  type(hmc_para), intent(in) :: para
+  integer, intent(in) :: calc_sf
+  REAL, intent(in) :: step
+  REAL, intent(out) :: sf
+
+  type(type_mre), save :: solutions
+  P_SPINCOL_FIELD, save :: a, b
+  REAL, external :: dotprod
+  integer :: iterations
+  external :: w_mult
+  external :: w_dagger_w
+
+  sf = ZERO
+
+  if (switches%quenched) return
+
+  ALLOCATE_SC_FIELD(a)
+  ALLOCATE_SC_FIELD(b)
+
+  ! first flip of the boundary conditions; the matching flip is in dsf()
+  call flip_bc(conf%u)
+
+  ! presumably mre_get supplies a starting guess for the solver -- confirm
+  call mre_get(solutions, w_mult, a, conf%phi, para, conf)
+  call cg(w_dagger_w, a, conf%phi, para, conf, iterations) ! A = inv(W+ W~) Phi
+  call mre_put(solutions, a, calc_sf) ! calc_sf <=> reset
+  call w_mult(b, a, para, conf) ! B = W~ A
+
+  if (calc_sf /= 0) sf = dotprod(b, b, SIZE_SC_FIELD)
+
+  call hmc_forces_old(p)
+  call dsf(p, conf, step, para, a, b)
+  call hmc_forces_new(p, step, i_sf1)
+
+  !! call flip_bc(conf%u) <- done in dsf()
+
+  call iteration_count_f1(iterations)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/dsf2.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf2.F90
new file mode 100644
index 0000000000000000000000000000000000000000..9cf66fe4195ef2b0b213821c96f32bdfa7fbe211
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/dsf2.F90
@@ -0,0 +1,67 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! dsf2.F90 -- p(j,x,mu) := p(j,x,mu) - step * D_{x,mu,j} S_{f2}
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! dsf2: momentum update from the second fermion action S_f2.
+!
+! Same argument list as dsf1; the source field is conf%phi2.
+! Returns immediately (sf = ZERO) unless switches%hasenbusch is set.
+! sf is taken from B = M~ A before phi2 is subtracted from B.
+!-------------------------------------------------------------------------------
+subroutine dsf2(p, conf, step, calc_sf, sf, para)
+
+  use typedef_hmc
+  use module_hmc_forces
+  use module_mre
+  use module_p_interface
+  use module_switches
+  use module_vol
+  implicit none
+
+  GENERATOR_FIELD, intent(inout) :: p
+  type(hmc_conf), intent(in) :: conf
+  type(hmc_para), intent(in) :: para
+  integer, intent(in) :: calc_sf
+  REAL, intent(in) :: step
+  REAL, intent(out) :: sf
+
+  type(type_mre), save :: solutions
+  P_SPINCOL_FIELD, save :: a, b
+  REAL, external :: dotprod
+  integer :: iterations
+  external :: mtil
+  external :: mtdagmt
+
+  sf = ZERO
+
+  if (.not. switches%hasenbusch) return
+
+  ALLOCATE_SC_FIELD(a)
+  ALLOCATE_SC_FIELD(b)
+
+  ! first flip of the boundary conditions; the matching flip is in dsf()
+  call flip_bc(conf%u)
+
+  call w_mult_dag(b, conf%phi2, para, conf) ! B = W+ phi2
+  call mre_get(solutions, mtil, a, b, para, conf)
+  call cg(mtdagmt, a, b, para, conf, iterations) ! A = inv(M~+ M~) W+ phi2
+  call mre_put(solutions, a, calc_sf) ! calc_sf <=> reset
+  call mtil(b, a, para, conf) ! B = M~ A
+  if (calc_sf /= 0) sf = dotprod(b, b, SIZE_SC_FIELD)
+  call sc_axpy(b, conf%phi2, -ONE) ! B = B - phi2
+
+  call hmc_forces_old(p)
+  call dsf(p, conf, step, para, a, b)
+  call hmc_forces_new(p, step, i_sf2)
+
+  !! call flip_bc(conf%u) <- done in dsf()
+
+  call iteration_count_f2(iterations)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/dsg.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/dsg.F90
new file mode 100644
index 0000000000000000000000000000000000000000..d15f70a21492382124813aa1e00ec6bea32549a5
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/dsg.F90
@@ -0,0 +1,54 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! dsg.F90 - p(j,x,mu) := p(j,x,mu) - step * D_{x,mu,j} S_g
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! dsg: momentum update from the gauge action S_g.
+!
+!   p     generator field, updated in place
+!   u     gauge field (read only)
+!   step  integration step size
+!   beta  gauge coupling; for beta == ZERO the routine returns at once
+!
+! The early return happens before TIMING_START, so the timing bin stays
+! balanced.
+!-------------------------------------------------------------------------------
+subroutine dsg(p, u, step, beta)
+
+  use module_hmc_forces
+  use module_vol
+  implicit none
+
+  GENERATOR_FIELD, intent(inout) :: p
+  GAUGE_FIELD, intent(in) :: u
+  REAL, intent(in) :: step, beta
+  REAL :: s
+  SU3 :: uuu, w
+  integer :: mu, eo, i
+
+  if (beta == ZERO) return
+
+  TIMING_START(timing_bin_dsg)
+
+  call hmc_forces_old(p)
+
+  s = -step * beta / THREE
+
+  ! loop over directions and sublattices; the site loop is the parallel one,
+  ! uuu and w are per-thread scratch matrices
+  do mu = 1, DIM
+     do eo = EVEN, ODD
+        !$omp parallel do private(uuu, w)
+        do i = 1, volh
+           ! presumably: uuu = staple sum, w = U * uuu, and im_tr_j adds the
+           ! s-weighted generator components of w to p -- confirm in helpers
+           call staple(uuu, u, i, eo, mu)
+           call uu(w, u(1, 1, i, eo, mu), uuu)
+           call im_tr_j(p(1, i, eo, mu), w, s)
+        enddo
+     enddo
+  enddo
+
+  call hmc_forces_new(p, step, i_sg)
+
+  TIMING_STOP(timing_bin_dsg)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/files.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/files.F90
new file mode 100644
index 0000000000000000000000000000000000000000..55da217cf65bd496a9389e6916728ca5e3a5c4ab
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/files.F90
@@ -0,0 +1,325 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! files.F90
+!
+!-------------------------------------------------------------------------------
+!
+! restart_file: progname.run.{res|count|ran|stop}
+! restart_conf_file: progname.run.s.time.{u|phi}
+! info_file: progname.run.s.info
+!
+! conf_info_file: progname.run.s1.s2.traj.info
+! conf_file: progname.run.s1.s2.traj.time.u
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! module_files: building blocks of the run-time FORMAT strings that are used
+! to write file names.  A complete format is the concatenation
+!
+!    fmt_name // fmt_sep // <number format> [// fmt_sep // ...] // fmt_sep // fmt_ext
+!
+! e.g. "(a" // ",'.'," // "i3.3" // ",'.'," // "a)".
+! The l_* constants are the exact lengths of the corresponding pieces, so
+! that concatenations contain no padding blanks.
+!-------------------------------------------------------------------------------
+module module_files ! formats of file name strings
+
+  implicit none
+
+  ! lengths of formats of file name strings
+
+  integer, parameter :: l_name = 2
+  integer, parameter :: l_ext = 2
+  integer, parameter :: l_sep = 5
+  integer, parameter :: l_num = 4
+
+  integer, parameter :: l_base = l_name + l_sep + l_num
+  integer, parameter :: l_conf = l_base + 3 * (l_sep + l_num)
+
+  ! formats of file name strings
+
+  character(len = l_name), save :: fmt_name = "(a"
+  ! fmt_ext uses its own length constant l_ext (it was declared with l_name,
+  ! which only worked because both constants happen to be 2)
+  character(len = l_ext), save :: fmt_ext = "a)"
+  character(len = l_sep), save :: fmt_sep = ",'.',"
+  character(len = l_num), save :: fmt_run = "i3.3"
+  character(len = l_num), save :: fmt_ensemble = "i1.1"
+  character(len = l_num), save :: fmt_traj = "i5.5"
+  character(len = l_num), save :: fmt_time = "i3.3"
+
+  ! file name extensions
+
+  character(len = *), parameter :: ext_count = "count"
+  character(len = *), parameter :: ext_ran = "ran"
+  character(len = *), parameter :: ext_res = "res"
+  character(len = *), parameter :: ext_u = "u"
+  character(len = *), parameter :: ext_phi = "phi"
+  character(len = *), parameter :: ext_info = "info"
+  character(len = *), parameter :: ext_stop = "STOP"
+
+CONTAINS
+
+  ! format prefix "progname.run" (unterminated: no closing parenthesis yet)
+  character(len=l_base) function fmt_base()
+    fmt_base = fmt_name // fmt_sep // fmt_run
+  end function fmt_base
+
+  ! format prefix "progname.run.s1.s2.traj" (unterminated)
+  character(len=l_conf) function fmt_conf()
+    fmt_conf = fmt_base() // fmt_sep // fmt_ensemble &
+                          // fmt_sep // fmt_ensemble &
+                          // fmt_sep // fmt_traj
+  end function fmt_conf
+
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function count_file()
+
+  use module_files
+  implicit none
+  FILENAME :: restart_file
+
+  count_file = restart_file(ext_count)
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function ran_file()
+
+  use module_files
+
implicit none
+  FILENAME :: restart_file
+
+  ran_file = restart_file(ext_ran)
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function res_file()
+
+  use module_files
+  implicit none
+  FILENAME :: restart_file
+
+  res_file = restart_file(ext_res)
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function stop_file()
+
+  use module_files
+  implicit none
+  FILENAME :: restart_file
+
+  stop_file = restart_file(ext_stop)
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function u_file(i_ensemble, time)
+
+  use module_files
+  implicit none
+  integer, intent(in) :: i_ensemble, time
+  FILENAME :: restart_conf_file
+
+  u_file = restart_conf_file(i_ensemble, time, ext_u)
+end
+
+!-------------------------------------------------------------------------------
+FILENAME function phi_file(i_ensemble, time)
+
+  use module_files
+  implicit none
+  integer, intent(in) :: i_ensemble, time
+  FILENAME :: restart_conf_file
+
+  phi_file = restart_conf_file(i_ensemble, time, ext_phi)
+end
+
+
+!-------------------------------------------------------------------------------
+! restart_file: "progname.run.ext", built with an internal write using the
+! format pieces of module_files; run is taken from counter%run.
+!-------------------------------------------------------------------------------
+FILENAME function restart_file(ext)
+
+  use module_bqcd
+  use module_counter
+  use module_files
+  implicit none
+
+  character(len = *), intent(in) :: ext
+  FILENAME_FORMAT :: fmt
+
+  fmt = fmt_base() // fmt_sep // fmt_ext
+
+  write(restart_file, fmt) prog_name, counter%run, ext
+
+end
+
+!-------------------------------------------------------------------------------
+! restart_conf_file: "progname.run.s.time.ext"
+!-------------------------------------------------------------------------------
+FILENAME function restart_conf_file(i_ensemble, time, ext)
+
+  use module_bqcd
+  use module_counter
+  use module_files
+  implicit none
+
+  integer, intent(in) :: i_ensemble, time
+  character(len = *), intent(in) :: ext
+  FILENAME_FORMAT :: fmt
+
+  fmt = fmt_base() // fmt_sep // fmt_ensemble &
+       // fmt_sep // fmt_time &
+       // fmt_sep // fmt_ext
+
+  write(restart_conf_file, fmt) prog_name, counter%run, i_ensemble, time, ext
+
+end
+
+!-------------------------------------------------------------------------------
+! info_file: "progname.run.s.info"
+!-------------------------------------------------------------------------------
+FILENAME function info_file(i_ensemble)
+
+  use module_bqcd
+  use module_counter
+  use module_files
+  implicit none
+
+  integer, intent(in) :: i_ensemble
+  FILENAME_FORMAT :: fmt
+
+  fmt = fmt_base() // fmt_sep // fmt_ensemble // fmt_sep // fmt_ext
+
+  write(info_file, fmt) prog_name, counter%run, i_ensemble, ext_info
+
+end
+
+!-------------------------------------------------------------------------------
+! conf_info_file: "progname.run.s1.s2.traj.info" (traj from counter%traj)
+!-------------------------------------------------------------------------------
+FILENAME function conf_info_file(i_ensemble1, i_ensemble2)
+
+  use module_bqcd
+  use module_counter
+  use module_files
+  implicit none
+
+  integer, intent(in) :: i_ensemble1, i_ensemble2
+  FILENAME_FORMAT :: fmt
+
+  fmt = fmt_conf() // fmt_sep // fmt_ext
+
+  write(conf_info_file, fmt) &
+       prog_name, counter%run, i_ensemble1, i_ensemble2, counter%traj, ext_info
+
+end
+
+!-------------------------------------------------------------------------------
+! conf_file: "progname.run.s1.s2.traj.time.u"
+!-------------------------------------------------------------------------------
+FILENAME function conf_file(i_ensemble1, i_ensemble2, time)
+
+  use module_bqcd
+  use module_counter
+  use module_files
+  implicit none
+
+  integer, intent(in) :: i_ensemble1, i_ensemble2, time
+  FILENAME_FORMAT :: fmt
+
+  fmt = fmt_conf() // fmt_sep // fmt_time // fmt_sep // fmt_ext
+
+  write(conf_file, fmt) &
+       prog_name, counter%run, i_ensemble1, i_ensemble2, counter%traj, time, ext_u
+
+end
+
+!-------------------------------------------------------------------------------
+! check_fmt: abort (via die) if any of the given maximal counter values does
+! not fit into the field width of its file name format.
+!-------------------------------------------------------------------------------
+subroutine check_fmt(run, max_temper, max_traj, max_time)
+
+  use module_files
+  implicit none
+  integer :: run, max_temper, max_traj, max_time
+
+  call check_len(run, fmt_run, "RUN")
+  call check_len(max_temper, fmt_ensemble, "TEMPER")
+  call check_len(max_traj, fmt_traj, "TRAJ")
+  call check_len(max_time, fmt_time, "TIME")
+
+CONTAINS
+
+  ! check one counter against one "iW.M" format string
+  subroutine check_len(counter, counter_fmt, counter_name)
+
+    implicit none
+    integer :: i, len, counter
+    character(len = *) :: counter_fmt, counter_name
+
+    i = index(counter_fmt, "i")
+
+    if (i == 0) then
+       call die("check_fmt(): unable to check fmt for " // counter_name)
+    endif
+
+    ! only the single character after "i" is read, i.e. widths are
+    ! limited to one digit
+    read(counter_fmt(i+1:i+1), *) len
+
+    if (counter < 0 .or. counter >= 10**len) then
+       call die("check_fmt(): file name format unsuitable for " // counter_name)
+    endif
+
+  end subroutine check_len
+
+end
+
+!-------------------------------------------------------------------------------
+! set_fmt_ensemble: choose a one- or two-digit ensemble field depending on
+! N_temper; N_temper >= 100 is rejected.
+!-------------------------------------------------------------------------------
+subroutine set_fmt_ensemble(N_temper)
+
+  use module_files
+  implicit none
+
+  integer, intent(in) :: N_temper
+
+  if (N_temper < 10) then
+     fmt_ensemble = "i1.1"
+  else if (N_temper < 100) then
+     fmt_ensemble = "i2.2"
+  else
+     call die ("set_fmt_ensemble(): N_temper >= 100 ???")
+  endif
+
+end
+
+
+!-------------------------------------------------------------------------------
+! format_ensemble: accessor returning the current fmt_ensemble string
+!-------------------------------------------------------------------------------
+function format_ensemble()
+
+  use module_files
+  implicit none
+  character(len = l_num) :: format_ensemble
+
+  format_ensemble = fmt_ensemble
+end
+
+!-------------------------------------------------------------------------------
+! filename_test: print every file name once for a one-digit (i = 1) and once
+! for a two-digit (i = 11) ensemble format.
+!-------------------------------------------------------------------------------
+subroutine filename_test()
+
+  use module_function_decl
+  implicit none
+  integer i
+
+  FILENAME, external :: count_file
+  FILENAME, external :: ran_file
+  FILENAME, external :: res_file
+  FILENAME, external :: stop_file
+  FILENAME, external :: u_file
+  FILENAME, external :: phi_file
+  FILENAME, external :: restart_conf_file
+  FILENAME, external :: info_file
+  FILENAME, external :: conf_info_file
+  FILENAME, external :: conf_file
+
+  do i = 1, 11, 10
+     call set_fmt_ensemble(i)
+
+     PUTVAL(6, count_file())
+     PUTVAL(6, ran_file())
+     PUTVAL(6, res_file())
+     PUTVAL(6, stop_file())
+     PUTVAL(6, u_file(3, 4))
+     PUTVAL(6, phi_file(5, 6))
+     PUTVAL(6, restart_conf_file(1, 2, 'conf'))
+     PUTVAL(6, info_file(7))
+     PUTVAL(6, conf_info_file(7, 8))
+     PUTVAL(6, conf_file(3, 4, 5))
+  enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/fixLouhiPP.sh
b/qcd/part_cpu/applications/QCD/src/kernel_A/fixLouhiPP.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b6e1537ba25946b650663060d03a61e95895e258
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/fixLouhiPP.sh
@@ -0,0 +1,8 @@
+# Rewrite one source file in place: apply the sed expression $1 to file $2
+# through the scratch file tmp.dat (same three steps as before, run
+# unconditionally one after the other).
+fix_pp() {
+    sed "$1" "$2" > tmp.dat
+    cp tmp.dat "$2"
+    rm -f tmp.dat
+}
+
+fix_pp 's/% ##/%/g' modules/module_input.F90
+
+fix_pp 's/## )/ )/g' clover/clover_init.F90
+
+fix_pp 's/## (/ (/g' d/D21xyzt.F90
+fix_pp 's/## (/ (/g' d/D2xyzt.F90
+fix_pp 's/## (/ (/g' d/D31xyzt.F90
+fix_pp 's/## (/ (/g' d/D3xyzt.F90
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/flip_bc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/flip_bc.F90
new file mode 100644
index 0000000000000000000000000000000000000000..f8cb9fbdb940fccfaa155b2712969f83fa5b5b51
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/flip_bc.F90
@@ -0,0 +1,112 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! flip_bc.F90 - flip fermionic boundary conditions
+! (ie multiplication of corresponding links with -1)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! module_flip_bc: site lists built once by init_flip_bc() and used by flip_bc()
+!
+!   flip_bc_list(k, eo, mu)  k-th site index of parity eo whose links are
+!                            flipped for direction mu
+!   flip_bc_len(mu)          number of valid entries per parity for mu
+!-------------------------------------------------------------------------------
+module module_flip_bc
+
+  INTEGER, dimension(:, :, :), pointer, save :: flip_bc_list
+  integer, dimension(DIM), save :: flip_bc_len
+
+end
+
+!-------------------------------------------------------------------------------
+! init_flip_bc: fill flip_bc_list/flip_bc_len for this process.
+!
+! Only directions with bc_fermions(mu) < 0 get entries, and only on the first
+! and the last process of that direction.
+!-------------------------------------------------------------------------------
+subroutine init_flip_bc()
+
+  use module_flip_bc
+  use module_function_decl
+  use module_lattice
+  use module_vol
+  implicit none
+
+  integer, dimension(DIM) :: i0, i1, i_pe, j
+  integer :: me, mu, x, y, z, t, i, eo, count(EVEN:ODD)
+  integer, external :: xyzt2i, e_o
+
+  allocate(flip_bc_list(volh_tot, EVEN:ODD, DIM))
+
+  ! i_pe = coordinates of this process in the process grid NPE
+  me = my_pe()
+  call unlex(me, DIM, i_pe, NPE)
+
+  do mu = 1, DIM
+     count = 0
+     if (bc_fermions(mu) < 0) then
+        if (i_pe(mu) == (NPE(mu) - 1) .or. i_pe(mu) == 0) then
+           ! start from the full local lattice, then collapse direction mu
+           ! to a single slice
+           i0 = 0
+           i1 = N - 1
+
+           ! last process: slice N(mu)-1; otherwise (first process):
+           ! slice -1, presumably the lower boundary copy -- confirm.
+           ! With NPE(mu) == 1 the first branch is taken.
+           if (i_pe(mu) == (NPE(mu) - 1)) then
+              i0(mu) = N(mu) - 1
+           else
+              i0(mu) = -1
+           endif
+
+           i1(mu) = i0(mu)
+
+           do t = i0(4), i1(4)
+           do z = i0(3), i1(3)
+           do y = i0(2), i1(2)
+           do x = i0(1), i1(1)
+              j = (/x, y, z, t/)
+              i = xyzt2i(j)
+              eo = e_o(j)
+
+              count(eo) = count(eo) + 1
+              flip_bc_list(count(eo), eo, mu) = i
+           enddo
+           enddo
+           enddo
+           enddo
+
+        endif
+     endif
+
+     ! flip_bc() uses one common length for both parities
+     if (count(EVEN) /= count(ODD)) then
+        call die ("init_flip_bc(): count(EVEN) /= count(ODD)")
+     else
+        flip_bc_len(mu) = count(EVEN)
+     endif
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+! flip_bc: multiply the listed links of u by -1, in place.  Calling it a
+! second time negates the same links again.
+!-------------------------------------------------------------------------------
+subroutine flip_bc(u)
+
+  use module_flip_bc
+  use module_lattice
+  use module_vol
+  implicit none
+
+  GAUGE_FIELD, intent(inout) :: u
+  integer :: mu, nu, count, i, eo, c1, c2
+
+  do mu = 1, DIM
+     ! the site list is indexed by mu, the link direction by gamma_index(mu)
+     nu = gamma_index(mu)
+     do eo = EVEN,ODD
+        do count = 1, flip_bc_len(mu)
+           i = flip_bc_list(count, eo, mu)
+           do c2 = 1, NCOL
+              do c1 = 1, NCOL
+                 u(c1, c2, i, eo, nu) = -u(c1, c2, i, eo, nu)
+              enddo
+           enddo
+        enddo
+     enddo
+  enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/h_mult.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/h_mult.F90
new file mode 100644
index 0000000000000000000000000000000000000000..67af646983001c34b430fb7402e5b962216ffedf
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/h_mult.F90
@@ -0,0 +1,107 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2000-2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! h_mult.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! h_mult_a: out := out + i h gamma_5 in
+!
+!   out   field updated in place, accessed as (NDIRAC, NCOL * volh)
+!   h     real coefficient
+!   in    input field, same layout
+!   volh  number of sites to process
+!
+! gamma_5 acts here by exchanging Dirac components 1 <-> 3 and 2 <-> 4.
+!-------------------------------------------------------------------------------
+subroutine h_mult_a(out, h, in, volh) ! out := out + i h gamma_5 in
+
+  implicit none
+  COMPLEX, dimension (NDIRAC, *) :: out, in
+  REAL :: h
+  integer :: volh
+
+  integer :: i
+
+  ! statement function: i * c.
+  ! The explicit kind keeps the precision of c; cmplx() without KIND returns
+  ! default complex and would truncate if COMPLEX is a wider kind.
+
+  COMPLEX :: i_times, c
+  i_times(c) = cmplx(-aimag(c), real(c), kind = kind(c))
+
+  TIMING_START(timing_bin_h_mult_a)
+
+  !$omp parallel do
+  do i = 1, NCOL * volh
+     out(1, i) = out(1, i) + h * i_times(in(3, i))
+     out(2, i) = out(2, i) + h * i_times(in(4, i))
+     out(3, i) = out(3, i) + h * i_times(in(1, i))
+     out(4, i) = out(4, i) + h * i_times(in(2, i))
+  enddo
+
+  TIMING_STOP(timing_bin_h_mult_a)
+end
+
+!-------------------------------------------------------------------------------
+! h_mult_b: x := (1 + i h gamma_5) x, in place.
+!
+! All four components are copied to temporaries first, because every
+! component is overwritten while its partner still needs the old value.
+!-------------------------------------------------------------------------------
+subroutine h_mult_b(h, x, volh) ! x := (1 + i h gamma_5) x
+
+  implicit none
+  REAL :: h
+  COMPLEX, dimension (NDIRAC, *) :: x
+  integer :: volh
+
+  integer :: i
+  COMPLEX :: x1, x2, x3, x4
+
+  ! statement function: i * c (explicit kind, see h_mult_a)
+
+  COMPLEX :: i_times, c
+  i_times(c) = cmplx(-aimag(c), real(c), kind = kind(c))
+
+
+  TIMING_START(timing_bin_h_mult_b)
+
+  !$omp parallel do private(x1, x2, x3, x4)
+  do i = 1, NCOL * volh
+     x1 = x(1, i)
+     x2 = x(2, i)
+     x3 = x(3, i)
+     x4 = x(4, i)
+
+     x(1, i) = x(1, i) + h * i_times(x3)
+     x(2, i) = x(2, i) + h * i_times(x4)
+     x(3, i) = x(3, i) + h * i_times(x1)
+     x(4, i) = x(4, i) + h * i_times(x2)
+  enddo
+
+  TIMING_STOP(timing_bin_h_mult_b)
+end
+
+!-------------------------------------------------------------------------------
+! h_mult_c: out = (1 + i h gamma_5) in  (out is overwritten, in is only read)
+!-------------------------------------------------------------------------------
+subroutine h_mult_c(out, h, in, volh) ! out = (1 + i h gamma_5) in
+
+  implicit none
+  COMPLEX, dimension (NDIRAC, *) :: out, in
+  REAL :: h
+  integer :: volh
+
+  integer :: i
+
+  ! statement function: i * c (explicit kind, see h_mult_a)
+
+  COMPLEX :: i_times, c
+  i_times(c) = cmplx(-aimag(c), real(c), kind = kind(c))
+
+  TIMING_START(timing_bin_h_mult_c)
+
+  !$omp parallel do
+  do i = 1, NCOL * volh
+     out(1, i) = in(1, i) + h * i_times(in(3, i))
+     out(2, i) = in(2, i) + h * i_times(in(4, i))
+     out(3, i) = in(3, i) + h * i_times(in(1, i))
+     out(4, i) = in(4, i) + h * i_times(in(2, i))
+  enddo
+
+  TIMING_STOP(timing_bin_h_mult_c)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc.F90
new file mode 100644
index 0000000000000000000000000000000000000000..ddfed12989cc9eee43d773e8fce66013cbb78bae
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc.F90
@@ -0,0 +1,207 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc.F90 - one Hybrid Monte Carlo step including Hasenbusch's and Bakeyev's
+! accelerations
+!
+!// IR scale: tau +!// IR steps: ntau +!// UV scale: tau / m_scale +!// UV steps: m_scale +!// +!// models / splits of actions: +!// +!// model A: +!// S_UV = S_g +!// S_IR = S_det + S_f1 +!// +!// model B: +!// S_UV = S_g +!// S_IR = S_det + S_f1 + S_f2 +!// +!// model C: +!// S_UV = S_g + S_det + S_f1 +!// S_IR = S_f2 +!// +!// S_f1 = phi1+ inv(W+ W) phi1 +!// S_f2 = phi2+ W inv(M~+ M~) W+ phi2 +!// +!// W = M~ + rho +!// +!// => ir_steps = 1 and rho = 0 corresponds exactly to the previous versions +!// (rho = 0 is treated as S_f2 = 0), +!// and especially tau and ntau have the same meaning as before +!// +!// (In the whole program phi and phi2 are treated asymmetrically. +!// The reason for this is upward compatibility with the mode +!// "standard Wilson fermions + parallel tempering". +!// phi is needed for the tempering decisions.) +!// +!// Flags / switches: +!// +!// force_accept: Force acceptance after, eg, a hot or cold start. +!// test: For testing reversibility by forward/backward integration. +!// If (test /= HMC_TEST_NONE) force_accept has to be .true. +!// If (test == HMC_TEST_BACKWARDS) para%tau has to be reversed.
+!//
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! hmc: one Hybrid Monte Carlo trajectory with accept/reject step.
+!
+!   para          HMC parameters (para%beta is read here)
+!   conf          configuration; holds the new fields on acceptance and the
+!                 backed-up ones on rejection
+!   out           exp_dh, accepted and the CG statistics are always set;
+!                 sg and sf are set only when the trajectory is accepted
+!   force_accept  /= 0: accept regardless of exp(-dH)
+!   test          HMC_TEST_NONE / HMC_TEST_FORWARDS / HMC_TEST_BACKWARDS
+!
+! For test == HMC_TEST_BACKWARDS the backup, the refresh of p/phi and the
+! computation of the old Hamiltonian are skipped; the saved momenta p of the
+! preceding call are integrated further.  h_old then keeps its initial value
+! ZERO, so out%exp_dh is defined but carries no meaning in that mode (the
+! caller has to force acceptance anyway).
+!-------------------------------------------------------------------------------
+subroutine hmc(para, conf, out, force_accept, test)
+
+  use typedef_hmc
+  use module_function_decl
+  use module_p_interface
+  use module_switches
+  use module_vol
+  implicit none
+
+  type(hmc_para), intent(in) :: para
+  type(hmc_conf), intent(inout) :: conf
+  type(hmc_out), intent(out) :: out
+  integer, intent(in) :: force_accept
+  integer, intent(in) :: test
+
+  P_GAUGE_FIELD, save :: u_bck
+  P_SPINCOL_FIELD, save :: phi_bck
+  P_CLOVER_FIELD_A, save :: a_bck, i_bck
+  P_CLOVER_FIELD_B, save :: b_bck
+  P_GENERATOR_FIELD, save :: p
+
+  REAL :: sd_old, sd_new
+  REAL :: sf1_old, sf1_new
+  REAL :: sf2_old, sf2_new
+  REAL :: sg_old, sg_new
+  REAL :: sp_old, sp_new
+  REAL :: hg_old, hg_new
+  REAL :: h_old, h_new
+  REAL :: sf1, sf2
+  REAL, external :: sp, sg, clover_action
+
+
+  TIMING_START(timing_bin_hmc)
+
+  TIMING_START(timing_bin_hmc_init)
+
+  ! work fields are allocated on the first call only
+  if (.not. associated(u_bck)) then
+     ALLOCATE_G_FIELD(u_bck)
+     ALLOCATE_GEN_FIELD(p)
+     ALLOCATE_SC_FIELD(phi_bck)
+     if (switches%clover) call allocate_clover_field_a(a_bck)
+     if (switches%clover) call allocate_clover_field_a(i_bck)
+     if (switches%clover) call allocate_clover_field_b(b_bck)
+  endif
+
+  sd_old = ZERO; sd_new = ZERO
+  sf1_old = ZERO; sf1_new = ZERO
+  sf2_old = ZERO; sf2_new = ZERO
+  sg_old = ZERO; sg_new = ZERO
+  sp_old = ZERO; sp_new = ZERO
+  ! hg_old and h_old are not computed in the HMC_TEST_BACKWARDS path but are
+  ! read below; give them a defined value as well
+  hg_old = ZERO; hg_new = ZERO
+  h_old = ZERO; h_new = ZERO
+  sf1 = ZERO; sf2 = ZERO
+
+  call init_cg_stat()
+  TIMING_STOP(timing_bin_hmc_init)
+
+  if (test /= HMC_TEST_BACKWARDS) then ! ie normally do:
+
+     ! backups:
+
+     TIMING_START(timing_bin_hmc_backup)
+
+     ! phi is backed up by exchanging pointers (it is regenerated below);
+     ! the other fields are copied
+     call swap_p_sc_field(phi_bck, conf%phi)
+
+     u_bck = conf%u
+     if (switches%clover) a_bck = conf%a
+     if (switches%clover) i_bck = conf%i
+     if (switches%clover) b_bck = conf%b
+
+     TIMING_STOP(timing_bin_hmc_backup)
+
+     ! initialize momenta p, phi, phi2 and old action:
+
+     call hmc_init_p(p)
+     call hmc_init_phi(conf, para, sf1_old, sf2_old)
+
+     TIMING_START(timing_bin_hmc_h_old)
+     if (switches%clover) sd_old = clover_action(conf%b(1,1,ODD))
+     sg_old = sg(conf%u)
+     sp_old = sp(p)
+     hg_old = sg_old * para%beta
+
+     h_old = sd_old + sp_old + hg_old + sf1_old + sf2_old
+     TIMING_STOP(timing_bin_hmc_h_old)
+  endif
+
+  if (test == HMC_TEST_FORWARDS) then
+     call hmc_test_report(test, p, conf%u, &
+          sp_old, hg_old, sf1_old, sf2_old, sd_old)
+  endif
+
+! leap frog integration:
+
+  call hmc_leap_frog(p, para, conf, sf1, sf2)
+
+! calculate Hamiltonian:
+
+  TIMING_START(timing_bin_hmc_h_new)
+  if (switches%clover) sd_new = clover_action(conf%b(1,1,ODD))
+  sf1_new = sf1
+  sf2_new = sf2
+  sg_new = sg(conf%u)
+  sp_new = sp(p)
+  hg_new = sg_new * para%beta
+
+  h_new = sd_new + sp_new + hg_new + sf1_new + sf2_new
+  TIMING_STOP(timing_bin_hmc_h_new)
+
+! accept new U ? :
+
+  TIMING_START(timing_bin_hmc_rest)
+  out%exp_dh = exp(h_old - h_new)
+
+  if (force_accept /= 0) then
+     out%accepted = 1
+  else
+     ! Metropolis test
+     if (ranf() < out%exp_dh) then
+        out%accepted = 1
+     else
+        out%accepted = 0
+     endif
+  endif
+
+  if (out%accepted == 1) then
+     out%sg = sg_new
+     out%sf = sf1_new
+  else
+     ! rejected: restore the backups by exchanging pointers
+     call swap_p_sc_field(conf%phi, phi_bck)
+     call swap_p_g_field(conf%u, u_bck)
+     call swap_p_clover_field_a(conf%a, a_bck)
+     call swap_p_clover_field_a(conf%i, i_bck)
+     call swap_p_clover_field_b(conf%b, b_bck)
+  endif
+
+  call get_cg_stat(out%cg_ncall, out%cg_niter_max, out%cg_niter_tot)
+
+  call iteration_count_write(UREC)
+  call hmc_forces_write(UREC)
+
+  if (test == HMC_TEST_BACKWARDS) then
+     call hmc_test_report(test, p, conf%u, &
+          sp_new, hg_new, sf1_new, sf2_new, sd_new)
+  endif
+
+  TIMING_STOP(timing_bin_hmc_rest)
+
+  TIMING_STOP(timing_bin_hmc)
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_check.F90
b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_check.F90
new file mode 100644
index 0000000000000000000000000000000000000000..b0793399a0058613b5e32f9a498a75306db5bafe
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_check.F90
@@ -0,0 +1,122 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc_check.F90 - check by forward/backward leap frog integration
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! hmc_check: reversibility check of the leap frog integration.
+!
+!   check   HMC_CHECK_FORWARDS:  store p, u and the energies as reference
+!           HMC_CHECK_BACKWARDS: compare p, u and the energies with the
+!                                stored reference and write a report to UREC
+!           any other value aborts via die()
+!   p, u    generator and gauge field
+!   hp, hg, hf1, hf2, hd
+!           energy contributions (generator, gauge, fermion 1/2, det)
+!
+! The reference copies are saved between calls and allocated on first use.
+! The field maxima are local to the process; only process 0 writes.
+!-------------------------------------------------------------------------------
+subroutine hmc_check(check, p, u, hp, hg, hf1, hf2, hd)
+
+  use module_function_decl
+  use module_vol
+  implicit none
+
+  integer, intent(in) :: check
+  GENERATOR_FIELD, intent(in) :: p
+  GAUGE_FIELD, intent(in) :: u
+  REAL, intent(in) :: hp, hg, hf1, hf2, hd
+
+  P_GENERATOR_FIELD, save :: p_start
+  P_GAUGE_FIELD, save :: u_start
+  REAL, save :: hp_start
+  REAL, save :: hg_start
+  REAL, save :: hf1_start
+  REAL, save :: hf2_start
+  REAL, save :: hd_start
+
+  REAL :: diff_p, diff_u
+  integer :: i, eo, mu, j, c1, c2
+
+  if (.not. associated(p_start)) then
+     allocate(p_start(NGEN, volh_tot, EVEN:ODD, DIM))
+     allocate(u_start(NCOL, NCOL, volh_tot, EVEN:ODD, DIM))
+  endif
+
+  if (check == HMC_CHECK_FORWARDS) then
+
+     p_start = p
+     u_start = u
+     hp_start = hp
+     hg_start = hg
+     hf1_start = hf1
+     hf2_start = hf2
+     hd_start = hd
+
+  else if (check == HMC_CHECK_BACKWARDS) then
+
+     diff_p = ZERO
+     diff_u = ZERO
+
+     do mu = 1, DIM
+        do eo = EVEN, ODD
+           do i = 1, volh
+              ! NOTE(review): this is an absolute difference although the
+              ! report calls it a relative change -- confirm the intent
+              do j = 1, NGEN
+                 diff_p = max(diff_p, abs(p_start(j,i,eo,mu) - p(j,i,eo,mu)))
+              enddo
+              ! real and imaginary parts are compared separately
+              do c2 = 1, NCOL
+                 do c1 = 1, NCOL
+                    diff_u = max(diff_u, &
+                         abs(relative_change(Re(u_start(c1,c2,i,eo,mu)), &
+                                             Re(u(c1,c2,i,eo,mu)))))
+                    diff_u = max(diff_u, &
+                         abs(relative_change(Im(u_start(c1,c2,i,eo,mu)), &
+                                             Im(u(c1,c2,i,eo,mu)))))
+                 enddo
+              enddo
+           enddo
+        enddo
+     enddo
+
+     if (my_pe() == 0) then
+        call begin(UREC, "HMC-check")
+        write(UREC, *)
+        write(UREC,400) "Configuration changes (maximal abs. relative changes):"
+        write(UREC, *)
+        write(UREC,410) "Generator field:", diff_p
+        write(UREC,410) "Gauge field: ", diff_u
+        write(UREC, *)
+        write(UREC, *)
+        write(UREC,400) "Energy changes:"
+        write(UREC, *)
+        write(UREC,420) "Energy ", "old value", "rel.change"
+        write(UREC, *)
+        write(UREC,430) "H_generator", hp_start, relative_change(hp_start, hp)
+        write(UREC,430) "H_gauge ", hg_start, relative_change(hg_start, hg)
+        write(UREC,430) "H_fermion_1", hf1_start, relative_change(hf1_start, hf1)
+        write(UREC,430) "H_fermion_2", hf2_start, relative_change(hf2_start, hf2)
+        write(UREC,430) "H_det ", hd_start, relative_change(hd_start, hd)
+        write(UREC, *)
+        call end_A(UREC, "HMC-check")
+     endif
+
+400  format (1x, a)
+410  format (1x, a, e8.1)
+420  format (1x, a, a20, a12)
+430  format (1x, a, e20.10, e12.1)
+
+  else
+     call die("hmc_check(): unknown check flag.")
+  endif
+
+contains
+
+  ! relative_change: (new - old) / old, i.e. zero when nothing changed.
+  ! (The previous expression old / (new - old) was inverted and diverged for
+  ! a perfectly reversible integration.)  old == 0 is not guarded and yields
+  ! Inf or NaN.
+  REAL function relative_change(old, new)
+
+    implicit none
+    REAL, intent(in) :: old, new
+
+    relative_change = (new - old) / old
+
+  end function relative_change
+
+end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_forces.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_forces.F90 new file mode 100644 index 0000000000000000000000000000000000000000..4f7e3985ee39a367b3299c23e03474b41bbfc710 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_forces.F90 @@ -0,0 +1,155 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! hmc_forces.F90 - calculation of HMC forces in Hasenbusch improvement, +! does not work with tempering +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!----------------------------------------------------------------------------- +subroutine hmc_forces_old(p) + + use module_hmc_forces + use module_p_interface + use module_switches + use module_vol + implicit none + + GENERATOR_FIELD, intent(in) :: p + integer :: mu, eo, i + + if (.not. switches%hasenbusch) return + + if (.not. 
associated(p_old)) then + call allocate_gen_field(p_old) + f_count = ZERO + f_avg = ZERO + f_max = ZERO + endif + + + do mu = 1, DIM + do eo = EVEN, ODD + !$omp parallel do + do i = 1, volh + p_old(1, i, eo, mu) = p(1, i, eo, mu) + p_old(2, i, eo, mu) = p(2, i, eo, mu) + p_old(3, i, eo, mu) = p(3, i, eo, mu) + p_old(4, i, eo, mu) = p(4, i, eo, mu) + p_old(5, i, eo, mu) = p(5, i, eo, mu) + p_old(6, i, eo, mu) = p(6, i, eo, mu) + p_old(7, i, eo, mu) = p(7, i, eo, mu) + p_old(8, i, eo, mu) = p(8, i, eo, mu) + enddo + enddo + enddo + +end + +!----------------------------------------------------------------------------- +subroutine hmc_forces_new(p, step, which) + + use module_hmc_forces + use module_function_decl + use module_switches + use module_vol + implicit none + + GENERATOR_FIELD, intent(in) :: p + REAL, intent(in) :: step + integer, intent(in) :: which + + integer :: mu, eo, i + REAL :: force + + if (.not. switches%hasenbusch) return + + force = ZERO + do mu = 1, DIM + do eo = EVEN, ODD + !$omp parallel do + do i = 1, volh + force = force & + + (p_old(1, i, eo, mu) - p(1, i, eo, mu))**2 & + + (p_old(2, i, eo, mu) - p(2, i, eo, mu))**2 & + + (p_old(3, i, eo, mu) - p(3, i, eo, mu))**2 & + + (p_old(4, i, eo, mu) - p(4, i, eo, mu))**2 & + + (p_old(5, i, eo, mu) - p(5, i, eo, mu))**2 & + + (p_old(6, i, eo, mu) - p(6, i, eo, mu))**2 & + + (p_old(7, i, eo, mu) - p(7, i, eo, mu))**2 & + + (p_old(8, i, eo, mu) - p(8, i, eo, mu))**2 + enddo + enddo + enddo + + force = global_sum(force) / (NGEN * volume * DIM) + force = sqrt(force) / abs(step) + + f_count(which) = f_count(which) + ONE + f_avg(which) = f_avg(which) + force + f_max(which) = max(f_max(which), force) + +end + + +!----------------------------------------------------------------------------- +subroutine hmc_forces_write(unit) + + use module_hmc_forces + use module_counter + use module_function_decl + use module_switches + + implicit none + + integer, intent(in) :: unit + integer, save :: written = 0 + integer 
:: i + + character(*), parameter :: key_avg = "%Favg" + character(*), parameter :: key_max = "%Fmax" + character(*), parameter :: fmt_h = "(1x, 2a, a6, 4a)" + character(*), parameter :: fmt_b = "(1x, a6, i6, 4g20.10)" + + character(20), dimension(n_force) :: f_name + + if (.not. switches%hasenbusch) return + + f_name(i_sg) = " F_gauge" + f_name(i_sd) = " F_det" + f_name(i_sf1) = " F_F1" + f_name(i_sf2) = " F_F2" + + do i = 1, n_force + if (f_count(i) /= ZERO) then + f_avg(i) = f_avg(i) / f_count(i) + endif + enddo + + if (written == 0 .and. my_pe() == 0) then + write(unit, fmt_h) "T", key_avg, "traj", (f_name(i), i = 1, n_force) + write(unit, fmt_h) "T", key_max, "traj", (f_name(i), i = 1, n_force) + endif + + if (my_pe() == 0) then + write(unit, fmt_b) key_avg, counter%traj, (f_avg(i), i = 1, n_force) + write(unit, fmt_b) key_max, counter%traj, (f_max(i), i = 1, n_force) + endif + + + written = written + 1 + f_count = ZERO + f_avg = ZERO + f_max = ZERO + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_p.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_p.F90 new file mode 100644 index 0000000000000000000000000000000000000000..4d40dbf9d606cd8849f4b112daa0467526ae3252 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_p.F90 @@ -0,0 +1,36 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! hmc_init_p.F90 - initialization of momenta +! 
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! hmc_init_p - fill the momentum (generator) field p with Gaussian random
+! numbers by calling ran_gauss_volh() once per direction mu and per
+! even/odd sublattice.
+!-------------------------------------------------------------------------------
+subroutine hmc_init_p(p)
+
+  use module_vol
+  implicit none
+
+  GENERATOR_FIELD, intent(out) :: p
+  integer :: mu, eo
+
+  TIMING_START(timing_bin_hmc_init_p)
+
+  do mu = 1, DIM
+     do eo = EVEN, ODD
+        ! NOTE(review): count NGEN/2 presumably because ran_gauss_volh
+        ! produces complex (pairs of real) numbers - confirm
+        call ran_gauss_volh(NGEN/2, p(1,1,eo,mu), ONE, eo)
+     enddo
+  enddo
+
+  TIMING_STOP(timing_bin_hmc_init_p)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_phi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_phi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..3e42f7ff0d01f2b7ee365e2b987dcfd256e45996
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_init_phi.F90
@@ -0,0 +1,66 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc_init_phi.F90 - initialises phi, phi2 and calculates actions
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! hmc_init_phi - create the pseudo-fermion fields and return their actions.
+!
+!   conf  configuration; conf%phi (and conf%phi2 with Hasenbusch
+!         improvement) are overwritten
+!   para  HMC parameters passed on to the matrix routines
+!   sf1   action of the first pseudo-fermion field  (noise . noise)
+!   sf2   action of the second pseudo-fermion field (ZERO without
+!         switches%hasenbusch)
+!
+! Without dynamical fermions (switches%dynamical false) both actions are
+! ZERO and no field is touched.
+!-------------------------------------------------------------------------------
+subroutine hmc_init_phi(conf, para, sf1, sf2)
+
+  use typedef_hmc
+  use module_function_decl
+  use module_p_interface
+  use module_switches
+  use module_vol
+  implicit none
+
+  type(hmc_para), intent(in) :: para
+  type(hmc_conf), intent(inout) :: conf
+  REAL, intent(out) :: sf1
+  REAL, intent(out) :: sf2
+
+  P_SPINCOL_FIELD, save :: tmp
+
+  integer :: iterations
+  external :: w_dagger_w
+
+  TIMING_START(timing_bin_hmc_init_phi)
+
+  sf1 = ZERO
+  sf2 = ZERO
+
+  if (.not. switches%dynamical) then
+     ! close the timing bin on the early exit as well, so that every
+     ! TIMING_START is matched by a TIMING_STOP
+     TIMING_STOP(timing_bin_hmc_init_phi)
+     return
+  endif
+
+  ALLOCATE_SC_FIELD(tmp)
+
+  ! flip_bc() is called again at the end of the routine
+  call flip_bc(conf%u)
+
+
+  call ran_gauss_volh(NDIRAC * NCOL, tmp, HALF, EVEN) ! tmp = noise
+  sf1 = dotprod(tmp, tmp, SIZE_SC_FIELD)
+  call w_mult_dag(conf%phi, tmp, para, conf) ! phi = W+ noise
+
+  if (switches%hasenbusch) then
+     call ran_gauss_volh(NDIRAC * NCOL, tmp, HALF, EVEN) ! tmp = noise
+     sf2 = dotprod(tmp, tmp, SIZE_SC_FIELD)
+     call mtil_dag(conf%phi2, tmp, para, conf) ! phi2 = M~+ noise
+     call cg(w_dagger_w, tmp, conf%phi2, para, conf, iterations)
+     ! tmp = inv(W+ W) M~+ noise
+     call w_mult(conf%phi2, tmp, para, conf) ! phi2 = inv(W+) M~+ noise
+  endif
+
+  call flip_bc(conf%u)
+
+  TIMING_STOP(timing_bin_hmc_init_phi)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_integrator.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_integrator.F90
new file mode 100644
index 0000000000000000000000000000000000000000..609a126116e2d27014680aeecc8e8f97eec5fd7b
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_integrator.F90
@@ -0,0 +1,94 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc_integrator.F90 - integrators for the different models
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! hmc_integrator_p_ir - momentum update with the force terms that are
+! integrated on the coarse ("ir") time scale.  Which terms these are
+! depends on para%model:
+!
+!   "A": dsf1, dsd      "B": dsf1, dsf2, dsd      "C": dsf2
+!
+! sf1/sf2 are only touched by the dsf1/dsf2 calls that a model makes;
+! otherwise the caller's values pass through unchanged.  They are
+! therefore intent(inout): with intent(out) an unassigned argument would
+! become undefined, but e.g. in model "C" sf1 is computed by
+! hmc_integrator_p_uv() and has to survive a following call of this routine.
+!-------------------------------------------------------------------------------
+subroutine hmc_integrator_p_ir(p, para, conf, step, calc_sf, sf1, sf2)
+
+  use typedef_hmc
+  use module_switches
+  use module_vol
+  implicit none
+
+  GENERATOR_FIELD, intent(inout) :: p
+  type(hmc_para), intent(in) :: para
+  type(hmc_conf), intent(in) :: conf
+  REAL, intent(in) :: step
+  integer, intent(in) :: calc_sf
+  REAL, intent(inout) :: sf1, sf2
+
+  select case (para%model)
+     case ("A")
+        call dsf1(p, conf, step, calc_sf, sf1, para)
+        call dsd(p, conf, step, para)
+     case ("B")
+        call dsf1(p, conf, step, calc_sf, sf1, para)
+        call dsf2(p, conf, step, calc_sf, sf2, para)
+        call dsd(p, conf, step, para)
+     case ("C")
+        call dsf2(p, conf, step, calc_sf, sf2, para)
+     case default
+        call die("hmc_integrator_p_ir: " // para%model // ": unknown model")
+  end select
+
+end
+
+!-------------------------------------------------------------------------------
+! hmc_integrator_p_uv - momentum update with the force terms that are
+! integrated on the fine ("uv") time scale:
+!
+!   "A": dsg            "B": dsg                  "C": dsg, dsf1, dsd
+!
+! sf1/sf2 are intent(inout) for the same reason as in
+! hmc_integrator_p_ir(): they are not assigned on every path.
+!-------------------------------------------------------------------------------
+subroutine hmc_integrator_p_uv(p, para, conf, step, calc_sf, sf1, sf2)
+
+  use typedef_hmc
+  use module_switches
+  use module_vol
+  implicit none
+
+  GENERATOR_FIELD, intent(inout) :: p
+  type(hmc_para), intent(in) :: para
+  type(hmc_conf), intent(in) :: conf
+  REAL, intent(in) :: step
+  integer, intent(in) :: calc_sf
+  REAL, intent(inout) :: sf1, sf2
+
+  select case (para%model)
+     case ("A")
+        call dsg(p, conf%u, step, para%beta)
+     case ("B")
+        call dsg(p, conf%u, step, para%beta)
+     case ("C")
+        call dsg(p, conf%u, step, para%beta)
+        call dsf1(p, conf, step, calc_sf, sf1, para)
+        call dsd(p, conf, step, para)
+     case default
+        call die("hmc_integrator_p_uv: " // para%model // ": unknown model")
+  end select
+
+end
+
+!-------------------------------------------------------------------------------
+! hmc_integrator_q - update of the configuration ("coordinates") with the
+! momenta p over the time "step"; delegates to hmc_u().
+!-------------------------------------------------------------------------------
+subroutine hmc_integrator_q(p, para, conf, step)
+
+  use typedef_hmc
+  use module_switches
+  use module_vol
+  implicit none
+
+  GENERATOR_FIELD, intent(in) :: p
+  type(hmc_para), intent(in) :: para
+  type(hmc_conf), intent(inout) :: conf
+  REAL, intent(in) :: step
+
+  call hmc_u(p, conf, step, para)
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_leap_frog.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_leap_frog.F90
new file mode 100644
index 0000000000000000000000000000000000000000..246e5b31d409006998ccb2f5b3e252de02c97a5e
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_leap_frog.F90
@@ -0,0 +1,70 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc_leap_frog.F90 - two time scale leap frog integration
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! hmc_leap_frog - two time scale leap frog integration of one trajectory.
+!
+!   para%ntau     number of coarse ("ir") steps of length para%tau
+!   para%m_scale  number of fine ("uv") steps per coarse step
+!
+! The scheme starts with half steps of both momentum updates, then
+! alternates configuration updates and momentum updates, and finishes
+! with half steps.  During the final half steps calc_sf = 1 is passed
+! down, which asks the force routines to calculate the new fermionic
+! actions; they are returned in sf1 / sf2.
+!
+! sf1/sf2 are intent(inout): a model that has no second pseudo-fermion
+! term never assigns sf2, so the caller's value must stay defined.
+!-------------------------------------------------------------------------------
+subroutine hmc_leap_frog(p, para, conf, sf1, sf2)
+
+  use typedef_hmc
+  use module_switches
+  use module_vol
+  implicit none
+
+  GENERATOR_FIELD, intent(inout) :: p
+  type(hmc_para), intent(in) :: para
+  type(hmc_conf), intent(inout) :: conf
+  REAL, intent(inout) :: sf1, sf2
+
+  REAL :: step_ir, step_uv
+  integer :: calc_sf, itau, i_scale
+
+  calc_sf = 0
+
+! first half ir and uv step
+
+  step_ir = HALF * para%tau
+  step_uv = HALF * para%tau / para%m_scale
+
+  TIMING_START(timing_bin_hmc_half_step0)
+  call hmc_integrator_p_ir(p, para, conf, step_ir, calc_sf, sf1, sf2)
+  call hmc_integrator_p_uv(p, para, conf, step_uv, calc_sf, sf1, sf2)
+  TIMING_STOP(timing_bin_hmc_half_step0)
+
+! full steps from now on
+
+  step_ir = para%tau
+  step_uv = para%tau / para%m_scale
+
+  do itau = 1, para%ntau
+     do i_scale = 1, para%m_scale
+
+        call hmc_integrator_q(p, para, conf, step_uv)
+
+        if (itau == para%ntau .and. i_scale == para%m_scale) then
+           step_ir = step_ir * HALF ! final half steps
+           step_uv = step_uv * HALF
+           calc_sf = 1 ! calculate new S_f
+        endif
+
+        TIMING_START(timing_bin_hmc_steps)
+        call hmc_integrator_p_uv(p, para, conf, step_uv, calc_sf, sf1, sf2)
+        TIMING_STOP(timing_bin_hmc_steps)
+     enddo
+
+     TIMING_START(timing_bin_hmc_steps)
+     call hmc_integrator_p_ir(p, para, conf, step_ir, calc_sf, sf1, sf2)
+     TIMING_STOP(timing_bin_hmc_steps)
+  enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_test.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_test.F90
new file mode 100644
index 0000000000000000000000000000000000000000..c7233bf8d5cf916c16b903035804d0d50d6ce740
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_test.F90
@@ -0,0 +1,177 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc_test.F90 - forward/backward leap frog integration
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! hmc_test - reversibility test: one trajectory forwards, then one
+! trajectory with negated step size (para%tau) backwards.  Both
+! trajectories are force-accepted.  para%tau is restored before returning.
+! Results are written to UREC by write_out() and by hmc_test_report().
+!-------------------------------------------------------------------------------
+subroutine hmc_test(para, conf)
+
+  use typedef_hmc
+  use module_function_decl
+  use module_vol
+  implicit none
+
+  type(hmc_para), intent(inout) :: para
+  type(hmc_conf), intent(inout) :: conf
+  type(hmc_out) :: out
+
+  call begin(UREC, "HMCtest")
+
+  ! hmc() tests its force_accept flag with "/= 0", i.e. it is an integer
+  ! flag; pass the integer 1 instead of a logical constant
+  call hmc(para, conf, out, 1, HMC_TEST_FORWARDS)
+
+  call write_out("forward ")
+
+  para%tau = -para%tau
+
+  call hmc(para, conf, out, 1, HMC_TEST_BACKWARDS)
+
+  para%tau = -para%tau
+
+  call write_out("backward ")
+
+  call end_A(UREC, "HMCtest")
+
+
+contains
+
+  ! write plaquette energy, exp(-Delta_H) and CG statistics of the last
+  ! trajectory (process 0 only)
+  subroutine write_out(direction)
+
+    character(*) :: direction
+    REAL :: plaq
+
+    if (my_pe() == 0) then
+       plaq = out%sg / (6 * volume)
+       write(UREC, *)
+       write(UREC, 400) "Direction", "PlaqEnergy", &
+            "exp(-Delta_H)", "CGcalls", "CGitTot", "CGitMax"
+       write(UREC, 410) direction, plaq, out%exp_dh, &
+            out%cg_ncall, out%cg_niter_tot, out%cg_niter_max
+       write(UREC, *)
+    endif
+
+400 format (1x, a, 2a15, 3a8)
+410 format (1x, a, 2f15.10, 3i8)
+
+  end subroutine write_out
+
+end
+
+!-------------------------------------------------------------------------------
+! hmc_test_report - bookkeeping for hmc_test().
+!
+!   test == HMC_TEST_FORWARDS  : store p, u and the energy contributions
+!   test == HMC_TEST_BACKWARDS : compare with the stored values and write
+!                                the differences to UREC (process 0 only)
+!   any other value            : abort via die()
+!-------------------------------------------------------------------------------
+subroutine hmc_test_report(test, p, u, hp, hg, hf1, hf2, hd)
+
+  use module_function_decl
+  use module_vol
+  implicit none
+
+  integer, intent(in) :: test
+  GENERATOR_FIELD, intent(in) :: p
+  GAUGE_FIELD, intent(in) :: u
+  REAL, intent(in) :: hp, hg, hf1, hf2, hd
+
+  ! state kept between the FORWARDS and the BACKWARDS call
+  P_GENERATOR_FIELD, save :: p_start
+  P_GAUGE_FIELD, save :: u_start
+  REAL, save :: hp_start
+  REAL, save :: hg_start
+  REAL, save :: hf1_start
+  REAL, save :: hf2_start
+  REAL, save :: hd_start
+
+  REAL :: diff_p, diff_u
+  integer :: i, eo, mu, j, c1, c2
+
+  ! buffers are allocated once, on the first call
+  if (.not. associated(p_start)) then
+     allocate(p_start(NGEN, volh_tot, EVEN:ODD, DIM))
+     allocate(u_start(NCOL, NCOL, volh_tot, EVEN:ODD, DIM))
+  endif
+
+  if (test == HMC_TEST_FORWARDS) then
+
+     p_start = p
+     u_start = u
+     hp_start = hp
+     hg_start = hg
+     hf1_start = hf1
+     hf2_start = hf2
+     hd_start = hd
+
+  else if (test == HMC_TEST_BACKWARDS) then
+
+     diff_p = ZERO
+     diff_u = ZERO
+
+     ! diff_p: maximal absolute difference of the generator components
+     ! diff_u: maximal absolute relative change of Re/Im of the links
+     do mu = 1, DIM
+        do eo = EVEN, ODD
+           do i = 1, volh
+              do j = 1, NGEN
+                 diff_p = max(diff_p, abs(p_start(j,i,eo,mu) - p(j,i,eo,mu)))
+              enddo
+              do c2 = 1, NCOL
+                 do c1 = 1, NCOL
+                    diff_u = max(diff_u, &
+                         abs(relative_change(Re(u_start(c1,c2,i,eo,mu)), &
+                                             Re(u(c1,c2,i,eo,mu)))))
+                    diff_u = max(diff_u, &
+                         abs(relative_change(Im(u_start(c1,c2,i,eo,mu)), &
+                                             Im(u(c1,c2,i,eo,mu)))))
+                 enddo
+              enddo
+           enddo
+        enddo
+     enddo
+
+     if (my_pe() == 0) then
+        write(UREC, *)
+        write(UREC,400) "Configuration changes (maximal abs. relative changes):"
+        write(UREC, *)
+        write(UREC,410) "Generator field:", diff_p
+        write(UREC,410) "Gauge field: ", diff_u
+        write(UREC, *)
+        write(UREC, *)
+        write(UREC,400) "Energy changes:"
+        write(UREC, *)
+        write(UREC,420) "Energy ", "old value", "rel.change"
+        write(UREC, *)
+        write(UREC,430) "H_generator", hp_start, relative_change(hp_start, hp)
+        write(UREC,430) "H_gauge ", hg_start, relative_change(hg_start, hg)
+        write(UREC,430) "H_fermion_1", hf1_start, relative_change(hf1_start, hf1)
+        write(UREC,430) "H_fermion_2", hf2_start, relative_change(hf2_start, hf2)
+        write(UREC,430) "H_det ", hd_start, relative_change(hd_start, hd)
+        write(UREC, *)
+     endif
+
+400  format (1x, a)
+410  format (1x, a, e8.1)
+420  format (1x, a, a20, a12)
+430  format (1x, a, e20.10, e12.1)
+
+  else
+     call die("hmc_test_report(): illegal test flag.")
+  endif
+
+contains
+
+  ! relative change (new - old) / old; ZERO if old == ZERO
+  REAL function relative_change(old, new)
+
+    implicit none
+    REAL, intent(in) :: old, new
+
+    if (old == ZERO) then
+       relative_change = ZERO
+    else
+       relative_change = (new - old) / old
+    endif
+
+  end function relative_change
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_u.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_u.F90
new file mode 100644
index 0000000000000000000000000000000000000000..fa3ad5724f480953ad4c36b0f830178a80951b32
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/hmc_u.F90
@@ -0,0 +1,60 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! hmc_u.F90 - U := exp(i * lambda_j * P_j * step) * U
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! hmc_u - update of all gauge links conf%u with the momenta p:
+!
+!   U := exp(i * lambda_j * P_j * step) * U
+!
+! Afterwards the boundaries of the gauge field are exchanged
+! (xbound_g_field) and, if switches%clover is set, the clover fields
+! conf%a, conf%i, conf%b are re-initialised from the new links.
+!-------------------------------------------------------------------------------
+subroutine hmc_u(p, conf, step, para)
+
+  use typedef_hmc
+  use module_switches
+  use module_vol
+  implicit none
+
+  GENERATOR_FIELD, intent(in) :: p
+  type(hmc_conf), intent(inout) :: conf
+  REAL, intent(in) :: step
+  type(hmc_para), intent(in) :: para
+
+  GENERATOR :: q
+  SU3 :: v
+  integer :: i, mu, eo, j
+
+  TIMING_START(timing_bin_hmc_u)
+
+  do mu = 1, DIM
+     do eo = EVEN, ODD
+        ! q and v are per-site scratch variables, hence private
+        !$omp parallel do private(j, q, v)
+        do i = 1, VOLH
+           do j = 1, NGEN
+              q(j) = p(j, i, eo, mu) * step
+           enddo
+           call gen2u(v, q)
+           call u_update(conf%u(1, 1, i, eo, mu), v) ! u = v * u
+           call u_normalize(conf%u(1, 1, i, eo, mu))
+        enddo
+     enddo
+  enddo
+
+  TIMING_STOP(timing_bin_hmc_u)
+
+  TIMING_START(timing_bin_hmc_xbound_g)
+  call xbound_g_field(conf%u)
+  TIMING_STOP(timing_bin_hmc_xbound_g)
+
+  if (switches%clover) then
+     call clover_init(conf%a, conf%i, conf%b, conf%u, para%csw_kappa)
+  endif
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/index.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/index.F90
new file mode 100644
index 0000000000000000000000000000000000000000..57a35648262b1bd93ee1b786b7c6680e071f10aa
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/index.F90
@@ -0,0 +1,144 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! index.F90 - functions for index calculations
+!             these functions work "stand alone"
+!
+!-------------------------------------------------------------------------------
+      integer function i_e_o(dim, i) ! returns EVEN or ODD (0 or 1)
+
+!     parity of the coordinate sum i(1) + ... + i(dim)
+
+      implicit none
+      integer dim, i(dim), d
+
+      i_e_o = 0
+      do d = 1, dim
+         i_e_o = i_e_o + i(d)
+      enddo
+
+      i_e_o = mod(abs(i_e_o), 2)
+      end
+
+!-------------------------------------------------------------------------------
+      integer function i_global(i_local, N_local, i_pe)
+
+!     global coordinate of local coordinate i_local on the process with
+!     coordinate i_pe, each process holding N_local sites
+
+      implicit none
+      integer i_local, N_local, i_pe
+
+      i_global = i_pe * N_local + i_local
+
+      end
+
+!-------------------------------------------------------------------------------
+      integer function i_local(i_global, N_local, i_pe)
+
+!     inverse of i_global()
+
+      implicit none
+      integer i_global, N_local, i_pe
+
+      i_local = i_global - i_pe * N_local
+
+      end
+
+!-------------------------------------------------------------------------------
+      integer function i_periodic(i, n)
+
+!     maps i into the range 0 <= i_periodic < n (periodic continuation);
+!     the most frequent cases (i off by at most one period) are handled
+!     without a division
+
+      implicit none
+      integer i, n
+
+      if (i .ge. 0) then
+         if (i .lt. n) then
+            i_periodic = i
+         elseif (i .lt. 2 * n) then
+            i_periodic = i - n
+         else
+            i_periodic = mod(i, n)
+         endif
+      else
+         if (i .ge. -n) then
+            i_periodic = i + n
+         else
+!           integer division truncates towards zero, hence the "+ 1"s
+            i_periodic = i + (1 - (i + 1) / n) * n
+         endif
+      endif
+      end
+
+!-------------------------------------------------------------------------------
+      integer function ieo(dim, i, n)
+
+!     even/odd index: half of the lexicographic index
+
+      implicit none
+      integer dim, i(dim), n(dim), ilex
+
+      ieo = ilex(dim, i, n) / 2
+      end
+
+!-------------------------------------------------------------------------------
+      integer function ilex(dim, i, n)
+
+!     lexicographic index of coordinates i on a grid of extent n,
+!     i(1) running fastest:  ilex = i(1) + n(1) * (i(2) + n(2) * (...))
+
+      implicit none
+      integer dim, i(dim), n(dim), d
+
+      ilex = i(dim)
+      do d = dim - 1, 1, -1
+         ilex = ilex * n(d) + i(d)
+      enddo
+
+      end
+
+!-------------------------------------------------------------------------------
+      integer function n_sites(dim, direction, n, npe)
+
+!     returns number of sites of local grid and boundaries
+!     n_sites(dim, (/0, 0, ..., 0/), n, npe) is the (local) grid volume
+
+      implicit none
+      integer dim, direction(dim), n(dim), npe(dim), d
+
+      n_sites = 1
+      do d = 1, dim
+         if (direction(d) .eq. 0) then
+            n_sites = n_sites * n(d)
+         else
+            if (npe(d) .eq. 1) then ! grid not partitioned in d-direction
+               n_sites = 0
+               return
+            endif
+         endif
+      enddo
+
+      end
+
+!-------------------------------------------------------------------------------
+!!    subroutine uneo(ieo, eo, dim, i, n) ! returns i for given (ieo, eo)
+!!
+!!    implicit none
+!!    integer ieo, eo, dim, i(dim), n(dim), e_o
+!!
+!!    call unlex(2 * ieo, dim, i, n)
+!!    i(1) = i(1) + ieor(e_o(dim-1, i(2)), eo)
+!!
+!!    end
+!!
+!-------------------------------------------------------------------------------
+      subroutine unlex(ilex, dim, i, n)
+
+      ! inverse of ilex(): returns the coordinates i of lexicographic index ilex
+      ! remember the range of ilex: 0 <= ilex < (n(1) * ... * n(dim))
+
+      implicit none
+      integer ilex, dim, i(dim), n(dim), j, d
+
+      j = ilex
+      do d = 1, dim - 1
+         i(d) = mod(j, n(d))
+         j = j / n(d) ! integer division
+      enddo
+      i(dim) = j
+
+      end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/index2.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/index2.F90
new file mode 100644
index 0000000000000000000000000000000000000000..1f1c3aa3dfabd03c13c5f921514cfe17b35cb6c2
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/index2.F90
@@ -0,0 +1,55 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! index2.F90 - more functions for index calculations
+!              these functions use modules
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+! local2global - convert local coordinates on process "home" to global
+! lattice coordinates (periodically wrapped into 0 .. L(i)-1)
+!-------------------------------------------------------------------------------
+subroutine local2global(home, local, global)
+
+  use module_lattice
+  implicit none
+
+  integer, intent(in) :: home ! home process of "local"
+  integer, intent(in), dimension(DIM) :: local ! local coordinates
+  integer, intent(out),dimension(DIM) :: global ! global coordinates
+
+  integer, external :: i_global, i_periodic
+  integer :: coord_home(DIM), i
+
+
+  ! position of the home process in the process grid
+  call unlex(home, DIM, coord_home, npe)
+
+  do i = 1, DIM
+     global(i) = i_global(local(i), N(i), coord_home(i))
+     global(i) = i_periodic(global(i), L(i))
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+integer function e_o(local) !// returns EVEN or ODD (0 or 1)
+
+  ! parity of the global coordinates that belong to the local coordinates
+  ! "local" on the calling process
+
+  use module_function_decl
+  implicit none
+
+  integer, intent(in), dimension(DIM) :: local ! local coordinates
+  integer, dimension(DIM) :: global ! global coordinates
+  integer, external :: i_e_o
+
+  call local2global(my_pe(), local, global)
+
+  e_o = i_e_o(DIM, global)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/init_common.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/init_common.F90
new file mode 100644
index 0000000000000000000000000000000000000000..61a6cf938f34632a4e0c85981dc320e616ec3f0a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/init_common.F90
@@ -0,0 +1,396 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! init_common.F90 - initialize common blocks (now: mostly modules)
+!
+!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine init_common(para) + + use typedef_para + implicit none + type(type_para), intent(in) :: para + + call init_common_lattice(para%L, para%NPE, para%bc_fermions, para%gamma_index) + call init_common_vol(para%L, para%NPE) + call init_common_nnpe(para%NPE) + call init_common_offset() + call init_common_nn() + call init_common_thread() +end + +!------------------------------------------------------------------------------- +subroutine init_common_lattice(ll, pe, bc_f, gamma_i) + + use module_function_decl + use module_lattice + implicit none + + integer, dimension(DIM), intent(in) :: ll, pe, bc_f, gamma_i + + integer :: i, count(DIM) + + L = ll + NPE = pe + N = ll / pe + NH = N + + NH(1) = N(1) / 2 + + bc_fermions = bc_f + gamma_index = gamma_i + + if (mod(L(1), 2) /= 0) call die("init_common_lattice(): LX must be even") + if (mod(L(2), 2) /= 0) call die("init_common_lattice(): LY must be even") + if (mod(L(3), 2) /= 0) call die("init_common_lattice(): LZ must be even") + if (mod(L(4), 2) /= 0) call die("init_common_lattice(): LT must be even") + + if (mod(N(1), 2) /= 0) call die("init_common_lattice(): NX must be even") + + if (NPE(1) * NPE(2) * NPE(3) * NPE(4) /= num_pes()) then + call die("init_common_lattice(): N_PEs wrong") + endif + +if (N(1)*NPE(1)/=L(1)) call die("init_common_lattice(): NPEX not divider of LX") +if (N(2)*NPE(2)/=L(2)) call die("init_common_lattice(): NPEY not divider of LY") +if (N(3)*NPE(3)/=L(3)) call die("init_common_lattice(): NPEZ not divider of LZ") +if (N(4)*NPE(4)/=L(4)) call die("init_common_lattice(): NPET not divider of LT") + + count = 0 + do i = 1, DIM + if (gamma_index(i) < 1 .or. 
gamma_index(i) > DIM) then + call die("init_common_lattice(): gamma_index: out of range") + endif + count(gamma_index(i)) = count(gamma_index(i)) + 1 + enddo + + do i = 1, DIM + if (count(i) /= 1) then + call die("init_common_lattice(): gamma_index: inconsistent") + endif + enddo + + select case (version_of_d()) + case(3,4,22) + do i = 1, DIM + if (gamma_index(i) /= i) then + call die( & + "init_common_lattice(): gamma_index: not changeable for this version of D()") + endif + enddo + end select + + do i = 1, DIM + decomp_direction(gamma_index(i)) = i + enddo + +end + +!------------------------------------------------------------------------------- +subroutine init_common_nn() + + use module_function_decl + use module_lattice + use module_vol + use module_nn + implicit none + + integer :: x, y, z, t, i, eo, mu, fb, dir, tmp + integer, dimension (DIM) :: j, start, end + integer, external :: e_o, xyzt2i, i_periodic + integer, parameter :: out_of_range = 2000000000 + + allocate(nn(volh_tot, EVEN:ODD, DIM, FWD:BWD)) + + do fb = FWD, BWD + do mu = 1, DIM + do eo = EVEN, ODD + !$omp parallel do + do i = 1, volh_tot + nn(i, eo, mu, fb) = out_of_range + enddo + enddo + enddo + enddo + + do mu = 1, DIM + if (NPE(mu) == 1) then + start(mu) = 0 + end(mu) = N(mu) - 1 + else + start(mu) = -1 + end(mu) = N(mu) + endif + enddo + + do t = start(4), end(4) + do z = start(3), end(3) + do y = start(2), end(2) + do x = start(1), end(1) + + j = (/x,y,z,t/) + + i = xyzt2i(j) + eo = e_o(j) + + do fb = FWD, BWD + if (fb == FWD) then + dir = +1 + else + dir = -1 + endif + do mu = 1, DIM + j = (/x, y, z, t/) + j(mu) = j(mu) + dir + + if (NPE(mu) == 1) j(mu) = i_periodic(j(mu), L(mu)) + + if (j(mu) < -1 .or. 
j(mu) > N(mu)) then + nn(i, eo, gamma_index(mu), fb) = out_of_range + else + nn(i, eo, gamma_index(mu), fb) = xyzt2i(j) + endif + enddo + enddo + + enddo + enddo + enddo + enddo + + do fb = FWD, BWD + do mu = 1, DIM + do eo = EVEN, ODD + do i = 1, volh + tmp = nn(i, eo, mu, fb) + if (tmp < 1 .or. tmp > volh_tot) call die("init_common_nn(): error1") + if (num_pes() == 1 .and. tmp > volh) call die("init_common_nn(): error2") + enddo + enddo + enddo + enddo + + do fb = FWD, BWD + do mu = 1, DIM + do eo = EVEN, ODD + do i = 1, volh + tmp = nn(i, eo, mu, fb) + if (nn(tmp, EVEN + ODD - eo, mu, FWD + BWD - fb) /= i) & + call die("init_common_nn(): error3") + enddo + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine init_common_nnpe(NPE) + + use module_function_decl + use module_nnpe + implicit none + + integer, dimension (DIM) :: NPE, i, j, start, end + integer, external :: i_periodic, ilex + integer :: me, pe, x, y, z, t, mu + + me = my_pe() + nnpe(:, :, :, :) = me + + call unlex(me, DIM, i, NPE) + + do mu = 1, DIM + if (NPE(mu) == 1) then + start(mu) = 0 + end(mu) = 0 + else + start(mu) = -1 + end(mu) = 1 + endif + enddo + + do t = start(4), end(4) + do z = start(3), end(3) + do y = start(2), end(2) + do x = start(1), end(1) + + j(1) = i_periodic(i(1) + x, NPE(1)) + j(2) = i_periodic(i(2) + y, NPE(2)) + j(3) = i_periodic(i(3) + z, NPE(3)) + j(4) = i_periodic(i(4) + t, NPE(4)) + + pe = ilex(DIM, j, NPE) + + if (pe < 0 .or. 
pe >= num_pes()) then + call die ("init_common_nnpe(): pe out of range") + endif + ! store the process number found at relative process offset (x, y, z, t) + nnpe(x, y, z, t) = pe + + enddo + enddo + enddo + enddo + ! offset (0,0,0,0) must map back to the calling process + ASSERT(nnpe(0,0,0,0) == my_pe()) + +end + +!------------------------------------------------------------------------------- +subroutine init_common_offset() + + use module_lattice + use module_vol + use module_offset + implicit none + + integer, external :: n_sites + integer :: x, y, z, t, off, off2, mu + integer :: start(DIM), end(DIM) + + + !!ASSERT(n_sites(DIM, (/0,0,0,0/), N, NPE) == vol) + !!ASSERT(n_sites(DIM, (/0,0,0,0/), NH, NPE) == volh) + + do mu = 1, DIM + if (NPE(mu) == 1) then + start(mu) = 0 + end(mu) = 0 + else + start(mu) = -1 + end(mu) = 1 + endif + enddo + + off = n_sites(DIM,(/0,0,0,0/), NH, NPE) ! volh + + do t = start(4), end(4) + do z = start(3), end(3) + do y = start(2), end(2) + do x = start(1), end(1) + + if (x == 0 .and. y == 0 .and. z == 0 .and. t == 0) then + offset(x,y,z,t) = 0 + else + offset(x,y,z,t) = off + off = off + n_sites(DIM, (/x,y,z,t/), NH, NPE) + endif + + enddo + enddo + enddo + enddo + + off2 = 1 + do mu = 1, DIM + if (NPE(mu) == 1) then + off2 = off2 * NH(mu) + else + off2 = off2 * (NH(mu) + 2) + endif + enddo + + ASSERT(off == off2) + ASSERT(off <= volh_tot) + +end + +!------------------------------------------------------------------------------- +subroutine init_common_vol(L, NPE) + + use module_vol + implicit none + + integer, dimension(DIM), intent(in) :: L, NPE + integer, dimension(DIM) :: N + + N = L / NPE + + volume = L(1) * L(2) * L(3) * L(4) + vol = N(1) * N(2) * N(3) * N(4) + volh = vol / 2 + volh_tot = (N(1)/2 + 2) * (N(2) + 2) * (N(3) + 2) * (N(4) + 2) + + size_sc_field = SIZE_COMPLEX * NDIRAC * NCOL * volh + +end + +!------------------------------------------------------------------------------- +subroutine init_common_thread() + + use module_function_decl + use module_thread + use module_vol + implicit none + + integer :: i, size + +#ifdef _OPENMP + integer :: 
omp_get_max_threads + n_thread = omp_get_max_threads() +#else + n_thread = 1 +#endif + + if (version_of_d() /= 21 .and. & + version_of_d() /= 22 .and. & + version_of_d() > 2) then + if (n_thread < 2) then + call die("init_common_thread(): " // & + "need at least 2 threads for compiled version of D()") + endif + endif + + allocate(xyz_start(0:n_thread-1)) + allocate(xyz_end(0:n_thread-1)) + allocate(t_start(0:n_thread-1)) + allocate(t_end(0:n_thread-1)) + + if (n_thread == 1) then + xyz_start(0) = 1 + xyz_end(0) = volh + t_start(0) = 1 + t_end(0) = volh + else + xyz_start(0) = 0 + xyz_end(0) = -1 + call init_common_thread_split(n_thread - 1, volh, xyz_start(1), xyz_end(1)) + call init_common_thread_split(n_thread, volh, t_start, t_end) + endif +end + +!------------------------------------------------------------------------------- +subroutine init_common_thread_split(n, size, start, end) + + implicit none + integer, intent(in) :: n, size + integer, intent(out) :: start(n), end(n) + integer :: chunk, rest, i + + chunk = size / n + rest = size - (chunk * n) + + start(1) = 1 + end(1) = chunk + if (rest >= 1) end(1) = end(1) + 1 + + do i = 2, n + start(i) = end(i - 1) + 1 + end(i) = end(i - 1) + chunk + if (rest >= i) end(i) = end(i) + 1 + enddo + + ASSERT(end(n) == size) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/init_modules.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/init_modules.F90 new file mode 100644 index 0000000000000000000000000000000000000000..ba4de3f936678abd83f1217655a1b81ed40c2f1e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/init_modules.F90 @@ -0,0 +1,119 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003-2006, Hinnerk Stueben, Zuse-Institut Berlin +! 
+!------------------------------------------------------------------------------- +! +! init_modules.F90 - initialise (some) modules +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine init_modules() + + call init_module_decomp() + call init_module_lattice_io() + call init_module_sc_size() +end + +!------------------------------------------------------------------------------- +subroutine init_module_decomp() + + use module_decomp + use module_function_decl + use module_lattice + use module_vol + implicit none + + integer :: i, j, me, i_pe(DIM) + integer :: x, y, z, t, eo + integer :: x_std(DIM), i_std + integer :: x_act(DIM), i_act + integer :: me_act, me_std + + integer, external :: e_o + integer, external :: ieo + + + allocate(decomp%std%i(volh, EVEN:ODD)) + allocate(decomp%act%i(volh, EVEN:ODD)) + + me = my_pe() + call unlex(me, DIM, i_pe, NPE) + + decomp%act%L = L + decomp%act%NPE = NPE + decomp%act%N = N + decomp%act%NH = NH + decomp%act%i_pe = i_pe + decomp%act%bc_fermions = bc_fermions + decomp%gamma_index = gamma_index + decomp%direction = decomp_direction + + do j = 1, DIM + i = gamma_index(j) + + decomp%std%L(i) = L(j) + decomp%std%NPE(i) = NPE(j) + decomp%std%N(i) = N(j) + decomp%std%i_pe(i) = i_pe(j) + decomp%std%bc_fermions(i) = bc_fermions(j) + enddo + + decomp%std%NH(1) = decomp%std%N(1) / 2 + decomp%std%NH(2) = decomp%std%N(2) + decomp%std%NH(3) = decomp%std%N(3) + decomp%std%NH(4) = decomp%std%N(4) + + + decomp%std%i = 0 + decomp%act%i = 0 + + do t = 0, decomp%act%N(4) - 1 + do z = 0, decomp%act%N(3) - 1 + do y = 0, decomp%act%N(2) - 1 + do x = 0, decomp%act%N(1) - 1 + + x_act(1) = x + x_act(2) = y + x_act(3) = z + x_act(4) = t + + x_std(gamma_index(1)) = x_act(1) + x_std(gamma_index(2)) = x_act(2) + x_std(gamma_index(3)) = x_act(3) + x_std(gamma_index(4)) = x_act(4) + + i_std = ieo(DIM, x_std, 
decomp%std%N) + 1 + i_act = ieo(DIM, x_act, decomp%act%N) + 1 + eo = e_o(x_act) + + decomp%std%i(i_act, eo) = i_std + decomp%act%i(i_std, eo) = i_act + enddo + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine init_module_lattice_io() + + use module_decomp + use module_lattice_io + implicit none + + L = decomp%std%L + N = decomp%std%N + NH = decomp%std%NH + NPE = decomp%std%NPE + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/iteration_count.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/iteration_count.F90 new file mode 100644 index 0000000000000000000000000000000000000000..da324c4e50424ad482a3bb5384164393552af0c1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/iteration_count.F90 @@ -0,0 +1,72 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! iteration_count.F90 - counts iterations in Hasenbusch improvement, +! does not work with tempering +! +!------------------------------------------------------------------------------- +module module_iteration_count + + integer, save :: it_f1 = 0 ! iterations in dsf1() + integer, save :: it_f2 = 0 ! 
iterations in dsf2() +end + +!----------------------------------------------------------------------------- +subroutine iteration_count_f1(iter) + + use module_iteration_count + implicit none + integer :: iter + + it_f1 = it_f1 + iter +end + +!----------------------------------------------------------------------------- +subroutine iteration_count_f2(iter) + + use module_iteration_count + implicit none + integer :: iter + + it_f2 = it_f2 + iter +end + +!----------------------------------------------------------------------------- +subroutine iteration_count_write(unit) + + use module_counter + use module_function_decl + use module_iteration_count + use module_switches + + implicit none + + integer, intent(in) :: unit + integer, save :: written = 0 + + character(*), parameter :: key = "%it" + character(*), parameter :: fmt_h = "(1x, 2a, a6, 2a16)" + character(*), parameter :: fmt_b = "(1x, a4, i6, 2i16)" + + if (switches%hasenbusch) then + + if (written == 0 .and. my_pe() == 0) then + write(unit, fmt_h) "T", key, "traj", "iterations(F1)", "iterations(F2)" + endif + + if (my_pe() == 0) write(unit, fmt_b) key, counter%traj, it_f1, it_f2 + + endif + + written = written + 1 + it_f1 = 0 + it_f2 = 0 +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/m_tilde.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/m_tilde.F90 new file mode 100644 index 0000000000000000000000000000000000000000..9782c4ede8d1d0226bfa97cf24df93a1a00ee280 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/m_tilde.F90 @@ -0,0 +1,107 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! 
m_tilde.F90 - matrix multiplications involving : +! +! M~ := \tilde{M} +! M~+ := \tilde{M}^\dagger +! +! M~ = 1 - kappa^2 Deo Doe (Wilson fermions) +! M~ = Tee - kappa^2 Deo Inv(Too) Doe (Wilson fermions + clover) +! M~ = H - kappa^2 Deo Inv(H) Doe (Wilson fermions + external h) +! +! and +! +! \tilde{M}^\dagger =: M~+ = 1 - kappa^2 Doe+ Deo+ +! +! subroutine mtil: out = M~ in +! subroutine mtil_dag: out = M~+ in +! subroutine mtdagmt: out = (M~+ M~) in --> mtdagmt.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine mtil(out, in, para, conf) + + use typedef_hmc + use module_p_interface + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + + SPINCOL_FIELD, intent(out) :: out + SPINCOL_FIELD, intent(in) :: in + P_SPINCOL_FIELD, save :: tmp + REAL :: b + + + ALLOCATE_SC_FIELD(tmp) + + if (para%kappa /= ZERO) then + call d(ODD, EVEN, tmp, in, conf%u) + if (para%csw_kappa /= ZERO) call clover_mult_ao(conf%i(1,1,ODD), tmp, volh) + if (para%h /= ZERO) call h_mult_b(-para%h, tmp, volh) + call d(EVEN, ODD, out, tmp, conf%u) + endif + + b = -para%kappa**2 / (ONE + para%h**2) + + if (para%csw_kappa /= ZERO) then + call clover_mult_a(tmp, conf%a(1,1,EVEN), in, volh) + call sc_xpby(out, tmp, b) ! out = tmp - kappa**2 * out + else + call sc_xpby(out, in, b) ! 
out = in - kappa**2 * out + if (para%h /= ZERO) call h_mult_a(out, para%h, in, volh) + endif + +end + +!------------------------------------------------------------------------------- +subroutine mtil_dag(out, in, para, conf) + + use typedef_hmc + use module_p_interface + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + + SPINCOL_FIELD, intent(out) :: out + SPINCOL_FIELD, intent(in) :: in + P_SPINCOL_FIELD, save :: tmp + REAL :: b + + + ALLOCATE_SC_FIELD(tmp) + + if (para%kappa /= ZERO) then + call d_dag(ODD, EVEN, tmp, in, conf%u) + if (para%csw_kappa /= ZERO) call clover_mult_ao(conf%i(1,1,ODD), tmp, volh) + if (para%h /= ZERO) call h_mult_b(para%h, tmp, volh) + call d_dag(EVEN, ODD, out, tmp, conf%u) + endif + + b = -para%kappa**2 / (ONE + para%h**2) + + if (para%csw_kappa /= ZERO) then + call clover_mult_a(tmp, conf%a(1,1,EVEN), in, volh) + call sc_xpby(out, tmp, b) ! out = tmp - kappa**2 * out + else + call sc_xpby(out, in, b) ! out = in - kappa**2 * out + if (para%h /= ZERO) call h_mult_a(out, -para%h, in, volh) + endif + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/mc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/mc.F90 new file mode 100644 index 0000000000000000000000000000000000000000..b63a456278c9dedbb0242dcdda866ad5ab444b29 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/mc.F90 @@ -0,0 +1,299 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! mc.F90 - Monte Carlo loop (including tempering) +! 
+!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module typedef_mc_temper + + type temper_out + REAL :: delta_h + integer :: pair + integer :: swapped + end type temper_out + +end module typedef_mc_temper + +!------------------------------------------------------------------------------- +subroutine mc(para, conf) + + use typedef_mc_temper + use typedef_para + use module_counter + use module_function_decl + use module_switches + use module_vol + implicit none + + type(type_para), intent(in) :: para + type(hmc_conf), dimension(MAX_TEMPER), intent(inout) :: conf + + type(hmc_out), dimension(MAX_TEMPER) :: out + type(temper_out), dimension(MAX_TEMPER) :: tmpr + integer :: i, j + integer :: iforce, itraj + integer :: force_accept = 0 + character(len = *), parameter :: key_fa = "%fa" + character(len = *), parameter :: key_mc = "%mc" + character(len = *), parameter :: key_swap = "%sw" + REAL :: plaq + REAL, external :: sf, sg + + if (switches%hmc_test) then + if (para%n_temper /= 1) call die("mc(): HMC-test: n_ensemble has to be 1") + call hmc_test(para%hmc(1), conf(1)) + return + endif + + force_accept = 1 + iforce = 0 + do while (counter%traj < 0 .and. counter%j_traj < para%ntraj) + + iforce = iforce + 1 + counter%traj = counter%traj + 1 + counter%j_traj = counter%j_traj + 1 + + if (iforce == 1 .and. 
my_pe() == 0) then + call begin(UREC, "ForceAcceptance") + write(UREC, 405) "T", key_fa, "i_fa", "e", & + "PlaqEnergy", "exp(-Delta_H)", "CGcalls", "CGitTot", "CGitMax" + endif + + do i = 1, para%n_temper + call hmc(para%hmc(i), conf(i), out(i), force_accept, HMC_TEST_NONE) + + plaq = out(i)%sg / (SIX * volume) + + if (my_pe() == 0) then + write(UREC, 415) key_fa, counter%traj, i, plaq, out(i)%exp_dh, & + out(i)%cg_ncall, out(i)%cg_niter_tot, out(i)%cg_niter_max + endif + enddo + enddo + + if (iforce > 0) then + call end_A(UREC, "ForceAcceptance") + endif + +405 format (1x, 2a, 2a5, 2a15, 3a8) +415 format (1x, a4, 2i5, 2f15.10, 3i8) + + + force_accept = 0 + itraj = 0 + do while (counter%traj >= 0 .and. counter%traj < para%maxtraj & + .and. counter%j_traj < para%ntraj) + + itraj = itraj + 1 + counter%traj = counter%traj + 1 + counter%j_traj = counter%j_traj + 1 + + if (itraj == 1 .and. my_pe() == 0) then + call begin(UREC, "MC") + + if (para%n_temper > 1) write(UREC, 450) & + "T", key_swap, "traj", "ie", "e1", "e2", "Delta_H", "Acc" + + write(UREC, 400) "T", key_mc, "traj", "e", "f", & + "PlaqEnergy", "exp(-Delta_H)", "Acc", & + "CGcalls", "CGitTot", "CGitMax" + endif + + do i = 1, para%n_temper + if (itraj == 1) then + if (switches%tempering .and. switches%dynamical) then + out(i)%sf = sf(para%hmc(i), conf(i)) + else + out(i)%sf = ZERO + endif + out(i)%sg = sg(conf(i)%u) + endif + + call hmc(para%hmc(i), conf(i), out(i), force_accept, HMC_TEST_NONE) + enddo + + if (para%n_temper > 1 .and. 
counter%traj > para%nstd) then + call temper(para%n_temper, para%swap_seq, para%hmc, conf, out, tmpr) + do i = 1, para%n_temper - 1 + if (my_pe() == 0) write(UREC, 460) key_swap, counter%traj, i, & + tmpr(i)%pair, tmpr(i)%pair + 1, tmpr(i)%delta_h, tmpr(i)%swapped + enddo + endif + + do i = 1, para%n_temper + j = conf(i)%former + + if (my_pe() == 0) write(UREC,410) key_mc, counter%traj, i, j, & + out(i)%sg / (SIX * volume), out(i)%exp_dh, out(i)%accepted, & + out(i)%cg_ncall, out(i)%cg_niter_tot, out(i)%cg_niter_max + + if (switches%measure_traces) & + call traces(para%hmc(i), conf(i), counter%traj, i, j) + + if (switches%measure_polyakov_loop) & + call polyakov_loop(conf(i), counter%traj, i, j) + + call cooling(conf(i)%u, counter%traj, i, j) + enddo + + if (para%nsave > 0) then + if (mod(counter%traj, para%nsave) == 0) then + call conf_write(.false., para, conf) + endif + endif + + enddo + + if (itraj > 0) then + call end_A(UREC, "MC") + endif + +400 format (1x, 2a, a6, 2a3, 2a15, a4, 3a8) +410 format (1x, a4, i6, 2i3, 2f15.10, i4, 3i8) + +450 format (1x, 2a, a6, 3a3, a18, a4) +460 format (1x, a4, i6, 3i3, f18.10, i4) + +end + +!------------------------------------------------------------------------------- +subroutine temper(n_temper, swap_seq, para, conf, action, tmpr) + + use typedef_hmc + use typedef_mc_temper + use module_function_decl + use module_p_interface + implicit none + + integer, intent(in) :: n_temper, swap_seq + type(hmc_para), dimension(n_temper), intent(in) :: para + type(hmc_conf), dimension(n_temper), intent(inout) :: conf + type(hmc_out), dimension(n_temper), intent(inout) :: action + type(temper_out), dimension(n_temper), intent(out) :: tmpr + + integer, dimension(n_temper) :: pair + integer :: i_pair, n_pair, i, j + integer :: swapped + REAL, dimension(2, 2) :: hf, hg + REAL, external :: sf, sg + REAL :: h_old, h_new, delta_h + REAL :: random + + if (n_temper == 1) return + + do i = 1, n_temper + conf(i)%former = i + enddo + + n_pair = 
n_temper - 1 + + call swap_sequence(swap_seq, pair, n_pair) + + do i_pair = 1, n_pair + i = pair(i_pair) + j = pair(i_pair) + 1 + + hg(1, 1) = para(i)%beta * action(i)%sg + hg(2, 2) = para(j)%beta * action(j)%sg + + hg(1, 2) = para(i)%beta * action(j)%sg + hg(2, 1) = para(j)%beta * action(i)%sg + + hf(1, 1) = action(i)%sf + hf(2, 2) = action(j)%sf + hf(1, 2) = sf(para(i), conf(j)) + hf(2, 1) = sf(para(j), conf(i)) + + if (para(i)%kappa == ZERO) then + if (hf(1, 1) /= ZERO) call die("temper(): hf(1, 1) /= 0 ") + if (hf(2, 1) /= ZERO) call die("temper(): hf(2, 1) /= 0 ") + if (hf(1, 2) /= ZERO) call die("temper(): hf(1, 2) /= 0 ") + if (hf(2, 2) /= ZERO) call die("temper(): hf(2, 2) /= 0 ") + endif + + h_old = hg(1, 1) + hf(1, 1) + hg(2, 2) + hf(2, 2) + h_new = hg(1, 2) + hf(1, 2) + hg(2, 1) + hf(2, 1) + + delta_h = h_new - h_old + + if (ranf() < exp(-delta_h)) then + swapped = 1 + else + swapped = 0 + endif + + if (swapped /= 0) then + call swap_p_g_field(conf(i)%u, conf(j)%u) + call swap_p_sc_field(conf(i)%phi, conf(j)%phi) + call swap_integer(conf(i)%former, conf(j)%former) + call swap_real(action(i)%sg, action(j)%sg) + action(i)%sf = hf(1, 2) + action(j)%sf = hf(2, 1) + endif + + tmpr(i_pair)%pair = i + tmpr(i_pair)%swapped = swapped + tmpr(i_pair)%delta_h = delta_h + enddo + + call check_former(n_temper, conf) + +end + +!------------------------------------------------------------------------------- +subroutine check_former(n_temper, conf) + + use typedef_hmc + use module_function_decl + implicit none + + integer, intent(in) :: n_temper + type(hmc_conf), intent(inout) :: conf(n_temper) + integer :: count(n_temper), i + + count = 0 + do i = 1, n_temper + count(conf(i)%former) = count(conf(i)%former) + 1 + enddo + + do i = 1, n_temper + if (count(i) /= 1) call die("check_former(): error") + enddo +end + +!------------------------------------------------------------------------------- +subroutine swap_sequence(type, s, n) + + implicit none + integer, intent(in) :: 
type, n + integer, intent(out) :: s(n) + integer :: i + + select case (type) + case (SWAP_UP) + do i = 1, n + s(i) = i + enddo + case (SWAP_DOWN) + do i = 1, n + s(i) = n - i + 1 + enddo + case (SWAP_RANDOM) + call random_sequence(s, n) + case default + call die("swap_sequence(): don't know how to build the sequence") + end select + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/misc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/misc.F90 new file mode 100644 index 0000000000000000000000000000000000000000..8bfc895a5a6841503e93170fdfa2ca5d63366f24 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/misc.F90 @@ -0,0 +1,225 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! misc.F90 - miscellaneous (service routines) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine die(msg) ! write "msg" to stderr and abort + + implicit none + character(len = *) msg + + write(STDERR,*) msg + call abbruch() + +end + +!------------------------------------------------------------------------------- +subroutine warn(msg) ! write "msg" to stderr and unit UREC + implicit none + character(len = *) msg + + write(STDERR,*) "WARNING: ", msg, " !!!" + write(UREC,*) "WARNING: ", msg, " !!!" 
+end + +!------------------------------------------------------------------------------- +subroutine assertion_failed(file, line, condition) !// used by ASSERT macro + implicit none + character(len = *) file, condition + integer line + ! text form of "line"; 12 characters hold any 32-bit integer including its sign + character(len = 12) aline + ! explicit format: list-directed width is processor dependent and can overflow a short buffer + write(aline, "(i12)") line + ! abort with "<file> (<line>): assertion failed: <condition>"; adjustl drops the leading blanks + call die(file // " (" // trim(adjustl(aline)) // "): assertion failed: " // condition) + +end + +!------------------------------------------------------------------------------- +subroutine begin(unit, str) ! write "begin marker" + + use module_function_decl + implicit none + integer :: unit + character(len = *) :: str + + if (my_pe() == 0) write(unit, "(1x,2a)") ">Begin", str + +end + +!------------------------------------------------------------------------------- +subroutine end_A(unit, str) ! write "end marker" + + use module_function_decl + implicit none + integer :: unit + character(len = *) :: str + + if (my_pe() == 0) write(unit, "(1x,2a)") ">End", str + +end + +!------------------------------------------------------------------------------- +function datum() ! returns date as: YYYY-MM-DD + + implicit none + character(len = 10) :: datum + + call date_and_time(date = datum) + + datum(9:10) = datum(7:8) + datum(6:7) = datum(5:6) + + datum(5:5) = "-" + datum(8:8) = "-" + +end + +!------------------------------------------------------------------------------- +function uhrzeit() ! returns time as: hh:mm:ss.sss + + implicit none + character(len = 12) :: uhrzeit + + call date_and_time(time = uhrzeit) + + uhrzeit(7:12) = uhrzeit(5:10) + uhrzeit(4:5) = uhrzeit(3:4) + + uhrzeit(3:3) = ":" + uhrzeit(6:6) = ":" + +end + +!------------------------------------------------------------------------------- +function f_exist(file) ! check if file exists + + implicit none + logical :: f_exist + character(len = *) :: file + + inquire(file = file, exist = f_exist) + +end + +!------------------------------------------------------------------------------- +subroutine open_diag() ! 
open debug file on each process + + use module_function_decl + implicit none + FILENAME :: name + + write(name, '(i4.4)') my_pe() + name = 'diag.' // name + + open(UDIAG, file = name) + + write(UDIAG,*) 'Output from PE ', my_pe() + write(UDIAG,*) '~~~~~~~~~~~~~~~~~~~' + write(UDIAG,*) + +end + +!------------------------------------------------------------------------------- +subroutine pos_keyword(unit, keyword) ! positions unit at keyword + ! Reads unit record by record; on a match backspaces so the next read sees the keyword record. + implicit none + integer, intent(in) :: unit + character(len = *), intent(in) :: keyword + integer :: iostat + character(len = len(keyword) + 1) :: word ! one extra character, so a longer token never equals keyword + ! scan forward from the current position of unit + do + read(unit, *, iostat = iostat) word + if (iostat /= 0) exit ! end of file or read error: word is not valid, stop searching + if (word == keyword) then + backspace(unit) + return + endif + enddo + ! no record started with keyword: abort via die() + call die("pos_keyword(): " // keyword // ": not found") + +end + +!------------------------------------------------------------------------------- +subroutine read_keyword_int(unit, keyword, int, dim) ! read integer(s) at keyw. + + implicit none + integer, intent(in) :: unit, dim + character(len = *), intent(in) :: keyword + integer, intent(out) :: int(dim) + character :: c + + call pos_keyword(unit, keyword) + read(unit, *) c, int + +end + +!------------------------------------------------------------------------------- +subroutine read_keyword_REAL(unit, keyword, x, dim) ! 
read float(s) at keyword + + implicit none + integer, intent(in) :: unit, dim + character(len = *), intent(in) :: keyword + REAL, intent(out) :: x(dim) + character :: c + + call pos_keyword(unit, keyword) + read(unit, *) c, x + +end + +!------------------------------------------------------------------------------- +subroutine swap_endian8(n, a) + + implicit none + integer, parameter :: i8 = 8 + integer, intent(in) :: n + integer :: i + integer(i8), intent(inout) :: a(n) + integer(i8), save :: mask1, mask2, mask3, mask4, & + mask5, mask6, mask7, mask8 + integer(i8) :: tmp + logical, external :: is_big_endian + + data mask1 /z'00000000000000FF'/ + data mask2 /z'000000000000FF00'/ + data mask3 /z'0000000000FF0000'/ + data mask4 /z'00000000FF000000'/ + data mask5 /z'000000FF00000000'/ + data mask6 /z'0000FF0000000000'/ + data mask7 /z'00FF000000000000'/ + data mask8 /z'FF00000000000000'/ + + + if (is_big_endian()) return + + do i = 1, n + tmp = 0_i8 + tmp = ior(tmp, ishft(iand(a(i), mask1), 56_i8)) + tmp = ior(tmp, ishft(iand(a(i), mask2), 40_i8)) + tmp = ior(tmp, ishft(iand(a(i), mask3), 24_i8)) + tmp = ior(tmp, ishft(iand(a(i), mask4), 8_i8)) + tmp = ior(tmp, ishft(iand(a(i), mask5), -8_i8)) + tmp = ior(tmp, ishft(iand(a(i), mask6),-24_i8)) + tmp = ior(tmp, ishft(iand(a(i), mask7),-40_i8)) + tmp = ior(tmp, ishft(iand(a(i), mask8),-56_i8)) + a(i) = tmp + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..884239c63b5f8b05e93c5aed91fcf2ffae14bb05 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/Makefile @@ -0,0 +1,59 @@ +#=============================================================================== +# +# BQCD -- Berlin Quantum ChromoDynamics programme +# +# Author: Hinnerk Stueben +# +# Copyright (C) 
2003-2006, Hinnerk Stueben, Zuse-Institut Berlin +# +#------------------------------------------------------------------------------- +# +# modules/Makefile +# +#=============================================================================== + +include ../Makefile.defs + +.SUFFIXES: +.SUFFIXES: .a .o .F90 + +.F90.o: + $(FPP) -I.. $(FPPFLAGS) $< > $*.f90 + $(F90) -c $(FFLAGS) $*.f90 + +MODULES_DIR = . + +MODULES = \ + typedef_cksum.o \ + typedef_clover.o \ + typedef_flags.o \ + typedef_hmc.o \ + typedef_para.o \ + module_bqcd.o \ + module_counter.o \ + module_d21.o \ + module_decomp.o \ + module_hmc_forces.o \ + module_lattice.o \ + module_lattice_io.o \ + module_nn.o \ + module_nnpe.o \ + module_offset.o \ + module_thread.o \ + module_vol.o \ + module_conf_info.o \ + module_function_decl.o \ + module_input.o \ + module_switches.o \ + module_p_interface.o \ + module_mre.o + +modules: $(MODULES) + +fast: + $(MAKE) + +clean: + rm -f *.[Tiod] *.f90 *.mod work.pc work.pcl + +clobber: clean diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/README b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/README new file mode 100644 index 0000000000000000000000000000000000000000..2edcd731ef7bc59540583b4442f0463e6cd88ff9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/README @@ -0,0 +1,23 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! modules/README +! +!------------------------------------------------------------------------------- + +This directory contains "modules" that are needed in more than one +source file. + +"modules" that are needed only in one file are kept in that file. + +For historical reasons some information is stored in more than one module. 
+Many "modules" were "common blocks" in older versions of the programme. + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/bqcd.pcl b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/bqcd.pcl new file mode 100644 index 0000000000000000000000000000000000000000..7f6c06e225907b3e12cb0daddf00f6b753a7ab89 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/bqcd.pcl @@ -0,0 +1 @@ +work.pc diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_bqcd.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_bqcd.F90 new file mode 100644 index 0000000000000000000000000000000000000000..ad4b746f7d9aac4a9008bbd0818d0ddd36d65710 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_bqcd.F90 @@ -0,0 +1,22 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_bqcd.F90 +! 
+!------------------------------------------------------------------------------- +module module_bqcd + + character(len = *), parameter :: prog_name = "bqcd" + character(len = *), parameter :: prog_version = "benchmark2" + integer, parameter :: input_version = 4 + integer, parameter :: conf_info_version = 3 + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_conf_info.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_conf_info.F90 new file mode 100644 index 0000000000000000000000000000000000000000..81f8b1e214c3ca217eeb5037f6459fdf69d7f785 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_conf_info.F90 @@ -0,0 +1,47 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_conf_info.F90 +! 
+!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_conf_info + + type type_conf_info + REAL, dimension(2) :: beta, kappa, csw, csw_kappa, h + REAL :: plaq + integer, dimension(DIM) :: L, bc_fermions + integer :: rkind + integer, dimension(2) :: ensemble + end type type_conf_info + + character(len = *), parameter :: k_format = "Format" + character(len = *), parameter :: k_prog = "Program" + character(len = *), parameter :: k_run = "Run" + character(len = *), parameter :: k_traj = "Traj" + character(len = *), parameter :: k_host = "Host" + character(len = *), parameter :: k_date = "Date" + character(len = *), parameter :: k_L = "L" + character(len = *), parameter :: k_bc = "bc_fermions" + character(len = *), parameter :: k_rkind = "REAL_kind" + character(len = *), parameter :: k_plaq = "PlaqEnergy" + + character(len = *), parameter, dimension(2) :: & + k_ensemble = (/ "ensemble ", "former_ensemble " /), & + k_beta = (/ "beta ", "former_beta " /), & + k_kappa = (/ "kappa ", "former_kappa " /), & + k_csw = (/ "csw ", "former_csw " /), & + k_csw_kappa = (/ "csw_kappa ", "former_csw_kappa" /), & + k_h = (/ "h ", "former_h " /) + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_counter.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_counter.F90 new file mode 100644 index 0000000000000000000000000000000000000000..f539c06f9a7af052abcdc1e5acba841021ebea9a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_counter.F90 @@ -0,0 +1,30 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! 
+!------------------------------------------------------------------------------- +! +! module_counter.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_counter + + type type_counter + sequence + integer :: run + integer :: job + integer :: traj ! overall trajectory counter + integer :: j_traj ! job trajectory counter + end type type_counter + + type(type_counter), save :: counter + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_d21.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_d21.F90 new file mode 100644 index 0000000000000000000000000000000000000000..d35a29d17f3dd7396bae9caeff19d5947c76a54e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_d21.F90 @@ -0,0 +1,21 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_d21.F90 - Two component spincol field used in d/D21.F90 +! 
+!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_d21 + P_SC2_FIELD, save :: a +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_decomp.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_decomp.F90 new file mode 100644 index 0000000000000000000000000000000000000000..677e085e15f2e80d18840a0a86471bc7ae810657 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_decomp.F90 @@ -0,0 +1,36 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_decomp.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_decomp + + type type_decomp1 + integer, dimension(DIM) :: L, N, NH, NPE + integer, dimension(DIM) :: i_pe + integer, dimension(DIM) :: bc_fermions + INTEGER, dimension(:, :), pointer :: i + end type type_decomp1 + + type type_decomp2 + type(type_decomp1) :: std ! "standard" + type(type_decomp1) :: act ! 
"actual" (essentially module_lattice) + integer, dimension(DIM) :: gamma_index + integer, dimension(DIM) :: direction + end type type_decomp2 + + type(type_decomp2), save :: decomp + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_function_decl.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_function_decl.F90 new file mode 100644 index 0000000000000000000000000000000000000000..4e4afc9bc67f8d465b82de35223322c32795dc96 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_function_decl.F90 @@ -0,0 +1,67 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_function_decl.F90 - declaration of functions +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_function_decl + + ! functions in misc.F90: + + character(len=10), external :: datum + character(len=12), external :: uhrzeit + character(len=20), external :: rechner + logical, external :: f_exist + + ! processes (-> comm/pes*.F90): + + integer, external :: num_pes + +#ifdef CRAY + integer, intrinsic :: my_pe ! gives the same id as MPI_Rank in MPI_COMM_WORLD +#else + integer, external :: my_pe +#endif + + ! global reduction (-> comm/reduction_*.F90) + + REAL, external :: dotprod + REAL, external :: global_sum + real, external :: global_min + real, external :: global_max + + ! sc-field (-> sc.F90) + + REAL, external :: sc_norm2 + REAL, external :: sc_dot + COMPLEX, external :: sc_cdotc + + ! 
ranf: + +#ifdef CRAY + real(8), intrinsic :: ranf +#else + real(8), external :: ranf +#endif + + ! identification of D (-> d/DVersion.F90) + + integer, external :: version_of_d + integer, external :: get_d3_buffer_vol + + ! communication method (-> comm/comm_*.F90) + + COMM_METHOD, external :: comm_method + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_hmc_forces.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_hmc_forces.F90 new file mode 100644 index 0000000000000000000000000000000000000000..c4cd3dbabe5abd6a14c6392d166c44a6de32ac40 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_hmc_forces.F90 @@ -0,0 +1,33 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_hmc_forces.F90 +! 
+!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_hmc_forces + + P_GENERATOR_FIELD, save :: p_old + + integer, parameter :: n_force = 4 + + integer, parameter :: i_sg = 1 + integer, parameter :: i_sd = 2 + integer, parameter :: i_sf1 = 3 + integer, parameter :: i_sf2 = 4 + + REAL, save :: f_count(n_force) + REAL, save :: f_avg(n_force) + REAL, save :: f_max(n_force) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.F90 new file mode 100644 index 0000000000000000000000000000000000000000..96aede4988f919ae9b63c9745e0535a8b4a0a74c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.F90 @@ -0,0 +1,174 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_input.F90 +! +! Simple module to read "keyword value" formatted input. The keywords +! are defined in "module_input.h" together with a type definition and a +! default value. The input is stored in the structure "input". The +! components are called "input%keyword". +! +! New input/keywords can be added or modified in "module_input.h". It +! suffices to modify "module_input.h" except for input of type +! INPUT_ARRAY_ENSEMBLES for which (re)allocation and initialisation +! routines have to be provided in this file (in subroutines "input_allocate" +! and "input_reallocate"). +! 
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+# define INPUT_ARRAY_DIM integer, dimension(DIM)
+# define INPUT_ARRAY_ENSEMBLES character(para_len), dimension(:), pointer
+# define INPUT_DEFAULT_ENSEMBLES 1
+
+!-------------------------------------------------------------------------------
+! Keyword driven reader for the run parameters.
+!
+! The keyword table in "module_input.h" is included several times, each time
+! with a different definition of its entry macro, to generate
+!   - the components of the derived type "type_input",
+!   - one "case" branch per keyword in "input_read",
+!   - one "write" statement per keyword in "input_dump",
+!   - one default assignment per keyword in "input_defaults".
+!
+! Note for maintainers: the preprocessor also scans Fortran comments, so do
+! not spell macro names or unbalanced quotes in comments below this point.
+!-------------------------------------------------------------------------------
+module module_input
+
+  implicit none
+
+  !-----------------------------------------------------------------------------
+  public :: type_input     ! data structure declaration
+  public :: input          ! variable containing data structure
+  public :: input_read     ! subroutine input_read(file)
+  public :: input_dump     ! subroutine input_dump(unit)
+  !-----------------------------------------------------------------------------
+
+  private
+
+  ! character lengths of the different kinds of input values
+  integer, parameter :: comment_len = 80
+  integer, parameter :: keyword_len = 32
+  integer, parameter :: para_len    = 32
+  integer, parameter :: word_len    =  8
+
+  type type_input
+
+#define INPUT_INPUT(var, type, default)  type :: var
+#include "module_input.h"
+
+  end type type_input
+
+  type(type_input), save :: input
+
+  ! .true. once "input_allocate" has run.  The pointer components of "input"
+  ! are neither nullified nor default initialised, so their association
+  ! status is undefined before the first allocation and must not be queried
+  ! with "associated"; this flag is tested instead.
+  logical, save :: input_is_allocated = .false.
+
+contains
+
+  !=============================================================================
+  ! Reads the "keyword value" file "file" into the module variable "input".
+  !
+  ! All components are first set to their defaults.  Lines whose first word
+  ! starts with "#" are skipped.  Reading stops at the first read error or at
+  ! end of file.  An unknown keyword or a file that cannot be opened is
+  ! reported through "die".
+  !
+  ! After the keyword "ensembles" has been read the per-ensemble arrays are
+  ! re-created with their default contents, so "ensembles" has to precede the
+  ! per-ensemble keywords in the file.
+  !=============================================================================
+  subroutine input_read(file)
+
+    character(*), intent(in) :: file
+    integer                  :: iostat
+    character(keyword_len)   :: keyword
+
+    call input_allocate(INPUT_DEFAULT_ENSEMBLES)
+    call input_defaults()
+
+    open(UINPUT, file = file, action = "read", status = "old", &
+         iostat = iostat)
+    if (iostat /= 0) call die("input_read(): cannot open " // trim(file))
+
+    do
+      ! peek at the keyword, then step back so that the generated "case"
+      ! branch can read keyword and value(s) from the same record
+      read(UINPUT, *, iostat = iostat) keyword
+      if (iostat /= 0) exit
+      if (keyword(1:1) == '#') cycle
+      backspace(UINPUT)
+      select case (keyword)
+
+
+#undef INPUT_INPUT
+#define INPUT_INPUT(var, type, default) \
+      case (#var); read(UINPUT, *) keyword, input% ## var
+#include "module_input.h"
+
+      case default
+        call die("input_read(): " // trim(keyword) &
+                 // ": unknown keyword")
+
+      end select
+
+      if (keyword == "ensembles") then
+        call input_reallocate(input%ensembles)
+      endif
+
+    enddo
+    close(UINPUT)
+
+  end subroutine input_read
+
+  !-----------------------------------------------------------------------------
+  ! Writes every keyword followed by its current value to "unit" using
+  ! list-directed output.  If nothing has been allocated yet the defaults are
+  ! set up first, so the routine may be called before "input_read".
+  !-----------------------------------------------------------------------------
+  subroutine input_dump(unit)
+    integer :: unit
+
+    if (.not. input_is_allocated) then
+      call input_allocate(INPUT_DEFAULT_ENSEMBLES)
+      call input_defaults()
+    endif
+
+#undef INPUT_INPUT
+#define INPUT_INPUT(var, type, default) write(unit,*) #var, " ", input% ## var
+#include "module_input.h"
+
+  end subroutine input_dump
+
+  !-----------------------------------------------------------------------------
+  ! Assigns the default listed in "module_input.h" to every component.
+  ! The per-ensemble arrays must already be allocated.
+  !-----------------------------------------------------------------------------
+  subroutine input_defaults()
+
+#undef INPUT_INPUT
+#define INPUT_INPUT(var, type, default)  input% ## var = default
+#include "module_input.h"
+
+  end subroutine input_defaults
+
+  !-----------------------------------------------------------------------------
+  ! Allocates all per-ensemble arrays with "size" elements and fills them with
+  ! the same default strings that "module_input.h" lists for them.
+  ! A new per-ensemble keyword has to be added here and in "input_reallocate".
+  !-----------------------------------------------------------------------------
+  subroutine input_allocate(size)
+
+    integer :: size
+
+    allocate(input%beta(size))
+    allocate(input%kappa(size))
+    allocate(input%csw(size))
+    allocate(input%h(size))
+    allocate(input%hmc_trajectory_length(size))
+    allocate(input%hmc_steps(size))
+    allocate(input%hmc_rho(size))
+    allocate(input%hmc_m_scale(size))
+    allocate(input%start_info_file(size))
+
+    input%beta                  = "0.0"
+    input%kappa                 = "0.0"
+    input%csw                   = "0.0"
+    input%h                     = "0.0"
+    input%hmc_trajectory_length = "1"
+    input%hmc_steps             = "0"
+    input%hmc_rho               = "0.0"
+    input%hmc_m_scale           = "1"
+    input%start_info_file       = ""
+
+    ! from here on the pointer components have a defined association status
+    input_is_allocated = .true.
+
+  end subroutine input_allocate
+
+
+  !-----------------------------------------------------------------------------
+  ! Re-creates the per-ensemble arrays with "size" elements when "size" is
+  ! larger than the default number of ensembles.  The previous contents are
+  ! discarded and replaced by the defaults set in "input_allocate".
+  !-----------------------------------------------------------------------------
+  subroutine input_reallocate(size)
+
+    integer :: size
+
+    if (size > INPUT_DEFAULT_ENSEMBLES) then
+      deallocate(input%beta)
+      deallocate(input%kappa)
+      deallocate(input%csw)
+      deallocate(input%h)
+      deallocate(input%hmc_trajectory_length)
+      deallocate(input%hmc_steps)
+      deallocate(input%hmc_rho)
+      deallocate(input%hmc_m_scale)
+      deallocate(input%start_info_file)
+
+      call input_allocate(size)
+    endif
+  end subroutine input_reallocate
+
+end module module_input
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.h b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.h new file mode 100644 index 0000000000000000000000000000000000000000..a0aa2e325556644801432584b2d10bafc0a804d6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_input.h @@ -0,0 +1,42 @@ +INPUT_INPUT(run, integer, 0) +INPUT_INPUT(comment, character(comment_len), "") + +INPUT_INPUT(lattice, INPUT_ARRAY_DIM, (/4, 4, 4, 4/)) +INPUT_INPUT(processes, INPUT_ARRAY_DIM, (/1, 1, 1, 1/)) +INPUT_INPUT(boundary_conditions_fermions, INPUT_ARRAY_DIM, (/1, 1, 1, -1/)) +INPUT_INPUT(gamma_index, INPUT_ARRAY_DIM, (/1, 2, 3, 4/)) + +INPUT_INPUT(ensembles, integer, INPUT_DEFAULT_ENSEMBLES) + +INPUT_INPUT(beta, INPUT_ARRAY_ENSEMBLES, "0.0") +INPUT_INPUT(kappa, INPUT_ARRAY_ENSEMBLES, "0.0") +INPUT_INPUT(csw, INPUT_ARRAY_ENSEMBLES, "0.0") +INPUT_INPUT(h, INPUT_ARRAY_ENSEMBLES, "0.0") + +INPUT_INPUT(tempering_swap_sequence, character(word_len), "random") +INPUT_INPUT(tempering_steps_without, integer, 0) + +INPUT_INPUT(hmc_model, character, "A") +INPUT_INPUT(hmc_trajectory_length, INPUT_ARRAY_ENSEMBLES, "1") +INPUT_INPUT(hmc_steps, INPUT_ARRAY_ENSEMBLES, "0") +INPUT_INPUT(hmc_rho, INPUT_ARRAY_ENSEMBLES, "0.0") +INPUT_INPUT(hmc_m_scale, INPUT_ARRAY_ENSEMBLES, "1") +INPUT_INPUT(hmc_accept_first, integer, 0) +INPUT_INPUT(hmc_test, integer, 0) + +INPUT_INPUT(start_configuration, character(word_len), "cold") +INPUT_INPUT(start_info_file, INPUT_ARRAY_ENSEMBLES, "") +INPUT_INPUT(start_random, character(para_len), "default") + +INPUT_INPUT(mc_total_steps, integer, 1) +INPUT_INPUT(mc_steps, integer, 1) +INPUT_INPUT(mc_save_frequency, integer, 0) + +INPUT_INPUT(solver_rest, character(para_len), "1e-8") +INPUT_INPUT(solver_maxiter, integer, 100) +INPUT_INPUT(solver_ignore_no_convergence, integer, 0) +INPUT_INPUT(solver_mre_vectors, integer, 0) + +INPUT_INPUT(measure_cooling_list, FILENAME, "") +INPUT_INPUT(measure_polyakov_loop, 
integer, 0) +INPUT_INPUT(measure_traces, integer, 0) diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice.F90 new file mode 100644 index 0000000000000000000000000000000000000000..2cf8dcdc283a5f2e9fda4ca98e9f3ef3cbb48276 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice.F90 @@ -0,0 +1,48 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_lattice.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_lattice + + !>> See also "module_lattice_io" !!! + + ! use a common block, because without, equivalence leads to errors + ! 
with Intel Fortran90 compiler + + integer, dimension(DIM) :: L, N, NH, NPE + + common /common_lattice/ L, N, NH, NPE + + integer :: LX, LY, LZ, LT + integer :: NX, NY, NZ, NT, NXH + + equivalence (L(1), LX) + equivalence (L(2), LY) + equivalence (L(3), LZ) + equivalence (L(4), LT) + + equivalence (N(1), NX) + equivalence (N(2), NY) + equivalence (N(3), NZ) + equivalence (N(4), NT) + + equivalence (NH(1), NXH) + + integer, dimension(DIM), save :: bc_fermions + integer, dimension(DIM), save :: gamma_index + integer, dimension(DIM), save :: decomp_direction + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice_io.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice_io.F90 new file mode 100644 index 0000000000000000000000000000000000000000..492243725ea4bc60c781687d73d1afb3d6cd9765 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_lattice_io.F90 @@ -0,0 +1,47 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_lattice_io.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_lattice_io + + !>> The common block is syntactically identical to + !>> module_lattice/common_lattice. But it contains permutated + !>> values according to "gamma_index". + !>> This can be confusing when reading source code !!!! + + ! use a common block, because without, equivalence leads to errors + ! 
with Intel Fortran90 compiler + + integer, dimension(DIM) :: L, N, NH, NPE + + common /common_lattice_io/ L, N, NH, NPE + + integer :: LX, LY, LZ, LT + integer :: NX, NY, NZ, NT, NXH + + equivalence (L(1), LX) + equivalence (L(2), LY) + equivalence (L(3), LZ) + equivalence (L(4), LT) + + equivalence (N(1), NX) + equivalence (N(2), NY) + equivalence (N(3), NZ) + equivalence (N(4), NT) + + equivalence (NH(1), NXH) + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_mre.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_mre.F90 new file mode 100644 index 0000000000000000000000000000000000000000..21d58a534d941c6a6dffc5fcd636ba91d06c4cda --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_mre.F90 @@ -0,0 +1,34 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_mre.F90 +! +! Important: type(type_mre) must be defined with the "save" attribute! +! 
+!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_mre + + integer, save :: mre_n_vec = 0 + + type mre_pointer_to_sc_field + P_SPINCOL_FIELD :: sc + end type mre_pointer_to_sc_field + + type type_mre + integer :: rank + type(mre_pointer_to_sc_field), dimension(:), pointer :: vec + end type type_mre + + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nn.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nn.F90 new file mode 100644 index 0000000000000000000000000000000000000000..20bcb9536e195c925e2ac1991c6ade199466cecc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nn.F90 @@ -0,0 +1,23 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_nn.F90 - pointer to nearest neighbour list +! nn(volh_tot, EVEN:ODD, DIM, FWD:BWD) +! 
+!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_nn + + INTEGER, dimension(:, :, :, :), pointer, save :: nn + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nnpe.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nnpe.F90 new file mode 100644 index 0000000000000000000000000000000000000000..4969e97237ab53e6d174ad888db59f265cc9489d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_nnpe.F90 @@ -0,0 +1,22 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_nnpe.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_nnpe + + integer, dimension (-1:1, -1:1, -1:1, -1:1), save :: nnpe + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_offset.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_offset.F90 new file mode 100644 index 0000000000000000000000000000000000000000..167f47a0b3fd95c7da599bc578ffb26f81b294a7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_offset.F90 @@ -0,0 +1,22 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! 
Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_offset.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_offset + + integer, dimension (-1:1, -1:1, -1:1, -1:1), save :: offset + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_p_interface.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_p_interface.F90 new file mode 100644 index 0000000000000000000000000000000000000000..9163c7aa4113b65a4c87f3b9703a2596fbe708a3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_p_interface.F90 @@ -0,0 +1,80 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_p_interface.F90 ! interfaces of pointer manipulating routines +! 
+!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_p_interface + + interface + + subroutine allocate_g_field(u) + P_GAUGE_FIELD :: u + end subroutine allocate_g_field + + subroutine allocate_g_field_io(u) + P_GAUGE_FIELD_IO :: u + end subroutine allocate_g_field_io + + subroutine allocate_gen_field(p) + P_GENERATOR_FIELD :: p + end subroutine allocate_gen_field + + subroutine allocate_sc_field(a) + P_SPINCOL_FIELD :: a + end subroutine allocate_sc_field + + subroutine allocate_sc_field_io(a) + P_SPINCOL_FIELD_IO :: a + end subroutine allocate_sc_field_io + + subroutine allocate_sc_overindexed(a) + P_SPINCOL_OVERINDEXED :: a + end subroutine allocate_sc_overindexed + + subroutine allocate_sc2_field(a) + P_SC2_FIELD :: a + end subroutine allocate_sc2_field + + subroutine allocate_clover_field_a(a) + use typedef_clover + P_CLOVER_FIELD_A :: a + end subroutine allocate_clover_field_a + + subroutine allocate_clover_field_b(b) + use typedef_clover + P_CLOVER_FIELD_B :: b + end subroutine allocate_clover_field_b + + subroutine swap_p_g_field(u, v) + P_GAUGE_FIELD :: u, v + end subroutine swap_p_g_field + + subroutine swap_p_sc_field(a, b) + P_SPINCOL_FIELD :: a, b + end subroutine swap_p_sc_field + + subroutine swap_p_clover_field_a(x, y) + use typedef_clover + P_CLOVER_FIELD_A :: x, y + end subroutine swap_p_clover_field_a + + subroutine swap_p_clover_field_b(x, y) + use typedef_clover + P_CLOVER_FIELD_B :: x, y + end subroutine swap_p_clover_field_b + + end interface + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_switches.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_switches.F90 new file mode 100644 index 0000000000000000000000000000000000000000..28543df8749de32397d153292b9a236cabeb03ee --- 
/dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_switches.F90 @@ -0,0 +1,31 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_switches.F90 +! +!------------------------------------------------------------------------------- + +module module_switches + + type type_switches + logical :: quenched + logical :: dynamical + logical :: tempering + logical :: clover + logical :: h_ext + logical :: hasenbusch + logical :: hmc_test + logical :: measure_polyakov_loop + logical :: measure_traces + end type type_switches + + type (type_switches), save :: switches +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_thread.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_thread.F90 new file mode 100644 index 0000000000000000000000000000000000000000..bd51c2cfdea6bf1908a8533149139d42e0e62624 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_thread.F90 @@ -0,0 +1,26 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_thread.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_thread + + ! 
{xyz|t}_{start|end} (0:n_thread-1) + + integer, save :: n_thread + integer, dimension(:), pointer, save :: xyz_start, xyz_end + integer, dimension(:), pointer, save :: t_start, t_end + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_vol.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_vol.F90 new file mode 100644 index 0000000000000000000000000000000000000000..388675e2d141943ca2847e0886c36777d5a65f25 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/module_vol.F90 @@ -0,0 +1,22 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! module_vol.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_vol + + integer, save :: volume, vol, volh, volh_tot, size_sc_field + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_cksum.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_cksum.F90 new file mode 100644 index 0000000000000000000000000000000000000000..8338b75f7f099899f0737ee891e3d3466ad80e3f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_cksum.F90 @@ -0,0 +1,26 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! 
+!------------------------------------------------------------------------------- +! +! typedef_cksum.F90 +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module typedef_cksum + + type type_cksum + CHECK_SUM :: sum + CHECK_SUM :: bytes + FILENAME :: file + end type type_cksum + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_clover.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_clover.F90 new file mode 100644 index 0000000000000000000000000000000000000000..e580e642e82fd6c766eac46696784822c65f058d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_clover.F90 @@ -0,0 +1,85 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! typedef_clover.F90 +! 
+!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module typedef_clover + + type type_clover_a + sequence + + REAL i11 + REAL i22 + + COMPLEX i12 + COMPLEX i13 + COMPLEX i14 + COMPLEX i15 + COMPLEX i16 + + COMPLEX i23 + COMPLEX i24 + COMPLEX i25 + COMPLEX i26 + + REAL i33 + REAL i44 + + COMPLEX i34 + COMPLEX i35 + COMPLEX i36 + + COMPLEX i45 + COMPLEX i46 + + REAL i55 + REAL i66 + + COMPLEX i56 + end type type_clover_a + + + type type_clover_b + sequence + + COMPLEX i21 + + COMPLEX i31 + COMPLEX i32 + + COMPLEX i41 + COMPLEX i42 + COMPLEX i43 + + COMPLEX i51 + COMPLEX i52 + COMPLEX i53 + COMPLEX i54 + + COMPLEX i61 + COMPLEX i62 + COMPLEX i63 + COMPLEX i64 + COMPLEX i65 + + REAL i11 + REAL i22 + REAL i33 + REAL i44 + REAL i55 + REAL i66 + end type type_clover_b + +end +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_flags.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_flags.F90 new file mode 100644 index 0000000000000000000000000000000000000000..ca191bd165ace9d669cd785527c2802615243527 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_flags.F90 @@ -0,0 +1,26 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! typedef_flags.F90 +! 
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module typedef_flags
+  ! Run-control flags; the field names suggest command-line options (TODO confirm against the caller).
+  type type_flags
+     logical :: show_version
+     logical :: continuation_job
+     FILENAME :: input
+  end type type_flags
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_hmc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_hmc.F90
new file mode 100644
index 0000000000000000000000000000000000000000..3e1a1d1662053d88c4fe08eb40ea6901490cdc62
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_hmc.F90
@@ -0,0 +1,69 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! typedef_hmc.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module typedef_hmc
+  ! Records grouped by role: numeric parameters, their text form, per-run output, field pointers.
+  use typedef_clover
+
+  type hmc_para        ! numeric parameters plus a one-character model tag
+     REAL :: beta
+     REAL :: kappa
+     REAL :: csw
+     REAL :: csw_kappa
+     REAL :: h
+     REAL :: traj_length
+     REAL :: tau
+     REAL :: rho
+     integer :: ntau
+     integer :: m_scale
+     character :: model
+  end type hmc_para
+
+  type hmc_para_char   ! 20-character text fields named like the numeric entries of hmc_para
+     character(len = 20) :: beta
+     character(len = 20) :: kappa
+     character(len = 20) :: csw
+     character(len = 20) :: csw_kappa
+     character(len = 20) :: h
+     character(len = 20) :: traj_length
+     character(len = 20) :: tau
+     character(len = 20) :: ntau
+     character(len = 20) :: rho
+     character(len = 20) :: m_scale
+  end type hmc_para_char
+
+  type hmc_out         ! result record; the cg_* names suggest solver statistics (TODO confirm)
+     REAL :: exp_dh ! exp(-Delta H)
+     REAL :: sg ! without factor beta
+     REAL :: sf
+     integer :: accepted
+     integer :: cg_ncall
+     integer :: cg_niter_max
+     integer :: cg_niter_tot
+  end type hmc_out
+
+  type hmc_conf        ! field handles declared through the P_* macros from defs.h
+     P_GAUGE_FIELD :: u
+     P_SPINCOL_FIELD :: phi
+     P_SPINCOL_FIELD :: phi2
+     P_CLOVER_FIELD_A :: a ! A := 1 - kappa c_sw sigma F (in 6x6 blocks)
+     P_CLOVER_FIELD_A :: i ! inverse of A
+     P_CLOVER_FIELD_B :: b ! inverse of A in (L D L+) decomposed form
+     integer :: former ! ensemble index before tempering
+  end type hmc_conf
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_para.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_para.F90
new file mode 100644
index 0000000000000000000000000000000000000000..70fee2ca94708b611e562a166648339961cf0d79
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/modules/typedef_para.F90
@@ -0,0 +1,57 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! typedef_para.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+module typedef_para
+  ! One record with all run parameters; per-ensemble entries are arrays over the tempering index.
+  use typedef_hmc
+
+  type type_para
+
+     integer :: run
+
+     integer, dimension(DIM) :: L
+     integer, dimension(DIM) :: NPE
+     integer, dimension(DIM) :: bc_fermions
+     integer, dimension(DIM) :: gamma_index
+
+     integer :: n_temper
+
+     type(hmc_para), dimension(MAX_TEMPER) :: hmc
+     type(hmc_para_char), dimension(MAX_TEMPER) :: c_hmc
+
+     integer :: start
+     SEED :: seed
+     integer :: swap_seq
+
+     integer :: nforce
+     integer :: ntraj
+     integer :: nstd
+     integer :: maxtraj
+
+     integer :: nsave
+
+     real :: cg_rest   ! NOTE(review): default real kind here, not the defs.h macro type -- confirm intended
+     integer :: cg_maxiter
+     integer :: cg_log
+
+     character(len = 20) c_cg_rest
+
+     FILENAME, dimension(MAX_TEMPER) :: info_file
+
+  end type type_para
+
+end
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/mre.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/mre.F90
new file mode 100644
index 0000000000000000000000000000000000000000..4e9078d1d1bc97321380ae2abc4930816aa4b077
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/mre.F90
@@ -0,0 +1,270 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! mre.F90 - Chronological Inverter by Minimal Residual Extrapolation
+! hep-lat/9509012
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine mre_put(basis, sc_field, reset) ! add a solution
+  ! Rotates the slot pointers so the last slot becomes slot 1, then copies sc_field into it.
+  use module_mre
+  use module_p_interface
+  use module_vol
+  implicit none
+
+  type(type_mre), intent(inout) :: basis
+  SPINCOL_FIELD, intent(in) :: sc_field
+  integer, intent(in) :: reset
+
+  P_SPINCOL_FIELD :: tmp
+  integer :: i
+
+  if (mre_n_vec == 0) then   ! no history slots configured: nothing to store
+     return
+  endif
+
+  call mre_allocate(basis)
+
+  if (reset /= 0) then       ! reset only clears the rank; sc_field is not stored
+     basis%rank = 0
+     return
+  endif
+
+  tmp => basis%vec(mre_n_vec)%sc
+
+  do i = mre_n_vec, 2, -1
+     basis%vec(i)%sc => basis%vec(i - 1)%sc
+  enddo
+
+  basis%vec(1)%sc => tmp
+  call sc_copy(basis%vec(1)%sc, sc_field)
+
+  if (basis%rank < mre_n_vec) basis%rank = basis%rank + 1
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine mre_get(basis, matrix_mult, trial, phi, para, conf)
+
+  ! get trial solution
+  ! Falls back to phi (no usable basis) or to slot 1 (rank 1) before doing any extrapolation.
+  use typedef_hmc
+  use module_function_decl
+  use module_mre
+  use module_vol
+  implicit none
+
+  type(type_mre), intent(inout) :: basis
+  external :: matrix_mult
+  SPINCOL_FIELD, intent(out) :: trial
+  SPINCOL_FIELD, intent(in) :: phi
+  type(hmc_para), intent(in) :: para
+  type(hmc_conf), intent(in) :: conf
+
+  type(type_mre), save :: mv ! "Matrix * v"
+  COMPLEX, target :: g(mre_n_vec, mre_n_vec + 1)
+  COMPLEX, pointer :: b(:)
+  integer :: size_g
+  integer :: i
+  integer :: j
+  integer :: s
+  integer :: c   ! NOTE(review): s and c are not referenced anywhere in this routine
+  integer :: rest
+
+  if (mre_n_vec == 0 .or. .not. associated(basis%vec) .or. basis%rank == 0) then
+     call sc_copy(trial, phi)
+     return
+  endif
+
+  if (basis%rank == 1) then
+     call sc_copy(trial, basis%vec(1)%sc)
+     return
+  endif
+
+  size_g = mre_n_vec * (mre_n_vec + 1) * SIZE_COMPLEX
+
+  b => g(:, mre_n_vec + 1) ! storage arrangement for global sum
+  ! => one global sum for everything
+
+  call mre_allocate(mv)
+  call mre_gram_schmidt(basis)
+
+  do i = 1, basis%rank
+     b(i) = sc_cdotc(basis%vec(i)%sc, phi)
+     call matrix_mult(mv%vec(i)%sc, basis%vec(i)%sc, para, conf)
+  enddo
+
+  do i = 1, basis%rank   ! only the leading rank x rank part of g is filled here
+     g(i, i) = sc_norm2(mv%vec(i)%sc)
+     do j = i + 1, basis%rank
+        g(i, j) = sc_cdotc(mv%vec(i)%sc, mv%vec(j)%sc)
+        g(j, i) = conjg(g(i, j))
+     enddo
+  enddo
+
+  call global_sum_vec(size_g, g)
+
+  call mre_gauss_jordan(g, b, basis%rank, mre_n_vec)
+
+  ! calculation of "trial" with doubled data re-use:
+
+  call sc_cax2(trial, basis%vec(1)%sc, b(1), basis%vec(2)%sc, b(2))
+
+  rest = mod(basis%rank, 2)
+
+  do j = 3, basis%rank - rest, 2
+     call sc_caxpy2(trial, basis%vec(j)%sc, b(j), &
+                    basis%vec(j+1)%sc, b(j+1))
+  enddo
+
+  if (rest == 1) then   ! odd rank: the last vector is added on its own
+     j = basis%rank
+     call sc_caxpy(trial, basis%vec(j)%sc, b(j))
+  endif
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine mre_allocate(basis)
+  ! First call per basis: allocates mre_n_vec fields and sets rank to 0; later calls do nothing.
+  use module_mre
+  use module_p_interface
+  use module_vol
+  implicit none
+
+  type(type_mre), intent(inout) :: basis
+  integer :: i
+
+  if (.not. associated(basis%vec)) then
+     allocate(basis%vec(mre_n_vec))
+     do i = 1, mre_n_vec
+        nullify(basis%vec(i)%sc)
+        call allocate_sc_field(basis%vec(i)%sc)
+     enddo
+     basis%rank = 0
+  endif
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine mre_gram_schmidt(basis)
+
+  ! Golub and van Loon, Matrix Computations (3rd ed.), p. 232
+  ! Normalises vector k, then removes its component from every later vector, in place.
+  use module_function_decl
+  use module_mre
+  use module_vol
+  implicit none
+
+  type(type_mre), intent(inout) :: basis
+
+  integer :: k, j
+  REAL :: r_kk, r_kj
+
+  do k = 1, basis%rank
+     r_kk = sc_norm2(basis%vec(k)%sc)
+     r_kk = global_sum(r_kk)
+     r_kk = ONE / sqrt(r_kk)   ! NOTE(review): no guard against a zero norm before this division
+     call sc_scale(basis%vec(k)%sc, r_kk)
+     do j = k + 1, basis%rank
+        r_kj = sc_dot(basis%vec(k)%sc, basis%vec(j)%sc)
+        r_kj = global_sum(r_kj)
+        call sc_axpy(basis%vec(j)%sc, basis%vec(k)%sc, -r_kj)
+     enddo
+  enddo
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine mre_gauss_jordan(a, b, n, np)
+
+  ! Numerical Recipes in Fortran (2nd ed.), p. 30
+  ! inv(a) i.e. a is not unscrambled
+  ! Works on the leading n x n part of a (declared np x np); b is overwritten in place.
+  implicit none
+
+  integer, intent(in) :: n, np
+  COMPLEX, intent(inout) :: a(np,np), b(np)
+
+  integer :: ipiv(n)
+  integer :: i, j, k, l, ll
+  integer :: irow, icol
+
+  REAL :: big, tmp
+  COMPLEX :: dum
+  COMPLEX :: pivinv
+
+  ipiv = 0
+
+  do i = 1, n
+     ! pivot search: largest abs value among rows and columns not yet used as pivot
+     big = ZERO
+     do j = 1, n
+        if (ipiv(j) /= 1) then
+           do k = 1, n
+              if (ipiv(k) == 0) then
+                 tmp = abs(a(j, k))
+                 if (tmp >= big) then
+                    big = tmp
+                    irow = j
+                    icol = k
+                 endif
+              else if (ipiv(k) > 1) then
+                 call die("mre_gauss_jordan(): singular matrix 1")
+              endif
+           enddo
+        endif
+     enddo
+
+     ipiv(icol) = ipiv(icol) + 1
+
+     if (irow /= icol) then   ! bring the pivot onto the diagonal by swapping rows of a and b
+        do l = 1, n
+           dum = a(irow, l)
+           a(irow, l) = a(icol, l)
+           a(icol, l) = dum
+        enddo
+        dum = b(irow)
+        b(irow) = b(icol)
+        b(icol) = dum
+     endif
+
+     if (a(icol, icol) == ZERO ) then
+        call die("mre_gauss_jordan(): singular matrix 2")
+     endif
+
+     pivinv = ONE / a(icol, icol)
+     !!a(icol, icol) = ONE !! only needed for inv(a)
+
+     do l = 1, n
+        a(icol, l) = a(icol, l) * pivinv
+     enddo
+
+     b(icol) = b(icol) * pivinv
+
+     do ll = 1, n   ! subtract the scaled pivot row from every other row of a and b
+        if (ll /= icol) then
+           dum = a(ll, icol)
+           a(ll, icol) = ZERO
+           do l = 1, n
+              a(ll, l) = a(ll, l) - a(icol, l) * dum
+           enddo
+           b(ll) = b(ll) - b(icol) * dum
+        endif
+     enddo
+
+  enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/mtdagmt.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/mtdagmt.F90
new file mode 100644
index 0000000000000000000000000000000000000000..44e1323a953c59ed3eeebdb6865c41f1e64b4c67
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/mtdagmt.F90
@@ -0,0 +1,40 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! mtdagmt.F90 - -> see m_tilde.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine mtdagmt(out, in, para, conf)
+  ! Applies mtil to "in" and then mtil_dag to that result, which ends up in "out".
+  use typedef_hmc
+  use module_p_interface
+  use module_vol
+  implicit none
+
+  type(hmc_para), intent(in) :: para
+  type(hmc_conf), intent(in) :: conf
+
+  SPINCOL_FIELD :: out, in
+  P_SPINCOL_FIELD, save :: tmp
+
+  TIMING_START(timing_bin_mtdagmt)
+
+  ALLOCATE_SC_FIELD(tmp)
+
+  call mtil(tmp, in, para, conf)
+  call mtil_dag(out, tmp, para, conf)
+
+  TIMING_STOP(timing_bin_mtdagmt)
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-EXPLAINED.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-EXPLAINED.var
new file mode 100644
index 0000000000000000000000000000000000000000..cec45a8248829ba1fee2fbf339b0144c80da4fdd
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-EXPLAINED.var
@@ -0,0 +1,61 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-EXPLAINED.var
+#
+#-------------------------------------------------------------------------------
+# NOTE(review): multi-line values below lack line continuations, so this reads as an annotated template -- confirm.
+SHELL = /bin/ksh
+
+MODULES_FLAG = -I$(MODULES_DIR) # how to find "modules"
+                                # MODULES_DIR is set in every Makefile
+
+FPP = cpp -C -P # Fortran preprocessor / ANSI C preprocessor
+F90 = f90 # Fortran90 compiler
+CC = cc # ANSI C compiler
+AR = ar # ar command
+RANLIB = echo # "ranlib" if necessary
+
+MYFLAGS = -DTIMING # or to switch time measurement off
+          # must always be set for benchmarking
+          -DD3_BUFFER_VOL=24*24*12*12
+          # -sample for lattice='24 24 24 48',processes='1 1 2 4'
+          # - largest possible size of a local lattice boundary
+          # - determines the size of *static* arrays
+          # - only needed in libd3.a
+          # - should be set to 1 when using a different LIBD.
+          -D_OPENMP # has to be explicitly defined for OpenMP
+
+FFLAGS = -O3 # Fortran90 compiler flags
+         $(MODULES_FLAG)
+CFLAGS = -O3 # C compiler flags
+ARFLAGS = rv # ar flags
+
+LDFLAGS = # loader flags (the loader is: ${F90})
+SYSLIBS = # system libraries (e.g. for BLAS)
+
+FAST_MAKE = gmake -j 8 # parallel make
+
+CKSUM_O = cksum.o # do not change
+RANDOM_O = ran.o ranf.o # or "ran.o" on Crays
+UUU_O = uuu_f90.o # or "uuu_fwd.o uuu_bwd.o uuu_bwd_m.o" if C is
+                  # faster than Fortran90 (was a small effect on T3E)
+
+LIBD = libd.a # Multiplication with Wilson hopping term:
+              # libd.a: Cray T3E version (MPI or shmem)
+              # libd2.a: Hitachi SR8000 version (MPI)
+              # libd3.a: Hitachi SR8000 version (MPI+OpenMP)
+              # libd21.a: version for high scalability
+
+LIBCOMM = lib_mpi.a # or "lib_single_pe.a" or "lib_shmempi.a"
+
+LIBCLOVER = libclover.a # do not change
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-altix.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-altix.var
new file mode 100644
index 0000000000000000000000000000000000000000..baf01efce305d994685f8ed67ce8a0909ed49181
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-altix.var
@@ -0,0 +1,108 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 2005, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-altix.var - settings on SGI-Altix
+#
+#-------------------------------------------------------------------------------
+# Switches: an empty value leaves the matching ifdef block below inactive.
+timing = 1
+mpi =
+omp = 1
+shmem = 1
+shmempi = 1
+debug =
+libd = 2
+d3_buffer_vol = 32*32*16*16
+
+#-------------------------------------------------------------------------------
+
+SHELL = /bin/ksh
+
+FPP = mpif90 -g -E
+FPP2 = icc -E -C -P
+F90 = mpif90
+CC = mpicc
+AR = ar
+RANLIB = echo
+
+MODULES_FLAG = -I$(MODULES_DIR)
+
+MYFLAGS = -DINTEL -DALTIX
+FFLAGS_STD= $(MODULES_FLAG)
+CFLAGS_STD= -DNamesToLower_
+ARFLAGS = rv
+
+LDFLAGS = -Vaxlib
+SYSLIBS =
+
+FAST_MAKE = gmake -j 8
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD =
+LIBCOMM = lib_single_pe.a
+LIBCLOVER = libclover.a
+
+#-------------------------------------------------------------------------------
+# Each block below refines the defaults above; later blocks override earlier ones.
+ifdef timing
+    MYFLAGS += -DTIMING
+endif
+
+ifdef mpi
+    LIBCOMM = lib_mpi.a
+endif
+
+ifdef omp
+    F90 += -openmp
+    MYFLAGS += -D_OPENMP
+endif
+
+ifdef shmem
+    LDFLAGS += -lsma
+    LIBCOMM = lib_shmem.a
+endif
+# NOTE(review): with shmem and shmempi both set, -lsma is appended twice and LIBCOMM ends as lib_shmempi.a.
+ifdef shmempi
+    LDFLAGS += -lsma
+    LIBCOMM = lib_shmempi.a
+endif
+
+ifdef debug
+    FFLAGS = -g -O0 $(FFLAGS_STD)
+    CFLAGS = -g -O0 $(CFLAGS_STD)
+else
+    FFLAGS = -O2 $(FFLAGS_STD)
+    CFLAGS = -O2 $(CFLAGS_STD)
+endif
+# libd selects the LIBD archive; only libd = 3 passes d3_buffer_vol through, the others use 1.
+ifeq ($(libd),1)
+    LIBD = libd.a
+    MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),2)
+    LIBD = libd2.a
+    MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),21)
+    LIBD = libd21.a
+    MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),3)
+    LIBD = libd3.a
+    MYFLAGS += -DD3_BUFFER_VOL='$(d3_buffer_vol)'
+endif
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-bgl.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-bgl.var
new file mode 100644
index 0000000000000000000000000000000000000000..5841de7a5d1c09511eba640cae719b0e93016b2d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-bgl.var
@@ -0,0 +1,105 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum
ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-bgl.var - settings on BlueGene/L
+#
+#-------------------------------------------------------------------------------
+# Switches: an empty value leaves the matching ifdef block below inactive.
+timing = 1
+debug =
+bits64 =
+libd = 2
+d3_buffer_vol = 32*32*16*16
+
+#-------------------------------------------------------------------------------
+
+SHELL = /bin/ksh
+
+FPP = /opt/ibmcmp/vac/7.0/bin/blrts_xlc -E -C -P
+F90 = /opt/ibmcmp/xlf/9.1/bin/blrts_xlf90 -qsuffix=f=f90
+CC = /opt/ibmcmp/vac/7.0/bin/blrts_xlc
+AR = ar
+RANLIB = echo
+
+BGLSYS = /bgl/BlueLight/ppcfloor/bglsys
+
+MODULES_FLAG = -I$(MODULES_DIR)
+
+MYFLAGS = -DIBM
+FFLAGS_STD= $(MODULES_FLAG) -I$(BGLSYS)/include
+CFLAGS_STD= -DLongLong -DNamesToLower
+ARFLAGS = rv
+
+LDFLAGS = -L$(BGLSYS)/lib
+SYSLIBS = -lmpich.rts -lfmpich.rts -lmsglayer.rts -lrts.rts -ldevices.rts
+
+FAST_MAKE = gmake -j 8
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD =
+LIBCOMM = lib_mpi.a
+LIBCLOVER = libclover.a
+
+#-------------------------------------------------------------------------------
+# NOTE(review): mpi and omp are tested below but are not listed among the switches at the top.
+ifdef timing
+    MYFLAGS += -DTIMING
+endif
+
+ifdef mpi
+    LIBCOMM = lib_mpi.a
+endif
+
+ifdef omp
+    F90 += -qsmp=omp
+    MYFLAGS += -D_OPENMP
+endif
+
+ifdef debug
+    FFLAGS = -g -qfullpath $(FFLAGS_STD)
+    CFLAGS = -g -qfullpath $(CFLAGS_STD)
+else
+    FFLAGS = -O3 -qhot $(FFLAGS_STD)
+    CFLAGS = -O2 $(CFLAGS_STD)
+endif
+
+ifdef bits64
+    F90 += -q64
+    CFLAGS += -q64
+    ARFLAGS += -X64
+else
+    LDFLAGS +=
+endif
+
+# NOTE(review): libd = 1 selects libd1.a here -- confirm that archive name exists in the build.
+ifeq ($(libd),1)
+    LIBD = libd1.a
+    MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),2)
+    LIBD = libd2.a
+    MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),21)
+    LIBD = libd21.a
+    MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),3)
+    LIBD = libd3.a
+    MYFLAGS += -DD3_BUFFER_VOL='$(d3_buffer_vol)'
+endif
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-cray.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-cray.var
new file mode 100644
index 0000000000000000000000000000000000000000..22aacaf584928dcf5fc43178b4da26caee0a25fe
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-cray.var
@@ -0,0 +1,45 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-cray.var - settings on Cray T3E
+#
+#-------------------------------------------------------------------------------
+# Fixed settings only; this file has no switch section and no conditionals.
+SHELL = /bin/ksh
+
+MODULES_FLAG = -p $(MODULES_DIR)
+
+FPP = cpp -C -P
+F90 = f90
+CC = cc
+AR = ar
+RANLIB = echo
+
+# "-M 801": suppress "unknown directive" messages (from OpenMP)
+# NOTE(review): FFLAGS below carries -g and no optimisation level -- confirm this is intended.
+MYFLAGS = -DTIMING -DD3_BUFFER_VOL=1
+FFLAGS = $(MODULES_FLAG) -g -M 801
+CFLAGS = -O3
+ARFLAGS = rv
+
+LDFLAGS =
+SYSLIBS =
+
+FAST_MAKE = NPROC=4 make
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o
+UUU_O = uuu_fwd.o uuu_bwd.o uuu_bwd_m.o
+
+LIBD = libd.a
+LIBCOMM = lib_mpi.a
+LIBCLOVER = libclover.a
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi-omp.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi-omp.var
new file mode 100644
index 0000000000000000000000000000000000000000..975c7e235ef3a96047908424f22d4b3f91b8af06
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi-omp.var
@@ -0,0 +1,44 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics
programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-hitachi-omp.var - settings on Hitachi SR8000
+# (fastest version: MPI +OpenMP)
+#
+#-------------------------------------------------------------------------------
+# OpenMP is enabled in three places here: -D_OPENMP in FPP, -omp in FFLAGS and -omp in LDFLAGS.
+SHELL = /bin/ksh
+
+MODULES_FLAG = -I$(MODULES_DIR)
+
+FPP = cpp -C -P -D_OPENMP
+F90 = mpif90
+CC = cc
+AR = ar
+RANLIB = echo
+
+MYFLAGS = -DTIMING -DD3_BUFFER_VOL=24*24*12*12
+FFLAGS = $(MODULES_FLAG) -Oss -pvdiag -par=2 -pardiag=1 -omp -procnum=8 -nosave -contarea=2
+CFLAGS = -DLongLong
+ARFLAGS = rv
+
+LDFLAGS = +BTLB -omp -rdma
+SYSLIBS = /usr/local/lib/liblrz.a -lf90c -lpl
+
+FAST_MAKE = JOBTYPE=SS prun -p IAPAR gmake -j 8
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD = libd3.a
+LIBCOMM = lib_mpi.a
+LIBCLOVER = libclover.a
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi.var
new file mode 100644
index 0000000000000000000000000000000000000000..24a06ad305c32af72d9827b01d70d41356ac1dfa
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hitachi.var
@@ -0,0 +1,43 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-hitachi.var - settings on Hitachi SR8000 (pure MPI version)
+#
+#-------------------------------------------------------------------------------
+# Pure MPI variant: no OpenMP flags, libd2.a and a D3_BUFFER_VOL of 1.
+SHELL = /bin/ksh
+
+MODULES_FLAG = -I$(MODULES_DIR)
+
+FPP = cpp -C -P
+F90 = mpif90
+CC = cc
+AR = ar
+RANLIB = echo
+
+MYFLAGS = -DTIMING -DD3_BUFFER_VOL=1
+FFLAGS = $(MODULES_FLAG) -opt=ss -par=0 -contarea=2
+CFLAGS = -DLongLong
+ARFLAGS = rv
+
+LDFLAGS = +BTLB
+SYSLIBS = /usr/local/lib/liblrz.a -lf90c -lpl
+
+FAST_MAKE = JOBTYPE=SS prun -p IAPAR gmake -j 8
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD = libd2.a
+LIBCOMM = lib_mpi.a
+LIBCLOVER = libclover.a
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hp.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hp.var
new file mode 100644
index 0000000000000000000000000000000000000000..a2050cfe4fdd0559ec513f6f1de864b193fbf9ca
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-hp.var
@@ -0,0 +1,45 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 2002-2003, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-hp.var - settings for HP-UX Fortran Compiler
+#
+#-------------------------------------------------------------------------------
+# The OpenMP options are commented out on the F90 and MYFLAGS lines below.
+SHELL = /bin/ksh
+
+MODULES_FLAG = -I$(MODULES_DIR)
+
+FPP = /opt/langtools/lbin/cpp.ansi -P
+F90 = mpif90 ####+Oopenmp
+CC = cc
+AR = ar
+RANLIB = ranlib
+
+MYFLAGS = -DTIMING -DD3_BUFFER_VOL=1 #####-D_OPENMP
+FFLAGS = $(MODULES_FLAG) \
+        +r8 +DD64 +DSnative +O3 +Ocache_pad_common \
+        +Olibcalls +Onolimit +Ofltacc=relaxed +FPD
+CFLAGS = -DLongLong -DNamesToLower_ +DD64 +DSnative
+ARFLAGS = rv
+
+LDFLAGS = +O3 +DD64 +DSnative +U77
+SYSLIBS = -L/opt/mlib/lib/hpux64 -lveclib
+
+FAST_MAKE = gmake -j 1
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD = libd.a
+LIBCOMM = lib_mpi.a
+LIBCLOVER = libclover.a
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-ibm.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-ibm.var
new file mode 100644
index 0000000000000000000000000000000000000000..8710c65d4cb392d964a86ebe422571a7c0d4adde
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-ibm.var
@@ -0,0 +1,116 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 2002-2006, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-ibm.var - settings on IBM
+#
+#-------------------------------------------------------------------------------
+# Switches read by ifdef / ifeq tests on these names; an empty value counts as not defined.
+timing = 1
+mpi = 1
+omp =
+debug =
+bits64 = 1
+libd = 2
+d3_buffer_vol = 24*24*12*12
+
+#-------------------------------------------------------------------------------
+
+SHELL = /bin/ksh
+
+FPP = /usr/ccs/lib/cpp -C -P
+F90 = xlf90_r -qsuffix=f=f90
+CC = cc_r
+AR = ar
+RANLIB = echo
+
+# suppressed messages:
+#
+# 1500-036 (I) The NOSTRICT option (default at OPT(3)) has the potential
+# to alter the semantics of a program
+#
+# 1516-092 (E) If a statement function expression contains a reference
+# to a function or a function dummy procedure, the
+# reference must not require an explicit interface or be
+# a transformational intrinsic
+
+MODULES_FLAG = -I$(MODULES_DIR)
+# The trailing "#:..." on FFLAGS_STD is a comment holding further message ids that are not suppressed.
+MYFLAGS = -DIBM
+FFLAGS_STD= $(MODULES_FLAG) -qsuppress=1500-036:1516-092 #:1501-510:1516-092:1514-008
+CFLAGS_STD= -DLongLong -DNamesToLower
+ARFLAGS = -r -v
+
+LDFLAGS =
+SYSLIBS =
+
+FAST_MAKE = gmake -j 8
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD =
+LIBCOMM = lib_single_pe.a
+LIBCLOVER = libclover.a
+
+#-------------------------------------------------------------------------------
+# Each block adjusts compiler, flags or libraries according to one variable tested by ifdef or ifeq.
+ifdef timing
+    MYFLAGS += -DTIMING
+endif
+
+ifdef mpi
+    F90 = mpxlf90_r -qsuffix=f=f90
+    LIBCOMM = lib_mpi.a
+endif
+
+ifdef omp
+    F90 += -qsmp=omp
+    MYFLAGS += -D_OPENMP
+endif
+
+ifdef debug
+    FFLAGS = -g -qfullpath $(FFLAGS_STD)
+    CFLAGS = -g -qfullpath $(CFLAGS_STD)
+else
+    FFLAGS = -O3 $(FFLAGS_STD)
+    CFLAGS = -O2 $(CFLAGS_STD)
+endif
+# 64-bit mode adds -q64 / -X64; otherwise the loader gets explicit data and stack limits.
+ifdef bits64
+    F90 += -q64
+    CFLAGS += -q64
+    ARFLAGS += -X64
+else
+    LDFLAGS += -bmaxdata:2000000000 -bmaxstack:250000000
+endif
+
+# NOTE(review): libd = 1 selects libd1.a here -- confirm that archive name exists in the build.
+ifeq ($(libd),1)
+    LIBD = libd1.a
+    MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),2)
+    LIBD = libd2.a
+    MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),21)
+    LIBD = libd21.a
+    MYFLAGS += -DD3_BUFFER_VOL=1
+endif
+
+ifeq ($(libd),3)
+    LIBD = libd3.a
+    MYFLAGS += -DD3_BUFFER_VOL='$(d3_buffer_vol)'
+endif
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-intel.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-intel.var
new file mode 100644
index 0000000000000000000000000000000000000000..e1d0443529a8d277e2acb4b5bade295e4e6c73cb
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-intel.var
@@ -0,0 +1,43 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 2002-2003, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-intel.var - settings for Intel Fortran Compiler
+#
+#-------------------------------------------------------------------------------
+# MODULES_FLAG is a fixed compiler option here; it does not reference MODULES_DIR.
+SHELL = /bin/ksh
+
+MODULES_FLAG = -cl,bqcd.pcl
+
+FPP = cpp -C -P
+F90 = ifc
+CC = gcc
+AR = ar
+RANLIB = echo
+
+MYFLAGS = -DINTEL -DTIMING -DD3_BUFFER_VOL=1
+FFLAGS = $(MODULES_FLAG)
+CFLAGS = -DLongLong -DNamesToLower_
+ARFLAGS = rv
+
+LDFLAGS = -Vaxlib
+SYSLIBS =
+
+FAST_MAKE = gmake
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD = libd.a
+LIBCOMM = lib_single_pe.a
+LIBCLOVER = libclover.a
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-nec.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-nec.var
new file mode 100644
index 0000000000000000000000000000000000000000..cfba0e88b8f2a10987e3f8e1ee7ac447d35fe20f
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-nec.var
@@ -0,0 +1,44 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-nec.var - settings on NEC SX-8
+#
+#-------------------------------------------------------------------------------
+# MODULES_FLAG is written without its leading dash here; FFLAGS below supplies the dash.
+SHELL = /bin/ksh
+
+MODULES_FLAG = I$(MODULES_DIR)
+
+FPP = cpp -C -P
+F90 = sxmpif90
+CC = sxmpic++
+AR = sxar
+RANLIB = echo
+
+MYFLAGS = -DTIMING -DD3_BUFFER_VOL=24*24*12*12
+
+FFLAGS = -$(MODULES_FLAG)
+CFLAGS = -DNamesToLower_ -DLongLong
+ARFLAGS = rv
+
+LDFLAGS =
+SYSLIBS =
+
+FAST_MAKE = make
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD = libd2.a
+LIBCOMM = lib_mpi.a
+LIBCLOVER = libclover.a
+
+#===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-sun.var b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-sun.var
new file mode 100644
index 0000000000000000000000000000000000000000..82f8a6f9029f08895c6f89a5efbd031f63d3eb4b
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/Makefile-sun.var
@@ -0,0 +1,45 @@
+#===============================================================================
+#
+# BQCD -- Berlin Quantum ChromoDynamics programme
+#
+# Author: Hinnerk Stueben
+#
+# Copyright (C) 1998-2001, Hinnerk Stueben, Zuse-Institut Berlin
+#
+#-------------------------------------------------------------------------------
+#
+# Makefile-sun.var - settings on Sun
+#
+#-------------------------------------------------------------------------------
+# Single-process build: LIBCOMM is lib_single_pe.a and D3_BUFFER_VOL is 1.
+SHELL = /bin/ksh
+
+MODULES_FLAG = -M$(MODULES_DIR)
+
+# use GNU C-preprocessor (well hidden on our Sun):
+
+FPP = /sw/sun4_56/egcs-1.1.2-2/lib/gcc-lib/sparc-sun-solaris2.6/egcs-2.91.66/cpp -C -P
+F90 = /opt/SUNWspro/bin/f90
+CC = gcc
+AR = ar
+RANLIB = echo
+
+MYFLAGS = -DTIMING -DD3_BUFFER_VOL=1
+FFLAGS = -O3 $(MODULES_FLAG)
+CFLAGS = -O3 -DNamesToLower_ -DLongLong
+ARFLAGS = rv
+
+LDFLAGS =
+SYSLIBS =
+
+FAST_MAKE = make
+
+CKSUM_O = cksum.o
+RANDOM_O = ran.o ranf.o
+UUU_O = uuu_f90.o
+
+LIBD = libd.a
+LIBCOMM = lib_single_pe.a
+LIBCLOVER = libclover.a
+
+#===============================================================================
\ No newline at end of file
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-EXPLAINED.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-EXPLAINED.F90
new file mode 100644
index 0000000000000000000000000000000000000000..5b2ef70328aceb5b96ad1bd051601aa94c361981
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-EXPLAINED.F90
@@ -0,0 +1,47 @@
+!===============================================================================
+!
+! service.F90
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch() ! exit with error status and shutdown parallel application
+                     ! can be mpi_abort(1) when using MPI
+  call errexit()
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden() ! returns CPU-seconds
+                            ! SECONDS has to be defined in "defs.h"
+  sekunden = tsecnd()
+end
+
+!-------------------------------------------------------------------------------
+! Arguments from the command line (the following works on many machines):
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status) ! iarg = 0 ==> command name
+                                              ! iarg > 0 ==> argument
+  implicit none
+  integer :: iarg, larg, status
+  character(len = *) :: arg
+  character(len(arg) + 1) :: a
+  ! the buffer is one character longer than arg, so larg can exceed len(arg)
+  call getarg(iarg, a)
+
+  larg = len_trim(a)
+  arg = a
+  status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc() ! ipxfargc = 0 ==> no arguments
+  ipxfargc = iargc() ! ipxfargc > 0 ==> number of arguments
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian() ! .false. on e.g. Intel
+  is_big_endian = .true.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-altix.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-altix.F90
new file mode 100644
index 0000000000000000000000000000000000000000..cd3e165e433b54bbc27bec6d2e2862eb7f1a73d9
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-altix.F90
@@ -0,0 +1,71 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2005, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+!
service-altix.F90 - calls to service routines on SGI-Altix
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()                 ! abort: ends this process with exit status 1
+
+  integer(4) status
+  status = 1
+  call exit(status)
+  ! disabled MPI variant follows; NOTE(review): exit() alone may leave other ranks running
+  !!include 'mpif.h'
+  !!integer errorcode, ierror
+  !!call mpi_abort(MPI_COMM_WORLD, errorcode, ierror)
+end
+
+!-------------------------------------------------------------------------------
+function rechner()  ! returns hostname
+
+  character(len = 20) rechner
+  character(len = 32) r              ! scratch buffer filled by hostnm
+
+  call hostnm(r)
+  rechner = r                        ! keeps the first 20 characters
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()          ! returns the value of mpi_wtime()
+
+  include 'mpif.h'
+
+  sekunden = mpi_wtime()
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)  ! command-line argument number iarg
+  ! out: arg = argument text, larg = its trimmed length, status = 0 (always)
+  implicit none
+  integer :: iarg, larg, status
+  character(len = *) :: arg
+  character(len(arg) + 1) :: a       ! one longer than arg, as in the other platform files
+  ! larg = len(arg) + 1 therefore means the argument did not fit into arg
+  call getarg(iarg, a)
+
+  larg = len_trim(a)                 ! length without trailing blanks
+  arg = a                            ! assignment truncates or blank-pads
+  status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()          ! number of command-line arguments
+  ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()     ! constant answer for this platform
+  is_big_endian = .false.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-bgl.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-bgl.F90
new file mode 100644
index 0000000000000000000000000000000000000000..38e13dbe956dd405104094bee16885626577ca99
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-bgl.F90
@@ -0,0 +1,72 @@
+!===============================================================================
+!
+!
BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-bgl.F90 - calls to service routines on BlueGene/L
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()                 ! abort: mpi_abort on the world communicator, code 1
+
+  implicit none
+  include 'mpif.h'
+
+  integer ierror
+
+  call mpi_abort(MPI_COMM_WORLD, 1, ierror)
+end
+
+!-------------------------------------------------------------------------------
+function rechner()  ! returns hostname
+
+  character(len = 20) rechner
+
+  rechner = "jubl"                   ! fixed name, no system call is made
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()          ! timebase counter converted to seconds
+
+  integer(8), external :: rts_get_timebase
+  real(8), parameter :: speed = ONE / 700000000.0_8  ! presumably seconds per tick at 700 MHz
+
+  sekunden = rts_get_timebase() * speed
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)  ! command-line argument number iarg
+  ! out: arg = argument text, larg = its trimmed length, status = 0 (always)
+  implicit none
+  integer :: iarg, larg, status
+  integer(4) :: i                    ! 4-byte copy of iarg handed to getarg
+  character(len = *) :: arg
+  character(len(arg) + 1) :: a       ! scratch, one character longer than arg
+
+  i = iarg
+  call getarg(i, a)
+
+  larg = len_trim(a)                 ! can reach len(arg) + 1 if the text did not fit
+  arg = a                            ! assignment truncates or blank-pads
+  status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()          ! number of command-line arguments
+  integer(4) :: iargc                ! result type of the external iargc
+  ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()     ! constant answer for this platform
+  is_big_endian = .true.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-cray.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-cray.F90
new file mode 100644
index 0000000000000000000000000000000000000000..f7981db72b83e9a6be52318313ac9aca84379ac9
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-cray.F90
@@ -0,0 +1,70 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-cray.F90 - calls to service routines on Cray (header previously named service-ibm.F90)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()                 ! abort: ends this process with exit status 1
+
+  integer(4) status
+  status = 1
+  call exit(status)
+end
+
+!-------------------------------------------------------------------------------
+function rechner()  ! returns hostname
+
+  character(len = 20) rechner
+  character(len = 32) r              ! scratch buffer filled by hostnm
+
+  call hostnm(r)
+  rechner = r                        ! keeps the first 20 characters
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()          ! NOTE(review): confirm that rtc() yields seconds on the target Cray
+
+  real(8) rtc
+
+  sekunden = rtc()   !!!mclock() * 0.01
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)  ! command-line argument number iarg
+  ! out: arg = argument text, larg = its trimmed length, status = 0 (always)
+  implicit none
+  integer :: iarg, larg, status
+  integer(4) :: i                    ! 4-byte copy of iarg handed to getarg
+  character(len = *) :: arg
+  character(len(arg) + 1) :: a       ! scratch, one character longer than arg
+
+  i = iarg
+  call getarg(i, a)
+
+  larg = len_trim(a)                 ! can reach len(arg) + 1 if the text did not fit
+  arg = a                            ! assignment truncates or blank-pads
+  status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()          ! number of command-line arguments
+  integer(4) :: iargc                ! result type of the external iargc
+  ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()     ! NOTE(review): same value as service-ibm.F90; confirm for the target Cray
+  is_big_endian = .true.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi-omp.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi-omp.F90
new file mode 100644
index 0000000000000000000000000000000000000000..55704528a43a2ce05db1ba16ea2c2f0f43da0520
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi-omp.F90
@@ -0,0 +1,68 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-hitachi-omp.F90 - calls to service routines on HITACHI (same routines as service-hitachi.F90)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()                 ! abort: calls fexit with status 1
+
+  call fexit(1)
+end
+
+!-------------------------------------------------------------------------------
+function rechner()  ! returns hostname
+
+  character(len = 20) rechner
+
+  call hostnm(rechner)               ! hostnm writes straight into the result
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()          ! returns the value of dwalltime()
+
+  implicit none
+  ! disabled alternative based on xclock:
+!!real(8) d
+!!call xclock(d, 5)
+!!sekunden = d
+
+  real(8) dwalltime                  ! result type of the external dwalltime
+  sekunden = dwalltime()
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)  ! command-line argument number iarg
+  ! out: arg = argument text, larg = its trimmed length, status = 0 (always)
+  implicit none
+  integer :: iarg, larg, status
+  character(len = *) :: arg
+  character(len(arg) + 1) :: a       ! scratch, one character longer than arg
+
+  call getarg(iarg + 1, a)           ! index shifted by one; presumably getarg counts from 1 here
+
+  larg = len_trim(a)                 ! can reach len(arg) + 1 if the text did not fit
+  arg = a                            ! assignment truncates or blank-pads
+  status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()          ! number of command-line arguments
+  ipxfargc = iargc() - 1             ! one less than iargc(), matching the index shift in pxfgetarg
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()     ! constant answer for this platform
+  is_big_endian = .true.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi.F90
new file mode 100644
index 0000000000000000000000000000000000000000..55704528a43a2ce05db1ba16ea2c2f0f43da0520
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hitachi.F90
@@ -0,0 +1,68 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-hitachi.F90 - calls to service routines on HITACHI
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()                 ! abort: calls fexit with status 1
+
+  call fexit(1)
+end
+
+!-------------------------------------------------------------------------------
+function rechner()  ! returns hostname
+
+  character(len = 20) rechner
+
+  call hostnm(rechner)               ! hostnm writes straight into the result
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()          ! returns the value of dwalltime()
+
+  implicit none
+  ! disabled alternative based on xclock:
+!!real(8) d
+!!call xclock(d, 5)
+!!sekunden = d
+
+  real(8) dwalltime                  ! result type of the external dwalltime
+  sekunden = dwalltime()
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)  ! command-line argument number iarg
+  ! out: arg = argument text, larg = its trimmed length, status = 0 (always)
+  implicit none
+  integer :: iarg, larg, status
+  character(len = *) :: arg
+  character(len(arg) + 1) :: a       ! scratch, one character longer than arg
+
+  call getarg(iarg + 1, a)           ! index shifted by one; presumably getarg counts from 1 here
+
+  larg = len_trim(a)                 ! can reach len(arg) + 1 if the text did not fit
+  arg = a                            ! assignment truncates or blank-pads
+  status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()          ! number of command-line arguments
+  ipxfargc = iargc() - 1             ! one less than iargc(), matching the index shift in pxfgetarg
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()     ! constant answer for this platform
+  is_big_endian = .true.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hp.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hp.F90
new file mode 100644
index 0000000000000000000000000000000000000000..af019b707dab0d8d3ee8ac1c6ee8a2d6482b007c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-hp.F90
@@ -0,0 +1,68 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+!
Author: Hinnerk Stueben
+!
+! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-hp.F90 - calls to service routine with HP-UX Fortran Compiler
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()                 ! abort: ends this process with exit status 1
+
+  integer(4) status
+  status = 1
+  call exit(status)
+end
+
+!-------------------------------------------------------------------------------
+function rechner()  ! returns hostname
+
+  character(len = 20) rechner
+  character(len = 32) r              ! scratch buffer filled by hostnm
+
+  call hostnm(r)
+  rechner = r                        ! keeps the first 20 characters
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()          ! returns the value of walltime(0.0)
+
+  real, external :: walltime
+
+  sekunden = walltime(0.0)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)  ! command-line argument number iarg
+  ! out: arg = argument text, larg = its trimmed length, status = 0 (always)
+  implicit none
+  integer :: iarg, larg, status
+  character(len = *) :: arg
+  character(len(arg) + 1) :: a       ! one longer than arg, as in the other platform files
+  ! larg = len(arg) + 1 therefore means the argument did not fit into arg
+  call getarg(iarg, a)
+
+  larg = len_trim(a)                 ! length without trailing blanks
+  arg = a                            ! assignment truncates or blank-pads
+  status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()          ! number of command-line arguments
+  ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()     ! NOTE(review): confirm .false. for the HP-UX target
+  is_big_endian = .false.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-ibm.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-ibm.F90
new file mode 100644
index 0000000000000000000000000000000000000000..288e22355b448f65111abaada245ca0aadd7fc10
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-ibm.F90
@@ -0,0 +1,70 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-ibm.F90 - calls to service routines on IBM
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()                 ! abort: ends this process with exit status 1
+
+  integer(4) status
+  status = 1
+  call exit_(status)                 ! note the trailing underscore in the routine name
+end
+
+!-------------------------------------------------------------------------------
+function rechner()  ! returns hostname
+
+  character(len = 20) rechner
+  character(len = 32) r              ! scratch buffer filled by hostnm_
+
+  call hostnm_(r)
+  rechner = r                        ! keeps the first 20 characters
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()          ! returns the value of rtc()
+
+  real(8) rtc
+
+  sekunden = rtc()   !!!mclock() * 0.01
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)  ! command-line argument number iarg
+  ! out: arg = argument text, larg = its trimmed length, status = 0 (always)
+  implicit none
+  integer :: iarg, larg, status
+  integer(4) :: i                    ! 4-byte copy of iarg handed to getarg
+  character(len = *) :: arg
+  character(len(arg) + 1) :: a       ! scratch, one character longer than arg
+
+  i = iarg
+  call getarg(i, a)
+
+  larg = len_trim(a)                 ! can reach len(arg) + 1 if the text did not fit
+  arg = a                            ! assignment truncates or blank-pads
+  status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()          ! number of command-line arguments
+  integer(4) :: iargc                ! result type of the external iargc
+  ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()     ! constant answer for this platform
+  is_big_endian = .true.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-intel.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-intel.F90
new file mode 100644
index 0000000000000000000000000000000000000000..f60c35f249022f6d631797114418584d0247066a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-intel.F90
@@ -0,0 +1,65 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-intel.F90 - calls to service routine with Intel Fortran Compiler
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()                 ! abort: ends this process with exit status 1
+
+  integer(4) status
+  status = 1
+  call exit(status)
+end
+
+!-------------------------------------------------------------------------------
+function rechner()  ! returns hostname
+
+  character(len = 20) rechner
+  character(len = 32) r              ! scratch buffer filled by hostnm
+
+  call hostnm(r)
+  rechner = r                        ! keeps the first 20 characters
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()          ! processor time as reported by cpu_time
+
+  call cpu_time(sekunden)
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)  ! command-line argument number iarg
+  ! out: arg = argument text, larg = its trimmed length, status = 0 (always)
+  implicit none
+  integer :: iarg, larg, status
+  character(len = *) :: arg
+  character(len(arg) + 1) :: a       ! one longer than arg, as in the other platform files
+  ! larg = len(arg) + 1 therefore means the argument did not fit into arg
+  call getarg(iarg, a)
+
+  larg = len_trim(a)                 ! length without trailing blanks
+  arg = a                            ! assignment truncates or blank-pads
+  status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()          ! number of command-line arguments
+  ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()     ! constant answer for this platform
+  is_big_endian = .false.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-nec.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-nec.F90
new file mode 100644
index 0000000000000000000000000000000000000000..db903a6691780fd249e828b6a12e7b7e6b176e68
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-nec.F90
@@ -0,0 +1,65 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-nec.F90 - calls service routines on NEC SX-8
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()                 ! abort: ends this process with exit status 1
+
+  integer(4) status
+  status = 1
+  call exit(status)
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()          ! processor time as reported by cpu_time
+
+  call cpu_time(sekunden)
+end
+
+!-------------------------------------------------------------------------------
+function rechner()  ! returns hostname
+
+  character(len = 20) rechner
+  character(len = 32) r              ! scratch buffer filled by hostnm
+
+  call hostnm(r)
+  rechner = r                        ! keeps the first 20 characters
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)  ! command-line argument number iarg
+  ! out: arg = argument text, larg = its trimmed length, status = 0 (always)
+  implicit none
+  integer :: iarg, larg, status
+  character(len = *) :: arg
+  character(len(arg) + 1) :: a       ! scratch, one character longer than arg
+
+  call getarg(iarg, a)
+
+  larg = len_trim(a)                 ! can reach len(arg) + 1 if the text did not fit
+  arg = a                            ! assignment truncates or blank-pads
+  status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()          ! number of command-line arguments
+  ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()     ! constant answer for this platform
+  is_big_endian = .true.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-sun.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-sun.F90
new file mode 100644
index 0000000000000000000000000000000000000000..017d07bdd74355e3d0d1e386342c92a7f936c44c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/platform/service-sun.F90
@@ -0,0 +1,64 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! service-sun.F90 - calls to service routines on SUN
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine abbruch()                 ! abort: ends this process with exit status 1
+
+  call exit(1)
+end
+
+!-------------------------------------------------------------------------------
+function rechner()  ! returns hostname
+
+  character(len = 20) rechner
+  ! NOTE(review): no implicit none, so i is implicitly integer and hostnm implicitly real
+  i = hostnm(rechner)                ! function form; the returned status is not checked
+end
+
+!-------------------------------------------------------------------------------
+SECONDS function sekunden()          ! first element of the etime result
+
+  real time(2)
+
+  call etime(time)
+  sekunden = time(1)                 ! presumably the user CPU time; confirm against the etime docs
+end
+
+!-------------------------------------------------------------------------------
+subroutine pxfgetarg(iarg, arg, larg, status)  ! command-line argument number iarg
+  ! out: arg = argument text, larg = its trimmed length, status = 0 (always)
+  implicit none
+  integer :: iarg, larg, status
+  character(len = *) :: arg
+  character(len(arg) + 1) :: a       ! scratch, one character longer than arg
+
+  call getarg(iarg, a)
+
+  larg = len_trim(a)                 ! can reach len(arg) + 1 if the text did not fit
+  arg = a                            ! assignment truncates or blank-pads
+  status = 0
+end
+
+!-------------------------------------------------------------------------------
+integer function ipxfargc()          ! number of command-line arguments
+  ipxfargc = iargc()
+end
+
+!-------------------------------------------------------------------------------
+logical function is_big_endian()     ! NOTE(review): Makefile-sun.var uses a sparc-sun-solaris cpp; confirm .false.
+  is_big_endian = .false.
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/polyakov_loop.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/polyakov_loop.F90
new file mode 100644
index 0000000000000000000000000000000000000000..015be3504883e1cfd546382078bed8385143badf
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/polyakov_loop.F90
@@ -0,0 +1,98 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics programme
+!
+! Author: Hinnerk Stueben
+!
+!
Copyright (C) 2000-2006, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! polyakov_loop.F90 - in gamma_4-direction, requires (NPE(gamma_4) == 1)
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine polyakov_loop(conf, traj, i_ensemble1, i_ensemble2)
+  ! For every (x,y,z): folds the direction-4 links over t into u, then adds the trace of u to pl.
+  use typedef_hmc
+  use module_decomp
+  use module_function_decl
+  use module_vol
+  implicit none
+
+  integer, intent(in) :: traj, i_ensemble1, i_ensemble2   ! only written to the record line
+  type(hmc_conf), intent(in) :: conf
+
+  COMPLEX :: pl
+  REAL :: re_pl, im_pl
+  integer :: x, y, z, t, i, eo, j(DIM)
+  integer :: nx, ny, nz, nt, dir4, npe4
+  integer, external :: ieo, e_o, std_xyzt2i
+  SU3 :: u
+
+  SU3, parameter :: su3_one = reshape( &
+       (/ ONE,ZERO,ZERO, &
+          ZERO,ONE,ZERO, &
+          ZERO,ZERO,ONE /), &
+       (/ NCOL, NCOL /))
+
+  character(len=*), parameter :: key_pl = "%pl"
+  integer, save :: count = 0        ! number of calls so far
+
+  ! the column header is written on the first call only (count == 1 below)
+  count = count + 1
+
+  dir4 = decomp%direction(4)
+  npe4 = decomp%act%npe(dir4)
+
+  if (npe4 /= 1) then               ! the t loop below runs over local sites only
+     call die("polyakov_loop(): gamma_4 direction must not be decomposed")
+  endif
+
+  nx = decomp%std%N(1)
+  ny = decomp%std%N(2)
+  nz = decomp%std%N(3)
+  nt = decomp%std%N(4)
+
+  pl = 0
+  !$omp parallel do reduction(+: pl) private(x, y, z, t, i, j, eo, u)
+  do x = 0, nx - 1
+     do y = 0, ny - 1
+        do z = 0, nz - 1
+
+           u = su3_one              ! start from the unit matrix
+           do t = 0, nt - 1
+              j = (/x, y, z, t/)
+
+              i = std_xyzt2i(j)     ! site index of j
+              eo = e_o(j)           ! second index into conf%u; presumably the even/odd label of j
+
+              call u_update2(u, conf%u(1, 1, i, eo, 4))   ! presumably u = u * link; confirm in u_update2
+           enddo
+
+           pl = pl + u(1,1) + u(2,2) + u(3,3)             ! add the trace of u
+        enddo
+     enddo
+  enddo
+
+  call global_sum_vec(SIZE_COMPLEX, pl)
+  ! normalisation; presumably colours times the global spatial volume (confirm meaning of L)
+  pl = pl / (THREE * decomp%std%L(1) * decomp%std%L(2) * decomp%std%L(3))
+
+  re_pl = Re(pl)
+  im_pl = Im(pl)
+
+  if (my_pe() == 0) then            ! only PE 0 writes
+     if (count == 1) write(UREC, 400) &
+          "T", key_pl, "traj", "e", "f", "Re(Polyakov_Loop)", "Im(Polyakov_Loop)"
+
+     write(UREC, 410) key_pl, traj, i_ensemble1, i_ensemble2, re_pl, im_pl
+  endif
+
+
+400 format (1x, 2a, a6, 2a3, 2a20)
+410 format (1x, a4, i6, 2i3, 2g20.10)
+
+end
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/ran.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/ran.F90
new file mode 100644
index 0000000000000000000000000000000000000000..85b241f3f38cfde13470a63c84a7d5ad4d365683
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/ran.F90
@@ -0,0 +1,207 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! ran.F90 - random number related routines
+!
+!-------------------------------------------------------------------------------
+# include "defs.h"
+
+!-------------------------------------------------------------------------------
+subroutine ran_gauss_volh(m, ran, var, eo)
+
+! initializes ran with Gaussian random numbers in a manner that makes
+! results independent of the lattice decomposition
+
+  use module_decomp
+  use module_function_decl
+  use module_vol
+  implicit none
+
+  integer, intent(in) :: m                ! first extent of ran (numbers per site)
+  COMPLEX, intent(out) :: ran(m, volh)
+  REAL, intent(in) :: var                 ! enters as sqrt(-2 * var * log(..)) below
+  integer, intent(in) :: eo               ! second index of decomp%act%i
+
+  integer, dimension (DIM) :: i_pe, block
+  integer :: i, ii, x, y, z, t, j
+  integer :: nxh, nx, ny, nz, nt, lx, ly, lz, npe(DIM)
+  REAL :: twovar, phi, r
+
+  twovar = TWO * var
+
+  i_pe = decomp%std%i_pe
+  npe = decomp%std%NPE
+
+  nxh = decomp%std%NH(1)
+
+  nx = decomp%std%N(1)
+  ny = decomp%std%N(2)
+  nz = decomp%std%N(3)
+  nt = decomp%std%N(4)
+
+  lx = decomp%std%L(1)
+  ly = decomp%std%L(2)
+  lz = decomp%std%L(3)
+
+  ! block() contains the number of random numbers to be skipped.
+  ! In the calculation of block() a factor 1/2 from "even/odd volume"
+  ! cancels with 2 from "two random numbers per site".
+
+  block(1) = m * nx
+  block(2) = m * lx * ny
+  block(3) = m * lx * ly * nz
+  block(4) = m * lx * ly * lz * nt
+  ! In each direction: skip what precedes this PE, draw the local part, skip what follows.
+  i = 0
+  call ranskip(block(4) * i_pe(4))
+  do t = 0, nt - 1
+     call ranskip(block(3) * i_pe(3))
+     do z = 0, nz - 1
+        call ranskip(block(2) * i_pe(2))
+        do y = 0, ny - 1
+           call ranskip(block(1) * i_pe(1))
+           do x = 0, nxh - 1
+              i = i + 1
+              ii = decomp%act%i(i, eo)     ! storage position of site number i
+              do j = 1, m
+                 r = sqrt(-twovar * log(ranf()))      ! radius from the first uniform number
+                 phi = TWOPI * ranf()                 ! angle from the second uniform number
+                 ran(j, ii) = cmplx(r * cos(phi), r * sin(phi))
+              enddo
+           enddo
+           call ranskip(block(1) * (npe(1) - 1 - i_pe(1)))
+        enddo
+        call ranskip(block(2) * (npe(2) - 1 - i_pe(2)))
+     enddo
+     call ranskip(block(3) * (npe(3) - 1 - i_pe(3)))
+  enddo
+  call ranskip(block(4) * (npe(4) - 1 - i_pe(4)))
+
+
+CONTAINS
+
+  subroutine ranskip(n)             ! advance the generator by n numbers
+    integer :: n
+    SEED :: seed, n_skip
+
+    n_skip = n                      ! converted to the kind that ranset expects
+    call ranget(seed)
+    call ranset(seed, n_skip)
+  end subroutine ranskip
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine rancheck()  ! checks if seed is equal on all PEs
+
+  use module_function_decl
+  implicit none
+  SEED seed
+
+  call ranget(seed)
+  call seed_compare(seed)
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine write_ran()  ! save state of random number generator
+
+  use module_function_decl
+  implicit none
+  SEED :: seed
+  FILENAME, external :: ran_file
+
+  call rancheck()
+  call ranget(seed)
+  if (my_pe() == 0) then            ! only PE 0 writes the file
+     open(URAN, file = ran_file(), action = "write")
+     write(URAN, *) seed
+     close(URAN)
+  endif
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine random_sequence(r, n)  ! random permutation of [1..n]
+  ! NOTE(review): n = 0 is not handled; the final r(n) = seq(1) would then be out of bounds
+  use module_function_decl
+  implicit none
+  integer, intent(in) :: n
+  integer, intent(out) :: r(n)
+  integer :: ran, seq(n), i, j, len_seq
+  integer :: ceiling                ! declares the result type of ceiling()
+
+  len_seq = n                       ! number of values not yet drawn
+
+  do i = 1, len_seq
+     seq(i) = i
+  enddo
+
+  do i = 1, n - 1
+     ran = ceiling(len_seq * ranf())       ! position among the remaining values
+     if (ran <= 0 .or. ran > len_seq) stop "random_sequence(): ran out of range"
+     r(i) = seq(ran)
+     do j = ran, len_seq - 1               ! close the gap left by the drawn value
+        seq(j) = seq(j + 1)
+     enddo
+     len_seq = len_seq - 1
+  enddo
+  r(n) = seq(1)                            ! the single value that is left
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine get_a_random_seed(seed)  ! (try to) generate a random seed
+
+  use module_function_decl
+  implicit none
+  SEED :: seed
+  integer :: count, rate, rate_10sec
+  integer :: pe, ierror             ! declared but not used below
+
+  if (my_pe() == 0) then            ! only PE 0 reads the clock
+     call system_clock(count = count, count_rate = rate)
+
+     if (rate <= 0) call die("get_a_random_seed(): failed")
+
+     rate_10sec = rate * 10         ! clock ticks in ten seconds
+     seed = mod(count, rate_10sec)
+  endif
+
+  call seed_broadcast(seed)         ! presumably distributes the seed of PE 0 to all PEs
+end
+
+!-------------------------------------------------------------------------------
+subroutine init_ran(para, flags)  ! initialize random number generator
+
+  use typedef_para
+  use typedef_flags
+  implicit none
+
+  type(type_para) :: para
+  type(type_flags) :: flags
+  FILENAME, external :: ran_file
+  SEED :: seed, null
+
+  if (flags%continuation_job) then  ! continue from the seed stored in the file
+     open(URAN, file = ran_file(), action = "read", status = "old")
+     read(URAN, *) para%seed
+     close(URAN)
+  else
+     seed = para%seed
+     if (seed < 0) call get_a_random_seed(para%seed)   ! negative seed: derive one from the clock
+  endif
+
+  null = 0
+  call ranset(para%seed, null)      ! install the seed with a skip count of zero
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/ranf.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf.F90
new file mode 100644
index 0000000000000000000000000000000000000000..3b3a8216d3642bd9c558a910298b587d15f377bd
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf.F90
@@ -0,0 +1,112 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! ranf.F90 - Fortran90 implementation of the Cray random number generator ranf()
+!
+!-------------------------------------------------------------------------------
+module module_ranf
+  ! generator state (seed) and constants; the multiplier is held as two 24-bit halves
+  integer, parameter :: ikind = 8
+  integer, parameter :: jkind = 8
+  integer, parameter :: rkind = 8
+
+  integer(ikind), parameter :: mH = 2651554_ikind            ! upper 24 bits of the multiplier
+  integer(ikind), parameter :: mL = 15184245_ikind           ! lower 24 bits of the multiplier
+  integer(ikind), parameter :: mask24 = 16777215_ikind       ! 2**24 - 1
+  integer(ikind), parameter :: mask48 = 281474976710655_ikind   ! 2**48 - 1
+  integer(ikind), parameter :: default_seed = 48131768981101_ikind
+  integer(ikind), save :: seed = default_seed
+
+! integer(ikind), save :: mH
+! integer(ikind), save :: mL
+! integer(ikind), save :: mask24
+! integer(ikind), save :: mask48
+! integer(ikind), save :: default_seed
+! integer(ikind), save :: seed
+!
+! data mH /o"12072642"/
+! data mL /o"71730565"/
+! data mask24 /o"77777777"/
+! data mask48 /o"7777777777777777"/
+! data default_seed /o"1274321477413155"/
+! data seed /o"1274321477413155"/
+
+end
+
+!-------------------------------------------------------------------------------
+function ranf()
+  ! advances the seed by one step and returns it scaled by 2**(-48)
+  use module_ranf
+  implicit none
+  real(rkind) :: ranf
+  integer(ikind) :: seedH, seedL
+
+  ! seed = mod(m * seed, 2**48):
+
+  seedH = iand(mask24, ishft(seed, -24_jkind))   ! upper 24 bits of the seed
+  seedL = iand(mask24, seed)                     ! lower 24 bits of the seed
+  seed = iand(mask48, seedL * mL + ishft(seedL * mH + seedH * mL, 24_jkind))
+
+  ! normalize result:
+
+  ranf = real(seed, kind = rkind)
+  ranf = set_exponent(fraction(ranf), exponent(ranf) - 48_jkind)   ! lowers the binary exponent by 48
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine ranget(seed_out)
+  ! returns the current seed without changing it
+  use module_ranf
+  implicit none
+  integer(ikind), intent(out) :: seed_out
+
+  seed_out = seed
+
+end
+
+!-------------------------------------------------------------------------------
+subroutine ranset(seed_in, n_skip)
+  ! sets the seed (seed_in == 0 selects default_seed), then advances it by n_skip steps
+  use module_ranf
+  implicit none
+  integer(ikind), intent(in) :: seed_in, n_skip
+  integer(ikind) :: n, mm, mmH, mmL, seedH, seedL
+
+  if (seed_in == 0_ikind) then
+     seed = default_seed
+  else
+     seed = iand(mask48, ibset(seed_in, 0_jkind))   ! bit 0 is set (odd seed), then cut to 48 bits
+  endif
+
+  ! skip "n_skip" seeds [i.e. calculate seed = mod(m**n_skip * seed, 2**48)]:
+
+  n = iand(mask48, n_skip)          ! skip count reduced to 48 bits
+  mmH = mH
+  mmL = mL
+  ! one pass per bit of n: multiply where the bit is set, then square the multiplier
+  do while (n > 0_ikind)
+
+     if (btest(n, 0_ikind)) then  ! seed = seed * mm
+        seedH = iand(mask24, ishft(seed, -24_jkind))
+        seedL = iand(mask24, seed)
+        seed = iand(mask48, seedL*mmL + ishft(seedL*mmH + seedH*mmL, 24_jkind))
+     endif
+
+     mm = iand(mask48, mmL * mmL + ishft(mmH * mmL, 25_jkind))  ! mm = mm * mm
+     mmH = iand(mask24, ishft(mm, -24_jkind))
+     mmL = iand(mask24, mm)
+
+     n = ishft(n, -1_jkind)         ! move on to the next bit of n
+  enddo
+
+end
+
+!===============================================================================
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.F90
new file mode 100644
index 0000000000000000000000000000000000000000..e87ba91aa5faa7fd9580607010cb9f0574de3b11
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.F90
@@ -0,0 +1,82 @@
+!===============================================================================
+!
+! BQCD -- Berlin Quantum ChromoDynamics program
+!
+! Author: Hinnerk Stueben
+!
+! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin
+!
+!-------------------------------------------------------------------------------
+!
+! ranf_test - test of "ranf.F90"
+!
+!-------------------------------------------------------------------------------
+
+program ranf_test
+
+  implicit none
+  integer(8), parameter :: null = 0
+  integer(8) :: i, seed
+  real(8) :: x, ranf
+
+  write(6,400) "default seed:"
+
+  do i = 1, 10
+     call ranget(seed)
+     x = ranf()
+     write(6,410) i, seed, x
+  enddo
+
+  write(6,400) "seed = 4711:"
+
+  call ranset(4711_8, null)
+
+  do i = 1, 10
+     call ranget(seed)
+     x = ranf()
+     write(6,410) i, seed, x
+  enddo
+
+  write(6,400) "seed varies, no skip:"
+
+  do i = -10, 20
+     call ranset(i, null)
+     call ranget(seed)
+     x = ranf()
+     write(6,410) i, seed, x
+  enddo
+
+  write(6,400) "default seed, skip varies:"
+
+  do i = -10, 20
+     call ranset(null, i)
+     call ranget(seed)
+     x = ranf()
+     write(6,410) i, seed, x
+  enddo
+
+  write(6,400) "large seeds, no skip:"
+
+  do i = 0, 47
+     call ranset(2_8**i, null)
+     call ranget(seed)
+     x = ranf()
+     write(6,410) i, seed, x
+  enddo
+
+  write(6,400) "default seeds, big skips:"
+
+  do i = 0, 47
+     call ranset(null, 2_8**i)
+     call ranget(seed)
+     x = ranf()
+     write(6,410) i, seed, x
+  enddo
+
+
+400 format
(//1x,a//) +410 format (i6,i24,f24.16) + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.reference b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.reference new file mode 100644 index 0000000000000000000000000000000000000000..cd9731dca6ac5f2b10293d5a2749ab1d7b2b12ab --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/ranf_test.reference @@ -0,0 +1,208 @@ + + + default seed: + + + 1 48131768981101 0.5801136485795872 + 2 163287475723473 0.9505127349807658 + 3 267545549941893 0.7863714253306036 + 4 221343878630857 0.2976202640037293 + 5 83772656879069 0.4536999002984921 + 6 127705168870145 0.0062619416061871 + 7 1762579867765 0.2757364263838760 + 8 77612904194681 0.3056509438704786 + 9 86033092307533 0.6891007107498730 + 10 193964606509617 0.3826622386562981 + + + seed = 4711: + + + 1 4711 0.5499394951912784 + 2 154794206601235 0.1941472565046602 + 3 54647594503087 0.6685086396955562 + 4 188168453789179 0.7925054533613682 + 5 223070454027959 0.8427178850184980 + 6 237203997059235 0.0340008174584874 + 7 9570379302271 0.6038999714237825 + 8 169982730392075 0.9377837430456815 + 9 263962657233415 0.8757572329452152 + 10 246503746747443 0.5980755473195210 + + + seed varies, no skip: + + + -10 281474976710647 0.5775951060376308 + -9 281474976710647 0.5775951060376308 + -8 281474976710649 0.8936850824737128 + -7 281474976710649 0.8936850824737128 + -6 281474976710651 0.2097750589097949 + -5 281474976710651 0.2097750589097949 + -4 281474976710653 0.5258650353458769 + -3 281474976710653 0.5258650353458769 + -2 281474976710655 0.8419550117819590 + -1 281474976710655 0.8419550117819590 + 0 48131768981101 0.5801136485795872 + 1 1 0.1580449882180410 + 2 3 0.4741349646541231 + 3 3 0.4741349646541231 + 4 5 0.7902249410902051 + 5 5 0.7902249410902051 + 6 7 0.1063149175262872 + 7 7 0.1063149175262872 + 8 9 0.4224048939623692 + 9 9 0.4224048939623692 + 10 11 
0.7384948703984513 + 11 11 0.7384948703984513 + 12 13 0.0545848468345334 + 13 13 0.0545848468345334 + 14 15 0.3706748232706154 + 15 15 0.3706748232706154 + 16 17 0.6867647997066975 + 17 17 0.6867647997066975 + 18 19 0.0028547761427795 + 19 19 0.0028547761427795 + 20 21 0.3189447525788616 + + + default seed, skip varies: + + + -10 65394301920949 0.4024069778585933 + -9 113267494720953 0.0411126961321777 + -8 11572195186317 0.3413527205224831 + -7 96082249059185 0.7768798964931598 + -6 218672250772389 0.7323797526691216 + -5 206146573825897 0.4161318802907665 + -4 117130711313405 0.8580567658715417 + -3 241521508190113 0.9085015119759028 + -2 255720441925013 0.9242771334193982 + -1 260160884603417 0.1709983940440232 + 0 48131768981101 0.5801136485795872 + 1 163287475723473 0.9505127349807658 + 2 267545549941893 0.7863714253306036 + 3 221343878630857 0.2976202640037293 + 4 83772656879069 0.4536999002984921 + 5 127705168870145 0.0062619416061871 + 6 1762579867765 0.2757364263838760 + 7 77612904194681 0.3056509438704786 + 8 86033092307533 0.6891007107498730 + 9 193964606509617 0.3826622386562981 + 10 107709844713829 0.1329027054963809 + 11 37408785934377 0.8318579032090732 + 12 234147183932349 0.5829797958307417 + 13 164094224454241 0.0986253383374169 + 14 27760564811605 0.2765484551335682 + 15 77841469968089 0.6204460277969481 + 16 174640031224365 0.0835029668338088 + 17 23503995644817 0.9903771205956851 + 18 278766376954437 0.9793469434430655 + 19 275661658097289 0.6938844384181841 + 20 195311106143645 0.9344770142467986 + + + large seeds, no skip: + + + 0 1 0.1580449882180410 + 1 3 0.4741349646541231 + 2 5 0.7902249410902051 + 3 9 0.4224048939623692 + 4 17 0.6867647997066975 + 5 33 0.2154846111953539 + 6 65 0.2729242341726668 + 7 129 0.3878034801272925 + 8 257 0.6175619720365439 + 9 513 0.0770789558550469 + 10 1025 0.9961129234920527 + 11 2049 0.8341808587660644 + 12 4097 0.5103167293140878 + 13 8193 0.8625884704101345 + 14 16385 0.5671319526022280 + 15 32769 
0.9762189169864151 + 16 65537 0.7943928457547891 + 17 131073 0.4307407032915371 + 18 262145 0.7034364183650332 + 19 524289 0.2488278485120254 + 20 1048577 0.3396107088060099 + 21 2097153 0.5211764293939787 + 22 4194305 0.8843078705699163 + 23 8388609 0.6105707529217916 + 24 16777217 0.0630965176255422 + 25 33554433 0.9681480470330435 + 26 67108865 0.7782511058480459 + 27 134217729 0.3984572234780508 + 28 268435457 0.6388694587380606 + 29 536870913 0.1196939292580801 + 30 1073741825 0.0813428702981192 + 31 2147483649 0.0046407523781973 + 32 4294967297 0.8512365165383535 + 33 8589934593 0.5444280448586660 + 34 17179869185 0.9308111014992910 + 35 34359738369 0.7035772147805410 + 36 68719476737 0.2491094413430410 + 37 137438953473 0.3401738944680410 + 38 274877906945 0.5223028007180410 + 39 549755813889 0.8865606132180410 + 40 1099511627777 0.6150762382180410 + 41 2199023255553 0.0721074882180410 + 42 4398046511105 0.9861699882180410 + 43 8796093022209 0.8142949882180410 + 44 17592186044417 0.4705449882180410 + 45 35184372088833 0.7830449882180410 + 46 70368744177665 0.4080449882180410 + 47 140737488355329 0.6580449882180410 + + + default seeds, big skips: + + + 0 163287475723473 0.9505127349807658 + 1 267545549941893 0.7863714253306036 + 2 83772656879069 0.4536999002984921 + 3 86033092307533 0.6891007107498730 + 4 174640031224365 0.0835029668338088 + 5 164662163201517 0.9030257778452473 + 6 187456359032173 0.6504473584280426 + 7 247956895216749 0.1291985634769084 + 8 256822357742189 0.7782475093925463 + 9 15047453609581 0.7480693881996707 + 10 59815078594157 0.7411042261636034 + 11 36250749236845 0.3277774080605163 + 12 65623225914989 0.2811012722930322 + 13 17231226705517 0.8281668150128247 + 14 127417138878061 0.6480316574714529 + 15 67360884790893 0.5031963704648810 + 16 92173458085485 0.5752659087564247 + 17 158548977129069 0.1663654345582621 + 18 76826528323181 0.1364062830369370 + 19 181387589981805 0.2278551674942868 + 20 55158666827373 0.0162216864089864 + 21 
150146494895725 0.0148297242383855 + 22 41054988277357 0.6995457998971837 + 23 33978207573613 0.8189779512147801 + 24 19824646166125 0.0578422538499730 + 25 272992500061805 0.5355708591203587 + 26 216378254431853 0.4910280696611302 + 27 103149763171949 0.4019424907426732 + 28 158167757362797 0.2237713329057591 + 29 268203745744493 0.8674290172319310 + 30 206800745797229 0.1547443858842747 + 31 83994745902701 0.7293751231889622 + 32 119857722824301 0.8786365977983372 + 33 191583676667501 0.1771595470170872 + 34 53560607643245 0.7742054454545872 + 35 58989446305389 0.9682972423295872 + 36 69847123629677 0.3564808360795872 + 37 91562478278253 0.1328480235795872 + 38 134993187575405 0.6855823985795872 + 39 221854606169709 0.7910511485795872 + 40 114102466647661 0.0019886485795872 + 41 180073164314221 0.4238636485795872 + 42 30539582936685 0.2676136485795872 + 43 12947396892269 0.9551136485795872 + 44 259238001514093 0.3301136485795872 + 45 188869257336429 0.0801136485795872 + 46 48131768981101 0.5801136485795872 + 47 48131768981101 0.5801136485795872 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/sc.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/sc.F90 new file mode 100644 index 0000000000000000000000000000000000000000..47f85b02af90ccee3a894027c591284f6499918e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/sc.F90 @@ -0,0 +1,278 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! sc.F90 - routines for the Spin-Colour field +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +module module_sc_size + + integer :: sc_n_real ! 
number of real numbers of an sc-field + integer :: sc_n_complex ! number of complex numbers of an sc-field + +end + +!------------------------------------------------------------------------------- +subroutine init_module_sc_size() + + use module_sc_size + use module_vol + implicit none + + sc_n_complex = NDIRAC * NCOL * volh + sc_n_real = NDIRAC * NCOL * volh * SIZE_COMPLEX + +end + +!------------------------------------------------------------------------------- +subroutine sc_zero(out) + + use module_sc_size + implicit none + REAL, intent(out) :: out(*) + integer :: i + + TIMING_START(timing_bin_sc_zero) + + !$omp parallel do + do i = 1, sc_n_real + out(i) = ZERO + enddo + + TIMING_STOP(timing_bin_sc_zero) +end + +!------------------------------------------------------------------------------- +subroutine sc_copy(out, in) + + use module_sc_size + implicit none + REAL, intent(out) :: out(*) + REAL, intent(in) :: in(*) + integer :: i + + TIMING_START(timing_bin_sc_copy) + + !$omp parallel do + do i = 1, sc_n_real + out(i) = in(i) + enddo + + TIMING_STOP(timing_bin_sc_copy) +end + +!------------------------------------------------------------------------------- +subroutine sc_scale(inout, factor) + + use module_sc_size + implicit none + REAL, intent(inout) :: inout(*) + REAL, intent(in) :: factor + integer :: i + + TIMING_START(timing_bin_sc_scale) + + !$omp parallel do + do i = 1, sc_n_real + inout(i) = inout(i) * factor + enddo + + TIMING_STOP(timing_bin_sc_scale) +end + +!------------------------------------------------------------------------------- +subroutine sc_cax2(out, in1, a1, in2, a2) ! 
out = a1 * in1 + a2 * in2 + + use module_sc_size + implicit none + COMPLEX, intent(out) :: out(*) + COMPLEX, intent(in) :: in1(*), in2(*) + COMPLEX, intent(in) :: a1, a2 + integer :: i + + TIMING_START(timing_bin_sc_cax2) + + !$omp parallel do + do i = 1, sc_n_complex + out(i) = a1 * in1(i) + a2 * in2(i) + enddo + + TIMING_STOP(timing_bin_sc_cax2) +end + +!------------------------------------------------------------------------------- +subroutine sc_axpy(inout, in, a) ! inout = inout + a * in + + use module_sc_size + implicit none + REAL, intent(inout) :: inout(*) + REAL, intent(in) :: in(*) + REAL, intent(in) :: a + integer :: i + + TIMING_START(timing_bin_sc_axpy) + + !$omp parallel do + do i = 1, sc_n_real + inout(i) = inout(i) + a * in(i) + enddo + + TIMING_STOP(timing_bin_sc_axpy) +end + +!------------------------------------------------------------------------------- +subroutine sc_caxpy(inout, in, a) ! inout = inout + a * in + + use module_sc_size + implicit none + COMPLEX, intent(inout) :: inout(*) + COMPLEX, intent(in) :: in(*) + COMPLEX, intent(in) :: a + integer :: i + + TIMING_START(timing_bin_sc_caxpy) + + !$omp parallel do + do i = 1, sc_n_complex + inout(i) = inout(i) + a * in(i) + enddo + + TIMING_STOP(timing_bin_sc_caxpy) +end + +!------------------------------------------------------------------------------- +subroutine sc_caxpy2(inout, in1, a1, in2, a2) ! inout = inout + a1*in1 + a2*in2 + + use module_sc_size + implicit none + COMPLEX, intent(inout) :: inout(*) + COMPLEX, intent(in) :: in1(*), in2(*) + COMPLEX, intent(in) :: a1, a2 + integer :: i + + TIMING_START(timing_bin_sc_caxpy2) + + !$omp parallel do + do i = 1, sc_n_complex + inout(i) = inout(i) + a1 * in1(i) + a2 * in2(i) + enddo + + TIMING_STOP(timing_bin_sc_caxpy2) +end + +!------------------------------------------------------------------------------- +subroutine sc_xpby(inout, in, b) ! 
inout = b * inout + in + + use module_sc_size + implicit none + REAL, intent(inout) :: inout(*) + REAL, intent(in) :: in(*) + REAL, intent(in) :: b + integer :: i + + TIMING_START(timing_bin_sc_xpby) + + !$omp parallel do + do i = 1, sc_n_real + inout(i) = b * inout(i) + in(i) + enddo + + TIMING_STOP(timing_bin_sc_xpby) +end + +!------------------------------------------------------------------------------- +subroutine sc_axpby(inout, in, b, a) ! inout = b * inout + a * in + + use module_sc_size + implicit none + REAL, intent(inout) :: inout(*) + REAL, intent(in) :: in(*) + REAL, intent(in) :: b, a + integer :: i + + TIMING_START(timing_bin_sc_axpby) + + !$omp parallel do + do i = 1, sc_n_real + inout(i) = b * inout(i) + a * in(i) + enddo + + TIMING_STOP(timing_bin_sc_axpby) +end + +!------------------------------------------------------------------------------- +REAL function sc_norm2(in) ! Sum_i abs(in_i)**2 + + use module_sc_size + implicit none + REAL, intent(in) :: in(*) + REAL :: tmp + integer :: i + + TIMING_START(timing_bin_sc_norm2) + + tmp = ZERO + !$omp parallel do reduction(+: tmp) + do i = 1, sc_n_real + tmp = tmp + in(i)**2 + enddo + + sc_norm2 = tmp + + TIMING_STOP(timing_bin_sc_norm2) +end + +!------------------------------------------------------------------------------- +REAL function sc_dot(x, y) ! Sum_i [Re(x_i) * Re(y_i) + Im(x_i) * Im(y_i)] + + use module_sc_size + implicit none + REAL, intent(in) :: x(*), y(*) + REAL :: tmp + integer :: i + + TIMING_START(timing_bin_sc_dot) + + tmp = ZERO + !$omp parallel do reduction(+: tmp) + do i = 1, sc_n_real + tmp = tmp + x(i) * y(i) + enddo + + sc_dot = tmp + + TIMING_STOP(timing_bin_sc_dot) +end + +!------------------------------------------------------------------------------- +COMPLEX function sc_cdotc(x, y) ! 
Sum_i conjg(x_i) * y_i + + use module_sc_size + implicit none + COMPLEX, intent(in) :: x(*), y(*) + COMPLEX :: tmp + integer :: i + + TIMING_START(timing_bin_sc_cdotc) + + tmp = ZERO + !$omp parallel do reduction(+: tmp) + do i = 1, sc_n_complex + tmp = tmp + conjg(x(i)) * y(i) + enddo + + sc_cdotc = tmp + + TIMING_STOP(timing_bin_sc_cdotc) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/service.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/service.F90 new file mode 100644 index 0000000000000000000000000000000000000000..f7981db72b83e9a6be52318313ac9aca84379ac9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/service.F90 @@ -0,0 +1,70 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! service-ibm.F90 - calls to service routines on IBM +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine abbruch() + + integer(4) status + status = 1 + call exit(status) +end + +!------------------------------------------------------------------------------- +function rechner() ! 
returns hostname + + character(len = 20) rechner + character(len = 32) r + + call hostnm(r) + rechner = r +end + +!------------------------------------------------------------------------------- +SECONDS function sekunden() + + real(8) rtc + + sekunden = rtc() !!!mclock() * 0.01 +end + +!------------------------------------------------------------------------------- +subroutine pxfgetarg(iarg, arg, larg, status) + + implicit none + integer :: iarg, larg, status + integer(4) :: i + character(len = *) :: arg + character(len(arg) + 1) :: a + + i = iarg + call getarg(i, a) + + larg = len_trim(a) + arg = a + status = 0 +end + +!------------------------------------------------------------------------------- +integer function ipxfargc() + integer(4) :: iargc + ipxfargc = iargc() +end + +!------------------------------------------------------------------------------- +logical function is_big_endian() + is_big_endian = .true. +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/staple.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/staple.F90 new file mode 100644 index 0000000000000000000000000000000000000000..a06be5d95f528d6deda28562173fa1aa6f902bcc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/staple.F90 @@ -0,0 +1,67 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! staple.F90 - calculates sum of staples for one link +! 
+!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine staple(uuu, u, i, e, mu) + + use module_nn + use module_vol + implicit none + + SU3, intent(out) :: uuu + GAUGE_FIELD, intent(in) :: u + integer, intent(in) :: i, e, mu + integer :: o, nu, j1, j2, j3, j4 + + o = EVEN + ODD - e ! the other one of EVEN/ODD, assuming e is one of them + uuu = 0 ! presumably the sum that uuu_fwd/uuu_bwd accumulate into -- TODO confirm + + do nu = 1, DIM + if (nu /= mu) then + + ! (j2,o) --<-- x nu + ! | | + ! v ^ ^ + ! | | | + ! (i,e) -->-- (j1,o) x--> mu + ! | | + ! ^ v + ! | | + ! (j3,o) --<-- (j4,e) + + + j1 = nn(i, e, mu, FWD) + j2 = nn(i, e, nu, FWD) + j3 = nn(i, e, nu, BWD) + j4 = nn(j3,o, mu, FWD) + + if (j4 /= nn(j1, o, nu, BWD)) call die('staple(): j4 inconsistent') ! j4 via j3 must equal j4 via j1 + if (nn(j1, o, nu, FWD) /= nn(j2, o, mu, FWD)) & ! corner marked x must agree via j1 and via j2 + call die('staple(): j12 inconsistent') + + call uuu_fwd(uuu, u(1, 1, j1, o, nu), & + u(1, 1, j2, o, mu), & + u(1, 1, i, e, nu)) + + call uuu_bwd(uuu, u(1, 1, j4, e, nu), & + u(1, 1, j3, o, mu), & + u(1, 1, j3, o, nu)) + + endif + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/su3.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/su3.F90 new file mode 100644 index 0000000000000000000000000000000000000000..ad5cc768abdbd2f60aecfa362596150e5a9df702 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/su3.F90 @@ -0,0 +1,477 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! su3.F90 - SU(3) routines +!
+!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine gen2u(u, h) ! u := exp(i lambda_j h_j) + +! adapted from: + +! Program qcdf90, module generator_algebra, version 4.0.0 + +! Copyright by Indranil Dasgupta, Andrea R. Levi, Vittorio Lubicz +! and Claudio Rebbi - Boston University - January 1996 +! This program may be freely copied and used as long as this notice +! is retained. + + implicit none + + GENERATOR, intent(in) :: h + SU3, intent(out) :: u + + REAL :: p, q, a, alpha, l1, l2, l3, l12, s, aux, c, d, cs1, cs2 + REAL :: a8, a12, a45, a67 + GENERATOR :: h2, hs, hk, hs2 + COMPLEX :: ck1, ck2, ck3, ck4 + SU3 :: ms, mk + integer :: i, j, k + + integer, parameter :: rkind = RKIND + COMPLEX, parameter :: iu = (ZERO, ONE) + REAL, parameter :: eps = 0.00000001_rkind + + REAL, parameter :: sqrt33 = SQRT3 / THREE + REAL, parameter :: twosqrt33 = TWO * sqrt33 + +! h2=.Sq.h, inlined: + + a8 = h(8)*SQRT33 + a12 = h(1)**2+h(2)**2 + a45 = h(4)**2+h(5)**2 + a67 = h(6)**2+h(7)**2 + h2(1) = 2*h(1)*a8+h(4)*h(6)+h(5)*h(7) + h2(2) = 2*h(2)*a8+h(5)*h(6)-h(4)*h(7) + h2(3) = 2*h(3)*a8+HALF*(a45-a67) + h2(4) = h(4)*(h(3)-a8)+h(1)*h(6)-h(2)*h(7) + h2(5) = h(5)*(h(3)-a8)+h(1)*h(7)+h(2)*h(6) + h2(6) = h(6)*(-h(3)-a8)+h(1)*h(4)+h(2)*h(5) + h2(7) = h(7)*(-h(3)-a8)+h(1)*h(5)-h(2)*h(4) + h2(8) = (h(3)**2-h(8)**2+a12-HALF*(a45+a67))*SQRT33 + +! q = .Tr.h, inlined: + + q = h(1)**2 + DO i = 2,8 + q = q+h(i)**2 + END DO + q = TWO*q + +! 
p = (h*h2)/THREE, inlined: + + p = h(1)*h2(1) + DO i = 2,8 + p = p+h(i)*h2(i) + END DO + p = TWO*p/THREE + + a = SQRT(TWO*q/THREE) + alpha = ACOS(FOUR*p/a**3)/THREE + IF(alpha <= PI/6) THEN + l1 = a*COS(alpha) + l2 = a*COS(alpha+2*PI/3) + ELSE + l2 = a*COS(alpha) + l1 = a*COS(alpha+2*PI/3) + ENDIF + l3 = -l1-l2 + + l12 = l1*l2 + s = -l1-2*l2 + + aux = (TWO*l3*l3+l12)*(l1-l2) + c = s*(l3*l3+TWO*l12)/aux + d = -THREE*s*l3/aux + +! hs = c*h+d*h2, and +! hk = h-hs, inlined: + + DO i = 1,8 + hs(i) = c*h(i)+d*h2(i) + hk(i) = h(i)-hs(i) + END DO + +! hs2 = .Sq.hs, inlined: + + a8 = hs(8)*SQRT33 + a12 = hs(1)**2+hs(2)**2 + a45 = hs(4)**2+hs(5)**2 + a67 = hs(6)**2+hs(7)**2 + hs2(1) = 2*hs(1)*a8+hs(4)*hs(6)+hs(5)*hs(7) + hs2(2) = 2*hs(2)*a8+hs(5)*hs(6)-hs(4)*hs(7) + hs2(3) = 2*hs(3)*a8+HALF*(a45-a67) + hs2(4) = hs(4)*(hs(3)-a8)+hs(1)*hs(6)-hs(2)*hs(7) + hs2(5) = hs(5)*(hs(3)-a8)+hs(1)*hs(7)+hs(2)*hs(6) + hs2(6) = hs(6)*(-hs(3)-a8)+hs(1)*hs(4)+hs(2)*hs(5) + hs2(7) = hs(7)*(-hs(3)-a8)+hs(1)*hs(5)-hs(2)*hs(4) + hs2(8) = (hs(3)**2-hs(8)**2+a12-HALF*(a45+a67))*SQRT33 + + IF(ABS(s) > eps) THEN + cs1 = SIN(s)/s + cs2 = (COS(s)-1)/s**2 + ELSE + cs1 = 1 ! limit of SIN(s)/s for s -> 0 + cs2 = -HALF ! limit of (COS(s)-1)/s**2 for s -> 0 + ENDIF + + ck1 = EXP(IU*l3) + ck2 = 1/ck1**2 + ck3 = (ck2+2*ck1)/3 + IF(ABS(l3) > eps) THEN + ck4 = (ck1-ck2)/(3*l3) + ELSE + ck4 = IU ! limit of (ck1-ck2)/(3*l3) for l3 -> 0: ck1-ck2 = 3*IU*l3 + O(l3**2) + ENDIF + +! aux = .Tr.hs, inlined: + + aux = hs(1)**2 + DO i = 2,8 + aux = aux+hs(i)**2 + END DO + aux = TWO*aux + +!
ms = UNIT+IU*cs1*(.Matrix.hs.)+cs2*(.Matrix.hs)*(.Matrix.hs), inlined: + + ms(1,1) = ONE+cs2*aux/THREE & + +CMPLX(cs2*(hs2(3)+SQRT33*hs2(8)),cs1*(hs(3)+SQRT33*hs(8)),RKIND) + ms(2,2) = ONE+cs2*aux/THREE & + +CMPLX(cs2*(-hs2(3)+SQRT33*hs2(8)),cs1*(-hs(3)+SQRT33*hs(8)),RKIND) + ms(3,3) = ONE+cs2*aux/THREE & + +CMPLX(-cs2*TWOSQRT33*hs2(8),-cs1*TWOSQRT33*hs(8),RKIND) + ms(1,2) = CMPLX(cs2*hs2(1)+cs1*hs(2),cs1*hs(1)-cs2*hs2(2),RKIND) + ms(2,1) = CMPLX(cs2*hs2(1)-cs1*hs(2),cs1*hs(1)+cs2*hs2(2),RKIND) + ms(1,3) = CMPLX(cs2*hs2(4)+cs1*hs(5),cs1*hs(4)-cs2*hs2(5),RKIND) + ms(3,1) = CMPLX(cs2*hs2(4)-cs1*hs(5),cs1*hs(4)+cs2*hs2(5),RKIND) + ms(2,3) = CMPLX(cs2*hs2(6)+cs1*hs(7),cs1*hs(6)-cs2*hs2(7),RKIND) + ms(3,2) = CMPLX(cs2*hs2(6)-cs1*hs(7),cs1*hs(6)+cs2*hs2(7),RKIND) + +! mk = ck3*UNIT+ck4*(.Matrix.hk), inlined: + + mk(1,1) = ck3+ck4*(hk(3)+SQRT33*hk(8)) + mk(2,2) = ck3+ck4*(-hk(3)+SQRT33*hk(8)) + mk(3,3) = ck3-ck4*TWOSQRT33*hk(8) + mk(1,2) = ck4*CMPLX(hk(1),-hk(2),RKIND) + mk(2,1) = ck4*CMPLX(hk(1),hk(2),RKIND) + mk(1,3) = ck4*CMPLX(hk(4),-hk(5),RKIND) + mk(3,1) = ck4*CMPLX(hk(4),hk(5),RKIND) + mk(2,3) = ck4*CMPLX(hk(6),-hk(7),RKIND) + mk(3,2) = ck4*CMPLX(hk(6),hk(7),RKIND) + +! u = ms*mk, inlined: + + DO i = 1,3 + DO j = 1,3 + u(i,j) = ms(i,1)*mk(1,j) + DO k = 2,3 + u(i,j) = u(i,j)+ms(i,k)*mk(k,j) + END DO + END DO + END DO + +END + +!------------------------------------------------------------------------------- +subroutine im_tr_j(p, u, s) ! 
p(j) := p(j) + s * Im Tr(lambda_j U) + + implicit none + + GENERATOR :: p + SU3 :: u + REAL :: s + + p(1) = p(1) + s * (Im(u(1, 2)) + Im(u(2, 1))) + p(2) = p(2) + s * (Re(u(1, 2)) - Re(u(2, 1))) + p(3) = p(3) + s * (Im(u(1, 1)) - Im(u(2, 2))) + p(4) = p(4) + s * (Im(u(1, 3)) + Im(u(3, 1))) + p(5) = p(5) + s * (Re(u(1, 3)) - Re(u(3, 1))) + p(6) = p(6) + s * (Im(u(2, 3)) + Im(u(3, 2))) + p(7) = p(7) + s * (Re(u(2, 3)) - Re(u(3, 2))) + p(8) = p(8) + s * (Im(u(1, 1)) + Im(u(2, 2)) - TWO * Im(u(3, 3))) / SQRT3 + +end + +!------------------------------------------------------------------------------- +subroutine re_tr_j(p, u, s) ! p(j) := p(j) + s * Re Tr(lambda_j U) + + implicit none + + GENERATOR :: p + SU3 :: u + REAL :: s + + p(1) = p(1) + s * (Re(u(1, 2)) + Re(u(2, 1))) + p(2) = p(2) + s * (Im(u(2, 1)) - Im(u(1, 2))) + p(3) = p(3) + s * (Re(u(1, 1)) - Re(u(2, 2))) + p(4) = p(4) + s * (Re(u(1, 3)) + Re(u(3, 1))) + p(5) = p(5) + s * (Im(u(3, 1)) - Im(u(1, 3))) + p(6) = p(6) + s * (Re(u(2, 3)) + Re(u(3, 2))) + p(7) = p(7) + s * (Im(u(3, 2)) - Im(u(2, 3))) + p(8) = p(8) + s * (Re(u(1, 1)) + Re(u(2, 2)) - TWO * Re(u(3, 3))) / SQRT3 + +end + +!------------------------------------------------------------------------------- +subroutine su3_check(u) ! 
checks if "u" is in SU(3) + + implicit none + SU3 :: u, v + SU3, parameter :: su3_one = reshape( & + (/ ONE,ZERO,ZERO, & + ZERO,ONE,ZERO, & + ZERO,ZERO,ONE /), & + (/ NCOL, NCOL /)) + REAL, parameter :: eps = 1e-13 + REAL :: dev ! summed absolute deviation of u * u^dagger from the unit matrix + integer :: i, j + + call uud(v, u, u) + dev = ZERO + do i = 1, NCOL + do j = 1, NCOL + dev = dev + abs(Re(v(i, j)) - Re(su3_one(i, j))) & + + abs(Im(v(i, j)) - Im(su3_one(i, j))) + enddo + enddo + + if (dev > eps) call die('su3_check(): dev > eps') + + call su3_check_det(u) + +end + +!------------------------------------------------------------------------------- +subroutine su3_check_det(u) + + implicit none + COMPLEX :: det ! 3x3 determinant of u, written out term by term + SU3 :: u + REAL, parameter :: eps = 1e-13 + + det = u(1,1) * u(2,2) * u(3,3) & + + u(1,2) * u(2,3) * u(3,1) & + + u(1,3) * u(2,1) * u(3,2) & + - u(1,1) * u(2,3) * u(3,2) & + - u(1,2) * u(2,1) * u(3,3) & + - u(1,3) * u(2,2) * u(3,1) + + if (abs(Re(det) - ONE) > eps) call die("su3_check_det(): Re(det) /= 1") + if (abs(Im(det)) > eps) call die("su3_check_det(): Im(det) /= 0") + +end + +!------------------------------------------------------------------------------- +subroutine u_add(u, v) ! u := u + v + + implicit none + + SU3 :: u, v + integer i, j + + do j = 1, NCOL + do i = 1, NCOL + u(i, j) = u(i, j) + v(i, j) + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine u_complete(u) ! calculate 3rd column from the first two + + implicit none + SU3 :: u + + u(1,3) = conjg(u(2,1) * u(3,2) - u(3,1) * u(2,2)) + u(2,3) = conjg(u(3,1) * u(1,2) - u(1,1) * u(3,2)) + u(3,3) = conjg(u(1,1) * u(2,2) - u(2,1) * u(1,2)) + +end + +!------------------------------------------------------------------------------- +subroutine u_normalize(u) + +! from qcdsf_t3e program: +! +! u_normalize() takes a complex matrix and produces a true su3 matrix from +! the upper 6 entries (because of FORTRAN the first two columns -> DIFFERS +! FROM APE-PROGRAM!!) +! +! ( * * . ) +! ( * * . ) +!
( * * . ) (right 3 completely ignored) +! +! Normalization done by Gram-Schmidt + + implicit none + SU3 :: u + COMPLEX :: f + REAL :: len + integer :: i + + len = real(u(1,1))**2 + aimag(u(1,1))**2 + & ! length u_1 + real(u(2,1))**2 + aimag(u(2,1))**2 + & + real(u(3,1))**2 + aimag(u(3,1))**2 + len = sqrt(len) + + do i = 1, NCOL + u(i,1) = u(i,1) / len ! normalize u_1 + enddo + + f = u(1,2) * conjg(u(1,1)) + & + u(2,2) * conjg(u(2,1)) + & + u(3,2) * conjg(u(3,1)) ! f = sum_i u(i,2) * conjg(u(i,1)), component of u_2 along u_1 + + do i = 1, NCOL + u(i,2) = u(i,2) - f * u(i,1) ! orthogonalize + enddo + + len = real(u(1,2))**2 + aimag(u(1,2))**2 + & ! length u_2 + real(u(2,2))**2 + aimag(u(2,2))**2 + & + real(u(3,2))**2 + aimag(u(3,2))**2 + len = sqrt(len) + + do i = 1, NCOL + u(i,2) = u(i,2) / len ! normalize u_2 + enddo + + call u_complete(u) + +! u(1,3) = conjg(u(2,1) * u(3,2) - u(3,1) * u(2,2)) ! calculate u_3 +! u(2,3) = conjg(u(3,1) * u(1,2) - u(1,1) * u(3,2)) ! = u_1 x u_2 +! u(3,3) = conjg(u(1,1) * u(2,2) - u(2,1) * u(1,2)) + +end + +!------------------------------------------------------------------------------- +subroutine u_trans(u) ! u := transpose(u) + + implicit none + + SU3, intent(inout) :: u + COMPLEX :: tmp + integer :: i, j + + do j = 1, NCOL + do i = j + 1, NCOL + tmp = u(i, j) + u(i, j) = u(j, i) + u(j, i) = tmp + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine u_update(u, v) ! u = v * u + + implicit none + + SU3, intent(in) :: v + SU3, intent(inout) :: u + SU3 :: w + + w = u + call uu(u, v, w) + +end + +!------------------------------------------------------------------------------- +subroutine u_update2(u, v) ! u = u * v + + implicit none + + SU3, intent(in) :: v + SU3, intent(inout) :: u + SU3 :: w + + w = u + call uu(u, w, v) + +end + +!------------------------------------------------------------------------------- +subroutine uu(r, a, b) !
r = a * b + + implicit none + + SU3 :: r, a, b + integer :: i, j + + do i = 1, NCOL + do j = 1, NCOL + r(i, j) = a(i, 1) * b(1, j) & + + a(i, 2) * b(2, j) & + + a(i, 3) * b(3, j) + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine uud(r, a, b) ! U U^dagger: r = a * b+ + + implicit none + + SU3 :: r, a, b + integer :: i, j + + do i = 1, NCOL + do j = 1, NCOL + r(i, j) = a(i, 1) * conjg(b(j, 1)) & + + a(i, 2) * conjg(b(j, 2)) & + + a(i, 3) * conjg(b(j, 3)) + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine udu(r, a, b) ! U^dagger U: r = a+ * b + + implicit none + + SU3 :: r, a, b + integer :: i, j + + do i = 1, NCOL + do j = 1, NCOL + r(i, j) = conjg(a(1, i)) * b(1, j) & + + conjg(a(2, i)) * b(2, j) & + + conjg(a(3, i)) * b(3, j) + enddo + enddo + +end + +!------------------------------------------------------------------------------- +REAL function Re_Tr_uu(u, v) ! returns Re(Tr(u * v)) + + implicit none + SU3, intent(in) :: u, v + REAL :: p + integer :: c1, c2 + + p = 0 + do c2 = 1, NCOL + do c1 = 1, NCOL + p = p + Re(u(c2, c1)) * Re(v(c1, c2)) & + - Im(u(c2, c1)) * Im(v(c1, c2)) + enddo + enddo + + Re_Tr_uu = p + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/swap.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/swap.F90 new file mode 100644 index 0000000000000000000000000000000000000000..d4d924dc22f69551c01e5bd8c923a1d6df64e89f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/swap.F90 @@ -0,0 +1,90 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2003, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! 
swap.F90 - swap routines for various data types +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine swap_p_g_field(u, v) + + implicit none + P_GAUGE_FIELD :: u, v, tmp + + tmp => u + u => v + v => tmp + +end + +!------------------------------------------------------------------------------- +subroutine swap_p_sc_field(a, b) + + implicit none + P_SPINCOL_FIELD :: a, b, tmp + + tmp => a + a => b + b => tmp + +end + +!------------------------------------------------------------------------------- +subroutine swap_p_clover_field_a(x, y) + + use typedef_clover + implicit none + P_CLOVER_FIELD_A :: x, y, tmp + + tmp => x + x => y + y => tmp + +end + +!------------------------------------------------------------------------------- +subroutine swap_p_clover_field_b(x, y) + + use typedef_clover + implicit none + P_CLOVER_FIELD_B :: x, y, tmp + + tmp => x + x => y + y => tmp + +end + +!------------------------------------------------------------------------------- +subroutine swap_real(x, y) + + implicit none + REAL :: x, y, tmp + + tmp = x + x = y + y = tmp + +end + +!------------------------------------------------------------------------------- +subroutine swap_integer(x, y) + + implicit none + integer :: x, y, tmp + + tmp = x + x = y + y = tmp + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/test_echo.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/test_echo.F90 new file mode 100644 index 0000000000000000000000000000000000000000..4533c28887c30a2cf91018d0ae019770ae14e9ff --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/test_echo.F90 @@ -0,0 +1,9 @@ +program bqcd_echo ! prints every command-line argument, one per line + + character(2) :: arg + + do i = 1, ipxfargc() + call pxfgetarg(i, arg, length, istat) + write(6,*) i, ":", arg(1:min(length, len(arg))), ":" ! clamp: reported length can exceed len(arg) + enddo +end diff
!===============================================================================
!
! BQCD -- Berlin Quantum ChromoDynamics programme
!
! Author: Hinnerk Stueben
!
! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin
!
!-------------------------------------------------------------------------------
!
! timing.F90 - measurements of execution times and performance
!
!-------------------------------------------------------------------------------
# include "defs.h"

!-------------------------------------------------------------------------------
! Storage for the raw measurements: one entry per timing bin.
!
!   time        value of sekunden() at the most recent timing_start()
!   total_time  accumulated time between matching start/stop calls
!   n_call      number of timing_stop() calls for the bin
!
! NOTE(review): SECONDS is a macro from defs.h (not visible here).
!-------------------------------------------------------------------------------
module module_timing_meas

  implicit none

  type type_timing
     SECONDS time
     SECONDS total_time
     integer n_call
     integer fill_the_cache_line
  end type type_timing

  integer, parameter :: n_timing = 55
  type(type_timing), save :: meas(n_timing)   ! measurements
!dir$ cache_align meas

  data meas /n_timing * type_timing(0.0, 0.0, 0, 0)/

end

!-------------------------------------------------------------------------------
! Per-bin metadata used when the timing table is written:
!
!   text(bin)  name printed in the "region" column
!   n_op(bin)  operation count used to turn a time into a rate
!              (0 means: no rate is printed for this bin)
!
! The timing_bin_* indices are not defined in this file (presumably macros
! from defs.h - confirm).  Every bin that has a name must also have an n_op
! entry, because timing_write() tests n_op(i) for all i = 1 .. n_timing.
!-------------------------------------------------------------------------------
module module_timing_write

  use module_timing_meas
  implicit none


  character(len = 16), save :: text(n_timing)   ! region name
  integer, save :: n_op(n_timing)               ! # operations

  data text(timing_bin_d_xf)          /"d_xf"/
  data text(timing_bin_d_xb)          /"d_xb"/
  data text(timing_bin_d_yf)          /"d_yf"/
  data text(timing_bin_d_yb)          /"d_yb"/
  data text(timing_bin_d_zf)          /"d_zf"/
  data text(timing_bin_d_zb)          /"d_zb"/
  data text(timing_bin_d_t)           /"d_t"/
  data text(timing_bin_d)             /"D_TOTAL"/

  data text(timing_bin_global_sum)    /"global_sum"/
  data text(timing_bin_global_sum_vec) /"global_sum_vec"/
  data text(timing_bin_sc_zero)       /"sc_zero"/
  data text(timing_bin_sc_copy)       /"sc_copy"/
  data text(timing_bin_sc_scale)      /"sc_scale"/
  data text(timing_bin_sc_axpy)       /"sc_axpy"/
  data text(timing_bin_sc_caxpy)      /"sc_caxpy"/
  data text(timing_bin_sc_caxpy2)     /"sc_caxpy2"/
  data text(timing_bin_sc_cax2)       /"sc_cax2"/
  data text(timing_bin_sc_xpby)       /"sc_xpby"/
  data text(timing_bin_sc_axpby)      /"sc_axpby"/
  data text(timing_bin_sc_norm2)      /"sc_norm2"/
  data text(timing_bin_sc_dot)        /"sc_dot"/
  data text(timing_bin_sc_cdotc)      /"sc_cdotc"/

  data text(timing_bin_plaq)          /"plaquette"/
  data text(timing_bin_cooling)       /"cooling"/
  data text(timing_bin_u_read)        /"u_read"/
  data text(timing_bin_u_write)       /"u_write"/

  data text(timing_bin_total)         /"TOTAL"/
  data text(timing_bin_hmc)           /"HMC"/
  data text(timing_bin_cg)            /"CG"/
  data text(timing_bin_mtdagmt)       /"MTDAGMT"/

  data text(timing_bin_dsf)           /"dsf"/
  data text(timing_bin_dsg)           /"dsg"/
  data text(timing_bin_hmc_u)         /"hmc_u"/
  data text(timing_bin_hmc_init_p)    /"hmc_init_p"/

  data text(timing_bin_clover_init)   /"clover_init"/
  data text(timing_bin_clover_mult_a) /"clover_mult_a"/
  data text(timing_bin_clover_mult_ao)/"clover_mult_ao"/
  data text(timing_bin_clover_mult_b) /"clover_mult_b"/
  data text(timing_bin_clover_dsd)    /"clover_dsd"/
  data text(timing_bin_clover_dsf)    /"clover_dsf"/

  data text(timing_bin_hmc_init)      /"hmc_init"/
  data text(timing_bin_hmc_momenta)   /"hmc_momenta"/
  data text(timing_bin_hmc_init_phi)  /"hmc_phi"/
  data text(timing_bin_hmc_h_old)     /"hmc_h_old"/
  data text(timing_bin_hmc_backup)    /"hmc_backup"/
  data text(timing_bin_hmc_half_step0)/"hmc_half_step0"/
  data text(timing_bin_hmc_half_step1)/"hmc_half_step1"/
  data text(timing_bin_hmc_xbound_g)  /"hmc_xbound_g"/
  data text(timing_bin_hmc_steps)     /"hmc_steps"/
  data text(timing_bin_hmc_h_new)     /"hmc_h_new"/
  data text(timing_bin_hmc_rest)      /"hmc_rest"/

  data text(timing_bin_h_mult_a)      /"h_mult_a"/
  data text(timing_bin_h_mult_b)      /"h_mult_b"/
  data text(timing_bin_h_mult_c)      /"h_mult_c"/

  data text(timing_bin_sc2_projection)/"sc2_projection"/

  integer, parameter :: op_add  = 2   ! operations per complex add.
  integer, parameter :: op_mult = 6   ! operations per complex mult.

  integer, parameter, private :: op_d_xyz = 18 * op_mult + 30 * op_add
  integer, parameter, private :: op_d_t   = 36 * op_mult + 24 * op_add + 24

  integer, parameter :: op_d = 6 * op_d_xyz + op_d_t

  integer, parameter, private :: op_uuu   = 162 * op_mult
  integer, parameter, private :: op_re_tr = 36
  integer, parameter, private :: op_plaq  = 2 * 6 * (op_uuu + op_re_tr)

  integer, parameter :: op_sc_r = NDIRAC * NCOL * SIZE_COMPLEX
  integer, parameter :: op_sc_c = NDIRAC * NCOL

  integer, parameter :: op_blas_r1 = op_sc_r
  integer, parameter :: op_blas_r2 = op_sc_r * 2
  integer, parameter :: op_blas_r3 = op_sc_r * 3
  integer, parameter :: op_blas_c1 = op_sc_c * op_mult
  integer, parameter :: op_blas_c2 = op_sc_c * (op_mult + op_add)
  integer, parameter :: op_blas_c3 = 2 * op_blas_c2

  integer, parameter :: op_clov   = 84 * op_mult + 60 * op_add + 24
  integer, parameter :: op_h_mult = NDIRAC * NCOL * 3

  integer, parameter :: op_sc2_proj = 36 * op_add

  data n_op(timing_bin_d_xf)          /op_d_xyz/
  data n_op(timing_bin_d_xb)          /op_d_xyz/
  data n_op(timing_bin_d_yf)          /op_d_xyz/
  data n_op(timing_bin_d_yb)          /op_d_xyz/
  data n_op(timing_bin_d_zf)          /op_d_xyz/
  data n_op(timing_bin_d_zb)          /op_d_xyz/
  data n_op(timing_bin_d_t)           /op_d_t/
  data n_op(timing_bin_d)             /op_d/

  data n_op(timing_bin_global_sum)    /0/
  data n_op(timing_bin_global_sum_vec) /0/
  data n_op(timing_bin_sc_zero)       /0/
  data n_op(timing_bin_sc_copy)       /0/
  data n_op(timing_bin_sc_scale)      /op_blas_r1/
  data n_op(timing_bin_sc_norm2)      /op_blas_r2/
  data n_op(timing_bin_sc_dot)        /op_blas_r2/
  data n_op(timing_bin_sc_axpy)       /op_blas_r2/
  data n_op(timing_bin_sc_xpby)       /op_blas_r2/
  data n_op(timing_bin_sc_axpby)      /op_blas_r3/
  data n_op(timing_bin_sc_cdotc)      /op_blas_c2/
  data n_op(timing_bin_sc_caxpy)      /op_blas_c2/
  data n_op(timing_bin_sc_caxpy2)     /op_blas_c3/
  data n_op(timing_bin_sc_cax2)       /op_blas_c3/

  data n_op(timing_bin_plaq)          /op_plaq/
  data n_op(timing_bin_cooling)       /0/
  data n_op(timing_bin_u_read)        /0/
  data n_op(timing_bin_u_write)       /0/

  data n_op(timing_bin_total)         /0/
  data n_op(timing_bin_hmc)           /0/
  data n_op(timing_bin_cg)            /0/
  data n_op(timing_bin_mtdagmt)       /0/

  data n_op(timing_bin_dsf)           /0/
  data n_op(timing_bin_dsg)           /0/
  data n_op(timing_bin_hmc_u)         /0/
  ! This bin has a name above; without an initialiser its n_op entry would
  ! be undefined when timing_write() tests it.
  data n_op(timing_bin_hmc_init_p)    /0/

  data n_op(timing_bin_clover_init)   /0/
  data n_op(timing_bin_clover_mult_a) /op_clov/
  data n_op(timing_bin_clover_mult_ao)/op_clov/
  data n_op(timing_bin_clover_mult_b) /op_clov/
  data n_op(timing_bin_clover_dsd)    /0/
  data n_op(timing_bin_clover_dsf)    /0/

  data n_op(timing_bin_hmc_init)      /0/
  data n_op(timing_bin_hmc_momenta)   /0/
  data n_op(timing_bin_hmc_init_phi)  /0/
  data n_op(timing_bin_hmc_h_old)     /0/
  data n_op(timing_bin_hmc_backup)    /0/
  data n_op(timing_bin_hmc_half_step0)/0/
  data n_op(timing_bin_hmc_half_step1)/0/
  data n_op(timing_bin_hmc_xbound_g)  /0/
  data n_op(timing_bin_hmc_steps)     /0/
  data n_op(timing_bin_hmc_h_new)     /0/
  data n_op(timing_bin_hmc_rest)      /0/

  data n_op(timing_bin_h_mult_a)      /op_h_mult/
  data n_op(timing_bin_h_mult_b)      /op_h_mult/
  data n_op(timing_bin_h_mult_c)      /op_h_mult/
  data n_op(timing_bin_sc2_projection)/op_sc2_proj/

end

!-------------------------------------------------------------------------------
! Records the current value of sekunden() as the start time of "bin".
!-------------------------------------------------------------------------------
subroutine timing_start(bin)

  use module_timing_meas
  implicit none
  integer bin
  SECONDS sekunden

  meas(bin)%time = sekunden()
end

!-------------------------------------------------------------------------------
! Adds the time elapsed since the last timing_start(bin) to the total of
! "bin" and counts one more call.
!-------------------------------------------------------------------------------
subroutine timing_stop(bin)

  use module_timing_meas
  implicit none
  integer bin
  SECONDS sekunden

  meas(bin)%total_time = meas(bin)%total_time + sekunden() - meas(bin)%time
  meas(bin)%n_call = meas(bin)%n_call + 1
end

!-------------------------------------------------------------------------------
! Writes the timing table to "unit": for every bin the number of calls, the
! mean time over all processes and - for bins with n_op /= 0 - the mean,
! minimum and maximum Mflop/s and the total Gflop/s.
!
! Only the process with my_pe() == 0 writes.  The global_sum / global_min /
! global_max calls are made for every bin with n_call /= 0, so that condition
! has to hold on all processes alike.
!
! The operation counts of the d_xf / d_yf / d_zf, MTDAGMT and CG bins are
! adjusted in place in n_op; calling this routine a second time would apply
! the d_xf / d_yf / d_zf adjustments again.
!-------------------------------------------------------------------------------
subroutine timing_write(unit)

  use module_timing_write
  use module_cg
  use module_function_decl
  use module_switches
  use module_thread
  use module_vol
  implicit none

  integer unit, i
  integer op_mtdagmt
  integer cg_calls, cg_iter
  double precision cg_flop_sum
  real mflops, mflops_mean, mflops_min, mflops_max
  real total_gflops, time_mean


  character(len = 8) :: a_mflops_mean, a_mflops_min, a_mflops_max, &
                        a_total_gflops, a_time_mean, a_n_call

  character(*), parameter :: ifmt = "(i8)", ffmt = "(f8.2)", &
                             tab_fmt ="(2(3(1x,a),2x),1x,a)"

  if (version_of_d() >= 2) then
     n_op(timing_bin_d_xf) = n_op(timing_bin_d_xf) * 2
     n_op(timing_bin_d_yf) = n_op(timing_bin_d_yf) * 2
     n_op(timing_bin_d_zf) = n_op(timing_bin_d_zf) * 2
  endif

  op_mtdagmt = 2 * (2 * op_d + op_blas_r2)
  if (switches%clover) op_mtdagmt = op_mtdagmt + 4 * op_clov
  if (switches%h_ext)  op_mtdagmt = op_mtdagmt + 4 * op_h_mult

  if (version_of_d() == 21 .or. version_of_d() == 22) then
     n_op(timing_bin_d_xf) = n_op(timing_bin_d_xf) - 12 * op_add
     n_op(timing_bin_d_yf) = n_op(timing_bin_d_yf) - 12 * op_add
     n_op(timing_bin_d_zf) = n_op(timing_bin_d_zf) - 12 * op_add
  endif

  cg_calls = meas(timing_bin_cg)%n_call
  cg_iter  = cg_iterations_total

  n_op(timing_bin_mtdagmt) = op_mtdagmt

  ! Average operation count of one CG call.  The sum over all iterations is
  ! formed in double precision because (iterations * operations) can exceed
  ! the range of a default integer; only the per-call average is stored.
  ! Without any CG call there is nothing to average.
  if (cg_calls > 0) then
     cg_flop_sum = dble(cg_iter)  * dble(op_mtdagmt + 5 * op_blas_r2) &
                 + dble(cg_calls) * dble(op_mtdagmt + op_blas_r1)
     n_op(timing_bin_cg) = nint(cg_flop_sum / dble(cg_calls))
  else
     n_op(timing_bin_cg) = 0
  endif


  call begin(unit, "Timing")
  if (my_pe() == 0) then
     ! header cells are padded to the column widths (16 / 8 characters)
     write(unit,"(48x,a)") "Performance"
     write(unit, tab_fmt) "region          ", "  #calls", "    time", &
          "    mean", "     min", "     max", "   Total"
     write(unit, tab_fmt) "                ", "        ", "       s", &
          " Mflop/s", " Mflop/s", " Mflop/s", " Gflop/s"
     write(unit, *)
  endif

  do i = 1, n_timing

     write(a_n_call, ifmt) meas(i)%n_call
     a_time_mean    = " "
     a_mflops_mean  = " "
     a_mflops_min   = " "
     a_mflops_max   = " "
     a_total_gflops = " "

     if (meas(i)%n_call /= 0) then  ! must be true on all PEs !!

        time_mean = global_sum(real(meas(i)%total_time, kind=RKIND)) / num_pes()
        write(a_time_mean, ffmt) time_mean

        if (n_op(i) /= 0) then
           ! A bin can have been called without any measurable time having
           ! accumulated; report a rate of zero instead of dividing by zero.
           ! The reductions below are still executed by every process.
           if (meas(i)%total_time > 0) then
              mflops = 1e-6 * n_op(i) * volh * meas(i)%n_call / meas(i)%total_time
           else
              mflops = 0.0
           endif

           mflops_mean = global_sum(real(mflops, kind=RKIND)) / num_pes()

           mflops_min = global_min(mflops)
           mflops_max = global_max(mflops)

           total_gflops = 1e-3 * mflops_mean * num_pes()

           write(a_mflops_mean,  ffmt) mflops_mean / n_thread
           write(a_mflops_min,   ffmt) mflops_min / n_thread
           write(a_mflops_max,   ffmt) mflops_max / n_thread
           write(a_total_gflops, ffmt) total_gflops
        endif
     endif

     if (my_pe() == 0) then
        write(unit, tab_fmt) text(i), a_n_call, a_time_mean, &
             a_mflops_mean, a_mflops_min, a_mflops_max, &
             a_total_gflops
     endif
  enddo

  call end_A(unit, "Timing")
end

!===============================================================================
!-------------------------------------------------------------------------------
# include "defs.h"

!-------------------------------------------------------------------------------
! Estimates the three traces named in the file header with one Gaussian noise
! vector eta and writes one record (key "%tr") to UREC:
!
!   pbp    = eta+ zeta          with zeta = inv(M) eta   (from solve())
!   pinorm = zeta+ zeta
!   p5p    = eta+ gamma5 zeta
!
! The five real numbers are summed over all processes and divided by
! size_of_trace = NDIRAC * NCOL * volume.  The column header is written on
! the first call only; only the process with my_pe() == 0 writes.
!
!   para, conf                 passed on to solve()
!   traj, i_ensemble1/2        only printed, to label the record
!
! NOTE(review): ALLOCATE_SC_FIELD, Re, Im, HALF, EVEN, ODD and UREC come from
! defs.h (not visible here); the fields are SAVEd, so the macro presumably
! allocates on first use only - confirm.
!-------------------------------------------------------------------------------
subroutine traces(para, conf, traj, i_ensemble1, i_ensemble2)

  use typedef_hmc
  use module_function_decl
  use module_p_interface
  use module_vol
  implicit none

  integer,        intent(in) :: traj, i_ensemble1, i_ensemble2
  type(hmc_para), intent(in) :: para
  type(hmc_conf), intent(in) :: conf

  P_SPINCOL_FIELD, save :: eta_e, eta_o, zeta_e, zeta_o

  character(len=*), parameter :: key_tr = "%tr"
  integer, save :: count = 0
  integer :: size_of_trace
  integer :: cg_ncall, cg_niter_max, cg_niter_tot
  REAL    :: pinorm
  REAL    :: re_pbp, im_pbp, re_p5p, im_p5p
  COMPLEX :: pbp, p5p
  REAL    :: res(5)


  ALLOCATE_SC_FIELD(eta_e)
  ALLOCATE_SC_FIELD(eta_o)
  ALLOCATE_SC_FIELD(zeta_e)
  ALLOCATE_SC_FIELD(zeta_o)

  count = count + 1
  size_of_trace = NDIRAC * NCOL * volume

  call ran_gauss_volh(NDIRAC * NCOL, eta_e, HALF, EVEN)
  call ran_gauss_volh(NDIRAC * NCOL, eta_o, HALF, ODD)

  call init_cg_stat()
  call solve(para, conf, zeta_e, zeta_o, eta_e, eta_o)  ! zeta = inv(M) eta

  pbp = sc_cdotc(eta_e, zeta_e) + sc_cdotc(eta_o, zeta_o)

  pinorm = sc_dot(zeta_e, zeta_e) + sc_dot(zeta_o, zeta_o)

  call gamma5(zeta_e, volh)                             ! zeta = gamma5 inv(M) eta
  call gamma5(zeta_o, volh)

  p5p = sc_cdotc(eta_e, zeta_e) + sc_cdotc(eta_o, zeta_o)

  ! one reduction for all five numbers
  res(1) = Re(pbp)
  res(2) = Im(pbp)
  res(3) = Re(p5p)
  res(4) = Im(p5p)
  res(5) = pinorm

  call global_sum_vec(5, res)

  re_pbp = res(1) / size_of_trace
  im_pbp = res(2) / size_of_trace
  re_p5p = res(3) / size_of_trace
  im_p5p = res(4) / size_of_trace
  pinorm = res(5) / size_of_trace

  call get_cg_stat(cg_ncall, cg_niter_max, cg_niter_tot)

  if (my_pe() == 0) then
     if (count == 1) write(UREC, 400) &
          "T", key_tr, "traj", "e", "f", &
          "Re(pbp)", "Im(pbp)", "Re(p5p)", "-Im(p5p)", "PionNorm", "CGiter"

     write(UREC, 410) key_tr, traj, i_ensemble1, i_ensemble2, &
          re_pbp, im_pbp, re_p5p, -im_p5p, pinorm, cg_niter_max
  endif


400 format (1x, 2a, a6, 2a3, 5a20, a10)
410 format (1x, a4, i6, 2i3, 5g20.10, i10)

end

!-------------------------------------------------------------------------------
! solves: M out = in
!
! "in" and "out" are each given as two fields (suffix _e and _o).  The solution
! is built from the call sequence below around one cg() call on mtdagmt; the
! number of CG iterations is received in "iterations" and not used further.
!
! NOTE(review): the meaning of the individual helper calls (h_mult_c, d,
! sc_xpby, mtil_dag, sc_axpby, h_mult_b) is defined outside this file.
!-------------------------------------------------------------------------------
subroutine solve(para, conf, out_e, out_o, in_e, in_o)

  use typedef_hmc
  use module_vol
  implicit none

  type(hmc_para), intent(in)  :: para
  type(hmc_conf), intent(in)  :: conf
  SPINCOL_FIELD,  intent(out) :: out_e, out_o
  SPINCOL_FIELD,  intent(in)  :: in_e, in_o

  REAL     :: a, b
  integer  :: iterations
  external :: mtdagmt

  b = para%kappa / (ONE + para%h**2)

  call h_mult_c(out_o, -para%h, in_o, volh)

  call d(EVEN, ODD, out_e, out_o, conf%u)

  call sc_xpby(out_e, in_e, b)

  call mtil_dag(out_o, out_e, para, conf)

  call cg(mtdagmt, out_e, out_o, para, conf, iterations)

  call d(ODD, EVEN, out_o, out_e, conf%u)

  a = ONE / (ONE + para%h**2)

  call sc_axpby(out_o, in_o, b, a)

  call h_mult_b(-para%h, out_o, volh)

end

!-------------------------------------------------------------------------------
! In place, for every i = 1 .. NCOL * volh: exchanges x(1,i) with x(3,i) and
! x(2,i) with x(4,i).  The indices 1..4 are hard-wired, so the first extent
! NDIRAC is expected to be 4.
!-------------------------------------------------------------------------------
subroutine gamma5(x, volh)

  implicit none
  COMPLEX, dimension (NDIRAC, *) :: x
  integer :: volh

  integer :: i
  COMPLEX :: x1, x2, x3, x4

  !$omp parallel do private(x1, x2, x3, x4)
  do i = 1, NCOL * volh
     x1 = x(1, i)
     x2 = x(2, i)
     x3 = x(3, i)
     x4 = x(4, i)

     x(1, i) = x3
     x(2, i) = x4
     x(3, i) = x1
     x(4, i) = x2
  enddo
end

!===============================================================================
0000000000000000000000000000000000000000..cdb2bd9ea4020e67a42a03b6aa677fd062ffdb4a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd.c @@ -0,0 +1,353 @@ +/* +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! uuu_bwd.c - adds backward staple: r = r + a^\dagger b^\dagger c +! +!------------------------------------------------------------------------------- +*/ + +#include "types.h" + +#ifdef NamesToLower_ +# define UUU_BWD uuu_bwd_ +#endif + +#ifdef NamesToLower +# define UUU_BWD uuu_bwd +#endif + +void UUU_BWD(r, a, b, c) +COMPLEX8 *r, *a, *b, *c; +{ + register COMPLEX8 q__1, q__2; + register COMPLEX8 t1, t2, t3, x1, x2, x3; + + /* Parameter adjustments */ + c -= 4; + b -= 4; + a -= 4; + r -= 4; + + /* Function Body */ + q__1.r = a[4].r * b[4].r - a[4].i * b[4].i, + q__1.i = a[4].r * b[4].i + a[4].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = a[4].r * b[5].r - a[4].i * b[5].i, + q__1.i = a[4].r * b[5].i + a[4].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = a[4].r * b[6].r - a[4].i * b[6].i, + q__1.i = a[4].r * b[6].i + a[4].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[5].r * b[7].r - a[5].i * b[7].i, + q__2.i = a[5].r * b[7].i + a[5].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[5].r * b[8].r - a[5].i * b[8].i, + q__2.i = a[5].r * b[8].i + a[5].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[5].r * b[9].r - a[5].i * b[9].i, + q__2.i = a[5].r * b[9].i + a[5].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[6].r * b[10].r - a[6].i * b[10].i, + q__2.i = a[6].r 
* b[10].i + a[6].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = -q__1.i; + q__2.r = a[6].r * b[11].r - a[6].i * b[11].i, + q__2.i = a[6].r * b[11].i + a[6].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = -q__1.i; + q__2.r = a[6].r * b[12].r - a[6].i * b[12].i, + q__2.i = a[6].r * b[12].i + a[6].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = -q__1.i; + + q__2.r = t1.r * c[4].r - t1.i * c[4].i, + q__2.i = t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[4].r + q__2.r, + q__1.i = r[4].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[7].r - t1.i * c[7].i, + q__2.i = t1.r * c[7].i + t1.i * c[7].r; + q__1.r = r[7].r + q__2.r, + q__1.i = r[7].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[10].r - t1.i * c[10].i, + q__2.i = t1.r * c[10].i + t1.i * c[10].r; + q__1.r = r[10].r + q__2.r, + q__1.i = r[10].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[5].r - t2.i * c[5].i, + q__2.i = t2.r * c[5].i + t2.i * c[5].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r - t2.i * c[8].i, + q__2.i = t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[11].r - t2.i * c[11].i, + q__2.i = t2.r * c[11].i + t2.i * c[11].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[6].r - t3.i * c[6].i, + q__2.i = t3.r * c[6].i + t3.i * c[6].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[4].r = q__1.r, + r[4].i = q__1.i; + q__2.r = t3.r * c[9].r - t3.i * c[9].i, + q__2.i = t3.r * c[9].i + t3.i * c[9].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[7].r = q__1.r, + r[7].i = q__1.i; + q__2.r = t3.r * c[12].r - t3.i * c[12].i, + q__2.i = t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = 
x3.i + q__2.i; + r[10].r = q__1.r, + r[10].i = q__1.i; + + q__1.r = a[7].r * b[4].r - a[7].i * b[4].i, + q__1.i = a[7].r * b[4].i + a[7].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = a[7].r * b[5].r - a[7].i * b[5].i, + q__1.i = a[7].r * b[5].i + a[7].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = a[7].r * b[6].r - a[7].i * b[6].i, + q__1.i = a[7].r * b[6].i + a[7].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[8].r * b[7].r - a[8].i * b[7].i, + q__2.i = a[8].r * b[7].i + a[8].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[8].r * b[8].r - a[8].i * b[8].i, + q__2.i = a[8].r * b[8].i + a[8].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[8].r * b[9].r - a[8].i * b[9].i, + q__2.i = a[8].r * b[9].i + a[8].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[9].r * b[10].r - a[9].i * b[10].i, + q__2.i = a[9].r * b[10].i + a[9].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = -q__1.i; + q__2.r = a[9].r * b[11].r - a[9].i * b[11].i, + q__2.i = a[9].r * b[11].i + a[9].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = -q__1.i; + q__2.r = a[9].r * b[12].r - a[9].i * b[12].i, + q__2.i = a[9].r * b[12].i + a[9].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = -q__1.i; + + q__2.r = t1.r * c[4].r - t1.i * c[4].i, + q__2.i = t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[5].r + q__2.r, + q__1.i = r[5].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[7].r - t1.i * c[7].i, + q__2.i = t1.r * c[7].i + t1.i * c[7].r; + q__1.r = r[8].r + q__2.r, + q__1.i = r[8].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[10].r - t1.i * c[10].i, + q__2.i = t1.r * c[10].i + t1.i * c[10].r; + q__1.r = r[11].r + q__2.r, + q__1.i = 
r[11].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[5].r - t2.i * c[5].i, + q__2.i = t2.r * c[5].i + t2.i * c[5].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r - t2.i * c[8].i, + q__2.i = t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[11].r - t2.i * c[11].i, + q__2.i = t2.r * c[11].i + t2.i * c[11].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[6].r - t3.i * c[6].i, + q__2.i = t3.r * c[6].i + t3.i * c[6].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[5].r = q__1.r, + r[5].i = q__1.i; + q__2.r = t3.r * c[9].r - t3.i * c[9].i, + q__2.i = t3.r * c[9].i + t3.i * c[9].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[8].r = q__1.r, + r[8].i = q__1.i; + q__2.r = t3.r * c[12].r - t3.i * c[12].i, + q__2.i = t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[11].r = q__1.r, + r[11].i = q__1.i; + + q__1.r = a[10].r * b[4].r - a[10].i * b[4].i, + q__1.i = a[10].r * b[4].i + a[10].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = a[10].r * b[5].r - a[10].i * b[5].i, + q__1.i = a[10].r * b[5].i + a[10].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = a[10].r * b[6].r - a[10].i * b[6].i, + q__1.i = a[10].r * b[6].i + a[10].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[11].r * b[7].r - a[11].i * b[7].i, + q__2.i = a[11].r * b[7].i + a[11].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[11].r * b[8].r - a[11].i * b[8].i, + q__2.i = a[11].r * b[8].i + a[11].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[11].r * b[9].r - a[11].i * b[9].i, + q__2.i = a[11].r * b[9].i + a[11].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + 
t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[12].r * b[10].r - a[12].i * b[10].i, + q__2.i = a[12].r * b[10].i + a[12].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = -q__1.i; + q__2.r = a[12].r * b[11].r - a[12].i * b[11].i, + q__2.i = a[12].r * b[11].i + a[12].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = -q__1.i; + q__2.r = a[12].r * b[12].r - a[12].i * b[12].i, + q__2.i = a[12].r * b[12].i + a[12].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = -q__1.i; + + q__2.r = t1.r * c[4].r - t1.i * c[4].i, + q__2.i = t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[6].r + q__2.r, + q__1.i = r[6].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[7].r - t1.i * c[7].i, + q__2.i = t1.r * c[7].i + t1.i * c[7].r; + q__1.r = r[9].r + q__2.r, + q__1.i = r[9].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[10].r - t1.i * c[10].i, + q__2.i = t1.r * c[10].i + t1.i * c[10].r; + q__1.r = r[12].r + q__2.r, + q__1.i = r[12].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[5].r - t2.i * c[5].i, + q__2.i = t2.r * c[5].i + t2.i * c[5].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r - t2.i * c[8].i, + q__2.i = t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[11].r - t2.i * c[11].i, + q__2.i = t2.r * c[11].i + t2.i * c[11].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[6].r - t3.i * c[6].i, + q__2.i = t3.r * c[6].i + t3.i * c[6].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[6].r = q__1.r, + r[6].i = q__1.i; + q__2.r = t3.r * c[9].r - t3.i * c[9].i, + q__2.i = t3.r * c[9].i + t3.i * c[9].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[9].r = q__1.r, + r[9].i = q__1.i; + q__2.r = 
t3.r * c[12].r - t3.i * c[12].i, + q__2.i = t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[12].r = q__1.r, + r[12].i = q__1.i; + + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd_m.c b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd_m.c new file mode 100644 index 0000000000000000000000000000000000000000..6d50ebce74b3f7d6bc889aad0010d79266909f02 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_bwd_m.c @@ -0,0 +1,353 @@ +/* +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! uuu_bwd_m.c - subtracts backward staple: r = r - a^\dagger b^\dagger c +! +!------------------------------------------------------------------------------- +*/ + +#include "types.h" + +#ifdef NamesToLower_ +# define UUU_BWD_M uuu_bwd_m_ +#endif + +#ifdef NamesToLower +# define UUU_BWD_M uuu_bwd_m +#endif + +void UUU_BWD_M(r, a, b, c) +COMPLEX8 *r, *a, *b, *c; +{ + register COMPLEX8 q__1, q__2; + register COMPLEX8 t1, t2, t3, x1, x2, x3; + + /* Parameter adjustments */ + c -= 4; + b -= 4; + a -= 4; + r -= 4; + + /* Function Body */ + q__1.r = -a[4].r * b[4].r + a[4].i * b[4].i, + q__1.i = -a[4].r * b[4].i - a[4].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = -a[4].r * b[5].r + a[4].i * b[5].i, + q__1.i = -a[4].r * b[5].i - a[4].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = -a[4].r * b[6].r + a[4].i * b[6].i, + q__1.i = -a[4].r * b[6].i - a[4].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = -a[5].r * b[7].r + a[5].i * b[7].i, + q__2.i = -a[5].r * b[7].i - a[5].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = -a[5].r * b[8].r + a[5].i * b[8].i, + q__2.i = 
-a[5].r * b[8].i - a[5].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = -a[5].r * b[9].r + a[5].i * b[9].i, + q__2.i = -a[5].r * b[9].i - a[5].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = -a[6].r * b[10].r + a[6].i * b[10].i, + q__2.i = -a[6].r * b[10].i - a[6].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = -q__1.i; + q__2.r = -a[6].r * b[11].r + a[6].i * b[11].i, + q__2.i = -a[6].r * b[11].i - a[6].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = -q__1.i; + q__2.r = -a[6].r * b[12].r + a[6].i * b[12].i, + q__2.i = -a[6].r * b[12].i - a[6].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = -q__1.i; + + q__2.r = t1.r * c[4].r - t1.i * c[4].i, + q__2.i = t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[4].r + q__2.r, + q__1.i = r[4].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[7].r - t1.i * c[7].i, + q__2.i = t1.r * c[7].i + t1.i * c[7].r; + q__1.r = r[7].r + q__2.r, + q__1.i = r[7].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[10].r - t1.i * c[10].i, + q__2.i = t1.r * c[10].i + t1.i * c[10].r; + q__1.r = r[10].r + q__2.r, + q__1.i = r[10].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[5].r - t2.i * c[5].i, + q__2.i = t2.r * c[5].i + t2.i * c[5].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r - t2.i * c[8].i, + q__2.i = t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[11].r - t2.i * c[11].i, + q__2.i = t2.r * c[11].i + t2.i * c[11].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[6].r - t3.i * c[6].i, + q__2.i = t3.r * c[6].i + t3.i * c[6].r; + q__1.r = 
x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[4].r = q__1.r, + r[4].i = q__1.i; + q__2.r = t3.r * c[9].r - t3.i * c[9].i, + q__2.i = t3.r * c[9].i + t3.i * c[9].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[7].r = q__1.r, + r[7].i = q__1.i; + q__2.r = t3.r * c[12].r - t3.i * c[12].i, + q__2.i = t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[10].r = q__1.r, + r[10].i = q__1.i; + + q__1.r = -a[7].r * b[4].r + a[7].i * b[4].i, + q__1.i = -a[7].r * b[4].i - a[7].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = -a[7].r * b[5].r + a[7].i * b[5].i, + q__1.i = -a[7].r * b[5].i - a[7].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = -a[7].r * b[6].r + a[7].i * b[6].i, + q__1.i = -a[7].r * b[6].i - a[7].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = -a[8].r * b[7].r + a[8].i * b[7].i, + q__2.i = -a[8].r * b[7].i - a[8].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = -a[8].r * b[8].r + a[8].i * b[8].i, + q__2.i = -a[8].r * b[8].i - a[8].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = -a[8].r * b[9].r + a[8].i * b[9].i, + q__2.i = -a[8].r * b[9].i - a[8].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = -a[9].r * b[10].r + a[9].i * b[10].i, + q__2.i = -a[9].r * b[10].i - a[9].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = -q__1.i; + q__2.r = -a[9].r * b[11].r + a[9].i * b[11].i, + q__2.i = -a[9].r * b[11].i - a[9].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = -q__1.i; + q__2.r = -a[9].r * b[12].r + a[9].i * b[12].i, + q__2.i = -a[9].r * b[12].i - a[9].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = -q__1.i; + + q__2.r = t1.r * c[4].r - t1.i * c[4].i, + q__2.i = t1.r * c[4].i + t1.i * c[4].r; 
+ q__1.r = r[5].r + q__2.r, + q__1.i = r[5].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[7].r - t1.i * c[7].i, + q__2.i = t1.r * c[7].i + t1.i * c[7].r; + q__1.r = r[8].r + q__2.r, + q__1.i = r[8].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[10].r - t1.i * c[10].i, + q__2.i = t1.r * c[10].i + t1.i * c[10].r; + q__1.r = r[11].r + q__2.r, + q__1.i = r[11].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[5].r - t2.i * c[5].i, + q__2.i = t2.r * c[5].i + t2.i * c[5].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r - t2.i * c[8].i, + q__2.i = t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[11].r - t2.i * c[11].i, + q__2.i = t2.r * c[11].i + t2.i * c[11].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[6].r - t3.i * c[6].i, + q__2.i = t3.r * c[6].i + t3.i * c[6].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[5].r = q__1.r, + r[5].i = q__1.i; + q__2.r = t3.r * c[9].r - t3.i * c[9].i, + q__2.i = t3.r * c[9].i + t3.i * c[9].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[8].r = q__1.r, + r[8].i = q__1.i; + q__2.r = t3.r * c[12].r - t3.i * c[12].i, + q__2.i = t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[11].r = q__1.r, + r[11].i = q__1.i; + + q__1.r = -a[10].r * b[4].r + a[10].i * b[4].i, + q__1.i = -a[10].r * b[4].i - a[10].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = -a[10].r * b[5].r + a[10].i * b[5].i, + q__1.i = -a[10].r * b[5].i - a[10].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = -a[10].r * b[6].r + a[10].i * b[6].i, + q__1.i = -a[10].r * b[6].i - a[10].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = -a[11].r * b[7].r + a[11].i * b[7].i, + q__2.i = -a[11].r * b[7].i - a[11].i * b[7].r; + q__1.r = t1.r + 
q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = -a[11].r * b[8].r + a[11].i * b[8].i, + q__2.i = -a[11].r * b[8].i - a[11].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = -a[11].r * b[9].r + a[11].i * b[9].i, + q__2.i = -a[11].r * b[9].i - a[11].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = -a[12].r * b[10].r + a[12].i * b[10].i, + q__2.i = -a[12].r * b[10].i - a[12].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = -q__1.i; + q__2.r = -a[12].r * b[11].r + a[12].i * b[11].i, + q__2.i = -a[12].r * b[11].i - a[12].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = -q__1.i; + q__2.r = -a[12].r * b[12].r + a[12].i * b[12].i, + q__2.i = -a[12].r * b[12].i - a[12].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = -q__1.i; + + q__2.r = t1.r * c[4].r - t1.i * c[4].i, + q__2.i = t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[6].r + q__2.r, + q__1.i = r[6].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[7].r - t1.i * c[7].i, + q__2.i = t1.r * c[7].i + t1.i * c[7].r; + q__1.r = r[9].r + q__2.r, + q__1.i = r[9].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[10].r - t1.i * c[10].i, + q__2.i = t1.r * c[10].i + t1.i * c[10].r; + q__1.r = r[12].r + q__2.r, + q__1.i = r[12].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[5].r - t2.i * c[5].i, + q__2.i = t2.r * c[5].i + t2.i * c[5].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r - t2.i * c[8].i, + q__2.i = t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[11].r - t2.i * c[11].i, + q__2.i = t2.r * c[11].i + t2.i * c[11].r; + q__1.r = x3.r + q__2.r, + q__1.i = 
x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[6].r - t3.i * c[6].i, + q__2.i = t3.r * c[6].i + t3.i * c[6].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[6].r = q__1.r, + r[6].i = q__1.i; + q__2.r = t3.r * c[9].r - t3.i * c[9].i, + q__2.i = t3.r * c[9].i + t3.i * c[9].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[9].r = q__1.r, + r[9].i = q__1.i; + q__2.r = t3.r * c[12].r - t3.i * c[12].i, + q__2.i = t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[12].r = q__1.r, + r[12].i = q__1.i; + + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_f90.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_f90.F90 new file mode 100644 index 0000000000000000000000000000000000000000..53ca89df005fe7caa5aab26fc99f99964534a967 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_f90.F90 @@ -0,0 +1,73 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2001, Hinnerk Stueben, Zuse Institute Berlin +! +!------------------------------------------------------------------------------- +! +! uuu_f90.F90 - Fortran loops for (U * U * U) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine uuu_bwd(r, a, b, c) ! adds backward staple: + ! r = r + a^\dagger b^\dagger c + implicit none + SU3 :: r, a, b, c + integer :: i, j, k, m + + do i = 1, NCOL + do j = 1, NCOL + do k = 1, NCOL + do m = 1, NCOL + r(i,j) = r(i,j) + conjg(a(k,i)) * conjg(b(m,k)) * c(m,j) + enddo + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine uuu_bwd_m(r, a, b, c) ! subtracts backward staple: + ! 
r = r - a^\dagger b^\dagger c + implicit none + SU3 :: r, a, b, c + integer :: i, j, k, m + + do i = 1, NCOL + do j = 1, NCOL + do k = 1, NCOL + do m = 1, NCOL + r(i,j) = r(i,j) - conjg(a(k,i)) * conjg(b(m,k)) * c(m,j) + enddo + enddo + enddo + enddo + +end + +!------------------------------------------------------------------------------- +subroutine uuu_fwd(r, a, b, c) ! adds forward staple: + ! r = r + a b^\dagger c^\dagger + implicit none + SU3 :: r, a, b, c + integer :: i, j, k, m + + do i = 1, NCOL + do j = 1, NCOL + do k = 1, NCOL + do m = 1, NCOL + r(i,j) = r(i,j) + a(i,k) * conjg(b(m,k)) * conjg(c(j,m)) + enddo + enddo + enddo + enddo + +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_fwd.c b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_fwd.c new file mode 100644 index 0000000000000000000000000000000000000000..c0b544526f0e410f19639d6873d0b6ed4b9dcf08 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/uuu_fwd.c @@ -0,0 +1,353 @@ +/* +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2002, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! uuu_fwd.c - adds forward staple: r = r + a b^\dagger c^\dagger +! 
+!------------------------------------------------------------------------------- +*/ + +#include "types.h" + +#ifdef NamesToLower_ +# define UUU_FWD uuu_fwd_ +#endif + +#ifdef NamesToLower +# define UUU_FWD uuu_fwd +#endif + +void UUU_FWD(r, a, b, c) +COMPLEX8 *r, *a, *b, *c; +{ + register COMPLEX8 q__1, q__2; + register COMPLEX8 t1, t2, t3, x1, x2, x3; + + /* Parameter adjustments */ + c -= 4; + b -= 4; + a -= 4; + r -= 4; + + /* Function Body */ + q__1.r = a[4].r * b[4].r + a[4].i * b[4].i, + q__1.i = -a[4].r * b[4].i + a[4].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = a[4].r * b[5].r + a[4].i * b[5].i, + q__1.i = -a[4].r * b[5].i + a[4].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = a[4].r * b[6].r + a[4].i * b[6].i, + q__1.i = -a[4].r * b[6].i + a[4].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[7].r * b[7].r + a[7].i * b[7].i, + q__2.i = -a[7].r * b[7].i + a[7].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[7].r * b[8].r + a[7].i * b[8].i, + q__2.i = -a[7].r * b[8].i + a[7].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[7].r * b[9].r + a[7].i * b[9].i, + q__2.i = -a[7].r * b[9].i + a[7].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[10].r * b[10].r + a[10].i * b[10].i, + q__2.i = -a[10].r * b[10].i + a[10].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[10].r * b[11].r + a[10].i * b[11].i, + q__2.i = -a[10].r * b[11].i + a[10].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[10].r * b[12].r + a[10].i * b[12].i, + q__2.i = -a[10].r * b[12].i + a[10].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + + q__2.r = t1.r * c[4].r + t1.i * c[4].i, + q__2.i = -t1.r * 
c[4].i + t1.i * c[4].r; + q__1.r = r[4].r + q__2.r, + q__1.i = r[4].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[5].r + t1.i * c[5].i, + q__2.i = -t1.r * c[5].i + t1.i * c[5].r; + q__1.r = r[7].r + q__2.r, + q__1.i = r[7].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[6].r + t1.i * c[6].i, + q__2.i = -t1.r * c[6].i + t1.i * c[6].r; + q__1.r = r[10].r + q__2.r, + q__1.i = r[10].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[7].r + t2.i * c[7].i, + q__2.i = -t2.r * c[7].i + t2.i * c[7].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r + t2.i * c[8].i, + q__2.i = -t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[9].r + t2.i * c[9].i, + q__2.i = -t2.r * c[9].i + t2.i * c[9].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[10].r + t3.i * c[10].i, + q__2.i = -t3.r * c[10].i + t3.i * c[10].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[4].r = q__1.r, + r[4].i = q__1.i; + q__2.r = t3.r * c[11].r + t3.i * c[11].i, + q__2.i = -t3.r * c[11].i + t3.i * c[11].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[7].r = q__1.r, + r[7].i = q__1.i; + q__2.r = t3.r * c[12].r + t3.i * c[12].i, + q__2.i = -t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[10].r = q__1.r, + r[10].i = q__1.i; + + q__1.r = a[5].r * b[4].r + a[5].i * b[4].i, + q__1.i = -a[5].r * b[4].i + a[5].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = a[5].r * b[5].r + a[5].i * b[5].i, + q__1.i = -a[5].r * b[5].i + a[5].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = a[5].r * b[6].r + a[5].i * b[6].i, + q__1.i = -a[5].r * b[6].i + a[5].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[8].r * b[7].r + a[8].i * b[7].i, + q__2.i = -a[8].r * b[7].i + a[8].i * b[7].r; + 
q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[8].r * b[8].r + a[8].i * b[8].i, + q__2.i = -a[8].r * b[8].i + a[8].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[8].r * b[9].r + a[8].i * b[9].i, + q__2.i = -a[8].r * b[9].i + a[8].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[11].r * b[10].r + a[11].i * b[10].i, + q__2.i = -a[11].r * b[10].i + a[11].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[11].r * b[11].r + a[11].i * b[11].i, + q__2.i = -a[11].r * b[11].i + a[11].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[11].r * b[12].r + a[11].i * b[12].i, + q__2.i = -a[11].r * b[12].i + a[11].i * b[12].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + + q__2.r = t1.r * c[4].r + t1.i * c[4].i, + q__2.i = -t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[5].r + q__2.r, + q__1.i = r[5].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[5].r + t1.i * c[5].i, + q__2.i = -t1.r * c[5].i + t1.i * c[5].r; + q__1.r = r[8].r + q__2.r, + q__1.i = r[8].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[6].r + t1.i * c[6].i, + q__2.i = -t1.r * c[6].i + t1.i * c[6].r; + q__1.r = r[11].r + q__2.r, + q__1.i = r[11].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[7].r + t2.i * c[7].i, + q__2.i = -t2.r * c[7].i + t2.i * c[7].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r + t2.i * c[8].i, + q__2.i = -t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[9].r + t2.i * c[9].i, + q__2.i = -t2.r * c[9].i + t2.i * c[9].r; + q__1.r = x3.r + q__2.r, + q__1.i = 
x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[10].r + t3.i * c[10].i, + q__2.i = -t3.r * c[10].i + t3.i * c[10].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[5].r = q__1.r, + r[5].i = q__1.i; + q__2.r = t3.r * c[11].r + t3.i * c[11].i, + q__2.i = -t3.r * c[11].i + t3.i * c[11].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[8].r = q__1.r, + r[8].i = q__1.i; + q__2.r = t3.r * c[12].r + t3.i * c[12].i, + q__2.i = -t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[11].r = q__1.r, + r[11].i = q__1.i; + + q__1.r = a[6].r * b[4].r + a[6].i * b[4].i, + q__1.i = -a[6].r * b[4].i + a[6].i * b[4].r; + t1.r = q__1.r, + t1.i = q__1.i; + q__1.r = a[6].r * b[5].r + a[6].i * b[5].i, + q__1.i = -a[6].r * b[5].i + a[6].i * b[5].r; + t2.r = q__1.r, + t2.i = q__1.i; + q__1.r = a[6].r * b[6].r + a[6].i * b[6].i, + q__1.i = -a[6].r * b[6].i + a[6].i * b[6].r; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[9].r * b[7].r + a[9].i * b[7].i, + q__2.i = -a[9].r * b[7].i + a[9].i * b[7].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[9].r * b[8].r + a[9].i * b[8].i, + q__2.i = -a[9].r * b[8].i + a[9].i * b[8].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[9].r * b[9].r + a[9].i * b[9].i, + q__2.i = -a[9].r * b[9].i + a[9].i * b[9].r; + q__1.r = t3.r + q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + q__2.r = a[12].r * b[10].r + a[12].i * b[10].i, + q__2.i = -a[12].r * b[10].i + a[12].i * b[10].r; + q__1.r = t1.r + q__2.r, + q__1.i = t1.i + q__2.i; + t1.r = q__1.r, + t1.i = q__1.i; + q__2.r = a[12].r * b[11].r + a[12].i * b[11].i, + q__2.i = -a[12].r * b[11].i + a[12].i * b[11].r; + q__1.r = t2.r + q__2.r, + q__1.i = t2.i + q__2.i; + t2.r = q__1.r, + t2.i = q__1.i; + q__2.r = a[12].r * b[12].r + a[12].i * b[12].i, + q__2.i = -a[12].r * b[12].i + a[12].i * b[12].r; + q__1.r = t3.r 
+ q__2.r, + q__1.i = t3.i + q__2.i; + t3.r = q__1.r, + t3.i = q__1.i; + + q__2.r = t1.r * c[4].r + t1.i * c[4].i, + q__2.i = -t1.r * c[4].i + t1.i * c[4].r; + q__1.r = r[6].r + q__2.r, + q__1.i = r[6].i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t1.r * c[5].r + t1.i * c[5].i, + q__2.i = -t1.r * c[5].i + t1.i * c[5].r; + q__1.r = r[9].r + q__2.r, + q__1.i = r[9].i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t1.r * c[6].r + t1.i * c[6].i, + q__2.i = -t1.r * c[6].i + t1.i * c[6].r; + q__1.r = r[12].r + q__2.r, + q__1.i = r[12].i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t2.r * c[7].r + t2.i * c[7].i, + q__2.i = -t2.r * c[7].i + t2.i * c[7].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + x1.r = q__1.r, + x1.i = q__1.i; + q__2.r = t2.r * c[8].r + t2.i * c[8].i, + q__2.i = -t2.r * c[8].i + t2.i * c[8].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + x2.r = q__1.r, + x2.i = q__1.i; + q__2.r = t2.r * c[9].r + t2.i * c[9].i, + q__2.i = -t2.r * c[9].i + t2.i * c[9].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + x3.r = q__1.r, + x3.i = q__1.i; + q__2.r = t3.r * c[10].r + t3.i * c[10].i, + q__2.i = -t3.r * c[10].i + t3.i * c[10].r; + q__1.r = x1.r + q__2.r, + q__1.i = x1.i + q__2.i; + r[6].r = q__1.r, + r[6].i = q__1.i; + q__2.r = t3.r * c[11].r + t3.i * c[11].i, + q__2.i = -t3.r * c[11].i + t3.i * c[11].r; + q__1.r = x2.r + q__2.r, + q__1.i = x2.i + q__2.i; + r[9].r = q__1.r, + r[9].i = q__1.i; + q__2.r = t3.r * c[12].r + t3.i * c[12].i, + q__2.i = -t3.r * c[12].i + t3.i * c[12].r; + q__1.r = x3.r + q__2.r, + q__1.i = x3.i + q__2.i; + r[12].r = q__1.r, + r[12].i = q__1.i; + + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/w_mult.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/w_mult.F90 new file mode 100644 index 0000000000000000000000000000000000000000..1fafce298e018acbddfdb050ec791601a411a9d5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/w_mult.F90 @@ -0,0 +1,78 @@ 
+!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics program +! +! Author: Hinnerk Stueben +! +! Copyright (C) 2003-2005, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! w_mult.F90 - W := M~ + rho (Hasenbusch improvement) +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +subroutine w_mult(out, in, para, conf) ! out = W in + + use typedef_hmc + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + + SPINCOL_FIELD, intent(out) :: out + SPINCOL_FIELD, intent(in) :: in + + call mtil(out, in, para, conf) + + call sc_axpy(out, in, para%rho) + +end + +!------------------------------------------------------------------------------- +subroutine w_mult_dag(out, in, para, conf) ! out = W+ in + + use typedef_hmc + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + + SPINCOL_FIELD, intent(out) :: out + SPINCOL_FIELD, intent(in) :: in + + call mtil_dag(out, in, para, conf) + + call sc_axpy(out, in, para%rho) + +end + +!------------------------------------------------------------------------------- +subroutine w_dagger_w(out, in, para, conf) ! 
out = (W+ W) in + + use typedef_hmc + use module_p_interface + use module_vol + implicit none + + type(hmc_para), intent(in) :: para + type(hmc_conf), intent(in) :: conf + + SPINCOL_FIELD :: out, in + P_SPINCOL_FIELD, save :: tmp + + TIMING_START(timing_bin_mtdagmt) + + ALLOCATE_SC_FIELD(tmp) + + call w_mult(tmp, in, para, conf) + call w_mult_dag(out, tmp, para, conf) + + TIMING_STOP(timing_bin_mtdagmt) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_A/xyzt2i.F90 b/qcd/part_cpu/applications/QCD/src/kernel_A/xyzt2i.F90 new file mode 100644 index 0000000000000000000000000000000000000000..d87af69ff7a72b83981bed4df13362b5b8c64aa4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_A/xyzt2i.F90 @@ -0,0 +1,97 @@ +!=============================================================================== +! +! BQCD -- Berlin Quantum ChromoDynamics programme +! +! Author: Hinnerk Stueben +! +! Copyright (C) 1998-2006, Hinnerk Stueben, Zuse-Institut Berlin +! +!------------------------------------------------------------------------------- +! +! xyzt2i.F90 - maps local coordinates (x,y,z,t) to even/odd index +! +!------------------------------------------------------------------------------- +# include "defs.h" + +!------------------------------------------------------------------------------- +integer function xyzt2i(x_in) + + ! x_in := (x,y,z,t) + ! -1 <= x(mu) <= N(mu) ; mu = 1,2,3,4 + ! xyzt2i >= 1 + + use module_function_decl + use module_lattice + use module_offset + implicit none + + integer, dimension (DIM), intent(in) :: x_in + integer, dimension (DIM) :: dir, i, m, x + integer :: count, mu + integer, external :: ieo, n_sites, i_periodic, ilex + + + count = 0 + do mu = 1, DIM + + x(mu) = x_in(mu) + + if (x(mu) < -1 .or. 
x(mu) > N(mu)) then + call die('xyzt2i(): x(mu) out of range') + endif + + if (NPE(mu) == 1) x(mu) = i_periodic(x(mu), N(mu)) + + if (x(mu) == -1) then + dir(mu) = -1 + elseif (x(mu) == N(mu)) then + dir(mu) = 1 + else + dir(mu) = 0 + count = count + 1 + endif + + if (dir(mu) /= 0) then + i(mu) = 0 + m(mu) = 1 + else + i(mu) = x(mu) + m(mu) = N(mu) + endif + enddo + + if (count == DIM) then + xyzt2i = offset(0,0,0,0) + ieo(DIM, x, N) + 1 + else + ASSERT(num_pes() /= 1) + + if (dir(1) /= 0) then + !!ASSERT(n_sites(DIM, dir, N, NPE) == n_sites(DIM, dir, NH, NPE)) + !!ASSERT(ilex(DIM, i, m) <= n_sites(DIM, dir, NH, NPE)) + xyzt2i = offset(dir(1),dir(2),dir(3),dir(4)) + ilex(DIM, i, m) + 1 + else + xyzt2i = offset(dir(1),dir(2),dir(3),dir(4)) + ieo(DIM, i, m) + 1 + endif + endif + +end + +!------------------------------------------------------------------------------- +integer function std_xyzt2i(x) + + use module_lattice + implicit none + + integer, dimension (DIM), intent(in) :: x + integer, dimension (DIM) :: x_act + integer, external :: xyzt2i + + x_act(1) = x(gamma_index(1)) + x_act(2) = x(gamma_index(2)) + x_act(3) = x(gamma_index(3)) + x_act(4) = x(gamma_index(4)) + + std_xyzt2i = xyzt2i(x_act) +end + +!=============================================================================== diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..31b2207cc57537d5ff8f443a772d78649b7167ed --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile @@ -0,0 +1,10 @@ +include Makefile.defs + +#MODULE_INIT# +#MODULE_CMD# #MODULE_FILES# + +kernel: + cd libraries && $(MAKE) all + cd su3h_n && $(MAKE) kernel-objects + $(AR) $(ARFLAGS) ../kernel_B.a libraries/*.o su3h_n/*.o + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile.defs.in b/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile.defs.in new file mode 100644 index 
0000000000000000000000000000000000000000..3a5a290170bef0e2a21cc3daa385544ec63bbdab --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/Makefile.defs.in @@ -0,0 +1,13 @@ +MAKE = #MAKE# + +RM = #RM# + +AR = #AR# +ARFLAGS = #ARFLAGS# + +CC = #CC# +CFLAGS = #CFLAGS# + +MPI_CC = #MPI_CC# + +LDFLAGS = #LDFLAGS# diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..0b41dc11a810a6d13720e9cbef1b7e298dc700e7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/Makefile @@ -0,0 +1,12 @@ +OBJ = calclist.o dblarr.o halt.o io_unformat.o jacobi.o + +.PHONY : clean + +aa : aa.c $(OBJ) # aa.c is compiled by this recipe, so it must be a prerequisite or edits to it never rebuild aa + $(CC) aa.c $(OBJ) -o aa -lm + +%.o : %.c + $(CC) -c $< -o $@ + +clean : + rm -f *.o diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/aa.c b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/aa.c new file mode 100644 index 0000000000000000000000000000000000000000..f92c87803c6e540f8ddd1aaf7ba110d59a359a9a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/aa.c @@ -0,0 +1,1283 @@ +/************************************************************************* + * + * This c-program analyzes autocorrelations from + * histogram datafiles + * + * aa [opt] 'name' + * input files: + * histogram file + * output to standard-output + * + * COMPILE: + * cc -o aa aa.c io_unformat.c calclist.c dblarr.c jacobi.c halt.c -lm + * + *************************************************************************/ + + +#include +#include +#include +#include +#include +/* #include */ +#include + +#include "stuff.h" + +/* #include */ + +/* #define max(x,y) (((x) > (y)) ? (x) : (y)) */ + +/* prototypes... 
+ */ + + +#define MAX_DATA 10000 + +#define PI 3.14159265358979 +#define pi2 (PI*2.0) + +double * dblalloc(int num); +void halt(char *message,char *msg); +int ccorr_matrix(double *dat[],int n,int length,int jack, + double *wd,int weight,double wnorm); +int history(int nv, double a[],int showlevel,double level_val, + double wd[],int weight); +int errorcalc(int nconf,double svect[],double *avep,double *sigp, + double *fsp,double *fser); +int autocorr(double *d,int nd,double *res,int nres); +int histgr2(double *a,double *b,int ndata,int bins,double *wd,int weight, + double low1,double high1,double low2,double high2); +double tunnelcalc(int nd,double *d,double tmin,double tmax); + +int noauto = 0; + +char usage[] = " usage: aa [-opt] hist-file\n\ + q : quiet\n\ + j nblocks : jackknife data\n\ + J nblocks : print jackknife blocks\n\ + A : print all measurements as jackknife blocks\n\ + i iters : use only i iterations\n\ + s iters : skip iteration\n\ + S n : use only every n iteration\n\ + d num : use only data num\n\ + D #2+#3,#4 : collect data and do the arithmetic [+-*/]\n\ + R r1:r2 : print data from r1 to r2\n\ + f num=val : filter data with vector num = value\n\ + l[L] a:b : data range from a to b [element 2]\n\ + X : print just the data indicated\n\ + C : cross-correlation of data\n\ + E : eigenvalue analysis of cross-correlation matrix\n\ + c num : correlation number num\n\ + cl length : force vector length\n\ + cc/cd num : print 2-dim. 
array number num\n\ + cos : cosine transform the data (only with -j, -A)\n\ + a : sum -D or -c into one data element\n\ + x d,i,num : datavector element d[index[num]], num optional\n\ + K n : take modulo of index\n\ + h : print histogram plot\n\ + v value : show value in the histogram\n\ + V : divide value by volume\n\ + Q : multiply value by sqrt of volume\n\ + w : use the weight factor (elem 1)\n\ + 0 : subtract the 0-moment (average)\n\ + 2 : print second moment\n\ + p power : print moment power\n\ + F bins : print histogram in weight format\n\ + r block : do the 'running blocking'\n\ + b block : do the 'normal blocking'\n\ + m/M value : analyze only up to minimum/maximum value\n\ + O : list measurements up to minimum/maximum value\n\ + P : lag the list by one unit\n\ + g [bins] : print histogram to the standard output, with bins bins\n\ + B [binw] : print histogram, with binwidth [1]\n\ + H : print history to the standard output\n\ + G bins : 2-component histogram\n\ + W b_old,b_new,\'act_string\' : reweight to new beta value (up to 10 times)\n\ + Y bg,mU_old,mU_new,T_old,T_new : reweight to new T value (susy)\n\ + Z mH_old,mH_new,b_old,b_new : reweight to new mH, betaH -values (su2-Higgs)\n\ + y tmin:tmax : print tunneling time\n\ + t : no autocorrelations\n\ + T length : print autocorrelation function for distance length\n"; + +main(argc,argv) + int argc; + char * argv[]; + +{ + double *fp,*dat[MAX_DATA],*tmparr,*wd; + double *tmpx; + int block,indexed,iv,dv,index; + int nblocking,length,jack,sum; + double temps,minval,maxval,tmin,tmax; + char * ss,*lists[MAX_DATA]; + int raw,eig; + int idata,nd; + int i,j,jj,k,iters,hist,printhist; + long lk; + int gram,rblock,bblock,skip,weight; + double hbin,error,naive,aveg,average[MAX_DATA],timerel,level_val; + int list,limits,lagged,showlevel,stagger,mom2,ccorr; + int vol,volume,wgram,irange1,irange2,icorr,printjack,index_mod; + int four,h2gram,reweight,susy,higgs,isarray,tunnel; + int quiet,sqrtV,icorrlen; + int 
filter; + double filtervalue; + double betar,wnorm,vnorm; + double mU,mUn,mH,mH2,T,Tn; + int autocorrelation = 0,sub_ave = 0; + double power = 0,beta1[10],beta2[10]; + char rwstring[10][100]; + FILE *ff; + e_header h; + double low1,high1,low2,high2; + double susy_mul,betag; + + hbin = 0.0; + block = nblocking = iters = hist = printhist = gram = idata = + rblock = bblock = jack = list = limits = stagger = mom2 = quiet = 0; + lagged = showlevel = weight = volume = wgram = 0; + irange1 = irange2 = icorr = icorrlen = 0; + ccorr = sqrtV = sum = four = h2gram = 0; + printjack = indexed = index_mod = reweight = 0; + susy = higgs = raw = eig = isarray = 0; + filter = tunnel = 0; + nd = 0; + minval = -1e-60; + maxval = 1e60; + + low1=low2=high1=high2=0; + + skip = 0; + if (argc <= 1) halt(usage,NULL); + + while (--argc > 0 && (*++argv)[0] == '-') { + ss = argv[0] + 1; + + while (*ss) { + switch(*ss++) { + + case 'q': quiet = 1; break; + case 'O': list = 1; break; + case 'P': lagged = 1; break; + case 't': noauto = 1; break; + case 'T': getnum("%d",&autocorrelation); break; + case 'y': tunnel = 1; get2num("%lg:%lg",&tmin,&tmax); break; + case 'w': weight = 1; break; + case '0': sub_ave = 1; break; + case '2': mom2 = 1; break; + case 'a': sum = 1; break; + case 'p': getnum("%lg",&power); break; + case 'F': wgram = weight = volume = 1; idata = 4; + getnum("%d",&gram); break; + case 'V': volume = 1; break; + case 'Q': sqrtV = 1; break; + case 'c': + if (strcmp(ss,"os") == 0) { four = 1; ss+=2; } + else if (*ss == 'l') { ss++; getnum("%d",&icorrlen); } + else { + if (*ss == 'c') { isarray=1; ss++; } + if (*ss == 'd') { isarray=2; ss++; } + getnum("%d",&icorr); + } + break; + case 'm': limits = 1; getnum("%lg",&minval); break; + case 'M': limits = 1; getnum("%lg",&maxval); break; + case 'i': getnum("%d",&iters); break; + case 's': getnum("%d",&skip); break; + case 'S': getnum("%d",&stagger); break; + case 'r': getnum("%d",&rblock); break; + case 'b': getnum("%d",&bblock); break; 
+ case 'h': hist = 1; break; + case 'H': printhist = 1; break; + case 'v': showlevel = 1; getnum("%lg",&level_val); break; + case 'j': getnum("%d",&jack); break; + case 'J': printjack = 1; getnum("%d",&jack); break; + case 'A': printjack = 1; jack = -1; break; + case 'W': + /* take away white space */ + { char *p,*q; + p = q = ss; + while (*p) { + if (*p != ' ' && *p != '\t') *(q++) = *p; + p++; + } + *q = 0; + } + get3num("%lg,%lg,%s",&beta1[reweight],&beta2[reweight],rwstring[reweight]); + beta2[reweight] -= beta1[reweight]; + reweight++; + break; + case 'Y': reweight = susy = 1; + get5num("%lg,%lg,%lg,%lg,%lg",&betag,&mU,&mUn,&T,&Tn); + break; + case 'Z': reweight = higgs = 1; + get4num("%lg,%lg,%lg,%lg",&mH,&mH2,&beta1[0],&beta2[0]); + break; + + case 'd': getnum("%d",&idata); nd = 1; break; + case 'D': getlist(lists,nd); break; + + case 'f': + get2num("%d=%lg",&filter,&filtervalue); + break; + + case 'X': raw = 1; break; + case 'E': eig = 1; break; + + case 'R': + if (!(*ss) && --argc) ss = (++argv)[0]; + if (sscanf(ss,"%d:%d",&irange1,&irange2) != 2) halt(usage,NULL); + ss = strchr(ss,0); + break; + + case 'x': + indexed = 1; + if (!(*ss) && --argc) ss = (++argv)[0]; + if ((i = sscanf(ss,"%d,%d,%d",&dv,&iv,&index)) == 3) { + irange1 = irange2 = index; + } else if (i == 2) index = 0; + else halt(usage,NULL); + ss = strchr(ss,0); + break; + + case 'K': getnum("%d",&index_mod); break; + + case 'C': ccorr = 1; break; + + case 'B': getoptnum("%lg",&hbin,1.0); gram = 1; break; + case 'g': getoptnum("%d",&gram,100); break; + case 'G': getnum("%d",&h2gram); break; + + case 'l': get2num("%lg:%lg",&low1,&high1); break; + case 'L': get2num("%lg:%lg",&low2,&high2); break; + + default: halt(usage,NULL); + + } /* switch */ + } + } /* while */ + + if (argc == 0) halt(usage,NULL); + + /* now find the histogram file length */ + + ss = argv[0]; + + ff = fopen(ss,"r"); + if (ff == NULL) halt("Could not open file %s",ss); + + block = readheader(ff,&h); + if (icorrlen) h.d2 = 
icorrlen; + tmparr = dblarr(block); + + vol = h.lx*h.ly*h.lz*h.lt; + + if (indexed && !irange1) icorr = 1; + + if (icorr) { + if (h.d1 >= icorr || icorrlen) { + irange1 = 1 + (icorr-1)*h.d2; + irange2 = irange1 + h.d2 - 1; + } else if (h.d1 + h.d3 >= icorr) { + irange1 = 1 + h.d1*h.d2 + (icorr-1-h.d1)*h.d4; + irange2 = irange1 + h.d4 - 1; + } else halt("No such thing!",NULL); + } + + if (indexed) tmpx = dblarr(irange2-irange1+1); + + /* do not assume only doubles */ + if (iters) iters += skip; + + length = 0; + do { + length++; + lk = readdata(ff,tmparr); + } while (lk == length && iters != length); + if (lk != length) length--; + rewind(ff); + skipheader(ff); + if (icorrlen) h.d2 = icorrlen; + + length -= (skip); + if (!quiet) { + fprintf(stderr,"* Data: %d Measurements: %d\n",block,length); + fprintf(stderr," double %ld, float %ld, long %ld, char %ld\n", + h.n_double,h.n_float,h.n_long,h.n_char); + } + + if (length <= 0) halt("Measurements == 0",NULL); + if (jack < 0) jack = length; + + if (volume) vnorm = 1.0/vol; else vnorm = 1; + if (sqrtV) vnorm *= sqrt(vol); + + if (!raw) { + if (nd) { + for (i=0; i block) halt("Illegal data number",NULL); + if (irange1 > irange2 || irange2 > block) halt("Illegal range spec",NULL); + + if (weight || reweight) wd = dblarr(length); + + if (susy) susy_mul = pow(4.0/(betag * 4.0/9.0),3.0) * (h.lx*h.ly*h.lz); + + j = skip; + while (j-- > 0) lk = readdata(ff,tmparr); + + if (idata) fp = dat[0]; + for (jj=j=0; jj < length; jj++) { + extern int calclist_index; + + lk = readdata(ff,tmparr); + + calclist_index = jj; + + k=0; + if (!filter || tmparr[filter-1] == filtervalue) { + + if (reweight || weight) { + double w; + static int first=1; + static double w0; + + if (susy) + /*** THis is the `old' susy weight + w = 0.379*(sqr(100.0/T) - sqr(100.0/Tn)) * tmparr[8] + + 0.849*(sqr(mU/T) - sqr(mUn/Tn)) * tmparr[16]; + *****/ + + w = -susy_mul*( (1.0/sqr(T) - 1.0/sqr(Tn)) * + ( 18384.1*tmparr[3] - 3984.08*tmparr[4] - 2*1191.72*tmparr[7] + + 
2*96.6867*tmparr[8] ) + - (sqr(mU/T) - sqr(mUn/Tn))*tmparr[14] ); + + else if (higgs) w = (beta2[0]/beta1[0]-1)*tmparr[2] + + (sqr(beta2[0]*mH2/(beta1[0]*mH))-1)*tmparr[4]; + else if (reweight) + for (w=i=0; i= minval) { + if (lagged && !lag_on) lag_on = 1; + else { + j++; + if (!isaccept) { + isaccept = 1; + begmeas = i; + } + } + } else { + lag_on = 0; + if (isaccept) { + isaccept = 0; + printf("%d %d\n",begmeas,i-1); + } + } + } + if (isaccept) printf("%d %d\n",begmeas,length-1); + + if (!quiet) fprintf(stderr," - accepted %d of %d measurements\n",j,length); + exit(-1); + } + + if (limits) { + double *dd[MAX_DATA],mval,Mval; + + for (idata=irange1; idata<=irange2; idata++) dd[idata] = dblarr(length); + for (i=j=0; i= minval) { + for (idata=irange1; idata<=irange2; idata++) + dd[idata][j] = dat[idata][i]; + j++; + } + } + for (idata=irange1; idata<=irange2; idata++) { + free(dat[idata]); + dat[idata] = dd[idata]; + } + if (!quiet) fprintf(stderr," - accepted %d of %d measurements\n",j,length); + length = j; + if (jack > length) jack = length; + } + + if (stagger) { + double * dd,val; + + for (idata=irange1; idata<=irange2; idata++) { + dd = (double *)calloc(length/stagger+1,sizeof(double)); + for (i=0; i length) jack = length; + } + + if (rblock) { + double * dd,val; + + for (idata=irange1; idata<=irange2; idata++) { + dd = (double *)calloc(length-rblock+1,sizeof(double)); + for (val=i=0; i length) jack = length; + } + + idata = irange1; + + if (printjack && gram) { + printhgram(dat[0],gram,length,wd,weight,jack,hbin); + return(1); + } + + if (printjack || jack) { + double *jave,*jnorm; + int dl,id; + + dl = irange2-irange1+1; + jave = dblarr(dl*jack); + jnorm = dblarr(jack); + + for (j=0; j 1) ftrans(jave+j*dl,dl); + if (printjack) { + for (idata=irange1; idata<=irange2; idata++) { + id = idata-irange1; + if (!isarray) printf("%d %.16lg\n",id,jave[id+j*dl]); + else if (isarray == 1) + printf("%ld %ld %.16lg\n",id/h.d3,id%h.d3,jave[id+j*dl]); + else { + 
printf("%.16lg ",jave[id+j*dl]); + if ((id+1)%h.d3 == 0) printf("\n"); + } + + /* fprintf(stderr,"."); fflush(stderr); */ + } + } + } + if (!printjack) { + double aveg,error; + + for (idata=irange1; idata<=irange2; idata++) { + id = idata-irange1; + aveg = error = 0.0; + for (j=0; j p[i]) ? p[i] : minv; + } + + if (hbin <= 0) { + hg = (double *)calloc(bins,sizeof(double)); + d = (maxv-minv)/(bins-1); + bn = bins; + maxv += d/2; minv -= d/2; + xadd = 0.5; + } else { + maxv = hbin * ceil(maxv/hbin); + minv = hbin * floor(minv/hbin); + bins = maxv/hbin - minv/hbin + 1; + bn = bins-1; + hg = (double *)calloc(bins,sizeof(double)); + d = hbin; + xadd = 0; + } + + if (jack < 2) { + + if (!weight) + for (i=0; i a[i]) ? a[i] : mina; + } + } + d = (maxa-mina)/(bins-1); + maxa += d/2; mina -= d/2; + + if (maxb <= minb) { + maxb = -1e60; minb = -maxb; + for (i=0; i b[i]) ? b[i] : minb; + } + } + d = (maxb-minb)/(bins-1); + maxb += d/2; minb -= d/2; + + for (i=0; i ac[j]) ? max : ac[j]; + } + + for (j=0; j= min && level_val <= max) { + for (j=0; j=0; j--) printf("%s\n",level[j]); + + free(his); + + for (i=0; i <= NLEV; i++) { + sp = level[i]; + for (j=0; j< bins; j++) sp[j] = ' '; + sp[j] = 0; + } + + if (showlevel && level_val >= min && level_val <= max) { + for (j=0; j ac[i]) ? 
dmax : ac[i]; + k++; + } + dave /= k; + dsig = sqrt(dsig/k - sqr(dave)); + level[ (int)(NLEV*(dmin-min)/(max-min)) ][j] = '.'; + level[ (int)(NLEV*(dmax-min)/(max-min)) ][j] = '.'; + if (dave+dsig <= max) + level[ (int)(NLEV*(dave+dsig-min)/(max-min)) ][j] = '-'; + if (dave-dsig >= min) + level[ (int)(NLEV*(dave-dsig-min)/(max-min)) ][j] = '-'; + level[ (int)(NLEV*(dave-min)/(max-min)) ][j] = '*'; + } + + printf("\nblocked history with %d blocks, size %d ...\n",bins,nv/bins); + + for (j=NLEV; j>=0; j--) printf("%s\n",level[j]); + + return(1); +} + +/******************************************************************** + * Assume here periodicity so that d[2*nd - 1 - i] = d[i] + */ + +int +ccorr_matrix(double *dat[],int n,int length,int jack, + double *wd,int weight,double wnorm) +{ + int i,j,k; + double *a,*w,*eval,*evec; + + a = dblarr(n); + w = dblarr(n*n); + eval = dblarr(n); + evec = dblarr(n*n); + + if (!weight) wnorm = 1.0/length; else wnorm /= length; + + for (j=0; j 1 && (sig == 0.0 || sig < 1.0e-12*sqr(ave) || noauto)) { + /* if (!noauto) fprintf(stderr," ** small sigma/ave: %g/%g\n",sqrt(sig),ave); */ + *avep = ave; + *sigp = sqrt(sig/(nconf-1)); + *fsp = *fser = 0.5; + return(1); + } + + if (nconf <= 1) { + *avep = ave; + *sigp = 0.0; + *fsp = *fser = 0.5; + return(1); + } + + /* time correlations */ + + it = 0; + fs = 0.5; + do { + it++; + ax = ay = fi = 0.0; + + nc=nconf-it; + for (j=0; j it && it < nconf/2); + + if (it >= nconf/2) fprintf(stderr," ** correlation > N/2*%d\n",TINT); + + sig=sqrt(sig/(nconf-1)); + + *sigp = sig; + *avep = ave; + *fsp = fs; + *fser = fs*sqrt(2.*(2.*it+1)/nc); + + return(0); +} + + +/******************************************************************** + * this routine calculates autocorrelation function + */ + +int +autocorr(d,nd,res,nres) + int nd,nres; + double d[], res[]; +{ + double ave,fs,sig,ax,ay,fi; + int i,j,nc,it,k; + + ave = sig = 0.0; + + for(i=0; i= tmax) { + if (!ismax) num++; + ismax = 1; ismin = 0; + } else 
if (d[i] <= tmin) {
+      if (!ismin) num++;
+      ismax = 0; ismin = 1;
+    }
+  }
+
+  return(1.0*nd/num);
+}
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/calclist.c b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/calclist.c
new file mode 100644
index 0000000000000000000000000000000000000000..4e7a7779b1edf6ece4fb4ce3513663dcaf2b2f8c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/calclist.c
@@ -0,0 +1,274 @@
+#include
+#include
+#include
+#include
+#include
+/* #include */
+#include
+
+#include "stuff.h"
+#ifdef cray
+double asinh(double x) { return(log(sqrt(x*x+1.0) + x)); }
+double acosh(double x) { return(log(sqrt(x*x-1.0) + x)); }
+double atanh(double x) { return(0.5*log((1.0+x)/(1.0-x))); }
+#endif
+
+#define pi 3.1415926535897929
+
+char * getnumber(char *s,double *d);
+double evallist(double d[],int dn, int prec);
+void eval2(double *val1, double *val2, double d[],int dn);
+
+
+static char *cmd, *cmd0;
+/* This is modified by the calling program, to produce correct
+ * index
+ */
+int calclist_index = 0;
+
+/**********************************************************
+ * calclist -- evaluate the data arithmetic string
+ *
+ *   d      data vector; "#k" in the string stands for d[k-1]
+ *   dn     number of elements in d
+ *   incmd  the arithmetic string
+ *
+ * Returns the value of the string.  A syntax error prints a
+ * message on stderr and ends the program.
+ */
+
+double
+calclist(double d[],int dn, char *incmd)
+{
+  double val;
+
+  /* save current number of calclist calls - return with #i */
+
+  cmd0 = cmd = incmd;
+  val = evallist(d,dn,0);
+  if (*cmd == 0) return(val);
+  if (*cmd == ')') {
+    fprintf(stderr,"Extra \')\' : %s\n",cmd0);
+    exit(0);
+  }
+  if (*cmd == ',') {
+    fprintf(stderr,"Extra \',\' : %s\n",cmd0);
+    exit(0);
+  }
+  fprintf(stderr,"Parser error: %s\n",cmd0);
+  /* leave like the error branches above; the function used to run
+   * off its end here and hand the caller an undefined value */
+  exit(0);
+  return(val); /* not reached, just to get rid of a warning */
+}
+
+
+/**********************************************************
+ * evallist -- recursive worker of calclist
+ *
+ *   d, dn  data vector and its length, as for calclist
+ *   prec   precedence of the operator to the left of this
+ *          operand: 0 none, 2 comparisons, 3 + -, 4 * / %, 5 ^;
+ *          an operator of the same or lower precedence ends
+ *          the call and is left for the caller
+ *
+ * Parses at the file-scope cursor cmd and advances it.
+ */
+double
+evallist(double d[],int dn,int prec)
+{
+  int id;
+  char *sp;
+  double val,val2;
+
+  /* get argument first */
+  while (*cmd == ' ') cmd++;
+  if (*cmd == '(') {
+    cmd++;
+    val = evallist(d,dn,0);
+    if (*cmd != ')') {
+      fprintf(stderr,"Expecting \')\': \'%s\'\n",cmd0);
+      exit(0);
+    }
+    cmd++;
+  } else if (*cmd == '#') {
+    cmd++;
+    if (*cmd == 'i') {
+      /* print out index */
+      val = calclist_index;
+      cmd++;
+    } else {
+      if ((cmd=getnumber(cmd,&val)) == NULL) {
+        fprintf(stderr,"Expecting 'i'/number after #: \'%s\'\n",cmd0);
+        exit(0);
+      }
+      id = val;
+      /* #k is 1-based; #0 or a negative k would read before d[0] */
+      if (id < 1) {
+        fprintf(stderr,"#%d too small: min 1\n",id);
+        exit(0);
+      }
+      if (id > dn) {
+        fprintf(stderr,"#%d too large: max %d\n",id,dn);
+        exit(0);
+      }
+      val = d[id-1];
+    }
+  }
+  else if ((sp=getnumber(cmd,&val)) != NULL) cmd = sp;
+  else {
+    /* cleared by names that take no argument list, so that they do
+     * not swallow a ')' belonging to an enclosing expression */
+    int needclose = 1;
+
+    /* now something else as ( or #; check for function name */
+    if (strncmp(cmd,"sqrt(",5) == 0) { cmd+=5; val = sqrt(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"abs(",4) == 0) { cmd+=4; val = fabs(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"sin(",4) == 0) { cmd+=4; val = sin(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"cos(",4) == 0) { cmd+=4; val = cos(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"tan(",4) == 0) { cmd+=4; val = tan(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"exp(",4) == 0) { cmd+=4; val = exp(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"log(",4) == 0) { cmd+=4; val = log(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"asin(",5) == 0) { cmd+=5; val = asin(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"acos(",5) == 0) { cmd+=5; val = acos(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"atan(",5) == 0) { cmd+=5; val = atan(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"sinh(",5) == 0) { cmd+=5; val = sinh(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"cosh(",5) == 0) { cmd+=5; val = cosh(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"tanh(",5) == 0) { cmd+=5; val = tanh(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"asinh(",6) == 0){ cmd+=6; val = asinh(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"acosh(",6) == 0){ cmd+=6; val = acosh(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"atanh(",6) == 0){ cmd+=6; val = atanh(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"ceil(",5) == 0) { cmd+=5; val = ceil(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"floor(",6) == 0){ cmd+=6; val = floor(evallist(d,dn,0)); }
+    else if (strncmp(cmd,"min(",4) == 0) {
+      cmd+=4; eval2(&val,&val2,d,dn); val = smaller(val,val2); }
+    else if (strncmp(cmd,"max(",4) == 0) {
+      cmd+=4; eval2(&val,&val2,d,dn); val = greater(val,val2); }
+    else if (strncmp(cmd,"pi",2) == 0) {
+      cmd+=2; val = pi; needclose = 0; }
+    else { fprintf(stderr,"Unknown stuff: %s\n",cmd0); exit(0); }
+
+    if (needclose) {
+      if (*cmd != ')') {
+        fprintf(stderr,"Expecting \')\' after the function name: \'%s\'\n",cmd0);
+        exit(0);
+      }
+      cmd++;
+    }
+  }
+
+  /* now the operator */
+  while (*cmd == ' ') cmd++;
+
+  while (*cmd) {
+    switch (*cmd) {
+    case ',': return(val);
+    case ')': return(val);
+    case '^':
+      if (prec >= 5) return(val);
+      cmd++;
+      val = pow(val,evallist(d,dn,5));
+      break;
+    case '*':
+      if (prec >= 4) return(val);
+      cmd++;
+      val *= evallist(d,dn,4);
+      break;
+    case '/':
+      if (prec >= 4) return(val);
+      cmd++;
+      val /= evallist(d,dn,4);
+      break;
+    case '%':
+      if (prec >= 4) return(val);
+      cmd++;
+      val = fmod(val,evallist(d,dn,4));
+      break;
+    case '+':
+      if (prec >= 3) return(val);
+      cmd++;
+      val += evallist(d,dn,3);
+      break;
+    case '-':
+      if (prec >= 3) return(val);
+      cmd++;
+      val -= evallist(d,dn,3);
+      break;
+
+    case '=':
+      if (cmd[1] != '=') { fprintf(stderr,"Expecting '==': %s\n",cmd0); exit(0); }
+      if (prec >= 2) return(val);
+      cmd += 2;
+      val = (val == evallist(d,dn,2));
+      break;
+    case '<':
+      if (prec >= 2) return(val);
+      cmd++;
+      if (cmd[0] == '=') {
+        /* skip the '=' first: the operand starts behind it */
+        cmd++;
+        val = (val <= evallist(d,dn,2));
+      } else val = (val < evallist(d,dn,2));
+      break;
+    case '>':
+      if (prec >= 2) return(val);
+      cmd++;
+      if (cmd[0] == '=') {
+        /* skip the '=' first: the operand starts behind it */
+        cmd++;
+        val = (val >= evallist(d,dn,2));
+      } else val = (val > evallist(d,dn,2));
+      break;
+    case '!':
+      if (cmd[1] != '=') { fprintf(stderr,"Expecting '!=': %s\n",cmd0); exit(0); }
+      if (prec >= 2) return(val);
+      cmd += 2;
+      val = (val != evallist(d,dn,2));
+      break;
+
+    default:
+      fprintf(stderr,"Unknown operator: %s\n",cmd0);
+      exit(0);
+    }
+  }
+  return(val);
+}
+
+/**************************************************************
+ * eval 2 arguments -- must be comma!
+ */ + +void +eval2(double *val1, double *val2, double d[],int dn) +{ + *val1 = evallist(d,dn,0); + if (*cmd != ',') { fprintf(stderr,"Expecting ',' : %s\n",cmd0); exit(0);} + cmd++; + *val2 = evallist(d,dn,0); +} + + +/************************************************************** + * hop number + */ +char * +getnumber(char *s,double *d) +{ + int dot = 0; + + if (sscanf(s,"%lg",d) != 1) return(NULL); + + while (*s == ' ' || *s == '\t') s++; + if (*s == '+' || *s == '-') s++; + if (*s == '.') { s++; dot = 1; } + while (*s <= '9' && *s >= '0') s++; + if ((!dot) && *s == '.') s++; + while (*s <= '9' && *s >= '0') s++; + if (*s == 'e' || *s == 'E') { + s++; + if (*s == '+' || *s == '-') s++; + while (*s <= '9' && *s >= '0') s++; + } + return(s); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/dblarr.c b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/dblarr.c new file mode 100644 index 0000000000000000000000000000000000000000..c6281ea10193fd37e64e154da0c5233dbfc830ec --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/dblarr.c @@ -0,0 +1,48 @@ +#include +#include +#include +#include +#include +/* #include */ +#include + +/* #include */ + +#include "stuff.h" + +double * dblarr(int size) +{ + double * p; + + p = (double *)calloc(size,sizeof(double)); + if (p == NULL) { + fprintf(stderr," --- could not allocate double array of size %d\n",size); + exit(0); + } + return (p); +} + +float * fltarr(int size) +{ + float * p; + + p = (float *)calloc(size,sizeof(float)); + if (p == NULL) { + fprintf(stderr," --- could not allocate float array of size %d\n",size); + exit(0); + } + return (p); +} + + +int * intarr(int size) +{ + int * ip; + + ip = (int *)calloc(size,sizeof(int)); + if (ip == NULL) { + fprintf(stderr," --- could not allocate int array of size %d\n",size); + exit(0); + } + return(ip); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/halt.c b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/halt.c new file mode 100644 index 
0000000000000000000000000000000000000000..02a432d38189e5c06fa5cf5ac233c36f540e8c86 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/halt.c @@ -0,0 +1,16 @@ +#include +#include +#include +#include +#include +/* #include */ +#include + +int +halt(char *s,void *p) +{ + fprintf(stderr,s,p); + fprintf(stderr,"\n"); + exit(0); + return(0); /* just to get rid of an warning.. */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/aa/io_unformat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/io_unformat.c new file mode 100644 index 0000000000000000000000000000000000000000..b72c7cc17970ee3c16ded488d85648a3f0196221 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/aa/io_unformat.c @@ -0,0 +1,292 @@ +/* + * UNFORMATTED IO SYSTEM + * Kari Rummukainen 1990 - 1999 + */ + + +#include +#include +#include +#include +#include +/* #include */ +#include + +#include "stuff.h" + + +static int inv_bytes,long_mode,block,dblock,lblock,fblock,cblock,iblock; +static double *dtmparr; +static float *ftmparr; +static long *ltmparr; +static int *itmparr; +static char *ctmparr; +static int msg = 0; + +#define l_h (sizeof(e_header)/sizeof(long)) + +typedef union { + e_header h; + long l[l_h]; +} h_union; + +typedef union { + i_header h; + int l[l_h]; +} i_union; + +#define ll_h (sizeof(ll_header)/sizeof(int)) + +typedef union { + ll_header h; + int l[ll_h]; +} ll_union; + + + +/************************************************** + * invert the byte ordering + */ + +long +swaplong(long a) +{ + union { + long l; + char c[sizeof(long)]; + } t1,t2; + int i; + + t1.l = a; + for (i=0; i= 2) { /* now long is longer than our long */ + fread(&ll.h,sizeof(ll_header),1,ff); + + for (k=0, j=( (long_mode == 2) ? 
0 : 1 ); jlz < 1) h->lz = 1; + if (h->lt < 1) h->lt = 1; + + lblock = iblock = 0; + block = dblock = l.h.n_double; + if (dblock) dtmparr = dblarr(l.h.n_double); + block += fblock = l.h.n_float; + if (fblock) ftmparr = (float *)calloc(l.h.n_float,sizeof(float)); + block += cblock = l.h.n_char; + if (cblock) ctmparr = (char *)calloc(l.h.n_char,sizeof(char)); + + if (l.h.n_long) { + if (long_mode == 0) { + block += lblock = l.h.n_long; + ltmparr = (long *)calloc(l.h.n_long,sizeof(long)); + } else if (long_mode == 1) { + block += iblock = l.h.n_long; + itmparr = (int *)calloc(l.h.n_long,sizeof(int)); + } else if (long_mode >= 2) { + block += iblock = l.h.n_long; + itmparr = (int *)calloc(2*l.h.n_long,sizeof(int)); + } + } + + return (block); + +} + +int +skipheader(FILE *ff) +{ + e_header e; + i_header i; + + if (long_mode == 0) fread(&e,sizeof(e_header),1,ff); + else if (long_mode == 1) fread(&i,sizeof(i_header),1,ff); + else if (long_mode >= 2) { + fread(&i,sizeof(i_header),1,ff); + fread(&i,sizeof(i_header),1,ff); + } + return(1); +} + + +#define itmp_index(i) ((long_mode < 2) ? 
i : 2*i + long_mode-2) + +long +readdata(FILE *ff,double *arr) +{ + int ik,k,i; + long lk; + + if (dblock) fread(dtmparr,dblock,sizeof(double),ff); + if (fblock) fread(ftmparr,fblock,sizeof(float),ff); + if (lblock) fread(ltmparr,lblock,sizeof(long),ff); + if (iblock && long_mode < 2) fread(itmparr,iblock,sizeof(int),ff); + if (iblock && long_mode >= 2) fread(itmparr,2*iblock,sizeof(int),ff); + if (cblock) fread(ctmparr,cblock,sizeof(char),ff); + + k = 0; + if (long_mode == 1) { + fread(&ik,1,sizeof(int),ff); if (inv_bytes) lk = swapint(ik); else lk = ik; + } else if (long_mode == 0) { + fread(&lk,1,sizeof(long),ff); if (inv_bytes) lk = swaplong(lk); + } else { + int a[2]; + fread(&a,2,sizeof(int),ff); + if (inv_bytes) lk = swapint(a[long_mode-2]); + else lk = a[long_mode-2]; + } + + if (inv_bytes) { + for (i=0; i +#include +#include + +double fabs(double); + +#define verysmall(a,b) ((fabs(a)+b) - fabs(a) == 0) + +void eigsrt(int n,double d[],double *v); + +int jacobi(int n, double *ap,double d[],double *vp,int is_ordered) +{ + double *a,*b,*z,*v; + int ip,iq,i,j,nrot; + double sm,tresh,g,h,t,theta,tau,c,s; + + a = (double *)calloc(n*n,sizeof(double)); + v = (double *)calloc(n*n,sizeof(double)); + b = (double *)calloc(n,sizeof(double)); + z = (double *)calloc(n,sizeof(double)); + + for (i=0; i 4 && verysmall(d[ip],g) && verysmall(d[iq],g)) a[ip*n+iq] = 0; + else if (fabs(a[ip*n+iq]) > tresh) { + h = d[iq]-d[ip]; + if (verysmall(h,g)) t = a[ip*n+iq]/h; + else { + theta = 0.5*h/a[ip*n+iq]; + t = 1./(fabs(theta)+sqrt(1.0+theta*theta)); + if (theta < 0) t = -t; + } + c = 1./sqrt(1+t*t); + s = t*c; + tau = s/(1.0+c); + h = t*a[ip*n+iq]; + z[ip] -= h; z[iq] += h; + d[ip] -= h; d[iq] += h; + a[ip*n+iq] = 0; + for (j=0; j<=ip-1; j++) { + g = a[j*n+ip]; h = a[j*n+iq]; + a[j*n+ip] = g - s*(h + g*tau); + a[j*n+iq] = h + s*(g - h*tau); + } + for (j=ip+1; j<=iq-1; j++) { + g = a[ip*n+j]; h = a[j*n+iq]; + a[ip*n+j] = g - s*(h + g*tau); + a[j*n+iq] = h + s*(g - h*tau); + } + for 
(j=iq+1; j tresh) */ + } /* iq =ip+1..n-1 */ + } /* ip = 0..n-2 */ + + for (ip=0; ip= p) p = d[k = j]; + if (k != i) { + d[k] = d[i]; + d[i] = p; + for (j=0; j (y)) ? (x) : (y)) +#define smaller(x,y) (((x) < (y)) ? (x) : (y)) + +#define getoptnum(par1,par2,val){ \ + char *sp; \ + sp = ss; \ + if (!*ss && (argc-1)) ss = (argv+1)[0]; \ + if (sscanf(ss,par1,par2) != 1) { \ + *par2 = val; ss = sp; \ + } else { \ + if (!*sp && --argc) argv++; \ + ss = strchr(ss,0); \ + } \ +} + +#define getnum(par1,par2){ \ + if (!*ss && --argc) ss = (++argv)[0]; \ + if (sscanf(ss,par1,par2) != 1) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + ss = strchr(ss,0);} + +#define get2num(str,p1,p2){ \ + if (!(*ss) && --argc) ss = (++argv)[0]; \ + if (sscanf(ss,str,p1,p2) != 2) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + ss = strchr(ss,0);} + +#define get3num(str,p1,p2,p3){ \ + if (!(*ss) && --argc) ss = (++argv)[0]; \ + if (sscanf(ss,str,p1,p2,p3) != 3) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + ss = strchr(ss,0);} + +#define get4num(str,p1,p2,p3,p4){ \ + if (!(*ss) && --argc) ss = (++argv)[0]; \ + if (sscanf(ss,str,p1,p2,p3,p4) != 4) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + ss = strchr(ss,0);} + +#define get5num(str,p1,p2,p3,p4,p5){ \ + if (!(*ss) && --argc) ss = (++argv)[0]; \ + if (sscanf(ss,str,p1,p2,p3,p4,p5) != 5) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + ss = strchr(ss,0);} + + +#define getlist(v,i){ \ + if (!(*ss) && --argc) ss = (++argv)[0]; \ + i = 0; \ + while (*ss) { \ + v[i] = ss; \ + i++; \ + if (strchr(ss,';') == NULL) break; \ + ss = strchr(ss,';'); \ + *ss = 0; \ + ss++; \ + } \ + if (i <= 0) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + ss = strchr(ss,0);} + +#define getnumlist(v,i,format){ \ + if (!(*ss) && --argc) ss = (++argv)[0]; \ + i = 0; \ + while (*ss) { \ + if (sscanf(ss,format,&v[i++]) != 1) { \ + fprintf(stderr,usage); exit(-1); \ + } \ + if (strchr(ss,',') == NULL) break; \ + ss = strchr(ss,','); \ + ss++; \ 
+ } \ + if (i <= 0) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + ss = strchr(ss,0);} + + +#define get1or3num(str1,str3,p1,p2,p3,n){ \ + if (!(*ss) && --argc) ss = (++argv)[0]; \ + if (sscanf(ss,str3,&p1,&p2,&p3) != 3) { \ + if (sscanf(ss,str1,&p1) != 1) { \ + fprintf(stderr,usage); \ + exit(-1); \ + } \ + n = 1; \ + p3 = p1; p2 = 1; \ + } else n = 3; \ + ss = strchr(ss,0);} + + + +/********* headers for unformatted io **********/ + +typedef struct { + int headerid,f1,headersize,f2; + int n_double,f3,n_long,f4,n_int,f5,n_char,f6; + int lx,f7,ly,f8,lz,f9,lt,f10; + int d1,f11,d2,f12,d3,f13,d4,f14,d5,f15,d6,f16,d7,f17,d8,f18; +} ll_header; + +typedef struct { + long headerid,headersize; + long n_double,n_long,n_float,n_char; + long lx,ly,lz,lt; + long d1,d2,d3,d4,d5,d6,d7,d8; +} e_header; + +typedef struct { + int headerid,headersize; + int n_double,n_long,n_float,n_char; + int lx,ly,lz,lt; + int d1,d2,d3,d4,d5,d6,d7,d8; +} i_header; + +#define E_HEADER_ID 91919191 + +/* and a couple of protos */ +int readheader(FILE *ff,e_header *h); +int skipheader(FILE *ff); +long readdata(FILE *ff,double *tmparr); + + +/* other prototypes */ + +double calclist(double d[],int dl,char *cmd); +double * dblarr(int size);float * fltarr(int size); +int * intarr(int size); +double confidence(double chisq,int dof); +int gaussj(double* a,int n, int np, double* b,int m); +int svdecomp(double *a, int n, double *b, int m); +void fitfun(double *x,double *y,double *sig,int ndata,double *a,int ma, + double *covar,int *lista,int mfit,double *chisq,int print, + double funcs()); +void covarfit(double *x,double *y,double *cmat,int ndata,double *a,int ma, + double *covar,int *lista,int mfit,double *chisq,int print, + double funcs()); +void jackfit(double *x,double *y,double * sig,int ndata,int n1, int n2, + int jack,double *a,int ma, + double *covar,int *lista,int mfit,double *chisq,int print, + int fullcov,int simplex,double funcs()); +void jack_fit(double *x,double *y,double * sig,int 
ndata,int n1, int n2, + int jack,double *a,int ma, + double *covar,int *lista,int mfit,double *chisq,int print, + int fullcov,int simplex,double funcs(),double *av); +double fitfun_s(double *x,double *y,double *sig,int ndata,double *a,int ma, + double funcs()); +double brent(double ax,double bx,double cx,double f(),double tol,double *xmin); +double polyfit(int ndata,double x[],double y[],double sig[], + int deg,double par[],double ep[]); +double nelder(int ndim,double p[],double ftol,double funk(),int *i); +void simplexfit(double *x,double *y,double *sig,int ndata,double *a,int ma, + int *lista,int mfit,double *chisq,int print,double funcs()); +int jacobi(int n, double *ap,double d[],double *v,int is_ordered); +double simpson(double x[], double y[], double res[], int r); + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/compile.sh.in b/qcd/part_cpu/applications/QCD/src/kernel_B/compile.sh.in new file mode 100644 index 0000000000000000000000000000000000000000..89084ebbd9b55e3c405dd4a3392801b129c4c465 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/compile.sh.in @@ -0,0 +1,34 @@ +#!/bin/bash -l +############################################################################# +### ### +### Compile script for SU3 ### +### ### +### Last modified: 2008-09-08 ### +### ### +############################################################################# + +export MAKE="#MAKE#" +export EXECNAME="#EXECNAME#" +export AAPROG="#AAPROG#" +export RM="#RM#" +export AR="#AR#" +export ARFLAGS="#ARFLAGS#" +export CC="#CC#" +export CFLAGS="#CFLAGS#" +export MPI_CC="#MPI_CC#" +export LDFLAGS="#LDFLAGS#" + +#MODULE_INIT# +#MODULE_CMD# #MODULE_FILES# + +cd libraries/ && $MAKE all \ +&& cd ../su3h_n/ && $MAKE su3_ahiggs \ +&& cp su3_ahiggs $EXECNAME \ +&& cd ../ + +#MODULE_INIT2# +#MODULE_CMD2# #MODULE_FILES2# + +cd aa/ && $MAKE \ +&& cp aa $AAPROG \ +&& cd ../ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_complex.c 
b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_complex.c
new file mode 100644
index 0000000000000000000000000000000000000000..1b74f279e3f609443e2c925f2b62d75681ae4f7a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_complex.c
@@ -0,0 +1,10 @@
+/* block_field_complex -- does blocking on u1 higgs scalar field
+ * Kari Rummukainen 2002
+ */
+
+#include LATDEF
+
+#define FIELD complex
+#define block_FIELD block_field_complex
+
+#include "block_field_generic.c"
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_generic.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_generic.c
new file mode 100644
index 0000000000000000000000000000000000000000..d590598d58eab86da0dec95d2aedab234d38da25
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_generic.c
@@ -0,0 +1,57 @@
+/*********************************************************
+ * Generic field blocking routine
+ * Kari Rummukainen 1998-2002
+ *
+ * Compiled by #including this file with FIELD (element type)
+ * and block_FIELD (function name) defined.
+ */
+
+/* Copy field f from the current blocking level onto the coarser
+ * level newlev.
+ *
+ *   f         field on the current blocking level
+ *   newlev    target level per direction, not below the current one
+ *   free_old  if nonzero, f is released with free_latfield()
+ *
+ * Returns a field obtained from new_latfield() while newlev is
+ * active.  The blocking level is set back to its entry value.
+ */
+FIELD * block_FIELD(FIELD *f, int newlev[NDIM], int free_old)
+{
+  int dir,i,x[NDIM],step[NDIM],oldlev[NDIM];
+  node_struct oldnode;
+  FIELD *th;
+
+  /* first, catch the current node */
+  oldnode = node;
+
+  /* calculate the relative change */
+  foralldir(dir) {
+    oldlev[dir] = current_blocking_level[dir];
+    step[dir] = newlev[dir] - oldlev[dir];
+    /* a negative step would be a negative shift count below */
+    if (step[dir] < 0) halt(" Field blocking error" );
+  }
+
+  /* set new block */
+  set_blocking_level( newlev );
+
+  th = new_latfield( FIELD );
+
+  /* each new site takes the value of the old site whose
+   * coordinates are its own multiplied by 2^step */
+  forallsites(i) {
+    foralldir(dir) x[dir] = (coordinate(i,dir) << step[dir]);
+
+    /* index to the corresponding site */
+    th[i] = f[ node_index( x, &oldnode ) ];
+
+  }
+
+  set_blocking_level( oldlev );
+
+  if (free_old) free_latfield( f );
+
+  return( th );
+}
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_su3adjoint.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_su3adjoint.c
new file mode 100644
index
0000000000000000000000000000000000000000..c0069406652861e957fbdcf932dd647056018deb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_field_su3adjoint.c @@ -0,0 +1,10 @@ +/* block_field_su3adjoint -- does blocking on su3 adjoint scalar field + * Kari Rummukainen 2002 + */ + +#include LATDEF + +#define FIELD adjoint_matrix +#define block_FIELD block_field_su3adjoint + +#include "block_field_generic.c" diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_lattice.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_lattice.c new file mode 100644 index 0000000000000000000000000000000000000000..5ff98245d3277db9eaa74eb7493ced40a0bd1793 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_lattice.c @@ -0,0 +1,199 @@ +/************************************************************************* + * Block lattice operations + */ + +#include "comdefs.h" +#include "generic.h" + +/* set_blocking_level( int blev[ndim] ) sets the global structures + * for reducing the current lattice size by the factors given in blev, + * so that, for example, nx -> nx/2^blev[XUP]. + * Affects ONLY the gather, forallsites, etc. operations, does not + * touch individual variables. + * + * int * make_blocking_map( int blev[ndim] ) makes an index array from the + * current_blocking_level to blocking level blev. Thus, it returns + * an array of size current lattice size. It can go up or down, + * if up it fills the illegal sites with -(1<<30). 
+ */ + +/* static variables for the blocking operations */ + +typedef struct block { + int level[NDIM]; /* blocking levels */ + struct block * next; /* ptr to next block level */ + lattice_struct lattice; /* saved lattice structure */ + node_struct node; /* saved node structure */ + node_struct * allnodes; /* ptr to node array for all nodes */ + site_struct * site; /* ptr to site struct */ + comlist_struct * comlist; /* and comlist for the gather */ + int *neighb[NDIRS]; /* neighbour arrays */ +} block_buf; + +static block_buf * b_buf = NULL; /* pointer to all of the saved blocking vars */ +static block_buf * base; /* will point to the base level */ +static int n_blocks = 0; /* number of blocking levels */ + +extern comlist_struct *comlist; /* ptr to comlist */ +extern node_struct *allnodes; + +void set_blocking_level( int blev[NDIM] ) +{ + int dir,i,found,d; + block_buf *b,*p; + + /* check first if it is the current level, nothing to do */ + found = 1; + foralldir(dir) + if (current_blocking_level[dir] != blev[dir]) found = 0; + if (found) return; + + /* Now something is happening */ + + if (b_buf == NULL) { + /* first time in, save the basic level here */ + base = b_buf = (block_buf *)memalloc(1, sizeof(block_buf) ); + foralldir(dir) base->level[dir] = 0; + for (dir=0; dirneighb[dir] = neighb[dir]; + + base->lattice = lattice; /* copy the lattice struct */ + base->node = node; /* copy the node struct */ + base->allnodes= allnodes; /* copy the address of the node array */ + base->site = site; /* and the address of the site array too */ + base->comlist = comlist; /* and comlist */ + base->next = NULL; + } + + /* ok, now chase the list and check if blev is defined */ + + found = 0; + for (b=b_buf; (!found) && b != NULL ; b=b->next) { + found = 1; + foralldir(dir) found = (found && (b->level[dir] == blev[dir])); + p = b; + } + + if (found) { + /* now the gather was found, copy the arrays */ + + lattice = p->lattice; + node = p->node; + allnodes = p->allnodes; + site 
= p->site; + comlist = p->comlist; + foralldir(dir) current_blocking_level[dir] = p->level[dir]; + for (dir=0; dirneighb[dir]; + + /* printf(" blocking level %d %d %d set up\n", + blev[XUP], blev[YUP], blev[ZUP]); + */ + fflush(stdout); + + } else { + + /* now it was NOT found -- make new level */ + + p->next = b = (block_buf *)memalloc(1, sizeof(block_buf) ); + + b->lattice.volume = 1; + foralldir(dir) { + current_blocking_level[dir] = b->level[dir] = blev[dir]; + + /* check if this is at all legal */ + if (base->lattice.size[dir] % (1 << blev[dir])) { + printf("Blocking error: cannot divide lattice by factors "); + foralldir(d) { + printf("%d",blev[d]); + if (d > 0) printf(" x "); + } + printf("\n"); + halt("Blocking error"); + } + + b->lattice.size[dir] = (base->lattice.size[dir]) / (1 << blev[dir]); + b->lattice.volume *= b->lattice.size[dir]; + + } + + lattice = b->lattice; + + /* make the structures -- THIS INITIALIZES THE ARRAYS */ + + /* printf(" blocking level %d %d %d set up, making arrays\n", + * blev[XUP], blev[YUP], blev[ZUP]); + */ + + make_lattice_arrays( &(b->lattice) ); + + /* and copy now the stuff to buffer */ + for (dir=0; dirneighb[dir] = neighb[dir]; + + b->node = node; /* copy the node struct */ + b->allnodes= allnodes; /* copy the address of the node array */ + b->site = site; /* and the address of the site array too */ + b->comlist = comlist; /* comlist */ + b->next = NULL; + + } +} + + +/****************************************************************/ + +void set_blocking_all( int lev ) +{ + int b[NDIM],dir; + + foralldir(dir) b[dir] = lev; + set_blocking_level( b ); +} + + +/************************************************************** + * Make mapping from current to blev + */ + +int *make_blocking_map( int newlev[NDIM] ) +{ + int dir,i,is_ok,x[NDIM],step[NDIM],oldlev[NDIM]; + node_struct newnode; + int *map; + + /* allocate map */ + map = (int *)memalloc( node.sites, sizeof(int) ); + + /* calculate the relative change */ + 
foralldir(dir) step[dir] = newlev[dir] - + (oldlev[dir] = current_blocking_level[dir]); + + /* set new blocking */ + set_blocking_level( newlev ); + /* catch the current node */ + newnode = node; + /* and just reset the level */ + set_blocking_level( oldlev ); + + /* and loop over */ + + forallsites(i) { + is_ok = 1; + foralldir(dir) { + if (step[dir] == 0) { + /* no change, coordinate as is */ + x[dir] = coordinate(i,dir); + } else if (step[dir] < 0) { + /* down, new is 2^n denser/larger than the old */ + x[dir] = (coordinate(i,dir) << (-step[dir])); + } else { + /* up, new is smaller than old */ + /* if (coordinate(i,dir) % (1<> step[dir]; + } + } + if (is_ok) map[i] = node_index( x, &newnode ); + else map[i] = -(1<<30); + + } + return( map ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_complex.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_complex.c new file mode 100644 index 0000000000000000000000000000000000000000..ee428e704af7b1a0157ce3d9c268ed4e0b9b2278 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_complex.c @@ -0,0 +1,9 @@ +/****** block_link_complex.c -- compute the blocked link ******************/ + +/* MIMD version 3 */ + +#include LATDEF + +#define block_link_MATRIX block_link_complex + +#include "block_link_generic.c" diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_generic.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_generic.c new file mode 100644 index 0000000000000000000000000000000000000000..4f10469f8e69545c99268c2d0396fd2820f54caa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_generic.c @@ -0,0 +1,68 @@ +/********************************************************* + * Generic gauge field blocking routine + * Kari Rummukainen 1998 - 2002 + */ + +void block_link_MATRIX( MATRIX *oldl[NDIM], MATRIX *newl[NDIM], + int newlev[NDIM], int free_old ) +{ + /* Just multiply the straight links 
U(x) U(x+1) -> U(x) + */ + + int i,j,dir; + int x[NDIM],step[NDIM],oldlev[NDIM]; + int *nf[NDIM]; + MATRIX *tmpl[NDIM]; + msg_tag *tag[NDIM]; + node_struct oldnode; + + /* first, start XYZ-direction: move the link to ->staple */ + + foralldir(dir) { + oldlev[dir] = current_blocking_level[dir]; + step[dir] = newlev[dir] - oldlev[dir]; + if (step[dir] == 1) { + tag[dir] = start_get( oldl[dir], dir, EVENODD ); + } else if (0 != step[dir]) halt(" Gauge blocking error" ); + } + + foralldir(dir) { + /* wait the gathers, this clears the buffers */ + if (step[dir]) wait_get(tag[dir]); + /* grab the old neighbour arrays */ + nf[dir] = neighb[dir]; + } + + /* copy the node, needed */ + oldnode = node; + + /* block the system */ + set_blocking_level( newlev ); + + foralldir(dir) tmpl[dir] = new_latfield( MATRIX ); + + /* and loop over */ + forallsites(i) { + foralldir(dir) x[dir] = (coordinate(i,dir)) << step[dir]; + /* index to the corresponding site */ + j = node_index( x, &oldnode ); + + /* and mult */ + foralldir(dir) { + if (step[dir]) { + mult_MATRIX_nn( oldl[dir][j], oldl[dir][nf[dir][j]], tmpl[dir][i] ); + } else { + tmpl[dir][i] = oldl[dir][j]; + } + } + } + + /* restore old level */ + set_blocking_level( oldlev ); + + if (free_old) foralldir(dir) free_latfield( oldl[dir] ); + /* set the pointer last - this makes it possible to use same + link in and out, if the old is freed first */ + foralldir(dir) newl[dir] = tmpl[dir]; + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su2.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su2.c new file mode 100644 index 0000000000000000000000000000000000000000..d44235e56760ea8f1930cba7a62b314eba414f19 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su2.c @@ -0,0 +1,9 @@ +/****** block_link_su2.c -- compute the blocked link ******************/ + +/* MIMD version 3 */ + +#include LATDEF +#include "generic_su2.h" + +#define block_link_MATRIX 
block_link_su2 +#include "block_link_generic.c" diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su3.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su3.c new file mode 100644 index 0000000000000000000000000000000000000000..da30c2dcd34cbd04427f4b0ae17a6b89058783ea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/block_link_su3.c @@ -0,0 +1,8 @@ +/****** block_link_su3.c -- compute the blocked link ******************/ + +/* MIMD version 3 */ + +#include LATDEF + +#define block_link_MATRIX block_link_su3 +#include "block_link_generic.c" diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/bulk_update_mpi.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/bulk_update_mpi.c new file mode 100644 index 0000000000000000000000000000000000000000..6dbf17272d5ec4d72b5ea599ffa21c1ccbdbd1d3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/bulk_update_mpi.c @@ -0,0 +1,21 @@ +/****************************************************************** + * here some MPI typical "bulk update" subroutines + */ + +#include "comdefs.h" +#include "generic.h" + +/* Is the link inside node? Thus, OK if the link is not _along_ any + * of the bottom slabs of the node. For example, reject + * x-links where y,z,.. 
coordinate == min on the node + */ + +int inside_node(int i, int dir) +{ + register int d,s; + + s = 1; + foralldir( d ) if ( d != dir && coordinate(i,d) == node.xmin[d] ) s = 0; + return( s ); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi.c new file mode 100644 index 0000000000000000000000000000000000000000..166748a2e2b5c6ebad48edcb49d3c2b5d7916d68 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi.c @@ -0,0 +1,937 @@ +/****************** com_mpi.c ***************************************** + * + * Communications routines, for MPI interface + * Modified from the MILC lattice QCD one + * KR 2001 + * + + + + g_sync() provides a synchronization point for all nodes. + g_floatsum() sums a floating point number over all nodes. + g_doublesum() sums a double over all nodes. + g_vecdoublesum() sums a vector of doubles over all nodes. + g_floatmax() finds maximum of a floating point number over all nodes. + g_doublemax() finds maximum of a double over all nodes. + broadcast_float() broadcasts a single precision number from + node 0 to all nodes. + broadcast_double() broadcasts a double precision number + send_integer() sends an integer to one other node + receive_integer() receives an integer + terminate() kills the job on all processors + + start_gather() starts asynchronous sends and receives required + to gather neighbors. + wait_gather() waits for receives to finish, insuring that the + data has actually arrived. + + start_scatter() invese of gather + wait_scatter() + + send_field() sends a field to one other node. + receive_field() receives a field from some other node. +*/ + +/* load in definitions, variables etc. 
*/
+
+#include "comdefs.h"
+#include "generic.h"
+#include "timers.h" /* includes comm timer calculators */
+
+#ifdef TIMERS
+static timer_type total_gather_timer, start_gather_timer, wait_send_timer,
+ wait_receive_timer, g_sync_timer,
+ g_sum_timer, broadcast_timer, send_timer, total_time;
+#endif
+static double total_sent_data = 0.0, total_gather_data = 0.0; /* running byte totals; printed as MB by report_comm_timers() when TIMERS is defined */
+
+
+#define MIN_GATHER_INDEX 100 /* lower end of the rolling message index used by start_gather()/start_scatter() */
+#define MAX_GATHER_INDEX 7100 /* allows 7000 concurrent gathers */
+#define FIELD_TYPE 11 /* used in send/receive field */
+#define SEND_INTEGER_TYPE 12 /* used in send/receive int */
+
+#define SEND_FLAG 1 /* flags for marking msg_tags */
+#define RECEIVE_FLAG 2
+
+extern comlist_struct *comlist; /* the comlist variables in layout.c */
+
+/************************************************************************/
+
+/* get all msg_tags in a single array, avoid allocating
+ * small bits and pieces of messages
+ */
+
+#define N_MSG_TAG 50
+
+static msg_tag msg_tag_arr[N_MSG_TAG];
+static msg_tag *msg_tag_free;
+
+void init_msg_tags()
+{
+ int i;
+
+ for (i=0; inext; /* NOTE(review): the text between a '<' and a later '>' is missing from this copy -- the rest of init_msg_tags() and the opening of the next function are lost; restore from the original com_mpi.c */
+ }
+
+ /* p points to the last in the list: the free-list head moves past it,
+ * the taken chain is NULL-terminated, and r is handed to the caller */
+ msg_tag_free = p->next;
+ p->next = NULL;
+ return(r);
+}
+
+/* release_msg_tags: give the chain starting at tp back to the free list.
+ * Walks to the last element of tp, links it to the current free-list head
+ * and makes tp the new head.  tp must not be NULL (tp->next is read at once).
+ */
+void release_msg_tags(msg_tag *tp)
+{
+ msg_tag *p;
+
+ for (p=tp; p->next != NULL; p=p->next) ; /* empty body: just find the tail */
+ p->next = msg_tag_free;
+ msg_tag_free = tp;
+}
+
+/************************************************************************/
+
+
+/* Machine initialization: with TIMERS defined, resets the communication
+ * timers and starts total_time; in all builds sets up the msg_tag free list
+ * via init_msg_tags().  The MPI_Init call is commented out in this routine. */
+#include /* NOTE(review): header name missing in this copy (presumably <mpi.h> -- confirm) */
+void initialize_machine() {
+/* MPI_Init(&argc,&argv); */
+
+#ifdef TIMERS
+ timer_reset( &total_time );
+ timer_start( &total_time );
+
+ timer_reset( &total_gather_timer );
+ timer_reset( &start_gather_timer );
+ timer_reset( &wait_send_timer );
+ timer_reset( &wait_receive_timer );
+ timer_reset( &g_sync_timer );
+ timer_reset( &g_sum_timer );
+ timer_reset( &broadcast_timer );
+ timer_reset( &send_timer );
+#endif
+
+ init_msg_tags();
+
+}
+
+
+/************************************************************************/
+
+/* this
formats the wait_array, used by forallsites_waitA() + * should be made as fast as possible! + * + * wait_array[i] contains a bit at position 1<= node.sites ) wait_arr_[i] = wait_arr_[i] | (1<= node.sites ) wait_arr_[i] = wait_arr_[i] | (1< 4 + halt("forallsites_waitA requires NDIM <= 4!\n"); +#endif + + if (ntag > NA_MAX) halt("Error in forallsites_waitA: too many gathers"); + mask = 0; + for (i=0; idir)); + } + return( mask ); +} + + +#ifdef OLD_WAIT_ARR +int setup_wait_arr( unsigned char *wait_arr, msg_tag* tag_out[], + msg_tag* tag_in[], int ntag ) +{ + register int i,j; + int nt,dir[NA_MAX]; + + if (ntag > NA_MAX) halt("Error in forallsites_waitA: too many gathers"); + for (nt=i=0; idir; + nt++; + } + tag_out[nt] = (msg_tag *)NULL; /* needed for the last loop */ + + forallsites(i) { + wait_arr[i] = 0; /* basic, no wait */ + for (j=0; j= node.sites ) wait_arr[i] = j+1; + } + return( nt ); +} +#endif + + + + +/************************************************************************/ +/* GATHER ROUTINES */ +/* start_gather() returns a pointer to a list of msg_tag's, which + be used as input to subsequent wait_gather(). + + This list contains msg_tags for all receive buffers, followed by + end flag. + + If no messages at all are required, the routine will return NULL. + msg_buf==NULL should be a reliable indicator of no message. + + usage: tag = start_gather( source, size, direction, parity ) +*/ + + + +msg_tag* start_gather( field, size, dir, parity ) + /* arguments */ + char * field; /* pointer to some latfield */ + int size; /* size in bytes of the field (eg sizeof(su3_vector))*/ + int dir; /* direction to gather from. eg XUP - index into + neighbor tables */ + int parity; /* parity of sites whose neighbors we gather. + one of EVEN, ODD or EVENODD (EVEN+ODD). 
*/ +{ + /* local variables */ + int i,j,k; /* scratch */ + int offset; /* number of sites in this receive or send */ + int *idx; /* index array pointer */ + int nsites; + char *tpt; /* temp ptr to buffer */ + msg_tag *mbuf, *mp; /* list of message tags, to be returned */ + comlist_struct *cp; + send_struct *sp; + receive_struct *rp; + static int index = MIN_GATHER_INDEX; /* index to identify the operation */ + + + if (dir < 0 || dir >= NDIRS) { + printf("No such gather %d, node %d\n",dir,mynode()); + terminate(1212); + } + + /* First, get the rolling index for the operation + * This MUST BE HERE even if there's nothing to do, because + * somebody else might be doing this! + */ + ++index ; if (index > MAX_GATHER_INDEX) index = MIN_GATHER_INDEX; + + cp = &comlist[dir]; + + /* Now if there's nothing to do, return - CHECK IF GATHERED */ + if( ( cp->n_send == 0 && cp->n_receive == 0 ) + || is_already_gathered( field, size, dir, parity) ) + return( (msg_tag *) NULL ); + + /* mark gathered, if needed */ + gather_mark_gathered( field, size, dir, parity ); + + /* allocate a buffer for the msg_tags. This is dynamically allocated + because there may be an arbitrary number of gathers in progress + in any direction. SIZE = n_msgs sent+received, for end flag + */ + + mbuf = get_msg_tags(cp->n_send + cp->n_receive); + +#ifdef TIMERS + /* mark start time for this gather */ + mbuf->start_time = timer_start( &start_gather_timer ); +#endif + + mp=mbuf; + /* HANDLE RECEIVES: loop over nodes which will send here */ + for (i=0, rp=cp->from_node; in_receive;i++, rp=rp->next, mp=mp->next) { + /* note--neighbors of EVEN sites are always first in the list! 
+ * Thus, for ODD sites we must change the offset + */ + switch (parity) { + case EVEN: nsites = rp->n_even; offset = rp->offset; break; + case ODD: nsites = rp->n_odd; offset = rp->offset + rp->n_even; break; + case EVENODD: nsites = rp->n; offset = rp->offset; break; + } + + mp->flag = RECEIVE_FLAG; /* flag as normal receive */ + mp->dir = dir; + /* and post receive -- comes right on spot */ + MPI_Irecv( ((char *)field) + offset*size, nsites*size, MPI_BYTE, + rp->node, index, MPI_COMM_WORLD, &(mp->mpi) ); + + total_gather_data += nsites*size; + + } + + /* HANDLE SENDS - note: mp automatically correct */ + for(k=0,sp=cp->to_node; k < cp->n_send; k++,sp = sp->next, mp = mp->next) { + switch (parity) { + case EVEN: nsites = sp->n_even; offset = 0; break; + case ODD: nsites = sp->n_odd; offset = sp->n_even; break; + case EVENODD: nsites = sp->n; offset = 0; break; + } + + /* allocate buffer */ + tpt = (char *)malloc( nsites*size ); + if(tpt==NULL){printf("NO ROOM for tpt, node %d\n",mynode());exit(1);} + mp->flag = SEND_FLAG; /* flag as send */ + mp->dir = dir; + mp->buf = tpt; + /* gather data into the buffer */ + + idx = sp->sitelist + offset; /* initial offset */ + for (j=0; jbuf, nsites*size, MPI_BYTE, + sp->node, index, MPI_COMM_WORLD, &(mp->mpi) ); + + total_gather_data += nsites*size; + + } + + timer_end( &start_gather_timer ); + + /* return */ + return(mbuf); +} + + +msg_tag * wait_gather( msg_tag *mbuf ) +{ + MPI_Status status; + msg_tag *mp; + + if (mbuf == NULL) return((msg_tag *)NULL); + timer_start( &wait_receive_timer ); + /* wait for all receive messages */ + for(mp=mbuf; mp != NULL && mp->flag == RECEIVE_FLAG; mp=mp->next) { + MPI_Wait( &(mp->mpi), &status ); + } + timer_end( &wait_receive_timer ); + /* wait for all send messages */ + timer_start( &wait_send_timer ); + for( ; mp != NULL && mp->flag == SEND_FLAG; mp=mp->next) { + MPI_Wait( &(mp->mpi), &status ); + /* release the buffer */ + free( mp->buf ); + } +#ifdef TIMERS + total_gather_timer.total 
+= + timer_end( &wait_send_timer ) - mbuf->start_time; + total_gather_timer.count ++; +#endif + + /* and free the mbuf */ + release_msg_tags( mbuf ); + return((msg_tag *)NULL); +} + + +void wait_gather_arr( msg_tag* tag[], int ntag ) +{ + int i; + for (i=0; i field[i_otherparity], on + * neighb. nodes. + * THIS MODIFIES THE LATTICE FIELD field ON OTHERPARITY. + * Thus, there is little sense using this on EVENODD (but it is possible) + */ + +msg_tag * start_scatter( field, size, dir, parity ) + char * field; /* pointer to some latfield */ + int size; /* size in bytes of the field (eg sizeof(su3_vector))*/ + int dir; /* direction to push the data, eg XUP - index into + neighbor tables */ + int parity; /* parity of sites from where we push + one of EVEN, ODD or EVENODD (EVEN+ODD). */ +{ + /* local variables */ + int i,k; /* scratch */ + int offset; /* number of sites in this receive or send */ + int nsites; + char *tpt; /* temp ptr to buffer */ + msg_tag *mbuf, *mp; /* list of message tags, to be returned */ + comlist_struct *cp; + send_struct *sp; + receive_struct *rp; + static int index = MIN_GATHER_INDEX; /* index to identify the operation */ + + + if (dir < 0 || dir >= NDIRS) { + printf("No such gather %d, node %d\n",dir,mynode()); + terminate(1212); + } + + /* First, get the rolling index for the operation + * This MUST BE HERE even if there's nothing to do, because + * somebody else might be doing this! 
+ */ + ++index ; if (index > MAX_GATHER_INDEX) index = MIN_GATHER_INDEX; + + cp = &comlist[dir]; + + /* Now if there's nothing to do, return */ + if( cp->n_send == 0 && cp->n_receive == 0 ) + return( (msg_tag *) NULL ); + + /* allocate a buffer for the msg_tags */ + + mbuf = get_msg_tags(cp->n_send + cp->n_receive); + +#ifdef TIMERS + /* mark start time for this gather */ + mbuf->start_time = timer_start( &start_gather_timer ); +#endif + + mp=mbuf; + /* HANDLE RECEIVES: loop over nodes which will send here + * note difference to start_gather; now using to_node, n_send! + */ + for(k=0,sp=cp->to_node; k < cp->n_send; k++,sp = sp->next, mp=mp->next) { + switch (parity) { + case EVEN: nsites = sp->n_even; offset = 0; break; + case ODD: nsites = sp->n_odd; offset = sp->n_even; break; + case EVENODD: nsites = sp->n; offset = 0; break; + } + + /* allocate buffer */ + tpt = (char *)malloc( nsites*size ); + if(tpt==NULL){printf("NO ROOM for tpt, node %d\n",mynode());exit(1);} + mp->flag = RECEIVE_FLAG; /* flag as receive */ + mp->dir = opp_dir(dir); /* using opp_dir here, works with _wait etc. */ + mp->buf = tpt; + + mp->nsites = nsites; + mp->size = size; + mp->field = field; + mp->sitelist = sp->sitelist + offset; /* list of sites to scatter */ + + /* post receive */ + MPI_Irecv( mp->buf, nsites*size, MPI_BYTE, + sp->node, index, MPI_COMM_WORLD, &(mp->mpi) ); + + total_gather_data += nsites*size; + } + + /* and HANDLE SENDS - note: mp automatically correct */ + for (i=0, rp=cp->from_node; in_receive;i++, rp=rp->next, mp=mp->next) { + /* note--EVEN sites are always first in the list! 
+ * Thus, for ODD sites we must change the offset + */ + switch (parity) { + case EVEN: nsites = rp->n_even; offset = rp->offset; break; + case ODD: nsites = rp->n_odd; offset = rp->offset + rp->n_even; break; + case EVENODD: nsites = rp->n; offset = rp->offset; break; + } + + mp->flag = SEND_FLAG; /* flag as normal receive */ + mp->dir = opp_dir(dir); + /* and post send -- comes right from spot */ + MPI_Issend( ((char *)field) + offset*size, nsites*size, MPI_BYTE, + rp->node, index, MPI_COMM_WORLD, &(mp->mpi) ); + + total_gather_data += nsites*size; + } + timer_end( &start_gather_timer ); + + /* return */ + return(mbuf); +} + + +msg_tag * wait_scatter( msg_tag *mbuf ) +{ + MPI_Status status; + msg_tag *mp; + int *idx,j; + char *tpt; + + if (mbuf == NULL) return((msg_tag *)NULL); + timer_start( &wait_receive_timer ); + /* wait for all receive messages */ + for(mp=mbuf; mp != NULL && mp->flag == RECEIVE_FLAG; mp=mp->next) { + MPI_Wait( &(mp->mpi), &status ); + + /* now copy data to right spot */ + idx = mp->sitelist; /* index list */ + tpt = mp->buf; + for (j=0; jnsites; j++, tpt += mp->size) { + memcpy( mp->field + idx[j]*mp->size, tpt, mp->size ); + } + /* and free the field */ + free( mp->buf ); + } + timer_end( &wait_receive_timer ); + /* wait for all send messages */ + timer_start( &wait_send_timer ); + for( ; mp != NULL && mp->flag == SEND_FLAG; mp=mp->next) { + MPI_Wait( &(mp->mpi), &status ); + } +#ifdef TIMERS + total_gather_timer.total += + timer_end( &wait_send_timer ) - mbuf->start_time; + total_gather_timer.count ++; +#endif + /* and free the mbuf */ + release_msg_tags( mbuf ); + return((msg_tag *)NULL); +} + + +/**************************************************************** + */ + + +/* SEND AND RECEIVE FIELD */ +/* send_field is to be called only by the node doing the sending */ +/* get_field is to be called only by the node to which the field was sent */ +void send_field(buf,size,tonode) + void *buf; int size,tonode; +{ + timer_start( &send_timer 
); + MPI_Send(buf,size,MPI_BYTE,tonode,FIELD_TYPE,MPI_COMM_WORLD); + timer_end( &send_timer ); + total_sent_data += size; +} +void receive_field(buf,size) + void *buf; int size; +{ + MPI_Status status; + + timer_start( &send_timer ); + MPI_Recv(buf,size,MPI_BYTE,MPI_ANY_SOURCE,FIELD_TYPE, + MPI_COMM_WORLD,&status); + timer_end( &send_timer ); + total_sent_data += size; +} + +/* BASIC COMMUNICATIONS FUNCTIONS */ + +/* Tell what kind of machine we are on */ +static char name[]="MPI (portable)"; +char * machine_type(){ + return(name); +} + +/* Return my node number */ +int mynode() +{ + int node; + MPI_Comm_rank( MPI_COMM_WORLD, &node ); + return(node); +} + +/* Return number of nodes */ +int numnodes() +{ + int nodes; + MPI_Comm_size( MPI_COMM_WORLD, &nodes ); + return(nodes); +} + +/* Synchronize all nodes */ +void g_sync() +{ + timer_start( &g_sync_timer ); + MPI_Barrier( MPI_COMM_WORLD ); + timer_end( &g_sync_timer ); +} + +/* Sum float over all nodes. dist=1: distribute to all nodes */ +void g_floatsum( float * fpt, int dist ) +{ + float work; + + timer_start( &g_sum_timer ); + if (dist) { + MPI_Allreduce( fpt, &work, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD ); + *fpt = work; + } else { + MPI_Reduce ( fpt, &work, 1, MPI_FLOAT, MPI_SUM, 0 , MPI_COMM_WORLD ); + if (this_node == 0) *fpt = work; + } + timer_end( &g_sum_timer ); +} + +/* Sum double over all nodes, and scatter the result */ +void g_doublesum( double * dpt, int dist ) +{ + double work; + + timer_start( &g_sum_timer ); + if (dist) { + MPI_Allreduce( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); + *dpt = work; + } else { + MPI_Reduce ( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, 0 , MPI_COMM_WORLD ); + if (this_node == 0) *dpt = work; + } + timer_end( &g_sum_timer ); +} + +#define N_ELEM 100 + +/* Sum a vector of ints over all nodes */ +void g_vecintsum( int *dpt, int n, int dist) +{ + int *work, arr[N_ELEM]; + register int i; + + timer_start( &g_sum_timer ); + if (n <= N_ELEM) work = arr; else work = (int 
*)malloc(n*sizeof(int)); + if (dist) { + MPI_Allreduce( dpt, work, n, MPI_INT, MPI_SUM, MPI_COMM_WORLD ); + for (i=0; i N_ELEM) free(work); + timer_end( &g_sum_timer ); +} + + +/* Sum a vector of floats over all nodes */ +void g_vecfloatsum( float *dpt, int nfloats, int dist ) +{ + float *work, arr[N_ELEM]; + register int i; + + timer_start( &g_sum_timer ); + + if (nfloats <= N_ELEM) work = arr; + else work = (float *)malloc(nfloats*sizeof(float)); + + if (dist) { + MPI_Allreduce( dpt, work, nfloats, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD ); + for (i=0; i N_ELEM) free(work); + timer_end( &g_sum_timer ); + + /** + work = (float *)malloc(nfloats*sizeof(float)); + MPI_Allreduce( dpt, work, nfloats, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD ); + for(i=0;i N_ELEM) free(work); + timer_end( &g_sum_timer ); + + /** + work = (double *)malloc(ndoubles*sizeof(double)); + MPI_Allreduce( dpt, work, ndoubles, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); + for(i=0;istatus == GATHER_FOLLOW && (a->gathered[dir] ^ parity) == 0) { +#ifdef CHECK_GATHER_FIELDS + /* now looks like is gathered, check if the checkup field has changed */ + if (((parity & EVEN) && (*((unsigned int *)field) != a->check_even[dir] )) || + ((parity & ODD ) && (*((unsigned int *)(field+size*(node.sites-1))) + != a->check_odd[dir] ))) { + printf(" #### GATHER CHECK: Forgotten mark_changed() somewhere!\n"); + if (size < sizeof(int)) + printf(" Because field size %d < sizeof(int), can be spurious\n",size); + else { + printf(" Field size %d chars, dir %d parity %d\n",size, dir, parity); + } + halt(" ###### "); + } +#endif + n_gather_avoided++; + return(1); + } + n_gather_done++; + return(0); +} + +void gather_status_reset( char *field, int size ) +{ + struct gather_status_arr *a; + /* cast the latfield array */ + a = (struct gather_status_arr *) (field + size*node.latfield_size); + a->status = GATHER_NOT_FOLLOW; +} + +void gather_mark_dirty( char *field, int size, int parity ) +{ + int dir,p; + struct gather_status_arr *a; + + /* 
cast the latfield array */ + a = (struct gather_status_arr *) (field + size*node.latfield_size); + a->status = GATHER_FOLLOW; + p = opp_parity(parity); + /* mark opposite parity, because will fetch from there! */ + /* Remember that need to mark opposite directions too! */ + for(dir=0; dirgathered[dir] &= (!p); +} + +void gather_mark_gathered( char *field, int size, int dir, int parity ) +{ + struct gather_status_arr *a; + + /* cast the latfield array */ + a = (struct gather_status_arr *) (field + size*node.latfield_size); + if (a->status == GATHER_FOLLOW) a->gathered[dir] |= parity; +#ifdef CHECK_GATHER_FIELDS + if (parity & EVEN) a->check_even[dir] = *((unsigned int *)field); + if (parity & ODD ) a->check_odd[dir] = + *((unsigned int *)(field+size*(node.sites-1))); +#endif +} + + +/****************************************************************/ + + + +/* version of exit for multinode processes -- kill all nodes */ +void terminate(int status) +{ + printf("Termination: node %d, status = %d\n",this_node,status); + fflush(stdout); + MPI_Abort( MPI_COMM_WORLD, 0); + exit(status); +} + + + + +/* clean exit from all nodes */ +void finishrun() +{ +#ifdef TIMERS + report_comm_timers(); +#endif + + if (this_node == 0) { + extern int n_gather_done,n_gather_avoided; + + printf(" COMMS from node 0: %d done, %d (%.2g%%) optimized away\n", + n_gather_done, n_gather_avoided, + 100.0*n_gather_avoided/(n_gather_avoided+n_gather_done)); + } + + fflush(stdout); + fflush(NULL); /* for all open files */ + MPI_Finalize(); + exit(0); +} + + +void report_comm_timers() +{ +#ifdef TIMERS + double tot; + if (this_node == 0) { + printf(" *************************\n"); + printf(" MPI communications timers from node 0:\n"); + + printf(" start_get: "); + timer_report( &start_gather_timer ); + printf(" waiting send: "); + timer_report( &wait_send_timer ); + printf(" receive: "); + timer_report( &wait_receive_timer ); + printf(" g_sync: "); + timer_report( &g_sync_timer ); + printf(" g_sum: "); 
+ timer_report( &g_sum_timer ); + printf(" broadcast: "); + timer_report( &broadcast_timer ); + printf(" send/receive: "); + timer_report( &send_timer ); + printf(" Total time from gather start -> end (does not count against comm.time)\n"); + printf(" "); + timer_report( &total_gather_timer ); + + printf(" Moved data:\n"); + printf(" * send/receive %g MB, bandwith %g MB/sec\n", + total_sent_data*1e-6, total_sent_data*1e-6/send_timer.total); + + printf(" * total pushed/pulled data %g MB\n",total_gather_data*1e-6); + printf(" * with optimistic bandwidth %g MB/sec, pessimistic %g MB/s\n", + total_gather_data*1e-6/ + (start_gather_timer.total + wait_send_timer.total + + wait_receive_timer.total), + total_gather_data*1e-6/total_gather_timer.total); + + tot = start_gather_timer.total + wait_send_timer.total + wait_receive_timer.total + + g_sync_timer.total + + g_sum_timer.total + broadcast_timer.total + send_timer.total ; + + /* find current time */ + timer_end( &total_time ); + + printf(" Total comm. time %.3g, total time %.3g, comm %.2g%%\n", + tot, total_time.total, 100*tot/total_time.total ); + printf(" ***** \n"); + } +#endif +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi_2.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi_2.c new file mode 100644 index 0000000000000000000000000000000000000000..343d49c9570e2a189cd8be8918a3b59a512c04fa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_mpi_2.c @@ -0,0 +1,728 @@ +/****************** com_mpi.c ***************************************** + * + * Communications routines, for MPI interface + * Modified from the MILC lattice QCD one + * KR 2001 + * + + + + g_sync() provides a synchronization point for all nodes. + g_floatsum() sums a floating point number over all nodes. + g_doublesum() sums a double over all nodes. + g_vecdoublesum() sums a vector of doubles over all nodes. + g_floatmax() finds maximum of a floating point number over all nodes. 
+ g_doublemax() finds maximum of a double over all nodes.
+ broadcast_float() broadcasts a single precision number from
+ node 0 to all nodes.
+ broadcast_double() broadcasts a double precision number
+ send_integer() sends an integer to one other node
+ receive_integer() receives an integer
+ terminate() kills the job on all processors
+
+ start_gather() starts asynchronous sends and receives required
+ to gather neighbors.
+ wait_gather() waits for receives to finish, insuring that the
+ data has actually arrived.
+
+ send_field() sends a field to one other node.
+ receive_field() receives a field from some other node.
+*/
+
+/* load in definitions, variables etc. */
+
+#include "comdefs.h"
+#include "generic.h"
+#include "timers.h" /* includes comm timer calculators */
+
+#ifdef TIMERS
+static timer_type start_gather_timer, wait_send_timer, wait_receive_timer, g_sync_timer,
+ g_sum_timer, broadcast_timer, send_timer, total_time;
+#endif
+static double mpi_sent_data = 0.0; /* running byte total, incremented by send_field()/receive_field() */
+
+
+#define MIN_GATHER_INDEX 100 /* lower end of the rolling message index used by start_gather() */
+#define MAX_GATHER_INDEX 7100 /* allows 7000 concurrent gathers */
+#define FIELD_TYPE 11 /* used in send/receive field */
+#define SEND_INTEGER_TYPE 12 /* used in send/receive int */
+
+#define SEND_FLAG 1 /* flags for marking msg_tags */
+#define RECEIVE_FLAG 2
+
+extern comlist_struct *comlist; /* the comlist variables in layout.c */
+
+/************************************************************************/
+
+/* get all msg_tags in a single array, avoid allocating
+ * small bits and pieces of messages
+ */
+
+#define N_MSG_TAG 40
+
+static msg_tag msg_tag_arr[N_MSG_TAG];
+static msg_tag *msg_tag_free, *msg_tag_last; /* NOTE(review): no use of msg_tag_last survives in the visible text -- confirm whether it is needed */
+
+void init_msg_tags()
+{
+ int i;
+
+ for (i=0; inext; /* NOTE(review): the text between a '<' and a later '>' is missing from this copy -- the rest of init_msg_tags() and the opening of the next function are lost; restore from the original com_mpi_2.c */
+ }
+
+ /* p points to the last in the list: the free-list head moves past it,
+ * the taken chain is NULL-terminated, and r is handed to the caller */
+ msg_tag_free = p->next;
+ p->next = NULL;
+ return(r);
+}
+
+/* release_msg_tags: give the chain starting at tp back to the free list.
+ * Walks to the last element of tp, links it to the current free-list head
+ * and makes tp the new head.  tp must not be NULL (tp->next is read at once).
+ */
+void release_msg_tags(msg_tag *tp)
+{
+ msg_tag *p;
+
+ for (p=tp; p->next != NULL; p=p->next) ; /* empty body: just find the tail */
+ p->next = msg_tag_free;
+ msg_tag_free = tp;
+}
+
+/************************************************************************/ + + +/* Machine initialization */ +#include +void initialize_machine() { +/* MPI_Init(&argc,&argv); */ + +#ifdef TIMERS + timer_reset( &total_time ); + timer_start( &total_time ); + + timer_reset( &start_gather_timer ); + timer_reset( &wait_send_timer ); + timer_reset( &wait_receive_timer ); + timer_reset( &g_sync_timer ); + timer_reset( &g_sum_timer ); + timer_reset( &broadcast_timer ); + timer_reset( &send_timer ); +#endif + + init_msg_tags(); + +} + + +/************************************************************************/ + +/* this formats the wait_array, used by forallsites_waitA() + * should be made as fast as possible! + */ + +int setup_wait_arr( unsigned char *wait_arr, msg_tag* tag_out[], + msg_tag* tag_in[], int ntag ) +{ + register int i,j; + int nt,dir[NA_MAX]; + + if (ntag > NA_MAX) halt("Error in forallsites_waitA: too many gathers"); + for (nt=i=0; idir; + nt++; + } + tag_out[nt] = (msg_tag *)NULL; /* needed for the last loop */ + + forallsites(i) { + wait_arr[i] = 0; /* basic, no wait */ + for (j=0; j= node.sites ) wait_arr[i] = j+1; + } + return( nt ); +} + + +/************************************************************************/ +/* GATHER ROUTINES */ +/* start_gather() returns a pointer to a list of msg_tag's, which + be used as input to subsequent wait_gather(). + + This list contains msg_tags for all receive buffers, followed by + end flag. + + If no messages at all are required, the routine will return NULL. + msg_buf==NULL should be a reliable indicator of no message. + + usage: tag = start_gather( source, size, direction, parity ) +*/ + + + +msg_tag* start_gather( field, size, dir, parity ) + /* arguments */ + char * field; /* pointer to some latfield */ + int size; /* size in bytes of the field (eg sizeof(su3_vector))*/ + int dir; /* direction to gather from. eg XUP - index into + neighbor tables */ + int parity; /* parity of sites whose neighbors we gather. 
+ one of EVEN, ODD or EVENODD (EVEN+ODD). */ +{ + /* local variables */ + int i,j,k,nodepar; /* scratch */ + int offset; /* number of sites in this receive or send */ + int *idx; /* index array pointer */ + int nsites; + char *tpt; /* temp ptr to buffer */ + msg_tag *mbuf, *mp; /* list of message tags, to be returned */ + comlist_struct *cp; + send_struct *sp; + receive_struct *rp; + static int index = MIN_GATHER_INDEX; /* index to identify the operation */ + + + if (dir < 0 || dir >= NDIRS) { + printf("No such gather %d, node %d\n",dir,mynode()); + terminate(1212); + } + + /* First, get the rolling index for the operation + * This MUST BE HERE even if there's nothing to do, because + * somebody else might be doing this! + */ + ++index ; if (index > MAX_GATHER_INDEX) index = MIN_GATHER_INDEX; + + cp = &comlist[dir]; + + /* Now if there's nothing to do, return - CHECK IF GATHERED */ + if( ( cp->n_send == 0 && cp->n_receive == 0 ) + || is_already_gathered( field, size, dir, parity) ) + return( (msg_tag *) NULL ); + + /* mark gathered, if needed */ + gather_mark_gathered( field, size, dir, parity ); + + timer_start( &start_gather_timer ); + + /* allocate a buffer for the msg_tags. This is dynamically allocated + because there may be an arbitrary number of gathers in progress + in any direction. SIZE = n_msgs sent+received, for end flag */ + + /* mbuf = (msg_tag *)malloc((cp->n_send + cp->n_receive + 1)*sizeof(msg_tag) ); + if(mbuf==NULL){ + printf("No room for mbuf, node %d\n",mynode()); + terminate(1212); + } + */ + + mbuf = get_msg_tags(cp->n_send + cp->n_receive); + + /* Loop over node parity */ + forbothparities(nodepar) { + + mp=mbuf; + /* HANDLE RECEIVES: loop over nodes which will send here */ + for (i=0, rp=cp->from_node; in_receive;i++, rp=rp->next, mp=mp->next) { + + if (nodepar == node.parity) { + MPI_Status status; + + /* note--neighbors of EVEN sites are always first in the list! 
+ * Thus, for ODD sites we must change the offset + */ + switch (parity) { + case EVEN: nsites = rp->n_even; offset = rp->offset; break; + case ODD: nsites = rp->n_odd; offset = rp->offset + rp->n_even; break; + case EVENODD: nsites = rp->n; offset = rp->offset; break; + } + + mp->flag = RECEIVE_FLAG; /* flag as normal receive */ + mp->dir = dir; + /* and post receive -- comes right on spot */ + MPI_Recv( ((char *)field) + offset*size, nsites*size, MPI_BYTE, + rp->node, index, MPI_COMM_WORLD, &status ); + } + } + + /* HANDLE SENDS - note: mp automatically correct */ + for(k=0,sp=cp->to_node; k < cp->n_send; k++,sp = sp->next, mp = mp->next) { + if (nodepar != node.parity) { + switch (parity) { + case EVEN: nsites = sp->n_even; offset = 0; break; + case ODD: nsites = sp->n_odd; offset = sp->n_even; break; + case EVENODD: nsites = sp->n; offset = 0; break; + } + + /* allocate buffer */ + tpt = (char *)malloc( nsites*size ); + if(tpt==NULL){printf("NO ROOM for tpt, node %d\n",mynode());exit(1);} + mp->flag = SEND_FLAG; /* flag as send */ + mp->dir = dir; + mp->buf = tpt; + /* gather data into the buffer */ + + idx = sp->sitelist + offset; /* initial offset */ + for (j=0; jbuf, nsites*size, MPI_BYTE, + sp->node, index, MPI_COMM_WORLD ); + + free( mp->buf ); + } + } + + } + + timer_end( &start_gather_timer ); + + release_msg_tags( mbuf ); + + /* return */ + return((msg_tag*)NULL); +} + + +msg_tag * wait_gather( msg_tag *mbuf ) +{ + MPI_Status status; + msg_tag *mp; + + if (mbuf == NULL) return((msg_tag *)NULL); + timer_start( &wait_receive_timer ); + /* wait for all receive messages */ + for(mp=mbuf; mp != NULL && mp->flag == RECEIVE_FLAG; mp=mp->next) { + MPI_Wait( &(mp->mpi), &status ); + } + timer_end( &wait_receive_timer ); + /* wait for all send messages */ + timer_start( &wait_send_timer ); +#ifndef SEND_TEST + for( ; mp != NULL && mp->flag == SEND_FLAG; mp=mp->next) { + MPI_Wait( &(mp->mpi), &status ); + /* release the buffer */ + free( mp->buf ); + } +#endif + 
timer_end( &wait_send_timer ); + /* and free the mbuf */ + release_msg_tags( mbuf ); + return((msg_tag *)NULL); +} + + +/**************************************************************** + */ + + +/* SEND AND RECEIVE FIELD */ +/* send_field is to be called only by the node doing the sending */ +/* get_field is to be called only by the node to which the field was sent */ +void send_field(buf,size,tonode) + void *buf; int size,tonode; +{ + timer_start( &send_timer ); + MPI_Send(buf,size,MPI_BYTE,tonode,FIELD_TYPE,MPI_COMM_WORLD); + timer_end( &send_timer ); + mpi_sent_data += size; +} +void receive_field(buf,size) + void *buf; int size; +{ + MPI_Status status; + + timer_start( &send_timer ); + MPI_Recv(buf,size,MPI_BYTE,MPI_ANY_SOURCE,FIELD_TYPE, + MPI_COMM_WORLD,&status); + timer_end( &send_timer ); + mpi_sent_data += size; +} + +/* BASIC COMMUNICATIONS FUNCTIONS */ + +/* Tell what kind of machine we are on */ +static char name[]="MPI (portable)"; +char * machine_type(){ + return(name); +} + +/* Return my node number */ +int mynode() +{ + int node; + MPI_Comm_rank( MPI_COMM_WORLD, &node ); + return(node); +} + +/* Return number of nodes */ +int numnodes() +{ + int nodes; + MPI_Comm_size( MPI_COMM_WORLD, &nodes ); + return(nodes); +} + +/* Synchronize all nodes */ +void g_sync() +{ + timer_start( &g_sync_timer ); + MPI_Barrier( MPI_COMM_WORLD ); + timer_end( &g_sync_timer ); +} + +/* Sum float over all nodes to node 0 */ +void g_floatsum( float * fpt ) +{ + float work; + + timer_start( &g_sum_timer ); + if (this_node == 0) { + MPI_Reduce( fpt, &work, 1, MPI_FLOAT, MPI_SUM, 0 , MPI_COMM_WORLD ); + *fpt = work; + } else { + MPI_Reduce( fpt, &work, 1, MPI_FLOAT, MPI_SUM, 0 , MPI_COMM_WORLD ); + } + timer_end( &g_sum_timer ); +} + +/* Sum double over all nodes */ +void g_doublesum( double * dpt ) +{ + double work; + + timer_start( &g_sum_timer ); + if (this_node == 0) { + MPI_Reduce( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, 0 , MPI_COMM_WORLD ); + *dpt = work; + } else { + 
MPI_Reduce( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, 0 , MPI_COMM_WORLD ); + } + timer_end( &g_sum_timer ); +} + +/* Sum double over all nodes, and scatter the result */ +void g_doublesum_scatter( double * dpt ) +{ + double work; + + timer_start( &g_sum_timer ); + MPI_Allreduce( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); + *dpt = work; + timer_end( &g_sum_timer ); +} + + + +/* Sum a vector of doubles over all nodes */ +void g_vecintsum( dpt, n ) + int *dpt; int n; +{ + register int *work; + register int i; + + timer_start( &g_sum_timer ); + work = (int *)malloc(n*sizeof(int)); + if (this_node == 0) { + MPI_Reduce( dpt, work, n, MPI_INT, MPI_SUM, 0 , MPI_COMM_WORLD ); + for (i=0; istatus == GATHER_FOLLOW && (a->gathered[dir] ^ parity) == 0) { +#ifdef CHECK_GATHER_FIELDS + /* now looks like is gathered, check if the checkup field has changed */ + if (((parity & EVEN) && (*((unsigned int *)field) != a->check_even[dir] )) || + ((parity & ODD ) && (*((unsigned int *)(field+size*(node.sites-1))) + != a->check_odd[dir] ))) { + printf(" #### GATHER CHECK: Forgotten mark_changed() somewhere!\n"); + if (size < sizeof(int)) + printf(" Because field size %d < sizeof(int), can be spurious\n",size); + else { + printf(" Field size %d chars, dir %d parity %d\n",size, dir, parity); + } + halt(" ###### "); + } +#endif + n_gather_avoided++; + return(1); + } + n_gather_done++; + return(0); +} + +void gather_status_reset( char *field, int size ) +{ + struct gather_status_arr *a; + /* cast the latfield array */ + a = (struct gather_status_arr *) (field + size*node.latfield_size); + a->status = GATHER_NOT_FOLLOW; +} + +void gather_mark_dirty( char *field, int size, int parity ) +{ + int dir,p; + struct gather_status_arr *a; + + /* cast the latfield array */ + a = (struct gather_status_arr *) (field + size*node.latfield_size); + a->status = GATHER_FOLLOW; + p = opp_parity(parity); + /* mark opposite parity, because will fetch from there! 
*/ + /* Remember that need to mark opposite directions too! */ + for(dir=0; dirgathered[dir] &= (!p); +} + +void gather_mark_gathered( char *field, int size, int dir, int parity ) +{ + struct gather_status_arr *a; + + /* cast the latfield array */ + a = (struct gather_status_arr *) (field + size*node.latfield_size); + if (a->status == GATHER_FOLLOW) a->gathered[dir] |= parity; +#ifdef CHECK_GATHER_FIELDS + if (parity & EVEN) a->check_even[dir] = *((unsigned int *)field); + if (parity & ODD ) a->check_odd[dir] = + *((unsigned int *)(field+size*(node.sites-1))); +#endif +} + + +/****************************************************************/ + + + +/* version of exit for multinode processes -- kill all nodes */ +void terminate(int status) +{ + printf("Termination: node %d, status = %d\n",this_node,status); + fflush(stdout); + MPI_Abort( MPI_COMM_WORLD, 0); + exit(status); +} + + + + +/* clean exit from all nodes */ +void finishrun() +{ +#ifdef TIMERS + report_comm_timers(); +#endif + + if (this_node == 0) { + extern int n_gather_done,n_gather_avoided; + + printf(" COMMS from node 0: %d done, %d (%.2g%%) optimized away\n", + n_gather_done, n_gather_avoided, + 100.0*n_gather_avoided/(n_gather_avoided+n_gather_done)); + } + + fflush(stdout); + fflush(NULL); /* Try to flush all open files */ + MPI_Finalize(); + exit(0); +} + + +void report_comm_timers() +{ +#ifdef TIMERS + double tot; + if (this_node == 0) { + printf(" *************************\n"); + printf(" MPI communications timers from node 0:\n"); + + printf(" start_get: "); + timer_report( &start_gather_timer ); + printf(" waiting send: "); + timer_report( &wait_send_timer ); + printf(" receive: "); + timer_report( &wait_receive_timer ); + printf(" g_sync: "); + timer_report( &g_sync_timer ); + printf(" g_sum: "); + timer_report( &g_sum_timer ); + printf(" broadcast: "); + timer_report( &broadcast_timer ); + printf(" send/receive: "); + timer_report( &send_timer ); + printf(" send/receive %g MB, bandwith %g 
MB/sec\n", + mpi_sent_data*1e-6, mpi_sent_data*1e-6/send_timer.total); + + tot = start_gather_timer.total + wait_send_timer.total + wait_receive_timer.total + + g_sync_timer.total + + g_sum_timer.total + broadcast_timer.total + send_timer.total ; + + /* find current time */ + timer_end( &total_time ); + + printf(" Total comm. time %.3g, total time %.3g, comm %.2g%%\n", + tot, total_time.total, 100*tot/total_time.total ); + printf(" ***** \n"); + } +#endif +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_vanilla.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_vanilla.c new file mode 100644 index 0000000000000000000000000000000000000000..76d05233bd4cfb806a42c09632c8eaf6622ae5df --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/com_vanilla.c @@ -0,0 +1,208 @@ +/****************** com_vanilla.c *************************************** + * + * Communications routines, for single node interface + * + * + + g_sync() provides a synchronization point for all nodes. + g_floatsum() sums a floating point number over all nodes. + g_doublesum() sums a double over all nodes. + g_vecdoublesum() sums a vector of doubles over all nodes. + g_floatmax() finds maximum of a floating point number over all nodes. + g_doublemax() finds maximum of a double over all nodes. + broadcast_float() broadcasts a single precision number from + node 0 to all nodes. + broadcast_double() broadcasts a double precision number + send_integer() sends an integer to one other node + receive_integer() receives an integer + terminate() kills the job on all processors + + start_gather() starts asynchronous sends and receives required + to gather neighbors. + wait_gather() waits for receives to finish, insuring that the + data has actually arrived. + + send_field() sends a field to one other node. + receive_field() receives a field from some other node. +*/ + +/* load in definitions, variables etc. 
*/ + +#include "comdefs.h" +#include "generic.h" + +#define MIN_GATHER_INDEX 100 +#define MAX_GATHER_INDEX 7100 /* allows 7000 concurrent gathers */ +#define FIELD_TYPE 11 /* used in send/receive field */ +#define SEND_INTEGER_TYPE 12 /* used in send/receive int */ + +#define SEND_FLAG 1 /* flags for marking msg_tags */ +#define RECEIVE_FLAG 2 +#define END_FLAG 3 + + +extern comlist_struct *comlist; /* the comlist variables in layout */ + + +/************************************************************************/ + +/* Machine initialization */ +#include +void initialize_machine(argc,argv) int argc; char **argv; { + /* MPI_Init(&argc,&argv); */ +} + + +/* GATHER ROUTINES */ +/* start_gather() returns a pointer to a list of msg_tag's, which + be used as input to subsequent wait_gather(). + + This list contains msg_tags for all receive buffers, followed by + end flag. + + If no messages at all are required, the routine will return NULL. + msg_buf=NULL should be a reliable indicator of no message. + + usage: tag = start_gather( source, size, direction, parity ) +*/ + +msg_tag * start_gather( field, size, dir, parity ) + /* arguments */ + char * field; /* pointer to some latfield */ + int size; /* size in bytes of the field (eg sizeof(su3_vector))*/ + int dir; /* direction to gather from. eg XUP - index into + neighbor tables */ + int parity; /* parity of sites whose neighbors we gather. + one of EVEN, ODD or EVENODD (EVEN+ODD). */ +{ + return((msg_tag *)NULL); +} + + +msg_tag * wait_gather( msg_tag *mbuf ) +{ + if (mbuf == NULL) return((msg_tag *)NULL); + halt ("Wait in com_vanilla! Never happens! 
"); + return((msg_tag *)NULL); +} + + +/***************************************************/ + +msg_tag * start_scatter( field, size, dir, parity ) + char * field; /* pointer to some latfield */ + int size; /* size in bytes of the field (eg sizeof(su3_vector))*/ + int dir; + int parity; /* parity of sites whose neighbors we scatter + one of EVEN or ODD */ +{ + return((msg_tag *)NULL); +} + +msg_tag * wait_scatter( msg_tag *mbuf ) +{ + if (mbuf == NULL) return((msg_tag *)NULL); + halt ("Wait in com_vanilla! Never happens! "); + return((msg_tag *)NULL); +} + +/***************************************************/ + + +/* SEND AND RECEIVE FIELD */ +/* send_field is to be called only by the node doing the sending */ +/* get_field is to be called only by the node to which the field was sent */ +void send_field(buf,size,tonode) + void *buf; int size,tonode; +{ + /* MPI_Send(buf,size,MPI_BYTE,tonode,FIELD_TYPE,MPI_COMM_WORLD); */ +} +void receive_field(buf,size) + void *buf; int size; +{ + /* MPI_Status status; + MPI_Recv(buf,size,MPI_BYTE,MPI_ANY_SOURCE,FIELD_TYPE, + MPI_COMM_WORLD,&status); + */ +} + +/* BASIC COMMUNICATIONS FUNCTIONS */ + +/* Tell what kind of machine we are on */ +static char name[]="Single node (vanilla)"; +char * machine_type(){ + return(name); +} + +/* Return my node number */ +int mynode() +{ + return( 0 ); +} + +/* Return number of nodes */ +int numnodes() +{ + return( 1 ); +} + +/* Synchronize all nodes */ +void g_sync() {} + +/* Sum float over all nodes to node 0 */ +void g_floatsum( float * fpt, int dist ) {} + +/* Sum double over all nodes */ +void g_doublesum( double * dpt, int dist ) {} + +/* Sum a vector of doubles over all nodes */ +void g_vecintsum( int *dpt, int nfloats, int dist ) {} + +/* Sum a vector of doubles over all nodes */ +void g_vecfloatsum( float *dpt, int nfloats, int dist ) {} + +/* Sum a vector of doubles over all nodes */ +void g_vecdoublesum( double *dpt, int ndoubles, int dist ) {} + +/* Find maximum of float over all 
nodes */ +void g_floatmax( float * fpt ) {} + +/* Find maximum of double over all nodes */ +void g_doublemax( double *dpt ) {} + +/* Broadcast a whole field */ +void broadcast_field(void *pt, int size) {} + +/* Broadcast floating point number from node zero */ +void broadcast_float(float *fpt) {} + +/* Broadcast double precision floating point number from node zero */ +void broadcast_double( double *dpt ) {} + +/* Broadcast integer from node zero */ +void broadcast_int( int *dpt ) {} + +/* Send an integer to one other node */ +/* This is to be called only by the node doing the sending */ +void send_integer(tonode,address) + int tonode; int *address; +{ } + +/* Receive an integer from another node */ +/* Note we do not check if this was really meant for us */ +void receive_integer(address) + int *address; +{ } + +/* version of exit for multinode processes -- kill all nodes */ +void terminate(int status) +{ + printf("Termination: node %d, status = %d\n",this_node,status); + exit(status); +} + +/* clean exit from all nodes: exit status 0 reports success to the environment */ +void finishrun() +{ + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/comdefs.h b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/comdefs.h new file mode 100644 index 0000000000000000000000000000000000000000..b8c6daeced97112f364d0f814851865a03fef789 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/comdefs.h @@ -0,0 +1,483 @@ +/************************* comdefs.h ************************************* + * Header file to define global (and hidden from user) variables + * and define macros etc.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifdef CAN_DO_ALLOCA +#include /* needed for alloca */ +#endif + +#ifdef MPI +#include +#endif + +#ifdef __GNUC__ +#define INLINE inline +#else +#define INLINE +#endif + +#include "radix.h" + +/* This version divides the lattice by factors of two in any of the + four directions. It prefers to divide the longest dimensions, + which mimimizes the area of the surfaces. Similarly, it prefers + to divide dimensions which have already been divided, thus not + introducing more off-node directions. + + This requires that the lattice volume be divisible by the number + of nodes, which is a power of two. + + With the "GRAYCODE" option the node numbers are gray coded so that + adjacent lattice regions will physically be on adjacent nodes + in a hypercube architecture + + With the "EVENFIRST" option the even sites are listed contiguously + in the first part of the fields, and the odd sites in the last part. 
+*/ + +#ifndef NO_EVENFIRST /* use evenfirst in checkerboard-type parallel update */ +#define EVENFIRST +#endif +/* #define GRAYCODE */ + +/* Ensure correct sharing of variables */ +#ifdef CONTROL +#define EXTERN +#else +#define EXTERN extern +#endif + +/* Define parity variables */ + +#define EVEN 0x01 +#define ODD 0x02 +#define EVENODD 0x03 +#define ALL EVENODD + + +/* Directions, and a macro to give the opposite direction */ +/* Also define NDIRS = number of directions */ + +#ifndef DIMENSION +#define NDIM 3 +#else +#define NDIM DIMENSION +#endif + +#define NDIRS (2*NDIM) /* number of directions */ + +#if NDIM > 1 +#define XUP 0 +#define YUP 1 +#define XDOWN (NDIRS-1-XUP) +#define YDOWN (NDIRS-1-YUP) +#if NDIM > 2 +#define ZUP 2 +#define ZDOWN (NDIRS-1-ZUP) +#if NDIM > 3 +#define TUP 3 +#define TDOWN (NDIRS-1-TUP) +#endif +#endif +#endif + +#define opp_dir(dir) (NDIRS-1-(dir)) /* Opposite direction */ +#define is_up_dir(dir) ((dir) < NDIM) /* is it up-direction; argument parenthesized so expression arguments expand safely */ + +#define coordinate(i,dir) site[i].x[dir] +#define xcoord(i) coordinate(i,XUP) +#define ycoord(i) coordinate(i,YUP) +#define zcoord(i) coordinate(i,ZUP) +#define tcoord(i) coordinate(i,TUP) + +/************** some typedefs -- lattice, node, and site specific */ + +typedef struct lattice { + int volume,size[NDIM]; +} lattice_struct; + +typedef struct node { + int sites,evensites,oddsites; + int xmin[NDIM],nodesize[NDIM]; /* coordinate min and extent of the node's region */ + int down_node[NDIM],up_node[NDIM]; /* indices of nodes up and down to each direction */ + int latfield_size; /* used in allocating latfields */ +} node_struct; + +typedef struct site { + int parity, index, x[NDIM]; +} site_struct; + +/* Structure to keep track of outstanding sends and receives */ +typedef struct msg_tag_struct { + int flag,dir; /* status of the msg, direction */ + double start_time; /* start time for this gather */ + int size,nsites; /* Info about comms - used only in scatter */ + char *buf; /* buffer for the send messages */ + char
*field; /* ptr to latfield - used only in scatter */ + int *sitelist; /* site list pointer - used only by scatter */ + struct msg_tag_struct *next; /* next tag in the possible list */ +#ifdef MPI + MPI_Request mpi; /* message id returned by system call */ +#endif +} msg_tag; + +/* define COMLINK structures: this is defined for all + * nn-gathers. Each node contains + * comlink[dirs], which contains a list of + * send/receive node structs for each of the gathers. + * + * sendnode contains sitelist[], which is the list + * of sites which has to be copied to send buffer (allocated) + * The sites are ALWAYS ODD FIRST! Thus, when neighb. of + * even sites are collected, odd sites are moved (and offset = std.) + * For even sites we collect odd ones. + */ + +typedef struct sendnode { + int node; /* node index to send to */ + int n_even, n_odd, n; /* number of sites to be sent */ + int *sitelist; /* list of sites to be sent */ + struct sendnode *next; +} send_struct; + +typedef struct receivenode { + int node; /* node index to receive from */ + int n_even, n_odd, n; /* number of sites to be received */ + int offset; /* offset of the fields in latfield */ + struct receivenode *next; +} receive_struct; + +typedef struct comlist { + send_struct * to_node; + receive_struct * from_node; + int n_send,n_receive; +} comlist_struct; + +/*********************************************************/ +/* These routines check if we need to do fetching or not. 
+ */ + +#ifdef MPI + +#define CHECK_GATHER_FIELDS /* define this to have additional check */ + +typedef struct gather_status_arr { + int status; + unsigned char gathered[NDIRS]; +#ifdef CHECK_GATHER_FIELDS + /* these are used to check if there is forgotten mark_changed */ + unsigned int check_even[NDIRS], check_odd[NDIRS]; +#endif +} gather_status_arr; + +#define GATHER_STATUS_SIZE sizeof(struct gather_status_arr) + +void gather_status_reset( char *field, int size ); +void gather_mark_dirty( char *field, int size, int parity ); +int is_already_gathered( char *field, int size, int dir, int parity ); +void gather_mark_gathered( char *field, int size, int dir, int parity ); + +/* Routine for marking the field 'dirty' */ +#define mark_changed( a, parity ) \ + gather_mark_dirty( (char *)a, ((char *)&(a[1])) - ((char *)&(a[0])), parity ) + +#else +/*** NON-MPI routines ***/ + +#define GATHER_STATUS_SIZE 0 + +#define mark_changed( a, parity ) /* nothing */ + +#endif + + +/***************** Critical global variables defined here ********/ + +EXTERN int *neighb[NDIRS]; /* neighbour arrays */ +EXTERN site_struct *site; /* site array hangs here */ +EXTERN lattice_struct lattice; /* lattice defn */ +EXTERN node_struct node; /* and node information */ +EXTERN int this_node; /* number of this node */ +EXTERN int number_of_nodes; + +EXTERN int current_blocking_level[NDIM]; /* currently running blocking level */ +EXTERN lattice_struct base_lattice; /* base lattice struct */ + +/************* MACROS for latfield *****************/ + +#define nb(dir,i) neighb[dir][i] + +#define new_latfield( typ ) \ + (typ *)latfield_alloc( sizeof(typ) ) + +#define new_latfield_size( siz ) (char *)latfield_alloc( siz ) + +#define free_latfield( field ) if (field != NULL) free( field ) + +char *copy_latfield_func( char *f, int siz); +#define copy_latfield( f, typ ) (typ *)copy_latfield_func((char *)f, sizeof(typ) ) + +/*------------ Do we have alloca? 
----------------*/ +#ifdef CAN_DO_ALLOCA +/* allocate the tmp_latfield from the stack */ +static char *tmp_latf_ptr_; +#define tmp_latfield( typ ) \ + ( (tmp_latf_ptr_ = alloca( node.latfield_size * sizeof(typ) \ + + GATHER_STATUS_SIZE)) == NULL ? \ + (typ *)halt("alloca() error") : (typ *)tmp_latf_ptr_ ) +#define free_tmp( ptr ) /* nothing */ + +#else /* now not can alloca */ +#define tmp_latfield( typ ) new_latfield( typ ) +#define free_tmp( ptr ) free_latfield( ptr ) +#endif + +/*------------- General field blocking -----------*/ + +#define block_field( field, b, fr ) \ + block_field_prg( field, ((char *)&(field[1])) - ((char *)&(field[0])), b, fr ) + +/************* Gathering ***************************/ + +/* async gather and wait */ +#define start_get( a, dir, parity ) \ + start_gather( (char *)a, ((char *)&(a[1])) - ((char *)&(a[0])), dir, parity ) + +#define wait_get( tg ) wait_gather( tg ) + +/* synchronous gather */ +#define get_field( a, dir, parity ) wait_get( start_get( a, dir, parity ) ) + +/* async scatter and wait */ +#define start_put( a, dir, parity ) \ + start_scatter( (char *)a, ((char *)&(a[1])) - ((char *)&(a[0])), dir, parity ) + +#define wait_put( tg ) wait_scatter( tg ) + +/************* MACROS for looping ******************/ + +#define forbothparities(parity) for (parity=EVEN; parity<=ODD; parity++) + +#define forallsites(i) for(i=0; idir; wait_loop_=1; } else wait_loop_=0; \ +for(wait_i_=0; wait_i_<=wait_loop_; wait_i_++, tag = wait_gather( tag )) \ +forallsites(i) if ((!wait_loop_) || ((wait_i_) ^ (nb(wait_dir1_,i) < node.sites) )) + +#define forallsites_wait2(i,tag1,tag2) \ +if (tag1 != NULL) wait_dir1_=tag1->dir; \ +if (tag2 != NULL) wait_dir2_=tag2->dir; \ +if (tag1 != NULL || tag2 != NULL) wait_loop_=1; else wait_loop_=0; \ +if (tag1 == NULL) wait_dir1_ = wait_dir2_; /* short circuit these */ \ +if (tag2 == NULL) wait_dir2_ = wait_dir1_; \ +for(wait_i_=0; wait_i_<=wait_loop_; \ + wait_i_++, tag1 = wait_gather(tag1), tag2 = 
wait_gather(tag2)) \ +forallsites(i) if ((!wait_loop_) || ((wait_i_) ^ (nb(wait_dir1_,i) < node.sites && \ + nb(wait_dir2_,i) < node.sites) )) + +#define forparity_wait(i,par,tag) \ +if (tag != NULL) { wait_dir1_=tag->dir; wait_loop_=1; } else wait_loop_=0; \ +for(wait_i_=0; wait_i_<=wait_loop_; wait_i_++, tag = wait_gather( tag )) \ +forparity(i,par) if ((!wait_loop_) || ((wait_i_) ^ (nb(wait_dir1_,i) < node.sites) )) + +/********* now, make gather with arbitrary waits -- defined in com_mpi */ +void initialize_wait_arrays(); +unsigned int setup_wait_arr( msg_tag *t[], int ntag ); +#define NA_MAX 10 /* arbitrarily 10 gathers */ +EXTERN unsigned char *wait_arr_; +static unsigned int site_mask_; + +/* here wait_loop_ is 0 or 1 */ +#define forallsites_waitA(i,tag,ntag) \ +for (site_mask_ = setup_wait_arr( tag, ntag ), \ + wait_loop_ = (site_mask_ != 0), wait_i_=0; \ + wait_i_<=wait_loop_; wait_gather_arr(tag,ntag), wait_i_++) \ +forallsites(i) if (((site_mask_ & wait_arr_[i]) != 0) == wait_i_) + +/* here wait_loop_ is 0 or 1 */ +#define forparity_waitA(i,parity,tag,ntag) \ +for (site_mask_ = setup_wait_arr( tag, ntag ), \ + wait_loop_ = (site_mask_ != 0), wait_i_=0; \ + wait_i_<=wait_loop_; wait_gather_arr(tag,ntag), wait_i_++) \ +forparity(i,parity) if (((site_mask_ & wait_arr_[i]) != 0) == wait_i_) + + +#ifdef OLD_WAIT_ARR +int setup_wait_arr( unsigned char *wait_arr, msg_tag *to[], msg_tag *ti[], int ntag ); +EXTERN msg_tag *waitA_tags[NA_MAX+1]; + +#define forallsites_waitA(i,tag,ntag) \ +for (wait_loop_ = setup_wait_arr( wait_arr_, waitA_tags, tag, ntag ), \ + wait_i_=0; wait_i_<=wait_loop_; wait_gather(waitA_tags[wait_i_]), wait_i_++ ) \ +forallsites(i) if (wait_i_ == wait_arr_[i]) + +#define forparity_waitA(i,parity,tag,ntag) \ +for (wait_loop_ = setup_wait_arr( wait_arr_, waitA_tags, tag, ntag ), \ + wait_i_=0; wait_i_<=wait_loop_; wait_gather(waitA_tags[wait_i_]), wait_i_++ ) \ +forparity(i,parity) if (wait_i_ == wait_arr_[i]) +#endif + +/* now, if we define 
updates inside bulk most of the above stuff is + * superfluous. Let it be for compatibility though + */ +#ifdef NODE_UPDATE +int active_link(int i,int dir); +int active_site(int i); +#endif + + +/************************************************************************/ + +#else +/** Non-MPI versions **/ +#define forallsites_wait(i,tag) forallsites(i) +#define forallsites_wait2(i,tag1,tag2) forallsites(i) +#define forparity_wait(i,par,tag) forparity(i,par) +#define forallsites_wait3(i,tag1,tag2,tag3) forallsites(i) +#define forallsites_waitA(i,tag,ntag) forallsites(i) +#define forparity_waitA(i,parity,tag,ntag) forparity(i,parity) + +#ifdef NODE_UPDATE +#define active_link(i,dir) 1 +#define active_site(i,dir) 1 +#endif + + +#endif + + +/*********************************************************/ + +void zero_arr(int x[NDIM]); + +#define forallcoordinates(x) \ + for(zero_arr(x); is_allowed_coord(x,&lattice); step_coord(x,&lattice) ) + +#define foralldir(dir) for(dir=0; dir>1))) +/* Switches EVEN and ODD, leaves EVENODD*/ + +int is_allowed_coord(int x[NDIM],lattice_struct *l); +void step_coord(int x[NDIM],lattice_struct *l); + +/********** Some helpers *********************************/ + +/*********************************************************/ + +/* Communications routines */ +void send_field(void *,int,int); +void receive_field(void *,int); +char * machine_type(); + +void g_sync(); +void g_vecintsum(int *,int,int); +void g_floatsum(float *,int); +void g_vecfloatsum(float *,int,int); +void g_doublesum(double *,int); +void g_vecdoublesum(double *,int,int); +void g_floatmax(float *); +void g_doublemax(double *); +void broadcast_field(void *p,int siz); +void broadcast_float(float *); +void broadcast_double(double *); +void broadcast_int(int *); +void send_integer(int node,int *address); +void receive_integer(int *); +double dclock(); +void terminate(); void finishrun(); + +char *memalloc(int n, int size); +char *latfield_alloc(int size); + +msg_tag *start_gather(char 
*field, int size, int dir, int parity ); +msg_tag *wait_gather(msg_tag *mbuf); +void wait_gather_arr(msg_tag *mbuf[],int n); + +msg_tag *start_scatter(char *field, int size, int dir, int parity ); +msg_tag *wait_scatter(msg_tag *mbuf); + + +void copy_lat_data_to_node( void *dat, int dsize, + int xmin[NDIM], int xmax[NDIM], void *t, int node ); +void copy_lat_slice( void *dat, int dsize, int dir, int slice, void *t); + + +void setup_lattice(int size[NDIM]); +void make_lattice_arrays(lattice_struct * l); +void initialize_machine(); +void make_gathers(); +char *machine_type(); +int mynode(),numnodes(); +int node_number(int loc[NDIM]), node_index(int loc[NDIM],node_struct *s); +int is_on_node(int loc[NDIM]); + +void set_blocking_level(int b[NDIM]); +void set_blocking_all(int d); +int *make_blocking_map(int b[NDIM]); +#define reset_blocking_level() set_blocking_all(0) + +void report_comm_timers(); + +#ifdef RADIX_F +#define g_radixsum g_floatsum +#define g_vecradixsum g_vecfloatsum +#define broadcast_radix broadcast_float +#elif defined(RADIX_D) +#define g_radixsum g_doublesum +#define g_vecradixsum g_vecdoublesum +#define broadcast_radix broadcast_double +#else + no radix +#endif + +#define g_veccomplexsum(a, b, c) g_vecradixsum((radix *)a, 2*(b), c) +#define g_complexsum(a, b) g_vecradixsum((radix *)(a), 2, b) + + +/**** Other protos ****/ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/copy_lat_data_to_zero.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/copy_lat_data_to_zero.c new file mode 100644 index 0000000000000000000000000000000000000000..0b08443df0550527d513a0c79c9dab76c6c9b5fa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/copy_lat_data_to_zero.c @@ -0,0 +1,106 @@ +/**************************************************************** + * This routine copies 'latfield'-distributed data from all nodes + * to node zero. It copies a box determined by coordinates + * xmin and xmax. 
The target field t must be already allocated + */ + +#include "comdefs.h" +#include "generic.h" + +void copy_lat_data_to_node( void *dat, int dsize, + int xmin[NDIM], int xmax[NDIM], void *t, int target_node ) +{ + int i,d,n,ok,idx; + int x[NDIM],xsiz[NDIM],nmin[NDIM],nmax[NDIM],vmin[NDIM],vmax[NDIM],siz; + extern node_struct *allnodes; /* defined in layout.c */ + node_struct *np; + char *src, *trg; + + + foralldir(d) { + if (xmin[d] < 0 || xmax[d] > lattice.size[d] || xmin[d] > xmax[d]) + halt("Block size error in copy_lat_data"); + } + + foralldir(d) xsiz[d] = xmax[d] - xmin[d] + 1; + + /* Go through the nodes */ + for (n=0; nxmin[d]; + nmax[d] = np->xmin[d] + np->nodesize[d] - 1; + vmin[d] = greater(nmin[d],xmin[d]); + vmax[d] = smaller(nmax[d],xmax[d]); + ok = ok && (vmax[d] - vmin[d] >= 0); + siz = siz * (vmax[d] - vmin[d] + 1); + } + + /* printf("NODE %d, size %d, OK %d\n",n,siz,ok); + */ + if (ok) { + /* Now is included */ + char * cp = NULL; + if (n != target_node) cp = (char *)memalloc(siz,dsize); + + if (this_node == target_node && n != target_node) { + /* node target sends ack to n, and receives */ + send_field( &siz, sizeof(int), n ); + receive_field( cp, siz*dsize ); + } + + /* copy data */ + foralldir(d) x[d] = vmin[d]; + x[0]--; /* need to subtract 1 from 1st to make the addition */ + for (i=0; i vmax[d]) { x[d] = vmin[d]; d++; } + + /* copy from cp if we're node 0 receiving, else dat */ + if (this_node == target_node && n != target_node) src = cp + i*dsize; + else src = ((char *)dat) + node_index(x,np)*dsize; + + /* copy to t in node target, else to cp */ + if (this_node != target_node) trg = cp + i*dsize; + else { + idx = 0; + for(d=NDIM-1; d>=0; d--) idx = x[d]-xmin[d] + idx*xsiz[d]; + trg = ((char *)t) + idx*dsize; + } + + memcpy( trg, src, dsize ); + } + + if (this_node != target_node) { + /* receive ack, and send the stuff */ + receive_field( &i, sizeof(int) ); + if (i != siz) halt(" copy_lat_data siz error"); + send_field( cp, siz*dsize, 
target_node ); + } + + if (n != target_node) free( cp ); + } + } /* if this_node == target || n */ + } /* loop over nodes */ + + g_sync(); +} + +/* Copy one lattice slice to node 0: coordinate dir is fixed at slice, every other direction spans the full lattice. x1/x2 are inclusive bounds, because copy_lat_data_to_node sizes the box as xmax-xmin+1; t must already be allocated. */ +void copy_lat_slice(void *dat, int dsize, int dir, int slice, void *t) +{ + int d,x1[NDIM], x2[NDIM]; + + foralldir(d) { + if (d != dir) { + x1[d] = 0; + x2[d] = lattice.size[d] - 1; /* last valid coordinate (inclusive upper bound) */ + } else x1[d] = x2[d] = slice; + } + + copy_lat_data_to_node( dat, dsize, x1, x2, t, 0 ); /* pass the pointer itself; a void * cannot be dereferenced */ +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gauge_stuff.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gauge_stuff.c new file mode 100644 index 0000000000000000000000000000000000000000..c2a985c61156b57164192cefa6159faa2dce2aad --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gauge_stuff.c @@ -0,0 +1,465 @@ +/****** gauge_stuff.c -- ******************/ +/* MIMD version 6 */ +/* gauge action stuff for improved action +* T.D. and A.H. general gauge action updating code +* D.T. modified 5/97 +* D.T. modified 12/97, optimized gauge_force a little +* D.T. modified 3/99, gauge action in include file */ + +/**#define GFTIME**/ /* For timing gauge force calculation */ +#include "generic_includes.h" /* definitions files and prototypes */ + +#ifdef LOOPEND +#undef FORALLSITES +#define FORALLSITES(i,s) \ +{ register int loopend; loopend=sites_on_node; \ +for( i=0, s=lattice ; iTUP) +void printpath( int *path, int length ); + +#define GAUGE_ACTION_PART1 +/* defines NREPS NLOOP MAX_LENGTH MAX_NUM */ +#include +#undef GAUGE_ACTION_PART1 + +char gauge_action_description[128]; +int gauge_action_nloops=NLOOP; +int gauge_action_nreps=NREPS; +int loop_length[NLOOP]; /* lengths of various kinds of loops */ +int loop_num[NLOOP]; /* number of rotations/reflections for each kind */ + + /* table of directions, 1 for each kind of loop */ +int loop_ind[NLOOP][MAX_LENGTH]; + /* table of directions, for each rotation and reflection of each kind of + loop. tabulated with "canonical" starting point and direction.
*/ +int loop_table[NLOOP][MAX_NUM][MAX_LENGTH]; + /* table of coefficients in action, for various "representations" (actually, + powers of the trace) */ +float loop_coeff[NLOOP][NREPS]; + /* for each rotation/reflection, an integer distinct for each starting + point, or each cyclic permutation of the links */ +int loop_char[MAX_NUM]; + /* for each kind of loop for each rotation/reflection, the expectation + value of the loop */ +double loop_expect[NLOOP][NREPS][MAX_NUM]; + + +/* Make table of loops in action */ +void make_loop_table() { + + int perm[8],pp[8],ir[4]; + int length,iloop,i,j,chr; + int vec[MAX_LENGTH]; + int count,flag; + void char_num( int *dig, int *chr, int length); + +#define GAUGE_ACTION_PART2 +/* defines all loops and their coefficients */ +#include +#undef GAUGE_ACTION_PART2 + + for(iloop=0;iloopMAX_NUM){ + node0_printf("OOPS: MAX_NUM too small\n"); + exit(0); + } + loop_num[iloop]=count; + + } /* end reflection*/ + } /* end permutation if block */ + } /* end permutation */ + } /* end iloop */ + + /* print out the loop coefficients */ + node0_printf("loop coefficients: nloop rep loop_coeff multiplicity\n"); + for(i=0;i=0;j--) *chr= *chr*10+dig[j]; + + /* forward*/ + old=*chr; + for(j=length-1;j>=1;j--){ + newv=old-tenl*dig[j]; + newv=newv*10+dig[j]; + if(newv < *chr) *chr=newv; + old=newv; } + + /* backward*/ + for(j=0;j=0;j--) old=old*10+bdig[j]; + if(old < *chr ) *chr=old; + for(j=length-1;j>=1;j--){ + newv=old-tenl*bdig[j]; + newv=newv*10+bdig[j]; + if(newv < *chr) *chr=newv; + old=newv; } + +} /* char_num */ + +double imp_gauge_action() { + register int i; + int rep; + register site *s; + complex trace; + double g_action; + double action,act2,total_action; + int length; + + /* these are for loop_table */ + int ln,iloop; + + g_action=0.0; + + /* gauge action */ + for(iloop=0;ilooptempmat1 ); + action = 3.0 - (double)trace.real; + /* need the "3 -" for higher characters */ + total_action= (double)loop_coeff[iloop][0]*action; + act2=action; + 
for(rep=1;repstaple.e[j][k]=cmplx(0.0,0.0); + } END_LOOP + + ncount=0; + for(iloop=0;ilooptempmat1), &tmat1 ); + /* first we compute the fundamental term */ + new_term = loop_coeff[iloop][0]; + + /* now we add in the higher representations */ + if(NREPS > 1){ +node0_printf("WARNING: THIS CODE IS NOT TESTED\n"); exit(0); + act2=1.0; + action = 3.0 - realtrace_su3(&(st->link[dir]), + &tmat1 ); + + for(j=1;j 1 */ + + scalar_mult_add_su3_matrix( &(st->staple), &tmat1, + new_term, &(st->staple) ); + + } END_LOOP + + ncount++; + + } /* k (location in path) */ + } /* ln */ + } /* iloop */ + + /* Now multiply the staple sum by the link, then update momentum */ + FORALLSITES(i,st){ + mult_su3_na( &(st->link[dir]), &(st->staple), &tmat1 ); + momentum = (anti_hermitmat *)F_PT(st,mom_off); + uncompress_anti_hermitian( &momentum[dir], &tmat2 ); + scalar_mult_sub_su3_matrix( &tmat2, &tmat1, + eb3, &(st->staple) ); + make_anti_hermitian( &(st->staple), &momentum[dir] ); + } END_LOOP + } /* dir loop */ +#ifdef GFTIME +dtime+=dclock(); +node0_printf("GFTIME: time = %e (Symanzik1) mflops = %e\n",dtime, + nflop*volume/(1e6*dtime*numnodes()) ); +#endif +} /* imp_gauge_force.c */ + +/* Measure gauge observables: + Loops in action (time and space directions treated differently) + Polyakov loop + +*/ +void g_measure( ){ + double ss_plaquette, st_plaquette; + complex p_loop; + register int i; + register site *s; + complex trace; + double average[NREPS],action,act2,total_action; + int length; + /* these are for loop_table */ + int ln,iloop,rep; + + /* KS and BC minus signs should be out for this routine */ + d_plaquette( &ss_plaquette, &st_plaquette ); + if(this_node==0)printf("PLAQ:\t%f\t%f\n", ss_plaquette, st_plaquette ); + + p_loop = ploop(); + if(this_node==0)printf("P_LOOP:\t%e\t%e\n", p_loop.real, p_loop.imag ); + + /* gauge action, all loops that contribute */ + total_action=0.0; + for(iloop=0;ilooptempmat1 ); + average[0] += (double)trace.real; + action = 3.0 - (double)trace.real; 
+ total_action += (double)loop_coeff[iloop][0]*action; + /* need the "3 -" for higher characters */ + act2=action; + for(rep=1;rep +void dsdu_qhb_subl(int dir, int subl) +{ +register site *st; +register int i; +int iloop, ln, k, j; +int dirs[MAX_LENGTH], length; +int path_dir[MAX_LENGTH], path_length; +su3_matrix tmat1; +int fsubl; + + assert(NREPS==1); /* This procedure designed only for NREPS = 1 */ + + FORSOMESUBLATTICE(i,st,subl) { + clear_su3mat(&(st->staple)); + } + + for(iloop=0;ilooptempmat1), &tmat1 ); + scalar_mult_add_su3_matrix(&(st->staple), &tmat1, + loop_coeff[iloop][0], &(st->staple) ); + } + } /* k (location in path) */ + } /* ln */ + } /* iloop */ + + g_sync(); + +} /* dsdu_qhb */ + +#endif /* N_SUBL32 */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gaugefix.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gaugefix.c new file mode 100644 index 0000000000000000000000000000000000000000..d8aaf1feb62f80a6884078a188d1dc39112808a3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gaugefix.c @@ -0,0 +1,550 @@ +/************************** gaugefix.c *******************************/ +/* Fix Coulomb or Lorentz gauge by doing successive SU(2) gauge hits */ +/* Uses double precision global sums */ +/* MIMD version 6 */ +/* C. DeTar 10-22-90 */ +/* T. DeGrand 1993 */ +/* U.M. Heller 8-31-95 */ +/* C. DeTar 10-11-97 converted to generic */ + +/* Prototype... + + void gaugefix(int gauge_dir,radix relax_boost,int max_gauge_iter, + radix gauge_fix_tol, field_offset diffmat, field_offset sumvec, + int nvector, field_offset vector_offset[], int vector_parity[], + int nantiherm, field_offset antiherm_offset[], + int antiherm_parity[] ) + ------------------------------------------------------------------- + + NOTE: For staggered fermion applications, it is necessary to remove + the KS phases from the gauge links before calling this procedure. + See "rephase" in setup.c. 
+ + ------------------------------------------------------------------- + EXAMPLE: Fixing only the link matrices to Coulomb gauge with scratch + space in mp (su3_matrix) and chi (su3_vector): + + gaugefix(TUP,(float)1.5,500,(float)1.0e-7, + F_OFFSET(mp),F_OFFSET(chi),0,NULL,NULL,0,NULL,NULL); + + ------------------------------------------------------------------- + EXAMPLE: Fixing Coulomb gauge with respect to the y direction + in the staggered fermion scheme and simultaneously transforming + the pseudofermion fields and gauge-momenta involved in updating: + + int nvector = 3; + field_offset vector_offset[3] = { F_OFFSET(g_rand), F_OFFSET(phi), + F_OFFSET(xxx) }; + int vector_parity[3] = { EVENANDODD, EVEN, EVEN }; + int nantiherm = 4; + field_offset antiherm_offset[4] = { F_OFFSET(mom[0]), F_OFFSET(mom[1]), + F_OFFSET(mom[2]), F_OFFSET(mom[3]) }; + field_offset antiherm_parity[4] = { EVENANDODD, EVENANDODD, EVENANDODD, + EVENANDODD } + + rephase( OFF ); + gaugefix(YUP,(float)1.8,500,(float)2.0e-6, + F_OFFSET(tempmat1),F_OFFSET(tempvec[0]), + nvector,vector_offset,vector_parity, + nantiherm,antiherm_offset,antiherm_parity); + rephase( ON ); + + ------------------------------------------------------------------- + + gauge_dir specifies the direction of the "time"-like hyperplane + for the purposes of defining Coulomb or Lorentz gauge + TUP for evaluating propagators in the time-like direction + ZUP for screening lengths. + 8 for Lorentz gauge + relax_boost Overrelaxation parameter + max_gauge_iter Maximum number of iterations + gauge_fix_tol Stop if change is less than this + diffmat Scratch space for an su3 matrix + sumvec Scratch space for an su3 vector + NOTE: if diffmat or sumvec are negative, gaugefix mallocs its own + scratch space. 
*/ + +#include "generic_includes.h" + +/* Generic definitions - could be useful elsewhere */ + +/* CDIF(a,b) a -= b */ + /* a -= b */ +#define CDIF(a,b) { (a).real -= (b).real; (a).imag -= (b).imag; } + +/* Scratch space */ + +su3_matrix *diffmatp; /* malloced diffmat pointer */ +su3_vector *sumvecp; /* malloced sumvec pointer */ +field_offset diffmat_offset,sumvec_offset; /* field offsets */ + +void mult_su2_mat_vec_elem_n(su2_matrix *u,complex *x0,complex *x1) +{ + /* Multiplies the complex column spinor (x0, x1) by the SU(2) matrix u */ + /* and puts the result in (x0,x1). */ + /* Thus x <- u * x */ + /* C. DeTar 3 Oct 1990 */ + + complex z0, z1, t0, t1; + + t0 = *x0; t1 = *x1; + + CMUL(u->e[0][0], t0, z0); + CMUL(u->e[0][1], t1, z1); + CADD(z0, z1, *x0); + CMUL(u->e[1][0], t0, z0); + CMUL(u->e[1][1], t1, z1); + CADD(z0, z1, *x1); + +} /* mult_su2_mat_vec_elem_n */ + +void mult_su2_mat_vec_elem_a(su2_matrix *u,complex *x0,complex *x1) +{ + /* Multiplies the complex row spinor (x0, x1) by the adjoint of the */ + /* SU(2) matrix u and puts the result in (x0,x1). */ + /* Thus x <- x * u-adj */ + /* C. DeTar 3 Oct 1990 */ + + complex z0, z1, t0, t1; + + t0 = *x0; t1 = *x1; + + CMUL_J(t0, u->e[0][0], z0); + CMUL_J(t1, u->e[0][1], z1); + CADD(z0, z1, *x0); + CMUL_J(t0, u->e[1][0], z0); + CMUL_J(t1, u->e[1][1], z1); + CADD(z0, z1, *x1); + +} /* mult_su2_mat_vec_elem_a */ + +void dumpsu2(su2_matrix *u) +{ + int i,j; + for(i=0;i<2;i++){ + for(j=0;j<2;j++)printf("(%.2e,%.2e)\t", + (double)u->e[i][j].real,(double)u->e[i][j].imag); + printf("\n"); + } + printf("\n"); +} + +void left_su2_hit_n(su2_matrix *u,int p,int q,su3_matrix *link) +{ + /* link <- u * link */ + /* The 0 row of the SU(2) matrix u matches row p of the SU(3) matrix */ + /* The 1 row of the SU(2) matrix u matches row q of the SU(3) matrix */ + /* C. 
DeTar 18 Oct 1990 */ + + register int m; + + for (m = 0; m < 3; m++) + mult_su2_mat_vec_elem_n(u, &(link->e[p][m]), &(link->e[q][m])); + +} /* left_su2_hit_n */ + +void right_su2_hit_a(su2_matrix *u,int p,int q,su3_matrix *link) +{ + /* link <- link * u adj */ + /* The 0 column of u-adjoint matches column p of the SU(3) matrix */ + /* The 1 column of u-adjoint matches column q of the SU(3) matrix */ + /* C. DeTar 18 Oct 1990 */ + + register int m; + + for (m = 0; m < 3; m++) + mult_su2_mat_vec_elem_a(u, &(link->e[m][p]), &(link->e[m][q])); + +} /*right_su2_hit_a */ + +void accum_gauge_hit(int gauge_dir,int parity) +{ + +/* Accumulates sums and differences of link matrices for determining optimum */ +/* hit for gauge fixing */ +/* Differences are kept in diffmat and the diagonal elements of the sums */ +/* in sumvec */ + + register int j; + register su3_matrix *m1,*m2; + register int dir,i; + + /* Clear sumvec and diffmat */ + + forsomeparity(i,parity) + { + if(diffmat_offset >= 0) + clear_su3mat((su3_matrix *)F_PT(s,diffmat_offset)); + else + clear_su3mat(&diffmatp[i]); + if(sumvec_offset >= 0) + clearvec((su3_vector *)F_PT(s,sumvec_offset)); + else + clearvec(&sumvecp[i]); + } + + /* Subtract upward link contributions */ + + FORSOMEPARITY(i,s,parity) + { + FORALLUPDIRBUT(gauge_dir,dir) + { + /* Upward link matrix */ + m1 = &(s->link[dir]); + if(diffmat_offset >= 0) + sub_su3_matrix((su3_matrix *)F_PT(s,diffmat_offset), + m1, (su3_matrix *)F_PT(s,diffmat_offset)); + else + sub_su3_matrix( &diffmatp[i], m1, &diffmatp[i]); + + if(sumvec_offset >= 0) + { + for(j=0;j<3;j++)CSUM( ((su3_vector *)F_PT(s,sumvec_offset))->c[j], + m1->e[j][j]); + } + else + { + for(j=0;j<3;j++)CSUM( sumvecp[i].c[j],m1->e[j][j]); + } + } + } + + /* Add downward link contributions */ + + FORSOMEPARITY(i,s,parity) + { + FORALLUPDIRBUT(gauge_dir,dir) + { + /* Downward link matrix */ + m2 = (su3_matrix *)gen_pt[dir][i]; + + if(diffmat_offset >= 0) + add_su3_matrix((su3_matrix 
*)F_PT(s,diffmat_offset), m2, + (su3_matrix *)F_PT(s,diffmat_offset)); + else + add_su3_matrix( &diffmatp[i], m2, &diffmatp[i]); + + if(sumvec_offset >= 0) + { + for(j=0;j<3;j++)CSUM( ((su3_vector *)F_PT(s,sumvec_offset))->c[j], + m2->e[j][j]); + } + else + { + for(j=0;j<3;j++)CSUM( sumvecp[i].c[j], m2->e[j][j]); + } + + /* Add diagonal elements to sumvec */ + } + } +} /* accum_gauge_hit */ + + +void do_hit(int gauge_dir, int parity, int p, int q, float relax_boost, + int nvector, field_offset vector_offset[], int vector_parity[], + int nantiherm, field_offset antiherm_offset[], + int antiherm_parity[] ) +{ + /* Do optimum SU(2) gauge hit for p, q subspace */ + + float a0,a1,a2,a3,asq,a0sq,x,r,xdr; + register int dir,i,j; + register site *s; + su2_matrix u; + su3_matrix htemp; + + /* Accumulate sums for determining optimum gauge hit */ + + accum_gauge_hit(gauge_dir,parity); + + FORSOMEPARITY(i,s,parity) + { + /* The SU(2) hit matrix is represented as a0 + i * Sum j (sigma j * aj)*/ + /* The locally optimum unnormalized components a0, aj are determined */ + /* from the current link in direction dir and the link downlink */ + /* in the same direction on the neighbor in the direction opposite dir */ + /* The expression is */ + /* a0 = Sum dir Tr Re 1 * (downlink dir + link dir) */ + /* aj = Sum dir Tr Im sigma j * (downlink dir - link dir) j = 1,2, 3 */ + /* where 1, sigma j are unit and Pauli matrices on the p,q subspace */ + /* + a0 = s->sumvec.c[p].real + s->sumvec.c[q].real; + a1 = s->diffmat.e[q][p].imag + s->diffmat.e[p][q].imag; + a2 = -s->diffmat.e[q][p].real + s->diffmat.e[p][q].real; + a3 = s->diffmat.e[p][p].imag - s->diffmat.e[q][q].imag; +*/ + if(sumvec_offset >= 0) + a0 = ((su3_vector *)F_PT(s,sumvec_offset))->c[p].real + + ((su3_vector *)F_PT(s,sumvec_offset))->c[q].real; + else + a0 = sumvecp[i].c[p].real + sumvecp[i].c[q].real; + + if(diffmat_offset >= 0) + { + a1 = ((su3_matrix *)F_PT(s,diffmat_offset))->e[q][p].imag + + ((su3_matrix 
*)F_PT(s,diffmat_offset))->e[p][q].imag; + a2 = -((su3_matrix *)F_PT(s,diffmat_offset))->e[q][p].real + + ((su3_matrix *)F_PT(s,diffmat_offset))->e[p][q].real; + a3 = ((su3_matrix *)F_PT(s,diffmat_offset))->e[p][p].imag - + ((su3_matrix *)F_PT(s,diffmat_offset))->e[q][q].imag; + } + else + { + a1 = diffmatp[i].e[q][p].imag + diffmatp[i].e[p][q].imag; + a2 = -diffmatp[i].e[q][p].real + diffmatp[i].e[p][q].real; + a3 = diffmatp[i].e[p][p].imag - diffmatp[i].e[q][q].imag; + } + + /* Over-relaxation boost */ + + /* This algorithm is designed to give little change for large |a| */ + /* and to scale up the gauge transformation by a factor of relax_boost*/ + /* for small |a| */ + + asq = a1*a1 + a2*a2 + a3*a3; + a0sq = a0*a0; + x = (relax_boost*a0sq + asq)/(a0sq + asq); + r = sqrt((double)(a0sq + x*x*asq)); + xdr = x/r; + /* Normalize and boost */ + a0 = a0/r; a1 = a1*xdr; a2 = a2*xdr; a3 = a3*xdr; + + /* Elements of SU(2) matrix */ + + u.e[0][0] = cmplx( a0, a3); + u.e[0][1] = cmplx( a2, a1); + u.e[1][0] = cmplx(-a2, a1); + u.e[1][1] = cmplx( a0,-a3); + + + /* Do SU(2) hit on all upward links */ + + FORALLUPDIR(dir) + left_su2_hit_n(&u,p,q,&(s->link[dir])); + + /* Do SU(2) hit on all downward links */ + + FORALLUPDIR(dir) + right_su2_hit_a(&u,p,q,(su3_matrix *)gen_pt[dir][i]); + + /* Transform vectors and gauge momentum if requested */ + + for(j = 0; j < nvector; j++) + + /* Do SU(2) hit on specified su3 vector for specified parity */ + + /* vector <- u * vector */ + if(vector_parity[j] == EVENANDODD || vector_parity[j] == parity) + mult_su2_mat_vec_elem_n(&u, + &((su3_vector *)F_PT(s,vector_offset[j]))->c[p], + &((su3_vector *)F_PT(s,vector_offset[j]))->c[q]); + + /* Transform antihermitian matrices if requested */ + + for(j = 0; j < nantiherm; j++) + /* antiherm <- u * antiherm * u^dagger */ + if(antiherm_parity[j] == EVENANDODD || antiherm_parity[j] == parity) + { + uncompress_anti_hermitian( + (anti_hermitmat *)F_PT(s,antiherm_offset[j]), &htemp); + /* If the next 2 
steps prove too time consuming, */ + /* they can be simplified algebraically, and sped up by ~2 */ + left_su2_hit_n(&u,p,q,&htemp); + right_su2_hit_a(&u,p,q,&htemp); + make_anti_hermitian( &htemp, + (anti_hermitmat *)F_PT(s,antiherm_offset[j])); + } + } + + /* Exit with modified downward links left in communications buffer */ +} /* do_hit */ + +double get_gauge_fix_action(int gauge_dir,int parity) +{ + /* Adds up the gauge fixing action for sites of given parity */ + /* Returns average over these sites */ + /* The average is normalized to a maximum of 1 when all */ + /* links are unit matrices */ + + register int dir,i,ndir; + register site *s; + register su3_matrix *m1, *m2; + double gauge_fix_action; + complex trace; + + gauge_fix_action = 0.0; + + FORSOMEPARITY(i,s,parity) + { + FORALLUPDIRBUT(gauge_dir,dir) + { + m1 = &(s->link[dir]); /* link stored on this site */ + m2 = (su3_matrix *)gen_pt[dir][i]; /* matrix the caller gathered into gen_pt[dir] */ + + trace = trace_su3(m1); + gauge_fix_action += (double)trace.real; + + trace = trace_su3(m2); + gauge_fix_action += (double)trace.real; + } + } + + /* Count number of terms to average */ + ndir = 0; FORALLUPDIRBUT(gauge_dir,dir)ndir++; + + /* Sum over all sites of this parity */ + g_doublesum( &gauge_fix_action); + + /* Average is normalized to max of 1/2 on sites of one parity; the divisor is formed in double because an integer product 6*ndir*nx*ny*nz*nt can exceed INT_MAX on large lattices (nx..nt presumably int, declared elsewhere) */ + return(gauge_fix_action /(6.0*(double)ndir*(double)nx*(double)ny*(double)nz*(double)nt)); +} /* get_gauge_fix_action */ + +void gaugefixstep(int gauge_dir,double *av_gauge_fix_action,float relax_boost, + int nvector, field_offset vector_offset[], int vector_parity[], + int nantiherm, field_offset antiherm_offset[], + int antiherm_parity[] ) +{ + /* Carry out one iteration in the gauge-fixing process */ + + int parity; + msg_tag *mtag[8]; + float gauge_fix_action; + register int dir,i; + register site *s; + + /* Alternate parity to prevent interactions during gauge transformation */ + *av_gauge_fix_action = 0.; + g_sync(); + fflush(stdout); + + for(parity = ODD; parity <= EVEN; parity++) + { + /* Start gathers of downward links */ + + 
FORALLUPDIR(dir) + { + mtag[dir] = start_gather( F_OFFSET(link[dir]), sizeof(su3_matrix), + OPP_DIR(dir), parity, gen_pt[dir] ); + } + + /* Wait for gathers */ + + FORALLUPDIR(dir) + { + wait_gather(mtag[dir]); + } + + /* Total gauge fixing action for sites of this parity: Before */ + gauge_fix_action = get_gauge_fix_action(gauge_dir,parity); + + /* Do optimum gauge hit on various subspaces */ + + do_hit(gauge_dir,parity,0,1, relax_boost, + nvector, vector_offset, vector_parity, + nantiherm, antiherm_offset, antiherm_parity); + do_hit(gauge_dir,parity,1,2, relax_boost, + nvector, vector_offset, vector_parity, + nantiherm, antiherm_offset, antiherm_parity); + do_hit(gauge_dir,parity,2,0, relax_boost, + nvector, vector_offset, vector_parity, + nantiherm, antiherm_offset, antiherm_parity); + + /* Total gauge fixing action for sites of this parity: After */ + gauge_fix_action = get_gauge_fix_action(gauge_dir,parity); + + *av_gauge_fix_action += gauge_fix_action; + + /* Scatter downward link matrices by gathering to sites of */ + /* opposite parity */ + + FORALLUPDIR(dir) + { + /* Synchronize before scattering to be sure the new modified link */ + /* matrices are all ready to be scattered and diffmat is not */ + /* overwritten before it is used */ + g_sync(); + + /* First copy modified link for this dir */ + /* from comm buffer or node to diffmat */ + + FORSOMEPARITY(i,s,parity) + { + if(diffmat_offset >= 0) + su3mat_copy((su3_matrix *)(gen_pt[dir][i]),(su3_matrix *)F_PT(s,diffmat_offset)); + else + su3mat_copy((su3_matrix *)(gen_pt[dir][i]), &diffmatp[i]); + } + + /* Now we are finished with gen_pt[dir] */ + cleanup_gather(mtag[dir]); + + /* Synchronize to make sure the previous copy happens before the */ + /* subsequent gather below */ + g_sync(); + + /* Gather diffmat onto sites of opposite parity */ + if(diffmat_offset >= 0) + mtag[dir] = start_gather( diffmat_offset, sizeof(su3_matrix), + dir, OPP_PAR(parity), gen_pt[dir] ); + else + mtag[dir] = 
start_gather_from_temp( diffmatp, sizeof(su3_matrix), + dir, OPP_PAR(parity), gen_pt[dir] ); + + wait_gather(mtag[dir]); + + /* Copy modified matrices into proper location */ + + FORSOMEPARITY(i,s,OPP_PAR(parity)) + su3mat_copy((su3_matrix *)(gen_pt[dir][i]),&(s->link[dir])); + + cleanup_gather(mtag[dir]); + } + + } +} /* gaugefixstep */ + + +void gaugefix(int gauge_dir,radix relax_boost,int max_gauge_iter, + radix gauge_fix_tol, su3_matrix *diffmat, su3_vector *sumvec, + int nvector, field_offset vector_offset[], int vector_parity[], + int nantiherm, field_offset antiherm_offset[], + int antiherm_parity[] ) +{ /* Repeats gaugefixstep until the average action changes by less than gauge_fix_tol or max_gauge_iter steps are done, then reports on node 0; diffmat/sumvec may be NULL, in which case scratch fields are allocated here and freed before returning */ + int gauge_iter; + int alloc_diffmat,alloc_sumvec; + double current_av = 0.0, old_av = 0.0, del_av = 0.0; /* current_av and del_av are printed below even when the loop ran 0 or 1 steps */ + + alloc_diffmat = alloc_sumvec = 0; /* NOTE(review): nothing in this function stores diffmat/sumvec in diffmatp/sumvecp or the *_offset globals that accum_gauge_hit and do_hit read - confirm the wiring */ + if (diffmat == NULL) { + diffmat = new_latfield( su3_matrix ); + alloc_diffmat = 1; + } + if (sumvec == NULL) { + sumvec = new_latfield( su3_vector ); + alloc_sumvec = 1; + } + + /* Do at most max_gauge_iter iterations, but stop after the second step if */ + /* the change in the avg gauge fixing action is smaller than gauge_fix_tol */ + + for (gauge_iter=0; gauge_iter < max_gauge_iter; gauge_iter++) + { + gaugefixstep(gauge_dir,&current_av,relax_boost, + nvector, vector_offset, vector_parity, + nantiherm, antiherm_offset, antiherm_parity); + + if(gauge_iter != 0) + { + del_av = current_av - old_av; + if (fabs(del_av) < gauge_fix_tol) break; + } + old_av = current_av; + } + /* Free workspace */ + if (alloc_sumvec) free_latfield( sumvec ); + if (alloc_diffmat) free_latfield( diffmat ); + + if(this_node==0) + printf("GFIX WITHOUT REUNITARIZATION: Ended at step %d. 
Av gf action %.3e, delta %.3e\n", + gauge_iter,(double)current_av,(double)del_av); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gaugefix2.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gaugefix2.c new file mode 100644 index 0000000000000000000000000000000000000000..193f60bf080d9faa8ad04b6a8c34cf002b68a707 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/gaugefix2.c @@ -0,0 +1,332 @@ +/************************** gaugefix2.c *******************************/ +/* Fix Coulomb or Lorentz gauge by doing successive SU(2) gauge hits */ +/* Uses double precision global sums */ +/* This version does automatic reunitarization at preset intervals */ +/* MIMD version 6 */ +/* C. DeTar 10-22-90 */ +/* T. DeGrand 1993 */ +/* U.M. Heller 8-31-95 */ +/* C. DeTar 10-11-97 converted to generic */ +/* C. DeTar 12-26-97 added automatic reunitarization */ +/* C. DeTar 11-24-98 remove superfluous references to p2 (was for ks phases) */ + +/* Heavily modified by Kari Rummukainen 2005-6 */ + +/* Prototype... + +void gaugefix(int gauge_dir,double relax_boost,int max_gauge_iter, + double gauge_fix_tol, suN_matrix gauge ); + + if gauge == NULL do not return the gauge + + ------------------------------------------------------------------- + + NOTE: For staggered fermion applications, it is necessary to remove + the KS phases from the gauge links before calling this procedure. + See "rephase" in setup.c. 
+ + ------------------------------------------------------------------- + EXAMPLE: Fixing only the link matrices to Coulomb gauge with scratch + space in mp (suN_matrix) and chi (suN_vector): + + gaugefix(TUP,1.5,500,1.0e-7,NULL); + + ------------------------------------------------------------------- + EXAMPLE: Fixing Coulomb gauge with respect to the y direction + in the staggered fermion scheme and simultaneously transforming + the pseudofermion fields and gauge-momenta involved in updating: + + int nvector = 3; + suN_vector * vec[3] = {g_rand, phi, xxx }; + int vector_parity[3] = { EVENODDODD, EVEN, EVEN }; + int nantiherm = 4; + int antiherm_parity[4] = { EVENODD, EVENODD, EVENODD, EVENODD } + + rephase( OFF ); + gauge = new_latfield(suN_matrix); + gaugefix( YUP, 1.8, 500, 2.0e-6, gauge ); + vec_fix_gauge( gauge, g_rand, EVENODD ); + vec_fix_gauge( gauge, phi, EVEN ); + vec_fix_gauge( gauge, xxx, EVEN ); + foralldir(d) ahmat_fix_gauge( gauge, mom[d], EVENODD ); + free_latfield( gauge ); + + rephase( ON ); + + ------------------------------------------------------------------- + + gauge_dir specifies the direction of the "time"-like hyperplane + for the purposes of defining Coulomb or Lorentz gauge + TUP for evaluating propagators in the time-like direction + ZUP for screening lengths. 
+ -1 for Lorentz gauge + relax_boost Overrelaxation parameter + max_gauge_iter Maximum number of iterations + gauge_fix_tol Stop if change is less than this +*/ + +#include "lattice.h" +#define REUNIT_INTERVAL 20 + +#ifdef SU2 +DOES NOT WORK YET FOR SU2 +#endif + +/* CDIF(a,b) a -= b */ + /* a -= b */ +#define CDIF(a,b) { (a).real -= (b).real; (a).imag -= (b).imag; } + +/* Scratch space */ + +void accum_gauge_hit(int i, int gauge_dir, + su3_matrix *diffmat, su3_vector *sumvec ) +{ + +/* Accumulates sums and differences of link matrices for determining optimum */ +/* hit for gauge fixing */ +/* Differences are kept in diffmat and the diagonal elements of the sums */ +/* in sumvec */ + + register int j; + register su3_matrix *m1; + register int dir; + + /* Clear sumvec and diffmat */ + + clear_su3mat( diffmat ); + clearvec( sumvec ); + + /* Subtract upward link contributions */ + + foralldir(dir) if (dir != gauge_dir) { + int odir = opp_dir(dir); + + m1 = &(U[dir][i]); + sub_su3_matrix( diffmat, m1, diffmat); + /* Sum diagonal part */ + for(j=0; jc[j], m1->e[j][j] ); + + + /* Add downward link contributions */ + + m1 = &U[dir][nb(odir,i)]; + add_su3_matrix( diffmat, m1, diffmat ); + for(j=0; jc[j], m1->e[j][j] ); + } +} /* accum_gauge_hit */ + + +void do_hit(int gauge_dir, int parity, double relax_boost, su3_matrix *gauge ) +{ + /* Do optimum SU(2) gauge hit for p, q subspace */ + + double a0,a1,a2,a3,asq,a0sq,x,r,xdr; + int dir,i,p,q; + su2_matrix u; + su3_matrix diffmat; + su3_vector sumvec; + + /* Accumulate sums for determining optimum gauge hit - + * U's must have been fetched from down! */ + + /* accum_gauge_hit( gauge_dir, parity, diffmat, sumvec); */ + + forparity(i,parity) for (p=0; p = 1/2 + * This requires a random number generator named "dran()", returning + * a float uniformly distributed between zero and one. 
+ */ + +#include +#include +#include + +#include "mersenne.h" +#define dran() mersenne() + +double gaussian_ran() +{ + static int iset=0; + static double gset; + register double fac,r,v1,v2; + + if (iset) { + iset = 0; + return(gset); + } + + do { + v1 = 2.0*dran() - 1.0; + v2 = 2.0*dran() - 1.0; + r = v1*v1 + v2*v2; + } while (r >= 1.0); + fac = sqrt( -log(r)/r ); + gset = v1*fac; + iset = 1; + return(v2*fac); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic.h b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic.h new file mode 100644 index 0000000000000000000000000000000000000000..2cbf58c8e711525feb7d74cbb69599fd9975aee1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic.h @@ -0,0 +1,76 @@ +/************************************************************************** + * Header file where to define headers for generic functions + * (not for communications, which are defined in comdefs.h) + */ + +/******************** Mersenne random numbers ***/ + +#include "mersenne.h" + +/** +#define MERSENNE_N 624 + +extern int mersenne_i; +extern double mersenne_array[MERSENNE_N]; + +#define mersenne() ( mersenne_i > 0 ? mersenne_array[--mersenne_i] : \ + mersenne_generate(&mersenne_i) ) + +void seed_mersenne(long); +double mersenne_generate(int *); + +**/ + +#define dran() mersenne() + + +/********************* General protos ***/ + +void initial_setup(); +void initialize_prn(long seed); +void *halt(char *); +double gaussian_ran(); +void restore_binary(FILE * f); +void save_binary(FILE * f); + +/*********************** Some defines ****/ + +#define smaller(a,b) ((a)<(b)? (a) : (b)) +#define greater(a,b) ((a)>(b)? 
(a) : (b)) +#define sqr(x) ((x)*(x)) +#define printf0 if (this_node != 0) { } else printf + +/********************** Parameter_io.c ***/ + +double get_d(FILE *f,char *s,int bcast); +int get_i(FILE *f,char *s,int bcast); +int get_s(FILE *f,char *s,char *target,int bcast); +int get_item(FILE *f,char *s,char *items[],int n_items, int bcast); +void print_d(FILE *f,char *s,double val); +void print_i(FILE *f,char *s,int val); +void print_s(FILE *f,char *s,char *val); + +/************************* TIMING STUFF **/ + +double added_cpu_time(); +double cputime(); +void timecheck(int iter, int maxiter, int status); +void inittimecheck(void); +int setup_timelimit(time_t t,int argc,char *argv); +void resettime(void); +void inittime(void); + +#define addtime(t) t += added_cpu_time() + +/************************ Multicanonical headers **/ + +int setmulti(); +double multi_weight(); +int mc_acceptance(int parity,double rt); +void set_mc_update(int parity); +void writemuca(); + +EXTERN int is_multicanonical, is_mucacalc; + +/**********************/ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_complex.h b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_complex.h new file mode 100644 index 0000000000000000000000000000000000000000..10904fda5f779915ae001b78fe1115db3a46c775 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_complex.h @@ -0,0 +1,21 @@ +/* Definitions which convert generic MATRIX definitions to complex + */ +#define MATRIX complex +#define mult_MATRIX_nn(a,b,c) c_mul_nn( a, b, c ) +#define mult_MATRIX_na(a,b,c) c_mul_ni( a, b, c ) +#define mult_MATRIX_an(a,b,c) c_mul_in( a, b, c ) +#define mult_MATRIX_aa(a,b,c) c_mul_ii( a, b, c ) +#define add_MATRIX(a,b,c) c_add( a, b, c ) +#define scalar_mul_MATRIX(a,s,b) c_scalar_mul( a, s, b ) + +void smooth_field_complex(complex *link[NDIM], complex *cphi, int d[NDIM], + double c_mul_0, double c_mul_1); + +complex *block_field_complex(complex *f, int newlev[NDIM], int 
free_old); + +void block_link_complex( complex *oldl[NDIM], complex *newl[NDIM], + int newlev[NDIM], int free_old ); + +void smooth_link_complex( complex *link[NDIM], int d1[NDIM], int d2[NDIM], + double c_mul_0, double c_mul_1); + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_su2.h b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_su2.h new file mode 100644 index 0000000000000000000000000000000000000000..437e95493b5da3dc402140822e084b3fd08d716b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_su2.h @@ -0,0 +1,25 @@ +/* Definitions which convert generic MATRIX definitions to su2 + */ + +#define MATRIX su2_matrix + +#define mult_MATRIX_nn(a,b,c) mult_su2_nn( (a), (b), (c) ) +#define mult_MATRIX_na(a,b,c) mult_su2_na( (a), (b), (c) ) +#define mult_MATRIX_an(a,b,c) mult_su2_an( (a), (b), (c) ) +#define mult_MATRIX_aa(a,b,c) mult_su2_aa( (a), (b), (c) ) +#define add_MATRIX(a,b,c) add_su2_matrix( (a), (b), (c) ) +#define scalar_mul_MATRIX(a,s,b) su2_scalar_mul( a, s, b ) + +#define prefetch_MATRIX( a ) prefetch_matrix( a ) + +void smooth_field_su2adjoint(su2_matrix *link[NDIM], adjoint *cphi, int d[NDIM], + double c_mul_0, double c_mul_1); + +adjoint *block_field_su2adjoint(adjoint *f, int newlev[NDIM], int free_old); + +void block_link_su2( su2_matrix *oldl[NDIM], su2_matrix *newl[NDIM], + int newlev[NDIM], int free_old ); + +void smooth_link_su2( su2_matrix *link[NDIM], int d1[NDIM], int d2[NDIM], + double c_mul_0, double c_mul_1); + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_su3.h b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_su3.h new file mode 100644 index 0000000000000000000000000000000000000000..eb7f5b72abbb553f92216cbb737f507668d1ce36 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/generic_su3.h @@ -0,0 +1,31 @@ +/* Definitions which convert generic MATRIX definitions to su3 + */ + + +#define MATRIX su3_matrix + +#define 
mult_MATRIX_nn(a,b,c) mult_su3_nn( &(a), &(b), &(c) ) +#define mult_MATRIX_na(a,b,c) mult_su3_na( &(a), &(b), &(c) ) +#define mult_MATRIX_an(a,b,c) mult_su3_an( &(a), &(b), &(c) ) +#define mult_MATRIX_aa(a,b,c) mult_su3_aa( &(a), &(b), &(c) ) +#define add_MATRIX(a,b,c) add_su3_matrix( &(a), &(b), &(c) ) +#define scalar_mul_MATRIX(a,s,b) scalar_mult_su3_matrix( &(a), s, &(b) ) + +#define prefetch_MATRIX( a ) prefetch_matrix( a ) + +void smooth_field_su3adjoint(su3_matrix *link[NDIM], adjoint_matrix *cphi, int d[NDIM], + double c_mul_0, double c_mul_1); + +adjoint_matrix *block_field_su3adjoint(adjoint_matrix *f, int newlev[NDIM], int free_old); + +void block_link_su3( su3_matrix *oldl[NDIM], su3_matrix *newl[NDIM], + int newlev[NDIM], int free_old ); + +void smooth_link_su3( su3_matrix *link[NDIM], int d1[NDIM], int d2[NDIM], + double c_mul_0, double c_mul_1); + +void reunitarize( su3_matrix *U[NDIM]); + +#ifdef SSE_INLINE +#include "../sse/inline_sse.h" +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/io_lattice_generic.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/io_lattice_generic.c new file mode 100644 index 0000000000000000000000000000000000000000..11472b3db38424912141d8692c91ed56e24b50ad --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/io_lattice_generic.c @@ -0,0 +1,313 @@ +/*********************** io_lattice_generic.c *************************/ +/* This reads and writes a (binary) lattice + * + * NOTE: THIS HAS TO BE ENCAPSULATED BY A FILE + * io_lattice.c + * WHICH DEFINES + * + * typedef struct { } allfields; + * + * void copy_fields(int site, allfields *s) copy from all latfields to s.(whatever) + * void set_fields(allfields *s, int site) copy s.(stuff) to lattice fields + * + * #include "../generic/io_lattice_generic.c" + */ + +/* read and write a binary lattice */ + +void save_binary_lattice(FILE *f); +void restore_lattice_slow(FILE * f); +void restore_lattice_fast(FILE * f); + +/* size of max comm 
buffer, in bytes - used in fast routines */ +#define N_MAX_COMMSIZE 50000 + +/* stored field */ + +typedef struct { + allfields a; + int x[NDIM]; +} stored_field; + + +void restore_binary(FILE * f) +{ + int j; + + if(this_node==0) { + int ok,i; + int dims[NDIM]; + + ok = (fread(&j,sizeof(int),1,f) == 1); + if (ok && j != NDIM) { + printf("* Lattice dimension error: in file %d, expecting %d\n",j,NDIM); + halt(" ####"); + } + + ok = ok && (fread(dims,sizeof(int),NDIM,f) == NDIM); + j = 0; + foralldir(i) if (dims[i] != lattice.size[i]) j = 1; + if (ok && j) { + printf("* Lattice size error: in file "); + for(i=0; i 1) { + /* # of nodes is the same, layout is likely the same - use + * fast io mode + */ + printf(" Loading config with fast I/O\n"); + j = 1; + } else { + printf(" Loading config with slow I/O\n"); + j = 0; /* or not */ + } + if (!ok) halt("config I/O error"); + } + + broadcast_int(&j); + + if (j) restore_lattice_fast(f); + else restore_lattice_slow(f); +} + + +void save_binary(FILE *f) +{ + + /* node 0 does all the writing */ + if(this_node==0){ + int i,j; + + i = NDIM; + j = sizeof(allfields); + if (fwrite(&i,sizeof(int),1,f) != 1 || + fwrite(lattice.size,sizeof(int),NDIM,f) != NDIM || + fwrite(&j,sizeof(int),1,f) != 1 || + fwrite(&number_of_nodes,sizeof(int),1,f) != 1) + halt("Error in writing lattice header"); + } + + save_binary_lattice(f); +} + +/************************************************************* + * Slow restore routine - 1 message/site + */ + +void restore_lattice_slow(FILE * f) +{ + stored_field nf; + + g_sync(); + + if (this_node == 0) { + int *sent; + int l,i,newnode; + + /* Node 0 reads, and sends site to correct node */ + + sent = (int *)memalloc(number_of_nodes,sizeof(int)); + for (l=0; l N_MAX_COMMSIZE) + buf_size = N_MAX_COMMSIZE/sizeof(stored_field); + else buf_size = node.sites; + + n_messages = node.sites / buf_size; + last_size = node.sites % buf_size; + if (last_size > 0) n_messages++; else last_size = buf_size; + + if 
(this_node == 0) { + + /* first read own stuff */ + forallsites(i) { + if (fread(&nf,sizeof(stored_field),1,f) != 1) + halt("Read error in restore_binary_fast"); + if (this_node != node_number(nf.x)) + halt ("Node number error in restore_binary_fast"); + idx = node_index(nf.x,&node); + set_fields( &nf.a, idx); + } + + if (number_of_nodes > 1) { + buf = (stored_field *)memalloc(buf_size,sizeof(stored_field)); + + for (n=1; n N_MAX_COMMSIZE) + buf_size = N_MAX_COMMSIZE / sizeof(stored_field); + else buf_size = node.sites; + + n_messages = node.sites / buf_size; + last_size = node.sites % buf_size; + if (last_size > 0) n_messages++; else last_size = buf_size; + + if (this_node == 0) { + + /* first write own stuff */ + forallsites(i) { + copy_fields(i, &nf.a); + foralldir(s) nf.x[s] = coordinate(i,s); + if (fwrite(&nf,sizeof(stored_field),1,f) != 1) + halt("Write error in save_binary"); + } + + if (number_of_nodes > 1) { + buf = (stored_field *)memalloc(buf_size,sizeof(stored_field)); + + for (n=1; n +#include +#include +#include "complex.h" +#include "su3.h" +#include "comdefs.h" +#include "generic.h" +#include "generic_su3.h" + +#ifndef check +#define check_action(a) /* nothing */ +#endif + +#define MAX_BOP 5 /* max number of blockings */ + +/* The following are global scalars */ +EXTERN long seed; /* random number seed */ +EXTERN int mc_steps,n_measurement,n_save; +EXTERN int n_iteration,n_thermal,iteration; +EXTERN double betag; +#ifdef HIGGS +EXTERN double p_x,p_y,betaA,beta4,beta2,betay; +EXTERN int n_correlation,w_correlation; +#endif + +EXTERN double wvalue; /*for multicanonical */ +EXTERN double timeu,timea,timerest; +EXTERN double ahitu,ahitua,ahithb,ahitax,ahitmc,ahitog; /* hit*/ +EXTERN int nhitu,nhitua,nhithb,nhitax,nhitmc,nhitog; +EXTERN int meas_sync,corr_sync; + +#ifdef HIGGS +/* correlation function globals */ +EXTERN int corrlen,n_corr; +EXTERN int n_bop,n_blocking,b_level[MAX_BOP]; + +/* correlation function pointers */ +#define N_CORR 8 +EXTERN 
float *c_array; +EXTERN float *cr2[MAX_BOP],*cr3[MAX_BOP],*ch0[MAX_BOP],*ch1[MAX_BOP]; +EXTERN float *cH0[MAX_BOP],*cH1[MAX_BOP],*cp0[MAX_BOP],*cp1[MAX_BOP]; + +#define b_const_a1 0.2 +#define b_const_a2 (0.25*(1.0-b_const_a1)) +#define b_const_g1 0.334 +#define b_const_g2 (0.5*(1.0-b_const_g1)) + +#endif + +/***************************************************************** + * Field variables + */ + +EXTERN su3_matrix *U[NDIM]; +#ifdef HIGGS +EXTERN adjoint_matrix *ahiggs; +#endif + +/*****************************************************************/ + +#define confname "config" + +/* PABS replace status by kernel_B.input.status */ +#define statname "kernel_B.input.status" + +#define measurename "measure" +#define corrname "correl" +#define wlname "wloop" + +/* PABS replace beta by kernel_B.input.beta */ +#define betaname "kernel_B.input.beta" + +#define weightname "weight" + +/* PABS replace parameters by kernel_B.input.parameters */ +#define paramname "kernel_B.input.parameters" + +#ifndef T3E +#define prefetch_adjoint(x) /* nothing */ +#define prefetch_matrix(x) /* nothing */ +#endif + +void reunitarize(su3_matrix *link[NDIM]); +int setup(void); +void load_config(int status); +void updatehiggs(int isover); +void measure(); void writemeas(); void hcorr(); void writecorr(); +void setfiles(int restart); +void dumpall(int status,int * maxiters); +void updategauge(int isrelax); +void relax(int dir, int parity, su3_matrix *link[NDIM], su3_matrix *staple +#ifdef HIGGS + , su3_matrix *ac +#endif + ); +void monte(int dir, int parity, su3_matrix *link[NDIM], su3_matrix *staple +#ifdef HIGGS + , su3_matrix *ac +#endif + ); +void staples_su3(su3_matrix *link[NDIM], su3_matrix *staple, int dir1,int parity); +double Xoverrelax(int parity, adjoint_matrix *ahiggs, adjoint_matrix *astaple); +double HBHiggs(int parity, adjoint_matrix *ahiggs, adjoint_matrix *astaple); +double act_gauge_adj(su3_matrix *a, su3_matrix *u,adjoint_matrix *b); + +complex measure_ploop(su3_matrix 
*link[NDIM], int dir); + +void staple1(int i, int dir1, MATRIX *link[NDIM], MATRIX *staple) ; + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/layout.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/layout.c new file mode 100644 index 0000000000000000000000000000000000000000..60503ff611050c4855b6d6d41cdcda501fbbe6c2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/layout.c @@ -0,0 +1,733 @@ +/**************************************************************** + * * + * Hypercubic lattice layout routines * + * Based on MILC lattice QCD code, pretty much modified * + * * + * These determine the distribution of sites on nodes, * + * and do the necessary setting up. * + * * + ***************************************************************/ + +#include "comdefs.h" +#include "generic.h" + +/* static variables for node calculations */ +int squaresize[NDIM]; /* dimensions of hypercubes */ +int nsquares[NDIM]; /* number of hypercubes in each direction */ + +/* GLOBALS for communications; needed by com_XXX.c and block_lattice.c */ +node_struct *allnodes; /* structure for all nodes on this run */ +comlist_struct *comlist; /* gather pointer for all gathers */ + +#define swap(a,b) {register int t; t=a; a=b; b=t; } + +void setup_layout( int siz[NDIM] ); +void test_gather( lattice_struct *lat ); +void make_gathers( lattice_struct *lat ); + +/*************************************************************** + * BASIC CALL FOR SETUP + * + * setup_lattice(int size[NDIM]); + */ + +void setup_lattice(int siz[NDIM]) +{ + + /* first, do the basic lattice layout */ + setup_layout( siz ); + + /* then, set up the comm arrays */ + make_lattice_arrays( &lattice ); + +#ifdef MPI + /* Initialize wait_array structures */ + initialize_wait_arrays(); +#endif +} + +/***************************************************************/ + +/* number of primes to be used in factorization */ +#define NPRIMES 4 +static int prime[NPRIMES] = {2,3,5,7}; + +/* Set up 
now squaresize and nsquares - arrays + * Print info to stdout as we proceed + */ + +void setup_layout( int siz[NDIM] ) +{ + int n,i,j,dir,nfactors[NPRIMES]; + + if(mynode()==0){ + printf(" LAYOUT OF THE LATTICE:\n %d dimensions, layout options: ",NDIM); +#ifdef GRAYCODE + printf("GRAYCODE "); +#endif +#ifdef EVENFIRST + printf("EVENFIRST "); +#endif + printf("\n"); + fflush(stdout); + } + + /* reset the blocking level (just in case) */ + foralldir(dir) current_blocking_level[dir] = 0; + + /* static global */ + this_node = mynode(); + + /* Figure out dimensions of rectangle */ + + lattice.volume = 1; + foralldir(dir) { + nsquares[dir] = 1; + squaresize[dir] = lattice.size[dir] = siz[dir]; + lattice.volume *= lattice.size[dir]; + } + + /* store the baseline */ + base_lattice = lattice; + + if (lattice.volume % numnodes()) { + printf0(" No hope of laying out the lattice using %d nodes\n",numnodes()); + finishrun(); + } + + /* Factorize the node number in primes + * These factors must be used in slicing the lattice! + */ + i = numnodes(); + for (n=0; n=0; n--) for(i=0; ij && squaresize[dir]%prime[n] == 0 ) j=squaresize[dir]; + + /* if one direction with largest dimension has already been + divided, divide it again. Otherwise divide first direction + with largest dimension. */ + + for (dir=0; dir1 && + squaresize[dir]%prime[n] == 0) break; + + /* not previously sliced, take one direction to slice */ + if (dir >= NDIM) for (dir=0; dir= NDIM) { + /* This cannot happen! */ + printf("CANNOT HAPPEN! 
in layout.c\n"); + finishrun(); + } + + /* Now slice it */ + squaresize[dir] /= prime[n]; nsquares[dir] *= prime[n]; + + } + + if (mynode() == 0) { + printf(" Processor layout: "); + foralldir(dir) { + if (dir > 0) printf(" x "); + printf("%d",nsquares[dir]); + } + printf("\n Sites on node: "); + foralldir(dir) { + if (dir > 0) printf(" x "); + printf("%d",squaresize[dir]); + } + printf("\n"); + } +} + +/**************** Get the node number for (BLOCKED) coordinates */ +int node_number(int loc[NDIM]) +{ + register int i,dir; + + i = (loc[NDIM-1] << current_blocking_level[NDIM-1]) / squaresize[NDIM-1]; + for (dir=NDIM-2; dir>=0; dir--) { + i = i*nsquares[dir] + + ((loc[dir] << current_blocking_level[dir]) / squaresize[dir]); + } + +#ifdef GRAYCODE + return( i ^ (i>>1) ); /* Gray code of i */ +#else + return( i ); +#endif +} + +/************** fast routine for clarifying if we're on THIS node */ + +int is_on_node(int loc[NDIM]) +{ + register int d,dir; + + foralldir(dir) { + d = loc[dir] - node.xmin[dir]; + if (d < 0 || d >= node.nodesize[dir] ) return(0); + } + return(1); +} + +/************** give site index for ON NODE sites */ + +int node_index(int loc[NDIM], node_struct *node) +{ + int dir,l,i,s; + + i = l = loc[NDIM-1] - node->xmin[NDIM-1]; + s = loc[NDIM-1]; + for (dir=NDIM-2; dir>=0; dir--) { + l = loc[dir] - node->xmin[dir]; + i = i*node->nodesize[dir] + l; + s += loc[dir]; + } + + /* now i contains the `running index' for site */ +#ifdef EVENFIRST + if (s%2 == 0) return( i/2 ); /* even site index */ + else return( i/2 + node->evensites ); /* odd site */ +#else + return( i ); +#endif +} + +/****************************************************** + * routines for stepping through the lattice in + * coordinates, as in + * #define forallcoordinates(x) \ + * for(zero_arr(x); is_coord(x,&lattice); step_coord(x,&lattice) ) + */ + +void zero_arr(int x[NDIM]) { register int d; foralldir(d) x[d] = 0; } + +int is_allowed_coord(int x[NDIM],lattice_struct *l) +{ + int 
d,i; + i = 1; + foralldir(d) i = (i && (x[d] >= 0) && (x[d] < l->size[d])); + return(i); +} + +void step_coord(int x[NDIM],lattice_struct *l) +{ + int d; + + for(d=0; d= l->size[d]; x[d++] = 0) ; + + /* check if the lattice is 'full' */ + if (d >= NDIM) x[NDIM-1] = l->size[NDIM-1]; +} + + +/************** + * set up the node structure for all of the nodes in + * the run (for BLOCKED coordinates). + */ + +void setup_node( int loc[NDIM], node_struct *n ) +{ + register int offset,dir,blev,l,c0,c1,s; + + n->sites = 1; + s = 0; + foralldir(dir) { + blev = 1 << current_blocking_level[dir]; + l = loc[dir] << current_blocking_level[dir]; /* normalized coord */ + offset = l % squaresize[dir]; /* normalized coord from node 'origin' */ + c0 = l - offset; /* coordinate of the origin */ + c1 = c0 + squaresize[dir] - 1; /* coordinate of the last point */ + + /* calculate the coordinate of the first blocked point on the + * node. If the origin is divisible by the blocking factor, + * then it belongs to the blocked lattice and the coordinate is + * just l0 / blev. However, if not, then the first blocked point + * is l0 / blev + 1. + */ + if (c0 % blev == 0) n->xmin[dir] = c0 >> current_blocking_level[dir]; + else n->xmin[dir] = (c0 >> current_blocking_level[dir]) + 1; + + /* Now the coordinate of the last blocked point. This is + * always c1/blev, regardless if it is divisible or not. 
+ */ + + c1 = c1 >> current_blocking_level[dir]; + + /* now the length of the blocked lattice */ + n->nodesize[dir] = c1 - n->xmin[dir] + 1; + + /* need to accumulate size */ + n->sites *= n->nodesize[dir]; + /* and parity of the origin */ + s += n->xmin[dir]; + } + + if ( n->sites % 2 ) { + /* now odd sized node */ + if ( s % 2 == 0) n->evensites = n->sites/2 + 1; + else n->evensites = n->sites/2; + n->oddsites = n->sites - n->evensites; + } else { + n->evensites = n->oddsites = n->sites/2; + } +} + +/************************************************************ + * set up the node struct for all nodes + */ + +node_struct * setup_nodes(lattice_struct *lat) +{ + int i,l,d,n,x[NDIM]; + node_struct *p; + + /* allocate the node array */ + p = (node_struct *)memalloc( l=numnodes(), sizeof(node_struct) ); + for (i=0; isize[d]; /* neighbour of site */ + } else { + k = opp_dir(d); + x[k] = (x[k] - 1 + lat->size[k]) % lat->size[k]; /* neighbour of site */ + } + if (is_on_node(x)) neighb[d][i] = node_index(x,&node); + else { + nodes[num] = node_number(x); + index[num] = node_index(x, allnodes + nodes[num] ); + parity[num] = site[i].parity; /* parity of THIS */ + here[num] = i; + num++; + } + } + + comlist[d].n_receive = 0; + if (num > 0) { + /* now, get the number of nodes to be gathered from */ + for (i=0; inode; j++) r = &((*r)->next); + if (j == comlist[d].n_receive) { + /* NEW NODE to receive from */ + comlist[d].n_receive++; + (*r) = p = (receive_struct *)memalloc(1,sizeof(receive_struct)); + /* and fill in the node structure */ + p->node = nodes[i]; + p->n = 1; /* first site */ + p->n_even = p->n_odd = 0; + if ( parity[i] == EVEN ) p->n_even = 1; else p->n_odd = 1; + p->next = NULL; + } else { + /* add to OLD NODE */ + p = *r; + p->n ++; + if ( parity[i] == EVEN ) p->n_even ++; else p->n_odd ++; + } + } + + /* Calculate the offsets for the gathers */ + for (j=0, p=comlist[d].from_node; jnext) { + p->offset = c_offset; + c_offset += p->n; /* and increase the offset */ + 
} + + /* and NOW, finish the NEIGHBOR array */ + + for (j=0, p=comlist[d].from_node; jnext) { + /* Now, accumulate the locations to itmp-array, and sort the + * array according to the index of the sending node . + * First even neighbours + */ + for (par=EVEN; par<=ODD; par++) { + for (n=i=0; inode && parity[i] == par) { + itmp[n++] = i; + /* bubble sort the tmp-array */ + for (k=n-1; k > 0 && index[itmp[k]] < index[itmp[k-1]]; k--) + swap( itmp[k], itmp[k-1] ); + } + off = p->offset; + if (par == ODD) off += p->n_even; + /* finally, root indices according to offset */ + for (k=0; k 0 */ + + /* receive done, now opposite send. This is just the gather + * inverted + */ + + od = opp_dir(d); + comlist[od].n_send = comlist[d].n_receive; + + if (num > 0) { + p = comlist[d].from_node; + for (j=0, s=&(comlist[od].to_node); jnext), p = p->next) { + (*s) = q = (send_struct *)memalloc(1,sizeof(send_struct)); + q->node = p->node; + q->n = p->n; + q->n_even = p->n_odd; /* Note the swap ! even/odd refers to type of gather */ + q->n_odd = p->n_even; + q->next = NULL; + q->sitelist = (int *)memalloc(q->n, sizeof(int)); + + /* now, initialize sitelist -- Now, we first want ODD parity, since + * this is what even gather asks for! + */ + + for (n=0,par=ODD; par>=EVEN; par--) { + for (i=0; inode && parity[i] == par) { + (q->sitelist)[n++] = here[i]; + } + if (par == ODD && n != q->n_even) halt("Parity odd error 3"); + if (par == EVEN && n != q->n) halt("Parity even error 3"); + } + } + } + } /* directions */ + + free(nodes); + free(index); + free(parity); + free(here); + free(itmp); + + /* Finally, set the site to the final offset (better be right!) 
*/ + node.latfield_size = c_offset; + +} + + +/************************************************************************ + * Do some test to validate the correctness of the gather + */ + +typedef struct t { + int x[NDIM],parity; +} tst_struct; + + +void gather_test_error( char *abuse, int dir, tst_struct *a, + tst_struct *n, int par ) +{ + int l; + + printf(" *** %s, parity %d, from dir %d: ( ",abuse,par,dir); + foralldir(l) printf("%d ",a->x[l]); + printf(") -> ( "); + foralldir(l) printf("%d ",n->x[l]); + printf("), parity %d -> %d\n",a->parity,n->parity); +} + + +void test_gather( lattice_struct *lat ) +{ + int i,d,k,j,n,off,dir,n_err,par,checkparity; + tst_struct *a; + msg_tag *tag[NDIM]; + + a = new_latfield( tst_struct ); + + /* ignore parity if blocked lattice - usually OK */ + checkparity = 1; + foralldir(d) if (current_blocking_level[d]) checkparity = 0; + + n_err = 0; + for (k=0; k<2; k++) { + for (par=EVEN; par<=EVENODD; par++) { + + forallsites(i) { + foralldir(d) a[i].x[d] = site[i].x[d]; + a[i].parity = site[i].parity; + } + + foralldir(d) { + if (k) dir = opp_dir(d); else dir = d; + tag[d] = start_get( a, dir, par ); + } + + foralldir(d) { + if (k) dir = opp_dir(d); else dir = d; + + wait_get(tag[d]); + + if (is_up_dir(dir)) off = 1; else off = lat->size[d] - 1; + + forparity(i,par) foralldir(j) { + n = nb(dir,i); + if (( j != d && a[n].x[j] != a[i].x[j]) || + ( j == d && a[n].x[j] != ((a[i].x[j] + off) % lat->size[d])) +#ifndef IGNORE_PARITY + || (( a[i].parity != opp_parity(a[n].parity)) && checkparity ) +#endif + ) { + if (n_err < 10) + gather_test_error("HALOO! Gather error",dir,a+i,a+n,par); + n_err ++; + } + } + } + } + } + + /* test scatter too - inverse. 
Sensible only for EVEN or ODD */ + /* can be up or down */ + for (dir=0; dirsize[d]); + else if (d == odir) + a[n].x[d] = ((site[i].x[d] - 1 + lat->size[d])%lat->size[d]); + else + a[n].x[d] = site[i].x[d]; + } + a[n].parity = opp_parity(site[i].parity); + } + + wait_put( start_put( a, dir, par ) ); + + forparity(i,opar) { + error = 0; +#ifndef IGNORE_PARITY + if (checkparity && a[i].parity != site[i].parity) error = 1; +#endif + foralldir(d) if (a[i].x[d] != site[i].x[d]) error = 1; + if (error) { + if (n_err < 10) gather_test_error("HALOO! Scatter error", + dir,a+i,a+nb(odir,i),par); + n_err ++; + } + } + } + } + + if (n_err > 0) halt(" Lattice layout error (BUG in com_mpi.c or layout.c)"); + else printf0(" Gather/Scatter tests passed\n"); + + free_latfield(a); +} + + +/****************************************************************/ + +char *copy_latfield_func( char *f, int siz ) +{ + char *t; + + t = new_latfield_size( siz ); + memcpy( t, f, (siz * node.latfield_size) ); + return( t ); +} + +/****************************************************************/ + +char *latfield_alloc(int size) +{ + char *t; + char f[150]; + + t = (char *)malloc(node.latfield_size * size + + GATHER_STATUS_SIZE ); + if (t == NULL) { + sprintf(f,"Could not allocate a latfield of %d chars",size); + halt(f); + } + +#ifdef MPI + gather_status_reset( t, size ); +#endif + return( t ); +} + + +char *memalloc(int n, int size) +{ + char *t; + char f[150]; + + t = (char *)malloc(n * size); + if (t == NULL) { + sprintf(f,"Memalloc: could not allocate %d x %d bytes",n,size); + halt(f); + } + return( t ); +} + + +void *halt(char *s) +{ + printf("*** %s\n",s); + terminate(0); + return((void *)NULL); +} + + +void time_stamp(char *msg) +{ + time_t time_stamp; + + if (this_node == 0) { + time(&time_stamp); + if (msg != NULL) printf("%s",msg); + printf("%s\n", ctime(&time_stamp)); + fflush(stdout); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/mersenne.h 
b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/mersenne.h new file mode 100644 index 0000000000000000000000000000000000000000..e2aa28ed5410e3ba426e0b474d41f7b113ee1574 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/mersenne.h @@ -0,0 +1,14 @@ +/*************************************************************** + * mersenne.h + * for the inline version of the mersenne generator + */ + +#define MERSENNE_N 624 + +extern int mersenne_i; +extern double mersenne_array[MERSENNE_N]; + +#define mersenne() ( mersenne_i > 0 ? mersenne_array[--mersenne_i] : mersenne_generate(&mersenne_i) ) + +void seed_mersenne(long a); +double mersenne_generate(int *); diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/mersenne_inline.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/mersenne_inline.c new file mode 100644 index 0000000000000000000000000000000000000000..35c1e2fd05df642c7168b30014beb60c4ec9dd5e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/mersenne_inline.c @@ -0,0 +1,117 @@ +/* A C-program for MT19937: Real number version (1998/4/6) */ +/* genrand() generates one pseudorandom real number (double) */ +/* which is uniformly distributed on [0,1]-interval, for each */ +/* call. sgenrand(seed) set initial values to the working area */ +/* of 624 words. Before genrand(), sgenrand(seed) must be */ +/* called once. (seed is any 32-bit integer except for 0). */ +/* Integer generator is obtained by modifying two lines. */ +/* Coded by Takuji Nishimura, considering the suggestions by */ +/* Topher Cooper and Marc Rieffel in July-Aug. 1997. */ + +/* This library is free software; you can redistribute it and/or */ +/* modify it under the terms of the GNU Library General Public */ +/* License as published by the Free Software Foundation; either */ +/* version 2 of the License, or (at your option) any later */ +/* version. 
*/ +/* This library is distributed in the hope that it will be useful, */ +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */ +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ +/* See the GNU Library General Public License for more details. */ +/* You should have received a copy of the GNU Library General */ +/* Public License along with this library; if not, write to the */ +/* Free Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA */ +/* 02111-1307 USA */ + +/* Copyright (C) 1997 Makoto Matsumoto and Takuji Nishimura. */ +/* When you use this, send an email to: matumoto@math.keio.ac.jp */ +/* with an appropriate reference to your work. */ + +/* REFERENCE */ +/* M. Matsumoto and T. Nishimura, */ +/* "Mersenne Twister: A 623-Dimensionally Equidistributed Uniform */ +/* Pseudo-Random Number Generator", */ +/* ACM Transactions on Modeling and Computer Simulation, */ +/* Vol. 8, No. 1, January 1998, pp 3--30. */ + +#include +#include + +/* Period parameters */ +#define N 624 +#define M 397 +#define MATRIX_A 0x9908b0df /* constant vector a */ +#define UPPER_MASK 0x80000000 /* most significant w-r bits */ +#define LOWER_MASK 0x7fffffff /* least significant r bits */ + +/* Tempering parameters */ +#define TEMPERING_MASK_B 0x9d2c5680 +#define TEMPERING_MASK_C 0xefc60000 +#define TEMPERING_SHIFT_U(y) (y >> 11) +#define TEMPERING_SHIFT_S(y) (y << 7) +#define TEMPERING_SHIFT_T(y) (y << 15) +#define TEMPERING_SHIFT_L(y) (y >> 18) + +static unsigned int mt[N]; /* the array for the state vector */ +int mersenne_i = -1; /* < 0 means mt[N] is not initialized */ +double mersenne_array[N]; + +/* initializing the array with a NONZERO seed */ +void +seed_mersenne(long seed) +{ + int mti; + mt[0]= seed & 0xffffffffUL; + for (mti=1; mti> 30)) + mti); + /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */ + /* In the previous versions, MSBs of the seed affect */ + /* only MSBs of the array mt[]. 
*/ + /* 2002/01/09 modified by Makoto Matsumoto */ + mt[mti] &= 0xffffffffUL; + /* for >32 bit machines */ + } + mersenne_i = 0; +} + +double /* generating reals */ +/* unsigned int */ /* for integer generation */ +mersenne_generate(int *dummy) +{ + register unsigned int y; + register int kk; + static unsigned int mag01[2]={0x0, MATRIX_A}; + /* mag01[x] = x * MATRIX_A for x=0,1 */ + + if (mersenne_i < 0) { /* if sgenrand() has not been called, */ + printf("DUMMY: you did not seed the generator!\n"); + exit(0); + } + + /* generate N words at one time */ + + for (kk=0;kk> 1) ^ mag01[y & 0x1]; + } + for (;kk> 1) ^ mag01[y & 0x1]; + } + y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK); + mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1]; + + for (kk=0; kk mc_lim[ri]; ri++); + for ( ; p2 < mc_lim[ri]; ri--); + + /** calculate the corrected weight factor **/ + w = wr1[ri]*p2 + wr0[ri] - wr1[mc_i]*mc_par - wr0[mc_i]; + + nhitmc++; + if (exp((double)w) >= dran()) { + /* accept it */ + ahitmc += 1.0; + mc_par = p2; + radius[parity] = rad; /* save radius */ + mc_i = ri; + ok = 1; + } else { + ok = 0; + } + + if (is_mucacalc) multi_calc(); + + } /* this_node */ + + broadcast_field(&ok,sizeof(int)); + + if ( !ok ) { + j = 0; + forparity(i,parity) { + multi_field[i] = multi_buf[j++]; + } + } + g_sync(); + return ( ok ); +} + +/*************************************************** + * read in the multicanonical weight function + */ + +int readmulti() +{ + FILE *fil; + int i,j; + + if (this_node == 0) { + + if ((fil = fopen(weightname,"r")) == NULL) { + printf(" - Non-multicanonical run\n"); + is_mucacalc = is_multicanonical = 0; + + } else { + + is_multicanonical = 1; + + printf(" ***** Multicanonical - reading weight file\n"); + i = (fscanf(fil,"%d",&nweight) == 1); + + if (i) { + mc_lim = (double *)calloc(nweight+2,sizeof(double)); + wr0 = (double *)calloc(nweight+2,sizeof(double)); + wr1 = (double *)calloc(nweight+2,sizeof(double)); + wrp = (double *)calloc(nweight+2,sizeof(double)); + 
+ for (j=1; j<=nweight && i; j++) + i = (fscanf(fil,"%lg %lg",&mc_lim[j],&wrp[j]) == 2); + } + if (!i) halt(" ** Read error in weight file"); + fclose(fil); + + mc_lim[0] = -1000; mc_lim[nweight+1] = 1000; + wrp[0] = wrp[1]; wrp[nweight+1] = wrp[nweight]; + + if ((fil = fopen(weightwrk,"r")) != NULL) { + is_mucacalc = 1; + + /* allocate balancing arrays */ + num_sweep = (double *)calloc(nweight+2,sizeof(double)); + + fscanf(fil,"%lg %lg %lg %d %d",&w_min,&w_max,&mc_delta,&tunnel,&last_up); + + printf(" ***** Weight function set up, range %g - %g\n",w_min,w_max); + printf(" ***** Starting with delta %g, tunnel %d\n",mc_delta,tunnel); + + fclose(fil); + } else is_mucacalc = 0; + + for (i=0; i 0) { + /* this is restart status */ + double t; + FILE *f; + + printf(" **** Reading MUCA-calculation files\n"); + + f = fopen(weightnew,"r"); + + fscanf(f,"%d\n",&j); + if (j != nweight) halt("Error in `weight.new'\n"); + for (i=1; i 0) num_sweep[id-1] += 3; + if (id > 1) num_sweep[id-2] += 1; + if (id < nweight-1) num_sweep[id+1] += 3; + if (id < nweight-2) num_sweep[id+2] += 1; + + nvisit = (nvisit + 1) % 8; + if (nvisit == 0) calcmulti(); +} + +/************************************************************* + * calculate muca-function again + */ + +void calcmulti() +{ + int i,j,idown,iup; + + idown = -1; + for (i=0; i mc_lim[i] ) { + if (idown < 0) idown = i; /* first index */ + wrp[i] -= (num_sweep[i] - num_sweep[idown]) * mc_delta/nweight; + iup = i; + } + else if ( w_max*lattice.volume <= mc_lim[i] ) wrp[i] = wrp[iup]; + } + + if ( last_up && num_sweep[idown] ) { + tunnel++; + last_up = 0; + } else if ( !last_up && num_sweep[iup] ) { + tunnel++; + last_up = 1; + } + + if (tunnel >= 2 ) { + mc_delta /= 4; + printf(" ** New multicanonical delta-par: %g\n",mc_delta); fflush(stdout); + tunnel = 0; + } + + for (i=0; i + +/* First, scan the file and find the value */ + +void scan_label_value(FILE *f,char *s,char *fmt,void *val) +{ + char *p,buf[200],line[200]; + + do { + if 
(fgets(line,198,f) == NULL) { + sprintf(buf," *** Error reading input element %s",s); + halt(buf); + } + for (p=line; *p == ' ' || *p == '\t'; p++) ; + } while (*p == '\n'); + + if (strncmp(p,s,strlen(s)) != 0) { + sprintf(buf," *** Input: should be '%s', does not match '%s'",s,p); + halt(buf); + } + p += strlen(s); + if (sscanf(p,fmt,val) != 1) { + sprintf(buf," *** Unable to get the value for input %s",s); + halt(buf); + } +} + + +double get_d(FILE *f,char *s,int bcast) +{ + double val; + + if (this_node == 0) { + scan_label_value(f,s," %lg",&val); + if (bcast >= 0) printf(" %-30s %g\n",s,val); + } else val = 0; + + if (bcast) broadcast_double( &val ); + return(val); +} + + +int get_i(FILE *f,char *s,int bcast) +{ + int val; + + if (this_node == 0) { + scan_label_value(f,s," %d",&val); + if (bcast >= 0) printf(" %-30s %d\n",s,val); + } else val = 0; + + if (bcast) broadcast_int( &val ); + return(val); +} + + +int get_s(FILE *f, char *s, char *target, int bcast) +{ + int len; + if (this_node == 0) { + scan_label_value(f,s," %s",target); + if (bcast >= 0) printf(" %-30s %s\n",s,target); + } + len = strlen(target); + + if (bcast) broadcast_field(target, len); + return(len); +} + +/* get one item from a list */ + +int get_item(FILE *f, char *s, char *items[],int n_items, int bcast) +{ + char label[200]; + int i; + + if (this_node == 0) { + scan_label_value(f,s," %s",label); + if (bcast >= 0) printf(" %-30s %s\n",s,label); + /* Find the matching string */ + for (i=0; i 1.0 ); + + /* make it su2 matrix */ + + rsq = 1.0/sqrt( rsq ); + r0 *= rsq; r1 *= rsq; r2 *= rsq; r3 *= rsq; + + m->e[0][0] = cmplx( r0, r3 ); + m->e[0][1] = cmplx( r2, r1 ); + m->e[1][0] = cmplx(-r2, r1 ); + m->e[1][1] = cmplx( r0,-r3 ); + +} + +/* Set su3 matrix to unit matrix + */ + +void su3_one(su3_matrix *r) +{ + register int i,j; + for (i=0; i<3; i++) for (j=0; j<3; j++) { + r->e[i][j].imag = 0; + if (i==j) r->e[i][j].real = 1; else r->e[i][j].real = 0; + } +} + + 
+/****************************************************************** + * a routine for generating a random su3 matrix. + */ + + +void random_su3P( su3_matrix *a, int n_random ) +{ + int i,ina,inb,index1,ii; + su2_cmat u; + /* su3_matrix m; + * complex t; + */ + + su3_one( a ); /* set the matrix first to unity */ + + for (i=0; i inb) { ii=ina; ina=inb; inb=ii;} + + su2_random( &u ); /* get a random su2 */ + + left_su2_hit_n( &u, ina, inb, a ); /* and hit the su3 matrix */ + + /* mat_mul_an( (*a), (*a), m ); + * t = trace_su3( &m ); + * printf("loop %d random: trace %g %g\n",i,t.real,t.imag); + */ + } + if (i % 4 == 0) reunit_su3( a ); /* keep it unitary */ + } +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/reunitarize.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/reunitarize.c new file mode 100644 index 0000000000000000000000000000000000000000..04cb00634d0443aebf5582c4409e8dcbda95a877 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/reunitarize.c @@ -0,0 +1,119 @@ +/*********************** reunitarize.c ***************************/ +/* MIMD version 3 */ + +/* reunitarize the link matrices */ +#include LATDEF + +/* canopy qcdlib code - stolen, of course */ +#define fixsu3(matrix) \ +{ \ + bj0r = (*matrix).e[0][0].real; \ + bj0i = (*matrix).e[0][0].imag; \ + bj1r = (*matrix).e[0][1].real; \ + bj1i = (*matrix).e[0][1].imag; \ + bj2r = (*matrix).e[0][2].real; \ + bj2i = (*matrix).e[0][2].imag; \ + ar = (*matrix).e[1][2].real; \ + ai = (*matrix).e[1][2].imag; \ + tr = bj1r*ar - bj1i*ai; \ + ti = bj1r*ai + bj1i*ar; \ + ar = (*matrix).e[1][1].real; \ + ai = (*matrix).e[1][1].imag; \ + tr = tr - bj2r*ar + bj2i*ai; \ + ti = ti - bj2r*ai - bj2i*ar; \ + (*matrix).e[2][0].real = tr; \ + (*matrix).e[2][0].imag = -ti; \ + ar = (*matrix).e[1][0].real; \ + ai = (*matrix).e[1][0].imag; \ + tr = bj2r*ar - bj2i*ai; \ + ti = bj2r*ai + bj2i*ar; \ + ar = (*matrix).e[1][2].real; \ + ai = (*matrix).e[1][2].imag; \ + tr = tr - bj0r*ar + 
bj0i*ai; \ + ti = ti - bj0r*ai - bj0i*ar; \ + (*matrix).e[2][1].real = tr; \ + (*matrix).e[2][1].imag = -ti; \ + ar = (*matrix).e[1][1].real; \ + ai = (*matrix).e[1][1].imag; \ + tr = bj0r*ar - bj0i*ai; \ + ti = bj0r*ai + bj0i*ar; \ + ar = (*matrix).e[1][0].real; \ + ai = (*matrix).e[1][0].imag; \ + tr = tr - bj1r*ar + bj1i*ai; \ + ti = ti - bj1r*ai - bj1i*ar; \ + (*matrix).e[2][2].real = tr; \ + (*matrix).e[2][2].imag = -ti; \ + } + +/* #pragma inline ( reunit_su3 ) */ + +void reunit_su3(su3_matrix *c) +{ + register float bj0r, bj0i, bj1r, bj1i, bj2r, bj2i; + register float ar, ai, tr, ti; + + /* first normalize row 0 */ + ar = (*c).e[0][0].real * (*c).e[0][0].real + /* sum of squares of row */ + (*c).e[0][0].imag * (*c).e[0][0].imag + + (*c).e[0][1].real * (*c).e[0][1].real + + (*c).e[0][1].imag * (*c).e[0][1].imag + + (*c).e[0][2].real * (*c).e[0][2].real + + (*c).e[0][2].imag * (*c).e[0][2].imag; + + ar = 1.0 / sqrt( (double)ar); /* used to normalize row */ + (*c).e[0][0].real *= ar; + (*c).e[0][0].imag *= ar; + (*c).e[0][1].real *= ar; + (*c).e[0][1].imag *= ar; + (*c).e[0][2].real *= ar; + (*c).e[0][2].imag *= ar; + + /* now make row 1 orthogonal to row 0 */ + ar = (*c).e[0][0].real * (*c).e[1][0].real + /* real part of 0 dot 1 */ + (*c).e[0][0].imag * (*c).e[1][0].imag + + (*c).e[0][1].real * (*c).e[1][1].real + + (*c).e[0][1].imag * (*c).e[1][1].imag + + (*c).e[0][2].real * (*c).e[1][2].real + + (*c).e[0][2].imag * (*c).e[1][2].imag; + ai = (*c).e[0][0].real * (*c).e[1][0].imag - /* imag part of 0 dot 1 */ + (*c).e[0][0].imag * (*c).e[1][0].real + + (*c).e[0][1].real * (*c).e[1][1].imag - + (*c).e[0][1].imag * (*c).e[1][1].real + + (*c).e[0][2].real * (*c).e[1][2].imag - + (*c).e[0][2].imag * (*c).e[1][2].real; + + /* row 2 -= a * row1 */ + (*c).e[1][0].real -= ar*(*c).e[0][0].real - ai*(*c).e[0][0].imag; + (*c).e[1][0].imag -= ar*(*c).e[0][0].imag + ai*(*c).e[0][0].real; + (*c).e[1][1].real -= ar*(*c).e[0][1].real - ai*(*c).e[0][1].imag; + 
(*c).e[1][1].imag -= ar*(*c).e[0][1].imag + ai*(*c).e[0][1].real; + (*c).e[1][2].real -= ar*(*c).e[0][2].real - ai*(*c).e[0][2].imag; + (*c).e[1][2].imag -= ar*(*c).e[0][2].imag + ai*(*c).e[0][2].real; + + /* now normalize row 1 */ + ar = (*c).e[1][0].real * (*c).e[1][0].real + /* sum of squares of row */ + (*c).e[1][0].imag * (*c).e[1][0].imag + + (*c).e[1][1].real * (*c).e[1][1].real + + (*c).e[1][1].imag * (*c).e[1][1].imag + + (*c).e[1][2].real * (*c).e[1][2].real + + (*c).e[1][2].imag * (*c).e[1][2].imag; + + ar = 1.0 / sqrt( (double)ar); /* used to normalize row */ + (*c).e[1][0].real *= ar; + (*c).e[1][0].imag *= ar; + (*c).e[1][1].real *= ar; + (*c).e[1][1].imag *= ar; + (*c).e[1][2].real *= ar; + (*c).e[1][2].imag *= ar; + + fixsu3(c); /* reconstruct row 2 */ + +} /* reunit_su3 */ + + +void reunitarize(su3_matrix *link[NDIM]) { + int i,dir; + + foralldir(dir) forallsites(i) reunit_su3( &link[dir][i] ); + +} /*reunitarize() */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/setup_basic.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/setup_basic.c new file mode 100644 index 0000000000000000000000000000000000000000..d219bc29d45b69f0d2fcfdee06014a9e7eba6d94 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/setup_basic.c @@ -0,0 +1,69 @@ +/******** setup_basic.c *********/ +/* MIMD code version 3 */ + +/* Here are basic setup routines, which do not depend on the + * program + */ + +#include LATDEF + +/* SETUP ROUTINES */ + + +void initial_setup() +{ + + /* First, adjust malloc so that glibc free() does not + * release space to the system + */ + +#ifdef __GNUC__ +#include "malloc.h" + mallopt( M_MMAP_MAX, 0 ); /* don't use mmap */ + /* HACK: don't release memory by calling sbrk */ + mallopt( M_TRIM_THRESHOLD, -1 ); +#endif + + /* Machine initialization first */ + initialize_machine(); + g_sync(); + + /* set the timing up */ + inittime(); + + /* basic static node variables */ + this_node = mynode(); + number_of_nodes = 
numnodes();
+
+#ifdef __GNUC__
+  printf0(" GNU c-library performance:\n using sbrk instead of mmap; not returning memory\n");
+#endif
+
+}
+
+
+/**************************************************
+ * random number generators
+ */
+
+void initialize_prn(long seed)
+{
+  int node;
+
+  node = mynode();
+
+  if (seed == 0) {
+    if (this_node == 0) {
+      seed = time(NULL);
+      seed = seed^(seed<<26)^(seed<<9);
+      printf(" + Random seed from time %ld\n",seed);
+    }
+    broadcast_field(&seed,sizeof(long));
+  }
+  seed += 1121*node;
+  seed = seed ^ ((532*node)<<18);
+
+  seed_mersenne(seed);
+
+}
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/setup_files.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/setup_files.c
new file mode 100644
index 0000000000000000000000000000000000000000..4f3b417abb500ae381a4a39c7e73350c63a625e7
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/setup_files.c
@@ -0,0 +1,102 @@
+
+#include "comdefs.h" /* global variables for lattice fields */
+
+int reposition(FILE *f,int nmeas);
+
+/**************************************************
+ * Set up the system for one run
+ *
+ * Opens the measurement file `name`; only node 0 touches the file.
+ *   restart == 0: create the file ("w+") and write an e_header holding
+ *                 the lattice extents, with n_float = nmeas and no
+ *                 long/double/char fields; *sync is set to 0.
+ *   restart != 0: open the existing file ("r+") and pass it to
+ *                 reposition() together with mea; *sync receives
+ *                 the value reposition() returns.
+ * If the file cannot be opened a message is printed and the run
+ * exits with a non-zero status.
+ *
+ * Returns the open stream on node 0 and NULL on every other node
+ * (*sync is not written there).
+ */
+
+FILE * setup_files(int restart, char *name, int nmeas, int mea, int *sync)
+{
+  e_header h;
+  FILE *f;
+
+  if (this_node == 0) {
+    if (!restart) {
+
+      h.headerid = E_HEADER_ID;
+      h.headersize = sizeof(e_header);
+
+      h.lx = lattice.size[XUP]; h.ly = lattice.size[YUP];
+      h.lz = lattice.size[ZUP]; h.lt = 1;
+#if NDIM == 4
+      h.lt = lattice.size[TUP];
+#endif
+
+      h.n_float = nmeas;
+      h.n_long = h.n_double = h.n_char = 0;
+
+      f = fopen(name,"w+");
+      if (f == NULL) {
+        printf(" *** Cannot create file %s\n",name);
+        exit(1);
+      }
+      fwrite(&h,sizeof(e_header),1,f);
+      *sync = 0;
+      return(f);
+    } else {
+
+      f = fopen(name,"r+");
+      if (f == NULL) {
+        printf(" *** File %s does not exist?\n",name);
+        exit(1);
+      }
+      printf(" - Repositioning %s to position %d\n",name,mea);
+
+      *sync = reposition(f,mea);
+      return( f );
+    }
+  } /* this_node == 0 */
+
+  /* nodes other than 0 never open the file */
+  return(NULL);
+}
+
+/**************************************************
+ * this routine repositions the
measurement-files + */ + +int reposition(FILE *f,int nmeas) +{ + e_header h; + int length; + int l,j; + char *cbuf; + + fread(&h,sizeof(e_header),1,f); + j = 0; + length = h.n_double*sizeof(double) + + h.n_long*sizeof(long) + h.n_float*sizeof(float) +h.n_char*sizeof(char); + cbuf = (char *)malloc(length); + + while (j evenodd */ + + start=1; /* indicates staple sum not initialized */ + foralldir(dir2) if (dir2 != dir1) { + + odir = opp_dir(dir2); + + /* first, get link[dir2] from dir1 to all points */ + + /* get link[dir2] from direction dir1 */ + tag0 = start_get( link[dir2], dir1, EVENODD ); + + /* get link[dir1] from direction dir2 */ + tag1 = start_get( link[dir1], dir2, parity ); + + /* multiply link[dir2]^* link[dir1] link[dir2] at direction -dir2 */ + forparity_wait(i, otherparity, tag0) { + prefetch_MATRIX(&link[dir2][i+1]); + prefetch_MATRIX(&link[dir1][i+1]); + prefetch_MATRIX( &link[dir2][nb(dir1,i+1)] ); + + mult_MATRIX_an( link[dir2][i], link[dir1][i], tmat1 ); + mult_MATRIX_nn( tmat1, link[dir2][nb(dir1,i)], tmpmat[i] ); + } + + /* bottom staple ready, push up */ + tag2 = start_get( tmpmat, odir, parity ); + + wait_get(tag1); + /* just try to see what comes ..*/ + if(start){ /* this is the first contribution to staple */ + forparity(i,parity){ + prefetch_MATRIX( &link[dir2][nb(dir1,i)] ); + mult_MATRIX_nn( link[dir2][i], link[dir1][nb(dir2,i)], tmat1 ); + prefetch_MATRIX( &link[dir2][i+1] ); + prefetch_MATRIX( &link[dir1][nb(dir2,i+1)] ); + mult_MATRIX_na( tmat1, link[dir2][nb(dir1,i)], staple[i] ); + } + start=0; + } else { + forparity(i,parity){ + prefetch_MATRIX( &link[dir2][nb(dir1,i)] ); + mult_MATRIX_nn( link[dir2][i], link[dir1][nb(dir2,i)], tmat1 ); + prefetch_MATRIX( &link[dir2][i+1] ); + prefetch_MATRIX( &link[dir1][nb(dir2,i+1)] ); + mult_MATRIX_na( tmat1, link[dir2][nb(dir1,i)], tmat2 ); + add_MATRIX( staple[i], tmat2, staple[i] ); + } + } /* upper staple */ + + /* Lower staple */ + wait_get(tag2); + forparity(i,parity){ + prefetch_MATRIX( 
&staple[i+1] );
+      prefetch_MATRIX( &tmpmat[nb(odir,i+1)]);
+      add_MATRIX( staple[i], tmpmat[nb(odir,i)], staple[i] );
+    } /* lower staple */
+  }
+  free_tmp( tmpmat );
+}
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/staples_su2.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/staples_su2.c
new file mode 100644
index 0000000000000000000000000000000000000000..7971f8f18e6dda24c28a8e7a9a8e90766a3a444d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/staples_su2.c
@@ -0,0 +1,9 @@
+/****** staples_su2.c -- compute the staple ******************/
+
+/* MIMD version 3 */
+
+#include LATDEF
+#include "generic_su2.h" /* presumably maps the generic XXX_MATRIX names onto SU(2) -- header not visible here, confirm */
+
+#define staples_MATRIX staples_su2 /* every staples_MATRIX token in the file included next becomes staples_su2 */
+#include "staples_generic.c"
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/staples_su3.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/staples_su3.c
new file mode 100644
index 0000000000000000000000000000000000000000..c239f3ba509037d2b67637f8cd647bd2851290a7
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/staples_su3.c
@@ -0,0 +1,9 @@
+/****** staples_su3.c -- compute the staple ******************/
+
+/* MIMD version 3 */
+
+#include LATDEF
+#include "generic_su3.h" /* presumably maps the generic XXX_MATRIX names onto SU(3) -- header not visible here, confirm */
+
+#define staples_MATRIX staples_su3 /* every staples_MATRIX token in the file included next becomes staples_su3 */
+#include "staples_generic.c"
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/su2.h b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/su2.h
new file mode 100644
index 0000000000000000000000000000000000000000..671f52c9255f893b0cb722a20e6b5ed3dac698d6
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/su2.h
@@ -0,0 +1,234 @@
+/****************************** su2.h ***********************************
+ * *
+ * Define here the su2 + operators *
+ * MIMD version 3 *
+ * Kari Rummukainen 1997 *
+ */
+
+
+typedef struct {
+  radix_link a,b,c,d; /* four components; su2_one() sets a=b=c=0 and d=1, su2_inv() negates a,b,c and keeps d */
+} su2_matrix;
+
+typedef struct {
+  radix a,b,c; /* three components; project_to_adjoint() copies a,b,c of an su2_matrix into these */
+} adjoint;
+
+#ifdef T3E
+void prefetch_su2(su2_matrix *); /* real prefetch routine, declared only when T3E is defined */
+#define prefetch_matrix(p)
prefetch_su2((su2_matrix *)p) +#else +#define prefetch_matrix(par) +#define prefetch_su2(par) +#define prefetch_adjoint(par) +#endif + +#define nn_a(x,y) ( x.d*y.a + x.a*y.d - x.b*y.c + x.c*y.b) +#define nn_b(x,y) ( x.d*y.b + x.b*y.d - x.c*y.a + x.a*y.c) +#define nn_c(x,y) ( x.d*y.c + x.c*y.d - x.a*y.b + x.b*y.a) +#define nn_d(x,y) ( x.d*y.d - x.a*y.a - x.b*y.b - x.c*y.c) + +#define na_a(x,y) (-x.d*y.a + x.a*y.d + x.b*y.c - x.c*y.b) +#define na_b(x,y) (-x.d*y.b + x.b*y.d + x.c*y.a - x.a*y.c) +#define na_c(x,y) (-x.d*y.c + x.c*y.d + x.a*y.b - x.b*y.a) +#define na_d(x,y) ( x.d*y.d + x.a*y.a + x.b*y.b + x.c*y.c) + +#define an_a(x,y) ( x.d*y.a - x.a*y.d + x.b*y.c - x.c*y.b) +#define an_b(x,y) ( x.d*y.b - x.b*y.d + x.c*y.a - x.a*y.c) +#define an_c(x,y) ( x.d*y.c - x.c*y.d + x.a*y.b - x.b*y.a) +#define an_d(x,y) ( x.d*y.d + x.a*y.a + x.b*y.b + x.c*y.c) + +#define aa_a(x,y) (-x.d*y.a - x.a*y.d - x.b*y.c + x.c*y.b) +#define aa_b(x,y) (-x.d*y.b - x.b*y.d - x.c*y.a + x.a*y.c) +#define aa_c(x,y) (-x.d*y.c - x.c*y.d - x.a*y.b + x.b*y.a) +#define aa_d(x,y) ( x.d*y.d - x.a*y.a - x.b*y.b - x.c*y.c) + +#define mult_su2_nn(x,y,r) {\ +r.a = nn_a(x,y); r.b = nn_b(x,y); \ +r.c = nn_c(x,y); r.d = nn_d(x,y); } +#define mult_su2_na(x,y,r) {\ +r.a = na_a(x,y); r.b = na_b(x,y); \ +r.c = na_c(x,y); r.d = na_d(x,y); } +#define mult_su2_an(x,y,r) {\ +r.a = an_a(x,y); r.b = an_b(x,y); \ +r.c = an_c(x,y); r.d = an_d(x,y); } +#define mult_su2_aa(x,y,r) {\ +r.a = aa_a(x,y); r.b = aa_b(x,y);\ +r.c = aa_c(x,y); r.d = aa_d(x,y); } + +#define mult_su2_nn_a(x,y,r) {\ +r.a = nn_a(x,y); r.b = nn_b(x,y); r.c = nn_c(x,y); } +#define mult_su2_na_a(x,y,r) {\ +r.a = na_a(x,y); r.b = na_b(x,y); r.c = na_c(x,y); } +#define mult_su2_an_a(x,y,r) {\ +r.a = an_a(x,y); r.b = an_b(x,y); r.c = an_c(x,y); } +#define mult_su2_aa_a(x,y,r) {\ +r.a = aa_a(x,y); r.b = aa_b(x,y); r.c = aa_c(x,y); } + +#define add_su2_matrix(x,y,r) {\ +r.a = x.a + y.a; r.b = x.b + y.b; \ +r.c = x.c + y.c; r.d = x.d + y.d; } +#define 
sub_su2_matrix(x,y,r) {\ +r.a = x.a - y.a; r.b = x.b - y.b; \ +r.c = x.c - y.c; r.d = x.d - y.d; } +#define scalar_mult_sum_su2_matrix(x,s,r) {\ +r.a += (s)*x.a; r.b += (s)*x.b; \ +r.c += (s)*x.c; r.d += (s)*x.d; } +#define mult_su2_add(x,y,r) {\ +r.a += nn_a(x,y); r.b += nn_b(x,y); \ +r.c += nn_c(x,y); r.d += nn_d(x,y); } +#define su2_mul_inv_add(x,y,r) {\ +r.a -= nn_a(x,y); r.b -= nn_b(x,y); \ +r.c -= nn_c(x,y); r.d += nn_d(x,y); } + +#define su2_sqr(x) (x.a*x.a + x.b*x.b + x.c*x.c + x.d*x.d) +#define su2_det(x) su2_sqr(x) +#define su2_dot(x,y) (x.d*y.d - x.a*y.a - x.b*y.b - x.c*y.c) +#define su2_rdot(x,y) (x.d*y.d + x.a*y.a + x.b*y.b + x.c*y.c) +#define su2_tr(x) (2.0*x.d) +#define su2_tr2(x) x.d +#define su2_inv(x,r) { r.a=-x.a; r.b=-x.b; r.c=-x.c; r.d= x.d; } +#define su2_inv1(x) { x.a=-x.a; x.b=-x.b; x.c=-x.c; } +#define su2_cpy(x,r) r = x +#define su2_scalar_mul(x,s,r) {\ +r.a = (s)*x.a; r.b = (s)*x.b; r.c = (s)*x.c; r.d = (s)*x.d; } +#define su2_scalar_mul_add(x,s,r) {\ +r.a += (s)*x.a; r.b += (s)*x.b; r.c += (s)*x.c; r.d += (s)*x.d; } +#define su2_scalar_mul_inv_add(x,s,r) {\ +r.a -= (s)*x.a; r.b -= (s)*x.b; r.c -= (s)*x.c; r.d += (s)*x.d; } +#define su2_scalar_mul_sub(x,s,r) {\ +r.a -= (s)*x.a; r.b -= (s)*x.b; r.c -= (s)*x.c; r.d -= (s)*x.d; } +#define su2_add(x,r) { r.a += x.a; r.b += x.b; r.c += x.c; r.d += x.d; } +#define su2_add_inv(x,r) { r.a -= x.a; r.b -= x.b; r.c -= x.c; r.d += x.d; } +#define su2_zero(x) x.a = x.b = x.c = x.d = 0.0 +#define su2_one(x) { x.a = x.b = x.c = 0.0; x.d = 1.0; } +#define su2_scalar(s,x) { x.a = x.b = x.c = 0.0; x.d = s; } +#define su2_scalar_add(s,r) { r.d += s; } + +#define mult_su2_vec(m,v,t) mult_su2_nn(m,v,t) +#define mult_su2_vec_sum(m,v,t) mult_su2_add(m,v,t) +#define mult_adj_su2_vec(m,v,t) mult_su2_an(m,v,t) +#define add_su2_vector(x,y,t) add_su2_matrix(x,y,t) +#define scalar_mult_vec(a,s,t) su2_scalar_mul(a,s,t) +#define scalar_mult_add_vec(a,s,t) su2_scalar_mul_add(a,s,t) + +#define mult_su2_nadj(u,t,r) {\ 
+r.a = u.d*t.a - u.b*t.c + u.c*t.b; \ +r.b = u.d*t.b - u.c*t.a + u.a*t.c; \ +r.c = u.d*t.c - u.a*t.b + u.b*t.a; \ +r.d = - u.c*t.c - u.b*t.b - u.a*t.a; } +#define mult_su2_aadj(u,t,r) {\ +r.a = u.d*t.a + u.b*t.c - u.c*t.b; \ +r.b = u.d*t.b + u.c*t.a - u.a*t.c; \ +r.c = u.d*t.c + u.a*t.b - u.b*t.a; \ +r.d = u.c*t.c + u.b*t.b + u.a*t.a; } +#define mult_su2_adjn(t,u,r) {\ +r.a = u.d*t.a + u.b*t.c - u.c*t.b; \ +r.b = u.d*t.b + u.c*t.a - u.a*t.c; \ +r.c = u.d*t.c + u.a*t.b - u.b*t.a; \ +r.d = - u.c*t.c - u.b*t.b - u.a*t.a; } +#define mult_su2_adja(t,u,r) {\ +r.a = u.d*t.a - u.b*t.c + u.c*t.b; \ +r.b = u.d*t.b - u.c*t.a + u.a*t.c; \ +r.c = u.d*t.c - u.a*t.b + u.b*t.a; \ +r.d = u.c*t.c + u.b*t.b + u.a*t.a; } + +#define project_to_adjoint(u,s) { s.a = u.a; s.b = u.b; s.c = u.c; } +#define adjoint_to_matrix(s,u) { \ + u.d = 0; u.a = s.a; u.b = s.b; u.c = s.c; } + +#define adj_scalar(x,s) x.a = x.b = x.c = (s) +#define adj_sqr(x) (x.a*x.a + x.b*x.b + x.c*x.c) +#define adj_scalar_mul(x,s,r) {r.a = (s)*x.a; r.b = (s)*x.b; r.c = (s)*x.c;} +#define adj_scalar_mul_add(x,s,r) {\ +r.a += (s)*x.a; r.b += (s)*x.b; r.c += (s)*x.c;} +#define adj_scalar_mul_sub(x,s,r) {\ +r.a -= (s)*x.a; r.b -= (s)*x.b; r.c -= (s)*x.c;} +#define add_adjoint(u,t,r) {\ +r.a = u.a + t.a; r.b = u.b + t.b; r.c = u.c + t.c; } +#define sub_adjoint(u,t,r) {\ +r.a = u.a - t.a; r.b = u.b - t.b; r.c = u.c - t.c; } +#define adj_add(t,r) {r.a += t.a; r.b += t.b; r.c += t.c; } +#define adj_sub(t,r) {r.a -= t.a; r.b -= t.b; r.c -= t.c; } +#define adj_zero(x) x.a = x.b = x.c = 0.0 +#define adj_dot(x,y) (x.a*y.a + x.b*y.b + x.c*y.c) +#define adj_cpy(y,x) { x.a = y.a; x.b = y.b; x.c = y.c; } +#define adj_2scalar_mul(x,s,y,t,r) \ +{r.a = (s)*x.a + (t)*y.a; r.b = (s)*x.b + (t)*y.b; r.c = (s)*x.c + (t)*y.c;} + +#define trans_adj_up(u,e,r) { register radix t1,t2,t3; \ + t1 = 2.0*u.d; \ + t3 = t1*u.d - 1.0; \ + t2 = 2.0*(e.a*u.a + e.b*u.b + e.c*u.c); \ + r.a = e.a*t3 + u.a*t2 - t1*(e.b*u.c - e.c*u.b); \ + r.b = e.b*t3 + 
u.b*t2 - t1*(e.c*u.a - e.a*u.c); \ + r.c = e.c*t3 + u.c*t2 - t1*(e.a*u.b - e.b*u.a);} + +#define trans_adj_down(u,e,r) { register radix t1,t2,t3; \ + t1 = 2.0*u.d; \ + t3 = t1*u.d - 1.0; \ + t2 = 2.0*(e.a*u.a + e.b*u.b + e.c*u.c); \ + r.a = e.a*t3 + u.a*t2 + t1*(e.b*u.c - e.c*u.b); \ + r.b = e.b*t3 + u.b*t2 + t1*(e.c*u.a - e.a*u.c); \ + r.c = e.c*t3 + u.c*t2 + t1*(e.a*u.b - e.b*u.a);} + +/* exp of a matrix: exp(i E) = cos(|E|) + i E/|E| sin(|E|) + */ +#define su2_exp(x,u) { register radix r_t,s_t; \ + r_t = sqrt((double)adj_sqr(x)); \ + if (r_t>0) \ + { s_t = sin((double)r_t)/r_t; adj_scalar_mul(x,s_t,u); \ + u.d = cos((double)r_t); } \ + else su2_one(u); } + +/* log of a matrix: exp(i E) = U -> E = -i log[ U ] + * exp(i E) = cos(|E|) + i s_a E_a/|E| sin(|E|) = U_0 + i s_a U_a + * limit |E| to the interval [0,pi) + */ +#define su2_log(u,x) { register double s_q; register radix r_t; \ + s_q = adj_sqr(u); \ + if (s_q > 0 && u.d < 1.0) { \ + r_t = acos((double)u.d) / sqrt(s_q); adj_scalar_mul(u,r_t,x); } \ + else adj_zero(x); } + +#define su2_normalize(u,v) { register radix r_t; \ + r_t = 1.0/sqrt( (double)su2_sqr(u) ); su2_scalar_mul( u, r_t, v ); } + +/* gaussian_adjoint returns adjoint r with = w^2/2 + * so that if want p~exp( -d r^2 ), use w = 1/sqrt(d) + */ +#define gaussian_adjoint(r,w) { \ + r.a = (w) * gaussian_ran(); \ + r.b = (w) * gaussian_ran(); \ + r.c = (w) * gaussian_ran(); } + +#define metro_su2h( h, scale, r ) { \ +r.a = h.a + scale * (dran() - 0.5); \ +r.b = h.b + scale * (dran() - 0.5); \ +r.c = h.c + scale * (dran() - 0.5); \ +r.d = h.d + scale * (dran() - 0.5); } + +#define su2_R_I( h1, h2, R, I ) { R = an_d(h1,h2); I = -an_c(h1,h2); } +#define su2_isigma3( x, r ) { r.a = -x.b; r.b = x.a; r.c = x.d; r.d = -x.c; } +#define tr2_isigma3(x) (-x.c) + +/* this gives a gaussian vector of width = w^2/2, + * so that if we want distribution exp( -a g^2 ) + * we have to use w = 1/sqrt(a) + */ +#define gaussian_su2_vector( g, t ) { \ + /* double t = 
1/sqrt((double)w); */ \ + g.a = t*gaussian_ran(); \ + g.b = t*gaussian_ran(); \ + g.c = t*gaussian_ran(); \ + g.d = t*gaussian_ran(); \ +} + +#define random_su2(u) { register radix t; \ + u.a = dran()-0.5; \ + u.b = dran()-0.5; \ + u.c = dran()-0.5; \ + u.d = dran()-0.5; \ + su2_normalize( u, u ); \ +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timecheck.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timecheck.c new file mode 100644 index 0000000000000000000000000000000000000000..f02407b94112bb4243c4c569d9be8fa4d00a6cf6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timecheck.c @@ -0,0 +1,143 @@ +/************************************************** + * time checking routines + */ + +#include LATDEF + + +/************************************************* + * resource clocks + */ + +static double time_last; + + +#ifdef MPI + +static double time_start; + +void inittime() +{ + time_start = time_last = MPI_Wtime(); +} + +void resettime() +{ + time_last = MPI_Wtime(); +} + +double cputime() +{ + return ( MPI_Wtime() - time_start ); +} + +double added_cpu_time() +{ + double t,t2; + + t = (t2 = MPI_Wtime()) - time_last; + time_last = t2; + return(t); +} + +#else + +double cputime() +{ + struct rusage resource; + extern int getrusage(); + + getrusage(RUSAGE_SELF,&resource); + return(resource.ru_utime.tv_sec + 1e-6*resource.ru_utime.tv_usec + + resource.ru_stime.tv_sec + 1e-6*resource.ru_stime.tv_usec); + +} + +void inittime() +{ + time_last = cputime(); +} + +void resettime() +{ + time_last = cputime(); +} + +double added_cpu_time() +{ + double t,t2; + + t = (t2 = cputime()) - time_last; + time_last = t2; + return(t); +} + +#endif + +/************************************************** + * Timing check routines + */ + +static int interval=0,starttime; +static time_t timelimit; + + +void timecheck(int iteration, int maxiter, int status) +{ + int temp,ttime=0; + + if (this_node == 0) { + ttime = time(NULL); + if (interval 
<= 0.0) { + interval = ttime - starttime; + printf(" -- approx %d seconds between time checks\n",interval); + fflush(stdout); + interval += 1200; /* leave good time (20 min) for the save etc. */ + } + if (iteration < maxiter && timelimit - ttime - interval < 0) temp = 1; + else temp = 0; + } else temp = 0; + + broadcast_field(&temp,sizeof(int)); + + if (temp == 1) { + dumpall(status,&maxiter); + /* normal exit here */ + if (this_node == 0) + printf("\n **** cpu time exit, remaining time %d seconds\n", + (int)(timelimit - ttime)); + finishrun(); + } +} + +void inittimecheck() +{ + starttime = time(NULL); + interval = 0; + if (this_node == 0) { + printf(" -- Available wallclock time %d seconds\n",(int)(timelimit-starttime)); + fflush(stdout); + } +} + + +int setup_timelimit(time_t t,int argc,char *argv) +{ + int tmp,istimelimit; + + if (this_node == 0) { + if (argc > 0) { + if (sscanf(argv,"%d",&tmp) != 1) + halt("Error reading in time limit"); + timelimit = tmp; /* use tmp to guarantee int */ + if (this_node == 0) + printf(" +++++ wallclock time limit %d seconds\n",(int)timelimit); + istimelimit = 1; + timelimit += t; /* this is the time at the end ... 
*/ + } else istimelimit = 0; + } + + broadcast_int( &istimelimit ); + + return(istimelimit); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timers.c b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timers.c new file mode 100644 index 0000000000000000000000000000000000000000..ec56a1dc961fcf4ce902063bc0c2b246c1911f2f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timers.c @@ -0,0 +1,65 @@ +#ifdef TIMERS + +#include +#include +#include +#include +#include "comdefs.h" +#include "timers.h" + +double timer_start( timer_type * t ) +{ + struct timeval resource; + + if (this_node == 0) { + gettimeofday(&resource,NULL); + t->start = resource.tv_sec + 1.0e-6*resource.tv_usec; + + /* t->start = clock()*1.0/(CLOCKS_PER_SEC); */ + return(t->start); + } else return(0.0); +} + +double timer_end( timer_type * t ) +{ + double e; + struct timeval resource; + + if (this_node == 0) { + gettimeofday(&resource,NULL); + e = resource.tv_sec + 1.0e-6*resource.tv_usec; + /* e = clock()*1.0/(CLOCKS_PER_SEC); */ + + t->total += (e - t->start); + t->count++; + return(e); + } else return(0.0); +} + +void timer_report( timer_type * t ) +{ + struct timeval resource; + + if (this_node == 0) { + gettimeofday(&resource,NULL); + /* time used during the counter activity */ + t->initial = resource.tv_sec + 1.0e-6*resource.tv_usec - t->initial; + if (t->count) + printf(" total %g sec, %d calls, %g usec/call, fraction %.2g of time\n", + t->total, t->count, 1e6 * t->total/t->count, t->total/t->initial ); + else + printf(" no timed calls made\n"); + } +} + +void timer_reset( timer_type * t ) { + struct timeval resource; + + t->total = t->count = t->start = 0; + if (this_node == 0) { + gettimeofday(&resource,NULL); + t->initial = resource.tv_sec + 1.0e-6*resource.tv_usec; + } +} + +#endif /* timers */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timers.h b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timers.h new file mode 100644 index 
0000000000000000000000000000000000000000..ce7cae3e46d571f1b05724905fcfeaad0252a0e5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/generic/timers.h @@ -0,0 +1,22 @@ + +#ifdef TIMERS + +typedef struct { + double total, start, initial; /* cumulated, work, and initial timeval */ + int count; +} timer_type; + + +double timer_start( timer_type * ); +double timer_end( timer_type * ); +void timer_reset( timer_type * t ); +void timer_report( timer_type * ); + +#else + +#define timer_start(a) +#define timer_end(a) +#define timer_reset(a) +#define timer_report(a) + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Make_template b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Make_template new file mode 100644 index 0000000000000000000000000000000000000000..a855f7b31fa7fd0aed655a99061f898a82a09cd3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Make_template @@ -0,0 +1,47 @@ +# Makefile for Libraries for QCD programs +# +# This template file defines rules and macros common to all architectures +# It is intended to be an include file for other Makefiles. +# Don't use it by itself! 
+ +.c.o: + $(MPI_CC) $(CFLAGS) -c $*.c + +all: complex.a su3.a + +COMPLEXOBJS = cadd.o cdiv.o ce_itheta.o cexp.o clog.o cmplx.o cmul.o \ + conjg.o csqrt.o csub.o dcadd.o dcdiv.o dce_itheta.o \ + dcexp.o dclog.o dcmplx.o dcmul.o dconjg.o dcsqrt.o dcsub.o +$(COMPLEXOBJS) : complex.h +complex.a: complex.h $(COMPLEXOBJS) + $(AR) $(ARFLAGS) complex.a $(COMPLEXOBJS) + +SU3OBJS = addmat.o addvec.o cmp_ahmat.o cs_m_a_vec.o cs_m_a_mat.o cs_m_s_vec.o \ + cs_m_vec.o det_su3.o clear_mat.o dumpmat.o dumpvec.o clearvec.o gaussrand.o \ + m_amatvec_s.o m_amatvec.o m_amatvec_ns.o m_mat_an.o \ + m_mat_na.o m_mat_nn.o m_matvec.o m_matvec_ns.o m_matvec_s.o \ + make_ahmat.o rand_ahmat.o realtr.o complextr.o \ + s_m_a_mat.o s_m_a_vec.o s_m_s_mat.o s_m_s_vec.o s_m_sum_vec.o \ + s_m_vec.o s_m_mat.o cs_m_mat.o cs_m_s_mat.o \ + su3_adjoint.o su3_dot.o su3_rdot.o su3_proj.o su3mat_copy.o \ + su3vec_copy.o \ + submat.o subvec.o trace_su3.o uncmp_ahmat.o \ + msq_su3vec.o sub4vecs.o m_amv_4dir.o m_amv_4dir_2.o m_mv_s_4dir.o \ + flush_to_zero.o +#WILSON_OBJS = wp_shrink.o wp_grow.o wp_grow_a.o dump_wvec.o clear_wvec.o \ +# su3_proj_w.o copy_wvec.o add_wvec.o sub_wvec.o s_m_wvec.o \ +# s_m_hwvec.o msq_wvec.o wvec_dot.o wvec2_dot.o wvec_rdot.o \ +# s_m_a_wvec.o s_m_atm_wvec.o mb_gamma.o mb_gamma_l.o mb_gamma_r.o \ +# cs_m_a_wvec.o cs_m_a_wvec2.o \ +# m_mat_wvec.o m_mat_hwvec.o m_amat_wvec.o m_amat_hwvec.o \ +# grow4wvecs.o wp_shrink4.o +$(SU3OBJS) : complex.h su3.h +#$(WILSON_OBJS) : complex.h su3.h +su3.a:: su3.h $(SU3OBJS) + $(AR) $(ARFLAGS) su3.a $(SU3OBJS) +#su3.a:: su3.h $(WILSON_OBJS) +# $(AR) $(ARFLAGS) rcs su3.a $(WILSON_OBJS) + +clean: + $(RM) *.o + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Make_vanilla b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Make_vanilla new file mode 100644 index 0000000000000000000000000000000000000000..5070b2e2a808b7248e5d77d70ca8fc5fe37f72a7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Make_vanilla @@ 
-0,0 +1,36 @@
+# Makefile for Libraries for QCD programs
+#
+# Library routines involve no communication, so are compiled
+# as for a scalar processor.
+
+#
+# This Makefile builds the purely C coded versions and should work
+# for all architectures with a suitable choice of CFLAGS and
+# CC below
+
+# The FAST option uses "fast" variants of the code, where available, and is
+# recommended. The fast variants are C-coded with explicit loop unrolling
+# and inlining.
+
+#CFLAGS = -O3 -DFAST -Wall -Wmissing-prototypes #gnu c compiler
+#CFLAGS = -O -DFAST -float #Dec alpha compiler
+#CFLAGS = -O -f -DFAST #Mips
+CFLAGS = -O3 -DFAST -DNATIVEDOUBLE #IBM RS6000 (optimized)
+#CFLAGS = -g -Wall -DFAST -DNATIVEDOUBLE #IBM RS6000 (profile/debug)
+#CFLAGS = -DSGI -O3 -mips4 -64 -OPT:IEEE_arithmetic=3:roundoff=3:alias=restrict -TENV:X=1 -DFAST -DNATIVEDOUBLE # SGI Origin 2000 (from UCSB)
+#CFLAGS = -O3 -mips4 -r10000 -OPT:IEEE_arithmetic=3:roundoff=3:alias=restrict -TENV:X=1 -DFAST -DNATIVEDOUBLE #NCSA SGI PC (untested for MILC version 5)
+#CFLAGS= -O5 -dalign -libmil -fsimple=2 -fns #SUN Ultra
+#CFLAGS = -O -DFAST -float sp_ops -float sp_const -noautopar #Convex Exemplar (untested for MILC version 5)
+
+
+CC = cc #most
+#CC = xlc #IBM RS6000 ANSI C
+#CC = gcc #gnu c compiler
+
+# Make_template compiles every object with $(MPI_CC).  These library
+# routines need no MPI, so hand it the plain C compiler chosen above.
+MPI_CC = $(CC)
+
+include Make_template
+
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..146791ff96b24704168ca9dcce895fd36ebc6958
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/Makefile
@@ -0,0 +1,31 @@
+# Makefile for Libraries for QCD programs
+#
+# Library routines involve no communication, so are compiled
+# as for a scalar processor.
+
+# The specialized Makefiles are for processors for which we
+# have some assembly coded substitutes.
+# +# This Makefile builds the purely C coded versions and should work +# for all architectures with a suitable choice of CFLAGS and +# MPI_CC. + +# The FAST option uses "fast" variants of the code, where available, and is +# recommended. The fast variants are C-coded with explicit loop unrolling +# and inlining. + +include ../Makefile.defs + +CFLAGS += -DFAST + +#CFLAGS += -DFAST #gnu c compiler +#CFLAGS += -O -DFAST -float #Dec alpha compiler +#CFLAGS += -O -f -DFAST #Mips +#CFLAGS += -O3 -DFAST -DNATIVEDOUBLE #IBM RS6000 (optimized) +#CFLAGS += -g -Wall -DFAST -DNATIVEDOUBLE #IBM RS6000 (debug/profile) +#CFLAGS += -DSGI -O3 -mips4 -64 -OPT:IEEE_arithmetic=3:roundoff=3:alias=restrict -TENV:X=1 -DFAST -DNATIVEDOUBLE # SGI Origin 2000 (from UCSB) +#CFLAGS += -O3 -mips4 -r10000 -OPT:IEEE_arithmetic=3:roundoff=3:alias=restrict -TENV:X=1 -DFAST -DNATIVEDOUBLE #NCSA SGI PC (untested for MILC version 5) +#CFLAGS += -O5 -dalign -libmil -fsimple=2 -fns #SUN Ultra +#CFLAGS += -O -DFAST -float sp_ops -float sp_const -noautopar #Convex Exemplar (untested for MILC version 5) + +include Make_template diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/add_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/add_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..4c9e5cc4bd72c484b4cebe82cca0cfeddd7207ea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/add_wvec.c @@ -0,0 +1,14 @@ +/******************** add_wvec.c (in su3.a) ******************** +* +*void add_wilson_vector(wilson_vector *src1,*src2,*dest) +* add two Wilson vectors +* dest <- src1 + src2 +*/ +#include "complex.h" +#include "su3.h" + +void add_wilson_vector( wilson_vector *src1, wilson_vector *src2, + wilson_vector *dest ){ + register int i; + for(i=0;i<4;i++)add_su3_vector( &(src1->d[i]), &(src2->d[i]), &(dest->d[i])); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/addmat.c 
b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/addmat.c new file mode 100644 index 0000000000000000000000000000000000000000..9a493a809e67b84619c582ac8b86bd09109f013f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/addmat.c @@ -0,0 +1,13 @@ +/******************** addmat.c (in su3.a) ***************************** +* * +* Add two SU3 matrices * +*/ +#include "complex.h" +#include "su3.h" + +void add_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ) { +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + CADD( a->e[i][j], b->e[i][j], c->e[i][j] ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/addvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/addvec.c new file mode 100644 index 0000000000000000000000000000000000000000..b5d07e363483dbe2f90f30f4303e144890325607 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/addvec.c @@ -0,0 +1,13 @@ +/******************** addvec.c (in su3.a) ***************************** +* * +* Add two SU3 vectors * +*/ +#include "complex.h" +#include "su3.h" + +void add_su3_vector( su3_vector *a, su3_vector *b, su3_vector *c ){ +register int i; + for(i=0;i<3;i++){ + CADD( a->c[i], b->c[i], c->c[i] ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/asdef.alpha.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/asdef.alpha.h new file mode 100644 index 0000000000000000000000000000000000000000..aaf841a898ff967017bf80a6af87147d73e6fb78 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/asdef.alpha.h @@ -0,0 +1,65 @@ +#define v0 $0 /*Integer return value register*/ +#define t0 $1 /*Integer scratch registers (caller saved)*/ +#define t1 $2 +#define t2 $3 +#define t3 $4 +#define t4 $5 +#define t5 $6 +#define t6 $7 +#define t7 $8 +#define s0 $9 /*Integer save registers (callee saved)*/ +#define s1 $10 +#define s2 $11 +#define s3 $12 +#define s4 $13 +#define s5 $14 +#define fp $15 /*Private frame 
pointer register*/
+#define a0 $16 /*Integer argument registers*/
+#define a1 $17
+#define a2 $18
+#define a3 $19
+#define a4 $20
+#define a5 $21
+#define t8 $22 /*Scratch registers (continued)*/
+#define t9 $23
+#define t10 $24
+#define t11 $25
+#define ra $26 /*Return address register*/
+#define t12 $27 /*Scratch registers (continued)*/
+#define at $28 /*Reserved for assembler (a C comment, so the remark is not pasted into every expansion of at)*/
+#define gp $29 /*global pointer*/
+#define sp $30 /*Stack pointer register*/
+#define zero $31 /*Integer ReadAsZero/Sink register*/
+
+#define fv0 $f0 /*Floating-point return value register*/
+#define fv1 $f1
+#define fs0 $f2 /*Floating-point save registers (callee saved)*/
+#define fs1 $f3
+#define fs2 $f4
+#define fs3 $f5
+#define fs4 $f6
+#define fs5 $f7
+#define fs6 $f8
+#define fs7 $f9
+#define ft0 $f10 /*Floating-point scratch registers*/
+#define ft1 $f11
+#define ft2 $f12
+#define ft3 $f13
+#define ft4 $f14
+#define ft5 $f15
+#define fa0 $f16 /*Floating-point argument registers*/
+#define fa1 $f17
+#define fa2 $f18
+#define fa3 $f19
+#define fa4 $f20
+#define fa5 $f21
+#define ft6 $f22 /*Floating-point scratch registers (continued)*/
+#define ft7 $f23
+#define ft8 $f24
+#define ft9 $f25
+#define ft10 $f26
+#define ft11 $f27
+#define ft12 $f28
+#define ft13 $f29
+#define ft14 $f30
+#define fzero $f31 /*Floating-point ReadAsZero/Sink register*/
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cadd.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cadd.c
new file mode 100644
index 0000000000000000000000000000000000000000..8e6b5e8f7c0d4b6957c1c67d7f50bcdd8fc88d78
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cadd.c
@@ -0,0 +1,10 @@
+/* Subroutines for operations on complex numbers */
+/* add two complex numbers: returns *a + *b by value, inputs are not modified */
+#include "complex.h"
+
+complex cadd( complex *a, complex *b ) {
+  complex c;
+  c.real = (*a).real + (*b).real;
+  c.imag = (*a).imag + (*b).imag;
+  return(c);
+}
diff --git
a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cdiv.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cdiv.c new file mode 100644 index 0000000000000000000000000000000000000000..a8d59a0e72db0711d267af6d2bdb1829b77bbdb2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cdiv.c @@ -0,0 +1,12 @@ +/* Subroutines for operations on complex numbers */ +/* Divide two complex numbers */ +#include "complex.h" + +complex cdiv( complex *a, complex *b ) { + complex c; + radix scale; + scale = 1.0/((*b).real*(*b).real+(*b).imag*(*b).imag); + c.real = scale*((*a).real*(*b).real + (*a).imag*(*b).imag); + c.imag = scale*((*a).imag*(*b).real - (*a).real*(*b).imag); + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/ce_itheta.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/ce_itheta.c new file mode 100644 index 0000000000000000000000000000000000000000..7eced480a1847556b95eacdc6ac4a7e3c372acfe --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/ce_itheta.c @@ -0,0 +1,12 @@ +/* Subroutines for operations on complex numbers */ +/* exp( i*theta ) */ +#include +#include "complex.h" + +complex ce_itheta( radix theta ){ + complex c; + c.real = (radix)cos( (double)theta ); + c.imag = (radix)sin( (double)theta ); + /* there must be a more efficient way */ + return( c ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cexp.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cexp.c new file mode 100644 index 0000000000000000000000000000000000000000..501be0be8747ca04df3a66eeebe02b7ac887d63f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cexp.c @@ -0,0 +1,13 @@ +/* Subroutines for operations on complex numbers */ +/* complex exponential */ +#include +#include "complex.h" + +complex cexp( complex *a ){ + complex c; + radix mag; + mag = (radix)exp( (double)(*a).real ); + c.real = mag*(radix)cos( (double)(*a).imag ); + c.imag = mag*(radix)sin( 
(double)(*a).imag ); + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clear_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clear_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..33a4df85e0b07e2406a20b702abd6ecd6110d2a7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clear_mat.c @@ -0,0 +1,15 @@ +/******************** clear_mat.c (in su3.a) ******************** +* +*void clear_su3mat( su3_matrix *dest ) +* clear an SU3 matrix +* dest <- zero_matrix +*/ +#include "complex.h" +#include "su3.h" + +void clear_su3mat( su3_matrix *dest ){ +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + dest->e[i][j].real = dest->e[i][j].imag = 0.0; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clear_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clear_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..65fbed79f52f5283d4e501c7a14e215a2f53ab48 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clear_wvec.c @@ -0,0 +1,15 @@ +/******************** clear_wvec.c (in su3.a) ******************** +* +*void clear_wvec( wilson_vector *dest ) +* clear a Wilson vector +* dest <- zero_vector +*/ +#include "complex.h" +#include "su3.h" + +void clear_wvec( wilson_vector *dest ){ +register int i,j; + for(i=0;i<4;i++)for(j=0;j<3;j++){ + dest->d[i].c[j].real = dest->d[i].c[j].imag = 0.0; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clearvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clearvec.c new file mode 100644 index 0000000000000000000000000000000000000000..4072c9254ab726a553d1215b692a39acc527ee1d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clearvec.c @@ -0,0 +1,14 @@ +/******************* clearvec.c (in su3.a) ***************************** +* * +* void clearvec( su3_vector *v ) * +* set a 3 element complex vector to zero * +*/
+#include "complex.h" +#include "su3.h" + +void clearvec( su3_vector *v ) +{ + v->c[0].real = v->c[0].imag = 0.0; + v->c[1].real = v->c[1].imag = 0.0; + v->c[2].real = v->c[2].imag = 0.0; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clog.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clog.c new file mode 100644 index 0000000000000000000000000000000000000000..deead044aec545268ea79bf598236820e5fdaefb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/clog.c @@ -0,0 +1,11 @@ +/* Subroutines for operations on complex numbers */ +/* complex logarithm: 0.5*log(|a|^2) + i*atan2(imag,real) */ +#include <math.h> +#include "complex.h" + +complex clog( complex *a ){ + complex c; + c.real = 0.5*(radix)log((double)((*a).real*(*a).real+(*a).imag*(*a).imag)); + c.imag = (radix)atan2( (double)(*a).imag, (double)(*a).real ); + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmp_ahmat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmp_ahmat.c new file mode 100644 index 0000000000000000000000000000000000000000..1cf75efb2091fc2596f8a859e246d11add32cda8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmp_ahmat.c @@ -0,0 +1,20 @@ +/***************** cmp_ahmat.c (in su3.a) ***************************** +* * +* Make an anti_hermitmat (anti Hermitian matrix in compressed form) * +* from an SU3 matrix (3x3 complex matrix).
* +*/ +#include "complex.h" +#include "su3.h" + +void compress_anti_hermitian( su3_matrix *mat_su3, + anti_hermitmat *mat_antihermit ) { + mat_antihermit->m00im=mat_su3->e[0][0].imag; + mat_antihermit->m11im=mat_su3->e[1][1].imag; + mat_antihermit->m22im=mat_su3->e[2][2].imag; + mat_antihermit->m01.real=mat_su3->e[0][1].real; + mat_antihermit->m02.real=mat_su3->e[0][2].real; + mat_antihermit->m12.real=mat_su3->e[1][2].real; + mat_antihermit->m01.imag=mat_su3->e[0][1].imag; + mat_antihermit->m02.imag=mat_su3->e[0][2].imag; + mat_antihermit->m12.imag=mat_su3->e[1][2].imag; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmplx.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmplx.c new file mode 100644 index 0000000000000000000000000000000000000000..d7a513beaef9acad3cd0fadf4d2d28dd50db4232 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmplx.c @@ -0,0 +1,9 @@ +/* Subroutines for operations on complex numbers */ +/* make a complex number from two real numbers */ +#include "complex.h" + +complex cmplx( radix x, radix y ) { + complex c; + c.real = x; c.imag = y; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmul.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmul.c new file mode 100644 index 0000000000000000000000000000000000000000..8b5b3a2f0f6ec0877200cd414bf8785dc901fa10 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cmul.c @@ -0,0 +1,10 @@ +/* Subroutines for operations on complex numbers */ +/* multiply two complex numbers */ +#include "complex.h" + +complex cmul( complex *a, complex *b ) { + complex c; + c.real = (*a).real * (*b).real - (*a).imag * (*b).imag; + c.imag = (*a).imag * (*b).real + (*a).real * (*b).imag; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/comdefs.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/comdefs.h new file mode 100644 index 
0000000000000000000000000000000000000000..2d8dff178841f98b702a0e473e554679d0e2a154 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/comdefs.h @@ -0,0 +1,410 @@ +/************************* comdefs.h *************************************/ + +/* Definitions for communications for the SU3 program on the Intel machine, + version 4. + + Communications routines will assume that the lattice is stored as an + array of structures of type "site". + + MODIFIED FOR 3D CODE Kari Rummukainen 1997 +*/ + +/* message types (not all are used on all machines) */ +#define PARAM_TYPE 11 /* type of parameter message to nodes */ +#define FIELD_TYPE 12 /* type of field sent from one node to another */ +#define BROADCAST_FLOAT_TYPE 13 /* broadcast of radix (floating point) number */ +#define BROADCAST_DOUBLE_TYPE 14 /* broadcast of double */ +#define BROADCAST_COMPLEX_TYPE 15 /* broadcast of single precision complex */ +#define BROADCAST_DCOMPLEX_TYPE 16 /* broadcast of double precision complex */ +#define SEND_INTEGER_TYPE 17 /* send an integer to one other node */ +#define CM_GATHER_TYPE 18 /* type for CM5 (cooperative) messages */ +#define SYNC_TYPE 50 /* Synchronize all nodes */ +#define SUM_FLOAT_TYPE 51 /* Sum radix over all nodes */ +#define SUM_DOUBLE_TYPE 52 /* Sum double over all nodes */ +#define SUM_COMPLEX_TYPE 53 /* Sum complex over all nodes */ +#define SUM_DCOMPLEX_TYPE 54 /* Sum double_complex over all nodes */ +#define MAX_FLOAT_TYPE 55 /* Maximum radix over all nodes */ +#define MAX_DOUBLE_TYPE 56 /* Maximum double over all nodes */ + +/* For MPI, need to include mpi.h to define MPI_Request */ +/* For MPI on paragon, mynode() and numnodes() can't replace nx routines */ +#ifdef MPI +#include <mpi.h> +#define mynode MILC_mynode +#define numnodes MILC_numnodes +#define dclock MILC_dclock +#endif + +/* Added for pvm */ +#ifdef PVM +#define ANY_MSG -1 /* Any message */ + +/* Message structures for communication between host and nodes */ +/* The first two fields must
always be the same for each type */ +/* The basic structure must be the shortest */ + +/* For most messages */ +struct hcs_basic { + int msg_type; + int node; /* integer identifies caller's instance */ + int arg1,arg2,arg3; /* Use depends on which routine */ +} ; +#define HCS_BASIC_SIZE (sizeof(struct hcs_basic)) +#ifdef PROTO +int put_hcs_basic( struct hcs_basic * hcs ); +int get_hcs_basic( struct hcs_basic * hcs ); +#endif + +/* For printf and scanf calls */ +#define STRING_LENGTH 256 +struct hcs_stdio { + int msg_type; + int node; /* integer identifies caller's instance */ + int length; + char s[STRING_LENGTH]; +} ; +#define HCS_STDIO_SIZE (sizeof(struct hcs_stdio)) +#ifdef PROTO +int put_hcs_stdio( struct hcs_stdio * hcs ); +int get_hcs_stdio( struct hcs_stdio * hcs ); +#endif + +/* For initialization call */ +#define MAX_NUMBER_NODES 8 +#define HOST_NAME_LENGTH 128 +struct hcs_ident { + int msg_type; + int node; /* integer identifies caller's instance */ + int your_node; /* logical node number for this node */ + int number_nodes; /* how many nodes for this partition */ + int node_instance[MAX_NUMBER_NODES]; /* instance number for logical node */ + char host_name[HOST_NAME_LENGTH]; /* name of host */ +} ; +#define HCS_IDENT_SIZE (sizeof(struct hcs_ident)) +#ifdef PROTO +int put_hcs_ident( struct hcs_ident * hcs ); +int get_hcs_ident( struct hcs_ident * hcs ); +#endif + +union { + struct hcs_basic basic; + struct hcs_stdio stdio; + struct hcs_ident ident; +} hcs; + +#define HOST_CALL 77 /* pvm message type for call to host for service */ +#define HOST_REPLY 87 + +/* Message subtypes internal to host-node service calls */ +/* (Not used by pvm to identify messages) */ +#define PRINTF_HOST_CALL 11 +#define SCANF_HOST_CALL 12 +#define FPRINTF_HOST_CALL 13 /* Not used */ +#define FSCANF_HOST_CALL 14 /* Not used */ +#define FFLUSH_HOST_CALL 20 +#define FOPEN_HOST_CALL 30 /* Not used */ +#define FCLOSE_HOST_CALL 31 /* Not used */ +#define OPEN_HOST_CALL 32 /* Not used 
*/ +#define CLOSE_HOST_CALL 33 /* Not used */ +#define CREAT_HOST_CALL 34 /* Not used */ +#define READ_HOST_CALL 40 /* Not used */ +#define WRITE_HOST_CALL 41 /* Not used */ +#define NODE_IDENT_CALL 78 +#define NODES_DONE_HOST_CALL 99 + +/* end of pvm additions */ +#endif /* end ifdef PVM */ + +/* Added for pvm */ +#ifdef PVM24 +#define ANY_MSG -1 /* Any message */ + +/* Message structures for communication between host and nodes */ +/* The first two fields must always be the same for each type */ +/* The basic structure must be the shortest */ + +/* For initialization call */ +#define MAX_NUMBER_NODES 8 +#define HOST_NAME_LENGTH 128 +struct hcs_ident_struct { + int msg_type; + int node; /* integer identifies caller's instance */ + int your_node; /* logical node number for this node */ + int number_nodes; /* how many nodes for this partition */ + int node_instance[MAX_NUMBER_NODES]; /* instance number for logical node */ + char host_name[HOST_NAME_LENGTH]; /* name of host */ +} hcs_ident ; +#define HCS_IDENT_SIZE (sizeof(struct hcs_ident_struct)) +#ifdef PROTO +int put_hcs_ident( struct hcs_ident_struct * hcs ); +int get_hcs_ident( struct hcs_ident_struct * hcs ); +#endif + +#define NODE_IDENT_CALL 77 +#define terminate g_terminate /* Because of name conflict */ + +/* end of pvm version 2.4 additions */ +#endif /* end ifdef PVM24 */ + + +/* Added for pvm */ +#ifdef PVM3 +#define ANY_MSG -1 /* Any message */ +#define ANY_NODE -1 /* Any node */ + +/* Message structures for communication between host and nodes */ + +/* For initialization call */ +#define MAX_NUMBER_NODES 8 +#define HOST_NAME_LENGTH 128 +struct hcs_ident_struct { + int msg_type; + int node; /* integer identifies caller's instance */ + int your_node; /* logical node number for this node */ + int number_nodes; /* how many nodes for this partition */ + int node_tid[MAX_NUMBER_NODES]; /* instance number for logical node */ + char host_name[HOST_NAME_LENGTH]; /* name of host */ +} hcs_ident ; +#define 
HCS_IDENT_SIZE (sizeof(struct hcs_ident_struct)) +#ifdef PROTO +int put_hcs_ident( struct hcs_ident_struct * hcs ); +int get_hcs_ident( struct hcs_ident_struct * hcs ); +#endif + +#define NODE_IDENT_CALL 77 + +/* end of pvm version 3 additions */ +#endif /* end ifdef PVM3 */ + +/* Added for MPL */ +#ifdef MPL +#define MPL_NOT_COMPLETED -1 /* For mpc_status */ +#define MPL_INACTIVE -2 /* For mpc_status */ +#define DONTCARE -1 /* Any message or any node */ +int mperrno; /* Used for error reporting */ +/* The following are from /usr/lpp/poe/include/mpproto.h */ +extern int mpc_environ(int *howmany,int *whoami); +extern int mpc_stopall(int errcode); +extern int mpc_group(int gsize,int glist[],int label,int *gid); +extern int mpc_send(char *sarr,int len,int dest,int type,int *msgid); +extern int mpc_recv(char *darr,int len,int *src,int *type,int *msgid); +extern int mpc_bsend(char *sarr,int len,int dest,int type); +extern int mpc_brecv(char *darr,int len,int *src,int *type,int *nbytes); +extern int mpc_status(int msgid); +extern int mpc_wait(int *msgid,int *nbytes); +extern int mpc_sync(int gid); + +extern void s_vadd(radix in1[],radix in2[],radix out[],int *len); +extern void d_vadd(double in1[],double in2[],double out[],int *len); +extern void s_vmax(radix in1[],radix in2[],radix out[],int *len); +extern void d_vmax(double in1[],double in2[],double out[],int *len); +#define MAX_NUMBER_NODES 64 +#define HOST_NAME_LENGTH 128 + +/* end of MPL additions */ +#endif /* end ifdef MPL */ + +#define FIELD_REQUEST 100 /* used by field_pointer...() */ +#define FIELD_REPLY 101 /* used by field_pointer...() */ + +#define N_S_GATHERS 5 /* max number of concurrent sparse gathers */ +#define SPARSE_GATHER_BASE 1000 +#define SPARSE_GATHER_STEP 1000 + +#define GENERAL_GATHER_BASE 6000 /* types from this to this+number_of_nodes + are used by the general_gather routines */ +#define GATHER_BASE 7000 /* types greater than or equal to this are used + by the gather routines */ +/* pid */ 
+#define NODE_PID 0 +#define HOST_PID 0 +#define ALL_NODES -1 /* works for Ncube, Intel */ /* Don't use with pvm */ + +/* definitions of restore and save lattice commands */ +#define CONTINUE 10 +#define FRESH 11 +#define RELOAD_ASCII 12 +#define RELOAD_BINARY 13 +#define RELOAD_CHECKPOINT 14 +#define FORGET 20 +#define SAVE_ASCII 21 +#define SAVE_BINARY 22 +#define SAVE_CHECKPOINT 23 + +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 5 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define ZDOWN 3 +#define YDOWN 4 +#define XDOWN 5 + +#define OPP_DIR(dir) (5-(dir)) /* Opposite direction */ +#define NDIRS 6 /* number of directions */ + +#define MAX_GATHERS 24 /* Maximum number of gather tables */ + +/* arguments to the make_gather() routine */ +#define FORWARDS 1 +#define BACKWARDS (-1) /* BACKWARDS = -FORWARDS */ +#define OWN_INVERSE 0 +#define WANT_INVERSE 1 +#define NO_INVERSE 2 +#define ALLOW_EVEN_ODD 0 +#define NO_EVEN_ODD 1 +#define SAME_PARITY 0 +#define SWITCH_PARITY 1 +#define SCRAMBLE_PARITY 2 + +/* "comlink" is the basic structure used in gathering neighboring sites. + Each node will maintain one such structure for each direction for each + (other) node that contains sites that are neighbors of the sites on + this node. For example, if the XUP neighbors of sites on this node + are found on two other nodes, then this node will maintain a linked + list of two comlink structures for gathering from the XUP direction. +*/ +struct comlink { + /* pointer to next in list, NULL if this is last */ + struct comlink *nextcomlink; + /* number of the node to which we connect */ + int othernode; + /* number of even sites on this node that have neighbors on + other node connected by this "comlink", and same for odd + sites on this node. 
*/ + int n_even_connected, n_odd_connected; + /* Address of list of indices of even sites (on this node) + whose neighbors are found through this comlink, same for odd. + The odd list follows the even list, so to get all sites you + start at esitelist and take n_even_connected+n_odd_connected + addresses. */ + /* When the comlink is describing sites to be sent, the "odd" + list lists sites whose neighbors are even. This convention + is natural for the nearest neighbor gathers. For gathers + which don't allow even and odd site gathers, the even list + is used for list of sites to be received and the odd + list for sites to be sent. Different comlink structures + may point to the same list. For example, the receive list + for one gather may be a send list for the opposite gather. */ + int *esitelist, *ositelist; +}; +typedef struct comlink comlink; + + +/* Structure to keep track of outstanding sends and receives */ +typedef struct { + /* node sending or receiving message */ + int msg_node; + /* size of message in bytes */ + int msg_size; + /* address of buffer malloc'd for message */ + char *msg_buf; + /* message id returned by system call */ +#ifdef MPI + MPI_Request msg_id; +#else + int msg_id; +#endif +#if defined(PVM) || defined(PVM24) || defined(PVM3) || defined(MPL) + int msg_OK; + /* flag to track the asynchronous arrival of messages */ +#endif +#ifdef MPL + int mpl_msgid; + /* MPL assigned message id for checking status with mpc_status */ +#endif +} msg_tag; + +/* Structure for requesting a field from another node */ +typedef struct { + int field; /* offset of field in site */ + int size; /* size of field */ + int index; /* index of field on other node */ +} msg_request; + + +/* Communications routines */ +void send_parameters(params *); +void get_parameters(params *); +void send_field(void *,int,int); +void get_field(void *,int); +char * machine_type(); +#ifndef PARAGON +int mynode(); +int numnodes(); +#endif +void g_sync(); +void g_radixsum(radix *); +void 
g_vecradixsum(radix *,int); +void g_doublesum(double *); +void g_vecdoublesum(double *,int); +void g_radixmax(radix *); +void g_doublemax(double *); +void g_complexsum(complex *); +void g_veccomplexsum(complex *,int); +void g_dcomplexsum(double_complex *); +void broadcast_field(void *p,int siz); +void broadcast_radix(radix *); +void broadcast_double(double *); +void broadcast_complex(complex *); +void broadcast_dcomplex(double_complex *); +void send_integer(int node,int *address); +void receive_integer(int *); +double dclock(); +int terminate(); int finishrun(); + +msg_tag *start_gather(field_offset,int size,int index,int parity,char **dest); +void wait_gather(msg_tag *mbuf); +void cleanup_gather(msg_tag *mbuf); +msg_tag *start_general_gather(field_offset f,int size,int *displacement, + int parity,char **dest); +void wait_general_gather(msg_tag *m); +void cleanup_general_gather(msg_tag *m); +char *field_pointer_at_coordinates( field_offset f, int size, + int x,int y,int z); +char *field_pointer_at_direction( field_offset f,int size, site *s, + int direction ); +void cleanup_field_pointer(char *b); + +msg_tag *start_sparse_gather(field_offset f,int size,int *displacement, + int xb, int yb ,char **dest); +void wait_sparse_gather(msg_tag *m); +void cleanup_sparse_gather(msg_tag *m); + + +/* Each node maintains a list of headers to lists of comlinks */ +/**EXTERN comlink * neighborlist[NDIRS];**/ +/* addresses of neighboring sites, NULL if off-node */ +/**EXTERN site ** neighbor[NDIRS];**/ + +#ifdef NONSENSE +#ifdef PE_CODE +#include +/* On CM5, redefine system calls */ +#define main CMPE_control +#define printf host_printf +#define scanf host_scanf +#define fprintf host_fprintf +#define fscanf host_fscanf +#define fflush host_fflush +#define fopen host_fopen +#define fclose host_fclose +#define open host_open +#define close host_close +#define creat host_creat +#define read host_read +#define write host_write +#endif +#endif + +#ifdef PVM +#ifndef HOST_CODE +#define 
terminate g_terminate /* Because of name conflict */ +#define scanf host_scanf +#define printf host_printf +#define fflush host_fflush +#endif +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/complex.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/complex.h new file mode 100644 index 0000000000000000000000000000000000000000..2bb95437d8c9d8627a3a183257916e5fe90427af --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/complex.h @@ -0,0 +1,200 @@ +/*============================================================================*/ +/* */ +/* Complex Numbers */ +/* */ +/* Typedefs are included for type complex (single-precision) and type */ +/* double_complex (double_precision) complex numbers. At this time, the */ +/* functions cannot be overloaded, so there are separate routines for the */ +/* single and double precision types. All of the macros, however, will work */ +/* with both types and mix types freely. */ +/* */ +/* The following functions are provided in single and double precision: */ +/* */ +/* complex cmplx(radix r, radix i); (r,i) */ +/* complex cadd(complex *a, complex *b); *a + *b */ +/* complex cmul(complex *a, complex *b); *a * *b */ +/* complex csub(complex *a, complex *b); *a - *b */ +/* complex cdiv(complex *a, complex *b); *a / *b */ +/* complex conjg(complex *a); conjugate of *a */ +/* complex cexp(complex *a); exp(*a) */ +/* complex clog(complex *a); ln(*a) */ +/* complex csqrt(complex *a); sqrt(a) */ +/* complex ce_itheta(radix theta); exp(i*theta) */ +/* */ +/* The following macros are provided, which work for BOTH single and double */ +/* precision and for mixtures: */ +/* */ +/* 1) Macros which appear to return values (radix or double, as appropriate): */ +/* cabs(*a) magnitude of the complex number *a */ +/* cabs_sq(*a) square of the magnitude (faster than cabs) */ +/* carg(*a) phase of the complex number *a */ +/* CABS(a) returns |a| */ +/* CABS_SQ(a) returns |a|^2 */ +/* CARG(a) returns 
phase of a */ +/* CRDOT(a,b) returns real part of conjg(a)*b */ +/* */ +/* 2) Macro to convert from single to double or double to single: */ +/* set_complex_equal(*a,*b) do *b=*a by components to convert */ +/* SET_COMPLEX(a,b,c) do c = a + ib */ +/* */ +/* 3) Macros for fast in-line operations: */ +/* CZERO(a) a = 0 */ +/* CONJG(a,b) b = conjg(a) */ +/* CADD(a,b,c) c = a + b */ +/* CSUM(a,b) a += b */ +/* CSUB(a,b,c) c = a - b */ +/* CMUL(a,b,c) c = a * b */ +/* CMUL_ADD(a,b,c) c += a * b = CMUL_SUM */ +/* CINV(a,b) b = 1 / a */ +/* CDIV(a,b,c) c = a / b */ +/* CMUL_J(a,b,c) c = a * conjg(b) _ADD */ +/* CMULJ_(a,b,c) c = conjg(a) * b _ADD */ +/* CMULJJ(a,b,c) c = conjg(a*b) _ADD */ +/* CNEGATE(a,b) b = -a */ +/* CMUL_I(a,b) b = ia */ +/* CMUL_MINUS_I(a,b) b = -ia */ +/* CMULREAL(a,b,c) c = ba with b real and a complex */ +/* CDIVREAL(a,b,c) c = a/b with a complex and b real */ +/* */ +/*============================================================================*/ + +/* Get in radix definition */ +#include "radix.h" + +/* On the paragon, under OSF, complex is defined in math.h, but not + quite the way we did it, so redefine it: +*/ + +#if ( defined PARAGON || defined HPUX ) +#define complex complexx +#endif + +/* The T3E UNICOS standard library has cexp, clog, and csqrt, but they + are for double-precision complex, while ours are single-precision */ +#ifdef T3E +#define cexp cexp_single +#define clog clog_single +#define csqrt csqrt_single +#endif + +/* The above bites us actually now with C99, so redefine it all here: */ +#define complex complexx +#define clog clog_single +#define cexp cexp_single +#define csqrt csqrt_single + +typedef struct { /* standard complex number declaration for single- */ + radix real; /* precision complex numbers */ + radix imag; +} complex; +typedef struct { /* standard complex number declaration for double- */ + double real; /* precision complex numbers */ + double imag; +} double_complex; + +/* define complex as a union to ensure 
alignment to doubleword boundary */ +/*typedef union { ** standard complex number declaration for single- ** + radix f[2]; ** precision complex numbers ** + double dummy; +} complex; +typedef struct { ** standard complex number declaration for double- ** + double f[2]; ** precision complex numbers ** +} double_complex; */ +/*#define real f[0] */ +/*#define imag f[1] */ + + +/* Function Prototypes for Complex Numbers */ +complex cmplx( radix x, radix y ); +complex cadd( complex *a, complex *b ); +complex cmul( complex *a, complex *b ); +complex csub( complex *a, complex *b ); +complex cdiv( complex *a, complex *b ); +complex conjg( complex *a ); +complex cexp( complex *a ); +complex clog( complex *a ); +complex csqrt( complex *z ); +complex ce_itheta( radix theta ); + +double_complex dcmplx( double x, double y ); +double_complex dcadd( double_complex *a, double_complex *b ); +double_complex dcmul( double_complex *a, double_complex *b ); +double_complex dcsub( double_complex *a, double_complex *b ); +double_complex dcdiv( double_complex *a, double_complex *b ); +double_complex dconjg( double_complex *a ); +double_complex dcexp( double_complex *a ); +double_complex dclog( double_complex *a ); +double_complex dcsqrt( double_complex *z ); +double_complex dce_itheta( double theta ); + +/* Macros for Complex Numbers */ + +#define CZERO(a) (a).real = (a).imag = ((radix)0.0) + /* *b = *a */ +#define set_complex_equal(a,b) { (*b).real=(*a).real; (*b).imag=(*a).imag; } + +#define SET_COMPLEX(a,b,c) { (c).real = a; (c).imag = b; } + /* |*a| */ +#define cabs(a) (sqrt( (*a).real*(*a).real + (*a).imag*(*a).imag ) ) +#define CABS(a) (sqrt( (a).real * (a).real + (a).imag * (a).imag ) ) + /* *a * *a* */ +#define dcabs cabs +#define cabs_sq(a) ( (*a).real*(*a).real + (*a).imag*(*a).imag ) +#define CABS_SQ(a) ( (a).real * (a).real + (a).imag * (a).imag ) + /* phase(*a) */ +#define carg(a) (atan2((double)(*a).imag, (double)(*a).real ) ) +#define CARG(a) (atan2((double)(a).imag, 
(double)(a).real ) ) +/* real of conjg(a)*b */ +#define CRDOT(a,b) ((a).real * (b).real + (a).imag * (b).imag) + /* b = a* */ +#define dcarg carg +#define CONJG(a,b) { (b).real = (a).real; (b).imag = -(a).imag; } + /* c = a + b */ +#define CADD(a,b,c) { (c).real = (a).real + (b).real; \ + (c).imag = (a).imag + (b).imag; } + /* a += b */ +#define CSUM(a,b) { (a).real += (b).real; (a).imag += (b).imag; } + /* c = a - b */ +#define CSUB(a,b,c) { (c).real = (a).real - (b).real; \ + (c).imag = (a).imag - (b).imag; } + /* c = a * b */ +#define CMUL(a,b,c) { (c).real = (a).real*(b).real - (a).imag*(b).imag; \ + (c).imag = (a).real*(b).imag + (a).imag*(b).real; } + /* c += a * b */ +#define CMUL_SUM(a,b,c) { (c).real += (a).real*(b).real - (a).imag*(b).imag; \ + (c).imag += (a).real*(b).imag + (a).imag*(b).real; } +#define CMUL_ADD CMUL_SUM + /* c = a / b */ +#define CDIV(a,b,c) { double t_t = (b).real*(b).real + (b).imag*(b).imag; \ + (c).real = ((a).real*(b).real + (a).imag*(b).imag)/t_t; \ + (c).imag = ((a).imag*(b).real - (a).real*(b).imag)/t_t; } + /* b = 1 / a */ +#define CINV(a,b) { double t_t = (a).real*(a).real + (a).imag*(a).imag; \ + (b).real = (a).real/t_t; (b).imag = -(a).imag/t_t; } + /* c = a * b* */ +#define CMUL_J(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ + (c).imag = (a).imag*(b).real - (a).real*(b).imag; } + /* c += a * b* */ +#define CMUL_J_ADD(a,b,c) { (c).real += (a).real*(b).real + (a).imag*(b).imag; \ + (c).imag += (a).imag*(b).real - (a).real*(b).imag; } + /* c = a* * b */ +#define CMULJ_(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ + (c).imag = (a).real*(b).imag - (a).imag*(b).real; } + /* c += a* * b */ +#define CMULJ__ADD(a,b,c) { (c).real += (a).real*(b).real + (a).imag*(b).imag; \ + (c).imag += (a).real*(b).imag - (a).imag*(b).real; } + /* c = (a*b)* */ +#define CMULJJ(a,b,c) { (c).real = (a).real*(b).real - (a).imag*(b).imag; \ + (c).imag = -(a).real*(b).imag - (a).imag*(b).real; } +#define CMULJJ_ADD(a,b,c) { (c).real +=
(a).real*(b).real - (a).imag*(b).imag; \ + (c).imag += -(a).real*(b).imag - (a).imag*(b).real; } + /* b = - a */ +#define CNEGATE(a,b) { (b).real = -(a).real; (b).imag = -(a).imag; } + /* b = ia */ +#define CMUL_I(a,b) { (b).real = -(a).imag; (b).imag = (a).real; } + /* b = -ia */ +#define CMUL_MINUS_I(a,b) { (b).real = (a).imag; (b).imag = -(a).real; } + /* c = ba */ +#define CMULREAL(a,b,c) { (c).real = (b) * (a).real; (c).imag = (b)*(a).imag; } + /* c = a/b */ +#define CDIVREAL(a,b,c) { (c).real = (a).real/(b); (c).imag = (a).imag/(b); } diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/complextr.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/complextr.c new file mode 100644 index 0000000000000000000000000000000000000000..0afc2915f4ec50b087612a358ea39e80077ec471 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/complextr.c @@ -0,0 +1,19 @@ +/****************** complextr.c (in su3.a) **************************** +* * +* complex complextrace_su3( su3_matrix *a,*b) * +* return Tr( A_adjoint*B ) * +*/ +#include "complex.h" +#include "su3.h" + +complex complextrace_su3( su3_matrix *a, su3_matrix *b ) { +register int i,j; +register radix sumr, sumi; +complex sum; + for(sumr=0.0,sumi=0.0,i=0;i<3;i++)for(j=0;j<3;j++){ + sumr+= a->e[i][j].real*b->e[i][j].real + a->e[i][j].imag*b->e[i][j].imag; + sumi+= a->e[i][j].real*b->e[i][j].imag - a->e[i][j].imag*b->e[i][j].real; + } + sum.real= sumr; sum.imag=sumi; + return(sum); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/conjg.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/conjg.c new file mode 100644 index 0000000000000000000000000000000000000000..a34198589be3d7badbdee98b30e504fefd1509e4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/conjg.c @@ -0,0 +1,10 @@ +/* Subroutines for operations on complex numbers */ +/* complex conjugate */ +#include "complex.h" + +complex conjg( complex *a ){ + complex c; + c.real = 
(*a).real; + c.imag = -(*a).imag; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/copy_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/copy_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..305eef040c4e6d6e7781ce9baaa9a7b8517757bd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/copy_wvec.c @@ -0,0 +1,12 @@ +/******************** copy_wvec.c (in su3.a) ******************** +* +*void copy_wvec( wilson_vector *src,*dest ) +* copy a Wilson vector +* dest <- src +*/ +#include "complex.h" +#include "su3.h" + +void copy_wvec( wilson_vector *src, wilson_vector *dest ){ + *dest = *src; /* hardly worth a function */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..0f93c396e8e4eb85c3bae24b0bbc7fe7f077e04b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_mat.c @@ -0,0 +1,38 @@ +/****************** cs_m_a_mat.c (in su3.a) *************************** +* * +* c_scalar_mult_add_su3mat( su3_matrix *ma, su3_matrix *m2, * +* complex *phase, su3_matrix *m3) * +* multiply an su3 matrix by a complex scalar and add it to another * +* matrix: m3 <- m1 + number*m2 * +*/ +#include "complex.h" +#include "su3.h" + +void c_scalar_mult_add_su3mat( su3_matrix *m1, su3_matrix *m2, + complex *phase, su3_matrix *m3){ + +#ifndef NATIVEDOUBLE +register int i,j; +complex t; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + t = cmul(&m2->e[i][j],phase); + m3->e[i][j] = cadd(&m1->e[i][j],&t); + } + +#else +register int i,j; +register double sr,si,br,bi,cr,ci; + + sr = (*phase).real; si = (*phase).imag; + + for(i=0;i<3;i++)for(j=0;j<3;j++){ + br=m2->e[i][j].real; bi=m2->e[i][j].imag; + + cr = sr*br - si*bi; + ci = sr*bi + si*br; + + m3->e[i][j].real = m1->e[i][j].real + cr; + m3->e[i][j].imag = m1->e[i][j].imag + ci; + } 
+#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..d0e36aaa6827fd6096c96e2ea5d73951a4d8a4e5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_vec.c @@ -0,0 +1,35 @@ +/****************** cs_m_a_vec.c (in su3.a) *************************** +* * +* c_scalar_mult_add_su3vec(): * +* multiply an su3 vector by a complex scalar and add it to another * +* vector: v1 <- v1 + number*v2 * +*/ +#include "complex.h" +#include "su3.h" + +void c_scalar_mult_add_su3vec( su3_vector *v1, complex *phase, su3_vector *v2 ){ + +#ifndef NATIVEDOUBLE +register int i; +complex t; + for(i=0;i<3;i++){ + t = cmul(&v2->c[i],phase); + v1->c[i] = cadd(&v1->c[i],&t); + } +#else +register int i; +register double sr,si,br,bi,cr,ci; + + sr = (*phase).real; si = (*phase).imag; + + for(i=0;i<3;i++){ + br=v2->c[i].real; bi=v2->c[i].imag; + + cr = sr*br - si*bi; + ci = sr*bi + si*br; + + v1->c[i].real += cr; + v1->c[i].imag += ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..ccc21d61b9e2503a7799e8999b3087e0c813fd87 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_wvec.c @@ -0,0 +1,42 @@ +/******************** cs_m_a_wvec.c (in su3.a) ******************** +* +*void c_scalar_mult_add_wvec(wilson_vector *src1, wilson_vector *src2, + complex *s, wilson_vector *dest) +* Multiply a Wilson vector by a complex scalar and add to another vector +* dest <- src1 + s*src2 +*/ +#include "complex.h" +#include "su3.h" + +void c_scalar_mult_add_wvec(wilson_vector *src1,wilson_vector *src2,complex + *phase, wilson_vector *dest) { + +#ifndef NATIVEDOUBLE +register int i,j; +complex t; + for(i=0;i<4;i++){ + 
for(j=0;j<3;j++){ + t = cmul(&src2->d[i].c[j],phase); + dest->d[i].c[j] = cadd(&src1->d[i].c[j],&t); + } + } + +#else +register int i,j; +register double sr,si,br,bi,cr,ci; + + sr = (*phase).real; si = (*phase).imag; + + for(i=0;i<4;i++){ + for(j=0;j<3;j++){ + br=src2->d[i].c[j].real; bi=src2->d[i].c[j].imag; + + cr = sr*br - si*bi; + ci = sr*bi + si*br; + + dest->d[i].c[j].real = src1->d[i].c[j].real + cr; + dest->d[i].c[j].imag = src1->d[i].c[j].imag + ci; + } + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_wvec2.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_wvec2.c new file mode 100644 index 0000000000000000000000000000000000000000..78f2f7f6eda6826cf604d557c6a6f59af1340ac4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_a_wvec2.c @@ -0,0 +1,27 @@ +/******************** cs_m_a_wvec2.c (in su3.a) ******************** +* +*void c_scalar_mult_add_wvec2(wilson_vector *src1, wilson_vector *src2, + complex s, wilson_vector *dest) +* Multiply a Wilson vector by a complex scalar and add to another vector +* dest <- src1 + s*src2 +*/ +#include "complex.h" +#include "su3.h" + +void c_scalar_mult_add_wvec2( wilson_vector *src1,wilson_vector *src2, + complex s, wilson_vector *dest ){ + wilson_vector src3; + register int i,j; + + scalar_mult_add_wvec( src1, src2, (s.real), dest ); + + for(i=0;i<4;i++) { + for(j=0;j<3;j++) { + src3.d[i].c[j].real = -(src2->d[i].c[j].imag); + src3.d[i].c[j].imag = src2->d[i].c[j].real; + } + } + + scalar_mult_add_wvec( dest, &src3, (s.imag), dest); + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..679a3aa0ce2147a5f46a7740425c350e28743534 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_mat.c @@ -0,0 +1,36 @@ +/**************** cs_m_mat.c (in su3.a) 
******************************* +* * +* void c_scalar_mult_su3mat( su3_matrix *b, complex *s, su3_matrix *c) * +* C <- s*B, B and C matrices * +*/ +#include "complex.h" +#include "su3.h" + +/* c <- s*b, matrices */ +void c_scalar_mult_su3mat( su3_matrix *b, complex *s, su3_matrix *c ){ + +#ifndef NATIVEDOUBLE +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + c->e[i][j] = cmul(&b->e[i][j], s); + /* old: c->e[i][j].real = s.real*b->e[i][j].real-s.imag*b->e[i][j].imag; + c->e[i][j].imag = s.real*b->e[i][j].imag + s.imag*b->e[i][j].real; */ + } + +#else +register int i,j; +register double sr,si,br,bi,cr,ci; + + sr = (*s).real; si = (*s).imag; + + for(i=0;i<3;i++)for(j=0;j<3;j++){ + br=b->e[i][j].real; bi=b->e[i][j].imag; + + cr = sr*br - si*bi; + ci = sr*bi + si*br; + + c->e[i][j].real = cr; + c->e[i][j].imag = ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_s_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_s_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..9c284e9cf796f39da79f0e7f571385a55a86c331 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_s_mat.c @@ -0,0 +1,38 @@ +/**************** cs_m_s_mat.c (in su3.a) ***************************** +* * +* void c_scalar_mult_sub_su3mat( su3_matrix *a, su3_matrix *b, * +* complex *s, su3_matrix *c) * +* C <- A - s*B, A,B and C matrices * +*/ +#include "complex.h" +#include "su3.h" + +/* c <- a - s*b, matrices */ +void c_scalar_mult_sub_su3mat( su3_matrix *a, su3_matrix *b, complex *s, + su3_matrix *c){ + +#ifndef NATIVEDOUBLE +register int i,j; +complex t; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + t = cmul(&b->e[i][j], s); + c->e[i][j] = csub(&a->e[i][j], &t); + } + +#else +register int i,j; +register double sr,si,br,bi,cr,ci; + + sr = (*s).real; si = (*s).imag; + + for(i=0;i<3;i++)for(j=0;j<3;j++){ + br=b->e[i][j].real; bi=b->e[i][j].imag; + + cr = sr*br - si*bi; + ci = sr*bi + si*br; + + c->e[i][j].real = 
a->e[i][j].real - cr; + c->e[i][j].imag = a->e[i][j].imag - ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_s_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_s_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..1c77c41eec647f06c4c63f1eb83b7c8e8d61e6a9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_s_vec.c @@ -0,0 +1,35 @@ +/******************* cs_m_s_vec.c (in su3.a) ************************** +* * +* c_scalar_mult_sub_su3vec() * +* multiply an su3 vector by a complex scalar and subtract it from * +* another vector: v1 <- v1 - number*v2 * +*/ +#include "complex.h" +#include "su3.h" + +void c_scalar_mult_sub_su3vec( su3_vector *v1, complex *phase, su3_vector *v2 ){ + +#ifndef NATIVEDOUBLE +register int i; +complex t; + for(i=0;i<3;i++){ + t = cmul(&v2->c[i],phase); + v1->c[i] = csub(&v1->c[i],&t); + } +#else +register int i; +register double sr,si,br,bi,cr,ci; + + sr = (*phase).real; si = (*phase).imag; + + for(i=0;i<3;i++){ + br=v2->c[i].real; bi=v2->c[i].imag; + + cr = sr*br - si*bi; + ci = sr*bi + si*br; + + v1->c[i].real -= cr; + v1->c[i].imag -= ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..03619ee7fe8f76cba3d94fa6c631eaffebe32c38 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/cs_m_vec.c @@ -0,0 +1,34 @@ +/******************* cs_m_vec.c (in su3.a) **************************** +* * +* c_scalar_mult_su3vec(): * +* multiply an su3 vector by a complex scalar * +* dest <- number*src * +*/ +#include "complex.h" +#include "su3.h" + +void c_scalar_mult_su3vec( su3_vector *src, complex *phase, su3_vector *dest ){ + +#ifndef NATIVEDOUBLE +register int i; + for(i=0;i<3;i++){ + dest->c[i] = cmul(&src->c[i],phase); + } + +#else +register int i; +register 
double sr,si,br,bi,cr,ci; + + sr = (*phase).real; si = (*phase).imag; + + for(i=0;i<3;i++){ + br=src->c[i].real; bi=src->c[i].imag; + + cr = sr*br - si*bi; + ci = sr*bi + si*br; + + dest->c[i].real = cr; + dest->c[i].imag = ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/csqrt.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/csqrt.c new file mode 100644 index 0000000000000000000000000000000000000000..941ce8caab658d8dd086cbf1d93dac47f67792a3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/csqrt.c @@ -0,0 +1,14 @@ +/* Subroutines for operations on complex numbers */ +/* complex square root */ +#include +#include "complex.h" + +complex csqrt( complex *z ){ +complex c; +radix theta,r; + r = sqrt(hypot(z->real,z->imag)); + theta = 0.5*atan2(z->imag,z->real); + c = ce_itheta(theta); + c.real *=r; c.imag *= r; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/csub.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/csub.c new file mode 100644 index 0000000000000000000000000000000000000000..cc6ada989dd1aa2c0a43fe83d21777ef2611343e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/csub.c @@ -0,0 +1,10 @@ +/* Subroutines for operations on complex numbers */ +/* complex subtract */ +#include "complex.h" + +complex csub( complex *a, complex *b ) { + complex c; + c.real = (*a).real - (*b).real; + c.imag = (*a).imag - (*b).imag; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcadd.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcadd.c new file mode 100644 index 0000000000000000000000000000000000000000..fb10793d5d22b559d9f77206b1f8b96ec4d00ba8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcadd.c @@ -0,0 +1,10 @@ +/* Subroutines for operations on complex numbers */ +/* double complex add */ +#include "complex.h" + +double_complex dcadd( double_complex *a, double_complex *b ){ + 
/*
 * dcdiv: double-precision complex quotient (*a) / (*b), returned by value.
 *
 * Computed as a * conj(b) / |b|^2.  No check is made for b == 0; the
 * reciprocal of |b|^2 is taken as-is.
 */
double_complex dcdiv( double_complex *a, double_complex *b ){
    double_complex quot;
    double inv_norm2;

    /* 1 / |b|^2 */
    inv_norm2 = 1.0/(b->real*b->real + b->imag*b->imag);

    quot.real = inv_norm2*(a->real*b->real + a->imag*b->imag);
    quot.imag = inv_norm2*(a->imag*b->real - a->real*b->imag);
    return quot;
}
"complex.h" + +double_complex dcexp( double_complex *a ){ + double_complex c; + double mag; + mag = (double)exp( (double)(*a).real ); + c.real = mag*(double)cos( (double)(*a).imag ); + c.imag = mag*(double)sin( (double)(*a).imag ); + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dclog.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dclog.c new file mode 100644 index 0000000000000000000000000000000000000000..2349b418a32648c200445a510489eb5501c6aaf9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dclog.c @@ -0,0 +1,11 @@ +/* Subroutines for operations on complex numbers */ +/* double complex logarithm */ +#include +#include "complex.h" + +double_complex dclog( double_complex *a ){ + double_complex c; + c.real = 0.5*(double)log((double)((*a).real*(*a).real+(*a).imag*(*a).imag)); + c.imag = (double)atan2( (double)(*a).imag, (double)(*a).real ); + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcmplx.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcmplx.c new file mode 100644 index 0000000000000000000000000000000000000000..45a5e1468b67c434a3f0db91d9d6b9988944e06e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcmplx.c @@ -0,0 +1,9 @@ +/* Subroutines for operations on complex numbers */ +/* make a double complex number from two double precision reals */ +#include "complex.h" + +double_complex dcmplx( double x, double y ){ + double_complex c; + c.real = x; c.imag = y; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcmul.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcmul.c new file mode 100644 index 0000000000000000000000000000000000000000..8ef680fd43aba2bf9e7b68a4ff132bc76fbc203c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcmul.c @@ -0,0 +1,10 @@ +/* Subroutines for operations on complex numbers */ +/* double complex multiply */ +#include "complex.h" + 
+double_complex dcmul( double_complex *a, double_complex *b ){ + double_complex c; + c.real = (*a).real * (*b).real - (*a).imag * (*b).imag; + c.imag = (*a).imag * (*b).real + (*a).real * (*b).imag; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dconjg.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dconjg.c new file mode 100644 index 0000000000000000000000000000000000000000..62efb9c4f75205a2afe9289e84c478c0308513ca --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dconjg.c @@ -0,0 +1,10 @@ +/* Subroutines for operations on complex numbers */ +/* double precision complex conjugate */ +#include "complex.h" + +double_complex dconjg( double_complex *a ){ + double_complex c; + c.real = (*a).real; + c.imag = -(*a).imag; + return(c); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcsqrt.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcsqrt.c new file mode 100644 index 0000000000000000000000000000000000000000..6a1db45ea1b5707cef4436cff57d71c32539f201 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcsqrt.c @@ -0,0 +1,15 @@ +/* Subroutines for operations on complex numbers */ +/* double precision complex square root */ +#include +#include "complex.h" + +double_complex dcsqrt( double_complex *z ){ +double_complex c; +double theta,r; + r = sqrt(hypot(z->real,z->imag)); + theta = 0.5*atan2(z->imag,z->real); + c = dce_itheta(theta); + c.real *=r; c.imag *= r; + return(c); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcsub.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcsub.c new file mode 100644 index 0000000000000000000000000000000000000000..989254f8ea7f5b13bf52809eeabf80e5caf7fa34 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dcsub.c @@ -0,0 +1,10 @@ +/* Subroutines for operations on complex numbers */ +/* double complex subtract */ +#include "complex.h" + +double_complex dcsub( 
/* FIX THIS - more efficient to take cross product of first two
   rows, dot with third. */
/*
 * det_su3: complex determinant of the 3x3 matrix *a, returned by value.
 *
 * Accumulates the six signed triple products of the full expansion
 *   + e00 e11 e22  - e00 e12 e21
 *   + e01 e12 e20  - e01 e10 e22
 *   + e02 e10 e21  - e02 e11 e20
 * cc holds the product of the first two factors of a term, dd the complete
 * term, sum the running total.
 *
 * NOTE(review): CMUL/CADD/CSUB are macros from complex.h; they are assumed
 * to take (x, y, result) and store x*y, x+y and x-y in result - confirm.
 */
complex det_su3( su3_matrix *a ) {
register complex cc,dd,sum;
    /* + e00 e11 e22 (first term initialises sum directly) */
    CMUL(a->e[0][0],a->e[1][1],cc);
    CMUL(cc,a->e[2][2],sum);
    /* - e00 e12 e21 */
    CMUL(a->e[0][0],a->e[1][2],cc);
    CMUL(cc,a->e[2][1],dd);
    CSUB(sum,dd,sum);
    /* + e01 e12 e20 */
    CMUL(a->e[0][1],a->e[1][2],cc);
    CMUL(cc,a->e[2][0],dd);
    CADD(sum,dd,sum);
    /* - e01 e10 e22 */
    CMUL(a->e[0][1],a->e[1][0],cc);
    CMUL(cc,a->e[2][2],dd);
    CSUB(sum,dd,sum);
    /* + e02 e10 e21 */
    CMUL(a->e[0][2],a->e[1][0],cc);
    CMUL(cc,a->e[2][1],dd);
    CADD(sum,dd,sum);
    /* - e02 e11 e20 */
    CMUL(a->e[0][2],a->e[1][1],cc);
    CMUL(cc,a->e[2][0],dd);
    CSUB(sum,dd,sum);
    return(sum);
}
for(j=0;j<3;j++)printf("(%.2e,%.2e)\t", + v->d[i].c[j].real,v->d[i].c[j].imag); + printf("\n"); + } + printf("\n"); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dumpmat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dumpmat.c new file mode 100644 index 0000000000000000000000000000000000000000..248a323418a72b3de7d8c3960b1d13e4c66422b9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dumpmat.c @@ -0,0 +1,18 @@ +/****************** dumpmat.c (in su3.a) ****************************** +* * +* void dumpmat( su3_matrix *mat ) * +* print out a 3x3 complex matrix * +*/ +#include +#include "complex.h" +#include "su3.h" + +void dumpmat( su3_matrix *m ){ +int i,j; + for(i=0;i<3;i++){ + for(j=0;j<3;j++)printf("(%.2e,%.2e)\t", + m->e[i][j].real,m->e[i][j].imag); + printf("\n"); + } + printf("\n"); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dumpvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dumpvec.c new file mode 100644 index 0000000000000000000000000000000000000000..7a34ec3190796a2f80f8dd8f0cef2b992c41d445 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/dumpvec.c @@ -0,0 +1,15 @@ +/******************* dumpvec.c (in su3.a) ***************************** +* * +* void dumpvec( su3_vector *vec ) * +* print out a 3 element complex vector * +*/ +#include +#include "complex.h" +#include "su3.h" + +void dumpvec( su3_vector *v ){ +int j; + for(j=0;j<3;j++)printf("(%.2e,%.2e)\t", + v->c[j].real,v->c[j].imag); + printf("\n"); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/flush_to_zero.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/flush_to_zero.c new file mode 100644 index 0000000000000000000000000000000000000000..bac931918ee2b5b38fbb3d61b8801094a16784d4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/flush_to_zero.c @@ -0,0 +1,4 @@ +/** flush_to_zero.c ***/ + +/* DUMMY ROUTINE - nothing to do unless on Intel 
machine */ +void flush_to_zero(){} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/gaussrand.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/gaussrand.c new file mode 100644 index 0000000000000000000000000000000000000000..d58b790b5683961073a3fd360351d2c68f06fbb5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/gaussrand.c @@ -0,0 +1,35 @@ +/***************** gaussrand.c (in su3.a) ***************************** +* * +* radix gaussian_ran_no( passthru *prn_pt ) * +* Gaussian distributed random number * +* Probability distribution exp( -x*x ), so < x^2 > = 1/2 * +* This requires a random number generator named "myrand()", returning * +* a radix uniformly distributed between zero and one. The argument of * +* this routine is a pointer to be passed to myrand(). * +*/ + +#include +#include "complex.h" +#include "su3.h" + +radix gaussian_rand_no( void *prn_pt ){ +radix myrand(); +static int iset=0; +static radix gset; +radix fac,r,v1,v2; + + if (iset == 0) { + do { + v1=2.0*myrand(prn_pt)-1.0; + v2=2.0*myrand(prn_pt)-1.0; + r=v1*v1+v2*v2; + } while (r >= 1.0); + fac=sqrt( -log((double)r)/(double)r); + gset=v1*fac; + iset=1; + return v2*fac; + } else { + iset=0; + return gset; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_clover.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_clover.h new file mode 100644 index 0000000000000000000000000000000000000000..50647075165c9e524c67a72b1a36ff1f9dd688b0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_clover.h @@ -0,0 +1,87 @@ +/************************ generic_clover.h ****************************** +* * +* Macros and declarations for generic_clover routines * +* This header is for codes that call generic_clover routines * +* MIMD version 5 * +* * +*/ + + +int clover_invert( /* Return value is number of iterations taken */ + field_offset src, /* type wilson_vector (source already created)*/ + 
/*
 * clover_invert_lean: inverter variant that creates the source itself.
 *
 * Unlike clover_invert(), which takes an already-created source and a
 * field in which to save it, this variant is handed a source function plus
 * its parameters and has no save field.  The per-argument comments below
 * are the authoritative description of each parameter.
 */
int clover_invert_lean( /* Return value is number of iterations taken */
    field_offset src,   /* type wilson_vector (where source is to be created)*/
    field_offset dest,  /* type wilson_vector (answer and initial guess) */
    field_offset tmp,   /* type wilson_vector (workspace used only for bi-cg)*/
    void (*source_func)(field_offset src, 
			wilson_quark_source *wqs),  /* source function */
    wilson_quark_source *wqs, /* source parameters */
    int MinCG,          /* minimum number of iterations per restart */
    int MaxCG,          /* maximum number of iterations per restart */
    int nrestart,       /* maximum restarts */
    radix RsdCG,        /* desired residual - 
			   normalized as sqrt(r*r)/sqrt(src_e*src_e) */
    radix *size_r,      /* resulting residual */
    int start_flag,     /* 0: use a zero initial guess; 1: use dest */
    radix Kappa,        /* hopping */
    radix Clov_c,       /* Perturbative clover coeff */
    radix U0,           /* Tadpole correction to Clov_c */
    field_offset f_mn   /* size of su3_matrix (workspace) */
    );
/*
 * bicgilu_cl: clover inverter entry point; returns the number of
 * iterations taken.
 *
 * Takes the same arguments as cgilu_cl() plus one extra scratch field, rv,
 * of size wilson_vector.  Note that src is overwritten.  The per-argument
 * comments below are the authoritative description of each parameter.
 */
int bicgilu_cl(         /* Return value is number of iterations taken */
    field_offset src,   /* type wilson_vector (source vector - OVERWRITTEN!)*/
    field_offset dest,  /* type wilson_vector (answer and initial guess )*/
    int MinCG,          /* minimum number of iterations per restart */
    int MaxCG,          /* maximum number of iterations */
    radix RsdCG,        /* desired residual - 
			   normalized as sqrt(r*r)/sqrt(src_e*src_e) */
    radix *size_r,      /* resulting residual */
    int flag,           /* 0: use a zero initial guess; 1: use dest */
    radix Kappa,        /* hopping */
    field_offset rv,    /* Scratch space of size wilson_vector */
    radix Clov_c,       /* Perturbative clover coeff */
    radix U0,           /* Tadpole correction to Clov_c */
    field_offset f_mn   /* Scratch space of size su3_matrix */
    );
************************************* +* * +* Macros and declarations for miscellaneous generic routines * +* This header is for codes that call generic_form routines * +* MIMD version 5 * +* * +*/ + +void c_scale_wilson_vector(wilson_vector *m , complex scale); +void copy_site_wilson_vector(field_offset src, field_offset dest) ; +void flip_source_re(field_offset quark_prop); +int load_momentum_from_disk(int mom_in[][3], char filename[], int max_mom); +void load_scalar_smear(radix *data, int dim, char filename[]); +void load_smearing(field_offset where_smear, char filename[80]); +void mult_gamma(int phase, gamma_matrix *g1, gamma_matrix *g2, gamma_matrix *g3); +void make_gammas(gamma_matrix *gamma); +void mult_sw_by_gamma_l(spin_wilson_vector * src, + spin_wilson_vector * dest, int dir); +void mult_sw_by_gamma_r(spin_wilson_vector * src, + spin_wilson_vector * dest, int dir); +void meson_cont_mom(complex prop[], + field_offset src1,field_offset src2, + int base_pt, int q_stride, int op_stride, + gamma_corr gamma_table[], int no_gamma_corr); +void load_wilson_source(field_offset src, field_offset dest,int color,int spin); + +void load_wvec(wilson_vector *dest, complex *z, int spin, int colour) ; + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_ks.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_ks.h new file mode 100644 index 0000000000000000000000000000000000000000..b2069e359de5fc89b57ea62f78f57db47d53a0f1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_ks.h @@ -0,0 +1,42 @@ +/************************ generic_ks.h ********************************** +* * +* Macros and declarations for generic_ks routines * +* This header is for codes that call generic_ks routines * +* MIMD version 5 * +* * +*/ + +int congrad( int niter, radix rsqmin, int parity, radix *rsq ); +void copy_latvec(field_offset src, field_offset dest, int parity); +void dslash( field_offset src, field_offset dest, int 
parity ); +void dslash_special( field_offset src, field_offset dest, + int parity, msg_tag **tag, int start ); +void clear_latvec(field_offset v,int parity); + +void scalar_mult_latvec(field_offset src, radix scalar, + field_offset dest, int parity); +void scalar_mult_add_latvec(field_offset src1, field_offset src2, + radix scalar, field_offset dest, int parity); +void grsource(int parity); +void checkmul(); +int spectrum(); +void make_lattice(); +void phaseset(); +void rephase( int flag ); + +void prefetch_vector( su3_vector * ); +void prefetch_matrix( su3_matrix * ); +void V_sma_and_rdot( su3_vector * ttt_pt, su3_vector * cg_p_pt, radix x, + double * pkp_pt, int nsites, int stride ); +void V_sma2_and_mag( su3_vector * xxx_pt, su3_vector * cg_p_pt, + su3_vector * resid_pt, su3_vector * ttt_pt, radix a, + double * rsq_pt, int nsites, int stride ); +void V_sma_vec( su3_vector * src1, su3_vector * src2, + radix scalar, su3_vector * dest, int nsites, int stride ); +void V_mult_adj_su3_mat_vec_4dir( su3_matrix * lpt, + su3_vector * srcpt, su3_vector * destpt, int nsites, int stride ); +void V_mult_su3_mat_vec_sum_4dir( su3_matrix * lpt, + su3_vector ** xpt, su3_vector ** ypt, su3_vector ** zpt, su3_vector ** tpt, + su3_vector * destpt, int nsites, int stride ); + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_notused.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_notused.h new file mode 100644 index 0000000000000000000000000000000000000000..bc0be55afdbf1f1a2961b1ef914a1bdd6ed0691e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_notused.h @@ -0,0 +1,80 @@ +/************************ generic.h ************************************* +* * +* Macros and declarations for miscellaneous generic routines * +* This header is for codes that call generic routines * +* MIMD version 5 * +* * +*/ + +/* Other generic directory declarations are elsewhere: + + For com_*.c, see comdefs.h + For io_ansi.c, 
/* restrict_fourier.c */
/*
 * The second parameter used to be spelled "restrict", which is a keyword
 * from C99 onward: a C99 compiler reads "int *restrict" as a
 * restrict-qualified, unnamed pointer parameter rather than a parameter
 * called "restrict".  It is renamed here so the declaration means the same
 * thing under every language standard.  Parameter names in a prototype are
 * not part of the interface, so callers are unaffected.
 *
 * NOTE(review): the matching definition in restrict_fourier.c is not
 * visible here; if it also names the parameter "restrict" it needs the
 * same rename to build as C99.
 */
void setup_restrict_fourier( int *key, int *restrict_arg );
*/ + int isign); /* 1 for x -> k, -1 for k -> x */ + +/* gaugefix.c */ +void gaugefix(int gauge_dir,radix relax_boost,int max_gauge_iter, + radix gauge_fix_tol, field_offset diffmat, field_offset sumvec, + int nvector, field_offset vector_offset[], int vector_parity[], + int nantiherm, field_offset antiherm_offset[], + int antiherm_parity[] ); + +/* reunitarize.c */ +void reunitarize( void ); + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_wilson.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_wilson.h new file mode 100644 index 0000000000000000000000000000000000000000..cba3be6be66466750dc9cff959c3753caa4425cb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/generic_wilson.h @@ -0,0 +1,95 @@ +/************************ generic_wilson.h ****************************** +* * +* Macros and declarations for generic_wilson routines * +* This header is for codes that call generic_wilson routines * +* MIMD version 5 * +* * +*/ + +/* For various inversion routines. Not used sytematically yet. 
/*
 * wilson_invert: Wilson inverter entry point for a source that has
 * already been created; returns the number of iterations taken.
 *
 * sav is a field for saving the source and tmp is workspace used only for
 * bi-cg.  start_flag selects the initial guess; the values 0 and 1
 * coincide with START_ZERO_GUESS and START_NONZERO_GUESS declared in this
 * header.  The per-argument comments below are the authoritative
 * description of each parameter.
 */
int wilson_invert(      /* Return value is number of iterations taken */
    field_offset src,   /* type wilson_vector (source already created)*/
    field_offset dest,  /* type wilson_vector (answer and initial guess) */
    field_offset tmp,   /* type wilson_vector (workspace used only for bi-cg)*/
    field_offset sav,   /* type wilson_vector (for saving source) */
    int MinCG,          /* minimum number of iterations per restart */
    int MaxCG,          /* maximum number of iterations per restart */
    int nrestart,       /* maximum restarts */
    radix RsdCG,        /* desired residual - 
			   normalized as sqrt(r*r)/sqrt(src_e*src_e) */
    radix *size_r,      /* resulting residual */
    int start_flag,     /* 0: use a zero initial guess; 1: use dest */
    radix Kappa        /* hopping */
    );
minimum number of iterations */ + int MaxCG, /* maximum number of iterations */ + radix RsdCG, /* desired residual - + normalized as sqrt(r*r)/sqrt(src_e*src_e */ + radix *size_r, /* resulting residual */ + int flag, /* 0: use a zero initial guess; 1: use dest */ + radix Kappa /* hopping */ + ); +int bicgilu_w( /* Return value is number of iterations taken */ + field_offset src, /* type wilson_vector (source vector - OVERWRITTEN!)*/ + field_offset dest, /* type wilson_vector (answer and initial guess )*/ + int MinCG, /* minimum number of iterations */ + int MaxCG, /* maximum number of iterations */ + radix RsdCG, /* desired residual - + normalized as sqrt(r*r)/sqrt(src_e*src_e */ + radix *size_r, /* resulting residual */ + int flag, /* 0: use a zero initial guess; 1: use dest */ + radix Kappa, /* hopping */ + field_offset rv /* Scratch space of size wilson_vector */ + ); +int mrilu_w_or(field_offset src,field_offset dest,int MinMR,int MaxMR,radix RsdMR, + radix *size_r,int flag,radix Kappa); + +/* For quark source routines */ +/* The Weyl representation types are included for w_source_h */ +enum source_type { + POINT = 1, GAUSSIAN, CUTOFF_GAUSSIAN, + POINT_WEYL, CUTOFF_GAUSSIAN_WEYL } ; +void w_source(field_offset src,wilson_quark_source *wqs); +void w_source_h(field_offset src,wilson_quark_source *wqs); +radix *make_template(radix gamma, int cutoff); +void w_sink(field_offset snk,wilson_quark_source *wqs); +int ask_quark_source( int prompt, int *type, char *descrp ); + +void bj_to_weyl( wilson_vector *src, wilson_vector *dest); +void dslash(field_offset src,field_offset dest, + int isign,int parity); +void dslash_special(field_offset src,field_offset dest, + int isign,int parity,msg_tag **tag,int is_started); +void w_meson(field_offset src1,field_offset src2,complex *prop[10]); +void w_baryon(field_offset src1,field_offset src2,field_offset src3, + complex *prop[4]); +void w_baryon_hl(field_offset src1,field_offset src2, + field_offset src3, complex *prop[6]); 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/grow4wvecs.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/grow4wvecs.c new file mode 100644 index 0000000000000000000000000000000000000000..f2c8856e52deae251a7906364b35bae762a6ddbd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/grow4wvecs.c @@ -0,0 +1,178 @@ +/***************** grow4wvecs.c (in su3.a) **************************** +* * +* If sum=0, * +* Grow and add four wilson_vectors * +* If sum!=0, * +* Grow and sum four wilson_vectors to another wilson_vector * +* void grow_add_four_wvecs(a,b1,b2,b3,b4,sign,sum) * +* wilson_vector *a; half_wilson_vector *b1,*b2,*b3,*b4; * +* int sign,sum; * +* A <- B1 + B2 + B3 + B4 or * +* A <- A + B1 + B2 + B3 + B4 * +* B1 is expanded using gamma_x, B2 using gamma_y, etc. * +*/ +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. 
*/ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +/* grow and sum four wilson_vectors */ + +#ifndef FAST + +void grow_add_four_wvecs( wilson_vector *a, half_wilson_vector *b1, + half_wilson_vector *b2, half_wilson_vector *b3, + half_wilson_vector *b4, int sign, int sum ){ + if(sum==0)wp_grow( b1,a,XUP,sign); + else wp_grow_add( b1,a,XUP,sign); + wp_grow_add( b2,a,YUP,sign); + wp_grow_add( b3,a,ZUP,sign); + wp_grow_add( b4,a,TUP,sign); +} + +#else /* "FAST" code has wp_grow_add inlined */ +/* For the RS6000 */ + +/* a += i*b, a += -i*b */ +#define CSUM_TPI(a,b) { (a).real -= (b).imag; (a).imag += (b).real; } +#define CSUM_TMI(a,b) { (a).real += (b).imag; (a).imag -= (b).real; } + +void grow_add_four_wvecs( wilson_vector *a, half_wilson_vector *b1, + half_wilson_vector *b2, half_wilson_vector *b3, + half_wilson_vector *b4, int sign, int sum ){ + int i; + if(sum==0) + { + /* wp_grow( b1,a,XUP,sign); */ + + /* case XUP: */ + if(sign==PLUS) + { + for(i=0;i<3;i++){ + a->d[0].c[i] = b1->h[0].c[i]; + a->d[1].c[i] = b1->h[1].c[i]; + TIMESMINUSI( b1->h[0].c[i], a->d[3].c[i]); + TIMESMINUSI( b1->h[1].c[i], a->d[2].c[i]); + } + } + else + { + /* case XDOWN: */ + for(i=0;i<3;i++){ + a->d[0].c[i] = b1->h[0].c[i]; + a->d[1].c[i] = b1->h[1].c[i]; + TIMESPLUSI( b1->h[0].c[i], a->d[3].c[i]); + TIMESPLUSI( b1->h[1].c[i], a->d[2].c[i]); + } + } + } + else + { + /*wp_grow_add( b1,a,XUP,sign); */ + + /* case XUP: */ + if(sign==PLUS) + { + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b1->h[0].c[i]); + CSUM( a->d[1].c[i], b1->h[1].c[i]); + CSUM_TMI( a->d[2].c[i], b1->h[1].c[i] ); + CSUM_TMI( a->d[3].c[i], b1->h[0].c[i] ); + } + } + else + { + /* case XDOWN: */ + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b1->h[0].c[i]); + CSUM( a->d[1].c[i], b1->h[1].c[i]); + 
CSUM_TPI( a->d[2].c[i], b1->h[1].c[i] ); + CSUM_TPI( a->d[3].c[i], b1->h[0].c[i] ); + } + } + } + + /* wp_grow_add( b2,a,YUP,sign); */ + + if(sign==PLUS) + { + /* case YUP: */ + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b2->h[0].c[i]); + CSUM( a->d[1].c[i], b2->h[1].c[i]); + CSUM( a->d[2].c[i], b2->h[1].c[i]); + CSUB( a->d[3].c[i], b2->h[0].c[i], a->d[3].c[i] ); + } + } + else + { + /* case YDOWN: */ + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b2->h[0].c[i]); + CSUM( a->d[1].c[i], b2->h[1].c[i]); + CSUB( a->d[2].c[i], b2->h[1].c[i], a->d[2].c[i] ); + CSUM( a->d[3].c[i], b2->h[0].c[i]); + } + } + + /* wp_grow_add( b3,a,ZUP,sign); */ + + if(sign==PLUS) + { + /* case ZUP: */ + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b3->h[0].c[i]); + CSUM( a->d[1].c[i], b3->h[1].c[i]); + CSUM_TMI( a->d[2].c[i], b3->h[0].c[i] ); + CSUM_TPI( a->d[3].c[i], b3->h[1].c[i] ); + } + } + else + { + /* case ZDOWN:*/ + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b3->h[0].c[i]); + CSUM( a->d[1].c[i], b3->h[1].c[i]); + CSUM_TPI( a->d[2].c[i], b3->h[0].c[i] ); + CSUM_TMI( a->d[3].c[i], b3->h[1].c[i] ); + } + } + + /* wp_grow_add( b4,a,TUP,sign); */ + + if(sign==PLUS) + { + /* case TUP: */ + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b4->h[0].c[i]); + CSUM( a->d[1].c[i], b4->h[1].c[i]); + CSUM( a->d[2].c[i], b4->h[0].c[i]); + CSUM( a->d[3].c[i], b4->h[1].c[i]); + } + } + else + { + /* case TDOWN: */ + for(i=0;i<3;i++){ + CSUM( a->d[0].c[i], b4->h[0].c[i]); + CSUM( a->d[1].c[i], b4->h[1].c[i]); + CSUB( a->d[2].c[i], b4->h[0].c[i], a->d[2].c[i] ); + CSUB( a->d[3].c[i], b4->h[1].c[i], a->d[3].c[i] ); + } + } +} + +#endif /* "#ifndef FAST"*/ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/io_wb.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/io_wb.h new file mode 100644 index 0000000000000000000000000000000000000000..671874691d9bb42ac83de171e492111dc3a6ea72 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/io_wb.h @@ -0,0 +1,313 @@ +/************************ 
io_wb.h ************************************* +/* This header file defines the binary file format for the propagator file and + defines structures for file descriptors that include the file header information */ + +/* Define a 32-bit integer type. Machine-dependent: type32 is short when SHORT32 is defined, int otherwise. Add more as needed. */ +/* + Original by CD + 2/26/98 Changed type32 to signed CD + */ + +#ifndef _type32 +#define _type32 +#ifdef SHORT32 +typedef short type32; +#else +typedef int type32; +#endif +#endif + +#ifdef CONTROL /* EXTERN is empty when CONTROL is defined and expands to extern otherwise */ +#define EXTERN +#else +#define EXTERN extern +#endif + +/**********************************************************************/ +/* Binary lattice formats */ +/**********************************************************************/ +/* In version 5 we have two binary lattice file formats: + serial files Data written in coordinate natural order + checkpoint files Data written in node dump order + + Further descriptive information is kept in a separate ASCII header + file. See below. + + */ + + +/*--------------------------------------------------------------------*/ +/* version 5 binary file format */ + +#define W_PROP_VERSION_NUMBER 12781 +#define MAX_TIME_STAMP 64 +#define MAX_SOURCE_SPINS 4 + +/* Begin definition of header structures */ + +/* Note that an effort is made to make all floating point and integer + fields 32 bits long. However, byte ordering may vary across + platforms, and no effort is made in writing the file to produce + a standard byte order. The input routines attempt to compensate + for byte reversal automatically, by examining the magic number at + the beginning of the file */ + +/* 1. 
Header comes first */ + +typedef struct { + type32 magic_number; /* Identifies file format */ + char time_stamp[MAX_TIME_STAMP]; /* Date and time stamp - used to + check consistency between the + ASCII header file and the + lattice file */ + type32 dims[4]; /* Full lattice dimensions */ + type32 header_bytes; /* NOT WRITTEN TO THE FILE but + helpful for finding the data */ + + type32 order; /* 0 file is in natural order + no coordinate list is attached. + 1 file is in node-dump (checkpoint) + order. Coordinate list is attached. + 2 file is in node-dump (checkpoint) + order but one file per node. + Coordinate list is attached + before each file. */ + + type32 n_spins; /* Number of source spins in this file */ + type32 spins[MAX_SOURCE_SPINS]; /* List of source spin indices in file */ + +} w_prop_header; + +/* 2. Site list (ONLY for checkpoint files - i.e. node-dump order files) + + A listing of site coordinates for the data in this file + in the order of appearance. The number of coordinates must + be exactly nx*ny*nz*nt. The site coordinate is encoded + as nx*(ny*(nz*t + z) + y) + x in a 32-bit integer. + + */ + +/* 3. Next comes a spin - color index and checksum */ + +typedef struct { + type32 spin; + type32 color; + type32 sum29; + type32 sum31; +} w_prop_check; + +/* 4. 
Finally the Wilson vectors appear */ + +/**********************************************************************/ +/* Info file format */ + +/* List of admissible keywords for version 5 ASCII lattice info file */ + +#ifdef CONTROL /* the table is defined only where CONTROL is defined; all other includers get the extern declaration below */ +char *w_prop_info_keyword[] = { + "magic_number", + "time_stamp", + "nx", + "ny", + "nz", + "nt", + "gauge.filename", + "gauge.time_stamp", + "gauge.checksums", + "gauge.fix.description", + "gauge.fix.tolerance", + "gauge.fix.filename", + "gauge.fix.time_stamp", + "gauge.fix.checksums", + "quark.description", + "quark.kappa", + "quark.clover.clov_c", + "quark.clover.u0", + "quark.boundary_condition", + "source.description", + "source.size", + "source.x", + "source.y", + "source.z", + "source.t", + "source.n_spins", + "source.spins", + "" /* Last entry MUST be a zero-length keyword */ +}; +#else +extern char *w_prop_info_keyword[]; +#endif + +/* Used to create info file name */ + +#define ASCII_W_PROP_INFO_EXT ".info" + +/**********************************************************************/ +/* 1996 Binary file format follows */ +/* Kept for compatibility */ + +#define MAX_GAUGE_FIELD_DESCRIPT 200 +#define MAX_GAUGE_FIELD_PARAM 2 +#define MAX_DIRAC_DESCRIPT 200 +#define MAX_DIRAC_PARAM 3 +#define MAX_SOURCE_DESCRIPT 200 +#define MAX_SOURCE_PARAM 2 +#define IDENTITY_MAP -1 +#define NO_MAP -2 +#define W_PROP_VERSION_NUMBER_1996 48291 + +/* Begin definition of header structures */ + +/* Note that an effort is made to make all floating point and integer + fields 32 bits long. However, byte ordering may vary across + platforms, and no effort is made in writing the file to produce + a standard byte order. The input routines attempt to compensate + for byte reversal automatically, by examining the magic number at + the beginning of the file */ + +/* 1. 
Header comes first */ + +typedef struct { + type32 magic_number; /* Identifies file format */ + type32 dims[4]; /* Full lattice dimensions */ + type32 header_bytes; /* Number of bytes for data belonging to + this structure -- NOT necessarily + the length of this structure! */ + type32 order; /* 0 means no coordinate list is attached + and the values are in coordinate serial order + Nonzero means that a coordinate list is attached, + specifying the order of values */ + struct { /* Gauge field parameters */ + type32 n_descript; /* Number of bytes in character string */ + char descript[MAX_GAUGE_FIELD_DESCRIPT]; /* Describes gauge field */ + type32 n_param; /* Number of gauge field parameters */ + radix param[MAX_GAUGE_FIELD_PARAM]; /* GF parameters */ + } gauge_field; + struct { /* Dirac operator parameters */ + type32 n_descript; /* Number of bytes in character string */ + char descript[MAX_DIRAC_DESCRIPT]; /* Describes Dirac operator */ + type32 n_param; /* Number of Dirac operator parameters */ + radix param[MAX_DIRAC_PARAM]; /* Dirac parameters */ + } dirac; + struct { /* Source parameters */ + type32 n_descript; /* Number of bytes in character string */ + char descript[MAX_SOURCE_DESCRIPT]; /* Describes source */ + type32 n_param; /* Number of source parameters */ + struct { /* Source parameters */ + type32 i1; + radix c1; + } param; + type32 n_spins; /* Number of source spins in this file */ + type32 spins[MAX_SOURCE_SPINS]; /* List of source spin indices in file */ + } source; +} w_prop_header_1996 ; + +/* 2. Parallel files only: + + Next comes a listing of site coordinates for the data in this file + in the order of appearance. The number of coordinates must + be exactly nx*ny*nz*nt. The site coordinate is encoded + as nx*(ny*(nz*t + z) + y) + x in a 32-bit integer. + + Serial files only: + + The site order of propagator elements is required to be in subscript + order (x,y,z,t) with x varying most rapidly, followed by y, etc. + so this list is omitted. 
+ + */ + +/* Next, repeat Items 3 and 4 for each source spin and color */ + +/* 3. Next comes a check structure to introduce the propagator components + for a given source spin and color and node number. */ + +EXTERN struct { + type32 spin; + type32 color; + type32 checksum; +} w_prop_check_1996; + +/* 4. Finally, the propagator Wilson vectors appear */ + +/*----------------------------------------------------------------------*/ + +/* File data structure */ + +typedef struct { + FILE * fp; /* File pointer */ + w_prop_header* header; /* Pointer to header for file */ + char * filename; /* Pointer to file name string */ + int byterevflag; /* Byte reverse flag - used only for reading */ + type32 * rank2rcv; /* File site list - used only for + serial reading */ + int parallel; /* 0 if file was opened for serial reading + 1 if opened for parallel reading */ + w_prop_check check; /* Current checksum, spin, color indices */ +} w_prop_file; + +/**********************************************************************/ +/* Declarations for I/O routines in io_wb.c */ + +w_prop_file *r_ascii_w_i(char *filename); +int r_ascii_w(w_prop_file *wpf, int spin, int color, field_offset src); +void r_ascii_w_f(w_prop_file *wpf); + +w_prop_file *r_serial_w_i(char *filename); +int r_serial_w(w_prop_file *wpf, int spin, int color, field_offset src); +void r_serial_w_f(w_prop_file *wpf); + +w_prop_file *r_parallel_w_i(char *filename); +void r_parallel_w_o(w_prop_file *wpf); +int r_parallel_w(w_prop_file *wpf, int spin, int color, field_offset src); +void r_parallel_w_c(w_prop_file *wpf); +void r_parallel_w_f(w_prop_file *wpf); + +w_prop_file *w_ascii_w_i(char *filename); +void w_ascii_w(w_prop_file *wpf, int spin, int color, field_offset src); +void w_ascii_w_f(w_prop_file *wpf); + +w_prop_file *w_serial_w_i(char *filename); +void w_serial_w(w_prop_file *wpf, int spin, int color, field_offset src); +void w_serial_w_f(w_prop_file *wpf); + +w_prop_file *w_parallel_w_i(char *filename); +void 
w_parallel_w_o(w_prop_file *wpf); +void w_parallel_w(w_prop_file *wpf, int spin, int color, field_offset src); +void w_parallel_w_c(w_prop_file *wpf); +void w_parallel_w_f(w_prop_file *wpf); + +w_prop_file *w_checkpoint_w_i(char *filename); +void w_checkpoint_w_o(w_prop_file *wpf); +void w_checkpoint_w(w_prop_file *wpf, int spin, int color, field_offset src); +void w_checkpoint_w_c(w_prop_file *wpf); +void w_checkpoint_w_f(w_prop_file *wpf); + +int write_w_prop_info_item( FILE *fpout, /* ascii file pointer */ + char *keyword, /* keyword */ + char *fmt, /* output format - + must use s, d, f, or e */ + void *src, /* address of starting data */ + int count, /* number of data items if > 1 */ + int stride); /* byte stride of data if + count > 1 */ +/**********************************************************************/ +/* In clover_info.c or wilson_info.c (application dependent) */ +void write_appl_w_prop_info(FILE *fp); + +/**********************************************************************/ +/* Prototypes for io_helpers_w.c */ +w_prop_file *r_open_prop(int flag, char *filename); +w_prop_file *w_open_prop(int flag, char *filename); +int reload_propagator( int flag, w_prop_file *wpf, + int spin, int color, field_offset dest, int timing); +void save_propagator( int flag, w_prop_file *wpf, + int spin, int color, field_offset src, int timing); +int ask_starting_prop( int prompt, int *flag, char *filename ); +int ask_ending_prop( int prompt, int *flag, char *filename ); +void r_close_prop(int flag, w_prop_file *wpf); +void w_close_prop(int flag, w_prop_file *wpf); + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amat_hwvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amat_hwvec.c new file mode 100644 index 0000000000000000000000000000000000000000..b5cb8381c2902dbed7cce7f5e32e75efca7d6405 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amat_hwvec.c @@ -0,0 +1,97 @@ +/************** m_amat_hwvec.c (in 
su3.a) ********************** +* * +* void mult_adj_su3_mat_hwvec( su3_matrix *mat, * +* half_wilson_vector *src,*dest ) * +* multiply a Wilson half-vector by the adjoint of a matrix * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST + +void mult_adj_su3_mat_hwvec( su3_matrix *mat, + half_wilson_vector *src, half_wilson_vector *dest ){ + mult_adj_su3_mat_vec(mat, &(src->h[0]), &(dest->h[0]) ); + mult_adj_su3_mat_vec(mat, &(src->h[1]), &(dest->h[1]) ); +} + +#else /* Fast version */ + +void mult_adj_su3_mat_hwvec( su3_matrix *mat, + half_wilson_vector *src, half_wilson_vector *dest ){ + +#ifdef NATIVEDOUBLE + register double a0r,a0i,a1r,a1i,a2r,a2i; + register double b0r,b0i,b1r,b1i,b2r,b2i; +#else + register radix a0r,a0i,a1r,a1i,a2r,a2i; + register radix b0r,b0i,b1r,b1i,b2r,b2i; +#endif + +/* mult_adj_su3_mat_vec(mat, &(src->h[0]), &(dest->h[0]) ); */ + + a0r=mat->e[0][0].real; a0i=mat->e[0][0].imag; + b0r=src->h[0].c[0].real; b0i=src->h[0].c[0].imag; + a1r=mat->e[1][0].real; a1i=mat->e[1][0].imag; + b1r=src->h[0].c[1].real; b1i=src->h[0].c[1].imag; + a2r=mat->e[2][0].real; a2i=mat->e[2][0].imag; + b2r=src->h[0].c[2].real; b2i=src->h[0].c[2].imag; + + dest->h[0].c[0].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + dest->h[0].c[0].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + a0r=mat->e[0][1].real; a0i=mat->e[0][1].imag; + b0r=src->h[0].c[0].real; b0i=src->h[0].c[0].imag; + a1r=mat->e[1][1].real; a1i=mat->e[1][1].imag; + b1r=src->h[0].c[1].real; b1i=src->h[0].c[1].imag; + a2r=mat->e[2][1].real; a2i=mat->e[2][1].imag; + b2r=src->h[0].c[2].real; b2i=src->h[0].c[2].imag; + + dest->h[0].c[1].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + dest->h[0].c[1].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + a0r=mat->e[0][2].real; a0i=mat->e[0][2].imag; + b0r=src->h[0].c[0].real; b0i=src->h[0].c[0].imag; + a1r=mat->e[1][2].real; a1i=mat->e[1][2].imag; + b1r=src->h[0].c[1].real; 
b1i=src->h[0].c[1].imag; + a2r=mat->e[2][2].real; a2i=mat->e[2][2].imag; + b2r=src->h[0].c[2].real; b2i=src->h[0].c[2].imag; + + dest->h[0].c[2].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + dest->h[0].c[2].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + +/* mult_adj_su3_mat_vec(mat, &(src->h[1]), &(dest->h[1]) ); */ + + a0r=mat->e[0][0].real; a0i=mat->e[0][0].imag; + b0r=src->h[1].c[0].real; b0i=src->h[1].c[0].imag; + a1r=mat->e[1][0].real; a1i=mat->e[1][0].imag; + b1r=src->h[1].c[1].real; b1i=src->h[1].c[1].imag; + a2r=mat->e[2][0].real; a2i=mat->e[2][0].imag; + b2r=src->h[1].c[2].real; b2i=src->h[1].c[2].imag; + + dest->h[1].c[0].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + dest->h[1].c[0].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + a0r=mat->e[0][1].real; a0i=mat->e[0][1].imag; + b0r=src->h[1].c[0].real; b0i=src->h[1].c[0].imag; + a1r=mat->e[1][1].real; a1i=mat->e[1][1].imag; + b1r=src->h[1].c[1].real; b1i=src->h[1].c[1].imag; + a2r=mat->e[2][1].real; a2i=mat->e[2][1].imag; + b2r=src->h[1].c[2].real; b2i=src->h[1].c[2].imag; + + dest->h[1].c[1].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + dest->h[1].c[1].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + a0r=mat->e[0][2].real; a0i=mat->e[0][2].imag; + b0r=src->h[1].c[0].real; b0i=src->h[1].c[0].imag; + a1r=mat->e[1][2].real; a1i=mat->e[1][2].imag; + b1r=src->h[1].c[1].real; b1i=src->h[1].c[1].imag; + a2r=mat->e[2][2].real; a2i=mat->e[2][2].imag; + b2r=src->h[1].c[2].real; b2i=src->h[1].c[2].imag; + + dest->h[1].c[2].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + dest->h[1].c[2].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + +} +#endif /* "ifndef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amat_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amat_wvec.c new file mode 100644 index 
0000000000000000000000000000000000000000..5fa5165a31d37622a04cd692db15a5daf016dd00 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amat_wvec.c @@ -0,0 +1,14 @@ +/*************** m_amat_wvec.c (in su3.a) ********************** +* * +* void mult_adj_mat_wilson_vec( su3_matrix *mat, * +* wilson_vector *src,*dest) * +* multiply a Wilson vector by the adjoint of a matrix * +*/ +#include "complex.h" +#include "su3.h" + +void mult_adj_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, + wilson_vector *dest ){ + register int i; + for(i=0;i<4;i++)mult_adj_su3_mat_vec(mat, &(src->d[i]), &(dest->d[i]) ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec.c new file mode 100644 index 0000000000000000000000000000000000000000..b8d925b99e0864b3c94d51c56bc129488cb251a4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec.c @@ -0,0 +1,135 @@ +/***************** m_amatvec.c (in su3.a) ***************************** +* * +* void mult_adj_su3_mat_vec( su3_matrix *a, su3_vector *b,*c ) * +* C <- A_adjoint * B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +/* adjoint matrix times vector multiply */ +void mult_adj_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ){ +register int i,j; +register complex x,y,z; + for(i=0;i<3;i++){ + x.real=x.imag=0.0; + for(j=0;j<3;j++){ + CONJG( a->e[j][i], z ); + CMUL( z , b->c[j], y ) + CSUM( x , y ); + } + c->c[i] = x; + } +} + +#else +#ifdef NATIVEDOUBLE /* IBM RS6000 version */ +void mult_adj_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ){ + + register double c0r,c0i,c1r,c1i,c2r,c2i; + register double br,bi,a0,a1,a2; + + br=b->c[0].real; bi=b->c[0].imag; + a0=a->e[0][0].real; + a1=a->e[0][1].real; + a2=a->e[0][2].real; + + c0r = a0*br; + c1r = a1*br; + c2r = a2*br; + c0i = a0*bi; + c1i = a1*bi; + c2i = a2*bi; + + a0=a->e[0][0].imag; + a1=a->e[0][1].imag; + 
a2=a->e[0][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[1].real; bi=b->c[1].imag; + a0=a->e[1][0].real; + a1=a->e[1][1].real; + a2=a->e[1][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[1][0].imag; + a1=a->e[1][1].imag; + a2=a->e[1][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[2].real; bi=b->c[2].imag; + a0=a->e[2][0].real; + a1=a->e[2][1].real; + a2=a->e[2][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[2][0].imag; + a1=a->e[2][1].imag; + a2=a->e[2][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + +} +#else +void mult_adj_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ){ + int i; + register radix t,ar,ai,br,bi,cr,ci; + for(i=0;i<3;i++){ + + ar=a->e[0][i].real; ai=a->e[0][i].imag; + br=b->c[0].real; bi=b->c[0].imag; + cr=ar*br; t=ai*bi; cr += t; + ci=ar*bi; t=ai*br; ci -= t; + + ar=a->e[1][i].real; ai=a->e[1][i].imag; + br=b->c[1].real; bi=b->c[1].imag; + t=ar*br; cr += t; t=ai*bi; cr += t; + t=ar*bi; ci += t; t=ai*br; ci -= t; + + ar=a->e[2][i].real; ai=a->e[2][i].imag; + br=b->c[2].real; bi=b->c[2].imag; + t=ar*br; cr += t; t=ai*bi; cr += t; + t=ar*bi; ci += t; t=ai*br; ci -= t; + + c->c[i].real=cr; + c->c[i].imag=ci; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#ifndef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec_ns.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec_ns.c new file mode 100644 index 0000000000000000000000000000000000000000..d4f9d389ab91550b0275e2b5c9662cf7acecacda --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec_ns.c @@ -0,0 +1,114 @@ +/****************** m_amatvec_ns.c (in su3.a) ************************* +* * +* void mult_adj_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b,*c ) * +* adjoint matrix times vector multiply and subtract from another vector * +* C <- C - A_adjoint*B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_adj_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ){ +register int i,j; +register complex x,y,z; + for(i=0;i<3;i++){ + x.real=x.imag=0.0; + for(j=0;j<3;j++){ + CONJG( a->e[j][i], z ); + CMUL( z , b->c[j], y ) + CSUM( x , y ); + } + c->c[i].real -= x.real; + c->c[i].imag -= x.imag; + } +} + +#else +void mult_adj_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ){ + +#ifdef NATIVEDOUBLE + register double c0r,c0i,c1r,c1i,c2r,c2i; + register double br,bi,a0,a1,a2; +#else + register radix c0r,c0i,c1r,c1i,c2r,c2i; + register radix br,bi,a0,a1,a2; +#endif + + br=b->c[0].real; bi=b->c[0].imag; + a0=a->e[0][0].real; + a1=a->e[0][1].real; + a2=a->e[0][2].real; + + c0r = a0*br; + c1r = a1*br; + c2r = a2*br; + c0i = a0*bi; + c1i = a1*bi; + c2i = a2*bi; + + a0=a->e[0][0].imag; + a1=a->e[0][1].imag; + a2=a->e[0][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[1].real; bi=b->c[1].imag; + a0=a->e[1][0].real; + a1=a->e[1][1].real; + a2=a->e[1][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[1][0].imag; + a1=a->e[1][1].imag; + a2=a->e[1][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[2].real; bi=b->c[2].imag; + a0=a->e[2][0].real; + a1=a->e[2][1].real; + a2=a->e[2][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[2][0].imag; + a1=a->e[2][1].imag; + a2=a->e[2][2].imag; + + 
c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + c->c[0].real -= c0r; + c->c[0].imag -= c0i; + c->c[1].real -= c1r; + c->c[1].imag -= c1i; + c->c[2].real -= c2r; + c->c[2].imag -= c2i; + +} +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec_s.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec_s.c new file mode 100644 index 0000000000000000000000000000000000000000..0f090976cb5014364e6b608a161b527c29cc14a5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amatvec_s.c @@ -0,0 +1,114 @@ +/******************* m_amatvec_s.c (in su3.a) ************************* +* * +* void mult_adj_su3_mat_vec_sum( su3_matrix *a, su3_vector *b,*c ) * +* adjoint matrix times vector multiply and add to another vector * +* C <- C + A_adjoint*B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_adj_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ){ +register int i,j; +register complex x,y,z; + for(i=0;i<3;i++){ + x.real=x.imag=0.0; + for(j=0;j<3;j++){ + CONJG( a->e[j][i], z ); + CMUL( z , b->c[j], y ) + CSUM( x , y ); + } + c->c[i].real += x.real; + c->c[i].imag += x.imag; + } +} + +#else +void mult_adj_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ){ + +#ifdef NATIVEDOUBLE + register double c0r,c0i,c1r,c1i,c2r,c2i; + register double br,bi,a0,a1,a2; +#else + register radix c0r,c0i,c1r,c1i,c2r,c2i; + register radix br,bi,a0,a1,a2; +#endif + + br=b->c[0].real; bi=b->c[0].imag; + a0=a->e[0][0].real; + a1=a->e[0][1].real; + a2=a->e[0][2].real; + + c0r = a0*br; + c1r = a1*br; + c2r = a2*br; + c0i = a0*bi; + c1i = a1*bi; + c2i = a2*bi; + + a0=a->e[0][0].imag; + a1=a->e[0][1].imag; + a2=a->e[0][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[1].real; bi=b->c[1].imag; + a0=a->e[1][0].real; + a1=a->e[1][1].real; + a2=a->e[1][2].real; 
+ + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[1][0].imag; + a1=a->e[1][1].imag; + a2=a->e[1][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[2].real; bi=b->c[2].imag; + a0=a->e[2][0].real; + a1=a->e[2][1].real; + a2=a->e[2][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[2][0].imag; + a1=a->e[2][1].imag; + a2=a->e[2][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + c->c[0].real += c0r; + c->c[0].imag += c0i; + c->c[1].real += c1r; + c->c[1].imag += c1i; + c->c[2].real += c2r; + c->c[2].imag += c2i; +} + +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amv_4dir.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amv_4dir.c new file mode 100644 index 0000000000000000000000000000000000000000..816863dfbe5a87a684e2e46c3ad9b693765a5b34 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amv_4dir.c @@ -0,0 +1,117 @@ +/***************** m_amv_4dir.c (in su3.a) ***************************** +* * +* void mult_adj_su3_mat_vec_4dir( su3_matrix *mat, * +* su3_vector *src, su3_vector *dest ) * +* Multiply an su3_vector by an array of four adjoint su3_matrices, * +* result in an array of four su3_vectors. 
* +* dest[i] <- A_adjoint[i] * src * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_adj_su3_mat_vec_4dir( su3_matrix *mat, su3_vector *src, + su3_vector *dest ) { + mult_adj_su3_mat_vec( mat+0, src, dest+0 ); + mult_adj_su3_mat_vec( mat+1, src, dest+1 ); + mult_adj_su3_mat_vec( mat+2, src, dest+2 ); + mult_adj_su3_mat_vec( mat+3, src, dest+3 ); +} + +#else +/* Fast code, with subroutines inlined */ + +void mult_adj_su3_mat_vec_4dir( su3_matrix *mat, su3_vector *src, + su3_vector *dest ){ + register int n; +#ifdef NATIVEDOUBLE + register double c0r,c0i,c1r,c1i,c2r,c2i; + register double br,bi,a0,a1,a2; +#else + register radix c0r,c0i,c1r,c1i,c2r,c2i; + register radix br,bi,a0,a1,a2; +#endif + register su3_matrix *a; + register su3_vector *b,*c; + + a = mat; c = dest ; b = src; + for(n=0;n<4;n++,a++,c++){ + + br=b->c[0].real; bi=b->c[0].imag; + a0=a->e[0][0].real; + a1=a->e[0][1].real; + a2=a->e[0][2].real; + + c0r = a0*br; + c1r = a1*br; + c2r = a2*br; + c0i = a0*bi; + c1i = a1*bi; + c2i = a2*bi; + + a0=a->e[0][0].imag; + a1=a->e[0][1].imag; + a2=a->e[0][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[1].real; bi=b->c[1].imag; + a0=a->e[1][0].real; + a1=a->e[1][1].real; + a2=a->e[1][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[1][0].imag; + a1=a->e[1][1].imag; + a2=a->e[1][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[2].real; bi=b->c[2].imag; + a0=a->e[2][0].real; + a1=a->e[2][1].real; + a2=a->e[2][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[2][0].imag; + a1=a->e[2][1].imag; + a2=a->e[2][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = 
c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + } +} +#endif /* End of "#ifndef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amv_4dir_2.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amv_4dir_2.c new file mode 100644 index 0000000000000000000000000000000000000000..d2772bbd69740a282dc4c74816e354669988f445 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_amv_4dir_2.c @@ -0,0 +1,19 @@ +/***************** m_amv_4dir_2.c (in su3.a) ***************************** +* * +* void mult_adj_su3_mat_vec_4dir_2( su3_matrix *mat, * +* su3_vector *src, su3_vector *dest ) * +* Multiply an su3_vector by an array of four adjoint su3_matrices, * +* result in an array of four su3_vectors. * +* dest[i] <- A_adjoint[i] * src * +*/ +#include "complex.h" +#include "su3.h" + +void mult_adj_su3_mat_vec_4dir_2( su3_matrix *mat, su3_vector *src, + su3_vector *xdest, su3_vector *ydest, su3_vector *zdest, + su3_vector *tdest ) { + mult_adj_su3_mat_vec( mat+0, src, xdest ); + mult_adj_su3_mat_vec( mat+1, src, ydest ); + mult_adj_su3_mat_vec( mat+2, src, zdest ); + mult_adj_su3_mat_vec( mat+3, src, tdest ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_an.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_an.c new file mode 100644 index 0000000000000000000000000000000000000000..a0e387382c85a49cd87750d78c45c2b1cc511ea9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_an.c @@ -0,0 +1,73 @@ +/****************** m_mat_an.c (in su3.a) ***************************** +* * +* void mult_su3_an( su3_matrix *a,*b,*c ) * +* matrix multiply, first matrix is adjoint * +* C <- A_adjoint*B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_su3_an( su3_matrix *a, su3_matrix *b, su3_matrix *c ){ +register int i,j,k; +register complex x,y; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + x.real=x.imag=0.0; + for(k=0;k<3;k++){ + CMULJ_( 
a->e[k][i] , b->e[k][j], y ); + CSUM( x , y ); + } + c->e[i][j] = x; + } +} + +/* "Hand coded" routines, clearer coding is up above */ +#else + +void mult_su3_an( su3_matrix *a, su3_matrix *b, su3_matrix *c ){ + int j; + +#ifdef NATIVEDOUBLE + register double a0r,a0i,a1r,a1i,a2r,a2i; + register double b0r,b0i,b1r,b1i,b2r,b2i; +#else + register radix a0r,a0i,a1r,a1i,a2r,a2i; + register radix b0r,b0i,b1r,b1i,b2r,b2i; +#endif + + for(j=0;j<3;j++){ + + a0r=a->e[0][0].real; a0i=a->e[0][0].imag; + b0r=b->e[0][j].real; b0i=b->e[0][j].imag; + a1r=a->e[1][0].real; a1i=a->e[1][0].imag; + b1r=b->e[1][j].real; b1i=b->e[1][j].imag; + a2r=a->e[2][0].real; a2i=a->e[2][0].imag; + b2r=b->e[2][j].real; b2i=b->e[2][j].imag; + + c->e[0][j].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + c->e[0][j].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + a0r=a->e[0][1].real; a0i=a->e[0][1].imag; + b0r=b->e[0][j].real; b0i=b->e[0][j].imag; + a1r=a->e[1][1].real; a1i=a->e[1][1].imag; + b1r=b->e[1][j].real; b1i=b->e[1][j].imag; + a2r=a->e[2][1].real; a2i=a->e[2][1].imag; + b2r=b->e[2][j].real; b2i=b->e[2][j].imag; + + c->e[1][j].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + c->e[1][j].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + a0r=a->e[0][2].real; a0i=a->e[0][2].imag; + b0r=b->e[0][j].real; b0i=b->e[0][j].imag; + a1r=a->e[1][2].real; a1i=a->e[1][2].imag; + b1r=b->e[1][j].real; b1i=b->e[1][j].imag; + a2r=a->e[2][2].real; a2i=a->e[2][2].imag; + b2r=b->e[2][j].real; b2i=b->e[2][j].imag; + + c->e[2][j].real = a0r*b0r + a0i*b0i + a1r*b1r + a1i*b1i + a2r*b2r + a2i*b2i; + c->e[2][j].imag = a0r*b0i - a0i*b0r + a1r*b1i - a1i*b1r + a2r*b2i - a2i*b2r; + + } +} + +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_hwvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_hwvec.c new file mode 100644 index 
0000000000000000000000000000000000000000..ed60ea4919aa8d634cd60ae5c39fc5c1818bcc5b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_hwvec.c @@ -0,0 +1,99 @@ +/************** m_mat_hwvec.c (in su3.a) *********************** +* * +* void mult_su3_mat_hwvec(su3_matrix *mat, * +* half_wilson_vector *src,*dest) * +* multiply a Wilson half-vector by a matrix * +* dest <- mat*src * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST + +void mult_su3_mat_hwvec( su3_matrix *mat, half_wilson_vector *src, + half_wilson_vector *dest ){ + mult_su3_mat_vec(mat, &(src->h[0]), &(dest->h[0]) ); + mult_su3_mat_vec(mat, &(src->h[1]), &(dest->h[1]) ); +} + +#else /* Fast version */ + + +void mult_su3_mat_hwvec( su3_matrix *mat, half_wilson_vector *src, + half_wilson_vector *dest ){ + +#ifdef NATIVEDOUBLE + register double a0r,a0i,a1r,a1i,a2r,a2i; + register double b0r,b0i,b1r,b1i,b2r,b2i; +#else + register radix a0r,a0i,a1r,a1i,a2r,a2i; + register radix b0r,b0i,b1r,b1i,b2r,b2i; +#endif + +/* mult_su3_mat_vec(mat, &(src->h[0]), &(dest->h[0]) ); */ + + a0r=mat->e[0][0].real; a0i=mat->e[0][0].imag; + b0r=src->h[0].c[0].real; b0i=src->h[0].c[0].imag; + a1r=mat->e[0][1].real; a1i=mat->e[0][1].imag; + b1r=src->h[0].c[1].real; b1i=src->h[0].c[1].imag; + a2r=mat->e[0][2].real; a2i=mat->e[0][2].imag; + b2r=src->h[0].c[2].real; b2i=src->h[0].c[2].imag; + + dest->h[0].c[0].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + dest->h[0].c[0].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=mat->e[1][0].real; a0i=mat->e[1][0].imag; + b0r=src->h[0].c[0].real; b0i=src->h[0].c[0].imag; + a1r=mat->e[1][1].real; a1i=mat->e[1][1].imag; + b1r=src->h[0].c[1].real; b1i=src->h[0].c[1].imag; + a2r=mat->e[1][2].real; a2i=mat->e[1][2].imag; + b2r=src->h[0].c[2].real; b2i=src->h[0].c[2].imag; + + dest->h[0].c[1].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + dest->h[0].c[1].imag = a0r*b0i + a0i*b0r + a1r*b1i + 
a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=mat->e[2][0].real; a0i=mat->e[2][0].imag; + b0r=src->h[0].c[0].real; b0i=src->h[0].c[0].imag; + a1r=mat->e[2][1].real; a1i=mat->e[2][1].imag; + b1r=src->h[0].c[1].real; b1i=src->h[0].c[1].imag; + a2r=mat->e[2][2].real; a2i=mat->e[2][2].imag; + b2r=src->h[0].c[2].real; b2i=src->h[0].c[2].imag; + + dest->h[0].c[2].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + dest->h[0].c[2].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + +/* mult_su3_mat_vec(mat, &(src->h[1]), &(dest->h[1]) ); */ + + a0r=mat->e[0][0].real; a0i=mat->e[0][0].imag; + b0r=src->h[1].c[0].real; b0i=src->h[1].c[0].imag; + a1r=mat->e[0][1].real; a1i=mat->e[0][1].imag; + b1r=src->h[1].c[1].real; b1i=src->h[1].c[1].imag; + a2r=mat->e[0][2].real; a2i=mat->e[0][2].imag; + b2r=src->h[1].c[2].real; b2i=src->h[1].c[2].imag; + + dest->h[1].c[0].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + dest->h[1].c[0].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=mat->e[1][0].real; a0i=mat->e[1][0].imag; + b0r=src->h[1].c[0].real; b0i=src->h[1].c[0].imag; + a1r=mat->e[1][1].real; a1i=mat->e[1][1].imag; + b1r=src->h[1].c[1].real; b1i=src->h[1].c[1].imag; + a2r=mat->e[1][2].real; a2i=mat->e[1][2].imag; + b2r=src->h[1].c[2].real; b2i=src->h[1].c[2].imag; + + dest->h[1].c[1].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + dest->h[1].c[1].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=mat->e[2][0].real; a0i=mat->e[2][0].imag; + b0r=src->h[1].c[0].real; b0i=src->h[1].c[0].imag; + a1r=mat->e[2][1].real; a1i=mat->e[2][1].imag; + b1r=src->h[1].c[1].real; b1i=src->h[1].c[1].imag; + a2r=mat->e[2][2].real; a2i=mat->e[2][2].imag; + b2r=src->h[1].c[2].real; b2i=src->h[1].c[2].imag; + + dest->h[1].c[2].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + dest->h[1].c[2].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + +} + +#endif /* 
"ifndef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_na.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_na.c new file mode 100644 index 0000000000000000000000000000000000000000..5d35d08f49bea8f1dde9f0794fff6fec029fde10 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_na.c @@ -0,0 +1,52 @@ +/**************** m_mat_na.c (in su3.a) ******************************* +* * +* void mult_su3_na( su3_matrix *a,*b,*c ) * +* matrix multiply, second matrix is adjoint * +* C <- A*B_adjoint * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_su3_na( su3_matrix *a, su3_matrix *b, su3_matrix *c ){ +register int i,j,k; +register complex x,y; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + x.real=x.imag=0.0; + for(k=0;k<3;k++){ + CMUL_J( a->e[i][k] , b->e[j][k] , y ); + CSUM( x , y ); + } + c->e[i][j] = x; + } +} + +/* "Hand coded" routines, clearer coding is up above */ +#else + +void mult_su3_na( su3_matrix *a, su3_matrix *b, su3_matrix *c ){ +int i,j,k; +register radix t,ar,ai,br,bi,cr,ci; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + + ar=a->e[i][0].real; ai=a->e[i][0].imag; + br=b->e[j][0].real; bi=b->e[j][0].imag; + cr=ar*br; t=ai*bi; cr += t; + ci=ai*br; t=ar*bi; ci -= t; + + ar=a->e[i][1].real; ai=a->e[i][1].imag; + br=b->e[j][1].real; bi=b->e[j][1].imag; + t=ar*br; cr += t; t=ai*bi; cr += t; + t=ar*bi; ci -= t; t=ai*br; ci += t; + + ar=a->e[i][2].real; ai=a->e[i][2].imag; + br=b->e[j][2].real; bi=b->e[j][2].imag; + t=ar*br; cr += t; t=ai*bi; cr += t; + t=ar*bi; ci -= t; t=ai*br; ci += t; + + c->e[i][j].real=cr; + c->e[i][j].imag=ci; + } +} + +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_nn.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_nn.c new file mode 100644 index 0000000000000000000000000000000000000000..61313f151825ccca8fda666473d9bb15f1cac605 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_nn.c @@ -0,0 +1,94 @@ +/******************* m_mat_nn.c (in su3.a) **************************** +* * +* void mult_su3_nn( su3_matrix *a,*b,*c ) * +* matrix multiply, no adjoints * +* C <- A*B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c ){ +register int i,j,k; +register complex x,y; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + x.real=x.imag=0.0; + for(k=0;k<3;k++){ + CMUL( a->e[i][k] , b->e[k][j] , y ); + CSUM( x , y ); + } + c->e[i][j] = x; + } +} + +/* "Hand coded" routines, clearer coding is up above */ +#else +#ifdef NATIVEDOUBLE /* RS6000 version */ + +void mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c ){ + int j; + register double a0r,a0i,a1r,a1i,a2r,a2i; + register double b0r,b0i,b1r,b1i,b2r,b2i; + + for(j=0;j<3;j++){ + + a0r=a->e[0][0].real; a0i=a->e[0][0].imag; + b0r=b->e[0][j].real; b0i=b->e[0][j].imag; + a1r=a->e[0][1].real; a1i=a->e[0][1].imag; + b1r=b->e[1][j].real; b1i=b->e[1][j].imag; + a2r=a->e[0][2].real; a2i=a->e[0][2].imag; + b2r=b->e[2][j].real; b2i=b->e[2][j].imag; + + c->e[0][j].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + c->e[0][j].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=a->e[1][0].real; a0i=a->e[1][0].imag; + b0r=b->e[0][j].real; b0i=b->e[0][j].imag; + a1r=a->e[1][1].real; a1i=a->e[1][1].imag; + b1r=b->e[1][j].real; b1i=b->e[1][j].imag; + a2r=a->e[1][2].real; a2i=a->e[1][2].imag; + b2r=b->e[2][j].real; b2i=b->e[2][j].imag; + + c->e[1][j].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + c->e[1][j].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=a->e[2][0].real; a0i=a->e[2][0].imag; + b0r=b->e[0][j].real; b0i=b->e[0][j].imag; + a1r=a->e[2][1].real; a1i=a->e[2][1].imag; + b1r=b->e[1][j].real; b1i=b->e[1][j].imag; + a2r=a->e[2][2].real; a2i=a->e[2][2].imag; + b2r=b->e[2][j].real; b2i=b->e[2][j].imag; + 
+ c->e[2][j].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + c->e[2][j].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + } +} +#else + +void mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c ){ + int i,j,k; + register radix t,ar,ai,br,bi,cr,ci; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + + ar=a->e[i][0].real; ai=a->e[i][0].imag; + br=b->e[0][j].real; bi=b->e[0][j].imag; + cr=ar*br; t=ai*bi; cr -= t; + ci=ar*bi; t=ai*br; ci += t; + + ar=a->e[i][1].real; ai=a->e[i][1].imag; + br=b->e[1][j].real; bi=b->e[1][j].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + ar=a->e[i][2].real; ai=a->e[i][2].imag; + br=b->e[2][j].real; bi=b->e[2][j].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + c->e[i][j].real=cr; + c->e[i][j].imag=ci; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..fbdaa7be78a630f7acf40ac0c2e7ff31ba3de20b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mat_wvec.c @@ -0,0 +1,14 @@ +/****************** m_mat_wvec.c (in su3.a) ******************** +* * +*void mult_mat_wilson_vec(su3_matrix *mat, wilson_vector *src,*dest) * +* multiply a Wilson vector by a matrix * +* dest <- mat*src * +*/ +#include "complex.h" +#include "su3.h" + +void mult_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, + wilson_vector *dest ){ + register int i; + for(i=0;i<4;i++)mult_su3_mat_vec(mat, &(src->d[i]), &(dest->d[i]) ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec.c new file mode 100644 index 0000000000000000000000000000000000000000..aaff46ae5ac4496ab53f731676695e8f52ea9cf8 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec.c @@ -0,0 +1,88 @@ +/**************** m_matvec.c (in su3.a) ******************************* +* * +* void mult_su3_mat_vec( su3_matrix *a, su3_vector *b,*c ) * +* matrix times vector multiply, no adjoints * +* C <- A*B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ){ +register int i,j; +register complex x,y; + for(i=0;i<3;i++){ + x.real=x.imag=0.0; + for(j=0;j<3;j++){ + CMUL( a->e[i][j] , b->c[j] , y ) + CSUM( x , y ); + } + c->c[i] = x; + } +} +#else +#ifdef NATIVEDOUBLE /* RS6000 version */ +void mult_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ){ + + register double a0r,a0i,a1r,a1i,a2r,a2i; + register double b0r,b0i,b1r,b1i,b2r,b2i; + + a0r=a->e[0][0].real; a0i=a->e[0][0].imag; + b0r=b->c[0].real; b0i=b->c[0].imag; + a1r=a->e[0][1].real; a1i=a->e[0][1].imag; + b1r=b->c[1].real; b1i=b->c[1].imag; + a2r=a->e[0][2].real; a2i=a->e[0][2].imag; + b2r=b->c[2].real; b2i=b->c[2].imag; + + c->c[0].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + c->c[0].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=a->e[1][0].real; a0i=a->e[1][0].imag; + b0r=b->c[0].real; b0i=b->c[0].imag; + a1r=a->e[1][1].real; a1i=a->e[1][1].imag; + b1r=b->c[1].real; b1i=b->c[1].imag; + a2r=a->e[1][2].real; a2i=a->e[1][2].imag; + b2r=b->c[2].real; b2i=b->c[2].imag; + + c->c[1].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + c->c[1].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i + a2i*b2r; + + a0r=a->e[2][0].real; a0i=a->e[2][0].imag; + b0r=b->c[0].real; b0i=b->c[0].imag; + a1r=a->e[2][1].real; a1i=a->e[2][1].imag; + b1r=b->c[1].real; b1i=b->c[1].imag; + a2r=a->e[2][2].real; a2i=a->e[2][2].imag; + b2r=b->c[2].real; b2i=b->c[2].imag; + + c->c[2].real = a0r*b0r - a0i*b0i + a1r*b1r - a1i*b1i + a2r*b2r - a2i*b2i; + c->c[2].imag = a0r*b0i + a0i*b0r + a1r*b1i + a1i*b1r + a2r*b2i 
+ a2i*b2r; + +} + +#else +void mult_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ){ +int i,j,k; +register radix t,ar,ai,br,bi,cr,ci; + for(i=0;i<3;i++){ + + ar=a->e[i][0].real; ai=a->e[i][0].imag; + br=b->c[0].real; bi=b->c[0].imag; + cr=ar*br; t=ai*bi; cr -= t; + ci=ar*bi; t=ai*br; ci += t; + + ar=a->e[i][1].real; ai=a->e[i][1].imag; + br=b->c[1].real; bi=b->c[1].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + ar=a->e[i][2].real; ai=a->e[i][2].imag; + br=b->c[2].real; bi=b->c[2].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + c->c[i].real=cr; + c->c[i].imag=ci; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#infdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec_ns.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec_ns.c new file mode 100644 index 0000000000000000000000000000000000000000..9f32f8b5c5c53265e1bf1437745b262db9ef506f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec_ns.c @@ -0,0 +1,146 @@ +/***************** m_matvec_ns.c (in su3.a) *************************** +* * +* void mult_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b,*c ) * +* su3_matrix times su3_vector multiply and subtract from another * +* su3_vector * +* C <- C - A*B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +/* su3_matrix times su3_vector multiply and subtract from another su3_vector */ +/* c <- A*b-c */ +void mult_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ){ +register int i,j; +register complex x,y; + for(i=0;i<3;i++){ + x.real=x.imag=0.0; + for(j=0;j<3;j++){ + CMUL( a->e[i][j] , b->c[j] , y ) + CSUM( x , y ); + } + c->c[i].real -= x.real; + c->c[i].imag -= x.imag; + } +} + +#else +#ifdef NATIVEDOUBLE +void mult_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ){ + + register double c0r,c0i,c1r,c1i,c2r,c2i; + register double br,bi,a0,a1,a2; + + c0r 
= c->c[0].real; + c0i = c->c[0].imag; + c1r = c->c[1].real; + c1i = c->c[1].imag; + c2r = c->c[2].real; + c2i = c->c[2].imag; + + br=b->c[0].real; bi=b->c[0].imag; + a0=a->e[0][0].real; + a1=a->e[1][0].real; + a2=a->e[2][0].real; + + c0r -= a0*br; + c1r -= a1*br; + c2r -= a2*br; + c0i -= a0*bi; + c1i -= a1*bi; + c2i -= a2*bi; + + a0=a->e[0][0].imag; + a1=a->e[1][0].imag; + a2=a->e[2][0].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[1].real; bi=b->c[1].imag; + a0=a->e[0][1].real; + a1=a->e[1][1].real; + a2=a->e[2][1].real; + + c0r -= a0*br; + c1r -= a1*br; + c2r -= a2*br; + c0i -= a0*bi; + c1i -= a1*bi; + c2i -= a2*bi; + + a0=a->e[0][1].imag; + a1=a->e[1][1].imag; + a2=a->e[2][1].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + br=b->c[2].real; bi=b->c[2].imag; + a0=a->e[0][2].real; + a1=a->e[1][2].real; + a2=a->e[2][2].real; + + c0r -= a0*br; + c1r -= a1*br; + c2r -= a2*br; + c0i -= a0*bi; + c1i -= a1*bi; + c2i -= a2*bi; + + a0=a->e[0][2].imag; + a1=a->e[1][2].imag; + a2=a->e[2][2].imag; + + c0r += a0*bi; + c1r += a1*bi; + c2r += a2*bi; + c0i -= a0*br; + c1i -= a1*br; + c2i -= a2*br; + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + +} + +#else +void mult_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ){ +int i,j,k; +register radix t,ar,ai,br,bi,cr,ci; + for(i=0;i<3;i++){ + + ar=a->e[i][0].real; ai=a->e[i][0].imag; + br=b->c[0].real; bi=b->c[0].imag; + cr=ar*br; t=ai*bi; cr -= t; + ci=ar*bi; t=ai*br; ci += t; + + ar=a->e[i][1].real; ai=a->e[i][1].imag; + br=b->c[1].real; bi=b->c[1].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + ar=a->e[i][2].real; ai=a->e[i][2].imag; + br=b->c[2].real; bi=b->c[2].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + c->c[i].real -= cr; 
+ c->c[i].imag -= ci; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec_s.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec_s.c new file mode 100644 index 0000000000000000000000000000000000000000..b1bcab0d94d5bd6618470882deb9fd028e7e73cc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_matvec_s.c @@ -0,0 +1,144 @@ +/**************** m_matvec_s.c (in su3.a) ***************************** +* * +* void mult_su3_mat_vec_sum( su3_matrix *a, su3_vector *b,*c ) * +* su3_matrix times su3_vector multiply and add to another su3_vector * +* C <- C + A*B * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +/* su3_matrix times su3_vector multiply and add to another su3_vector */ +/* c <- A*b+c */ +void mult_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ){ +register int i,j; +register complex x,y; + for(i=0;i<3;i++){ + x.real=x.imag=0.0; + for(j=0;j<3;j++){ + CMUL( a->e[i][j] , b->c[j] , y ) + CSUM( x , y ); + } + c->c[i].real += x.real; + c->c[i].imag += x.imag; + } +} + +#else +#ifdef NATIVEDOUBLE /* RS6000 version */ +void mult_su3_mat_vec_sum(a,b,c) su3_matrix *a; su3_vector *b,*c; { + + register double c0r,c0i,c1r,c1i,c2r,c2i; + register double br,bi,a0,a1,a2; + + c0r = c->c[0].real; + c0i = c->c[0].imag; + c1r = c->c[1].real; + c1i = c->c[1].imag; + c2r = c->c[2].real; + c2i = c->c[2].imag; + + br=b->c[0].real; bi=b->c[0].imag; + a0=a->e[0][0].real; + a1=a->e[1][0].real; + a2=a->e[2][0].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[0][0].imag; + a1=a->e[1][0].imag; + a2=a->e[2][0].imag; + + c0r -= a0*bi; + c1r -= a1*bi; + c2r -= a2*bi; + c0i += a0*br; + c1i += a1*br; + c2i += a2*br; + + br=b->c[1].real; bi=b->c[1].imag; + a0=a->e[0][1].real; + a1=a->e[1][1].real; + a2=a->e[2][1].real; + + c0r += a0*br; + c1r += a1*br; + c2r += 
a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[0][1].imag; + a1=a->e[1][1].imag; + a2=a->e[2][1].imag; + + c0r -= a0*bi; + c1r -= a1*bi; + c2r -= a2*bi; + c0i += a0*br; + c1i += a1*br; + c2i += a2*br; + + br=b->c[2].real; bi=b->c[2].imag; + a0=a->e[0][2].real; + a1=a->e[1][2].real; + a2=a->e[2][2].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=a->e[0][2].imag; + a1=a->e[1][2].imag; + a2=a->e[2][2].imag; + + c0r -= a0*bi; + c1r -= a1*bi; + c2r -= a2*bi; + c0i += a0*br; + c1i += a1*br; + c2i += a2*br; + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + +} +#else +void mult_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ){ +int i,j,k; +register radix t,ar,ai,br,bi,cr,ci; + for(i=0;i<3;i++){ + + ar=a->e[i][0].real; ai=a->e[i][0].imag; + br=b->c[0].real; bi=b->c[0].imag; + cr=ar*br; t=ai*bi; cr -= t; + ci=ar*bi; t=ai*br; ci += t; + + ar=a->e[i][1].real; ai=a->e[i][1].imag; + br=b->c[1].real; bi=b->c[1].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + ar=a->e[i][2].real; ai=a->e[i][2].imag; + br=b->c[2].real; bi=b->c[2].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + c->c[i].real += cr; + c->c[i].imag += ci; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mv_s_4dir.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mv_s_4dir.c new file mode 100644 index 0000000000000000000000000000000000000000..d8e928f382cc1b4985a0105837cf13c32dbf949f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/m_mv_s_4dir.c @@ -0,0 +1,168 @@ +/**************** m_mv_s_4dir.c (in su3.a) ***************************** +* * +* void mult_su3_mat_vec_sum_4dir( su3_matrix *a, su3_vector *b[0123],*c )* +* Multiply the 
elements of an array of four su3_matrices by the * +* four su3_vectors, and add the results to * +* produce a single su3_vector. * +* C <- A[0]*B[0]+A[1]*B[1]+A[2]*B[2]+A[3]*B[3] * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +void mult_su3_mat_vec_sum_4dir( su3_matrix *a, su3_vector *b0, + su3_vector *b1, su3_vector *b2, su3_vector *b3, su3_vector *c ){ + mult_su3_mat_vec( a+0,b0,c ); + mult_su3_mat_vec_sum( a+1,b1,c ); + mult_su3_mat_vec_sum( a+2,b2,c ); + mult_su3_mat_vec_sum( a+3,b3,c ); +} + +#else +/* Fast code, with subroutines inlined */ +#ifdef NATIVEDOUBLE /* IBM RS6000 version */ +void mult_su3_mat_vec_sum_4dir( su3_matrix *a, su3_vector *b0, + su3_vector *b1, su3_vector *b2, su3_vector *b3, su3_vector *c ){ + + register int n; + register double c0r,c0i,c1r,c1i,c2r,c2i; + register double br,bi,a0,a1,a2; + register su3_matrix *mat; + register su3_vector *b; + + c0r = c0i = c1r = c1i = c2r = c2i = 0.0; + mat = a; + + for(n=0;n<4;n++,mat++){ + + switch(n){ + case(0): b=b0; break; + case(1): b=b1; break; + case(2): b=b2; break; + case(3): b=b3; break; + } + + br=b->c[0].real; bi=b->c[0].imag; + a0=mat->e[0][0].real; + a1=mat->e[1][0].real; + a2=mat->e[2][0].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=mat->e[0][0].imag; + a1=mat->e[1][0].imag; + a2=mat->e[2][0].imag; + + c0r -= a0*bi; + c1r -= a1*bi; + c2r -= a2*bi; + c0i += a0*br; + c1i += a1*br; + c2i += a2*br; + + br=b->c[1].real; bi=b->c[1].imag; + a0=mat->e[0][1].real; + a1=mat->e[1][1].real; + a2=mat->e[2][1].real; + + c0r += a0*br; + c1r += a1*br; + c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=mat->e[0][1].imag; + a1=mat->e[1][1].imag; + a2=mat->e[2][1].imag; + + c0r -= a0*bi; + c1r -= a1*bi; + c2r -= a2*bi; + c0i += a0*br; + c1i += a1*br; + c2i += a2*br; + + br=b->c[2].real; bi=b->c[2].imag; + a0=mat->e[0][2].real; + a1=mat->e[1][2].real; + a2=mat->e[2][2].real; + + c0r += a0*br; + c1r += a1*br; + 
c2r += a2*br; + c0i += a0*bi; + c1i += a1*bi; + c2i += a2*bi; + + a0=mat->e[0][2].imag; + a1=mat->e[1][2].imag; + a2=mat->e[2][2].imag; + + c0r -= a0*bi; + c1r -= a1*bi; + c2r -= a2*bi; + c0i += a0*br; + c1i += a1*br; + c2i += a2*br; + + } + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + +} + +#else +void mult_su3_mat_vec_sum_4dir( su3_matrix *a, su3_vector *b0, + su3_vector *b1, su3_vector *b2, su3_vector *b3, su3_vector *c ){ + int i,n; + register su3_matrix *at; + register su3_vector *b; + register radix t,ar,ai,br,bi,cr,ci; + + for(i=0;i<3;i++){ + c->c[i].real = 0.0; + c->c[i].imag = 0.0; + } + for(n=0;n<4;n++){ + at = a+n; + switch(n){ + case(0): b=b0; break; + case(1): b=b1; break; + case(2): b=b2; break; + case(3): b=b3; break; + } + for(i=0;i<3;i++){ + + ar=at->e[i][0].real; ai=at->e[i][0].imag; + br=b->c[0].real; bi=b->c[0].imag; + cr=ar*br; t=ai*bi; cr -= t; + ci=ar*bi; t=ai*br; ci += t; + + ar=at->e[i][1].real; ai=at->e[i][1].imag; + br=b->c[1].real; bi=b->c[1].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + ar=at->e[i][2].real; ai=at->e[i][2].imag; + br=b->c[2].real; bi=b->c[2].imag; + t=ar*br; cr += t; t=ai*bi; cr -= t; + t=ar*bi; ci += t; t=ai*br; ci += t; + + c->c[i].real += cr; + c->c[i].imag += ci; + } + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/macros.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/macros.h new file mode 100644 index 0000000000000000000000000000000000000000..47b314f61715b58fc469f79bc48720f5117d0094 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/macros.h @@ -0,0 +1,49 @@ +/* macros for "field offset" and "field pointer", used when fields + are arguments to subroutines */ +/* Usage: fo = F_OFFSET( field ), where "field" is the name of a field + in lattice. 
+ address = F_PT( &site , fo ), where &site is the address of the + site and fo is a field_offset. Usually, the result will have to be + cast to a pointer to the appropriate type. (It is naturally a char *). +*/ +typedef int field_offset; +#define F_OFFSET(a) \ + ((field_offset)(((char *)&(lattice[0]. a ))-((char *)&(lattice[0])) )) +#define F_PT( site , fo ) ((char *)( site ) + (fo)) + +/* macros to loop over sites of a given parity. + Usage: + int i; + site *s; + FOREVENSITES(i,s){ + commands, where s is a pointer to the current site and i is + the index of the site on the node + } +*/ +#ifdef EVENFIRST +#define FOREVENSITES(i,s) \ + for(i=0,s=lattice;iparity==EVEN) +#define FORODDSITES(i,s) \ + for(i=0,s=lattice;iparity==ODD) +#define FORSOMEPARITY(i,s,choice) \ + for(i=0,s=lattice;iparity & (choice)) != 0) +#endif /* end ifdef EVENFIRST */ +#define FORALLSITES(i,s) \ + for(i=0,s=lattice;ie[0][0].imag + m3->e[1][1].imag + m3->e[2][2].imag)*0.33333333; + ah3->m00im = m3->e[0][0].imag - temp; + ah3->m11im = m3->e[1][1].imag - temp; + ah3->m22im = m3->e[2][2].imag - temp; + ah3->m01.real = (m3->e[0][1].real - m3->e[1][0].real)*0.5; + ah3->m02.real = (m3->e[0][2].real - m3->e[2][0].real)*0.5; + ah3->m12.real = (m3->e[1][2].real - m3->e[2][1].real)*0.5; + ah3->m01.imag = (m3->e[0][1].imag + m3->e[1][0].imag)*0.5; + ah3->m02.imag = (m3->e[0][2].imag + m3->e[2][0].imag)*0.5; + ah3->m12.imag = (m3->e[1][2].imag + m3->e[2][1].imag)*0.5; + +}/* make_anti_hermitian */ + +#else +void make_anti_hermitian( su3_matrix *m3, anti_hermitmat *ah3 ) { +radix temp,temp2; + + temp = + (m3->e[0][0].imag + m3->e[1][1].imag); + temp2 = temp + m3->e[2][2].imag; + temp = temp2*0.33333333; + ah3->m00im = m3->e[0][0].imag - temp; + ah3->m11im = m3->e[1][1].imag - temp; + ah3->m22im = m3->e[2][2].imag - temp; + temp = m3->e[0][1].real - m3->e[1][0].real; ah3->m01.real = temp*0.5; + temp = m3->e[0][2].real - m3->e[2][0].real; ah3->m02.real = temp*0.5; + temp = m3->e[1][2].real - 
m3->e[2][1].real; ah3->m12.real = temp*0.5; + temp = m3->e[0][1].imag + m3->e[1][0].imag; ah3->m01.imag = temp*0.5; + temp = m3->e[0][2].imag + m3->e[2][0].imag; ah3->m02.imag = temp*0.5; + temp = m3->e[1][2].imag + m3->e[2][1].imag; ah3->m12.imag = temp*0.5; + +}/* make_anti_hermitian */ +#endif /*end ifdef FAST */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma.c new file mode 100644 index 0000000000000000000000000000000000000000..061d3d82836c306edc08ff5103fbd192feaf548d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma.c @@ -0,0 +1,104 @@ +/************* mb_gamma.c (in su3.a) **************************/ +/* + Multiply a Wilson vector by a gamma matrix + usage: mult_by_gamma( wilson_vector *src, wilson_vector *dest, int dir ) + dir = XUP, YUP, ZUP, TUP or GAMMAFIVE + + gamma(XUP) + 0 0 0 i + 0 0 i 0 + 0 -i 0 0 + -i 0 0 0 + + gamma(YUP) + 0 0 0 -1 + 0 0 1 0 + 0 1 0 0 + -1 0 0 0 + + gamma(ZUP) + 0 0 i 0 + 0 0 0 -i + -i 0 0 0 + 0 i 0 0 + + gamma(TUP) + 0 0 1 0 + 0 0 0 1 + 1 0 0 0 + 0 1 0 0 + + gamma(FIVE) + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 +*/ +#include +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. 
*/ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +void mult_by_gamma( wilson_vector *src, wilson_vector *dest, int dir ){ + register int i; /*color*/ + + switch(dir){ + case XUP: + for(i=0;i<3;i++){ + TIMESPLUSI( src->d[3].c[i], dest->d[0].c[i] ); + TIMESPLUSI( src->d[2].c[i], dest->d[1].c[i] ); + TIMESMINUSI( src->d[1].c[i], dest->d[2].c[i] ); + TIMESMINUSI( src->d[0].c[i], dest->d[3].c[i] ); + } + break; + case YUP: + for(i=0;i<3;i++){ + TIMESMINUSONE( src->d[3].c[i], dest->d[0].c[i] ); + TIMESPLUSONE( src->d[2].c[i], dest->d[1].c[i] ); + TIMESPLUSONE( src->d[1].c[i], dest->d[2].c[i] ); + TIMESMINUSONE( src->d[0].c[i], dest->d[3].c[i] ); + } + break; + case ZUP: + for(i=0;i<3;i++){ + TIMESPLUSI( src->d[2].c[i], dest->d[0].c[i] ); + TIMESMINUSI( src->d[3].c[i], dest->d[1].c[i] ); + TIMESMINUSI( src->d[0].c[i], dest->d[2].c[i] ); + TIMESPLUSI( src->d[1].c[i], dest->d[3].c[i] ); + } + break; + case TUP: + for(i=0;i<3;i++){ + TIMESPLUSONE( src->d[2].c[i], dest->d[0].c[i] ); + TIMESPLUSONE( src->d[3].c[i], dest->d[1].c[i] ); + TIMESPLUSONE( src->d[0].c[i], dest->d[2].c[i] ); + TIMESPLUSONE( src->d[1].c[i], dest->d[3].c[i] ); + } + break; + case GAMMAFIVE: + for(i=0;i<3;i++){ + TIMESPLUSONE( src->d[0].c[i], dest->d[0].c[i] ); + TIMESPLUSONE( src->d[1].c[i], dest->d[1].c[i] ); + TIMESMINUSONE( src->d[2].c[i], dest->d[2].c[i] ); + TIMESMINUSONE( src->d[3].c[i], dest->d[3].c[i] ); + } + break; + default: + printf("BAD CALL TO MULT_BY_GAMMA()\n"); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma_l.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma_l.c new file mode 100644 index 0000000000000000000000000000000000000000..e049e757e4f244cb35082b771f1476ca3b17f1f5 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma_l.c @@ -0,0 +1,126 @@ +/************* mb_gamma_l.c (in su3.a) **************************/ +/* + Multiply a Wilson matrix by a gamma matrix acting on the row index + (This is the first index, or equivalently, multiplication on the left) + usage: mult_by_gamma_left( wilson_matrix *src, wilson_matrix *dest, int dir ) + dir = XUP, YUP, ZUP, TUP or GAMMAFIVE + + gamma(XUP) + 0 0 0 i + 0 0 i 0 + 0 -i 0 0 + -i 0 0 0 + + gamma(YUP) + 0 0 0 -1 + 0 0 1 0 + 0 1 0 0 + -1 0 0 0 + + gamma(ZUP) + 0 0 i 0 + 0 0 0 -i + -i 0 0 0 + 0 i 0 0 + + gamma(TUP) + 0 0 1 0 + 0 0 0 1 + 1 0 0 0 + 0 1 0 0 + + gamma(FIVE) + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 +*/ +#include <stdio.h> +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +void mult_by_gamma_left( wilson_matrix *src, wilson_matrix *dest, int dir ){ + register int i; /*color*/ + register int c2,s2; /* column indices, color and spin */ + /* i is the row color; each case permutes the row spin index (the first d[]) per the gamma(dir) table above, for every column (s2,c2). */ + switch(dir){ + case XUP: + for(i=0;i<3;i++)for(s2=0;s2<4;s2++)for(c2=0;c2<3;c2++){ + TIMESPLUSI( src->d[3].c[i].d[s2].c[c2], + dest->d[0].c[i].d[s2].c[c2] ); + TIMESPLUSI( src->d[2].c[i].d[s2].c[c2], + dest->d[1].c[i].d[s2].c[c2] ); + TIMESMINUSI( src->d[1].c[i].d[s2].c[c2], + dest->d[2].c[i].d[s2].c[c2] ); + TIMESMINUSI( src->d[0].c[i].d[s2].c[c2], + dest->d[3].c[i].d[s2].c[c2] ); + } + break; + case YUP: + for(i=0;i<3;i++)for(s2=0;s2<4;s2++)for(c2=0;c2<3;c2++){ + TIMESMINUSONE( src->d[3].c[i].d[s2].c[c2], + dest->d[0].c[i].d[s2].c[c2] ); + TIMESPLUSONE( src->d[2].c[i].d[s2].c[c2], + dest->d[1].c[i].d[s2].c[c2] ); + TIMESPLUSONE(
src->d[1].c[i].d[s2].c[c2], + dest->d[2].c[i].d[s2].c[c2] ); + TIMESMINUSONE( src->d[0].c[i].d[s2].c[c2], + dest->d[3].c[i].d[s2].c[c2] ); + } + break; + case ZUP: + for(i=0;i<3;i++)for(s2=0;s2<4;s2++)for(c2=0;c2<3;c2++){ + TIMESPLUSI( src->d[2].c[i].d[s2].c[c2], + dest->d[0].c[i].d[s2].c[c2] ); + TIMESMINUSI( src->d[3].c[i].d[s2].c[c2], + dest->d[1].c[i].d[s2].c[c2] ); + TIMESMINUSI( src->d[0].c[i].d[s2].c[c2], + dest->d[2].c[i].d[s2].c[c2] ); + TIMESPLUSI( src->d[1].c[i].d[s2].c[c2], + dest->d[3].c[i].d[s2].c[c2] ); + } + break; + case TUP: + for(i=0;i<3;i++)for(s2=0;s2<4;s2++)for(c2=0;c2<3;c2++){ + TIMESPLUSONE( src->d[2].c[i].d[s2].c[c2], + dest->d[0].c[i].d[s2].c[c2] ); + TIMESPLUSONE( src->d[3].c[i].d[s2].c[c2], + dest->d[1].c[i].d[s2].c[c2] ); + TIMESPLUSONE( src->d[0].c[i].d[s2].c[c2], + dest->d[2].c[i].d[s2].c[c2] ); + TIMESPLUSONE( src->d[1].c[i].d[s2].c[c2], + dest->d[3].c[i].d[s2].c[c2] ); + } + break; + case GAMMAFIVE: + for(i=0;i<3;i++)for(s2=0;s2<4;s2++)for(c2=0;c2<3;c2++){ + TIMESPLUSONE( src->d[0].c[i].d[s2].c[c2], + dest->d[0].c[i].d[s2].c[c2] ); + TIMESPLUSONE( src->d[1].c[i].d[s2].c[c2], + dest->d[1].c[i].d[s2].c[c2] ); + TIMESMINUSONE( src->d[2].c[i].d[s2].c[c2], + dest->d[2].c[i].d[s2].c[c2] ); + TIMESMINUSONE( src->d[3].c[i].d[s2].c[c2], + dest->d[3].c[i].d[s2].c[c2] ); + } + break; + default: /* unrecognised dir: only a message is printed, dest is not written */ + printf("BAD CALL TO MULT_BY_GAMMA_LEFT()\n"); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma_r.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma_r.c new file mode 100644 index 0000000000000000000000000000000000000000..28a286659a5425eab5f695b4a12df72988183fb0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/mb_gamma_r.c @@ -0,0 +1,127 @@ +/************* mb_gamma_r.c (in su3.a) **************************/ +/* + Multiply a Wilson matrix by a gamma matrix acting on the column index + (This is the second index, or equivalently, multiplication on the right) + usage:
mult_by_gamma_right wilson_matrix *src, wilson_matrix *dest, + int dir ) + dir = XUP, YUP, ZUP, TUP or GAMMAFIVE + + gamma(XUP) + 0 0 0 i + 0 0 i 0 + 0 -i 0 0 + -i 0 0 0 + + gamma(YUP) + 0 0 0 -1 + 0 0 1 0 + 0 1 0 0 + -1 0 0 0 + + gamma(ZUP) + 0 0 i 0 + 0 0 0 -i + -i 0 0 0 + 0 i 0 0 + + gamma(TUP) + 0 0 1 0 + 0 0 0 1 + 1 0 0 0 + 0 1 0 0 + + gamma(FIVE) + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 +*/ +#include +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +void mult_by_gamma_right( wilson_matrix *src, wilson_matrix *dest, int dir ){ + register int i; /*color*/ + register int c1,s1; /* row indices, color and spin */ + + switch(dir){ + case XUP: + for(i=0;i<3;i++)for(s1=0;s1<4;s1++)for(c1=0;c1<3;c1++){ + TIMESMINUSI( src->d[s1].c[c1].d[3].c[i], + dest->d[s1].c[c1].d[0].c[i] ); + TIMESMINUSI( src->d[s1].c[c1].d[2].c[i], + dest->d[s1].c[c1].d[1].c[i] ); + TIMESPLUSI( src->d[s1].c[c1].d[1].c[i], + dest->d[s1].c[c1].d[2].c[i] ); + TIMESPLUSI( src->d[s1].c[c1].d[0].c[i], + dest->d[s1].c[c1].d[3].c[i] ); + } + break; + case YUP: + for(i=0;i<3;i++)for(s1=0;s1<4;s1++)for(c1=0;c1<3;c1++){ + TIMESMINUSONE( src->d[s1].c[c1].d[3].c[i], + dest->d[s1].c[c1].d[0].c[i] ); + TIMESPLUSONE( src->d[s1].c[c1].d[2].c[i], + dest->d[s1].c[c1].d[1].c[i] ); + TIMESPLUSONE( src->d[s1].c[c1].d[1].c[i], + dest->d[s1].c[c1].d[2].c[i] ); + TIMESMINUSONE( src->d[s1].c[c1].d[0].c[i], + dest->d[s1].c[c1].d[3].c[i] ); + } + break; + case ZUP: + for(i=0;i<3;i++)for(s1=0;s1<4;s1++)for(c1=0;c1<3;c1++){ + TIMESMINUSI( src->d[s1].c[c1].d[2].c[i], + dest->d[s1].c[c1].d[0].c[i] ); + TIMESPLUSI( 
src->d[s1].c[c1].d[3].c[i], + dest->d[s1].c[c1].d[1].c[i] ); + TIMESPLUSI( src->d[s1].c[c1].d[0].c[i], + dest->d[s1].c[c1].d[2].c[i] ); + TIMESMINUSI( src->d[s1].c[c1].d[1].c[i], + dest->d[s1].c[c1].d[3].c[i] ); + } + break; + case TUP: + for(i=0;i<3;i++)for(s1=0;s1<4;s1++)for(c1=0;c1<3;c1++){ + TIMESPLUSONE( src->d[s1].c[c1].d[2].c[i], + dest->d[s1].c[c1].d[0].c[i] ); + TIMESPLUSONE( src->d[s1].c[c1].d[3].c[i], + dest->d[s1].c[c1].d[1].c[i] ); + TIMESPLUSONE( src->d[s1].c[c1].d[0].c[i], + dest->d[s1].c[c1].d[2].c[i] ); + TIMESPLUSONE( src->d[s1].c[c1].d[1].c[i], + dest->d[s1].c[c1].d[3].c[i] ); + } + break; + case GAMMAFIVE: + for(i=0;i<3;i++)for(s1=0;s1<4;s1++)for(c1=0;c1<3;c1++){ + TIMESPLUSONE( src->d[s1].c[c1].d[0].c[i], + dest->d[s1].c[c1].d[0].c[i] ); + TIMESPLUSONE( src->d[s1].c[c1].d[1].c[i], + dest->d[s1].c[c1].d[1].c[i] ); + TIMESMINUSONE( src->d[s1].c[c1].d[2].c[i], + dest->d[s1].c[c1].d[2].c[i] ); + TIMESMINUSONE( src->d[s1].c[c1].d[3].c[i], + dest->d[s1].c[c1].d[3].c[i] ); + } + break; + default: + printf("BAD CALL TO MULT_BY_GAMMA_RIGHT()\n"); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/msq_su3vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/msq_su3vec.c new file mode 100644 index 0000000000000000000000000000000000000000..011826844e790aaf053a92a16a7e0f949fcee0a5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/msq_su3vec.c @@ -0,0 +1,48 @@ +/****************** magsq_su3vec.c (in su3.a) ****************************** +* * +* radix magsq_su3vec( su3_vector *a ) * +* return squared magnitude of an SU3 vector +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +radix magsq_su3vec( su3_vector *a ){ +register radix sum; +register int i; + for(i=0,sum=0.0;i<3;i++)sum += a->c[i].real*a->c[i].real + + a->c[i].imag*a->c[i].imag; + return(sum); +} + +#else +#ifdef NATIVEDOUBLE /* IBM RS6000 version */ +radix magsq_su3vec(su3_vector *a){ + + register double ar,ai,sum; + + ar=a->c[0].real; 
ai=a->c[0].imag; + sum = ar*ar + ai*ai; + + ar=a->c[1].real; ai=a->c[1].imag; + sum += ar*ar + ai*ai; + + ar=a->c[2].real; ai=a->c[2].imag; + sum += ar*ar + ai*ai; + + return((radix)sum); +} +#else +radix magsq_su3vec( su3_vector *a ){ +register radix temp,sum; + sum=0.0; + temp = a->c[0].real*a->c[0].real; sum += temp; + temp = a->c[0].imag*a->c[0].imag; sum += temp; + temp = a->c[1].real*a->c[1].real; sum += temp; + temp = a->c[1].imag*a->c[1].imag; sum += temp; + temp = a->c[2].real*a->c[2].real; sum += temp; + temp = a->c[2].imag*a->c[2].imag; sum += temp; + return(sum); +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* end ifdef FAST */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/msq_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/msq_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..05e1910dffdb7c220c7f0c93ac74d85f026dd169 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/msq_wvec.c @@ -0,0 +1,57 @@ + /******************** msq_wvec.c (in su3.a) ******************** +* +*radix msq_wvec(wilson_vector *vec) +* squared magnitude of a Wilson vector +* +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST +radix magsq_wvec( wilson_vector *vec ){ + register int i; + register radix sum; + sum=0.0; + for(i=0;i<4;i++)sum += magsq_su3vec( &(vec->d[i]) ); + return(sum); + +#else /* Fast version */ +radix magsq_wvec( wilson_vector *vec ){ + +#ifdef NATIVEDOUBLE + register double ar,ai,sum; +#else + register radix ar,ai,sum; +#endif + + ar=vec->d[0].c[0].real; ai=vec->d[0].c[0].imag; + sum = ar*ar + ai*ai; + ar=vec->d[0].c[1].real; ai=vec->d[0].c[1].imag; + sum += ar*ar + ai*ai; + ar=vec->d[0].c[2].real; ai=vec->d[0].c[2].imag; + sum += ar*ar + ai*ai; + + ar=vec->d[1].c[0].real; ai=vec->d[1].c[0].imag; + sum += ar*ar + ai*ai; + ar=vec->d[1].c[1].real; ai=vec->d[1].c[1].imag; + sum += ar*ar + ai*ai; + ar=vec->d[1].c[2].real; ai=vec->d[1].c[2].imag; + sum += ar*ar + ai*ai; + + 
ar=vec->d[2].c[0].real; ai=vec->d[2].c[0].imag; + sum += ar*ar + ai*ai; + ar=vec->d[2].c[1].real; ai=vec->d[2].c[1].imag; + sum += ar*ar + ai*ai; + ar=vec->d[2].c[2].real; ai=vec->d[2].c[2].imag; + sum += ar*ar + ai*ai; + + ar=vec->d[3].c[0].real; ai=vec->d[3].c[0].imag; + sum += ar*ar + ai*ai; + ar=vec->d[3].c[1].real; ai=vec->d[3].c[1].imag; + sum += ar*ar + ai*ai; + ar=vec->d[3].c[2].real; ai=vec->d[3].c[2].imag; + sum += ar*ar + ai*ai; + + return((radix)sum); +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/pvm3.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/pvm3.h new file mode 100644 index 0000000000000000000000000000000000000000..f43a3c4524170484adeb39e54686a8e0c299f7b1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/pvm3.h @@ -0,0 +1,203 @@ + +/* + * PVM 3.0: Parallel Virtual Machine System 3.0 + * University of Tennessee, Knoxville TN. + * Oak Ridge National Laboratory, Oak Ridge TN. + * Emory University, Atlanta GA. + * Authors: A. L. Beguelin, J. J. Dongarra, G. A. Geist, + * R. J. Manchek, B. K. Moore, and V. S. Sunderam + * (C) 1992 All Rights Reserved + * + * NOTICE + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby granted + * provided that the above copyright notice appear in all copies and + * that both the copyright notice and this permission notice appear in + * supporting documentation. + * + * Neither the Institutions (Emory University, Oak Ridge National + * Laboratory, and University of Tennessee) nor the Authors make any + * representations about the suitability of this software for any + * purpose. This software is provided ``as is'' without express or + * implied warranty. + * + * PVM 3.0 was funded in part by the U.S. Department of Energy, the + * National Science Foundation and the State of Tennessee. 
+ */ + +#ifndef _PVM3_H_ + +#define _PVM3_H_ + +/* +* Data packing styles for pvm_initsend() +*/ + +#define PvmDataDefault 0 +#define PvmDataRaw 1 +#define PvmDataInPlace 2 +#define PvmDataFoo 3 + +/* +* pvm_spawn options +*/ + +#define PvmTaskDefault 0 +#define PvmTaskHost 1 /* specify host */ +#define PvmTaskArch 2 /* specify architecture */ +#define PvmTaskDebug 4 /* start task in debugger */ + +/* +* pvm_notify types +*/ + +#define PvmTaskExit 1 /* on task exit */ +#define PvmHostDelete 2 /* on host fail/delete */ +#define PvmHostAdd 3 /* on host startup */ + +/* +* Libpvm error codes +*/ + +#define PvmOk 0 /* okay */ + /* reserve -1 */ +#define PvmBadParam -2 /* bad parameter (neg msg id, etc) */ +#define PvmMismatch -3 /* barrier count mismatch */ +#define PvmNoData -5 /* read past end of buffer */ +#define PvmNoHost -6 /* no such host */ +#define PvmNoFile -7 /* no such executable */ +#define PvmNoMem -10 /* can't get memory */ +#define PvmBadMsg -12 /* received msg can't be decoded */ +#define PvmSysErr -14 /* can't contact our pvmd/some system error */ +#define PvmNoBuf -15 /* no current buffer */ +#define PvmNoSuchBuf -16 /* bad message id */ +#define PvmNullGroup -17 /* null group name is illegal */ +#define PvmDupGroup -18 /* already in group */ +#define PvmNoGroup -19 /* no group with name */ +#define PvmNotInGroup -20 /* task not in group */ +#define PvmNoInst -21 /* no such instance in group */ +#define PvmHostFail -22 /* host failed */ +#define PvmNoParent -23 /* no parent task */ +#define PvmNotImpl -24 /* function not implemented */ +#define PvmDSysErr -25 /* pvmd system error */ +#define PvmBadVersion -26 /* pvmd-pvmd protocol version mismatch */ +#define PvmOutOfRes -27 /* out of resources */ +#define PvmDupHost -28 /* host already configured */ +#define PvmCantStart -29 /* failed to exec new slave pvmd */ +#define PvmAlready -30 /* already doing operation */ +#define PvmNoTask -31 /* no such task */ +#define PvmNoEntry -32 /* no such name, 
index pair */ +#define PvmDupEntry -33 /* name, index pair already exists */ + +/* +* returned by pvm_config() +*/ + +struct hostinfo { + int hi_tid; /* pvmd tid */ + char *hi_name; /* host name */ + char *hi_arch; /* host arch */ + int hi_mtu; /* max packet length */ + int hi_speed; /* cpu relative speed */ +}; + +/* +* returned by pvm_tasks() +*/ + +struct taskinfo { + int ti_tid; /* task id */ + int ti_ptid; /* parent tid */ + int ti_host; /* pvmd tid */ + int ti_flag; /* status flags */ + char *ti_a_out; /* a.out name */ +}; + + +#ifdef __ProtoGlarp__ +#undef __ProtoGlarp__ +#endif +#ifdef __STDC__ +#define __ProtoGlarp__(x) x +#else +#define __ProtoGlarp__(x) () +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +int pvm_addhosts __ProtoGlarp__(( char **names, int count, int *svp )); +int pvm_barrier __ProtoGlarp__(( char *group, int count )); +int pvm_bcast __ProtoGlarp__(( char *group, int code )); +int pvm_bufinfo __ProtoGlarp__(( int mid, int *len, int *code, int *tid )); +int pvm_config __ProtoGlarp__(( int *nhostp, int *narchp, + struct hostinfo **hostp )); +int pvm_delete __ProtoGlarp__(( char *name, int req )); +int pvm_delhosts __ProtoGlarp__(( char **names, int count, int *svp )); +int pvm_exit __ProtoGlarp__(( void )); +int pvm_freebuf __ProtoGlarp__(( int mid )); +int pvm_getinst __ProtoGlarp__(( char *group, int tid )); +int pvm_getrbuf __ProtoGlarp__(( void )); +int pvm_getsbuf __ProtoGlarp__(( void )); +int pvm_gettid __ProtoGlarp__(( char *group, int inst )); +int pvm_gsize __ProtoGlarp__(( char *group )); +int pvm_halt __ProtoGlarp__(( void )); +int pvm_initsend __ProtoGlarp__(( int encod )); +int pvm_insert __ProtoGlarp__(( char *name, int req, int data )); +int pvm_joingroup __ProtoGlarp__(( char *group )); +int pvm_kill __ProtoGlarp__(( int tid )); +int pvm_lookup __ProtoGlarp__(( char *name, int req, int *datap )); +int pvm_lvgroup __ProtoGlarp__(( char *group )); +int pvm_mcast __ProtoGlarp__(( int *tids, int count, int code )); +int 
pvm_mkbuf __ProtoGlarp__(( int encod )); +int pvm_mstat __ProtoGlarp__(( char *host )); +int pvm_mytid __ProtoGlarp__(( void )); +int pvm_notify __ProtoGlarp__(( int what, int code, + int count, int *tids )); +int pvm_nrecv __ProtoGlarp__(( int tid, int code )); +int pvm_parent __ProtoGlarp__(( void )); +int pvm_perror __ProtoGlarp__(( char *msg )); +int pvm_pkbyte __ProtoGlarp__(( char *cp, int cnt, int std )); +int pvm_pkcplx __ProtoGlarp__(( radix *xp, int cnt, int std )); +int pvm_pkdcplx __ProtoGlarp__(( double *zp, int cnt, int std )); +int pvm_pkdouble __ProtoGlarp__(( double *dp, int cnt, int std )); +int pvm_pkradix __ProtoGlarp__(( radix *fp, int cnt, int std )); +int pvm_pkint __ProtoGlarp__(( int *np, int cnt, int std )); +int pvm_pklong __ProtoGlarp__(( long *np, int cnt, int std )); +int pvm_pkshort __ProtoGlarp__(( short *np, int cnt, int std )); +int pvm_pkstr __ProtoGlarp__(( char *cp )); +int pvm_pstat __ProtoGlarp__(( int tid )); +int pvm_recv __ProtoGlarp__(( int tid, int code )); +int (*pvm_recvf __ProtoGlarp__(( int (*new)() )) )(); +int pvm_send __ProtoGlarp__(( int tid, int code )); +int pvm_sendsig __ProtoGlarp__(( int tid, int signum )); +int pvm_serror __ProtoGlarp__(( int how )); +int pvm_setdebug __ProtoGlarp__(( int mask )); +int pvm_setrbuf __ProtoGlarp__(( int mid )); +int pvm_setsbuf __ProtoGlarp__(( int mid )); +int pvm_spawn __ProtoGlarp__(( char *file, char **argv, int flags, + char *where, int count, int *tids )); +int pvm_start_pvmd __ProtoGlarp__(( int argc, char **argv )); +int pvm_tasks __ProtoGlarp__(( int where, int *ntaskp, + struct taskinfo **taskp )); +int pvm_tickle __ProtoGlarp__(( int how )); +int pvm_tidtohost __ProtoGlarp__(( int tid )); +int pvm_upkbyte __ProtoGlarp__(( char *cp, int cnt, int std )); +int pvm_upkcplx __ProtoGlarp__(( radix *xp, int cnt, int std )); +int pvm_upkdcplx __ProtoGlarp__(( double *zp, int cnt, int std )); +int pvm_upkdouble __ProtoGlarp__(( double *dp, int cnt, int std )); +int 
pvm_upkradix __ProtoGlarp__(( radix *fp, int cnt, int std )); +int pvm_upkint __ProtoGlarp__(( int *np, int cnt, int std )); +int pvm_upklong __ProtoGlarp__(( long *np, int cnt, int std )); +int pvm_upkshort __ProtoGlarp__(( short *np, int cnt, int std )); +int pvm_upkstr __ProtoGlarp__(( char *cp )); +char *pvm_version __ProtoGlarp__(( void )); + +#ifdef __cplusplus +} +#endif + +#endif /*_PVM3_H_*/ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/radix.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/radix.h new file mode 100644 index 0000000000000000000000000000000000000000..ac192dd3ca472af904227a2049dcc22aa0f4af98 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/radix.h @@ -0,0 +1,12 @@ +#ifndef RADIX +#define RADIX +/* this file just defines radix */ + +#define RADIX_F /* define symbol so that know radix is in use + * and is float + * another option: RADIX_D + */ + +typedef float radix; /* basic type */ + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/rand_ahmat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/rand_ahmat.c new file mode 100644 index 0000000000000000000000000000000000000000..06aec2236a6553386803b2dfd9dca539ca52a0d4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/rand_ahmat.c @@ -0,0 +1,31 @@ +/****************** rand_ahmat.c (in su3.a) *************************** +* * +* void random_anti_hermitian( anti_hermitmat *mat_antihermit, passthru *prn_pt)* +* Creates gaussian random anti-hermitian matrices * +* Normalization is < |m01|^2 > = 1, or < m01.real*m01.real > = 1/2 * +* The argument "prn_pt" is a pointer to be passed to gaussian_rand_no() * +* RS6000 may choke on void * * +*/ +typedef void passthru; +#include <math.h> +#include "complex.h" +#include "su3.h" + +void random_anti_hermitian( anti_hermitmat *mat_antihermit, passthru *prn_pt) { +radix r3,r8; +radix sqrt_third; + /* r3 and r8 are two draws combined into the three diagonal imaginary parts, which sum to zero by construction; the six off-diagonal components each get their own draw. */ + sqrt_third = sqrt( (double)(1.0/3.0) ); +
r3=gaussian_rand_no(prn_pt); + r8=gaussian_rand_no(prn_pt); + mat_antihermit->m00im=r3+sqrt_third*r8; + mat_antihermit->m11im= -r3+sqrt_third*r8; + mat_antihermit->m22im= -2.0*sqrt_third*r8; + mat_antihermit->m01.real=gaussian_rand_no(prn_pt); + mat_antihermit->m02.real=gaussian_rand_no(prn_pt); + mat_antihermit->m12.real=gaussian_rand_no(prn_pt); + mat_antihermit->m01.imag=gaussian_rand_no(prn_pt); + mat_antihermit->m02.imag=gaussian_rand_no(prn_pt); + mat_antihermit->m12.imag=gaussian_rand_no(prn_pt); + +}/*random_anti_hermitian_*/ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/realtr.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/realtr.c new file mode 100644 index 0000000000000000000000000000000000000000..24c6f5289ada8c5df4697f1f06b32d295bd462f7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/realtr.c @@ -0,0 +1,15 @@ +/****************** realtr.c (in su3.a) ******************************* +* * +* radix realtrace_su3( su3_matrix *a,*b) * +* return Re( Tr( A_adjoint*B ) ) * +*/ +#include "complex.h" +#include "su3.h" + +radix realtrace_su3( su3_matrix *a, su3_matrix *b ){ +register int i,j; +register radix sum; + for(sum=0.0,i=0;i<3;i++)for(j=0;j<3;j++) + sum+= a->e[i][j].real*b->e[i][j].real + a->e[i][j].imag*b->e[i][j].imag; + return(sum); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..cc2cabdb6a1a8a2423d56f95e65b4270dac3e4d5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_mat.c @@ -0,0 +1,49 @@ +/**************** s_m_a_mat.c (in su3.a) ****************************** +* * +* void scalar_mult_add_su3_matrix( su3_matrix *a, su3_matrix *b, * +* radix s, su3_matrix *c) * +* C <- A + s*B * +*/ +#include "complex.h" +#include "su3.h" + +/* c <- a + s*b, matrices */ +void scalar_mult_add_su3_matrix(su3_matrix
*a,su3_matrix *b,radix s, + su3_matrix *c){ + +#ifndef NATIVEDOUBLE +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + c->e[i][j].real = a->e[i][j].real + s*b->e[i][j].real; + c->e[i][j].imag = a->e[i][j].imag + s*b->e[i][j].imag; + } + +#else /* RS6000 version */ + + register double ss; + + ss = s; + + c->e[0][0].real = a->e[0][0].real + ss*b->e[0][0].real; + c->e[0][0].imag = a->e[0][0].imag + ss*b->e[0][0].imag; + c->e[0][1].real = a->e[0][1].real + ss*b->e[0][1].real; + c->e[0][1].imag = a->e[0][1].imag + ss*b->e[0][1].imag; + c->e[0][2].real = a->e[0][2].real + ss*b->e[0][2].real; + c->e[0][2].imag = a->e[0][2].imag + ss*b->e[0][2].imag; + + c->e[1][0].real = a->e[1][0].real + ss*b->e[1][0].real; + c->e[1][0].imag = a->e[1][0].imag + ss*b->e[1][0].imag; + c->e[1][1].real = a->e[1][1].real + ss*b->e[1][1].real; + c->e[1][1].imag = a->e[1][1].imag + ss*b->e[1][1].imag; + c->e[1][2].real = a->e[1][2].real + ss*b->e[1][2].real; + c->e[1][2].imag = a->e[1][2].imag + ss*b->e[1][2].imag; + + c->e[2][0].real = a->e[2][0].real + ss*b->e[2][0].real; + c->e[2][0].imag = a->e[2][0].imag + ss*b->e[2][0].imag; + c->e[2][1].real = a->e[2][1].real + ss*b->e[2][1].real; + c->e[2][1].imag = a->e[2][1].imag + ss*b->e[2][1].imag; + c->e[2][2].real = a->e[2][2].real + ss*b->e[2][2].real; + c->e[2][2].imag = a->e[2][2].imag + ss*b->e[2][2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..2a5090ffb2e3728b020749114789a1da463f56a2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_vec.c @@ -0,0 +1,36 @@ +/**************** s_m_a_vec.c (in su3.a) ****************************** +* * +* void scalar_mult_add_su3_vector( su3_vector *a, su3_vector *b, * +* radix s, su3_vector *c) * +* C <- A + s*B, A,B and C vectors * +*/ +#include "complex.h" +#include "su3.h" + +/* c <- a +
s*b, vectors */ + +void scalar_mult_add_su3_vector(su3_vector *a, su3_vector *b, radix s, + su3_vector *c){ + +#ifndef NATIVEDOUBLE + register int i; + for(i=0;i<3;i++){ + c->c[i].real = a->c[i].real + s*b->c[i].real; + c->c[i].imag = a->c[i].imag + s*b->c[i].imag; + } + +#else /* RS6000 version */ + + register double ss; + + ss = s; + + c->c[0].real = a->c[0].real + ss*b->c[0].real; + c->c[0].imag = a->c[0].imag + ss*b->c[0].imag; + c->c[1].real = a->c[1].real + ss*b->c[1].real; + c->c[1].imag = a->c[1].imag + ss*b->c[1].imag; + c->c[2].real = a->c[2].real + ss*b->c[2].real; + c->c[2].imag = a->c[2].imag + ss*b->c[2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..6805599b797832a2333b5d4ac160fe682533fea9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_a_wvec.c @@ -0,0 +1,51 @@ +/******************** s_m_a_wvec.c (in su3.a) ******************** +* +*void scalar_mult_sum_wvec_V(wilson_vector s1[], wilson_vector s2[], + radix ss, wilson_vector d[]) +* For every site i visited by for_active_sites: scale s2[i] by ss and add s1[i] +* d[i] <- s1[i] + ss*s2[i] (4 spins x 3 colors, real and imaginary parts) +*/ + +#include "lattice.h" + +void scalar_mult_sum_wvec_V(wilson_vector s1[], wilson_vector s2[], radix ss, + wilson_vector d[]) +{ + int i; + wilson_vector *dest; + register wilson_vector src1,src2; + /* for_active_sites is not defined in this file (presumably a loop macro from lattice.h). Both operands are copied into locals before d[i] is written. */ + for_active_sites(i) { + src1 = s1[i]; + src2 = s2[i]; + dest = &d[i]; + + dest->d[0].c[0].real = src1.d[0].c[0].real + ss*src2.d[0].c[0].real; + dest->d[0].c[0].imag = src1.d[0].c[0].imag + ss*src2.d[0].c[0].imag; + dest->d[0].c[1].real = src1.d[0].c[1].real + ss*src2.d[0].c[1].real; + dest->d[0].c[1].imag = src1.d[0].c[1].imag + ss*src2.d[0].c[1].imag; + dest->d[0].c[2].real = src1.d[0].c[2].real + ss*src2.d[0].c[2].real; + dest->d[0].c[2].imag = src1.d[0].c[2].imag + ss*src2.d[0].c[2].imag; + + dest->d[1].c[0].real
= src1.d[1].c[0].real + ss*src2.d[1].c[0].real; + dest->d[1].c[0].imag = src1.d[1].c[0].imag + ss*src2.d[1].c[0].imag; + dest->d[1].c[1].real = src1.d[1].c[1].real + ss*src2.d[1].c[1].real; + dest->d[1].c[1].imag = src1.d[1].c[1].imag + ss*src2.d[1].c[1].imag; + dest->d[1].c[2].real = src1.d[1].c[2].real + ss*src2.d[1].c[2].real; + dest->d[1].c[2].imag = src1.d[1].c[2].imag + ss*src2.d[1].c[2].imag; + + dest->d[2].c[0].real = src1.d[2].c[0].real + ss*src2.d[2].c[0].real; + dest->d[2].c[0].imag = src1.d[2].c[0].imag + ss*src2.d[2].c[0].imag; + dest->d[2].c[1].real = src1.d[2].c[1].real + ss*src2.d[2].c[1].real; + dest->d[2].c[1].imag = src1.d[2].c[1].imag + ss*src2.d[2].c[1].imag; + dest->d[2].c[2].real = src1.d[2].c[2].real + ss*src2.d[2].c[2].real; + dest->d[2].c[2].imag = src1.d[2].c[2].imag + ss*src2.d[2].c[2].imag; + + dest->d[3].c[0].real = src1.d[3].c[0].real + ss*src2.d[3].c[0].real; + dest->d[3].c[0].imag = src1.d[3].c[0].imag + ss*src2.d[3].c[0].imag; + dest->d[3].c[1].real = src1.d[3].c[1].real + ss*src2.d[3].c[1].real; + dest->d[3].c[1].imag = src1.d[3].c[1].imag + ss*src2.d[3].c[1].imag; + dest->d[3].c[2].real = src1.d[3].c[2].real + ss*src2.d[3].c[2].real; + dest->d[3].c[2].imag = src1.d[3].c[2].imag + ss*src2.d[3].c[2].imag; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_atm_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_atm_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..e18811e5c95e1174b12f4c57ff58c1ae7baf7fb7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_atm_wvec.c @@ -0,0 +1,61 @@ +/***************** s_m_atm_wvec.c (in su3.a) ******************** +* +*void scalar_mult_addtm_wvec(wilson_vector *src1, wilson_vector *src2, + radix s, wilson_vector *dest) +* Multiply a Wilson vector by a scalar and add to minus one times +* another vector +* dest <- (-1)*src1 + s*src2 +*/ +#include "complex.h" +#include "su3.h" + +void
scalar_mult_addtm_wvec(wilson_vector *src1,wilson_vector *src2, + radix s,wilson_vector *dest){ + +#ifndef NATIVEDOUBLE + register int spin,color; + for(spin=0;spin<4;spin++){ /* 4 spin components */ + for(color=0;color<3;color++){ /* 3 colors per spin */ + dest->d[spin].c[color].real = -src1->d[spin].c[color].real + + s*src2->d[spin].c[color].real; + dest->d[spin].c[color].imag = -src1->d[spin].c[color].imag + + s*src2->d[spin].c[color].imag; + } + } + +#else /* RS6000 version */ + + register double scale; /* s promoted to double once; the unrolled products below are formed in double */ + scale = s; + + dest->d[0].c[0].real = -src1->d[0].c[0].real + scale*src2->d[0].c[0].real; + dest->d[0].c[0].imag = -src1->d[0].c[0].imag + scale*src2->d[0].c[0].imag; + dest->d[0].c[1].real = -src1->d[0].c[1].real + scale*src2->d[0].c[1].real; + dest->d[0].c[1].imag = -src1->d[0].c[1].imag + scale*src2->d[0].c[1].imag; + dest->d[0].c[2].real = -src1->d[0].c[2].real + scale*src2->d[0].c[2].real; + dest->d[0].c[2].imag = -src1->d[0].c[2].imag + scale*src2->d[0].c[2].imag; + + dest->d[1].c[0].real = -src1->d[1].c[0].real + scale*src2->d[1].c[0].real; + dest->d[1].c[0].imag = -src1->d[1].c[0].imag + scale*src2->d[1].c[0].imag; + dest->d[1].c[1].real = -src1->d[1].c[1].real + scale*src2->d[1].c[1].real; + dest->d[1].c[1].imag = -src1->d[1].c[1].imag + scale*src2->d[1].c[1].imag; + dest->d[1].c[2].real = -src1->d[1].c[2].real + scale*src2->d[1].c[2].real; + dest->d[1].c[2].imag = -src1->d[1].c[2].imag + scale*src2->d[1].c[2].imag; + + dest->d[2].c[0].real = -src1->d[2].c[0].real + scale*src2->d[2].c[0].real; + dest->d[2].c[0].imag = -src1->d[2].c[0].imag + scale*src2->d[2].c[0].imag; + dest->d[2].c[1].real = -src1->d[2].c[1].real + scale*src2->d[2].c[1].real; + dest->d[2].c[1].imag = -src1->d[2].c[1].imag + scale*src2->d[2].c[1].imag; + dest->d[2].c[2].real = -src1->d[2].c[2].real + scale*src2->d[2].c[2].real; + dest->d[2].c[2].imag = -src1->d[2].c[2].imag + scale*src2->d[2].c[2].imag; + + dest->d[3].c[0].real = -src1->d[3].c[0].real + scale*src2->d[3].c[0].real; + dest->d[3].c[0].imag = -src1->d[3].c[0].imag + scale*src2->d[3].c[0].imag; + dest->d[3].c[1].real = -src1->d[3].c[1].real + scale*src2->d[3].c[1].real; +
dest->d[3].c[1].imag = -src1->d[3].c[1].imag + scale*src2->d[3].c[1].imag; + dest->d[3].c[2].real = -src1->d[3].c[2].real + scale*src2->d[3].c[2].real; + dest->d[3].c[2].imag = -src1->d[3].c[2].imag + scale*src2->d[3].c[2].imag; + +#endif +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_hwvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_hwvec.c new file mode 100644 index 0000000000000000000000000000000000000000..977b9853d97d04ee8cda8fa5047578451d5dec10 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_hwvec.c @@ -0,0 +1,38 @@ +/******************** s_m_hwvec.c (in su3.a) ******************** +* +*void scalar_mult_hwvec(half_wilson_vector *src, radix s, + half_wilson_vector *dest) +* Multiply a half Wilson vector by a scalar +* dest <- s*src +*/ +#include "complex.h" +#include "su3.h" + +void scalar_mult_hwvec( half_wilson_vector *src, radix s, + half_wilson_vector *dest ){ + +#ifndef NATIVEDOUBLE + register int half; /* index over the two su3 vectors h[0], h[1] */ + for(half=0;half<2;half++)scalar_mult_su3_vector( &(src->h[half]), s, &(dest->h[half])); + +#else /* RS6000 version */ + + register double scale; /* s promoted to double once */ + scale = s; + + dest->h[0].c[0].real = scale*src->h[0].c[0].real; + dest->h[0].c[0].imag = scale*src->h[0].c[0].imag; + dest->h[0].c[1].real = scale*src->h[0].c[1].real; + dest->h[0].c[1].imag = scale*src->h[0].c[1].imag; + dest->h[0].c[2].real = scale*src->h[0].c[2].real; + dest->h[0].c[2].imag = scale*src->h[0].c[2].imag; + + dest->h[1].c[0].real = scale*src->h[1].c[0].real; + dest->h[1].c[0].imag = scale*src->h[1].c[0].imag; + dest->h[1].c[1].real = scale*src->h[1].c[1].real; + dest->h[1].c[1].imag = scale*src->h[1].c[1].imag; + dest->h[1].c[2].real = scale*src->h[1].c[2].real; + dest->h[1].c[2].imag = scale*src->h[1].c[2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..e79276418c66646b267cb2a543f4bb7b66c867be --- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_mat.c @@ -0,0 +1,47 @@ +/****************** s_m_mat.c (in su3.a) ****************************** +* * +* void scalar_mult_su3_matrix( su3_matrix *a, radix s, su3_matrix *b) * +* B <- s*A * +*/ +#include "complex.h" +#include "su3.h" + +/* Scale every complex entry of the 3x3 matrix *a by the real factor s and store the result in *b. */ +void scalar_mult_su3_matrix( su3_matrix *a, radix s, su3_matrix *b ){ + +#ifndef NATIVEDOUBLE +register int row,col; + for(row=0;row<3;row++)for(col=0;col<3;col++){ + b->e[row][col].real = s*a->e[row][col].real; + b->e[row][col].imag = s*a->e[row][col].imag; + } + +#else /* RS6000 version */ + + register double scale; + + scale = s; /* promote s to double once; the unrolled products below are formed in double */ + + b->e[0][0].real = scale*a->e[0][0].real; + b->e[0][0].imag = scale*a->e[0][0].imag; + b->e[0][1].real = scale*a->e[0][1].real; + b->e[0][1].imag = scale*a->e[0][1].imag; + b->e[0][2].real = scale*a->e[0][2].real; + b->e[0][2].imag = scale*a->e[0][2].imag; + + b->e[1][0].real = scale*a->e[1][0].real; + b->e[1][0].imag = scale*a->e[1][0].imag; + b->e[1][1].real = scale*a->e[1][1].real; + b->e[1][1].imag = scale*a->e[1][1].imag; + b->e[1][2].real = scale*a->e[1][2].real; + b->e[1][2].imag = scale*a->e[1][2].imag; + + b->e[2][0].real = scale*a->e[2][0].real; + b->e[2][0].imag = scale*a->e[2][0].imag; + b->e[2][1].real = scale*a->e[2][1].real; + b->e[2][1].imag = scale*a->e[2][1].imag; + b->e[2][2].real = scale*a->e[2][2].real; + b->e[2][2].imag = scale*a->e[2][2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_s_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_s_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..81913a62a921f4160e0e2b0e7aa418fc2f1677cf --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_s_mat.c @@ -0,0 +1,49 @@ +/**************** s_m_s_mat.c (in su3.a) ****************************** +* * +* void scalar_mult_sub_su3_matrix( su3_matrix *a, su3_matrix *b, * +* radix s, su3_matrix *c) * +* C <- A - s*B, A,B and C matrices * +*/ +#include "complex.h" +#include "su3.h" + +/* c <- a - s*b,
matrices */ +void scalar_mult_sub_su3_matrix(su3_matrix *a,su3_matrix *b,radix s, + su3_matrix *c){ /* for each of the nine entries e[i][j], real and imaginary part separately: c gets a minus s times b; a and b are only read */ + +#ifndef NATIVEDOUBLE /* generic build: double loop over both indices of e[i][j] */ +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + c->e[i][j].real = a->e[i][j].real - s*b->e[i][j].real; + c->e[i][j].imag = a->e[i][j].imag - s*b->e[i][j].imag; + } + +#else /* RS6000 version: the 18 real/imag updates are written out one by one, without a loop */ + + register double ss; /* double copy of s; presumably kept so the products below are formed in double even when radix is float (confirm radix in complex.h) */ + + ss = s; + /* entries e[0][0..2] */ + c->e[0][0].real = a->e[0][0].real - ss*b->e[0][0].real; + c->e[0][0].imag = a->e[0][0].imag - ss*b->e[0][0].imag; + c->e[0][1].real = a->e[0][1].real - ss*b->e[0][1].real; + c->e[0][1].imag = a->e[0][1].imag - ss*b->e[0][1].imag; + c->e[0][2].real = a->e[0][2].real - ss*b->e[0][2].real; + c->e[0][2].imag = a->e[0][2].imag - ss*b->e[0][2].imag; + /* entries e[1][0..2] */ + c->e[1][0].real = a->e[1][0].real - ss*b->e[1][0].real; + c->e[1][0].imag = a->e[1][0].imag - ss*b->e[1][0].imag; + c->e[1][1].real = a->e[1][1].real - ss*b->e[1][1].real; + c->e[1][1].imag = a->e[1][1].imag - ss*b->e[1][1].imag; + c->e[1][2].real = a->e[1][2].real - ss*b->e[1][2].real; + c->e[1][2].imag = a->e[1][2].imag - ss*b->e[1][2].imag; + /* entries e[2][0..2] */ + c->e[2][0].real = a->e[2][0].real - ss*b->e[2][0].real; + c->e[2][0].imag = a->e[2][0].imag - ss*b->e[2][0].imag; + c->e[2][1].real = a->e[2][1].real - ss*b->e[2][1].real; + c->e[2][1].imag = a->e[2][1].imag - ss*b->e[2][1].imag; + c->e[2][2].real = a->e[2][2].real - ss*b->e[2][2].real; + c->e[2][2].imag = a->e[2][2].imag - ss*b->e[2][2].imag; + +#endif /* NATIVEDOUBLE */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_s_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_s_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..afa63246228d67effecee462aa85075133f1e409 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_s_vec.c @@ -0,0 +1,35 @@ +/***************** s_m_s_vec.c (in su3.a) ***************************** +* * +* void scalar_mult_sub_su3_vector( su3_vector *a, su3_vector *b, * +* radix s, su3_vector *c) * +* C <- A - s*B, A,B and C vectors * +*/
+#include "complex.h" +#include "su3.h" + +/* c <- a - s*b, vectors: done separately for the real and imaginary part of each of the three components; a and b are only read */ +void scalar_mult_sub_su3_vector(su3_vector *a,su3_vector *b,radix s, + su3_vector *c){ + +#ifndef NATIVEDOUBLE /* generic build: loop over the three components; note that the pointer argument c and the struct member c[] share a name */ + register int i; + for(i=0;i<3;i++){ + c->c[i].real = a->c[i].real - s*b->c[i].real; + c->c[i].imag = a->c[i].imag - s*b->c[i].imag; + } + +#else /* RS6000 version: the six real/imag updates are written out one by one, without a loop */ + + register double ss; /* double copy of s; presumably kept so the products below are formed in double even when radix is float (confirm radix in complex.h) */ + + ss = s; + + c->c[0].real = a->c[0].real - ss*b->c[0].real; + c->c[0].imag = a->c[0].imag - ss*b->c[0].imag; + c->c[1].real = a->c[1].real - ss*b->c[1].real; + c->c[1].imag = a->c[1].imag - ss*b->c[1].imag; + c->c[2].real = a->c[2].real - ss*b->c[2].real; + c->c[2].imag = a->c[2].imag - ss*b->c[2].imag; + +#endif /* NATIVEDOUBLE */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_sum_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_sum_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..3ecd401077d95ed1f659980a69d6e90715d4e6cf --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_sum_vec.c @@ -0,0 +1,33 @@ +/**************** s_m_sum_vec.c (in su3.a) **************************** +* * +* void scalar_mult_sum_su3_vector( su3_vector *a, su3_vector *b, radix s )* +* A <- A + s*B, A and B vectors * +*/ +#include "complex.h" +#include "su3.h" + +/* a <- a + s*b, vectors: a is updated in place (real and imaginary part of each of its three components), b is only read, nothing is returned */ +void scalar_mult_sum_su3_vector(su3_vector *a, su3_vector *b, radix s){ + +#ifndef NATIVEDOUBLE /* generic build: loop over the three components */ +register int i; + for(i=0;i<3;i++){ + a->c[i].real += s*b->c[i].real; + a->c[i].imag += s*b->c[i].imag; + } + +#else /* RS6000 version: the six real/imag updates are written out one by one, without a loop */ + + register double ss; /* double copy of s, used as the multiplier in every statement below */ + + ss = s; + + a->c[0].real += ss*b->c[0].real; + a->c[0].imag += ss*b->c[0].imag; + a->c[1].real += ss*b->c[1].real; + a->c[1].imag += ss*b->c[1].imag; + a->c[2].real += ss*b->c[2].real; + a->c[2].imag += ss*b->c[2].imag; + +#endif /* NATIVEDOUBLE */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_vec.c new file mode 100644 index
0000000000000000000000000000000000000000..6ec0cf4bcade6f2dfe0c0521828f48f9d4068b73 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_vec.c @@ -0,0 +1,33 @@ +/****************** s_m_vec.c (in su3.a) ****************************** +* * +* void scalar_mult_su3_vector( su3_vector *a, radix s, su3_vector *c) * +* C <- s*A, A and C vectors * +*/ +#include "complex.h" +#include "su3.h" + +/* c <- s*a, vectors: the real and imaginary part of each of the three components of a is multiplied by s and stored in c; a is only read */ +void scalar_mult_su3_vector( su3_vector *a, radix s, su3_vector *c){ + +#ifndef NATIVEDOUBLE /* generic build: loop over the three components; note that the pointer argument c and the struct member c[] share a name */ +register int i; + for(i=0;i<3;i++){ + c->c[i].real = s*a->c[i].real; + c->c[i].imag = s*a->c[i].imag; + } + +#else /* RS6000 version: the six real/imag products are written out one by one, without a loop */ + + register double ss; /* double copy of s, used as the multiplier in every statement below */ + + ss = s; + + c->c[0].real = ss*a->c[0].real; + c->c[0].imag = ss*a->c[0].imag; + c->c[1].real = ss*a->c[1].real; + c->c[1].imag = ss*a->c[1].imag; + c->c[2].real = ss*a->c[2].real; + c->c[2].imag = ss*a->c[2].imag; + +#endif /* NATIVEDOUBLE */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..e2b5c2b8a22552a7a2dad387f2fa4f5066a13ef1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/s_m_wvec.c @@ -0,0 +1,50 @@ +/******************** s_m_wvec.c (in su3.a) ******************** +* +*void scalar_mult_wvec(wilson_vector *src, radix s, wilson_vector *dest) +* Multiply a Wilson vector by a scalar +* dest <- s*src (real and imaginary part of all three components c[0..2] of all four su3_vectors d[0..3]; src is only read) +*/ +#include "complex.h" +#include "su3.h" + +void scalar_mult_wvec( wilson_vector *src, radix s, wilson_vector *dest){ + +#ifndef NATIVEDOUBLE /* generic build: pass each of the four su3_vectors d[i] to scalar_mult_su3_vector */ +register int i; + for(i=0;i<4;i++)scalar_mult_su3_vector( &(src->d[i]), s, &(dest->d[i])); + +#else /* RS6000 version: the 24 real/imag products are written out one by one, without a loop or a call */ + + register double ss; /* double copy of s, used as the multiplier in every statement below */ + ss = s; + /* su3_vector d[0] */ + dest->d[0].c[0].real = ss*src->d[0].c[0].real; + dest->d[0].c[0].imag = ss*src->d[0].c[0].imag; + dest->d[0].c[1].real = ss*src->d[0].c[1].real; + dest->d[0].c[1].imag = ss*src->d[0].c[1].imag; + dest->d[0].c[2].real =
ss*src->d[0].c[2].real; + dest->d[0].c[2].imag = ss*src->d[0].c[2].imag; + /* su3_vector d[1] */ + dest->d[1].c[0].real = ss*src->d[1].c[0].real; + dest->d[1].c[0].imag = ss*src->d[1].c[0].imag; + dest->d[1].c[1].real = ss*src->d[1].c[1].real; + dest->d[1].c[1].imag = ss*src->d[1].c[1].imag; + dest->d[1].c[2].real = ss*src->d[1].c[2].real; + dest->d[1].c[2].imag = ss*src->d[1].c[2].imag; + /* su3_vector d[2] */ + dest->d[2].c[0].real = ss*src->d[2].c[0].real; + dest->d[2].c[0].imag = ss*src->d[2].c[0].imag; + dest->d[2].c[1].real = ss*src->d[2].c[1].real; + dest->d[2].c[1].imag = ss*src->d[2].c[1].imag; + dest->d[2].c[2].real = ss*src->d[2].c[2].real; + dest->d[2].c[2].imag = ss*src->d[2].c[2].imag; + /* su3_vector d[3] */ + dest->d[3].c[0].real = ss*src->d[3].c[0].real; + dest->d[3].c[0].imag = ss*src->d[3].c[0].imag; + dest->d[3].c[1].real = ss*src->d[3].c[1].real; + dest->d[3].c[1].imag = ss*src->d[3].c[1].imag; + dest->d[3].c[2].real = ss*src->d[3].c[2].real; + dest->d[3].c[2].imag = ss*src->d[3].c[2].imag; + +#endif /* NATIVEDOUBLE */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3.h new file mode 100644 index 0000000000000000000000000000000000000000..0596ea46cf335de51d48ff6db545b5f36447fa72 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3.h @@ -0,0 +1,354 @@ +/****************************** su3.h ********************************** +* * +* Defines and subroutine declarations for SU3 simulation * +* MIMD version 3 * +* * +*/ +/* #define radix double takes radix from complex.h */ +typedef struct { complex e[3][3]; } su3_matrix; /* 3x3 array of complex entries */ +typedef struct { complex c[3]; } su3_vector; /* three complex components */ +typedef struct + { complex m01,m02,m12; radix m00im,m11im,m22im; radix space; } anti_hermitmat; /* three complex fields, three radix fields and one further radix field named space; presumably the compressed form used by compress_anti_hermitian and uncompress_anti_hermitian (confirm) */ +typedef struct { su3_vector d[4]; } wilson_vector; /* four su3_vectors, indexed d[0..3] */ +typedef struct { su3_vector h[2]; } half_wilson_vector; /* two su3_vectors, indexed h[0..1] */ +typedef struct { wilson_vector c[3]; } color_wilson_vector; /* three wilson_vectors, indexed c[0..2] */ +typedef struct { color_wilson_vector d[4]; } wilson_matrix; /* four color_wilson_vectors, indexed d[0..3] */ + +typedef struct { radix l[8]; }
adjoint_matrix; + +#define GAMMAFIVE -1 /* some integer which is not a direction */ +#define PLUS 1 /* flags for selecting M or M_adjoint */ +#define MINUS -1 +/* Macros to multiply complex numbers by +-1 and +-i */ +#define TIMESPLUSONE(a,b) { (b).real = (a).real; (b).imag = (a).imag; } +#define TIMESMINUSONE(a,b) { (b).real = -(a).real; (b).imag = -(a).imag; } +#define TIMESPLUSI(a,b) { (b).real = -(a).imag; (b).imag = (a).real; } +#define TIMESMINUSI(a,b) { (b).real = (a).imag; (b).imag = -(a).real; } + + +/* +* ROUTINES FOR SU(3) MATRIX OPERATIONS +* +* void mult_su3_nn( a,b,c ) +* su3_matrix *a,*b,*c; +* matrix multiply, no adjoints +* files "m_mat_nn.c", "m_mat_nn.m4" +* void mult_su3_na( a,b,c ) +* su3_matrix *a,*b,*c; +* matrix multiply, second matrix is adjoint +* files "m_mat_na.c", "m_mat_na.m4" +* void mult_su3_an( a,b,c ) +* su3_matrix *a,*b,*c; +* matrix multiply, first matrix is adjoint +* files "m_mat_an.c", "m_mat_an.m4" +* radix realtrace_su3(a,b) +* su3_matrix *a,*b; (Re(Tr( A_adjoint*B)) ) +* file "realtr.c" +* complex trace_su3(a) +* su3_matrix *a; +* file "trace_su3.c" +* complex complextrace_su3(a,b) +* su3_matrix *a,*b; (Tr( A_adjoint*B)) +* file "complextr.c" +* complex det_su3(a) +* su3_matrix *a; +* file "det_su3.c" +* void add_su3_matrix(a,b,c) +* su3_matrix *a,*b,*c; +* file "addmat.c" +* void sub_su3_matrix(a,b,c) +* su3_matrix *a,*b,*c; +* file "submat.c" +* void scalar_mult_su3_matrix(a,s,b) +* su3_matrix *a,*b; radix s; +* file "s_m_mat.c" +* void scalar_mult_add_su3_matrix(a,b,s,c) +* su3_matrix *a,*b,*c; radix s; +* file "s_m_a_mat.c" +* void scalar_mult_sub_su3_matrix(a,b,s,c) +* su3_matrix *a,*b,*c; radix s; +* file "s_m_s_mat.c" +* void c_scalar_mult_su3mat(m1,phase,m2) +* su3_matrix *m1,*m2; complex *phase; +* file "cs_m_mat.c" +* void c_scalar_mult_add_su3mat(m1,m2,phase,m3) +* su3_matrix *m1,*m2,*m3; complex *phase; +* file "cs_m_a_mat.c" +* void c_scalar_mult_sub_su3mat(m1,m2,phase,m3) +* su3_matrix *m1,*m2,*m3; complex 
*phase; +* file "cs_m_s_mat.c" +* void su3_adjoint(a,b) +* su3_matrix *a,*b; +* file "su3_adjoint.c" +* void make_anti_hermitian(m3,ah3) +* su3_matrix *m3; anti_hermitmat *ah3; +* file "make_ahmat.c" +* void random_anti_hermitian(mat_antihermit,prn_pt) +* anti_hermitmat *mat_antihermit; +* void *prn_pt; (passed through to myrand()) +* file "rand_ahmat.c" +* void uncompress_anti_hermitian(mat_antihermit,mat_su3) +* anti_hermitmat *mat_antihermit; su3_matrix *mat_su3; +* file "uncmp_ahmat.c" +* void compress_anti_hermitian(mat_su3,mat_antihermit) +* anti_hermitmat *mat_antihermit; su3_matrix *mat_su3; +* file "cmp_ahmat.c" +* void su3mat_copy(a,b) +* su3_matrix *a,*b; +* file "su3mat_copy.c" +* +* +* ROUTINES FOR su3_vector OPERATIONS ( 3 COMPONENT COMPLEX ) +* +* void c_scalar_mult_su3vec(v1,phase,v2) +* su3_vector *v1,*v2; complex *phase; +* file "cs_m_vec.c" +* void c_scalar_mult_add_su3vec(v1,phase,v2) +* su3_vector *v1,*v2; complex *phase; +* file "cs_m_a_vec.c" +* void c_scalar_mult_sub_su3vec(v1,phase,v2) +* su3_vector *v1,*v2; complex *phase; +* file "cs_m_s_vec.c" +* void su3_projector(a,b,c) +* su3_vector *a,*b; su3_matrix *c; +* ( outer product of A and B) +* file "su3_proj.c" +* void su3vec_copy(a,b) +* su3_vector *a,*b; +* file "su3vec_copy.c" +* +* void mult_su3_mat_vec( a,b,c ) +* su3_matrix *a; su3_vector *b,*c; +* file "m_matvec.c", "m_matvec.m4" +* void mult_su3_mat_vec_sum( a,b,c ) +* su3_matrix *a; su3_vector *b,*c; +* file "m_matvec_s.c", "m_matvec_s.m4" +* void mult_su3_mat_vec_sum_4dir( a,b0,b1,b2,b3,c ) +* su3_matrix *a; su3_vector *b0,*b1,*b2,*b3,*c; +* file "m_mv_s_4dir.c", "m_mv_s_4dir.m4" +* file "m_mv_s_4di2.m4" is alternate version with pipelined loads. +* Multiply four su3_vectors by elements of an array of su3_matrices, +* sum results. 
+* C <- A[0]*B0 + A[1]*B1 + A[2]*B2 + A[3]*B3 +* void mult_su3_mat_vec_nsum( a,b,c ) +* su3_matrix *a; su3_vector *b,*c; +* file "m_matvec_ns.c" +* void mult_adj_su3_mat_vec( a,b,c ) +* su3_matrix *a; su3_vector *b,*c; +* file "m_amatvec.c", "m_amatvec.m4" +* void mult_adj_su3_mat_vec_4dir( a,b,c ) +* su3_matrix *a; su3_vector *b,*c; +* file "m_amv_4dir.c", "m_amv_4dir.m4" +* file "m_amv_4di2.m4" is alternate version with pipelined loads. +* Multiply an su3_vector by adjoints of elements of an array +* of su3_matrices, results in an array of su3_vectors. +* C[i] <- A_adjoint[i]*B, i = 0,1,2,3 +* void mult_adj_su3_mat_vec_sum( a,b,c ) +* su3_matrix *a; su3_vector *b,*c; +* file "m_amatvec_s.c" +* void mult_adj_su3_mat_vec_nsum( a,b,c ) +* su3_matrix *a; su3_vector *b,*c; +* file "m_amatvec_ns.c" +* void add_su3_vector(a,b,c) +* su3_vector *a,*b,*c; +* file "addvec.c", "addvec.m4" +* void sub_su3_vector(a,b,c) +* su3_vector *a,*b,*c; +* file "subvec.c", "subvec.m4" +* void sub_four_su3_vecs(a,b1,b2,b3,b4) +* su3_vector *a,*b1,*b2,*b3,*b4; +* file "sub4vecs.c", "sub4vecs.m4" +* void scalar_mult_su3_vector(a,s,c) +* su3_vector *a,*c; radix s; +* file "s_m_vec.c" +* void scalar_mult_add_su3_vector(a,b,s,c) +* su3_vector *a,*b,*c; radix s; +* file "s_m_a_vec.c", "s_m_a_vec.m4" +* void scalar_mult_sum_su3_vector(a,b,s) +* su3_vector *a,*b; radix s; +* file "s_m_sum_vec.c", "s_m_sum_vec.m4" +* void scalar_mult_sub_su3_vector(a,b,s,c) +* su3_vector *a,*b,*c; radix s; +* file "s_m_s_vec.c" +* complex su3_dot(a,b) +* su3_vector *a,*b; +* file "su3_dot.c" +* radix su3_rdot(a,b) +* su3_vector *a,*b; +* file "su3_rdot.c", "su3_rdot.m4" +* radix magsq_su3vec(a) +* su3_vector *a; +* file "msq_su3vec.c", "msq_su3vec.m4" +* +* +* MISCELLANEOUS ROUTINES +* +* radix gaussian_rand_no(prn_pt) +* void *prn_pt; ( passed to myrand()) +* file "gaussrand.c" +* +* void dumpmat(m) +* su3_matrix *m; +* file "dumpmat.c" +* void dumpvec(v) +* su3_vector *v; +* file "dumpvec.c" +*/ + +/* Protoed by
K.R */ + + +void mult_su3_nn (su3_matrix *, su3_matrix *, su3_matrix *); +void mult_su3_na (su3_matrix *, su3_matrix *, su3_matrix *); +void mult_su3_an (su3_matrix *, su3_matrix *, su3_matrix *); +radix realtrace_su3(su3_matrix *, su3_matrix *); +complex trace_su3(su3_matrix *); +complex complextrace_su3(su3_matrix *, su3_matrix *); +complex det_su3(su3_matrix *); +void add_su3_matrix(su3_matrix *, su3_matrix *, su3_matrix *); +void sub_su3_matrix(su3_matrix *, su3_matrix *, su3_matrix *); +void su3_adjoint(su3_matrix *, su3_matrix *); +void make_anti_hermitian(su3_matrix *, anti_hermitmat *ah3); +void random_anti_hermitian(anti_hermitmat *mat_antihermit,void *prn_pt); +void uncompress_anti_hermitian(anti_hermitmat *mat_antihermit,su3_matrix *mat_su3); +void compress_anti_hermitian(su3_matrix *mat_su3,anti_hermitmat *mat_antihermit); +void su3mat_copy(su3_matrix *, su3_matrix *); +void clear_su3mat( su3_matrix *dest ); +void clearvec( su3_vector *v ); + +void mult_su3_by_I(su3_matrix *,su3_matrix *); +void scalar_add_su3_matrix(su3_matrix *,radix , su3_matrix *); + +void c_scalar_mult_su3vec (su3_vector *, complex *,su3_vector *); +void c_scalar_mult_sub_su3vec(su3_vector *, complex *,su3_vector *); +void su3_projector(su3_vector *, su3_vector *, su3_matrix *); +void su3vec_copy(su3_vector *, su3_vector *); +void mult_su3_mat_vec(su3_matrix *,su3_vector *, su3_vector *); +void mult_su3_mat_vec_sum(su3_matrix *,su3_vector *, su3_vector *); +void mult_su3_mat_vec_sum_4dir(su3_matrix *,su3_vector *,su3_vector *, + su3_vector *,su3_vector *,su3_vector *); +void mult_su3_mat_vec_nsum(su3_matrix *,su3_vector *, su3_vector *); +void mult_adj_su3_mat_vec(su3_matrix *,su3_vector *, su3_vector *); +void mult_adj_su3_mat_vec_4dir(su3_matrix *,su3_vector *,su3_vector *); +void mult_adj_su3_mat_vec_sum(su3_matrix *,su3_vector *, su3_vector *); +void mult_adj_su3_mat_vec_nsum(su3_matrix *,su3_vector *, su3_vector *); +void add_su3_vector(su3_vector *,su3_vector *,su3_vector *); 
+void sub_su3_vector(su3_vector *,su3_vector *,su3_vector *); +void sub_four_su3_vecs(su3_vector *,su3_vector *,su3_vector *, + su3_vector *,su3_vector *); + +void scalar_mult_su3_vector( su3_vector *src, radix scalar, + su3_vector *dest); +void scalar_mult_add_su3_vector( su3_vector *src1, su3_vector *src2, + radix scalar, su3_vector *dest); +void scalar_mult_sum_su3_vector( su3_vector *src1, su3_vector *src2, + radix scalar); +void scalar_mult_sub_su3_vector( su3_vector *src1, su3_vector *src2, + radix scalar, su3_vector *dest); +void scalar_mult_su3_matrix( su3_matrix *src, radix scalar, + su3_matrix *dest); +void scalar_mult_add_su3_matrix( su3_matrix *src1, su3_matrix *src2, + radix scalar, su3_matrix *dest); +void scalar_mult_sub_su3_matrix( su3_matrix *src1, su3_matrix *src2, + radix scalar, su3_matrix *dest); +void c_scalar_mult_su3mat( su3_matrix *src, complex *scalar, + su3_matrix *dest); +void c_scalar_mult_add_su3mat( su3_matrix *src1, su3_matrix *src2, + complex *scalar, su3_matrix *dest); +void c_scalar_mult_sub_su3mat( su3_matrix *src1, su3_matrix *src2, + complex *scalar, su3_matrix *dest); +void scalar_mult_add_wvec( wilson_vector *src1, wilson_vector *src2, + radix scalar, wilson_vector *dest); +void scalar_mult_addtm_wvec( wilson_vector *src1, wilson_vector *src2, + radix scalar, wilson_vector *dest); +void c_scalar_mult_add_su3vec(su3_vector *v1, complex *phase, su3_vector +*v2); +void c_scalar_mult_add_wvec(wilson_vector *src1, wilson_vector *src2, complex *phase, wilson_vector *dest); + + +/* + * Adjoint Higgs protos + */ + +double act_gauge_adj(su3_matrix *a, su3_matrix *u,adjoint_matrix *b); +void compress_adjmat(su3_matrix *m3,adjoint_matrix *a3); +void uncompress_adjmat(adjoint_matrix *a3,su3_matrix *m3); +void make_adjointmat(su3_matrix *m3,adjoint_matrix *a3); +void add_adjmat(adjoint_matrix *a,adjoint_matrix *b,adjoint_matrix *t); +void adj_scalar_mul(adjoint_matrix *a,double d,adjoint_matrix *t); +void adj_scalar_mul_add(adjoint_matrix 
*a,double d,adjoint_matrix *t); +radix adj_sqr(adjoint_matrix *a); +radix adj_dot(adjoint_matrix *a,adjoint_matrix *b); +void mult_su3_ahiggs( su3_matrix *m, adjoint_matrix *a, adjoint_matrix *r ); +void mult_adj_su3_ahiggs( su3_matrix *m, adjoint_matrix *a, adjoint_matrix *r ); + +/* + * + * Added new ANSI protos -- Kari R. + * first, some Wilson operations + */ + +void mult_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, + wilson_vector *dest); +void mult_su3_mat_hwvec( su3_matrix *mat, half_wilson_vector *src, + half_wilson_vector *dest); +void mult_adj_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, + wilson_vector *dest); +void mult_adj_su3_mat_hwvec( su3_matrix *mat, + half_wilson_vector *src, half_wilson_vector *dest); +void add_wilson_vector( wilson_vector *src1, wilson_vector *src2, + wilson_vector *dest); +void sub_wilson_vector( wilson_vector *src1, wilson_vector *src2, + wilson_vector *dest); +void scalar_mult_wvec( wilson_vector *src, radix s, wilson_vector *dest); +void scalar_mult_hwvec( half_wilson_vector *src, radix s, + half_wilson_vector *dest); +radix magsq_wvec( wilson_vector *src); +complex wvec_dot( wilson_vector *src1, wilson_vector *src2 ); +complex wvec2_dot( wilson_vector *src1, wilson_vector *src2 ); +radix wvec_rdot( wilson_vector *src1, wilson_vector *src2 ); +void su3_projector_w(wilson_vector *a, wilson_vector *b, su3_matrix *c); +void copy_wvec( wilson_vector *src, wilson_vector *dest); +void clear_wvec( wilson_vector *dest); +void wp_shrink( wilson_vector *src, half_wilson_vector *dest,int dir, int sign); +void wp_shrink_4dir(wilson_vector *a,half_wilson_vector *b1, + half_wilson_vector *b2,half_wilson_vector *b3, + half_wilson_vector *b4,int sign); +void wp_grow( half_wilson_vector *src, wilson_vector *dest,int dir, int sign); +void wp_grow_add( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign); +void grow_add_four_wvecs(wilson_vector *a,half_wilson_vector *b1, + half_wilson_vector *b2,half_wilson_vector 
*b3, + half_wilson_vector *b4,int sign,int sum); +void mult_by_gamma( wilson_vector *src, wilson_vector *dest, int dir ); +void mult_by_gamma_left( wilson_matrix *src, wilson_matrix *dest, int dir ); +void mult_by_gamma_right(wilson_matrix *src, wilson_matrix *dest, int dir ); +void dump_wilson_vec( wilson_vector *src); + +/* SOME SU3 PROTOS + * Kari R. + */ + +complex su3_dot(su3_vector *a,su3_vector *b); +radix su3_rdot(su3_vector *a,su3_vector *b); +radix magsq_su3vec(su3_vector *a); + +void reunit_su3( su3_matrix *l ); +void reunitarize( su3_matrix *link[] ); +void random_su3P( su3_matrix *l, int hits ); + + +/* + * MISCELLANEOUS ROUTINES + * Kari R. : wrote protos + */ + +radix gaussian_rand_no( void *); +void dumpmat(su3_matrix *m); +void dumpvec(su3_vector *v); + +int prefetch_matrix(su3_matrix *); +int prefetch_vector(su3_vector *); +int prefetch_adjoint(adjoint_matrix *); + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3.orig.h b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3.orig.h new file mode 100644 index 0000000000000000000000000000000000000000..e5610dfcdd9567649ed3827e2d2f6445b1c05640 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3.orig.h @@ -0,0 +1,446 @@ +/****************************** su3.h ********************************** +* * +* Defines and subroutine declarations for SU3 simulation * +* MIMD version 5 * +* * +*/ +typedef struct { complex e[3][3]; } su3_matrix; +typedef struct { complex c[3]; } su3_vector; +typedef struct + { complex m01,m02,m12; radix m00im,m11im,m22im; radix space; } anti_hermitmat; + +/* e.g. */ +/* wilson_propagator prop; */ +/* prop.c[ci].d[si].d[sf].c[cf] */ +/* -----------------------> complex */ +/* -----------------> su3_vector */ +/* -----------> wilson_vector */ +/* -----> spin_wilson_vector */ +/* e.g. 
*/ +/* wilson_matrix matr; */ +/* matr.d[si].c[ci].d[sf].c[cf] */ +/* -----------------------> complex */ +/* -----------------> su3_vector */ +/* -----------> wilson_vector */ +/* -----> color_wilson_vector */ + +/* Object with two Dirac and two color indices. A given element + of a "wilson_propagator" is accessed by + object.c[color1].d[spin1].d[spin2].c[color2].real , etc. + As alway, "d" denotes a Dirac index and "c" a color index. + "1" refers to the source, "2" to the sink. +*/ + +typedef struct { su3_vector d[4]; } wilson_vector; +typedef struct { su3_vector h[2]; } half_wilson_vector; +typedef struct { wilson_vector c[3]; } color_wilson_vector; +typedef struct { wilson_vector d[4]; } spin_wilson_vector; +typedef struct { color_wilson_vector d[4]; } wilson_matrix; +typedef struct { spin_wilson_vector c[3]; } wilson_propagator; + +#define GAMMAFIVE -1 /* some integer which is not a direction */ +#define PLUS 1 /* flags for selecting M or M_adjoint */ +#define MINUS -1 +/* Macros to multiply complex numbers by +-1 and +-i */ +#define TIMESPLUSONE(a,b) { (b).real = (a).real; (b).imag = (a).imag; } +#define TIMESMINUSONE(a,b) { (b).real = -(a).real; (b).imag = -(a).imag; } +#define TIMESPLUSI(a,b) { (b).real = -(a).imag; (b).imag = (a).real; } +#define TIMESMINUSI(a,b) { (b).real = (a).imag; (b).imag = -(a).real; } + +/* random number routines */ +typedef struct { + /* We assume long is at least 32 bits */ + unsigned long r0,r1,r2,r3,r4,r5,r6; + unsigned long multiplier,addend,ic_state; + radix scale; +} double_prn; +void initialize_prn(double_prn *prn_pt, int seed, int index); +radix myrand( double_prn *prn_pt ); + + +/* +* ROUTINES FOR SU(3) MATRIX OPERATIONS +* +* void mult_su3_nn( su3_matrix *a, su3_matrix *b, su3_matrix *c ) +* matrix multiply, no adjoints +* files "m_mat_nn.c", "m_mat_nn.m4" +* void mult_su3_na( su3_matrix *a, su3_matrix *b, su3_matrix *c ) +* matrix multiply, second matrix is adjoint +* files "m_mat_na.c", "m_mat_na.m4" +* void 
mult_su3_an( su3_matrix *a, su3_matrix *b, su3_matrix *c ) +* matrix multiply, first matrix is adjoint +* files "m_mat_an.c", "m_mat_an.m4" +* radix realtrace_su3( su3_matrix *a, su3_matrix *b ) +* (Re(Tr( A_adjoint*B)) ) +* file "realtr.c" +* complex trace_su3( su3_matrix *a ) +* file "trace_su3.c" +* complex complextrace_su3( su3_matrix *a, su3_matrix *b ) +* (Tr( A_adjoint*B)) +* file "complextr.c" +* complex det_su3( su3_matrix *a ) +* file "det_su3.c" +* void add_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ) +* file "addmat.c" +* void sub_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ) +* file "submat.c" +* void scalar_mult_su3_matrix( su3_matrix *a, radix s, su3_matrix *b ) +* file "s_m_mat.c" +* void scalar_mult_add_su3_matrix( su3_matrix *a, su3_matrix *b, +* radix s, su3_matrix *c) +* file "s_m_a_mat.c" +* void scalar_mult_sub_su3_matrix( su3_matrix *a, su3_matrix *b, +* radix s, su3_matrix *c) +* file "s_m_s_mat.c" +* void c_scalar_mult_su3mat( su3_matrix *src, complex *phase, su3_matrix *dest) +* file "cs_m_mat.c" +* void c_scalar_mult_add_su3mat( su3_matrix *m1, su3_matrix *m2, +* complex *phase, su3_matrix *m3) +* file "cs_m_a_mat.c" +* void c_scalar_mult_sub_su3mat( su3_matrix *m1, su3_matrix *m2, +* complex *phase, su3_matrix *m3) +* file "cs_m_s_mat.c" +* void su3_adjoint( su3_matrix *a, su3_matrix *b ) +* file "su3_adjoint.c" +* void make_anti_hermitian( su3_matrix *m3, anti_hermitmat *ah3 ) +* file "make_ahmat.c" +* void random_anti_hermitian( anti_hermitmat *mat_antihermit, void *prn_pt ) +* (prn_pt passed through to myrand()) +* file "rand_ahmat.c" +* void uncompress_anti_hermitian( anti_hermitmat *mat_anti, su3_matrix *mat ) +* file "uncmp_ahmat.c" +* void compress_anti_hermitian( su3_matrix *mat, anti_hermitmat *mat_anti) +* file "cmp_ahmat.c" +* void clear_su3mat( su3_matrix *dest ); +* file clear_mat.c +* dest <- 0.0 +* void su3mat_copy( su3_matrix *a, su3_matrix *b ) +* file "su3mat_copy.c" +* void dumpmat( su3_matrix 
*m ) +* file "dumpmat.c" +* +* +* ROUTINES FOR su3_vector OPERATIONS ( 3 COMPONENT COMPLEX ) +* +* void su3_projector( su3_vector *a, su3_vector *b, su3_matrix *c ) +* ( outer product of A and B) +* file "su3_proj.c" +* complex su3_dot( su3_vector *a, su3_vector *b ) +* file "su3_dot.c" +* radix su3_rdot( su3_vector *a, su3_vector *b ) +* file "su3_rdot.c", "su3_rdot.m4" +* radix magsq_su3vec( su3_vector *a ) +* file "msq_su3vec.c", "msq_su3vec.m4" +* void su3vec_copy( su3_vector *a, su3_vector *b ) +* file "su3vec_copy.c" +* +* void mult_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ) +* C <- A*B +* file "m_matvec.c", "m_matvec.m4" +* void mult_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ) +* C <- C + A*B +* file "m_matvec_s.c", "m_matvec_s.m4" +* void mult_su3_mat_vec_sum_4dir( su3_matrix *a, su3_vector *b0, +* su3_vector *b1, su3_vector *b2, su3_vector *b3, su3_vector *c ) +* file "m_mv_s_4dir.c", "m_mv_s_4dir.m4" +* file "m_mv_s_4di2.m4" is alternate version with pipelined loads. +* Multiply four su3_vectors by elements of an array of su3_matrices, +* sum results. +* C <- A[0]*B0 + A[1]*B1 + A[2]*B2 + A[3]*B3 +* void mult_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ) +* file "m_matvec_ns.c" +* void mult_adj_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ) +* file "m_amatvec.c", "m_amatvec.m4" +* void mult_adj_su3_mat_vec_4dir( su3_matrix *a, su3_vector *b, su3_vector *c ) +* file "m_amv_4dir.c", "m_amv_4dir.m4" +* file "m_amv_4di2.m4" is alternate version with pipelined loads. +* Multiply an su3_vector by adjoints of elements of an array +* of su3_matrices, results in an array of su3_vectors. 
+* C[i] <- A_adjoint[i]*B, i = 0,1,2,3 +* void mult_adj_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ) +* file "m_amatvec_s.c" +* void mult_adj_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ) +* file "m_amatvec_ns.c" +* void add_su3_vector( su3_vector *a, su3_vector *b, su3_vector *c ) +* file "addvec.c", "addvec.m4" +* void sub_su3_vector( su3_vector *a, su3_vector *b, su3_vector *c ) +* file "subvec.c", "subvec.m4" +* void sub_four_su3_vecs( su3_vector *a, su3_vector *b1, su3_vector *b2, +* su3_vector *b3, su3_vector *b4 ) +* file "sub4vecs.c", "sub4vecs.m4" +* +* void scalar_mult_su3_vector( su3_vector *a, radix s, su3_vector *c ) +* file "s_m_vec.c" +* void scalar_mult_add_su3_vector( su3_vector *a, su3_vector *b, radix s, +* su3_vector *c) +* file "s_m_a_vec.c", "s_m_a_vec.m4" +* void scalar_mult_sum_su3_vector( su3_vector *a, su3_vector *b, radix s ) +* file "s_m_sum_vec.c", "s_m_sum_vec.m4" +* void scalar_mult_sub_su3_vector( su3_vector *a, su3_vector *b, radix s, +* su3_vector *c ) +* file "s_m_s_vec.c" +* void c_scalar_mult_su3vec( su3_vector *src, complex *phase, su3_vector *dest ) +* file "cs_m_vec.c" +* void c_scalar_mult_add_su3vec( su3_vector *v1, complex *phase, su3_vector *v2) +* file "cs_m_a_vec.c" +* void c_scalar_mult_sub_su3vec( su3_vector *v1, complex *phase, su3_vector *v2) +* file "cs_m_s_vec.c" +* void dumpvec( su3_vector *v ) +* file "dumpvec.c" +* void clearvec( su3_vector *v ) +* file "clearvec.c" +* +* ROUTINES FOR WILSON VECTORS +* +* void mult_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, +* wilson_vector *dest ); +* file m_mat_wvec.c +* dest <- mat*src +* void mult_su3_mat_hwvec( su3_matrix *mat, half_wilson_vector *src, +* half_wilson_vector *dest ); +* file m_mat_hwvec.c +* dest <- mat*src +* void mult_adj_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, +* wilson_vector *dest) +* file m_amat_wvec.c +* dest <- mat_adjoint*src +* void mult_adj_su3_mat_hwvec( su3_matrix *mat, +*
half_wilson_vector *src, half_wilson_vector *dest ) +* file m_amat_hwvec.c +* dest <- mat_adjoint*src +* +* void add_wilson_vector( wilson_vector *src1, wilson_vector *src2, +* wilson_vector *dest ); +* file add_wvec.c +* dest <- src1+src2 +* void sub_wilson_vector( wilson_vector *src1, wilson_vector *src2, +* wilson_vector *dest ); +* file sub_wvec.c +* dest <- src1-src2 +* +* void scalar_mult_wvec( wilson_vector *src, radix s, wilson_vector *dest ) +* file s_m_wvec.c +* dest <- s*src +* void scalar_mult_hwvec( half_wilson_vector *src, radix s, +* half_wilson_vector *dest) +* file s_m_hwvec.c +* dest <- s*src +* radix magsq_wvec( wilson_vector *src ); +* file msq_wvec.c +* s <- squared magnitude of src +* complex wvec_dot( wilson_vector *src1, wilson_vector *src2 ); +* file wvec_dot.c +* c <- dot product of src1 and src2 +* complex wvec2_dot( wilson_vector *src1, wilson_vector *src2 ); +* file wvec2_dot.c +* c <- dot product of src1 and src2. Used only in Claude's +* mrilu.c, I don't know what the difference is. DT. +* radix wvec_rdot( wilson_vector *a, wilson_vector *b ) +* wilson_vector *a,*b; +* file "wvec_rdot.c", "wvec_rdot.m4" +* r <- real part of dot product of src1 and src2 +* void scalar_mult_add_wvec( wilson_vector *src1,*src2, radix s, +* wilson_vector *dest) +* file s_m_a_wvec.c +* dest <- src1 + s*src2 +* void scalar_mult_addtm_wvec( wilson_vector *src1,*src2, radix s, +* wilson_vector *dest) +* file s_m_atm_wvec.c +* dest <- -src1 + s*src2 ("atm"="add to minus") +* void c_scalar_mult_add_wvec( wilson_vector *v1, wilson_vector *v2, +* complex *phase, wilson_vector *v3) +* file "cs_m_a_wvec.c" +* void c_scalar_mult_add_wvec2( wilson_vector *v1, wilson_vector *v2, +* complex scalar, wilson_vector *v3) +* file "cs_m_a_wvec2.c" +* differs from previous one: value of scalar, not address is arg.
+* void wp_shrink( wilson_vector *src, half_wilson_vector *dest, +* int dir, int sign ); +* file wp_shrink.c , wp_shrink.m4 +* if(dir = [XYZT]UP) dest <- components of src along eigenvectors +* of gamma_dir with eigenvalue +1 +* if(dir = [XYZT]DOWN) dest <- components of src along eigenvectors +* of gamma_dir with eigenvalue -1 +* if(sign==MINUS)switch roles of +1 and -1 +* void wp_shrink_4dir( wilson_vector *a, half_wilson_vector *b1, +* half_wilson_vector *b2, half_wilson_vector *b3, +* half_wilson_vector *b4, int sign ); +* file wp_shrink4.c wp_shrink4.m4 +* Shrink A in X,Y,Z,T directions respectively, results in B1-B4 +* void wp_grow( half_wilson_vector *src, wilson_vector *dest, +* int dir, int sign ); +* file wp_grow.c , wp_grow.m4 +* if(dir = [XYZT]UP) dest <- components of src times eigenvectors +* of gamma_dir with eigenvalue +1 +* if(dir = [XYZT]DOWN) dest <- components of src times eigenvectors +* of gamma_dir with eigenvalue -1 +* if(sign==MINUS)switch roles of +1 and -1 +* Note: wp_shrink( +-dir) followed by wp_grow( +-dir) amounts to +* multiplication by 1+-gamma_dir, or 1-+gamma_dir if sign=MINUS +* void wp_grow_add( half_wilson_vector *src, wilson_vector *dest, +* int dir, int sign ); +* file wp_grow_a.c , wp_grow_a.m4 +* wp_grow, and add result to previous contents of dest. 
+* void grow_add_four_wvecs( wilson_vector *a, half_wilson_vector *b1, +* half_wilson_vector *b2, half_wilson_vector *b3, +* half_wilson_vector *b4, int sign, int sum ); +* file grow4wvecs.c grow4wvecs.m4 +* If sum==0 +* Grow b1-b4 in X,Y,Z,T directions respectively, sum of results in A +* If sum==1 +* Grow b1-b4 in X,Y,Z,T directions respectively, add to current A +* +* void mult_by_gamma( wilson_vector *src, wilson_vector *dest, int dir ); +* file mb_gamma.c +* dest <- gamma[dir] * src, dir=[XYZT]UP,GAMMAFIVE +* void mult_by_gamma_left( wilson_matrix *src, wilson_matrix *dest, int dir ); +* file mb_gamma_l.c +* dest <- gamma[dir] * src, dir=[XYZT]UP,GAMMAFIVE +* acts on first index of matrix +* void mult_by_gamma_right( wilson_matrix *src, wilson_matrix *dest, int dir ); +* file mb_gamma_r.c +* dest_ij <- gamma[dir]_ik * src_jk, dir=[XYZT]UP,GAMMAFIVE +* acts on second index of matrix +* +* void su3_projector_w( wilson_vector *a, wilson_vector *b, su3_matrix *c ) +* sum over spins of outer product of A.d[s] and B.d[s] - a three +* by three complex matrix +* file "su3_proj_w.c" +* void clear_wvec( wilson_vector *dest ); +* file clear_wvec.c +* dest <- 0.0 +* void copy_wvec( wilson_vector *src, wilson_vector *dest ); +* file copy_wvec.c +* dest <- src +* void dump_wilson_vec( wilson_vector *src ); +* file dump_wvec.c +* print out a wilson vector +* +* MISCELLANEOUS ROUTINES +* +* radix gaussian_rand_no( void *prn_pt ) +* void *prn_pt; ( passed to myrand()) +* file "gaussrand.c" +* +*/ + +int prefetch_matrix(su3_matrix *); +int prefetch_vector(su3_vector *); +int prefetch_adjoint(adjoint_matrix *); + +void mult_su3_nn ( su3_matrix *a, su3_matrix *b, su3_matrix *c ); +void mult_su3_na ( su3_matrix *a, su3_matrix *b, su3_matrix *c ); +void mult_su3_an ( su3_matrix *a, su3_matrix *b, su3_matrix *c ); +radix realtrace_su3( su3_matrix *a, su3_matrix *b ); +complex trace_su3( su3_matrix *a ); +complex complextrace_su3( su3_matrix *a, su3_matrix *b ); +complex det_su3( 
su3_matrix *a ); +void add_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ); +void sub_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ); +void scalar_mult_su3_matrix( su3_matrix *src, radix scalar, su3_matrix *dest); +void scalar_mult_add_su3_matrix( su3_matrix *src1, su3_matrix *src2, + radix scalar, su3_matrix *dest); +void scalar_mult_sub_su3_matrix( su3_matrix *src1, su3_matrix *src2, + radix scalar, su3_matrix *dest); +void c_scalar_mult_su3mat( su3_matrix *src, complex *scalar, + su3_matrix *dest); +void c_scalar_mult_add_su3mat( su3_matrix *src1, su3_matrix *src2, + complex *scalar, su3_matrix *dest); +void c_scalar_mult_sub_su3mat( su3_matrix *src1, su3_matrix *src2, + complex *scalar, su3_matrix *dest); +void su3_adjoint( su3_matrix *a, su3_matrix *b ); +void make_anti_hermitian( su3_matrix *m3, anti_hermitmat *ah3 ); +void random_anti_hermitian( anti_hermitmat *mat_antihermit, void *prn_pt ); +void uncompress_anti_hermitian( anti_hermitmat *mat_anti, su3_matrix *mat ); +void compress_anti_hermitian( su3_matrix *mat, anti_hermitmat *mat_anti); +void clear_su3mat( su3_matrix *dest ); +void su3mat_copy( su3_matrix *a, su3_matrix *b ); +void dumpmat( su3_matrix *m ); + +void su3_projector( su3_vector *a, su3_vector *b, su3_matrix *c ); +complex su3_dot( su3_vector *a, su3_vector *b ); +radix su3_rdot( su3_vector *a, su3_vector *b ); +radix magsq_su3vec( su3_vector *a ); +void su3vec_copy( su3_vector *a, su3_vector *b ); +void dumpvec( su3_vector *v ); +void clearvec( su3_vector *v ); + +void mult_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ); +void mult_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ); +void mult_su3_mat_vec_sum_4dir( su3_matrix *a, su3_vector *b0, + su3_vector *b1, su3_vector *b2, su3_vector *b3, su3_vector *c ); +void mult_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ); +void mult_adj_su3_mat_vec( su3_matrix *a, su3_vector *b, su3_vector *c ); +void mult_adj_su3_mat_vec_4dir( 
su3_matrix *a, su3_vector *b, su3_vector *c ); +void mult_adj_su3_mat_vec_sum( su3_matrix *a, su3_vector *b, su3_vector *c ); +void mult_adj_su3_mat_vec_nsum( su3_matrix *a, su3_vector *b, su3_vector *c ); + +void add_su3_vector( su3_vector *a, su3_vector *b, su3_vector *c ); +void sub_su3_vector( su3_vector *a, su3_vector *b, su3_vector *c ); +void sub_four_su3_vecs( su3_vector *a, su3_vector *b1, su3_vector *b2, + su3_vector *b3, su3_vector *b4 ); + +void scalar_mult_su3_vector( su3_vector *src, radix scalar, + su3_vector *dest); +void scalar_mult_add_su3_vector( su3_vector *src1, su3_vector *src2, + radix scalar, su3_vector *dest); +void scalar_mult_sum_su3_vector( su3_vector *src1, su3_vector *src2, + radix scalar); +void scalar_mult_sub_su3_vector( su3_vector *src1, su3_vector *src2, + radix scalar, su3_vector *dest); +void scalar_mult_wvec( wilson_vector *src, radix s, wilson_vector *dest ); +void scalar_mult_hwvec( half_wilson_vector *src, radix s, + half_wilson_vector *dest ); +void scalar_mult_add_wvec( wilson_vector *src1, wilson_vector *src2, + radix scalar, wilson_vector *dest ); +void scalar_mult_addtm_wvec( wilson_vector *src1, wilson_vector *src2, + radix scalar, wilson_vector *dest ); +void c_scalar_mult_add_wvec(wilson_vector *src1, wilson_vector *src2, + complex *phase, wilson_vector *dest ); +void c_scalar_mult_add_wvec2(wilson_vector *src1, wilson_vector *src2, + complex s, wilson_vector *dest ); +void c_scalar_mult_su3vec( su3_vector *src, complex *phase, su3_vector *dest ); +void c_scalar_mult_add_su3vec(su3_vector *v1, complex *phase, su3_vector *v2); +void c_scalar_mult_sub_su3vec(su3_vector *v1, complex *phase, su3_vector *v2); + +void mult_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, + wilson_vector *dest ); +void mult_su3_mat_hwvec( su3_matrix *mat, half_wilson_vector *src, + half_wilson_vector *dest ); +void mult_adj_mat_wilson_vec( su3_matrix *mat, wilson_vector *src, + wilson_vector *dest); +void mult_adj_su3_mat_hwvec( 
su3_matrix *mat, half_wilson_vector *src, + half_wilson_vector *dest ); + +void add_wilson_vector( wilson_vector *src1, wilson_vector *src2, + wilson_vector *dest ); +void sub_wilson_vector( wilson_vector *src1, wilson_vector *src2, + wilson_vector *dest ); +radix magsq_wvec( wilson_vector *src ); +complex wvec_dot( wilson_vector *src1, wilson_vector *src2 ); +complex wvec2_dot( wilson_vector *src1, wilson_vector *src2 ); +radix wvec_rdot( wilson_vector *a, wilson_vector *b ); + +void wp_shrink( wilson_vector *src, half_wilson_vector *dest, + int dir, int sign ); +void wp_shrink_4dir( wilson_vector *a, half_wilson_vector *b1, + half_wilson_vector *b2, half_wilson_vector *b3, + half_wilson_vector *b4, int sign ); +void wp_grow( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign ); +void wp_grow_add( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign ); +void grow_add_four_wvecs( wilson_vector *a, half_wilson_vector *b1, + half_wilson_vector *b2, half_wilson_vector *b3, + half_wilson_vector *b4, int sign, int sum ); +void mult_by_gamma( wilson_vector *src, wilson_vector *dest, int dir ); +void mult_by_gamma_left( wilson_matrix *src, wilson_matrix *dest, int dir ); +void mult_by_gamma_right( wilson_matrix *src, wilson_matrix *dest, int dir ); + +void su3_projector_w( wilson_vector *a, wilson_vector *b, su3_matrix *c ); +void clear_wvec( wilson_vector *dest ); +void copy_wvec( wilson_vector *src, wilson_vector *dest ); +void dump_wilson_vec( wilson_vector *src ); + +radix gaussian_rand_no( void *prn_pt ); + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_adjoint.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_adjoint.c new file mode 100644 index 0000000000000000000000000000000000000000..c66a07729300be479fa19769734bb3e617bf54c0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_adjoint.c @@ -0,0 +1,15 @@ +/****************** su3_adjoint.c (in su3.a) ************************** +* * 
+* void su3_adjoint( su3_matrix *a, su3_matrix *b ) * +* B <- A_adjoint, adjoint of an SU3 matrix * +*/ +#include "complex.h" +#include "su3.h" + +/* Adjoint of an SU3 matrix: entry [j][i] of a is passed through CONJG into entry [i][j] of b */ +void su3_adjoint( su3_matrix *a, su3_matrix *b ){ +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ /* visit all nine entries */ + CONJG( a->e[j][i], b->e[i][j] ); /* swapped indices: this is the transpose step */ + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_dot.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_dot.c new file mode 100644 index 0000000000000000000000000000000000000000..7e95ade7774072a323c09a09f530dfc478f15f99 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_dot.c @@ -0,0 +1,45 @@ +/****************** su3_dot.c (in su3.a) ****************************** +* * +* complex su3_dot( su3_vector *a, su3_vector *b ) * +* return dot product of two su3_vectors: sum over k of conj(a->c[k]) times b->c[k] * +*/ +#include "complex.h" +#include "su3.h" + +complex su3_dot( su3_vector *a, su3_vector *b ){ + +#ifndef NATIVEDOUBLE /* generic version: three CMULJ_ products accumulated with CSUM (macros not defined in this file) */ +complex temp1,temp2; + CMULJ_(a->c[0],b->c[0],temp1) + CMULJ_(a->c[1],b->c[1],temp2) + CSUM(temp1,temp2); + CMULJ_(a->c[2],b->c[2],temp2) + CSUM(temp1,temp2); + return(temp1); + +#else /* RS6000 version */ + + register double ar,ai,br,bi,cr,ci; + register complex cc; + + ar=a->c[0].real; ai=a->c[0].imag; + br=b->c[0].real; bi=b->c[0].imag; + cr = ar*br + ai*bi; /* real part of conj(a_0)*b_0 */ + ci = ar*bi - ai*br; /* imaginary part of conj(a_0)*b_0 */ + + ar=a->c[1].real; ai=a->c[1].imag; + br=b->c[1].real; bi=b->c[1].imag; + cr += ar*br + ai*bi; /* add the k=1 term */ + ci += ar*bi - ai*br; + + ar=a->c[2].real; ai=a->c[2].imag; + br=b->c[2].real; bi=b->c[2].imag; + cr += ar*br + ai*bi; /* add the k=2 term */ + ci += ar*bi - ai*br; + + cc.real = cr; + cc.imag = ci; + return(cc); + +#endif /* NATIVEDOUBLE */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_proj.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_proj.c new file mode 100644 index 0000000000000000000000000000000000000000..31f2aba29ed53f10685dc9a80bcea381bba9ced6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_proj.c @@ 
-0,0 +1,50 @@ +/***************** su3_proj.c (in su3.a) ****************************** +* * +* void su3_projector( su3_vector *a, su3_vector *b, su3_matrix *c ) * +* C <- outer product of A and B * +* C_ij = A_i * B_adjoint_j * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST /* generic version built on the CMUL_J macro */ +void su3_projector( su3_vector *a, su3_vector *b, su3_matrix *c ){ +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + CMUL_J( a->c[i], b->c[j], c->e[i][j] ); + } +} + +#else +#ifdef NATIVEDOUBLE /* RS6000 version */ + +void su3_projector( su3_vector *a, su3_vector *b, su3_matrix *c ){ + + register int i,j; + register double ar,ai,br,bi; + + for(i=0;i<3;i++){ + ar=a->c[i].real; ai=a->c[i].imag; /* a_i is loaded once per row */ + for(j=0;j<3;j++){ + br=b->c[j].real; bi=b->c[j].imag; + c->e[i][j].real = ar*br + ai*bi; /* real part of a_i * conj(b_j) */ + c->e[i][j].imag = ai*br - ar*bi; /* imaginary part of a_i * conj(b_j) */ + } + } +} +#else /* FAST defined, NATIVEDOUBLE not defined */ + +void su3_projector( su3_vector *a, su3_vector *b, su3_matrix *c ){ +register int i,j; +register radix tmp,tmp2; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + tmp2 = a->c[i].real * b->c[j].real; + tmp = a->c[i].imag * b->c[j].imag; + c->e[i][j].real = tmp + tmp2; /* same real part as the NATIVEDOUBLE branch */ + tmp2 = a->c[i].real * b->c[j].imag; + tmp = a->c[i].imag * b->c[j].real; + c->e[i][j].imag = tmp - tmp2; /* same imaginary part as the NATIVEDOUBLE branch */ + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* end ifdef FAST */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_proj_w.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_proj_w.c new file mode 100644 index 0000000000000000000000000000000000000000..3a4bb0f5b6c13c2db3ac5840e0b10269d6af36fc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_proj_w.c @@ -0,0 +1,72 @@ +/***************** su3_proj_w.c (in su3.a) ****************************** +* * +* void su3_projector_w( wilson_vector *a, wilson_vector *b, su3_matrix *c ) +* C <- sum over spins k of outer product of A.d[k] and B.d[k] * +* C_ij = sum_k( A.d[k]_i * B.d[k]_adjoint_j ) * +*/ +#include "complex.h" +#include "su3.h" + +#ifndef FAST /* generic version built on the CMUL_J and CSUM macros */ +void su3_projector_w( wilson_vector 
*a, wilson_vector *b, su3_matrix *c ){ +register int i,j,k; +register complex cc; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + c->e[i][j] = cmplx(0.0,0.0); /* start each entry from zero */ + for(k=0;k<4;k++){ /* accumulate over the four d[k] components */ + CMUL_J( a->d[k].c[i], b->d[k].c[j], cc ); CSUM( c->e[i][j], cc ); + } + } +} + +#else +#ifdef NATIVEDOUBLE /* RS6000 version */ + +void su3_projector_w( wilson_vector *a, wilson_vector *b, su3_matrix *c ){ + register int i,j; + register double ar,ai,br,bi,cr,ci; + + for(i=0;i<3;i++)for(j=0;j<3;j++){ /* the sum over d[0]..d[3] is written out by hand below */ + ar=a->d[0].c[i].real; ai=a->d[0].c[i].imag; + br=b->d[0].c[j].real; bi=b->d[0].c[j].imag; + cr = ar*br + ai*bi; /* d[0] term initialises the sums */ + ci = ai*br - ar*bi; + + ar=a->d[1].c[i].real; ai=a->d[1].c[i].imag; + br=b->d[1].c[j].real; bi=b->d[1].c[j].imag; + cr += ar*br + ai*bi; + ci += ai*br - ar*bi; + + ar=a->d[2].c[i].real; ai=a->d[2].c[i].imag; + br=b->d[2].c[j].real; bi=b->d[2].c[j].imag; + cr += ar*br + ai*bi; + ci += ai*br - ar*bi; + + ar=a->d[3].c[i].real; ai=a->d[3].c[i].imag; + br=b->d[3].c[j].real; bi=b->d[3].c[j].imag; + cr += ar*br + ai*bi; + ci += ai*br - ar*bi; + + c->e[i][j].real = cr; + c->e[i][j].imag = ci; + } +} +#else /* FAST defined, NATIVEDOUBLE not defined */ +void su3_projector_w( wilson_vector *a, wilson_vector *b, su3_matrix *c ){ +register int i,j,k; +register radix tmp_r,tmp_i,tmp2; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + tmp_r = tmp_i = 0.0; + for(k=0;k<4;k++){ /* real part gains re*re + im*im, imaginary part gains im*re - re*im */ + tmp2 = a->d[k].c[i].real * b->d[k].c[j].real; tmp_r = tmp_r + tmp2; + tmp2 = a->d[k].c[i].imag * b->d[k].c[j].imag; tmp_r = tmp_r + tmp2; + tmp2 = a->d[k].c[i].imag * b->d[k].c[j].real; tmp_i = tmp_i + tmp2; + tmp2 = a->d[k].c[i].real * b->d[k].c[j].imag; tmp_i = tmp_i - tmp2; + } + + c->e[i][j].real = tmp_r; + c->e[i][j].imag = tmp_i; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* end ifdef FAST */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_rdot.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_rdot.c new file mode 100644 index 0000000000000000000000000000000000000000..cdacd825d94456113daf3fbeb341e0b5a9120cd9 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3_rdot.c @@ -0,0 +1,40 @@ +/***************** su3_rdot.c (in su3.a) ****************************** +* * +* radix su3_rdot( su3_vector *a, su3_vector *b ) * +* return real part of dot product of two su3_vectors: sum over k of (a_k.real*b_k.real + a_k.imag*b_k.imag) * +*/ +#include "complex.h" +#include "su3.h" + +radix su3_rdot( su3_vector *a, su3_vector *b ){ + +#ifndef NATIVEDOUBLE /* generic version, accumulates in type radix */ +register radix temp1,temp2; + temp2 = a->c[0].real * b->c[0].real; /* temp2 is the running sum, temp1 the current product */ + temp1 = a->c[0].imag * b->c[0].imag; temp2 += temp1; + temp1 = a->c[1].real * b->c[1].real; temp2 += temp1; + temp1 = a->c[1].imag * b->c[1].imag; temp2 += temp1; + temp1 = a->c[2].real * b->c[2].real; temp2 += temp1; + temp1 = a->c[2].imag * b->c[2].imag; temp2 += temp1; + return(temp2); + +#else /* RS6000 version */ + + register double ar,ai,br,bi,ss; /* ss accumulates in double regardless of radix */ + + ar=a->c[0].real; ai=a->c[0].imag; + br=b->c[0].real; bi=b->c[0].imag; + ss = ar*br + ai*bi; + + ar=a->c[1].real; ai=a->c[1].imag; + br=b->c[1].real; bi=b->c[1].imag; + ss += ar*br + ai*bi; + + ar=a->c[2].real; ai=a->c[2].imag; + br=b->c[2].real; bi=b->c[2].imag; + ss += ar*br + ai*bi; + + return(ss); + +#endif /* NATIVEDOUBLE */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3mat_copy.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3mat_copy.c new file mode 100644 index 0000000000000000000000000000000000000000..fb4c14c62175afd17403d9a0307f5a6699bc753d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3mat_copy.c @@ -0,0 +1,16 @@ +/***************** su3mat_copy.c (in su3.a) *************************** +* * +* void su3mat_copy( su3_matrix *a, su3_matrix *b ) * +* Copy an su3 matrix: B <- A * +*/ +#include "complex.h" +#include "su3.h" + +/* Copy an su3 matrix: b <- a, real and imaginary part of each of the 3x3 entries */ +void su3mat_copy( su3_matrix *a, su3_matrix *b ){ +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + b->e[i][j].real = a->e[i][j].real; + b->e[i][j].imag = a->e[i][j].imag; + } +} diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3vec_copy.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3vec_copy.c new file mode 100644 index 0000000000000000000000000000000000000000..86d08e340387b7b058377ce78096f5fbe4f672b9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/su3vec_copy.c @@ -0,0 +1,16 @@ +/***************** su3vec_copy.c (in su3.a) *************************** +* * +* void su3vec_copy( su3_vector *a, su3_vector *b ) * +* Copy an su3 vector: B <- A * +*/ +#include "complex.h" +#include "su3.h" + +/* Copy an su3 vector: b <- a, real and imaginary part of each of the three components */ +void su3vec_copy( su3_vector *a, su3_vector *b ){ +register int i; + for(i=0;i<3;i++){ + b->c[i].real = a->c[i].real; + b->c[i].imag = a->c[i].imag; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/sub4vecs.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/sub4vecs.c new file mode 100644 index 0000000000000000000000000000000000000000..121c42657d654967d2c454f1de1f075ba51c584f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/sub4vecs.c @@ -0,0 +1,38 @@ +/***************** sub4vecs.c (in su3.a) ****************************** +* * +* Subtract four su3_vectors from an su3_vector * +* void sub_four_su3_vecs( su3_vector *a, su3_vector *b1, su3_vector *b2, su3_vector *b3, su3_vector *b4 ) * +* A <- A - B1 - B2 - B3 - B4 (A is updated in place) * +*/ +#include "complex.h" +#include "su3.h" + +/* subtract four su3 vectors from a, in place */ +#ifndef FAST /* loop version */ +void sub_four_su3_vecs( su3_vector *a, su3_vector *b1, su3_vector *b2, + su3_vector *b3, su3_vector *b4 ){ +register int i; + for(i=0;i<3;i++){ + CSUB( a->c[i], b1->c[i], a->c[i] ); /* a->c[i] is both an input and the result of every CSUB */ + CSUB( a->c[i], b2->c[i], a->c[i] ); + CSUB( a->c[i], b3->c[i], a->c[i] ); + CSUB( a->c[i], b4->c[i], a->c[i] ); + } +} +#else /* FAST: same twelve CSUB calls, written out without the loop and ordered by b-vector */ +void sub_four_su3_vecs( su3_vector *a, su3_vector *b1, su3_vector *b2, + su3_vector *b3, su3_vector *b4 ){ + CSUB( a->c[0], b1->c[0], a->c[0] ); + CSUB( a->c[1], b1->c[1], a->c[1] ); + CSUB( a->c[2], b1->c[2], a->c[2] ); + CSUB( a->c[0], b2->c[0], a->c[0] ); + CSUB( 
a->c[1], b2->c[1], a->c[1] ); + CSUB( a->c[2], b2->c[2], a->c[2] ); + CSUB( a->c[0], b3->c[0], a->c[0] ); + CSUB( a->c[1], b3->c[1], a->c[1] ); + CSUB( a->c[2], b3->c[2], a->c[2] ); + CSUB( a->c[0], b4->c[0], a->c[0] ); + CSUB( a->c[1], b4->c[1], a->c[1] ); + CSUB( a->c[2], b4->c[2], a->c[2] ); +} +#endif /* FAST */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/sub_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/sub_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..1ecd3e4289a1c986c219a6f1c9e476876fe2d151 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/sub_wvec.c @@ -0,0 +1,14 @@ +/******************** sub_wvec.c (in su3.a) ******************** +* +*void sub_wilson_vector( wilson_vector *src1, wilson_vector *src2, wilson_vector *dest ) +* subtract two Wilson vectors +* dest <- src1 - src2 +*/ +#include "complex.h" +#include "su3.h" + +void sub_wilson_vector( wilson_vector *src1, wilson_vector *src2, + wilson_vector *dest ){ + register int i; + for(i=0;i<4;i++)sub_su3_vector( &(src1->d[i]), &(src2->d[i]), &(dest->d[i])); /* one su3_vector subtraction per d[] entry */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/submat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/submat.c new file mode 100644 index 0000000000000000000000000000000000000000..94c52b4e2d32fcd5246bfb5a6c7a0ffdcaaf6fa7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/submat.c @@ -0,0 +1,15 @@ +/******************* submat.c (in su3.a) ****************************** +* * +* void sub_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ) * +* subtract su3 matrices: C <- A - B * +*/ +#include "complex.h" +#include "su3.h" + +/* subtract su3 matrices entry by entry: c <- a - b */ +void sub_su3_matrix( su3_matrix *a, su3_matrix *b, su3_matrix *c ) { +register int i,j; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + CSUB( a->e[i][j], b->e[i][j], c->e[i][j] ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/subvec.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/subvec.c new file mode 
100644 index 0000000000000000000000000000000000000000..ef5456bba2fc211455307abe142c160c27d2d54c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/subvec.c @@ -0,0 +1,15 @@ +/********************* subvec.c (in su3.a) **************************** +* * +* void sub_su3_vector( su3_vector *a, su3_vector *b, su3_vector *c ) * +* subtract su3 vectors: C <- A - B * +*/ +#include "complex.h" +#include "su3.h" + +/* subtract su3 vectors component by component: c <- a - b */ +void sub_su3_vector( su3_vector *a, su3_vector *b, su3_vector *c ){ +register int i; + for(i=0;i<3;i++){ + CSUB( a->c[i], b->c[i], c->c[i] ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/trace_su3.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/trace_su3.c new file mode 100644 index 0000000000000000000000000000000000000000..e655a5d09a2976949b8472cbc91f3c1b7bb6d102 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/trace_su3.c @@ -0,0 +1,15 @@ +/******************* trace_su3.c (in su3.a) *************************** +* * +* complex trace_su3( su3_matrix *a ) * +* return complex trace of an SU3 matrix * +*/ +#include "complex.h" +#include "su3.h" + +/* Complex trace of an SU3 matrix: sum of the three diagonal entries */ +complex trace_su3( su3_matrix *a ) { +register complex t1,t2; + CADD(a->e[0][0],a->e[1][1],t1); /* t1 = first two diagonal entries */ + CADD(t1,a->e[2][2],t2); /* t2 = t1 plus the third diagonal entry */ + return(t2); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/uncmp_ahmat.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/uncmp_ahmat.c new file mode 100644 index 0000000000000000000000000000000000000000..1205508fc4836cc5ba93592b87e6465fd31c4acf --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/uncmp_ahmat.c @@ -0,0 +1,35 @@ +/************ uncmp_ahmat.c (in su3.a) ******************************** +* * +* void uncompress_anti_hermitian( anti_hermitmat *mat_antihermit, * +* su3_matrix *mat_su3 ) * +* uncompresses an anti_hermitian matrix to make a 3x3 complex matrix * +*/ +#include "complex.h" +#include "su3.h" + +void 
uncompress_anti_hermitian( anti_hermitmat *mat_antihermit, + su3_matrix *mat_su3 ) { +/* uncompresses an anti_hermitian su3 matrix: fills all nine entries of mat_su3 from the stored diagonal imaginary parts and upper-triangle entries */ + radix temp1; + mat_su3->e[0][0].imag=mat_antihermit->m00im; /* diagonal entries are purely imaginary */ + mat_su3->e[0][0].real=0.; + mat_su3->e[1][1].imag=mat_antihermit->m11im; + mat_su3->e[1][1].real=0.; + mat_su3->e[2][2].imag=mat_antihermit->m22im; + mat_su3->e[2][2].real=0.; + mat_su3->e[0][1].imag=mat_antihermit->m01.imag; /* upper-triangle entry [0][1] is copied as stored */ + temp1=mat_antihermit->m01.real; + mat_su3->e[0][1].real=temp1; + mat_su3->e[1][0].real= -temp1; /* mirrored entry: real part negated, imaginary part kept */ + mat_su3->e[1][0].imag=mat_antihermit->m01.imag; + mat_su3->e[0][2].imag=mat_antihermit->m02.imag; /* same pattern for the [0][2] / [2][0] pair */ + temp1=mat_antihermit->m02.real; + mat_su3->e[0][2].real=temp1; + mat_su3->e[2][0].real= -temp1; + mat_su3->e[2][0].imag=mat_antihermit->m02.imag; + mat_su3->e[1][2].imag=mat_antihermit->m12.imag; /* same pattern for the [1][2] / [2][1] pair */ + temp1=mat_antihermit->m12.real; + mat_su3->e[1][2].real=temp1; + mat_su3->e[2][1].real= -temp1; + mat_su3->e[2][1].imag=mat_antihermit->m12.imag; +}/*uncompress_anti_hermitian*/ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_grow.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_grow.c new file mode 100644 index 0000000000000000000000000000000000000000..cfe5f1ca3ed6b58c23f46c630fb9474b65b82764 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_grow.c @@ -0,0 +1,159 @@ +/***************** wp_grow.c (in su3.a) **************************/ +/* + Expand the "Wilson projection" of a Wilson fermion vector. + (1 +- gamma_j) is a projection operator, and we are given a + half_wilson_vector which contains the two components of a Wilson + vector projected out. This routine reexpands it to a four component + object. + + usage: wp_grow( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign ); + + If dir is one of XUP,YUP,ZUP or TUP, the projection is + along the eigenvectors with eigenvalue +1, which survive + multiplication by (1+gamma[dir]). 
+ If dir is one of XDOWN,YDOWN,ZDOWN or TDOWN, the projection is + along the eigenvectors with eigenvalue -1, which survive + multiplication by (1-gamma[OPP_DIR(dir)]). + If sign=MINUS reverse the roles of +1 and -1 - in other words + use -gamma_dir instead of gamma_dir + + Here my eigenvectors are normalized to 2, so for XYZT directions + I won't explicitly multiply by 2. In other words, the matrix of + eigenvectors is sqrt(2) times a unitary matrix, and in reexpanding + the vector I will multiply by the adjoint of this matrix. + + For UP directions, hvec.h[0] and hvec.h[1] contain the projections + along the first and second eigenvectors respectively. + For DOWN directions, hvec.h[0] and hvec.h[1] contain the projections + along the third and fourth eigenvectors respectively. This results + in down directions differing from up directions only in the sign of + the addition. + + Note: wp_shrink( +-dir) followed by wp_grow( +-dir) amounts to multiplication + by 1+-gamma_dir + + gamma(XUP) eigenvectors eigenvalue + 0 0 0 i ( 1, 0, 0,-i) +1 + 0 0 i 0 ( 0, 1,-i, 0) +1 + 0 -i 0 0 ( 1, 0, 0,+i) -1 + -i 0 0 0 ( 0, 1,+i, 0) -1 + + gamma(YUP) eigenvectors eigenvalue + 0 0 0 -1 ( 1, 0, 0,-1) +1 + 0 0 1 0 ( 0, 1, 1, 0) +1 + 0 1 0 0 ( 1, 0, 0, 1) -1 + -1 0 0 0 ( 0, 1,-1, 0) -1 + + gamma(ZUP) eigenvectors eigenvalue + 0 0 i 0 ( 1, 0,-i, 0) +1 + 0 0 0 -i ( 0, 1, 0,+i) +1 + -i 0 0 0 ( 1, 0,+i, 0) -1 + 0 i 0 0 ( 0, 1, 0,-i) -1 + + gamma(TUP) eigenvectors eigenvalue + 0 0 1 0 ( 1, 0, 1, 0) +1 + 0 0 0 1 ( 0, 1, 0, 1) +1 + 1 0 0 0 ( 1, 0,-1, 0) -1 + 0 1 0 0 ( 0, 1, 0,-1) -1 + + gamma(FIVE) eigenvectors eigenvalue + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 +*/ +#include <stdio.h> +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. 
*/ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +void wp_grow( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign ){ + register int i; /*color*/ + + if(sign==MINUS)dir=OPP_DIR(dir); /* two ways to get -gamma_dir ! */ /* MINUS is not defined in this file; presumably supplied by su3.h - confirm */ + switch(dir){ /* every case copies h[0],h[1] into d[0],d[1]; the cases differ only in how d[2],d[3] are filled */ + case XUP: + for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + TIMESMINUSI( src->h[0].c[i], dest->d[3].c[i]); + TIMESMINUSI( src->h[1].c[i], dest->d[2].c[i]); + } + break; + case XDOWN: + for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + TIMESPLUSI( src->h[0].c[i], dest->d[3].c[i]); + TIMESPLUSI( src->h[1].c[i], dest->d[2].c[i]); + } + break; + case YUP: + for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + TIMESMINUSONE( src->h[0].c[i], dest->d[3].c[i] ); + TIMESPLUSONE( src->h[1].c[i], dest->d[2].c[i] ); + } + break; + case YDOWN: + for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + TIMESPLUSONE( src->h[0].c[i], dest->d[3].c[i] ); + TIMESMINUSONE( src->h[1].c[i], dest->d[2].c[i] ); + } + break; + case ZUP: + for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + TIMESMINUSI( src->h[0].c[i], dest->d[2].c[i] ); + TIMESPLUSI( src->h[1].c[i], dest->d[3].c[i] ); + } + break; + case ZDOWN: + for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + TIMESPLUSI( src->h[0].c[i], dest->d[2].c[i] ); + TIMESMINUSI( src->h[1].c[i], dest->d[3].c[i] ); + } + break; + case TUP: + for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + dest->d[2].c[i] = src->h[0].c[i]; + dest->d[3].c[i] = src->h[1].c[i]; + } + break; + case TDOWN: + 
for(i=0;i<3;i++){ + dest->d[0].c[i] = src->h[0].c[i]; + dest->d[1].c[i] = src->h[1].c[i]; + TIMESMINUSONE( src->h[0].c[i], dest->d[2].c[i] ); + TIMESMINUSONE( src->h[1].c[i], dest->d[3].c[i] ); + } + break; + default: /* dir outside 0..7: report it, dest is left unmodified */ + printf("BAD CALL TO WP_GROW()\n"); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_grow_a.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_grow_a.c new file mode 100644 index 0000000000000000000000000000000000000000..bec3d9e5dee0e2c161b31d0ba1e737703a9708c1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_grow_a.c @@ -0,0 +1,162 @@ +/***************** wp_grow_a.c (in su3.a) **************************/ +/* + Expand the "Wilson projection" of a Wilson fermion vector. + (1 +- gamma_j) is a projection operator, and we are given a + half_wilson_vector which contains the two components of a Wilson + vector projected out. This routine reexpands it to a four component + object and adds it to another Wilson vector. + + usage: wp_grow_add( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign ); + + If dir is one of XUP,YUP,ZUP or TUP, the projection is + along the eigenvectors with eigenvalue +1, which survive + multiplication by (1+gamma[dir]). + If dir is one of XDOWN,YDOWN,ZDOWN or TDOWN, the projection is + along the eigenvectors with eigenvalue -1, which survive + multiplication by (1-gamma[OPP_DIR(dir)]). + If sign=MINUS reverse the roles of +1 and -1 - in other words + use -gamma_dir instead of gamma_dir + + Here my eigenvectors are normalized to 2, so for XYZT directions + I won't explicitly multiply by 2. In other words, the matrix of + eigenvectors is sqrt(2) times a unitary matrix, and in reexpanding + the vector I will multiply by the adjoint of this matrix. + + For UP directions, hvec.h[0] and hvec.h[1] contain the projections + along the first and second eigenvectors respectively. 
+ For DOWN directions, hvec.h[0] and hvec.h[1] contain the projections + along the third and fourth eigenvectors respectively. This results + in down directions differing from up directions only in the sign of + the addition. + + Note: wp_shrink( +-dir) followed by wp_grow( +-dir) amounts to multiplication + by 1+-gamma_dir + + gamma(XUP) eigenvectors eigenvalue + 0 0 0 i ( 1, 0, 0,-i) +1 + 0 0 i 0 ( 0, 1,-i, 0) +1 + 0 -i 0 0 ( 1, 0, 0,+i) -1 + -i 0 0 0 ( 0, 1,+i, 0) -1 + + gamma(YUP) eigenvectors eigenvalue + 0 0 0 -1 ( 1, 0, 0,-1) +1 + 0 0 1 0 ( 0, 1, 1, 0) +1 + 0 1 0 0 ( 1, 0, 0, 1) -1 + -1 0 0 0 ( 0, 1,-1, 0) -1 + + gamma(ZUP) eigenvectors eigenvalue + 0 0 i 0 ( 1, 0,-i, 0) +1 + 0 0 0 -i ( 0, 1, 0,+i) +1 + -i 0 0 0 ( 1, 0,+i, 0) -1 + 0 i 0 0 ( 0, 1, 0,-i) -1 + + gamma(TUP) eigenvectors eigenvalue + 0 0 1 0 ( 1, 0, 1, 0) +1 + 0 0 0 1 ( 0, 1, 0, 1) +1 + 1 0 0 0 ( 1, 0,-1, 0) -1 + 0 1 0 0 ( 0, 1, 0,-1) -1 + + gamma(FIVE) eigenvectors eigenvalue + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 +*/ +#include <stdio.h> +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +/* a += i*b, a += -i*b */ +#define CSUM_TPI(a,b) { (a).real -= (b).imag; (a).imag += (b).real; } +#define CSUM_TMI(a,b) { (a).real += (b).imag; (a).imag -= (b).real; } + +void wp_grow_add( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign ){ + register int i; /*color*/ + + if(sign==MINUS)dir=OPP_DIR(dir); /* two ways to get -gamma_dir ! 
*/ + switch(dir){ /* every case adds h[0],h[1] into d[0],d[1]; the cases differ only in how h[] is added into d[2],d[3] */ + case XUP: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUM_TMI( dest->d[2].c[i], src->h[1].c[i] ); + CSUM_TMI( dest->d[3].c[i], src->h[0].c[i] ); + } + break; + case XDOWN: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUM_TPI( dest->d[2].c[i], src->h[1].c[i] ); + CSUM_TPI( dest->d[3].c[i], src->h[0].c[i] ); + } + break; + case YUP: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUM( dest->d[2].c[i], src->h[1].c[i]); + CSUB( dest->d[3].c[i], src->h[0].c[i], dest->d[3].c[i] ); + } + break; + case YDOWN: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUB( dest->d[2].c[i], src->h[1].c[i], dest->d[2].c[i] ); + CSUM( dest->d[3].c[i], src->h[0].c[i]); + } + break; + case ZUP: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUM_TMI( dest->d[2].c[i], src->h[0].c[i] ); + CSUM_TPI( dest->d[3].c[i], src->h[1].c[i] ); + } + break; + case ZDOWN: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUM_TPI( dest->d[2].c[i], src->h[0].c[i] ); + CSUM_TMI( dest->d[3].c[i], src->h[1].c[i] ); + } + break; + case TUP: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUM( dest->d[2].c[i], src->h[0].c[i]); + CSUM( dest->d[3].c[i], src->h[1].c[i]); + } + break; + case TDOWN: + for(i=0;i<3;i++){ + CSUM( dest->d[0].c[i], src->h[0].c[i]); + CSUM( dest->d[1].c[i], src->h[1].c[i]); + CSUB( dest->d[2].c[i], src->h[0].c[i], dest->d[2].c[i] ); + CSUB( dest->d[3].c[i], src->h[1].c[i], dest->d[3].c[i] ); + } + break; + default: /* dir outside 0..7: report it under this routine's own name, dest is left unmodified */ + printf("BAD CALL TO WP_GROW_ADD()\n"); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink.c 
b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink.c new file mode 100644 index 0000000000000000000000000000000000000000..3fae32bfbc3174eb266cfe54a732aa227ef937fd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink.c @@ -0,0 +1,159 @@ +/************* wp_shrink.c (in su3.a) **************************/ +/* + Compute the "Wilson projection" of a Wilson fermion vector. + (1 +- gamma_j) is a projection operator, and we want to isolate + the components of the vector that it keeps. In other words, keep + the components of the vector along the eigenvectors of 1+-gamma_j + with eigenvalue 2, and throw away those with eigenvalue 0. + + usage: wp_shrink( wilson_vector *src, half_wilson_vector *dest, + int dir, int sign ) + + If dir is one of XUP,YUP,ZUP or TUP, take the projections + along the eigenvectors with eigenvalue +1, which survive + multiplication by (1+gamma[dir]). + If dir is one of XDOWN,YDOWN,ZDOWN or TDOWN, take the projections + along the eigenvectors with eigenvalue -1, which survive + multiplication by (1-gamma[OPP_DIR(dir)]). + If sign=MINUS, switch the roles of +1 and -1 (i.e. use -gamma_dir + instead of gamma_dir ) + + Here my eigenvectors are normalized to 2, so for XYZT directions + I won't explicitly multiply by 2. In other words, the matrix of + eigenvectors is sqrt(2) times a unitary matrix, and in reexpanding + the vector I will multiply by the adjoint of this matrix. + + For UP directions, hvec.h[0] and hvec.h[1] contain the projections + along the first and second eigenvectors respectively. + For DOWN directions, hvec.h[0] and hvec.h[1] contain the projections + along the third and fourth eigenvectors respectively. This results + in down directions differing from up directions only in the sign of + the addition. 
+ + Note: wp_shrink( +-dir) followed by wp_grow( +-dir) amounts to multiplication + by 1+-gamma_dir + + gamma(XUP) eigenvectors eigenvalue + 0 0 0 i ( 1, 0, 0,-i) +1 + 0 0 i 0 ( 0, 1,-i, 0) +1 + 0 -i 0 0 ( 0, 1, 0,+i) -1 + -i 0 0 0 ( 1, 0,+i, 0) -1 + + gamma(YUP) eigenvectors eigenvalue + 0 0 0 -1 ( 1, 0, 0,-1) +1 + 0 0 1 0 ( 0, 1, 1, 0) +1 + 0 1 0 0 ( 1, 0, 0, 1) -1 + -1 0 0 0 ( 0, 1,-1, 0) -1 + + gamma(ZUP) eigenvectors eigenvalue + 0 0 i 0 ( 1, 0,-i, 0) +1 + 0 0 0 -i ( 0, 1, 0,+i) +1 + -i 0 0 0 ( 1, 0,+i, 0) -1 + 0 i 0 0 ( 0, 1, 0,-i) -1 + + gamma(TUP) eigenvectors eigenvalue + 0 0 1 0 ( 1, 0, 1, 0) +1 + 0 0 0 1 ( 0, 1, 0, 1) +1 + 1 0 0 0 ( 1, 0,-1, 0) -1 + 0 1 0 0 ( 0, 1, 0,-1) -1 + + gamma(FIVE) eigenvectors eigenvalue + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 +*/ +#include +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +void wp_shrink( wilson_vector *src, half_wilson_vector *dest, + int dir, int sign ){ + register int i; /*color*/ + + if(sign==MINUS)dir=OPP_DIR(dir); /* two ways to get -gamma_dir ! 
*/ + switch(dir){ + case XUP: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = src->d[0].c[i].real - src->d[3].c[i].imag; + dest->h[0].c[i].imag = src->d[0].c[i].imag + src->d[3].c[i].real; + dest->h[1].c[i].real = src->d[1].c[i].real - src->d[2].c[i].imag; + dest->h[1].c[i].imag = src->d[1].c[i].imag + src->d[2].c[i].real; + } + break; + case XDOWN: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = src->d[0].c[i].real + src->d[3].c[i].imag; + dest->h[0].c[i].imag = src->d[0].c[i].imag - src->d[3].c[i].real; + dest->h[1].c[i].real = src->d[1].c[i].real + src->d[2].c[i].imag; + dest->h[1].c[i].imag = src->d[1].c[i].imag - src->d[2].c[i].real; + } + break; + case YUP: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = src->d[0].c[i].real - src->d[3].c[i].real; + dest->h[0].c[i].imag = src->d[0].c[i].imag - src->d[3].c[i].imag; + dest->h[1].c[i].real = src->d[1].c[i].real + src->d[2].c[i].real; + dest->h[1].c[i].imag = src->d[1].c[i].imag + src->d[2].c[i].imag; + } + break; + case YDOWN: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = src->d[0].c[i].real + src->d[3].c[i].real; + dest->h[0].c[i].imag = src->d[0].c[i].imag + src->d[3].c[i].imag; + dest->h[1].c[i].real = src->d[1].c[i].real - src->d[2].c[i].real; + dest->h[1].c[i].imag = src->d[1].c[i].imag - src->d[2].c[i].imag; + } + break; + case ZUP: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = src->d[0].c[i].real - src->d[2].c[i].imag; + dest->h[0].c[i].imag = src->d[0].c[i].imag + src->d[2].c[i].real; + dest->h[1].c[i].real = src->d[1].c[i].real + src->d[3].c[i].imag; + dest->h[1].c[i].imag = src->d[1].c[i].imag - src->d[3].c[i].real; + } + break; + case ZDOWN: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = src->d[0].c[i].real + src->d[2].c[i].imag; + dest->h[0].c[i].imag = src->d[0].c[i].imag - src->d[2].c[i].real; + dest->h[1].c[i].real = src->d[1].c[i].real - src->d[3].c[i].imag; + dest->h[1].c[i].imag = src->d[1].c[i].imag + src->d[3].c[i].real; + } + break; + case TUP: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = 
src->d[0].c[i].real + src->d[2].c[i].real; + dest->h[0].c[i].imag = src->d[0].c[i].imag + src->d[2].c[i].imag; + dest->h[1].c[i].real = src->d[1].c[i].real + src->d[3].c[i].real; + dest->h[1].c[i].imag = src->d[1].c[i].imag + src->d[3].c[i].imag; + } + break; + case TDOWN: + for(i=0;i<3;i++){ + dest->h[0].c[i].real = src->d[0].c[i].real - src->d[2].c[i].real; + dest->h[0].c[i].imag = src->d[0].c[i].imag - src->d[2].c[i].imag; + dest->h[1].c[i].real = src->d[1].c[i].real - src->d[3].c[i].real; + dest->h[1].c[i].imag = src->d[1].c[i].imag - src->d[3].c[i].imag; + } + break; + default: + printf("BAD CALL TO WP_SHRINK()\n"); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink4.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink4.c new file mode 100644 index 0000000000000000000000000000000000000000..b560707cd200198294878934c51a10f7d6198981 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink4.c @@ -0,0 +1,144 @@ +/***************** wp_shrink4.c (in su3.a) **************************** +* * +* Shrink a wilson vector in four directions, producing four * +* half_wilson_vectors. * +* void wp_shrink_4dir( wilson_vector *a, half_wilson_vector *b1, * +* half_wilson_vector *b2, half_wilson_vector *b3, * +* half_wilson_vector *b4, int sign ); * +* B1 <- (1 +- gamma_x)A,, projection * +* argument "sign" is sign of gamma matrix. * +* See wp_shrink.c for definitions of gamma matrices and eigenvectors. * +*/ +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. 
*/ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +#ifndef FAST /* "FAST", or IBM RS6000 version inlines calls */ + +void wp_shrink_4dir( wilson_vector *a, half_wilson_vector *b1, + half_wilson_vector *b2, half_wilson_vector *b3, + half_wilson_vector *b4, int sign ){ + wp_shrink( a,b1,XUP,sign); + wp_shrink( a,b2,YUP,sign); + wp_shrink( a,b3,ZUP,sign); + wp_shrink( a,b4,TUP,sign); +} + +#else /* "FAST" code inlines calls */ + +void wp_shrink_4dir( wilson_vector *a, half_wilson_vector *b1, + half_wilson_vector *b2, half_wilson_vector *b3, + half_wilson_vector *b4, int sign ){ + register int i; /*color*/ + +/* wp_shrink( a,b1,XUP,sign); */ + + if(sign==PLUS) + { + /* case XUP: */ + for(i=0;i<3;i++){ + b1->h[0].c[i].real = a->d[0].c[i].real - a->d[3].c[i].imag; + b1->h[0].c[i].imag = a->d[0].c[i].imag + a->d[3].c[i].real; + b1->h[1].c[i].real = a->d[1].c[i].real - a->d[2].c[i].imag; + b1->h[1].c[i].imag = a->d[1].c[i].imag + a->d[2].c[i].real; + } + } + else + { + /* case XDOWN: */ + for(i=0;i<3;i++){ + b1->h[0].c[i].real = a->d[0].c[i].real + a->d[3].c[i].imag; + b1->h[0].c[i].imag = a->d[0].c[i].imag - a->d[3].c[i].real; + b1->h[1].c[i].real = a->d[1].c[i].real + a->d[2].c[i].imag; + b1->h[1].c[i].imag = a->d[1].c[i].imag - a->d[2].c[i].real; + } + } + + + /* wp_shrink( a,b2,YUP,sign); */ + + if(sign==PLUS) + { + /* case YUP: */ + for(i=0;i<3;i++){ + b2->h[0].c[i].real = a->d[0].c[i].real - a->d[3].c[i].real; + b2->h[0].c[i].imag = a->d[0].c[i].imag - a->d[3].c[i].imag; + b2->h[1].c[i].real = a->d[1].c[i].real + a->d[2].c[i].real; + b2->h[1].c[i].imag = a->d[1].c[i].imag + a->d[2].c[i].imag; + } + + } + else + { + /* case YDOWN: */ + for(i=0;i<3;i++){ + b2->h[0].c[i].real = a->d[0].c[i].real + a->d[3].c[i].real; + b2->h[0].c[i].imag = 
a->d[0].c[i].imag + a->d[3].c[i].imag; + b2->h[1].c[i].real = a->d[1].c[i].real - a->d[2].c[i].real; + b2->h[1].c[i].imag = a->d[1].c[i].imag - a->d[2].c[i].imag; + } + } + + /* wp_shrink( a,b3,ZUP,sign); */ + + if(sign==PLUS) + { + /* case ZUP: */ + for(i=0;i<3;i++){ + b3->h[0].c[i].real = a->d[0].c[i].real - a->d[2].c[i].imag; + b3->h[0].c[i].imag = a->d[0].c[i].imag + a->d[2].c[i].real; + b3->h[1].c[i].real = a->d[1].c[i].real + a->d[3].c[i].imag; + b3->h[1].c[i].imag = a->d[1].c[i].imag - a->d[3].c[i].real; + } + } + else + { + /* case ZDOWN: */ + for(i=0;i<3;i++){ + b3->h[0].c[i].real = a->d[0].c[i].real + a->d[2].c[i].imag; + b3->h[0].c[i].imag = a->d[0].c[i].imag - a->d[2].c[i].real; + b3->h[1].c[i].real = a->d[1].c[i].real - a->d[3].c[i].imag; + b3->h[1].c[i].imag = a->d[1].c[i].imag + a->d[3].c[i].real; + } + + } + +/* wp_shrink( a,b4,TUP,sign); */ + + if(sign==PLUS) + { + /* case TUP: */ + for(i=0;i<3;i++){ + b4->h[0].c[i].real = a->d[0].c[i].real + a->d[2].c[i].real; + b4->h[0].c[i].imag = a->d[0].c[i].imag + a->d[2].c[i].imag; + b4->h[1].c[i].real = a->d[1].c[i].real + a->d[3].c[i].real; + b4->h[1].c[i].imag = a->d[1].c[i].imag + a->d[3].c[i].imag; + } + } + else + { + /* case TDOWN: */ + for(i=0;i<3;i++){ + b4->h[0].c[i].real = a->d[0].c[i].real - a->d[2].c[i].real; + b4->h[0].c[i].imag = a->d[0].c[i].imag - a->d[2].c[i].imag; + b4->h[1].c[i].real = a->d[1].c[i].real - a->d[3].c[i].real; + b4->h[1].c[i].imag = a->d[1].c[i].imag - a->d[3].c[i].imag; + } + } +} + +#endif /* "ifndef FAST */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink8.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink8.c new file mode 100644 index 0000000000000000000000000000000000000000..f7816c4beec9257894ece10d72a4d3147231aa86 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wp_shrink8.c @@ -0,0 +1,39 @@ +/***************** wp_shrink8.c (in su3.a) **************************** +* * +* Shrink a wilson vector in eight 
directions, producing eight * +* half_wilson_vectors. * +* void wp_shrink_8dir(a,b,sign) * +* wilson_vector *a; half_wilson_vector *b; * +* int sign; * +* B1 <- (1 +- gamma_x)A,, projection * +* argument "sign" is sign of gamma matrix. * +* See wp_shrink.c for definitions of gamma matrices and eigenvectors. * +*/ +#include "complex.h" +#include "su3.h" +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +void wp_shrink_8dir( wilson_vector *a, half_wilson_vector *b, int sign) { + wp_shrink( a,&(b[XUP]),XUP,sign); + wp_shrink( a,&(b[YUP]),YUP,sign); + wp_shrink( a,&(b[ZUP]),ZUP,sign); + wp_shrink( a,&(b[TUP]),TUP,sign); + wp_shrink( a,&(b[XDOWN]),XDOWN,sign); + wp_shrink( a,&(b[YDOWN]),YDOWN,sign); + wp_shrink( a,&(b[ZDOWN]),ZDOWN,sign); + wp_shrink( a,&(b[TDOWN]),TDOWN,sign); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec2_dot.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec2_dot.c new file mode 100644 index 0000000000000000000000000000000000000000..65cc34def00ff367f082b1334ec31199173cbdd2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec2_dot.c @@ -0,0 +1,26 @@ +/****************** wvec_dot.c (in su3.a) ****************************** +* * +* complex wvec2_dot( wilson_vector *a, wilson_vector *b ) * +* return dot product of two wilson_vectors = a-dagger times b * +*/ +#include "complex.h" +#include "su3.h" + +complex wvec2_dot( wilson_vector *a, wilson_vector *b ){ + complex temp; + wilson_vector c; + register int i,j; + + temp.real = wvec_rdot(a,b); + + for(i=0;i<4;i++){ + for(j=0;j<3;j++){ + c.d[i].c[j].real = -(a->d[i].c[j].imag); + 
c.d[i].c[j].imag = a->d[i].c[j].real; + } + } + + temp.imag = wvec_rdot(&c,b); + + return(temp); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec_dot.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec_dot.c new file mode 100644 index 0000000000000000000000000000000000000000..28cd247686da0d35e54471c88abad30f78e3bab3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec_dot.c @@ -0,0 +1,84 @@ +/****************** wvec_dot.c (in su3.a) ****************************** +* * +* complex wvec_dot(a,b) wilson_vector *a,*b; * +* return dot product of two wilson_vectors * +*/ +#include "complex.h" +#include "su3.h" + +complex wvec_dot( wilson_vector *a, wilson_vector *b ){ + +#ifndef NATIVEDOUBLE + complex temp1,temp2; + register int i; + temp1.real = temp1.imag = 0.0; + for(i=0;i<4;i++){ + CMULJ_(a->d[i].c[0],b->d[i].c[0],temp2); CSUM(temp1,temp2); + CMULJ_(a->d[i].c[1],b->d[i].c[1],temp2); CSUM(temp1,temp2); + CMULJ_(a->d[i].c[2],b->d[i].c[2],temp2); CSUM(temp1,temp2); + } + return(temp1); + +#else /* RS6000 version */ + + register double ar,ai,br,bi,cr,ci; + register complex cc; + + ar=a->d[0].c[0].real; ai=a->d[0].c[0].imag; + br=b->d[0].c[0].real; bi=b->d[0].c[0].imag; + cr = ar*br + ai*bi; + ci = ar*bi - ai*br; + ar=a->d[0].c[1].real; ai=a->d[0].c[1].imag; + br=b->d[0].c[1].real; bi=b->d[0].c[1].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + ar=a->d[0].c[2].real; ai=a->d[0].c[2].imag; + br=b->d[0].c[2].real; bi=b->d[0].c[2].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + + ar=a->d[1].c[0].real; ai=a->d[1].c[0].imag; + br=b->d[1].c[0].real; bi=b->d[1].c[0].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + ar=a->d[1].c[1].real; ai=a->d[1].c[1].imag; + br=b->d[1].c[1].real; bi=b->d[1].c[1].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + ar=a->d[1].c[2].real; ai=a->d[1].c[2].imag; + br=b->d[1].c[2].real; bi=b->d[1].c[2].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + + 
ar=a->d[2].c[0].real; ai=a->d[2].c[0].imag; + br=b->d[2].c[0].real; bi=b->d[2].c[0].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + ar=a->d[2].c[1].real; ai=a->d[2].c[1].imag; + br=b->d[2].c[1].real; bi=b->d[2].c[1].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + ar=a->d[2].c[2].real; ai=a->d[2].c[2].imag; + br=b->d[2].c[2].real; bi=b->d[2].c[2].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + + ar=a->d[3].c[0].real; ai=a->d[3].c[0].imag; + br=b->d[3].c[0].real; bi=b->d[3].c[0].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + ar=a->d[3].c[1].real; ai=a->d[3].c[1].imag; + br=b->d[3].c[1].real; bi=b->d[3].c[1].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + ar=a->d[3].c[2].real; ai=a->d[3].c[2].imag; + br=b->d[3].c[2].real; bi=b->d[3].c[2].imag; + cr += ar*br + ai*bi; + ci += ar*bi - ai*br; + + cc.real = cr; + cc.imag = ci; + return(cc); + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec_rdot.c b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec_rdot.c new file mode 100644 index 0000000000000000000000000000000000000000..13b7b744694b95bfa7d7cfef4c5bf92b6287a8c6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/libraries/wvec_rdot.c @@ -0,0 +1,72 @@ +/***************** wvec_rdot.c (in su3.a) ****************************** +* * +* radix wvec_rdot( wilson_vector *a, wilson_vector *b ) * +* return real part of dot product of two wilson_vectors * +*/ +#include "complex.h" +#include "su3.h" + +radix wvec_rdot( wilson_vector *a, wilson_vector *b ){ + +#ifndef NATIVEDOUBLE + register radix temp1,temp2; + register int i; + temp2=0.0; + for(i=0;i<4;i++){ + temp1 = a->d[i].c[0].real * b->d[i].c[0].real; temp2 += temp1; + temp1 = a->d[i].c[0].imag * b->d[i].c[0].imag; temp2 += temp1; + temp1 = a->d[i].c[1].real * b->d[i].c[1].real; temp2 += temp1; + temp1 = a->d[i].c[1].imag * b->d[i].c[1].imag; temp2 += temp1; + temp1 = a->d[i].c[2].real * b->d[i].c[2].real; temp2 += temp1; + temp1 = 
a->d[i].c[2].imag * b->d[i].c[2].imag; temp2 += temp1; + } + return(temp2); + +#else /* RS6000 version */ + + register double ar,ai,br,bi,ss; + + ar=a->d[0].c[0].real; ai=a->d[0].c[0].imag; + br=b->d[0].c[0].real; bi=b->d[0].c[0].imag; + ss = ar*br + ai*bi; + ar=a->d[0].c[1].real; ai=a->d[0].c[1].imag; + br=b->d[0].c[1].real; bi=b->d[0].c[1].imag; + ss += ar*br + ai*bi; + ar=a->d[0].c[2].real; ai=a->d[0].c[2].imag; + br=b->d[0].c[2].real; bi=b->d[0].c[2].imag; + ss += ar*br + ai*bi; + + ar=a->d[1].c[0].real; ai=a->d[1].c[0].imag; + br=b->d[1].c[0].real; bi=b->d[1].c[0].imag; + ss += ar*br + ai*bi; + ar=a->d[1].c[1].real; ai=a->d[1].c[1].imag; + br=b->d[1].c[1].real; bi=b->d[1].c[1].imag; + ss += ar*br + ai*bi; + ar=a->d[1].c[2].real; ai=a->d[1].c[2].imag; + br=b->d[1].c[2].real; bi=b->d[1].c[2].imag; + ss += ar*br + ai*bi; + + ar=a->d[2].c[0].real; ai=a->d[2].c[0].imag; + br=b->d[2].c[0].real; bi=b->d[2].c[0].imag; + ss += ar*br + ai*bi; + ar=a->d[2].c[1].real; ai=a->d[2].c[1].imag; + br=b->d[2].c[1].real; bi=b->d[2].c[1].imag; + ss += ar*br + ai*bi; + ar=a->d[2].c[2].real; ai=a->d[2].c[2].imag; + br=b->d[2].c[2].real; bi=b->d[2].c[2].imag; + ss += ar*br + ai*bi; + + ar=a->d[3].c[0].real; ai=a->d[3].c[0].imag; + br=b->d[3].c[0].real; bi=b->d[3].c[0].imag; + ss += ar*br + ai*bi; + ar=a->d[3].c[1].real; ai=a->d[3].c[1].imag; + br=b->d[3].c[1].real; bi=b->d[3].c[1].imag; + ss += ar*br + ai*bi; + ar=a->d[3].c[2].real; ai=a->d[3].c[2].imag; + br=b->d[3].c[2].real; bi=b->d[3].c[2].imag; + ss += ar*br + ai*bi; + + return(ss); + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Goverrelax.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Goverrelax.c new file mode 100644 index 0000000000000000000000000000000000000000..3f0863954bafb64225032c8217592398d5c4c921 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Goverrelax.c @@ -0,0 +1,53 @@ +/********************* HBHiggs.c ************************/ + +#include "lattice.h" + +/* 
#include "gaussian_ran.c" + */ + +double Goverrelax(int parity, adjoint_matrix *ahiggs, adjoint_matrix *adjstaple) +{ + int i,j,nhit; + double rsum,sm,r2,r2o; + adjoint_matrix m; + + nhit = 0; + rsum = 0.0; + + sm = betaA/beta2; + + forparity(i,parity) { + + prefetch_adjoint(&ahiggs[i+1]); + prefetch_adjoint(&adjstaple[i+1]); + + /* perform a Gaussian overrelax for Higgs: Since + * Act = -bA A.S + b2 A^2 + b4 A^4, we can do + * Act = b2 (A - bA/(2b2) S)^2 + b4 A^4 + * Thus, reflect A using the gaussian potential: + * (A'-bA/2b2 S) = -(A-bA/2b2 S) => + * A' = bA/b2 S - A + * Accept/reject with the change in the A^4-term + */ + + for (r2o=j=0; j<8; j++) r2o += sqr( ahiggs[i].l[j] ); + + for (r2=j=0; j<8; j++) { + m.l[j] = sm * adjstaple[i].l[j] - ahiggs[i].l[j]; + r2 += sqr( m.l[j] ); + } + /* acc/rej with A^4 */ + if ( exp( beta4*(r2o*r2o - r2*r2) ) >= dran() ) { + ahiggs[i] = m; + nhit++; + rsum += r2; + } else rsum += r2o; + } + + nhitog++; + if (parity == EVEN) ahitog += 1.0*nhit/node.evensites; + else ahitog += 1.0*nhit/node.oddsites; + + return(rsum); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/HBHiggs.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/HBHiggs.c new file mode 100644 index 0000000000000000000000000000000000000000..ef7448a22d707e46a394f7c2f850cfbce9d16ca6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/HBHiggs.c @@ -0,0 +1,53 @@ +/********************* HBHiggs.c ************************/ + +#include "lattice.h" + +/* #include "gaussian_ran.c" + */ + +double HBHiggs(int parity, adjoint_matrix *ahiggs, adjoint_matrix *adjstaple) +{ + int i,j,ntry; + double rsum,w,sm,r2,r4o; + + ntry = 0; + rsum = 0.0; + + w = 1.0/sqrt(beta2); + sm = betaA/(2.0*beta2); + + forparity(i,parity) { + + prefetch_adjoint(&ahiggs[i+1]); + prefetch_adjoint(&adjstaple[i+1]); + + /* perform a heat bath update for Higgs: Since + * Act = -bA A.adjStaple + b2 A^2 + b4 A^4, we can do + * Act = b2 (A - bA/(2b2) adjStaple)^2 + b4 A^4 
+ * Thus, pull A from a gaussian distribution + * A = bA/2b2 S + 1/sqrt(b2) gaussian_ran() + * and acc/rej with the b4-term + */ + + /* for (r4o=j=0; j<8; j++) r4o += sqr( st->ahiggs.l[j] ); + r4o *= r4o; */ + + do { + ++ntry; + for (r2=j=0; j<8; j++) { + ahiggs[i].l[j] = sm * adjstaple[i].l[j] + w * gaussian_ran(); + r2 += sqr( ahiggs[i].l[j] ); + } + /* acc/rej with A^4 */ + } while ( exp( -beta4*r2*r2 ) < dran() ); /* loop until ok */ + + rsum += r2; + } + + nhithb++; + if (parity == EVEN) ahithb += 1.0*node.evensites/ntry; + else ahithb += 1.0*node.oddsites/ntry; + + return(rsum); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Make_sse_vanilla b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Make_sse_vanilla new file mode 100644 index 0000000000000000000000000000000000000000..b52966753067b8538d9ae7c59b08b63ac2909713 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Make_sse_vanilla @@ -0,0 +1,127 @@ +# Makefile for the hybrid molecular dynamics simulation with +# pure gauge SU3 +# MIMD version 3 +# + +#Where the complex and su3 libraries are +#c code +LIBDIR = ../libraries +GENERIC = ../generic +SSEDIR = ../sse +INCLDIR = -I$(GENERIC) -I$(LIBDIR) -I$(SSEDIR) + +HEADERS= $(LIBDIR)/complex.h $(LIBDIR)/su3.h $(GENERIC)/comdefs.h $(GENERIC)/generic.h $(GENERIC)/generic_su3.h lattice.h + +# include file defining site structure, etc., for compiling generic code */ +LATDEF = -DLATDEF='"../su3h_n/lattice.h"' +# The quotation marks are necessary, as is the "../pure_gauge", since +# some compilations will be done in another directory + +# Choose one of the lattice layout algorithms: + +OBJECTS=control.o updategauge.o relax.o monte.o measure.o \ + setup.o setup_basic.o \ + reunitarize.o staples_su3.o \ + layout.o io_lattice.o \ + mersenne_inline.o parameter_io.o timecheck.o \ + random_su3P.o +# check_unitarity.o is not used +# correlation.o + +HOBJECTS=updatehiggs.o Xoverrelax.o HBHiggs.o adjmat_operations.o \ + 
setcouplings_higgs.o \ + multican.o smooth_field_su3adjoint.o block_field_su3adjoint.o \ + block_lattice.o correlation.o smooth_link_su3.o block_link_su3.o \ + gaussian_ran.o + +#MACHINE_DEP = com_intelsim.o +#MACHINE_DEP = com_intel.o +MACHINE_DEP = com_vanilla.o + +#Library for multinode communication and information functions +#ILIB= /usr/local/lib/bsimlib.a #preon.physics.arizona.edu +#ILIB= -node #Intel machine +ILIB= #vanilla + +#Libraries for complex numbers and su3 functions +QCDLIB = $(LIBDIR)/su3.a $(LIBDIR)/complex.a + +#CFLAGS= -g -f $(INCLDIR) -DPROTO #MIPS +#CFLAGS= -g -fsingle $(INCLDIR) #Sun +CFLAGS= -O4 -DPROTO $(INCLDIR) $(LATDEF) #gnu c compiler +#CFLAGS = -non_shared -O4 -std1 -arch ev6 -DFAST -DPROTO $(INCLDIR) $(LATDEF) -float #Dec alpha compiler +#CFLAGS = -O4 -std1 -arch ev6 -DFAST -DPROTO $(INCLDIR) -float #Dec alpha compiler +#CFLAGS = -O4 -std1 -DFAST -DPROTO $(INCLDIR) -float #Dec alpha compiler +#CFLAGS = -O -DFAST -DPROTO $(INCLDIR) #Dec alpha compiler +#CFLAGS = -g -Wall -DFAST -DPROTO $(INCLDIR) #Dec alpha compiler + +COMPILER = cc #generic, for simulator +#COMPILER = gcc #Intel Green Hills compiler (SDSC only) +#COMPILER = icc #Intel pgcc + +DEFINES = -DCAN_DO_ALLOCA # can do alloca + + +.c.o: ; $(COMPILER) $(CFLAGS) -c $(DEFINES) $*.c + +$(OBJECTS) $(EXTRA_OBJECTS) : $(HEADERS) +su3_ahiggs:: + make -f Make_vanilla target "TARGET= su3_ahiggs" \ + "DEFINES= -DHIGGS -DSSE_INLINE" "EXTRA_OBJECTS= $(HOBJECTS)" + +su3_gauge:: + make -f Make_vanilla target "TARGET= su3_gauge" \ + "DEFINES= " "EXTRA_OBJECTS= setcouplings_gauge.o" + +su3_4d:: + make -f Make_vanilla target "TARGET= su3_4d" \ + "DEFINES= -DDIMENSION=4 -DP4 -DSSE -DSSE_INLINE" \ + "EXTRA_OBJECTS= setcouplings_gauge.o" + + +clean: + rm -f *.o + + +# Choose one of the lattice layout algorithms: +layout.o: ../generic/layout.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/layout.c +reunitarize.o: ../generic/reunitarize.c $(HEADERS) + $(COMPILER) $(CFLAGS) 
-c $(LATDEF) $(DEFINES) ../generic/reunitarize.c +com_vanilla.o: ../generic/com_vanilla.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/com_vanilla.c +mersenne_inline.o: ../generic/mersenne_inline.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/mersenne_inline.c +setup_basic.o: ../generic/setup_basic.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/setup_basic.c +staples_su3.o: ../generic/staples_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/staples_su3.c +random_su3P.o: ../generic/random_su3P.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/random_su3P.c +parameter_io.o: ../generic/parameter_io.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/parameter_io.c +timecheck.o: ../generic/timecheck.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/timecheck.c + +gaussian_ran.o: ../generic/gaussian_ran.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/gaussian_ran.c + +block_lattice.o: ../generic/block_lattice.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_lattice.c +smooth_field_su3adjoint.o: ../generic/smooth_field_su3adjoint.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/smooth_field_su3adjoint.c +smooth_link_su3.o: ../generic/smooth_link_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/smooth_link_su3.c +block_field_su3adjoint.o: ../generic/block_field_su3adjoint.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_field_su3adjoint.c +block_link_su3.o: ../generic/block_link_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_link_su3.c + + + +target: $(OBJECTS) $(MACHINE_DEP) $(EXTRA_OBJECTS) $(QCDLIB) + $(COMPILER) $(CFLAGS) -o $(TARGET) $(DEFINES) \ + $(OBJECTS) $(MACHINE_DEP) $(EXTRA_OBJECTS) $(QCDLIB) $(ILIB) -lm + + + diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Make_vanilla b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Make_vanilla new file mode 100644 index 0000000000000000000000000000000000000000..0d1866b29edadeb57c434ba4232b28da87c074da --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Make_vanilla @@ -0,0 +1,126 @@ +# Makefile for the hybrid molecular dynamics simulation with +# pure gauge SU3 +# MIMD version 3 +# + +#Where the complex and su3 libraries are +#c code +LIBDIR = ../libraries +GENERIC = ../generic +INCLDIR = -I$(GENERIC) -I$(LIBDIR) + +HEADERS= $(LIBDIR)/complex.h $(LIBDIR)/su3.h $(GENERIC)/comdefs.h $(GENERIC)/generic.h $(GENERIC)/generic_su3.h lattice.h + +# include file defining site structure, etc., for compiling generic code */ +LATDEF = -DLATDEF='"../su3h_n/lattice.h"' +# The quotation marks are necessary, as is the "../pure_gauge", since +# some compilations will be done in another directory + +# Choose one of the lattice layout algorithms: + +OBJECTS=control.o updategauge.o relax.o monte.o measure.o \ + setup.o setup_basic.o \ + reunitarize.o staples_su3.o \ + layout.o io_lattice.o \ + mersenne_inline.o parameter_io.o timecheck.o \ + random_su3P.o gaugefix.o +# check_unitarity.o is not used +# correlation.o + +HOBJECTS=updatehiggs.o Xoverrelax.o Goverrelax.o HBHiggs.o adjmat_operations.o \ + setcouplings_higgs.o \ + multican.o smooth_field_su3adjoint.o block_field_su3adjoint.o \ + block_lattice.o correlation.o smooth_link_su3.o block_link_su3.o \ + gaussian_ran.o + +#MACHINE_DEP = com_intelsim.o +#MACHINE_DEP = com_intel.o +MACHINE_DEP = com_vanilla.o + +#Library for multinode communication and information functions +#ILIB= /usr/local/lib/bsimlib.a #preon.physics.arizona.edu +#ILIB= -node #Intel machine +ILIB= #vanilla + +#Libraries for complex numbers and su3 functions +QCDLIB = $(LIBDIR)/su3.a $(LIBDIR)/complex.a + +#CFLAGS= -g -f $(INCLDIR) -DPROTO #MIPS +#CFLAGS= -g -fsingle $(INCLDIR) #Sun +CFLAGS= -O4 -DPROTO 
$(INCLDIR) $(LATDEF) #gnu c compiler +#CFLAGS = -non_shared -O4 -std1 -arch ev6 -DFAST -DPROTO $(INCLDIR) $(LATDEF) -float #Dec alpha compiler +#CFLAGS = -O4 -std1 -arch ev6 -DFAST -DPROTO $(INCLDIR) -float #Dec alpha compiler +#CFLAGS = -O4 -std1 -DFAST -DPROTO $(INCLDIR) -float #Dec alpha compiler +#CFLAGS = -O -DFAST -DPROTO $(INCLDIR) #Dec alpha compiler +#CFLAGS = -g -Wall -DFAST -DPROTO $(INCLDIR) #Dec alpha compiler + +COMPILER = cc #generic, for simulator +#COMPILER = gcc #Intel Green Hills compiler (SDSC only) +#COMPILER = icc #Intel pgcc + +DEFINES = -DCAN_DO_ALLOCA # can do alloca + + +.c.o: ; $(COMPILER) $(CFLAGS) -c $(DEFINES) $*.c + +$(OBJECTS) $(EXTRA_OBJECTS) : $(HEADERS) +su3_ahiggs:: + make -f Make_vanilla target "TARGET= su3_ahiggs" \ + "DEFINES= -DHIGGS" "EXTRA_OBJECTS= $(HOBJECTS)" + +su3_gauge:: + make -f Make_vanilla target "TARGET= su3_gauge" \ + "EXTRA_OBJECTS= setcouplings_gauge.o" + +su3_4d:: + make -f Make_vanilla target "TARGET= su3_4d" \ + "DEFINES= -DDIMENSION=4" \ + "EXTRA_OBJECTS= setcouplings_gauge.o ploop.o" + + +clean: + rm -f *.o + + +# Choose one of the lattice layout algorithms: +layout.o: ../generic/layout.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/layout.c +reunitarize.o: ../generic/reunitarize.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/reunitarize.c +com_vanilla.o: ../generic/com_vanilla.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/com_vanilla.c +mersenne_inline.o: ../generic/mersenne_inline.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/mersenne_inline.c +setup_basic.o: ../generic/setup_basic.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/setup_basic.c +staples_su3.o: ../generic/staples_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/staples_su3.c +random_su3P.o: ../generic/random_su3P.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) 
../generic/random_su3P.c +parameter_io.o: ../generic/parameter_io.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/parameter_io.c +timecheck.o: ../generic/timecheck.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/timecheck.c + +gaussian_ran.o: ../generic/gaussian_ran.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/gaussian_ran.c + +block_lattice.o: ../generic/block_lattice.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_lattice.c +smooth_field_su3adjoint.o: ../generic/smooth_field_su3adjoint.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/smooth_field_su3adjoint.c +smooth_link_su3.o: ../generic/smooth_link_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/smooth_link_su3.c +block_field_su3adjoint.o: ../generic/block_field_su3adjoint.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_field_su3adjoint.c +block_link_su3.o: ../generic/block_link_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_link_su3.c + + + +target: $(OBJECTS) $(MACHINE_DEP) $(EXTRA_OBJECTS) $(QCDLIB) + $(COMPILER) $(CFLAGS) -o $(TARGET) $(DEFINES) \ + $(OBJECTS) $(MACHINE_DEP) $(EXTRA_OBJECTS) $(QCDLIB) $(ILIB) -static -lm + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..8fdd943f82f64dd57b4353cf552ca7018450c575 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Makefile @@ -0,0 +1,108 @@ +include ../Makefile.defs + +# MPI C compiler +# MPI_CC = mpcc_r #IBM RS6000 + +#Where the complex and su3 libraries are +LIBDIR = ../libraries +GENERIC = ../generic +INCLDIR = -I$(GENERIC) -I$(LIBDIR) + +HEADERS= $(LIBDIR)/complex.h $(LIBDIR)/su3.h $(GENERIC)/comdefs.h $(GENERIC)/generic.h $(GENERIC)/generic_su3.h lattice.h + +# 
include file defining site structure, etc., for compiling generic code */ +LATDEF = -DLATDEF='"../su3h_n/lattice.h"' + +OBJECTS=control.o updategauge.o relax.o monte.o measure.o \ + setup.o setup_basic.o timers.o \ + reunitarize.o staples_su3.o \ + layout.o io_lattice.o \ + mersenne_inline.o parameter_io.o timecheck.o \ + random_su3P.o +# check_unitarity.o is not used +# correlation.o + +HOBJECTS=updatehiggs.o Xoverrelax.o Goverrelax.o HBHiggs.o adjmat_operations.o \ + setcouplings_higgs.o \ + multican.o smooth_field_su3adjoint.o block_field_su3adjoint.o \ + block_lattice.o correlation.o smooth_link_su3.o block_link_su3.o \ + gaussian_ran.o + +MACHINE_DEP = com_mpi.o + +#Libraries for complex numbers and su3 functions +QCDLIB = $(LIBDIR)/su3.a $(LIBDIR)/complex.a + +#PABS -DHIGGS added +CFLAGS += -DMPI -DFAST $(INCLDIR) $(LATDEF) -DTIMERS -DHIGGS + +COMPILER = $(MPI_CC) + +.SUFFIXES: +.SUFFIXES: .o .t3e .c .y .l .s + +.c.o: + $(COMPILER) $(CFLAGS) -c $(DEFINES) $*.c +.s.o: + $(COMPILER) $(CFLAGS) -c $(DEFINES) $*.s + +$(OBJECTS) $(EXTRA_OBJECTS) : $(HEADERS) + +su3_ahiggs:: + $(MAKE) target "MYTARGET= su3_ahiggs" \ + "DEFINES= -DHIGGS" "EXTRA_OBJECTS= $(HOBJECTS)" + +su3_gauge:: + $(MAKE) target "MYTARGET= su3_gauge" \ + "EXTRA_OBJECTS= setcouplings_gauge.o" + +su3_4d:: + $(MAKE) target "MYTARGET= su3_4d" \ + "DEFINES= -DDIMENSION=4" \ + "EXTRA_OBJECTS= setcouplings_gauge.o ploop.o" + +clean: + $(RM) -f *.o + +# Choose one of the lattice layout algorithms: +layout.o: ../generic/layout.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/layout.c +reunitarize.o: ../generic/reunitarize.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/reunitarize.c +timers.o: ../generic/timers.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/timers.c +com_mpi.o: ../generic/com_mpi.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/com_mpi.c +mersenne_inline.o: ../generic/mersenne_inline.c $(HEADERS) 
+ $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/mersenne_inline.c +setup_basic.o: ../generic/setup_basic.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/setup_basic.c +staples_su3.o: ../generic/staples_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/staples_su3.c +random_su3P.o: ../generic/random_su3P.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/random_su3P.c +parameter_io.o: ../generic/parameter_io.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/parameter_io.c +timecheck.o: ../generic/timecheck.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/timecheck.c + +gaussian_ran.o: ../generic/gaussian_ran.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/gaussian_ran.c + +block_lattice.o: ../generic/block_lattice.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_lattice.c +smooth_field_su3adjoint.o: ../generic/smooth_field_su3adjoint.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/smooth_field_su3adjoint.c +smooth_link_su3.o: ../generic/smooth_link_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/smooth_link_su3.c +block_field_su3adjoint.o: ../generic/block_field_su3adjoint.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_field_su3adjoint.c +block_link_su3.o: ../generic/block_link_su3.c $(HEADERS) + $(COMPILER) $(CFLAGS) -c $(LATDEF) $(DEFINES) ../generic/block_link_su3.c + +target: $(OBJECTS) $(MACHINE_DEP) $(EXTRA_OBJECTS) $(QCDLIB) + $(COMPILER) $(CFLAGS) -o $(MYTARGET) $(DEFINES) \ + $(OBJECTS) $(MACHINE_DEP) $(EXTRA_OBJECTS) $(QCDLIB) $(LDFLAGS) + +kernel-objects: $(OBJECTS) $(MACHINE_DEP) $(HOBJECTS) $(QCDLIB) + echo "mache kernel_B" \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Xoverrelax.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Xoverrelax.c new 
file mode 100644 index 0000000000000000000000000000000000000000..f78cfad596d113db447e5ffd3990f2ebc8482686 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/Xoverrelax.c @@ -0,0 +1,130 @@ +/************************************************************** + * * + * adjoint overrelaxation routine * + * * + *************************************************************/ + +#include "lattice.h" + + +double Xoverrelax(int parity, adjoint_matrix *ahiggs, adjoint_matrix *adjstaple) +{ + double rp,x,x2,y2,xn,a,b,c,vi,b0,b1,b2,f1,f1p3,f2,f3,f4,f5,f6; + double xa,xb,pr,r2,a1; + int i,j,nhits; + double rsum; + + nhits = rsum = 0; + + /* the coeffs of the polynomial + * s = -betaA v p + beta2 v^2 + beta4 v^4 + */ + + vi = 1.0/beta4; + + forparity(i,parity) { + + prefetch_adjoint(&ahiggs[i+1]); + prefetch_adjoint(&adjstaple[i+1]); + + rp = a1 = r2 = 0.0; + for (j=0; j<8; j++) { + a1 += ahiggs[i].l[j] * adjstaple[i].l[j]; + r2 += sqr(ahiggs[i].l[j]); + rp += sqr(adjstaple[i].l[j]); + } + rp = sqrt(rp); + + /* vector angle is given by V*P = cos theta; x = V*\hatP */ + + x = a1/rp; + x2 = sqr(x); + y2 = r2 - x2; + + /* NOW rr == -betaA * rp; + * vx2 == beta2 + * vx4 == beta4 + * act = rr*x + vx2*(x2+y2) + vx4*(x2+y2)^2 + * calculate the coeffs. of the 4-th order action polynomial + * act = rr*x + vx2*(x2+y2) + vx4*(x2+y2)^2 + * => vx4 x^4 + (vx2 + 2*vx4*y2) x^2 + rr x == v0 + * => x^4 + [(vx2 + 2*vx4*y2)/vx4] x^2 + rr/vx4 x + * + [-v0/vx4] == 0 + */ + + b2 = beta2*vi + 2.0*y2; + b1 = -betaA*rp*vi; + b0 = -(b1*x + x2*(b2 + x2)); + + /* (x-x0)(x^3 + ax^2 + bx + c) = x^4 + b2 x^2 + b1 x + b0 */ + + a = x; + b = b2 + x2; + c = b1 + x*b; + + /* if (abs(1.0 + c/(b0/a)) .gt. 
1e-10) write(*,*)'c-errror',c,b0/a */ + + /* Now find the zeros of the 3-deg polynomial + * (x^3 + ax^2 + bx + c) + */ + + f1 = -sqr(a) + 3.0*b; + f1p3 = f1*f1*f1; + f2 = -2.0*a*a*a + 9.0*a*b; + f6 = f2 - 27.0*c; + f4 = 4.0*f1p3 + sqr(f6); + + if (f4 >= 0.0) { + + /* only one real solution exists now, this is all what is accepted */ + + f5 = sqrt(f4) + f6; + if (f5 > 0.0) { + f3 = pow(0.5*f5,((double)1.0)/((double)3.0)); + xn = (-a - f1/f3 + f3)*(1.0/3.0); + + /* Now accept/reject the update with the derivatives + * d[x^4 + b2 x^2 + b1 x + b0] = 4 x^3 + 2 b2 x + b1 + */ + + xa = x*(4.0*x2 + 2.0*b2) + b1; + xb = xn*(4.0*sqr(xn) + 2.0*b2) + b1; + pr = fabs(xa/xb); + + if (pr >= dran()) { + nhits++; + + /* generate new adj. -- now we have x and xn wrt. p => + * v <- v + (xn-x) \hat p + */ + + for (j=0; j<8; j++) { + ahiggs[i].l[j] += adjstaple[i].l[j] * ((xn-x)/rp); + } + + r2 = y2 + xn*xn; + } + + rsum += r2; /* cumulate r2-value */ + + } else { + printf(" *** OR branch 1, value of f4: %g f5: %g\n",f4,f5); + halt("****** OR stop"); + } + } else { + printf(" OR branch 2, value of f4: %g f5: %g\n",f4,f5); + halt("****** OR stop"); + } + } /* FORSOMEPARITY */ + + if (parity == EVEN) ahitax += 1.0*nhits/(node.evensites); + else ahitax += 1.0*nhits/(node.oddsites); + nhitax++; + + return(rsum); + +} + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/adjmat_operations.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/adjmat_operations.c new file mode 100644 index 0000000000000000000000000000000000000000..75620420cb610cf7e7c397810e3eb3bd3b9ed995 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/adjmat_operations.c @@ -0,0 +1,163 @@ + +/* + * void make_adjointmat( su3_matrix *m3, adjointmat *ah3) + * takes the hermitian and traceless part of su3_matrix + * in terms of generators + */ +#include "complex.h" +#include "su3.h" + +#define sqrt3 1.7320508075688772 + +void +compress_adjmat(m3,a3) + su3_matrix *m3; + adjoint_matrix *a3; +{ 
+ a3->l[0] = m3->e[0][0].real - m3->e[1][1].real; + a3->l[1] = (1.0/sqrt3)*(m3->e[0][0].real + + m3->e[1][1].real - 2.0*m3->e[2][2].real); + + a3->l[2] = m3->e[0][1].real + m3->e[1][0].real; + a3->l[3] = m3->e[0][1].imag - m3->e[1][0].imag; + a3->l[4] = m3->e[0][2].real + m3->e[2][0].real; + a3->l[5] = m3->e[0][2].imag - m3->e[2][0].imag; + a3->l[6] = m3->e[1][2].real + m3->e[2][1].real; + a3->l[7] = m3->e[1][2].imag - m3->e[2][1].imag; +}/* make_adjmat */ + + +/* + * void uncompress_adjointmat( su3_matrix *m3, adjointmat *ah3) + * takes the adjoint matrix and throws it in SU(3)-matrix + */ + +void +uncompress_adjmat(a3,m3) + su3_matrix *m3; + adjoint_matrix *a3; +{ + radix t; + + t = a3->l[1]*(1.0/sqrt3); + + m3->e[0][0].real = 0.5*(a3->l[0] + t); + m3->e[0][0].imag = 0.0; + m3->e[1][1].real = 0.5*(-a3->l[0] + t); + m3->e[1][1].imag = 0.0; + m3->e[2][2].real = -t; + m3->e[2][2].imag = 0.0; + + m3->e[0][1].real = m3->e[1][0].real = 0.5*a3->l[2]; + m3->e[0][1].imag = 0.5*a3->l[3]; + m3->e[1][0].imag = -0.5*a3->l[3]; + + m3->e[0][2].real = m3->e[2][0].real = 0.5*a3->l[4]; + m3->e[0][2].imag = 0.5*a3->l[5]; + m3->e[2][0].imag = -0.5*a3->l[5]; + + m3->e[1][2].real = m3->e[2][1].real = 0.5*a3->l[6]; + m3->e[1][2].imag = 0.5*a3->l[7]; + m3->e[2][1].imag = -0.5*a3->l[7]; +}/* uncmp_adjointmat */ + + +/* void make_adjointmat( su3_matrix *m3, adjointmat *ah3) + * takes the hermitian and traceless part of su3_matrix + * in terms of generators + */ + +void +make_adjointmat(m3,a3) + su3_matrix *m3; + adjoint_matrix *a3; +{ + compress_adjmat(m3,a3); +} + + +/****************************************************** + * + * adjoint arithmetics + * + *****************************************************/ + +void +add_adjmat(a,b,t) + adjoint_matrix *a,*b,*t; +{ + int i; + for (i=0; i<8; i++) t->l[i] = a->l[i] + b->l[i]; +} + + +void +adj_scalar_mul(a,s,t) + adjoint_matrix *a,*t; + double s; +{ + int i; + for (i=0; i<8; i++) t->l[i] = (s) * a->l[i]; +} + + +void 
+adj_scalar_mul_add(a,s,t) + adjoint_matrix *a,*t; + double s; +{ + int i; + for (i=0; i<8; i++) t->l[i] += (s) * a->l[i]; +} + +radix +adj_sqr(adjoint_matrix *a) +{ + int i; + radix f; + for (f=i=0; i<8; i++) f += a->l[i] * a->l[i]; + return(f); +} + +radix +adj_dot(adjoint_matrix *a,adjoint_matrix *b) +{ + int i; + radix f; + for (f=i=0; i<8; i++) f += a->l[i] * b->l[i]; + return(f); +} + + +void +mult_su3_ahiggs( su3_matrix *m, adjoint_matrix *a, adjoint_matrix *r ) +{ + su3_matrix tmat1,tmat2; + + uncompress_adjmat( a, &tmat1 ); + mult_su3_nn( m, &tmat1, &tmat2 ); + mult_su3_na( &tmat2, m, &tmat1 ); + compress_adjmat( &tmat1, r ); +} + +void +mult_adj_su3_ahiggs( su3_matrix *m, adjoint_matrix *a, adjoint_matrix *r ) +{ + su3_matrix tmat1,tmat2; + + uncompress_adjmat( a, &tmat1 ); + mult_su3_na( m, &tmat1, &tmat2 ); + mult_su3_nn( &tmat2, m, &tmat1 ); + compress_adjmat( &tmat1, r ); +} + + + + +void +mult_su3_by_I(su3_matrix *a, su3_matrix *b) +{ + int i,j; + + for(i=0;i<3;i++)for(j=0;j<3;j++) CMUL_I(a->e[i][j],b->e[i][j]); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/control.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/control.c new file mode 100644 index 0000000000000000000000000000000000000000..91c9261b512cc8bace7e2b8e46fdeb04828069be --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/control.c @@ -0,0 +1,230 @@ +/******************************************************************** + * * + * SU(3) adjoint Higgs field in 3d * + * * + * coupling constants: betag, x, y * + * action is always exp[-S] * + * * + * Kari Rummukainen, May 97 (MIMD) * + *******************************************************************/ + +#define CONTROL +#include "lattice.h" /* global variables for lattice fields */ + + +void runthis(int maxiters,int status); + +static int istimelimit = 0; + +/* PABS main -> kernel_b */ + +int +kernel_b() +{ + int kernel_number = 1; + int s_iteration; + int i,status; + time_t t; + double at; + + /* 
JuBE */ + /* call jube initial function */ + jube_kernel_init(&kernel_number); + + + t = time(NULL); + + /* Machine initialization first */ + initial_setup(); + + /* set up */ + status = setup(); + + /* allocate the variable fields */ + foralldir(i) U[i] = new_latfield( su3_matrix ); +#ifdef HIGGS + ahiggs = new_latfield( adjoint_matrix ); +#endif + + /* load in config, if it exists */ + load_config(status); + + /* Setup measurement etc files */ + setfiles((status == 0) && (iteration != 0)); + +#ifdef HIGGS + /* check if we want to do this multicanonically */ + setmulti(); +#endif + + /* timelimit, if it is used */ +/* PABS, no time limit */ +/* istimelimit = setup_timelimit(t,argc-1,argv[1]); */ + istimelimit = 0; + + fflush(stdout); + + /* first, set the Metropolis scales and thermalise */ + if (status >= 1) { + runthis(n_thermal,1); + } + + if (this_node == 0) { + printf(" - Time spent thermalising: %lg seconds\n",cputime()); + fflush(stdout); + } + + resettime(); /* reset the clock */ + + if (status > 0) timeu = timea = timerest = 0; + ahithb = nhithb = ahitu = nhitu = 0; +#ifdef HIGGS + ahitua = ahitax = ahitmc = ahitog = 0.0; + nhitua = nhitax = nhitmc = nhitog = 0; +#endif + s_iteration = iteration; + + /* JuBE */ + /* call jube run function */ + jube_kernel_run(); + + runthis(n_iteration,0); + + /* JuBE */ + /* call jube finalize function */ + jube_kernel_finalize(); + + + /* print the tail */ + + if (this_node == 0) { + printf("---------\n"); + + printf("Acceptances (after last start: %d iterations:\n", + n_iteration-s_iteration); + if (nhitu) + printf(" Kennedy-Pendleton for gauge: %g (%d sweeps)\n", + ahitu/nhitu,nhitu); +#ifdef HIGGS + if (nhitua) + printf(" Adjoint acceptance for gauge: %g (%d sweeps)\n", + ahitua/nhitua,nhitua); + if (nhitax) + printf(" X-overrelaxation for Higgs: %g (%d sweeps)\n", + ahitax/nhitax,nhitax); + if (nhitog) + printf(" Gaussian overrelaxation for Higgs: %g (%d sweeps)\n", + ahitog/nhitog,nhitog); + if (nhithb) + printf(" 
Heat bath for Higgs: %g (%d sweeps)\n", + ahithb/nhithb,nhithb); + if (nhitmc) + printf(" Multicanonical acceptance: %g (%d sweeps)\n", + ahitmc/nhitmc,nhitmc); +#endif + + at = timeu+timea+timerest; + printf("\nCpu times:\n"); + printf(" %9.1lf total time in seconds\n",at); + printf(" %9.3lf seconds for one cycle\n",at/n_iteration); + at = 1.0/((mc_steps+1) * n_iteration * lattice.volume); + printf(" %9.1lf seconds for su3 gauge field update\n",timeu); + printf(" %9.3lf microseconds/U/update\n",1e6*timeu*at/3); +#ifdef HIGGS + printf(" %9.1lf seconds for Higgs update\n",timea); + printf(" %9.3lf microseconds/Higgs/update\n",1e6*timea*at); + printf(" %9.1lf seconds for the rest\n",timerest); +#endif + +#ifdef check + print_check(); +#endif + at = cputime(); + printf("Resources:"); + printf(" Cpu: %lg\n",at); + t = time(NULL) - t; + printf(" Wallclock time %ld seconds, Cpu/Wall %lg\n",t,at/t); + + printf("#########\n"); + } +#ifdef MPI + report_comm_timers(); +/* MPI_Finalize(); */ +#endif + + /* JuBE */ + /* call jube finalize function */ + jube_kernel_end(); + + return 0; +} + + +/************************************************* + * do the whole run + */ + +void +runthis(int maxiters,int status) +{ + int meas,i; + + if (istimelimit) inittimecheck(); + + meas = (status == 0); + iteration ++; + for (; iteration <= maxiters; iteration++) { + for (i=0; i F + * + * 1/2 Tr(X UU F') + */ + + static int not_alloc = 1; + int z,zd,i,d,blev,nz; + double w,th,tk,tH,tK,tr2,tr3,tp,tap; + static double_complex *hz1[2][MAX_BOP]; + static double *hz0[2][MAX_BOP],*rz[2][MAX_BOP], *pz[2][MAX_BOP], *fz_array; + + nz = lattice.size[ZUP]; + + if (not_alloc) { + /* first, allocate the needed arrays and set pointers */ + + not_alloc = 0; + fz_array = (double *)calloc(nz*n_bop*(1*2*3 + 2*2*1),sizeof(double)); + for (i=0; i 1) { + free_latfield( b_higgs ); + foralldir(i) free_latfield( b_link[i] ); + } + + reset_blocking_level(); +} + 
+/**************************************************************** + * * + * Get the plaquette to ->staple * + * Symmetrize it to a clover form * + * * + ***************************************************************/ + +void +getclover(su3_matrix *b_link[NDIM], su3_matrix *clover) +{ + int i; + msg_tag *tag0,*tag1; + su3_matrix ta,tb,*tmpmat; + + /* gather up-links */ + + tmpmat = tmp_latfield( su3_matrix ); + + tag0 = start_get( b_link[YUP], XUP, EVENODD ); + tag1 = start_get( b_link[XUP], YUP, EVENODD ); + + /* multiply up-up -plaq */ + forallsites_wait2(i,tag0,tag1) { + mult_su3_nn( &b_link[XUP][i],&b_link[YUP][nb(XUP,i)], &ta); + mult_su3_na( &ta, &b_link[XUP][nb(YUP,i)], &tb); + mult_su3_na( &tb, &b_link[YUP][i], &clover[i] ); + /* shift it YUP too */ + mult_su3_an( &b_link[YUP][i], &tb, &tmpmat[i] ); + } + + /* move plaq YUP */ + tag0 = start_get( tmpmat, YDOWN, EVENODD ); + forallsites_wait(i,tag0) { + add_su3_matrix( &clover[i], &tmpmat[nb(YDOWN,i)], &clover[i] ); + } + + /* this can not be merged with the one above! 
*/ + /* prepare for XUP */ + forallsites(i) { + mult_su3_an( &b_link[XUP][i], &clover[i], &ta ); + mult_su3_nn( &ta, &b_link[XUP][i], &tmpmat[i] ); + } + + /* move XUP */ + tag0 = start_get( tmpmat, XDOWN, EVENODD ); + forallsites_wait(i,tag0) { + add_su3_matrix( &clover[i], &tmpmat[nb(XDOWN,i)], &clover[i] ); + } + + free_tmp( tmpmat ); +} + +/**************************************************************** + * * + * Calculate the blocked correlations * + * * + ***************************************************************/ + +void +Hvalues(int i, su3_matrix *clover, adjoint_matrix *b_higgs, + double *h10, double_complex *h11, double *h20, + double_complex *h21) +{ + /* this now calculates H_i = i eps_ijk Tr A0 U_jk + * assumes that clover is in ->staple, and + * adjoint higgs in b_higgs + */ + + su3_matrix u,a,a2; + double td,tr; + + td = pi2*((double)xcoord(i))/lattice.size[XUP]; + + /* calculate I (U - U') */ + su3_adjoint( &clover[i], &u); + sub_su3_matrix( &clover[i], &u, &a); + mult_su3_by_I( &a, &u ); + + uncompress_adjmat( &b_higgs[i], &a); + + /* get A0 U_12, this is real */ + + *h10 = tr = realtrace_su3( &a, &u); /* a' *u */ + h11->real = cos(td) * tr; + h11->imag = sin(td) * tr; + + /* and also A0^2 U_12, real */ + + mult_su3_nn( &a, &a, &a2); + *h20 = tr = realtrace_su3( &a2, &u); /* also a2' * u */ + h21->real = cos(td) * tr; + h21->imag = sin(td) * tr; +} + + +/**************************************************************** + * * + * A0^3 from ->b_higgs * + * * + ***************************************************************/ + +double +R3value(int i, adjoint_matrix *b_higgs) +{ + su3_matrix a,a2; + + uncompress_adjmat(&b_higgs[i], &a); + mult_su3_nn(&a, &a, &a2); + return((double)realtrace_su3( &a2, &a)); /* a2' a */ +} + + +/**************************************************************** + * * + * calculate plaq. correlations * + * uses ->staple ! 
* + * and A0 in -> b_higgs * + * * + ***************************************************************/ + +double +Pvalue(int i, double *ap, su3_matrix *clover, adjoint_matrix *b_higgs) +{ + int k; + double p; + su3_matrix a,p2; + + for (p=k=0; k<3; k++) p += clover[i].e[k][k].real; + + su3_adjoint(&clover[i], &a); + add_su3_matrix(&clover[i], &a, &p2); + uncompress_adjmat(&b_higgs[i], &a); + + *ap = (double)realtrace_su3( &p2, &a); /* p2' * a */ + return(p); +} + +/*************************************************************/ + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/gaugefix.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/gaugefix.c new file mode 100644 index 0000000000000000000000000000000000000000..3beb4feeff7185bec825dae13b0bee91a64c8279 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/gaugefix.c @@ -0,0 +1,356 @@ +/************************** gaugefix.c *******************************/ +/* Fix Coulomb or Lorentz gauge by doing successive SU(2) gauge hits */ +/* Uses double precision global sums */ +/* This version does automatic reunitarization at preset intervals */ +/* MIMD version 6 */ +/* C. DeTar 10-22-90 */ +/* T. DeGrand 1993 */ +/* U.M. Heller 8-31-95 */ +/* C. DeTar 10-11-97 converted to generic */ +/* C. DeTar 12-26-97 added automatic reunitarization */ +/* C. DeTar 11-24-98 remove superfluous references to p2 (was for ks phases) */ + +/* Heavily modified by Kari Rummukainen 2005-6 */ + +/* Prototype... + +void gaugefix(int gauge_dir,double relax_boost,int max_gauge_iter, + double gauge_fix_tol, suN_matrix gauge ); + + if gauge == NULL do not return the gauge + + ------------------------------------------------------------------- + + NOTE: For staggered fermion applications, it is necessary to remove + the KS phases from the gauge links before calling this procedure. + See "rephase" in setup.c. 
+ + ------------------------------------------------------------------- + EXAMPLE: Fixing only the link matrices to Coulomb gauge with scratch + space in mp (suN_matrix) and chi (suN_vector): + + gaugefix(TUP,1.5,500,1.0e-7,NULL); + + ------------------------------------------------------------------- + EXAMPLE: Fixing Coulomb gauge with respect to the y direction + in the staggered fermion scheme and simultaneously transforming + the pseudofermion fields and gauge-momenta involved in updating: + + rephase( OFF ); + gauge = new_latfield(suN_matrix); + gaugefix( YUP, 1.8, 500, 2.0e-6, gauge ); + vec_fix_gauge( gauge, g_rand, EVENODD ); + vec_fix_gauge( gauge, phi, EVEN ); + vec_fix_gauge( gauge, xxx, EVEN ); + free_latfield( gauge ); + rephase( ON ); + + ------------------------------------------------------------------- + + gauge_dir specifies the direction of the "time"-like hyperplane + for the purposes of defining Coulomb or Lorentz gauge + TUP for evaluating propagators in the time-like direction + ZUP for screening lengths. + -1 for Lorentz gauge + relax_boost Overrelaxation parameter + max_gauge_iter Maximum number of iterations + gauge_fix_tol Stop if change is less than this +*/ + +#include "lattice.h" +#define REUNIT_INTERVAL 50 + +#ifdef SU2 +DOES NOT WORK YET FOR SU2 +#endif + +typedef struct { complex e[2][2]; } su2_matrix; +#define Ncol 3 + +/* CDIF(a,b) a -= b */ + /* a -= b */ +#define CDIF(a,b) { (a).real -= (b).real; (a).imag -= (b).imag; } + +/* Useful routines, will be relocated */ + +void mult_su2_mat_vec_elem_a(su2_matrix *u,complex *x0,complex *x1) +{ + /* Multiplies the complex row spinor (x0, x1) by the adjoint of the */ + /* SU(2) matrix u and puts the result in (x0,x1). */ + /* Thus x <- x * u-adj */ + /* C. 
DeTar 3 Oct 1990 */ + + complex z0, z1, t0, t1; + + t0 = *x0; t1 = *x1; + + CMUL_J(t0, u->e[0][0], z0); + CMUL_J(t1, u->e[0][1], z1); + CADD(z0, z1, *x0); + CMUL_J(t0, u->e[1][0], z0); + CMUL_J(t1, u->e[1][1], z1); + CADD(z0, z1, *x1); + +} /* m_su2_mat_vec_a.c */ + + +void right_su2_hit_a(su2_matrix *u,int p,int q,su3_matrix *link) +{ + /* link <- link * u adj */ + /* The 0 column of u-adjoint matches column p of the SU(3) matrix */ + /* The 1 column of u-adjoint matches column q of the SU(3) matrix */ + /* C. DeTar 18 Oct 1990 */ + + register int m; + + for (m = 0; m < Ncol; m++) + mult_su2_mat_vec_elem_a(u, &(link->e[m][p]), &(link->e[m][q])); + +} /* r_su2_hit_a.c */ + + +/* Scratch space */ + +void accum_gauge_hit(int i, int gauge_dir, + su3_matrix *diffmat, su3_vector *sumvec ) +{ + + /* Accumulates sums and differences of link matrices for determining optimum */ + /* hit for gauge fixing */ + /* Differences are kept in diffmat and the diagonal elements of the sums */ + /* in sumvec */ + + register int j; + register su3_matrix *m1; + register int dir; + + /* Clear sumvec and diffmat */ + + clear_su3mat( diffmat ); + clearvec( sumvec ); + + /* Subtract upward link contributions */ + + foralldir(dir) if (dir != gauge_dir) { + int odir = opp_dir(dir); + + m1 = &(U[dir][i]); + sub_su3_matrix( diffmat, m1, diffmat); + /* Sum diagonal part */ + for(j=0; jc[j], m1->e[j][j] ); + + + /* Add downward link contributions */ + + m1 = &U[dir][nb(odir,i)]; + add_su3_matrix( diffmat, m1, diffmat ); + for(j=0; jc[j], m1->e[j][j] ); + } +} /* accum_gauge_hit */ + + +void do_hit(int gauge_dir, int parity, double relax_boost, su3_matrix *gauge ) +{ + /* Do optimum SU(2) gauge hit for p, q subspace */ + + double a0,a1,a2,a3,asq,a0sq,x,r,xdr; + int dir,i,p,q; + su2_matrix u; + su3_matrix diffmat; + su3_vector sumvec; + + /* Accumulate sums for determining optimum gauge hit - + * U's must have been fetched from down! 
*/ + + /* accum_gauge_hit( gauge_dir, parity, diffmat, sumvec); */ + + forparity(i,parity) for (p=0; p X |-- uf --> af + * + * 2 beta tr (a [u an u']) + * + * Note that [ ] is also 'adjoint!' + */ + + register int i,dir, odir, otherparity; + msg_tag *tag0,*tag1; + su3_matrix tmat1,tmat2; + adjoint_matrix tadj; + + /* Loop over directions, computing force from links */ + + otherparity = opp_parity( parity ); + foralldir(dir) { + odir = opp_dir(dir); + + /* start gather of up-adjoint link */ + + tag0 = start_get( ahiggs, dir, parity ); + + /* multiply adjoint here with up-link, for opp-parity */ + forparity(i,otherparity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[i+1] ); + uncompress_adjmat(&ahiggs[i],&tmat1); + mult_su3_an( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_nn( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &adjstaple[i] ); + } + + tag1 = start_get( adjstaple, odir, parity ); + + wait_get(tag0); + + /* multiply link with up-adjoint */ + if (dir == XUP) forparity(i,parity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[nb(dir,i+1)] ); + uncompress_adjmat( &ahiggs[nb(dir,i)], &tmat1); + mult_su3_nn( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_na( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &adjstaple[i] ); + } else forparity(i,parity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[nb(dir,i+1)] ); + prefetch_adjoint( &adjstaple[i+1] ); + uncompress_adjmat( &ahiggs[nb(dir,i)], &tmat1); + mult_su3_nn( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_na( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &tadj ); + add_adjmat( &adjstaple[i], &tadj, &adjstaple[i] ); + } + + wait_get(tag1); + forparity(i,parity) { + prefetch_adjoint( &adjstaple[i+1] ); + prefetch_adjoint( &adjstaple[nb(odir,i+1)] ); + add_adjmat( &adjstaple[i], &adjstaple[nb(odir,i)] , &adjstaple[i] ); + } + } +} + + +/****************************************************** + * needed for adjoint acceptance + * note-first 
su3_matrix contains the 'local' adjoint + * matrix uncompressed + * + * acc/rej with + * + * a |-- u --> an + * + * -2 beta tr (a u an u') + * + *****************************************************/ + +double act_gauge_adj(su3_matrix *a, su3_matrix *u,adjoint_matrix *b) +{ + su3_matrix am,m1,m2; + + uncompress_adjmat(b,&am); + mult_su3_nn(a,u,&m1); + mult_su3_na(u,&am,&m2); + return((-2.0)*betaA*realtrace_su3(&m1,&m2)); /* m2' * m1 */ +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/io_lattice.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/io_lattice.c new file mode 100644 index 0000000000000000000000000000000000000000..2f4dab17f12f06c182359d71e4ba7dae3d6ac208 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/io_lattice.c @@ -0,0 +1,46 @@ +/*********************** io_lattice.c *************************/ +/* This reads and writes a (binary) lattice + * + * THIS IS AN ENCAPSULATING FILE TO + * ../generic/io_lattice_generic.c + * HERE WE HAVE TO DEFINE + * + * typedef struct { } allfields; + * + * copy_fields(int site, allfields *s) copy from all latfields to s.(whatever) + * set_fields(allfields *s, int site) copy s.(stuff) to lattice fields + * + * #include "../generic/io_lattice_generic.c" + */ + +#include "lattice.h" + +typedef struct { + su3_matrix link[NDIM]; +#ifdef HIGGS + adjoint_matrix ahiggs; +#endif +} allfields; + +void set_fields( allfields *s, int i ) +{ + int dir; + + foralldir(dir) U[dir][i] = s->link[dir]; +#ifdef HIGGS + ahiggs[i] = s->ahiggs; +#endif +} + +void copy_fields( int i, allfields *s ) +{ + int dir; + + foralldir(dir) s->link[dir] = U[dir][i]; +#ifdef HIGGS + s->ahiggs = ahiggs[i]; +#endif +} + +#include "../generic/io_lattice_generic.c" + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/lattice.h b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/lattice.h new file mode 100644 index 0000000000000000000000000000000000000000..62708d36fcb7100d5b1e029cb2caf2b5b71c17f0 --- 
/dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/lattice.h @@ -0,0 +1,128 @@ +/****************************** lattice.h ********************************/ + +/* include file for SU3-adjoint Higgs program, version 2 + This file defines global scalars and the fields in the lattice. */ + +/* #define check */ + +#ifdef CONTROL +#define EXTERN +#else +#define EXTERN extern +#endif + +#define PI 3.14159265358979323846 +#define pi PI +#define pi2 (PI*2.0) + +#include +#include +#include +#include "complex.h" +#include "su3.h" +#include "comdefs.h" +#include "generic.h" +#include "generic_su3.h" + +#ifndef check +#define check_action(a) /* nothing */ +#endif + +#define MAX_BOP 5 /* max number of blockings */ + +/* The following are global scalars */ +EXTERN long seed; /* random number seed */ +EXTERN int mc_steps,n_measurement,n_save; +EXTERN int n_iteration,n_thermal,iteration; +EXTERN double betag; +#ifdef HIGGS +EXTERN double p_x,p_y,betaA,beta4,beta2,betay; +EXTERN int n_correlation,w_correlation; +#endif + +EXTERN double wvalue; /*for multicanonical */ +EXTERN double timeu,timea,timerest; +EXTERN double ahitu,ahitua,ahithb,ahitax,ahitmc,ahitog; /* hit*/ +EXTERN int nhitu,nhitua,nhithb,nhitax,nhitmc,nhitog; +EXTERN int meas_sync,corr_sync; + +#ifdef HIGGS +/* correlation function globals */ +EXTERN int corrlen,n_corr; +EXTERN int n_bop,n_blocking,b_level[MAX_BOP]; + +/* correlation function pointers */ +#define N_CORR 8 +EXTERN float *c_array; +EXTERN float *cr2[MAX_BOP],*cr3[MAX_BOP],*ch0[MAX_BOP],*ch1[MAX_BOP]; +EXTERN float *cH0[MAX_BOP],*cH1[MAX_BOP],*cp0[MAX_BOP],*cp1[MAX_BOP]; + +#define b_const_a1 0.2 +#define b_const_a2 (0.25*(1.0-b_const_a1)) +#define b_const_g1 0.334 +#define b_const_g2 (0.5*(1.0-b_const_g1)) + +#endif + +/***************************************************************** + * Field variables + */ + +EXTERN su3_matrix *U[NDIM]; +#ifdef HIGGS +EXTERN adjoint_matrix *ahiggs; +#endif + 
+/*****************************************************************/ + +#define confname "config" + +/* PABS replace status by kernel_B.input.status */ +#define statname "kernel_B.input.status" + +#define measurename "measure" +#define corrname "correl" +#define wlname "wloop" + +/* PABS replace beta by kernel_B.input.beta */ +#define betaname "kernel_B.input.beta" + +#define weightname "weight" + +/* PABS replace parameters by kernel_B.input.parameters */ +#define paramname "kernel_B.input.parameters" + +#ifndef T3E +#define prefetch_adjoint(x) /* nothing */ +#define prefetch_matrix(x) /* nothing */ +#endif + +void reunitarize(su3_matrix *link[NDIM]); +int setup(void); +void load_config(int status); +void updatehiggs(int isover); +void measure(); void writemeas(); void hcorr(); void writecorr(); +void setfiles(int restart); +void dumpall(int status,int * maxiters); +void updategauge(int isrelax); +void relax(int dir, int parity, su3_matrix *link[NDIM], su3_matrix *staple +#ifdef HIGGS + , su3_matrix *ac +#endif + ); +void monte(int dir, int parity, su3_matrix *link[NDIM], su3_matrix *staple +#ifdef HIGGS + , su3_matrix *ac +#endif + ); +void staples_su3(su3_matrix *link[NDIM], su3_matrix *staple, int dir1,int parity); +double Xoverrelax(int parity, adjoint_matrix *ahiggs, adjoint_matrix *astaple); +double HBHiggs(int parity, adjoint_matrix *ahiggs, adjoint_matrix *astaple); +double act_gauge_adj(su3_matrix *a, su3_matrix *u,adjoint_matrix *b); + +complex measure_ploop(su3_matrix *link[NDIM], int dir); + +void staple1(int i, int dir1, MATRIX *link[NDIM], MATRIX *staple) ; + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/measure.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/measure.c new file mode 100644 index 0000000000000000000000000000000000000000..074dff478a1785dc9a31bd0810145d9d9cc4de2f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/measure.c @@ -0,0 +1,458 @@ 
+/**************************************************************** + * MIMD SU3 - adjoint Higgs * + * * + * Measurement routines * + * Kari Rummukainen 1992 - 1996 * + * * + ***************************************************************/ + +#include "lattice.h" /* global variables for lattice fields */ + + +typedef struct { + int headerid,headersize; + int n_double,n_long,n_float,n_char; + int lx,ly,lz,lt; + int d1,d2,d3,d4,d5,d6,d7,d8; +} e_header; +#define E_HEADER_ID 91919191 + +static FILE *measfile, *corrfile; + +#ifdef HIGGS +#define m_plaq 1 +#define m_hopp 2 +#define m_a 3 +#define m_a2 4 +#define m_a3 5 +#define m_a4 6 +#define m_acty 7 +#define N_MEAS 8 +#elif NDIM == 3 +#define m_plaq 0 +#define N_MEAS 1 +#elif NDIM == 4 +#define m_plaq_s 0 +#define m_plaq_t 1 +#define ploop_r 2 +#define ploop_i 3 +#define N_MEAS 4 +#endif +double ma[N_MEAS]; + +void measure_plaq(); +void measure_higgs(); +int reposition(FILE *f,int nmeas); + +void +measure() +{ + int i; + complex ct; + + for (i=0; i F + * + * action is -1/2 beta Tr(X' UF) = -1/2 beta Tr(X F'U') + * + * Multiply and add RF*VF'*UF' + * + * GF: Now B,X,F = 1; + */ + + + msg_tag *tag[NDIM]; + int i,j,dir; + double r2; + su3_matrix a1,a2; + + /* start gathers of points up */ + foralldir(dir) tag[dir] = start_get( ahiggs, dir, EVENODD ); + + forallsites_waitA(i,tag,NDIM) { + uncompress_adjmat( &ahiggs[i], &a1); + + foralldir(dir) + ma[m_hopp] += act_gauge_adj(&a1, &U[dir][i], &ahiggs[nb(dir,i)]); + + for (r2=j=0; j<8; j++) r2 += sqr( ahiggs[i].l[j] ); + + ma[m_a2] += r2; + ma[m_a4] += r2*r2; + + ma[m_a] += sqrt(r2); + + /* and then A0^3 -- + * NOTE: now Tr(A^3)_cont = Tr(A^3)_latt * 2^(3/2) + * and below it is also divided by 3! 
+ */ + mult_su3_nn( &a1, &a1, &a2); + ma[m_a3] += realtrace_su3( &a2, &a1); /* this is a2' * a1, but a2 + is hermitean */ + } + + ma[m_acty] = ma[m_a2] * betay; + + ma[m_hopp] /= 3*lattice.volume*(2.0*betaA); + ma[m_a] /= lattice.volume; + ma[m_a2] /= lattice.volume; + ma[m_a3] /= 3*lattice.volume; + ma[m_a4] /= lattice.volume; +} + +#endif + +/************************************************************ + * write measurements + */ + +void +writemeas() +{ + int i; + + /* sum it */ + g_vecdoublesum(ma, N_MEAS, 0); + + if (this_node == 0) { +#ifdef HIGGS + if (is_multicanonical ) ma[0] = multi_weight(); +#endif + + meas_sync++; + i = (fwrite(ma,sizeof(double),N_MEAS,measfile) == N_MEAS); + if (i) i = (fwrite(&meas_sync,sizeof(int),1,measfile) == 1); + + if (!i) halt("Could not write measurement file"); + + if (meas_sync % 100 == 0) fflush(measfile); + } +} + +#ifdef HIGGS + +/************************************************************ + * write correlations + */ + +void +writecorr() +{ + int i,j; + + corr_sync++; + + if (this_node == 0) { + for (j=0; j 0) { + if (this_node == 0) fil = fopen(confname,"w"); + t = cputime(); + save_binary(fil); + } + + if(this_node == 0){ + printf("+"); + pm++; if (pm >= 20) { printf(" iteration %d\n",iteration); pm = 0; } + fflush(stdout); + if (n_save > 0) fclose(fil); + } + + if (this_node == 0) fil = fopen(statname,"r"); + restart = get_i(fil,"restart",-1); + nn_i = get_i(fil,"n_iteration",-1); + nn_t = get_i(fil,"n_thermal",-1); + if ((nn_i != n_iteration || nn_t != n_thermal)) { + printf0(" -> New limits:thermal %d, work %d\n",nn_t,nn_i); + n_iteration = nn_i; + n_thermal = nn_t; + + if (status == 1) *maxiters = n_thermal; + else *maxiters = n_iteration; + } + + if (this_node == 0) fclose(fil); + + if (this_node == 0) { + + /* flush the files .. 
*/ + /* if (fflush(measfile) != 0) halt(" FILE ERROR when flushing measurements"); + */ + /* if (fflush(corrfil) != 0) halt(" FILE ERROR when flushing correlations"); + */ + + fil = fopen(statname,"w"); + iseed = dran()*(1<<30); + + print_i(fil,"restart",1); /* write now restart */ + print_i(fil,"n_iteration",n_iteration); + print_i(fil,"n_thermal",n_thermal); + print_i(fil,"seed",iseed); + + print_i(fil,"run status",status); + print_i(fil,"iteration",iteration); + print_d(fil,"time: gauge",timeu); +#ifdef HIGGS + print_d(fil,"time: higgs",timea); +#endif + print_d(fil,"time: rest",timerest); + + fclose(fil); + +#ifdef HIGGS + if (is_mucacalc) writemuca(); +#endif + + fflush(stdout); + } +} + +#ifdef check + +/**************************************************************** + * * + * check the terms ... diagnostic routine * + * * + ***************************************************************/ + +static double car[N_MEAS],car2[N_MEAS]; +static int ii=0; + +int +check_action(int stat) +{ + static double arr[N_MEAS]; + int i; + + measure(); + if (stat == 0) { + for (i=0; i X |-- uf --> af + * + * 2 beta tr (a [u an u']) + * + * Note that [ ] is also 'adjoint!' 
+ */ + + register int i,dir, odir, otherparity; + msg_tag *tag0,*tag1; + su3_matrix tmat1,tmat2; + adjoint_matrix tadj; + + /* Loop over directions, computing force from links */ + + otherparity = opp_parity( parity ); + foralldir(dir) { + odir = opp_dir(dir); + + /* start gather of up-adjoint link */ + + tag0 = start_get( ahiggs, dir, parity ); + + /* multiply adjoint here with up-link, for opp-parity */ + forparity(i,otherparity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[i+1] ); + uncompress_adjmat(&ahiggs[i],&tmat1); + mult_su3_an( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_nn( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &adjstaple[i] ); + } + + tag1 = start_get( adjstaple, odir, parity ); + + wait_get(tag0); + + /* multiply link with up-adjoint */ + if (dir == XUP) forparity(i,parity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[nb(dir,i+1)] ); + uncompress_adjmat( &ahiggs[nb(dir,i)], &tmat1); + mult_su3_nn( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_na( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &adjstaple[i] ); + } else forparity(i,parity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[nb(dir,i+1)] ); + prefetch_adjoint( &adjstaple[i+1] ); + uncompress_adjmat( &ahiggs[nb(dir,i)], &tmat1); + mult_su3_nn( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_na( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &tadj ); + add_adjmat( &adjstaple[i], &tadj, &adjstaple[i] ); + } + + wait_get(tag1); + forparity(i,parity) { + prefetch_adjoint( &adjstaple[i+1] ); + prefetch_adjoint( &adjstaple[nb(odir,i+1)] ); + add_adjmat( &adjstaple[i], &adjstaple[nb(odir,i)] , &adjstaple[i] ); + } + } +} + + +/****************************************************** + * needed for adjoint acceptance + * note-first su3_matrix contains the 'local' adjoint + * matrix uncompressed + * + * acc/rej with + * + * a |-- u --> an + * + * -2 beta tr (a u an u') + * + 
*****************************************************/ + +double act_gauge_adj(su3_matrix *a, su3_matrix *u,adjoint_matrix *b) +{ + su3_matrix am,m1,m2; + + uncompress_adjmat(b,&am); + mult_su3_nn(a,u,&m1); + mult_su3_na(u,&am,&m2); + return((-2.0)*betaA*realtrace_su3(&m1,&m2)); /* m2' * m1 */ +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/monte.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/monte.c new file mode 100644 index 0000000000000000000000000000000000000000..d08ab03eeb39d6d21af5f03f2bdc792d4741dd17 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/monte.c @@ -0,0 +1,302 @@ +/************************** monte.c *******************************/ +/* Kennedy-Pendleton quasi heat bath on SU(2) subgroups */ +/* MIMD version 3 */ +/* T. DeGrand March 1991 */ +/* modified by K.R 97 & 2002 */ + +#include "lattice.h" + +#define Nc 3 + +/* Generic definitions - could be useful elsewhere */ + +typedef struct { complex e[2][2]; } su2_matrix; + +/* #pragma inline ( mult_su2_mat_vec_elem_n, left_su2_hit_n ) */ + +INLINE void mult_su2_mat_vec_elem_n(su2_matrix *u, + complex *x0, + complex *x1) +{ + /* Multiplies the complex column spinor (x0, x1) by the SU(2) matrix u */ + /* and puts the result in (x0,x1). */ + /* Thus x <- u * x */ + /* C. 
DeTar 3 Oct 1990 */ /* Parameters are declared in prototype form (previously a K&R identifier list) so that calls are type-checked. */ + + complex z0, z1, t0, t1; + + t0 = *x0; t1 = *x1; /* copy the inputs first: both outputs below depend on both inputs, so the update is safe in place */ + + CMUL(u->e[0][0], t0, z0); + CMUL(u->e[0][1], t1, z1); + CADD(z0, z1, *x0); + CMUL(u->e[1][0], t0, z0); + CMUL(u->e[1][1], t1, z1); + CADD(z0, z1, *x1); + +} /* mult_su2_mat_vec_elem_n */ + + + +/* void dumpsu2(u) su2_matrix *u; { + * int i,j; + * for(i=0;i<2;i++){ + * for(j=0;j<2;j++)printf("(%.2e,%.2e)\t", + * (double)u->e[i][j].real,(double)u->e[i][j].imag); + * printf("\n"); + * } + * printf("\n"); + *} + */ + +INLINE void left_su2_hit_n(su2_matrix *u, int p, int q, su3_matrix *link) +{ + /* link <- u * link */ + /* The 0 row of the SU(2) matrix u matches row p of the SU(3) matrix */ + /* The 1 row of the SU(2) matrix u matches row q of the SU(3) matrix */ + /* C. DeTar 18 Oct 1990 */ + + register int m; + + for (m = 0; m < 3; m++) + mult_su2_mat_vec_elem_n(u, &(link->e[p][m]), &(link->e[q][m])); + +} /* left_su2_hit_n */ + + + +void monte(int dir,int parity, + su3_matrix *link[NDIM], su3_matrix *staple +#ifdef HIGGS + , su3_matrix *a_uncmpr +#endif + ) +{ + /* Do K-P quasi-heat bath by SU(2) subgroups */ + int Nhit, index1, ina, inb,ii,cb; + int gahit,gatry,utry,uhit; + double xr1,xr2,xr3,xr4; + double a0,a1,a2,a3; + double v0,v1,v2,v3, vsq; + double h0,h1,h2,h3; + double r,r2,rho,z; + double al,d, xl,xd; + int k, nacd, test; + double b3; + register int i; + su3_matrix action; + su2_matrix h; + + Nhit = 3; + + b3=betag/3.0; + + gahit = gatry = 0; utry = uhit = 1; + + /* now for the qhb updating */ + for(index1=0;index1 inb) { ii=ina; ina=inb; inb=ii;} + + forparity(i,parity){ + mult_su3_na( &link[dir][i], &staple[i], &action ); + + /* decompose the action into SU(2) subgroups using + * Pauli matrix expansion + * The SU(2) hit matrix is represented as + * a0 + i * Sum j (sigma j * aj) + */ + v0 = action.e[ina][ina].real + action.e[inb][inb].real; + v3 = action.e[ina][ina].imag - action.e[inb][inb].imag; + v1 = action.e[ina][inb].imag + action.e[inb][ina].imag; + v2 = 
action.e[ina][inb].real - action.e[inb][ina].real; + + vsq = v0*v0 + v1*v1 + v2*v2 + v3*v3; + + if (vsq <= 0.0) { + printf("monte: vsq error! node %d, vsq %g\n",this_node,vsq); + fflush(stdout); + terminate(0); + } + + z = sqrt(vsq ); + /* Normalize u */ + v0 = v0/z; v1 = v1/z; v2 = v2/z; v3 = v3/z; + /* end norm check--trial SU(2) matrix is a0 + i a(j)sigma(j)*/ +/* test +if(this_node == 0)printf("v= %e %e %e %e\n",v0,v1,v2,v3); +if(this_node == 0)printf("z= %e\n",z); +*/ + /* now begin qhb */ + /* get four random numbers */ + + xr1 = log(1.0 - dran()); + xr2 = log(1.0 - dran()); + xr3 = dran(); + xr4 = dran(); + + xr3 = cos(pi2*xr3); + +/* + if(this_node == 0)printf("rand= %e %e %e %e\n",xr1,xr2,xr3,xr4); +*/ + + /* + generate a0 component of su3 matrix + + first consider generating an su(2) matrix h + according to exp(bg/3 * re tr(h*s)) + rewrite re tr(h*s) as re tr(h*v)z where v is + an su(2) matrix and z is a real normalization constant + let v = z*v. (z is 2*xi in k-p notation) + v is represented in the form v(0) + i*sig*v (sig are pauli) + v(0) and vector v are real + + let a = h*v and now generate a + rewrite beta/3 * re tr(h*v) * z as al*a0 + a0 has prob(a0) = n0 * sqrt(1 - a0**2) * exp(al * a0) + */ + al = b3*z; +/*if(this_node == 0)printf("al= %e\n",al);*/ + + /* + let a0 = 1 - del**2 + get d = del**2 + such that prob2(del) = n1 * del**2 * exp(-al*del**2) + */ + + d = -(xr2 + xr1*xr3*xr3)/al; + + /* monte carlo prob1(del) = n2 * sqrt(1 - 0.5*del**2) + then prob(a0) = n3 * prob1(a0)*prob2(a0) + */ + + /* now beat each site into submission */ + nacd = 0; + if ((1.00 - 0.5*d) > xr4*xr4) nacd=1; + if(nacd == 0 && al > 2.0) { /* k-p algorithm */ + test=0; + for(k=0; k<20 && !test;k++) { + /* get four random numbers */ + xr1 = log(1.0 - dran()); + xr2 = log(1.0 - dran()); + xr3 = dran(); + xr4 = dran(); + + xr3 = cos(pi2*xr3); + + d = -(xr2 + xr1*xr3*xr3)/al; + if ((1.00 - 0.5*d) > xr4*xr4) test = 1; + } + utry += k; + uhit++; + + if (this_node == 0 && test != 
1) + printf("site took 20 kp hits\n"); + } /* endif nacd */ + + if(nacd == 0 && al <= 2.0) { + /* creutz algorithm */ + xl=exp((double)(-2.0*al)); + xd= 1.0 - xl; + test=0; + for(k=0;k<20 && test == 0 ;k++) { + /* get two random numbers */ + xr1=dran(); + xr2=dran(); + + r = xl + xd*xr1; + a0 = 1.00 + log((double)r)/al; + if((1.0 -a0*a0) > xr2*xr2) test = 1; + } + d = 1.0 - a0; + utry += k; + uhit++; + + if(this_node == 0 && test !=1) + printf("site took 20 creutz hits\n"); + } /* endif nacd */ + + /* generate full su(2) matrix and update link matrix*/ + + /* find a0 = 1 - d*/ + a0 = 1.0 - d; + /* compute r */ + r2 = 1.0 - a0*a0; + r2 = fabs(r2); + r = sqrt(r2); + + /* compute a3 */ + a3=(2.0*dran() - 1.0)*r; + + prefetch_matrix(&a_uncmpr[i] ); + prefetch_adjoint(&ahiggs[nb(dir,i)]); + + /* compute a1 and a2 */ + rho = r2 - a3*a3; + rho = fabs(rho); + rho = sqrt(rho); + + /*xr2 is a random number between 0 and 2*pi */ + xr2 = pi2*dran(); + a1 = rho*cos((double)xr2); + a2 = rho*sin((double)xr2); + + /* now do the updating. h = a*v^dagger, new u = h*u */ + h0 = a0*v0 + a1*v1 + a2*v2 + a3*v3; + h1 = a1*v0 - a0*v1 + a2*v3 - a3*v2; + h2 = a2*v0 - a0*v2 + a3*v1 - a1*v3; + h3 = a3*v0 - a0*v3 + a1*v2 - a2*v1; + + /* Elements of SU(2) matrix */ + + h.e[0][0] = cmplx( h0, h3); + h.e[0][1] = cmplx( h2, h1); + h.e[1][0] = cmplx(-h2, h1); + h.e[1][1] = cmplx( h0,-h3); + + /* update the link */ + +#ifdef HIGGS + action = link[dir][i]; + left_su2_hit_n(&h,ina,inb,&action); + + /* remember: tmpmat contains uncompressed local adj. 
*/ + + a1 = act_gauge_adj(&a_uncmpr[i],&link[dir][i],&ahiggs[nb(dir,i)]); + a2 = act_gauge_adj(&a_uncmpr[i],&action, &ahiggs[nb(dir,i)]); + + prefetch_matrix(&link[dir][i+1]); + prefetch_matrix(&staple[i+1]); + + if (exp(a1-a2) >= dran()) { + link[dir][i] = action; + gahit++; + } + gatry++; + +#else + + left_su2_hit_n(&h,ina,inb,&link[dir][i]); + +#endif + + } /* site */ + } /* hits */ + +#ifdef HIGGS + nhitua++; + ahitua += 1.0*gahit/gatry; +#endif + + nhitu++; + ahitu += 1.0*uhit/utry; + +} /* monte */ + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/multican.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/multican.c new file mode 100644 index 0000000000000000000000000000000000000000..fb77cc8d41878557aab8c99bb3b1204bdd52103d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/multican.c @@ -0,0 +1,11 @@ +/******** multican.c *********/ +/* MIMD code version 3 */ + +#include "lattice.h" + +typedef adjoint_matrix multi_type; /* defines type of MC field */ +#define multi_field ahiggs /* defines MC field */ +#define multi_order(i) adj_sqr( &ahiggs[i] ) /* defines MC order param. */ + +#include "../generic/multican_generic.c" /* textual include of a .c file: the generic multicanonical code is compiled in this unit, presumably against the three definitions above */ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/ploop.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/ploop.c new file mode 100644 index 0000000000000000000000000000000000000000..d9c1f38a3ae8f0f10e60260303b20c0bd7b4d333 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/ploop.c @@ -0,0 +1,43 @@ +/**************************************************************** + * MIMD SU3 + * * + * Measure the polyakov loop + * * + ***************************************************************/ + +#include "lattice.h" /* global variables for lattice fields */ + +complex measure_ploop(su3_matrix *U[NDIM], int dir) +{ + su3_matrix ploop,tm; + int i,j,loc,nt; + complex sum,ct; + + /** THIS SIMPLE VERSION WORKS ONLY IF THE dir-DIRECTION + * FITS COMPLETELY WITHIN ONE NODE. 
THUS, + */ + + sum = cmplx(0.0,0.0); + + if (node.nodesize[dir] != lattice.size[dir]) + halt(" PLOOP:: lattice size error!"); + + nt = lattice.size[dir]; + + /* Now, multiply all dir-links */ + forallsites(i) if (coordinate(i,dir) == 0) { + ploop = U[dir][i]; + loc = i; + for (j=1; je[m][p]), &(link->e[m][q])); + +} /* r_su2_hit_a.c */ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/relax.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/relax.c new file mode 100644 index 0000000000000000000000000000000000000000..d53d34bf788cfebadf7b184821c4216f7d7af2c8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/relax.c @@ -0,0 +1,109 @@ +/************************** relax.c *******************************/ +/* Microcanonical overrelaxation by doing successive SU(2) gauge hits */ +/* MIMD version 3 */ +/* T. DeGrand March 1991 */ +/* Heavily modified K.R. 97 & 2002 */ + +#include "lattice.h" + +#define Nc 3 + +#ifdef T3E +#define prefetch +#endif + +typedef struct { complex e[2][2]; } su2_matrix; + +/* Codes for interpreting selection of gauge fixing options */ + +void left_su2_hit_n(su2_matrix *u, int p, int q, su3_matrix *link); + +void relax(int dir,int parity, + su3_matrix *link[NDIM], su3_matrix *staple +#ifdef HIGGS + , su3_matrix *a_uncmpr +#endif + ) +{ + /* Do overrelaxation by SU(2) subgroups */ + int Nhit,index1, ina, inb,ii; + int gahit,gatry; + double a0,a1,a2,a3,asq,r; + register int i; + su3_matrix action; + su2_matrix u; + + Nhit = 3; + + gahit = gatry = 0; + + /* now for the overrelaxed updating */ + forparity(i,parity) { + prefetch_matrix(&a_uncmpr[i]); + prefetch_matrix(&(link[dir][i+1])); + prefetch_matrix(&(staple[i+1])); + + for(index1=0;index1 inb) { ii=ina; ina=inb; inb=ii;} + + mult_su3_na( &(link[dir][i]), &(staple[i]), &action ); + + /* decompose the action into SU(2) subgroups using Pauli matrix + * expansion + * The SU(2) hit matrix is represented as + * a0 + i * Sum j (sigma j * aj) + */ + a0 = 
action.e[ina][ina].real + action.e[inb][inb].real; + a3 = action.e[ina][ina].imag - action.e[inb][inb].imag; + a1 = action.e[ina][inb].imag + action.e[inb][ina].imag; + a2 = action.e[ina][inb].real - action.e[inb][ina].real; + + /* Normalize and complex conjugate u */ + asq = a0*a0 + a1*a1 + a2*a2 + a3*a3; + r = sqrt( asq ); /* NOTE(review): unlike monte(), there is no guard against asq == 0 before the divisions on the next line */ + a0 = a0/r; a1 = -a1/r; a2 = -a2/r; a3 = -a3/r; + /* Elements of SU(2) matrix */ + + u.e[0][0] = cmplx( a0, a3); + u.e[0][1] = cmplx( a2, a1); + u.e[1][0] = cmplx(-a2, a1); + u.e[1][1] = cmplx( a0,-a3); + + /* Do SU(2) hit on all links twice (to overrelax) */ + +#ifdef HIGGS + + action = link[dir][i]; + left_su2_hit_n(&u,ina,inb,&action); + left_su2_hit_n(&u,ina,inb,&action); + + /* remember: tmpmat contains uncompressed local adj. */ + a1 = act_gauge_adj(&a_uncmpr[i], &link[dir][i], &ahiggs[nb(dir,i)]); + a2 = act_gauge_adj(&a_uncmpr[i], &action, &ahiggs[nb(dir,i)]); + + if (exp(a1-a2) >= dran()) { + link[dir][i] = action; + gahit++; + } + gatry++; + +#else + + left_su2_hit_n(&u,ina,inb,&link[dir][i]); + left_su2_hit_n(&u,ina,inb,&link[dir][i]); + +#endif + + } /* hits (inner loop over SU(2) subgroups) */ + } /* sites (outer forparity loop) */ + +#ifdef HIGGS + nhitua++; + ahitua += 1.0*gahit/gatry; /* NOTE(review): gatry is still 0 if the site loop never ran, which makes this 0/0 */ +#endif + +} /* relax */ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setcouplings_gauge.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setcouplings_gauge.c new file mode 100644 index 0000000000000000000000000000000000000000..7a88952e184fdd62d222784af489b33dee09d557 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setcouplings_gauge.c @@ -0,0 +1,15 @@ +/******** setcouplings_gauge.c *********/ +/* MIMD code version 3 */ + +#include "lattice.h" + +/* Each node has a params structure for passing simulation parameters */ + +/* Pure-gauge variant of setcouplings(): nothing is derived, node 0 just echoes the input coupling betag. */ +void setcouplings() +{ + if (this_node == 0) + printf(" Input couplings: betag %.8g\n",betag); + +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setcouplings_higgs.c 
b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setcouplings_higgs.c new file mode 100644 index 0000000000000000000000000000000000000000..64e0e0a7afa549b3f7ad398148d831481fbde9cb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setcouplings_higgs.c @@ -0,0 +1,62 @@ +/******** setcouplings_higgs.c *********/ +/* MIMD code version 3 */ + +#include "lattice.h" + +/* Each node has a params structure for passing simulation parameters */ + +#define Sigma 3.1759115 /* numerical constant used in the beta2 expression below */ +/* #define improve_y */ /* when defined, p_y is rescaled before the couplings are computed */ + + +/* Adjoint-Higgs variant of setcouplings(): derives the lattice couplings betaA, beta4, beta2 and betay from the inputs betag, p_x and p_y. Overwrites the global p_x (and p_y under improve_y) with the improved value; node 0 echoes the numbers. */ +void setcouplings() +{ + if (this_node == 0) + printf(" Input couplings: betag %.8g x %.8g y %.8g\n",betag,p_x,p_y); + + /* Improved x: from eq. (2.8) in JHEP11 (1998) 011 + */ + +#ifdef improve_y + p_y *= 1 + (2.574608+0.72985*p_x) / betag; +#endif + +#define improve_x /* defined unconditionally right here, so the improved-x branch below is always the one compiled */ +#ifdef improve_x + p_x = p_x + ( 0.328432 - 0.835282 * p_x + 1.167759 * sqr(p_x) ) / betag; + if (this_node == 0) + printf(" USING IMPROVED x\n"); +#else + if (this_node == 0) + printf(" USING NON-IMPROVED x\n"); +#endif + + /* and calculate the lattice couplings: + * NOTE: This normalizes A0 -> sqrt(2) A0, compared to paper + * thus, now h^a h^a = 2 Tr h^2 = Tr A_cont + * AND (h^a h^a)^2 = (2 Tr h^2)^2 = (Tr A_cont)^2 + * + * Now Tr h^3 = 2^(-3/2) Tr A_cont^3 (note also extra 3 in the measurement) + */ + + betaA = 12/betag; + beta4 = p_x * 1.5 * sqr(betaA)/betag; + beta2 = 3*betaA * (1 + 6*p_y/sqr(betag) + - (6 + 10*p_x)*Sigma/(4*pi*betag) + - 6/(16*sqr(pi*betag)) * + ((60*p_x - 20*sqr(p_x))*(log(betag) + 0.08849) + + 34.768*p_x + 36.130)); + + betay = 3*betaA * 6/sqr(betag); + + if (this_node == 0) { /* NOTE(review): p_x (and p_y under improve_y) were modified above, so the values printed under the "Non-improved" label are the improved ones - confirm the label */ + printf(" Non-improved couplings: betag %.8g x %.8g y %.8g\n",betag,p_x,p_y); + printf(" Other lattice couplings: betaA %.8g beta2 %.8g beta4 %.8g\n", + betaA,beta2,beta4); + + printf(" OUTPUT NORMALIZATION: #5 = Tr A0^2, #7 = (Tr A0^2)^2\n"); + + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setup.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setup.c new 
file mode 100644 index 0000000000000000000000000000000000000000..271e21a819aa144528ef7b1581b473824848466e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/setup.c @@ -0,0 +1,236 @@ +/******** setup.c *********/ +/* MIMD code version 3 */ + +#include "lattice.h" + +/* Each node has a params structure for passing simulation parameters */ + +void load_config(int status); +void coldlat(); void hotlat(); +void setcouplings(); + +/* SETUP ROUTINES */ /* setup(): print the banner on node 0, read the lattice size from the parameter file, initialise the layout, then read the remaining run parameters */ + +int setup() +{ + int i,rstatus,restart,j; + int nx,ny,nz,nt,size[NDIM]; + FILE *fil; /* opened on node 0 only; get_i() is nevertheless called on every node - presumably it reads on node 0 and distributes the value */ + + /* On node zero, read lattice size, seed, nflavors and send to others */ + /* print banner */ + if (this_node == 0) { + printf("--------------------------------------\n"); +#ifdef HIGGS /* was `Higgs`; every other conditional in this code tests HIGGS, so a -DHIGGS build printed the pure-gauge banner */ + printf("SU3 + adjoint Higgs in %d dimensions\n",NDIM); +#else + printf("SU3 gauge in %d dimensions\n",NDIM); +#endif + printf("Based on MILC MIMD version 3\n"); + printf("Machine = %s, with %d nodes\n",machine_type(),numnodes()); + printf("Overrelaxed/quasi-heat bath algorithm\n"); + + if ((fil = fopen(paramname,"r")) == NULL) halt(" ** No parameter file?"); + printf("\n READING LATTICE SIZE FROM PARAMETER FILE:\n"); + } + + nx = size[XUP] = get_i(fil,"nx",1); + ny = size[YUP] = get_i(fil,"ny",1); + nz = size[ZUP] = get_i(fil,"nz",1); +#if NDIM == 4 + nt = size[TUP] = get_i(fil,"nt",1); +#endif + + /************************************************** + * Initialize the layout and gather functions + */ + setup_lattice(size); + + printf0("\n READING REST OF THE PARAMETER FILE:\n"); + + mc_steps = get_i(fil,"micro steps",1); + n_measurement = get_i(fil,"n_measurement",1); +#ifdef HIGGS + n_correlation = get_i(fil,"n_correlation",1); + w_correlation = get_i(fil,"w_correlation",1); +#endif + n_save = get_i(fil,"n_save",1); + +#ifdef HIGGS + n_blocking = get_i(fil,"blocking levels",1); + for (j=n_bop=0; ja_uncmp */ + forparity(i,parity) { + prefetch_adjoint(ahiggs+2+i); + uncompress_adjmat(&(ahiggs[i]),&(a_uncmpr[i])); + } +#endif + + 
/* compute the gauge force */ + staples_su3(U, staple, dir, parity); /* result goes to ->staple */ + +#ifdef HIGGS + wait_get(tag); /* wait for ahiggs from up */ + if (isrelax) relax(dir, parity, U, staple, a_uncmpr); + else monte(dir, parity, U, staple, a_uncmpr); +#else + if (isrelax) relax(dir, parity, U, staple ); + else monte(dir, parity, U, staple ); +#endif + + } + +#ifdef HIGGS + free_tmp( a_uncmpr ); +#endif + free_tmp( staple ); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/updatehiggs.c b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/updatehiggs.c new file mode 100644 index 0000000000000000000000000000000000000000..3a6648363101090a6a3ca29c67d478ee3ab9d19a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_B/su3h_n/updatehiggs.c @@ -0,0 +1,129 @@ +/********************* updatehiggs.c ************************/ + +#include "lattice.h" + +void get_adjstaple(int parity, adjoint_matrix *as); +/* updatehiggs(): one update pass over both parities of the adjoint Higgs field. For each parity the adjoint staple is rebuilt, then the field is updated by overrelaxation (relax != 0) or heat bath (relax == 0). */ +void updatehiggs(int relax) +{ + int parity; + double rtot; + adjoint_matrix *adjstaple; + + adjstaple = tmp_latfield( adjoint_matrix ); /* scratch lattice field, released by free_tmp() below */ + + forbothparities(parity) { + + if (is_multicanonical) set_mc_update(parity); + + get_adjstaple(parity, adjstaple); + + /* Here Goverrelax or Xoverrelax */ /* NOTE(review): lattice.h declares Xoverrelax and HBHiggs but not Goverrelax - make sure a prototype returning double is visible here */ + if (relax) rtot = Goverrelax(parity, ahiggs, adjstaple); + else rtot = HBHiggs(parity, ahiggs, adjstaple); + + if (is_multicanonical) mc_acceptance(parity, rtot); + } + + free_tmp( adjstaple ); +} + + +/************************************************************* + * * + * calculate the gauge + adjoint Higgs link * + * * + ************************************************************/ + +void get_adjstaple(int parity, adjoint_matrix *adjstaple) +{ + /* calculate the adjoint-gauge link action ('staple') + * + * ab |-- ub --> X |-- uf --> af + * + * 2 beta tr (a [u an u']) + * + * Note that [ ] is also 'adjoint!' 
+ */ + + register int i,dir, odir, otherparity; + msg_tag *tag0,*tag1; + su3_matrix tmat1,tmat2; + adjoint_matrix tadj; + + /* Loop over directions, computing force from links */ + + otherparity = opp_parity( parity ); + foralldir(dir) { + odir = opp_dir(dir); + + /* start gather of up-adjoint link */ + + tag0 = start_get( ahiggs, dir, parity ); + + /* multiply adjoint here with up-link, for opp-parity */ /* runs while the tag0 gather is in flight; the result is parked in adjstaple[] on the opposite-parity sites */ + forparity(i,otherparity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[i+1] ); + uncompress_adjmat(&ahiggs[i],&tmat1); + mult_su3_an( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_nn( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &adjstaple[i] ); + } + + tag1 = start_get( adjstaple, odir, parity ); /* fetch those parked terms from the odir neighbours; collected after wait_get(tag1) below */ + + wait_get(tag0); + + /* multiply link with up-adjoint */ /* the XUP branch assigns adjstaple[i], every other direction adds to it, so no separate zeroing pass is used */ + if (dir == XUP) forparity(i,parity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[nb(dir,i+1)] ); + uncompress_adjmat( &ahiggs[nb(dir,i)], &tmat1); + mult_su3_nn( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_na( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &adjstaple[i] ); + } else forparity(i,parity) { + prefetch_matrix( &U[dir][i+1] ); + prefetch_adjoint( &ahiggs[nb(dir,i+1)] ); + prefetch_adjoint( &adjstaple[i+1] ); + uncompress_adjmat( &ahiggs[nb(dir,i)], &tmat1); + mult_su3_nn( &U[dir][i], &tmat1, &tmat2 ); + mult_su3_na( &tmat2, &U[dir][i], &tmat1 ); + compress_adjmat( &tmat1, &tadj ); + add_adjmat( &adjstaple[i], &tadj, &adjstaple[i] ); + } + + wait_get(tag1); /* the odir-neighbour terms are now available through nb(odir,i) */ + forparity(i,parity) { + prefetch_adjoint( &adjstaple[i+1] ); + prefetch_adjoint( &adjstaple[nb(odir,i+1)] ); + add_adjmat( &adjstaple[i], &adjstaple[nb(odir,i)] , &adjstaple[i] ); + } + } +} + + +/****************************************************** + * needed for adjoint acceptance + * note-first su3_matrix contains the 'local' adjoint + * matrix uncompressed + * + * acc/rej with + * + * a |-- u --> an + * + * -2 beta tr (a u an u') + * + 
*****************************************************/ + +double act_gauge_adj(su3_matrix *a, su3_matrix *u,adjoint_matrix *b) +{ + su3_matrix am,m1,m2; /* am: b uncompressed; m1, m2: the two products fed to the trace */ + + uncompress_adjmat(b,&am); + mult_su3_nn(a,u,&m1); /* presumably m1 = a * u (MILC naming: n = as is, a = Hermitian adjoint) - confirm against su3.h */ + mult_su3_na(u,&am,&m2); /* presumably m2 = u * am' */ + return((-2.0)*betaA*realtrace_su3(&m1,&m2)); /* m2' * m1 */ +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/CHANGELOG b/qcd/part_cpu/applications/QCD/src/kernel_C/CHANGELOG new file mode 100644 index 0000000000000000000000000000000000000000..4202f06ea9f1a71632e3c5b9012fcd06f26c618c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/CHANGELOG @@ -0,0 +1,8 @@ + +22. September 2008 + +DD-HMC-BM-1.0: Initial release + +15. April 2016 + +kernel_C : Updated to openQCD-1.4-bgopt \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/COPYING b/qcd/part_cpu/applications/QCD/src/kernel_C/COPYING new file mode 100644 index 0000000000000000000000000000000000000000..7a8e8abfd0057f374fbf59076c263f1f5d685b73 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. 
+ + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. 
+ + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. 
You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..052a9fa61a736a1cc30f4287ff47679e93327308 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/Makefile @@ -0,0 +1,2 @@ +kernel: + cd main && gmake kernel \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/Makefile.defs.in b/qcd/part_cpu/applications/QCD/src/kernel_C/Makefile.defs.in new file mode 100644 index 0000000000000000000000000000000000000000..809981d6ecb398371b9164abf11e469e880de0a5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/Makefile.defs.in @@ -0,0 +1,12 @@ +CC = #MPI_CC# +CFLAGS = #CFLAGS# + +SHELL = #SHELL# + +AR = #AR# +ARFLAGS = #ARFLAGS# + +LD = #LD# +LDFLAGS = #LDFLAGS# + +RM = #RM# \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/README b/qcd/part_cpu/applications/QCD/src/kernel_C/README new file mode 100644 index 0000000000000000000000000000000000000000..c97f5b5e6c58021403d0a9943b8c214baa0a1911 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/README @@ -0,0 +1,209 @@ + +################################################################################ + + QCD SPEED TESTS + +################################################################################ + + +This document is a short guide to get started and run the speed tests. For +more detailed information see the README.extended. + + +PROGRAMS + +The benchmark programs are provided in source form and must be +compiled by the user on the machine that is to be tested. + +In addition the openQCD-1.4 package is needed. A tar-file of the +source code can be obtained from + +http://luscher.web.cern.ch/luscher/openQCD/ + +and should be extracted at the same directory level as this package. 
+ +PROGRAM FEATURES + +All programs parallelize in 0,1,2,3 or 4 dimensions, depending on what is +specified at compilation time. They are highly optimized for machines with +current Intel or AMD processors, but will run correctly on any system that +complies with the ISO C89 (formerly ANSI C) and the MPI 1.2 standards. + +For the purpose of testing and code development, the programs can also +be run on a desktop or laptop computer. All that is needed for this is +a compliant C compiler and a local MPI installation such as Open MPI. + + +DOCUMENTATION + +The simulation program has a modular form, with strict prototyping and a +minimal use of external variables. Each program file contains a small number +of externally accessible functions whose functionality is described at the top +of the file. + +The data layout is explained in various README files and detailed instructions +are given on how to run the main programs. A set of further documentation +files are included in the doc directory, where the normalization conventions, +the chosen algorithms and other important program elements are described. + + +COMPILATION + +The compilation of the programs requires an ISO C89 compliant compiler and a +compatible MPI installation that complies with the MPI standard 1.2 (or later). + +In the main and devel directories, a GNU-style Makefile is included which +compiles and links the programs (just type "make" to compile everything; "make +clean" removes the files generated by "make"). The compiler options can be set +by editing the CFLAGS line in the Makefiles. + +The Makefiles assume that the following environment variables are set: + + GCC GNU C compiler command [Example: /usr/bin/gcc]. + + MPI_HOME MPI home directory [Example: /usr/lib64/mpi/gcc/openmpi]. + The mpicc command used is the one in $MPI_HOME/bin/mpicc and + the MPI libraries are expected in $MPI_HOME/lib. + + MPI_INCLUDE Directory where the mpi.h file is to be found. 
+ +All programs are then compiled using the $MPI_HOME/bin/mpicc command. The +compiler options that can be set in the CFLAGS line depend on which C compiler +the mpicc command invokes (the GCC compiler command is only used to resolve +the dependencies on the include files). + + +SSE/AVX ACCELERATION + +Current Intel and AMD processors are able to perform arithmetic operations on +short vectors of floating-point numbers in just one or two machine cycles, +using SSE and/or AVX instructions. The arithmetic performed by these +instructions fully complies with the IEEE 754 standard. + +Many programs in the module directories include SSE and AVX inline-assembly +code. On 64bit systems, and if the GNU or Intel C compiler is used, the code +can be activated by setting the compiler flags -Dx64 and -DAVX, respectively. +In addition, SSE prefetch instructions will be used if one of the following +options is specified: + + -DP4 Assume that prefetch instructions fetch 128 bytes at a time + (Pentium 4 and related Xeons). + + -DPM Assume that prefetch instructions fetch 64 bytes at a time + (Athlon, Opteron, Pentium M, Core, Core 2 and related Xeons). + + -DP3 Assume that prefetch instructions fetch 32 bytes at a time + (Pentium III). + +These options have an effect only if -Dx64 or -DAVX is set. The option +-DAVX implies -Dx64. + +On recent x86-64 machines with AMD Opteron or Intel Xeon processors, for +example, the recommended compiler flags are + + -std=c89 -O -mno-avx -DAVX -DPM + +For older machines that do not support the AVX instruction set, the +recommended flags are + + -std=c89 -O -mno-avx -Dx64 -DPM + +More aggressive optimization levels such as -O2 and -O3 tend to have little +effect on the execution speed of the programs, but the risk of generating +wrong code is higher. + +AVX instructions and the option -mno-avx may not be known to old versions +of the compilers, in which case one is limited to SSE accelerations with +option string -std=c89 -O -Dx64 -DPM. 
+ + +DEBUGGING FLAGS + +For troubleshooting and parameter tuning, it may be helpful to switch on some +debugging flags at compilation time. The simulation program then prints a +detailed report to the log file on the progress made in the specified subprogram. + +The available flags are: + +-DCGNE_DBG CGNE solver. + +-DFGCR_DBG GCR solver. + +-DFGCR4VD_DBG GCR solver for the little Dirac equation. + +-DMSCG_DBG MSCG solver. + +-DDFL_MODES_DBG Deflation subspace generation. + +-DMDINT_DBG Integration of the molecular-dynamics equations. + +-DRWRAT_DBG Computation of the rational function reweighting + factor. + + +RUNNING A SIMULATION + +The simulation programs reside in the directory "main". For each program, +there is a README file in this directory which describes the program +functionality and its parameters. + +Running a simulation for the first time requires its parameters to be chosen, +which tends to be a non-trivial task. The syntax of the input parameter files +and the meaning of the various parameters is described in some detail in +main/README.infiles and doc/parms.pdf. Examples of valid parameter files are +contained in the directory main/examples. + + +EXPORTED FIELD FORMAT + +The field configurations generated in the course of a simulation are written +to disk in a machine-independent format (see modules/misc/archive.c). +Independently of the machine endianness, the fields are written in little +endian format. A byte-reordering is therefore not required when machines with +different endianness are used for the simulation and the physics analysis. + + +AUTHORS + +The initial release of the openQCD package was written by Martin Lüscher and +Stefan Schaefer. Support for Schrödinger functional boundary conditions was +added by John Bulava. Several modules were taken over from the DD-HMC program +tree, which includes contributions from Luigi Del Debbio, Leonardo Giusti, +Björn Leder and Filippo Palombi. 
+ + +ACKNOWLEDGEMENTS + +In the course of the development of the openQCD code, many people suggested +corrections and improvements or tested preliminary versions of the programs. +The authors are particularly grateful to Isabel Campos, Dalibor Djukanovic, +Georg Engel, Leonardo Giusti, Björn Leder, Carlos Pena and Hubert Simma for +their communications and help. + + +LICENSE + +The software may be used under the terms of the GNU General Public Licence +(GPL). + + +BUG REPORTS + +If a bug is discovered, please send a report to . + + +ALTERNATIVE PACKAGES AND COMPLEMENTARY PROGRAMS + +There is a publicly available BG/Q version of openQCD that takes advantage of +the machine-specific features of IBM BlueGene/Q computers. The version is +available at . + +The openQCD programs currently do not support reweighting in the quark +masses, but a module providing this functionality can be downloaded from +. + +Previously generated gauge-field configurations are often used as initial +configuration for a new run. If the old and new lattices or boundary +conditions are not the same, the old configuration may however need to be +adapted, using a field conversion tool such as the one available at +, before the new run is started. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/global.defs.in b/qcd/part_cpu/applications/QCD/src/kernel_C/global.defs.in new file mode 100644 index 0000000000000000000000000000000000000000..628a5315a95eb0494bcc4f4a5208170c68362448 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/global.defs.in @@ -0,0 +1,8 @@ +#define NPROC0 #NPROC0# +#define NPROC1 #NPROC1# +#define NPROC2 #NPROC2# +#define NPROC3 #NPROC3# +#define L0 #L0# +#define L1 #L1# +#define L2 #L2# +#define L3 #L3# diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/include/bm.h b/qcd/part_cpu/applications/QCD/src/kernel_C/include/bm.h new file mode 100644 index 0000000000000000000000000000000000000000..37dd8a8d7bca60a10f6fa9efd32a1e17ceeb6b8f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/include/bm.h @@ -0,0 +1,35 @@ +/******************************************************************************* +* +* File bm.h +* +* Copyright (C) 2008 Bjorn Leder +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef BM_H +#define BM_H + +#define NMR 4 +#define NCY 5 +#define NS 16 + +#ifndef CG_ITER_C +extern void time_cg_iter(FILE *flog, double *wdt); +#endif + +#ifndef CG_ITER_DBLE_C +extern void time_cg_iter_dble(FILE *flog, double *wdt); +#endif + +#ifndef TIME_MSAP_C +extern void time_msap(FILE *flog, double *wdt); +#endif + +#ifndef TIME_AWHAT_C +extern void time_Awhat(FILE *flog, double *wdt, int* nb); +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/.Makefile.oqcd.kate-swp b/qcd/part_cpu/applications/QCD/src/kernel_C/main/.Makefile.oqcd.kate-swp new file mode 100644 index 0000000000000000000000000000000000000000..2f97f3847e6a1850291c1922b749090459be5a6d Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/main/.Makefile.oqcd.kate-swp differ diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..0a5af41bf67a4d34458572cb63ee7524a4ef04ed --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile @@ -0,0 +1,151 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +include ../Makefile.defs + +all: rmxeq mkdep mkxeq +.PHONY: all + +# main programs and modules to be compiled + +MAIN = time1 time2 time3 time3test + +ARCHIVE = archive sarchive + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +DFL = dfl_geometry dfl_subspace ltl_gcr dfl_sap_gcr dfl_modes + +DIRAC = Dw_dble Dw Dw_bnd + +FLAGS = flags action_parms dfl_parms force_parms hmc_parms lat_parms \ + mdint_parms rat_parms rw_parms sap_parms solver_parms + +FORCES = force0 force1 force2 force3 force4 force5 \ + frcfcts genfrc tmcg tmcgm xtensor + +LATTICE = bcnds uidx ftidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix + +LINSOLV = cgne mscg fgcr fgcr4vd + +LITTLE = Aw_gen Aw_com Aw_ops Aw_dble Aw ltl_modes + +MDFLDS = mdflds fcom + +RANDOM = ranlux ranlxs ranlxd gauss + +RATFCTS = elliptic zolotarev ratfcts + +SAP = sap_com sap_gcr sap blk_solv + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +SW_TERM = pauli pauli_dble swflds sw_term + +TCHARGE = ftcom ftensor tcharge ym_action + +UFLDS = plaq_sum uflds udcom bstap + 
+UPDATE = chrono mdsteps counters mdint hmc rwtm rwtmeo rwrat + +UTILS = endian mutils utils wspace + +VFLDS = vflds vinit vcom vdcom + +BM = cg_iter cg_iter_dble time_msap time_Awhat + +MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) $(WFLOW) $(BM) + + +# openQCD distribution + +OQCD = ../openQCD-1.4-bgopt + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = $(OQCD)/modules + +MDIR_BM = ../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/mdflds:$(MDIR)/su3fcts:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ + $(MDIR)/update:$(MDIR)/wflow:$(MDIR)/ratfcts:$(MDIR_BM)/bm + + +# additional include directories + +INCPATH = $(MPIR_HOME)/include $(OQCD)/include/ ../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPIR_HOME)/lib + + +############################## do not change ################################### + +PGMS= $(MAIN) $(MODULES) + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(LD) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + +kernel: $(addsuffix .o,$(MODULES)) Makefile time3.o + $(AR) $(ARFLAGS) ../../kernel_C.a $(addsuffix .o,$(MODULES)) time3.o + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + $(RM) $(MAIN); \ + echo "delete old executables" + + +# clean directory + +clean: + $(RM) -r *.d *.o *.alog *.clog 
*.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.amd b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.amd new file mode 100644 index 0000000000000000000000000000000000000000..6feb9cb6443997f7ac8e2bd888d18784bee02700 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.amd @@ -0,0 +1,154 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + +# main programs and modules to be compiled + +MAIN = time1 time2 time3 + +RANDOM = ranlxs ranlxd gauss random_su3 + +START = geometry start sinit utils flipbc + +FLAGS = flags parms wspace + +MISC = mutils endian sse_fcts su3_fcts su3_prods \ + cmatrix_dble cmatrix + +LINALG = linalg linalg_dble valg valg_dble + +LINSOLV = fgcr fgcr4vd + +SW_TERM = pauli pauli_dble swinit sw_term blk_sw_term + +DIRAC = Pbnd Qbnd Qhat scom Pbnd_dble Qbnd_dble Qhat_dble sdcom + +UPDATE = ucom shift + +BLOCK = block blk_grid blk_umap blk_swmap blk_smap + +EVA = jacobi_dble + +DFL = vgrid vflds vinit vcom vdcom Zgen Zops Zhat Zhat_dble dfl_subspace \ + dfl dfl_sap_gcr dfl_modes + +SAP_GCR = blk_solv msap sap_gcr + +HMC = chrono liealg hmcflds + +BM = cg_iter cg_iter_dble time_msap time_Zhat + +MODULES = $(RANDOM) $(START) $(FLAGS) $(MISC) \ + $(LINALG) $(LINSOLV) $(SW_TERM) $(DIRAC) $(UPDATE) $(BLOCK) \ + $(EVA) $(DFL) $(SAP_GCR) $(HMC) $(BM) + + +# 
DD-HMC distribution + +DDHMC = ../../DD-HMC-1.2.2 + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = $(DDHMC)/modules + +MDIR_BM = ../modules + +VPATH = .:$(MDIR)/random:$(MDIR)/start:$(MDIR)/flags:$(MDIR)/misc:\ + $(MDIR)/linalg:$(MDIR)/linsolv:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/update:$(MDIR)/block:$(MDIR)/eva:$(MDIR)/dfl:$(MDIR)/sap_gcr:\ + $(MDIR)/hmc:$(MDIR_BM)/bm: + + +# additional include directories + +INCPATH = $(MPIR_HOME)/include $(DDHMC)/include/ ../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPIR_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -Wall -Wno-long-long \ + -O3 -m32 -malign-double -msse3 -DSSE3 -DPM \ + -Wstrict-prototypes -fstrict-aliasing -Werror + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPIR_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.bgl b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.bgl new file mode 100644 index 0000000000000000000000000000000000000000..bbe9d46603f58d7b4fb0ae25224d363a42822231 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.bgl @@ -0,0 +1,159 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + +# main programs and modules to be compiled + +MAIN = time1 time2 time3 + +RANDOM = ranlxs ranlxd gauss random_su3 + +START = geometry start sinit utils flipbc + +FLAGS = flags parms wspace + +MISC = mutils endian sse_fcts su3_fcts su3_prods \ + cmatrix_dble cmatrix + +LINALG = linalg linalg_dble valg valg_dble + +LINSOLV = fgcr fgcr4vd + +SW_TERM = pauli pauli_dble swinit sw_term blk_sw_term + +DIRAC = Pbnd Qbnd Qhat scom Pbnd_dble Qbnd_dble Qhat_dble sdcom + +UPDATE = ucom shift + +BLOCK = block blk_grid blk_umap blk_swmap blk_smap + +EVA = jacobi_dble + +DFL = vgrid vflds vinit vcom vdcom Zgen Zops Zhat Zhat_dble dfl_subspace \ + dfl dfl_sap_gcr dfl_modes + +SAP_GCR = blk_solv msap sap_gcr + +HMC = chrono liealg hmcflds + +BM = cg_iter cg_iter_dble time_msap time_Zhat + +MODULES = $(RANDOM) $(START) $(FLAGS) $(MISC) \ + $(LINALG) $(LINSOLV) $(SW_TERM) $(DIRAC) $(UPDATE) $(BLOCK) \ + $(EVA) $(DFL) $(SAP_GCR) $(HMC) $(BM) + + +# DD-HMC distribution + +DDHMC = ../../DD-HMC-1.2.2 + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + 
+# search path for modules + +MDIR = $(DDHMC)/modules + +MDIR_BM = ../modules + +VPATH = .:$(MDIR)/random:$(MDIR)/start:$(MDIR)/flags:$(MDIR)/misc:\ + $(MDIR)/linalg:$(MDIR)/linsolv:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/update:$(MDIR)/block:$(MDIR)/eva:$(MDIR)/dfl:$(MDIR)/sap_gcr:\ + $(MDIR)/hmc:$(MDIR_BM)/bm: + + +# additional include directories + +BGLSYS = /bgl/BlueLight/ppcfloor/bglsys +INCPATH = $(BGLSYS)/include $(DDHMC)/include/ ../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(BGLSYS)/lib + + +# scheduling and optimization options + +CFLAGS = -O3 -qstrict -qarch=440 -qtune=440 -DDH -DSF -DTWBC + +# modules with routines that use Double Hummer intrinsics +DH = linalg_dble linalg pauli_dble pauli Qhat_dble Qhat sdcom scom msap sinit + +# add -qarch=440d to CFLAGS only for modules with Double Hummer intrinsics +$(addsuffix .o,$(DH)): CFLAGS += -qarch=440d + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(BGLSYS)/bin/mpixlc +GCC=/usr/bin/gcc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + 
+################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.intel b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.intel new file mode 100644 index 0000000000000000000000000000000000000000..fdf531ffaf436b23668e071364488f8527cb8b36 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.intel @@ -0,0 +1,154 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + +# main programs and modules to be compiled + +MAIN = time1 time2 time3 + +RANDOM = ranlxs ranlxd gauss random_su3 + +START = geometry start sinit utils flipbc + +FLAGS = flags parms wspace + +MISC = mutils endian sse_fcts su3_fcts su3_prods \ + cmatrix_dble cmatrix + +LINALG = linalg linalg_dble valg valg_dble + +LINSOLV = fgcr fgcr4vd + +SW_TERM = pauli pauli_dble swinit sw_term blk_sw_term + +DIRAC = Pbnd Qbnd Qhat scom Pbnd_dble Qbnd_dble Qhat_dble sdcom + +UPDATE = ucom shift + +BLOCK = block blk_grid blk_umap blk_swmap blk_smap + +EVA = jacobi_dble + +DFL = vgrid vflds vinit vcom vdcom Zgen Zops Zhat Zhat_dble dfl_subspace \ + dfl dfl_sap_gcr dfl_modes + +SAP_GCR = blk_solv msap sap_gcr + +HMC = chrono liealg hmcflds + +BM = cg_iter cg_iter_dble time_msap time_Zhat + +MODULES = $(RANDOM) $(START) $(FLAGS) $(MISC) \ + $(LINALG) $(LINSOLV) $(SW_TERM) $(DIRAC) $(UPDATE) $(BLOCK) \ + $(EVA) $(DFL) $(SAP_GCR) $(HMC) $(BM) + + +# DD-HMC distribution + 
+DDHMC = ../../DD-HMC-1.2.2 + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = $(DDHMC)/modules + +MDIR_BM = ../modules + +VPATH = .:$(MDIR)/random:$(MDIR)/start:$(MDIR)/flags:$(MDIR)/misc:\ + $(MDIR)/linalg:$(MDIR)/linsolv:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/update:$(MDIR)/block:$(MDIR)/eva:$(MDIR)/dfl:$(MDIR)/sap_gcr:\ + $(MDIR)/hmc:$(MDIR_BM)/bm: + + +# additional include directories + +INCPATH = $(MPIR_HOME)/include $(DDHMC)/include/ ../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPIR_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -Wall -Wno-long-long \ + -O3 -m32 -malign-double -msse3 -DSSE3 -DP4 \ + -Wstrict-prototypes -fstrict-aliasing -Werror + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPIR_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.oqcd 
b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.oqcd new file mode 100644 index 0000000000000000000000000000000000000000..b2612ac023b149c7ec8f6f725ee508ebc0dc8e22 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/Makefile.oqcd @@ -0,0 +1,159 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +CC=gcc +LD=mpicc +## include ../Makefile.defs + +all: rmxeq mkdep mkxeq +.PHONY: all + +# main programs and modules to be compiled + +MAIN = time1 time2 time3 time3test + +ARCHIVE = archive sarchive + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +DFL = dfl_geometry dfl_subspace ltl_gcr dfl_sap_gcr dfl_modes + +DIRAC = Dw_dble Dw Dw_bnd + +FLAGS = flags action_parms dfl_parms force_parms hmc_parms lat_parms \ + mdint_parms rat_parms rw_parms sap_parms solver_parms + +FORCES = force0 force1 force2 force3 force4 force5 \ + frcfcts genfrc tmcg tmcgm xtensor + +LATTICE = bcnds uidx ftidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix + +LINSOLV = cgne mscg fgcr fgcr4vd + +LITTLE = Aw_gen Aw_com Aw_ops Aw_dble Aw ltl_modes + +MDFLDS = mdflds fcom + +RANDOM = ranlux ranlxs ranlxd gauss + +RATFCTS = elliptic zolotarev ratfcts + +SAP = sap_com sap_gcr sap blk_solv + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +SW_TERM = pauli pauli_dble swflds sw_term + +TCHARGE = ftcom ftensor tcharge ym_action + +UFLDS = plaq_sum uflds udcom bstap + +UPDATE = chrono mdsteps 
counters mdint hmc rwtm rwtmeo rwrat + +UTILS = endian mutils utils wspace + +VFLDS = vflds vinit vcom vdcom + +BM = cg_iter cg_iter_dble time_msap time_Awhat + +##cg_iter_dble time_msap time_Zhat + +MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) $(WFLOW) $(BM) + + +# openQCD distribution + +OQCD = ../openQCD-1.4-bgopt + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = $(OQCD)/modules + +MDIR_BM = ../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/mdflds:$(MDIR)/su3fcts:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ + $(MDIR)/update:$(MDIR)/wflow:$(MDIR)/ratfcts:$(MDIR_BM)/bm + + +# additional include directories + +INCPATH = /usr/lib/openmpi/include $(OQCD)/include/ ../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(OMPIR_HOME)/lib + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall \ + -O -Dx64 -DPM -Wno-long-long + +## -Wno-long-long -Wstrict-prototypes -Werror +############################## do not change ################################### + +PGMS= $(MAIN) $(MODULES) + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(LD) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + +kernel: $(addsuffix .o,$(MODULES)) Makefile time3.o + $(AR) $(ARFLAGS) ../../kernel_C.a $(addsuffix .o,$(MODULES)) time3.o + +# produce executables + +mkxeq: $(MAIN) + 
+ +# remove old executables + +rmxeq: + $(RM) $(MAIN); \ + echo "delete old executables" + + +# clean directory + +clean: + $(RM) -r *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..ca5d262c41db17ad0d189aae9150d7efa4357bb5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time1.c @@ -0,0 +1,113 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2008 Martin Luescher, Bjorn Leder , 2016 Jacob Finkenrath +* +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* QCD single-precision speed test +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "dirac.h" +#include "global.h" +#include "bm.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "forces.h" +#include "update.h" +#include "version.h" +#include "sw_term.h" + +int main(int argc,char *argv[]) +{ + int my_rank; + double cg_wdt[3],wdt; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time1.log","w",stdout); + error_root(flog==NULL,1,"main [time1.c]","Unable to open log file"); + + printf("\n"); + printf("QCD single-precision speed test\n"); + printf("-------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + 
printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + +#if (defined SSE3) + printf("Using inline assembly SSE3 instructions\n"); +#elif (defined SSE2) + printf("Using inline assembly SSE2 instructions\n"); +#elif (defined SSE) + printf("Using inline assembly SSE instructions\n"); +#endif + +#if (defined SSE) +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + + printf("\n"); + } + + time_cg_iter(flog,cg_wdt); + + wdt=2.0*cg_wdt[0]+3.0*cg_wdt[1]+2.0*cg_wdt[2]; + + if (my_rank==0) + { + printf("########################################################\n"); + printf("# #\n"); + printf("# SYNTHETIC QCD SPEED TEST #\n"); + printf("# #\n"); + printf("# Using single-precision (%d bit) data and programs #\n", + 8*(int)(sizeof(float))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + wdt); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*4200.0/wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*(double)(NPROC)*4200.0/wdt); + printf("# #\n"); + printf("########################################################\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time1.log b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time1.log new file mode 100644 index 0000000000000000000000000000000000000000..d7dac4390bc866e0853492d7e92302ee47f8c246 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time1.log @@ -0,0 +1,41 @@ + +QCD single-precision speed test +------------------------------- + +16x8x8x8 lattice, 2x1x1x1 process grid, 8x8x8x8 local lattice + +There are 2 MPI processes + + +Single-precision data and programs 
+------------------------------- + +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB + +Lattice parameters: +beta = 5.5 +c0 = 1.0, c1 = 0.0 +csw = 1.978 + +Program norm_square: +Time per lattice point: 0.018 micro sec (2723 Mflops/process) + +Program mulc_spinor_add: +Time per lattice point: 0.052 micro sec (1855 Mflops/process) + +Program Dhat: +Time per lattice point: 0.183 micro sec (10402 Mflops/process) + +######################################################## +# # +# SYNTHETIC QCD SPEED TEST # +# # +# Using single-precision (32 bit) data and programs # +# # +# Time per lattice point: 0.557 micro sec # +# Average speed: 7.494 Gflops/process # +# Total throughput: 14.988 Gflops # +# # +######################################################## + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time2.c new file mode 100644 index 0000000000000000000000000000000000000000..f5953921851f71f379176230ecfb40e6057b3f1f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time2.c @@ -0,0 +1,113 @@ + +/******************************************************************************* +* +* File time2.c +* +* Copyright (C) 2008 Martin Luescher, Bjorn Leder, 2016 Jacob Finkenrath +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* QCD double-precision speed test +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "global.h" +#include "bm.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "forces.h" +#include "update.h" +#include "version.h" +#include "sw_term.h" +#include "dirac.h" + +int main(int argc,char *argv[]) +{ + int my_rank; + double cgd_wdt[3],wdt; + FILE 
*flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time2.log","w",stdout); + error_root(flog==NULL,1,"main [time2.c]","Unable to open log file"); + + printf("\n"); + printf("QCD double-precision speed test\n"); + printf("-------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + +#if (defined SSE3) + printf("Using inline assembly SSE3 instructions\n"); +#elif (defined SSE2) + printf("Using inline assembly SSE2 instructions\n"); +#elif (defined SSE) + printf("Using inline assembly SSE instructions\n"); +#endif + +#if (defined SSE) +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + + printf("\n"); + } + + time_cg_iter_dble(flog,cgd_wdt); + + wdt=2.0*cgd_wdt[0]+3.0*cgd_wdt[1]+2.0*cgd_wdt[2]; + + if (my_rank==0) + { + printf("########################################################\n"); + printf("# #\n"); + printf("# SYNTHETIC QCD SPEED TEST #\n"); + printf("# #\n"); + printf("# Using double-precision (%d bit) data and programs #\n", + 8*(int)(sizeof(double))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + wdt); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*4200.0/wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*(double)(NPROC)*4200.0/wdt); + printf("# #\n"); + printf("########################################################\n\n"); + fclose(flog); + } + + MPI_Finalize(); + 
exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time2.log b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time2.log new file mode 100644 index 0000000000000000000000000000000000000000..01221333ca2471b19b8b9671ef38a3ab554b6e7f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time2.log @@ -0,0 +1,41 @@ + +QCD double-precision speed test +------------------------------- + +16x8x8x8 lattice, 2x1x1x1 process grid, 8x8x8x8 local lattice + +There are 2 MPI processes + + +Double-precision data and programs +------------------------------- + +The local size of the gauge field is 2304 KB +The local size of a quark field is 768 KB + +Lattice parameters: +beta = 5.5 +c0 = 1.0, c1 = 0.0 +csw = 1.978 + +Program norm_square_dble: +Time per lattice point: 0.035 micro sec (1358 Mflops/process) + +Program mulc_spinor_add_dble: +Time per lattice point: 0.110 micro sec (871 Mflops/process) + +Program Dhat_dble: +Time per lattice point: 0.377 micro sec (5063 Mflops/process) + +######################################################## +# # +# SYNTHETIC QCD SPEED TEST # +# # +# Using double-precision (64 bit) data and programs # +# # +# Time per lattice point: 1.155 micro sec # +# Average speed: 3.616 Gflops/process # +# Total throughput: 7.232 Gflops # +# # +######################################################## + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3.c new file mode 100644 index 0000000000000000000000000000000000000000..46ee7fd394866b36f0fa46650f51e37edeef0a44 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3.c @@ -0,0 +1,190 @@ + +/******************************************************************************* +* +* File time3.c +* +* Copyright (C) 2008 Bjorn Leder, 2016 Jacob Finkenrath +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* QCD speed test +* 
+*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "global.h" +#include "bm.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "forces.h" +#include "update.h" +#include "version.h" +#include "sw_term.h" +#include "dirac.h" + +#if (((L0%4)!=0)||((L1%4)!=0)||((L2%4)!=0)||((L3%4)!=0)) +#error: The local lattice sizes must be a multiple of 4 +#endif + + +int kernel_c() +{ + int my_rank,nb, kernel_number=2; + double cg_wdt[3],cgd_wdt[3],msap_wdt,ahat_wdt,wdt; + FILE *flog=NULL; + + /* JuBE */ + /* call jube initial function */ + jube_kernel_init(&kernel_number); + +/* MPI_Init(&argc,&argv); */ + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { +/* flog=freopen("time3.log","w",stdout); */ +/* error_root(flog==NULL,1,"main [time3.c]","Unable to open log file"); */ + + flog = stdout; + + printf("\n"); + printf("QCD speed test\n"); + printf("-------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + +#if (defined SSE3) + printf("Using inline assembly SSE3 instructions\n"); +#elif (defined SSE2) + printf("Using inline assembly SSE2 instructions\n"); +#elif (defined SSE) + printf("Using inline assembly SSE instructions\n"); +#endif + +#if (defined SSE) +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch 
instructions are not used\n"); +#endif +#endif + + printf("\n"); + } + + /* JuBE */ + /* call jube run function */ + jube_kernel_run(); + + time_cg_iter(flog,cg_wdt); + time_cg_iter_dble(flog,cgd_wdt); + time_msap(flog,&msap_wdt); + time_Awhat(flog,&ahat_wdt,&nb); + + /* JuBE */ + /* call jube finalize function */ + jube_kernel_finalize(); + + wdt=2.0*cg_wdt[0]+3.0*cg_wdt[1]+2.0*cg_wdt[2]; + + if (my_rank==0) + { + printf("\n"); + printf("#########################################################\n"); + printf("# #\n"); + printf("# SYNTHETIC QCD SPEED TEST #\n"); + printf("# #\n"); + printf("# Using single-precision (%d bit) data and programs #\n", + 8*(int)(sizeof(float))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + wdt); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*4200.0/wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*(double)(NPROC)*4200.0/wdt); + printf("# #\n"); + } + + wdt=2.0*cgd_wdt[0]+3.0*cgd_wdt[1]+2.0*cgd_wdt[2]; + + if (my_rank==0) + { + printf("# #\n"); + printf("# Using double-precision (%d bit) data and programs #\n", + 8*(int)(sizeof(double))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + wdt); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*4200.0/wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*(double)(NPROC)*4200.0/wdt); + printf("# #\n"); + } + + if (my_rank==0) + { + printf("# #\n"); + printf("# Using the Schwarz preconditioner [%d bit arithmetic] #\n", + 8*(int)(sizeof(float))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + msap_wdt); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*((double)((NMR+1)*2076+48)+112.0*2.0)/msap_wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*(double)(NPROC)*((double)((NMR+1)*2076+48)+112.0*2.0)/msap_wdt); /* per-process rate times NPROC, as in the other sections */ + printf("# #\n"); + } + + if (my_rank==0) + { + printf("# #\n"); + printf("# Using deflation (little Dirac operator) #\n"); + 
printf("# [%d bit arithmetic] #\n", + (int)(4*sizeof(complex))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + ahat_wdt/(double)(VOLUME)); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*64.0*(double)(nb*NS*NS)/ahat_wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*64.0*(double)(NPROC*nb*NS*NS)/ahat_wdt); + printf("# #\n"); + printf("#########################################################\n\n"); + +/* fclose(flog); */ + } + +/* MPI_Finalize(); */ + + /* JuBE */ + /* call jube end function */ + jube_kernel_end(); + + return 0; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_bgl_256.log b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_bgl_256.log new file mode 100644 index 0000000000000000000000000000000000000000..842f56bbb55c3b5cabdb44b422310219d6673e07 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_bgl_256.log @@ -0,0 +1,109 @@ + +QCD speed test +------------------------------- + +32x32x32x32 lattice, 4x4x4x4 process grid, 8x8x8x8 local lattice + +There are 256 MPI processes + + +Single-precision data and programs +------------------------------- + +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB + +Program norm_square: +Time per lattice point: 0.093 micro sec (513 Mflops/process) + +Program mulc_spinor_add: +Time per lattice point: 0.220 micro sec (436 Mflops/process) + +Program Qhat: +Time per lattice point: 4.422 micro sec (428 Mflops/process) + + +Double-precision data and programs +------------------------------- + +The local size of the gauge field is 2304 KB +The local size of a quark field is 768 KB + +Program norm_square_dble: +Time per lattice point: 0.202 micro sec (237 Mflops/process) + +Program mulc_spinor_add_dble: +Time per lattice point: 0.447 micro sec (214 Mflops/process) + +Program Qhat_dble: +Time per lattice point: 7.542 micro sec (251 Mflops/process) + + +Timing of the Schwarz 
preconditioner +------------------------------------ + +bs = 4 4 4 4 +nmr = 4 +ncy = 5 + +The number of blocks per process is 16 +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB +The size of the block gauge field is 72 KB +The size of a block quark field is 24 KB + +Time per lattice point: 67.19 micro sec (about 759 Mflops [32 bit arithmetic]) +Time per lattice point and MR iteration: 3.36 micro sec + + +Timing of Zhat +-------------- + +bs = 4 4 4 4 +Ns = 16 + +Number of points = 4096 +Number of blocks = 16 +Number of points/block = 256 +Vector field size = 2.05 KB +Zhat array size = 0.26 MB + +Time per application of Zhat, including communications: +Time per block: 163.805 micro sec (100 Mflops [32 bit arithmetic]) +Time per point: 0.640 micro sec + + +######################################################### +# # +# SYNTHETIC QCD SPEED TEST # +# # +# Using single-precision (32 bit) data and programs # +# # +# Time per lattice point: 9.691 micro sec # +# Average speed: 0.431 Gflops/process # +# Total throughput: 110.314 Gflops # +# # +# # +# Using double-precision (64 bit) data and programs # +# # +# Time per lattice point: 16.830 micro sec # +# Average speed: 0.248 Gflops/process # +# Total throughput: 63.521 Gflops # +# # +# # +# Using the Schwarz preconditioner [32 bit arithmetic] # +# # +# Time per lattice point: 67.192 micro sec # +# Average speed: 0.759 Gflops/process # +# Total throughput: 194.399 Gflops # +# # +# # +# Using deflation (little Dirac operator) # +# [32 bit arithmetic] # +# # +# Time per lattice point: 0.640 micro sec # +# Average speed: 0.100 Gflops/process # +# Total throughput: 25.605 Gflops # +# # +######################################################### + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_bgl_single.log b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_bgl_single.log new file mode 100644 index 
0000000000000000000000000000000000000000..29c7a7907ffaa8dd396556df3f6920d97520626a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_bgl_single.log @@ -0,0 +1,109 @@ + +QCD speed test +------------------------------- + +8x8x8x8 lattice, 1x1x1x1 process grid, 8x8x8x8 local lattice + +There is 1 MPI process + + +Single-precision data and programs +------------------------------- + +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB + +Program norm_square: +Time per lattice point: 0.089 micro sec (539 Mflops/process) + +Program mulc_spinor_add: +Time per lattice point: 0.143 micro sec (669 Mflops/process) + +Program Qhat: +Time per lattice point: 1.899 micro sec (998 Mflops/process) + + +Double-precision data and programs +------------------------------- + +The local size of the gauge field is 2304 KB +The local size of a quark field is 768 KB + +Program norm_square_dble: +Time per lattice point: 0.193 micro sec (248 Mflops/process) + +Program mulc_spinor_add_dble: +Time per lattice point: 0.283 micro sec (339 Mflops/process) + +Program Qhat_dble: +Time per lattice point: 2.586 micro sec (733 Mflops/process) + + +Timing of the Schwarz preconditioner +------------------------------------ + +bs = 4 4 4 4 +nmr = 4 +ncy = 5 + +The number of blocks is 16 +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB +The size of the block gauge field is 72 KB +The size of a block quark field is 24 KB + +Time per lattice point: 56.74 micro sec (about 899 Mflops [32 bit arithmetic]) +Time per lattice point and MR iteration: 2.84 micro sec + + +Timing of Zhat +-------------- + +bs = 4 4 4 4 +Ns = 16 + +Number of points = 4096 +Number of blocks = 16 +Number of points/block = 256 +Vector field size = 2.05 KB +Zhat array size = 0.26 MB + +Time per application of Zhat, including communications: +Time per block: 135.231 micro sec (121 Mflops [32 bit arithmetic]) +Time per point: 0.528 micro sec 
+ + +######################################################### +# # +# SYNTHETIC QCD SPEED TEST # +# # +# Using single-precision (32 bit) data and programs # +# # +# Time per lattice point: 4.406 micro sec # +# Average speed: 0.948 Gflops/process # +# Total throughput: 0.948 Gflops # +# # +# # +# Using double-precision (64 bit) data and programs # +# # +# Time per lattice point: 6.406 micro sec # +# Average speed: 0.652 Gflops/process # +# Total throughput: 0.652 Gflops # +# # +# # +# Using the Schwarz preconditioner [32 bit arithmetic] # +# # +# Time per lattice point: 56.740 micro sec # +# Average speed: 0.899 Gflops/process # +# Total throughput: 0.899 Gflops # +# # +# # +# Using deflation (little Dirac operator) # +# [32 bit arithmetic] # +# # +# Time per lattice point: 0.528 micro sec # +# Average speed: 0.121 Gflops/process # +# Total throughput: 0.121 Gflops # +# # +######################################################### + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_intel_single.log b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_intel_single.log new file mode 100644 index 0000000000000000000000000000000000000000..3ae12fc1db50a1977e7dee93e8df8c07f53ecd1c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3_intel_single.log @@ -0,0 +1,111 @@ + +QCD speed test +------------------------------- + +8x8x8x8 lattice, 1x1x1x1 process grid, 8x8x8x8 local lattice + +There is 1 MPI process +Using inline assembly SSE3 instructions +Assuming SSE prefetch instructions fetch 128 bytes + + +Single-precision data and programs +------------------------------- + +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB + +Program norm_square: +Time per lattice point: 0.030 micro sec (1587 Mflops/process) + +Program mulc_spinor_add: +Time per lattice point: 0.088 micro sec (1088 Mflops/process) + +Program Qhat: +Time per lattice point: 0.515 micro sec (3683 Mflops/process) + + +Double-precision 
data and programs +------------------------------- + +The local size of the gauge field is 2304 KB +The local size of a quark field is 768 KB + +Program norm_square_dble: +Time per lattice point: 0.052 micro sec (919 Mflops/process) + +Program mulc_spinor_add_dble: +Time per lattice point: 0.164 micro sec (584 Mflops/process) + +Program Qhat_dble: +Time per lattice point: 0.819 micro sec (2315 Mflops/process) + + +Timing of the Schwarz preconditioner +------------------------------------ + +bs = 4 4 4 4 +nmr = 4 +ncy = 5 + +The number of blocks is 16 +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB +The size of the block gauge field is 72 KB +The size of a block quark field is 24 KB + +Time per lattice point: 15.41 micro sec (about 3311 Mflops [32 bit arithmetic]) +Time per lattice point and MR iteration: 0.77 micro sec + + +Timing of Zhat +-------------- + +bs = 4 4 4 4 +Ns = 16 + +Number of points = 4096 +Number of blocks = 16 +Number of points/block = 256 +Vector field size = 2.05 KB +Zhat array size = 0.26 MB + +Time per application of Zhat, including communications: +Time per block: 3.845 micro sec (4260 Mflops [32 bit arithmetic]) +Time per point: 0.015 micro sec + + +######################################################### +# # +# SYNTHETIC QCD SPEED TEST # +# # +# Using single-precision (32 bit) data and programs # +# # +# Time per lattice point: 1.354 micro sec # +# Average speed: 3.083 Gflops/process # +# Total throughput: 3.083 Gflops # +# # +# # +# Using double-precision (64 bit) data and programs # +# # +# Time per lattice point: 2.235 micro sec # +# Average speed: 1.869 Gflops/process # +# Total throughput: 1.869 Gflops # +# # +# # +# Using the Schwarz preconditioner [32 bit arithmetic] # +# # +# Time per lattice point: 15.407 micro sec # +# Average speed: 3.312 Gflops/process # +# Total throughput: 3.312 Gflops # +# # +# # +# Using deflation (little Dirac operator) # +# [32 bit arithmetic] # +# # +# Time per 
lattice point: 0.015 micro sec # +# Average speed: 4.261 Gflops/process # +# Total throughput: 4.261 Gflops # +# # +######################################################### + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3test.c b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3test.c new file mode 100644 index 0000000000000000000000000000000000000000..e076c175824d1c4a763d5b3119bc5fb8d51df1a6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3test.c @@ -0,0 +1,195 @@ + +/******************************************************************************* +* +* File time3test.c +* +* Copyright (C) 2008 Bjorn Leder, 2016 Jacob Finkenrath +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* QCD speed test +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "global.h" +#include "bm.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "forces.h" +#include "update.h" +#include "version.h" +#include "sw_term.h" +#include "dirac.h" + +#if (((L0%4)!=0)||((L1%4)!=0)||((L2%4)!=0)||((L3%4)!=0)) +#error: The local lattice sizes must be a multiple of 4 +#endif + + + +int main(int argc,char *argv[]) +{ + int my_rank,nb; + /*int kernel_number=2;*/ + double cg_wdt[3],cgd_wdt[3],msap_wdt,ahat_wdt,wdt; + FILE *flog=NULL; + + /* JuBE */ + /* call jube initial function */ + /*jube_kernel_init(&kernel_number);*/ + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time3test.log","w",stdout); + error_root(flog==NULL,1,"main [time3test.c]","Unable to open log file"); + + flog = stdout; + + printf("\n"); + printf("QCD speed test\n"); + printf("-------------------------------\n\n"); + + printf("%dx%dx%dx%d 
lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + +#if (defined SSE3) + printf("Using inline assembly SSE3 instructions\n"); +#elif (defined SSE2) + printf("Using inline assembly SSE2 instructions\n"); +#elif (defined SSE) + printf("Using inline assembly SSE instructions\n"); +#endif + +#if (defined SSE) +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + + printf("\n"); + } + + /* JuBE */ + /* call jube run function + jube_kernel_run();*/ + + time_cg_iter(flog,cg_wdt); + time_cg_iter_dble(flog,cgd_wdt); + time_msap(flog,&msap_wdt); + time_Awhat(flog,&ahat_wdt,&nb); + + /* JuBE */ + /* call jube finalize function + jube_kernel_finalize();*/ + + wdt=2.0*cg_wdt[0]+3.0*cg_wdt[1]+2.0*cg_wdt[2]; + + if (my_rank==0) + { + printf("\n"); + printf("#########################################################\n"); + printf("# #\n"); + printf("# SYNTHETIC QCD SPEED TEST #\n"); + printf("# #\n"); + printf("# Using single-precision (%d bit) data and programs #\n", + 8*(int)(sizeof(float))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + wdt); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*4200.0/wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*(double)(NPROC)*4200.0/wdt); + printf("# #\n"); + } + + wdt=2.0*cgd_wdt[0]+3.0*cgd_wdt[1]+2.0*cgd_wdt[2]; + + if (my_rank==0) + { + printf("# #\n"); + printf("# Using double-precision (%d bit) data and programs #\n", + 8*(int)(sizeof(double))); + printf("# #\n"); + 
printf("# Time per lattice point: %8.3f micro sec #\n", + wdt); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*4200.0/wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*(double)(NPROC)*4200.0/wdt); + printf("# #\n"); + } + + + if (my_rank==0) + { + printf("# #\n"); + printf("# Using the Schwarz preconditioner [%d bit arithmetic] #\n", + 8*(int)(sizeof(float))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + (double)(NCY)*(msap_wdt)); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*((double)((NMR+1)*2076+48)+112.0*2.0)/msap_wdt); + + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*NPROC*((double)((NMR+1)*2076+48)+112.0*2.0)/msap_wdt); + + printf("# #\n"); + } + + if (my_rank==0) + { + printf("# #\n"); + printf("# Using deflation (little Dirac operator) #\n"); + printf("# [%d bit arithmetic] #\n", + (int)(4*sizeof(complex))); + printf("# #\n"); + printf("# Time per lattice point: %8.3f micro sec #\n", + ahat_wdt/(double)(VOLUME)); + printf("# Average speed: %8.3f Gflops/process #\n", + 1.0e-3*64.0*(double)(nb*NS*NS)/ahat_wdt); + printf("# Total throughput: %8.3f Gflops #\n", + 1.0e-3*64.0*(double)(NPROC*nb*NS*NS)/ahat_wdt); + printf("# #\n"); + printf("#########################################################\n\n"); + +/* fclose(flog); */ + } + + MPI_Finalize(); + + /* JuBE */ + /* call jube end function + jube_kernel_end();*/ + + return 0; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3test.log b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3test.log new file mode 100644 index 0000000000000000000000000000000000000000..889d4340aa4b502ef7b85adce660fe28fed4bc60 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/main/time3test.log @@ -0,0 +1,144 @@ + +QCD speed test +------------------------------- + +16x8x8x8 lattice, 2x1x1x1 process grid, 8x8x8x8 local lattice + +There are 2 MPI processes + + +Single-precision data and programs 
+------------------------------- + +The local size of the gauge field is 1152 KB +The local size of a quark field is 384 KB + +Lattice parameters: +beta = 5.5 +c0 = 1.0, c1 = 0.0 +csw = 1.978 + +Program norm_square: +Time per lattice point: 0.019 micro sec (2471 Mflops/process) + +Program mulc_spinor_add: +Time per lattice point: 0.056 micro sec (1700 Mflops/process) + +Program Dhat: +Time per lattice point: 0.206 micro sec (9283 Mflops/process) + + +Double-precision data and programs +------------------------------- + +The local size of the gauge field is 2304 KB +The local size of a quark field is 768 KB + +Lattice parameters: +beta = 5.5 +c0 = 1.0, c1 = 0.0 +csw = 1.978 + +Program norm_square_dble: +Time per lattice point: 0.037 micro sec (1288 Mflops/process) + +Program mulc_spinor_add_dble: +Time per lattice point: 0.111 micro sec (867 Mflops/process) + +Program Dhat_dble: +Time per lattice point: 0.393 micro sec (4850 Mflops/process) + + +Timing of the SAP preconditioner +-------------------------------- + +16x8x8x8 lattice, 2x1x1x1 process grid, 8x8x8x8 local lattice + +Using SSE3 instructions and 16 xmm registers +Assuming SSE prefetch instructions fetch 64 bytes + +bs = 4 4 4 4 +ncy = 5 +nmr = 4 + +Lattice parameters: +beta = 5.5 +c0 = 1.0, c1 = 0.0 +csw = 1.978 + +Periodic boundary conditions + +Using the MinRes block solver: +Time per lattice point: 4.053 micro sec +Time per point and cycle: 0.811 micro sec (about 11439 Mflops) + +rbb is 2.000000 +Using the even-odd preconditioned MinRes block solver: +Time per lattice point: 4.566 micro sec +Time per point and cycle: 0.913 micro sec (about 11664 Mflops) + + +Timing of Awhat() +----------------- + +16x8x8x8 lattice, 2x1x1x1 process grid, 8x8x8x8 local lattice + +Lattice parameters: +beta = 5.5 +c0 = 1.0, c1 = 0.0 +csw = 1.978 + +Periodic boundary conditions + +Number of points = 4096 +Number of blocks = 16 +Number of points/block = 256 +Vector field size = 2.05 KB +Awhat array size = 0.26 MB + +Time per 
application of Awhat(), including communications: +Total: 0.020 msec +Per block: 1.274 usec (12855 Mflops [32 bit arithmetic]) +Per point: 0.005 usec + +There are 16 boundary blocks +Time per application of Awhat() for the communications: +Total: 0.004 msec +Per block: 0.231 usec +Per point: 0.001 usec + + +######################################################### +# # +# SYNTHETIC QCD SPEED TEST # +# # +# Using single-precision (32 bit) data and programs # +# # +# Time per lattice point: 0.619 micro sec # +# Average speed: 6.783 Gflops/process # +# Total throughput: 13.566 Gflops # +# # +# # +# Using double-precision (64 bit) data and programs # +# # +# Time per lattice point: 1.193 micro sec # +# Average speed: 3.520 Gflops/process # +# Total throughput: 7.040 Gflops # +# # +# # +# Using the Schwarz preconditioner [32 bit arithmetic] # +# # +# Time per lattice point: 4.566 micro sec # +# Average speed: 11.664 Gflops/process # +# Total throughput: 23.328 Gflops # +# # +# # +# Using deflation (little Dirac operator) # +# [32 bit arithmetic] # +# # +# Time per lattice point: 0.005 micro sec # +# Average speed: 12.856 Gflops/process # +# Total throughput: 25.711 Gflops # +# # +######################################################### + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/modules/bm/cg_iter.c b/qcd/part_cpu/applications/QCD/src/kernel_C/modules/bm/cg_iter.c new file mode 100644 index 0000000000000000000000000000000000000000..056be8f54a2539a1abfc7bdae54f9453821cb811 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/modules/bm/cg_iter.c @@ -0,0 +1,276 @@ +/******************************************************************************* +* +* File cg_iter.c +* +* Copyright (C) 2008 Bjorn Leder 2016 Jacob Finkenrath +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Based on QCDpbm-1.1 (http://luscher.web.cern.ch/luscher/QCDpbm/index.html) +* 
+*******************************************************************************/ + +#define CG_ITER_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "su3fcts.h" +#include "random.h" +#include "lattice.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "sflds.h" +#include "flags.h" +#include "uflds.h" +#include "utils.h" +#include "global.h" + +spinor **ps; + +static double wt_norm_square(int nflds) +{ + int my_rank,nmax,n,i,ib; + /*float r;*/ + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)(nmax*nflds); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_mulc_spinor_add(int nflds) +{ + int my_rank,nmax,n,i,ib; + complex z; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + z.re=0.123f; + z.im=0.456f; + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_Qhat(int nflds) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +void time_cg_iter(FILE *flog, double *wdt) +{ + int my_rank,nflds,n; + double phi[2]; + double wdt0,wdt1,wdt2; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + printf("\n"); + printf("Single-precision data and 
programs\n"); + printf("-------------------------------\n\n"); + + if ((VOLUME*sizeof(float))<(64*1024)) + { + printf("The local size of the gauge field is %d KB\n", + (int)((72*VOLUME*sizeof(float))/(1024))); + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(float))/(1024))); + } + else + { + printf("The local size of the gauge field is %d MB\n", + (int)((72*VOLUME*sizeof(float))/(1024*1024))); + printf("The local size of a quark field is %d MB\n", + (int)((24*VOLUME*sizeof(float))/(1024*1024))); + } + + printf("\n"); + } + + start_ranlux(0,12); + phi[0]=0.0; + phi[1]=0.0; + set_bc_parms(3,1.0,1.0,1.0,1.0,phi,phi); + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + set_sw_parms(-0.0123); + geometry(); + /*alloc_u(); + alloc_ud(); + alloc_sw(); + alloc_swd();*/ + + random_ud(); + chs_ubnd(-1); + assign_ud2u(); + + sw_term(ODD_PTS); + /*error(invert_swd(ODD_PTS)!=0,1,"main [time1.c]", + "Inversion of swd on the odd sites was not safe");*/ + assign_swd2sw(); + + nflds=(int)((4*1024*1024)/(VOLUME*sizeof(float)))+1; + if ((nflds%2)==1) + nflds+=1; + alloc_ws(nflds); + + ps=reserve_ws(nflds); + for (n=0;n +#include +#include +#include "mpi.h" +#include "su3.h" +#include "su3fcts.h" +#include "random.h" +#include "lattice.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "sflds.h" +#include "flags.h" +#include "uflds.h" +#include "utils.h" +#include "global.h" + +spinor_dble **psd; + +static double wt_norm_square(int nflds) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)(nmax*nflds); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_mulc_spinor_add(int nflds) +{ + int my_rank,nmax,n,i,ib; + complex_dble z; + double 
wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + z.re=0.123; + z.im=0.456; + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_Qhat(int nflds) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +void time_cg_iter_dble(FILE *flog, double *wdt) +{ + int my_rank,nflds,n; + double phi[2]; + double wdt0,wdt1,wdt2; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + printf("\n"); + printf("Double-precision data and programs\n"); + printf("-------------------------------\n\n"); + + if ((VOLUME*sizeof(double))<(64*1024)) + { + printf("The local size of the gauge field is %d KB\n", + (int)((72*VOLUME*sizeof(double))/(1024))); + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(double))/(1024))); + } + else + { + printf("The local size of the gauge field is %d MB\n", + (int)((72*VOLUME*sizeof(double))/(1024*1024))); + printf("The local size of a quark field is %d MB\n", + (int)((24*VOLUME*sizeof(double))/(1024*1024))); + } + + printf("\n"); + } + + start_ranlux(0,12); + phi[0]=0.0; + phi[1]=0.0; + set_bc_parms(3,1.0,1.0,1.0,1.0,phi,phi); + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + set_sw_parms(-0.0123); + + geometry(); + /*alloc_ud(); + alloc_swd();*/ + + random_ud(); + chs_ubnd(-1); + sw_term(ODD_PTS); + /*error(invert_swd(ODD_PTS)!=0,1,"main [time2.c]", + "Inversion of swd on the odd sites was not safe");*/ + + 
nflds=(int)((4*1024*1024)/(VOLUME*sizeof(double)))+1; + if ((nflds%2)==1) + nflds+=1; + alloc_wsd(nflds); + + psd=reserve_wsd(nflds); + for (n=0;n +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "vflds.h" +#include "linalg.h" +#include "dirac.h" +#include "dfl.h" +#include "little.h" +#include "global.h" +#include "bm.h" + +static int bs[4]={4,4,4,4}; + +static void random_basis(int Ns) +{ + int i; + spinor **ws; + + ws=reserve_ws(Ns); + + for (i=0;i1) + { + nt/=2; + if (nt==0) + nt=1; + wt=0.0; + + while (wt<5.0) + { + for (i=0;i +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "sap.h" +#include "global.h" +#include "bm.h" + +static int bs[4]={4,4,4,4}; + +void time_msap(FILE *flog, double *wdt) +{ + int my_rank,bc,count,nt; + int ncy,nmr; + int n,ie; + float mu; + double phi[2],phi_prime[2]; + double rbb,wt1,wt2; + spinor **ps; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + if (my_rank==0) + { + + printf("\n"); + printf("Timing of the SAP preconditioner\n"); + printf("--------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions 
are not used\n"); +#endif +#endif + printf("\n"); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("ncy = %d\n",NCY); + printf("nmr = %d\n\n",NMR); + + + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + ncy=NCY; + nmr=NMR; + bc=3; + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12); + geometry(); + alloc_ws(3); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + + set_sw_parms(0.0123); + mu=0.0785f; + rbb=2.0*(1.0/(double)(bs[0])+1.0/(double)(bs[1])+ + 1.0/(double)(bs[2])+1.0/(double)(bs[3])); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2ubgr(SAP_BLOCKS); + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + ps=reserve_ws(3); + random_s(VOLUME,ps[2],1.0f); + bnd_s2zero(ALL_PTS,ps[2]); + normalize(VOLUME,1,ps[2]); + + nt=(int)(2.0e6/(double)(ncy*nmr*VOLUME)); + if (nt<2) + nt=2; + (*wdt)=0.0; + + while ((*wdt)<5.0) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + for (count=0;count that allows the type of boundary condition to + be specified at run time. + +- Corrected a bug in Dwee_dble() [modules/dirac/Dw_dbl.c] that shows up in + some check programs if none of the local lattice sizes L1,L2,L3 is divisible + by 4. The functionality of the other modules and the main programs in ./main + was not affected by this bug, because Dwee_dble() is not called in any of + these programs. + +- Corrected modules/flags/rw_parms.c so as to allow for Hasenbusch factorized + reweighting factors. + +- Corrected and improved the descriptions at the top of many module files. + +- Corrected devel/ratfcts/INDEX. + +- Added forgotten "plots" directory in devel/nompi/main. + +- Replaced &irat in MPI_Bcast(&irat,3,MPI_INT,0,MPI_COMM_WORLD) by irat in + flags/force_parms.c [read_forc_parms() and read_force_parms2()]. This is not + a mistake but an unnatural and unintended use of the C language. 
Corrected + analogous cases in a number of check programs (thanks to Hubert Simma and + Georg Engel for noting these misprints). + +- Corrected check program block/check1.c (the point labeling does not need to + respect any time ordering). + + +12. May 2013 + +Version 1.2: 2nd public release. + +- Added AVX inline-assembly to the time-critical functions (Dirac operator, + linear algebra, SAP preconditioner, SU(3) functions). See the README file in + the top directory of the distribution. + +- Added support for blocked MPI process ranking, as is likely to be profitable + on parallel computers with multi-core nodes (see main/README.global). + +- Made the field import/export functions more efficient by avoiding the + previously excessive use of MPI_Barrier(). + +- Added import/export functions for the state of the random number generators. + Modified the initialization of the generators so as to be independent of the + ranking of the MPI processes. See the notes in modules/random/ranlux.c. Added + a check program in devel/random. + +- Continuation runs of qcd1,qcd2,ym1 and ms1 now normally reset the random + number generators to their state at the end of the previous run. The + programs initialize the generators in the traditional way if the option + -norng is set (see README.qcd1, for example). + +- Modified the deflated SAP+GCR solver (dfl/dfl_sap_gcr.c) by replacing the + deflation projectors through an inaccurate projection in the preconditioner + (as suggested by Frommer et al. [arXiv:1303.1377]; the deflation subspace + type and subspace generation algorithm are unchanged). This leads to a + structural simplification and, after some parameter tuning, to a slight + performance gain. NOTE: the deflation parameter set is changed too and the + number of status variables is reduced by 1 (see modules/flags/dfl_parms.c, + modules/dfl/dfl_sap_gcr.c and doc/parms.pdf). 
+ +- Included a program (devel/dfl/check4.c) that allows the parameters of the + deflated SAP+GCR solver to be tuned on a given lattice. + +- Deleted the now superfluous modules/dfl/dfl_projectors.c. + +- Added the function fdigits() [utils/mutils.c] that allows double-precision + floating point numbers to be printed with all significant decimal digits + (and only these). The main programs make use of this function to ensure that + the values of the decimal parameters are printed to the log files with as + many significant digits as were given on the input parameter file (assuming + not more digits were specified than can be represented by a double number). + +- Replaced "if" by "else if" on line 379 of main/ms2.c. This bug stopped the + program with an error message when the CGNE solver was used. It had no + effect when other solvers were used. + +- Changed the type of the variable "sf" to "int" in lines 257 and 440 of + forces/force0.c. This bug had no effect in view of the automatic type + conversions performed by the compiler. + +- Corrected sign in line 174 of devel/sap/check2.c. This bug led to wrong + check results, thus incorrectly suggesting that the SAP modules were + incorrect. + +- Corrected a mistake in devel/tcharge/check2.c and devel/tcharge/check5.c + that gave rise to wrong results suggesting that the tested modules were + incorrect. + + +14. June 2012 + +Version 1.0: Initial public release. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/COPYING b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/COPYING new file mode 100644 index 0000000000000000000000000000000000000000..7a8e8abfd0057f374fbf59076c263f1f5d685b73 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. 
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. 
+ + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License.
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..5da2eb95bd0d02d0123feba6c759b3522e3d9fdd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/INDEX @@ -0,0 +1,19 @@ + +At the top level, the openQCD package is structured as follows: + +devel Directories used for developing and testing the various + modules + +doc Documentation files + +include All include files. Typically there is one include file + per directory in the modules directory + +main Collection of main programs + +modules Source code of all modules + +In addition to the information provided by the notes in the doc directory, +short descriptions of the program functionalities are included in the source +directories and at the top of each program file. Further information is found +in various README files (such as main/README.global).
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/README new file mode 100644 index 0000000000000000000000000000000000000000..74cbe5541e6d8ed24f8f52ec785c8699e722b9d3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/README @@ -0,0 +1,244 @@ + +******************************************************************************** + + openQCD Simulation Program + +******************************************************************************** + + +LATTICE THEORY + +Currently the common features of the supported lattice theories are the +following: + +* 4-dimensional hypercubic N0xN1xN2xN3 lattice with even sizes N0,N1,N2,N3. + Open, Schrödinger functional (SF), open-SF or periodic boundary conditions + in the time direction and periodic boundary conditions in the space + directions. + +* SU(3) gauge group, plaquette plus planar double-plaquette gauge action + (Wilson, Symanzik, Iwasaki,...). + +* O(a)-improved Wilson quarks in the fundamental representation of the gauge + group. Among the supported quark multiplets are the classical ones (pure + gauge, two-flavour theory, 2+1 and 2+1+1 flavour QCD), but doublets with a + twisted mass and theories with many doublets, for example, are also + supported. + +The O(a)-improvement includes the boundary counterterms required for the +improvement of the correlation functions near the boundaries of the lattice in +the time direction if open, SF or open-SF boundary conditions are chosen. + + +SIMULATION ALGORITHM + +The simulation program is based on the HMC algorithm. For the heavier quarks, +a version of the RHMC algorithm is used. 
Several advanced techniques are +implemented that can be configured at run time: + +* Nested hierarchical integrators for the molecular-dynamics equations, based + on any combination of the leapfrog, 2nd order Omelyan-Mryglod-Folk (OMF) and + 4th order OMF elementary integrators, are supported. + +* Twisted-mass Hasenbusch frequency splitting, with any number of factors + and twisted masses. Optionally with even-odd preconditioning. + +* Twisted-mass determinant reweighting. + +* Deflation acceleration and chronological solver along the molecular-dynamics + trajectories. + +* A choice of solvers (CGNE, MSCG, SAP+GCR, deflated SAP+GCR) for the Dirac + equation, separately configurable for each force component and + pseudo-fermion action. + +All of these depend on a number of parameters, whose values are passed to the +simulation program together with those of the action parameters (coupling +constants, quark masses, etc.) through a structured input parameter file. + + +PROGRAM FEATURES + +All programs parallelize in 0,1,2,3 or 4 dimensions, depending on what is +specified at compilation time. They are highly optimized for machines with +current Intel or AMD processors, but will run correctly on any system that +complies with the ISO C89 (formerly ANSI C) and the MPI 1.2 standards. + +For the purpose of testing and code development, the programs can also +be run on a desktop or laptop computer. All that is needed for this is +a compliant C compiler and a local MPI installation such as Open MPI. + + +DOCUMENTATION + +The simulation program has a modular form, with strict prototyping and a +minimal use of external variables. Each program file contains a small number +of externally accessible functions whose functionality is described at the top +of the file. + +The data layout is explained in various README files and detailed instructions +are given on how to run the main programs.
A set of further documentation +files are included in the doc directory, where the normalization conventions, +the chosen algorithms and other important program elements are described. + + +COMPILATION + +The compilation of the programs requires an ISO C89 compliant compiler and a +compatible MPI installation that complies with the MPI standard 1.2 (or later). + +In the main and devel directories, a GNU-style Makefile is included which +compiles and links the programs (just type "make" to compile everything; "make +clean" removes the files generated by "make"). The compiler options can be set +by editing the CFLAGS line in the Makefiles. + +The Makefiles assume that the following environment variables are set: + + GCC GNU C compiler command [Example: /usr/bin/gcc]. + + MPI_HOME MPI home directory [Example: /usr/lib64/mpi/gcc/openmpi]. + The mpicc command used is the one in $MPI_HOME/bin/mpicc and + the MPI libraries are expected in $MPI_HOME/lib. + + MPI_INCLUDE Directory where the mpi.h file is to be found. + +All programs are then compiled using the $MPI_HOME/bin/mpicc command. The +compiler options that can be set in the CFLAGS line depend on which C compiler +the mpicc command invokes (the GCC compiler command is only used to resolve +the dependencies on the include files). + + +SSE/AVX ACCELERATION + +Current Intel and AMD processors are able to perform arithmetic operations on +short vectors of floating-point numbers in just one or two machine cycles, +using SSE and/or AVX instructions. The arithmetic performed by these +instructions fully complies with the IEEE 754 standard. + +Many programs in the module directories include SSE and AVX inline-assembly +code. On 64bit systems, and if the GNU or Intel C compiler is used, the code +can be activated by setting the compiler flags -Dx64 and -DAVX, respectively.
+In addition, SSE prefetch instructions will be used if one of the following +options is specified: + + -DP4 Assume that prefetch instructions fetch 128 bytes at a time + (Pentium 4 and related Xeons). + + -DPM Assume that prefetch instructions fetch 64 bytes at a time + (Athlon, Opteron, Pentium M, Core, Core 2 and related Xeons). + + -DP3 Assume that prefetch instructions fetch 32 bytes at a time + (Pentium III). + +These options have an effect only if -Dx64 or -DAVX is set. The option +-DAVX implies -Dx64. + +On recent x86-64 machines with AMD Opteron or Intel Xeon processors, for +example, the recommended compiler flags are + + -std=c89 -O -mno-avx -DAVX -DPM + +For older machines that do not support the AVX instruction set, the +recommended flags are + + -std=c89 -O -mno-avx -Dx64 -DPM + +More aggressive optimization levels such as -O2 and -O3 tend to have little +effect on the execution speed of the programs, but the risk of generating +wrong code is higher. + +AVX instructions and the option -mno-avx may not be known to old versions +of the compilers, in which case one is limited to SSE accelerations with +option string -std=c89 -O -Dx64 -DPM. + + +DEBUGGING FLAGS + +For troubleshooting and parameter tuning, it may be helpful to switch on some +debugging flags at compilation time. The simulation program then prints a +detailed report to the log file on the progress made in the specified subprogram. + +The available flags are: + +-DCGNE_DBG CGNE solver. + +-DFGCR_DBG GCR solver. + +-DFGCR4VD_DBG GCR solver for the little Dirac equation. + +-DMSCG_DBG MSCG solver. + +-DDFL_MODES_DBG Deflation subspace generation. + +-DMDINT_DBG Integration of the molecular-dynamics equations. + +-DRWRAT_DBG Computation of the rational function reweighting + factor. + + +RUNNING A SIMULATION + +The simulation programs reside in the directory "main". For each program, +there is a README file in this directory which describes the program +functionality and its parameters.
+ +Running a simulation for the first time requires its parameters to be chosen, +which tends to be a non-trivial task. The syntax of the input parameter files +and the meaning of the various parameters is described in some detail in +main/README.infiles and doc/parms.pdf. Examples of valid parameter files are +contained in the directory main/examples. + + +EXPORTED FIELD FORMAT + +The field configurations generated in the course of a simulation are written +to disk in a machine-independent format (see modules/misc/archive.c). +Independently of the machine endianness, the fields are written in little +endian format. A byte-reordering is therefore not required when machines with +different endianness are used for the simulation and the physics analysis. + + +AUTHORS + +The initial release of the openQCD package was written by Martin Lüscher and +Stefan Schaefer. Support for Schrödinger functional boundary conditions was +added by John Bulava. Several modules were taken over from the DD-HMC program +tree, which includes contributions from Luigi Del Debbio, Leonardo Giusti, +Björn Leder and Filippo Palombi. + + +ACKNOWLEDGEMENTS + +In the course of the development of the openQCD code, many people suggested +corrections and improvements or tested preliminary versions of the programs. +The authors are particularly grateful to Isabel Campos, Dalibor Djukanovic, +Georg Engel, Leonardo Giusti, Björn Leder, Carlos Pena and Hubert Simma for +their communications and help. + + +LICENSE + +The software may be used under the terms of the GNU General Public Licence +(GPL). + + +BUG REPORTS + +If a bug is discovered, please send a report to . + + +ALTERNATIVE PACKAGES AND COMPLEMENTARY PROGRAMS + +There is a publicly available BG/Q version of openQCD that takes advantage of +the machine-specific features of IBM BlueGene/Q computers. The version is +available at . 
+ +The openQCD programs currently do not support reweighting in the quark +masses, but a module providing this functionality can be downloaded from +. + +Previously generated gauge-field configurations are often used as initial +configuration for a new run. If the old and new lattices or boundary +conditions are not the same, the old configuration may however need to be +adapted, using a field conversion tool such as the one available at +, before the new run is started. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/README new file mode 100644 index 0000000000000000000000000000000000000000..4a03581d0fb9138e14869b976c4e5927bf1ce7d5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/README @@ -0,0 +1,11 @@ + +This directory contains the check programs for the various modules. +They should better be executed, for the chosen lattice parameters, +before a simulation is started. + +The check programs for some of the program files that do not use the +MPI library (such as the random number generators) are included in the +directory "nompi". All other directories correspond to the module +directories with the same name, and the check programs in these are +MPI main programs. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..7d59c07d48402babefb5a874acb37354e53586e0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/INDEX @@ -0,0 +1,19 @@ + +Saving/restoring field configurations to/from files. + +check1 Writing and reading of gauge configurations. + +check2 Exporting and importing gauge configurations. + +check3 Importing a previously exported configuration. 
+ +check4 Writing and reading spinor fields. + +check5 Exporting and importing spinor fields. + +check6 Importing a previously exported spinor field. + +The programs check1,check2,check3 accept the option -bc that allows the +type of boundary condition to be chosen (open boundary conditions are assumed +if the option is not set). All other programs are insensitive to the boundary +conditions. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..0ecd1721ea0bdb18270d1ecb8054222cabaa38a4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/Makefile @@ -0,0 +1,134 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 check6 + +FLAGS = flags lat_parms hmc_parms dfl_parms + +LATTICE = bcnds uidx geometry + +ARCHIVE = archive sarchive + +LINALG = liealg cmatrix_dble salg_dble + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum uflds udcom + +SFLDS = sflds + +SU3FCTS = chexp cm3x3 random_su3 su3prod su3ren + +UTILS = endian mutils utils wspace + +MODULES = $(FLAGS) $(LATTICE) $(ARCHIVE) $(LINALG) $(RANDOM) $(UFLDS) \ + $(SFLDS) $(SU3FCTS) $(UTILS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + 
+LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/sflds:\ + $(MDIR)/su3fcts:$(MDIR)/utils + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog \ + *.log~ *.dat *.dat~ $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..50f57fd51f3a377ec01a48ff731e78873302545e --- 
/dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check1.c @@ -0,0 +1,220 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2007, 2010, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Writing and reading gauge configurations. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "su3fcts.h" +#include "linalg.h" +#include "archive.h" +#include "global.h" + +static int *rlxs_state[2],*rlxd_state[2]; /* slot [0] is filled by save_ranlux(), slot [1] by check_ranlux() */ + +/* Allocates the four state arrays and stores the states returned by rlxs_get() and rlxd_get() in slot [0]. */ +static void save_ranlux(void) +{ + int nlxs,nlxd; + int *p; + + nlxs=rlxs_size(); /* number of ints in one rlxs state array */ + nlxd=rlxd_size(); /* number of ints in one rlxd state array */ + + p=malloc(2*(nlxs+nlxd)*sizeof(*p)); /* single allocation, carved below into 2 arrays of nlxs ints and 2 arrays of nlxd ints */ + error(p==NULL,1,"save_ranlux [check1.c]", + "Unable to allocate state arrays"); + rlxs_state[0]=p; + p+=nlxs; + rlxs_state[1]=p; + p+=nlxs; + rlxd_state[0]=p; + p+=nlxd; + rlxd_state[1]=p; + + rlxs_get(rlxs_state[0]); /* slot [0] holds the states as they are at the time of this call */ + rlxd_get(rlxd_state[0]); +} + + +static int check_ranlux(void) +{ + int nlxs,nlxd,k,ie; + + nlxs=rlxs_size(); + nlxd=rlxd_size(); + + rlxs_get(rlxs_state[1]); + rlxd_get(rlxd_state[1]); + ie=0; + + for (k=0;k]"); + } + + MPI_Bcast(loc_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + alloc_wud(1); + + check_dir(loc_dir); + nsize=name_size("%s/testcnfg_%d",loc_dir,NPROC); + error_root(nsize>=NAME_SIZE,1,"main [check1.c]","loc_dir name is too long"); + sprintf(cnfg,"%s/testcnfg_%d",loc_dir,my_rank); + + if (my_rank==0) + { + printf("Write random field
configuration to the files\n" + "%s/testcnfg_*\n" + "on the local disks.\n\n",loc_dir); + printf("Then read the field from there, compare with the saved field\n" + "and remove all files.\n\n"); + } + + usv=reserve_wud(1); + udb=udfld(); + + random_ud(); + cm3x3_assign(4*VOLUME,udb,usv[0]); + save_ranlux(); + write_cnfg(cnfg); + + random_ud(); + read_cnfg(cnfg); + remove(cnfg); + error_chk(); + + ie=(check_bc(0.0)^0x1); + ie|=check_ud(usv[0]); + error(ie!=0,1,"main [check1.c]","The gauge field is not properly restored"); + + ie=check_ranlux(); + error(ie!=0,1,"main [check1.c]", + "The random number generator is not properly restored"); + print_flags(); + + if (my_rank==0) + { + printf("No errors detected --- the fields are correctly written\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check1.in new file mode 100644 index 0000000000000000000000000000000000000000..7a91706372cf5afb3ac77e539f402ba3c1cd4428 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check1.in @@ -0,0 +1 @@ +loc_dir /home/data/openQCD/cnfg diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..e159214fe76fbc10f891581bb5a7351c5e7d3b6a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check2.c @@ -0,0 +1,166 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2007, 2010, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Exporting and importing gauge configurations. 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "su3fcts.h" +#include "linalg.h" +#include "archive.h" +#include "global.h" + +/* Compares *u and *v member by member; returns 0 if all 18 real numbers (re and im of c11..c33) are equal, 1 otherwise. */ +static int cmp_ud(su3_dble *u,su3_dble *v) +{ + int it; +/* c1x members; the comparison is exact (!=), no tolerance is applied */ + it =((*u).c11.re!=(*v).c11.re); + it|=((*u).c11.im!=(*v).c11.im); + it|=((*u).c12.re!=(*v).c12.re); + it|=((*u).c12.im!=(*v).c12.im); + it|=((*u).c13.re!=(*v).c13.re); + it|=((*u).c13.im!=(*v).c13.im); +/* c2x members */ + it|=((*u).c21.re!=(*v).c21.re); + it|=((*u).c21.im!=(*v).c21.im); + it|=((*u).c22.re!=(*v).c22.re); + it|=((*u).c22.im!=(*v).c22.im); + it|=((*u).c23.re!=(*v).c23.re); + it|=((*u).c23.im!=(*v).c23.im); +/* c3x members */ + it|=((*u).c31.re!=(*v).c31.re); + it|=((*u).c31.im!=(*v).c31.im); + it|=((*u).c32.re!=(*v).c32.re); + it|=((*u).c32.im!=(*v).c32.im); + it|=((*u).c33.re!=(*v).c33.re); + it|=((*u).c33.im!=(*v).c33.im); + + return it; +} + + +static int check_ud(su3_dble *usv) +{ + int it; + su3_dble *u,*um; + + u=udfld(); + um=u+4*VOLUME; + it=0; + + for (;u]"); + } + + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + alloc_wud(1); + + check_dir_root(cnfg_dir); + nsize=name_size("%s/testcnfg",cnfg_dir); + error_root(nsize>=NAME_SIZE,1,"main [check2.c]","cnfg_dir name is too long"); + + if (my_rank==0) + { + printf("Export random field configurations to the file\n" + "%s/testcnfg.\n",cnfg_dir); + printf("Then read the fields from there and compare with the saved " + "fields.\n\n"); + } + + udb=udfld(); + usv=reserve_wud(1); + random_ud(); + cm3x3_assign(4*VOLUME,udb,usv[0]); + +
sprintf(cnfg,"%s/testcnfg",cnfg_dir); + export_cnfg(cnfg); + + random_ud(); + import_cnfg(cnfg); + error_chk(); + + ie=(check_bc(0.0)^0x1); + ie|=check_ud(usv[0]); + error(ie!=0,1,"main [check2.c]","The gauge field is not properly restored"); + print_flags(); + + if (my_rank==0) + { + printf("No errors detected --- the fields are correctly exported\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check2.in new file mode 100644 index 0000000000000000000000000000000000000000..87f9119fb146db92fab20e244c40fcac21ee1274 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check2.in @@ -0,0 +1 @@ +cnfg_dir /home/data/openQCD/cnfg diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..741c225b346f248a2350655363d16dc3bb631d61 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check3.c @@ -0,0 +1,141 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2007, 2008, 2010, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Importing a configuration previously exported by check2. 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "su3fcts.h" +#include "linalg.h" +#include "archive.h" +#include "global.h" + + +static double avg_plaq(void) +{ + double plaq; + + plaq=plaq_sum_dble(1); + + return plaq/((double)(6*NPROC)*(double)(VOLUME)); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,nsize,ir,ie; + stdint_t l[4]; + double phi[2],phi_prime[2]; + double plaq0,plaq1,plaq2; + char cnfg_dir[NAME_SIZE],cnfg[NAME_SIZE]; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check3.log","w",stdout); + fin=freopen("check3.in","r",stdin); + + printf("\n"); + printf("Importing gauge fields exported by check2\n"); + printf("-----------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("cnfg_dir","%s\n",cnfg_dir); + fclose(fin); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check3.c]", + "Syntax: check3 [-bc ]"); + } + + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,9876); + geometry(); + random_ud(); + plaq0=avg_plaq(); + + check_dir_root(cnfg_dir); + nsize=name_size("%s/testcnfg",cnfg_dir); + error_root(nsize>=NAME_SIZE,1,"main [check3.c]","cnfg_dir name is too long"); + sprintf(cnfg,"%s/testcnfg",cnfg_dir); + + if (my_rank==0) + { + 
fin=fopen(cnfg,"rb"); + error_root(fin==NULL,1,"main [check3.c]","Unable to open input file"); + + ir=fread(l,sizeof(stdint_t),4,fin); + ir+=fread(&plaq1,sizeof(double),1,fin); + error_root(ir!=5,1,"main [check3.c]","Incorrect read count"); + fclose(fin); + + if (endianness()==BIG_ENDIAN) + { + bswap_int(4,l); + bswap_double(1,&plaq1); + } + + printf("Random gauge field, average plaquette = %.15e\n\n",plaq0); + printf("Now read gauge field from file\n" + "%s:\n",cnfg); + printf("%dx%dx%dx%d lattice\n", + (int)(l[0]),(int)(l[1]),(int)(l[2]),(int)(l[3])); + printf("Average plaquette = %.15e\n",plaq1); + } + + import_cnfg(cnfg); + ie=check_bc(0.0); + plaq2=avg_plaq(); + error_chk(); + error(ie!=1,1,"main [check3.c]","Boundary conditions are not preserved"); + + if (my_rank==0) + { + printf("Should be = %.15e\n\n",plaq2); + remove(cnfg); + } + + print_flags(); + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check3.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check3.in new file mode 100644 index 0000000000000000000000000000000000000000..ddffa2380f2cafe172782e07bb0f9c6a24134554 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check3.in @@ -0,0 +1 @@ +cnfg_dir /home/data/openQCD/cnfg diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..cc7049ec847bfc31903eac0ce27e4ed0217ea066 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check4.c @@ -0,0 +1,112 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2007, 2013 Martin Luescher +* +* This software is distributed under the 
terms of the GNU General Public +* License (GPL) +* +* Writing and reading spinor fields. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "archive.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,nsize,k; + double d,dmax; + spinor_dble **psd; + char loc_dir[NAME_SIZE],name[NAME_SIZE]; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check4.log","w",stdout); + fin=freopen("check4.in","r",stdin); + + printf("\n"); + printf("Writing and reading spinor fields\n"); + printf("---------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("loc_dir","%s\n",loc_dir); + fclose(fin); + } + + MPI_Bcast(loc_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + + start_ranlux(0,123456); + geometry(); + alloc_wsd(6); + psd=reserve_wsd(6); + + check_dir(loc_dir); + nsize=name_size("%s/testsfld_%d.%d",loc_dir,NPROC,6); + error_root(nsize>=NAME_SIZE,1,"main [check4.c]","loc_dir name is too long"); + + for (k=0;k<3;k++) + { + random_sd(VOLUME,psd[k],1.0); + sprintf(name,"%s/testsfld_%d.%d",loc_dir,my_rank,k); + write_sfld(name,psd[k]); + } + + for (k=0;k<3;k++) + { + sprintf(name,"%s/testsfld_%d.%d",loc_dir,my_rank,k); + read_sfld(name,psd[k+3]); + remove(name); + } + + dmax=0.0; + + for (k=0;k<3;k++) + { + mulr_spinor_add_dble(VOLUME,psd[k],psd[k+3],-1.0); + d=norm_square_dble(VOLUME,1,psd[k]); + + if (d>dmax) + dmax=d; + } + + error_chk(); + + if (my_rank==0) + { + printf("Wrote 3 spinor fields to the files\n" + "%s/testsfld_*\n" 
+ "on the local disks. ",loc_dir); + printf("Then read the fields from there and removed\n" + "the files.\n\n"); + printf("Maximal deviation = %.1e ",sqrt(dmax)); + printf("(should be exactly equal to 0.0)\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check4.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check4.in new file mode 100644 index 0000000000000000000000000000000000000000..e4eb9a0f89f1afa33f7426c77c5479291f941b23 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check4.in @@ -0,0 +1 @@ +loc_dir /home/data/openQCD/scnfg diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check5.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check5.c new file mode 100644 index 0000000000000000000000000000000000000000..9d916277319e22d85b5faa0385ed055fd17c85ad --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check5.c @@ -0,0 +1,155 @@ + +/******************************************************************************* +* +* File check5.c +* +* Copyright (C) 2007, 2008, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Exporting and importing spinor fields. 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "archive.h" +#include "global.h" + +static const spinor_dble sd0={{{0.0}}}; +static spinor_dble **psd; + + +static void ptfld(int k) +{ + int x0,x1,x2,x3,y0,y1,y2,y3,ix; + spinor_dble *s; + + y0=L0*cpr[0]; + y1=L1*cpr[1]; + y2=L2*cpr[2]; + y3=L3*cpr[3]; + + for (x0=0;x0=NAME_SIZE,1,"main [check5.c]","sfld_dir name is too long"); + + for (k=0;k<3;k++) + { + random_sd(VOLUME,psd[k],1.0); + sprintf(name,"%s/testsfld%d",sfld_dir,k); + export_sfld(name,psd[k]); + } + + for (k=0;k<3;k++) + { + sprintf(name,"%s/testsfld%d",sfld_dir,k); + import_sfld(name,psd[k+3]); + remove(name); + } + + dmax=0.0; + + /* dmax accumulates the largest deviation found on this rank over the + three exported/re-imported fields */ + for (k=0;k<3;k++) + { + mulr_spinor_add_dble(VOLUME,psd[k],psd[k+3],-1.0); + d=norm_square_dble(VOLUME,0,psd[k]); + + if (d>dmax) + dmax=d; + } + + error_chk(); + + /* Send this rank's maximum (through d), so that rank 0 receives the + maximum over all ranks and all fields. Passing the loop's last value + of d would silently drop the deviations of fields 0 and 1. */ + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Exported 3 spinor fields to the directory\n" + "%s\n",sfld_dir); + printf("Then reimported and deleted them\n\n"); + printf("Maximal deviation = %.1e ",sqrt(dmax)); + printf("(should be exactly equal to 0.0)\n\n"); + } + + ptfld(4); + sprintf(name,"%s/testsfld",sfld_dir); + export_sfld(name,psd[4]); + + if (my_rank==0) + { + printf("Point source field exported to file\n" + "%s\n\n",name); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check5.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check5.in new file mode 100644 index 0000000000000000000000000000000000000000..6a8c2bccbb02d4c50c1d1c6f2657af3cea1e87aa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check5.in @@
-0,0 +1 @@ +sfld_dir /home/data/openQCD/scnfg diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check6.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check6.c new file mode 100644 index 0000000000000000000000000000000000000000..6fa80bc399a34266f232af9b5b7bd8f4346304a7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check6.c @@ -0,0 +1,124 @@ + +/******************************************************************************* +* +* File check6.c +* +* Copyright (C) 2007, 2008, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Importing a previously exported spinor field. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "archive.h" +#include "global.h" + +static spinor_dble **psd; +static const spinor_dble sd0={{{0.0}}}; + + +static void ptfld(int k) +{ + int x0,x1,x2,x3,y0,y1,y2,y3,ix; + spinor_dble *s; + + y0=L0*cpr[0]; + y1=L1*cpr[1]; + y2=L2*cpr[2]; + y3=L3*cpr[3]; + + for (x0=0;x0=NAME_SIZE,1,"main [check6.c]","sfld_dir name is too long"); + sprintf(name,"%s/testsfld",sfld_dir); + + import_sfld(name,psd[1]); + + mulr_spinor_add_dble(VOLUME,psd[0],psd[1],-1.0); + d=norm_square_dble(VOLUME,1,psd[0]); + + error_chk(); + + if (my_rank==0) + { + printf("Imported field from file\n" + "%s\n\n",name); + printf("Deviation = %.1e ",sqrt(d)); + printf("(should be exactly equal to 0.0)\n\n"); + remove(name); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check6.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check6.in new 
file mode 100644 index 0000000000000000000000000000000000000000..db4406f006b28dc110bda1b66a39a5aebaf65eed --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/archive/check6.in @@ -0,0 +1 @@ +sfld_dir /home/data/openQCD/scnfg \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..ca84d1525e49de3d9f3545c0039b9e1e8b5eafd8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/INDEX @@ -0,0 +1,21 @@ + +Block grid allocation and field assignment programs + +check1 Checks on the local geometry arrays b.ipt,b.iup,b.idn, + b.imb,b.ibp and b.bb.ipp,b.bb.map,b.bb.imb for the + known block grids. This program also checks ipt_blk(). + +check2 Checks on the allocation and initialization of the gauge, + Dirac and Weyl fields on the known block grids. + +check3 Check of assign_ud2ubgr() and assign_ud2udblk(). + +check4 Check of assign_swd2swbgr() and assign_swd2swdblk(). + +check5 Check of assign_s2sblk(),...,assign_sdblk2sd(). + +The programs check1, check3 and check4 accept the option -bc that +allows the type of boundary condition to be chosen at runtime. When the option +is not set, open boundary conditions are assumed. + +The option may be set but has no effect in the case of the other programs. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..4c89cb557ddd1b02d70ed97ed4acbf52bd51b06e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/Makefile @@ -0,0 +1,139 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 + +FLAGS = flags lat_parms sap_parms dfl_parms + +RANDOM = ranlux ranlxs ranlxd gauss + +LATTICE = bcnds ftidx uidx geometry + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +UFLDS = uflds udcom shift + +SFLDS = sflds Pbnd + +LINALG = salg salg_dble liealg cmatrix_dble + +SU3FCTS = su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +TCHARGE = ftcom ftensor + +SW_TERM = pauli pauli_dble swflds sw_term + +SAP = sap_com + +MODULES = $(FLAGS) $(RANDOM) $(LATTICE) $(BLOCK) $(UFLDS) $(SFLDS) \ + $(LINALG) $(SU3FCTS) $(UTILS) $(TCHARGE) $(SW_TERM) $(SAP) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/random:$(MDIR)/lattice:$(MDIR)/block:\ + $(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + 
$(MDIR)/linalg:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/sap + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..c41511d388837da834a544fa25919765fa24e678 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check1.c @@ -0,0 +1,525 @@ + 
+/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Consistency checks on the geometry arrays in the known block grids. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "block.h" +#include "global.h" + +static int ix_test[VOLUME+BNDRY]; + + +static void test1(blk_grid_t grid,int *bs) +{ + int ix,iy,itest; + int nb,isw,vol,*imb; + block_t *b,*bm; + + for (ix=0;ix=VOLUME)) + itest=2; + else + { + ix_test[iy]+=1; + if (ix_test[iy]>1) + itest=3; + } + } + } + + for (ix=0;ixvol)) + itest=1; + else + { + if (iy!=imb[ix]) + itest=2; + + is=(x0+x1+x2+x3+bo[0]+bo[1]+bo[2]+bo[3])%2; + + if (((is==0)&&(ix>=(vol/2)))||((is!=0)&&(ix<(vol/2)))) + itest=3; + + for (mu=0;mu<4;mu++) + { + if ((x[mu]+1)0) + { + if (imb[(*b).idn[ix][mu]]!=idn[iy][mu]) + itest=6; + } + else + { + if ((*b).idn[ix][mu]!=vol) + itest=7; + } + } + } + } + } + } + } + } + + error(itest==1,1,"test2 [check1.c]", + "b.ipt is out of range"); + error(itest==2,1,"test2 [check1.c]", + "The blocks are not properly embedded"); + error(itest==3,1,"test2 [check1.c]", + "b.ipt does not respect the even-odd ordering"); + error(itest==4,1,"test2 [check1.c]", + "b.iup is incorrect"); + error(itest==5,1,"test2 [check1.c]", + "b.iup is incorrect at the block boundary"); + error(itest==6,1,"test2 [check1.c]", + "b.idn is incorrect"); + error(itest==7,1,"test2 [check1.c]", + "b.idn is incorrect at the block boundary"); +} + + +static void test3(blk_grid_t grid) +{ + int bc,ix,iy,ie,itest; + int nbp,nall,x[4]; + int nb,isw,vol,*bs,*imb; + block_t *b,*bm; + + bc=bc_type(); + itest=0; + nall=0; + b=blk_list(grid,&nb,&isw); + 
bm=b+nb; + + for (;b=vol)) + itest=2; + + if (iy>0) + { + if (ix<=(*b).ibp[iy-1]) + itest=3; + } + + ix=imb[ix]; + ie=((global_time(ix)==0)&&(bc!=3)); + ie|=((global_time(ix)==(NPROC0*L0-1))&&(bc==0)); + + if (ie==0) + itest=4; + } + } + + if ((cpr[0]==0)&&(bc!=3)) + nall-=(L1*L2*L3); + if ((cpr[0]==(NPROC0-1))&&(bc==0)) + nall-=(L1*L2*L3); + + error(itest==1,1,"test3 [check1.c]", + "b.nbp is incorrect"); + error(itest==2,1,"test3 [check1.c]", + "b.ibp is out of range"); + error(itest==3,1,"test3 [check1.c]", + "b.ibp is not properly ordered"); + error(itest==4,1,"test3 [check1.c]", + "The points b.ibp are not all on the boundary of the lattice"); + error(nall!=0,1,"test3 [check1.c]", + "Incorrect total count of points at time 0 and NPROC0*L0-1"); +} + + +static void test4(blk_grid_t grid) +{ + int ix,iy,ifc,mu,ib,itest; + int nb,isw,vol,*bs,*imb; + block_t *b,*bm; + bndry_t *bb; + + for (ix=0;ix<(VOLUME+BNDRY);ix++) + ix_test[ix]=0; + + itest=0; + b=blk_list(grid,&nb,&isw); + bm=b+nb; + + for (;b=vol)) + itest=2; + + iy=(*bb).imb[ix]; + + if ((iy<0)||(iy>=(VOLUME+BNDRY))) + itest=3; + else + ix_test[iy]+=1; + } + + bb+=1; + } + } + + b=blk_list(grid,&nb,&isw); + + for (;b=(vol/2))) + itest=1; + + iy=(*bb).imb[ix]; + iz=(*bb).imb[ix+(*bb).vol/2]; + + if ((iy>=(VOLUME+(BNDRY/2)))||((iy>=(VOLUME/2))&&(iy=VOLUME)&&(iz<(VOLUME+(BNDRY/2))))) + itest=2; + } + + mu=ifc/2; + + for (ix=0;ix<(*bb).vol;ix++) + { + iy=(*bb).ipp[ix]; + + if ((((ifc%2)==0)&&((*b).idn[iy][mu]!=vol))|| + (((ifc%2)==1)&&((*b).iup[iy][mu]!=vol))) + itest=3; + + iz=(*bb).map[ix]; + + if ((ifc%2)==0) + { + for (is=1;is]"); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + geometry(); + set_sap_parms(bs,0,1,1); + set_dfl_parms(bs,2); + grid=BLK_GRIDS; + + for (igr=0;igr<(int)(BLK_GRIDS);igr++) + { + if 
(igr==0) + grid=SAP_BLOCKS; + else if (igr==1) + grid=DFL_BLOCKS; + else + error_root(1,1,"main [check1.c]","Unknown block grid"); + + alloc_bgr(grid); + + test1(grid,bs); + test2(grid); + test3(grid); + test4(grid); + test5(grid); + } + + error_chk(); + + if (my_rank==0) + { + printf("No errors detected\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check1.in new file mode 100644 index 0000000000000000000000000000000000000000..bd654839cac6ab535881018a1109ac0080e8af27 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check1.in @@ -0,0 +1 @@ +bs 4 4 4 4 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..c959c52188120362af34b81d223087772d049193 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check2.c @@ -0,0 +1,588 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Checks on the allocation and initialization of the gauge, Dirac and Weyl +* fields on the known block grids. 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "block.h" +#include "global.h" + +typedef union +{ + su3 u; + float r[18]; +} umat_t; + +typedef union +{ + su3_dble u; + double r[18]; +} umat_dble_t; + +typedef union +{ + spinor s; + float r[24]; +} spin_t; + +typedef union +{ + spinor_dble s; + double r[24]; +} spin_dble_t; + +typedef union +{ + weyl w; + float r[12]; +} wspin_t; + +typedef union +{ + weyl_dble w; + double r[12]; +} wspin_dble_t; + + +static int check_u(int vol,su3 *u) +{ + int i; + umat_t *m,*mm; + + m=(umat_t*)(u); + mm=m+vol; + + for (;m=6)&&((*sw).u[i]!=0.0f)))) + return 1; + } + } + + return 0; +} + + +static int check_swd(int vol,pauli_dble *swd) +{ + int i; + pauli_dble *sm; + + sm=swd+vol; + + for (;swd=6)&&((*swd).u[i]!=0.0)))) + return 1; + } + } + + return 0; +} + + +static int check_s(int ns,int vol,spinor **s) +{ + int k,i; + spin_t *sp,*sm; + + for (k=0;k0) + { + if (((*b).s==NULL)||((shf&0x10)&&((*b0).s!=(*b).s))) + return 5; + if (check_s(ns,vol,(*b).s)) + return 5; + } + else + { + if ((*b).s!=NULL) + return 5; + } + + if ((*b).nsd!=nsd) + return 6; + + if (nsd>0) + { + if (((*b).sd==NULL)||((shf&0x20)&&((*b0).sd!=(*b).sd))) + return 6; + if (check_sd(nsd,vol,(*b).sd)) + return 6; + } + else + { + if ((*b).sd!=NULL) + return 6; + } + + return 0; +} + + +/* + * Checks the 8 boundary structures of block b against the expected + * allocation (iub,iudb: single/double-precision gauge fields present; + * nw,nwd: expected numbers of Weyl fields). If the corresponding share + * bit is set in shf, the field pointers must coincide with those of the + * reference block b0. Returns 0 if all checks pass and otherwise the + * code 7 (u), 8 (ud), 9 (w) or 10 (wd) of the first failing check. + */ +static int check_bnd(block_t *b0,block_t *b, + int iub,int iudb,int nw,int nwd,int shf) +{ + int vol,ifc; + bndry_t *bb0,*bb; + + bb0=(*b0).bb; /* reference block; taking (*b).bb here would compare b with itself */ + bb=(*b).bb; + + for (ifc=0;ifc<8;ifc++) + { + vol=(*bb).vol; + + if (iub==1) + { + if (((*bb).u==NULL)||((shf&0x4)&&((*bb0).u!=(*bb).u))) + return 7; + if (check_u(vol,(*bb).u)) + return 7; + } + else + { + if ((*bb).u!=NULL) + return 7; + } + + if (iudb==1) + { + if (((*bb).ud==NULL)||((shf&0x8)&&((*bb0).ud!=(*bb).ud)))
+ return 8; + if (check_ud(vol,(*bb).ud)) + return 8; + } + else + { + if ((*bb).ud!=NULL) + return 8; + } + + if ((*bb).nw!=nw) + return 9; + + if (nw>0) + { + if (((*bb).w==NULL)||((shf&0x40)&&((*bb0).w!=(*bb).w))) + return 9; + if (check_w(nw,vol,(*bb).w)) + return 9; + } + else + { + if ((*bb).w!=NULL) + return 9; + } + + if ((*bb).nwd!=nwd) + return 10; + + if (nwd>0) + { + if (((*bb).wd==NULL)||((shf&0x80)&&((*bb0).wd!=(*bb).wd))) + return 10; + if (check_wd(nwd,vol,(*bb).wd)) + return 10; + } + else + { + if ((*bb).wd!=NULL) + return 10; + } + + bb0+=1; + bb+=1; + } + + return 0; +} + +int main(int argc,char *argv[]) +{ + int my_rank,n,n0,n1,n2,n3; + int igr,bs[4],nb,isw,itest; + int iu,iud,ns,nsd; + int iub,iudb,nw,nwd; + int shg,shu,shud,shs,shsd,shw,shwd,shf; + block_t *b0,*b; + blk_grid_t grid; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + fin=freopen("check1.in","r",stdin); + + printf("\n"); + printf("Checks on the allocation and initialization of the gauge,\n" + "Dirac and Weyl fields on the known block grids.\n"); + printf("---------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + fclose(fin); + + printf("bs = %d %d %d %d\n\n",bs[0],bs[1],bs[2],bs[3]); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + geometry(); + set_sap_parms(bs,0,1,1); + set_dfl_parms(bs,2); + grid=BLK_GRIDS; + + for (igr=0;igr<(int)(BLK_GRIDS);igr++) + { + iu=0; + iud=0; + ns=0; + nsd=0; + iub=0; + iudb=0; + nw=0; + nwd=0; + + shg=1; + shu=0; + shud=0; + shs=0; + shsd=0; + shw=0; + shwd=0; + + if (igr==0) + { + grid=SAP_BLOCKS; + + iu=1; + ns=3; + nw=1; + iub=1; + shs=1; + } + else if (igr==1) 
+ { + grid=DFL_BLOCKS; + + iud=1; + ns=3; + nsd=3; + shud=1; + } + else + error_root(1,1,"main [check2.c]","Unknown block grid"); + + shf=0x1|(shg<<1)|(shu<<2)|(shud<<3)|(shs<<4)|(shsd<<5)|(shw<<6)|(shwd<<7); + alloc_bgr(grid); + print_grid_flags(grid); + b0=blk_list(grid,&nb,&isw); + + n0=L0/bs[0]; + n1=L1/bs[1]; + n2=L2/bs[2]; + n3=L3/bs[3]; + n=n0*cpr[0]+n1*cpr[1]+n2*cpr[2]+n3*cpr[3]; + + error((b0==NULL)||(nb!=(n0*n1*n2*n3))||(isw!=(n%2)),1, + "main [check2.c]","Incorrect return values of blk_list"); + + if (my_rank==0) + { + printf("Share flag on the blocks = %#x\n",(*b0).shf); + printf("Should be %#x\n\n",shf); + } + + itest=0; + + for (n=0;nL0)|| + ((*b).bo[1]<0)||(((*b).bo[1]+bs[1])>L1)|| + ((*b).bo[2]<0)||(((*b).bo[2]+bs[2])>L2)|| + ((*b).bo[3]<0)||(((*b).bo[3]+bs[3])>L3),1, + "main [check2.c]","b.bo is out of range"); + + error((((*b).bo[0]%bs[0])!=0)||(((*b).bo[1]%bs[1])!=0)|| + (((*b).bo[2]%bs[2])!=0)||(((*b).bo[3]%bs[3])!=0),1, + "main [check2.c]","b.bo is not an integer multiple of bs"); + + n0=(*b).bo[0]/bs[0]; + n1=(*b).bo[1]/bs[1]; + n2=(*b).bo[2]/bs[2]; + n3=(*b).bo[3]/bs[3]; + + isw=(n0+n1+n2+n3)%2; + + error(((isw==0)&&(n>=(nb/2)))||((isw==1)&&(n<(nb/2))),1, + "main [check2.c]","Blocks are not locally even-odd ordered"); + + itest=check_blk(b0,b,iu,iud,ns,nsd,shf); + if (itest!=0) + break; + + error((*b).bb==NULL,1,"main [check2.c]", + "Block boundaries are not allocated"); + + itest=check_bnd(b0,b,iub,iudb,nw,nwd,shf); + if (itest!=0) + break; + } + + error(itest==1,1,"main [check2.c]","Unexpected share flag"); + error(itest==2,1,"main [check2.c]","Geometry arrays are not shared"); + error(itest==3,1,"main [check2.c]", + "b.u or b.sw is not in the proper condition"); + error(itest==4,1,"main [check2.c]", + "b.ud or b.swd is not in the proper condition"); + error(itest==5,1,"main [check2.c]", + "b.s is not in the proper condition"); + error(itest==6,1,"main [check2.c]", + "b.sd is not in the proper condition"); + error(itest==7,1,"main 
[check2.c]", + "b.bb.u is not in the proper condition"); + error(itest==8,1,"main [check2.c]", + "b.bb.ud is not in the proper condition"); + error(itest==9,1,"main [check2.c]", + "b.bb.w is not in the proper condition"); + error(itest==10,1,"main [check2.c]", + "b.bb.wd is not in the proper condition"); + } + + error_chk(); + + if (my_rank==0) + { + printf("No errors detected\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..02189a97b31373c65f302c8fa3297c385b0cc8a6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/block/check3.c @@ -0,0 +1,483 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of assign_ud2ubgr() and assign_ud2udblk(). 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "block.h" +#include "global.h" + +typedef union +{ + su3 u; + float r[18]; +} umat_t; + +typedef union +{ + su3_dble u; + double r[18]; +} umat_dble_t; + + +static void set_ud(void) +{ + int x0,x1,x2,x3,ix; + int y0,y1,y2,y3,ifc; + su3_dble *udb,*ud; + + random_ud(); + udb=udfld(); + + for (x0=0;x0]"); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + set_sap_parms(bs,0,1,1); + set_dfl_parms(bs,2); + alloc_bgr(SAP_BLOCKS); + alloc_bgr(DFL_BLOCKS); + + set_ud(); + assign_ud2ubgr(SAP_BLOCKS); + assign_ud2u(); + print_flags(); + print_grid_flags(SAP_BLOCKS); + + error(check_ubgr(SAP_BLOCKS),1,"main [check3.c]", + "assign_ud2ubgr() is incorrect"); + + b=blk_list(DFL_BLOCKS,&nb,&isw); + random_ud(); + assign_ud2udblk(DFL_BLOCKS,0); + set_ud(); + ie=0; + + for (n=0;n +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sw_term.h" +#include "block.h" +#include "global.h" + + +static int cmp_sw(pauli *r,pauli *s) +{ + int i; + + for (i=0;i<36;i++) + { + if ((r[0].u[i]!=s[0].u[i])||(r[1].u[i]!=s[1].u[i])) + return 1; + } + + return 0; +} + + +static int cmp_swd(pauli_dble *r,pauli_dble *s) +{ + int i; + + for (i=0;i<36;i++) + { + if ((r[0].u[i]!=s[0].u[i])||(r[1].u[i]!=s[1].u[i])) + return 1; + } + + return 0; +} + + +static int check_sw(block_t *b) +{ + int x0,x1,x2,x3,x[4]; + int y0,y1,y2,y3,ix,iy; + pauli *sw; + + sw=swfld(); + + for 
(x0=0;x0<(*b).bs[0];x0++) + { + for (x1=0;x1<(*b).bs[1];x1++) + { + for (x2=0;x2<(*b).bs[2];x2++) + { + for (x3=0;x3<(*b).bs[3];x3++) + { + x[0]=x0; + x[1]=x1; + x[2]=x2; + x[3]=x3; + + y0=(*b).bo[0]+x0; + y1=(*b).bo[1]+x1; + y2=(*b).bo[2]+x2; + y3=(*b).bo[3]+x3; + + ix=ipt_blk(b,x); + iy=ipt[y3+L3*y2+L2*L3*y1+L1*L2*L3*y0]; + + if (cmp_sw((*b).sw+2*ix,sw+2*iy)) + return 1; + } + } + } + } + + return 0; +} + + +static int check_swbgr(blk_grid_t grid) +{ + int nb,isw; + block_t *b,*bm; + + b=blk_list(grid,&nb,&isw); + bm=b+nb; + + for (;b]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.301,0.789,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + + set_sap_parms(bs,0,1,1); + set_dfl_parms(bs,2); + alloc_bgr(SAP_BLOCKS); + alloc_bgr(DFL_BLOCKS); + + set_sw_parms(0.05); + random_ud(); + ifail=0; + + for (iset=0;iset<(int)(PT_SETS);iset++) + { + if (iset==0) + set=ALL_PTS; + else if (iset==1) + set=EVEN_PTS; + else if (iset==2) + set=ODD_PTS; + else + set=NO_PTS; + + sw_term(NO_PTS); + ifail+=assign_swd2swbgr(SAP_BLOCKS,set); + ifail+=sw_term(set); + assign_swd2sw(); + error(check_swbgr(SAP_BLOCKS)!=0,1,"main [check4.c]", + "assign_swd2swbgr() is incorrect"); + + b=blk_list(DFL_BLOCKS,&nb,&isw); + + for (n=0;n +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "sw_term.h" +#include "block.h" +#include "global.h" + +typedef union +{ + spinor s; + float r[24]; +} spin_t; + +typedef union +{ + spinor_dble s; + double r[24]; +} spin_dble_t; + + +static int cmp_s(spinor *r,spinor *s) +{ + int i; + spin_t *rr,*rs; + + rr=(spin_t*)(r); + rs=(spin_t*)(s); + + for (i=0;i<24;i++) + { + if ((*rr).r[i]!=(*rs).r[i]) + return 1; 
+ } + + return 0; +} + + +static int cmp_sd(spinor_dble *r,spinor_dble *s) +{ + int i; + spin_dble_t *rr,*rs; + + rr=(spin_dble_t*)(r); + rs=(spin_dble_t*)(s); + + for (i=0;i<24;i++) + { + if ((*rr).r[i]!=(*rs).r[i]) + return 1; + } + + return 0; +} + + +static int check_sb(block_t *b,ptset_t set,int k,spinor *s) +{ + int x0,x1,x2,x3,x[4]; + int y0,y1,y2,y3; + int ix,iy,is,n0,n1; + + for (x0=0;x0<(*b).bs[0];x0++) + { + for (x1=0;x1<(*b).bs[1];x1++) + { + for (x2=0;x2<(*b).bs[2];x2++) + { + for (x3=0;x3<(*b).bs[3];x3++) + { + x[0]=x0; + x[1]=x1; + x[2]=x2; + x[3]=x3; + + y0=(*b).bo[0]+x0; + y1=(*b).bo[1]+x1; + y2=(*b).bo[2]+x2; + y3=(*b).bo[3]+x3; + + ix=ipt_blk(b,x); + iy=ipt[y3+L3*y2+L2*L3*y1+L1*L2*L3*y0]; + is=(y0+y1+y2+y3)%2; + + n0=((is==0)&&((set==ALL_PTS)||(set==EVEN_PTS))); + n1=((is==1)&&((set==ALL_PTS)||(set==ODD_PTS))); + + if ((n0==1)||(n1==1)) + { + if (cmp_s((*b).s[k]+ix,s+iy)) + return 1; + } + } + } + } + } + + return 0; +} + + +static int check_sdb(block_t *b,ptset_t set,int k,spinor_dble *sd) +{ + int x0,x1,x2,x3,x[4],n0,n1; + int y0,y1,y2,y3; + int ix,iy,is; + + for (x0=0;x0<(*b).bs[0];x0++) + { + for (x1=0;x1<(*b).bs[1];x1++) + { + for (x2=0;x2<(*b).bs[2];x2++) + { + for (x3=0;x3<(*b).bs[3];x3++) + { + x[0]=x0; + x[1]=x1; + x[2]=x2; + x[3]=x3; + + y0=(*b).bo[0]+x0; + y1=(*b).bo[1]+x1; + y2=(*b).bo[2]+x2; + y3=(*b).bo[3]+x3; + + ix=ipt_blk(b,x); + iy=ipt[y3+L3*y2+L2*L3*y1+L1*L2*L3*y0]; + is=(y0+y1+y2+y3)%2; + + n0=((is==0)&&((set==ALL_PTS)||(set==EVEN_PTS))); + n1=((is==1)&&((set==ALL_PTS)||(set==ODD_PTS))); + + if ((n0==1)||(n1==1)) + { + if (cmp_sd((*b).sd[k]+ix,sd+iy)) + return 1; + } + } + } + } + } + + return 0; +} + + +static int diff_s(int vol,spinor *s,spinor *r) +{ + spinor *sm; + + sm=s+vol; + + for (;s that allows the +type of boundary condition to be chosen at runtime. When the option is not +set, open boundary conditions are assumed. 
+ +The option may be set but has no effect in the case of check3 and check4 (the +boundary conditions are selected through the input parameter file in these +cases). diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..93081532918d5b0b1851290c14ffe0088b41ec58 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/Makefile @@ -0,0 +1,155 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 + +FLAGS = flags lat_parms sap_parms dfl_parms + +LATTICE = bcnds ftidx uidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix + +LINSOLV = fgcr fgcr4vd + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum shift uflds udcom + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +TCHARGE = ftcom ftensor + +SW_TERM = pauli pauli_dble swflds sw_term + +DIRAC = Dw_dble Dw Dw_bnd + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +SAP = blk_solv sap_com sap sap_gcr + +ARCHIVE = archive + +DFL = dfl_geometry dfl_subspace ltl_gcr dfl_sap_gcr dfl_modes + +VFLDS = vflds vinit vcom vdcom + +LITTLE = Aw_gen Aw_com Aw_ops Aw_dble Aw 
ltl_modes + +MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + $(BLOCK) $(SAP) $(ARCHIVE) $(DFL) $(VFLDS) $(LITTLE) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:$(MDIR)/dfl:\ + $(MDIR)/vflds:$(MDIR)/little + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + +# -DFGCR_DBG -DFGCR4VD_DBG -DDFL_MODES_DBG + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog 
*.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..5b8244302847e4cab335c9667ffc604eb0719d24 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check1.c @@ -0,0 +1,286 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2007, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the DFL_BLOCKS grid geometry arrays. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "block.h" +#include "dfl.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,bs[4],nbs,isw; + int nb,nbb,*nbbe,*nbbo,*obbe,*obbo; + int (*inn)[8],*idx,*ipp,*map; + int ix,iy,iz,ifc,ie; + int *bo1,*bo2; + int l[4],mu,is; + double phi[2],phi_prime[2]; + block_t *b; + dfl_grid_t dfl_grid; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + fin=freopen("check1.in","r",stdin); + + printf("\n"); + printf("Check of the DFL_BLOCKS grid geometry arrays\n"); + printf("--------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + 
fclose(fin); + + printf("bs = %d %d %d %d\n\n",bs[0],bs[1],bs[2],bs[3]); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check1.c]", + "Syntax: check1 [-bc ]"); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + geometry(); + set_dfl_parms(bs,4); + dfl_grid=dfl_geometry(); + nb=dfl_grid.nb; + nbbe=dfl_grid.nbbe; + nbbo=dfl_grid.nbbo; + obbe=dfl_grid.obbe; + obbo=dfl_grid.obbo; + + alloc_bgr(DFL_BLOCKS); + b=blk_list(DFL_BLOCKS,&nbs,&isw); + + error((bs[0]!=(*b).bs[0])||(bs[1]!=(*b).bs[1])||(bs[2]!=(*b).bs[2])|| + (bs[3]!=(*b).bs[3])||(nb!=nbs),1,"main [check1.c]", + "Block sizes bs are incorrectly set or incorrect block number"); + + ie=0; + nbb=(nbbe[0]+nbbo[0]); + + if (obbe[0]!=0) + ie=1; + if (obbo[0]!=(obbe[7]+nbbe[7])) + ie=2; + + for (ifc=1;ifc<8;ifc++) + { + nbb+=(nbbe[ifc]+nbbo[ifc]); + + if (obbe[ifc]!=(obbe[ifc-1]+nbbe[ifc-1])) + ie=1; + if (obbo[ifc]!=(obbo[ifc-1]+nbbo[ifc-1])) + ie=2; + } + + error(nbb!=dfl_grid.nbb,1,"main [check1.c]","nbb is incorrect"); + error(ie==1,1,"main [check1.c]","Incorrect offsets obbe[ifc]"); + error(ie==2,1,"main [check1.c]","Incorrect offsets obbo[ifc]"); + + inn=dfl_grid.inn; + idx=dfl_grid.idx; + ipp=dfl_grid.ipp; + map=dfl_grid.map; + iz=0; + + for (ifc=0;ifc<8;ifc++) + { + for (ix=obbe[ifc];ix<(obbe[ifc]+nbbe[ifc]);ix++) + { + iy=ipp[ix]; + + if ((ix>obbe[ifc])&&(iy<=iz)) + ie=1; + + if (inn[iy][ifc]!=(nb+ix)) + ie=3; + + iz=iy; + } + + for (ix=obbo[ifc];ix<(obbo[ifc]+nbbo[ifc]);ix++) + { + iy=ipp[ix]; + + if ((ix>obbo[ifc])&&(iy<=iz)) + ie=2; + + if (inn[iy][ifc]!=(nb+ix)) + ie=3; + + iz=iy; + } + } + + error(ie==1,1,"main [check1.c]","Incorrect ipp at even boundary points"); + error(ie==2,1,"main [check1.c]","Incorrect ipp at odd boundary points"); + error(ie==3,1,"main 
[check1.c]","ipp and inn are inconsistent"); + + for (ix=0;ix0)&&(ix<(nb/2)))||(ix>(nb/2))) + { + if (idx[ix]!=idx[ix-1]+1) + ie=2; + } + + if (((ix==0)&&(isw==0))||((ix==(nb/2))&&(isw==1))) + { + bo1=b[idx[ix]].bo; + + for (mu=0;mu<4;mu++) + { + if (bo1[mu]!=0) + ie=3; + } + } + } + + error(ie==1,1,"main [check1.c]","Index array idx[ix] is not involutive"); + error(ie==2,1,"main [check1.c]","The ordering of idx[ix] is incorrect"); + error(ie==3,1,"main [check1.c]","Index of the first block is incorrect "); + + for (ix=0;ix=(nb+nbb))) + ie=1; + else + { + if (iy +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "flags.h" +#include "lattice.h" +#include "block.h" +#include "linalg.h" +#include "sflds.h" +#include "vflds.h" +#include "dfl.h" +#include "global.h" + + +static void check_basis(int Ns,double *dev0,double *dev1) +{ + int nb,isw,i,j; + double dev,x[2],y[2]; + complex_dble z; + block_t *b,*bm; + + b=blk_list(DFL_BLOCKS,&nb,&isw); + bm=b+nb; + + x[0]=0.0; + x[1]=0.0; + + for (;bx[0]) + x[0]=dev; + } + + assign_s2sd((*b).vol,(*b).s[i],(*b).sd[0]); + mulr_spinor_add_dble((*b).vol,(*b).sd[0],(*b).sd[i],-1.0); + dev=norm_square_dble((*b).vol,0,(*b).sd[0]); + dev=sqrt(dev); + + if (dev>x[1]) + x[1]=dev; + } + } + + if (NPROC>1) + { + MPI_Reduce(x,y,2,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(y,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + (*dev0)=y[0]; + (*dev1)=y[1]; + } + else + { + (*dev0)=x[0]; + (*dev1)=x[1]; + } +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,i; + int bs[4],Ns,nv; + double phi[2],phi_prime[2]; + double dev,dev0,dev1; + complex **vm,**wv,z; + complex_dble **wvd; + spinor **ws; + spinor_dble **wsd; + FILE *fin=NULL,*flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + fin=freopen("check2.in","r",stdin); + + printf("\n"); + printf("Check of the programs in the module 
dfl_subspace.c\n"); + printf("--------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + read_line("Ns","%d",&Ns); + fclose(fin); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("Ns = %d\n\n",Ns); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + set_dfl_parms(bs,Ns); + + alloc_ws(Ns+1); + alloc_wsd(1); + alloc_wv(2); + alloc_wvd(2); + + ws=reserve_ws(Ns+1); + wsd=reserve_wsd(1); + vm=vflds()+Ns; + wv=reserve_wv(2); + wvd=reserve_wvd(2); + nv=Ns*VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + + for (i=0;idev0) + dev0=dev; + + assign_s2s(VOLUME,ws[i],ws[Ns]); + dfl_sub_v2s(vm[i],ws[Ns]); + dev=(double)(norm_square(VOLUME,1,ws[Ns])/ + norm_square(VOLUME,1,ws[i])); + if (dev>dev1) + dev1=dev; + } + + if (my_rank==0) + { + printf("Check of the single-precision vector modes:\n"); + printf("Using dfl_v2s: %.1e\n",sqrt(dev0)); + printf("Using dfl_sub_v2s: %.1e\n\n",sqrt(dev1)); + } + + random_v(nv,wv[0],1.0f); + random_vd(nv,wvd[0],1.0); + + dfl_v2s(wv[0],ws[Ns]); + dfl_s2v(ws[Ns],wv[1]); + z.re=-1.0f; + z.im=0.0f; + mulc_vadd(nv,wv[0],wv[1],z); + dev0=(double)(vnorm_square(nv,1,wv[0])/ + vnorm_square(nv,1,wv[1])); + + dfl_vd2sd(wvd[0],wsd[0]); + dfl_sd2vd(wsd[0],wvd[1]); + diff_vd2v(nv,wvd[0],wvd[1],wv[0]); + assign_vd2v(nv,wvd[1],wv[1]); + dev1=(double)(vnorm_square(nv,1,wv[0])/ + 
vnorm_square(nv,1,wv[1])); + + dfl_sub_vd2sd(wvd[1],wsd[0]); + dev=norm_square_dble(VOLUME,1,wsd[0])/vnorm_square_dble(nv,1,wvd[1]); + if (dev>dev1) + dev1=dev; + + if (my_rank==0) + { + printf("Check of\n"); + printf("dfl_s2v,..: %.1e\n",sqrt(dev0)); + printf("dfl_sd2vd,..: %.1e\n\n",sqrt(dev1)); + } + + error_chk(); + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check2.in new file mode 100644 index 0000000000000000000000000000000000000000..20253b46f4d0a08e55b1fbbf01d1a7f70685d112 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check2.in @@ -0,0 +1,2 @@ +bs 8 4 4 4 +Ns 4 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..89185a0df62c1335f9b8bd394a26b22149946cd8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check3.c @@ -0,0 +1,317 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the solver for the little Dirac equation. 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "archive.h" +#include "uflds.h" +#include "sw_term.h" +#include "sflds.h" +#include "vflds.h" +#include "linalg.h" +#include "dirac.h" +#include "sap.h" +#include "little.h" +#include "dfl.h" +#include "global.h" + +int my_rank,id,first,last,step; +int bs[4],Ns,nkv,nmx,eoflg,bc; +double kappa,csw,mu,cF,cF_prime; +double phi[2],phi_prime[2],m0,res; +char cnfg_dir[NAME_SIZE],cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; + + +static void new_subspace(void) +{ + int nb,isw,ifail; + int n,nmax,k,l; + spinor **mds,**ws; + sap_parms_t sp; + + blk_list(SAP_BLOCKS,&nb,&isw); + + if (nb==0) + alloc_bgr(SAP_BLOCKS); + + assign_ud2ubgr(SAP_BLOCKS); + sw_term(NO_PTS); + ifail=assign_swd2swbgr(SAP_BLOCKS,ODD_PTS); + + error(ifail!=0,1,"new_subspace [check3.c]", + "Inversion of the SW term was not safe"); + + sp=sap_parms(); + nmax=6; + mds=reserve_ws(Ns); + ws=reserve_ws(1); + + for (k=0;k %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + fflush(flog); + } + + error_root(((last-first)%step)!=0,1,"main [check3.c]", + "last-first is not a multiple of step"); + check_dir_root(cnfg_dir); + + nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check3.c]", + "cnfg_dir name is too long"); + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); + chs_ubnd(-1); + + if (my_rank==0) + { + printf("Configuration no %d\n",icnfg); + fflush(flog); + } + + new_subspace(); + random_vd(nv,wvd[0],1.0); + nrm=sqrt(vnorm_square_dble(nv,1,wvd[0])); + assign_vd2vd(nv,wvd[0],wvd[2]); + set_Awhat(mu); + + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + rho=ltl_gcr(nkv,nmx,res,mu,wvd[0],wvd[1],&status); + + 
MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wdt=wt2-wt1; + + error_chk(); + z.re=-1.0; + z.im=0.0; + mulc_vadd_dble(nv,wvd[2],wvd[0],z); + del=vnorm_square_dble(nv,1,wvd[2]); + error_root(del!=0.0,1,"main [check3.c]", + "Source field is not preserved"); + + set_Aw(mu); + set_Awhat(mu); + Aw_dble(wvd[1],wvd[2]); + mulc_vadd_dble(nv,wvd[2],wvd[0],z); + Aweeinv_dble(wvd[2],wvd[3]); + assign_vd2vd(nv/2,wvd[3],wvd[2]); + del=sqrt(vnorm_square_dble(nv,1,wvd[2])); + + if (my_rank==0) + { + printf("status = %d\n",status); + printf("rho = %.2e, res = %.2e\n",rho,res); + printf("check = %.2e, check = %.2e\n",del,del/nrm); + printf("time = %.2e sec (total)\n",wdt); + if (status>0) + printf(" = %.2e usec (per point and GCR iteration)", + (1.0e6*wdt)/((double)(status)*(double)(VOLUME))); + printf("\n\n"); + fflush(flog); + } + + ltl_gcr(nkv,nmx,res,mu,wvd[0],wvd[0],&status); + mulc_vadd_dble(nv,wvd[0],wvd[1],z); + del=vnorm_square_dble(nv,1,wvd[0]); + error_root(del!=0.0,1,"main [check3.c]", + "Incorrect result when the input and output fields coincide"); + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check3.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check3.in new file mode 100644 index 0000000000000000000000000000000000000000..d4e586de5590793ec90ccc044b3b5c6b57d9dadd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check3.in @@ -0,0 +1,29 @@ + +[Configurations] +name 16x8x8x8b6.00id2 +cnfg_dir /home/data/openQCD/cnfg +first 7 +last 7 +step 1 + +[Lattice parameters] +kappa 0.1280 +csw 1.2 +mu 0.0123 +eoflg 1 + +[Boundary conditions] +type 0 +#phi 0.12 -0.56 +#phi' 0.92 0.76 +cF 0.95 +#cF' 0.90 + +[DFL] +bs 4 4 4 4 +Ns 4 + +[GCR] +nkv 16 +nmx 48 +res 1.0e-13 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check4.c 
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..4b3829a5a6e1f7bcd3248b9fb61125108dd66679 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check4.c @@ -0,0 +1,339 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check and performance of the deflated SAP+GCR solver. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "archive.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "dirac.h" +#include "dfl.h" +#include "global.h" + +int my_rank,id,first,last,step; +int bs_sap[4],nmr_sap,ncy_sap,nkv_gcr,nmx_gcr; +int bs_dfl[4],Ns,nkv_dfl,nmx_dfl,nkv_dpr,nmx_dpr,eoflg,bc; +int ninv_dgn,nmr_dgn,ncy_dgn; +double kappa,csw,mu,cF,cF_prime; +double phi[2],phi_prime[2],m0,res_gcr,res_dpr; +double kappa_dgn,mu_dgn; +char cnfg_dir[NAME_SIZE],cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; + + +int main(int argc,char *argv[]) +{ + int nsize,icnfg,ncnfg; + int status[2],avgstat[2]; + double rho,nrm,del,resm; + double wt1,wt2,wdt,wta; + spinor_dble **psd; + lat_parms_t lat; + tm_parms_t tm; + dfl_pro_parms_t dpr; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check4.log","w",stdout); + fin=freopen("check4.in","r",stdin); + + printf("\n"); + printf("Check and performance of the deflated SAP+GCR solver\n"); + printf("----------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, 
",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + find_section("Configurations"); + read_line("name","%s",nbase); + read_line("cnfg_dir","%s",cnfg_dir); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + + find_section("Lattice parameters"); + read_line("kappa","%lf",&kappa); + read_line("csw","%lf",&csw); + read_line("mu","%lf",&mu); + read_line("eoflg","%d",&eoflg); + + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + read_line("cF","%lf",&cF); + + if (bc==2) + read_line("cF'","%lf",&cF_prime); + else + cF_prime=cF; + + find_section("SAP"); + read_line("bs","%d %d %d %d",bs_sap,bs_sap+1,bs_sap+2,bs_sap+3); + read_line("nmr","%d",&nmr_sap); + read_line("ncy","%d",&ncy_sap); + + find_section("Deflation subspace"); + read_line("bs","%d %d %d %d",bs_dfl,bs_dfl+1,bs_dfl+2,bs_dfl+3); + read_line("Ns","%d",&Ns); + + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa_dgn); + read_line("mu","%lf",&mu_dgn); + read_line("ninv","%d",&ninv_dgn); + read_line("nmr","%d",&nmr_dgn); + read_line("ncy","%d",&ncy_dgn); + + find_section("Deflation projection"); + read_line("nkv","%d",&nkv_dpr); + read_line("nmx","%d",&nmx_dpr); + read_line("res","%lf",&res_dpr); + + find_section("GCR"); + read_line("nkv","%d",&nkv_gcr); + read_line("nmx","%d",&nmx_gcr); + read_line("res","%lf",&res_gcr); + + fclose(fin); + } + + MPI_Bcast(nbase,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&first,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&last,1,MPI_INT,0,MPI_COMM_WORLD); + 
MPI_Bcast(&step,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&eoflg,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + MPI_Bcast(bs_sap,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr_sap,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy_sap,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(bs_dfl,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&kappa_dgn,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu_dgn,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv_dgn,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr_dgn,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy_dgn,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&nkv_dpr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx_dpr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res_dpr,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + MPI_Bcast(&nkv_gcr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx_gcr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res_gcr,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + lat=set_lat_parms(5.5,1.0,1,&kappa,csw); + print_lat_parms(); + + set_bc_parms(bc,1.0,1.0,cF,cF_prime,phi,phi_prime); + print_bc_parms(); + + set_sap_parms(bs_sap,1,nmr_sap,ncy_sap); + m0=lat.m0[0]; + set_sw_parms(m0); + tm=set_tm_parms(eoflg); + set_dfl_parms(bs_dfl,Ns); + dpr=set_dfl_pro_parms(nkv_dpr,nmx_dpr,res_dpr); + set_dfl_gen_parms(kappa_dgn,mu_dgn,ninv_dgn,nmr_dgn,ncy_dgn); + + if (my_rank==0) + { + printf("mu = %.6f\n",mu); + printf("eoflg = %d\n\n",tm.eoflg); + } + + print_sap_parms(1); + print_dfl_parms(0); + + if (my_rank==0) + { + printf("GCR parameters:\n"); + printf("nkv = %d\n",nkv_gcr); + printf("nmx = 
%d\n",nmx_gcr); + printf("res = %.2e\n\n",res_gcr); + + printf("Configurations %sn%d -> %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + fflush(flog); + } + + start_ranlux(0,1234); + geometry(); + + if (Ns<=(2*nkv_gcr)) + alloc_ws(2*nkv_gcr+2); + else + alloc_ws(Ns+2); + alloc_wsd(6); + alloc_wv(2*dpr.nkv+2); + alloc_wvd(4); + psd=reserve_wsd(3); + + error_root(((last-first)%step)!=0,1,"main [check4.c]", + "last-first is not a multiple of step"); + + nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check4.c]", + "cnfg_dir name is too long"); + + ncnfg=0; + avgstat[0]=0; + avgstat[1]=0; + resm=0.0; + wta=0.0; + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); + chs_ubnd(-1); + + if (my_rank==0) + { + printf("Configuration no %d\n",icnfg); + fflush(flog); + } + + dfl_modes(status); + error_root(status[0]<0,1,"main [check4.c]", + "Subspace generation failed"); + random_sd(VOLUME,psd[0],1.0); + bnd_sd2zero(ALL_PTS,psd[0]); + nrm=sqrt(norm_square_dble(VOLUME,1,psd[0])); + assign_sd2sd(VOLUME,psd[0],psd[2]); + + rho=dfl_sap_gcr(nkv_gcr,nmx_gcr,res_gcr,mu,psd[0],psd[1],status); + + error_chk(); + mulr_spinor_add_dble(VOLUME,psd[2],psd[0],-1.0); + del=norm_square_dble(VOLUME,1,psd[2]); + error_root(del!=0.0,1,"main [check4.c]", + "Source field is not preserved"); + + Dw_dble(mu,psd[1],psd[2]); + mulr_spinor_add_dble(VOLUME,psd[2],psd[0],-1.0); + del=sqrt(norm_square_dble(VOLUME,1,psd[2])); + + if (my_rank==0) + { + printf("status = %d,%d\n",status[0],status[1]); + printf("rho = %.2e, res = %.2e\n",rho,res_gcr); + printf("check = %.2e, check = %.2e\n",del,del/nrm); + fflush(flog); + } + + if ((status[0]>=0)&&(status[1]>=0)) + { + ncnfg+=1; + avgstat[0]+=status[0]; + avgstat[1]+=status[1]; + del/=nrm; + + if (del>resm) + resm=del; + + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + 
rho=dfl_sap_gcr(nkv_gcr,nmx_gcr,res_gcr,mu,psd[0],psd[0],status); + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wdt=wt2-wt1; + wta+=wdt; + + if (my_rank==0) + { + printf("time = %.2e sec (w/o preparatory steps)\n",wdt); + if (status[0]>0) + printf(" = %.2e usec (per point and GCR iteration)", + (1.0e6*wdt)/((double)(status[0])*(double)(VOLUME))); + printf("\n\n"); + fflush(flog); + } + + mulr_spinor_add_dble(VOLUME,psd[0],psd[1],-1.0); + del=norm_square_dble(VOLUME,1,psd[0]); + error_root(del!=0.0,1,"main [check4.c]","Incorrect result when " + "the input and output fields coincide"); + } + } + + if (my_rank==0) + { + printf("Summary of results\n"); + printf("------------------\n\n"); + + printf("Processed %d configurations\n",ncnfg); + printf("Solver failed in %d cases\n",(last-first)/step+1-ncnfg); + printf("Maximal relative residue = %.1e\n",resm); + + status[0]=(avgstat[0]+ncnfg/2)/ncnfg; + status[1]=(avgstat[1]+ncnfg/2)/ncnfg; + wta/=(double)(ncnfg); + + printf("Average status = %d,%d\n",status[0],status[1]); + printf("Average time = %.2e sec (w/o preparatory steps)\n",wta); + if (status[0]>0) + printf(" = %.2e usec (per point and GCR iteration)", + (1.0e6*wta)/((double)(status[0])*(double)(VOLUME))); + printf("\n\n"); + + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check4.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check4.in new file mode 100644 index 0000000000000000000000000000000000000000..25bee13da74ae390e47019e1b064520f4f04953b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dfl/check4.in @@ -0,0 +1,46 @@ + +[Configurations] +name 16x8x8x8b6.00id2 +cnfg_dir /home/data/openQCD/cnfg +first 7 +last 7 +step 1 + +[Lattice parameters] +kappa 0.1280 +csw 1.2 +mu 0.0123 +eoflg 1 + +[Boundary conditions] +type 0 +#phi 0.12 -0.56 +#phi' 0.92 0.76 +cF 0.95 +#cF' 0.90 + +[SAP] +bs 4 4 4 4 +nmr 4 
+ncy 5 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 + +[GCR] +nkv 16 +nmx 48 +res 1.0e-10 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..7dad660954b1127c9cafc91bd6971a0e8e952b6f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/INDEX @@ -0,0 +1,34 @@ + +Programs for the O(a)-improved Wilson-Dirac operator. + +check1 Gauge covariance of Dw(). + +check2 Action of Dw() on plane waves. + +check3 Hermiticity of Dw() and comparison with + Dwee(),..,Dwhat(). + +check4 Gauge covariance of Dw_dble(). + +check5 Action of Dw_dble() on plane waves. + +check6 Hermiticity of Dw_dble() and comparison with + Dwee_dble(),..,Dwhat_dble(). + +check7 Comparison of Dw_blk() with Dw(). + +check8 Comparison of Dw_blk_dble() with Dw_dble(). + +check9 Comparison of Dw_bnd() with Dw(). + +time1 Timing of Dw() and Dwhat(). + +time2 Timing of Dw_dble() and Dwhat_dble(). + +time3 Timing of Dw_blk() and Dwhat_blk(). + +time4 Timing of Dw_blk_dble() and Dwhat_blk_dble(). + +The programs check1,..,time4 accept the option -bc <type> that allows the +type of boundary condition to be chosen at runtime. When the option is not +set, open boundary conditions are assumed. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..480067a1ff7a9c0f6af992156130bfcb7e13c1fe --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/Makefile @@ -0,0 +1,143 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 check6 check7 check8 check9 \ + time1 time2 time3 time4 + +FLAGS = flags lat_parms sap_parms dfl_parms + +LATTICE = bcnds ftidx uidx geometry + +LINALG = salg salg_dble liealg cmatrix_dble + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum shift uflds udcom + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +TCHARGE = ftcom ftensor + +SW_TERM = pauli pauli_dble swflds sw_term + +DIRAC = Dw_dble Dw Dw_bnd + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +SAP = sap_com + +MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) \ + $(DIRAC) $(BLOCK) $(SAP) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = 
.:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/random:\ + $(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:$(MDIR)/sflds:\ + $(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:$(MDIR)/block:\ + $(MDIR)/sap + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..cc8e00111892a04fc1955d984182c2b9ee325c7c --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check1.c @@ -0,0 +1,400 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Gauge covariance of Dw(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +static int bc,nfc[8],ofs[8]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) +{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + 
(*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_wsd(5); + alloc_ws(5); + ps=reserve_ws(5); + psd=reserve_wsd(5); + + g=amalloc(NSPIN*sizeof(*g),4); + if (BNDRY!=0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),4); + + error((g==NULL)||((BNDRY!=0)&&(gbuf==NULL)),1,"main [check1.c]", + "Unable to allocate auxiliary arrays"); + + swp=set_sw_parms(-0.0123); + mu=0.0376; + + if (my_rank==0) + printf("m0 = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + random_g(); + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + + assign_ud2u(); + assign_swd2sw(); + + for (i=0;i<5;i++) + { + random_sd(NSPIN,psd[i],1.0); + assign_sd2s(NSPIN,psd[i],ps[i]); + } + + assign_s2s(VOLUME,ps[0],ps[4]); + bnd_s2zero(ALL_PTS,ps[4]); + Dw(mu,ps[0],ps[1]); + mulr_spinor_add(VOLUME,ps[4],ps[0],-1.0f); + d=norm_square(VOLUME,1,ps[4]); + error(d!=0.0f,1,"main [check1.c]","Dw() changes the input field"); + + Dw(mu,ps[0],ps[4]); + mulr_spinor_add(VOLUME,ps[4],ps[1],-1.0f); + d=norm_square(VOLUME,1,ps[4]); + error(d!=0.0f,1,"main [check1.c]","Action of Dw() depends " + "on the boundary values of the input field"); + + assign_s2s(VOLUME,ps[1],ps[4]); + bnd_s2zero(ALL_PTS,ps[4]); + mulr_spinor_add(VOLUME,ps[4],ps[1],-1.0f); + d=norm_square(VOLUME,1,ps[4]); + error(d!=0.0f,1,"main [check1.c]", + "Dw() does not preserve the zero boundary values"); + + transform_ud(); + transform_sd(psd[0],psd[2]); + sw_term(NO_PTS); + + assign_ud2u(); + assign_swd2sw(); + assign_sd2s(VOLUME,psd[2],ps[2]); + + 
Dw(mu,ps[2],ps[3]); + assign_s2sd(VOLUME,ps[1],psd[1]); + transform_sd(psd[1],psd[2]); + assign_sd2s(VOLUME,psd[2],ps[2]); + + mulr_spinor_add(VOLUME,ps[3],ps[2],-1.0f); + d=norm_square(VOLUME,1,ps[3])/norm_square(VOLUME,1,ps[0]); + error_chk(); + + if (my_rank==0) + { + printf("Normalized difference = %.2e\n",sqrt((double)(d))); + printf("(should be less than 1*10^(-6) or so)\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..8e13e38e1c691e2a421f8aea886df38a746a231b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check2.c @@ -0,0 +1,345 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Action of Dw() on plane waves. 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + +static spinor rs ALIGNED16; +static const spinor sd0={{{0.0}}}; + + +static su3_vector mul_cplx(complex z,su3_vector s) +{ + su3_vector r; + + r.c1.re=z.re*s.c1.re-z.im*s.c1.im; + r.c1.im=z.im*s.c1.re+z.re*s.c1.im; + r.c2.re=z.re*s.c2.re-z.im*s.c2.im; + r.c2.im=z.im*s.c2.re+z.re*s.c2.im; + r.c3.re=z.re*s.c3.re-z.im*s.c3.im; + r.c3.im=z.im*s.c3.re+z.re*s.c3.im; + + return r; +} + + +static spinor mul_gamma(int mu,spinor s) +{ + spinor r; + complex i,m_i,m_1; + + i.re=0.0f; + i.im=1.0f; + + m_i.re=0.0f; + m_i.im=-1.0f; + + m_1.re=-1.0f; + m_1.im=0.0f; + + if (mu==0) + { + r.c1=mul_cplx(m_1,s.c3); + r.c2=mul_cplx(m_1,s.c4); + r.c3=mul_cplx(m_1,s.c1); + r.c4=mul_cplx(m_1,s.c2); + } + else if (mu==1) + { + r.c1=mul_cplx(m_i,s.c4); + r.c2=mul_cplx(m_i,s.c3); + r.c3=mul_cplx(i,s.c2); + r.c4=mul_cplx(i,s.c1); + } + else if (mu==2) + { + r.c1=mul_cplx(m_1,s.c4); + r.c2=s.c3; + r.c3=s.c2; + r.c4=mul_cplx(m_1,s.c1); + } + else if (mu==3) + { + r.c1=mul_cplx(m_i,s.c3); + r.c2=mul_cplx(i,s.c4); + r.c3=mul_cplx(i,s.c1); + r.c4=mul_cplx(m_i,s.c2); + } + else + { + r.c1=s.c1; + r.c2=s.c2; + r.c3=mul_cplx(m_1,s.c3); + r.c4=mul_cplx(m_1,s.c4); + } + + return r; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc; + int n,i,ix,nu,x0,x1,x2,x3; + int np[4],bo[4]; + float ran[4]; + float mu,pi,d,dmax; + float mp,pt,pv,p[4],sp[4]; + double phi[2],phi_prime[2]; + complex z; + spinor **ps,s0,s1,s2,s3,s4; + sw_parms_t swp; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + 
printf("\n"); + printf("Action of Dw() on plane waves\n"); + printf("-----------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + printf("For this test to pass, the calculated differences delta\n"); + printf("should be at most 1*10^(-5) or so\n\n"); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_ws(3); + ps=reserve_ws(3); + + swp=set_sw_parms(-0.0123); + mu=0.0876f; + + if (my_rank==0) + printf("m0 = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + (void)udfld(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2u(); + assign_swd2sw(); + pi=(float)(4.0*atan(1.0)); + n=10; + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + dmax=0.0f; + + for (i=0;idmax) + dmax=d; + + if (my_rank==0) + printf("Normalized deviation = %.1e at p=(%d,%d,%d,%d)\n", + d,np[0],np[1],np[2],np[3]); + } + + error_chk(); + + if (my_rank==0) + { + printf("\n"); + printf("Maximal normalized deviation = %.1e\n\n",dmax); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..06e819aad7b435cc113b89b764fb0426ef1d019a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check3.c @@ 
-0,0 +1,301 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2008, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Hermiticity of Dw() and comparison with Dwee(),..,Dwhat(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,i; + float mu,d; + double phi[2],phi_prime[2]; + complex z1,z2; + spinor **ps; + sw_parms_t swp; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check3.log","w",stdout); + printf("\n"); + printf("Hermiticity of Dw() and comparison with Dwee(),..,Dwhat()\n"); + printf("---------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + printf("For this test to pass, the calculated differences\n"); + printf("should be at most 1*10^(-5) or so\n\n"); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check1.c]", + "Syntax: check1 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_ws(5); + 
ps=reserve_ws(5); + + swp=set_sw_parms(-0.0123); + mu=0.0376; + + if (my_rank==0) + printf("m0 = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2u(); + assign_swd2sw(); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + Dw(mu,ps[0],ps[2]); + mulg5(VOLUME,ps[2]); + Dw(-mu,ps[1],ps[3]); + mulg5(VOLUME,ps[3]); + + z1=spinor_prod(VOLUME,1,ps[0],ps[3]); + z2=spinor_prod(VOLUME,1,ps[2],ps[1]); + + d=(float)(sqrt((double)((z1.re-z2.re)*(z1.re-z2.re)+ + (z1.im-z2.im)*(z1.im-z2.im)))); + d/=(float)(sqrt((double)(12*NPROC)*(double)(VOLUME))); + error_chk(); + + if (my_rank==0) + printf("Deviation from gamma5-Hermiticity = %.1e\n",d); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(VOLUME,ps[0],ps[1]); + assign_s2s(VOLUME,ps[2],ps[3]); + Dwee(mu,ps[1],ps[2]); + + bnd_s2zero(EVEN_PTS,ps[0]); + mulr_spinor_add(VOLUME,ps[1],ps[0],-1.0f); + d=norm_square(VOLUME,1,ps[1]); + + error(d!=0.0f,1,"main [check3.c]", + "Dwee() changes the input field in unexpected ways"); + + mulr_spinor_add(VOLUME/2,ps[2]+(VOLUME/2),ps[3]+(VOLUME/2),-1.0f); + assign_s2s(VOLUME/2,ps[2],ps[4]); + bnd_s2zero(EVEN_PTS,ps[4]); + mulr_spinor_add(VOLUME/2,ps[2],ps[4],-1.0f); + d=norm_square(VOLUME,1,ps[2]); + + error(d!=0.0f,1,"main [check3.c]", + "Dwee() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(VOLUME,ps[0],ps[1]); + assign_s2s(VOLUME,ps[2],ps[3]); + Dwoo(mu,ps[1],ps[2]); + + bnd_s2zero(ODD_PTS,ps[0]); + mulr_spinor_add(VOLUME,ps[1],ps[0],-1.0f); + d=norm_square(VOLUME,1,ps[1]); + + error(d!=0.0f,1,"main [check3.c]", + "Dwoo() changes the input field in unexpected ways"); + + mulr_spinor_add(VOLUME/2,ps[2],ps[3],-1.0f); + assign_s2s(VOLUME/2,ps[2]+(VOLUME/2),ps[4]+(VOLUME/2)); + bnd_s2zero(ODD_PTS,ps[4]); + mulr_spinor_add(VOLUME/2,ps[2]+(VOLUME/2),ps[4]+(VOLUME/2),-1.0f); + d=norm_square(VOLUME,1,ps[2]); + + 
error(d!=0.0f,1,"main [check3.c]", + "Dwoo() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(VOLUME,ps[0],ps[1]); + assign_s2s(VOLUME,ps[2],ps[3]); + Dwoe(ps[1],ps[2]); + + bnd_s2zero(EVEN_PTS,ps[0]); + mulr_spinor_add(VOLUME,ps[1],ps[0],-1.0f); + d=norm_square(VOLUME,1,ps[1]); + + error(d!=0.0f,1,"main [check3.c]", + "Dwoe() changes the input field in unexpected ways"); + + mulr_spinor_add(VOLUME/2,ps[2],ps[3],-1.0f); + assign_s2s(VOLUME/2,ps[2]+(VOLUME/2),ps[4]+(VOLUME/2)); + bnd_s2zero(ODD_PTS,ps[4]); + mulr_spinor_add(VOLUME/2,ps[2]+(VOLUME/2),ps[4]+(VOLUME/2),-1.0f); + d=norm_square(VOLUME,1,ps[2]); + + error(d!=0.0f,1,"main [check3.c]", + "Dwoe() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(VOLUME,ps[0],ps[1]); + assign_s2s(VOLUME,ps[2],ps[3]); + Dweo(ps[1],ps[2]); + + bnd_s2zero(ODD_PTS,ps[0]); + mulr_spinor_add(VOLUME,ps[1],ps[0],-1.0f); + d=norm_square(VOLUME,1,ps[1]); + + error(d!=0.0f,1,"main [check3.c]", + "Dweo() changes the input field in unexpected ways"); + + mulr_spinor_add(VOLUME/2,ps[2]+(VOLUME/2),ps[3]+(VOLUME/2),-1.0f); + assign_s2s(VOLUME/2,ps[2],ps[4]); + bnd_s2zero(EVEN_PTS,ps[4]); + mulr_spinor_add(VOLUME/2,ps[2],ps[4],-1.0f); + d=norm_square(VOLUME,1,ps[2]); + + error(d!=0.0f,1,"main [check3.c]", + "Dweo() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(VOLUME,ps[0],ps[1]); + assign_s2s(VOLUME,ps[2],ps[3]); + Dwhat(mu,ps[1],ps[2]); + + bnd_s2zero(EVEN_PTS,ps[0]); + mulr_spinor_add(VOLUME,ps[1],ps[0],-1.0f); + d=norm_square(VOLUME,1,ps[1]); + + error(d!=0.0f,1,"main [check3.c]", + "Dwhat() changes the input field in unexpected ways"); + + mulr_spinor_add(VOLUME/2,ps[2]+(VOLUME/2),ps[3]+(VOLUME/2),-1.0f); + assign_s2s(VOLUME/2,ps[2],ps[4]); + bnd_s2zero(EVEN_PTS,ps[4]); + mulr_spinor_add(VOLUME/2,ps[2],ps[4],-1.0f); + 
d=norm_square(VOLUME,1,ps[2]); + + error(d!=0.0f,1,"main [check3.c]", + "Dwhat() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(VOLUME,ps[0],ps[2]); + Dw(mu,ps[0],ps[1]); + Dwee(mu,ps[2],ps[3]); + set_s2zero(VOLUME/2,ps[0]); + mulr_spinor_add(VOLUME/2,ps[0],ps[3],-1.0f); + Dweo(ps[2],ps[0]); + set_s2zero(VOLUME/2,ps[3]); + mulr_spinor_add(VOLUME/2,ps[3],ps[0],-1.0f); + + Dwoo(mu,ps[2],ps[3]); + Dwoe(ps[2],ps[4]); + mulr_spinor_add(VOLUME/2,ps[3]+(VOLUME/2),ps[4]+(VOLUME/2),1.0f); + mulr_spinor_add(VOLUME,ps[3],ps[1],-1.0f); + d=norm_square(VOLUME,1,ps[3])/norm_square(VOLUME,1,ps[1]); + d=(float)(sqrt((double)(d))); + + if (my_rank==0) + printf("Deviation of Dw() from Dwee(),.. = %.1e\n",d); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(NSPIN,ps[0],ps[1]); + Dwhat(mu,ps[0],ps[2]); + + Dwoe(ps[1],ps[1]); + Dwee(mu,ps[1],ps[1]); + Dwoo(0.0,ps[1],ps[1]); + Dweo(ps[1],ps[1]); + + mulr_spinor_add(VOLUME/2,ps[1],ps[2],-1.0f); + d=norm_square(VOLUME/2,1,ps[1])/norm_square(VOLUME/2,1,ps[2]); + d=(float)(sqrt((double)(d))); + + if (my_rank==0) + printf("Deviation of Dwhat() from Dwee(),.. 
= %.1e\n",d); + + for (i=0;i<4;i++) + random_s(NSPIN,ps[i],1.0f); + + assign_s2s(VOLUME,ps[0],ps[2]); + + set_tm_parms(1); + Dw(mu,ps[0],ps[1]); + set_tm_parms(0); + + Dwee(mu,ps[2],ps[3]); + mulr_spinor_add(VOLUME/2,ps[1],ps[3],-1.0f); + Dweo(ps[2],ps[1]); + Dwoe(ps[2],ps[3]); + mulr_spinor_add(VOLUME/2,ps[1]+(VOLUME/2),ps[3]+(VOLUME/2),-1.0f); + Dwoo(0.0f,ps[2],ps[3]); + mulr_spinor_add(VOLUME/2,ps[1]+(VOLUME/2),ps[3]+(VOLUME/2),-1.0f); + d=norm_square(VOLUME,1,ps[1])/norm_square(VOLUME,1,ps[2]); + d=(float)(sqrt((double)(d))); + + error_chk(); + + if (my_rank==0) + { + printf("Check of Dw()|eoflg=1 = %.1e\n\n",d); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..d6901bad026190ddfe041b6d7846af9cc683b2e2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check4.c @@ -0,0 +1,384 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2005, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Gauge covariance of Dw_dble(). 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +static int bc,nfc[8],ofs[8]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) +{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + (*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + 
set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_wsd(5); + psd=reserve_wsd(5); + + g=amalloc(NSPIN*sizeof(*g),4); + if (BNDRY!=0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),4); + + error((g==NULL)||((BNDRY!=0)&&(gbuf==NULL)),1,"main [check4.c]", + "Unable to allocate auxiliary arrays"); + + swp=set_sw_parms(-0.0123); + mu=0.0376; + + if (my_rank==0) + printf("m0 = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + random_g(); + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[4]); + bnd_sd2zero(ALL_PTS,psd[4]); + Dw_dble(mu,psd[0],psd[1]); + mulr_spinor_add_dble(VOLUME,psd[4],psd[0],-1.0); + d=norm_square_dble(VOLUME,1,psd[4]); + error(d!=0.0,1,"main [check4.c]","Dw_dble() changes the input field"); + + Dw_dble(mu,psd[0],psd[4]); + mulr_spinor_add_dble(VOLUME,psd[4],psd[1],-1.0); + d=norm_square_dble(VOLUME,1,psd[4]); + error(d!=0.0,1,"main [check4.c]","Action of Dw_dble() depends " + "on the boundary values of the input field"); + + assign_sd2sd(VOLUME,psd[1],psd[4]); + bnd_sd2zero(ALL_PTS,psd[4]); + mulr_spinor_add_dble(VOLUME,psd[4],psd[1],-1.0); + d=norm_square_dble(VOLUME,1,psd[4]); + error(d!=0.0,1,"main [check4.c]", + "Dw_dble() does not vanish at global time 0 and NPROC0*L0-1 "); + + transform_sd(psd[0],psd[2]); + transform_ud(); + sw_term(NO_PTS); + Dw_dble(mu,psd[2],psd[3]); + transform_sd(psd[1],psd[2]); + + mulr_spinor_add_dble(VOLUME,psd[3],psd[2],-1.0); + d=norm_square_dble(VOLUME,1,psd[3])/norm_square_dble(VOLUME,1,psd[0]); + error_chk(); + + if (my_rank==0) + { + printf("Normalized difference = %.2e\n",sqrt(d)); + printf("(should be around 1*10^(-15) or so)\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check5.c 
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check5.c new file mode 100644 index 0000000000000000000000000000000000000000..63e86da45c91e3565834cbf97d54967ad6093110 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check5.c @@ -0,0 +1,343 @@ + +/******************************************************************************* +* +* File check5.c +* +* Copyright (C) 2005, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Action of Dw_dble() on plane waves. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + +static spinor_dble rs ALIGNED16; +static const spinor_dble sd0={{{0.0}}}; + + +static su3_vector_dble mul_cplx(complex_dble z,su3_vector_dble s) +{ + su3_vector_dble r; + + r.c1.re=z.re*s.c1.re-z.im*s.c1.im; + r.c1.im=z.im*s.c1.re+z.re*s.c1.im; + r.c2.re=z.re*s.c2.re-z.im*s.c2.im; + r.c2.im=z.im*s.c2.re+z.re*s.c2.im; + r.c3.re=z.re*s.c3.re-z.im*s.c3.im; + r.c3.im=z.im*s.c3.re+z.re*s.c3.im; + + return r; +} + + +static spinor_dble mul_gamma(int mu,spinor_dble s) +{ + spinor_dble r; + complex_dble i,m_i,m_1; + + i.re=0.0; + i.im=1.0; + + m_i.re=0.0; + m_i.im=-1.0; + + m_1.re=-1.0; + m_1.im=0.0; + + if (mu==0) + { + r.c1=mul_cplx(m_1,s.c3); + r.c2=mul_cplx(m_1,s.c4); + r.c3=mul_cplx(m_1,s.c1); + r.c4=mul_cplx(m_1,s.c2); + } + else if (mu==1) + { + r.c1=mul_cplx(m_i,s.c4); + r.c2=mul_cplx(m_i,s.c3); + r.c3=mul_cplx(i,s.c2); + r.c4=mul_cplx(i,s.c1); + } + else if (mu==2) + { + r.c1=mul_cplx(m_1,s.c4); + r.c2=s.c3; + r.c3=s.c2; + r.c4=mul_cplx(m_1,s.c1); + } + else if (mu==3) 
+ { + r.c1=mul_cplx(m_i,s.c3); + r.c2=mul_cplx(i,s.c4); + r.c3=mul_cplx(i,s.c1); + r.c4=mul_cplx(m_i,s.c2); + } + else + { + r.c1=s.c1; + r.c2=s.c2; + r.c3=mul_cplx(m_1,s.c3); + r.c4=mul_cplx(m_1,s.c4); + } + + return r; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc; + int n,i,ix,nu,x0,x1,x2,x3; + int np[4],bo[4]; + float ran[4]; + double phi[2],phi_prime[2]; + double mu,pi,d,dmax; + double mp,pt,pv,p[4],sp[4]; + complex_dble z; + spinor_dble **psd,s0,s1,s2,s3,s4; + sw_parms_t swp; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check5.log","w",stdout); + printf("\n"); + printf("Action of Dw_dble() on plane waves\n"); + printf("----------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + printf("For this test to pass, the calculated differences delta\n"); + printf("should be at most 1*10^(-14) or so\n\n"); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check5.c]", + "Syntax: check5 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_wsd(3); + psd=reserve_wsd(3); + + swp=set_sw_parms(-0.0123); + mu=0.0876; + + if (my_rank==0) + printf("m0 = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + (void)udfld(); + chs_ubnd(-1); + sw_term(NO_PTS); + pi=4.0*atan(1.0); + n=10; + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + dmax=0.0; + + for (i=0;idmax) + dmax=d; + + if (my_rank==0) + printf("Normalized deviation = %.1e 
at p=(%d,%d,%d,%d)\n", + d,np[0],np[1],np[2],np[3]); + } + + error_chk(); + + if (my_rank==0) + { + printf("\n"); + printf("Maximal normalized deviation = %.1e\n\n",dmax); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check6.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check6.c new file mode 100644 index 0000000000000000000000000000000000000000..86b1fdb1bd8285bf1a581c655d630b78f52a8b17 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check6.c @@ -0,0 +1,302 @@ + +/******************************************************************************* +* +* File check6.c +* +* Copyright (C) 2005, 2008, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Hermiticity of Dw_dble() and comparison with Dwee_dble(),..,Dwhat_dble(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,i; + double phi[2],phi_prime[2]; + double mu,d; + complex_dble z1,z2; + spinor_dble **psd; + sw_parms_t swp; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check6.log","w",stdout); + printf("\n"); + printf("Hermiticity of Dw_dble() and comparison with Dwee_dble(),..," + "Dwhat_dble()\n"); + printf("------------------------------------------------------------" + "------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d 
process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + printf("For this test to pass, the calculated differences\n"); + printf("should be at most 1*10^(-15) or so\n\n"); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check6.c]", + "Syntax: check6 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_wsd(5); + psd=reserve_wsd(5); + + swp=set_sw_parms(-0.0123); + mu=0.0376; + + if (my_rank==0) + printf("m0 = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + Dw_dble(mu,psd[0],psd[2]); + mulg5_dble(VOLUME,psd[2]); + Dw_dble(-mu,psd[1],psd[3]); + mulg5_dble(VOLUME,psd[3]); + + z1=spinor_prod_dble(VOLUME,1,psd[0],psd[3]); + z2=spinor_prod_dble(VOLUME,1,psd[2],psd[1]); + + d=sqrt((z1.re-z2.re)*(z1.re-z2.re)+ + (z1.im-z2.im)*(z1.im-z2.im)); + d/=sqrt((double)(12*NPROC)*(double)(VOLUME)); + error_chk(); + + if (my_rank==0) + printf("Deviation from gamma5-Hermiticity = %.1e\n",d); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[1]); + assign_sd2sd(VOLUME,psd[2],psd[3]); + Dwee_dble(mu,psd[1],psd[2]); + + bnd_sd2zero(EVEN_PTS,psd[0]); + mulr_spinor_add_dble(VOLUME,psd[1],psd[0],-1.0); + d=norm_square_dble(VOLUME,1,psd[1]); + + error(d!=0.0,1,"main [check6.c]", + "Dwee_dble() changes the input field in unexpected ways"); + + mulr_spinor_add_dble(VOLUME/2,psd[2]+(VOLUME/2),psd[3]+(VOLUME/2),-1.0); + assign_sd2sd(VOLUME/2,psd[2],psd[4]); + bnd_sd2zero(EVEN_PTS,psd[4]); + mulr_spinor_add_dble(VOLUME/2,psd[2],psd[4],-1.0); + 
d=norm_square_dble(VOLUME,1,psd[2]); + + error(d!=0.0,1,"main [check6.c]", + "Dwee_dble() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[1]); + assign_sd2sd(VOLUME,psd[2],psd[3]); + Dwoo_dble(mu,psd[1],psd[2]); + + bnd_sd2zero(ODD_PTS,psd[0]); + mulr_spinor_add_dble(VOLUME,psd[1],psd[0],-1.0); + d=norm_square_dble(VOLUME,1,psd[1]); + + error(d!=0.0,1,"main [check6.c]", + "Dwoo_dble() changes the input field in unexpected ways"); + + mulr_spinor_add_dble(VOLUME/2,psd[2],psd[3],-1.0); + assign_sd2sd(VOLUME/2,psd[2]+(VOLUME/2),psd[4]+(VOLUME/2)); + bnd_sd2zero(ODD_PTS,psd[4]); + mulr_spinor_add_dble(VOLUME/2,psd[2]+(VOLUME/2),psd[4]+(VOLUME/2),-1.0); + d=norm_square_dble(VOLUME,1,psd[2]); + + error(d!=0.0,1,"main [check6.c]", + "Dwoo_dble() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[1]); + assign_sd2sd(VOLUME,psd[2],psd[3]); + Dwoe_dble(psd[1],psd[2]); + + bnd_sd2zero(EVEN_PTS,psd[0]); + mulr_spinor_add_dble(VOLUME,psd[1],psd[0],-1.0); + d=norm_square_dble(VOLUME,1,psd[1]); + + error(d!=0.0,1,"main [check6.c]", + "Dwoe_dble() changes the input field in unexpected ways"); + + mulr_spinor_add_dble(VOLUME/2,psd[2],psd[3],-1.0); + assign_sd2sd(VOLUME/2,psd[2]+(VOLUME/2),psd[4]+(VOLUME/2)); + bnd_sd2zero(ODD_PTS,psd[4]); + mulr_spinor_add_dble(VOLUME/2,psd[2]+(VOLUME/2),psd[4]+(VOLUME/2),-1.0); + d=norm_square_dble(VOLUME,1,psd[2]); + + error(d!=0.0,1,"main [check6.c]", + "Dwoe_dble() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[1]); + assign_sd2sd(VOLUME,psd[2],psd[3]); + Dweo_dble(psd[1],psd[2]); + + bnd_sd2zero(ODD_PTS,psd[0]); + mulr_spinor_add_dble(VOLUME,psd[1],psd[0],-1.0); + d=norm_square_dble(VOLUME,1,psd[1]); + + error(d!=0.0,1,"main [check6.c]", + "Dweo_dble() changes the input field in 
unexpected ways"); + + mulr_spinor_add_dble(VOLUME/2,psd[2]+(VOLUME/2),psd[3]+(VOLUME/2),-1.0); + assign_sd2sd(VOLUME/2,psd[2],psd[4]); + bnd_sd2zero(EVEN_PTS,psd[4]); + mulr_spinor_add_dble(VOLUME/2,psd[2],psd[4],-1.0); + d=norm_square_dble(VOLUME,1,psd[2]); + + error(d!=0.0,1,"main [check6.c]", + "Dweo_dble() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[1]); + assign_sd2sd(VOLUME,psd[2],psd[3]); + Dwhat_dble(mu,psd[1],psd[2]); + + bnd_sd2zero(EVEN_PTS,psd[0]); + mulr_spinor_add_dble(VOLUME,psd[1],psd[0],-1.0); + d=norm_square_dble(VOLUME,1,psd[1]); + + error(d!=0.0,1,"main [check6.c]", + "Dwhat_dble() changes the input field in unexpected ways"); + + mulr_spinor_add_dble(VOLUME/2,psd[2]+(VOLUME/2),psd[3]+(VOLUME/2),-1.0); + assign_sd2sd(VOLUME/2,psd[2],psd[4]); + bnd_sd2zero(EVEN_PTS,psd[4]); + mulr_spinor_add_dble(VOLUME/2,psd[2],psd[4],-1.0); + d=norm_square_dble(VOLUME,1,psd[2]); + + error(d!=0.0,1,"main [check6.c]", + "Dwhat_dble() changes the output field where it should not"); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[2]); + Dw_dble(mu,psd[0],psd[1]); + Dwee_dble(mu,psd[2],psd[3]); + set_sd2zero(VOLUME/2,psd[0]); + mulr_spinor_add_dble(VOLUME/2,psd[0],psd[3],-1.0); + Dweo_dble(psd[2],psd[0]); + set_sd2zero(VOLUME/2,psd[3]); + mulr_spinor_add_dble(VOLUME/2,psd[3],psd[0],-1.0); + + Dwoo_dble(mu,psd[2],psd[3]); + Dwoe_dble(psd[2],psd[4]); + mulr_spinor_add_dble(VOLUME/2,psd[3]+(VOLUME/2),psd[4]+(VOLUME/2),1.0); + + mulr_spinor_add_dble(VOLUME,psd[3],psd[1],-1.0); + d=norm_square_dble(VOLUME,1,psd[3])/norm_square_dble(VOLUME,1,psd[1]); + d=sqrt(d); + + if (my_rank==0) + printf("Deviation of Dw_dble() from Dwee_dble(),.. 
= %.1e\n",d); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(NSPIN,psd[0],psd[1]); + Dwhat_dble(mu,psd[0],psd[2]); + + Dwoe_dble(psd[1],psd[1]); + Dwee_dble(mu,psd[1],psd[1]); + Dwoo_dble(0.0,psd[1],psd[1]); + Dweo_dble(psd[1],psd[1]); + + mulr_spinor_add_dble(VOLUME/2,psd[1],psd[2],-1.0); + d=norm_square_dble(VOLUME/2,1,psd[1])/norm_square_dble(VOLUME/2,1,psd[2]); + d=sqrt(d); + + if (my_rank==0) + printf("Deviation of Dwhat_dble() from Dwee_dble(),.. = %.1e\n",d); + + for (i=0;i<4;i++) + random_sd(NSPIN,psd[i],1.0); + + assign_sd2sd(VOLUME,psd[0],psd[2]); + + set_tm_parms(1); + Dw_dble(mu,psd[0],psd[1]); + set_tm_parms(0); + + Dwee_dble(mu,psd[2],psd[3]); + mulr_spinor_add_dble(VOLUME/2,psd[1],psd[3],-1.0); + Dweo_dble(psd[2],psd[1]); + Dwoe_dble(psd[2],psd[3]); + mulr_spinor_add_dble(VOLUME/2,psd[1]+(VOLUME/2),psd[3]+(VOLUME/2),-1.0); + Dwoo_dble(0.0,psd[2],psd[3]); + mulr_spinor_add_dble(VOLUME/2,psd[1]+(VOLUME/2),psd[3]+(VOLUME/2),-1.0); + d=norm_square_dble(VOLUME,1,psd[1])/norm_square_dble(VOLUME,1,psd[2]); + d=sqrt(d); + + error_chk(); + + if (my_rank==0) + { + printf("Check of Dw_dble()|eoflg=1 = %.1e\n\n",d); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check7.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check7.c new file mode 100644 index 0000000000000000000000000000000000000000..2ad98b8f43793f63f13288b9554d208e4be0709a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check7.c @@ -0,0 +1,429 @@ + +/******************************************************************************* +* +* File check7.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Comparison of Dw_blk(),..,Dwhat_blk() with Dw(). 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "block.h" +#include "dirac.h" +#include "global.h" + + +static void blk_s2zero(int ic,spinor *s) +{ + int nb,isw; + int nbh,n,nm,vol; + block_t *b; + + b=blk_list(SAP_BLOCKS,&nb,&isw); + nbh=nb/2; + vol=(*b).vol; + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;n]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + alloc_ws(4); + + swp=set_sw_parms(0.05); + mu=0.123f; + + if (my_rank==0) + printf("m0 = %.4e, mu = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,mu,swp.csw,swp.cF[0],swp.cF[1]); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + + assign_ud2u(); + assign_swd2sw(); + assign_ud2ubgr(SAP_BLOCKS); + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + ps=reserve_ws(4); + b=blk_list(SAP_BLOCKS,&nb,&isw); + nbh=nb/2; + vol=(*b).vol; + volh=vol/2; + + for (itm=0;itm<2;itm++) + { + ie=0; + dmax=0.0f; + set_tm_parms(itm); + + if (my_rank==0) + printf("Twisted-mass flag = %d\n",itm); + + for (ic=0;ic<2;ic++) + { + random_s(VOLUME,ps[0],1.0f); + random_s(VOLUME,ps[2],1.0f); + blk_s2zero(ic^0x1,ps[0]); + blk_s2zero(ic^0x1,ps[2]); + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;ndmax) + dmax=d; + } + + error_chk(); + error(ie,1,"main [check7.c]", + "Dw_blk() changes the fields where it should not"); + + dmax=(float)(sqrt((double)(dmax))); + 
+ if (my_rank==0) + { + printf("The maximal relative deviations are:\n\n"); + printf("Dw_blk(): %.1e\n",dmax); + } + + dmax=0.0f; + random_s(VOLUME,ps[0],1.0f); + random_s(VOLUME,ps[1],1.0f); + + for (n=0;ndmax) + dmax=d; + + random_s(VOLUME,ps[0],1.0f); + random_s(VOLUME,ps[1],1.0f); + + for (n=0;ndmax) + dmax=d; + + error_chk(); + error(ie,1,"main [check7.c]", + "Dwee_blk() or Dwoo_blk() changes the fields where it should not"); + + dmax=(float)(sqrt((double)(dmax))); + + if (my_rank==0) + printf("Dwee_blk(), Dwoo_blk(): %.1e\n",dmax); + + dmax=0.0f; + + for (ic=0;ic<2;ic++) + { + random_s(VOLUME,ps[0],1.0f); + random_s(VOLUME,ps[1],1.0f); + random_s(VOLUME,ps[2],1.0f); + blk_s2zero(ic^0x1,ps[0]); + blk_s2zero(ic^0x1,ps[2]); + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;ndmax) + dmax=d; + } + + error_chk(); + error(ie,1,"main [check7.c]", + "Dweo_blk() changes the fields where it should not"); + + dmax=(float)(sqrt((double)(dmax))); + + if (my_rank==0) + printf("Dweo_blk(): %.1e\n",dmax); + + dmax=0.0f; + + for (ic=0;ic<2;ic++) + { + random_s(VOLUME,ps[0],1.0f); + random_s(VOLUME,ps[1],1.0f); + random_s(VOLUME,ps[2],1.0f); + blk_s2zero(ic^0x1,ps[0]); + blk_s2zero(ic^0x1,ps[2]); + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;ndmax) + dmax=d; + } + + error_chk(); + error(ie,1,"main [check7.c]", + "Dwoe_blk() changes the fields where it should not"); + + dmax=(float)(sqrt((double)(dmax))); + + if (my_rank==0) + printf("Dwoe_blk(): %.1e\n",dmax); + + dmax=0.0f; + random_s(VOLUME,ps[0],1.0f); + random_s(VOLUME,ps[1],1.0f); + + for (n=0;ndmax) + dmax=d; + + assign_s2sblk(SAP_BLOCKS,n,ALL_PTS,ps[0],0); + mulr_spinor_add(volh,b[n].s[0]+volh,b[n].s[1]+volh,-1.0f); + if (norm_square(volh,0,b[n].s[0]+volh)!=0.0f) + ie=1; + } + + error_chk(); + error(ie,1,"main [check7.c]", + "Dwhat_blk() changes the fields where it should not"); + + dmax=(float)(sqrt((double)(dmax))); + + if (NPROC>1) + { + d=dmax; + 
MPI_Reduce(&d,&dmax,1,MPI_FLOAT,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_FLOAT,0,MPI_COMM_WORLD); + } + + if (my_rank==0) + printf("Dwhat_blk(): %.1e\n\n",dmax); + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check7.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check7.in new file mode 100644 index 0000000000000000000000000000000000000000..bd654839cac6ab535881018a1109ac0080e8af27 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check7.in @@ -0,0 +1 @@ +bs 4 4 4 4 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check8.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check8.c new file mode 100644 index 0000000000000000000000000000000000000000..c946bc7e322993b91925d948ce44ae89417ccdf3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check8.c @@ -0,0 +1,444 @@ + +/******************************************************************************* +* +* File check8.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Comparison of Dw_blk_dble(),..,Dwhat_blk_dble with Dw_dble(). 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "block.h" +#include "dirac.h" +#include "global.h" + + +static void blk_sd2zero(int ic,spinor_dble *sd) +{ + int nb,isw; + int nbh,n,nm,vol; + block_t *b; + + b=blk_list(DFL_BLOCKS,&nb,&isw); + nbh=nb/2; + vol=(*b).vol; + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;n]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + set_dfl_parms(bs,2); + alloc_bgr(DFL_BLOCKS); + alloc_wsd(4); + + swp=set_sw_parms(0.05); + mu=0.123; + + if (my_rank==0) + printf("m0 = %.4e, mu = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,mu,swp.csw,swp.cF[0],swp.cF[1]); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + + psd=reserve_wsd(4); + b=blk_list(DFL_BLOCKS,&nb,&isw); + nbh=nb/2; + vol=(*b).vol; + volh=vol/2; + + for (itm=0;itm<2;itm++) + { + ie=0; + dmax=0.0; + set_tm_parms(itm); + + if (my_rank==0) + printf("Twisted-mass flag = %d\n",itm); + + for (ic=0;ic<2;ic++) + { + random_sd(VOLUME,psd[0],1.0); + random_sd(VOLUME,psd[2],1.0); + blk_sd2zero(ic^0x1,psd[0]); + blk_sd2zero(ic^0x1,psd[2]); + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;ndmax) + dmax=d; + } + + error_chk(); + error(ie,1,"main [check8.c]", + "Dw_blk_dble() changes the fields where it should not"); + + dmax=sqrt(dmax); + + if (my_rank==0) + { + printf("The maximal relative deviations are:\n\n"); + printf("Dw_blk_dble(): %.1e\n",dmax); + } 
+ + dmax=0.0; + random_sd(VOLUME,psd[0],1.0); + random_sd(VOLUME,psd[1],1.0); + + for (n=0;ndmax) + dmax=d; + + random_sd(VOLUME,psd[0],1.0); + random_sd(VOLUME,psd[1],1.0); + + for (n=0;ndmax) + dmax=d; + + error_chk(); + error(ie,1,"main [check8.c]","Dwee_blk_dble() or Dwoo_blk_dble() " + "changes the fields where it should not"); + + dmax=sqrt(dmax); + + if (my_rank==0) + printf("Dwee_blk_dble(), Dwoo_blk_dble(): %.1e\n",dmax); + + dmax=0.0; + + for (ic=0;ic<2;ic++) + { + random_sd(VOLUME,psd[0],1.0); + random_sd(VOLUME,psd[1],1.0); + random_sd(VOLUME,psd[2],1.0); + blk_sd2zero(ic^0x1,psd[0]); + blk_sd2zero(ic^0x1,psd[2]); + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;ndmax) + dmax=d; + } + + error_chk(); + error(ie,1,"main [check8.c]", + "Dweo_blk_dble() changes the fields where it should not"); + + dmax=sqrt(dmax); + + if (my_rank==0) + printf("Dweo_blk_dble(): %.1e\n",dmax); + + dmax=0.0; + + for (ic=0;ic<2;ic++) + { + random_sd(VOLUME,psd[0],1.0); + random_sd(VOLUME,psd[1],1.0); + random_sd(VOLUME,psd[2],1.0); + blk_sd2zero(ic^0x1,psd[0]); + blk_sd2zero(ic^0x1,psd[2]); + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;ndmax) + dmax=d; + } + + error_chk(); + error(ie,1,"main [check8.c]", + "Dwoe_blk_dble() changes the fields where it should not"); + + dmax=sqrt(dmax); + + if (my_rank==0) + printf("Dwoe_blk_dble(): %.1e\n",dmax); + + dmax=0.0; + random_sd(VOLUME,psd[0],1.0); + random_sd(VOLUME,psd[1],1.0); + + for (n=0;ndmax) + dmax=d; + + assign_sd2sdblk(DFL_BLOCKS,n,ALL_PTS,psd[0],0); + mulr_spinor_add_dble(volh,b[n].sd[0]+volh,b[n].sd[1]+volh,-1.0); + if (norm_square_dble(volh,0,b[n].sd[0]+volh)!=0.0) + ie=1; + } + + error_chk(); + error(ie,1,"main [check8.c]", + "Dwhat_blk_dble() changes the fields where it should not"); + + dmax=sqrt(dmax); + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + if (my_rank==0) + 
printf("Dwhat_blk_dble(): %.1e\n\n",dmax); + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check9.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check9.c new file mode 100644 index 0000000000000000000000000000000000000000..6836c34b8afb12d4350b27815f4ccfbf8d6b0cb0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/dirac/check9.c @@ -0,0 +1,260 @@ + +/******************************************************************************* +* +* File check9.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Comparison of Dw_bnd() with Dw(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "block.h" +#include "sap.h" +#include "dirac.h" +#include "global.h" + +typedef union +{ + weyl w; + float r[12]; +} spin_t; + + +static void blk_s2zero(int ic,spinor *s) +{ + int nb,isw; + int nbh,n,nm,vol; + block_t *b; + + b=blk_list(SAP_BLOCKS,&nb,&isw); + nbh=nb/2; + vol=(*b).vol; + + if (ic^isw) + n=nbh; + else + n=0; + + nm=n+nbh; + + for (;n]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + alloc_ws(4); + + swp=set_sw_parms(0.05); + mu=0.123f; + + if (my_rank==0) + 
printf("m0 = %.4e, mu = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,mu,swp.csw,swp.cF[0],swp.cF[1]); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + + assign_ud2u(); + assign_swd2sw(); + assign_ud2ubgr(SAP_BLOCKS); + + ps=reserve_ws(4); + b=blk_list(SAP_BLOCKS,&nb,&isw); + nbh=nb/2; + vol=(*b).vol; + + ie=0; + dmax=0.0f; + + for (ic=0;ic<2;ic++) + { + random_s(VOLUME,ps[0],1.0f); + assign_s2s(VOLUME,ps[0],ps[3]); + + if (ic^isw) + n=nbh; + else + n=0; + nm=n+nbh; + + for (;ndmax) + dmax=d; + + if (ic^isw) + n=nbh; + else + n=0; + nm=n+nbh; + + for (;n +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,count,nt; + int i,nflds; + float mu; + double phi[2],phi_prime[2]; + double wt1,wt2,wdt; + spinor **ps; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time1.log","w",stdout); + + printf("\n"); + printf("Timing of Dw() and Dwhat()\n"); + printf("--------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + + if ((VOLUME*sizeof(float))<(64*1024)) + { + printf("The local size of the gauge field is %d KB\n", + (int)((72*VOLUME*sizeof(float))/(1024))); + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(float))/(1024))); + } + else + { + printf("The local size of the gauge field is %d MB\n", + (int)((72*VOLUME*sizeof(float))/(1024*1024))); + printf("The local size of a quark field is %d MB\n", + 
(int)((24*VOLUME*sizeof(float))/(1024*1024))); + } + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time1.c]", + "Syntax: time1 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + + set_sw_parms(-0.0123); + mu=0.0785f; + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2u(); + assign_swd2sw(); + + nflds=(int)((4*1024*1024)/(VOLUME*sizeof(float)))+1; + if ((nflds%2)==1) + nflds+=1; + alloc_ws(nflds); + ps=reserve_ws(nflds); + + for (i=0;i +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,count,nt; + int i,nflds; + double phi[2],phi_prime[2]; + double mu,wt1,wt2,wdt; + spinor_dble **psd; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time2.log","w",stdout); + + printf("\n"); + printf("Timing of Dw_dble() and Dwhat_dble()\n"); + printf("------------------------------------\n\n"); + + printf("%dx%dx%dx%d 
lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + + if ((VOLUME*sizeof(double))<(64*1024)) + { + printf("The local size of the gauge field is %d KB\n", + (int)((72*VOLUME*sizeof(double))/(1024))); + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(double))/(1024))); + } + else + { + printf("The local size of the gauge field is %d MB\n", + (int)((72*VOLUME*sizeof(double))/(1024*1024))); + printf("The local size of a quark field is %d MB\n", + (int)((24*VOLUME*sizeof(double))/(1024*1024))); + } + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time2.c]", + "Syntax: time2 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + + set_sw_parms(-0.0123); + mu=0.0785; + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + + nflds=(int)((4*1024*1024)/(VOLUME*sizeof(double)))+1; + if ((nflds%2)==1) + nflds+=1; + alloc_wsd(nflds); + psd=reserve_wsd(nflds); + + for (i=0;i +#include +#include +#include "mpi.h" 
+#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,count,nt; + int n,nb,isw,bs[4]; + float mu; + double phi[2],phi_prime[2]; + double wt1,wt2,wdt; + block_t *b; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time3.log","w",stdout); + fin=freopen("check7.in","r",stdin); + + printf("\n"); + printf("Timing of Dw_blk() and Dwhat_blk()\n"); + printf("----------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + + if ((VOLUME*sizeof(float))<(64*1024)) + { + printf("The local size of the gauge field is %d KB\n", + (int)((72*VOLUME*sizeof(float))/(1024))); + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(float))/(1024))); + } + else + { + printf("The local size of the gauge field is %d MB\n", + (int)((72*VOLUME*sizeof(float))/(1024*1024))); + printf("The local size of a quark field is %d MB\n", + (int)((24*VOLUME*sizeof(float))/(1024*1024))); + } + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + 
+ read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + fclose(fin); + + printf("bs = %d %d %d %d\n\n",bs[0],bs[1],bs[2],bs[3]); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time3.c]", + "Syntax: time3 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + + set_sw_parms(-0.0123); + mu=0.0785f; + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2ubgr(SAP_BLOCKS); + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + b=blk_list(SAP_BLOCKS,&nb,&isw); + random_s((*b).vol,(*b).s[0],1.0f); + + nt=(int)(2.0e6f/(double)(VOLUME)); + if (nt<2) + nt=2; + wdt=0.0; + + while (wdt<5.0) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + for (count=0;count +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,count,nt; + int n,nb,isw,bs[4]; + double phi[2],phi_prime[2]; + double mu,wt1,wt2,wdt; + block_t *b; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time4.log","w",stdout); + fin=freopen("check7.in","r",stdin); + + printf("\n"); + printf("Timing of Dw_blk_dble() and Dwhat_blk_dble()\n"); + printf("--------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + 
printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + + if ((VOLUME*sizeof(double))<(64*1024)) + { + printf("The local size of the gauge field is %d KB\n", + (int)((72*VOLUME*sizeof(double))/(1024))); + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(double))/(1024))); + } + else + { + printf("The local size of the gauge field is %d MB\n", + (int)((72*VOLUME*sizeof(double))/(1024*1024))); + printf("The local size of a quark field is %d MB\n", + (int)((24*VOLUME*sizeof(double))/(1024*1024))); + } + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + fclose(fin); + + printf("bs = %d %d %d %d\n\n",bs[0],bs[1],bs[2],bs[3]); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time4.c]", + "Syntax: time4 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + set_dfl_parms(bs,4); + alloc_bgr(DFL_BLOCKS); + + set_sw_parms(-0.0123); + mu=0.0785; + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2udblk(DFL_BLOCKS,0); + assign_swd2swdblk(DFL_BLOCKS,0,NO_PTS); + + 
b=blk_list(DFL_BLOCKS,&nb,&isw); + random_sd((*b).vol,(*b).sd[0],1.0); + + nt=(int)(2.0e6f/(double)(VOLUME)); + if (nt<2) + nt=2; + wdt=0.0; + + while (wdt<5.0) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + for (count=0;count that +allows the type of boundary condition to be chosen at runtime. When the option +is not set, open boundary conditions are assumed. + +The option may be set but has no effect in the case of check5 and check8 (the +boundary conditions are selected through the input parameter file in these +cases). diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..74e1ee1814d120588b7ef6877e7bfd026468356a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/Makefile @@ -0,0 +1,170 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 check6 check7 \ + check8 check9 check10 check11 time1 + +ARCHIVE = archive + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +DFL = dfl_geometry dfl_subspace ltl_gcr dfl_sap_gcr dfl_modes + +DIRAC = Dw_dble Dw Dw_bnd + +FLAGS = flags action_parms dfl_parms force_parms hmc_parms lat_parms \ + sap_parms solver_parms mdint_parms rat_parms + +FORCES = 
force0 force1 force2 force3 force4 force5 \ + frcfcts genfrc tmcg tmcgm xtensor + +LATTICE = bcnds uidx ftidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix + +LINSOLV = cgne fgcr fgcr4vd mscg + +LITTLE = Aw_gen Aw_com Aw_ops Aw_dble Aw ltl_modes + +MDFLDS = mdflds fcom + +RANDOM = ranlux ranlxs ranlxd gauss + +RATFCTS = elliptic zolotarev ratfcts + +SAP = sap_com sap_gcr sap blk_solv + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +SW_TERM = pauli pauli_dble swflds sw_term + +TCHARGE = ftcom ftensor + +UFLDS = plaq_sum shift uflds udcom bstap + +UPDATE = chrono + +UTILS = endian mutils utils wspace + +VFLDS = vflds vinit vcom vdcom + +MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/mdflds:$(MDIR)/su3fcts:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ + $(MDIR)/update:$(MDIR)/ratfcts + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + +# -DCGNE_DBG -DFGCR_DBG -DMSCG_DBG +# -DDFL_MODES_DBG + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + 
+-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..0666fd85b96fc0305eaf8164cba2bc57db458554 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check1.c @@ -0,0 +1,433 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Gauge and translation invariance of the gauge action. 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,nfc[8],ofs[8]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static double bnd_action(void) +{ + int i,j; + double c0,c1,*cG,*phi; + double s[3],d0[2],d1[2],act; + lat_parms_t lat; + bc_parms_t bcp; + + if ((bc==1)||(bc==2)) + { + lat=lat_parms(); + bcp=bc_parms(); + + s[0]=(double)(N1); + s[1]=(double)(N2); + s[2]=(double)(N3); + + for (i=0;i<2;i++) + { + d0[i]=0.0; + d1[i]=0.0; + phi=bcp.phi[i]; + + for (j=0;j<3;j++) + { + d0[i]-=(cos(phi[0]/s[j])+cos(phi[1]/s[j])+ + cos(phi[2]/s[j])-3.0); + d1[i]-=(cos(2.0*phi[0]/s[j])+cos(2.0*phi[1]/s[j])+ + cos(2.0*phi[2]/s[j])-3.0); + } + } + + c0=lat.c0; + c1=lat.c1; + cG=bcp.cG; + + act=c0*cG[1]*d0[1]+c1*d0[1]+c1*1.5*d1[1]; + + if (bc==1) + act+=(c0*cG[0]*d0[0]+c1*d0[0]+c1*1.5*d1[0]); + + return (lat.beta/3.0)*(double)(N1*N2*N3)*act; + } + else + return 0.0; +} + + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + 
MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) +{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + (*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + u=udfld(); + + for (ix=(VOLUME/2);ix(bs[mu]/2)) + svec[mu]-=bs[mu]; + } + + MPI_Bcast(svec,4,MPI_INT,0,MPI_COMM_WORLD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,n,s[4]; + double phi[2],phi_prime[2],p1,p2; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + + printf("\n"); + printf("Gauge and translation invariance of the gauge action\n"); + printf("----------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check1.c]", + "Syntax: check1 [-bc ]"); + } + + set_lat_parms(3.5,0.33,0,NULL,1.0); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.9012,1.2034,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + + g=amalloc(NSPIN*sizeof(*g),ALIGN); + if (BNDRY!=0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),ALIGN); + + error((g==NULL)||((BNDRY!=0)&&(gbuf==NULL)),1,"main [check1.c]", + "Unable to allocate auxiliary arrays"); + + chs_ubnd(-1); 
+ p1=action0(1); + p2=bnd_action(); + + if (my_rank==0) + { + printf("Action after initialization = %.15e\n",p1); + printf("Expected value = %.15e\n\n",p2); + } + + random_ud(); + chs_ubnd(-1); + p1=action0(1); + random_g(); + transform_ud(); + p2=action0(1); + + if (my_rank==0) + { + printf("Random gauge field:\n"); + printf("Action = %.12e\n",p1); + printf("Gauge invariance: relative difference = %.1e\n\n", + fabs(1.0-p2/p1)); + } + + if (my_rank==0) + printf("Translation invariance:\n"); + + p1=action0(1); + + for (n=0;n<8;n++) + { + random_vec(s); + if (bc!=3) + s[0]=0; + chs_ubnd(1); + shift_ud(s); + chs_ubnd(-1); + p2=action0(1); + + if (my_rank==0) + { + printf("s=(% d, % d,% d,% d), ",s[0],s[1],s[2],s[3]); + printf("relative deviation = %.1e\n",fabs(1.0-p2/p1)); + } + } + + if (my_rank==0) + { + printf("\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check10.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check10.c new file mode 100644 index 0000000000000000000000000000000000000000..3dcd7f2d66ac2606eea15c6c837abb3ec98998ea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check10.c @@ -0,0 +1,450 @@ + +/******************************************************************************* +* +* File check10.c +* +* Copyright (C) 2012, 2013 Martin Luescher, Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of force4() and action4(). 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dfl.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) + + +static void rot_ud(double eps) +{ + int bc,ix,t,ifc; + su3_dble *u; + su3_alg_dble *mom; + mdflds_t *mdfs; + + bc=bc_type(); + mdfs=mdflds(); + mom=(*mdfs).mom; + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912; + chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + if (my_rank==0) + { + find_section("SAP"); + read_iprms("bs",4,bs); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,1,4,5); + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_iprms("bs",4,bs); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); 
+ } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); + + set_hmc_parms(0,NULL,1,0,NULL,1,1.0); + mnkv=0; + + for (isp=0;isp<3;isp++) + { + read_solver_parms(isp); + sp=solver_parms(isp); + + if (sp.nkv>mnkv) + mnkv=sp.nkv; + } + + if (my_rank==0) + fclose(fin); + + print_solver_parms(&isap,&idfl); + print_sap_parms(1); + print_dfl_parms(0); + + start_ranlux(0,1245); + geometry(); + + set_sw_parms(-0.0123); + mnkv=2*mnkv+2; + if (mnkv<(Ns+2)) + mnkv=Ns+2; + if (mnkv<5) + mnkv=5; + + alloc_ws(mnkv); + alloc_wsd(7); + alloc_wv(2*nkv+2); + alloc_wvd(4); + + for (isw=0;isw<2;isw++) + { + for (isp=0;isp<3;isp++) + { + if (isp==0) + { + mu=1.0; + eps=1.0e-4; + } + else if (isp==1) + { + mu=0.1; + eps=2.0e-4; + } + else + { + mu=0.01; + eps=3.0e-4; + } + + random_ud(); + chs_ubnd(-1); + random_mom(); + + if (isp==2) + { + dfl_modes(status); + error_root(status[0]<0,1,"main [check10.c]", + "dfl_modes failed"); + } + + status[0]=0; + status[1]=0; + + act0=setpf4(mu,0,isw,0); + + act1=action4(mu,0,isw,isp,0,status); + error_root((status[0]<0)||(status[1]<0),1, + "main [check10.c]","action4 failed %d ",isp); + + rdmy=fabs(act1-act0); + MPI_Reduce(&rdmy,dev_act,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + rdmy=act1-act0; + MPI_Reduce(&rdmy,dev_act+1,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(dev_act,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rot_ud(eps); + dsdt=dSdt(mu,0,isw,isp,status); + + if (my_rank==0) + { + printf("Solver number %d, isw %d\n", + isp,isw); + + if (isp==0) + printf("Status = %d\n",status[0]); + else if (isp==1) + printf("Status = %d,%d\n",status[0],status[1]); + else + printf("Status = (%d,%d,%d),(%d,%d,%d)\n", + status[0],status[1],status[2],status[3], + status[4],status[5]); + + printf("Absolute action difference |setpf4-action4| = %.1e,", + fabs(dev_act[1])); + printf(" %.1e (local)\n",dev_act[0]); + fflush(flog); + } + + 
rot_ud(eps); + act0=2.0*action4(mu,0,isw,isp,0,status)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action4(mu,0,isw,isp,0,status)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + act0-=action4(mu,0,isw,isp,0,status)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action4(mu,0,isw,isp,0,status)/12.0; + rot_ud(2.0*eps); + + dact=1.2345*(act0-act1)/eps; + dev_frc=dsdt-dact; + sig_loss=-log10(fabs(1.0-act0/act1)); + + rdmy=dsdt; + MPI_Reduce(&rdmy,&dsdt,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dsdt,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=dev_frc; + MPI_Reduce(&rdmy,&dev_frc,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dev_frc,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=sig_loss; + MPI_Reduce(&rdmy,&sig_loss,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&sig_loss,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error_chk(); + + if (my_rank==0) + { + printf("Relative deviation of dS/dt = %.2e ",fabs(dev_frc/dsdt)); + printf("[significance loss = %d digits]\n\n",(int)(sig_loss)); + fflush(flog); + } + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check10.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check10.in new file mode 100644 index 0000000000000000000000000000000000000000..ff9c44f39aa004b13ba19ba377713fad35faaa11 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check10.in @@ -0,0 +1,43 @@ + +[Solver 0] +solver CGNE +nmx 256 +res 1.0e-12 + +[Solver 1] +solver SAP_GCR +nmx 128 +nkv 16 +isolv 0 +nmr 4 +ncy 3 +res 1.0e-12 + +[Solver 2] +solver DFL_SAP_GCR +nmx 64 +nkv 16 +isolv 1 +nmr 4 +ncy 5 +res 1.0e-12 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 + diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check11.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check11.c new file mode 100644 index 0000000000000000000000000000000000000000..b3466b0c78314eabdcf484ff07300fde984791d7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check11.c @@ -0,0 +1,453 @@ + +/******************************************************************************* +* +* File check11.c +* +* Copyright (C) 2012, 2013 Martin Luescher, Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of force5() and action5(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dfl.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) + + +static void rot_ud(double eps) +{ + int bc,ix,t,ifc; + su3_dble *u; + su3_alg_dble *mom; + mdflds_t *mdfs; + + bc=bc_type(); + mdfs=mdflds(); + mom=(*mdfs).mom; + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912; + chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + if (my_rank==0) + { + find_section("SAP"); + read_iprms("bs",4,bs); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,1,4,5); + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_iprms("bs",4,bs); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + 
MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); + + set_hmc_parms(0,NULL,1,0,NULL,1,1.0); + mnkv=0; + + for (isp=0;isp<3;isp++) + { + read_solver_parms(isp); + sp=solver_parms(isp); + + if (sp.nkv>mnkv) + mnkv=sp.nkv; + } + + if (my_rank==0) + fclose(fin); + + print_solver_parms(&isap,&idfl); + print_sap_parms(1); + print_dfl_parms(0); + + start_ranlux(0,1245); + geometry(); + + set_sw_parms(-0.0123); + mnkv=2*mnkv+2; + if (mnkv<(Ns+2)) + mnkv=Ns+2; + if (mnkv<5) + mnkv=5; + + alloc_ws(mnkv); + alloc_wsd(7); + alloc_wv(2*nkv+2); + alloc_wvd(4); + + for (isw=0;isw<2;isw++) + { + for (isp=0;isp<3;isp++) + { + if (isp==0) + { + mu0=1.0; + mu1=1.5; + eps=1.0e-4; + } + else if (isp==1) + { + mu0=0.1; + mu1=0.25; + eps=2.0e-4; + } + else + { + mu0=0.01; + mu1=0.02; + eps=3.0e-4; + } + + random_ud(); + chs_ubnd(-1); + random_mom(); + + if (isp==2) + { + dfl_modes(status); + error_root(status[0]<0,1,"main [check11.c]", + "dfl_modes failed"); + } + + status[0]=0; + status[1]=0; + + act0=setpf5(mu0,mu1,0,isp,0,status); + + act1=action5(mu0,mu1,0,isp,0,status); + error_root((status[0]<0)||(status[1]<0),1, + "main 
[check11.c]","action5 failed %d ",isp); + + rdmy=fabs(act1-act0); + MPI_Reduce(&rdmy,dev_act,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + rdmy=act1-act0; + MPI_Reduce(&rdmy,dev_act+1,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(dev_act,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rot_ud(eps); + dsdt=dSdt(mu0,mu1,0,isw,isp,status); + + if (my_rank==0) + { + printf("Solver number %d, isw %d\n", + isp,isw); + + if (isp==0) + printf("Status = %d\n",status[0]); + else if (isp==1) + printf("Status = %d,%d\n",status[0],status[1]); + else + printf("Status = (%d,%d,%d),(%d,%d,%d)\n", + status[0],status[1],status[2],status[3], + status[4],status[5]); + + printf("Absolute action difference |setpf5-action5| = %.1e,", + fabs(dev_act[1])); + printf(" %.1e (local)\n",dev_act[0]); + fflush(flog); + } + + rot_ud(eps); + act0=2.0*action5(mu0,mu1,0,isp,0,status)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action5(mu0,mu1,0,isp,0,status)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + act0-=action5(mu0,mu1,0,isp,0,status)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action5(mu0,mu1,0,isp,0,status)/12.0; + rot_ud(2.0*eps); + + dact=1.2345*(act0-act1)/eps; + dev_frc=dsdt-dact; + sig_loss=-log10(fabs(1.0-act0/act1)); + + rdmy=dsdt; + MPI_Reduce(&rdmy,&dsdt,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dsdt,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=dev_frc; + MPI_Reduce(&rdmy,&dev_frc,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dev_frc,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=sig_loss; + MPI_Reduce(&rdmy,&sig_loss,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&sig_loss,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error_chk(); + + if (my_rank==0) + { + printf("Relative deviation of dS/dt = %.2e ",fabs(dev_frc/dsdt)); + printf("[significance loss = %d digits]\n\n",(int)(sig_loss)); + fflush(flog); + } + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check2.c 
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..c2b841801a3dd1ee5a9edd0491ca67c41f124ba2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check2.c @@ -0,0 +1,459 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2012-2014 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Gauge action of constant Abelian background fields. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,np[4],bo[4]; +static double mt[4][4],inp[4],twopi; +static su3_dble ud0={{0.0}}; + + +static double afld(int *x,int mu) +{ + int nu; + double xt[4],phi; + + xt[0]=(double)(safe_mod(x[0],N0)); + xt[1]=(double)(safe_mod(x[1],N1)); + xt[2]=(double)(safe_mod(x[2],N2)); + xt[3]=(double)(safe_mod(x[3],N3)); + + phi=0.0; + + for (nu=0;nu1) + { + MPI_Reduce(&rs0,s0,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Reduce(&rs1,s1,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(s0,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(s1,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + (*s0)=rs0; + (*s1)=rs1; + } +} + + +static double Amt(void) +{ + int mu,nu; + double c0,c1,*cG; + double smt0,smt1,sms0,sms1,pi; + double xl[4],phi,n0,s0,s1,bs0,bs1; + lat_parms_t lat; + bc_parms_t bcp; + + lat=lat_parms(); + c0=lat.c0; + c1=lat.c1; + bcp=bc_parms(); + cG=bcp.cG; + + xl[0]=(double)(N0); + xl[1]=(double)(N1); + xl[2]=(double)(N2); + 
xl[3]=(double)(N3); + + pi=4.0*atan(1.0); + smt0=0.0; + smt1=0.0; + sms0=0.0; + sms1=0.0; + + for (mu=1;mu<4;mu++) + { + for (nu=0;nu=(VOLUME/2)) + { + x[0]=bo[0]+x0; + x[1]=bo[1]+x1; + x[2]=bo[2]+x2; + x[3]=bo[3]+x3; + + u=udb+8*(ix-(VOLUME/2)); + + for (ifc=0;ifc<8;ifc++) + { + if (ifc&0x1) + x[ifc/2]-=1; + + phi=afld(x,ifc/2); + + if (ifc&0x1) + x[ifc/2]+=1; + + (*u)=ud0; + (*u).c11.re=cos(phi); + (*u).c11.im=sin(phi); + (*u).c22.re=(*u).c11.re; + (*u).c22.im=(*u).c11.im; + (*u).c33.re=cos(-2.0*phi); + (*u).c33.im=sin(-2.0*phi); + u+=1; + } + } + } + } + } + } + + set_bc(); + set_flags(UPDATED_UD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,i; + double A1,A2,d,dmax; + double phi[2],phi_prime[2]; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + printf("\n"); + printf("Gauge action of constant Abelian background fields\n"); + printf("--------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + set_lat_parms(3.5,0.33,0,NULL,1.0); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.9012,1.2034,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123); + geometry(); + + twopi=8.0*atan(1.0); + + np[0]=N0; + np[1]=N1; + np[2]=N2; + np[3]=N3; + + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + + inp[0]=1.0/(double)(np[0]); + inp[1]=1.0/(double)(np[1]); + inp[2]=1.0/(double)(np[2]); + inp[3]=1.0/(double)(np[3]); + + dmax=0.0; + + for (i=0;i<10;i++) + { + 
choose_mt(); + set_ud(); + + A1=Amt(); + A2=action0(1); + + if (my_rank==0) + printf("Field no = %2d, A1 = %12.6e, A2 = %12.6e\n",i+1,A1,A2); + + d=fabs(A1-A2)/A1; + if (d>dmax) + dmax=d; + } + + error_chk(); + + if (my_rank==0) + { + printf("\n"); + printf("Maximal relative deviation = %.1e\n\n",dmax); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..d4b69a7868ffcc5bd4d7d9f7fb0a749ac86e34f7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check3.c @@ -0,0 +1,337 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2008-2013 Martin Luescher, Filippo Palombi +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the programs force0() and action0(). 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "linalg.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) + + +static void rot_ud(double eps) +{ + int bc,ix,t,ifc; + su3_dble *u; + su3_alg_dble *mom; + mdflds_t *mdfs; + + bc=bc_type(); + mdfs=mdflds(); + mom=(*mdfs).mom; + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(3.5,0.33,0,NULL,1.0); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.9012,1.2034,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + alloc_wfd(1); + c=0.789; + chk_chs(c); + + for (k=0;k<4;k++) + { + random_ud(); + chs_ubnd(-1); + random_mom(); + dsdt=dSdt(c); + + eps=1.0e-4; + rot_ud(eps); + act0=2.0*action0(0)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action0(0)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + act0-=action0(0)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action0(0)/12.0; + rot_ud(2.0*eps); + + act0*=c; + act1*=c; + + dact=(act0-act1)/eps; + dev_frc=dsdt-dact; + sig_loss=-log10(fabs(1.0-act0/act1)); + + rdmy=dsdt; + MPI_Reduce(&rdmy,&dsdt,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dsdt,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=dev_frc; + MPI_Reduce(&rdmy,&dev_frc,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dev_frc,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=sig_loss; + MPI_Reduce(&rdmy,&sig_loss,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&sig_loss,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + ie=check_bc(0.0); + error_root(ie!=1,1,"main [check3.c]", + "Operations did not preserve boundary conditions"); + + error_chk(); + + if (my_rank==0) + 
{ + printf("Relative deviation of dS/dt = %.2e ",fabs(dev_frc/dsdt)); + printf("[significance loss = %d digits]\n",(int)(sig_loss)); + } + } + + if (my_rank==0) + { + printf("\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..eb9ed18331c1c4878fd8948a46f820c9c3fab8eb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check4.c @@ -0,0 +1,737 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2005, 2008-2013 Martin Luescher, Filippo Palombi, +* Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of sw_frc() and hop_frc(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define MAX_LEVELS 8 +#define BLK_LENGTH 8 + +static int cnt[MAX_LEVELS]; +static double smx[MAX_LEVELS]; + + +static int is_Xt_zero(u3_alg_dble *X) +{ + int ie; + + ie=1; + ie&=((*X).c1==0.0); + ie&=((*X).c2==0.0); + ie&=((*X).c3==0.0); + ie&=((*X).c4==0.0); + ie&=((*X).c5==0.0); + ie&=((*X).c6==0.0); + ie&=((*X).c7==0.0); + ie&=((*X).c8==0.0); + ie&=((*X).c9==0.0); + + return ie; +} + + +static int is_Xv_zero(su3_dble *X) +{ + int ie; + + ie=1; + ie&=((*X).c11.re==0.0); + ie&=((*X).c11.im==0.0); + ie&=((*X).c12.re==0.0); + 
ie&=((*X).c12.im==0.0); + ie&=((*X).c13.re==0.0); + ie&=((*X).c13.im==0.0); + + ie&=((*X).c21.re==0.0); + ie&=((*X).c21.im==0.0); + ie&=((*X).c22.re==0.0); + ie&=((*X).c22.im==0.0); + ie&=((*X).c23.re==0.0); + ie&=((*X).c23.im==0.0); + + ie&=((*X).c31.re==0.0); + ie&=((*X).c31.im==0.0); + ie&=((*X).c32.re==0.0); + ie&=((*X).c32.im==0.0); + ie&=((*X).c33.re==0.0); + ie&=((*X).c33.im==0.0); + + return ie; +} + + +static int is_frc_zero(su3_alg_dble *f) +{ + int ie; + + ie=1; + ie&=((*f).c1==0.0); + ie&=((*f).c2==0.0); + ie&=((*f).c3==0.0); + ie&=((*f).c4==0.0); + ie&=((*f).c5==0.0); + ie&=((*f).c6==0.0); + ie&=((*f).c7==0.0); + ie&=((*f).c8==0.0); + + return ie; +} + + +static void check_Xtbnd(ptset_t set) +{ + int bc,ix,t,n,ie; + int ia,ib; + u3_alg_dble **xt; + + bc=bc_type(); + xt=xtensor(); + ie=0; + ia=0; + ib=VOLUME; + + if (set==EVEN_PTS) + ib=(VOLUME/2); + else if (set==ODD_PTS) + ia=(VOLUME/2); + else if (set==NO_PTS) + ia=VOLUME; + + for (ix=0;ix=ia)&&(ix1.0) + c=pow(4.0+swp.m0,-6.0); + else + c=1.0; + + for (n=0;nvol) + im=vol; + p=1.0; + + for (;ix0)||(bc==3))&&((t<(N0-1))||(bc!=0))) + { + z=det_pauli_dble(0.0,m); + p*=(c*z.re); + z=det_pauli_dble(0.0,m+1); + p*=(c*z.re); + } + + m+=2; + } + + cnt[0]+=1; + smx[0]-=log(fabs(p)); + + for (n=1;(cnt[n-1]>=BLK_LENGTH)&&(n]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912; + chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + start_ranlux(0,1245); + geometry(); + + set_sw_parms(-0.0123); + alloc_wsd(6); + phi=reserve_wsd(1); + + for (k=1;k<=4;k++) + { + random_ud(); + chs_ubnd(-1); + random_mom(); + random_sd(VOLUME,phi[0],1.0); + bnd_sd2zero(ALL_PTS,phi[0]); + dsdt=dSdt(k,phi); + + eps=5.0e-5; + rot_ud(eps); + act0=2.0*action(k,phi)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action(k,phi)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + 
act0-=action(k,phi)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action(k,phi)/12.0; + rot_ud(2.0*eps); + + s[0]=dsdt-(act0-act1)/eps; + s[1]=dsdt; + + if (NPROC>1) + { + MPI_Reduce(s,r,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(r,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + r[0]=s[0]; + r[1]=s[1]; + } + + dev_frc=fabs(r[0]/r[1]); + sig_loss=-log10(fabs(1.0-act0/act1)); + + error_chk(); + + if (my_rank==0) + { + printf("Calculation of the force for S=(phi,Q^%d*phi):\n",k); + printf("Relative deviation of dS/dt = %.2e ",dev_frc); + printf("[significance loss = %d digits]\n\n",(int)(sig_loss)); + } + } + + if (my_rank==0) + printf("Calculation of the force for S=-2*Tr{ln(SW term)}:\n"); + + for (k=0;k<4;k++) + { + if (k==0) + set=NO_PTS; + else if (k==1) + set=EVEN_PTS; + else if (k==2) + set=ODD_PTS; + else + set=ALL_PTS; + + random_ud(); + chs_ubnd(-1); + random_mom(); + dsdt=dSdt_det(set); + + eps=5.0e-4; + rot_ud(eps); + act0=2.0*action_det(set)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action_det(set)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + act0-=action_det(set)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action_det(set)/12.0; + rot_ud(2.0*eps); + + s[0]=dsdt-(act0-act1)/eps; + s[1]=dsdt; + + if (NPROC>1) + { + MPI_Reduce(s,r,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(r,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + r[0]=s[0]; + r[1]=s[1]; + } + + if (k>0) + { + dev_frc=fabs(r[0]/r[1]); + sig_loss=-log10(fabs(1.0-act0/act1)); + } + else + dev_frc=fabs(r[0]); + + error_chk(); + + if (my_rank==0) + { + if (k==0) + printf("set=NO_PTS: "); + else if (k==1) + printf("set=EVEN_PTS: "); + else if (k==2) + printf("set=ODD_PTS: "); + else + printf("set=ALL_PTS: "); + + if (k>0) + { + printf("relative deviation of dS/dt = %.2e ",dev_frc); + printf("[significance loss = %d digits]\n",(int)(sig_loss)); + } + else + printf("absolute deviation of dS/dt = %.2e\n",dev_frc); + } + } + + if (my_rank==0) + { + printf("\n"); 
+ fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check5.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check5.c new file mode 100644 index 0000000000000000000000000000000000000000..0643311962385df4b02ecb1985f3d15436ed31a4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check5.c @@ -0,0 +1,296 @@ + +/******************************************************************************* +* +* File check5.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check and performance of the CG solver. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "archive.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "linsolv.h" +#include "forces.h" +#include "global.h" + +static int my_rank,bc,first,last,step,nmx; +static double kappa,csw,mu,cF,cF_prime; +static double phi[2],phi_prime[2],m0,res; +static char cnfg_dir[NAME_SIZE],cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; + + +static void Dhatop_dble(spinor_dble *s,spinor_dble *r) +{ + Dwhat_dble(mu,s,r); + mulg5_dble(VOLUME/2,r); + mu=-mu; +} + + +static void Dhatop(spinor *s,spinor *r) +{ + Dwhat((float)(mu),s,r); + mulg5(VOLUME/2,r); + mu=-mu; +} + + +int main(int argc,char *argv[]) +{ + int nsize,icnfg,status,ie; + double rho,nrm,del; + double wt1,wt2,wdt; + complex_dble z; + spinor **ws; + spinor_dble **wsd,**psd; + lat_parms_t lat; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check5.log","w",stdout); + 
fin=freopen("check5.in","r",stdin); + + printf("\n"); + printf("Check and performance of the CG solver\n"); + printf("--------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + find_section("Configurations"); + read_line("name","%s",nbase); + read_line("cnfg_dir","%s",cnfg_dir); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + + find_section("Lattice parameters"); + read_line("kappa","%lf",&kappa); + read_line("csw","%lf",&csw); + read_line("mu","%lf",&mu); + + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + read_line("cF","%lf",&cF); + + if (bc==2) + read_line("cF'","%lf",&cF_prime); + else + cF_prime=cF; + + find_section("CG"); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + + fclose(fin); + } + + MPI_Bcast(nbase,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&first,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&last,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&step,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + lat=set_lat_parms(5.5,1.0,1,&kappa,csw); + 
print_lat_parms(); + + set_bc_parms(bc,1.0,1.0,cF,cF_prime,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + + m0=lat.m0[0]; + set_sw_parms(m0); + + if (my_rank==0) + { + printf("mu = %.6f\n\n",mu); + + printf("CG parameters:\n"); + printf("nmx = %d\n",nmx); + printf("res = %.2e\n\n",res); + + printf("Configurations %sn%d -> %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + fflush(flog); + } + + alloc_ws(5); + alloc_wsd(6); + psd=reserve_wsd(3); + + error_root(((last-first)%step)!=0,1,"main [check5.c]", + "last-first is not a multiple of step"); + check_dir_root(cnfg_dir); + nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check5.c]", + "configuration file name is too long"); + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); + + if (my_rank==0) + { + printf("Configuration no %d\n\n",icnfg); + fflush(flog); + } + + chs_ubnd(-1); + random_sd(VOLUME,psd[0],1.0); + bnd_sd2zero(ALL_PTS,psd[0]); + nrm=sqrt(norm_square_dble(VOLUME,1,psd[0])); + assign_sd2sd(VOLUME,psd[0],psd[2]); + + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + rho=tmcg(nmx,res,mu,psd[0],psd[1],&status); + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wdt=wt2-wt1; + + error_chk(); + z.re=-1.0; + z.im=0.0; + mulc_spinor_add_dble(VOLUME,psd[2],psd[0],z); + del=norm_square_dble(VOLUME,1,psd[2]); + error_root(del!=0.0,1,"main [check5.c]", + "Source field is not preserved"); + + Dw_dble(mu,psd[1],psd[2]); + mulg5_dble(VOLUME,psd[2]); + Dw_dble(-mu,psd[2],psd[1]); + mulg5_dble(VOLUME,psd[1]); + mulc_spinor_add_dble(VOLUME,psd[1],psd[0],z); + del=sqrt(norm_square_dble(VOLUME,1,psd[1])); + + if (my_rank==0) + { + printf("Solution w/o eo-preconditioning:\n"); + printf("status = %d\n",status); + printf("rho = %.2e, res = %.2e\n",rho,res); + printf("check = %.2e, check = %.2e\n",del,del/nrm); + printf("time = %.2e sec (total)\n",wdt); + if 
(status>0) + printf(" = %.2e usec (per point and CG iteration)", + (1.0e6*wdt)/((double)(status)*(double)(VOLUME))); + printf("\n\n"); + fflush(flog); + } + + ws=reserve_ws(5); + wsd=reserve_wsd(2); + ie=sw_term(ODD_PTS); + error_root(ie!=0,1,"main [check5.c]", + "Inversion of the SW term failed"); + assign_swd2sw(); + + random_sd(VOLUME/2,psd[0],1.0); + bnd_sd2zero(ALL_PTS,psd[0]); + nrm=sqrt(norm_square_dble(VOLUME/2,1,psd[0])); + assign_sd2sd(VOLUME/2,psd[0],psd[2]); + + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + rho=cgne(VOLUME/2,1,Dhatop,Dhatop_dble,ws,wsd,nmx,res, + psd[0],psd[1],&status); + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wdt=wt2-wt1; + + error_chk(); + z.re=-1.0; + z.im=0.0; + mulc_spinor_add_dble(VOLUME/2,psd[2],psd[0],z); + del=norm_square_dble(VOLUME/2,1,psd[2]); + error_root(del!=0.0,1,"main [check5.c]", + "Source field is not preserved"); + + Dhatop_dble(psd[1],psd[2]); + Dhatop_dble(psd[2],psd[1]); + mulc_spinor_add_dble(VOLUME/2,psd[1],psd[0],z); + del=sqrt(norm_square_dble(VOLUME/2,1,psd[1])); + + if (my_rank==0) + { + printf("Solution with eo-preconditioning:\n"); + printf("status = %d\n",status); + printf("rho = %.2e, res = %.2e\n",rho,res); + printf("check = %.2e, check = %.2e\n",del,del/nrm); + printf("time = %.2e sec (total)\n",wdt); + if (status>0) + printf(" = %.2e usec (per point and CG iteration)", + (1.0e6*wdt)/((double)(status)*(double)(VOLUME))); + printf("\n\n"); + fflush(flog); + } + + release_wsd(); + release_ws(); + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check5.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check5.in new file mode 100644 index 0000000000000000000000000000000000000000..a9407b43b0af4193d51d669a6fb92d0835e17bc2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check5.in @@ -0,0 +1,23 @@ + 
+[Configurations] +name 16x8x8x8b6.00id2 +cnfg_dir /home/data/openQCD/cnfg +first 7 +last 7 +step 1 + +[Lattice parameters] +kappa 0.1280 +csw 1.2 +mu 1.0 + +[Boundary conditions] +type 0 +#phi 0.12 -0.56 +#phi' 0.92 0.76 +cF 0.95 +#cF' 0.90 + +[CG] +nmx 256 +res 1.0e-12 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check6.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check6.c new file mode 100644 index 0000000000000000000000000000000000000000..9cc69cc7319d107075a7790aa061500ce03be274 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check6.c @@ -0,0 +1,444 @@ + +/******************************************************************************* +* +* File check6.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of force1() and action1(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dfl.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) + + +static void rot_ud(double eps) +{ + int bc,ix,t,ifc; + su3_dble *u; + su3_alg_dble *mom; + mdflds_t *mdfs; + + bc=bc_type(); + mdfs=mdflds(); + mom=(*mdfs).mom; + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912; + chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + if (my_rank==0) + { + find_section("SAP"); + read_iprms("bs",4,bs); + } + + 
MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,1,4,5); + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_iprms("bs",4,bs); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); + + set_hmc_parms(0,NULL,1,0,NULL,1,1.0); + mnkv=0; + + for (isp=0;isp<3;isp++) + { + read_solver_parms(isp); + sp=solver_parms(isp); + + if (sp.nkv>mnkv) + mnkv=sp.nkv; + } + + if (my_rank==0) + fclose(fin); + + print_solver_parms(&isap,&idfl); + print_sap_parms(1); + print_dfl_parms(0); + + start_ranlux(0,1245); + geometry(); + + set_sw_parms(-0.0123); + mnkv=2*mnkv+2; + if (mnkv<(Ns+2)) + mnkv=Ns+2; + if (mnkv<5) + mnkv=5; + + alloc_ws(mnkv); + alloc_wsd(6); + alloc_wv(2*nkv+2); + alloc_wvd(4); + + for (isp=0;isp<3;isp++) + { + if (isp==0) + { + mu=1.0; + eps=1.0e-4; + } + else if (isp==1) + { + mu=0.1; + eps=2.0e-4; + } + else + { + mu=0.01; + eps=3.0e-4; + } + + random_ud(); + chs_ubnd(-1); + random_mom(); + + if (isp==2) + { + dfl_modes(status); + error_root(status[0]<0,1,"main [check6.c]", + "dfl_modes failed"); + } + + 
status[0]=0; + status[1]=0; + + act0=setpf1(mu,0,0); + act1=action1(mu,0,isp,0,status); + error_root((status[0]<0)||(status[1]<0),1,"main [check6.c]", + "action1 failed (mu = %.2e, isp=%d)",mu,isp); + + rdmy=fabs(act1-act0); + MPI_Reduce(&rdmy,dev_act,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + rdmy=act1-act0; + MPI_Reduce(&rdmy,dev_act+1,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(dev_act,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + dsdt=dSdt(mu,0,isp,status); + + if (my_rank==0) + { + printf("Solver number %d, mu = %.2e\n",isp,mu); + + if (isp==0) + printf("Status = %d\n",status[0]); + else if (isp==1) + printf("Status = %d,%d\n",status[0],status[1]); + else + printf("Status = (%d,%d,%d),(%d,%d,%d)\n", + status[0],status[1],status[2],status[3], + status[4],status[5]); + + printf("Absolute action difference |setpf1-action1| = %.1e,", + fabs(dev_act[1])); + printf(" %.1e (local)\n",dev_act[0]); + fflush(flog); + } + + rot_ud(eps); + act0=2.0*action1(mu,0,isp,0,status)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action1(mu,0,isp,0,status)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + act0-=action1(mu,0,isp,0,status)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action1(mu,0,isp,0,status)/12.0; + rot_ud(2.0*eps); + + dact=1.2345*(act0-act1)/eps; + dev_frc=dsdt-dact; + sig_loss=-log10(fabs(1.0-act0/act1)); + + rdmy=dsdt; + MPI_Reduce(&rdmy,&dsdt,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dsdt,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=dev_frc; + MPI_Reduce(&rdmy,&dev_frc,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dev_frc,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=sig_loss; + MPI_Reduce(&rdmy,&sig_loss,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&sig_loss,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error_chk(); + + if (my_rank==0) + { + printf("Relative deviation of dS/dt = %.2e ",fabs(dev_frc/dsdt)); + printf("[significance loss = %d digits]\n\n",(int)(sig_loss)); + fflush(flog); + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); 
+ exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check6.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check6.in new file mode 100644 index 0000000000000000000000000000000000000000..e7981ba77309d6d191c28b2c21d276e222934314 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check6.in @@ -0,0 +1,42 @@ + +[Solver 0] +solver CGNE +nmx 256 +res 1.0e-12 + +[Solver 1] +solver SAP_GCR +nmx 128 +nkv 16 +isolv 0 +nmr 4 +ncy 3 +res 1.0e-12 + +[Solver 2] +solver DFL_SAP_GCR +nmx 64 +nkv 16 +isolv 1 +nmr 4 +ncy 5 +res 1.0e-12 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check7.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check7.c new file mode 100644 index 0000000000000000000000000000000000000000..ac36c292560f6242d8f47328f828aa1307532369 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check7.c @@ -0,0 +1,449 @@ + +/******************************************************************************* +* +* File check7.c +* +* Copyright (C) 2011-2013 Stefan Schaefer, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of force2() and action2(). 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dfl.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) + + +static void rot_ud(double eps) +{ + int bc,ix,t,ifc; + su3_dble *u; + su3_alg_dble *mom; + mdflds_t *mdfs; + + bc=bc_type(); + mdfs=mdflds(); + mom=(*mdfs).mom; + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912; + chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + if (my_rank==0) + { + find_section("SAP"); + read_iprms("bs",4,bs); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,1,4,5); + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_iprms("bs",4,bs); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); 
+ } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); + + set_hmc_parms(0,NULL,1,0,NULL,1,1.0); + mnkv=0; + + for (isp=0;isp<3;isp++) + { + read_solver_parms(isp); + sp=solver_parms(isp); + + if (sp.nkv>mnkv) + mnkv=sp.nkv; + } + + if (my_rank==0) + fclose(fin); + + print_solver_parms(&isap,&idfl); + print_sap_parms(1); + print_dfl_parms(0); + + start_ranlux(0,1245); + geometry(); + + set_sw_parms(-0.0123); + mnkv=2*mnkv+2; + if (mnkv<(Ns+2)) + mnkv=Ns+2; + if (mnkv<5) + mnkv=5; + + alloc_ws(mnkv); + alloc_wsd(6); + alloc_wv(2*nkv+2); + alloc_wvd(4); + + for (isp=0;isp<3;isp++) + { + if (isp==0) + { + mu0=1.0; + mu1=1.5; + eps=1.0e-4; + } + else if (isp==1) + { + mu0=0.1; + mu1=0.25; + eps=2.0e-4; + } + else + { + mu0=0.01; + mu1=0.02; + eps=3.0e-4; + } + + random_ud(); + chs_ubnd(-1); + random_mom(); + + if (isp==2) + { + dfl_modes(status); + error_root(status[0]<0,1,"main [check7.c]", + "dfl_modes failed"); + } + + status[0]=0; + status[1]=0; + + act0=setpf2(mu0,mu1,0,isp,0,status); + error_root((status[0]<0)||(status[1]<0),1,"main [check7.c]", + "setpf2 failed (isp,mu0,mu1=%d,%.2e,%.2e)",isp,mu0,mu1); + act1=action2(mu0,mu1,0,isp,0,status); + error_root((status[0]<0)||(status[1]<0),1,"main [check7.c]", + "action2 failed (isp,mu0,mu1=%d,%.2e,%.2e)",isp,mu0,mu1); + + rdmy=fabs(act1-act0); + MPI_Reduce(&rdmy,dev_act,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + rdmy=act1-act0; + MPI_Reduce(&rdmy,dev_act+1,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(dev_act,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + dsdt=dSdt(mu0,mu1,0,isp,status); + + if (my_rank==0) + { + printf("Solver number %d, mu0 = %.2e, mu1 = %.2e\n",isp,mu0,mu1); + + if (isp==0) + printf("Status = %d\n",status[0]); + else if (isp==1) + printf("Status = %d,%d\n",status[0],status[1]); + else + printf("Status = (%d,%d,%d),(%d,%d,%d)\n", + status[0],status[1],status[2],status[3], + 
status[4],status[5]); + + printf("Absolute action difference |setpf2-action2| = %.1e,", + fabs(dev_act[1])); + printf(" %.1e (local)\n",dev_act[0]); + fflush(flog); + } + + rot_ud(eps); + act0=2.0*action2(mu0,mu1,0,isp,0,status)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action2(mu0,mu1,0,isp,0,status)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + act0-=action2(mu0,mu1,0,isp,0,status)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action2(mu0,mu1,0,isp,0,status)/12.0; + rot_ud(2.0*eps); + + dact=1.2345*(act0-act1)/eps; + dev_frc=dsdt-dact; + sig_loss=-log10(fabs(1.0-act0/act1)); + + rdmy=dsdt; + MPI_Reduce(&rdmy,&dsdt,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dsdt,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=dev_frc; + MPI_Reduce(&rdmy,&dev_frc,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dev_frc,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=sig_loss; + MPI_Reduce(&rdmy,&sig_loss,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&sig_loss,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error_chk(); + + if (my_rank==0) + { + printf("Relative deviation of dS/dt = %.2e ",fabs(dev_frc/dsdt)); + printf("[significance loss = %d digits]\n\n",(int)(sig_loss)); + fflush(flog); + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check8.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check8.c new file mode 100644 index 0000000000000000000000000000000000000000..49310a55bd4debd53e6a6c3044570f09834f4447 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check8.c @@ -0,0 +1,288 @@ + +/******************************************************************************* +* +* File check8.c +* +* Copyright (C) 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check and performance of the multi-shift CG solver. 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "archive.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "forces.h" +#include "global.h" + +static int my_rank,bc,first,last,step; +static int nmu,nmx; +static double kappa,csw,*mu,cF,cF_prime; +static double uphi[2],uphi_prime[2],m0,*res; +static char cnfg_dir[NAME_SIZE],cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; + + +int main(int argc,char *argv[]) +{ + int nsize,icnfg,status,k,ie; + double nrm,del; + double wt1,wt2,wdt; + spinor_dble *eta,*chi,*phi,**psi,**wsd,**rsd; + lat_parms_t lat; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check8.log","w",stdout); + fin=freopen("check8.in","r",stdin); + + printf("\n"); + printf("Check and performance of the multi-shift CG solver\n"); + printf("--------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + find_section("Configurations"); + read_line("name","%s",nbase); + read_line("cnfg_dir","%s",cnfg_dir); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + + find_section("Lattice parameters"); + read_line("kappa","%lf",&kappa); + read_line("csw","%lf",&csw); + nmu=count_tokens("mu"); + + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + uphi[0]=0.0; + uphi[1]=0.0; + uphi_prime[0]=0.0; + uphi_prime[1]=0.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("uphi",2,uphi); + + if ((bc==1)||(bc==2)) + read_dprms("uphi'",2,uphi_prime); + + 
if (bc!=3) + read_line("cF","%lf",&cF); + + if (bc==2) + read_line("cF'","%lf",&cF_prime); + else + cF_prime=cF; + } + + MPI_Bcast(nbase,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&first,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&last,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&step,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&nmu,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(uphi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(uphi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + mu=malloc(2*nmu*sizeof(*mu)); + error(mu==NULL,1,"main [check8.c]","Unable to allocate auxiliary arrays"); + res=mu+nmu; + + if (my_rank==0) + { + find_section("Lattice parameters"); + read_dprms("mu",nmu,mu); + + find_section("CG"); + read_line("nmx","%d",&nmx); + error_root(nmu!=count_tokens("res"),1,"main [check8.c]", + "The numbers of twisted masses and residues do not match"); + read_dprms("res",nmu,res); + + fclose(fin); + } + + MPI_Bcast(mu,nmu,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(res,nmu,MPI_DOUBLE,0,MPI_COMM_WORLD); + + lat=set_lat_parms(5.5,1.0,1,&kappa,csw); + print_lat_parms(); + + set_bc_parms(bc,1.0,1.0,cF,cF_prime,uphi,uphi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + + m0=lat.m0[0]; + set_sw_parms(m0); + + if (my_rank==0) + { + printf("mu = %.6f",mu[0]); + for (k=1;k %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + fflush(flog); + } + + if (nmu==1) + alloc_wsd(8); + else + alloc_wsd(5+2*nmu); + + wsd=reserve_wsd(2); + eta=wsd[0]; + chi=wsd[1]; + psi=reserve_wsd(nmu); + + error_root(((last-first)%step)!=0,1,"main [check8.c]", + "last-first is not a multiple of step"); + check_dir_root(cnfg_dir); + 
nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check8.c]", + "configuration file name is too long"); + ie=0; + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); + + if (my_rank==0) + { + printf("Configuration no %d\n\n",icnfg); + fflush(flog); + } + + chs_ubnd(-1); + random_sd(VOLUME,eta,1.0); + bnd_sd2zero(ALL_PTS,eta); + /* eta is a full-lattice source (filled, copied and solved over VOLUME points), so its norm is taken over all VOLUME points as well */ + nrm=sqrt(norm_square_dble(VOLUME,1,eta)); + assign_sd2sd(VOLUME,eta,chi); + + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + tmcgm(nmx,res,nmu,mu,eta,psi,&status); + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wdt=wt2-wt1; + + error_chk(); + mulr_spinor_add_dble(VOLUME,chi,eta,-1.0); + del=norm_square_dble(VOLUME,1,chi); + error_root(del!=0.0,1,"main [check8.c]", + "Source field is not preserved"); + + if (my_rank==0) + { + printf("status = %d\n",status); + printf("time = %.2e sec (total)\n",wdt); + if (status>0) + printf(" = %.2e usec (per point and CG iteration)\n", + (1.0e6*wdt)/((double)(status)*(double)(VOLUME))); + fflush(flog); + error_root(status<0,1,"main [check8.c]", + "Solver did not converge"); + printf("residues = "); + } + + rsd=reserve_wsd(1); + phi=rsd[0]; + status=0; + + for (k=0;k +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dfl.h" +#include "forces.h" +#include "global.h" + + +#define N0 (NPROC0*L0) + + +static void rot_ud(double eps) +{ + int bc,ix,t,ifc; + su3_dble *u; + su3_alg_dble *mom; + mdflds_t *mdfs; + + bc=bc_type(); + mdfs=mdflds(); + mom=(*mdfs).mom; + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912;
+ chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + read_rat_parms(0); + + if (my_rank==0) + { + find_section("SAP"); + read_iprms("bs",4,bs); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,1,4,5); + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_iprms("bs",4,bs); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); + + set_hmc_parms(0,NULL,1,0,NULL,1,1.0); + mnkv=0; + + for (isp=0;isp<3;isp++) + { + read_solver_parms(isp); + sp=solver_parms(isp); + + if (sp.nkv>mnkv) + mnkv=sp.nkv; + } + + if (my_rank==0) + fclose(fin); + + print_rat_parms(); + print_solver_parms(&isap,&idfl); + print_sap_parms(1); + print_dfl_parms(0); + + start_ranlux(0,1245); + geometry(); + + set_sw_parms(-0.0123); + rp=rat_parms(0); + irat[0]=0; + + mnkv=2*mnkv+2; + if (mnkv<(Ns+2)) + mnkv=Ns+2; + if (mnkv<5) + mnkv=5; + + alloc_ws(mnkv); + + if (2*rp.degree>4) + alloc_wsd(2*rp.degree+3); + else + alloc_wsd(7); + + alloc_wv(2*nkv+2); + alloc_wvd(4); + + for 
(isw=0;isw<2;isw++) + { + for (isp=0;isp<3;isp++) + { + if (isp==0) + { + irat[1]=0; + irat[2]=rp.degree/3; + eps=1.0e-4; + } + else if (isp==1) + { + irat[1]=rp.degree/3+1; + irat[2]=(2*rp.degree)/3; + eps=2.0e-4; + } + else + { + irat[1]=(2*rp.degree)/3+1; + irat[2]=rp.degree-1; + eps=3.0e-4; + } + + random_ud(); + chs_ubnd(-1); + random_mom(); + + if (isp==2) + { + dfl_modes(status); + error_root(status[0]<0,1,"main [check9.c]", + "dfl_modes failed"); + } + + status[0]=0; + status[1]=0; + + act0=setpf3(irat,0,isw,isp,0,status); + error_root((status[0]<0)||(status[1]<0),1, + "main [check9.c]","setpf3 failed " + "(irat=(%d,%d,%d), isp=%d)",irat[0],irat[1],irat[2],isp); + + act1=action3(irat,0,isw,isp,0,status); + error_root((status[0]<0)||(status[1]<0),1, + "main [check9.c]","action3 failed " + "(irat=(%d,%d,%d), isp=%d)",irat[0],irat[1],irat[2],isp); + + rdmy=act1-act0; + MPI_Reduce(&rdmy,&dev_act,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dev_act,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rot_ud(eps); + dsdt=dSdt(irat,0,isw,isp,status); + + if (my_rank==0) + { + printf("Solver number %d, poles %d,..,%d, isw %d\n", + isp,irat[1],irat[2],isw); + + if (isp==0) + printf("Status = %d\n",status[0]); + else if (isp==1) + printf("Status = %d,%d\n",status[0],status[1]); + else + printf("Status = (%d,%d,%d),(%d,%d,%d)\n", + status[0],status[1],status[2],status[3], + status[4],status[5]); + + printf("Absolute action difference |setpf3-action3| = %.1e\n", + fabs(dev_act)); + fflush(flog); + } + + rot_ud(eps); + act0=2.0*action3(irat,0,isw,isp,0,status)/3.0; + rot_ud(-eps); + + rot_ud(-eps); + act1=2.0*action3(irat,0,isw,isp,0,status)/3.0; + rot_ud(eps); + + rot_ud(2.0*eps); + act0-=action3(irat,0,isw,isp,0,status)/12.0; + rot_ud(-2.0*eps); + + rot_ud(-2.0*eps); + act1-=action3(irat,0,isw,isp,0,status)/12.0; + rot_ud(2.0*eps); + + dact=1.2345*(act0-act1)/eps; + dev_frc=dsdt-dact; + sig_loss=-log10(fabs(1.0-act0/act1)); + + rdmy=dsdt; + 
MPI_Reduce(&rdmy,&dsdt,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dsdt,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=dev_frc; + MPI_Reduce(&rdmy,&dev_frc,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&dev_frc,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + rdmy=sig_loss; + MPI_Reduce(&rdmy,&sig_loss,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&sig_loss,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error_chk(); + + if (my_rank==0) + { + printf("Relative deviation of dS/dt = %.2e ",fabs(dev_frc/dsdt)); + printf("[significance loss = %d digits]\n\n",(int)(sig_loss)); + fflush(flog); + } + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check9.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check9.in new file mode 100644 index 0000000000000000000000000000000000000000..448bc613a73b1e96f8d3fe5975d910dd75af462e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/check9.in @@ -0,0 +1,47 @@ + +[Rational 0] +degree 12 +range 0.001 7.9 + +[Solver 0] +solver MSCG +nmx 256 +res 1.0e-12 + +[Solver 1] +solver SAP_GCR +nmx 128 +nkv 16 +isolv 0 +nmr 4 +ncy 3 +res 1.0e-12 + +[Solver 2] +solver DFL_SAP_GCR +nmx 64 +nkv 16 +isolv 1 +nmr 4 +ncy 5 +res 1.0e-12 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..7d44aed8057e3b91864783c947ef2daee14295eb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/forces/time1.c @@ -0,0 +1,229 @@ + 
+/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2005, 2008-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of plaq_frc(), sw_frc() and hop_frc(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "mdflds.h" +#include "forces.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,n,count; + double phi[2],phi_prime[2]; + double wt1,wt2,wdt; + FILE *flog=NULL; + spinor_dble **wsd; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time1.log","w",stdout); + + printf("\n"); + printf("Timing of plaq_frc(), sw_frc() and hop_frc()\n"); + printf("--------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time1.c]", + "Syntax: time1 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + + set_sw_parms(-0.1235); + alloc_wsd(2); + wsd=reserve_wsd(2); + + random_ud(); + chs_ubnd(-1); + random_sd(VOLUME,wsd[0],1.0); + random_sd(VOLUME,wsd[1],1.0); + bnd_sd2zero(ALL_PTS,wsd[0]); + bnd_sd2zero(ALL_PTS,wsd[1]); + + 
plaq_frc(); + set_frc2zero(); + set_xt2zero(); + add_prod2xt(-0.5,wsd[0],wsd[1]); + add_prod2xv(-0.5,wsd[0],wsd[1]); + sw_frc(1.0); + hop_frc(1.0); + + n=(int)(3.0e6/(double)(4*VOLUME)); + if (n<2) + n=2; + wdt=0.0; + + while (wdt<5.0) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + for (count=0;count that allows the +type of boundary condition to be chosen at runtime. When the option is not +set, open boundary conditions are assumed. + +The option may be set but has no effect in the case of check1. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..9476981cd86ad9ab1abc85fb4334dff22acc0fb8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/Makefile @@ -0,0 +1,128 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 + +FLAGS = flags lat_parms dfl_parms + +LATTICE = bcnds uidx geometry + +RANDOM = ranlux ranlxs ranlxd gauss random_su3 + +UFLDS = plaq_sum shift uflds udcom bstap + +SFLDS = sflds + +SU3FCTS = su3prod su3ren cm3x3 + +UTILS = endian mutils utils wspace + +MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(UFLDS) $(SFLDS) $(SU3FCTS) \ + $(UTILS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + 
+LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/uflds:\ + $(MDIR)/sflds:$(MDIR)/su3fcts:$(MDIR)/utils + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..b4723d6850f8fe3d4cf0b8934538a9309e18b651 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check1.c @@ -0,0 +1,327 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Consistency checks on the global index arrays cpr,...,map +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "lattice.h" +#include "global.h" + +#define NPROC_BLK (NPROC0_BLK*NPROC1_BLK*NPROC2_BLK*NPROC3_BLK) + +static int ip_test[NPROC]; +static int ix_test[VOLUME]; +static int ia[2][9]; + /* set_ia(): fills the static table ia. Row 0 holds running offsets that start at 0 and are advanced in turn by FACE0/2 (twice), FACE1/2 (twice), FACE2/2 (twice) and FACE3/2 (twice); row 1 is row 0 shifted by BNDRY/2. */ +static void set_ia(void) +{ + int ifc; + + ia[0][0]=0; + ia[0][1]=ia[0][0]+(FACE0/2); + ia[0][2]=ia[0][1]+(FACE0/2); + ia[0][3]=ia[0][2]+(FACE1/2); + ia[0][4]=ia[0][3]+(FACE1/2); + ia[0][5]=ia[0][4]+(FACE2/2); + ia[0][6]=ia[0][5]+(FACE2/2); + ia[0][7]=ia[0][6]+(FACE3/2); + ia[0][8]=ia[0][7]+(FACE3/2); + + for (ifc=0;ifc<9;ifc++) + ia[1][ifc]=ia[0][ifc]+(BNDRY/2); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,itest; + int in,ir,n[4]; + int mu,ix,x0,x1,x2,x3; + int iy0,iy1,iy2,iy3,iz0,iz1,iz2,iz3; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + + printf("\n"); + printf("Consistency checks on the global index arrays cpr,...,map\n"); + printf("---------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d local lattice\n",L0,L1,L2,L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d grid blocks\n\n", + NPROC0_BLK,NPROC1_BLK,NPROC2_BLK,NPROC3_BLK); + } + + geometry(); + set_ia(); + + error(my_rank!=ipr_global(cpr),1, +
"main [check1.c]","Processor coordinates are incorrect"); + + if (my_rank==0) + { + for (in=0;in=(ir+NPROC_BLK))) + itest=2; + } + + error(itest==1,1, + "main [check1.c]","ipr_global is process dependent"); + + error(itest==2,1, + "main [check1.c]","Processes are not properly blocked"); + + n[0]=cpr[0]; + n[1]=cpr[1]; + n[2]=cpr[2]; + n[3]=cpr[3]; + + for (mu=0;mu<4;mu++) + { + n[mu]-=1; + if (npr[2*mu]!=ipr_global(n)) + itest=1; + n[mu]+=2; + if (npr[2*mu+1]!=ipr_global(n)) + itest=1; + n[mu]-=1; + } + + error(itest==1,1, + "main [check1.c]","npr is incorrect"); + + for (ix=0;ix=VOLUME)) + itest=1; + else + ix_test[ix]+=1; + } + } + } + } + + error(itest==1,1, + "main [check1.c]","The index ipt is out of range"); + + for (ix=0;ix=(VOLUME/2)))||((ir==1)&&(ix<(VOLUME/2)))) + itest=1; + + ir=(ir+1)%2; + iy0=iup[ix][0]; + iz0=ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*((x0+1)%L0)]; + + if ((x0==(L0-1))&&(NPROC0>1)) + { + iy0-=VOLUME; + if ((iy0=ia[ir][2])) + itest=2; + else + iy0=map[iy0]; + } + + iy1=iup[ix][1]; + iz1=ipt[x3+L3*x2+L2*L3*((x1+1)%L1)+L1*L2*L3*x0]; + + if ((x1==(L1-1))&&(NPROC1>1)) + { + iy1-=VOLUME; + if ((iy1=ia[ir][4])) + itest=2; + else + iy1=map[iy1]; + } + + iy2=iup[ix][2]; + iz2=ipt[x3+L3*((x2+1)%L2)+L2*L3*x1+L1*L2*L3*x0]; + + if ((x2==(L2-1))&&(NPROC2>1)) + { + iy2-=VOLUME; + if ((iy2=ia[ir][6])) + itest=2; + else + iy2=map[iy2]; + } + + iy3=iup[ix][3]; + iz3=ipt[((x3+1)%L3)+L3*x2+L2*L3*x1+L1*L2*L3*x0]; + + if ((x3==(L3-1))&&(NPROC3>1)) + { + iy3-=VOLUME; + if ((iy3=ia[ir][8])) + itest=2; + else + iy3=map[iy3]; + } + + if ((iy0!=iz0)||(iy1!=iz1)||(iy2!=iz2)||(iy3!=iz3)) + itest=3; + + iy0=idn[ix][0]; + iz0=ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*((x0+L0-1)%L0)]; + + if ((x0==0)&&(NPROC0>1)) + { + iy0-=VOLUME; + if ((iy0=ia[ir][1])) + itest=4; + else + iy0=map[iy0]; + } + + iy1=idn[ix][1]; + iz1=ipt[x3+L3*x2+L2*L3*((x1+L1-1)%L1)+L1*L2*L3*x0]; + + if ((x1==0)&&(NPROC1>1)) + { + iy1-=VOLUME; + if ((iy1=ia[ir][3])) + itest=4; + else + iy1=map[iy1]; + } + + 
iy2=idn[ix][2]; + iz2=ipt[x3+L3*((x2+L2-1)%L2)+L2*L3*x1+L1*L2*L3*x0]; + + if ((x2==0)&&(NPROC2>1)) + { + iy2-=VOLUME; + if ((iy2=ia[ir][5])) + itest=4; + else + iy2=map[iy2]; + } + + iy3=idn[ix][3]; + iz3=ipt[((x3+L3-1)%L3)+L3*x2+L2*L3*x1+L1*L2*L3*x0]; + + if ((x3==0)&&(NPROC3>1)) + { + iy3-=VOLUME; + if ((iy3=ia[ir][7])) + itest=4; + else + iy3=map[iy3]; + } + + if ((iy0!=iz0)||(iy1!=iz1)||(iy2!=iz2)||(iy3!=iz3)) + itest=5; + } + } + } + } + + error(itest==1,1, + "main [check1.c]","The index ipt does not respect eo ordering"); + error(itest==2,1, + "main [check1.c]","The index iup is out of range at the boundaries"); + error(itest==3,1, + "main [check1.c]","The index iup (combined with map) is incorrect"); + error(itest==4,1, + "main [check1.c]","The index idn is out of range at the boundaries"); + error(itest==5,1, + "main [check1.c]","The index idn (combined with map) is incorrect"); + + if (my_rank==0) + { + printf("The lattice is correctly mapped by the global arrays\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..6e1c93dfd61ef68e0b91d5505cdeb1c36c2b5a1f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check2.c @@ -0,0 +1,594 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2010, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the programs set_bc(), check_bc() and chs_ubnd(). 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "uflds.h" +#include "lattice.h" +#include "global.h" + +#define N0 (NPROC0*L0) + + +static void new_fld(int ibnd) +{ + su3_dble *ud,*udm; + + ud=udfld(); + udm=ud+4*VOLUME; + + for (;ud0)&&(t<(N0-1)))|| + ((t==0)&&((ifc==0)||((ifc==1)&&(bc!=0))||((ifc>=2)&&(bc!=1))))|| + ((t==(N0-1))&&(bc!=0))) + ie|=cmp_ud(u,v); + + u+=1; + v+=1; + } + } + + return ie; +} + + /* check_diag(): returns 0 if the matrix *u passes all of the following tests and a nonzero value otherwise: each diagonal entry c11, c22, c33 has squared modulus equal to 1 to within 8*DBL_EPSILON, the product c11*c22*c33 equals 1 to within 16*DBL_EPSILON in both its real and its imaginary part, and every off-diagonal component is exactly 0.0. */ +static int check_diag(su3_dble *u) +{ + int i,ie; + double r[18]; + complex_dble z; + + ie=0; + + r[ 0]=(*u).c11.re; + r[ 1]=(*u).c11.im; + r[ 2]=(*u).c12.re; + r[ 3]=(*u).c12.im; + r[ 4]=(*u).c13.re; + r[ 5]=(*u).c13.im; + + r[ 6]=(*u).c21.re; + r[ 7]=(*u).c21.im; + r[ 8]=(*u).c22.re; + r[ 9]=(*u).c22.im; + r[10]=(*u).c23.re; + r[11]=(*u).c23.im; + + r[12]=(*u).c31.re; + r[13]=(*u).c31.im; + r[14]=(*u).c32.re; + r[15]=(*u).c32.im; + r[16]=(*u).c33.re; + r[17]=(*u).c33.im; + + ie|=(fabs(r[ 0]*r[ 0]+r[ 1]*r[ 1]-1.0)>(8.0*DBL_EPSILON)); + ie|=(fabs(r[ 8]*r[ 8]+r[ 9]*r[ 9]-1.0)>(8.0*DBL_EPSILON)); + ie|=(fabs(r[16]*r[16]+r[17]*r[17]-1.0)>(8.0*DBL_EPSILON)); + + z.re=r[0]*r[8]-r[1]*r[9]; + z.im=r[0]*r[9]+r[1]*r[8]; + ie|=(fabs(z.re*r[16]-z.im*r[17]-1.0)>(16.0*DBL_EPSILON)); + ie|=(fabs(z.re*r[17]+z.im*r[16])>(16.0*DBL_EPSILON)); + + for (i=0;i<18;i++) + { + if (((i>1)&&(i<8))||((i>9)&&(i<16))) + ie|=(r[i]!=0.0); + } + + return ie; +} + + +static int check_bval(su3_dble *u) +{ + int bc,ie,ifc; + int ipt,npts,*pts; + + ie=0; + bc=bc_type(); + + if (bc==1) + { + pts=bnd_pts(&npts); + + if (npts>0) + { + pts+=(npts/2); + + ie|=check_diag(u+8*(pts[0]-(VOLUME/2))+2); + ie|=check_diag(u+8*(pts[0]-(VOLUME/2))+4); + ie|=check_diag(u+8*(pts[0]-(VOLUME/2))+6); + + for (ipt=0;ipt<(npts/2);ipt++) + { + for (ifc=2;ifc<8;ifc++) +
ie|=cmp_ud(u+8*(pts[0]-(VOLUME/2))+2*(ifc/2), + u+8*(pts[ipt]-(VOLUME/2))+ifc); + } + } + } + + if (((bc==1)||(bc==2))&&(cpr[0]==(NPROC0-1))) + { + u+=4*VOLUME+7*(BNDRY/4); + + ie|=check_diag(u); + ie|=check_diag(u+1); + ie|=check_diag(u+2); + } + + return ie; +} + + /* detu(): returns the determinant of the 3x3 complex matrix *u, computed by cofactor expansion along the first column: c11*(c22*c33-c32*c23) + c21*(c32*c13-c12*c33) + c31*(c12*c23-c22*c13). The local z holds the current cofactor and w accumulates the sum. */ +static complex_dble detu(su3_dble *u) +{ + complex_dble z,w; + + z.re= + (*u).c22.re*(*u).c33.re-(*u).c22.im*(*u).c33.im- + (*u).c32.re*(*u).c23.re+(*u).c32.im*(*u).c23.im; + + z.im= + (*u).c22.re*(*u).c33.im+(*u).c22.im*(*u).c33.re- + (*u).c32.re*(*u).c23.im-(*u).c32.im*(*u).c23.re; + + w.re=(*u).c11.re*z.re-(*u).c11.im*z.im; + w.im=(*u).c11.re*z.im+(*u).c11.im*z.re; + + z.re= + (*u).c32.re*(*u).c13.re-(*u).c32.im*(*u).c13.im- + (*u).c12.re*(*u).c33.re+(*u).c12.im*(*u).c33.im; + + z.im= + (*u).c32.re*(*u).c13.im+(*u).c32.im*(*u).c13.re- + (*u).c12.re*(*u).c33.im-(*u).c12.im*(*u).c33.re; + + w.re+=((*u).c21.re*z.re-(*u).c21.im*z.im); + w.im+=((*u).c21.re*z.im+(*u).c21.im*z.re); + + z.re= + (*u).c12.re*(*u).c23.re-(*u).c12.im*(*u).c23.im- + (*u).c22.re*(*u).c13.re+(*u).c22.im*(*u).c13.im; + + z.im= + (*u).c12.re*(*u).c23.im+(*u).c12.im*(*u).c23.re- + (*u).c22.re*(*u).c13.im-(*u).c22.im*(*u).c13.re; + + w.re+=((*u).c31.re*z.re-(*u).c31.im*z.im); + w.im+=((*u).c31.re*z.im+(*u).c31.im*z.re); + + return w; +} + + +static double check_detu(int ibc,su3_dble *u) +{ + int bc,ix,t,ifc; + double d,dmax; + complex_dble z; + + bc=bc_type(); + dmax=0.0; + + for (ix=(VOLUME/2);ixdmax) + dmax=d; + u+=1; + + z=detu(u); + + if ((bc==3)&&(ibc==-1)) + d=fabs(z.re+1.0)+fabs(z.im); + else if (bc==0) + d=fabs(z.re)+fabs(z.im); + else + d=fabs(z.re-1.0)+fabs(z.im); + + if (d>dmax) + dmax=d; + u+=1; + + for (ifc=2;ifc<8;ifc++) + { + z=detu(u); + d=fabs(z.re-1.0)+fabs(z.im); + if (d>dmax) + dmax=d; + u+=1; + } + } + else if (t==(N0-1)) + { + z=detu(u); + + if ((bc==3)&&(ibc==-1)) + d=fabs(z.re+1.0)+fabs(z.im); + else if (bc==0) + d=fabs(z.re)+fabs(z.im); + else + d=fabs(z.re-1.0)+fabs(z.im); + + if (d>dmax) + dmax=d; + u+=1; +
+ for (ifc=1;ifc<8;ifc++) + { + z=detu(u); + d=fabs(z.re-1.0)+fabs(z.im); + if (d>dmax) + dmax=d; + u+=1; + } + } + else + { + for (ifc=0;ifc<8;ifc++) + { + z=detu(u); + d=fabs(z.re-1.0)+fabs(z.im); + if (d>dmax) + dmax=d; + u+=1; + } + } + } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return dmax; +} + + /* scmp_ud(): returns 0 if all 18 real components of the sum of *u and *v are exactly 0.0, i.e. if *u equals minus *v component by component, and 1 otherwise. */ +static int scmp_ud(su3_dble *u,su3_dble *v) +{ + int i; + double r[18]; + + r[ 0]=(*u).c11.re+(*v).c11.re; + r[ 1]=(*u).c11.im+(*v).c11.im; + r[ 2]=(*u).c12.re+(*v).c12.re; + r[ 3]=(*u).c12.im+(*v).c12.im; + r[ 4]=(*u).c13.re+(*v).c13.re; + r[ 5]=(*u).c13.im+(*v).c13.im; + + r[ 6]=(*u).c21.re+(*v).c21.re; + r[ 7]=(*u).c21.im+(*v).c21.im; + r[ 8]=(*u).c22.re+(*v).c22.re; + r[ 9]=(*u).c22.im+(*v).c22.im; + r[10]=(*u).c23.re+(*v).c23.re; + r[11]=(*u).c23.im+(*v).c23.im; + + r[12]=(*u).c31.re+(*v).c31.re; + r[13]=(*u).c31.im+(*v).c31.im; + r[14]=(*u).c32.re+(*v).c32.re; + r[15]=(*u).c32.im+(*v).c32.im; + r[16]=(*u).c33.re+(*v).c33.re; + r[17]=(*u).c33.im+(*v).c33.im; + + for (i=0;i<18;i++) + { + if (r[i]!=0.0) + return 1; + } + + return 0; +} + + /* cmp_all(): walks over the points ix=VOLUME/2,..,VOLUME-1 and compares the 8 consecutive matrices stored per point in u and v, OR-ing the results into the return value. Matrices are compared with cmp_ud(), except that when bc_type() is 3 and ibc is -1 the second matrix at global time 0 and the first matrix at global time N0-1 are compared with scmp_ud(), which tests for a sign flip. NOTE(review): cmp_ud() is defined elsewhere in this file; presumably it returns nonzero when the two matrices differ - confirm. */ +static int cmp_all(int ibc,su3_dble *u,su3_dble *v) +{ + int ix,t,ifc,ie,bc; + + bc=bc_type(); + ie=0; + + for (ix=(VOLUME/2);ix<(VOLUME);ix++) + { + t=global_time(ix); + + if (t==0) + { + ie|=cmp_ud(u,v); + u+=1; + v+=1; + + if ((bc==3)&&(ibc==-1)) + ie|=scmp_ud(u,v); + else + ie|=cmp_ud(u,v); + + u+=1; + v+=1; + + for (ifc=2;ifc<8;ifc++) + { + ie|=cmp_ud(u,v); + u+=1; + v+=1; + } + + } + else if (t==(N0-1)) + { + if ((bc==3)&&(ibc==-1)) + ie|=scmp_ud(u,v); + else + ie|=cmp_ud(u,v); + + u+=1; + v+=1; + + for (ifc=1;ifc<8;ifc++) + { + ie|=cmp_ud(u,v); + u+=1; + v+=1; + } + } + else + { + for (ifc=0;ifc<8;ifc++) + { + ie|=cmp_ud(u,v); + u+=1; + v+=1; + } + } + } + + return ie; +} + + /* main(): test driver. Rank 0 redirects stdout to check2.log and reads the optional -bc argument; the value is then broadcast and the checks of check_bc(), set_bc() and chs_ubnd() are run, with failures reported through error(). */ +int main(int argc,char *argv[]) +{ + int my_rank,bc,ie; + double phi[2],phi_prime[2]; + double cG,cG_prime,cF,cF_prime; + double
dev0,dev1; + su3_dble *udb,**usv; + bc_parms_t bcp; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + printf("\n"); + printf("Check of set_bc() and check_bc()\n"); + printf("--------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + cG=0.97; + cG_prime=1.056; + cF=0.82; + cF_prime=1.12; + set_bc_parms(bc,cG,cG_prime,cF,cF_prime,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_wud(1); + usv=reserve_wud(1); + udb=udfld(); + + ie=0; + bcp=bc_parms(); + error(bcp.type!=bc,1,"main [check2.c]", + "Type of boundary condition is not properly set"); + + if (bc!=3) + { + ie|=(cG!=bcp.cG[0]); + ie|=(cF!=bcp.cF[0]); + } + + if (bc<=1) + { + ie|=(bcp.cG[0]!=bcp.cG[1]); + ie|=(bcp.cF[0]!=bcp.cF[1]); + } + + if (bc==2) + { + ie|=(cG_prime!=bcp.cG[1]); + ie|=(cF_prime!=bcp.cF[1]); + } + + if (bc==1) + { + ie|=(phi[0]!=bcp.phi[0][0]); + ie|=(phi[1]!=bcp.phi[0][1]); + ie|=(bcp.phi[0][2]!=-bcp.phi[0][0]-bcp.phi[0][1]); + } + + if ((bc==1)||(bc==2)) + { + ie|=(phi_prime[0]!=bcp.phi[1][0]); + ie|=(phi_prime[1]!=bcp.phi[1][1]); + ie|=(bcp.phi[1][2]!=-bcp.phi[1][0]-bcp.phi[1][1]); + } + + error(ie,1,"main [check2.c]","Boundary parameters are not properly set"); + + ie=check_bc(0.0); + error(ie!=1,1,"main [check2.c]", + "check_bc() gives the wrong answer"); + + new_fld(0); + ie=check_bc(0.0); + error(((bc<2)&&(ie!=0))||((bc>=2)&&(ie!=1)),2,"main [check2.c]", + "check_bc() gives the wrong answer"); + + 
new_fld(1); + ie=check_bc(0.0); + error(((bc<3)&&(ie!=0))||((bc==3)&&(ie!=1)),2,"main [check2.c]", + "check_bc() gives the wrong answer"); + + cm3x3_assign(4*VOLUME,udb,usv[0]); + set_bc(); + ie=check_bc(0.0); + error(ie!=1,2,"main [check2.c]", + "check_bc() gives the wrong answer"); + + ie=cmp_active(udb,usv[0]); + error(ie!=0,2,"main [check2.c]", + "Active link variables are modified by set_bc()"); + + ie=check_bval(udb); + error(ie!=0,2,"main [check2.c]", + "Boundary values are not properly set by set_bc()"); + + random_ud(); + cm3x3_assign(4*VOLUME,udb,usv[0]); + dev0=check_detu(1,udb); + ie=chs_ubnd(-1); + error(((bc==3)&&(ie==0))||((bc!=3)&&(ie==1)),1,"main [check2.c]", + "Incorrect return value of chs_ubnd()"); + dev1=check_detu(-1,udb); + + if (my_rank==0) + { + printf("Maximal deviation |1-det{U}|=%.1e (random field)\n",dev0); + printf(" =%.1e (after chs_ubnd)\n\n",dev1); + } + + ie=cmp_all(-1,udb,usv[0]); + error(ie!=0,3,"main [check2.c]","Incorrect action of chs_ubnd()"); + ie=chs_ubnd(1); + error(((bc==3)&&(ie==0))||((bc!=3)&&(ie==1)),2,"main [check2.c]", + "Incorrect return value of chs_ubnd()"); + ie=cmp_all(1,udb,usv[0]); + error(ie!=0,4,"main [check2.c]","Incorrect action of chs_ubnd()"); + ie=chs_ubnd(1); + error(ie!=0,3,"main [check2.c]", + "Incorrect return value of chs_ubnd()"); + + if (my_rank==0) + { + printf("No errors detected --- all programs work correctly\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..00a2345ac0fc056c1f019b2298aa015c9d975e98 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/lattice/check3.c @@ -0,0 +1,218 @@ + +/******************************************************************************* +* +* File check3.c +* +* 
Copyright (C) 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the programs bnd_s2zero() and bnd_sd2zero(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "sflds.h" +#include "lattice.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define NFLDS 3 + +typedef union +{ + spinor s; + float r[24]; +} spin_t; + +typedef union +{ + spinor_dble s; + double r[24]; +} spin_dble_t; + + /* is_zero(): returns 1 if all 24 float components of the spinor *s, read through the spin_t union overlay, compare equal to 0.0f, and 0 otherwise. */ +static int is_zero(spinor *s) +{ + int i,ie; + spin_t *sp; + + sp=(spin_t*)(s); + ie=1; + + for (i=0;i<24;i++) + ie&=((*sp).r[i]==0.0f); + + return ie; +} + + /* is_zero_dble(): double-precision counterpart of is_zero(); returns 1 if all 24 double components of *s, read through the spin_dble_t union overlay, compare equal to 0.0, and 0 otherwise. */ +static int is_zero_dble(spinor_dble *s) +{ + int i,ie; + spin_dble_t *sp; + + sp=(spin_dble_t*)(s); + ie=1; + + for (i=0;i<24;i++) + ie&=((*sp).r[i]==0.0); + + return ie; +} + + +static int check_sbnd(ptset_t set,spinor *s) +{ + int bc,ix,t; + int io,ie; + + bc=bc_type(); + ie=1; + + for (ix=0;ix=(VOLUME/2)))); + + if ((io!=0)&&(((t==0)&&(bc!=3))||((t==(N0-1))&&(bc==0)))) + ie&=is_zero(s); + else + ie&=(is_zero(s)^0x1); + + s+=1; + } + + return ie; +} + + +static int check_sbnd_dble(ptset_t set,spinor_dble *s) +{ + int bc,ix,t; + int io,ie; + + bc=bc_type(); + ie=1; + + for (ix=0;ix=(VOLUME/2)))); + + if ((io!=0)&&(((t==0)&&(bc!=3))||((t==(N0-1))&&(bc==0)))) + ie&=is_zero_dble(s); + else + ie&=(is_zero_dble(s)^0x1); + + s+=1; + } + + return ie; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,ie,is,k; + double phi[2],phi_prime[2]; + double cG,cG_prime,cF,cF_prime; + spinor **ps; + spinor_dble **psd; + ptset_t set; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check3.log","w",stdout); + printf("\n"); + printf("Check of the programs
bnd_s2zero() and bnd_sd2zero()\n"); + printf("----------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check3.c]", + "Syntax: check3 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + cG=0.97; + cG_prime=1.056; + cF=0.82; + cF_prime=1.12; + set_bc_parms(bc,cG,cG_prime,cF,cF_prime,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_ws(NFLDS); + alloc_wsd(NFLDS); + + ps=reserve_ws(NFLDS); + psd=reserve_wsd(NFLDS); + ie=1; + + for (is=0;is<4;is++) + { + if (is==0) + set=EVEN_PTS; + else if (is==1) + set=ODD_PTS; + else if (is==2) + set=ALL_PTS; + else + set=NO_PTS; + + for (k=0;k +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "linalg.h" +#include "global.h" + +#define NMOM 100033 + +static double var[64],var_all[64]; + + +int main(int argc,char *argv[]) +{ + int my_rank,n,i,j; + double dev,dmax,dmax_all; + double nsq1,nsq2,sprod1,sprod2; + double sm,r[8]; + double rn,cij,eij; + su3_dble *M,*m,w; + su3_alg_dble *X,*Y,*x; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + + printf("\n"); + printf("Checks of the programs in the module liealg\n"); + printf("-------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + printf("Number of momenta: 
%d\n\n",NMOM); + } + + start_ranlux(0,123456); + geometry(); + + X=amalloc(2*NMOM*sizeof(*X),4); + M=amalloc(NMOM*sizeof(*M),4); + error((X==NULL)||(M==NULL),1, + "main [check1.c]","Unable to allocate field arrays"); + Y=X+NMOM; + + set_alg2zero(NMOM,X); + dmax=0.0; + + for (n=0;ndmax) + dmax=dev; + } + + X[n].c3=1.0; + } + + MPI_Reduce(&dmax,&dmax_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Check of set_alg2zero():\n\n"); + printf("max|X| = %.1e (should be 0.0)\n\n",dmax_all); + } + + dmax=fabs(norm_square_alg(NMOM,1,X)-4.0*(double)(NMOM*NPROC)); + MPI_Reduce(&dmax,&dmax_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Check of norm_square_alg():\n\n"); + printf("Element count = %.1e (should be 0.0)\n\n",dmax_all); + } + + sm=0.0; + dmax=0.0; + + for (n=0;ndmax) + dmax=dev; + + sm+=nsq2; + } + + MPI_Reduce(&dmax,&dmax_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("|1.0+2*tr{X^2}/||X||^2| = %.1e (single elements)\n",dmax_all); + printf("(should be less than %.1e or so)\n\n",DBL_EPSILON*sqrt(8.0)); + } + + dmax=fabs(1.0-sm/norm_square_alg(NMOM,0,X)); + MPI_Reduce(&dmax,&dmax_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("|1.0+2*tr{X^2}/||X||^2| = %.1e (whole vector)\n",dmax_all); + printf("(should be less than %.1e or so)\n\n", + DBL_EPSILON*sqrt(8.0*(double)(NMOM))); + } + + random_alg(NMOM,X); + random_alg(NMOM,Y); + + nsq1=norm_square_alg(NMOM,1,X); + nsq2=norm_square_alg(NMOM,1,Y); + sprod1=scalar_prod_alg(NMOM,1,X,Y); + + for (n=0;n1)) + { + cij=1.0/4.0; + eij=sqrt(2.0*rn)/4.0; + } + else if (i==j) + { + cij=1.0/9.0; + eij=sqrt(2.0*rn)/9.0; + } + else if ((i==0)&&(j==1)) + { + cij=1.0/18.0; + eij=sqrt(5.0*rn)/18.0; + } + else if ((i<2)&&(j>1)) + { + cij=0.0; + eij=sqrt(rn)/6.0; + } + else + { + cij=0.0; + eij=sqrt(rn)/4.0; + } + + var_all[8*i+j]*=rn; + + if (cij!=0.0) + { + printf(" = % .4e, deviation = %.1e+-%.1e\n", + 
i,j,var_all[8*i+j],fabs(var_all[8*i+j]-cij),eij); + } + else + { + dev=fabs(var_all[8*i+j])/eij; + + if (dev>dmax) + dmax=dev; + } + } + } + + eij=sqrt(rn)/4.0; + printf("\n"); + printf("For all other i,j, "); + printf("max|| = %.1e (should be %.1e or so)\n\n", + dmax*eij,2.0*eij); + } + + rn=-1.2345; + random_alg(NMOM,X); + random_alg(NMOM,Y); + + nsq1=norm_square_alg(NMOM,1,X); + nsq2=norm_square_alg(NMOM,1,Y); + sprod1=scalar_prod_alg(NMOM,1,X,Y); + + muladd_assign_alg(NMOM,rn,X,Y); + sm=norm_square_alg(NMOM,1,Y)-nsq2-rn*rn*nsq1-2.0*rn*sprod1; + sm=fabs(sm)/nsq1; + + if (my_rank==0) + { + printf("Check of muladd_assign_alg(): %.1e\n",sm); + printf("(should be less than 1.0e-15 or so)\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..a68decfd327a24970637087da57e195b722fc87a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check2.c @@ -0,0 +1,422 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Checks on the programs in the module salg.c +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "global.h" + +#define _acc_sp(z,x,y) \ + (z).re+=(double)((x).re*(y).re+(x).im*(y).im); \ + (z).im+=(double)((x).re*(y).im-(x).im*(y).re) + +static complex v[25]; +static spinor *ppk[5]; + + +static complex sp(int vol,spinor *pk,spinor *pl) +{ 
+ complex w; + complex_dble z; + spinor *pm; + + z.re=0.0; + z.im=0.0; + pm=pk+vol; + + for (;pk1)) + { + if (my_rank==0) + { + if (icom==1) + { + printf("Checks with global summation\n"); + printf("============================\n\n"); + } + else + { + printf("Checks without global summation\n"); + printf("===============================\n\n"); + } + } + + for (ieo=0;ieo<3;ieo++) + { + if (my_rank==0) + { + if (ieo==0) + printf("First case: full lattice\n\n"); + else if (ieo==1) + printf("Second case: even points\n\n"); + else + printf("Third case: odd points\n\n"); + } + + vol=VOLUME/2; + off=0; + + if (ieo==0) + vol=VOLUME; + if (ieo==2) + off=VOLUME/2; + + for (i=0;i<10;i++) + random_s(vol,ps[i]+off,1.0f); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=ps[i]+off; + pl=ps[9-i]+off; + + if (icom==1) + { + z=sp(vol,pk,pl); + MPI_Reduce(&z.re,&w.re,2,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&w.re,2,MPI_FLOAT,0,MPI_COMM_WORLD); + } + else + w=sp(vol,pk,pl); + + z=spinor_prod(vol,icom,pk,pl); + r=norm_square(vol,icom,pk)*norm_square(vol,icom,pl); + d=(double)((z.re-w.re)*(z.re-w.re)+(z.im-w.im)*(z.im-w.im)); + d=sqrt(d/(double)(r)); + if (d>dmax) + dmax=d; + + r=spinor_prod_re(vol,icom,pk,pl); + d=fabs((double)(z.re/r-1.0f)); + if (d>dmax) + dmax=d; + + z=spinor_prod(vol,icom,pk,pk); + r=norm_square(vol,icom,pk); + + d=fabs((double)(z.im/r)); + if (d>dmax) + dmax=d; + + d=fabs((double)(z.re/r-1.0f)); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Check of spinor_prod, spinor_prod_re\n"); + printf("and norm_square: %.2e\n\n",dmax); + } + + dmax=0.0; + z.re= 0.345f; + z.im=-0.876f; + zsq=z.re*z.re+z.im*z.im; + + for (i=0;i<9;i++) + { + pk=ps[i]+off; + pl=ps[i+1]+off; + + w=spinor_prod(vol,icom,pk,pl); + r=norm_square(vol,icom,pk)+zsq*norm_square(vol,icom,pl) + +2.0f*(z.re*w.re-z.im*w.im); + mulc_spinor_add(vol,pk,pl,z); + + d=fabs((double)(r/norm_square(vol,icom,pk)-1.0f)); + if (d>dmax) + dmax=d; + } + + if 
(my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of spinor_prod, norm_square\n"); + printf("and mulc_spinor_add: %.2e\n\n",dmax); + } + + for (i=0;i<10;i++) + random_s(vol,ps[i]+off,1.0f); + + dmax=0.0; + r=-1.234f; + z.re=-r; + z.im=0.0f; + + for (i=0;i<8;i+=3) + { + pk=ps[i]+off; + pl=ps[i+1]+off; + pj=ps[i+2]+off; + + assign_s2s(vol,pk,pj); + mulr_spinor_add(vol,pk,pl,r); + mulc_spinor_add(vol,pk,pl,z); + mulr_spinor_add(vol,pk,pj,-1.0); + + d=(double)(norm_square(vol,icom,pk)/norm_square(vol,icom,pj)); + d=sqrt(d); + if (d>dmax) + dmax=d; + + assign_s2s(vol,pl,pk); + scale(vol,r,pk); + mulc_spinor_add(vol,pk,pl,z); + + d=(double)(norm_square(vol,icom,pk)/norm_square(vol,icom,pl)); + d=sqrt(d); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of mulr_spinor_add, scale\n"); + printf("and mulc_spinor_add: %.2e\n\n",dmax); + } + + for (i=0;i<10;i++) + random_s(vol,ps[i]+off,1.0f); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=ps[i]+off; + + if (i>0) + { + pl=ps[i-1]+off; + project(vol,icom,pk,pl); + z=spinor_prod(vol,icom,pk,pl); + + d=(fabs((double)(z.re))+ + fabs((double)(z.im)))/ + sqrt((double)(norm_square(vol,icom,pk))); + + if (d>dmax) + dmax=d; + } + + normalize(vol,icom,pk); + r=norm_square(vol,icom,pk); + + d=fabs((double)(r-1.0f)); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of spinor_prod, norm_square,\n"); + printf("normalize and project: %.2e\n\n",dmax); + } + + for (i=0;i<5;i++) + { + pk=ps[i]+off; + pl=ps[i+5]+off; + + random_s(vol,ps[i]+off,1.0f); + assign_s2s(vol,pk,pl); + + for (j=0;j<5;j++) + { + v[5*i+j].re=0.1234f*(float)(i^2)-0.8976f*(float)(j); + v[5*i+j].im=0.2231f*(float)(i)+0.9922f*(float)(j^2); + } + + ppk[i]=pl; + } + + rotate(vol,5,ppk,v); + dmax=0.0; + + for (i=5;i<10;i++) + { + pk=ps[i]+off; + + for (j=0;j<5;j++) + { + z.re=-v[5*j+(i-5)].re; + z.im=-v[5*j+(i-5)].im; + + pl=ps[j]+off; + 
mulc_spinor_add(vol,pk,pl,z); + } + + r=norm_square(vol,icom,pk); + + d=fabs((double)(r)); + if (d>dmax) + dmax=d; + } + + dmax/=(double)(norm_square(vol,icom,ps[0]+off)); + dmax=sqrt(dmax); + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of mulc_spinor_add\n"); + printf("and rotate: %.2e\n\n",dmax); + } + + dmax=0.0; + + for (i=0;i<5;i++) + { + pk=ps[i]+off; + pl=ps[9-i]+off; + random_s(vol,pk,1.0f); + assign_s2s(vol,pk,pl); + mulg5(vol,pk); + mulg5(vol,pk); + + z.re=-1.0f; + z.im=0.0f; + + mulc_spinor_add(vol,pl,pk,z); + r=norm_square(vol,icom,pl)/norm_square(vol,icom,pk); + d=sqrt((double)(r)); + if (d>dmax) + dmax=d; + + random_s(vol,pl,1.0f); + z=spinor_prod(vol,icom,pk,pl); + mulg5(vol,pk); + mulg5(vol,pl); + w=spinor_prod(vol,icom,pk,pl); + + d=(fabs((double)(z.re-w.re))+fabs((double)(z.im-w.im)))/ + (fabs((double)(z.re))+fabs((double)(z.im))); + if (d>dmax) + dmax=d; + + random_s(vol,pk,1.0f); + assign_s2s(vol,pk,pl); + mulg5(vol,pk); + mulmg5(vol,pk); + + z.re=1.0f; + z.im=0.0f; + + mulc_spinor_add(vol,pl,pk,z); + r=norm_square(vol,icom,pl)/norm_square(vol,icom,pk); + d=sqrt((double)(r)); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Check of mulg5 and mulmg5: %.2e\n\n",dmax); + } + } + } + } + + error_chk(); + + if (my_rank==0) + { + printf("Maximal deviation in all tests: %.2e\n\n",dall); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..65242421c7215e5a4eee714f9205a4a2557fea0f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check3.c @@ -0,0 +1,475 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2011, 
2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Checks on the programs in the module salg_dble.c +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "global.h" + +#define _acc_sp(z,x,y) \ + (z).re+=(double)((x).re*(y).re+(x).im*(y).im); \ + (z).im+=(double)((x).re*(y).im-(x).im*(y).re) + +static complex_dble v[25]; +static spinor_dble *ppk[5]; + + +static complex_dble sp(int vol,spinor_dble *pk,spinor_dble *pl) +{ + complex_dble z; + spinor_dble *pm; + + z.re=0.0; + z.im=0.0; + pm=pk+vol; + + for (;pk1)) + { + if (my_rank==0) + { + if (icom==1) + { + printf("Checks with global summation\n"); + printf("============================\n\n"); + } + else + { + printf("Checks without global summation\n"); + printf("===============================\n\n"); + } + } + + for (ieo=0;ieo<3;ieo++) + { + if (my_rank==0) + { + if (ieo==0) + printf("First case: full lattice\n\n"); + else if (ieo==1) + printf("Second case: even points\n\n"); + else + printf("Third case: odd points\n\n"); + } + + vol=VOLUME/2; + off=0; + + if (ieo==0) + vol=VOLUME; + if (ieo==2) + off=VOLUME/2; + + for (i=0;i<10;i++) + random_sd(vol,psd[i]+off,1.0); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=psd[i]+off; + pl=psd[9-i]+off; + + if (icom==1) + { + z=sp(vol,pk,pl); + MPI_Reduce(&z.re,&w.re,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&w.re,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + w=sp(vol,pk,pl); + + z=spinor_prod_dble(vol,icom,pk,pl); + r=norm_square_dble(vol,icom,pk)*norm_square_dble(vol,icom,pl); + d=(z.re-w.re)*(z.re-w.re)+(z.im-w.im)*(z.im-w.im); + d=sqrt(d/r); + if (d>dmax) + dmax=d; + + r=spinor_prod_re_dble(vol,icom,pk,pl); + + d=fabs(z.re/r-1.0); + if (d>dmax) + dmax=d; + + 
z=spinor_prod_dble(vol,icom,pk,pk); + r=norm_square_dble(vol,icom,pk); + + d=fabs(z.im/r); + if (d>dmax) + dmax=d; + + d=fabs(z.re/r-1.0); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Check of spinor_prod, spinor_prod_re\n"); + printf("and norm_square: %.2e\n\n",dmax); + } + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=psd[i]+off; + pl=psd[9-i]+off; + + z=spinor_prod5_dble(vol,icom,pk,pl); + mulg5_dble(vol,pl); + w=spinor_prod_dble(vol,icom,pk,pl); + + r=norm_square_dble(vol,icom,pk)*norm_square_dble(vol,icom,pl); + d=(z.re-w.re)*(z.re-w.re)+(z.im-w.im)*(z.im-w.im); + d=sqrt(d/r); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency check of spinor_prod5, mulg5\n"); + printf("and spinor_prod: %.2e\n\n",dmax); + } + + dmax=0.0; + z.re= 0.345; + z.im=-0.876; + zsq=z.re*z.re+z.im*z.im; + + for (i=0;i<9;i++) + { + pk=psd[i]+off; + pl=psd[i+1]+off; + + w=spinor_prod_dble(vol,icom,pk,pl); + r=norm_square_dble(vol,icom,pk)+zsq*norm_square_dble(vol,icom,pl) + +2.0*(z.re*w.re-z.im*w.im); + mulc_spinor_add_dble(vol,pk,pl,z); + + d=fabs(r/norm_square_dble(vol,icom,pk)-1.0); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of spinor_prod, norm_square\n"); + printf("and mulc_spinor_add: %.2e\n\n",dmax); + } + + for (i=0;i<10;i++) + random_sd(vol,psd[i]+off,1.0); + + dmax=0.0; + r=-1.234; + z.re=-r; + z.im=0.0; + + for (i=0;i<8;i+=3) + { + pk=psd[i]+off; + pl=psd[i+1]+off; + pj=psd[i+2]+off; + + assign_sd2sd(vol,pk,pj); + mulr_spinor_add_dble(vol,pk,pl,r); + mulc_spinor_add_dble(vol,pk,pl,z); + mulr_spinor_add_dble(vol,pk,pj,-1.0); + + d=norm_square_dble(vol,icom,pk)/norm_square_dble(vol,icom,pj); + d=sqrt(d); + if (d>dmax) + dmax=d; + + assign_sd2sd(vol,pl,pk); + scale_dble(vol,r,pk); + mulc_spinor_add_dble(vol,pk,pl,z); + + d=norm_square_dble(vol,icom,pk)/norm_square_dble(vol,icom,pl); + d=sqrt(d); + if (d>dmax) + dmax=d; + } + 
+ if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of mulr_spinor_add, scale\n"); + printf("and mulc_spinor_add: %.2e\n\n",dmax); + } + + for (i=0;i<10;i++) + random_sd(vol,psd[i]+off,1.0); + + dmax=0.0; + cs=0.785; + cr=-1.567; + + for (i=0;i<8;i+=3) + { + pk=psd[i]+off; + pl=psd[i+1]+off; + pj=psd[i+2]+off; + + assign_sd2sd(vol,pk,pj); + combine_spinor_dble(vol,pk,pl,cs,cr); + scale_dble(vol,cs,pj); + mulr_spinor_add_dble(vol,pj,pl,cr); + mulr_spinor_add_dble(vol,pk,pj,-1.0); + + d=norm_square_dble(vol,icom,pk)/norm_square_dble(vol,icom,pj); + d=sqrt(d); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of mulr_spinor_add, scale\n"); + printf("and combine_spinor: %.2e\n\n",dmax); + } + + for (i=0;i<10;i++) + random_sd(vol,psd[i]+off,1.0); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=psd[i]+off; + + if (i>0) + { + pl=psd[i-1]+off; + project_dble(vol,icom,pk,pl); + z=spinor_prod_dble(vol,icom,pk,pl); + + d=(fabs(z.re)+fabs(z.im))/sqrt(norm_square_dble(vol,icom,pk)); + + if (d>dmax) + dmax=d; + } + + normalize_dble(vol,icom,pk); + r=norm_square_dble(vol,icom,pk); + + d=fabs(r-1.0); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of spinor_prod, norm_square,\n"); + printf("normalize and project: %.2e\n\n",dmax); + } + + for (i=0;i<5;i++) + { + pk=psd[i]+off; + pl=psd[i+5]+off; + + random_sd(vol,psd[i]+off,1.0); + assign_sd2sd(vol,pk,pl); + + for (j=0;j<5;j++) + { + v[5*i+j].re=0.1234*(double)(i^2)-0.8976*(double)(j); + v[5*i+j].im=0.2231*(double)(i)+0.9922*(double)(j^2); + } + + ppk[i]=pl; + } + + rotate_dble(vol,5,ppk,v); + dmax=0.0; + + for (i=5;i<10;i++) + { + pk=psd[i]+off; + + for (j=0;j<5;j++) + { + z.re=-v[5*j+(i-5)].re; + z.im=-v[5*j+(i-5)].im; + + pl=psd[j]+off; + mulc_spinor_add_dble(vol,pk,pl,z); + } + + r=norm_square_dble(vol,icom,pk); + + d=fabs(r); + if (d>dmax) + dmax=d; + } + + 
dmax/=norm_square_dble(vol,icom,psd[0]+off); + dmax=sqrt(dmax); + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of mulc_spinor_add\n"); + printf("and rotate: %.2e\n\n",dmax); + } + + dmax=0.0; + + for (i=0;i<5;i++) + { + pk=psd[i]+off; + pl=psd[9-i]+off; + random_sd(vol,pk,1.0); + assign_sd2sd(vol,pk,pl); + mulg5_dble(vol,pk); + mulg5_dble(vol,pk); + + z.re=-1.0; + z.im=0.0; + + mulc_spinor_add_dble(vol,pl,pk,z); + r=norm_square_dble(vol,icom,pl)/norm_square_dble(vol,icom,pk); + d=sqrt(r); + if (d>dmax) + dmax=d; + + random_sd(vol,pl,1.0); + z=spinor_prod_dble(vol,icom,pk,pl); + mulg5_dble(vol,pk); + mulg5_dble(vol,pl); + w=spinor_prod_dble(vol,icom,pk,pl); + + d=(fabs(z.re-w.re)+fabs(z.im-w.im))/ + (fabs(z.re)+fabs(z.im)); + if (d>dmax) + dmax=d; + + random_sd(vol,pk,1.0); + assign_sd2sd(vol,pk,pl); + mulg5_dble(vol,pk); + mulmg5_dble(vol,pk); + + z.re=1.0; + z.im=0.0; + + mulc_spinor_add_dble(vol,pl,pk,z); + r=norm_square_dble(vol,icom,pl)/norm_square_dble(vol,icom,pk); + d=sqrt(r); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Check of mulg5 and mulmg5: %.2e\n\n",dmax); + } + } + } + } + + error_chk(); + + if (my_rank==0) + { + printf("Maximal deviation in all tests: %.2e\n\n",dall); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..d5bfb3625d4547b7ed825508bb39a4a7773b91e7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check4.c @@ -0,0 +1,321 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2007, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* 
Consistency checks on the programs in the module valg +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "vflds.h" +#include "linalg.h" +#include "global.h" + + +static complex v[25]; +static complex *ppk[5]; + + +static complex sp(int vol,complex *pk,complex *pl) +{ + int ix; + double x,y; + complex z; + + x=0.0; + y=0.0; + + for (ix=0;ix1)) + { + if (my_rank==0) + { + if (icom==1) + { + printf("Checks with global summation\n"); + printf("============================\n\n"); + } + else + { + printf("Checks without global summation\n"); + printf("===============================\n\n"); + } + } + + for (ieo=0;ieo<3;ieo++) + { + if (my_rank==0) + { + if (ieo==0) + printf("First case: full lattice\n\n"); + else if (ieo==1) + printf("Second case: even points\n\n"); + else + printf("Third case: odd points\n\n"); + } + + vol=nv/2; + off=0; + + if (ieo==0) + vol=nv; + if (ieo==2) + off=nv/2; + + for (i=0;i<10;i++) + random_v(vol,wv[i]+off,1.0f); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=wv[i]+off; + pl=wv[9-i]+off; + + if (icom==1) + { + z=sp(vol,pk,pl); + MPI_Reduce(&z.re,&w.re,2,MPI_FLOAT,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&w.re,2,MPI_FLOAT,0,MPI_COMM_WORLD); + } + else + w=sp(vol,pk,pl); + + z=vprod(vol,icom,pk,pl); + r=vnorm_square(vol,icom,pk)*vnorm_square(vol,icom,pl); + d=(double)((z.re-w.re)*(z.re-w.re)+(z.im-w.im)*(z.im-w.im)); + d=sqrt(d/(double)(r)); + if (d>dmax) + dmax=d; + + z=vprod(vol,icom,pk,pk); + r=vnorm_square(vol,icom,pk); + + d=fabs((double)(z.im/r)); + if (d>dmax) + dmax=d; + + d=fabs((double)(z.re/r-1.0f)); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Check of vprod and vnorm_square: %.2e\n\n",dmax); + } + + dmax=0.0; + z.re= 0.345f; + z.im=-0.876f; + zsq=z.re*z.re+z.im*z.im; + + 
for (i=0;i<9;i++) + { + pk=wv[i]+off; + pl=wv[i+1]+off; + + w=vprod(vol,icom,pk,pl); + r=vnorm_square(vol,icom,pk)+zsq*vnorm_square(vol,icom,pl) + +2.0f*(z.re*w.re-z.im*w.im); + mulc_vadd(vol,pk,pl,z); + + d=fabs((double)(r/vnorm_square(vol,icom,pk)-1.0f)); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of vprod, vnorm_square\n"); + printf("and mulc_vadd: %.2e\n\n",dmax); + } + + for (i=0;i<10;i++) + random_v(vol,wv[i]+off,1.0f); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=wv[i]+off; + + if (i>0) + { + pl=wv[i-1]+off; + vproject(vol,icom,pk,pl); + z=vprod(vol,icom,pk,pl); + + d=(fabs((double)(z.re))+ + fabs((double)(z.im)))/ + sqrt((double)(vnorm_square(vol,icom,pk))); + + if (d>dmax) + dmax=d; + } + + vnormalize(vol,icom,pk); + r=vnorm_square(vol,icom,pk); + + d=fabs((double)(r-1.0f)); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of vprod, vnorm_square,\n"); + printf("vnormalize and vproject: %.2e\n\n",dmax); + } + + for (i=0;i<5;i++) + { + pk=wv[i]+off; + pl=wv[i+5]+off; + + random_v(vol,wv[i]+off,1.0f); + assign_v2v(vol,pk,pl); + + for (j=0;j<5;j++) + { + v[5*i+j].re=0.1234f*(float)(i^2)-0.8976f*(float)(j); + v[5*i+j].im=0.2231f*(float)(i)+0.9922f*(float)(j^2); + } + + ppk[i]=pl; + } + + vrotate(vol,5,ppk,v); + dmax=0.0; + + for (i=5;i<10;i++) + { + pk=wv[i]+off; + + for (j=0;j<5;j++) + { + z.re=-v[5*j+(i-5)].re; + z.im=-v[5*j+(i-5)].im; + + pl=wv[j]+off; + mulc_vadd(vol,pk,pl,z); + } + + r=vnorm_square(vol,icom,pk); + + d=fabs((double)(r)); + if (d>dmax) + dmax=d; + } + + dmax/=(double)(vnorm_square(vol,icom,wv[0]+off)); + dmax=sqrt(dmax); + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of mulc_vadd\n"); + printf("and vrotate: %.2e\n\n",dmax); + } + } + } + } + + error_chk(); + + if (my_rank==0) + { + printf("Maximal deviation in all tests: %.2e\n\n",dall); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff 
--git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check4.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check4.in new file mode 100644 index 0000000000000000000000000000000000000000..d3202ab3cc8269cb289e4fdf5e9a62838f8a8119 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check4.in @@ -0,0 +1 @@ +bs 4 4 4 4 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check5.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check5.c new file mode 100644 index 0000000000000000000000000000000000000000..ec9883519ae8fa038bdfe9f22f104586d16c5f1c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/check5.c @@ -0,0 +1,321 @@ + +/******************************************************************************* +* +* File check5.c +* +* Copyright (C) 2007, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Checks on the programs in the module valg_dble +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "vflds.h" +#include "linalg.h" +#include "global.h" + +static complex_dble v[25]; +static complex_dble *ppk[5]; + + +static complex_dble sp(int vol,complex_dble *pk,complex_dble *pl) +{ + int ix; + double x,y; + complex_dble z; + + x=0.0; + y=0.0; + + for (ix=0;ix1)) + { + if (my_rank==0) + { + if (icom==1) + { + printf("Checks with global summation\n"); + printf("============================\n\n"); + } + else + { + printf("Checks without global summation\n"); + printf("===============================\n\n"); + } + } + + for (ieo=0;ieo<3;ieo++) + { + if (my_rank==0) + { + if (ieo==0) + 
printf("First case: full lattice\n\n"); + else if (ieo==1) + printf("Second case: even points\n\n"); + else + printf("Third case: odd points\n\n"); + } + + vol=nv/2; + off=0; + + if (ieo==0) + vol=nv; + if (ieo==2) + off=nv/2; + + for (i=0;i<10;i++) + random_vd(vol,wvd[i]+off,1.0f); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=wvd[i]+off; + pl=wvd[9-i]+off; + + if (icom==1) + { + z=sp(vol,pk,pl); + MPI_Reduce(&z.re,&w.re,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&w.re,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + w=sp(vol,pk,pl); + + z=vprod_dble(vol,icom,pk,pl); + r=vnorm_square_dble(vol,icom,pk)*vnorm_square_dble(vol,icom,pl); + d=(z.re-w.re)*(z.re-w.re)+(z.im-w.im)*(z.im-w.im); + d=sqrt(d/r); + if (d>dmax) + dmax=d; + + z=vprod_dble(vol,icom,pk,pk); + r=vnorm_square_dble(vol,icom,pk); + + d=fabs(z.im/r); + if (d>dmax) + dmax=d; + + d=fabs(z.re/r-1.0); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Check of vprod_dble and vnorm_square_dble: %.2e\n\n", + dmax); + } + + dmax=0.0; + z.re= 0.345; + z.im=-0.876; + zsq=z.re*z.re+z.im*z.im; + + for (i=0;i<9;i++) + { + pk=wvd[i]+off; + pl=wvd[i+1]+off; + + w=vprod_dble(vol,icom,pk,pl); + r=vnorm_square_dble(vol,icom,pk)+ + zsq*vnorm_square_dble(vol,icom,pl) + +2.0f*(z.re*w.re-z.im*w.im); + mulc_vadd_dble(vol,pk,pl,z); + + d=fabs(r/vnorm_square_dble(vol,icom,pk)-1.0); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of vprod_dble, vnorm_square_dble\n"); + printf("and mulc_vadd_dble: %.2e\n\n",dmax); + } + + for (i=0;i<10;i++) + random_vd(vol,wvd[i]+off,1.0f); + + dmax=0.0; + + for (i=0;i<10;i++) + { + pk=wvd[i]+off; + + if (i>0) + { + pl=wvd[i-1]+off; + vproject_dble(vol,icom,pk,pl); + z=vprod_dble(vol,icom,pk,pl); + + d=(fabs(z.re)+fabs(z.im))/ + sqrt(vnorm_square_dble(vol,icom,pk)); + + if (d>dmax) + dmax=d; + } + + vnormalize_dble(vol,icom,pk); + r=vnorm_square_dble(vol,icom,pk); + + d=fabs(r-1.0); + if 
(d>dmax) + dmax=d; + } + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of vprod_dble, vnorm_square_dble,\n"); + printf("vnormalize_dble and vproject_dble: %.2e\n\n",dmax); + } + + for (i=0;i<5;i++) + { + pk=wvd[i]+off; + pl=wvd[i+5]+off; + + random_vd(vol,wvd[i]+off,1.0f); + assign_vd2vd(vol,pk,pl); + + for (j=0;j<5;j++) + { + v[5*i+j].re=0.1234*(double)(i^2)-0.8976*(double)(j); + v[5*i+j].im=0.2231*(double)(i)+0.9922*(double)(j^2); + } + + ppk[i]=pl; + } + + vrotate_dble(vol,5,ppk,v); + dmax=0.0; + + for (i=5;i<10;i++) + { + pk=wvd[i]+off; + + for (j=0;j<5;j++) + { + z.re=-v[5*j+(i-5)].re; + z.im=-v[5*j+(i-5)].im; + + pl=wvd[j]+off; + mulc_vadd_dble(vol,pk,pl,z); + } + + r=vnorm_square_dble(vol,icom,pk); + + d=fabs(r); + if (d>dmax) + dmax=d; + } + + dmax/=vnorm_square_dble(vol,icom,wvd[0]+off); + dmax=sqrt(dmax); + + if (my_rank==0) + { + if (dmax>dall) + dall=dmax; + printf("Consistency of mulc_vadd_dble\n"); + printf("and vrotate_dble: %.2e\n\n",dmax); + } + } + } + } + + error_chk(); + + if (my_rank==0) + { + printf("Maximal deviation in all tests: %.2e\n\n",dall); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..de793e3409abc7028faa9a4f0fe1d429e876ac63 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/time1.c @@ -0,0 +1,503 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2005, 2008, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of the salg routines +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include 
+#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "global.h" + +static complex *vmat,*wmat; +static spinor **ps,*ppk[5]; + + +static double wt_spinor_prod(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + for (i=0;i2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_norm_square(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)(nmax*nflds); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_normalize(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)(nmax*nflds); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_mulc_spinor_add(int nflds) +{ + int my_rank,nmax,n,i,ib; + complex z; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + z.re=0.123f; + z.im=0.456f; + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_project(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + 
double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + for (i=0;i2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static void gram_schmidt(int n,spinor **s) +{ + int i,j,k; + + for (i=0;i2.0) + ib=1; + + wtav/=(double)(2*nmax); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,icom,nflds; + double wdt; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time1.log","w",stdout); + + printf("\n"); + printf("Timing of the salg routines\n"); + printf("---------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + + if ((VOLUME*sizeof(float))<(64*1024)) + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(float))/(1024))); + else + printf("The local size of a quark field is %d MB\n", + (int)((24*VOLUME*sizeof(float))/(1024*1024))); + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + } + + icom=1; + start_ranlux(0,12345); + geometry(); + + 
nflds=(int)((4*1024*1024)/(VOLUME*sizeof(float)))+1; + if ((nflds%2)==1) + nflds+=1; + if (nflds<10) + nflds=10; + alloc_ws(nflds); + ps=reserve_ws(nflds); + + wdt=1.0e6*wt_spinor_prod(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function spinor_prod:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(96.0/wdt),(int)(sizeof(spinor))/3); + } + + wdt=1.0e6*wt_norm_square(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function norm_square:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(48.0/wdt),(int)(sizeof(spinor))/3); + } + + wdt=1.0e6*wt_normalize(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function normalize:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(72.0/wdt),(int)(sizeof(spinor))/3); + } + + wdt=1.0e6*wt_mulc_spinor_add(nflds)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function mulc_spinor_add:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(96.0/wdt),(int)(sizeof(spinor))/3); + } + + wdt=1.0e6*wt_project(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function project:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(192.0/wdt),(int)(sizeof(spinor))/3); + } + + wdt=1.0e6*wt_rotate()/(double)(25*VOLUME); + error_chk(); + + if (my_rank==0) + { + printf("Function rotate (n=5 fields):\n"); + printf("Time per lattice point: %4.3f*n^2 micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(91.2/wdt),(int)(sizeof(spinor))/3); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/time2.c 
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/time2.c new file mode 100644 index 0000000000000000000000000000000000000000..0d4f1eff68034d21824b293481ffde4f46671ed5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/linalg/time2.c @@ -0,0 +1,503 @@ + +/******************************************************************************* +* +* File time2.c +* +* Copyright (C) 2005, 2008, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of the salg_dble routines +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "global.h" + +static complex_dble *vmat,*wmat; +static spinor_dble **psd,*ppk[5]; + + +static double wt_spinor_prod_dble(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + for (i=0;i2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_norm_square_dble(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)(nmax*nflds); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_normalize_dble(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + 
MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)(nmax*nflds); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_mulc_spinor_add_dble(int nflds) +{ + int my_rank,nmax,n,i,ib; + complex_dble z; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + z.re=0.123; + z.im=0.456; + nmax=1; + + for (ib=0;ib<1;nmax*=2) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + for (n=0;n2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static double wt_project_dble(int nflds,int icom) +{ + int my_rank,nmax,n,i,ib; + double wt1,wt2,wdt,wtav; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + for (i=0;i2.0) + ib=1; + + wtav/=(double)((nmax*nflds)/2); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +static void gram_schmidt(int n,spinor_dble **s) +{ + int i,j,k; + + for (i=0;i2.0) + ib=1; + + wtav/=(double)(2*nmax); + } + + MPI_Bcast(&ib,1,MPI_INT,0,MPI_COMM_WORLD); + } + + MPI_Bcast(&wtav,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return wtav; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,icom,nflds; + double wdt; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time2.log","w",stdout); + + printf("\n"); + printf("Timing of the salg_dble routines\n"); + printf("--------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + if (NPROC>1) + printf("There are %d MPI processes\n",NPROC); + else + printf("There is 1 MPI process\n"); + + if 
((VOLUME*sizeof(double))<(64*1024)) + printf("The local size of a quark field is %d KB\n", + (int)((24*VOLUME*sizeof(double))/(1024))); + else + printf("The local size of a quark field is %d MB\n", + (int)((24*VOLUME*sizeof(double))/(1024*1024))); + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + } + + icom=1; + start_ranlux(0,12345); + geometry(); + + nflds=(int)((4*1024*1024)/(VOLUME*sizeof(double)))+1; + if ((nflds%2)==1) + nflds+=1; + if (nflds<10) + nflds=10; + alloc_wsd(nflds); + psd=reserve_wsd(nflds); + + wdt=1.0e6*wt_spinor_prod_dble(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function spinor_prod_dble:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(96.0/wdt),(int)(sizeof(spinor_dble))/3); + } + + wdt=1.0e6*wt_norm_square_dble(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function norm_square_dble:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(48.0/wdt),(int)(sizeof(spinor_dble))/3); + } + + wdt=1.0e6*wt_normalize_dble(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function normalize_dble:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(72.0/wdt),(int)(sizeof(spinor_dble))/3); + } + + wdt=1.0e6*wt_mulc_spinor_add_dble(nflds)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function mulc_spinor_add_dble:\n"); + printf("Time per lattice point: %4.3f 
micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(96.0/wdt),(int)(sizeof(spinor_dble))/3); + } + + wdt=1.0e6*wt_project_dble(nflds,icom)/(double)(VOLUME); + + if (my_rank==0) + { + printf("Function project_dble:\n"); + printf("Time per lattice point: %4.3f micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(192.0/wdt),(int)(sizeof(spinor_dble))/3); + } + + wdt=1.0e6*wt_rotate_dble()/(double)(25*VOLUME); + error_chk(); + + if (my_rank==0) + { + printf("Function rotate_dble (n=5 fields):\n"); + printf("Time per lattice point: %4.3f*n^2 micro sec\n",wdt); + printf("%d Mflops [%d bit arithmetic]\n\n", + (int)(91.2/wdt),(int)(sizeof(spinor_dble))/3); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..8ba001fee9b776ab2c98be6ecfbf114a75ba5fdb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/INDEX @@ -0,0 +1,20 @@ + +Little Dirac operator + +check1 Check of the programs in the module Aw_gen.c. + +check2 Check of the program b2b_flds(). + +check3 Direct check of Aw_dble() and Aw(). + +check4 Consistency checks on Aw_dble(),..,Awhat(). + +check5 Check of the program set_ltl_modes(). + +time1 Timing of Awhat(). + +The programs check2,..,time1 accept the option -bc that allows the +type of boundary condition to be chosen at runtime. When the option is not +set, open boundary conditions are assumed. + +The option may be set but has no effect in the case of check1. 
\ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..ae5f543ffa78dd23b188847a4b3395a7119432f6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/Makefile @@ -0,0 +1,154 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 time1 + +FLAGS = flags lat_parms sap_parms dfl_parms + +LATTICE = bcnds ftidx uidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix + +LINSOLV = fgcr + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum shift uflds udcom + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +TCHARGE = ftcom ftensor + +SW_TERM = pauli pauli_dble swflds sw_term + +DIRAC = Dw_dble Dw Dw_bnd + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +SAP = blk_solv sap_com sap sap_gcr + +ARCHIVE = archive + +DFL = dfl_geometry dfl_subspace + +VFLDS = vflds vinit vcom vdcom + +LITTLE = Aw_gen Aw_com Aw_ops Aw_dble Aw ltl_modes + +MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + 
$(BLOCK) $(SAP) $(ARCHIVE) $(DFL) $(VFLDS) $(LITTLE) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:$(MDIR)/dfl:\ + $(MDIR)/vflds:$(MDIR)/little + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..ec8a9c6c05f09291e695dfd3e7988daa7f4f2706 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check1.c @@ -0,0 +1,306 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2007, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the programs in the module Aw_gen.c. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "flags.h" +#include "sflds.h" +#include "linalg.h" +#include "little.h" +#include "global.h" + +#define NPTS 2048 + +static int imb[NPTS]; +static su3_dble ud[NPTS],vd[NPTS] ALIGNED16; +static spinor_dble sd[3][NPTS] ALIGNED16; + + +static void random_imb(int vol) +{ + int i,j,a,b; + float r[2],rvol; + + for (i=0;i=vol) + a=vol-1; + b=(int)(rvol*r[1]); + if (b>=vol) + b=vol-1; + + j=imb[a]; + imb[a]=imb[b]; + imb[b]=j; + } +} + + +static void random_ufld(int vol) +{ + int i; + + for (i=0;i +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + +static int bs[4],Ns,bc; +static int l[4],np[4]; +static const su3_dble ud0={{0.0}}; + + +static void set_ud(void) +{ + su3_dble unity,*ud,*um; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + ud=udfld(); + um=ud+4*VOLUME; + + for (;ud=b[nu])) + ie=2; + } + } + 
+ error(ie!=0,1,"chk_sde0 [check2.c]","Incorrect field components"); +} + + +static void chk_sde1(int mu,int vol,int ibn,int *bo,spinor_dble *sd) +{ + int a[4],b[4],y[4]; + int ix,nu,ie; + + for (nu=0;nu<4;nu++) + { + a[nu]=cpr[nu]*l[nu]+bo[nu]; + b[nu]=a[nu]+bs[nu]; + } + + a[mu]=cpr[mu]*l[mu]+bo[mu]; + if (ibn) + a[mu]=safe_mod(a[mu]+l[mu],np[mu]*l[mu]); + b[mu]=a[mu]+1; + ie=0; + + for (ix=0;ix=b[nu])) + ie=2; + } + } + + error(ie!=0,1,"chk_sde1 [check2.c]","Incorrect field components"); +} + + +static void chk_sdo0(int mu,int vol,int *bo,spinor_dble *sd) +{ + int a[4],b[4],y[4]; + int ix,nu,ie; + + for (nu=0;nu<4;nu++) + { + a[nu]=cpr[nu]*l[nu]+bo[nu]; + b[nu]=a[nu]+bs[nu]; + } + + a[mu]=cpr[mu]*l[mu]+bo[mu]+bs[mu]-1; + b[mu]=a[mu]+1; + ie=0; + + for (ix=0;ix=b[nu])) + ie=2; + } + } + + error(ie!=0,1,"chk_sdo0 [check2.c]","Incorrect field components %d",ie); +} + + +static void chk_sdo1(int mu,int vol,int *bo,spinor_dble *sd) +{ + int a[4],b[4],y[4]; + int ix,nu,ie; + + for (nu=0;nu<4;nu++) + { + a[nu]=cpr[nu]*l[nu]+bo[nu]; + b[nu]=a[nu]+bs[nu]; + } + + a[mu]=cpr[mu]*l[mu]+bo[mu]; + b[mu]=a[mu]+1; + ie=0; + + for (ix=0;ix=b[nu])) + ie=2; + } + } + + error(ie!=0,1,"chk_sdo1 [check2.c]","Incorrect field components"); +} + + +static void cmp_sde0_sdo1(int mu,int vol,int *bo,spinor_dble *sde, + spinor_dble *sdo) +{ + int ye[4],yo[4]; + int ix,nu,ie; + + ie=0; + + for (ix=0;ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + Ns=2; + set_dfl_parms(bs,Ns); + alloc_bgr(DFL_BLOCKS); + blk_list(DFL_BLOCKS,&nb,&isw); + + alloc_wsd(Ns); + wsd=reserve_wsd(Ns); + set_ud(); + + for (k=0;k +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" 
+#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "vflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + + +static void random_basis(int Ns) +{ + int i; + spinor **ws; + + ws=reserve_ws(Ns); + + for (i=0;i]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + set_sw_parms(-0.0123); + set_dfl_parms(bs,Ns); + mu=0.0376; + + start_ranlux(0,123456); + geometry(); + + alloc_ws(Ns+2); + alloc_wsd(2); + alloc_wv(3); + alloc_wvd(3); + + ws=reserve_ws(2); + wsd=reserve_wsd(2); + wv=reserve_wv(3); + wvd=reserve_wvd(3); + nb=VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + nv=Ns*nb; + + random_ud(); + chs_ubnd(-1); + random_basis(Ns); + set_Aw(mu); + sw_term(NO_PTS); + assign_ud2u(); + assign_swd2sw(); + + random_vd(nv,wvd[0],1.0); + Aw_dble(wvd[0],wvd[1]); + dfl_vd2sd(wvd[0],wsd[0]); + Dw_dble(mu,wsd[0],wsd[1]); + dfl_sd2vd(wsd[1],wvd[2]); + + zd.re=-1.0; + zd.im=0.0; + mulc_vadd_dble(nv,wvd[2],wvd[1],zd); + dev=vnorm_square_dble(nv,1,wvd[2])/vnorm_square_dble(nv,1,wvd[1]); + + error_chk(); + + if (my_rank==0) + printf("Relative deviation (Aw_dble) = %.1e\n",sqrt(dev)); + + random_v(nv,wv[0],1.0f); + Aw(wv[0],wv[1]); + dfl_v2s(wv[0],ws[0]); + Dw((float)(mu),ws[0],ws[1]); + dfl_s2v(ws[1],wv[2]); + + z.re=-1.0f; + z.im=0.0f; + mulc_vadd(nv,wv[2],wv[1],z); + dev=(double)(vnorm_square(nv,1,wv[2])/vnorm_square(nv,1,wv[1])); + + error_chk(); + + if (my_rank==0) + { + printf("Relative deviation (Aw) = %.1e\n\n",sqrt(dev)); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check3.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check3.in new file mode 100644 index 0000000000000000000000000000000000000000..4d215004cd2b47624ceeec30351e2dfa500353ac --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check3.in @@ -0,0 +1,2 @@ +bs 4 4 4 4 +Ns 20 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..133eeb170fbe185d0a422947811d84a979262ffa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check4.c @@ -0,0 +1,414 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2007, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Consistency checks on Aw_dble(),..,Awhat(). 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "vflds.h" +#include "linalg.h" +#include "dirac.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + +static int bc,Ns; + + +static void random_basis(int Ns) +{ + int i; + spinor **ws; + + ws=reserve_ws(Ns); + + for (i=0;i]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + set_sw_parms(0.125); + set_dfl_parms(bs,Ns); + mu=0.0376; + + start_ranlux(0,123456); + geometry(); + + alloc_ws(Ns); + alloc_wv(4); + alloc_wvd(6); + + wv=reserve_wv(4); + wvd=reserve_wvd(4); + nb=VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + nv=Ns*nb; + nvh=nv/2; + + random_ud(); + chs_ubnd(-1); + random_basis(Ns); + + ifail=set_Awhat(mu); + error(ifail!=0,1,"main [check4.c]","Inversion of Aee or Aoo failed"); + + zd.re=-1.0; + zd.im=0.0; + z.re=-1.0f; + z.im=0.0f; + + for (iop=0;iop<6;iop++) + { + if (iop==0) + { + op=Awhat; + pr= "Awhat() "; + op_dble=Awhat_dble; + prd="Awhat_dble() "; + } + else if (iop==1) + { + op=Aweeinv; + pr= "Aweeinv()"; + op_dble=Aweeinv_dble; + prd="Aweeinv_dble()"; + } + else if (iop==2) + { + op=Awooinv; + pr= "Awooinv()"; + op_dble=Awooinv_dble; + prd="Awooinv_dble()"; + } + else if (iop==3) + { + op=Awoe; + pr= "Awoe() "; + op_dble=Awoe_dble; + prd="Awoe_dble() "; + } + else if (iop==4) + { + op=Aweo; + pr= "Aweo() "; + op_dble=Aweo_dble; + prd="Aweo_dble() "; + } + else + { + op=Aw; + pr= "Aw() "; + op_dble=Aw_dble; + prd="Aw_dble() "; + } + + 
random_vd(nv,wvd[0],1.0); + random_vd(nv,wvd[1],1.0); + assign_vd2vd(nv,wvd[0],wvd[2]); + assign_vd2vd(nv,wvd[1],wvd[3]); + + assign_vd2v(nv,wvd[0],wv[0]); + assign_vd2v(nv,wvd[1],wv[1]); + assign_v2v(nv,wv[0],wv[2]); + assign_v2v(nv,wv[1],wv[3]); + + op_dble(wvd[0],wvd[1]); + op(wv[0],wv[1]); + + mulc_vadd_dble(nv,wvd[2],wvd[0],zd); + d=vnorm_square_dble(nv,0,wvd[2]); + error(d!=0.0,1,"main [check4.c]", + "%s modifies the input field",prd); + + mulc_vadd(nv,wv[2],wv[0],z); + d=(double)(vnorm_square(nv,0,wv[2])); + error(d!=0.0,1,"main [check4.c]", + "%s modifies the input field",pr); + + if ((iop<2)||(iop==4)) + { + mulc_vadd_dble(nvh,wvd[3]+nvh,wvd[1]+nvh,zd); + d=vnorm_square_dble(nvh,0,wvd[3]+nvh); + error(d!=0.0,1,"main [check4.c]", + "%s modifies the odd components of the output field",prd); + + mulc_vadd(nvh,wv[3]+nvh,wv[1]+nvh,z); + d=(double)(vnorm_square(nvh,0,wv[3]+nvh)); + error(d!=0.0,1,"main [check4.c]", + "%s modifies the odd components of the output field",pr); + + assign_vd2v(nvh,wvd[1],wv[0]); + mulc_vadd(nvh,wv[0],wv[1],z); + d=(double)(vnorm_square(nvh,1,wv[0])/ + vnorm_square(nvh,1,wv[1])); + if (my_rank==0) + printf("Deviation of %s from %s: %.1e\n",pr,prd,sqrt(d)); + } + + if ((iop==2)||(iop==3)) + { + mulc_vadd_dble(nvh,wvd[3],wvd[1],zd); + d=vnorm_square_dble(nvh,0,wvd[3]); + error(d!=0.0,1,"main [check4.c]", + "%s modifies the even components of the output field",prd); + + mulc_vadd(nvh,wv[3],wv[1],z); + d=(double)(vnorm_square(nvh,0,wv[3])); + error(d!=0.0,1,"main [check4.c]", + "%s modifies the even components of the output field",pr); + + assign_vd2v(nvh,wvd[1]+nvh,wv[0]+nvh); + mulc_vadd(nvh,wv[0]+nvh,wv[1]+nvh,z); + d=(double)(vnorm_square(nvh,1,wv[0]+nvh)/ + vnorm_square(nvh,1,wv[1]+nvh)); + + if (my_rank==0) + printf("Deviation of %s from %s: %.1e\n",pr,prd,sqrt(d)); + } + + if (iop==5) + { + assign_vd2v(nv,wvd[1],wv[0]); + mulc_vadd(nv,wv[0],wv[1],z); + d=(double)(vnorm_square(nv,1,wv[0])/ + vnorm_square(nv,1,wv[1])); + if 
(my_rank==0) + printf("Deviation of %s from %s: %.1e\n",pr,prd,sqrt(d)); + } + } + + ifail=set_Awhat(-mu); + error(ifail!=0,1,"main [check4.c]","Inversion of Aee or Aoo failed"); + + random_vd(nvh,wvd[0],1.0); + set_vd2zero(nvh,wvd[0]+nvh); + Aw_dble(wvd[0],wvd[1]); + + Aweeinv_dble(wvd[1],wvd[2]); + mulc_vadd_dble(nvh,wvd[2],wvd[0],zd); + d=vnorm_square_dble(nvh,1,wvd[2])/vnorm_square_dble(nvh,1,wvd[0]); + + if (my_rank==0) + { + printf("\n"); + printf("Comparison of Aweeinv_dble() and Aw_dble(): %.1e\n",sqrt(d)); + } + + Awoe_dble(wvd[0],wvd[2]); + mulc_vadd_dble(nvh,wvd[2]+nvh,wvd[1]+nvh,zd); + d=vnorm_square_dble(nvh,1,wvd[2]+nvh)/vnorm_square_dble(nvh,1,wvd[1]+nvh); + + if (my_rank==0) + printf("Comparison of Awoe_dble() and Aw_dble(): %.1e\n",sqrt(d)); + + random_vd(nvh,wvd[0]+nvh,1.0); + set_vd2zero(nvh,wvd[0]); + Aw_dble(wvd[0],wvd[1]); + + Awooinv_dble(wvd[1],wvd[2]); + mulc_vadd_dble(nvh,wvd[2]+nvh,wvd[0]+nvh,zd); + d=vnorm_square_dble(nvh,1,wvd[2]+nvh)/vnorm_square_dble(nvh,1,wvd[0]+nvh); + + if (my_rank==0) + printf("Comparison of Awooinv_dble() and Aw_dble(): %.1e\n",sqrt(d)); + + random_vd(nvh,wvd[2],1.0); + assign_vd2vd(nvh,wvd[2],wvd[3]); + Aweo_dble(wvd[0],wvd[2]); + mulc_vadd_dble(nvh,wvd[3],wvd[2],zd); + mulc_vadd_dble(nvh,wvd[3],wvd[1],zd); + d=vnorm_square_dble(nvh,1,wvd[3])/vnorm_square_dble(nvh,1,wvd[1]); + + if (my_rank==0) + printf("Comparison of Aweo_dble() and Aw_dble(): %.1e\n",sqrt(d)); + + random_vd(nv,wvd[0],1.0); + Awhat_dble(wvd[0],wvd[1]); + Awoe_dble(wvd[0],wvd[2]); + Awooinv_dble(wvd[2],wvd[3]); + set_vd2zero(nvh,wvd[0]+nvh); + Aw_dble(wvd[0],wvd[2]); + Aweo_dble(wvd[3],wvd[2]); + Aweeinv_dble(wvd[2],wvd[3]); + + mulc_vadd_dble(nvh,wvd[3],wvd[1],zd); + d=vnorm_square_dble(nvh,1,wvd[3])/vnorm_square_dble(nvh,1,wvd[1]); + + + if (my_rank==0) + { + printf("Comparison of Aweeinv_dble(), Awooinv_dble(), \n"); + printf(" Awoe_dble(), Aweo_dble() and Awhat_dble(): %.1e\n\n", + sqrt(d)); + fflush(flog); + } + + ifail=check_bndAwop(); + 
error(ifail!=0,1,"main [check4.c]", + "Hopping terms Aoe,Aeo at the lattice boundaries do not vanish"); + error_chk(); + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check5.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check5.c new file mode 100644 index 0000000000000000000000000000000000000000..00debd8729fb236da30136876642f0bd8304afa6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/check5.c @@ -0,0 +1,360 @@ + +/******************************************************************************* +* +* File check5.c +* +* Copyright (C) 2007, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the program set_ltl_modes(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "vflds.h" +#include "linalg.h" +#include "dirac.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + + +static void random_basis(int Ns) +{ + int i; + spinor **ws; + + ws=reserve_ws(Ns); + + for (i=0;idev) + dev=d; + } + } + + return sqrt(dev); +} + + +static double check_Awvd(int Ns,int nvh) +{ + int i; + double d,dev; + complex_dble **vd,**wvd,z; + + vd=vdflds(); + wvd=reserve_wvd(2); + + dev=0.0; + z.re=-1.0; + z.im=0.0; + + for (i=0;idev) + dev=d; + } + + release_wvd(); + + return sqrt(dev); +} + + +static double check_ltl_matrix(int Ns,int nvh) +{ + int i,j,ie; + double dev; + complex_dble **vd,*amat,*bmat,*cmat,z; + + vd=vdflds(); + amat=ltl_matrix(); + bmat=amalloc(2*Ns*Ns*sizeof(*amat),ALIGN); + error(bmat==NULL,1,"check_ltl_matrix [check5.c]", + "Unable to 
allocate auxiliary arrays"); + cmat=bmat+Ns*Ns; + + for (i=0;idev) + dev=d; + } + + for (i=0;idev) + dev=d; + } + + release_wvd(); + + return sqrt(dev); +} + + +static double check_mds(int Ns,int nvh) +{ + int nv,k,l; + double d,dev; + complex **vs; + complex_dble **vd,**wvd; + spinor **mds,**ws; + + nv=2*nvh; + mds=reserve_ws(Ns); + ws=reserve_ws(1); + vs=vflds(); + dev=0.0; + + for (k=0;kdev) + dev=d; + } + + release_ws(); + release_ws(); + + vd=vdflds(); + wvd=reserve_wvd(1); + + for (k=0;kdev) + dev=d; + } + + release_wvd(); + + return sqrt(dev); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,ifail; + int bs[4],Ns,nb,nvh; + double phi[2],phi_prime[2]; + double mu,dev; + FILE *fin=NULL,*flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check5.log","w",stdout); + fin=freopen("check3.in","r",stdin); + + printf("\n"); + printf("Check of the program set_ltl_modes()\n"); + printf("------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + read_line("Ns","%d",&Ns); + fclose(fin); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("Ns = %d\n\n",Ns); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check5.c]", + "Syntax: check5 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + set_sw_parms(0.125); + set_dfl_parms(bs,Ns); + mu=0.0376; + + start_ranlux(0,123456); 
+ geometry(); + + alloc_ws(Ns+1); + alloc_wvd(3); + + nb=VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + nvh=Ns*(nb/2); + + random_ud(); + chs_ubnd(-1); + random_basis(Ns); + ifail=set_Awhat(mu); + error_root(ifail!=0,1,"main [check5.c]", + "Computation of the little Dirac operator failed"); + + if (my_rank==0) + printf("Maximal relative deviations found:\n\n"); + + dev=check_vd(Ns,nvh); + + if (my_rank==0) + printf("Orthonormality of vdflds: %.2e\n",dev); + + dev=check_Awvd(Ns,nvh); + + if (my_rank==0) + printf("Awhat*vdflds: %.2e\n",dev); + + dev=check_ltl_matrix(Ns,nvh); + + if (my_rank==0) + printf("Little-little matrix: %.2e\n\n",dev); + + dev=check_vflds(Ns,nvh); + + if (my_rank==0) + printf("Single-precision fields: %.2e\n",dev); + + dev=check_mds(Ns,nvh); + error_chk(); + + if (my_rank==0) + { + printf("Global deflation modes: %.2e\n\n",dev); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..92901785e1f9a894790a9bf12fcdb11ca936386c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/little/time1.c @@ -0,0 +1,235 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2007, 2008, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of Awhat(). 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "vflds.h" +#include "linalg.h" +#include "dirac.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + + +static void random_basis(int Ns) +{ + int i; + spinor **ws; + + ws=reserve_ws(Ns); + + for (i=0;i]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + set_sw_parms(0.125); + set_dfl_parms(bs,Ns); + mu=0.0376; + + nb=VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + nbb=2*(FACE0/(bs[1]*bs[2]*bs[3])+ + FACE1/(bs[0]*bs[2]*bs[3])+ + FACE2/(bs[0]*bs[1]*bs[3])+ + FACE3/(bs[0]*bs[1]*bs[2])); + nv=Ns*nb; + + start_ranlux(0,123456); + geometry(); + + alloc_ws(Ns); + alloc_wvd(2); + random_ud(); + chs_ubnd(-1); + random_basis(Ns); + + ifail=set_Awhat(mu); + error(ifail!=0,1,"main [time1.c]","Inversion of Aee or Aoo failed"); + + if (my_rank==0) + { + printf("Number of points = %d\n",VOLUME); + printf("Number of blocks = %d\n",nb); + printf("Number of points/block = %d\n",bs[0]*bs[1]*bs[2]*bs[3]); + printf("Vector field size = %.2f KB\n", + (double)(sizeof(complex)*nv)*1.0e-3); + printf("Awhat array size = %.2f MB\n\n", + (double)(sizeof(complex)*8*Ns*nv)*1.0e-6); + fflush(flog); + } + + nflds=(int)(1.0e6/(double)(sizeof(complex)*nv)); + if ((nflds%2)!=0) + nflds+=1; + if (nflds==0) + nflds=2; + + alloc_wv(nflds); + wv=reserve_wv(nflds); + + for (i=0;i1) + { + nt/=2; + if (nt==0) + nt=1; + wdt=0.0; + + while (wdt<5.0) + { + for (i=0;i +#include +#include +#include 
"utils.h" +#include "extras.h" + + +int main(void) +{ + double x,y; + + printf("\n"); + printf("Modified Bessel function I0(x) [program i0m()]\n"); + printf("----------------------------------------------\n\n"); + + printf("Print selected values:\n\n"); + + for (;;) + { + printf("Specify x: "); + + if (scanf("%lf",&x)==1) + { + y=i0m(x); + printf("x = %.4e, exp(-x)*I0(x) = %.15e\n\n",x,y); + } + else + { + printf("No value specified, program stopped\n\n"); + break; + } + } + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..d32a6fa775381b91c3ff77dffc20593c1deac355 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/INDEX @@ -0,0 +1,7 @@ + +Generic functions for MD force calculations + +check1 Check of det2xt and prod2xt + +check2 Check of prod2xv + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..bad6db290fc42f20fb14ec8e729c54130c938141 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/Makefile @@ -0,0 +1,126 @@ +################################################################################ +# +# Makefile to compile and link C programs +# +# Version valid for Linux machines +# +# "make" compiles and links the specified main programs and modules +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files created by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep 
mkxeq +.PHONY: all + + +# main programs and required modules + +MAIN = check1 check2 + +RANDOM = ranlxs ranlxd gauss + +UTILS = utils + +LINALG = cmatrix cmatrix_dble + +FORCES = frcfcts + +SW_TERM = pauli pauli_dble + +SU3FCTS = random_su3 + +MODULES = $(RANDOM) $(UTILS) $(LINALG) $(FORCES) $(SW_TERM) $(SU3FCTS) + + +# search path for modules + +MDIR = ../../../modules + +VPATH = $(MDIR)/nompi/extras:$(MDIR)/nompi/utils:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/frcfcts:$(MDIR)/sw_term:\ + $(MDIR)/su3fcts:$(MDIR)/forces + + +# additional include directories + +INCPATH = ../../../include/nompi ../../../include + + +# additional libraries to be included + +LIBS = m + +LIBPATH = + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 + + +############################## do not change ################################### + +SHELL=/bin/bash + +CC=$(GCC) + +PGMS= $(MAIN) $(MODULES) + +INCDIRS = $(addprefix -I,$(INCPATH)) + +OBJECTS = $(addsuffix .o,$(MODULES)) + +LDFLAGS = $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(CC) -MM -ansi $(INCDIRS) $< -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(INCDIRS) -o $@ + + +# rule to link object files + +$(MAIN): %: %.o $(OBJECTS) Makefile + $(CC) $< $(OBJECTS) $(CFLAGS) $(LDFLAGS) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables and old error log file + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o $(MAIN) +.PHONY: clean + +################################################################################ diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..e12cabc3f3d89890f2ec394d2f380dd1890b9767 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/check1.c @@ -0,0 +1,323 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of det2xt and prod2xt +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "su3fcts.h" +#include "sw_term.h" +#include "forces.h" + +typedef union +{ + spinor_dble s; + weyl_dble w[2]; + double r[24]; +} spin_t; /* one object viewed as a spinor_dble, as two weyl_dble halves or as 24 doubles */ + +typedef union +{ + su3_vector_dble v; + double r[6]; +} vec_t; /* one object viewed as an su3_vector_dble or as 6 doubles */ + +static int pln[6][2]={{0,1},{0,2},{0,3},{2,3},{3,1},{1,2}}; /* the six (mu,nu) index pairs visited by the loops in main() */ +static const su3_vector_dble vd0={{0.0}}; +static const spinor_dble sd0={{{0.0}}}; /* vd0 and sd0 are zero-initialised templates copied into v1.v and s1.s in main() */ + +static su3_dble Q ALIGNED16; +static spin_t s1,s2,s3,s4 ALIGNED16; +static pauli_dble m[2] ALIGNED16; + + +static su3_vector_dble mul_cplx(complex_dble z,su3_vector_dble s) /* returns s with each of its three complex members c1,c2,c3 multiplied by z */ +{ + su3_vector_dble r; + + r.c1.re=z.re*s.c1.re-z.im*s.c1.im; + r.c1.im=z.im*s.c1.re+z.re*s.c1.im; + r.c2.re=z.re*s.c2.re-z.im*s.c2.im; + r.c2.im=z.im*s.c2.re+z.re*s.c2.im; + r.c3.re=z.re*s.c3.re-z.im*s.c3.im; + r.c3.im=z.im*s.c3.re+z.re*s.c3.im; + + return r; +} + + +static spinor_dble mul_gamma(int mu,spinor_dble s) /* returns s with its four members c1..c4 permuted and multiplied by 1, -1, i or -i; the pattern is selected by mu. NOTE(review): presumably the Dirac matrices for mu=0..3 and gamma_5 otherwise, to be confirmed against the project conventions */ +{ + spinor_dble r; + complex_dble i,m_i,m_1; + + i.re=0.0; + i.im=1.0; + + m_i.re=0.0; + m_i.im=-1.0; + + m_1.re=-1.0; + m_1.im=0.0; /* i, m_i and m_1 now hold the constants i, -i and -1 */ + + if (mu==0) + { + r.c1=mul_cplx(m_1,s.c3); + r.c2=mul_cplx(m_1,s.c4); + r.c3=mul_cplx(m_1,s.c1); + r.c4=mul_cplx(m_1,s.c2); + } + 
else if (mu==1) + { + r.c1=mul_cplx(m_i,s.c4); + r.c2=mul_cplx(m_i,s.c3); + r.c3=mul_cplx(i,s.c2); + r.c4=mul_cplx(i,s.c1); + } + else if (mu==2) + { + r.c1=mul_cplx(m_1,s.c4); + r.c2=s.c3; + r.c3=s.c2; + r.c4=mul_cplx(m_1,s.c1); + } + else if (mu==3) + { + r.c1=mul_cplx(m_i,s.c3); + r.c2=mul_cplx(i,s.c4); + r.c3=mul_cplx(i,s.c1); + r.c4=mul_cplx(m_i,s.c2); + } + else /* any mu other than 0..3; main() reaches this branch with mu=5 */ + { + r.c1=s.c1; + r.c2=s.c2; + r.c3=mul_cplx(m_1,s.c3); + r.c4=mul_cplx(m_1,s.c4); + } + + return r; +} + + +static spinor_dble mul_sigma(int mu,int nu,spinor_dble s) /* combines A=mul_gamma(mu,mul_gamma(nu,s)) and B=mul_gamma(nu,mul_gamma(mu,s)) with z=0.5i; the result is z*(A-B) assuming the su3.h macros _vector_sub_assign(r,s) and _vector_mulc(r,c,s) compute r-=s and r=c*s. TODO confirm against su3.h */ +{ + complex_dble z; + spinor_dble r1,r2; + + r1=mul_gamma(nu,s); + r1=mul_gamma(mu,r1); + + r2=mul_gamma(mu,s); + r2=mul_gamma(nu,r2); + + _vector_sub_assign(r1.c1,r2.c1); + _vector_sub_assign(r1.c2,r2.c2); + _vector_sub_assign(r1.c3,r2.c3); + _vector_sub_assign(r1.c4,r2.c4); + + z.re=0.0; + z.im=0.5; + _vector_mulc(r2.c1,z,r1.c1); + _vector_mulc(r2.c2,z,r1.c2); + _vector_mulc(r2.c3,z,r1.c3); + _vector_mulc(r2.c4,z,r1.c4); + + return r2; +} + + +static spinor_dble mul_Fhat(su3_dble Q,spinor_dble s) /* fills F entry by entry with (Q-Q^dagger)/8 and passes each of s.c1..s.c4 through _su3_multiply together with F; the macro comes from su3.h and is assumed to compute r=F*s. TODO confirm */ +{ + su3_dble F; + spinor_dble r; + + F.c11.re=0.0; + F.c11.im=0.25*Q.c11.im; + F.c22.re=0.0; + F.c22.im=0.25*Q.c22.im; + F.c33.re=0.0; + F.c33.im=0.25*Q.c33.im; + + F.c12.re=0.125*(Q.c12.re-Q.c21.re); + F.c12.im=0.125*(Q.c12.im+Q.c21.im); + F.c21.re=-F.c12.re; + F.c21.im=F.c12.im; + + F.c13.re=0.125*(Q.c13.re-Q.c31.re); + F.c13.im=0.125*(Q.c13.im+Q.c31.im); + F.c31.re=-F.c13.re; + F.c31.im=F.c13.im; + + F.c23.re=0.125*(Q.c23.re-Q.c32.re); + F.c23.im=0.125*(Q.c23.im+Q.c32.im); + F.c32.re=-F.c23.re; + F.c32.im=F.c23.im; + + _su3_multiply(r.c1,F,s.c1); + _su3_multiply(r.c2,F,s.c2); + _su3_multiply(r.c3,F,s.c3); + _su3_multiply(r.c4,F,s.c4); + + return r; +} + + +static su3_vector_dble mul_X(u3_alg_dble X,su3_vector_dble s) /* fills the matrix M from the nine real members X.c1..X.c9 (purely imaginary diagonal, lower triangle equal to minus the complex conjugate of the upper one) and returns the result of _su3_multiply(r,M,s) */ +{ + su3_dble M; + su3_vector_dble r; + + M.c11.re=0.0; + M.c11.im=X.c1; + M.c22.re=0.0; + M.c22.im=X.c2; + M.c33.re=0.0; + M.c33.im=X.c3; + + M.c12.re=X.c4; + M.c12.im=X.c5; + M.c21.re=-X.c4; + M.c21.im=X.c5; + + 
M.c13.re=X.c6; + M.c13.im=X.c7; + M.c31.re=-X.c6; + M.c31.im=X.c7; + + M.c23.re=X.c8; + M.c23.im=X.c9; + M.c32.re=-X.c8; + M.c32.im=X.c9; + + _su3_multiply(r,M,s); + + return r; +} + + +int main(void) /* fills the inputs with random numbers, calls det2xt() and prod2xt(), and for each of the six index pairs in pln prints the absolute value of a combination of two separately accumulated sums. NOTE(review): the printed numbers are presumably expected to be at rounding-error level, to be confirmed */ +{ + int n,mu,nu,i; + complex_dble z; + vec_t v1,v2,v3; + u3_alg_dble X[6]; + + printf("\n"); + printf("Check of det2xt and prod2xt\n"); + printf("---------------------------\n\n"); + + rlxd_init(1,23456); /* fixed arguments; presumably this makes the random sequence reproducible, to be confirmed in the random module */ + + ranlxd(v1.r,6); + ranlxd(v2.r,6); + ranlxd(v3.r,6); + + ranlxd(s1.r,24); + ranlxd(s2.r,24); + ranlxd(s3.r,24); + ranlxd(s4.r,24); + + ranlxd(m[0].u,36); + ranlxd(m[1].u,36); + + det2xt(m,X); + + printf("det2xt:\n"); + + for (n=0;n<6;n++) + { + mu=pln[n][0]; + nu=pln[n][1]; + + random_su3_dble(&Q); + z.im=0.0; + + for (i=0;i<12;i++) + { + s1.s=sd0; + s1.r[2*i]=1.0; /* s1 is now a unit spinor: real part of complex component i set to 1, everything else 0 */ + + mul_pauli_dble(0.0,m,s1.w,s2.w); + mul_pauli_dble(0.0,m+1,s1.w+1,s2.w+1); + s1.s=mul_sigma(mu,nu,s2.s); + s2.s=mul_Fhat(Q,s1.s); + + z.im-=s2.r[2*i+1]; /* subtracts the imaginary part of component i of the result, i.e. one diagonal element per unit spinor */ + } + + z.re=0.0; + + for (i=0;i<3;i++) + { + v1.v=vd0; + v1.r[2*i]=1.0; + + v2.v=mul_X(X[n],v1.v); + _su3_multiply(v3.v,Q,v2.v); + + z.re+=v3.r[2*i]; /* adds the real part of component i of v3 for the i-th unit vector */ + } + + printf("mu,nu = %d,%d: %.2e\n", + mu,nu,fabs(2.0*z.re-8.0*z.im)); + } + + ranlxd(s1.r,24); + ranlxd(s2.r,24); + + prod2xt(&s1.s,&s2.s,X); + + printf("\n"); + printf("prod2xt:\n"); + + for (n=0;n<6;n++) + { + mu=pln[n][0]; + nu=pln[n][1]; + + random_su3_dble(&Q); + z.im=0.0; /* this value is overwritten by the plain assignment to z.im further down */ + + s3.s=mul_sigma(mu,nu,s2.s); + s4.s=mul_gamma(5,s3.s); /* mu=5 selects the final else branch of mul_gamma() */ + s3.s=mul_Fhat(Q,s4.s); + + z.im =_vector_prod_im(s1.s.c1,s3.s.c1); + z.im+=_vector_prod_im(s1.s.c2,s3.s.c2); + z.im+=_vector_prod_im(s1.s.c3,s3.s.c3); + z.im+=_vector_prod_im(s1.s.c4,s3.s.c4); + + z.re=0.0; + + for (i=0;i<3;i++) + { + v1.v=vd0; + v1.r[2*i]=1.0; + + v2.v=mul_X(X[n],v1.v); + _su3_multiply(v3.v,Q,v2.v); + + z.re+=v3.r[2*i]; /* same accumulation as in the det2xt loop above */ + } + + printf("mu,nu = %d,%d: %.2e\n", + mu,nu,fabs(2.0*z.re+16.0*z.im)); + } + + printf("\n"); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/check2.c 
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..4bedffaeca45313fad99022e8d89d19dc3e30a37 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/forces/check2.c @@ -0,0 +1,206 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of prod2xv +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "su3fcts.h" +#include "sw_term.h" +#include "forces.h" + +typedef union +{ + su3_dble u; + complex_dble c[9]; +} umat_t; + +static const su3_dble ud0={{0.0}}; +static su3_dble u,v ALIGNED16; +static spinor_dble rx,ry,sx,sy,sw ALIGNED16; + +#define _re(z,w) ((z).re*(w).re+(z).im*(w).im) +#define _im(z,w) ((z).im*(w).re-(z).re*(w).im) + + +static su3_vector_dble mul_cplx(complex_dble z,su3_vector_dble s) +{ + su3_vector_dble r; + + r.c1.re=z.re*s.c1.re-z.im*s.c1.im; + r.c1.im=z.im*s.c1.re+z.re*s.c1.im; + r.c2.re=z.re*s.c2.re-z.im*s.c2.im; + r.c2.im=z.im*s.c2.re+z.re*s.c2.im; + r.c3.re=z.re*s.c3.re-z.im*s.c3.im; + r.c3.im=z.im*s.c3.re+z.re*s.c3.im; + + return r; +} + + +static spinor_dble mul_gamma(int mu,spinor_dble s) +{ + spinor_dble r; + complex_dble i,m_i,m_1; + + i.re=0.0; + i.im=1.0; + + m_i.re=0.0; + m_i.im=-1.0; + + m_1.re=-1.0; + m_1.im=0.0; + + if (mu==0) + { + r.c1=mul_cplx(m_1,s.c3); + r.c2=mul_cplx(m_1,s.c4); + r.c3=mul_cplx(m_1,s.c1); + r.c4=mul_cplx(m_1,s.c2); + } + else if (mu==1) + { + r.c1=mul_cplx(m_i,s.c4); + r.c2=mul_cplx(m_i,s.c3); + r.c3=mul_cplx(i,s.c2); + r.c4=mul_cplx(i,s.c1); + } + else if (mu==2) + { + r.c1=mul_cplx(m_1,s.c4); + r.c2=s.c3; + r.c3=s.c2; + 
r.c4=mul_cplx(m_1,s.c1); + } + else if (mu==3) + { + r.c1=mul_cplx(m_i,s.c3); + r.c2=mul_cplx(i,s.c4); + r.c3=mul_cplx(i,s.c1); + r.c4=mul_cplx(m_i,s.c2); + } + else + { + r.c1=s.c1; + r.c2=s.c2; + r.c3=mul_cplx(m_1,s.c3); + r.c4=mul_cplx(m_1,s.c4); + } + + return r; +} + + +static void add_tensor(su3_vector_dble *r,su3_vector_dble *s,su3_dble *p) +{ + (*p).c11.re+=_re((*r).c1,(*s).c1); + (*p).c11.im+=_im((*r).c1,(*s).c1); + (*p).c12.re+=_re((*r).c1,(*s).c2); + (*p).c12.im+=_im((*r).c1,(*s).c2); + (*p).c13.re+=_re((*r).c1,(*s).c3); + (*p).c13.im+=_im((*r).c1,(*s).c3); + + (*p).c21.re+=_re((*r).c2,(*s).c1); + (*p).c21.im+=_im((*r).c2,(*s).c1); + (*p).c22.re+=_re((*r).c2,(*s).c2); + (*p).c22.im+=_im((*r).c2,(*s).c2); + (*p).c23.re+=_re((*r).c2,(*s).c3); + (*p).c23.im+=_im((*r).c2,(*s).c3); + + (*p).c31.re+=_re((*r).c3,(*s).c1); + (*p).c31.im+=_im((*r).c3,(*s).c1); + (*p).c32.re+=_re((*r).c3,(*s).c2); + (*p).c32.im+=_im((*r).c3,(*s).c2); + (*p).c33.re+=_re((*r).c3,(*s).c3); + (*p).c33.im+=_im((*r).c3,(*s).c3); +} + + +static double max_dev(su3_dble *u,su3_dble *v) +{ + int i; + double nrm,dev; + umat_t uu,uv; + + uu.u=(*u); + uv.u=(*v); + + nrm=0.0; + dev=0.0; + + for (i=0;i<9;i++) + { + nrm+=uu.c[i].re*uu.c[i].re+uu.c[i].im*uu.c[i].im; + + dev+=(uu.c[i].re-uv.c[i].re)*(uu.c[i].re-uv.c[i].re)+ + (uu.c[i].im-uv.c[i].im)*(uu.c[i].im-uv.c[i].im); + } + + return sqrt(dev/nrm); +} + + +int main(void) +{ + int mu; + + printf("\n"); + printf("Check of prod2xv\n"); + printf("-----------------\n\n"); + + rlxd_init(1,567); + + gauss_dble((double*)(&rx),24); + gauss_dble((double*)(&ry),24); + gauss_dble((double*)(&sx),24); + gauss_dble((double*)(&sy),24); + + for (mu=0;mu<4;mu++) + { + prod2xv[mu](&rx,&ry,&sx,&sy,&u); + v=ud0; + + sw=mul_gamma(mu,ry); + _vector_sub(sw.c1,ry.c1,sw.c1); + _vector_sub(sw.c2,ry.c2,sw.c2); + _vector_sub(sw.c3,ry.c3,sw.c3); + _vector_sub(sw.c4,ry.c4,sw.c4); + sw=mul_gamma(5,sw); + + add_tensor(&sw.c1,&sx.c1,&v); + add_tensor(&sw.c2,&sx.c2,&v); + 
add_tensor(&sw.c3,&sx.c3,&v); + add_tensor(&sw.c4,&sx.c4,&v); + + sw=mul_gamma(mu,sy); + _vector_sub(sw.c1,sy.c1,sw.c1); + _vector_sub(sw.c2,sy.c2,sw.c2); + _vector_sub(sw.c3,sy.c3,sw.c3); + _vector_sub(sw.c4,sy.c4,sw.c4); + sw=mul_gamma(5,sw); + + add_tensor(&sw.c1,&rx.c1,&v); + add_tensor(&sw.c2,&rx.c2,&v); + add_tensor(&sw.c3,&rx.c3,&v); + add_tensor(&sw.c4,&rx.c4,&v); + + printf("mu = %d: %.2e\n",mu,max_dev(&u,&v)); + } + + printf("\n"); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..b43d50153001c466be2f329a622d5f5a0394f1ec --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/INDEX @@ -0,0 +1,10 @@ + +Complex matrix functions + +check1 Check of cmat_vec, cmat_add, ... + +check2 Check of cmat_vec_dble, cmat_add_dble, ... 
+ +time1 Timing of cmat_vec and cmat_mul + +time2 Timing of cmat_vec_dble, cmat_mul_dble and cmat_inv_dble diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..9337e77ae9ea19bf857cd9dc44bdf3376f61b9e5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/Makefile @@ -0,0 +1,119 @@ +################################################################################ +# +# Makefile to compile and link C programs +# +# Version valid for Linux machines +# +# "make" compiles and links the specified main programs and modules +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files created by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and required modules + +MAIN = check1 check2 time1 time2 + +RANDOM = ranlxs ranlxd gauss + +UTILS = utils + +LINALG = cmatrix cmatrix_dble + +MODULES = $(RANDOM) $(UTILS) $(LINALG) + + +# search path for modules + +MDIR = ../../../modules + +VPATH = $(MDIR)/nompi/utils:$(MDIR)/random:$(MDIR)/linalg + + +# additional include directories + +INCPATH = ../../../include/nompi ../../../include + + +# additional libraries to be included + +LIBS = m + +LIBPATH = + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing -fno-inline \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + +# -Dx64 + +############################## do not change ################################### + +SHELL=/bin/bash + +CC=$(GCC) + +PGMS= $(MAIN) $(MODULES) + +INCDIRS = $(addprefix -I,$(INCPATH)) + +OBJECTS = $(addsuffix .o,$(MODULES)) + 
+LDFLAGS = $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(CC) -MM -ansi $(INCDIRS) $< -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(INCDIRS) -o $@ + + +# rule to link object files + +$(MAIN): %: %.o $(OBJECTS) Makefile + $(CC) $< $(OBJECTS) $(CFLAGS) $(LDFLAGS) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables and old error log file + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..fedca05691fcef6e4f74ef6b9930808022fb8774 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/check1.c @@ -0,0 +1,387 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2007, 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of cmat_vec, cmat_add, ... 
+* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "linalg.h" + +#define NMAX 32 + +#define cadd(u,v,w) \ + (u).re=(v).re+(w).re;\ + (u).im=(v).im+(w).im + +#define csub(u,v,w) \ + (u).re=(v).re-(w).re;\ + (u).im=(v).im-(w).im + +#define cmul(u,v,w) \ + (u).re=(v).re*(w).re-(v).im*(w).im;\ + (u).im=(v).re*(w).im+(v).im*(w).re + +#define cmul_assign(u,v,w) \ + (u).re+=((v).re*(w).re-(v).im*(w).im);\ + (u).im+=((v).re*(w).im+(v).im*(w).re) + + +static void mvec(int n,complex *a,complex *v,complex *w) +{ + int i,j; + complex z; + + for (i=0;idmax) + dmax=d; + } + + return dmax; +} + + +static float mdev(int n,complex *a,complex *b) +{ + int i,j; + float d,dmax; + + dmax=0.0f; + + for (i=0;idmax) + dmax=d; + } + } + + return dmax; +} + + +static void rvec(int n,complex *v) +{ + gauss((float*)(v),2*n); +} + + +static void rmat(int n,complex *a) +{ + int i,j; + float r; + + r=0.1f/(float)(n*n); + + gauss((float*)(a),2*n*n); + + for (i=0;id1) + d1=d; + + error((mdev(n,a1,a2)!=0.0f)||(vdev(n,v1,v2)!=0.0f),1,"main [check1.c]", + "cmat_vec: input values have changed"); + + rvec(n,v1); + rvec(n,w1); + rmat(n,a1); + vec2vec(n,v1,v2); + vec2vec(n,w1,w2); + mat2mat(n,a1,a2); + + cmat_vec_assign(n,a1,v1,w1); + mvec_assign(n,a2,v2,w2); + + d=vdev(n,w1,w2); + if (d>d1) + d1=d; + + error((mdev(n,a1,a2)!=0.0f)||(vdev(n,v1,v2)!=0.0f),1,"main [check1.c]", + "cmat_vec_assign: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + rmat(n,c1); + mat2mat(n,a1,a2); + mat2mat(n,b1,b2); + rmat(n,c2); + + cmat_add(n,a1,b1,c1); + madd(n,a2,b2,c2); + + d=mdev(n,c1,c2); + if (d>d2) + d2=d; + + error((mdev(n,a1,a2)!=0.0f)||(mdev(n,b1,b2)!=0.0f),1,"main [check1.c]", + "cmat_add: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + rmat(n,c1); + mat2mat(n,a1,a2); + mat2mat(n,b1,b2); + rmat(n,c2); + + cmat_sub(n,a1,b1,c1); + msub(n,a2,b2,c2); + + 
d=mdev(n,c1,c2); + if (d>d3) + d3=d; + + error((mdev(n,a1,a2)!=0.0f)||(mdev(n,b1,b2)!=0.0f),1,"main [check1.c]", + "cmat_sub: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + rmat(n,c1); + mat2mat(n,a1,a2); + mat2mat(n,b1,b2); + rmat(n,c2); + + cmat_mul(n,a1,b1,c1); + mmul(n,a2,b2,c2); + + d=mdev(n,c1,c2); + if (d>d4) + d4=d; + + error((mdev(n,a1,a2)!=0.0f)||(mdev(n,b1,b2)!=0.0f),1,"main [check1.c]", + "cmat_mul: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + mat2mat(n,a1,a2); + rmat(n,b2); + + cmat_dag(n,a1,b1); + mdag(n,a2,b2); + + d=mdev(n,b1,b2); + if (d>d5) + d5=d; + + error(mdev(n,a1,a2)!=0.0f,1,"main [check1.c]", + "cmat_dag: input values have changed"); + } + + printf("Consider matrices of size up to %dx%d\n\n",NMAX,NMAX); + + printf("The maximal observed deviations are:\n\n"); + printf("cmat_vec: %.1e\n",d1); + printf("cmat_add: %.1e\n",d2); + printf("cmat_sub: %.1e\n",d3); + printf("cmat_mul: %.1e\n",d4); + printf("cmat_dag: %.1e\n\n",d5); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..d8e249d9573a323d7d1284127cfa585d2b30d654 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/check2.c @@ -0,0 +1,436 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2007, 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of cmat_vec_dble, cmat_add_dble, ... 
+* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "linalg.h" + +#define NMAX 32 + +#define cadd(u,v,w) \ + (u).re=(v).re+(w).re;\ + (u).im=(v).im+(w).im + +#define csub(u,v,w) \ + (u).re=(v).re-(w).re;\ + (u).im=(v).im-(w).im + +#define cmul(u,v,w) \ + (u).re=(v).re*(w).re-(v).im*(w).im;\ + (u).im=(v).re*(w).im+(v).im*(w).re + +#define cmul_assign(u,v,w) \ + (u).re+=((v).re*(w).re-(v).im*(w).im);\ + (u).im+=((v).re*(w).im+(v).im*(w).re) + + +static void mvec(int n,complex_dble *a,complex_dble *v,complex_dble *w) +{ + int i,j; + complex_dble z; + + for (i=0;idmax) + dmax=d; + } + + return dmax; +} + + +static double mdev(int n,complex_dble *a,complex_dble *b) +{ + int i,j; + double d,dmax; + + dmax=0.0; + + for (i=0;idmax) + dmax=d; + } + } + + return dmax; +} + + +static void rvec(int n,complex_dble *v) +{ + gauss_dble((double*)(v),2*n); +} + + +static void rmat(int n,complex_dble *a) +{ + int i,j; + double r; + + r=0.1/(double)(n*n); + + gauss_dble((double*)(a),2*n*n); + + for (i=0;id1) + d1=d; + + error((mdev(n,a1,a2)!=0.0)||(vdev(n,v1,v2)!=0.0),1,"main [check2.c]", + "cmat_vec_dble: input values have changed"); + + rvec(n,v1); + rvec(n,w1); + rmat(n,a1); + vec2vec(n,v1,v2); + vec2vec(n,w1,w2); + mat2mat(n,a1,a2); + + cmat_vec_assign_dble(n,a1,v1,w1); + mvec_assign(n,a2,v2,w2); + + d=vdev(n,w1,w2); + if (d>d1) + d1=d; + + error((mdev(n,a1,a2)!=0.0)||(vdev(n,v1,v2)!=0.0),1,"main [check2.c]", + "cmat_vec_assign_dble: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + rmat(n,c1); + mat2mat(n,a1,a2); + mat2mat(n,b1,b2); + rmat(n,c2); + + cmat_add_dble(n,a1,b1,c1); + madd(n,a2,b2,c2); + + d=mdev(n,c1,c2); + if (d>d2) + d2=d; + + error((mdev(n,a1,a2)!=0.0)||(mdev(n,b1,b2)!=0.0),1,"main [check2.c]", + "cmat_add_dble: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + rmat(n,c1); + mat2mat(n,a1,a2); + 
mat2mat(n,b1,b2); + rmat(n,c2); + + cmat_sub_dble(n,a1,b1,c1); + msub(n,a2,b2,c2); + + d=mdev(n,c1,c2); + if (d>d3) + d3=d; + + error((mdev(n,a1,a2)!=0.0)||(mdev(n,b1,b2)!=0.0),1,"main [check2.c]", + "cmat_sub_dble: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + rmat(n,c1); + mat2mat(n,a1,a2); + mat2mat(n,b1,b2); + rmat(n,c2); + + cmat_mul_dble(n,a1,b1,c1); + mmul(n,a2,b2,c2); + + d=mdev(n,c1,c2); + if (d>d4) + d4=d; + + error((mdev(n,a1,a2)!=0.0)||(mdev(n,b1,b2)!=0.0),1,"main [check2.c]", + "cmat_mul_dble: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + mat2mat(n,a1,a2); + rmat(n,b2); + + cmat_dag_dble(n,a1,b1); + mdag(n,a2,b2); + + d=mdev(n,b1,b2); + if (d>d5) + d5=d; + + error(mdev(n,a1,a2)!=0.0,1,"main [check2.c]", + "cmat_dag_dble: input values have changed"); + + rmat(n,a1); + rmat(n,b1); + mat2mat(n,a1,a2); + rmat(n,b2); + rmat(n,c2); + + ie=cmat_inv_dble(n,a1,b1,&k1); + mmul(n,a2,b1,b2); + mmul(n,a2,b2,c2); + + d=mdev(n,a2,c2); + if (d>d6) + d6=d; + + if (k1>kmax) + kmax=k1; + + k2=fnorm(n,a1)*fnorm(n,b1); + d=fabs(k2/k1-1.0); + if (d>d7) + d7=d; + + error(ie!=0,1,"main [check2.c]", + "cmat_inv_dble: singular matrix encountered"); + + error(mdev(n,a1,a2)!=0.0,1,"main [check2.c]", + "cmat_inv_dble: input values have changed"); + } + + printf("Consider matrices of size up to %dx%d\n\n",NMAX,NMAX); + + printf("The maximal observed deviations are:\n\n"); + printf("cmat_vec_dble: %.1e\n",d1); + printf("cmat_add_dble: %.1e\n",d2); + printf("cmat_sub_dble: %.1e\n",d3); + printf("cmat_mul_dble: %.1e\n",d4); + printf("cmat_dag_dble: %.1e\n",d5); + printf("cmat_inv_dble: %.1e, condition number: max=%.1e, dev=%.1e\n\n", + d6,kmax,d7); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/time1.c new file mode 100644 index 
0000000000000000000000000000000000000000..5d4880a9cfd04e965d2d014f5b9dad500d6c741d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/linalg/time1.c @@ -0,0 +1,100 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2007, 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of cmat_vec and cmat_mul +* +*******************************************************************************/ + +#include +#include +#include +#include +#include "random.h" +#include "su3.h" +#include "utils.h" +#include "linalg.h" + + +int main(void) +{ + int ir,nm,n,count; + double t1,t2,dt; + complex *a,*b,*c,*v,*w; + + printf("\n"); + printf("Timing of cmat_vec and cmat_mul\n"); + printf("-------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + printf("Matrix size: "); + ir=scanf(" %d",&nm); + + error((ir!=1)||(nm<1),1,"main [time1.c]", + "Read error or improper matrix size"); + + a=amalloc((3*nm*nm+2*nm)*sizeof(*a),4); + error(a==NULL,1,"main [time1.c]","Unable to allocate auxiliary arrays"); + + rlxs_init(0,23456); + ranlxs((float*)(a),6*nm*nm+4*nm); + + b=a+nm*nm; + c=b+nm*nm; + v=c+nm*nm; + w=v+nm; + + n=(int)(1.0e7)/(nm*nm); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;count +#include +#include +#include +#include "random.h" +#include "su3.h" +#include "utils.h" +#include "linalg.h" + + +int main(void) +{ + int ie,ir,nm,n,count; + double k,t1,t2,dt; + complex_dble *a,*b,*c,*v,*w; + + printf("\n"); + printf("Timing of cmat_vec_dble, cmat_mul_dble and cmat_inv_dble\n"); + printf("--------------------------------------------------------\n\n"); + +#if (defined AVX) + 
printf("Using AVX instructions\n\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + printf("Matrix size: "); + ir=scanf(" %d",&nm); + + error((ir!=1)||(nm<1),1,"main [time2.c]", + "Read error or improper matrix size"); + + a=amalloc((3*nm*nm+2*nm)*sizeof(*a),6); + error(a==NULL,1,"main [time2.c]","Unable to allocate auxiliary arrays"); + + rlxd_init(1,23456); + ranlxd((double*)(a),6*nm*nm+4*nm); + + b=a+nm*nm; + c=b+nm*nm; + v=c+nm*nm; + w=v+nm; + + n=(int)(1.0e7)/(nm*nm); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;count.dat produced by the + simulation programs qcd1 and ym1. + +read2 Reads the data files produced by the measurement + program ms1. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..2ee788070bcfa33bdc7e677b20a87bf356d2cfa5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/Makefile @@ -0,0 +1,115 @@ +################################################################################ +# +# Makefile to compile and link C programs +# +# Version valid for Linux machines +# +# "make" compiles and links the specified main programs and modules +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files created by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and required modules + +MAIN = read1 read2 + +EXTRAS = stat + +UTILS = utils mutils endian + +MODULES = $(EXTRAS) $(UTILS) + + +# search path for modules + +MDIR = ../../../modules + +VPATH = 
.:$(MDIR)/nompi/extras:$(MDIR)/nompi/utils + + +# additional include directories + +INCPATH = ../../../include/nompi + + +# additional libraries to be included + +LIBS = m + +LIBPATH = + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fno-strict-aliasing \ + -Wall -Wstrict-prototypes -Werror -O + + +############################## do not change ################################### + +SHELL=/bin/bash + +CC=$(GCC) + +PGMS= $(MAIN) $(MODULES) + +INCDIRS = $(addprefix -I,$(INCPATH)) + +OBJECTS = $(addsuffix .o,$(MODULES)) + +LDFLAGS = $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(CC) -MM -ansi $(INCDIRS) $< -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(INCDIRS) -o $@ + + +# rule to link object files + +$(MAIN): %: %.o $(OBJECTS) Makefile + $(CC) $< $(OBJECTS) $(CFLAGS) $(LDFLAGS) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables and old error log file + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/read1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/read1.c new file mode 100644 index 0000000000000000000000000000000000000000..7f23a163c52ec4de6c5022ed06051b16dc253cb5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/read1.c @@ -0,0 +1,328 @@ + +/******************************************************************************* +* +* File read1.c +* +* Copyright (C) 2010-2014 Martin 
Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Reads and evaluates data from the *.dat files created by the programs qcd1 +* and ym1. The file to be read has to be specified on the command line. +* +* This program writes the history of the MD energy deficit dH, the acceptance +* flag iac and the average plaquette to the file .run1.dat in the +* plots directory. In addition, some information about the distribution of dH +* and the integrated autocorrelation time of the plaquette are printed to +* stdout. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "utils.h" +#include "extras.h" + +typedef struct +{ + int nt,iac; + double dH,avpl; +} dat_t; + +static int nms,nfirst,nlast,neff; +static dat_t *adat; + + +static int read_dat(int n,dat_t *ndat,FILE *fin) +{ + int i,ir,ic,endian; + stdint_t istd[2]; + double dstd[2]; + + endian=endianness(); + ic=0; + + for (i=0;iamx); + + return (double)(ic)/(double)(n); +} + + +static double f(int nx,double x[]) +{ + return x[0]; +} + + +static void print_plot(char *fin) +{ + int n,ims; + char base[NAME_SIZE],plt_file[NAME_SIZE],*p; + dat_t *ndat; + FILE *fout; + + p=strstr(fin,".dat"); + error(p==NULL,1,"print_plot [read1.c]","Unexpected data file name"); + n=p-fin; + + p=strrchr(fin,'/'); + if (p==NULL) + p=fin; + else + p+=1; + n-=(p-fin); + + error(n>=NAME_SIZE,1,"print_plot [read1.c]","File name is too long"); + strncpy(base,p,n); + base[n]='\0'; + + error(name_size("plots/%s.run1.dat",base)>=NAME_SIZE,1, + "print_plot [read1.c]","File name is too long"); + sprintf(plt_file,"plots/%s.run1.dat",base); + fout=fopen(plt_file,"w"); + error(fout==NULL,1,"print_plot [read1.c]", + "Unable to open output file"); + + fprintf(fout,"#\n"); + fprintf(fout,"# Data written by the program ym1 or qcd1\n"); + fprintf(fout,"# ---------------------------------------\n"); + 
fprintf(fout,"#\n"); + fprintf(fout,"# Number of measurements = %d\n",nms); + fprintf(fout,"#\n"); + fprintf(fout,"# nt: trajectory number\n"); + fprintf(fout,"# dH: MD energy deficit\n"); + fprintf(fout,"# iac: acceptance flag\n"); + fprintf(fout,"#\n"); + fprintf(fout,"# nt dH iac \n"); + fprintf(fout,"#\n"); + + ndat=adat; + + for (ims=0;ims"); + + printf("\n"); + printf("HMC simulation of QCD\n"); + printf("---------------------\n\n"); + + read_file(argv[1]); + select_range(); + + a=malloc(neff*sizeof(double)); + error(a==NULL,1,"main [read1.c]", + "Unable to allocate data array"); + + for (n=0;n = %.3f (%.3f)\n", + average(neff,a),sigma0(neff,a)); + + for (n=0;n0.0) + a[n]=exp(-adat[nfirst+n].dH); + else + a[n]=1.0; + } + + printf(" = %.3f (%.3f)\n", + average(neff,a),sigma0(neff,a)); + + for (n=0;n = %.3f (%.3f)\n\n", + average(neff,a),sigma0(neff,a)); + + for (n=0;n100) + printf("using the\nnumerically determined " + "autocorrelation function.\n\n"); + else + printf("by binning the\ndata and by calculating " + "the jackknife errors of the binned series.\n\n"); + + printf("The autocorrelation times are given in numbers of measurements\n" + "separated by %d trajectories.\n\n", + adat[nfirst+1].nt-adat[nfirst].nt); + + if (neff>=100) + abar=print_auto(neff,a); + else + abar=print_jack(1,neff,&a,f); + + printf(" = %1.6f\n\n",abar); + + print_plot(argv[1]); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/read2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/read2.c new file mode 100644 index 0000000000000000000000000000000000000000..badca6542645cf5b1e807b8e6a52371923076390 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/main/read2.c @@ -0,0 +1,606 @@ +/******************************************************************************* +* +* File read2.c +* +* Copyright (C) 2012-2014 Martin Luescher +* +* This software is distributed 
under the terms of the GNU General Public +* License (GPL) +* +* Reads and evaluates data from the data files created by the program ms1. +* The file to be read has to be specified on the command line. +* +* This program writes the history of the measured normalized reweighting +* factors to the file .run2.dat in the plots directory. The +* associated integrated autocorrelation times are estimated and printed +* to stdout. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "utils.h" +#include "extras.h" + +static struct +{ + int nrw; + int *nfct,*nsrc; +} file_head; + +static struct +{ + int nc; + double ***sqn,***lnr; +} data; + +static int endian; +static int first,last,step,nms; +static double ***avrw,***lnrw,*avtot,*lntot; + + +static void read_file_head(FILE *fdat) +{ + int nrw,*nfct,*nsrc; + int ir,ie,irw; + stdint_t istd[1]; + + ir=fread(istd,sizeof(stdint_t),1,fdat); + error(ir!=1,1,"read_file_head [read2.c]", + "Incorrect read count"); + + if (endian==BIG_ENDIAN) + bswap_int(1,istd); + + nrw=(int)(istd[0]); + error(nrw<1,1,"read_file_head [read2.c]", + "nrw is out of range"); + + nfct=malloc(2*nrw*sizeof(*nfct)); + error(nfct==NULL,1,"read_file_head [read2.c]", + "Unable to allocate data arrays"); + nsrc=nfct+nrw; + ie=0; + + for (irw=0;irwlst) + { + last=last-((last-lst)/step)*step; + if (last>lst) + last-=step; + } + + error((last=first)&&(nc<=last)&&(((nc-first)%step)==0)) + { + data2avrw(ims); + ims+=1; + } + } + + fclose(fdat); + error((ims!=nms)||(data.nc!=last),1,"read_file [read2.c]", + "Incorrect read count"); + + normalize_avrw(); +} + + +static double f(int nx,double x[]) +{ + return x[0]; +} + + +static void print_plot(char *fin) +{ + int n,nrw,irw,ims; + char base[NAME_SIZE],plt_file[NAME_SIZE],*p; + FILE *fout; + + p=strstr(fin,".ms1.dat"); + error(p==NULL,1,"print_plot [read2.c]","Unexpected data file name"); + n=p-fin; + + 
p=strrchr(fin,'/'); + if (p==NULL) + p=fin; + else + p+=1; + n-=(p-fin); + + error(n>=NAME_SIZE,1,"print_plot [read2.c]","File name is too long"); + strncpy(base,p,n); + base[n]='\0'; + + error(name_size("plots/%s.run2.dat",base)>=NAME_SIZE,1, + "print_plot [read2.c]","File name is too long"); + sprintf(plt_file,"plots/%s.run2.dat",base); + fout=fopen(plt_file,"w"); + error(fout==NULL,1,"print_plot [read2.c]", + "Unable to open output file"); + + nrw=file_head.nrw; + + fprintf(fout,"#\n"); + fprintf(fout,"# Data written by the program ms1\n"); + fprintf(fout,"# -------------------------------\n"); + fprintf(fout,"#\n"); + fprintf(fout,"# Number of measurements = %d\n",nms); + fprintf(fout,"#\n"); + fprintf(fout,"# nc: Configuration number\n"); + fprintf(fout,"# W: Normalized reweighting factors\n"); + fprintf(fout,"#\n"); + fprintf(fout,"# nc"); + + for (irw=0;irw"); + + printf("\n"); + printf("History of reweighting factors\n"); + printf("------------------------------\n\n"); + + read_file(argv[1]); + nrw=file_head.nrw; + nfct=file_head.nfct; + nsrc=file_head.nsrc; + + printf("The total number of measurements is %d.\n",nms); + printf("Integrated autocorrelation times and associated errors are "); + printf("estimated\n"); + + if (nms>100) + printf("using the numerically determined autocorrelation function.\n"); + else + printf("by binning and calculating jackknife errors.\n"); + + printf("Autocorrelation times are given in numbers of measurements.\n\n"); + + for (irw=0;irw1) + printf("Factorized into %d factors.\n",nfct[irw]); + if (nsrc[irw]>1) + printf("Using %d random source fields.\n\n",nsrc[irw]); + else + printf("Using 1 random source field.\n\n"); + + if (nms>=100) + print_auto(nms,avrw[irw][0]); + else + print_jack(1,nms,avrw[irw],f); + + printf("\n"); + } + + if (nrw!=1) + { + printf("Product of all reweighting factors:\n\n"); + + if (nms>=100) + print_auto(nms,avtot); + else + print_jack(1,nms,&avtot,f); + + printf("\n"); + } + + print_plot(argv[1]); + 
exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..39e3e1f63d02f94850ae3f4d22724fbe7032d93d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/INDEX @@ -0,0 +1,27 @@ + +Random number generation and related programs + + +check1 Correctness of ranlxs and ranlxd + +check2 Save state of ranlxs to a file and reset the generator + from the data on the file + +check3 Save state of ranlxd to a file and reset the generator + from the data on the file + +check4 Kolmogorov-Smirnov test of the random distribution produced + by gauss and gauss_dble + +check5 Statistical test of random_su3 + +check6 Statistical test of random_su3_dble + +check7 Reweighting of gaussian distributions [statistical test of + gauss_dble()] + +time1 Timing of ranlxs and gauss + +time2 Timing of ranlxd and gauss_dble + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..ad85d1a9a64cd6eb701ab9a0df1162a0bca6b766 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/Makefile @@ -0,0 +1,122 @@ +################################################################################ +# +# Makefile to compile and link C programs +# +# Version valid for Linux machines +# +# "make" compiles and links the specified main programs and modules +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files created by "make" +# +# Dependencies on included files are automatically taken care of +# 
+################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and required modules + +MAIN = check1 check2 check3 check4 check5 check6 check7\ + time1 time2 + +RANDOM = ranlxs ranlxd gauss random_su3 + +UTILS = utils + +SU3FCTS = su3prod su3ren + +EXTRAS = chebyshev ks_test pchi_square stat + +MODULES = $(RANDOM) $(UTILS) $(SU3FCTS) $(EXTRAS) + + +# search path for modules + +MDIR = ../../../modules + +VPATH = $(MDIR)/nompi/extras:$(MDIR)/nompi/utils:\ + $(MDIR)/random:$(MDIR)/su3fcts + + +# additional include directories + +INCPATH = ../../../include/nompi ../../../include + + +# additional libraries to be included + +LIBS = m + +LIBPATH = + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing -fno-inline \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash + +CC=$(GCC) + +PGMS= $(MAIN) $(MODULES) + +INCDIRS = $(addprefix -I,$(INCPATH)) + +OBJECTS = $(addsuffix .o,$(MODULES)) + +LDFLAGS = $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(CC) -MM -ansi $(INCDIRS) $< -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(INCDIRS) -o $@ + + +# rule to link object files + +$(MAIN): %: %.o $(OBJECTS) Makefile + $(CC) $< $(OBJECTS) $(CFLAGS) $(LDFLAGS) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables and old error log file + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o .tmp $(MAIN) +.PHONY: clean + 
+################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..26a42c22b0c0ea2e5081071396263f8a1eb9746c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check1.c @@ -0,0 +1,315 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* This program checks that ranlxs and ranlxd implement the basic algorithm +* correctly +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" + +#define NXS 204 +#define NXD 99 + + +int main(void) +{ + int k,test1,test2; + int *state1,*state2; + float sbase; + float xs[NXS],ys[NXS],xsn[96]; + double base; + double xd[NXD],yd[NXD],xdn[48]; + + sbase=(float)(ldexp(1.0,24)); + base=ldexp(1.0,48); + state1=malloc(rlxs_size()*sizeof(int)); + state2=malloc(rlxd_size()*sizeof(int)); + + rlxs_init(0,32767); + rlxd_init(1,32767); + + +/******************************************************************************* +* +* Check that the correct sequences of random numbers are obtained +* +*******************************************************************************/ + + for (k=0;k<20;k++) + { + ranlxs(xs,NXS); + ranlxd(xd,NXD); + } + + xsn[0]=13257445.0f; + xsn[1]=15738482.0f; + xsn[2]=5448599.0f; + xsn[3]=9610459.0f; + xsn[4]=1046025.0f; + xsn[5]=2811360.0f; + xsn[6]=14923726.0f; + xsn[7]=2287739.0f; + xsn[8]=16133204.0f; + xsn[9]=16328320.0f; + xsn[10]=12980218.0f; + xsn[11]=9256959.0f; + xsn[12]=5633754.0f; + xsn[13]=7422961.0f; + 
xsn[14]=6032411.0f; + xsn[15]=14970828.0f; + xsn[16]=10717272.0f; + xsn[17]=2520878.0f; + xsn[18]=8906135.0f; + xsn[19]=8507426.0f; + xsn[20]=11925022.0f; + xsn[21]=12042827.0f; + xsn[22]=12263021.0f; + xsn[23]=4828801.0f; + xsn[24]=5300508.0f; + xsn[25]=13346776.0f; + xsn[26]=10869790.0f; + xsn[27]=8520207.0f; + xsn[28]=11213953.0f; + xsn[29]=14439320.0f; + xsn[30]=5716476.0f; + xsn[31]=13600448.0f; + xsn[32]=12545579.0f; + xsn[33]=3466523.0f; + xsn[34]=113906.0f; + xsn[35]=10407879.0f; + xsn[36]=12058596.0f; + xsn[37]=4390921.0f; + xsn[38]=1634350.0f; + xsn[39]=9823280.0f; + xsn[40]=12569690.0f; + xsn[41]=8267856.0f; + xsn[42]=5869501.0f; + xsn[43]=7210219.0f; + xsn[44]=1362361.0f; + xsn[45]=2956909.0f; + xsn[46]=504465.0f; + xsn[47]=6664636.0f; + xsn[48]=6048963.0f; + xsn[49]=1098525.0f; + xsn[50]=1261330.0f; + xsn[51]=2401071.0f; + xsn[52]=8087317.0f; + xsn[53]=1293933.0f; + xsn[54]=555494.0f; + xsn[55]=14872475.0f; + xsn[56]=11261534.0f; + xsn[57]=166813.0f; + xsn[58]=13424516.0f; + xsn[59]=15280818.0f; + xsn[60]=4644497.0f; + xsn[61]=6333595.0f; + xsn[62]=10012569.0f; + xsn[63]=6878028.0f; + xsn[64]=9176136.0f; + xsn[65]=8379433.0f; + xsn[66]=11073957.0f; + xsn[67]=2465529.0f; + xsn[68]=13633550.0f; + xsn[69]=12721649.0f; + xsn[70]=569725.0f; + xsn[71]=6375015.0f; + xsn[72]=2164250.0f; + xsn[73]=6725885.0f; + xsn[74]=7223108.0f; + xsn[75]=4890858.0f; + xsn[76]=11298261.0f; + xsn[77]=12086020.0f; + xsn[78]=4447706.0f; + xsn[79]=1164782.0f; + xsn[80]=1904399.0f; + xsn[81]=16669839.0f; + xsn[82]=2586766.0f; + xsn[83]=3605708.0f; + xsn[84]=15761082.0f; + xsn[85]=14937769.0f; + xsn[86]=13965017.0f; + xsn[87]=2175021.0f; + xsn[88]=16668997.0f; + xsn[89]=13996602.0f; + xsn[90]=6313099.0f; + xsn[91]=15646036.0f; + xsn[92]=9746447.0f; + xsn[93]=9596781.0f; + xsn[94]=9244169.0f; + xsn[95]=4731726.0f; + + xdn[0]=135665102723086.0; + xdn[1]=259840970195871.0; + xdn[2]=110726726657103.0; + xdn[3]=53972500363809.0; + xdn[4]=199301297412157.0; + xdn[5]=63744794353870.0; + 
xdn[6]=178745978725904.0; + xdn[7]=243549380863176.0; + xdn[8]=244796821836177.0; + xdn[9]=223788809121855.0; + xdn[10]=113720856430443.0; + xdn[11]=124607822268499.0; + xdn[12]=25705458431399.0; + xdn[13]=155476863764950.0; + xdn[14]=195602097736933.0; + xdn[15]=183038707238950.0; + xdn[16]=62268883953527.0; + xdn[17]=157047615112119.0; + xdn[18]=58134973897037.0; + xdn[19]=26908869337679.0; + xdn[20]=259927185454290.0; + xdn[21]=130534606773507.0; + xdn[22]=205295065526788.0; + xdn[23]=40201323262686.0; + xdn[24]=193822255723177.0; + xdn[25]=239720285097881.0; + xdn[26]=54433631586673.0; + xdn[27]=31313178820772.0; + xdn[28]=152904879618865.0; + xdn[29]=256187025780734.0; + xdn[30]=110292144635528.0; + xdn[31]=26555117184469.0; + xdn[32]=228913371644996.0; + xdn[33]=126837665590799.0; + xdn[34]=141069100232139.0; + xdn[35]=96171028602910.0; + xdn[36]=259271018918511.0; + xdn[37]=65257892816619.0; + xdn[38]=14254344610711.0; + xdn[39]=137794868158301.0; + xdn[40]=269703238916504.0; + xdn[41]=35782602710520.0; + xdn[42]=51447305327263.0; + xdn[43]=247852246697199.0; + xdn[44]=65072958134912.0; + xdn[45]=273325640150591.0; + xdn[46]=2768714666444.0; + xdn[47]=173907458721736.0; + + test1=0; + test2=0; + + for (k=0;k<96;k++) + { + if (xsn[k]!=(xs[k+60]*sbase)) + test1=1; + } + + for (k=0;k<48;k++) + { + if (xdn[k]!=(xd[k+39]*base)) + test2=1; + } + + if (test1==1) + { + printf("\n"); + printf("Test failed: ranlxs gives incorrect results\n"); + printf("=> do not use ranlxs on this machine\n"); + printf("\n"); + } + + if (test2==1) + { + printf("\n"); + printf("Test failed: ranlxd gives incorrect results\n"); + printf("=> do not use ranlxd on this machine\n"); + printf("\n"); + } + + +/******************************************************************************* +* +* Check of the I/O routines +* +*******************************************************************************/ + + rlxs_get(state1); + rlxd_get(state2); + + for (k=0;k<10;k++) + { + ranlxs(xs,NXS); + 
ranlxd(xd,NXD); + } + + rlxs_reset(state1); + rlxd_reset(state2); + + for (k=0;k<10;k++) + { + ranlxs(ys,NXS); + ranlxd(yd,NXD); + } + + for (k=0;k do not use ranlxs on this machine\n"); + printf("\n"); + } + + if (test2==2) + { + printf("\n"); + printf("Test failed: I/O routines for ranlxd do not work properly\n"); + printf("=> do not use ranlxd on this machine\n"); + printf("\n"); + } + + +/******************************************************************************* +* +* Success messages +* +*******************************************************************************/ + + if ((test1==0)&&(test2==0)) + { + printf("\n"); + printf("All tests passed\n"); + printf("=> ranlxs and ranlxd work correctly on this machine\n"); + printf("\n"); + } + else if (test1==0) + { + printf("\n"); + printf("All tests on ranlxs passed\n"); + printf("=> ranlxs works correctly on this machine\n"); + printf("\n"); + } + else if (test2==0) + { + printf("\n"); + printf("All tests on ranlxd passed\n"); + printf("=> ranlxd works correctly on this machine\n"); + printf("\n"); + } + exit(0); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..8a7045601bd145863522db5fa25b13c1297a6e18 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/check2.c @@ -0,0 +1,104 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Writes the state of ranlxs to a file together with the next 147 random +* numbers. 
Then reads the data back in and checks the correct reinitialization +* of the generator +* +*******************************************************************************/ + +#include +#include +#include +#include +#include "random.h" +#include "utils.h" + +#define N 147 + + +static void write_state(void) +{ + FILE *fp; + int k,ns,*state; + float base,r[N]; + + ns=rlxs_size(); + state=malloc(ns*sizeof(int)); + base=(float)(ldexp(1.0,24)); + rlxs_init(1,1234567); + + for (k=0;k<10;k++) + ranlxs(r,N); + + rlxs_get(state); + ranlxs(r,N); + + fp=fopen(".tmp","w"); + + for (k=0;k +#include +#include +#include +#include "random.h" +#include "utils.h" + +#define N 147 + + +static void write_state(void) +{ + FILE *fp; + int k,ns,*state; + double base,r[N]; + + ns=rlxd_size(); + state=malloc(ns*sizeof(int)); + base=(double)(ldexp(1.0,48)); + rlxd_init(1,1234567); + + for (k=0;k<10;k++) + ranlxd(r,N); + + rlxd_get(state); + ranlxd(r,N); + + fp=fopen(".tmp","w"); + + for (k=0;k +#include +#include +#include "random.h" +#include "utils.h" +#include "su3fcts.h" +#include "extras.h" + + +int main(void) +{ + int i,n; + float *r; + double *rd,*f,x; + double kp,km,pp,pm; + + printf("\n"); + printf("Check of the distribution produced by gauss and gauss_dble\n"); + printf("----------------------------------------------------------\n"); + + for (;;) + { + printf("\n"); + printf("Specify number of trials (0 exits): "); + + if (scanf("%d",&n)==1) + { + printf("\n"); + + if (n<=0) + exit(0); + + f=amalloc(n*sizeof(double),3); + r=amalloc(n*sizeof(float),3); + + gauss(r,n); + + for (i=0;i=0) + f[i]=0.5+0.5*pchi_square(2.0*x*x,1); + else + f[i]=0.5-0.5*pchi_square(2.0*x*x,1); + } + + ks_test(n,f,&kp,&km); + ks_prob(n,kp,km,&pp,&pm); + + printf("Distribution produced by gauss\n"); + printf("Kolmogorov-Smirnov test: K+ = %4.2f, K- = %4.2f\n",kp,km); + printf("This corresponds to Prob(K+) = %4.2f, Prob(K-) = %4.2f\n", + pp,pm); + printf("\n"); + + afree(r); + rd=amalloc(n*sizeof(double),3); 
/*
 * Measures how far the single-precision matrix *u is from being an
 * element of SU(3).
 *
 *   u   matrix to be checked (not modified)
 *   d1  on exit: largest absolute value among the 18 real numbers stored
 *       in w, where w is built from *u by _su3_dagger/_su3_times_su3 and
 *       then has 1 subtracted from its diagonal (main labels this
 *       quantity "max |1-U^dag*U|")
 *   d2  on exit: max(|Re det(u) - 1|, |Im det(u)|) (labelled
 *       "max |1-det U|" in main)
 */
static void dev(su3 *u,double *d1,double *d2)
{
   int i;
   float *r;
   double d;
   complex det1,det2,det3,det;
   su3 v,w;

   /* presumably v = u^dagger and w = v*u -- macros are defined in su3.h,
      not visible here; confirm there */
   _su3_dagger(v,(*u));
   _su3_times_su3(w,v,(*u));

   /* subtract the unit matrix: only the real parts of the diagonal change */
   w.c11.re-=1.0f;
   w.c22.re-=1.0f;
   w.c33.re-=1.0f;

   *d1=0.0;
   /* NOTE(review): treats w as a flat array of 18 floats, i.e. assumes an
      su3 is 9 contiguous (re,im) float pairs without padding -- confirm
      against the su3 declaration */
   r=(float*)(&w);

   for (i=0;i<18;i++)
   {
      d=fabs((double)(r[i]));
      if (d>(*d1))
         *d1=d;
   }

   /* complex 2x2 minors of rows 2 and 3: det1 omits column 1,
      det2 omits column 2, det3 omits column 3 */
   det1.re=
      ((*u).c22.re*(*u).c33.re-(*u).c22.im*(*u).c33.im)-
      ((*u).c23.re*(*u).c32.re-(*u).c23.im*(*u).c32.im);
   det1.im=
      ((*u).c22.re*(*u).c33.im+(*u).c22.im*(*u).c33.re)-
      ((*u).c23.re*(*u).c32.im+(*u).c23.im*(*u).c32.re);
   det2.re=
      ((*u).c21.re*(*u).c33.re-(*u).c21.im*(*u).c33.im)-
      ((*u).c23.re*(*u).c31.re-(*u).c23.im*(*u).c31.im);
   det2.im=
      ((*u).c21.re*(*u).c33.im+(*u).c21.im*(*u).c33.re)-
      ((*u).c23.re*(*u).c31.im+(*u).c23.im*(*u).c31.re);
   det3.re=
      ((*u).c21.re*(*u).c32.re-(*u).c21.im*(*u).c32.im)-
      ((*u).c22.re*(*u).c31.re-(*u).c22.im*(*u).c31.im);
   det3.im=
      ((*u).c21.re*(*u).c32.im+(*u).c21.im*(*u).c32.re)-
      ((*u).c22.re*(*u).c31.im+(*u).c22.im*(*u).c31.re);

   /* cofactor expansion along the first row:
      det = c11*det1 - c12*det2 + c13*det3 (complex arithmetic) */
   det.re=
      ((*u).c11.re*det1.re-(*u).c11.im*det1.im)-
      ((*u).c12.re*det2.re-(*u).c12.im*det2.im)+
      ((*u).c13.re*det3.re-(*u).c13.im*det3.im);
   det.im=
      ((*u).c11.re*det1.im+(*u).c11.im*det1.re)-
      ((*u).c12.re*det2.im+(*u).c12.im*det2.re)+
      ((*u).c13.re*det3.im+(*u).c13.im*det3.re);

   /* deviation of the determinant from 1: larger of the two components */
   *d2=0.0;
   d=fabs((double)(det.re)-1.0);
   if (d>(*d2))
      *d2=d;
   d=fabs((double)(det.im));
   if (d>(*d2))
      *d2=d;
}
/*
 * Measures how far the double-precision matrix *u is from being an
 * element of SU(3).
 *
 *   u   matrix to be checked (not modified)
 *   d1  on exit: largest absolute value among the 18 real numbers stored
 *       in w, where w is built from *u by _su3_dagger/_su3_times_su3 and
 *       then has 1 subtracted from its diagonal (main labels this
 *       quantity "max |1-U^dag*U|")
 *   d2  on exit: max(|Re det(u) - 1|, |Im det(u)|) (labelled
 *       "max |1-det U|" in main)
 */
static void dev(su3_dble *u,double *d1,double *d2)
{
   int i;
   double d,*r;
   complex_dble det1,det2,det3,det;
   su3_dble v,w;

   /* presumably v = u^dagger and w = v*u -- macros are defined in su3.h,
      not visible here; confirm there */
   _su3_dagger(v,(*u));
   _su3_times_su3(w,v,(*u));

   /* subtract the unit matrix: only the real parts of the diagonal change */
   w.c11.re-=1.0;
   w.c22.re-=1.0;
   w.c33.re-=1.0;

   *d1=0.0;
   /* NOTE(review): treats w as a flat array of 18 doubles, i.e. assumes an
      su3_dble is 9 contiguous (re,im) double pairs without padding --
      confirm against the su3_dble declaration */
   r=(double*)(&w);

   for (i=0;i<18;i++)
   {
      d=fabs(r[i]);
      if (d>(*d1))
         *d1=d;
   }

   /* complex 2x2 minors of rows 2 and 3: det1 omits column 1,
      det2 omits column 2, det3 omits column 3 */
   det1.re=
      ((*u).c22.re*(*u).c33.re-(*u).c22.im*(*u).c33.im)-
      ((*u).c23.re*(*u).c32.re-(*u).c23.im*(*u).c32.im);
   det1.im=
      ((*u).c22.re*(*u).c33.im+(*u).c22.im*(*u).c33.re)-
      ((*u).c23.re*(*u).c32.im+(*u).c23.im*(*u).c32.re);
   det2.re=
      ((*u).c21.re*(*u).c33.re-(*u).c21.im*(*u).c33.im)-
      ((*u).c23.re*(*u).c31.re-(*u).c23.im*(*u).c31.im);
   det2.im=
      ((*u).c21.re*(*u).c33.im+(*u).c21.im*(*u).c33.re)-
      ((*u).c23.re*(*u).c31.im+(*u).c23.im*(*u).c31.re);
   det3.re=
      ((*u).c21.re*(*u).c32.re-(*u).c21.im*(*u).c32.im)-
      ((*u).c22.re*(*u).c31.re-(*u).c22.im*(*u).c31.im);
   det3.im=
      ((*u).c21.re*(*u).c32.im+(*u).c21.im*(*u).c32.re)-
      ((*u).c22.re*(*u).c31.im+(*u).c22.im*(*u).c31.re);

   /* cofactor expansion along the first row:
      det = c11*det1 - c12*det2 + c13*det3 (complex arithmetic) */
   det.re=
      ((*u).c11.re*det1.re-(*u).c11.im*det1.im)-
      ((*u).c12.re*det2.re-(*u).c12.im*det2.im)+
      ((*u).c13.re*det3.re-(*u).c13.im*det3.im);
   det.im=
      ((*u).c11.re*det1.im+(*u).c11.im*det1.re)-
      ((*u).c12.re*det2.im+(*u).c12.im*det2.re)+
      ((*u).c13.re*det3.im+(*u).c13.im*det3.re);

   /* deviation of the determinant from 1: larger of the two components */
   *d2=0.0;
   d=fabs(det.re-1.0);
   if (d>(*d2))
      *d2=d;
   d=fabs(det.im);
   if (d>(*d2))
      *d2=d;
}
check7.c +* +* Copyright (C) 2010 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Reweighting of gaussian distributions [statistical test of gauss_dble()] +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "extras.h" + +#define NSMALL 100 +#define NLARGE 1000 +#define NTEST 10000 + +#if (NLARGEdev[i]) + dev[i]=d; + } + } + + for (i=0;i<4;i++) + { + sig[i]/=(double)(NTEST); + sig[i]=sqrt(sig[i]); + } +} + + +int main(void) +{ + int i; + + printf("\n"); + printf("Reweighting of gaussian distributions\n"); + printf("-------------------------------------\n\n"); + + printf("Width of the distribution = 1.0\n"); + printf("Width of the observable = 2.0,3.0,4.0\n"); + printf("%d test simulations of size %d and %d\n\n",NTEST,NSMALL,NLARGE); + + alloc_arrays(); + + printf("Sample size %d:\n\n",NSMALL); + + for (i=0;i +#include +#include +#include "random.h" + +#define NRLX 100 +#define NGSS 24 +#define NLOOPS 100000 + + +int main(void) +{ + int k,level; + float t1,t2,dt; + float r[NRLX]; + + printf("\n"); + printf("Timing of ranlxs (average time per random number in microsec)\n\n"); + + for (level=0;level<=2;level++) + { + rlxs_init(level,1); + + t1=(float)clock(); + for (k=1;k<=NLOOPS;k++) + ranlxs(r,NRLX); + t2=(float)clock(); + + dt=(t2-t1)/(float)(CLOCKS_PER_SEC); + dt*=1.0e6f/(float)(NRLX*NLOOPS); + + printf("%4.3f (level %1d) ",dt,level); + } + + printf("\n\n"); + printf("Timing of gauss (average time per random number in microsec)\n\n"); + + for (level=0;level<=2;level++) + { + rlxs_init(level,1); + + t1=(float)clock(); + for (k=1;k<=NLOOPS;k++) + gauss(r,NGSS); + t2=(float)clock(); + + dt=(t2-t1)/(float)(CLOCKS_PER_SEC); + dt*=1.0e6f/(float)(NGSS*NLOOPS); + + printf("%4.3f (level %1d) ",dt,level); + } + + printf("\n\n"); + exit(0); +} diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/time2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/time2.c new file mode 100644 index 0000000000000000000000000000000000000000..ea04eb898a608d7ac216b89e605bf244e78c6878 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/random/time2.c @@ -0,0 +1,73 @@ + +/******************************************************************************* +* +* File time2.c +* +* Copyright (C) 2005 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of ranlxd and gauss_dble +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" + +#define NRLX 100 +#define NGSS 24 +#define NLOOPS 100000 + + +int main(void) +{ + int k,level; + float t1,t2,dt; + double r[NRLX]; + + printf("\n"); + printf("Timing of ranlxd "); + printf("(average time per random number in microsec)\n\n"); + + for (level=1;level<=2;level++) + { + rlxd_init(level,1); + + t1=(float)clock(); + for (k=1;k<=NLOOPS;k++) + ranlxd(r,NRLX); + t2=(float)clock(); + + dt=(t2-t1)/(float)(CLOCKS_PER_SEC); + dt*=1.0e6f/(float)(NRLX*NLOOPS); + + printf("%4.3f (level %1d) ",dt,level); + } + + printf("\n\n"); + printf("Timing of gauss_dble "); + printf("(average time per random number in microsec)\n\n"); + + for (level=1;level<=2;level++) + { + rlxd_init(level,1); + + t1=(float)clock(); + for (k=1;k<=NLOOPS;k++) + gauss_dble(r,NGSS); + t2=(float)clock(); + + dt=(t2-t1)/(float)(CLOCKS_PER_SEC); + dt*=1.0e6f/(float)(NGSS*NLOOPS); + + printf("%4.3f (level %1d) ",dt,level); + } + + printf("\n\n"); + exit(0); +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/INDEX new file mode 100644 index 
0000000000000000000000000000000000000000..789dcbbed5cb2e0bf98b446db4fd39b0bb951c43 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/INDEX @@ -0,0 +1,12 @@ + +Rational approximations + +check1 Computation of the complete elliptic integral K(k) + +check2 Computation of the Jacobi elliptic functions sn,cn,dn + +check3 Zolotarev rational approximation to the sign function + +table1 Table of the relative error of the Zolotarev rational + approximation to the function f(x)=1/|x| (suitable for + plotting, for example) diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..bda117cabd83612d53b8b15fb39161f3c9003beb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/Makefile @@ -0,0 +1,119 @@ +################################################################################ +# +# Makefile to compile and link C programs +# +# Version valid for Linux machines +# +# "make" compiles and links the specified main programs and modules +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files created by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and required modules + +MAIN = check1 check2 check3 table1 + +RANDOM = ranlxd + +UTILS = utils + +RATFCTS = elliptic zolotarev + +MODULES = $(RANDOM) $(UTILS) $(RATFCTS) + + +# search path for modules + +MDIR = ../../../modules + +VPATH = $(MDIR)/nompi/utils:$(MDIR)/random:$(MDIR)/ratfcts:\ + $(MDIR)/utils + + +# additional include directories + +INCPATH = ../../../include/nompi ../../../include + + +# additional 
libraries to be included + +LIBS = m + +LIBPATH = + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 + + +############################## do not change ################################### + +SHELL=/bin/bash + +CC=$(GCC) + +PGMS= $(MAIN) $(MODULES) + +INCDIRS = $(addprefix -I,$(INCPATH)) + +OBJECTS = $(addsuffix .o,$(MODULES)) + +LDFLAGS = $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(CC) -MM -ansi $(INCDIRS) $< -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(INCDIRS) -o $@ + + +# rule to link object files + +$(MAIN): %: %.o $(OBJECTS) Makefile + $(CC) $< $(OBJECTS) $(CFLAGS) $(LDFLAGS) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables and old error log file + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..77e138958379e04e63c9a582e2a6dc847ce6c59c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/check1.c @@ -0,0 +1,109 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2008, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* 
/*
 * Reference value of the complete elliptic integral for small moduli.
 *
 * The argument rk is the ratio k/k' (main prints k=rk/sqrt(1+rk^2)), so
 * that k^2=rk^2/(1+rk^2). The function returns the polynomial
 *
 *   (pi/2)*(1+k^2/4+9*k^4/64+25*k^6/256)
 *
 * evaluated with the Horner scheme. It is compared with ellipticK(rk)
 * in the "small k region" test of the main program.
 */
static double Ksmall(double rk)
{
   static const double coeff[4]={1.0,1.0/4.0,9.0/64.0,25.0/256.0};
   int j;
   double ksq,sum,half_pi;

   ksq=(rk*rk)/(1.0+rk*rk);

   /* Horner recursion from the highest coefficient downwards */
   sum=coeff[3];

   for (j=2;j>=0;j--)
      sum=coeff[j]+ksq*sum;

   half_pi=2.0*atan(1.0);

   return half_pi*sum;
}
/*
 * Independent evaluation of the Jacobi elliptic functions sn,cn,dn at
 * the point u, used by main as a cross-check of sncndn() for rk<=0.1.
 *
 * With K=ellipticK(rk), K'=ellipticK(1/rk), t=pi*K'/K and v=pi*u/(2*K),
 * the function sums the series
 *
 *   sn = r*sum_n exp(-n*t)*sin((2n+1)*v)/(1-exp(-(2n+1)*t))
 *   cn = r*sum_n exp(-n*t)*cos((2n+1)*v)/(1+exp(-(2n+1)*t))
 *   dn = (pi/(2K))*{1+4*sum_{n>0} exp(-n*t)*cos(2n*v)/(1+exp(-2n*t))}
 *
 * where r=2*pi*exp(-t/2)/(k*K) and k=rk/sqrt(1+rk^2). The sums are
 * truncated at nmax=max(1,(int)(-1.5*log(DBL_EPSILON)/t)) and the terms
 * are accumulated from n=nmax down to n=0.
 *
 * The results are assigned to *sn,*cn,*dn, which are assumed to refer
 * to distinct variables (as is the case in all calls from main).
 */
static void sncndn_smallk(double u,double rk,double *sn,double *cn,double *dn)
{
   int j,jmax;
   double Kk,Kkp,pi,t,v,k,r;
   double tj,e1,e2,ssum,csum,dsum;

   Kk=ellipticK(rk);
   Kkp=ellipticK(1.0/rk);
   pi=4.0*atan(1.0);

   t=pi*Kkp/Kk;
   v=(pi*u)/(2.0*Kk);

   jmax=(int)(-1.5*log(DBL_EPSILON)/t);
   if (jmax<1)
      jmax=1;

   ssum=0.0;
   csum=0.0;
   dsum=0.0;
   j=jmax;

   while (j>=0)
   {
      tj=(double)(j)*t;
      e1=exp(-tj);
      e2=exp(-2.0*tj-t);

      ssum+=(e1*sin((double)(2*j+1)*v)/(1.0-e2));
      csum+=(e1*cos((double)(2*j+1)*v)/(1.0+e2));

      /* the n=0 term of the dn series is the constant 1 added below */
      if (j>0)
         dsum+=(e1*cos((double)(2*j)*v)/(1.0+exp(-2.0*tj)));

      j-=1;
   }

   k=rk/sqrt(1.0+rk*rk);
   r=(2.0*pi*exp(-0.5*t))/(k*Kk);

   (*sn)=ssum*r;
   (*cn)=csum*r;
   (*dn)=(pi/(2.0*Kk))*(1.0+4.0*dsum);
}
printf("-----------------------------------------------------\n\n"); + + rlxd_init(1,1234); + + dmax_sn=0.0; + dmax_cn=0.0; + dmax_dn=0.0; + + for (n=0;n<10000;n++) + { + ranlxd(&rk,1); + rk*=0.1; + K=ellipticK(rk); + ranlxd(&u,1); + u=K*(0.5-u); + + sncndn(u,rk,&sn,&cn,&dn); + sncndn_smallk(u,rk,&snsk,&cnsk,&dnsk); + + if (sn!=0.0) + { + dev=fabs(1.0-snsk/sn); + + if (dev>dmax_sn) + dmax_sn=dev; + } + + dev=fabs(1.0-cnsk/cn); + + if (dev>dmax_cn) + dmax_cn=dev; + + dev=fabs(1.0-dnsk/dn); + + if (dev>dmax_dn) + dmax_dn=dev; + } + + printf("-K/2<=u<=K/2, rk<=0.1:\n"); + printf("maximal relative error (sn,cn,dn) = (%.1e,%.1e,%.1e)\n\n", + dmax_sn,dmax_cn,dmax_dn); + + dmax_sn=0.0; + dmax_cn=0.0; + dmax_dn=0.0; + + for (n=0;n<10000;n++) + { + ranlxd(&rk,1); + rk*=0.1; + K=ellipticK(rk); + ranlxd(&u,1); + u=16.0*K*(0.5-u); + + sncndn(u,rk,&sn,&cn,&dn); + sncndn_smallk(u,rk,&snsk,&cnsk,&dnsk); + + dev=fabs(snsk-sn); + + if (dev>dmax_sn) + dmax_sn=dev; + + dev=fabs(cnsk-cn); + + if (dev>dmax_cn) + dmax_cn=dev; + + dev=fabs(dnsk-dn); + + if (dev>dmax_dn) + dmax_dn=dev; + } + + printf("-8*K<=u<=8K, rk<=0.1:\n"); + printf("maximal absolute error (sn,cn,dn) = (%.1e,%.1e,%.1e)\n\n", + dmax_sn,dmax_cn,dmax_dn); + + dmax_sn=0.0; + dmax_cn=0.0; + dmax_dn=0.0; + + for (n=0;n<10000;n++) + { + ranlxd(&rk,1); + rk=rk/(1.0-rk); + K=ellipticK(rk); + ranlxd(&u,1); + u=K*(0.5-u); + + sncndn(u,rk,&sn,&cn,&dn); + sncndn_landen(u,rk,&snsk,&cnsk,&dnsk); + + if (sn!=0.0) + { + dev=fabs(1.0-snsk/sn); + + if (dev>dmax_sn) + dmax_sn=dev; + } + + dev=fabs(1.0-cnsk/cn); + + if (dev>dmax_cn) + dmax_cn=dev; + + dev=fabs(1.0-dnsk/dn); + + if (dev>dmax_dn) + dmax_dn=dev; + } + + printf("-K/2<=u<=K/2, Landen recursion:\n"); + printf("maximal relative error (sn,cn,dn) = (%.1e,%.1e,%.1e)\n\n", + dmax_sn,dmax_cn,dmax_dn); + + dmax_sn=0.0; + dmax_cn=0.0; + dmax_dn=0.0; + + for (n=0;n<10000;n++) + { + ranlxd(&rk,1); + rk=rk/(1.0-rk); + K=ellipticK(rk); + ranlxd(&u,1); + u=16.0*K*(0.5-u); + + 
sncndn(u,rk,&sn,&cn,&dn); + sncndn_landen(u,rk,&snsk,&cnsk,&dnsk); + + dev=fabs(snsk-sn); + + if (dev>dmax_sn) + dmax_sn=dev; + + dev=fabs(cnsk-cn); + + if (dev>dmax_cn) + dmax_cn=dev; + + dev=fabs(dnsk-dn); + + if (dev>dmax_dn) + dmax_dn=dev; + } + + printf("-8*K<=u<=8K, Landen recursion:\n"); + printf("maximal absolute error (sn,cn,dn) = (%.1e,%.1e,%.1e)\n\n", + dmax_sn,dmax_cn,dmax_dn); + printf("Print values at specfied u and k/k'\n\n"); + + for (;;) + { + printf("u, k/k' = "); + + if (scanf("%lf %lf",&u,&rk)==2) + { + sncndn(u,rk,&sn,&cn,&dn); + printf("sn = %.16e, cn = %.16e, dn = %.16e\n",sn,cn,dn); + } + else + { + printf("Invalid input values, program stopped\n\n"); + break; + } + } + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..91b8ce42138941c907fd0126aeeff47682e11f35 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/check3.c @@ -0,0 +1,122 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2008 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Zolotarev rational approximation to the function f(x)=1/|x| +* +*******************************************************************************/ + +#include +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "ratfcts.h" + +static int ns=0; +static double As,*ars; + + +static void alloc_ars(int n) +{ + if (n<=ns) + return; + + if (ns!=0) + afree(ars); + + ars=amalloc(2*n*sizeof(double),3); + + error(ars==NULL,1,"alloc_ars [check3.c]", + "Unable to allocate coefficient array"); + + ns=n; +} + + +static double Zolo(int n,double y) +{ + int r; + double p; + + p=1.0; + + 
for (r=0;rdmax) + dmax=dev; + } + + printf("Relative error delta = %.1e (measured: %.1e)\n",delta,dmax); + printf("Amplitude A: %.1e\n",As); + printf("Coefficients a_r: Numerator Denominator\n"); + b=b*b; + + for (r=0;r<(2*n);r+=2) + { + printf(" %.3e %.3e\n", + ars[r]*b,ars[r+1]*b); + } + + printf("\n\n"); + } + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/table1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/table1.c new file mode 100644 index 0000000000000000000000000000000000000000..fffedaa19d93936472fab9909a7e843363ffcd57 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/ratfcts/table1.c @@ -0,0 +1,125 @@ + +/******************************************************************************* +* +* File table1.c +* +* Copyright (C) 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Table of the relative error of the Zolotarev rational approximation to the +* function f(x)=1/|x| (suitable for plotting, for example) +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "utils.h" +#include "ratfcts.h" + +static int n; +static double eps,delta,As,*ars; + + +static void alloc_ars(void) +{ + ars=amalloc(2*n*sizeof(double),3); + + error(ars==NULL,1,"alloc_ars [table1.c]", + "Unable to allocate coefficient array"); +} + + +static double zolotarev_sign(double x) +{ + int r; + double y,p; + + y=x*x; + p=1.0; + + for (r=0;r +#include +#include +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "su3fcts.h" + + +static double max_dev(su3_dble *u,su3_dble *v) +{ + int i; + double r[18],s[18]; + double nrm,d,dmax; + + r[ 0]=(*u).c11.re; + r[ 1]=(*u).c11.im; + r[ 2]=(*u).c12.re; + r[ 3]=(*u).c12.im; + r[ 4]=(*u).c13.re; + r[ 5]=(*u).c13.im; + + r[ 
6]=(*u).c21.re; + r[ 7]=(*u).c21.im; + r[ 8]=(*u).c22.re; + r[ 9]=(*u).c22.im; + r[10]=(*u).c23.re; + r[11]=(*u).c23.im; + + r[12]=(*u).c31.re; + r[13]=(*u).c31.im; + r[14]=(*u).c32.re; + r[15]=(*u).c32.im; + r[16]=(*u).c33.re; + r[17]=(*u).c33.im; + + s[ 0]=(*v).c11.re; + s[ 1]=(*v).c11.im; + s[ 2]=(*v).c12.re; + s[ 3]=(*v).c12.im; + s[ 4]=(*v).c13.re; + s[ 5]=(*v).c13.im; + + s[ 6]=(*v).c21.re; + s[ 7]=(*v).c21.im; + s[ 8]=(*v).c22.re; + s[ 9]=(*v).c22.im; + s[10]=(*v).c23.re; + s[11]=(*v).c23.im; + + s[12]=(*v).c31.re; + s[13]=(*v).c31.im; + s[14]=(*v).c32.re; + s[15]=(*v).c32.im; + s[16]=(*v).c33.re; + s[17]=(*v).c33.im; + + nrm=0.0; + dmax=0.0; + + for (i=0;i<18;i++) + { + nrm+=r[i]*r[i]; + d=(r[i]-s[i])*(r[i]-s[i]); + i+=1; + nrm+=r[i]*r[i]; + d+=(r[i]-s[i])*(r[i]-s[i]); + + if (d>dmax) + dmax=d; + } + + return sqrt(dmax/nrm); +} + + +static void random_u3alg(u3_alg_dble *X) +{ + double r[9]; + + ranlxd(r,9); + + (*X).c1=r[0]-0.5; + (*X).c2=r[1]-0.5; + (*X).c3=r[2]-0.5; + (*X).c4=r[3]-0.5; + (*X).c5=r[4]-0.5; + (*X).c6=r[5]-0.5; + (*X).c7=r[6]-0.5; + (*X).c8=r[7]-0.5; + (*X).c9=r[8]-0.5; +} + + +static void X2u(u3_alg_dble *X,su3_dble *u) +{ + (*u).c11.re=0.0; + (*u).c11.im= (*X).c1; + (*u).c22.re=0.0; + (*u).c22.im= (*X).c2; + (*u).c33.re=0.0; + (*u).c33.im= (*X).c3; + + (*u).c12.re= (*X).c4; + (*u).c12.im= (*X).c5; + (*u).c21.re=-(*X).c4; + (*u).c21.im= (*X).c5; + + (*u).c13.re= (*X).c6; + (*u).c13.im= (*X).c7; + (*u).c31.re=-(*X).c6; + (*u).c31.im= (*X).c7; + + (*u).c23.re= (*X).c8; + (*u).c23.im= (*X).c9; + (*u).c32.re=-(*X).c8; + (*u).c32.im= (*X).c9; +} + + +int main(void) +{ + double d1,d2,d3,d4; + su3_dble *u,*v,*w1,*w2; + u3_alg_dble *X; + + printf("\n"); + printf("Check of su3xsu3, su3dagxsu3, ...\n"); + printf("---------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + u=amalloc(4*sizeof(su3_dble),4); + 
X=amalloc(sizeof(u3_alg_dble),3); + error((u==NULL)||(X==NULL),1,"main [check1.c]", + "Unable to allocate auxiliary arrays"); + + v=u+1; + w1=u+2; + w2=u+3; + + rlxd_init(1,23456); + + random_su3_dble(u); + random_su3_dble(v); + su3xsu3(u,v,w1); + _su3_times_su3(*w2,*u,*v); + d1=max_dev(w1,w2); + + random_su3_dble(u); + random_su3_dble(v); + su3dagxsu3(u,v,w1); + _su3_dagger(*w2,*u); + *u=*w2; + _su3_times_su3(*w2,*u,*v); + d2=max_dev(w1,w2); + + random_su3_dble(u); + random_su3_dble(v); + su3xsu3dag(u,v,w1); + _su3_dagger(*w2,*v); + *v=*w2; + _su3_times_su3(*w2,*u,*v); + d3=max_dev(w1,w2); + + random_su3_dble(u); + random_su3_dble(v); + su3dagxsu3dag(u,v,w1); + _su3_dagger(*w2,*u); + *u=*w2; + _su3_dagger(*w2,*v); + *v=*w2; + _su3_times_su3(*w2,*u,*v); + d4=max_dev(w1,w2); + + printf("su3xsu3: %.2e\n",d1); + printf("su3dagxsu3: %.2e\n",d2); + printf("su3xsu3dag: %.2e\n",d3); + printf("su3dagxsu3dag: %.2e\n",d4); + + random_su3_dble(u); + random_u3alg(X); + su3xu3alg(u,X,w1); + X2u(X,v); + _su3_times_su3(*w2,*u,*v); + d1=max_dev(w1,w2); + + random_su3_dble(u); + random_u3alg(X); + su3dagxu3alg(u,X,w1); + _su3_dagger(*w2,*u); + *u=*w2; + X2u(X,v); + _su3_times_su3(*w2,*u,*v); + d2=max_dev(w1,w2); + + random_su3_dble(v); + random_u3alg(X); + u3algxsu3(X,v,w1); + X2u(X,u); + _su3_times_su3(*w2,*u,*v); + d3=max_dev(w1,w2); + + random_su3_dble(v); + random_u3alg(X); + u3algxsu3dag(X,v,w1); + X2u(X,u); + _su3_dagger(*w2,*v); + *v=*w2; + _su3_times_su3(*w2,*u,*v); + d4=max_dev(w1,w2); + + printf("su3xu3alg: %.2e\n",d1); + printf("su3dagxu3alg: %.2e\n",d2); + printf("u3algxsu3: %.2e\n",d3); + printf("u3algxsu3dag: %.2e\n\n",d4); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..3da3c2271d5bb36d7f0247f10e738288fa263988 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check2.c @@ -0,0 +1,239 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of prod2su3alg, prod2u3alg and rotate_su3alg +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "su3fcts.h" + +static const su3_vector_dble vd0={{0.0}}; +static const spinor_dble sd0={{{0.0}}}; +static su3_dble Q,u,v ALIGNED16; +static su3_alg_dble X ALIGNED16; +static u3_alg_dble Y; + + +static void random_su3alg(su3_alg_dble *X) /* Fills the eight components c1..c8 of X with numbers from ranlxd() shifted by -0.5. */ +{ + double r[8]; + + ranlxd(r,8); + + (*X).c1=r[0]-0.5; + (*X).c2=r[1]-0.5; + (*X).c3=r[2]-0.5; + (*X).c4=r[3]-0.5; + (*X).c5=r[4]-0.5; + (*X).c6=r[5]-0.5; + (*X).c7=r[6]-0.5; + (*X).c8=r[7]-0.5; +} + + +static void random_u3alg(u3_alg_dble *X) /* Fills the nine components c1..c9 of X with numbers from ranlxd() shifted by -0.5. */ +{ + double r[9]; + + ranlxd(r,9); + + (*X).c1=r[0]-0.5; + (*X).c2=r[1]-0.5; + (*X).c3=r[2]-0.5; + (*X).c4=r[3]-0.5; + (*X).c5=r[4]-0.5; + (*X).c6=r[5]-0.5; + (*X).c7=r[6]-0.5; + (*X).c8=r[7]-0.5; + (*X).c9=r[8]-0.5; +} + + +static void X2u(su3_alg_dble *X,su3_dble *u) /* Writes the 3x3 matrix described by X to u: the diagonal is purely imaginary (c1+c2, c2-2*c1, c1-2*c2, which sum to zero) and the off-diagonal pairs satisfy u_ji=-conj(u_ij). */ +{ + (*u).c11.re=0.0; + (*u).c11.im= (*X).c1+(*X).c2; + (*u).c22.re=0.0; + (*u).c22.im= (*X).c2-2.0*(*X).c1; + (*u).c33.re=0.0; + (*u).c33.im= (*X).c1-2.0*(*X).c2; + + (*u).c12.re= (*X).c3; + (*u).c12.im= (*X).c4; + (*u).c21.re=-(*X).c3; + (*u).c21.im= (*X).c4; + + (*u).c13.re= (*X).c5; + (*u).c13.im= (*X).c6; + (*u).c31.re=-(*X).c5; + (*u).c31.im= (*X).c6; + + (*u).c23.re= (*X).c7; + (*u).c23.im= (*X).c8; + (*u).c32.re=-(*X).c7; + (*u).c32.im= (*X).c8; +} + + +int main(void) /* Runs three comparisons and prints absolute deviations: the outputs of prod2su3alg, prod2u3alg and rotate_su3alg are each checked against a reference built with _su3_times_su3 and _su3_dagger. NOTE(review): the banner printed below names only prod2su3alg and rotate_su3alg although prod2u3alg is checked as well. */ +{ + double tr,d; + + printf("\n"); + printf("Check of prod2su3alg and rotate_su3alg\n"); + printf("--------------------------------------\n\n"); + +#if 
(defined AVX) + printf("Using AVX instructions\n\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + rlxd_init(1,23456); + + printf("prod2su3alg:\n"); /* reference: upper triangle of 0.5*(Q-Q^dag) for the Q produced by _su3_times_su3(Q,u,v), with one third of the imaginary trace removed from the diagonal; tr ends up as the returned value minus the real part of the trace of Q */ + random_su3_dble(&u); + random_su3_dble(&v); + random_su3alg(&X); + + tr=prod2su3alg(&u,&v,&X); + _su3_times_su3(Q,u,v); + tr-=(Q.c11.re+Q.c22.re+Q.c33.re); + + Q.c11.re=0.5*(Q.c11.re-Q.c11.re); + Q.c11.im=0.5*(Q.c11.im+Q.c11.im); + Q.c12.re=0.5*(Q.c12.re-Q.c21.re); + Q.c12.im=0.5*(Q.c12.im+Q.c21.im); + Q.c13.re=0.5*(Q.c13.re-Q.c31.re); + Q.c13.im=0.5*(Q.c13.im+Q.c31.im); + + Q.c22.re=0.5*(Q.c22.re-Q.c22.re); + Q.c22.im=0.5*(Q.c22.im+Q.c22.im); + Q.c23.re=0.5*(Q.c23.re-Q.c32.re); + Q.c23.im=0.5*(Q.c23.im+Q.c32.im); + + Q.c33.re=0.5*(Q.c33.re-Q.c33.re); + Q.c33.im=0.5*(Q.c33.im+Q.c33.im); + + d=(Q.c11.im+Q.c22.im+Q.c33.im)/3.0; + Q.c11.im-=d; + Q.c22.im-=d; + Q.c33.im-=d; + + d=fabs(Q.c11.im-X.c1-X.c2); + printf("X.c11.im: %.2e\n",d); + d=fabs(Q.c22.im+2.0*X.c1-X.c2); + printf("X.c22.im: %.2e\n",d); + d=fabs(Q.c33.im-X.c1+2.0*X.c2); + printf("X.c33.im: %.2e\n",d); + + d=fabs(Q.c12.re-X.c3); + printf("X.c12.re: %.2e\n",d); + d=fabs(Q.c12.im-X.c4); + printf("X.c12.im: %.2e\n",d); + + d=fabs(Q.c13.re-X.c5); + printf("X.c13.re: %.2e\n",d); + d=fabs(Q.c13.im-X.c6); + printf("X.c13.im: %.2e\n",d); + + d=fabs(Q.c23.re-X.c7); + printf("X.c23.re: %.2e\n",d); + d=fabs(Q.c23.im-X.c8); + printf("X.c23.im: %.2e\n",d); + d=fabs(tr); + printf("Return value: %.2e\n\n",d); + + printf("prod2u3alg:\n"); /* reference: upper triangle of Q-Q^dag, without the factor 0.5 and without trace subtraction; compared with the components of Y */ + random_su3_dble(&u); + random_su3_dble(&v); + random_u3alg(&Y); + + prod2u3alg(&u,&v,&Y); + _su3_times_su3(Q,u,v); + + Q.c11.re=Q.c11.re-Q.c11.re; + Q.c11.im=Q.c11.im+Q.c11.im; + Q.c12.re=Q.c12.re-Q.c21.re; + Q.c12.im=Q.c12.im+Q.c21.im; + Q.c13.re=Q.c13.re-Q.c31.re; + Q.c13.im=Q.c13.im+Q.c31.im; + + Q.c22.re=Q.c22.re-Q.c22.re; + Q.c22.im=Q.c22.im+Q.c22.im; + Q.c23.re=Q.c23.re-Q.c32.re; + Q.c23.im=Q.c23.im+Q.c32.im; + + Q.c33.re=Q.c33.re-Q.c33.re; + Q.c33.im=Q.c33.im+Q.c33.im; + + 
d=fabs(Q.c11.im-Y.c1); + printf("X.c11.im: %.2e\n",d); + d=fabs(Q.c22.im-Y.c2); + printf("X.c22.im: %.2e\n",d); + d=fabs(Q.c33.im-Y.c3); + printf("X.c33.im: %.2e\n",d); + + d=fabs(Q.c12.re-Y.c4); + printf("X.c12.re: %.2e\n",d); + d=fabs(Q.c12.im-Y.c5); + printf("X.c12.im: %.2e\n",d); + + d=fabs(Q.c13.re-Y.c6); + printf("X.c13.re: %.2e\n",d); + d=fabs(Q.c13.im-Y.c7); + printf("X.c13.im: %.2e\n",d); + + d=fabs(Q.c23.re-Y.c8); + printf("X.c23.re: %.2e\n",d); + d=fabs(Q.c23.im-Y.c9); + printf("X.c23.im: %.2e\n\n",d); + + printf("rotate_su3alg:\n"); /* v is set from X before the rotation; afterwards u is overwritten through Q (from u and v) and v (from _su3_dagger(v,u)), presumably giving u*v*u^dag, and compared with the rotated X */ + random_su3_dble(&u); + random_su3alg(&X); + X2u(&X,&v); + + rotate_su3alg(&u,&X); + + _su3_times_su3(Q,u,v); + _su3_dagger(v,u); + _su3_times_su3(u,Q,v); + + d=fabs(u.c11.im-X.c1-X.c2); + printf("X.c11.im: %.2e\n",d); + d=fabs(u.c22.im+2.0*X.c1-X.c2); + printf("X.c22.im: %.2e\n",d); + d=fabs(u.c33.im-X.c1+2.0*X.c2); + printf("X.c33.im: %.2e\n",d); + + d=fabs(u.c12.re-X.c3); + printf("X.c12.re: %.2e\n",d); + d=fabs(u.c12.im-X.c4); + printf("X.c12.im: %.2e\n",d); + + d=fabs(u.c13.re-X.c5); + printf("X.c13.re: %.2e\n",d); + d=fabs(u.c13.im-X.c6); + printf("X.c13.im: %.2e\n",d); + + d=fabs(u.c23.re-X.c7); + printf("X.c23.re: %.2e\n",d); + d=fabs(u.c23.im-X.c8); + printf("X.c23.im: %.2e\n\n",d); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..2e13369897adf0e93b6e4af7918da43e7593191e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check3.c @@ -0,0 +1,177 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2009, 2011 Filippo Palombi, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of chexp_drv0() and 
ch2mat() using the spectral representation of X +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "random.h" +#include "su3fcts.h" + +#define NTEST 100000 +#define SEED 58693 + +static double mu[3],t,d; +static su3_alg_dble *X; +static su3_dble *r,*u,*v,*w; +static const su3_dble u0={{0.0}}; +static ch_drv0_t *sp; + + +static void alloc_Xu(void) +{ + X=amalloc(1*sizeof(*X),4); + r=amalloc(4*sizeof(*r),4); + sp=amalloc(1*sizeof(*sp),4); + + error((X==NULL)||(r==NULL)||(sp==NULL),1, + "alloc_Xu [check3.c]","Unable to allocate matrices"); + + u=r+1; + v=r+2; + w=r+3; +} + + +static void random_Xu(void) +{ + for (;;) + { + ranlxd(mu,2); + mu[0]=2.0*mu[0]-1.0; + mu[1]=2.0*mu[1]-1.0; + mu[2]=-mu[0]-mu[1]; + + if (fabs(mu[2])<=1.0) + break; + } + + t=0.5*(mu[0]*mu[0]+mu[1]*mu[1]+mu[2]*mu[2]); + d=mu[0]*mu[1]*mu[2]; + + (*u)=u0; + (*u).c11.im=mu[0]; + (*u).c22.im=mu[1]; + (*u).c33.im=mu[2]; + + random_su3_dble(r); + su3xsu3(r,u,w); + su3xsu3dag(w,r,u); + + (*X).c1=((*u).c11.im-(*u).c22.im)/3.0; + (*X).c2=((*u).c11.im-(*u).c33.im)/3.0; + (*X).c3=(*u).c12.re; + (*X).c4=(*u).c12.im; + (*X).c5=(*u).c13.re; + (*X).c6=(*u).c13.im; + (*X).c7=(*u).c23.re; + (*X).c8=(*u).c23.im; + + (*u)=u0; + (*u).c11.re=cos(mu[0]); + (*u).c22.re=cos(mu[1]); + (*u).c33.re=cos(mu[2]); + (*u).c11.im=sin(mu[0]); + (*u).c22.im=sin(mu[1]); + (*u).c33.im=sin(mu[2]); + + su3xsu3(r,u,w); + su3xsu3dag(w,r,u); +} + + +static double dev_uv(void) +{ + int i; + double r[18],dev,dmax; + + r[ 0]=(*u).c11.re-(*v).c11.re; + r[ 1]=(*u).c11.im-(*v).c11.im; + r[ 2]=(*u).c12.re-(*v).c12.re; + r[ 3]=(*u).c12.im-(*v).c12.im; + r[ 4]=(*u).c13.re-(*v).c13.re; + r[ 5]=(*u).c13.im-(*v).c13.im; + + r[ 6]=(*u).c21.re-(*v).c21.re; + r[ 7]=(*u).c21.im-(*v).c21.im; + r[ 8]=(*u).c22.re-(*v).c22.re; + r[ 9]=(*u).c22.im-(*v).c22.im; + r[10]=(*u).c23.re-(*v).c23.re; + r[11]=(*u).c23.im-(*v).c23.im; + + 
r[12]=(*u).c31.re-(*v).c31.re; + r[13]=(*u).c31.im-(*v).c31.im; + r[14]=(*u).c32.re-(*v).c32.re; + r[15]=(*u).c32.im-(*v).c32.im; + r[16]=(*u).c33.re-(*v).c33.re; + r[17]=(*u).c33.im-(*v).c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return dmax; +} + + +int main(void) +{ + int i; + double dev,dmax1,dmax2,dmax3; + + printf("\n"); + printf("Check of chexp_drv0() and ch2mat()\n"); + printf("----------------------------------\n\n"); + + printf("Test performed on %d random matrices X using the\n",NTEST); + printf("spectral representation of X\n\n"); + + rlxd_init(1,SEED); + alloc_Xu(); + + dmax1=0.0; + dmax2=0.0; + dmax3=0.0; + + for (i=0;idmax1) + dmax1=dev; + + dev=fabs(d-(*sp).d); + if (dev>dmax2) + dmax2=dev; + + ch2mat((*sp).p,X,v); + dev=dev_uv(); + if (dev>dmax3) + dmax3=dev; + } + + printf ("Maximal deviation of t = %.1e\n",dmax1); + printf ("Maximal deviation of d = %.1e\n",dmax2); + printf ("Maximal deviation of exp(X) = %.1e\n\n",dmax3); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..74be208f776f0bea8d27ae50e9999c64ec6b7f3a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check4.c @@ -0,0 +1,238 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2009, 2011 Filippo Palombi, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Comparison of chexp_drv2() with chexp_drv0() and chexp_drv1() and +* invariance of the calculated coefficients under rotations of X +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" 
+#include "utils.h" +#include "random.h" +#include "su3fcts.h" + +#define NTEST 100000 +#define SEED 773 + +static double mu[3]; +static su3_alg_dble *X; +static su3_dble *r,*u,*w; +static const su3_dble u0={{0.0}}; +static ch_drv0_t *sp; +static ch_drv1_t *sg; +static ch_drv2_t *sf; + + +static void alloc_Xu(void) /* Allocates X, three su3_dble matrices (r and, behind it, u and w), one ch_drv0_t (sp), one ch_drv1_t (sg) and two ch_drv2_t (sf, sf+1); error() is called with the failure condition if any allocation returns NULL. */ +{ + X=amalloc(1*sizeof(*X),4); + r=amalloc(3*sizeof(*r),4); + sp=amalloc(1*sizeof(*sp),4); + sg=amalloc(1*sizeof(*sg),4); + sf=amalloc(2*sizeof(*sf),4); + + error((X==NULL)||(r==NULL)||(sp==NULL)||(sg==NULL)||(sf==NULL),1, + "alloc_Xu [check4.c]","Unable to allocate matrices"); + + u=r+1; + w=r+2; +} + + +static void random_Xu(void) /* Draws mu[0], mu[1] (rescaled by 2x-1) and sets mu[2]=-mu[0]-mu[1], retrying until |mu[2]|<=1; puts them on the imaginary diagonal of u, transforms u with a random r (su3xsu3 followed by su3xsu3dag, presumably r*u*r^dag) and reads the eight components of X off the result. */ +{ + for (;;) + { + ranlxd(mu,2); + mu[0]=2.0*mu[0]-1.0; + mu[1]=2.0*mu[1]-1.0; + mu[2]=-mu[0]-mu[1]; + + if (fabs(mu[2])<=1.0) + break; + } + + (*u)=u0; + (*u).c11.im=mu[0]; + (*u).c22.im=mu[1]; + (*u).c33.im=mu[2]; + + random_su3_dble(r); + su3xsu3(r,u,w); + su3xsu3dag(w,r,u); + + (*X).c1=((*u).c11.im-(*u).c22.im)/3.0; + (*X).c2=((*u).c11.im-(*u).c33.im)/3.0; + (*X).c3=(*u).c12.re; + (*X).c4=(*u).c12.im; + (*X).c5=(*u).c13.re; + (*X).c6=(*u).c13.im; + (*X).c7=(*u).c23.re; + (*X).c8=(*u).c23.im; +} + + +static double dev_sp(void) /* Returns the largest absolute difference between sp and sf over t, d and the real and imaginary parts of p[0..2]. */ +{ + int i; + double r[8],dev,dmax; + + r[0]=(*sp).t-(*sf).t; + r[1]=(*sp).d-(*sf).d; + + for (i=0;i<3;i++) + { + r[2*i+2]=(*sp).p[i].re-(*sf).p[i].re; + r[2*i+3]=(*sp).p[i].im-(*sf).p[i].im; + } + + dmax=0.0; + + for (i=0;i<8;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return dmax; +} + + +static double dev_sg(void) /* Returns the largest absolute difference between sg and sf over p, pt and pd (main reports it as the chexp_drv1 versus chexp_drv2 comparison). NOTE(review): r[0] and r[1] below are taken from sp, not sg, so they repeat the dev_sp comparison and the t and d members of sg are never checked; presumably (*sg).t and (*sg).d were intended - confirm that ch_drv1_t has these members. */ +{ + int i; + double r[20],dev,dmax; + + r[0]=(*sp).t-(*sf).t; + r[1]=(*sp).d-(*sf).d; + + for (i=0;i<3;i++) + { + r[6*i+2]=(*sg).p[i].re-(*sf).p[i].re; + r[6*i+3]=(*sg).p[i].im-(*sf).p[i].im; + + r[6*i+4]=(*sg).pt[i].re-(*sf).pt[i].re; + r[6*i+5]=(*sg).pt[i].im-(*sf).pt[i].im; + + r[6*i+6]=(*sg).pd[i].re-(*sf).pd[i].re; + r[6*i+7]=(*sg).pd[i].im-(*sf).pd[i].im; + } + + dmax=0.0; + + for (i=0;i<20;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return 
dmax; +} + + +static double dev_sf(void) /* Returns the largest absolute difference between the two ch_drv2_t structures sf[0] and sf[1] over t, d and the real and imaginary parts of p, pt, pd, ptt, ptd and pdd (38 values); main fills sf[1] by calling chexp_drv2 after rotate_su3alg(r,X). */ +{ + int i; + double r[38],dev,dmax; + ch_drv2_t *sf1,*sf2; + + sf1=sf; + sf2=sf+1; + + r[0]=(*sf1).t-(*sf2).t; + r[1]=(*sf1).d-(*sf2).d; + + for (i=0;i<3;i++) + { + r[12*i+2]=(*sf1).p[i].re-(*sf2).p[i].re; + r[12*i+3]=(*sf1).p[i].im-(*sf2).p[i].im; + + r[12*i+4]=(*sf1).pt[i].re-(*sf2).pt[i].re; + r[12*i+5]=(*sf1).pt[i].im-(*sf2).pt[i].im; + + r[12*i+6]=(*sf1).pd[i].re-(*sf2).pd[i].re; + r[12*i+7]=(*sf1).pd[i].im-(*sf2).pd[i].im; + + r[12*i+8]=(*sf1).ptt[i].re-(*sf2).ptt[i].re; + r[12*i+9]=(*sf1).ptt[i].im-(*sf2).ptt[i].im; + + r[12*i+10]=(*sf1).ptd[i].re-(*sf2).ptd[i].re; + r[12*i+11]=(*sf1).ptd[i].im-(*sf2).ptd[i].im; + + r[12*i+12]=(*sf1).pdd[i].re-(*sf2).pdd[i].re; + r[12*i+13]=(*sf1).pdd[i].im-(*sf2).pdd[i].im; + } + + dmax=0.0; + + for (i=0;i<38;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return dmax; +} + + +int main(void) +{ + int i; + double dev,dmax1,dmax2,dmax3; + + printf("\n"); + printf("Invariance of chexp_drv2() under rotations of X\n"); + printf("-----------------------------------------------\n\n"); + + printf("Test performed on %d random matrices X\n\n",NTEST); + + rlxd_init(1,SEED); + alloc_Xu(); + + dmax1=0.0; + dmax2=0.0; + dmax3=0.0; + + for (i=0;idmax1) + dmax1=dev; + + dev=dev_sg(); + if (dev>dmax2) + dmax2=dev; + + random_su3_dble(r); + rotate_su3alg(r,X); + chexp_drv2(X,sf+1); + + dev=dev_sf(); + if (dev>dmax3) + dmax3=dev; + } + + printf ("Comparision of chexp_drv0 and chexp_drv2 = %.1e\n",dmax1); + printf ("Comparision of chexp_drv1 and chexp_drv2 = %.1e\n",dmax2); + printf ("Rotation invariance of chexp_drv2 = %.1e\n\n",dmax3); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check5.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check5.c new file mode 100644 index 0000000000000000000000000000000000000000..18bc3cc19ee49dfca5c4cf8096c4c65c62124623 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check5.c @@ -0,0 +1,387 @@ + +/******************************************************************************* +* +* File check5.c +* +* Copyright (C) 2009, 2011 Filippo Palombi, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of chexp_drv2() in the case of diagonal X +* +* This program verifies that eqs. (4.1)-(4.6) of the notes "SU(3) matrix +* functions" are satisfied by the coefficients obtained by chexp_drv2() +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "random.h" +#include "su3fcts.h" + +#define NTEST 100000 +#define SEED 8923 + +static double mu[3],xt[2],yt[2]; +static complex_dble t[2][3],x[3],y[3]; +static complex_dble xsq[3],stx[2][3],stt[2][2][3]; +static complex_dble ex[3],dex[2][3],ddex[2][2][3]; +static complex_dble df[2][3],ddf[2][2][3]; +static su3_alg_dble *X; +static ch_drv2_t *sf; + + +static void mul_vec(complex_dble *a,complex_dble *b,complex_dble *c) +{ + int i; + + for (i=0;i<3;i++) + { + c[i].re=a[i].re*b[i].re-a[i].im*b[i].im; + c[i].im=a[i].re*b[i].im+a[i].im*b[i].re; + } +} + + +static void add_vec(complex_dble z,complex_dble *a,complex_dble *b) +{ + int i; + + for (i=0;i<3;i++) + { + b[i].re+=(z.re*a[i].re-z.im*a[i].im); + b[i].im+=(z.re*a[i].im+z.im*a[i].re); + } +} + + +static void alloc_X(void) +{ + X=amalloc(1*sizeof(*X),4); + sf=amalloc(1*sizeof(*sf),4); + + error((X==NULL)||(sf==NULL),1, + "alloc_X [check5.c]","Unable to allocate matrices"); +} + + +static void set_tk(void) +{ + double r; + + t[0][0].re=0.0; + t[0][0].im=0.5; + t[0][1].re=0.0; + t[0][1].im=-0.5; + t[0][2].re=0.0; + t[0][2].im=0.0; + + r=1.0/(2.0*sqrt(3.0)); + + t[1][0].re=0.0; + t[1][0].im=r; + t[1][1].re=0.0; + t[1][1].im=r; + t[1][2].re=0.0; + t[1][2].im=-2.0*r; +} + + +static void 
random_X(void) +{ + double s; + + for (;;) + { + ranlxd(mu,2); + mu[0]=2.0*mu[0]-1.0; + mu[1]=2.0*mu[1]-1.0; + mu[2]=-mu[0]-mu[1]; + + if (fabs(mu[2])<=1.0) + break; + } + + (*X).c1=(mu[0]-mu[1])/3.0; + (*X).c2=(mu[0]-mu[2])/3.0; + (*X).c3=0.0; + (*X).c4=0.0; + (*X).c5=0.0; + (*X).c6=0.0; + (*X).c7=0.0; + (*X).c8=0.0; + + x[0].re=0.0; + x[0].im=mu[0]; + x[1].re=0.0; + x[1].im=mu[1]; + x[2].re=0.0; + x[2].im=mu[2]; + + xt[0]=x[0].im-x[1].im; + xt[1]=sqrt(3.0)*(x[0].im+x[1].im); + + s=(mu[0]*mu[0]+mu[1]*mu[1]+mu[2]*mu[2])/3.0; + + y[0].re=0.0; + y[0].im=mu[0]*mu[0]-s; + y[1].re=0.0; + y[1].im=mu[1]*mu[1]-s; + y[2].re=0.0; + y[2].im=mu[2]*mu[2]-s; + + yt[0]=y[0].im-y[1].im; + yt[1]=sqrt(3.0)*(y[0].im+y[1].im); + + ex[0].re=cos(mu[0]); + ex[0].im=sin(mu[0]); + ex[1].re=cos(mu[1]); + ex[1].im=sin(mu[1]); + ex[2].re=cos(mu[2]); + ex[2].im=sin(mu[2]); +} + + +static void diff_exp(void) +{ + int i,j; + + for (i=0;i<2;i++) + mul_vec(t[i],ex,dex[i]); + + for (i=0;i<2;i++) + { + for (j=0;j<2;j++) + mul_vec(t[i],dex[j],ddex[i][j]); + } +} + + +static void diff_fk(void) +{ + int i,j,k; + double d; + + for (i=0;i<2;i++) + { + for (k=0;k<3;k++) + { + df[i][k].re=0.5*(xt[i]*(*sf).pt[k].re+yt[i]*(*sf).pd[k].re); + df[i][k].im=0.5*(xt[i]*(*sf).pt[k].im+yt[i]*(*sf).pd[k].im); + } + } + + d=1.0/sqrt(3.0); + + for (k=0;k<3;k++) + { + ddf[0][0][k].re=0.5*((*sf).pt[k].re+d*xt[1]*(*sf).pd[k].re); + ddf[0][0][k].im=0.5*((*sf).pt[k].im+d*xt[1]*(*sf).pd[k].im); + + ddf[1][1][k].re=0.5*((*sf).pt[k].re-d*xt[1]*(*sf).pd[k].re); + ddf[1][1][k].im=0.5*((*sf).pt[k].im-d*xt[1]*(*sf).pd[k].im); + + ddf[0][1][k].re=0.5*d*xt[0]*(*sf).pd[k].re; + ddf[0][1][k].im=0.5*d*xt[0]*(*sf).pd[k].im; + + ddf[1][0][k].re=0.5*d*xt[0]*(*sf).pd[k].re; + ddf[1][0][k].im=0.5*d*xt[0]*(*sf).pd[k].im; + } + + for (i=0;i<2;i++) + { + for (j=0;j<2;j++) + { + for (k=0;k<3;k++) + { + ddf[i][j][k].re+=0.25*(xt[i]*xt[j]*(*sf).ptt[k].re+ + xt[i]*yt[j]*(*sf).ptd[k].re+ + yt[i]*xt[j]*(*sf).ptd[k].re+ + 
yt[i]*yt[j]*(*sf).pdd[k].re); + + ddf[i][j][k].im+=0.25*(xt[i]*xt[j]*(*sf).ptt[k].im+ + xt[i]*yt[j]*(*sf).ptd[k].im+ + yt[i]*xt[j]*(*sf).ptd[k].im+ + yt[i]*yt[j]*(*sf).pdd[k].im); + } + } + } +} + + +static void set_prods(void) +{ + int i,j; + + mul_vec(x,x,xsq); + + for (i=0;i<2;i++) + { + mul_vec(t[i],x,stx[i]); + + for (j=0;j<2;j++) + mul_vec(t[i],t[j],stt[i][j]); + } +} + + +static void subtract_chexp(void) +{ + int i,j,k; + complex_dble z; + + for (i=0;i<2;i++) + { + for (k=0;k<3;k++) + { + dex[i][k].re-=df[i][0].re; + dex[i][k].im-=df[i][0].im; + } + + z.re=-df[i][1].re; + z.im=-df[i][1].im; + add_vec(z,x,dex[i]); + + z.re=-df[i][2].re; + z.im=-df[i][2].im; + add_vec(z,xsq,dex[i]); + + z.re=-(*sf).p[1].re; + z.im=-(*sf).p[1].im; + add_vec(z,t[i],dex[i]); + + z.re=-2.0*(*sf).p[2].re; + z.im=-2.0*(*sf).p[2].im; + add_vec(z,stx[i],dex[i]); + + for (j=0;j<2;j++) + { + for (k=0;k<3;k++) + { + ddex[i][j][k].re-=ddf[i][j][0].re; + ddex[i][j][k].im-=ddf[i][j][0].im; + } + + z.re=-ddf[i][j][1].re; + z.im=-ddf[i][j][1].im; + add_vec(z,x,ddex[i][j]); + + z.re=-ddf[i][j][2].re; + z.im=-ddf[i][j][2].im; + add_vec(z,xsq,ddex[i][j]); + + z.re=-df[i][1].re; + z.im=-df[i][1].im; + add_vec(z,t[j],ddex[i][j]); + + z.re=-df[j][1].re; + z.im=-df[j][1].im; + add_vec(z,t[i],ddex[i][j]); + + z.re=-2.0*df[i][2].re; + z.im=-2.0*df[i][2].im; + add_vec(z,stx[j],ddex[i][j]); + + z.re=-2.0*df[j][2].re; + z.im=-2.0*df[j][2].im; + add_vec(z,stx[i],ddex[i][j]); + + z.re=-2.0*(*sf).p[2].re; + z.im=-2.0*(*sf).p[2].im; + add_vec(z,stt[i][j],ddex[i][j]); + } + } +} + + +static double dev_dex(void) +{ + int i,k; + double dev,dmax; + + dmax=0.0; + + for (i=0;i<2;i++) + { + for (k=0;k<3;k++) + { + dev=dex[i][k].re*dex[i][k].re+dex[i][k].im*dex[i][k].im; + if (dev>dmax) + dmax=dev; + } + } + + return sqrt(dmax); +} + + +static double dev_ddex(void) +{ + int i,j,k; + double dev,dmax; + + dmax=0.0; + + for (i=0;i<2;i++) + { + for (j=0;j<2;j++) + { + for (k=0;k<3;k++) + { + 
dev=ddex[i][j][k].re*ddex[i][j][k].re+ + ddex[i][j][k].im*ddex[i][j][k].im; + if (dev>dmax) + dmax=dev; + } + } + } + + return sqrt(dmax); +} + + +int main(void) +{ + int i; + double dev,dmax1,dmax2; + + printf("\n"); + printf("Check of chexp_drv2() for diagonal X\n"); + printf("------------------------------------\n\n"); + + printf("Test performed on %d random matrices X\n\n",NTEST); + + rlxd_init(1,SEED); + alloc_X(); + set_tk(); + + dmax1=0.0; + dmax2=0.0; + + for (i=0;idmax1) + dmax1=dev; + + dev=dev_ddex(); + if (dev>dmax2) + dmax2=dev; + } + + printf ("Maximal deviation of 1st derivatives = %.1e\n",dmax1); + printf ("Maximal deviation of 2nd derivatives = %.1e\n\n",dmax2); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check6.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check6.c new file mode 100644 index 0000000000000000000000000000000000000000..f9870d9cfb55bc80d5b94b1afc97f2746da40283 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check6.c @@ -0,0 +1,180 @@ + +/******************************************************************************* +* +* File check6.c +* +* Copyright (C) 2009, 2011 Filippo Palombi, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Verifies that exp(X) is in SU(3) and that exp(X)*exp(-X)=1 +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "random.h" +#include "su3fcts.h" + +#define NTEST 100000 +#define SEED 1234 + +static double mu[3]; +static su3_alg_dble *X; +static su3_dble *r,*u,*v,*w; +static const su3_dble u0={{0.0}}; +static ch_drv0_t *sp; + + +static void alloc_Xu(void) +{ + X=amalloc(1*sizeof(*X),4); + r=amalloc(4*sizeof(*r),4); + sp=amalloc(1*sizeof(*sp),4); + + 
error((X==NULL)||(r==NULL)||(sp==NULL),1, + "alloc_Xu [check6.c]","Unable to allocate matrices"); + + u=r+1; + v=r+2; + w=r+3; +} + + +static void random_Xu(void) +{ + for (;;) + { + ranlxd(mu,2); + mu[0]=2.0*mu[0]-1.0; + mu[1]=2.0*mu[1]-1.0; + mu[2]=-mu[0]-mu[1]; + + if (fabs(mu[2])<=1.0) + break; + } + + (*u)=u0; + (*u).c11.im=mu[0]; + (*u).c22.im=mu[1]; + (*u).c33.im=mu[2]; + + random_su3_dble(r); + su3xsu3(r,u,w); + su3xsu3dag(w,r,u); + + (*X).c1=((*u).c11.im-(*u).c22.im)/3.0; + (*X).c2=((*u).c11.im-(*u).c33.im)/3.0; + (*X).c3=(*u).c12.re; + (*X).c4=(*u).c12.im; + (*X).c5=(*u).c13.re; + (*X).c6=(*u).c13.im; + (*X).c7=(*u).c23.re; + (*X).c8=(*u).c23.im; +} + + +static void flip_signX(void) +{ + (*X).c1=-(*X).c1; + (*X).c2=-(*X).c2; + (*X).c3=-(*X).c3; + (*X).c4=-(*X).c4; + (*X).c5=-(*X).c5; + (*X).c6=-(*X).c6; + (*X).c7=-(*X).c7; + (*X).c8=-(*X).c8; +} + + +static double dev_uv(void) +{ + int i; + double r[18],dev,dmax; + + r[ 0]=(*u).c11.re-(*v).c11.re; + r[ 1]=(*u).c11.im-(*v).c11.im; + r[ 2]=(*u).c12.re-(*v).c12.re; + r[ 3]=(*u).c12.im-(*v).c12.im; + r[ 4]=(*u).c13.re-(*v).c13.re; + r[ 5]=(*u).c13.im-(*v).c13.im; + + r[ 6]=(*u).c21.re-(*v).c21.re; + r[ 7]=(*u).c21.im-(*v).c21.im; + r[ 8]=(*u).c22.re-(*v).c22.re; + r[ 9]=(*u).c22.im-(*v).c22.im; + r[10]=(*u).c23.re-(*v).c23.re; + r[11]=(*u).c23.im-(*v).c23.im; + + r[12]=(*u).c31.re-(*v).c31.re; + r[13]=(*u).c31.im-(*v).c31.im; + r[14]=(*u).c32.re-(*v).c32.re; + r[15]=(*u).c32.im-(*v).c32.im; + r[16]=(*u).c33.re-(*v).c33.re; + r[17]=(*u).c33.im-(*v).c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return dmax; +} + + +int main(void) +{ + int i; + double dev,dmax1,dmax2; + + printf("\n"); + printf("Simple checks of exp(X) as calculated by chexp_drv0()\n"); + printf("-----------------------------------------------------\n\n"); + + printf("Test performed on %d random matrices X\n",NTEST); + + rlxd_init(1,SEED); + alloc_Xu(); + + dmax1=0.0; + dmax2=0.0; + + for 
(i=0;idmax1) + dmax1=dev; + + flip_signX(); + chexp_drv0(X,sp); + ch2mat((*sp).p,X,w); + su3xsu3(u,w,v); + (*u)=u0; + (*u).c11.re=1.0; + (*u).c22.re=1.0; + (*u).c33.re=1.0; + + dev=dev_uv(); + if (dev>dmax2) + dmax2=dev; + } + + printf ("Maximal deviation of exp(X) from SU(3) = %.1e\n",dmax1); + printf ("Maximal deviation of exp(X)*exp(-X) from 1 = %.1e\n\n",dmax2); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check7.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check7.c new file mode 100644 index 0000000000000000000000000000000000000000..afff5a3a1285506e3e2410e3a13984bb45ad577a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check7.c @@ -0,0 +1,170 @@ + +/******************************************************************************* +* +* File check7.c +* +* Copyright (C) 2009, 2011 Martin Luescher, Filippo Palombi +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of expXsu3() using the spectral representation of X +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "random.h" +#include "su3fcts.h" + +#define NTEST 50000 +#define SEED 38579 + +static double mu[3],t,d,eps; +static su3_alg_dble *X; +static su3_dble *r,*u,*w,*y,*z; +static const su3_dble u0={{0.0}}; + + +static void alloc_Xu(void) +{ + X=amalloc(1*sizeof(*X),4); + r=amalloc(5*sizeof(*r),4); + + error((X==NULL)||(r==NULL),1, + "alloc_Xu [check7.c]","Unable to allocate matrices"); + + u=r+1; + w=r+2; + y=r+3; + z=r+4; +} + + +static void random_Xu(void) +{ + for (;;) + { + ranlxd(mu,2); + mu[0]=2.0*mu[0]-1.0; + mu[1]=2.0*mu[1]-1.0; + mu[2]=-mu[0]-mu[1]; + + if (fabs(mu[2])<=1.0) + break; + } + + t=0.5*(mu[0]*mu[0]+mu[1]*mu[1]+mu[2]*mu[2]); + d=mu[0]*mu[1]*mu[2]; + + (*u)=u0; + 
(*u).c11.im=mu[0]; + (*u).c22.im=mu[1]; + (*u).c33.im=mu[2]; + + random_su3_dble(r); + su3xsu3(r,u,w); + su3xsu3dag(w,r,u); + + (*X).c1=((*u).c11.im-(*u).c22.im)/3.0; + (*X).c2=((*u).c11.im-(*u).c33.im)/3.0; + (*X).c3=(*u).c12.re; + (*X).c4=(*u).c12.im; + (*X).c5=(*u).c13.re; + (*X).c6=(*u).c13.im; + (*X).c7=(*u).c23.re; + (*X).c8=(*u).c23.im; + + ranlxd(&eps,1); + eps*=20.0; + + (*u)=u0; + (*u).c11.re=cos(eps*mu[0]); + (*u).c22.re=cos(eps*mu[1]); + (*u).c33.re=cos(eps*mu[2]); + (*u).c11.im=sin(eps*mu[0]); + (*u).c22.im=sin(eps*mu[1]); + (*u).c33.im=sin(eps*mu[2]); + + su3xsu3(r,u,w); + su3xsu3dag(w,r,u); + + random_su3_dble(z); + su3xsu3(u,z,w); +} + + +static double dev_yw(void) +{ + int i; + double r[18],dev,dmax; + + r[ 0]=(*y).c11.re-(*w).c11.re; + r[ 1]=(*y).c11.im-(*w).c11.im; + r[ 2]=(*y).c12.re-(*w).c12.re; + r[ 3]=(*y).c12.im-(*w).c12.im; + r[ 4]=(*y).c13.re-(*w).c13.re; + r[ 5]=(*y).c13.im-(*w).c13.im; + + r[ 6]=(*y).c21.re-(*w).c21.re; + r[ 7]=(*y).c21.im-(*w).c21.im; + r[ 8]=(*y).c22.re-(*w).c22.re; + r[ 9]=(*y).c22.im-(*w).c22.im; + r[10]=(*y).c23.re-(*w).c23.re; + r[11]=(*y).c23.im-(*w).c23.im; + + r[12]=(*y).c31.re-(*w).c31.re; + r[13]=(*y).c31.im-(*w).c31.im; + r[14]=(*y).c32.re-(*w).c32.re; + r[15]=(*y).c32.im-(*w).c32.im; + r[16]=(*y).c33.re-(*w).c33.re; + r[17]=(*y).c33.im-(*w).c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return dmax; +} + + +int main(void) +{ + int i; + double dev,dmax; + + printf("\n"); + printf("Check of expXsu3()\n"); + printf("------------------\n\n"); + + printf("Test performed on %d random matrices X and u using the\n",NTEST); + printf("spectral representation of X\n\n"); + + rlxd_init(1,SEED); + alloc_Xu(); + + dmax=0.0; + + for (i=0;idmax) + dmax=dev; + } + + printf ("Maximal deviation of exp(X)*u = %.1e\n\n",dmax); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check8.c 
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check8.c new file mode 100644 index 0000000000000000000000000000000000000000..11b3e64428a4cb065e3f18fd526d6cf6c04d7d12 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/check8.c @@ -0,0 +1,558 @@ + +/******************************************************************************* +* +* File check8.c +* +* Copyright (C) 2009, 2010, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the programs cm3x3_zero(),...,cm3x3_lc2() +* +*******************************************************************************/ + +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "random.h" +#include "su3fcts.h" + +#define NTEST 1000 +#define SEED 376 + +static double rs,*rus,*rvs,*rws,*rzs; +static complex_dble *cs; +static su3_dble *us,*vs,*ws,*zs; +static su3_dble ud0={{0.0}}; +static complex_dble trs ALIGNED16; + + +static void alloc_matrices(void) +{ + rus=amalloc(90*sizeof(*rus),4); + cs=amalloc(3*sizeof(*cs),4); + us=amalloc(5*sizeof(*us),4); + + error((rus==NULL)||(cs==NULL)||(us==NULL),1, + "alloc_matrices [check8.c]","Unable to allocate matrices"); + + rvs=rus+36; + rws=rvs+18; + rzs=rws+18; + + vs=us+2; + ws=vs+1; + zs=ws+1; +} + + +static void mat2vec(su3_dble *u,double *ru) +{ + ru[ 0]=(*u).c11.re; + ru[ 1]=(*u).c11.im; + ru[ 2]=(*u).c12.re; + ru[ 3]=(*u).c12.im; + ru[ 4]=(*u).c13.re; + ru[ 5]=(*u).c13.im; + + ru[ 6]=(*u).c21.re; + ru[ 7]=(*u).c21.im; + ru[ 8]=(*u).c22.re; + ru[ 9]=(*u).c22.im; + ru[10]=(*u).c23.re; + ru[11]=(*u).c23.im; + + ru[12]=(*u).c31.re; + ru[13]=(*u).c31.im; + ru[14]=(*u).c32.re; + ru[15]=(*u).c32.im; + ru[16]=(*u).c33.re; + ru[17]=(*u).c33.im; +} + + +static void vec2mat(double *ru,su3_dble *u) +{ + (*u).c11.re=ru[ 0]; + (*u).c11.im=ru[ 1]; + (*u).c12.re=ru[ 2]; + (*u).c12.im=ru[ 3]; + (*u).c13.re=ru[ 4]; + 
/*
 * Computes the hermitian conjugate of a 3x3 complex matrix stored as an
 * array of 18 doubles (row-major, each entry as real part followed by
 * imaginary part).
 *
 *   ru   input matrix (18 doubles)
 *   rv   output matrix (18 doubles), rv[i][j]=conj(ru[j][i])
 *
 * The result is first assembled in a local buffer and copied to rv
 * afterwards, so that the function also works when ru and rv coincide
 * or overlap. Writing directly to rv, as was done previously, overwrites
 * entries of ru that are still needed whenever the two arrays alias.
 */
static void dag_vec(double *ru,double *rv)
{
   int i,j,k;
   double tmp[18];

   for (i=0;i<3;i++)
   {
      for (j=0;j<3;j++)
      {
         /* transpose the indices and flip the sign of the imaginary part */
         tmp[6*i+2*j  ]= ru[6*j+2*i  ];
         tmp[6*i+2*j+1]=-ru[6*j+2*i+1];
      }
   }

   for (k=0;k<18;k++)
      rv[k]=tmp[k];
}
r[14]=(*u).c32.re-(*v).c32.re; + r[15]=(*u).c32.im-(*v).c32.im; + r[16]=(*u).c33.re-(*v).c33.re; + r[17]=(*u).c33.im-(*v).c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return dmax; +} + + +int main(void) +{ + int i; + double dev,dmax[15]; + + printf("\n"); + printf("Check of the programs cm3x3_zero(),...,cm3x3_lc2()\n"); + printf("--------------------------------------------------\n\n"); + + printf("Test performed on %d random matrices\n\n",NTEST); + + rlxd_init(1,SEED); + alloc_matrices(); + + for (i=0;i<15;i++) + dmax[i]=0.0; + + for (i=0;idmax[0]) + dmax[0]=dev; + + cm3x3_unity(1,vs); + (*vs).c11.re-=1.0; + (*vs).c22.re-=1.0; + (*vs).c33.re-=1.0; + dev=dev_uv(&ud0,vs); + vec2mat(rvs,vs); + if (dev>dmax[1]) + dmax[1]=dev; + + mat2vec(us,rus); + mat2vec(ws,rws); + vec2mat(rus,ws); + cm3x3_assign(1,us,vs); + dev=dev_uv(ws,vs); + if (dev>dmax[2]) + dmax[2]=dev; + dev=dev_uv(ws,us); + if (dev>dmax[2]) + dmax[2]=dev; + vec2mat(rvs,vs); + vec2mat(rws,ws); + + mat2vec(us,rus); + mat2vec(vs,rvs); + mat2vec(ws,rws); + cm3x3_swap(1,us,vs); + vec2mat(rus,ws); + dev=dev_uv(ws,vs); + if (dev>dmax[14]) + dmax[14]=dev; + vec2mat(rvs,ws); + dev=dev_uv(ws,us); + if (dev>dmax[14]) + dmax[14]=dev; + vec2mat(rus,us); + vec2mat(rvs,vs); + vec2mat(rws,ws); + + mat2vec(us,rus); + dag_vec(rus,rzs); + vec2mat(rzs,ws); + cm3x3_dagger(us,vs); + dev=dev_uv(vs,ws); + if (dev>dmax[3]) + dmax[3]=dev; + vec2mat(rus,ws); + dev=dev_uv(us,ws); + if (dev>dmax[3]) + dmax[3]=dev; + + vec2mat(rzs,ws); + cm3x3_dagger(us,us); + dev=dev_uv(us,ws); + if (dev>dmax[3]) + dmax[3]=dev; + vec2mat(rus,us); + vec2mat(rvs,vs); + vec2mat(rws,ws); + + cm3x3_tr(us,us+1,&trs); + su3xsu3(us,us+1,vs); + trs.re-=((*vs).c11.re+(*vs).c22.re+(*vs).c33.re); + trs.im-=((*vs).c11.im+(*vs).c22.im+(*vs).c33.im); + dev=fabs(trs.re)+fabs(trs.im); + vec2mat(rvs,vs); + if (dev>dmax[4]) + dmax[4]=dev; + + cm3x3_tr(us,us,&trs); + su3xsu3(us,us,vs); + 
trs.re-=((*vs).c11.re+(*vs).c22.re+(*vs).c33.re); + trs.im-=((*vs).c11.im+(*vs).c22.im+(*vs).c33.im); + dev=fabs(trs.re)+fabs(trs.im); + vec2mat(rvs,vs); + if (dev>dmax[4]) + dmax[4]=dev; + + cm3x3_retr(us,us+1,&trs.re); + su3xsu3(us,us+1,vs); + trs.re-=((*vs).c11.re+(*vs).c22.re+(*vs).c33.re); + dev=fabs(trs.re); + vec2mat(rvs,vs); + if (dev>dmax[5]) + dmax[5]=dev; + + cm3x3_retr(us,us,&trs.re); + su3xsu3(us,us,vs); + trs.re-=((*vs).c11.re+(*vs).c22.re+(*vs).c33.re); + dev=fabs(trs.re); + vec2mat(rvs,vs); + if (dev>dmax[5]) + dmax[5]=dev; + + mat2vec(us,rus); + mat2vec(vs,rvs); + mat2vec(zs,rzs); + add_vec(rus,rvs,rws); + vec2mat(rws,zs); + cm3x3_add(us,vs); + dev=dev_uv(vs,zs); + vec2mat(rvs,vs); + vec2mat(rzs,zs); + if (dev>dmax[6]) + dmax[6]=dev; + + mat2vec(us,rvs); + add_vec(rus,rvs,rws); + vec2mat(rws,zs); + cm3x3_add(us,us); + dev=dev_uv(us,zs); + vec2mat(rus,us); + vec2mat(rzs,zs); + if (dev>dmax[6]) + dmax[6]=dev; + + mat2vec(ws,rws); + mat2vec(zs,rzs); + su3xsu3(us,vs,zs); + mat2vec(zs,rvs); + add_vec(rvs,rws,rus); + vec2mat(rus,zs); + cm3x3_mul_add(us,vs,ws); + vec2mat(rws,ws); + cm3x3_mul_add(us,vs,ws); + dev=dev_uv(ws,zs); + vec2mat(rws,ws); + vec2mat(rzs,zs); + if (dev>dmax[7]) + dmax[7]=dev; + + mat2vec(vs,rvs); + mat2vec(zs,rzs); + su3xsu3(us,vs,zs); + mat2vec(zs,rws); + add_vec(rws,rvs,rus); + vec2mat(rus,zs); + cm3x3_mul_add(us,vs,vs); + vec2mat(rvs,vs); + cm3x3_mul_add(us,vs,vs); + dev=dev_uv(vs,zs); + vec2mat(rvs,vs); + vec2mat(rzs,zs); + if (dev>dmax[7]) + dmax[7]=dev; + + mat2vec(ws,rws); + mat2vec(zs,rzs); + mat2vec(us,rus); + mulr_vec(rs,rus,rvs); + vec2mat(rvs,zs); + cm3x3_mulr(&rs,us,ws); + dev=dev_uv(ws,zs); + if (dev>dmax[8]) + dmax[8]=dev; + + cm3x3_mulr(&rs,us,us); + dev=dev_uv(us,zs); + vec2mat(rus,us); + vec2mat(rws,ws); + vec2mat(rzs,zs); + if (dev>dmax[8]) + dmax[8]=dev; + + mat2vec(us,rus); + mat2vec(vs,rvs); + mulr_vec(rs,rus,rws); + add_vec(rws,rvs,rzs); + mat2vec(ws,rws); + vec2mat(rzs,ws); + cm3x3_mulr_add(&rs,us,vs); + 
dev=dev_uv(vs,ws); + vec2mat(rvs,vs); + vec2mat(rws,ws); + if (dev>dmax[9]) + dmax[9]=dev; + + mulr_vec(rs,rus,rws); + add_vec(rus,rws,rvs); + mat2vec(ws,rws); + vec2mat(rvs,ws); + cm3x3_mulr_add(&rs,us,us); + dev=dev_uv(us,ws); + vec2mat(rus,us); + vec2mat(rws,ws); + if (dev>dmax[9]) + dmax[9]=dev; + + mat2vec(ws,rws); + mat2vec(zs,rzs); + mat2vec(us,rus); + mulc_vec(cs[0],rus,rvs); + vec2mat(rvs,zs); + cm3x3_mulc(cs,us,ws); + dev=dev_uv(ws,zs); + if (dev>dmax[10]) + dmax[10]=dev; + + cm3x3_mulc(cs,us,us); + dev=dev_uv(us,zs); + vec2mat(rus,us); + vec2mat(rws,ws); + vec2mat(rzs,zs); + if (dev>dmax[10]) + dmax[10]=dev; + + mat2vec(us,rus); + mat2vec(vs,rvs); + mulc_vec(cs[0],rus,rws); + add_vec(rws,rvs,rzs); + mat2vec(ws,rws); + vec2mat(rzs,ws); + cm3x3_mulc_add(cs,us,vs); + dev=dev_uv(vs,ws); + vec2mat(rvs,vs); + vec2mat(rws,ws); + if (dev>dmax[11]) + dmax[11]=dev; + + mulc_vec(cs[0],rus,rws); + add_vec(rus,rws,rvs); + mat2vec(ws,rws); + vec2mat(rvs,ws); + cm3x3_mulc_add(cs,us,us); + dev=dev_uv(us,ws); + vec2mat(rus,us); + vec2mat(rws,ws); + if (dev>dmax[11]) + dmax[11]=dev; + + mat2vec(us,rus); + mat2vec(vs,rvs); + mat2vec(ws,rws); + mulc_vec(cs[1],rus,rzs); + vec2mat(rzs,ws); + (*ws).c11.re+=cs[0].re; + (*ws).c11.im+=cs[0].im; + (*ws).c22.re+=cs[0].re; + (*ws).c22.im+=cs[0].im; + (*ws).c33.re+=cs[0].re; + (*ws).c33.im+=cs[0].im; + cm3x3_lc1(cs,us,vs); + dev=dev_uv(vs,ws); + if (dev>dmax[12]) + dmax[12]=dev; + + cm3x3_lc1(cs,us,us); + dev=dev_uv(us,ws); + vec2mat(rus,us); + vec2mat(rvs,vs); + vec2mat(rws,ws); + if (dev>dmax[12]) + dmax[12]=dev; + + mat2vec(us,rus); + mat2vec(us+1,rus+18); + mulc_vec(cs[1],rus,rvs); + mulc_vec(cs[2],rus+18,rws); + add_vec(rvs,rws,rzs); + vec2mat(rzs,ws); + (*ws).c11.re+=cs[0].re; + (*ws).c11.im+=cs[0].im; + (*ws).c22.re+=cs[0].re; + (*ws).c22.im+=cs[0].im; + (*ws).c33.re+=cs[0].re; + (*ws).c33.im+=cs[0].im; + cm3x3_lc2(cs,us,vs); + dev=dev_uv(vs,ws); + if (dev>dmax[13]) + dmax[13]=dev; + + cm3x3_lc2(cs,us,us); + vec2mat(rus,us); + 
cm3x3_lc2(cs,us,us); + dev=dev_uv(us,ws); + if (dev>dmax[13]) + dmax[13]=dev; + } + + printf("Maximal deviation of cm3x3_zero() = %.1e\n",dmax[0]); + printf("Maximal deviation of cm3x3_unity() = %.1e\n",dmax[1]); + printf("Maximal deviation of cm3x3_assign() = %.1e\n",dmax[2]); + printf("Maximal deviation of cm3x3_swap() = %.1e\n",dmax[14]); + printf("Maximal deviation of cm3x3_dagger() = %.1e\n",dmax[3]); + printf("Maximal deviation of cm3x3_tr() = %.1e\n",dmax[4]); + printf("Maximal deviation of cm3x3_retr() = %.1e\n",dmax[5]); + printf("Maximal deviation of cm3x3_add() = %.1e\n",dmax[6]); + printf("Maximal deviation of cm3x3_mul_add() = %.1e\n",dmax[7]); + printf("Maximal deviation of cm3x3_mulr() = %.1e\n",dmax[8]); + printf("Maximal deviation of cm3x3_mulr_add() = %.1e\n",dmax[9]); + printf("Maximal deviation of cm3x3_mulc() = %.1e\n",dmax[10]); + printf("Maximal deviation of cm3x3_mulc_add() = %.1e\n",dmax[11]); + printf("Maximal deviation of cm3x3_lc1() = %.1e\n",dmax[12]); + printf("Maximal deviation of cm3x3_lc2() = %.1e\n\n",dmax[13]); + + for (i=1;i<15;i++) + { + if (dmax[i]>dmax[0]) + dmax[0]=dmax[i]; + } + + printf("Maximal deviation (all tests) = %.1e\n\n",dmax[0]); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..ef1ee2e14f8819d254752766eaf0952441064234 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time1.c @@ -0,0 +1,385 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2005, 2008, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of the SU(3) x SU(3)-vector multiplication (single-precision programs) +* 
+*******************************************************************************/ + +#include +#include +#include +#include +#include "su3.h" +#include "random.h" +#include "su3fcts.h" + +static su3 u[4] ALIGNED16; +static su3_vector s[8],r[8],t[8] ALIGNED16; + +#if (defined x64) +#if (defined AVX) +#include "avx.h" + +#define _avx_vector_quartet_load(s) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm6 \n\t" \ + "vmovaps %2, %%xmm7 \n\t" \ + "vmovaps %4, %%xmm8" \ + : \ + : \ + "m" ((s[0]).c1), \ + "m" ((s[0]).c2), \ + "m" ((s[0]).c3), \ + "m" ((s[1]).c1), \ + "m" ((s[1]).c2), \ + "m" ((s[1]).c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %2, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((s[2]).c1), \ + "m" ((s[2]).c2), \ + "m" ((s[2]).c3), \ + "m" ((s[3]).c1), \ + "m" ((s[3]).c2), \ + "m" ((s[3]).c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vshufps $0xe4, %%ymm7, %%ymm6, %%ymm0 \n\t" \ + "vshufps $0x4e, %%ymm8, %%ymm6, %%ymm1 \n\t" \ + "vshufps $0xe4, %%ymm8, %%ymm7, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +#define _avx_vector_quartet_store_up(r) \ +__asm__ __volatile__ ("vshufps $0x44, %%ymm4, %%ymm3, %%ymm9 \n\t" \ + "vshufps $0xe4, %%ymm3, %%ymm5, %%ymm10 \n\t" \ + "vshufps $0xee, %%ymm5, %%ymm4, %%ymm11" \ + : \ + : \ + : \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vmovaps %%xmm9, %0 \n\t" \ + "vmovaps %%xmm10, %2 \n\t" \ + "vmovaps %%xmm11, %4" \ + : \ + : \ + "m" ((r[0]).c1), \ + "m" ((r[0]).c2), \ + "m" ((r[0]).c3), \ + "m" ((r[1]).c1), \ + "m" ((r[1]).c2), \ + "m" ((r[1]).c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm9, %0 \n\t" \ + "vextractf128 $0x1, %%ymm10, %2 \n\t" \ + "vextractf128 $0x1, %%ymm11, %4" \ + : \ + : \ + "m" ((r[2]).c1), \ + "m" ((r[2]).c2), \ + "m" ((r[2]).c3), \ + "m" ((r[3]).c1), \ + "m" ((r[3]).c2), \ + "m" ((r[3]).c3)) + + +static void fast_multiply(su3 
*ua,su3_vector *sa,su3_vector *ra) +{ + _avx_vector_quartet_load(sa); + _avx_su3_pair_multiply(ua[0],ua[1]); + _avx_vector_quartet_store_up(ra); + ua+=2; + sa+=4; + ra+=4; + _avx_vector_quartet_load(sa); + _avx_su3_pair_multiply(ua[0],ua[1]); + _avx_vector_quartet_store_up(ra); +} + + +static void fast_inverse_multiply(su3 *ua,su3_vector *sa,su3_vector *ra) +{ + _avx_vector_quartet_load(sa); + _avx_su3_pair_inverse_multiply(ua[0],ua[1]); + _avx_vector_quartet_store_up(ra); + ua+=2; + sa+=4; + ra+=4; + _avx_vector_quartet_load(sa); + _avx_su3_pair_inverse_multiply(ua[0],ua[1]); + _avx_vector_quartet_store_up(ra); +} + + +static void fast_mixed_multiply(su3 *ua,su3_vector *sa,su3_vector *ra) +{ + _avx_vector_quartet_load(sa); + _avx_su3_pair_mixed_multiply(ua[0],ua[1]); + _avx_vector_quartet_store_up(ra); + ua+=2; + sa+=4; + ra+=4; + _avx_vector_quartet_load(sa); + _avx_su3_pair_mixed_multiply(ua[0],ua[1]); + _avx_vector_quartet_store_up(ra); +} + + +static void slow_mixed_multiply(su3 *ua,su3_vector *sa,su3_vector *ra) +{ + _su3_multiply((*(ra )),(*(ua )),(*(sa ))); + _su3_multiply((*(ra+1)),(*(ua )),(*(sa+1))); + _su3_inverse_multiply((*(ra+2)),(*(ua+1)),(*(sa+2))); + _su3_inverse_multiply((*(ra+3)),(*(ua+1)),(*(sa+3))); + _su3_multiply((*(ra+4)),(*(ua+2)),(*(sa+4))); + _su3_multiply((*(ra+5)),(*(ua+2)),(*(sa+5))); + _su3_inverse_multiply((*(ra+6)),(*(ua+3)),(*(sa+6))); + _su3_inverse_multiply((*(ra+7)),(*(ua+3)),(*(sa+7))); +} + +#else +#include "sse2.h" + +#define _su3_fast_multiply(r1,r2,u,s1,s2) \ + _sse_pair_load(s1,s2); \ + _sse_su3_multiply(u); \ + _sse_pair_store_up(r1,r2) + +#define _su3_fast_inverse_multiply(r1,r2,u,s1,s2) \ + _sse_pair_load(s1,s2); \ + _sse_su3_inverse_multiply(u); \ + _sse_pair_store_up(r1,r2) + + +static void fast_multiply(su3 *ua,su3_vector *sa,su3_vector *ra) +{ + _su3_fast_multiply((*(ra )),(*(ra+1)),(*(ua )),(*(sa )),(*(sa+1))); + _su3_fast_multiply((*(ra+2)),(*(ra+3)),(*(ua+1)),(*(sa+2)),(*(sa+3))); + 
_su3_fast_multiply((*(ra+4)),(*(ra+5)),(*(ua+2)),(*(sa+4)),(*(sa+5))); + _su3_fast_multiply((*(ra+6)),(*(ra+7)),(*(ua+3)),(*(sa+6)),(*(sa+7))); +} + + +static void fast_inverse_multiply(su3 *ua,su3_vector *sa,su3_vector *ra) +{ + _su3_fast_inverse_multiply((*(ra )),(*(ra+1)),(*(ua )),(*(sa )),(*(sa+1))); + _su3_fast_inverse_multiply((*(ra+2)),(*(ra+3)),(*(ua+1)),(*(sa+2)),(*(sa+3))); + _su3_fast_inverse_multiply((*(ra+4)),(*(ra+5)),(*(ua+2)),(*(sa+4)),(*(sa+5))); + _su3_fast_inverse_multiply((*(ra+6)),(*(ra+7)),(*(ua+3)),(*(sa+6)),(*(sa+7))); +} + +#endif + +static void slow_inverse_multiply(su3 *ua,su3_vector *sa,su3_vector *ra) +{ + _su3_inverse_multiply((*(ra )),(*(ua )),(*(sa ))); + _su3_inverse_multiply((*(ra+1)),(*(ua )),(*(sa+1))); + _su3_inverse_multiply((*(ra+2)),(*(ua+1)),(*(sa+2))); + _su3_inverse_multiply((*(ra+3)),(*(ua+1)),(*(sa+3))); + _su3_inverse_multiply((*(ra+4)),(*(ua+2)),(*(sa+4))); + _su3_inverse_multiply((*(ra+5)),(*(ua+2)),(*(sa+5))); + _su3_inverse_multiply((*(ra+6)),(*(ua+3)),(*(sa+6))); + _su3_inverse_multiply((*(ra+7)),(*(ua+3)),(*(sa+7))); +} + +#endif + +static void slow_multiply(su3 *ua,su3_vector *sa,su3_vector *ra) +{ + _su3_multiply((*(ra )),(*(ua )),(*(sa ))); + _su3_multiply((*(ra+1)),(*(ua )),(*(sa+1))); + _su3_multiply((*(ra+2)),(*(ua+1)),(*(sa+2))); + _su3_multiply((*(ra+3)),(*(ua+1)),(*(sa+3))); + _su3_multiply((*(ra+4)),(*(ua+2)),(*(sa+4))); + _su3_multiply((*(ra+5)),(*(ua+2)),(*(sa+5))); + _su3_multiply((*(ra+6)),(*(ua+3)),(*(sa+6))); + _su3_multiply((*(ra+7)),(*(ua+3)),(*(sa+7))); +} + + +int main(void) +{ + int k,n,count; + double t1,t2,dt; +#if (defined x64) + double delta,diff,norm; +#endif + + printf("\n"); + printf("Time per single-precision SU(3) x SU(3)-vector multiplication\n"); + printf("-------------------------------------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n"); +#endif + + 
printf("Measurement made with all data in cache\n\n"); + + rlxs_init(0,123456); + + for (k=0;k<4;k++) + random_su3(u+k); + + gauss((float*)(s),48); + gauss((float*)(r),48); + gauss((float*)(t),48); + +#if (defined x64) + + n=(int)(1.0e6); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;countdelta) + delta=diff; + } + +#if (defined AVX) + printf("||U*w_AVX-U*w_FPU||<= %.1e*||w||\n",delta); +#else + printf("||U*w_SSE-U*w_FPU||<= %.1e*||w||\n",delta); +#endif + + fast_inverse_multiply(u,s,r); + slow_inverse_multiply(u,s,t); + delta=0.0; + + for (k=0;k<8;k++) + { + _vector_sub_assign(r[k],t[k]); + diff=(double)(_vector_prod_re(r[k],r[k])); + norm=(double)(_vector_prod_re(s[k],s[k])); + diff=sqrt(diff/norm); + if (diff>delta) + delta=diff; + } + +#if (defined AVX) + printf("||U^dag*w_AVX-U^dag*w_FPU||<= %.1e*||w||\n",delta); +#else + printf("||U^dag*w_SSE-U^dag*w_FPU||<= %.1e*||w||\n",delta); +#endif + +#if (defined AVX) + + fast_mixed_multiply(u,s,r); + slow_mixed_multiply(u,s,t); + delta=0.0; + + for (k=0;k<8;k++) + { + _vector_sub_assign(r[k],t[k]); + diff=(double)(_vector_prod_re(r[k],r[k])); + norm=(double)(_vector_prod_re(s[k],s[k])); + diff=sqrt(diff/norm); + if (diff>delta) + delta=diff; + } + + printf("||U/U^dag*w_AVX-U/U^dag*w_FPU||<= %.1e*||w||\n",delta); + +#endif +#endif + + exit(0); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time2.c new file mode 100644 index 0000000000000000000000000000000000000000..de177790645e8f7d061c1be2c24d2724bda10540 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time2.c @@ -0,0 +1,263 @@ + +/******************************************************************************* +* +* File time2.c +* +* Copyright (C) 2005, 2008, 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU 
General Public +* License (GPL) +* +* Timing of the SU(3) x SU(3)-vector multiplication (double-precision programs) +* +*******************************************************************************/ + +#include +#include +#include +#include +#include "su3.h" +#include "random.h" +#include "su3fcts.h" + +static su3_dble u[4] ALIGNED16; +static su3_vector_dble s[8],r[8],t[8] ALIGNED16; + +#if (defined x64) +#if (defined AVX) +#include "avx.h" + +#define _su3_fast_multiply(r1,r2,u,s1,s2) \ + _avx_pair_load_dble(s1,s2); \ + _avx_su3_multiply_pair_dble(u); \ + _avx_pair_store_up_dble(r1,r2) + +#define _su3_fast_inverse_multiply(r1,r2,u,s1,s2) \ + _avx_pair_load_dble(s1,s2); \ + _avx_su3_inverse_multiply_pair_dble(u); \ + _avx_pair_store_up_dble(r1,r2) + +static void fast_multiply(su3_dble *ua,su3_vector_dble *sa, + su3_vector_dble *ra) +{ + _su3_fast_multiply((*(ra )),(*(ra+1)),(*(ua )),(*(sa )),(*(sa+1))); + _su3_fast_multiply((*(ra+2)),(*(ra+3)),(*(ua+1)),(*(sa+2)),(*(sa+3))); + _su3_fast_multiply((*(ra+4)),(*(ra+5)),(*(ua+2)),(*(sa+4)),(*(sa+5))); + _su3_fast_multiply((*(ra+6)),(*(ra+7)),(*(ua+3)),(*(sa+6)),(*(sa+7))); +} + + +static void fast_inverse_multiply(su3_dble *ua,su3_vector_dble *sa, + su3_vector_dble *ra) +{ + _su3_fast_inverse_multiply((*(ra )),(*(ra+1)),(*(ua )),(*(sa )),(*(sa+1))); + _su3_fast_inverse_multiply((*(ra+2)),(*(ra+3)),(*(ua+1)),(*(sa+2)),(*(sa+3))); + _su3_fast_inverse_multiply((*(ra+4)),(*(ra+5)),(*(ua+2)),(*(sa+4)),(*(sa+5))); + _su3_fast_inverse_multiply((*(ra+6)),(*(ra+7)),(*(ua+3)),(*(sa+6)),(*(sa+7))); +} + +#else +#include "sse2.h" + +#define _su3_fast_multiply(r,u,s) \ + _sse_load_dble(s); \ + _sse_su3_multiply_dble(u); \ + _sse_store_up_dble(r) + +#define _su3_fast_inverse_multiply(r,u,s) \ + _sse_load_dble(s); \ + _sse_su3_inverse_multiply_dble(u); \ + _sse_store_up_dble(r) + + +static void fast_multiply(su3_dble *ua,su3_vector_dble *sa, + su3_vector_dble *ra) +{ + _su3_fast_multiply((*(ra )),(*(ua )),(*(sa ))); + 
_su3_fast_multiply((*(ra+1)),(*(ua )),(*(sa+1))); + _su3_fast_multiply((*(ra+2)),(*(ua+1)),(*(sa+2))); + _su3_fast_multiply((*(ra+3)),(*(ua+1)),(*(sa+3))); + _su3_fast_multiply((*(ra+4)),(*(ua+2)),(*(sa+4))); + _su3_fast_multiply((*(ra+5)),(*(ua+2)),(*(sa+5))); + _su3_fast_multiply((*(ra+6)),(*(ua+3)),(*(sa+6))); + _su3_fast_multiply((*(ra+7)),(*(ua+3)),(*(sa+7))); +} + + +static void fast_inverse_multiply(su3_dble *ua,su3_vector_dble *sa, + su3_vector_dble *ra) +{ + _su3_fast_inverse_multiply((*(ra )),(*(ua )),(*(sa ))); + _su3_fast_inverse_multiply((*(ra+1)),(*(ua )),(*(sa+1))); + _su3_fast_inverse_multiply((*(ra+2)),(*(ua+1)),(*(sa+2))); + _su3_fast_inverse_multiply((*(ra+3)),(*(ua+1)),(*(sa+3))); + _su3_fast_inverse_multiply((*(ra+4)),(*(ua+2)),(*(sa+4))); + _su3_fast_inverse_multiply((*(ra+5)),(*(ua+2)),(*(sa+5))); + _su3_fast_inverse_multiply((*(ra+6)),(*(ua+3)),(*(sa+6))); + _su3_fast_inverse_multiply((*(ra+7)),(*(ua+3)),(*(sa+7))); +} + +#endif + +static void slow_inverse_multiply(su3_dble *ua,su3_vector_dble *sa, + su3_vector_dble *ra) +{ + _su3_inverse_multiply((*(ra )),(*(ua )),(*(sa ))); + _su3_inverse_multiply((*(ra+1)),(*(ua )),(*(sa+1))); + _su3_inverse_multiply((*(ra+2)),(*(ua+1)),(*(sa+2))); + _su3_inverse_multiply((*(ra+3)),(*(ua+1)),(*(sa+3))); + _su3_inverse_multiply((*(ra+4)),(*(ua+2)),(*(sa+4))); + _su3_inverse_multiply((*(ra+5)),(*(ua+2)),(*(sa+5))); + _su3_inverse_multiply((*(ra+6)),(*(ua+3)),(*(sa+6))); + _su3_inverse_multiply((*(ra+7)),(*(ua+3)),(*(sa+7))); +} + +#endif + +static void slow_multiply(su3_dble *ua,su3_vector_dble *sa, + su3_vector_dble *ra) +{ + _su3_multiply((*(ra )),(*(ua )),(*(sa ))); + _su3_multiply((*(ra+1)),(*(ua )),(*(sa+1))); + _su3_multiply((*(ra+2)),(*(ua+1)),(*(sa+2))); + _su3_multiply((*(ra+3)),(*(ua+1)),(*(sa+3))); + _su3_multiply((*(ra+4)),(*(ua+2)),(*(sa+4))); + _su3_multiply((*(ra+5)),(*(ua+2)),(*(sa+5))); + _su3_multiply((*(ra+6)),(*(ua+3)),(*(sa+6))); + _su3_multiply((*(ra+7)),(*(ua+3)),(*(sa+7))); +} + + 
+int main(void) +{ + int k,n,count; + double t1,t2,dt; +#if (defined x64) + double delta,diff,norm; +#endif + + printf("\n"); + printf("Time per double-precision SU(3) x SU(3)-vector multiplication\n"); + printf("-------------------------------------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + rlxd_init(1,123456); + + for (k=0;k<4;k++) + random_su3_dble(u+k); + + gauss_dble((double*)(s),48); + gauss_dble((double*)(r),48); + gauss_dble((double*)(t),48); + +#if (defined x64) + + n=(int)(1.0e6); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;countdelta) + delta=diff; + } + +#if (defined AVX) + printf("||U*w_AVX-U*w_FPU||<= %.1e*||w||\n",delta); +#else + printf("||U*w_SSE-U*w_FPU||<= %.1e*||w||\n",delta); +#endif + + fast_inverse_multiply(u,s,r); + slow_inverse_multiply(u,s,t); + delta=0.0; + + for (k=0;k<8;k++) + { + _vector_sub_assign(r[k],t[k]); + diff=_vector_prod_re(r[k],r[k]); + norm=_vector_prod_re(s[k],s[k]); + diff=sqrt(diff/norm); + if (diff>delta) + delta=diff; + } + +#if (defined AVX) + printf("||U^dag*w_AVX-U^dag*w_FPU||<= %.1e*||w||\n",delta); +#else + printf("||U^dag*w_SSE-U^dag*w_FPU||<= %.1e*||w||\n",delta); +#endif +#endif + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time3.c new file mode 100644 index 0000000000000000000000000000000000000000..c0776e3591b01124df5325bba8091d6893cedaff --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/su3fcts/time3.c @@ -0,0 +1,194 @@ + +/******************************************************************************* +* +* File time3.c +* +* Copyright (C) 2005, 2009, 2011, 2013 Martin Luescher +* +* 
This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of su3xsu3, su3dagxsu3, ... +* +*******************************************************************************/ + +#include +#include +#include +#include +#include "random.h" +#include "su3.h" +#include "utils.h" +#include "su3fcts.h" + + +int main(void) +{ + int n,count; + double t1,t2,dt; + su3_dble *u,*v,*w; + u3_alg_dble *X; + + printf("\n"); + printf("Timing of su3xsu3, su3dagxsu3, ...\n"); + printf("----------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + u=amalloc(3*sizeof(su3_dble),4); + X=amalloc(sizeof(u3_alg_dble),3); + error((u==NULL)||(X==NULL),1,"main [time3.c]", + "Unable to allocate auxiliary array"); + v=u+1; + w=u+2; + + rlxd_init(1,23456); + random_su3_dble(u); + random_su3_dble(v); + ranlxd((double*)(&(*X).c1),9); + + n=(int)(1.0e6); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;count +#include +#include +#include +#include "random.h" +#include "su3.h" +#include "utils.h" +#include "su3fcts.h" + + +int main(void) +{ + int n,count; + double t1,t2,dt; + su3_dble *u,*v; + su3_alg_dble *X; + u3_alg_dble *Y; + + printf("\n"); + printf("Timing of prod2su3alg, prod2u3alg and rotate_su3alg\n"); + printf("---------------------------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + u=amalloc(2*sizeof(*u),4); + X=amalloc(sizeof(*X),4); + Y=amalloc(sizeof(*Y),4); + error((u==NULL)||(X==NULL)||(Y==NULL),1, + "main [time4.c]","Unable to allocate auxiliary variables"); + v=u+1; + + rlxd_init(1,23456); + random_su3_dble(u); + 
/*
 * Determines an even integer N from the machine precision.
 *
 * Starting from term=1, the loop divides term by 1,2,3,... until it is
 * no longer larger than DBL_EPSILON; on exit k is one more than the
 * smallest n with 1/n!<=DBL_EPSILON. Seven is then added and the result
 * is rounded up to the next even number.
 */
static int find_N(void)
{
   int k;
   double term;

   k=1;
   term=1.0;

   while (term>DBL_EPSILON)
   {
      term/=(double)(k);
      k++;
   }

   k+=7;

   /* round up to an even value */
   if (k%2)
      k++;

   return k;
}
printf("r1.c%d.c%d=(% .7e,% .7e)\n",i+1,j+1,r1.c[k].re,r1.c[k].im); + printf("r2.c%d.c%d=(% .7e,% .7e)\n",i+1,j+1,r2.c[k].re,r2.c[k].im); + printf("\n"); + } + } + + dmax=0.0f; + + for (i=0;i<12;i++) + { + d=(float)(fabs((double)(r1.r[i]-r2.r[i]))); + if (d>dmax) + dmax=d; + } + + printf("Maximal absolute deviation = %.1e\n",dmax); + + mul_pauli(mu,mp,&(s1.w),&(s1.w)); + error(diffvec(6,s1.c,r1.c),1,"main [check1.c]", + "mul_pauli() is incorrect when r=s"); + printf("Works correctly if input and output spinors coincide\n\n"); + + gauss(sd1.r,24); + cpvec(12,sd1.c,sd2.c); + mul_pauli2(mu,mp,&(sd1.s),&(rd1.s)); + + error(diffvec(12,sd1.c,sd2.c),1,"main [check1.c]", + "mul_pauli2() modifies the source spinor"); + + cmat_vec(6,mv[0].c,sd2.c,rd2.c); + cmat_vec(6,mv[1].c,sd2.c+6,rd2.c+6); + + for (i=0;i<6;i++) + { + rd2.c[i].re-=mu*sd2.c[i].im; + rd2.c[i].im+=mu*sd2.c[i].re; + } + + for (i=6;i<12;i++) + { + rd2.c[i].re+=mu*sd2.c[i].im; + rd2.c[i].im-=mu*sd2.c[i].re; + } + + printf("mul_pauli2():\n"); + printf("r1: result, r2: expected result\n\n"); + + for (i=0;i<4;i++) + { + for (j=0;j<3;j++) + { + k=3*i+j; + printf("r1.c%d.c%d=(% .7e,% .7e)\n",i+1,j+1,rd1.c[k].re,rd1.c[k].im); + printf("r2.c%d.c%d=(% .7e,% .7e)\n",i+1,j+1,rd2.c[k].re,rd2.c[k].im); + printf("\n"); + } + } + + dmax=0.0f; + + for (i=0;i<24;i++) + { + d=(float)(fabs((double)(rd1.r[i]-rd2.r[i]))); + if (d>dmax) + dmax=d; + } + + printf("Maximal absolute deviation = %.1e\n",dmax); + + mul_pauli2(mu,mp,&(sd1.s),&(sd1.s)); + error(diffvec(12,sd1.c,rd1.c),1,"main [check1.c]", + "mul_pauli2() is incorrect when r=s"); + printf("Works correctly if input and output spinors coincide\n\n"); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..038651072b11a801bcd76fe847e8144c906681b3 --- 
/dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check2.c @@ -0,0 +1,163 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2009, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of mul_pauli_dble() +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +typedef union +{ + weyl_dble w; + complex_dble c[6]; + double r[12]; +} spin_t; + +typedef union +{ + complex_dble c[36]; + double r[72]; +} mat_t; + +#if (defined AVX) +static pauli_dble mp ALIGNED32; +static spin_t s1,s2,r1,r2 ALIGNED32; +static mat_t mv ALIGNED32; +#else +static pauli_dble mp ALIGNED16; +static spin_t s1,s2,r1,r2 ALIGNED16; +static mat_t mv ALIGNED16; +#endif + +static void cpvec(int n,complex_dble *s,complex_dble *r) +{ + int i; + + for (i=0;ii) + { + mv.c[6*i+j].re= mv.c[6*j+i].re; + mv.c[6*i+j].im=-mv.c[6*j+i].im; + } + else + mv.c[6*i+j].im=0.0; + } + } + + k=6; + + for (i=0;i<6;i++) + { + mp.u[i]=mv.c[6*i+i].re; + + for (j=i+1;j<6;j++) + { + mp.u[k]=mv.c[6*i+j].re; + k+=1; + mp.u[k]=mv.c[6*i+j].im; + k+=1; + } + } + + cpvec(6,s1.c,s2.c); + mul_pauli_dble(mu,&mp,&(s1.w),&(r1.w)); + + error(diffvec(6,s1.c,s2.c),1,"main [check2.c]", + "mul_pauli_dble() modifies the source spinor"); + + cmat_vec_dble(6,mv.c,s2.c,r2.c); + + for (i=0;i<6;i++) + { + r2.c[i].re-=mu*s2.c[i].im; + r2.c[i].im+=mu*s2.c[i].re; + } + + printf("r1: result, r2: expected result\n\n"); + + for (i=0;i<2;i++) + { + for (j=0;j<3;j++) + { + k=3*i+j; + printf("r1.c%d.c%d=(% .7e,% .7e)\n",i+1,j+1,r1.c[k].re,r1.c[k].im); + printf("r2.c%d.c%d=(% .7e,% .7e)\n",i+1,j+1,r2.c[k].re,r2.c[k].im); + printf("\n"); + } + } + + dmax=0.0; + + for (i=0;i<12;i++) + { + 
d=fabs(r1.r[i]-r2.r[i]); + if (d>dmax) + dmax=d; + } + + printf("Maximal absolute deviation = %.1e\n",dmax); + + mul_pauli_dble(mu,&mp,&(s1.w),&(s1.w)); + error(diffvec(6,s1.c,r1.c),1,"main [check2.c]", + "mul_pauli_dble() is incorrect when r=s"); + printf("Works correctly if input and output spinors coincide\n\n"); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..90d5df0988943d24052e41366703ec2bc3644440 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check3.c @@ -0,0 +1,200 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of assign_pauli() and apply_sw() +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +#define NM 131 + +typedef union +{ + spinor s; + complex c[12]; + float r[24]; +} spin_t; + +static pauli m[2*NM] ALIGNED16; +static pauli_dble md[2*NM] ALIGNED16; +static spin_t sp1[NM],sp2[NM],rp1[NM],rp2[NM] ALIGNED16; +static complex mv[36] ALIGNED16; + + +static void random_pauli_dble(void) +{ + int i; + double *u; + + for (i=0;i<(2*NM);i++) + { + u=md[i].u; + gauss_dble(u,36); + } +} + + +static float diff_pauli(void) +{ + int i,j; + float d,dmax,*u; + double *ud; + + dmax=0.0f; + + for (i=0;i<(2*NM);i++) + { + u=m[i].u; + ud=md[i].u; + + for (j=0;j<36;j++) + { + d=u[j]-(float)(ud[j]); + if (d<0.0f) + d=-d; + if (d>dmax) + dmax=d; + } + } + + return dmax; +} + + +static void random_spin(void) +{ + int i; + + 
for (i=0;idmax) + dmax=d; + } + } + + return dmax; +} + + +static void pauli2mv(float mu,pauli *mp) +{ + int i,j,k; + float *u; + + u=(*mp).u; + k=6; + + for (i=0;i<6;i++) + { + mv[6*i+i].re=u[i]; + mv[6*i+i].im=mu; + + for (j=i+1;j<6;j++) + { + mv[6*i+j].re=u[k]; + mv[6*j+i].re=u[k]; + k+=1; + mv[6*i+j].im=u[k]; + mv[6*j+i].im=-u[k]; + k+=1; + } + } +} + + +int main(void) +{ + int i; + float mu; + spinor *s1,*r1; + + printf("\n"); + printf("Check of assign_pauli() and apply_sw()\n"); + printf("--------------------------------------\n\n"); + +#if (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + rlxs_init(0,3898); + random_pauli_dble(); + assign_pauli(2*NM,md,m); + + printf("Check of assign_pauli():\nAbsolute deviation = %.1e\n\n", + diff_pauli()); + + random_spin(); + cp_spin(sp1,sp2); + mu=0.1234f; + + s1=(spinor*)(sp1); + r1=(spinor*)(rp1); + apply_sw(NM,mu,m,s1,r1); + + error(diff_spin(sp1,sp2)!=0.0f,1,"main [check3.c]", + "apply_sw() does not preserve the input spinor field"); + + for (i=0;i +#include +#include +#include "random.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +#define NM 1001 + +typedef union +{ + spinor_dble s; + complex_dble c[12]; + double r[24]; +} spin_t; + +static pauli_dble m[2*NM] ALIGNED16; +static spin_t sp1[NM],sp2[NM],rp1[NM],rp2[NM] ALIGNED16; +static complex_dble mv[36] ALIGNED16; + + +static void random_pauli_dble(void) +{ + int i,j; + double *u; + + for (i=0;i<(2*NM);i++) + { + u=m[i].u; + gauss_dble(u,36); + + for (j=0;j<6;j++) + u[j]+=10.0; + } +} + + +static void random_spin(void) +{ + int i; + + for (i=0;idmax) + dmax=d; + } + } + + return dmax; +} + + +static void pauli2mv(double mu,pauli_dble *mp) +{ + int i,j,k; + double *u; + + u=(*mp).u; + k=6; + + for (i=0;i<6;i++) + { + mv[6*i+i].re=u[i]; + mv[6*i+i].im=mu; + + for (j=i+1;j<6;j++) + { + mv[6*i+j].re=u[k]; + mv[6*j+i].re=u[k]; + k+=1; + mv[6*i+j].im=u[k]; + mv[6*j+i].im=-u[k]; + k+=1; 
+ } + } +} + + +int main(void) +{ + int i,ie; + double mu; + spinor_dble *s1,*r1; + + printf("\n"); + printf("Check of apply_sw_dble() and apply_swinv_dble()\n"); + printf("-----------------------------------------------\n\n"); + +#if (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + rlxd_init(1,3898); + s1=(spinor_dble*)(sp1); + r1=(spinor_dble*)(rp1); + mu=0.0123; + + random_pauli_dble(); + random_spin(); + cp_spin(sp1,sp2); + apply_sw_dble(NM,mu,m,s1,r1); + + error(diff_spin(sp1,sp2)!=0.0,1,"main [check4.c]", + "apply_sw_dble() does not preserve the input spinor field"); + + for (i=0;i +#include +#include +#include "random.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +static pauli_dble ma[3] ALIGNED16; +static complex_dble aa[4][36] ALIGNED16; + + +static void random_pauli(pauli_dble *m) +{ + int i; + double *u; + + u=(*m).u; + gauss_dble(u,36); + + for (i=0;i<6;i++) + (*m).u[i]+=10.0; +} + + +static void pauli2mat(pauli_dble *m,complex_dble *a) +{ + int i,j,k; + double *u; + + u=(*m).u; + k=6; + + for (i=0;i<6;i++) + { + a[6*i+i].re=u[i]; + a[6*i+i].im=0.0; + + for (j=i+1;j<6;j++) + { + a[6*i+j].re=u[k]; + a[6*j+i].re=u[k]; + k+=1; + a[6*i+j].im=u[k]; + a[6*j+i].im=-u[k]; + k+=1; + } + } +} + + +int main(void) +{ + int i,j,ie; + double mu,d,dmax; + + printf("\n"); + printf("Check of inv_pauli_dble()\n"); + printf("-------------------------\n\n"); + +#if (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + rlxd_init(1,3898); + mu=0.1234; + ie=1; + + while (ie) + { + random_pauli(ma); + ie=inv_pauli_dble(mu,ma,ma+1); + } + + pauli2mat(ma,aa[0]); + pauli2mat(ma+1,aa[1]); + cmat_mul_dble(6,aa[0],aa[0],aa[2]); + + for (i=0;i<6;i++) + aa[2][6*i+i].re+=mu*mu; + + cmat_mul_dble(6,aa[1],aa[2],aa[3]); + cmat_sub_dble(6,aa[3],aa[0],aa[2]); + dmax=0.0; + + for (i=0;i<6;i++) + { + for (j=0;j<6;j++) + { + d=aa[2][6*i+j].re; + + if (d<0.0) + d=-d; + 
if (d>dmax) + dmax=d; + + d=aa[2][6*i+j].im; + + if (d<0.0) + d=-d; + if (d>dmax) + dmax=d; + } + } + + printf("Maximal absolute deviation = %.1e\n",dmax); + + inv_pauli_dble(mu,ma,ma); + dmax=0.0; + + for (i=0;i<36;i++) + { + d=ma[0].u[i]-ma[1].u[i]; + + if (d<0.0) + d=-d; + if (d>dmax) + dmax=d; + } + + error(dmax!=0.0,1,"main [check5.c]", + "inv_pauli_dble() is incorrect when m=im"); + printf("Works correctly if input and output matrices coincide\n\n"); + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check6.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check6.c new file mode 100644 index 0000000000000000000000000000000000000000..acadea1b0af8336b93b5432a340fe026f19a0c7b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check6.c @@ -0,0 +1,159 @@ + +/******************************************************************************* +* +* File check6.c +* +* Copyright (C) 2005, 2009, 2010 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Accuracy of inv_pauli_dble() +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +#define NM 10000 + +typedef union +{ + weyl w; + complex c[6]; +} spin_t; + +typedef union +{ + weyl_dble w; + complex_dble c[6]; +} spin_dble_t; + +static spin_t vs ALIGNED16; +static spin_dble_t vd ALIGNED16; +static const weyl vs0={{{0.0f}}}; +static const weyl_dble vd0={{{0.0}}}; + + +int main(void) +{ + int n,k,l,itot,*is; + double mu,fact,d,dmax; + pauli *ms,*ims,*msb,*imsb; + pauli_dble *md,*imd,*mdb,*imdb; + + printf("\n"); + printf("Accuracy of inv_pauli_dble()\n"); + printf("----------------------------\n\n"); + +#if (defined x64) + printf("Using SSE3 
instructions and up to 16 xmm registers\n\n"); +#endif + + is=amalloc(NM*sizeof(*is),3); + msb=amalloc(3*NM*sizeof(*msb),4); + mdb=amalloc(3*NM*sizeof(*mdb),4); + error((is==NULL)||(msb==NULL)||(mdb==NULL),1, + "main [check6.c]","Unable to allocate auxiliary arrays"); + + imsb=msb+NM; + imdb=mdb+NM; + + rlxd_init(1,1234); + mu=0.0123; + md=mdb; + imd=imdb; + itot=0; + dmax=0.0; + fact=sqrt(2.0); + + for (n=0;ndmax) + dmax=d; + } + } + } + else + itot+=1; + + md+=1; + imd+=1; + } + + printf("Double-precision program, mu=%.4f:\n",mu); + printf("%d Gaussian random matrices, %d inversion failures\n",NM,itot); + printf("Maximal relative deviation = %.1e ",sqrt(dmax)); + printf("(safe cases only)\n\n"); + + assign_pauli(NM,mdb,msb); + assign_pauli(2*NM,imdb,imsb); + + ms=msb; + ims=imsb; + dmax=0.0; + + for (n=0;ndmax) + dmax=d; + } + } + } + + ms+=1; + ims+=1; + } + + printf("After assignment to single-precision matrices:\n"); + printf("Maximal relative deviation = %.1e ",sqrt(dmax)); + printf("(safe cases only)\n\n"); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check7.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check7.c new file mode 100644 index 0000000000000000000000000000000000000000..998b01d6726780ad2d3ed60937a92aac2c8f21fb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/check7.c @@ -0,0 +1,242 @@ + +/******************************************************************************* +* +* File check7.c +* +* Copyright (C) 2005, 2009, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of det_pauli_dble() +* +*******************************************************************************/ + +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +#define NM 
10000 + +static double dd[6] ALIGNED16; +static complex_dble aa[36],bb[36],vv[36],ww[36] ALIGNED16; + + +static complex_dble random_dd(double mu) +{ + int i; + complex_dble det,z; + + ranlxd(dd,6); + det.re=1.0; + det.im=0.0; + + for (i=0;i<6;i++) + { + if (dd[i]<0.5) + dd[i]-=0.6; + else + dd[i]-=0.4; + + z.re=det.re*dd[i]-det.im*mu; + z.im=det.re*mu+det.im*dd[i]; + + det.re=z.re; + det.im=z.im; + } + + return det; +} + + +static double norm(complex_dble *v) +{ + int i; + double r; + + r=0.0; + + for (i=0;i<6;i++) + r+=(v[i].re*v[i].re+v[i].im*v[i].im); + + return sqrt(r); +} + + +static complex_dble prod(complex_dble *v,complex_dble *w) +{ + int i; + complex_dble z; + + z.re=0.0; + z.im=0.0; + + for (i=0;i<6;i++) + { + z.re+=(v[i].re*w[i].re+v[i].im*w[i].im); + z.im+=(v[i].re*w[i].im-v[i].im*w[i].re); + } + + return z; +} + + +static void proj(complex_dble *v,complex_dble *w) +{ + int i; + complex_dble z; + + z=prod(v,w); + + for (i=0;i<6;i++) + { + w[i].re-=(z.re*v[i].re-z.im*v[i].im); + w[i].im-=(z.re*v[i].im+z.im*v[i].re); + } +} + + +static void random_vv(void) +{ + int i,j; + double r,ri[12]; + complex_dble *vi; + + for (i=0;i<6;i++) + { + vi=vv+6*i; + r=0.0; + + while (r<1.0) + { + gauss_dble(ri,12); + + for (j=0;j<6;j++) + { + vi[j].re=ri[2*j]; + vi[j].im=ri[2*j+1]; + } + + for (j=0;jdmax) + dmax=d; + + md+=1; + } + + printf("%d Gaussian random matrices M, mu=%.4f\n",NM,mu); + printf("Maximal relative deviation of det(M+i*mu) = %.1e\n\n",dmax); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..e046fcc79b91d70c51c765e58d80c0975014b36d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/nompi/sw_term/time1.c @@ -0,0 +1,117 @@ + 
+/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of mul_pauli() and mul_pauli2() +* +*******************************************************************************/ + +#include +#include +#include +#include +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +typedef union +{ + weyl w; + float r[12]; +} spin_t; + +typedef union +{ + spinor s; + float r[24]; +} spin2_t; + +static pauli mp[4] ALIGNED16; +static spin_t s1,s2,r1,r2 ALIGNED16; +static spin2_t sd1,sd2,rd1,rd2 ALIGNED16; + + +int main(void) +{ + int n,count; + float mu1,mu2; + double t1,t2,dt; + + printf("\n"); + printf("Timing of mul_pauli() and mul_pauli2()\n"); + printf("--------------------------------------\n\n"); + +#if (defined AVX) + printf("Using AVX instructions\n\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + rlxs_init(0,23456); + + for (n=0;n<4;n++) + ranlxs(mp[n].u,36); + + ranlxs(s1.r,12); + ranlxs(s2.r,12); + ranlxs(sd1.r,24); + ranlxs(sd2.r,24); + + mu1=0.1234f; + mu2=0.5678f; + + n=(int)(1.0e6); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;count +#include +#include +#include +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + +typedef union +{ + weyl_dble w; + double r[12]; +} spin_t; + +#if (defined AVX) +static pauli_dble mp1,mp2 ALIGNED32; +static spin_t s1,s2,r1,r2 ALIGNED32; +#else +static pauli_dble mp1,mp2 ALIGNED16; +static spin_t s1,s2,r1,r2 ALIGNED16; +#endif + +int main(void) +{ + int n,count; + double mu1,mu2; + double t1,t2,dt; + + printf("\n"); + printf("Timing of mul_pauli_dble()\n"); + printf("--------------------------\n\n"); + +#if (defined AVX) + 
printf("Using AVX instructions\n\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + rlxd_init(1,23456); + ranlxd(mp1.u,36); + ranlxd(mp2.u,36); + ranlxd(s1.r,12); + ranlxd(s2.r,12); + mu1=0.1234; + mu2=0.5678; + + n=(int)(1.0e6); + dt=0.0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;count +#include +#include +#include +#include "random.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "sw_term.h" + + +int main(void) +{ + int n,count,itest; + double t1,t2,dt,mu; + pauli_dble *m; + + printf("\n"); + printf("Timing of inv_pauli_dble() and det_pauli_dble()\n"); + printf("-----------------------------------------------\n\n"); + +#if (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + printf("Measurement made with all data in cache\n\n"); + + m=amalloc(2*sizeof(*m),4); + error(m==NULL,1,"main [time3.c]", + "Unable to allocate auxiliary arrays"); + + rlxd_init(1,23456); + ranlxd((*m).u,36); + mu=0.1234; + + for (n=0;n<6;n++) + (*m).u[n]=1.0; + + for (n=6;n<36;n++) + (*m).u[n]=0.01*((*m).u[n]-0.5); + + n=(int)(1.0e5); + dt=0.0; + itest=0; + + while (dt<2.0) + { + t1=(double)clock(); + for (count=0;count +#include +#include +#include "su3.h" +#include "random.h" +#include "utils.h" + +#define N 18000 + +static int istd[N],istds[N]; +static double dstd[N],dstds[N]; +static su3_dble ufld[N/18]; + + +static void set_u2v(su3_dble *u,double *v) +{ + v[ 0]=(*u).c11.re; + v[ 1]=(*u).c11.im; + v[ 2]=(*u).c12.re; + v[ 3]=(*u).c12.im; + v[ 4]=(*u).c13.re; + v[ 5]=(*u).c13.im; + + v[ 6]=(*u).c21.re; + v[ 7]=(*u).c21.im; + v[ 8]=(*u).c22.re; + v[ 9]=(*u).c22.im; + v[10]=(*u).c23.re; + v[11]=(*u).c23.im; + + v[12]=(*u).c31.re; + v[13]=(*u).c31.im; + v[14]=(*u).c32.re; + v[15]=(*u).c32.im; + v[16]=(*u).c33.re; + v[17]=(*u).c33.im; +} + + +static void set_v2u(double *v,su3_dble *u) +{ + 
(*u).c11.re=v[ 0]; + (*u).c11.im=v[ 1]; + (*u).c12.re=v[ 2]; + (*u).c12.im=v[ 3]; + (*u).c13.re=v[ 4]; + (*u).c13.im=v[ 5]; + + (*u).c21.re=v[ 6]; + (*u).c21.im=v[ 7]; + (*u).c22.re=v[ 8]; + (*u).c22.im=v[ 9]; + (*u).c23.re=v[10]; + (*u).c23.im=v[11]; + + (*u).c31.re=v[12]; + (*u).c31.im=v[13]; + (*u).c32.re=v[14]; + (*u).c32.im=v[15]; + (*u).c33.re=v[16]; + (*u).c33.im=v[17]; +} + + +int main(void) +{ + int ie,k,it; + stdint_t i[2]; + double d[2]; + char *ci[2],*cd[2]; + + printf("\n"); + printf("Test of the endianness and byte swapping programs\n"); + printf("-------------------------------------------------\n\n"); + + printf("sizeof(stdint_t) = %d\n",(int)(sizeof(stdint_t))); + printf("sizeof(double) = %d\n",(int)(sizeof(double))); + + ie=endianness(); + if (ie==LITTLE_ENDIAN) + printf("The machine is little endian\n\n"); + else if (ie==BIG_ENDIAN) + printf("The machine is big endian\n\n"); + else + printf("The machine has unknown endianness\n\n"); + + ci[0]=(char*)(i); + ci[1]=(char*)(i+1); + + ci[0][0]='A'; + ci[0][1]='B'; + ci[0][2]='C'; + ci[0][3]='D'; + + ci[1][0]='1'; + ci[1][1]='2'; + ci[1][2]='3'; + ci[1][3]='4'; + + printf("Byte swapping integers:\n"); + printf("%.4s, %.4s -> ",ci[0],ci[1]); + bswap_int(2,i); + printf("%.4s, %.4s\n\n",ci[0],ci[1]); + + cd[0]=(char*)(d); + cd[1]=(char*)(d+1); + + cd[0][0]='A'; + cd[0][1]='B'; + cd[0][2]='C'; + cd[0][3]='D'; + cd[0][4]='E'; + cd[0][5]='F'; + cd[0][6]='G'; + cd[0][7]='H'; + + cd[1][0]='1'; + cd[1][1]='2'; + cd[1][2]='3'; + cd[1][3]='4'; + cd[1][4]='5'; + cd[1][5]='6'; + cd[1][6]='7'; + cd[1][7]='8'; + + printf("Byte swapping double precision numbers:\n"); + printf("%.8s, %.8s -> ",cd[0],cd[1]); + bswap_double(2,d); + printf("%.8s, %.8s\n\n",cd[0],cd[1]); + + gauss_dble(dstd,N); + + for (k=0;k +#include +#include +#include "utils.h" + + +int main(void) +{ + int n; + double x; + + printf("\n"); + printf("Test of the program fdigits()\n"); + printf("-----------------------------\n\n"); + + while (1) + { + 
printf("x = "); + scanf("%lf",&x); + n=fdigits(x); + printf(" %.*f\n\n",n,x); + } + + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..983f469eaa45064ac27b61aca22e39a958c9825e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/INDEX @@ -0,0 +1,6 @@ + +Random number generation programs + +check1 Check of import/export functions for the ranlux generators + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..89d23571d1f9839760b4f43c37b594c1e00ee040 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/Makefile @@ -0,0 +1,122 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 + +FLAGS = flags lat_parms + +LATTICE = geometry + +RANDOM = gauss ranlux ranlxs ranlxd + +UTILS = mutils utils endian + +MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(UTILS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = 
.:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/su3fcts:\ + $(MDIR)/uflds:$(MDIR)/utils + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog \ + *.log~ *.dat *.dat~ $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..9b01e6ba149458d6b39bf0087ea600e2d06de929 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/random/check1.c @@ -0,0 +1,91 @@ + 
+/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of import/export functions for the ranlux generators +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "global.h" + +#define NRAN 10000 + +static float r[2*NRAN]; +static double rd[2*NRAN]; + + +int main(int argc,char *argv[]) +{ + int my_rank,tag,k,ie,ied; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + + printf("\n"); + printf("Check of import/export functions for the ranlux generators\n"); + printf("----------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + } + + start_ranlux(0,1234); + ranlxs(r,NRAN); + ranlxd(rd,NRAN); + tag=98029; + + export_ranlux(tag,"check1.dat"); + ranlxs(r,NRAN); + ranlxd(rd,NRAN); + + k=import_ranlux("check1.dat"); + error (k!=tag,1,"main [check1.c]", + "Import_ranlux() returns incorrect tag"); + + ranlxs(r+NRAN,NRAN); + ranlxd(rd+NRAN,NRAN); + + ie=0; + ied=0; + + for (k=0;k +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "ratfcts.h" +#include "global.h" + + +static double eval_rat1(ratfct_t *rf,double x) +{ + int np,i; + double *mu,*rmu,r; + + np=(*rf).np; + mu=(*rf).mu; + rmu=(*rf).rmu; + r=0.0; + + for (i=0;idmax) + dmax=d; + } + + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + 
MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return dmax; +} + + +static double diff_rat2(double ra,double rb,ratfct_t *rf) +{ + int k; + double r,x,d,dmax; + + dmax=0.0; + + for (k=0;k<1000;k++) + { + ranlxd(&r,1); + x=ra+r*(rb-ra); + + d=fabs(1.0-eval_rat1(rf,x)*eval_rat2(rf,x)); + + if (d>dmax) + dmax=d; + } + + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return dmax; +} + + +static double diff_rat3(double ra,double rb,ratfct_t *rf) +{ + int k; + double r,x,d,dmax; + + dmax=0.0; + + for (k=0;k<1000;k++) + { + ranlxd(&r,1); + x=ra+r*(rb-ra); + + d=fabs(1.0-(eval_rat1(rf+1,x)*eval_rat1(rf+2,x))/eval_rat1(rf,x)); + + if (d>dmax) + dmax=d; + } + + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return dmax; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,irat[3]; + int np1,i,j; + double dmax; + rat_parms_t rp; + ratfct_t rf[3]; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + fin=freopen("check1.in","r",stdin); + + printf("\n"); + printf("Initialization of rational functions\n"); + printf("------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + } + + read_rat_parms(0); + + if (my_rank==0) + fclose(fin); + + print_rat_parms(); + start_ranlux(0,123456); + + rp=rat_parms(0); + irat[0]=0; + irat[1]=0; + irat[2]=rp.degree-1; + rf[0]=ratfct(irat); + + if (my_rank==0) + { + printf("Complete rational function:\n"); + printf("np= %2d, A = %.2e, delta = %.2e\n", + rf[0].np,rf[0].A,rf[0].delta); + + printf(" i mu[i] rmu[i]\n"); + + for (i=0;i0.1); + + if ((np1>0)&&(np1 that +allows the type of boundary 
condition to be chosen at runtime. When the option +is not set, open boundary conditions are assumed. + +The option may be set but has no effect in the case of check3 (the boundary +conditions are selected through the input parameter file in this case). diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..67b3f56cb3163f712b5e703fb6eb839a9f91bf2e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/Makefile @@ -0,0 +1,149 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 time1 time2 + +FLAGS = flags lat_parms sap_parms dfl_parms + +LATTICE = bcnds ftidx uidx geometry + +LINALG = salg salg_dble liealg cmatrix_dble + +LINSOLV = fgcr + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum shift uflds udcom + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +TCHARGE = ftcom ftensor + +SW_TERM = pauli pauli_dble swflds sw_term + +DIRAC = Dw_dble Dw Dw_bnd + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +SAP = blk_solv sap_com sap sap_gcr + +ARCHIVE = archive + +MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + 
$(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + $(BLOCK) $(SAP) $(ARCHIVE) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive: + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + +# -DFGCR_DBG + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..2b636e5a84f145be9dcc45eb21a4ee2d4ab7442b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check1.c @@ -0,0 +1,225 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the block solver programs. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "block.h" +#include "dirac.h" +#include "sap.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc; + int nb,isw,ie,itm; + int bs[4],n,k,vol,volh; + float mu,res0,res[8],res_max[8]; + double phi[2],phi_prime[2]; + spinor **ps; + block_t *b; + tm_parms_t tm; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + fin=freopen("check1.in","r",stdin); + + printf("\n"); + printf("Check of the block solver programs\n"); + printf("----------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + fclose(fin); + + printf("bs = %d %d %d %d\n\n",bs[0],bs[1],bs[2],bs[3]); + + 
bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check1.c]", + "Syntax: check1 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + alloc_ws(4); + + set_sw_parms(0.05); + mu=0.123f; + ps=reserve_ws(4); + + for (itm=0;itm<2;itm++) + { + if (itm==1) + set_tm_parms(1); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2ubgr(SAP_BLOCKS); + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + b=blk_list(SAP_BLOCKS,&nb,&isw); + vol=(*b).vol; + volh=vol/2; + + for (k=0;k<8;k++) + res_max[k]=0.0f; + + random_s(VOLUME,ps[0],1.0f); + bnd_s2zero(ALL_PTS,ps[0]); + set_s2zero(VOLUME,ps[1]); + + for (n=0;nres_max[k]) + res_max[k]=res[k]; + } + } + + error_chk(); + + if (NPROC>1) + { + MPI_Reduce(res_max,res,8,MPI_FLOAT,MPI_MAX,0,MPI_COMM_WORLD); + + for (k=0;k<8;k++) + res_max[k]=res[k]; + } + + if (my_rank==0) + { + tm=tm_parms(); + printf("Twisted-mass flag = %d\n",tm.eoflg); + printf("Check of blk_mres():\n"); + + for (k=0;k<8;k++) + printf("nmr = %2d, res_max = %.1e\n", + 4*(k+1),sqrt((double)(res_max[k]))); + } + + for (k=0;k<8;k++) + res_max[k]=0.0f; + + ie=assign_swd2swbgr(SAP_BLOCKS,ODD_PTS); + error_root(ie,1,"main [check1.c]", + "The inversion of the SW term was not safe"); + + random_s(VOLUME,ps[0],1.0f); + bnd_s2zero(ALL_PTS,ps[0]); + set_s2zero(VOLUME,ps[1]); + + for (n=0;nres_max[k]) + res_max[k]=res[k]; + } + } + + error_chk(); + + if (NPROC>1) + { + MPI_Reduce(res_max,res,8,MPI_FLOAT,MPI_MAX,0,MPI_COMM_WORLD); + + for (k=0;k<8;k++) + res_max[k]=res[k]; + } + + if (my_rank==0) + { + printf("Check of blk_eo_mres():\n"); + + for (k=0;k<8;k++) + printf("nmr = 
%2d, res_max = %.1e\n", + 3*(k+1),sqrt((double)(res_max[k]))); + + printf("\n"); + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check1.in new file mode 100644 index 0000000000000000000000000000000000000000..cb2b6435876b968f9ca35a085e965971e56ca69b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check1.in @@ -0,0 +1 @@ +bs 8 4 4 4 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..59722bcb362ccc5554dc7b69dad146263613ed41 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check2.c @@ -0,0 +1,224 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2008, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Direct test of the Schwarz alternating procedure. 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "block.h" +#include "dirac.h" +#include "sap.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc; + int n,ie,itm; + int bs[4],nmr; + float mu,res,del[3]; + double phi[2],phi_prime[2]; + spinor **ps; + tm_parms_t tm; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + fin=freopen("check2.in","r",stdin); + + printf("\n"); + printf("Direct test of the Schwarz alternating procedure\n"); + printf("------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + read_line("mu","%f",&mu); + read_line("nmr","%d",&nmr); + fclose(fin); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("mu = %.3e\n",mu); + printf("nmr = %d\n\n",nmr); + fflush(flog); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_FLOAT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + 
geometry(); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + alloc_ws(4); + ps=reserve_ws(4); + + set_sw_parms(0.05); + + for (itm=0;itm<2;itm++) + { + if (itm==0) + set_tm_parms(1); + else + set_tm_parms(0); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2u(); + assign_swd2sw(); + assign_ud2ubgr(SAP_BLOCKS); + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + set_s2zero(VOLUME,ps[0]); + random_s(VOLUME,ps[1],1.0f); + bnd_s2zero(ALL_PTS,ps[1]); + normalize(VOLUME,1,ps[1]); + assign_s2s(VOLUME,ps[1],ps[2]); + + if (my_rank==0) + { + tm=tm_parms(); + printf("Twisted-mass flag = %d\n",tm.eoflg); + printf("MinRes block solver:\n"); + } + + for (n=0;n<8;n++) + { + sap(mu,0,nmr,ps[0],ps[1]); + res=norm_square(VOLUME,1,ps[1]); + res=(float)(sqrt((double)(res))); + + if (my_rank==0) + printf("n = %d: \t residue = %.2e\t ",n+1,res); + + Dw(mu,ps[0],ps[3]); + mulr_spinor_add(VOLUME,ps[3],ps[2],-1.0f); + mulr_spinor_add(VOLUME,ps[3],ps[1],1.0f); + del[0]=norm_square(VOLUME,1,ps[3]); + del[0]=(float)(sqrt((double)(del[0]))); + + assign_s2s(VOLUME,ps[0],ps[3]); + bnd_s2zero(ALL_PTS,ps[3]); + mulr_spinor_add(VOLUME,ps[3],ps[0],-1.0f); + del[1]=norm_square(VOLUME,1,ps[3]); + del[1]=(float)(sqrt((double)(del[1]))); + + assign_s2s(VOLUME,ps[1],ps[3]); + bnd_s2zero(ALL_PTS,ps[3]); + mulr_spinor_add(VOLUME,ps[3],ps[1],-1.0f); + del[2]=norm_square(VOLUME,1,ps[3]); + del[2]=(float)(sqrt((double)(del[2]))); /* root of del[2] itself (the norm square just computed), not of del[1] */ + + if (my_rank==0) + printf("check = %.2e, bnd checks = %.1e,%.1e\n", + del[0],del[1],del[2]); + } + + error_chk(); + + ie=assign_swd2swbgr(SAP_BLOCKS,ODD_PTS); + error_root(ie,1,"main [check2.c]", + "The inversion of the SW term was not safe"); + + set_s2zero(VOLUME,ps[0]); + random_s(VOLUME,ps[1],1.0f); + bnd_s2zero(ALL_PTS,ps[1]); + normalize(VOLUME,1,ps[1]); + assign_s2s(VOLUME,ps[1],ps[2]); + + if (my_rank==0) + { + printf("\n"); + printf("Even-odd preconditioned MinRes block solver:\n"); + } + + for (n=0;n<8;n++) + { + sap(mu,1,nmr,ps[0],ps[1]); +
res=norm_square(VOLUME,1,ps[1]); + res=(float)(sqrt((double)(res))); + + if (my_rank==0) + printf("n = %d: \t residue = %.2e\t ",n+1,res); + + Dw(mu,ps[0],ps[3]); + mulr_spinor_add(VOLUME,ps[3],ps[2],-1.0f); + mulr_spinor_add(VOLUME,ps[3],ps[1],1.0f); + del[0]=norm_square(VOLUME,1,ps[3]); + del[0]=(float)(sqrt((double)(del[0]))); + + assign_s2s(VOLUME,ps[0],ps[3]); + bnd_s2zero(ALL_PTS,ps[3]); + mulr_spinor_add(VOLUME,ps[3],ps[0],-1.0f); + del[1]=norm_square(VOLUME,1,ps[3]); + del[1]=(float)(sqrt((double)(del[1]))); + + assign_s2s(VOLUME,ps[1],ps[3]); + bnd_s2zero(ALL_PTS,ps[3]); + mulr_spinor_add(VOLUME,ps[3],ps[1],-1.0f); + del[2]=norm_square(VOLUME,1,ps[3]); + del[2]=(float)(sqrt((double)(del[2]))); /* root of del[2] itself (the norm square just computed), not of del[1] */ + + if (my_rank==0) + printf("check = %.2e, bnd checks = %.1e,%.1e\n", + del[0],del[1],del[2]); + } + + if (my_rank==0) + printf("\n"); + } + + error_chk(); + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check2.in new file mode 100644 index 0000000000000000000000000000000000000000..df121eea37c1470e8dc1d19b2fb0958929cc9eb3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check2.in @@ -0,0 +1,3 @@ +bs 8 4 4 4 +mu 0.123 +nmr 4 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..527fa3f25f0f8c182fbdd435c9822a5d96c344ee --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check3.c @@ -0,0 +1,260 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General 
Public +* License (GPL) +* +* Check and performance of the SAP+GCR solver. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "archive.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "dirac.h" +#include "sap.h" +#include "global.h" + +int my_rank,id,first,last,step; +int bs[4],nmr,ncy,nkv,nmx,eoflg,bc; +double kappa,csw,mu,cF,cF_prime; +double phi[2],phi_prime[2],m0,res; +char cnfg_dir[NAME_SIZE],cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; + + +int main(int argc,char *argv[]) +{ + int isolv,nsize,icnfg,status; + double rho,nrm,del; + double wt1,wt2,wdt; + spinor_dble **psd; + lat_parms_t lat; + sap_parms_t sap; + tm_parms_t tm; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check3.log","w",stdout); + fin=freopen("check3.in","r",stdin); + + printf("\n"); + printf("Check and performance of the SAP+GCR solver\n"); + printf("-------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + find_section("Configurations"); + read_line("name","%s",nbase); + read_line("cnfg_dir","%s",cnfg_dir); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + + find_section("Lattice parameters"); + read_line("kappa","%lf",&kappa); + read_line("csw","%lf",&csw); + read_line("mu","%lf",&mu); + read_line("eoflg","%d",&eoflg); + + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + 
+ if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + read_line("cF","%lf",&cF); + + if (bc==2) + read_line("cF'","%lf",&cF_prime); + else + cF_prime=cF; + + find_section("SAP"); + read_iprms("bs",4,bs); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + + find_section("GCR"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + + fclose(fin); + } + + MPI_Bcast(nbase,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&first,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&last,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&step,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&eoflg,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + lat=set_lat_parms(5.5,1.0,1,&kappa,csw); + print_lat_parms(); + + set_bc_parms(bc,1.0,1.0,cF,cF_prime,phi,phi_prime); + print_bc_parms(); + + sap=set_sap_parms(bs,0,nmr,ncy); + m0=lat.m0[0]; + (void)set_sw_parms(m0); + tm=set_tm_parms(eoflg); + + start_ranlux(0,1234); + geometry(); + alloc_ws(2*nkv+1); + alloc_wsd(5); + psd=reserve_wsd(3); + + if (my_rank==0) + { + printf("mu = %.6f\n",mu); + printf("eoflg = %d\n\n",tm.eoflg); + + printf("bs = (%d,%d,%d,%d)\n",sap.bs[0],sap.bs[1],sap.bs[2],sap.bs[3]); + printf("nmr = %d\n",sap.nmr); + printf("ncy = %d\n\n",sap.ncy); + + 
printf("nkv = %d\n",nkv); + printf("nmx = %d\n",nmx); + printf("res = %.2e\n\n",res); + + printf("Configurations %sn%d -> %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + fflush(flog); + } + + error_root(((last-first)%step)!=0,1,"main [check3.c]", + "last-first is not a multiple of step"); + + nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check3.c]", + "cnfg_dir name is too long"); + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); + + if (my_rank==0) + { + printf("Configuration no %d\n",icnfg); + fflush(flog); + } + + chs_ubnd(-1); + random_sd(VOLUME,psd[0],1.0); + bnd_sd2zero(ALL_PTS,psd[0]); + nrm=sqrt(norm_square_dble(VOLUME,1,psd[0])); + + for (isolv=0;isolv<2;isolv++) + { + assign_sd2sd(VOLUME,psd[0],psd[2]); + set_sap_parms(bs,isolv,nmr,ncy); + + rho=sap_gcr(nkv,nmx,res,mu,psd[0],psd[1],&status); + + error_chk(); + mulr_spinor_add_dble(VOLUME,psd[2],psd[0],-1.0); + del=norm_square_dble(VOLUME,1,psd[2]); + error_root(del!=0.0,1,"main [check3.c]", + "Source field is not preserved"); + + Dw_dble(mu,psd[1],psd[2]); + mulr_spinor_add_dble(VOLUME,psd[2],psd[0],-1.0); + del=sqrt(norm_square_dble(VOLUME,1,psd[2])); + + if (my_rank==0) + { + printf("isolv = %d:\n",isolv); + printf("status = %d\n",status); + printf("rho = %.2e, res = %.2e\n",rho,res); + printf("check = %.2e, check = %.2e\n",del,del/nrm); + } + + assign_sd2sd(VOLUME,psd[0],psd[2]); + + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + rho=sap_gcr(nkv,nmx,res,mu,psd[2],psd[2],&status); + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wdt=wt2-wt1; + + if (my_rank==0) + { + printf("time = %.2e sec (total)\n",wdt); + if (status>0) + printf(" = %.2e usec (per point and GCR iteration)", + (1.0e6*wdt)/((double)(status)*(double)(VOLUME))); + printf("\n\n"); + fflush(flog); + } + + mulr_spinor_add_dble(VOLUME,psd[2],psd[1],-1.0); + del=norm_square_dble(VOLUME,1,psd[2]); + 
error_root(del!=0.0,1,"main [check3.c]", + "Incorrect result when the input and " + "output fields coincide"); + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check3.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check3.in new file mode 100644 index 0000000000000000000000000000000000000000..4979a85d71f8f9b34f7d7c90ba517b9a035fb9d5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/check3.in @@ -0,0 +1,30 @@ + +[Configurations] +name 16x8x8x8b6.00id2 +cnfg_dir /home/data/openQCD/cnfg +first 7 +last 7 +step 1 + +[Lattice parameters] +kappa 0.1280 +csw 1.2 +mu 0.0123 +eoflg 1 + +[Boundary conditions] +type 0 +#phi 0.12 -0.56 +#phi' 0.92 0.76 +cF 0.95 +#cF' 0.90 + +[SAP] +bs 4 4 4 4 +nmr 4 +ncy 5 + +[GCR] +nkv 16 +nmx 128 +res 1.0e-12 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..09023a0157d209a5a5c87a0bd7407986b05ce7e1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sap/time1.c @@ -0,0 +1,206 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Timing of blk_mres() and blk_eo_mres(). 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "sap.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,count,nt; + int nb,isw,nmr,bs[4]; + int n,ie; + float mu; + double phi[2],phi_prime[2]; + double wt1,wt2,wdt; + spinor **ps; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time1.log","w",stdout); + fin=freopen("time1.in","r",stdin); + + printf("\n"); + printf("Timing of blk_mres() and blk_eo_mres()\n"); + printf("--------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) + printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + read_line("nmr","%d",&nmr); + fclose(fin); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("nmr = %d\n\n",nmr); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time1.c]", + "Syntax: time1 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + 
MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_ws(1); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + + set_sw_parms(0.0123); + mu=0.0785f; + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2ubgr(SAP_BLOCKS); + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + ps=reserve_ws(1); + random_s(VOLUME,ps[0],1.0f); + bnd_s2zero(ALL_PTS,ps[0]); + normalize(VOLUME,1,ps[0]); + blk_list(SAP_BLOCKS,&nb,&isw); + + nt=(int)(1.0e7/(double)(nmr*VOLUME)); + if (nt<2) + nt=2; + wdt=0.0; + + while (wdt<5.0) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + for (count=0;count +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "sap.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,count,nt; + int ncy,nmr,bs[4]; + int n,ie; + float mu; + double phi[2],phi_prime[2]; + double rbb,wt1,wt2,wdt; + spinor **ps; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time2.log","w",stdout); + fin=freopen("time2.in","r",stdin); + + printf("\n"); + printf("Timing of the SAP preconditioner\n"); + printf("--------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + +#if (defined x64) +#if (defined AVX) + printf("Using AVX instructions\n"); +#else + printf("Using SSE3 instructions and 16 xmm registers\n"); +#endif +#if (defined P3) 
+ printf("Assuming SSE prefetch instructions fetch 32 bytes\n"); +#elif (defined PM) + printf("Assuming SSE prefetch instructions fetch 64 bytes\n"); +#elif (defined P4) + printf("Assuming SSE prefetch instructions fetch 128 bytes\n"); +#else + printf("SSE prefetch instructions are not used\n"); +#endif +#endif + printf("\n"); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + read_line("ncy","%d",&ncy); + read_line("nmr","%d",&nmr); + fclose(fin); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("ncy = %d\n",ncy); + printf("nmr = %d\n\n",nmr); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time2.c]", + "Syntax: time2 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_ws(3); + set_sap_parms(bs,0,1,1); + alloc_bgr(SAP_BLOCKS); + + set_sw_parms(0.0123); + mu=0.0785f; + rbb=2.0*(1.0/(double)(bs[0])+1.0/(double)(bs[1])+ + 1.0/(double)(bs[2])+1.0/(double)(bs[3])); + + random_ud(); + chs_ubnd(-1); + sw_term(NO_PTS); + assign_ud2ubgr(SAP_BLOCKS); + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + ps=reserve_ws(3); + random_s(VOLUME,ps[2],1.0f); + bnd_s2zero(ALL_PTS,ps[2]); + normalize(VOLUME,1,ps[2]); + + nt=(int)(2.0e6/(double)(ncy*nmr*VOLUME)); + if (nt<2) + nt=2; + wdt=0.0; + + while (wdt<5.0) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + for (count=0;count that allows the type of +boundary condition to be chosen at runtime. When the option is not set, open +boundary conditions are assumed. + +The option may be set but has no effect in the case of check1. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..02b4aff6412743a9f5677c37f3f1bc712518c2b5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/Makefile @@ -0,0 +1,130 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 + +FLAGS = flags lat_parms dfl_parms + +LATTICE = bcnds geometry + +RANDOM = ranlux ranlxs ranlxd gauss + +LINALG = cmatrix_dble liealg salg salg_dble + +UTILS = endian mutils utils wspace + +UFLDS = uflds + +SFLDS = sflds Pbnd Pbnd_dble scom sdcom + +SU3FCTS = su3prod su3ren cm3x3 random_su3 + +MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(LINALG) $(UTILS) \ + $(UFLDS) $(SFLDS) $(SU3FCTS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/linalg:\ + $(MDIR)/utils:$(MDIR)/uflds:$(MDIR)/sflds:$(MDIR)/su3fcts + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long 
-Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..b9a0f1644855934fe81fa94f5e6eee21d684945c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/check1.c @@ -0,0 +1,339 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the programs in the module sflds.c. 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "linalg.h" +#include "sflds.h" +#include "global.h" + +#define NFLDS 3 + +typedef union +{ + spinor s; + float r[24]; +} spin_t; + +typedef union +{ + spinor_dble s; + double r[24]; +} spin_dble_t; + +static float sig[NFLDS]; +static double sigd[NFLDS]; + + +int main(int argc,char *argv[]) +{ + int my_rank,ie,k,i,ix; + float *r; + double *rd,var,var_all,d,dmax; + spinor **ps; + spinor_dble **psd; + spin_t *sps; + spin_dble_t *spsd; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + printf("\n"); + printf("Check of the programs in the module sflds.c\n"); + printf("-------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + } + + start_ranlux(0,12345); + geometry(); + alloc_ws(2*NFLDS); + alloc_wsd(2*NFLDS); + ps=reserve_ws(2*NFLDS); + psd=reserve_wsd(2*NFLDS); + ie=0; + + for (k=0;k = %.4e (sigma^2 = %.4e)\n", + k,var_all,sig[k]*sig[k]); + } + } + + ie=0; + + for (k=0;k = %.4e (sigma^2 = %.4e)\n", + k,var_all,sigd[k]*sigd[k]); + } + } + + ie=0; + + for (k=0;kdmax) + dmax=d; + } + + if (my_rank==0) + { + printf("\n"); + printf("Relative deviations (should be less than 1.0e-7 or so):\n"); + printf("diff_s2s(): %.1e\n",sqrt(dmax)); + } + + dmax=0.0; + + for (k=0;kdmax) + dmax=d; + } + + if (my_rank==0) + printf("add_s2sd(): %.1e\n",sqrt(dmax)); + + dmax=0.0; + + for (k=0;kdmax) + dmax=d; + } + + if (my_rank==0) + { + printf("diff_sd2s(): %.1e\n\n",sqrt(dmax)); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..71e53110398d1aa45b1797a0e26a2ffcc4ec05ac --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sflds/check2.c @@ -0,0 +1,768 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the communication programs in scom.c and sdcom.c. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "lattice.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) +#define NFLDS 4 + +typedef union +{ + spinor_dble s; + double r[24]; +} spin_dble_t; + +static double p[4]; +static spinor_dble rs ALIGNED16; +static const spinor_dble sd0={{{0.0}}}; + + +static int is_zero_dble(spinor_dble *s) +{ + int i,ie; + spin_dble_t *sp; + + sp=(spin_dble_t*)(s); + ie=1; + + for (i=0;i<24;i++) + ie&=((*sp).r[i]==0.0); + + return ie; +} + + +static int check_int_bnd_dble(spinor_dble *s) +{ + int bc,ix,iy,t; + int ie; + + bc=bc_type(); + ie=1; + + for (ix=0;ix=(VOLUME/2))&&(t==(N0-1))&&((bc==1)||(bc==2))) + { + iy=iup[ix][0]; + ie&=is_zero_dble(s+iy-ix); + } + else + ie&=(is_zero_dble(s)^0x1); + + s+=1; + } + + return ie; +} + + +static int check_ext_bnd_dble(spinor_dble *s) +{ + int bc,ix,t; + int ie; + + bc=bc_type(); + ie=1; + + for (ix=0;ix=VOLUME)&& + ((ifc>1)|| + ((ifc==0)&&((cpr[0]>0)||(bc==3)))|| + 
((ifc==1)&&((cpr[0]<(NPROC0-1))||(bc==3))))) + { + pt=p[0]*(double)(x[0]+bo[0]); + pv=p[1]*(double)(x[1]+bo[1])+ + p[2]*(double)(x[2]+bo[2])+ + p[3]*(double)(x[3]+bo[3]); + + if (bc==3) + { + z.re=cos(pt+pv); + z.im=sin(pt+pv); + } + else + { + z.re=sin(pt)*cos(pv); + z.im=sin(pt)*sin(pv); + } + + s[iy].c1=mul_cplx(z,&(rs.c1)); + s[iy].c2=mul_cplx(z,&(rs.c2)); + s[iy].c3=mul_cplx(z,&(rs.c3)); + s[iy].c4=mul_cplx(z,&(rs.c4)); + s[iy]=theta(ifc^is,s+iy); + } + } + } + } + } + } + } + + bnd_sd2zero(EVEN_PTS,s); +} + + +static double check_cpsd_int(int is,spinor_dble *s) +{ + int bc,bo[4]; + int x0,x1,x2,x3,x[4]; + int ix,iy,ifc,mu,i; + double pt,pv,d,dmax; + complex_dble z; + spin_dble_t r,*sp; + + bc=bc_type(); + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + dmax=0.0; + + for (x0=0;x0=VOLUME)&& + ((ifc>1)|| + ((ifc==0)&&((cpr[0]>0)||(bc==3)))|| + ((ifc==1)&&((cpr[0]<(NPROC0-1))||(bc==3))))) + { + pt=p[0]*(double)(x[0]+bo[0]); + pv=p[1]*(double)(x[1]+bo[1])+ + p[2]*(double)(x[2]+bo[2])+ + p[3]*(double)(x[3]+bo[3]); + + if (bc==3) + { + z.re=cos(pt+pv); + z.im=sin(pt+pv); + } + else + { + z.re=sin(pt)*cos(pv); + z.im=sin(pt)*sin(pv); + } + + r.s.c1=mul_cplx(z,&(rs.c1)); + r.s.c2=mul_cplx(z,&(rs.c2)); + r.s.c3=mul_cplx(z,&(rs.c3)); + r.s.c4=mul_cplx(z,&(rs.c4)); + sp=(spin_dble_t*)(s+iy); + + for (i=0;i<18;i++) + r.r[i]-=(*sp).r[i]; + + r.s=theta((ifc^0x1)^is,&(r.s)); + + for (i=0;i<18;i++) + { + d=fabs(r.r[i]); + if (d>dmax) + dmax=d; + } + } + } + } + } + } + } + } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return dmax; +} + + +static double check_cpsd_ext(int is,spinor_dble *s) +{ + int bc,bo[4]; + int x0,x1,x2,x3; + int ix,iy,ifc,mu,i; + double pt,pv,d,dmax; + complex_dble z; + spin_dble_t r,*sp; + + bc=bc_type(); + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + dmax=0.0; + + for (x0=0;x0=VOLUME)&& + 
((ifc>1)|| + ((ifc==0)&&((cpr[0]>0)||(bc==3)))|| + ((ifc==1)&&((cpr[0]<(NPROC0-1))||(bc==3))))) + { + pt=p[0]*(double)(x0+bo[0]); + pv=p[1]*(double)(x1+bo[1])+ + p[2]*(double)(x2+bo[2])+ + p[3]*(double)(x3+bo[3]); + + if (bc==3) + { + z.re=cos(pt+pv); + z.im=sin(pt+pv); + } + else + { + z.re=sin(pt)*cos(pv); + z.im=sin(pt)*sin(pv); + } + + r.s.c1=mul_cplx(z,&(rs.c1)); + r.s.c2=mul_cplx(z,&(rs.c2)); + r.s.c3=mul_cplx(z,&(rs.c3)); + r.s.c4=mul_cplx(z,&(rs.c4)); + r.s=theta((ifc^0x1)^is,&(r.s)); + + for (i=0;i<18;i++) + (*sp).r[i]-=r.r[i]; + } + } + + for (i=0;i<18;i++) + { + d=fabs((*sp).r[i]); + if (d>dmax) + dmax=d; + } + } + else + { + for (i=0;i<18;i++) + { + d=fabs((*sp).r[i]); + if (d>dmax) + dmax=d; + } + } + } + } + } + } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return dmax; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,ie,is,k; + double phi[2],phi_prime[2]; + double cG,cG_prime,cF,cF_prime; + double d,dmax; + spinor **ps; + spinor_dble **psd; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + printf("\n"); + printf(" Check of the communication programs in scom.c and sdcom.c\n"); + printf("----------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + cG=0.97; + cG_prime=1.056; + cF=0.82; + cF_prime=1.12; + set_bc_parms(bc,cG,cG_prime,cF,cF_prime,phi,phi_prime); + 
print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_ws(NFLDS); + alloc_wsd(NFLDS); + + ps=reserve_ws(NFLDS); + psd=reserve_wsd(NFLDS); + dmax=0.0; + + for (is=0;is<2;is++) + { + for (k=0;kdmax) + dmax=d; + + random_sd(NSPIN,psd[k],1.0); + assign_sd2s(NSPIN,psd[k],ps[k]); + d=(double)(norm_square(NSPIN,1,ps[k])); + cps_ext_bnd(is,ps[k]); + cpsd_ext_bnd(is,psd[k]); + assign_sd2s(NSPIN,psd[k],ps[k+1]); + mulr_spinor_add(NSPIN,ps[k],ps[k+1],-1.0f); + d=(double)(norm_square(NSPIN,1,ps[k]))/d; + d=sqrt(d); + if (d>dmax) + dmax=d; + } + } + + if (my_rank==0) + { + printf("Maximal relative deviation single-/double-precision programs" + " = %.1e\n",dmax); + printf("Now checking double-precision programs:\n"); + } + + ie=1; + + for (is=0;is<2;is++) + { + for (k=0;kdmax) + dmax=d; + d=check_cpsd_int(is,psd[0]); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + printf("Maximal deviation (cpsd_int_bnd) = %.1e\n",dmax); + + dmax=0.0; + + for (is=0;is<2;is++) + { + random_sd(NSPIN,psd[0],1.0); + set_sd_bnd(is,psd[0]); + assign_sd2sd(NSPIN,psd[0],psd[1]); + cpsd_ext_bnd(is,psd[0]); + mulr_spinor_add_dble(NSPIN-VOLUME,psd[1]+VOLUME,psd[0]+VOLUME,-1.0); + d=norm_square_dble(NSPIN-VOLUME,1,psd[1]+VOLUME); + if (d>dmax) + dmax=d; + mulr_spinor_add_dble(VOLUME,psd[0],psd[1],-1.0); + d=check_cpsd_ext(is,psd[0]); + if (d>dmax) + dmax=d; + } + + if (my_rank==0) + { + printf("Maximal deviation (cpsd_ext_bnd) = %.1e\n\n",dmax); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..cb78e01ea2be290c4b24decbdb5389c6212046e8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/INDEX @@ -0,0 +1,14 @@ + +Calculation of the Sheikholeslami-Wohlert term + +check1 Allocation, assignment and inversion of 
the global SW arrays. + +check2 Check of the gauge covariance of the SW term. + +check3 Check of the SW term for abelian background fields. + +time1 Timing of the program sw_term(). + +All programs accept the option -bc that allows the type of boundary +condition to be chosen. When the option is not set, open boundary conditions +are assumed. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..57cf393b761576cd861acc339ef6714fe5f37834 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/Makefile @@ -0,0 +1,136 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 time1 + +FLAGS = flags lat_parms dfl_parms + +LATTICE = bcnds uidx ftidx geometry + +LINALG = salg_dble liealg cmatrix_dble + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum shift uflds udcom + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +SFLDS = sflds + +TCHARGE = ftcom ftensor + +SW_TERM = pauli pauli_dble swflds sw_term + + +MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + 
+LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/random:\ + $(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:$(MDIR)/sflds:\ + $(MDIR)/tcharge:$(MDIR)/sw_term + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..faf6f2a16be67e882d9e5b8ae6e835398ad6e90e --- /dev/null 
+++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check1.c @@ -0,0 +1,403 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Allocation, assignment and inversion of the global SW arrays. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sw_term.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +typedef union +{ + weyl_dble w; + complex_dble c[6]; +} spin_dble_t; + +static pauli_dble *sswd=NULL; +static spin_dble_t vd ALIGNED32; +static const weyl_dble vd0={{{0.0}}}; + + +static void save_swd(void) +{ + pauli_dble *pa,*pb,*pm; + + if (sswd==NULL) + { + sswd=amalloc(2*VOLUME*sizeof(*sswd),ALIGN); + error(sswd==NULL,1,"save_swd [check1.c]", + "Unable to allocate auxiliary array"); + } + + pa=swdfld(); + pb=sswd; + pm=pa+2*VOLUME; + + for (;padmax) + dmax=d; + } + + pb+=1; + } + + return dmax; +} + + +static double cmp_iswd(ptset_t set) +{ + int k,l; + double d,dmax; + pauli_dble *pa,*pb,*pm; + + pa=swdfld(); + pb=sswd; + pm=pa; + + if (set==EVEN_PTS) + pm=pa+VOLUME; + else if (set==ODD_PTS) + { + pa+=VOLUME; + pb+=VOLUME; + pm=pa+VOLUME; + } + else if (set==ALL_PTS) + pm=pa+2*VOLUME; + + dmax=0.0; + + for (;padmax) + dmax=d; + } + } + + pb+=1; + } + + return sqrt(dmax); +} + + +static double cmp_sw2swd(ptset_t set) +{ + int k; + double d,dmax; + pauli *pa,*pm; + pauli_dble *pb; + + pa=swfld(); + pb=swdfld(); + pm=pa; + + if (set==EVEN_PTS) + pm=pa+VOLUME; + else if (set==ODD_PTS) + { + pa+=VOLUME; + pb+=VOLUME; + pm=pa+VOLUME; + } + else if (set==ALL_PTS) + 
pm=pa+2*VOLUME; + + dmax=0.0; + + for (;padmax) + dmax=d; + } + + pb+=1; + } + + return dmax; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,ix,ie; + double phi[2],phi_prime[2]; + double d,dmax; + pauli *sw; + pauli_dble *swd; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + printf("\n"); + printf("Initialization and inversion of the global SW arrays\n"); + printf("----------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check1.c]", + "Syntax: check1 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.301,0.789,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + + set_sw_parms(-0.0123); + sw=swfld(); + swd=swdfld(); + ie=1; + + for (ix=0;ix<(2*VOLUME);ix++) + { + ie|=is_unity(sw); + ie|=is_unity_dble(swd); + sw+=1; + swd+=1; + } + + error(ie!=1,1,"main [check1.c]","SW fields are not correctly initialized"); + + print_flags(); + random_ud(); + sw_term(NO_PTS); + ie=check_swbnd(); + error(ie!=1,1,"main [check1.c]","SW field has incorrect boundary values"); + save_swd(); + + chs_ubnd(-1); + sw_term(NO_PTS); + d=cmp_swd(ALL_PTS); + error(d!=0.0,1,"main [check1.c]", + "SW term changed after calling chs_ubnd(-1)"); + + ie=sw_term(EVEN_PTS); + error(ie!=0,1,"main [check1.c]","Unsafe inversion of swd_e"); + d=cmp_iswd(EVEN_PTS); + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Inverted swd_e\n"); + 
printf("Maximal deviation of swd_e = %.1e\n",dmax); + } + + d=cmp_swd(ODD_PTS); + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + printf("Maximal deviation of swd_o = %.1e\n\n",dmax); + + print_flags(); + random_ud(); + sw_term(NO_PTS); + save_swd(); + + ie=sw_term(ODD_PTS); + error(ie!=0,1,"main [check1.c]","Unsafe inversion of swd_o"); + d=cmp_swd(EVEN_PTS); + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Inverted swd_o\n"); + printf("Maximal deviation of swd_e = %.1e\n",dmax); + } + + d=cmp_iswd(ODD_PTS); + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + printf("Maximal deviation of swd_o = %.1e\n\n",dmax); + + print_flags(); + assign_swd2sw(); + d=cmp_sw2swd(ALL_PTS); + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Assigned swd to sw\n"); + printf("Maximal deviation = %.1e\n\n",dmax); + } + + print_flags(); + random_ud(); + sw_term(NO_PTS); + save_swd(); + + ie=sw_term(ALL_PTS); + error(ie!=0,1,"main [check1.c]","Unsafe inversion of swd"); + d=cmp_iswd(ALL_PTS); + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Inverted swd\n"); + printf("Maximal deviation = %.1e\n\n",dmax); + } + + print_flags(); + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..c9f91cbdcd84793fedaaec012fe4cd1fdbac5d26 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check2.c @@ -0,0 +1,366 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2005, 2011-2013 Martin Luescher +* +* This software is 
distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the gauge covariance of the SW term. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +static int bc,nfc[8],ofs[8]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) +{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + (*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + u=udfld(); + + for (ix=(VOLUME/2);ix]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + 
MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.55,0.78,0.9012,1.2034,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + alloc_wsd(4); + psd=reserve_wsd(4); + + g=amalloc(NSPIN*sizeof(*g),4); + if (BNDRY!=0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),4); + + error((g==NULL)||((BNDRY!=0)&&(gbuf==NULL)),1,"main [check2.c]", + "Unable to allocate auxiliary arrays"); + + swp=set_sw_parms(-0.0123); + + if (my_rank==0) + printf("m0 = %.4e, csw = %.4e, cF = %.4e, cF' = %.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + random_g(); + random_ud(); + + for (i=0;i<4;i++) + random_sd(VOLUME,psd[i],1.0); + + (void)sw_term(NO_PTS); + sw=swdfld(); + apply_sw_dble(VOLUME,0.789,sw,psd[0],psd[1]); + + transform_sd(psd[0],psd[2]); + transform_ud(); + (void)sw_term(NO_PTS); + sw=swdfld(); + apply_sw_dble(VOLUME,0.789,sw,psd[2],psd[3]); + transform_sd(psd[1],psd[2]); + + mulr_spinor_add_dble(VOLUME,psd[3],psd[2],-1.0); + d=norm_square_dble(VOLUME,1,psd[3])/norm_square_dble(VOLUME,1,psd[0]); + error_chk(); + + if (my_rank==0) + { + printf("Maximal normalized difference = %.2e\n",sqrt(d)); + printf("(should be around 1*10^(-15) or so)\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..f1c4e0462269a0869be2ea0f68bbd19d9f21f225 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/check3.c @@ -0,0 +1,585 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the SW 
term for abelian background fields. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,np[4]; +static double t[3],a[4],p[4],inp[4]; +static double (*Fhat)[3]; +static const su3_dble ud0={{0.0}}; +static spinor_dble ws; + + +static void alloc_Fhat(void) +{ + Fhat=amalloc(VOLUME*sizeof(*Fhat),3); + + error(Fhat==NULL,1,"alloc_Fhat [check3.c]", + "Unable to allocate auxiliary array"); +} + + +static void set_parms(void) +{ + int my_rank; + double pi; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + gauss_dble(t,2); + t[2]=-t[0]-t[1]; + + ranlxd(a,4); + + np[0]=(int)(a[0]*(double)(N0)); + np[1]=(int)(a[1]*(double)(N1)); + np[2]=(int)(a[2]*(double)(N2)); + np[3]=(int)(a[3]*(double)(N3)); + + pi=4.0*atan(1.0); + + p[0]=(double)(np[0])*2.0*pi/(double)(N0); + p[1]=(double)(np[1])*2.0*pi/(double)(N1); + p[2]=(double)(np[2])*2.0*pi/(double)(N2); + p[3]=(double)(np[3])*2.0*pi/(double)(N3); + + gauss_dble(a,4); + } + + MPI_Bcast(t,3,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(a,4,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(p,4,MPI_DOUBLE,0,MPI_COMM_WORLD); + + inp[0]=1.0/(double)(N0); + inp[1]=1.0/(double)(N1); + inp[2]=1.0/(double)(N2); + inp[3]=1.0/(double)(N3); +} + + +static double afld(int *x,int mu) +{ + double xt[4],px; + + xt[0]=(double)(safe_mod(x[0],N0)); + xt[1]=(double)(safe_mod(x[1],N1)); + xt[2]=(double)(safe_mod(x[2],N2)); + xt[3]=(double)(safe_mod(x[3],N3)); + + px=p[0]*xt[0]+p[1]*xt[1]+p[2]*xt[2]+p[3]*xt[3]; + + return a[mu]*sin(px); +} + + +static void ftplaq(int *x,int mu,int nu,double *ftp) +{ + 
double sm,om[3],*phi; + bc_parms_t bcp; + + bcp=bc_parms(); + + if ((x[0]==0)&&(mu==0)&&(bc==1)) + { + sm=afld(x,mu); + x[mu]+=1; + sm+=afld(x,nu); + x[mu]-=1; + x[nu]+=1; + sm-=afld(x,mu); + x[nu]-=1; + + phi=bcp.phi[0]; + om[0]=t[0]*sm-phi[0]*inp[nu]; + om[1]=t[1]*sm-phi[1]*inp[nu]; + om[2]=t[2]*sm-phi[2]*inp[nu]; + } + else if ((x[0]==(N0-1))&&(mu==0)&&((bc==1)||(bc==2))) + { + sm=afld(x,mu)-afld(x,nu); + x[nu]+=1; + sm-=afld(x,mu); + x[nu]-=1; + + phi=bcp.phi[1]; + om[0]=t[0]*sm+phi[0]*inp[nu]; + om[1]=t[1]*sm+phi[1]*inp[nu]; + om[2]=t[2]*sm+phi[2]*inp[nu]; + } + else + { + sm=afld(x,mu)-afld(x,nu); + x[mu]+=1; + sm+=afld(x,nu); + x[mu]-=1; + x[nu]+=1; + sm-=afld(x,mu); + x[nu]-=1; + + om[0]=t[0]*sm; + om[1]=t[1]*sm; + om[2]=t[2]*sm; + } + + ftp[0]=sin(om[0]); + ftp[1]=sin(om[1]); + ftp[2]=sin(om[2]); +} + + +static void set_ud(void) +{ + int bo[4],x[4]; + int x0,x1,x2,x3,ix,ifc,mu; + double r1,r2; + su3_dble *udb,*u; + + udb=udfld(); + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + + for (x0=0;x0=(VOLUME/2)) + { + u=udb+8*(ix-(VOLUME/2)); + + for (ifc=0;ifc<8;ifc++) + { + mu=ifc/2; + if (ifc&0x1) + x[mu]-=1; + r1=afld(x,mu); + if (ifc&0x1) + x[mu]+=1; + r2=t[0]*r1; + (*u)=ud0; + (*u).c11.re=cos(r2); + (*u).c11.im=sin(r2); + r2=t[1]*r1; + (*u).c22.re=cos(r2); + (*u).c22.im=sin(r2); + r2=t[2]*r1; + (*u).c33.re=cos(r2); + (*u).c33.im=sin(r2); + u+=1; + } + } + } + } + } + } + + set_bc(); + set_flags(UPDATED_UD); +} + + +static void compute_Fhat(int mu,int nu) +{ + int bo[4],x[4]; + int x0,x1,x2,x3,ix; + double ftp[4][3]; + + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + + for (x0=0;x00)&&((x[0]<(N0-1))||(bc!=0)))||(bc==3)) + { + ftplaq(x,mu,nu,ftp[0]); + x[mu]-=1; + ftplaq(x,mu,nu,ftp[1]); + x[nu]-=1; + ftplaq(x,mu,nu,ftp[2]); + x[mu]+=1; + ftplaq(x,mu,nu,ftp[3]); + + Fhat[ix][0]=0.25*(ftp[0][0]+ftp[1][0]+ftp[2][0]+ftp[3][0]); + Fhat[ix][1]=0.25*(ftp[0][1]+ftp[1][1]+ftp[2][1]+ftp[3][1]); + 
Fhat[ix][2]=0.25*(ftp[0][2]+ftp[1][2]+ftp[2][2]+ftp[3][2]); + } + else + { + Fhat[ix][0]=0.0; + Fhat[ix][1]=0.0; + Fhat[ix][2]=0.0; + } + } + } + } + } +} + + +static su3_vector_dble mul_cplx(complex_dble z,su3_vector_dble s) +{ + su3_vector_dble r; + + r.c1.re=z.re*s.c1.re-z.im*s.c1.im; + r.c1.im=z.im*s.c1.re+z.re*s.c1.im; + r.c2.re=z.re*s.c2.re-z.im*s.c2.im; + r.c2.im=z.im*s.c2.re+z.re*s.c2.im; + r.c3.re=z.re*s.c3.re-z.im*s.c3.im; + r.c3.im=z.im*s.c3.re+z.re*s.c3.im; + + return r; +} + + +static spinor_dble mul_gamma(int mu,spinor_dble s) +{ + spinor_dble r; + complex_dble i,m_i,m_1; + + i.re=0.0; + i.im=1.0; + + m_i.re=0.0; + m_i.im=-1.0; + + m_1.re=-1.0; + m_1.im=0.0; + + if (mu==0) + { + r.c1=mul_cplx(m_1,s.c3); + r.c2=mul_cplx(m_1,s.c4); + r.c3=mul_cplx(m_1,s.c1); + r.c4=mul_cplx(m_1,s.c2); + } + else if (mu==1) + { + r.c1=mul_cplx(m_i,s.c4); + r.c2=mul_cplx(m_i,s.c3); + r.c3=mul_cplx(i,s.c2); + r.c4=mul_cplx(i,s.c1); + } + else if (mu==2) + { + r.c1=mul_cplx(m_1,s.c4); + r.c2=s.c3; + r.c3=s.c2; + r.c4=mul_cplx(m_1,s.c1); + } + else if (mu==3) + { + r.c1=mul_cplx(m_i,s.c3); + r.c2=mul_cplx(i,s.c4); + r.c3=mul_cplx(i,s.c1); + r.c4=mul_cplx(m_i,s.c2); + } + else + { + r.c1=s.c1; + r.c2=s.c2; + r.c3=mul_cplx(m_1,s.c3); + r.c4=mul_cplx(m_1,s.c4); + } + + return r; +} + + +static spinor_dble mul_sigma(int mu,int nu,spinor_dble s) +{ + complex_dble z; + spinor_dble r1,r2; + + r1=mul_gamma(nu,s); + r1=mul_gamma(mu,r1); + + r2=mul_gamma(mu,s); + r2=mul_gamma(nu,r2); + + _vector_sub_assign(r1.c1,r2.c1); + _vector_sub_assign(r1.c2,r2.c2); + _vector_sub_assign(r1.c3,r2.c3); + _vector_sub_assign(r1.c4,r2.c4); + + z.re=0.0; + z.im=0.5; + _vector_mulc(r2.c1,z,r1.c1); + _vector_mulc(r2.c2,z,r1.c2); + _vector_mulc(r2.c3,z,r1.c3); + _vector_mulc(r2.c4,z,r1.c4); + + return r2; +} + + +static void muladd_pauli(double csw,int mu,int nu, + spinor_dble *pk,spinor_dble *pl) +{ + int ix; + double r; + + compute_Fhat(mu,nu); + + csw=(-0.25)*csw; + + for (ix=0;ix]"); + } + + 
set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.301,0.789,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + alloc_Fhat(); + alloc_wsd(3); + psd=reserve_wsd(3); + + set_sw_parms(-0.0123); + swp=sw_parms(); + dmax=0.0; + + if (my_rank==0) + printf("m0=%.4e, csw=%.4e, cF=%.4e, cF'=%.4e\n\n", + swp.m0,swp.csw,swp.cF[0],swp.cF[1]); + + for (n=0;n<4;n++) + { + set_parms(); + set_ud(); + (void)sw_term(NO_PTS); + sw=swdfld(); + + random_sd(VOLUME,psd[0],1.0); + apply_sw_dble(VOLUME,0.0,sw,psd[0],psd[1]); + mul_swd(swp.m0,swp.csw,psd[0],psd[2]); + bnd_corr(swp.cF,psd[0],psd[2]); + + mulr_spinor_add_dble(VOLUME,psd[2],psd[1],-1.0); + d=norm_square_dble(VOLUME,1,psd[2])/norm_square_dble(VOLUME,1,psd[0]); + d=sqrt(d); + if (d>dmax) + dmax=d; + + if (my_rank==0) + { + printf("Field number %d:\n",n+1); + printf("The parameters are:\n"); + printf("t=%.2f,%.2f,%.2f, a=%.2f,%.2f,%.2f,%.2f, ", + t[0],t[1],t[2],a[0],a[1],a[2],a[3]); + printf("np=%d,%d,%d,%d\n",np[0],np[1],np[2],np[3]); + printf("delta = %.2e\n\n",d); + } + } + + error_chk(); + + if (my_rank==0) + { + printf("Maximal deviation = %.1e\n\n",dmax); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/time1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/time1.c new file mode 100644 index 0000000000000000000000000000000000000000..f7eef6c947cdb7a808353c84dee0cd8233a32acb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/sw_term/time1.c @@ -0,0 +1,145 @@ + +/******************************************************************************* +* +* File time1.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* 
License (GPL) +* +* Timing of the program sw_term(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "sw_term.h" +#include "global.h" + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,count,nt; + double phi[2],phi_prime[2]; + double wt1,wt2,wdt; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("time1.log","w",stdout); + printf("\n"); + printf("Timing of the program sw_term()\n"); + printf("-------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + +#if (defined AVX) + printf("Using AVX instructions\n\n"); +#elif (defined x64) + printf("Using SSE3 instructions and up to 16 xmm registers\n\n"); +#endif + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [time1.c]", + "Syntax: time1 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.978); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.301,0.789,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + set_sw_parms(-0.0123); + random_ud(); + + nt=(int)(5.0e5/(double)(VOLUME)); + if (nt<2) + nt=2; + wdt=0.0; + + while (wdt<5.0) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + for (count=0;count that allows the type of boundary +condition to be chosen. When the option is not set, open boundary conditions +are assumed. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..4828aa1177fbc44147c2451a109866e863f25686 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/Makefile @@ -0,0 +1,141 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 check6 + +FORCES = force0 + +FLAGS = flags lat_parms hmc_parms dfl_parms + +LATTICE = bcnds uidx ftidx geometry + +LINALG = liealg cmatrix_dble + +MDFLDS = mdflds fcom + +RANDOM = ranlux ranlxs ranlxd gauss random_su3 + +SFLDS = sflds + +SU3FCTS = chexp su3prod su3ren cm3x3 + +TCHARGE = ftcom ftensor tcharge ym_action + +UFLDS = plaq_sum shift uflds udcom bstap + +UTILS = endian mutils utils wspace + +WFLOW = wflow + +MODULES = $(FLAGS) $(FORCES) $(LATTICE) $(LINALG) $(MDFLDS) \ + $(RANDOM) $(SFLDS) $(SU3FCTS) $(TCHARGE) $(UFLDS) \ + $(UTILS) $(WFLOW) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/forces:$(MDIR)/lattice:$(MDIR)/linalg:\ + $(MDIR)/mdflds:$(MDIR)/random:$(MDIR)/su3fcts:$(MDIR)/sflds:\ + $(MDIR)/su3fcts:$(MDIR)/uflds:$(MDIR)/utils:$(MDIR)/tcharge:\ + 
$(MDIR)/wflow + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..b42cf3ad8d1bb37e155cc42a520bc5fad1d4ca3a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check1.c @@ -0,0 +1,375 @@ + +/******************************************************************************* +* +* File check1.c +* +* 
Copyright (C) 2009-2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the gauge and translation invariance of the topological charge. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "su3fcts.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "tcharge.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,nfc[8],ofs[8]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) +{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + (*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + 
u=udfld(); + + for (ix=(VOLUME/2);ix(bs[mu]/2)) + svec[mu]-=bs[mu]; + } + + MPI_Bcast(svec,4,MPI_INT,0,MPI_COMM_WORLD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,i,s[4]; + double phi[2],phi_prime[2]; + double d,dmax1,dmax2; + double Q1,Q2,q1,q2; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + printf("\n"); + printf("Gauge and translation invariance of the topological charge\n"); + printf("----------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check1.c]", + "Syntax: check1 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + + g=amalloc(NSPIN*sizeof(*g),4); + + if (BNDRY>0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),4); + + error((g==NULL)||((BNDRY>0)&&(gbuf==NULL)),1,"main [check1.c]", + "Unable to allocate auxiliary arrays"); + + dmax1=0.0; + dmax2=0.0; + + for (i=0;i<8;i++) + { + random_ud(); + + Q1=tcharge(); + random_vec(s); + if (bc!=3) + s[0]=0; + shift_ud(s); + Q2=tcharge(); + + d=fabs(Q1-Q2); + if (d>dmax1) + dmax1=d; + + random_g(); + transform_ud(); + Q2=tcharge(); + + d=fabs(Q1-Q2); + if (d>dmax2) + dmax2=d; + + q1=Q1; + q2=Q2; + + MPI_Bcast(&q1,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&q2,1,MPI_INT,0,MPI_COMM_WORLD); + + error((q1!=Q1)||(q2!=Q2),1,"main [check1.c]", + "Charge is not globally the same"); + } + + error_chk(); + print_flags(); + + if (my_rank==0) + { + printf("Translation invariance = %.2e\n",dmax1); + printf("Gauge 
invariance = %.2e\n\n",dmax2); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..8759e74dccf6775b3076189fb931196931307797 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check2.c @@ -0,0 +1,463 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2009-2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Topological charge of constant abelian background fields. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "su3fcts.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "tcharge.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,np[4],bo[4]; +static double mt[4][4],inp[4],twopi; +static su3_dble ud0={{0.0}}; + + +static double afld(int *x,int mu) +{ + int nu; + double xt[4],phi; + + xt[0]=(double)(safe_mod(x[0],N0)); + xt[1]=(double)(safe_mod(x[1],N1)); + xt[2]=(double)(safe_mod(x[2],N2)); + xt[3]=(double)(safe_mod(x[3],N3)); + + phi=0.0; + + for (nu=0;nu1) + { + MPI_Reduce(&qloc,&qall,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&qall,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + return qall; + } + else + return qloc; +} + + +static double Qmt(void) +{ + int i,mu,nu,ro,si; + double sm,phi,tr; + double ft1,ft2,ft3,fs1,fs2,fs3; + + sm=0.0; + mu=0; + nu=1; + ro=2; + si=3; + + for (i=0;i<3;i++) + { + phi=twopi*mt[mu][nu]*inp[mu]*inp[nu]; + + 
ft1=sin(phi); + ft2=ft1; + ft3=-sin(2.0*phi); + + tr=(ft1+ft2+ft3)/3.0; + + ft1-=tr; + ft2-=tr; + ft3-=tr; + + phi=twopi*mt[ro][si]*inp[ro]*inp[si]; + + fs1=sin(phi); + fs2=fs1; + fs3=-sin(2.0*phi); + + tr=(fs1+fs2+fs3)/3.0; + + fs1-=tr; + fs2-=tr; + fs3-=tr; + + sm+=(ft1*fs1+ft2*fs2+ft3*fs3); + + nu=nu+1; + ro=(ro+1)%4+(ro==3); + si=(si+1)%4+(si==3); + } + + sm/=(twopi*twopi); + + if (bc==0) + sm*=(double)((N0-2)*N1)*(double)(N2*N3); + else if (bc==1) + { + sm*=(double)((N0-3)*N1)*(double)(N2*N3); + sm+=Qtbnd(); + } + else if (bc==2) + { + sm*=(double)((N0-2)*N1)*(double)(N2*N3); + sm+=Qtbnd(); + } + else + sm*=(double)(N0*N1)*(double)(N2*N3); + + return sm; +} + + +static void choose_mt(void) +{ + int mu,nu; + double r[6]; + + ranlxd(r,6); + MPI_Bcast(r,6,MPI_DOUBLE,0,MPI_COMM_WORLD); + + mt[0][1]=(double)((int)(3.0*r[0])-1); + mt[0][2]=(double)((int)(3.0*r[1])-1); + mt[0][3]=(double)((int)(3.0*r[2])-1); + mt[1][2]=(double)((int)(3.0*r[3])-1); + mt[1][3]=(double)((int)(3.0*r[4])-1); + mt[2][3]=(double)((int)(3.0*r[5])-1); + + for (mu=0;mu<4;mu++) + { + mt[mu][mu]=0.0; + + for (nu=0;nu=(VOLUME/2)) + { + x[0]=bo[0]+x0; + x[1]=bo[1]+x1; + x[2]=bo[2]+x2; + x[3]=bo[3]+x3; + + u=udb+8*(ix-(VOLUME/2)); + + for (ifc=0;ifc<8;ifc++) + { + if (ifc&0x1) + x[ifc/2]-=1; + + phi=afld(x,ifc/2); + + if (ifc&0x1) + x[ifc/2]+=1; + + (*u)=ud0; + (*u).c11.re=cos(phi); + (*u).c11.im=sin(phi); + (*u).c22.re=(*u).c11.re; + (*u).c22.im=(*u).c11.im; + (*u).c33.re=cos(-2.0*phi); + (*u).c33.im=sin(-2.0*phi); + u+=1; + } + } + } + } + } + } + + set_bc(); + set_flags(UPDATED_UD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,i; + double phi[2],phi_prime[2]; + double Q1,Q2,d,dmax; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + printf("\n"); + printf("Topological charge of constant abelian background fields\n"); + 
printf("--------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.9012,1.2034,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123); + geometry(); + + twopi=8.0*atan(1.0); + + np[0]=N0; + np[1]=N1; + np[2]=N2; + np[3]=N3; + + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + + inp[0]=1.0/(double)(np[0]); + inp[1]=1.0/(double)(np[1]); + inp[2]=1.0/(double)(np[2]); + inp[3]=1.0/(double)(np[3]); + + dmax=0.0; + + for (i=0;i<10;i++) + { + choose_mt(); + set_ud(); + Q1=Qmt(); + Q2=tcharge(); + + if (my_rank==0) + printf("Field no = %2d, Q1 = % 8.4e, Q2 = % 8.4e\n",i+1,Q1,Q2); + + d=fabs(Q1-Q2); + if (d>dmax) + dmax=d; + } + + error_chk(); + + if (my_rank==0) + { + printf("\n"); + printf("Maximal absolute deviation = %.1e\n\n",dmax); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..a730fd4ea6919e39e9a6a5d438b4cb1f4d3b703c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check3.c @@ -0,0 +1,155 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2009-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check 
of the program tcharge_slices(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "forces.h" +#include "wflow.h" +#include "tcharge.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,n,dn; +static double eps,Q1,Q2,Q[N0],Q0[N0]; + + +int main(int argc,char *argv[]) +{ + int my_rank,i,imax,t; + double phi[2],phi_prime[2]; + double nplaq,act,dev; + FILE *fin=NULL,*flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check3.log","w",stdout); + fin=freopen("check3.in","r",stdin); + + printf("\n"); + printf("Check of the program tcharge_slices()\n"); + printf("-------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("n","%d",&n); + read_line("dn","%d",&dn); + read_line("eps","%lf",&eps); + fclose(fin); + + printf("n = %d\n",n); + printf("dn = %d\n",dn); + printf("eps = %.2e\n\n",eps); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check3.c]", + "Syntax: check3 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dn,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&eps,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + set_lat_parms(6.0,1.0,0,NULL,1.0); + + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.9012,1.2034,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + alloc_wfd(2); + + if 
(bc==0) + nplaq=(double)(6*N0-6)*(double)(N1*N2*N3); + else + nplaq=(double)(6*N0)*(double)(N1*N2*N3); + + random_ud(); + imax=n/dn; + + for (i=0;i +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "su3fcts.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "tcharge.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,nfc[8],ofs[8]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) +{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + (*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + u=udfld(); + + for (ix=(VOLUME/2);ix(bs[mu]/2)) + svec[mu]-=bs[mu]; + } + + MPI_Bcast(svec,4,MPI_INT,0,MPI_COMM_WORLD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,i,s[4]; + double 
phi[2],phi_prime[2]; + double d,dmax1,dmax2; + double A1,A2,a1,a2; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check4.log","w",stdout); + printf("\n"); + printf("Gauge and translation invariance of the Yang-Mills action\n"); + printf("---------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check4.c]", + "Syntax: check4 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + + g=amalloc(NSPIN*sizeof(*g),4); + + if (BNDRY>0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),4); + + error((g==NULL)||((BNDRY>0)&&(gbuf==NULL)),1,"main [check4.c]", + "Unable to allocate auxiliary arrays"); + + dmax1=0.0; + dmax2=0.0; + + for (i=0;i<8;i++) + { + random_ud(); + + A1=ym_action(); + random_vec(s); + if (bc!=3) + s[0]=0; + shift_ud(s); + A2=ym_action(); + + d=fabs(A1-A2)/A1; + if (d>dmax1) + dmax1=d; + + random_g(); + transform_ud(); + A2=ym_action(); + + d=fabs(A1-A2)/A1; + if (d>dmax2) + dmax2=d; + + a1=A1; + a2=A2; + + MPI_Bcast(&a1,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&a2,1,MPI_INT,0,MPI_COMM_WORLD); + + error((a1!=A1)||(a2!=A2),1,"main [check4.c]", + "Action is not globally the same"); + } + + error_chk(); + print_flags(); + + if (my_rank==0) + { + printf("Translation invariance = %.2e\n",dmax1); + printf("Gauge invariance = %.2e\n\n",dmax2); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check5.c 
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check5.c new file mode 100644 index 0000000000000000000000000000000000000000..b5f1ec31ccb1790f16a66a9c013c1288054ead77 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check5.c @@ -0,0 +1,417 @@ + +/******************************************************************************* +* +* File check5.c +* +* Copyright (C) 2010, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Yang-Mills action of constant abelian background fields. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "su3fcts.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "tcharge.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,np[4],bo[4]; +static double mt[4][4],inp[4],twopi; +static su3_dble ud0={{0.0}}; + + +static double afld(int *x,int mu) +{ + int nu; + double xt[4],phi; + + xt[0]=(double)(safe_mod(x[0],N0)); + xt[1]=(double)(safe_mod(x[1],N1)); + xt[2]=(double)(safe_mod(x[2],N2)); + xt[3]=(double)(safe_mod(x[3],N3)); + + phi=0.0; + + for (nu=0;nu1) + { + MPI_Reduce(&aloc,&aall,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&aall,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + return aall; + } + else + return aloc; +} + + +static double Amt(void) +{ + int mu,nu; + double sm,pi; + double xl[4],phi,ft1,ft2,ft3,tr; + + xl[0]=(double)(NPROC0*L0); + xl[1]=(double)(NPROC1*L1); + xl[2]=(double)(NPROC2*L2); + xl[3]=(double)(NPROC3*L3); + + pi=4.0*atan(1.0); + sm=0.0; + + for (mu=1;mu<4;mu++) + { + for (nu=0;nu=(VOLUME/2)) + { + x[0]=bo[0]+x0; + x[1]=bo[1]+x1; + x[2]=bo[2]+x2; + x[3]=bo[3]+x3; + + u=udb+8*(ix-(VOLUME/2)); + + for 
(ifc=0;ifc<8;ifc++) + { + if (ifc&0x1) + x[ifc/2]-=1; + + phi=afld(x,ifc/2); + + if (ifc&0x1) + x[ifc/2]+=1; + + (*u)=ud0; + (*u).c11.re=cos(phi); + (*u).c11.im=sin(phi); + (*u).c22.re=(*u).c11.re; + (*u).c22.im=(*u).c11.im; + (*u).c33.re=cos(-2.0*phi); + (*u).c33.im=sin(-2.0*phi); + u+=1; + } + } + } + } + } + } + + set_bc(); + set_flags(UPDATED_UD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,i; + double phi[2],phi_prime[2]; + double A1,A2,d,dmax; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check5.log","w",stdout); + printf("\n"); + printf("Yang-Mills action of constant abelian background fields\n"); + printf("-------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check5.c]", + "Syntax: check5 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.9012,1.2034,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123); + geometry(); + + twopi=8.0*atan(1.0); + + np[0]=N0; + np[1]=N1; + np[2]=N2; + np[3]=N3; + + bo[0]=cpr[0]*L0; + bo[1]=cpr[1]*L1; + bo[2]=cpr[2]*L2; + bo[3]=cpr[3]*L3; + + inp[0]=1.0/(double)(np[0]); + inp[1]=1.0/(double)(np[1]); + inp[2]=1.0/(double)(np[2]); + inp[3]=1.0/(double)(np[3]); + + dmax=0.0; + + for (i=0;i<10;i++) + { + choose_mt(); + set_ud(); + + A1=Amt(); + A2=ym_action(); + + if (my_rank==0) + printf("Field no = %2d, A1 = %12.6e, A2 = %12.6e\n",i+1,A1,A2); + + d=fabs(A1-A2)/A1; + if (d>dmax) + dmax=d; + } + + error_chk(); + + if (my_rank==0) + { + printf("\n"); + printf("Maximal relative deviation = %.1e\n\n",dmax); + 
fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check6.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check6.c new file mode 100644 index 0000000000000000000000000000000000000000..17731ff6b04e175fa1173ce08ed381d28ba51113 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/tcharge/check6.c @@ -0,0 +1,146 @@ + +/******************************************************************************* +* +* File check6.c +* +* Copyright (C) 2010, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the program ym_action_slices(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "wflow.h" +#include "tcharge.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,n,dn; +static double eps,A1,A2,A[N0],A0[N0]; + + +int main(int argc,char *argv[]) +{ + int my_rank,i,imax,t; + double phi[2],phi_prime[2]; + double dev; + FILE *fin=NULL,*flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check6.log","w",stdout); + fin=freopen("check6.in","r",stdin); + + printf("\n"); + printf("Check of the program ym_action_slices()\n"); + printf("---------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("n","%d\n",&n); + read_line("dn","%d\n",&dn); + 
read_line("eps","%lf",&eps); + fclose(fin); + + printf("n = %d\n",n); + printf("dn = %d\n",dn); + printf("eps = %.2e\n\n",eps); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check6.c]", + "Syntax: check6 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dn,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&eps,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.9012,1.2034,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + alloc_wfd(2); + + random_ud(); + imax=n/dn; + + for (i=0;i that allows the type of boundary +condition to be chosen. When the option is not set, open boundary conditions +are assumed. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..88ddadd5cfe62e977a818793ea88e32e8e36e726 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/Makefile @@ -0,0 +1,125 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 + +FLAGS = flags lat_parms dfl_parms + +LATTICE = bcnds uidx 
geometry + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum shift uflds udcom bstap + +SU3FCTS = su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(UFLDS) $(SU3FCTS) $(UTILS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/uflds:\ + $(MDIR)/su3fcts:$(MDIR)/utils + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..d97b092c45d43122b20046443ced57fc69c91f4c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check1.c @@ -0,0 +1,505 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Initialization of the link variables. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static complex_dble det_dble(su3_dble *u) +{ + complex_dble det1,det2,det3,detu; + + det1.re= + ((*u).c22.re*(*u).c33.re-(*u).c22.im*(*u).c33.im)- + ((*u).c23.re*(*u).c32.re-(*u).c23.im*(*u).c32.im); + det1.im= + ((*u).c22.re*(*u).c33.im+(*u).c22.im*(*u).c33.re)- + ((*u).c23.re*(*u).c32.im+(*u).c23.im*(*u).c32.re); + det2.re= + ((*u).c21.re*(*u).c33.re-(*u).c21.im*(*u).c33.im)- + ((*u).c23.re*(*u).c31.re-(*u).c23.im*(*u).c31.im); + det2.im= + ((*u).c21.re*(*u).c33.im+(*u).c21.im*(*u).c33.re)- + ((*u).c23.re*(*u).c31.im+(*u).c23.im*(*u).c31.re); + det3.re= + ((*u).c21.re*(*u).c32.re-(*u).c21.im*(*u).c32.im)- + ((*u).c22.re*(*u).c31.re-(*u).c22.im*(*u).c31.im); + det3.im= + ((*u).c21.re*(*u).c32.im+(*u).c21.im*(*u).c32.re)- + ((*u).c22.re*(*u).c31.im+(*u).c22.im*(*u).c31.re); + + detu.re= + ((*u).c11.re*det1.re-(*u).c11.im*det1.im)- + ((*u).c12.re*det2.re-(*u).c12.im*det2.im)+ + 
((*u).c13.re*det3.re-(*u).c13.im*det3.im); + detu.im= + ((*u).c11.re*det1.im+(*u).c11.im*det1.re)- + ((*u).c12.re*det2.im+(*u).c12.im*det2.re)+ + ((*u).c13.re*det3.im+(*u).c13.im*det3.re); + + return detu; +} + + +static double dev_unity(su3 *u) +{ + int i; + float r[18]; + double d,dmax; + + r[ 0]=(*u).c11.re-1.0f; + r[ 1]=(*u).c11.im; + r[ 2]=(*u).c12.re; + r[ 3]=(*u).c12.im; + r[ 4]=(*u).c13.re; + r[ 5]=(*u).c13.im; + + r[ 6]=(*u).c21.re; + r[ 7]=(*u).c21.im; + r[ 8]=(*u).c22.re-1.0f; + r[ 9]=(*u).c22.im; + r[10]=(*u).c23.re; + r[11]=(*u).c23.im; + + r[12]=(*u).c31.re; + r[13]=(*u).c31.im; + r[14]=(*u).c32.re; + r[15]=(*u).c32.im; + r[16]=(*u).c33.re-1.0f; + r[17]=(*u).c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + d=fabs((double)(r[i])); + if (d>dmax) + dmax=d; + } + + return dmax; +} + + +static double dev_unity_dble(su3_dble *u) +{ + int i; + double r[18],d,dmax; + + r[ 0]=(*u).c11.re-1.0; + r[ 1]=(*u).c11.im; + r[ 2]=(*u).c12.re; + r[ 3]=(*u).c12.im; + r[ 4]=(*u).c13.re; + r[ 5]=(*u).c13.im; + + r[ 6]=(*u).c21.re; + r[ 7]=(*u).c21.im; + r[ 8]=(*u).c22.re-1.0; + r[ 9]=(*u).c22.im; + r[10]=(*u).c23.re; + r[11]=(*u).c23.im; + + r[12]=(*u).c31.re; + r[13]=(*u).c31.im; + r[14]=(*u).c32.re; + r[15]=(*u).c32.im; + r[16]=(*u).c33.re-1.0; + r[17]=(*u).c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + d=fabs(r[i]); + if (d>dmax) + dmax=d; + } + + return dmax; +} + + +static double dev_zero_dble(su3_dble *u) +{ + int i; + double r[18],d,dmax; + + r[ 0]=(*u).c11.re; + r[ 1]=(*u).c11.im; + r[ 2]=(*u).c12.re; + r[ 3]=(*u).c12.im; + r[ 4]=(*u).c13.re; + r[ 5]=(*u).c13.im; + + r[ 6]=(*u).c21.re; + r[ 7]=(*u).c21.im; + r[ 8]=(*u).c22.re; + r[ 9]=(*u).c22.im; + r[10]=(*u).c23.re; + r[11]=(*u).c23.im; + + r[12]=(*u).c31.re; + r[13]=(*u).c31.im; + r[14]=(*u).c32.re; + r[15]=(*u).c32.im; + r[16]=(*u).c33.re; + r[17]=(*u).c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + d=fabs(r[i]); + if (d>dmax) + dmax=d; + } + + return dmax; +} + + +static double dev_bval_dble(int 
k,double *phi,su3_dble *u) +{ + int i; + double r[18],s[3],phi3,d,dmax; + + s[0]=(double)(N1); + s[1]=(double)(N2); + s[2]=(double)(N3); + phi3=-phi[0]-phi[1]; + + r[ 0]=(*u).c11.re-cos(phi[0]/s[k-1]); + r[ 1]=(*u).c11.im-sin(phi[0]/s[k-1]); + r[ 2]=(*u).c12.re; + r[ 3]=(*u).c12.im; + r[ 4]=(*u).c13.re; + r[ 5]=(*u).c13.im; + + r[ 6]=(*u).c21.re; + r[ 7]=(*u).c21.im; + r[ 8]=(*u).c22.re-cos(phi[1]/s[k-1]); + r[ 9]=(*u).c22.im-sin(phi[1]/s[k-1]); + r[10]=(*u).c23.re; + r[11]=(*u).c23.im; + + r[12]=(*u).c31.re; + r[13]=(*u).c31.im; + r[14]=(*u).c32.re; + r[15]=(*u).c32.im; + r[16]=(*u).c33.re-cos(phi3/s[k-1]); + r[17]=(*u).c33.im-sin(phi3/s[k-1]); + + dmax=0.0; + + for (i=0;i<18;i++) + { + d=fabs(r[i]); + if (d>dmax) + dmax=d; + } + + return dmax; +} + + +static double dev_uudag_dble(su3_dble *u) +{ + su3_dble udag,w; + + _su3_dagger(udag,(*u)); + _su3_times_su3(w,(*u),udag); + + return dev_unity_dble(&w); +} + + +static double dev_detu_dble(su3_dble *u) +{ + double d,dmax; + complex_dble detu; + + detu=det_dble(u); + dmax=0.0; + + d=fabs(1.0-detu.re); + if (d>dmax) + dmax=d; + d=fabs(detu.im); + if (d>dmax) + dmax=d; + + return dmax; +} + + +static double dev_udu_dble(su3_dble *ud,su3 *u) +{ + int i; + double r[18],d,dmax; + + r[ 0]=(*ud).c11.re-(double)((*u).c11.re); + r[ 1]=(*ud).c11.im-(double)((*u).c11.im); + r[ 2]=(*ud).c12.re-(double)((*u).c12.re); + r[ 3]=(*ud).c12.im-(double)((*u).c12.im); + r[ 4]=(*ud).c13.re-(double)((*u).c13.re); + r[ 5]=(*ud).c13.im-(double)((*u).c13.im); + r[ 6]=(*ud).c21.re-(double)((*u).c21.re); + r[ 7]=(*ud).c21.im-(double)((*u).c21.im); + r[ 8]=(*ud).c22.re-(double)((*u).c22.re); + r[ 9]=(*ud).c22.im-(double)((*u).c22.im); + r[10]=(*ud).c23.re-(double)((*u).c23.re); + r[11]=(*ud).c23.im-(double)((*u).c23.im); + r[12]=(*ud).c31.re-(double)((*u).c31.re); + r[13]=(*ud).c31.im-(double)((*u).c31.im); + r[14]=(*ud).c32.re-(double)((*u).c32.re); + r[15]=(*ud).c32.im-(double)((*u).c32.im); + r[16]=(*ud).c33.re-(double)((*u).c33.re); + 
r[17]=(*ud).c33.im-(double)((*u).c33.im); + + dmax=0.0; + + for (i=0;i<18;i++) + { + d=fabs(r[i]); + if (d>dmax) + dmax=d; + } + + return dmax; +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc; + int iu,ix,ifc,x0,k,ie; + double d1,d2,dmax1,dmax2; + double dmax1_all,dmax2_all; + double phi[2],phi_prime[2]; + su3 *u,*ub,*um; + su3_dble *ud,*udb,*udm; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + + printf("\n"); + printf("Initialization of the link variables\n"); + printf("------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check1.c]", + "Syntax: check1 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + + ub=ufld(); + um=ub+4*VOLUME; + dmax1=0.0; + + for (u=ub;udmax1) + dmax1=d1; + } + + MPI_Reduce(&dmax1,&dmax1_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Allocate single-precision gauge field\n"); + printf("|u-1| = %.2e\n\n",dmax1_all); + } + + print_flags(); + + udb=udfld(); + + ie=check_bc(0.0); + error_root(ie==0,1,"main [check1.c]","Boundary conditions not properly set"); + + udm=udb+4*VOLUME; + dmax1=0.0; + dmax2=0.0; + + for (ud=udb;uddmax2) + dmax2=d2; + } + else if ((bc!=1)||(x0>0)||(ifc<2)) + { + d1=dev_unity_dble(ud); + if (d1>dmax1) + dmax1=d1; + } + else + { + d2=dev_bval_dble(ifc/2,phi,ud); + if (d2>dmax2) + dmax2=d2; + } + } + + if ((cpr[0]==(NPROC0-1))&&((bc==1)||(bc==2))) + { + ud=udb+4*VOLUME+7*(BNDRY/4); + + for 
(k=1;k<4;k++) + { + d2=dev_bval_dble(k,phi_prime,ud); + ud+=1; + + if (d2>dmax2) + dmax2=d2; + } + } + + MPI_Reduce(&dmax1,&dmax1_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Reduce(&dmax2,&dmax2_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Allocate double-precision gauge field\n"); + printf("|ud-1| = %.2e\n",dmax1_all); + if (bc!=3) + printf("|ud-bval| = %.2e\n",dmax2_all); + printf("\n"); + } + + print_flags(); + + random_ud(); + assign_ud2u(); + + ie=check_bc(0.0); + error_root(ie==0,1,"main [check1.c]","Boundary conditions changed"); + + ud=udb; + udm=udb+4*VOLUME; + u=ub; + dmax1=0.0; + + for (ud=udb;uddmax1) + dmax1=d1; + + u+=1; + } + + MPI_Reduce(&dmax1,&dmax1_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Random fields\n"); + printf("Assign double-precision to single-precision field\n"); + printf("Maximal deviation = %.2e\n\n",dmax1_all); + } + + print_flags(); + + random_ud(); + dmax1=0.0; + dmax2=0.0; + + for (ud=udb;uddmax1) + dmax1=d1; + if (d2>dmax2) + dmax2=d2; + } + + MPI_Reduce(&dmax1,&dmax1_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Reduce(&dmax2,&dmax2_all,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("Call random_ud\n"); + printf("|u^dag*u-1| = %.2e\n",dmax1_all); + printf("|det{u}-1| = %.2e\n\n",dmax2_all); + } + + print_flags(); + + if (my_rank==0) + fclose(flog); + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check2.c new file mode 100644 index 0000000000000000000000000000000000000000..039ae76be3b000ec8bc429d63aa2abeb0529b591 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check2.c @@ -0,0 +1,396 @@ + +/******************************************************************************* +* +* File check2.c +* +* Copyright (C) 2010, 2011, 2013 Martin 
Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Renormalization of the link variables. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +static int bc; + + +static int is_zero_dble(su3_dble *ud) +{ + int i; + double r[18]; + + r[ 0]=(*ud).c11.re; + r[ 1]=(*ud).c11.im; + r[ 2]=(*ud).c12.re; + r[ 3]=(*ud).c12.im; + r[ 4]=(*ud).c13.re; + r[ 5]=(*ud).c13.im; + + r[ 6]=(*ud).c21.re; + r[ 7]=(*ud).c21.im; + r[ 8]=(*ud).c22.re; + r[ 9]=(*ud).c22.im; + r[10]=(*ud).c23.re; + r[11]=(*ud).c23.im; + + r[12]=(*ud).c31.re; + r[13]=(*ud).c31.im; + r[14]=(*ud).c32.re; + r[15]=(*ud).c32.im; + r[16]=(*ud).c33.re; + r[17]=(*ud).c33.im; + + for (i=0;i<18;i++) + { + if (r[i]!=0.0) + return 0; + } + + return 1; +} + + +static complex_dble det_dble(su3_dble *u) +{ + complex_dble det1,det2,det3,detu; + + det1.re= + ((*u).c22.re*(*u).c33.re-(*u).c22.im*(*u).c33.im)- + ((*u).c23.re*(*u).c32.re-(*u).c23.im*(*u).c32.im); + det1.im= + ((*u).c22.re*(*u).c33.im+(*u).c22.im*(*u).c33.re)- + ((*u).c23.re*(*u).c32.im+(*u).c23.im*(*u).c32.re); + det2.re= + ((*u).c21.re*(*u).c33.re-(*u).c21.im*(*u).c33.im)- + ((*u).c23.re*(*u).c31.re-(*u).c23.im*(*u).c31.im); + det2.im= + ((*u).c21.re*(*u).c33.im+(*u).c21.im*(*u).c33.re)- + ((*u).c23.re*(*u).c31.im+(*u).c23.im*(*u).c31.re); + det3.re= + ((*u).c21.re*(*u).c32.re-(*u).c21.im*(*u).c32.im)- + ((*u).c22.re*(*u).c31.re-(*u).c22.im*(*u).c31.im); + det3.im= + ((*u).c21.re*(*u).c32.im+(*u).c21.im*(*u).c32.re)- + ((*u).c22.re*(*u).c31.im+(*u).c22.im*(*u).c31.re); + + detu.re= + ((*u).c11.re*det1.re-(*u).c11.im*det1.im)- + ((*u).c12.re*det2.re-(*u).c12.im*det2.im)+ + 
((*u).c13.re*det3.re-(*u).c13.im*det3.im); + detu.im= + ((*u).c11.re*det1.im+(*u).c11.im*det1.re)- + ((*u).c12.re*det2.im+(*u).c12.im*det2.re)+ + ((*u).c13.re*det3.im+(*u).c13.im*det3.re); + + return detu; +} + + +static double dev_uudag_dble(su3_dble *u,su3_dble *v) +{ + int i; + double r[18],d,dmax; + su3_dble vdag,w; + + _su3_dagger(vdag,(*v)); + _su3_times_su3(w,(*u),vdag); + + w.c11.re-=1.0; + w.c22.re-=1.0; + w.c33.re-=1.0; + + r[ 0]=w.c11.re; + r[ 1]=w.c11.im; + r[ 2]=w.c12.re; + r[ 3]=w.c12.im; + r[ 4]=w.c13.re; + r[ 5]=w.c13.im; + + r[ 6]=w.c21.re; + r[ 7]=w.c21.im; + r[ 8]=w.c22.re; + r[ 9]=w.c22.im; + r[10]=w.c23.re; + r[11]=w.c23.im; + + r[12]=w.c31.re; + r[13]=w.c31.im; + r[14]=w.c32.re; + r[15]=w.c32.im; + r[16]=w.c33.re; + r[17]=w.c33.im; + + dmax=0.0; + + for (i=0;i<18;i++) + { + d=fabs(r[i]); + if (d>dmax) + dmax=d; + } + + return dmax; +} + + +static double dev_detu_dble(su3_dble *u) +{ + double d,dmax; + complex_dble detu; + + detu=det_dble(u); + dmax=0.0; + + d=fabs(1.0-detu.re); + if (d>dmax) + dmax=d; + d=fabs(detu.im); + if (d>dmax) + dmax=d; + + return dmax; +} + + +static void check_ud(double *dev1,double *dev2) +{ + int iu,ix,ifc,x0; + double d1,d2,dmax1,dmax2; + su3_dble *u,*ub,*um; + + ub=udfld(); + um=ub+4*VOLUME; + dmax1=0.0; + dmax2=0.0; + + for (u=ub;udmax1) + dmax1=d1; + if (d2>dmax2) + dmax2=d2; + } + + MPI_Reduce(&dmax1,dev1,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Reduce(&dmax2,dev2,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(dev1,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(dev2,1,MPI_DOUBLE,0,MPI_COMM_WORLD); +} + + +static double cmp_ud(su3_dble *usv) +{ + int ix,ifc,x0; + double d1,dmax1; + su3_dble *ub,*u,*v,*um; + + ub=udfld(); + um=ub+4*VOLUME; + v=usv; + dmax1=0.0; + + for (u=ub;udmax1) + dmax1=d1; + + v+=1; + } + + error_chk(); + + d1=dmax1; + MPI_Reduce(&d1,&dmax1,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax1,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + return dmax1; +} + + +static void tilt_ud(double 
eps) +{ + int ix,ifc,t; + double r[18]; + su3_dble *ud; + + ud=udfld(); + + for (ix=(VOLUME/2);ix0)))|| + ((ifc>=2)&&((bc!=1)||(t>0)))) + { + gauss_dble(r,18); + + (*ud).c11.re+=eps*r[ 0]; + (*ud).c11.im+=eps*r[ 1]; + (*ud).c12.re+=eps*r[ 2]; + (*ud).c12.im+=eps*r[ 3]; + (*ud).c13.re+=eps*r[ 4]; + (*ud).c13.im+=eps*r[ 5]; + + (*ud).c21.re+=eps*r[ 6]; + (*ud).c21.im+=eps*r[ 7]; + (*ud).c22.re+=eps*r[ 8]; + (*ud).c22.im+=eps*r[ 9]; + (*ud).c23.re+=eps*r[10]; + (*ud).c23.im+=eps*r[11]; + + (*ud).c31.re+=eps*r[12]; + (*ud).c31.im+=eps*r[13]; + (*ud).c32.re+=eps*r[14]; + (*ud).c32.im+=eps*r[15]; + (*ud).c33.re+=eps*r[16]; + (*ud).c33.im+=eps*r[17]; + } + + ud+=1; + } + } +} + + +int main(int argc,char *argv[]) +{ + int my_rank,ie; + double d1,d2,d3,d4,d5; + double phi[2],phi_prime[2]; + su3_dble *udb,**usv; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + + printf("\n"); + printf("Renormalization of the link variables\n"); + printf("-------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + alloc_wud(1); + usv=reserve_wud(1); + udb=udfld(); + + random_ud(); + check_ud(&d1,&d2); + + if (my_rank==0) + { + printf("Random double-precision gauge field:\n"); + printf("|u^dag*u-1| = %.2e\n",d1); + printf("|det{u}-1| = %.2e\n\n",d2); + } + + cm3x3_assign(4*VOLUME,udb,usv[0]); + tilt_ud(50.0*DBL_EPSILON); + 
check_ud(&d1,&d2); + renormalize_ud(); + d3=cmp_ud(usv[0]); + check_ud(&d4,&d5); + + ie=check_bc(0.0); + error_root(ie==0,1,"main [check2.c]","Boundary conditions changed"); + + if (my_rank==0) + { + printf("Tilt double-precision gauge field:\n"); + printf("|u^dag*u-1| = %.2e\n",d1); + printf("|det{u}-1| = %.2e\n\n",d2); + + printf("After renormalization:\n"); + printf("|u^dag*u_old-1| = %.2e\n",d3); + printf("|u^dag*u-1| = %.2e\n",d4); + printf("|det{u}-1| = %.2e\n\n",d5); + } + + random_ud(); + cm3x3_assign(4*VOLUME,udb,usv[0]); + renormalize_ud(); + d1=cmp_ud(usv[0]); + + ie=check_bc(0.0); + error_root(ie==0,1,"main [check2.c]","Boundary conditions changed"); + + if (my_rank==0) + { + printf("Renormalization of a fresh random double-precision field:\n"); + printf("Maximal change in the link variables = %.2e\n\n",d1); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..8ceb1a136b408675e4a638a96e749cacc95122be --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check3.c @@ -0,0 +1,448 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2007, 2011, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the program that translates the double-precision gauge field. 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "global.h" + +static int my_rank,ipsnd,iprcv,*isnd; +static su3_dble *uold,*unew,*ubuf; + + +static void alloc_bufs(void) +{ + isnd=amalloc(NPROC*sizeof(*isnd),3); + uold=amalloc(12*VOLUME*sizeof(*uold),ALIGN); + error((isnd==NULL)||(uold==NULL),1,"alloc_bufs [check3.c]", + "Unable to allocate auxiliary arrays"); + + unew=uold+4*VOLUME; + ubuf=unew+4*VOLUME; +} + + +static int range(int *dist,int *s,int *ra,int *rb) +{ + int io,l[4],nl[4]; + int mu,a,b; + + io=1; + + l[0]=L0; + l[1]=L1; + l[2]=L2; + l[3]=L3; + + nl[0]=L0*NPROC0; + nl[1]=L1*NPROC1; + nl[2]=L2*NPROC2; + nl[3]=L3*NPROC3; + + for (mu=0;mu<4;mu++) + { + a=dist[mu]+s[mu]; + b=a+l[mu]; + + a=safe_mod(a,nl[mu]); + b=safe_mod(b,nl[mu]); + + if (a==b) + { + ra[mu]=0; + rb[mu]=l[mu]; + } + else if (a1) + { + io=itest; + MPI_Reduce(&io,&itest,1,MPI_INT,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&itest,1,MPI_INT,0,MPI_COMM_WORLD); + } + + return itest; +} + + +static void random_vec(int *svec) +{ + int mu,bs[4]; + double r[4]; + + bs[0]=NPROC0*L0; + bs[1]=NPROC1*L1; + bs[2]=NPROC2*L2; + bs[3]=NPROC3*L3; + + ranlxd(r,4); + + for (mu=0;mu<4;mu++) + { + svec[mu]=(int)((double)(bs[mu])*r[mu]); + if (svec[mu]>(bs[mu]/2)) + svec[mu]-=bs[mu]; + } + + MPI_Bcast(svec,4,MPI_INT,0,MPI_COMM_WORLD); +} + + +int main(int argc,char *argv[]) +{ + int bc,ie; + int ifc,mu,s[4],n,itest; + double phi[2],phi_prime[2]; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check3.log","w",stdout); + + printf("\n"); + printf("Translation of the double-precision gauge field\n"); + printf("-----------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, 
",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check3.c]", + "Syntax: check3 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + geometry(); + alloc_bufs(); + + if (my_rank==0) + printf("Elementary shift vectors:\n\n"); + + for (ifc=0;ifc<8;ifc++) + { + if ((ifc>1)||(bc==3)) + { + random_ud(); + save_field(uold); + + s[0]=0; + s[1]=0; + s[2]=0; + s[3]=0; + mu=ifc/2; + + if ((ifc&0x1)==0) + s[mu]=1; + else + s[mu]=-1; + + shift_ud(s); + save_field(unew); + itest=cmp_field(s); + + ie=check_bc(0.0); + error_root(ie==0,1,"main [check3.c]","Boundary conditions changed"); + + if (my_rank==0) + { + printf("Shift vector (% 3d,% 3d,% 3d,% 3d): ", + s[0],s[1],s[2],s[3]); + + if (itest==0) + printf("ok\n"); + else + printf("failed\n"); + } + } + } + + if (my_rank==0) + { + printf("\n"); + printf("Random shift vectors:\n\n"); + } + + for (n=0;n<8;n++) + { + random_ud(); + save_field(uold); + + random_vec(s); + if (bc!=3) + s[0]=0; + shift_ud(s); + save_field(unew); + itest=cmp_field(s); + + ie=check_bc(0.0); + error_root(ie==0,1,"main [check3.c]","Boundary conditions changed"); + + if (my_rank==0) + { + printf("Shift vector (% 3d,% 3d,% 3d,% 3d): ", + s[0],s[1],s[2],s[3]); + + if (itest==0) + printf("ok\n"); + else + printf("failed\n"); + } + } + + error_chk(); + + if (my_rank==0) + { + printf("\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check4.c new file mode 100644 index 
0000000000000000000000000000000000000000..46f1b451b6a0c75d0648961525623c03fc360f8a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/uflds/check4.c @@ -0,0 +1,491 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2005, 2007-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the programs for the plaquette sums of the double-precision +* gauge field. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,nfc[8],ofs[8]; +static double asl1[N0],asl2[N0]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) 
+{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + (*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + u=udfld(); + + for (ix=(VOLUME/2);ix(bs[mu]/2)) + svec[mu]-=bs[mu]; + } + + MPI_Bcast(svec,4,MPI_INT,0,MPI_COMM_WORLD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,n,t,s[4]; + double phi[2],phi_prime[2],act1; + double nplaq1,nplaq2,p1,p2; + double d1,d2,d3; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check4.log","w",stdout); + + printf("\n"); + printf("Plaquette sums of the double-precision gauge field\n"); + printf("--------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check4.c]", + "Syntax: check4 [-bc ]"); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,12345); + geometry(); + + g=amalloc(NSPIN*sizeof(*g),4); + + if (BNDRY>0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),4); + + error((g==NULL)||((BNDRY>0)&&(gbuf==NULL)),1,"main [check4.c]", + "Unable to allocate auxiliary arrays"); + + p1=plaq_sum_dble(1); + p2=plaq_wsum_dble(1); + + if (bc==0) + { + nplaq1=(double)((6*N0-3)*N1)*(double)(N2*N3); + nplaq2=(double)((6*N0-6)*N1)*(double)(N2*N3); + } + else if (bc==3) + { + nplaq1=(double)(6*N0*N1)*(double)(N2*N3); + nplaq2=nplaq1; + } + else + { + 
nplaq1=(double)((6*N0+3)*N1)*(double)(N2*N3); + nplaq2=(double)(6*N0*N1)*(double)(N2*N3); + } + + d1=0.0; + d2=0.0; + + if (bc==1) + { + d1=cos(phi[0]/(double)(N1))+ + cos(phi[1]/(double)(N1))+ + cos((phi[0]+phi[1])/(double)(N1))+ + cos(phi[0]/(double)(N2))+ + cos(phi[1]/(double)(N2))+ + cos((phi[0]+phi[1])/(double)(N2))+ + cos(phi[0]/(double)(N3))+ + cos(phi[1]/(double)(N3))+ + cos((phi[0]+phi[1])/(double)(N3)); + + d1=(d1-9.0)*(double)(N1*N2*N3); + } + + if ((bc==1)||(bc==2)) + { + d2=cos(phi_prime[0]/(double)(N1))+ + cos(phi_prime[1]/(double)(N1))+ + cos((phi_prime[0]+phi_prime[1])/(double)(N1))+ + cos(phi_prime[0]/(double)(N2))+ + cos(phi_prime[1]/(double)(N2))+ + cos((phi_prime[0]+phi_prime[1])/(double)(N2))+ + cos(phi_prime[0]/(double)(N3))+ + cos(phi_prime[1]/(double)(N3))+ + cos((phi_prime[0]+phi_prime[1])/(double)(N3)); + + d2=(d2-9.0)*(double)(N1*N2*N3); + } + + if (my_rank==0) + { + printf("After field initialization:\n"); + printf("Deviation from expected value (plaq_sum) = %.1e\n", + fabs(1.0-p1/(3.0*nplaq1+d1+d2))); + printf("Deviation from expected value (plaq_wsum) = %.1e\n\n", + fabs(1.0-p2/(3.0*nplaq2+d1+d2))); + } + + print_flags(); + random_ud(); + + p1=plaq_sum_dble(1); + p2=plaq_wsum_dble(1); + act1=plaq_action_slices(asl1); + d1=act1; + + if ((bc==0)||(bc==3)) + { + for (t=0;t +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "global.h" + +static const int plns[6][2]={{0,1},{0,2},{0,3},{2,3},{3,1},{1,2}}; +static int bc,nfc[8],ofs[8],hofs[8]; +static double psum0[8],psum1[8]; +static su3_dble *udb,*hdb; +static su3_dble wd1,wd2 ALIGNED16; + + +static void set_ofs(void) +{ + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+(FACE0/2); + ofs[2]=ofs[1]+(FACE0/2); + 
ofs[3]=ofs[2]+(FACE1/2); + ofs[4]=ofs[3]+(FACE1/2); + ofs[5]=ofs[4]+(FACE2/2); + ofs[6]=ofs[5]+(FACE2/2); + ofs[7]=ofs[6]+(FACE3/2); + + hofs[0]=0; + hofs[1]=hofs[0]+3*FACE0; + hofs[2]=hofs[1]+3*FACE0; + hofs[3]=hofs[2]+3*FACE1; + hofs[4]=hofs[3]+3*FACE1; + hofs[5]=hofs[4]+3*FACE2; + hofs[6]=hofs[5]+3*FACE2; + hofs[7]=hofs[6]+3*FACE3; +} + + +static double plaq0(int n,int ix) +{ + int ip[4]; + double sm; + + plaq_uidx(n,ix,ip); + + su3xsu3(udb+ip[0],udb+ip[1],&wd1); + su3dagxsu3dag(udb+ip[3],udb+ip[2],&wd2); + cm3x3_retr(&wd1,&wd2,&sm); + + return sm; +} + + +static double plaq1(int iu,int ih) +{ + su3xsu3dag(udb+iu,hdb+ih,&wd1); + + return wd1.c11.re+wd1.c22.re+wd1.c33.re; +} + + +static void set_psum0(void) +{ + int ifc,n,ix,mu,nu; + + for (ifc=0;ifc<8;ifc++) + psum0[ifc]=0.0; + + for (ix=0;ix=VOLUME) + psum0[2*mu+1]+=plaq0(n,ix); + + if (iup[ix][nu]>=VOLUME) + psum0[2*nu+1]+=plaq0(n,ix); + + if (idn[ix][mu]>=VOLUME) + psum0[2*mu]+=plaq0(n,ix); + + if (idn[ix][nu]>=VOLUME) + psum0[2*nu]+=plaq0(n,ix); + } + } +} + + +static void set_psum1(void) +{ + int ifc,n,ix,mu,nu,ip[4]; + int iy,ib,iu,ih; + + for (ifc=0;ifc<8;ifc++) + psum1[ifc]=0.0; + + for (ix=0;ix=VOLUME) + { + plaq_uidx(n,ix,ip); + iu=ip[1]; + + ifc=2*mu+1; + iy=iup[ix][mu]-VOLUME; + + if (iy<(BNDRY/2)) + ib=iy-ofs[ifc]; + else + ib=iy-ofs[ifc]-(BNDRY/2)+nfc[ifc]; + + ih=hofs[ifc]+3*ib+nu-(nu>mu); + + psum1[ifc]+=plaq1(iu,ih); + } + + if (iup[ix][nu]>=VOLUME) + { + plaq_uidx(n,ix,ip); + iu=ip[3]; + + ifc=2*nu+1; + iy=iup[ix][nu]-VOLUME; + + if (iy<(BNDRY/2)) + ib=iy-ofs[ifc]; + else + ib=iy-ofs[ifc]-(BNDRY/2)+nfc[ifc]; + + ih=hofs[ifc]+3*ib+mu-(mu>nu); + + psum1[ifc]+=plaq1(iu,ih); + } + + if (idn[ix][mu]>=VOLUME) + { + plaq_uidx(n,ix,ip); + iu=ip[2]; + + ifc=2*mu; + iy=idn[ix][mu]-VOLUME; + + if (iy<(BNDRY/2)) + ib=iy-ofs[ifc]; + else + ib=iy-ofs[ifc]-(BNDRY/2)+nfc[ifc]; + + ih=hofs[ifc]+3*ib+nu-(nu>mu); + + psum1[ifc]+=plaq1(iu,ih); + } + + if (idn[ix][nu]>=VOLUME) + { + plaq_uidx(n,ix,ip); + iu=ip[0]; 
+ + ifc=2*nu; + iy=idn[ix][nu]-VOLUME; + + if (iy<(BNDRY/2)) + ib=iy-ofs[ifc]; + else + ib=iy-ofs[ifc]-(BNDRY/2)+nfc[ifc]; + + ih=hofs[ifc]+3*ib+mu-(mu>nu); + + psum1[ifc]+=plaq1(iu,ih); + } + } + } +} + + +static void check_psums(void) +{ + int ifc,np; + int saddr,raddr,nbf,tag; + double sbuf,rbuf,dmy[8]; + MPI_Status stat; + + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; + + for (ifc=0;ifc<8;ifc++) + { + if (nfc[ifc]>0) + { + saddr=npr[ifc]; + raddr=npr[ifc^0x1]; + sbuf=psum0[ifc]; + nbf=1; + tag=mpi_tag(); + + if (np==0) + { + MPI_Send(&sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(&rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(&rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(&sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + + if ((bc!=3)&& + (((cpr[0]==0)&&(ifc==1))||((cpr[0]==(NPROC0-1))&&(ifc==0)))) + psum1[ifc^0x1]=0.0; + else + psum1[ifc^0x1]-=rbuf; + } + } + + for (ifc=0;ifc<8;ifc++) + dmy[ifc]=fabs(psum0[ifc]); + + MPI_Reduce(dmy,psum0,8,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(psum0,8,MPI_DOUBLE,0,MPI_COMM_WORLD); + + for (ifc=0;ifc<8;ifc++) + dmy[ifc]=fabs(psum1[ifc]); + + MPI_Reduce(dmy,psum1,8,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(psum1,8,MPI_DOUBLE,0,MPI_COMM_WORLD); +} + + +int main(int argc,char *argv[]) +{ + int my_rank,ifc,ie; + double phi[2],phi_prime[2]; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check5.log","w",stdout); + + printf("\n"); + printf("Check of the program set_bstap()\n"); + printf("--------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check5.c]", + "Syntax: check5 [-bc ]"); + } + + 
MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,89103); + geometry(); + + print_flags(); + + random_ud(); + set_bstap(); + + print_flags(); + + udb=udfld(); + hdb=bstap(); + set_ofs(); + set_psum0(); + set_psum1(); + check_psums(); + + ie=check_bc(0.0); + error_root(ie==0,1,"main [check5.c]","Boundary conditions changed"); + + if (my_rank==0) + { + for (ifc=0;ifc<8;ifc++) + { + if (nfc[ifc]>0) + { + printf("ifc = %d, max|sum| = %.4e, maximal deviation = %.1e\n", + ifc,psum0[ifc],psum1[ifc]); + } + } + + printf("\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..cd36ba36a838f84804ffb47d4a30da40e2401383 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/INDEX @@ -0,0 +1,22 @@ + +HMC algorithm + +check1 Check of the program set_mdsteps(). + +check2 Reversibility of the MD evolution. + +check3 Conservation of the Hamilton function by the MD evolution. + +check4 Check of add_chrono() and get_chrono(). + +check5 Comparison of rwtm*() with action1(). + +check6 Comparison of rwtm*eo() with action4(). + +The programs check5 and check6 accept the option -bc that allows the +type of boundary condition to be chosen at runtime. When the option is not +set, open boundary conditions are assumed. + +The option may be set but has no effect in the case of the other programs. In +the case of check2 and check3, the boundary conditions are selected through +the input parameter file. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..0a831c210391cb11f6f2fdb3f5ca9589b9b95a24 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/Makefile @@ -0,0 +1,168 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 check5 check6 + +ARCHIVE = archive + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +DFL = dfl_geometry dfl_subspace ltl_gcr dfl_sap_gcr dfl_modes + +DIRAC = Dw_dble Dw Dw_bnd + +FLAGS = flags action_parms dfl_parms force_parms hmc_parms lat_parms \ + mdint_parms rat_parms sap_parms solver_parms + +FORCES = force0 force1 force2 force3 force4 force5 \ + frcfcts genfrc tmcg tmcgm xtensor + +LATTICE = bcnds uidx ftidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix + +LINSOLV = cgne fgcr fgcr4vd mscg + +LITTLE = Aw_gen Aw_com Aw_ops Aw_dble Aw ltl_modes + +MDFLDS = mdflds fcom + +RANDOM = ranlux ranlxs ranlxd gauss + +RATFCTS = elliptic zolotarev ratfcts + +SAP = sap_com sap_gcr sap blk_solv + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +SW_TERM = pauli pauli_dble swflds sw_term + +TCHARGE = ftcom ftensor + 
+UFLDS = plaq_sum uflds udcom bstap + +UPDATE = chrono mdsteps counters mdint hmc rwrat rwtm rwtmeo + +UTILS = endian mutils utils wspace + +VFLDS = vflds vinit vcom vdcom + +MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/mdflds:$(MDIR)/su3fcts:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + $(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ + $(MDIR)/update:$(MDIR)/ratfcts + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + +# -DMDINT_DBG -DRWRAT_DBG + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) 
+ + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..033baccbb528de1949ace998ee53d361dcfbebe1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check1.c @@ -0,0 +1,207 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of set_mdsteps(). 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "mdflds.h" +#include "update.h" +#include "global.h" + +static int my_rank; +static force_t force[]={FRG,FRF_TM1,FRF_TM1_EO,FRF_TM1_EO_SDET, + FRF_TM2,FRF_TM2_EO,FRF_RAT,FRF_RAT_SDET}; + + +static void read_hmc_parms(void) +{ + int nlv; + double tau; + + if (my_rank==0) + { + find_section("HMC parameters"); + read_line("nlv","%d",&nlv); + read_line("tau","%lf",&tau); + } + + MPI_Bcast(&nlv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_hmc_parms(0,NULL,0,0,NULL,nlv,tau); +} + + +static void read_integrator(void) +{ + int nlv,i,j,k,idf; + int irat[3],imu[4],isp[4],ncr[4]; + hmc_parms_t hmc; + mdint_parms_t mdp; + force_parms_t fp; + char line[NAME_SIZE]; + + for (i=0;i<3;i++) + irat[i]=0; + + for (i=0;i<4;i++) + { + imu[i]=0; + isp[i]=0; + ncr[i]=0; + } + + hmc=hmc_parms(); + nlv=hmc.nlv; + + for (i=0;i +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "archive.h" +#include "forces.h" +#include "dfl.h" +#include "update.h" +#include "global.h" + +static int my_rank; + + +static void read_lat_parms(void) +{ + int nk; + double beta,c0,csw,*kappa; + + if (my_rank==0) + { + find_section("Lattice parameters"); + read_line("beta","%lf",&beta); + read_line("c0","%lf",&c0); + nk=count_tokens("kappa"); + read_line("csw","%lf",&csw); + } + + MPI_Bcast(&beta,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&c0,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&nk,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + if (nk>0) + { + kappa=malloc(nk*sizeof(*kappa)); + 
error(kappa==NULL,1,"read_lat_parms [check2.c]", + "Unable to allocate parameter array"); + if (my_rank==0) + read_dprms("kappa",nk,kappa); + MPI_Bcast(kappa,nk,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + kappa=NULL; + + set_lat_parms(beta,c0,nk,kappa,csw); + + if (nk>0) + free(kappa); +} + + +static void read_bc_parms(void) +{ + int bc; + double cG,cG_prime,cF,cF_prime; + double phi[2],phi_prime[2]; + + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cG=1.0; + cG_prime=1.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + { + read_line("cG","%lf",&cG); + read_line("cF","%lf",&cF); + } + + if (bc==2) + { + read_line("cG'","%lf",&cG_prime); + read_line("cF'","%lf",&cF_prime); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + set_bc_parms(bc,cG,cG_prime,cF,cF_prime,phi,phi_prime); +} + + +static void read_hmc_parms(void) +{ + int nact,*iact; + int npf,nmu,nlv; + double tau,*mu; + + if (my_rank==0) + { + find_section("HMC parameters"); + nact=count_tokens("actions"); + read_line("npf","%d",&npf); + nmu=count_tokens("mu"); + read_line("nlv","%d",&nlv); + read_line("tau","%lf",&tau); + } + + MPI_Bcast(&nact,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&npf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmu,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nlv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + if (nact>0) + { + iact=malloc(nact*sizeof(*iact)); + error(iact==NULL,1,"read_hmc_parms [check2.c]", + "Unable to allocate temporary array"); + if (my_rank==0) + 
read_iprms("actions",nact,iact); + MPI_Bcast(iact,nact,MPI_INT,0,MPI_COMM_WORLD); + } + else + iact=NULL; + + if (nmu>0) + { + mu=malloc(nmu*sizeof(*mu)); + error(mu==NULL,1,"read_hmc_parms [check2.c]", + "Unable to allocate temporary array"); + if (my_rank==0) + read_dprms("mu",nmu,mu); + MPI_Bcast(mu,nmu,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + mu=NULL; + + set_hmc_parms(nact,iact,npf,nmu,mu,nlv,tau); + + if (nact>0) + free(iact); + if (nmu>0) + free(mu); +} + + +static void read_integrator(void) +{ + int nlv,i,j,k,l; + hmc_parms_t hmc; + mdint_parms_t mdp; + force_parms_t fp; + rat_parms_t rp; + + hmc=hmc_parms(); + nlv=hmc.nlv; + + for (i=0;i0)) + add2counter("modes",2,status+2); +} + + +static void start_hmc(double *act0,su3_dble *uold) +{ + int i,n,nact,*iact; + int status[3]; + double *mu; + su3_dble *udb; + dfl_parms_t dfl; + hmc_parms_t hmc; + action_parms_t ap; + + clear_counters(); + + udb=udfld(); + cm3x3_assign(4*VOLUME,udb,uold); + chs_ubnd(-1); + random_mom(); + + dfl=dfl_parms(); + + if (dfl.Ns) + { + dfl_modes(status); + error_root(status[0]<0,1,"start_hmc [hmc.c]", + "Deflation subspace generation failed (status = %d)", + status[0]); + add2counter("modes",0,status); + } + + hmc=hmc_parms(); + nact=hmc.nact; + iact=hmc.iact; + mu=hmc.mu; + n=2; + + for (i=0;idmax) + dmax=dev; + } + + return dmax; +} + + +static double max_dev_ud(su3_dble *v) +{ + double d,dmax; + su3_dble *u,*um; + + u=udfld(); + um=u+4*VOLUME; + dmax=0.0; + + for (;udmax) + dmax=d; + + v+=1; + } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return sqrt(dmax); +} + + +int main(int argc,char *argv[]) +{ + int first,last,step; + int nc,nsize,icnfg,nact,i; + int isap,idfl; + int nwud,nws,nwsd,nwv,nwvd; + double *act0,*act1,*act2; + double sm0[2],sm1[2],dud,dH; + double dudmin,dudmax,dudavg,dHmin,dHmax,dHavg; + su3_dble **usv; + hmc_parms_t hmc; + char 
cnfg_dir[NAME_SIZE],cnfg_file[NAME_SIZE]; + char nbase[NAME_SIZE]; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + fin=freopen("check2.in","r",stdin); + + printf("\n"); + printf("Reversibility of the MD evolution\n"); + printf("---------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + find_section("Configurations"); + read_line("cnfg_dir","%s",cnfg_dir); + read_line("name","%s",nbase); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + } + + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(nbase,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&first,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&last,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&step,1,MPI_INT,0,MPI_COMM_WORLD); + + read_lat_parms(); + read_bc_parms(); + read_hmc_parms(); + read_actions(); + read_integrator(); + read_solvers(); + + if (my_rank==0) + fclose(fin); + + hmc_wsize(&nwud,&nws,&nwsd,&nwv,&nwvd); + alloc_wud(nwud); + alloc_ws(nws); + alloc_wsd(nwsd); + alloc_wv(nwv); + alloc_wvd(nwvd); + usv=reserve_wud(1); + + hmc=hmc_parms(); + nact=hmc.nact; + act0=malloc(3*(nact+1)*sizeof(*act0)); + act1=act0+nact+1; + act2=act1+nact+1; + error(act0==NULL,1,"main [check2.c]","Unable to allocate action arrays"); + + print_lat_parms(); + print_bc_parms(); + print_hmc_parms(); + print_action_parms(); + print_rat_parms(); + print_mdint_parms(); + print_force_parms2(); + print_solver_parms(&isap,&idfl); + if (isap) + print_sap_parms(0); + if (idfl) + print_dfl_parms(1); + + if (my_rank==0) + { + printf("Configurations %sn%d -> %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + fflush(flog); + } + + start_ranlux(0,1234); + geometry(); + + 
error_root(((last-first)%step)!=0,1,"main [check2.c]", + "last-first is not a multiple of step"); + check_dir_root(cnfg_dir); + + nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check2.c]", + "Configuration file name is too long"); + + hmc_sanity_check(); + set_mdsteps(); + setup_counters(); + setup_chrono(); + + dudmin=0.0; + dudmax=0.0; + dudavg=0.0; + dHmin=0.0; + dHmax=0.0; + dHavg=0.0; + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); + + if (my_rank==0) + { + printf("Configuration no %d\n",icnfg); + fflush(flog); + } + + start_hmc(act0,usv[0]); + dud=max_dev_ud(usv[0]); + run_mdint(); + end_hmc(act1); + + sm0[0]=0.0; + sm0[1]=0.0; + + for (i=0;i<=nact;i++) + { + sm0[0]+=act0[i]; + sm0[1]+=(act1[i]-act0[i]); + } + + MPI_Reduce(sm0,sm1,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(sm1,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + printf("start_hmc:\n"); + printf("max|U_ij-U'_ij| = %.1e\n",dud); + printf("run_mdint:\n"); + printf("H = %.6e\n",sm1[0]); + printf("dH = %.2e\n",sm1[1]); + fflush(flog); + } + + print_all_avgstat(); + + flip_mom(); + run_mdint(); + end_hmc(act2); + + sm0[0]=0.0; + sm0[1]=0.0; + + for (i=0;i<=nact;i++) + { + sm0[0]+=act2[i]; + sm0[1]+=(act2[i]-act0[i]); + } + + MPI_Reduce(sm0,sm1,2,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(sm1,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + dH=fabs(sm1[1]); + dud=max_dev_ud(usv[0]); + error_chk(); + + if (my_rank==0) + { + printf("Flip momenta and run_mdint:\n"); + printf("H = %.6e\n",sm1[0]); + printf("|dH| = % .2e\n",dH); + printf("max|U_ij-U'_ij| = %.2e\n\n",dud); + fflush(flog); + } + + if (icnfg==first) + { + dudmin=dud; + dudmax=dud; + dudavg=dud; + + dHmin=dH; + dHmax=dH; + dHavg=dH; + } + else + { + if (duddudmax) + dudmax=dud; + dudavg+=dud; + + if (dHdHmax) + dHmax=dH; + dHavg+=dH; + } + } + + if (my_rank==0) + { + nc=(last-first)/step+1; + + printf("Test 
summary\n"); + printf("------------\n\n"); + + printf("Considered %d configurations in the range %d -> %d\n\n", + nc,first,last); + + printf("The three figures quoted in each case are the minimal,\n"); + printf("maximal and average values\n\n"); + + printf("max|U_ij-U'_ij| = %.2e, %.2e, %.2e\n", + dudmin,dudmax,dudavg/(double)(nc)); + printf("|dH| = %.2e, %.2e, %.2e\n\n", + dHmin,dHmax,dHavg/(double)(nc)); + + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check2.in new file mode 100644 index 0000000000000000000000000000000000000000..f9bea7b590a6990e38c3ee8854dab0859aadf84f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check2.in @@ -0,0 +1,161 @@ + +[Configurations] +cnfg_dir /home/data/openQCD/cnfg +name 16x8x8x8b6.00id2 +first 7 +last 7 +step 2 + +[Lattice parameters] +beta 6.0 +c0 1.6667 +kappa 0.1300 0.12505 +csw 1.234 + +[Boundary conditions] +type 0 +#phi 0.12 -0.56 +#phi' 0.92 0.76 +cG 1.10 +#cG' 1.05 +cF 0.95 +#cF' 0.90 + +[HMC parameters] +actions 0 1 2 3 4 +npf 4 +mu 0.1 1.0 +nlv 3 +tau 0.5 + +[Action 1] +action ACF_TM1 +ipf 0 +im0 0 +imu 1 +isp 0 + +[Action 0] +action ACG + +[Action 2] +action ACF_TM2 +ipf 1 +im0 0 +imu 0 1 +isp 1 0 + +[Action 3] +action ACF_RAT_SDET +ipf 2 +im0 1 +irat 0 0 6 +isp 2 + +[Action 4] +action ACF_RAT +ipf 3 +im0 1 +irat 0 7 11 +isp 2 + +[Rational 0] +degree 12 +range 0.001 7.7 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF4 +nstep 1 +forces 1 2 3 + +[Level 2] +integrator LPFR +nstep 3 +forces 4 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1 +isp 3 +ncr 0 + +[Force 2] +force FRF_TM2 +isp 4 +ncr 0 + +[Force 3] +force FRF_RAT_SDET +isp 5 + +[Force 4] +force FRF_RAT +isp 5 + +[Solver 0] +solver CGNE +nmx 256 +res 1.0e-12 + +[Solver 1] +solver SAP_GCR +nkv 16 +isolv 1 +nmr 
4 +ncy 5 +nmx 24 +res 1.0e-12 + +[Solver 2] +solver MSCG +nmx 256 +res 1.0e-12 + +[Solver 3] +solver CGNE +nmx 256 +res 1.0e-10 + +[Solver 4] +solver SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 24 +res 1.0e-10 + +[Solver 5] +solver MSCG +nmx 256 +res 1.0e-10 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.3 +nsm 1 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..4c79ba504d2d1e6a40d37fcaa18dc9948e4d74c0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check3.c @@ -0,0 +1,812 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2005, 2007, 2009-2013 Martin Luescher, Filippo Palombi, +* Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Conservation of the Hamilton function by the MD evolution. 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "linalg.h" +#include "archive.h" +#include "forces.h" +#include "dfl.h" +#include "update.h" +#include "global.h" + +static int my_rank; + + +static void read_lat_parms(void) +{ + int nk; + double beta,c0,csw,*kappa; + + if (my_rank==0) + { + find_section("Lattice parameters"); + read_line("beta","%lf",&beta); + read_line("c0","%lf",&c0); + nk=count_tokens("kappa"); + read_line("csw","%lf",&csw); + } + + MPI_Bcast(&beta,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&c0,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&nk,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + if (nk>0) + { + kappa=malloc(nk*sizeof(*kappa)); + error(kappa==NULL,1,"read_lat_parms [check3.c]", + "Unable to allocate parameter array"); + if (my_rank==0) + read_dprms("kappa",nk,kappa); + MPI_Bcast(kappa,nk,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + kappa=NULL; + + set_lat_parms(beta,c0,nk,kappa,csw); + + if (nk>0) + free(kappa); +} + + +static void read_bc_parms(void) +{ + int bc; + double cG,cG_prime,cF,cF_prime; + double phi[2],phi_prime[2]; + + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cG=1.0; + cG_prime=1.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + { + read_line("cG","%lf",&cG); + read_line("cF","%lf",&cF); + } + + if (bc==2) + { + read_line("cG'","%lf",&cG_prime); + read_line("cF'","%lf",&cF_prime); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + 
MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + set_bc_parms(bc,cG,cG_prime,cF,cF_prime,phi,phi_prime); +} + + +static void read_hmc_parms(void) +{ + int nact,*iact; + int npf,nmu,nlv; + double tau,*mu; + + if (my_rank==0) + { + find_section("HMC parameters"); + nact=count_tokens("actions"); + read_line("npf","%d",&npf); + nmu=count_tokens("mu"); + read_line("nlv","%d",&nlv); + read_line("tau","%lf",&tau); + } + + MPI_Bcast(&nact,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&npf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmu,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nlv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + if (nact>0) + { + iact=malloc(nact*sizeof(*iact)); + error(iact==NULL,1,"read_hmc_parms [check3.c]", + "Unable to allocate temporary array"); + if (my_rank==0) + read_iprms("actions",nact,iact); + MPI_Bcast(iact,nact,MPI_INT,0,MPI_COMM_WORLD); + } + else + iact=NULL; + + if (nmu>0) + { + mu=malloc(nmu*sizeof(*mu)); + error(mu==NULL,1,"read_hmc_parms [check3.c]", + "Unable to allocate temporary array"); + if (my_rank==0) + read_dprms("mu",nmu,mu); + MPI_Bcast(mu,nmu,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + mu=NULL; + + set_hmc_parms(nact,iact,npf,nmu,mu,nlv,tau); + + if (nact>0) + free(iact); + if (nmu>0) + free(mu); +} + + +static void read_integrator(void) +{ + int nlv,i,j,k,l; + hmc_parms_t hmc; + mdint_parms_t mdp; + force_parms_t fp; + rat_parms_t rp; + + hmc=hmc_parms(); + nlv=hmc.nlv; + + for (i=0;i0)) + add2counter("modes",2,status+2); +} + + +static void start_hmc(double *act0,su3_dble *uold,su3_alg_dble *mold) +{ + int i,n,nact,*iact; + int status[3]; + double *mu; + su3_dble *udb; + mdflds_t *mdfs; + dfl_parms_t dfl; + hmc_parms_t hmc; + action_parms_t ap; + + clear_counters(); + + udb=udfld(); + 
cm3x3_assign(4*VOLUME,udb,uold); + chs_ubnd(-1); + random_mom(); + mdfs=mdflds(); + assign_alg2alg(4*VOLUME,(*mdfs).mom,mold); + dfl=dfl_parms(); + + if (dfl.Ns) + { + dfl_modes(status); + error_root(status[0]<0,1,"start_hmc [hmc.c]", + "Deflation subspace generation failed (status = %d)", + status[0]); + add2counter("modes",0,status); + } + + hmc=hmc_parms(); + nact=hmc.nact; + iact=hmc.iact; + mu=hmc.mu; + n=2; + + for (i=0;i %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + fflush(flog); + } + + start_ranlux(0,1234); + geometry(); + + error_root(((last-first)%step)!=0,1,"main [check3.c]", + "last-first is not a multiple of step"); + check_dir_root(cnfg_dir); + + nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check3.c]", + "Configuration file name is too long"); + + hmc_sanity_check(); + setup_counters(); + setup_chrono(); + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); + + if (my_rank==0) + { + printf("Configuration no %d\n",icnfg); + fflush(flog); + } + + for (i=0;i<4;i++) + { + set_hmc_parms(hmc.nact,hmc.iact,hmc.npf, + hmc.nmu,hmc.mu,hmc.nlv,tau[i]); + set_mdsteps(); + + if (i==0) + start_hmc(act0,usv[0],fsv[0]); + else + restart_hmc(usv[0],fsv[0]); + + run_mdint(); + end_hmc(act1); + + sm0[0]=0.0; + sm0[1]=0.0; + sm0[2]=0.0; + + for (j=0;j<=nact;j++) + { + sm0[0]+=act0[j]; + sm0[1]+=act1[j]; + sm0[2]+=(act1[j]-act0[j]); + } + + MPI_Reduce(sm0,sm1,3,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(sm1,3,MPI_DOUBLE,0,MPI_COMM_WORLD); + dH[i]=fabs(sm1[2]); + + if (my_rank==0) + { + if (i==0) + { + printf("start_hmc:\n"); + printf("H = %.6e\n",sm1[0]); + fflush(flog); + } + + printf("run_md:\n"); + printf("tau = %.3f\n",tau[i]); + printf("H = %.6e, |dH| = %.2e\n",sm1[1],dH[i]); + fflush(flog); + } + + print_all_avgstat(); + } + + error_chk(); + + if (my_rank==0) + { + printf("\n"); + printf("tau = %.2e, |dH| = 
%.2e\n",tau[0],dH[0]); + + for (i=1;i<4;i++) + { + printf("tau = %.2e, |dH| = %.2e, |dH[i]|/|dH[i-1]| = %.2e\n", + tau[i],dH[i],dH[i]/dH[i-1]); + } + + printf("\n"); + printf("(From one tau to the next, the scale factor s is 4^(1/3),\n" + "i.e. s^{-3,-4,-5} = {%.2e,%.2e,%.2e})\n\n", + pow(4.0,-1.0),pow(4.0,-4.0/3.0),pow(4.0,-5.0/3.0)); + fflush(flog); + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check3.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check3.in new file mode 100644 index 0000000000000000000000000000000000000000..0d44690a453e510d120ef8a6783024a8ac951e8e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check3.in @@ -0,0 +1,161 @@ + +[Configurations] +cnfg_dir /home/data/openQCD/cnfg +name 16x8x8x8b6.00id2 +first 7 +last 7 +step 2 + +[Lattice parameters] +beta 6.0 +c0 1.6667 +kappa 0.1300 0.12505 +csw 1.234 + +[Boundary conditions] +type 0 +#phi 0.12 -0.56 +#phi' 0.92 0.76 +cG 1.10 +#cG' 1.05 +cF 0.95 +#cF' 0.90 + +[HMC parameters] +actions 0 1 2 3 4 +npf 4 +mu 0.1 1.0 +nlv 3 +tau 0.5 + +[Action 1] +action ACF_TM1 +ipf 0 +im0 0 +imu 1 +isp 0 + +[Action 0] +action ACG + +[Action 2] +action ACF_TM2 +ipf 1 +im0 0 +imu 0 1 +isp 1 0 + +[Action 3] +action ACF_RAT_SDET +ipf 2 +im0 1 +irat 0 0 6 +isp 2 + +[Action 4] +action ACF_RAT +ipf 3 +im0 1 +irat 0 7 11 +isp 2 + +[Rational 0] +degree 12 +range 0.001 7.7 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF4 +nstep 1 +forces 1 2 3 + +[Level 2] +integrator LPFR +nstep 2 +forces 4 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1 +isp 3 +ncr 0 + +[Force 2] +force FRF_TM2 +isp 4 +ncr 0 + +[Force 3] +force FRF_RAT_SDET +isp 5 + +[Force 4] +force FRF_RAT +isp 5 + +[Solver 0] +solver CGNE +nmx 256 +res 1.0e-12 + +[Solver 1] +solver SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 24 +res 1.0e-12 + 
+[Solver 2] +solver MSCG +nmx 256 +res 1.0e-12 + +[Solver 3] +solver CGNE +nmx 256 +res 1.0e-10 + +[Solver 4] +solver SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 24 +res 1.0e-10 + +[Solver 5] +solver MSCG +nmx 256 +res 1.0e-10 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.3 +nsm 1 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check4.c new file mode 100644 index 0000000000000000000000000000000000000000..bb22f314e42bb1e1cf62f6c8c1fd0c3c4decbb4a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check4.c @@ -0,0 +1,149 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of add_chrono() and get_chrono(). 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "mdflds.h" +#include "update.h" +#include "global.h" + + +static void set_psi(spinor_dble **chi,spinor_dble *psi) +{ + int i; + double t; + complex_dble z; + + t=mdtime(); + assign_sd2sd(VOLUME,chi[0],psi); + + for (i=1;i<4;i++) + { + z.re=pow(t,(double)(i)); + z.im=0.0; + mulc_spinor_add_dble(VOLUME,psi,chi[i],z); + } +} + + +int main(int argc,char *argv[]) +{ + int my_rank,i; + int nop,iop,itu; + int ncr,ifr,zero; + double phi[2],phi_prime[2]; + double kappa,mu,eps,dev; + spinor_dble **chi,**wsd; + mdstep_t *s,*sm; + FILE *flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check4.log","w",stdout); + + printf("\n"); + printf("Check of add_chrono() and get_chrono()\n"); + printf("--------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + } + + mu=0.5; + zero=0; + ncr=4; + + kappa=0.1365; + set_lat_parms(5.3,1.6667,1,&kappa,1.789); + phi[0]=0.378; + phi[1]=0.012; + phi_prime[0]=0.892; + phi_prime[1]=0.912; + set_bc_parms(0,1.23,1.27,0.98,1.03,phi,phi_prime); + + set_hmc_parms(0,NULL,1,1,&mu,2,2.0); + ifr=0; + set_mdint_parms(0,OMF4,0.0,1,1,&ifr); + ifr=1; + set_mdint_parms(1,OMF4,0.2,ncr,1,&ifr); + + set_force_parms(0,FRG,0,0,0,NULL,NULL,NULL); + set_force_parms(1,FRF_TM1,0,0,0,&zero,&zero,&ncr); + + print_mdint_parms(); + print_force_parms(); + + start_ranlux(0,1234); + geometry(); + alloc_wsd(6); + chi=reserve_wsd(4); + wsd=reserve_wsd(2); + + setup_chrono(); + set_mdsteps(); + s=mdsteps(&nop,&itu); + 
sm=s+nop; + + for (i=0;i<4;i++) + random_sd(VOLUME,chi[i],1.0); + + for (;s +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "dirac.h" +#include "sap.h" +#include "dfl.h" +#include "forces.h" +#include "update.h" +#include "global.h" + + +static double random_pf(void) +{ + mdflds_t *mdfs; + + mdfs=mdflds(); + random_sd(VOLUME,(*mdfs).pf[0],1.0); + bnd_sd2zero(ALL_PTS,(*mdfs).pf[0]); + + return norm_square_dble(VOLUME,1,(*mdfs).pf[0]); +} + + +static void divide_pf(double mu,int isp,int *status) +{ + mdflds_t *mdfs; + spinor_dble *phi,*chi,**wsd; + solver_parms_t sp; + sap_parms_t sap; + + mdfs=mdflds(); + phi=(*mdfs).pf[0]; + sp=solver_parms(isp); + + if (sp.solver==CGNE) + { + tmcg(sp.nmx,sp.res,mu,phi,phi,status); + + error_root(status[0]<0,1,"divide_pf [check5.c]", + "CGNE solver failed (parameter set no %d, status = %d)", + isp,status[0]); + + wsd=reserve_wsd(1); + chi=wsd[0]; + assign_sd2sd(VOLUME,phi,chi); + Dw_dble(-mu,chi,phi); + mulg5_dble(VOLUME,phi); + release_wsd(); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME,phi); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,phi,status); + + error_root(status[0]<0,1,"divide_pf [check5.c]", + "SAP_GCR solver failed (parameter set no %d, status = %d)", + isp,status[0]); + + set_sap_parms(sap.bs,sap.isolv,sap.nmr,sap.ncy); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME,phi); + dfl_sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,phi,status); + + error_root((status[0]<0)||(status[1]<0),1, + "divide_pf [check5.c]","DFL_SAP_GCR solver failed " + "(parameter set no %d, status = (%d,%d,%d))", + isp,status[0],status[1],status[2]); + + 
set_sap_parms(sap.bs,sap.isolv,sap.nmr,sap.ncy); + } +} + + +int main(int argc,char *argv[]) +{ + int my_rank,bc,irw,isp,status[6],mnkv; + int bs[4],Ns,nmx,nkv,nmr,ncy,ninv; + double chi[2],chi_prime[2]; + double kappa,mu,res; + double mu1,mu2,act0,act1,sqn0,sqn1; + double da,ds,damx,dsmx; + solver_parms_t sp; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check5.log","w",stdout); + fin=freopen("check5.in","r",stdin); + + printf("\n"); + printf("Comparison of rwtm*() with action1()\n"); + printf("------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check6.c]", + "Syntax: check6 [-bc ]"); + } + + set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912; + chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + mnkv=0; + + for (isp=0;isp<3;isp++) + { + read_solver_parms(isp); + sp=solver_parms(isp); + + if (sp.nkv>mnkv) + mnkv=sp.nkv; + } + + if (my_rank==0) + { + find_section("SAP"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,0,1,1); + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + 
read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + fclose(fin); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); + set_hmc_parms(0,NULL,1,0,NULL,1,1.0); + + print_solver_parms(status,status+1); + print_sap_parms(0); + print_dfl_parms(0); + + start_ranlux(0,1245); + geometry(); + + mnkv=2*mnkv+2; + if (mnkv<(Ns+2)) + mnkv=Ns+2; + if (mnkv<5) + mnkv=5; + + alloc_ws(mnkv); + alloc_wsd(6); + alloc_wv(2*nkv+2); + alloc_wvd(4); + damx=0.0; + dsmx=0.0; + + for (irw=1;irw<5;irw++) + { + for (isp=0;isp<3;isp++) + { + if (isp==0) + { + set_sw_parms(1.0877); + if (irw<3) + mu1=1.0; + else + mu1=0.0; + mu2=1.23; + } + else if (isp==1) + { + set_sw_parms(0.0877); + if (irw<3) + mu1=0.1; + else + mu1=0.0; + mu2=0.123; + } + else + { + set_sw_parms(-0.0123); + if (irw<3) + mu1=0.01; + else + mu1=0.0; + mu2=0.0123; + } + + random_ud(); + chs_ubnd(-1); + + if (isp==2) + { + dfl_modes(status); + error_root(status[0]<0,1,"main [check5.c]", + "dfl_modes failed"); + } + + start_ranlux(0,8910+isp); + sqn0=random_pf(); + + if ((irw&0x1)==1) + act0=(mu2*mu2-mu1*mu1)*action1(mu1,0,isp,1,status); + else + { + if ((isp==0)||(isp==1)) + divide_pf(mu1,isp,status+1); + else + divide_pf(mu1,isp,status+3); + + act0=mu1*mu1*(mu2*mu2-mu1*mu1)*action1(mu1,0,isp,1,status); + act0+=2.0*mu2*mu2*mu2*mu2*action1(sqrt(2.0)*mu2,0,isp,1,status); + act0*=((mu2*mu2-mu1*mu1)/(2*mu2*mu2-mu1*mu1)); + } + + if (my_rank==0) + { + 
printf("Solver number %d, mu1 = %.2e, mu2 = %.2e\n",isp,mu1,mu2); + printf("action1(): "); + + if ((isp==0)||(isp==1)) + printf("status = %d\n",status[0]); + else if (isp==2) + printf("status = (%d,%d,%d)\n", + status[0],status[1],status[2]); + } + + start_ranlux(0,8910+isp); + + if ((irw&0x1)==1) + act1=rwtm1(mu1,mu2,isp,&sqn1,status); + else + act1=rwtm2(mu1,mu2,isp,&sqn1,status); + + da=fabs(1.0-act1/act0); + ds=fabs(1.0-sqn1/sqn0); + + if (da>damx) + damx=da; + if (ds>dsmx) + dsmx=ds; + + if (my_rank==0) + { + if ((irw&0x1)==1) + { + printf("rwtm1(): "); + + if ((isp==0)||(isp==1)) + printf("status = %d\n",status[0]); + else if (isp==2) + printf("status = (%d,%d,%d)\n", + status[0],status[1],status[2]); + } + else + { + printf("rwtm2(): "); + + if ((isp==0)||(isp==1)) + printf("status = %d,%d\n",status[0],status[1]); + else if (isp==2) + printf("status = (%d,%d,%d),(%d,%d,%d)\n", + status[0],status[1],status[2],status[3], + status[4],status[5]); + } + + printf("|1-act1/act0| = %.1e, |1-sqn1/sqn0| = %.1e\n\n",da,ds); + } + + error_chk(); + } + } + + if (my_rank==0) + { + printf("max|1-act1/act0| = %.1e, max|1-sqn1/sqn0| = %.1e\n\n",damx,dsmx); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check5.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check5.in new file mode 100644 index 0000000000000000000000000000000000000000..ff9c44f39aa004b13ba19ba377713fad35faaa11 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check5.in @@ -0,0 +1,43 @@ + +[Solver 0] +solver CGNE +nmx 256 +res 1.0e-12 + +[Solver 1] +solver SAP_GCR +nmx 128 +nkv 16 +isolv 0 +nmr 4 +ncy 3 +res 1.0e-12 + +[Solver 2] +solver DFL_SAP_GCR +nmx 64 +nkv 16 +isolv 1 +nmr 4 +ncy 5 +res 1.0e-12 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + 
+[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check6.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check6.c new file mode 100644 index 0000000000000000000000000000000000000000..217a79f1a9d6a93792d028854f1aeac9f1da259c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check6.c @@ -0,0 +1,370 @@ + +/******************************************************************************* +* +* File check6.c +* +* Copyright (C) 2012-2014 Stefan Schaefer, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Comparison of rwtm*eo() with action4(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "sflds.h" +#include "linalg.h" +#include "dirac.h" +#include "sap.h" +#include "dfl.h" +#include "forces.h" +#include "update.h" +#include "global.h" + + +static double random_pf(void) +{ + mdflds_t *mdfs; + + mdfs=mdflds(); + random_sd(VOLUME/2,(*mdfs).pf[0],1.0); + + set_sd2zero(VOLUME/2,(*mdfs).pf[0]+VOLUME/2); + bnd_sd2zero(ALL_PTS,(*mdfs).pf[0]); + + return norm_square_dble(VOLUME/2,1,(*mdfs).pf[0]); +} + + +static void divide_pf(double mu,int isp,int *status) +{ + mdflds_t *mdfs; + spinor_dble *phi,*chi,**wsd; + solver_parms_t sp; + sap_parms_t sap; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg!=1) + set_tm_parms(1); + + mdfs=mdflds(); + phi=(*mdfs).pf[0]; + sp=solver_parms(isp); + + if (sp.solver==CGNE) + { + tmcgeo(sp.nmx,sp.res,mu,phi,phi,status); + + error_root(status[0]<0,1,"divide_pf [check6.c]", + "CGNE solver failed (parameter set no %d, status = %d)", + 
isp,status[0]); + + wsd=reserve_wsd(1); + chi=wsd[0]; + assign_sd2sd(VOLUME/2,phi,chi); + Dwhat_dble(-mu,chi,phi); + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+VOLUME/2); + release_wsd(); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+VOLUME/2); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,phi,status); + set_sd2zero(VOLUME/2,phi+VOLUME/2); + + error_root(status[0]<0,1,"divide_pf [check6.c]", + "SAP_GCR solver failed (parameter set no %d, status = %d)", + isp,status[0]); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+VOLUME/2); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu,phi,phi,status); + set_sd2zero(VOLUME/2,phi+VOLUME/2); + + error_root((status[0]<0)||(status[1]<0),1, + "divide_pf [check6.c]","DFL_SAP_GCR solver failed " + "(parameter set no %d, status = (%d,%d,%d))", + isp,status[0],status[1],status[2]); + } +} + + +/* Test driver: compares rwtm1eo()/rwtm2eo() with action4() for three solver parameter sets. Process 0 redirects stdout to check6.log and stdin to check6.in. */ +int main(int argc,char *argv[]) +{ + int my_rank,bc,irw,isp,status[6],mnkv; + int bs[4],Ns,nmx,nkv,nmr,ncy,ninv; + double chi[2],chi_prime[2]; + double kappa,mu,res; + double mu1,mu2,act0,act1,sqn0,sqn1; + double da,ds,damx,dsmx; + solver_parms_t sp; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check6.log","w",stdout); + fin=freopen("check6.in","r",stdin); + + printf("\n"); + printf("Comparison of rwtm*eo() with action4()\n"); + printf("--------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + bc=find_opt(argc,argv,"-bc"); + + /* the argument following -bc is parsed as an integer; the usage text names it so that a bare "-bc" is reported intelligibly */ + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check6.c]", + "Syntax: check6 [-bc <type>]"); + } + + 
set_lat_parms(5.5,1.0,0,NULL,1.782); + print_lat_parms(); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + chi[0]=0.123; + chi[1]=-0.534; + chi_prime[0]=0.912; + chi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,0.953,1.203,chi,chi_prime); + print_bc_parms(); + + mnkv=0; + + for (isp=0;isp<3;isp++) + { + read_solver_parms(isp); + sp=solver_parms(isp); + + if (sp.nkv>mnkv) + mnkv=sp.nkv; + } + + if (my_rank==0) + { + find_section("SAP"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,0,1,1); + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + fclose(fin); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); + set_hmc_parms(0,NULL,1,0,NULL,1,1.0); + + print_solver_parms(status,status+1); + print_sap_parms(0); + print_dfl_parms(0); + + start_ranlux(0,1245); + geometry(); + + mnkv=2*mnkv+2; + if (mnkv<(Ns+2)) + mnkv=Ns+2; + if (mnkv<5) + mnkv=5; + + alloc_ws(mnkv); + alloc_wsd(6); + alloc_wv(2*nkv+2); + 
alloc_wvd(4); + damx=0.0; + dsmx=0.0; + + for (irw=1;irw<5;irw++) + { + for (isp=0;isp<3;isp++) + { + if (isp==0) + { + set_sw_parms(1.0877); + if (irw<3) + mu1=1.0; + else + mu1=0.0; + mu2=1.23; + } + else if (isp==1) + { + set_sw_parms(0.0877); + if (irw<3) + mu1=0.1; + else + mu1=0.0; + mu2=0.123; + } + else + { + set_sw_parms(-0.0123); + if (irw<3) + mu1=0.01; + else + mu1=0.0; + mu2=0.0123; + } + + random_ud(); + chs_ubnd(-1); + + if (isp==2) + { + dfl_modes(status); + error_root(status[0]<0,1,"main [check6.c]", + "dfl_modes failed"); + } + + start_ranlux(0,8910+isp); + sqn0=random_pf(); + + if ((irw&0x1)==1) + act0=(mu2*mu2-mu1*mu1)*action4(mu1,0,0,isp,1,status); + else + { + if ((isp==0)||(isp==1)) + divide_pf(mu1,isp,status+1); + else + divide_pf(mu1,isp,status+3); + + act0=mu1*mu1*(mu2*mu2-mu1*mu1)*action4(mu1,0,0,isp,1,status); + act0+=2.0*mu2*mu2*mu2*mu2*action4(sqrt(2.0)*mu2,0,0,isp,1,status); + act0*=((mu2*mu2-mu1*mu1)/(2*mu2*mu2-mu1*mu1)); + } + + if (my_rank==0) + { + printf("Solver number %d, mu1 = %.2e, mu2 = %.2e\n",isp,mu1,mu2); + printf("action4(): "); + + if ((isp==0)||(isp==1)) + printf("status = %d\n",status[0]); + else if (isp==2) + printf("status = (%d,%d,%d)\n", + status[0],status[1],status[2]); + } + + start_ranlux(0,8910+isp); + + if ((irw&0x1)==1) + act1=rwtm1eo(mu1,mu2,isp,&sqn1,status); + else + act1=rwtm2eo(mu1,mu2,isp,&sqn1,status); + + da=fabs(1.0-act1/act0); + ds=fabs(1.0-sqn1/sqn0); + + if (da>damx) + damx=da; + if (ds>dsmx) + dsmx=ds; + + if (my_rank==0) + { + if ((irw&0x1)==1) + { + printf("rwtm1eo(): "); + + if ((isp==0)||(isp==1)) + printf("status = %d\n",status[0]); + else if (isp==2) + printf("status = (%d,%d,%d)\n", + status[0],status[1],status[2]); + } + else + { + printf("rwtm2eo(): "); + + if ((isp==0)||(isp==1)) + printf("status = %d,%d\n",status[0],status[1]); + else if (isp==2) + printf("status = (%d,%d,%d),(%d,%d,%d)\n", + status[0],status[1],status[2],status[3], + status[4],status[5]); + } + + printf("|1-act1/act0| 
= %.1e, |1-sqn1/sqn0| = %.1e\n\n",da,ds); + } + + error_chk(); + } + } + + if (my_rank==0) + { + printf("max|1-act1/act0| = %.1e, max|1-sqn1/sqn0| = %.1e\n\n",damx,dsmx); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check6.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check6.in new file mode 100644 index 0000000000000000000000000000000000000000..e7981ba77309d6d191c28b2c21d276e222934314 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/update/check6.in @@ -0,0 +1,42 @@ + +[Solver 0] +solver CGNE +nmx 256 +res 1.0e-12 + +[Solver 1] +solver SAP_GCR +nmx 128 +nkv 16 +isolv 0 +nmr 4 +ncy 3 +res 1.0e-12 + +[Solver 2] +solver DFL_SAP_GCR +nmx 64 +nkv 16 +isolv 1 +nmr 4 +ncy 5 +res 1.0e-12 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 8 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 16 +nmx 64 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..d0738d2892913e8953f0f62c702b0c40a634f19d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/INDEX @@ -0,0 +1,5 @@ + +Utility programs + +check1 Copying of files + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..ba45c0c5f9a62140f5c10d22cc1b9065941d172b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/Makefile @@ -0,0 +1,121 @@ +################################################################################ +# +# 
Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 + +FLAGS = flags lat_parms + +LATTICE = geometry + +RANDOM = ranlux ranlxs ranlxd + +UTILS = endian mutils utils + +MODULES = $(FLAGS) $(LATTICE) $(RANDOM) $(UTILS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/random:$(MDIR)/utils + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + 
+mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog \ + *.log~ *.dat *.dat~ $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..708515b3d6dbdfb26164df1d6e0e4c37dbe45a8a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/utils/check1.c @@ -0,0 +1,91 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2005, 2008 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Copying of files. 
After running this program, one can verify that all +* bytes have been copied correctly using the diff utility +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "utils.h" +#include "archive.h" +#include "global.h" + +#define NRAN 10000 + +static float r[NRAN]; + + +/* Fills r[] with NRAN random numbers; process 0 writes them to check1.dat (binary), logs the first 10 to check1.log, and copies both files with copy_file() to check1.log~ and check1.dat~. */ +int main(int argc,char *argv[]) +{ + int my_rank,n,err1,err2,iw; + FILE *flog=NULL,*fdat=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + + printf("\n"); + printf("Copying of .log and .dat files from process 0\n"); + printf("---------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + } + + start_ranlux(0,1234); + ranlxs(r,NRAN); + + if (my_rank==0) + { + printf("Write 10 random numbers to check1.log (in asci format)\n"); + printf("and %d numbers to check1.dat (in binary format)\n\n",NRAN); + + fdat=fopen("check1.dat","wb"); + /* fopen() returns NULL on failure; report it instead of handing a null stream to fwrite()/fclose() */ + error_root(fdat==NULL,1,"main [check1.c]","Unable to open check1.dat"); + iw=fwrite(&r[0],sizeof(float),NRAN,fdat); + error_root(iw!=NRAN,1,"main [check1.c]","Incorrect write count"); + fclose(fdat); + + for (n=0;n<10;n++) + printf("r[%d] = %.6e\n",n,r[n]); + + printf("\n"); + printf("Copy the files to check1.log~ and check1.dat~ respectively.\n"); + printf("The copying may then be verified using the diff utility\n\n"); + /* the log is closed before it is copied and reopened in append mode afterwards (presumably so the copy sees the complete file) */ + fclose(flog); + + err1=copy_file("check1.log","check1.log~"); + err2=copy_file("check1.dat","check1.dat~"); + + flog=freopen("check1.log","a",stdout); + + if ((err1!=0)||(err2!=0)) + printf("Copying failed: err1 = %d, err2 = %d\n",err1,err2); + } + + error_chk(); + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..f726c1f53ede4ea9d83143106ae471a3fa4371db --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/INDEX @@ -0,0 +1,19 @@ + +Basic utility programs for complex fields + +check1 Allocation and initialization of the global vector fields. + +check2 Check of the programs in the module vinit.c. + +check3 Check of the communication programs cpv_int_bnd() and + cpv_ext_bnd(). + +check4 Check of the communication programs cpvd_int_bnd() and + cpvd_ext_bnd(). + +The programs check3 and check4 accept the option -bc that allows the +type of boundary condition to be chosen at runtime. When the option is not +set, open boundary conditions are assumed. + +The option may be set but has no effect in the case of check1 and check2. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..b51c6c4e5f10f399b42efccacf0135e365a13584 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/Makefile @@ -0,0 +1,151 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + 
+# main programs and modules to be compiled + +MAIN = check1 check2 check3 check4 + +FLAGS = flags lat_parms sap_parms dfl_parms + +LATTICE = bcnds ftidx uidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble + +LINSOLV = fgcr + +RANDOM = ranlux ranlxs ranlxd gauss + +UFLDS = plaq_sum shift uflds udcom + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +UTILS = endian mutils utils wspace + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +TCHARGE = ftcom ftensor + +SW_TERM = pauli pauli_dble swflds sw_term + +DIRAC = Dw_dble Dw Dw_bnd + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +SAP = blk_solv sap_com sap sap_gcr + +ARCHIVE = archive + +DFL = dfl_geometry + +VFLDS = vflds vinit vcom vdcom + +MODULES = $(FLAGS) $(LATTICE) $(LINALG) $(LINSOLV) $(RANDOM) $(UFLDS) \ + $(SU3FCTS) $(UTILS) $(SFLDS) $(TCHARGE) $(SW_TERM) $(DIRAC) \ + $(BLOCK) $(SAP) $(ARCHIVE) $(DFL) $(VFLDS) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/linalg:$(MDIR)/linsolv:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/su3fcts:$(MDIR)/utils:\ + $(MDIR)/sflds:$(MDIR)/tcharge:$(MDIR)/sw_term:$(MDIR)/dirac:\ + $(MDIR)/block:$(MDIR)/sap:$(MDIR)/archive:$(MDIR)/dfl:\ + $(MDIR)/vflds + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to 
compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..c802aadb61cf807b8719678f699bbffe4e9249a6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check1.c @@ -0,0 +1,165 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2007, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Allocation and initialization of the global vector fields +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "vflds.h" +#include "global.h" + +#define NFIELDS 7 + + +int main(int argc,char *argv[]) +{ + int my_rank,ie,k,ix; + int bs[4],Ns; + int nb,nbb,nv,nvec; + complex **wv; + complex_dble **wvd; + dfl_parms_t dfl; + FILE *fin=NULL,*flog=NULL; + + 
MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check1.log","w",stdout); + fin=freopen("check1.in","r",stdin); + + printf("\n"); + printf("Allocation and initialization of the global vector fields\n"); + printf("---------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + read_line("Ns","%d",&Ns); + fclose(fin); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("Ns = %d\n\n",Ns); + fflush(flog); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + + start_ranlux(0,123456); + geometry(); + dfl=set_dfl_parms(bs,Ns); + + error((bs[0]!=dfl.bs[0])||(bs[1]!=dfl.bs[1])|| + (bs[2]!=dfl.bs[2])||(bs[3]!=dfl.bs[3])||(Ns!=dfl.Ns),1, + "main [check1.c]","Parameter bs[4] or Ns are incorrectly set"); + + alloc_wv(NFIELDS); + alloc_wvd(NFIELDS); + wv=reserve_wv(NFIELDS); + wvd=reserve_wvd(NFIELDS); + + nb=VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + nbb=(FACE0/(bs[1]*bs[2]*bs[3])+FACE1/(bs[0]*bs[2]*bs[3])+ + FACE2/(bs[0]*bs[1]*bs[3])+FACE3/(bs[0]*bs[1]*bs[2])); + nv=Ns*nb; + nvec=Ns*(nb+nbb); + ie=0; + + for (k=1;k +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "linalg.h" +#include "vflds.h" +#include "global.h" + +#define NFLDS 5 + +static float sig[NFLDS]; +static double sigd[NFLDS]; + + +int main(int argc,char *argv[]) +{ + int my_rank,ie,k,ix; + int bs[4],Ns,nb,nv; + double var,var_all,d,dmax; + complex z; + complex_dble zd; + complex **wv; + complex_dble **wvd; + FILE *fin=NULL,*flog=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + 
flog=freopen("check2.log","w",stdout); + fin=freopen("check1.in","r",stdin); + + printf("\n"); + printf("Check of the programs in the module vinit\n"); + printf("-----------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("bs","%d %d %d %d",&bs[0],&bs[1],&bs[2],&bs[3]); + read_line("Ns","%d",&Ns); + fclose(fin); + + printf("bs = %d %d %d %d\n",bs[0],bs[1],bs[2],bs[3]); + printf("Ns = %d\n\n",Ns); + fflush(flog); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + + start_ranlux(0,12345); + geometry(); + set_dfl_parms(bs,Ns); + + alloc_wv(2*NFLDS); + alloc_wvd(2*NFLDS); + wv=reserve_wv(2*NFLDS); + wvd=reserve_wvd(2*NFLDS); + + nb=VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + nv=Ns*nb; + z.im=0.0f; + zd.im=0.0; + ie=0; + + if (my_rank==0) + { + printf("Choose random single-precision fields\n"); + ranlxs(sig,NFLDS); + } + + MPI_Bcast(sig,NFLDS,MPI_FLOAT,0,MPI_COMM_WORLD); + + for (k=0;k = %.4e (sigma^2 = %.4e)\n", + k,var_all,sig[k]*sig[k]); + } + } + + if (my_rank==0) + { + printf("\n"); + printf("Choose random double-precision fields\n"); + ranlxd(sigd,NFLDS); + } + + MPI_Bcast(sigd,NFLDS,MPI_DOUBLE,0,MPI_COMM_WORLD); + + for (k=0;k = %.4e (sigma^2 = %.4e)\n", + k,var_all,sigd[k]*sigd[k]); + } + } + + for (k=0;kdmax) + dmax=d; + } + + if (my_rank==0) + { + printf("\n"); + printf("Relative deviations (should be less than 1.0e-7 or so):\n"); + printf("add_v2vd(): %.1e\n",sqrt(dmax)); + } + + dmax=0.0; + + for (k=0;kdmax) + dmax=d; + } + + if (my_rank==0) + { + printf("diff_vd2v(): %.1e\n\n",sqrt(dmax)); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check3.c 
new file mode 100644 index 0000000000000000000000000000000000000000..b252a15a9f4f127a3d57f90901cf24ee14e1922f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check3.c @@ -0,0 +1,341 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2007, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the communication programs cpv_int_bnd() and cpv_ext_bnd(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "linalg.h" +#include "dfl.h" +#include "vflds.h" +#include "global.h" + +static int bs[4],Ns,nv,nvec; +static int nb,nbb,*nbbe,*nbbo,*obbe,*obbo; +static int (*inn)[8],*ipp; + + +static void set_field(complex *v) +{ + int n[4],no[4],c[4]; + int i0,i1,i2,i3,ibe,ibo; + + n[0]=L0/bs[0]; + n[1]=L1/bs[1]; + n[2]=L2/bs[2]; + n[3]=L3/bs[3]; + + no[0]=cpr[0]*n[0]; + no[1]=cpr[1]*n[1]; + no[2]=cpr[2]*n[2]; + no[3]=cpr[3]*n[3]; + + set_v2zero(nv,v); + ibe=0; + ibo=(n[0]*n[1]*n[2]*n[3])/2; + + for (i0=0;i01)|| + ((ifc==0)&&(cpr[0]!=0))|| + ((ifc==1)&&(cpr[0]!=(NPROC0-1)))|| + (bc==3)) + { + for (ibb=obbe[ifc];ibb<(obbe[ifc]+nbbe[ifc]);ibb++) + { + ib=ipp[ibb]; + + for (mu=0;mu<4;mu++) + { + c[mu]=v[nv+ibb*Ns+mu].re-v[ib*Ns+mu].re; + + if (mu==(ifc/2)) + { + if ((ifc&0x1)==0x0) + { + c[mu]+=1.0f; + + if (cpr[mu]==0) + c[mu]-=n[mu]; + } + else + { + c[mu]-=1.0f; + + if (cpr[mu]==(np[mu]-1)) + c[mu]+=n[mu]; + } + } + } + + if ((c[0]!=0.0f)||(c[1]!=0.0f)||(c[2]!=0.0f)||(c[3]!=0.0f)) + ie=1; + } + } + else + { + for (ibb=obbe[ifc];ibb<(obbe[ifc]+nbbe[ifc]);ibb++) + { + for (i=0;i1)|| + ((ifc==0)&&(cpr[0]!=0))|| + ((ifc==1)&&(cpr[0]!=(NPROC0-1)))|| + (bc==3)) + { + for 
(ibb=obbo[ifc];ibb<(obbo[ifc]+nbbo[ifc]);ibb++) + { + ib=ipp[ibb]; + vv=v+ib*Ns; + ww=w+ib*Ns; + vm=vv+Ns; + + for (;vv]"); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + Ns=6; + set_dfl_parms(bs,Ns); + + dfl_grid=dfl_geometry(); + nb=dfl_grid.nb; + nbb=dfl_grid.nbb; + nbbe=dfl_grid.nbbe; + nbbo=dfl_grid.nbbo; + obbe=dfl_grid.obbe; + obbo=dfl_grid.obbo; + inn=dfl_grid.inn; + ipp=dfl_grid.ipp; + + alloc_wv(4); + wv=reserve_wv(4); + + nv=Ns*nb; + nvec=Ns*(nb+nbb/2); + z.re=-1.0f; + z.im=0.0f; + + for (i=0;i<2;i++) + { + random_v(nvec,wv[i],1.0f); + set_field(wv[i]); + assign_v2v(nv,wv[i],wv[i+1]); + cpv_int_bnd(wv[i]); + mulc_vadd(nv,wv[i+1],wv[i],z); + d=vnorm_square(nv,1,wv[i+1]); + + error_root(d!=0.0f,1,"main [check3.c]", + "cpv_int_bnd() modifies the input field on the local grid"); + + ie=chk_ext_bnd(wv[i]); + error(ie==1,1,"main [check3.c]", + "Boundary values are incorrectly mapped by cpv_int_bnd()"); + error(ie==2,1,"main [check3.c]", + "Boundary values are not set to zero where they should"); + + random_iv(nvec,wv[i]); + cpv_int_bnd(wv[i]); + assign_v2v(nvec,wv[i],wv[i+1]); + cpv_ext_bnd(wv[i]); + mulc_vadd(nvec-nv,wv[i]+nv,wv[i+1]+nv,z); + d=vnorm_square(nvec-nv,1,wv[i]+nv); + + error_root(d!=0.0f,1,"main [check3.c]", + "cpv_ext_bnd() modifies the input field on the boundary"); + + ie=chk_int_bnd(wv[i],wv[i+1]); + error(ie==1,1,"main [check3.c]", + "Boundary values are incorrectly mapped by cpv_ext_bnd()"); + } + + error_chk(); + + if (my_rank==0) + { + printf("No errors detected\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check4.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check4.c new file mode 
100644 index 0000000000000000000000000000000000000000..236aecd00baf208d11e50c29b1000343905bf2be --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/vflds/check4.c @@ -0,0 +1,342 @@ + +/******************************************************************************* +* +* File check4.c +* +* Copyright (C) 2007, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Check of the communication programs cpvd_int_bnd() and cpvd_ext_bnd(). +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "linalg.h" +#include "dfl.h" +#include "vflds.h" +#include "global.h" + +static int bs[4],Ns,nv,nvec; +static int nb,nbb,*nbbe,*nbbo,*obbe,*obbo; +static int (*inn)[8],*ipp; + + +static void set_field(complex_dble *v) +{ + int n[4],no[4],c[4]; + int i0,i1,i2,i3,ibe,ibo; + + n[0]=L0/bs[0]; + n[1]=L1/bs[1]; + n[2]=L2/bs[2]; + n[3]=L3/bs[3]; + + no[0]=cpr[0]*n[0]; + no[1]=cpr[1]*n[1]; + no[2]=cpr[2]*n[2]; + no[3]=cpr[3]*n[3]; + + set_vd2zero(nv,v); + ibe=0; + ibo=(n[0]*n[1]*n[2]*n[3])/2; + + for (i0=0;i01)|| + ((ifc==0)&&(cpr[0]!=0))|| + ((ifc==1)&&(cpr[0]!=(NPROC0-1)))|| + (bc==3)) + { + for (ibb=obbe[ifc];ibb<(obbe[ifc]+nbbe[ifc]);ibb++) + { + ib=ipp[ibb]; + + for (mu=0;mu<4;mu++) + { + c[mu]=v[nv+ibb*Ns+mu].re-v[ib*Ns+mu].re; + + if (mu==(ifc/2)) + { + if ((ifc&0x1)==0x0) + { + c[mu]+=1.0; + + if (cpr[mu]==0) + c[mu]-=n[mu]; + } + else + { + c[mu]-=1.0; + + if (cpr[mu]==(np[mu]-1)) + c[mu]+=n[mu]; + } + } + } + + if ((c[0]!=0.0)||(c[1]!=0.0)||(c[2]!=0.0)||(c[3]!=0.0)) + ie=1; + } + } + else + { + for (ibb=obbe[ifc];ibb<(obbe[ifc]+nbbe[ifc]);ibb++) + { + for (i=0;i1)|| + ((ifc==0)&&(cpr[0]!=0))|| + ((ifc==1)&&(cpr[0]!=(NPROC0-1))) + ||(bc==3)) + { + for 
(ibb=obbo[ifc];ibb<(obbo[ifc]+nbbo[ifc]);ibb++) + { + ib=ipp[ibb]; + vv=v+ib*Ns; + ww=w+ib*Ns; + vm=vv+Ns; + + for (;vv]"); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,123456); + geometry(); + Ns=4; + set_dfl_parms(bs,Ns); + + dfl_grid=dfl_geometry(); + nb=dfl_grid.nb; + nbb=dfl_grid.nbb; + nbbe=dfl_grid.nbbe; + nbbo=dfl_grid.nbbo; + obbe=dfl_grid.obbe; + obbo=dfl_grid.obbo; + inn=dfl_grid.inn; + ipp=dfl_grid.ipp; + + alloc_wvd(4); + wv=reserve_wvd(4); + + nv=Ns*nb; + nvec=Ns*(nb+nbb/2); + z.re=-1.0; + z.im=0.0; + + for (i=0;i<2;i++) + { + random_vd(nvec,wv[i],1.0); + set_field(wv[i]); + assign_vd2vd(nv,wv[i],wv[i+1]); + cpvd_int_bnd(wv[i]); + mulc_vadd_dble(nv,wv[i+1],wv[i],z); + d=vnorm_square_dble(nv,1,wv[i+1]); + + error_root(d!=0.0,1,"main [check4.c]", + "cpvd_int_bnd() modifies the input field on the local grid"); + + ie=chk_ext_bnd(wv[i]); + error(ie==1,1,"main [check4.c]", + "Boundary values are incorrectly mapped by cpvd_int_bnd()"); + error(ie==2,1,"main [check3.c]", + "Boundary values are not set to zero where they should"); + + random_iv(nvec,wv[i]); + cpvd_int_bnd(wv[i]); + assign_vd2vd(nvec,wv[i],wv[i+1]); + cpvd_ext_bnd(wv[i]); + mulc_vadd_dble(nvec-nv,wv[i]+nv,wv[i+1]+nv,z); + d=vnorm_square_dble(nvec-nv,1,wv[i]+nv); + + error_root(d!=0.0,1,"main [check4.c]", + "cpvd_ext_bnd() modifies the input field on the boundary"); + + ie=chk_int_bnd(wv[i],wv[i+1]); + error(ie==1,1,"main [check4.c]", + "Boundary values are incorrectly mapped by cpvd_ext_bnd()"); + } + + error_chk(); + + if (my_rank==0) + { + printf("No errors detected\n\n"); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/INDEX 
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..3a15a74deee7765c78faba902b6555709861b920 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/INDEX @@ -0,0 +1,14 @@ + +Integration of the Wilson flow + +check1 Basic checks on the implementation of the Wilson flow. + +check2 Gauge covariance of the Wilson flow. + +check3 Convergence of the numerical integration. + +The programs check1 and check2 accept the option -bc that allows the +type of boundary condition to be chosen (open boundary conditions are assumed +if the option is not set). In the case of check3, the boundary conditions are +set through the input parameter file. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..5aa4b4baa64aabc0d4499f62c05f141575ec2b82 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/Makefile @@ -0,0 +1,143 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines +# +# Version valid for Linux machines with MPICH +# +# "make" compiles and links the specified main programs and modules, +# using the specified libraries (if any), and produces the executables +# +# "make clean" removes all files generated by "make" +# +# Dependencies on included files are automatically taken care of +# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = check1 check2 check3 + +FLAGS = flags lat_parms hmc_parms dfl_parms + +LATTICE = bcnds uidx ftidx geometry + +ARCHIVE = archive + +LINALG = liealg cmatrix_dble + 
+RANDOM = ranlux ranlxs ranlxd gauss random_su3 + +UFLDS = plaq_sum shift uflds udcom bstap + +MDFLDS = mdflds fcom + +SFLDS = sflds + +SU3FCTS = chexp su3prod su3ren cm3x3 + +UTILS = endian mutils utils wspace + +FORCES = force0 + +TCHARGE = ftcom ftensor tcharge + +WFLOW = wflow + +MODULES = $(FLAGS) $(LATTICE) $(ARCHIVE) $(LINALG) $(RANDOM) $(UFLDS) \ + $(MDFLDS) $(SFLDS) $(SU3FCTS) $(UTILS) $(FORCES) $(TCHARGE) \ + $(WFLOW) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/mdflds:$(MDIR)/sflds:\ + $(MDIR)/su3fcts:$(MDIR)/utils:$(MDIR)/forces:$(MDIR)/tcharge:\ + $(MDIR)/wflow + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix 
.d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check1.c new file mode 100644 index 0000000000000000000000000000000000000000..527d6c0cbee80c59b5d5b123b9d78ba1cac62363 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check1.c @@ -0,0 +1,552 @@ + +/******************************************************************************* +* +* File check1.c +* +* Copyright (C) 2010-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Basic checks on the implementation of the Wilson flow. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "linalg.h" +#include "forces.h" +#include "wflow.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static const su3_alg_dble fr0={0.0}; +static su3_alg_dble XX ALIGNED16; +static su3_dble mm,uu,vv ALIGNED16; + + +static double cmp_ud(su3_dble *u,su3_dble *v) +{ + int i; + double r[18],dev,dmax; + + r[ 0]=(*u).c11.re-(*v).c11.re; + r[ 1]=(*u).c11.im-(*v).c11.im; + r[ 2]=(*u).c12.re-(*v).c12.re; + r[ 3]=(*u).c12.im-(*v).c12.im; + r[ 4]=(*u).c13.re-(*v).c13.re; + r[ 5]=(*u).c13.im-(*v).c13.im; + + r[ 6]=(*u).c21.re-(*v).c21.re; + r[ 7]=(*u).c21.im-(*v).c21.im; + r[ 8]=(*u).c22.re-(*v).c22.re; + r[ 9]=(*u).c22.im-(*v).c22.im; + 
r[10]=(*u).c23.re-(*v).c23.re; + r[11]=(*u).c23.im-(*v).c23.im; + + r[12]=(*u).c31.re-(*v).c31.re; + r[13]=(*u).c31.im-(*v).c31.im; + r[14]=(*u).c32.re-(*v).c32.re; + r[15]=(*u).c32.im-(*v).c32.im; + r[16]=(*u).c33.re-(*v).c33.re; + r[17]=(*u).c33.im-(*v).c33.im; + + dmax=0.0; + + for (i=0;i<18;i+=2) + { + dev=r[i]*r[i]+r[i+1]*r[i+1]; + if (dev>dmax) + dmax=dev; + } + + return sqrt(dmax); +} + + +static double max_dev_ud(su3_dble *v) +{ + double d,dmax; + su3_dble *u,*um; + + u=udfld(); + um=u+4*VOLUME; + dmax=0.0; + + for (;udmax) + dmax=d; + + v+=1; + } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return dmax; +} + + +static double cmp_fd(su3_alg_dble *f,su3_alg_dble *g) +{ + int i; + double r[8],dev,dmax; + + r[0]=(*f).c1-(*g).c1; + r[1]=(*f).c2-(*g).c2; + r[2]=(*f).c3-(*g).c3; + r[3]=(*f).c4-(*g).c4; + r[4]=(*f).c5-(*g).c5; + r[5]=(*f).c6-(*g).c6; + r[6]=(*f).c7-(*g).c7; + r[7]=(*f).c8-(*g).c8; + + dmax=0.0; + + for (i=0;i<8;i++) + { + dev=fabs(r[i]); + if (dev>dmax) + dmax=dev; + } + + return dmax; +} + + +static double max_dev_frc(su3_alg_dble *g) +{ + double d,dmax; + su3_alg_dble *f,*fm; + mdflds_t *mdfs; + + mdfs=mdflds(); + f=(*mdfs).frc; + fm=f+4*VOLUME; + dmax=0.0; + + for (;fdmax) + dmax=d; + + g+=1; + } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return dmax; +} + + +static int is_zero(su3_alg_dble *X) +{ + int ie; + + ie=((*X).c1==0.0); + ie&=((*X).c2==0.0); + ie&=((*X).c3==0.0); + ie&=((*X).c4==0.0); + ie&=((*X).c5==0.0); + ie&=((*X).c6==0.0); + ie&=((*X).c7==0.0); + ie&=((*X).c8==0.0); + + return ie; +} + + +static int check_bnd_fld(su3_alg_dble *fld) +{ + int bc,npts,*pts,*ptm; + int ix,t,ifc,ie; + su3_alg_dble *f; + + bc=bc_type(); + pts=bnd_pts(&npts); + ptm=pts+npts; + pts+=(npts/2); + ie=0; + + for (;ptsdmax) + dmax=d; + } + } + } + } 
+ } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return dmax; +} + + +static void scale_bnd_frc(su3_alg_dble *frc) +{ + int bc,ifc,npts,*pts,*ptm; + su3_alg_dble *fr; + + bc=bc_type(); + + if ((bc==0)||(bc==2)) + { + pts=bnd_pts(&npts); + ptm=pts+npts; + pts+=(npts/2); + + for (;pts]"); + } + + MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&eps,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + + set_lat_parms(6.0,1.0,0,NULL,1.0); + print_lat_parms(); + + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + alloc_wud(1); + alloc_wfd(2); + mdfs=mdflds(); + usv=reserve_wud(1); + fsv=reserve_wfd(1); + udb=udfld(); + + if (bc==0) + nplaq=(double)(6*N0-6)*(double)(N1*N2*N3); + else + nplaq=(double)(6*N0)*(double)(N1*N2*N3); + + random_ud(); + act0=action0(1); + act1=3.0*nplaq-plaq_wsum_dble(1); + + plaq_frc(); + ie=check_bnd_fld((*mdfs).frc); + error(ie!=0,1,"main [check1.c]", + "Force vanishes on an incorrect subset of links"); + assign_alg2alg(4*VOLUME,(*mdfs).frc,fsv[0]); + force0(1.0); + ie=check_bnd_fld((*mdfs).frc); + error(ie!=0,1,"main [check1.c]", + "Force vanishes on an incorrect subset of links"); + dev0=max_dev_frc(fsv[0]); + + if (my_rank==0) + { + printf("Random gauge field:\n"); + printf("Action (action0) = %.15e\n",act0); + printf("Action (plaq_wsum) = %.15e\n",2.0*act1); + printf("Deviation of force = %.1e\n\n",dev0); + } + + random_ud(); + cm3x3_assign(4*VOLUME,udb,usv[0]); + plaq_frc(); + assign_alg2alg(4*VOLUME,(*mdfs).frc,fsv[0]); + dev0=chkfrc(); + fwd_euler(1,eps); + frc=fsv[0]; + scale_bnd_frc(frc); + u=udb; + um=u+4*VOLUME; + + for (;uact0)&&(eps>=0.0))||((act1 +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "su3fcts.h" +#include 
"random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "update.h" +#include "wflow.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int bc,nfc[8],ofs[8]; +static const su3_dble ud0={{0.0}}; +static su3_dble *g,*gbuf; +static su3_dble wd ALIGNED16; + + +static void pack_gbuf(void) +{ + int ifc,ib,ix; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+nfc[0]; + ofs[2]=ofs[1]+nfc[1]; + ofs[3]=ofs[2]+nfc[2]; + ofs[4]=ofs[3]+nfc[3]; + ofs[5]=ofs[4]+nfc[4]; + ofs[6]=ofs[5]+nfc[5]; + ofs[7]=ofs[6]+nfc[6]; + + for (ifc=0;ifc<8;ifc++) + { + for (ib=0;ib0) + { + tag=mpi_tag(); + saddr=npr[ifc^0x1]; + raddr=npr[ifc]; + sbuf=gbuf+ofs[ifc]; + rbuf=g+VOLUME+ofs[ifc]; + + if (np&0x1) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void random_g(void) +{ + int ix,t; + su3_dble unity,*gx; + + unity=ud0; + unity.c11.re=1.0; + unity.c22.re=1.0; + unity.c33.re=1.0; + gx=g; + + for (ix=0;ix0)||(bc!=1)) + random_su3_dble(gx); + else + (*gx)=unity; + + gx+=1; + } + + if (BNDRY>0) + { + pack_gbuf(); + send_gbuf(); + } +} + + +static void transform_ud(void) +{ + int ix,iy,t,ifc; + su3_dble *u; + + u=udfld(); + + for (ix=(VOLUME/2);ixdmax) + dmax=dev; + } + + return dmax; +} + + +static double max_dev_ud(su3_dble *v) +{ + double d,dmax; + su3_dble *u,*um; + + u=udfld(); + um=u+4*VOLUME; + dmax=0.0; + + for (;udmax) + dmax=d; + + v+=1; + } + + if (NPROC>1) + { + d=dmax; + MPI_Reduce(&d,&dmax,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Bcast(&dmax,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return sqrt(dmax); +} + + +int 
main(int argc,char *argv[]) +{ + int my_rank,n; + double phi[2],phi_prime[2],eps,dev; + su3_dble *udb,**usv; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check2.log","w",stdout); + fin=freopen("check2.in","r",stdin); + + printf("\n"); + printf("Gauge covariance of the Wilson flow\n"); + printf("-----------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + read_line("n","%d\n",&n); + read_line("eps","%lf",&eps); + fclose(fin); + + printf("n = %d\n",n); + printf("eps = %.3e\n\n",eps); + + bc=find_opt(argc,argv,"-bc"); + + if (bc!=0) + error_root(sscanf(argv[bc+1],"%d",&bc)!=1,1,"main [check2.c]", + "Syntax: check2 [-bc ]"); + } + + MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&eps,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + + phi[0]=0.123; + phi[1]=-0.534; + phi_prime[0]=0.912; + phi_prime[1]=0.078; + set_bc_parms(bc,0.973,1.127,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + alloc_wud(2); + alloc_wfd(1); + usv=reserve_wud(2); + udb=udfld(); + + g=amalloc(NSPIN*sizeof(*g),4); + + if (BNDRY>0) + gbuf=amalloc((BNDRY/2)*sizeof(*gbuf),4); + + error((g==NULL)||((BNDRY>0)&&(gbuf==NULL)),1,"main [check2.c]", + "Unable to allocate auxiliary arrays"); + + random_ud(); + random_g(); + cm3x3_assign(4*VOLUME,udb,usv[0]); + fwd_euler(n,eps); + transform_ud(); + cm3x3_assign(4*VOLUME,udb,usv[1]); + cm3x3_assign(4*VOLUME,usv[0],udb); + set_flags(UPDATED_UD); + transform_ud(); + fwd_euler(n,eps); + + dev=max_dev_ud(usv[1]); + error_chk(); + + if (my_rank==0) + { + printf("Maximal absolute deviation of U(x,mu) = %.1e\n\n",dev); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check2.in new file mode 100644 index 0000000000000000000000000000000000000000..d4adf9957594ad832baf6adcf1c986cde572979c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check2.in @@ -0,0 +1,2 @@ +n 4 +eps 0.01 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check3.c new file mode 100644 index 0000000000000000000000000000000000000000..3fc0e2d6fcd95af58174cb18e2b59b2f374821cd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check3.c @@ -0,0 +1,378 @@ + +/******************************************************************************* +* +* File check3.c +* +* Copyright (C) 2009-2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Convergence of the numerical integration of the Wilson flow. 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "forces.h" +#include "tcharge.h" +#include "wflow.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int my_rank; +static char cnfg_dir[NAME_SIZE],cnfg_file[NAME_SIZE]; +static char nbase[NAME_SIZE],end_file[NAME_SIZE]; + + +static void cmp_ud(su3_dble *u,su3_dble *v,double *dev) +{ + int i; + double r[18],d; + + r[ 0]=(*u).c11.re-(*v).c11.re; + r[ 1]=(*u).c11.im-(*v).c11.im; + r[ 2]=(*u).c12.re-(*v).c12.re; + r[ 3]=(*u).c12.im-(*v).c12.im; + r[ 4]=(*u).c13.re-(*v).c13.re; + r[ 5]=(*u).c13.im-(*v).c13.im; + + r[ 6]=(*u).c21.re-(*v).c21.re; + r[ 7]=(*u).c21.im-(*v).c21.im; + r[ 8]=(*u).c22.re-(*v).c22.re; + r[ 9]=(*u).c22.im-(*v).c22.im; + r[10]=(*u).c23.re-(*v).c23.re; + r[11]=(*u).c23.im-(*v).c23.im; + + r[12]=(*u).c31.re-(*v).c31.re; + r[13]=(*u).c31.im-(*v).c31.im; + r[14]=(*u).c32.re-(*v).c32.re; + r[15]=(*u).c32.im-(*v).c32.im; + r[16]=(*u).c33.re-(*v).c33.re; + r[17]=(*u).c33.im-(*v).c33.im; + + dev[0]=0.0; + dev[1]=0.0; + + for (i=0;i<18;i+=2) + { + d=sqrt(r[i]*r[i]+r[i+1]*r[i+1]); + + if (d>dev[0]) + dev[0]=d; + + dev[1]+=d; + } +} + + +static void dev_ud(su3_dble *v,double *dev) +{ + double d[2]; + su3_dble *u,*um; + + u=udfld(); + um=u+4*VOLUME; + dev[0]=0.0; + dev[1]=0.0; + + for (;udev[0]) + dev[0]=d[0]; + + dev[1]+=d[1]; + v+=1; + } + + if (NPROC>1) + { + d[0]=dev[0]; + d[1]=dev[1]; + MPI_Reduce(d,dev,1,MPI_DOUBLE,MPI_MAX,0,MPI_COMM_WORLD); + MPI_Reduce(d+1,dev+1,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(dev,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + dev[1]/=((double)(9*NPROC)*(double)(4*VOLUME)); +} + + +static int check_end(void) +{ + 
int iend; + FILE *end; + + if (my_rank==0) + { + iend=0; + end=fopen(end_file,"r"); + + if (end!=NULL) + { + fclose(end); + remove(end_file); + iend=1; + printf("End flag set, run stopped\n\n"); + } + } + + MPI_Bcast(&iend,1,MPI_INT,0,MPI_COMM_WORLD); + + return iend; +} + + +int main(int argc,char *argv[]) +{ + int first,last,step; + int bc,n,rule,icnfg,ncnfg,nsize; + double phi[2],phi_prime[2]; + double eps,dE[3],dQ[3],dU[2]; + double act[2],qtop[2],dev[2],nplaq; + double wt1,wt2,wtavg; + su3_dble *udb,**usv; + FILE *flog=NULL,*fin=NULL; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + flog=freopen("check3.log","w",stdout); + fin=freopen("check3.in","r",stdin); + + printf("\n"); + printf("Convergence of the numerical integration of the Wilson flow\n"); + printf("-----------------------------------------------------------\n\n"); + + printf("%dx%dx%dx%d lattice, ",NPROC0*L0,NPROC1*L1,NPROC2*L2,NPROC3*L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d local lattice\n\n",L0,L1,L2,L3); + + find_section("Configurations"); + read_line("name","%s",nbase); + read_line("cnfg_dir","%s",cnfg_dir); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + + find_section("Boundary conditions"); + read_line("type","%d\n",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + find_section("Wilson flow"); + read_line("n","%d\n",&n); + read_line("eps","%lf\n",&eps); + read_line("rule","%d",&rule); + fclose(fin); + + error_root((rule<1)||(rule>3),1,"main [check3.c]", /* rule 0 would otherwise fall through to the RK3 branch below */ + "rule must be 1,2 or 3"); + } + + MPI_Bcast(nbase,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(cnfg_dir,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + MPI_Bcast(&first,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&last,1,MPI_INT,0,MPI_COMM_WORLD); +
MPI_Bcast(&step,1,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&eps,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&rule,1,MPI_INT,0,MPI_COMM_WORLD); + + set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + print_bc_parms(); + + start_ranlux(0,1234); + geometry(); + alloc_wud(2); + alloc_wfd(1); + usv=reserve_wud(2); + udb=udfld(); + + if (my_rank==0) + { + printf("n = %d\n",n); + printf("eps = %.3e\n",eps); + + if (rule==1) + printf("Using the Euler integrator\n\n"); + else if (rule==2) + printf("Using the 2nd order RK integrator\n\n"); + else + printf("Using the 3rd order RK integrator\n\n"); + + printf("Configurations %sn%d -> %sn%d in steps of %d\n\n", + nbase,first,nbase,last,step); + + printf("Comparison of the integrated fields at fixed t=n*eps=%.2e\n", + (double)(n)*eps); + printf("with a precise integration using 5x the input value of n\n\n"); + + printf("The deviation |U_ij-U'_ij| is calculated component by\n"); + printf("component on all links of the lattice\n\n"); + fflush(flog); + } + + error_root(((last-first)%step)!=0,1,"main [check3.c]", + "last-first is not a multiple of step"); + check_dir_root(cnfg_dir); + + nsize=name_size("%s/%sn%d",cnfg_dir,nbase,last); + error_root(nsize>=NAME_SIZE,1,"main [check3.c]", + "cnfg_dir name is too long"); + + sprintf(end_file,"check3.end"); + + if (bc==0) + nplaq=(double)(6*N0-6)*(double)(N1*N2*N3); + else + nplaq=(double)(6*N0)*(double)(N1*N2*N3); + + dE[0]=0.0; + dE[1]=0.0; + dE[2]=0.0; + dQ[0]=0.0; + dQ[1]=0.0; + dQ[2]=0.0; + dU[0]=0.0; + dU[1]=0.0; + wtavg=0.0; + + for (icnfg=first;icnfg<=last;icnfg+=step) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + if (my_rank==0) + { + printf("Configuration no %d:\n\n",icnfg); + fflush(flog); + } + + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + import_cnfg(cnfg_file); +
cm3x3_assign(4*VOLUME,udb,usv[0]); + + if (rule==1) + fwd_euler(10*n,eps/10.0); + else if (rule==2) + fwd_rk2(4*n,eps/4.0); + else + fwd_rk3(3*n,eps/3.0); + + cm3x3_assign(4*VOLUME,udb,usv[1]); + act[0]=2.0*(3.0*nplaq-plaq_wsum_dble(1)); + qtop[0]=tcharge(); + + cm3x3_assign(4*VOLUME,usv[0],udb); + set_flags(UPDATED_UD); + + if (rule==1) + fwd_euler(n,eps); + else if (rule==2) + fwd_rk2(n,eps); + else + fwd_rk3(n,eps); + + act[1]=2.0*(3.0*nplaq-plaq_wsum_dble(1)); + qtop[1]=tcharge(); + + dev[0]=fabs(act[1]-act[0]); + if (dev[0]>dE[0]) + dE[0]=dev[0]; + dE[1]+=dev[0]; + dE[2]+=act[0]; + + dev[0]=fabs(qtop[1]-qtop[0]); + if (dev[0]>dQ[0]) + dQ[0]=dev[0]; + dQ[1]+=dev[0]; + dQ[2]+=fabs(qtop[0]); + + dev_ud(usv[1],dev); + if (dev[0]>dU[0]) + dU[0]=dev[0]; + dU[1]+=dev[1]; + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wtavg+=(wt2-wt1); + + if (my_rank==0) + { + printf("dE/E = %.1e, dQ = %.1e, max|dU| = %.1e, avg|dU| = %.1e\n\n", + fabs(1.0-act[0]/act[1]),fabs(qtop[1]-qtop[0]),dev[0],dev[1]); + printf("Configuration no %d fully processed in %.2e sec ", + icnfg,wt2-wt1); + printf("(average = %.2e sec)\n\n", + wtavg/(double)((icnfg-first)/step+1)); + fflush(flog); + } + + if (check_end()) + break; + } + + error_chk(); + + ncnfg=(icnfg>last)?((last-first)/step+1):((icnfg-first)/step+1); /* icnfg<=last here means the end flag stopped the loop early: count only the configurations actually processed */ + dE[1]/=(double)(ncnfg); + dE[2]/=(double)(ncnfg); + dQ[1]/=(double)(ncnfg); + dQ[2]/=(double)(ncnfg); + dU[1]/=(double)(ncnfg); + + if (my_rank==0) + { + printf("\n"); + printf("Test summary\n"); + printf("------------\n\n"); + + printf("Processed %d configurations\n\n",ncnfg); + printf("max|dE|/E = %.1e, avg|dE|/E = %.1e\n", + dE[0]/dE[2],dE[1]/dE[2]); + printf("max|dQ| = %.1e, avg|dQ| = %.1e, avg|Q| = %.2e\n", + dQ[0],dQ[1],dQ[2]); + printf("max|dU| = %.1e, avg|dU| = %.1e\n\n", + dU[0],dU[1]); + + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check3.in
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check3.in new file mode 100644 index 0000000000000000000000000000000000000000..824fe24eb09c2256a45a2b433dd351b5242600dc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/devel/wflow/check3.in @@ -0,0 +1,17 @@ + +[Configurations] +name 16x8x8x8b6.00id2 +cnfg_dir /home/data/openQCD/cnfg +first 7 +last 7 +step 1 + +[Boundary conditions] +type 0 +# phi 0.937 0.389 +# phi' -0.283 1.23 + +[Wilson flow] +n 100 +eps 0.02 +rule 3 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..2382083ee88e0f55ec7bf2c07e043f169ddc1c91 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/INDEX @@ -0,0 +1,31 @@ + +******************************************************************************** + + Collection of openQCD notes + +******************************************************************************** + +dirac.pdf Implementation of the lattice Dirac operator. + +forces.pdf Molecular-dynamics quark forces. + +gauge_action.pdf Gauge actions in openQCD simulations. + +mscg.pdf Multi-shift conjugate gradient algorithm. + +parms.pdf Parameters of the openQCD main programs. + +ranlux_guide.pdf User's guide for the random number generator ranlux. + Notes accompanying the program files ranlxs.c and + ranlxd.c in modules/random. + +ranlux_notes.pdf Description of the algorithms used in the current + version of the ranlux random number generator. + +rhmc.pdf Charm and strange quark in openQCD simulations. + +stat_fcts.pdf Statistical tests. Notes accompanying the program files + pchi_square.c and ks_test.c in modules/nompi/extras. + +su3_fcts.pdf SU(3) matrix functions. 
+ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/dirac.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/dirac.pdf new file mode 100644 index 0000000000000000000000000000000000000000..b045bfabfa8ad71b52edea371afb88081d01d790 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/dirac.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/forces.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/forces.pdf new file mode 100644 index 0000000000000000000000000000000000000000..320f71054c609ced7b4257f9b42938e20abcdaee Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/forces.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/gauge_action.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/gauge_action.pdf new file mode 100644 index 0000000000000000000000000000000000000000..fad950893b8ebe50ed9f618375ecf491ea4490e8 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/gauge_action.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/mscg.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/mscg.pdf new file mode 100644 index 0000000000000000000000000000000000000000..8fa12523959d7a9701dc65d3c30d20253d9eaca3 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/mscg.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/parms.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/parms.pdf new file mode 100644 index 0000000000000000000000000000000000000000..0025aa7dcce1a825cca887972532015b44ce9b77 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/parms.pdf differ diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/ranlux_guide.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/ranlux_guide.pdf new file mode 100644 index 0000000000000000000000000000000000000000..e02211bfabd2a025a7ce5feb64d96937b67386d7 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/ranlux_guide.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/ranlux_notes.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/ranlux_notes.pdf new file mode 100644 index 0000000000000000000000000000000000000000..a49a62e78293c06a33548c17b370b290f623ac3a Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/ranlux_notes.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/rhmc.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/rhmc.pdf new file mode 100644 index 0000000000000000000000000000000000000000..18f9d4051dede3cc697b6686cd84c91d8cbd90ce Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/rhmc.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/stat_fcts.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/stat_fcts.pdf new file mode 100644 index 0000000000000000000000000000000000000000..4c2174af899bd911102150c1a421dda085025c0d Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/stat_fcts.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/su3_fcts.pdf b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/su3_fcts.pdf new file mode 100644 index 0000000000000000000000000000000000000000..19e2461e9c00f0db756887f5dfa3c33eb6e6b434 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/doc/su3_fcts.pdf differ diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/archive.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/archive.h new file mode 100644 index 0000000000000000000000000000000000000000..a18a188bbd6ce57d85d9a7a07342ba37c4ee0bb8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/archive.h @@ -0,0 +1,38 @@ + +/******************************************************************************* +* +* File archive.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef ARCHIVE_H +#define ARCHIVE_H + +#ifndef SU3_H +#include "su3.h" +#endif + +/* ARCHIVE_C */ +extern void write_cnfg(char *out); +extern void read_cnfg(char *in); +extern void export_cnfg(char *out); +extern void import_cnfg(char *in); + +/* MARCHIVE_C */ +extern void write_mfld(char *out); +extern void read_mfld(char *in); +extern void export_mfld(char *out); +extern void import_mfld(char *in); + +/* SARCHIVE_C */ +extern void write_sfld(char *out,spinor_dble *sd); +extern void read_sfld(char *in,spinor_dble *sd); +extern void export_sfld(char *out,spinor_dble *sd); +extern void import_sfld(char *in,spinor_dble *sd); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/avx.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/avx.h new file mode 100644 index 0000000000000000000000000000000000000000..03d07137cd9f6009867cf7c48d76cfe3f9170126 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/avx.h @@ -0,0 +1,2754 @@ + +/******************************************************************************* +* +* File avx.h +* +* Copyright (C) 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Macros for 
Dirac spinors, SU(3) vectors and SU(3) matrices using inline +* assembly AVX instructions. The machine is assumed to comply with the +* x86-64 instruction set. +* +*******************************************************************************/ + +#ifndef AVX_H +#define AVX_H + +#ifndef SSE2_H +#include "sse2.h" +#endif + +typedef struct +{ + float c1,c2,c3,c4; + float c5,c6,c7,c8; +} avx_float __attribute__ ((aligned (32))); + +typedef struct +{ + double c1,c2,c3,c4; +} avx_double __attribute__ ((aligned (32))); + +static avx_double _avx_sgn12_dble __attribute__ ((unused)) ={-1.0,-1.0,1.0,1.0}; +static avx_double _avx_sgn13_dble __attribute__ ((unused)) ={-1.0,1.0,-1.0,1.0}; +static avx_double _avx_sgn14_dble __attribute__ ((unused)) ={-1.0,1.0,1.0,-1.0}; +static avx_double _avx_sgn23_dble __attribute__ ((unused)) ={1.0,-1.0,-1.0,1.0}; +static avx_double _avx_sgn24_dble __attribute__ ((unused)) ={1.0,-1.0,1.0,-1.0}; +static avx_double _avx_sgn34_dble __attribute__ ((unused)) ={1.0,1.0,-1.0,-1.0}; +static avx_double _avx_sgn_dble __attribute__ ((unused)) ={-1.0,-1.0,-1.0,-1.0}; + +static avx_float _avx_sgn_add __attribute__ ((unused)) +={1.0f,1.0f,1.0f,1.0f,-1.0f,-1.0f,-1.0f,-1.0f}; +static avx_float _avx_sgn_i_add __attribute__ ((unused)) +={-1.0f,1.0f,-1.0f,1.0f,1.0f,-1.0f,1.0f,-1.0f}; +static avx_float _avx_sgn_addsub __attribute__ ((unused)) +={1.0f,1.0f,-1.0f,-1.0f,-1.0f,-1.0f,1.0f,1.0f}; +static avx_float _avx_sgn_i_addsub __attribute__ ((unused)) +={-1.0f,1.0f,1.0f,-1.0f,1.0f,-1.0f,-1.0f,1.0f}; + +#define _avx_zeroall() \ +__asm__ __volatile__ ("vzeroall") + +#define _avx_zeroupper() \ +__asm__ __volatile__ ("vzeroupper") + +/******************************************************************************* +* +* Macros operating on single precision data +* +*******************************************************************************/ + +/******************************************************************************* +* +* Macros for spinors in su3_vector 
order +* +*******************************************************************************/ + +/* +* Loads two spinors sl and sh to the low and high lanes of ymm0,..,ymm5. The +* ordering of the spinor components in the low lane is +* +* xmm0 <- sl.c1.c1,sl.c2.c1 +* xmm1 <- sl.c1.c2,sl.c2.c2 +* xmm2 <- sl.c1.c3,sl.c2.c3 +* xmm3 <- sl.c3.c1,sl.c4.c1 +* xmm4 <- sl.c3.c2,sl.c4.c2 +* xmm5 <- sl.c3.c3,sl.c4.c3 +* +* and those in the high lane are arranged in the same way. The registers +* ymm6,..,ymm11 are changed on exit. +*/ + +#define _avx_spinor_pair_load34(sl,sh) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm6 \n\t" \ + "vmovaps %2, %%xmm7 \n\t" \ + "vmovaps %4, %%xmm8" \ + : \ + : \ + "m" ((sl).c1.c1), \ + "m" ((sl).c1.c2), \ + "m" ((sl).c1.c3), \ + "m" ((sl).c2.c1), \ + "m" ((sl).c2.c2), \ + "m" ((sl).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vmovaps %0, %%xmm9 \n\t" \ + "vmovaps %2, %%xmm10 \n\t" \ + "vmovaps %4, %%xmm11" \ + : \ + : \ + "m" ((sl).c3.c1), \ + "m" ((sl).c3.c2), \ + "m" ((sl).c3.c3), \ + "m" ((sl).c4.c1), \ + "m" ((sl).c4.c2), \ + "m" ((sl).c4.c3) \ + : \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %2, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((sh).c1.c1), \ + "m" ((sh).c1.c2), \ + "m" ((sh).c1.c3), \ + "m" ((sh).c2.c1), \ + "m" ((sh).c2.c2), \ + "m" ((sh).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm9, %%ymm9 \n\t" \ + "vinsertf128 $0x1, %2, %%ymm10, %%ymm10 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm11, %%ymm11" \ + : \ + : \ + "m" ((sh).c3.c1), \ + "m" ((sh).c3.c2), \ + "m" ((sh).c3.c3), \ + "m" ((sh).c4.c1), \ + "m" ((sh).c4.c2), \ + "m" ((sh).c4.c3) \ + : \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vshufps $0xe4, %%ymm7, %%ymm6, %%ymm0 \n\t" \ + "vshufps $0xe4, %%ymm10, %%ymm9, %%ymm3 \n\t" \ + "vshufps $0x4e, %%ymm8, %%ymm6, %%ymm1 \n\t" \ + 
"vshufps $0x4e, %%ymm11, %%ymm9, %%ymm4 \n\t" \ + "vshufps $0xe4, %%ymm8, %%ymm7, %%ymm2 \n\t" \ + "vshufps $0xe4, %%ymm11, %%ymm10, %%ymm5" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Loads two spinors sl and sh to the low and high lanes of ymm0,..,ymm5. The +* ordering of the spinor components in the low lane is +* +* xmm0 <- sl.c1.c1,sl.c2.c1 +* xmm1 <- sl.c1.c2,sl.c2.c2 +* xmm2 <- sl.c1.c3,sl.c2.c3 +* xmm3 <- sl.c4.c1,sl.c3.c1 (note: unusual order) +* xmm4 <- sl.c4.c2,sl.c3.c2 +* xmm5 <- sl.c4.c3,sl.c3.c3 +* +* and those in the high lane are arranged in the same way. The registers +* ymm6,..,ymm11 are changed on exit. +*/ + +#define _avx_spinor_pair_load43(sl,sh) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm6 \n\t" \ + "vmovaps %2, %%xmm7 \n\t" \ + "vmovaps %4, %%xmm8" \ + : \ + : \ + "m" ((sl).c1.c1), \ + "m" ((sl).c1.c2), \ + "m" ((sl).c1.c3), \ + "m" ((sl).c2.c1), \ + "m" ((sl).c2.c2), \ + "m" ((sl).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vmovaps %0, %%xmm9 \n\t" \ + "vmovaps %2, %%xmm10 \n\t" \ + "vmovaps %4, %%xmm11" \ + : \ + : \ + "m" ((sl).c3.c1), \ + "m" ((sl).c3.c2), \ + "m" ((sl).c3.c3), \ + "m" ((sl).c4.c1), \ + "m" ((sl).c4.c2), \ + "m" ((sl).c4.c3) \ + : \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %2, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((sh).c1.c1), \ + "m" ((sh).c1.c2), \ + "m" ((sh).c1.c3), \ + "m" ((sh).c2.c1), \ + "m" ((sh).c2.c2), \ + "m" ((sh).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm9, %%ymm9 \n\t" \ + "vinsertf128 $0x1, %2, %%ymm10, %%ymm10 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm11, %%ymm11" \ + : \ + : \ + "m" ((sh).c3.c1), \ + "m" ((sh).c3.c2), \ + "m" ((sh).c3.c3), \ + "m" ((sh).c4.c1), \ + "m" ((sh).c4.c2), \ + "m" ((sh).c4.c3) \ + : \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ 
("vshufps $0xe4, %%ymm7, %%ymm6, %%ymm0 \n\t" \ + "vshufps $0x4e, %%ymm9, %%ymm10, %%ymm3 \n\t" \ + "vshufps $0x4e, %%ymm8, %%ymm6, %%ymm1 \n\t" \ + "vshufps $0xe4, %%ymm9, %%ymm11, %%ymm4 \n\t" \ + "vshufps $0xe4, %%ymm8, %%ymm7, %%ymm2 \n\t" \ + "vshufps $0x4e, %%ymm10, %%ymm11, %%ymm5" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Loads the spinor s to xmm0,..,xmm5 in the order +* +* xmm0 <- s.c1.c1,s.c2.c1 +* xmm1 <- s.c1.c2,s.c2.c2 +* xmm2 <- s.c1.c3,s.c2.c3 +* xmm3 <- s.c3.c1,s.c4.c1 +* xmm4 <- s.c3.c2,s.c4.c2 +* xmm5 <- s.c3.c3,s.c4.c3 +* +* and duplicates these values to the upper lanes of ymm0,..ymm5. The registers +* ymm6,..,ymm11 are changed on exit. +*/ + +#define _avx_spinor_load_dup(s) \ +__asm__ __volatile__ ("vbroadcastf128 %0, %%ymm6 \n\t" \ + "vbroadcastf128 %2, %%ymm7 \n\t" \ + "vbroadcastf128 %4, %%ymm8" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vbroadcastf128 %0, %%ymm9 \n\t" \ + "vbroadcastf128 %2, %%ymm10 \n\t" \ + "vbroadcastf128 %4, %%ymm11" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vshufps $0xe4, %%ymm7, %%ymm6, %%ymm0 \n\t" \ + "vshufps $0xe4, %%ymm10, %%ymm9, %%ymm3 \n\t" \ + "vshufps $0x4e, %%ymm8, %%ymm6, %%ymm1 \n\t" \ + "vshufps $0x4e, %%ymm11, %%ymm9, %%ymm4 \n\t" \ + "vshufps $0xe4, %%ymm8, %%ymm7, %%ymm2 \n\t" \ + "vshufps $0xe4, %%ymm11, %%ymm10, %%ymm5" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Stores the low and high lanes of ymm0,..,ymm5 to the spinors rl and rh, +* assuming the spinor components are ordered as if they were loaded with +* _avx_spinor_pair_load34(rl,rh). The registers ymm6,..,ymm11 are changed +* on exit. 
+*/ + +#define _avx_spinor_pair_store34(rl,rh) \ +__asm__ __volatile__ ("vshufps $0x44, %%ymm1, %%ymm0, %%ymm6 \n\t" \ + "vshufps $0x44, %%ymm4, %%ymm3, %%ymm9 \n\t" \ + "vshufps $0xe4, %%ymm0, %%ymm2, %%ymm7 \n\t" \ + "vshufps $0xe4, %%ymm3, %%ymm5, %%ymm10 \n\t" \ + "vshufps $0xee, %%ymm2, %%ymm1, %%ymm8 \n\t" \ + "vshufps $0xee, %%ymm5, %%ymm4, %%ymm11" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vmovaps %%xmm6, %0 \n\t" \ + "vmovaps %%xmm7, %2 \n\t" \ + "vmovaps %%xmm8, %4" \ + : \ + "=m" ((rl).c1.c1), \ + "=m" ((rl).c1.c2), \ + "=m" ((rl).c1.c3), \ + "=m" ((rl).c2.c1), \ + "=m" ((rl).c2.c2), \ + "=m" ((rl).c2.c3)); \ +__asm__ __volatile__ ("vmovaps %%xmm9, %0 \n\t" \ + "vmovaps %%xmm10, %2 \n\t" \ + "vmovaps %%xmm11, %4" \ + : \ + "=m" ((rl).c3.c1), \ + "=m" ((rl).c3.c2), \ + "=m" ((rl).c3.c3), \ + "=m" ((rl).c4.c1), \ + "=m" ((rl).c4.c2), \ + "=m" ((rl).c4.c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm6, %0 \n\t" \ + "vextractf128 $0x1, %%ymm7, %2 \n\t" \ + "vextractf128 $0x1, %%ymm8, %4" \ + : \ + "=m" ((rh).c1.c1), \ + "=m" ((rh).c1.c2), \ + "=m" ((rh).c1.c3), \ + "=m" ((rh).c2.c1), \ + "=m" ((rh).c2.c2), \ + "=m" ((rh).c2.c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm9, %0 \n\t" \ + "vextractf128 $0x1, %%ymm10, %2 \n\t" \ + "vextractf128 $0x1, %%ymm11, %4" \ + : \ + "=m" ((rh).c3.c1), \ + "=m" ((rh).c3.c2), \ + "=m" ((rh).c3.c3), \ + "=m" ((rh).c4.c1), \ + "=m" ((rh).c4.c2), \ + "=m" ((rh).c4.c3)) + +/* +* Stores the low and high lanes of ymm0,..,ymm5 to the spinors rl and rh, +* assuming the spinor components are ordered as if they were loaded with +* _avx_spinor_pair_load43(rl,rh). The registers ymm6,..,ymm11 are changed +* on exit. 
+*/ + +#define _avx_spinor_pair_store43(rl,rh) \ +__asm__ __volatile__ ("vshufps $0x44, %%ymm1, %%ymm0, %%ymm6 \n\t" \ + "vshufps $0xee, %%ymm4, %%ymm3, %%ymm9 \n\t" \ + "vshufps $0xe4, %%ymm0, %%ymm2, %%ymm7 \n\t" \ + "vshufps $0x4e, %%ymm3, %%ymm5, %%ymm10 \n\t" \ + "vshufps $0xee, %%ymm2, %%ymm1, %%ymm8 \n\t" \ + "vshufps $0x44, %%ymm5, %%ymm4, %%ymm11" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vmovaps %%xmm6, %0 \n\t" \ + "vmovaps %%xmm7, %2 \n\t" \ + "vmovaps %%xmm8, %4" \ + : \ + "=m" ((rl).c1.c1), \ + "=m" ((rl).c1.c2), \ + "=m" ((rl).c1.c3), \ + "=m" ((rl).c2.c1), \ + "=m" ((rl).c2.c2), \ + "=m" ((rl).c2.c3)); \ +__asm__ __volatile__ ("vmovaps %%xmm9, %0 \n\t" \ + "vmovaps %%xmm10, %2 \n\t" \ + "vmovaps %%xmm11, %4" \ + : \ + "=m" ((rl).c3.c1), \ + "=m" ((rl).c3.c2), \ + "=m" ((rl).c3.c3), \ + "=m" ((rl).c4.c1), \ + "=m" ((rl).c4.c2), \ + "=m" ((rl).c4.c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm6, %0 \n\t" \ + "vextractf128 $0x1, %%ymm7, %2 \n\t" \ + "vextractf128 $0x1, %%ymm8, %4" \ + : \ + "=m" ((rh).c1.c1), \ + "=m" ((rh).c1.c2), \ + "=m" ((rh).c1.c3), \ + "=m" ((rh).c2.c1), \ + "=m" ((rh).c2.c2), \ + "=m" ((rh).c2.c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm9, %0 \n\t" \ + "vextractf128 $0x1, %%ymm10, %2 \n\t" \ + "vextractf128 $0x1, %%ymm11, %4" \ + : \ + "=m" ((rh).c3.c1), \ + "=m" ((rh).c3.c2), \ + "=m" ((rh).c3.c3), \ + "=m" ((rh).c4.c1), \ + "=m" ((rh).c4.c2), \ + "=m" ((rh).c4.c3)) + +/* +* Loads the lower Weyl spinors of the Dirac spinors sl and sh to the low and +* high lanes of ymm0,..,ymm2. The ordering of the spinor components in the +* low lane is +* +* xmm0 <- sl.c1.c1,sl.c2.c1 +* xmm1 <- sl.c1.c2,sl.c2.c2 +* xmm2 <- sl.c1.c3,sl.c2.c3 +* +* and those in the high lane are arranged in the same way. The registers +* ymm6,..,ymm8 are changed on exit. Also applies if sl and sh are Weyl +* spinors. 
+*/ + +#define _avx_weyl_pair_load12(sl,sh) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm6 \n\t" \ + "vmovaps %2, %%xmm7 \n\t" \ + "vmovaps %4, %%xmm8" \ + : \ + : \ + "m" ((sl).c1.c1), \ + "m" ((sl).c1.c2), \ + "m" ((sl).c1.c3), \ + "m" ((sl).c2.c1), \ + "m" ((sl).c2.c2), \ + "m" ((sl).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %2, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((sh).c1.c1), \ + "m" ((sh).c1.c2), \ + "m" ((sh).c1.c3), \ + "m" ((sh).c2.c1), \ + "m" ((sh).c2.c2), \ + "m" ((sh).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vshufps $0xe4, %%ymm7, %%ymm6, %%ymm0 \n\t" \ + "vshufps $0x4e, %%ymm8, %%ymm6, %%ymm1 \n\t" \ + "vshufps $0xe4, %%ymm8, %%ymm7, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Loads the upper Weyl spinors of the Dirac spinors sl and sh to the low and +* high lanes of ymm0,..,ymm2. The ordering of the spinor components in the +* low lane is +* +* xmm0 <- sl.c3.c1,sl.c4.c1 +* xmm1 <- sl.c3.c2,sl.c4.c2 +* xmm2 <- sl.c3.c3,sl.c4.c3 +* +* and those in the high lane are arranged in the same way. The registers +* ymm6,..,ymm8 are changed on exit. 
+*/ + +#define _avx_weyl_pair_load34(sl,sh) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm6 \n\t" \ + "vmovaps %2, %%xmm7 \n\t" \ + "vmovaps %4, %%xmm8" \ + : \ + : \ + "m" ((sl).c3.c1), \ + "m" ((sl).c3.c2), \ + "m" ((sl).c3.c3), \ + "m" ((sl).c4.c1), \ + "m" ((sl).c4.c2), \ + "m" ((sl).c4.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %2, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((sh).c3.c1), \ + "m" ((sh).c3.c2), \ + "m" ((sh).c3.c3), \ + "m" ((sh).c4.c1), \ + "m" ((sh).c4.c2), \ + "m" ((sh).c4.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vshufps $0xe4, %%ymm7, %%ymm6, %%ymm0 \n\t" \ + "vshufps $0x4e, %%ymm8, %%ymm6, %%ymm1 \n\t" \ + "vshufps $0xe4, %%ymm8, %%ymm7, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Stores the low and high lanes of ymm0,..,ymm2 to the lower Weyl spinors +* of the Dirac spinors rl and rh, assuming the spinor components are ordered +* as if they were loaded with _avx_weyl_pair_load12(rl,rh). The registers +* ymm6,..,ymm8 are changed on exit. Also applies if rl and rh are Weyl +* spinors. 
+*/ + +#define _avx_weyl_pair_store12(rl,rh) \ +__asm__ __volatile__ ("vshufps $0x44, %%ymm1, %%ymm0, %%ymm6 \n\t" \ + "vshufps $0xe4, %%ymm0, %%ymm2, %%ymm7 \n\t" \ + "vshufps $0xee, %%ymm2, %%ymm1, %%ymm8" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vmovaps %%xmm6, %0 \n\t" \ + "vmovaps %%xmm7, %2 \n\t" \ + "vmovaps %%xmm8, %4" \ + : \ + "=m" ((rl).c1.c1), \ + "=m" ((rl).c1.c2), \ + "=m" ((rl).c1.c3), \ + "=m" ((rl).c2.c1), \ + "=m" ((rl).c2.c2), \ + "=m" ((rl).c2.c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm6, %0 \n\t" \ + "vextractf128 $0x1, %%ymm7, %2 \n\t" \ + "vextractf128 $0x1, %%ymm8, %4" \ + : \ + "=m" ((rh).c1.c1), \ + "=m" ((rh).c1.c2), \ + "=m" ((rh).c1.c3), \ + "=m" ((rh).c2.c1), \ + "=m" ((rh).c2.c2), \ + "=m" ((rh).c2.c3)) + +/* +* Stores the low and high lanes of ymm0,..,ymm2 to the upper Weyl spinors +* of the Dirac spinors rl and rh, assuming the spinor components are ordered +* as if they were loaded with _avx_weyl_pair_load34(rl,rh). The registers +* ymm6,..,ymm8 are changed on exit. 
+*/ + +#define _avx_weyl_pair_store34(rl,rh) \ +__asm__ __volatile__ ("vshufps $0x44, %%ymm1, %%ymm0, %%ymm6 \n\t" \ + "vshufps $0xe4, %%ymm0, %%ymm2, %%ymm7 \n\t" \ + "vshufps $0xee, %%ymm2, %%ymm1, %%ymm8" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vmovaps %%xmm6, %0 \n\t" \ + "vmovaps %%xmm7, %2 \n\t" \ + "vmovaps %%xmm8, %4" \ + : \ + "=m" ((rl).c3.c1), \ + "=m" ((rl).c3.c2), \ + "=m" ((rl).c3.c3), \ + "=m" ((rl).c4.c1), \ + "=m" ((rl).c4.c2), \ + "=m" ((rl).c4.c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm6, %0 \n\t" \ + "vextractf128 $0x1, %%ymm7, %2 \n\t" \ + "vextractf128 $0x1, %%ymm8, %4" \ + : \ + "=m" ((rh).c3.c1), \ + "=m" ((rh).c3.c2), \ + "=m" ((rh).c3.c3), \ + "=m" ((rh).c4.c1), \ + "=m" ((rh).c4.c2), \ + "=m" ((rh).c4.c3)) + +/* +* Splits the registers ymm3,..,ymm5 according to +* +* xmm3 <- ymm3_lo + ymm3_hi +* xmm4 <- ymm4_lo + ymm4_hi +* xmm5 <- ymm5_lo + ymm5_hi +* +* xmm6 <- ymm3_lo - ymm3_hi +* xmm7 <- ymm4_lo - ymm4_hi +* xmm8 <- ymm5_lo - ymm5_hi +* +* where *_lo and *_hi are the low and high lanes of the registers. The +* registers ymm9,..,ymm11 are used as workspace. +*/ + +#define _avx_spinor_split() \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm3, %%xmm9 \n\t" \ + "vextractf128 $0x1, %%ymm4, %%xmm10 \n\t" \ + "vextractf128 $0x1, %%ymm5, %%xmm11 \n\t" \ + "vsubps %%xmm9, %%xmm3, %%xmm6 \n\t" \ + "vsubps %%xmm10, %%xmm4, %%xmm7 \n\t" \ + "vsubps %%xmm11, %%xmm5, %%xmm8 \n\t" \ + "vaddps %%xmm9, %%xmm3, %%xmm3 \n\t" \ + "vaddps %%xmm10, %%xmm4, %%xmm4 \n\t" \ + "vaddps %%xmm11, %%xmm5, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +/* +* Moves the lower lanes of ymm6,..,ymm8 to the upper lanes of ymm3,..,ymm5. 
+*/ + +#define _avx_spinor_unsplit() \ +__asm__ __volatile__ ("vinsertf128 $0x1, %%xmm6, %%ymm3, %%ymm3 \n\t" \ + "vinsertf128 $0x1, %%xmm7, %%ymm4, %%ymm4 \n\t" \ + "vinsertf128 $0x1, %%xmm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies ymm3,..,ymm5 by the avx_float c. The register ymm15 is used as +* workspace. +*/ + +#define _avx_spinor_mul_up(c) \ +__asm__ __volatile__ ("vmovaps %0, %%ymm15 \n\t" \ + "vmulps %%ymm15, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm15, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm15, %%ymm5, %%ymm5" \ + : \ + : \ + "m" (c) \ + : \ + "xmm3", "xmm4", "xmm5", "xmm15") + +/* +* Exchanges real and imaginary parts of the double words in ymm3,..,ymm5 +* and multiplies these registers by the avx_float c. The register ymm15 is +* used as workspace. +*/ + +#define _avx_spinor_imul_up(c) \ +__asm__ __volatile__ ("vmovaps %0, %%ymm15 \n\t" \ + "vpermilps $0xb1, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0xb1, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0xb1, %%ymm5, %%ymm5 \n\t" \ + "vmulps %%ymm15, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm15, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm15, %%ymm5, %%ymm5" \ + : \ + : \ + "m" (c) \ + : \ + "xmm3", "xmm4", "xmm5", "xmm15") + +/* +* Exchanges the high and low words in the two lanes of ymm3,..,ymm5. +*/ + +#define _avx_spinor_xch_up() \ +__asm__ __volatile__ ("vpermilps $0x4e, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0x4e, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0x4e, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low words in the two lanes of ymm3,..,ymm5, then the +* real and imaginary parts of the words and finally multiplies the registers +* by the avx_float c. The register ymm15 is used as workspace. 
+*/ + +#define _avx_spinor_xch_imul_up(c) \ +__asm__ __volatile__ ("vmovaps %0, %%ymm15 \n\t" \ + "vpermilps $0x1b, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0x1b, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0x1b, %%ymm5, %%ymm5 \n\t" \ + "vmulps %%ymm15, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm15, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm15, %%ymm5, %%ymm5" \ + : \ + : \ + "m" (c) \ + : \ + "xmm3", "xmm4", "xmm5", "xmm15") + +/* +* Multiplies xmm6,..,xmm8 by the sse_float c. The register ymm15 is used as +* workspace. +*/ + +#define _avx_weyl_mul(c) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm15 \n\t" \ + "vmulps %%xmm15, %%xmm6, %%xmm6 \n\t" \ + "vmulps %%xmm15, %%xmm7, %%xmm7 \n\t" \ + "vmulps %%xmm15, %%xmm8, %%xmm8" \ + : \ + : \ + "m" (c) \ + : \ + "xmm6", "xmm7", "xmm8", "xmm15") + +/* +* Exchanges real and imaginary parts of the double words in xmm6,..,xmm8 +* and multiplies these registers by the sse_float c. The register ymm15 is +* used as workspace. +*/ + +#define _avx_weyl_imul(c) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm15 \n\t" \ + "vpermilps $0xb1, %%xmm6, %%xmm6 \n\t" \ + "vpermilps $0xb1, %%xmm7, %%xmm7 \n\t" \ + "vpermilps $0xb1, %%xmm8, %%xmm8 \n\t" \ + "vmulps %%xmm15, %%xmm6, %%xmm6 \n\t" \ + "vmulps %%xmm15, %%xmm7, %%xmm7 \n\t" \ + "vmulps %%xmm15, %%xmm8, %%xmm8" \ + : \ + : \ + "m" (c) \ + : \ + "xmm6", "xmm7", "xmm8", "xmm15") + +/* +* Exchanges the high and low words of xmm6,..,xmm8. +*/ + +#define _avx_weyl_xch() \ +__asm__ __volatile__ ("vpermilps $0x4e, %%xmm6, %%xmm6 \n\t" \ + "vpermilps $0x4e, %%xmm7, %%xmm7 \n\t" \ + "vpermilps $0x4e, %%xmm8, %%xmm8" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8") + +/* +* Exchanges the high and low words of xmm6,..,xmm8, then the real and +* imaginary parts of the words and finally multiplies the registers by +* the sse_float c. The register ymm15 is used as workspace. 
+*/ + +#define _avx_weyl_xch_imul(c) \ +__asm__ __volatile__ ("vmovaps %0, %%xmm15 \n\t" \ + "vpermilps $0x1b, %%xmm6, %%xmm6 \n\t" \ + "vpermilps $0x1b, %%xmm7, %%xmm7 \n\t" \ + "vpermilps $0x1b, %%xmm8, %%xmm8 \n\t" \ + "vmulps %%xmm15, %%xmm6, %%xmm6 \n\t" \ + "vmulps %%xmm15, %%xmm7, %%xmm7 \n\t" \ + "vmulps %%xmm15, %%xmm8, %%xmm8" \ + : \ + : \ + "m" (c) \ + : \ + "xmm6", "xmm7", "xmm8", "xmm15") + +/* +* Adds ymm3,..,ymm5 to ymm0,..,ymm2 +*/ + +#define _avx_spinor_add() \ +__asm__ __volatile__ ("vaddps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddps %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Subtracts ymm3,..,ymm5 from ymm0,..,ymm2 +*/ + +#define _avx_spinor_sub() \ +__asm__ __volatile__ ("vsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vsubps %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Adds (subtracts) the low (high) words in the two lanes of ymm3,..,ymm5 +* to (from) ymm0,..,ymm2. The registers ymm6,ymm7,ymm8 are changed on exit. +*/ + +#define _avx_spinor_addsub() \ +__asm__ __volatile__ ("vaddps %%ymm3, %%ymm0, %%ymm6 \n\t" \ + "vaddps %%ymm4, %%ymm1, %%ymm7 \n\t" \ + "vaddps %%ymm5, %%ymm2, %%ymm8 \n\t" \ + "vsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vsubps %%ymm5, %%ymm2, %%ymm2 \n\t" \ + "vblendps $0x33, %%ymm6, %%ymm0, %%ymm0 \n\t" \ + "vblendps $0x33, %%ymm7, %%ymm1, %%ymm1 \n\t" \ + "vblendps $0x33, %%ymm8, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm6", "xmm7", "xmm8") + +/* +* Adds (subtracts) the high (low) words in the two lanes of ymm3,..,ymm5 +* to (from) ymm0,..,ymm2. The registers ymm6,..,ymm8 are changed on exit. 
+*/ + +#define _avx_spinor_subadd() \ +__asm__ __volatile__ ("vaddps %%ymm3, %%ymm0, %%ymm6 \n\t" \ + "vaddps %%ymm4, %%ymm1, %%ymm7 \n\t" \ + "vaddps %%ymm5, %%ymm2, %%ymm8 \n\t" \ + "vsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vsubps %%ymm5, %%ymm2, %%ymm2 \n\t" \ + "vblendps $0xcc, %%ymm6, %%ymm0, %%ymm0 \n\t" \ + "vblendps $0xcc, %%ymm7, %%ymm1, %%ymm1 \n\t" \ + "vblendps $0xcc, %%ymm8, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm6", "xmm7", "xmm8") + +/* +* Multiplies ymm3,..,ymm5 with i and adds them to ymm0,..,ymm2. The +* registers ymm3,..,ymm5 are changed on exit. +*/ + +#define _avx_spinor_i_add() \ +__asm__ __volatile__ ("vpermilps $0xb1, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0xb1, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0xb1, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubps %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies ymm3,..,ymm5 with i and subtracts them from ymm0,..,ymm2. +*/ + +#define _avx_spinor_i_sub() \ +__asm__ __volatile__ ("vpermilps $0xb1, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xb1, %%ymm2, %%ymm2 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubps %%ymm5, %%ymm2, %%ymm2 \n\t" \ + "vpermilps $0xb1, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xb1, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Exchanges the high and low words of ymm3,..,ymm5, multiplies them with i +* and adds the result to ymm0,..,ymm2. The registers ymm3,..,ymm5 are +* changed on exit. 
+*/ + +#define _avx_spinor_xch_i_add() \ +__asm__ __volatile__ ("vpermilps $0x1b, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0x1b, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0x1b, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubps %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low words of ymm3,..,ymm5, multiplies them with i +* and subtracts the result from ymm0,..,ymm2. +*/ + +#define _avx_spinor_xch_i_sub() \ +__asm__ __volatile__ ("vpermilps $0x1b, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0x1b, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0x1b, %%ymm2, %%ymm2 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubps %%ymm5, %%ymm2, %%ymm2 \n\t" \ + "vpermilps $0x1b, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0x1b, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0x1b, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies the low and high words in the two lanes of ymm3,..,ymm5 with +* i and -i respectively and adds these registers to ymm0,..,ymm2. The +* registers ymm3,..,ymm5 are changed on exit. 
+*/ + +#define _avx_spinor_i_addsub() \ +__asm__ __volatile__ ("vpermilps $0xb4, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0xb4, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xb4, %%ymm2, %%ymm2 \n\t" \ + "vpermilps $0xe1, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0xe1, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0xe1, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubps %%ymm5, %%ymm2, %%ymm2 \n\t" \ + "vpermilps $0xb4, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0xb4, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xb4, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the low and high words in the two lanes of ymm3,..,ymm5 with +* -i and i respectively and adds these registers to ymm0,..,ymm2. The +* registers ymm3,..,ymm5 are changed on exit. +*/ + +#define _avx_spinor_i_subadd() \ +__asm__ __volatile__ ("vpermilps $0xe1, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0xe1, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xe1, %%ymm2, %%ymm2 \n\t" \ + "vpermilps $0xb4, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0xb4, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0xb4, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubps %%ymm5, %%ymm2, %%ymm2 \n\t" \ + "vpermilps $0xe1, %%ymm0, %%ymm0 \n\t" \ + "vpermilps $0xe1, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xe1, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low words in each lane of ymm3,..,ymm5. 
+*/ + +#define _avx_spinor_xch() \ +__asm__ __volatile__ ("vpermilps $0x4e, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0x4e, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0x4e, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/****************************************************************************** +* +* Action of su3 matrices on su3 vectors +* +******************************************************************************/ + +/* +* Multiplies pairs of su3 vectors, stored in the low and high lanes of +* ymm0,..,ymm2, with su3 matrices ul and uh, respectively. The vectors +* are assumed to be in vertical order and the products are returned in the +* same order in the registers ymm3,..,ymm5. All registers except for +* ymm15 are changed on exit. +*/ + +#define _avx_su3_pair_multiply(ul,uh) \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm3 \n\t" \ + "vbroadcastss %1, %%xmm6 \n\t" \ + "vbroadcastss %2, %%xmm4 \n\t" \ + "vbroadcastss %3, %%xmm9 \n\t" \ + "vbroadcastss %4, %%xmm10 \n\t" \ + "vbroadcastss %5, %%xmm11 \n\t" \ + "vinsertf128 $0x1, %%xmm9, %%ymm3, %%ymm3 \n\t" \ + "vinsertf128 $0x1, %%xmm10, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %%xmm11, %%ymm4, %%ymm4" \ + : \ + : \ + "m" ((ul).c11.re), \ + "m" ((ul).c12.re), \ + "m" ((ul).c21.re), \ + "m" ((uh).c11.re), \ + "m" ((uh).c12.re), \ + "m" ((uh).c21.re) \ + : \ + "xmm3", "xmm4", "xmm6", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm7 \n\t" \ + "vbroadcastss %1, %%xmm5 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm5, %%ymm5 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c22.re), \ + "m" ((ul).c31.re), \ + "m" ((ul).c32.re), \ + "m" ((uh).c22.re), \ + "m" ((uh).c31.re), \ + "m" ((uh).c32.re) \ + : \ + "xmm5", "xmm7", "xmm8", \ + "xmm12", "xmm13", 
"xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm0, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm1, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" \ + "vmulps %%ymm1, %%ymm8, %%ymm8 \n\t" \ + "vaddps %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm9 \n\t" \ + "vbroadcastss %1, %%xmm10 \n\t" \ + "vbroadcastss %2, %%xmm11 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm9, %%ymm9 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm10, %%ymm10 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm11, %%ymm11 \n\t" \ + "vpermilps $0xb1, %%ymm0, %%ymm0" \ + : \ + : \ + "m" ((ul).c13.re), \ + "m" ((ul).c21.im), \ + "m" ((ul).c33.re), \ + "m" ((uh).c13.re), \ + "m" ((uh).c21.im), \ + "m" ((uh).c33.re) \ + : \ + "xmm0", "xmm9", "xmm10", "xmm11", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm6 \n\t" \ + "vbroadcastss %1, %%xmm7 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c11.im), \ + "m" ((ul).c23.re), \ + "m" ((ul).c31.im), \ + "m" ((uh).c11.im), \ + "m" ((uh).c23.re), \ + "m" ((uh).c31.im) \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm2, %%ymm9, %%ymm9 \n\t" \ + "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t" \ + "vmulps %%ymm2, %%ymm11, %%ymm11 \n\t" \ + "vmulps %%ymm0, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm2, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm0, %%ymm8, %%ymm8 
\n\t" \ + "vaddps %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddsubps %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xb1, %%ymm2, %%ymm2 \n\t" \ + "vbroadcastss %0, %%xmm12 \n\t" \ + "vbroadcastss %1, %%xmm13 \n\t" \ + "vbroadcastss %2, %%xmm14 \n\t" \ + "vbroadcastss %3, %%xmm9 \n\t" \ + "vbroadcastss %4, %%xmm10 \n\t" \ + "vbroadcastss %5, %%xmm11 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm9, %%ymm9 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm10, %%ymm10 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm11, %%ymm11" \ + : \ + : \ + "m" ((uh).c12.im), \ + "m" ((uh).c23.im), \ + "m" ((uh).c32.im), \ + "m" ((ul).c12.im), \ + "m" ((ul).c23.im), \ + "m" ((ul).c32.im) \ + : \ + "xmm1", "xmm2", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm6 \n\t" \ + "vbroadcastss %1, %%xmm7 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c13.im), \ + "m" ((ul).c22.im), \ + "m" ((ul).c33.im), \ + "m" ((uh).c13.im), \ + "m" ((uh).c22.im), \ + "m" ((uh).c33.im) \ + : \ + "xmm6", "xmm7", "xmm8", "xmm12", \ + "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm1, %%ymm9, %%ymm9 \n\t" \ + "vmulps %%ymm2, %%ymm10, %%ymm10 \n\t" \ + "vmulps %%ymm1, %%ymm11, %%ymm11 \n\t" \ + "vmulps %%ymm2, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm2, %%ymm8, %%ymm8 \n\t" \ + "vaddsubps %%ymm9, %%ymm3, %%ymm3 \n\t" \ + 
"vaddsubps %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddsubps %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddsubps %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +/* +* Multiplies pairs of su3 vectors, stored in the low and high lanes of +* ymm0,..,ymm2, by the su3 matrices ul^dagger and uh^dagger, respectively. +* The vectors are assumed to be in vertical order and the products are returned +* in the same order in the registers ymm3,..,ymm5. All registers except for +* ymm15 are changed on exit. +*/ + +#define _avx_su3_pair_inverse_multiply(ul,uh) \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm6 \n\t" \ + "vbroadcastss %1, %%xmm9 \n\t" \ + "vbroadcastss %2, %%xmm7 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm9, %%ymm9 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm7, %%ymm7" \ + : \ + : \ + "m" ((ul).c11.im), \ + "m" ((ul).c21.im), \ + "m" ((ul).c12.im), \ + "m" ((uh).c11.im), \ + "m" ((uh).c21.im), \ + "m" ((uh).c12.im) \ + : \ + "xmm6", "xmm7", "xmm9", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm10 \n\t" \ + "vbroadcastss %1, %%xmm8 \n\t" \ + "vbroadcastss %2, %%xmm11 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm10, %%ymm10 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm8, %%ymm8 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm11, %%ymm11" \ + : \ + : \ + "m" ((ul).c22.im), \ + "m" ((ul).c13.im), \ + "m" ((ul).c23.im), \ + "m" ((uh).c22.im), \ + "m" ((uh).c13.im), \ + "m" ((uh).c23.im) \ + : \ + "xmm8", "xmm10", "xmm11", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm0, %%ymm6, %%ymm6 \n\t" \ + "vmulps 
%%ymm1, %%ymm9, %%ymm9 \n\t" \ + "vmulps %%ymm0, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm1, %%ymm10, %%ymm10 \n\t" \ + "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" \ + "vmulps %%ymm1, %%ymm11, %%ymm11 \n\t" \ + "vaddps %%ymm6, %%ymm9, %%ymm9 \n\t" \ + "vaddps %%ymm7, %%ymm10, %%ymm10 \n\t" \ + "vaddps %%ymm8, %%ymm11, %%ymm11" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm3 \n\t" \ + "vbroadcastss %1, %%xmm4 \n\t" \ + "vbroadcastss %2, %%xmm5 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm3, %%ymm3 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm4, %%ymm4 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm5, %%ymm5 \n\t" \ + "vpermilps $0xb1, %%ymm0, %%ymm0" \ + : \ + : \ + "m" ((ul).c11.re), \ + "m" ((ul).c12.re), \ + "m" ((ul).c13.re), \ + "m" ((uh).c11.re), \ + "m" ((uh).c12.re), \ + "m" ((uh).c13.re) \ + : \ + "xmm0", "xmm3", "xmm4", "xmm5", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm6 \n\t" \ + "vbroadcastss %1, %%xmm7 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c31.im), \ + "m" ((ul).c32.im), \ + "m" ((ul).c33.im), \ + "m" ((uh).c31.im), \ + "m" ((uh).c32.im), \ + "m" ((uh).c33.im) \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm0, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" \ + "vmulps %%ymm2, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm2, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm2, %%ymm8, %%ymm8 \n\t" \ + "vaddsubps %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm10, %%ymm4, 
%%ymm4 \n\t" \ + "vaddsubps %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddsubps %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xb1, %%ymm2, %%ymm2 \n\t" \ + "vbroadcastss %0, %%xmm12 \n\t" \ + "vbroadcastss %1, %%xmm13 \n\t" \ + "vbroadcastss %2, %%xmm14 \n\t" \ + "vbroadcastss %3, %%xmm9 \n\t" \ + "vbroadcastss %4, %%xmm10 \n\t" \ + "vbroadcastss %5, %%xmm11 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm9, %%ymm9 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm10, %%ymm10 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm11, %%ymm11" \ + : \ + : \ + "m" ((uh).c21.re), \ + "m" ((uh).c32.re), \ + "m" ((uh).c23.re), \ + "m" ((ul).c21.re), \ + "m" ((ul).c32.re), \ + "m" ((ul).c23.re) \ + : \ + "xmm1", "xmm2", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm6 \n\t" \ + "vbroadcastss %1, %%xmm7 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c31.re), \ + "m" ((ul).c22.re), \ + "m" ((ul).c33.re), \ + "m" ((uh).c31.re), \ + "m" ((uh).c22.re), \ + "m" ((uh).c33.re) \ + : \ + "xmm6", "xmm7", "xmm8", "xmm12", \ + "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm1, %%ymm9, %%ymm9 \n\t" \ + "vmulps %%ymm2, %%ymm10, %%ymm10 \n\t" \ + "vmulps %%ymm1, %%ymm11, %%ymm11 \n\t" \ + "vmulps %%ymm2, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm2, %%ymm8, %%ymm8 \n\t" \ + "vaddps %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddps %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddps %%ymm6, %%ymm3, 
%%ymm3 \n\t" \ + "vaddps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm8, %%ymm5, %%ymm5 \n\t" \ + "vpermilps $0xb1, %%ymm3, %%ymm3 \n\t" \ + "vpermilps $0xb1, %%ymm4, %%ymm4 \n\t" \ + "vpermilps $0xb1, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + + +/* +* Multiplies pairs of su3 vectors, stored in the low and high lanes of +* ymm0,..,ymm2, by the su3 matrices ul and uh^dagger, respectively. The +* vectors are assumed to be in vertical order and the products are returned +* in the same order in the registers ymm3,..,ymm5. All registers except +* for ymm15 are changed on exit. +*/ + +#define _avx_su3_pair_mixed_multiply(ul,uh) \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm3 \n\t" \ + "vbroadcastss %1, %%xmm6 \n\t" \ + "vbroadcastss %2, %%xmm4 \n\t" \ + "vbroadcastss %3, %%xmm9 \n\t" \ + "vbroadcastss %4, %%xmm10 \n\t" \ + "vbroadcastss %5, %%xmm11 \n\t" \ + "vinsertf128 $0x1, %%xmm9, %%ymm3, %%ymm3 \n\t" \ + "vinsertf128 $0x1, %%xmm10, %%ymm6, %%ymm6 \n\t" \ + "vinsertf128 $0x1, %%xmm11, %%ymm4, %%ymm4" \ + : \ + : \ + "m" ((ul).c11.re), \ + "m" ((ul).c12.re), \ + "m" ((ul).c21.re), \ + "m" ((uh).c11.re), \ + "m" ((uh).c21.re), \ + "m" ((uh).c12.re) \ + : \ + "xmm3", "xmm4", "xmm6", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm7 \n\t" \ + "vbroadcastss %1, %%xmm5 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm7, %%ymm7 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm5, %%ymm5 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c22.re), \ + "m" ((ul).c31.re), \ + "m" ((ul).c32.re), \ + "m" ((uh).c22.re), \ + "m" ((uh).c13.re), \ + "m" ((uh).c23.re) \ + : \ + "xmm5", "xmm7", "xmm8", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm0, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm1, 
%%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm0, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm0, %%ymm5, %%ymm5 \n\t" \ + "vmulps %%ymm1, %%ymm8, %%ymm8 \n\t" \ + "vaddps %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm9 \n\t" \ + "vbroadcastss %1, %%xmm10 \n\t" \ + "vbroadcastss %2, %%xmm11 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vinsertf128 $0x1, %%xmm12, %%ymm9, %%ymm9 \n\t" \ + "vperm2f128 $0x1, %%ymm13, %%ymm13, %%ymm13 \n\t" \ + "vinsertf128 $0x1, %%xmm14, %%ymm11, %%ymm11 \n\t" \ + "vsubps %%ymm13, %%ymm10, %%ymm10 \n\t" \ + "vpermilps $0xb1, %%ymm0, %%ymm0" \ + : \ + : \ + "m" ((ul).c13.re), \ + "m" ((ul).c21.im), \ + "m" ((ul).c33.re), \ + "m" ((uh).c31.re), \ + "m" ((uh).c12.im), \ + "m" ((uh).c33.re) \ + : \ + "xmm0", "xmm9", "xmm10", "xmm11", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm6 \n\t" \ + "vbroadcastss %1, %%xmm7 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vperm2f128 $0x1, %%ymm12, %%ymm12, %%ymm12 \n\t" \ + "vinsertf128 $0x1, %%xmm13, %%ymm7, %%ymm7 \n\t" \ + "vperm2f128 $0x1, %%ymm14, %%ymm14, %%ymm14 \n\t" \ + "vsubps %%ymm12, %%ymm6, %%ymm6 \n\t" \ + "vsubps %%ymm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c11.im), \ + "m" ((ul).c23.re), \ + "m" ((ul).c31.im), \ + "m" ((uh).c11.im), \ + "m" ((uh).c32.re), \ + "m" ((uh).c13.im) \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vmulps %%ymm2, %%ymm9, %%ymm9 \n\t" \ + "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t" \ + "vmulps %%ymm2, %%ymm11, %%ymm11 \n\t" \ + "vmulps %%ymm0, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm2, %%ymm7, %%ymm7 
\n\t" \ + "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" \ + "vaddps %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddps %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddsubps %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vpermilps $0xb1, %%ymm1, %%ymm1 \n\t" \ + "vpermilps $0xb1, %%ymm2, %%ymm2 \n\t" \ + "vbroadcastss %0, %%xmm12 \n\t" \ + "vbroadcastss %1, %%xmm13 \n\t" \ + "vbroadcastss %2, %%xmm14 \n\t" \ + "vbroadcastss %3, %%xmm9 \n\t" \ + "vbroadcastss %4, %%xmm10 \n\t" \ + "vbroadcastss %5, %%xmm11 \n\t" \ + "vperm2f128 $0x1, %%ymm12, %%ymm12, %%ymm12 \n\t" \ + "vperm2f128 $0x1, %%ymm13, %%ymm13, %%ymm13 \n\t" \ + "vperm2f128 $0x1, %%ymm14, %%ymm14, %%ymm14 \n\t" \ + "vsubps %%ymm12, %%ymm9, %%ymm9 \n\t" \ + "vsubps %%ymm13, %%ymm10, %%ymm10 \n\t" \ + "vsubps %%ymm14, %%ymm11, %%ymm11" \ + : \ + : \ + "m" ((uh).c21.im), \ + "m" ((uh).c32.im), \ + "m" ((uh).c23.im), \ + "m" ((ul).c12.im), \ + "m" ((ul).c23.im), \ + "m" ((ul).c32.im) \ + : \ + "xmm1", "xmm2", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastss %0, %%xmm6 \n\t" \ + "vbroadcastss %1, %%xmm7 \n\t" \ + "vbroadcastss %2, %%xmm8 \n\t" \ + "vbroadcastss %3, %%xmm12 \n\t" \ + "vbroadcastss %4, %%xmm13 \n\t" \ + "vbroadcastss %5, %%xmm14 \n\t" \ + "vperm2f128 $0x1, %%ymm12, %%ymm12, %%ymm12 \n\t" \ + "vperm2f128 $0x1, %%ymm13, %%ymm13, %%ymm13 \n\t" \ + "vperm2f128 $0x1, %%ymm14, %%ymm14, %%ymm14 \n\t" \ + "vsubps %%ymm12, %%ymm6, %%ymm6 \n\t" \ + "vsubps %%ymm13, %%ymm7, %%ymm7 \n\t" \ + "vsubps %%ymm14, %%ymm8, %%ymm8" \ + : \ + : \ + "m" ((ul).c13.im), \ + "m" ((ul).c22.im), \ + "m" ((ul).c33.im), \ + "m" ((uh).c31.im), \ + "m" ((uh).c22.im), \ + "m" ((uh).c33.im) \ + : \ + "xmm6", "xmm7", "xmm8", "xmm12", \ + "xmm13", "xmm14"); \ +__asm__ 
__volatile__ ("vmulps %%ymm1, %%ymm9, %%ymm9 \n\t" \ + "vmulps %%ymm2, %%ymm10, %%ymm10 \n\t" \ + "vmulps %%ymm1, %%ymm11, %%ymm11 \n\t" \ + "vmulps %%ymm2, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm2, %%ymm8, %%ymm8 \n\t" \ + "vaddsubps %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddsubps %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddsubps %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddsubps %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddsubps %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +/****************************************************************************** +* +* Macros for single precision Dirac spinors in linear order +* +******************************************************************************/ + +/* +* Loads the spinor s to the registers ymm0,..,ymm2 in linear order. +*/ + +#define _avx_spinor_load(s) \ +__asm__ __volatile__ ("vmovaps %0, %%ymm0" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1) \ + : \ + "xmm0"); \ +__asm__ __volatile__ ("vmovaps %0, %%ymm1" \ + : \ + : \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3), \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2) \ + : \ + "xmm1"); \ +__asm__ __volatile__ ("vmovaps %0, %%ymm2" \ + : \ + : \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm2") + +/* +* Loads the spinor s to the registers ymm3,..,ymm5 in linear order. 
+*/ + +#define _avx_spinor_load_up(s) \ +__asm__ __volatile__ ("vmovaps %0, %%ymm3" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1) \ + : \ + "xmm3"); \ +__asm__ __volatile__ ("vmovaps %0, %%ymm4" \ + : \ + : \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3), \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2) \ + : \ + "xmm4"); \ +__asm__ __volatile__ ("vmovaps %0, %%ymm5" \ + : \ + : \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm5") + +/* +* Stores the registers ymm0,..,ymm2 to the spinor s in linear order. +* Each vmovaps writes a whole 256-bit register through operand %0 (the +* address must be 32-byte aligned); the other "=m" operands are not named +* in the template and only declare those components as outputs. +*/ + +#define _avx_spinor_store(s) \ +__asm__ __volatile__ ("vmovaps %%ymm0, %0 \n\t" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1)); \ +__asm__ __volatile__ ("vmovaps %%ymm1, %0 \n\t" \ + : \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3), \ + "=m" ((s).c3.c1), \ + "=m" ((s).c3.c2)); \ +__asm__ __volatile__ ("vmovaps %%ymm2, %0 \n\t" \ + : \ + "=m" ((s).c3.c3), \ + "=m" ((s).c4.c1), \ + "=m" ((s).c4.c2), \ + "=m" ((s).c4.c3)) + +/* +* Stores the registers ymm3,..,ymm5 to the spinor s in linear order. +* Same operand scheme as _avx_spinor_store: only %0 is named in each +* statement and the store address must be 32-byte aligned. +*/ + +#define _avx_spinor_store_up(s) \ +__asm__ __volatile__ ("vmovaps %%ymm3, %0 \n\t" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1)); \ +__asm__ __volatile__ ("vmovaps %%ymm4, %0 \n\t" \ + : \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3), \ + "=m" ((s).c3.c1), \ + "=m" ((s).c3.c2)); \ +__asm__ __volatile__ ("vmovaps %%ymm5, %0 \n\t" \ + : \ + "=m" ((s).c3.c3), \ + "=m" ((s).c4.c1), \ + "=m" ((s).c4.c2), \ + "=m" ((s).c4.c3)) + +/* +* Loads (z.re,z.re,..,z.re) to ymm12 and (-z.im,z.im,..,z.im) to ymm13.
+*/ + +#define _avx_load_cmplx(z) \ +__asm__ __volatile__ ("vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" \ + "vbroadcastss %0, %%ymm12 \n\t" \ + "vaddsubps %%ymm12, %%ymm13, %%ymm13 \n\t" \ + "vbroadcastss %1, %%ymm12" \ + : \ + : \ + "m" ((z).im), \ + "m" ((z).re) \ + : \ + "xmm12", "xmm13") + +/* +* Loads (z.re,z.re,..,z.re) to ymm14 and (-z.im,z.im,..,z.im) to ymm15. +* ymm14 first receives the broadcast z.im, which vaddsubps applies to the +* zeroed ymm15 to produce the alternating signs; z.re is broadcast last. +*/ + +#define _avx_load_cmplx_up(z) \ +__asm__ __volatile__ ("vxorps %%ymm15, %%ymm15, %%ymm15 \n\t" \ + "vbroadcastss %0, %%ymm14 \n\t" \ + "vaddsubps %%ymm14, %%ymm15, %%ymm15 \n\t" \ + "vbroadcastss %1, %%ymm14" \ + : \ + : \ + "m" ((z).im), \ + "m" ((z).re) \ + : \ + "xmm14", "xmm15") + +/* +* Multiplies the spinor s by the complex number z and assigns the result to +* ymm0,..,ymm2, assuming z was loaded using _avx_load_cmplx(z). The registers +* ymm3,..,ymm5 are used as workspace: they receive s with the real and +* imaginary parts swapped (vpermilps $0xb1) times ymm13, which is then +* added to s times ymm12. +*/ + +#define _avx_mulc_spinor(s) \ +_avx_spinor_load(s); \ +__asm__ __volatile__ ("vpermilps $0xb1, %%ymm0, %%ymm3 \n\t" \ + "vpermilps $0xb1, %%ymm1, %%ymm4 \n\t" \ + "vpermilps $0xb1, %%ymm2, %%ymm5 \n\t" \ + "vmulps %%ymm12, %%ymm0, %%ymm0 \n\t" \ + "vmulps %%ymm13, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm12, %%ymm1, %%ymm1 \n\t" \ + "vmulps %%ymm13, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm12, %%ymm2, %%ymm2 \n\t" \ + "vmulps %%ymm13, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5"); \ +__asm__ __volatile__ ("vaddps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddps %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies the spinor s by the complex number z and adds the result to +* ymm0,..,ymm2, assuming z was loaded using _avx_load_cmplx_up(z). The +* registers ymm3,..,ymm8 are used as workspace.
+*/ + +#define _avx_mulc_spinor_add(s) \ +_avx_spinor_load_up(s); \ +__asm__ __volatile__ ("vpermilps $0xb1, %%ymm3, %%ymm6 \n\t" \ + "vpermilps $0xb1, %%ymm4, %%ymm7 \n\t" \ + "vpermilps $0xb1, %%ymm5, %%ymm8 \n\t" \ + "vmulps %%ymm14, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm15, %%ymm6, %%ymm6 \n\t" \ + "vmulps %%ymm14, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm15, %%ymm7, %%ymm7 \n\t" \ + "vmulps %%ymm14, %%ymm5, %%ymm5 \n\t" \ + "vmulps %%ymm15, %%ymm8, %%ymm8 \n\t" \ + "vaddps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddps %%ymm5, %%ymm2, %%ymm2 \n\t" \ + "vaddps %%ymm6, %%ymm0, %%ymm0 \n\t" \ + "vaddps %%ymm7, %%ymm1, %%ymm1 \n\t" \ + "vaddps %%ymm8, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8") + +/* +* Loads (c,c,..,c) to ymm12 and ymm13. c is passed as an "m" operand and +* read with vbroadcastss, so it must be a single precision lvalue. +*/ + +#define _avx_load_real(c) \ +__asm__ __volatile__ ("vbroadcastss %0, %%ymm12 \n\t" \ + "vbroadcastss %0, %%ymm13" \ + : \ + : \ + "m" (c) \ + : \ + "xmm12", "xmm13") + +/* +* Loads (c,c,..,c) to ymm14 and ymm15. c is passed as an "m" operand and +* read with vbroadcastss, so it must be a single precision lvalue. +*/ + +#define _avx_load_real_up(c) \ +__asm__ __volatile__ ("vbroadcastss %0, %%ymm14 \n\t" \ + "vbroadcastss %0, %%ymm15" \ + : \ + : \ + "m" (c) \ + : \ + "xmm14", "xmm15") + +/* +* Multiplies the spinor s by the real number c and assigns the result to +* ymm0,..,ymm2, assuming c was loaded using _avx_load_real(c). ymm12 scales +* ymm0 and ymm2, ymm13 scales ymm1; both are left unchanged. +*/ + +#define _avx_mulr_spinor(s) \ +_avx_spinor_load(s); \ +__asm__ __volatile__ ("vmulps %%ymm12, %%ymm0, %%ymm0 \n\t" \ + "vmulps %%ymm13, %%ymm1, %%ymm1 \n\t" \ + "vmulps %%ymm12, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies the spinor s by the real number c and adds the result to +* ymm0,..,ymm2, assuming c was loaded using _avx_load_real_up(c). The +* registers ymm3,..,ymm5 are used as workspace.
+*/ + +#define _avx_mulr_spinor_add(s) \ +_avx_spinor_load_up(s); \ +__asm__ __volatile__ ("vmulps %%ymm14, %%ymm3, %%ymm3 \n\t" \ + "vmulps %%ymm15, %%ymm4, %%ymm4 \n\t" \ + "vmulps %%ymm14, %%ymm5, %%ymm5 \n\t" \ + "vaddps %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddps %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddps %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/******************************************************************************* +* +* Macros operating on double precision data +* +*******************************************************************************/ + +/******************************************************************************* +* +* Macros for su3_vector data +* +* Most of these macros operate on pairs of su3 vectors that are stored +* in the low and high lanes of ymm0,..,ymm2 or ymm3,..,ymm5. For example, +* +* ymm0 <- sl.c1.re,sl.c1.im,sh.c1.re,sh.c1.im +* ymm1 <- sl.c2.re,sl.c2.im,sh.c2.re,sh.c2.im +* ymm2 <- sl.c3.re,sl.c3.im,sh.c3.re,sh.c3.im +* +* (where sl and sh are of type su3_vector). +* +*******************************************************************************/ + +/* +* Loads two su3 vectors sl and sh to the low and high lanes of ymm0,..,ymm2. +* The low lanes are read with 128-bit vmovapd (which needs 16-byte aligned +* components of sl), the high lanes with vinsertf128 from sh. +*/ + +#define _avx_pair_load_dble(sl,sh) \ +__asm__ __volatile__ ("vmovapd %0, %%xmm0 \n\t" \ + "vmovapd %1, %%xmm1 \n\t" \ + "vmovapd %2, %%xmm2 \n\t" \ + "vinsertf128 $0x1, %3, %%ymm0, %%ymm0 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm1, %%ymm1 \n\t" \ + "vinsertf128 $0x1, %5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" ((sl).c1), \ + "m" ((sl).c2), \ + "m" ((sl).c3), \ + "m" ((sh).c1), \ + "m" ((sh).c2), \ + "m" ((sh).c3) \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Loads two su3 vectors sl and sh to the low and high lanes of ymm3,..,ymm5.
+*/ + +#define _avx_pair_load_up_dble(sl,sh) \ +__asm__ __volatile__ ("vmovapd %0, %%xmm3 \n\t" \ + "vmovapd %1, %%xmm4 \n\t" \ + "vmovapd %2, %%xmm5 \n\t" \ + "vinsertf128 $0x1, %3, %%ymm3, %%ymm3 \n\t" \ + "vinsertf128 $0x1, %4, %%ymm4, %%ymm4 \n\t" \ + "vinsertf128 $0x1, %5, %%ymm5, %%ymm5" \ + : \ + : \ + "m" ((sl).c1), \ + "m" ((sl).c2), \ + "m" ((sl).c3), \ + "m" ((sh).c1), \ + "m" ((sh).c2), \ + "m" ((sh).c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Stores the low and high lanes of ymm0,..,ymm2 to the su3 vectors rl and rh. +* The low lanes go to rl with 128-bit vmovapd (16-byte aligned destination +* required), the high lanes to rh with vextractf128. +*/ + +#define _avx_pair_store_dble(rl,rh) \ +__asm__ __volatile__ ("vmovapd %%xmm0, %0 \n\t" \ + "vmovapd %%xmm1, %1 \n\t" \ + "vmovapd %%xmm2, %2 \n\t" \ + "vextractf128 $0x1, %%ymm0, %3 \n\t" \ + "vextractf128 $0x1, %%ymm1, %4 \n\t" \ + "vextractf128 $0x1, %%ymm2, %5" \ + : \ + "=m" ((rl).c1), \ + "=m" ((rl).c2), \ + "=m" ((rl).c3), \ + "=m" ((rh).c1), \ + "=m" ((rh).c2), \ + "=m" ((rh).c3)) + +/* +* Stores the low and high lanes of ymm3,..,ymm5 to the su3 vectors rl and rh. +* The low lanes go to rl with 128-bit vmovapd (16-byte aligned destination +* required), the high lanes to rh with vextractf128. +*/ + +#define _avx_pair_store_up_dble(rl,rh) \ +__asm__ __volatile__ ("vmovapd %%xmm3, %0 \n\t" \ + "vmovapd %%xmm4, %1 \n\t" \ + "vmovapd %%xmm5, %2 \n\t" \ + "vextractf128 $0x1, %%ymm3, %3 \n\t" \ + "vextractf128 $0x1, %%ymm4, %4 \n\t" \ + "vextractf128 $0x1, %%ymm5, %5" \ + : \ + "=m" ((rl).c1), \ + "=m" ((rl).c2), \ + "=m" ((rl).c3), \ + "=m" ((rh).c1), \ + "=m" ((rh).c2), \ + "=m" ((rh).c3)) + +/* +* Loads the components of a Weyl spinor s to ymm0,..,ymm2 in linear order. +* Only operands %0, %2 and %4 are named in the template; each 256-bit +* vmovapd presumably also covers the operand that follows it (this relies +* on the components being contiguous, which is not visible here). +*/ + +#define _avx_weyl_load_dble(s) \ +__asm__ __volatile__ ("vmovapd %0, %%ymm0 \n\t" \ + "vmovapd %2, %%ymm1 \n\t" \ + "vmovapd %4, %%ymm2" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Loads the components of a Weyl spinor s to ymm3,..,ymm5 in linear order.
+*/ + +#define _avx_weyl_load_up_dble(s) \ +__asm__ __volatile__ ("vmovapd %0, %%ymm3 \n\t" \ + "vmovapd %2, %%ymm4 \n\t" \ + "vmovapd %4, %%ymm5" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Stores ymm0,..,ymm2 to the components of a Weyl spinor s in linear order. +* Only operands %0, %2 and %4 are named in the template; the odd-numbered +* "=m" operands just declare the remaining components as outputs. +*/ + +#define _avx_weyl_store_dble(s) \ +__asm__ __volatile__ ("vmovapd %%ymm0, %0 \n\t" \ + "vmovapd %%ymm1, %2 \n\t" \ + "vmovapd %%ymm2, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)) + +/* +* Stores ymm3,..,ymm5 to the components of a Weyl spinor s in linear order. +* Only operands %0, %2 and %4 are named in the template; the odd-numbered +* "=m" operands just declare the remaining components as outputs. +*/ + +#define _avx_weyl_store_up_dble(s) \ +__asm__ __volatile__ ("vmovapd %%ymm3, %0 \n\t" \ + "vmovapd %%ymm4, %2 \n\t" \ + "vmovapd %%ymm5, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)) + +/* +* Adds ymm3,..,ymm5 to ymm0,..,ymm2. The registers ymm3,..,ymm5 are only read. +*/ + +#define _avx_vector_add_dble() \ +__asm__ __volatile__ ("vaddpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Subtracts ymm3,..,ymm5 from ymm0,..,ymm2. The registers ymm3,..,ymm5 are +* only read. +*/ + +#define _avx_vector_sub_dble() \ +__asm__ __volatile__ ("vsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vsubpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vsubpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies the high lanes of ymm3,..,ymm5 by -1 and adds these registers +* to ymm0,..,ymm2.
+*/ + +#define _avx_vector_addsub_dble() \ +__asm__ __volatile__ ("vmulpd %0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %0, %%ymm5, %%ymm5 \n\t" \ + "vaddpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" (_avx_sgn34_dble) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the low lanes of ymm3,..,ymm5 by -1 and adds these registers +* to ymm0,..,ymm2. The sign flip is a vmulpd by the constant _avx_sgn12_dble +* (defined outside this section) and is written back to ymm3,..,ymm5. +*/ + +#define _avx_vector_subadd_dble() \ +__asm__ __volatile__ ("vmulpd %0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %0, %%ymm5, %%ymm5 \n\t" \ + "vaddpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" (_avx_sgn12_dble) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the registers ymm3,..,ymm5 by i and adds them to ymm0,..,ymm2. +* This is done by swapping the real and imaginary parts (vpermilpd $0x5) +* and accumulating with vaddsubpd; ymm3,..,ymm5 are left in swapped form. +*/ + +#define _avx_vector_i_add_dble() \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm3, %%ymm3 \n\t" \ + "vpermilpd $0x5, %%ymm4, %%ymm4 \n\t" \ + "vpermilpd $0x5, %%ymm5, %%ymm5 \n\t" \ + "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the registers ymm3,..,ymm5 by i and subtracts them from +* ymm0,..,ymm2.
+*/ + +#define _avx_vector_i_sub_dble() \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm3, %%ymm3 \n\t" \ + "vpermilpd $0x5, %%ymm4, %%ymm4 \n\t" \ + "vpermilpd $0x5, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %0, %%ymm5, %%ymm5 \n\t" \ + "vaddpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" (_avx_sgn24_dble) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low lanes of ymm3,..,ymm5, multiplies them by i +* and adds the result to ymm0,..,ymm2. On exit ymm3,..,ymm5 hold the +* lane-exchanged vectors with real and imaginary parts swapped. +*/ + +#define _avx_vector_xch_i_add_dble() \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm3, %%ymm3 \n\t" \ + "vpermilpd $0x5, %%ymm4, %%ymm4 \n\t" \ + "vpermilpd $0x5, %%ymm5, %%ymm5 \n\t" \ + "vperm2f128 $0x1, %%ymm3, %%ymm3, %%ymm3 \n\t" \ + "vperm2f128 $0x1, %%ymm4, %%ymm4, %%ymm4 \n\t" \ + "vperm2f128 $0x1, %%ymm5, %%ymm5, %%ymm5 \n\t" \ + "vaddsubpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddsubpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddsubpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low lanes of ymm3,..,ymm5, multiplies them by i +* and subtracts the result from ymm0,..,ymm2.
+*/ + +#define _avx_vector_xch_i_sub_dble() \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm3, %%ymm3 \n\t" \ + "vpermilpd $0x5, %%ymm4, %%ymm4 \n\t" \ + "vpermilpd $0x5, %%ymm5, %%ymm5 \n\t" \ + "vperm2f128 $0x1, %%ymm3, %%ymm3, %%ymm3 \n\t" \ + "vperm2f128 $0x1, %%ymm4, %%ymm4, %%ymm4 \n\t" \ + "vperm2f128 $0x1, %%ymm5, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %0, %%ymm5, %%ymm5 \n\t" \ + "vaddpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" (_avx_sgn24_dble) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the low and high lanes of ymm3,..,ymm5 by i and -i +* respectively and adds these registers to ymm0,..,ymm2. The signs come +* from the constant _avx_sgn14_dble (defined outside this section) and +* ymm3,..,ymm5 are overwritten with the products. +*/ + +#define _avx_vector_i_addsub_dble() \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm3, %%ymm3 \n\t" \ + "vpermilpd $0x5, %%ymm4, %%ymm4 \n\t" \ + "vpermilpd $0x5, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %0, %%ymm5, %%ymm5 \n\t" \ + "vaddpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" (_avx_sgn14_dble) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the low and high lanes of ymm3,..,ymm5 by -i and i +* respectively and adds these registers to ymm0,..,ymm2.
+*/ + +#define _avx_vector_i_subadd_dble() \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm3, %%ymm3 \n\t" \ + "vpermilpd $0x5, %%ymm4, %%ymm4 \n\t" \ + "vpermilpd $0x5, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %0, %%ymm5, %%ymm5 \n\t" \ + "vaddpd %%ymm3, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm4, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" (_avx_sgn23_dble) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low lanes of ymm3,..,ymm5 (vperm2f128 $0x1 with +* both sources equal to the destination). +*/ + +#define _avx_vector_xch_dble() \ +__asm__ __volatile__ ("vperm2f128 $0x1, %%ymm3, %%ymm3, %%ymm3 \n\t" \ + "vperm2f128 $0x1, %%ymm4, %%ymm4, %%ymm4 \n\t" \ + "vperm2f128 $0x1, %%ymm5, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/****************************************************************************** +* +* Action of su3 matrices on su3 vectors +* +******************************************************************************/ + +/* +* Multiplies a pair sl,sh of su3 vectors by an su3 matrix u, assuming sl and +* sh are in the low and high lanes of ymm0,..,ymm2. On output the result is +* in ymm3,..,ymm5 and all registers except for ymm12,..,ymm15 are changed.
+*/ + +/* +* Implementation notes for _avx_su3_multiply_pair_dble: the six asm +* statements alternate between broadcasting elements of u and accumulating +* into ymm3,..,ymm5. Products with the real parts of u use ymm0,..,ymm2 as +* loaded and are summed with vaddpd. For the products with the imaginary +* parts, the real and imaginary parts of ymm0,..,ymm2 are swapped in place +* (vpermilpd $0x5) and the terms are accumulated with vaddsubpd. On exit +* ymm0,..,ymm2 are therefore left in swapped form and ymm6,..,ymm11 hold +* scratch values. The statements communicate through registers only, so +* their order must not be changed. +*/ + +#define _avx_su3_multiply_pair_dble(u) \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm3 \n\t" \ + "vbroadcastsd %1, %%ymm6 \n\t" \ + "vbroadcastsd %2, %%ymm4 \n\t" \ + "vbroadcastsd %3, %%ymm7 \n\t" \ + "vbroadcastsd %4, %%ymm5 \n\t" \ + "vbroadcastsd %5, %%ymm8" \ + : \ + : \ + "m" ((u).c11.re), \ + "m" ((u).c12.re), \ + "m" ((u).c21.re), \ + "m" ((u).c22.re), \ + "m" ((u).c31.re), \ + "m" ((u).c32.re) \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vmulpd %%ymm0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm1, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %%ymm1, %%ymm8, %%ymm8 \n\t" \ + "vaddpd %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddpd %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddpd %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm9 \n\t" \ + "vbroadcastsd %1, %%ymm10 \n\t" \ + "vbroadcastsd %2, %%ymm11 \n\t" \ + "vbroadcastsd %3, %%ymm6 \n\t" \ + "vbroadcastsd %4, %%ymm7 \n\t" \ + "vbroadcastsd %5, %%ymm8 \n\t" \ + "vpermilpd $0x5, %%ymm0, %%ymm0 \n\t" \ + : \ + : \ + "m" ((u).c13.re), \ + "m" ((u).c21.im), \ + "m" ((u).c33.re), \ + "m" ((u).c11.im), \ + "m" ((u).c23.re), \ + "m" ((u).c31.im) \ + : \ + "xmm0", "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vmulpd %%ymm2, %%ymm9, %%ymm9 \n\t" \ + "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" \ + "vmulpd %%ymm2, %%ymm11, %%ymm11 \n\t" \ + "vmulpd %%ymm0, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm2, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" \ + "vaddpd %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddsubpd %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddpd %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddsubpd %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddpd %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddsubpd %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \
+ "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm9 \n\t" \ + "vbroadcastsd %1, %%ymm10 \n\t" \ + "vbroadcastsd %2, %%ymm11 \n\t" \ + "vbroadcastsd %3, %%ymm6 \n\t" \ + "vbroadcastsd %4, %%ymm7 \n\t" \ + "vbroadcastsd %5, %%ymm8 \n\t" \ + "vpermilpd $0x5, %%ymm1, %%ymm1 \n\t" \ + "vpermilpd $0x5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" ((u).c12.im), \ + "m" ((u).c23.im), \ + "m" ((u).c32.im), \ + "m" ((u).c13.im), \ + "m" ((u).c22.im), \ + "m" ((u).c33.im) \ + : \ + "xmm1", "xmm2", "xmm6", "xmm7", \ + "xmm8", "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vmulpd %%ymm1, %%ymm9, %%ymm9 \n\t" \ + "vmulpd %%ymm2, %%ymm10, %%ymm10 \n\t" \ + "vmulpd %%ymm1, %%ymm11, %%ymm11 \n\t" \ + "vmulpd %%ymm2, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm2, %%ymm8, %%ymm8 \n\t" \ + "vaddsubpd %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddsubpd %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddsubpd %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vaddsubpd %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddsubpd %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddsubpd %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +/* +* Multiplies a pair sl,sh of su3 vectors by an su3 matrix u^dagger, assuming +* sl and sh are in the low and high lanes of ymm0,..,ymm2. On output the +* result is in ymm3,..,ymm5 and all registers are changed.
+*/ + +/* +* Implementation notes for _avx_su3_inverse_multiply_pair_dble: the +* elements of u are read in transposed order (c21 where the direct product +* uses c12, etc.). Products with the real parts use ymm0,..,ymm2 as loaded +* and are summed with vaddpd. ymm15 is zeroed and used to negate each of +* ymm0,..,ymm2 (vsubpd from zero) before its real and imaginary parts are +* swapped (vpermilpd $0x5); the negated, swapped vectors are multiplied by +* the imaginary parts of u and accumulated with vaddsubpd, which gives the +* complex-conjugated coefficients. ymm6,..,ymm14 hold scratch values on +* exit. The statements communicate through registers only, so their order +* must not be changed. +*/ + +#define _avx_su3_inverse_multiply_pair_dble(u) \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm3 \n\t" \ + "vbroadcastsd %1, %%ymm6 \n\t" \ + "vbroadcastsd %2, %%ymm4 \n\t" \ + "vbroadcastsd %3, %%ymm7 \n\t" \ + "vbroadcastsd %4, %%ymm5 \n\t" \ + "vbroadcastsd %5, %%ymm8 \n\t" \ + "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" \ + "vmulpd %%ymm0, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm1, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm0, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %%ymm1, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm0, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %%ymm1, %%ymm8, %%ymm8 \n\t" \ + "vaddpd %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vsubpd %%ymm0, %%ymm15, %%ymm0 \n\t" \ + "vaddpd %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddpd %%ymm8, %%ymm5, %%ymm5 \n\t" \ + "vpermilpd $0x5, %%ymm0, %%ymm0" \ + : \ + : \ + "m" ((u).c11.re), \ + "m" ((u).c21.re), \ + "m" ((u).c12.re), \ + "m" ((u).c22.re), \ + "m" ((u).c13.re), \ + "m" ((u).c23.re) \ + : \ + "xmm0", "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", "xmm15"); \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm9 \n\t" \ + "vbroadcastsd %1, %%ymm10 \n\t" \ + "vbroadcastsd %2, %%ymm11 \n\t" \ + "vbroadcastsd %3, %%ymm12 \n\t" \ + "vbroadcastsd %4, %%ymm13 \n\t" \ + "vbroadcastsd %5, %%ymm14 \n\t" \ + "vsubpd %%ymm1, %%ymm15, %%ymm1 \n\t" \ + "vmulpd %%ymm2, %%ymm9, %%ymm9 \n\t" \ + "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" \ + "vmulpd %%ymm2, %%ymm11, %%ymm11 \n\t" \ + "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" \ + "vaddpd %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm2, %%ymm13, %%ymm13 \n\t" \ + "vpermilpd $0x5, %%ymm1, %%ymm1 \n\t" \ + "vaddsubpd %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vsubpd %%ymm2, %%ymm15, %%ymm2 \n\t" \ + "vaddpd %%ymm11, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %%ymm0, %%ymm14, %%ymm14 \n\t" \ + "vpermilpd $0x5, %%ymm2, %%ymm2" \ + : \ + : \ + "m" ((u).c31.re), \ + "m" ((u).c12.im), \ + "m" ((u).c33.re), \ + "m" ((u).c11.im), \ + "m" ((u).c32.re), \ + "m" ((u).c13.im) \ + : \ + "xmm1", "xmm2", "xmm3", "xmm4", \ + "xmm5", "xmm9", "xmm10",
"xmm11", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm6 \n\t" \ + "vbroadcastsd %1, %%ymm7 \n\t" \ + "vbroadcastsd %2, %%ymm8 \n\t" \ + "vbroadcastsd %3, %%ymm9 \n\t" \ + "vbroadcastsd %4, %%ymm10 \n\t" \ + "vbroadcastsd %5, %%ymm11 \n\t" \ + "vmulpd %%ymm1, %%ymm6, %%ymm6 \n\t" \ + "vaddsubpd %%ymm12, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm2, %%ymm7, %%ymm7 \n\t" \ + "vaddpd %%ymm13, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %%ymm1, %%ymm8, %%ymm8 \n\t" \ + "vaddsubpd %%ymm14, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %%ymm2, %%ymm9, %%ymm9 \n\t" \ + "vaddsubpd %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm1, %%ymm10, %%ymm10 \n\t" \ + "vaddsubpd %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %%ymm2, %%ymm11, %%ymm11 \n\t" \ + "vaddsubpd %%ymm8, %%ymm5, %%ymm5" \ + : \ + : \ + "m" ((u).c21.im), \ + "m" ((u).c32.im), \ + "m" ((u).c23.im), \ + "m" ((u).c31.im), \ + "m" ((u).c22.im), \ + "m" ((u).c33.im) \ + : \ + "xmm3", "xmm4", "xmm5", "xmm6", \ + "xmm7", "xmm8", "xmm9", "xmm10", \ + "xmm11"); \ +__asm__ __volatile__ ("vaddsubpd %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddsubpd %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddsubpd %%ymm11, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/****************************************************************************** +* +* Macros for double precision Dirac spinors in linear order. +* +******************************************************************************/ + +/* +* Loads the spinor s to the registers ymm0,..,ymm5 in linear order.
+*/ + +#define _avx_spinor_load_dble(s) /* the template references only operands %0, %2 and %4; presumably each 256-bit vmovapd covers two consecutive components and %1, %3, %5 are listed so that the compiler knows they are read as well (TODO confirm). vmovapd requires 32-byte aligned memory operands. */ \ +__asm__ __volatile__ ("vmovapd %0, %%ymm0 \n\t" \ + "vmovapd %2, %%ymm1 \n\t" \ + "vmovapd %4, %%ymm2" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm0", "xmm1", "xmm2"); \ +__asm__ __volatile__ ("vmovapd %0, %%ymm3 \n\t" \ + "vmovapd %2, %%ymm4 \n\t" \ + "vmovapd %4, %%ymm5" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Loads the spinor s to the registers ymm6,..,ymm11 in linear order. Two asm statements: components c1,c2 go to ymm6..ymm8 and c3,c4 to ymm9..ymm11; only operands %0, %2, %4 appear in each template. +*/ + +#define _avx_spinor_load_up_dble(s) \ +__asm__ __volatile__ ("vmovapd %0, %%ymm6 \n\t" \ + "vmovapd %2, %%ymm7 \n\t" \ + "vmovapd %4, %%ymm8" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vmovapd %0, %%ymm9 \n\t" \ + "vmovapd %2, %%ymm10 \n\t" \ + "vmovapd %4, %%ymm11" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm9", "xmm10", "xmm11") + +/* +* Stores the registers ymm0,..,ymm5 to the spinor s in linear order. All twelve components are declared as "=m" outputs although only %0, %2, %4 appear in each template; no register is written, so there is no clobber list. +*/ + +#define _avx_spinor_store_dble(s) \ +__asm__ __volatile__ ("vmovapd %%ymm0, %0 \n\t" \ + "vmovapd %%ymm1, %2 \n\t" \ + "vmovapd %%ymm2, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)); \ +__asm__ __volatile__ ("vmovapd %%ymm3, %0 \n\t" \ + "vmovapd %%ymm4, %2 \n\t" \ + "vmovapd %%ymm5, %4" \ + : \ + "=m" ((s).c3.c1), \ + "=m" ((s).c3.c2), \ + "=m" ((s).c3.c3), \ + "=m" ((s).c4.c1), \ + "=m" ((s).c4.c2), \ + "=m" ((s).c4.c3)) + +/* +* Stores the registers ymm6,..,ymm11 to the spinor s in linear order. 
+*/ + +#define _avx_spinor_store_up_dble(s) /* ymm6..ymm8 go to components c1,c2 and ymm9..ymm11 to c3,c4; outputs %1, %3, %5 are declared but not referenced by the templates */ \ +__asm__ __volatile__ ("vmovapd %%ymm6, %0 \n\t" \ + "vmovapd %%ymm7, %2 \n\t" \ + "vmovapd %%ymm8, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)); \ +__asm__ __volatile__ ("vmovapd %%ymm9, %0 \n\t" \ + "vmovapd %%ymm10, %2 \n\t" \ + "vmovapd %%ymm11, %4" \ + : \ + "=m" ((s).c3.c1), \ + "=m" ((s).c3.c2), \ + "=m" ((s).c3.c3), \ + "=m" ((s).c4.c1), \ + "=m" ((s).c4.c2), \ + "=m" ((s).c4.c3)) + +/* +* Loads (z.re,z.re,z.re,z.re) to ymm12 and (-z.im,z.im,-z.im,z.im) to ymm13. ymm12 temporarily holds z.im: vaddsubpd applied to the zeroed ymm13 produces the alternating signs, after which ymm12 is overwritten with z.re. +*/ + +#define _avx_load_cmplx_dble(z) \ +__asm__ __volatile__ ("vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" \ + "vbroadcastsd %0, %%ymm12 \n\t" \ + "vaddsubpd %%ymm12, %%ymm13, %%ymm13 \n\t" \ + "vbroadcastsd %1, %%ymm12" \ + : \ + : \ + "m" ((z).im), \ + "m" ((z).re) \ + : \ + "xmm12", "xmm13") + +/* +* Loads (z.re,z.re,z.re,z.re) to ymm14 and (-z.im,z.im,-z.im,z.im) to ymm15. Same instruction sequence as _avx_load_cmplx_dble with ymm14/ymm15 in place of ymm12/ymm13. +*/ + +#define _avx_load_cmplx_up_dble(z) \ +__asm__ __volatile__ ("vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" \ + "vbroadcastsd %0, %%ymm14 \n\t" \ + "vaddsubpd %%ymm14, %%ymm15, %%ymm15 \n\t" \ + "vbroadcastsd %1, %%ymm14" \ + : \ + : \ + "m" ((z).im), \ + "m" ((z).re) \ + : \ + "xmm14", "xmm15") + +/* +* Multiplies the spinor s by the complex number z and assigns the result to +* ymm0,..,ymm5, assuming z was loaded using _avx_load_cmplx_dble(z). The +* registers ymm6,..,ymm11 are used as workspace. 
+*/ + +#define _avx_mulc_spinor_dble(s) /* loads s via _avx_spinor_load_dble, then computes ymmK = ymm12*ymmK + ymm13*swap(ymmK) for K=0..5, where swap (vpermilpd $0x5) exchanges the two doubles of each 128-bit lane; the swapped copies live in ymm6..ymm11 */ \ +_avx_spinor_load_dble(s); \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm0, %%ymm6 \n\t" \ + "vpermilpd $0x5, %%ymm1, %%ymm7 \n\t" \ + "vpermilpd $0x5, %%ymm2, %%ymm8 \n\t" \ + "vpermilpd $0x5, %%ymm3, %%ymm9 \n\t" \ + "vpermilpd $0x5, %%ymm4, %%ymm10 \n\t" \ + "vpermilpd $0x5, %%ymm5, %%ymm11 \n\t" \ + "vmulpd %%ymm12, %%ymm0, %%ymm0 \n\t" \ + "vmulpd %%ymm13, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm12, %%ymm1, %%ymm1 \n\t" \ + "vmulpd %%ymm13, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm12, %%ymm2, %%ymm2 \n\t" \ + "vmulpd %%ymm13, %%ymm8, %%ymm8" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); /* second statement: scales ymm3..ymm5 and ymm9..ymm11, then sums the six pairs */ \ +__asm__ __volatile__ ("vmulpd %%ymm12, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm13, %%ymm9, %%ymm9 \n\t" \ + "vmulpd %%ymm12, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %%ymm13, %%ymm10, %%ymm10 \n\t" \ + "vmulpd %%ymm12, %%ymm5, %%ymm5 \n\t" \ + "vmulpd %%ymm13, %%ymm11, %%ymm11 \n\t" \ + "vaddpd %%ymm6, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm7, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm8, %%ymm2, %%ymm2 \n\t" \ + "vaddpd %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddpd %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddpd %%ymm11, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5", \ + "xmm9", "xmm10", "xmm11") + +/* +* Multiplies the spinor s by the complex number z and adds the result to +* ymm0,..,ymm5, assuming z was loaded using _avx_load_cmplx_up_dble(z). The +* registers ymm6,..,ymm11 are used as workspace. 
+*/ + +#define _avx_mulc_spinor_add_dble(s) /* two passes of load + multiply-accumulate: components c1,c2 of s are added into ymm0..ymm2, then c3,c4 into ymm3..ymm5; each pass loads into ymm6..ymm8 and keeps lane-swapped copies in ymm9..ymm11, scaling by ymm14 and ymm15 respectively */ \ +__asm__ __volatile__ ("vmovapd %0, %%ymm6 \n\t" \ + "vmovapd %2, %%ymm7 \n\t" \ + "vmovapd %4, %%ymm8" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm6, %%ymm9 \n\t" \ + "vpermilpd $0x5, %%ymm7, %%ymm10 \n\t" \ + "vpermilpd $0x5, %%ymm8, %%ymm11 \n\t" \ + "vmulpd %%ymm14, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm15, %%ymm9, %%ymm9 \n\t" \ + "vmulpd %%ymm14, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm15, %%ymm10, %%ymm10 \n\t" \ + "vmulpd %%ymm14, %%ymm8, %%ymm8 \n\t" \ + "vmulpd %%ymm15, %%ymm11, %%ymm11 \n\t" \ + "vaddpd %%ymm6, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm7, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm8, %%ymm2, %%ymm2 \n\t" \ + "vaddpd %%ymm9, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm10, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm11, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("vmovapd %0, %%ymm6 \n\t" \ + "vmovapd %2, %%ymm7 \n\t" \ + "vmovapd %4, %%ymm8" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vpermilpd $0x5, %%ymm6, %%ymm9 \n\t" \ + "vpermilpd $0x5, %%ymm7, %%ymm10 \n\t" \ + "vpermilpd $0x5, %%ymm8, %%ymm11 \n\t" \ + "vmulpd %%ymm14, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm15, %%ymm9, %%ymm9 \n\t" \ + "vmulpd %%ymm14, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm15, %%ymm10, %%ymm10 \n\t" \ + "vmulpd %%ymm14, %%ymm8, %%ymm8 \n\t" \ + "vmulpd %%ymm15, %%ymm11, %%ymm11 \n\t" \ + "vaddpd %%ymm6, %%ymm3, %%ymm3 \n\t" \ + "vaddpd %%ymm7, %%ymm4, %%ymm4 \n\t" \ + "vaddpd %%ymm8, %%ymm5, %%ymm5 \n\t" \ + "vaddpd %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddpd %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddpd %%ymm11, %%ymm5, 
%%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +/* +* Loads (c,c,c,c) to ymm12 and ymm13. Both broadcasts read the same memory operand %0. +*/ + +#define _avx_load_real_dble(c) \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm12 \n\t" \ + "vbroadcastsd %0, %%ymm13" \ + : \ + : \ + "m" (c) \ + : \ + "xmm12", "xmm13") + +/* +* Loads (c,c,c,c) to ymm14 and ymm15. Both broadcasts read the same memory operand %0. +*/ + +#define _avx_load_real_up_dble(c) \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm14 \n\t" \ + "vbroadcastsd %0, %%ymm15" \ + : \ + : \ + "m" (c) \ + : \ + "xmm14", "xmm15") + +/* +* Multiplies the spinor s by the real number c and assigns the result to +* ymm0,..,ymm5, assuming c was loaded using _avx_load_real_dble(c). The macro first loads s with _avx_spinor_load_dble and then uses ymm12 and ymm13 alternately as the factor. +*/ + +#define _avx_mulr_spinor_dble(s) \ +_avx_spinor_load_dble(s); \ +__asm__ __volatile__ ("vmulpd %%ymm12, %%ymm0, %%ymm0 \n\t" \ + "vmulpd %%ymm13, %%ymm1, %%ymm1 \n\t" \ + "vmulpd %%ymm12, %%ymm2, %%ymm2 \n\t" \ + "vmulpd %%ymm13, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm12, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %%ymm13, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm0", "xmm1", "xmm2") + + +/* +* Multiplies the spinor s by the real number c and adds the result to +* ymm0,..,ymm5, assuming c was loaded using _avx_load_real_up_dble(c). +* The registers ymm6,..,ymm11 are used as workspace. 
+*/ + +#define _avx_mulr_spinor_add_dble(s) /* loads s into ymm6..ymm11 via _avx_spinor_load_up_dble, scales those registers by ymm14 and ymm15 alternately, then adds them to ymm0..ymm5 */ \ +_avx_spinor_load_up_dble(s); \ +__asm__ __volatile__ ("vmulpd %%ymm14, %%ymm6, %%ymm6 \n\t" \ + "vmulpd %%ymm15, %%ymm7, %%ymm7 \n\t" \ + "vmulpd %%ymm14, %%ymm8, %%ymm8 \n\t" \ + "vmulpd %%ymm15, %%ymm9, %%ymm9 \n\t" \ + "vmulpd %%ymm14, %%ymm10, %%ymm10 \n\t" \ + "vmulpd %%ymm15, %%ymm11, %%ymm11 \n\t" \ + "vaddpd %%ymm6, %%ymm0, %%ymm0 \n\t" \ + "vaddpd %%ymm7, %%ymm1, %%ymm1 \n\t" \ + "vaddpd %%ymm8, %%ymm2, %%ymm2 \n\t" \ + "vaddpd %%ymm9, %%ymm3, %%ymm3 \n\t" \ + "vaddpd %%ymm10, %%ymm4, %%ymm4 \n\t" \ + "vaddpd %%ymm11, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/block.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/block.h new file mode 100644 index 0000000000000000000000000000000000000000..b8f9fd4bcf7cce8b5b791dff762268338028855c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/block.h @@ -0,0 +1,83 @@ + +/******************************************************************************* +* +* File block.h +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef BLOCK_H +#define BLOCK_H + +#ifndef SU3_H +#include "su3.h" +#endif + +#ifndef UTILS_H +#include "utils.h" +#endif +/* bndry_t and block_t pair fields of type su3/su3_dble, weyl/weyl_dble, pauli/pauli_dble and spinor/spinor_dble. NOTE(review): presumably single- and double-precision copies of the same data; the field meanings are not visible in this header -- confirm against block.c. */ +typedef struct +{ + int ifc,ibn,vol,nw,nwd; + int *ipp,*map,*imb; + su3 *u; + su3_dble *ud; + weyl **w; + weyl_dble **wd; +} bndry_t; + +typedef struct +{ + int *bo,*bs,vol,vbb,nbp,ns,nsd,shf; + int *ipt,*imb,*ibp; + int (*iup)[4],(*idn)[4]; + su3 *u; + su3_dble *ud; + pauli *sw; + pauli_dble *swd; + spinor **s; + spinor_dble **sd; + bndry_t *bb; +} block_t; +/* BLK_GRIDS is the last enumerator and therefore equals the number of grid kinds listed before it (2). */ +typedef enum +{ + SAP_BLOCKS,DFL_BLOCKS, + 
BLK_GRIDS +} blk_grid_t; + +/* BLOCK_C */ +extern void alloc_blk(block_t *b,int *bo,int *bs, + int iu,int iud,int ns,int nsd); +extern void alloc_bnd(block_t *b,int iu,int iud,int nw,int nwd); +extern void clone_blk(block_t *b,int shf,int *bo,block_t *c); +extern void free_blk(block_t *b); +extern int ipt_blk(block_t *b,int *x); + +/* BLK_GRID_C */ +extern void alloc_bgr(blk_grid_t grid); +extern block_t *blk_list(blk_grid_t grid,int *nb,int *isw); + +/* MAP_U2BLK_C */ +extern void assign_ud2ubgr(blk_grid_t grid); +extern void assign_ud2udblk(blk_grid_t grid,int n); + +/* MAP_SW2BLK_C */ +extern int assign_swd2swbgr(blk_grid_t grid,ptset_t set); +extern int assign_swd2swdblk(blk_grid_t grid,int n,ptset_t set); + +/* MAP_S2BLK_C */ +extern void assign_s2sblk(blk_grid_t grid,int n,ptset_t set,spinor *s,int k); +extern void assign_sblk2s(blk_grid_t grid,int n,ptset_t set,int k,spinor *s); +extern void assign_s2sdblk(blk_grid_t grid,int n,ptset_t set,spinor *s,int k); +extern void assign_sd2sdblk(blk_grid_t grid,int n,ptset_t set, + spinor_dble *sd,int k); +extern void assign_sdblk2sd(blk_grid_t grid,int n,ptset_t set, + int k,spinor_dble *sd); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/dfl.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/dfl.h new file mode 100644 index 0000000000000000000000000000000000000000..352338dd7bb93d063afbb11af1b2935765b747c6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/dfl.h @@ -0,0 +1,57 @@ + +/******************************************************************************* +* +* File dfl.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef DFL_H +#define DFL_H + +#ifndef SU3_H +#include "su3.h" +#endif +/* dfl_grid_t is returned by value from dfl_geometry(). NOTE(review): the meaning of the individual index arrays is not visible in this header -- see dfl_geometry.c. */ +typedef struct +{ + int nb,nbb; + int 
nbbe[8],nbbo[8]; + int obbe[8],obbo[8]; + int (*inn)[8]; + int *idx,*ipp,*map; +} dfl_grid_t; + +/* DFL_GEOMETRY_C */ +extern dfl_grid_t dfl_geometry(void); + +/* DFL_MODES_C */ +extern void dfl_modes(int *status); +extern void dfl_update(int nsm,int *status); +extern void dfl_modes2(int *status); +extern void dfl_update2(int nsm,int *status); + +/* DFL_SAP_GCR_C */ +extern double dfl_sap_gcr(int nkv,int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status); +extern double dfl_sap_gcr2(int nkv,int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status); + +/* DFL_SUBSPACE_C */ +extern void dfl_sd2vd(spinor_dble *sd,complex_dble *vd); +extern void dfl_vd2sd(complex_dble *vd,spinor_dble *sd); +extern void dfl_sub_vd2sd(complex_dble *vd,spinor_dble *sd); +extern void dfl_s2v(spinor *s,complex *v); +extern void dfl_v2s(complex *v,spinor *s); +extern void dfl_sub_v2s(complex *v,spinor *s); +extern void dfl_subspace(spinor **mds); + +/* LTL_GCR */ +extern double ltl_gcr(int nkv,int nmx,double res,double mu, + complex_dble *eta,complex_dble *psi,int *status); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/dirac.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/dirac.h new file mode 100644 index 0000000000000000000000000000000000000000..4681830214158c8e2a076667f28316330bed9a38 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/dirac.h @@ -0,0 +1,55 @@ + +/******************************************************************************* +* +* File dirac.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef DIRAC_H +#define DIRAC_H + +#ifndef SU3_H +#include "su3.h" +#endif + +#ifndef BLOCK_H +#include "block.h" +#endif + +/* DW_BND_C */ +extern void 
Dw_bnd(blk_grid_t grid,int n,int k,int l); + +/* DW_C: interface taking float mu and spinor arguments */ +extern void Dw(float mu,spinor *s,spinor *r); +extern void Dwee(float mu,spinor *s,spinor *r); +extern void Dwoo(float mu,spinor *s,spinor *r); +extern void Dweo(spinor *s,spinor *r); +extern void Dwoe(spinor *s,spinor *r); +extern void Dwhat(float mu,spinor *s,spinor *r); +extern void Dw_blk(blk_grid_t grid,int n,float mu,int k,int l); +extern void Dwee_blk(blk_grid_t grid,int n,float mu,int k,int l); +extern void Dwoo_blk(blk_grid_t grid,int n,float mu,int k,int l); +extern void Dwoe_blk(blk_grid_t grid,int n,int k,int l); +extern void Dweo_blk(blk_grid_t grid,int n,int k,int l); +extern void Dwhat_blk(blk_grid_t grid,int n,float mu,int k,int l); + +/* DW_DBLE_C: same set of entry points with double mu and spinor_dble arguments */ +extern void Dw_dble(double mu,spinor_dble *s,spinor_dble *r); +extern void Dwee_dble(double mu,spinor_dble *s,spinor_dble *r); +extern void Dwoo_dble(double mu,spinor_dble *s,spinor_dble *r); +extern void Dweo_dble(spinor_dble *s,spinor_dble *r); +extern void Dwoe_dble(spinor_dble *s,spinor_dble *r); +extern void Dwhat_dble(double mu,spinor_dble *s,spinor_dble *r); +extern void Dw_blk_dble(blk_grid_t grid,int n,double mu,int k,int l); +extern void Dwee_blk_dble(blk_grid_t grid,int n,double mu,int k,int l); +extern void Dwoo_blk_dble(blk_grid_t grid,int n,double mu,int k,int l); +extern void Dwoe_blk_dble(blk_grid_t grid,int n,int k,int l); +extern void Dweo_blk_dble(blk_grid_t grid,int n,int k,int l); +extern void Dwhat_blk_dble(blk_grid_t grid,int n,double mu,int k,int l); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags.h new file mode 100644 index 0000000000000000000000000000000000000000..2545d8ecad07f12f83939151a91314e878866fef --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags.h @@ -0,0 +1,313 @@ + 
+/******************************************************************************* +* +* File flags.h +* +* Copyright (C) 2009-2014 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef FLAGS_H +#define FLAGS_H + +#ifndef BLOCK_H +#include "block.h" +#endif +/* In every enum below the last enumerator (EVENTS, QUERIES, ACTIONS, INTEGRATORS, FORCES, RWFACTS, SOLVERS) follows all other values and therefore equals their count. NOTE(review): several prototypes further down take FILE *, but this header only includes block.h; presumably the stdio declarations must already be visible where flags.h is included -- confirm. */ +typedef enum +{ + UPDATED_U,UPDATED_UD,ASSIGNED_UD2U, + COPIED_BND_UD,SET_BSTAP,SHIFTED_UD,COMPUTED_FTS, + ERASED_SW,ERASED_SWD,COMPUTED_SWD,ASSIGNED_SWD2SW, + INVERTED_SW_E,INVERTED_SW_O, + INVERTED_SWD_E,INVERTED_SWD_O, + ASSIGNED_U2UBGR,ASSIGNED_UD2UBGR,ASSIGNED_UD2UDBGR, + ASSIGNED_SWD2SWBGR,ASSIGNED_SWD2SWDBGR, + ERASED_AW,ERASED_AWHAT,COMPUTED_AW,COMPUTED_AWHAT, + EVENTS +} event_t; + +typedef enum +{ + U_MATCH_UD,UDBUF_UP2DATE,BSTAP_UP2DATE, + FTS_UP2DATE,UBGR_MATCH_UD,UDBGR_MATCH_UD, + SW_UP2DATE,SW_E_INVERTED,SW_O_INVERTED, + SWD_UP2DATE,SWD_E_INVERTED,SWD_O_INVERTED, + AW_UP2DATE,AWHAT_UP2DATE, + QUERIES +} query_t; + +typedef enum +{ + ACG,ACF_TM1,ACF_TM1_EO,ACF_TM1_EO_SDET, + ACF_TM2,ACF_TM2_EO,ACF_RAT,ACF_RAT_SDET, + ACTIONS +} action_t; + +typedef enum +{ + LPFR,OMF2,OMF4, + INTEGRATORS +} integrator_t; + +typedef enum +{ + FRG,FRF_TM1,FRF_TM1_EO,FRF_TM1_EO_SDET, + FRF_TM2,FRF_TM2_EO,FRF_RAT,FRF_RAT_SDET, + FORCES +} force_t; + +typedef enum +{ + RWTM1,RWTM1_EO,RWTM2,RWTM2_EO,RWRAT, + RWFACTS +} rwfact_t; + +typedef enum +{ + CGNE,MSCG,SAP_GCR,DFL_SAP_GCR, + SOLVERS +} solver_t; + +typedef struct +{ + action_t action; + int ipf,im0; + int irat[3],imu[4]; + int isp[4]; +} action_parms_t; + +typedef struct +{ + int type; + double cG[2],cF[2]; + double phi[2][3]; +} bc_parms_t; + +typedef struct +{ + int bs[4]; + int Ns; +} dfl_parms_t; + +typedef struct +{ + int nkv,nmx; + double res; +} dfl_pro_parms_t; + +typedef struct +{ + int ninv,nmr,ncy; + double kappa,m0,mu; +} dfl_gen_parms_t; + +typedef struct +{ + int nsm; + double dtau; +} 
dfl_upd_parms_t; + +typedef struct +{ + force_t force; + int ipf,im0; + int irat[3],imu[4]; + int isp[4]; + int ncr[4],icr[4]; +} force_parms_t; + +typedef struct +{ + int npf,nlv; + int nact,nmu; + int *iact; + double tau,*mu; +} hmc_parms_t; + +typedef struct +{ + int nk; + double beta,c0,c1; + double *kappa,*m0; + double csw; +} lat_parms_t; + +typedef struct +{ + integrator_t integrator; + double lambda; + int nstep,nfr; + int *ifr; +} mdint_parms_t; + +typedef struct +{ + int degree; + double range[2]; +} rat_parms_t; + +typedef struct +{ + rwfact_t rwfact; + int im0,nsrc; + int irp,nfct; + double *mu; + int *np,*isp; +} rw_parms_t; + +typedef struct +{ + double m0,csw,cF[2]; +} sw_parms_t; + +typedef struct +{ + int bs[4]; + int isolv; + int nmr,ncy; +} sap_parms_t; + +typedef struct +{ + solver_t solver; + int nmx,nkv; + int isolv,nmr,ncy; + double res; +} solver_parms_t; + +typedef struct +{ + int eoflg; +} tm_parms_t; + +typedef struct +{ + int n; + double eps; +} wflow_parms_t; + +/* FLAGS_C */ +extern void set_flags(event_t event); +extern void set_grid_flags(blk_grid_t grid,event_t event); +extern int query_flags(query_t query); +extern int query_grid_flags(blk_grid_t grid,query_t query); +extern void print_flags(void); +extern void print_grid_flags(blk_grid_t grid); + +/* ACTION_PARMS_C */ +extern action_parms_t set_action_parms(int iact,action_t action,int ipf, + int im0,int *irat,int *imu,int *isp); +extern action_parms_t action_parms(int iact); +extern void read_action_parms(int iact); +extern void print_action_parms(void); +extern void write_action_parms(FILE *fdat); +extern void check_action_parms(FILE *fdat); + +/* DFL_PARMS_C */ +extern dfl_parms_t set_dfl_parms(int *bs,int Ns); +extern dfl_parms_t dfl_parms(void); +extern dfl_pro_parms_t set_dfl_pro_parms(int nkv,int nmx,double res); +extern dfl_pro_parms_t dfl_pro_parms(void); +extern dfl_gen_parms_t set_dfl_gen_parms(double kappa,double mu, + int ninv,int nmr,int ncy); +extern dfl_gen_parms_t 
dfl_gen_parms(void); +extern dfl_upd_parms_t set_dfl_upd_parms(double dtau,int nsm); +extern dfl_upd_parms_t dfl_upd_parms(void); +extern void print_dfl_parms(int ipr); +extern void write_dfl_parms(FILE *fdat); +extern void check_dfl_parms(FILE *fdat); + +/* FORCE_PARMS_C */ +extern force_parms_t set_force_parms(int ifr,force_t force,int ipf,int im0, + int *irat,int *imu,int *isp,int *ncr); +extern force_parms_t force_parms(int ifr); +extern void read_force_parms(int ifr); +extern void read_force_parms2(int ifr); +extern void print_force_parms(void); +extern void print_force_parms2(void); +extern void write_force_parms(FILE *fdat); +extern void check_force_parms(FILE *fdat); + +/* HMC_PARMS_C */ +extern hmc_parms_t set_hmc_parms(int nact,int *iact,int npf, + int nmu,double *mu,int nlv,double tau); +extern hmc_parms_t hmc_parms(void); +extern void print_hmc_parms(void); +extern void write_hmc_parms(FILE *fdat); +extern void check_hmc_parms(FILE *fdat); + +/* LAT_PARMS_C */ +extern lat_parms_t set_lat_parms(double beta,double c0, + int nk,double *kappa,double csw); +extern lat_parms_t lat_parms(void); +extern void print_lat_parms(void); +extern void write_lat_parms(FILE *fdat); +extern void check_lat_parms(FILE *fdat); +extern bc_parms_t set_bc_parms(int type, + double cG,double cG_prime, + double cF,double cF_prime, + double *phi,double *phi_prime); +extern bc_parms_t bc_parms(void); +extern void print_bc_parms(void); +extern void write_bc_parms(FILE *fdat); +extern void check_bc_parms(FILE *fdat); +extern double sea_quark_mass(int im0); +extern int bc_type(void); +extern sw_parms_t set_sw_parms(double m0); +extern sw_parms_t sw_parms(void); +extern tm_parms_t set_tm_parms(int eoflg); +extern tm_parms_t tm_parms(void); + +/* MDINT_PARMS_C */ +extern mdint_parms_t set_mdint_parms(int ilv, + integrator_t integrator,double lambda, + int nstep,int nfr,int *ifr); +extern mdint_parms_t mdint_parms(int ilv); +extern void read_mdint_parms(int ilv); +extern void 
print_mdint_parms(void); +extern void write_mdint_parms(FILE *fdat); +extern void check_mdint_parms(FILE *fdat); + +/* RAT_PARMS_C */ +extern rat_parms_t set_rat_parms(int irp,int degree,double *range); +extern rat_parms_t rat_parms(int irp); +extern void read_rat_parms(int irp); +extern void print_rat_parms(void); +extern void write_rat_parms(FILE *fdat); +extern void check_rat_parms(FILE *fdat); + +/* RW_PARMS_C */ +extern rw_parms_t set_rw_parms(int irw,rwfact_t rwfact,int im0,int nsrc, + int irp,int nfct,double *mu,int *np,int *isp); +extern rw_parms_t rw_parms(int irw); +extern void read_rw_parms(int irw); +extern void print_rw_parms(void); +extern void write_rw_parms(FILE *fdat); +extern void check_rw_parms(FILE *fdat); + +/* SAP_PARMS_C */ +extern sap_parms_t set_sap_parms(int *bs,int isolv,int nmr,int ncy); +extern sap_parms_t sap_parms(void); +extern void print_sap_parms(int ipr); +extern void write_sap_parms(FILE *fdat); +extern void check_sap_parms(FILE *fdat); + +/* SOLVER_PARMS_C */ +extern solver_parms_t set_solver_parms(int isp,solver_t solver, + int nkv,int isolv,int nmr,int ncy, + int nmx,double res); +extern solver_parms_t solver_parms(int isp); +extern void read_solver_parms(int isp); +extern void print_solver_parms(int *isap,int *idfl); +extern void write_solver_parms(FILE *fdat); +extern void check_solver_parms(FILE *fdat); + +/* WFLOW_PARMS_C */ +extern wflow_parms_t set_wflow_parms(int n,double eps); +extern wflow_parms_t wflow_parms(void); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/events.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/events.h new file mode 100644 index 0000000000000000000000000000000000000000..4067350c27bd1547895ad19d34a257069d8bc1fb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/events.h @@ -0,0 +1,137 @@ + +/******************************************************************************* +* 
+* +* File flags/events.h +* +* Copyright (C) 2009, 2010, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Full-lattice events +* +*******************************************************************************/ + +#define EVENTS_H + +#if (defined FLAGS_C) + +static void (*event_fcts[(int)(EVENTS)+1])(void)={NULL}; +/* Handler table indexed by event_t; slots that set_events() does not assign keep the NULL initializer. The handlers below only update tag fields of the object lat, which is declared outside this header. */ + +static void LatUpdatedU(void) +{ + lat.u=next_tag(); +} + +static void LatUpdatedUd(void) +{ + lat.ud=next_tag(); +} + +static void LatAssignedUd2u(void) +{ + lat.u=lat.ud; +} + +static void LatCopiedBndUd(void) +{ + lat.udbuf=lat.ud; +} + +static void LatSetBstap(void) +{ + lat.bstap=lat.ud; +} +/* ud receives a fresh tag and udbuf is reset to 0 */ +static void LatShiftedUd(void) +{ + lat.ud=next_tag(); + lat.udbuf=0; +} + +static void LatComputedFts(void) +{ + lat.fts=lat.ud; +} + +static void LatErasedSw(void) +{ + lat.sw[0]=0; + lat.sw[1]=0; + lat.sw[2]=0; +} + +static void LatErasedSwd(void) +{ + lat.swd[0]=0; + lat.swd[1]=0; + lat.swd[2]=0; +} +/* swd[0] takes the current ud tag; swd[1] and swd[2] are cleared */ +static void LatComputedSwd(void) +{ + lat.swd[0]=lat.ud; + lat.swd[1]=0; + lat.swd[2]=0; +} + +static void LatAssignedSwd2sw(void) +{ + lat.sw[0]=lat.swd[0]; + lat.sw[1]=lat.swd[1]; + lat.sw[2]=lat.swd[2]; +} +/* toggles the lowest bit of swd[1]; the query functions compare this field with 1 */ +static void LatInvertedSwdE(void) +{ + lat.swd[1]^=0x1; +} +/* toggles the lowest bit of swd[2] */ +static void LatInvertedSwdO(void) +{ + lat.swd[2]^=0x1; +} + +static void LatErasedAw(void) +{ + lat.aw=0; +} + +static void LatErasedAwhat(void) +{ + lat.awh=0; +} + +static void LatComputedAw(void) +{ + lat.aw=lat.ud; +} + +static void LatComputedAwhat(void) +{ + lat.awh=lat.ud; +} +/* fills event_fcts with the handlers defined above */ +static void set_events(void) +{ + event_fcts[(int)(UPDATED_U)]=LatUpdatedU; + event_fcts[(int)(UPDATED_UD)]=LatUpdatedUd; + event_fcts[(int)(ASSIGNED_UD2U)]=LatAssignedUd2u; + event_fcts[(int)(COPIED_BND_UD)]=LatCopiedBndUd; + event_fcts[(int)(SET_BSTAP)]=LatSetBstap; + event_fcts[(int)(SHIFTED_UD)]=LatShiftedUd; + event_fcts[(int)(COMPUTED_FTS)]=LatComputedFts; + event_fcts[(int)(ERASED_SW)]=LatErasedSw; + 
event_fcts[(int)(ERASED_SWD)]=LatErasedSwd; + event_fcts[(int)(COMPUTED_SWD)]=LatComputedSwd; + event_fcts[(int)(ASSIGNED_SWD2SW)]=LatAssignedSwd2sw; + event_fcts[(int)(INVERTED_SWD_E)]=LatInvertedSwdE; + event_fcts[(int)(INVERTED_SWD_O)]=LatInvertedSwdO; + event_fcts[(int)(ERASED_AW)]=LatErasedAw; + event_fcts[(int)(ERASED_AWHAT)]=LatErasedAwhat; + event_fcts[(int)(COMPUTED_AW)]=LatComputedAw; + event_fcts[(int)(COMPUTED_AWHAT)]=LatComputedAwhat; +} + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/grid_events.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/grid_events.h new file mode 100644 index 0000000000000000000000000000000000000000..974c617f078193cd63429ee2e4cb1a0e5f1e1595 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/grid_events.h @@ -0,0 +1,151 @@ + +/******************************************************************************* +* +* File grid_events.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Block grid events +* +*******************************************************************************/ + +#define GRID_EVENTS_H + +#if (defined FLAGS_C) + +static void (*grid_event_fcts[(int)(EVENTS)+1])(void)={NULL}; +/* Every handler below first tests one bit of (*gf).shf (0x1 or 0x2) and calls error_root() when it is set; otherwise it updates fields of *gf. gf and lat are declared outside this header. NOTE(review): presumably 0x1 marks shared single-precision fields and 0x2 shared double-precision fields -- confirm. */ +static void GridAssignedU2ubgr(void) +{ + if ((*gf).shf&0x1) + error_root(1,1,"GridAssignedU2ubgr [grid_events.h]", + "Event involving shared fields"); + else + (*gf).u=lat.u; +} + +static void GridAssignedUd2ubgr(void) +{ + if ((*gf).shf&0x1) + error_root(1,1,"GridAssignedUd2ubgr [grid_events.h]", + "Event involving shared fields"); + else + (*gf).u=lat.ud; +} + +static void GridAssignedUd2udbgr(void) +{ + if ((*gf).shf&0x2) + error_root(1,1,"GridAssignedUd2udbgr [grid_events.h]", + "Event involving shared fields"); + else + (*gf).ud=lat.ud; +} + +static void GridAssignedSwd2swbgr(void) +{ + if ((*gf).shf&0x1) + 
error_root(1,1,"GridAssignedSwd2swbgr [grid_events.h]", + "Event involving shared fields"); + else + { + (*gf).sw[0]=lat.swd[0]; + (*gf).sw[1]=lat.swd[1]; + (*gf).sw[2]=lat.swd[2]; + } +} + +static void GridAssignedSwd2swdbgr(void) +{ + if ((*gf).shf&0x2) + error_root(1,1,"GridAssignedSwd2swdbgr [grid_events.h]", + "Event involving shared fields"); + else + { + (*gf).swd[0]=lat.swd[0]; + (*gf).swd[1]=lat.swd[1]; + (*gf).swd[2]=lat.swd[2]; + } +} + +static void GridInvertedSwdE(void) +{ + if ((*gf).shf&0x2) + error_root(1,1,"GridInvertedSwdE [grid_events.h]", + "Event involving shared fields"); + else + (*gf).swd[1]^=0x1; +} + +static void GridInvertedSwdO(void) +{ + if ((*gf).shf&0x2) + error_root(1,1,"GridInvertedSwdO [grid_events.h]", + "Event involving shared fields"); + else + (*gf).swd[2]^=0x1; +} + +static void GridInvertedSwE(void) +{ + if ((*gf).shf&0x1) + error_root(1,1,"GridInvertedSwE [grid_events.h]", + "Event involving shared fields"); + else + (*gf).sw[1]^=0x1; +} + +static void GridInvertedSwO(void) +{ + if ((*gf).shf&0x1) + error_root(1,1,"GridInvertedSwO [grid_events.h]", + "Event involving shared fields"); + else + (*gf).sw[2]^=0x1; +} + +static void GridErasedSw(void) +{ + if ((*gf).shf&0x1) + error_root(1,1,"GridErasedSw [grid_events.h]", + "Event involving shared fields"); + else + { + (*gf).sw[0]=0; + (*gf).sw[1]=0; + (*gf).sw[2]=0; + } +} + +static void GridErasedSwd(void) +{ + if ((*gf).shf&0x2) + error_root(1,1,"GridErasedSwd [grid_events.h]", + "Event involving shared fields"); + else + { + (*gf).swd[0]=0; + (*gf).swd[1]=0; + (*gf).swd[2]=0; + } +} +/* fills grid_event_fcts with the handlers defined above; all other slots stay NULL */ +static void set_grid_events(void) +{ + grid_event_fcts[(int)(ASSIGNED_U2UBGR)]=GridAssignedU2ubgr; + grid_event_fcts[(int)(ASSIGNED_UD2UBGR)]=GridAssignedUd2ubgr; + grid_event_fcts[(int)(ASSIGNED_UD2UDBGR)]=GridAssignedUd2udbgr; + grid_event_fcts[(int)(ASSIGNED_SWD2SWBGR)]=GridAssignedSwd2swbgr; + grid_event_fcts[(int)(ASSIGNED_SWD2SWDBGR)]=GridAssignedSwd2swdbgr; + 
grid_event_fcts[(int)(INVERTED_SWD_E)]=GridInvertedSwdE; + grid_event_fcts[(int)(INVERTED_SWD_O)]=GridInvertedSwdO; + grid_event_fcts[(int)(INVERTED_SW_E)]=GridInvertedSwE; + grid_event_fcts[(int)(INVERTED_SW_O)]=GridInvertedSwO; + grid_event_fcts[(int)(ERASED_SW)]=GridErasedSw; + grid_event_fcts[(int)(ERASED_SWD)]=GridErasedSwd; +} + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/grid_queries.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/grid_queries.h new file mode 100644 index 0000000000000000000000000000000000000000..a4daafc9be7bc974e80419368fb618df6fb02f68 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/grid_queries.h @@ -0,0 +1,129 @@ + +/******************************************************************************* +* +* File grid_queries.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Block grid queries +* +*******************************************************************************/ + +#define GRID_QUERIES_H + +#if (defined FLAGS_C) + +static int (*grid_query_fcts[(int)(QUERIES)+1])(void)={NULL}; +/* Every query below tests one bit of (*gf).shf (0x1 or 0x2); when it is set the query calls error_loc() and returns -1, otherwise it returns the result of a comparison (0 or 1). */ +static int GridQueryUbgrMatchUd(void) +{ + if ((*gf).shf&0x1) + { + error_loc(1,1,"GridQueryUbgrMatchUd [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).u==lat.ud); +} + +static int GridQueryUdbgrMatchUd(void) +{ + if ((*gf).shf&0x2) + { + error_loc(1,1,"GridQueryUdbgrMatchUd [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).ud==lat.ud); +} + +static int GridQuerySwUp2date(void) +{ + if ((*gf).shf&0x1) + { + error_loc(1,1,"GridQuerySwUp2date [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).sw[0]==(*gf).u); +} + +static int GridQuerySwEInverted(void) +{ + if ((*gf).shf&0x1) + { + 
error_loc(1,1,"GridQuerySwEInverted [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).sw[1]==1); +} + +static int GridQuerySwOInverted(void) +{ + if ((*gf).shf&0x1) + { + error_loc(1,1,"GridQuerySwOInverted [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).sw[2]==1); +} + +static int GridQuerySwdUp2date(void) +{ + if ((*gf).shf&0x2) + { + error_loc(1,1,"GridQuerySwdUp2date [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).swd[0]==(*gf).ud); +} + +static int GridQuerySwdEInverted(void) +{ + if ((*gf).shf&0x2) + { + error_loc(1,1,"GridQuerySwdEInverted [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).swd[1]==1); +} + +static int GridQuerySwdOInverted(void) +{ + if ((*gf).shf&0x2) + { + error_loc(1,1,"GridQuerySwdOInverted [grid_queries.h]", + "Query involving shared fields"); + return -1; + } + else + return ((*gf).swd[2]==1); +} +/* fills grid_query_fcts with the queries defined above; all other slots stay NULL */ +static void set_grid_queries(void) +{ + grid_query_fcts[(int)(UBGR_MATCH_UD)]=GridQueryUbgrMatchUd; + grid_query_fcts[(int)(UDBGR_MATCH_UD)]=GridQueryUdbgrMatchUd; + grid_query_fcts[(int)(SW_UP2DATE)]=GridQuerySwUp2date; + grid_query_fcts[(int)(SW_E_INVERTED)]=GridQuerySwEInverted; + grid_query_fcts[(int)(SW_O_INVERTED)]=GridQuerySwOInverted; + grid_query_fcts[(int)(SWD_UP2DATE)]=GridQuerySwdUp2date; + grid_query_fcts[(int)(SWD_E_INVERTED)]=GridQuerySwdEInverted; + grid_query_fcts[(int)(SWD_O_INVERTED)]=GridQuerySwdOInverted; +} + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/queries.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/queries.h new file mode 100644 index 0000000000000000000000000000000000000000..7525afd86bbe5551fcfb7a9d5c60b351e6a8a996 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/flags/queries.h @@ -0,0 +1,97 @@ + 
+/******************************************************************************* +* +* File flags/queries.h +* +* Copyright (C) 2009, 2010, 2011, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Query descriptions +* +*******************************************************************************/ + +#define QUERIES_H + +#if (defined FLAGS_C) + +static int (*query_fcts[(int)(QUERIES)+1])(void)={NULL}; + +static int QueryUMatchUd(void) +{ + return (lat.u==lat.ud); +} + +static int QueryUdbufUp2date(void) +{ + return ((lat.ud>0)&&(lat.udbuf==lat.ud)); +} + +static int QueryBstapUp2date(void) +{ + return ((lat.ud>0)&&(lat.bstap==lat.ud)); +} + +static int QueryFtsUp2date(void) +{ + return ((lat.ud>0)&&(lat.fts==lat.ud)); +} + +static int QuerySwUp2date(void) +{ + return ((lat.u>0)&&(lat.sw[0]==lat.u)); +} + +static int QuerySwEInverted(void) +{ + return (lat.sw[1]==1); +} + +static int QuerySwOInverted(void) +{ + return (lat.sw[2]==1); +} + +static int QuerySwdUp2date(void) +{ + return ((lat.ud>0)&&(lat.swd[0]==lat.ud)); +} + +static int QuerySwdEInverted(void) +{ + return (lat.swd[1]==1); +} + +static int QuerySwdOInverted(void) +{ + return (lat.swd[2]==1); +} + +static int QueryAwUp2date(void) +{ + return ((lat.ud>0)&&(lat.aw==lat.ud)); +} + +static int QueryAwhatUp2date(void) +{ + return ((lat.ud>0)&&(lat.awh==lat.ud)); +} + +static void set_queries(void) +{ + query_fcts[(int)(U_MATCH_UD)]=QueryUMatchUd; + query_fcts[(int)(UDBUF_UP2DATE)]=QueryUdbufUp2date; + query_fcts[(int)(BSTAP_UP2DATE)]=QueryBstapUp2date; + query_fcts[(int)(FTS_UP2DATE)]=QueryFtsUp2date; + query_fcts[(int)(SW_UP2DATE)]=QuerySwUp2date; + query_fcts[(int)(SW_E_INVERTED)]=QuerySwEInverted; + query_fcts[(int)(SW_O_INVERTED)]=QuerySwOInverted; + query_fcts[(int)(SWD_UP2DATE)]=QuerySwdUp2date; + query_fcts[(int)(SWD_E_INVERTED)]=QuerySwdEInverted; + query_fcts[(int)(SWD_O_INVERTED)]=QuerySwdOInverted; + 
query_fcts[(int)(AW_UP2DATE)]=QueryAwUp2date; + query_fcts[(int)(AWHAT_UP2DATE)]=QueryAwhatUp2date; +} + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/forces.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/forces.h new file mode 100644 index 0000000000000000000000000000000000000000..90a8b50c7ab98bc91ff345badb17174f9ce671c6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/forces.h @@ -0,0 +1,90 @@ + +/******************************************************************************* +* +* File forces.h +* +* Copyright (C) 2011, 2012 Martin Luescher, Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef FORCES_H +#define FORCES_H + +#ifndef SU3_H +#include "su3.h" +#endif + +#ifndef UTILS_H +#include "utils.h" +#endif + +/* FORCE0_C */ +extern void plaq_frc(void); +extern void force0(double c); +extern double action0(int icom); + +/* FORCE1_C */ +extern double setpf1(double mu,int ipf,int icom); +extern void force1(double mu,int ipf,int isp,int icr,double c,int *status); +extern double action1(double mu,int ipf,int isp,int icom,int *status); + +/* FORCE2_C */ +extern double setpf2(double mu0,double mu1,int ipf,int isp, + int icom,int *status); +extern void force2(double mu0,double mu1,int ipf,int isp,int icr, + double c,int *status); +extern double action2(double mu0,double mu1,int ipf,int isp, + int icom,int *status); + +/* FORCE3_C */ +extern double setpf3(int *irat,int ipf,int isw,int isp,int icom,int *status); +extern void force3(int *irat,int ipf,int isw,int isp,double c,int *status); +extern double action3(int *irat,int ipf,int isw,int isp,int icom,int *status); + +/* FORCE4_C */ +extern double setpf4(double mu,int ipf,int isw,int icom); +extern void force4(double mu,int ipf,int isw,int isp,int icr,double c, 
+ int *status); +extern double action4(double mu,int ipf,int isw,int isp,int icom,int *status); + +/* FORCE5_C */ +extern double setpf5(double mu0,double mu1,int ipf,int isp,int icom, + int *status); +extern void force5(double mu0,double mu1,int ipf,int isp,int icr, + double c,int *status); +extern double action5(double mu0,double mu1,int ipf,int isp,int icom, + int *status); + +/* FRCFCTS_C */ +extern void det2xt(pauli_dble *m,u3_alg_dble *X); +extern void prod2xt(spinor_dble *r,spinor_dble *s,u3_alg_dble *X); +extern void (*prod2xv[])(spinor_dble *rx,spinor_dble *ry, + spinor_dble *sx,spinor_dble *sy,su3_dble *u); + +/* GENFRC_C */ +extern void sw_frc(double c); +extern void hop_frc(double c); + +/* TMCG_C */ +extern double tmcg(int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status); +extern double tmcgeo(int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status); + +/* TMCGM_C */ +extern void tmcgm(int nmx,double *res,int nmu,double *mu, + spinor_dble *eta,spinor_dble **psi,int *status); + +/* XTENSOR_C */ +extern u3_alg_dble **xtensor(void); +extern void set_xt2zero(void); +extern int add_det2xt(double c,ptset_t set); +extern void add_prod2xt(double c,spinor_dble *r,spinor_dble *s); +extern su3_dble *xvector(void); +extern void set_xv2zero(void); +extern void add_prod2xv(double c,spinor_dble *r,spinor_dble *s); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/global.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/global.h new file mode 100644 index 0000000000000000000000000000000000000000..d5b73d14d7bf6e828fcb6ec12587c62817e975d2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/global.h @@ -0,0 +1,81 @@ + +/******************************************************************************* +* +* File global.h +* +* Copyright (C) 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of 
the GNU General Public +* License (GPL) +* +* Global parameters and arrays +* +*******************************************************************************/ + +#ifndef GLOBAL_H +#define GLOBAL_H + +#include "../../global.defs" + +#define NPROC0_BLK 1 +#define NPROC1_BLK 1 +#define NPROC2_BLK 1 +#define NPROC3_BLK 1 + +#define NAME_SIZE 128 + +/****************************** do not change *********************************/ + +#if ((NPROC0<1)||(NPROC1<1)||(NPROC2<1)||(NPROC3<1)|| \ + ((NPROC0>1)&&((NPROC0%2)!=0))||((NPROC1>1)&&((NPROC1%2)!=0))|| \ + ((NPROC2>1)&&((NPROC2%2)!=0))||((NPROC3>1)&&((NPROC3%2)!=0))) +#error : The number of processes in each direction must be 1 or a multiple of 2 +#endif + +#if ((L0<4)||(L1<4)||(L2<4)||(L3<4)|| \ + ((L0%2)!=0)||((L1%2)!=0)||((L2%2)!=0)||((L3%2)!=0)) +#error : The local lattice sizes must be even and not smaller than 4 +#endif + +#if ((NPROC0_BLK<1)||(NBROC0_BLK>NPROC0)||((NPROC0%NPROC0_BLK)!=0)|| \ + (NPROC1_BLK<1)||(NBROC1_BLK>NPROC1)||((NPROC1%NPROC1_BLK)!=0)|| \ + (NPROC2_BLK<1)||(NBROC2_BLK>NPROC2)||((NPROC2%NPROC2_BLK)!=0)|| \ + (NPROC3_BLK<1)||(NBROC3_BLK>NPROC3)||((NPROC3%NPROC3_BLK)!=0)) +#error : Improper processor block sizes NPROC0_BLK,..,NPROC3_BLK +#endif + +#if (NAME_SIZE<128) +#error : NAME_SIZE must be greater or equal to 128 +#endif + +#define NPROC (NPROC0*NPROC1*NPROC2*NPROC3) +#define VOLUME (L0*L1*L2*L3) +#define FACE0 ((1-(NPROC0%2))*L1*L2*L3) +#define FACE1 ((1-(NPROC1%2))*L2*L3*L0) +#define FACE2 ((1-(NPROC2%2))*L3*L0*L1) +#define FACE3 ((1-(NPROC3%2))*L0*L1*L2) +#define BNDRY (2*(FACE0+FACE1+FACE2+FACE3)) +#define NSPIN (VOLUME+(BNDRY/2)) +#define ALIGN 6 + +#ifndef SU3_H +#include "su3.h" +#endif + +#if defined MAIN_PROGRAM + #define EXTERN +#else + #define EXTERN extern +#endif + +EXTERN int cpr[4]; +EXTERN int npr[8]; + +EXTERN int ipt[VOLUME]; +EXTERN int iup[VOLUME][4]; +EXTERN int idn[VOLUME][4]; +EXTERN int map[BNDRY+NPROC%2]; + +#undef EXTERN + +#endif diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/lattice.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/lattice.h new file mode 100644 index 0000000000000000000000000000000000000000..0073ef4e9784ec5f976c198fa6bb16967a11be80 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/lattice.h @@ -0,0 +1,61 @@ + +/******************************************************************************* +* +* File lattice.h +* +* Copyright (C) 2011, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef LATTICE_H +#define LATTICE_H + +#ifndef BLOCK_H +#include "block.h" +#endif + +typedef struct +{ + int nu0,nuk; + int *iu0,*iuk; +} uidx_t; + +typedef struct +{ + int nft[2]; + int *ift[2]; +} ftidx_t; + +/* BCNDS_C */ +extern int *bnd_lks(int *n); +extern int *bnd_pts(int *n); +extern void set_bc(void); +extern int check_bc(double tol); +extern int chs_ubnd(int ibc); +extern void bnd_s2zero(ptset_t set,spinor *s); +extern void bnd_sd2zero(ptset_t set,spinor_dble *sd); + +/* FTIDX_C */ +extern ftidx_t *ftidx(void); +extern void plaq_ftidx(int n,int ix,int *ip); + +/* GEOMETRY_C */ +extern int ipr_global(int *n); +extern void ipt_global(int *x,int *ip,int *ix); +extern int global_time(int ix); +extern void geometry(void); +#if ((defined GEOMETRY_C)||(defined BLOCK_C)) +extern void blk_geometry(block_t *b); +extern void blk_imbed(block_t *b); +extern void bnd_geometry(block_t *b); +extern void bnd_imbed(block_t*b); +#endif + +/* UIDX_C */ +extern uidx_t *uidx(void); +extern void plaq_uidx(int n,int ix,int *ip); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/linalg.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/linalg.h new file mode 100644 index 
0000000000000000000000000000000000000000..b961c2d86a801c62d094a125a4e9cf918959d7a5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/linalg.h @@ -0,0 +1,105 @@ + +/******************************************************************************* +* +* File linalg.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef LINALG_H +#define LINALG_H + +#ifndef SU3_H +#include "su3.h" +#endif + +/* CMATRIX_C */ +extern void cmat_vec(int n,complex *a,complex *v,complex *w); +extern void cmat_vec_assign(int n,complex *a,complex *v,complex *w); +extern void cmat_add(int n,complex *a,complex *b,complex *c); +extern void cmat_sub(int n,complex *a,complex *b,complex *c); +extern void cmat_mul(int n,complex *a,complex *b,complex *c); +extern void cmat_dag(int n,complex *a,complex *b); + +/* CMATRIX_DBLE_C */ +extern void cmat_vec_dble(int n,complex_dble *a,complex_dble *v, + complex_dble *w); +extern void cmat_vec_assign_dble(int n,complex_dble *a,complex_dble *v, + complex_dble *w); +extern void cmat_add_dble(int n,complex_dble *a,complex_dble *b, + complex_dble *c); +extern void cmat_sub_dble(int n,complex_dble *a,complex_dble *b, + complex_dble *c); +extern void cmat_mul_dble(int n,complex_dble *a,complex_dble *b, + complex_dble *c); +extern void cmat_dag_dble(int n,complex_dble *a,complex_dble *b); +extern int cmat_inv_dble(int n,complex_dble *a,complex_dble *b,double *k); + +/* LIEALG_C */ +extern void random_alg(int vol,su3_alg_dble *X); +extern double norm_square_alg(int vol,int icom,su3_alg_dble *X); +extern double scalar_prod_alg(int vol,int icom,su3_alg_dble *X,su3_alg_dble *Y); +extern void set_alg2zero(int vol,su3_alg_dble *X); +extern void set_ualg2zero(int vol,u3_alg_dble *X); +extern void assign_alg2alg(int vol,su3_alg_dble *X,su3_alg_dble *Y); +extern 
void swap_alg(int vol,su3_alg_dble *X,su3_alg_dble *Y); +extern void muladd_assign_alg(int vol,double r,su3_alg_dble *X,su3_alg_dble *Y); + +/* SALG_C */ +extern complex spinor_prod(int vol,int icom,spinor *s,spinor *r); +extern float spinor_prod_re(int vol,int icom,spinor *s,spinor *r); +extern float norm_square(int vol,int icom,spinor *s); +extern void mulc_spinor_add(int vol,spinor *s,spinor *r,complex z); +extern void mulr_spinor_add(int vol,spinor *s,spinor *r,float c); +extern void project(int vol,int icom,spinor *s,spinor *r); +extern void scale(int vol,float c,spinor *s); +extern float normalize(int vol,int icom,spinor *s); +extern void rotate(int vol,int n,spinor **ppk,complex *v); +extern void mulg5(int vol,spinor *s); +extern void mulmg5(int vol,spinor *s); + +/* SALG_DBLE_C */ +extern complex_dble spinor_prod_dble(int vol,int icom,spinor_dble *s, + spinor_dble *r); +extern double spinor_prod_re_dble(int vol,int icom,spinor_dble *s, + spinor_dble *r); +extern complex_dble spinor_prod5_dble(int vol,int icom,spinor_dble *s, + spinor_dble *r); +extern double norm_square_dble(int vol,int icom,spinor_dble *s); +extern void mulc_spinor_add_dble(int vol,spinor_dble *s,spinor_dble *r, + complex_dble z); +extern void mulr_spinor_add_dble(int vol,spinor_dble *s,spinor_dble *r, + double c); +extern void combine_spinor_dble(int vol,spinor_dble *s,spinor_dble *r, + double cs,double cr); +extern void project_dble(int vol,int icom,spinor_dble *s,spinor_dble *r); +extern void scale_dble(int vol,double c,spinor_dble *s); +extern double normalize_dble(int vol,int icom,spinor_dble *s); +extern void rotate_dble(int vol,int n,spinor_dble **ppk,complex_dble *v); +extern void mulg5_dble(int vol,spinor_dble *s); +extern void mulmg5_dble(int vol,spinor_dble *s); + +/* VALG_C */ +extern complex vprod(int n,int icom,complex *v,complex *w); +extern float vnorm_square(int n,int icom,complex *v); +extern void mulc_vadd(int n,complex *v,complex *w,complex z); +extern void vproject(int 
n,int icom,complex *v,complex *w); +extern void vscale(int n,float r,complex *v); +extern float vnormalize(int n,int icom,complex *v); +extern void vrotate(int n,int nv,complex **pv,complex *a); + +/* VALG_DBLE_C */ +extern complex_dble vprod_dble(int n,int icom,complex_dble *v,complex_dble *w); +extern double vnorm_square_dble(int n,int icom,complex_dble *v); +extern void mulc_vadd_dble(int n,complex_dble *v,complex_dble *w, + complex_dble z); +extern void vproject_dble(int n,int icom,complex_dble *v,complex_dble *w); +extern void vscale_dble(int n,double r,complex_dble *v); +extern double vnormalize_dble(int n,int icom,complex_dble *v); +extern void vrotate_dble(int n,int nv,complex_dble **pv,complex_dble *a); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/linsolv.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/linsolv.h new file mode 100644 index 0000000000000000000000000000000000000000..cde535c958abf51c700577a6310283b36ad443b5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/linsolv.h @@ -0,0 +1,46 @@ + +/******************************************************************************* +* +* File linsolv.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef LINSOLV_H +#define LINSOLV_H + +#ifndef SU3_H +#include "su3.h" +#endif + +/* CGNE_C */ +extern double cgne(int vol,int icom,void (*Dop)(spinor *s,spinor *r), + void (*Dop_dble)(spinor_dble *s,spinor_dble *r), + spinor **ws,spinor_dble **wsd,int nmx,double res, + spinor_dble *eta,spinor_dble *psi,int *status); + +/* FGCR4VD_C */ +extern double fgcr4vd(int vol,int icom, + void (*Dop)(complex_dble *v,complex_dble *w), + void (*Mop)(int k,complex *eta,complex *psi,complex *chi), + complex **wv,complex_dble **wvd,int nkv,int 
nmx,double res, + complex_dble *eta,complex_dble *psi,int *status); + +/* FGCR_C */ +extern double fgcr(int vol,int icom, + void (*Dop)(spinor_dble *s,spinor_dble *r), + void (*Mop)(int k,spinor *rho,spinor *phi,spinor *chi), + spinor **ws,spinor_dble **wsd,int nkv,int nmx,double res, + spinor_dble *eta,spinor_dble *psi,int *status); + +/* MSCG_C */ +extern void mscg(int vol,int icom,int nmu,double *mu, + void (*Dop_dble)(double mu,spinor_dble *s,spinor_dble *r), + spinor_dble **wsd,int nmx,double *res, + spinor_dble *eta,spinor_dble **psi,int *status); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/little.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/little.h new file mode 100644 index 0000000000000000000000000000000000000000..16918d3b3b785ae426d94aca1691cd3576344c54 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/little.h @@ -0,0 +1,83 @@ + +/******************************************************************************* +* +* File little.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef LITTLE_H +#define LITTLE_H + +#ifndef SU3_H +#include "su3.h" +#endif + +typedef struct +{ + int Ns,nb; + complex **Aee,**Aoo,**Aoe,**Aeo; +} Aw_t; + +typedef struct +{ + int Ns,nb; + complex_dble **Aee,**Aoo,**Aoe,**Aeo; +} Aw_dble_t; + +typedef struct +{ + int n[2]; + int vol,ibn; + spinor_dble **sde[2]; + spinor_dble **sdo[2]; +} b2b_flds_t; + +/* AW_COM_C */ +extern b2b_flds_t *b2b_flds(int n,int mu); +extern void cpAoe_ext_bnd(void); +extern void cpAee_int_bnd(void); + +/* AW_C */ +extern void Aw(complex *v,complex *w); +extern void Aweeinv(complex *v,complex *w); +extern void Awooinv(complex *v,complex *w); +extern void Awoe(complex *v,complex *w); +extern void Aweo(complex 
*v,complex *w); +extern void Awhat(complex *v,complex *w); + +/* AW_DBLE_C */ +extern void Aw_dble(complex_dble *v,complex_dble *w); +extern void Aweeinv_dble(complex_dble *v,complex_dble *w); +extern void Awooinv_dble(complex_dble *v,complex_dble *w); +extern void Awoe_dble(complex_dble *v,complex_dble *w); +extern void Aweo_dble(complex_dble *v,complex_dble *w); +extern void Awhat_dble(complex_dble *v,complex_dble *w); + +/* AW_GEN_C */ +extern void gather_ud(int vol,int *imb,su3_dble *ud,su3_dble *vd); +extern void gather_sd(int vol,int *imb,spinor_dble *sd,spinor_dble *rd); +extern void apply_u2sd(int vol,int *imb,su3_dble *ud,spinor_dble *sd, + spinor_dble *rd); +extern void apply_udag2sd(int vol,int *imb,su3_dble *ud,spinor_dble *sd, + spinor_dble *rd); +extern void (*spinor_prod_gamma[])(int vol,spinor_dble *sd,spinor_dble *rd, + complex_dble *sp); + +/* AW_OPS_C */ +extern Aw_t Awop(void); +extern Aw_t Awophat(void); +extern Aw_dble_t Awop_dble(void); +extern Aw_dble_t Awophat_dble(void); +extern void set_Aw(double mu); +extern int set_Awhat(double mu); + +/* LTL_MODES_C */ +extern int set_ltl_modes(void); +extern complex_dble *ltl_matrix(void); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/mdflds.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/mdflds.h new file mode 100644 index 0000000000000000000000000000000000000000..7820439d885f30bd2f2cdcaf2e954f2494fd4c9f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/mdflds.h @@ -0,0 +1,38 @@ + +/******************************************************************************* +* +* File mdflds.h +* +* Copyright (C) 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef MDFLDS_H +#define MDFLDS_H + +#ifndef SU3_H +#include "su3.h" +#endif + +typedef 
struct +{ + int npf; + su3_alg_dble *mom,*frc; + spinor_dble **pf; +} mdflds_t; + +/* FCOM_C */ +extern void copy_bnd_frc(void); +extern void add_bnd_frc(void); + +/* MDFLDS_C */ +extern mdflds_t *mdflds(void); +extern void set_frc2zero(void); +extern void bnd_mom2zero(void); +extern void random_mom(void); +extern double momentum_action(int icom); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/nompi/extras.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/nompi/extras.h new file mode 100644 index 0000000000000000000000000000000000000000..ef6abb6e1d14587a797921900e5e1f23bad7fbf4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/nompi/extras.h @@ -0,0 +1,53 @@ + +/******************************************************************************* +* +* File nompi/extras.h +* +* Copyright (C) 2009, 2010, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef EXTRAS_H +#define EXTRAS_H + +/* CHEBYSHEV_C */ +extern int cheby_fit(double a,double b,double (*f)(double x), + int dmax,double eps,double c[]); +extern double cheby_int(double a,double b,double (*f)(double x), + int dmax,double eps); +extern double cheby_val(double a,double b,int n,double c[],double x); + +/* FSOLVE_C */ +extern double inverse_fct(double x1,double x2,double (*f)(double x),double y, + double omega1,double omega2); +extern double minimize_fct(double x0,double x1,double x2,double (*f)(double x), + double omega1,double omega2); +extern void powell(int n,double *x0,double *x1,double *x2, + double (*f)(int n,double *x),int imx,double omega1, + double omega2,double *xmin,int *status); + +/* I0M_C */ +extern double i0m(double x); + +/* KS_TEST_C */ +extern void ks_test(int n,double f[],double *pkp,double *pkm); +extern void ks_prob(int n,double 
kp,double km,double *pp,double *pm); + +/* PCHI_SQUARE_C */ +extern double pchi_square(double chi_square,int nu); + +/* STAT_C */ +extern double average(int n,double *a); +extern double sigma0(int n,double *a); +extern double auto_corr(int n,double *a,int tmax,double *g); +extern void sigma_auto_corr(int n,double *a,int tmax,int lambda,double *eg); +extern double tauint(int n,double *a,int tmax,int lambda,int *w,double *sigma); +extern double print_auto(int n,double *a); +extern double jack_err(int nx,int n,double **a,double (*f)(int nx,double *x), + int bmax,double *sig); +extern double print_jack(int nx,int n,double **a,double (*f)(int nx,double *x)); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/nompi/utils.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/nompi/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..3e37c10ca625226ef856a386d620bdb6fea4e851 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/nompi/utils.h @@ -0,0 +1,79 @@ + +/******************************************************************************* +* +* File nompi/utils.h +* +* Copyright (C) 2009, 2010, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef UTILS_H +#define UTILS_H + +#include +#include + +#define NAME_SIZE 128 + +#if ((DBL_MANT_DIG!=53)||(DBL_MIN_EXP!=-1021)||(DBL_MAX_EXP!=1024)) +#error : Machine is not compliant with the IEEE-754 standard +#endif + +#if (SHRT_MAX==0x7fffffff) +typedef short int stdint_t; +typedef unsigned short int stduint_t; +#elif (INT_MAX==0x7fffffff) +typedef int stdint_t; +typedef unsigned int stduint_t; +#elif (LONG_MAX==0x7fffffff) +typedef long int stdint_t; +typedef unsigned long int stduint_t; +#else +#error : There is no four-byte integer type on this machine 
+#endif + +#undef UNKNOWN_ENDIAN +#undef LITTLE_ENDIAN +#undef BIG_ENDIAN + +#define UNKNOWN_ENDIAN 0 +#define LITTLE_ENDIAN 1 +#define BIG_ENDIAN 2 + +#undef IMAX +#define IMAX(n,m) ((n)+((m)-(n))*((m)>(n))) + +typedef enum +{ + ALL_PTS,EVEN_PTS,ODD_PTS,NO_PTS,PT_SETS +} ptset_t; + +/* ENDIAN_C */ +extern int endianness(void); +extern void bswap_int(int n,void *a); +extern void bswap_double(int n,void *a); + +/* MUTILS_C */ +extern int find_opt(int argc,char *argv[],char *opt); +extern int digits(double x,double dx,char *fmt); +extern int fdigits(double x); +extern int name_size(char *format,...); +extern long find_section(FILE *stream,char *title); +extern long read_line(FILE *stream,char *tag,char *format,...); +extern int count_tokens(FILE *stream,char *tag); +extern void read_iprms(FILE *stream,char *tag,int n,int *iprms); +extern void read_dprms(FILE *stream,char *tag,int n,double *dprms); + +/* UTILS_C */ +extern int safe_mod(int x,int y); +extern void *amalloc(size_t size,int p); +extern void afree(void *addr); +extern void error(int test,int no,char *name,char *format,...); +extern void error_root(int test,int no,char *name,char *format,...); +extern int error_loc(int test,int no,char *name,char *format,...); +extern void message(char *format,...); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/qpx.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/qpx.h new file mode 100644 index 0000000000000000000000000000000000000000..8894e0eb29a4e49016401d0bc78c921687fea7a3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/qpx.h @@ -0,0 +1,291 @@ +#ifndef QPX_H +#define QPX_H +/******************************************************************************* +* +* File qpx.h +* +* Copyright (C) 2013 Dalibor Djukanovic +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Type definitions and macros for fast manipulation 
of +* SU(3) matrices, SU(3) vectors and Dirac spinors exploiting the Quad FPU +* unit of BlueGene/Q +* +*******************************************************************************/ + +#ifdef PDToolkit +/* Needed for parsing with TAU */ +typedef double vector4double[4]; +#endif + +static vector4double qpx_r1, qpx_r2, qpx_r3, qpx_r4, qpx_r5, qpx_r6, qpx_r7, qpx_r8, qpx_r9; +static vector4double qpx_r10, qpx_r11, qpx_r12, qpx_r13, qpx_r14, qpx_r15, qpx_r16, qpx_r17; +static vector4double vec_i=(vector4double){0,1,0,1}; +static vector4double vec_i_s=(vector4double){0,1,0,-1}; +static vector4double sign0=(vector4double){1.,1.,1.,-1.}; +static vector4double sign1=(vector4double){1.,1.,-1.,-1.}; +static vector4double sign2=(vector4double){-1.,-1.,1.,1.}; + +/* Operands for qvfperm QRT QRA QRB QRC + QRC[msw] double QRC[12:14] QRT + 0x4000 0000 = 2.0 000 = 0 QRA0 + 0x4002 0000 = 2.25 001 = 1 QRA1 + 0x4004 0000 = 2.50 010 = 2 QRA2 + 0x4006 0000 = 2.75 011 = 3 QRA3 + 0x4008 0000 = 3.00 100 = 4 QRB0 + 0x400a 0000 = 3.25 101 = 5 QRB1 + 0x400c 0000 = 3.50 110 = 6 QRB2 + 0x400e 0000 = 3.75 111 = 7 QRB3 +*/ +static vector4double perm0011={2.000000,2.000000,2.250000,2.250000}; /* A0 A0 A1 A1 */ +static vector4double perm2233={2.500000,2.500000,2.750000,2.750000}; /* A2 A2 A3 A3 */ +static vector4double perm1={2.000000,2.250000,3.000000,3.250000}; /* A0 A1 B0 B1 */ +static vector4double perm2={2.500000,2.750000,3.500000,3.750000}; /* A2 A3 B2 B3 */ +static vector4double perml1={2.000000,3.00000,2.250000,3.000000}; /* A0 B0 A1 B0 */ +static vector4double perml2={2.500000,3.00000,2.750000,3.000000}; /* A2 B0 A3 B0 */ +static vector4double perm12={2.000000,2.250000,3.500000,3.750000}; /* A0 A1 B2 B3 */ +static vector4double perm21={3.500000,3.750000,2.000000,2.250000}; /* B2 B3 A0 A1 */ + +/* Prefetch */ + +#define _qpx_prefetch_su3_dp(addr)\ + __dcbt(((char*)((unsigned long int)(addr)))); \ + __dcbt(((char*)((unsigned long int)(addr)))+128); + +#define 
_qpx_prefetch_spinor_dp(addr)\ + __dcbt(((char*)((unsigned long int)(addr)))); \ + __dcbt(((char*)((unsigned long int)(addr)))+128); + +#define _qpx_prefetch_su3_sp(addr)\ + __dcbt(((char*)((unsigned long int)(addr)))); + +#define _qpx_prefetch_spinor_sp(addr)\ + __dcbt(((char*)((unsigned long int)(addr)))); + +/* Load and Store + + Assume 32 Byte alignment for double precision structures + (spinor_dble, weyl_dble, and su3_dble) and 16 Byte alignment + for single-precision structures (spinor, weyl, su3) + + Use vec_lda and vec_sta to raise exception (SIG 7) + in case of incorrect alignment (if environment variable + BG_MAXALIGNEXP is set to a small value, e.g. 1) +*/ + +/* Load first Weyl spinor = components (c1, c2) of Dirac spinor: + r1 <- c1.c1.re c1.c1.im c1.c2.re c1.c2.im + r2 <- c1.c3.re c1.c3.im c2.c1.re c2.c1.im + r3 <- c2.c2.re c2.c2.im c2.c3.re c2.c3.im +*/ +#define _qpx_load_w1(r,ps)\ + r##1=vec_lda(0,&(((ps)->c1).c1.re)); \ + r##2=vec_lda(0,&(((ps)->c1).c3.re)); \ + r##3=vec_lda(0,&(((ps)->c2).c2.re)); + +/* Load second Weyl spinor = components (c3, c4) of Dirac spinor: + r1 <- c3.c1.re c3.c1.im c3.c2.re c3.c2.im + r2 <- c3.c3.re c3.c3.im c4.c1.re c4.c1.im + r3 <- c4.c2.re c4.c2.im c4.c3.re c4.c3.im +*/ +#define _qpx_load_w2(r,ps)\ + r##1=vec_lda(0,&(((ps)->c3).c1.re)); \ + r##2=vec_lda(0,&(((ps)->c3).c3.re)); \ + r##3=vec_lda(0,&(((ps)->c4).c2.re)); + +#define _qpx_store_w1(r,ps)\ + vec_sta(r##1,0,&(((ps)->c1).c1.re));\ + vec_sta(r##2,0,&(((ps)->c1).c3.re));\ + vec_sta(r##3,0,&(((ps)->c2).c2.re)); + +#define _qpx_store_w2(r,ps)\ + vec_sta(r##1,0,&(((ps)->c3).c1.re));\ + vec_sta(r##2,0,&(((ps)->c3).c3.re));\ + vec_sta(r##3,0,&(((ps)->c4).c2.re)); + +/* Permutation for Dirac Operators + res1 = ( v2.2 v2.3 v3.0 v3.1 ) + res2 = ( v3.2 v3.3 v1.0 v1.1 ) + res3 = ( v1.2 v1.3 v2.0 v2.1 ) +*/ +#define _qpx_vec_x(res,v)\ + res##1=vec_sldw(v##2,v##3,2); \ + res##2=vec_sldw(v##3,v##1,2); \ + res##3=vec_sldw(v##1,v##2,2); + + 
+/********************** Math functions ********************/ + +/******************* res = va + vb ***********************/ +#define _qpx_vec_add(res, va,vb) \ + res##1=vec_add(va##1,vb##1); \ + res##2=vec_add(va##2,vb##2); \ + res##3=vec_add(va##3,vb##3); + +/********************************************************* + res1 = va1 + vb1 + res2 = va2 + ( +vb2.0 +vb2.1 -vb2.2 -vb2.3 ) + res3 = va3 - vb3 + + If the operands are + va1 = ( psi_1 psi_1 ) vb1 = ( psi_4 psi_4 ) + va2 = ( psi_1 psi_2 ) vb2 = ( psi_4 psi_3 ) + va3 = ( psi_2 psi_2 ) vb3 = ( psi_3 psi_3 ) + then + res1 = ( phi_1 phi_1 ) + res2 = ( phi_1 phi_2 ) + res3 = ( phi_2 phi_2 ) + where + phi_1 = psi_1 + psi_4 + phi_2 = psi_2 - psi_3 + is the spinor combination for mu=+2 of eq. (A.12) of doc/dirac.pdf +*/ +#define _qpx_vec_add_n(res, va,vb) \ + res##1=vec_add(va##1,vb##1); \ + res##2=vec_madd(vb##2,sign1,va##2); \ + res##3=vec_sub(va##3,vb##3); + + +/******************* res = va - vb ***********************/ +#define _qpx_vec_sub(res, va,vb) \ + res##1=vec_sub(va##1,vb##1); \ + res##2=vec_sub(va##2,vb##2); \ + res##3=vec_sub(va##3,vb##3); + + +/********************************************************* + res1 = va1 - vb1 + res2 = va2 + ( -vb2.0 -vb2.1 +vb2.2 +vb2.3 ) + res3 = va3 + vb3 + + If the operands are + va1 = ( psi_1 psi_1 ) vb1 = ( psi_4 psi_4 ) + va2 = ( psi_1 psi_2 ) vb2 = ( psi_4 psi_3 ) + va3 = ( psi_2 psi_2 ) vb3 = ( psi_3 psi_3 ) + then + res1 = ( phi_1 phi_1 ) + res2 = ( phi_1 phi_2 ) + res3 = ( phi_2 phi_2 ) + where + phi_1 = psi_1 - psi_4 + phi_2 = psi_2 + psi_3 + is the spinor combination for mu=-2 of eq. 
(A.13) of doc/dirac.pdf +*/ +#define _qpx_vec_sub_n(res, va,vb) \ + res##1=vec_sub(va##1,vb##1); \ + res##2=vec_madd(sign2,vb##2,va##2); \ + res##3=vec_add(va##3,vb##3); + + +/******************* res = va - i vb **********************/ +#define _qpx_vec_i_sub(res,va,vb) \ + res##1=vec_xxcpnmadd(vb##1,vec_i,va##1);\ + res##2=vec_xxcpnmadd(vb##2,vec_i,va##2);\ + res##3=vec_xxcpnmadd(vb##3,vec_i,va##3); + +/********************************************************* + res1 = va1 - i vb1 + res2 = va2 + ( -i vb2.0, -i vb2.1, +i vb2.2, +i vb2.3 ) + res3 = va3 + i vb3 +*/ +#define _qpx_vec_i_sub_n(res,va,vb) \ + res##1=vec_xxcpnmadd(vb##1,vec_i,va##1);\ + res##2=vec_xxcpnmadd(vb##2,vec_i_s,va##2);\ + res##3=vec_xxnpmadd(vb##3,vec_i,va##3); + +/******************* res = va + i vb **********************/ +#define _qpx_vec_i_add(res,va,vb)\ + res##1=vec_xxnpmadd(vb##1,vec_i,va##1); \ + res##2=vec_xxnpmadd(vb##2,vec_i,va##2); \ + res##3=vec_xxnpmadd(vb##3,vec_i,va##3); + +/********************************************************* + res1 = va1 + i vb1 + res2 = va2 + ( +i vb2.0, +i vb2.1, -i vb2.2, -i vb2.3 ) + res3 = va3 - i vb3 +*/ +#define _qpx_vec_i_add_n(res,va,vb)\ + res##1=vec_xxnpmadd(vb##1,vec_i,va##1); \ + res##2=vec_xxnpmadd(vb##2,vec_i_s,va##2); \ + res##3=vec_xxcpnmadd(vb##3,vec_i,va##3); + +#define _qpx_su3_mul(res,u,psi) \ + qpx_r1=vec_ld2(0,&((u).c11.re)); \ + qpx_r2=vec_ld2(0,&((u).c21.re)); \ + qpx_r3=vec_ld2(0,&((u).c31.re)); \ + qpx_r4=vec_ld2(0,&((u).c12.re)); \ + qpx_r5=vec_ld2(0,&((u).c22.re)); \ + qpx_r6=vec_ld2(0,&((u).c32.re)); \ + qpx_r7=vec_ld2(0,&((u).c13.re)); \ + qpx_r8=vec_ld2(0,&((u).c23.re)); \ + qpx_r9=vec_ld2(0,&((u).c33.re)); \ + qpx_r10=vec_perm(psi##1,psi##2,perm12);\ + qpx_r11=vec_sldw(psi##1,psi##3,2);\ + qpx_r12=vec_perm(psi##2,psi##3,perm12);\ + qpx_r13=vec_xxnpmadd(qpx_r10,qpx_r1,vec_xmul(qpx_r1,qpx_r10));\ + qpx_r14=vec_xxnpmadd(qpx_r11,qpx_r4,vec_xmadd(qpx_r4,qpx_r11,qpx_r13));\ + 
qpx_r15=vec_xxnpmadd(qpx_r12,qpx_r7,vec_xmadd(qpx_r7,qpx_r12,qpx_r14));\ + qpx_r13=vec_xxnpmadd(qpx_r10,qpx_r2,vec_xmul(qpx_r2,qpx_r10));\ + qpx_r14=vec_xxnpmadd(qpx_r11,qpx_r5,vec_xmadd(qpx_r5,qpx_r11,qpx_r13));\ + qpx_r16=vec_xxnpmadd(qpx_r12,qpx_r8,vec_xmadd(qpx_r8,qpx_r12,qpx_r14));\ + qpx_r13=vec_xxnpmadd(qpx_r10,qpx_r3,vec_xmul(qpx_r3,qpx_r10));\ + qpx_r14=vec_xxnpmadd(qpx_r11,qpx_r6,vec_xmadd(qpx_r6,qpx_r11,qpx_r13));\ + qpx_r17=vec_xxnpmadd(qpx_r12,qpx_r9,vec_xmadd(qpx_r9,qpx_r12,qpx_r14));\ + res##1=vec_perm(qpx_r15,qpx_r16,perm1);\ + res##2=vec_perm(qpx_r17,qpx_r15,perm12);\ + res##3=vec_perm(qpx_r16,qpx_r17,perm2); + +#define _qpx_su3_inv_mul(res,u,psi) \ + qpx_r1=vec_ld2(0,&((u).c11.re)); \ + qpx_r2=vec_ld2(0,&((u).c12.re)); \ + qpx_r3=vec_ld2(0,&((u).c13.re)); \ + qpx_r4=vec_ld2(0,&((u).c21.re)); \ + qpx_r5=vec_ld2(0,&((u).c22.re)); \ + qpx_r6=vec_ld2(0,&((u).c23.re)); \ + qpx_r7=vec_ld2(0,&((u).c31.re)); \ + qpx_r8=vec_ld2(0,&((u).c32.re)); \ + qpx_r9=vec_ld2(0,&((u).c33.re)); \ + qpx_r10=vec_perm(psi##1,psi##2,perm12);\ + qpx_r11=vec_sldw(psi##1,psi##3,2);\ + qpx_r12=vec_perm(psi##2,psi##3,perm12);\ + qpx_r13=vec_xxcpnmadd(qpx_r10,qpx_r1,vec_xmul(qpx_r1,qpx_r10));\ + qpx_r14=vec_xxcpnmadd(qpx_r11,qpx_r4,vec_xmadd(qpx_r4,qpx_r11,qpx_r13));\ + qpx_r15=vec_xxcpnmadd(qpx_r12,qpx_r7,vec_xmadd(qpx_r7,qpx_r12,qpx_r14));\ + qpx_r13=vec_xxcpnmadd(qpx_r10,qpx_r2,vec_xmul(qpx_r2,qpx_r10));\ + qpx_r14=vec_xxcpnmadd(qpx_r11,qpx_r5,vec_xmadd(qpx_r5,qpx_r11,qpx_r13));\ + qpx_r16=vec_xxcpnmadd(qpx_r12,qpx_r8,vec_xmadd(qpx_r8,qpx_r12,qpx_r14));\ + qpx_r13=vec_xxcpnmadd(qpx_r10,qpx_r3,vec_xmul(qpx_r3,qpx_r10));\ + qpx_r14=vec_xxcpnmadd(qpx_r11,qpx_r6,vec_xmadd(qpx_r6,qpx_r11,qpx_r13));\ + qpx_r17=vec_xxcpnmadd(qpx_r12,qpx_r9,vec_xmadd(qpx_r9,qpx_r12,qpx_r14));\ + res##1=vec_perm(qpx_r15,qpx_r16,perm1);\ + res##2=vec_perm(qpx_r17,qpx_r15,perm12);\ + res##3=vec_perm(qpx_r16,qpx_r17,perm2); + +#define _qpx_vec_i_add_assign(res,va) \ + 
res##1=vec_xxnpmadd(va##1,vec_i,res##1); \ + res##2=vec_xxnpmadd(va##2,vec_i,res##2); \ + res##3=vec_xxnpmadd(va##3,vec_i,res##3); + +#define _qpx_vec_add_assign(va,vb) \ + va##1=vec_add(va##1,vb##1); \ + va##2=vec_add(va##2,vb##2); \ + va##3=vec_add(va##3,vb##3); + +#define _qpx_vec_sub_assign(va,vb) \ + va##1=vec_sub(va##1,vb##1); \ + va##2=vec_sub(va##2,vb##2); \ + va##3=vec_sub(va##3,vb##3); + +#define _qpx_vec_i_sub_assign(res,va) \ + res##1=vec_xxcpnmadd(va##1,vec_i,res##1); \ + res##2=vec_xxcpnmadd(va##2,vec_i,res##2); \ + res##3=vec_xxcpnmadd(va##3,vec_i,res##3); + +#define _qpx_vec_prod(a,b,res)\ + res##1=vec_xxcpnmadd(b##1,a##1,vec_xmadd(a##1,b##1,res##1));\ + res##2=vec_xxcpnmadd(b##2,a##2,vec_xmadd(a##2,b##2,res##2));\ + res##3=vec_xxcpnmadd(b##3,a##3,vec_xmadd(a##3,b##3,res##3)); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/random.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/random.h new file mode 100644 index 0000000000000000000000000000000000000000..191ee57b93128208cccd8f6785c9301b32bfd730 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/random.h @@ -0,0 +1,39 @@ + +/******************************************************************************* +* +* File random.h +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef RANDOM_H +#define RANDOM_H + +/* GAUSS_C */ +extern void gauss(float r[],int n); +extern void gauss_dble(double r[],int n); + +/* RANLUX_C */ +extern void start_ranlux(int level,int seed); +extern void export_ranlux(int tag,char *out); +extern int import_ranlux(char *in); + +/* RANLXS_C */ +extern void ranlxs(float r[],int n); +extern void rlxs_init(int level,int seed); +extern int rlxs_size(void); +extern void rlxs_get(int state[]); 
+extern void rlxs_reset(int state[]); + +/* RANLXD_C */ +extern void ranlxd(double r[],int n); +extern void rlxd_init(int level,int seed); +extern int rlxd_size(void); +extern void rlxd_get(int state[]); +extern void rlxd_reset(int state[]); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/ratfcts.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/ratfcts.h new file mode 100644 index 0000000000000000000000000000000000000000..882551ab17ec33b97f263ea0533543d248ea08a6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/ratfcts.h @@ -0,0 +1,34 @@ + +/******************************************************************************* +* +* File ratfcts.h +* +* Copyright (C) 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef RATFCTS_H +#define RATFCTS_H + +typedef struct +{ + int np; + double A,delta; + double *mu,*rmu; + double *nu,*rnu; +} ratfct_t; + +/* ELLIPTIC_C */ +extern double ellipticK(double rk); +extern void sncndn(double u,double rk,double *sn,double *cn,double *dn); + +/* RATFCTS_C */ +extern ratfct_t ratfct(int *irat); + +/* ZOLOTAREV_C */ +extern void zolotarev(int n,double eps,double *A,double *ar,double *delta); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sap.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sap.h new file mode 100644 index 0000000000000000000000000000000000000000..6a6efbab6f7d592bab74c1ab921d433cf524c37d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sap.h @@ -0,0 +1,37 @@ + +/******************************************************************************* +* +* File sap.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the 
GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef SAP_H +#define SAP_H + +#ifndef SU3_H +#include "su3.h" +#endif + +/* BLK_SOLV_C */ +extern void blk_mres(int n,float mu,int nmr); +extern void blk_eo_mres(int n,float mu,int nmr); + +/* SAP_COM_C */ +#if ((defined SAP_COM_C)||(defined BLK_GRID_C )) +extern void alloc_sap_bufs(void); +#endif +extern void sap_com(int ic,spinor *r); + +/* SAP_C */ +extern void sap(float mu,int isolv,int nmr,spinor *psi,spinor *eta); + +/* SAP_GCR_C */ +extern double sap_gcr(int nkv,int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sflds.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sflds.h new file mode 100644 index 0000000000000000000000000000000000000000..03905e271e40de745b5008504cb475122b53b846 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sflds.h @@ -0,0 +1,59 @@ + +/******************************************************************************* +* +* File sflds.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef SFLDS_H +#define SFLDS_H + +#ifndef SU3_H +#include "su3.h" +#endif + +/* PBND_C */ +extern void (*assign_s2w[8])(int *imb,int vol,spinor *s,weyl *r); +extern void (*add_assign_w2s[8])(int *imb,int vol,weyl *s,spinor *r); +extern void (*sub_assign_w2s[8])(int *imb,int vol,weyl *s,spinor *r); +extern void (*mulg5_sub_assign_w2s[8])(int *imb,int vol,weyl *s,spinor *r); + +/* PBND_DBLE_C */ +extern void (*assign_sd2wd[8])(int *imb,int vol,spinor_dble *sd, + weyl_dble *rd); +extern void (*add_assign_wd2sd[8])(int *imb,int vol,weyl_dble *sd, + spinor_dble *rd); +extern void
(*sub_assign_wd2sd[8])(int *imb,int vol,weyl_dble *sd, + spinor_dble *rd); +extern void (*mulg5_sub_assign_wd2sd[8])(int *imb,int vol,weyl_dble *sd, + spinor_dble *rd); + +/* SFLDS_C */ +extern void set_s2zero(int vol,spinor *s); +extern void set_sd2zero(int vol,spinor_dble *sd); +extern void random_s(int vol,spinor *s,float sigma); +extern void random_sd(int vol,spinor_dble *sd,double sigma); +extern void assign_s2s(int vol,spinor *s,spinor *r); +extern void assign_s2sd(int vol,spinor *s,spinor_dble *rd); +extern void assign_sd2s(int vol,spinor_dble *sd,spinor *r); +extern void assign_sd2sd(int vol,spinor_dble *sd,spinor_dble *rd); +extern void diff_s2s(int vol,spinor *s,spinor *r); +extern void add_s2sd(int vol,spinor *s,spinor_dble *rd); +extern void diff_sd2s(int vol,spinor_dble *sd,spinor_dble *rd,spinor *r); + +/* SCOM_C */ +extern void cps_int_bnd(int is,spinor *s); +extern void cps_ext_bnd(int is,spinor *s); + +/* SDCOM_C */ +extern void cpsd_int_bnd(int is,spinor_dble *sd); +extern void cpsd_ext_bnd(int is,spinor_dble *sd); + +#endif + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sse.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sse.h new file mode 100644 index 0000000000000000000000000000000000000000..20ed66b5b8c27aa940ccb5c9e75c26dccf3f3499 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sse.h @@ -0,0 +1,1356 @@ + +/******************************************************************************* +* +* File sse.h +* +* Copyright (C) 2005, 2008, 2009, 2011 Martin Luescher, Filippo Palombi +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Macros for Dirac spinors, SU(3) vectors and SU(3) matrices using inline +* assembly SSE3 instructions. The machine is assumed to comply with the +* x86-64 instruction set. 
+* +*******************************************************************************/ + +#ifndef SSE_H +#define SSE_H + +typedef struct +{ + int c1,c2,c3,c4; +} sse_int __attribute__ ((aligned (16))); + +typedef struct +{ + float c1,c2,c3,c4; +} sse_float __attribute__ ((aligned (16))); + +typedef struct +{ + sse_float c1,c2,c3; +} sse_vector __attribute__ ((aligned (16))); +/* _sse_sgnXY holds -1 in components X and Y and +1 elsewhere; _sse_sgn holds -1 in all four */ +static sse_float _sse_sgn12 __attribute__ ((unused)) ={-1.0f,-1.0f,1.0f,1.0f}; +static sse_float _sse_sgn13 __attribute__ ((unused)) ={-1.0f,1.0f,-1.0f,1.0f}; +static sse_float _sse_sgn14 __attribute__ ((unused)) ={-1.0f,1.0f,1.0f,-1.0f}; +static sse_float _sse_sgn23 __attribute__ ((unused)) ={1.0f,-1.0f,-1.0f,1.0f}; +static sse_float _sse_sgn24 __attribute__ ((unused)) ={1.0f,-1.0f,1.0f,-1.0f}; +static sse_float _sse_sgn34 __attribute__ ((unused)) ={1.0f,1.0f,-1.0f,-1.0f}; +static sse_float _sse_sgn __attribute__ ((unused)) ={-1.0f,-1.0f,-1.0f,-1.0f}; + +/******************************************************************************* +* +* Prefetch macros +* (variant chosen by the P4, PM or P3 macro; with none defined they expand to nothing) +*******************************************************************************/ + +#if (defined P4) +/* _pfbase clears the low 7 bits of addr, i.e. rounds it down to a multiple of 128 bytes */ +#define _pfbase(addr) ((unsigned long)(addr)&(~0x7fL)) + +#define _prefetch_128b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr))))) + +#define _prefetch_256b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x80L)))) + +#define _prefetch_384b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x80L))), \ + "m" (*((char*)(_pfbase(addr)+0x100L)))) + +#define _prefetch_su3_alg_dble(addr) \ +_prefetch_128b((addr)) + +#define _prefetch_weyl(addr) \ +_prefetch_256b((addr)) + +#define _prefetch_spinor(addr) \ +_prefetch_256b((addr)) + +#define _prefetch_su3(addr) \
+_prefetch_256b((addr)) + +#define _prefetch_pauli(addr) \ +_prefetch_256b((addr)) + +#define _prefetch_weyl_dble(addr) \ +_prefetch_256b((addr)) + +#define _prefetch_spinor_dble(addr) \ +_prefetch_256b((addr)) + +#define _prefetch_su3_dble(addr) \ +_prefetch_256b((addr)) + +#define _prefetch_pauli_dble(addr) \ +_prefetch_384b((addr)) + +#elif (defined PM) + +#define _pfbase(addr) ((unsigned long)(addr)&(~0x3fL)) + +#define _prefetch_64b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr))))) + +#define _prefetch_128b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x40L)))) + +#define _prefetch_192b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x40L))), \ + "m" (*((char*)(_pfbase(addr)+0x80L)))) + +#define _prefetch_320b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2 \n\t" \ + "prefetcht0 %3 \n\t" \ + "prefetcht0 %4" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x40L))), \ + "m" (*((char*)(_pfbase(addr)+0x80L))), \ + "m" (*((char*)(_pfbase(addr)+0xc0L))), \ + "m" (*((char*)(_pfbase(addr)+0x100L)))) + +#define _prefetch_su3_alg_dble(addr) \ +_prefetch_64b((addr)) + +#define _prefetch_weyl(addr) \ +_prefetch_64b((addr)) + +#define _prefetch_spinor(addr) \ +_prefetch_128b((addr)) + +#define _prefetch_su3(addr) \ +_prefetch_128b((addr)) + +#define _prefetch_pauli(addr) \ +_prefetch_192b((addr)) + +#define _prefetch_weyl_dble(addr) \ +_prefetch_128b((addr)) + +#define _prefetch_spinor_dble(addr) \ +_prefetch_192b((addr)) + +#define _prefetch_su3_dble(addr) \ +_prefetch_192b((addr)) + +#define _prefetch_pauli_dble(addr) \ +_prefetch_320b((addr)) + +#elif (defined P3) + +#define _pfbase(addr) ((unsigned 
long)(addr)&(~0x1fL)) + +#define _prefetch_64b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x20L)))) + +#define _prefetch_96b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x20L))), \ + "m" (*((char*)(_pfbase(addr)+0x40L)))) + +#define _prefetch_160b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2 \n\t" \ + "prefetcht0 %3 \n\t" \ + "prefetcht0 %4" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x20L))), \ + "m" (*((char*)(_pfbase(addr)+0x40L))), \ + "m" (*((char*)(_pfbase(addr)+0x60L))), \ + "m" (*((char*)(_pfbase(addr)+0x80L)))) + +#define _prefetch_192b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2 \n\t" \ + "prefetcht0 %3 \n\t" \ + "prefetcht0 %4 \n\t" \ + "prefetcht0 %5" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x20L))), \ + "m" (*((char*)(_pfbase(addr)+0x40L))), \ + "m" (*((char*)(_pfbase(addr)+0x60L))), \ + "m" (*((char*)(_pfbase(addr)+0x80L))), \ + "m" (*((char*)(_pfbase(addr)+0xa0L)))) + +#define _prefetch_288b(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2 \n\t" \ + "prefetcht0 %3 \n\t" \ + "prefetcht0 %4" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)))), \ + "m" (*((char*)(_pfbase(addr)+0x20L))), \ + "m" (*((char*)(_pfbase(addr)+0x40L))), \ + "m" (*((char*)(_pfbase(addr)+0x60L))), \ + "m" (*((char*)(_pfbase(addr)+0x80L)))); \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2 \n\t" \ + "prefetcht0 %3" \ + : \ + : \ + "m" (*((char*)(_pfbase(addr)+0xa0L))), \ + "m" (*((char*)(_pfbase(addr)+0xc0L))), \ + "m" (*((char*)(_pfbase(addr)+0xe0L))), \ + "m" 
(*((char*)(_pfbase(addr)+0x100L)))) + +#define _prefetch_su3_alg_dble(addr) \ +_prefetch_64b((addr)) + +#define _prefetch_weyl(addr) \ +_prefetch_64b((addr)) + +#define _prefetch_spinor(addr) \ +_prefetch_96b((addr)) + +#define _prefetch_su3(addr) \ +_prefetch_96b((addr)) + +#define _prefetch_pauli(addr) \ +_prefetch_160b((addr)) + +#define _prefetch_weyl_dble(addr) \ +_prefetch_96b((addr)) + +#define _prefetch_spinor_dble(addr) \ +_prefetch_192b((addr)) + +#define _prefetch_su3_dble(addr) \ +_prefetch_160b((addr)) + +#define _prefetch_pauli_dble(addr) \ +_prefetch_288b((addr)) + +#else + +#define _prefetch_su3_alg_dble(addr) + +#define _prefetch_weyl(addr) + +#define _prefetch_spinor(addr) + +#define _prefetch_su3(addr) + +#define _prefetch_pauli(addr) + +#define _prefetch_weyl_dble(addr) + +#define _prefetch_spinor_dble(addr) + +#define _prefetch_su3_dble(addr) + +#define _prefetch_pauli_dble(addr) + +#endif + +/******************************************************************************* +* +* Macros for su3_vector data +* +* Most of these macros operate on pairs of su3 vectors that are stored +* in the low and high words of xmm0,xmm1,xmm2 or xmm3,xmm4,xmm5. For example, +* +* xmm0 -> sl.c1.re,sl.c1.im,sh.c1.re,sh.c1.im +* xmm1 -> sl.c2.re,sl.c2.im,sh.c2.re,sh.c2.im +* xmm2 -> sl.c3.re,sl.c3.im,sh.c3.re,sh.c3.im +* +* (where sl and sh are of type su3_vector). 
This can also be interpreted as +* an sse_vector s that is stored in these registers according to +* +* xmm0 -> s.c1.c1,s.c1.c2,s.c1.c3,s.c1.c4 +* xmm1 -> s.c2.c1,s.c2.c2,s.c2.c3,s.c2.c4 +* xmm2 -> s.c3.c1,s.c3.c2,s.c3.c3,s.c3.c4 +* +* The load and store macros can be used to move data in either format +* from and to the xmm registers +* +*******************************************************************************/ + +/* +* Loads two su3 vectors sl and sh to the low and high words of xmm0,xmm1,xmm2 +*/ + +#define _sse_pair_load(sl,sh) \ +__asm__ __volatile__ ("movsd %0, %%xmm0 \n\t" \ + "movsd %1, %%xmm1 \n\t" \ + "movsd %2, %%xmm2 \n\t" \ + "movhps %3, %%xmm0 \n\t" \ + "movhps %4, %%xmm1 \n\t" \ + "movhps %5, %%xmm2" \ + : \ + : \ + "m" ((sl).c1), \ + "m" ((sl).c2), \ + "m" ((sl).c3), \ + "m" ((sh).c1), \ + "m" ((sh).c2), \ + "m" ((sh).c3) \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Loads two su3 vectors sl and sh to the low and high words of xmm3,xmm4,xmm5 +*/ + +#define _sse_pair_load_up(sl,sh) \ +__asm__ __volatile__ ("movsd %0, %%xmm3 \n\t" \ + "movsd %1, %%xmm4 \n\t" \ + "movsd %2, %%xmm5 \n\t" \ + "movhps %3, %%xmm3 \n\t" \ + "movhps %4, %%xmm4 \n\t" \ + "movhps %5, %%xmm5" \ + : \ + : \ + "m" ((sl).c1), \ + "m" ((sl).c2), \ + "m" ((sl).c3), \ + "m" ((sh).c1), \ + "m" ((sh).c2), \ + "m" ((sh).c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Stores the low and high words of xmm0,xmm1,xmm2 to the su3 vectors rl and rh +*/ + +#define _sse_pair_store(rl,rh) \ +__asm__ __volatile__ ("movlps %%xmm0, %0 \n\t" \ + "movlps %%xmm1, %1 \n\t" \ + "movlps %%xmm2, %2 \n\t" \ + "movhps %%xmm0, %3 \n\t" \ + "movhps %%xmm1, %4 \n\t" \ + "movhps %%xmm2, %5" \ + : \ + "=m" ((rl).c1), \ + "=m" ((rl).c2), \ + "=m" ((rl).c3), \ + "=m" ((rh).c1), \ + "=m" ((rh).c2), \ + "=m" ((rh).c3)) + +/* +* Stores the low and high words of xmm3,xmm4,xmm5 to the su3 vectors rl and rh +*/ + +#define _sse_pair_store_up(rl,rh) \ +__asm__ __volatile__ ("movlps %%xmm3, %0 \n\t" \ + "movlps %%xmm4, 
%1 \n\t" \ + "movlps %%xmm5, %2 \n\t" \ + "movhps %%xmm3, %3 \n\t" \ + "movhps %%xmm4, %4 \n\t" \ + "movhps %%xmm5, %5" \ + : \ + "=m" ((rl).c1), \ + "=m" ((rl).c2), \ + "=m" ((rl).c3), \ + "=m" ((rh).c1), \ + "=m" ((rh).c2), \ + "=m" ((rh).c3)) + +/* +* Loads the components of a Weyl spinor s to xmm0,xmm1,xmm2 +*/ + +#define _sse_weyl_load(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" \ + "movaps %2, %%xmm1 \n\t" \ + "movaps %4, %%xmm2" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Loads the components of a Weyl spinor s to xmm3,xmm4,xmm5 +*/ + +#define _sse_weyl_load_up(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm3 \n\t" \ + "movaps %2, %%xmm4 \n\t" \ + "movaps %4, %%xmm5" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Stores xmm0,xmm1,xmm2 to the components of a Weyl spinor s +*/ + +#define _sse_weyl_store(s) \ +__asm__ __volatile__ ("movaps %%xmm0, %0 \n\t" \ + "movaps %%xmm1, %2 \n\t" \ + "movaps %%xmm2, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)) + +/* +* Stores xmm3,xmm4,xmm5 to the components of a Weyl spinor s +*/ + +#define _sse_weyl_store_up(s) \ +__asm__ __volatile__ ("movaps %%xmm3, %0 \n\t" \ + "movaps %%xmm4, %2 \n\t" \ + "movaps %%xmm5, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)) + +/* +* Adds xmm3,xmm4,xmm5 to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_add() \ +__asm__ __volatile__ ("addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Subtracts xmm3,xmm4,xmm5 from xmm0,xmm1,xmm2 +*/ + +#define 
_sse_vector_sub() \ +__asm__ __volatile__ ("subps %%xmm3, %%xmm0 \n\t" \ + "subps %%xmm4, %%xmm1 \n\t" \ + "subps %%xmm5, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies the high words xmm3,xmm4,xmm5 with -1 and adds these registers +* to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_addsub() \ +__asm__ __volatile__ ("mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn34) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the low words xmm3,xmm4,xmm5 with -1 and adds these registers +* to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_subadd() \ +__asm__ __volatile__ ("mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn12) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies xmm3,xmm4,xmm5 with i and adds them to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_i_add() \ +__asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ + "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ + "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ + "addsubps %%xmm3, %%xmm0 \n\t" \ + "addsubps %%xmm4, %%xmm1 \n\t" \ + "addsubps %%xmm5, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies xmm3,xmm4,xmm5 with i and subtracts them from xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_i_sub() \ +__asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ + "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ + "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ + "mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn24) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* 
Exchanges the high and low words of xmm3,xmm4,xmm5, multiplies them with i +* and adds the result to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_xch_i_add() \ +__asm__ __volatile__ ("shufps $0x1b, %%xmm3, %%xmm3 \n\t" \ + "shufps $0x1b, %%xmm4, %%xmm4 \n\t" \ + "shufps $0x1b, %%xmm5, %%xmm5 \n\t" \ + "addsubps %%xmm3, %%xmm0 \n\t" \ + "addsubps %%xmm4, %%xmm1 \n\t" \ + "addsubps %%xmm5, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low words of xmm3,xmm4,xmm5, multiplies them with i +* and subtracts the result from xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_xch_i_sub() \ +__asm__ __volatile__ ("shufps $0x1b, %%xmm3, %%xmm3 \n\t" \ + "shufps $0x1b, %%xmm4, %%xmm4 \n\t" \ + "shufps $0x1b, %%xmm5, %%xmm5 \n\t" \ + "mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn24) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the low and high words of xmm3,xmm4,xmm5 with i and -i +* respectively and adds these registers to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_i_addsub() \ +__asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ + "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ + "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ + "mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn14) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the low and high words of xmm3,xmm4,xmm5 with -i and i +* respectively and adds these registers to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_i_subadd() \ +__asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ + "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ + "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ + "mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 
\n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn23) \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Exchanges the high and low words in xmm3,xmm4,xmm5 +*/ + +#define _sse_vector_xch() \ +__asm__ __volatile__ ("shufps $0x4e, %%xmm3, %%xmm3 \n\t" \ + "shufps $0x4e, %%xmm4, %%xmm4 \n\t" \ + "shufps $0x4e, %%xmm5, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/****************************************************************************** +* +* Action of su3 matrices on su3 vectors +* +******************************************************************************/ + +/* +* Multiplies a pair sl,sh of su3 vectors with an su3 matrix u, +* assuming sl and sh are in the low and high words of xmm0,xmm1,xmm2 +* +* On output the result is in xmm3,xmm4,xmm5 and the registers +* xmm0,xmm1,xmm2 are changed +*/ + +#define _sse_su3_multiply(u) \ +__asm__ __volatile__ ("movss %0, %%xmm3 \n\t" \ + "movss %1, %%xmm6 \n\t" \ + "movss %2, %%xmm4 \n\t" \ + "movss %3, %%xmm7 \n\t" \ + "movss %4, %%xmm5 \n\t" \ + "movss %5, %%xmm8 \n\t" \ + "shufps $0x0, %%xmm3, %%xmm3 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm4, %%xmm4 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm5, %%xmm5 \n\t" \ + "shufps $0x0, %%xmm8, %%xmm8" \ + : \ + : \ + "m" ((u).c11.re), \ + "m" ((u).c12.re), \ + "m" ((u).c21.re), \ + "m" ((u).c22.re), \ + "m" ((u).c31.re), \ + "m" ((u).c32.re) \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("mulps %%xmm0, %%xmm3 \n\t" \ + "mulps %%xmm1, %%xmm6 \n\t" \ + "mulps %%xmm0, %%xmm4 \n\t" \ + "mulps %%xmm1, %%xmm7 \n\t" \ + "mulps %%xmm0, %%xmm5 \n\t" \ + "mulps %%xmm1, %%xmm8 \n\t" \ + "addps %%xmm6, %%xmm3 \n\t" \ + "addps %%xmm7, %%xmm4 \n\t" \ + "addps %%xmm8, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ 
__volatile__ ("movss %0, %%xmm9 \n\t" \ + "movss %1, %%xmm10 \n\t" \ + "movss %2, %%xmm11 \n\t" \ + "movss %3, %%xmm6 \n\t" \ + "movss %4, %%xmm7 \n\t" \ + "movss %5, %%xmm8 \n\t" \ + "shufps $0xb1, %%xmm0, %%xmm0 \n\t" \ + "shufps $0x0, %%xmm9, %%xmm9 \n\t" \ + "shufps $0x0, %%xmm10, %%xmm10 \n\t" \ + "shufps $0x0, %%xmm11, %%xmm11 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm8, %%xmm8" \ + : \ + : \ + "m" ((u).c13.re), \ + "m" ((u).c21.im), \ + "m" ((u).c33.re), \ + "m" ((u).c11.im), \ + "m" ((u).c23.re), \ + "m" ((u).c31.im) \ + : \ + "xmm0", "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("mulps %%xmm2, %%xmm9 \n\t" \ + "mulps %%xmm0, %%xmm10 \n\t" \ + "mulps %%xmm2, %%xmm11 \n\t" \ + "mulps %%xmm0, %%xmm6 \n\t" \ + "mulps %%xmm2, %%xmm7 \n\t" \ + "mulps %%xmm0, %%xmm8 \n\t" \ + "addps %%xmm9, %%xmm3 \n\t" \ + "addsubps %%xmm10, %%xmm4 \n\t" \ + "addps %%xmm11, %%xmm5 \n\t" \ + "addsubps %%xmm6, %%xmm3 \n\t" \ + "addps %%xmm7, %%xmm4 \n\t" \ + "addsubps %%xmm8, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("movss %0, %%xmm9 \n\t" \ + "movss %1, %%xmm10 \n\t" \ + "movss %2, %%xmm11 \n\t" \ + "movss %3, %%xmm6 \n\t" \ + "movss %4, %%xmm7 \n\t" \ + "movss %5, %%xmm8 \n\t" \ + "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \ + "shufps $0xb1, %%xmm2, %%xmm2 \n\t" \ + "shufps $0x0, %%xmm9, %%xmm9 \n\t" \ + "shufps $0x0, %%xmm10, %%xmm10 \n\t" \ + "shufps $0x0, %%xmm11, %%xmm11 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm8, %%xmm8" \ + : \ + : \ + "m" ((u).c12.im), \ + "m" ((u).c23.im), \ + "m" ((u).c32.im), \ + "m" ((u).c13.im), \ + "m" ((u).c22.im), \ + "m" ((u).c33.im) \ + : \ + "xmm1", "xmm2", "xmm6", "xmm7", \ + "xmm8", "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("mulps %%xmm1, %%xmm9 \n\t" \ + "mulps %%xmm2, %%xmm10 
\n\t" \ + "mulps %%xmm1, %%xmm11 \n\t" \ + "mulps %%xmm2, %%xmm6 \n\t" \ + "mulps %%xmm1, %%xmm7 \n\t" \ + "mulps %%xmm2, %%xmm8 \n\t" \ + "addsubps %%xmm9, %%xmm3 \n\t" \ + "addsubps %%xmm10, %%xmm4 \n\t" \ + "addsubps %%xmm11, %%xmm5 \n\t" \ + "addsubps %%xmm6, %%xmm3 \n\t" \ + "addsubps %%xmm7, %%xmm4 \n\t" \ + "addsubps %%xmm8, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +/* +* Multiplies a pair sl,sh of su3 vectors with an su3 matrix u^dagger, +* assuming sl and sh are in the low and high words of xmm0,xmm1,xmm2 +* +* On output the result is in xmm3,xmm4,xmm5 and the registers +* xmm0,xmm1,xmm2 are changed +*/ + +#define _sse_su3_inverse_multiply(u) \ +__asm__ __volatile__ ("movss %0, %%xmm6 \n\t" \ + "movss %1, %%xmm9 \n\t" \ + "movss %2, %%xmm7 \n\t" \ + "movss %3, %%xmm10 \n\t" \ + "movss %4, %%xmm8 \n\t" \ + "movss %5, %%xmm11 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm9, %%xmm9 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm10, %%xmm10 \n\t" \ + "shufps $0x0, %%xmm8, %%xmm8 \n\t" \ + "shufps $0x0, %%xmm11, %%xmm11" \ + : \ + : \ + "m" ((u).c11.im), \ + "m" ((u).c21.im), \ + "m" ((u).c12.im), \ + "m" ((u).c22.im), \ + "m" ((u).c13.im), \ + "m" ((u).c23.im) \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("mulps %%xmm0, %%xmm6 \n\t" \ + "mulps %%xmm1, %%xmm9 \n\t" \ + "mulps %%xmm0, %%xmm7 \n\t" \ + "mulps %%xmm1, %%xmm10 \n\t" \ + "mulps %%xmm0, %%xmm8 \n\t" \ + "mulps %%xmm1, %%xmm11 \n\t" \ + "addps %%xmm6, %%xmm9 \n\t" \ + "addps %%xmm7, %%xmm10 \n\t" \ + "addps %%xmm8, %%xmm11" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("movss %0, %%xmm3 \n\t" \ + "movss %1, %%xmm4 \n\t" \ + "movss %2, %%xmm5 \n\t" \ + "movss %3, %%xmm6 \n\t" \ + "movss %4, %%xmm7 \n\t" \ + "movss %5, %%xmm8 \n\t" \ + "shufps $0xb1, %%xmm0, %%xmm0 \n\t" \ + "shufps $0x0, 
%%xmm3, %%xmm3 \n\t" \ + "shufps $0x0, %%xmm4, %%xmm4 \n\t" \ + "shufps $0x0, %%xmm5, %%xmm5 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm8, %%xmm8" \ + : \ + : \ + "m" ((u).c11.re), \ + "m" ((u).c12.re), \ + "m" ((u).c13.re), \ + "m" ((u).c31.im), \ + "m" ((u).c32.im), \ + "m" ((u).c33.im) \ + : \ + "xmm0", "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("mulps %%xmm0, %%xmm3 \n\t" \ + "mulps %%xmm0, %%xmm4 \n\t" \ + "mulps %%xmm0, %%xmm5 \n\t" \ + "mulps %%xmm2, %%xmm6 \n\t" \ + "mulps %%xmm2, %%xmm7 \n\t" \ + "mulps %%xmm2, %%xmm8 \n\t" \ + "addsubps %%xmm9, %%xmm3 \n\t" \ + "addsubps %%xmm10, %%xmm4 \n\t" \ + "addsubps %%xmm11, %%xmm5 \n\t" \ + "addsubps %%xmm6, %%xmm3 \n\t" \ + "addsubps %%xmm7, %%xmm4 \n\t" \ + "addsubps %%xmm8, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("movss %0, %%xmm9 \n\t" \ + "movss %1, %%xmm10 \n\t" \ + "movss %2, %%xmm11 \n\t" \ + "movss %3, %%xmm6 \n\t" \ + "movss %4, %%xmm7 \n\t" \ + "movss %5, %%xmm8 \n\t" \ + "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \ + "shufps $0xb1, %%xmm2, %%xmm2 \n\t" \ + "shufps $0x0, %%xmm9, %%xmm9 \n\t" \ + "shufps $0x0, %%xmm10, %%xmm10 \n\t" \ + "shufps $0x0, %%xmm11, %%xmm11 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm8, %%xmm8" \ + : \ + : \ + "m" ((u).c21.re), \ + "m" ((u).c32.re), \ + "m" ((u).c23.re), \ + "m" ((u).c31.re), \ + "m" ((u).c22.re), \ + "m" ((u).c33.re) \ + : \ + "xmm1", "xmm2", "xmm6", "xmm7", \ + "xmm8", "xmm9", "xmm10", "xmm11"); \ +__asm__ __volatile__ ("mulps %%xmm1, %%xmm9 \n\t" \ + "mulps %%xmm2, %%xmm10 \n\t" \ + "mulps %%xmm1, %%xmm11 \n\t" \ + "mulps %%xmm2, %%xmm6 \n\t" \ + "mulps %%xmm1, %%xmm7 \n\t" \ + "mulps %%xmm2, %%xmm8 \n\t" \ + "addps %%xmm9, %%xmm3 \n\t" \ + "addps %%xmm10, %%xmm4 \n\t" \ + "addps %%xmm11, %%xmm5 \n\t" \ + "addps %%xmm6, %%xmm3 \n\t" \ + 
"addps %%xmm7, %%xmm4 \n\t" \ + "addps %%xmm8, %%xmm5 \n\t" \ + "shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ + "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ + "shufps $0xb1, %%xmm5, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8", \ + "xmm9", "xmm10", "xmm11") + +/****************************************************************************** +* +* Macros for Dirac spinors +* +******************************************************************************/ + +/* +* Loads the spinor s to the registers xmm0,..,xmm5 in linear order +*/ + +#define _sse_spinor_load(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" \ + "movaps %2, %%xmm1 \n\t" \ + "movaps %4, %%xmm2" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm0", "xmm1", "xmm2"); \ +__asm__ __volatile__ ("movaps %0, %%xmm3 \n\t" \ + "movaps %2, %%xmm4 \n\t" \ + "movaps %4, %%xmm5" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Loads the spinor s to the registers xmm6,..,xmm11 in linear order +*/ + +#define _sse_spinor_load_up(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm6 \n\t" \ + "movaps %2, %%xmm7 \n\t" \ + "movaps %4, %%xmm8" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("movaps %0, %%xmm9 \n\t" \ + "movaps %2, %%xmm10 \n\t" \ + "movaps %4, %%xmm11" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm9", "xmm10", "xmm11") + +/* +* Stores the registers xmm0,..,xmm5 to the spinor s in linear order +*/ + +#define _sse_spinor_store(s) \ +__asm__ __volatile__ ("movaps %%xmm0, %0 \n\t" \ + "movaps %%xmm1, %2 \n\t" \ + 
"movaps %%xmm2, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)); \ +__asm__ __volatile__ ("movaps %%xmm3, %0 \n\t" \ + "movaps %%xmm4, %2 \n\t" \ + "movaps %%xmm5, %4" \ + : \ + "=m" ((s).c3.c1), \ + "=m" ((s).c3.c2), \ + "=m" ((s).c3.c3), \ + "=m" ((s).c4.c1), \ + "=m" ((s).c4.c2), \ + "=m" ((s).c4.c3)) + +/* +* Stores the registers xmm6,..,xmm11 to the spinor s in linear order +*/ + +#define _sse_spinor_store_up(s) \ +__asm__ __volatile__ ("movaps %%xmm6, %0 \n\t" \ + "movaps %%xmm7, %2 \n\t" \ + "movaps %%xmm8, %4" \ + : \ + "=m" ((s).c1.c1), \ + "=m" ((s).c1.c2), \ + "=m" ((s).c1.c3), \ + "=m" ((s).c2.c1), \ + "=m" ((s).c2.c2), \ + "=m" ((s).c2.c3)); \ +__asm__ __volatile__ ("movaps %%xmm9, %0 \n\t" \ + "movaps %%xmm10, %2 \n\t" \ + "movaps %%xmm11, %4" \ + : \ + "=m" ((s).c3.c1), \ + "=m" ((s).c3.c2), \ + "=m" ((s).c3.c3), \ + "=m" ((s).c4.c1), \ + "=m" ((s).c4.c2), \ + "=m" ((s).c4.c3)) + +/* +* Loads (z.re,z.re,z.re,z.re) to xmm6 and (-z.im,z.im,-z.im,z.im) to xmm7 +*/ + +#define _sse_load_cmplx(z) \ +__asm__ __volatile__ ("movss %0, %%xmm6 \n\t" \ + "movss %1, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %2, %%xmm7" \ + : \ + : \ + "m" ((z).re), \ + "m" ((z).im), \ + "m" (_sse_sgn13) \ + : \ + "xmm6", "xmm7") + +/* +* Multiplies the spinor s by the complex number z and assigns the result to +* xmm0,..,xmm5, assuming z was loaded to xmm6,xmm7 using _sse_load_cmplx(z) +*/ + +#define _sse_mulc_spinor(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" \ + "movaps %2, %%xmm1 \n\t" \ + "movaps %4, %%xmm2" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm0", "xmm1", "xmm2"); \ +__asm__ __volatile__ ("movaps %%xmm0, %%xmm8 \n\t" \ + "movaps %%xmm1, %%xmm9 \n\t" \ + "movaps %%xmm2, %%xmm10 \n\t" \ + "mulps %%xmm6, 
%%xmm0 \n\t" \ + "mulps %%xmm6, %%xmm1 \n\t" \ + "mulps %%xmm6, %%xmm2 \n\t" \ + "shufps $0xb1, %%xmm8, %%xmm8 \n\t" \ + "shufps $0xb1, %%xmm9, %%xmm9 \n\t" \ + "shufps $0xb1, %%xmm10, %%xmm10 \n\t" \ + "mulps %%xmm7, %%xmm8 \n\t" \ + "mulps %%xmm7, %%xmm9 \n\t" \ + "mulps %%xmm7, %%xmm10 \n\t" \ + "addps %%xmm8, %%xmm0 \n\t" \ + "addps %%xmm9, %%xmm1 \n\t" \ + "addps %%xmm10, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm8", "xmm9", "xmm10"); \ +__asm__ __volatile__ ("movaps %0, %%xmm3 \n\t" \ + "movaps %2, %%xmm4 \n\t" \ + "movaps %4, %%xmm5" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm3", "xmm4", "xmm5"); \ +__asm__ __volatile__ ("movaps %%xmm3, %%xmm11 \n\t" \ + "movaps %%xmm4, %%xmm12 \n\t" \ + "movaps %%xmm5, %%xmm13 \n\t" \ + "mulps %%xmm6, %%xmm3 \n\t" \ + "mulps %%xmm6, %%xmm4 \n\t" \ + "mulps %%xmm6, %%xmm5 \n\t" \ + "shufps $0xb1, %%xmm11, %%xmm11 \n\t" \ + "shufps $0xb1, %%xmm12, %%xmm12 \n\t" \ + "shufps $0xb1, %%xmm13, %%xmm13 \n\t" \ + "mulps %%xmm7, %%xmm11 \n\t" \ + "mulps %%xmm7, %%xmm12 \n\t" \ + "mulps %%xmm7, %%xmm13 \n\t" \ + "addps %%xmm11, %%xmm3 \n\t" \ + "addps %%xmm12, %%xmm4 \n\t" \ + "addps %%xmm13, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm11", "xmm12", "xmm13") + + +/* +* Multiplies the spinor s by the complex number z and adds the result to +* xmm0,..,xmm5, assuming z was loaded to xmm6,xmm7 using _sse_load_cmplx(z) +*/ + +#define _sse_mulc_spinor_add(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm8 \n\t" \ + "movaps %2, %%xmm9 \n\t" \ + "movaps %4, %%xmm10" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm8", "xmm9", "xmm10"); \ +__asm__ __volatile__ ("movaps %%xmm8, %%xmm11 \n\t" \ + "movaps %%xmm9, %%xmm12 \n\t" \ + "movaps %%xmm10, %%xmm13 \n\t" \ + "mulps %%xmm6, %%xmm8 \n\t" \ + "mulps 
%%xmm6, %%xmm9 \n\t" \ + "mulps %%xmm6, %%xmm10 \n\t" \ + "shufps $0xb1, %%xmm11, %%xmm11 \n\t" \ + "shufps $0xb1, %%xmm12, %%xmm12 \n\t" \ + "shufps $0xb1, %%xmm13, %%xmm13 \n\t" \ + "addps %%xmm8, %%xmm0 \n\t" \ + "addps %%xmm9, %%xmm1 \n\t" \ + "addps %%xmm10, %%xmm2 \n\t" \ + "mulps %%xmm7, %%xmm11 \n\t" \ + "mulps %%xmm7, %%xmm12 \n\t" \ + "mulps %%xmm7, %%xmm13 \n\t" \ + "addps %%xmm11, %%xmm0 \n\t" \ + "addps %%xmm12, %%xmm1 \n\t" \ + "addps %%xmm13, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm8", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13"); \ +__asm__ __volatile__ ("movaps %0, %%xmm8 \n\t" \ + "movaps %2, %%xmm9 \n\t" \ + "movaps %4, %%xmm10" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm8", "xmm9", "xmm10"); \ +__asm__ __volatile__ ("movaps %%xmm8, %%xmm11 \n\t" \ + "movaps %%xmm9, %%xmm12 \n\t" \ + "movaps %%xmm10, %%xmm13 \n\t" \ + "mulps %%xmm6, %%xmm8 \n\t" \ + "mulps %%xmm6, %%xmm9 \n\t" \ + "mulps %%xmm6, %%xmm10 \n\t" \ + "shufps $0xb1, %%xmm11, %%xmm11 \n\t" \ + "shufps $0xb1, %%xmm12, %%xmm12 \n\t" \ + "shufps $0xb1, %%xmm13, %%xmm13 \n\t" \ + "addps %%xmm8, %%xmm3 \n\t" \ + "addps %%xmm9, %%xmm4 \n\t" \ + "addps %%xmm10, %%xmm5 \n\t" \ + "mulps %%xmm7, %%xmm11 \n\t" \ + "mulps %%xmm7, %%xmm12 \n\t" \ + "mulps %%xmm7, %%xmm13 \n\t" \ + "addps %%xmm11, %%xmm3 \n\t" \ + "addps %%xmm12, %%xmm4 \n\t" \ + "addps %%xmm13, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm8", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13") + +/* +* Loads (c,c,c,c) to xmm6 and xmm7 +*/ + +#define _sse_load_real(c) \ +__asm__ __volatile__ ("movss %0, %%xmm6 \n\t" \ + "movss %0, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7" \ + : \ + : \ + "m" (c) \ + : \ + "xmm6", "xmm7") + +/* +* Multiplies the spinor s by the real number c and assigns the result to +* xmm0,..,xmm5, assuming c was loaded to 
xmm6,xmm7 using _sse_load_real(c) +*/ + +#define _sse_mulr_spinor(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" \ + "movaps %2, %%xmm1 \n\t" \ + "movaps %4, %%xmm2" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm0", "xmm1", "xmm2"); \ +__asm__ __volatile__ ("mulps %%xmm6, %%xmm0 \n\t" \ + "mulps %%xmm7, %%xmm1 \n\t" \ + "mulps %%xmm6, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2"); \ +__asm__ __volatile__ ("movaps %0, %%xmm3 \n\t" \ + "movaps %2, %%xmm4 \n\t" \ + "movaps %4, %%xmm5" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm3", "xmm4", "xmm5"); \ +__asm__ __volatile__ ("mulps %%xmm7, %%xmm3 \n\t" \ + "mulps %%xmm6, %%xmm4 \n\t" \ + "mulps %%xmm7, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies the spinor s by the real number c and adds the result to +* xmm0,..,xmm5, assuming c was loaded to xmm6,xmm7 using _sse_load_real(c) +*/ + +#define _sse_mulr_spinor_add(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm8 \n\t" \ + "movaps %2, %%xmm9 \n\t" \ + "movaps %4, %%xmm10" \ + : \ + : \ + "m" ((s).c1.c1), \ + "m" ((s).c1.c2), \ + "m" ((s).c1.c3), \ + "m" ((s).c2.c1), \ + "m" ((s).c2.c2), \ + "m" ((s).c2.c3) \ + : \ + "xmm8", "xmm9", "xmm10"); \ +__asm__ __volatile__ ("mulps %%xmm6, %%xmm8 \n\t" \ + "mulps %%xmm7, %%xmm9 \n\t" \ + "mulps %%xmm6, %%xmm10 \n\t" \ + "addps %%xmm8, %%xmm0 \n\t" \ + "addps %%xmm9, %%xmm1 \n\t" \ + "addps %%xmm10, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm8", "xmm9", "xmm10"); \ +__asm__ __volatile__ ("movaps %0, %%xmm11 \n\t" \ + "movaps %2, %%xmm12 \n\t" \ + "movaps %4, %%xmm13" \ + : \ + : \ + "m" ((s).c3.c1), \ + "m" ((s).c3.c2), \ + "m" ((s).c3.c3), \ + "m" ((s).c4.c1), \ + "m" ((s).c4.c2), \ + "m" ((s).c4.c3) \ + : \ + "xmm11", "xmm12", "xmm13"); \ +__asm__ __volatile__ 
("mulps %%xmm7, %%xmm11 \n\t" \ + "mulps %%xmm6, %%xmm12 \n\t" \ + "mulps %%xmm7, %%xmm13 \n\t" \ + "addps %%xmm11, %%xmm3 \n\t" \ + "addps %%xmm12, %%xmm4 \n\t" \ + "addps %%xmm13, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm11", "xmm12", "xmm13") + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sse2.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sse2.h new file mode 100644 index 0000000000000000000000000000000000000000..a0c7fb168ad71ff55102b16e83e0cf8f75e4080e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sse2.h @@ -0,0 +1,659 @@ + +/******************************************************************************* +* +* File sse2.h +* +* Copyright (C) 2005, 2008, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Macros for Dirac spinors, SU(3) vectors and SU(3) matrices using inline +* assembly SSE3 instructions. The machine is assumed to comply with the +* x86-64 instruction set. +* +*******************************************************************************/ + +#ifndef SSE2_H +#define SSE2_H + +#ifndef SSE_H +#include "sse.h" +#endif + +typedef struct +{ + double c1,c2; +} sse_double __attribute__ ((aligned (16))); + +static sse_double _sse_sgn1_dble __attribute__ ((unused)) ={-1.0,1.0}; +static sse_double _sse_sgn2_dble __attribute__ ((unused)) ={1.0,-1.0}; +static sse_double _sse_sgn_dble __attribute__ ((unused)) ={-1.0,-1.0}; + +/******************************************************************************* +* +* Macros for double-precision su3 vectors +* +* Most of these macros operate on su3 vectors that are stored +* in xmm0,xmm1,xmm2 or xmm3,xmm4,xmm5. 
For example, +* +* xmm0 -> s.c1.re,s.c1.im +* xmm1 -> s.c2.re,s.c2.im +* xmm2 -> s.c3.re,s.c3.im +* +* where s is of type su3_vector_dble +* +*******************************************************************************/ + +/* +* Loads an su3 vector s to xmm0,xmm1,xmm2 +*/ + +#define _sse_load_dble(s) \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((s).c1), \ + "m" ((s).c2), \ + "m" ((s).c3) \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Loads an su3 vector s to xmm3,xmm4,xmm5 +*/ + +#define _sse_load_up_dble(s) \ +__asm__ __volatile__ ("movapd %0, %%xmm3 \n\t" \ + "movapd %1, %%xmm4 \n\t" \ + "movapd %2, %%xmm5" \ + : \ + : \ + "m" ((s).c1), \ + "m" ((s).c2), \ + "m" ((s).c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Stores xmm0,xmm1,xmm2 to the components r.c1,r.c2,r.c3 of an su3 vector +*/ + +#define _sse_store_dble(r) \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2" \ + : \ + "=m" ((r).c1), \ + "=m" ((r).c2), \ + "=m" ((r).c3)) + +/* +* Stores xmm3,xmm4,xmm5 to the components r.c1,r.c2,r.c3 of an su3 vector +*/ + +#define _sse_store_up_dble(r) \ +__asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" \ + "movapd %%xmm4, %1 \n\t" \ + "movapd %%xmm5, %2" \ + : \ + "=m" ((r).c1), \ + "=m" ((r).c2), \ + "=m" ((r).c3)) + +/* +* Multiplies xmm0,xmm1,xmm2 with a constant sse_double c +*/ + +#define _sse_vector_mul_dble(c) \ +__asm__ __volatile__ ("mulpd %0, %%xmm0 \n\t" \ + "mulpd %0, %%xmm1 \n\t" \ + "mulpd %0, %%xmm2" \ + : \ + : \ + "m" (c) \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies xmm3,xmm4,xmm5 with a constant sse_double c +*/ + +#define _sse_vector_mul_up_dble(c) \ +__asm__ __volatile__ ("mulpd %0, %%xmm3 \n\t" \ + "mulpd %0, %%xmm4 \n\t" \ + "mulpd %0, %%xmm5" \ + : \ + : \ + "m" (c) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Adds xmm3,xmm4,xmm5 to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_add_dble() \ +__asm__ __volatile__ 
("addpd %%xmm3, %%xmm0 \n\t" \ + "addpd %%xmm4, %%xmm1 \n\t" \ + "addpd %%xmm5, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Subtracts xmm3,xmm4,xmm5 from xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_sub_dble() \ +__asm__ __volatile__ ("subpd %%xmm3, %%xmm0 \n\t" \ + "subpd %%xmm4, %%xmm1 \n\t" \ + "subpd %%xmm5, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies xmm3,xmm4,xmm5 with i +*/ + +#define _sse_vector_i_mul_dble() \ +__asm__ __volatile__ ("shufpd $0x1, %%xmm3, %%xmm3 \n\t" \ + "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \ + "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \ + "mulpd %0, %%xmm3 \n\t" \ + "mulpd %0, %%xmm4 \n\t" \ + "mulpd %0, %%xmm5" \ + : \ + : \ + "m" (_sse_sgn1_dble) \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies xmm3,xmm4,xmm5 with i and adds them to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_i_add_dble() \ +__asm__ __volatile__ ("shufpd $0x1, %%xmm3, %%xmm3 \n\t" \ + "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \ + "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \ + "addsubpd %%xmm3, %%xmm0 \n\t" \ + "addsubpd %%xmm4, %%xmm1 \n\t" \ + "addsubpd %%xmm5, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm3", "xmm4", "xmm5") + +/* +* Loads (z.re,z.re) to xmm6 and (-z.im,z.im) to xmm7 +*/ + +#define _sse_load_cmplx_dble(z) \ +__asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" \ + "movddup %1, %%xmm7 \n\t" \ + "mulpd %2, %%xmm7" \ + : \ + : \ + "m" ((z).re), \ + "m" ((z).im), \ + "m" (_sse_sgn1_dble) \ + : \ + "xmm6", "xmm7") + +/* +* Multiplies the complex numbers in xmm0,xmm1,xmm2 by z, assuming z has +* been loaded to xmm6,xmm7 by _sse_load_cmplx_dble(z). 
The result appears +* in xmm0,xmm1,xmm2 and xmm3,xmm4,xmm5,xmm6,xmm7 are unchanged +*/ + +#define _sse_mulc_vector_dble() \ +__asm__ __volatile__ ("movapd %%xmm0, %%xmm8 \n\t" \ + "movapd %%xmm1, %%xmm9 \n\t" \ + "movapd %%xmm2, %%xmm10 \n\t" \ + "mulpd %%xmm6, %%xmm0 \n\t" \ + "mulpd %%xmm6, %%xmm1 \n\t" \ + "mulpd %%xmm6, %%xmm2 \n\t" \ + "shufpd $0x1, %%xmm8, %%xmm8 \n\t" \ + "shufpd $0x1, %%xmm9, %%xmm9 \n\t" \ + "shufpd $0x1, %%xmm10, %%xmm10 \n\t" \ + "mulpd %%xmm7, %%xmm8 \n\t" \ + "mulpd %%xmm7, %%xmm9 \n\t" \ + "mulpd %%xmm7, %%xmm10 \n\t" \ + "addpd %%xmm8, %%xmm0 \n\t" \ + "addpd %%xmm9, %%xmm1 \n\t" \ + "addpd %%xmm10, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm8", "xmm9", "xmm10") + +/* +* Multiplies the complex numbers in xmm3,xmm4,xmm5 by z, assuming z has +* been loaded to xmm6,xmm7 by _sse_load_cmplx_dble(z). The result appears +* in xmm3,xmm4,xmm5 and xmm0,xmm1,xmm2,xmm6,xmm7 are unchanged +*/ + +#define _sse_mulc_vector_up_dble() \ +__asm__ __volatile__ ("movapd %%xmm3, %%xmm8 \n\t" \ + "movapd %%xmm4, %%xmm9 \n\t" \ + "movapd %%xmm5, %%xmm10 \n\t" \ + "mulpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm6, %%xmm4 \n\t" \ + "mulpd %%xmm6, %%xmm5 \n\t" \ + "shufpd $0x1, %%xmm8, %%xmm8 \n\t" \ + "shufpd $0x1, %%xmm9, %%xmm9 \n\t" \ + "shufpd $0x1, %%xmm10, %%xmm10 \n\t" \ + "mulpd %%xmm7, %%xmm8 \n\t" \ + "mulpd %%xmm7, %%xmm9 \n\t" \ + "mulpd %%xmm7, %%xmm10 \n\t" \ + "addpd %%xmm8, %%xmm3 \n\t" \ + "addpd %%xmm9, %%xmm4 \n\t" \ + "addpd %%xmm10, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm8", "xmm9", "xmm10") + +/* +* Computes s+z*r assuming s is stored in xmm0,xmm1,xmm2 and that z +* has been loaded to xmm6,xmm7 by _sse_load_cmplx_dble(z). 
The result +* appears in xmm0,xmm1,xmm2 and xmm3,xmm4,xmm5,xmm6,xmm7 are unchanged +*/ + +#define _sse_mulc_vector_add_dble(r) \ +__asm__ __volatile__ ("movapd %0, %%xmm8 \n\t" \ + "movapd %1, %%xmm9 \n\t" \ + "movapd %2, %%xmm10 \n\t" \ + "movapd %%xmm8, %%xmm11 \n\t" \ + "movapd %%xmm9, %%xmm12 \n\t" \ + "movapd %%xmm10, %%xmm13" \ + : \ + : \ + "m" ((r).c1), \ + "m" ((r).c2), \ + "m" ((r).c3) \ + : \ + "xmm8", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13"); \ +__asm__ __volatile__ ("mulpd %%xmm6, %%xmm8 \n\t" \ + "mulpd %%xmm6, %%xmm9 \n\t" \ + "mulpd %%xmm6, %%xmm10 \n\t" \ + "shufpd $0x1, %%xmm11, %%xmm11 \n\t" \ + "shufpd $0x1, %%xmm12, %%xmm12 \n\t" \ + "shufpd $0x1, %%xmm13, %%xmm13 \n\t" \ + "addpd %%xmm8, %%xmm0 \n\t" \ + "addpd %%xmm9, %%xmm1 \n\t" \ + "addpd %%xmm10, %%xmm2 \n\t" \ + "mulpd %%xmm7, %%xmm11 \n\t" \ + "mulpd %%xmm7, %%xmm12 \n\t" \ + "mulpd %%xmm7, %%xmm13 \n\t" \ + "addpd %%xmm11, %%xmm0 \n\t" \ + "addpd %%xmm12, %%xmm1 \n\t" \ + "addpd %%xmm13, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2", \ + "xmm8", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13") + +/* +* Computes s+z*r assuming s is stored in xmm3,xmm4,xmm5 and that z +* has been loaded to xmm6,xmm7 by _sse_load_cmplx_dble(z).
The result +* appears in xmm3,xmm4,xmm5 and xmm0,xmm1,xmm2,xmm6,xmm7 are unchanged +*/ + +#define _sse_mulc_vector_add_up_dble(r) \ +__asm__ __volatile__ ("movapd %0, %%xmm8 \n\t" \ + "movapd %1, %%xmm9 \n\t" \ + "movapd %2, %%xmm10 \n\t" \ + "movapd %%xmm8, %%xmm11 \n\t" \ + "movapd %%xmm9, %%xmm12 \n\t" \ + "movapd %%xmm10, %%xmm13" \ + : \ + : \ + "m" ((r).c1), \ + "m" ((r).c2), \ + "m" ((r).c3) \ + : \ + "xmm8", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13"); \ +__asm__ __volatile__ ("mulpd %%xmm6, %%xmm8 \n\t" \ + "mulpd %%xmm6, %%xmm9 \n\t" \ + "mulpd %%xmm6, %%xmm10 \n\t" \ + "shufpd $0x1, %%xmm11, %%xmm11 \n\t" \ + "shufpd $0x1, %%xmm12, %%xmm12 \n\t" \ + "shufpd $0x1, %%xmm13, %%xmm13 \n\t" \ + "addpd %%xmm8, %%xmm3 \n\t" \ + "addpd %%xmm9, %%xmm4 \n\t" \ + "addpd %%xmm10, %%xmm5 \n\t" \ + "mulpd %%xmm7, %%xmm11 \n\t" \ + "mulpd %%xmm7, %%xmm12 \n\t" \ + "mulpd %%xmm7, %%xmm13 \n\t" \ + "addpd %%xmm11, %%xmm3 \n\t" \ + "addpd %%xmm12, %%xmm4 \n\t" \ + "addpd %%xmm13, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5", \ + "xmm8", "xmm9", "xmm10", \ + "xmm11", "xmm12", "xmm13") + +/* +* Loads (c,c) to xmm6 and xmm7 +*/ + +#define _sse_load_real_dble(c) \ +__asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" \ + "movddup %0, %%xmm7" \ + : \ + : \ + "m" (c) \ + : \ + "xmm6", "xmm7") + +/* +* Multiplies the complex numbers in xmm0,xmm1,xmm2 by c, assuming c has +* been loaded to xmm6,xmm7 by _sse_load_real_dble(c). The result appears +* in xmm0,xmm1,xmm2 all other xmm registers are unchanged +*/ + +#define _sse_mulr_vector_dble() \ +__asm__ __volatile__ ("mulpd %%xmm6, %%xmm0 \n\t" \ + "mulpd %%xmm7, %%xmm1 \n\t" \ + "mulpd %%xmm6, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Multiplies the complex numbers in xmm3,xmm4,xmm5 by c, assuming c has +* been loaded to xmm6,xmm7 by _sse_load_real_dble(c).
The result appears +* in xmm3,xmm4,xmm5 all other xmm registers are unchanged +*/ + +#define _sse_mulr_vector_up_dble() \ +__asm__ __volatile__ ("mulpd %%xmm7, %%xmm3 \n\t" \ + "mulpd %%xmm6, %%xmm4 \n\t" \ + "mulpd %%xmm7, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Computes s+c*r assuming r is stored in xmm0,xmm1,xmm2 and that c +* has been loaded to xmm6,xmm7 by _sse_load_real_dble(c). The result +* appears in xmm0,xmm1,xmm2 all other xmm registers are unchanged +*/ + +#define _sse_mulr_vector_add_dble(s) \ +__asm__ __volatile__ ("mulpd %%xmm6, %%xmm0 \n\t" \ + "mulpd %%xmm7, %%xmm1 \n\t" \ + "mulpd %%xmm6, %%xmm2 \n\t" \ + "addpd %0, %%xmm0 \n\t" \ + "addpd %1, %%xmm1 \n\t" \ + "addpd %2, %%xmm2" \ + : \ + : \ + "m" ((s).c1), \ + "m" ((s).c2), \ + "m" ((s).c3) \ + : \ + "xmm0", "xmm1", "xmm2") + +/* +* Computes s+c*r assuming r is stored in xmm3,xmm4,xmm5 and that c +* has been loaded to xmm6,xmm7 by _sse_load_real_dble(c). The result +* appears in xmm3,xmm4,xmm5 and all other xmm registers are unchanged +*/ + +#define _sse_mulr_vector_add_up_dble(s) \ +__asm__ __volatile__ ("mulpd %%xmm7, %%xmm3 \n\t" \ + "mulpd %%xmm6, %%xmm4 \n\t" \ + "mulpd %%xmm7, %%xmm5 \n\t" \ + "addpd %0, %%xmm3 \n\t" \ + "addpd %1, %%xmm4 \n\t" \ + "addpd %2, %%xmm5" \ + : \ + : \ + "m" ((s).c1), \ + "m" ((s).c2), \ + "m" ((s).c3) \ + : \ + "xmm3", "xmm4", "xmm5") + +/****************************************************************************** +* +* Action of su3 matrices on su3 vectors +* +******************************************************************************/ + +/* +* Multiplies an su3 vector s with an su3 matrix u, assuming s is +* stored in xmm0,xmm1,xmm2 +* +* On output the result is in xmm3,xmm4,xmm5 and the registers +* xmm0,xmm1,xmm2 are changed +*/ + +#define _sse_su3_multiply_dble(u) \ +__asm__ __volatile__ ("movddup %0, %%xmm3 \n\t" \ + "movddup %1, %%xmm6 \n\t" \ + "movddup %2, %%xmm4 \n\t" \ + "movddup %3, %%xmm7 \n\t" \ + "movddup %4, %%xmm5
\n\t" \ + "movddup %5, %%xmm8 \n\t" \ + "mulpd %%xmm0, %%xmm3 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm0, %%xmm4 \n\t" \ + "mulpd %%xmm1, %%xmm7 \n\t" \ + "mulpd %%xmm0, %%xmm5 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm1, %%xmm8 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "addpd %%xmm8, %%xmm5" \ + : \ + : \ + "m" ((u).c11.re), \ + "m" ((u).c12.re), \ + "m" ((u).c21.re), \ + "m" ((u).c22.re), \ + "m" ((u).c31.re), \ + "m" ((u).c32.re) \ + : \ + "xmm0", "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("movddup %0, %%xmm9 \n\t" \ + "movddup %1, %%xmm10 \n\t" \ + "movddup %2, %%xmm11 \n\t" \ + "movddup %3, %%xmm12 \n\t" \ + "movddup %4, %%xmm13 \n\t" \ + "movddup %5, %%xmm14 \n\t" \ + "mulpd %%xmm2, %%xmm9 \n\t" \ + "mulpd %%xmm0, %%xmm10 \n\t" \ + "mulpd %%xmm2, %%xmm11 \n\t" \ + "mulpd %%xmm0, %%xmm12 \n\t" \ + "addpd %%xmm9, %%xmm3 \n\t" \ + "mulpd %%xmm2, %%xmm13 \n\t" \ + "addsubpd %%xmm10, %%xmm4 \n\t" \ + "mulpd %%xmm0, %%xmm14 \n\t" \ + "addpd %%xmm11, %%xmm5" \ + : \ + : \ + "m" ((u).c13.re), \ + "m" ((u).c21.im), \ + "m" ((u).c33.re), \ + "m" ((u).c11.im), \ + "m" ((u).c23.re), \ + "m" ((u).c31.im) \ + : \ + "xmm3", "xmm4", "xmm5", "xmm9", \ + "xmm10", "xmm11", "xmm12", "xmm13", \ + "xmm14"); \ +__asm__ __volatile__ ("shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \ + "addsubpd %%xmm12, %%xmm3 \n\t" \ + "addpd %%xmm13, %%xmm4 \n\t" \ + "addsubpd %%xmm14, %%xmm5" \ + : \ + : \ + : \ + "xmm1", "xmm2", "xmm3", "xmm4", \ + "xmm5"); \ +__asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" \ + "movddup %1, %%xmm7 \n\t" \ + "movddup %2, %%xmm8 \n\t" \ + "movddup %3, %%xmm9 \n\t" \ + "movddup %4, %%xmm10 \n\t" \ + "movddup %5, %%xmm11 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm8 \n\t" \ + "mulpd %%xmm2, %%xmm9 \n\t" \ + "addsubpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm1, %%xmm10 \n\t" \ + "addsubpd %%xmm7, 
%%xmm4 \n\t" \ + "mulpd %%xmm2, %%xmm11 \n\t" \ + "addsubpd %%xmm8, %%xmm5" \ + : \ + : \ + "m" ((u).c12.im), \ + "m" ((u).c23.im), \ + "m" ((u).c32.im), \ + "m" ((u).c13.im), \ + "m" ((u).c22.im), \ + "m" ((u).c33.im) \ + : \ + "xmm3", "xmm4", "xmm5", "xmm6", \ + "xmm7", "xmm8", "xmm9", "xmm10", \ + "xmm11"); \ +__asm__ __volatile__ ("addsubpd %%xmm9, %%xmm3 \n\t" \ + "addsubpd %%xmm10, %%xmm4 \n\t" \ + "addsubpd %%xmm11, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +/* +* Multiplies an su3 vector s with an su3 matrix u^dagger, assuming s is +* stored in xmm0,xmm1,xmm2 +* +* On output the result is in xmm3,xmm4,xmm5 and the registers +* xmm0,xmm1,xmm2 are changed +*/ + +#define _sse_su3_inverse_multiply_dble(u) \ +__asm__ __volatile__ ("movddup %0, %%xmm3 \n\t" \ + "movddup %1, %%xmm6 \n\t" \ + "movddup %2, %%xmm4 \n\t" \ + "movddup %3, %%xmm7 \n\t" \ + "movddup %4, %%xmm5 \n\t" \ + "movddup %5, %%xmm8 \n\t" \ + "mulpd %%xmm0, %%xmm3 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm0, %%xmm4 \n\t" \ + "mulpd %%xmm1, %%xmm7 \n\t" \ + "mulpd %%xmm0, %%xmm5 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %6, %%xmm0 \n\t" \ + "mulpd %%xmm1, %%xmm8 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "addpd %%xmm8, %%xmm5 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0" \ + : \ + : \ + "m" ((u).c11.re), \ + "m" ((u).c21.re), \ + "m" ((u).c12.re), \ + "m" ((u).c22.re), \ + "m" ((u).c13.re), \ + "m" ((u).c23.re), \ + "m" (_sse_sgn1_dble) \ + : \ + "xmm0", "xmm3", "xmm4", "xmm5", \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("movddup %0, %%xmm9 \n\t" \ + "movddup %1, %%xmm10 \n\t" \ + "movddup %2, %%xmm11 \n\t" \ + "movddup %3, %%xmm12 \n\t" \ + "movddup %4, %%xmm13 \n\t" \ + "movddup %5, %%xmm14 \n\t" \ + "mulpd %%xmm2, %%xmm9 \n\t" \ + "mulpd %6, %%xmm1 \n\t" \ + "mulpd %%xmm0, %%xmm10 \n\t" \ + "mulpd %%xmm2, %%xmm11 \n\t" \ + "mulpd %%xmm0, %%xmm12 \n\t" \ + "addpd %%xmm9, %%xmm3 \n\t" \ + "mulpd %%xmm2, %%xmm13 \n\t" \ + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + 
"addpd %%xmm10, %%xmm4 \n\t" \ + "mulpd %6, %%xmm2 \n\t" \ + "addpd %%xmm11, %%xmm5 \n\t" \ + "mulpd %%xmm0, %%xmm14 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2" \ + : \ + : \ + "m" ((u).c31.re), \ + "m" ((u).c12.im), \ + "m" ((u).c33.re), \ + "m" ((u).c11.im), \ + "m" ((u).c32.re), \ + "m" ((u).c13.im), \ + "m" (_sse_sgn1_dble) \ + : \ + "xmm1", "xmm2", "xmm3", "xmm4", \ + "xmm5", "xmm9", "xmm10", "xmm11", \ + "xmm12", "xmm13", "xmm14"); \ +__asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" \ + "movddup %1, %%xmm7 \n\t" \ + "movddup %2, %%xmm8 \n\t" \ + "movddup %3, %%xmm9 \n\t" \ + "movddup %4, %%xmm10 \n\t" \ + "movddup %5, %%xmm11 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "addpd %%xmm12, %%xmm3 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "addpd %%xmm13, %%xmm4 \n\t" \ + "mulpd %%xmm1, %%xmm8 \n\t" \ + "addpd %%xmm14, %%xmm5 \n\t" \ + "mulpd %%xmm2, %%xmm9 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm1, %%xmm10 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "mulpd %%xmm2, %%xmm11 \n\t" \ + "addpd %%xmm8, %%xmm5" \ + : \ + : \ + "m" ((u).c21.im), \ + "m" ((u).c32.im), \ + "m" ((u).c23.im), \ + "m" ((u).c31.im), \ + "m" ((u).c22.im), \ + "m" ((u).c33.im) \ + : \ + "xmm3", "xmm4", "xmm5", "xmm6", \ + "xmm7", "xmm8", "xmm9", "xmm10", \ + "xmm11"); \ +__asm__ __volatile__ ("addpd %%xmm9, %%xmm3 \n\t" \ + "addpd %%xmm10, %%xmm4 \n\t" \ + "addpd %%xmm11, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/su3.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/su3.h new file mode 100644 index 0000000000000000000000000000000000000000..a7ce06e37455cccf55f1197798ce372639182a3c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/su3.h @@ -0,0 +1,658 @@ + +/******************************************************************************* +* +* File su3.h +* +* Copyright (C) 2005, 2009, 2011, 2013 Martin Luescher, Filippo Palombi +* 
+* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Type definitions and macros for SU(3) matrices, SU(3) vectors and Dirac +* spinors +* +*******************************************************************************/ + +#ifndef SU3_H +#define SU3_H + +#if ((defined AVX)&&(!(defined x64))) +#define x64 +#endif + +#if (defined x64) +#define ALIGNED8 __attribute__ ((aligned (8))) +#define ALIGNED16 __attribute__ ((aligned (16))) +#define ALIGNED32 __attribute__ ((aligned (32))) +#else +#define ALIGNED8 +#define ALIGNED16 +#define ALIGNED32 +#endif + +typedef struct +{ + float re,im; +} complex; + +typedef struct +{ + complex c1,c2,c3; +} su3_vector; + +typedef struct +{ + complex c11,c12,c13,c21,c22,c23,c31,c32,c33; +} su3; + +typedef struct +{ + float c1,c2,c3,c4,c5,c6,c7,c8; +} su3_alg; + +typedef struct +{ + su3_vector c1,c2; +} weyl; + +typedef struct +{ + su3_vector c1,c2,c3,c4; +} spinor; + +typedef struct +{ + float u[36]; +} pauli; + +typedef struct +{ + float c1,c2,c3,c4,c5,c6,c7,c8,c9; +} u3_alg; + +typedef struct +{ + double re,im; +} complex_dble; + +typedef struct +{ + complex_dble c1,c2,c3; +} su3_vector_dble; + +typedef struct +{ + complex_dble c11,c12,c13,c21,c22,c23,c31,c32,c33; +} su3_dble; + +typedef struct +{ + double c1,c2,c3,c4,c5,c6,c7,c8; +} su3_alg_dble; + +typedef struct +{ + su3_vector_dble c1,c2; +} weyl_dble; + +typedef struct +{ + su3_vector_dble c1,c2,c3,c4; +} spinor_dble; + +typedef struct +{ + double u[36]; +} pauli_dble; + +typedef struct +{ + double c1,c2,c3,c4,c5,c6,c7,c8,c9; +} u3_alg_dble; + +/******************************************************************************* +* +* The following macros are the same for single and double precision types +* +* Depending on the macro, arguments are variables of type su3_vector and su3 +* (or su3_vector_dble and su3_dble) +* +*******************************************************************************/ + +/* +* r.c1=c*s.c1 (c real) +* 
r.c2=c*s.c2 +* r.c3=c*s.c3 +*/ + +#define _vector_mul(r,c,s) \ + (r).c1.re=(c)*(s).c1.re; \ + (r).c1.im=(c)*(s).c1.im; \ + (r).c2.re=(c)*(s).c2.re; \ + (r).c2.im=(c)*(s).c2.im; \ + (r).c3.re=(c)*(s).c3.re; \ + (r).c3.im=(c)*(s).c3.im + +/* +* r.c1=i*c*s.c1 (c real) +* r.c2=i*c*s.c2 +* r.c3=i*c*s.c3 +*/ + +#define _vector_imul(r,c,s) \ + (r).c1.re=-(c)*(s).c1.im; \ + (r).c1.im= (c)*(s).c1.re; \ + (r).c2.re=-(c)*(s).c2.im; \ + (r).c2.im= (c)*(s).c2.re; \ + (r).c3.re=-(c)*(s).c3.im; \ + (r).c3.im= (c)*(s).c3.re + +/* +* r.c1=c*s.c1 (c complex) +* r.c2=c*s.c2 +* r.c3=c*s.c3 +*/ + +#define _vector_mulc(r,c,s) \ + (r).c1.re=(c).re*(s).c1.re-(c).im*(s).c1.im; \ + (r).c1.im=(c).re*(s).c1.im+(c).im*(s).c1.re; \ + (r).c2.re=(c).re*(s).c2.re-(c).im*(s).c2.im; \ + (r).c2.im=(c).re*(s).c2.im+(c).im*(s).c2.re; \ + (r).c3.re=(c).re*(s).c3.re-(c).im*(s).c3.im; \ + (r).c3.im=(c).re*(s).c3.im+(c).im*(s).c3.re + +/* +* r.c1=s1.c1+s2.c1 +* r.c2=s1.c2+s2.c2 +* r.c3=s1.c3+s2.c3 +*/ + +#define _vector_add(r,s1,s2) \ + (r).c1.re=(s1).c1.re+(s2).c1.re; \ + (r).c1.im=(s1).c1.im+(s2).c1.im; \ + (r).c2.re=(s1).c2.re+(s2).c2.re; \ + (r).c2.im=(s1).c2.im+(s2).c2.im; \ + (r).c3.re=(s1).c3.re+(s2).c3.re; \ + (r).c3.im=(s1).c3.im+(s2).c3.im + +/* +* r.c1=s1.c1-s2.c1 +* r.c2=s1.c2-s2.c2 +* r.c3=s1.c3-s2.c3 +*/ + +#define _vector_sub(r,s1,s2) \ + (r).c1.re=(s1).c1.re-(s2).c1.re; \ + (r).c1.im=(s1).c1.im-(s2).c1.im; \ + (r).c2.re=(s1).c2.re-(s2).c2.re; \ + (r).c2.im=(s1).c2.im-(s2).c2.im; \ + (r).c3.re=(s1).c3.re-(s2).c3.re; \ + (r).c3.im=(s1).c3.im-(s2).c3.im + +/* +* r.c1=s1.c1+i*s2.c1 +* r.c2=s1.c2+i*s2.c2 +* r.c3=s1.c3+i*s2.c3 +*/ + +#define _vector_i_add(r,s1,s2) \ + (r).c1.re=(s1).c1.re-(s2).c1.im; \ + (r).c1.im=(s1).c1.im+(s2).c1.re; \ + (r).c2.re=(s1).c2.re-(s2).c2.im; \ + (r).c2.im=(s1).c2.im+(s2).c2.re; \ + (r).c3.re=(s1).c3.re-(s2).c3.im; \ + (r).c3.im=(s1).c3.im+(s2).c3.re + +/* +* r.c1=s1.c1-i*s2.c1 +* r.c2=s1.c2-i*s2.c2 +* r.c3=s1.c3-i*s2.c3 +*/ + +#define _vector_i_sub(r,s1,s2) \ + 
(r).c1.re=(s1).c1.re+(s2).c1.im; \ + (r).c1.im=(s1).c1.im-(s2).c1.re; \ + (r).c2.re=(s1).c2.re+(s2).c2.im; \ + (r).c2.im=(s1).c2.im-(s2).c2.re; \ + (r).c3.re=(s1).c3.re+(s2).c3.im; \ + (r).c3.im=(s1).c3.im-(s2).c3.re + +/* +* r.c1+=s.c1 +* r.c2+=s.c2 +* r.c3+=s.c3 +*/ + +#define _vector_add_assign(r,s) \ + (r).c1.re+=(s).c1.re; \ + (r).c1.im+=(s).c1.im; \ + (r).c2.re+=(s).c2.re; \ + (r).c2.im+=(s).c2.im; \ + (r).c3.re+=(s).c3.re; \ + (r).c3.im+=(s).c3.im + +/* +* r.c1-=s.c1 +* r.c2-=s.c2 +* r.c3-=s.c3 +*/ + +#define _vector_sub_assign(r,s) \ + (r).c1.re-=(s).c1.re; \ + (r).c1.im-=(s).c1.im; \ + (r).c2.re-=(s).c2.re; \ + (r).c2.im-=(s).c2.im; \ + (r).c3.re-=(s).c3.re; \ + (r).c3.im-=(s).c3.im + +/* +* r.c1+=i*s.c1 +* r.c2+=i*s.c2 +* r.c3+=i*s.c3 +*/ + +#define _vector_i_add_assign(r,s) \ + (r).c1.re-=(s).c1.im; \ + (r).c1.im+=(s).c1.re; \ + (r).c2.re-=(s).c2.im; \ + (r).c2.im+=(s).c2.re; \ + (r).c3.re-=(s).c3.im; \ + (r).c3.im+=(s).c3.re + +/* +* r.c1-=i*s.c1 +* r.c2-=i*s.c2 +* r.c3-=i*s.c3 +*/ + +#define _vector_i_sub_assign(r,s) \ + (r).c1.re+=(s).c1.im; \ + (r).c1.im-=(s).c1.re; \ + (r).c2.re+=(s).c2.im; \ + (r).c2.im-=(s).c2.re; \ + (r).c3.re+=(s).c3.im; \ + (r).c3.im-=(s).c3.re + +/* +* Real part of the scalar product (r,s) +*/ + +#define _vector_prod_re(r,s) \ + (r).c1.re*(s).c1.re+(r).c1.im*(s).c1.im+ \ + (r).c2.re*(s).c2.re+(r).c2.im*(s).c2.im+ \ + (r).c3.re*(s).c3.re+(r).c3.im*(s).c3.im + +/* +* Imaginary part of the scalar product (r,s) +*/ + +#define _vector_prod_im(r,s) \ + (r).c1.re*(s).c1.im-(r).c1.im*(s).c1.re+ \ + (r).c2.re*(s).c2.im-(r).c2.im*(s).c2.re+ \ + (r).c3.re*(s).c3.im-(r).c3.im*(s).c3.re + +/* +* r.c1+=c*s.c1 (c real) +* r.c2+=c*s.c2 +* r.c3+=c*s.c3 +*/ + +#define _vector_mulr_assign(r,c,s) \ + (r).c1.re+=(c)*(s).c1.re; \ + (r).c1.im+=(c)*(s).c1.im; \ + (r).c2.re+=(c)*(s).c2.re; \ + (r).c2.im+=(c)*(s).c2.im; \ + (r).c3.re+=(c)*(s).c3.re; \ + (r).c3.im+=(c)*(s).c3.im + +/* +* r.c1+=i*c*s.c1 (c real) +* r.c2+=i*c*s.c2 +* r.c3+=i*c*s.c3 +*/ + 
+#define _vector_mulir_assign(r,c,s) \ + (r).c1.re-=(c)*(s).c1.im; \ + (r).c1.im+=(c)*(s).c1.re; \ + (r).c2.re-=(c)*(s).c2.im; \ + (r).c2.im+=(c)*(s).c2.re; \ + (r).c3.re-=(c)*(s).c3.im; \ + (r).c3.im+=(c)*(s).c3.re + +/* +* r.c1+=z*s.c1 (z of type complex) +* r.c2+=z*s.c2 +* r.c3+=z*s.c3 +*/ + +#define _vector_mulc_assign(r,z,s) \ + (r).c1.re+=((z).re*(s).c1.re-(z).im*(s).c1.im); \ + (r).c1.im+=((z).re*(s).c1.im+(z).im*(s).c1.re); \ + (r).c2.re+=((z).re*(s).c2.re-(z).im*(s).c2.im); \ + (r).c2.im+=((z).re*(s).c2.im+(z).im*(s).c2.re); \ + (r).c3.re+=((z).re*(s).c3.re-(z).im*(s).c3.im); \ + (r).c3.im+=((z).re*(s).c3.im+(z).im*(s).c3.re) + +/* +* r.c1-=z*s.c1 (z of type complex) +* r.c2-=z*s.c2 +* r.c3-=z*s.c3 +*/ + +#define _vector_project(r,z,s) \ + (r).c1.re-=((z).re*(s).c1.re-(z).im*(s).c1.im); \ + (r).c1.im-=((z).re*(s).c1.im+(z).im*(s).c1.re); \ + (r).c2.re-=((z).re*(s).c2.re-(z).im*(s).c2.im); \ + (r).c2.im-=((z).re*(s).c2.im+(z).im*(s).c2.re); \ + (r).c3.re-=((z).re*(s).c3.re-(z).im*(s).c3.im); \ + (r).c3.im-=((z).re*(s).c3.im+(z).im*(s).c3.re) + +/* +* r.c1=c*r.c1+s.c1 (c real) +* r.c2=c*r.c2+s.c2 +* r.c3=c*r.c3+s.c3 +*/ + +#define _vector_mulr_add(r,c,s) \ + (r).c1.re=(c)*(r).c1.re+(s).c1.re; \ + (r).c1.im=(c)*(r).c1.im+(s).c1.im; \ + (r).c2.re=(c)*(r).c2.re+(s).c2.re; \ + (r).c2.im=(c)*(r).c2.im+(s).c2.im; \ + (r).c3.re=(c)*(r).c3.re+(s).c3.re; \ + (r).c3.im=(c)*(r).c3.im+(s).c3.im + +/* +* r.c1=cr*r.c1+cs*s.c1 (cr,cs real) +* r.c2=cr*r.c2+cs*s.c2 +* r.c3=cr*r.c3+cs*s.c3 +*/ + +#define _vector_combine(r,s,cr,cs) \ + (r).c1.re=(cr)*(r).c1.re+(cs)*(s).c1.re; \ + (r).c1.im=(cr)*(r).c1.im+(cs)*(s).c1.im; \ + (r).c2.re=(cr)*(r).c2.re+(cs)*(s).c2.re; \ + (r).c2.im=(cr)*(r).c2.im+(cs)*(s).c2.im; \ + (r).c3.re=(cr)*(r).c3.re+(cs)*(s).c3.re; \ + (r).c3.im=(cr)*(r).c3.im+(cs)*(s).c3.im + +/* +* v.c1=(w.c2*z.c3-w.c3*z.c2)^* +* v.c2=(w.c3*z.c1-w.c1*z.c3)^* +* v.c3=(w.c1*z.c2-w.c2*z.c1)^* +*/ + +#define _vector_cross_prod(v,w,z) \ + (v).c1.re= 
(w).c2.re*(z).c3.re-(w).c2.im*(z).c3.im \ + -(w).c3.re*(z).c2.re+(w).c3.im*(z).c2.im; \ + (v).c1.im= (w).c3.re*(z).c2.im+(w).c3.im*(z).c2.re \ + -(w).c2.re*(z).c3.im-(w).c2.im*(z).c3.re; \ + (v).c2.re= (w).c3.re*(z).c1.re-(w).c3.im*(z).c1.im \ + -(w).c1.re*(z).c3.re+(w).c1.im*(z).c3.im; \ + (v).c2.im= (w).c1.re*(z).c3.im+(w).c1.im*(z).c3.re \ + -(w).c3.re*(z).c1.im-(w).c3.im*(z).c1.re; \ + (v).c3.re= (w).c1.re*(z).c2.re-(w).c1.im*(z).c2.im \ + -(w).c2.re*(z).c1.re+(w).c2.im*(z).c1.im; \ + (v).c3.im= (w).c2.re*(z).c1.im+(w).c2.im*(z).c1.re \ + -(w).c1.re*(z).c2.im-(w).c1.im*(z).c2.re + +/* +* SU(3) matrix u times SU(3) vector s +* +* r.c1=(u*s).c1 +* r.c2=(u*s).c2 +* r.c3=(u*s).c3 +*/ + +#define _su3_multiply(r,u,s) \ + (r).c1.re= (u).c11.re*(s).c1.re-(u).c11.im*(s).c1.im \ + +(u).c12.re*(s).c2.re-(u).c12.im*(s).c2.im \ + +(u).c13.re*(s).c3.re-(u).c13.im*(s).c3.im; \ + (r).c1.im= (u).c11.re*(s).c1.im+(u).c11.im*(s).c1.re \ + +(u).c12.re*(s).c2.im+(u).c12.im*(s).c2.re \ + +(u).c13.re*(s).c3.im+(u).c13.im*(s).c3.re; \ + (r).c2.re= (u).c21.re*(s).c1.re-(u).c21.im*(s).c1.im \ + +(u).c22.re*(s).c2.re-(u).c22.im*(s).c2.im \ + +(u).c23.re*(s).c3.re-(u).c23.im*(s).c3.im; \ + (r).c2.im= (u).c21.re*(s).c1.im+(u).c21.im*(s).c1.re \ + +(u).c22.re*(s).c2.im+(u).c22.im*(s).c2.re \ + +(u).c23.re*(s).c3.im+(u).c23.im*(s).c3.re; \ + (r).c3.re= (u).c31.re*(s).c1.re-(u).c31.im*(s).c1.im \ + +(u).c32.re*(s).c2.re-(u).c32.im*(s).c2.im \ + +(u).c33.re*(s).c3.re-(u).c33.im*(s).c3.im; \ + (r).c3.im= (u).c31.re*(s).c1.im+(u).c31.im*(s).c1.re \ + +(u).c32.re*(s).c2.im+(u).c32.im*(s).c2.re \ + +(u).c33.re*(s).c3.im+(u).c33.im*(s).c3.re + +/* +* SU(3) matrix u^dagger times SU(3) vector s +* +* r.c1=(u^dagger*s).c1 +* r.c2=(u^dagger*s).c2 +* r.c3=(u^dagger*s).c3 +*/ + +#define _su3_inverse_multiply(r,u,s) \ + (r).c1.re= (u).c11.re*(s).c1.re+(u).c11.im*(s).c1.im \ + +(u).c21.re*(s).c2.re+(u).c21.im*(s).c2.im \ + +(u).c31.re*(s).c3.re+(u).c31.im*(s).c3.im; \ + (r).c1.im= 
(u).c11.re*(s).c1.im-(u).c11.im*(s).c1.re \ + +(u).c21.re*(s).c2.im-(u).c21.im*(s).c2.re \ + +(u).c31.re*(s).c3.im-(u).c31.im*(s).c3.re; \ + (r).c2.re= (u).c12.re*(s).c1.re+(u).c12.im*(s).c1.im \ + +(u).c22.re*(s).c2.re+(u).c22.im*(s).c2.im \ + +(u).c32.re*(s).c3.re+(u).c32.im*(s).c3.im; \ + (r).c2.im= (u).c12.re*(s).c1.im-(u).c12.im*(s).c1.re \ + +(u).c22.re*(s).c2.im-(u).c22.im*(s).c2.re \ + +(u).c32.re*(s).c3.im-(u).c32.im*(s).c3.re; \ + (r).c3.re= (u).c13.re*(s).c1.re+(u).c13.im*(s).c1.im \ + +(u).c23.re*(s).c2.re+(u).c23.im*(s).c2.im \ + +(u).c33.re*(s).c3.re+(u).c33.im*(s).c3.im; \ + (r).c3.im= (u).c13.re*(s).c1.im-(u).c13.im*(s).c1.re \ + +(u).c23.re*(s).c2.im-(u).c23.im*(s).c2.re \ + +(u).c33.re*(s).c3.im-(u).c33.im*(s).c3.re + +/******************************************************************************* +* +* Macros for SU(3) matrices +* +* Arguments are variables of type su3 +* +*******************************************************************************/ + +/* +* u=v^dagger +*/ + +#define _su3_dagger(u,v) \ + (u).c11.re= (v).c11.re; \ + (u).c11.im=-(v).c11.im; \ + (u).c12.re= (v).c21.re; \ + (u).c12.im=-(v).c21.im; \ + (u).c13.re= (v).c31.re; \ + (u).c13.im=-(v).c31.im; \ + (u).c21.re= (v).c12.re; \ + (u).c21.im=-(v).c12.im; \ + (u).c22.re= (v).c22.re; \ + (u).c22.im=-(v).c22.im; \ + (u).c23.re= (v).c32.re; \ + (u).c23.im=-(v).c32.im; \ + (u).c31.re= (v).c13.re; \ + (u).c31.im=-(v).c13.im; \ + (u).c32.re= (v).c23.re; \ + (u).c32.im=-(v).c23.im; \ + (u).c33.re= (v).c33.re; \ + (u).c33.im=-(v).c33.im + +/* +* u=v*w +*/ + +#define _su3_times_su3(u,v,w) \ + (u).c11.re= (v).c11.re*(w).c11.re-(v).c11.im*(w).c11.im \ + +(v).c12.re*(w).c21.re-(v).c12.im*(w).c21.im \ + +(v).c13.re*(w).c31.re-(v).c13.im*(w).c31.im; \ + (u).c11.im= (v).c11.re*(w).c11.im+(v).c11.im*(w).c11.re \ + +(v).c12.re*(w).c21.im+(v).c12.im*(w).c21.re \ + +(v).c13.re*(w).c31.im+(v).c13.im*(w).c31.re; \ + (u).c12.re= (v).c11.re*(w).c12.re-(v).c11.im*(w).c12.im \ + 
+(v).c12.re*(w).c22.re-(v).c12.im*(w).c22.im \ + +(v).c13.re*(w).c32.re-(v).c13.im*(w).c32.im; \ + (u).c12.im= (v).c11.re*(w).c12.im+(v).c11.im*(w).c12.re \ + +(v).c12.re*(w).c22.im+(v).c12.im*(w).c22.re \ + +(v).c13.re*(w).c32.im+(v).c13.im*(w).c32.re; \ + (u).c13.re= (v).c11.re*(w).c13.re-(v).c11.im*(w).c13.im \ + +(v).c12.re*(w).c23.re-(v).c12.im*(w).c23.im \ + +(v).c13.re*(w).c33.re-(v).c13.im*(w).c33.im; \ + (u).c13.im= (v).c11.re*(w).c13.im+(v).c11.im*(w).c13.re \ + +(v).c12.re*(w).c23.im+(v).c12.im*(w).c23.re \ + +(v).c13.re*(w).c33.im+(v).c13.im*(w).c33.re; \ + (u).c21.re= (v).c21.re*(w).c11.re-(v).c21.im*(w).c11.im \ + +(v).c22.re*(w).c21.re-(v).c22.im*(w).c21.im \ + +(v).c23.re*(w).c31.re-(v).c23.im*(w).c31.im; \ + (u).c21.im= (v).c21.re*(w).c11.im+(v).c21.im*(w).c11.re \ + +(v).c22.re*(w).c21.im+(v).c22.im*(w).c21.re \ + +(v).c23.re*(w).c31.im+(v).c23.im*(w).c31.re; \ + (u).c22.re= (v).c21.re*(w).c12.re-(v).c21.im*(w).c12.im \ + +(v).c22.re*(w).c22.re-(v).c22.im*(w).c22.im \ + +(v).c23.re*(w).c32.re-(v).c23.im*(w).c32.im; \ + (u).c22.im= (v).c21.re*(w).c12.im+(v).c21.im*(w).c12.re \ + +(v).c22.re*(w).c22.im+(v).c22.im*(w).c22.re \ + +(v).c23.re*(w).c32.im+(v).c23.im*(w).c32.re; \ + (u).c23.re= (v).c21.re*(w).c13.re-(v).c21.im*(w).c13.im \ + +(v).c22.re*(w).c23.re-(v).c22.im*(w).c23.im \ + +(v).c23.re*(w).c33.re-(v).c23.im*(w).c33.im; \ + (u).c23.im= (v).c21.re*(w).c13.im+(v).c21.im*(w).c13.re \ + +(v).c22.re*(w).c23.im+(v).c22.im*(w).c23.re \ + +(v).c23.re*(w).c33.im+(v).c23.im*(w).c33.re; \ + (u).c31.re= (v).c31.re*(w).c11.re-(v).c31.im*(w).c11.im \ + +(v).c32.re*(w).c21.re-(v).c32.im*(w).c21.im \ + +(v).c33.re*(w).c31.re-(v).c33.im*(w).c31.im; \ + (u).c31.im= (v).c31.re*(w).c11.im+(v).c31.im*(w).c11.re \ + +(v).c32.re*(w).c21.im+(v).c32.im*(w).c21.re \ + +(v).c33.re*(w).c31.im+(v).c33.im*(w).c31.re; \ + (u).c32.re= (v).c31.re*(w).c12.re-(v).c31.im*(w).c12.im \ + +(v).c32.re*(w).c22.re-(v).c32.im*(w).c22.im \ + 
+(v).c33.re*(w).c32.re-(v).c33.im*(w).c32.im; \ + (u).c32.im= (v).c31.re*(w).c12.im+(v).c31.im*(w).c12.re \ + +(v).c32.re*(w).c22.im+(v).c32.im*(w).c22.re \ + +(v).c33.re*(w).c32.im+(v).c33.im*(w).c32.re; \ + (u).c33.re= (v).c31.re*(w).c13.re-(v).c31.im*(w).c13.im \ + +(v).c32.re*(w).c23.re-(v).c32.im*(w).c23.im \ + +(v).c33.re*(w).c33.re-(v).c33.im*(w).c33.im; \ + (u).c33.im= (v).c31.re*(w).c13.im+(v).c31.im*(w).c13.re \ + +(v).c32.re*(w).c23.im+(v).c32.im*(w).c23.re \ + +(v).c33.re*(w).c33.im+(v).c33.im*(w).c33.re + +/******************************************************************************* +* +* Macros for variables of type su3_alg +* +*******************************************************************************/ + +/* +* r+=s +*/ + +#define _su3_alg_add_assign(r,s) \ + (r).c1+=(s).c1; \ + (r).c2+=(s).c2; \ + (r).c3+=(s).c3; \ + (r).c4+=(s).c4; \ + (r).c5+=(s).c5; \ + (r).c6+=(s).c6; \ + (r).c7+=(s).c7; \ + (r).c8+=(s).c8 + +/* +* r-=s +*/ + +#define _su3_alg_sub_assign(r,s) \ + (r).c1-=(s).c1; \ + (r).c2-=(s).c2; \ + (r).c3-=(s).c3; \ + (r).c4-=(s).c4; \ + (r).c5-=(s).c5; \ + (r).c6-=(s).c6; \ + (r).c7-=(s).c7; \ + (r).c8-=(s).c8 + +/* +* s*=c, c real +*/ + +#define _su3_alg_mul_assign(s,c) \ + (s).c1*=(c); \ + (s).c2*=(c); \ + (s).c3*=(c); \ + (s).c4*=(c); \ + (s).c5*=(c); \ + (s).c6*=(c); \ + (s).c7*=(c); \ + (s).c8*=(c) + +/* +* r+=c*s, c real +*/ + +#define _su3_alg_mul_add_assign(r,c,s) \ + (r).c1+=(c)*(s).c1; \ + (r).c2+=(c)*(s).c2; \ + (r).c3+=(c)*(s).c3; \ + (r).c4+=(c)*(s).c4; \ + (r).c5+=(c)*(s).c5; \ + (r).c6+=(c)*(s).c6; \ + (r).c7+=(c)*(s).c7; \ + (r).c8+=(c)*(s).c8 + +/* +* r-=c*s, c real +*/ + +#define _su3_alg_mul_sub_assign(r,c,s) \ + (r).c1-=(c)*(s).c1; \ + (r).c2-=(c)*(s).c2; \ + (r).c3-=(c)*(s).c3; \ + (r).c4-=(c)*(s).c4; \ + (r).c5-=(c)*(s).c5; \ + (r).c6-=(c)*(s).c6; \ + (r).c7-=(c)*(s).c7; \ + (r).c8-=(c)*(s).c8 + +/******************************************************************************* +* +* Macros for variables of type 
u3_alg +* +*******************************************************************************/ + +/* +* r=c*(u+v) +*/ + +#define _u3_alg_mul_add(r,c,u,v) \ + (r).c1=(c)*((u).c1+(v).c1); \ + (r).c2=(c)*((u).c2+(v).c2); \ + (r).c3=(c)*((u).c3+(v).c3); \ + (r).c4=(c)*((u).c4+(v).c4); \ + (r).c5=(c)*((u).c5+(v).c5); \ + (r).c6=(c)*((u).c6+(v).c6); \ + (r).c7=(c)*((u).c7+(v).c7); \ + (r).c8=(c)*((u).c8+(v).c8); \ + (r).c9=(c)*((u).c9+(v).c9) + +/* +* r=c*(u-v) +*/ + +#define _u3_alg_mul_sub(r,c,u,v) \ + (r).c1=(c)*((u).c1-(v).c1); \ + (r).c2=(c)*((u).c2-(v).c2); \ + (r).c3=(c)*((u).c3-(v).c3); \ + (r).c4=(c)*((u).c4-(v).c4); \ + (r).c5=(c)*((u).c5-(v).c5); \ + (r).c6=(c)*((u).c6-(v).c6); \ + (r).c7=(c)*((u).c7-(v).c7); \ + (r).c8=(c)*((u).c8-(v).c8); \ + (r).c9=(c)*((u).c9-(v).c9) + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/su3fcts.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/su3fcts.h new file mode 100644 index 0000000000000000000000000000000000000000..6f40ca8a7669087a0b0b78e601e8186b3c23dee8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/su3fcts.h @@ -0,0 +1,87 @@ + +/******************************************************************************* +* +* File su3fcts.h +* +* Copyright (C) 2010, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef SU3FCTS_H +#define SU3FCTS_H + +#ifndef SU3_H +#include "su3.h" +#endif + +typedef struct +{ + double t,d; + complex_dble p[3]; +} ch_drv0_t; + +typedef struct +{ + double t,d; + complex_dble p[3]; + complex_dble pt[3],pd[3]; +} ch_drv1_t; + +typedef struct +{ + double t,d; + complex_dble p[3]; + complex_dble pt[3],pd[3]; + complex_dble ptt[3],ptd[3],pdd[3]; +} ch_drv2_t; + +/* CHEXP_C */ +extern void ch2mat(complex_dble *p,su3_alg_dble *X,su3_dble *u); 
+extern void chexp_drv0(su3_alg_dble *X,ch_drv0_t *s); +extern void chexp_drv1(su3_alg_dble *X,ch_drv1_t *s); +extern void chexp_drv2(su3_alg_dble *X,ch_drv2_t *s); +extern void expXsu3(double eps,su3_alg_dble *X,su3_dble *u); + +/* CM3X3_C */ +extern void cm3x3_zero(int vol,su3_dble *u); +extern void cm3x3_unity(int vol,su3_dble *u); +extern void cm3x3_assign(int vol,su3_dble *u,su3_dble *v); +extern void cm3x3_swap(int vol,su3_dble *u,su3_dble *v); +extern void cm3x3_dagger(su3_dble *u,su3_dble *v); +extern void cm3x3_tr(su3_dble *u,su3_dble *v,complex_dble *tr); +extern void cm3x3_retr(su3_dble *u,su3_dble *v,double *tr); +extern void cm3x3_imtr(su3_dble *u,su3_dble *v,double *tr); +extern void cm3x3_add(su3_dble *u,su3_dble *v); +extern void cm3x3_mul_add(su3_dble *u,su3_dble *v,su3_dble *w); +extern void cm3x3_mulr(double *r,su3_dble *u,su3_dble *v); +extern void cm3x3_mulr_add(double *r,su3_dble *u,su3_dble *v); +extern void cm3x3_mulc(complex_dble *c,su3_dble *u,su3_dble *v); +extern void cm3x3_mulc_add(complex_dble *c,su3_dble *u,su3_dble *v); +extern void cm3x3_lc1(complex_dble *c,su3_dble *u,su3_dble *v); +extern void cm3x3_lc2(complex_dble *c,su3_dble *u,su3_dble *v); + +/* RANDOM_SU3_C */ +extern void random_su3(su3 *u); +extern void random_su3_dble(su3_dble *u); + +/* SU3REN_C */ +extern void project_to_su3(su3 *u); +extern void project_to_su3_dble(su3_dble *u); + +/* SU3PROD_C */ +extern void su3xsu3(su3_dble *u,su3_dble *v,su3_dble *w); +extern void su3dagxsu3(su3_dble *u,su3_dble *v,su3_dble *w); +extern void su3xsu3dag(su3_dble *u,su3_dble *v,su3_dble *w); +extern void su3dagxsu3dag(su3_dble *u,su3_dble *v,su3_dble *w); +extern void su3xu3alg(su3_dble *u,u3_alg_dble *X,su3_dble *v); +extern void su3dagxu3alg(su3_dble *u,u3_alg_dble *X,su3_dble *v); +extern void u3algxsu3(u3_alg_dble *X,su3_dble *u,su3_dble *v); +extern void u3algxsu3dag(u3_alg_dble *X,su3_dble *u,su3_dble *v); +extern double prod2su3alg(su3_dble *u,su3_dble *v,su3_alg_dble *X); 
+extern void prod2u3alg(su3_dble *u,su3_dble *v,u3_alg_dble *X); +extern void rotate_su3alg(su3_dble *u,su3_alg_dble *X); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sw_term.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sw_term.h new file mode 100644 index 0000000000000000000000000000000000000000..f1ef62241ea64fc87e0dfbb0de4c8deab887525e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/sw_term.h @@ -0,0 +1,47 @@ + +/******************************************************************************* +* +* File sw_term.h +* +* Copyright (C) 2005, 2009, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef SW_TERM_H +#define SW_TERM_H + +#ifndef SU3_H +#include "su3.h" +#endif + +#ifndef UTILS_H +#include "utils.h" +#endif + +/* PAULI_C */ +extern void mul_pauli(float mu,pauli *m,weyl *s,weyl *r); +extern void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r); +extern void assign_pauli(int vol,pauli_dble *md,pauli *m); +extern void apply_sw(int vol,float mu,pauli *m,spinor *s,spinor *r); + +/* PAULI_DBLE_C */ +extern void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r); +extern int inv_pauli_dble(double mu,pauli_dble *m,pauli_dble *im); +extern complex_dble det_pauli_dble(double mu,pauli_dble *m); +extern void apply_sw_dble(int vol,double mu,pauli_dble *m,spinor_dble *s, + spinor_dble *r); +extern int apply_swinv_dble(int vol,double mu,pauli_dble *m,spinor_dble *s, + spinor_dble *r); + +/* SWFLDS_C */ +extern pauli *swfld(void); +extern pauli_dble *swdfld(void); +extern void assign_swd2sw(void); + +/* SW_TERM_C */ +extern int sw_term(ptset_t set); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/tcharge.h 
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/tcharge.h new file mode 100644 index 0000000000000000000000000000000000000000..dc28cb7c10df1d102afed439fc0b5212781110d0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/tcharge.h @@ -0,0 +1,35 @@ + +/******************************************************************************* +* +* File tcharge.h +* +* Copyright (C) 2010, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef TCHARGE_H +#define TCHARGE_H + +#ifndef SU3_H +#include "su3.h" +#endif + +/* FTCOM_C */ +extern void copy_bnd_ft(int n,u3_alg_dble *ft); +extern void add_bnd_ft(int n,u3_alg_dble *ft); + +/* FTENSOR_C */ +extern u3_alg_dble **ftensor(void); + +/* TCHARGE_C */ +extern double tcharge(void); +extern double tcharge_slices(double *qsl); + +/* YM_ACTION_C */ +extern double ym_action(void); +extern double ym_action_slices(double *asl); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/uflds.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/uflds.h new file mode 100644 index 0000000000000000000000000000000000000000..1ddcb1d850a37d7b711bcf89613beffd6b8ecefa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/uflds.h @@ -0,0 +1,42 @@ + +/******************************************************************************* +* +* File uflds.h +* +* Copyright (C) 2011, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef UFLDS_H +#define UFLDS_H + +#ifndef SU3_H +#include "su3.h" +#endif + +/* BSTAP_C */ +extern su3_dble *bstap(void); +extern void set_bstap(void); + +/* PLAQ_SUM_C */ 
+extern double plaq_sum_dble(int icom); +extern double plaq_wsum_dble(int icom); +extern double plaq_action_slices(double *asl); + +/* SHIFT_C */ +extern int shift_ud(int *s); + +/* UFLDS_C */ +extern su3 *ufld(void); +extern su3_dble *udfld(void); +extern void random_ud(void); +extern void renormalize_ud(void); +extern void assign_ud2u(void); + +/* UDCOM_C */ +extern void copy_bnd_ud(void); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/update.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/update.h new file mode 100644 index 0000000000000000000000000000000000000000..bc9e519b308a3acdd2721533cabfac56a64bc921 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/update.h @@ -0,0 +1,66 @@ + +/******************************************************************************* +* +* File update.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef UPDATE_H +#define UPDATE_H + +#ifndef SU3_H +#include "su3.h" +#endif + +typedef struct +{ + int iop; + double eps; +} mdstep_t; + +/* CHRONO */ +extern void setup_chrono(void); +extern double mdtime(void); +extern void step_mdtime(double dt); +extern void add_chrono(int icr,spinor_dble *psi); +extern int get_chrono(int icr,spinor_dble *psi); +extern void reset_chrono(void); + +/* COUNTERS */ +extern void setup_counters(void); +extern void clear_counters(void); +extern void add2counter(char *type,int idx,int *status); +extern int get_count(char *type,int idx,int *status); +extern void print_avgstat(char *type,int idx); +extern void print_all_avgstat(void); + +/* MDSTEPS_C */ +extern void set_mdsteps(void); +extern mdstep_t *mdsteps(int *nop,int *itu); +extern void print_mdsteps(int ipr); + +/* MDINT_C */ +extern void run_mdint(void); + +/* HMC_C */ 
+extern void hmc_sanity_check(void); +extern void hmc_wsize(int *nwud,int *nws,int *nwsd,int *nwv,int *nwvd); +extern int run_hmc(double *act0,double *act1); + +/* RWRAT_C */ +extern double rwrat(int irp,int n,int *np,int *isp,double *sqn,int **status); + +/* RWTM_C */ +extern double rwtm1(double mu1,double mu2,int isp,double *sqn,int *status); +extern double rwtm2(double mu1,double mu2,int isp,double *sqn,int *status); + +/* RWTMEO_C */ +extern double rwtm1eo(double mu1,double mu2,int isp,double *sqn,int *status); +extern double rwtm2eo(double mu1,double mu2,int isp,double *sqn,int *status); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/utils.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..340f88854e3b583497f5366ead5c0982ce955baa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/utils.h @@ -0,0 +1,119 @@ + +/******************************************************************************* +* +* File utils.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef UTILS_H +#define UTILS_H + +#include +#include + +#ifndef SU3_H +#include "su3.h" +#endif + +#if ((DBL_MANT_DIG!=53)||(DBL_MIN_EXP!=-1021)||(DBL_MAX_EXP!=1024)) +#error : Machine is not compliant with the IEEE-754 standard +#endif + +#if (SHRT_MAX==0x7fffffff) +typedef short int stdint_t; +typedef unsigned short int stduint_t; +#elif (INT_MAX==0x7fffffff) +typedef int stdint_t; +typedef unsigned int stduint_t; +#elif (LONG_MAX==0x7fffffff) +typedef long int stdint_t; +typedef unsigned long int stduint_t; +#else +#error : There is no four-byte integer type on this machine +#endif + +#undef UNKNOWN_ENDIAN +#undef LITTLE_ENDIAN +#undef BIG_ENDIAN + 
+#define UNKNOWN_ENDIAN 0 +#define LITTLE_ENDIAN 1 +#define BIG_ENDIAN 2 + +#undef IMAX +#define IMAX(n,m) ((n)+((m)-(n))*((m)>(n))) + +typedef enum +{ + ALL_PTS,EVEN_PTS,ODD_PTS,NO_PTS,ODD_PTS2,PT_SETS +} ptset_t; + +/* ENDIAN_C */ +extern int endianness(void); +extern void bswap_int(int n,void *a); +extern void bswap_double(int n,void *a); + +/* MUTILS_C */ +extern int find_opt(int argc,char *argv[],char *opt); +extern int fdigits(double x); +extern void check_dir(char* dir); +extern void check_dir_root(char* dir); +extern int name_size(char *format,...); +extern long find_section(char *title); +extern long read_line(char *tag,char *format,...); +extern int count_tokens(char *tag); +extern void read_iprms(char *tag,int n,int *iprms); +extern void read_dprms(char *tag,int n,double *dprms); +extern int copy_file(char *in,char *out); + +/* UTILS_C */ +extern int safe_mod(int x,int y); +extern void *amalloc(size_t size,int p); +extern void afree(void *addr); +extern double amem_use_mb(void); +extern double amem_max_mb(void); +extern int mpi_permanent_tag(void); +extern int mpi_tag(void); +extern void error(int test,int no,char *name,char *format,...); +extern void error_root(int test,int no,char *name,char *format,...); +extern int error_loc(int test,int no,char *name,char *message); +extern void error_chk(void); +extern void message(char *format,...); +extern void mpc_bcast_c(char *buf, int num); +extern void mpc_bcast_d(double *buf, int num); +extern void mpc_bcast_i(int *buf, int num); +extern void mpc_gsum_d(double *src, double *dst, int num); +extern void mpc_print_info(void); + +/* WSPACE_C */ +extern void alloc_wud(int n); +extern su3_dble **reserve_wud(int n); +extern int release_wud(void); +extern int wud_size(void); +extern void alloc_wfd(int n); +extern su3_alg_dble **reserve_wfd(int n); +extern int release_wfd(void); +extern int wfd_size(void); +extern void alloc_ws(int n); +extern spinor **reserve_ws(int n); +extern int release_ws(void); +extern int 
ws_size(void); +extern void alloc_wsd(int n); +extern spinor_dble **reserve_wsd(int n); +extern int release_wsd(void); +extern int wsd_size(void); +extern void alloc_wv(int n); +extern complex **reserve_wv(int n); +extern int release_wv(void); +extern int wv_size(void); +extern void alloc_wvd(int n); +extern complex_dble **reserve_wvd(int n); +extern int release_wvd(void); +extern int wvd_size(void); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/version.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/version.h new file mode 100644 index 0000000000000000000000000000000000000000..221321d2080a6f80a0cf5da832ae38e1925041d7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/version.h @@ -0,0 +1,18 @@ + +/******************************************************************************* +* +* File version.h +* +* Copyright (C) 2009 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef VERSION_H +#define VERSION_H + +#define openQCD_RELEASE "openQCD-1.4" + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/vflds.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/vflds.h new file mode 100644 index 0000000000000000000000000000000000000000..e152b72556098c54434193516e36189f834e292e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/vflds.h @@ -0,0 +1,44 @@ + +/******************************************************************************* +* +* File vflds.h +* +* Copyright (C) 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef VFLDS_H +#define VFLDS_H + +#ifndef SU3_H 
+#include "su3.h" +#endif + +/* VCOM_C */ +extern void cpv_int_bnd(complex *v); +extern void cpv_ext_bnd(complex *v); + +/* VDCOM_C */ +extern void cpvd_int_bnd(complex_dble *vd); +extern void cpvd_ext_bnd(complex_dble *vd); + +/* VFLDS_C */ +extern complex **vflds(void); +extern complex_dble **vdflds(void); + +/* VINIT_C */ +extern void set_v2zero(int n,complex *v); +extern void set_vd2zero(int n,complex_dble *vd); +extern void random_v(int n,complex *v,float sigma); +extern void random_vd(int n,complex_dble *vd,double sigma); +extern void assign_v2v(int n,complex *v,complex *w); +extern void assign_v2vd(int n,complex *v,complex_dble *wd); +extern void assign_vd2v(int n,complex_dble *vd,complex *w); +extern void assign_vd2vd(int n,complex_dble *vd,complex_dble *wd); +extern void add_v2vd(int n,complex *v,complex_dble *wd); +extern void diff_vd2v(int n,complex_dble *vd,complex_dble *wd,complex *w); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/wflow.h b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/wflow.h new file mode 100644 index 0000000000000000000000000000000000000000..5024ff4d47dec1132990b3fab6d8720a7347651b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/include/wflow.h @@ -0,0 +1,20 @@ +/******************************************************************************* +* +* File wflow.h +* +* Copyright (C) 2009, 2010, 2011, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +*******************************************************************************/ + +#ifndef WFLOW_H +#define WFLOW_H + +/* WFLOW_C */ +extern void fwd_euler(int n,double eps); +extern void fwd_rk2(int n,double eps); +extern void fwd_rk3(int n,double eps); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/INDEX 
new file mode 100644 index 0000000000000000000000000000000000000000..381f6e31121fb044cd105134f34d9429cf4bfc6c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/INDEX @@ -0,0 +1,29 @@ + +******************************************************************************** + + Simulations of QCD with Wilson quarks + +******************************************************************************** + +Simulation programs + +qcd1 HMC simulation program for QCD with Wilson quarks. + +ym1 HMC simulation program for the (pure) SU(3) gauge theory. + + +Measurement programs + +ms1 Measurement of reweighting factors. + +ms2 Computation of the spectral range of the hermitian + Dirac operator. + +ms3 Computation of Wilson flow observables. + +ms4 Computation of quark propagators. + + +Some examples of valid input parameter files can be found in the directory +./examples. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..7f97805f8ab8cb5e2be57c4fc35c658fe6d575a6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/Makefile @@ -0,0 +1,173 @@ +################################################################################ +# +# Makefile to compile and link C programs with MPI subroutines. Version valid +# for Linux machines with GNU make. +# +# "make" compiles and links the specified main programs and modules, using the +# specified libraries (if any), and produces the executables. +# +# "make clean" removes all files generated by "make". +# +# Dependencies on included files are automatically taken care of. 
+# +################################################################################ + +all: rmxeq mkdep mkxeq +.PHONY: all + + +# main programs and modules to be compiled + +MAIN = ym1 qcd1 ms1 ms2 ms3 ms4 + +ARCHIVE = archive sarchive + +BLOCK = block blk_grid map_u2blk map_sw2blk map_s2blk + +DFL = dfl_geometry dfl_subspace ltl_gcr dfl_sap_gcr dfl_modes + +DIRAC = Dw_dble Dw Dw_bnd + +FLAGS = flags action_parms dfl_parms force_parms hmc_parms lat_parms \ + mdint_parms rat_parms rw_parms sap_parms solver_parms + +FORCES = force0 force1 force2 force3 force4 force5 \ + frcfcts genfrc tmcg tmcgm xtensor + +LATTICE = bcnds uidx ftidx geometry + +LINALG = salg salg_dble valg valg_dble liealg cmatrix_dble cmatrix + +LINSOLV = cgne mscg fgcr fgcr4vd + +LITTLE = Aw_gen Aw_com Aw_ops Aw_dble Aw ltl_modes + +MDFLDS = mdflds fcom + +RANDOM = ranlux ranlxs ranlxd gauss + +RATFCTS = elliptic zolotarev ratfcts + +SAP = sap_com sap_gcr sap blk_solv + +SFLDS = sflds scom sdcom Pbnd Pbnd_dble + +SU3FCTS = chexp su3prod su3ren cm3x3 random_su3 + +SW_TERM = pauli pauli_dble swflds sw_term + +TCHARGE = ftcom ftensor tcharge ym_action + +UFLDS = plaq_sum uflds udcom bstap + +UPDATE = chrono mdsteps counters mdint hmc rwtm rwtmeo rwrat + +UTILS = endian mutils utils wspace + +VFLDS = vflds vinit vcom vdcom + +WFLOW = wflow + +MODULES = $(ARCHIVE) $(BLOCK) $(DFL) $(DIRAC) $(FLAGS) $(FORCES) \ + $(LATTICE) $(LINALG) $(LINSOLV) $(LITTLE) $(MDFLDS) $(RANDOM) \ + $(RATFCTS) $(SAP) $(SFLDS) $(SU3FCTS) $(SW_TERM) $(TCHARGE) \ + $(UFLDS) $(UPDATE) $(UTILS) $(VFLDS) $(WFLOW) + + +# Logging option (-mpilog or -mpitrace or -mpianim) + +LOGOPTION = + + +# search path for modules + +MDIR = ../modules + +VPATH = .:$(MDIR)/flags:$(MDIR)/lattice:$(MDIR)/archive:$(MDIR)/linalg:\ + $(MDIR)/random:$(MDIR)/uflds:$(MDIR)/mdflds:$(MDIR)/su3fcts:\ + $(MDIR)/utils:$(MDIR)/forces:$(MDIR)/sflds:$(MDIR)/dirac:\ + $(MDIR)/sw_term:$(MDIR)/tcharge:$(MDIR)/block:$(MDIR)/sap:\ + 
$(MDIR)/linsolv:$(MDIR)/dfl:$(MDIR)/vflds:$(MDIR)/little:\ + $(MDIR)/update:$(MDIR)/wflow:$(MDIR)/ratfcts + + +# additional include directories + +INCPATH = $(MPI_INCLUDE) ../include + + +# additional libraries + +LIBS = m + +LIBPATH = $(MPI_HOME)/lib + + +# scheduling and optimization options + +CFLAGS = -std=c89 -pedantic -fstrict-aliasing \ + -Wall -Wno-long-long -Wstrict-prototypes -Werror \ + -O -mno-avx -Dx64 -DPM + + +# debugging flags (add to CFLAGS if needed) + +# -DCGNE_DBG -DFGCR_DBG -DFGCR4VD_DBG -DMSCG_DBG +# -DDFL_MODES_DBG -DMDINT_DBG -DRWRAT_DBG + + +############################## do not change ################################### + +SHELL=/bin/bash +CC=$(MPI_HOME)/bin/mpicc +CLINKER=$(CC) + +PGMS= $(MAIN) $(MODULES) + +-include $(addsuffix .d,$(PGMS)) + + +# rule to make dependencies + +$(addsuffix .d,$(PGMS)): %.d: %.c Makefile + @ $(GCC) -ansi $< -MM $(addprefix -I,$(INCPATH)) -o $@ + + +# rule to compile source programs + +$(addsuffix .o,$(PGMS)): %.o: %.c Makefile + $(CC) $< -c $(CFLAGS) $(LOGOPTION) $(addprefix -I,$(INCPATH)) + + +# rule to link object files + +$(MAIN): %: %.o $(addsuffix .o,$(MODULES)) Makefile + $(CLINKER) $< $(addsuffix .o,$(MODULES)) $(CFLAGS) $(LOGOPTION) \ + $(addprefix -L,$(LIBPATH)) $(addprefix -l,$(LIBS)) -o $@ + + +# produce executables + +mkxeq: $(MAIN) + + +# remove old executables + +rmxeq: + @ -rm -f $(MAIN); \ + echo "delete old executables" + + +# make dependencies + +mkdep: $(addsuffix .d,$(PGMS)) + @ echo "generate tables of dependencies" + + +# clean directory + +clean: + @ -rm -rf *.d *.o *.alog *.clog *.slog $(MAIN) +.PHONY: clean + +################################################################################ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.global b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.global new file mode 100644 index 0000000000000000000000000000000000000000..af2293323dced4a360cb82fe925f915c54ca8527 --- /dev/null 
+++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.global @@ -0,0 +1,337 @@ + +Header file global.h + +SYNTAX + +In the main program + +#define MAIN_PROGRAM +#include "global.h" + +In all other cases + +#include "global.h" + + +DESCRIPTION + +In this file the globally accessible constants, variables and arrays are +defined. It is here that the geometry of the lattice and its division into +processor sublattices is defined. + + +Lattice geometry +---------------- + +Currently the only constants that the user can specify are + + NPROC0 The processes are thought to be arranged in a hypercubic + NPROC1 grid with NPROC0,..,NPROC3 processes in direction 0,..,3. + NPROC2 If NPROCx=1 the lattice is not divided in direction x. + NPROC3 Otherwise NPROCx has to be even. + + L0 The local lattices are blocks of size L0xL1xL2xL3 that + L1 build up the full lattice in the obvious way. The sizes + L2 of the latter are thus (NPROC0*L0),..,(NPROC3*L3). It + L3 is assumed that L0,..,L3 are all even and at least 4. + + NPROC0_BLK The process grid is logically divided into hypercubic + NPROC1_BLK blocks of size NPROC0_BLK,..,NPROC3_BLK in direction + NPROC2_BLK 0,..,3. NPROCx_BLK must be greater or equal to 1 and + NPROC3_BLK NPROCx must be an integer multiple of NPROCx_BLK. + +The program verifies at compilation time that the values of these constants +are in the allowed range. See the section "MPI process ranking" below for +further explanation of the parameters NPROCx_BLK. + +All other macros that are defined in global.h are derived from these input +values. In particular + + NPROC Total number of processes. + + VOLUME Number of lattice points in the local lattice + [=L0*L1*L2*L3]. + +Independently of the boundary conditions imposed on the dynamical fields, the +lattice is considered to be a 4-dimensional torus. Depending on the process +numbers NPROC0,..,NPROC3, the local lattices can have non-empty boundaries on +up to 8 sides. 
A two-dimensional sketch of the situation is + + + + + + + + + * Volume points = the true + + * * * * * * * + local lattice. + + * * * * * * * + + + * * * * * * * + + Exterior boundary points = + + * * * * * * * + copies of the corresponding + + * * * * * * * + points of the local lattices + + * * * * * * * + on the neighbouring processes. + + * * * * * * * + + + * * * * * * * + + + + + + + + + + +Note that there is no boundary in direction x if NPROCx=1, since the exterior +boundary points in that direction coincide, in this case, with the interior +boundary points on the opposite side of the local lattice. The numbers of +exterior boundary points in direction 0,1,2,3 and the total number of boundary +points are + + FACE0 + FACE1 + FACE2 + FACE3 + + BNDRY = 2*(FACE0+FACE1+FACE2+FACE3) + +where, by definition, FACEx=0 if NPROCx=1. The boundaries of the local lattice +are labeled such that the face in direction -0 has label 0, the face in +direction +0 has label 1, the face in direction -1 has label 2, and so on. + +The global arrays that define the process grid are + + int cpr[4] Cartesian coordinates of the local process. + + int npr[8] Process ids of the 8 processes that operate on the 8 + neighbouring lattices of the local lattice. Explicitly, + npr[2*mu] is the id of the process in direction -mu and + npr[2*mu+1] the same in direction +mu. + +The global arrays that define the lattice geometry are + + int ipt[VOLUME] ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*x0] is the index of the + point on the local lattice with Cartesian coordinates + (x0,x1,x2,x3), where the coordinate x0 ranges from 0 + to L0-1, x1 from 0 to L1-1, and so on. + + int iup[VOLUME][4] iup[ix][mu] is the index of the nearest neighbour + point in the positive ("up") direction mu of the + point on the local lattice with index ix. 
If the + nearest neighbour point is on the boundary of the + lattice, the index iy=iup[ix][mu] is in the range + VOLUME<=iy [-noexp] [-a [-norng]] + + +DESCRIPTION + +This program reads gauge field configurations from disk and computes +stochastic estimates of reweighting factors. + + +COMMAND-LINE OPTIONS + +The program has only few options since most of the parameters are passed +through an input file. The options are + +-i Specifies the name of the input file. The name can be + a fully qualified name or be specified relative to the + working directory. + +-noexp Field configurations are normally read in exported + file format from the specified configuration directory. + If this option is set, the configurations are instead + expected in the imported file format on the local disks. + +-a This option specifies that the run is a continuation of + a previous run. All output data are appended to the + previous output files. + +-norng Continuation runs normally start from the saved state + of the random number generators. This option specifies + that the traditional initialization of the generators is + to be used (see section RANDOM NUMBER GENERATOR below). + NOTE: starting from the saved state is not possible if + the process grid sizes NPROC0,..,NPROC3 are changed. + + +INPUT PARAMETERS + +The lattice size and the process grid must be defined in the file global.h +(see README.global). All other parameters are read from the input file. An +example of a valid input file is ms1.in in this directory. 
The parameter +values specified in this file are: + +[Run name] +name Snoopy137 # Run name = configuration base name + +[Directories] +log_dir ../data/ms1/log # Log file directory +dat_dir ../data/ms1/dat # Data file directory +loc_dir /ndata/qcd1/cnfg # Local configuration directory +cnfg_dir /data/qcd1/cnfg # Exported configuration directory + +[Configurations] +first 1 # No of the first configuration to consider +last 4 # No of the last configuration +step 1 # Configuration separation (last-first must + # be an integer multiple of step) +nrw 2 # Number of reweighting factors to be + # computed in this run + +[Random number generator] +level 0 # Ranlux level +seed 73099 # Ranlux seed + +[Lattice parameters] +kappa 0.1300 0.1290 # List of sea-quark hopping parameters +csw 1.234 # Coefficient of the SW term in the + # Dirac operator + +[Boundary conditions] +type 2 # Type of boundary condition (0: open, + # 1: SF, 2: open-SF, 3: periodic) +phi 0.12 -0.56 # Boundary values of the gauge field at + # time 0 +phi' 0.92 0.76 # Boundary values of the gauge field at + # time NPROC0*L0 +cF 0.95 # Fermion action improvement coefficient + # at time 0 +cF' 0.90 # Fermion action improvement coefficient + # at time NPROC0*L0 + +Then follows a description of the reweighting factors labeled by an index that +runs from 0 to nrw-1 (see flags/rw_parms.c). The available reweighting factors +and associated parameter sections are described in the file doc/parms.pdf (see +the top of the modules update/rwtm.c, update/rwtmeo.c and update/rwrat.c for +further explanations). + +Reweighting factors of type RWRAT require a choice of a rational function. The +solvers to be used need to be specified too (see doc/parms.pdf). + +Superfluous sections and parameters may be deleted or commented out. If +present they are not read by the program and the specified values (if any) +have no effect on the run. 
As already mentioned, the indices of the parameter +sections describing the reweighting factors must increase in steps of 1 from 0 +to nrw-1. The indices of the solver sections can be freely chosen in the range +0,..,31. + + +FILES + +The program searches for exported field configurations + + <run name>n<int> + +in the directory cnfg_dir, where <int> is the configuration number. +Imported configurations + + <run name>n<int>_0 (on process 0) + <run name>n<int>_1 (on process 1) + <run name>n<int>_2 (on process 2) + ... ... + +are searched in the directory loc_dir. + +The program writes the results of the computations to the files + + <run name>.ms1.log Log file + <run name>.ms1.log~ Backup log file + + <run name>.ms1.dat Measurement data file + <run name>.ms1.dat~ Backup data file + + <run name>.ms1.par Parameter data file + <run name>.ms1.par~ Backup parameter data file + + <run name>.ms1.rng Exported state of the random number generators + <run name>.ms1.rng~ Backup random number generator state file + +in the directories log_dir (log file) and dat_dir (data files). The parameter +file is created at the beginning of the run and remains unchanged after that. +The backup copies *.log~, *.dat~ and *.rng~ of the *.log, *.dat and *.rng files +are updated each time a configuration is fully processed. + +The directories log_dir and dat_dir, as well as the directory cnfg_dir if the +option -noexp is not set, must be accessible from process 0. If the -noexp +option is set, the directory loc_dir must be accessible from all processes. + + +OUTPUT DATA + +At the beginning of the data file the program writes the data contained in the +header structure + +static struct +{ + int nrw; + int *nfct,*nsrc; +} file_head; + +where nrw is the number of reweighting factors specified in the input file, +nfct[0],..,nfct[nrw-1] the array of the associated numbers of Hasenbusch +factors (set to 1 for RWRAT reweighting factors) and nsrc[0],..,nsrc[nrw-1] +the array of the associated numbers N of source fields. 
+ +After the header data, the data file contains a sequence of structures + +static struct +{ + int nc; + double ***sqn,***lnr; +} data; + +labeled by the field configuration number nc. For each configuration, the data +are + + sqn[irw][ifct][isrc] Square norm of the source field number isrc + generated in the course of the calculation of + the factor number ifct of the reweighting factor + number irw. + + lnr[irw][ifct][isrc] The logarithm, -ln(r), of the associated stochastic + estimate of the reweighting factor r (irw=0,..,nrw-1, + ifct=0,..,nfct[irw]-1, isrc=0,..,nsrc[irw]-1). + +See the functions write_file_head() and write_data() in the file ms1.c for the +exact order in which the data are written to the output files. + +From these data, the stochastic estimates W[irw] of the reweighting factor +number irw are obtained by calculating the averages + + w[irw][ifct]= + + (1/nsrc[irw])*sum_{isrc=0}^{nsrc[irw]-1} exp{-lnr[irw][ifct][isrc]} + +and the product + + W[irw]=prod_{ifct=0}^{nfct[irw]-1} w[irw][ifct] + +A simple main program that reads and analyses the data files is included in +the directory ../devel/nompi/main. + + +BINARY FILE FORMAT + +The log files are ASCII files that should be readable on any machine. The +data files, on the other hand, are written in binary format using the fwrite() +function. Integers are written as 4 byte signed integers and floating-point +numbers according to the IEEE-754 standard for double-precision numbers. + +Binary data written to disk are converted to little endian byte order if the +machine is big endian. Field configurations and measurement data stored on +disk are thus always in little endian byte order independently of the machine +that is used. + + +RANDOM NUMBER GENERATOR + +Random numbers are generated using the ranlux generator. Depending on the +context, either single- or double-precision random numbers are generated. 
The +initialization of the generator is as follows: + +- In the case of a new run, the program reads the parameters "level" and + "seed" from the input file and uses these to initialize the generator. + +- Continuation runs do the following: + + o If the option -norng is set, the parameters "level" and "seed" are read + from the input parameter file and the generator is initialized using + "seed"^n (bitwise exclusive or) as the seed value, where n is the number + of the last field configuration saved in the previous run. + + o Otherwise the state of the generator is read from the file + <run name>.ms1.rng. The generator is thus reset to the state it had at the + end of the previous run. Note that the process grid NPROC0x..xNPROC3 must + be unchanged in this case from one run to the next (an error occurs if it + is not). + +In a sequence of continuation runs, it is therefore recommended to leave the +process grid unchanged and to make no use of the option -norng. If the process +grid is changed at some point, the next run must start from an exported field +configuration and the option -norng must be set. In all cases, the parameters +"level" and "seed" on the input parameter file may be left unchanged. + +Note that if the configurations are read in imported form, the state of the +generator is *not* set to the one stored on the configuration file. The +generated random numbers, and consequently the computed reweighting factors, +are therefore independent of whether the configurations are read in imported +or exported form. + + +SAFETY MEASURES AND ERROR REPORTING + +A number of safety measures have been implemented: + +- It is not possible to overwrite an existing log or data file; these + must first be deleted or renamed by hand if a run is to be repeated. + +- Appending a run to a previous measurement run is only possible if the run + name and all relevant parameters match. 
Moreover, the new configuration + sequence must extend the previous one with the same configuration spacing. + +- The accessibility of the various directories and the compatibility + of the chosen parameters is checked at the beginning of the program. + +Any attempt to force illegal operations leads to an abnormal termination of +the program, with an informative message being written either to the log file +or the file STARTUP_ERROR in the program directory (if the error occurs before +the log file is opened). + +It should be noted that filenames may not be longer than 127 characters. The +program checks at an early stage whether this is the case or not. Longer +filenames can be accommodated by setting the macro NAME_SIZE in the header +file global.h to a larger value. + + +CHECKPOINTS AND EARLY TERMINATION + +The program can be stopped gracefully by touching a file in the log directory +with the same name as the log file but with extension .end instead of .log. It +may take a while until the program exits, because it will only do so when the +current field configuration is fully processed. + +If the machine crashes, or if the program was stopped in the way described, +the run can always be continued starting from the saved output files. However, +after a crash, the log and data files may be corrupted, in which case they +must first be restored from the backup files. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms2 b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms2 new file mode 100644 index 0000000000000000000000000000000000000000..1874fd3774aa4f12777e7d844e51e942e98b9bac --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms2 @@ -0,0 +1,140 @@ + +Main program ms2 + +SYNTAX + +ms2 -i <filename> [-noexp] + + +DESCRIPTION + +This program reads gauge field configurations from disk and estimates the +spectral range [ra,rb] of the even-odd preconditioned hermitian Dirac operator +(Dwhat^dagger*Dwhat)^(1/2) using the power method. A summary of results is +printed to the log file together with a table of suggested parameters of the +Zolotarev rational approximation for the operator (Dwhat^dagger*Dwhat)^(-1/2). + + +COMMAND-LINE OPTIONS + +The program has only few options since most of the parameters are passed +through an input file. The options are + +-i <filename> Specifies the name of the input file. The name can be + a fully qualified name or be specified relative to the + working directory. + +-noexp Field configurations are normally read in exported + file format from the specified configuration directory. + If this option is set, the configurations are instead + expected in the imported file format on the local disks. + + +INPUT PARAMETERS + +The lattice size and the process grid must be defined in the file global.h +(see README.global). All other parameters are read from the input file. An +example of a valid input file is ms2.in in this directory. 
The parameter +values specified in this file are: + +[Run name] +name Snoopy137 # Run name = configuration base name + +[Directories] +log_dir ../data/ms2/log # Log file directory +loc_dir /ndata/qcd1/cnfg # Local configuration directory +cnfg_dir /data/qcd1/cnfg # Exported configuration directory + +[Configurations] +first 1 # No of the first configuration to consider +last 4 # No of the last configuration +step 1 # Configuration separation (last-first must be an integer multiple of step) + +[Dirac operator] +kappa 0.1300 # Hopping parameter +csw 1.234 # Coefficient of the SW term + +[Boundary conditions] +type 2 # Type of boundary condition (0: open, + # 1: SF, 2: open-SF, 3: periodic) +phi 0.12 -0.56 # Boundary values of the gauge field at + # time 0 +phi' 0.92 0.76 # Boundary values of the gauge field at + # time NPROC0*L0 +cF 0.95 # Fermion action improvement coefficient + # at time 0 +cF' 0.90 # Fermion action improvement coefficient + # at time NPROC0*L0 + +[Power method] +np_ra 20 # Number of power iterations to be + # applied when estimating ra +np_rb 100 # Number of power iterations to be + # applied when estimating rb + +Then follows a description of the solver for the Dirac equation to be used in +the course of the inverse power iterations (see doc/parms.pdf). The supported +solvers are CGNE, SAP_GCR and DFL_SAP_GCR. + +Superfluous sections and parameters may be deleted or commented out. If +present they are not read by the program and the specified values (if any) +have no effect on the run. The solver index must be set to 0. + + +FILES + +The program searches for exported field configurations + + <run name>n<int> + +in the directory cnfg_dir, where <int> is the configuration number. +Imported configurations + + <run name>n<int>_0 (on process 0) + <run name>n<int>_1 (on process 1) + <run name>n<int>_2 (on process 2) + ... ... + +are searched in the directory loc_dir. + +The program prints the results of the computations to the files + + <run name>.ms2.log Log file + <run name>.ms2.log~ Backup log file + +in the directory log_dir. 
The backup file is updated each time a configuration +is fully processed. + +The directory log_dir, as well as the directory cnfg_dir if the -noexp option +is not set, must be accessible from process 0. If the -noexp option is set, +the directory loc_dir must be accessible from all processes. + + +SAFETY MEASURES AND ERROR REPORTING + +A number of safety measures have been implemented: + +- It is not possible to overwrite an existing log file. The file + must first be deleted or renamed if a run is to be repeated. + +- The accessibility of the various directories and the compatibility + of the chosen parameters is checked at the beginning of the program. + +Any attempt to force illegal operations leads to an abnormal termination of +the program, with an informative message being written either to the log file +or the file STARTUP_ERROR in the program directory (if the error occurs before +the log file is opened). + +It should be noted that filenames may not be longer than 127 characters. The +program checks at an early stage whether this is the case or not. Longer +filenames can be accommodated by setting the macro NAME_SIZE in the header +file global.h to a larger value. + + +EARLY TERMINATION + +The program can be stopped gracefully by touching a file in the log directory +with the same name as the log file but with extension .end instead of .log. It +may take a while until the program exits, because it will only do so when the +current field configuration is fully processed. 
+ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms3 b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms3 new file mode 100644 index 0000000000000000000000000000000000000000..6245f39d3a15ebd9403bd419e5722d3433ea6297 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms3 @@ -0,0 +1,200 @@ + +Main program ms3 + +SYNTAX + +ms3 -i <filename> [-noexp] [-a] + + +DESCRIPTION + +This program reads gauge field configurations from disk and computes +a set of Wilson flow observables. + + +COMMAND-LINE OPTIONS + +The program has only few options since most of the parameters are passed +through an input file. The options are + +-i <filename> Specifies the name of the input file. The name can be + a fully qualified name or be specified relative to the + working directory. + +-noexp Field configurations are normally read in exported + file format from the specified configuration directory. + If this option is set, the configurations are instead + expected in the imported file format on the local disks. + +-a This option specifies that the run is a continuation of + a previous run. All output data are appended to the + previous output files. + + +INPUT PARAMETERS + +The lattice size and the process grid must be defined in the file global.h +(see README.global). All other parameters are read from the input file. An +example of a valid input file is ms3.in in this directory. 
The parameter +values specified in this file are: + +[Run name] +name Snoopy137 # Run name = configuration base name + +[Directories] +log_dir ../data/ms3/log # Log file directory +dat_dir ../data/ms3/dat # Data file directory +loc_dir /ndata/qcd1/cnfg # Local configuration directory +cnfg_dir /data/qcd1/cnfg # Exported configuration directory + +[Configurations] +first 1 # No of the first configuration to consider +last 4 # No of the last configuration +step 1 # Configuration separation (last-first must + # be an integer multiple of step) + +[Boundary conditions] +type 2 # Type of boundary condition (0: open, + # 1: SF, 2: open-SF, 3: periodic) +phi 0.12 -0.56 # Boundary values of the gauge field at + # time 0 +phi' 0.92 0.76 # Boundary values of the gauge field at + # time NPROC0*L0 + +[Wilson flow] +integrator RK3 # EULER: Euler, RK2: 2nd order Runge-Kutta + # RK3: 3rd order Runge-Kutta +eps 2.0e-2 # Integration time step size +nstep 100 # Number of integration steps +dnms 10 # Number of integration steps between + # observable measurements + +Superfluous parameters may be deleted or commented out. If present they are +not read by the program and the specified values (if any) have no effect on +the run. + +FILES + +The program searches for exported field configurations + + <run name>n<int> + +in the directory cnfg_dir, where <int> is the configuration number. +Imported configurations + + <run name>n<int>_0 (on process 0) + <run name>n<int>_1 (on process 1) + <run name>n<int>_2 (on process 2) + ... ... + +are searched in the directory loc_dir. + +The program writes the results of the computations to the files + + <run name>.ms3.log Log file + <run name>.ms3.log~ Backup log file + + <run name>.ms3.dat Measurement data file + <run name>.ms3.dat~ Backup data file + + <run name>.ms3.par Parameter data file + <run name>.ms3.par~ Backup parameter data file + +in the directories log_dir (log file) and dat_dir (data files). The parameter +file is created at the beginning of the run and remains unchanged after that. 
+The backup log and data files are updated each time a configuration is fully +processed. + +The directories log_dir and dat_dir, as well as the directory cnfg_dir if the +-noexp option is not set, must be accessible from process 0. If the -noexp +option is set, the directory loc_dir must be accessible from all processes. + + +OUTPUT DATA + +For each configuration, the Wilson flow is integrated from flow time 0 to time +"nstep"*"eps" in steps of eps using the specified integrator. After every +"dnms" integration steps, the time-slice sums of the densities of the Wilson +plaquette action, the Yang-Mills action and the topological charge are +computed (see uflds/plaq_sum.c, tcharge/ftensor.c, tcharge/ym_action.c and +tcharge/tcharge.c). + +At the beginning of the measurement data file the program writes the data +contained in the header structure + +static struct +{ + int dn,nn,tmax; + double eps; +} file_head; + +where dn="dnms", nn="nstep"/"dnms" and tmax=NPROC0*L0. After the header data, +the data file contains a sequence of data structures + +static struct +{ + int nc; + double **Wsl,**Ysl,**Qsl; +} data; + +labeled by the configuration number nc. In each case the time-slice sums of +the densities of the Wilson plaquette action, the Yang-Mills action and the +topological charge are written to the arrays + + Wsl[in][t] (in=0,..,nn, t=0,..,tmax-1) + Ysl[in][t] + Qsl[in][t] + +See the functions write_file_head() and write_data() in the program file +ms3.c for the exact order in which the data are written to the output files. + + +BINARY FILE FORMAT + +The log files are ASCII files that should be readable on any machine. The +data files, on the other hand, are written in binary format using the fwrite() +function. Integers are written as 4 byte signed integers and floating-point +numbers according to the IEEE-754 standard for double-precision numbers. + +Binary data written to disk are converted to little endian byte order if the +machine is big endian. 
Field configurations and measurement data stored on +disk are thus always in little endian byte order independently of the machine +that is used. + + +SAFETY MEASURES AND ERROR REPORTING + +A number of safety measures have been implemented: + +- It is not possible to overwrite an existing log or data file; these + must first be deleted or renamed by hand if a run is to be repeated. + +- Appending a run to a previous measurement run is only possible if the run + name and all relevant parameters match. Moreover, the new configuration + sequence must extend the previous one with the same configuration spacing. + +- The accessibility of the various directories and the compatibility + of the chosen parameters is checked at the beginning of the program. + +Any attempt to force illegal operations leads to an abnormal termination of +the program, with an informative message being written either to the log file +or the file STARTUP_ERROR in the program directory (if the error occurs before +the log file is opened). + +It should be noted that filenames may not be longer than 127 characters. The +program checks at an early stage whether this is the case or not. Longer +filenames can be accommodated by setting the macro NAME_SIZE in the header +file global.h to a larger value. + + +CHECKPOINTS AND EARLY TERMINATION + +The program can be stopped gracefully by touching a file in the log directory +with the same name as the log file but with extension .end instead of .log. It +may take a while until the program exits, because it will only do so when the +current field configuration is fully processed. + +If the machine crashes, or if the program was stopped in the way described, +the run can always be continued starting from the saved output files. However, +after a crash, the log and data files may be corrupted, in which case they +must first be restored from the backup files. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms4 b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms4 new file mode 100644 index 0000000000000000000000000000000000000000..c00dbf84a40f5dae8a92c3d02f2b1c373bd5bad4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ms4 @@ -0,0 +1,201 @@ + +Main program ms4 + +SYNTAX + +ms4 -i <filename> [-noexp] + + +DESCRIPTION + +This program reads gauge field configurations from disk and computes the quark +propagator for a number of gaussian random source fields at a specified time +x0. The calculated propagators are exported to disk in a universal format (see +archive/sarchive.c). The program has a very limited functionality and serves +for illustration only. + + +COMMAND-LINE OPTIONS + +The program has only few options since most of the parameters are passed +through an input file. The options are + +-i <filename> Specifies the name of the input file. The name can be + a fully qualified name or be specified relative to the + working directory. + +-noexp Field configurations are normally read in exported + file format from the specified configuration directory. + If this option is set, the configurations are instead + expected in the imported file format on the local disks. + + +INPUT PARAMETERS + +The lattice size and the process grid must be defined in the file global.h +(see README.global). All other parameters are read from the input file. An +example of a valid input file is ms4.in in this directory. 
The parameter +values specified in this file are: + +[Run name] +name Snoopy137 # Run name = configuration base name + +[Directories] +log_dir ../data/ms4/log # Log file directory +loc_dir /ndata/qcd1/cnfg # Local configuration directory +cnfg_dir /data/qcd1/cnfg # Exported configuration directory +sfld_dir /data/ms4/sfld # Exported propagator directory + +[Configurations] +first 1 # No of the first configuration to consider +last 4 # No of the last configuration +step 1 # Configuration separation (last-first must + # be an integer multiple of step) + +[Random number generator] +level 0 # Ranlux level +seed 73099 # Ranlux seed + +[Dirac operator] +kappa 0.1300 # Hopping parameter +mu 0.001 # Twisted mass +csw 1.234 # Coefficient of the SW term + +[Boundary conditions] +type 2 # Type of boundary condition (0: open, + # 1: SF, 2: open-SF, 3: periodic) +phi 0.12 -0.56 # Boundary values of the gauge field at + # time 0 +phi' 0.92 0.76 # Boundary values of the gauge field at + # time NPROC0*L0 +cF 0.95 # Fermion action improvement coefficient + # at time 0 +cF' 0.90 # Fermion action improvement coefficient + # at time NPROC0*L0 + +[Source fields] +x0 20 # Time at which the random source fields + # live (0<=x0n + +in the directory cnfg_dir, where is the configuration number. +Imported configurations + + n_0 (on process 0) + n_1 (on process 1) + n_2 (on process 2) + ... ... + +are searched in the directory loc_dir. + +The program prints some information on the progress of the computations +to the files + + .ms4.log Log file + .ms4.log~ Backup log file + +in the directory log_dir. The backup file is updated each time a configuration +is fully processed. + +The calculated solutions of the Dirac equation are stored in the files + + n.s0 (source no 0) + n.s1 (source no 1) + n.s2 (source no 2) + ... ... + +in the directory sfld_dir (nsrc files per gauge field configuration). These +files can be read using the program import_sfld() [archive/sarchive.c]. 
+ +The directories log_dir and sfld_dir, as well as the directory cnfg_dir if the +-noexp option is not set, must be accessible from process 0. If the -noexp +option is set, the directory loc_dir must be accessible from all processes. + + +SOLVER PERFORMANCE + +The program prints the time required for the solution of the Dirac equation to +the log file. When selecting the solver, one should take into account that the +CGNE solver tends to be very slow at small quark masses. In the case of the +GCR solvers, the performance may be poor when the twisted quark mass mu is +larger than, say, 0.1 and much larger than the ordinary quark mass. The use of +the deflated solver is recommended if both masses are small. + +The processing times per gauge field configuration quoted in the log file +include the time required for the I/O operations. + + +BINARY FILE FORMAT + +The *.log files are ASCII files that should be readable on any machine. Data +and configuration files, on the other hand, are written in binary format using +the fwrite() function. Integers are written as 4 byte signed integers and +floating-point numbers according to the IEEE-754 standard for double-precision +numbers. + +Binary data written to disk are converted to little endian byte order if the +machine is big endian. Field configurations and measurement data stored on +disk are thus always in little endian byte order independently of the machine +that is used. + + +RANDOM NUMBER GENERATOR + +Random numbers are generated using the ranlux generator. The generator is +initialized using the values of the parameters "level" and "seed" specified in +the input file. If the configurations are read in imported form, the state of +the generator is *not* set to the one stored on the configuration file. The +generated random numbers, and consequently the random source fields, are +therefore independent of whether the configurations are read in imported or +exported form. 
+ + +SAFETY MEASURES AND ERROR REPORTING + +A number of safety measures have been implemented: + +- It is not possible to overwrite an existing log file. This file + must first be deleted or renamed if a run is to be repeated. + +- The accessibility of the various directories and the compatibility + of the chosen parameters are checked at the beginning of the program. + +Any attempt to force illegal operations leads to an abnormal termination of +the program, with an informative message being written either to the log file +or the file STARTUP_ERROR in the program directory (if the error occurs before +the log file is opened). + +On the other hand, once a run has started successfully, the calculated propagators +are saved unconditionally, i.e. any existing propagator files with matching +filenames are overwritten. + +It should be noted that filenames may not be longer than 127 characters. The +program checks at an early stage whether this is the case or not. Longer +filenames can be accommodated by setting the macro NAME_SIZE in the header +file global.h to a larger value. + + +EARLY TERMINATION + +The program can be stopped gracefully by touching a file in the log directory +with the same name as the log file but with the extension .end instead of .log. It +may take a while until the program exits, because it will only do so when the +current field configuration is fully processed. 
+ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.qcd1 b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.qcd1 new file mode 100644 index 0000000000000000000000000000000000000000..fa97c8ecc0b042026f6facb413bd4a918c8cae3e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.qcd1 @@ -0,0 +1,466 @@ + +Main program qcd1 + +SYNTAX + +qcd1 -i [-noloc] [-noexp] [-rmold] [-noms] + [-c [-a [-norng]]] + + +DESCRIPTION + +This program generates an ensemble of gauge fields representative of QCD with +a multiplet of Wilson quarks. Exactly which theory is simulated depends on the +parameters passed to the program. Moreover, one has a choice of boundary +conditions in time (open, SF, open-SF and periodic). The simulation is based +on a version of the HMC algorithm, which can be tuned in many ways via the +input parameters. + +In the course of the simulation, the average plaquette is measured +and the generated gauge field configurations are written out to files in +specified directories. Further observables, constructed using the Wilson flow, +are measured as well. + + +COMMAND-LINE OPTIONS + +The program has only few options since most of the parameters are passed +through an input file. The options are + +-i Specifies the name of the input file. The name can be + a fully qualified name or be specified relative to the + working directory. + +-noloc Normally the generated configurations are saved to the + local disks on the nodes of the machine. In addition they + are exported from process 0 using the export function (see + archive/archive.c). Initial configurations can be imported + or be read from the local disks. This option specifies that + the local disks should not be used. + +-noexp Do not export the generated field configurations. + +-rmold Remove old configurations and keep only the one which + was last saved to disk. 
The initial configuration + given on the command line is not removed unless the + -a option is set. + +-noms Do not measure any observables except for the average + plaquette. + +-c This option instructs the program to read the initial + gauge-field configuration from the specified file. The + file must be in one of the configuration directories + listed in the input file and its name must be of the form + described below. The run starts from a random gauge-field + configuration if this option is omitted. + +-a This option specifies that the run is a continuation of a + previous run. The -c option is required in this + case and must point to the last configuration saved by + the previous run. All output data are appended to the + previous output files. + +-norng Continuation runs normally start from the saved state + of the random number generators. This option specifies + that the traditional initialization of the generators is + to be used (see section RANDOM NUMBER GENERATOR below). + NOTE: starting from the saved state is not possible if + the process grid sizes NPROC0,..,NPROC3 are changed. + +The concurrent use of the options -noloc and -noexp (in which case the +generated configurations would not be saved anywhere) is considered to be an +error and is not permitted. In a sequence of continuation runs, the -noms +option must be set in either all or none of the runs. + + +INPUT PARAMETERS + +The lattice size and the process grid must be defined in the file global.h +(see README.global). All other parameters are read from the input file and the +command line. An example of a valid input file is qcd1.in in this directory. 
+The parameter values specified in this file are: + +[Run name] +name Snoopy137 # Run name = configuration base name + +[Directories] +log_dir ../data/qcd1/log # Log file directory +dat_dir ../data/qcd1/dat # Data file directory +loc_dir /ndata/qcd1/cnfg # Local configuration directory +cnfg_dir /data/qcd1/cnfg # Exported configuration directory + +[Lattice parameters] +beta 6.0 # Inverse gauge coupling +c0 1.6667 # Coefficient of the plaquette term + # in the gauge action +kappa 0.1300 # List of sea-quark hopping parameters +csw 1.234 # Coefficient of the SW term in the + # Dirac operator + +[Boundary conditions] +type 2 # Type of boundary condition (0: open, + # 1: SF, 2: open-SF, 3: periodic) +phi 0.12 -0.56 # Boundary values of the gauge field at + # time 0 +phi' 0.92 0.76 # Boundary values of the gauge field at + # time NPROC0*L0 +cG 1.10 # Gauge action improvement coefficient at + # time 0 +cG' 1.05 # Gauge action improvement coefficient at + # time NPROC0*L0 +cF 0.95 # Fermion action improvement coefficient + # at time 0 +cF' 0.90 # Fermion action improvement coefficient + # at time NPROC0*L0 + +[Random number generator] +level 0 # Ranlux level +seed 73099 # Ranlux seed + +[HMC parameters] +actions 0 1 2 # Gauge and pseudo-fermion actions included + # in the simulation +npf 2 # Number of pseudo-fermion fields +mu 0.01 1.0 # List of twisted-mass parameters +nlv 3 # Number of levels of the integrator for + # the molecular-dynamics (MD) equations +tau 0.5 # MD trajectory length + +[MD trajectories] +nth 320 # Number of thermalization trajectories +ntr 32000 # Total number of trajectories +dtr_log 4 # Separation of log entries +dtr_ms 8 # Separation of measurements +dtr_cnfg 32 # Separation of configuration saves + +Then follow the parameters of the integrator levels, the actions, the MD +forces and the solvers for the Dirac equation. 
Their format is described in +the file doc/parms.pdf and on top of the modules + +flags/mdint_parms.c +flags/action_parms.c +flags/force_parms.c +flags/solver_parms.c + +The integrator levels are labeled from 0 (innermost level, usually including +the force deriving from the chosen gauge action) to nlv-1 (outermost level). +Action, force and solver labels must be integers but may otherwise be chosen +arbitrarily. There must be a section for all actions, forces and solvers used, +and for each action section there must be a corresponding force section with +the same label. + +Finally, if measurements using the Wilson flow are to be made, the section + +[Wilson flow] +integrator RK3 # EULER: Euler, RK2: 2nd order Runge-Kutta + # RK3: 3rd order Runge-Kutta +eps 2.0e-2 # Integration step size +nstep 100 # Total number of integration steps +dnms 10 # Number of steps between measurements + +is required. + +The chosen parameter values must satisfy the following constraints: + +- "nth" and "ntr" must be integer multiples of "dtr_cnfg". + +- "nth" must be equal to zero in a continuation run (option -a). + +- "dtr_cnfg" must be a multiple of "dtr_log". + +- "dtr_cnfg" must be a multiple of "dtr_ms" and the latter must be + a multiple of "dtr_log". + +- The number "nstep" of Wilson flow integration steps must be a multiple + of "dnms". + +Depending on the specified options, the values of some parameters are ignored. +In particular, + +- "loc_dir" is not used if the -noloc option is set. + +- "cnfg_dir" is not used if -noexp is set and if the starting + configuration is not of the exported configuration type. + +- "lambda" is only required if the 2nd order OMF integrator is used. + +- The section "Wilson flow" and the parameter "dtr_ms" can be omitted + if the -noms option is set. + +Superfluous sections and parameters may be deleted or commented out. If +present they are not read by the program and have no effect on the run. 
In +particular, the constraints mentioned above involving these parameters need +not be satisfied. + + +INITIAL FIELD CONFIGURATION + +The initial field configuration specified on the command line with the -c +option can be in imported or exported form (see archive/archive.c). In the +case of imported configurations, each MPI process reads a file of the form + + <filename>_<int> + +where <int> is the process number. On the command line, imported and exported +configurations are distinguished by an asterisk (*) like + + <filename>* Imported configuration + + <filename> Exported configuration + +where it goes without saying that the string <filename> must not contain an +asterisk at its end. + +Configurations in imported form are read from the directory loc_dir on the +local disks of the machine. The sizes of the current lattice and those read +from the files must be the same in this case. + +If the configuration is in exported form, it is read from the directory +cnfg_dir on a disk accessible from process 0. The sizes of the current lattice +need not be the same as those read from the configuration file, but must be +integer multiples of the latter. The field is periodically extended if the +lattice sizes do not match (see archive/archive.c for further explanations). + + +FILES + +The program writes the results of the calculations to the files + + .log Log file + .par Parameter file + .dat Data file + .ms.dat Measurement data file + .rng Exported state of the random number generators + + .log~ Backup log file + .par~ Backup parameter file + .dat~ Backup data file + .ms.dat~ Backup measurement data file + .rng~ Backup random number generator state file + + n3_0 Imported configuration file written by process 0 + n3_1 Imported configuration file written by process 1 + n3_2 Imported configuration file written by process 2 + ..... ..... + + n3 Exported configuration file + +Here n3 identifies configuration number 3. The directories in which these +files are stored are the ones specified in the input file. 
+ +The directories "log_dir", "dat_dir" and "cnfg_dir" must be accessible from +process 0, while each process must be able to access the directory "loc_dir" +(unless the option -noloc is set). The "loc_dir" directory seen from different +processes may or may not be physically the same. + +Configurations are saved after the first "nth" trajectories and then after +every "dtr_cnfg" trajectories. The backup copies *.log~, *.dat~ and *.rng~ of +the *.log, *.dat and *.rng files are created each time a new configuration is +saved to disk. + +The parameter file *.par is created when a new run is started. It contains all +relevant lattice and run parameters in binary form. Continuation runs read the +file and check whether the parameter values match those read from the input +file. If a mismatch is discovered, the program is halted and an error message +is printed to the file STARTUP_ERROR in the program directory. + + +EXAMPLES + +The command + + qcd1 -i qcd1.in -c <filename>* + +starts a new run from the specified configuration which is searched for +in the "loc_dir" directory on the local disks of the machine. If instead the +run should be a continuation run, starting from the last configuration of a +previous run, the command would be + + qcd1 -i qcd1.in -c <run name>n3* -a + +In this case the *.log, *.par, *.dat and *.rng files of the previous run must +be found in the directories "log_dir" and "dat_dir", respectively. Using these +files, and the configuration name given on the command line, a number of +checks are performed to ensure that the run is indeed a continuation of the +previous one. + +In these two examples, the configuration filenames could also be <filename> and +<run name>n3 (i.e. without a "*") in which case the program assumes that the +configuration is an exported one. The configuration is then searched for in +the directory "cnfg_dir" by process 0 only. If the -c option is omitted, the +gauge field variables are set to uniformly distributed random SU(3) matrices. 
+ + +RUN DATA + +The data taken after every "dtr_log" trajectories are collected in a structure + +typedef struct +{ + int nt,iac; + double dH,avpl; +} dat_t; + +with elements + +nt trajectory number, + +dH MD hamiltonian deficit at the end of the trajectory, + +iac 0 or 1 depending on whether the trajectory was accepted + or not, + +avpl average plaquette of the current gauge field. + +The average plaquette is equal to + + plaq_wsum_dble(1)/npl, + + npl=6*(N0-1)*N1*N2*N3 for open boundary conditions, + + =6*N0*N1*N2*N3 otherwise, + +where N0=NPROC0*L0, etc., are the lattice sizes (see uflds/plaq_sum.c). In the +course of the simulation, the collected data are written in binary form to the +*.dat file in a contiguous manner and without any header data at the beginning +of the file. They are also printed to the log file together with the average +solver iteration numbers and some further information. + +A simple main program that reads and analyses the run data files is included +in the directory ../devel/nompi/main. + + +MEASUREMENT DATA + +Unless the -noms option is set, the program performs measurements of a set of +observables based on the Wilson flow after every period of "dtr_ms" MD +trajectories. No measurements are performed in the thermalization phase (i.e. +at trajectory numbers less than "nth"). + +Each time a measurement is made, the Wilson flow is integrated from flow time +0 to time "nstep"*"eps" in steps of eps using the specified integrator. After +every "dnms" integration steps, the time-slice sums of the densities of the +Wilson plaquette action, the Yang-Mills action and the topological charge are +computed (see uflds/plaq_sum.c, tcharge/ftensor.c and tcharge/tcharge.c). + +At the beginning of the measurement data file the program writes the data +contained in the header structure + +static struct +{ + int dn,nn,tmax; + double eps; +} file_head; + +where dn="dnms", nn="nstep"/"dnms" and tmax=NPROC0*L0. 
After the header data, +the data file contains a sequence of data structures + +static struct +{ + int nt; + double **Wsl,**Ysl,**Qsl; +} data; + +labeled by the molecular-dynamics trajectory number nt where the measurement +was made. In each case the time-slice sums of the densities of the Wilson +plaquette action, the Yang-Mills action and the topological charge are written +to the arrays + + Wsl[in][t] (in=0,..,nn, t=0,..,tmax-1) + Ysl[in][t] + Qsl[in][t] + +See the functions write_file_head() and write_data() in the program file +qcd1.c for the exact order in which the data are written to the output files. + + +BINARY FILE FORMAT + +The log files are ASCII files that should be readable on any machine. +Configuration files and the data files, on the other hand, are written in +binary format using the fwrite() function. Integers are written as 4 byte +signed integers and floating-point numbers according to the IEEE-754 standard +for double-precision numbers. + +In the case of the exported configurations, the *.par and the *.dat files, and +if the machine is big endian, the data are converted to little endian byte +order before they are written to disk (see archive/archive.c and the functions +write_dat(), read_dat(), write_file_head() and write_data() defined in the +qcd1.c file). + + +RANDOM NUMBER GENERATOR + +Random numbers are generated using the ranlux generator. Depending on the +context, either single- or double-precision random numbers are generated. The +initialization of the generator is as follows: + +- In the case of a new run, the program reads the parameters "level" and + "seed" from the input file and uses these to initialize the generator. + +- Continuation runs starting from an imported field configuration read + the state of the generator from the configuration files. 
+ +- Continuation runs starting from an exported field configuration do + the following: + + o If the option -norng is set, the parameters "level" and "seed" are read + from the input parameter file and the generator is initialized using + "seed"^n (bitwise exclusive or) as the seed value, where n is the number + of the last field configuration saved in the previous run. + + o Otherwise the state of the generator is read from the file .rng. + The generator is thus reset to the state it had at the end of the previous + run. Note that the process grid NPROC0x..xNPROC3 must be unchanged in this + case from one run to the next (an error occurs if it is not). + +In a sequence of continuation runs, it is therefore recommended to leave the +process grid unchanged and to make no use of the option -norng. If the process +grid is changed at some point, the next run must start from an exported field +configuration and the option -norng must be set. In all cases, the parameters +"level" and "seed" on the input parameter file may be left unchanged. + + +SAFETY MEASURES AND ERROR REPORTING + +A number of safety measures have been implemented: + +- It is not possible to overwrite an existing *.log or *.dat file; these + must first be deleted or renamed by hand if a run is to be repeated. + +- Appending a run to a previous run, but not from the last saved + configuration of that run, is not possible. + +- The accessibility of the various directories and the compatibility + of the selected options is checked at the beginning of the program. + +Any attempt to force illegal operations leads to an abnormal termination of +the program, with an informative message being written either to the *.log +file or the file STARTUP_ERROR in the program directory (if the error occurs +before the log file is opened). + +On the other hand, the following should be kept in mind: + +- Filenames may not be longer than 127 characters. 
The program + checks at an early stage whether this is the case or not. Longer + filenames can be accommodated by setting the macro NAME_SIZE in + the global.h header file to a larger value. + +- Once a run started successfully, the configurations generated + are saved unconditionally, i.e. any existing field configurations + with matching filenames are overwritten. + + +CHECKPOINTS AND EARLY TERMINATION + +The program can be stopped gracefully by touching a file in the log directory +with the same name as the log file but with extension .end instead of .log. It +may take a while until the program exits, because it will only do so at the +points where the gauge field configuration is saved to disk. + +If the machine crashes, or if the program was stopped in the way described, +the run can always be continued starting from the saved configuration and +output files. However, after a crash, the *.log and *.dat files may be +corrupted, in which case they must first be restored from the backup *.log~ +and *.dat~ files. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ym1 b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ym1 new file mode 100644 index 0000000000000000000000000000000000000000..0e409441908b4b3cbec87d80c0b31b1ba0164025 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/README.ym1 @@ -0,0 +1,439 @@ + +Main program ym1 + +SYNTAX + +ym1 -i [-noloc] [-noexp] [-rmold] [-noms] + [-c [-a [-norng]]] + + +DESCRIPTION + +This program generates an ensemble of gauge fields representative of the +(pure) SU(3) gauge theory. Exactly which theory is simulated depends on the +parameters passed to the program. Moreover, one has a choice of boundary +conditions in time (open, SF, open-SF and periodic). The simulation is +based on a version of the HMC algorithm, which can be tuned in many ways via +the input parameters. 
+ +In the course of the simulation, the average plaquette is measured +and the generated gauge field configurations are written out to files in +specified directories. Further observables, constructed using the Wilson flow, +are measured as well. + + +COMMAND-LINE OPTIONS + +The program has only few options since most of the parameters are passed +through an input file. The options are + +-i Specifies the name of the input file. The name can be + a fully qualified name or be specified relative to the + working directory. + +-noloc Normally the generated configurations are saved to the + local disks on the nodes of the machine. In addition they + are exported from process 0 using the export function (see + archive/archive.c). Initial configurations can be imported + or be read from the local disks. This option specifies that + the local disks should not be used. + +-noexp Do not export the generated field configurations. + +-rmold Remove old configurations and keep only the one which + was last saved to disk. The initial configuration + given on the command line is not removed unless the + -a option is set. + +-noms Do not measure any observables except for the average + plaquette. + +-c This option instructs the program to read the initial + gauge-field configuration from the specified file. The + file must be in one of the configuration directories + listed in the input file and its name must be of the form + described below. The run starts from a random gauge-field + configuration if this option is omitted. + +-a This option specifies that the run is a continuation of a + previous run. The -c option is required in this + case and must point to the last configuration saved by + the previous run. All output data are appended to the + previous output files. + +-norng Continuation runs normally start from the saved state + of the random number generators. 
This option specifies + that the traditional initialization of the generators is + to be used (see section RANDOM NUMBER GENERATOR below). + NOTE: starting from the saved state is not possible if + the process grid sizes NPROC0,..,NPROC3 are changed. + +The concurrent use of the options -noloc and -noexp (in which case the +generated configurations would not be saved anywhere) is considered to be an +error and is not permitted. In a sequence of continuation runs, the -noms +option must be set in either all or none of the runs. + + +INPUT PARAMETERS + +The lattice size and the process grid must be defined in the file global.h +(see README.global). All other parameters are read from the input file and the +command line. An example of a valid input file is ym1.in in this directory. +The parameter values specified in this file are: + +[Run name] +name Snoopy137 # Run name = configuration base name + +[Directories] +log_dir ../data/ym1/log # Log file directory +dat_dir ../data/ym1/dat # Data file directory +loc_dir /ndata/ym1/cnfg # Local configuration directory +cnfg_dir /data/ym1/cnfg # Exported configuration directory + +[Lattice parameters] +beta 6.00 # Inverse gauge coupling +c0 1.6667 # Coefficient of the plaquette term + # in the gauge action +[Boundary conditions] +type 2 # Type of boundary condition (0: open, + # 1: SF, 2: open-SF, 3: periodic) +phi 0.12 -0.56 # Boundary values of the gauge field at + # time 0 +phi' 0.92 0.76 # Boundary values of the gauge field at + # time NPROC0*L0 +cG 1.10 # Gauge action improvement coefficient at + # time 0 +cG' 1.05 # Gauge action improvement coefficient at + # time NPROC0*L0 + +[Random number generator] +level 0 # Ranlux level +seed 73099 # Ranlux seed + +[Trajectory length] +tau 3.0 # Molecular-dynamics (MD) trajectory length + +[MD integrator] +integrator OMF4 # LPFR: leapfrog, OMF2: 2nd order OMF, + # OMF4: 4th order OMF +lambda 0.19 # Parameter of the OMF2 integrator +nstep 16 # Number of integration steps per 
trajectory + +[MD trajectories] +nth 320 # Number of thermalization trajectories +ntr 32000 # Total number of trajectories +dtr_log 4 # Separation of log entries +dtr_ms 8 # Separation of measurements +dtr_cnfg 32 # Separation of configuration saves + +[Wilson flow] +integrator RK3 # EULER: Euler, RK2: 2nd order Runge-Kutta + # RK3: 3rd order Runge-Kutta +eps 2.0e-2 # Integration step size +nstep 100 # Total number of integration steps +dnms 10 # Number of steps between measurements + +The chosen parameter values must satisfy the following constraints: + +- "nth" and "ntr" must be integer multiples of "dtr_cnfg". + +- "nth" must be equal to zero in a continuation run (option -a). + +- "dtr_cnfg" must be a multiple of "dtr_log". + +- "dtr_cnfg" must be a multiple of "dtr_ms" and the latter must be + a multiple of "dtr_log". + +- The number "nstep" of Wilson flow integration steps must be a multiple + of "dnms". + +Depending on the specified options, the values of some parameters are ignored. +In particular, + +- "loc_dir" is not used if the -noloc option is set. + +- "cnfg_dir" is not used if -noexp is set and if the starting + configuration is not of the exported configuration type. + +- "lambda" is only required if the 2nd order OMF integrator is used. + +- The section "Wilson flow" and the parameter "dtr_ms" can be omitted + if the -noms option is set. + +Superfluous sections and parameters may be deleted or commented out. If +present they are not read by the program and have no effect on the run. In +particular, the constraints mentioned above involving these parameters need +not be satisfied. + + +INITIAL FIELD CONFIGURATION + +The initial field configuration specified on the command line with the -c +option can be in imported or exported form (see archive/archive.c). In the +case of imported configurations, each MPI process reads a file of the form + + _ + +where is the process number. 
On the command line, imported and exported +configurations are distinguished by an asterisk (*) like + + <filename>* Imported configuration + + <filename> Exported configuration + +where it goes without saying that the string <filename> must not contain an +asterisk at its end. + +Configurations in imported form are read from the directory loc_dir on the +local disks of the machine. The sizes of the current lattice and those read +from the files must be the same in this case. + +If the configuration is in exported form, it is read from the directory +cnfg_dir on a disk accessible from process 0. The sizes of the current lattice +need not be the same as those read from the configuration file, but must be +integer multiples of the latter. The field is periodically extended if the +lattice sizes do not match (see archive/archive.c for further explanations). + + +FILES + +The program stores the results in a number of files with the following file +names: + + .log Log file + .par Parameter file + .dat Data file + .ms.dat Measurement data file + .rng Exported state of the random number generators + + .log~ Backup log file + .par~ Backup parameter file + .dat~ Backup data file + .ms.dat~ Backup measurement data file + .rng~ Backup random number generator state file + + n3_0 Imported configuration file written by process 0 + n3_1 Imported configuration file written by process 1 + n3_2 Imported configuration file written by process 2 + ..... ..... + + n3 Exported configuration file + +Here n3 identifies configuration number 3. The directories in which these +files are stored are the ones specified in the input file. + +The directories "log_dir", "dat_dir" and "cnfg_dir" must be accessible from +process 0, while each process must be able to access the directory "loc_dir" +(unless the option -noloc is set). The "loc_dir" directory seen from different +processes may or may not be physically the same. + +Configurations are saved after the first "nth" trajectories and then after +every "dtr_cnfg" trajectories. 
The backup copies *.log~, *.dat~ and *.rng~ of +the *.log, *.dat and *.rng files are created each time a new configuration is +saved to disk. + +The parameter file *.par is created when a new run is started. It contains all +relevant lattice and run parameters in binary form. Continuation runs read the +file and check whether the parameter values match those read from the input +file. If a mismatch is discovered, the program is halted and an error message +is printed to the file STARTUP_ERROR in the program directory. + + +EXAMPLES + +The command + + ym1 -i ym1.in -c <name>* + +starts a new run from the specified configuration which is searched for +in the "loc_dir" directory on the local disks of the machine. If instead the +run should be a continuation run, starting from the last configuration of a +previous run, the command would be + + ym1 -i ym1.in -c <run name>n3* -a + +In this case the *.log, *.par, *.dat and *.rng files of the previous run must +be found in the directories "log_dir" and "dat_dir", respectively. Using these +files, and the configuration name given on the command line, a number of +checks are performed to ensure that the run is indeed a continuation of the +previous one. + +In these two examples, the configuration filenames could also be <name> and +<run name>n3 (i.e. without a "*") in which case the program assumes that the +configuration is an exported one. The configuration is then searched for in +the directory "cnfg_dir" by process 0 only. If the -c option is omitted, the +gauge field variables are set to uniformly distributed random SU(3) matrices. + + +RUN DATA + +The data taken after every "dtr_log" trajectories are collected in a structure + +typedef struct +{ + int nt,iac; + double dH,avpl; +} dat_t; + +with elements + +nt trajectory number, + +dH MD hamiltonian deficit at the end of the trajectory, + +iac 0 or 1 depending on whether the trajectory was accepted + or not, + +avpl average plaquette of the current gauge field. 
+ +The average plaquette is equal to + + plaq_wsum_dble(1)/npl, + + npl=6*(N0-1)*N1*N2*N3 for open boundary conditions, + + =6*N0*N1*N2*N3 otherwise, + +where N0=NPROC0*L0, etc., are the lattice sizes (see uflds/plaq_sum.c). In the +course of the simulation, the collected data are written in binary form to the +*.dat file in a contiguous manner and without any header data at the beginning +of the file. They are also printed to the log file together with the average +solver iteration numbers and some further information. + +A simple main program that reads and analyses the run data files is included +in the directory ../devel/nompi/main. + + +MEASUREMENT DATA + +Unless the -noms option is set, the program performs measurements of a set of +observables based on the Wilson flow after every period of "dtr_ms" MD +trajectories. No measurements are performed in the thermalization phase (i.e. +at trajectory numbers less than "nth"). + +Each time a measurement is made, the Wilson flow is integrated from flow time +0 to time "nstep"*"eps" in steps of eps using the specified integrator. After +every "dnms" integration steps, the time-slice sums of the densities of the +Wilson plaquette action, the Yang-Mills action and the topological charge are +computed (see uflds/plaq_sum.c, tcharge/ftensor.c and tcharge/tcharge.c). + +At the beginning of the measurement data file the program writes the data +contained in the header structure + +static struct +{ + int dn,nn,tmax; + double eps; +} file_head; + +where dn="dnms", nn="nstep"/"dnms" and tmax=NPROC0*L0. After the header data, +the data file contains a sequence of data structures + +static struct +{ + int nt; + double **Wsl,**Ysl,**Qsl; +} data; + +labeled by the molecular-dynamics trajectory number nt where the measurement +was made. 
In each case the time-slice sums of the densities of the Wilson +plaquette action, the Yang-Mills action and the topological charge are written +to the arrays + + Wsl[in][t] (in=0,..,nn, t=0,..,tmax-1) + Ysl[in][t] + Qsl[in][t] + +See the functions write_file_head() and write_data() in the program file +ym1.c for the exact order in which the data are written to the output files. + + +BINARY FILE FORMAT + +The *.log files are ASCII files that should be readable on any machine. +Configuration files and the *.dat files, on the other hand, are written in +binary format using the fwrite() function. Integers are written as 4 byte +signed integers and floating-point numbers according to the IEEE-754 standard +for double-precision numbers. + +In the case of the exported configurations, the *.par and the *.dat files, and +if the machine is big endian, the data are converted to little endian byte +order before they are written to disk (see archive/archive.c and the functions +write_dat(), read_dat(), write_file_head() and write_data() defined in the +ym1.c file). + + +RANDOM NUMBER GENERATOR + +Random numbers are generated using the ranlux generator. Depending on the +context, either single- or double-precision random numbers are generated. The +initialization of the generator is as follows: + +- In the case of a new run, the program reads the parameters "level" and + "seed" from the input file and uses these to initialize the generator. + +- Continuation runs starting from an imported field configuration read + the state of the generator from the configuration files. + +- Continuation runs starting from an exported field configuration do + the following: + + o If the option -norng is set, the parameters "level" and "seed" are read + from the input parameter file and the generator is initialized using + "seed"^n (bitwise exclusive or) as the seed value, where n is the number + of the last field configuration saved in the previous run. 
+ + o Otherwise the state of the generator is read from the file <run name>.rng. + The generator is thus reset to the state it had at the end of the previous + run. Note that the process grid NPROC0x..xNPROC3 must be unchanged in this + case from one run to the next (an error occurs if it is not). + +In a sequence of continuation runs, it is therefore recommended to leave the +process grid unchanged and to make no use of the option -norng. If the process +grid is changed at some point, the next run must start from an exported field +configuration and the option -norng must be set. In all cases, the parameters +"level" and "seed" on the input parameter file may be left unchanged. + + +SAFETY MEASURES AND ERROR REPORTING + +A number of safety measures have been implemented: + +- It is not possible to overwrite an existing *.log or *.dat file; these + must first be deleted or renamed by hand if a run is to be repeated. + +- Appending a run to a previous run, but not from the last saved + configuration of that run, is not possible. + +- The accessibility of the various directories and the compatibility + of the selected options are checked at the beginning of the program. + +Any attempt to force illegal operations leads to an abnormal termination of +the program, with an informative message being written either to the *.log +file or the file STARTUP_ERROR in the program directory (if the error occurs +before the log file is opened). + +On the other hand, the following should be kept in mind: + +- Filenames may not be longer than 127 characters. The program + checks at an early stage whether this is the case or not. Longer + filenames can be accommodated by setting the macro NAME_SIZE in + the global.h header file to a larger value. + +- Once a run has started successfully, the configurations generated + are saved unconditionally, i.e. any existing field configurations + with matching filenames are overwritten. 
+ + +CHECKPOINTS AND EARLY TERMINATION + +The program can be stopped gracefully by touching a file in the log directory +with the same name as the log file but with extension .end instead of .log. It +may take a while until the program exits, because it will only do so at the +points where the gauge field configuration is saved to disk. + +If the machine crashes, or if the program was stopped in the way described, +the run can always be continued starting from the saved configuration and +output files. However, after a crash, the *.log and *.dat files may be +corrupted, in which case they must first be restored from the backup *.log~ +and *.dat~ files. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/README new file mode 100644 index 0000000000000000000000000000000000000000..d291cd7673bcb616900db0b64f8c9251a2d9ca95 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/README @@ -0,0 +1,11 @@ + +******************************************************************************** + + Examples of input parameter files + +******************************************************************************** + +Some of the input parameter files included in these directories have been used +in actual simulation and measurement runs. Note, however, that simulations +require thermalization in the course of which the parameters may have to be +chosen differently. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v1.ms1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v1.ms1.in new file mode 100644 index 0000000000000000000000000000000000000000..ad826547dce830260280ecb6aaaef9f3f0cd3432 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v1.ms1.in @@ -0,0 +1,69 @@ + +################################################################################ +# +# Two-flavour QCD with open boundary conditions, twisted-mass reweighting of +# the first kind and decomposition of the reweighting factor in 2 factors. +# +################################################################################ + +[Run name] +name 48x24v1 + +[Directories] +log_dir /data/openQCD/ms1/log +dat_dir /data/openQCD/ms1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Configurations] +first 100 +last 150 +step 1 +nrw 1 + +[Random number generator] +level 0 +seed 79232 + +[Lattice parameters] +kappa 0.13625 +csw 1.90952 + +[Boundary conditions] +type 0 +cF 1.0 + +[Reweighting factor 0] +rwfact RWTM1 +im0 0 +mu 0.001 0.003 +isp 0 +nsrc 24 + +[Solver 0] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[SAP] +bs 4 6 6 4 + +[Deflation subspace] +bs 4 6 6 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13635 +mu 0.001 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 16 +nmx 128 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v2.ms1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v2.ms1.in new file mode 100644 index 0000000000000000000000000000000000000000..e1f99b2d1388ba68c569e6171ec3652c39122def --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v2.ms1.in @@ -0,0 +1,68 @@ + 
+################################################################################ +# +# Two-flavour QCD with periodic boundary conditions, twisted-mass reweighting +# of the second kind and decomposition of the reweighting factor in 3 factors. +# +################################################################################ + +[Run name] +name 48x24v2 + +[Directories] +log_dir /data/openQCD/ms1/log +dat_dir /data/openQCD/ms1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Configurations] +first 10 +last 90 +step 2 +nrw 1 + +[Random number generator] +level 0 +seed 78711 + +[Lattice parameters] +kappa 0.13635 +csw 1.90952 + +[Boundary conditions] +type 3 + +[Reweighting factor 0] +rwfact RWTM2 +im0 0 +mu 0.0005 0.001 0.003 +isp 0 +nsrc 32 + +[Solver 0] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[SAP] +bs 4 6 6 4 + +[Deflation subspace] +bs 4 6 6 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13635 +mu 0.001 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 16 +nmx 128 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v3.ms1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v3.ms1.in new file mode 100644 index 0000000000000000000000000000000000000000..817d57db1e035f9a0db4773723d40f8838c90fd9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/48x24v3.ms1.in @@ -0,0 +1,72 @@ + +################################################################################ +# +# Two-flavour QCD with SF boundary conditions, twisted-mass reweighting of +# the second kind, even-odd preconditioning and no decomposition of the +# reweighting factor. 
+# +################################################################################ + +[Run name] +name 48x24v3 + +[Directories] +log_dir /data/openQCD/ms1/log +dat_dir /data/openQCD/ms1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Configurations] +first 126 +last 225 +step 1 +nrw 1 + +[Random number generator] +level 0 +seed 887056 + +[Lattice parameters] +kappa 0.13635 +csw 1.90952 + +[Boundary conditions] +type 1 +phi 0.5 -0.25 +phi' 0.0 0.0 +cF 1.0 + +[Reweighting factor 0] +rwfact RWTM2_EO +im0 0 +mu 0.0045 +isp 0 +nsrc 24 + +[Solver 0] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[SAP] +bs 4 6 6 4 + +[Deflation subspace] +bs 4 6 6 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13635 +mu 0.001 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 16 +nmx 128 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/64x32v1.ms1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/64x32v1.ms1.in new file mode 100644 index 0000000000000000000000000000000000000000..8b127a4868ae2df35e3aa2318313119ccfd55ff8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/64x32v1.ms1.in @@ -0,0 +1,91 @@ + +################################################################################ +# +# 2+1 flavour QCD, mixed boundary conditions, second kind of light-quark +# twisted-mass reweighting, even-odd preconditioning and decomposition of the +# light-quark reweighting factor in 2 factors. 
+# +################################################################################ + +[Run name] +name 64x32v1 + +[Directories] +log_dir /data/openQCD/ms1/log +dat_dir /data/openQCD/ms1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Configurations] +first 30 +last 100 +step 1 +nrw 2 + +[Random number generator] +level 0 +seed 126819 + +[Lattice parameters] +kappa 0.13774 0.1366 +csw 1.715 + +[Boundary conditions] +type 2 +phi' 0.0 0.0 +cG 1.0 +cG' 1.0 +cF 1.0 +cF' 1.0 + +[Reweighting factor 0] +rwfact RWTM2_EO +im0 0 +mu 0.001 0.002 +isp 0 +nsrc 24 + +[Reweighting factor 1] +rwfact RWRAT +im0 1 +irp 0 +np 6 3 +isp 1 0 +nsrc 1 + +[Rational 0] +degree 9 +range 0.03 6.1 + +[Solver 0] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 256 +res 1.0e-11 + +[Solver 1] +solver MSCG +nmx 2048 +res 1.0e-11 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13774 +mu 0.005 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 24 +nmx 128 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/64x32v2.ms1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/64x32v2.ms1.in new file mode 100644 index 0000000000000000000000000000000000000000..d127eddc7073ea5482ceeebd5eb251d2a97dd586 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/64x32v2.ms1.in @@ -0,0 +1,88 @@ + +################################################################################ +# +# 2+1 flavour QCD "at the physical point", open boundary conditions, even-odd +# preconditioning, light-quark twisted-mass reweighting of the second kind and +# decomposition of the reweighting factor in 2 factors. 
+# +################################################################################ + +[Run name] +name 64x32v2 + +[Directories] +log_dir /data/openQCD/ms1/log +dat_dir /data/openQCD/ms1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Configurations] +first 35 +last 52 +step 1 +nrw 2 + +[Random number generator] +level 0 +seed 193392 + +[Lattice parameters] +kappa 0.137796 0.136634 +csw 1.715000 + +[Boundary conditions] +type 0 +cG 1.0 +cF 1.0 + +[Reweighting factor 0] +rwfact RWTM2_EO +im0 0 +mu 0.0005 0.0012 +isp 0 +nsrc 24 + +[Reweighting factor 1] +rwfact RWRAT +im0 1 +irp 0 +np 6 3 +isp 1 0 +nsrc 4 + +[Rational 0] +degree 9 +range 0.030 6.10 + +[Solver 0] +solver DFL_SAP_GCR +nkv 32 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[Solver 1] +solver MSCG +nmx 2048 +res 1.0e-11 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13770 +mu 0.001 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 24 +nmx 128 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..7c3ec962ba381fd43fe264869219fd49ad253909 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/ms1/INDEX @@ -0,0 +1,10 @@ + +******************************************************************************** + + Input parameter files for the program ms1 + +******************************************************************************** + +The file names correspond to those of the simulation input parameter files in +the directory examples/qcd1. 
+ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v1.in new file mode 100644 index 0000000000000000000000000000000000000000..8a9a0fdee7a72687c598d567f1096502beaee26e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v1.in @@ -0,0 +1,166 @@ + +################################################################################ +# +# Two-flavour QCD with Wilson plaquette action, open boundary conditions and +# twisted-mass reweighting of the first kind. +# +################################################################################ + +[Run name] +name 48x24v1 + +[Directories] +log_dir /data/openQCD/qcd1/log +dat_dir /data/openQCD/qcd1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Lattice parameters] +beta 5.3 +c0 1.0 +kappa 0.13625 +csw 1.90952 + +[Boundary conditions] +type 0 +cG 1.0 +cF 1.0 + +[Random number generator] +level 0 +seed 787412 + +[HMC parameters] +actions 0 1 2 3 4 +npf 4 +mu 0.003 0.01 0.1 1.0 +nlv 2 +tau 2.0 + +[MD trajectories] +nth 0 +ntr 5000 +dtr_log 1 +dtr_ms 8 +dtr_cnfg 8 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF4 +nstep 10 +forces 1 2 3 4 + +[Action 0] +action ACG + +[Action 1] +action ACF_TM1 +ipf 0 +im0 0 +imu 3 +isp 0 + +[Action 2] +action ACF_TM2 +ipf 1 +im0 0 +imu 2 3 +isp 1 0 + +[Action 3] +action ACF_TM2 +ipf 2 +im0 0 +imu 1 2 +isp 1 1 + +[Action 4] +action ACF_TM2 +ipf 3 +im0 0 +imu 0 1 +isp 1 1 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1 +isp 2 +ncr 3 + +[Force 2] +force FRF_TM2 +isp 3 +ncr 3 + +[Force 3] +force FRF_TM2 +isp 3 +ncr 3 + +[Force 4] +force FRF_TM2 +isp 3 +ncr 3 + +[Solver 0] +solver CGNE +nmx 512 +res 1.0e-11 + +[Solver 1] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[Solver 2] +solver CGNE +nmx 512 +res 1.0e-10 + +[Solver 3] 
+solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-10 + +[SAP] +bs 4 6 6 4 + +[Deflation subspace] +bs 4 6 6 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13635 +mu 0.001 +ninv 9 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 16 +nmx 128 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.09 +nsm 1 + +[Wilson flow] +integrator RK3 +eps 2.0e-2 +nstep 400 +dnms 2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v2.in new file mode 100644 index 0000000000000000000000000000000000000000..e75ade620e243b2ba2fe1855ae5149605c72187d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v2.in @@ -0,0 +1,181 @@ + +################################################################################ +# +# Two-flavour QCD with Wilson plaquette action, periodic boundary conditions +# and twisted-mass reweighting of the second kind. 
+# +################################################################################ + +[Run name] +name 48x24v2 + +[Directories] +log_dir /data/openQCD/qcd1/log +dat_dir /data/openQCD/qcd1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Lattice parameters] +beta 5.3 +c0 1.0 +kappa 0.13635 +csw 1.90952 + +[Boundary conditions] +type 3 + +[Random number generator] +level 0 +seed 807721 + +[HMC parameters] +actions 0 1 2 3 4 5 +npf 5 +mu 3.0e-3 4.2426406871192851e-3 0.01 0.1 1.0 +nlv 3 +tau 2.0 + +[MD trajectories] +nth 0 +ntr 5000 +dtr_log 1 +dtr_ms 8 +dtr_cnfg 8 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF4 +nstep 1 +forces 1 2 3 4 + +[Level 2] +integrator LPFR +nstep 10 +forces 5 + +[Action 0] +action ACG + +[Action 1] +action ACF_TM1 +ipf 0 +im0 0 +imu 4 +isp 0 + +[Action 2] +action ACF_TM2 +ipf 1 +im0 0 +imu 3 4 +isp 1 0 + +[Action 3] +action ACF_TM2 +ipf 2 +im0 0 +imu 2 3 +isp 1 1 + +[Action 4] +action ACF_TM2 +ipf 3 +im0 0 +imu 0 2 +isp 1 1 + +[Action 5] +action ACF_TM2 +ipf 4 +im0 0 +imu 0 1 +isp 1 1 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1 +isp 2 +ncr 3 + +[Force 2] +force FRF_TM2 +isp 3 +ncr 3 + +[Force 3] +force FRF_TM2 +isp 3 +ncr 3 + +[Force 4] +force FRF_TM2 +isp 3 +ncr 3 + +[Force 5] +force FRF_TM2 +isp 3 +ncr 1 + +[Solver 0] +solver CGNE +nmx 512 +res 1.0e-11 + +[Solver 1] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[Solver 2] +solver CGNE +nmx 512 +res 1.0e-10 + +[Solver 3] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-10 + +[SAP] +bs 4 6 6 4 + +[Deflation subspace] +bs 4 6 6 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13635 +mu 0.001 +ninv 9 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 16 +nmx 128 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.09 +nsm 1 + +[Wilson flow] +integrator RK3 +eps 2.0e-2 +nstep 400 +dnms 2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v3.in 
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v3.in new file mode 100644 index 0000000000000000000000000000000000000000..db211390ca2e65188b599670161c3e0260cbac99 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/48x24v3.in @@ -0,0 +1,185 @@ + +################################################################################ +# +# Two-flavour QCD with Wilson plaquette action, SF boundary conditions, +# twisted-mass reweighting of the second kind and even-odd preconditioning. +# +################################################################################ + +[Run name] +name 48x24v3 + +[Directories] +log_dir /data/openQCD/qcd1/log +dat_dir /data/openQCD/qcd1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Lattice parameters] +beta 5.3 +c0 1.0 +kappa 0.13635 +csw 1.90952 + +[Boundary conditions] +type 1 +phi 0.5 -0.25 +phi' 0.0 0.0 +cG 1.0 +cF 1.0 + +[Random number generator] +level 0 +seed 695959 + +[HMC parameters] +actions 0 1 2 3 4 5 +npf 5 +mu 4.5e-3 6.363961030678928e-3 0.01 0.1 1.0 +nlv 3 +tau 2.0 + +[MD trajectories] +nth 0 +ntr 5000 +dtr_log 1 +dtr_ms 8 +dtr_cnfg 8 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF4 +nstep 1 +forces 1 2 3 4 + +[Level 2] +integrator LPFR +nstep 10 +forces 5 + +[Action 0] +action ACG + +[Action 1] +action ACF_TM1_EO_SDET +ipf 0 +im0 0 +imu 4 +isp 0 + +[Action 2] +action ACF_TM2_EO +ipf 1 +im0 0 +imu 3 4 +isp 1 0 + +[Action 3] +action ACF_TM2_EO +ipf 2 +im0 0 +imu 2 3 +isp 1 1 + +[Action 4] +action ACF_TM2_EO +ipf 3 +im0 0 +imu 0 2 +isp 1 1 + +[Action 5] +action ACF_TM2_EO +ipf 4 +im0 0 +imu 0 1 +isp 1 1 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1_EO_SDET +isp 2 +ncr 3 + +[Force 2] +force FRF_TM2_EO +isp 3 +ncr 3 + +[Force 3] +force FRF_TM2_EO +isp 3 +ncr 3 + +[Force 4] +force FRF_TM2_EO +isp 3 +ncr 3 + +[Force 5] +force FRF_TM2_EO +isp 3 +ncr 1 + +[Solver 0] +solver CGNE +nmx 512 
+res 1.0e-11 + +[Solver 1] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[Solver 2] +solver CGNE +nmx 512 +res 1.0e-10 + +[Solver 3] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-10 + +[SAP] +bs 4 6 6 4 + +[Deflation subspace] +bs 4 6 6 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13635 +mu 0.001 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 16 +nmx 128 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.09 +nsm 1 + +[Wilson flow] +integrator RK3 +eps 2.0e-2 +nstep 400 +dnms 2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/64x32v1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/64x32v1.in new file mode 100644 index 0000000000000000000000000000000000000000..5c3a555e3c82847364c46409676d54477303f6a9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/64x32v1.in @@ -0,0 +1,233 @@ + +################################################################################ +# +# 2+1 flavour QCD with Iwasaki action, mixed boundary conditions, second kind +# of light-quark twisted-mass reweighting and even-odd preconditioning. 
+# +################################################################################ + +[Run name] +name 64x32v1 + +[Directories] +log_dir /data/openQCD/qcd1/log +dat_dir /data/openQCD/qcd1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Lattice parameters] +beta 1.9 +c0 3.648 +kappa 0.13774 0.1366 +csw 1.715 + +[Boundary conditions] +type 2 +phi' 0.0 0.0 +cG 1.0 +cG' 1.0 +cF 1.0 +cF' 1.0 + +[Random number generator] +level 0 +seed 8641 + +[HMC parameters] +actions 0 1 2 3 4 5 6 7 8 +npf 8 +mu 0.002 0.002828427124746190 0.05 0.5 +nlv 3 +tau 1.2 + +[MD trajectories] +nth 0 +ntr 2400 +dtr_log 1 +dtr_ms 8 +dtr_cnfg 8 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF4 +nstep 1 +forces 1 2 3 5 6 + +[Level 2] +integrator OMF2 +lambda 0.1666667 +nstep 4 +forces 4 7 8 + +[Rational 0] +degree 9 +range 0.03 6.1 + +[Action 0] +action ACG + +[Action 1] +action ACF_TM1_EO_SDET +ipf 0 +im0 0 +imu 3 +isp 0 + +[Action 2] +action ACF_TM2_EO +ipf 1 +im0 0 +imu 2 3 +isp 1 0 + +[Action 3] +action ACF_TM2_EO +ipf 2 +im0 0 +imu 0 2 +isp 1 1 + +[Action 4] +action ACF_TM2_EO +ipf 3 +im0 0 +imu 0 1 +isp 1 1 + +[Action 5] +action ACF_RAT_SDET +ipf 4 +im0 1 +irat 0 0 5 +isp 4 + +[Action 6] +action ACF_RAT +ipf 5 +im0 1 +irat 0 6 6 +isp 1 + +[Action 7] +action ACF_RAT +ipf 6 +im0 1 +irat 0 7 7 +isp 1 + +[Action 8] +action ACF_RAT +ipf 7 +im0 1 +irat 0 8 8 +isp 1 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1_EO_SDET +isp 2 +ncr 4 + +[Force 2] +force FRF_TM2_EO +isp 3 +ncr 3 + +[Force 3] +force FRF_TM2_EO +isp 3 +ncr 3 + +[Force 4] +force FRF_TM2_EO +isp 3 +ncr 1 + +[Force 5] +force FRF_RAT_SDET +isp 5 + +[Force 6] +force FRF_RAT +isp 3 + +[Force 7] +force FRF_RAT +isp 3 + +[Force 8] +force FRF_RAT +isp 3 + +[Solver 0] +solver CGNE +nmx 1024 +res 1.0e-11 + +[Solver 1] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[Solver 2] +solver CGNE +nmx 1024 +res 1.0e-10 + +[Solver 3] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 
4 +ncy 5 +nmx 128 +res 1.0e-10 + +[Solver 4] +solver MSCG +nmx 1024 +res 1.e-11 + +[Solver 5] +solver MSCG +nmx 1024 +res 1.e-10 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13774 +mu 0.001 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 24 +nmx 128 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.05 +nsm 1 + +[Wilson flow] +integrator RK3 +eps 1.0e-2 +nstep 600 +dnms 10 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/64x32v2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/64x32v2.in new file mode 100644 index 0000000000000000000000000000000000000000..1be8ea98b2d82d47a8352a19a26f2eb842e61b40 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/64x32v2.in @@ -0,0 +1,231 @@ + +################################################################################ +# +# 2+1 flavour QCD "at the physical point" with Iwasaki action, open boundary +# conditions, second kind of light-quark twisted-mass reweighting and even-odd +# preconditioning. 
+# +################################################################################ + +[Run name] +name 64x32v2 + +[Directories] +log_dir /data/openQCD/qcd1/log +dat_dir /data/openQCD/qcd1/dat +loc_dir /ndata/openQCD/cnfg +cnfg_dir /data/openQCD/cnfg + +[Lattice parameters] +beta 1.9 +c0 3.648 +kappa 0.137796 0.136634 +csw 1.715 + +[Boundary conditions] +type 0 +cG 1.0 +cF 1.0 + +[Random number generator] +level 0 +seed 1026 + +[HMC parameters] +actions 0 1 2 3 4 5 6 7 8 +npf 8 +mu 0.0012 0.001697056274847714 0.05 0.5 +nlv 3 +tau 1.1 + +[MD trajectories] +nth 0 +ntr 8000 +dtr_log 1 +dtr_ms 4 +dtr_cnfg 4 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF4 +nstep 1 +forces 1 2 3 5 6 + +[Level 2] +integrator OMF2 +lambda 0.1666667 +nstep 6 +forces 4 7 8 + +[Rational 0] +degree 9 +range 0.03 6.1 + +[Action 0] +action ACG + +[Action 1] +action ACF_TM1_EO_SDET +ipf 0 +im0 0 +imu 3 +isp 0 + +[Action 2] +action ACF_TM2_EO +ipf 1 +im0 0 +imu 2 3 +isp 1 0 + +[Action 3] +action ACF_TM2_EO +ipf 2 +im0 0 +imu 0 2 +isp 1 1 + +[Action 4] +action ACF_TM2_EO +ipf 3 +im0 0 +imu 0 1 +isp 1 1 + +[Action 5] +action ACF_RAT_SDET +ipf 4 +im0 1 +irat 0 0 5 +isp 4 + +[Action 6] +action ACF_RAT +ipf 5 +im0 1 +irat 0 6 6 +isp 1 + +[Action 7] +action ACF_RAT +ipf 6 +im0 1 +irat 0 7 7 +isp 1 + +[Action 8] +action ACF_RAT +ipf 7 +im0 1 +irat 0 8 8 +isp 1 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1_EO_SDET +isp 2 +ncr 4 + +[Force 2] +force FRF_TM2_EO +isp 3 +ncr 3 + +[Force 3] +force FRF_TM2_EO +isp 3 +ncr 3 + +[Force 4] +force FRF_TM2_EO +isp 3 +ncr 1 + +[Force 5] +force FRF_RAT_SDET +isp 5 + +[Force 6] +force FRF_RAT +isp 3 + +[Force 7] +force FRF_RAT +isp 3 + +[Force 8] +force FRF_RAT +isp 3 + +[Solver 0] +solver CGNE +nmx 1024 +res 1.0e-11 + +[Solver 1] +solver DFL_SAP_GCR +nkv 24 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-11 + +[Solver 2] +solver CGNE +nmx 1024 +res 1.0e-10 + +[Solver 3] +solver DFL_SAP_GCR +nkv 24 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 
1.0e-10 + +[Solver 4] +solver MSCG +nmx 1024 +res 1.e-11 + +[Solver 5] +solver MSCG +nmx 1024 +res 1.e-10 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13770 +mu 0.001 +ninv 9 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 24 +nmx 128 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.037 +nsm 1 + +[Wilson flow] +integrator RK3 +eps 1.0e-2 +nstep 600 +dnms 10 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..6f5cdf02a669c0a9e3c820232cff45fd19dfd37e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/examples/qcd1/INDEX @@ -0,0 +1,27 @@ + +******************************************************************************** + + Input parameter files for the program qcd1 + +******************************************************************************** + +48x24v1.in Two-flavour QCD with Wilson plaquette action, open + boundary conditions and twisted-mass reweighting of + the first kind. + +48x24v2.in Two-flavour QCD with Wilson plaquette action, periodic + boundary conditions and twisted-mass reweighting of the + second kind. + +48x24v3.in Two-flavour QCD with Wilson plaquette action, SF + boundary conditions, twisted-mass reweighting of the + second kind and even-odd preconditioning. + +64x32v1.in 2+1 flavour QCD with Iwasaki action, mixed boundary + conditions, second kind of light-quark twisted-mass + reweighting and even-odd preconditioning. + +64x32v2.in 2+1 flavour QCD "at the physical point" with Iwasaki + action, open boundary conditions, second kind of + light-quark twisted-mass reweighting and even-odd + preconditioning. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms1.c new file mode 100644 index 0000000000000000000000000000000000000000..8b85a5457df2400d1c73f18aa9190ce8d7fa3665 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms1.c @@ -0,0 +1,1482 @@ + +/******************************************************************************* +* +* File ms1.c +* +* Copyright (C) 2012-2014 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Stochastic estimation of reweighting factors. +* +* Syntax: ms1 -i [-noexp] [-a [-norng]] +* +* For usage instructions see the file README.ms1. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "dfl.h" +#include "update.h" +#include "version.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +#define MAX(n,m) \ + if ((n)<(m)) \ + (n)=(m) + +static struct +{ + int nrw; + int *nfct,*nsrc; +} file_head; + +static struct +{ + int nc; + double ***sqn,***lnr; +} data; + +static int my_rank,noexp,append,norng,endian; +static int first,last,step,level,seed; +static int ipgrd[2],**rwstat=NULL,*rlxs_state=NULL,*rlxd_state=NULL; + +static char line[NAME_SIZE]; +static char log_dir[NAME_SIZE],dat_dir[NAME_SIZE]; +static char loc_dir[NAME_SIZE],cnfg_dir[NAME_SIZE]; +static char log_file[NAME_SIZE],log_save[NAME_SIZE],end_file[NAME_SIZE]; +static char par_file[NAME_SIZE],par_save[NAME_SIZE]; +static char dat_file[NAME_SIZE],dat_save[NAME_SIZE]; +static char rng_file[NAME_SIZE],rng_save[NAME_SIZE]; +static char 
cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; +static FILE *fin=NULL,*flog=NULL,*fdat=NULL,*fend=NULL; + +static lat_parms_t lat; +static bc_parms_t bcp; + + +static void alloc_data(void) +{ + int nrw,*nfct,*nsrc; + int i,irw,ifct,n1,n2,n3; + double ***ppp,**pp,*p; + + nrw=file_head.nrw; + nfct=file_head.nfct; + nsrc=file_head.nsrc; + n1=nrw; + n2=0; + n3=0; + + for (irw=0;irw=NAME_SIZE, + 1,"setup_files [ms1.c]","loc_dir name is too long"); + else + error_root(name_size("%s/%sn%d",cnfg_dir,nbase,last)>=NAME_SIZE, + 1,"setup_files [ms1.c]","cnfg_dir name is too long"); + + check_dir_root(log_dir); + check_dir_root(dat_dir); + error_root(name_size("%s/%s.ms1.log~",log_dir,nbase)>=NAME_SIZE, + 1,"setup_files [ms1.c]","log_dir name is too long"); + error_root(name_size("%s/%s.ms1.dat~",dat_dir,nbase)>=NAME_SIZE, + 1,"setup_files [ms1.c]","dat_dir name is too long"); + + sprintf(log_file,"%s/%s.ms1.log",log_dir,nbase); + sprintf(par_file,"%s/%s.ms1.par",dat_dir,nbase); + sprintf(dat_file,"%s/%s.ms1.dat",dat_dir,nbase); + sprintf(rng_file,"%s/%s.ms1.rng",dat_dir,nbase); + sprintf(end_file,"%s/%s.ms1.end",log_dir,nbase); + sprintf(log_save,"%s~",log_file); + sprintf(par_save,"%s~",par_file); + sprintf(dat_save,"%s~",dat_file); + sprintf(rng_save,"%s~",rng_file); +} + + +static void read_lat_parms(void) +{ + int nk; + double csw,*kappa; + + if (my_rank==0) + { + find_section("Lattice parameters"); + nk=count_tokens("kappa"); + error_root(nk<1,1,"read_lat_parms [ms1.c]", + "Missing hopping parameter values"); + read_line("csw","%lf",&csw); + } + + MPI_Bcast(&nk,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + kappa=malloc(nk*sizeof(*kappa)); + error(kappa==NULL,1,"read_lat_parms [check2.c]", + "Unable to allocate parameter array"); + if (my_rank==0) + read_dprms("kappa",nk,kappa); + MPI_Bcast(kappa,nk,MPI_DOUBLE,0,MPI_COMM_WORLD); + + lat=set_lat_parms(0.0,1.0,nk,kappa,csw); + free(kappa); + + if (append) + check_lat_parms(fdat); + else + 
write_lat_parms(fdat); +} + + +static void read_bc_parms(void) +{ + int bc; + double cF,cF_prime; + double phi[2],phi_prime[2]; + + if (my_rank==0) + { + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + read_line("cF","%lf",&cF); + + if (bc==2) + read_line("cF'","%lf",&cF_prime); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + bcp=set_bc_parms(bc,1.0,1.0,cF,cF_prime,phi,phi_prime); + + if (append) + check_bc_parms(fdat); + else + write_bc_parms(fdat); +} + + +static void read_rw_factors(void) +{ + int nrw,*nfct,*nsrc,irw,irp; + rw_parms_t rwp; + rat_parms_t rp; + + nrw=file_head.nrw; + nfct=file_head.nfct; + nsrc=file_head.nsrc; + + for (irw=0;irw [-noexp] [-a [-norng]]"); + + error_root(endian==UNKNOWN_ENDIAN,1,"read_infile [ms1.c]", + "Machine has unknown endianness"); + + noexp=find_opt(argc,argv,"-noexp"); + append=find_opt(argc,argv,"-a"); + norng=find_opt(argc,argv,"-norng"); + + fin=freopen(argv[ifile+1],"r",stdin); + error_root(fin==NULL,1,"read_infile [ms1.c]", + "Unable to open input file"); + } + + MPI_Bcast(&endian,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noexp,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&append,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&norng,1,MPI_INT,0,MPI_COMM_WORLD); + + read_dirs(); + setup_files(); + + if (my_rank==0) + { + if (append) + fdat=fopen(par_file,"rb"); + else + fdat=fopen(par_file,"wb"); + + error_root(fdat==NULL,1,"read_infile [ms1.c]", + "Unable to open parameter file"); + } + + if (my_rank==0) + { + find_section("Random number generator"); + read_line("level","%d",&level); + 
read_line("seed","%d",&seed); + } + + MPI_Bcast(&level,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&seed,1,MPI_INT,0,MPI_COMM_WORLD); + + read_lat_parms(); + read_bc_parms(); + read_rw_factors(); + read_solvers(); + + if (my_rank==0) + { + fclose(fin); + fclose(fdat); + + if (append==0) + copy_file(par_file,par_save); + } +} + + +static void check_old_log(int *fst,int *lst,int *stp) +{ + int ie,ic,isv; + int fc,lc,dc,pc; + int np[4],bp[4]; + + fend=fopen(log_file,"r"); + error_root(fend==NULL,1,"check_old_log [ms1.c]", + "Unable to open log file"); + + fc=0; + lc=0; + dc=0; + pc=0; + + ie=0x0; + ic=0; + isv=0; + + while (fgets(line,NAME_SIZE,fend)!=NULL) + { + if (strstr(line,"process grid")!=NULL) + { + if (sscanf(line,"%dx%dx%dx%d process grid, %dx%dx%dx%d", + np,np+1,np+2,np+3,bp,bp+1,bp+2,bp+3)==8) + { + ipgrd[0]=((np[0]!=NPROC0)||(np[1]!=NPROC1)|| + (np[2]!=NPROC2)||(np[3]!=NPROC3)); + ipgrd[1]=((bp[0]!=NPROC0_BLK)||(bp[1]!=NPROC1_BLK)|| + (bp[2]!=NPROC2_BLK)||(bp[3]!=NPROC3_BLK)); + } + else + ie|=0x1; + } + else if (strstr(line,"fully processed")!=NULL) + { + pc=lc; + + if (sscanf(line,"Configuration no %d",&lc)==1) + { + ic+=1; + isv=1; + } + else + ie|=0x1; + + if (ic==1) + fc=lc; + else if (ic==2) + dc=lc-fc; + else if ((ic>2)&&(lc!=(pc+dc))) + ie|=0x2; + } + else if (strstr(line,"Configuration no")!=NULL) + isv=0; + } + + fclose(fend); + + error_root((ie&0x1)!=0x0,1,"check_old_log [ms1.c]", + "Incorrect read count"); + error_root((ie&0x2)!=0x0,1,"check_old_log [ms1.c]", + "Configuration numbers are not equally spaced"); + error_root(isv==0,1,"check_old_log [ms1.c]", + "Log file extends beyond the last configuration save"); + + (*fst)=fc; + (*lst)=lc; + (*stp)=dc; +} + + +static void check_old_dat(int fst,int lst,int stp) +{ + int ie,ic; + int fc,lc,dc,pc; + + fdat=fopen(dat_file,"rb"); + error_root(fdat==NULL,1,"check_old_dat [ms1.c]", + "Unable to open data file"); + + check_file_head(); + + fc=0; + lc=0; + dc=0; + pc=0; + + ie=0x0; + ic=0; + + while 
(read_data()==1) + { + pc=lc; + lc=data.nc; + ic+=1; + + if (ic==1) + fc=lc; + else if (ic==2) + dc=lc-fc; + else if ((ic>2)&&(lc!=(pc+dc))) + ie|=0x1; + } + + fclose(fdat); + + error_root(ic==0,1,"check_old_dat [ms1.c]", + "No data records found"); + error_root((ie&0x1)!=0x0,1,"check_old_dat [ms1.c]", + "Configuration numbers are not equally spaced"); + error_root((fst!=fc)||(lst!=lc)||(stp!=dc),1,"check_old_dat [ms1.c]", + "Configuration range is not as reported in the log file"); +} + + +static void check_files(void) +{ + int fst,lst,stp; + + ipgrd[0]=0; + ipgrd[1]=0; + + if (my_rank==0) + { + if (append) + { + check_old_log(&fst,&lst,&stp); + check_old_dat(fst,lst,stp); + + error_root((fst!=lst)&&(stp!=step),1,"check_files [ms1.c]", + "Continuation run:\n" + "Previous run had a different configuration separation"); + error_root(first!=lst+step,1,"check_files [ms1.c]", + "Continuation run:\n" + "Configuration range does not continue the previous one"); + } + else + { + fin=fopen(log_file,"r"); + fdat=fopen(dat_file,"rb"); + + error_root((fin!=NULL)||(fdat!=NULL),1,"check_files [ms1.c]", + "Attempt to overwrite old *.log or *.dat file"); + + fdat=fopen(dat_file,"wb"); + error_root(fdat==NULL,1,"check_files [ms1.c]", + "Unable to open data file"); + write_file_head(); + fclose(fdat); + } + } +} + + +static void print_info(void) +{ + int isap,idfl,ik,n[3]; + long ip; + + if (my_rank==0) + { + ip=ftell(flog); + fclose(flog); + + if (ip==0L) + remove("STARTUP_ERROR"); + + if (append) + flog=freopen(log_file,"a",stdout); + else + flog=freopen(log_file,"w",stdout); + + error_root(flog==NULL,1,"print_info [ms1.c]","Unable to open log file"); + printf("\n"); + + if (append) + printf("Continuation run\n\n"); + else + { + printf("Measurement of reweighting factors\n"); + printf("----------------------------------\n\n"); + } + + printf("Program version %s\n",openQCD_RELEASE); + + if (endian==LITTLE_ENDIAN) + printf("The machine is little endian\n"); + else + printf("The 
machine is big endian\n"); + if (noexp) + printf("Configurations are read in imported file format\n\n"); + else + printf("Configurations are read in exported file format\n\n"); + + if ((ipgrd[0]!=0)&&(ipgrd[1]!=0)) + printf("Process grid and process block size changed:\n"); + else if (ipgrd[0]!=0) + printf("Process grid changed:\n"); + else if (ipgrd[1]!=0) + printf("Process block size changed:\n"); + + if ((append==0)||(ipgrd[0]!=0)||(ipgrd[1]!=0)) + { + printf("%dx%dx%dx%d lattice, ",N0,N1,N2,N3); + printf("%dx%dx%dx%d local lattice\n",L0,L1,L2,L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d process block size\n\n", + NPROC0_BLK,NPROC1_BLK,NPROC2_BLK,NPROC3_BLK); + } + + if (append) + { + printf("Random number generator:\n"); + + if (norng) + printf("level = %d, seed = %d, effective seed = %d\n\n", + level,seed,seed^(first-step)); + else + { + printf("State of ranlxs and ranlxd reset to the\n"); + printf("last exported state\n\n"); + } + } + else + { + printf("Random number generator:\n"); + printf("level = %d, seed = %d\n\n",level,seed); + + printf("Lattice parameters:\n"); + + for (ik=0;ik=11) + printf("kappa[%2d] = %.*f\n",ik,IMAX(n[0],6),lat.kappa[ik]); + else + printf("kappa[%1d] = %.*f\n",ik,IMAX(n[0],6),lat.kappa[ik]); + } + + n[0]=fdigits(lat.csw); + printf("csw = %.*f\n\n",IMAX(n[0],1),lat.csw); + + if (bcp.type==0) + { + printf("Open boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n\n",IMAX(n[0],1),bcp.cF[0]); + } + else if (bcp.type==1) + { + printf("SF boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bcp.cF[0]); + + n[0]=fdigits(bcp.phi[0][0]); + n[1]=fdigits(bcp.phi[0][1]); + n[2]=fdigits(bcp.phi[0][2]); + printf("phi = %.*f,%.*f,%.*f\n",IMAX(n[0],1),bcp.phi[0][0], + IMAX(n[1],1),bcp.phi[0][1],IMAX(n[2],1),bcp.phi[0][2]); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = 
%.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else if (bcp.type==2) + { + printf("Open-SF boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bcp.cF[0]); + n[1]=fdigits(bcp.cF[1]); + printf("cF' = %.*f\n",IMAX(n[1],1),bcp.cF[1]); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else + printf("Periodic boundary conditions\n\n"); + + print_rw_parms(); + print_rat_parms(); + print_solver_parms(&isap,&idfl); + + if (isap) + print_sap_parms(0); + + if (idfl) + print_dfl_parms(0); + } + + printf("Configurations no %d -> %d in steps of %d\n\n", + first,last,step); + fflush(flog); + } +} + + +static void dfl_wsize(int *nws,int *nwv,int *nwvd) +{ + dfl_parms_t dp; + dfl_pro_parms_t dpp; + + dp=dfl_parms(); + dpp=dfl_pro_parms(); + + MAX(*nws,dp.Ns+2); + MAX(*nwv,2*dpp.nkv+2); + MAX(*nwvd,4); +} + + +static void solver_wsize(int isp,int nsd,int np, + int *nws,int *nwsd,int *nwv,int *nwvd) +{ + solver_parms_t sp; + + sp=solver_parms(isp); + + if (sp.solver==CGNE) + { + MAX(*nws,5); + MAX(*nwsd,nsd+3); + } + else if (sp.solver==MSCG) + { + if (np>1) + { + MAX(*nwsd,nsd+np+3); + } + else + { + MAX(*nwsd,nsd+5); + } + } + else if (sp.solver==SAP_GCR) + { + MAX(*nws,2*sp.nkv+1); + MAX(*nwsd,nsd+2); + } + else if (sp.solver==DFL_SAP_GCR) + { + MAX(*nws,2*sp.nkv+2); + MAX(*nwsd,nsd+4); + dfl_wsize(nws,nwv,nwvd); + } +} + + +static void reweight_wsize(int *nws,int *nwsd,int *nwv,int *nwvd) +{ + int nrw,nfct; + int irw,ifct,nsd; + int *np,*isp; + rw_parms_t rwp; + solver_parms_t sp; + + (*nws)=0; + (*nwsd)=0; + (*nwv)=0; + (*nwvd)=0; + nrw=file_head.nrw; + + for (irw=0;irw0) + mu1=rwp.mu[ifct-1]; + else + mu1=0.0; + + mu2=rwp.mu[ifct]; + isp=rwp.isp[ifct]; + + for (isrc=0;isrc [-noexp] +* +* For usage 
instructions see the file README.ms2. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "archive.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "sap.h" +#include "dfl.h" +#include "ratfcts.h" +#include "forces.h" +#include "version.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +#define MAX(n,m) \ + if ((n)<(m)) \ + (n)=(m) + +static int my_rank,noexp,endian; +static int first,last,step,np_ra,np_rb; +static int *rlxs_state=NULL,*rlxd_state=NULL; +static double ar[256]; + +static char log_dir[NAME_SIZE],loc_dir[NAME_SIZE],cnfg_dir[NAME_SIZE]; +static char log_file[NAME_SIZE],log_save[NAME_SIZE],end_file[NAME_SIZE]; +static char cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; +static FILE *fin=NULL,*flog=NULL,*fend=NULL; + +static lat_parms_t lat; +static bc_parms_t bcp; + + +static void read_dirs(void) +{ + if (my_rank==0) + { + find_section("Run name"); + read_line("name","%s",nbase); + + find_section("Directories"); + read_line("log_dir","%s",log_dir); + + if (noexp) + { + read_line("loc_dir","%s",loc_dir); + cnfg_dir[0]='\0'; + } + else + { + read_line("cnfg_dir","%s",cnfg_dir); + loc_dir[0]='\0'; + } + + find_section("Configurations"); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + + error_root((last=NAME_SIZE, + 1,"setup_files [ms2.c]","loc_dir name is too long"); + else + error_root(name_size("%s/%sn%d",cnfg_dir,nbase,last)>=NAME_SIZE, + 1,"setup_files [ms2.c]","cnfg_dir name is too long"); + + check_dir_root(log_dir); + error_root(name_size("%s/%s.ms2.log~",log_dir,nbase)>=NAME_SIZE, + 1,"setup_files [ms2.c]","log_dir name is too long"); + + 
sprintf(log_file,"%s/%s.ms2.log",log_dir,nbase); + sprintf(end_file,"%s/%s.ms2.end",log_dir,nbase); + sprintf(log_save,"%s~",log_file); +} + + +static void read_lat_parms(void) +{ + double kappa,csw; + + if (my_rank==0) + { + find_section("Dirac operator"); + read_line("kappa","%lf",&kappa); + read_line("csw","%lf",&csw); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + lat=set_lat_parms(0.0,1.0,1,&kappa,csw); +} + + +static void read_bc_parms(void) +{ + int bc; + double cF,cF_prime; + double phi[2],phi_prime[2]; + + if (my_rank==0) + { + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + read_line("cF","%lf",&cF); + + if (bc==2) + read_line("cF'","%lf",&cF_prime); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + bcp=set_bc_parms(bc,1.0,1.0,cF,cF_prime,phi,phi_prime); +} + + +static void read_sap_parms(void) +{ + int bs[4]; + + if (my_rank==0) + { + find_section("SAP"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,1,4,5); +} + + +static void read_dfl_parms(void) +{ + int bs[4],Ns; + int ninv,nmr,ncy,nkv,nmx; + double kappa,mu,res; + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + 
read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + set_dfl_pro_parms(nkv,nmx,res); +} + + +static void read_solver(void) +{ + solver_parms_t sp; + + read_solver_parms(0); + sp=solver_parms(0); + + if ((sp.solver==SAP_GCR)||(sp.solver==DFL_SAP_GCR)) + read_sap_parms(); + + if (sp.solver==DFL_SAP_GCR) + read_dfl_parms(); +} + + +static void read_infile(int argc,char *argv[]) +{ + int ifile; + + if (my_rank==0) + { + flog=freopen("STARTUP_ERROR","w",stdout); + + ifile=find_opt(argc,argv,"-i"); + endian=endianness(); + + error_root((ifile==0)||(ifile==(argc-1)),1,"read_infile [ms2.c]", + "Syntax: ms2 -i [-noexp]"); + + error_root(endian==UNKNOWN_ENDIAN,1,"read_infile [ms2.c]", + "Machine has unknown endianness"); + + noexp=find_opt(argc,argv,"-noexp"); + + fin=freopen(argv[ifile+1],"r",stdin); + error_root(fin==NULL,1,"read_infile [ms2.c]", + "Unable to open input file"); + } + + MPI_Bcast(&endian,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noexp,1,MPI_INT,0,MPI_COMM_WORLD); + + read_dirs(); + setup_files(); + read_lat_parms(); + read_bc_parms(); + + if (my_rank==0) + { + find_section("Power method"); + read_line("np_ra","%d",&np_ra); + read_line("np_rb","%d",&np_rb); + error_root((np_ra<1)||(np_rb<1),1,"read_infile [ms2.c]", + "Power method iteration numbers must be at least 1"); + } + + 
MPI_Bcast(&np_ra,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&np_rb,1,MPI_INT,0,MPI_COMM_WORLD); + read_solver(); + + if (my_rank==0) + fclose(fin); +} + + +static void check_files(void) +{ + if (my_rank==0) + { + fin=fopen(log_file,"r"); + error_root(fin!=NULL,1,"check_files [ms2.c]", + "Attempt to overwrite old *.log file"); + } +} + + +static void print_info(void) +{ + int isap,idfl,n[3]; + long ip; + + if (my_rank==0) + { + ip=ftell(flog); + fclose(flog); + + if (ip==0L) + remove("STARTUP_ERROR"); + + flog=freopen(log_file,"w",stdout); + error_root(flog==NULL,1,"print_info [ms2.c]","Unable to open log file"); + printf("\n"); + + printf("Spectral range of the hermitian Dirac operator\n"); + printf("----------------------------------------------\n\n"); + + printf("Program version %s\n",openQCD_RELEASE); + + if (endian==LITTLE_ENDIAN) + printf("The machine is little endian\n"); + else + printf("The machine is big endian\n"); + if (noexp) + printf("Configurations are read in imported file format\n\n"); + else + printf("Configurations are read in exported file format\n\n"); + + printf("%dx%dx%dx%d lattice, ",N0,N1,N2,N3); + printf("%dx%dx%dx%d local lattice\n",L0,L1,L2,L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d process block size\n", + NPROC0_BLK,NPROC1_BLK,NPROC2_BLK,NPROC3_BLK); + printf("SF boundary conditions on the quark fields\n\n"); + + printf("Dirac operator:\n"); + n[0]=fdigits(lat.kappa[0]); + printf("kappa = %.*f\n",IMAX(n[0],6),lat.kappa[0]); + n[0]=fdigits(lat.csw); + printf("csw = %.*f\n\n",IMAX(n[0],1),lat.csw); + + if (bcp.type==0) + { + printf("Open boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n\n",IMAX(n[0],1),bcp.cF[0]); + } + else if (bcp.type==1) + { + printf("SF boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bcp.cF[0]); + + n[0]=fdigits(bcp.phi[0][0]); + n[1]=fdigits(bcp.phi[0][1]); + n[2]=fdigits(bcp.phi[0][2]); + 
printf("phi = %.*f,%.*f,%.*f\n",IMAX(n[0],1),bcp.phi[0][0], + IMAX(n[1],1),bcp.phi[0][1],IMAX(n[2],1),bcp.phi[0][2]); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else if (bcp.type==2) + { + printf("Open-SF boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bcp.cF[0]); + n[1]=fdigits(bcp.cF[1]); + printf("cF' = %.*f\n",IMAX(n[1],1),bcp.cF[1]); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else + printf("Periodic boundary conditions\n\n"); + + printf("Power method:\n"); + printf("np_ra = %d\n",np_ra); + printf("np_rb = %d\n\n",np_rb); + + print_solver_parms(&isap,&idfl); + + if (isap) + print_sap_parms(0); + + if (idfl) + print_dfl_parms(0); + + printf("Configurations no %d -> %d in steps of %d\n\n", + first,last,step); + fflush(flog); + } +} + + +static void dfl_wsize(int *nws,int *nwv,int *nwvd) +{ + dfl_parms_t dp; + dfl_pro_parms_t dpp; + + dp=dfl_parms(); + dpp=dfl_pro_parms(); + + MAX(*nws,dp.Ns+2); + MAX(*nwv,2*dpp.nkv+2); + MAX(*nwvd,4); +} + + +static void wsize(int *nws,int *nwsd,int *nwv,int *nwvd) +{ + int nsd; + solver_parms_t sp; + + (*nws)=0; + (*nwsd)=0; + (*nwv)=0; + (*nwvd)=0; + + sp=solver_parms(0); + + if (sp.solver==CGNE) + { + nsd=1; + MAX(*nws,5); + MAX(*nwsd,nsd+3); + } + else if (sp.solver==SAP_GCR) + { + nsd=2; + MAX(*nws,2*sp.nkv+1); + MAX(*nwsd,nsd+2); + } + else if (sp.solver==DFL_SAP_GCR) + { + nsd=2; + MAX(*nws,2*sp.nkv+2); + MAX(*nwsd,nsd+4); + dfl_wsize(nws,nwv,nwvd); + } + else + error_root(1,1,"wsize [ms2.c]", + "Unknown or unsupported solver"); +} + + +static double power1(int *status) +{ + int nsd,k,l,stat[6]; + double r; + spinor_dble 
**wsd; + solver_parms_t sp; + sap_parms_t sap; + + set_sw_parms(sea_quark_mass(0)); + sp=solver_parms(0); + + if (sp.solver==CGNE) + { + nsd=1; + status[0]=0; + } + else if (sp.solver==SAP_GCR) + { + nsd=2; + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + status[0]=0; + } + else if (sp.solver==DFL_SAP_GCR) + { + nsd=2; + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + for (l=0;l<3;l++) + status[l]=0; + } + else + { + nsd=1; + error_root(1,1,"power1 [ms2.c]", + "Unknown or unsupported solver"); + } + + wsd=reserve_wsd(nsd); + random_sd(VOLUME/2,wsd[0],1.0); + bnd_sd2zero(EVEN_PTS,wsd[0]); + r=normalize_dble(VOLUME/2,1,wsd[0]); + + for (k=0;kramax) + ramax=ra; + raavg+=ra; + + if (rbrbmax) + rbmax=rb; + rbavg+=rb; + } + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + wtavg+=(wt2-wt1); + error_chk(); + + if (my_rank==0) + { + printf("ra = %.2e, rb = %.2e, ",ra,rb); + + if (dfl.Ns) + printf("status = %d,%d,%d\n", + status[0],status[1],status[2]); + else + printf("status = %d\n",status[0]); + + printf("Configuration no %d fully processed in %.2e sec ", + nc,wt2-wt1); + printf("(average = %.2e sec)\n\n", + wtavg/(double)((nc-first)/step+1)); + + fflush(flog); + copy_file(log_file,log_save); + } + + check_endflag(&iend); + } + + if (my_rank==0) + { + last=nc-step; + nc=(last-first)/step+1; + + printf("Summary\n"); + printf("-------\n\n"); + + printf("Considered %d configurations in the range %d -> %d\n\n", + nc,first,last); + + printf("The three figures quoted in each case are the minimal,\n"); + printf("maximal and average values\n\n"); + + printf("Spectral gap ra = %.2e, %.2e, %.2e\n", + ramin,ramax,raavg/(double)(nc)); + printf("Spectral radius rb = %.2e, %.2e, %.2e\n\n", + rbmin,rbmax,rbavg/(double)(nc)); + + ra=0.90*ramin; + rb=1.03*rbmax; + eps=ra/rb; + eps=eps*eps; + Ne=0.5*(double)(NPROC0*L0-2)*(double)(NPROC1*NPROC2*NPROC3*L1*L2*L3); + + printf("Zolotarev rational approximation:\n\n"); + + printf("n: number of 
poles\n"); + printf("delta: approximation error\n"); + printf("Ne: number of even lattice points\n"); + printf("Suggested spectral range = [%.2e,%.2e]\n\n",ra,rb); + + printf(" n delta 12*Ne*delta 12*Ne*delta^2\n"); + + for (n=6;n<=128;n++) + { + zolotarev(n,eps,&A,ar,&delta); + d1=12.0*Ne*delta; + d2=d1*delta; + + printf(" %3d %.1e %.1e %.1e\n",n,delta,d1,d2); + + if ((d1<1.0e-2)&&(d2<1.0e-4)) + break; + } + + printf("\n"); + } + + error_chk(); + + if (my_rank==0) + { + fflush(flog); + copy_file(log_file,log_save); + fclose(flog); + } + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms2.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms2.in new file mode 100644 index 0000000000000000000000000000000000000000..d8b413890a4fffa7986bd381bcce408f4b01908f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms2.in @@ -0,0 +1,56 @@ + +[Run name] +name Snoopy137 + +[Directories] +log_dir ../data/ms2/log +loc_dir /ndata/qcd1/cnfg +cnfg_dir /data/qcd1/cnfg + +[Configurations] +first 1 +last 4 +step 1 + +[Dirac operator] +kappa 0.1300 +csw 1.234 + +[Boundary conditions] +type 2 +phi 0.12 -0.56 +phi' 0.92 0.76 +cF 0.95 +cF' 0.90 + +[Power method] +np_ra 20 +np_rb 100 + +[Solver 0] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 128 +res 1.0e-8 + +[SAP] +bs 8 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.13635 +mu 0.005 +ninv 10 +nmr 4 +ncy 4 + +[Deflation projection] +nkv 24 +nmx 512 +res 1.0e-2 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms3.c new file mode 100644 index 0000000000000000000000000000000000000000..05ac9527f086c8490b0749eb3a123716bc0e95bf --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ms3.c @@ -0,0 +1,953 @@ + 
+/******************************************************************************* +* +* File ms3.c +* +* Copyright (C) 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Computation of Wilson flow observables. +* +* Syntax: ms3 -i [-noexp] [-a] +* +* For usage instructions see the file README.ms3. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "tcharge.h" +#include "wflow.h" +#include "version.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static struct +{ + int dn,nn,tmax; + double eps; +} file_head; + +static struct +{ + int nc; + double **Wsl,**Ysl,**Qsl; +} data; + +static int my_rank,noexp,append,endian; +static int first,last,step; +static int ipgrd[2],flint; +static double *Wact,*Yact,*Qtop; + +static char line[NAME_SIZE]; +static char log_dir[NAME_SIZE],dat_dir[NAME_SIZE]; +static char loc_dir[NAME_SIZE],cnfg_dir[NAME_SIZE]; +static char log_file[NAME_SIZE],log_save[NAME_SIZE],end_file[NAME_SIZE]; +static char par_file[NAME_SIZE],par_save[NAME_SIZE]; +static char dat_file[NAME_SIZE],dat_save[NAME_SIZE]; +static char cnfg_file[NAME_SIZE],nbase[NAME_SIZE]; +static FILE *fin=NULL,*flog=NULL,*fdat=NULL,*fend=NULL; + +static bc_parms_t bcp; + + +static void alloc_data(void) +{ + int nn,tmax; + int in; + double **pp,*p; + + nn=file_head.nn; + tmax=file_head.tmax; + + pp=amalloc(3*(nn+1)*sizeof(*pp),3); + p=amalloc(3*(nn+1)*(tmax+1)*sizeof(*p),4); + + error((pp==NULL)||(p==NULL),1,"alloc_data [ms3.c]", + "Unable to allocate data arrays"); + + data.Wsl=pp; + data.Ysl=pp+nn+1; + data.Qsl=pp+2*(nn+1); + + for (in=0;in<(3*(nn+1));in++) + { + *pp=p; + pp+=1; + 
p+=tmax; + } + + Wact=p; + p+=nn+1; + Yact=p; + p+=nn+1; + Qtop=p; +} + + +static void write_file_head(void) +{ + int iw; + stdint_t istd[3]; + double dstd[1]; + + istd[0]=(stdint_t)(file_head.dn); + istd[1]=(stdint_t)(file_head.nn); + istd[2]=(stdint_t)(file_head.tmax); + dstd[0]=file_head.eps; + + if (endian==BIG_ENDIAN) + { + bswap_int(3,istd); + bswap_double(1,dstd); + } + + iw=fwrite(istd,sizeof(stdint_t),3,fdat); + iw+=fwrite(dstd,sizeof(double),1,fdat); + + error_root(iw!=4,1,"write_file_head [ms3.c]", + "Incorrect write count"); +} + + +static void check_file_head(void) +{ + int ir; + stdint_t istd[3]; + double dstd[1]; + + ir=fread(istd,sizeof(stdint_t),3,fdat); + ir+=fread(dstd,sizeof(double),1,fdat); + + error_root(ir!=4,1,"check_file_head [ms3.c]", + "Incorrect read count"); + + if (endian==BIG_ENDIAN) + { + bswap_int(3,istd); + bswap_double(1,dstd); + } + + error_root(((int)(istd[0])!=file_head.dn)|| + ((int)(istd[1])!=file_head.nn)|| + ((int)(istd[2])!=file_head.tmax)|| + (dstd[0]!=file_head.eps),1,"check_file_head [ms3.c]", + "Unexpected value of dn,nn,tmax or eps"); +} + + +static void write_data(void) +{ + int iw,nn,tmax; + int in,t; + stdint_t istd[1]; + double dstd[1]; + + istd[0]=(stdint_t)(data.nc); + + if (endian==BIG_ENDIAN) + bswap_int(1,istd); + + iw=fwrite(istd,sizeof(stdint_t),1,fdat); + + nn=file_head.nn; + tmax=file_head.tmax; + + for (in=0;in<=nn;in++) + { + for (t=0;t=NAME_SIZE, + 1,"setup_files [ms3.c]","loc_dir name is too long"); + else + error_root(name_size("%s/%sn%d",cnfg_dir,nbase,last)>=NAME_SIZE, + 1,"setup_files [ms3.c]","cnfg_dir name is too long"); + + check_dir_root(log_dir); + check_dir_root(dat_dir); + error_root(name_size("%s/%s.ms3.log~",log_dir,nbase)>=NAME_SIZE, + 1,"setup_files [ms3.c]","log_dir name is too long"); + error_root(name_size("%s/%s.ms3.dat~",dat_dir,nbase)>=NAME_SIZE, + 1,"setup_files [ms3.c]","dat_dir name is too long"); + + sprintf(log_file,"%s/%s.ms3.log",log_dir,nbase); + 
sprintf(par_file,"%s/%s.ms3.par",dat_dir,nbase); + sprintf(dat_file,"%s/%s.ms3.dat",dat_dir,nbase); + sprintf(end_file,"%s/%s.ms3.end",log_dir,nbase); + sprintf(log_save,"%s~",log_file); + sprintf(par_save,"%s~",par_file); + sprintf(dat_save,"%s~",dat_file); +} + + +static void read_bc_parms(void) +{ + int bc; + double phi[2],phi_prime[2]; + + if (my_rank==0) + { + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + bcp=set_bc_parms(bc,1.0,1.0,1.0,1.0,phi,phi_prime); + + if (append) + check_bc_parms(fdat); + else + write_bc_parms(fdat); +} + + +static void read_wflow_parms(void) +{ + int nstep,dnms,ie,ir,iw; + stdint_t istd[3]; + double eps,dstd[1]; + + if (my_rank==0) + { + find_section("Wilson flow"); + read_line("integrator","%s",line); + read_line("eps","%lf",&eps); + read_line("nstep","%d",&nstep); + read_line("dnms","%d",&dnms); + + if (strcmp(line,"EULER")==0) + flint=0; + else if (strcmp(line,"RK2")==0) + flint=1; + else if (strcmp(line,"RK3")==0) + flint=2; + else + error_root(1,1,"read_wflow_parms [ms3.c]","Unknown integrator"); + + error_root((dnms<1)||(nstep [-noexp] [-a]"); + + error_root(endian==UNKNOWN_ENDIAN,1,"read_infile [ms3.c]", + "Machine has unknown endianness"); + + noexp=find_opt(argc,argv,"-noexp"); + append=find_opt(argc,argv,"-a"); + + fin=freopen(argv[ifile+1],"r",stdin); + error_root(fin==NULL,1,"read_infile [ms3.c]", + "Unable to open input file"); + } + + MPI_Bcast(&endian,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noexp,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&append,1,MPI_INT,0,MPI_COMM_WORLD); + + read_dirs(); + setup_files(); + + if (my_rank==0) + { + if (append) + fdat=fopen(par_file,"rb"); + 
else + fdat=fopen(par_file,"wb"); + + error_root(fdat==NULL,1,"read_infile [ms3.c]", + "Unable to open parameter file"); + } + + read_bc_parms(); + read_wflow_parms(); + + if (my_rank==0) + { + fclose(fin); + fclose(fdat); + + if (append==0) + copy_file(par_file,par_save); + } +} + + +static void check_old_log(int *fst,int *lst,int *stp) +{ + int ie,ic,isv; + int fc,lc,dc,pc; + int np[4],bp[4]; + + fend=fopen(log_file,"r"); + error_root(fend==NULL,1,"check_old_log [ms3.c]", + "Unable to open log file"); + + fc=0; + lc=0; + dc=0; + pc=0; + + ie=0x0; + ic=0; + isv=0; + + while (fgets(line,NAME_SIZE,fend)!=NULL) + { + if (strstr(line,"process grid")!=NULL) + { + if (sscanf(line,"%dx%dx%dx%d process grid, %dx%dx%dx%d", + np,np+1,np+2,np+3,bp,bp+1,bp+2,bp+3)==8) + { + ipgrd[0]=((np[0]!=NPROC0)||(np[1]!=NPROC1)|| + (np[2]!=NPROC2)||(np[3]!=NPROC3)); + ipgrd[1]=((bp[0]!=NPROC0_BLK)||(bp[1]!=NPROC1_BLK)|| + (bp[2]!=NPROC2_BLK)||(bp[3]!=NPROC3_BLK)); + } + else + ie|=0x1; + } + else if (strstr(line,"fully processed")!=NULL) + { + pc=lc; + + if (sscanf(line,"Configuration no %d",&lc)==1) + { + ic+=1; + isv=1; + } + else + ie|=0x1; + + if (ic==1) + fc=lc; + else if (ic==2) + dc=lc-fc; + else if ((ic>2)&&(lc!=(pc+dc))) + ie|=0x2; + } + else if (strstr(line,"Configuration no")!=NULL) + isv=0; + } + + fclose(fend); + + error_root((ie&0x1)!=0x0,1,"check_old_log [ms3.c]", + "Incorrect read count"); + error_root((ie&0x2)!=0x0,1,"check_old_log [ms3.c]", + "Configuration numbers are not equally spaced"); + error_root(isv==0,1,"check_old_log [ms3.c]", + "Log file extends beyond the last configuration save"); + + (*fst)=fc; + (*lst)=lc; + (*stp)=dc; +} + + +static void check_old_dat(int fst,int lst,int stp) +{ + int ie,ic; + int fc,lc,dc,pc; + + fdat=fopen(dat_file,"rb"); + error_root(fdat==NULL,1,"check_old_dat [ms3.c]", + "Unable to open data file"); + + check_file_head(); + + fc=0; + lc=0; + dc=0; + pc=0; + + ie=0x0; + ic=0; + + while (read_data()==1) + { + pc=lc; + lc=data.nc; + 
ic+=1; + + if (ic==1) + fc=lc; + else if (ic==2) + dc=lc-fc; + else if ((ic>2)&&(lc!=(pc+dc))) + ie|=0x1; + } + + fclose(fdat); + + error_root(ic==0,1,"check_old_dat [ms3.c]", + "No data records found"); + error_root((ie&0x1)!=0x0,1,"check_old_dat [ms3.c]", + "Configuration numbers are not equally spaced"); + error_root((fst!=fc)||(lst!=lc)||(stp!=dc),1,"check_old_dat [ms3.c]", + "Configuration range is not as reported in the log file"); +} + + +static void check_files(void) +{ + int fst,lst,stp; + + ipgrd[0]=0; + ipgrd[1]=0; + + if (my_rank==0) + { + if (append) + { + check_old_log(&fst,&lst,&stp); + check_old_dat(fst,lst,stp); + + error_root((fst!=lst)&&(stp!=step),1,"check_files [ms3.c]", + "Continuation run:\n" + "Previous run had a different configuration separation"); + error_root(first!=lst+step,1,"check_files [ms3.c]", + "Continuation run:\n" + "Configuration range does not continue the previous one"); + } + else + { + fin=fopen(log_file,"r"); + fdat=fopen(dat_file,"rb"); + + error_root((fin!=NULL)||(fdat!=NULL),1,"check_files [ms3.c]", + "Attempt to overwrite old *.log or *.dat file"); + + fdat=fopen(dat_file,"wb"); + error_root(fdat==NULL,1,"check_files [ms3.c]", + "Unable to open data file"); + write_file_head(); + fclose(fdat); + } + } +} + + +static void print_info(void) +{ + int n[3]; + long ip; + + if (my_rank==0) + { + ip=ftell(flog); + fclose(flog); + + if (ip==0L) + remove("STARTUP_ERROR"); + + if (append) + flog=freopen(log_file,"a",stdout); + else + flog=freopen(log_file,"w",stdout); + + error_root(flog==NULL,1,"print_info [ms3.c]","Unable to open log file"); + printf("\n"); + + if (append) + printf("Continuation run\n\n"); + else + { + printf("Computation of Wilson flow observables\n"); + printf("--------------------------------------\n\n"); + } + + printf("Program version %s\n",openQCD_RELEASE); + + if (endian==LITTLE_ENDIAN) + printf("The machine is little endian\n"); + else + printf("The machine is big endian\n"); + if (noexp) + 
printf("Configurations are read in imported file format\n\n"); + else + printf("Configurations are read in exported file format\n\n"); + + if ((ipgrd[0]!=0)&&(ipgrd[1]!=0)) + printf("Process grid and process block size changed:\n"); + else if (ipgrd[0]!=0) + printf("Process grid changed:\n"); + else if (ipgrd[1]!=0) + printf("Process block size changed:\n"); + + if ((append==0)||(ipgrd[0]!=0)||(ipgrd[1]!=0)) + { + printf("%dx%dx%dx%d lattice, ",N0,N1,N2,N3); + printf("%dx%dx%dx%d local lattice\n",L0,L1,L2,L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d process block size\n\n", + NPROC0_BLK,NPROC1_BLK,NPROC2_BLK,NPROC3_BLK); + } + + if (append==0) + { + if (bcp.type==0) + printf("Open boundary conditions\n\n"); + else if (bcp.type==1) + { + printf("SF boundary conditions\n"); + + n[0]=fdigits(bcp.phi[0][0]); + n[1]=fdigits(bcp.phi[0][1]); + n[2]=fdigits(bcp.phi[0][2]); + printf("phi = %.*f,%.*f,%.*f\n",IMAX(n[0],1),bcp.phi[0][0], + IMAX(n[1],1),bcp.phi[0][1],IMAX(n[2],1),bcp.phi[0][2]); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else if (bcp.type==2) + { + printf("Open-SF boundary conditions\n"); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else + printf("Periodic boundary conditions\n\n"); + + printf("Wilson flow:\n"); + if (flint==0) + printf("Euler integrator\n"); + else if (flint==1) + printf("2nd order RK integrator\n"); + else + printf("3rd order RK integrator\n"); + n[0]=fdigits(file_head.eps); + printf("eps = %.*f\n",IMAX(n[0],1),file_head.eps); + printf("nstep = %d\n",file_head.dn*file_head.nn); + printf("dnms = %d\n\n",file_head.dn); + } + + 
printf("Configurations no %d -> %d in steps of %d\n\n", + first,last,step); + fflush(flog); + } +} + + +static void set_data(int nc) +{ + int in,dn,nn; + double eps; + + data.nc=nc; + dn=file_head.dn; + nn=file_head.nn; + eps=file_head.eps; + + for (in=0;in [-noexp] +* +* For usage instructions see the file README.ms4. +* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "archive.h" +#include "sflds.h" +#include "linalg.h" +#include "dirac.h" +#include "sap.h" +#include "dfl.h" +#include "forces.h" +#include "version.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +#define MAX(n,m) \ + if ((n)<(m)) \ + (n)=(m) + +static int my_rank,noexp,endian; +static int first,last,step; +static int level,seed,x0,nsrc; +static int *rlxs_state=NULL,*rlxd_state=NULL; +static double mus; + +static char log_dir[NAME_SIZE],loc_dir[NAME_SIZE]; +static char cnfg_dir[NAME_SIZE],sfld_dir[NAME_SIZE]; +static char log_file[NAME_SIZE],log_save[NAME_SIZE],end_file[NAME_SIZE]; +static char cnfg_file[NAME_SIZE],sfld_file[NAME_SIZE],nbase[NAME_SIZE]; +static FILE *fin=NULL,*flog=NULL,*fend=NULL; + +static lat_parms_t lat; +static bc_parms_t bcp; + + +static void read_dirs(void) +{ + if (my_rank==0) + { + find_section("Run name"); + read_line("name","%s",nbase); + + find_section("Directories"); + read_line("log_dir","%s",log_dir); + + if (noexp) + { + read_line("loc_dir","%s",loc_dir); + cnfg_dir[0]='\0'; + } + else + { + read_line("cnfg_dir","%s",cnfg_dir); + loc_dir[0]='\0'; + } + + read_line("sfld_dir","%s",sfld_dir); + + find_section("Configurations"); + read_line("first","%d",&first); + read_line("last","%d",&last); + read_line("step","%d",&step); + + find_section("Random number generator"); + 
read_line("level","%d",&level); + read_line("seed","%d",&seed); + + error_root((last=NAME_SIZE, + 1,"setup_files [ms4.c]","loc_dir name is too long"); + else + error_root(name_size("%s/%sn%d",cnfg_dir,nbase,last)>=NAME_SIZE, + 1,"setup_files [ms4.c]","cnfg_dir name is too long"); + + check_dir_root(sfld_dir); + error_root(name_size("%s/%sn%d.s%d",sfld_dir,nbase,last,nsrc-1)>=NAME_SIZE, + 1,"setup_files [ms4.c]","sfld_dir name is too long"); + + check_dir_root(log_dir); + error_root(name_size("%s/%s.ms4.log~",log_dir,nbase)>=NAME_SIZE, + 1,"setup_files [ms4.c]","log_dir name is too long"); + + sprintf(log_file,"%s/%s.ms4.log",log_dir,nbase); + sprintf(end_file,"%s/%s.ms4.end",log_dir,nbase); + sprintf(log_save,"%s~",log_file); +} + + +static void read_lat_parms(void) +{ + double kappa,csw; + + if (my_rank==0) + { + find_section("Dirac operator"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mus); + read_line("csw","%lf",&csw); + + find_section("Source fields"); + read_line("x0","%d",&x0); + read_line("nsrc","%d",&nsrc); + + error_root((x0<0)||(x0>=N0),1,"read_lat_parms [ms4.c]", + "Specified time x0 is out of range"); + error_root(nsrc<1,1,"read_lat_parms [ms4.c]", + "The number of source fields must be at least 1"); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mus,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + MPI_Bcast(&x0,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nsrc,1,MPI_INT,0,MPI_COMM_WORLD); + + lat=set_lat_parms(0.0,1.0,1,&kappa,csw); + set_sw_parms(sea_quark_mass(0)); +} + + +static void read_bc_parms(void) +{ + int bc; + double cF,cF_prime; + double phi[2],phi_prime[2]; + + if (my_rank==0) + { + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + error_root(((x0==0)&&(bc!=3))||((x0==(N0-1))&&(bc==0)),1, /* compare x0, never assign: x0 was already broadcast by read_lat_parms */ + "read_bc_parms [ms4.c]","Incompatible choice of boundary " + "conditions and source time"); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + 
phi_prime[1]=0.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + read_line("cF","%lf",&cF); + + if (bc==2) + read_line("cF'","%lf",&cF_prime); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + bcp=set_bc_parms(bc,1.0,1.0,cF,cF_prime,phi,phi_prime); +} + + +static void read_sap_parms(void) +{ + int bs[4]; + + if (my_rank==0) + { + find_section("SAP"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + set_sap_parms(bs,1,4,5); +} + + +static void read_dfl_parms(void) +{ + int bs[4],Ns; + int ninv,nmr,ncy,nkv,nmx; + double kappa,mu,res; + + if (my_rank==0) + { + find_section("Deflation subspace"); + read_line("bs","%d %d %d %d",bs,bs+1,bs+2,bs+3); + read_line("Ns","%d",&Ns); + } + + MPI_Bcast(bs,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&Ns,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_parms(bs,Ns); + + if (my_rank==0) + { + find_section("Deflation subspace generation"); + read_line("kappa","%lf",&kappa); + read_line("mu","%lf",&mu); + read_line("ninv","%d",&ninv); + read_line("nmr","%d",&nmr); + read_line("ncy","%d",&ncy); + } + + MPI_Bcast(&kappa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&mu,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&ninv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + set_dfl_gen_parms(kappa,mu,ninv,nmr,ncy); + + if (my_rank==0) + { + find_section("Deflation projection"); + read_line("nkv","%d",&nkv); + read_line("nmx","%d",&nmx); + read_line("res","%lf",&res); + } + + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + 
set_dfl_pro_parms(nkv,nmx,res); +} + + +static void read_solver(void) +{ + solver_parms_t sp; + + read_solver_parms(0); + sp=solver_parms(0); + + if ((sp.solver==SAP_GCR)||(sp.solver==DFL_SAP_GCR)) + read_sap_parms(); + + if (sp.solver==DFL_SAP_GCR) + read_dfl_parms(); +} + + +static void read_infile(int argc,char *argv[]) +{ + int ifile; + + if (my_rank==0) + { + flog=freopen("STARTUP_ERROR","w",stdout); + + ifile=find_opt(argc,argv,"-i"); + endian=endianness(); + + error_root((ifile==0)||(ifile==(argc-1)),1,"read_infile [ms4.c]", + "Syntax: ms4 -i [-noexp]"); + + error_root(endian==UNKNOWN_ENDIAN,1,"read_infile [ms4.c]", + "Machine has unknown endianness"); + + noexp=find_opt(argc,argv,"-noexp"); + + fin=freopen(argv[ifile+1],"r",stdin); + error_root(fin==NULL,1,"read_infile [ms4.c]", + "Unable to open input file"); + } + + MPI_Bcast(&endian,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noexp,1,MPI_INT,0,MPI_COMM_WORLD); + + read_dirs(); + setup_files(); + read_lat_parms(); + read_bc_parms(); + read_solver(); + + if (my_rank==0) + fclose(fin); +} + + +static void check_files(void) +{ + if (my_rank==0) + { + fin=fopen(log_file,"r"); + error_root(fin!=NULL,1,"check_files [ms4.c]", + "Attempt to overwrite old *.log file"); + } +} + + +static void print_info(void) +{ + int isap,idfl,n[3]; + long ip; + + if (my_rank==0) + { + ip=ftell(flog); + fclose(flog); + + if (ip==0L) + remove("STARTUP_ERROR"); + + flog=freopen(log_file,"w",stdout); + error_root(flog==NULL,1,"print_info [ms4.c]","Unable to open log file"); + printf("\n"); + + printf("Computation of quark propagators\n"); + printf("--------------------------------\n\n"); + + printf("Program version %s\n",openQCD_RELEASE); + + if (endian==LITTLE_ENDIAN) + printf("The machine is little endian\n"); + else + printf("The machine is big endian\n"); + if (noexp) + printf("Configurations are read in imported file format\n\n"); + else + printf("Configurations are read in exported file format\n\n"); + + printf("%dx%dx%dx%d 
lattice, ",N0,N1,N2,N3); + printf("%dx%dx%dx%d local lattice\n",L0,L1,L2,L3); + printf("%dx%dx%dx%d process grid, ",NPROC0,NPROC1,NPROC2,NPROC3); + printf("%dx%dx%dx%d process block size\n", + NPROC0_BLK,NPROC1_BLK,NPROC2_BLK,NPROC3_BLK); + printf("SF boundary conditions on the quark fields\n\n"); + + printf("Random number generator:\n"); + printf("level = %d, seed = %d\n\n",level,seed); + + printf("Dirac operator:\n"); + n[0]=fdigits(lat.kappa[0]); + printf("kappa = %.*f\n",IMAX(n[0],6),lat.kappa[0]); + n[0]=fdigits(mus); + printf("mu = %.*f\n",IMAX(n[0],1),mus); + n[0]=fdigits(lat.csw); + printf("csw = %.*f\n\n",IMAX(n[0],1),lat.csw); + + if (bcp.type==0) + { + printf("Open boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n\n",IMAX(n[0],1),bcp.cF[0]); + } + else if (bcp.type==1) + { + printf("SF boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bcp.cF[0]); + + n[0]=fdigits(bcp.phi[0][0]); + n[1]=fdigits(bcp.phi[0][1]); + n[2]=fdigits(bcp.phi[0][2]); + printf("phi = %.*f,%.*f,%.*f\n",IMAX(n[0],1),bcp.phi[0][0], + IMAX(n[1],1),bcp.phi[0][1],IMAX(n[2],1),bcp.phi[0][2]); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else if (bcp.type==2) + { + printf("Open-SF boundary conditions\n"); + + n[0]=fdigits(bcp.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bcp.cF[0]); + n[1]=fdigits(bcp.cF[1]); + printf("cF' = %.*f\n",IMAX(n[1],1),bcp.cF[1]); + + n[0]=fdigits(bcp.phi[1][0]); + n[1]=fdigits(bcp.phi[1][1]); + n[2]=fdigits(bcp.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bcp.phi[1][0], + IMAX(n[1],1),bcp.phi[1][1],IMAX(n[2],1),bcp.phi[1][2]); + } + else + printf("Periodic boundary conditions\n\n"); + + printf("Source fields:\n"); + printf("x0 = %d\n",x0); + printf("nsrc = %d\n\n",nsrc); + + print_solver_parms(&isap,&idfl); 
+ + if (isap) + print_sap_parms(0); + + if (idfl) + print_dfl_parms(0); + + printf("Configurations no %d -> %d in steps of %d\n\n", + first,last,step); + fflush(flog); + } +} + + +static void dfl_wsize(int *nws,int *nwv,int *nwvd) +{ + dfl_parms_t dp; + dfl_pro_parms_t dpp; + + dp=dfl_parms(); + dpp=dfl_pro_parms(); + + MAX(*nws,dp.Ns+2); + MAX(*nwv,2*dpp.nkv+2); + MAX(*nwvd,4); +} + + +static void wsize(int *nws,int *nwsd,int *nwv,int *nwvd) +{ + int nsd; + solver_parms_t sp; + + (*nws)=0; + (*nwsd)=0; + (*nwv)=0; + (*nwvd)=0; + + sp=solver_parms(0); + nsd=2; + + if (sp.solver==CGNE) + { + MAX(*nws,5); + MAX(*nwsd,nsd+3); + } + if (sp.solver==SAP_GCR) + { + MAX(*nws,2*sp.nkv+1); + MAX(*nwsd,nsd+2); + } + else if (sp.solver==DFL_SAP_GCR) + { + MAX(*nws,2*sp.nkv+2); + MAX(*nwsd,nsd+4); + dfl_wsize(nws,nwv,nwvd); + } + else + error_root(1,1,"wsize [ms4.c]", + "Unknown or unsupported solver"); +} + + +static void random_source(spinor_dble *eta) +{ + int y0,iy,ix; + + set_sd2zero(VOLUME,eta); + y0=x0-cpr[0]*L0; + + if ((y0>=0)&&(y0 [-noloc] [-noexp] [-rmold] [-noms] +* [-c [-a [-norng]]] +* +* For usage instructions see the file README.qcd1. 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "forces.h" +#include "update.h" +#include "wflow.h" +#include "tcharge.h" +#include "version.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +typedef struct +{ + int nt,iac; + double dH,avpl; +} dat_t; + +static struct +{ + int dn,nn,tmax; + double eps; +} file_head; + +static struct +{ + int nt; + double **Wsl,**Ysl,**Qsl; +} data; + +static int my_rank,noloc,noexp,rmold,noms,norng; +static int scnfg,append,endian; +static int level,seed; +static int nth,ntr,dtr_log,dtr_ms,dtr_cnfg; +static int ipgrd[2],flint; +static double *Wact,*Yact,*Qtop; + +static char line[NAME_SIZE]; +static char log_dir[NAME_SIZE],dat_dir[NAME_SIZE]; +static char loc_dir[NAME_SIZE],cnfg_dir[NAME_SIZE]; +static char log_file[NAME_SIZE],log_save[NAME_SIZE]; +static char par_file[NAME_SIZE],par_save[NAME_SIZE]; +static char dat_file[NAME_SIZE],dat_save[NAME_SIZE]; +static char msdat_file[NAME_SIZE],msdat_save[NAME_SIZE]; +static char rng_file[NAME_SIZE],rng_save[NAME_SIZE]; +static char cnfg_file[NAME_SIZE],end_file[NAME_SIZE]; +static char nbase[NAME_SIZE],cnfg[NAME_SIZE]; +static FILE *fin=NULL,*flog=NULL,*fdat=NULL,*fend=NULL; + +static hmc_parms_t hmc; + + +static int write_dat(int n,dat_t *ndat) +{ + int i,iw,ic; + stdint_t istd[2]; + double dstd[2]; + + ic=0; + + for (i=0;i=NAME_SIZE,1, + "setup_files [qcd1.c]","log_dir name is too long"); + error_root(name_size("%s/%s.ms.dat~",dat_dir,nbase)>=NAME_SIZE,1, + "setup_files [qcd1.c]","dat_dir name is too long"); + + sprintf(log_file,"%s/%s.log",log_dir,nbase); + sprintf(par_file,"%s/%s.par",dat_dir,nbase); + 
sprintf(dat_file,"%s/%s.dat",dat_dir,nbase); + sprintf(msdat_file,"%s/%s.ms.dat",dat_dir,nbase); + sprintf(rng_file,"%s/%s.rng",dat_dir,nbase); + sprintf(end_file,"%s/%s.end",log_dir,nbase); + sprintf(log_save,"%s~",log_file); + sprintf(par_save,"%s~",par_file); + sprintf(dat_save,"%s~",dat_file); + sprintf(msdat_save,"%s~",msdat_file); + sprintf(rng_save,"%s~",rng_file); +} + + +static void read_lat_parms(void) +{ + int nk; + double beta,c0,csw,*kappa; + + if (my_rank==0) + { + find_section("Lattice parameters"); + read_line("beta","%lf",&beta); + read_line("c0","%lf",&c0); + nk=count_tokens("kappa"); + read_line("csw","%lf",&csw); + } + + MPI_Bcast(&beta,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&c0,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&nk,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&csw,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + if (nk>0) + { + kappa=malloc(nk*sizeof(*kappa)); + error(kappa==NULL,1,"read_lat_parms [qcd1.c]", + "Unable to allocate parameter array"); + if (my_rank==0) + read_dprms("kappa",nk,kappa); + MPI_Bcast(kappa,nk,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + kappa=NULL; + + set_lat_parms(beta,c0,nk,kappa,csw); + + if (nk>0) + free(kappa); + + if (append) + check_lat_parms(fdat); + else + write_lat_parms(fdat); +} + + +static void read_bc_parms(void) +{ + int bc; + double cG,cG_prime,cF,cF_prime; + double phi[2],phi_prime[2]; + + if (my_rank==0) + { + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cG=1.0; + cG_prime=1.0; + cF=1.0; + cF_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + { + read_line("cG","%lf",&cG); + read_line("cF","%lf",&cF); + } + + if (bc==2) + { + read_line("cG'","%lf",&cG_prime); + read_line("cF'","%lf",&cF_prime); + } + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + 
MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cF_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + set_bc_parms(bc,cG,cG_prime,cF,cF_prime,phi,phi_prime); + + if (append) + check_bc_parms(fdat); + else + write_bc_parms(fdat); +} + + +static void read_schedule(void) +{ + int ie,ir,iw; + stdint_t istd[3]; + + if (my_rank==0) + { + find_section("MD trajectories"); + read_line("nth","%d",&nth); + read_line("ntr","%d",&ntr); + read_line("dtr_log","%d",&dtr_log); + if (noms==0) + read_line("dtr_ms","%d",&dtr_ms); + else + dtr_ms=0; + read_line("dtr_cnfg","%d",&dtr_cnfg); + + error_root((append!=0)&&(nth!=0),1,"read_schedule [qcd1.c]", + "Continuation run: nth must be equal to zero"); + + ie=0; + ie|=(nth<0); + ie|=(ntr<1); + ie|=(dtr_log<1); + ie|=(dtr_log>dtr_cnfg); + ie|=((dtr_cnfg%dtr_log)!=0); + ie|=((nth%dtr_cnfg)!=0); + ie|=((ntr%dtr_cnfg)!=0); + + if (noms==0) + { + ie|=(dtr_msdtr_cnfg); + ie|=((dtr_ms%dtr_log)!=0); + ie|=((dtr_cnfg%dtr_ms)!=0); + } + + error_root(ie!=0,1,"read_schedule [qcd1.c]", + "Improper value of nth,ntr,dtr_log,dtr_ms or dtr_cnfg"); + } + + MPI_Bcast(&nth,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ntr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dtr_log,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dtr_ms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dtr_cnfg,1,MPI_INT,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + if (append) + { + ir=fread(istd,sizeof(stdint_t),3,fdat); + error_root(ir!=3,1,"read_schedule [qcd1.c]", + "Incorrect read count"); + + if (endian==BIG_ENDIAN) + bswap_int(3,istd); + + ie=0; + ie|=(istd[0]!=(stdint_t)(dtr_log)); + ie|=(istd[1]!=(stdint_t)(dtr_ms)); + ie|=(istd[2]!=(stdint_t)(dtr_cnfg)); + + error_root(ie!=0,1,"read_schedule [qcd1.c]", + "Parameters do not match previous run"); + } + else + { + istd[0]=(stdint_t)(dtr_log); + istd[1]=(stdint_t)(dtr_ms); + 
istd[2]=(stdint_t)(dtr_cnfg); + + if (endian==BIG_ENDIAN) + bswap_int(3,istd); + + iw=fwrite(istd,sizeof(stdint_t),3,fdat); + error_root(iw!=3,1,"read_schedule [qcd1.c]", + "Incorrect write count"); + } + } +} + + +static void read_actions(void) +{ + int i,k,l,nact,*iact; + int npf,nlv,nmu; + double tau,*mu; + action_parms_t ap; + rat_parms_t rp; + + if (my_rank==0) + { + find_section("HMC parameters"); + nact=count_tokens("actions"); + read_line("npf","%d",&npf); + read_line("nlv","%d",&nlv); + read_line("tau","%lf",&tau); + } + + MPI_Bcast(&nact,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&npf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nlv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&tau,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + if (nact>0) + { + iact=malloc(nact*sizeof(*iact)); + error(iact==NULL,1,"read_actions [qcd1.c]", + "Unable to allocate temporary array"); + if (my_rank==0) + read_iprms("actions",nact,iact); + MPI_Bcast(iact,nact,MPI_INT,0,MPI_COMM_WORLD); + } + else + iact=NULL; + + nmu=0; + + for (i=0;i0) + { + mu=malloc(nmu*sizeof(*mu)); + error(mu==NULL,1,"read_actions [qcd1.c]", + "Unable to allocate temporary array"); + + if (my_rank==0) + { + find_section("HMC parameters"); + read_dprms("mu",nmu,mu); + } + + MPI_Bcast(mu,nmu,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + mu=NULL; + + hmc=set_hmc_parms(nact,iact,npf,nmu,mu,nlv,tau); + + if (nact>0) + free(iact); + if (nmu>0) + free(mu); + + if (append) + { + check_hmc_parms(fdat); + check_action_parms(fdat); + } + else + { + write_hmc_parms(fdat); + write_action_parms(fdat); + } +} + + +static void read_integrator(void) +{ + int nlv,i,j,k,l; + mdint_parms_t mdp; + force_parms_t fp; + rat_parms_t rp; + + nlv=hmc.nlv; + + for (i=0;i [-noloc] [-noexp] " + "[-rmold] [-noms] [-c [-a [-norng]]]"); + + error_root(endian==UNKNOWN_ENDIAN,1,"read_infile [qcd1.c]", + "Machine has unknown endianness"); + + error_root((noexp)&&(noloc),1,"read_infile [qcd1.c]", + "The concurrent use of -noloc and -noexp is not permitted"); + + if 
(scnfg) + { + strncpy(cnfg,argv[scnfg+1],NAME_SIZE-1); + cnfg[NAME_SIZE-1]='\0'; + } + else + cnfg[0]='\0'; + + fin=freopen(argv[ifile+1],"r",stdin); + error_root(fin==NULL,1,"read_infile [qcd1.c]", + "Unable to open input file"); + } + + MPI_Bcast(&noloc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noexp,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&rmold,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&scnfg,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&append,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&norng,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&endian,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(cnfg,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + find_section("Random number generator"); + read_line("level","%d",&level); + read_line("seed","%d",&seed); + } + + MPI_Bcast(&level,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&seed,1,MPI_INT,0,MPI_COMM_WORLD); + + read_dirs(); + setup_files(); + + if (my_rank==0) + { + if (append) + fdat=fopen(par_file,"rb"); + else + fdat=fopen(par_file,"wb"); + + error_root(fdat==NULL,1,"read_infile [qcd1.c]", + "Unable to open parameter file"); + } + + read_lat_parms(); + read_bc_parms(); + read_schedule(); + read_actions(); + read_integrator(); + read_solvers(); + read_wflow_parms(); + + if (my_rank==0) + { + fclose(fin); + fclose(fdat); + + if (append==0) + copy_file(par_file,par_save); + } +} + + +static void check_old_log(int ic,int *nl,int *icnfg) +{ + int ir,isv; + int np[4],bp[4]; + + fend=fopen(log_file,"r"); + error_root(fend==NULL,1,"check_old_log [qcd1.c]", + "Unable to open log file"); + (*nl)=0; + (*icnfg)=0; + ir=1; + isv=0; + + while (fgets(line,NAME_SIZE,fend)!=NULL) + { + if (strstr(line,"process grid")!=NULL) + { + ir&=(sscanf(line,"%dx%dx%dx%d process grid, %dx%dx%dx%d", + np,np+1,np+2,np+3,bp,bp+1,bp+2,bp+3)==8); + + ipgrd[0]=((np[0]!=NPROC0)||(np[1]!=NPROC1)|| + (np[2]!=NPROC2)||(np[3]!=NPROC3)); + ipgrd[1]=((bp[0]!=NPROC0_BLK)||(bp[1]!=NPROC1_BLK)|| + 
(bp[2]!=NPROC2_BLK)||(bp[3]!=NPROC3_BLK)); + } + else if (strstr(line,"Trajectory no")!=NULL) + { + ir&=(sscanf(line,"Trajectory no %d",nl)==1); + isv=0; + } + else if (strstr(line,"Configuration no")!=NULL) + { + ir&=(sscanf(line,"Configuration no %d",icnfg)==1); + isv=1; + } + } + + fclose(fend); + + error_root(ir!=1,1,"check_old_log [qcd1.c]","Incorrect read count"); + + error_root(ic!=(*icnfg),1,"check_old_log [qcd1.c]", + "Continuation run:\n" + "Initial configuration is not the last one of the previous run"); + + error_root(isv==0,1,"check_old_log [qcd1.c]", + "Continuation run:\n" + "The log file extends beyond the last configuration save"); +} + + +static void check_old_dat(int nl) +{ + int nt; + dat_t ndat; + + fdat=fopen(dat_file,"rb"); + error_root(fdat==NULL,1,"check_old_dat [qcd1.c]", + "Unable to open data file"); + nt=0; + + while (read_dat(1,&ndat)==1) + nt=ndat.nt; + + fclose(fdat); + + error_root(nt!=nl,1,"check_old_dat [qcd1.c]", + "Continuation run: Incomplete or too many data records"); +} + + +static void check_old_msdat(int nl) +{ + int ic,ir,nt,pnt,dnt; + + fdat=fopen(msdat_file,"rb"); + error_root(fdat==NULL,1,"check_old_msdat [qcd1.c]", + "Unable to open data file"); + + check_file_head(); + + nt=0; + dnt=0; + pnt=0; + + for (ic=0;;ic++) + { + ir=read_data(); + + if (ir==0) + { + error_root(ic==0,1,"check_old_msdat [qcd1.c]", + "No data records found"); + break; + } + + nt=data.nt; + + if (ic==1) + { + dnt=nt-pnt; + error_root(dnt<1,1,"check_old_msdat [qcd1.c]", + "Incorrect trajectory separation"); + } + else if (ic>1) + error_root(nt!=(pnt+dnt),1,"check_old_msdat [qcd1.c]", + "Trajectory sequence is not equally spaced"); + + pnt=nt; + } + + fclose(fdat); + + error_root((nt!=nl)||((ic>1)&&(dnt!=dtr_ms)),1, + "check_old_msdat [qcd1.c]","Last trajectory numbers " + "or the trajectory separations do not match"); +} + + +static void check_files(int *nl,int *icnfg) +{ + int icmax,ic; + + ipgrd[0]=0; + ipgrd[1]=0; + + if (my_rank==0) + { + if 
(noloc) + error_root((cnfg[0]!='\0')&&(cnfg[strlen(cnfg)-1]=='*'),1, /* cnfg is empty when -c is absent: do not index before its start */ + "check_files [qcd1.c]","Attempt to read an " + "imported configuration when -noloc is set"); + + if (append) + { + error_root(strstr(cnfg,nbase)!=cnfg,1,"check_files [qcd1.c]", + "Continuation run:\n" + "Run name does not match the previous one"); + error_root(sscanf(cnfg+strlen(nbase),"n%d",&ic)!=1,1, + "check_files [qcd1.c]","Continuation run:\n" + "Unable to read configuration number from file name"); + + check_old_log(ic,nl,icnfg); + check_old_dat(*nl); + if (noms==0) + check_old_msdat(*nl); + + (*icnfg)+=1; + } + else + { + fin=fopen(log_file,"r"); + fdat=fopen(dat_file,"rb"); + + if (noms==0) + fend=fopen(msdat_file,"rb"); + else + fend=NULL; + + error_root((fin!=NULL)||(fdat!=NULL)||(fend!=NULL),1, + "check_files [qcd1.c]", + "Attempt to overwrite old *.log or *.dat file"); + + if (noms==0) + { + fdat=fopen(msdat_file,"wb"); + error_root(fdat==NULL,1,"check_files [qcd1.c]", + "Unable to open measurement data file"); + write_file_head(); + fclose(fdat); + } + + (*nl)=0; + (*icnfg)=1; + } + + icmax=(*icnfg)+(ntr-nth)/dtr_cnfg; + + if (noloc==0) + error_root(name_size("%s/%sn%d_%d",loc_dir,nbase,icmax,NPROC-1)>= + NAME_SIZE,1,"check_files [qcd1.c]", + "loc_dir name is too long"); + + if (noexp==0) + error_root(name_size("%s/%sn%d",cnfg_dir,nbase,icmax)>=NAME_SIZE,1, + "check_files [qcd1.c]","cnfg_dir name is too long"); + + if (scnfg) + { + if (cnfg[strlen(cnfg)-1]=='*') + error_root(name_size("%s/%s%d",loc_dir,cnfg,NPROC-1)>=NAME_SIZE,1, + "check_files [qcd1.c]","loc_dir name is too long"); + else + error_root(name_size("%s/%s",cnfg_dir,cnfg)>=NAME_SIZE,1, + "check_files [qcd1.c]","cnfg_dir name is too long"); + } + } + + MPI_Bcast(nl,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(icnfg,1,MPI_INT,0,MPI_COMM_WORLD); +} + + +static void init_ud(void) +{ + char *p; + + if (scnfg) + { + if (cnfg[strlen(cnfg)-1]!='*') + { + sprintf(cnfg_file,"%s/%s",cnfg_dir,cnfg); + import_cnfg(cnfg_file); + } + else + { + 
sprintf(line,"%s/%s",loc_dir,cnfg); + p=line+strlen(line)-1; + p[0]='\0'; + sprintf(cnfg_file,"%s_%d",line,my_rank); + read_cnfg(cnfg_file); + } + } + else + random_ud(); +} + + +static void init_rng(int icnfg) +{ + int ic; + + if (append) + { + if (cnfg[strlen(cnfg)-1]!='*') + { + if (norng) + start_ranlux(level,seed^(icnfg-1)); + else + { + ic=import_ranlux(rng_file); + error_root(ic!=(icnfg-1),1,"init_rng [qcd1.c]", + "Configuration number mismatch (*.rng file)"); + } + } + } + else + start_ranlux(level,seed); +} + + +static void store_ud(su3_dble *usv) +{ + su3_dble *udb; + + udb=udfld(); + cm3x3_assign(4*VOLUME,udb,usv); +} + + +static void recall_ud(su3_dble *usv) +{ + su3_dble *udb; + + udb=udfld(); + cm3x3_assign(4*VOLUME,usv,udb); + set_flags(UPDATED_UD); +} + + +static void set_data(int nt) +{ + int in,dn,nn; + double eps; + + data.nt=nt; + dn=file_head.dn; + nn=file_head.nn; + eps=file_head.eps; + + for (in=0;in0); + dn=file_head.dn; + nn=file_head.nn; + eps=file_head.eps; + + din=nn/10; + if (din<1) + din=1; + + printf("Measurement run:\n\n"); + + for (in=0;in<=nn;in+=din) + printf("n = %3d, t = %.2e, Wact = %.6e, Yact = %.6e, Q = % .2e\n", + in*dn,eps*(double)(in*dn),Wact[in],Yact[in],Qtop[in]); + + printf("\n"); + printf("Configuration fully processed in %.2e sec ",wtms); + printf("(average = %.2e sec)\n",wtmsall/(double)(nms)); + printf("Measured data saved\n\n"); + fflush(flog); + } +} + + +static void save_cnfg(int icnfg) +{ + int ie; + + ie=check_bc(0.0)^0x1; + ie|=chs_ubnd(1); + error_root(ie!=0,1,"save_cnfg [qcd1.c]","Unexpected boundary values"); + + if (noloc==0) + { + sprintf(cnfg_file,"%s/%sn%d_%d",loc_dir,nbase,icnfg,my_rank); + write_cnfg(cnfg_file); + } + + if (noexp==0) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + export_cnfg(cnfg_file); + } + + if (my_rank==0) + { + if ((noloc==0)&&(noexp==0)) + printf("Configuration no %d saved on the local disks " + "and exported\n\n",icnfg); + else if (noloc==0) + printf("Configuration 
no %d saved on the local disks\n\n",icnfg); + else if (noexp==0) + printf("Configuration no %d exported\n\n",icnfg); + } +} + + +static void check_endflag(int *iend) +{ + if (my_rank==0) + { + fend=fopen(end_file,"r"); + + if (fend!=NULL) + { + fclose(fend); + remove(end_file); + (*iend)=1; + printf("End flag set, run stopped\n\n"); + } + else + (*iend)=0; + } + + MPI_Bcast(iend,1,MPI_INT,0,MPI_COMM_WORLD); +} + + +static void remove_cnfg(int icnfg) +{ + if ((rmold)&&(icnfg>=1)) + { + if (noloc==0) + { + sprintf(cnfg_file,"%s/%sn%d_%d",loc_dir,nbase,icnfg,my_rank); + remove(cnfg_file); + } + + if ((noexp==0)&&(my_rank==0)) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + remove(cnfg_file); + } + } +} + + +int main(int argc,char *argv[]) +{ + int nl,icnfg; + int nwud,nws,nwsd,nwv,nwvd; + int n,iend,iac,i; + double *act0,*act1,w0[2],w1[2],npl,siac; + double wt1,wt2,wtcyc,wtall,wtms,wtmsall; + su3_dble **usv; + dat_t ndat; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + read_infile(argc,argv); + if (noms==0) + alloc_data(); + check_files(&nl,&icnfg); + geometry(); + + hmc_wsize(&nwud,&nws,&nwsd,&nwv,&nwvd); + alloc_wud(nwud); + alloc_ws(nws); + alloc_wsd(nwsd); + alloc_wv(nwv); + alloc_wvd(nwvd); + if ((noms==0)&&(flint)) + alloc_wfd(1); + + act0=malloc(2*(hmc.nact+1)*sizeof(*act0)); + act1=act0+hmc.nact+1; + error(act0==NULL,1,"main [qcd1.c]","Unable to allocate action arrays"); + + print_info(icnfg); + hmc_sanity_check(); + set_mdsteps(); + setup_counters(); + setup_chrono(); + init_ud(); + init_rng(icnfg); + + if (bc_type()==0) + npl=(double)(6*(N0-1)*N1)*(double)(N2*N3); + else + npl=(double)(6*N0*N1)*(double)(N2*N3); + + iend=0; + siac=0.0; + wtcyc=0.0; + wtall=0.0; + wtms=0.0; + wtmsall=0.0; + + for (n=0;(iend==0)&&(n=nth)&&(((ntr-n-1)%dtr_ms)==0)) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + usv=reserve_wud(1); + store_ud(usv[0]); + set_data(nl+n+1); + recall_ud(usv[0]); + release_wud(); + + 
MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + + wtms=wt2-wt1; + wtmsall+=wtms; + save_msdat(n,wtms,wtmsall); + } + } + + if (((n+1)>=nth)&&(((ntr-n-1)%dtr_cnfg)==0)) + { + save_cnfg(icnfg); + export_ranlux(icnfg,rng_file); + check_endflag(&iend); + error_chk(); + + if (my_rank==0) + { + fflush(flog); + copy_file(log_file,log_save); + copy_file(dat_file,dat_save); + if (noms==0) + copy_file(msdat_file,msdat_save); + copy_file(rng_file,rng_save); + } + + remove_cnfg(icnfg-1); + icnfg+=1; + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/qcd1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/qcd1.in new file mode 100644 index 0000000000000000000000000000000000000000..303c84de3d0ab82aadc3964366c51c21925eb9a9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/qcd1.in @@ -0,0 +1,145 @@ + +[Run name] +name Snoopy137 + +[Directories] +log_dir ../data/qcd1/log +dat_dir ../data/qcd1/dat +loc_dir /ndata/qcd1/cnfg +cnfg_dir /data/qcd1/cnfg + +[Random number generator] +level 0 +seed 73099 + +[Lattice parameters] +beta 6.0 +c0 1.6667 +kappa 0.1300 +csw 1.234 + +[Boundary conditions] +type 2 +phi 0.12 -0.56 +phi' 0.92 0.76 +cG 1.10 +cG' 1.05 +cF 0.95 +cF' 0.90 + +[HMC parameters] +actions 0 1 2 +npf 2 +mu 0.01 1.0 +nlv 3 +tau 0.5 + +[MD trajectories] +nth 320 +ntr 32000 +dtr_log 4 +dtr_ms 8 +dtr_cnfg 32 + +[Level 0] +integrator OMF4 +nstep 1 +forces 0 + +[Level 1] +integrator OMF2 +lambda 0.2 +nstep 2 +forces 1 + +[Level 2] +integrator LPFR +nstep 6 +forces 2 + +[Action 0] +action ACG + +[Action 1] +action ACF_TM1 +ipf 0 +im0 0 +imu 1 +isp 0 + +[Action 2] +action ACF_TM2 +ipf 1 +im0 0 +imu 0 1 +isp 1 0 + +[Force 0] +force FRG + +[Force 1] +force FRF_TM1 +isp 2 +ncr 4 + +[Force 2] +force FRF_TM2 +isp 3 +ncr 0 + +[Solver 0] +solver CGNE +nmx 256 +res 1.0e-10 + +[Solver 1] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 
+ncy 5 +nmx 24 +res 1.0e-10 + +[Solver 2] +solver CGNE +nmx 256 +res 1.0e-8 + +[Solver 3] +solver DFL_SAP_GCR +nkv 16 +isolv 1 +nmr 4 +ncy 5 +nmx 24 +res 1.0e-8 + +[SAP] +bs 4 4 4 4 + +[Deflation subspace] +bs 4 4 4 4 +Ns 28 + +[Deflation subspace generation] +kappa 0.1350 +mu 0.01 +ninv 5 +nmr 4 +ncy 5 + +[Deflation projection] +nkv 24 +nmx 512 +res 1.0e-2 + +[Deflation update scheme] +dtau 0.3 +nsm 1 + +[Wilson flow] +integrator RK3 +eps 2.0e-2 +nstep 100 +dnms 10 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ym1.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ym1.c new file mode 100644 index 0000000000000000000000000000000000000000..676122ea15fea2d4c75922e720e7c9449055e087 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ym1.c @@ -0,0 +1,1639 @@ + +/******************************************************************************* +* +* File ym1.c +* +* Copyright (C) 2010-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* HMC simulation program for the SU(3) gauge theory. +* +* Syntax: ym1 -i [-noloc] [-noexp] [-rmold] [-noms] +* [-c [-a [-norng]]] +* +* For usage instructions see the file README.ym1. 
+* +*******************************************************************************/ + +#define MAIN_PROGRAM + +#include +#include +#include +#include +#include "mpi.h" +#include "flags.h" +#include "random.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "forces.h" +#include "update.h" +#include "wflow.h" +#include "tcharge.h" +#include "version.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +typedef struct +{ + int nt,iac; + double dH,avpl; +} dat_t; + +static struct +{ + int dn,nn,tmax; + double eps; +} file_head; + +static struct +{ + int nt; + double **Wsl,**Ysl,**Qsl; +} data; + +static int my_rank,noloc,noexp,rmold,noms,norng; +static int scnfg,append,endian; +static int level,seed; +static int nth,ntr,dtr_log,dtr_ms,dtr_cnfg; +static int ipgrd[2],flint; +static double *Wact,*Yact,*Qtop; + +static char line[NAME_SIZE]; +static char log_dir[NAME_SIZE],dat_dir[NAME_SIZE]; +static char loc_dir[NAME_SIZE],cnfg_dir[NAME_SIZE]; +static char log_file[NAME_SIZE],log_save[NAME_SIZE]; +static char par_file[NAME_SIZE],par_save[NAME_SIZE]; +static char dat_file[NAME_SIZE],dat_save[NAME_SIZE]; +static char msdat_file[NAME_SIZE],msdat_save[NAME_SIZE]; +static char rng_file[NAME_SIZE],rng_save[NAME_SIZE]; +static char cnfg_file[NAME_SIZE],end_file[NAME_SIZE]; +static char nbase[NAME_SIZE],cnfg[NAME_SIZE]; +static FILE *fin=NULL,*flog=NULL,*fdat=NULL,*fend=NULL; + +static lat_parms_t lat; +static bc_parms_t bcp; +static hmc_parms_t hmc; + + +static int write_dat(int n,dat_t *ndat) +{ + int i,iw,ic; + stdint_t istd[2]; + double dstd[2]; + + ic=0; + + for (i=0;i=NAME_SIZE,1, + "setup_files [ym1.c]","log_dir name is too long"); + error_root(name_size("%s/%s.ms.dat~",dat_dir,nbase)>=NAME_SIZE,1, + "setup_files [ym1.c]","dat_dir name is too long"); + + sprintf(log_file,"%s/%s.log",log_dir,nbase); + 
sprintf(par_file,"%s/%s.par",dat_dir,nbase); + sprintf(dat_file,"%s/%s.dat",dat_dir,nbase); + sprintf(msdat_file,"%s/%s.ms.dat",dat_dir,nbase); + sprintf(rng_file,"%s/%s.rng",dat_dir,nbase); + sprintf(end_file,"%s/%s.end",log_dir,nbase); + sprintf(log_save,"%s~",log_file); + sprintf(par_save,"%s~",par_file); + sprintf(dat_save,"%s~",dat_file); + sprintf(msdat_save,"%s~",msdat_file); + sprintf(rng_save,"%s~",rng_file); +} + + +static void read_lat_parms(void) +{ + double beta,c0; + + if (my_rank==0) + { + find_section("Lattice parameters"); + read_line("beta","%lf",&beta); + read_line("c0","%lf",&c0); + } + + MPI_Bcast(&beta,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&c0,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + lat=set_lat_parms(beta,c0,0,NULL,1.0); + + if (append) + check_lat_parms(fdat); + else + write_lat_parms(fdat); +} + + +static void read_bc_parms(void) +{ + int bc; + double cG,cG_prime; + double phi[2],phi_prime[2]; + + if (my_rank==0) + { + find_section("Boundary conditions"); + read_line("type","%d",&bc); + + phi[0]=0.0; + phi[1]=0.0; + phi_prime[0]=0.0; + phi_prime[1]=0.0; + cG=1.0; + cG_prime=1.0; + + if (bc==1) + read_dprms("phi",2,phi); + + if ((bc==1)||(bc==2)) + read_dprms("phi'",2,phi_prime); + + if (bc!=3) + read_line("cG","%lf",&cG); + + if (bc==2) + read_line("cG'","%lf",&cG_prime); + } + + MPI_Bcast(&bc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(phi,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(phi_prime,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&cG_prime,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + bcp=set_bc_parms(bc,cG,cG_prime,1.0,1.0,phi,phi_prime); + + if (append) + check_bc_parms(fdat); + else + write_bc_parms(fdat); +} + + +static void read_hmc_parms(void) +{ + int iact[1]; + double tau; + + if (my_rank==0) + { + find_section("Trajectory length"); + read_line("tau","%lf",&tau); + } + + MPI_Bcast(&tau,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + iact[0]=0; + hmc=set_hmc_parms(1,iact,0,0,NULL,1,tau); + + if (append) + 
check_hmc_parms(fdat); + else + write_hmc_parms(fdat); +} + + +static void read_integrator(void) +{ + int nstep,imd,ifr[1]; + double lambda; + + if (my_rank==0) + { + find_section("MD integrator"); + read_line("integrator","%s",line); + lambda=0.0; + + if (strcmp(line,"LPFR")==0) + imd=(int)(LPFR); + else if (strcmp(line,"OMF2")==0) + { + imd=(int)(OMF2); + read_line("lambda","%lf",&lambda); + } + else if (strcmp(line,"OMF4")==0) + imd=(int)(OMF4); + else + error_root(1,1,"read_integrator [ym1.c]","Unknown integrator"); + + read_line("nstep","%d",&nstep); + } + + MPI_Bcast(&imd,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&lambda,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + MPI_Bcast(&nstep,1,MPI_INT,0,MPI_COMM_WORLD); + + ifr[0]=0; + + if (imd==(int)(LPFR)) + set_mdint_parms(0,LPFR,lambda,nstep,1,ifr); + else if (imd==(int)(OMF2)) + set_mdint_parms(0,OMF2,lambda,nstep,1,ifr); + else if (imd==(int)(OMF4)) + set_mdint_parms(0,OMF4,lambda,nstep,1,ifr); + + set_action_parms(0,ACG,0,0,NULL,NULL,NULL); + set_force_parms(0,FRG,0,0,NULL,NULL,NULL,NULL); + + if (append) + { + check_mdint_parms(fdat); + check_action_parms(fdat); + check_force_parms(fdat); + } + else + { + write_mdint_parms(fdat); + write_action_parms(fdat); + write_force_parms(fdat); + } +} + + +static void read_schedule(void) +{ + int ie,ir,iw; + stdint_t istd[3]; + + if (my_rank==0) + { + find_section("MD trajectories"); + read_line("nth","%d",&nth); + read_line("ntr","%d",&ntr); + read_line("dtr_log","%d",&dtr_log); + if (noms==0) + read_line("dtr_ms","%d",&dtr_ms); + else + dtr_ms=0; + read_line("dtr_cnfg","%d",&dtr_cnfg); + + error_root((append!=0)&&(nth!=0),1,"read_schedule [ym1.c]", + "Continuation run: nth must be equal to zero"); + + ie=0; + ie|=(nth<0); + ie|=(ntr<1); + ie|=(dtr_log<1); + ie|=(dtr_log>dtr_cnfg); + ie|=((dtr_cnfg%dtr_log)!=0); + ie|=((nth%dtr_cnfg)!=0); + ie|=((ntr%dtr_cnfg)!=0); + + if (noms==0) + { + ie|=(dtr_msdtr_cnfg); + ie|=((dtr_ms%dtr_log)!=0); + ie|=((dtr_cnfg%dtr_ms)!=0); + } + + 
error_root(ie!=0,1,"read_schedule [ym1.c]", + "Improper value of nth,ntr,dtr_log,dtr_ms or dtr_cnfg"); + } + + MPI_Bcast(&nth,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ntr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dtr_log,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dtr_ms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&dtr_cnfg,1,MPI_INT,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + if (append) + { + ir=fread(istd,sizeof(stdint_t),3,fdat); + error_root(ir!=3,1,"read_schedule [ym1.c]", + "Incorrect read count"); + + if (endian==BIG_ENDIAN) + bswap_int(3,istd); + + ie=0; + ie|=(istd[0]!=(stdint_t)(dtr_log)); + ie|=(istd[1]!=(stdint_t)(dtr_ms)); + ie|=(istd[2]!=(stdint_t)(dtr_cnfg)); + + error_root(ie!=0,1,"read_schedule [ym1.c]", + "Parameters do not match previous run"); + } + else + { + istd[0]=(stdint_t)(dtr_log); + istd[1]=(stdint_t)(dtr_ms); + istd[2]=(stdint_t)(dtr_cnfg); + + if (endian==BIG_ENDIAN) + bswap_int(3,istd); + + iw=fwrite(istd,sizeof(stdint_t),3,fdat); + error_root(iw!=3,1,"read_schedule [ym1.c]", + "Incorrect write count"); + } + } +} + + +static void read_wflow_parms(void) +{ + int nstep,dnms,ie,ir,iw; + stdint_t istd[3]; + double eps,dstd[1]; + + if (my_rank==0) + { + if (append) + { + ir=fread(istd,sizeof(stdint_t),1,fdat); + error_root(ir!=1,1,"read_wflow_parms [ym1.c]", + "Incorrect read count"); + + if (endian==BIG_ENDIAN) + bswap_int(1,istd); + + error_root(istd[0]!=(stdint_t)(noms==0),1,"read_wflow_parms [ym1.c]", + "Attempt to mix measurement with other runs"); + } + else + { + istd[0]=(stdint_t)(noms==0); + + if (endian==BIG_ENDIAN) + bswap_int(1,istd); + + iw=fwrite(istd,sizeof(stdint_t),1,fdat); + error_root(iw!=1,1,"read_wflow_parms [ym1.c]", + "Incorrect write count"); + } + + if (noms==0) + { + find_section("Wilson flow"); + read_line("integrator","%s",line); + read_line("eps","%lf",&eps); + read_line("nstep","%d",&nstep); + read_line("dnms","%d",&dnms); + + if (strcmp(line,"EULER")==0) + flint=0; + else if (strcmp(line,"RK2")==0) + flint=1; 
+ else if (strcmp(line,"RK3")==0) + flint=2; + else + error_root(1,1,"read_wflow_parms [ym1.c]","Unkown integrator"); + + error_root((dnms<1)||(nstep [-noloc] [-noexp] " + "[-rmold] [-noms] [-c [-a [-norng]]]"); + + error_root(endian==UNKNOWN_ENDIAN,1,"read_infile [ym1.c]", + "Machine has unknown endianness"); + + error_root((noexp)&&(noloc),1,"read_infile [ym1.c]", + "The concurrent use of -noloc and -noexp is not permitted"); + + if (scnfg) + { + strncpy(cnfg,argv[scnfg+1],NAME_SIZE-1); + cnfg[NAME_SIZE-1]='\0'; + } + else + cnfg[0]='\0'; + + fin=freopen(argv[ifile+1],"r",stdin); + error_root(fin==NULL,1,"read_infile [ym1.c]", + "Unable to open input file"); + } + + MPI_Bcast(&noloc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noexp,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&rmold,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&noms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&scnfg,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&append,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&norng,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&endian,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(cnfg,NAME_SIZE,MPI_CHAR,0,MPI_COMM_WORLD); + + if (my_rank==0) + { + find_section("Random number generator"); + read_line("level","%d",&level); + read_line("seed","%d",&seed); + } + + MPI_Bcast(&level,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&seed,1,MPI_INT,0,MPI_COMM_WORLD); + + read_dirs(); + setup_files(); + + if (my_rank==0) + { + if (append) + fdat=fopen(par_file,"rb"); + else + fdat=fopen(par_file,"wb"); + + error_root(fdat==NULL,1,"read_infile [ym1.c]", + "Unable to open parameter file"); + } + + read_lat_parms(); + read_bc_parms(); + read_hmc_parms(); + read_schedule(); + read_integrator(); + read_wflow_parms(); + + if (my_rank==0) + { + fclose(fin); + fclose(fdat); + + if (append==0) + copy_file(par_file,par_save); + } +} + + +static void check_old_log(int ic,int *nl,int *icnfg) +{ + int ir,isv; + int np[4],bp[4]; + + fend=fopen(log_file,"r"); + error_root(fend==NULL,1,"check_old_log [ym1.c]", + "Unable 
to open log file"); + (*nl)=0; + (*icnfg)=0; + ir=1; + isv=0; + + while (fgets(line,NAME_SIZE,fend)!=NULL) + { + if (strstr(line,"process grid")!=NULL) + { + ir&=(sscanf(line,"%dx%dx%dx%d process grid, %dx%dx%dx%d", + np,np+1,np+2,np+3,bp,bp+1,bp+2,bp+3)==8); + + ipgrd[0]=((np[0]!=NPROC0)||(np[1]!=NPROC1)|| + (np[2]!=NPROC2)||(np[3]!=NPROC3)); + ipgrd[1]=((bp[0]!=NPROC0_BLK)||(bp[1]!=NPROC1_BLK)|| + (bp[2]!=NPROC2_BLK)||(bp[3]!=NPROC3_BLK)); + } + else if (strstr(line,"Trajectory no")!=NULL) + { + ir&=(sscanf(line,"Trajectory no %d",nl)==1); + isv=0; + } + else if (strstr(line,"Configuration no")!=NULL) + { + ir&=(sscanf(line,"Configuration no %d",icnfg)==1); + isv=1; + } + } + + fclose(fend); + + error_root(ir!=1,1,"check_old_log [ym1.c]","Incorrect read count"); + + error_root(ic!=(*icnfg),1,"check_old_log [ym1.c]", + "Continuation run:\n" + "Initial configuration is not the last one of the previous run"); + + error_root(isv==0,1,"check_old_log [ym1.c]", + "Continuation run:\n" + "The log file extends beyond the last configuration save"); +} + + +static void check_old_dat(int nl) +{ + int nt; + dat_t ndat; + + fdat=fopen(dat_file,"rb"); + error_root(fdat==NULL,1,"check_old_dat [ym1.c]", + "Unable to open data file"); + nt=0; + + while (read_dat(1,&ndat)==1) + nt=ndat.nt; + + fclose(fdat); + + error_root(nt!=nl,1,"check_old_dat [ym1.c]", + "Continuation run: Incomplete or too many data records"); +} + + +static void check_old_msdat(int nl) +{ + int ic,ir,nt,pnt,dnt; + + fdat=fopen(msdat_file,"rb"); + error_root(fdat==NULL,1,"check_old_msdat [ym1.c]", + "Unable to open data file"); + + check_file_head(); + + nt=0; + dnt=0; + pnt=0; + + for (ic=0;;ic++) + { + ir=read_data(); + + if (ir==0) + { + error_root(ic==0,1,"check_old_msdat [ym1.c]", + "No data records found"); + break; + } + + nt=data.nt; + + if (ic==1) + { + dnt=nt-pnt; + error_root(dnt<1,1,"check_old_msdat [ym1.c]", + "Incorrect trajectory separation"); + } + else if (ic>1) + 
error_root(nt!=(pnt+dnt),1,"check_old_msdat [ym1.c]", + "Trajectory sequence is not equally spaced"); + + pnt=nt; + } + + fclose(fdat); + + error_root((nt!=nl)||((ic>1)&&(dnt!=dtr_ms)),1, + "check_old_msdat [ym1.c]","Last trajectory numbers " + "or the trajectory separations do not match"); +} + + +static void check_files(int *nl,int *icnfg) +{ + int icmax,ic; + + ipgrd[0]=0; + ipgrd[1]=0; + + if (my_rank==0) + { + if (noloc) + error_root(cnfg[strlen(cnfg)-1]=='*',1, + "check_files [ym1.c]","Attempt to read an " + "imported configuration when -noloc is set"); + + if (append) + { + error_root(strstr(cnfg,nbase)!=cnfg,1,"check_files [ym1.c]", + "Continuation run:\n" + "Run name does not match the previous one"); + error_root(sscanf(cnfg+strlen(nbase),"n%d",&ic)!=1,1, + "check_files [ym1.c]","Continuation run:\n" + "Unable to read configuration number from file name"); + + check_old_log(ic,nl,icnfg); + check_old_dat(*nl); + if (noms==0) + check_old_msdat(*nl); + + (*icnfg)+=1; + } + else + { + fin=fopen(log_file,"r"); + fdat=fopen(dat_file,"rb"); + + if (noms==0) + fend=fopen(msdat_file,"rb"); + else + fend=NULL; + + error_root((fin!=NULL)||(fdat!=NULL)||(fend!=NULL),1, + "check_files [ym1.c]", + "Attempt to overwrite old *.log or *.dat file"); + + if (noms==0) + { + fdat=fopen(msdat_file,"wb"); + error_root(fdat==NULL,1,"check_files [ym1.c]", + "Unable to open measurement data file"); + write_file_head(); + fclose(fdat); + } + + (*nl)=0; + (*icnfg)=1; + } + + icmax=(*icnfg)+(ntr-nth)/dtr_cnfg; + + if (noloc==0) + error_root(name_size("%s/%sn%d_%d",loc_dir,nbase,icmax,NPROC-1)>= + NAME_SIZE,1,"check_files [ym1.c]", + "loc_dir name is too long"); + + if (noexp==0) + error_root(name_size("%s/%sn%d",cnfg_dir,nbase,icmax)>=NAME_SIZE,1, + "check_files [ym1.c]","cnfg_dir name is too long"); + + if (scnfg) + { + if (cnfg[strlen(cnfg)-1]=='*') + error_root(name_size("%s/%s%d",loc_dir,cnfg,NPROC-1)>=NAME_SIZE,1, + "check_files [ym1.c]","loc_dir name is too long"); + else + 
error_root(name_size("%s/%s",cnfg_dir,cnfg)>=NAME_SIZE,1, + "check_files [ym1.c]","cnfg_dir name is too long"); + } + } + + MPI_Bcast(nl,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(icnfg,1,MPI_INT,0,MPI_COMM_WORLD); +} + + +static void init_ud(void) +{ + char *p; + + if (scnfg) + { + if (cnfg[strlen(cnfg)-1]!='*') + { + sprintf(cnfg_file,"%s/%s",cnfg_dir,cnfg); + import_cnfg(cnfg_file); + } + else + { + sprintf(line,"%s/%s",loc_dir,cnfg); + p=line+strlen(line)-1; + p[0]='\0'; + sprintf(cnfg_file,"%s_%d",line,my_rank); + read_cnfg(cnfg_file); + } + } + else + random_ud(); +} + + +static void init_rng(int icnfg) +{ + int ic; + + if (append) + { + if (cnfg[strlen(cnfg)-1]!='*') + { + if (norng) + start_ranlux(level,seed^(icnfg-1)); + else + { + ic=import_ranlux(rng_file); + error_root(ic!=(icnfg-1),1,"init_rng [ym1.c]", + "Configuration number mismatch (*.rng file)"); + } + } + } + else + start_ranlux(level,seed); +} + + +static void store_ud(su3_dble *usv) +{ + su3_dble *udb; + + udb=udfld(); + cm3x3_assign(4*VOLUME,udb,usv); +} + + +static void recall_ud(su3_dble *usv) +{ + su3_dble *udb; + + udb=udfld(); + cm3x3_assign(4*VOLUME,usv,udb); + set_flags(UPDATED_UD); +} + + +static void set_data(int nt) +{ + int in,dn,nn; + double eps; + + data.nt=nt; + dn=file_head.dn; + nn=file_head.nn; + eps=file_head.eps; + + for (in=0;in0); + dn=file_head.dn; + nn=file_head.nn; + eps=file_head.eps; + + din=nn/10; + if (din<1) + din=1; + + printf("Measurement run:\n\n"); + + for (in=0;in<=nn;in+=din) + printf("n = %3d, t = %.2e, Wact = %.6e, Yact = %.6e, Q = % .2e\n", + in*dn,eps*(double)(in*dn),Wact[in],Yact[in],Qtop[in]); + + printf("\n"); + printf("Configuration fully processed in %.2e sec ",wtms); + printf("(average = %.2e sec)\n",wtmsall/(double)(nms)); + printf("Measured data saved\n\n"); + fflush(flog); + } +} + + +static void save_cnfg(int icnfg) +{ + int ie; + + ie=check_bc(0.0)^0x1; + ie|=chs_ubnd(1); + error_root(ie!=0,1,"save_cnfg [ym1.c]","Unexpected boundary values"); + + 
if (noloc==0) + { + sprintf(cnfg_file,"%s/%sn%d_%d",loc_dir,nbase,icnfg,my_rank); + write_cnfg(cnfg_file); + } + + if (noexp==0) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + export_cnfg(cnfg_file); + } + + if (my_rank==0) + { + if ((noloc==0)&&(noexp==0)) + printf("Configuration no %d saved on the local disks " + "and exported\n\n",icnfg); + else if (noloc==0) + printf("Configuration no %d saved on the local disks\n\n",icnfg); + else if (noexp==0) + printf("Configuration no %d exported\n\n",icnfg); + } +} + + +static void check_endflag(int *iend) +{ + if (my_rank==0) + { + fend=fopen(end_file,"r"); + + if (fend!=NULL) + { + fclose(fend); + remove(end_file); + (*iend)=1; + printf("End flag set, run stopped\n\n"); + } + else + (*iend)=0; + } + + MPI_Bcast(iend,1,MPI_INT,0,MPI_COMM_WORLD); +} + + +static void remove_cnfg(int icnfg) +{ + if ((rmold)&&(icnfg>=1)) + { + if (noloc==0) + { + sprintf(cnfg_file,"%s/%sn%d_%d",loc_dir,nbase,icnfg,my_rank); + remove(cnfg_file); + } + + if ((noexp==0)&&(my_rank==0)) + { + sprintf(cnfg_file,"%s/%sn%d",cnfg_dir,nbase,icnfg); + remove(cnfg_file); + } + } +} + + +int main(int argc,char *argv[]) +{ + int nl,icnfg; + int n,iend,iac; + double act0[2],act1[2],w0[2],w1[2],npl,siac; + double wt1,wt2,wtcyc,wtall,wtms,wtmsall; + su3_dble **usv; + dat_t ndat; + + MPI_Init(&argc,&argv); + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + read_infile(argc,argv); + if (noms==0) + alloc_data(); + check_files(&nl,&icnfg); + geometry(); + alloc_wud(1); + + if (noms==0) + { + alloc_data(); + + if (flint) + alloc_wfd(1); + } + + print_info(icnfg); + set_mdsteps(); + init_ud(); + init_rng(icnfg); + + if (bc_type()==0) + npl=(double)(6*(N0-1)*N1)*(double)(N2*N3); + else + npl=(double)(6*N0*N1)*(double)(N2*N3); + + iend=0; + siac=0.0; + wtcyc=0.0; + wtall=0.0; + wtms=0.0; + wtmsall=0.0; + + for (n=0;(iend==0)&&(n=nth)&&(((ntr-n-1)%dtr_ms)==0)) + { + MPI_Barrier(MPI_COMM_WORLD); + wt1=MPI_Wtime(); + + usv=reserve_wud(1); + store_ud(usv[0]); + 
set_data(nl+n+1); + recall_ud(usv[0]); + release_wud(); + + MPI_Barrier(MPI_COMM_WORLD); + wt2=MPI_Wtime(); + + wtms=wt2-wt1; + wtmsall+=wtms; + save_msdat(n,wtms,wtmsall); + } + } + + if (((n+1)>=nth)&&(((ntr-n-1)%dtr_cnfg)==0)) + { + save_cnfg(icnfg); + export_ranlux(icnfg,rng_file); + check_endflag(&iend); + error_chk(); + + if (my_rank==0) + { + fflush(flog); + copy_file(log_file,log_save); + copy_file(dat_file,dat_save); + if (noms==0) + copy_file(msdat_file,msdat_save); + copy_file(rng_file,rng_save); + } + + remove_cnfg(icnfg-1); + icnfg+=1; + } + } + + if (my_rank==0) + fclose(flog); + + MPI_Finalize(); + exit(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ym1.in b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ym1.in new file mode 100644 index 0000000000000000000000000000000000000000..82e0c96af78674f86c9781d16477a84354e57c74 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/main/ym1.in @@ -0,0 +1,45 @@ + +[Run name] +name Snoopy137 + +[Directories] +log_dir ../data/ym1/log +dat_dir ../data/ym1/dat +loc_dir /ndata/ym1/cnfg +cnfg_dir /data/ym1/cnfg + +[Lattice parameters] +beta 6.00 +c0 1.6667 + +[Boundary conditions] +type 2 +phi 0.12 -0.56 +phi' 0.92 0.76 +cG 1.10 +cG' 1.05 + +[Random number generator] +level 0 +seed 73099 + +[Trajectory length] +tau 3.0 + +[MD integrator] +integrator OMF4 +lambda 0.19 +nstep 16 + +[MD trajectories] +nth 320 +ntr 32000 +dtr_log 4 +dtr_ms 8 +dtr_cnfg 32 + +[Wilson flow] +integrator RK3 +eps 2.0e-2 +nstep 100 +dnms 10 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/INDEX b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/INDEX new file mode 100644 index 0000000000000000000000000000000000000000..a037d62153c2163b5e1c6ad8d2a7a4f320b6f22a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/INDEX @@ -0,0 +1,68 @@ + 
+******************************************************************************** + + Module Directories + +******************************************************************************** + +archive Import and export programs for the double-precision + gauge and quark fields. + +block Definition of blocks and block grids. + +dirac Programs for the O(a)-improved Wilson-Dirac operator. + +dfl Deflation subspace generation and deflated SAP+GCR + solver. + +flags Flags and parameter data base. + +forces Molecular-dynamics forces and associated actions. + +lattice Lattice geometry and programs implementing the boundary + conditions in time. + +linalg Generic linear algebra programs for spinor fields, + fields with values in the Lie algebra of SU(3) and + complex scalar fields. + +linsolv Generic Krylov-space solvers. + +little Computation and action of the little Dirac operator + (= restriction of the Wilson-Dirac operator to the + deflation subspace). + +mdflds Allocation of the fundamental momentum, force + and pseudo-fermion fields. + +nompi Programs used in non-MPI check and analysis programs. + +random Random number generator, Gaussian random numbers, + initialization of ranlux. + +ratfcts Rational function data base. + +sap Schwarz alternating procedure and SAP+GCR solver. + +sflds Generic initialization and assignment programs for + spinor fields. + +su3fcts Collection of 3x3 matrix functions. + +sw_term Computation of the Sheikholeslami-Wohlert term. + +tcharge Symmetric field tensor and topological charge. + +uflds Allocation of the fundamental gauge fields. + +update Molecular-dynamics integration, HMC algorithm and + reweighting factors. + +utils Utility programs: aligned allocation, error functions, + endianness functions, functions needed to read input + files, workspace allocation. + +vflds Generic initialization and assignment programs for + complex scalar fields. + +wflow Integration of the (Wilson) gradient flow.
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/README new file mode 100644 index 0000000000000000000000000000000000000000..e3bbb506143854b2d27d96c894eed0f025645980 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/README @@ -0,0 +1,69 @@ + +******************************************************************************** + + I/O functions for field configurations + +******************************************************************************** + + +Files +----- + +archive.c Programs to read and write gauge-field configurations. + +sarchive.c Programs to read and write global double-precision spinor + fields. + +Include file +------------ + +The file archive.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + + +List of functions +----------------- + +void write_cnfg(char *out) + Writes the lattice sizes, the process grid sizes, the coordinates + of the calling process, the state of the random number generators, + the local plaquette sum and the local double-precision gauge field + to the file "out". + +void read_cnfg(char *in) + Reads the local double-precision gauge field from the file "in", + assuming it was written to the file by the program write_cnfg(). + The program then resets the random number generator and checks + that the restored field is compatible with the chosen boundary + conditions. + +void export_cnfg(char *out) + Writes the lattice sizes and the global double-precision gauge + field to the file "out" from process 0 in the universal format + specified below (see the notes). + +void import_cnfg(char *in) + Reads the global double-precision gauge field from the file "in" + on process 0, assuming the field was written to the file in the + universal format. 
The field is periodically extended if needed + and the program then checks that the configuration is compatible + with the chosen boundary conditions (see the notes). + +void write_sfld(char *out,spinor_dble *sd) + Writes the lattice sizes, the process grid sizes, the coordinates + of the calling process, the square of the norm of the spinor field + sd and the local part of the latter to the file "out". + +void read_sfld(char *in,spinor_dble *sd) + Reads the local part of the spinor field sd from the file "in", + assuming the field was written to the file by write_sfld(). + +void export_sfld(char *out,spinor_dble *sd) + Writes the lattice sizes and the spinor field sd to the file "out" + from process 0 in the universal format specified below (see the + notes). + +void import_sfld(char *in,spinor_dble *sd) + Reads the spinor field sd from the file "in" on process 0, assuming + the field was written to the file in the universal format (see the + notes). diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/archive.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/archive.c new file mode 100644 index 0000000000000000000000000000000000000000..925af77674a10bbbbd903935e06619361632bc7c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/archive.c @@ -0,0 +1,597 @@ + +/******************************************************************************* +* +* File archive.c +* +* Copyright (C) 2005, 2007, 2009-2014 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Programs to read and write gauge-field configurations. 
+* +* The externally accessible functions are +* +* void write_cnfg(char *out) +* Writes the lattice sizes, the process grid sizes, the coordinates +* of the calling process, the state of the random number generators, +* the local plaquette sum and the local double-precision gauge field +* to the file "out". +* +* void read_cnfg(char *in) +* Reads the local double-precision gauge field from the file "in", +* assuming it was written to the file by the program write_cnfg(). +* The program then resets the random number generator and checks +* that the restored field is compatible with the chosen boundary +* conditions. +* +* void export_cnfg(char *out) +* Writes the lattice sizes and the global double-precision gauge +* field to the file "out" from process 0 in the universal format +* specified below (see the notes). +* +* void import_cnfg(char *in) +* Reads the global double-precision gauge field from the file "in" +* on process 0, assuming the field was written to the file in the +* universal format. The field is periodically extended if needed +* and the program then checks that the configuration is compatible +* with the chosen boundary conditions (see the notes). +* +* Notes: +* +* The program export_cnfg() first writes the lattice sizes and the average +* of the plaquette Re(tr{U(p)}) to the output file. Then follow the 8 link +* variables in the directions +0,-0,...,+3,-3 at the first odd point, the +* second odd point, and so on. 
The order of the point (x0,x1,x2,x3) with +* Cartesian coordinates in the range 0<=x0 +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "random.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "archive.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int endian,ns,nd,*state=NULL; +static su3_dble *ubuf=NULL,*vbuf,*udb; + + +static void alloc_state(void) +{ + int n; + + ns=rlxs_size(); + nd=rlxd_size(); + + if (nseps); + set_bc(); + plaq1=plaq_sum_dble(0)/nplaq; + ie|=(fabs(plaq1-plaq0)>eps); + error_loc(ie!=0,1,"read_cnfg [archive.c]", + "Incorrect average plaquette"); + error_chk(); +} + + +static void check_machine(void) +{ + error_root(sizeof(stdint_t)!=4,1,"check_machine [archive.c]", + "Size of a stdint_t integer is not 4"); + error_root(sizeof(double)!=8,1,"check_machine [archive.c]", + "Size of a double is not 8"); + + endian=endianness(); + error_root(endian==UNKNOWN_ENDIAN,1,"check_machine [archive.c]", + "Unkown endianness"); +} + + +static void alloc_ubuf(int my_rank) +{ + if (my_rank==0) + { + ubuf=amalloc(4*(L3+N3)*sizeof(su3_dble),ALIGN); + vbuf=ubuf+4*L3; + } + else + { + ubuf=amalloc(4*L3*sizeof(su3_dble),ALIGN); + vbuf=NULL; + } + + error(ubuf==NULL,1,"alloc_ubuf [archive.c]", + "Unable to allocate auxiliary array"); +} + + +static void get_links(int iy) +{ + int y3,ifc; + su3_dble *u,*v; + + v=ubuf; + iy*=L3; + + if (ipt[iy]<(VOLUME/2)) + iy+=1; + + for (y3=0;y30) + { + if (my_rank==0) + { + MPI_Send(&dmy,1,MPI_INT,n,tag0,MPI_COMM_WORLD); + MPI_Recv(ubuf,4*L3*18,MPI_DOUBLE,n,tag1,MPI_COMM_WORLD,&stat); + } + else if (my_rank==n) + { + MPI_Recv(&dmy,1,MPI_INT,0,tag0,MPI_COMM_WORLD,&stat); + MPI_Send(ubuf,4*L3*18,MPI_DOUBLE,0,tag1,MPI_COMM_WORLD); + } + } + + if (my_rank==0) + { + if (endian==BIG_ENDIAN) + bswap_double(4*L3*18,ubuf); + iw=fwrite(ubuf,sizeof(su3_dble),4*L3,fout); + 
iwa|=(iw!=(4*L3)); + } + } + } + + if (my_rank==0) + { + error_root(iwa!=0,1,"export_cnfg [archive.c]", + "Incorrect write count"); + fclose(fout); + } +} + + +void import_cnfg(char *in) +{ + int my_rank,np[4],ir,ie; + int ira,dmy,tag0,tag1; + int n0,n1,n2,n3,nc0,nc1,nc2,nc3; + int x0,x1,x2,y0,y1,y2,y3,c0,c1,c2,ix,iy,ic; + int n,k,l; + stdint_t lsize[4]; + double nplaq,plaq0,plaq1,eps; + MPI_Status stat; + FILE *fin=NULL; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (ubuf==NULL) + { + check_machine(); + alloc_ubuf(my_rank); + } + + dmy=1; + tag0=mpi_tag(); + tag1=mpi_tag(); + udb=udfld(); + + if (my_rank==0) + { + fin=fopen(in,"rb"); + error_root(fin==NULL,1,"import_cnfg [archive.c]", + "Unable to open input file"); + + ir=fread(lsize,sizeof(stdint_t),4,fin); + ir+=fread(&plaq0,sizeof(double),1,fin); + error_root(ir!=5,1,"import_cnfg [archive.c]","Incorrect read count"); + + if (endian==BIG_ENDIAN) + { + bswap_int(4,lsize); + bswap_double(1,&plaq0); + } + + np[0]=(int)(lsize[0]); + np[1]=(int)(lsize[1]); + np[2]=(int)(lsize[2]); + np[3]=(int)(lsize[3]); + + error_root((np[0]<1)||((N0%np[0])!=0)|| + (np[1]<1)||((N1%np[1])!=0)|| + (np[2]<1)||((N2%np[2])!=0)|| + (np[3]<1)||((N3%np[3])!=0),1,"import_cnfg [archive.c]", + "Unexpected or incompatible lattice sizes"); + + error_root((np[0]!=N0)&&(bc_type()!=3),1,"import_cnfg [archive.c]", + "Periodic extension in time is only possible when\n" + "periodic boundary conditions are chosen"); + } + else + { + np[0]=0; + np[1]=0; + np[2]=0; + np[3]=0; + plaq0=0.0; + } + + MPI_Bcast(np,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&plaq0,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + n0=np[0]; + n1=np[1]; + n2=np[2]; + n3=np[3]; + + nc0=N0/n0; + nc1=N1/n1; + nc2=N2/n2; + nc3=N3/n3; + ira=0; + + for (ix=0;ix<(n0*n1*n2);ix++) + { + x0=ix/(n1*n2); + x1=(ix/n2)%n1; + x2=ix%n2; + + if (my_rank==0) + { + n=4*n3; + ir=fread(vbuf,sizeof(su3_dble),n,fin); + ira|=(ir!=n); + + if (endian==BIG_ENDIAN) + bswap_double(n*18,vbuf); + + for (k=1;k0) + { + 
if (my_rank==0) + { + MPI_Send(vbuf+4*y3,4*L3*18,MPI_DOUBLE,n,tag1,MPI_COMM_WORLD); + MPI_Recv(&dmy,1,MPI_INT,n,tag0,MPI_COMM_WORLD,&stat); + } + else if (my_rank==n) + { + MPI_Recv(ubuf,4*L3*18,MPI_DOUBLE,0,tag1,MPI_COMM_WORLD,&stat); + MPI_Send(&dmy,1,MPI_INT,0,tag0,MPI_COMM_WORLD); + } + } + else if (my_rank==0) + for (l=0;l<(4*L3);l++) + ubuf[l]=vbuf[4*y3+l]; + + if (my_rank==n) + set_links(iy); + } + } + } + + if (my_rank==0) + { + error_root(ira!=0,1,"import_cnfg [archive.c]","Incorrect read count"); + fclose(fin); + } + + set_flags(UPDATED_UD); + ie=check_bc(64.0*DBL_EPSILON); + error_root(ie!=1,1,"import_cnfg [archive.c]", + "Incompatible boundary conditions"); + + ie=0; + nplaq=(double)(6*N0*N1)*(double)(N2*N3); + eps=sqrt(nplaq)*DBL_EPSILON; + plaq1=plaq_sum_dble(1)/nplaq; + ie|=(fabs(plaq1-plaq0)>eps); + set_bc(); + plaq1=plaq_sum_dble(1)/nplaq; + ie|=(fabs(plaq1-plaq0)>eps); + error_root(ie!=0,1,"import_cnfg [archive.c]", + "Incorrect average plaquette"); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/sarchive.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/sarchive.c new file mode 100644 index 0000000000000000000000000000000000000000..c91a99f3f311216fbae16cbf7c0566f02a64bd9a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/archive/sarchive.c @@ -0,0 +1,450 @@ + +/******************************************************************************* +* +* File sarchive.c +* +* Copyright (C) 2007, 2008, 2011, 2013, 2014 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Programs to read and write global double-precision spinor fields. 
+* +* The externally accessible functions are +* +* void write_sfld(char *out,spinor_dble *sd) +* Writes the lattice sizes, the process grid sizes, the coordinates +* of the calling process, the square of the norm of the spinor field +* sd and the local part of the latter to the file "out". +* +* void read_sfld(char *in,spinor_dble *sd) +* Reads the local part of the spinor field sd from the file "in", +* assuming the field was written to the file by write_sfld(). +* +* void export_sfld(char *out,spinor_dble *sd) +* Writes the lattice sizes and the spinor field sd to the file "out" +* from process 0 in the universal format specified below (see the +* notes). +* +* void import_sfld(char *in,spinor_dble *sd) +* Reads the spinor field sd from the file "in" on process 0, assuming +* the field was written to the file in the universal format (see the +* notes). +* +* Notes: +* +* The spinor fields are assumed to be global quark fields as described in +* main/README.global. Only their physical components (i.e. the spinors on +* the local lattices) are written and read. +* +* The program export_sfld() first writes the global lattice sizes and the +* square-norm of the spinor field. 
Then follow the spinors at the first +* lattice point, the second point, and so on, in the order given by the +* index +* +* ix=x3+N3*x2+N2*N3*x1+N1*N2*N3*x0, +* +* where N0,N1,N2,N3 are the (global) lattice sizes and (x0,x1,x2,x3) the +* Cartesian coordinates of the points (0<=x0 +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "lattice.h" +#include "linalg.h" +#include "archive.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int endian; +static spinor_dble *sbuf=NULL; + + +void write_sfld(char *out,spinor_dble *sd) +{ + int ldat[16],iw; + double norm; + FILE *fout=NULL; + + error(sd==NULL,1,"write_sfld [sarchive.c]", + "Attempt to access unallocated memory space"); + error(iup[0][0]==0,1,"write_sfld [sarchive.c]", + "Geometry arrays are not set"); + + fout=fopen(out,"wb"); + error_loc(fout==NULL,1,"write_sfld [sarchive.c]", + "Unable to open output file"); + error_chk(); + + ldat[0]=NPROC0; + ldat[1]=NPROC1; + ldat[2]=NPROC2; + ldat[3]=NPROC3; + + ldat[4]=L0; + ldat[5]=L1; + ldat[6]=L2; + ldat[7]=L3; + + ldat[8]=NPROC0_BLK; + ldat[9]=NPROC1_BLK; + ldat[10]=NPROC2_BLK; + ldat[11]=NPROC3_BLK; + + ldat[12]=cpr[0]; + ldat[13]=cpr[1]; + ldat[14]=cpr[2]; + ldat[15]=cpr[3]; + + iw=fwrite(ldat,sizeof(int),16,fout); + norm=norm_square_dble(VOLUME,0,sd); + iw+=fwrite(&norm,sizeof(double),1,fout); + iw+=fwrite(sd,sizeof(spinor_dble),VOLUME,fout); + + error_loc(iw!=(17+VOLUME),1,"write_sfld [sarchive.c]", + "Incorrect write count"); + error_chk(); + fclose(fout); +} + + +void read_sfld(char *in,spinor_dble *sd) +{ + int ldat[16],ir,ie; + double norm0,norm1,eps; + FILE *fin=NULL; + + error(sd==NULL,1,"read_sfld [sarchive.c]", + "Attempt to access unallocated memory space"); + error(iup[0][0]==0,1,"read_sfld [sarchive.c]", + "Geometry arrays are not set"); + + fin=fopen(in,"rb"); + error_loc(fin==NULL,1,"read_sfld [sarchive.c]", + "Unable to open 
input file"); + error_chk(); + + ir=fread(ldat,sizeof(int),16,fin); + + ie=0; + ie|=((ldat[0]!=NPROC0)||(ldat[1]!=NPROC1)|| + (ldat[2]!=NPROC2)||(ldat[3]!=NPROC3)); + ie|=((ldat[4]!=L0)||(ldat[5]!=L1)|| + (ldat[6]!=L2)||(ldat[7]!=L3)); + ie|=((ldat[8]!=NPROC0_BLK)||(ldat[9]!=NPROC1_BLK)|| + (ldat[10]!=NPROC2_BLK)||(ldat[11]!=NPROC3_BLK)); + ie|=((ldat[12]!=cpr[0])||(ldat[13]!=cpr[1])|| + (ldat[14]!=cpr[2])||(ldat[15]!=cpr[3])); + error(ie!=0,1,"read_sfld [sarchive.c]","Unexpected lattice data"); + + ir+=fread(&norm0,sizeof(double),1,fin); + ir+=fread(sd,sizeof(spinor_dble),VOLUME,fin); + + error_loc(ir!=(17+VOLUME),1,"read_sfld [sarchive.c]", + "Incorrect read count"); + error_chk(); + fclose(fin); + + norm1=norm_square_dble(VOLUME,0,sd); + eps=sqrt(64.0*(double)(VOLUME))*DBL_EPSILON; + error_loc(fabs(norm1-norm0)>(eps*norm0),1,"read_sfld [sarchive.c]", + "Incorrect square norm"); + error_chk(); +} + + +static void check_machine(void) +{ + error_root(sizeof(stdint_t)!=4,1,"check_machine [sarchive.c]", + "Size of a stdint_t integer is not 4"); + error_root(sizeof(double)!=8,1,"check_machine [sarchive.c]", + "Size of a double is not 8"); + error_root(sizeof(spinor_dble)!=192,1,"check_machine [sarchive.c]", + "The spinor_dble structures are not properly packed"); + + endian=endianness(); + error_root(endian==UNKNOWN_ENDIAN,1,"check_machine [sarchive.c]", + "Unkown endianness"); +} + + +static void alloc_sbuf(void) +{ + error(iup[0][0]==0,1,"alloc_sbuf [sarchive.c]", + "Geometry arrays are not set"); + sbuf=amalloc(L3*sizeof(spinor_dble),ALIGN); + error(sbuf==NULL,1,"alloc_sbuf [sarchive.c]", + "Unable to allocate auxiliary array"); +} + + +static void get_spinors(int iy,spinor_dble *sd) +{ + int y3,iz; + spinor_dble *sb; + + sb=sbuf; + iy*=L3; + + for (y3=0;y30) + { + if (my_rank==0) + { + MPI_Send(&dmy,1,MPI_INT,n,tag0,MPI_COMM_WORLD); + MPI_Recv(sbuf,L3*24,MPI_DOUBLE,n,tag1,MPI_COMM_WORLD,&stat); + } + else if (my_rank==n) + { + 
MPI_Recv(&dmy,1,MPI_INT,0,tag0,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,L3*24,MPI_DOUBLE,0,tag1,MPI_COMM_WORLD); + } + } + + if (my_rank==0) + { + if (endian==BIG_ENDIAN) + bswap_double(L3*24,(double*)(sbuf)); + iw=fwrite(sbuf,sizeof(spinor_dble),L3,fout); + iwa|=(iw!=L3); + } + } + } + + if (my_rank==0) + { + error_root(iwa!=0,1,"export_sfld [sarchive.c]","Incorrect write count"); + fclose(fout); + } +} + + +void import_sfld(char *in,spinor_dble *sd) +{ + int my_rank,np[4],n,ir; + int ira,dmy,tag0,tag1; + int x0,x1,x2,x3,y0,y1,y2,ix,iy; + stdint_t lsize[4]; + double norm0,norm1,eps; + MPI_Status stat; + FILE *fin=NULL; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (sbuf==NULL) + { + check_machine(); + alloc_sbuf(); + } + + error(sd==NULL,1,"import_sfld [sarchive.c]", + "Attempt to access unallocated memory space"); + + dmy=1; + tag0=mpi_tag(); + tag1=mpi_tag(); + + if (my_rank==0) + { + fin=fopen(in,"rb"); + error_root(fin==NULL,1,"import_sfld [sarchive.c]", + "Unable to open input file"); + + ir=fread(lsize,sizeof(stdint_t),4,fin); + ir+=fread(&norm0,sizeof(double),1,fin); + error_root(ir!=5,1,"import_sfld [sarchive.c]","Incorrect read count"); + + if (endian==BIG_ENDIAN) + { + bswap_int(4,lsize); + bswap_double(1,&norm0); + } + + error_root((lsize[0]!=N0)||(lsize[1]!=N1)||(lsize[2]!=N2)|| + (lsize[3]!=N3),1,"import_sfld [sarchive.c]", + "Lattice sizes do not match"); + } + else + norm0=0.0; + + ira=0; + + for (ix=0;ix<(N0*N1*N2);ix++) + { + x0=ix/(N1*N2); + x1=(ix/N2)%N1; + x2=ix%N2; + + y0=x0%L0; + y1=x1%L1; + y2=x2%L2; + iy=y2+L2*y1+L1*L2*y0; + + np[0]=x0/L0; + np[1]=x1/L1; + np[2]=x2/L2; + + for (x3=0;x30) + { + if (my_rank==0) + { + MPI_Send(sbuf,L3*24,MPI_DOUBLE,n,tag1,MPI_COMM_WORLD); + MPI_Recv(&dmy,1,MPI_INT,n,tag0,MPI_COMM_WORLD,&stat); + } + else if (my_rank==n) + { + MPI_Recv(sbuf,L3*24,MPI_DOUBLE,0,tag1,MPI_COMM_WORLD,&stat); + MPI_Send(&dmy,1,MPI_INT,0,tag0,MPI_COMM_WORLD); + } + } + + if (my_rank==n) + set_spinors(iy,sd); + } + } + + if 
(my_rank==0) + { + error_root(ira!=0,1,"import_sfld [sarchive.c]","Incorrect read count"); + fclose(fin); + } + + norm1=norm_square_dble(VOLUME,1,sd); + eps=sqrt(64.0*(double)(N0*N1)*(double)(N2*N3))*DBL_EPSILON; + error_root(fabs(norm1-norm0)>(eps*norm0),1,"import_sfld [sarchive.c]", + "Incorrect square norm"); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/README new file mode 100644 index 0000000000000000000000000000000000000000..02ba0ff9102fd40fb73b100cac134579a2291067 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/README @@ -0,0 +1,152 @@ + +******************************************************************************** + + Block allocation and block field initialization programs + +******************************************************************************** + + +Files +----- + +blk_grid.c Block grid allocation. + +block.c Basic allocation programs for blocks of lattice points. + +map_s2blk.c Copying of spinor fields to and from the blocks in a + block grid. + +map_sw2blk.c Copying of the SW fields to the blocks in a block grid. + +map_u2blk.c Copying of the gauge fields to the blocks in a block grid. + + +Include file +------------ + +The file block.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + +The types block_t, bndry_t and blk_grid_t are also defined there and the +meaning of the entries in these structures is explained in the file +README.block. + + +List of functions +----------------- + +void alloc_bgr(blk_grid_t grid) + Allocates the specified block grid. The block array and the block + fields are put in the static memory of this module and are properly + initialized. + +block_t *blk_list(blk_grid_t grid,int *nb,int *isw) + Returns the pointer to the block array of the specified grid. 
The + number of blocks on the local lattice is assigned to nb and isw is + set to 0 or 1 depending on whether the first block is black or white + (by definition it is black on the first process). If the block grid + is not allocated, the program returns NULL and sets nb and isw to 0. + +void alloc_blk(block_t *b,int *bo,int *bs, + int iu,int iud,int ns,int nsd) + Sets the offset and side-lengths of the block b to bo[4] and bs[4], + respectively, and allocates the block fields depending on the values + of the other parameters. The single-precision gauge and SW fields are + allocated if iu=1, the double-precision gauge and SW fields if iud=1, + while ns and nsd are the numbers of single- and double-precision Dirac + fields that are allocated. All elements of the block are properly + initialized and the share flag b.shf is set to 0x0 (see the notes). + +void alloc_bnd(block_t *b,int iu,int iud,int nw,int nwd) + Allocates the boundary structures b.bb in the block b and the fields + in there depending on the parameters iu,iud,nw and nwd. The single- + and double-precision gauge fields are allocated if iu=1 and iud=1, + respectively, while nw and nwd are the numbers of single- and double- + precision Weyl fields that are allocated. All elements of the block + are then properly initialized (see the notes). + +void clone_blk(block_t *b,int shf,int *bo,block_t *c) + Sets the offset of the block c to bo[4] and its side lengths to + b.bs[4]. The fields in c are then allocated depending on the bits + b1,b2,..,b8 (counting from the lowest) of the share flag shf. The + relevant bits are: + + b2=1: b.ipt,b.iup and b.idn are shared, + b3=1: b.u, b.bb.u and b.sw are shared, + b4=1: b.ud, b.bb.ud and b.swd are shared, + b5=1: b.s is shared, + b6=1: b.sd is shared. + b7=1: b.bb.w is shared, + b8=1: b.bb.wd is shared. + + All fields that are not shared and are allocated on b are allocated + on c as well, while the pointers to the shared fields are set to those + of b. 
An error occurs if a field is shared according to the share flag + b.shf on b but not according to shf. Moreover, the offset differences + bo[mu]-b.bo[mu] must be integer multiples of b.bs[mu] for all mu. The + share flag c.shf is set to shf. + +void free_blk(block_t *b) + Frees the arrays in the block b and in the boundaries b.bb that were + previously allocated by alloc_blk(), alloc_bnd() or clone_blk(). The + boundary structures are then freed too (if they were allocated) and + all entries in the block structure are set to 0 (or NULL). + +int ipt_blk(block_t *b,int *x) + Returns the index of the lattice point in the block b with Cartesian + coordinates x[4] relative to the base point of b. + +void assign_s2sblk(blk_grid_t grid,int n,ptset_t set,spinor *s,int k) + Assigns the relevant part of the global single-precision spinor field s + to the single-precision field b.s[k] on the n'th block of the specified + block grid. Depending on the specified point set, the field on the even, + odd or all points is copied. + +void assign_sblk2s(blk_grid_t grid,int n,ptset_t set,int k,spinor *s) + Assigns the single-precision spinor field b.s[k] on the n'th block of + the specified block grid to the relevant part of the global single- + precision field s. Depending on specified point set, the field on the + even, odd or all points is copied. + +void assign_s2sdblk(blk_grid_t grid,int n,ptset_t set,spinor *s,int k) + Assigns the relevant part of the global single-precision spinor field s + to the double-precision field b.sd[k] on the n'th block of the specified + block grid. Depending on the specified point set, the field on the even, + odd or all points is copied. + +void assign_sd2sdblk(blk_grid_t grid,int n,ptset_t set, + spinor_dble *sd,int k) + Assigns the relevant part of the global double-precision spinor field sd + to the double-precision field b.sd[k] on the n'th block of the specified + block grid. 
Depending on the specified point set, the field on the even, + odd or all points is copied. + +void assign_sdblk2sd(blk_grid_t grid,int n,ptset_t set, + int k,spinor_dble *sd) + Assigns the double-precision spinor field b.sd[k] on the n'th block of + the specified block grid to the relevant part of the global double- + precision field sd. Depending on the specified point set, the field on the + even, odd or all points is copied. + +int assign_swd2swbgr(blk_grid_t grid,ptset_t set) + Assigns the global double-precision SW field to the corresponding + single-precision fields in the specified grid. On the given point + set, the copied Pauli matrices are inverted before assignment and + the program returns 0 or 1 depending on whether the inversions were + safe or not. + +int assign_swd2swdblk(blk_grid_t grid,int n,ptset_t set) + Assigns the global double-precision SW field to the corresponding + double-precision field on the n'th block of the specified grid. On + the given point set, the copied Pauli matrices are inverted before + assignment and the program returns 0 or 1 depending on whether the + inversions were safe or not. + +void assign_ud2ubgr(blk_grid_t grid) + Assigns the global double-precision gauge field to the corresponding + single-precision fields in the specified block grid (see the notes). + +void assign_ud2udblk(blk_grid_t grid,int n) + Assigns the global double-precision gauge field to the corresponding + double-precision field on the n'th block of the specified block grid + (see the notes).
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/README.block b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/README.block new file mode 100644 index 0000000000000000000000000000000000000000..316ca7a1b95cc1fb4c2d3cce3a316fb757d3e7b8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/README.block @@ -0,0 +1,235 @@ + +******************************************************************************** + + Block structure explained + +******************************************************************************** + +Rectangular blocks of lattice points and their exterior boundaries are +described by the structures block_t and bndry_t that are defined in the +header file block.h. These objects can be easily handled by passing the +corresponding pointers to the functions that operate on them. + +It is currently not possible to allocate blocks that are not fully +contained in the local lattice. Moreover, the block sizes must be even +and not smaller than 4. The exterior boundaries of a block may, +however, overlap with the lattices on the neighbouring processes. + + +Block structure +--------------- + +Block data and fields are collected in a structure + +typedef struct +{ + int *bo,*bs,vol,vbb,nbp,ns,nsd,shf; + int *ipt,*imb,*ibp; + int (*iup)[4],(*idn)[4]; + su3 *u; + su3_dble *ud; + pauli *sw; + pauli_dble *swd; + spinor **s; + spinor_dble **sd; + bndry_t *bb; +} block_t; + +block_t b; + +with the following entries: + +b.bo[4] Cartesian coordinates (in the local lattice) of the + base point of the block. + +b.bs[4] Linear sizes of the block in the four dimensions. The + local coordinates in direction mu of the points in the + block thus range from b.bo[mu] to b.bo[mu]+b.bs[mu]-1 + inclusive. + +b.vol Number of points in the block. + +b.vbb Total number of exterior boundary points of the + block. 
+ +b.nbp Number of points in the block at global time 0 + (boundary conditions type 0,1 or 2) and time + NPROC0*L0-1 (boundary conditions type 0). + +b.ns Number of single-precision spinor fields on the block. + +b.nsd Number of double-precision spinor fields on the block. + +b.shf The bits b1,b2,...,b8 in this number (counting from + the lowest) indicate that + + b1=1: The block is protected, + b2=1: The geometry arrays are shared, + b3=1: b.u, bb.u and b.sw are shared, + b4=1: b.ud, bb.ud and b.swd are shared, + b5=1: b.s is shared, + b6=1: b.sd is shared, + b7=1: bb.w is shared, + b8=1: bb.wd is shared + + (the last two bits refer to the Weyl fields on the + exterior boundaries of the block). As explained below, + block fields can be shared among the blocks in a block + grid. Protected blocks cannot be freed or reallocated. + +b.ipt[b.vol+1] The block points are labeled by an index ix. If + x0,x1,x2,x3 are the coordinates of a block point + relative to the base point, a primitive point label + is iy=x3+b.bs[3]*x2+...+b.bs[3]*b.bs[2]*b.bs[1]*x0. + This array returns the actual label ix=b.ipt[iy] + (the last entry in the array is not used). + +b.imb[b.vol+1] For a given block point with label ix, this array + returns the label iz=b.imb[ix] of the point in the + local lattice. The array thus defines the embedding + of the block in the full lattice (the last entry in the + array is not used). + +b.ibp[b.nbp] Array of the labels ix of the block points at global + time 0 (boundary conditions type 0,1 or 2) and time + NPROC0*L0-1 (boundary conditions type 0). The labels + are in ascending order. In particular, the first and + second half of the array contain the labels of the + even and odd points, respectively. + +b.iup[b.vol][4] Block geometry arrays, giving the labels of the +b.idn[b.vol][4] neighbours of a given block point. If the neighbour + is on the exterior boundary of the block, the arrays + return the value b.vol.
+ +b.u[4*b.vol] The single-precision gauge field on the block is + stored in this array in such a way that the 8 link + variables at the odd point with label ix are the 8 + elements at b.u+8*(ix-b.vol/2) (as on the global + lattice). The links sticking out of the block are + special in the sense that the variables residing + there are not used. + +b.ud[4*b.vol] This array contains the double-precision gauge field + on the block. The storage conventions are the same as + in the case of the single-precision field. + +b.sw[2*b.vol] The single-precision SW term is allocated together + with the single-precision gauge field. The upper + and lower Pauli matrix at the point with label ix + are stored at b.sw[2*ix] and b.sw[2*ix+1]. + +b.swd[2*b.vol] The double-precision SW term is allocated together + with the double-precision gauge field. The upper + and lower Pauli matrix at the point with label ix + are stored at b.swd[2*ix] and b.swd[2*ix+1]. + +b.s[b.ns][b.vol+1] The value of the k'th single-precision spinor field + at the block point with label ix is b.s[k][ix]. In + each of these fields the last entry is not used. + +b.sd[b.nsd][b.vol+1] The value of the k'th double-precision spinor field + at the block point with label ix is b.sd[k][ix]. In + each of these fields the last entry is not used. + +b.bb[8] Array of boundary structures, one for each face + (see below). + +In general, not all field arrays are allocated. Some blocks may contain the +single-precision gauge and SW fields but not the double-precision fields, +for example. Which fields are allocated and which are shared can be chosen +when the block is allocated (see alloc_blk() [block.c]). + +The phrase "... is not used" refers to an array element that serves as +a place-holder or for another technical purpose. At the beginning of any +subprogram, the variables stored there will, in general, contain random +values.
+ + +Boundary structure +------------------ + +The geometry of each face of the exterior boundary of a block and the +fields living there are described by a structure + +typedef struct +{ + int ifc,ibn,vol,nw,nwd; + int *ipp,*map,*imb; + su3 *u; + su3_dble *ud; + weyl **w; + weyl_dble **wd; +} bndry_t; + +bndry_t bb; + +with the following entries: + +bb.ifc The faces in the -0,+0,-1,+1,-2,+2,-3,+3 directions + are labeled by a number ifc ranging from 0 to 7. + +bb.ibn Indicates whether the face is contained in the + exterior boundary of the local lattice (bb.ibn=1) + or not (bb.ibn=0). + +bb.vol Number of points in the face. + +bb.nw Number of single-precision Weyl fields on the face. + +bb.nwd Number of double-precision Weyl fields on the face. + +bb.ipp[bb.vol+1] The points in the face are labeled by an index ix. + Each point has a unique nearest point on the block + (its "partner point") with label bb.ipp[ix] (the + last entry in the array is not used). + +bb.map[bb.vol+1] For a given face point with label ix, bb.map[ix] is + the label of the partner point of the corresponding + point on the opposite face of the block (the last + entry in the array is not used). + +bb.imb[bb.vol+1] For a given face point with label ix, bb.imb[ix] + is the label of the point in the local lattice + (or in its exterior boundary; see README.global). + The array thus defines the embedding of the face + in full lattice (the last entry in the array is + not used). + +bb.u[bb.vol] Array of the single-precision gauge-field variables + residing on the links that connect the face points + with their partner points on the block. + +bb.ud[bb.vol] Array of the double-precision gauge-field variables + residing on the links that connect the face points + with their partner points on the block. + +bb.w[bb.nw][bb.vol] The value of the k'th single-precision Weyl field + at the face point with label ix is bb.w[k][ix]. 
+ +bb.wd[bb.nwd][bb.vol] The value of the k'th double-precision Weyl field + at the face point with label ix is bb.wd[k][ix]. + +Which field arrays are allocated may be chosen when the program alloc_bnd() +[block.c] is called. + + + +Block grids +----------- + +Grids of blocks that cover the whole lattice without overlaps can be +allocated and initialized using the programs in the module blk_grid.c. +The enumeration type blk_grid_t (see block.h) lists the currently +available block grids. + +The size of the blocks in a block grid can be chosen when the grid is +allocated, but is required to divide the local lattice. Moreover, the +number of blocks in the local lattice must be even and the total +number of blocks in any space-time direction must also be even. + +Among the blocks of a block grid, the gauge, Dirac spinor and Weyl +fields may be shared, i.e. they are allocated only on the first block +in the local lattice and their addresses are copied to the other +blocks. Which fields are shared can be determined by reading the bits +of the flag b.shf on any one of the blocks b. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/blk_grid.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/blk_grid.c new file mode 100644 index 0000000000000000000000000000000000000000..203622a7729d1d33b852400ab1bcbf47a8d4ec96 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/blk_grid.c @@ -0,0 +1,214 @@ + +/******************************************************************************* +* +* File blk_grid.c +* +* Copyright (C) 2005, 2007, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Block grid allocation. +* +* The externally accessible functions are +* +* void alloc_bgr(blk_grid_t grid) +* Allocates the specified block grid. 
The block array and the block +* fields are put in the static memory of this module and are properly +* initialized. +* +* block_t *blk_list(blk_grid_t grid,int *nb,int *isw) +* Returns the pointer to the block array of the specified grid. The +* number of blocks on the local lattice is assigned to nb and isw is +* set to 0 or 1 depending on whether the first block is black or white +* (by definition it is black on the first process). If the block grid +* is not allocated, the program returns NULL and sets nb and isw to 0. +* +* Notes: +* +* The block sizes bs[4] and other parameters of the specified block grid +* are obtained from the parameter data base. These and the lattice sizes +* must be such that the lattice can be covered by non-overlapping blocks. +* Moreover, the number of blocks in each direction must be even and the +* local lattices must contain an even number of blocks. This ensures that +* the block grid can be chessboard-coloured and that the number of blocks +* in the local lattice is the same for both colours. +* +* On all processes, the blocks at a given position in the array of blocks +* returned by blk_list() have the same position in the local lattice. The +* blocks are ordered such that the first half of them have the same colour. +* For a given colour, the blocks are ordered according to their index +* +* n[3]+nbl[3]*n[2]+nbl[2]*nbl[3]*n[1]+nbl[1]*nbl[2]*nbl[3]*n[0], +* +* where n[mu]=bo[mu]/bs[mu] are the Cartesian coordinates of the block in +* the block grid and nbl[mu] denotes the numbers of blocks in direction mu. +* All blocks have allocated boundaries and the protection flag set. +* +* The program alloc_bgr() involves communications and must be called on all +* processes simultaneously with the same parameters. A given block grid can +* be allocated only once. 
+* +*******************************************************************************/ + +#define BLK_GRID_C + +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "sap.h" +#include "block.h" +#include "global.h" + +typedef struct +{ + int nb,isw; + block_t *b; +} bgrid_t; + +static bgrid_t bgr[(int)(BLK_GRIDS)+1]={{0,0,NULL},{0,0,NULL},{0,0,NULL}}; + + +static block_t *blks(int *bs,int iu,int iud,int ns,int nsd, + int iub,int iudb,int nw,int nwd, + int shf,int *nb,int *isw) +{ + int bo[4]; + int n0,n1,n2,n3,m0,m1,m2,m3; + block_t *b,*rbe,*rbo; + + n0=L0/bs[0]; + n1=L1/bs[1]; + n2=L2/bs[2]; + n3=L3/bs[3]; + + (*nb)=n0*n1*n2*n3; + (*isw)=(cpr[0]*n0+cpr[1]*n1+cpr[2]*n2+cpr[3]*n3)&0x1; + + b=malloc((*nb)*sizeof(*b)); + error(b==NULL,1,"blks [blk_grid.c]","Unable to allocate block grid"); + + rbe=b; + rbo=b+(*nb)/2; + + for (m0=0;m01) + { + iprms[0]=igr; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=igr,1,"alloc_bgr [blk_grid.c]", + "Parameter is not global"); + } + + error(bgr[igr].b!=NULL,1,"alloc_bgr [blk_grid.c]", + "Block grid is already allocated"); + + bs=NULL; + iu=0; + iud=0; + ns=0; + nsd=0; + iub=0; + iudb=0; + nw=0; + nwd=0; + shf=0x0; + + if (grid==SAP_BLOCKS) + { + sap=sap_parms(); + error_root(sap.ncy==0,1,"alloc_bgr [blk_grid.c]", + "SAP parameters are not set"); + + bs=sap.bs; + iu=1; + ns=3; + iub=1; + shf=0x13; + } + else if (grid==DFL_BLOCKS) + { + dfl=dfl_parms(); + error_root(dfl.Ns==0,1,"alloc_bgr [blk_grid.c]", + "Deflation subspace parameters are not set"); + + bs=dfl.bs; + iud=1; + ns=dfl.Ns+1; + nsd=dfl.Ns+1; + shf=0xb; + } + else + error_root(1,1,"alloc_bgr [blk_grid.c]","Unknown block grid"); + + bgr[igr].b=blks(bs,iu,iud,ns,nsd,iub,iudb,nw,nwd,shf, + &(bgr[igr].nb),&(bgr[grid].isw)); + + if (grid==SAP_BLOCKS) + alloc_sap_bufs(); +} + + +block_t *blk_list(blk_grid_t grid,int *nb,int *isw) +{ + int igr; + + igr=(int)(grid); + (*nb)=bgr[igr].nb; + (*isw)=bgr[igr].isw; + + 
return bgr[igr].b; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/block.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/block.c new file mode 100644 index 0000000000000000000000000000000000000000..ba5d82b97a28fc6665fc4ce813a3cc562a3b6408 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/block.c @@ -0,0 +1,940 @@ + +/******************************************************************************* +* +* File block.c +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Basic allocation programs for blocks of lattice points. +* +* The externally accessible functions are +* +* void alloc_blk(block_t *b,int *bo,int *bs, +* int iu,int iud,int ns,int nsd) +* Sets the offset and side-lengths of the block b to bo[4] and bs[4], +* respectively, and allocates the block fields depending on the values +* of the other parameters. The single-precision gauge and SW fields are +* allocated if iu=1, the double-precision gauge and SW fields if iud=1, +* while ns and nsd are the numbers of single- and double-precision Dirac +* fields that are allocated. All elements of the block are properly +* initialized and the share flag b.shf is set to 0x0 (see the notes). +* +* void alloc_bnd(block_t *b,int iu,int iud,int nw,int nwd) +* Allocates the boundary structures b.bb in the block b and the fields +* in there depending on the parameters iu,iud,nw and nwd. The single- +* and double-precision gauge fields are allocated if iu=1 and iud=1, +* respectively, while nw and nwd are the numbers of single- and double- +* precision Weyl fields that are allocated. All elements of the block +* are then properly initialized (see the notes). +* +* void clone_blk(block_t *b,int shf,int *bo,block_t *c) +* Sets the offset of the block c to bo[4] and its side lengths to +* b.bs[4]. 
The fields in c are then allocated depending on the bits +* b1,b2,..,b8 (counting from the lowest) of the share flag shf. The +* relevant bits are: +* +* b2=1: b.ipt,b.iup and b.idn are shared, +* b3=1: b.u, b.bb.u and b.sw are shared, +* b4=1: b.ud, b.bb.ud and b.swd are shared, +* b5=1: b.s is shared, +* b6=1: b.sd is shared. +* b7=1: b.bb.w is shared, +* b8=1: b.bb.wd is shared. +* +* All fields that are not shared and are allocated on b are allocated +* on c as well, while the pointers to the shared fields are set to those +* of b. An error occurs if a field is shared according to the share flag +* b.shf on b but not according to shf. Moreover, the offset differences +* bo[mu]-b.bo[mu] must be integer multiples of b.bs[mu] for all mu. The +* share flag c.shf is set to shf. +* +* void free_blk(block_t *b) +* Frees the arrays in the block b and in the boundaries b.bb that were +* previously allocated by alloc_blk(), alloc_bnd() or clone_blk(). The +* boundary structures are then freed too (if they were allocated) and +* all entries in the block structure are set to 0 (or NULL). +* +* int ipt_blk(block_t *b,int *x) +* Returns the index of the lattice point in the block b with Cartesian +* coordinates x[4] relative to the base point of b. +* +* Notes: +* +* The entries of the block and boundary structures are explained in the file +* README.block in this directory. +* +* It is currently not possible to allocate blocks that are not fully +* contained in the local lattice. Moreover, the block sizes must be even +* and not smaller than 4. The exterior boundaries of a block may, however, +* overlap with the lattices on the neighbouring processes. In all cases, +* the scalar elements of the structures and the geometry and field arrays +* are properly initialized (gauge and SW fields are set to 1, Dirac spinor +* and Weyl fields to 0). +* +* Block allocation is a global operation, i.e. 
alloc_blk(), alloc_bnd(), +* clone_blk() and free_blk() must be called on all processes simultaneously. +* The program ipt_blk() can be called locally. +* +* alloc_blk() and clone_blk() register the blocks as being allocated. In this +* way it is possible to exclude any misuses of the programs such as freeing +* an unallocated block (which could have unpredictable side-effects). An +* already allocated block is first freed and then reallocated by alloc_blk(). +* Blocks b and their boundary structures b.bb cannot be freed or reallocated +* if the lowest bit of the share flag b.shf is equal to 1. +* +*******************************************************************************/ + +#define BLOCK_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "block.h" +#include "global.h" + +static const su3 u0={{0.0f}}; +static const su3_dble ud0={{0.0}}; +static const pauli p0={{0.0f}}; +static const pauli_dble pd0={{0.0}}; +static const weyl w0={{{0.0f}}}; +static const weyl_dble wd0={{{0.0}}}; + +struct ablk_t +{ + block_t *b; + struct ablk_t *next; +}; + +static struct ablk_t *first=NULL; + + +static int ins_blk(block_t *b) +{ + struct ablk_t *p; + + p=malloc(sizeof(*p)); + + if (p!=NULL) + { + (*p).b=b; + (*p).next=first; + first=p; + + return 0; + } + else + return 1; +} + + +static void rmv_blk(block_t *b) +{ + struct ablk_t *p,*q; + + q=NULL; + + for (p=first;p!=NULL;p=(*p).next) + { + if ((*p).b==b) + { + if (q==NULL) + first=(*p).next; + else + (*q).next=(*p).next; + + free(p); + return; + } + + q=p; + } +} + + +static int fnd_blk(block_t *b) +{ + struct ablk_t *p; + + for (p=first;p!=NULL;p=(*p).next) + { + if ((*p).b==b) + return 1; + } + + return 0; +} + + +static void free_bnd(block_t *b) +{ + int shf; + bndry_t *bb; + + shf=(*b).shf; + bb=(*b).bb; + + if (bb==NULL) + return; + + if (!(shf&0x2)) + free((*bb).ipp); + + free((*bb).imb); + + if 
((!(shf&0x4))&&((*bb).u!=NULL)) + afree((*bb).u); + + if ((!(shf&0x8))&&((*bb).ud!=NULL)) + afree((*bb).ud); + + if ((!(shf&0x40))&&((*bb).nw>0)) + { + afree((*bb).w[0]); + free((*bb).w); + } + + if ((!(shf&0x80))&&((*bb).nwd>0)) + { + afree((*bb).wd[0]); + free((*bb).wd); + } + + free((*b).bb); + (*b).bb=NULL; +} + + +void free_blk(block_t *b) +{ + int shf; + + if (fnd_blk(b)==0) + return; + + shf=(*b).shf; + error(shf&0x1,1,"free_blk [block.c]", + "Protected block"); + + free_bnd(b); + + free((*b).bo); + (*b).bo=NULL; + (*b).bs=NULL; + + if (!(shf&0x2)) + { + free((*b).ipt); + free((*b).iup); + } + + free((*b).imb); + + (*b).vol=0; + (*b).vbb=0; + (*b).nbp=0; + (*b).shf=0x0; + (*b).ipt=NULL; + (*b).imb=NULL; + (*b).ibp=NULL; + (*b).iup=NULL; + (*b).idn=NULL; + + if ((!(shf&0x4))&&((*b).u!=NULL)) + { + afree((*b).u); + afree((*b).sw); + } + + if ((!(shf&0x8))&&((*b).ud!=NULL)) + { + afree((*b).ud); + afree((*b).swd); + } + + if ((!(shf&0x10))&&((*b).ns>0)) + { + afree((*b).s[0]); + free((*b).s); + } + + if ((!(shf&0x20))&&((*b).nsd>0)) + { + afree((*b).sd[0]); + free((*b).sd); + } + + (*b).ns=0; + (*b).nsd=0; + (*b).u=NULL; + (*b).ud=NULL; + (*b).sw=NULL; + (*b).swd=NULL; + (*b).s=NULL; + (*b).sd=NULL; + + rmv_blk(b); +} + + +static void set_u2unity(int vol,su3 *u) +{ + su3 unity,*um; + + unity=u0; + unity.c11.re=1.0f; + unity.c22.re=1.0f; + unity.c33.re=1.0f; + + um=u+vol; + + for (;uL0)||(bs[0]<4)||((bs[0]%2)!=0)|| + (bo[1]<0)||((bo[1]+bs[1])>L1)||(bs[1]<4)||((bs[1]%2)!=0)|| + (bo[2]<0)||((bo[2]+bs[2])>L2)||(bs[2]<4)||((bs[2]%2)!=0)|| + (bo[3]<0)||((bo[3]+bs[3])>L3)||(bs[3]<4)||((bs[3]%2)!=0),1, + "new_blk [block.c]","Improper choice of block position or size"); + + error_root((ns<0)||(nsd<0),1,"new_blk [block.c]", + "Improper choice of the numbers of spinor fields"); + + (*b).bo=malloc(8*sizeof(*(*b).bo)); + error((*b).bo==NULL,1,"new_blk [block.c]", + "Unable to allocate size arrays"); + (*b).bs=(*b).bo+4; + + for (mu=0;mu<4;mu++) + { + (*b).bo[mu]=bo[mu]; + 
(*b).bs[mu]=bs[mu]; + } + + (*b).vol=bs[0]*bs[1]*bs[2]*bs[3]; + (*b).vbb=2*(bs[0]*bs[1]*bs[2]+bs[1]*bs[2]*bs[3]+ + bs[2]*bs[3]*bs[0]+bs[3]*bs[0]*bs[1]); + (*b).nbp=0; + + if ((cpr[0]==0)&&(bo[0]==0)&&(bc_type()!=3)) + (*b).nbp+=bs[1]*bs[2]*bs[3]; + if ((cpr[0]==(NPROC0-1))&&((bo[0]+bs[0])==L0)&&(bc_type()==0)) + (*b).nbp+=bs[1]*bs[2]*bs[3]; + + (*b).ns=ns; + (*b).nsd=nsd; + (*b).shf=shf; + + if (shf&0x2) + { + (*b).ipt=NULL; + (*b).iup=NULL; + (*b).idn=NULL; + } + else + { + (*b).ipt=malloc(((*b).vol+1)*sizeof(*(*b).ipt)); + (*b).iup=malloc(2*(*b).vol*sizeof(*(*b).iup)); + error(((*b).ipt==NULL)||((*b).iup==NULL),1, + "new_blk [block.c]","Unable to allocate the geometry arrays"); + (*b).idn=(*b).iup+(*b).vol; + } + + (*b).imb=malloc((((*b).vol+1)+(*b).nbp)*sizeof(*(*b).imb)); + (*b).ibp=(*b).imb+(*b).vol+1; + + if ((shf&0x4)||(iu!=1)) + { + (*b).u=NULL; + (*b).sw=NULL; + } + else + { + (*b).u=amalloc(4*(*b).vol*sizeof(*(*b).u),ALIGN); + (*b).sw=amalloc(2*(*b).vol*sizeof(*(*b).sw),ALIGN); + error(((*b).u==NULL)||((*b).sw==NULL),1,"new_blk [block.c]", + "Unable to allocate the single-precision gauge field"); + set_u2unity(4*(*b).vol,(*b).u); + set_sw2unity(2*(*b).vol,(*b).sw); + } + + if ((shf&0x8)||(iud!=1)) + { + (*b).ud=NULL; + (*b).swd=NULL; + } + else + { + (*b).ud=amalloc(4*(*b).vol*sizeof(*(*b).ud),ALIGN); + (*b).swd=amalloc(2*(*b).vol*sizeof(*(*b).swd),ALIGN); + error(((*b).ud==NULL)||((*b).swd==NULL),1,"new_blk [block.c]", + "Unable to allocate the double-precision gauge field"); + set_ud2unity(4*(*b).vol,(*b).ud); + set_swd2unity(2*(*b).vol,(*b).swd); + } + + if ((shf&0x10)||(ns==0)) + (*b).s=NULL; + else + { + (*b).s=malloc(ns*sizeof(*(*b).s)); + error((*b).s==NULL,1,"new_blk [block.c]", + "Unable to allocate the single-precision spinor fields"); + + (*b).s[0]=amalloc(ns*((*b).vol+1)*sizeof(*((*b).s[0])),ALIGN); + error((*b).s[0]==NULL,2,"new_blk [block.c]", + "Unable to allocate the single-precision spinor fields"); + + for (n=1;n1) + { + for 
(mu=0;mu<4;mu++) + { + iprms[mu]=bo[mu]; + iprms[4+mu]=bs[mu]; + } + + iprms[8]=iu; + iprms[9]=iud; + iprms[10]=ns; + iprms[11]=nsd; + + MPI_Bcast(iprms,12,MPI_INT,0,MPI_COMM_WORLD); + + ie=0; + + for (mu=0;mu<4;mu++) + if ((iprms[mu]!=bo[mu])||(iprms[4+mu]!=bs[mu])) + ie=1; + + error((ie)||(iprms[8]!=iu)||(iprms[9]!=iud)|| + (iprms[10]!=ns)||(iprms[11]!=nsd),1,"alloc_blk [block.c]", + "Parameters are not global"); + } + + error(iup[0][0]==0,1,"alloc_blk [block.c]", + "The global geometry arrays are not set"); + + new_blk(b,bo,bs,iu,iud,ns,nsd,0x0); + blk_geometry(b); + blk_imbed(b); +} + + +static void new_bnd(block_t *b,int iu,int iud,int nw,int nwd,int shf) +{ + int vol,ifc,n; + int *bs,*ipp,*map,*imb; + su3 *u; + su3_dble *ud; + weyl **w,*wb; + weyl_dble **wd,*wdb; + bndry_t *bb; + + error_root((nw<0)||(nwd<0),1,"new_bnd [block.c]", + "Improper choice of the numbers of Weyl fields"); + + free_bnd(b); + bb=malloc(8*sizeof(*bb)); + error(bb==NULL,1,"new_bnd [block.c]", + "Unable to allocate boundary structures"); + (*b).bb=bb; + + vol=(*b).vol; + bs=(*b).bs; + + for (ifc=0;ifc<8;ifc++) + { + bb[ifc].ifc=ifc; + bb[ifc].vol=vol/bs[ifc/2]; + bb[ifc].nw=nw; + bb[ifc].nwd=nwd; + } + + vol=(*b).vbb; + + if (shf&0x2) + { + for (ifc=0;ifc<8;ifc++) + { + bb[ifc].ipp=NULL; + bb[ifc].map=NULL; + } + } + else + { + ipp=malloc(2*(vol+8)*sizeof(*ipp)); + error(ipp==NULL,1,"new_bnd [block.c]", + "Unable to allocate the geometry arrays"); + map=ipp+vol+8; + + for (ifc=0;ifc<8;ifc++) + { + bb[ifc].ipp=ipp; + ipp+=(bb[ifc].vol+1); + bb[ifc].map=map; + map+=(bb[ifc].vol+1); + } + } + + imb=malloc((vol+8)*sizeof(*imb)); + error(imb==NULL,2,"new_bnd [block.c]", + "Unable to allocate the geometry arrays"); + + for (ifc=0;ifc<8;ifc++) + { + bb[ifc].imb=imb; + imb+=(bb[ifc].vol+1); + } + + if ((shf&0x4)||(iu!=1)) + { + for (ifc=0;ifc<8;ifc++) + bb[ifc].u=NULL; + } + else + { + u=amalloc(vol*sizeof(*u),ALIGN); + error(u==NULL,1,"new_bnd [block.c]", + "Unable to allocate the 
single-precision gauge field"); + set_u2unity(vol,u); + + for (ifc=0;ifc<8;ifc++) + { + bb[ifc].u=u; + u+=bb[ifc].vol; + } + } + + if ((shf&0x8)||(iud!=1)) + { + for (ifc=0;ifc<8;ifc++) + bb[ifc].ud=NULL; + } + else + { + ud=amalloc(vol*sizeof(*ud),ALIGN); + error(ud==NULL,1,"new_bnd [block.c]", + "Unable to allocate the double-precision gauge field"); + set_ud2unity(vol,ud); + + for (ifc=0;ifc<8;ifc++) + { + bb[ifc].ud=ud; + ud+=bb[ifc].vol; + } + } + + if ((shf&0x40)||(nw==0)) + { + for (ifc=0;ifc<8;ifc++) + bb[ifc].w=NULL; + } + else + { + w=malloc(8*nw*sizeof(*w)); + wb=amalloc(nw*vol*sizeof(*wb),ALIGN); + error((w==NULL)||(wb==NULL),1,"new_bnd [block.c]", + "Unable to allocate the single-precision Weyl fields"); + set_w2zero(nw*vol,wb); + + for (ifc=0;ifc<8;ifc++) + { + bb[ifc].w=w; + + for (n=0;n1) + { + bo=(*b).bo; + bs=(*b).bs; + + for (mu=0;mu<4;mu++) + { + iprms[mu]=bo[mu]; + iprms[4+mu]=bs[mu]; + } + + iprms[8]=iu; + iprms[9]=iud; + iprms[10]=nw; + iprms[11]=nwd; + + MPI_Bcast(iprms,12,MPI_INT,0,MPI_COMM_WORLD); + + ie=0; + + for (mu=0;mu<4;mu++) + if ((iprms[mu]!=bo[mu])||(iprms[4+mu]!=bs[mu])) + ie=1; + + error((ie)||(iprms[8]!=iu)||(iprms[9]!=iud)|| + (iprms[10]!=nw)||(iprms[11]!=nwd),1,"alloc_bnd [block.c]", + "Parameters are not global"); + } + + new_bnd(b,iu,iud,nw,nwd,0x0); + bnd_geometry(b); + bnd_imbed(b); +} + + +void clone_blk(block_t *b,int shf,int *bo,block_t *c) +{ + int iprms[23],mu,ie; + int *bbo,*bs,bshf; + int iu,iud,ns,nsd,iub,iudb,nw,nwd; + int ib,ifc; + + error(fnd_blk(b)==0,1,"clone_blk [block.c]", + "The block to be cloned is not allocated"); + + bbo=(*b).bo; + bs=(*b).bs; + bshf=(*b).shf; + iu=((*b).u!=NULL); + iud=((*b).ud!=NULL); + ns=(*b).ns; + nsd=(*b).nsd; + + if ((*b).bb!=NULL) + { + iub=((*b).bb[0].u!=NULL); + iudb=((*b).bb[0].ud!=NULL); + nw=(*b).bb[0].nw; + nwd=(*b).bb[0].nwd; + ib=1; + } + else + { + iub=0; + iudb=0; + nw=0; + nwd=0; + ib=0; + } + + if (NPROC>1) + { + for (mu=0;mu<4;mu++) + { + iprms[mu]=bbo[mu]; + 
iprms[4+mu]=bs[mu]; + iprms[8+mu]=bo[mu]; + } + + iprms[12]=bshf; + iprms[13]=iu; + iprms[14]=iud; + iprms[15]=ns; + iprms[16]=nsd; + iprms[17]=iub; + iprms[18]=iudb; + iprms[19]=nw; + iprms[20]=nwd; + iprms[21]=ib; + iprms[22]=shf; + + MPI_Bcast(iprms,23,MPI_INT,0,MPI_COMM_WORLD); + + ie=0; + + for (mu=0;mu<4;mu++) + { + if ((iprms[mu]!=bbo[mu])|| + (iprms[4+mu]!=bs[mu])|| + (iprms[8+mu]!=bo[mu])) + ie=1; + } + + error((ie)||(iprms[12]!=bshf)||(iprms[13]!=iu)||(iprms[14]!=iud)|| + (iprms[15]!=ns)||(iprms[16]!=nsd)||(iprms[17]!=iub)|| + (iprms[18]!=iudb)||(iprms[19]!=nw)||(iprms[20]!=nwd)|| + (iprms[21]!=ib)||(iprms[22]!=shf),1,"clone_blk [block.c]", + "Parameters are not global"); + } + + error_root((bo[0]<0)||((bo[0]+bs[0])>L0)||((abs(bo[0]-bbo[0])%bs[0])!=0)|| + (bo[1]<0)||((bo[1]+bs[1])>L1)||((abs(bo[1]-bbo[1])%bs[1])!=0)|| + (bo[2]<0)||((bo[2]+bs[2])>L2)||((abs(bo[2]-bbo[2])%bs[2])!=0)|| + (bo[3]<0)||((bo[3]+bs[3])>L3)||((abs(bo[3]-bbo[3])%bs[3])!=0),1, + "clone_blk [block.c]","Improper block offset"); + + error_root(((bshf&0x2)&&(!(shf&0x2)))|| + ((bshf&0x4)&&(!(shf&0x4))&&(iu!=0))|| + ((bshf&0x8)&&(!(shf&0x8))&&(iud!=0))|| + ((bshf&0x10)&&(!(shf&0x10))&&(ns>0))|| + ((bshf&0x20)&&(!(shf&0x20))&&(nsd>0)),1, + "clone_blk [block.c]","Share flag mismatch"); + + new_blk(c,bo,bs,iu,iud,ns,nsd,shf); + + if (shf&0x2) + { + (*c).ipt=(*b).ipt; + (*c).iup=(*b).iup; + (*c).idn=(*b).idn; + } + + if ((shf&0x4)&&(iu!=0)) + { + (*c).u=(*b).u; + (*c).sw=(*b).sw; + } + + if ((shf&0x8)&&(iud!=0)) + { + (*c).ud=(*b).ud; + (*c).swd=(*b).swd; + } + + if ((shf&0x10)&&(ns>0)) + (*c).s=(*b).s; + + if ((shf&0x20)&&(nsd>0)) + (*c).sd=(*b).sd; + + if (!(shf&0x2)) + blk_geometry(c); + blk_imbed(c); + + if (ib) + { + error_root(((bshf&0x4)&&(!(shf&0x4))&&(iub!=0))|| + ((bshf&0x8)&&(!(shf&0x8))&&(iudb!=0))|| + ((bshf&0x40)&&(!(shf&0x40))&&(nw>0))|| + ((bshf&0x80)&&(!(shf&0x80))&&(nwd>0)),2, + "clone_blk [block.c]","Share flag mismatch"); + + new_bnd(c,iub,iudb,nw,nwd,shf); + + for 
(ifc=0;ifc<8;ifc++) + { + if (shf&0x2) + { + (*c).bb[ifc].ipp=(*b).bb[ifc].ipp; + (*c).bb[ifc].map=(*b).bb[ifc].map; + } + + if ((shf&0x4)&&(iub!=0)) + (*c).bb[ifc].u=(*b).bb[ifc].u; + + if ((shf&0x8)&&(iudb!=0)) + (*c).bb[ifc].ud=(*b).bb[ifc].ud; + + if ((shf&0x40)&&(nw>0)) + (*c).bb[ifc].w=(*b).bb[ifc].w; + + if ((shf&0x80)&&(nwd>0)) + (*c).bb[ifc].wd=(*b).bb[ifc].wd; + } + + if (!(shf&0x2)) + bnd_geometry(c); + bnd_imbed(c); + } +} + + +int ipt_blk(block_t *b,int *x) +{ + int *bs,n,ix; + + bs=(*b).bs; + + n=((x[0]<0)||(x[0]>=bs[0])); + ix=x[0]; + + n|=((x[1]<0)||(x[1]>=bs[1])); + ix=x[1]+bs[1]*ix; + + n|=((x[2]<0)||(x[2]>=bs[2])); + ix=x[2]+bs[2]*ix; + + n|=((x[3]<0)||(x[3]>=bs[3])); + ix=x[3]+bs[3]*ix; + + if (n==0) + return (*b).ipt[ix]; + else + { + error_loc(1,1,"ipt_blk [block.c]","Point coordinates are out of range"); + return 0; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/map_s2blk.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/map_s2blk.c new file mode 100644 index 0000000000000000000000000000000000000000..ca421c68574728e30d63d4570f7d8982ba7ad269 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/map_s2blk.c @@ -0,0 +1,890 @@ + +/******************************************************************************* +* +* File map_s2blk.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Copying of the spinor fields to and from the blocks in a block grid +* +* The externally accessible functions are +* +* void assign_s2sblk(blk_grid_t grid,int n,ptset_t set,spinor *s,int k) +* Assigns the relevant part of the global single-precision spinor field s +* to the single-precision field b.s[k] on the n'th block of the specified +* block grid. Depending on the specified point set, the field on the even, +* odd or all points is copied. 
+* +* void assign_sblk2s(blk_grid_t grid,int n,ptset_t set,int k,spinor *s) +* Assigns the single-precision spinor field b.s[k] on the n'th block of +* the specified block grid to the relevant part of the global single- +* precision field s. Depending on the specified point set, the field on +* the even, odd or all points is copied. +* +* void assign_s2sdblk(blk_grid_t grid,int n,ptset_t set,spinor *s,int k) +* Assigns the relevant part of the global single-precision spinor field s +* to the double-precision field b.sd[k] on the n'th block of the specified +* block grid. Depending on the specified point set, the field on the even, +* odd or all points is copied. +* +* void assign_sd2sdblk(blk_grid_t grid,int n,ptset_t set, +* spinor_dble *sd,int k) +* Assigns the relevant part of the global double-precision spinor field sd +* to the double-precision field b.sd[k] on the n'th block of the specified +* block grid. Depending on the specified point set, the field on the even, +* odd or all points is copied. +* +* void assign_sdblk2sd(blk_grid_t grid,int n,ptset_t set, +* int k,spinor_dble *sd) +* Assigns the double-precision spinor field b.sd[k] on the n'th block of +* the specified block grid to the relevant part of the global double- +* precision field sd. Depending on the specified point set, the field on +* the even, odd or all points is copied. +* +* Notes: +* +* Only the spinors residing on the blocks (but not those on the boundaries +* of the blocks) are copied. All these programs can be called locally. 
+* +*******************************************************************************/ + +#define MAP_S2BLK_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "block.h" +#include "global.h" + +#if (defined x64) +#include "sse2.h" + +void assign_s2sblk(blk_grid_t grid,int n,ptset_t set,spinor *s,int k) +{ + int nb,isw,vol,*imb; + spinor *sb,*sm,*rs1,*rs2; + block_t *b; + + b=blk_list(grid,&nb,&isw)+n; + + if ((n<0)||(n>=nb)) + { + error_loc(1,1,"assign_s2sblk [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).ns)) + { + error_loc(1,1,"assign_s2sblk [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).s[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + rs2=s+(*imb); + + for (;sb=nb)) + { + error_loc(1,1,"assign_sblk2s [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).ns)) + { + error_loc(1,1,"assign_sblk2s [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).s[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + for (;sb=nb)) + { + error_loc(1,1,"assign_s2sdblk [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).nsd)) + { + error_loc(1,1,"assign_s2sdblk [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).sd[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + rs2=s+(*imb); + + for (;sb=nb)) + { + 
error_loc(1,1,"assign_sd2sdblk [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).nsd)) + { + error_loc(1,1,"assign_sd2sdblk [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).sd[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + rs2=sd+(*imb); + + for (;sb=nb)) + { + error_loc(1,1,"assign_sdblk2sd [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).nsd)) + { + error_loc(1,1,"assign_sdblk2sd [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).sd[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + for (;sb=nb)) + { + error_loc(1,1,"assign_s2sblk [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).ns)) + { + error_loc(1,1,"assign_s2sblk [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).s[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + for (;sb=nb)) + { + error_loc(1,1,"assign_sblk2s [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).ns)) + { + error_loc(1,1,"assign_sblk2s [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).s[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + for (;sb=nb)) + { + error_loc(1,1,"assign_s2sdblk 
[map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).nsd)) + { + error_loc(1,1,"assign_s2sdblk [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).sd[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + for (;sb=nb)) + { + error_loc(1,1,"assign_sd2sdblk [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).nsd)) + { + error_loc(1,1,"assign_sd2sdblk [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).sd[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + for (;sb=nb)) + { + error_loc(1,1,"assign_sdblk2sd [map_s2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(k>=(*b).nsd)) + { + error_loc(1,1,"assign_sdblk2sd [map_s2blk.c]", + "Block field number is out of range"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + sb=(*b).sd[k]; + sm=sb; + + if (set==ALL_PTS) + sm+=vol; + else if (set==EVEN_PTS) + sm+=vol/2; + else if (set==ODD_PTS) + { + imb+=vol/2; + sb+=vol/2; + sm+=vol; + } + + for (;sb +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "sw_term.h" +#include "block.h" +#include "global.h" + +pauli_dble m[2] ALIGNED16; + + +static int cp_swd2sw(block_t *b,ptset_t set) +{ + int *imb,ifail; + pauli *pb,*pm; + pauli_dble *swd,*p; + + swd=swdfld(); + pb=(*b).sw; + pm=pb+(*b).vol; + imb=(*b).imb; + ifail=0; + + for (;pb1) + { + iprms[0]=(int)(grid); + iprms[1]=(int)(set); + + MPI_Bcast(iprms,2,MPI_INT,0,MPI_COMM_WORLD); + + error((iprms[0]!=(int)(grid))||(iprms[1]!=(int)(set)),1, + "assign_swd2swbgr 
[map_sw2blk.c]","Parameters are not global"); + } + + b=blk_list(grid,&nb,&isw); + + if (nb==0) + { + error_root(1,1,"assign_swd2swbgr [map_sw2blk.c]", + "Block grid is not allocated"); + return 0; + } + + if (((*b).sw==NULL)||((*b).shf&0x4)) + { + error_root(1,1,"assign_swd2swbgr [map_sw2blk.c]", + "SW field on the grid is not allocated or shared"); + return 0; + } + + ie=query_flags(SWD_E_INVERTED); + io=query_flags(SWD_O_INVERTED); + + error_root(((ie)&&((set==ALL_PTS)||(set==EVEN_PTS)))|| + ((io)&&((set==ALL_PTS)||(set==ODD_PTS))),1, + "assign_swd2swbgr [map_sw2blk.c]", + "Attempt to invert the SW field a second time"); + + bm=b+nb; + ifail=0; + + for (;b=nb)) + { + error_loc(1,1,"assign_swd2swdblk [map_sw2blk.c]", + "Block grid is not allocated or block number out of range"); + return 0; + } + + if (((*b).swd==NULL)||(!((*b).shf&0x8))) + { + error_loc(1,1,"assign_swd2swdblk [map_sw2blk.c]", + "Block field is not allocated or not shared"); + return 0; + } + + ie=query_flags(SWD_E_INVERTED); + io=query_flags(SWD_O_INVERTED); + + if (((ie)&&((set==ALL_PTS)||(set==EVEN_PTS)))|| + ((io)&&((set==ALL_PTS)||(set==ODD_PTS)))) + { + error_loc(1,1,"assign_swd2swdblk [map_sw2blk.c]", + "Attempt to invert the SW field a second time"); + return 0; + } + + return cp_swd2swd(b+n,set); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/map_u2blk.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/map_u2blk.c new file mode 100644 index 0000000000000000000000000000000000000000..9fb40cc023f7e42e5c5fd4c860e5e7d41331b161 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/block/map_u2blk.c @@ -0,0 +1,381 @@ + +/******************************************************************************* +* +* File map_u2blk.c +* +* Copyright (C) 2006, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Copying of the 
gauge fields to the blocks in a block grid. +* +* The externally accessible functions are +* +* void assign_ud2ubgr(blk_grid_t grid) +* Assigns the global double-precision gauge field to the corresponding +* single-precision fields in the specified block grid (see the notes). +* +* void assign_ud2udblk(blk_grid_t grid,int n) +* Assigns the global double-precision gauge field to the corresponding +* double-precision field on the n'th block of the specified block grid +* (see the notes). +* +* Notes: +* +* The program assign_ud2ubgr() copies the gauge field to all blocks and their +* exterior boundaries (if the field is allocated there). An error occurs if +* the single-precision gauge field on the blocks is shared. On the exterior +* block boundaries at time 0 (boundary conditions type 0,1 and 2) and time +* NPROC0*L0-1 (boundary condition type 0), the link variables are not copied +* and are instead set to zero. +* +* The program assign_ud2udblk() does *not* copy the link variables to the +* boundaries of the block. The double-precision gauge field on the blocks +* must be shared in this case. +* +* As explained in README.block, the field arrays on the blocks reserve space +* for all 8 link variables at the odd points, including those on the links +* that "stick out" of the block. While the latter are used for technical +* purposes only, the programs in this module copy these too. +* +* Both programs in this module may involve communications and must be called +* on all MPI processes simultaneously. 
+* +*******************************************************************************/ + +#define MAP_U2BLK_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "block.h" +#include "global.h" + +static int bc,np,nmu[8],nbf[8],ofs[8]; +static int sflg[8],rflg[8],tags[8],init=0; +static const su3 u0={{0.0f}}; +static su3 *ubuf; + + +static void alloc_ubuf(void) +{ + int ifc,ib; + + error(iup[0][0]==0,1,"alloc_ubuf [map_u2blk.c]", + "Geometry arrays are not set"); + + bc=bc_type(); + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; + + nbf[0]=FACE0/2; + nbf[1]=FACE0/2; + nbf[2]=FACE1/2; + nbf[3]=FACE1/2; + nbf[4]=FACE2/2; + nbf[5]=FACE2/2; + nbf[6]=FACE3/2; + nbf[7]=FACE3/2; + + ofs[0]=0; + + for (ifc=0;ifc<8;ifc++) + { + nmu[ifc]=cpr[ifc/2]&0x1; + + if (ifc>0) + ofs[ifc]=ofs[ifc-1]+nbf[ifc-1]; + + sflg[ifc]=((ifc>1)|| + ((ifc==0)&&((cpr[0]!=0)||(bc!=0)))|| + ((ifc==1)&&((cpr[0]!=(NPROC0-1))||(bc==3)))); + + rflg[ifc]=((ifc>1)|| + ((ifc==0)&&((cpr[0]!=0)||(bc==3)))|| + ((ifc==1)&&((cpr[0]!=(NPROC0-1))||(bc!=0)))); + + tags[ifc]=mpi_permanent_tag(); + } + + if (BNDRY>0) + { + ubuf=amalloc(BNDRY*sizeof(*ubuf),ALIGN); + error(ubuf==NULL,1,"alloc_ubuf [map_u2blk.c]", + "Unable to allocate communication buffer"); + + for (ib=0;ib0) + { + io=(ifc^nmu[ifc])^0x1; + + sbuf=sbuf0+ofs[io^0x1]; + rbuf=rbuf0+ofs[io]; + saddr=npr[io]; + raddr=saddr; + + n=18*nbf[ifc]; + tag=tags[ifc]; + + if (np==0) + { + if (sflg[io]) + MPI_Send(sbuf,n,MPI_FLOAT,saddr,tag,MPI_COMM_WORLD); + if (rflg[io]) + MPI_Recv(rbuf,n,MPI_FLOAT,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + if (rflg[io]) + MPI_Recv(rbuf,n,MPI_FLOAT,raddr,tag,MPI_COMM_WORLD,&stat); + if (sflg[io]) + MPI_Send(sbuf,n,MPI_FLOAT,saddr,tag,MPI_COMM_WORLD); + } + } + } +} + + +static void assign_ud2ub(block_t *b) +{ + int vol,volb,ifc,ibd,ibu; + int ix,iy,*imb,*ipp,*imbb; + su3 *u,*ub; + su3_dble *udb,*vd; + bndry_t *bb; + + 
vol=(*b).vol; + imb=(*b).imb; + + udb=udfld(); + u=(*b).u; + + for (ix=(vol/2);ix1)||((ifc==0)&&(ibd==0))||((ifc==1)&&(ibu==0))) + { + ipp=(*bb).ipp; + + for (ix=0;ix<(volb/2);ix++) + { + iy=ipp[ix]; + (*u)=ub[8*(iy-(vol/2))+(ifc^0x1)]; + u+=1; + } + + imbb=(*bb).imb; + + for (;ix1) + { + iprms[0]=(int)(grid); + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=(int)(grid),1,"assign_u2ubgr [map_u2blk.c]", + "Parameter is not global"); + } + + if (init==0) + alloc_ubuf(); + + b=blk_list(grid,&nb,&isw); + + error((b==NULL)||((*b).u==NULL)||((*b).shf&0x4),1, + "assign_u2ubgr [map_u2blk.c]","Unallocated or improper block grid"); + + if (NPROC>1) + { + fetch_bnd_u(); + send_bnd_u(); + } + + bm=b+nb; + + for (;b=nb)) + { + error_loc(1,1,"assign_ud2udblk [map_u2blk.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + b+=n; + + if (((*b).ud==NULL)||(((*b).shf&0x8)==0)) + { + error_loc(1,1,"assign_ud2udblk [map_u2blk.c]", + "Block field is not allocated or not shared"); + return; + } + + vol=(*b).vol; + imb=(*b).imb; + ud=(*b).ud; + udb=udfld(); + + for (ix=(vol/2);ix=0, the deflation + subspace is regenerated by calling dfl_modes(). The solver program + dfl_sap_gcr() is then called again and the results are passed to + the calling program. + +void dfl_sd2vd(spinor_dble *sd,complex_dble *vd) + Assigns the components of the global double-precision spinor field + sd along the deflation subspace to the double-precision vector + field vd. + +void dfl_vd2sd(complex_dble *vd,spinor_dble *sd) + Assigns the element of the deflation subspace corresponding to the + double-precision vector field vd to the global double-precision spinor + field sd. + +void dfl_sub_vd2sd(complex_dble *vd,spinor_dble *sd) + Subtracts the element of the deflation subspace corresponding to the + double-precision vector field vd from the global double-precision + spinor field sd. 
+ +void dfl_s2v(spinor *s,complex *v) + Assigns the components of the global single-precision spinor field + s along the deflation subspace to the single-precision vector + field v. + +void dfl_v2s(complex *v,spinor *s) + Assigns the element of the deflation subspace corresponding to the + single-precision vector field v to the global single-precision spinor + field s. + +void dfl_sub_v2s(complex *v,spinor *s) + Subtracts the element of the deflation subspace corresponding to the + single-precision vector field v from the global single-precision spinor + field s. + +void dfl_subspace(spinor **mds) + Copies the global single-precision spinor fields mds[0],..,mds[Ns-1] + to the fields b.sd[1],..,b.sd[Ns] on the blocks b of the DFL_BLOCKS + grid. The block fields are then orthonormalized and are assigned to + the single-precision block fields b.s[1],..,b.s[Ns]. + In this basis of fields, the modes mds[0],..,mds[Ns-1] are given by + fields vmds[0],..,vmds[Ns-1] of Ns*nb complex numbers, where nb is + the number of blocks in the block grid. These fields are assigned to + the last Ns single-precision vector fields of the array returned by + vflds() [vflds/vflds.c]. + +double ltl_gcr(int nkv,int nmx,double res,double mu, + complex_dble *eta,complex_dble *psi,int *status) + Obtains an approximate solution psi of the little Dirac equation for + given source eta using the even-odd preconditioned GCR algorithm. See + the notes for the explanation of the parameters of the program. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_geometry.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_geometry.c new file mode 100644 index 0000000000000000000000000000000000000000..74b9a05f6f4d95e21eb3a6434928ac280ac3c53d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_geometry.c @@ -0,0 +1,406 @@ + +/******************************************************************************* +* +* File dfl_geometry.c +* +* Copyright (C) 2007, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Geometry of the DFL_BLOCKS block grid. +* +* The externally accessible functions are +* +* dfl_grid_t dfl_geometry(void) +* Returns a structure containing the index arrays that describe the +* geometry of the DFL_BLOCKS block grid (see the notes). +* +* Notes: +* +* The blocks in the DFL_BLOCKS grid form a hypercubic lattice whose geometry +* is described by a structure of type dfl_grid_t. The elements of this +* structure are: +* +* nb Number of blocks in the local lattice. +* +* nbb Number of exterior boundary blocks of the local +* block lattice. +* +* inn[ix][ifc] Index of the nearest neighbour block in direction ifc +* of the block with index ix (ix=0,..,nb-1, ifc=0,..,7). +* The ordering of the directions ifc is -0,+0,..,-3,+3. +* +* idx[ix] Position of the block with index ix in the array of +* blocks returned by blk_list(). Note that ix=idx[ib] +* if ib=idx[ix]. +* +* ipp[ix] Index of the nearest neighbour (partner block) in the +* local lattice of the block on the exterior boundary +* with index nb+ix (ix=0,..,nbb-1). +* +* map[ix] Index of the partner block on the opposite face of the +* local lattice of the block on the exterior boundary +* with index nb+ix (ix=0,..,nbb-1). +* +* nbbe[ifc] Number of even (odd) blocks on the exterior boundary +* nbbo[ifc] in direction ifc. 
+* +* obbe[ifc] Offset of the index of the first even (odd) block on +* obbo[ifc] the exterior boundary in direction ifc. The offsets +* are given relative to the first block on the boundary. +* +* The blocks in the local lattice are ordered according to their Cartesian +* coordinates (n0,n1,n2,n3) in the total block lattice. First come all even +* blocks (those with (n0+n1+n2+n3)=0 mod 2) and then the odd ones. Within +* each of these two groups of blocks, the ordering is lexicographic, i.e. +* the block with coordinates n comes before the block with coordinates m if +* +* (n0 +#include +#include +#include "flags.h" +#include "utils.h" +#include "dfl.h" +#include "global.h" + +static int isw,init=0; +static int nbl[4],nbb[4]; +static dfl_grid_t dfl_grid; + + +static void set_grid_sizes(void) +{ + int mu,ifc; + int *bs,*nbbe,*nbbo,*obbe,*obbo; + dfl_parms_t dfl; + + dfl=dfl_parms(); + bs=dfl.bs; + + error_root(dfl.Ns==0,1,"set_grid_sizes [dfl_geometry.c]", + "Deflation subspace parameters are not set"); + + nbl[0]=L0/bs[0]; + nbl[1]=L1/bs[1]; + nbl[2]=L2/bs[2]; + nbl[3]=L3/bs[3]; + + nbb[0]=(NPROC0>1)*nbl[1]*nbl[2]*nbl[3]; + nbb[1]=(NPROC1>1)*nbl[2]*nbl[3]*nbl[0]; + nbb[2]=(NPROC2>1)*nbl[3]*nbl[0]*nbl[1]; + nbb[3]=(NPROC3>1)*nbl[0]*nbl[1]*nbl[2]; + + isw=(nbl[0]*cpr[0]+nbl[1]*cpr[1]+ + nbl[2]*cpr[2]+nbl[3]*cpr[3])&0x1; + + dfl_grid.nb=nbl[0]*nbl[1]*nbl[2]*nbl[3]; + dfl_grid.nbb=2*(nbb[0]+nbb[1]+nbb[2]+nbb[3]); + + nbbe=dfl_grid.nbbe; + nbbo=dfl_grid.nbbo; + obbe=dfl_grid.obbe; + obbo=dfl_grid.obbo; + + for (mu=0;mu<4;mu++) + { + if (isw) + { + nbbe[2*mu]=(nbb[mu]+1)/2; + nbbo[2*mu]=nbb[mu]-nbbe[2*mu]; + } + else + { + nbbo[2*mu]=(nbb[mu]+1)/2; + nbbe[2*mu]=nbb[mu]-nbbo[2*mu]; + } + + if (nbl[mu]&0x1) + { + nbbe[2*mu+1]=nbbe[2*mu]; + nbbo[2*mu+1]=nbbo[2*mu]; + } + else + { + nbbe[2*mu+1]=nbbo[2*mu]; + nbbo[2*mu+1]=nbbe[2*mu]; + } + } + + obbe[0]=0; + + for (ifc=1;ifc<8;ifc++) + obbe[ifc]=obbe[ifc-1]+nbbe[ifc-1]; + + obbo[0]=obbe[7]+nbbe[7]; + + for 
(ifc=1;ifc<8;ifc++) + obbo[ifc]=obbo[ifc-1]+nbbo[ifc-1]; +} + + +static void alloc_arrays(void) +{ + int nb,nbb; + int (*inn)[8],*idx; + + nb=dfl_grid.nb; + nbb=dfl_grid.nbb; + inn=malloc(nb*sizeof(*inn)); + idx=malloc((nb+2*nbb)*sizeof(*idx)); + + error((inn==NULL)||(idx==NULL),1,"alloc_arrays [dfl_geometry.c]", + "Unable to allocate index arrays"); + + dfl_grid.inn=inn; + dfl_grid.idx=idx; + idx+=nb; + dfl_grid.ipp=idx; + idx+=nbb; + dfl_grid.map=idx; +} + + +static void set_index(void) +{ + int n0,n1,n2,n3; + int in,ic[2],*idx; + + in=0; + ic[0]=0; + ic[1]=dfl_grid.nb/2; + idx=dfl_grid.idx; + + for (n0=0;n01) + { + if (n0==0) + inn[in][0]=nb; + if (n0==(nbl[0]-1)) + inn[in][1]=nb; + } + if (NPROC1>1) + { + if (n1==0) + inn[in][2]=nb; + if (n1==(nbl[1]-1)) + inn[in][3]=nb; + } + if (NPROC2>1) + { + if (n2==0) + inn[in][4]=nb; + if (n2==(nbl[2]-1)) + inn[in][5]=nb; + } + if (NPROC3>1) + { + if (n3==0) + inn[in][6]=nb; + if (n3==(nbl[3]-1)) + inn[in][7]=nb; + } + } + } + } + } + + obbe=dfl_grid.obbe; + obbo=dfl_grid.obbo; + + for (ifc=0;ifc<8;ifc++) + ic[ifc]=0; + + for (in=0;in=nb) + { + ipp[im-nb]=in; + + ip=in; + iq=in; + + while (ip=4). +* +* nmr Number of block minimal residual iterations to be +* used when the SAP smoother is applied. +* +* ncy Number of SAP cycles per inverse iteration. +* +* All these are set by set_dfl_gen_parms(). Additionally, the values of +* parameters +* +* nkv Maximal number of Krylov vectors to be used by the +* solver for the little Dirac equation before a restart. +* +* nmx Maximal total number of Krylov vectors generated by +* the solver for the little Dirac equation. +* +* res Required relative residue when solving the little +* Dirac equation. +* +* are set by set_dfl_pro_parms(). +* +* On exit the argument status[0] reports the average solver iteration numbers +* that were required for the solution of the little Dirac equation. 
A negative +* value indicates that the program failed (-1: the solver did not converge, -2: +* the inversion of the SW term was not safe, -3: the inversion of the diagonal +* part of the little Dirac operator was not safe). In all these cases, the +* deflation subspace is initialized with the fields that were computed before +* the failure occurred. +* +* The programs dfl_modes2() and dfl_update2() can be used in place of the +* programs dfl_modes() and dfl_update(), respectively, if some protection +* against the rare cases, where the little Dirac operator turns out to be +* accidentally ill-conditioned, is desired. +* +* The programs in this module perform global operations and must be called +* simultaneously on all MPI processes. The required workspaces are +* +* spinor Ns+2 (Ns: number of deflation modes per block) +* complex 2*nkv+2 +* complex_dble 4 +* +* (see utils/wspace.c) +* +* Some debugging output is printed to stdout on process 0 if DFL_MODES_DBG is +* defined at compilation time. 
+* +*******************************************************************************/ + +#define DFL_MODES_C + +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "random.h" +#include "lattice.h" +#include "block.h" +#include "uflds.h" +#include "sflds.h" +#include "vflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "sap.h" +#include "little.h" +#include "dfl.h" +#include "global.h" + +typedef union +{ + spinor s; + float r[24]; +} spin_t; + +static int my_rank,eoflg; +static int Ns=0,nv,nrn; +static double m0; +static complex_dble *cs1,*cs2; +static dfl_pro_parms_t dpr; +static dfl_gen_parms_t dgn; + +#ifdef DFL_MODES_DBG + +static void print_res(spinor **mds) +{ + int k; + double r; + spinor **ws; + + ws=reserve_ws(1); + + for (k=0;k1) + { + MPI_Reduce(cs1,cs2,2*n,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(cs2,2*n,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + for (k=0;k0) + { + for (l=0;l %.1e, ratio = %.1e\n",k,r0,r1,r1/r0); + } + + release_ws(); +} + + +static void dfl_smooth_fields(spinor **mds,int *status) +{ + int k,l,stat; + double r0,r1; + complex **vs,**wv; + complex_dble **wvd; + spinor **ws; + + vs=vflds()+Ns; + wv=reserve_wv(1); + wvd=reserve_wvd(1); + ws=reserve_ws(2); + r0=1.0; + r1=1.0; + + for (k=0;k %.1e, ratio = %.1e\n", + k,stat,r0,r1,r1/r0); + + mulr_spinor_add(VOLUME,mds[k],ws[1],1.0f); + + if (status[0]>=0) + { + if (stat>=0) + status[0]+=stat; + else + status[0]=stat; + } + } + + release_ws(); + release_wvd(); + release_wv(); +} + +#else + +static void smooth_fields(int ncy,spinor **mds) +{ + int k,l; + spinor **ws; + + ws=reserve_ws(1); + + for (k=0;k=0) + { + if (stat>=0) + status[0]+=stat; + else + status[0]=stat; + } + } + + release_ws(); + release_wvd(); + release_wv(); +} + +#endif + +void dfl_modes(int *status) +{ + int n,ifail; + spinor **mds; + + status[0]=0; + ifail=set_frame(); + mds=reserve_ws(Ns); + random_fields(mds); + +#ifdef DFL_MODES_DBG + 
if (my_rank==0) + { + printf("Progress report [program dfl_modes]:\n\n"); + printf("Ns = %d, ninv = %d, nmr = %d, ncy = %d\n", + Ns,dgn.ninv,dgn.nmr,dgn.ncy); + printf("nkv = %d, nmx = %d, res = %.1e, ifail = %d\n\n", + dpr.nkv,dpr.nmx,dpr.res,ifail); + } +#endif + + if (ifail) + { + dfl_subspace(mds); + status[0]=-2; + } + else + { + for (n=0;n<3;n++) + { + smooth_fields(n+1,mds); + +#ifdef DFL_MODES_DBG + print_res(mds); +#endif + } + + for (;n3) + renormalize_fields(mds); + + dfl_subspace(mds); + ifail=set_Awhat(dgn.mu); + + if (ifail) + { + status[0]=-3; + break; + } + else + { + dfl_smooth_fields(mds,status); + nrn+=1; + + if (status[0]<0) + break; + +#ifdef DFL_MODES_DBG + print_res(mds); +#endif + } + } + + if (status[0]>=0) + { + dfl_subspace(mds); + n=Ns*(dgn.ninv-3); + status[0]=(status[0]+n/2)/n; + } + } + + release_ws(); + set_sw_parms(m0); + if (eoflg!=1) + set_tm_parms(eoflg); + +#ifdef DFL_MODES_DBG + if (my_rank==0) + { + printf("status = %d\n",status[0]); + printf("dfl_modes: all done\n\n"); + fflush(stdout); + } +#endif +} + + +void dfl_update(int nsm,int *status) +{ + int n,ifail,iprms[1]; + spinor **mds; + + if (NPROC>1) + { + iprms[0]=nsm; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=nsm,1,"dfl_update [dfl_modes.c]", + "Parameters are not global"); + } + + status[0]=0; + ifail=set_frame(); + mds=reserve_ws(Ns); + restore_fields(mds); + +#ifdef DFL_MODES_DBG + if (my_rank==0) + { + printf("Progress report [program dfl_update]:\n\n"); + printf("nsm = %d\n",nsm); + printf("Ns = %d, ninv = %d, nmr = %d, ncy = %d\n", + Ns,dgn.ninv,dgn.nmr,dgn.ncy); + printf("nkv = %d, nmx = %d, res = %.1e, ifail = %d\n\n", + dpr.nkv,dpr.nmx,dpr.res,ifail); + } +#endif + + if (ifail) + status[0]=-2; + else + { + for (n=0;n3)&&(n<(nsm-1))) + renormalize_fields(mds); + + dfl_subspace(mds); + +#ifdef DFL_MODES_DBG + print_res(mds); +#endif + } + } + } + + if (status[0]>0) + { + n=Ns*nsm; + status[0]=(status[0]+n/2)/n; + } + + release_ws(); + 
set_sw_parms(m0); + if (eoflg!=1) + set_tm_parms(eoflg); + +#ifdef DFL_MODES_DBG + if (my_rank==0) + { + printf("status = %d\n",status[0]); + printf("dfl_update: all done\n\n"); + fflush(stdout); + } +#endif +} + + +void dfl_modes2(int *status) +{ + dfl_modes(status); + + if (status[0]==-3) + { +#ifdef DFL_MODES_DBG + if (my_rank==0) + { + printf("Generation of deflation subspace failed\n"); + printf("Start second attempt\n"); + fflush(stdout); + } +#endif + + dfl_modes(status+1); + } + else + status[1]=0; +} + + +void dfl_update2(int nsm,int *status) +{ + dfl_update(nsm,status); + + if (status[0]==-3) + { +#ifdef DFL_MODES_DBG + if (my_rank==0) + { + printf("Update of deflation subspace failed\n"); + printf("Attempt to regenerate subspace\n"); + fflush(stdout); + } +#endif + + dfl_modes(status+1); + } + else + status[1]=0; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_sap_gcr.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_sap_gcr.c new file mode 100644 index 0000000000000000000000000000000000000000..9c066e95dceca9f2928c8ddbc3189ccb37fac5fe --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_sap_gcr.c @@ -0,0 +1,381 @@ + +/******************************************************************************* +* +* File dfl_sap_gcr.c +* +* Copyright (C) 2007, 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* SAP+GCR solver for the Wilson-Dirac equation with local deflation. +* +* The externally accessible functions are +* +* double dfl_sap_gcr(int nkv,int nmx,double res,double mu, +* spinor_dble *eta,spinor_dble *psi,int *status) +* Obtains an approximate solution psi of the Wilson-Dirac equation for +* given source eta using the deflated SAP-preconditioned GCR algorithm. +* See the notes for the explanation of the parameters of the program. 
+* +* double dfl_sap_gcr2(int nkv,int nmx,double res,double mu, +* spinor_dble *eta,spinor_dble *psi,int *status) +* This program calls dfl_sap_gcr() with the parameters nkv,..,status. +* If the solver fails and status[0]=-3 or status[1]<0, the deflation +* subspace is regenerated by calling dfl_modes(). The solver program +* dfl_sap_gcr() is then called again and the results are passed to +* the calling program. +* +* Depending on whether the twisted-mass flag is set or not, the programs +* solve the equation +* +* (Dw+i*mu*gamma_5*1e)*psi=eta or (Dw+i*mu*gamma_5)*psi=eta +* +* respectively. The twisted-mass flag is retrieved from the parameter data +* base (see flags/lat_parms.c). + +* The program dfl_sap_gcr() is based on the flexible GCR algorithm (see +* linsolv/fgcr.c). Before the solver is launched, the following parameter- +* setting programs must have been called: +* +* set_lat_parms() SW improvement coefficient. +* +* set_bc_parms() Boundary conditions and associated improvement +* coefficients. +* +* set_sw_parms() Bare quark mass. +* +* set_sap_parms() Parameters of the SAP preconditioner. +* +* set_dfl_parms() Parameters of the deflation subspace. +* +* set_dfl_pro_parms() Parameters used for the deflation projection. +* +* See doc/parms.pdf and the relevant files in the modules/flags directory +* for further explanations. The deflation subspace must have been properly +* initialized by the program dfl_subspace(). +* +* All other parameters are passed through the argument list: +* +* nkv Maximal number of Krylov vectors generated before the GCR +* algorithm is restarted. +* +* nmx Maximal total number of Krylov vectors that may be generated. +* +* res Desired maximal relative residue |eta-D*psi|/|eta| of the +* calculated solution. +* +* mu Value of the twisted mass in the Dirac equation. +* +* eta Source field. Note that source fields must vanish at global +* time 0 and NPROC0*L0-1, as has to be the case for physical +* quark fields. 
eta is unchanged on exit unless psi=eta (which +* is permissible). +* +* psi Calculated approximate solution of the Dirac equation. psi +* vanishes at global time 0 and NPROC0*L0-1. +* +* The argument status must point to an array of at least 2 and 3 integers +* in the case of the programs dfl_sap_gcr() and dfl_sap_gcr2(), respectively. +* On exit, the array elements contain the following values: +* +* status[0] If the program is able to solve the Dirac equation to the +* desired accuracy, status[0] reports the total number of Krylov +* vectors that were required for the solution. Negative values +* indicate that the program failed (-1: the algorithm did not +* converge, -2: the inversion of the SW term on the odd points +* was not safe, -3: the inversion of the diagonal parts of the +* little Dirac operator was not safe). +* +* status[1] Average number of GCR iterations needed for the solution of +* the little Dirac equation in the course of the deflation +* projection. +* +* The program dfl_sap_gcr2() in addition returns +* +* status[2] Average solver iteration numbers that were required for the +* solution of the little Dirac equation when the deflation sub- +* space had to be regenerated (if the regeneration fails, the +* dfl_sap_gcr2() program terminates with an error message). +* +* If status[0]>=-1 and status[1]>=0, the programs return the norm of the +* residue of the calculated approximate solution. Otherwise the field psi +* is set to zero and the program returns the norm of the source eta. +* +* The SAP_BLOCKS block grid is automatically allocated or reallocated if +* it is not already allocated with the correct block size. The SW term is +* recalculated when needed and the gauge and SW fields are copied to the +* SAP block grid if they are not in the proper condition. Similarly, the +* little Dirac operator is updated when needed. 
+* +* The program dfl_sap_gcr2() can be used in place of dfl_sap_gcr() if +* some protection against the rare cases, where the little Dirac operator +* turns out to be accidentally ill-conditioned, is desired. +* +* Evidently the SAP+GCR solver is a global program that must be called on +* all processes simultaneously. The required workspaces are +* +* spinor 2*nkv+2 +* spinor_dble 3 [2 in the case of dfl_sap_gcr()] +* complex 2*nkv_pro+2 +* complex_dble 4 +* +* (see utils/wspace.c), where nkv_pro, the maximal number of Krylov vectors +* generated before the GCR solver of the little Dirac equation is restarted, +* is a parameter set by dfl_pro_parms(). +* +*******************************************************************************/ + +#define DFL_SAP_GCR_C + +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "block.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "linsolv.h" +#include "sap.h" +#include "vflds.h" +#include "little.h" +#include "dfl.h" +#include "global.h" + +static int nit,stat,nv; +static float mus; +static double mud; +static sap_parms_t spr; +static dfl_pro_parms_t dpr; + + +static void Dop(spinor_dble *s,spinor_dble *r) +{ + Dw_dble(mud,s,r); +} + + +static void Mop(int k,spinor *rho,spinor *phi,spinor *chi) +{ + int n,status; + complex **wv; + complex_dble **wvd; + spinor **ws; + + wv=reserve_wv(1); + wvd=reserve_wvd(1); + ws=reserve_ws(1); + + dfl_s2v(rho,wv[0]); + assign_v2vd(nv,wv[0],wvd[0]); + ltl_gcr(dpr.nkv,dpr.nmx,dpr.res,mud,wvd[0],wvd[0],&status); + assign_vd2v(nv,wvd[0],wv[0]); + dfl_v2s(wv[0],ws[0]); + + Dw(mus,ws[0],chi); + diff_s2s(VOLUME,rho,chi); + set_s2zero(VOLUME,phi); + + for (n=0;n=0) + { + if (status>=0) + { + nit+=1; + stat+=status; + } + else + stat=status; + } + + release_ws(); + release_wvd(); + release_wv(); +} + + +double dfl_sap_gcr(int nkv,int nmx,double res,double mu, + spinor_dble *eta,spinor_dble 
*psi,int *status) +{ + int *bs,nb,isw,ifail; + int swde,swdo,swu,swe,swo; + double rho,rho0,fact; + spinor **ws; + spinor_dble **wsd,**rsd; + dfl_parms_t dfl; + + dfl=dfl_parms(); + error_root(dfl.Ns==0,1,"dfl_sap_gcr [dfl_sap_gcr.c]", + "Deflation parameters are not set"); + bs=dfl.bs; + nv=dfl.Ns*VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + + spr=sap_parms(); + error_root(spr.ncy==0,1,"dfl_sap_gcr [dfl_sap_gcr.c]", + "SAP parameters are not set"); + + dpr=dfl_pro_parms(); + error_root(dpr.nkv==0,1,"dfl_sap_gcr [dfl_sap_gcr.c]", + "Deflation projector parameters are not set"); + + blk_list(SAP_BLOCKS,&nb,&isw); + + if (nb==0) + alloc_bgr(SAP_BLOCKS); + + if (query_grid_flags(SAP_BLOCKS,UBGR_MATCH_UD)!=1) + assign_ud2ubgr(SAP_BLOCKS); + + if (query_flags(SWD_UP2DATE)!=1) + sw_term(NO_PTS); + + swde=query_flags(SWD_E_INVERTED); + swdo=query_flags(SWD_O_INVERTED); + + swu=query_grid_flags(SAP_BLOCKS,SW_UP2DATE); + swe=query_grid_flags(SAP_BLOCKS,SW_E_INVERTED); + swo=query_grid_flags(SAP_BLOCKS,SW_O_INVERTED); + ifail=0; + + if (spr.isolv==0) + { + if ((swde==1)||(swdo==1)) + sw_term(NO_PTS); + + if ((swu!=1)||(swe==1)||(swo==1)) + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + } + else if (spr.isolv==1) + { + if ((swde!=1)&&(swdo==1)) + { + if ((swu!=1)||(swe==1)||(swo!=1)) + assign_swd2swbgr(SAP_BLOCKS,NO_PTS); + + sw_term(NO_PTS); + } + else + { + if ((swde==1)||(swdo==1)) + sw_term(NO_PTS); + + if ((swu!=1)||(swe==1)||(swo!=1)) + ifail=assign_swd2swbgr(SAP_BLOCKS,ODD_PTS); + } + } + else + error_root(1,1,"dfl_sap_gcr [dfl_sap_gcr.c]","Unknown block solver"); + + if (query_flags(U_MATCH_UD)!=1) + assign_ud2u(); + + if ((query_flags(SW_UP2DATE)!=1)|| + (query_flags(SW_E_INVERTED)==1)||(query_flags(SW_O_INVERTED)==1)) + assign_swd2sw(); + + rho0=sqrt(norm_square_dble(VOLUME,1,eta)); + rho=rho0; + status[0]=0; + status[1]=0; + + if (ifail) + status[0]=-2; + else + { + ifail=set_Awhat(mu); + + if (ifail) + status[0]=-3; + else + { + ws=reserve_ws(2*nkv+1); + wsd=reserve_wsd(1); + 
rsd=reserve_wsd(1); + + nit=0; + stat=0; + mus=(float)(mu); + mud=mu; + + fact=rho0/sqrt((double)(VOLUME)*(double)(24*NPROC)); + + if (fact!=0.0) + { + assign_sd2sd(VOLUME,eta,rsd[0]); + scale_dble(VOLUME,1.0/fact,rsd[0]); + + rho=fgcr(VOLUME,1,Dop,Mop,ws,wsd,nkv,nmx,res,rsd[0],psi,status); + + scale_dble(VOLUME,fact,psi); + rho*=fact; + + if ((nit>0)&&(stat>=0)) + status[1]=(stat+nit/2)/nit; + else if (stat<0) + status[1]=stat; + } + else + { + rho=0.0; + set_sd2zero(VOLUME,psi); + } + + release_wsd(); + release_wsd(); + release_ws(); + } + } + + if ((status[0]<-1)||(status[1]<0)) + { + rho=rho0; + set_sd2zero(VOLUME,psi); + } + + return rho; +} + + +double dfl_sap_gcr2(int nkv,int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status) +{ + double rho; + spinor_dble **wsd; + + wsd=reserve_wsd(1); + + if (eta==psi) + { + assign_sd2sd(VOLUME,eta,wsd[0]); + eta=wsd[0]; + } + + rho=dfl_sap_gcr(nkv,nmx,res,mu,eta,psi,status); + + if ((status[0]==-3)||(status[1]<0)) + { + dfl_modes(status+2); + + error_root(status[2]<0,1,"dfl_sap_gcr2 [dfl_sap_gcr.c]", + "Deflation subspace regeneration failed (status = %d)", + status[2]); + + rho=dfl_sap_gcr(nkv,nmx,res,mu,eta,psi,status); + } + else + status[2]=0; + + release_wsd(); + + return rho; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_subspace.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_subspace.c new file mode 100644 index 0000000000000000000000000000000000000000..c196014100807d65a118e40b6aaeadd2f5270008 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/dfl/dfl_subspace.c @@ -0,0 +1,378 @@ + +/******************************************************************************* +* +* File dfl_subspace.c +* +* Copyright (C) 2007, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Basic utility programs related to the 
deflation subspace. +* +* The externally accessible functions are +* +* void dfl_sd2vd(spinor_dble *sd,complex_dble *vd) +* Assigns the components of the global double-precision spinor field +* sd along the deflation subspace to the double-precision vector +* field vd. +* +* void dfl_vd2sd(complex_dble *vd,spinor_dble *sd) +* Assigns the element of the deflation subspace corresponding to the +* double-precision vector field vd to the global double-precision spinor +* field sd. +* +* void dfl_sub_vd2sd(complex_dble *vd,spinor_dble *sd) +* Subtracts the element of the deflation subspace corresponding to the +* double-precision vector field vd from the global double-precision +* spinor field sd. +* +* void dfl_s2v(spinor *s,complex *v) +* Assigns the components of the global single-precision spinor field +* s along the deflation subspace to the single-precision vector +* field v. +* +* void dfl_v2s(complex *v,spinor *s) +* Assigns the element of the deflation subspace corresponding to the +* single-precision vector field v to the global single-precision spinor +* field s. +* +* void dfl_sub_v2s(complex *v,spinor *s) +* Subtracts the element of the deflation subspace corresponding to the +* single-precision vector field v from the global single-precision spinor +* field s. +* +* void dfl_subspace(spinor **mds) +* Copies the global single-precision spinor fields mds[0],..,mds[Ns-1] +* to the fields b.sd[1],..,b.sd[Ns] on the blocks b of the DFL_BLOCKS +* grid. The block fields are then orthonormalized and are assigned to +* the single-precision block fields b.s[1],..,b.s[Ns]. +* In this basis of fields, the modes mds[0],..,mds[Ns-1] are given by +* fields vmds[0],..,vmds[Ns-1] of Ns*nb complex numbers, where nb is +* the number of blocks in the block grid. These fields are assigned to +* the last Ns single-precision vector fields of the array returned by +* vflds() [vflds/vflds.c]. 
+* +* Notes: +* +* The deflation subspace is spanned by the fields (*b).sd[1],..,(*b).sd[Ns] +* on the blocks b of the DFL_BLOCKS grid. The number Ns of fields is set by +* the program dfl_set_parms() [flags/dfl_parms.c]. +* +* Any spinor field in the deflation subspace is a linear combination of the +* basis elements on the blocks. The associated complex coefficients form a +* vector field of the type described in vflds/vflds.c. Such fields are thus +* in one-to-one correspondence with the deflation modes. In particular, the +* deflation subspace contains the global spinor fields from which it was +* created by the program dfl_subspace(). +* +* The program dfl_subspace() allocates the DFL_BLOCKS block grid if it is +* not already allocated. This program involves global operations and must be +* called simultaneously on all processes. +* +*******************************************************************************/ + +#define DFL_SUBSPACE_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "linalg.h" +#include "sflds.h" +#include "block.h" +#include "vflds.h" +#include "dfl.h" +#include "global.h" + + +void dfl_sd2vd(spinor_dble *sd,complex_dble *vd) +{ + int Ns,nb,nbh,isw; + int n,m,i,vol; + block_t *b; + spinor_dble **sdb; + dfl_parms_t dfl; + + dfl=dfl_parms(); + Ns=dfl.Ns; + b=blk_list(DFL_BLOCKS,&nb,&isw); + nbh=nb/2; + vol=(*b).vol; + + for (n=0;n=-1, the program returns the norm of the residue of the +* calculated approximate solution of the even-odd preconditioned, globally +* deflated little Dirac equation. No action is performed if status=-2 +* and the program returns 1.0. +* +* The even-odd preconditioned little Dirac operator is updated if it is +* not up-to-date. Evidently the solver is a global program that must be +* called on all processes simultaneously. The required workspaces are +* +* complex 2*nkv+1 +* complex_dble 3 +* +* (see utils/wspace.c). 
+* +*******************************************************************************/ + +#define LTL_GCR_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "flags.h" +#include "vflds.h" +#include "linalg.h" +#include "linsolv.h" +#include "little.h" +#include "dfl.h" +#include "global.h" + +static int Ns=0,nv,nvh; +static double rvol; +static complex **vs; +static complex_dble **vds,*awd,*cs1,*cs2; + + +static void set_constants(void) +{ + dfl_parms_t dfl; + dfl_grid_t grd; + + dfl=dfl_parms(); + grd=dfl_geometry(); + + Ns=dfl.Ns; + nv=Ns*grd.nb; + nvh=nv/2; + rvol=1.0/sqrt((double)(nv)*(double)(NPROC)); + + vs=vflds(); + vds=vdflds(); + awd=ltl_matrix(); + + cs1=amalloc(2*Ns*sizeof(*cs1),ALIGN); + error(cs1==NULL,1,"set_constants [ltl_gcr.c]", + "Unable to allocate auxiliary arrays"); + cs2=cs1+Ns; +} + + +static void sum_vprod(int n,complex_dble *z,complex_dble *w) +{ + int k; + + if (NPROC>1) + { + MPI_Reduce(z,w,2*n,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(w,2*n,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + for (k=0;k +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "flags.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "sw_term.h" +#include "block.h" +#include "dirac.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +typedef union +{ + spinor s; + weyl w[2]; +} spin_t; + +static float coe,ceo; +static const spinor s0={{{0.0}}}; +static spin_t rs ALIGNED32; + +#if (defined AVX) +#include "avx.h" + +#define _load_cst(c) \ +__asm__ __volatile__ ("vbroadcastss %0, %%ymm15 \n\t" \ + : \ + : \ + "m" (c) \ + : \ + "xmm15") + +#define _mul_cst() \ +__asm__ __volatile__ ("vmulps %%ymm15, %%ymm0, %%ymm0 \n\t" \ + "vmulps %%ymm15, %%ymm1, %%ymm1 \n\t" \ + "vmulps %%ymm15, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + + +static void doe(int *piup,int *pidn,su3 *u,spinor *pk) +{ + spinor *sp,*sm; + +/******************************** 
direction 0 *********************************/ + + sp=pk+piup[0]; + sm=pk+pidn[0]; + + _avx_spinor_pair_load34(*sp,*sm); + + sp=pk+piup[1]; + sm=pk+pidn[1]; + _prefetch_spinor(sp); + _prefetch_spinor(sm); + + _avx_spinor_mul_up(_avx_sgn_add); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[0],u[1]); + + _avx_spinor_split(); + _avx_spinor_unsplit(); + _avx_spinor_store_up(rs.s); + +/******************************** direction 1 *********************************/ + + _avx_spinor_pair_load43(*sp,*sm); + + sp=pk+piup[2]; + sm=pk+pidn[2]; + _prefetch_spinor(sp); + _prefetch_spinor(sm); + + _avx_spinor_imul_up(_avx_sgn_i_add); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[2],u[3]); + + _avx_spinor_split(); + _avx_spinor_load(rs.s); + _avx_weyl_xch_imul(_sse_sgn24); + _avx_spinor_unsplit(); + _avx_spinor_add(); + _avx_spinor_store(rs.s); + +/******************************** direction 2 *********************************/ + + _avx_spinor_pair_load43(*sp,*sm); + + sp=pk+piup[3]; + sm=pk+pidn[3]; + _prefetch_spinor(sp); + _prefetch_spinor(sm); + + _avx_spinor_mul_up(_avx_sgn_addsub); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[4],u[5]); + + _avx_spinor_split(); + _avx_spinor_load(rs.s); + _avx_weyl_xch(); + _avx_weyl_mul(_sse_sgn12); + _avx_spinor_unsplit(); + _avx_spinor_add(); + _avx_spinor_store(rs.s); + +/******************************** direction 3 *********************************/ + + _avx_spinor_pair_load34(*sp,*sm); + _avx_spinor_imul_up(_avx_sgn_i_addsub); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[6],u[7]); + + _avx_spinor_split(); + _avx_spinor_load(rs.s); + _avx_weyl_imul(_sse_sgn23); + _avx_spinor_unsplit(); + _load_cst(coe); + _avx_spinor_add(); + _mul_cst(); + _avx_weyl_pair_store12(rs.w[0],rs.w[1]); + + _avx_zeroupper(); +} + + +static void deo(int *piup,int *pidn,su3 *u,spinor *pl) +{ + spinor *sp,*sm; + + _load_cst(ceo); + _avx_spinor_load(rs.s); + _mul_cst(); + _avx_spinor_store(rs.s); + 
+/******************************** direction 0 *********************************/ + + sm=pl+pidn[0]; + sp=pl+piup[0]; + + _prefetch_spinor(sm); + _prefetch_spinor(sp); + + _avx_spinor_load_dup(rs.s); + _avx_spinor_mul_up(_avx_sgn_add); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[1],u[0]); + + _avx_weyl_pair_load12(*sm,*sp); + _avx_spinor_add(); + _avx_weyl_pair_store12(*sm,*sp); + + _avx_weyl_pair_load34(*sm,*sp); + _avx_spinor_mul_up(_avx_sgn_add); + _avx_spinor_add(); + _avx_weyl_pair_store34(*sm,*sp); + +/******************************** direction 1 *********************************/ + + sm=pl+pidn[1]; + sp=pl+piup[1]; + + _prefetch_spinor(sm); + _prefetch_spinor(sp); + + _avx_spinor_load_dup(rs.s); + _avx_spinor_xch_imul_up(_avx_sgn_i_add); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[3],u[2]); + + _avx_weyl_pair_load12(*sm,*sp); + _avx_spinor_add(); + _avx_weyl_pair_store12(*sm,*sp); + + _avx_weyl_pair_load34(*sm,*sp); + _avx_spinor_xch_imul_up(_avx_sgn_i_add); + _avx_spinor_sub(); + _avx_weyl_pair_store34(*sm,*sp); + +/******************************** direction 2 *********************************/ + + sm=pl+pidn[2]; + sp=pl+piup[2]; + + _prefetch_spinor(sm); + _prefetch_spinor(sp); + + _avx_spinor_load_dup(rs.s); + _avx_spinor_xch_up(); + _avx_spinor_mul_up(_avx_sgn_addsub); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[5],u[4]); + + _avx_weyl_pair_load12(*sm,*sp); + _avx_spinor_add(); + _avx_weyl_pair_store12(*sm,*sp); + + _avx_weyl_pair_load34(*sm,*sp); + _avx_spinor_xch_up(); + _avx_spinor_mul_up(_avx_sgn_addsub); + _avx_spinor_sub(); + _avx_weyl_pair_store34(*sm,*sp); + +/******************************** direction 3 *********************************/ + + sm=pl+pidn[3]; + sp=pl+piup[3]; + + _prefetch_spinor(sm); + _prefetch_spinor(sp); + + _avx_spinor_load_dup(rs.s); + _avx_spinor_imul_up(_avx_sgn_i_addsub); + _avx_spinor_add(); + + _avx_su3_pair_mixed_multiply(u[7],u[6]); + + _avx_weyl_pair_load12(*sm,*sp); + 
_avx_spinor_add(); + _avx_weyl_pair_store12(*sm,*sp); + + _avx_weyl_pair_load34(*sm,*sp); + _avx_spinor_imul_up(_avx_sgn_i_addsub); + _avx_spinor_sub(); + _avx_weyl_pair_store34(*sm,*sp); + + _avx_zeroupper(); +} + +#elif (defined x64) +#include "sse2.h" + +#define _load_cst(c) \ +__asm__ __volatile__ ("movss %0, %%xmm15 \n\t" \ + "shufps $0x0, %%xmm15, %%xmm15" \ + : \ + : \ + "m" (c) \ + : \ + "xmm15") + +#define _mul_cst() \ +__asm__ __volatile__ ("mulps %%xmm15, %%xmm0 \n\t" \ + "mulps %%xmm15, %%xmm1 \n\t" \ + "mulps %%xmm15, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +#define _mul_cst_up() \ +__asm__ __volatile__ ("mulps %%xmm15, %%xmm3 \n\t" \ + "mulps %%xmm15, %%xmm4 \n\t" \ + "mulps %%xmm15, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + + +static void doe(int *piup,int *pidn,su3 *u,spinor *pk) +{ + spinor *sp,*sm; + +/******************************* direction +0 *********************************/ + + sp=pk+(*(piup++)); + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_pair_load_up((*sp).c3,(*sp).c4); + + sm=pk+(*(pidn++)); + _prefetch_spinor(sm); + + _sse_vector_add(); + sp=pk+(*(piup++)); + _prefetch_spinor(sp); + _sse_su3_multiply(*u); + + _sse_weyl_store_up(rs.w[0]); + _sse_weyl_store_up(rs.w[1]); + +/******************************* direction -0 *********************************/ + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_pair_load_up((*sm).c3,(*sm).c4); + + u+=2; + _prefetch_su3_dble(u); + u-=1; + _sse_vector_sub(); + sm=pk+(*(pidn++)); + _prefetch_spinor(sm); + _sse_su3_inverse_multiply(*u); + + _sse_weyl_load(rs.w[0]); + _sse_vector_add(); + _sse_weyl_store(rs.w[0]); + + _sse_weyl_load(rs.w[1]); + _sse_vector_sub(); + _sse_weyl_store(rs.w[1]); + +/******************************* direction +1 *********************************/ + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_pair_load_up((*sp).c4,(*sp).c3); + + _sse_vector_i_add(); + sp=pk+(*(piup++)); + _prefetch_spinor(sp); + u+=1; + _sse_su3_multiply(*u); + + 
_sse_weyl_load(rs.w[0]); + _sse_vector_add(); + _sse_weyl_store(rs.w[0]); + + _sse_weyl_load(rs.w[1]); + _sse_vector_xch_i_sub(); + _sse_weyl_store(rs.w[1]); + +/******************************* direction -1 *********************************/ + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_pair_load_up((*sm).c4,(*sm).c3); + + u+=2; + _prefetch_su3_dble(u); + u-=1; + _sse_vector_i_sub(); + sm=pk+(*(pidn++)); + _prefetch_spinor(sm); + _sse_su3_inverse_multiply(*u); + + _sse_weyl_load(rs.w[0]); + _sse_vector_add(); + _sse_weyl_store(rs.w[0]); + + _sse_weyl_load(rs.w[1]); + _sse_vector_xch_i_add(); + _sse_weyl_store(rs.w[1]); + +/******************************* direction +2 *********************************/ + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_pair_load_up((*sp).c4,(*sp).c3); + + _sse_vector_addsub(); + + u+=1; + _sse_su3_multiply(*u); + sp=pk+(*(piup)); + _prefetch_spinor(sp); + _sse_weyl_load(rs.w[0]); + _sse_vector_add(); + _sse_weyl_store(rs.w[0]); + + _sse_weyl_load(rs.w[1]); + _sse_vector_xch(); + _sse_vector_subadd(); + _sse_weyl_store(rs.w[1]); + +/******************************* direction -2 *********************************/ + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_pair_load_up((*sm).c4,(*sm).c3); + + u+=2; + _prefetch_su3_dble(u); + u-=1; + _sse_vector_subadd(); + sm=pk+(*(pidn)); + _prefetch_spinor(sm); + _sse_su3_inverse_multiply(*u); + + _sse_weyl_load(rs.w[0]); + _sse_vector_add(); + _sse_weyl_store(rs.w[0]); + + _sse_weyl_load(rs.w[1]); + _sse_vector_xch(); + _sse_vector_addsub(); + _sse_weyl_store(rs.w[1]); + +/******************************* direction +3 *********************************/ + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_pair_load_up((*sp).c3,(*sp).c4); + + _sse_vector_i_addsub(); + u+=1; + _sse_su3_multiply(*u); + + _sse_weyl_load(rs.w[0]); + _sse_vector_add(); + _sse_weyl_store(rs.w[0]); + + _sse_weyl_load(rs.w[1]); + _sse_vector_i_subadd(); + _sse_weyl_store(rs.w[1]); + +/******************************* direction -3 
*********************************/ + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_pair_load_up((*sm).c3,(*sm).c4); + + u+=2; + _prefetch_su3_dble(u); + u-=1; + _sse_vector_i_subadd(); + _sse_su3_inverse_multiply(*u); + + _load_cst(coe); + _sse_weyl_load(rs.w[0]); + _sse_vector_add(); + _mul_cst(); + _sse_pair_store(rs.s.c1,rs.s.c2); + + _sse_weyl_load(rs.w[1]); + _sse_vector_i_addsub(); + _mul_cst(); + _sse_pair_store(rs.s.c3,rs.s.c4); +} + + +static void deo(int *piup,int *pidn,su3 *u,spinor *pl) +{ + spinor *sp,*sm; + +/******************************* direction +0 *********************************/ + + sp=pl+(*(piup++)); + _prefetch_spinor(sp); + + _load_cst(ceo); + _sse_pair_load(rs.s.c1,rs.s.c2); + _sse_pair_load_up(rs.s.c3,rs.s.c4); + _mul_cst(); + _mul_cst_up(); + _sse_weyl_store(rs.w[0]); + _sse_weyl_store_up(rs.w[1]); + + sm=pl+(*(pidn++)); + _prefetch_spinor(sm); + _sse_vector_sub(); + _sse_su3_inverse_multiply(*u); + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_vector_add(); + _sse_pair_store((*sp).c1,(*sp).c2); + + _sse_pair_load((*sp).c3,(*sp).c4); + _sse_vector_sub(); + _sse_pair_store((*sp).c3,(*sp).c4); + +/******************************* direction -0 *********************************/ + + _sse_weyl_load(rs.w[0]); + _sse_weyl_load_up(rs.w[1]); + + sp=pl+(*(piup++)); + _prefetch_spinor(sp); + _sse_vector_add(); + u+=1; + _sse_su3_multiply(*u); + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_vector_add(); + _sse_pair_store((*sm).c1,(*sm).c2); + + _sse_pair_load((*sm).c3,(*sm).c4); + _sse_vector_add(); + _sse_pair_store((*sm).c3,(*sm).c4); + +/******************************* direction +1 *********************************/ + + _sse_weyl_load(rs.w[0]); + _sse_weyl_load_up(rs.w[1]); + + sm=pl+(*(pidn++)); + _prefetch_spinor(sm); + _sse_vector_xch_i_sub(); + u+=1; + _sse_su3_inverse_multiply(*u); + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_vector_add(); + _sse_pair_store((*sp).c1,(*sp).c2); + + _sse_pair_load((*sp).c3,(*sp).c4); + _sse_vector_xch_i_add(); + 
_sse_pair_store((*sp).c3,(*sp).c4); + +/******************************* direction -1 *********************************/ + + _sse_weyl_load(rs.w[0]); + _sse_weyl_load_up(rs.w[1]); + + sp=pl+(*(piup++)); + _prefetch_spinor(sp); + _sse_vector_xch_i_add(); + u+=1; + _sse_su3_multiply(*u); + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_vector_add(); + _sse_pair_store((*sm).c1,(*sm).c2); + + _sse_pair_load((*sm).c3,(*sm).c4); + _sse_vector_xch_i_sub(); + _sse_pair_store((*sm).c3,(*sm).c4); + +/******************************* direction +2 *********************************/ + + _sse_weyl_load(rs.w[0]); + _sse_weyl_load_up(rs.w[1]); + + sm=pl+(*(pidn++)); + _prefetch_spinor(sm); + _sse_vector_xch(); + _sse_vector_subadd(); + u+=1; + _sse_su3_inverse_multiply(*u); + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_vector_add(); + _sse_pair_store((*sp).c1,(*sp).c2); + + _sse_pair_load((*sp).c3,(*sp).c4); + _sse_vector_xch(); + _sse_vector_addsub(); + _sse_pair_store((*sp).c3,(*sp).c4); + +/******************************* direction -2 *********************************/ + + _sse_weyl_load(rs.w[0]); + _sse_weyl_load_up(rs.w[1]); + + sp=pl+(*(piup)); + _prefetch_spinor(sp); + _sse_vector_xch(); + _sse_vector_addsub(); + u+=1; + _sse_su3_multiply(*u); + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_vector_add(); + _sse_pair_store((*sm).c1,(*sm).c2); + + _sse_pair_load((*sm).c3,(*sm).c4); + _sse_vector_xch(); + _sse_vector_subadd(); + _sse_pair_store((*sm).c3,(*sm).c4); + +/******************************* direction +3 *********************************/ + + _sse_weyl_load(rs.w[0]); + _sse_weyl_load_up(rs.w[1]); + + sm=pl+(*(pidn)); + _prefetch_spinor(sm); + _sse_vector_i_subadd(); + u+=1; + _sse_su3_inverse_multiply(*u); + + _sse_pair_load((*sp).c1,(*sp).c2); + _sse_vector_add(); + _sse_pair_store((*sp).c1,(*sp).c2); + + _sse_pair_load((*sp).c3,(*sp).c4); + _sse_vector_i_addsub(); + _sse_pair_store((*sp).c3,(*sp).c4); + +/******************************* direction -3 
*********************************/ + + _sse_weyl_load(rs.w[0]); + _sse_weyl_load_up(rs.w[1]); + + _sse_vector_i_addsub(); + u+=1; + _sse_su3_multiply(*u); + + _sse_pair_load((*sm).c1,(*sm).c2); + _sse_vector_add(); + _sse_pair_store((*sm).c1,(*sm).c2); + + _sse_pair_load((*sm).c3,(*sm).c4); + _sse_vector_i_subadd(); + _sse_pair_store((*sm).c3,(*sm).c4); +} + +#else + +#define _vector_mul_assign(r,c) \ + (r).c1.re*=(c); \ + (r).c1.im*=(c); \ + (r).c2.re*=(c); \ + (r).c2.im*=(c); \ + (r).c3.re*=(c); \ + (r).c3.im*=(c) + + +static void doe(int *piup,int *pidn,su3 *u,spinor *pk) +{ + spinor *sp,*sm; + su3_vector psi,chi; + +/******************************* direction +0 *********************************/ + + sp=pk+(*(piup++)); + + _vector_add(psi,(*sp).c1,(*sp).c3); + _su3_multiply(rs.s.c1,*u,psi); + rs.s.c3=rs.s.c1; + + _vector_add(psi,(*sp).c2,(*sp).c4); + _su3_multiply(rs.s.c2,*u,psi); + rs.s.c4=rs.s.c2; + +/******************************* direction -0 *********************************/ + + sm=pk+(*(pidn++)); + u+=1; + + _vector_sub(psi,(*sm).c1,(*sm).c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_sub_assign(rs.s.c3,chi); + + _vector_sub(psi,(*sm).c2,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_sub_assign(rs.s.c4,chi); + +/******************************* direction +1 *********************************/ + + sp=pk+(*(piup++)); + u+=1; + + _vector_i_add(psi,(*sp).c1,(*sp).c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_sub_assign(rs.s.c4,chi); + + _vector_i_add(psi,(*sp).c2,(*sp).c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_sub_assign(rs.s.c3,chi); + +/******************************* direction -1 *********************************/ + + sm=pk+(*(pidn++)); + u+=1; + + _vector_i_sub(psi,(*sm).c1,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_add_assign(rs.s.c4,chi); + + 
_vector_i_sub(psi,(*sm).c2,(*sm).c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_add_assign(rs.s.c3,chi); + +/******************************* direction +2 *********************************/ + + sp=pk+(*(piup++)); + u+=1; + + _vector_add(psi,(*sp).c1,(*sp).c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_add_assign(rs.s.c4,chi); + + _vector_sub(psi,(*sp).c2,(*sp).c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_sub_assign(rs.s.c3,chi); + +/******************************* direction -2 *********************************/ + + sm=pk+(*(pidn++)); + u+=1; + + _vector_sub(psi,(*sm).c1,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_sub_assign(rs.s.c4,chi); + + _vector_add(psi,(*sm).c2,(*sm).c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_add_assign(rs.s.c3,chi); + +/******************************* direction +3 *********************************/ + + sp=pk+(*(piup)); + u+=1; + + _vector_i_add(psi,(*sp).c1,(*sp).c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_sub_assign(rs.s.c3,chi); + + _vector_i_sub(psi,(*sp).c2,(*sp).c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_add_assign(rs.s.c4,chi); + +/******************************* direction -3 *********************************/ + + sm=pk+(*(pidn)); + u+=1; + + _vector_i_sub(psi,(*sm).c1,(*sm).c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_add_assign(rs.s.c3,chi); + + _vector_i_add(psi,(*sm).c2,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_sub_assign(rs.s.c4,chi); + + _vector_mul_assign(rs.s.c1,coe); + _vector_mul_assign(rs.s.c2,coe); + _vector_mul_assign(rs.s.c3,coe); + _vector_mul_assign(rs.s.c4,coe); +} + + +static void deo(int *piup,int *pidn,su3 *u,spinor *pl) +{ + spinor *sp,*sm; + 
su3_vector psi,chi; + + _vector_mul_assign(rs.s.c1,ceo); + _vector_mul_assign(rs.s.c2,ceo); + _vector_mul_assign(rs.s.c3,ceo); + _vector_mul_assign(rs.s.c4,ceo); + +/******************************* direction +0 *********************************/ + + sp=pl+(*(piup++)); + + _vector_sub(psi,rs.s.c1,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); + _vector_sub_assign((*sp).c3,chi); + + _vector_sub(psi,rs.s.c2,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_sub_assign((*sp).c4,chi); + +/******************************* direction -0 *********************************/ + + sm=pl+(*(pidn++)); + u+=1; + + _vector_add(psi,rs.s.c1,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_add_assign((*sm).c3,chi); + + _vector_add(psi,rs.s.c2,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_add_assign((*sm).c4,chi); + +/******************************* direction +1 *********************************/ + + sp=pl+(*(piup++)); + u+=1; + + _vector_i_sub(psi,rs.s.c1,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); + _vector_i_add_assign((*sp).c4,chi); + + _vector_i_sub(psi,rs.s.c2,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_i_add_assign((*sp).c3,chi); + +/******************************* direction -1 *********************************/ + + sm=pl+(*(pidn++)); + u+=1; + + _vector_i_add(psi,rs.s.c1,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_i_sub_assign((*sm).c4,chi); + + _vector_i_add(psi,rs.s.c2,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_i_sub_assign((*sm).c3,chi); + +/******************************* direction +2 *********************************/ + + sp=pl+(*(piup++)); + u+=1; + + _vector_sub(psi,rs.s.c1,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); 
+ _vector_sub_assign((*sp).c4,chi); + + _vector_add(psi,rs.s.c2,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_add_assign((*sp).c3,chi); + +/******************************* direction -2 *********************************/ + + sm=pl+(*(pidn++)); + u+=1; + + _vector_add(psi,rs.s.c1,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_add_assign((*sm).c4,chi); + + _vector_sub(psi,rs.s.c2,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_sub_assign((*sm).c3,chi); + +/******************************* direction +3 *********************************/ + + sp=pl+(*(piup)); + u+=1; + + _vector_i_sub(psi,rs.s.c1,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); + _vector_i_add_assign((*sp).c3,chi); + + _vector_i_add(psi,rs.s.c2,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_i_sub_assign((*sp).c4,chi); + +/******************************* direction -3 *********************************/ + + sm=pl+(*(pidn)); + u+=1; + + _vector_i_add(psi,rs.s.c1,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_i_sub_assign((*sm).c3,chi); + + _vector_i_sub(psi,rs.s.c2,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_i_add_assign((*sm).c4,chi); +} + +#endif + +void Dw(float mu,spinor *s,spinor *r) +{ + int bc,ix,t; + int *piup,*pidn; + su3 *u,*um; + pauli *m; + spin_t *so,*ro; + tm_parms_t tm; + + cps_int_bnd(0x1,s); + m=swfld(); + apply_sw(VOLUME/2,mu,m,s,r); + set_s2zero(BNDRY/2,r+VOLUME); + tm=tm_parms(); + if (tm.eoflg==1) + mu=0.0f; + + coe=-0.5f; + ceo=-0.5f; + bc=bc_type(); + piup=iup[VOLUME/2]; + pidn=idn[VOLUME/2]; + + so=(spin_t*)(s+(VOLUME/2)); + ro=(spin_t*)(r+(VOLUME/2)); + m+=VOLUME; + u=ufld(); + um=u+4*VOLUME; + + if (((cpr[0]==0)&&(bc!=3))||((cpr[0]==(NPROC0-1))&&(bc==0))) + { + ix=VOLUME/2; + + for 
(;u0)&&((t<(N0-1))||(bc!=0))) + { + doe(piup,pidn,u,s); + + mul_pauli2(mu,m,&((*so).s),&((*ro).s)); + + _vector_add_assign((*ro).s.c1,rs.s.c1); + _vector_add_assign((*ro).s.c2,rs.s.c2); + _vector_add_assign((*ro).s.c3,rs.s.c3); + _vector_add_assign((*ro).s.c4,rs.s.c4); + rs=(*so); + + deo(piup,pidn,u,r); + } + else + { + (*so).s=s0; + (*ro).s=s0; + } + + piup+=4; + pidn+=4; + so+=1; + ro+=1; + m+=2; + } + } + else + { + for (;u0)&&((t<(N0-1))||(bc!=0))) + mul_pauli2(mu,m,&((*se).s),&((*re).s)); + else + { + (*se).s=s0; + (*re).s=s0; + } + + se+=1; + re+=1; + } + } + else + { + for (;m0)&&((t<(N0-1))||(bc!=0))) + mul_pauli2(mu,m,&((*so).s),&((*ro).s)); + else + { + (*so).s=s0; + (*ro).s=s0; + } + + so+=1; + ro+=1; + } + } + else + { + for (;m0)&&((t<(N0-1))||(bc!=0))) + { + doe(piup,pidn,u,s); + (*ro)=rs; + } + else + (*ro).s=s0; + + piup+=4; + pidn+=4; + ro+=1; + } + } + else + { + for (;u0)&&((t<(N0-1))||(bc!=0))) + { + rs=(*so); + deo(piup,pidn,u,r); + } + else + (*so).s=s0; + + piup+=4; + pidn+=4; + so+=1; + } + } + else + { + for (;u0)&&((t<(N0-1))||(bc!=0))) + { + doe(piup,pidn,u,s); + + mul_pauli2(0.0f,m,&(rs.s),&(rs.s)); + + deo(piup,pidn,u,r); + } + + piup+=4; + pidn+=4; + m+=2; + } + } + else + { + for (;u=nb)) + { + error_loc(1,1,"Dw_blk [Dw.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k==l)||(k>=(*b).ns)||(l>=(*b).ns)||((*b).u==NULL)) + { + error_loc(1,1,"Dw_blk [Dw.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + s=(*b).s[k]; + r=(*b).s[l]; + so=(spin_t*)(s+volh); + ro=(spin_t*)(r+volh); + + s[vol]=s0; + r[vol]=s0; + m=(*b).sw; + apply_sw(volh,mu,m,s,r); + tm=tm_parms(); + if (tm.eoflg==1) + mu=0.0f; + + coe=-0.5f; + ceo=-0.5f; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + m+=vol; + u=(*b).u; + um=u+4*vol; + + if ((*b).nbp) + { + ibp=(*b).ibp; + ibm=ibp+(*b).nbp/2; + + for (;ibp=nb)) + { + error_loc(1,1,"Dwee_blk [Dw.c]", + "Block 
grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).ns)||(l>=(*b).ns)||((*b).u==NULL)) + { + error_loc(1,1,"Dwee_blk [Dw.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + se=(spin_t*)((*b).s[k]); + re=(spin_t*)((*b).s[l]); + m=(*b).sw; + mm=m+vol; + + if ((*b).nbp) + { + piup=(*b).iup[0]; + pidn=(*b).idn[0]; + + ibu=((cpr[0]==(NPROC0-1))&&(((*b).bo[0]+(*b).bs[0])==L0)&&(bc_type()==0)); + ibd=((cpr[0]==0)&&((*b).bo[0]==0)); + + for (;m=nb)) + { + error_loc(1,1,"Dwoo_blk [Dw.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).ns)||(l>=(*b).ns)||((*b).u==NULL)) + { + error_loc(1,1,"Dwoo_blk [Dw.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + so=(spin_t*)((*b).s[k]+volh); + ro=(spin_t*)((*b).s[l]+volh); + tm=tm_parms(); + if (tm.eoflg==1) + mu=0.0f; + + m=(*b).sw+vol; + mm=m+vol; + + if ((*b).nbp) + { + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + + ibu=((cpr[0]==(NPROC0-1))&&(((*b).bo[0]+(*b).bs[0])==L0)&&(bc_type()==0)); + ibd=((cpr[0]==0)&&((*b).bo[0]==0)); + + for (;m=nb)) + { + error_loc(1,1,"Dwoe_blk [Dw.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).ns)||(l>=(*b).ns)||((*b).u==NULL)) + { + error_loc(1,1,"Dwoe_blk [Dw.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + s=(*b).s[k]; + ro=(spin_t*)((*b).s[l]+volh); + s[vol]=s0; + + coe=-0.5f; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + u=(*b).u; + um=u+4*vol; + + if ((*b).nbp) + { + ibp=(*b).ibp; + ibm=ibp+(*b).nbp/2; + + for (;ibp=nb)) + { + error_loc(1,1,"Dweo_blk [Dw.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).ns)||(l>=(*b).ns)||((*b).u==NULL)) + { + error_loc(1,1,"Dweo_blk [Dw.c]", + "Attempt to access 
unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + so=(spin_t*)((*b).s[k]+volh); + r=(*b).s[l]; + r[vol]=s0; + + ceo=0.5f; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + u=(*b).u; + um=u+4*vol; + + if ((*b).nbp) + { + ibu=((cpr[0]==(NPROC0-1))&&(((*b).bo[0]+(*b).bs[0])==L0)&&(bc_type()==0)); + ibd=((cpr[0]==0)&&((*b).bo[0]==0)); + + for (;u=nb)) + { + error_loc(1,1,"Dwhat_blk [Dw.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k==l)||(k>=(*b).ns)||(l>=(*b).ns)||((*b).u==NULL)) + { /* bad field indices k,l (including k==l) or (*b).u not set */ + error_loc(1,1,"Dwhat_blk [Dw.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + s=(*b).s[k]; + r=(*b).s[l]; + + s[vol]=s0; + r[vol]=s0; + m=(*b).sw; + apply_sw(volh,mu,m,s,r); + + coe=-0.5f; + ceo=0.5f; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + m+=vol; + u=(*b).u; + um=u+4*vol; + + if ((*b).nbp) + { + ibp=(*b).ibp; + ibm=ibp+(*b).nbp/2; + + for (;ibp +#include +#include +#include +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "block.h" +#include "dirac.h" +#include "global.h" + +#if (defined AVX) +#include "avx.h" + +#define _load_cst(c) \ +__asm__ __volatile__ ("vbroadcastss %0, %%ymm15 \n\t" \ + : \ + : \ + "m" (c) \ + : \ + "xmm15") + +#define _mul_cst() \ +__asm__ __volatile__ ("vmulps %%ymm15, %%ymm0, %%ymm0 \n\t" \ + "vmulps %%ymm15, %%ymm1, %%ymm1 \n\t" \ + "vmulps %%ymm15, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +#define _load_zero() \ +__asm__ __volatile__ ("vxorps %%ymm0, %%ymm0, %%ymm0 \n\t" \ + "vxorps %%ymm1, %%ymm1, %%ymm1 \n\t" \ + "vxorps %%ymm2, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +#define _set_s2zero(s) \ +__asm__ __volatile__ ("vmovaps %%ymm0, %0" \ + : \ + "=m" ((*s).c1.c1), \ + "=m" ((*s).c1.c2), \ + "=m" ((*s).c1.c3), \ + "=m" ((*s).c2.c1)); \ +__asm__ __volatile__ ("vmovaps %%ymm1, %0" \ + : \ + "=m" ((*s).c2.c2), \ + "=m" ((*s).c2.c3), \ +
"=m" ((*s).c3.c1), \ + "=m" ((*s).c3.c2)); \ +__asm__ __volatile__ ("vmovaps %%ymm2, %0" \ + : \ + "=m" ((*s).c3.c3), \ + "=m" ((*s).c4.c1), \ + "=m" ((*s).c4.c2), \ + "=m" ((*s).c4.c3)) + +#define _set_w2zero(w) \ +__asm__ __volatile__ ("vmovaps %%ymm0, %0" \ + : \ + "=m" ((w[0]).c1.c1), \ + "=m" ((w[0]).c1.c2), \ + "=m" ((w[0]).c1.c3), \ + "=m" ((w[0]).c2.c1)); \ +__asm__ __volatile__ ("vmovaps %%ymm1, %0" \ + : \ + "=m" ((w[0]).c2.c2), \ + "=m" ((w[0]).c2.c3), \ + "=m" ((w[1]).c1.c1), \ + "=m" ((w[1]).c1.c2)); \ +__asm__ __volatile__ ("vmovaps %%ymm2, %0" \ + : \ + "=m" ((w[1]).c1.c3), \ + "=m" ((w[1]).c2.c1), \ + "=m" ((w[1]).c2.c2), \ + "=m" ((w[1]).c2.c3)) + +#define _weyl_pair_store(rl,rh) \ +__asm__ __volatile__ ("vshufps $0x44, %%ymm4, %%ymm3, %%ymm6 \n\t" \ + "vshufps $0xe4, %%ymm3, %%ymm5, %%ymm7 \n\t" \ + "vshufps $0xee, %%ymm5, %%ymm4, %%ymm8" \ + : \ + : \ + : \ + "xmm6", "xmm7", "xmm8"); \ +__asm__ __volatile__ ("vmovaps %%xmm6, %0 \n\t" \ + "vmovaps %%xmm7, %2 \n\t" \ + "vmovaps %%xmm8, %4" \ + : \ + "=m" ((rl).c1.c1), \ + "=m" ((rl).c1.c2), \ + "=m" ((rl).c1.c3), \ + "=m" ((rl).c2.c1), \ + "=m" ((rl).c2.c2), \ + "=m" ((rl).c2.c3)); \ +__asm__ __volatile__ ("vextractf128 $0x1, %%ymm6, %0 \n\t" \ + "vextractf128 $0x1, %%ymm7, %2 \n\t" \ + "vextractf128 $0x1, %%ymm8, %4" \ + : \ + "=m" ((rh).c1.c1), \ + "=m" ((rh).c1.c2), \ + "=m" ((rh).c1.c3), \ + "=m" ((rh).c2.c1), \ + "=m" ((rh).c2.c2), \ + "=m" ((rh).c2.c3)) + + +static void mul_umat(su3 *u) +{ + _avx_su3_pair_multiply(u[0],u[1]); +} + + +static void mul_uinv(su3 *u) +{ + _avx_su3_pair_inverse_multiply(u[0],u[1]); +} + + +void Dw_bnd(blk_grid_t grid,int n,int k,int l) +{ + int bc,nb,isw,*ipp; + float moh; + su3 *u; + weyl *w,*wm; + spinor *s,*sl,*sh; + block_t *b; + bndry_t *bb; + + b=blk_list(grid,&nb,&isw); + + if ((n<0)||(n>=nb)) + { + error_loc(1,1,"Dw_bnd [Dw_bnd.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + b+=n; + bb=(*b).bb; + + if 
((k<0)||(l<0)||(k>=(*b).ns)||((*b).u==NULL)||(bb==NULL)||(l>=(*bb).nw)) + { /* k or l out of range, or (*b).u / (*b).bb not set */ + error_loc(1,1,"Dw_bnd [Dw_bnd.c]", + "Attempt to access unallocated memory space"); + return; + } + + bc=bc_type(); + moh=-0.5f; + _load_cst(moh); + s=(*b).s[k]; + +/********************************** face -0 ***********************************/ + + ipp=(*bb).ipp; + w=(*bb).w[l]; + wm=w+(*bb).vol; + + if ((cpr[0]==0)&&((*b).bo[0]==0)&&(bc!=3)) + { + _load_zero(); + + for (;w=nb)) + { + error_loc(1,1,"Dw_bnd [Dw_bnd.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + b+=n; + bb=(*b).bb; + + if ((k<0)||(l<0)||(k>=(*b).ns)||((*b).u==NULL)||(bb==NULL)||(l>=(*bb).nw)) + { /* k or l out of range, or (*b).u / (*b).bb not set */ + error_loc(1,1,"Dw_bnd [Dw_bnd.c]", + "Attempt to access unallocated memory space"); + return; + } + + bc=bc_type(); + moh=-0.5f; + _load_cst(moh); + s=(*b).s[k]; + +/********************************** face -0 ***********************************/ + + ipp=(*bb).ipp; + w=(*bb).w[l]; + wm=w+(*bb).vol; + + if ((cpr[0]==0)&&((*b).bo[0]==0)&&(bc!=3)) + { + _load_zero(); + + for (;w=nb)) + { + error_loc(1,1,"Dw_bnd [Dw_bnd.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + b+=n; + bb=(*b).bb; + + if ((k<0)||(l<0)||(k>=(*b).ns)||((*b).u==NULL)||(bb==NULL)||(l>=(*bb).nw)) + { /* k or l out of range, or (*b).u / (*b).bb not set */ + error_loc(1,1,"Dw_bnd [Dw_bnd.c]", + "Attempt to access unallocated memory space"); + return; + } + + bc=bc_type(); + moh=-0.5f; + s=(*b).s[k]; + +/********************************** face -0 ***********************************/ + + ipp=(*bb).ipp; + w=(*bb).w[l]; + wm=w+(*bb).vol; + + if ((cpr[0]==0)&&((*b).bo[0]==0)&&(bc!=3)) + { + for (;w +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "flags.h" +#include "lattice.h" +#include "uflds.h" +#include "sflds.h" +#include "sw_term.h" +#include "dirac.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +typedef union +{ + spinor_dble s; + weyl_dble w[2]; +} spin_t; + +static double coe,ceo; +static const spinor_dble sd0={{{0.0}}};
+static spin_t rs ALIGNED32; + +#if (defined AVX) +#include "avx.h" + +#define _load_cst(c) \ +__asm__ __volatile__ ("vbroadcastsd %0, %%ymm15 \n\t" \ + : \ + : \ + "m" (c) \ + : \ + "xmm15") + +#define _mul_cst() \ +__asm__ __volatile__ ("vmulpd %%ymm15, %%ymm0, %%ymm0 \n\t" \ + "vmulpd %%ymm15, %%ymm1, %%ymm1 \n\t" \ + "vmulpd %%ymm15, %%ymm2, %%ymm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +#define _mul_cst_up() \ +__asm__ __volatile__ ("vmulpd %%ymm15, %%ymm3, %%ymm3 \n\t" \ + "vmulpd %%ymm15, %%ymm4, %%ymm4 \n\t" \ + "vmulpd %%ymm15, %%ymm5, %%ymm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + + +static void doe(int *piup,int *pidn,su3_dble *u,spinor_dble *pk) +{ + spinor_dble *sp,*sm; + +/******************************* direction +0 *********************************/ + + sp=pk+(*(piup++)); + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_pair_load_up_dble((*sp).c3,(*sp).c4); + + sm=pk+(*(pidn++)); + _prefetch_spinor_dble(sm); + + _avx_vector_add_dble(); + sp=pk+(*(piup++)); + _prefetch_spinor_dble(sp); + _avx_su3_multiply_pair_dble(*u); + + _avx_weyl_store_up_dble(rs.w[0]); + _avx_weyl_store_up_dble(rs.w[1]); + +/******************************* direction -0 *********************************/ + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_pair_load_up_dble((*sm).c3,(*sm).c4); + + _avx_vector_sub_dble(); + sm=pk+(*(pidn++)); + _prefetch_spinor_dble(sm); + u+=1; + _avx_su3_inverse_multiply_pair_dble(*u); + + _avx_weyl_load_dble(rs.w[0]); + _avx_vector_add_dble(); + _avx_weyl_store_dble(rs.w[0]); + + _avx_weyl_load_dble(rs.w[1]); + _avx_vector_sub_dble(); + _avx_weyl_store_dble(rs.w[1]); + +/******************************* direction +1 *********************************/ + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_pair_load_up_dble((*sp).c4,(*sp).c3); + + _avx_vector_i_add_dble(); + sp=pk+(*(piup++)); + _prefetch_spinor_dble(sp); + u+=1; + _avx_su3_multiply_pair_dble(*u); + + _avx_weyl_load_dble(rs.w[0]); + _avx_vector_add_dble(); + 
_avx_weyl_store_dble(rs.w[0]); + + _avx_weyl_load_dble(rs.w[1]); + _avx_vector_xch_i_sub_dble(); + _avx_weyl_store_dble(rs.w[1]); + +/******************************* direction -1 *********************************/ + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_pair_load_up_dble((*sm).c4,(*sm).c3); + + _avx_vector_i_sub_dble(); + sm=pk+(*(pidn++)); + _prefetch_spinor_dble(sm); + u+=1; + _avx_su3_inverse_multiply_pair_dble(*u); + + _avx_weyl_load_dble(rs.w[0]); + _avx_vector_add_dble(); + _avx_weyl_store_dble(rs.w[0]); + + _avx_weyl_load_dble(rs.w[1]); + _avx_vector_xch_i_add_dble(); + _avx_weyl_store_dble(rs.w[1]); + +/******************************* direction +2 *********************************/ + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_pair_load_up_dble((*sp).c4,(*sp).c3); + + _avx_vector_addsub_dble(); + u+=1; + _avx_su3_multiply_pair_dble(*u); + sp=pk+(*(piup)); + _prefetch_spinor_dble(sp); + _avx_weyl_load_dble(rs.w[0]); + _avx_vector_add_dble(); + _avx_weyl_store_dble(rs.w[0]); + + _avx_weyl_load_dble(rs.w[1]); + _avx_vector_xch_dble(); + _avx_vector_subadd_dble(); + _avx_weyl_store_dble(rs.w[1]); + +/******************************* direction -2 *********************************/ + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_pair_load_up_dble((*sm).c4,(*sm).c3); + + _avx_vector_subadd_dble(); + sm=pk+(*(pidn)); + _prefetch_spinor_dble(sm); + u+=1; + _avx_su3_inverse_multiply_pair_dble(*u); + + _avx_weyl_load_dble(rs.w[0]); + _avx_vector_add_dble(); + _avx_weyl_store_dble(rs.w[0]); + + _avx_weyl_load_dble(rs.w[1]); + _avx_vector_xch_dble(); + _avx_vector_addsub_dble(); + _avx_weyl_store_dble(rs.w[1]); + +/******************************* direction +3 *********************************/ + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_pair_load_up_dble((*sp).c3,(*sp).c4); + + _avx_vector_i_addsub_dble(); + u+=1; + _avx_su3_multiply_pair_dble(*u); + + _avx_weyl_load_dble(rs.w[0]); + _avx_vector_add_dble(); + _avx_weyl_store_dble(rs.w[0]); + + 
_avx_weyl_load_dble(rs.w[1]); + _avx_vector_i_subadd_dble(); + _avx_weyl_store_dble(rs.w[1]); + +/******************************* direction -3 *********************************/ + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_pair_load_up_dble((*sm).c3,(*sm).c4); + + _avx_vector_i_subadd_dble(); + u+=1; + _avx_su3_inverse_multiply_pair_dble(*u); + + _load_cst(coe); + _avx_weyl_load_dble(rs.w[0]); + _avx_vector_add_dble(); + _mul_cst(); + _avx_pair_store_dble(rs.s.c1,rs.s.c2); + + _avx_weyl_load_dble(rs.w[1]); + _avx_vector_i_addsub_dble(); + _mul_cst(); + _avx_pair_store_dble(rs.s.c3,rs.s.c4); + + _avx_zeroupper(); +} + + +static void deo(int *piup,int *pidn,su3_dble *u,spinor_dble *pl) +{ + spinor_dble *sp,*sm; + +/******************************* direction +0 *********************************/ + + sp=pl+(*(piup++)); + _prefetch_spinor_dble(sp); + + _load_cst(ceo); + _avx_pair_load_dble(rs.s.c1,rs.s.c2); + _avx_pair_load_up_dble(rs.s.c3,rs.s.c4); + _mul_cst(); + _mul_cst_up(); + _avx_weyl_store_dble(rs.w[0]); + _avx_weyl_store_up_dble(rs.w[1]); + + sm=pl+(*(pidn++)); + _prefetch_spinor_dble(sm); + _avx_vector_sub_dble(); + _avx_su3_inverse_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sp).c1,(*sp).c2); + + _avx_pair_load_dble((*sp).c3,(*sp).c4); + _avx_vector_sub_dble(); + _avx_pair_store_dble((*sp).c3,(*sp).c4); + +/******************************* direction -0 *********************************/ + + _avx_weyl_load_dble(rs.w[0]); + _avx_weyl_load_up_dble(rs.w[1]); + + sp=pl+(*(piup++)); + _prefetch_spinor_dble(sp); + _avx_vector_add_dble(); + u+=1; + _avx_su3_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sm).c1,(*sm).c2); + + _avx_pair_load_dble((*sm).c3,(*sm).c4); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sm).c3,(*sm).c4); + +/******************************* direction +1 *********************************/ + + 
_avx_weyl_load_dble(rs.w[0]); + _avx_weyl_load_up_dble(rs.w[1]); + + sm=pl+(*(pidn++)); + _prefetch_spinor_dble(sm); + _avx_vector_xch_i_sub_dble(); + u+=1; + _avx_su3_inverse_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sp).c1,(*sp).c2); + + _avx_pair_load_dble((*sp).c3,(*sp).c4); + _avx_vector_xch_i_add_dble(); + _avx_pair_store_dble((*sp).c3,(*sp).c4); + +/******************************* direction -1 *********************************/ + + _avx_weyl_load_dble(rs.w[0]); + _avx_weyl_load_up_dble(rs.w[1]); + + sp=pl+(*(piup++)); + _prefetch_spinor_dble(sp); + _avx_vector_xch_i_add_dble(); + u+=1; + _avx_su3_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sm).c1,(*sm).c2); + + _avx_pair_load_dble((*sm).c3,(*sm).c4); + _avx_vector_xch_i_sub_dble(); + _avx_pair_store_dble((*sm).c3,(*sm).c4); + +/******************************* direction +2 *********************************/ + + _avx_weyl_load_dble(rs.w[0]); + _avx_weyl_load_up_dble(rs.w[1]); + + sm=pl+(*(pidn++)); + _prefetch_spinor_dble(sm); + _avx_vector_xch_dble(); + _avx_vector_subadd_dble(); + u+=1; + _avx_su3_inverse_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sp).c1,(*sp).c2); + + _avx_pair_load_dble((*sp).c3,(*sp).c4); + _avx_vector_xch_dble(); + _avx_vector_addsub_dble(); + _avx_pair_store_dble((*sp).c3,(*sp).c4); + +/******************************* direction -2 *********************************/ + + _avx_weyl_load_dble(rs.w[0]); + _avx_weyl_load_up_dble(rs.w[1]); + + sp=pl+(*(piup)); + _prefetch_spinor_dble(sp); + _avx_vector_xch_dble(); + _avx_vector_addsub_dble(); + u+=1; + _avx_su3_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sm).c1,(*sm).c2); + + _avx_pair_load_dble((*sm).c3,(*sm).c4); + _avx_vector_xch_dble(); + 
_avx_vector_subadd_dble(); + _avx_pair_store_dble((*sm).c3,(*sm).c4); + +/******************************* direction +3 *********************************/ + + _avx_weyl_load_dble(rs.w[0]); + _avx_weyl_load_up_dble(rs.w[1]); + + sm=pl+(*(pidn)); + _prefetch_spinor_dble(sm); + _avx_vector_i_subadd_dble(); + u+=1; + _avx_su3_inverse_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sp).c1,(*sp).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sp).c1,(*sp).c2); + + _avx_pair_load_dble((*sp).c3,(*sp).c4); + _avx_vector_i_addsub_dble(); + _avx_pair_store_dble((*sp).c3,(*sp).c4); + +/******************************* direction -3 *********************************/ + + _avx_weyl_load_dble(rs.w[0]); + _avx_weyl_load_up_dble(rs.w[1]); + + _avx_vector_i_addsub_dble(); + u+=1; + _avx_su3_multiply_pair_dble(*u); + + _avx_pair_load_dble((*sm).c1,(*sm).c2); + _avx_vector_add_dble(); + _avx_pair_store_dble((*sm).c1,(*sm).c2); + + _avx_pair_load_dble((*sm).c3,(*sm).c4); + _avx_vector_i_subadd_dble(); + _avx_pair_store_dble((*sm).c3,(*sm).c4); + + _avx_zeroupper(); +} + +#elif (defined x64) +#include "sse2.h" + +#define _load_cst(c) \ +__asm__ __volatile__ ("movddup %0, %%xmm15" \ + : \ + : \ + "m" (c) \ + : \ + "xmm15") + +#define _mul_cst() \ +__asm__ __volatile__ ("mulpd %%xmm15, %%xmm0 \n\t" \ + "mulpd %%xmm15, %%xmm1 \n\t" \ + "mulpd %%xmm15, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +#define _mul_cst_up() \ +__asm__ __volatile__ ("mulpd %%xmm15, %%xmm3 \n\t" \ + "mulpd %%xmm15, %%xmm4 \n\t" \ + "mulpd %%xmm15, %%xmm5" \ + : \ + : \ + : \ + "xmm3", "xmm4", "xmm5") + + +static void doe(int *piup,int *pidn,su3_dble *u,spinor_dble *pk) +{ + spinor_dble *sp,*sm; + +/******************************* direction +0 *********************************/ + + sp=pk+(*(piup++)); + + _sse_load_dble((*sp).c1); + _sse_load_up_dble((*sp).c3); + + sm=pk+(*(pidn++)); + _prefetch_spinor_dble(sm); + _sse_vector_add_dble(); + _sse_su3_multiply_dble(*u); + _sse_store_up_dble(rs.s.c1); 
+ _sse_store_up_dble(rs.s.c3); + + _sse_load_dble((*sp).c2); + _sse_load_up_dble((*sp).c4); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + _sse_vector_add_dble(); + _sse_su3_multiply_dble(*u); + + _sse_store_up_dble(rs.s.c2); + _sse_store_up_dble(rs.s.c4); + +/******************************* direction -0 *********************************/ + + _sse_load_dble((*sm).c1); + _sse_load_up_dble((*sm).c3); + + sp=pk+(*(piup++)); + _prefetch_spinor_dble(sp); + _sse_vector_sub_dble(); + u+=1; + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble(rs.s.c1); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c1); + + _sse_load_dble(rs.s.c3); + _sse_vector_sub_dble(); + _sse_store_dble(rs.s.c3); + + _sse_load_dble((*sm).c2); + _sse_load_up_dble((*sm).c4); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + _sse_vector_sub_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble(rs.s.c2); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c2); + + _sse_load_dble(rs.s.c4); + _sse_vector_sub_dble(); + _sse_store_dble(rs.s.c4); + +/******************************* direction +1 *********************************/ + + _sse_load_dble((*sp).c1); + _sse_load_up_dble((*sp).c4); + + sm=pk+(*(pidn++)); + _prefetch_spinor_dble(sm); + _sse_vector_i_add_dble(); + u+=1; + _sse_su3_multiply_dble(*u); + + _sse_load_dble(rs.s.c1); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c1); + + _sse_load_dble(rs.s.c4); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_store_dble(rs.s.c4); + + _sse_load_dble((*sp).c2); + _sse_load_up_dble((*sp).c3); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + _sse_vector_i_add_dble(); + _sse_su3_multiply_dble(*u); + + _sse_load_dble(rs.s.c2); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c2); + + _sse_load_dble(rs.s.c3); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_store_dble(rs.s.c3); + +/******************************* direction -1 *********************************/ + + _sse_load_dble((*sm).c1); + _sse_load_up_dble((*sm).c4); + + 
sp=pk+(*(piup++)); + _prefetch_spinor_dble(sp); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + u+=1; + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble(rs.s.c1); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c1); + + _sse_load_dble(rs.s.c4); + _sse_vector_i_add_dble(); + _sse_store_dble(rs.s.c4); + + _sse_load_dble((*sm).c2); + _sse_load_up_dble((*sm).c3); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble(rs.s.c2); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c2); + + _sse_load_dble(rs.s.c3); + _sse_vector_i_add_dble(); + _sse_store_dble(rs.s.c3); + +/******************************* direction +2 *********************************/ + + _sse_load_dble((*sp).c1); + _sse_load_up_dble((*sp).c4); + + sm=pk+(*(pidn++)); + _prefetch_spinor_dble(sm); + _sse_vector_add_dble(); + u+=1; + _sse_su3_multiply_dble(*u); + + _sse_load_dble(rs.s.c1); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c1); + + _sse_load_dble(rs.s.c4); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c4); + + _sse_load_dble((*sp).c2); + _sse_load_up_dble((*sp).c3); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + _sse_vector_sub_dble(); + _sse_su3_multiply_dble(*u); + + _sse_load_dble(rs.s.c2); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c2); + + _sse_load_dble(rs.s.c3); + _sse_vector_sub_dble(); + _sse_store_dble(rs.s.c3); + +/******************************* direction -2 *********************************/ + + _sse_load_dble((*sm).c1); + _sse_load_up_dble((*sm).c4); + + sp=pk+(*(piup)); + _prefetch_spinor_dble(sp); + _sse_vector_sub_dble(); + u+=1; + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble(rs.s.c1); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c1); + + _sse_load_dble(rs.s.c4); + _sse_vector_sub_dble(); + _sse_store_dble(rs.s.c4); + + _sse_load_dble((*sm).c2); + _sse_load_up_dble((*sm).c3); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + 
_sse_vector_add_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble(rs.s.c2); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c2); + + _sse_load_dble(rs.s.c3); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c3); + +/******************************* direction +3 *********************************/ + + _sse_load_dble((*sp).c1); + _sse_load_up_dble((*sp).c3); + + sm=pk+(*(pidn)); + _prefetch_spinor_dble(sm); + _sse_vector_i_add_dble(); + u+=1; + _sse_su3_multiply_dble(*u); + + _sse_load_dble(rs.s.c1); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c1); + + _sse_load_dble(rs.s.c3); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_store_dble(rs.s.c3); + + _sse_load_dble((*sp).c2); + _sse_load_up_dble((*sp).c4); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_su3_multiply_dble(*u); + + _sse_load_dble(rs.s.c2); + _sse_vector_add_dble(); + _sse_store_dble(rs.s.c2); + + _sse_load_dble(rs.s.c4); + _sse_vector_i_add_dble(); + _sse_store_dble(rs.s.c4); + +/******************************* direction -3 *********************************/ + + _sse_load_dble((*sm).c1); + _sse_load_up_dble((*sm).c3); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + u+=1; + _sse_su3_inverse_multiply_dble(*u); + + _load_cst(coe); + _sse_load_dble(rs.s.c1); + _sse_vector_add_dble(); + _mul_cst(); + _sse_store_dble(rs.s.c1); + + _sse_load_dble(rs.s.c3); + _sse_vector_i_add_dble(); + _mul_cst(); + _sse_store_dble(rs.s.c3); + + _sse_load_dble((*sm).c2); + _sse_load_up_dble((*sm).c4); + + u+=1; + _prefetch_su3_dble(u); + u-=1; + + _sse_vector_i_add_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _load_cst(coe); + _sse_load_dble(rs.s.c2); + _sse_vector_add_dble(); + _mul_cst(); + _sse_store_dble(rs.s.c2); + + _sse_load_dble(rs.s.c4); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _mul_cst(); + _sse_store_dble(rs.s.c4); +} + + +static void deo(int *piup,int *pidn,su3_dble *u,spinor_dble *pl) +{ + 
spinor_dble *sp,*sm; + +/******************************* direction +0 *********************************/ + + sp=pl+(*(piup++)); + _prefetch_spinor_dble(sp); + + _load_cst(ceo); + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c3); + _mul_cst(); + _mul_cst_up(); + _sse_store_dble(rs.s.c1); + _sse_store_up_dble(rs.s.c3); + + sm=pl+(*(pidn++)); + _prefetch_spinor_dble(sm); + _sse_vector_sub_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c1); + + _sse_load_dble((*sp).c3); + _sse_vector_sub_dble(); + _sse_store_dble((*sp).c3); + + _load_cst(ceo); + _sse_load_dble(rs.s.c2); + _sse_load_up_dble(rs.s.c4); + _mul_cst(); + _mul_cst_up(); + _sse_store_dble(rs.s.c2); + _sse_store_up_dble(rs.s.c4); + + _sse_vector_sub_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c2); + + _sse_load_dble((*sp).c4); + _sse_vector_sub_dble(); + _sse_store_dble((*sp).c4); + +/******************************* direction -0 *********************************/ + + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c3); + + sp=pl+(*(piup++)); + _prefetch_spinor_dble(sp); + _sse_vector_add_dble(); + u+=1; + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c1); + + _sse_load_dble((*sm).c3); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c3); + + _sse_load_dble(rs.s.c2); + _sse_load_up_dble(rs.s.c4); + + _sse_vector_add_dble(); + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c2); + + _sse_load_dble((*sm).c4); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c4); + +/******************************* direction +1 *********************************/ + + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c4); + + sm=pl+(*(pidn++)); + _prefetch_spinor_dble(sm); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + u+=1; + 
_sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c1); + + _sse_load_dble((*sp).c4); + _sse_vector_i_add_dble(); + _sse_store_dble((*sp).c4); + + _sse_load_dble(rs.s.c2); + _sse_load_up_dble(rs.s.c3); + + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c2); + + _sse_load_dble((*sp).c3); + _sse_vector_i_add_dble(); + _sse_store_dble((*sp).c3); + +/******************************* direction -1 *********************************/ + + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c4); + + sp=pl+(*(piup++)); + _prefetch_spinor_dble(sp); + _sse_vector_i_add_dble(); + u+=1; + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c1); + + _sse_load_dble((*sm).c4); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_store_dble((*sm).c4); + + _sse_load_dble(rs.s.c2); + _sse_load_up_dble(rs.s.c3); + + _sse_vector_i_add_dble(); + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c2); + + _sse_load_dble((*sm).c3); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_store_dble((*sm).c3); + +/******************************* direction +2 *********************************/ + + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c4); + + sm=pl+(*(pidn++)); + _prefetch_spinor_dble(sm); + _sse_vector_sub_dble(); + u+=1; + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c1); + + _sse_load_dble((*sp).c4); + _sse_vector_sub_dble(); + _sse_store_dble((*sp).c4); + + _sse_load_dble(rs.s.c2); + _sse_load_up_dble(rs.s.c3); + + _sse_vector_add_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c2); + + _sse_load_dble((*sp).c3); + 
_sse_vector_add_dble(); + _sse_store_dble((*sp).c3); + +/******************************* direction -2 *********************************/ + + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c4); + + sp=pl+(*(piup)); + _prefetch_spinor_dble(sp); + _sse_vector_add_dble(); + u+=1; + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c1); + + _sse_load_dble((*sm).c4); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c4); + + _sse_load_dble(rs.s.c2); + _sse_load_up_dble(rs.s.c3); + + _sse_vector_sub_dble(); + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c2); + + _sse_load_dble((*sm).c3); + _sse_vector_sub_dble(); + _sse_store_dble((*sm).c3); + +/******************************* direction +3 *********************************/ + + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c3); + + sm=pl+(*(pidn)); + _prefetch_spinor_dble(sm); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + u+=1; + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c1); + + _sse_load_dble((*sp).c3); + _sse_vector_i_add_dble(); + _sse_store_dble((*sp).c3); + + _sse_load_dble(rs.s.c2); + _sse_load_up_dble(rs.s.c4); + + _sse_vector_i_add_dble(); + _sse_su3_inverse_multiply_dble(*u); + + _sse_load_dble((*sp).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sp).c2); + + _sse_load_dble((*sp).c4); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_store_dble((*sp).c4); + +/******************************* direction -3 *********************************/ + + _sse_load_dble(rs.s.c1); + _sse_load_up_dble(rs.s.c3); + + _sse_vector_i_add_dble(); + u+=1; + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c1); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c1); + + _sse_load_dble((*sm).c3); + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_store_dble((*sm).c3); + + _sse_load_dble(rs.s.c2); + 
_sse_load_up_dble(rs.s.c4); + + _sse_vector_i_mul_dble(); + _sse_vector_sub_dble(); + _sse_su3_multiply_dble(*u); + + _sse_load_dble((*sm).c2); + _sse_vector_add_dble(); + _sse_store_dble((*sm).c2); + + _sse_load_dble((*sm).c4); + _sse_vector_i_add_dble(); + _sse_store_dble((*sm).c4); +} + +#else + +#define _vector_mul_assign(r,c) \ + (r).c1.re*=(c); \ + (r).c1.im*=(c); \ + (r).c2.re*=(c); \ + (r).c2.im*=(c); \ + (r).c3.re*=(c); \ + (r).c3.im*=(c) + + +static void doe(int *piup,int *pidn,su3_dble *u,spinor_dble *pk) +{ + spinor_dble *sp,*sm; + su3_vector_dble psi,chi; + +/******************************* direction +0 *********************************/ + + sp=pk+(*(piup++)); + + _vector_add(psi,(*sp).c1,(*sp).c3); + _su3_multiply(rs.s.c1,*u,psi); + rs.s.c3=rs.s.c1; + + _vector_add(psi,(*sp).c2,(*sp).c4); + _su3_multiply(rs.s.c2,*u,psi); + rs.s.c4=rs.s.c2; + +/******************************* direction -0 *********************************/ + + sm=pk+(*(pidn++)); + u+=1; + + _vector_sub(psi,(*sm).c1,(*sm).c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_sub_assign(rs.s.c3,chi); + + _vector_sub(psi,(*sm).c2,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_sub_assign(rs.s.c4,chi); + +/******************************* direction +1 *********************************/ + + sp=pk+(*(piup++)); + u+=1; + + _vector_i_add(psi,(*sp).c1,(*sp).c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_sub_assign(rs.s.c4,chi); + + _vector_i_add(psi,(*sp).c2,(*sp).c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_sub_assign(rs.s.c3,chi); + +/******************************* direction -1 *********************************/ + + sm=pk+(*(pidn++)); + u+=1; + + _vector_i_sub(psi,(*sm).c1,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_add_assign(rs.s.c4,chi); + + _vector_i_sub(psi,(*sm).c2,(*sm).c3); + 
_su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_add_assign(rs.s.c3,chi); + +/******************************* direction +2 *********************************/ + + sp=pk+(*(piup++)); + u+=1; + + _vector_add(psi,(*sp).c1,(*sp).c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_add_assign(rs.s.c4,chi); + + _vector_sub(psi,(*sp).c2,(*sp).c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_sub_assign(rs.s.c3,chi); + +/******************************* direction -2 *********************************/ + + sm=pk+(*(pidn++)); + u+=1; + + _vector_sub(psi,(*sm).c1,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_sub_assign(rs.s.c4,chi); + + _vector_add(psi,(*sm).c2,(*sm).c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_add_assign(rs.s.c3,chi); + +/******************************* direction +3 *********************************/ + + sp=pk+(*(piup)); + u+=1; + + _vector_i_add(psi,(*sp).c1,(*sp).c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_sub_assign(rs.s.c3,chi); + + _vector_i_sub(psi,(*sp).c2,(*sp).c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_add_assign(rs.s.c4,chi); + +/******************************* direction -3 *********************************/ + + sm=pk+(*(pidn)); + u+=1; + + _vector_i_sub(psi,(*sm).c1,(*sm).c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c1,chi); + _vector_i_add_assign(rs.s.c3,chi); + + _vector_i_add(psi,(*sm).c2,(*sm).c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign(rs.s.c2,chi); + _vector_i_sub_assign(rs.s.c4,chi); + + _vector_mul_assign(rs.s.c1,coe); + _vector_mul_assign(rs.s.c2,coe); + _vector_mul_assign(rs.s.c3,coe); + _vector_mul_assign(rs.s.c4,coe); +} + + +static void deo(int *piup,int *pidn,su3_dble *u,spinor_dble *pl) +{ + spinor_dble *sp,*sm; + su3_vector_dble psi,chi; + + 
_vector_mul_assign(rs.s.c1,ceo); + _vector_mul_assign(rs.s.c2,ceo); + _vector_mul_assign(rs.s.c3,ceo); + _vector_mul_assign(rs.s.c4,ceo); + +/******************************* direction +0 *********************************/ + + sp=pl+(*(piup++)); + + _vector_sub(psi,rs.s.c1,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); + _vector_sub_assign((*sp).c3,chi); + + _vector_sub(psi,rs.s.c2,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_sub_assign((*sp).c4,chi); + +/******************************* direction -0 *********************************/ + + sm=pl+(*(pidn++)); + u+=1; + + _vector_add(psi,rs.s.c1,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_add_assign((*sm).c3,chi); + + _vector_add(psi,rs.s.c2,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_add_assign((*sm).c4,chi); + +/******************************* direction +1 *********************************/ + + sp=pl+(*(piup++)); + u+=1; + + _vector_i_sub(psi,rs.s.c1,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); + _vector_i_add_assign((*sp).c4,chi); + + _vector_i_sub(psi,rs.s.c2,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_i_add_assign((*sp).c3,chi); + +/******************************* direction -1 *********************************/ + + sm=pl+(*(pidn++)); + u+=1; + + _vector_i_add(psi,rs.s.c1,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_i_sub_assign((*sm).c4,chi); + + _vector_i_add(psi,rs.s.c2,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_i_sub_assign((*sm).c3,chi); + +/******************************* direction +2 *********************************/ + + sp=pl+(*(piup++)); + u+=1; + + _vector_sub(psi,rs.s.c1,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); + 
_vector_sub_assign((*sp).c4,chi); + + _vector_add(psi,rs.s.c2,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_add_assign((*sp).c3,chi); + +/******************************* direction -2 *********************************/ + + sm=pl+(*(pidn++)); + u+=1; + + _vector_add(psi,rs.s.c1,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_add_assign((*sm).c4,chi); + + _vector_sub(psi,rs.s.c2,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_sub_assign((*sm).c3,chi); + +/******************************* direction +3 *********************************/ + + sp=pl+(*(piup)); + u+=1; + + _vector_i_sub(psi,rs.s.c1,rs.s.c3); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c1,chi); + _vector_i_add_assign((*sp).c3,chi); + + _vector_i_add(psi,rs.s.c2,rs.s.c4); + _su3_inverse_multiply(chi,*u,psi); + _vector_add_assign((*sp).c2,chi); + _vector_i_sub_assign((*sp).c4,chi); + +/******************************* direction -3 *********************************/ + + sm=pl+(*(pidn)); + u+=1; + + _vector_i_add(psi,rs.s.c1,rs.s.c3); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c1,chi); + _vector_i_sub_assign((*sm).c3,chi); + + _vector_i_sub(psi,rs.s.c2,rs.s.c4); + _su3_multiply(chi,*u,psi); + _vector_add_assign((*sm).c2,chi); + _vector_i_add_assign((*sm).c4,chi); +} + +#endif + +void Dw_dble(double mu,spinor_dble *s,spinor_dble *r) +{ + int bc,ix,t; + int *piup,*pidn; + su3_dble *u,*um; + pauli_dble *m; + spin_t *so,*ro; + tm_parms_t tm; + + cpsd_int_bnd(0x1,s); + m=swdfld(); + apply_sw_dble(VOLUME/2,mu,m,s,r); + set_sd2zero(BNDRY/2,r+VOLUME); + tm=tm_parms(); + if (tm.eoflg==1) + mu=0.0; + + coe=-0.5; + ceo=-0.5; + bc=bc_type(); + piup=iup[VOLUME/2]; + pidn=idn[VOLUME/2]; + + so=(spin_t*)(s+(VOLUME/2)); + ro=(spin_t*)(r+(VOLUME/2)); + m+=VOLUME; + u=udfld(); + um=u+4*VOLUME; + + if (((cpr[0]==0)&&(bc!=3))||((cpr[0]==(NPROC0-1))&&(bc==0))) + { + ix=VOLUME/2; + + 
for (;u0)&&((t<(N0-1))||(bc!=0))) + { + doe(piup,pidn,u,s); + + mul_pauli_dble(mu,m,(*so).w,(*ro).w); + mul_pauli_dble(-mu,m+1,(*so).w+1,(*ro).w+1); + + _vector_add_assign((*ro).s.c1,rs.s.c1); + _vector_add_assign((*ro).s.c2,rs.s.c2); + _vector_add_assign((*ro).s.c3,rs.s.c3); + _vector_add_assign((*ro).s.c4,rs.s.c4); + rs=(*so); + + deo(piup,pidn,u,r); + } + else + { + (*so).s=sd0; + (*ro).s=sd0; + } + + piup+=4; + pidn+=4; + so+=1; + ro+=1; + m+=2; + } + } + else + { + for (;u0)&&((t<(N0-1))||(bc!=0))) + { + mul_pauli_dble(mu,m,(*se).w,(*re).w); + mul_pauli_dble(-mu,m+1,(*se).w+1,(*re).w+1); + } + else + { + (*se).s=sd0; + (*re).s=sd0; + } + + se+=1; + re+=1; + } + } + else + { + for (;m0)&&((t<(N0-1))||(bc!=0))) + { + mul_pauli_dble(mu,m,(*so).w,(*ro).w); + mul_pauli_dble(-mu,m+1,(*so).w+1,(*ro).w+1); + } + else + { + (*so).s=sd0; + (*ro).s=sd0; + } + + so+=1; + ro+=1; + } + } + else + { + for (;m0)&&((t<(N0-1))||(bc!=0))) + { + doe(piup,pidn,u,s); + (*ro)=rs; + } + else + (*ro).s=sd0; + + piup+=4; + pidn+=4; + ro+=1; + } + } + else + { + for (;u0)&&((t<(N0-1))||(bc!=0))) + { + rs=(*so); + deo(piup,pidn,u,r); + } + else + (*so).s=sd0; + + piup+=4; + pidn+=4; + so+=1; + } + } + else + { + for (;u0)&&((t<(N0-1))||(bc!=0))) + { + doe(piup,pidn,u,s); + + mul_pauli_dble(0.0,m,rs.w,rs.w); + mul_pauli_dble(0.0,m+1,rs.w+1,rs.w+1); + + deo(piup,pidn,u,r); + } + + piup+=4; + pidn+=4; + m+=2; + } + } + else + { + for (;u=nb)) + { + error_loc(1,1,"Dw_blk_dble [Dw_dble.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k==l)||(k>=(*b).nsd)||(l>=(*b).nsd)||((*b).ud==NULL)) + { + error_loc(1,1,"Dw_blk_dbl [Dw_dble.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + s=(*b).sd[k]; + r=(*b).sd[l]; + so=(spin_t*)(s+volh); + ro=(spin_t*)(r+volh); + + s[vol]=sd0; + r[vol]=sd0; + m=(*b).swd; + apply_sw_dble(volh,mu,m,s,r); + tm=tm_parms(); + if (tm.eoflg==1) + mu=0.0; + + 
coe=-0.5; + ceo=-0.5; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + m+=vol; + u=(*b).ud; + um=u+4*vol; + + if ((*b).nbp) + { + ibp=(*b).ibp; + ibm=ibp+(*b).nbp/2; + + for (;ibp=nb)) + { + error_loc(1,1,"Dwee_blk_dble [Dw_dble.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).nsd)||(l>=(*b).nsd)||((*b).ud==NULL)) + { + error_loc(1,1,"Dwee_blk_dbl [Dw_dble.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + se=(spin_t*)((*b).sd[k]); + re=(spin_t*)((*b).sd[l]); + m=(*b).swd; + mm=m+vol; + + if ((*b).nbp) + { + piup=(*b).iup[0]; + pidn=(*b).idn[0]; + + ibu=((cpr[0]==(NPROC0-1))&&(((*b).bo[0]+(*b).bs[0])==L0)&&(bc_type()==0)); + ibd=((cpr[0]==0)&&((*b).bo[0]==0)); + + for (;m=nb)) + { + error_loc(1,1,"Dwoo_blk_dble [Dw_dble.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).nsd)||(l>=(*b).nsd)||((*b).ud==NULL)) + { + error_loc(1,1,"Dwoo_blk_dbl [Dw_dble.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + so=(spin_t*)((*b).sd[k]+volh); + ro=(spin_t*)((*b).sd[l]+volh); + tm=tm_parms(); + if (tm.eoflg==1) + mu=0.0; + + m=(*b).swd+vol; + mm=m+vol; + + if ((*b).nbp) + { + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + + ibu=((cpr[0]==(NPROC0-1))&&(((*b).bo[0]+(*b).bs[0])==L0)&&(bc_type()==0)); + ibd=((cpr[0]==0)&&((*b).bo[0]==0)); + + for (;m=nb)) + { + error_loc(1,1,"Dwoe_blk_dble [Dw_dble.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).nsd)||(l>=(*b).nsd)||((*b).ud==NULL)) + { + error_loc(1,1,"Dwoe_blk_dbl [Dw_dble.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + s=(*b).sd[k]; + ro=(spin_t*)((*b).sd[l]+volh); + s[vol]=sd0; + + coe=-0.5; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + u=(*b).ud; + um=u+4*vol; + + if ((*b).nbp) 
+ { + ibp=(*b).ibp; + ibm=ibp+(*b).nbp/2; + + for (;ibp=nb)) + { + error_loc(1,1,"Dweo_blk_dble [Dw_dble.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k>=(*b).nsd)||(l>=(*b).nsd)||((*b).ud==NULL)) + { + error_loc(1,1,"Dweo_blk_dbl [Dw_dble.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + so=(spin_t*)((*b).sd[k]+volh); + r=(*b).sd[l]; + r[vol]=sd0; + + ceo=0.5; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + u=(*b).ud; + um=u+4*vol; + + if ((*b).nbp) + { + ibu=((cpr[0]==(NPROC0-1))&&(((*b).bo[0]+(*b).bs[0])==L0)&&(bc_type()==0)); + ibd=((cpr[0]==0)&&((*b).bo[0]==0)); + + for (;u=nb)) + { + error_loc(1,1,"Dwhat_blk_dble [Dw_dble.c]", + "Block grid is not allocated or block number out of range"); + return; + } + + if ((k<0)||(l<0)||(k==l)||(k>=(*b).nsd)||(l>=(*b).nsd)||((*b).ud==NULL)) + { + error_loc(1,1,"Dwhat_blk_dbl [Dw_dble.c]", + "Attempt to access unallocated memory space"); + return; + } + + b+=n; + vol=(*b).vol; + volh=vol/2; + s=(*b).sd[k]; + r=(*b).sd[l]; + + s[vol]=sd0; + r[vol]=sd0; + m=(*b).swd; + apply_sw_dble(volh,mu,m,s,r); + + coe=-0.5; + ceo=0.5; + piup=(*b).iup[volh]; + pidn=(*b).idn[volh]; + m+=vol; + u=(*b).ud; + um=u+4*vol; + + if ((*b).nbp) + { + ibp=(*b).ibp; + ibm=ibp+(*b).nbp/2; + + for (;ibp]" (after any number of blanks), where is + the integer value passed by the argument. An error occurs if no such + line or more than one is found. The lines + + action + ipf + im0 + irat + imu [] + isp [] + + are then read using read_line() [utils/mutils.c]. Depending on the + value of "action", some lines are not read and can be omitted in + the input file. The number of integer items on the lines with tag + "imu" and "isp" depends on the action too. The data are then added + to the data base by calling set_action_parms(iact,...). 
+ +void print_action_parms(void) + Prints the parameters of the defined actions to stdout on MPI + process 0. + +void write_action_parms(FILE *fdat) + Writes the parameters of the defined actions to the file fdat on + MPI process 0. + +void check_action_parms(FILE *fdat) + Compares the parameters of the defined actions with those stored + on the file fdat on MPI process 0, assuming the latter were written + to the file by the program write_action_parms(). + +dfl_parms_t set_dfl_parms(int *bs,int Ns) + Sets the parameters of the deflation subspace. The parameters are + + bs[4] Sizes of the blocks in DFL_BLOCKS block grid. + + Ns Number of deflation modes per block (must be + even and non-zero). + + The return value is a structure that contains the above parameters. + Note that these parameters can only be set once. + +dfl_parms_t dfl_parms(void) + Returns the parameters currently set for the deflation subspace. + +dfl_pro_parms_t set_dfl_pro_parms(int nkv,int nmx,double res) + Sets the parameters used when applying the deflation projection in the + deflated solver program dfl_sap_gcr(). The parameters are + + nkv Maximal number of Krylov vectors to be used by the + solver for the little Dirac equation before a restart. + + nmx Maximal total number of Krylov vectors generated by + the solver for the little Dirac equation. + + res Required relative residue when solving the little + Dirac equation. + + The return value is a structure that contains the above parameters. + +dfl_pro_parms_t dfl_pro_parms(void) + Returns the parameters currently set for the deflation projectors in + the deflated solver program dfl_sap_gcr(). + +dfl_gen_parms_t set_dfl_gen_parms(double kappa,double mu, + int ninv,int nmr,int ncy, + int nkv,int nmx,double res) + Sets the parameters of the inverse iteration procedure that generates + the deflation subspace. The parameters are + + kappa Hopping parameter of the Dirac operator. + + mu Twisted mass parameter. 
+ + ninv Total number of inverse iteration steps (ninv>=4). + + nmr Number of block minimal residual iterations to be + used when the SAP smoother is applied. + + ncy Number of SAP cycles per inverse iteration. + + The return value is a structure that contains the above parameters and + the bare mass m0 that corresponds to the hopping parameter kappa. + +dfl_gen_parms_t dfl_gen_parms(void) + Returns the parameters currently set for the generation of the deflation + subspace plus the corresponding bare mass m0. + +dfl_upd_parms_t set_dfl_upd_parms(double dtau,int nsm) + Sets the parameters of the deflation subspace update scheme. The + parameters are + + dtau Molecular-dynamics time separation between + updates of the deflation subspace. + + nsm Number of deflated smoothing iterations to be + applied when the subspace is updated. + + The return value is a structure that contains the above parameters. + +dfl_upd_parms_t dfl_upd_parms(void) + Returns the parameters currently set for the deflation subspace + update scheme. + +void print_dfl_parms(int ipr) + Prints the parameters of the deflation subspace, the projectors, the + subspace generation algorithm and the update scheme to stdout on MPI + process 0. The update scheme is omitted if ipr=0. + +void write_dfl_parms(FILE *fdat) + Writes the parameters of the deflation subspace, the projectors, the + subspace generation algorithm and the update scheme to the file fdat + on MPI process 0. + +void check_dfl_parms(FILE *fdat) + Compares the parameters of the deflation subspace, the projectors, the + subspace generation algorithm and the update scheme with the values + stored on the file fdat on MPI process 0, assuming the latter were + written to the file by the program write_dfl_parms() (mismatches of + maximal solver iteration numbers are not considered to be an error). + +void set_flags(event_t event) + Reports an event to the data base, which changed the global gauge + or SW fields. 
+ +void set_grid_flags(blk_grid_t grid,event_t event) + Reports an event to the data base, which changed the gauge or SW + fields on the specified block grid. + +int query_flags(query_t query) + Queries the data base on the status of the global gauge or SW + fields. The program returns 1 or 0 depending on whether the answer + to the specified query is "yes" or "no". If the query is unknown to + the data base, the program returns -1. + +int query_grid_flags(blk_grid_t grid,query_t query) + Queries the data base on the status of the gauge or SW fields on + the specified block grid. The program returns 1 or 0 depending on + whether the answer to the specified query is "yes" or "no". If the + query is unknown to the data base, the program returns -1. + +void print_flags(void) + Prints the current values of all flags related to the global gauge + and SW fields to stdout from process 0. + +void print_grid_flags(blk_grid_t grid) + Prints the current values of all flags related to the gauge and SW + fields on the specified block grid to stdout from process 0. + +force_parms_t set_force_parms(int ifr,force_t force,int ipf,int im0, + int *irat,int *imu,int *isp,int *ncr) + Sets the parameters in the force parameter set number ifr and returns + a structure containing them (see the notes). + +force_parms_t force_parms(int ifr) + Returns a structure containing the force parameter set number ifr + (see the notes). + +void read_force_parms(int ifr) + On process 0, this program scans stdin for a line starting with the + string "[Force ]" (after any number of blanks), where is + the integer value passed by the argument. An error occurs if no such + line or more than one is found. The lines + + force + ipf + im0 + irat + imu [] + isp [] + ncr [] + + are then read using read_line() [utils/mutils.c]. Depending on the + value of "force", some lines are not read and can be omitted in the + input file. 
The number of integer items on the lines with tag "imu" + and "isp" and "ncr" depends on the force too. The data are then added + to the data base by calling set_force_parms(ifr,...). + +void read_force_parms2(int ifr) + Same as read_force_parms() except that only the lines + + force + isp [] + ncr [] + + are read from stdin. All other force parameters are inferred from + the parameters of the action no ifr so that the force is the one + deriving from that action. An error occurs if the parameters of the + action no ifr have not previously been added to the data base or + if the force and action types do not match. + +void print_force_parms(void) + Prints the parameters of the defined forces to stdout on MPI + process 0. + +void print_force_parms2(void) + Prints the parameters of the defined forces to stdout on MPI + process 0 in a short format corresponding to read_force_parms2(). + +void write_force_parms(FILE *fdat) + Writes the parameters of the defined forces to the file fdat on + MPI process 0. + +void check_force_parms(FILE *fdat) + Compares the parameters of the defined forces with those stored + on the file fdat on MPI process 0, assuming the latter were written + to the file by the program write_force_parms(). + +hmc_parms_t set_hmc_parms(int nact,int *iact,int npf,int nmu, + double *mu,int nlv,double tau) + Sets some basic parameters of the HMC algorithm. The parameters are + + nact Number of terms in the total action + + iact Indices iact[i] of the action terms (i=0,..,nact-1) + + npf Number of pseudo-fermion fields on which the action + depends + + nmu Number of twisted mass parameters on which the + pseudo-fermion actions and forces depend + + mu Twisted masses mu[i] (i=0,..,nmu-1) + + nlv Number of levels of the molecular-dynamics integrator + + tau Molecular-dynamics trajectory length + + The total action must include the gauge action, but pseudo-fermion + actions are optional and the momentum action is treated separately. 
+ The program returns a structure that contains the parameters listed + above. + +hmc_parms_t hmc_parms(void) + Returns a structure containing the current values of the parameters + listed above. + +void print_hmc_parms(void) + Prints the lattice parameters to stdout on MPI process 0. + +lat_parms_t set_lat_parms(double beta,double c0, + double kappa_u,double kappa_s,double kappa_c, + double csw,double cG,double cF) + Sets the basic lattice parameters. The parameters are + + beta Inverse bare coupling (beta=6/g_0^2). + + c0 Coefficient of the plaquette loops in the gauge + action (see doc/gauge_action.pdf). + + kappa_{u,s,c} Hopping parameters of the u, s and c sea quarks. The + u and the d quark have the same hopping parameter and + quarks with vanishing hopping parameter are ignored. + + csw Coefficient of the Sheikholeslami-Wohlert term. + + cG,cF Coefficients of the gauge and fermion O(a) boundary + counterterms. + + The return value is a structure that contains the lattice parameters + and the associated bare quark masses m0u, m0s and m0c. + +lat_parms_t lat_parms(void) + Returns the current lattice parameters in a structure that contains + the above parameters plus the bare quark masses. + +void print_lat_parms(void) + Prints the lattice parameters to stdout on MPI process 0. + +bc_parms_t set_bc_parms(int type, + double cG,double cG_prime, + double cF,double cF_prime, + double *phi,double *phi_prime) + Sets the boundary conditions and the associated parameters of the + action. The parameters are + + type Chosen type of boundary condition (0: open, 1: SF, + 2: open-SF, 3: periodic). + + cG,cG_prime Gauge action improvement coefficients at time 0 + and T, respectively. + + cF,cF_prime Fermion action improvement coefficients at time 0 + and T, respectively. + + phi[0], First two angles that define the boundary values of + phi[1] the gauge field at time 0. 
+ + phi_prime[0], First two angles that define the boundary values of + phi_prime[1] the gauge field at time T. + + The return value is a structure that contains these parameters plus + the third angles. In this structure, the improvement coefficients and + the angles are stored in the form of arrays cG[2],cF[2] and phi[2][3], + where cG[0],cF[0],phi[0][3] and cG[1],cF[1],phi[1][3] are the para- + meters at time 0 and T, respectively + Parameters that are not required for the specification of the chosen + boundary conditions are not read and are set to their default values + in the data base (angles to 0, improvement coefficients to 1). In the + case of SF boundary conditions (type 1), the program only reads cG,cF + and the angles phi,phi_prime and then sets cG_prime=cG,cF_prime=cF. + When open-SF boundary conditions are chosen, all parameters except for + the angles phi are read. + +bc_parms_t bc_parms(void) + Returns a structure that contains the boundary parameters. + +void print_bc_parms(void) + Prints the boundary parameters to stdout on MPI process 0. + +void write_bc_parms(FILE *fdat) + Writes the boundary parameters to the file fdat on MPI process 0. + +void check_bc_parms(FILE *fdat) + Compares the currently set boundary parameters with the values stored + on the file fdat on MPI process 0, assuming the latter were written to + the file by the program write_bc_parms(). + +double sea_quark_mass(int im0) + Returns the bare sea quark mass m0u if im0=0, m0s if im0=1 and m0c + if im0=2. In all other cases DBL_MAX is returned. + +sw_parms_t set_sw_parms(double m0) + Sets the parameters of the SW term. The parameter is + + m0 Bare quark mass. + + The return value is a structure that contains the mass m0 and the + improvement coefficients csw and cF, the latter being copied from + the list of the lattice parameters. + +sw_parms_t sw_parms(void) + Returns the parameters currently set for the SW term. 
The values + of the coefficients csw and cF are copied from the lattice parameter + list. + +tm_parms_t set_tm_parms(int io) + Sets the twisted-mass flag. The parameter is + + io Twisted-mass flag. If io=1, the twisted-mass term + in the Dirac operator, the SAP preconditioner and + the little Dirac operator is turned off on the odd + lattice sites. Otherwise it is applied everywhere. + + The return value is a structure that contains the twisted-mass flag. + +tm_parms_t tm_parms(void) + Returns a structure containing the twisted-mass flag. + +mdint_parms_t set_mdint_parms(int ilv,integrator_t integrator,double lambda, + int nstep,int nfr,int *ifr) + Sets the parameters of the molecular-dynamics integrator at level + ilv and returns a structure containing them (see the notes). + +mdint_parms_t mdint_parms(int ilv) + Returns a structure containing the parameters of the integrator at + level ilv (see the notes). + +void read_mdint_parms(int ilv) + On process 0, this program scans stdin for a line starting with the + string "[Level ]" (after any number of blanks), where is + the integer value passed by the argument. An error occurs if no such + line or more than one is found. The lines + + integrator + lambda + nstep + forces [] + + are then read using read_line() [utils/mutils.c]. The line tagged + "lambda" is required only when the specified integrator is the 2nd + order OMF integrator. The line tagged "forces" must contain the + indices of the forces (separated by white space) that are to be + integrated at this level. On exit, the data are entered in the data + base by calling set_mdint_parms(ilv,...). + +void print_mdint_parms(void) + Prints the parameters of the defined integrator levels to stdout + on MPI process 0. + +void write_mdint_parms(FILE *fdat) + Writes the parameters of the defined integrator levels to the file + fdat on MPI process 0. 
+ +void check_mdint_parms(FILE *fdat) + Compares the parameters of the defined integrator levels with those + stored on the file fdat on MPI process 0, assuming the latter were + written to the file by the program write_mdint_parms(). + +rat_parms_t set_rat_parms(int irp,int degree,double *range) + Sets the parameters in the rational function parameter set number + irp and returns a structure containing them (see the notes). + +rat_parms_t rat_parms(int irp) + Returns a structure containing the rational function parameter set + number irp (see the notes). + +void read_rat_parms(int irp) + On process 0, this program scans stdin for a line starting with the + string "[Rational ]" (after any number of blanks), where is + the integer value passed by the argument. An error occurs if no such + line or more than one is found. The lines + + degree + range + + are then read using read_line() [utils/mutils.c] and the data are + entered into the data base by calling set_rat_parms(). + +void print_rat_parms(void) + Prints the defined rational function parameter sets to stdout on MPI + process 0. + +void write_rat_parms(FILE *fdat) + Writes the defined rational function parameter sets to the file fdat + on MPI process 0. + +void check_rat_parms(FILE *fdat) + Compares the defined rational function parameter sets with those + on the file fdat on MPI process 0, assuming the latter were written + to the file by the program write_rat_parms(). + +rw_parms_t set_rw_parms(int irw,rwfact_t rwfact,int im0,int nsrc, + int irp,int nfct,double *mu,int *np,int *isp) + Sets the parameters in the reweighting factor parameter set number + irw and returns a structure containing them (see the notes). + +rw_parms_t rw_parms(int irw) + Returns a structure containing the reweighting factor parameter set + number irw (see the notes). 
+ +void read_rw_parms(int irw) + On process 0, this program scans stdin for a line starting with the + string "[Reweighting factor ]" (after any number of blanks), where + is the integer value passed through the argument. An error occurs + if no such line or more than one is found. The lines + + rwfact + im0 + nsrc + irp + mu [] + np [] + isp [] + + are then read using read_line() [utils/mutils.c] and the data are + added to the data base by calling set_rw_parms(irw,...). Depending + on the value of "rwfact", some lines are not read and can be omitted + in the input file. The number of items on the lines with tag "mu", + "np" and "isp" depends on the reweighting factor too (see the notes). + +void print_rw_parms(void) + Prints the defined reweighting factor parameter sets to stdout on + MPI process 0. + +void write_rw_parms(FILE *fdat) + Writes the defined reweighting factor parameter sets to the file fdat + on MPI process 0. + +void check_rw_parms(FILE *fdat) + Compares the defined reweighting factor parameter sets with those + on the file fdat on MPI process 0, assuming the latter were written + to the file by the program write_rw_parms(). + +sap_parms_t set_sap_parms(int *bs,int isolv,int nmr,int ncy) + Sets the parameters of the SAP preconditioner. The parameters are + + bs[4] Sizes of the blocks in SAP_BLOCKS block grid. + + isolv Block solver to be used (0: plain MinRes, + 1: eo-preconditioned MinRes). + + nmr Number of block solver iterations. + + ncy Number of SAP cycles to be applied. + + The return value is a structure that contains the parameters of the + SAP preconditioners. The block sizes bs[4] can only be set once, but + the values of the other parameters may be changed by calling the + program again. + +sap_parms_t sap_parms(void) + Returns the parameters currently set for the SAP preconditioner. + +void print_sap_parms(int ipr) + Prints the SAP parameters to stdout on MPI process 0. 
Depending + on whether ipr!=0 or 0, the full information is printed or only + the block size. + +sf_parms_t set_sf_parms(double *phi,double *phi_prime) + Sets the parameters of the boundary fields in the Schroedinger + functional. The parameters are + + phi Angles phi[0],phi[1] at time 0. + + phi_prime Angles phi[0],phi[1] at time T. + + See the notes for further explanations. This program may only be + called once. + +sf_parms_t sf_parms(void) + Returns the parameters of the boundary fields currently set. + +void print_sf_parms(void) + Prints the parameters of the boundary fields to stdout on MPI + process 0. + +void write_sf_parms(FILE *fdat) + Writes the parameters of the boundary fields to the file fdat on + MPI process 0. + +void check_sf_parms(FILE *fdat) + Compares the parameters of the boundary fields with the values + stored on the file fdat on MPI process 0, assuming the latter were + written to the file by the program write_sf_parms(). + +int sf_flg(void) + Returns 1 if the Schroedinger-functional boundary values have been + initialized and 0 otherwise. + +solver_parms_t set_solver_parms(int isp,solver_t solver, + int nkv,int isolv,int nmr,int ncy, + int nmx,double res) + Sets the parameters in the solver parameter set number isp and returns + a structure containing them (see the notes). + +solver_parms_t solver_parms(int isp) + Returns a structure containing the solver parameter set number + isp (see the notes). + +void read_solver_parms(int isp) + On process 0, this program scans stdin for a line starting with the + string "[Solver ]" (after any number of blanks), where is + the integer value passed by the argument. An error occurs if no such + line or more than one is found. The lines + + solver + nkv + isolv + nmr + ncy + nmx + res + + are then read one by one using read_line() [utils/mutils.c]. The + lines with tags nkv,..,ncy may be absent in the case of the CGNE + and MSCG solvers (see the notes). 
The data are then added to the + data base by calling set_solver_parms(isp,...). + +void print_solver_parms(int *isap,int *idfl) + Prints the parameters of the defined solvers to stdout on MPI + process 0. On exit the flag isap is 1 or 0 depending on whether + one of the solvers makes use of the Schwarz Alternating Procedure + (SAP) or not. Similarly, the flag idfl is set to 1 or 0 depending on + whether deflation is used or not. On MPI processes other than 0, + the program does nothing and sets isap and idfl to zero. + +void write_solver_parms(FILE *fdat) + Writes the parameters of the defined solvers to the file fdat on + MPI process 0. + +void check_solver_parms(FILE *fdat) + Compares the parameters of the defined solvers with those stored + on the file fdat on MPI process 0, assuming the latter were written + to the file by the program write_solver_parms(). Mismatches of the + maximal solver iteration number are not considered to be an error. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/README.flags b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/README.flags new file mode 100644 index 0000000000000000000000000000000000000000..d892065070035a696da75c7f9acdf8865cea3ac8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/README.flags @@ -0,0 +1,149 @@ + +******************************************************************************** + + Flags data base explained + +******************************************************************************** + + +Summary +------- + +The tasks carried out in a main program depend on the relevant preparatory +steps being taken in the proper order. The data base that is maintained by the +programs in this module enables the programmer to check whether the field +arrays are in the proper condition for a specific task using the query_flags() +function. 
+ +Internally this works by assigning a unique tag to every new gauge field +configuration. Other fields that depend on the gauge fields then inherit the +tag when they are calculated. Clearly the data base must be informed about any +steps taken. The function set_flags() does that for the case when the global +fields are concerned. The current lists of events and queries are defined in +the file flags.h and should be self-explanatory. + +In general the philosophy underlying the data base is that the flags reflect +the current contents of the field arrays that are monitored. To ensure the +consistency of the data base, any program that changes the fields must include +a corresponding set_flags() statement. There should be no exception to this +rule. + + +Full-lattice flags +------------------ + +The flags related to the global fields are stored in a structure + +struct +{ + int u,ud; + int udbuf,bstap,fts; + int sw[3],swd[3]; + int aw,awh; +} lat + +with the following elements: + +lat.u Tag of the current single-precision gauge field. + +lat.ud Tag of the current double-precision gauge field. + +lat.udbuf Tag of the double-precision field when its values + at the boundaries of the local lattice were last + copied from the neighbouring MPI processes. + +lat.bstap Tag of the double-precision gauge field when the + boundary staples were last calculated. + +lat.fts Tag of the double-precision gauge field when the + gauge-field tensor was last calculated. + +lat.sw[0] Tag of the gauge field from which the current + single-precision SW-term was calculated. + +lat.sw[1] Indicates whether the single-precision SW-term on + the even sites is inverted (lat.sw[1]=1) or not + (lat.sw[1]=0). + +lat.sw[2] Indicates whether the single-precision SW-term on + the odd sites is inverted (lat.sw[2]=1) or not + (lat.sw[2]=0). + +lat.swd[0] Tag of the gauge field from which the current + double-precision SW-term was calculated. 
+ +lat.swd[1] Indicates whether the double-precision SW-term on + the even sites is inverted (lat.swd[1]=1) or not + (lat.swd[1]=0). + +lat.swd[2] Indicates whether the double-precision SW-term on + the odd sites is inverted (lat.swd[2]=1) or not + (lat.swd[2]=0). + +lat.aw Tag of the double-precision gauge field when the + little Dirac operator was last calculated. + +lat.awh Tag of the double-precision gauge field when the + even-odd preconditioned little Dirac operator was + last calculated. + +Block-grid flags +---------------- + +The data base monitors the fields on the block grids too. Flags are currently +set for two block grids (GCR_BLOCKS and DFL_BLOCKS), but further grids could +easily be incorporated. + +A complication arises from the fact that blocks may share some of the fields. +The data base only keeps track of the fields that are *not* shared. Querying +the status of a shared field is an error recorded by the error_loc() function. + +For each grid, the associated flags are contained in a structure + +typedef struct +{ + int shf; + int u,ud; + int sw[3],swd[3]; +} gf + +with the following elements: + +gf.shf Share flags of the blocks on the block grid. + The bits b1,b2 (counting from the lowest) in + this number are + + b1=1: b.u and bb.u are shared, + b2=1: b.ud and bb.ud are shared. + + All other bits are set to zero. + +gf.u Tag of the single-precision gauge field on the + blocks (=0 if the field is shared). + +gf.ud Tag of the double-precision gauge field on the + blocks (=0 if the field is shared). + +gf.sw[0] Tag of the gauge field at which the current + single-precision SW term on the blocks was + calculated (=0 if the gauge field is shared). + +gf.sw[1] Indicates whether the single-precision SW term + on the even sites of the block is inverted + (gf.sw[1]=1) or not (gf.sw[1]=0). + +gf.sw[2] Indicates whether the single-precision SW term + on the odd sites of the block is inverted + (gf.sw[2]=1) or not (gf.sw[2]=0). 
+ +gf.swd[0] Tag of the gauge field from which the current + double-precision SW term on the block was + calculated (=0 if the gauge field is shared). + +gf.swd[1] Indicates whether the double-precision SW term + on the even sites on the block is inverted + (gf.swd[1]=1) or not (gf.swd[1]=0) + +gf.swd[2] Indicates whether the double-precision SW term + on the odd sites of the block is inverted + (gf.swd[2]=1) or not (gf.swd[2]=0) diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/action_parms.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/action_parms.c new file mode 100644 index 0000000000000000000000000000000000000000..0d815baa385a8823ccd7f888db14f0e124be0eb0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/action_parms.c @@ -0,0 +1,540 @@ + +/******************************************************************************* +* +* File action_parms.c +* +* Copyright (C) 2011-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Action parameter data base. +* +* The externally accessible functions are +* +* action_parms_t set_action_parms(int iact,action_t action,int ipf, +* int im0,int *irat,int *imu,int *isp) +* Sets the parameters in the action parameter set number iact and returns +* a structure containing them (see the notes). +* +* action_parms_t action_parms(int iact) +* Returns a structure containing the action parameter set number iact +* (see the notes). +* +* void read_action_parms(int iact) +* On process 0, this program scans stdin for a line starting with the +* string "[Action ]" (after any number of blanks), where is +* the integer value passed by the argument. An error occurs if no such +* line or more than one is found. The lines +* +* action +* ipf +* im0 +* irat +* imu [] +* isp [] +* +* are then read using read_line() [utils/mutils.c]. 
Depending on the +* value of "action", some lines are not read and can be omitted in +* the input file. The number of integer items on the lines with tag +* "imu" and "isp" depends on the action too. The data are then added +* to the data base by calling set_action_parms(iact,...). +* +* void print_action_parms(void) +* Prints the parameters of the defined actions to stdout on MPI +* process 0. +* +* void write_action_parms(FILE *fdat) +* Writes the parameters of the defined actions to the file fdat on +* MPI process 0. +* +* void check_action_parms(FILE *fdat) +* Compares the parameters of the defined actions with those stored +* on the file fdat on MPI process 0, assuming the latter were written +* to the file by the program write_action_parms(). +* +* Notes: +* +* For a description of the supported actions and their parameters see +* forces/README.forces. +* +* The elements of a structure of type action_parms_t are +* +* action Action program used. This parameter is an enum type with +* one of the following values: +* +* ACG (program action0() [forces/force0.c]), +* +* ACF_TM1 (program action1() [forces/force1.c]), +* +* ACF_TM1_EO (program action4() [forces/force4.c]), +* +* ACF_TM1_EO_SDET (program action4() [forces/force4.c]), +* +* ACF_TM2 (program action2() [forces/force2.c]), +* +* ACF_TM2_EO (program action5() [forces/force5.c]), +* +* ACF_RAT (program action3() [forces/force3.c]), +* +* ACF_RAT_SDET (program action3() [forces/force3.c]), +* +* ipf Pseudo-fermion field index (see mdflds/mdflds.c), +* +* im0 Index of the bare sea quark mass in parameter data base +* (see flags/lat_parms.c), +* +* irat Indices specifying a rational function (see ratfcts/ratfcts.c), +* +* imu Twisted mass indices (see flags/hmc_parms.c), +* +* isp Solver parameter set indices (see flags/solver_parms.c). +* +* Depending on the action, some parameters are not used and are set to zero +* by set_action_parms() independently of the values of the arguments. 
In +* particular, for a given action, only the required number of integers are +* read from the arrays imu and isp passed to the program. +* +* The number of twisted mass indices and solver parameter set indices is +* 1 and 2 in the case of the actions ACF_TM1* and ACF_TM2*, where isp[k] is +* the solver parameter set used for the solution of the Dirac equation with +* twisted mass index imu[k]. +* +* Up to 32 action parameter sets, labeled by an index iact=0,1,..,31, can +* be specified. Once a set is specified, it cannot be changed by calling +* set_action_parms() again. Action parameters must be globally the same. +* +* Except for action_parms(), the programs in this module perform global +* operations and must be called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define ACTION_PARMS_C + +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define IACMAX 32 + +static int init=0; +static action_t action[]={ACG,ACF_TM1,ACF_TM1_EO,ACF_TM1_EO_SDET, + ACF_TM2,ACF_TM2_EO,ACF_RAT,ACF_RAT_SDET}; +static action_parms_t ap[IACMAX+1]={{ACTIONS,0,0,{0,0,0},{0,0,0,0},{0,0,0,0}}}; + + +static void init_ap(void) +{ + int i; + + for (i=1;i<=IACMAX;i++) + ap[i]=ap[0]; + + init=1; +} + + +action_parms_t set_action_parms(int iact,action_t action, + int ipf,int im0,int *irat,int *imu,int *isp) +{ + int iprms[15],i,ie; + int rat[3],mu[4],sp[4]; + + if (init==0) + init_ap(); + + for (i=0;i<3;i++) + rat[i]=0; + + for (i=0;i<4;i++) + { + mu[i]=0; + sp[i]=0; + } + + if ((action==ACG)||(action==ACTIONS)) + { + ipf=0; + im0=0; + } + else if ((action==ACF_TM1)||(action==ACF_TM1_EO)||(action==ACF_TM1_EO_SDET)) + { + mu[0]=imu[0]; + sp[0]=isp[0]; + } + else if ((action==ACF_TM2)||(action==ACF_TM2_EO)) + { + mu[0]=imu[0]; + mu[1]=imu[1]; + sp[0]=isp[0]; + sp[1]=isp[1]; + } + else if ((action==ACF_RAT)||(action==ACF_RAT_SDET)) + { + 
rat[0]=irat[0]; + rat[1]=irat[1]; + rat[2]=irat[2]; + sp[0]=isp[0]; + } + + if (NPROC>1) + { + iprms[0]=iact; + iprms[1]=(int)(action); + iprms[2]=ipf; + iprms[3]=im0; + + for (i=0;i<3;i++) + iprms[4+i]=rat[i]; + + for (i=0;i<4;i++) + { + iprms[7+i]=mu[i]; + iprms[11+i]=sp[i]; + } + + MPI_Bcast(iprms,15,MPI_INT,0,MPI_COMM_WORLD); + + ie=0; + ie|=(iprms[0]!=iact); + ie|=(iprms[1]!=(int)(action)); + ie|=(iprms[2]!=ipf); + ie|=(iprms[3]!=im0); + + for (i=0;i<3;i++) + ie|=(iprms[4+i]!=rat[i]); + + for (i=0;i<4;i++) + { + ie|=(iprms[7+i]!=mu[i]); + ie|=(iprms[11+i]!=sp[i]); + } + + error(ie!=0,1,"set_action_parms [action_parms.c]", + "Parameters are not global"); + } + + ie=0; + ie|=((iact<0)||(iact>=IACMAX)); + ie|=(action==ACTIONS); + ie|=((ipf<0)||(im0<0)); + + for (i=0;i<3;i++) + ie|=(rat[i]<0); + + for (i=0;i<4;i++) + ie|=((mu[i]<0)||(sp[i]<0)); + + error_root(ie!=0,1,"set_action_parms [action_parms.c]", + "Parameters are out of range"); + + error_root(ap[iact].action!=ACTIONS,1,"set_action_parms [action_parms.c]", + "Attempt to reset already specified action parameters"); + + ap[iact].action=action; + ap[iact].ipf=ipf; + ap[iact].im0=im0; + + for (i=0;i<3;i++) + ap[iact].irat[i]=rat[i]; + + for (i=0;i<4;i++) + { + ap[iact].imu[i]=mu[i]; + ap[iact].isp[i]=sp[i]; + } + + return ap[iact]; +} + + +action_parms_t action_parms(int iact) +{ + if (init==0) + init_ap(); + + if ((iact>=0)&&(iact1) + { + MPI_Bcast(&ida,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ipf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&im0,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(irat,3,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(imu,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(isp,4,MPI_INT,0,MPI_COMM_WORLD); + } + + set_action_parms(iact,action[ida],ipf,im0,irat,imu,isp); +} + + +void print_action_parms(void) +{ + int my_rank,i; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if ((my_rank==0)&&(init==1)) + { + for (i=0;i=4). 
+* +* nmr Number of block minimal residual iterations to be +* used when the SAP smoother is applied. +* +* ncy Number of SAP cycles per inverse iteration. +* +* The return value is a structure that contains the above parameters and +* the bare mass m0 that corresponds to the hopping parameter kappa. +* +* dfl_gen_parms_t dfl_gen_parms(void) +* Returns the parameters currently set for the generation of the deflation +* subspace plus the corresponding bare mass m0. +* +* dfl_upd_parms_t set_dfl_upd_parms(double dtau,int nsm) +* Sets the parameters of the deflation subspace update scheme. The +* parameters are +* +* dtau Molecular-dynamics time separation between +* updates of the deflation subspace. +* +* nsm Number of deflated smoothing iterations to be +* applied when the subspace is updated. +* +* The return value is a structure that contains the above parameters. +* +* dfl_upd_parms_t dfl_upd_parms(void) +* Returns the parameters currently set for the deflation subspace +* update scheme. +* +* void print_dfl_parms(int ipr) +* Prints the parameters of the deflation subspace, the projectors, the +* subspace generation algorithm and the update scheme to stdout on MPI +* process 0. The update scheme is omitted if ipr=0. +* +* void write_dfl_parms(FILE *fdat) +* Writes the parameters of the deflation subspace, the projectors, the +* subspace generation algorithm and the update scheme to the file fdat +* on MPI process 0. +* +* void check_dfl_parms(FILE *fdat) +* Compares the parameters of the deflation subspace, the projectors, the +* subspace generation algorithm and the update scheme with the values +* stored on the file fdat on MPI process 0, assuming the latter were +* written to the file by the program write_dfl_parms() (mismatches of +* maximal solver iteration numbers are not considered to be an error). +* +* Notes: +* +* To ensure the consistency of the data base, the parameters must be set +* simultaneously on all processes. The types dfl_parms_t, ... 
are defined +* in the file flags.h. +* +*******************************************************************************/ + +#define DFL_PARMS_C + +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +static dfl_parms_t dfl={{0,0,0,0},0}; +static dfl_pro_parms_t dfl_pro={0,0,1.0}; +static dfl_gen_parms_t dfl_gen={0,0,0,0.0,DBL_MAX,0.0}; +static dfl_upd_parms_t dfl_upd={0,0.0}; + + +static void check_block_size(int *bs) +{ + int n0,n1,n2,n3; + + error_root((bs[0]<4)||(bs[1]<4)||(bs[2]<4)||(bs[3]<4)|| + (bs[0]>L0)||(bs[1]>L1)||(bs[2]>L2)||(bs[3]>L3),1, + "check_block_size [dfl_parms.c]", + "Block sizes are out of range"); + + error_root((bs[0]%2)||(bs[1]%2)||(bs[2]%2)||(bs[3]%2),1, + "check_block_size [dfl_parms.c]", + "Block sizes must be even"); + + error_root((L0%bs[0])||(L1%bs[1])||(L2%bs[2])||(L3%bs[3]),1, + "check_block_size [dfl_parms.c]", + "Blocks do not divide the local lattice"); + + n0=L0/bs[0]; + n1=L1/bs[1]; + n2=L2/bs[2]; + n3=L3/bs[3]; + + error_root(((NPROC0*n0)%2)||((NPROC1*n1)%2)|| + ((NPROC2*n2)%2)||((NPROC3*n3)%2),1, + "check_block_size [dfl_parms.c]", + "There must be an even number of blocks in each direction"); + + error_root((n0*n1*n2*n3)%2,1, + "check_block_size [dfl_parms.c]", + "The number of blocks in the local lattice must be even"); +} + + +dfl_parms_t set_dfl_parms(int *bs,int Ns) +{ + int iprms[5]; + + if (NPROC>1) + { + iprms[0]=bs[0]; + iprms[1]=bs[1]; + iprms[2]=bs[2]; + iprms[3]=bs[3]; + iprms[4]=Ns; + + MPI_Bcast(iprms,5,MPI_INT,0,MPI_COMM_WORLD); + + error((iprms[0]!=bs[0])||(iprms[1]!=bs[1])||(iprms[2]!=bs[2])|| + (iprms[3]!=bs[3])||(iprms[4]!=Ns),1, + "set_dfl_parms [dfl_parms.c]","Parameters are not global"); + } + + error_root((dfl.Ns>0)&&((bs[0]!=dfl.bs[0])||(bs[1]!=dfl.bs[1])|| + (bs[2]!=dfl.bs[2])||(bs[3]!=dfl.bs[3])|| + (Ns!=dfl.Ns)),1, + "set_dfl_parms [dfl_parms.c]","bs[4] and Ns may be set only once"); + + check_block_size(bs); + 
error_root((Ns<2)||(Ns&0x1),1,"set_dfl_parms [dfl_parms.c]", + "Improper value of Ns"); + + dfl.bs[0]=bs[0]; + dfl.bs[1]=bs[1]; + dfl.bs[2]=bs[2]; + dfl.bs[3]=bs[3]; + dfl.Ns=Ns; + + return dfl; +} + + +dfl_parms_t dfl_parms(void) +{ + return dfl; +} + + +dfl_pro_parms_t set_dfl_pro_parms(int nkv,int nmx,double res) +{ + int iprms[2]; + double dprms[1]; + + if (NPROC>1) + { + iprms[0]=nkv; + iprms[1]=nmx; + + dprms[0]=res; + + MPI_Bcast(iprms,2,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=nkv)||(iprms[1]!=nmx)||(dprms[0]!=res),1, + "set_dfl_pro_parms [dfl_parms.c]","Parameters are not global"); + } + + error_root((nkv<1)||(nmx<1)||(res<=DBL_EPSILON),1, + "set_dfl_pro_parms [dfl_parms.c]","Improper parameter values"); + + dfl_pro.nkv=nkv; + dfl_pro.nmx=nmx; + dfl_pro.res=res; + + return dfl_pro; +} + + +dfl_pro_parms_t dfl_pro_parms(void) +{ + return dfl_pro; +} + + +dfl_gen_parms_t set_dfl_gen_parms(double kappa,double mu, + int ninv,int nmr,int ncy) +{ + int iprms[3]; + double dprms[2]; + + if (NPROC>1) + { + iprms[0]=ninv; + iprms[1]=nmr; + iprms[2]=ncy; + + dprms[0]=kappa; + dprms[1]=mu; + + MPI_Bcast(iprms,3,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=ninv)||(iprms[1]!=nmr)||(iprms[2]!=ncy)|| + (dprms[0]!=kappa)||(dprms[1]!=mu),1, + "set_dfl_gen_parms [dfl_parms.c]","Parameters are not global"); + } + + error_root((ninv<4)||(nmr<1)||(ncy<1)||(kappa<0.0),1, + "set_dfl_gen_parms [dfl_parms.c]","Parameters are out of range"); + + dfl_gen.ninv=ninv; + dfl_gen.nmr=nmr; + dfl_gen.ncy=ncy; + + dfl_gen.kappa=kappa; + dfl_gen.mu=mu; + + if (kappa!=0.0) + dfl_gen.m0=1.0/(2.0*kappa)-4.0; + else + dfl_gen.m0=DBL_MAX; + + return dfl_gen; +} + + +dfl_gen_parms_t dfl_gen_parms(void) +{ + return dfl_gen; +} + + +dfl_upd_parms_t set_dfl_upd_parms(double dtau,int nsm) +{ + int iprms[1]; + double dprms[1]; + + if (NPROC>1) + { + iprms[0]=nsm; + dprms[0]=dtau; + + 
MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=nsm)||(dprms[0]!=dtau),1, + "set_dfl_upd_parms [dfl_parms.c]","Parameters are not global"); + } + + error_root((dtau<0.0)||(nsm<0),1, + "set_dfl_upd_parms [dfl_parms.c]","Improper parameter values"); + + dfl_upd.dtau=dtau; + dfl_upd.nsm=nsm; + + return dfl_upd; +} + + +dfl_upd_parms_t dfl_upd_parms(void) +{ + return dfl_upd; +} + + +void print_dfl_parms(int ipr) +{ + int my_rank,n; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + printf("Deflation subspace:\n"); + printf("bs = %d %d %d %d\n",dfl.bs[0],dfl.bs[1],dfl.bs[2],dfl.bs[3]); + printf("Ns = %d\n\n",dfl.Ns); + + printf("Deflation projection:\n"); + printf("nkv = %d\n",dfl_pro.nkv); + printf("nmx = %d\n",dfl_pro.nmx); + printf("res = %.1e\n\n",dfl_pro.res); + + printf("Deflation subspace generation:\n"); + n=fdigits(dfl_gen.kappa); + printf("kappa = %.*f\n",IMAX(n,6),dfl_gen.kappa); + n=fdigits(dfl_gen.mu); + printf("mu = %.*f\n",IMAX(n,1),dfl_gen.mu); + printf("ninv = %d\n",dfl_gen.ninv); + printf("nmr = %d\n",dfl_gen.nmr); + printf("ncy = %d\n\n",dfl_gen.ncy); + + if (ipr) + { + printf("Deflation subspace update scheme:\n"); + n=fdigits(dfl_upd.dtau); + printf("dtau = %.*f\n",IMAX(n,1),dfl_upd.dtau); + printf("nsm = %d\n\n",dfl_upd.nsm); + } + } +} + + +void write_dfl_parms(FILE *fdat) +{ + int my_rank,endian; + int i,iw; + stdint_t istd[11]; + double dstd[4]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + endian=endianness(); + + if (my_rank==0) + { + for (i=0;i<4;i++) + istd[i]=(stdint_t)(dfl.bs[i]); + + istd[4]=(stdint_t)(dfl.Ns); + istd[5]=(stdint_t)(dfl_pro.nkv); + istd[6]=(stdint_t)(dfl_pro.nmx); + istd[7]=(stdint_t)(dfl_gen.ninv); + istd[8]=(stdint_t)(dfl_gen.nmr); + istd[9]=(stdint_t)(dfl_gen.ncy); + istd[10]=(stdint_t)(dfl_upd.nsm); + + dstd[0]=dfl_pro.res; + dstd[1]=dfl_gen.kappa; + dstd[2]=dfl_gen.mu; + dstd[3]=dfl_upd.dtau; + + if (endian==BIG_ENDIAN) + { + 
bswap_int(11,istd); + bswap_double(4,dstd); + } + + iw=fwrite(istd,sizeof(stdint_t),11,fdat); + iw+=fwrite(dstd,sizeof(double),4,fdat); + error_root(iw!=15,1,"write_dfl_parms [dfl_parms.c]", + "Incorrect write count"); + } +} + + +void check_dfl_parms(FILE *fdat) +{ + int my_rank,endian; + int i,ir,ie; + stdint_t istd[11]; + double dstd[4]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + endian=endianness(); + + if (my_rank==0) + { + ir=fread(istd,sizeof(stdint_t),11,fdat); + ir+=fread(dstd,sizeof(double),4,fdat); + error_root(ir!=15,1,"check_dfl_parms [dfl_parms.c]", + "Incorrect read count"); + + if (endian==BIG_ENDIAN) + { + bswap_int(11,istd); + bswap_double(4,dstd); + } + + ie=0; + + for (i=0;i<4;i++) + ie|=(istd[i]!=(stdint_t)(dfl.bs[i])); + + ie|=(istd[4]!=(stdint_t)(dfl.Ns)); + ie|=(istd[5]!=(stdint_t)(dfl_pro.nkv)); + ie|=(istd[7]!=(stdint_t)(dfl_gen.ninv)); + ie|=(istd[8]!=(stdint_t)(dfl_gen.nmr)); + ie|=(istd[9]!=(stdint_t)(dfl_gen.ncy)); + ie|=(istd[10]!=(stdint_t)(dfl_upd.nsm)); + + ie|=(dstd[0]!=dfl_pro.res); + ie|=(dstd[1]!=dfl_gen.kappa); + ie|=(dstd[2]!=dfl_gen.mu); + ie|=(dstd[3]!=dfl_upd.dtau); + + error_root(ie!=0,1,"check_dfl_parms [dfl_parms.c]", + "Parameters do not match"); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/flags.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/flags.c new file mode 100644 index 0000000000000000000000000000000000000000..94a49035c6af827a1a23cfd89203114033559330 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/flags.c @@ -0,0 +1,391 @@ + +/******************************************************************************* +* +* File flags.c +* +* Copyright (C) 2009, 2011, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Flags data base input and query programs +* +* The externally accessible functions are +* +* void 
set_flags(event_t event) +* Reports an event to the data base, where some of the global field +* arrays are changed. +* +* void set_grid_flags(blk_grid_t grid,event_t event) +* Reports an event to the data base, where some of the field arrays +* on the specified block grid are changed. +* +* int query_flags(query_t query) +* Queries the data base on the status of the global field arrays. +* The program returns 1 or 0 depending on whether the answer to the +* specified query is "yes" or "no". If the query is unknown to +* the data base, the program returns -1. +* +* int query_grid_flags(blk_grid_t grid,query_t query) +* Queries the data base on the status of the field arrays on the +* specified block grid. The program returns 1 or 0 depending on +* whether the answer to the specified query is "yes" or "no". If +* the query is unknown to the data base, the program returns -1. +* +* void print_flags(void) +* Prints the current values of all flags describing the state of +* the global field arrays to stdout on process 0. +* +* void print_grid_flags(blk_grid_t grid) +* Prints the current values of all flags describing the state of +* the field arrays on the specified block grid to stdout on +* process 0. +* +* Notes: +* +* The programs set_flags() and set_grid_flags() perform global operations +* and must be called on all processes simultaneously. As a consequence, +* the contents of the data base are the same everywhere. All other programs +* in this module can be called locally. +* +* The possible events and queries are defined in the header file flags.h. +* The associated actions are defined in the *.h files in the include/flags +* directory (application programs do not need to include these). +* +* For further explanations, see the file README.flags in this directory. 
+* +*******************************************************************************/ + +#define FLAGS_C + +#include +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define NFLGS (9+4*(int)(BLK_GRIDS)) + +static struct +{ + int u,ud,udbuf; + int bstap,fts; + int sw[3],swd[3]; + int aw,awh; +} lat={0,0,0,0,0,{0,0,0},{0,0,0},0,0}; + +typedef struct +{ + int shf; + int u,ud; + int sw[3],swd[3]; +} grid_flags_t; + +static int init=0,tag=0; +static int flgs[NFLGS]; +static grid_flags_t gfs[(int)(BLK_GRIDS)+1]={{0x0,0,0,{0,0,0},{0,0,0}}},*gf; + + +static void set_flgs(void) +{ + int n,igr; + + flgs[0]=lat.u; + flgs[1]=lat.ud; + flgs[2]=lat.udbuf; + flgs[3]=lat.bstap; + flgs[4]=lat.fts; + flgs[5]=lat.sw[0]; + flgs[6]=lat.swd[0]; + flgs[7]=lat.aw; + flgs[8]=lat.awh; + + n=9; + + for (igr=0;igr<(int)(BLK_GRIDS);igr++) + { + flgs[n++]=gfs[igr].u; + flgs[n++]=gfs[igr].ud; + flgs[n++]=gfs[igr].sw[0]; + flgs[n++]=gfs[igr].swd[0]; + } +} + + +static void find_gap(int *a,int *d) +{ + int k,l; + int fk,h,hmax; + + (*a)=0; + (*d)=INT_MAX; + + for (k=0;k0)&&(h<(*d))) + (*d)=h; + } + + for (k=0;k0)&&(h(*d)) + { + (*a)=fk; + (*d)=hmax; + } + } +} + + +static void compress_flags(void) +{ + int k,a,d; + int n,igr; + + set_flgs(); + find_gap(&a,&d); + d-=1; + + for (k=0;ka) + flgs[k]-=d; + } + + lat.u=flgs[0]; + lat.ud=flgs[1]; + lat.udbuf=flgs[2]; + lat.bstap=flgs[3]; + lat.fts=flgs[4]; + lat.sw[0]=flgs[5]; + lat.swd[0]=flgs[6]; + lat.aw=flgs[7]; + lat.awh=flgs[8]; + + n=9; + + for (igr=0;igr<(int)(BLK_GRIDS);igr++) + { + gfs[igr].u=flgs[n++]; + gfs[igr].ud=flgs[n++]; + gfs[igr].sw[0]=flgs[n++]; + gfs[igr].swd[0]=flgs[n++]; + } + + tag-=d; +} + + +static int next_tag(void) +{ + if (tag==INT_MAX) + compress_flags(); + tag+=1; + + return tag; +} + +#include "flags/events.h" +#include "flags/grid_events.h" +#include "flags/queries.h" +#include "flags/grid_queries.h" + +static void set_arrays(void) +{ + int igr; + + 
for (igr=1;igr<=(int)(BLK_GRIDS);igr++) + gfs[igr]=gfs[0]; + + gfs[(int)(SAP_BLOCKS)].shf=0x0; + gfs[(int)(DFL_BLOCKS)].shf=0x2; + + set_events(); + set_grid_events(); + set_queries(); + set_grid_queries(); + + init=1; +} + + +void set_flags(event_t event) +{ + int iprms[1],iev; + + if (init==0) + set_arrays(); + + iev=(int)(event); + + if (NPROC>1) + { + iprms[0]=iev; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=iev,1,"set_flags [flags.c]", + "Parameter is not global"); + } + + if (event_fcts[iev]==NULL) + error_root(1,1,"set_flags [flags.c]","No action associated to event"); + else + event_fcts[iev](); +} + + +void set_grid_flags(blk_grid_t grid,event_t event) +{ + int iprms[2],igr,iev; + + if (init==0) + set_arrays(); + + igr=(int)(grid); + iev=(int)(event); + + if (NPROC>1) + { + iprms[0]=igr; + iprms[1]=iev; + + MPI_Bcast(iprms,2,MPI_INT,0,MPI_COMM_WORLD); + + error((iprms[0]!=igr)||(iprms[1]!=iev),1, + "set_grid_flags [flags.c]","Parameters are not global"); + } + + if (grid==BLK_GRIDS) + error_root(1,1,"set_grid_flags [flags.c]", + "BLK_GRIDS is a dummy block grid"); + + if (grid_event_fcts[iev]==NULL) + error_root(1,1,"set_grid_flags [flags.c]", + "No action associated to event"); + else + { + gf=gfs+igr; + grid_event_fcts[iev](); + } +} + + +int query_flags(query_t query) +{ + int iqr; + + if (init==0) + set_arrays(); + + iqr=(int)(query); + + if (query_fcts[iqr]==NULL) + { + error_loc(1,1,"query_flags [flags.c]","No response to query"); + return -1; + } + else + return query_fcts[iqr](); +} + + +int query_grid_flags(blk_grid_t grid,query_t query) +{ + int iqr; + + if (init==0) + set_arrays(); + + iqr=(int)(query); + + if (grid_query_fcts[iqr]==NULL) + { + error_loc(1,1,"query_grid_flags [flags.c]","No response to query"); + return -1; + } + else + { + gf=gfs+(int)(grid); + return grid_query_fcts[iqr](); + } +} + + +void print_flags(void) +{ + int my_rank; + + if (init==0) + set_arrays(); + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); 
+ + if (my_rank==0) + { + printf("Full lattice flags:\n"); + printf("u = %d\n",lat.u); + printf("ud,udbuf = %d,%d\n",lat.ud,lat.udbuf); + printf("bstap,fts = %d,%d\n",lat.bstap,lat.fts); + printf("sw = %d,%d,%d\n", + lat.sw[0],lat.sw[1],lat.sw[2]); + printf("swd = %d,%d,%d\n", + lat.swd[0],lat.swd[1],lat.swd[2]); + printf("aw,awh = %d,%d\n",lat.aw,lat.awh); + printf("\n"); + } +} + + +void print_grid_flags(blk_grid_t grid) +{ + int my_rank; + + if (init==0) + set_arrays(); + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + gf=gfs+(int)(grid); + + if (grid==SAP_BLOCKS) + printf("Flags on the SAP block grid:\n"); + else if (grid==DFL_BLOCKS) + printf("Flags on the DFL block grid:\n"); + else + error_root(1,1,"print_grid_flags [flags.c]","Unknown block grid"); + + printf("shf = %#x\n",(*gf).shf); + printf("u = %d\n",(*gf).u); + printf("ud = %d\n",(*gf).ud); + printf("sw = %d,%d,%d\n", + (*gf).sw[0],(*gf).sw[1],(*gf).sw[2]); + printf("swd = %d,%d,%d\n", + (*gf).swd[0],(*gf).swd[1],(*gf).swd[2]); + printf("\n"); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/force_parms.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/force_parms.c new file mode 100644 index 0000000000000000000000000000000000000000..52cb72689dd4799ed148c022f210ce03b5d3d4c6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/force_parms.c @@ -0,0 +1,799 @@ + +/******************************************************************************* +* +* File force_parms.c +* +* Copyright (C) 2011, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Force parameter data base +* +* The externally accessible functions are +* +* force_parms_t set_force_parms(int ifr,force_t force,int ipf,int im0, +* int *irat,int *imu,int *isp,int *ncr) +* Sets the parameters in the force parameter set number ifr and returns +* 
a structure containing them (see the notes). +* +* force_parms_t force_parms(int ifr) +* Returns a structure containing the force parameter set number ifr +* (see the notes). +* +* void read_force_parms(int ifr) +* On process 0, this program scans stdin for a line starting with the +* string "[Force <int>]" (after any number of blanks), where <int> is +* the integer value passed by the argument. An error occurs if no such +* line or more than one is found. The lines +* +* force <force_t> +* ipf <int> +* im0 <int> +* irat <int> <int> <int> +* imu <int> [<int>] +* isp <int> [<int>] +* ncr <int> [<int>] +* +* are then read using read_line() [utils/mutils.c]. Depending on the +* value of "force", some lines are not read and can be omitted in the +* input file. The number of integer items on the lines with tag "imu" +* and "isp" and "ncr" depends on the force too. The data are then added +* to the data base by calling set_force_parms(ifr,...). +* +* void read_force_parms2(int ifr) +* Same as read_force_parms() except that only the lines +* +* force <force_t> +* isp <int> [<int>] +* ncr <int> [<int>] +* +* are read from stdin. All other force parameters are inferred from +* the parameters of the action no ifr so that the force is the one +* deriving from that action. An error occurs if the parameters of the +* action no ifr have not previously been added to the data base or +* if the force and action types do not match. +* +* void print_force_parms(void) +* Prints the parameters of the defined forces to stdout on MPI +* process 0. +* +* void print_force_parms2(void) +* Prints the parameters of the defined forces to stdout on MPI +* process 0 in a short format corresponding to read_force_parms2(). +* +* void write_force_parms(FILE *fdat) +* Writes the parameters of the defined forces to the file fdat on +* MPI process 0. +* +* void check_force_parms(FILE *fdat) +* Compares the parameters of the defined forces with those stored +* on the file fdat on MPI process 0, assuming the latter were written +* to the file by the program write_force_parms(). 
+* +* Notes: +* +* For a description of the supported forces and their parameters see +* forces/README.forces. +* +* The elements of a structure of type force_parms_t are +* +* force Force program used. This parameter is an enum type with +* one of the following values: +* +* FRG (program force0() [forces/force0.c]), +* +* FRF_TM1 (program force1() [forces/force1.c]), +* +* FRF_TM1_EO (program force4() [forces/force4.c]), +* +* FRF_TM1_EO_SDET (program force4() [forces/force4.c]), +* +* FRF_TM2 (program force2() [forces/force2.c]), +* +* FRF_TM2_EO (program force5() [forces/force5.c]), +* +* FRF_RAT (program force3() [forces/force3.c]), +* +* FRF_RAT_SDET (program force3() [forces/force3.c]), +* +* ipf Pseudo-fermion field index (see mdflds/mdflds.c), +* +* im0 Index of the bare sea quark mass in parameter data base +* (see flags/lat_parms.c), +* +* irat Indices specifying a rational function (see ratfcts/ratfcts.c), +* +* imu Twisted mass indices (see flags/hmc_parms.c), +* +* isp Solver parameter set indices (see flags/solver_parms.c), +* +* ncr Chronological solver stack sizes (see update/chrono.c), +* +* icr Chronological solver stack indices (set internally). +* +* Depending on the force, some parameters are not used and are set to zero +* by set_force_parms() independently of the values of the arguments. In +* particular, for a given force, only the required number of integers are +* read from the arrays imu, isp and ncr passed to the program. +* +* The number of twisted mass indices is 1 and 2 in the case of the forces +* FRF_TM1* and FRF_TM2*, respectively. These forces require a chronological +* solver stack size to be specified and 1 solver parameter set to be used +* for the solution of the Dirac equation with twisted mass index imu[0]. +* +* Up to 32 force parameter sets, labeled by an index ifr=0,1,..,31, can +* be specified. Once a set is specified, it cannot be changed by calling +* set_force_parms() again. Force parameters must be globally the same. 
+* +* Except for force_parms(), the programs in this module perform global +* operations and must be called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define FORCE_PARMS_C + +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define IFRMAX 32 + +static int init=0,icr=0; +static force_t force[]={FRG,FRF_TM1,FRF_TM1_EO,FRF_TM1_EO_SDET, + FRF_TM2,FRF_TM2_EO,FRF_RAT,FRF_RAT_SDET}; +static force_parms_t fp[IFRMAX+1]={{FORCES,0,0,{0,0,0},{0,0,0,0},{0,0,0,0}, + {0,0,0,0},{0,0,0,0}}}; + + +static void init_fp(void) +{ + int i; + + for (i=1;i<=IFRMAX;i++) + fp[i]=fp[0]; + + init=1; +} + + +force_parms_t set_force_parms(int ifr,force_t force,int ipf,int im0, + int *irat,int *imu,int *isp,int *ncr) +{ + int iprms[23],i,ie; + int rat[3],mu[4],sp[4],nc[4],ic[4]; + + if (init==0) + init_fp(); + + for (i=0;i<3;i++) + rat[i]=0; + + for (i=0;i<4;i++) + { + mu[i]=0; + sp[i]=0; + nc[i]=0; + ic[i]=0; + } + + if ((force==FRG)||(force==FORCES)) + { + ipf=0; + im0=0; + } + else if ((force==FRF_TM1)||(force==FRF_TM1_EO)||(force==FRF_TM1_EO_SDET)) + { + mu[0]=imu[0]; + sp[0]=isp[0]; + + if (ncr[0]>0) + { + icr+=1; + nc[0]=ncr[0]; + ic[0]=icr; + } + } + else if ((force==FRF_TM2)||(force==FRF_TM2_EO)) + { + mu[0]=imu[0]; + mu[1]=imu[1]; + sp[0]=isp[0]; + + if (ncr[0]>0) + { + icr+=1; + nc[0]=ncr[0]; + ic[0]=icr; + } + } + else if ((force==FRF_RAT)||(force==FRF_RAT_SDET)) + { + rat[0]=irat[0]; + rat[1]=irat[1]; + rat[2]=irat[2]; + sp[0]=isp[0]; + } + + if (NPROC>1) + { + iprms[0]=ifr; + iprms[1]=(int)(force); + iprms[2]=ipf; + iprms[3]=im0; + + for (i=0;i<3;i++) + iprms[4+i]=rat[i]; + + for (i=0;i<4;i++) + { + iprms[7+i]=mu[i]; + iprms[11+i]=sp[i]; + iprms[15+i]=nc[i]; + iprms[19+i]=ic[i]; + } + + MPI_Bcast(iprms,23,MPI_INT,0,MPI_COMM_WORLD); + + ie=0; + ie|=(iprms[0]!=ifr); + ie|=(iprms[1]!=(int)(force)); + ie|=(iprms[2]!=ipf); + 
ie|=(iprms[3]!=im0); + + for (i=0;i<3;i++) + ie|=(iprms[4+i]!=rat[i]); + + for (i=0;i<4;i++) + { + ie|=(iprms[7+i]!=mu[i]); + ie|=(iprms[11+i]!=sp[i]); + ie|=(iprms[15+i]!=nc[i]); + ie|=(iprms[19+i]!=ic[i]); + } + + error(ie!=0,1,"set_force_parms [force_parms.c]", + "Parameters are not global"); + } + + ie=0; + ie|=((ifr<0)||(ifr>=IFRMAX)); + ie|=(force==FORCES); + ie|=((ipf<0)||(im0<0)); + + for (i=0;i<3;i++) + ie|=(rat[i]<0); + + for (i=0;i<4;i++) + { + ie|=(mu[i]<0); + ie|=(sp[i]<0); + ie|=(nc[i]<0); + } + + error_root(ie!=0,1,"set_force_parms [force_parms.c]", + "Parameters are out of range"); + + error_root(fp[ifr].force!=FORCES,1,"set_force_parms [force_parms.c]", + "Attempt to reset already specified force parameters"); + + fp[ifr].force=force; + fp[ifr].ipf=ipf; + fp[ifr].im0=im0; + + for (i=0;i<3;i++) + fp[ifr].irat[i]=rat[i]; + + for (i=0;i<4;i++) + { + fp[ifr].imu[i]=mu[i]; + fp[ifr].isp[i]=sp[i]; + fp[ifr].ncr[i]=nc[i]; + fp[ifr].icr[i]=ic[i]; + } + + return fp[ifr]; +} + + +force_parms_t force_parms(int ifr) +{ + if (init==0) + init_fp(); + + if ((ifr>=0)&&(ifr1) + { + MPI_Bcast(&idf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ipf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&im0,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(irat,3,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(imu,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(isp,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(ncr,4,MPI_INT,0,MPI_COMM_WORLD); + } + + set_force_parms(ifr,force[idf],ipf,im0,irat,imu,isp,ncr); +} + + +void read_force_parms2(int ifr) +{ + int my_rank,i,ie,idf; + int ipf,im0,irat[3],imu[4],isp[4],ncr[4]; + char line[NAME_SIZE]; + action_parms_t ap; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + ie=0; + idf=0; + ipf=0; + im0=0; + + for (i=0;i<3;i++) + irat[i]=0; + + for (i=0;i<4;i++) + { + imu[i]=0; + isp[i]=0; + ncr[i]=0; + } + + if (my_rank==0) + { + ap=action_parms(ifr); + error_root(ap.action==ACTIONS,1,"read_force_parms2 [force_parms.c]", + "Undefined action"); + + sprintf(line,"Force %d",ifr); + 
find_section(line); + read_line("force","%s",line); + + if (ap.action==ACG) + ie=strcmp(line,"FRG"); + else if (ap.action==ACF_TM1) + { + ie=strcmp(line,"FRF_TM1"); + idf=1; + ipf=ap.ipf; + im0=ap.im0; + imu[0]=ap.imu[0]; + read_line("isp","%d",isp); + read_line("ncr","%d",ncr); + } + else if (ap.action==ACF_TM1_EO) + { + ie=strcmp(line,"FRF_TM1_EO"); + idf=2; + ipf=ap.ipf; + im0=ap.im0; + imu[0]=ap.imu[0]; + read_line("isp","%d",isp); + read_line("ncr","%d",ncr); + } + else if (ap.action==ACF_TM1_EO_SDET) + { + ie=strcmp(line,"FRF_TM1_EO_SDET"); + idf=3; + ipf=ap.ipf; + im0=ap.im0; + imu[0]=ap.imu[0]; + read_line("isp","%d",isp); + read_line("ncr","%d",ncr); + } + else if (ap.action==ACF_TM2) + { + ie=strcmp(line,"FRF_TM2"); + idf=4; + ipf=ap.ipf; + im0=ap.im0; + imu[0]=ap.imu[0]; + imu[1]=ap.imu[1]; + read_line("isp","%d",isp); + read_line("ncr","%d",ncr); + } + else if (ap.action==ACF_TM2_EO) + { + ie=strcmp(line,"FRF_TM2_EO"); + idf=5; + ipf=ap.ipf; + im0=ap.im0; + imu[0]=ap.imu[0]; + imu[1]=ap.imu[1]; + read_line("isp","%d",isp); + read_line("ncr","%d",ncr); + } + else if (ap.action==ACF_RAT) + { + ie=strcmp(line,"FRF_RAT"); + idf=6; + ipf=ap.ipf; + im0=ap.im0; + irat[0]=ap.irat[0]; + irat[1]=ap.irat[1]; + irat[2]=ap.irat[2]; + read_line("isp","%d",isp); + } + else if (ap.action==ACF_RAT_SDET) + { + ie=strcmp(line,"FRF_RAT_SDET"); + idf=7; + ipf=ap.ipf; + im0=ap.im0; + irat[0]=ap.irat[0]; + irat[1]=ap.irat[1]; + irat[2]=ap.irat[2]; + read_line("isp","%d",isp); + } + else + error_root(1,1,"read_force_parms2 [force_parms.c]", + "Unknown action"); + + error_root(ie!=0,1,"read_force_parms2 [force_parms.c]", + "Force and action types do not match"); + } + + if (NPROC>1) + { + MPI_Bcast(&idf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ipf,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&im0,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(irat,3,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(imu,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(isp,4,MPI_INT,0,MPI_COMM_WORLD); + 
MPI_Bcast(ncr,4,MPI_INT,0,MPI_COMM_WORLD); + } + + set_force_parms(ifr,force[idf],ipf,im0,irat,imu,isp,ncr); +} + + +void print_force_parms(void) +{ + int my_rank,i; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if ((my_rank==0)&&(init==1)) + { + for (i=0;i +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +static hmc_parms_t hmc={0,0,0,0,NULL,0.0,NULL}; + + +hmc_parms_t set_hmc_parms(int nact,int *iact,int npf,int nmu, + double *mu,int nlv,double tau) +{ + int iprms[4],i,ie; + double dprms[1]; + + if (NPROC>1) + { + iprms[0]=nact; + iprms[1]=npf; + iprms[2]=nmu; + iprms[3]=nlv; + dprms[0]=tau; + + MPI_Bcast(iprms,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=nact)||(iprms[1]!=npf)||(iprms[2]!=nmu)|| + (iprms[3]!=nlv)||(dprms[0]!=tau),1, + "set_hmc_parms [hmc_parms.c]","Parameters are not global"); + + ie=0; + + for (i=0;i0)&&(npf!=hmc.npf),1,"set_hmc_parms [hmc_parms.c]", + "Number of pseudo-fermion fields may be set only once"); + + if (nact!=hmc.nact) + { + if (hmc.iact!=NULL) + { + free(hmc.iact); + hmc.iact=NULL; + } + + if (nact>0) + { + hmc.iact=malloc(nact*sizeof(int)); + error(hmc.iact==NULL,1,"set_hmc_parms [hmc_parms.c]", + "Unable to allocate parameter array"); + } + } + + if (nmu!=hmc.nmu) + { + if (hmc.mu!=NULL) + { + free(hmc.mu); + hmc.mu=NULL; + } + + if (nmu>0) + { + hmc.mu=malloc(nmu*sizeof(double)); + error(hmc.mu==NULL,2,"set_hmc_parms [hmc_parms.c]", + "Unable to allocate parameter array"); + } + } + + hmc.nact=nact; + hmc.npf=npf; + hmc.nmu=nmu; + hmc.nlv=nlv; + hmc.tau=tau; + + for (i=0;i0) + { + printf("mu ="); + for (i=0;i +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int flg_lat=0,flg_bc=0; +static lat_parms_t lat={0,0.0,1.0,0.0,NULL,NULL,1.0}; +static 
bc_parms_t bc={0,{1.0,1.0},{1.0,1.0},{{0.0,0.0,0.0},{0.0,0.0,0.0}}}; +static sw_parms_t sw={DBL_MAX,1.0,{1.0,1.0}}; +static tm_parms_t tm={0}; + + +lat_parms_t set_lat_parms(double beta,double c0, + int nk,double *kappa,double csw) +{ + int iprms[1],ik,ie; + double dprms[3],*k; + + if (flg_lat!=0) + return lat; + + error(flg_lat!=0,1,"set_lat_parms [lat_parms.c]", + "Attempt to reset the lattice parameters"); + + error(iup[0][0]!=0,1,"set_lat_parms [lat_parms.c]", + "Geometry arrays are already set"); + + if (NPROC>1) + { + iprms[0]=nk; + dprms[0]=beta; + dprms[1]=c0; + dprms[2]=csw; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,3,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=nk)||(dprms[0]!=beta)||(dprms[1]!=c0)||(dprms[2]!=csw),1, + "set_lat_parms [lat_parms.c]","Parameters are not global"); + } + + error_root(nk<0,1,"set_lat_parms [lat_parms.c]", + "Number of kappa values must be non-negative"); + + error_root(c0<=0.0,1,"set_lat_parms [lat_parms.c]", + "Parameter c0 must be positive"); + + if (nk>0) + { + k=malloc(2*nk*sizeof(*k)); + error(k==NULL,1,"set_lat_parms [lat_parms.c]", + "Unable to allocate parameter array"); + } + else + k=NULL; + + lat.kappa=k; + lat.m0=k+nk; + + for (ik=0;ik1)&&(nk>0)) + { + for (ik=0;ik=11) + printf("kappa[%2d] = %.*f\n",ik,IMAX(n,6),lat.kappa[ik]); + else + printf("kappa[%1d] = %.*f\n",ik,IMAX(n,6),lat.kappa[ik]); + } + + n=fdigits(lat.csw); + printf("csw = %.*f\n\n",IMAX(n,1),lat.csw); + } +} + + +void write_lat_parms(FILE *fdat) +{ + int my_rank,endian; + int iw,ik; + stdint_t istd[5]; + double dstd[4]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + endian=endianness(); + + if (my_rank==0) + { + istd[0]=(stdint_t)(N0); + istd[1]=(stdint_t)(N1); + istd[2]=(stdint_t)(N2); + istd[3]=(stdint_t)(N3); + istd[4]=(stdint_t)(lat.nk); + + dstd[0]=lat.beta; + dstd[1]=lat.c0; + dstd[2]=lat.c1; + dstd[3]=lat.csw; + + if (endian==BIG_ENDIAN) + { + bswap_int(5,istd); + bswap_double(4,dstd); + } + + 
iw=fwrite(istd,sizeof(stdint_t),5,fdat); + iw+=fwrite(dstd,sizeof(double),4,fdat); + + for (ik=0;ik1) + { + iprms[0]=type; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=type,1,"set_bc_parms [lat_parms.c]", + "Parameters are not global"); + + if ((type>=0)&&(type<3)) + { + dprms[0]=cG; + dprms[1]=cF; + + if (type==0) + { + dprms[2]=0.0; + dprms[3]=0.0; + dprms[4]=0.0; + dprms[5]=0.0; + } + else if (type==1) + { + dprms[2]=phi[0]; + dprms[3]=phi[1]; + dprms[4]=phi_prime[0]; + dprms[5]=phi_prime[1]; + } + else if (type==2) + { + dprms[2]=cG_prime; + dprms[3]=cF_prime; + dprms[4]=phi_prime[0]; + dprms[5]=phi_prime[1]; + } + + MPI_Bcast(dprms,6,MPI_DOUBLE,0,MPI_COMM_WORLD); + + ie=((dprms[0]!=cG)||(dprms[1]!=cF)); + + if (type==1) + { + ie|=((dprms[2]!=phi[0])||(dprms[3]!=phi[1])); + ie|=((dprms[4]!=phi_prime[0])||(dprms[5]!=phi_prime[1])); + } + else if (type==2) + { + ie|=((dprms[2]!=cG_prime)||(dprms[3]!=cF_prime)); + ie|=((dprms[4]!=phi_prime[0])||(dprms[5]!=phi_prime[1])); + } + + error(ie!=0,1,"set_bc_parms [lat_parms.c]","Parameters are not global"); + } + } + + error_root((type<0)||(type>3),1,"set_bc_parms [lat_parms.c]", + "Unknown type of boundary condition"); + + bc.type=type; + + if ((type>=0)&&(type<3)) + { + bc.cG[0]=cG; + bc.cF[0]=cF; + + if (type==0) + { + bc.cG[1]=cG; + bc.cF[1]=cF; + } + else if (type==1) + { + bc.cG[1]=cG; + bc.cF[1]=cF; + + bc.phi[0][0]=phi[0]; + bc.phi[0][1]=phi[1]; + bc.phi[0][2]=-phi[0]-phi[1]; + + bc.phi[1][0]=phi_prime[0]; + bc.phi[1][1]=phi_prime[1]; + bc.phi[1][2]=-phi_prime[0]-phi_prime[1]; + } + else if (type==2) + { + bc.cG[1]=cG_prime; + bc.cF[1]=cF_prime; + + bc.phi[1][0]=phi_prime[0]; + bc.phi[1][1]=phi_prime[1]; + bc.phi[1][2]=-phi_prime[0]-phi_prime[1]; + } + } + + flg_bc=1; + + return bc; +} + + +bc_parms_t bc_parms(void) +{ + return bc; +} + + +void print_bc_parms(void) +{ + int my_rank,n[3]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + if (bc.type==0) + { + 
printf("Open boundary conditions\n"); + + n[0]=fdigits(bc.cG[0]); + printf("cG = %.*f\n",IMAX(n[0],1),bc.cG[0]); + n[0]=fdigits(bc.cF[0]); + printf("cF = %.*f\n\n",IMAX(n[0],1),bc.cF[0]); + } + else if (bc.type==1) + { + printf("SF boundary conditions\n"); + + n[0]=fdigits(bc.cG[0]); + printf("cG = %.*f\n",IMAX(n[0],1),bc.cG[0]); + n[0]=fdigits(bc.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bc.cF[0]); + + n[0]=fdigits(bc.phi[0][0]); + n[1]=fdigits(bc.phi[0][1]); + n[2]=fdigits(bc.phi[0][2]); + printf("phi = %.*f,%.*f,%.*f\n",IMAX(n[0],1),bc.phi[0][0], + IMAX(n[1],1),bc.phi[0][1],IMAX(n[2],1),bc.phi[0][2]); + + n[0]=fdigits(bc.phi[1][0]); + n[1]=fdigits(bc.phi[1][1]); + n[2]=fdigits(bc.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bc.phi[1][0], + IMAX(n[1],1),bc.phi[1][1],IMAX(n[2],1),bc.phi[1][2]); + } + else if (bc.type==2) + { + printf("Open-SF boundary conditions\n"); + + n[0]=fdigits(bc.cG[0]); + printf("cG = %.*f\n",IMAX(n[0],1),bc.cG[0]); + n[0]=fdigits(bc.cF[0]); + printf("cF = %.*f\n",IMAX(n[0],1),bc.cF[0]); + + n[1]=fdigits(bc.cG[1]); + printf("cG' = %.*f\n",IMAX(n[1],1),bc.cG[1]); + n[1]=fdigits(bc.cF[1]); + printf("cF' = %.*f\n",IMAX(n[1],1),bc.cF[1]); + + n[0]=fdigits(bc.phi[1][0]); + n[1]=fdigits(bc.phi[1][1]); + n[2]=fdigits(bc.phi[1][2]); + printf("phi' = %.*f,%.*f,%.*f\n\n",IMAX(n[0],1),bc.phi[1][0], + IMAX(n[1],1),bc.phi[1][1],IMAX(n[2],1),bc.phi[1][2]); + } + else + printf("Periodic boundary conditions\n\n"); + } +} + + +void write_bc_parms(FILE *fdat) +{ + int my_rank,endian,iw; + stdint_t istd[1]; + double dstd[10]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + endian=endianness(); + + if (my_rank==0) + { + istd[0]=(stdint_t)(bc.type); + + dstd[0]=bc.cG[0]; + dstd[1]=bc.cG[1]; + dstd[2]=bc.cF[0]; + dstd[3]=bc.cF[1]; + dstd[4]=bc.phi[0][0]; + dstd[5]=bc.phi[0][1]; + dstd[6]=bc.phi[0][2]; + dstd[7]=bc.phi[1][0]; + dstd[8]=bc.phi[1][1]; + dstd[9]=bc.phi[1][2]; + + if (endian==BIG_ENDIAN) + { + bswap_int(1,istd); + 
bswap_double(10,dstd); + } + + iw=fwrite(istd,sizeof(stdint_t),1,fdat); + iw+=fwrite(dstd,sizeof(double),10,fdat); + + error_root(iw!=11,1,"write_bc_parms [bc_parms.c]", + "Incorrect write count"); + } +} + + +void check_bc_parms(FILE *fdat) +{ + int my_rank,endian,ir,ie; + stdint_t istd[1]; + double dstd[10]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + endian=endianness(); + + if (my_rank==0) + { + ir=fread(istd,sizeof(stdint_t),1,fdat); + ir+=fread(dstd,sizeof(double),10,fdat); + + if (endian==BIG_ENDIAN) + { + bswap_int(1,istd); + bswap_double(10,dstd); + } + + ie=0; + ie|=(istd[0]!=(stdint_t)(bc.type)); + + ie|=(dstd[0]!=bc.cG[0]); + ie|=(dstd[1]!=bc.cG[1]); + ie|=(dstd[2]!=bc.cF[0]); + ie|=(dstd[3]!=bc.cF[1]); + ie|=(dstd[4]!=bc.phi[0][0]); + ie|=(dstd[5]!=bc.phi[0][1]); + ie|=(dstd[6]!=bc.phi[0][2]); + ie|=(dstd[7]!=bc.phi[1][0]); + ie|=(dstd[8]!=bc.phi[1][1]); + ie|=(dstd[9]!=bc.phi[1][2]); + + error_root(ir!=11,1,"check_bc_parms [bc_parms.c]", + "Incorrect read count"); + + error_root(ie!=0,1,"check_bc_parms [bc_parms.c]", + "Parameters do not match"); + } +} + + +double sea_quark_mass(int im0) +{ + if ((im0>=0)&&(im01) + { + dprms[0]=m0; + + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error(dprms[0]!=m0,1, + "set_sw_parms [lat_parms.c]","Parameter is not global"); + } + + if (m0!=sw.m0) + { + set_flags(ERASED_SW); + set_flags(ERASED_SWD); + set_grid_flags(SAP_BLOCKS,ERASED_SW); + set_flags(ERASED_AWHAT); + } + + sw.m0=m0; + sw.csw=lat.csw; + sw.cF[0]=bc.cF[0]; + sw.cF[1]=bc.cF[1]; + + return sw; +} + + +sw_parms_t sw_parms(void) +{ + sw.csw=lat.csw; + sw.cF[0]=bc.cF[0]; + sw.cF[1]=bc.cF[1]; + + return sw; +} + + +tm_parms_t set_tm_parms(int eoflg) +{ + int iprms[1]; + + if (NPROC>1) + { + iprms[0]=eoflg; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=eoflg,1, + "set_tm_parms [lat_parms.c]","Parameter is not global"); + } + + if (eoflg!=tm.eoflg) + set_flags(ERASED_AWHAT); + + tm.eoflg=eoflg; + + return tm; +} + + +tm_parms_t 
tm_parms(void) +{ + return tm; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/mdint_parms.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/mdint_parms.c new file mode 100644 index 0000000000000000000000000000000000000000..450fae18f1fe2955ed414d34894caec894613a20 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/mdint_parms.c @@ -0,0 +1,479 @@ + +/******************************************************************************* +* +* File mdint_parms.c +* +* Copyright (C) 2011, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Molecular-dynamics integrator data base +* +* The externally accessible functions are +* +* mdint_parms_t set_mdint_parms(int ilv,integrator_t integrator,double lambda, +* int nstep,int nfr,int *ifr) +* Sets the parameters of the molecular-dynamics integrator at level +* ilv and returns a structure containing them (see the notes). +* +* mdint_parms_t mdint_parms(int ilv) +* Returns a structure containing the parameters of the integrator at +* level ilv (see the notes). +* +* void read_mdint_parms(int ilv) +* On process 0, this program scans stdin for a line starting with the +* string "[Level ]" (after any number of blanks), where is +* the integer value passed by the argument. An error occurs if no such +* line or more than one is found. The lines +* +* integrator +* lambda +* nstep +* forces [] +* +* are then read using read_line() [utils/mutils.c]. The line tagged +* "lambda" is required only when the specified integrator is the 2nd +* order OMF integrator. The line tagged "forces" must contain the +* indices of the forces (separated by white space) that are to be +* integrated at this level. On exit, the data are entered in the data +* base by calling set_mdint_parms(ilv,...). 
+* +* void print_mdint_parms(void) +* Prints the parameters of the defined integrator levels to stdout +* on MPI process 0. +* +* void write_mdint_parms(FILE *fdat) +* Writes the parameters of the defined integrator levels to the file +* fdat on MPI process 0. +* +* void check_mdint_parms(FILE *fdat) +* Compares the parameters of the defined integrator levels with those +* stored on the file fdat on MPI process 0, assuming the latter were +* written to the file by the program write_mdint_parms(). +* +* Notes: +* +* A structure of type mdint_parms_t contains the parameters of a hierarchical +* molecular-dynamics integrator at a specified level (see update/README.mdint). +* Its elements are +* +* integrator Elementary integrator used. This parameter is an enum +* type with one of the following values: +* +* LPFR Leapfrog integrator +* +* OMF2 2nd order Omelyan-Mryglod-Folk integrator +* +* OMF4 4th order Omelyan-Mryglod-Folk integrator +* +* lambda Parameter of the 2nd order OMF integrator +* +* nstep Number of times the elementary integrator is applied +* at this level +* +* nfr Number of forces integrated at this level +* +* ifr Force indices ifr[i] (i=0,..,nfr-1) +* +* The parameter lambda is not used in the case of the leapfrog and the 4th +* order OMF integrator. Up to 32 integrator levels, labeled by an index +* ilv=0,1,..,31, can be specified. +* +* An example of a valid section in an input file that can be read by calling +* read_mdint_parms(3) is +* +* [Level 3] +* integrator OMF2 +* lambda 0.2 +* nstep 12 +* forces 2 4 5 +* +* In this case, there are three forces with indices 2, 4 and 5. +* +* The programs set_mdint_parms() and read_mdint_parms() perform global +* operations and must be called simultaneously on all MPI processes. 
+* +*******************************************************************************/ + +#define MDINT_PARMS_C + +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define ILVMAX 32 + +static int init=0; +static mdint_parms_t mdp[ILVMAX+1]={{INTEGRATORS,0.0,0,0,NULL}}; + + +static void init_mdp(void) +{ + int i; + + for (i=1;i<=ILVMAX;i++) + mdp[i]=mdp[0]; + + init=1; +} + + +static void alloc_ifr(int ilv,int nfr) +{ + int *ifr; + + if (mdp[ilv].nfr>0) + { + free(mdp[ilv].ifr); + mdp[ilv].nfr=0; + mdp[ilv].ifr=NULL; + } + + if (nfr>0) + { + ifr=malloc(nfr*sizeof(*ifr)); + error(ifr==NULL,1,"alloc_ifr [mdint_parms.c]", + "Unable to allocate index array"); + mdp[ilv].nfr=nfr; + mdp[ilv].ifr=ifr; + } +} + + +mdint_parms_t set_mdint_parms(int ilv,integrator_t integrator,double lambda, + int nstep,int nfr,int *ifr) +{ + int iprms[4],i,j,ie; + double dprms[1]; + + if (init==0) + init_mdp(); + + if (integrator!=OMF2) + lambda=0.0; + + if (NPROC>1) + { + iprms[0]=ilv; + iprms[1]=(int)(integrator); + iprms[2]=nstep; + iprms[3]=nfr; + dprms[0]=lambda; + + MPI_Bcast(iprms,4,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + ie=0; + ie|=(iprms[0]!=ilv); + ie|=(iprms[1]!=(int)(integrator)); + ie|=(iprms[2]!=nstep); + ie|=(iprms[3]!=nfr); + ie|=(dprms[0]!=lambda); + + for (i=0;i=ILVMAX); + ie|=(integrator==INTEGRATORS); + ie|=(nstep<1); + ie|=(nfr<1); + + for (i=0;i=0)&&(ilv0) + { + printf("Forces ="); + + for (j=0;jnmx) + nmx=nfr; + } + + istd=malloc((nmx+4)*sizeof(stdint_t)); + error_root(istd==NULL,1,"write_mdint_parms [mdint_parms.c]", + "Unable to allocate auxiliary array"); + + for (i=0;inmx) + nmx=nfr; + } + + istd=malloc((nmx+4)*sizeof(stdint_t)); + error_root(istd==NULL,1,"check_mdint_parms [mdint_parms.c]", + "Unable to allocate auxiliary array"); + + for (i=0;i]" (after any number of blanks), where is +* the integer value passed by the argument. 
An error occurs if no such +* line or more than one is found. The lines +* +* degree +* range +* +* are then read using read_line() [utils/mutils.c] and the data are +* entered into the data base by calling set_rat_parms(). +* +* void print_rat_parms(void) +* Prints the defined rational function parameter sets to stdout on MPI +* process 0. +* +* void write_rat_parms(FILE *fdat) +* Writes the defined rational function parameter sets to the file fdat +* on MPI process 0. +* +* void check_rat_parms(FILE *fdat) +* Compares the defined rational function parameter sets with those +* on the file fdat on MPI process 0, assuming the latter were written +* to the file by the program write_rat_parms(). +* +* Notes: +* +* Currently only Zolotarev rational functions are supported (see the modules +* ratfcts/zolotarev.c and ratfcts/ratfcts.c). The elements of a structure of +* type rat_parms_t are +* +* degree Degree of the rational function +* +* range[2] Lower and upper end of the approximation range (see +* ratfcts/ratfcts.c) +* +* Up to 32 parameter sets, labeled by an index irp=0,1,..,31, can be +* specified. Once a set is defined, it cannot be changed by calling +* set_rat_parms() again. Rational function parameters must be globally +* the same. +* +* Except for rat_parms(), the programs in this module perform global +* operations and must be called simultaneously on all MPI processes. 
+* +*******************************************************************************/ + +#define RAT_PARMS_C + +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define IRPMAX 32 + +static int init=0; +static rat_parms_t rp[IRPMAX+1]={{0,{0.0,0.0}}}; + + +static void init_rp(void) +{ + int irp; + + for (irp=1;irp<=IRPMAX;irp++) + rp[irp]=rp[0]; + + init=1; +} + + +rat_parms_t set_rat_parms(int irp,int degree,double *range) +{ + int ie,iprms[2]; + double dprms[2]; + + if (init==0) + init_rp(); + + if (NPROC>1) + { + iprms[0]=irp; + iprms[1]=degree; + dprms[0]=range[0]; + dprms[1]=range[1]; + + MPI_Bcast(iprms,2,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + ie=0; + ie|=(iprms[0]!=irp); + ie|=(iprms[1]!=degree); + ie|=(dprms[0]!=range[0]); + ie|=(dprms[1]!=range[1]); + + error(ie!=0,1,"set_rat_parms [rat_parms.c]", + "Parameters are not global"); + } + + ie=0; + ie|=((irp<0)||(irp>=IRPMAX)); + ie|=(degree<1); + ie|=(range[0]>=range[1]); + ie|=(range[0]<=0.0); + + error_root(ie!=0,1,"set_rat_parms [rat_parms.c]", + "Parameters are out of range"); + + error_root(rp[irp].degree!=0,1,"set_rat_parms [rat_parms.c]", + "Attempt to reset an already specified parameter set"); + + rp[irp].degree=degree; + rp[irp].range[0]=range[0]; + rp[irp].range[1]=range[1]; + + return rp[irp]; +} + + +rat_parms_t rat_parms(int irp) +{ + if (init==0) + init_rp(); + + if ((irp>=0)&&(irp1) + { + MPI_Bcast(°ree,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(range,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + set_rat_parms(irp,degree,range); +} + + +void print_rat_parms(void) +{ + int my_rank,irp,n[2]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if ((my_rank==0)&&(init==1)) + { + for (irp=0;irp]" (after any number of blanks), where +* is the integer value passed through the argument. An error occurs +* if no such line or more than one is found. 
The lines +* +* rwfact +* im0 +* nsrc +* irp +* mu [] +* np [] +* isp [] +* +* are then read using read_line() [utils/mutils.c] and the data are +* added to the data base by calling set_rw_parms(irw,...). Depending +* on the value of "rwfact", some lines are not read and can be omitted +* in the input file. The number of items on the lines with tag "mu", +* "np" and "isp" depends on the reweighting factor too (see the notes). +* +* void print_rw_parms(void) +* Prints the defined reweighting factor parameter sets to stdout on +* MPI process 0. +* +* void write_rw_parms(FILE *fdat) +* Writes the defined reweighting factor parameter sets to the file fdat +* on MPI process 0. +* +* void check_rw_parms(FILE *fdat) +* Compares the defined reweighting factor parameter sets with those +* on the file fdat on MPI process 0, assuming the latter were written +* to the file by the program write_rw_parms(). +* +* Notes: +* +* The elements of a structure of type rw_parms_t are: +* +* rwfact Reweighting factor program used. This parameter is an enum +* type with one of the following values: +* +* RWTM1 (program rwtm1() [update/rwtm.c]), +* +* RWTM1_EO (program rwtm1eo() [update/rwtmeo.c]), +* +* RWTM2 (program rwtm2() [update/rwtm.c]), +* +* RWTM2_EO (program rwtm2eo() [update/rwtmeo.c]), +* +* RWRAT (program rwrat() [update/rwrat.c]). +* +* im0 Index of the bare sea quark mass in the parameter data base +* (see flags/lat_parms.c). +* +* nsrc Number N of random source fields to be used for the stochastic +* estimation of the reweighting factor. If the latter is split +* into a product of factors, N random fields are used for each of +* them. +* +* irp Rational function parameter set index. Only relevant if +* rwfact=RWRAT. +* +* nfct If rwfact=RWTM*: Number of Hasenbusch factors into which the +* reweighting factor is decomposed; +* If rwfact=RWRAT: Number of rational factors into which the +* rational function is decomposed. 
+* +* mu Array of twisted masses that define the Hasenbusch factors +* (nfct elements; 0 +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define IRWMAX 32 + +static int init=0; +static rwfact_t rwfact[]={RWTM1,RWTM1_EO,RWTM2,RWTM2_EO,RWRAT}; +static rw_parms_t rw[IRWMAX+1]={{RWFACTS,0,0,0,0,NULL,NULL,NULL}}; + + +static void init_rw(void) +{ + int irw; + + for (irw=1;irw<=IRWMAX;irw++) + rw[irw]=rw[0]; + + init=1; +} + + +rw_parms_t set_rw_parms(int irw,rwfact_t rwfact,int im0,int nsrc, + int irp,int nfct,double *mu,int *np,int *isp) +{ + int iprms[6],i,ie; + double dprms[1]; + + if (init==0) + init_rw(); + + error_root((rwfact!=RWTM1)&&(rwfact!=RWTM1_EO)&& + (rwfact!=RWTM2)&&(rwfact!=RWTM2_EO)&&(rwfact!=RWRAT),1, + "set_rw_parms [rw_parms.c]","Unknown type of reweighting factor"); + + if (rwfact!=RWRAT) + irp=0; + + if (NPROC>1) + { + iprms[0]=irw; + iprms[1]=(int)(rwfact); + iprms[2]=im0; + iprms[3]=nsrc; + iprms[4]=irp; + iprms[5]=nfct; + + MPI_Bcast(iprms,6,MPI_INT,0,MPI_COMM_WORLD); + + ie=0; + ie|=(iprms[0]!=irw); + ie|=(iprms[1]!=(int)(rwfact)); + ie|=(iprms[2]!=im0); + ie|=(iprms[3]!=nsrc); + ie|=(iprms[4]!=irp); + ie|=(iprms[5]!=nfct); + + error(ie!=0,1,"set_rw_parms [rw_parms.c]", + "Parameters are not global"); + } + + ie=0; + ie|=((irw<0)||(irw>=IRWMAX)); + ie|=(im0<0); + ie|=(nsrc<1); + ie|=(irp<0); + ie|=(nfct<1); + + error_root(ie!=0,1,"set_rw_parms [rw_parms.c]", + "Parameters are out of range"); + + if (NPROC>1) + { + if (rwfact!=RWRAT) + { + for (i=0;i=0)&&(irw1) + { + MPI_Bcast(&idr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&im0,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nsrc,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&irp,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nfct,1,MPI_INT,0,MPI_COMM_WORLD); + } + + if (idr<4) + { + mu=malloc(nfct*sizeof(*mu)); + np=NULL; + isp=malloc(nfct*sizeof(*isp)); + error((mu==NULL)||(isp==NULL),1,"read_rw_parms [rw_parms.c]", + "Unable to allocated data 
arrays"); + } + else + { + mu=NULL; + np=malloc(2*nfct*sizeof(*np)); + isp=np+nfct; + error(np==NULL,1,"read_rw_parms [rw_parms.c]", + "Unable to allocated data arrays"); + } + + if (my_rank==0) + { + if (idr<4) + read_dprms("mu",nfct,mu); + else + read_iprms("np",nfct,np); + + n=count_tokens("isp"); + error_root(n<1,1,"read_rw_parms [rw_parms.c]", + "No data on the line with tag isp"); + + if (n>nfct) + n=nfct; + read_iprms("isp",n,isp); + + for (i=n;i1) + { + if (idr<4) + MPI_Bcast(mu,nfct,MPI_DOUBLE,0,MPI_COMM_WORLD); + else + MPI_Bcast(np,nfct,MPI_INT,0,MPI_COMM_WORLD); + + MPI_Bcast(isp,nfct,MPI_INT,0,MPI_COMM_WORLD); + } + + set_rw_parms(irw,rwfact[idr],im0,nsrc,irp,nfct,mu,np,isp); + + if (idr<4) + { + free(mu); + free(isp); + } + else + free(np); +} + + +void print_rw_parms(void) +{ + int my_rank,irw,idr,nfct,n,i; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if ((my_rank==0)&&(init==1)) + { + for (irw=0;irw +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +static sap_parms_t sap={{0,0,0,0},0,0,0}; + + +static void check_block_size(int *bs) +{ + int n0,n1,n2,n3; + + error_root((bs[0]<4)||(bs[1]<4)||(bs[2]<4)||(bs[3]<4)|| + (bs[0]>L0)||(bs[1]>L1)||(bs[2]>L2)||(bs[3]>L3),1, + "check_block_size [sap_parms.c]", + "Block sizes are out of range"); + + error_root((bs[0]%2)||(bs[1]%2)||(bs[2]%2)||(bs[3]%2),1, + "check_block_size [sap_parms.c]", + "Block sizes must be even"); + + error_root((L0%bs[0])||(L1%bs[1])||(L2%bs[2])||(L3%bs[3]),1, + "check_block_size [sap_parms.c]", + "Blocks do not divide the local lattice"); + + n0=L0/bs[0]; + n1=L1/bs[1]; + n2=L2/bs[2]; + n3=L3/bs[3]; + + error_root(((NPROC0*n0)%2)||((NPROC1*n1)%2)|| + ((NPROC2*n2)%2)||((NPROC3*n3)%2),1, + "check_block_size [sap_parms.c]", + "There must be an even number of blocks in each direction"); + + error_root((n0*n1*n2*n3)%2,1, + "check_block_size [sap_parms.c]", + "The number of blocks in the local lattice must be even"); +} + + + 
+sap_parms_t set_sap_parms(int *bs,int isolv,int nmr,int ncy) +{ + int iprms[7]; + + if (NPROC>1) + { + iprms[0]=bs[0]; + iprms[1]=bs[1]; + iprms[2]=bs[2]; + iprms[3]=bs[3]; + iprms[4]=isolv; + iprms[5]=nmr; + iprms[6]=ncy; + + MPI_Bcast(iprms,7,MPI_INT,0,MPI_COMM_WORLD); + + error((iprms[0]!=bs[0])||(iprms[1]!=bs[1])||(iprms[2]!=bs[2])|| + (iprms[3]!=bs[3])||(iprms[4]!=isolv)||(iprms[5]!=nmr)|| + (iprms[6]!=ncy),1, + "set_sap_parms [sap_parms.c]","Parameters are not global"); + } + + if (sap.ncy>0) + { + error_root((bs[0]!=sap.bs[0])||(bs[1]!=sap.bs[1])|| + (bs[2]!=sap.bs[2])||(bs[3]!=sap.bs[3]),1, + "set_sap_parms [sap_parms.c]","bs[4] may be set only once"); + } + else + { + check_block_size(bs); + sap.bs[0]=bs[0]; + sap.bs[1]=bs[1]; + sap.bs[2]=bs[2]; + sap.bs[3]=bs[3]; + } + + error_root((isolv<0)||(isolv>1)||(nmr<1)||(ncy<1),1, + "set_sap_parms [sap_parms.c]", + "Improper value of isolv, nmr or ncy"); + + sap.isolv=isolv; + sap.nmr=nmr; + sap.ncy=ncy; + + return sap; +} + + +sap_parms_t sap_parms(void) +{ + return sap; +} + + +void print_sap_parms(int ipr) +{ + int my_rank; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + if (ipr) + { + printf("SAP parameters:\n"); + printf("bs = %d %d %d %d\n", + sap.bs[0],sap.bs[1],sap.bs[2],sap.bs[3]); + printf("isolv = %d\n",sap.isolv); + printf("nmr = %d\n",sap.nmr); + printf("ncy = %d\n\n",sap.ncy); + } + else + { + printf("SAP block size:\n"); + printf("bs = %d %d %d %d\n\n", + sap.bs[0],sap.bs[1],sap.bs[2],sap.bs[3]); + } + } +} + + +void write_sap_parms(FILE *fdat) +{ + int my_rank,endian; + int i,iw; + stdint_t istd[7]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + endian=endianness(); + + if (my_rank==0) + { + for (i=0;i<4;i++) + istd[i]=(stdint_t)(sap.bs[i]); + + istd[4]=(stdint_t)(sap.isolv); + istd[5]=(stdint_t)(sap.nmr); + istd[6]=(stdint_t)(sap.ncy); + + if (endian==BIG_ENDIAN) + bswap_int(7,istd); + + iw=fwrite(istd,sizeof(stdint_t),7,fdat); + error_root(iw!=7,1,"write_sap_parms 
[sap_parms.c]", + "Incorrect write count"); + } +} + + +void check_sap_parms(FILE *fdat) +{ + int my_rank,endian; + int i,ir,ie; + stdint_t istd[7]; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + endian=endianness(); + + if (my_rank==0) + { + ir=fread(istd,sizeof(stdint_t),7,fdat); + error_root(ir!=7,1,"check_sap_parms [sap_parms.c]", + "Incorrect read count"); + + if (endian==BIG_ENDIAN) + bswap_int(7,istd); + + ie=0; + + for (i=0;i<4;i++) + ie|=(istd[i]!=(stdint_t)(sap.bs[i])); + + ie|=(istd[4]!=(stdint_t)(sap.isolv)); + ie|=(istd[5]!=(stdint_t)(sap.nmr)); + ie|=(istd[6]!=(stdint_t)(sap.ncy)); + + error_root(ie!=0,1,"check_sap_parms [sap_parms.c]", + "Parameters do not match"); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/solver_parms.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/solver_parms.c new file mode 100644 index 0000000000000000000000000000000000000000..fa83f91d4d3973dc25e75db311cf7fa5686bb550 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/flags/solver_parms.c @@ -0,0 +1,433 @@ + +/******************************************************************************* +* +* File solver_parms.c +* +* Copyright (C) 2011, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Solver parameter data base +* +* The externally accessible functions are +* +* solver_parms_t set_solver_parms(int isp,solver_t solver, +* int nkv,int isolv,int nmr,int ncy, +* int nmx,double res) +* Sets the parameters in the solver parameter set number isp and returns +* a structure containing them (see the notes). +* +* solver_parms_t solver_parms(int isp) +* Returns a structure containing the solver parameter set number +* isp (see the notes). 
+* +* void read_solver_parms(int isp) +* On process 0, this program scans stdin for a line starting with the +* string "[Solver ]" (after any number of blanks), where is +* the integer value passed by the argument. An error occurs if no such +* line or more than one is found. The lines +* +* solver +* nkv +* isolv +* nmr +* ncy +* nmx +* res +* +* are then read one by one using read_line() [utils/mutils.c]. The +* lines with tags nkv,..,ncy may be absent in the case of the CGNE +* and MSCG solvers (see the notes). The data are then added to the +* data base by calling set_solver_parms(isp,...). +* +* void print_solver_parms(int *isap,int *idfl) +* Prints the parameters of the defined solvers to stdout on MPI +* process 0. On exit the flag isap is 1 or 0 depending on whether +* one of the solvers makes use of the Schwarz Alternating Procedure +* (SAP) or not. Similarly, the flag idfl is set 1 or 0 depending on +* whether deflation is used or not. On MPI processes other than 0, +* the program does nothing and sets isap and idfl to zero. +* +* void write_solver_parms(FILE *fdat) +* Writes the parameters of the defined solvers to the file fdat on +* MPI process 0. +* +* void check_solver_parms(FILE *fdat) +* Compares the parameters of the defined solvers with those stored +* on the file fdat on MPI process 0, assuming the latter were written +* to the file by the program write_solver_parms() (mismatches of the +* maximal solver iteration number are not considered to be an error). +* +* Notes: +* +* The elements of a structure of type solver_parms_t are +* +* solver Solver program used. This parameter is an enum type with +* one of the following values: +* +* CGNE Program tmcg() [forces/tmcg.c]. +* +* MSCG Program tmcgm() [forces/tmcgm.c]. +* +* SAP_GCR Program sap_gcr() [sap/sap_gcr.c]. +* +* DFL_SAP_GCR Program dfl_sap_gcr() [dfl/dfl_sap_gcr.c]. +* +* nkv Maximal number of Krylov vectors generated before the GCR +* algorithm is restarted if solver=*_GCR. 
+* +* isolv Block solver to be used if solver=*SAP_GCR (0: plain MinRes, +* 1: eo-preconditioned MinRes). +* +* nmr Number of block solver iterations if solver=*SAP_GCR. +* +* ncy Number of SAP cycles to be applied if solver=*SAP_GCR. +* +* nmx Maximal number of CG iterations if solver={CGNE,MSCG} or +* maximal total number of Krylov vectors that may be generated +* if solver={SAP_GCR,DFL_SAP_GCR}. +* +* res Desired maximal relative residue of the calculated solution. +* +* Depending on the solver, some parameters are not used. These are set to +* zero by the program set_solver_parms() independently of the values of +* the arguments. +* +* Up to 32 solver parameter sets, labeled by an index isp=0,1,..,31, can +* be specified. Once a set is specified, it cannot be changed by calling +* set_solver_parms() again. Solver parameters must be globally the same. +* +* Except for solver_parms(), the programs in this module perform global +* operations and must be called simultaneously on all MPI processes. 
+* +*******************************************************************************/ + +#define SOLVER_PARMS_C + +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "global.h" + +#define ISPMAX 32 + +static int init=0; +static solver_t solver[]={CGNE,MSCG,SAP_GCR,DFL_SAP_GCR}; +static solver_parms_t sp[ISPMAX+1]={{SOLVERS,0,0,0,0,0,0.0}}; + + +static void init_sp(void) +{ + int i; + + for (i=1;i<=ISPMAX;i++) + sp[i]=sp[0]; + + init=1; +} + + +solver_parms_t set_solver_parms(int isp,solver_t solver, + int nkv,int isolv,int nmr,int ncy, + int nmx,double res) +{ + int ie,iprms[7]; + double dprms[1]; + + if (init==0) + init_sp(); + + if ((solver==CGNE)||(solver==MSCG)) + { + nkv=0; + isolv=0; + nmr=0; + ncy=0; + } + + if (NPROC>1) + { + iprms[0]=isp; + iprms[1]=(int)(solver); + iprms[2]=nkv; + iprms[3]=isolv; + iprms[4]=nmr; + iprms[5]=ncy; + iprms[6]=nmx; + dprms[0]=res; + + MPI_Bcast(iprms,7,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + ie=0; + ie|=(iprms[0]!=isp); + ie|=(iprms[1]!=(int)(solver)); + ie|=(iprms[2]!=nkv); + ie|=(iprms[3]!=isolv); + ie|=(iprms[4]!=nmr); + ie|=(iprms[5]!=ncy); + ie|=(iprms[6]!=nmx); + ie|=(dprms[0]!=res); + + error(ie!=0,1,"set_solver_parms [solver_parms.c]", + "Parameters are not global"); + } + + ie=0; + ie|=(isp<0)||(isp>=ISPMAX); + ie|=(solver==SOLVERS); + ie|=(nmx<1); + + if ((solver==SAP_GCR)||(solver==DFL_SAP_GCR)) + { + ie|=(isolv<0)||(isolv>1); + ie|=(nmr<1); + ie|=(ncy<1); + } + + error_root(ie!=0,1,"set_solver_parms [solver_parms.c]", + "Parameters are out of range"); + + error_root(sp[isp].solver!=SOLVERS,1,"set_solver_parms [solver_parms.c]", + "Attempt to reset an already specified solver parameter set"); + + sp[isp].solver=solver; + sp[isp].nkv=nkv; + sp[isp].isolv=isolv; + sp[isp].nmr=nmr; + sp[isp].ncy=ncy; + sp[isp].nmx=nmx; + sp[isp].res=res; + + return sp[isp]; +} + + +solver_parms_t solver_parms(int isp) +{ + if (init==0) + 
init_sp(); + + if ((isp>=0)&&(isp1) + { + MPI_Bcast(&ids,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nkv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&isolv,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmr,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&ncy,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&nmx,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(&res,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + set_solver_parms(isp,solver[ids],nkv,isolv,nmr,ncy,nmx,res); +} + + +void print_solver_parms(int *isap,int *idfl) +{ + int my_rank,i; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + (*isap)=0; + (*idfl)=0; + + if ((my_rank==0)&&(init==1)) + { + for (i=0;iexp(t*T^a)*U(x,mu)}. + +The generators are assumed to be normalized such that + + tr{T^a*T^b}=-(1/2)*delta^{ab}, + +but the 3x3 matrices F(x,mu)^a*T^a (summed over a) do not depend on this +convention. + + +Supported actions +----------------- + +- Gauge action. + Program: action0(). + Symbol: ACG. + Parameters: none. + +- One-parameter twisted-mass pseudo-fermion action. + Program: action1(). + Symbol: ACF_TM1. + Parameters: mu,ipf,isp [see force1.c], m0 [bare mass]. + +- One-parameter twisted-mass pseudo-fermion action with even-odd + preconditioning. + Program: action4(). + Symbol: ACF_TM1_EO. + Parameters: mu,ipf,isp [see force4.c], m0 [bare mass]. + +- One-parameter twisted-mass pseudo-fermion action with even-odd + preconditioning plus "small determinant" action. + Program: action4(). + Symbol: ACF_TM1_EO_SDET. + Parameters: mu,ipf,isp [see force4.c], m0 [bare mass]. + +- Two-parameter (Hasenbusch) twisted-mass pseudo-fermion action. + Program: action2(). + Symbol: ACF_TM2. + Parameters: mu0,mu1,ipf,isp [see force2.c], m0 [bare mass]. + +- Two-parameter (Hasenbusch) twisted-mass pseudo-fermion action + with even-odd preconditioning. + Program: action5(). + Symbol: ACF_TM2_EO. + Parameters: mu0,mu1,ipf,isp [see force5.c], m0 [bare mass]. + +- Rational function pseudo-fermion action. + Program: action3(). + Symbol: ACF_RAT. 
+ Parameters: irat,ipf,isp [see force3.c], m0 [bare mass]. + +- Rational function pseudo-fermion action plus "small determinant" + action. + Program: action3(). + Symbol: ACF_RAT_SDET. + Parameters: irat,ipf,isp [see force3.c], m0 [bare mass]. + + +Associated forces +----------------- + +- Gauge force. + Program: force0(). + Symbol: FRG. + Parameters: none. + +- One-parameter twisted-mass pseudo-fermion force. + Program: force1(). + Symbol: FRF_TM1. + Parameters: mu,ipf,isp,icr [see force1.c], m0 [bare mass]. + +- One-parameter twisted-mass pseudo-fermion force with even-odd + preconditioning. + Program: force4(). + Symbol: FRF_TM1_EO. + Parameters: mu,ipf,isp,icr [see force4.c], m0 [bare mass]. + +- One-parameter twisted-mass pseudo-fermion force with even-odd + preconditioning plus "small determinant" force. + Program: force4(). + Symbol: FRF_TM1_EO_SDET. + Parameters: mu,ipf,isp,icr [see force4.c], m0 [bare mass]. + +- Two-parameter (Hasenbusch) twisted-mass pseudo-fermion force. + Program: force2(). + Symbol: FRF_TM2. + Parameters: mu0,mu1,ipf,isp,icr [see force2.c], m0 [bare mass]. + +- Two-parameter (Hasenbusch) twisted-mass pseudo-fermion force with + even-odd preconditioning. + Program: force5(). + Symbol: FRF_TM2_EO. + Parameters: mu0,mu1,ipf,isp,icr [see force5.c], m0 [bare mass]. + +- Rational function pseudo-fermion force. + Program: force3(). + Symbol: FRF_RAT. + Parameters: irat,ipf,isp [see force3.c], m0 [bare mass]. + +- Rational function pseudo-fermion plus "small determinant" force. + Program: force3(). + Symbol: FRF_RAT_SDET. + Parameters: irat,ipf,isp [see force3.c], m0 [bare mass]. + + +Pseudo-fermion fields +--------------------- + +Pseudo-fermion fields are allocated permanently at the start of the simulation +program. They are administered by the module mdflds/mdflds.c together with the +momentum and the force fields. 
+ +The maximal number npf of pseudo-fermion fields is set together with the other +parameters of the HMC algorithm (see flags/hmc_parms.c). + + +Solver programs +--------------- + +The available solver programs for the Dirac equation are + +- Conjugate gradient algorithm for the normal Dirac equation. + Programs: tmcg() and tmcgeo() [see tmcg.c]. + Symbol: CGNE. + +- Multi-shift conjugate gradient algorithm for the normal even-odd + preconditioned Dirac equation. + Program: tmcgm() [see tmcgm.c]. + Symbol: MSCG. + +- SAP-preconditioned GCR algorithm for the Dirac equation. + Program: sap_gcr() [see sap_gcr.c]. + Symbol: SAP_GCR. + +- Deflated SAP-preconditioned GCR algorithm for the Dirac equation. + Programs: dfl_sap_gcr() and dfl_sap_gcr2() [see dfl_sap_gcr.c]. + Symbol: DFL_SAP_GCR. + +A particular solver is thus described by the solver symbol, the values of the +program arguments and further parameters (the bare quark mass, the parameters +of the SAP preconditioner and those related to the deflation subspace). + + +Chronological solver +-------------------- + +The force programs force1() and force2() can be instructed to propagate the +solutions of the Dirac equation along the molecular-dynamics trajectories. The +stacks of previous solutions are handled by the module chrono.c. + + +Action, force and solver data base +---------------------------------- + +The parameters of the actions, forces and solvers used in a simulation are +stored in a data base. At the beginning of the simulation program, the list of +all actions, forces and solvers must be defined. These data are then entered +in the data base using the utility programs in the flags module directory (see +action_parms.c, force_parms.c and solver_parms.c). + + +Rational function data base +--------------------------- + +For the charm and the strange quark, a version of the RHMC algorithm is used. 
+The basic rational functions are [n,n] Zolotarev rational functions, but in +the simulation programs it is advantageous to split these into a few rational +functions of lower degree and to use a pseudo-fermion action for each of them. + +The data base for rational functions consists of two parts, one for the +parameters of the basic Zolotarev rational functions (flags/rat_parms.c) and +the other for the rational functions that occur in the pseudo-fermion actions +(see ratfcts/ratfcts.c). diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force0.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force0.c new file mode 100644 index 0000000000000000000000000000000000000000..6e84e5d199c36447e93e9d82bc480c9f29bd3535 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force0.c @@ -0,0 +1,712 @@ + +/******************************************************************************* +* +* File force0.c +* +* Copyright (C) 2005, 2009-2014 Martin Luescher, John Bulava +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Action of the double-precision gauge field and associated force. +* +* The externally accessible functions are +* +* void plaq_frc(void) +* Computes the force deriving from the Wilson plaquette action, +* omitting the prefactor 1/g0^2, and assigns the result to the MD +* force field. In the case of open, SF or open-SF boundary conditions, +* the boundary improvement coefficients are set to their tree-level +* value independently of the values stored in the parameter data base. +* +* void force0(double c) +* Computes the force deriving from the gauge action, including the +* prefactor 1/g0^2, multiplies the calculated force by c and assigns +* the result to the MD force field. The coupling g0 and the other +* parameters of the gauge action are retrieved from the parameter +* data base. 
+* +* double action0(int icom) +* Computes the local part of the gauge action including the prefactor +* 1/g0^2. The coupling g0 and the other parameters of the action are +* retrieved from the parameter data base. The program returns the sum +* of the local parts of the action over all MPI processes if icom=1 +* and otherwise just the local part. +* +* Notes: +* +* See the notes doc/gauge_action.pdf for the definition of the gauge action +* and a description of the computation of the force deriving from it. The +* molecular-dynamics (MD) force field is the one returned by the program +* mdflds() (see mdflds/mdflds.c). +* +* On the links in the local lattice where the static link variables reside, +* the programs plaq_frc() and force0() set the force field to zero. +* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define FORCE0_C + +#include +#include +#include +#include "mpi.h" +#include "flags.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define MAX_LEVELS 8 +#define BLK_LENGTH 8 + +static const int plns[6][2]={{0,1},{0,2},{0,3},{2,3},{3,1},{1,2}}; +static int nfc[8],ofs[8],hofs[8],cnt[MAX_LEVELS],init=0; +static double smx[MAX_LEVELS]; +static su3_dble *udb,*hdb; +static su3_dble wd[3],vd[4] ALIGNED16; +static su3_alg_dble X ALIGNED16; + + +static void set_ofs(void) +{ + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=VOLUME; + ofs[1]=ofs[0]+(FACE0/2); + ofs[2]=ofs[1]+(FACE0/2); + ofs[3]=ofs[2]+(FACE1/2); + ofs[4]=ofs[3]+(FACE1/2); + ofs[5]=ofs[4]+(FACE2/2); + ofs[6]=ofs[5]+(FACE2/2); + ofs[7]=ofs[6]+(FACE3/2); + + hofs[0]=0; + hofs[1]=hofs[0]+3*FACE0; + 
hofs[2]=hofs[1]+3*FACE0; + hofs[3]=hofs[2]+3*FACE1; + hofs[4]=hofs[3]+3*FACE1; + hofs[5]=hofs[4]+3*FACE2; + hofs[6]=hofs[5]+3*FACE2; + hofs[7]=hofs[6]+3*FACE3; + + init=1; +} + + +static void set_staples(int n,int ix,int ia) +{ + int mu,nu,ifc; + int iy,ib,ip[4]; + + mu=plns[n][0]; + nu=plns[n][1]; + + if (!ia) + { + iy=idn[ix][nu]; + + if (iynu)]; + } + } + + iy=iup[ix][mu]; + + if (iymu)]; + } + + if (!ia) + { + iy=idn[ix][mu]; + + if (iymu)]; + } + } + + iy=iup[ix][nu]; + + if (iynu)]; + } +} + + +void plaq_frc(void) +{ + int bc,n,ix,t,ip[4]; + double r; + su3_alg_dble *fdb; + mdflds_t *mdfs; + + if (query_flags(UDBUF_UP2DATE)!=1) + copy_bnd_ud(); + + bc=bc_type(); + udb=udfld(); + mdfs=mdflds(); + fdb=(*mdfs).frc; + set_frc2zero(); + + for (ix=0;ix0)||(bc!=1)) + { + _su3_alg_sub_assign(*(fdb+ip[2]),X); + } + } + } + + if ((t>0)||(bc!=1)) + { + r=1.0; + + if (((t==0)&&(bc!=3))||((t==(N0-1))&&(bc==0))) + r=0.5; + + for (n=3;n<6;n++) + { + plaq_uidx(n,ix,ip); + + su3xsu3dag(udb+ip[1],udb+ip[3],wd); + su3dagxsu3(udb+ip[2],udb+ip[0],wd+1); + prod2su3alg(wd,wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r,X); + prod2su3alg(wd+1,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r,X); + + su3xsu3dag(wd,udb+ip[2],wd+1); + prod2su3alg(udb+ip[0],wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[0]),r,X); + _su3_alg_mul_sub_assign(*(fdb+ip[2]),r,X); + } + } + } + + add_bnd_frc(); +} + + +void force0(double c) +{ + int bc,n,ix,t,ip[4]; + double c0,c1,*cG; + double r0,r1; + su3_alg_dble *fdb; + mdflds_t *mdfs; + lat_parms_t lat; + bc_parms_t bcp; + + lat=lat_parms(); + c*=(lat.beta/6.0); + c0=lat.c0; + c1=lat.c1; + + bcp=bc_parms(); + bc=bcp.type; + cG=bcp.cG; + + if (query_flags(UDBUF_UP2DATE)!=1) + copy_bnd_ud(); + + udb=udfld(); + mdfs=mdflds(); + fdb=(*mdfs).frc; + set_frc2zero(); + + if (c0==1.0) + hdb=NULL; + else + { + if (init==0) + set_ofs(); + + if (query_flags(BSTAP_UP2DATE)!=1) + set_bstap(); + hdb=bstap(); + } + + for (ix=0;ix0)||(bc!=1)) + { + 
_su3_alg_mul_sub_assign(*(fdb+ip[2]),r0,X); + } + + if (c0!=1.0) + { + set_staples(n,ix,0); + + if ((t==0)&&(bc==1)) + { + su3xsu3(wd+1,udb+ip[0],wd+2); + su3xsu3(udb+ip[0],wd+2,wd+2); + + prod2su3alg(wd+1,wd+2,&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r1,X); + + prod2su3alg(wd+2,wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[0]),r1,X); + + su3dagxsu3(udb+ip[2],wd+2,wd+2); + + prod2su3alg(wd+2,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r1,X); + } + + if ((t==(N0-1))&&(bc!=3)) + { + su3xsu3(wd+1,udb+ip[0],wd+2); + su3xsu3(udb+ip[0],wd+2,wd+2); + + prod2su3alg(wd+2,wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[0]),r1,X); + _su3_alg_mul_sub_assign(*(fdb+ip[2]),r1,X); + + su3dagxsu3(udb+ip[2],wd+2,wd+2); + + prod2su3alg(wd+2,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r1,X); + } + + if ((t<(N0-1))||(bc==3)) + { + prod2su3alg(wd+1,vd,&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r1,X); + } + + if ((t>0)||(bc!=1)) + { + prod2su3alg(vd,wd+1,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[2]),r1,X); + } + + su3dagxsu3(udb+ip[2],vd,wd+1); + prod2su3alg(wd+1,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r1,X); + + if ((t<(N0-2))||((t==(N0-2))&&(bc!=0))||(bc==3)) + { + su3xsu3dag(udb+ip[3],vd+1,wd+1); + su3xsu3dag(wd+1,udb+ip[0],wd+2); + prod2su3alg(udb+ip[2],wd+2,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[0]),r1,X); + + if ((t>0)||(bc!=1)) + { + _su3_alg_mul_add_assign(*(fdb+ip[2]),r1,X); + } + + prod2su3alg(wd+2,udb+ip[2],&X); + _su3_alg_mul_add_assign(*(fdb+ip[3]),r1,X); + } + + if ((t>0)||(bc==3)) + { + su3xsu3dag(wd,vd+2,wd+1); + prod2su3alg(udb+ip[0],wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[0]),r1,X); + + if ((t<(N0-1))||(bc==3)) + { + prod2su3alg(wd+1,udb+ip[0],&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r1,X); + } + + su3dagxsu3(vd+2,udb+ip[0],wd+1); + prod2su3alg(wd+1,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r1,X); + } + + su3xsu3dag(udb+ip[1],vd+3,wd); + su3xsu3dag(wd,udb+ip[2],wd+1); + prod2su3alg(udb+ip[0],wd+1,&X); + 
_su3_alg_mul_add_assign(*(fdb+ip[0]),r1,X); + + if ((t>0)||(bc!=1)) + { + _su3_alg_mul_sub_assign(*(fdb+ip[2]),r1,X); + } + + if ((t<(N0-1))||(bc==3)) + { + prod2su3alg(wd+1,udb+ip[0],&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r1,X); + } + } + } + } + + if ((t>0)||(bc!=1)) + { + r0=c*c0; + r1=c*c1; + + if ((t==0)&&(bc!=3)) + { + r0*=(0.5*cG[0]); + r1*=(0.5*cG[0]); + } + else if ((t==(N0-1))&&(bc==0)) + { + r0*=(0.5*cG[1]); + r1*=(0.5*cG[1]); + } + + for (n=3;n<6;n++) + { + plaq_uidx(n,ix,ip); + + su3xsu3dag(udb+ip[1],udb+ip[3],wd); + su3dagxsu3(udb+ip[2],udb+ip[0],wd+1); + prod2su3alg(wd,wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r0,X); + + prod2su3alg(wd+1,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r0,X); + + su3xsu3dag(wd,udb+ip[2],wd+1); + prod2su3alg(udb+ip[0],wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[0]),r0,X); + _su3_alg_mul_sub_assign(*(fdb+ip[2]),r0,X); + + if (c0!=1.0) + { + set_staples(n,ix,0); + + prod2su3alg(wd+1,vd,&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r1,X); + + prod2su3alg(vd,wd+1,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[2]),r1,X); + + su3dagxsu3(udb+ip[2],vd,wd+1); + prod2su3alg(wd+1,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r1,X); + + su3xsu3dag(udb+ip[3],vd+1,wd+1); + su3xsu3dag(wd+1,udb+ip[0],wd+2); + prod2su3alg(udb+ip[2],wd+2,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[0]),r1,X); + _su3_alg_mul_add_assign(*(fdb+ip[2]),r1,X); + + prod2su3alg(wd+2,udb+ip[2],&X); + _su3_alg_mul_add_assign(*(fdb+ip[3]),r1,X); + + su3xsu3dag(wd,vd+2,wd+1); + prod2su3alg(udb+ip[0],wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[0]),r1,X); + + prod2su3alg(wd+1,udb+ip[0],&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r1,X); + + su3dagxsu3(vd+2,udb+ip[0],wd+1); + prod2su3alg(wd+1,wd,&X); + _su3_alg_mul_sub_assign(*(fdb+ip[3]),r1,X); + + su3xsu3dag(udb+ip[1],vd+3,wd); + su3xsu3dag(wd,udb+ip[2],wd+1); + prod2su3alg(udb+ip[0],wd+1,&X); + _su3_alg_mul_add_assign(*(fdb+ip[0]),r1,X); + _su3_alg_mul_sub_assign(*(fdb+ip[2]),r1,X); + + 
prod2su3alg(wd+1,udb+ip[0],&X); + _su3_alg_mul_add_assign(*(fdb+ip[1]),r1,X); + } + } + } + } + + add_bnd_frc(); +} + + +static void wloops(int n,int ix,int t,double c0,double *trU) +{ + int bc,ip[4]; + + bc=bc_type(); + plaq_uidx(n,ix,ip); + + trU[0]=0.0; + trU[1]=0.0; + trU[2]=0.0; + trU[3]=0.0; + + if ((n>=3)||(t<(N0-1))||(bc!=0)) + { + su3dagxsu3(udb+ip[2],udb+ip[0],wd); + su3xsu3dag(udb+ip[1],udb+ip[3],wd+1); + cm3x3_retr(wd,wd+1,trU); + trU[0]=3.0-trU[0]; + } + + if (c0!=1.0) + { + set_staples(n,ix,1); + + if ((n<3)&&(((t==0)&&(bc==1))|| + ((t==(N0-1))&&((bc==1)||(bc==2))))) + { + su3xsu3(wd,wd+1,wd+1); + cm3x3_retr(wd+1,wd+1,trU+3); + trU[3]=3.0-trU[3]; + } + + if ((n>=3)||(t<(N0-1))||(bc!=0)) + { + su3xsu3dag(udb+ip[1],vd+3,wd+1); + cm3x3_retr(wd,wd+1,trU+1); + trU[1]=3.0-trU[1]; + } + + if ((n>=3)||(t<(N0-2))||((t==(N0-2))&&(bc!=0))||(bc==3)) + { + su3xsu3dag(vd+1,udb+ip[3],wd+1); + cm3x3_retr(wd,wd+1,trU+2); + trU[2]=3.0-trU[2]; + } + } +} + + +double action0(int icom) +{ + int bc,n,ix,t; + double c0,c1,*cG; + double r0,r1,trU[4],act; + lat_parms_t lat; + bc_parms_t bcp; + + lat=lat_parms(); + c0=lat.c0; + c1=lat.c1; + + bcp=bc_parms(); + bc=bcp.type; + cG=bcp.cG; + + if (query_flags(UDBUF_UP2DATE)!=1) + copy_bnd_ud(); + udb=udfld(); + + if (c0==1.0) + hdb=NULL; + else + { + if (init==0) + set_ofs(); + + if (query_flags(BSTAP_UP2DATE)!=1) + set_bstap(); + hdb=bstap(); + } + + for (n=0;n0)||(bc!=1)) + { + r0=c0; + r1=c1; + + if ((t==0)&&(bc!=3)) + { + r0*=(0.5*cG[0]); + r1*=(0.5*cG[0]); + } + else if ((t==(N0-1))&&(bc==0)) + { + r0*=(0.5*cG[1]); + r1*=(0.5*cG[1]); + } + + for (n=3;n<6;n++) + { + wloops(n,ix,t,c0,trU); + act+=(r0*trU[0]+r1*(trU[1]+trU[2])); + } + } + + cnt[0]+=1; + smx[0]+=act; + + for (n=1;(cnt[n-1]>=BLK_LENGTH)&&(n0) 2+2*(icr>0) 2+2*(icr>0) +* action1() 1 1 1 +* +* (these figures do not include the workspace required by the solvers). 
+* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define FORCE1_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "mdflds.h" +#include "sw_term.h" +#include "sflds.h" +#include "dirac.h" +#include "linalg.h" +#include "sap.h" +#include "dfl.h" +#include "update.h" +#include "forces.h" +#include "global.h" + + +double setpf1(double mu,int ipf,int icom) +{ + double act; + spinor_dble **wsd,*phi; + mdflds_t *mdfs; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg==1) + set_tm_parms(0); + + wsd=reserve_wsd(1); + random_sd(VOLUME,wsd[0],1.0); + bnd_sd2zero(ALL_PTS,wsd[0]); + act=norm_square_dble(VOLUME,icom,wsd[0]); + + sw_term(NO_PTS); + + mdfs=mdflds(); + phi=(*mdfs).pf[ipf]; + Dw_dble(mu,wsd[0],phi); + mulg5_dble(VOLUME,phi); + release_wsd(); + + return act; +} + + +void force1(double mu,int ipf,int isp,int icr,double c,int *status) +{ + int l; + double res0,res1; + spinor_dble *phi,*chi,*psi,**wsd; + spinor_dble *rho,*eta,**rsd; + mdflds_t *mdfs; + solver_parms_t sp; + sap_parms_t sap; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg==1) + set_tm_parms(0); + + mdfs=mdflds(); + sp=solver_parms(isp); + sw_term(NO_PTS); + + wsd=reserve_wsd(2); + phi=(*mdfs).pf[ipf]; + psi=wsd[0]; + chi=wsd[1]; + + if (sp.solver==CGNE) + { + if (get_chrono(icr,chi)) + { + rsd=reserve_wsd(1); + rho=rsd[0]; + + Dw_dble(-mu,chi,psi); + mulg5_dble(VOLUME,psi); + Dw_dble(mu,psi,rho); + mulg5_dble(VOLUME,rho); + mulr_spinor_add_dble(VOLUME,rho,phi,-1.0); + + res0=norm_square_dble(VOLUME,1,phi); + res1=norm_square_dble(VOLUME,1,rho); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + if (res1>sp.res) + { + tmcg(sp.nmx,sp.res/res1,mu,rho,psi,status); + mulr_spinor_add_dble(VOLUME,chi,psi,-1.0); + } + else + status[0]=0; + } + else + 
tmcg(sp.nmx,sp.res,mu,phi,chi,status); + + release_wsd(); + } + else + tmcg(sp.nmx,sp.res,mu,phi,chi,status); + + error_root(status[0]<0,1,"force1 [force1.c]", + "CGNE solver failed (mu = %.4e, parameter set no %d, " + "status = %d)",mu,isp,status[0]); + if (icr) + add_chrono(icr,chi); + Dw_dble(-mu,chi,psi); + mulg5_dble(VOLUME,psi); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + if (get_chrono(icr,chi)) + { + rsd=reserve_wsd(2); + rho=rsd[0]; + eta=rsd[1]; + + Dw_dble(-mu,chi,psi); + mulg5_dble(VOLUME,psi); + Dw_dble(mu,psi,rho); + mulg5_dble(VOLUME,rho); + mulr_spinor_add_dble(VOLUME,rho,phi,-1.0); + + res0=norm_square_dble(VOLUME,1,phi); + res1=norm_square_dble(VOLUME,1,rho); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + if (res1>sp.res) + { + mulg5_dble(VOLUME,rho); + sap_gcr(sp.nkv,sp.nmx,sp.res/res1,mu,rho,eta,status); + mulr_spinor_add_dble(VOLUME,psi,eta,-1.0); + + res0=norm_square_dble(VOLUME,1,psi); + res1=norm_square_dble(VOLUME,1,eta); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + if (res1>sp.res) + { + mulg5_dble(VOLUME,eta); + sap_gcr(sp.nkv,sp.nmx,sp.res/res1,-mu,eta,rho,status+1); + mulr_spinor_add_dble(VOLUME,chi,rho,-1.0); + } + else + status[1]=0; + } + else + { + mulg5_dble(VOLUME,psi); + sap_gcr(sp.nkv,sp.nmx,sp.res,-mu,psi,chi,status+1); + mulg5_dble(VOLUME,psi); + } + } + else + { + status[0]=0; + status[1]=0; + } + } + else + { + mulg5_dble(VOLUME,phi); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + mulg5_dble(VOLUME,phi); + mulg5_dble(VOLUME,psi); + sap_gcr(sp.nkv,sp.nmx,sp.res,-mu,psi,chi,status+1); + mulg5_dble(VOLUME,psi); + } + + release_wsd(); + } + else + { + mulg5_dble(VOLUME,phi); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + mulg5_dble(VOLUME,phi); + mulg5_dble(VOLUME,psi); + sap_gcr(sp.nkv,sp.nmx,sp.res,-mu,psi,chi,status+1); + mulg5_dble(VOLUME,psi); + } + + error_root((status[0]<0)||(status[1]<0),1,"force1 [force1.c]", + "SAP_GCR solver failed (mu = 
%.4e, parameter set no %d, " + "status = %d;%d)",mu,isp,status[0],status[1]); + if (icr) + add_chrono(icr,chi); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + if (get_chrono(icr,chi)) + { + rsd=reserve_wsd(2); + rho=rsd[0]; + eta=rsd[1]; + + Dw_dble(-mu,chi,psi); + mulg5_dble(VOLUME,psi); + Dw_dble(mu,psi,rho); + mulg5_dble(VOLUME,rho); + mulr_spinor_add_dble(VOLUME,rho,phi,-1.0); + + res0=norm_square_dble(VOLUME,1,phi); + res1=norm_square_dble(VOLUME,1,rho); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + if (res1>sp.res) + { + mulg5_dble(VOLUME,rho); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res/res1,mu,rho,eta,status); + mulr_spinor_add_dble(VOLUME,psi,eta,-1.0); + + res0=norm_square_dble(VOLUME,1,psi); + res1=norm_square_dble(VOLUME,1,eta); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + if (res1>sp.res) + { + mulg5_dble(VOLUME,eta); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res/res1,-mu,eta,rho, + status+3); + mulr_spinor_add_dble(VOLUME,chi,rho,-1.0); + } + else + { + for (l=3;l<6;l++) + status[l]=0; + } + } + else + { + mulg5_dble(VOLUME,psi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,-mu,psi,chi,status+3); + mulg5_dble(VOLUME,psi); + } + } + else + { + for (l=0;l<6;l++) + status[l]=0; + } + } + else + { + mulg5_dble(VOLUME,phi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + mulg5_dble(VOLUME,phi); + mulg5_dble(VOLUME,psi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,-mu,psi,chi,status+3); + mulg5_dble(VOLUME,psi); + } + + release_wsd(); + } + else + { + mulg5_dble(VOLUME,phi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + mulg5_dble(VOLUME,phi); + mulg5_dble(VOLUME,psi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,-mu,psi,chi,status+3); + mulg5_dble(VOLUME,psi); + } + + error_root((status[0]<0)||(status[1]<0)||(status[3]<0)||(status[4]<0),1, + "force1 [force1.c]","DFL_SAP_GCR solver failed " + "(mu = %.4e, parameter set no %d, status = %d,%d,%d;%d,%d,%d)", + mu,isp,status[0],status[1],status[2], + 
status[3],status[4],status[5]); + + if (icr) + add_chrono(icr,chi); + } + else + error_root(1,1,"force1 [force1.c]","Unknown solver"); + + set_xt2zero(); + add_prod2xt(1.0,chi,psi); + sw_frc(c); + + set_xv2zero(); + add_prod2xv(1.0,chi,psi); + hop_frc(c); + + release_wsd(); +} + + +double action1(double mu,int ipf,int isp,int icom,int *status) +{ + double act; + spinor_dble *phi,*psi,**wsd,**rsd; + mdflds_t *mdfs; + solver_parms_t sp; + sap_parms_t sap; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg==1) + set_tm_parms(0); + + mdfs=mdflds(); + sp=solver_parms(isp); + + wsd=reserve_wsd(1); + psi=wsd[0]; + phi=(*mdfs).pf[ipf]; + + if (sp.solver==CGNE) + { + tmcg(sp.nmx,sp.res,mu,phi,psi,status); + + error_root(status[0]<0,1,"action1 [force1.c]", + "CGNE solver failed (mu = %.4e, parameter set no %d, " + "status = %d)",mu,isp,status[0]); + + rsd=reserve_wsd(1); + Dw_dble(-mu,psi,rsd[0]); + act=norm_square_dble(VOLUME,icom,rsd[0]); + release_wsd(); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME,phi); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + mulg5_dble(VOLUME,phi); + + error_root(status[0]<0,1,"action1 [force1.c]", + "SAP_GCR solver failed (mu = %.4e, parameter set no %d, " + "status = %d)",mu,isp,status[0]); + + act=norm_square_dble(VOLUME,icom,psi); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME,phi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + mulg5_dble(VOLUME,phi); + + error_root((status[0]<0)||(status[1]<0),1, + "action1 [force1.c]","DFL_SAP_GCR solver failed " + "(mu = %.4e, parameter set no %d, status = %d,%d,%d)", + mu,isp,status[0],status[1],status[2]); + + act=norm_square_dble(VOLUME,icom,psi); + } + else + { + error_root(1,1,"action1 [force1.c]","Unknown solver"); + act=0.0; + } + + release_wsd(); + + return act; +} diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force2.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force2.c new file mode 100644 index 0000000000000000000000000000000000000000..53c37312534d5127cec2856b9ee08dacabd7fb8f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force2.c @@ -0,0 +1,218 @@ + +/******************************************************************************* +* +* File force2.c +* +* Copyright (C) 2011-2013 Stefan Schaefer, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Hasenbusch twisted_mass pseudo-fermion action and force. +* +* The externally accessible functions are +* +* double setpf2(double mu0,double mu1,int ipf,int isp,int icom, +* int *status) +* Generates a pseudo-fermion field phi with probability proportional +* to exp(-Spf) and returns the action Spf-(phi,phi) (see the notes). +* +* void force2(double mu0,double mu1,int ipf,int isp,int icr,double c, +* int *status) +* Computes the force deriving from the action Spf (see the notes). +* The calculated force is multiplied by c and added to the molecular- +* dynamics force field. +* +* double action2(double mu0,double mu1,int ipf,int isp,int icom, +* int *status) +* Returns the action Spf-(phi,phi) (see the notes). +* +* Notes: +* +* The pseudo-fermion action Spf is given by +* +* Spf=(phi,(Dw^dag*Dw+mu1^2)(Dw^dag*Dw+mu0^2)^(-1)*phi) +* +* =(phi,phi)+(mu1^2-mu0^2)*(phi,(Dw^dag*Dw+mu0^2)^(-1)*phi) +* +* where Dw denotes the (improved) Wilson-Dirac operator and phi the pseudo- +* fermion field. +* +* The common parameters of the programs in this module are: +* +* mu0,mu1 Twisted mass parameters in Spf. +* +* ipf Index of the pseudo-fermion field phi in the +* structure returned by mdflds() [mdflds.c].
+* +* isp Index of the solver parameter set that describes +* the solver to be used for the solution of the +* Dirac equation. +* +* icom The action returned by the programs setpf2() and +* action2() is summed over all MPI processes if icom=1. +* Otherwise the local part of the action is returned. +* +* status Status values returned by the solver used for the +* solution of the Dirac equation. +* +* The supported solvers are CGNE, SAP_GCR and DFL_SAP_GCR. Depending +* on the program and the solver, the number of status variables varies +* and is given by: +* +* CGNE SAP_GCR DFL_SAP_GCR +* setpf2() 1 1 3 +* force2() 1 2 6 +* action2() 1 1 3 +* +* The solver used in the case of setpf2() is for the Dirac equation with +* twisted mass mu1, while force2() and action2() use the solver for the +* equation with twisted mass mu0. Different solvers may be needed in the +* two cases if mu1>>mu0, for example. +* +* Note that, in force2(), the GCR solvers solve the Dirac equations twice. +* In these cases, the program writes the status values one after the other +* to the array. The bare quark mass m0 is the one last set by sw_parms() +* [flags/lat_parms.c] and it is taken for granted that the parameters of +* the solver have been set by set_solver_parms() [flags/solver_parms.c]. +* +* The program force2() attempts to propagate the solutions of the Dirac +* equation along the molecular-dynamics trajectories, using the field +* stack number icr (no fields are propagated if icr=0). If this feature +* is used, the program setup_chrono() [update/chrono.c] must be called +* before force2() is called for the first time. +* +* The required workspaces of double-precision spinor fields are +* +* CGNE SAP_GCR DFL_SAP_GCR +* setpf2() 1 1 1 +* force2() 2+(icr>0) 2+2*(icr>0) 2+2*(icr>0) +* action2() 1 1 1 +* +* (these figures do not include the workspace required by the solvers).
+* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define FORCE2_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "mdflds.h" +#include "sw_term.h" +#include "sflds.h" +#include "dirac.h" +#include "linalg.h" +#include "sap.h" +#include "dfl.h" +#include "forces.h" +#include "global.h" + + +double setpf2(double mu0,double mu1,int ipf,int isp,int icom,int *status) +{ + double act; + complex_dble z; + spinor_dble **wsd,**rsd; + spinor_dble *phi,*psi,*chi; + mdflds_t *mdfs; + solver_parms_t sp; + sap_parms_t sap; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg==1) + set_tm_parms(0); + + mdfs=mdflds(); + phi=(*mdfs).pf[ipf]; + wsd=reserve_wsd(1); + psi=wsd[0]; + + random_sd(VOLUME,phi,1.0); + bnd_sd2zero(ALL_PTS,phi); + sp=solver_parms(isp); + + if (sp.solver==CGNE) + { + tmcg(sp.nmx,sp.res,mu1,phi,psi,status); + + error_root(status[0]<0,1,"setpf2 [force2.c]","CGNE solver failed " + "(mu = %.4e, parameter set no %d, status = %d)", + mu1,isp,status[0]); + + rsd=reserve_wsd(1); + chi=rsd[0]; + assign_sd2sd(VOLUME,psi,chi); + Dw_dble(-mu1,chi,psi); + mulg5_dble(VOLUME,psi); + release_wsd(); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME,phi); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu1,phi,psi,status); + mulg5_dble(VOLUME,phi); + + error_root(status[0]<0,1,"setpf2 [force2.c]","SAP_GCR solver failed " + "(mu = %.4e, parameter set no %d, status = %d)", + mu1,isp,status[0]); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME,phi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu1,phi,psi,status); + mulg5_dble(VOLUME,phi); + + error_root((status[0]<0)||(status[1]<0),1, + "setpf2 
[force2.c]","DFL_SAP_GCR solver failed " + "(mu = %.4e, parameter set no %d, status = %d,%d,%d)", + mu1,isp,status[0],status[1],status[2]); + } + else + error_root(1,1,"setpf2 [force2.c]","Unknown solver"); + + z.re=0.0; + z.im=mu0-mu1; + mulc_spinor_add_dble(VOLUME,phi,psi,z); + act=(mu1*mu1-mu0*mu0)*norm_square_dble(VOLUME,icom,psi); + release_wsd(); + + return act; +} + + +void force2(double mu0,double mu1,int ipf,int isp,int icr, + double c,int *status) +{ + double dmu2; + + dmu2=mu1*mu1-mu0*mu0; + + force1(mu0,ipf,isp,icr,dmu2*c,status); +} + + +double action2(double mu0,double mu1,int ipf,int isp,int icom,int *status) +{ + double dmu2,act; + + dmu2=mu1*mu1-mu0*mu0; + act=dmu2*action1(mu0,ipf,isp,icom,status); + + return act; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force3.c new file mode 100644 index 0000000000000000000000000000000000000000..d39d2c7f743fdfcc02676d44edbe3873fb2cce98 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force3.c @@ -0,0 +1,674 @@ + +/******************************************************************************* +* +* File force3.c +* +* Copyright (C) 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Rational function forces. +* +* The externally accessible functions are +* +* double setpf3(int *irat,int ipf,int isw,int isp,int icom,int *status) +* Generates a pseudo-fermion field phi with probability proportional +* to exp(-Spf) and returns the action Spf+Sdet-(phi,phi) if isw=1 or +* Spf-(phi,phi) if isw!=1 (see the notes). +* +* void force3(int *irat,int ipf,int isw,int isp,double c,int *status) +* Computes the force deriving from the action Spf+Sdet if isw=1 or +* Spf if isw!=1 (see the notes). 
The calculated force is multiplied +* by c and added to the molecular-dynamics force field. +* +* double action3(int *irat,int ipf,int isw,int isp,int icom,int *status) +* Returns the action Spf+Sdet-(phi,phi) if isw=1 or Spf-(phi,phi) if +* isw!=1 (see the notes). +* +* Notes: +* +* Simulations including the charm and/or the strange quark are based on +* a version of the RHMC algorithm. See the notes "Charm and strange quark +* in openQCD simulations" (file doc/rhmc.pdf). +* +* The pseudo-fermion action Spf is given by +* +* Spf=(phi,P_{k,l}*phi), +* +* where P_{k,l} is the fraction of a Zolotarev rational function, which +* is defined by the parameters: +* +* irat[0] Index of the Zolotarev rational function in the +* parameter data base. +* +* irat[1] Lower end k of the selected coefficient range. +* +* irat[2] Upper end l of the selected coefficient range. +* +* See ratfcts/ratfcts.c for further explanations. The inclusion of the +* "small quark determinant" amounts to adding the action +* +* Sdet=-ln{det(1e+Doo)}+constant +* +* to the molecular-dynamics Hamilton function, where 1e is the projector +* to the quark fields that vanish on the odd lattice sites and Doo the +* odd-odd component of the Dirac operator (the constant is adjusted so +* as to reduce the significance losses when the action differences are +* computed at the end of the molecular-dynamics trajectories). +* +* The other parameters of the programs in this module are: +* +* ipf Index of the pseudo-fermion field phi in the +* structure returned by mdflds() [mdflds.c]. +* +* isp Index of the solver parameter set that describes +* the solver to be used for the solution of the +* Dirac equation. +* +* icom The action returned by the programs setpf3() and +* action3() is summed over all MPI processes if icom=1. +* Otherwise the local part of the action is returned. 
+* +* status Array of the average status values returned by the +* solver used for the solution of the Dirac equation +* (in the case of the DFL_SAP_GCR solver, status[2] +* and status[5] are not averaged). +* +* The supported solvers are MSCG, SAP_GCR and DFL_SAP_GCR. Depending +* on the program and the solver, the number of status variables varies +* and is given by: +* +* MSCG SAP_GCR DFL_SAP_GCR +* setpf3() 1 1 3 +* force3() 1 2 6 +* action3() 1 1 3 +* +* Note that, in force3(), the GCR solvers solve the Dirac equations twice. +* In these cases, the program writes the status values one after the other +* to the array. The bare quark mass m0 is the one last set by sw_parms() +* [flags/lat_parms.c] and it is taken for granted that the parameters of +* the solver have been set by set_solver_parms() [flags/solver_parms.c]. +* +* The required workspaces of double-precision spinor fields are +* +* MSCG SAP_GCR DFL_SAP_GCR +* setpf3() np 2 2 +* force3() np 3 3 +* action3() np 1 1 +* +* where np is the number of poles of P_{k,l} (these figures do not include +* the workspace required by the solvers). +* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. 
+* +*******************************************************************************/ + +#define FORCE3_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "mdflds.h" +#include "sw_term.h" +#include "sflds.h" +#include "dirac.h" +#include "linalg.h" +#include "sap.h" +#include "dfl.h" +#include "ratfcts.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define MAX_LEVELS 8 +#define BLK_LENGTH 8 + +static int cnt[MAX_LEVELS],nps=0; +static double smx[MAX_LEVELS],*rs; + + +static void set_res(int np,double res) +{ + int k; + + if (np>nps) + { + if (nps>0) + free(rs); + + rs=malloc(np*sizeof(*rs)); + error(rs==NULL,1,"set_res [force3.c]", + "Unable to allocate auxiliary array"); + } + + for (k=0;k1.0) + c=pow(4.0+swp.m0,-6.0); + else + c=1.0; + + for (n=0;nVOLUME) + iy=VOLUME; + + for (;ix0)||(bc==3))&&((t<(N0-1))||(bc!=0))) + { + z=det_pauli_dble(0.0,m); + + if (z.re>0.0) + p*=(c*z.re); + else + ie=1; + + z=det_pauli_dble(0.0,m+1); + + if (z.re>0.0) + p*=(c*z.re); + else + ie=1; + } + + m+=2; + } + + if (p>0.0) + { + cnt[0]+=1; + smx[0]-=log(p); + + for (n=1;(cnt[n-1]>=BLK_LENGTH)&&(n0) 2+2*(icr>0) 2+2*(icr>0) +* action4() 1 1 1 +* +* (these figures do not include the workspace required by the solvers). +* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. 
+* +*******************************************************************************/ + +#define FORCE4_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "mdflds.h" +#include "sw_term.h" +#include "sflds.h" +#include "dirac.h" +#include "linalg.h" +#include "sap.h" +#include "dfl.h" +#include "update.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define MAX_LEVELS 8 +#define BLK_LENGTH 8 + +static int cnt[MAX_LEVELS]; +static double smx[MAX_LEVELS]; + + +static double sdet(void) +{ + int bc,ix,iy,t,n,ie; + double c,p; + complex_dble z; + pauli_dble *m; + sw_parms_t swp; + + swp=sw_parms(); + + if ((4.0+swp.m0)>1.0) + c=pow(4.0+swp.m0,-6.0); + else + c=1.0; + + for (n=0;nVOLUME) + iy=VOLUME; + + for (;ix0)||(bc==3))&&((t<(N0-1))||(bc!=0))) + { + z=det_pauli_dble(0.0,m); + + if (z.re>0.0) + p*=(c*z.re); + else + ie=1; + + z=det_pauli_dble(0.0,m+1); + + if (z.re>0.0) + p*=(c*z.re); + else + ie=1; + } + + m+=2; + } + + if (p!=0.0) + { + cnt[0]+=1; + smx[0]-=2.0*log(p); + + for (n=1;(cnt[n-1]>=BLK_LENGTH)&&(nsp.res) + { + tmcgeo(sp.nmx,sp.res/res1,mu,rho,psi,status); + mulr_spinor_add_dble(VOLUME/2,chi,psi,-1.0); + } + else + status[0]=0; + } + else + tmcgeo(sp.nmx,sp.res,mu,phi,chi,status); + + release_wsd(); + } + else + tmcgeo(sp.nmx,sp.res,mu,phi,chi,status); + + error_root(status[0]<0,1,"force4 [force4.c]", + "CGNE solver failed (mu = %.4e, parameter set no %d, " + "status = %d)",mu,isp,status[0]); + + Dwoe_dble(chi,chi); + Dwoo_dble(0.0,chi,chi); + Dwhat_dble(-mu,chi,psi); + mulg5_dble(VOLUME/2,psi); + Dwoe_dble(psi,psi); + Dwoo_dble(0.0,psi,psi); + + if (icr) + add_chrono(icr,chi); + + add_prod2xt(1.0,chi,psi); + add_prod2xv(-1.0,chi,psi); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + if (get_chrono(icr,chi)) + { + rsd=reserve_wsd(2); + rho=rsd[0]; + eta=rsd[1]; + + 
ifail=sw_term(ODD_PTS); + error_root(ifail!=0,1,"force4 [force4.c]", + "Inversion of the SW term was not safe"); + + Dwhat_dble(-mu,chi,psi); + mulg5_dble(VOLUME/2,psi); + Dwhat_dble(mu,psi,rho); + mulg5_dble(VOLUME/2,rho); + mulr_spinor_add_dble(VOLUME/2,rho,phi,-1.0); + + res0=norm_square_dble(VOLUME/2,1,phi); + res1=norm_square_dble(VOLUME/2,1,rho); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + Dwoe_dble(chi,chi); + Dwoo_dble(0.0,chi,chi); + scale_dble(VOLUME/2,-1.0,chi+(VOLUME/2)); + + Dwoe_dble(psi,psi); + Dwoo_dble(0.0,psi,psi); + scale_dble(VOLUME/2,-1.0,psi+(VOLUME/2)); + + if (res1>sp.res) + { + mulg5_dble(VOLUME/2,rho); + set_sd2zero(VOLUME/2,rho+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res/res1,mu,rho,eta,status); + + mulr_spinor_add_dble(VOLUME,psi,eta,-1.0); + + res0=norm_square_dble(VOLUME/2,1,psi); + res1=norm_square_dble(VOLUME/2,1,eta); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + if (res1>sp.res) + { + mulg5_dble(VOLUME/2,eta); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res/res1,-mu,eta,rho,status+1); + + mulr_spinor_add_dble(VOLUME,chi,rho,-1.0); + } + else + status[1]=0; + } + else + { + assign_sd2sd(VOLUME/2,psi,eta); + mulg5_dble(VOLUME/2,eta); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res,-mu,eta,chi,status+1); + } + } + else + { + status[0]=0; + status[1]=0; + } + } + else + { + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + + mulg5_dble(VOLUME/2,phi); + assign_sd2sd(VOLUME/2,psi,eta); + mulg5_dble(VOLUME/2,eta); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res,-mu,eta,chi,status+1); + } + + release_wsd(); + } + else + { + rsd=reserve_wsd(1); + eta=rsd[0]; + + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + + mulg5_dble(VOLUME/2,phi); + assign_sd2sd(VOLUME/2,psi,eta); + mulg5_dble(VOLUME/2,eta); + 
set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res,-mu,eta,chi,status+1); + + release_wsd(); + } + + error_root((status[0]<0)||(status[1]<0),1,"force4 [force4.c]", + "SAP_GCR solver failed (mu = %.4e, parameter set no %d, " + "status = %d;%d)",mu,isp,status[0],status[1]); + + if (icr) + add_chrono(icr,chi); + + add_prod2xt(1.0,chi,psi); + add_prod2xv(1.0,chi,psi); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + if (get_chrono(icr,chi)) + { + rsd=reserve_wsd(2); + rho=rsd[0]; + eta=rsd[1]; + + ifail=sw_term(ODD_PTS); + error_root(ifail!=0,1,"force4 [force4.c]", + "Inversion of the SW term was not safe"); + + Dwhat_dble(-mu,chi,psi); + mulg5_dble(VOLUME/2,psi); + Dwhat_dble(mu,psi,rho); + mulg5_dble(VOLUME/2,rho); + mulr_spinor_add_dble(VOLUME/2,rho,phi,-1.0); + + res0=norm_square_dble(VOLUME/2,1,phi); + res1=norm_square_dble(VOLUME/2,1,rho); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + Dwoe_dble(chi,chi); + Dwoo_dble(0.0,chi,chi); + scale_dble(VOLUME/2,-1.0,chi+(VOLUME/2)); + + Dwoe_dble(psi,psi); + Dwoo_dble(0.0,psi,psi); + scale_dble(VOLUME/2,-1.0,psi+(VOLUME/2)); + + if (res1>sp.res) + { + mulg5_dble(VOLUME/2,rho); + set_sd2zero(VOLUME/2,rho+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res/res1,mu,rho,eta, + status); + + mulr_spinor_add_dble(VOLUME,psi,eta,-1.0); + + res0=norm_square_dble(VOLUME/2,1,psi); + res1=norm_square_dble(VOLUME/2,1,eta); + res1=sqrt(res1/res0); + + if (res1<1.0) + { + if (res1>sp.res) + { + mulg5_dble(VOLUME/2,eta); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res/res1,-mu,eta,rho, + status+3); + + mulr_spinor_add_dble(VOLUME,chi,rho,-1.0); + } + else + { + for (l=3;l<6;l++) + status[l]=0; + } + } + else + { + assign_sd2sd(VOLUME/2,psi,eta); + mulg5_dble(VOLUME/2,eta); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,-mu,eta,chi,status+3); + } + } + else + { + for (l=0;l<6;l++) + status[l]=0; 
+ } + } + else + { + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + + mulg5_dble(VOLUME/2,phi); + assign_sd2sd(VOLUME/2,psi,eta); + mulg5_dble(VOLUME/2,eta); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,-mu,eta,chi,status+3); + } + + release_wsd(); + } + else + { + rsd=reserve_wsd(1); + eta=rsd[0]; + + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + + mulg5_dble(VOLUME/2,phi); + assign_sd2sd(VOLUME/2,psi,eta); + mulg5_dble(VOLUME/2,eta); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,-mu,eta,chi,status+3); + + release_wsd(); + } + + error_root((status[0]<0)||(status[1]<0)||(status[3]<0)||(status[4]<0),1, + "force4 [force4.c]","DFL_SAP_GCR solver failed " + "(mu = %.4e, parameter set no %d, status = %d,%d,%d;%d,%d,%d)", + mu,isp,status[0],status[1],status[2],status[3], + status[4],status[5]); + + if (icr) + add_chrono(icr,chi); + + add_prod2xt(1.0,chi,psi); + add_prod2xv(1.0,chi,psi); + } + else + error_root(1,1,"force4 [force4.c]","Unknown solver"); + + sw_frc(c); + hop_frc(c); + + release_wsd(); +} + + +double action4(double mu,int ipf,int isw,int isp,int icom,int *status) +{ + double act,r; + spinor_dble *phi,*chi,*psi; + spinor_dble **rsd,**wsd; + mdflds_t *mdfs; + solver_parms_t sp; + sap_parms_t sap; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg!=1) + set_tm_parms(1); + + mdfs=mdflds(); + phi=(*mdfs).pf[ipf]; + sp=solver_parms(isp); + + if (isw==1) + act=sdet(); + else + act=0.0; + + if (sp.solver==CGNE) + { + rsd=reserve_wsd(1); + chi=rsd[0]; + + tmcgeo(sp.nmx,sp.res,mu,phi,chi,status); + + error_root(status[0]<0,1,"action4 [force4.c]", + "CGNE solver failed (mu = %.4e, parameter set no %d, " + "status = %d)",mu,isp,status[0]); + + wsd=reserve_wsd(1); + psi=wsd[0]; + + Dwhat_dble(-mu,chi,psi); + act+=norm_square_dble(VOLUME/2,0,psi); + + 
release_wsd(); + release_wsd(); + } + else if (sp.solver==SAP_GCR) + { + rsd=reserve_wsd(1); + psi=rsd[0]; + + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + + sap_gcr(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + + error_root(status[0]<0,1,"action4 [force4.c]", + "SAP_GCR solver failed (mu = %.4e, parameter set no %d, " + "status = %d)",mu,isp,status[0]); + + mulg5_dble(VOLUME/2,phi); + act+=norm_square_dble(VOLUME/2,0,psi); + + release_wsd(); + } + else if (sp.solver==DFL_SAP_GCR) + { + rsd=reserve_wsd(1); + psi=rsd[0]; + + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu,phi,psi,status); + + error_root((status[0]<0)||(status[1]<0),1,"action4 [force4.c]", + "DFL_SAP_GCR solver failed (mu = %.4e, parameter set " + "no %d, status = %d,%d,%d)",mu,isp, + status[0],status[1],status[2]); + + mulg5_dble(VOLUME/2,phi); + act+=norm_square_dble(VOLUME/2,0,psi); + + release_wsd(); + } + else + error_root(1,1,"action4 [force4.c]","Unknown solver"); + + if (icom==1) + { + r=act; + MPI_Reduce(&r,&act,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&act,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return act; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force5.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force5.c new file mode 100644 index 0000000000000000000000000000000000000000..3d8d9f19a856709c92dcb198e0f8d6be81380be2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/force5.c @@ -0,0 +1,220 @@ + +/******************************************************************************* +* +* File force5.c +* +* Copyright (C) 2011-2013 Stefan Schaefer, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License 
(GPL) +* +* Hasenbusch twisted mass pseudo-fermion action and force with even-odd +* preconditioning. +* +* The externally accessible functions are +* +* double setpf5(double mu0,double mu1,int ipf,int isp,int icom, +* int *status) +* Generates a pseudo-fermion field phi with probability proportional +* to exp(-Spf) and returns the action Spf-(phi,phi) (see the notes). +* +* void force5(double mu0,double mu1,int ipf,int isp,int icr,double c, +* int *status) +* Computes the force deriving from the action Spf (see the notes). +* The calculated force is multiplied by c and added to the molecular- +* dynamics force field. +* +* double action5(double mu0,double mu1,int ipf,int isp,int icom, +* int *status) +* Returns the action Spf-(phi,phi) (see the notes). +* +* Notes: +* +* The pseudo-fermion action Spf is given by +* +* Spf=(phi,(Dwhat^dag*Dwhat+mu1^2)(Dwhat^dag*Dwhat+mu0^2)^(-1)*phi) +* +* =(phi,phi)+(mu1^2-mu0^2)*(phi,(Dwhat^dag*Dwhat+mu0^2)^(-1)*phi) +* +* where Dwhat denotes the even-odd preconditioned (improved) Wilson-Dirac +* operator and phi the pseudo-fermion field. The latter vanishes on the +* odd lattice sites. +* +* The common parameters of the programs in this module are: +* +* mu0,mu1 Twisted mass parameters in Spf. +* +* ipf Index of the pseudo-fermion field phi in the +* structure returned by mdflds() [mdflds.c]. +* +* isp Index of the solver parameter set that describes +* the solver to be used for the solution of the +* Dirac equation. +* +* icom The action returned by the programs setpf5() and +* action5() is summed over all MPI processes if icom=1. +* Otherwise the local part of the action is returned. +* +* status Status values returned by the solver used for the +* solution of the Dirac equation. +* +* The supported solvers are CGNE, SAP_GCR and DFL_SAP_GCR. 
Depending +* on the program and the solver, the number of status variables varies +* and is given by: +* +* CGNE SAP_GCR DFL_SAP_GCR +* setpf5() 1 1 3 +* force5() 1 2 6 +* action5() 1 1 3 +* +* The solver used in the case of setpf5() is for the Dirac equation with +* twisted mass mu1, while force5() and action5() use the solver for the +* equation with twisted mass mu0. Different solvers may be needed in the +* two cases if mu1>>mu0, for example. +* +* Note that, in force5(), the GCR solvers solve the Dirac equations twice. +* In these cases, the program writes the status values one after the other +* to the array. The bare quark mass m0 is the one last set by sw_parms() +* [flags/lat_parms.c] and it is taken for granted that the parameters of +* the solver have been set by set_solver_parms() [flags/solver_parms.c]. +* +* The program force5() attempts to propagate the solutions of the Dirac +* equation along the molecular-dynamics trajectories, using the field +* stack number icr (no fields are propagated if icr=0). If this feature +* is used, the program setup_chrono() [update/chrono.c] must be called +* before force5() is called for the first time. +* +* The required workspaces of double-precision spinor fields are +* +* CGNE SAP_GCR DFL_SAP_GCR +* setpf5() 1 1 1 +* force5() 2+(icr>0) 2+2*(icr>0) 2+2*(icr>0) +* action5() 1 1 1 +* +* (these figures do not include the workspace required by the solvers). +* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. 
+* +*******************************************************************************/ + +#define FORCE5_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "mdflds.h" +#include "sw_term.h" +#include "sflds.h" +#include "dirac.h" +#include "linalg.h" +#include "sap.h" +#include "dfl.h" +#include "forces.h" +#include "global.h" + + +double setpf5(double mu0,double mu1,int ipf,int isp,int icom,int *status) +{ + double act; + complex_dble z; + spinor_dble **wsd,**rsd; + spinor_dble *phi,*psi,*chi; + mdflds_t *mdfs; + solver_parms_t sp; + sap_parms_t sap; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg!=1) + set_tm_parms(1); + + mdfs=mdflds(); + phi=(*mdfs).pf[ipf]; + wsd=reserve_wsd(1); + psi=wsd[0]; + + random_sd(VOLUME/2,phi,1.0); + set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + bnd_sd2zero(EVEN_PTS,phi); + sp=solver_parms(isp); + + if (sp.solver==CGNE) + { + tmcgeo(sp.nmx,sp.res,mu1,phi,psi,status); + + error_root(status[0]<0,1,"setpf5 [force5.c]","CGNE solver failed " + "(mu = %.4e, parameter set no %d, status = %d)", + mu1,isp,status[0]); + + rsd=reserve_wsd(1); + chi=rsd[0]; + assign_sd2sd(VOLUME/2,psi,chi); + Dwhat_dble(-mu1,chi,psi); + mulg5_dble(VOLUME/2,psi); + release_wsd(); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME/2,phi); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu1,phi,psi,status); + mulg5_dble(VOLUME/2,phi); + + error_root(status[0]<0,1,"setpf5 [force5.c]","SAP_GCR solver failed " + "(mu = %.4e, parameter set no %d, status = %d)", + mu1,isp,status[0]); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME/2,phi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu1,phi,psi,status); + mulg5_dble(VOLUME/2,phi); + + error_root((status[0]<0)||(status[1]<0),1,"setpf5 [force5.c]", + "DFL_SAP_GCR solver failed (mu = %.4e, parameter set " + 
"no %d, status = %d,%d,%d)",mu1,isp, + status[0],status[1],status[2]); + } + else + error_root(1,1,"setpf5 [force5.c]","Unknown solver"); + + z.re=0.0; + z.im=mu0-mu1; + mulc_spinor_add_dble(VOLUME/2,phi,psi,z); + act=(mu1*mu1-mu0*mu0)*norm_square_dble(VOLUME/2,icom,psi); + release_wsd(); + + return act; +} + + +void force5(double mu0,double mu1,int ipf,int isp,int icr, + double c,int *status) +{ + double dmu2; + + dmu2=mu1*mu1-mu0*mu0; + force4(mu0,ipf,0,isp,icr,dmu2*c,status); +} + + +double action5(double mu0,double mu1,int ipf,int isp,int icom,int *status) +{ + double dmu2,act; + + dmu2=mu1*mu1-mu0*mu0; + act=dmu2*action4(mu0,ipf,0,isp,icom,status); + + return act; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/frcfcts.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/frcfcts.c new file mode 100644 index 0000000000000000000000000000000000000000..3539839592927473fda98d190ea9caa0a7b188bb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/frcfcts.c @@ -0,0 +1,688 @@ + +/******************************************************************************* +* +* File frcfcts.c +* +* Copyright (C) 2005, 2011, 2012 Martin Luescher, Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Generic functions used for the force calculation. +* +* The externally accessible functions are +* +* void det2xt(pauli_dble *m,u3_alg_dble *X) +* Computes the matrices X[0],..,X[5] associated to the SW term on a +* given lattice point (see the notes). The program expects that m[0] +* and m[1] contain the hermitian part of the inverse of the SW term +* at the chosen point. +* +* void prod2xt(spinor_dble *r,spinor_dble *s,u3_alg_dble *X) +* Computes the matrices X[0],..,X[5] associated to a pair of spinors +* r and s at a given lattice point (see the notes). 
+* +* The following is an array of functions indexed by the direction mu=0,..,3: +* +* void (*prod2xv[])(spinor_dble *rx,spinor_dble *ry, +* spinor_dble *sx,spinor_dble *sy,su3_dble *u) +* Computes the complex 3x3 matrix +* +* u=tr{gamma_5*(1-gamma_mu)*[(sy x rx^dag)+(ry x sx^dag)]} +* +* where ..x.. denotes the tensor product in spinor space and the trace +* is taken over the Dirac indices. +* +* Notes: +* +* As discussed in the notes +* +* M. Luescher: "Molecular-dynamics quark forces" (January 2012) +* +* the programs in this module serve to compute the spin part of the quark +* forces. The data type u3_alg_dble is described at the top of the module +* su3fcts/su3prod.c. +* +* The matrices computed by the program det2xt() are +* +* X[n]=i*tr{sigma_{mu,nu}*diag(m[0],m[1])} +* +* where (mu,nu)=(0,1),(0,2),(0,3),(2,3),(3,1),(1,2) for n=0,..,5. Similarly, +* the program prod2xt() computes +* +* X[n]=i*tr{(gamma_5*sigma_{mu,nu}*s) x (r^dag)+(s<->r)} +* +* where ..x.. denotes the tensor product in spinor space. In both cases, +* the trace is taken over the Dirac indices only. 
+* +*******************************************************************************/ + +#define FRCFCTS_C + +#include +#include +#include +#include "su3.h" +#include "forces.h" + +static su3_vector_dble psi1,psi2,chi1,chi2 ALIGNED16; +static pauli_dble ms[2] ALIGNED16; + +typedef union +{ + spinor_dble s; + weyl_dble w[2]; +} spin_t; + +#define _re(z,w) ((z).re*(w).re+(z).im*(w).im) +#define _im(z,w) ((z).im*(w).re-(z).re*(w).im) + + +void det2xt(pauli_dble *m,u3_alg_dble *X) +{ + double x,*up,*um; + u3_alg_dble *X0,*X1; + + up=m[0].u; + um=m[1].u; + + X0=X; + X1=X+3; + + x=up[10]+up[10]; + (*X0).c1=x; + (*X1).c1=-x; + x=um[10]+um[10]; + (*X0).c1-=x; + (*X1).c1-=x; + + x=up[20]+up[20]; + (*X0).c2=x; + (*X1).c2=-x; + x=um[20]+um[20]; + (*X0).c2-=x; + (*X1).c2-=x; + + x=up[28]+up[28]; + (*X0).c3=x; + (*X1).c3=-x; + x=um[28]+um[28]; + (*X0).c3-=x; + (*X1).c3-=x; + + x=up[19]-up[13]; + (*X0).c4=x; + (*X1).c4=-x; + x=um[19]-um[13]; + (*X0).c4-=x; + (*X1).c4-=x; + + x=up[12]+up[18]; + (*X0).c5=x; + (*X1).c5=-x; + x=um[12]+um[18]; + (*X0).c5-=x; + (*X1).c5-=x; + + x=up[25]-up[15]; + (*X0).c6=x; + (*X1).c6=-x; + x=um[25]-um[15]; + (*X0).c6-=x; + (*X1).c6-=x; + + x=up[14]+up[24]; + (*X0).c7=x; + (*X1).c7=-x; + x=um[14]+um[24]; + (*X0).c7-=x; + (*X1).c7-=x; + + x=up[27]-up[23]; + (*X0).c8=x; + (*X1).c8=-x; + x=um[27]-um[23]; + (*X0).c8-=x; + (*X1).c8-=x; + + x=up[22]+up[26]; + (*X0).c9=x; + (*X1).c9=-x; + x=um[22]+um[26]; + (*X0).c9-=x; + (*X1).c9-=x; + + X0=X+1; + X1=X+4; + + x=up[11]+up[11]; + (*X0).c1=-x; + (*X1).c1=x; + x=um[11]+um[11]; + (*X0).c1+=x; + (*X1).c1+=x; + + x=up[21]+up[21]; + (*X0).c2=-x; + (*X1).c2=x; + x=um[21]+um[21]; + (*X0).c2+=x; + (*X1).c2+=x; + + x=up[29]+up[29]; + (*X0).c3=-x; + (*X1).c3=x; + x=um[29]+um[29]; + (*X0).c3+=x; + (*X1).c3+=x; + + x=up[18]-up[12]; + (*X0).c4=x; + (*X1).c4=-x; + x=um[18]-um[12]; + (*X0).c4-=x; + (*X1).c4-=x; + + x=up[13]+up[19]; + (*X0).c5=-x; + (*X1).c5=x; + x=um[13]+um[19]; + (*X0).c5+=x; + (*X1).c5+=x; + + 
x=up[24]-up[14]; + (*X0).c6=x; + (*X1).c6=-x; + x=um[24]-um[14]; + (*X0).c6-=x; + (*X1).c6-=x; + + x=up[25]+up[15]; + (*X0).c7=-x; + (*X1).c7=x; + x=um[25]+um[15]; + (*X0).c7+=x; + (*X1).c7+=x; + + x=up[26]-up[22]; + (*X0).c8=x; + (*X1).c8=-x; + x=um[26]-um[22]; + (*X0).c8-=x; + (*X1).c8-=x; + + x=up[27]+up[23]; + (*X0).c9=-x; + (*X1).c9=x; + x=um[27]+um[23]; + (*X0).c9+=x; + (*X1).c9+=x; + + X0=X+2; + X1=X+5; + + x=up[0]-up[3]; + (*X0).c1=x; + (*X1).c1=-x; + x=um[0]-um[3]; + (*X0).c1-=x; + (*X1).c1-=x; + + x=up[1]-up[4]; + (*X0).c2=x; + (*X1).c2=-x; + x=um[1]-um[4]; + (*X0).c2-=x; + (*X1).c2-=x; + + x=up[2]-up[5]; + (*X0).c3=x; + (*X1).c3=-x; + x=um[2]-um[5]; + (*X0).c3-=x; + (*X1).c3-=x; + + x=up[31]-up[7]; + (*X0).c4=x; + (*X1).c4=-x; + x=um[31]-um[7]; + (*X0).c4-=x; + (*X1).c4-=x; + + x=up[6]-up[30]; + (*X0).c5=x; + (*X1).c5=-x; + x=um[6]-um[30]; + (*X0).c5-=x; + (*X1).c5-=x; + + x=up[33]-up[9]; + (*X0).c6=x; + (*X1).c6=-x; + x=um[33]-um[9]; + (*X0).c6-=x; + (*X1).c6-=x; + + x=up[8]-up[32]; + (*X0).c7=x; + (*X1).c7=-x; + x=um[8]-um[32]; + (*X0).c7-=x; + (*X1).c7-=x; + + x=up[35]-up[17]; + (*X0).c8=x; + (*X1).c8=-x; + x=um[35]-um[17]; + (*X0).c8-=x; + (*X1).c8-=x; + + x=up[16]-up[34]; + (*X0).c9=x; + (*X1).c9=-x; + x=um[16]-um[34]; + (*X0).c9-=x; + (*X1).c9-=x; +} + + +static void det2xt5(pauli_dble *m,u3_alg_dble *X) +{ + double x,*up,*um; + u3_alg_dble *X0,*X1; + + up=m[0].u; + um=m[1].u; + + X0=X; + X1=X+3; + + x=up[10]+up[10]; + (*X0).c1=x; + (*X1).c1=-x; + x=um[10]+um[10]; + (*X0).c1+=x; + (*X1).c1+=x; + + x=up[20]+up[20]; + (*X0).c2=x; + (*X1).c2=-x; + x=um[20]+um[20]; + (*X0).c2+=x; + (*X1).c2+=x; + + x=up[28]+up[28]; + (*X0).c3=x; + (*X1).c3=-x; + x=um[28]+um[28]; + (*X0).c3+=x; + (*X1).c3+=x; + + x=up[19]-up[13]; + (*X0).c4=x; + (*X1).c4=-x; + x=um[19]-um[13]; + (*X0).c4+=x; + (*X1).c4+=x; + + x=up[12]+up[18]; + (*X0).c5=x; + (*X1).c5=-x; + x=um[12]+um[18]; + (*X0).c5+=x; + (*X1).c5+=x; + + x=up[25]-up[15]; + (*X0).c6=x; + (*X1).c6=-x; + 
x=um[25]-um[15]; + (*X0).c6+=x; + (*X1).c6+=x; + + x=up[14]+up[24]; + (*X0).c7=x; + (*X1).c7=-x; + x=um[14]+um[24]; + (*X0).c7+=x; + (*X1).c7+=x; + + x=up[27]-up[23]; + (*X0).c8=x; + (*X1).c8=-x; + x=um[27]-um[23]; + (*X0).c8+=x; + (*X1).c8+=x; + + x=up[22]+up[26]; + (*X0).c9=x; + (*X1).c9=-x; + x=um[22]+um[26]; + (*X0).c9+=x; + (*X1).c9+=x; + + X0=X+1; + X1=X+4; + + x=up[11]+up[11]; + (*X0).c1=-x; + (*X1).c1=x; + x=um[11]+um[11]; + (*X0).c1-=x; + (*X1).c1-=x; + + x=up[21]+up[21]; + (*X0).c2=-x; + (*X1).c2=x; + x=um[21]+um[21]; + (*X0).c2-=x; + (*X1).c2-=x; + + x=up[29]+up[29]; + (*X0).c3=-x; + (*X1).c3=x; + x=um[29]+um[29]; + (*X0).c3-=x; + (*X1).c3-=x; + + x=up[18]-up[12]; + (*X0).c4=x; + (*X1).c4=-x; + x=um[18]-um[12]; + (*X0).c4+=x; + (*X1).c4+=x; + + x=up[13]+up[19]; + (*X0).c5=-x; + (*X1).c5=x; + x=um[13]+um[19]; + (*X0).c5-=x; + (*X1).c5-=x; + + x=up[24]-up[14]; + (*X0).c6=x; + (*X1).c6=-x; + x=um[24]-um[14]; + (*X0).c6+=x; + (*X1).c6+=x; + + x=up[25]+up[15]; + (*X0).c7=-x; + (*X1).c7=x; + x=um[25]+um[15]; + (*X0).c7-=x; + (*X1).c7-=x; + + x=up[26]-up[22]; + (*X0).c8=x; + (*X1).c8=-x; + x=um[26]-um[22]; + (*X0).c8+=x; + (*X1).c8+=x; + + x=up[27]+up[23]; + (*X0).c9=-x; + (*X1).c9=x; + x=um[27]+um[23]; + (*X0).c9-=x; + (*X1).c9-=x; + + X0=X+2; + X1=X+5; + + x=up[0]-up[3]; + (*X0).c1=x; + (*X1).c1=-x; + x=um[0]-um[3]; + (*X0).c1+=x; + (*X1).c1+=x; + + x=up[1]-up[4]; + (*X0).c2=x; + (*X1).c2=-x; + x=um[1]-um[4]; + (*X0).c2+=x; + (*X1).c2+=x; + + x=up[2]-up[5]; + (*X0).c3=x; + (*X1).c3=-x; + x=um[2]-um[5]; + (*X0).c3+=x; + (*X1).c3+=x; + + x=up[31]-up[7]; + (*X0).c4=x; + (*X1).c4=-x; + x=um[31]-um[7]; + (*X0).c4+=x; + (*X1).c4+=x; + + x=up[6]-up[30]; + (*X0).c5=x; + (*X1).c5=-x; + x=um[6]-um[30]; + (*X0).c5+=x; + (*X1).c5+=x; + + x=up[33]-up[9]; + (*X0).c6=x; + (*X1).c6=-x; + x=um[33]-um[9]; + (*X0).c6+=x; + (*X1).c6+=x; + + x=up[8]-up[32]; + (*X0).c7=x; + (*X1).c7=-x; + x=um[8]-um[32]; + (*X0).c7+=x; + (*X1).c7+=x; + + x=up[35]-up[17]; + (*X0).c8=x; + 
(*X1).c8=-x; + x=um[35]-um[17]; + (*X0).c8+=x; + (*X1).c8+=x; + + x=up[16]-up[34]; + (*X0).c9=x; + (*X1).c9=-x; + x=um[16]-um[34]; + (*X0).c9+=x; + (*X1).c9+=x; +} + + +static void vec2pauli(weyl_dble *r,weyl_dble *s,pauli_dble *m) +{ + double *u; + su3_vector_dble *r1,*r2,*s1,*s2; + + u=(*m).u; + r1=&((*r).c1); + r2=&((*r).c2); + s1=&((*s).c1); + s2=&((*s).c2); + + u[ 0]=_re((*s1).c1,(*r1).c1)+_re((*s1).c1,(*r1).c1); + u[ 1]=_re((*s1).c2,(*r1).c2)+_re((*s1).c2,(*r1).c2); + u[ 2]=_re((*s1).c3,(*r1).c3)+_re((*s1).c3,(*r1).c3); + + u[ 3]=_re((*s2).c1,(*r2).c1)+_re((*s2).c1,(*r2).c1); + u[ 4]=_re((*s2).c2,(*r2).c2)+_re((*s2).c2,(*r2).c2); + u[ 5]=_re((*s2).c3,(*r2).c3)+_re((*s2).c3,(*r2).c3); + + u[ 6]=_re((*s1).c1,(*r1).c2)+_re((*r1).c1,(*s1).c2); + u[ 7]=_im((*s1).c1,(*r1).c2)+_im((*r1).c1,(*s1).c2); + u[ 8]=_re((*s1).c1,(*r1).c3)+_re((*r1).c1,(*s1).c3); + u[ 9]=_im((*s1).c1,(*r1).c3)+_im((*r1).c1,(*s1).c3); + + u[10]=_re((*s1).c1,(*r2).c1)+_re((*r1).c1,(*s2).c1); + u[11]=_im((*s1).c1,(*r2).c1)+_im((*r1).c1,(*s2).c1); + u[12]=_re((*s1).c1,(*r2).c2)+_re((*r1).c1,(*s2).c2); + u[13]=_im((*s1).c1,(*r2).c2)+_im((*r1).c1,(*s2).c2); + u[14]=_re((*s1).c1,(*r2).c3)+_re((*r1).c1,(*s2).c3); + u[15]=_im((*s1).c1,(*r2).c3)+_im((*r1).c1,(*s2).c3); + + u[16]=_re((*s1).c2,(*r1).c3)+_re((*r1).c2,(*s1).c3); + u[17]=_im((*s1).c2,(*r1).c3)+_im((*r1).c2,(*s1).c3); + + u[18]=_re((*s1).c2,(*r2).c1)+_re((*r1).c2,(*s2).c1); + u[19]=_im((*s1).c2,(*r2).c1)+_im((*r1).c2,(*s2).c1); + u[20]=_re((*s1).c2,(*r2).c2)+_re((*r1).c2,(*s2).c2); + u[21]=_im((*s1).c2,(*r2).c2)+_im((*r1).c2,(*s2).c2); + u[22]=_re((*s1).c2,(*r2).c3)+_re((*r1).c2,(*s2).c3); + u[23]=_im((*s1).c2,(*r2).c3)+_im((*r1).c2,(*s2).c3); + + u[24]=_re((*s1).c3,(*r2).c1)+_re((*r1).c3,(*s2).c1); + u[25]=_im((*s1).c3,(*r2).c1)+_im((*r1).c3,(*s2).c1); + u[26]=_re((*s1).c3,(*r2).c2)+_re((*r1).c3,(*s2).c2); + u[27]=_im((*s1).c3,(*r2).c2)+_im((*r1).c3,(*s2).c2); + u[28]=_re((*s1).c3,(*r2).c3)+_re((*r1).c3,(*s2).c3); + 
u[29]=_im((*s1).c3,(*r2).c3)+_im((*r1).c3,(*s2).c3); + + u[30]=_re((*s2).c1,(*r2).c2)+_re((*r2).c1,(*s2).c2); + u[31]=_im((*s2).c1,(*r2).c2)+_im((*r2).c1,(*s2).c2); + u[32]=_re((*s2).c1,(*r2).c3)+_re((*r2).c1,(*s2).c3); + u[33]=_im((*s2).c1,(*r2).c3)+_im((*r2).c1,(*s2).c3); + u[34]=_re((*s2).c2,(*r2).c3)+_re((*r2).c2,(*s2).c3); + u[35]=_im((*s2).c2,(*r2).c3)+_im((*r2).c2,(*s2).c3); +} + + +void prod2xt(spinor_dble *r,spinor_dble *s,u3_alg_dble *X) +{ + spin_t *spr,*sps; + + spr=(spin_t*)(r); + sps=(spin_t*)(s); + + vec2pauli((*spr).w,(*sps).w,ms); + vec2pauli((*spr).w+1,(*sps).w+1,ms+1); + + det2xt5(ms,X); +} + + +static void set2mat(su3_dble *u) +{ + (*u).c11.re=_re(psi1.c1,chi1.c1)+_re(psi2.c1,chi2.c1); + (*u).c11.im=_im(psi1.c1,chi1.c1)+_im(psi2.c1,chi2.c1); + (*u).c12.re=_re(psi1.c1,chi1.c2)+_re(psi2.c1,chi2.c2); + (*u).c12.im=_im(psi1.c1,chi1.c2)+_im(psi2.c1,chi2.c2); + (*u).c13.re=_re(psi1.c1,chi1.c3)+_re(psi2.c1,chi2.c3); + (*u).c13.im=_im(psi1.c1,chi1.c3)+_im(psi2.c1,chi2.c3); + + (*u).c21.re=_re(psi1.c2,chi1.c1)+_re(psi2.c2,chi2.c1); + (*u).c21.im=_im(psi1.c2,chi1.c1)+_im(psi2.c2,chi2.c1); + (*u).c22.re=_re(psi1.c2,chi1.c2)+_re(psi2.c2,chi2.c2); + (*u).c22.im=_im(psi1.c2,chi1.c2)+_im(psi2.c2,chi2.c2); + (*u).c23.re=_re(psi1.c2,chi1.c3)+_re(psi2.c2,chi2.c3); + (*u).c23.im=_im(psi1.c2,chi1.c3)+_im(psi2.c2,chi2.c3); + + (*u).c31.re=_re(psi1.c3,chi1.c1)+_re(psi2.c3,chi2.c1); + (*u).c31.im=_im(psi1.c3,chi1.c1)+_im(psi2.c3,chi2.c1); + (*u).c32.re=_re(psi1.c3,chi1.c2)+_re(psi2.c3,chi2.c2); + (*u).c32.im=_im(psi1.c3,chi1.c2)+_im(psi2.c3,chi2.c2); + (*u).c33.re=_re(psi1.c3,chi1.c3)+_re(psi2.c3,chi2.c3); + (*u).c33.im=_im(psi1.c3,chi1.c3)+_im(psi2.c3,chi2.c3); +} + + +static void add2mat(su3_dble *u) +{ + (*u).c11.re+=_re(psi1.c1,chi1.c1)+_re(psi2.c1,chi2.c1); + (*u).c11.im+=_im(psi1.c1,chi1.c1)+_im(psi2.c1,chi2.c1); + (*u).c12.re+=_re(psi1.c1,chi1.c2)+_re(psi2.c1,chi2.c2); + (*u).c12.im+=_im(psi1.c1,chi1.c2)+_im(psi2.c1,chi2.c2); + 
(*u).c13.re+=_re(psi1.c1,chi1.c3)+_re(psi2.c1,chi2.c3); + (*u).c13.im+=_im(psi1.c1,chi1.c3)+_im(psi2.c1,chi2.c3); + + (*u).c21.re+=_re(psi1.c2,chi1.c1)+_re(psi2.c2,chi2.c1); + (*u).c21.im+=_im(psi1.c2,chi1.c1)+_im(psi2.c2,chi2.c1); + (*u).c22.re+=_re(psi1.c2,chi1.c2)+_re(psi2.c2,chi2.c2); + (*u).c22.im+=_im(psi1.c2,chi1.c2)+_im(psi2.c2,chi2.c2); + (*u).c23.re+=_re(psi1.c2,chi1.c3)+_re(psi2.c2,chi2.c3); + (*u).c23.im+=_im(psi1.c2,chi1.c3)+_im(psi2.c2,chi2.c3); + + (*u).c31.re+=_re(psi1.c3,chi1.c1)+_re(psi2.c3,chi2.c1); + (*u).c31.im+=_im(psi1.c3,chi1.c1)+_im(psi2.c3,chi2.c1); + (*u).c32.re+=_re(psi1.c3,chi1.c2)+_re(psi2.c3,chi2.c2); + (*u).c32.im+=_im(psi1.c3,chi1.c2)+_im(psi2.c3,chi2.c2); + (*u).c33.re+=_re(psi1.c3,chi1.c3)+_re(psi2.c3,chi2.c3); + (*u).c33.im+=_im(psi1.c3,chi1.c3)+_im(psi2.c3,chi2.c3); +} + + +static void prod2xv0(spinor_dble *rx,spinor_dble *ry, + spinor_dble *sx,spinor_dble *sy,su3_dble *u) +{ + _vector_add(psi1,(*ry).c1,(*ry).c3); + _vector_add(psi2,(*ry).c2,(*ry).c4); + _vector_sub(chi1,(*sx).c1,(*sx).c3); + _vector_sub(chi2,(*sx).c2,(*sx).c4); + set2mat(u); + + _vector_add(psi1,(*sy).c1,(*sy).c3); + _vector_add(psi2,(*sy).c2,(*sy).c4); + _vector_sub(chi1,(*rx).c1,(*rx).c3); + _vector_sub(chi2,(*rx).c2,(*rx).c4); + add2mat(u); +} + + +static void prod2xv1(spinor_dble *rx,spinor_dble *ry, + spinor_dble *sx,spinor_dble *sy,su3_dble *u) +{ + _vector_i_add(psi1,(*ry).c1,(*ry).c4); + _vector_i_add(psi2,(*ry).c2,(*ry).c3); + _vector_i_sub(chi1,(*sx).c1,(*sx).c4); + _vector_i_sub(chi2,(*sx).c2,(*sx).c3); + set2mat(u); + + _vector_i_add(psi1,(*sy).c1,(*sy).c4); + _vector_i_add(psi2,(*sy).c2,(*sy).c3); + _vector_i_sub(chi1,(*rx).c1,(*rx).c4); + _vector_i_sub(chi2,(*rx).c2,(*rx).c3); + add2mat(u); +} + + +static void prod2xv2(spinor_dble *rx,spinor_dble *ry, + spinor_dble *sx,spinor_dble *sy,su3_dble *u) +{ + _vector_add(psi1,(*ry).c1,(*ry).c4); + _vector_sub(psi2,(*ry).c2,(*ry).c3); + _vector_sub(chi1,(*sx).c1,(*sx).c4); + 
_vector_add(chi2,(*sx).c2,(*sx).c3); + set2mat(u); + + _vector_add(psi1,(*sy).c1,(*sy).c4); + _vector_sub(psi2,(*sy).c2,(*sy).c3); + _vector_sub(chi1,(*rx).c1,(*rx).c4); + _vector_add(chi2,(*rx).c2,(*rx).c3); + add2mat(u); +} + + +static void prod2xv3(spinor_dble *rx,spinor_dble *ry, + spinor_dble *sx,spinor_dble *sy,su3_dble *u) +{ + _vector_i_add(psi1,(*ry).c1,(*ry).c3); + _vector_i_sub(psi2,(*ry).c2,(*ry).c4); + _vector_i_sub(chi1,(*sx).c1,(*sx).c3); + _vector_i_add(chi2,(*sx).c2,(*sx).c4); + set2mat(u); + + _vector_i_add(psi1,(*sy).c1,(*sy).c3); + _vector_i_sub(psi2,(*sy).c2,(*sy).c4); + _vector_i_sub(chi1,(*rx).c1,(*rx).c3); + _vector_i_add(chi2,(*rx).c2,(*rx).c4); + add2mat(u); +} + + +void (*prod2xv[4])(spinor_dble *rx,spinor_dble *ry, + spinor_dble *sx,spinor_dble *sy,su3_dble *u)= +{prod2xv0,prod2xv1,prod2xv2,prod2xv3}; diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/genfrc.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/genfrc.c new file mode 100644 index 0000000000000000000000000000000000000000..3fc357614864a327328158591e6e2169e457d36e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/genfrc.c @@ -0,0 +1,195 @@ +/******************************************************************************* +* +* File genfrc.c +* +* Copyright (C) 2006, 2011, 2013 Martin Luescher, Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Calculation of quark forces. +* +* The externally accessible functions are +* +* void sw_frc(double c) +* Computes the SW part of the quark force, using the current value +* of the X tensor field (see the notes). The calculated force is then +* multiplied by c and added to the MD force field. +* +* void hop_frc(double c) +* Computes the hopping part of the quark force, using the current +* value of the X vector field (see the notes). 
The calculated force +* is then multiplied by c and added to the MD force field. +* +* Notes: +* +* The computation of the quark forces is described in the notes +* +* M. Luescher: "Molecular-dynamics quark forces" (January 2012) +* +* For explanations of the X tensor and vector fields, see xtensor.c and +* frcfcts.c. The MD force field is the one returned by the program mdflds() +* (see mdflds/mdflds.c). +* +* If the X tensor field is obtained from the SW term calculated by sw_term(), +* and if the X vector field is obtained from quark fields vanishing at the +* boundaries of the lattice, as required by the chosen boundary conditions, +* the programs sw_frc() and hop_frc() leave the force field on the static +* links unchanged. +* +* The coefficient csw of the SW term is retrieved from the parameter data +* base (flags/lat_parms.c). The programs in this module perform global +* operations and must be called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define GENFRC_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "su3fcts.h" +#include "flags.h" +#include "lattice.h" +#include "mdflds.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "tcharge.h" +#include "forces.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +static su3_dble w[6] ALIGNED16; +static su3_alg_dble Y[8] ALIGNED16; + + +void sw_frc(double c) +{ + int bc,n,ix,t; + int ipu[4],ipx[4]; + u3_alg_dble **xt,*Xb; + su3_alg_dble *fb,*fr; + su3_dble *ub; + mdflds_t *mdfs; + sw_parms_t swp; + + bc=bc_type(); + swp=sw_parms(); + c*=0.0625*swp.csw; + + mdfs=mdflds(); + fb=(*mdfs).frc; + set_alg2zero(7*(BNDRY/4),fb+4*VOLUME); + + if (query_flags(UDBUF_UP2DATE)!=1) + copy_bnd_ud(); + ub=udfld(); + xt=xtensor(); + + for (n=0;n<6;n++) + { + Xb=xt[n]; + copy_bnd_ft(n,Xb); + + for (ix=0;ix=3)||(bc==0)||(bc==3)) + { + fr=fb+ipu[1]; + _su3_alg_mul_add_assign(*fr,c,Y[6]); + 
fr=fb+ipu[2]; + _su3_alg_mul_sub_assign(*fr,c,Y[1]); + } + else + { + if (t<(N0-1)) + { + fr=fb+ipu[1]; + _su3_alg_mul_add_assign(*fr,c,Y[6]); + } + + if ((t>0)||(bc==2)) + { + fr=fb+ipu[2]; + _su3_alg_mul_sub_assign(*fr,c,Y[1]); + } + } + + fr=fb+ipu[3]; + _su3_alg_mul_sub_assign(*fr,c,Y[3]); + } + } + + add_bnd_frc(); +} + + +void hop_frc(double c) +{ + su3_alg_dble *fr; + su3_dble *xv,*u,*um; + mdflds_t *mdfs; + + xv=xvector(); + mdfs=mdflds(); + fr=(*mdfs).frc; + + u=udfld(); + um=u+4*VOLUME; + c*=-0.5; + + for (;u=-1. Otherwise the field psi is set to zero and the +* programs return the norm of the source eta. +* +* The SW term is recalculated when needed. Evidently the solver is a global +* program that must be called on all processes simultaneously. The required +* workspaces are +* +* spinor 5 +* spinor_dble 3 +* +* (see utils/wspace.c). +* +*******************************************************************************/ + +#define TMCG_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "linsolv.h" +#include "forces.h" +#include "global.h" + +static float mus; +static double mud; + + +static void Dop(spinor *s,spinor *r) +{ + Dw(mus,s,r); + mulg5(VOLUME,r); + mus=-mus; +} + + +static void Dop_dble(spinor_dble *s,spinor_dble *r) +{ + Dw_dble(mud,s,r); + mulg5_dble(VOLUME,r); + mud=-mud; +} + + +double tmcg(int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status) +{ + double rho,rho0,fact; + spinor **ws; + spinor_dble **rsd,**wsd; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg==1) + set_tm_parms(0); + + if (query_flags(U_MATCH_UD)!=1) + assign_ud2u(); + + sw_term(NO_PTS); + + if ((query_flags(SW_UP2DATE)!=1)|| + (query_flags(SW_E_INVERTED)!=0)||(query_flags(SW_O_INVERTED)!=0)) + assign_swd2sw(); + + ws=reserve_ws(5); + wsd=reserve_wsd(2); + rsd=reserve_wsd(1); + + 
mus=(float)(mu); + mud=mu; + rho0=sqrt(norm_square_dble(VOLUME,1,eta)); + fact=rho0/sqrt((double)(VOLUME)*(double)(24*NPROC)); + + if (fact!=0.0) + { + assign_sd2sd(VOLUME,eta,rsd[0]); + scale_dble(VOLUME,1.0/fact,rsd[0]); + + rho=cgne(VOLUME,1,Dop,Dop_dble,ws,wsd,nmx,res,rsd[0],psi,status); + + scale_dble(VOLUME,fact,psi); + rho*=fact; + } + else + { + status[0]=0; + rho=0.0; + set_sd2zero(VOLUME,psi); + } + + release_wsd(); + release_wsd(); + release_ws(); + + if (status[0]<-1) + { + rho=rho0; + set_sd2zero(VOLUME,psi); + } + + return rho; +} + + +static void Doph(spinor *s,spinor *r) +{ + Dwhat(mus,s,r); + mulg5(VOLUME/2,r); + mus=-mus; +} + + +static void Doph_dble(spinor_dble *s,spinor_dble *r) +{ + Dwhat_dble(mud,s,r); + mulg5_dble(VOLUME/2,r); + mud=-mud; +} + + +double tmcgeo(int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status) +{ + int ifail; + double rho,rho0,fact; + spinor **ws; + spinor_dble **rsd,**wsd; + + rho0=sqrt(norm_square_dble(VOLUME/2,1,eta)); + ifail=sw_term(ODD_PTS); + + if (ifail) + { + status[0]=-2; + rho=rho0; + } + else + { + if (query_flags(U_MATCH_UD)!=1) + assign_ud2u(); + + if ((query_flags(SW_UP2DATE)!=1)|| + (query_flags(SW_E_INVERTED)!=0)||(query_flags(SW_O_INVERTED)!=1)) + assign_swd2sw(); + + ws=reserve_ws(5); + wsd=reserve_wsd(2); + rsd=reserve_wsd(1); + + mus=(float)(mu); + mud=mu; + fact=rho0/sqrt((double)(VOLUME/2)*(double)(24*NPROC)); + + if (fact!=0.0) + { + assign_sd2sd(VOLUME/2,eta,rsd[0]); + scale_dble(VOLUME/2,1.0/fact,rsd[0]); + + rho=cgne(VOLUME/2,1,Doph,Doph_dble,ws,wsd,nmx,res,rsd[0],psi,status); + + scale_dble(VOLUME/2,fact,psi); + rho*=fact; + } + else + { + status[0]=0; + rho=0.0; + set_sd2zero(VOLUME/2,psi); + } + + release_wsd(); + release_wsd(); + release_ws(); + } + + if (status[0]<-1) + { + rho=rho0; + set_sd2zero(VOLUME/2,psi); + } + + return rho; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/tmcgm.c 
b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/tmcgm.c new file mode 100644 index 0000000000000000000000000000000000000000..eecc97ce2f4d8f874db511bc1edbe73bad989b77 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/forces/tmcgm.c @@ -0,0 +1,126 @@ + +/******************************************************************************* +* +* File tmcgm.c +* +* Copyright (C) 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Multi-shift CG solver for the normal even-odd preconditioned Wilson-Dirac +* equation (Dwhat^dag*Dwhat+mu^2)*psi=eta with a twisted-mass term. +* +* The externally accessible function is +* +* void tmcgm(int nmx,double *res,int nmu,double *mu, +* spinor_dble *eta,spinor_dble **psi,int *status) +* Obtains approximate solutions psi[0],..,psi[nmu-1] of the normal +* even-odd preconditioned Wilson-Dirac equation for given source eta +* and nmu values of the twisted-mass parameter mu. See the notes for +* the explanation of the parameters of the program. +* +* Notes: +* +* The program is based on the multi-shift CG algorithm (see linsolv/mscg.c). +* It assumes that the improvement coefficients and the quark mass in the +* SW term have been set through set_lat_parms() and set_sw_parms() (see +* flags/lat_parms.c). +* +* All other parameters are passed through the argument list: +* +* nmx Maximal total number of CG iterations that may be performed. +* +* res Array of the desired maximal relative residues of the +* calculated solutions (nmu elements) +* +* nmu Number of twisted masses mu. +* +* mu Array of the twisted masses (nmu elements) +* +* eta Source field. Note that source fields must respect the chosen +* boundary conditions at time 0 and NPROC0*L0-1, as has to be +* the case for physical quark fields (see doc/dirac.pdf).
+* +* psi Array of the calculated approximate solutions of the Dirac +* equations (Dwhat^dag*Dwhat+mu^2)*psi=eta (nmu elements). +* +* status If the program was able to solve the Dirac equations to the +* desired accuracy, status[0] reports the total number of CG +* iterations that were required. Negative values indicate that +* the program failed (-1: the algorithm did not converge, -2: +* the inversion of the SW term on the odd points was not safe). +* +* The source field eta must be different from psi[0],..,psi[nmu-1]. If +* status[0]>=-1 the calculated approximate solutions are returned. In +* all other cases, the fields are set to zero. +* +* The SW term is recalculated when needed. Evidently the solver is a global +* program that must be called on all processes simultaneously. The required +* workspace is +* +* spinor_dble 3+nmu (5 if nmu=1) +* +* (see utils/wspace.c). +* +*******************************************************************************/ + +#define TMCGM_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "uflds.h" +#include "sflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "linsolv.h" +#include "forces.h" +#include "global.h" + +static int iop=0; + + +static void Dop_dble(double mu,spinor_dble *s,spinor_dble *r) +{ + if (iop==0) + Dwhat_dble(mu,s,r); + else + Dwhat_dble(-mu,s,r); + + mulg5_dble(VOLUME/2,r); + iop^=0x1; +} + + +void tmcgm(int nmx,double *res,int nmu,double *mu, + spinor_dble *eta,spinor_dble **psi,int *status) +{ + int ifail,k; + spinor_dble **wsd; + + ifail=sw_term(ODD_PTS); + + if (ifail) + { + status[0]=-2; + + for (k=0;kr)} +* +* The contribution of the fields r,s to the X vector component on the link +* (x,x+mu) is given by +* +* X=tr{[gamma_5*(1-gamma_mu)*s(x+mu) x r^dag(x)]+(s<->r)} +* +* In all cases, the trace is taken over the Dirac indices only. +* +* The components of the X tensor field are of type u3_alg_dble. 
As in the +* case of symmetric gauge-field tensor, the field array includes additional +* space for the field components on the boundaries of the local lattice +* (see tcharge/ftensor.c and lattice/README.ftidx). The type u3_alg_dble +* is explained in the module su3fcts/su3prod.c. +* +* The programs in this module may perform global operations and must be +* called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define XTENSOR_C + +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "lattice.h" +#include "sw_term.h" +#include "sflds.h" +#include "linalg.h" +#include "forces.h" +#include "global.h" + +#define _u3_alg_mul_add_assign(r,c,s) \ + (r).c1+=(c)*(s).c1; \ + (r).c2+=(c)*(s).c2; \ + (r).c3+=(c)*(s).c3; \ + (r).c4+=(c)*(s).c4; \ + (r).c5+=(c)*(s).c5; \ + (r).c6+=(c)*(s).c6; \ + (r).c7+=(c)*(s).c7; \ + (r).c8+=(c)*(s).c8; \ + (r).c9+=(c)*(s).c9 + +#define _su3_mul_add_assign(r,c,s) \ + (r).c11.re+=(c)*(s).c11.re; \ + (r).c11.im+=(c)*(s).c11.im; \ + (r).c12.re+=(c)*(s).c12.re; \ + (r).c12.im+=(c)*(s).c12.im; \ + (r).c13.re+=(c)*(s).c13.re; \ + (r).c13.im+=(c)*(s).c13.im; \ + (r).c21.re+=(c)*(s).c21.re; \ + (r).c21.im+=(c)*(s).c21.im; \ + (r).c22.re+=(c)*(s).c22.re; \ + (r).c22.im+=(c)*(s).c22.im; \ + (r).c23.re+=(c)*(s).c23.re; \ + (r).c23.im+=(c)*(s).c23.im; \ + (r).c31.re+=(c)*(s).c31.re; \ + (r).c31.im+=(c)*(s).c31.im; \ + (r).c32.re+=(c)*(s).c32.re; \ + (r).c32.im+=(c)*(s).c32.im; \ + (r).c33.re+=(c)*(s).c33.re; \ + (r).c33.im+=(c)*(s).c33.im + +static u3_alg_dble X[6]; +static u3_alg_dble **xts=NULL,**xt; +static const su3_dble ud0={{0.0}}; +static su3_dble w ALIGNED16; +static su3_dble *xvs=NULL,*xv; + + +static void alloc_xts(void) +{ + int n,nt,nxt[6]; + u3_alg_dble **pp,*p; + ftidx_t *idx; + + idx=ftidx(); + nt=0; + + for (n=0;n<6;n++) + { + nxt[n]=VOLUME+idx[n].nft[0]+idx[n].nft[1]; + nt+=nxt[n]; + } + + pp=malloc(12*sizeof(*pp)); + 
p=amalloc(nt*sizeof(*p),ALIGN); + error((pp==NULL)||(p==NULL),1,"alloc_xts [xtensor.c]", + "Unable to allocate field arrays"); + + set_ualg2zero(nt,p); + xts=pp; + xt=pp+6; + + for (n=0;n<6;n++) + { + (*pp)=p; + pp+=1; + p+=nxt[n]; + } +} + + +u3_alg_dble **xtensor(void) +{ + int n; + + if (xts==NULL) + alloc_xts(); + + for (n=0;n<6;n++) + xt[n]=xts[n]; + + return xt; +} + + +void set_xt2zero(void) +{ + int n; + + if (xts==NULL) + alloc_xts(); + else + { + for (n=0;n<6;n++) + set_ualg2zero(VOLUME,xts[n]); + } +} + + +int add_det2xt(double c,ptset_t set) +{ + int n,ifail; + pauli_dble *m,*mm; + + if (set==NO_PTS) + return 0; + + ifail=sw_term(set); + + if (ifail!=0) + return ifail; + + if (xts==NULL) + alloc_xts(); + + if (set==ODD_PTS) + { + for (n=0;n<6;n++) + xt[n]=xts[n]+(VOLUME/2); + + m=swdfld()+VOLUME; + } + else + { + for (n=0;n<6;n++) + xt[n]=xts[n]; + + m=swdfld(); + } + + if (set==ALL_PTS) + mm=m+(2*VOLUME); + else + mm=m+VOLUME; + + for (;m=0.0 sets an upper bound on + the tolerated difference of the boundary values of the gauge field from + the expected ones in the case of SF and open-SF boundary conditions. + +int chs_ubnd(int ibc) + Multiplies the double-precision link variables on the time-like links + at time NPROC0*L0-1 by -1 if the following conditions are met: (1) ibc + and the determinants of the link variables have opposite sign, (2) the + boundary conditions are of type 3 (periodic for the gauge field). The + program returns 1 if the link variables are changed and 0 otherwise. + +void bnd_s2zero(ptset_t set,spinor *s) + Sets the components of the single-precision spinor field s on the + specified set of points at global time 0 and T (in the case of + open boundary conditions) to zero. + +void bnd_sd2zero(ptset_t set,spinor_dble *sd) + Sets the components of the double-precision spinor field sd on the + specified set of points at global time 0 and T (in the case of + open boundary conditions) to zero. 
+ +ftidx_t *ftidx(void) + Returns an array idx[6] of ftidx_t structures containing the offsets + of the field tensor components on the boundaries of the local lattice + (see the file README.ftidx). + +void plaq_ftidx(int n,int ix,int *ip) + Calculates the offsets ip[4] of the field tensor components at the + corners of the (mu,nu)-plaquette at the point in the local lattice + with label ix. The indices (mu,nu) are determined by the parameter + n=0,..,5 (see the notes). + +int ipr_global(int n[]) + This program returns the number of the process with cartesian + coordinates n[0],..,n[3] in the process grid + +void ipt_global(int x[],int *ip,int *ix) + Given the coordinates x[0],..,x[3] of a point on the full lattice, + this program determines the number ip of the process that operates + on the corresponding local lattice and the associated local point + index ix (0<=ix=VOLUME are reserved for copies of the field tensor on the boundaries +of the local lattice in directions +mu and +nu: + + + Cross section of the lattice in the (mu,nu)-plane + + - - - - - - - - - - - - + + * * * * * * * * * * * * + *: local lattice + * * * * * * * * * * * * + + * * * * * * * * * * * * + +: mu-face + * * * * * * * * * * * * + nu + * * * * * * * * * * * * + ^ -: nu-face + * * * * * * * * * * * * + | + * * * * * * * * * * * * + ---> mu + + +The first of them, the "mu-face", includes the points at the (+mu,+nu) +corner of the local lattice. The numbers of points in these faces are +denoted by nft[n][0] and nft[n][1]. + + +Contents of the structures of type ftidx_t +------------------------------------------ + +A structure of type ftidx_t contains index data that refer to some +(mu,nu)-plane. The elements of the structure are + + nft[0]: Number of points in the mu-face. + + nft[1]: Number of points in the nu-face.
+ + ift[0][n]: Offsets of the field components in the local lattice + and the nu-face that correspond to the field components + in the mu-face on the MPI process in direction -mu + (n=0,..,nft[0]-1). + + ift[1][n]: Offsets of the field components in the local lattice + that correspond to the field components in the nu-face + on the MPI process in direction -nu (n=0,..,nft[1]-1). + +Using these index arrays, the field components on the mu- and nu-faces are +easily extracted from the local gauge fields on the neighbouring MPI +processes. + +Note that copying of the field tensor must be performed in a particular order +to ensure that the components at the (+mu,+nu)-corner of the local lattice are +correctly copied. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/README.uidx b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/README.uidx new file mode 100644 index 0000000000000000000000000000000000000000..ac600f7b94260a2156779b7a563a12ce5cd285f4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/README.uidx @@ -0,0 +1,121 @@ + +******************************************************************************** + + Layout of the double-precision gauge field array + +******************************************************************************** + +As explained in main/README.global, the gauge field on the local lattice is +defined by its values on the 8 links attached to the odd lattice points. The +integer offset from the base address of the link variable U(x,mu) at the odd +point x is + + 8*(ix-(VOLUME/2))+2*mu + +while the one of U(x-mu,mu) is + + 8*(ix-(VOLUME/2))+2*mu+1 + +where ix denotes the index of x on the local lattice. + +When the double-precision gauge field is allocated, space is reserved for +further 7*BNDRY/4 link variables at the end of the field array. 
The additional +space is used for copies of the link variables at the external boundaries of +the local lattice in the directions +0,+1,+2 and +3. + +When SF or open-SF boundary conditions are chosen, the boundary values of the +field at time T are stored in 3 link variables appended to the field array on +the MPI processes with cpr[0]=NPROC0-1. The total size of the array in this +case is thus 4*VOLUME+7*(BNDRY/4)+3, while in all other cases it is +4*VOLUME+7*(BNDRY/4). + + +Labeling of the boundary points +------------------------------- + +The faces in direction -0,+0,..,-3,+3 of the local lattice are labeled by an +index ifc=0,1,..,7 and so are its exterior boundaries. In the following, the +term "boundary segment" is used for the set of even (or odd) exterior boundary +points in a given direction ifc. There are thus 16 boundary segments. + +Each point y in a boundary segment has a unique "partner point" x on the local +lattice such that |x-y|=1. The points in the local lattice are totally ordered +by their index ix=0,..,VOLUME-1. It is then natural to label the points y in +the boundary segment by an index ib=0,1,2,.. that respects the order of the +partner points x. + +If x and y are as above, and if y is on the face with label ifc, the index ib +of y is explicitly given by + + ib=iy-ofs[ifc] if y is even, + + ib=iy-ofs[ifc]-BNDRY/2 if y is odd, + +where iy is the index of y and + + ofs[0]=VOLUME + ofs[1]=ofs[0]+FACE0/2 + ofs[2]=ofs[1]+FACE0/2 + ofs[3]=ofs[2]+FACE1/2 + ofs[4]=ofs[3]+FACE1/2 + ofs[5]=ofs[4]+FACE2/2 + ofs[6]=ofs[5]+FACE2/2 + ofs[7]=ofs[6]+FACE3/2 + +Note that + + iy=iup[ix][mu] on the face in direction +mu, + + iy=idn[ix][mu] on the face in direction -mu. + +While the labeling of the points in a boundary segment is always taken to be +the one described here, the 16 boundary segments may be ordered in various +ways depending on the context.
+ + +Boundary fields +--------------- + +Along the faces in direction +mu, two kinds of link variables must be +distinguished (assuming x and y are as above): + +(1) The link variables on the link (x,y), where x is even. Note that these are + not part of the local gauge field. Their total number is equal to half the + number of points on the face. + +(2) The link variables U(y,nu) where nu!=mu. None of these is contained in the + local gauge field. Their total number is 3 times the number of points on + the face. + +The number of all these link variables on the faces in direction +0,+1,+2 and ++3 is thus equal to BNDRY/4+3*BNDRY/2=7*BNDRY/4. + +In the gauge field array, the link variables of type (1) in direction +0,+1,+2 +and +3 come just after the local gauge field at offset=4*VOLUME. Then follow +the link variables of type (2) at the even points y on the face in direction ++0, then those at the odd points y on that face, then those at the even points +y on the face in direction +1, and so on. Within each boundary segment, the +link variables are ordered in the same way as the points y. + + +Contents of the structures of type uidx_t +----------------------------------------- + +A structure of type uidx_t contains index data that refer to the face +in a direction +mu. The elements of the structure are + + nu0: Number of link variables of type (1) on the face. + + nuk: Number of link variables of type (2) on the face. + + iu0[n]: Offsets of the link variables on the neighbouring MPI process in + direction +mu that correspond to the link variables of type (1) + on the face (n=0,..,nu0-1). + + iuk[n]: Offsets of the link variables on the neighbouring MPI process in + direction +mu that correspond to the link variables of type (2) + on the face (n=0,..,nuk-1). + +Using these index arrays, the boundary link variables are easily extracted +from the local gauge fields on the neighbouring MPI processes and copied +to the current process. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/bcnds.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/bcnds.c new file mode 100644 index 0000000000000000000000000000000000000000..c6e5370235f3a4e94b547455b665450fdef363ad --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/bcnds.c @@ -0,0 +1,699 @@ + +/******************************************************************************* +* +* File bcnds.c +* +* Copyright (C) 2005, 2010-2014 Martin Luescher, John Bulava +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Programs related to the boundary conditions in the time direction. +* +* int *bnd_lks(int *n) +* Returns the starting address of an array of length n whose elements +* are the integer offsets of the time-like link variables on the local +* lattice at global time NPROC0*L0-1. +* +* int *bnd_pts(int *n) +* Returns the starting address of an array of length n whose elements +* are the indices of the points on the local lattice at global time 0 +* (boundary conditions type 0,1 or 2) and time NPROC0*L0-1 (boundary +* conditions type 0). The ordering of the indices is such that the n/2 +* even points come first. +* +* void set_bc(void) +* Sets the double-precision link variables at time 0 and T to the +* values required by the chosen boundary conditions (see the notes). +* +* int check_bc(double tol) +* Returns 1 if the double-precision gauge field has the proper boundary +* values and if no active link variables are equal to zero. Otherwise +* the program returns 0. The parameter tol>=0.0 sets an upper bound on +* the tolerated difference of the boundary values of the gauge field from +* the expected ones in the case of SF and open-SF boundary conditions. 
+* +* int chs_ubnd(int ibc) +* Multiplies the double-precision link variables on the time-like links +* at time NPROC0*L0-1 by -1 if the following conditions are met: (1) ibc +* and the determinants of the link variables have opposite sign, (2) the +* boundary conditions are of type 3 (periodic for the gauge field). The +* program returns 1 if the link variables are changed and 0 otherwise. +* +* void bnd_s2zero(ptset_t set,spinor *s) +* Sets the components of the single-precision spinor field s on the +* specified set of points at global time 0 (boundary conditions type +* 0,1 or 2) and time NPROC0*L0-1 (boundary conditions type 0) to zero. +* +* void bnd_sd2zero(ptset_t set,spinor_dble *sd) +* Sets the components of the double-precision spinor field sd on the +* specified set of points at global time 0 (boundary conditions type +* 0,1 or 2) and time NPROC0*L0-1 (boundary conditions type 0) to zero. +* +* Notes: +* +* The time extent T of the lattice is +* +* NPROC0*L0-1 for open boundary conditions, +* +* NPROC0*L0 for SF, open-SF and periodic boundary conditions. +* +* Note that in the latter cases the points at time T are not in the local +* lattice and are omitted in the programs bnd_pts(), bnd_s2zero() and +* bnd_sd2zero(). +* +* The action performed by set_bc() is the following: +* +* Open bc: Set all link variables U(x,0) at time T to zero. +* +* SF bc: Reads the boundary values of the gauge field from the +* data base and assigns them to the link variables at +* time 0 and T. At time T the link variables are stored +* in the buffers appended to the local field on the MPI +* processes where cpr[0]=NPROC0-1. +* +* Open-SF bc: Same as SF bc, but omitting the assignment of the link +* variables at time 0. +* +* Periodic bc: No action is performed. +* +* Then the program checks whether any active link variables are equal to +* zero and, if some are found, aborts the program with an error message. 
+* +* The programs in this module act globally and should be called simultaneously +* on all MPI processes. After the first time, the programs bnd_s2zero() and +* bnd_sd2zero() may be locally called. +* +*******************************************************************************/ + +#define BCNDS_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "uflds.h" +#include "lattice.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +typedef union +{ + su3_dble u; + double r[18]; +} umat_t; + +static int init0=0,nlks,*lks; +static int init1=0,npts,*pts; +static int init2=0; +static const su3_dble ud0={{0.0}}; +static const spinor s0={{{0.0f}}}; +static const spinor_dble sd0={{{0.0}}}; +static su3_dble ubnd[2][3]; + + +static void alloc_lks(void) +{ + int ix,t,*lk; + + error(iup[0][0]==0,1,"alloc_lks [bcnds.c]","Geometry arrays are not set"); + + if ((cpr[0]==0)||(cpr[0]==(NPROC0-1))) + { + if (NPROC0>1) + nlks=(L1*L2*L3)/2; + else + nlks=L1*L2*L3; + + lks=malloc(nlks*sizeof(*lks)); + + if (lks!=NULL) + { + lk=lks; + + for (ix=(VOLUME/2);ix0)&&(lks==NULL),1,"alloc_lks [bcnds.c]", + "Unable to allocate index array"); + init0=1; +} + + +static void alloc_pts(void) +{ + int bc,ix,t,*pt; + + error(iup[0][0]==0,1,"alloc_pts [bcnds.c]","Geometry arrays are not set"); + bc=bc_type(); + + if (((cpr[0]==0)&&(bc!=3))||((cpr[0]==(NPROC0-1))&&(bc==0))) + { + if ((NPROC0==1)&&(bc==0)) + npts=2*L1*L2*L3; + else + npts=L1*L2*L3; + + pts=malloc(npts*sizeof(*pts)); + + if (pts!=NULL) + { + pt=pts; + + for (ix=0;ix0)&&(pts==NULL),1,"alloc_pts [bcnds.c]", + "Unable to allocate index array"); + init1=1; +} + + +int *bnd_lks(int *n) +{ + if (init0==0) + alloc_lks(); + + (*n)=nlks; + + return lks; +} + + +int *bnd_pts(int *n) +{ + if (init1==0) + alloc_pts(); + + (*n)=npts; + + return pts; +} + + +static int is_zero(su3_dble *u) +{ + int i,it; + umat_t *um; + + um=(umat_t*)(u); + it=1; + + for (i=0;i<18;i++) + 
it&=((*um).r[i]==0.0); + + return it; +} + + +static int is_equal(double tol,su3_dble *u,su3_dble *v) +{ + int i,it; + umat_t *um,*vm; + + um=(umat_t*)(u); + vm=(umat_t*)(v); + it=1; + + for (i=0;i<18;i++) + it&=(fabs((*um).r[i]-(*vm).r[i])<=tol); + + return it; +} + + +static int check_zero(int bc) +{ + int it,ix,t,ifc; + su3_dble *u; + + it=1; + u=udfld(); + + for (ix=(VOLUME/2);ix1) + { + dprms[0]=tol; + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + error(dprms[0]!=tol,1,"check_bc [bcnds.c]","Parameter is not global"); + } + + bc=bc_type(); + it=check_zero(bc); + + if (bc==1) + it&=check_SF(tol); + else if (bc==2) + it&=check_openSF(tol); + + if (NPROC>1) + { + is=it; + MPI_Allreduce(&is,&it,1,MPI_INT,MPI_MIN,MPI_COMM_WORLD); + } + + return it; +} + + +static int sdet(su3_dble *u) +{ + double r; + complex_dble z; + + z.re= + (*u).c22.re*(*u).c33.re-(*u).c22.im*(*u).c33.im- + (*u).c32.re*(*u).c23.re+(*u).c32.im*(*u).c23.im; + + z.im= + (*u).c22.re*(*u).c33.im+(*u).c22.im*(*u).c33.re- + (*u).c32.re*(*u).c23.im-(*u).c32.im*(*u).c23.re; + + r=(*u).c11.re*z.re-(*u).c11.im*z.im; + + z.re= + (*u).c32.re*(*u).c13.re-(*u).c32.im*(*u).c13.im- + (*u).c12.re*(*u).c33.re+(*u).c12.im*(*u).c33.im; + + z.im= + (*u).c32.re*(*u).c13.im+(*u).c32.im*(*u).c13.re- + (*u).c12.re*(*u).c33.im-(*u).c12.im*(*u).c33.re; + + r+=((*u).c21.re*z.re-(*u).c21.im*z.im); + + z.re= + (*u).c12.re*(*u).c23.re-(*u).c12.im*(*u).c23.im- + (*u).c22.re*(*u).c13.re+(*u).c22.im*(*u).c13.im; + + z.im= + (*u).c12.re*(*u).c23.im+(*u).c12.im*(*u).c23.re- + (*u).c22.re*(*u).c13.im-(*u).c22.im*(*u).c13.re; + + r+=((*u).c31.re*z.re-(*u).c31.im*z.im); + + if (r>=0.0) + return 1; + else + return -1; +} + + +int chs_ubnd(int ibc) +{ + int iprms[1],i,ich,ichs; + int *lk,*lkm; + su3_dble *ub; + umat_t *um; + + if (bc_type()==3) + { + if (NPROC>1) + { + iprms[0]=ibc; + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + error(iprms[0]!=ibc,1,"chs_ubnd [bcnds.c]", + "Parameter is not global"); + } + + if (init0==0) + 
alloc_lks(); + + if (ibc>=0) + ibc=1; + else + ibc=-1; + + ub=udfld(); + ich=0; + + if (nlks>0) + { + lk=lks; + + if (sdet(ub+(*lk))!=ibc) + { + ich=1; + lkm=lk+nlks; + + for (;lk0) + { + if (set==ALL_PTS) + { + pt=pts; + pm=pts+npts; + } + else if (set==EVEN_PTS) + { + pt=pts; + pm=pts+npts/2; + } + else if (set==ODD_PTS) + { + pt=pts+npts/2; + pm=pts+npts; + } + else + return; + + for (;pt0) + { + if (set==ALL_PTS) + { + pt=pts; + pm=pts+npts; + } + else if (set==EVEN_PTS) + { + pt=pts; + pm=pts+npts/2; + } + else if (set==ODD_PTS) + { + pt=pts+npts/2; + pm=pts+npts; + } + else + return; + + for (;pt F_{mu nu}(x) +* ip[1] -> F_{mu nu}(x+mu) +* ip[2] -> F_{mu nu}(x+nu) +* ip[3] -> F_{mu nu}(x+mu+nu) +* +* In the program plaq_ftidx() it is taken for granted that 0<=ix +#include +#include +#include "su3.h" +#include "utils.h" +#include "lattice.h" +#include "global.h" + +static const int plns[6][2]={{0,1},{0,2},{0,3},{2,3},{3,1},{1,2}}; +static int nfc[4],ofs[4],*cn[6][2],init=0; +static ftidx_t idx[6]; + + +static void set_nft(void) +{ + int bs[4]; + int n,mu,nu; + + bs[0]=L0; + bs[1]=L1; + bs[2]=L2; + bs[3]=L3; + + nfc[0]=FACE0; + nfc[1]=FACE1; + nfc[2]=FACE2; + nfc[3]=FACE3; + + ofs[0]=VOLUME; + ofs[1]=ofs[0]+FACE0; + ofs[2]=ofs[1]+FACE1; + ofs[3]=ofs[2]+FACE2; + + for (n=0;n<6;n++) + { + mu=plns[n][0]; + nu=plns[n][1]; + + idx[n].nft[0]=nfc[mu]; + idx[n].nft[1]=nfc[nu]; + + if (nfc[nu]>0) + idx[n].nft[0]+=(nfc[mu]/bs[nu]); + } +} + + +static void alloc_idx(void) +{ + int n,mu,nu; + int np,*iw; + + set_nft(); + np=0; + + for (n=0;n<6;n++) + np+=(idx[n].nft[0]+idx[n].nft[1]); + + if (BNDRY>0) + { + iw=malloc((np+9*(BNDRY/2))*sizeof(*iw)); + error(iw==NULL,1,"alloc_idx [ftidx.c]", + "Unable to allocate index arrays"); + } + else + iw=NULL; + + for (n=0;n<6;n++) + { + idx[n].ift[0]=iw; + iw+=idx[n].nft[0]; + + idx[n].ift[1]=iw; + iw+=idx[n].nft[1]; + } + + for (n=0;n<6;n++) + { + mu=plns[n][0]; + nu=plns[n][1]; + + cn[n][0]=iw; + iw+=3*nfc[mu]; + + cn[n][1]=iw; + 
iw+=3*nfc[nu]; + } +} + + +static int ibnd(int mu,int iy) +{ + if (iy>(VOLUME+(BNDRY/2))) + return iy-ofs[mu]-BNDRY/2; + else + return iy-ofs[mu]-nfc[mu]/2; +} + + +static void set_idx(void) +{ + int n,mu,nu; + int ix,iy,iw,iz; + int iby,ibw,ibz; + int *ift[2],*cnn[2],nft0,nfc0,icn; + + alloc_idx(); + + for (n=0;n<6;n++) + { + mu=plns[n][0]; + nu=plns[n][1]; + + ift[0]=idx[n].ift[0]; + ift[1]=idx[n].ift[1]; + cnn[0]=cn[n][0]; + cnn[1]=cn[n][1]; + + nft0=idx[n].nft[0]; + nfc0=nfc[mu]; + icn=0; + + for (ix=0;ix=VOLUME) + { + iby=ibnd(mu,iy); + ift[0][iby]=map[iy-VOLUME]; + + if (iw>=VOLUME) + { + ibw=ibnd(nu,iw); + ift[1][ibw]=map[iw-VOLUME]; + + iz=map[iy-VOLUME]; + iz=iup[iz][nu]; + ibz=ibnd(nu,iz); + ift[0][nfc0+icn]=VOLUME+nft0+ibz; + + cnn[0][3*iby ]=VOLUME+iby; + cnn[0][3*iby+1]=VOLUME+nft0+ibw; + cnn[0][3*iby+2]=VOLUME+nfc0+icn; + + cnn[1][3*ibw ]=cnn[0][3*iby ]; + cnn[1][3*ibw+1]=cnn[0][3*iby+1]; + cnn[1][3*ibw+2]=cnn[0][3*iby+2]; + + icn+=1; + } + else + { + iz=iup[iw][mu]; + ibz=ibnd(mu,iz); + + cnn[0][3*iby ]=VOLUME+iby; + cnn[0][3*iby+1]=iw; + cnn[0][3*iby+2]=VOLUME+ibz; + } + } + else if (iw>=VOLUME) + { + ibw=ibnd(nu,iw); + ift[1][ibw]=map[iw-VOLUME]; + + iz=iup[iy][nu]; + ibz=ibnd(nu,iz); + + cnn[1][3*ibw ]=iy; + cnn[1][3*ibw+1]=VOLUME+nft0+ibw; + cnn[1][3*ibw+2]=VOLUME+nft0+ibz; + } + } + } + + init=1; +} + + +ftidx_t *ftidx(void) +{ + if (init==0) + set_idx(); + + return idx; +} + + +void plaq_ftidx(int n,int ix,int *ip) +{ + int mu,nu; + int iy,iw,k; + + if (init==0) + set_idx(); + + mu=plns[n][0]; + nu=plns[n][1]; + + iy=iup[ix][mu]; + iw=iup[ix][nu]; + ip[0]=ix; + + if (iy>=VOLUME) + { + k=3*ibnd(mu,iy); + ip[1]=cn[n][0][k]; + ip[2]=cn[n][0][k+1]; + ip[3]=cn[n][0][k+2]; + } + else if (iw>=VOLUME) + { + k=3*ibnd(nu,iw); + ip[1]=cn[n][1][k]; + ip[2]=cn[n][1][k+1]; + ip[3]=cn[n][1][k+2]; + } + else + { + ip[1]=iy; + ip[2]=iw; + ip[3]=iup[iy][nu]; + } +} diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/geometry.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/geometry.c new file mode 100644 index 0000000000000000000000000000000000000000..345ef95994f878b38aa2835354d174f8a8dc3762 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/geometry.c @@ -0,0 +1,693 @@ + +/******************************************************************************* +* +* File geometry.c +* +* Copyright (C) 2005, 2008, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Programs related to the lattice and block geometry. +* +* The externally accessible functions are +* +* int ipr_global(int *n) +* This program returns the rank of the MPI process with Cartesian +* coordinates n[0],..,n[3] in the process grid. +* +* void ipt_global(int *x,int *ip,int *ix) +* Given the Cartesian coordinates x[0],..,x[3] of a point on the full +* lattice, this program finds the local lattice containing x. On exit +* the rank of the associated MPI process is assigned to ip and the +* local index of the point to ix. +* +* int global_time(int ix) +* Returns the (global) time coordinate of the lattice point with local +* index ix. +* +* void geometry(void) +* Computes the global arrays cpr,npr describing the MPI process grid +* and the index arrays ipt,iup,idn and map that characterize the lattice +* geometry (see main/global.h). +* +* void blk_geometry(block_t *b) +* Computes the index arrays b.ipt,b.iup and b.idn that describe the +* geometry of the block b. +* +* void blk_imbed(block_t *b) +* Computes the index arrays b.imb and b.ibp that describe the +* embedding of the block b in the full lattice. +* +* void bnd_geometry(block_t *b) +* Computes the index arrays bb.ipp and bb.map that describe the +* geometry of the exterior boundaries bb of the block b. 
+* +* void bnd_imbed(block_t *b) +* Computes the index arrays bb.imb that describe the embedding +* of the exterior boundaries bb of the block b in the full lattice. +* +* Notes: +* +* See main/README.global for a description of the lattice geometry and +* block/README.block for explanations of the block structure. +* +* The programs geometry() and blk_geometry() may involve communications and +* must be called simultaneously on all processes. All other programs can be +* called locally. +* +*******************************************************************************/ + +#define GEOMETRY_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "global.h" + +#define NPROC_BLK (NPROC0_BLK*NPROC1_BLK*NPROC2_BLK*NPROC3_BLK) +#define NBLK0 (NPROC0/NPROC0_BLK) +#define NBLK1 (NPROC1/NPROC1_BLK) +#define NBLK2 (NPROC2/NPROC2_BLK) +#define NBLK3 (NPROC3/NPROC3_BLK) + +static int cbs[4],cbn[4],*cbix=NULL; +static int *tms=NULL; + + +int ipr_global(int *n) +{ + int ib,ip; + int n0,n1,n2,n3; + int nb0,nb1,nb2,nb3; + int np0,np1,np2,np3; + + n0=safe_mod(n[0],NPROC0); + n1=safe_mod(n[1],NPROC1); + n2=safe_mod(n[2],NPROC2); + n3=safe_mod(n[3],NPROC3); + + nb0=n0/NPROC0_BLK; + nb1=n1/NPROC1_BLK; + nb2=n2/NPROC2_BLK; + nb3=n3/NPROC3_BLK; + + np0=n0%NPROC0_BLK; + np1=n1%NPROC1_BLK; + np2=n2%NPROC2_BLK; + np3=n3%NPROC3_BLK; + + ib=nb0; + ib=ib*NBLK1+nb1; + ib=ib*NBLK2+nb2; + ib=ib*NBLK3+nb3; + + ip=np0; + ip=ip*NPROC1_BLK+np1; + ip=ip*NPROC2_BLK+np2; + ip=ip*NPROC3_BLK+np3; + + return ip+ib*NPROC_BLK; +} + + +void ipt_global(int *x,int *ip,int *ix) +{ + int x0,x1,x2,x3; + int n[4]; + + x0=safe_mod(x[0],NPROC0*L0); + x1=safe_mod(x[1],NPROC1*L1); + x2=safe_mod(x[2],NPROC2*L2); + x3=safe_mod(x[3],NPROC3*L3); + + n[0]=x0/L0; + n[1]=x1/L1; + n[2]=x2/L2; + n[3]=x3/L3; + + (*ip)=ipr_global(n); + + x0=x0%L0; + x1=x1%L1; + x2=x2%L2; + x3=x3%L3; + + (*ix)=ipt[x3+L3*x2+L2*L3*x1+L1*L2*L3*x0]; +} + + +int 
global_time(int ix) +{ + if ((tms!=NULL)&&(ix>=0)&&(ix=NPROC),1,"set_cpr [geometry.c]", + "Rank of process is out of range"); + + ib=nr/NPROC_BLK; + ip=nr%NPROC_BLK; + + cpr[3]=(ib%NBLK3)*NPROC3_BLK+(ip%NPROC3_BLK); + ib/=NBLK3; + ip/=NPROC3_BLK; + + cpr[2]=(ib%NBLK2)*NPROC2_BLK+(ip%NPROC2_BLK); + ib/=NBLK2; + ip/=NPROC2_BLK; + + cpr[1]=(ib%NBLK1)*NPROC1_BLK+(ip%NPROC1_BLK); + ib/=NBLK1; + ip/=NPROC1_BLK; + + cpr[0]=ib*NPROC0_BLK+ip; +} + + +static void set_npr(void) +{ + int mu,n[4]; + + for (mu=0;mu<4;mu++) + n[mu]=cpr[mu]; + + for (mu=0;mu<4;mu++) + { + n[mu]-=1; + npr[2*mu]=ipr_global(n); + n[mu]+=2; + npr[2*mu+1]=ipr_global(n); + n[mu]-=1; + } +} + + +static void cache_block(int *bs) +{ + int mu; + + cbs[0]=bs[0]; + cbn[0]=1; + + for (mu=1;mu<4;mu++) + { + if ((bs[mu]%4)==0) + cbs[mu]=4; + else if ((bs[mu]%3)==0) + cbs[mu]=3; + else if ((bs[mu]%2)==0) + cbs[mu]=2; + else + cbs[mu]=1; + + cbn[mu]=bs[mu]/cbs[mu]; + } + + if (cbix!=NULL) + free(cbix); + + cbix=malloc(cbs[0]*cbs[1]*cbs[2]*cbs[3]*sizeof(*cbix)); + error(cbix==NULL,1,"cache_block [geometry.c]", + "Unable to allocate auxiliary array"); +} + + +static void set_cbix(void) +{ + int x0,x1,x2,x3; + int ig,iu,ib,is; + + ig=0; + iu=0; + + for (x0=0;x01)) + iup[ix][0]=VOLUME; + if ((x0==0)&&(NPROC0>1)) + idn[ix][0]=VOLUME; + + if ((x1==(L1-1))&&(NPROC1>1)) + iup[ix][1]=VOLUME; + if ((x1==0)&&(NPROC1>1)) + idn[ix][1]=VOLUME; + + if ((x2==(L2-1))&&(NPROC2>1)) + iup[ix][2]=VOLUME; + if ((x2==0)&&(NPROC2>1)) + idn[ix][2]=VOLUME; + + if ((x3==(L3-1))&&(NPROC3>1)) + iup[ix][3]=VOLUME; + if ((x3==0)&&(NPROC3>1)) + idn[ix][3]=VOLUME; + } + } + } + } + + ifc[0]=0; + ifc[1]=ifc[0]+(FACE0/2); + ifc[2]=ifc[1]+(FACE0/2); + ifc[3]=ifc[2]+(FACE1/2); + ifc[4]=ifc[3]+(FACE1/2); + ifc[5]=ifc[4]+(FACE2/2); + ifc[6]=ifc[5]+(FACE2/2); + ifc[7]=ifc[6]+(FACE3/2); + + for (ix=0;ix0) + (*b).idn[ix][0]=index(bo,bs,x0-1,x1,x2,x3); + else + (*b).idn[ix][0]=(*b).vol; + + if ((x1+1)0) + (*b).idn[ix][1]=index(bo,bs,x0,x1-1,x2,x3); + else 
+ (*b).idn[ix][1]=(*b).vol; + + if ((x2+1)0) + (*b).idn[ix][2]=index(bo,bs,x0,x1,x2-1,x3); + else + (*b).idn[ix][2]=(*b).vol; + + if ((x3+1)0) + (*b).idn[ix][3]=index(bo,bs,x0,x1,x2,x3-1); + else + (*b).idn[ix][3]=(*b).vol; + } + } + } + } + + (*b).ipt[(*b).vol]=(*b).ipt[0]; + + free(cbix); + cbix=NULL; +} + + +void blk_imbed(block_t *b) +{ + int *bo,*bs; + int x0,x1,x2,x3; + int ix,iy,ibd,ibu,*ibp; + + bo=(*b).bo; + bs=(*b).bs; + + for (x0=0;x0=VOLUME) + (*bb).ibn=1; + else + (*bb).ibn=0; + + bb+=1; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/uidx.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/uidx.c new file mode 100644 index 0000000000000000000000000000000000000000..0dc239d2cd06e17da07540b964824ced590ccd5d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/lattice/uidx.c @@ -0,0 +1,262 @@ + +/******************************************************************************* +* +* File uidx.c +* +* Copyright (C) 2010, 2011, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Labeling of the link variables on the faces of the local lattice. +* +* The externally accessible functions are +* +* uidx_t *uidx(void) +* Returns an array idx[4] of uidx_t structures containing the offsets +* of the link variables at the faces of the local lattice. +* +* void plaq_uidx(int n,int ix,int *ip) +* Calculates the offsets ip[4] of the links in the (mu,nu)-plaquette at +* the point on the local lattice with label ix. The indices (mu,nu) are +* determined by the parameter n=0,..,5. +* +* Notes: +* +* The layout of the double-precision gauge field array and contents of the +* index structures returned by uidx() are described in the file README.uidx +* in this directory. 
The index arrays calculated by uidx() are determined +* by the local geometry of the lattice and are therefore independent of the +* boundary conditions. +* +* There are six planes +* +* (mu,nu)={(0,1),(0,2),(0,3),(2,3),(3,1),(1,2)} +* +* labeled by an integer n running from 0 to 5 and the links in the +* (mu,nu)-plaquette at the point x are ordered such that +* +* ip[0] -> U(x,mu) +* ip[1] -> U(x+mu,nu) +* ip[2] -> U(x,nu) +* ip[3] -> U(x+nu,mu) +* +* In the program plaq_uidx() it is taken for granted that 0<=ix +#include +#include +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +static const int plns[6][2]={{0,1},{0,2},{0,3},{2,3},{3,1},{1,2}}; +static int type,nfc[4],ofs[4],snu[4],init=0; +static uidx_t idx[4]; + + +static void alloc_idx(void) +{ + int mu,nu0,nuk; + int *iu0,*iuk; + + error(iup[0][0]==0,1,"alloc_idx [uidx.c]", + "Geometry arrays are not set"); + + type=bc_type(); + nfc[0]=FACE0/2; + nfc[1]=FACE1/2; + nfc[2]=FACE2/2; + nfc[3]=FACE3/2; + + ofs[0]=VOLUME+(FACE0/2); + ofs[1]=ofs[0]+(FACE0/2)+(FACE1/2); + ofs[2]=ofs[1]+(FACE1/2)+(FACE2/2); + ofs[3]=ofs[2]+(FACE2/2)+(FACE3/2); + + snu[0]=0; + snu[1]=snu[0]+(FACE0/2); + snu[2]=snu[1]+(FACE1/2); + snu[3]=snu[2]+(FACE2/2); + + if (BNDRY>0) + { + iu0=malloc(7*(BNDRY/4)*sizeof(*iu0)); + error(iu0==NULL,1,"alloc_idx [uidx.c]", + "Unable to allocate index array"); + iuk=iu0+(BNDRY/4); + } + else + { + iu0=NULL; + iuk=NULL; + } + + for (mu=0;mu<4;mu++) + { + nu0=nfc[mu]; + nuk=6*nfc[mu]; + + idx[mu].nu0=nu0; + idx[mu].nuk=nuk; + + if (nu0>0) + { + idx[mu].iu0=iu0; + idx[mu].iuk=iuk; + iu0+=nu0; + iuk+=nuk; + } + else + { + idx[mu].iu0=NULL; + idx[mu].iuk=NULL; + } + } +} + + +static int offset(int ix,int mu) +{ + int iy,ib; + + if (ix<(VOLUME/2)) + { + iy=iup[ix][mu]; + + if (iy=mu); + iuk[3*ib+k]=offset(iz,nu); + } + } + + for (ib=0;ib=mu); + iuk[3*(ib+nu0)+k]=offset(iz,nu); + } + } + } + + init=1; +} + + +uidx_t *uidx(void) +{ 
+ if (init==0) + set_idx(); + + return idx; +} + + +void plaq_uidx(int n,int ix,int *ip) +{ + int mu,nu; + int iy,ic; + + if (init==0) + set_idx(); + + mu=plns[n][0]; + nu=plns[n][1]; + + ip[0]=offset(ix,mu); + + if ((mu==0)&&(global_time(ix)==(N0-1))&&((type==1)||(type==2))) + { + ip[1]=4*VOLUME+7*(BNDRY/4)+nu-1; + } + else + { + iy=iup[ix][mu]; + + if (iymu); + } + } + + ip[2]=offset(ix,nu); + iy=iup[ix][nu]; + + if (iynu); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linalg/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linalg/README new file mode 100644 index 0000000000000000000000000000000000000000..ef307cb4de26df5202e6a985831accfbb5c1d39b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linalg/README @@ -0,0 +1,233 @@ + +******************************************************************************** + + Linear algebra + +******************************************************************************** + + +Files +----- + +cmatrix.c Complex matrix algebra (single-precision version) + +cmatrix_dble.c Complex matrix algebra (double-precision version) + +liealg.c Basic functions for fields with values in the Lie + algebra of SU(3) + +salg.c Generic linear algebra routines for single-precision + spinor fields + +salg_dble.c Generic linear algebra routines for double-precision + spinor fields + +valg.c Generic linear algebra routines for single-precision + complex fields + +valg_dble.c Generic linear algebra routines for double-precision + complex fields + + +Include file +------------ + +The file linalg.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + + +List of functions +----------------- + +void cmat_vec(int n,complex *a,complex *v,complex *w) + Computes w=a*v, where v and w are n-vectors and a an nxn matrix. 
+ +void cmat_vec_assign(int n,complex *a,complex *v,complex *w) + Adds a*v to w, where v and w are n-vectors and a an nxn matrix. + +void cmat_add(int n,complex *a,complex *b,complex *c) + Computes the sum c=a+b of two nxn matrices a and b. + +void cmat_sub(int n,complex *a,complex *b,complex *c) + Computes the difference c=a-b of two nxn matrices a and b. + +void cmat_mul(int n,complex *a,complex *b,complex *c) + Computes the product c=a*b of two nxn matrices a and b. + +void cmat_dag(int n,complex *a,complex *b) + Assigns the hermitian conjugate of a to b. + +void cmat_vec_dble(int n,complex_dble *a,complex_dble *v,complex_dble *w) + Computes w=a*v, where v and w are n-vectors and a an nxn matrix. + +void cmat_vec_assign_dble(int n,complex_dble *a,complex_dble *v, + complex_dble *w) + Adds a*v to w, where v and w are n-vectors and a an nxn matrix. + +void cmat_add_dble(int n,complex_dble *a,complex_dble *b,complex_dble *c) + Computes the sum c=a+b of two nxn matrices a and b. + +void cmat_sub_dble(int n,complex_dble *a,complex_dble *b,complex_dble *c) + Computes the difference c=a-b of two nxn matrices a and b. + +void cmat_mul_dble(int n,complex_dble *a,complex_dble *b,complex_dble *c) + Computes the product c=a*b of two nxn matrices a and b. + +void cmat_dag_dble(int n,complex_dble *a,complex_dble *b) + Assigns the hermitian conjugate of a to b. + +int cmat_inv_dble(int n,complex_dble *a,complex_dble *b,double *k) + Computes the inverse b of the nxn matrix a, using Householder + reflections. The Frobenius condition number k of a is also computed. + A non-zero return value indicates that the input matrix was found to + be singular within rounding errors and that the program terminated + prematurely. + +void random_alg(int vol,su3_alg_dble *X) + Initializes the Lie algebra elements X to random values + with distribution proportional to exp{tr[X^2]}. 
+ +double norm_square_alg(int vol,int icom,su3_alg_dble *X) + Computes the square of the norm of the norm squared of the field X. + +double scalar_prod_alg(int vol,int icom,su3_alg_dble *X,su3_alg_dble *Y) + Computes the scalar product of the fields X and Y. + +void set_alg2zero(int vol,su3_alg_dble *X) + Sets the array elements X to zero. + +void set_ualg2zero(int vol,u3_alg_dble *X) + Sets the array elements X to zero. + +void assign_alg2alg(int vol,su3_alg_dble *X,su3_alg_dble *Y) + Assigns the field X to the field Y. + +void swap_alg(int vol,su3_alg_dble *X,su3_alg_dble *Y) + Swaps the fields X and Y. + +void muladd_assign_alg(int vol,double r,su3_alg_dble *X,su3_alg_dble *Y) + Adds r*X to Y. + +complex spinor_prod(int vol,int icom,spinor *s,spinor *r) + Computes the scalar product of the fields s and r. + +float spinor_prod_re(int vol,int icom,spinor *s,spinor *r) + Computes the real part of the scalar product of the fields + s and r. + +float norm_square(int vol,int icom,spinor *s) + Computes the square of the norm of the field s. + +void mulc_spinor_add(int vol,spinor *s,spinor *r,complex z) + Replaces the field s by s+z*r. + +void mulr_spinor_add(int vol,spinor *s,spinor *r,float c) + Replaces the field s by s+c*r. + +void project(int vol,int icom,spinor *s,spinor *r) + Replaces the field s by s-(r,s)*r. + +void scale(int vol,float c,spinor *s) + Replaces the field s by c*s. + +float normalize(int vol,int icom,spinor *s) + Replaces the field s by s/||s|| and returns the norm ||s||. 
+ +void rotate(int vol,int n,spinor **ppk,complex *v) + Replaces the fields pk[] by sum_j pj*v[n*j+k] where 0<=k,j +#include +#include +#include "su3.h" +#include "utils.h" +#include "linalg.h" + +#if (defined AVX) +#include "avx.h" + +void cmat_vec(int n,complex *a,complex *v,complex *w) +{ + complex *b,*vv,*vm,*wm; + + if ((n&0x3)==0x0) + { + vm=v+n; + wm=w+n; + b=a; + + for (;w +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "linalg.h" + +#ifndef ALIGN +#define ALIGN 6 +#endif + +static int nmax=0; +static double *rsv; +static complex_dble *dsv; + +#if (defined AVX) +#include "avx.h" + +void cmat_vec_dble(int n,complex_dble *a,complex_dble *v,complex_dble *w) +{ + complex_dble *vv,*vm,*wm;; + + if ((n&0x3)==0x0) + { + vm=v+n; + wm=w+n; + + for (;w0) + { + nmax=0; + afree(rsv); + afree(dsv); + rsv=NULL; + dsv=NULL; + } + + if (n>0) + { + rsv=amalloc(n*sizeof(*rsv),ALIGN); + dsv=amalloc(n*sizeof(*dsv),ALIGN); + + if (error_loc((rsv==NULL)||(dsv==NULL),1,"alloc_arrays [cmatrix_dble.c]", + "Unable to allocate auxiliary arrays")==0) + { + nmax=n; + return 0; + } + else + { + if (rsv!=NULL) + afree(rsv); + if (dsv!=NULL) + afree(dsv); + rsv=NULL; + dsv=NULL; + + return 1; + } + } + + return 0; +} + + +static int fwd_house(int n,complex_dble *a,complex_dble *b,double *fnsq) +{ + int i,j,k; + double eps,r1,r2,r3; + complex_dble z,*bb,*bm,*bk,*bj; + + *fnsq=0.0; + bm=b+n*n; + + for (bb=b;bb=eps) + r1=sqrt(r1); + else + return 3; + + if (r2>=(DBL_EPSILON*r1)) + { + r3=1.0/r2; + z.re=r3*b[n*k+k].re; + z.im=r3*b[n*k+k].im; + } + else + { + z.re=1.0; + z.im=0.0; + } + + b[n*k+k].re+=r1*z.re; + b[n*k+k].im+=r1*z.im; + + r3=1.0/(r1*(r1+r2)); + rsv[k]=r3; + dsv[k].re=-(r1+r2)*r3*z.re; + dsv[k].im= (r1+r2)*r3*z.im; + + for (j=(k+1);j=eps) + r1=1.0/r1; + else + return 3; + + dsv[n-1].re= r1*(*bb).re; + dsv[n-1].im=-r1*(*bb).im; + + return 0; +} + + +static void solv_sys(int n,complex_dble *b) +{ + int i,j,k; + complex_dble *bi,*bk,z; + + for 
(k=(n-1);k>0;k--) + { + for (i=(k-1);i>=0;i--) + { + bi=b+n*i+k; + bk=b+n*k-n+k; + z.re=(*bi).re*dsv[k].re-(*bi).im*dsv[k].im; + z.im=(*bi).re*dsv[k].im+(*bi).im*dsv[k].re; + + for (j=(k-1);j>i;j--) + { + bi-=1; + z.re+=((*bi).re*(*bk).re-(*bi).im*(*bk).im); + z.im+=((*bi).re*(*bk).im+(*bi).im*(*bk).re); + bk-=n; + } + + (*bk).re=-dsv[i].re*z.re+dsv[i].im*z.im; + (*bk).im=-dsv[i].re*z.im-dsv[i].im*z.re; + } + } +} + + +static void bck_house(int n,complex_dble *b) +{ + int i,j,k; + complex_dble *bi,*dj,z; + + b[n*n-1].re=dsv[n-1].re; + b[n*n-1].im=dsv[n-1].im; + + for (k=(n-2);k>=0;k--) + { + z.re=dsv[k].re; + z.im=dsv[k].im; + dsv[k].re=b[n*k+k].re; + dsv[k].im=b[n*k+k].im; + b[n*k+k].re=z.re; + b[n*k+k].im=z.im; + + for (j=(k+1);jnmax) + { + if (alloc_arrays(n)!=0) + return 1; + } + + ie=fwd_house(n,a,b,&fnsq); + + if (ie!=0) + return ie; + + solv_sys(n,b); + bck_house(n,b); + + bb=b; + bm=bb+n*n; + fnsqi=0.0; + + for (;bb +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "random.h" +#include "linalg.h" +#include "global.h" + +#define MAX_LEVELS 12 +#define BLK_LENGTH 8 + +static int cnt[MAX_LEVELS]; +static double smx[MAX_LEVELS]; +static double c1=0.0,c2,c3,rb[8],sm; + + +void random_alg(int vol,su3_alg_dble *X) +{ + su3_alg_dble *Xm; + + if (c1==0.0) + { + c1=(sqrt(3.0)+1.0)/6.0; + c2=(sqrt(3.0)-1.0)/6.0; + c3=1.0/sqrt(2.0); + } + + Xm=X+vol; + + for (;X=BLK_LENGTH)&&(n1)) + { + sm=smx[0]; + MPI_Reduce(&sm,smx,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(smx,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return 4.0*smx[0]; +} + + +double scalar_prod_alg(int vol,int icom,su3_alg_dble *X,su3_alg_dble *Y) +{ + int n; + su3_alg_dble *Xm; + + for (n=0;n=BLK_LENGTH)&&(n1)) + { + sm=smx[0]; + MPI_Reduce(&sm,smx,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(smx,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + + return smx[0]; +} + + +void set_alg2zero(int vol,su3_alg_dble *X) +{ + su3_alg_dble *Xm; + + Xm=X+vol; + + for (;X +#include 
+#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "global.h" + +static int nrot=0,ifail=0; +static spinor *psi; + + +static void alloc_wrotate(int n) +{ + if (nrot>0) + afree(psi); + + psi=amalloc(n*sizeof(*psi),ALIGN); + + if (psi==NULL) + { + error_loc(1,1,"alloc_wrotate [salg.c]", + "Unable to allocate workspace"); + nrot=0; + ifail=1; + } + else + { + nrot=n; + set_s2zero(n,psi); + } +} + +#if (defined AVX) +#include "avx.h" + +complex spinor_prod(int vol,int icom,spinor *s,spinor *r) +{ + complex z; + complex_dble v,w; + spinor *sm; + + __asm__ __volatile__ ("vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" + "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" + "vxorps %%ymm12, %%ymm12, %%ymm12 \n\t" + "vxorps %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorps %%ymm14, %%ymm14, %%ymm14" + : + : + : + "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14"); + + sm=s+vol; + + for (;s1)) + { + mpc_gsum_d(&v.re,&w.re,2); + + z.re=(float)(w.re); + z.im=(float)(w.im); + } + else + { + z.re=(float)(v.re); + z.im=(float)(v.im); + } + + return z; +} + + +float spinor_prod_re(int vol,int icom,spinor *s,spinor *r) +{ + double x,y; + spinor *sm; + + __asm__ __volatile__ ("vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" + "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" + : + : + : + "xmm9", "xmm10", "xmm11"); + + sm=s+vol; + + for (;s1)) + { + mpc_gsum_d(&x,&y,1); + return (float)(y); + } + else + return (float)(x); +} + + +float norm_square(int vol,int icom,spinor *s) +{ + double x,y; + spinor *sm; + + __asm__ __volatile__ ("vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" + "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" + : + : + : + "xmm9", "xmm10", "xmm11"); + + sm=s+vol; + + for (;s1)) + { + mpc_gsum_d(&x,&y,1); + return (float)(y); + } + else + return (float)(x); +} + + +void mulc_spinor_add(int vol,spinor *s,spinor *r,complex z) +{ + spinor *sm; + + 
_avx_load_cmplx_up(z); + sm=s+vol; + + for (;snrot)&&(ifail==0)) + alloc_wrotate(n); + + if ((n>0)&&(ifail==0)) + { + for (ix=0;ix1)) + { + v.re=x; + v.im=-y; + + mpc_gsum_d(&v.re,&w.re,2); + + z.re=(float)(w.re); + z.im=(float)(w.im); + } + else + { + z.re=(float)(x); + z.im=(float)(-y); + } + + return z; +} + + +float spinor_prod_re(int vol,int icom,spinor *s,spinor *r) +{ + double x,y; + spinor *sm; + + __asm__ __volatile__ ("xorpd %%xmm10, %%xmm10 \n\t" + "xorpd %%xmm11, %%xmm11 \n\t" + "xorpd %%xmm12, %%xmm12" + : + : + : + "xmm10", "xmm11", "xmm12"); + + sm=s+vol; + + for (;s1)) + { + mpc_gsum_d(&x,&y,1); + return (float)(y); + } + else + return (float)(x); +} + + +float norm_square(int vol,int icom,spinor *s) +{ + double x,y; + spinor *sm; + + __asm__ __volatile__ ("xorpd %%xmm10, %%xmm10 \n\t" + "xorpd %%xmm11, %%xmm11 \n\t" + "xorpd %%xmm12, %%xmm12" + : + : + : + "xmm10", "xmm11", "xmm12"); + + sm=s+vol; + + for (;s1)) + { + mpc_gsum_d(&x,&y,1); + return (float)(y); + } + else + return (float)(x); +} + + +void mulc_spinor_add(int vol,spinor *s,spinor *r,complex z) +{ + spinor *sm; + + _sse_load_cmplx(z); + sm=s+vol; + + for (;snrot)&&(ifail==0)) + alloc_wrotate(n); + + if ((n>0)&&(ifail==0)) + { + for (ix=0;ixnrot)&&(ifail==0)) + alloc_wrotate(n); + + if ((n>0)&&(ifail==0)) + { + for (ix=0;ixnrot)&&(ifail==0)) + alloc_wrotate(n); + + if ((n>0)&&(ifail==0)) + { + for (ix=0;ix +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "global.h" + +#define MAX_LEVELS 12 +#define BLK_LENGTH 8 + +static int nrot=0,ifail=0; +static int cnt[MAX_LEVELS]; +static double smx[MAX_LEVELS] ALIGNED16; +#if (defined QPX) +static double smy[MAX_LEVELS] ALIGNED16; +#endif +static complex_dble smz[MAX_LEVELS] ALIGNED16; +static spinor_dble *psi; + + +static void alloc_wrotate(int n) +{ + if (nrot>0) + afree(psi); + + psi=amalloc(n*sizeof(*psi),ALIGN); + + if (psi==NULL) + { + error_loc(1,1,"alloc_wrotate 
[salg_dble.c]", + "Unable to allocate workspace"); + nrot=0; + ifail=1; + } + else + { + nrot=n; + set_sd2zero(n,psi); + } +} + +#if (defined AVX) +#include "avx.h" + +complex_dble spinor_prod_dble(int vol,int icom,spinor_dble *s,spinor_dble *r) +{ + int n; + complex_dble w,z; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + __asm__ __volatile__ ("vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5" + : + : + : + "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&v.re,&w.re,2); + return z; + } + else + return w; +} + + +double spinor_prod_re_dble(int vol,int icom,spinor_dble *s,spinor_dble *r) +{ + int n; + double x,y; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + __asm__ __volatile__ ("vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + : + : + : + "xmm0", "xmm1", "xmm2"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&x,&y,1); + return y; + } + else + return x; +} + + +complex_dble spinor_prod5_dble(int vol,int icom,spinor_dble *s,spinor_dble *r) +{ + int n; + complex_dble w,z; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + __asm__ __volatile__ ("vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vxorpd %%ymm1, %%ymm1, %%ymm1 \n\t" + "vxorpd %%ymm2, %%ymm2, %%ymm2 \n\t" + "vxorpd %%ymm3, %%ymm3, %%ymm3 \n\t" + "vxorpd %%ymm4, %%ymm4, %%ymm4 \n\t" + "vxorpd %%ymm5, %%ymm5, %%ymm5" + : + : + : + "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&w.re,&z.re,2); + return z; + } + else + return w; +} + + +double norm_square_dble(int vol,int icom,spinor_dble *s) +{ + int n; + double x,y; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + __asm__ __volatile__ ("vxorpd %%ymm6, %%ymm6, %%ymm6 \n\t" + "vxorpd %%ymm7, 
%%ymm7, %%ymm7 \n\t" + "vxorpd %%ymm8, %%ymm8, %%ymm8 \n\t" + "vxorpd %%ymm9, %%ymm9, %%ymm9 \n\t" + "vxorpd %%ymm10, %%ymm10, %%ymm10 \n\t" + "vxorpd %%ymm11, %%ymm11, %%ymm11" + : + : + : + "xmm6", "xmm7", "xmm8", + "xmm9", "xmm10", "xmm11"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&y,&x,1); + return x; + } + else + return y; +} + + +void mulc_spinor_add_dble(int vol,spinor_dble *s,spinor_dble *r, + complex_dble z) +{ + spinor_dble *sm; + + _avx_load_cmplx_up_dble(z); + sm=s+vol; + + for (;snrot)&&(ifail==0)) + alloc_wrotate(n); + + if ((n>0)&&(ifail==0)) + { + for (ix=0;ixsm) + smb=sm; + + __asm__ __volatile__ ("xorpd %%xmm6, %%xmm6 \n\t" + "xorpd %%xmm7, %%xmm7 \n\t" + "xorpd %%xmm8, %%xmm8 \n\t" + "xorpd %%xmm9, %%xmm9 \n\t" + "xorpd %%xmm10, %%xmm10 \n\t" + "xorpd %%xmm11, %%xmm11" + : + : + : + "xmm6", "xmm7", "xmm8", + "xmm9", "xmm10", "xmm11"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&w.re,&z.re,2); + return z; + } + else + return w; +} + + +double spinor_prod_re_dble(int vol,int icom,spinor_dble *s,spinor_dble *r) +{ + int n; + double x,y; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + __asm__ __volatile__ ("xorpd %%xmm6, %%xmm6 \n\t" + "xorpd %%xmm7, %%xmm7 \n\t" + "xorpd %%xmm8, %%xmm8" + : + : + : + "xmm6", "xmm7", "xmm8"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&y,&x,1); + return x; + } + else + return y; +} + + +complex_dble spinor_prod5_dble(int vol,int icom,spinor_dble *s,spinor_dble *r) +{ + int n; + complex_dble w,z; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + __asm__ __volatile__ ("xorpd %%xmm6, %%xmm6 \n\t" + "xorpd %%xmm7, %%xmm7 \n\t" + "xorpd %%xmm8, %%xmm8 \n\t" + "xorpd %%xmm9, %%xmm9 \n\t" + "xorpd %%xmm10, %%xmm10 \n\t" + "xorpd %%xmm11, %%xmm11" + : + : + : + "xmm6", "xmm7", "xmm8", + "xmm9", "xmm10", "xmm11"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&w.re,&z.re,2); + return z; + } + else + return w; +} + + +double norm_square_dble(int vol,int icom,spinor_dble *s) +{ + int n; + 
double x,y; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + __asm__ __volatile__ ("xorpd %%xmm6, %%xmm6 \n\t" + "xorpd %%xmm7, %%xmm7 \n\t" + "xorpd %%xmm8, %%xmm8" + : + : + : + "xmm6", "xmm7", "xmm8"); + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&y,&x,1); + return x; + } + else + return y; +} + + +void mulc_spinor_add_dble(int vol,spinor_dble *s,spinor_dble *r, + complex_dble z) +{ + spinor_dble *sm; + + _sse_load_cmplx_dble(z); + sm=s+vol; + + for (;snrot)&&(ifail==0)) + alloc_wrotate(n); + + if ((n>0)&&(ifail==0)) + { + for (ix=0;ixsm) + smb=sm; + + x=0.0; + y=0.0; + + for (;s=BLK_LENGTH)&&(nsm) + smb=sm; + + x=0.0; + + for (;s=BLK_LENGTH)&&(nsm) + smb=sm; + + x=0.0; + y=0.0; + + for (;s=BLK_LENGTH)&&(nsm) + smb=sm; + + x=0.0; + + for (;s=BLK_LENGTH)&&(nnrot)&&(ifail==0)) + alloc_wrotate(n); + + if ((n>0)&&(ifail==0)) + { + for (ix=0;ixsm) + smb=sm; + + z.re=0.0; + z.im=0.0; + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&w.re,&z.re,2); + return z; + } + else + return w; +} + + +double spinor_prod_re_dble(int vol,int icom,spinor_dble *s,spinor_dble *r) +{ + int n; + double x,y; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + x=0.0; + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&y,&x,1); + return x; + } + else + return y; +} + + +complex_dble spinor_prod5_dble(int vol,int icom,spinor_dble *s,spinor_dble *r) +{ + int n; + complex_dble w,z; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + z.re=0.0; + z.im=0.0; + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&w.re,&z.re,2); + return z; + } + else + return w; +} + + +double norm_square_dble(int vol,int icom,spinor_dble *s) +{ + int n; + double x,y; + spinor_dble *sm,*smb; + + for (n=0;nsm) + smb=sm; + + x=0.0; + + for (;s=BLK_LENGTH)&&(n1)) + { + mpc_gsum_d(&y,&x,2); + return x; + } + else + return y; +} + + +void mulc_spinor_add_dble(int vol,spinor_dble *s,spinor_dble *r, + complex_dble z) +{ + spinor_dble *sm; + + sm=s+vol; + + for (;snrot)&&(ifail==0)) + alloc_wrotate(n); + + if 
((n>0)&&(ifail==0)) + { + for (ix=0;ix +#include +#include +#include "mpi.h" +#include "utils.h" +#include "linalg.h" +#include "global.h" + +static int nrot=0,ifail=0; +static complex *psi; + + +static void alloc_wrotate(int n) +{ + if (nrot>0) + afree(psi); + + psi=amalloc(n*sizeof(*psi),ALIGN); + + if (psi==NULL) + { + error_loc(1,1,"alloc_wrotate [valg.c]","Unable to allocate workspace"); + nrot=0; + ifail=1; + } + else + nrot=n; +} + + +complex vprod(int n,int icom,complex *v,complex *w) +{ + complex z,*vm; + complex_dble vd,wd; + + vd.re=0.0; + vd.im=0.0; + vm=v+n; + + for (;vnrot)&&(ifail==0)) + alloc_wrotate(nv); + + if ((nv>0)&&(ifail==0)) + { + for (i=0;i +#include +#include +#include "mpi.h" +#include "utils.h" +#include "linalg.h" +#include "global.h" + +#define MAX_LEVELS 8 +#define BLK_LENGTH 32 + +static int nrot=0,ifail=0; +static int cnt[MAX_LEVELS]; +static double smx[MAX_LEVELS],smy[MAX_LEVELS]; +static complex_dble *psi; + + +static void alloc_wrotate(int n) +{ + if (nrot>0) + afree(psi); + + psi=amalloc(n*sizeof(*psi),ALIGN); + + if (psi==NULL) + { + error_loc(1,1,"alloc_wrotate [valg_dble.c]", + "Unable to allocate workspace"); + nrot=0; + ifail=1; + } + else + nrot=n; +} + + +complex_dble vprod_dble(int n,int icom,complex_dble *v,complex_dble *w) +{ + int k; + complex_dble s,t; + complex_dble *vm,*vb; + + for (k=0;kvm) + vb=vm; + s.re=0.0; + s.im=0.0; + + for (;v=BLK_LENGTH)&&(kvm) + vb=vm; + s=0.0; + + for (;v=BLK_LENGTH)&&(knrot)&&(ifail==0)) + alloc_wrotate(nv); + + if ((nv>0)&&(ifail==0)) + { + for (i=0;i +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "linsolv.h" +#include "global.h" + +#define PRECISION_LIMIT ((double)(100.0f*FLT_EPSILON)) + +static float rsq,rsq_old,ai,bi; +static spinor *psx,*psr,*psp,*psap,*psw; +static spinor_dble *pdb,*pdx,*pdw,*pdv; + +#if (defined x64) +#include "sse2.h" + +static void update_g(int vol) +{ + float c; + spinor 
*r,*s,*sm; + + c=-ai; + + __asm__ __volatile__ ("movss %0, %%xmm6 \n\t" + "shufps $0x0, %%xmm6, %%xmm6 \n\t" + "movaps %%xmm6, %%xmm7 \n\t" + "movaps %%xmm6, %%xmm8" + : + : + "m" (c) + : + "xmm6", "xmm7", "xmm8"); + + r=psr; + s=psap; + sm=s+vol; + + for (;s1)) + { + iprms[0]=vol; + iprms[1]=nmx; + dprms[0]=res; + + MPI_Bcast(iprms,2,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=vol)||(iprms[1]!=nmx)||(dprms[0]!=res),1, + "cgne [cgne.c]","Parameters are not global"); + + error_root((vol<=0)||(nmx<1)||(res<=DBL_EPSILON),1, + "cgne [cgne.c]","Improper choice of vol,nmx or res"); + } + else + { + if ((vol<=0)||(nmx<1)||(res<=DBL_EPSILON)) + { + error_loc(1,1,"cgne [cgne.c]", + "Improper choice of vol,nmx or res"); + (*status)=0; + return 1.0; + } + } + + cg_init(vol,icom,ws,wsd,eta,psi); + rn=sqrt((double)(rsq)); + tol=res*rn; + (*status)=0; + + xn=(double)(norm_square(vol,icom,psx)); + xn=sqrt(xn); + + while (rn>tol) + { +#ifdef CGNE_DBG + message("[cgne]: rn_old = %.2e\n",rn); +#endif + ncg=0; + + for (;;) + { + cg_step(vol,icom,Dop); + ncg+=1; + (*status)+=1; + + xn=(double)(norm_square(vol,icom,psx)); + xn=sqrt(xn); + rn=sqrt((double)(rsq)); +#ifdef CGNE_DBG + message("[cgne]: ncg = %d, xn = %.2e, rn = %.2e\n",(*status),xn,rn); +#endif + if ((rn<=tol)||(rn<=(PRECISION_LIMIT*xn))||(ncg>=100)|| + ((*status)>=nmx)) + break; + } + + add_s2sd(vol,psx,pdx); + xn=norm_square_dble(vol,icom,pdx); + xn=sqrt(xn); + cg_reset(vol,icom,Dop,Dop_dble); + rn=sqrt((double)(rsq)); + + if (((*status)>=nmx)&&(rn>tol)) + { + (*status)=-1; + break; + } + + if ((100.0*DBL_EPSILON*xn)>tol) + { + (*status)=-2; + break; + } + } + + return rn; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/fgcr.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/fgcr.c new file mode 100644 index 0000000000000000000000000000000000000000..3f50ce946a0d35191ea5c105f19d784530df45b6 --- 
/dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/fgcr.c @@ -0,0 +1,300 @@ + +/******************************************************************************* +* +* File fgcr.c +* +* Copyright (C) 2005, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Generic flexible GCR solver program for the lattice Dirac equation. +* +* The externally accessible function is +* +* double fgcr(int vol,int icom, +* void (*Dop)(spinor_dble *s,spinor_dble *r), +* void (*Mop)(int k,spinor *rho,spinor *phi,spinor *chi), +* spinor **ws,spinor_dble **wsd,int nkv,int nmx,double res, +* spinor_dble *eta,spinor_dble *psi,int *status) +* Solution of the Dirac equation D*psi=eta for given source eta, using +* the preconditioned GCR algorithm. See the notes for the explanation +* of the parameters of the program. +* +* Notes: +* +* This program uses single-precision arithmetic to reduce the execution +* time, but obtains the solution with double-precision accuracy. +* +* The programs Dop() and Mop() for the operator D and the preconditioner M +* are assumed to have the following properties: +* +* void Dop(spinor_dble *s,spinor_dble *r) +* Application of the operator D to the Dirac field s and assignment of +* the result to r. On exit s may be changed but must satisfy D*s=r. +* +* void Mop(int k,spinor *rho,spinor *phi,spinor *chi) +* Approximate solution of the equation D*phi=rho in the k'th step of +* the GCR algorithm. On exit rho is unchanged and chi=D*phi. +* +* Mop() is not required to be a linear operator and may involve an iterative +* procedure with a dynamical stopping criterion, for example. The field phi +* merely defines the next search direction and can in principle be chosen +* arbitrarily. +* +* The other parameters of the program fgcr() are: +* +* vol Number of spinors in the Dirac fields. 
+* +* icom Indicates whether the equation to be solved is a local +* equation (icom=0) or a global one (icom=1). Scalar products +* are summed over all MPI processes if icom=1, while no +* communications are performed if icom=0. +* +* nkv Maximal number of Krylov vectors generated before the GCR +* algorithm is restarted. +* +* nmx Maximal total number of Krylov vectors that may be generated. +* +* res Desired maximal relative residue |eta-D*psi|/|eta| of the +* calculated solution. +* +* ws Array of at least 2*nkv+1 single-precision spinor fields +* (used as work space). +* +* wsd Array of at least 1 double-precision spinor field (used +* as work space). +* +* eta Source field (unchanged on exit). +* +* psi Calculated approximate solution of the Dirac equation +* D*psi=eta. +* +* status On exit, this parameter reports the total number of Krylov +* vectors that were generated, or a negative value if the +* program failed. +* +* Independently of whether the program succeeds in solving the Dirac equation +* to the desired accuracy, the program returns the norm of the residue of +* the field psi. +* +* Some debugging output is printed to stdout on process 0 if FGCR_DBG is +* defined at compilation time. 
+* +*******************************************************************************/ + +#define FGCR_C + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "linsolv.h" +#include "global.h" + +#define PRECISION_LIMIT ((double)(100.0f*FLT_EPSILON)) + +static int nkm=0; +static float *b; +static complex *a,*c; +static double rn; +static spinor **phi,**chi,*rho; +static spinor_dble *wrk; + + +static int alloc_arrays(int nkv) +{ + if (nkm>0) + { + afree(a); + afree(b); + } + + a=amalloc(nkv*(nkv+1)*sizeof(*a),ALIGN); + b=amalloc(nkv*sizeof(*b),ALIGN); + + if ((a==NULL)||(b==NULL)) + return 1; + + c=a+nkv*nkv; + nkm=nkv; + + return 0; +} + + +static void gcr_init(int vol,int icom,int nkv,spinor **ws,spinor_dble **wsd, + spinor_dble *eta,spinor_dble *psi) +{ + phi=ws; + rho=ws[nkv]; + chi=ws+nkv+1; + wrk=wsd[0]; + + set_sd2zero(vol,psi); + assign_sd2s(vol,eta,rho); + + rn=(double)(norm_square(vol,icom,rho)); + rn=sqrt(rn); +} + + +static void gcr_step(int vol,int icom,int k,int nkv, + void (*Mop)(int k,spinor *rho,spinor *phi,spinor *chi)) +{ + int l; + complex z; + + (*Mop)(k,rho,phi[k],chi[k]); + + for (l=0;l=0;l--) + { + z.re=c[l].re; + z.im=c[l].im; + + for (i=(l+1);i<=k;i++) + { + z.re-=(a[l*nkv+i].re*c[i].re-a[l*nkv+i].im*c[i].im); + z.im-=(a[l*nkv+i].re*c[i].im+a[l*nkv+i].im*c[i].re); + } + + r=1.0f/b[l]; + c[l].re=z.re*r; + c[l].im=z.im*r; + } + + set_s2zero(vol,rho); + + for (l=k;l>=0;l--) + mulc_spinor_add(vol,rho,phi[l],c[l]); + + add_s2sd(vol,rho,psi); + (*Dop)(psi,wrk); + diff_sd2s(vol,eta,wrk,rho); + + rn=(double)(norm_square(vol,icom,rho)); + rn=sqrt(rn); +} + + +double fgcr(int vol,int icom, + void (*Dop)(spinor_dble *s,spinor_dble *r), + void (*Mop)(int k,spinor *eta,spinor *psi,spinor *chi), + spinor **ws,spinor_dble **wsd,int nkv,int nmx,double res, + spinor_dble *eta,spinor_dble *psi,int *status) +{ + int ie,k,iprms[3]; + double rn_old,tol,dprms[1]; + + if 
((icom==1)&&(NPROC>1)) + { + iprms[0]=vol; + iprms[1]=nkv; + iprms[2]=nmx; + dprms[0]=res; + + MPI_Bcast(iprms,3,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=vol)||(iprms[1]!=nkv)||(iprms[2]!=nmx)|| + (dprms[0]!=res),1,"fgcr [fgcr.c]","Parameters are not global"); + + error_root((vol<=0)||(nkv<1)||(nmx<1)||(res<=DBL_EPSILON),1, + "fgcr [fgcr.c]","Improper choice of vol,nkv,nmx or res"); + + if (nkv>nkm) + { + ie=alloc_arrays(nkv); + error(ie,1,"fgcr [fgcr.c]","Unable to allocate auxiliary arrays"); + } + } + else + { + if ((vol<=0)||(nkv<1)||(nmx<1)||(res<=DBL_EPSILON)) + { + error_loc(1,1,"fgcr [fgcr.c]", + "Improper choice of vol,nkv,nmx or res"); + (*status)=0; + return 1.0; + } + + if (nkv>nkm) + { + ie=alloc_arrays(nkv); + + if (ie) + { + error_loc(1,1,"fgcr [fgcr.c]", + "Unable to allocate auxiliary arrays"); + (*status)=0; + return 1.0; + } + } + } + + gcr_init(vol,icom,nkv,ws,wsd,eta,psi); + tol=res*rn; + (*status)=0; + + while (rn>tol) + { +#ifdef FGCR_DBG + message("[fgcr]: rn_old = %.2e\n",rn); +#endif + rn_old=rn; + + for (k=0;;k++) + { + gcr_step(vol,icom,k,nkv,Mop); + (*status)+=1; +#ifdef FGCR_DBG + message("[fgcr]: k = %d, rn = %.2e\n",k,rn); +#endif + if ((rn<=tol)||(rn<(PRECISION_LIMIT*rn_old))|| + ((k+1)==nkv)||((*status)==nmx)) + break; + } + + update_psi(vol,icom,k,nkv,eta,psi,Dop); + + if (((*status)==nmx)&&(rn>tol)) + { + (*status)=-1; + return rn; + } + } + + return rn; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/fgcr4vd.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/fgcr4vd.c new file mode 100644 index 0000000000000000000000000000000000000000..fd5cf8dfc81eacbf0c0234d9ddcd6dd761270416 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/fgcr4vd.c @@ -0,0 +1,354 @@ + +/******************************************************************************* +* +* File 
fgcr4vd.c +* +* Copyright (C) 2007, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Generic flexible GCR solver program for the little Dirac equation. +* +* The externally accessible function is +* +* double fgcr4vd(int vol,int icom, +* void (*Dop)(complex_dble *v,complex_dble *w), +* void (*Mop)(int k,complex *rho,complex *phi,complex *chi), +* complex *wv[],complex_dble *wvd[],int nkv,int nmx,double res, +* complex_dble *eta,complex_dble *psi,int *status) +* Solution of the little equation D*psi=eta for given source eta, using +* the preconditioned GCR algorithm. See the notes for the explanation +* of the parameters of the program. +* +* Notes: +* +* This program uses single-precision arithmetic to reduce the execution +* time, but obtains the solution with double-precision accuracy. +* +* The programs Dop() and Mop() for the operator D and the preconditioner M +* are assumed to have the following properties: +* +* void Dop(complex_dble *v,complex_dble *w) +* Application of the operator D to the complex field v and assignment +* of the result to w. On exit v may be changed but must satisfy D*v=w. +* +* void Mop(int k,complex *rho,complex *phi,complex *chi) +* Approximate solution of the equation D*phi=rho in the k'th step of +* the GCR algorithm. On exit rho is unchanged and chi=D*phi. +* +* Mop() is not required to be a linear operator and may involve an iterative +* procedure with a dynamical stopping criterion, for example. The field phi +* merely defines the next search direction and can in principle be chosen +* arbitrarily. +* +* The other parameters of the program fgcr4vd() are: +* +* vol Number of complex components of the fields on which the +* operator D acts. +* +* icom Indicates whether the equation to be solved is a local +* equation (icom=0) or a global one (icom=1). 
Scalar products +* are summed over all MPI processes if icom=1, while no +* communications are performed if icom=0. +* +* nkv Maximal number of Krylov vectors generated before the GCR +* algorithm is restarted. +* +* nmx Maximal total number of Krylov vectors that may be generated. +* +* res Desired maximal relative residue |eta-D*psi|/|eta| of the +* calculated solution. +* +* wv Array of at least 2*nkv+1 single-precision complex fields +* (used as work space). +* +* wvd Array of at least 1 double-precision complex field (used +* as work space). +* +* eta Source field (unchanged on exit). +* +* psi Calculated approximate solution of the little equation +* D*psi=eta. +* +* status On exit, this parameter reports the total number of Krylov +* vectors that were generated or -1 if the algorithm did not +* converge. +* +* Independently of whether the program succeeds in solving the little equation +* to the desired accuracy, the program returns the norm of the residue of +* the field psi. +* +* Some debugging output is printed to stdout on process 0 if FGCR4VD_DBG is +* defined at compilation time. 
+* +*******************************************************************************/ + +#define FGCR4VD_C + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "vflds.h" +#include "linalg.h" +#include "linsolv.h" +#include "global.h" + +#define PRECISION_LIMIT ((double)(100.0f*FLT_EPSILON)) + +static int nkm=0; +static float *b; +static complex *a,*c; +static double rn; +static complex **phi,**chi,*rho; +static complex_dble *wrk,*cs1,*cs2; + + +static int alloc_arrays(int nkv) +{ + if (nkm>0) + { + afree(a); + afree(b); + afree(cs1); + } + + a=amalloc(nkv*(nkv+1)*sizeof(*a),ALIGN); + b=amalloc(nkv*sizeof(*b),ALIGN); + cs1=amalloc(2*(nkv+2)*sizeof(*cs1),ALIGN); + + if ((a==NULL)||(b==NULL)||(cs1==NULL)) + return 1; + + c=a+nkv*nkv; + cs2=cs1+nkv+2; + nkm=nkv; + + return 0; +} + + +static void gcr_init(int vol,int icom,int nkv,complex **wv,complex_dble **wvd, + complex_dble *eta,complex_dble *psi) +{ + phi=wv; + rho=wv[nkv]; + chi=wv+nkv+1; + wrk=wvd[0]; + + set_vd2zero(vol,psi); + assign_vd2v(vol,eta,rho); + + rn=(double)(vnorm_square(vol,icom,rho)); + rn=sqrt(rn); +} + + +static void sum_vprod(int icom,int n) +{ + int i; + + if ((icom==1)&&(NPROC>1)) + { + MPI_Reduce((double*)(cs1),(double*)(cs2),2*n,MPI_DOUBLE, + MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast((double*)(cs2),2*n,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + for (i=0;i=0;l--) + { + z.re=c[l].re; + z.im=c[l].im; + + for (i=(l+1);i<=k;i++) + { + z.re-=(a[l*nkv+i].re*c[i].re-a[l*nkv+i].im*c[i].im); + z.im-=(a[l*nkv+i].re*c[i].im+a[l*nkv+i].im*c[i].re); + } + + r=1.0f/b[l]; + c[l].re=z.re*r; + c[l].im=z.im*r; + } + + set_v2zero(vol,rho); + + for (l=k;l>=0;l--) + mulc_vadd(vol,rho,phi[l],c[l]); + + add_v2vd(vol,rho,psi); + (*Dop)(psi,wrk); + diff_vd2v(vol,eta,wrk,rho); + + rn=(double)(vnorm_square(vol,icom,rho)); + rn=sqrt(rn); +} + + +double fgcr4vd(int vol,int icom, + void (*Dop)(complex_dble *v,complex_dble *w), + void (*Mop)(int k,complex *eta,complex 
*psi,complex *chi), + complex **wv,complex_dble **wvd,int nkv,int nmx,double res, + complex_dble *eta,complex_dble *psi,int *status) +{ + int ie,k,iprms[3]; + double rn_old,tol,dprms[1]; + + if ((icom==1)&&(NPROC>1)) + { + iprms[0]=vol; + iprms[1]=nkv; + iprms[2]=nmx; + dprms[0]=res; + + MPI_Bcast(iprms,3,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=vol)||(iprms[1]!=nkv)||(iprms[2]!=nmx)|| + (dprms[0]!=res),1,"fgcr4vd [fgcr4vd.c]", + "Parameters are not global"); + + error_root((vol<=0)||(nkv<1)||(nmx<1)||(res<=DBL_EPSILON),1, + "fgcr4vd [fgcr4vd.c]", + "Improper choice of vol,nkv,nmx or res"); + + if (nkv>nkm) + { + ie=alloc_arrays(nkv); + error(ie,1,"fgcr4vd [fgcr4vd.c]", + "Unable to allocate auxiliary arrays"); + } + } + else + { + if ((vol<=0)||(nkv<1)||(nmx<1)||(res<=DBL_EPSILON)) + { + error_loc(1,1,"fgcr4vd [fgcrvvd.c]", + "Improper choice of vol,nkv,nmx or res"); + (*status)=0; + return 1.0; + } + + if (nkv>nkm) + { + ie=alloc_arrays(nkv); + + if (ie) + { + error_loc(1,1,"fgcr4vd [fgcr4vd.c]", + "Unable to allocate auxiliary arrays"); + (*status)=0; + return 1.0; + } + } + } + + gcr_init(vol,icom,nkv,wv,wvd,eta,psi); + tol=res*rn; + (*status)=0; + + while (rn>tol) + { +#ifdef FGCR4VD_DBG + message("[fgcr4vd]: rn_old = %.2e\n",rn); +#endif + rn_old=rn; + + for (k=0;;k++) + { + gcr_step(vol,icom,k,nkv,Mop); + (*status)+=1; +#ifdef FGCR4VD_DBG + message("[fgcr4vd]: k = %d, rn = %.2e\n",k,rn); +#endif + if ((rn<=tol)||(rn<(PRECISION_LIMIT*rn_old))|| + ((k+1)==nkv)||((*status)==nmx)) + break; + } + + update_psi(vol,icom,k,nkv,eta,psi,Dop); + + if (((*status)==nmx)&&(rn>tol)) + { + (*status)=-1; + return rn; + } + } + + return rn; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/mscg.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/mscg.c new file mode 100644 index 
0000000000000000000000000000000000000000..e0060df7c57456d9a285778f9dd76f660f32de1d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/linsolv/mscg.c @@ -0,0 +1,526 @@ + +/******************************************************************************* +* +* File mscg.c +* +* Copyright (C) 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Generic multi-shift CG solver program for the lattice Dirac equation +* +* The externally accessible function is +* +* void mscg(int vol,int icom,int nmu,double *mu, +* void (*Dop_dble)(double mu,spinor_dble *s,spinor_dble *r), +* spinor_dble **wsd,int nmx,double *res, +* spinor_dble *eta,spinor_dble **psi,int *status) +* Solution of the Dirac equation (D^dag*D+mu^2)*psi=eta for a given +* source eta and one or more values of mu using the multi-shift CG +* algorithm. See the notes for the explanation of the parameters of +* the program. +* +* Notes: +* +* The algorithm implemented in this module is described in the notes +* "Multi-shift conjugate gradient algorithm" (file doc/mscg.pdf). +* +* The program Dop_dble() for the Dirac operator is assumed to have the +* following properties: +* +* void Dop_dble(double mu,spinor_dble *s,spinor_dble *r) +* Application of an operator Op or its hermitian conjugate Op^dag +* to the double-precision Dirac field s and assignment of the result +* to r (where r is different from s). The operator must be such that +* the identity Op^dag*Op=D^dag*D+mu^2 holds. Op and Op^dag are applied +* alternatingly, i.e. the first call of the program applies Op, the +* next call Op^dag, then Op again and so on. In all cases, the source +* field s remains unchanged. +* +* The other parameters of the program mscg() are: +* +* vol Number of spinors in the Dirac fields. +* +* icom Indicates whether the equation to be solved is a local +* equation (icom=0) or a global one (icom=1). 
Scalar products +* are summed over all MPI processes if icom=1, while no +* communications are performed if icom=0. +* +* nmu Number of shifts mu. +* +* mu Array of the shifts mu (nmu elements). +* +* nmx Maximal number of CG iterations that may be applied. +* +* res Array of the desired maximal relative residues of the +* calculated solutions (nmu elements). +* +* wsd Array of at least 3+nmu (5 if nmu=1) double-precision spinor +* fields (used as work space). +* +* eta Source field (unchanged on exit). +* +* psi Array of the calculated approximate solutions of the Dirac +* equations (D^dag*D+mu^2)*psi=eta (nmu elements). +* +* status On exit, this parameter reports the number of CG iterations +* that were required, or a negative value if the program failed. +* +* The spinor fields must have at least vol elements and must be such that +* the program Dop_dble() acts correctly on them. Some debugging output is +* printed to stdout on process 0 if the macro MSCG_DBG is defined. +* +*******************************************************************************/ + +#define MSCG_C + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "linsolv.h" +#include "global.h" + +typedef struct +{ + int stop; + double s,tol; + double ah,bh,gh,rh; + spinor_dble *xh,*ph; +} cgsh_t; + +typedef struct +{ + int k,stop; + double mu,tol; + double a,b; + double rn0,rn,rnsq; + spinor_dble *x,*r,*p,*ap,*w; +} cgs_t; + +static int ns=0; +static double *dprms; +static cgs_t cgs; +static cgsh_t *cgsh; + + +static int alloc_cgs(int nmu,double *mu,double *res,spinor_dble **wsd, + spinor_dble **psi) +{ + int k,l,k0; + + if (nmu>ns) + { + if (ns>0) + free(dprms); + if (ns>1) + free(cgsh); + + dprms=malloc(2*nmu*sizeof(*dprms)); + if (dprms==NULL) + return 1; + + if (nmu>1) + { + cgsh=malloc((nmu-1)*sizeof(*cgsh)); + if (cgsh==NULL) + return 1; + } + + ns=nmu; + } + + k0=0; + + for (k=1;k=nmx) + return 1; 
+ + x=wsd[2]; + p=wsd[3]; + ap=wsd[4]; + + set_sd2zero(vol,x); + assign_sd2sd(vol,r,p); + + while ((rn>tol)&&((*ncg)1)) + { + iprms[0]=vol; + iprms[1]=nmu; + iprms[2]=nmx; + + MPI_Bcast(iprms,3,MPI_INT,0,MPI_COMM_WORLD); + error((iprms[0]!=vol)||(iprms[1]!=nmu)||(iprms[2]!=nmx),1, + "mscg [mscg.c]","Integer parameters are not global"); + error_root((vol<1)||(nmu<1)||(nmx<1),1,"mscg [mscg.c]", + "Improper choice of vol,nmu or nmx"); + + ie=alloc_cgs(nmu,mu,res,wsd,psi); + error(ie!=0,1,"mscg [mscg.c]","Unable to allocate auxiliary arrays"); + + for (k=0;k +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "vflds.h" +#include "linalg.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + +static int Ns=0,nb,nbh; +static int nbbh,(*inn)[8]; +static complex *vs; + + +static void alloc_vs(void) +{ + dfl_parms_t dfl; + dfl_grid_t grd; + + dfl=dfl_parms(); + grd=dfl_geometry(); + + Ns=dfl.Ns; + nb=grd.nb; + nbh=nb/2; + nbbh=grd.nbb/2; + inn=grd.inn; + + vs=amalloc(Ns*sizeof(*vs),ALIGN); + + error(vs==NULL,1,"alloc_vs [Aw.c]", + "Unable to allocate auxiliary array"); +} + + +static void apply_Aoe(int *nn,complex **A,complex *v) +{ + int ifc; + + cmat_vec(Ns,*A,v+nn[0]*Ns,vs); + A+=1; + + for (ifc=1;ifc<8;ifc++) + { + cmat_vec_assign(Ns,*A,v+nn[ifc]*Ns,vs); + A+=1; + } +} + + +static void apply_Aeo(int *nn,complex **A,complex *v) +{ + int ifc; + + for (ifc=0;ifc<8;ifc++) + { + cmat_vec_assign(Ns,*A,vs,v+nn[ifc]*Ns); + A+=1; + } +} + + +static void apply_Aee(complex **A,complex *v,complex *w) +{ + complex **Am; + + Am=A+nbh; + + for (;A1) + { + set_v2zero(nbbh*Ns,w+nb*Ns); + cpv_int_bnd(v); + } + + Aoe=Aw.Aoe; + Aeo=Aw.Aeo; + rv=v+nbh*Ns; + rw=w+nbh*Ns; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + cpv_ext_bnd(w); +} + + +void Aweeinv(complex *v,complex *w) +{ + Aw_t Aw; + + if (Ns==0) + alloc_vs(); + + Aw=Awophat(); + apply_Aee(Aw.Aee,v,w); +} + + +void Awooinv(complex *v,complex *w) +{ + Aw_t Aw; + + if 
(Ns==0) + alloc_vs(); + + Aw=Awophat(); + apply_Aoo(Aw.Aoo,v,w); +} + + +void Awoe(complex *v,complex *w) +{ + int (*nn)[8],(*nm)[8]; + complex *rw,*rs,*rm; + complex **Aoe; + Aw_t Aw; + + if (Ns==0) + alloc_vs(); + + if (NPROC>1) + cpv_int_bnd(v); + + Aw=Awop(); + Aoe=Aw.Aoe; + rw=w+nbh*Ns; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + set_v2zero(nbbh*Ns,w+nb*Ns); + + Aw=Awop(); + Aeo=Aw.Aeo; + rv=v+nbh*Ns; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + cpv_ext_bnd(w); +} + + +void Awhat(complex *v,complex *w) +{ + int (*nn)[8],(*nm)[8]; + complex *rs,*rm; + complex **Aeo,**Aoe; + Aw_t Aw; + + if (Ns==0) + alloc_vs(); + + assign_v2v(nbh*Ns,v,w); + + if (NPROC>1) + { + set_v2zero(nbbh*Ns,w+nb*Ns); + cpv_int_bnd(v); + } + + Aw=Awophat(); + Aoe=Aw.Aoe; + Aeo=Aw.Aeo; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + cpv_ext_bnd(w); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/Aw_com.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/Aw_com.c new file mode 100644 index 0000000000000000000000000000000000000000..76b5d480c8408a56743b9bc898d1003e2799d8bb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/Aw_com.c @@ -0,0 +1,799 @@ + +/******************************************************************************* +* +* File Aw_com.c +* +* Copyright (C) 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Communication functions needed for the computation of the little Dirac +* operator. +* +* b2b_flds_t *b2b_flds(int n,int mu) +* Extracts the spinor fields on the interior boundaries of the n'th +* block of the DFL_BLOCKS grid and its neighbouring block in direction +* mu. The spinors on the odd sites are multiplied by the link variables +* in direction mu and -mu respectively. 
If the two blocks touch the +* boundary of the local lattice, the fields extracted from the even +* sites are copied to the neighbouring process. The program returns a +* structure containing the extracted field arrays (see README.Aw_com +* for detailed explanations). +* +* void cpAoe_ext_bnd(void) +* Copies the hopping terms Aoe and Aeo of the double-precision little +* Dirac operator on the odd exterior boundary points of the local block +* lattice to the neighbouring MPI processes and *adds* them to the hop- +* ping terms on the matching blocks on the target lattices. +* +* void cpAee_int_bnd(void) +* Copies the even-even terms Aee of the double-precision little Dirac +* operator on the (even) interior boundary points of the local block +* lattice to the neighbouring MPI processes. +* +* Notes: +* +* The program b2b_flds() writes the extracted spinor fields to internally +* allocated field arrays. These are reused when the program is called +* the next time. The data in the field arrays returned by b2b_flds() are +* therefore preserved only up to the next call of the program. 
+* +*******************************************************************************/ + +#define AW_COM_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "uflds.h" +#include "sflds.h" +#include "vflds.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + +typedef struct +{ + int *iud[2]; + int *ise[2]; + int *iso[2]; + su3_dble *ud[2]; + spinor_dble **sd[2]; + spinor_dble **snd_buf[2]; + b2b_flds_t b2b; +} bsd_t; + +static int init_bsd=0,init_Aoe=0,init_Aee=0; +static int np,nmu[8]; +static int Ns=0,nb,nbb,nbh,nbbh; +static int nbbe[8],nbbo[8],obbe[8],obbo[8]; +static int (*inn)[8],*idx,*ipp,*mp; +static int nsnd,sfc[8]; + +static complex_dble **snd_buf_Aee[8]; +static complex_dble **rcv_buf_Aoe[8],**rcv_buf_Aeo[8]; +static bsd_t (*bsd)[4]; + +static MPI_Request snd_req_bsd[8],rcv_req_bsd[8]; +static MPI_Request snd_req_Aee[8],rcv_req_Aee[8]; +static MPI_Request snd_req_Aoe[8],rcv_req_Aoe[8]; +static MPI_Request snd_req_Aeo[8],rcv_req_Aeo[8]; + + +static void set_constants(void) +{ + int ifc; + dfl_parms_t dfl; + dfl_grid_t grd; + + dfl=dfl_parms(); + grd=dfl_geometry(); + + Ns=dfl.Ns; + nb=grd.nb; + nbb=grd.nbb; + nbh=nb/2; + nbbh=nbb/2; + + for (ifc=0;ifc<8;ifc++) + { + nbbe[ifc]=grd.nbbe[ifc]; + nbbo[ifc]=grd.nbbo[ifc]; + obbe[ifc]=grd.obbe[ifc]; + obbo[ifc]=grd.obbo[ifc]; + } + + inn=grd.inn; + idx=grd.idx; + ipp=grd.ipp; + mp=grd.map; + + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; + nsnd=0; + + for (ifc=0;ifc<8;ifc++) + { + nmu[ifc]=cpr[ifc/2]&0x1; + + if (nbbe[ifc]+nbbo[ifc]) + { + sfc[nsnd]=ifc; + nsnd+=1; + } + } +} + + +static int fnd_nn(int n,int ifc) +{ + n=idx[n]; + n=inn[n][ifc]; + + if (n>=nb) + n=mp[n-nb]; + + return idx[n]; +} + + +static void set_snd_req_bsd(void) +{ + int ifc,vol,nbf; + int tag,saddr,raddr; + bsd_t *brd; + b2b_flds_t *b2b; + + for (ifc=0;ifc<8;ifc++) + { + brd=bsd[0]+(ifc/2); + b2b=&(*brd).b2b; + vol=(*b2b).vol; + + nbf=24*Ns*vol; + saddr=npr[ifc]; + 
raddr=npr[ifc^0x1]; + tag=mpi_permanent_tag(); + + MPI_Send_init((*brd).snd_buf[(ifc&0x1)^0x1][0],nbf, + MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD,&snd_req_bsd[ifc]); + MPI_Recv_init((*b2b).sde[(ifc&0x1)^0x1][0],nbf, + MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&rcv_req_bsd[ifc]); + } +} + + +static void alloc_bsd(void) +{ + int nbs,isw,vbb,vbm; + int n,m,mu,ifc,vol,*iud; + int ix,iy,k; + su3_dble *ud; + spinor_dble **psd,*sd; + block_t *b; + bndry_t *bb; + bsd_t *brd; + + if (Ns==0) + set_constants(); + + b=blk_list(DFL_BLOCKS,&nbs,&isw); + error(nbs==0,1,"alloc_bsd [Aw_com.c]", + "DFL_BLOCKS grid is not allocated"); + + bb=(*b).bb; + vbb=0; + vbm=0; + + for (mu=0;mu<4;mu++) + { + vol=bb[2*mu].vol; + vbb+=vol; + + if (vol>vbm) + vbm=vol; + } + + bsd=malloc(nb*sizeof(*bsd)); + iud=malloc(nb*vbb*sizeof(*iud)); + ud=amalloc(vbm*sizeof(*ud),ALIGN); + psd=malloc(24*Ns*sizeof(*psd)); + sd=amalloc(3*Ns*vbm*sizeof(*sd),ALIGN); + + error((bsd==NULL)||(iud==NULL)||(ud==NULL)||(psd==NULL)||(sd==NULL),1, + "alloc_bsd [Aw_com.c]","Unable to allocate buffers"); + + set_sd2zero(3*Ns*vbm,sd); + + for (n=0;n0) + send_bufs_Aee(sfc[m],eo); + + ifc=sfc[n]; + io=ifc^nmu[ifc]; + + get_mat(nbbo[io],ipp+obbo[io],Aw.Aee,snd_buf_Aee[io]); + + if (n>0) + { + wait_bufs_Aee(sfc[m],eo); + m+=eo; + eo^=0x1; + } + } + + while (m +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "vflds.h" +#include "linalg.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + +static int Ns=0,nb,nbh; +static int nbbh,(*inn)[8]; +static complex_dble *vs; + + +static void alloc_vs(void) +{ + dfl_parms_t dfl; + dfl_grid_t grd; + + dfl=dfl_parms(); + grd=dfl_geometry(); + + Ns=dfl.Ns; + nb=grd.nb; + nbh=nb/2; + nbbh=grd.nbb/2; + inn=grd.inn; + + vs=amalloc(Ns*sizeof(*vs),ALIGN); + + error(vs==NULL,1,"alloc_vs [Aw_dble.c]", + "Unable to allocate auxiliary array"); +} + + +static void apply_Aoe(int *nn,complex_dble **A,complex_dble *v) +{ + int ifc; + + 
cmat_vec_dble(Ns,*A,v+nn[0]*Ns,vs); + A+=1; + + for (ifc=1;ifc<8;ifc++) + { + cmat_vec_assign_dble(Ns,*A,v+nn[ifc]*Ns,vs); + A+=1; + } +} + + +static void apply_Aeo(int *nn,complex_dble **A,complex_dble *v) +{ + int ifc; + + for (ifc=0;ifc<8;ifc++) + { + cmat_vec_assign_dble(Ns,*A,vs,v+nn[ifc]*Ns); + A+=1; + } +} + + +static void apply_Aee(complex_dble **A,complex_dble *v,complex_dble *w) +{ + complex_dble **Am; + + Am=A+nbh; + + for (;A1) + { + set_vd2zero(nbbh*Ns,w+nb*Ns); + cpvd_int_bnd(v); + } + + Aoe=Aw.Aoe; + Aeo=Aw.Aeo; + rv=v+nbh*Ns; + rw=w+nbh*Ns; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + cpvd_ext_bnd(w); +} + + +void Aweeinv_dble(complex_dble *v,complex_dble *w) +{ + Aw_dble_t Aw; + + if (Ns==0) + alloc_vs(); + + Aw=Awophat_dble(); + apply_Aee(Aw.Aee,v,w); +} + + +void Awooinv_dble(complex_dble *v,complex_dble *w) +{ + Aw_dble_t Aw; + + if (Ns==0) + alloc_vs(); + + Aw=Awophat_dble(); + apply_Aoo(Aw.Aoo,v,w); +} + + +void Awoe_dble(complex_dble *v,complex_dble *w) +{ + int (*nn)[8],(*nm)[8]; + complex_dble *rw,*rs,*rm; + complex_dble **Aoe; + Aw_dble_t Aw; + + if (Ns==0) + alloc_vs(); + + if (NPROC>1) + cpvd_int_bnd(v); + + Aw=Awop_dble(); + Aoe=Aw.Aoe; + rw=w+nbh*Ns; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + set_vd2zero(nbbh*Ns,w+nb*Ns); + + Aw=Awop_dble(); + Aeo=Aw.Aeo; + rv=v+nbh*Ns; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + cpvd_ext_bnd(w); +} + + +void Awhat_dble(complex_dble *v,complex_dble *w) +{ + int (*nn)[8],(*nm)[8]; + complex_dble *rs,*rm; + complex_dble **Aeo,**Aoe; + Aw_dble_t Aw; + + if (Ns==0) + alloc_vs(); + + assign_vd2vd(nbh*Ns,v,w); + + if (NPROC>1) + { + set_vd2zero(nbbh*Ns,w+nb*Ns); + cpvd_int_bnd(v); + } + + Aw=Awophat_dble(); + Aoe=Aw.Aoe; + Aeo=Aw.Aeo; + + nn=inn+nbh; + nm=inn+nb; + + for (;nn1) + cpvd_ext_bnd(w); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/Aw_gen.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/Aw_gen.c new file mode 100644 
index 0000000000000000000000000000000000000000..f1a9c7a99bb26c99684b1ca2617a9c2a14c1bb36 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/Aw_gen.c @@ -0,0 +1,860 @@ + +/******************************************************************************* +* +* File Aw_gen.c +* +* Copyright (C) 2007, 2008, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Generic programs needed for the computation of the little Dirac operator +* +* The externally accessible functions are +* +* void gather_ud(int vol,int *imb,su3_dble *ud,su3_dble *vd) +* Assigns the 3x3 matrices ud[imb[i]] to vd[i] (i=0,..,vol-1). +* +* void gather_sd(int vol,int *imb,spinor_dble *sd,spinor_dble *rd) +* Assigns the spinors sd[imb[i]] to rd[i] (i=0,..,vol-1). +* +* void apply_u2sd(int vol,int *imb,su3_dble *ud,spinor_dble *sd, +* spinor_dble *rd) +* Multiplies the spinors sd[imb[i]] by the 3x3 matrices ud[i] and +* assigns the result to rd[i] (i=0,..,vol-1). +* +* void apply_udag2sd(int vol,int *imb,su3_dble *ud,spinor_dble *sd, +* spinor_dble *rd) +* Multiplies the spinors sd[imb[i]] by the adjoint of the 3x3 matrices +* ud[i] and assigns the result to rd[i] (i=0,..,vol-1). +* +* The following is an array of functions indexed by the direction mu=0,..,3: +* +* void (*spinor_prod_gamma[])(int vol,spinor_dble *sd,spinor_dble *rd, +* complex_dble *sp) +* Computes the scalar products (sd,rd) and (sd,gamma_mu*rd), where +* gamma_mu denotes the Dirac matrix with index mu and the spinor +* fields are assumed to have vol elements. On exit the calculated +* products are assigned to sp[0] and sp[1], respectively. +* +* Notes: +* +* The representation of the Dirac matrices is specified in the notes +* "Implementation of the lattice Dirac operator" (file doc/dirac.pdf). 
+* The input and output fields may not overlap in the case of the programs +* gather_ud(), gather_sd(), apply_u2sd() and apply_udag2sd(). +* +* All these programs can be called locally. If SSE inline-assembly is used +* (i.e. if x64 is set), it is taken for granted that the field arrays are +* aligned to 16 byte boundaries. +* +*******************************************************************************/ + +#define AW_GEN_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "little.h" + +#define MAX_LEVELS 8 +#define BLK_LENGTH 8 + +static int cnt[MAX_LEVELS]; +static complex_dble sm0[MAX_LEVELS] ALIGNED16; +static complex_dble sm1[MAX_LEVELS] ALIGNED16; + + +static void init_sm(void) +{ + int n; + + for (n=0;n=BLK_LENGTH)&&(nrt) + rm=rt; + + _start_sm(); + + for (;rdrt) + rm=rt; + + _start_sm(); + + for (;rdrt) + rm=rt; + + _start_sm(); + + for (;rdrt) + rm=rt; + + _start_sm(); + + for (;rdrt) + rm=rt; + + z0.re=0.0; + z0.im=0.0; + z1.re=0.0; + z1.im=0.0; + + for (;rdrt) + rm=rt; + + z0.re=0.0; + z0.im=0.0; + z1.re=0.0; + z1.im=0.0; + + for (;rdrt) + rm=rt; + + z0.re=0.0; + z0.im=0.0; + z1.re=0.0; + z1.im=0.0; + + for (;rdrt) + rm=rt; + + z0.re=0.0; + z0.im=0.0; + z1.re=0.0; + z1.im=0.0; + + for (;rd +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "vflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "block.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + +#define MAX_FROBENIUS 1.0e6 +#define MAX_UPDATE 128 + +static int Ns=0,nb,nbh,nbbh; +static int *idx,(*inn)[8]; +static int old_eo[2],nupd=0; +static double old_m0[2],old_mu[2]; +static Aw_dble_t Awd={0},Awdhat={0}; +static Aw_t Aws={0},Awshat={0}; + + +static void set_constants(void) +{ + dfl_parms_t dfl; + dfl_grid_t grd; + + dfl=dfl_parms(); + grd=dfl_geometry(); + + Ns=dfl.Ns; + nb=grd.nb; + nbh=nb/2; + nbbh=grd.nbb/2; + idx=grd.idx; + inn=grd.inn; +} + + 
+static void alloc_Awd(Aw_dble_t *Aw) +{ + int n,k,nmat,nbee,nboe; + complex_dble **ww,*w; + + if (Ns==0) + set_constants(); + + nmat=Ns*Ns; + nbee=0; + nboe=0; + if (Aw==(&Awd)) + nboe=nbbh; + if (Aw==(&Awdhat)) + nbee=nbbh; + n=18*nbh+nbee+2*nboe; + ww=malloc(n*sizeof(*ww)); + w=amalloc(n*nmat*sizeof(*w),ALIGN); + error((ww==NULL)||(w==NULL),1,"alloc_Awd [Aw_ops.c]", + "Unable to allocate matrix arrays"); + + for (k=0;k1) + { + dprms[0]=mu; + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + error(dprms[0]!=mu,1, + "set_Aw [Aw_ops.c]","Parameters are not global"); + } + + if (Awd.Ns==0) + alloc_Awd(&Awd); + + sw=sw_parms(); + m0=sw.m0; + tm=tm_parms(); + eo=tm.eoflg; + + if (query_flags(AW_UP2DATE)==1) + { + if ((m0!=old_m0[0])||(mu!=old_mu[0])||(eo!=old_eo[0])) + update_Awdiag(m0,mu,eo); + return; + } + + sw_term(NO_PTS); + b=blk_list(DFL_BLOCKS,&nbs,&isw); + + for (n=0;n=nbh) + { + z=Awd.Aeo[8*(msw-nbh)+(ifc^0x1)]; + w=Awd.Aoe[8*(msw-nbh)+(ifc^0x1)]; + } + else if (ibn) + { + mbd=inn[msw][ifc^0x1]-nb-nbbh; + z=Awd.Aoe[8*nbh+mbd]; + w=Awd.Aeo[8*nbh+mbd]; + } + else + { + z=Awd.Aoe[8*(nsw-nbh)+ifc]; + w=Awd.Aeo[8*(nsw-nbh)+ifc]; + } + + for (k=0;k=nbh) + { + z=Awd.Aoe[8*(nsw-nbh)+ifc]; + w=Awd.Aeo[8*(nsw-nbh)+ifc]; + } + else if (ibn) + { + nbd=inn[nsw][ifc]-nb-nbbh; + z=Awd.Aeo[8*nbh+nbd]; + w=Awd.Aoe[8*nbh+nbd]; + } + else + { + z=Awd.Aeo[8*(msw-nbh)+(ifc^0x1)]; + w=Awd.Aoe[8*(msw-nbh)+(ifc^0x1)]; + } + + for (k=0;kMAX_FROBENIUS) + ifail=1; + } + + cpAee_int_bnd(); + + for (n=0;n=nb) + m-=nbh; + + cmat_mul_dble(Ns,Awdhat.Aee[m],Awd.Aeo[8*n+ifc], + Awdhat.Aeo[8*n+ifc]); + } + } + + for (n=0;nMAX_FROBENIUS) + ifail=1; + + for (ifc=0;ifc<8;ifc++) + cmat_mul_dble(Ns,Awdhat.Aoo[n],Awd.Aoe[8*n+ifc],Awdhat.Aoe[8*n+ifc]); + } + + if (Awshat.Ns==0) + alloc_Aws(&Awshat); + assign_Awd2Aw(&Awdhat,&Awshat); + set_flags(COMPUTED_AWHAT); + + old_m0[1]=m0; + old_mu[1]=mu; + old_eo[1]=eo; + ifail|=set_ltl_modes(); + MPI_Allreduce(&ifail,&n,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); + + 
return n; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README new file mode 100644 index 0000000000000000000000000000000000000000..9bfe00eeefa082961ce6cc69e4258a5d30f2c1f5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README @@ -0,0 +1,179 @@ + +******************************************************************************** + + Little Dirac Operator + +******************************************************************************** + + +Files +----- + +Aw_gen.c Generic programs needed for the computation of the + little Dirac operator + +Aw_com.c Communication programs needed in the computation of + the little Dirac operator + +Aw_ops.c Computation of the little Dirac operator + +Aw.c Application of the single-precision little Wilson-Dirac + operator Aw + +Aw_dble.c Application of the double-precision little Wilson-Dirac + operator Aw + +ltl_modes.c Computation of the little modes + + +Include file +------------ + +The file little.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + + +List of functions +----------------- + +void gather_ud(int vol,int *imb,su3_dble *ud,su3_dble *vd) + Assigns the 3x3 matrices ud[imb[i]] to vd[i] (i=0,..,vol-1). + +void gather_sd(int vol,int *imb,spinor_dble *sd,spinor_dble *rd) + Assigns the spinors sd[imb[i]] to rd[i] (i=0,..,vol-1). + +void apply_u2sd(int vol,int *imb,su3_dble *ud,spinor_dble *sd, + spinor_dble *rd) + Multiplies the spinors sd[imb[i]] by the 3x3 matrices ud[i] and + assigns the result to rd[i] (i=0,..,vol-1). + +void apply_udag2sd(int vol,int *imb,su3_dble *ud,spinor_dble *sd, + spinor_dble *rd) + Multiplies the spinors sd[imb[i]] by the adjoint of the 3x3 matrices + ud[i] and assigns the result to rd[i] (i=0,..,vol-1). 
+ +The following is an array of functions indexed by the direction mu=0,..,3: + +void (*spinor_prod_gamma[])(int vol,spinor_dble *sd,spinor_dble *rd, + complex_dble *sp) + Computes the scalar products (sd,rd) and (sd,gamma_mu*rd), where + gamma_mu denotes the Dirac matrix with index mu and the spinor + fields are assumed to have vol elements. On exit the calculated + products are assigned to sp[0] and sp[1], respectively. + +b2b_flds_t *b2b_flds(int n,int mu) + Extracts the spinor fields on the interior boundaries of the n'th + block of the DFL_BLOCKS grid and its neighbouring block in direction + mu. The spinors on the odd sites are multiplied by the link variables + in direction mu and -mu respectively. If the two blocks touch the + boundary of the local lattice, the fields extracted from the even + sites are copied to the neighbouring process. The program returns a + structure containing the extracted field arrays (see README.Aw_com + for detailed explanations). + +void cpAoe_ext_bnd(void) + Copies the hopping terms Aoe and Aeo of the double-precision little + Dirac operator on the odd exterior boundary points of the local block + lattice to the neighbouring processes and *adds* them to the hopping + terms on the matching blocks on the target lattices. + +void cpAee_int_bnd(void) + Copies the even-even terms Aee of the double-precision little Dirac + operator on the (even) interior boundary points of the local block + lattice to the neighbouring processes. + +Aw_t Awop(void) + Returns a structure containing the matrices that describe the + single-precision little Dirac operator. + +Aw_t Awophat(void) + Returns a structure containing the matrices that describe the + single-precision even-odd preconditioned little Dirac operator. + +Aw_dble_t Awop_dble(void) + Returns a structure containing the matrices that describe the + double-precision little Dirac operator. 
+ +Aw_dble_t Awophat_dble(void) + Returns a structure containing the matrices that describe the + double-precision even-odd preconditioned little Dirac operator. + +void set_Aw(double mu) + Computes the single- and the double-precision little Dirac operator. + The SW term is updated if needed and the twisted mass is set to mu. + If the twisted-mass flag is set, the twisted-mass term is switched + off on the odd sites of the lattice. + +int set_Awhat(double mu) + Computes the single- and the double-precision even-odd preconditioned + little Dirac operator. The program calls set_Aw(mu) and thus updates + the operator w/o even-odd preconditioning too. The little modes are + updated as well (see ltl_modes.c). On exit the program returns 0 if + all matrix inversions were safe and 1 if not. + +void Aw(complex *v,complex *w) + Applies the little Dirac operator to the field v and assigns the + result to the field w. + +void Aweeinv(complex *v,complex *w) + Applies the inverse of the even-even part of the little Dirac operator + to the field v and assigns the result to the field w on the even blocks. + On the odd blocks, w is unchanged. + +void Awooinv(complex *v,complex *w) + Applies the inverse of the odd-odd part of the little Dirac operator + to the field v and assigns the result to the field w on the odd blocks. + On the even blocks, w is unchanged. + +void Awoe(complex *v,complex *w) + Applies the odd-even part of the little Dirac operator to the field v + and assigns the result to the field w on the odd blocks. On the even + blocks, w is unchanged. + +void Aweo(complex *v,complex *w) + Applies the even-odd part of the little Dirac operator to the field v + and *subtracts* the result from the field w on the even blocks. On the + odd blocks, w is unchanged. + +void Awhat(complex *v,complex *w) + Applies the even-odd preconditioned little Dirac operator to the field + v and assigns the result to the field w on the even blocks. On the odd + blocks, w is unchanged.
+ +void Aw_dble(complex_dble *v,complex_dble *w) + Applies the little Dirac operator to the field v and assigns the + result to the field w. + +void Aweeinv_dble(complex_dble *v,complex_dble *w) + Applies the inverse of the even-even part of the little Dirac operator + to the field v and assigns the result to the field w on the even blocks. + On the odd blocks, w is unchanged. + +void Awooinv_dble(complex_dble *v,complex_dble *w) + Applies the inverse of the odd-odd part of the little Dirac operator + to the field v and assigns the result to the field w on the odd blocks. + On the even blocks, w is unchanged. + +void Awoe_dble(complex_dble *v,complex_dble *w) + Applies the odd-even part of the little Dirac operator to the field v + and assigns the result to the field w on the odd blocks. On the even + blocks, w is unchanged. + +void Aweo_dble(complex_dble *v,complex_dble *w) + Applies the even-odd part of the little Dirac operator to the field v + and *subtracts* the result from the field w on the even blocks. On the + odd blocks, w is unchanged. + +void Awhat_dble(complex_dble *v,complex_dble *w) + Applies the even-odd preconditioned little Dirac operator to the field + v and assigns the result to the field w on the even blocks. On the odd + blocks, w is unchanged. + +int set_ltl_modes(void) + Computes the little modes, the associated little-little Dirac + operator and its inverse. The program returns 0 if the inversion + was safe and 1 if not. + +complex_dble *ltl_matrix(void) + Returns the pointer to an Ns x Ns matrix that represents the + *inverse* of the double-precision little-little Dirac operator. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README.Aw b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README.Aw new file mode 100644 index 0000000000000000000000000000000000000000..63539d6b1c81b151532ec80c58b86fd8305decb3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README.Aw @@ -0,0 +1,149 @@ + +******************************************************************************** + + Definition of the little Dirac operator + +******************************************************************************** + +The little Dirac operator was introduced in + + M. Luescher: "Local coherence and deflation of the low quark modes + in lattice QCD", JHEP 0707 (2007) 081 + +Here the data structures used to represent the operator are described. + + +Definition +---------- + +The deflation subspace is spanned by the fields (*b).sd[1],..,(*b).sd[Ns] on +the blocks b of the DFL_BLOCKS grid. When the subspace is created, the basis +fields are orthonormalized on each block. The restriction of the Wilson-Dirac +operator Dw+i*mu*gamma_5 to the deflation subspace is referred to as the +little Dirac operator. It is completely specified by the matrix elements + + A_{n,k;m,l}=(v_{n,k},(Dw+i*mu*gamma_5)*v_{m,l}) + +where v_{n,0},v_{n,1},..,v_{n,Ns-1} are the basis vectors on the block with +index n. + + +Matrix arrays +------------- + +The DFL_BLOCKS grid consists of the sublattices of the even and odd blocks +(see dfl/dfl_geometry.c). In each local lattice, there are nb blocks, half of +which are even and half odd. The number nbb of blocks on the exterior boundary +of the local lattice also divides into equal numbers of even and odd blocks. + +The matrix A_{n,k;m,l} decomposes into four parts Aee, Aoo, Aoe and Aeo in the +obvious way. Each of these parts may be stored in the form of one-dimensional +arrays of complex Ns x Ns matrices. 
Explicitly + + Aee[n][Ns*k+l] = (v_{n,k},(Dw+i*mu*gamma_5)*v_{n,l}), + + Aoo[n][Ns*k+l] = (v_{m,k},(Dw+i*mu*gamma_5)*v_{m,l}), m=n+nb/2, + + Aoe[8*n+ifc][Ns*k+l] = (v_{m,k},(Dw+i*mu*gamma_5)*v_{inn[m][ifc],l}), + + Aeo[8*n+ifc][Ns*k+l] = (v_{inn[m][ifc],k},(Dw+i*mu*gamma_5)*v_{m,l}), + +where n=0,..,nb/2-1 labels the even blocks, m=nb/2,..,nb-1 the odd blocks, +ifc=0,..,7 the 8 coordinate directions -0,+0,..,-3,+3, while inn[m][ifc] is +the index of the block in direction ifc of the block with index m. + +In the case of the double-precision operator, the length of the arrays Aoe and +Aeo is 4*nb+nbb/2 rather than 4*nb. The additional nbb/2 elements at the end of +the arrays are used at intermediate stages of the computations as buffers for +the matrices on the odd exterior boundary points of the block lattice. These +are stored in the order of the boundary points (see dfl/dfl_geometry.c and +README.Aw_com). + + +Even-odd flag +------------- + +The even-odd flag can be set and unset by calling set_tm_parms() (see +flags/lat_parms.c). Initially the flag is not set. + +The programs for the Dirac operator and thus those that construct the little +Dirac operator apply the twisted mass term i*mu*gamma_5 on the even sites of +the lattice only if the flag is set. The associated deflation projectors are +suitable for the solution of the Dirac operator with such a twisted-mass term. + + +Data structure +-------------- + +The single- and double-precision arrays representing the little Dirac operator +are collected in the structures Aw_t and Aw_dble_t (see include/little.h). The +elements of these structures are + + Ns,nb + Aee[nb/2][Ns*Ns] + Aoo[nb/2][Ns*Ns] + Aoe[4*nb][Ns*Ns] + Aeo[4*nb][Ns*Ns] + +As already mentioned, the length of the last two arrays is 4*nb+nbb/2 +rather than 4*nb in the case of the double precision operator.
+ + +Even-odd preconditioned operator +-------------------------------- + +The even-odd preconditioned little operator Ahat acts on fields supported on +the even blocks. It is related to the little operator A through + + Ahat=1-Aee^(-1)*Aeo*Aoo^(-1)*Aoe + +The preconditioned operator may be represented by Aw_t and Aw_dble_t +structures containing the matrix arrays + + Aee^(-1), Aoo^(-1), Aee^(-1)*Aeo and Aoo^(-1)*Aoe + +instead of Aee, Aoo, Aoe and Aeo. + +In the case of the double-precision preconditioned operator, the array of the +even-even terms is of length nb/2+nbb/2 instead of nb/2. The additional nbb/2 +elements at end of the array are used as buffers for the matrices on the even +interior boundary points of the block lattice (see README.Aw_com). + + +Little-little Dirac operator +---------------------------- + +The deflation subspace is constructed by projecting Ns global spinor fields to +the blocks of the DFL_BLOCKS grid. These global fields are linear combinations +of the basis fields v_{n,k} and span a subspace of dimension Ns within the +deflation subspace (which has dimension nb*Ns). + +The even-odd preconditioned little Dirac operator may be deflated using the +restriction of the global modes to the even blocks as the deflation modes. +These fields (which are also contained in the deflation subspace) are referred +to as the little modes, and the restriction of the even-odd preconditioned +little Dirac operator to the space spanned by them as the little-little Dirac +operator. Its action is completely specified by its matrix elements in the +space of the little modes, i.e. by a complex Ns x Ns matrix (the program +set_ltl_modes() orthonormalizes the little modes before the little-little +Dirac operator is calculated). + +The single-precision little modes md_k (k=0,..,Ns-1) and Awhat*md_k are stored +in the first and second half of the first Ns fields returned by vflds(). 
The +double-precision fields are stored in the same way in the Ns fields returned +by vdflds(). + + +Boundary conditions +------------------- + +In the case of boundary conditions of type 0,1 and 2, the hopping terms Aeo +and Aoe that go across the boundaries of the lattice at global time 0 and +NPROC0*L0-1 are equal to zero. + +The programs in this directory obtain the little Dirac operator always in the +same way and thus effectively as if periodic boundary conditions were imposed +in the time direction. Since the quark fields vanish at time 0 when boundary +conditions of type 0,1 or 2 are chosen, the calculation gives the correct +result (i.e. vanishing hopping terms across the lattice boundaries) also in +these cases. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README.Aw_com b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README.Aw_com new file mode 100644 index 0000000000000000000000000000000000000000..de5dbe03877483fc99fa19efffa3b3ff55d4c4bd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/README.Aw_com @@ -0,0 +1,204 @@ + +******************************************************************************** + + Calculation of the little Dirac operator + +******************************************************************************** + +The computation of the matrix arrays Aoe and Aeo representing the hopping +terms of the little Dirac operator require the spinor fields at the interior +boundaries of the local lattice to be communicated to the neighbouring +lattices. Some communications are also required when the matrices representing +the even-odd preconditioned operator are calculated. 
+ + +Extraction of boundary fields +----------------------------- + +The computation of the matrix elements Aoe and Aeo involves a computation of +the scalar products of spinor fields residing at the interior boundary points +of the blocks in the DFL_BLOCKS grid. If b0 and b1 are neighbouring blocks +with indices n0 and n1, respectively, where b1 is displaced from b0 in the +positive direction mu, the geometrical situation is as follows: + + -----> x_mu + + ----- ----- + e | | o + e | | o + o | | e + o | | e + ----- ----- + block b0 block b1 + +Here "e" and "o" denote even and odd interior boundary points. The scalar +products to be computed are then + + sp[0][Ns*k+l] = -1/2*(v_{n0,k},(1-gamma_mu)*U*v_{n1,l}), + + sp[1][Ns*k+l] = -1/2*(v_{n0,k},(1+gamma_mu)*U*v_{n1,l}), + +where v_{n0,k} and v_{n1,l} (k,l=0,..,Ns-1) are the deflation modes on block +b0 and b1 respectively. In these scalar products, one sums over the interface +points only and "U" stands for the link variables across the interface. + +It is helpful to split the sum in the scalar products in two sums, one going +over the (e,o) pairs of points and the other over the (o,e) pairs (see the +figure above). The computation then proceeds by first extracting + + psi_{k,e}=v_{n0,k}_e, chi_{l,e}=v_{n1,l}_e, + + psi_{k,o}=U^dag*v_{n0,k}_o, chi_{l,o}=U*v_{n1,l}_o. + +Once this is done, the scalar products + + (psi_{k,e},chi_{l,o}), (psi_{k,e},gamma_mu*chi_{l,o}), + + (psi_{k,o},chi_{l,e}), (psi_{k,o},gamma_mu*chi_{l,e}), + +may be calculated, from which the matrices sp[0] and sp[1] are obtained by +taking simple linear combinations. + + +Communication of spinor fields +------------------------------ + +When the block b0 touches the boundary of the local lattice in direction mu, +the neighbouring block b1 is on the neighbouring MPI process.
At the same time +the local lattice contains another block b1', with index n1', on the opposite +face of the local lattice, which is the neighbour in direction mu of the n0'th +block on the process in direction -mu: + + + -------- ---------------------- -------- + | | | | + | | | | + | | | | + | | | | + *****| |***** ***** | | ***** + * *| |* * * * | | * * + *****| |***** ***** | | ***** + b0' | | b1' b0 | | b1 + | | | | + | | | | + -------- ---------------------- -------- + + +Before the scalar products can be computed, some fields need to be moved from +and to the neighbouring processes. The program b2b_flds() moves + + psi_{k,e} from b0 to b1 and + + chi_{l,e} from b1 to b0 + +across the interface that separates b0 from b1. Note that b1' is the neighbour +of b0' on the local lattice to the left. Exchanges of fields across that +boundary are performed as in the case of the b0,b1 pair of blocks. + + +Elements of the b2b_flds_t structure +------------------------------------ + +The b2b_flds_t structure returned by the program b2b_flds() contains the +following data: + + n[2] n[0]=n0. n[1]=n1 or n1' depending on whether + b1 is on the local lattice or not. + + vol Number of points on the interface. + + ibn Indicates whether b1 is on a different + local lattice (ibn=1) or not (ibn=0). + + sde[2][Ns][vol] Extracted field arrays.
+ sdo[2][Ns][vol] + +The contents of the field arrays depend on whether a communication was needed +or not: + +ibn=0 (no communication): + + sde[0][k] = psi_{k,e} + sde[1][l] = chi_{l,e} + + sdo[0][k] = psi_{k,o} + sdo[1][l] = chi_{l,o} + +ibn=1: + + sde[0][k] = psi_{k,e}' (field communicated from b0') + sde[1][l] = chi_{l,e} (field communicated from b1) + + sdo[0][k] = psi_{k,o} (field extracted from b0) + sdo[1][l] = chi_{l,o}' (field extracted from b1') + + +Computation of scalar products +------------------------------ + +The calculation of the hopping terms Aoe and Aeo proceeds by running through +all block pairs b0,b1, extracting the boundary fields using b2b_flds() and +calculating the scalar products of the extracted fields. In the case of the +block pairs with ibn=0, the extracted fields are exactly those required for +these scalar products. However, if ibn=1, the scalar products that can be +formed (without further communication) are + + (psi_{k,e}',chi_{l,o}'), (psi_{k,e}',gamma_mu*chi_{l,o}') + +and + + (psi_{k,o},chi_{l,e}), (psi_{k,o},gamma_mu*chi_{l,e}). + +The first of these contribute to the hopping terms Aoe,Aeo to/from b0',b1' +and the second to those to/from b0,b1. + + +Assignment of the hopping terms +------------------------------- + +The calculated scalar products finally need to be assigned to the arrays Aoe +and Aeo in the data structures that define the little Dirac operator (see +README.Aw). In doing so, one should take into account that the labeling of the +blocks, as used in the description of the geometry of the DFL_BLOCKS grid, is +not guaranteed to coincide with the ordering of the blocks in the block list +returned by blk_list(). The geometric label of the n'th block in the list is + + nsw=grd.idx[n], + +where grd=dfl_geometry() is the structure containing the grid geometry arrays. +The ordering of the matrices in the arrays Aoe and Aeo is the geometric one, +while the program b2b_flds() uses the natural ordering in the block list.
+ +The mapping of the scalar products is thus + +ibn=0: + + (psi_{k,o},chi_{l,e}), .. are assigned to Aoe[m+ifc],Aeo[m+ifc] where + m=grd.idx[n0] and ifc=2*mu+1 if b0 is odd or + m=grd.idx[n1] and ifc=2*mu if b0 is even. + +ibn=1: + + (psi_{k,o},chi_{l,e}), .. are assigned to Aoe[m+ifc],Aeo[m+ifc] where + m=grd.idx[n0] and ifc=2*mu+1 if b0 is odd. + + (psi_{k,e}',chi_{l,o}'), .. are assigned to Aoe[m+ifc],Aeo[m+ifc] where + m=grd.idx[n1'] and ifc=2*mu if b1' is odd. + + If b0 and/or b1' is even, the scalar products must be copied to the + neighbouring processes in direction +mu and -mu respectively. They are + first assigned to matrices at the end of the Aoe and Aeo arrays and + eventually (after all pairs of blocks are processed) are communicated + by the program cpAoe_ext_bnd(). + + +Even-odd preconditioned operator +-------------------------------- + +As explained in README.Aw, the even-odd preconditioned little Dirac operator +requires the computation of the products Aee^(-1)*Aeo and Aoo^(-1)*Aoe. All +matrices in the second product are locally available, but the first product +can only be formed after communicating the matrices Aee^(-1) residing at the +interior boundary of the local block lattice to the neighbouring processes. +The program cpAee_int_bnd() does that along the lines of the communication +programs for complex and spinor fields (see vflds/vdcom.c and sflds/sdcom.c).
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/ltl_modes.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/ltl_modes.c new file mode 100644 index 0000000000000000000000000000000000000000..b02992351c7f3d9a61ea37ff1b26c0e01d1bda77 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/little/ltl_modes.c @@ -0,0 +1,189 @@ + +/******************************************************************************* +* +* File ltl_modes.c +* +* Copyright (C) 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Computation of the little modes. +* +* The externally accessible functions are +* +* int set_ltl_modes(void) +* Computes the little modes, the associated little-little Dirac +* operator and its inverse. The program returns 0 if the inversion +* was safe and 1 if not. +* +* complex_dble *ltl_matrix(void) +* Returns the pointer to an Ns x Ns matrix that represents the +* *inverse* of the double-precision little-little Dirac operator. +* +* Notes: +* +* For a description of the little Dirac operator and the associated data +* structures see README.Aw. As usual, Ns denotes the number of deflation +* modes in each block of the DFL_BLOCKS grid. +* +* The inversion of a double-precision complex matrix is considered to be +* safe if and only if its Frobenius condition number is less than 10^6. +* +* All programs in this module may involve global communications and must +* be called simultaneously on all MPI processes. 
+* +*******************************************************************************/ + +#define LTL_MODES_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "vflds.h" +#include "linalg.h" +#include "sw_term.h" +#include "dirac.h" +#include "block.h" +#include "dfl.h" +#include "little.h" +#include "global.h" + +#define MAX_FROBENIUS 1.0e6 + +static int Ns=0,nv,nvh; +static complex **vs; +static complex_dble **vds,*Ads,*Bds,*Cds; + + +static void sum_vprod(int n,complex_dble *z,complex_dble *w) +{ + int k; + + if (NPROC>1) + { + MPI_Reduce(z,w,2*n,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(w,2*n,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + for (k=0;k0) + { + for (l=0;lMAX_FROBENIUS) + ifail=1; + + return ifail; +} + + +complex_dble *ltl_matrix(void) +{ + if (Ns==0) + alloc_matrices(); + + return Bds; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/README new file mode 100644 index 0000000000000000000000000000000000000000..20f58d885dc507bd84453c269da4d04242a5dc9c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/README @@ -0,0 +1,59 @@ + +******************************************************************************** + + Molecular-dynamics auxiliary fields + +******************************************************************************** + + +Files +----- + +fcom.c Communication of the force variables residing at the + boundaries of the local lattices. + +mdflds.c Allocation and initialization of the MD auxiliary fields. + + + +Include file +------------ + +The file mdflds.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. 
+ + +List of functions +----------------- + +void copy_bnd_frc(void) + Copies the force variables on the boundaries of the local lattice + from the neighbouring processes. The force variables on the spatial + links at time T are fetched only in the case of periodic boundary + conditions. + +void add_bnd_frc(void) + Adds the values of the force variables on the boundaries of the + local lattice to the force field on the neighbouring processes. + The force variables on the spatial links at time T are added only + in the case of periodic boundary conditions. + +mdflds_t *mdflds(void) + Returns the pointer to a mdflds_t structure containing the force and + momentum field. The fields are automatically allocated if needed. + +void set_frc2zero(void) + Sets all force variables, including those on the boundary, to zero. + +void bnd_mom2zero(void) + Sets the components of the momentum field on the static links + to zero (see the notes). + +void random_mom(void) + Sets the elements X of the momentum field on the active links to + random values with distribution proportional to exp(tr{X^2}). On + the static links the field is set to zero (see the notes). + +double momentum_action(int icom) + Returns the action of the momentum field. The action is summed + over all MPI processes if (and only if) icom=1. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/fcom.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/fcom.c new file mode 100644 index 0000000000000000000000000000000000000000..6265fb8170b330ae64d2115fb89657aa2da232fb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/fcom.c @@ -0,0 +1,410 @@ + +/******************************************************************************* +* +* File fcom.c +* +* Copyright (C) 2010, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Communication of the force variables residing at the exterior boundaries +* of the local lattices. +* +* The externally accessible functions are +* +* void copy_bnd_frc(void) +* Copies the force variables from the neighbouring MPI processes to +* the exterior boundaries of the local lattice. The field variables +* on the spatial links at time NPROC0*L0 are fetched only in the case +* of periodic boundary conditions. +* +* void add_bnd_frc(void) +* Adds the force variables on the exterior boundaries of the local +* lattice to the field variables on the neighbouring MPI processes. +* The field variables on the spatial links at time NPROC0*L0 are +* added only in the case of periodic boundary conditions. +* +* Notes: +* +* The force field is the one returned by mdflds(). Its elements are ordered +* in the same way as those of the global gauge fields (see main/README.global +* and lattice/README.uidx). +* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. 
+* +*******************************************************************************/ + +#define FCOM_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "mdflds.h" +#include "global.h" + +static int bc,np; +static const su3_alg_dble fd0={0.0}; +static su3_alg_dble *sbuf_f0=NULL,*sbuf_fk,*rbuf_f0,*rbuf_fk; +static mdflds_t *mdfs; +static uidx_t *idx; + + +static void alloc_frcbufs(void) +{ + int ib; + + bc=bc_type(); + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; + mdfs=mdflds(); + idx=uidx(); + + sbuf_f0=amalloc(7*(BNDRY/4)*sizeof(*sbuf_f0),ALIGN); + error(sbuf_f0==NULL,1,"alloc_frcbufs [fcom.c]", + "Unable to allocate communication buffers"); + + sbuf_fk=sbuf_f0+(BNDRY/4); + rbuf_f0=(*mdfs).frc+4*VOLUME; + rbuf_fk=rbuf_f0+(BNDRY/4); + + for (ib=0;ib<(7*(BNDRY/4));ib++) + sbuf_f0[ib]=fd0; +} + + +static void pack_f0(void) +{ + int mu,nu0; + int *iu,*ium; + su3_alg_dble *f,*fb; + + fb=(*mdfs).frc; + f=sbuf_f0; + + for (mu=0;mu<4;mu++) + { + nu0=idx[mu].nu0; + + if (nu0>0) + { + iu=idx[mu].iu0; + ium=iu+nu0; + + for (;iu0) + { + if ((mu>0)||(cpr[0]>0)||(bc==3)) + { + iu=idx[mu].iuk; + ium=iu+nuk; + + for (;iu0) + { + tag=mpi_tag(); + saddr=npr[2*mu]; + raddr=npr[2*mu+1]; + nbf=8*nuk; + + if (np==0) + { + if ((mu>0)||(cpr[0]>0)||(bc==3)) + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + if ((mu>0)||(cpr[0]<(NPROC0-1))||(bc==3)) + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + if ((mu>0)||(cpr[0]<(NPROC0-1))||(bc==3)) + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + if ((mu>0)||(cpr[0]>0)||(bc==3)) + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + + sbuf+=nuk; + rbuf+=nuk; + } + } +} + + +void copy_bnd_frc(void) +{ + if (NPROC>1) + { + if (sbuf_f0==NULL) + alloc_frcbufs(); + + pack_f0(); + fwd_send_f0(); + pack_fk(); + fwd_send_fk(); + } +} + + +static void bck_send_f0(void) +{ + int mu,nu0,nbf; + int 
tag,saddr,raddr; + su3_alg_dble *sbuf,*rbuf; + MPI_Status stat; + + sbuf=rbuf_f0; + rbuf=sbuf_f0; + + for (mu=0;mu<4;mu++) + { + nu0=idx[mu].nu0; + + if (nu0>0) + { + tag=mpi_tag(); + saddr=npr[2*mu+1]; + raddr=npr[2*mu]; + nbf=8*nu0; + + if (np==0) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + + sbuf+=nu0; + rbuf+=nu0; + } + } +} + + +static void bck_send_fk(void) +{ + int mu,nuk,nbf; + int tag,saddr,raddr; + su3_alg_dble *sbuf,*rbuf; + MPI_Status stat; + + sbuf=rbuf_fk; + rbuf=sbuf_fk; + + for (mu=0;mu<4;mu++) + { + nuk=idx[mu].nuk; + + if (nuk>0) + { + tag=mpi_tag(); + saddr=npr[2*mu+1]; + raddr=npr[2*mu]; + nbf=8*nuk; + + if (np==0) + { + if ((mu>0)||(cpr[0]<(NPROC0-1))||(bc==3)) + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + if ((mu>0)||(cpr[0]>0)||(bc==3)) + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + if ((mu>0)||(cpr[0]>0)||(bc==3)) + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + if ((mu>0)||(cpr[0]<(NPROC0-1))||(bc==3)) + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + + sbuf+=nuk; + rbuf+=nuk; + } + } +} + + +static void add_f0(void) +{ + int mu,nu0; + int *iu,*ium; + su3_alg_dble *f,*fb,*frc; + + fb=(*mdfs).frc; + f=sbuf_f0; + + for (mu=0;mu<4;mu++) + { + nu0=idx[mu].nu0; + + if (nu0>0) + { + iu=idx[mu].iu0; + ium=iu+nu0; + + for (;iu0) + { + if ((mu>0)||(cpr[0]>0)||(bc==3)) + { + iu=idx[mu].iuk; + ium=iu+nuk; + + for (;iu1) + { + if (sbuf_f0==NULL) + alloc_frcbufs(); + + bck_send_fk(); + add_fk(); + bck_send_f0(); + add_f0(); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/mdflds.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/mdflds.c new file mode 100644 index 
0000000000000000000000000000000000000000..7c5926f3ee8be3fa4eadc011f238a9c74d600d94 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/mdflds/mdflds.c @@ -0,0 +1,201 @@ + +/******************************************************************************* +* +* File mdflds.c +* +* Copyright (C) 2011, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Allocation and initialization of the MD auxiliary fields. +* +* The externally accessible functions are +* +* mdflds_t *mdflds(void) +* Returns the pointer to a mdflds_t structure containing the force and +* momentum field. The fields are automatically allocated if needed. +* +* void set_frc2zero(void) +* Sets all force variables, including those on the boundary, to zero. +* +* void bnd_mom2zero(void) +* Sets the components of the momentum field on the static links +* to zero (see the notes). +* +* void random_mom(void) +* Sets the elements X of the momentum field on the active links to +* random values with distribution proportional to exp(tr{X^2}). On +* the static links the field is set to zero (see the notes). +* +* double momentum_action(int icom) +* Returns the action of the momentum field. The action is summed +* over all MPI processes if (and only if) icom=1. +* +* Notes: +* +* The arrays *.mom and *.frc in the structure returned by mdflds() are the +* molecular-dynamics momentum and force fields. Their elements are ordered +* in the same way as the link variables (see main/README.global). Moreover, +* the force field includes space for 7*(BNDRY/4) additional links as do the +* gauge fields (see lattice/README.uidx). +* +* Before the momentum and force fields are allocated, the geometry arrays +* must be set. The sets of static and active links depend on the chosen +* boundary conditions. Only the field variables on the active links are +* updated in the simulations.
+* +* The number npf of pseudo-fermion fields is retrieved from the parameter +* data base (see flags/hmc_parms.c). It is thus assumed that npf has been +* set when the programs in this module are called for the first time (the +* field array is otherwise set to NULL). +* +* Pseudo-fermion fields are of the same size NSPIN as other quark fields. +* In the structure returned by mdflds(), the address of the pseudo-fermion +* field with index ipf is *.pf[ipf]. +* +* The programs potentially perform global operations and must be called +* simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define MDFLDS_C + +#include +#include +#include +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "mdflds.h" +#include "global.h" + +static const su3_alg_dble md0={0.0}; +static mdflds_t *mdfs=NULL; + + +static void alloc_mdflds(void) +{ + int npf,ipf; + su3_alg_dble *mom; + spinor_dble **pp,*p; + hmc_parms_t hmc; + + error_root(sizeof(su3_alg_dble)!=(8*sizeof(double)),1, + "alloc_mdflds [mdflds.c]", + "The su3_alg_dble structures are not properly packed"); + + error(iup[0][0]==0,1,"alloc_mdflds [mdflds.c]", + "The geometry arrays are not set"); + + mdfs=malloc(sizeof(*mdfs)); + mom=amalloc((8*VOLUME+7*(BNDRY/4))*sizeof(*mom),ALIGN); + error((mdfs==NULL)||(mom==NULL),1,"alloc_mdflds [mdflds.c]", + "Unable to allocate momentum and force fields"); + + set_alg2zero(8*VOLUME+7*(BNDRY/4),mom); + (*mdfs).mom=mom; + (*mdfs).frc=mom+4*VOLUME; + + hmc=hmc_parms(); + npf=hmc.npf; + + if (npf>0) + { + pp=malloc(npf*sizeof(*pp)); + p=amalloc(npf*NSPIN*sizeof(*p),ALIGN); + error((pp==NULL)||(p==NULL),1,"alloc_mdflds [mdflds.c]", + "Unable to allocate pseudo-fermion fields"); + set_sd2zero(npf*NSPIN,p); + + for (ipf=0;ipf=0. 
An error occurs if x is negative + +void ks_test(int n,double f[],double *pkp,double *pkm) + For a given array f[0],f[1],...,f[n-1], the program calculates + the Kolmogorov-Smirnov statistics K_n^{+}=*pkp and K_n^{-}=*pkm + +void ks_prob(int n,double kp,double km,double *pp,double *pm) + Computes the approximate probabilities *pp and *pm for the Kolmogorov- + Smirnov statistics K_n^{+} and K_n^{-} to be less than or equal to + kp and km respectively (eq.(4) in the notes). + +double pchi_square(double chi_square,int nu) + For chi_square>=0 and nu=1,2,...,1000 the program returns an + approximation for P(chi_square|nu) which deviates from the exact + distribution by less than 10^(-8) [10^(-9) if nu=1] + +double average(int n,double *a) + Returns the average of the array elements a[0],..,a[n-1] + +double sigma0(int n,double *a) + Returns the naive statistical error of the average of the array + elements a[0],..,a[n-1] + +double auto_corr(int n,double *a,int tmax,double *g) + Computes the normalized autocorrelation function g[t] at time + separations t=0,..,tmax-1 of the sequence a[0],..,a[n-1] and + returns the value of the (unnormalized) autocorrelation function + at t=0. The inequality tmax<=n must be respected + +void sigma_auto_corr(int n,double *a,int tmax,int lambda,double *eg) + Computes the statistical error eg[t] at time t=0,..,tmax-1 of the + normalized autocorrelation function of the sequence a[0],..,a[n-1]. + The choice of the summation cutoff lambda is not critical, but it + should be set to a value not smaller than a few times the integrated + autocorrelation time of the sequence (see the notes below). The + inequality 2*tmax+lambda-1<=n must be respected + +double tauint(int n,double *a,int tmax,int lambda,int *w,double *sigma) + Returns an estimate of the integrated autocorrelation time of the + sequence a[0],..,a[n-1].
On exit the summation window determined by + the program is assigned to *w and an estimate of the statistical + error on the calculated autocorrelation time is assigned to *sigma. + The parameter tmax sets an upper limit on the summation window and + the summation cutoff lambda should be set to a value not smaller than + a few times the integrated autocorrelation time (see the notes below). + The inequality 2*tmax+lambda-1<=n must be respected + +double print_auto(int n,double *a) + Prints a table of the approximate integrated auto-correlation time + tau(w)=1/2+sum_{t=1}^w g[t] and the associated statistical error + sigma(w)=sigma0*sqrt{2*tau(w)}, where g[t] denotes the normalized + autocorrelation function of the sequence a[0],..,a[n-1]. On exit + the program returns the average of the array elements + +double jack_err(int nx,int n,double **a,double (*f)(int nx,double *x), + int bmax,double *sig) + Computes the standard estimate of an arbitrary function f() of + nx primary stochastic variables x[k], k=0,..,nx-1, for a given + sequence a[k][0],..,a[k][n-1] of values of these. The associated + jackknife errors sig[bs-1] for bin size bs=1,..,bmax are also + computed. On exit the program returns the standard estimate of + the function f() + +double print_jack(int nx,int n,double **a,double (*f)(int nx,double *x)) + Prints a table of the jackknife errors calculated by the program + jack_err(), together with the estimated integrated autocorrelation + times, as a function of the bin size bs. 
On exit the program returns + the standard estimate of the function f() diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/chebyshev.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/chebyshev.c new file mode 100644 index 0000000000000000000000000000000000000000..a75363d31ddf95fff0c0c5fd629ab2df9b811ff1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/chebyshev.c @@ -0,0 +1,376 @@ + +/******************************************************************************* +* +* File chebyshev.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Chebyshev approximation and integration +* +* The externally accessible functions are +* +* int cheby_fit(double a,double b,double (*f)(double x), +* int nmax,double eps,double c[]) +* Computes the coefficients c[0],...,c[n], with n<=nmax being the +* value returned by the program and eps the desired absolute precision +* of the approximation +* +* double cheby_val(double a,double b,int n,double c[],double x) +* Computes the value of the Chebyshev approximation at x, assuming +* the coefficients c_k are stored in the array c[0],...,c[n] +* +* double cheby_int(double a,double b,double (*f)(double x), +* int nmax,double eps) +* Computes the definite integral of f(x) in the range a<=x<=b to an +* absolute precision eps, using Chebyshev polynomials of degree n<=nmax +* +* Notes: +* +* For the numerical approximation and integration of a given function f(x), +* using the Chebyshev polynomials +* +* T_k(z)=cos(k*theta), z=cos(theta), -1<=z<=1, +* +* the function is assumed to be defined in the range a<=x<=b and to be +* available as a function program. 
The approximation is then of the form +* +* f(x)=sum{c_k*T_k(z),k=0..n}+r(x), z=(a+b-2*x)/(a-b) +* +* |r(x)| +#include +#include +#include +#include "extras.h" + +static int max_degree; +static double *alist,*clist,*flist; + + +static void allocate_arrays(int nmax) +{ + for (max_degree=16;max_degree<=nmax;) + max_degree*=2; + + alist=malloc((max_degree+1)*sizeof(double)); + clist=malloc((max_degree*2)*sizeof(double)); + flist=malloc((max_degree+1)*sizeof(double)); +} + + +static void free_arrays(void) +{ + free(alist); + free(clist); + free(flist); +} + + +static void update_clist(int n) +{ + int k,kmin,kmax,dk; + double pi,x,dx; + + pi=4.0*atan(1.0); + dx=pi/(double)(max_degree); + + kmin=0; + kmax=2*max_degree; + dk=max_degree/n; + + if (n>32) + { + kmin=dk; + dk*=2; + } + + for (k=kmin;k32) + { + kmin=dk; + kmax-=dk; + dk*=2; + } + + for (k=kmin;k<=kmax;k+=dk) + { + x=0.5*(a+b-(a-b)*clist[k]); + flist[k]=(*f)(x); + } +} + + +static void compute_alist(int n) +{ + int i,k,dk; + double sum,r; + + dk=max_degree/n; + r=2.0/(double)n; + + for (i=0;i<=n;++i) + { + sum=0.5*(flist[0]+flist[max_degree]); + if (i%2==1) + sum-=flist[max_degree]; + + for (k=dk;km[i]) + m[i]=a; + } + } + + if ((m[0]>=1.0e2*m[1])&&(m[0]>=1.0e4*m[2])&&(m[0]>=1.0e6*m[3])) + return(0); + + return(1); +} + + +static double abs_error(int n) +{ + int k,kmin; + double err; + + kmin=n/2+1; + err=0.0; + + for (k=0;k=1;--k) + { + r+=fabs(c[k]); + if (r>=eps) + break; + } + + return(k); +} + + +int cheby_fit(double a,double b,double (*f)(double x), + int nmax,double eps,double c[]) +{ + int n,k,itest; + double err; + + if ((a>=b)||(nmax<16)||(eps<=0.0)) + { + printf("Error in cheby_fit\n"); + printf("Arguments out of range\n"); + printf("Program aborted\n\n"); + exit(0); + } + + itest=1; + err=eps; + allocate_arrays(nmax); + + for (n=32;n<=max_degree;n*=2) + { + update_clist(n); + update_flist(n,a,b,f); + compute_alist(n); + + itest=test_convergence(n); + err=abs_error(n); + + if ((itest==0)&&(err=eps)) 
+ { + printf("Error in cheby_fit\n"); + printf("Specified accuracy has not been reached\n"); + printf("Program aborted\n\n"); + exit(0); + } + + n=economize(n,eps,err,c); + return(n); +} + + +double cheby_val(double a,double b,int n,double c[],double x) +{ + int k; + double u,v,w,z; + + if ((n<0)||(a>=b)||(x>b)||(x=0;--k) + { + w=z*u+v; + v=c[k]-u; + u=w; + } + + return(0.5*z*u+v); +} + + +double cheby_int(double a,double b,double (*f)(double x), + int nmax,double eps) +{ + int n,k,itest; + double err,sum; + + if ((a>=b)||(nmax<16)||(eps<=0.0)) + { + printf("Error in cheby_int\n"); + printf("Arguments out of range\n"); + printf("Program aborted\n\n"); + exit(0); + } + + itest=1; + err=eps; + sum=0.0; + allocate_arrays(nmax); + + for (n=32;n<=max_degree;n*=2) + { + update_clist(n); + update_flist(n,a,b,f); + compute_blist(n,a,b); + + itest=test_convergence(n); + err=abs_error(n); + + if ((itest==0)&&(err=0;k-=2) + sum+=alist[k]; + break; + } + } + + free_arrays(); + + if ((itest!=0)||(err>=eps)) + { + printf("Error in cheby_int\n"); + printf("Specified accuracy has not been reached\n"); + printf("Program aborted\n\n"); + exit(0); + } + + return(sum); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/fsolve.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/fsolve.c new file mode 100644 index 0000000000000000000000000000000000000000..2890e9a60185782498426b5e7d60bca032d6a8f1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/fsolve.c @@ -0,0 +1,543 @@ + +/******************************************************************************* +* +* File fsolve.c +* +* Copyright (C) 2008, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* General purpose equation solver and function minimizers +* +* The externally accessible functions are +* +* double inverse_fct(double 
y,double x1,double x2,double (*f)(double x), +* double omega1,double omega2) +* Finds a solution x of the equation f(x)=y in the interval [x1,x2] +* to an absolute precision omega1 or a relative precision omega2 +* (whichever is reached first). The points x1,x2 must be such that +* f(x1) and f(x2) have different sign +* +* double minimize_fct(double x0,double x1,double x2,double (*f)(double x), +* double omega1,double omega2) +* Finds a local minimum x of f(x) in the interval [x0,x2] to an +* absolute precision omega1 or a relative precision omega2 (whichever +* is reached first). The point x1 is taken as an initial guess of the +* position of the minimum (x0 +#include +#include +#include +#include "utils.h" +#include "extras.h" + +static int nsv=0,isv; +static double *osv,*psv,**vsv,*xsv; +static double (*fsv)(int n,double *x); + + +static int relative_sign(double f1,double f2) +{ + if (((f1>=0.0)&&(f2<=0.0))||((f1<=0.0)&&(f2>=0.0))) + return 1; + else + return 0; +} + + +double inverse_fct(double x1,double x2,double (*f)(double x),double y, + double omega1,double omega2) +{ + double x3,f1,f2,f3,dx; + double lambda,eps; + + f1=f(x1)-y; + f2=f(x2)-y; + + error((x1>x2)||(relative_sign(f1,f2)==0),1,"inverse_fct [fsolve.c]", + "Improper bracket [x1,x2]"); + + eps=0.1; + omega2*=0.5; + dx=x2-x1; + + while ((dx>omega1)&&(dx>(omega2*(x1+x2)))) + { + if (fabs(f1)f1)&&(f2>f1)) + return 0; + + if ((f(y0)>f1)&&(f(y2)>f1)) + { + (*x0)=y0; + (*x2)=y2; + return 0; + } + + for (ic=1;ic<20;ic++) + { + if (f1>f2) + { + (*x2)=(*x1); + f2=f1; + } + + (*x1)=(*x0); + f1=f0; + + (*x0)+=d0; + + if ((*x0)f1)&&(f2>f1)) + return 0; + } + + (*x0)=y1+d0; + (*x1)=y1; + (*x2)=y1+d2; + + f0=f(*x0); + f1=f(*x1); + f2=f(*x2); + + for (ic=1;ic<20;ic++) + { + if (f0y2) + (*x2)=y2; + + f2=f(*x2); + + if ((f0>f1)&&(f2>f1)) + return 0; + } + + return 1; +} + + +static double mini_fct(double x0,double x1,double x2,double (*f)(double x), + double omega1,double omega2) +{ + double s,x3,f1,f2,dx; + + 
omega2*=0.5; + s=0.5*(3.0-sqrt(5.0)); + x3=x2; + dx=x3-x0; + f1=f(x1); + + if ((x1-x0)<(x3-x1)) + { + x2=x1+s*(x3-x1); + f2=f(x2); + } + else + { + x2=x1; + f2=f1; + x1=x2-s*(x2-x0); + f1=f(x1); + } + + while ((dx>omega1)&&(dx>(omega2*(fabs(x0)+fabs(x3))))) + { + if (f1=x2),1,"minimize_fct [fsolve.c]", + "Improper input values x0,x1,x2"); + + error(find_bracket(&x0,&x1,&x2,f),1,"minimize_fct [fsolve.c]", + "Unable to bracket minimum"); + + return mini_fct(x0,x1,x2,f,omega1,omega2); +} + + +static void alloc_arrays(int n) +{ + int i,j; + + if (nsv!=n) + { + if (nsv!=0) + { + afree(osv); + afree(vsv); + } + + if (n>0) + { + osv=amalloc(n*(n+3)*sizeof(*psv),3); + vsv=amalloc(n*sizeof(*vsv),3); + + error((osv==NULL)||(vsv==NULL),1,"alloc_arrays [fsolve.c]", + "Unable to allocate auxiliary arrays"); + + psv=osv+n; + vsv[0]=psv+n; + + for (i=1;i0.0) + { + (*r0)=(x0[k]-psv[k])/v[k]; + (*r2)=(x2[k]-psv[k])/v[k]; + } + else + { + (*r0)=(x2[k]-psv[k])/v[k]; + (*r2)=(x0[k]-psv[k])/v[k]; + } + + for (j=0;jpa) + (*rom2)=pa/va; + + if (v[j]>0.0) + { + if ((psv[j]+(*r0)*v[j])x2[j]) + (*r2)=(x2[j]-psv[j])/v[j]; + } + else + { + if ((psv[j]+(*r0)*v[j])>x2[j]) + (*r0)=(x2[j]-psv[j])/v[j]; + if ((psv[j]+(*r2)*v[j])=x1[j])||(x2[j]<=x1[j])) + ifn=1; + } + + error(ifn,1,"powell [fsolve.c]","Improper parameter arrays x0,x1,x2"); + error((imx<4)||((omega1<=0.0)&&(omega2<=0.0)),1,"powell [fsolve.c]", + "Improper parameters imx,omega1 or omega2"); + + fsv=f; + alloc_arrays(n); + + for (j=0;jdel) + { + del=r0; + k=isv; + } + + fp=fe; + } + else if (i>=2) + { + for (j=0;j=x2[j])) + ifn=1; + } + + if (ifn==0) + { + fe=f(n,xsv); + r0=fe-fo; + r1=fo-fp-del; + r2=2.0*(fo-2.0*fp+fe)*r1*r1-del*r0*r0; + + if ((r0<(-4.0*DBL_EPSILON*fabs(fo)))&&(r2<0.0)) + { + for (j=0;j0) + vsv[k][j]=vsv[0][j]; + vsv[0][j]=psv[j]-osv[j]; + } + } + } + + io1=1; + io2=1; + + for (j=0;jomega1) + io1=0; + if (r0>(omega2*psv[j])) + io2=0; + + osv[j]=psv[j]; + } + + fo=fp; + + if ((i>=3)&&((io1==1)||(io2==1))) + break; + } 
+ + for (j=0;j=0. An error occurs if x is negative +* +* Notes: +* +* The Bessel function is calculated by evaluating the integral +* +* exp(-x)*I_0(x)=int_0^Pi (dt/Pi)*exp(-x*(1-cos(t))) +* +* using Chebyshev polynomials +* +*******************************************************************************/ + +#define I0M_C + +#include +#include +#include +#include +#include "utils.h" +#include "extras.h" + +static double pi,xs; + + +static double maxt(double x) +{ + double r; + + pi=4.0*atan(1.0); + + if (x<1.0) + return pi; + + r=1.0-(0.5*log(2.0*pi*x)-log(DBL_EPSILON))/x; + + if (r>=1.0) + return 0.0; + else if (r<=-1.0) + return pi; + else + return acos(r); +} + + +static double f(double t) +{ + return exp(-xs*(1.0-cos(t))); +} + + +double i0m(double x) +{ + double a,b; + + if (x==0.0) + return 1.0; + + error(x<0.0,1,"i0m [i0.c]","The argument x must be non-negative"); + + a=0.0; + b=maxt(x); + xs=x; + + if (b==0.0) + return (1.0/sqrt(2.0*pi*x))*(1.0+1.0/(8.0*x)); + + return cheby_int(a,b,f,512,10.0*DBL_EPSILON)/pi; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/ks_test.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/ks_test.c new file mode 100644 index 0000000000000000000000000000000000000000..f0651e97f44988fdb922532fa2697e479af487bd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/ks_test.c @@ -0,0 +1,147 @@ + +/******************************************************************************* +* +* File ks_test.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Kolmogorov-Smirnov test +* +* The externally accessible functions are +* +* void ks_test(int n,double f[],double *pkp,double *pkm) +* For a given array f[0],f[1],...,f[n-1], the program calculates +* the Kolmogorov-Smirnov statistics K_n^{+}=*pkp and K_n^{-}=*pkm +* +* 
void ks_prob(int n,double kp,double km,double *pp,double *pm) +* Computes the approximate probabilites *pp and *pm for the Kolmogorov- +* Smirnov statistics K_n^{+} and K_n^{-} to be less than or equal to +* kp and km respectively (eq.(4) in the notes). +* +* Notes: +* +* See the notes +* +* M. Luescher: Statistical tests +* +* for a detailed description. +* +*******************************************************************************/ + +#define KS_TEST_C + +#include +#include +#include +#include "extras.h" + + +void ks_test(int n, double f[],double *pkp,double *pkm) +{ + int *pn,k,i; + double *pu,*pv,xn,sn,x,kp,km; + + if (n<=0) + { + printf("Error in ks_test: argument out of range\n"); + printf("Program aborted\n\n"); + exit(0); + } + + pn=malloc((n+1)*sizeof(int)); + pu=malloc((n+1)*sizeof(double)); + pv=malloc((n+1)*sizeof(double)); + xn=(double)n; + + if (pn&&pu&&pv) + { + for (k=0;k<=n;k++) + { + pn[k]=0; + pu[k]=xn; + pv[k]=0.0; + } + } + else + { + printf("Error in ks_test: could not allocate auxiliary arrays\n"); + printf("Program aborted\n\n"); + exit(0); + } + + for (i=0;ixn)) + { + printf("Error in ks_test: argument out of range\n"); + printf("Program aborted\n\n"); + exit(0); + } + + k=(int)x; + pn[k]+=1; + if (xpv[k]) + pv[k]=x; + } + + sn=0.0; + kp=0.0; + km=0.0; + + for (k=0;k<=n;k++) + { + if (pn[k]>0) + { + x=pu[k]-sn; + if (x>km) + km=x; + sn+=(double)pn[k]; + x=sn-pv[k]; + if (x>kp) + kp=x; + } + } + + *pkp=kp/sqrt(xn); + *pkm=km/sqrt(xn); + + free(pn); + free(pu); + free(pv); +} + + +void ks_prob(int n,double kp,double km,double *pp,double *pm) +{ + double xn; + + if (n<=0) + { + printf("Error in ks_prob: argument out of range\n"); + printf("Program aborted\n\n"); + exit(0); + } + + xn=(double)n; + + if (kp<1e-8) + *pp=0.0; + else if (kp>3.5) + *pp=1.0; + else + *pp=1.0-exp(-2.0*kp*kp)*(1.0-2.0*kp/(3.0*sqrt(xn))); + + if (km<1e-8) + *pm=0.0; + else if (km>3.5) + *pm=1.0; + else + *pm=1.0-exp(-2.0*km*km)*(1.0-2.0*km/(3.0*sqrt(xn))); +} + 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/pchi_square.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/pchi_square.c new file mode 100644 index 0000000000000000000000000000000000000000..cdc28d9c0f8178d37d0e668a0c64c6cb0667d1ff --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/extras/pchi_square.c @@ -0,0 +1,205 @@ + +/******************************************************************************* +* +* File pchi_square.c +* +* Copyright (C) 2005, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Chi-square probability distribution +* +* The externally accessible function is +* +* double pchi_square(double chi_square,int nu) +* For chi_square>=0 and nu=1,2,...,1000 the program returns an +* approximation for P(chi_square|nu) which deviates from the exact +* distribution by less than 10^(-8) [10^(-9) if nu=1] +* +* Notes: +* +* See the notes +* +* M. Luescher: Statistical tests +* +* for a detailed description. 
+* +*******************************************************************************/ + +#define PCHI_SQUARE_C + +#include +#include +#include +#include "extras.h" + +static int init=0; +static double c0,c1,c2,c3,c4,c5,lng[40]; +static double xd0,xd1,xd2,xd3,xd4,pi; + + +static void define_constants(void) +{ + int n; + double x; + + xd0=0.0; + xd1=1.0; + xd2=2.0; + xd3=3.0; + xd4=4.0; + + pi=xd4*atan(xd1); + + c0=-xd1/xd2; + c1=log(xd2*pi)/xd2; + c2= xd1/12.0; + c3=-xd1/360.0; + c4= xd1/1260.0; + c5=-xd1/1680.0; + + lng[1]=log(pi)/xd2; + lng[2]=xd0; + + for (n=3;n<40;++n) + { + x=(double)(n-2); + lng[n]=lng[n-2]+log(x/xd2); + } + + init=1; +} + + +static double ln_gamma(int k) +{ + double y,z,zm1,zm2; + + if (k<40) + return lng[k]; + + z=(double)k; + z=z/xd2; + zm1=xd1/z; + zm2=zm1*zm1; + + y=c5; + y=y*zm2+c4; + y=y*zm2+c3; + y=y*zm2+c2; + y=y*zm1+c1; + + return (z+c0)*log(z)-z+y; +} + + +static double pchi1(double chi_square) +{ + double x,y,z,a,p; + + x=chi_square; + + if (x<=1.0e-18) + return xd0; + if (x>=40.0) + return xd1; + + z=x/xd2; + a=xd2*sqrt(z/pi)*exp(-z); + + y=xd0; + z=xd3; + p=x/z; + + for (;a>(xd1-p)*1.0e-9;) + { + y+=a; + a*=p; + z+=xd2; + p=x/z; + } + return y; +} + + +static double pchi2(double chi_square,int nu) +{ + double x,y,z,xnu,lna,a,p; + + x=chi_square; + xnu=(double)nu; + + if (x<=1.0e-18) + return xd0; + + if (x<=xnu) + { + z=x/xd2; + lna=(xnu/xd2)*log(z)-z-ln_gamma(nu+2); + + z=xnu+xd2; + p=x/z; + y=xd0; + + if ((lna-log(xd1-p))<-18.5) + return y; + + a=exp(lna); + + for (;a>(xd1-p)*1.0e-8;) + { + y+=a; + a*=p; + z+=xd2; + p=x/z; + } + return y; + } + else + { + z=x/xd2; + lna=((xnu/xd2-xd1)*log(z)-z)-ln_gamma(nu); + + z=xnu-xd2; + p=z/x; + if (nu%2==1) + y=pchi1(x); + else + y=xd1; + + if ((lna-log(xd1-p))<-18.5) + return y; + + a=exp(lna); + + for (;(z>=xd0)&&(a>(xd1-p)*9.0e-9);) + { + y-=a; + a*=p; + z-=xd2; + p=z/x; + } + return y; + } +} + + +double pchi_square(double chi_square,int nu) +{ + if (init==0) + define_constants(); + + 
if ((nu<1)||(nu>1000)||(chi_square +* +* The computation of the autocorrelation function and the integrated +* autocorrelation time follows the lines of appendix A of +* +* M. L"uscher, Schwarz-preconditioned HMC algorithm for two-flavor +* lattice QCD, Comput. Phys. Commun. 165 (2005) 199 [hep-lat/0409106] +* +* In particular, the summation cutoff lambda is introduced there and +* the selection of the summation window *w is explained +* +* The programs in this module may be used in MPI programs, but should then +* only be called from the root process +* +*******************************************************************************/ + +#define STAT_C + +#include +#include +#include +#include +#include "utils.h" +#include "extras.h" + + +double average(int n,double *a) +{ + int i; + double abar; + + error_root(n<1,1,"average [stat.c]", + "Argument n is out of range (should be at least 1)"); + + abar=0.0; + + for (i=0;in),1,"auto_corr [stat.c]", + "Argument n or tmax is out of range"); + + abar=average(n,a); + g0=sigma0(n,a); + + if (g0<=(10.0*DBL_EPSILON*fabs(abar))) + { + g0=0.0; + + for (t=0;tn),1, + "sigma_auto_corr [stat.c]", + "Argument n, tmax or lambda is out of range"); + + g=amalloc(tmaxx*sizeof(*g),3); + error_root(g==NULL,1,"sigma_auto_corr [stat.c]", + "Unable to allocate auxiliary array"); + + auto_corr(n,a,tmaxx,g); + sigma_corr(n,tmax,lambda,g,eg); + + afree(g); +} + + +double tauint(int n,double *a,int tmax,int lambda,int *w,double *sigma) +{ + int t,tmaxx; + double tau,g0; + double *g,*eg; + + tmaxx=2*tmax+lambda-1; + + error_root((n<2)||(tmax<1)||(lambda<1)||(tmaxx>n),1,"tauint [stat.c]", + "Argument n, tmax or lambda is out of range"); + + g=amalloc(tmaxx*sizeof(*g),3); + eg=amalloc(tmax*sizeof(*eg),3); + + error_root((g==NULL)||(eg==NULL),1,"tauint [stat.c]", + "Unable to allocate auxiliary arrays"); + + g0=auto_corr(n,a,tmaxx,g); + sigma_corr(n,tmax,lambda,g,eg); + + tau=0.5; + (*w)=1; + (*sigma)=0.0; + + if (g0!=0.0) + { + for 
(t=1;t=8)&&(iw>=4)) + { + iw=0; + dw*=2; + } + + w+=dw; + iw+=1; + } + } + + printf("\n"); + afree(ga); + afree(ta); + + return abar; +} + + +static double javg(int nx,int n,double **a,double (*f)(int nx,double *x)) +{ + int i; + double *x,fbar; + + x=amalloc(nx*sizeof(*x),3); + error_root(x==NULL,1,"javg [stat.c]","Unable to allocate auxiliary array"); + + for (i=0;in),1,"jack_err [stat.c]", + "Argument nx,n or bmax is out of range"); + + b=amalloc(nx*sizeof(*b),3); + p=amalloc(nx*n*sizeof(*p),3); + + error_root((b==NULL)||(p==NULL),1,"jack_err [stat.c]", + "Unable to allocate auxiliary arrays"); + + for (i=0;i=8)&&(ibs>=4)) + { + ibs=0; + dbs*=2; + } + + bs+=dbs; + ibs+=1; + } + + printf("\n"); + afree(sig); + + return fbar; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/utils/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/utils/README new file mode 100644 index 0000000000000000000000000000000000000000..b6d13b3aef05227eea2a3bd989b2cc9e09e75834 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/nompi/utils/README @@ -0,0 +1,109 @@ + +******************************************************************************** + + Basic utility functions + +******************************************************************************** + + +Files +----- + +endian.c Byte swapping programs + +mutils.c Utility programs used in main programs + +utils.c Collection of basic utility programs + + +Include file +------------ + +The file utils.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + + +List of functions +----------------- + +int endianness(void) + Returns LITTLE_ENDIAN if the machine is little endian and BIG_ENDIAN + if it is big endian. 
Otherwise the return value is UNKNOWN_ENDIAN + +void bswap_int(int n,void *a) + Inverts the byte order of the array elements a[0],..,a[n-1] + assuming these are 4 byte long + +void bswap_double(int n,void *a) + Inverts the byte order of the array elements a[0],..,a[n-1] + assuming these are 8 byte long + +int find_opt(int argc,char *argv[],char *opt) + Returns the position of the option opt in the array argv[]. Only + the elements argv[1],..,argv[argc-1] are scanned and 0 is returned + if opt is not found. + +int digits(double x,double dx,char *fmt) + Assuming x is a value with error dx, this program returns the number n + of fractional digits to print so that all significant digits plus two + more are shown. The print format fmt has to be "e" or "f" depending on + whether the number is to be printed using the "%.ne" or "%.nf" format + string. In the second case dx has to be in the range 0 +#include +#include "utils.h" + + +int endianness(void) +{ + stduint_t i; + unsigned char *b; + + i=0x04030201; + b=(unsigned char*)(&i); + + if ((b[0]==1u)&&(b[1]==2u)&&(b[2]==3u)&&(b[3]==4u)) + return LITTLE_ENDIAN; + else if ((b[0]==4u)&&(b[1]==3u)&&(b[2]==2u)&&(b[3]==1u)) + return BIG_ENDIAN; + else return UNKNOWN_ENDIAN; +} + + +void bswap_int(int n,void *a) +{ + unsigned char *ba,*bam,bas; + + ba=(unsigned char*)(a); + bam=ba+4*n; + + for (;ba +#include +#include +#include +#include +#include "utils.h" + +static char line[NAME_SIZE+1]; +static char inum[3*sizeof(int)+4]; + + +int find_opt(int argc,char *argv[],char *opt) +{ + int k; + + for (k=1;k=fabs(x)) + return 1; + else + return (int)(floor(1.0+log10(fabs(x)))-floor(log10(dx))); + } + else if (strcmp(fmt,"f")==0) + { + error((dx==0.0)||(dx>=1.0),1,"digits [mutils.c]", + "Improper input data (error out of range for fixed format)"); + + return (int)(1.0-floor(log10(dx))); + } + else + error(1,1,"digits [mutils.c]","Unknown data format"); + + return 0; +} + + +int fdigits(double x) +{ + int m,n,ne,k; + double y,z; + + if 
(x==0.0) + return 0; + + y=fabs(x); + z=DBL_EPSILON*y; + m=floor(log10(y+z)); + n=0; + ne=1; + + for (k=0;k<(DBL_DIG-m);k++) + { + z=sqrt((double)(ne))*DBL_EPSILON*y; + + if (((y-floor(y))<=z)||((ceil(y)-y)<=z)) + break; + + y*=10.0; + ne+=1; + n+=1; + } + + return n; +} + + +int name_size(char *format,...) +{ + int nlen,ie,n; + double dmy; + char *pp,*pc; + va_list args; + + va_start(args,format); + pc=format; + nlen=strlen(format); + ie=0; + n=0; + + for (;;) + { + pp=strchr(pc,'%'); + + if (pp==NULL) + break; + + pc=pp+1; + + if (pc[0]=='s') + nlen+=(strlen(va_arg(args,char*))-2); + else if (pc[0]=='d') + { + sprintf(inum,"%d",va_arg(args,int)); + nlen+=(strlen(inum)-2); + } + else if (pc[0]=='.') + { + if (sscanf(pc,".%d",&n)!=1) + { + ie=1; + break; + } + + sprintf(inum,".%df",n); + pp=strstr(pc,inum); + + if (pp!=pc) + { + ie=2; + break; + } + + nlen+=(n+1-strlen(inum)); + dmy=va_arg(args,double); + if (dmy<0.0) + nlen+=1; + } + else + { + ie=3; + break; + } + } + + va_end(args); + error(ie!=0,1,"name_size [mutils.c]", + "Incorrect format string %s (ie=%d)",format,ie); + return nlen; +} + + +static int cmp_text(char *text1,char *text2) +{ + size_t n1,n2; + char *p1,*p2; + + p1=text1; + p2=text2; + + while (1) + { + p1+=strspn(p1," \t\n"); + p2+=strspn(p2," \t\n"); + n1=strcspn(p1," \t\n"); + n2=strcspn(p2," \t\n"); + + if (n1!=n2) + return 0; + if (n1==0) + return 1; + if (strncmp(p1,p2,n1)!=0) + return 0; + + p1+=n1; + p2+=n1; + } +} + + +static char *get_line(FILE *stream) +{ + char *s,*c; + + s=fgets(line,NAME_SIZE+1,stream); + + if (s!=NULL) + { + error(strlen(line)==NAME_SIZE,1,"get_line [mutils.c]", + "Input line is longer than NAME_SIZE-1"); + + c=strchr(line,'#'); + if (c!=NULL) + c[0]='\0'; + } + + return s; +} + + +long find_section(FILE *stream,char *title) +{ + int ie; + long ofs,sofs; + char *s,*pl,*pr; + + + rewind(stream); + sofs=-1L; + ofs=ftell(stream); + s=get_line(stream); + + while (s!=NULL) + { + pl=strchr(line,'['); + 
pr=strchr(line,']'); + + if ((pl==(line+strspn(line," \t")))&&(pr>pl)) + { + pl+=1; + pr[0]='\0'; + + if (cmp_text(pl,title)==1) + { + error(sofs>=0L,1,"find_section [mutils.c]", + "Section [%s] occurs more than once",title); + sofs=ofs; + } + } + + ofs=ftell(stream); + s=get_line(stream); + } + + error(sofs==-1L,1,"find_section [mutils.c]", + "Section [%s] not found",title); + ie=fseek(stream,sofs,SEEK_SET); + error(ie!=0,1,"find_section [mutils.c]", + "Unable to go to section [%s]",title); + get_line(stream); + + return sofs; +} + + +static void check_tag(char *tag) +{ + if (tag[0]=='\0') + return; + + error((strspn(tag," 0123456789.")!=0L)|| + (strcspn(tag," \n")!=strlen(tag)),1, + "check_tag [mutils.c]","Improper tag %s",tag); +} + + +static long find_tag(FILE *stream,char *tag) +{ + int ie; + long tofs,lofs,ofs; + char *s,*pl,*pr; + + ie=0; + tofs=-1L; + lofs=ftell(stream); + rewind(stream); + ofs=ftell(stream); + s=get_line(stream); + + while (s!=NULL) + { + pl=strchr(line,'['); + pr=strchr(line,']'); + + if ((pl==(line+strspn(line," \t")))&&(pr>pl)) + { + if (ofs +#include +#include +#include +#include "utils.h" + +struct addr_t +{ + char *addr; + char *true_addr; + struct addr_t *last,*next; +}; + +static struct addr_t *rpos=NULL; + + +int safe_mod(int x,int y) +{ + if (x>=0) + return(x%y); + else + return((y-(abs(x)%y))%y); +} + + +void *amalloc(size_t size,int p) +{ + int shift; + char *true_addr,*addr; + unsigned long mask; + struct addr_t *new,*rnxt; + + if ((size<=0)||(p<0)) + return(NULL); + + shift=1<=rlxd_size() + +void rlxd_reset(int state[]) + Resets the generator to the state defined by the array state[N] + +void ranlxs(float r[],int n) + Computes the next n single-precision random numbers and + assigns them to the elements r[0],...,r[n-1] of the array r[] + +void rlxs_init(int level,int seed) + Initialization of the generator + +int rlxs_size(void) + Returns the number of integers required to save the state of + the generator + +void 
rlxs_get(int state[]) + Extracts the current state of the generator and stores the + information in the array state[N] where N>=rlxs_size() + +void rlxs_reset(int state[]) + Resets the generator to the state defined by the array state[N] diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/gauss.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/gauss.c new file mode 100644 index 0000000000000000000000000000000000000000..463cd662b27531d468526a36d8f51057f9086fad --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/gauss.c @@ -0,0 +1,97 @@ + +/******************************************************************************* +* +* File gauss.c +* +* Copyright (C) 2005 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Generation of Gaussian random numbers +* +* The externally accessible functions are +* +* void gauss(float r[],int n) +* Generates n single-precision Gaussian random numbers x with distribution +* proportional to exp(-x^2) and assigns them to r[0],..,r[n-1] +* +* void gauss_dble(double rd[],int n) +* Generates n double-precision Gaussian random numbers x with distribution +* proportional to exp(-x^2) and assigns them to rd[0],..,rd[n-1] +* +*******************************************************************************/ + +#define GAUSS_C + +#include +#include +#include +#include "utils.h" +#include "random.h" + +static int init=0; +static double twopi; + + +void gauss(float r[],int n) +{ + int k; + float u[2]; + double x1,x2,rho,y1,y2; + + if (init==0) + { + twopi=8.0*atan(1.0); + init=1; + } + + for (k=0;k +#include +#include +#include "mpi.h" +#include "utils.h" +#include "lattice.h" +#include "random.h" +#include "global.h" + +static int *rlxs_state=NULL,*rlxd_state; +static stdint_t *state; + + +static int check_machine(void) +{ + int ie; + + 
error_root(sizeof(stdint_t)!=4,1,"check_machine [ranlux.c]", + "Size of a stdint_t integer is not 4"); + + ie=endianness(); + error_root(ie==UNKNOWN_ENDIAN,1,"check_machine [ranlux.c]", + "Unkown endianness"); + + return ie; +} + + +static int alloc_state(void) +{ + int nlxs,nlxd,n; + + nlxs=rlxs_size(); + nlxd=rlxd_size(); + n=nlxs+nlxd; + + if (rlxs_state==NULL) + { + rlxs_state=malloc(n*sizeof(int)); + rlxd_state=rlxs_state+nlxs; + state=malloc(n*sizeof(stdint_t)); + error((rlxs_state==NULL)||(state==NULL),1,"alloc_state [ranlux.c]", + "Unable to allocate state arrays"); + } + + return n; +} + + +static int get_ip(int n) +{ + int np[4]; + + np[3]=n%NPROC3; + n/=NPROC3; + np[2]=n%NPROC2; + n/=NPROC2; + np[1]=n%NPROC1; + n/=NPROC1; + np[0]=n; + + return ipr_global(np); +} + + +static void get_state(void) +{ + rlxs_get(rlxs_state); + rlxd_get(rlxd_state); +} + + +static void reset_state(void) +{ + rlxs_reset(rlxs_state); + rlxd_reset(rlxd_state); +} + + +void start_ranlux(int level,int seed) +{ + int my_rank,max_seed,loc_seed; + int n,iprms[2]; + + if (NPROC>1) + { + iprms[0]=level; + iprms[1]=seed; + + MPI_Bcast(iprms,2,MPI_INT,0,MPI_COMM_WORLD); + + error((iprms[0]!=level)||(iprms[1]!=seed),1, + "start_ranlux [ranlux.c]","Input parameters are not global"); + } + + max_seed=INT_MAX/NPROC; + + error_root((level<0)||(level>1)||(seed<1)||(seed>max_seed),1, + "start_ranlux [ranlux.c]","Parameters are out of range"); + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + loc_seed=0; + + for (n=0;n0) + { + if (my_rank==0) + { + MPI_Send(&dmy,1,MPI_INT,ip,tag0,MPI_COMM_WORLD); + MPI_Recv(rlxs_state,ns,MPI_INT,ip,tag1,MPI_COMM_WORLD,&stat); + } + else if (my_rank==ip) + { + get_state(); + MPI_Recv(&dmy,1,MPI_INT,0,tag0,MPI_COMM_WORLD,&stat); + MPI_Send(rlxs_state,ns,MPI_INT,0,tag1,MPI_COMM_WORLD); + } + } + else if (my_rank==0) + get_state(); + + if (my_rank==0) + { + for (k=0;k0) + { + if (my_rank==0) + { + MPI_Send(rlxs_state,ns,MPI_INT,ip,tag1,MPI_COMM_WORLD); + 
MPI_Recv(&dmy,1,MPI_INT,ip,tag0,MPI_COMM_WORLD,&stat); + } + else if (my_rank==ip) + { + MPI_Recv(rlxs_state,ns,MPI_INT,0,tag1,MPI_COMM_WORLD,&stat); + MPI_Send(&dmy,1,MPI_INT,0,tag0,MPI_COMM_WORLD); + reset_state(); + } + } + else if (my_rank==0) + reset_state(); + } + + error_chk(); + + if (my_rank==0) + { + n=(int)(lsize[0]); + error_root(ir!=(9+NPROC*ns),1,"import_ranlux [ranlux.c]", + "Incorrect read count"); + fclose(fin); + } + + MPI_Bcast(&n,1,MPI_INT,0,MPI_COMM_WORLD); + + return n; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/ranlxd.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/ranlxd.c new file mode 100644 index 0000000000000000000000000000000000000000..5c9cee3a0c6bcda3029a52728c2641ff8fae5a3e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/ranlxd.c @@ -0,0 +1,610 @@ + +/******************************************************************************* +* +* File ranlxd.c +* +* Copyright (C) 2005, 2008, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Random number generator "ranlxd" version 3.0. See the notes +* +* "User's guide for ranlxs and ranlxd v3.0" (May 2001) +* +* "Algorithms used in ranlux v3.0" (May 2001) +* +* for a detailed description. 
+* +* The externally accessible functions are +* +* void ranlxd(double r[],int n) +* Computes the next n double-precision random numbers and +* assigns them to the elements r[0],...,r[n-1] of the array r[] +* +* void rlxd_init(int level,int seed) +* Initialization of the generator +* +* int rlxd_size(void) +* Returns the number of integers required to save the state of +* the generator +* +* void rlxd_get(int state[]) +* Extracts the current state of the generator and stores the +* information in the array state[N] where N>=rlxd_size() +* +* void rlxd_reset(int state[]) +* Resets the generator to the state defined by the array state[N] +* +*******************************************************************************/ + +#define RANLXD_C + +#include +#include +#include +#include +#include +#include "utils.h" +#include "random.h" + +#if (defined x64) + +typedef struct +{ + float c1,c2,c3,c4; +} vec_t __attribute__ ((aligned (16))); + +typedef struct +{ + vec_t c1,c2; +} dble_vec_t __attribute__ ((aligned (16))); + +static int init=0,pr,prm,ir,jr,is,is_old,next[96]; +static vec_t one,one_bit,carry; + +static union +{ + dble_vec_t vec[12]; + float num[96]; +} x __attribute__ ((aligned (16))); + +#define STEP(pi,pj) \ + __asm__ __volatile__ ("movaps %4, %%xmm4 \n\t" \ + "movaps %%xmm2, %%xmm3 \n\t" \ + "subps %2, %%xmm4 \n\t" \ + "movaps %%xmm1, %%xmm5 \n\t" \ + "cmpps $0x6, %%xmm4, %%xmm2 \n\t" \ + "andps %%xmm2, %%xmm5 \n\t" \ + "subps %%xmm3, %%xmm4 \n\t" \ + "andps %%xmm0, %%xmm2 \n\t" \ + "addps %%xmm4, %%xmm5 \n\t" \ + "movaps %%xmm5, %0 \n\t" \ + "movaps %5, %%xmm6 \n\t" \ + "movaps %%xmm2, %%xmm3 \n\t" \ + "subps %3, %%xmm6 \n\t" \ + "movaps %%xmm1, %%xmm7 \n\t" \ + "cmpps $0x6, %%xmm6, %%xmm2 \n\t" \ + "andps %%xmm2, %%xmm7 \n\t" \ + "subps %%xmm3, %%xmm6 \n\t" \ + "andps %%xmm0, %%xmm2 \n\t" \ + "addps %%xmm6, %%xmm7 \n\t" \ + "movaps %%xmm7, %1" \ + : \ + "=m" ((*pi).c1), \ + "=m" ((*pi).c2) \ + : \ + "m" ((*pi).c1), \ + "m" ((*pi).c2), \ + "m" ((*pj).c1), 
\ + "m" ((*pj).c2) \ + : \ + "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7") + + +static void update(void) +{ + int k,kmax; + dble_vec_t *pmin,*pmax,*pi,*pj; + + kmax=pr; + pmin=&x.vec[0]; + pmax=pmin+12; + pi=&x.vec[ir]; + pj=&x.vec[jr]; + + __asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" + "movaps %1, %%xmm1 \n\t" + "movaps %2, %%xmm2" + : + : + "m" (one_bit), + "m" (one), + "m" (carry) + : + "xmm0", "xmm1", "xmm2"); + + for (k=0;k=12) + ir-=12; + if (jr>=12) + jr-=12; + is=8*ir; + is_old=is; +} + + +static void define_constants(void) +{ + int k; + float b; + + one.c1=1.0f; + one.c2=1.0f; + one.c3=1.0f; + one.c4=1.0f; + + b=(float)(ldexp(1.0,-24)); + one_bit.c1=b; + one_bit.c2=b; + one_bit.c3=b; + one_bit.c4=b; + + for (k=0;k<96;k++) + { + next[k]=(k+1)%96; + if ((k%4)==3) + next[k]=(k+5)%96; + } +} + + +void rlxd_init(int level,int seed) +{ + int i,k,l; + int ibit,jbit,xbit[31]; + int ix,iy; + + define_constants(); + + error_loc((level<1)||(level>2),1,"rlxd_init [ranlxd.c]", + "Bad choice of luxury level (should be 1 or 2)"); + + if (level==1) + pr=202; + else if (level==2) + pr=397; + + i=seed; + + for (k=0;k<31;k++) + { + xbit[k]=i%2; + i/=2; + } + + error_loc((seed<=0)||(i!=0),1,"rlxd_init [ranlxd.c]", + "Bad choice of seed (should be between 1 and 2^31-1)"); + + ibit=0; + jbit=18; + + for (i=0;i<4;i++) + { + for (k=0;k<24;k++) + { + ix=0; + + for (l=0;l<24;l++) + { + iy=xbit[ibit]; + ix=2*ix+iy; + + xbit[ibit]=(xbit[ibit]+xbit[jbit])%2; + ibit=(ibit+1)%31; + jbit=(jbit+1)%31; + } + + if ((k%4)!=i) + ix=16777215-ix; + + x.num[4*k+i]=(float)(ldexp((double)(ix),-24)); + } + } + + carry.c1=0.0f; + carry.c2=0.0f; + carry.c3=0.0f; + carry.c4=0.0f; + + ir=0; + jr=7; + is=91; + is_old=0; + prm=pr%12; + init=1; +} + + +void ranlxd(double r[],int n) +{ + int k; + + if (init==0) + rlxd_init(1,1); + + for (k=0;k=167777216),1, + "rlxd_reset [ranlxd.c]","Unexpected input data"); + + x.num[k]=(float)(ldexp((double)(state[k+1]),-24)); + } + + 
error_loc(((state[97]!=0)&&(state[97]!=1))|| + ((state[98]!=0)&&(state[98]!=1))|| + ((state[99]!=0)&&(state[99]!=1))|| + ((state[100]!=0)&&(state[100]!=1)),1, + "rlxd_reset [ranlxd.c]","Unexpected input data"); + + carry.c1=(float)(ldexp((double)(state[97]),-24)); + carry.c2=(float)(ldexp((double)(state[98]),-24)); + carry.c3=(float)(ldexp((double)(state[99]),-24)); + carry.c4=(float)(ldexp((double)(state[100]),-24)); + + pr=state[101]; + ir=state[102]; + jr=state[103]; + is=state[104]; + is_old=8*ir; + prm=pr%12; + init=1; + + error_loc(((pr!=202)&&(pr!=397))|| + (ir<0)||(ir>11)||(jr<0)||(jr>11)||(jr!=((ir+7)%12))|| + (is<0)||(is>91),1, + "rlxd_reset [ranlxd.c]","Unexpected input data"); +} + +#else + +#define BASE 0x1000000 +#define MASK 0xffffff + +typedef struct +{ + int c1,c2,c3,c4; +} vec_t; + +typedef struct +{ + vec_t c1,c2; +} dble_vec_t; + +static int init=0,pr,prm,ir,jr,is,is_old,next[96]; +static double one_bit; +static vec_t carry; + +static union +{ + dble_vec_t vec[12]; + int num[96]; +} x; + +#define STEP(pi,pj) \ + d=(*pj).c1.c1-(*pi).c1.c1-carry.c1; \ + (*pi).c2.c1+=(d<0); \ + d+=BASE; \ + (*pi).c1.c1=d&MASK; \ + d=(*pj).c1.c2-(*pi).c1.c2-carry.c2; \ + (*pi).c2.c2+=(d<0); \ + d+=BASE; \ + (*pi).c1.c2=d&MASK; \ + d=(*pj).c1.c3-(*pi).c1.c3-carry.c3; \ + (*pi).c2.c3+=(d<0); \ + d+=BASE; \ + (*pi).c1.c3=d&MASK; \ + d=(*pj).c1.c4-(*pi).c1.c4-carry.c4; \ + (*pi).c2.c4+=(d<0); \ + d+=BASE; \ + (*pi).c1.c4=d&MASK; \ + d=(*pj).c2.c1-(*pi).c2.c1; \ + carry.c1=(d<0); \ + d+=BASE; \ + (*pi).c2.c1=d&MASK; \ + d=(*pj).c2.c2-(*pi).c2.c2; \ + carry.c2=(d<0); \ + d+=BASE; \ + (*pi).c2.c2=d&MASK; \ + d=(*pj).c2.c3-(*pi).c2.c3; \ + carry.c3=(d<0); \ + d+=BASE; \ + (*pi).c2.c3=d&MASK; \ + d=(*pj).c2.c4-(*pi).c2.c4; \ + carry.c4=(d<0); \ + d+=BASE; \ + (*pi).c2.c4=d&MASK + + +static void update(void) +{ + int k,kmax,d; + dble_vec_t *pmin,*pmax,*pi,*pj; + + kmax=pr; + pmin=&x.vec[0]; + pmax=pmin+12; + pi=&x.vec[ir]; + pj=&x.vec[jr]; + + for (k=0;k=12) + ir-=12; + if 
(jr>=12) + jr-=12; + is=8*ir; + is_old=is; +} + + +static void define_constants(void) +{ + int k; + + one_bit=ldexp(1.0,-24); + + for (k=0;k<96;k++) + { + next[k]=(k+1)%96; + if ((k%4)==3) + next[k]=(k+5)%96; + } +} + + +void rlxd_init(int level,int seed) +{ + int i,k,l; + int ibit,jbit,xbit[31]; + int ix,iy; + + error_loc((INT_MAX<2147483647)||(FLT_RADIX!=2)||(FLT_MANT_DIG<24)|| + (DBL_MANT_DIG<48),1,"rlxd_init [ranlxd.c]", + "Arithmetic on this machine is not suitable for ranlxd"); + + define_constants(); + + error_loc((level<1)||(level>2),1,"rlxd_init [ranlxd.c]", + "Bad choice of luxury level (should be 1 or 2)"); + + if (level==1) + pr=202; + else if (level==2) + pr=397; + + i=seed; + + for (k=0;k<31;k++) + { + xbit[k]=i%2; + i/=2; + } + + error_loc((seed<=0)||(i!=0),1,"rlxd_init [ranlxd.c]", + "Bad choice of seed (should be between 1 and 2^31-1)"); + + ibit=0; + jbit=18; + + for (i=0;i<4;i++) + { + for (k=0;k<24;k++) + { + ix=0; + + for (l=0;l<24;l++) + { + iy=xbit[ibit]; + ix=2*ix+iy; + + xbit[ibit]=(xbit[ibit]+xbit[jbit])%2; + ibit=(ibit+1)%31; + jbit=(jbit+1)%31; + } + + if ((k%4)!=i) + ix=16777215-ix; + + x.num[4*k+i]=ix; + } + } + + carry.c1=0; + carry.c2=0; + carry.c3=0; + carry.c4=0; + + ir=0; + jr=7; + is=91; + is_old=0; + prm=pr%12; + init=1; +} + + +void ranlxd(double r[],int n) +{ + int k; + + if (init==0) + rlxd_init(1,1); + + for (k=0;k=167777216),1, + "rlxd_reset [ranlxd.c]","Unexpected input data"); + + x.num[k]=state[k+1]; + } + + error_loc(((state[97]!=0)&&(state[97]!=1))|| + ((state[98]!=0)&&(state[98]!=1))|| + ((state[99]!=0)&&(state[99]!=1))|| + ((state[100]!=0)&&(state[100]!=1)),1, + "rlxd_reset [ranlxd.c]","Unexpected input data"); + + carry.c1=state[97]; + carry.c2=state[98]; + carry.c3=state[99]; + carry.c4=state[100]; + + pr=state[101]; + ir=state[102]; + jr=state[103]; + is=state[104]; + is_old=8*ir; + prm=pr%12; + init=1; + + error_loc(((pr!=202)&&(pr!=397))|| + (ir<0)||(ir>11)||(jr<0)||(jr>11)||(jr!=((ir+7)%12))|| + 
(is<0)||(is>91),1, + "rlxd_reset [ranlxd.c]","Unexpected input data"); +} + +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/ranlxs.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/ranlxs.c new file mode 100644 index 0000000000000000000000000000000000000000..10752f3fd667a21fb16e2a298bd73b020d087a53 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/random/ranlxs.c @@ -0,0 +1,605 @@ + +/******************************************************************************* +* +* File ranlxs.c +* +* Copyright (C) 2005, 2008, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Random number generator "ranlxs" version 3.0. See the notes +* +* "User's guide for ranlxs and ranlxd v3.0" (May 2001) +* +* "Algorithms used in ranlux v3.0" (May 2001) +* +* for a detailed description. +* +* The externally accessible functions are +* +* void ranlxs(float r[],int n) +* Computes the next n single-precision random numbers and +* assigns them to the elements r[0],...,r[n-1] of the array r[] +* +* void rlxs_init(int level,int seed) +* Initialization of the generator +* +* int rlxs_size(void) +* Returns the number of integers required to save the state of +* the generator +* +* void rlxs_get(int state[]) +* Extracts the current state of the generator and stores the +* information in the array state[N] where N>=rlxs_size() +* +* void rlxs_reset(int state[]) +* Resets the generator to the state defined by the array state[N] +* +*******************************************************************************/ + +#define RANLXS_C + +#include +#include +#include +#include +#include +#include "utils.h" +#include "random.h" + +#if (defined x64) + +typedef struct +{ + float c1,c2,c3,c4; +} vec_t __attribute__ ((aligned (16))); + +typedef struct +{ + vec_t c1,c2; +} dble_vec_t __attribute__ ((aligned (16))); + 
+static int init=0,pr,prm,ir,jr,is,is_old,next[96]; +static vec_t one,one_bit,carry; + +static union +{ + dble_vec_t vec[12]; + float num[96]; +} x __attribute__ ((aligned (16))); + +#define STEP(pi,pj) \ + __asm__ __volatile__ ("movaps %4, %%xmm4 \n\t" \ + "movaps %%xmm2, %%xmm3 \n\t" \ + "subps %2, %%xmm4 \n\t" \ + "movaps %%xmm1, %%xmm5 \n\t" \ + "cmpps $0x6, %%xmm4, %%xmm2 \n\t" \ + "andps %%xmm2, %%xmm5 \n\t" \ + "subps %%xmm3, %%xmm4 \n\t" \ + "andps %%xmm0, %%xmm2 \n\t" \ + "addps %%xmm4, %%xmm5 \n\t" \ + "movaps %%xmm5, %0 \n\t" \ + "movaps %5, %%xmm6 \n\t" \ + "movaps %%xmm2, %%xmm3 \n\t" \ + "subps %3, %%xmm6 \n\t" \ + "movaps %%xmm1, %%xmm7 \n\t" \ + "cmpps $0x6, %%xmm6, %%xmm2 \n\t" \ + "andps %%xmm2, %%xmm7 \n\t" \ + "subps %%xmm3, %%xmm6 \n\t" \ + "andps %%xmm0, %%xmm2 \n\t" \ + "addps %%xmm6, %%xmm7 \n\t" \ + "movaps %%xmm7, %1" \ + : \ + "=m" ((*pi).c1), \ + "=m" ((*pi).c2) \ + : \ + "m" ((*pi).c1), \ + "m" ((*pi).c2), \ + "m" ((*pj).c1), \ + "m" ((*pj).c2) \ + : \ + "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7") + + +static void update(void) +{ + int k,kmax; + dble_vec_t *pmin,*pmax,*pi,*pj; + + kmax=pr; + pmin=&x.vec[0]; + pmax=pmin+12; + pi=&x.vec[ir]; + pj=&x.vec[jr]; + + __asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" + "movaps %1, %%xmm1 \n\t" + "movaps %2, %%xmm2" + : + : + "m" (one_bit), + "m" (one), + "m" (carry) + : + "xmm0", "xmm1", "xmm2"); + + for (k=0;k=12) + ir-=12; + if (jr>=12) + jr-=12; + is=8*ir; + is_old=is; +} + + +static void define_constants(void) +{ + int k; + float b; + + one.c1=1.0f; + one.c2=1.0f; + one.c3=1.0f; + one.c4=1.0f; + + b=(float)(ldexp(1.0,-24)); + one_bit.c1=b; + one_bit.c2=b; + one_bit.c3=b; + one_bit.c4=b; + + for (k=0;k<96;k++) + next[k]=(k+1)%96; +} + + +void rlxs_init(int level,int seed) +{ + int i,k,l; + int ibit,jbit,xbit[31]; + int ix,iy; + + define_constants(); + + error_loc((level<0)||(level>2),1,"rlxs_init [ranlxs.c]", + "Bad choice of luxury level (should be 0,1 or 2)"); + + if (level==0) + pr=109; 
+ else if (level==1) + pr=202; + else if (level==2) + pr=397; + + i=seed; + + for (k=0;k<31;k++) + { + xbit[k]=i%2; + i/=2; + } + + error_loc((seed<=0)||(i!=0),1,"rlxs_init [ranlxs.c]", + "Bad choice of seed (should be between 1 and 2^31-1)"); + + ibit=0; + jbit=18; + + for (i=0;i<4;i++) + { + for (k=0;k<24;k++) + { + ix=0; + + for (l=0;l<24;l++) + { + iy=xbit[ibit]; + ix=2*ix+iy; + + xbit[ibit]=(xbit[ibit]+xbit[jbit])%2; + ibit=(ibit+1)%31; + jbit=(jbit+1)%31; + } + + if ((k%4)==i) + ix=16777215-ix; + + x.num[4*k+i]=(float)(ldexp((double)(ix),-24)); + } + } + + carry.c1=0.0f; + carry.c2=0.0f; + carry.c3=0.0f; + carry.c4=0.0f; + + ir=0; + jr=7; + is=95; + is_old=0; + prm=pr%12; + init=1; +} + + +void ranlxs(float r[],int n) +{ + int k; + + if (init==0) + rlxs_init(0,1); + + for (k=0;k=167777216),1, + "rlxs_reset [ranlxs.c]","Unexpected input data"); + + x.num[k]=(float)(ldexp((double)(state[k+1]),-24)); + } + + error_loc(((state[97]!=0)&&(state[97]!=1))|| + ((state[98]!=0)&&(state[98]!=1))|| + ((state[99]!=0)&&(state[99]!=1))|| + ((state[100]!=0)&&(state[100]!=1)),1, + "rlxs_reset [ranlxs.c]","Unexpected input data"); + + carry.c1=(float)(ldexp((double)(state[97]),-24)); + carry.c2=(float)(ldexp((double)(state[98]),-24)); + carry.c3=(float)(ldexp((double)(state[99]),-24)); + carry.c4=(float)(ldexp((double)(state[100]),-24)); + + pr=state[101]; + ir=state[102]; + jr=state[103]; + is=state[104]; + is_old=8*ir; + prm=pr%12; + init=1; + + error_loc(((pr!=109)&&(pr!=202)&&(pr!=397))|| + (ir<0)||(ir>11)||(jr<0)||(jr>11)||(jr!=((ir+7)%12))|| + (is<0)||(is>95),1, + "rlxs_reset [ranlxs.c]","Unexpected input data"); +} + +#else + +#define BASE 0x1000000 +#define MASK 0xffffff + +typedef struct +{ + int c1,c2,c3,c4; +} vec_t; + +typedef struct +{ + vec_t c1,c2; +} dble_vec_t; + +static int init=0,pr,prm,ir,jr,is,is_old,next[96]; +static float one_bit; +static vec_t carry; + +static union +{ + dble_vec_t vec[12]; + int num[96]; +} x; + +#define STEP(pi,pj) \ + 
d=(*pj).c1.c1-(*pi).c1.c1-carry.c1; \ + (*pi).c2.c1+=(d<0); \ + d+=BASE; \ + (*pi).c1.c1=d&MASK; \ + d=(*pj).c1.c2-(*pi).c1.c2-carry.c2; \ + (*pi).c2.c2+=(d<0); \ + d+=BASE; \ + (*pi).c1.c2=d&MASK; \ + d=(*pj).c1.c3-(*pi).c1.c3-carry.c3; \ + (*pi).c2.c3+=(d<0); \ + d+=BASE; \ + (*pi).c1.c3=d&MASK; \ + d=(*pj).c1.c4-(*pi).c1.c4-carry.c4; \ + (*pi).c2.c4+=(d<0); \ + d+=BASE; \ + (*pi).c1.c4=d&MASK; \ + d=(*pj).c2.c1-(*pi).c2.c1; \ + carry.c1=(d<0); \ + d+=BASE; \ + (*pi).c2.c1=d&MASK; \ + d=(*pj).c2.c2-(*pi).c2.c2; \ + carry.c2=(d<0); \ + d+=BASE; \ + (*pi).c2.c2=d&MASK; \ + d=(*pj).c2.c3-(*pi).c2.c3; \ + carry.c3=(d<0); \ + d+=BASE; \ + (*pi).c2.c3=d&MASK; \ + d=(*pj).c2.c4-(*pi).c2.c4; \ + carry.c4=(d<0); \ + d+=BASE; \ + (*pi).c2.c4=d&MASK + + +static void update(void) +{ + int k,kmax,d; + dble_vec_t *pmin,*pmax,*pi,*pj; + + kmax=pr; + pmin=&x.vec[0]; + pmax=pmin+12; + pi=&x.vec[ir]; + pj=&x.vec[jr]; + + for (k=0;k=12) + ir-=12; + if (jr>=12) + jr-=12; + is=8*ir; + is_old=is; +} + + +static void define_constants(void) +{ + int k; + + one_bit=(float)(ldexp(1.0,-24)); + + for (k=0;k<96;k++) + next[k]=(k+1)%96; +} + + +void rlxs_init(int level,int seed) +{ + int i,k,l; + int ibit,jbit,xbit[31]; + int ix,iy; + + error_loc((INT_MAX<2147483647)||(FLT_RADIX!=2)||(FLT_MANT_DIG<24),1, + "rlxs_init [ranlxs.c]", + "Arithmetic on this machine is not suitable for ranlxs"); + + define_constants(); + + error_loc((level<0)||(level>2),1,"rlxs_init [ranlxs.c]", + "Bad choice of luxury level (should be 0,1 or 2)"); + + if (level==0) + pr=109; + else if (level==1) + pr=202; + else if (level==2) + pr=397; + + i=seed; + + for (k=0;k<31;k++) + { + xbit[k]=i%2; + i/=2; + } + + error_loc((seed<=0)||(i!=0),1,"rlxs_init [ranlxs.c]", + "Bad choice of seed (should be between 1 and 2^31-1)"); + + ibit=0; + jbit=18; + + for (i=0;i<4;i++) + { + for (k=0;k<24;k++) + { + ix=0; + + for (l=0;l<24;l++) + { + iy=xbit[ibit]; + ix=2*ix+iy; + + xbit[ibit]=(xbit[ibit]+xbit[jbit])%2; + ibit=(ibit+1)%31; + 
jbit=(jbit+1)%31; + } + + if ((k%4)==i) + ix=16777215-ix; + + x.num[4*k+i]=ix; + } + } + + carry.c1=0; + carry.c2=0; + carry.c3=0; + carry.c4=0; + + ir=0; + jr=7; + is=95; + is_old=0; + prm=pr%12; + init=1; +} + + +void ranlxs(float r[],int n) +{ + int k; + + if (init==0) + rlxs_init(0,1); + + for (k=0;k=167777216),1, + "rlxs_reset [ranlxs.c]","Unexpected input data"); + + x.num[k]=state[k+1]; + } + + error_loc(((state[97]!=0)&&(state[97]!=1))|| + ((state[98]!=0)&&(state[98]!=1))|| + ((state[99]!=0)&&(state[99]!=1))|| + ((state[100]!=0)&&(state[100]!=1)),1, + "rlxs_reset [ranlxs.c]","Unexpected input data"); + + carry.c1=state[97]; + carry.c2=state[98]; + carry.c3=state[99]; + carry.c4=state[100]; + + pr=state[101]; + ir=state[102]; + jr=state[103]; + is=state[104]; + is_old=8*ir; + prm=pr%12; + init=1; + + error_loc(((pr!=109)&&(pr!=202)&&(pr!=397))|| + (ir<0)||(ir>11)||(jr<0)||(jr>11)||(jr!=((ir+7)%12))|| + (is<0)||(is>95),1, + "rlxs_reset [ranlxs.c]","Unexpected input data"); +} + +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/ratfcts/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/ratfcts/README new file mode 100644 index 0000000000000000000000000000000000000000..fcdfa4dddc12583226bbfd726ff5dd19726d2b8d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/ratfcts/README @@ -0,0 +1,46 @@ + +******************************************************************************** + + Rational functions + +******************************************************************************** + + +Files +----- + +elliptic.c Computation of the Jacobi elliptic functions sn, cn + and dn + +ratfcts.c Rational function coefficients data base + +zolotarev.c Computation of the Zolotarev rational approximation + to 1/sqrt(y) + +Include file +------------ + +The file ratfcts.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed 
above. + + +List of functions +----------------- + +double ellipticK(double rk) + Returns the complete elliptic integral K(k) for 0<=k<1. The value + of k is to be passed through the argument rk=k/k' (see the notes). + +void sncndn(double u,double rk,double *sn,double *cn,double *dn) + Computes the Jacobi elliptic functions sn(u,k), cn(u,k), dn(u,k) + for specified real u and 0<=k<1. The value of k is to be passed + through the argument rk=k/k' (see the notes). + +ratfct_t ratfct(int *irat) + Returns a structure containing the coefficients of the rational + function specified by the integers irat[3] (see the notes). + +void zolotarev(int n,double eps,double *A,double *ar,double *delta) + Computes the amplitude A, the coefficients ar[r-1]=a_r, r=1,..,2n, + and the error delta of the Zolotarev optimal rational approximation + of degree [n,n] to the function f(y)=1/sqrt(y). diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/ratfcts/elliptic.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/ratfcts/elliptic.c new file mode 100644 index 0000000000000000000000000000000000000000..2d35557658cffccd089032723276e1f43fb17f02 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/ratfcts/elliptic.c @@ -0,0 +1,264 @@ + +/******************************************************************************* +* +* File elliptic.c +* +* Copyright (C) 2008, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Computation of the Jacobi elliptic functions sn, cn and dn +* +* The externally accessible functions are +* +* double ellipticK(double rk) +* Returns the complete elliptic integral K(k) for 0<=k<1. The value +* of k is to be passed through the argument rk=k/k' (see the notes). 
+* +* void sncndn(double u,double rk,double *sn,double *cn,double *dn) +* Computes the Jacobi elliptic functions sn(u,k), cn(u,k), dn(u,k) +* for specified real u and 0<=k<1. The value of k is to be passed +* through the argument rk=k/k' (see the notes). +* +* Notes: +* +* The complete elliptic integral and the Jacobi elliptic functions in the +* range -K/2<=u<=K/2 are obtained practically to machine precision. In +* particular, sn(u,k)=u+O(u^3) and cn(u,k)=1-u^2/2+O(u^4) exactly. +* +* Other values of u are first mapped to the interval 0<=u<=K/2 using the +* symmetry properties of the elliptic functions and the numerically computed +* value of K. In general this implies a loss of significance of the argument +* which propagates to the computed functions. +* +* The complete elliptic integral is obtained via the arithmetic-geometric +* mean. For small u, the Jacobi elliptic functions are calculated using +* the Taylor expansion. Elsewhere the descending Landen transformation is +* used. See +* +* M. Abramowitz, I. A. Stegun: "Handbook of mathematical functions", +* (Dover Publications, New York, 1972) +* +* for example. +* +* These methods eventually require both k and k'=sqrt(1-k*k) as input. While +* k' can be computed for given k, there can be important significance losses +* at this point if k is close to 1. On the other hand, if rk=k/k' is given, +* k and k' can be computed with negligible significance losses through +* +* k=rk/sqrt(1+rk^2), k'=1/sqrt(1+rk^2). +* +* This is why rk is chosen as input parameter in the programs in this file. 
+* +*******************************************************************************/ + +#define ELLIPTIC_C + +#include +#include +#include +#include +#include "utils.h" +#include "ratfcts.h" + + +static double agm(double x,double y) +{ + double px,py; + + for (;;) + { + px=x; + py=y; + + x=0.5*(px+py); + y=sqrt(px*py); + + if ((x<=y)||(x>=px)||(y<=py)) + return x; + } +} + + +double ellipticK(double rk) +{ + double x,y; + + if (rk<0.0) + { + error_loc(1,1,"ellipticK [elliptic.c]","Argument rk is out of range"); + + return 1.0; + } + + x=1.0+rk/sqrt(1.0+rk*rk); + y=1.0/(x*(1.0+rk*rk)); + + return (2.0*atan(1.0))/agm(x,y); +} + + +static double sn_small(double u,double rk) +{ + double m,u2,sn; + double s0,s2,s4,s6; + + m=(rk*rk)/(1.0+rk*rk); + + s0=1.0; + s2=-(1.0+m)/6.0; + s4=(1.0+14.0*m+m*m)/120.0; + s6=-(1.0+135.0*m*(1.0+m)+m*m*m)/5040.0; + + u2=u*u; + sn=s4+s6*u2; + sn=s2+sn*u2; + sn=s0+sn*u2; + + return sn*u; +} + + +static void sncn_limit(double u,double rk,double *sn,double *cn) +{ + double k,m,s,c,r; + + k=rk/sqrt(1.0+rk*rk); + m=k*k; + + s=sin(u); + c=cos(u); + r=0.25*m*(u-s*c); + + (*sn)=s-r*c; + (*cn)=c+r*s; +} + + +static void landen(double u,double rk,double *sn,double *cn) +{ + int n; + double k,kp,kt,ktp; + double delta,fact; + + delta=sqrt(DBL_EPSILON); + kp=1.0/sqrt(1.0+rk*rk); + k=rk*kp; + + for (n=0;k>delta;n++) + { + kt=(k*k)/((1.0+kp)*(1.0+kp)); + ktp=(2.0*sqrt(kp))/(1.0+kp); + u*=(0.5+0.5*kp); + + k=kt; + kp=ktp; + } + + sncn_limit(u,k/kp,sn,cn); + + kt=k; + ktp=kp; + + for (;n>0;n--) + { + k=(2.0*sqrt(kt))/(1.0+kt); + kp=(ktp*ktp)/((1.0+kt)*(1.0+kt)); + + fact=1.0/(1.0+kt*(*sn)*(*sn)); + (*sn)=(1.0+kt)*(*sn)*fact; + (*cn)=(*cn)*sqrt(ktp*ktp+kt*kt*(*cn)*(*cn))*fact; + + kt=k; + ktp=kp; + } +} + + +void sncndn(double u,double rk,double *sn,double *cn,double *dn) +{ + int n,flip; + double k,kp,K,delta,cd,sd,nd; + double sgn_sn,sgn_cn; + + if (rk<0.0) + { + error_loc(1,1,"sncndn [elliptic.c]","Argument rk is out of range"); + + (*sn)=0.0; + 
(*cn)=1.0; + (*dn)=0.0; + + return; + } + + sgn_sn=1.0; + sgn_cn=1.0; + + if (u<0.0) + { + u=-u; + sgn_sn*=-1.0; + } + + K=ellipticK(rk); + n=(int)(u/K); + u-=(double)(n)*K; + n=n%4; + + if (n==1) + { + u=K-u; + sgn_cn*=-1.0; + } + else if (n==2) + { + sgn_sn*=-1.0; + sgn_cn*=-1.0; + } + else if (n==3) + { + u=K-u; + sgn_sn*=-1.0; + } + + if ((2.0*u)<=K) + flip=0; + else + { + u=K-u; + flip=1; + } + + kp=1.0/sqrt(1.0+rk*rk); + k=rk*kp; + + delta=pow(DBL_EPSILON,0.125); + if (delta>1.0e-3) + delta=1.0e-3; + + if (fabs(u)0. The functions provided by this module +* instead approximate the function 1/|x| in a range ra<=|x|<=rb specified +* in the parameter data base. The relation between x and y is +* +* y=x^2/rb^2 +* +* and thus eps=(ra/rb)^2. +* +* The coefficients a[r], r=0,..,2*n-1, returned by the program zolotarev() +* are ordered such that +* +* a[0]>a[1]>..>a[2*n-1]>0. +* +* For any given integers k,l satisfying k>=0 and k<=l +#include +#include +#include "mpi.h" +#include "flags.h" +#include "utils.h" +#include "ratfcts.h" + +#define IRMAX 32 + +static int init=0,ns,irs,irats[IRMAX][3]; +static double *ars; +static ratfct_t rats[IRMAX]={{0,0.0,1.0,NULL,NULL,NULL,NULL}}; + + +static void init_rat(void) +{ + int ir; + + for (ir=0;ir0) + rats[ir]=rats[0]; + + irats[ir][0]=0; + irats[ir][1]=0; + irats[ir][2]=0; + } + + ns=0; + irs=0; + ars=NULL; + init=1; +} + + +static int fnd_rat(int *irat) +{ + int ir; + + for (ir=0;ir=n),1,"alloc_rat [ratfcts.c]", + "Improper coefficient range or undefined rational function"); + + if (n>ns) + { + if (ns>0) + free(ars); + ars=malloc(2*n*sizeof(*ars)); + ns=n; + } + + mu=malloc(4*np*sizeof(*mu)); + + error((ars==NULL)||(mu==NULL),1,"alloc_rat [ratfcts.c]", + "Unable to allocate coefficient arrays"); + + rats[irs].np=np; + rats[irs].mu=mu; + rats[irs].rmu=mu+np; + rats[irs].nu=mu+2*np; + rats[irs].rnu=mu+3*np; + + irats[irs][0]=irat[0]; + irats[irs][1]=irat[1]; + irats[irs][2]=irat[2]; +} + + +static void set_rat(int *irat) +{ + 
int n,np,k,l,i,j; + double ra,rb,pmu,pnu; + double eps,A,delta,*ar; + double *mu,*nu,*rmu,*rnu; + rat_parms_t rp; + + rp=rat_parms(irat[0]); + n=rp.degree; + k=irat[1]; + l=irat[2]; + np=l-k+1; + + ra=rp.range[0]; + rb=rp.range[1]; + eps=ra/rb; + eps=eps*eps; + + zolotarev(n,eps,&A,ars,&delta); + rats[irs].A=A/rb; + rats[irs].delta=delta; + + ar=ars+2*k; + mu=rats[irs].mu; + nu=rats[irs].nu; + rmu=rats[irs].rmu; + rnu=rats[irs].rnu; + + for (i=0;i +#include +#include +#include "utils.h" +#include "ratfcts.h" + + +void zolotarev(int n,double eps,double *A,double *ar,double *delta) +{ + int r; + double v,k,rk,d,s; + double sn,cn,dn,snx,cnx,dnx; + + if ((n<1)||(eps<=0.0)||(eps>=1.0)) + { + error_loc(1,1,"zolotarev [zolotarev.c]","Arguments are out of range"); + + (*A)=1.0; + (*delta)=1.0; + + return; + } + + k=sqrt(1.0-eps); + rk=k/sqrt(eps); + v=ellipticK(rk)/(double)(2*n+1); + + (*A)=1.0; + d=k; + + for (r=1;r<=(2*n);r++) + { + if (r<=n) + { + sncndn((double)(r)*v,rk,&sn,&cn,&dn); + ar[r-1]=(cn*cn)/(sn*sn); + } + else + { + sncndn((double)(2*n+1-r)*v,rk,&snx,&cnx,&dnx); + ar[r-1]=eps*((snx*snx)/(cnx*cnx)); + sn=cnx/dnx; + } + + s=sn*sn; + + if ((r%2)==0) + (*A)/=s; + else + { + (*A)*=s; + s*=k; + d*=(s*s); + } + } + + s=1.0+sqrt(1.0-d*d); + (*A)*=(2.0/s); + (*delta)=(d*d)/(s*s); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/README new file mode 100644 index 0000000000000000000000000000000000000000..13148190d9a601de780c1e9abb81132aa7a3e29e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/README @@ -0,0 +1,73 @@ + +******************************************************************************** + + Schwarz Alternating Procedure (SAP) + +******************************************************************************** + + +Files +----- + +blk_solv.c Solution of the Dirac equation on the blocks of the + 
SAP_BLOCKS grid. + +sap.c Multiplicative alternating Schwarz procedure for the + solution of the Wilson-Dirac equation. + +sap_com.c SAP communication program. + +sap_gcr.c SAP+GCR solver for the Wilson-Dirac equation. + + +Include file +------------ + +The file sap.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + + +List of functions +----------------- + +void blk_mres(int n,float mu,int nmr) + Depending on whether the twisted-mass flag is set or not, this + program approximately solves (Dw+i*mu*gamma_5*1e)*b.s[0]=b.s[1] or + (Dw+i*mu*gamma_5)*b.s[0]=b.s[1] on the n'th block b of the SAP_BLOCKS + grid. The solution is obtained by applying nmr minimal residual steps, + using b.s[2] as workspace. On exit, the approximate solution and its + residue are in b.s[0] and b.s[1], respectively. + +void blk_eo_mres(int n,float mu,int nmr) + Approximate solution of (Dwhat+i*mu*gamma_5)*b.s[0]=b.s[1] for given + b.s[1] on the n'th block b of the SAP_BLOCKS grid. The solution is + obtained by applying nmr minimal residual steps, using b.s[2] as + workspace. On exit, the approximate solution and its residue are in + b.s[0] and b.s[1], respectively, while b.s[0],b.s[1] and b.s[2] are + unchanged on the odd points. + +void sap(float mu,int isolv,int nmr,spinor *psi,spinor *rho) + Application of one cycle of the multiplicative Schwarz procedure to + the approximate solution psi of the Wilson-Dirac equation, assuming + the associated residue is stored in the field rho (see the notes). The + block Dirac equation is solved using nmr iterations of the ordinary + (isolv=0) or the even-odd preconditioned (isolv=1) minimal residual + algorithm. On exit, the new approximate solution and its residue are + returned in the fields psi and rho. + +void alloc_sap_bufs(void) + Allocates and initializes the buffers and index arrays needed for + the program sap_com(). 
+ + void sap_com(int ic,spinor *r) + Subtracts the Weyl field b.bb.w[0] on the boundaries of all black + (if ic=0) or all white (if ic=1) blocks b of the SAP_BLOCKS grid + from the global spinor field r. Before subtraction, the Weyl fields + on the block faces in direction ifc are expanded to Dirac spinor + fields s satisfying theta[ifc]*s=0. + +double sap_gcr(int nkv,int nmx,double res,double mu, + spinor_dble *eta,spinor_dble *psi,int *status) + Obtains an approximate solution psi of the Wilson-Dirac equation for + given source eta using the SAP-preconditioned GCR algorithm. See the + notes for the explanation of the parameters of the program. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/README.sap_com b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/README.sap_com new file mode 100644 index 0000000000000000000000000000000000000000..a5d967cb7a1a1414de18fd0cd76a5192169162ef --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/README.sap_com @@ -0,0 +1,153 @@ + +******************************************************************************** + + SAP communication program + +******************************************************************************** + +The application of the Schwarz Alternating Procedure (SAP) as a preconditioner +for the Wilson-Dirac operator in lattice QCD is described in + + M. Luescher: "Solution of the Dirac equation in lattice QCD using a domain + decomposition method", Comp. Phys. Commun. 156 (2004) 209. + +The SAP approximately solves the Dirac equation by running through the blocks +in the SAP_BLOCKS block grid. On each block, the Dirac equation is solved, to +some accuracy, on the interior points of the block using an iterative method +such as the minimal residual algorithm. The current solution on the full +lattice is then updated and the algorithm proceeds to the next block. 
+ +In practice the solution is updated on all black blocks simultaneously and +subsequently on all white blocks. Communications are then required after all +blocks of given colour are processed. It is important that the communication +is done efficiently. The programs in the module sap_com.c achieve this goal +using an adapted layout of the field arrays and non-blocking communications. + + +Block boundary fields +--------------------- + +Once the Dirac equation is solved on all black (ic=0) [or all white (ic=1)] +blocks, the approximate solution and its residue on the full lattice must be +updated. In particular, the residue receives a correction at the exterior +boundaries of the blocks. The correction amounts to subtracting a Weyl field +on the block boundaries from the residue (see below). However, before the +correction can be applied, the Weyl field on the block faces that are not +contained in the local lattice must be copied to the neighbouring MPI +processes. + +The block faces in the -0,+0,..,-3,+3 direction are labeled by an index +ifc=0,..,7. It is advantageous to organize the field communications in such a +way that the fields on the block faces with fixed index ifc are processed +together. In memory the Weyl fields are therefore arranged in two arrays + + weyl snd_buf[2][8][], + + weyl loc_buf[2][8][], + +where the first index is the colour ic of the blocks, the second the face +index ifc and the third a point index. For a given colour ic and a given face +index ifc, the Weyl fields collected in the array snd_buf[ic][ifc] are those +on the faces b.bb[ifc] of the blocks b with colour ic where b.bb[ifc].ibn=1. +Similarly the fields in the array loc_buf[ic][ifc] are those on the faces +where b.bb[ifc].ibn=0. Within each of the arrays, the fields are ordered in +block order and the Weyl spinors in each block segment are ordered according +to the block geometry arrays (see block/README.block). 
+ + +Communication +------------- + +After solving the Dirac equation on all blocks of a given colour ic, the +buffers snd_buf[ic][ifc], ifc=0,..,7, need to be sent to the nodes with rank +npr[ifc]. The data sent are received from the nodes with rank npr[ifc^0x1] +and are stored in the [ic][ifc] components of the array + + weyl rcv_buf[2][8][]. + +Note that snd_buf[ic][ifc] has the same size on all MPI processes (and, +consequently, the same size as rcv_buf[ic][ifc]). Translation invariance +implies this to be so if the number of blocks on the local lattice in +direction ifc is even. In the other case, there must be an even number of +blocks touching the face of the local lattice with index ifc, because the +total number of blocks in the local lattice is even. Half of these blocks have +colour ic and the size of snd_buf[ic][ifc] is then again independent of the +rank of the process. + +As explained in main/README.global, the MPI processes form a hypercubic grid. +In this grid, each process has Cartesian coordinates cpr[mu] (mu=0,1,2,3). One +can then define the parity bits + + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1, + + nmu[ifc]=cpr[ifc/2]&0x1, + +and first perform the communication from the np=0 to the np=1 nodes according +to + + np=0 nodes np=1 nodes + + io=ifc^nmu[ifc] io=(ifc^nmu[ifc])^0x1 + snd_buf[ic][io] -> npr[io] rcv_buf[ic][io] <- npr[io^0x1] + +The communication from the np=1 to the np=0 nodes is then performed according +to + + np=0 nodes np=1 nodes + + io=(ifc^nmu[ifc])^0x1 io=ifc^nmu[ifc] + rcv_buf[ic][io] <- npr[io^0x1] snd_buf[ic][io] -> npr[io] + +The send and receive buffers are properly paired in both cases. Moreover, +in each case, the size of the buffers communicated is the same on all nodes. +All nodes thus have exactly the same communication load. 
+ +This pattern is such that the communication proceeds, in each case, across the +hyperplanes orthogonal to the direction ifc, the planes being separated by +2x(local lattice size in that direction). If boundary conditions of type 0,1 +or 2 are chosen, no communications across the boundaries of the lattice at +time 0 and NPROC0*L0-1 are performed. The chosen scheme allows these to be +easily omitted. + + +Subtraction from the residue +---------------------------- + +After communicating the Weyl fields, the fields to be subtracted from the +residue are contained in the arrays loc_buf[ic][ifc] and rcv_buf[ic][ifc]. For +any given colour index ic and face index ifc, these two arrays come one after +the other in memory so that one has in fact a single array of Weyl spinors +with address loc_buf[ic][ifc]. + +The Weyl spinors w on the block boundaries are the first two components +of the Dirac spinors s obtained by applying the block boundary part of +the Dirac operator to a field on the block (see dirac/Dw_bnd.c). Since + + theta[ifc]*s=0 + +where + + theta[ifc] = (1/2)*(1+gamma_mu) if ifc=2*mu, + + = (1/2)*(1-gamma_mu) if ifc=2*mu+1, + +the knowledge of w allows s to be reconstructed uniquely. The reconstruction +of the Dirac spinors is done on the fly by the program + + sub_assign_w2s[ifc^0x1](imb[ic][ifc],nlbf[ic][ifc]+nsbf[ic][ifc], + loc_buf[ic][ifc],res) + +which then subtracts the spinors from the residue field res on the full lattice +(see sflds/Pbnd.c). The other parameters in this function call are: + + nlbf[ic][ifc] Number of elements of the buffer loc_buf[ic][ifc], + + nsbf[ic][ifc] Number of elements of the buffer snd_buf[ic][ifc] + (= number of elements of rcv_buf[ic][ifc]), + + imb[ic][ifc][] Array of the indices of the points in the local lattice + where the reconstructed spinors are to be subtracted + from the residue. 
+ +The index array imb[ic][ifc] is calculated and stored in the static memory of +the module sap_com.c when the communication buffers are allocated. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/blk_solv.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/blk_solv.c new file mode 100644 index 0000000000000000000000000000000000000000..3c59fbdfc1f949993de5950c81357c4b4823a2c8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sap/blk_solv.c @@ -0,0 +1,1026 @@ + +/******************************************************************************* +* +* File blk_solv.c +* +* Copyright (C) 2005, 2011, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Solution of the Dirac equation on the blocks of the SAP_BLOCKS grid +* +* The externally accessible functions are +* +* void blk_mres(int n,float mu,int nmr) +* Depending on whether the twisted-mass flag is set or not, this +* program approximately solves (Dw+i*mu*gamma_5*1e)*b.s[0]=b.s[1] or +* (Dw+i*mu*gamma_5)*b.s[0]=b.s[1] on the n'th block b of the SAP_BLOCKS +* grid. The solution is obtained by applying nmr minimal residual steps, +* using b.s[2] as workspace. On exit, the approximate solution and its +* residue are in b.s[0] and b.s[1], respectively. +* +* void blk_eo_mres(int n,float mu,int nmr) +* Approximate solution of (Dwhat+i*mu*gamma_5)*b.s[0]=b.s[1] for given +* b.s[1] on the n'th block b of the SAP_BLOCKS grid. The solution is +* obtained by applying nmr minimal residual steps, using b.s[2] as +* workspace. On exit, the approximate solution and its residue are in +* b.s[0] and b.s[1], respectively, while b.s[0],b.s[1] and b.s[2] are +* unchanged on the odd points. +* +* Notes: +* +* The twisted-mass flag is retrieved from the parameter data base (see +* flags/lat_parms.c). 
These programs do not perform any communications and +* can be called locally. It is taken for granted that the SAP_BLOCKS grid +* is allocated and that the gauge field and the SW term on the blocks are +* in the proper condition. +* +*******************************************************************************/ + +#define BLK_SOLV_C + +#include +#include +#include +#include +#include "su3.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "block.h" +#include "dirac.h" +#include "sap.h" + +static int vol; +static spinor **s; + +#if (defined x64) +#include "sse2.h" + +#if (defined AVX) +#include "avx.h" + +static float unity=1.0f; + + +static void scalar_prods(float *r,complex *z) +{ + spinor *s1,*s2,*sm; + + __asm__ __volatile__ ("vxorpd %%ymm12, %%ymm12, %%ymm12 \n\t" + "vxorpd %%ymm13, %%ymm13, %%ymm13 \n\t" + "vxorpd %%ymm14, %%ymm14, %%ymm14" + : + : + : + "xmm12", "xmm13", "xmm14"); + + s1=s[1]; + s2=s[2]; + sm=s1+vol; + + for (;s1 +#include +#include +#include +#include "su3.h" +#include "flags.h" +#include "block.h" +#include "dirac.h" +#include "sap.h" +#include "global.h" + +static int vol; +static spinor **s; + +#if (defined AVX) +#include "avx.h" + +static void update_flds0(int *imb,spinor *psi,spinor *rho) +{ + spinor *sb,*rb,*sm; + spinor *sl,*rl,*sln; + + sb=s[0]; + rb=s[1]; + sm=sb+vol; + sln=psi+imb[0]; + + for (;sb +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "sflds.h" +#include "block.h" +#include "sap.h" +#include "global.h" + +static int nb,nbh,isw,init=0; +static int bc,np,nmu[8],sflg[8]; +static int nsbf[2][8],nlbf[2][8],*imb[2][8]; +static weyl *snd_buf[2][8],*loc_buf[2][8],*rcv_buf[2][8]; +static const weyl w0={{{0.0f}}}; +static block_t *b0; +static MPI_Request snd_req[2][8],rcv_req[2][8]; + + +static void set_nbf(void) +{ + int ifc,ibu,ibd; + int *bo,*bs; + block_t *b,*bm; + bndry_t *bb; + + bc=bc_type(); + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; 
+ + bs=(*b0).bs; + ibu=((cpr[0]==(NPROC0-1))&&(bc!=3)); + ibd=((cpr[0]==0)&&(bc!=3)); + + for (ifc=0;ifc<8;ifc++) + { + nmu[ifc]=cpr[ifc/2]&0x1; + sflg[ifc]=((ifc>1)|| + ((ifc==0)&&(cpr[0]!=0))|| + ((ifc==1)&&(cpr[0]!=(NPROC0-1)))|| + (bc==3)); + + nlbf[0][ifc]=0; + nsbf[0][ifc]=0; + nlbf[1][ifc]=0; + nsbf[1][ifc]=0; + + b=b0; + bm=b+nbh; + + for (;b=-1. Otherwise the field psi is set to zero and the +* program returns the norm of the source eta. +* +* The SAP_BLOCKS blocks grid is automatically allocated and the SW term is +* recalculated when needed. The gauge and SW fields are then copied to the +* block grid if they are not in the proper condition. +* +* Evidently the SAP+GCR solver is a global program that must be called on +* all processes simultaneously. The required workspaces are +* +* spinor 2*nkv+1 +* spinor_dble 2 +* +* (see utils/wspace.c). +* +*******************************************************************************/ + +#define SAP_GCR_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "block.h" +#include "sw_term.h" +#include "dirac.h" +#include "linsolv.h" +#include "sap.h" +#include "global.h" + +static double mud; +static sap_parms_t spr; + + +static void Dop(spinor_dble *s,spinor_dble *r) +{ + Dw_dble(mud,s,r); +} + + +static void Mop(int k,spinor *rho,spinor *phi,spinor *chi) +{ + int n; + + set_s2zero(VOLUME,phi); + assign_s2s(VOLUME,rho,chi); + + for (n=0;n +#include +#include +#include +#include "su3.h" +#include "sflds.h" + +#if (defined x64) +#include "sse2.h" + +#define _load_cst(c) \ +__asm__ __volatile__ ("movss %0, %%xmm15 \n\t" \ + "shufps $0x0, %%xmm15, %%xmm15" \ + : \ + : \ + "m" (c) \ + : \ + "xmm15") + +#define _mul_cst() \ +__asm__ __volatile__ ("mulps %%xmm15, %%xmm0 \n\t" \ + "mulps %%xmm15, %%xmm1 \n\t" \ + "mulps %%xmm15, %%xmm2" \ + : \ + : \ + : \ + "xmm0", "xmm1", "xmm2") + +static const float poh=0.5f; + + 
+static void assign_s2w0(int *imb,int vol,spinor *s,weyl *r) +{ + weyl *rm; + spinor *si,*sin; + + _load_cst(poh); + rm=r+vol; + si=s+(*imb); + imb+=(r<(rm-1)); + sin=s+(*imb); + + for (;r +#include +#include +#include +#include "su3.h" +#include "sflds.h" + +#if (defined x64) +#include "sse2.h" + +static const sse_double poh={0.5,0.5}; + + +static void assign_sd2wd0(int *imb,int vol,spinor_dble *sd,weyl_dble *rd) +{ + weyl_dble *rm; + spinor_dble *si,*sin; + + rm=rd+vol; + si=sd+(*imb); + imb+=(rd<(rm-1)); + sin=sd+(*imb); + + for (;rd +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "global.h" + +static int bc,np,nmu[8],nbf[8],ofs[8]; +static int ns,sfc[8],rfc[8],sflg[8]; +static int itags=0,tags[8]; +static weyl *wb=NULL,*snd_buf[8],*rcv_buf[8]; +static const weyl w0={{{0.0f}}}; +static MPI_Request snd_req[8],rcv_req[8]; + + +static void get_tags(void) +{ + int i; + + if (itags==0) + { + for (i=0;i<8;i++) + tags[i]=mpi_permanent_tag(); + + itags=1; + } +} + + +static void alloc_sbufs(void) +{ + int n,ifc,tag,saddr,raddr; + weyl *w,*wm; + + error(iup[0][0]==0,1,"alloc_sbufs [scom.c]", + "Geometry arrays are not set"); + + wb=amalloc(BNDRY*sizeof(*wb),ALIGN); + error(wb==NULL,1,"alloc_sbufs [scom.c]", + "Unable to allocate communication buffers"); + + w=wb; + wm=wb+BNDRY; + + for (;w0) + ofs[ifc]=ofs[ifc-1]+nbf[ifc-1]; + + if (nbf[ifc]>0) + { + sfc[ns]=ifc; + ns+=1; + + snd_buf[ifc]=w; + w+=nbf[ifc]; + rcv_buf[ifc]=w; + w+=nbf[ifc]; + + tag=tags[ifc]; + saddr=npr[ifc]; + raddr=npr[ifc^0x1]; + + MPI_Send_init(snd_buf[ifc],12*nbf[ifc],MPI_FLOAT,saddr, + tag,MPI_COMM_WORLD,&snd_req[ifc]); + MPI_Recv_init(rcv_buf[ifc],12*nbf[ifc],MPI_FLOAT,raddr, + tag,MPI_COMM_WORLD,&rcv_req[ifc]); + } + + sflg[ifc]=((ifc>1)|| + ((ifc==0)&&(cpr[0]!=0))|| + ((ifc==1)&&(cpr[0]!=(NPROC0-1)))|| + (bc==3)); + } + + for (n=0;n0) + send_bufs(sfc[m],eo); + + ifc=sfc[n]; + io=ifc^nmu[ifc]; + + 
if (sflg[io]) + assign_s2w[io^is](map+ofs[io^0x1],nbf[io],s,snd_buf[io]); + else + bnd_s2zero(EVEN_PTS,s); + + if (n>0) + { + wait_bufs(sfc[m],eo); + m+=eo; + eo^=0x1; + } + } + + for (n=0;n<2;n++) + { + send_bufs(sfc[m],eo); + wait_bufs(sfc[m],eo); + m+=eo; + eo^=0x1; + } + + for (n=0;n0) + send_bufs(rfc[m],eo); + + ifc=rfc[n]; + io=ifc^nmu[ifc]; + + if (sflg[io]) + zip_weyl(nbf[io],sb+ofs[io],snd_buf[io]); + + if (n>0) + { + wait_bufs(rfc[m],eo); + m+=eo; + eo^=0x1; + } + } + + for (n=0;n<2;n++) + { + send_bufs(rfc[m],eo); + wait_bufs(rfc[m],eo); + m+=eo; + eo^=0x1; + } + + for (n=0;n +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "global.h" + +static int bc,np,nmu[8],nbf[8],ofs[8]; +static int ns,sfc[8],rfc[8],sflg[8]; +static int itags=0,tags[8]; +static weyl_dble *wb=NULL,*snd_buf[8],*rcv_buf[8]; +static const weyl_dble w0={{{0.0}}}; +static MPI_Request snd_req[8],rcv_req[8]; + + +static void get_tags(void) +{ + int i; + + if (itags==0) + { + for (i=0;i<8;i++) + tags[i]=mpi_permanent_tag(); + + itags=1; + } +} + + +static void alloc_sdbufs(void) +{ + int n,ifc,tag,saddr,raddr; + weyl_dble *w,*wm; + + error(iup[0][0]==0,1,"alloc_sdbufs [sdcom.c]", + "Geometry arrays are not initialized"); + + wb=amalloc(BNDRY*sizeof(*wb),ALIGN); + error(wb==NULL,1,"alloc_sdbufs [sdcom.c]", + "Unable to allocate communication buffers"); + + w=wb; + wm=wb+BNDRY; + + for (;w0) + ofs[ifc]=ofs[ifc-1]+nbf[ifc-1]; + + if (nbf[ifc]>0) + { + sfc[ns]=ifc; + ns+=1; + + snd_buf[ifc]=w; + w+=nbf[ifc]; + rcv_buf[ifc]=w; + w+=nbf[ifc]; + + tag=tags[ifc]; + saddr=npr[ifc]; + raddr=npr[ifc^0x1]; + + MPI_Send_init((double*)(snd_buf[ifc]),12*nbf[ifc],MPI_DOUBLE,saddr, + tag,MPI_COMM_WORLD,&snd_req[ifc]); + MPI_Recv_init((double*)(rcv_buf[ifc]),12*nbf[ifc],MPI_DOUBLE,raddr, + tag,MPI_COMM_WORLD,&rcv_req[ifc]); + } + + sflg[ifc]=((ifc>1)|| + ((ifc==0)&&(cpr[0]!=0))|| + 
((ifc==1)&&(cpr[0]!=(NPROC0-1)))|| + (bc==3)); + } + + for (n=0;n0) + send_bufs(sfc[m],eo); + + ifc=sfc[n]; + io=ifc^nmu[ifc]; + + if (sflg[io]) + assign_sd2wd[io^is](map+ofs[io^0x1],nbf[io],sd,snd_buf[io]); + else + bnd_sd2zero(EVEN_PTS,sd); + + if (n>0) + { + wait_bufs(sfc[m],eo); + m+=eo; + eo^=0x1; + } + } + + for (n=0;n<2;n++) + { + send_bufs(sfc[m],eo); + wait_bufs(sfc[m],eo); + m+=eo; + eo^=0x1; + } + + for (n=0;n0) + send_bufs(rfc[m],eo); + + ifc=rfc[n]; + io=ifc^nmu[ifc]; + + if (sflg[io]) + zip_weyl(nbf[io],sdb+ofs[io],snd_buf[io]); + + if (n>0) + { + wait_bufs(rfc[m],eo); + m+=eo; + eo^=0x1; + } + } + + for (n=0;n<2;n++) + { + send_bufs(rfc[m],eo); + wait_bufs(rfc[m],eo); + m+=eo; + eo^=0x1; + } + + for (n=0;n +#include +#include +#include "su3.h" +#include "random.h" +#include "sflds.h" + +#if (defined x64) +#include "sse2.h" + +void set_s2zero(int vol,spinor *s) +{ + spinor *sm; + + __asm__ __volatile__ ("xorps %%xmm0, %%xmm0 \n\t" + "xorps %%xmm1, %%xmm1 \n\t" + "xorps %%xmm2, %%xmm2" + : + : + : + "xmm0", "xmm1", "xmm2"); + + sm=s+vol; + + for (;s +#include +#include +#include +#include "utils.h" +#include "su3.h" +#include "su3fcts.h" + +#ifndef ALIGN +#define ALIGN 6 +#endif + +static void mapX2v(su3_alg_dble *X); +static void eval_td(su3_alg_dble *X); +static void ch_init(void); + +static int N,init_flag=0; +static double *c,t,d; +static su3_vector_dble v1,v2,v3,w ALIGNED16; +static su3_dble umat1,umat2 ALIGNED16; +static su3_alg_dble Y ALIGNED16; +static ch_drv0_t ALIGNED16 s; +static const ch_drv0_t sp0 ALIGNED16 ={0.0}; +static const ch_drv1_t sp1 ALIGNED16 ={0.0}; +static const ch_drv2_t sp2 ALIGNED16 ={0.0}; + + +static void eval_td(su3_alg_dble *X) +{ + t=3.0*((*X).c1*(*X).c1+(*X).c2*(*X).c2-(*X).c1*(*X).c2)+ + (*X).c3*(*X).c3+(*X).c4*(*X).c4+(*X).c5*(*X).c5+ + (*X).c6*(*X).c6+(*X).c7*(*X).c7+(*X).c8*(*X).c8; + + mapX2v(X); + _vector_cross_prod(w,v2,v3); + d=_vector_prod_im(v1,w); + + error_loc(fabs(d)>(1.000001*(1.000002-t)),1,"eval_td 
[chexp.c]", + "The norm of X is larger than 1"); +} + + +static void ch_init(void) +{ + int k; + double fctr; + + N=7; + fctr=1.0; + + while (fctr>DBL_EPSILON) + { + N++; + fctr/=(double)(N-7); + } + + N+=(N%2); + c=amalloc((N+1)*sizeof(*c),ALIGN); + + if (error_loc(c==NULL,1,"ch_init [chexp.c]", + "Unable to allocate auxiliary array")==0) + { + c[0]=1.0; + for (k=0;k3.0) + { + nfrb*=0.25; + eps*=0.5; + n++; + } + + Y.c1=eps*(*X).c1; + Y.c2=eps*(*X).c2; + Y.c3=eps*(*X).c3; + Y.c4=eps*(*X).c4; + Y.c5=eps*(*X).c5; + Y.c6=eps*(*X).c6; + Y.c7=eps*(*X).c7; + Y.c8=eps*(*X).c8; + + u1=&umat1; + u2=&umat2; + + chexp_drv0(&Y,&s); + ch2mat(s.p,&Y,u2); + + for (k=0;k0;n-=2) + { + __asm__ __volatile__("movapd %%xmm2, %%xmm4 \n\t" + "mulpd %%xmm6, %%xmm2 \n\t" + "shufpd $0x1, %%xmm4, %%xmm4 \n\t" + "addpd %%xmm0, %%xmm2\n\t" + "mulpd %%xmm7, %%xmm4 \n\t" + "movapd %%xmm1, %%xmm0 \n\t" + "addsd %0, %%xmm4 \n\t" + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" + "mulpd %%xmm6, %%xmm1 \n\t" + "mulpd %%xmm7, %%xmm0 \n\t" + "addpd %%xmm4, %%xmm1 \n\t" + "addsd %1, %%xmm0" + : + : + "m" (c[n]), + "m" (c[n-1]) + : + "xmm0", "xmm1", "xmm2", "xmm4"); + } + + __asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" + "movapd %%xmm1, %1 \n\t" + "movapd %%xmm2, %2" + : + "=m" ((*s).p[0]), + "=m" ((*s).p[1]), + "=m" ((*s).p[2])); +} + + +void chexp_drv1(su3_alg_dble *X,ch_drv1_t *s) +{ + int n; + + if (init_flag==0) + ch_init(); + + if (init_flag==2) + { + (*s)=sp1; + (*s).p[0].re=1.0; + return; + } + + eval_td(X); + (*s).t=t; + (*s).d=d; + + __asm__ __volatile__("movddup %0, %%xmm14 \n\t" + "movddup %1, %%xmm15 \n\t" + "movsd %2, %%xmm0 \n\t" + "xorpd %%xmm1, %%xmm1 \n\t" + "xorpd %%xmm2, %%xmm2 \n\t" + "mulpd %3, %%xmm14 \n\t" + "mulpd %3, %%xmm15 \n\t" + "xorpd %%xmm3, %%xmm3 \n\t" + "xorpd %%xmm4, %%xmm4 \n\t" + "xorpd %%xmm5, %%xmm5 \n\t" + "shufpd $0x0, %%xmm14, %%xmm14 \n\t" + "shufpd $0x1, %%xmm15, %%xmm15" + : + : + "m" ((*s).t), + "m" ((*s).d), + "m" (c[N-3]), + "m" (_sse_sgn1_dble) + : + "xmm0", 
"xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm14", "xmm15"); + + for (n=N-4;n>=0;n--) + { + __asm__ __volatile__("movapd %%xmm2, %%xmm6 \n\t" + "movapd %%xmm5, %%xmm7 \n\t" + "movapd %%xmm2, %%xmm8 \n\t" + "movapd %%xmm5, %%xmm9 \n\t" + "shufpd $0x1, %%xmm6, %%xmm6 \n\t" + "shufpd $0x1, %%xmm7, %%xmm7 \n\t" + "mulpd %0, %%xmm2 \n\t" + "mulpd %%xmm14, %%xmm8 \n\t" + "mulpd %%xmm14, %%xmm9 \n\t" + "mulpd %%xmm15, %%xmm6 \n\t" + "mulpd %%xmm15, %%xmm7 \n\t" + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" + "addsd %1, %%xmm6" + : + : + "m" (_sse_sgn1_dble), + "m" (c[n]) + : + "xmm2", "xmm6", "xmm7", "xmm8", + "xmm9"); + + __asm__ __volatile__("addpd %%xmm0, %%xmm8 \n\t" + "addpd %%xmm2, %%xmm7 \n\t" + "addpd %%xmm3, %%xmm9 \n\t" + "movapd %%xmm1, %%xmm2 \n\t" + "movapd %%xmm6, %%xmm0 \n\t" + "movapd %%xmm8, %%xmm1 \n\t" + "movapd %%xmm4, %%xmm5 \n\t" + "movapd %%xmm7, %%xmm3 \n\t" + "movapd %%xmm9, %%xmm4" + : + : + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm7", "xmm8", + "xmm9"); + } + + __asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" + "movapd %%xmm1, %1 \n\t" + "movapd %%xmm2, %2 \n\t" + "movapd %%xmm3, %3 \n\t" + "movapd %%xmm4, %4 \n\t" + "movapd %%xmm5, %5" + : + "=m" ((*s).p[0]), + "=m" ((*s).p[1]), + "=m" ((*s).p[2]), + "=m" ((*s).pd[0]), + "=m" ((*s).pd[1]), + "=m" ((*s).pd[2])); + + (*s).pt[0].re=-d*(*s).pd[2].re; + (*s).pt[0].im=-d*(*s).pd[2].im; + (*s).pt[1].re= (*s).pd[0].im-t*(*s).pd[2].im; + (*s).pt[1].im=-(*s).pd[0].re+t*(*s).pd[2].re; + (*s).pt[2].re= (*s).pd[1].im; + (*s).pt[2].im=-(*s).pd[1].re; +} + + +void chexp_drv2(su3_alg_dble *X,ch_drv2_t *s) +{ + int n; + + if (init_flag==0) + ch_init(); + + if (init_flag==2) + { + (*s)=sp2; + (*s).p[0].re=1.0; + return; + } + + eval_td(X); + (*s).t=t; + (*s).d=d; + + __asm__ __volatile__("movddup %0, %%xmm14 \n\t" + "movddup %1, %%xmm15 \n\t" + "movsd %2, %%xmm0 \n\t" + "xorpd %%xmm1, %%xmm1 \n\t" + "xorpd %%xmm2, %%xmm2 \n\t" + "mulpd %3, %%xmm14 \n\t" + "xorpd %%xmm3, %%xmm3 \n\t" + "xorpd %%xmm4, %%xmm4 
\n\t" + "xorpd %%xmm5, %%xmm5 \n\t" + "shufpd $0x1, %%xmm14, %%xmm14 \n\t" + "xorpd %%xmm6, %%xmm6 \n\t" + "xorpd %%xmm7, %%xmm7 \n\t" + "xorpd %%xmm8, %%xmm8" + : + : + "m" ((*s).d), + "m" ((*s).t), + "m" (c[N]), + "m" (_sse_sgn1_dble) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm14", "xmm15"); + + for (n=N-1;n>=0;n--) + { + __asm__ __volatile__("movapd %%xmm2, %%xmm9 \n\t" + "movapd %%xmm0, %%xmm10 \n\t" + "shufpd $0x1, %%xmm9, %%xmm9 \n\t" + "movapd %%xmm2, %%xmm11 \n\t" + "mulpd %%xmm14, %%xmm9 \n\t" + "mulpd %%xmm15, %%xmm11 \n\t" + "addsd %0, %%xmm9 \n\t" + "subpd %%xmm11, %%xmm10" + : + : + "m" (c[n]) + : + "xmm9", "xmm10", "xmm11"); + + __asm__ __volatile__("movapd %%xmm5, %%xmm11 \n\t" + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" + "movapd %%xmm3, %%xmm12 \n\t" + "shufpd $0x1, %%xmm11, %%xmm11 \n\t" + "mulpd %0, %%xmm2 \n\t" + "movapd %%xmm5, %%xmm13 \n\t" + "mulpd %%xmm14, %%xmm11 \n\t" + "mulpd %%xmm15, %%xmm13 \n\t" + "addpd %%xmm5, %%xmm5 \n\t" + "subpd %%xmm2, %%xmm11 \n\t" + "subpd %%xmm13, %%xmm12" + : + : + "m" (_sse_sgn1_dble) + : + "xmm2", "xmm5", "xmm11", "xmm12", + "xmm13"); + + __asm__ __volatile__("movapd %%xmm1, %%xmm2 \n\t" + "movapd %%xmm9, %%xmm0 \n\t" + "movapd %%xmm10, %%xmm1" + : + : + : + "xmm0", "xmm1", "xmm2"); + + __asm__ __volatile__("movapd %%xmm8, %%xmm9 \n\t" + "shufpd $0x1, %%xmm5, %%xmm5 \n\t" + "movapd %%xmm6, %%xmm10 \n\t" + "shufpd $0x1, %%xmm9, %%xmm9 \n\t" + "mulpd %0, %%xmm5 \n\t" + "movapd %%xmm8, %%xmm13 \n\t" + "mulpd %%xmm14, %%xmm9 \n\t" + "mulpd %%xmm15, %%xmm13 \n\t" + "subpd %%xmm5, %%xmm9 \n\t" + "subpd %%xmm13, %%xmm10" + : + : + "m" (_sse_sgn1_dble) + : + "xmm5", "xmm9", "xmm10", "xmm13"); + + __asm__ __volatile__("movapd %%xmm4, %%xmm5 \n\t" + "movapd %%xmm11, %%xmm3 \n\t" + "movapd %%xmm12, %%xmm4 \n\t" + "movapd %%xmm7, %%xmm8 \n\t" + "movapd %%xmm9, %%xmm6 \n\t" + "movapd %%xmm10, %%xmm7" + : + : + : + "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8"); + } + + __asm__ 
__volatile__ ("movapd %%xmm0, %0 \n\t" + "movapd %%xmm1, %1 \n\t" + "movapd %%xmm2, %2 \n\t" + "movapd %%xmm3, %3 \n\t" + "movapd %%xmm4, %4 \n\t" + "movapd %%xmm5, %5 \n\t" + "movapd %%xmm6, %6 \n\t" + "movapd %%xmm7, %7 \n\t" + "movapd %%xmm8, %8" + : + "=m" ((*s).p[0]), + "=m" ((*s).p[1]), + "=m" ((*s).p[2]), + "=m" ((*s).pd[0]), + "=m" ((*s).pd[1]), + "=m" ((*s).pd[2]), + "=m" ((*s).pdd[0]), + "=m" ((*s).pdd[1]), + "=m" ((*s).pdd[2])); + + (*s).pt[0].re=-d*(*s).pd[2].re; + (*s).pt[0].im=-d*(*s).pd[2].im; + (*s).pt[1].re= (*s).pd[0].im-t*(*s).pd[2].im; + (*s).pt[1].im=-(*s).pd[0].re+t*(*s).pd[2].re; + (*s).pt[2].re= (*s).pd[1].im; + (*s).pt[2].im=-(*s).pd[1].re; + + (*s).ptd[0].re=-(*s).pd[2].re-d*(*s).pdd[2].re; + (*s).ptd[0].im=-(*s).pd[2].im-d*(*s).pdd[2].im; + (*s).ptd[1].re= (*s).pdd[0].im-t*(*s).pdd[2].im; + (*s).ptd[1].im=-(*s).pdd[0].re+t*(*s).pdd[2].re; + (*s).ptd[2].re= (*s).pdd[1].im; + (*s).ptd[2].im=-(*s).pdd[1].re; + + (*s).ptt[0].re=-d*(*s).pdd[1].im; + (*s).ptt[0].im= d*(*s).pdd[1].re; + (*s).ptt[1].re=-2.0*(*s).pd[2].im+t*(*s).pdd[1].re-d*(*s).pdd[2].im; + (*s).ptt[1].im= 2.0*(*s).pd[2].re+t*(*s).pdd[1].im+d*(*s).pdd[2].re; + (*s).ptt[2].re=-(*s).pdd[0].re+t*(*s).pdd[2].re; + (*s).ptt[2].im=-(*s).pdd[0].im+t*(*s).pdd[2].im; +} + +#else + +static void mapX2v(su3_alg_dble *X) /* Expands the eight real components X.c1..X.c8 into the three complex vectors v1, v2, v3 (declared outside this function). Each diagonal entry vK.cK is purely imaginary, the three diagonal entries sum to zero, and vJ.cI equals minus the complex conjugate of vI.cJ. */ +{ + v1.c1.re=0.0; + v1.c1.im=(*X).c1+(*X).c2; + v1.c2.re=(*X).c3; + v1.c2.im=(*X).c4; + v1.c3.re=(*X).c5; + v1.c3.im=(*X).c6; + + v2.c1.re=-(*X).c3; + v2.c1.im=(*X).c4; + v2.c2.re=0.0; + v2.c2.im=(*X).c2-2.0*(*X).c1; + v2.c3.re=(*X).c7; + v2.c3.im=(*X).c8; + + v3.c1.re=-(*X).c5; + v3.c1.im=(*X).c6; + v3.c2.re=-(*X).c7; + v3.c2.im=(*X).c8; + v3.c3.re=0.0; + v3.c3.im=(*X).c1-2.0*(*X).c2; +} + + +void ch2mat(complex_dble *p,su3_alg_dble *X,su3_dble *u) /* Writes to u the combination p[0] + p[1]*M + p[2]*M^2, where M is the 3x3 matrix whose rows are the vectors v1, v2, v3 set by mapX2v(X). The first group of assignments stores p[0] on the diagonal plus p[1] times each entry of M; the second group subtracts p[2] times scalar products of the rows. NOTE(review): reading the second group as the M^2 term assumes _vector_prod_re and _vector_prod_im return the real and imaginary parts of a conjugated scalar product; those macros are defined elsewhere - confirm. */ +{ + complex_dble z; + + mapX2v(X); + + (*u).c11.re=p[0].re-p[1].im*v1.c1.im; + (*u).c11.im=p[0].im+p[1].re*v1.c1.im; + (*u).c12.re=p[1].re*v1.c2.re-p[1].im*v1.c2.im; + (*u).c12.im=p[1].re*v1.c2.im+p[1].im*v1.c2.re; + 
(*u).c13.re=p[1].re*v1.c3.re-p[1].im*v1.c3.im; + (*u).c13.im=p[1].re*v1.c3.im+p[1].im*v1.c3.re; + + (*u).c21.re=p[1].re*v2.c1.re-p[1].im*v2.c1.im; + (*u).c21.im=p[1].re*v2.c1.im+p[1].im*v2.c1.re; + (*u).c22.re=p[0].re-p[1].im*v2.c2.im; + (*u).c22.im=p[0].im+p[1].re*v2.c2.im; + (*u).c23.re=p[1].re*v2.c3.re-p[1].im*v2.c3.im; + (*u).c23.im=p[1].re*v2.c3.im+p[1].im*v2.c3.re; + + (*u).c31.re=p[1].re*v3.c1.re-p[1].im*v3.c1.im; + (*u).c31.im=p[1].re*v3.c1.im+p[1].im*v3.c1.re; + (*u).c32.re=p[1].re*v3.c2.re-p[1].im*v3.c2.im; + (*u).c32.im=p[1].re*v3.c2.im+p[1].im*v3.c2.re; + (*u).c33.re=p[0].re-p[1].im*v3.c3.im; + (*u).c33.im=p[0].im+p[1].re*v3.c3.im; + + z.re=_vector_prod_re(v1,v1); + (*u).c11.re-=p[2].re*z.re; + (*u).c11.im-=p[2].im*z.re; + + z.re=_vector_prod_re(v2,v2); + (*u).c22.re-=p[2].re*z.re; + (*u).c22.im-=p[2].im*z.re; + + z.re=_vector_prod_re(v3,v3); + (*u).c33.re-=p[2].re*z.re; + (*u).c33.im-=p[2].im*z.re; + + z.re=_vector_prod_re(v1,v2); + z.im=_vector_prod_im(v1,v2); + (*u).c12.re-=p[2].re*z.re+p[2].im*z.im; + (*u).c12.im-=p[2].im*z.re-p[2].re*z.im; + (*u).c21.re-=p[2].re*z.re-p[2].im*z.im; + (*u).c21.im-=p[2].im*z.re+p[2].re*z.im; + + z.re=_vector_prod_re(v1,v3); + z.im=_vector_prod_im(v1,v3); + (*u).c13.re-=p[2].re*z.re+p[2].im*z.im; + (*u).c13.im-=p[2].im*z.re-p[2].re*z.im; + (*u).c31.re-=p[2].re*z.re-p[2].im*z.im; + (*u).c31.im-=p[2].im*z.re+p[2].re*z.im; + + z.re=_vector_prod_re(v2,v3); + z.im=_vector_prod_im(v2,v3); + (*u).c23.re-=p[2].re*z.re+p[2].im*z.im; + (*u).c23.im-=p[2].im*z.re-p[2].re*z.im; + (*u).c32.re-=p[2].re*z.re-p[2].im*z.im; + (*u).c32.im-=p[2].im*z.re+p[2].re*z.im; +} + + +void chexp_drv0(su3_alg_dble *X,ch_drv0_t *s) /* Fills *s with the coefficients p[0..2] for the given X. Calls ch_init() first when init_flag is 0. If init_flag is then 2, *s becomes a copy of sp0 with p[0].re set to 1.0 and the function returns without looking at X. Otherwise eval_td(X) is called (presumably it sets t and d; defined elsewhere), t and d are copied into *s, and p is built by a downward recursion over c[n], n=N-7..0, starting from p[0].re=c[N-6]. With q0,q1,q2 the previous p values, each step sets p[0]=c[n]-i*d*q2, p[1]=q0-t*q2, p[2]=q1. */ +{ + int n; + complex_dble q0,q1,q2; + + if (init_flag==0) + ch_init(); + + if (init_flag==2) + { + (*s)=sp0; + (*s).p[0].re=1.0; + return; + } + + eval_td(X); + (*s)=sp0; + (*s).t=t; + (*s).d=d; + (*s).p[0].re=c[N-6]; + + for (n=(N-7);n>=0;n--) + { + q0=(*s).p[0]; + q1=(*s).p[1]; + q2=(*s).p[2]; + + 
(*s).p[0].re=c[n]+d*q2.im; + (*s).p[0].im=-d*q2.re; + (*s).p[1].re=q0.re-t*q2.re; + (*s).p[1].im=q0.im-t*q2.im; + (*s).p[2].re=q1.re; + (*s).p[2].im=q1.im; + } +} + + +void chexp_drv1(su3_alg_dble *X,ch_drv1_t *s) /* Same scheme as chexp_drv0, but using the template sp1, starting from p[0].re=c[N-3] with n=N-4..0, and additionally carrying pd[0..2]. The pd update is the p update differentiated once with respect to d: pd[0]=-i*(q2+d*q2d), pd[1]=q0d-t*q2d, pd[2]=q1d. After the loop pt[0..2] is formed from pd, t and d. */ +{ + int n; + complex_dble q0,q1,q2; + complex_dble q0d,q1d,q2d; + + if (init_flag==0) + ch_init(); + + if (init_flag==2) + { + (*s)=sp1; + (*s).p[0].re=1.0; + return; + } + + eval_td(X); + (*s)=sp1; + (*s).t=t; + (*s).d=d; + (*s).p[0].re=c[N-3]; + + for (n=(N-4);n>=0;n--) + { + q0=(*s).p[0]; + q1=(*s).p[1]; + q2=(*s).p[2]; + + (*s).p[0].re=c[n]+d*q2.im; + (*s).p[0].im=-d*q2.re; + (*s).p[1].re=q0.re-t*q2.re; + (*s).p[1].im=q0.im-t*q2.im; + (*s).p[2].re=q1.re; + (*s).p[2].im=q1.im; + + q0d=(*s).pd[0]; + q1d=(*s).pd[1]; + q2d=(*s).pd[2]; + + (*s).pd[0].re= q2.im+d*q2d.im; + (*s).pd[0].im=-q2.re-d*q2d.re; + (*s).pd[1].re=q0d.re-t*q2d.re; + (*s).pd[1].im=q0d.im-t*q2d.im; + (*s).pd[2].re=q1d.re; + (*s).pd[2].im=q1d.im; + } + + (*s).pt[0].re=-d*(*s).pd[2].re; + (*s).pt[0].im=-d*(*s).pd[2].im; + (*s).pt[1].re= (*s).pd[0].im-t*(*s).pd[2].im; + (*s).pt[1].im=-(*s).pd[0].re+t*(*s).pd[2].re; + (*s).pt[2].re= (*s).pd[1].im; + (*s).pt[2].im=-(*s).pd[1].re; +} + + +void chexp_drv2(su3_alg_dble *X,ch_drv2_t *s) /* Same scheme again, using the template sp2, starting from p[0].re=c[N] with n=N-1..0, and carrying p, pd and pdd. The pdd update is the p update differentiated twice with respect to d: pdd[0]=-i*(2*q2d+d*q2dd), pdd[1]=q0dd-t*q2dd, pdd[2]=q1dd. After the loop pt, ptd and ptt are formed from pd, pdd, t and d. */ +{ + int n; + complex_dble q0,q1,q2; + complex_dble q0d,q1d,q2d; + complex_dble q0dd,q1dd,q2dd; + + if (init_flag==0) + ch_init(); + + if (init_flag==2) + { + (*s)=sp2; + (*s).p[0].re=1.0; + return; + } + + eval_td(X); + (*s)=sp2; + (*s).t=t; + (*s).d=d; + (*s).p[0].re=c[N]; + + for (n=(N-1);n>=0;n--) + { + q0=(*s).p[0]; + q1=(*s).p[1]; + q2=(*s).p[2]; + + (*s).p[0].re=c[n]+d*q2.im; + (*s).p[0].im=-d*q2.re; + (*s).p[1].re=q0.re-t*q2.re; + (*s).p[1].im=q0.im-t*q2.im; + (*s).p[2].re=q1.re; + (*s).p[2].im=q1.im; + + q0d=(*s).pd[0]; + q1d=(*s).pd[1]; + q2d=(*s).pd[2]; + + (*s).pd[0].re= q2.im+d*q2d.im; + (*s).pd[0].im=-q2.re-d*q2d.re; + (*s).pd[1].re=q0d.re-t*q2d.re; + (*s).pd[1].im=q0d.im-t*q2d.im; + (*s).pd[2].re=q1d.re; + (*s).pd[2].im=q1d.im; + + 
q0dd=(*s).pdd[0]; + q1dd=(*s).pdd[1]; + q2dd=(*s).pdd[2]; + + (*s).pdd[0].re= 2.0*q2d.im+d*q2dd.im; + (*s).pdd[0].im=-2.0*q2d.re-d*q2dd.re; + (*s).pdd[1].re=q0dd.re-t*q2dd.re; + (*s).pdd[1].im=q0dd.im-t*q2dd.im; + (*s).pdd[2].re=q1dd.re; + (*s).pdd[2].im=q1dd.im; + } + + (*s).pt[0].re=-d*(*s).pd[2].re; + (*s).pt[0].im=-d*(*s).pd[2].im; + (*s).pt[1].re= (*s).pd[0].im-t*(*s).pd[2].im; + (*s).pt[1].im=-(*s).pd[0].re+t*(*s).pd[2].re; + (*s).pt[2].re= (*s).pd[1].im; + (*s).pt[2].im=-(*s).pd[1].re; + + (*s).ptd[0].re=-(*s).pd[2].re-d*(*s).pdd[2].re; + (*s).ptd[0].im=-(*s).pd[2].im-d*(*s).pdd[2].im; + (*s).ptd[1].re= (*s).pdd[0].im-t*(*s).pdd[2].im; + (*s).ptd[1].im=-(*s).pdd[0].re+t*(*s).pdd[2].re; + (*s).ptd[2].re= (*s).pdd[1].im; + (*s).ptd[2].im=-(*s).pdd[1].re; + + (*s).ptt[0].re=-d*(*s).pdd[1].im; + (*s).ptt[0].im= d*(*s).pdd[1].re; + (*s).ptt[1].re=-2.0*(*s).pd[2].im+t*(*s).pdd[1].re-d*(*s).pdd[2].im; + (*s).ptt[1].im= 2.0*(*s).pd[2].re+t*(*s).pdd[1].im+d*(*s).pdd[2].re; + (*s).ptt[2].re=-(*s).pdd[0].re+t*(*s).pdd[2].re; + (*s).ptt[2].im=-(*s).pdd[0].im+t*(*s).pdd[2].im; +} + +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/su3fcts/cm3x3.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/su3fcts/cm3x3.c new file mode 100644 index 0000000000000000000000000000000000000000..8db577d45b0b42fdb654a3227ea6f9c492578f91 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/su3fcts/cm3x3.c @@ -0,0 +1,1953 @@ + +/******************************************************************************* +* +* File cm3x3.c +* +* Copyright (C) 2009, 2010, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Complex 3x3 matrix operations +* +* The externally accessible functions are +* +* void cm3x3_zero(int vol,su3_dble *u) +* Sets the elements of the array u[] to zero +* +* void cm3x3_unity(int vol,su3_dble *u) +* Sets the 
elements of the array u[] to the unit matrix +* +* void cm3x3_assign(int vol,su3_dble *u,su3_dble *v) +* Assigns the elements of the array u[] to those of the array v[] +* +* void cm3x3_swap(int vol,su3_dble *u,su3_dble *v) +* Swaps the elements of the array u[] with those of the array v[] +* +* void cm3x3_dagger(su3_dble *u,su3_dble *v) +* Assigns the hermitian conjugate of (*u) to (*v) +* +* void cm3x3_tr(su3_dble *u,su3_dble *v,complex_dble *tr) +* Assigns the trace of (*u)*(*v) to (*tr) +* +* void cm3x3_retr(su3_dble *u,su3_dble *v,double *tr) +* Assigns the real part of the trace of (*u)*(*v) to (*tr) +* +* void cm3x3_imtr(su3_dble *u,su3_dble *v,double *tr) +* Assigns the imaginary part of the trace of (*u)*(*v) to (*tr) +* +* void cm3x3_add(su3_dble *u,su3_dble *v) +* Adds (*u) to (*v). The input matrix is unchanged unless u=v +* +* void cm3x3_mul_add(su3_dble *u,su3_dble *v,su3_dble *w) +* Adds (*u)*(*v) to (*w) assuming that w!=u. The input matrix (*u) +* is unchanged and also (*v) unless v=w +* +* void cm3x3_mulr(double *r,su3_dble *u,su3_dble *v) +* Assigns (*r)*(*u) to (*v). The input matrix is unchanged +* unless u=v +* +* void cm3x3_mulr_add(double *r,su3_dble *u,su3_dble *v) +* Adds (*r)*(*u) to (*v). The input matrix is unchanged +* unless u=v +* +* void cm3x3_mulc(complex_dble *c,su3_dble *u,su3_dble *v) +* Assigns (*c)*(*u) to (*v). The input matrix is unchanged +* unless u=v +* +* void cm3x3_mulc_add(complex_dble *c,su3_dble *u,su3_dble *v) +* Adds (*c)*(*u) to (*v). The input matrix is unchanged +* unless u=v +* +* void cm3x3_lc1(complex_dble *c,su3_dble *u,su3_dble *v) +* Assigns c[0]+c[1]*(*u) to (*v). The input matrix is unchanged +* unless u=v +* +* void cm3x3_lc2(complex_dble *c,su3_dble *u,su3_dble *v) +* Assigns c[0]+c[1]*u[0]+c[2]*u[1] to (*v) assuming v!=u+1. The +* input matrix u[1] is unchanged and also u[0] unless u=v +* +* Notes: +* +* The programs in this module do not perform any communications and can be +* called locally. 
The parameter vol specifies the number of elements of +* the arrays in the argument list. +* +* If SSE2 instructions are used, it is assumed that the matrices and complex +* coefficients are aligned to a 16 byte boundary. +* +*******************************************************************************/ + +#define CM3X3_C + +#include +#include +#include "su3.h" +#include "su3fcts.h" + +#if (defined x64) +#include "sse2.h" + +static const double one=1.0; + + +void cm3x3_zero(int vol,su3_dble *u) +{ + su3_dble *um; + + __asm__ __volatile__ ("xorpd %%xmm0, %%xmm0 \n\t" + "xorpd %%xmm1, %%xmm1 \n\t" + "xorpd %%xmm2, %%xmm2" + : + : + : + "xmm0", "xmm1", "xmm2"); + + um=u+vol; + + for (;u +#include +#include +#include +#include "su3.h" +#include "random.h" +#include "su3fcts.h" + +static float rs[6]; +static double rd[6]; +static su3_vector vs1,vs2,vs3; +static su3_vector_dble vd1,vd2,vd3; + + +static void random_su3_vector(su3_vector *v) /* Fills *v with six single-precision numbers obtained from gauss(rs,6), scaled to unit length. The draw is repeated until the sum of squares exceeds 0.1, so the division by sqrt(norm) is never close to a division by zero. Writes through the static buffer rs. */ +{ + float norm,fact; + + norm=0.0f; + + while (norm<=0.1f) + { + gauss(rs,6); + norm=rs[0]*rs[0]+rs[1]*rs[1]+rs[2]*rs[2]+ + rs[3]*rs[3]+rs[4]*rs[4]+rs[5]*rs[5]; + } + + fact=1.0f/(float)sqrt((double)(norm)); + + (*v).c1.re=fact*rs[0]; + (*v).c1.im=fact*rs[1]; + (*v).c2.re=fact*rs[2]; + (*v).c2.im=fact*rs[3]; + (*v).c3.re=fact*rs[4]; + (*v).c3.im=fact*rs[5]; +} + + +void random_su3(su3 *u) /* Builds *u row by row from the static work vectors vs1, vs2, vs3. vs1 is a random unit vector. vs2 is redrawn until _vector_cross_prod(vs3,vs1,vs2) gives a vs3 with squared norm above 0.1; vs3 is then scaled by 1/sqrt(norm) and vs2 is recomputed as _vector_cross_prod(vs2,vs3,vs1). Rows 1, 2, 3 of *u are vs1, vs2, vs3. NOTE(review): the result is presumably a random SU(3) matrix, which depends on the conjugation convention of the _vector_cross_prod macro defined elsewhere - confirm. */ +{ + float norm,fact; + + random_su3_vector(&vs1); + norm=0.0f; + + while (norm<=0.1f) + { + random_su3_vector(&vs2); + _vector_cross_prod(vs3,vs1,vs2); + norm=_vector_prod_re(vs3,vs3); + } + + fact=1.0f/(float)sqrt((double)(norm)); + + vs3.c1.re*=fact; + vs3.c1.im*=fact; + vs3.c2.re*=fact; + vs3.c2.im*=fact; + vs3.c3.re*=fact; + vs3.c3.im*=fact; + + _vector_cross_prod(vs2,vs3,vs1); + + (*u).c11.re=vs1.c1.re; + (*u).c11.im=vs1.c1.im; + (*u).c12.re=vs1.c2.re; + (*u).c12.im=vs1.c2.im; + (*u).c13.re=vs1.c3.re; + (*u).c13.im=vs1.c3.im; + + (*u).c21.re=vs2.c1.re; + (*u).c21.im=vs2.c1.im; + (*u).c22.re=vs2.c2.re; + (*u).c22.im=vs2.c2.im; + 
(*u).c23.re=vs2.c3.re; + (*u).c23.im=vs2.c3.im; + + (*u).c31.re=vs3.c1.re; + (*u).c31.im=vs3.c1.im; + (*u).c32.re=vs3.c2.re; + (*u).c32.im=vs3.c2.im; + (*u).c33.re=vs3.c3.re; + (*u).c33.im=vs3.c3.im; +} + + +static void random_su3_vector_dble(su3_vector_dble *v) /* Double-precision counterpart of random_su3_vector: draws six numbers with gauss_dble(rd,6) until their sum of squares exceeds 0.1, then stores them in *v scaled by 1/sqrt(norm). Writes through the static buffer rd. */ +{ + double norm,fact; + + norm=0.0; + + while (norm<=0.1) + { + gauss_dble(rd,6); + norm=rd[0]*rd[0]+rd[1]*rd[1]+rd[2]*rd[2]+ + rd[3]*rd[3]+rd[4]*rd[4]+rd[5]*rd[5]; + } + + fact=1.0/sqrt(norm); + + (*v).c1.re=fact*rd[0]; + (*v).c1.im=fact*rd[1]; + (*v).c2.re=fact*rd[2]; + (*v).c2.im=fact*rd[3]; + (*v).c3.re=fact*rd[4]; + (*v).c3.im=fact*rd[5]; +} + + +void random_su3_dble(su3_dble *u) /* Double-precision counterpart of random_su3, using the static work vectors vd1, vd2, vd3 and the same sequence of steps: random unit vd1, vd2 redrawn until the cross product vd3 has squared norm above 0.1, vd3 normalised, vd2 recomputed from vd3 and vd1, then the three vectors copied into the rows of *u. */ +{ + double norm,fact; + + random_su3_vector_dble(&vd1); + norm=0.0; + + while (norm<=0.1) + { + random_su3_vector_dble(&vd2); + _vector_cross_prod(vd3,vd1,vd2); + norm=_vector_prod_re(vd3,vd3); + } + + fact=1.0/sqrt(norm); + + vd3.c1.re*=fact; + vd3.c1.im*=fact; + vd3.c2.re*=fact; + vd3.c2.im*=fact; + vd3.c3.re*=fact; + vd3.c3.im*=fact; + + _vector_cross_prod(vd2,vd3,vd1); + + (*u).c11.re=vd1.c1.re; + (*u).c11.im=vd1.c1.im; + (*u).c12.re=vd1.c2.re; + (*u).c12.im=vd1.c2.im; + (*u).c13.re=vd1.c3.re; + (*u).c13.im=vd1.c3.im; + + (*u).c21.re=vd2.c1.re; + (*u).c21.im=vd2.c1.im; + (*u).c22.re=vd2.c2.re; + (*u).c22.im=vd2.c2.im; + (*u).c23.re=vd2.c3.re; + (*u).c23.im=vd2.c3.im; + + (*u).c31.re=vd3.c1.re; + (*u).c31.im=vd3.c1.im; + (*u).c32.re=vd3.c2.re; + (*u).c32.im=vd3.c2.im; + (*u).c33.re=vd3.c3.re; + (*u).c33.im=vd3.c3.im; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/su3fcts/su3prod.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/su3fcts/su3prod.c new file mode 100644 index 0000000000000000000000000000000000000000..fbe4b2adaa1df97b310c4c3fcd6d4364d7f2c3e7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/su3fcts/su3prod.c @@ -0,0 +1,2489 @@ + +/******************************************************************************* +* 
+* File su3prod.c +* +* Copyright (C) 2005, 2009, 2010, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Products of double-precision 3x3 matrices +* +* The externally accessible functions are +* +* void su3xsu3(su3_dble *u,su3_dble *v,su3_dble *w) +* Computes w=u*v assuming that w is different from u. +* +* void su3dagxsu3(su3_dble *u,su3_dble *v,su3_dble *w) +* Computes w=u^dag*v assuming that w is different from u. +* +* void su3xsu3dag(su3_dble *u,su3_dble *v,su3_dble *w) +* Computes w=u*v^dag assuming that w is different from u and v. +* +* void su3dagxsu3dag(su3_dble *u,su3_dble *v,su3_dble *w) +* Computes w=u^dag*v^dag assuming that w is different from u and v. +* +* void su3xu3alg(su3_dble *u,u3_alg_dble *X,su3_dble *v) +* Computes v=u*X assuming that v is different from u. +* +* void su3dagxu3alg(su3_dble *u,u3_alg_dble *X,su3_dble *v) +* Computes v=u^dag*X assuming that v is different from u. +* +* void u3algxsu3(u3_alg_dble *X,su3_dble *u,su3_dble *v) +* Computes v=X*u assuming that v is different from u. +* +* void u3algxsu3dag(u3_alg_dble *X,su3_dble *u,su3_dble *v) +* Computes v=X*u^dag assuming that v is different from u. +* +* double prod2su3alg(su3_dble *u,su3_dble *v,su3_alg_dble *X) +* Computes the product w=u*v and assigns its traceless antihermitian +* part (1/2)*[w-w^dag-(1/3)*tr{w-w^dag}] to X. The program returns +* the real part of tr{w}. +* +* void prod2u3alg(su3_dble *u,su3_dble *v,u3_alg_dble *X) +* Computes the product w=u*v and assigns w-w^dag to X. +* +* void rotate_su3alg(su3_dble *u,su3_alg_dble *X) +* Replaces X by u*X*u^dag. The matrix u must be unitary but its +* determinant may be different from 1. +* +* Notes: +* +* Unless stated otherwise, the matrices of type su3_dble are not assumed to +* be unitary or unimodular. They are just treated as general 3x3 complex +* matrices and the operations are applied to them as described. 
+* +* The elements X of the Lie algebra of U(3) are antihermitian 3x3 matrices +* that are represented by structures X with real entries X.c1,...,X.c9 +* through +* +* X_11=i*X.c1, X_22=i*X.c2, X_33=i*X.c3, +* +* X_12=X.c4+i*X.c5, X_13=X.c6+i*X.c7, X_23=X.c8+i*X.c9 +* +* The type su3_alg_dble [which represents elements of the Lie algebra of SU(3)] +* is described in the file linalg/liealg.c. +* +* If SSE2 or AVX instructions are used, all su3_dble and su3_alg_dble matrices +* are assumed to be aligned to 16 byte boundaries. +* +*******************************************************************************/ + +#define SU3PROD_C + +#include +#include +#include +#include "su3.h" +#include "su3fcts.h" + +#if (defined AVX) +#include "avx.h" + +static const sse_double c0={0.5,0.5},c1={-1.0/3.0,-1.0/3.0}; +static su3_dble uX ALIGNED16; +static double tr ALIGNED8; + + +static void su3xsu3vec(su3_dble *u) /* Wraps the _sse_su3_multiply_dble macro (defined elsewhere). The callers in this file load xmm0-xmm2 before the call and read xmm3-xmm5 after it. */ +{ + _sse_su3_multiply_dble(*u); +} + + +static void su3xsu3vec_pair(su3_dble *u) /* Wraps _avx_su3_multiply_pair_dble. The callers place two input vectors in the low and high 128-bit halves of ymm0-ymm2 and read the two results from ymm3-ymm5. */ +{ + _avx_su3_multiply_pair_dble(*u); +} + + +static void su3dagxsu3vec(su3_dble *u) /* As su3xsu3vec, but expanding the _sse_su3_inverse_multiply_dble macro. */ +{ + _sse_su3_inverse_multiply_dble(*u); +} + + +static void su3dagxsu3vec_pair(su3_dble *u) /* As su3xsu3vec_pair, but expanding the _avx_su3_inverse_multiply_pair_dble macro. */ +{ + _avx_su3_inverse_multiply_pair_dble(*u); +} + + +void su3xsu3(su3_dble *u,su3_dble *v,su3_dble *w) /* AVX path. The first two columns of v are packed into ymm0-ymm2 (column 1 in the low halves, column 2 in the high halves) and handled by a single su3xsu3vec_pair call, whose results are stored to the first two columns of w. After _avx_zeroupper() the third column is processed with 128-bit instructions and su3xsu3vec. */ +{ + __asm__ __volatile__ ("vmovapd %0, %%xmm0 \n\t" + "vmovapd %1, %%xmm1 \n\t" + "vmovapd %2, %%xmm2 \n\t" + "vinsertf128 $0x1, %3, %%ymm0, %%ymm0 \n\t" + "vinsertf128 $0x1, %4, %%ymm1, %%ymm1 \n\t" + "vinsertf128 $0x1, %5, %%ymm2, %%ymm2" + : + : + "m" ((*v).c11), + "m" ((*v).c21), + "m" ((*v).c31), + "m" ((*v).c12), + "m" ((*v).c22), + "m" ((*v).c32) + : + "xmm0", "xmm1", "xmm2"); + + su3xsu3vec_pair(u); + + __asm__ __volatile__ ("vmovapd %%xmm3, %0 \n\t" + "vmovapd %%xmm4, %1 \n\t" + "vmovapd %%xmm5, %2 \n\t" + "vextractf128 $0x1, %%ymm3, %3 \n\t" + "vextractf128 $0x1, %%ymm4, %4 \n\t" + "vextractf128 $0x1, %%ymm5, %5" + : + "=m" ((*w).c11), + "=m" ((*w).c21), + "=m" ((*w).c31), + "=m" ((*w).c12), + "=m" 
((*w).c22), + "=m" ((*w).c32)); + + _avx_zeroupper(); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2" + : + : + "m" ((*v).c13), + "m" ((*v).c23), + "m" ((*v).c33) + : + "xmm0", "xmm1", "xmm2"); + + su3xsu3vec(u); + + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*w).c13), + "=m" ((*w).c23), + "=m" ((*w).c33)); +} + + +void su3dagxsu3(su3_dble *u,su3_dble *v,su3_dble *w) /* Same register layout and store pattern as the AVX su3xsu3, calling su3dagxsu3vec_pair and su3dagxsu3vec instead. */ +{ + __asm__ __volatile__ ("vmovapd %0, %%xmm0 \n\t" + "vmovapd %1, %%xmm1 \n\t" + "vmovapd %2, %%xmm2 \n\t" + "vinsertf128 $0x1, %3, %%ymm0, %%ymm0 \n\t" + "vinsertf128 $0x1, %4, %%ymm1, %%ymm1 \n\t" + "vinsertf128 $0x1, %5, %%ymm2, %%ymm2" + : + : + "m" ((*v).c11), + "m" ((*v).c21), + "m" ((*v).c31), + "m" ((*v).c12), + "m" ((*v).c22), + "m" ((*v).c32) + : + "xmm0", "xmm1", "xmm2"); + + su3dagxsu3vec_pair(u); + + __asm__ __volatile__ ("vmovapd %%xmm3, %0 \n\t" + "vmovapd %%xmm4, %1 \n\t" + "vmovapd %%xmm5, %2 \n\t" + "vextractf128 $0x1, %%ymm3, %3 \n\t" + "vextractf128 $0x1, %%ymm4, %4 \n\t" + "vextractf128 $0x1, %%ymm5, %5" + : + "=m" ((*w).c11), + "=m" ((*w).c21), + "=m" ((*w).c31), + "=m" ((*w).c12), + "=m" ((*w).c22), + "=m" ((*w).c32)); + + _avx_zeroupper(); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2" + : + : + "m" ((*v).c13), + "m" ((*v).c23), + "m" ((*v).c33) + : + "xmm0", "xmm1", "xmm2"); + + su3dagxsu3vec(u); + + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*w).c13), + "=m" ((*w).c23), + "=m" ((*w).c33)); +} + + +void su3xsu3dag(su3_dble *u,su3_dble *v,su3_dble *w) /* The input vectors are the rows of v (c11,c12,c13 and c21,c22,c23 as a pair, then c31,c32,c33), each multiplied element-wise by _sse_sgn2_dble before the product with u; the results are stored to the columns of w. NOTE(review): _sse_sgn2_dble presumably flips the sign of the imaginary parts, which would make each input the complex conjugate of a row of v; the constant is defined elsewhere - confirm. */ +{ + __asm__ __volatile__ ("vbroadcastf128 %0, %%ymm3 \n\t" + "vmovapd %1, %%xmm0 \n\t" + "vmovapd %2, %%xmm1 \n\t" + "vmovapd %3, %%xmm2 \n\t" + "vinsertf128 $0x1, %4, %%ymm0, %%ymm0 \n\t" + "vinsertf128 $0x1, %5, %%ymm1, %%ymm1 \n\t" + "vinsertf128 $0x1, %6, %%ymm2, %%ymm2 \n\t" + "vmulpd 
%%ymm3, %%ymm0, %%ymm0 \n\t" + "vmulpd %%ymm3, %%ymm1, %%ymm1 \n\t" + "vmulpd %%ymm3, %%ymm2, %%ymm2" + : + : + "m" (_sse_sgn2_dble), + "m" ((*v).c11), + "m" ((*v).c12), + "m" ((*v).c13), + "m" ((*v).c21), + "m" ((*v).c22), + "m" ((*v).c23) + : + "xmm0", "xmm1", "xmm2", "xmm3"); + + su3xsu3vec_pair(u); + + __asm__ __volatile__ ("vmovapd %%xmm3, %0 \n\t" + "vmovapd %%xmm4, %1 \n\t" + "vmovapd %%xmm5, %2 \n\t" + "vextractf128 $0x1, %%ymm3, %3 \n\t" + "vextractf128 $0x1, %%ymm4, %4 \n\t" + "vextractf128 $0x1, %%ymm5, %5" + : + "=m" ((*w).c11), + "=m" ((*w).c21), + "=m" ((*w).c31), + "=m" ((*w).c12), + "=m" ((*w).c22), + "=m" ((*w).c32)); + + _avx_zeroupper(); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2 \n\t" + "mulpd %3, %%xmm0 \n\t" + "mulpd %3, %%xmm1 \n\t" + "mulpd %3, %%xmm2" + : + : + "m" ((*v).c31), + "m" ((*v).c32), + "m" ((*v).c33), + "m" (_sse_sgn2_dble) + : + "xmm0", "xmm1", "xmm2"); + + su3xsu3vec(u); + + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*w).c13), + "=m" ((*w).c23), + "=m" ((*w).c33)); +} + + +void su3dagxsu3dag(su3_dble *u,su3_dble *v,su3_dble *w) /* Same input preparation and store pattern as the AVX su3xsu3dag, calling su3dagxsu3vec_pair and su3dagxsu3vec instead. */ +{ + __asm__ __volatile__ ("vbroadcastf128 %0, %%ymm3 \n\t" + "vmovapd %1, %%xmm0 \n\t" + "vmovapd %2, %%xmm1 \n\t" + "vmovapd %3, %%xmm2 \n\t" + "vinsertf128 $0x1, %4, %%ymm0, %%ymm0 \n\t" + "vinsertf128 $0x1, %5, %%ymm1, %%ymm1 \n\t" + "vinsertf128 $0x1, %6, %%ymm2, %%ymm2 \n\t" + "vmulpd %%ymm3, %%ymm0, %%ymm0 \n\t" + "vmulpd %%ymm3, %%ymm1, %%ymm1 \n\t" + "vmulpd %%ymm3, %%ymm2, %%ymm2" + : + : + "m" (_sse_sgn2_dble), + "m" ((*v).c11), + "m" ((*v).c12), + "m" ((*v).c13), + "m" ((*v).c21), + "m" ((*v).c22), + "m" ((*v).c23) + : + "xmm0", "xmm1", "xmm2", "xmm3"); + + su3dagxsu3vec_pair(u); + + __asm__ __volatile__ ("vmovapd %%xmm3, %0 \n\t" + "vmovapd %%xmm4, %1 \n\t" + "vmovapd %%xmm5, %2 \n\t" + "vextractf128 $0x1, %%ymm3, %3 \n\t" + "vextractf128 $0x1, %%ymm4, %4 \n\t" + 
"vextractf128 $0x1, %%ymm5, %5" + : + "=m" ((*w).c11), + "=m" ((*w).c21), + "=m" ((*w).c31), + "=m" ((*w).c12), + "=m" ((*w).c22), + "=m" ((*w).c32)); + + _avx_zeroupper(); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2 \n\t" + "mulpd %3, %%xmm0 \n\t" + "mulpd %3, %%xmm1 \n\t" + "mulpd %3, %%xmm2" + : + : + "m" ((*v).c31), + "m" ((*v).c32), + "m" ((*v).c33), + "m" (_sse_sgn2_dble) + : + "xmm0", "xmm1", "xmm2"); + + su3dagxsu3vec(u); + + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*w).c13), + "=m" ((*w).c23), + "=m" ((*w).c33)); +} + + +void su3xu3alg(su3_dble *u,u3_alg_dble *X,su3_dble *v) /* Instead of loading matrix columns, the three input vectors are assembled in registers from the real components X.c1..X.c9, with sign changes applied through _sse_sgn1_dble. Several operands (c5, c7, c9) are listed only so the unaligned 16-byte loads of the preceding component cover declared memory. The stores to v follow the same column pattern as the AVX su3xsu3. */ +{ + __asm__ __volatile__ ("vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vbroadcastf128 %8, %%ymm6 \n\t" + "vmovupd %0, %%xmm1 \n\t" + "vmovupd %2, %%xmm2 \n\t" + "vmovhpd %4, %%xmm5, %%xmm0 \n\t" + "vmovhpd %5, %%xmm5, %%xmm3 \n\t" + "vinsertf128 $0x1, %6, %%ymm2, %%ymm2 \n\t" + "vinsertf128 $0x1, %%xmm1, %%ymm0, %%ymm0 \n\t" + "vmulpd %%xmm6, %%xmm1, %%xmm1 \n\t" + "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" + "vinsertf128 $0x1, %%xmm3, %%ymm1, %%ymm1" + : + : + "m" ((*X).c4), + "m" ((*X).c5), + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c1), + "m" ((*X).c2), + "m" ((*X).c8), + "m" ((*X).c9), + "m" (_sse_sgn1_dble) + : + "xmm0", "xmm1", "xmm2", + "xmm3", "xmm5", "xmm6"); + + su3xsu3vec_pair(u); + + __asm__ __volatile__ ("vmovapd %%xmm3, %0 \n\t" + "vmovapd %%xmm4, %1 \n\t" + "vmovapd %%xmm5, %2 \n\t" + "vextractf128 $0x1, %%ymm3, %3 \n\t" + "vextractf128 $0x1, %%ymm4, %4 \n\t" + "vextractf128 $0x1, %%ymm5, %5" + : + "=m" ((*v).c11), + "=m" ((*v).c21), + "=m" ((*v).c31), + "=m" ((*v).c12), + "=m" ((*v).c22), + "=m" ((*v).c32)); + + _avx_zeroupper(); + + __asm__ __volatile__ ("movupd %0, %%xmm0 \n\t" + "movupd %2, %%xmm1 \n\t" + "xorpd %%xmm2, %%xmm2\n\t" + "movhpd %4, %%xmm2" + : + : + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c8), + "m" ((*X).c9), + "m" ((*X).c3) + : + 
"xmm0", "xmm1", "xmm2"); + + su3xsu3vec(u); + + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c13), + "=m" ((*v).c23), + "=m" ((*v).c33)); +} + + +void su3dagxu3alg(su3_dble *u,u3_alg_dble *X,su3_dble *v) /* Same input assembly and store pattern as the AVX su3xu3alg, calling su3dagxsu3vec_pair and su3dagxsu3vec instead. */ +{ + __asm__ __volatile__ ("vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vbroadcastf128 %8, %%ymm6 \n\t" + "vmovupd %0, %%xmm1 \n\t" + "vmovupd %2, %%xmm2 \n\t" + "vmovhpd %4, %%xmm5, %%xmm0 \n\t" + "vmovhpd %5, %%xmm5, %%xmm3 \n\t" + "vinsertf128 $0x1, %6, %%ymm2, %%ymm2 \n\t" + "vinsertf128 $0x1, %%xmm1, %%ymm0, %%ymm0 \n\t" + "vmulpd %%xmm6, %%xmm1, %%xmm1 \n\t" + "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" + "vinsertf128 $0x1, %%xmm3, %%ymm1, %%ymm1" + : + : + "m" ((*X).c4), + "m" ((*X).c5), + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c1), + "m" ((*X).c2), + "m" ((*X).c8), + "m" ((*X).c9), + "m" (_sse_sgn1_dble) + : + "xmm0", "xmm1", "xmm2", + "xmm3", "xmm5", "xmm6"); + + su3dagxsu3vec_pair(u); + + __asm__ __volatile__ ("vmovapd %%xmm3, %0 \n\t" + "vmovapd %%xmm4, %1 \n\t" + "vmovapd %%xmm5, %2 \n\t" + "vextractf128 $0x1, %%ymm3, %3 \n\t" + "vextractf128 $0x1, %%ymm4, %4 \n\t" + "vextractf128 $0x1, %%ymm5, %5" + : + "=m" ((*v).c11), + "=m" ((*v).c21), + "=m" ((*v).c31), + "=m" ((*v).c12), + "=m" ((*v).c22), + "=m" ((*v).c32)); + + _avx_zeroupper(); + + __asm__ __volatile__ ("movupd %0, %%xmm0 \n\t" + "movupd %2, %%xmm1 \n\t" + "xorpd %%xmm2, %%xmm2\n\t" + "movhpd %4, %%xmm2" + : + : + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c8), + "m" ((*X).c9), + "m" ((*X).c3) + : + "xmm0", "xmm1", "xmm2"); + + su3dagxsu3vec(u); + + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c13), + "=m" ((*v).c23), + "=m" ((*v).c33)); +} + + +void u3algxsu3(u3_alg_dble *X,su3_dble *u,su3_dble *v) /* Builds the input vectors from X as in su3xu3alg and calls the su3dagxsu3vec variants. The results then have signs changed (vaddsubpd against a zeroed register for the pair, mulpd by _sse_sgn1_dble for the last vector) and are stored to the ROWS of v (c11,c12,c13 / c21,c22,c23 / c31,c32,c33) rather than its columns. */ +{ + __asm__ __volatile__ ("vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vbroadcastf128 %8, %%ymm6 \n\t" + "vmovupd %0, %%xmm1 \n\t" + "vmovupd %2, %%xmm2 \n\t" + "vmovhpd %4, 
%%xmm5, %%xmm0 \n\t" + "vmovhpd %5, %%xmm5, %%xmm3 \n\t" + "vinsertf128 $0x1, %6, %%ymm2, %%ymm2 \n\t" + "vinsertf128 $0x1, %%xmm1, %%ymm0, %%ymm0 \n\t" + "vmulpd %%xmm6, %%xmm1, %%xmm1 \n\t" + "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" + "vinsertf128 $0x1, %%xmm3, %%ymm1, %%ymm1" + : + : + "m" ((*X).c4), + "m" ((*X).c5), + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c1), + "m" ((*X).c2), + "m" ((*X).c8), + "m" ((*X).c9), + "m" (_sse_sgn1_dble) + : + "xmm0", "xmm1", "xmm2", + "xmm3", "xmm5", "xmm6"); + + su3dagxsu3vec_pair(u); + + __asm__ __volatile__ ("vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vaddsubpd %%ymm3, %%ymm0, %%ymm3 \n\t" + "vaddsubpd %%ymm4, %%ymm0, %%ymm4 \n\t" + "vaddsubpd %%ymm5, %%ymm0, %%ymm5 \n\t" + "vmovapd %%xmm3, %0 \n\t" + "vmovapd %%xmm4, %1 \n\t" + "vmovapd %%xmm5, %2 \n\t" + "vextractf128 $0x1, %%ymm3, %3 \n\t" + "vextractf128 $0x1, %%ymm4, %4 \n\t" + "vextractf128 $0x1, %%ymm5, %5" + : + "=m" ((*v).c11), + "=m" ((*v).c12), + "=m" ((*v).c13), + "=m" ((*v).c21), + "=m" ((*v).c22), + "=m" ((*v).c23) + : + : + "xmm0", "xmm3", "xmm4", "xmm5"); + + _avx_zeroupper(); + + __asm__ __volatile__ ("movupd %0, %%xmm0 \n\t" + "movupd %2, %%xmm1 \n\t" + "xorpd %%xmm2, %%xmm2\n\t" + "movhpd %4, %%xmm2" + : + : + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c8), + "m" ((*X).c9), + "m" ((*X).c3) + : + "xmm0", "xmm1", "xmm2"); + + su3dagxsu3vec(u); + + __asm__ __volatile__ ("mulpd %3, %%xmm3\n\t" + "mulpd %3, %%xmm4\n\t" + "mulpd %3, %%xmm5\n\t" + "movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c31), + "=m" ((*v).c32), + "=m" ((*v).c33) + : + "m" (_sse_sgn1_dble) + : + "xmm3", "xmm4", "xmm5"); +} + + +void u3algxsu3dag(u3_alg_dble *X,su3_dble *u,su3_dble *v) /* Same as the AVX u3algxsu3 (sign changes after the product, results stored to the rows of v), but calling su3xsu3vec_pair and su3xsu3vec. */ +{ + __asm__ __volatile__ ("vxorpd %%ymm5, %%ymm5, %%ymm5 \n\t" + "vbroadcastf128 %8, %%ymm6 \n\t" + "vmovupd %0, %%xmm1 \n\t" + "vmovupd %2, %%xmm2 \n\t" + "vmovhpd %4, %%xmm5, %%xmm0 \n\t" + "vmovhpd %5, %%xmm5, %%xmm3 \n\t" + "vinsertf128 $0x1, %6, %%ymm2, %%ymm2 \n\t" 
+ "vinsertf128 $0x1, %%xmm1, %%ymm0, %%ymm0 \n\t" + "vmulpd %%xmm6, %%xmm1, %%xmm1 \n\t" + "vmulpd %%ymm6, %%ymm2, %%ymm2 \n\t" + "vinsertf128 $0x1, %%xmm3, %%ymm1, %%ymm1" + : + : + "m" ((*X).c4), + "m" ((*X).c5), + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c1), + "m" ((*X).c2), + "m" ((*X).c8), + "m" ((*X).c9), + "m" (_sse_sgn1_dble) + : + "xmm0", "xmm1", "xmm2", + "xmm3", "xmm5", "xmm6"); + + su3xsu3vec_pair(u); + + __asm__ __volatile__ ("vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vaddsubpd %%ymm3, %%ymm0, %%ymm3 \n\t" + "vaddsubpd %%ymm4, %%ymm0, %%ymm4 \n\t" + "vaddsubpd %%ymm5, %%ymm0, %%ymm5 \n\t" + "vmovapd %%xmm3, %0 \n\t" + "vmovapd %%xmm4, %1 \n\t" + "vmovapd %%xmm5, %2 \n\t" + "vextractf128 $0x1, %%ymm3, %3 \n\t" + "vextractf128 $0x1, %%ymm4, %4 \n\t" + "vextractf128 $0x1, %%ymm5, %5" + : + "=m" ((*v).c11), + "=m" ((*v).c12), + "=m" ((*v).c13), + "=m" ((*v).c21), + "=m" ((*v).c22), + "=m" ((*v).c23) + : + : + "xmm0", "xmm3", "xmm4", "xmm5"); + + _avx_zeroupper(); + + __asm__ __volatile__ ("movupd %0, %%xmm0 \n\t" + "movupd %2, %%xmm1 \n\t" + "xorpd %%xmm2, %%xmm2\n\t" + "movhpd %4, %%xmm2" + : + : + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c8), + "m" ((*X).c9), + "m" ((*X).c3) + : + "xmm0", "xmm1", "xmm2"); + + su3xsu3vec(u); + + __asm__ __volatile__ ("mulpd %3, %%xmm3\n\t" + "mulpd %3, %%xmm4\n\t" + "mulpd %3, %%xmm5\n\t" + "movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c31), + "=m" ((*v).c32), + "=m" ((*v).c33) + : + "m" (_sse_sgn1_dble) + : + "xmm3", "xmm4", "xmm5"); +} + + +double prod2su3alg(su3_dble *u,su3_dble *v,su3_alg_dble *X) /* Runs the same column-wise product of u with v as the AVX su3xsu3, but instead of storing the product it combines the result registers into X.c1..X.c8, using X itself as scratch storage between the two passes. A running sum is kept in the file-scope static tr and returned. The constants c0 (0.5,0.5) and c1 (-1/3,-1/3) supply the scale factors. */ +{ + __asm__ __volatile__ ("vmovapd %0, %%xmm0 \n\t" + "vmovapd %1, %%xmm1 \n\t" + "vmovapd %2, %%xmm2 \n\t" + "vinsertf128 $0x1, %3, %%ymm0, %%ymm0 \n\t" + "vinsertf128 $0x1, %4, %%ymm1, %%ymm1 \n\t" + "vinsertf128 $0x1, %5, %%ymm2, %%ymm2" + : + : + "m" ((*v).c11), + "m" ((*v).c21), + "m" ((*v).c31), + "m" ((*v).c12), + "m" ((*v).c22), + "m" ((*v).c32) + : + "xmm0", "xmm1", 
"xmm2"); + + su3xsu3vec_pair(u); + + __asm__ __volatile__ ("vmovapd %%xmm5, %0 \n\t" + "vextractf128 $0x1, %%ymm3, %%xmm6 \n\t" + "vextractf128 $0x1, %%ymm4, %%xmm7 \n\t" + "vextractf128 $0x1, %%ymm5, %2 \n\t" + "vaddsubpd %%xmm4, %%xmm6, %%xmm6" + : + "=m" ((*X).c5), + "=m" ((*X).c6), + "=m" ((*X).c7), + "=m" ((*X).c8) + : + : + "xmm6", "xmm7"); + + __asm__ __volatile__ ("vmovhpd %%xmm3, %0 \n\t" + "vmovhpd %%xmm7, %1 \n\t" + "vaddsd %%xmm7, %%xmm3, %%xmm3 \n\t" + "vmulpd %5, %%xmm6, %%xmm6 \n\t" + "vmovlpd %%xmm3, %2 \n\t" + "vmovapd %%xmm6, %3" + : + "=m" ((*X).c1), + "=m" ((*X).c2), + "=m" (tr), + "=m" ((*X).c3), + "=m" ((*X).c4) + : + "m" (c0) + : + "xmm3", "xmm6"); + + _avx_zeroupper(); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2" + : + : + "m" ((*v).c13), + "m" ((*v).c23), + "m" ((*v).c33) + : + "xmm0", "xmm1", "xmm2"); + + su3xsu3vec(u); + + __asm__ __volatile__ ("addsd %1, %%xmm5\n\t" + "movlpd %%xmm5, %0" + : + "=m" (tr) + : + "m" (tr) + : + "xmm5"); + + __asm__ __volatile__ ("addsubpd %0, %%xmm3 \n\t" + "addsubpd %2, %%xmm4 \n\t" + "movlpd %4, %%xmm5 \n\t" + "movddup %5, %%xmm6 \n\t" + "mulpd %6, %%xmm3 \n\t" + "subpd %%xmm6, %%xmm5 \n\t" + "mulpd %6, %%xmm4 \n\t" + "mulpd %7, %%xmm5" + : + : + "m" ((*X).c5), + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c8), + "m" ((*X).c2), + "m" ((*X).c1), + "m" (c0), + "m" (c1) + : + "xmm3", "xmm4", "xmm5", "xmm6"); + + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %2 \n\t" + "movapd %%xmm5, %4" + : + "=m" ((*X).c5), + "=m" ((*X).c6), + "=m" ((*X).c7), + "=m" ((*X).c8), + "=m" ((*X).c1)); + + return tr; +} + + +void prod2u3alg(su3_dble *u,su3_dble *v,u3_alg_dble *X) /* Same two-pass structure as prod2su3alg, but writing the nine components X.c1..X.c9 with unaligned stores, without the c0/c1 scale factors and without a return value. */ +{ + __asm__ __volatile__ ("vmovapd %0, %%xmm0 \n\t" + "vmovapd %1, %%xmm1 \n\t" + "vmovapd %2, %%xmm2 \n\t" + "vinsertf128 $0x1, %3, %%ymm0, %%ymm0 \n\t" + "vinsertf128 $0x1, %4, %%ymm1, %%ymm1 \n\t" + "vinsertf128 $0x1, %5, %%ymm2, %%ymm2" + : + : + "m" ((*v).c11), + "m" 
((*v).c21), + "m" ((*v).c31), + "m" ((*v).c12), + "m" ((*v).c22), + "m" ((*v).c32) + : + "xmm0", "xmm1", "xmm2"); + + su3xsu3vec_pair(u); + + __asm__ __volatile__ ("vmovupd %%xmm5, %0 \n\t" + "vextractf128 $0x1, %%ymm3, %%xmm6 \n\t" + "vextractf128 $0x1, %%ymm4, %%xmm7 \n\t" + "vextractf128 $0x1, %%ymm5, %2 \n\t" + "vaddsubpd %%xmm4, %%xmm6, %%xmm6 \n\t" + "vaddsubpd %%xmm3, %%xmm3, %%xmm3 \n\t" + "vaddsubpd %%xmm7, %%xmm7, %%xmm7" + : + "=m" ((*X).c6), + "=m" ((*X).c7), + "=m" ((*X).c8), + "=m" ((*X).c9) + : + : + "xmm3", "xmm6", "xmm7"); + + __asm__ __volatile__ ("vmovupd %%xmm6, %0 \n\t" + "vmovhpd %%xmm3, %2 \n\t" + "vmovhpd %%xmm7, %3" + : + "=m" ((*X).c4), + "=m" ((*X).c5), + "=m" ((*X).c1), + "=m" ((*X).c2)); + + _avx_zeroupper(); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2" + : + : + "m" ((*v).c13), + "m" ((*v).c23), + "m" ((*v).c33) + : + "xmm0", "xmm1", "xmm2"); + + su3xsu3vec(u); + + __asm__ __volatile__ ("movupd %0, %%xmm0 \n\t" + "movupd %2, %%xmm1 \n\t" + "addsubpd %%xmm5, %%xmm5 \n\t" + "addsubpd %%xmm0, %%xmm3 \n\t" + "addsubpd %%xmm1, %%xmm4" + : + : + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c8), + "m" ((*X).c9) + : + "xmm0", "xmm1", "xmm3", + "xmm4", "xmm5"); + + __asm__ __volatile__ ("movhpd %%xmm5, %0 \n\t" + "movupd %%xmm3, %1 \n\t" + "movupd %%xmm4, %3" + : + "=m" ((*X).c3), + "=m" ((*X).c6), + "=m" ((*X).c7), + "=m" ((*X).c8), + "=m" ((*X).c9)); +} + + +void rotate_su3alg(su3_dble *u,su3_alg_dble *X) /* Three stages. (1) The eight components of X are expanded into the file-scope static matrix uX. (2) uX is multiplied, via su3xsu3vec_pair, by two vectors built from the first two rows of u times _sse_sgn2_dble. (3) The results are moved back into ymm0-ymm2, multiplied by u, and reduced into X.c1..X.c8 using the scalar c1.c1 (-1/3). Only a pair of vectors is carried through, so no separate third-column pass appears here. */ +{ + __asm__ __volatile__ ("vxorpd %%xmm5, %%xmm5, %%xmm5 \n\t" + "vmovsd %0, %%xmm0 \n\t" + "vmovsd %1, %%xmm1 \n\t" + "vmovapd %2, %%xmm2 \n\t" + "vmovapd %4, %%xmm3 \n\t" + "vmovapd %6, %%xmm4" + : + : + "m" ((*X).c1), + "m" ((*X).c2), + "m" ((*X).c3), + "m" ((*X).c4), + "m" ((*X).c5), + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c8) + : + "xmm0", "xmm1", "xmm2", + "xmm3", "xmm4", "xmm5"); + + __asm__ __volatile__ ("vaddsubpd %%xmm2, %%xmm5, %%xmm6 \n\t" + "vaddsubpd %%xmm3, %%xmm5, 
%%xmm7 \n\t" + "vaddsubpd %%xmm4, %%xmm5, %%xmm8 \n\t" + "vmovapd %%xmm2, %0 \n\t" + "vmovapd %%xmm3, %1 \n\t" + "vmovapd %%xmm4, %2 \n\t" + "vmovapd %%xmm6, %3 \n\t" + "vmovapd %%xmm7, %4 \n\t" + "vmovapd %%xmm8, %5" + : + "=m" (uX.c12), + "=m" (uX.c13), + "=m" (uX.c23), + "=m" (uX.c21), + "=m" (uX.c31), + "=m" (uX.c32) + : + : + "xmm6", "xmm7", "xmm8"); + + __asm__ __volatile__ ("vaddsd %%xmm0, %%xmm1, %%xmm2 \n\t" + "vsubsd %%xmm0, %%xmm1, %%xmm3 \n\t" + "vsubsd %%xmm1, %%xmm0, %%xmm4 \n\t" + "vsubsd %%xmm0, %%xmm3, %%xmm3 \n\t" + "vsubsd %%xmm1, %%xmm4, %%xmm4 \n\t" + "vpermilpd $0x1, %%xmm2, %%xmm2 \n\t" + "vpermilpd $0x1, %%xmm3, %%xmm3 \n\t" + "vpermilpd $0x1, %%xmm4, %%xmm4 \n\t" + "vmovapd %%xmm2, %0 \n\t" + "vmovapd %%xmm3, %1 \n\t" + "vmovapd %%xmm4, %2" + : + "=m" (uX.c11), + "=m" (uX.c22), + "=m" (uX.c33) + : + : + "xmm2", "xmm3", "xmm4"); + + __asm__ __volatile__ ("vbroadcastf128 %6, %%ymm3 \n\t" + "vmovapd %0, %%xmm0 \n\t" + "vmovapd %1, %%xmm1 \n\t" + "vmovapd %2, %%xmm2 \n\t" + "vinsertf128 $0x1, %3, %%ymm0, %%ymm0 \n\t" + "vinsertf128 $0x1, %4, %%ymm1, %%ymm1 \n\t" + "vinsertf128 $0x1, %5, %%ymm2, %%ymm2 \n\t" + "vmulpd %%ymm3, %%ymm0, %%ymm0 \n\t" + "vmulpd %%ymm3, %%ymm1, %%ymm1 \n\t" + "vmulpd %%ymm3, %%ymm2, %%ymm2" + : + : + "m" ((*u).c11), + "m" ((*u).c12), + "m" ((*u).c13), + "m" ((*u).c21), + "m" ((*u).c22), + "m" ((*u).c23), + "m" (_sse_sgn2_dble) + : + "xmm0", "xmm1", "xmm2", "xmm3"); + + su3xsu3vec_pair(&uX); + + __asm__ __volatile__ ("vmovapd %%ymm3, %%ymm0 \n\t" + "vmovapd %%ymm4, %%ymm1 \n\t" + "vmovapd %%ymm5, %%ymm2" + : + : + : + "xmm0", "xmm1", "xmm2"); + + su3xsu3vec_pair(u); + + __asm__ __volatile__ ("vxorpd %%ymm0, %%ymm0, %%ymm0 \n\t" + "vextractf128 $0x1, %%ymm4, %%xmm2 \n\t" + "vextractf128 $0x1, %%ymm3, %0 \n\t" + "vaddsubpd %%ymm5, %%ymm0, %%ymm0 \n\t" + "vpermilpd $0x1, %%xmm2, %%xmm2 \n\t" + "vpermilpd $0x1, %%xmm3, %%xmm3 \n\t" + "vmovapd %%xmm0, %2 \n\t" + "vextractf128 $0x1, %%ymm0, %4 \n\t" + "vsubsd %%xmm3, %%xmm2, 
%%xmm1 \n\t" + "vmulsd %8, %%xmm1, %%xmm1 \n\t" + "vsubsd %%xmm1, %%xmm3, %%xmm2 \n\t" + "vmovsd %%xmm1, %6 \n\t" + "vmovsd %%xmm2, %7" + : + "=m" ((*X).c3), + "=m" ((*X).c4), + "=m" ((*X).c5), + "=m" ((*X).c6), + "=m" ((*X).c7), + "=m" ((*X).c8), + "=m" ((*X).c1), + "=m" ((*X).c2) + : + "m" (c1.c1) + : + "xmm0", "xmm1", "xmm2", "xmm3"); + + _avx_zeroupper(); +} + +#elif (defined x64) +#include "sse2.h" + +static const sse_double c0={0.5,0.5},c1={-1.0/3.0,-1.0/3.0}; +static su3_dble uX ALIGNED16; +static double tr ALIGNED8; + + +static void su3xsu3vec(su3_dble *u) /* SSE2 branch: wraps the _sse_su3_multiply_dble macro; callers load xmm0-xmm2 and read xmm3-xmm5. */ +{ + _sse_su3_multiply_dble(*u); +} + + +static void su3dagxsu3vec(su3_dble *u) /* SSE2 branch: wraps the _sse_su3_inverse_multiply_dble macro with the same register convention. */ +{ + _sse_su3_inverse_multiply_dble(*u); +} + + +void su3xsu3(su3_dble *u,su3_dble *v,su3_dble *w) /* SSE2 path: one su3xsu3vec call per column of v, with each result stored to the matching column of w. */ +{ + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2" + : + : + "m" ((*v).c11), + "m" ((*v).c21), + "m" ((*v).c31) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*w).c11), + "=m" ((*w).c21), + "=m" ((*w).c31)); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2" + : + : + "m" ((*v).c12), + "m" ((*v).c22), + "m" ((*v).c32) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*w).c12), + "=m" ((*w).c22), + "=m" ((*w).c32)); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2" + : + : + "m" ((*v).c13), + "m" ((*v).c23), + "m" ((*v).c33) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*w).c13), + "=m" ((*w).c23), + "=m" ((*w).c33)); +} + + +void su3dagxsu3(su3_dble *u,su3_dble *v,su3_dble *w) /* SSE2 path: as the SSE2 su3xsu3, one column of v at a time, calling su3dagxsu3vec. */ +{ + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + 
"movapd %2, %%xmm2" + : + : + "m" ((*v).c11), + "m" ((*v).c21), + "m" ((*v).c31) + : + "xmm0", "xmm1", "xmm2"); + su3dagxsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*w).c11), + "=m" ((*w).c21), + "=m" ((*w).c31)); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2" + : + : + "m" ((*v).c12), + "m" ((*v).c22), + "m" ((*v).c32) + : + "xmm0", "xmm1", "xmm2"); + su3dagxsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*w).c12), + "=m" ((*w).c22), + "=m" ((*w).c32)); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2" + : + : + "m" ((*v).c13), + "m" ((*v).c23), + "m" ((*v).c33) + : + "xmm0", "xmm1", "xmm2"); + su3dagxsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*w).c13), + "=m" ((*w).c23), + "=m" ((*w).c33)); +} + + +void su3xsu3dag(su3_dble *u,su3_dble *v,su3_dble *w) /* SSE2 path: one su3xsu3vec call per ROW of v, each row first multiplied element-wise by _sse_sgn2_dble; each result is stored to the matching column of w. */ +{ + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2 \n\t" + "mulpd %3, %%xmm0 \n\t" + "mulpd %3, %%xmm1 \n\t" + "mulpd %3, %%xmm2" + : + : + "m" ((*v).c11), + "m" ((*v).c12), + "m" ((*v).c13), + "m" (_sse_sgn2_dble) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*w).c11), + "=m" ((*w).c21), + "=m" ((*w).c31)); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2 \n\t" + "mulpd %3, %%xmm0 \n\t" + "mulpd %3, %%xmm1 \n\t" + "mulpd %3, %%xmm2" + : + : + "m" ((*v).c21), + "m" ((*v).c22), + "m" ((*v).c23), + "m" (_sse_sgn2_dble) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" 
((*w).c12), + "=m" ((*w).c22), + "=m" ((*w).c32)); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2 \n\t" + "mulpd %3, %%xmm0 \n\t" + "mulpd %3, %%xmm1 \n\t" + "mulpd %3, %%xmm2" + : + : + "m" ((*v).c31), + "m" ((*v).c32), + "m" ((*v).c33), + "m" (_sse_sgn2_dble) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*w).c13), + "=m" ((*w).c23), + "=m" ((*w).c33)); +} + + +void su3dagxsu3dag(su3_dble *u,su3_dble *v,su3_dble *w) +{ + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2 \n\t" + "mulpd %3, %%xmm0 \n\t" + "mulpd %3, %%xmm1 \n\t" + "mulpd %3, %%xmm2" + : + : + "m" ((*v).c11), + "m" ((*v).c12), + "m" ((*v).c13), + "m" (_sse_sgn2_dble) + : + "xmm0", "xmm1", "xmm2"); + su3dagxsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*w).c11), + "=m" ((*w).c21), + "=m" ((*w).c31)); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2 \n\t" + "mulpd %3, %%xmm0 \n\t" + "mulpd %3, %%xmm1 \n\t" + "mulpd %3, %%xmm2" + : + : + "m" ((*v).c21), + "m" ((*v).c22), + "m" ((*v).c23), + "m" (_sse_sgn2_dble) + : + "xmm0", "xmm1", "xmm2"); + su3dagxsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*w).c12), + "=m" ((*w).c22), + "=m" ((*w).c32)); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2 \n\t" + "mulpd %3, %%xmm0 \n\t" + "mulpd %3, %%xmm1 \n\t" + "mulpd %3, %%xmm2" + : + : + "m" ((*v).c31), + "m" ((*v).c32), + "m" ((*v).c33), + "m" (_sse_sgn2_dble) + : + "xmm0", "xmm1", "xmm2"); + su3dagxsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*w).c13), + "=m" ((*w).c23), + "=m" 
((*w).c33)); +} + + +void su3xu3alg(su3_dble *u,u3_alg_dble *X,su3_dble *v) +{ + __asm__ __volatile__ ("xorpd %%xmm0, %%xmm0\n\t" + "movhpd %0, %%xmm0 \n\t" + "movupd %1, %%xmm1 \n\t" + "movupd %2, %%xmm2 \n\t" + "mulpd %3, %%xmm1 \n\t" + "mulpd %3, %%xmm2" + : + : + "m" ((*X).c1), + "m" ((*X).c4), + "m" ((*X).c6), + "m" (_sse_sgn1_dble) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c11), + "=m" ((*v).c21), + "=m" ((*v).c31)); + + __asm__ __volatile__ ("movupd %0, %%xmm0 \n\t" + "xorpd %%xmm1, %%xmm1\n\t" + "movhpd %1, %%xmm1 \n\t" + "movupd %2, %%xmm2 \n\t" + "mulpd %3, %%xmm2" + : + : + "m" ((*X).c4), + "m" ((*X).c2), + "m" ((*X).c8), + "m" (_sse_sgn1_dble) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c12), + "=m" ((*v).c22), + "=m" ((*v).c32)); + + + __asm__ __volatile__ ("movupd %0, %%xmm0 \n\t" + "movupd %1, %%xmm1 \n\t" + "xorpd %%xmm2, %%xmm2\n\t" + "movhpd %2, %%xmm2" + : + : + "m" ((*X).c6), + "m" ((*X).c8), + "m" ((*X).c3) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c13), + "=m" ((*v).c23), + "=m" ((*v).c33)); +} + + +void su3dagxu3alg(su3_dble *u,u3_alg_dble *X,su3_dble *v) +{ + __asm__ __volatile__ ("xorpd %%xmm0, %%xmm0\n\t" + "movhpd %0, %%xmm0 \n\t" + "movupd %1, %%xmm1 \n\t" + "movupd %3, %%xmm2 \n\t" + "mulpd %5, %%xmm1 \n\t" + "mulpd %5, %%xmm2" + : + : + "m" ((*X).c1), + "m" ((*X).c4), + "m" ((*X).c5), + "m" ((*X).c6), + "m" ((*X).c7), + "m" (_sse_sgn1_dble) + : + "xmm0", "xmm1", "xmm2"); + su3dagxsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c11), + "=m" ((*v).c21), + "=m" ((*v).c31)); + + __asm__ __volatile__ ("movupd 
%0, %%xmm0 \n\t" + "xorpd %%xmm1, %%xmm1\n\t" + "movhpd %2, %%xmm1 \n\t" + "movupd %3, %%xmm2 \n\t" + "mulpd %5, %%xmm2" + : + : + "m" ((*X).c4), + "m" ((*X).c5), + "m" ((*X).c2), + "m" ((*X).c8), + "m" ((*X).c9), + "m" (_sse_sgn1_dble) + : + "xmm0", "xmm1", "xmm2"); + su3dagxsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c12), + "=m" ((*v).c22), + "=m" ((*v).c32)); + + + __asm__ __volatile__ ("movupd %0, %%xmm0 \n\t" + "movupd %2, %%xmm1 \n\t" + "xorpd %%xmm2, %%xmm2\n\t" + "movhpd %4, %%xmm2" + : + : + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c8), + "m" ((*X).c9), + "m" ((*X).c3) + : + "xmm0", "xmm1", "xmm2"); + su3dagxsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c13), + "=m" ((*v).c23), + "=m" ((*v).c33)); +} + + +void u3algxsu3(u3_alg_dble *X,su3_dble *u,su3_dble *v) +{ + __asm__ __volatile__ ("xorpd %%xmm0, %%xmm0\n\t" + "movhpd %0, %%xmm0 \n\t" + "movupd %1, %%xmm1 \n\t" + "movupd %3, %%xmm2 \n\t" + "mulpd %5, %%xmm1 \n\t" + "mulpd %5, %%xmm2" + : + : + "m" ((*X).c1), + "m" ((*X).c4), + "m" ((*X).c5), + "m" ((*X).c6), + "m" ((*X).c7), + "m" (_sse_sgn1_dble) + : + "xmm0", "xmm1", "xmm2"); + su3dagxsu3vec(u); + __asm__ __volatile__ ("mulpd %3, %%xmm3\n\t" + "mulpd %3, %%xmm4\n\t" + "mulpd %3, %%xmm5\n\t" + "movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c11), + "=m" ((*v).c12), + "=m" ((*v).c13) + : + "m" (_sse_sgn1_dble) + : + "xmm3", "xmm4", "xmm5"); + + __asm__ __volatile__ ("movupd %0, %%xmm0 \n\t" + "xorpd %%xmm1, %%xmm1\n\t" + "movhpd %2, %%xmm1 \n\t" + "movupd %3, %%xmm2 \n\t" + "mulpd %5, %%xmm2" + : + : + "m" ((*X).c4), + "m" ((*X).c5), + "m" ((*X).c2), + "m" ((*X).c8), + "m" ((*X).c9), + "m" (_sse_sgn1_dble) + : + "xmm0", "xmm1", "xmm2"); + su3dagxsu3vec(u); + __asm__ __volatile__ ("mulpd %3, %%xmm3\n\t" + "mulpd %3, %%xmm4\n\t" + "mulpd %3, 
%%xmm5\n\t" + "movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c21), + "=m" ((*v).c22), + "=m" ((*v).c23) + : + "m" (_sse_sgn1_dble) + : + "xmm3", "xmm4", "xmm5"); + + __asm__ __volatile__ ("movupd %0, %%xmm0 \n\t" + "movupd %2, %%xmm1 \n\t" + "xorpd %%xmm2, %%xmm2\n\t" + "movhpd %4, %%xmm2" + : + : + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c8), + "m" ((*X).c9), + "m" ((*X).c3) + : + "xmm0", "xmm1", "xmm2"); + su3dagxsu3vec(u); + __asm__ __volatile__ ("mulpd %3, %%xmm3\n\t" + "mulpd %3, %%xmm4\n\t" + "mulpd %3, %%xmm5\n\t" + "movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c31), + "=m" ((*v).c32), + "=m" ((*v).c33) + : + "m" (_sse_sgn1_dble) + : + "xmm3", "xmm4", "xmm5"); +} + + +void u3algxsu3dag(u3_alg_dble *X,su3_dble *u,su3_dble *v) +{ + __asm__ __volatile__ ("xorpd %%xmm0, %%xmm0\n\t" + "movhpd %0, %%xmm0 \n\t" + "movupd %1, %%xmm1 \n\t" + "movupd %3, %%xmm2 \n\t" + "mulpd %5, %%xmm1 \n\t" + "mulpd %5, %%xmm2" + : + : + "m" ((*X).c1), + "m" ((*X).c4), + "m" ((*X).c5), + "m" ((*X).c6), + "m" ((*X).c7), + "m" (_sse_sgn1_dble) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("mulpd %3, %%xmm3\n\t" + "mulpd %3, %%xmm4\n\t" + "mulpd %3, %%xmm5\n\t" + "movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c11), + "=m" ((*v).c12), + "=m" ((*v).c13) + : + "m" (_sse_sgn1_dble) + : + "xmm3", "xmm4", "xmm5"); + + __asm__ __volatile__ ("movupd %0, %%xmm0 \n\t" + "xorpd %%xmm1, %%xmm1\n\t" + "movhpd %2, %%xmm1 \n\t" + "movupd %3, %%xmm2 \n\t" + "mulpd %5, %%xmm2" + : + : + "m" ((*X).c4), + "m" ((*X).c5), + "m" ((*X).c2), + "m" ((*X).c8), + "m" ((*X).c9), + "m" (_sse_sgn1_dble) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("mulpd %3, %%xmm3\n\t" + "mulpd %3, %%xmm4\n\t" + "mulpd %3, %%xmm5\n\t" + "movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c21), + "=m" 
((*v).c22), + "=m" ((*v).c23) + : + "m" (_sse_sgn1_dble) + : + "xmm3", "xmm4", "xmm5"); + + __asm__ __volatile__ ("movupd %0, %%xmm0 \n\t" + "movupd %2, %%xmm1 \n\t" + "xorpd %%xmm2, %%xmm2\n\t" + "movhpd %4, %%xmm2" + : + : + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c8), + "m" ((*X).c9), + "m" ((*X).c3) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("mulpd %3, %%xmm3\n\t" + "mulpd %3, %%xmm4\n\t" + "mulpd %3, %%xmm5\n\t" + "movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %1 \n\t" + "movapd %%xmm5, %2" + : + "=m" ((*v).c31), + "=m" ((*v).c32), + "=m" ((*v).c33) + : + "m" (_sse_sgn1_dble) + : + "xmm3", "xmm4", "xmm5"); +} + + +double prod2su3alg(su3_dble *u,su3_dble *v,su3_alg_dble *X) +{ + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2" + : + : + "m" ((*v).c11), + "m" ((*v).c21), + "m" ((*v).c31) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("movlpd %%xmm3, %0" + : + "=m" (tr)); + __asm__ __volatile__ ("mulpd %6, %%xmm4 \n\t" + "mulpd %6, %%xmm5 \n\t" + "movhpd %%xmm3, %0 \n\t" + "movhpd %%xmm3, %1 \n\t" + "movapd %%xmm4, %2 \n\t" + "movapd %%xmm5, %4" + : + "=m" ((*X).c1), + "=m" ((*X).c2), + "=m" ((*X).c3), + "=m" ((*X).c4), + "=m" ((*X).c5), + "=m" ((*X).c6) + : + "m" (_sse_sgn1_dble) + : + "xmm4", "xmm5"); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2" + : + : + "m" ((*v).c12), + "m" ((*v).c22), + "m" ((*v).c32) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("addsd %1, %%xmm4\n\t" + "movlpd %%xmm4, %0" + : + "=m" (tr) + : + "m" (tr) + : + "xmm4"); + __asm__ __volatile__ ("addpd %0, %%xmm3 \n\t" + "shufpd $0x1, %%xmm4, %%xmm4 \n\t" + "mulpd %2, %%xmm3 \n\t" + "subsd %3, %%xmm4 \n\t" + "mulpd %4, %%xmm5 \n\t" + "mulsd %5, %%xmm4" + : + : + "m" ((*X).c3), + "m" ((*X).c4), + "m" (c0), + "m" ((*X).c1), + "m" (_sse_sgn1_dble), + "m" (c1) + : + "xmm3", "xmm4", "xmm5"); + + __asm__ __volatile__ 
("movapd %%xmm3, %0 \n\t" + "movlpd %%xmm4, %2 \n\t" + "movapd %%xmm5, %3" + : + "=m" ((*X).c3), + "=m" ((*X).c4), + "=m" ((*X).c1), + "=m" ((*X).c7), + "=m" ((*X).c8)); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2" + : + : + "m" ((*v).c13), + "m" ((*v).c23), + "m" ((*v).c33) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("addsd %1, %%xmm5\n\t" + "movlpd %%xmm5, %0" + : + "=m" (tr) + : + "m" (tr) + : + "xmm5"); + __asm__ __volatile__ ("addpd %0, %%xmm3 \n\t" + "addpd %2, %%xmm4 \n\t" + "shufpd $0x1, %%xmm5, %%xmm5 \n\t" + "mulpd %4, %%xmm3 \n\t" + "subsd %5, %%xmm5 \n\t" + "mulpd %4, %%xmm4 \n\t" + "mulsd %6, %%xmm5" + : + : + "m" ((*X).c5), + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c8), + "m" (c0), + "m" ((*X).c2), + "m" (c1) + : + "xmm3", "xmm4", "xmm5"); + + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm4, %2 \n\t" + "movlpd %%xmm5, %4" + : + "=m" ((*X).c5), + "=m" ((*X).c6), + "=m" ((*X).c7), + "=m" ((*X).c8), + "=m" ((*X).c2)); + + return tr; +} + + +void prod2u3alg(su3_dble *u,su3_dble *v,u3_alg_dble *X) +{ + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2" + : + : + "m" ((*v).c11), + "m" ((*v).c21), + "m" ((*v).c31) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("shufpd $0x1, %%xmm3, %%xmm3 \n\t" + "mulpd %0, %%xmm4 \n\t" + "mulpd %0, %%xmm5 \n\t" + "addsd %%xmm3, %%xmm3" + : + : + "m" (_sse_sgn1_dble) + : + "xmm3", "xmm4", "xmm5"); + + __asm__ __volatile__ ("movupd %%xmm4, %0 \n\t" + "movupd %%xmm5, %2 \n\t" + "movlpd %%xmm3, %4" + : + "=m" ((*X).c4), + "=m" ((*X).c5), + "=m" ((*X).c6), + "=m" ((*X).c7), + "=m" ((*X).c1)); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2" + : + : + "m" ((*v).c12), + "m" ((*v).c22), + "m" ((*v).c32) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("movupd %0, %%xmm6 \n\t" + "shufpd 
$0x1, %%xmm4, %%xmm4 \n\t" + "mulpd %2, %%xmm5 \n\t" + "addpd %%xmm6, %%xmm3 \n\t" + "addsd %%xmm4, %%xmm4" + : + : + "m" ((*X).c4), + "m" ((*X).c5), + "m" (_sse_sgn1_dble) + : + "xmm3", "xmm4", "xmm5", + "xmm6"); + + __asm__ __volatile__ ("movupd %%xmm5, %0 \n\t" + "movupd %%xmm3, %2 \n\t" + "movlpd %%xmm4, %4" + : + "=m" ((*X).c8), + "=m" ((*X).c9), + "=m" ((*X).c4), + "=m" ((*X).c5), + "=m" ((*X).c2)); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2" + : + : + "m" ((*v).c13), + "m" ((*v).c23), + "m" ((*v).c33) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("movupd %0, %%xmm6 \n\t" + "movupd %2, %%xmm7 \n\t" + "shufpd $0x1, %%xmm5, %%xmm5 \n\t" + "addpd %%xmm6, %%xmm3 \n\t" + "addpd %%xmm7, %%xmm4 \n\t" + "addsd %%xmm5, %%xmm5" + : + : + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c8), + "m" ((*X).c9) + : + "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); + + __asm__ __volatile__ ("movupd %%xmm3, %0 \n\t" + "movupd %%xmm4, %2 \n\t" + "movlpd %%xmm5, %4" + : + "=m" ((*X).c6), + "=m" ((*X).c7), + "=m" ((*X).c8), + "=m" ((*X).c9), + "=m" ((*X).c3)); +} + + +void rotate_su3alg(su3_dble *u,su3_alg_dble *X) +{ + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %2, %%xmm3 \n\t" + "movapd %4, %%xmm5 \n\t" + "movapd %6, %%xmm7 \n\t" + "movapd %%xmm0, %%xmm1 \n\t" + "movsd %%xmm0, %%xmm2 \n\t" + : + : + "m" ((*X).c1), + "m" ((*X).c2), + "m" ((*X).c3), + "m" ((*X).c4), + "m" ((*X).c5), + "m" ((*X).c6), + "m" ((*X).c7), + "m" ((*X).c8) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm5", "xmm7"); + + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "movapd %%xmm5, %1 \n\t" + "movapd %%xmm7, %2 \n\t" + "mulpd %6, %%xmm3 \n\t" + "mulpd %6, %%xmm5 \n\t" + "mulpd %6, %%xmm7 \n\t" + "movapd %%xmm3, %3 \n\t" + "movapd %%xmm5, %4 \n\t" + "movapd %%xmm7, %5" + : + "=m" (uX.c12), + "=m" (uX.c13), + "=m" (uX.c23), + "=m" (uX.c21), + "=m" (uX.c31), + "=m" (uX.c32) + : + "m" (_sse_sgn1_dble) + : + "xmm3", "xmm5", 
"xmm7"); + + __asm__ __volatile__ ("shufpd $0x1, %%xmm0, %%xmm0 \n\t" + "addpd %%xmm1, %%xmm1 \n\t" + "addsd %%xmm0, %%xmm2 \n\t" + "subpd %%xmm1, %%xmm0 \n\t" + "xorpd %%xmm3, %%xmm3 \n\t" + "movlpd %%xmm2, %0 \n\t" + "movlpd %%xmm0, %1 \n\t" + "movhpd %%xmm0, %2 \n\t" + "movlpd %%xmm3, %3 \n\t" + "movlpd %%xmm3, %4 \n\t" + "movlpd %%xmm3, %5" + : + "=m" (uX.c11.im), + "=m" (uX.c22.im), + "=m" (uX.c33.im), + "=m" (uX.c11.re), + "=m" (uX.c22.re), + "=m" (uX.c33.re) + : + : + "xmm0", "xmm1", "xmm2", "xmm3"); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2 \n\t" + "mulpd %3, %%xmm0 \n\t" + "mulpd %3, %%xmm1 \n\t" + "mulpd %3, %%xmm2" + : + : + "m" ((*u).c11), + "m" ((*u).c12), + "m" ((*u).c13), + "m" (_sse_sgn2_dble) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(&uX); + __asm__ __volatile__ ("movapd %%xmm3, %%xmm0 \n\t" + "movapd %%xmm4, %%xmm1 \n\t" + "movapd %%xmm5, %%xmm2" + : + : + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("movhpd %%xmm3, %0 \n\t" + "mulpd %4, %%xmm5 \n\t" + "addpd %%xmm3, %%xmm3 \n\t" + "movapd %%xmm5, %1 \n\t" + "movhpd %%xmm3, %3" + : + "=m" ((*X).c1), + "=m" ((*X).c5), + "=m" ((*X).c6), + "=m" ((*X).c2) + : + "m" (_sse_sgn1_dble) + : + "xmm3", "xmm5"); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2 \n\t" + "mulpd %3, %%xmm0 \n\t" + "mulpd %3, %%xmm1 \n\t" + "mulpd %3, %%xmm2" + : + : + "m" ((*u).c21), + "m" ((*u).c22), + "m" ((*u).c23), + "m" (_sse_sgn2_dble) + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(&uX); + __asm__ __volatile__ ("movapd %%xmm3, %%xmm0 \n\t" + "movapd %%xmm4, %%xmm1 \n\t" + "movapd %%xmm5, %%xmm2" + : + : + : + "xmm0", "xmm1", "xmm2"); + su3xsu3vec(u); + __asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" + "unpckhpd %%xmm4, %%xmm4 \n\t" + "mulpd %2, %%xmm5 \n\t" + "mulpd %3, %%xmm4 \n\t" + "subpd %4, %%xmm4" + : + "=m" ((*X).c3), + "=m" ((*X).c4) + : + "m" (_sse_sgn1_dble), + "m" (_sse_sgn2_dble), + 
"m" ((*X).c1), + "m" ((*X).c2) + : + "xmm4", "xmm5"); + + __asm__ __volatile__ ("movapd %%xmm5, %0 \n\t" + "mulpd %4, %%xmm4 \n\t" + "movapd %%xmm4, %2" + : + "=m" ((*X).c7), + "=m" ((*X).c8), + "=m" ((*X).c1), + "=m" ((*X).c2) + : + "m" (c1) + : + "xmm4"); +} + +#else + +static su3_vector_dble psi,chi; +static su3_dble uX; + + +static void su3xsu3vec(su3_dble *u) +{ + _su3_multiply(chi,*u,psi); +} + + +static void su3dagxsu3vec(su3_dble *u) +{ + _su3_inverse_multiply(chi,*u,psi); +} + + +void su3xsu3(su3_dble *u,su3_dble *v,su3_dble *w) +{ + psi.c1=(*v).c11; + psi.c2=(*v).c21; + psi.c3=(*v).c31; + su3xsu3vec(u); + (*w).c11=chi.c1; + (*w).c21=chi.c2; + (*w).c31=chi.c3; + + psi.c1=(*v).c12; + psi.c2=(*v).c22; + psi.c3=(*v).c32; + su3xsu3vec(u); + (*w).c12=chi.c1; + (*w).c22=chi.c2; + (*w).c32=chi.c3; + + psi.c1=(*v).c13; + psi.c2=(*v).c23; + psi.c3=(*v).c33; + su3xsu3vec(u); + (*w).c13=chi.c1; + (*w).c23=chi.c2; + (*w).c33=chi.c3; +} + + +void su3dagxsu3(su3_dble *u,su3_dble *v,su3_dble *w) +{ + psi.c1=(*v).c11; + psi.c2=(*v).c21; + psi.c3=(*v).c31; + su3dagxsu3vec(u); + (*w).c11=chi.c1; + (*w).c21=chi.c2; + (*w).c31=chi.c3; + + psi.c1=(*v).c12; + psi.c2=(*v).c22; + psi.c3=(*v).c32; + su3dagxsu3vec(u); + (*w).c12=chi.c1; + (*w).c22=chi.c2; + (*w).c32=chi.c3; + + psi.c1=(*v).c13; + psi.c2=(*v).c23; + psi.c3=(*v).c33; + su3dagxsu3vec(u); + (*w).c13=chi.c1; + (*w).c23=chi.c2; + (*w).c33=chi.c3; +} + + +void su3xsu3dag(su3_dble *u,su3_dble *v,su3_dble *w) +{ + psi.c1.re= (*v).c11.re; + psi.c1.im=-(*v).c11.im; + psi.c2.re= (*v).c12.re; + psi.c2.im=-(*v).c12.im; + psi.c3.re= (*v).c13.re; + psi.c3.im=-(*v).c13.im; + su3xsu3vec(u); + (*w).c11=chi.c1; + (*w).c21=chi.c2; + (*w).c31=chi.c3; + + psi.c1.re= (*v).c21.re; + psi.c1.im=-(*v).c21.im; + psi.c2.re= (*v).c22.re; + psi.c2.im=-(*v).c22.im; + psi.c3.re= (*v).c23.re; + psi.c3.im=-(*v).c23.im; + su3xsu3vec(u); + (*w).c12=chi.c1; + (*w).c22=chi.c2; + (*w).c32=chi.c3; + + psi.c1.re= (*v).c31.re; + psi.c1.im=-(*v).c31.im; + 
psi.c2.re= (*v).c32.re; + psi.c2.im=-(*v).c32.im; + psi.c3.re= (*v).c33.re; + psi.c3.im=-(*v).c33.im; + su3xsu3vec(u); + (*w).c13=chi.c1; + (*w).c23=chi.c2; + (*w).c33=chi.c3; +} + + +void su3dagxsu3dag(su3_dble *u,su3_dble *v,su3_dble *w) +{ + psi.c1.re= (*v).c11.re; + psi.c1.im=-(*v).c11.im; + psi.c2.re= (*v).c12.re; + psi.c2.im=-(*v).c12.im; + psi.c3.re= (*v).c13.re; + psi.c3.im=-(*v).c13.im; + su3dagxsu3vec(u); + (*w).c11=chi.c1; + (*w).c21=chi.c2; + (*w).c31=chi.c3; + + psi.c1.re= (*v).c21.re; + psi.c1.im=-(*v).c21.im; + psi.c2.re= (*v).c22.re; + psi.c2.im=-(*v).c22.im; + psi.c3.re= (*v).c23.re; + psi.c3.im=-(*v).c23.im; + su3dagxsu3vec(u); + (*w).c12=chi.c1; + (*w).c22=chi.c2; + (*w).c32=chi.c3; + + psi.c1.re= (*v).c31.re; + psi.c1.im=-(*v).c31.im; + psi.c2.re= (*v).c32.re; + psi.c2.im=-(*v).c32.im; + psi.c3.re= (*v).c33.re; + psi.c3.im=-(*v).c33.im; + su3dagxsu3vec(u); + (*w).c13=chi.c1; + (*w).c23=chi.c2; + (*w).c33=chi.c3; +} + + +void su3xu3alg(su3_dble *u,u3_alg_dble *X,su3_dble *v) +{ + psi.c1.re=0.0; + psi.c1.im= (*X).c1; + psi.c2.re=-(*X).c4; + psi.c2.im= (*X).c5; + psi.c3.re=-(*X).c6; + psi.c3.im= (*X).c7; + su3xsu3vec(u); + (*v).c11=chi.c1; + (*v).c21=chi.c2; + (*v).c31=chi.c3; + + psi.c1.re= (*X).c4; + psi.c1.im= (*X).c5; + psi.c2.re=0.0; + psi.c2.im= (*X).c2; + psi.c3.re=-(*X).c8; + psi.c3.im= (*X).c9; + su3xsu3vec(u); + (*v).c12=chi.c1; + (*v).c22=chi.c2; + (*v).c32=chi.c3; + + psi.c1.re= (*X).c6; + psi.c1.im= (*X).c7; + psi.c2.re= (*X).c8; + psi.c2.im= (*X).c9; + psi.c3.re=0.0; + psi.c3.im= (*X).c3; + su3xsu3vec(u); + (*v).c13=chi.c1; + (*v).c23=chi.c2; + (*v).c33=chi.c3; +} + + +void su3dagxu3alg(su3_dble *u,u3_alg_dble *X,su3_dble *v) +{ + psi.c1.re=0.0; + psi.c1.im= (*X).c1; + psi.c2.re=-(*X).c4; + psi.c2.im= (*X).c5; + psi.c3.re=-(*X).c6; + psi.c3.im= (*X).c7; + su3dagxsu3vec(u); + (*v).c11=chi.c1; + (*v).c21=chi.c2; + (*v).c31=chi.c3; + + psi.c1.re= (*X).c4; + psi.c1.im= (*X).c5; + psi.c2.re=0.0; + psi.c2.im= (*X).c2; + psi.c3.re=-(*X).c8; 
+ psi.c3.im= (*X).c9; + su3dagxsu3vec(u); + (*v).c12=chi.c1; + (*v).c22=chi.c2; + (*v).c32=chi.c3; + + psi.c1.re= (*X).c6; + psi.c1.im= (*X).c7; + psi.c2.re= (*X).c8; + psi.c2.im= (*X).c9; + psi.c3.re=0.0; + psi.c3.im= (*X).c3; + su3dagxsu3vec(u); + (*v).c13=chi.c1; + (*v).c23=chi.c2; + (*v).c33=chi.c3; +} + + +void u3algxsu3(u3_alg_dble *X,su3_dble *u,su3_dble *v) +{ + psi.c1.re=0.0; + psi.c1.im= (*X).c1; + psi.c2.re=-(*X).c4; + psi.c2.im= (*X).c5; + psi.c3.re=-(*X).c6; + psi.c3.im= (*X).c7; + su3dagxsu3vec(u); + (*v).c11.re=-chi.c1.re; + (*v).c11.im= chi.c1.im; + (*v).c12.re=-chi.c2.re; + (*v).c12.im= chi.c2.im; + (*v).c13.re=-chi.c3.re; + (*v).c13.im= chi.c3.im; + + psi.c1.re= (*X).c4; + psi.c1.im= (*X).c5; + psi.c2.re=0.0; + psi.c2.im= (*X).c2; + psi.c3.re=-(*X).c8; + psi.c3.im= (*X).c9; + su3dagxsu3vec(u); + (*v).c21.re=-chi.c1.re; + (*v).c21.im= chi.c1.im; + (*v).c22.re=-chi.c2.re; + (*v).c22.im= chi.c2.im; + (*v).c23.re=-chi.c3.re; + (*v).c23.im= chi.c3.im; + + psi.c1.re= (*X).c6; + psi.c1.im= (*X).c7; + psi.c2.re= (*X).c8; + psi.c2.im= (*X).c9; + psi.c3.re=0.0; + psi.c3.im= (*X).c3; + su3dagxsu3vec(u); + (*v).c31.re=-chi.c1.re; + (*v).c31.im= chi.c1.im; + (*v).c32.re=-chi.c2.re; + (*v).c32.im= chi.c2.im; + (*v).c33.re=-chi.c3.re; + (*v).c33.im= chi.c3.im; +} + + +void u3algxsu3dag(u3_alg_dble *X,su3_dble *u,su3_dble *v) +{ + psi.c1.re=0.0; + psi.c1.im= (*X).c1; + psi.c2.re=-(*X).c4; + psi.c2.im= (*X).c5; + psi.c3.re=-(*X).c6; + psi.c3.im= (*X).c7; + su3xsu3vec(u); + (*v).c11.re=-chi.c1.re; + (*v).c11.im= chi.c1.im; + (*v).c12.re=-chi.c2.re; + (*v).c12.im= chi.c2.im; + (*v).c13.re=-chi.c3.re; + (*v).c13.im= chi.c3.im; + + psi.c1.re= (*X).c4; + psi.c1.im= (*X).c5; + psi.c2.re=0.0; + psi.c2.im= (*X).c2; + psi.c3.re=-(*X).c8; + psi.c3.im= (*X).c9; + su3xsu3vec(u); + (*v).c21.re=-chi.c1.re; + (*v).c21.im= chi.c1.im; + (*v).c22.re=-chi.c2.re; + (*v).c22.im= chi.c2.im; + (*v).c23.re=-chi.c3.re; + (*v).c23.im= chi.c3.im; + + psi.c1.re= (*X).c6; + psi.c1.im= 
(*X).c7; + psi.c2.re= (*X).c8; + psi.c2.im= (*X).c9; + psi.c3.re=0.0; + psi.c3.im= (*X).c3; + su3xsu3vec(u); + (*v).c31.re=-chi.c1.re; + (*v).c31.im= chi.c1.im; + (*v).c32.re=-chi.c2.re; + (*v).c32.im= chi.c2.im; + (*v).c33.re=-chi.c3.re; + (*v).c33.im= chi.c3.im; +} + + +double prod2su3alg(su3_dble *u,su3_dble *v,su3_alg_dble *X) +{ + double tr; + + psi.c1=(*v).c11; + psi.c2=(*v).c21; + psi.c3=(*v).c31; + su3xsu3vec(u); + tr=chi.c1.re; + (*X).c1 = chi.c1.im; + (*X).c2 = chi.c1.im; + (*X).c3 =-chi.c2.re; + (*X).c4 = chi.c2.im; + (*X).c5 =-chi.c3.re; + (*X).c6 = chi.c3.im; + + psi.c1=(*v).c12; + psi.c2=(*v).c22; + psi.c3=(*v).c32; + su3xsu3vec(u); + tr+=chi.c2.re; + (*X).c3+= chi.c1.re; + (*X).c4+= chi.c1.im; + (*X).c1-= chi.c2.im; + (*X).c7 =-chi.c3.re; + (*X).c8 = chi.c3.im; + + psi.c1=(*v).c13; + psi.c2=(*v).c23; + psi.c3=(*v).c33; + su3xsu3vec(u); + tr+=chi.c3.re; + (*X).c5+= chi.c1.re; + (*X).c6+= chi.c1.im; + (*X).c7+= chi.c2.re; + (*X).c8+= chi.c2.im; + (*X).c2-= chi.c3.im; + + (*X).c1*=(1.0/3.0); + (*X).c2*=(1.0/3.0); + (*X).c3*=0.5; + (*X).c4*=0.5; + (*X).c5*=0.5; + (*X).c6*=0.5; + (*X).c7*=0.5; + (*X).c8*=0.5; + + return tr; +} + + +void prod2u3alg(su3_dble *u,su3_dble *v,u3_alg_dble *X) +{ + psi.c1=(*v).c11; + psi.c2=(*v).c21; + psi.c3=(*v).c31; + su3xsu3vec(u); + (*X).c1=chi.c1.im+chi.c1.im; + (*X).c4=-chi.c2.re; + (*X).c5=chi.c2.im; + (*X).c6=-chi.c3.re; + (*X).c7=chi.c3.im; + + psi.c1=(*v).c12; + psi.c2=(*v).c22; + psi.c3=(*v).c32; + su3xsu3vec(u); + (*X).c4+=chi.c1.re; + (*X).c5+=chi.c1.im; + (*X).c2=chi.c2.im+chi.c2.im; + (*X).c8=-chi.c3.re; + (*X).c9=chi.c3.im; + + psi.c1=(*v).c13; + psi.c2=(*v).c23; + psi.c3=(*v).c33; + su3xsu3vec(u); + (*X).c6+=chi.c1.re; + (*X).c7+=chi.c1.im; + (*X).c8+=chi.c2.re; + (*X).c9+=chi.c2.im; + (*X).c3=chi.c3.im+chi.c3.im; +} + + +void rotate_su3alg(su3_dble *u,su3_alg_dble *X) +{ + uX.c11.re=0.0; + uX.c11.im=(*X).c1+(*X).c2; + uX.c22.re=0.0; + uX.c22.im=(*X).c2-(*X).c1-(*X).c1; + uX.c33.re=0.0; + 
uX.c33.im=(*X).c1-(*X).c2-(*X).c2; + + uX.c12.re= (*X).c3; + uX.c12.im= (*X).c4; + uX.c21.re=-(*X).c3; + uX.c21.im= (*X).c4; + + uX.c13.re= (*X).c5; + uX.c13.im= (*X).c6; + uX.c31.re=-(*X).c5; + uX.c31.im= (*X).c6; + + uX.c23.re= (*X).c7; + uX.c23.im= (*X).c8; + uX.c32.re=-(*X).c7; + uX.c32.im= (*X).c8; + + psi.c1.re= (*u).c11.re; + psi.c1.im=-(*u).c11.im; + psi.c2.re= (*u).c12.re; + psi.c2.im=-(*u).c12.im; + psi.c3.re= (*u).c13.re; + psi.c3.im=-(*u).c13.im; + su3xsu3vec(&uX); + psi=chi; + su3xsu3vec(u); + (*X).c1= chi.c1.im; + (*X).c2= chi.c1.im+chi.c1.im; + (*X).c5=-chi.c3.re; + (*X).c6= chi.c3.im; + + psi.c1.re= (*u).c21.re; + psi.c1.im=-(*u).c21.im; + psi.c2.re= (*u).c22.re; + psi.c2.im=-(*u).c22.im; + psi.c3.re= (*u).c23.re; + psi.c3.im=-(*u).c23.im; + su3xsu3vec(&uX); + psi=chi; + su3xsu3vec(u); + (*X).c3= chi.c1.re; + (*X).c4= chi.c1.im; + (*X).c1-=chi.c2.im; + (*X).c2+=chi.c2.im; + (*X).c7=-chi.c3.re; + (*X).c8= chi.c3.im; + + (*X).c1*=(1.0/3.0); + (*X).c2*=(1.0/3.0); +} + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/su3fcts/su3ren.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/su3fcts/su3ren.c new file mode 100644 index 0000000000000000000000000000000000000000..26f4423d1561c04dbf849a21b67b5611d37270c6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/su3fcts/su3ren.c @@ -0,0 +1,188 @@ + +/******************************************************************************* +* +* File su3ren.c +* +* Copyright (C) 2005, 2009, 2010, 2011 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Renormalization of SU(3) matrices +* +* The externally accessible functions are +* +* void project_to_su3(su3 *u) +* Projects an approximate single-precision SU(3) matrix back to SU(3). 
+* No action is performed if the matrix is degenerate +* +* void project_to_su3_dble(su3_dble *u) +* Projects an approximate double-precision SU(3) matrix back to SU(3). +* No action is performed if the matrix is degenerate +* +* Notes: +* +* The programs in this module do not perform any communications and can be +* called locally. A matrix is considered to be degenerate if the first +* column vector or the cross-product of the first and the second vector is +* exactly equal to zero. +* +*******************************************************************************/ + +#define SU3REN_C + +#include +#include +#include +#include "su3.h" +#include "su3fcts.h" + + +static int normalize(su3_vector *v) +{ + float r; + + r=_vector_prod_re((*v),(*v)); + r=(float)sqrt((double)(r)); + + if (r==0.0f) + return 1; + else + { + r=1.0f/r; + _vector_mul((*v),r,(*v)); + return 0; + } +} + + +static int normalize_dble(su3_vector_dble *v) +{ + double r; + + r=_vector_prod_re((*v),(*v)); + r=sqrt(r); + + if (r==0.0) + return 1; + else + { + r=1.0/r; + _vector_mul((*v),r,(*v)); + return 0; + } +} + + +void project_to_su3(su3 *u) +{ + int it; + su3_vector v1,v2,v3; + + v1.c1.re=(*u).c11.re; + v1.c1.im=(*u).c11.im; + v1.c2.re=(*u).c12.re; + v1.c2.im=(*u).c12.im; + v1.c3.re=(*u).c13.re; + v1.c3.im=(*u).c13.im; + + v2.c1.re=(*u).c21.re; + v2.c1.im=(*u).c21.im; + v2.c2.re=(*u).c22.re; + v2.c2.im=(*u).c22.im; + v2.c3.re=(*u).c23.re; + v2.c3.im=(*u).c23.im; + + v3.c1.re=(*u).c31.re; + v3.c1.im=(*u).c31.im; + v3.c2.re=(*u).c32.re; + v3.c2.im=(*u).c32.im; + v3.c3.re=(*u).c33.re; + v3.c3.im=(*u).c33.im; + + it=normalize(&v1); + _vector_cross_prod(v3,v1,v2); + it|=normalize(&v3); + _vector_cross_prod(v2,v3,v1); + + if (it==0) + { + (*u).c11.re=v1.c1.re; + (*u).c11.im=v1.c1.im; + (*u).c12.re=v1.c2.re; + (*u).c12.im=v1.c2.im; + (*u).c13.re=v1.c3.re; + (*u).c13.im=v1.c3.im; + + (*u).c21.re=v2.c1.re; + (*u).c21.im=v2.c1.im; + (*u).c22.re=v2.c2.re; + (*u).c22.im=v2.c2.im; + (*u).c23.re=v2.c3.re; + 
(*u).c23.im=v2.c3.im; + + (*u).c31.re=v3.c1.re; + (*u).c31.im=v3.c1.im; + (*u).c32.re=v3.c2.re; + (*u).c32.im=v3.c2.im; + (*u).c33.re=v3.c3.re; + (*u).c33.im=v3.c3.im; + } +} + + +void project_to_su3_dble(su3_dble *u) +{ + int it; + su3_vector_dble v1,v2,v3; + + v1.c1.re=(*u).c11.re; + v1.c1.im=(*u).c11.im; + v1.c2.re=(*u).c12.re; + v1.c2.im=(*u).c12.im; + v1.c3.re=(*u).c13.re; + v1.c3.im=(*u).c13.im; + + v2.c1.re=(*u).c21.re; + v2.c1.im=(*u).c21.im; + v2.c2.re=(*u).c22.re; + v2.c2.im=(*u).c22.im; + v2.c3.re=(*u).c23.re; + v2.c3.im=(*u).c23.im; + + v3.c1.re=(*u).c31.re; + v3.c1.im=(*u).c31.im; + v3.c2.re=(*u).c32.re; + v3.c2.im=(*u).c32.im; + v3.c3.re=(*u).c33.re; + v3.c3.im=(*u).c33.im; + + it=normalize_dble(&v1); + _vector_cross_prod(v3,v1,v2); + it|=normalize_dble(&v3); + _vector_cross_prod(v2,v3,v1); + + if (it==0) + { + (*u).c11.re=v1.c1.re; + (*u).c11.im=v1.c1.im; + (*u).c12.re=v1.c2.re; + (*u).c12.im=v1.c2.im; + (*u).c13.re=v1.c3.re; + (*u).c13.im=v1.c3.im; + + (*u).c21.re=v2.c1.re; + (*u).c21.im=v2.c1.im; + (*u).c22.re=v2.c2.re; + (*u).c22.im=v2.c2.im; + (*u).c23.re=v2.c3.re; + (*u).c23.im=v2.c3.im; + + (*u).c31.re=v3.c1.re; + (*u).c31.im=v3.c1.im; + (*u).c32.re=v3.c2.re; + (*u).c32.im=v3.c2.im; + (*u).c33.re=v3.c3.re; + (*u).c33.im=v3.c3.im; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sw_term/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sw_term/README new file mode 100644 index 0000000000000000000000000000000000000000..7d752c3e3678603ea1e6427496b73aa43fc52a12 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sw_term/README @@ -0,0 +1,105 @@ + +******************************************************************************** + + Sheikholeslami-Wohlert term + +******************************************************************************** + +Computation and action of the Pauli term in the O(a) improved Wilson-Dirac +operator. 
The check programs for the programs in the files pauli.c and +pauli_dble.c are found in the devel/nompi/sw_term directory and those for +all other programs in the devel/sw_term directory. + + +Files +----- + +pauli.c Basic functions for single-precision Hermitian 6x6 matrices. + +pauli_dble.c Basic functions for double-precision Hermitian 6x6 matrices. + +swflds.c Allocation and initialization of the global SW fields. + +sw_term.c Computation of the SW term. + + + +Include file +------------ + +The file sw_term.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. The types +pauli, pauli_dble and u3_alg_dble are defined in su3.h. + + +List of functions +----------------- + +void mul_pauli(float mu,pauli *m,weyl *s,weyl *r) + Multiplies the Weyl spinor s by the matrix m+i*mu and assigns + the result to the Weyl spinor r. The source spinor is overwritten + if r=s and otherwise left unchanged. + +void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r) + Multiplies the spinor s by the matrix m+i*mu*gamma_5 and assigns + the result to the spinor r. The source spinor is overwritten + if r=s and otherwise left unchanged. + +void assign_pauli(int vol,pauli_dble *md,pauli *m) + Assigns the field md[vol] of double-precision matrices to the field + m[vol] of single-precision matrices. + +void apply_sw(int vol,float mu,pauli *m,spinor *s,spinor *r) + Applies the matrix field m[2*vol]+i*mu*gamma_5 to the spinor field + s[vol] and assigns the result to the field r[vol]. The source field + is overwritten if r=s and otherwise left unchanged (the arrays may + not overlap in this case). + +void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r) + Multiplies the Weyl spinor s by the matrix m+i*mu and assigns the + result to the Weyl spinor r. The source spinor is overwritten if + r=s and otherwise left unchanged. 
+ +int inv_pauli_dble(double mu,pauli_dble *m,pauli_dble *im) + Assigns the Hermitian part of the matrix (m+i*mu)^(-1) to im. The + matrix is overwritten if im=m and otherwise left unchanged. On + exit the program returns 0 or 1 depending on whether the inversion + was safe or not (in the latter case, the calculated matrix is unusable). + +complex_dble det_pauli_dble(double mu,pauli_dble *m) + Returns the determinant of the matrix m+i*mu. + +void apply_sw_dble(int vol,double mu,pauli_dble *m,spinor_dble *s, + spinor_dble *r) + Applies the matrix field m[2*vol]+i*mu*gamma_5 to the spinor field + s[vol] and assigns the result to the field r[vol]. The source field + is overwritten if r=s and otherwise left unchanged (the arrays may + not overlap in this case). + +int apply_swinv_dble(int vol,double mu,pauli_dble *m,spinor_dble *s, + spinor_dble *r) + Applies the inverse of the matrix field m[2*vol]+i*mu*gamma_5 to the + spinor field s[vol] and assigns the result to the field r[vol]. The + source field is overwritten if r=s and otherwise left unchanged (the + arrays may not overlap in this case). On exit the program returns 0 + or 1 depending on whether the matrix inversions were safe or not (in + the latter case, the output field is unusable). + +pauli *swfld(void) + Returns the base address of the single-precision SW field. If it + is not already allocated, the field is allocated and initialized + to unity. + +pauli_dble *swdfld(void) + Returns the base address of the double-precision SW field. If it + is not already allocated, the field is allocated and initialized + to unity. + +void assign_swd2sw(void) + Assigns the double-precision to the single-precision SW field. + +int sw_term(ptset_t set) + Computes the SW term for the current double-precision gauge field + and assigns the matrix to the global double-precision SW field. The + matrices on the specified point set are then inverted and 0 or 1 + is returned depending on whether all inversions were safe or not. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sw_term/pauli.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sw_term/pauli.c new file mode 100644 index 0000000000000000000000000000000000000000..daaa21361ef618103f2c33427f4e35f6b2378c53 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sw_term/pauli.c @@ -0,0 +1,1260 @@ +/******************************************************************************* +* +* File pauli.c +* +* Copyright (C) 2005, 2009, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Basic functions for single-precision Hermitian 6x6 matrices +* +* The externally accessible functions are +* +* void mul_pauli(float mu,pauli *m,weyl *s,weyl *r) +* Multiplies the Weyl spinor s by the matrix m+i*mu and assigns +* the result to the Weyl spinor r. The source spinor is overwritten +* if r=s and otherwise left unchanged. +* +* void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r) +* Multiplies the spinor s by the matrix m+i*mu*gamma_5 and assigns +* the result to the spinor r. The source spinor is overwritten +* if r=s and otherwise left unchanged. +* +* void assign_pauli(int vol,pauli_dble *md,pauli *m) +* Assigns the field md[vol] of double-precision matrices to the field +* m[vol] of single-precision matrices. +* +* void apply_sw(int vol,float mu,pauli *m,spinor *s,spinor *r) +* Applies the matrix field m[2*vol]+i*mu*gamma_5 to the spinor field +* s[vol] and assigns the result to the field r[vol]. The source field +* is overwritten if r=s and otherwise left unchanged (the arrays may +* not overlap in this case). +* +* Notes: +* +* The storage format for Hermitian 6x6 matrices is described in the notes +* "Implementation of the lattice Dirac operator" (file doc/dirac.pdf). +* +* The programs perform no communications and can be called locally. 
If SSE +* or AVX instructions are used, the Pauli matrices, Weyl and Dirac spinors +* must be aligned to a 16 byte boundary. +* +*******************************************************************************/ + +#define PAULI_C + +#include +#include +#include +#include "su3.h" +#include "sw_term.h" + +typedef union +{ + spinor s; + weyl w[2]; +} spin_t; + +#if (defined x64) +#include "sse2.h" + +void mul_pauli(float mu,pauli *m,weyl *s,weyl *r) +{ + m+=4; + _prefetch_pauli(m); + m-=4; + + __asm__ __volatile__ ("movss %0, %%xmm14 \n\t" + "movss %1, %%xmm2 \n\t" + "movss %2, %%xmm3 \n\t" + "movsd %3, %%xmm4 \n\t" + "shufps $0xb1, %%xmm14, %%xmm14" + : + : + "m" (mu), + "m" ((*m).u[0]), + "m" ((*m).u[1]), + "m" ((*m).u[8]), + "m" ((*m).u[9]) + : + "xmm2", "xmm3", "xmm4", "xmm14"); + + __asm__ __volatile__ ("movhps %0, %%xmm2 \n\t" + "movhps %0, %%xmm3 \n\t" + "movhps %2, %%xmm4 \n\t" + "movsldup %4, %%xmm0 \n\t" + "movshdup %4, %%xmm1 \n\t" + "addps %%xmm14, %%xmm2 \n\t" + "subps %%xmm14, %%xmm3 \n\t" + "movaps %%xmm4, %%xmm10 \n\t" + "movaps %%xmm2, %%xmm8 \n\t" + "movaps %%xmm3, %%xmm9 \n\t" + "shufps $0x4e, %%xmm3, %%xmm3 \n\t" + "shufps $0xb1, %%xmm10, %%xmm10 \n\t" + "shufps $0xb1, %%xmm8, %%xmm8 \n\t" + "shufps $0x1b, %%xmm9, %%xmm9" + : + : + "m" ((*m).u[6]), + "m" ((*m).u[7]), + "m" ((*m).u[16]), + "m" ((*m).u[17]), + "m" ((*s).c1.c1), + "m" ((*s).c1.c2) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm8", "xmm9", "xmm10"); + + __asm__ __volatile__ ("mulps %%xmm0, %%xmm2 \n\t" + "mulps %%xmm1, %%xmm3 \n\t" + "mulps %%xmm1, %%xmm4 \n\t" + "movsd %0, %%xmm5 \n\t" + "movsd %2, %%xmm6 \n\t" + "movsd %4, %%xmm7" + : + : + "m" ((*m).u[10]), + "m" ((*m).u[11]), + "m" ((*m).u[12]), + "m" ((*m).u[13]), + "m" ((*m).u[14]), + "m" ((*m).u[15]) + : + "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); + + s+=4; + _prefetch_weyl(s); + s-=4; + + __asm__ __volatile__ ("mulps %%xmm1, %%xmm8 \n\t" + "mulps %%xmm0, %%xmm9 \n\t" + "mulps %%xmm0, %%xmm10 \n\t" + "movhps %0, 
%%xmm5 \n\t" + "movhps %2, %%xmm6 \n\t" + "movhps %4, %%xmm7 \n\t" + "addsubps %%xmm8, %%xmm2 \n\t" + "addsubps %%xmm9, %%xmm3 \n\t" + "addsubps %%xmm10, %%xmm4" + : + : + "m" ((*m).u[18]), + "m" ((*m).u[19]), + "m" ((*m).u[20]), + "m" ((*m).u[21]), + "m" ((*m).u[22]), + "m" ((*m).u[23]) + : + "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7", "xmm8", "xmm9", + "xmm10"); + + __asm__ __volatile__ ("movaps %%xmm5, %%xmm8 \n\t" + "movaps %%xmm6, %%xmm9 \n\t" + "movaps %%xmm7, %%xmm10 \n\t" + "shufps $0xb1, %%xmm3, %%xmm3 \n\t" + "shufps $0xb1, %%xmm4, %%xmm4 \n\t" + "mulps %%xmm1, %%xmm5 \n\t" + "mulps %%xmm1, %%xmm6 \n\t" + "mulps %%xmm1, %%xmm7 \n\t" + "shufps $0xb1, %%xmm8, %%xmm8 \n\t" + "shufps $0xb1, %%xmm9, %%xmm9 \n\t" + "shufps $0xb1, %%xmm10, %%xmm10 \n\t" + "mulps %%xmm0, %%xmm8 \n\t" + "mulps %%xmm0, %%xmm9 \n\t" + "mulps %%xmm0, %%xmm10" + : + : + : + "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10"); + + __asm__ __volatile__ ("movss %0, %%xmm13 \n\t" + "movaps %1, %%xmm11" + : + : + "m" ((*m).u[2]), + "m" ((*m).u[8]), + "m" ((*m).u[9]), + "m" ((*m).u[10]), + "m" ((*m).u[11]) + : + "xmm11", "xmm13"); + + __asm__ __volatile__ ("movaps %0, %%xmm12 \n\t" + "addsubps %%xmm8, %%xmm5 \n\t" + "addsubps %%xmm9, %%xmm6 \n\t" + "addsubps %%xmm10, %%xmm7 \n\t" + "movhps %4, %%xmm13 \n\t" + "movsldup %6, %%xmm0 \n\t" + "movshdup %6, %%xmm1 \n\t" + "addps %%xmm14, %%xmm13 \n\t" + "movaps %%xmm11, %%xmm8 \n\t" + "movaps %%xmm12, %%xmm9 \n\t" + "movaps %%xmm13, %%xmm10 \n\t" + "shufps $0xb1, %%xmm8, %%xmm8 \n\t" + "shufps $0xb1, %%xmm9, %%xmm9 \n\t" + "shufps $0xb1, %%xmm10, %%xmm10" + : + : + "m" ((*m).u[16]), + "m" ((*m).u[17]), + "m" ((*m).u[18]), + "m" ((*m).u[19]), + "m" ((*m).u[24]), + "m" ((*m).u[25]), + "m" ((*s).c1.c3), + "m" ((*s).c2.c1) + : + "xmm0", "xmm1", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", + "xmm12", "xmm13"); + + __asm__ __volatile__ ("mulps %%xmm0, %%xmm11 \n\t" + "mulps %%xmm0, %%xmm12 \n\t" + "mulps %%xmm0, %%xmm13 \n\t" 
+ "addps %%xmm11, %%xmm2 \n\t" + "addps %%xmm12, %%xmm3 \n\t" + "addps %%xmm13, %%xmm4 \n\t" + "movss %0, %%xmm11 \n\t" + "movsd %1, %%xmm12 \n\t" + "movsd %3, %%xmm13 \n\t" + "mulps %%xmm1, %%xmm8 \n\t" + "mulps %%xmm1, %%xmm9 \n\t" + "mulps %%xmm1, %%xmm10" + : + : + "m" ((*m).u[3]), + "m" ((*m).u[26]), + "m" ((*m).u[27]), + "m" ((*m).u[28]), + "m" ((*m).u[29]) + : + "xmm2", "xmm3", "xmm4", "xmm8", + "xmm9", "xmm10", "xmm11", "xmm12", + "xmm13"); + + __asm__ __volatile__ ("movhps %0, %%xmm11 \n\t" + "movhps %2, %%xmm12 \n\t" + "movhps %4, %%xmm13 \n\t" + "subps %%xmm14, %%xmm11 \n\t" + "addsubps %%xmm8, %%xmm2 \n\t" + "addsubps %%xmm9, %%xmm3 \n\t" + "addsubps %%xmm10, %%xmm4" + : + : + "m" ((*m).u[24]), + "m" ((*m).u[25]), + "m" ((*m).u[30]), + "m" ((*m).u[31]), + "m" ((*m).u[32]), + "m" ((*m).u[33]) + : + "xmm2", "xmm3", "xmm4", "xmm11", + "xmm12", "xmm13"); + + __asm__ __volatile__ ("movaps %%xmm11, %%xmm8 \n\t" + "movaps %%xmm12, %%xmm9 \n\t" + "movaps %%xmm13, %%xmm10 \n\t" + "shufps $0x4e, %%xmm11, %%xmm11 \n\t" + "shufps $0x1b, %%xmm8, %%xmm8 \n\t" + "shufps $0xb1, %%xmm9, %%xmm9 \n\t" + "shufps $0xb1, %%xmm10, %%xmm10 \n\t" + "mulps %%xmm1, %%xmm11 \n\t" + "mulps %%xmm1, %%xmm12 \n\t" + "mulps %%xmm1, %%xmm13 \n\t" + "addps %%xmm11, %%xmm5 \n\t" + "addps %%xmm12, %%xmm6 \n\t" + "addps %%xmm13, %%xmm7 \n\t" + "mulps %%xmm0, %%xmm8 \n\t" + "mulps %%xmm0, %%xmm9 \n\t" + "mulps %%xmm0, %%xmm10" + : + : + : + "xmm5", "xmm6","xmm7", "xmm8", + "xmm9", "xmm10", "xmm11", "xmm12", + "xmm13"); + + __asm__ __volatile__ ("movaps %0, %%xmm11 \n\t" + "movaps %4, %%xmm12" + : + : + "m" ((*m).u[12]), + "m" ((*m).u[13]), + "m" ((*m).u[14]), + "m" ((*m).u[15]), + "m" ((*m).u[20]), + "m" ((*m).u[21]), + "m" ((*m).u[22]), + "m" ((*m).u[23]) + : + "xmm11", "xmm12"); + + __asm__ __volatile__ ("movups %0, %%xmm13 \n\t" + "addsubps %%xmm8, %%xmm5 \n\t" + "addsubps %%xmm9, %%xmm6 \n\t" + "addsubps %%xmm10, %%xmm7 \n\t" + "movaps %%xmm11, %%xmm8 \n\t" + "movaps %%xmm12, %%xmm9 
\n\t" + "movaps %%xmm13, %%xmm10 \n\t" + "movsldup %4, %%xmm0 \n\t" + "movshdup %4, %%xmm1 \n\t" + "shufps $0xb1, %%xmm8, %%xmm8 \n\t" + "shufps $0xb1, %%xmm9, %%xmm9 \n\t" + "shufps $0xb1, %%xmm10, %%xmm10" + : + : + "m" ((*m).u[26]), + "m" ((*m).u[27]), + "m" ((*m).u[28]), + "m" ((*m).u[29]), + "m" ((*s).c2.c2), + "m" ((*s).c2.c3) + : + "xmm0", "xmm1", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", + "xmm13"); + + __asm__ __volatile__ ("mulps %%xmm0, %%xmm11 \n\t" + "mulps %%xmm0, %%xmm12 \n\t" + "mulps %%xmm0, %%xmm13 \n\t" + "addps %%xmm11, %%xmm2 \n\t" + "addps %%xmm12, %%xmm3 \n\t" + "addps %%xmm13, %%xmm4 \n\t" + "movups %0, %%xmm11 \n\t" + "movss %4, %%xmm12 \n\t" + "movss %5, %%xmm13 \n\t" + "mulps %%xmm1, %%xmm8 \n\t" + "mulps %%xmm1, %%xmm9 \n\t" + "mulps %%xmm1, %%xmm10" + : + : + "m" ((*m).u[30]), + "m" ((*m).u[31]), + "m" ((*m).u[32]), + "m" ((*m).u[33]), + "m" ((*m).u[4]), + "m" ((*m).u[5]) + : + "xmm2", "xmm3", "xmm4", "xmm8", + "xmm9", "xmm10", "xmm11", "xmm12", + "xmm13"); + + __asm__ __volatile__ ("movhps %0, %%xmm12 \n\t" + "movhps %0, %%xmm13 \n\t" + "addsubps %%xmm8, %%xmm2 \n\t" + "addsubps %%xmm9, %%xmm3 \n\t" + "addps %%xmm14, %%xmm12 \n\t" + "subps %%xmm14, %%xmm13 \n\t" + "addsubps %%xmm10, %%xmm4" + : + : + "m" ((*m).u[34]), + "m" ((*m).u[35]) + : + "xmm2", "xmm3", "xmm4", "xmm12", + "xmm13"); + + __asm__ __volatile__ ("movaps %%xmm11, %%xmm8 \n\t" + "movaps %%xmm12, %%xmm9 \n\t" + "movaps %%xmm13, %%xmm10 \n\t" + "shufps $0xb1, %%xmm8, %%xmm8 \n\t" + "shufps $0xb1, %%xmm9, %%xmm9 \n\t" + "shufps $0x1b, %%xmm10, %%xmm10 \n\t" + "mulps %%xmm1, %%xmm8 \n\t" + "mulps %%xmm1, %%xmm9 \n\t" + "mulps %%xmm0, %%xmm10 \n\t" + "shufps $0x4e, %%xmm13, %%xmm13 \n\t" + "shufps $0xb1, %%xmm5, %%xmm5 \n\t" + "shufps $0xb1, %%xmm6, %%xmm6 \n\t" + "mulps %%xmm0, %%xmm11 \n\t" + "mulps %%xmm0, %%xmm12 \n\t" + "mulps %%xmm1, %%xmm13" + : + : + : + "xmm5", "xmm6", "xmm8", "xmm9", + "xmm10", "xmm11", "xmm12", "xmm13"); + + __asm__ __volatile__ ("addsubps 
%%xmm8, %%xmm5 \n\t" + "addsubps %%xmm9, %%xmm6 \n\t" + "addsubps %%xmm10, %%xmm7 \n\t" + "shufps $0xd8, %%xmm2, %%xmm2 \n\t" + "shufps $0xd8, %%xmm3, %%xmm3 \n\t" + "shufps $0xd8, %%xmm4, %%xmm4 \n\t" + "addps %%xmm11, %%xmm5 \n\t" + "addps %%xmm12, %%xmm6 \n\t" + "addps %%xmm13, %%xmm7 \n\t" + "shufps $0xd8, %%xmm5, %%xmm5 \n\t" + "shufps $0xd8, %%xmm6, %%xmm6 \n\t" + "shufps $0x8d, %%xmm7, %%xmm7 \n\t" + "haddps %%xmm3, %%xmm2 \n\t" + "haddps %%xmm5, %%xmm4 \n\t" + "haddps %%xmm7, %%xmm6 \n\t" + "movaps %%xmm2, %0 \n\t" + "movaps %%xmm4, %2 \n\t" + "movaps %%xmm6, %4" + : + "=m" ((*r).c1.c1), + "=m" ((*r).c1.c2), + "=m" ((*r).c1.c3), + "=m" ((*r).c2.c1), + "=m" ((*r).c2.c2), + "=m" ((*r).c2.c3) + : + : + "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); +} + +#elif (defined QPX) + +#include "qpx.h" + +void mul_pauli_qpx(pauli *m,vector4double *im1[3], vector4double *im2[3] ) +{ + vector4double s1,s2,s3,s4,s5,s6,s10,s11; + vector4double v1,v2,v3,v4,v5,v6,v7,v71,v8,v9,v10,v11,v12,v13,v14,v15,v100; + vector4double r10,r11, r12,r13,r14,r15, r100,r101,r102,r110,r111,ri1,ri2; + + s1=vec_perm(*(im1[0]),*(im1[0]),perm1); + s2=vec_perm(*(im1[0]),*(im1[0]),perm2); + s3=vec_perm(*(im1[1]),*(im1[1]),perm1); + s4=vec_perm(*(im1[1]),*(im1[1]),perm2); + s5=vec_perm(*(im1[2]),*(im1[2]),perm1); + s6=vec_perm(*(im1[2]),*(im1[2]),perm2); + + + v10=vec_ld(0,&((*m).u[0])); + v11=vec_ld2(0,&((*m).u[4])); + v1=vec_ld2(0,&((*m).u[6])); + v2=vec_ld(0,&((*m).u[8])); + v3=vec_ld(0,&((*m).u[12])); + v4=vec_ld(0,&((*m).u[16])); + v5=vec_ld(0,&((*m).u[20])); + v6=vec_ld2(0,&((*m).u[24])); + v7=vec_ld2(0,&((*m).u[26])); + v71=vec_ld2(0,&((*m).u[28])); + v8=vec_ld2(0,&((*m).u[30])); + v9=vec_ld2(0,&((*m).u[32])); + v7=vec_sldw(v7,v71,2); + v8=vec_sldw(v8,v9,2); + v9=vec_ld2(0,&((*m).u[34])); + + + v100=vec_perm(v10,v10,perm0011); + s10=vec_sldw(s1,s2,2); + s11=vec_sldw(s2,s1,2); + v12=vec_perm(v2,v4,perm1); + v13=vec_perm(v2,v4,perm2); + v14=vec_perm(v3,v5,perm1); + v15=vec_perm(v3,v5,perm2); 
+ + r11=vec_xxnpmadd(s11,vec_mul(sign0,v1),vec_xmul(v1,s11)); + r12=vec_xxnpmadd(v12,s3,vec_xmadd(s3,v12,r11)); + r13=vec_xxnpmadd(v13,s4,vec_xmadd(s4,v13,r12)); + r14=vec_xxnpmadd(v14,s5,vec_xmadd(s5,v14,r13)); + r15=vec_xxnpmadd(v15,s6,vec_xmadd(s6,v15,r14)); + *(im2[0])=vec_xmadd(v100,s10,r15); + + v100=vec_perm(v10,v10,perm2233); + s10=vec_sldw(s3,s4,2); + s11=vec_sldw(s4,s3,2); + v12=vec_perm(v7,v8,perm1); + v13=vec_perm(v7,v8,perm2); + + r10=vec_xxcpnmadd(s1,v2,vec_xmul(v2,s1)); + r11=vec_xxcpnmadd(s2,v4,vec_xmadd(v4,s2,r10)); + r13=vec_xxnpmadd(s11,vec_mul(sign0,v6),vec_xmadd(v6,s11,r11)); + r14=vec_xxnpmadd(v12,s5,vec_xmadd(s5,v12,r13)); + r15=vec_xxnpmadd(v13,s6,vec_xmadd(s6,v13,r14)); + *(im2[1])=vec_xmadd(v100,s10,r15); + + v100=vec_perm(v11,v11,perm0011); + s10=vec_sldw(s5,s6,2); + s11=vec_sldw(s6,s5,2); + + r10=vec_xxcpnmadd(s1,v3,vec_xmul(v3,s1)); + r11=vec_xxcpnmadd(s2,v5,vec_xmadd(v5,s2,r10)); + r12=vec_xxcpnmadd(s3,v7,vec_xmadd(v7,s3,r11)); + r13=vec_xxcpnmadd(s4,v8,vec_xmadd(v8,s4,r12)); + r15=vec_xxnpmadd(s11,vec_mul(sign0,v9),vec_xmadd(v9,s11,r13)); + *(im2[2])=vec_xmadd(v100,s10,r15); +} + + +void mul_pauli(float mu,pauli *m,weyl *s,weyl *r) +{ + vector4double s1,s2,s3,s4,s5,s6,s10,s11; + vector4double v1,v2,v3,v4,v5,v6,v7,v71,v8,v9,v10,v11,v12,v13,v14,v15,v100,v16,v17,v18; + vector4double r10,r11, r12,r13,r14,r15, r100,r101,r102,r110,r111; + + s1=vec_ld2(0,&((*s).c1.c1.re)); + s2=vec_ld2(0,&((*s).c1.c2.re)); + s3=vec_ld2(0,&((*s).c1.c3.re)); + s4=vec_ld2(0,&((*s).c2.c1.re)); + s5=vec_ld2(0,&((*s).c2.c2.re)); + s6=vec_ld2(0,&((*s).c2.c3.re)); + v16=vec_splats(mu); + v10=vec_ld(0,&((*m).u[0])); + v11=vec_ld2(0,&((*m).u[4])); + v1=vec_ld2(0,&((*m).u[6])); + v2=vec_ld(0,&((*m).u[8])); + v3=vec_ld(0,&((*m).u[12])); + v4=vec_ld(0,&((*m).u[16])); + v5=vec_ld(0,&((*m).u[20])); + v6=vec_ld2(0,&((*m).u[24])); + v7=vec_ld2(0,&((*m).u[26])); + v71=vec_ld2(0,&((*m).u[28])); + v8=vec_ld2(0,&((*m).u[30])); + v9=vec_ld2(0,&((*m).u[32])); + 
v7=vec_sldw(v7,v71,2); + v8=vec_sldw(v8,v9,2); + v9=vec_ld2(0,&((*m).u[34])); + + + v100=vec_perm(v10,v16,perml1); + v17=vec_mul(sign0,vec_perm(v100,v1,perm1)); + v18=vec_perm(v1,v100,perm2); + r10=vec_xxnpmadd(v17,s1,vec_xmul(s1,v17)); + r11=vec_xxnpmadd(v18,s2,vec_xmadd(s2,v18,r10)); + v12=vec_perm(v2,v4,perm1); + v13=vec_perm(v2,v4,perm2); + r12=vec_xxnpmadd(v12,s3,vec_xmadd(s3,v12,r11)); + r13=vec_xxnpmadd(v13,s4,vec_xmadd(s4,v13,r12)); + v14=vec_perm(v3,v5,perm1); + v15=vec_perm(v3,v5,perm2); + r14=vec_xxnpmadd(v14,s5,vec_xmadd(s5,v14,r13)); + r15=vec_xxnpmadd(v15,s6,vec_xmadd(s6,v15,r14)); + vec_sta(r15,0,&((*r).c1.c1.re)); + + v100=vec_perm(v10,v16,perml2); + v17=vec_mul(sign0,vec_perm(v100,v6,perm1)); + v18=vec_perm(v6,v100,perm2); + r10=vec_xxcpnmadd(s1,v2,vec_xmul(v2,s1)); + r11=vec_xxcpnmadd(s2,v4,vec_xmadd(v4,s2,r10)); + r12=vec_xxnpmadd(v17,s3,vec_xmadd(s3,v17,r11)); + r13=vec_xxnpmadd(v18,s4,vec_xmadd(s4,v18,r12)); + v12=vec_perm(v7,v8,perm1); + v13=vec_perm(v7,v8,perm2); + r14=vec_xxnpmadd(v12,s5,vec_xmadd(s5,v12,r13)); + r15=vec_xxnpmadd(v13,s6,vec_xmadd(s6,v13,r14)); + vec_sta(r15,0,&((*r).c1.c3.re)); + + v100=vec_perm(v11,v16,perml1); + v17=vec_mul(sign0,vec_perm(v100,v9,perm1)); + v18=vec_perm(v9,v100,perm2); + r10=vec_xxcpnmadd(s1,v3,vec_xmul(v3,s1)); + r11=vec_xxcpnmadd(s2,v5,vec_xmadd(v5,s2,r10)); + r12=vec_xxcpnmadd(s3,v7,vec_xmadd(v7,s3,r11)); + r13=vec_xxcpnmadd(s4,v8,vec_xmadd(v8,s4,r12)); + r14=vec_xxnpmadd(v17,s5,vec_xmadd(s5,v17,r13)); + r15=vec_xxnpmadd(v18,s6,vec_xmadd(s6,v18,r14)); + vec_sta(r15,0,&((*r).c2.c2.re)); +} + +#else + +static weyl rs; + + +void mul_pauli(float mu,pauli *m,weyl *s,weyl *r) +{ + float *u; + + u=(*m).u; + + rs.c1.c1.re= + u[ 0]*(*s).c1.c1.re- mu*(*s).c1.c1.im+ + u[ 6]*(*s).c1.c2.re-u[ 7]*(*s).c1.c2.im+ + u[ 8]*(*s).c1.c3.re-u[ 9]*(*s).c1.c3.im+ + u[10]*(*s).c2.c1.re-u[11]*(*s).c2.c1.im+ + u[12]*(*s).c2.c2.re-u[13]*(*s).c2.c2.im+ + u[14]*(*s).c2.c3.re-u[15]*(*s).c2.c3.im; + + rs.c1.c1.im= + u[ 
0]*(*s).c1.c1.im+ mu*(*s).c1.c1.re+ + u[ 6]*(*s).c1.c2.im+u[ 7]*(*s).c1.c2.re+ + u[ 8]*(*s).c1.c3.im+u[ 9]*(*s).c1.c3.re+ + u[10]*(*s).c2.c1.im+u[11]*(*s).c2.c1.re+ + u[12]*(*s).c2.c2.im+u[13]*(*s).c2.c2.re+ + u[14]*(*s).c2.c3.im+u[15]*(*s).c2.c3.re; + + rs.c1.c2.re= + u[ 6]*(*s).c1.c1.re+u[ 7]*(*s).c1.c1.im+ + u[ 1]*(*s).c1.c2.re- mu*(*s).c1.c2.im+ + u[16]*(*s).c1.c3.re-u[17]*(*s).c1.c3.im+ + u[18]*(*s).c2.c1.re-u[19]*(*s).c2.c1.im+ + u[20]*(*s).c2.c2.re-u[21]*(*s).c2.c2.im+ + u[22]*(*s).c2.c3.re-u[23]*(*s).c2.c3.im; + + rs.c1.c2.im= + u[ 6]*(*s).c1.c1.im-u[ 7]*(*s).c1.c1.re+ + u[ 1]*(*s).c1.c2.im+ mu*(*s).c1.c2.re+ + u[16]*(*s).c1.c3.im+u[17]*(*s).c1.c3.re+ + u[18]*(*s).c2.c1.im+u[19]*(*s).c2.c1.re+ + u[20]*(*s).c2.c2.im+u[21]*(*s).c2.c2.re+ + u[22]*(*s).c2.c3.im+u[23]*(*s).c2.c3.re; + + rs.c1.c3.re= + u[ 8]*(*s).c1.c1.re+u[ 9]*(*s).c1.c1.im+ + u[16]*(*s).c1.c2.re+u[17]*(*s).c1.c2.im+ + u[ 2]*(*s).c1.c3.re- mu*(*s).c1.c3.im+ + u[24]*(*s).c2.c1.re-u[25]*(*s).c2.c1.im+ + u[26]*(*s).c2.c2.re-u[27]*(*s).c2.c2.im+ + u[28]*(*s).c2.c3.re-u[29]*(*s).c2.c3.im; + + rs.c1.c3.im= + u[ 8]*(*s).c1.c1.im-u[ 9]*(*s).c1.c1.re+ + u[16]*(*s).c1.c2.im-u[17]*(*s).c1.c2.re+ + u[ 2]*(*s).c1.c3.im+ mu*(*s).c1.c3.re+ + u[24]*(*s).c2.c1.im+u[25]*(*s).c2.c1.re+ + u[26]*(*s).c2.c2.im+u[27]*(*s).c2.c2.re+ + u[28]*(*s).c2.c3.im+u[29]*(*s).c2.c3.re; + + rs.c2.c1.re= + u[10]*(*s).c1.c1.re+u[11]*(*s).c1.c1.im+ + u[18]*(*s).c1.c2.re+u[19]*(*s).c1.c2.im+ + u[24]*(*s).c1.c3.re+u[25]*(*s).c1.c3.im+ + u[ 3]*(*s).c2.c1.re- mu*(*s).c2.c1.im+ + u[30]*(*s).c2.c2.re-u[31]*(*s).c2.c2.im+ + u[32]*(*s).c2.c3.re-u[33]*(*s).c2.c3.im; + + rs.c2.c1.im= + u[10]*(*s).c1.c1.im-u[11]*(*s).c1.c1.re+ + u[18]*(*s).c1.c2.im-u[19]*(*s).c1.c2.re+ + u[24]*(*s).c1.c3.im-u[25]*(*s).c1.c3.re+ + u[ 3]*(*s).c2.c1.im+ mu*(*s).c2.c1.re+ + u[30]*(*s).c2.c2.im+u[31]*(*s).c2.c2.re+ + u[32]*(*s).c2.c3.im+u[33]*(*s).c2.c3.re; + + rs.c2.c2.re= + u[12]*(*s).c1.c1.re+u[13]*(*s).c1.c1.im+ + u[20]*(*s).c1.c2.re+u[21]*(*s).c1.c2.im+ + 
u[26]*(*s).c1.c3.re+u[27]*(*s).c1.c3.im+ + u[30]*(*s).c2.c1.re+u[31]*(*s).c2.c1.im+ + u[ 4]*(*s).c2.c2.re- mu*(*s).c2.c2.im+ + u[34]*(*s).c2.c3.re-u[35]*(*s).c2.c3.im; + + rs.c2.c2.im= + u[12]*(*s).c1.c1.im-u[13]*(*s).c1.c1.re+ + u[20]*(*s).c1.c2.im-u[21]*(*s).c1.c2.re+ + u[26]*(*s).c1.c3.im-u[27]*(*s).c1.c3.re+ + u[30]*(*s).c2.c1.im-u[31]*(*s).c2.c1.re+ + u[ 4]*(*s).c2.c2.im+ mu*(*s).c2.c2.re+ + u[34]*(*s).c2.c3.im+u[35]*(*s).c2.c3.re; + + rs.c2.c3.re= + u[14]*(*s).c1.c1.re+u[15]*(*s).c1.c1.im+ + u[22]*(*s).c1.c2.re+u[23]*(*s).c1.c2.im+ + u[28]*(*s).c1.c3.re+u[29]*(*s).c1.c3.im+ + u[32]*(*s).c2.c1.re+u[33]*(*s).c2.c1.im+ + u[34]*(*s).c2.c2.re+u[35]*(*s).c2.c2.im+ + u[ 5]*(*s).c2.c3.re- mu*(*s).c2.c3.im; + + rs.c2.c3.im= + u[14]*(*s).c1.c1.im-u[15]*(*s).c1.c1.re+ + u[22]*(*s).c1.c2.im-u[23]*(*s).c1.c2.re+ + u[28]*(*s).c1.c3.im-u[29]*(*s).c1.c3.re+ + u[32]*(*s).c2.c1.im-u[33]*(*s).c2.c1.re+ + u[34]*(*s).c2.c2.im-u[35]*(*s).c2.c2.re+ + u[ 5]*(*s).c2.c3.im+ mu*(*s).c2.c3.re; + + (*r)=rs; +} + +#endif + +#if (defined AVX) +#include "avx.h" + +void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r) +{ + m+=4; + _prefetch_pauli_dble(m); + m-=4; + + __asm__ __volatile__ ("vmovss %0, %%xmm14 \n\t" + "vmovss %1, %%xmm2 \n\t" + "vmovss %2, %%xmm3 \n\t" + "vmovsd %3, %%xmm4 \n\t" + "vpermilps $0xb1, %%xmm14, %%xmm14 \n\t" + "vxorps %%xmm15, %%xmm15, %%xmm15 \n\t" + "vmovss %5, %%xmm8 \n\t" + "vmovss %6, %%xmm9 \n\t" + "vmovsd %7, %%xmm10 \n\t" + "vsubps %%xmm14, %%xmm15, %%xmm15" + : + : + "m" (mu), + "m" (m[0].u[0]), + "m" (m[0].u[1]), + "m" (m[0].u[8]), + "m" (m[0].u[9]), + "m" (m[1].u[0]), + "m" (m[1].u[1]), + "m" (m[1].u[8]), + "m" (m[1].u[9]) + : + "xmm2", "xmm3", "xmm4", "xmm8", + "xmm9", "xmm10", "xmm14", "xmm15"); + + __asm__ __volatile__ ("vmovhps %0, %%xmm2, %%xmm2 \n\t" + "vmovhps %0, %%xmm3, %%xmm3 \n\t" + "vmovhps %2, %%xmm4, %%xmm4 \n\t" + "vmovaps %4, %%xmm0" + : + : + "m" (m[0].u[6]), + "m" (m[0].u[7]), + "m" (m[0].u[16]), + "m" (m[0].u[17]), + "m" 
((*s).c1.c1), + "m" ((*s).c1.c2) + : + "xmm0", "xmm2", "xmm3", "xmm4"); + + __asm__ __volatile__ ("vmovhps %0, %%xmm8, %%xmm8 \n\t" + "vmovhps %0, %%xmm9, %%xmm9 \n\t" + "vmovhps %2, %%xmm10, %%xmm10 \n\t" + "vinsertf128 $0x1, %4, %%ymm0, %%ymm0 \n\t" + "vinsertf128 $0x1, %%xmm15, %%ymm14, %%ymm14 \n\t" + "vinsertf128 $0x1, %%xmm8, %%ymm2, %%ymm2 \n\t" + "vinsertf128 $0x1, %%xmm9, %%ymm3, %%ymm3 \n\t" + "vinsertf128 $0x1, %%xmm10, %%ymm4, %%ymm4 \n\t" + "vmovshdup %%ymm0, %%ymm1 \n\t" + "vmovsldup %%ymm0, %%ymm0" + : + : + "m" (m[1].u[6]), + "m" (m[1].u[7]), + "m" (m[1].u[16]), + "m" (m[1].u[17]), + "m" ((*s).c3.c1), + "m" ((*s).c3.c2) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm8", "xmm9", "xmm10", + "xmm14"); + + __asm__ __volatile__ ("vaddps %%ymm14, %%ymm2, %%ymm2 \n\t" + "vsubps %%ymm14, %%ymm3, %%ymm3 \n\t" + "vpermilps $0xb1, %%ymm4, %%ymm10 \n\t" + "vpermilps $0x4e, %%ymm3, %%ymm3 \n\t" + "vpermilps $0xb1, %%ymm2, %%ymm8 \n\t" + "vpermilps $0xb1, %%ymm3, %%ymm9" + : + : + : + "xmm2", "xmm3", "xmm8", "xmm9", + "xmm10"); + + __asm__ __volatile__ ("vmulps %%ymm0, %%ymm2, %%ymm2 \n\t" + "vmulps %%ymm1, %%ymm3, %%ymm3 \n\t" + "vmulps %%ymm1, %%ymm4, %%ymm4 \n\t" + "vmovsd %0, %%xmm5 \n\t" + "vmovsd %2, %%xmm6 \n\t" + "vmovsd %4, %%xmm7" + : + : + "m" (m[0].u[10]), + "m" (m[0].u[11]), + "m" (m[0].u[12]), + "m" (m[0].u[13]), + "m" (m[0].u[14]), + "m" (m[0].u[15]) + : + "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); + + __asm__ __volatile__ ("vmovsd %0, %%xmm11 \n\t" + "vmovsd %2, %%xmm12 \n\t" + "vmovsd %4, %%xmm13 \n\t" + "vmulps %%ymm1, %%ymm8, %%ymm8 \n\t" + "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulps %%ymm0, %%ymm10, %%ymm10" + : + : + "m" (m[1].u[10]), + "m" (m[1].u[11]), + "m" (m[1].u[12]), + "m" (m[1].u[13]), + "m" (m[1].u[14]), + "m" (m[1].u[15]) + : + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13"); + + __asm__ __volatile__ ("vmovhps %0, %%xmm5, %%xmm5 \n\t" + "vmovhps %2, %%xmm6, %%xmm6 \n\t" + "vmovhps %4, %%xmm7, %%xmm7 \n\t" + 
"vaddsubps %%ymm8, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm9, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm10, %%ymm4, %%ymm4" + : + : + "m" (m[0].u[18]), + "m" (m[0].u[19]), + "m" (m[0].u[20]), + "m" (m[0].u[21]), + "m" (m[0].u[22]), + "m" (m[0].u[23]) + : + "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); + + __asm__ __volatile__ ("vmovhps %0, %%xmm11, %%xmm11 \n\t" + "vmovhps %2, %%xmm12, %%xmm12 \n\t" + "vmovhps %4, %%xmm13, %%xmm13 \n\t" + "vinsertf128 $0x1, %%xmm11, %%ymm5, %%ymm5 \n\t" + "vinsertf128 $0x1, %%xmm12, %%ymm6, %%ymm6 \n\t" + "vinsertf128 $0x1, %%xmm13, %%ymm7, %%ymm7" + : + : + "m" (m[1].u[18]), + "m" (m[1].u[19]), + "m" (m[1].u[20]), + "m" (m[1].u[21]), + "m" (m[1].u[22]), + "m" (m[1].u[23]) + : + "xmm5", "xmm6", "xmm7", "xmm11", + "xmm12", "xmm13"); + + s+=4; + _prefetch_spinor(s); + s-=4; + + __asm__ __volatile__ ("vpermilps $0xb1, %%ymm5, %%ymm8 \n\t" + "vpermilps $0xb1, %%ymm6, %%ymm9 \n\t" + "vpermilps $0xb1, %%ymm7, %%ymm10 \n\t" + "vpermilps $0xb1, %%ymm3, %%ymm3 \n\t" + "vpermilps $0xb1, %%ymm4, %%ymm4 \n\t" + "vmulps %%ymm1, %%ymm5, %%ymm5 \n\t" + "vmulps %%ymm1, %%ymm6, %%ymm6 \n\t" + "vmulps %%ymm1, %%ymm7, %%ymm7 \n\t" + "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulps %%ymm0, %%ymm10, %%ymm10" + : + : + : + "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10"); + + __asm__ __volatile__ ("vmovaps %0, %%xmm11 \n\t" + "vmovaps %4, %%xmm12 \n\t" + "vmovss %8, %%xmm13 \n\t" + "vaddsubps %%ymm8, %%ymm5, %%ymm5 \n\t" + "vaddsubps %%ymm9, %%ymm6, %%ymm6 \n\t" + "vaddsubps %%ymm10, %%ymm7, %%ymm7" + : + : + "m" (m[0].u[8]), + "m" (m[0].u[9]), + "m" (m[0].u[10]), + "m" (m[0].u[11]), + "m" (m[0].u[16]), + "m" (m[0].u[17]), + "m" (m[0].u[18]), + "m" (m[0].u[19]), + "m" (m[0].u[2]) + : + "xmm5", "xmm6", "xmm7", "xmm11", + "xmm12", "xmm13"); + + __asm__ __volatile__ ("vmovaps %0, %%xmm8 \n\t" + "vmovaps %4, %%xmm9 \n\t" + "vmovss %8, %%xmm10" + : + : + "m" (m[1].u[8]), + "m" (m[1].u[9]), + "m" 
(m[1].u[10]), + "m" (m[1].u[11]), + "m" (m[1].u[16]), + "m" (m[1].u[17]), + "m" (m[1].u[18]), + "m" (m[1].u[19]), + "m" (m[1].u[2]) + : + "xmm8", "xmm9", "xmm10"); + + __asm__ __volatile__ ("vmovaps %0, %%xmm0 \n\t" + "vmovhps %2, %%xmm13, %%xmm13 \n\t" + "vmovhps %4, %%xmm10, %%xmm10 \n\t" + "vinsertf128 $0x1, %6, %%ymm0, %%ymm0 \n\t" + "vinsertf128 $0x1, %%xmm8, %%ymm11, %%ymm11 \n\t" + "vinsertf128 $0x1, %%xmm9, %%ymm12, %%ymm12 \n\t" + "vinsertf128 $0x1, %%xmm10, %%ymm13, %%ymm13 \n\t" + "vmovshdup %%ymm0, %%ymm1 \n\t" + "vmovsldup %%ymm0, %%ymm0 \n\t" + "vaddps %%ymm14, %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1, %%ymm11, %%ymm8 \n\t" + "vpermilps $0xb1, %%ymm12, %%ymm9 \n\t" + "vpermilps $0xb1, %%ymm13, %%ymm10" + : + : + "m" ((*s).c1.c3), + "m" ((*s).c2.c1), + "m" (m[0].u[24]), + "m" (m[0].u[25]), + "m" (m[1].u[24]), + "m" (m[1].u[25]), + "m" ((*s).c3.c3), + "m" ((*s).c4.c1) + : + "xmm0", "xmm1", "xmm8", "xmm9", + "xmm10", "xmm11", "xmm12", "xmm13"); + + __asm__ __volatile__ ("vmulps %%ymm0, %%ymm11, %%ymm11 \n\t" + "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" + "vaddps %%ymm11, %%ymm2, %%ymm2 \n\t" + "vaddps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vaddps %%ymm13, %%ymm4, %%ymm4 \n\t" + "vmovss %0, %%xmm11 \n\t" + "vmovsd %1, %%xmm12 \n\t" + "vmovsd %3, %%xmm13 \n\t" + "vmulps %%ymm1, %%ymm8, %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm9, %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm10, %%ymm10" + : + : + "m" (m[0].u[3]), + "m" (m[0].u[26]), + "m" (m[0].u[27]), + "m" (m[0].u[28]), + "m" (m[0].u[29]) + : + "xmm2", "xmm3", "xmm4", "xmm8", + "xmm9", "xmm10", "xmm11", "xmm12", + "xmm13"); + + __asm__ __volatile__ ("vmovhps %0, %%xmm11, %%xmm11 \n\t" + "vmovhps %2, %%xmm12, %%xmm12 \n\t" + "vmovhps %4, %%xmm13, %%xmm13 \n\t" + "vaddsubps %%ymm8, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm9, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm10, %%ymm4, %%ymm4" + : + : + "m" (m[0].u[24]), + "m" (m[0].u[25]), + "m" (m[0].u[30]), + "m" (m[0].u[31]), + "m" (m[0].u[32]), + 
"m" (m[0].u[33]) + : + "xmm2", "xmm3", "xmm4", "xmm11", + "xmm12", "xmm13"); + + __asm__ __volatile__ ("vmovss %0, %%xmm8 \n\t" + "vmovsd %1, %%xmm9 \n\t" + "vmovsd %3, %%xmm10" + : + : + "m" (m[1].u[3]), + "m" (m[1].u[26]), + "m" (m[1].u[27]), + "m" (m[1].u[28]), + "m" (m[1].u[29]) + : + "xmm8", "xmm9", "xmm10"); + + __asm__ __volatile__ ("vmovhps %0, %%xmm8, %%xmm8 \n\t" + "vmovhps %2, %%xmm9, %%xmm9 \n\t" + "vmovhps %4, %%xmm10, %%xmm10 \n\t" + "vinsertf128 $0x1, %%xmm8, %%ymm11, %%ymm11 \n\t" + "vinsertf128 $0x1, %%xmm9, %%ymm12, %%ymm12 \n\t" + "vinsertf128 $0x1, %%xmm10, %%ymm13, %%ymm13 \n\t" + "vsubps %%ymm14, %%ymm11, %%ymm11" + : + : + "m" (m[1].u[24]), + "m" (m[1].u[25]), + "m" (m[1].u[30]), + "m" (m[1].u[31]), + "m" (m[1].u[32]), + "m" (m[1].u[33]) + : + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13"); + + __asm__ __volatile__ ("vpermilps $0x4e, %%ymm11, %%ymm11 \n\t" + "vpermilps $0xb1, %%ymm11, %%ymm8 \n\t" + "vpermilps $0xb1, %%ymm12, %%ymm9 \n\t" + "vpermilps $0xb1, %%ymm13, %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm11, %%ymm11 \n\t" + "vmulps %%ymm1, %%ymm12, %%ymm12 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddps %%ymm11, %%ymm5, %%ymm5 \n\t" + "vaddps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vaddps %%ymm13, %%ymm7, %%ymm7 \n\t" + "vmulps %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulps %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulps %%ymm0, %%ymm10, %%ymm10" + : + : + : + "xmm5", "xmm6", "xmm7", "xmm8", + "xmm9", "xmm10", "xmm11", "xmm12", + "xmm13"); + + __asm__ __volatile__ ("vmovaps %0, %%xmm11 \n\t" + "vmovaps %4, %%xmm12 \n\t" + "vaddsubps %%ymm8, %%ymm5, %%ymm5 \n\t" + "vaddsubps %%ymm9, %%ymm6, %%ymm6 \n\t" + "vaddsubps %%ymm10, %%ymm7, %%ymm7" + : + : + "m" (m[0].u[12]), + "m" (m[0].u[13]), + "m" (m[0].u[14]), + "m" (m[0].u[15]), + "m" (m[0].u[20]), + "m" (m[0].u[21]), + "m" (m[0].u[22]), + "m" (m[0].u[23]) + : + "xmm5", "xmm6", "xmm7", "xmm11", + "xmm12"); + + __asm__ __volatile__ ("vmovups %0, %%xmm13 \n\t" + "vmovaps %4, %%xmm0" + : + : + "m" 
(m[0].u[26]), + "m" (m[0].u[27]), + "m" (m[0].u[28]), + "m" (m[0].u[29]), + "m" ((*s).c2.c2), + "m" ((*s).c2.c3) + : + "xmm0", "xmm13"); + + __asm__ __volatile__ ("vmovaps %0, %%xmm8 \n\t" + "vmovaps %4, %%xmm9" + : + : + "m" (m[1].u[12]), + "m" (m[1].u[13]), + "m" (m[1].u[14]), + "m" (m[1].u[15]), + "m" (m[1].u[20]), + "m" (m[1].u[21]), + "m" (m[1].u[22]), + "m" (m[1].u[23]) + : + "xmm8", "xmm9"); + + __asm__ __volatile__ ("vmovups %0, %%xmm10 \n\t" + "vinsertf128 $0x1, %4, %%ymm0, %%ymm0 \n\t" + "vinsertf128 $0x1, %%xmm8, %%ymm11, %%ymm11 \n\t" + "vinsertf128 $0x1, %%xmm9, %%ymm12, %%ymm12 \n\t" + "vinsertf128 $0x1, %%xmm10, %%ymm13, %%ymm13 \n\t" + "vmovshdup %%ymm0, %%ymm1 \n\t" + "vmovsldup %%ymm0, %%ymm0 \n\t" + "vpermilps $0xb1, %%ymm11, %%ymm8 \n\t" + "vpermilps $0xb1, %%ymm12, %%ymm9 \n\t" + "vpermilps $0xb1, %%ymm13, %%ymm10" + : + : + "m" (m[1].u[26]), + "m" (m[1].u[27]), + "m" (m[1].u[28]), + "m" (m[1].u[29]), + "m" ((*s).c4.c2), + "m" ((*s).c4.c3) + : + "xmm0", "xmm1", "xmm8", "xmm9", + "xmm10", "xmm11", "xmm12", "xmm13"); + + __asm__ __volatile__ ("vmulps %%ymm0, %%ymm11, %%ymm11 \n\t" + "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulps %%ymm0, %%ymm13, %%ymm13 \n\t" + "vaddps %%ymm11, %%ymm2, %%ymm2 \n\t" + "vaddps %%ymm12, %%ymm3, %%ymm3 \n\t" + "vaddps %%ymm13, %%ymm4, %%ymm4 \n\t" + "vmovups %0, %%xmm11 \n\t" + "vmovss %4, %%xmm12 \n\t" + "vmovss %5, %%xmm13 \n\t" + "vmulps %%ymm1, %%ymm8, %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm9, %%ymm9 \n\t" + "vmulps %%ymm1, %%ymm10, %%ymm10" + : + : + "m" (m[0].u[30]), + "m" (m[0].u[31]), + "m" (m[0].u[32]), + "m" (m[0].u[33]), + "m" (m[0].u[4]), + "m" (m[0].u[5]) + : + "xmm2", "xmm3", "xmm4", "xmm8", + "xmm9", "xmm10", "xmm11", "xmm12", + "xmm13"); + + __asm__ __volatile__ ("vmovhps %0, %%xmm12, %%xmm12 \n\t" + "vmovhps %0, %%xmm13, %%xmm13 \n\t" + "vaddsubps %%ymm8, %%ymm2, %%ymm2 \n\t" + "vaddsubps %%ymm9, %%ymm3, %%ymm3 \n\t" + "vaddsubps %%ymm10, %%ymm4, %%ymm4" + : + : + "m" (m[0].u[34]), + "m" (m[0].u[35]) 
+ : + "xmm2", "xmm3", "xmm4", "xmm12", + "xmm13"); + + __asm__ __volatile__ ("vmovss %0, %%xmm9 \n\t" + "vmovss %1, %%xmm10 \n\t" + "vmovups %2, %%xmm8 \n\t" + "vmovhps %6, %%xmm9, %%xmm9 \n\t" + "vmovhps %6, %%xmm10, %%xmm10 \n\t" + "vinsertf128 $0x1, %%xmm8, %%ymm11, %%ymm11 \n\t" + "vinsertf128 $0x1, %%xmm9, %%ymm12, %%ymm12 \n\t" + "vinsertf128 $0x1, %%xmm10, %%ymm13, %%ymm13 \n\t" + "vaddps %%ymm14, %%ymm12, %%ymm12 \n\t" + "vsubps %%ymm14, %%ymm13, %%ymm13" + : + : + "m" (m[1].u[4]), + "m" (m[1].u[5]), + "m" (m[1].u[30]), + "m" (m[1].u[31]), + "m" (m[1].u[32]), + "m" (m[1].u[33]), + "m" (m[1].u[34]), + "m" (m[1].u[35]) + : + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13"); + + __asm__ __volatile__ ("vpermilps $0xb1, %%ymm5, %%ymm5 \n\t" + "vpermilps $0xb1, %%ymm6, %%ymm6 \n\t" + "vpermilps $0x4e, %%ymm13, %%ymm13 \n\t" + "vpermilps $0xb1, %%ymm11, %%ymm8 \n\t" + "vpermilps $0xb1, %%ymm12, %%ymm9 \n\t" + "vpermilps $0xb1, %%ymm13, %%ymm10 \n\t" + "vmulps %%ymm1, %%ymm8, %%ymm8 \n\t" + "vmulps %%ymm1, %%ymm9, %%ymm9 \n\t" + "vmulps %%ymm0, %%ymm10, %%ymm10 \n\t" + "vmulps %%ymm0, %%ymm11, %%ymm11 \n\t" + "vmulps %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulps %%ymm1, %%ymm13, %%ymm13" + : + : + : + "xmm5", "xmm6", "xmm8", "xmm9", + "xmm10", "xmm11", "xmm12", "xmm13"); + + __asm__ __volatile__ ("vaddsubps %%ymm8, %%ymm5, %%ymm5 \n\t" + "vaddsubps %%ymm9, %%ymm6, %%ymm6 \n\t" + "vaddsubps %%ymm10, %%ymm7, %%ymm7 \n\t" + "vpermilps $0xd8, %%ymm2, %%ymm2 \n\t" + "vpermilps $0xd8, %%ymm3, %%ymm3 \n\t" + "vpermilps $0xd8, %%ymm4, %%ymm4 \n\t" + "vaddps %%ymm11, %%ymm5, %%ymm5 \n\t" + "vaddps %%ymm12, %%ymm6, %%ymm6 \n\t" + "vaddps %%ymm13, %%ymm7, %%ymm7 \n\t" + "vpermilps $0xd8, %%ymm5, %%ymm5 \n\t" + "vpermilps $0xd8, %%ymm6, %%ymm6 \n\t" + "vpermilps $0x8d, %%ymm7, %%ymm7 \n\t" + "vhaddps %%ymm3, %%ymm2, %%ymm2 \n\t" + "vhaddps %%ymm5, %%ymm4, %%ymm4 \n\t" + "vhaddps %%ymm7, %%ymm6, %%ymm6 \n\t" + "vmovaps %%xmm2, %0 \n\t" + "vmovaps %%xmm4, %2 \n\t" + "vmovaps 
%%xmm6, %4" + : + "=m" ((*r).c1.c1), + "=m" ((*r).c1.c2), + "=m" ((*r).c1.c3), + "=m" ((*r).c2.c1), + "=m" ((*r).c2.c2), + "=m" ((*r).c2.c3) + : + : + "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); + + __asm__ __volatile__ ("vextractf128 $0x1, %%ymm2, %0 \n\t" + "vextractf128 $0x1, %%ymm4, %2 \n\t" + "vextractf128 $0x1, %%ymm6, %4" + : + "=m" ((*r).c3.c1), + "=m" ((*r).c3.c2), + "=m" ((*r).c3.c3), + "=m" ((*r).c4.c1), + "=m" ((*r).c4.c2), + "=m" ((*r).c4.c3)); + + _avx_zeroupper(); +} + +#else + +void mul_pauli2(float mu,pauli *m,spinor *s,spinor *r) +{ + spin_t *ps,*pr; + + ps=(spin_t*)(s); + pr=(spin_t*)(r); + + mul_pauli(mu,m,(*ps).w,(*pr).w); + mul_pauli(-mu,m+1,(*ps).w+1,(*pr).w+1); +} + +#endif + +void assign_pauli(int vol,pauli_dble *md,pauli *m) +{ + float *u; + double *ud,*um; + pauli_dble *mm; + + mm=md+vol; + + for (;md +#include +#include +#include +#include "su3.h" +#include "linalg.h" +#include "sw_term.h" + +#define DELTA 1.0e-04 + +typedef union +{ + spinor_dble s; + weyl_dble w[2]; + complex_dble c[12]; +} spin_t; + +static double rr[5] ALIGNED16; +static complex_dble aa[36],cc[6],dd[6] ALIGNED16; + +#if (defined QPX) +static pauli_dble m1; +#endif + +#if (defined x64) +#include "sse2.h" + +#if (defined AVX) +#include "avx.h" + +void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r) +{ + m+=2; + _prefetch_pauli_dble(m); + m-=2; + + __asm__ __volatile__ ("vmovsd %0, %%xmm14 \n\t" + "vmovsd %1, %%xmm2 \n\t" + "vmovsd %2, %%xmm3 \n\t" + "vmovapd %3, %%xmm4 \n\t" + "vpermilpd $0x1, %%xmm14, %%xmm14" + : + : + "m" (mu), + "m" ((*m).u[0]), + "m" ((*m).u[1]), + "m" ((*m).u[8]), + "m" ((*m).u[9]) + : + "xmm2", "xmm3", "xmm4", "xmm14"); + + __asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm2, %%ymm2 \n\t" + "vinsertf128 $0x1, %0, %%ymm3, %%ymm3 \n\t" + "vinsertf128 $0x1, %2, %%ymm4, %%ymm4 \n\t" + "vmovddup %4, %%ymm0 \n\t" + "vmovddup %5, %%ymm1 \n\t" + "vaddpd %%ymm14, %%ymm2, %%ymm2 \n\t" + "vsubpd %%ymm14, %%ymm3, %%ymm3 \n\t" + 
"vpermilpd $0x5, %%ymm4, %%ymm10 \n\t" + "vperm2f128 $0x1, %%ymm3, %%ymm3, %%ymm3 \n\t" + "vpermilpd $0x5, %%ymm2, %%ymm8 \n\t" + "vpermilpd $0x5, %%ymm3, %%ymm9" + : + : + "m" ((*m).u[6]), + "m" ((*m).u[7]), + "m" ((*m).u[16]), + "m" ((*m).u[17]), + "m" ((*s).c1.c1.re), + "m" ((*s).c1.c1.im), + "m" ((*s).c1.c2.re), + "m" ((*s).c1.c2.im) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm8", "xmm9", "xmm10"); + + __asm__ __volatile__ ("vmulpd %%ymm0, %%ymm2, %%ymm2 \n\t" + "vmulpd %%ymm1, %%ymm3, %%ymm3 \n\t" + "vmulpd %%ymm1, %%ymm4, %%ymm4 \n\t" + "vmovapd %0, %%xmm5 \n\t" + "vmovapd %2, %%xmm6 \n\t" + "vmovapd %4, %%xmm7" + : + : + "m" ((*m).u[10]), + "m" ((*m).u[11]), + "m" ((*m).u[12]), + "m" ((*m).u[13]), + "m" ((*m).u[14]), + "m" ((*m).u[15]) + : + "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7"); + + s+=4; + _prefetch_weyl(s); + s-=4; + + __asm__ __volatile__ ("vmulpd %%ymm1, %%ymm8, %%ymm8 \n\t" + "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" + "vinsertf128 $0x1, %0, %%ymm5, %%ymm5 \n\t" + "vinsertf128 $0x1, %2, %%ymm6, %%ymm6 \n\t" + "vinsertf128 $0x1, %4, %%ymm7, %%ymm7 \n\t" + "vaddsubpd %%ymm8, %%ymm2, %%ymm2 \n\t" + "vaddsubpd %%ymm9, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm10, %%ymm4, %%ymm4" + : + : + "m" ((*m).u[18]), + "m" ((*m).u[19]), + "m" ((*m).u[20]), + "m" ((*m).u[21]), + "m" ((*m).u[22]), + "m" ((*m).u[23]) + : + "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6", "xmm7", "xmm8", "xmm9", + "xmm10"); + + __asm__ __volatile__ ("vpermilpd $0x5, %%ymm5, %%ymm8 \n\t" + "vpermilpd $0x5, %%ymm6, %%ymm9 \n\t" + "vpermilpd $0x5, %%ymm7, %%ymm10 \n\t" + "vpermilpd $0x5, %%ymm3, %%ymm3 \n\t" + "vpermilpd $0x5, %%ymm4, %%ymm4 \n\t" + "vmulpd %%ymm1, %%ymm5, %%ymm5 \n\t" + "vmulpd %%ymm1, %%ymm6, %%ymm6 \n\t" + "vmulpd %%ymm1, %%ymm7, %%ymm7 \n\t" + "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm10, %%ymm10" + : + : + : + "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", 
"xmm10"); + + __asm__ __volatile__ ("vmovsd %0, %%xmm13 \n\t" + "vmovapd %1, %%ymm11" + : + : + "m" ((*m).u[2]), + "m" ((*m).u[8]), + "m" ((*m).u[9]), + "m" ((*m).u[10]), + "m" ((*m).u[11]) + : + "xmm11", "xmm13"); + + __asm__ __volatile__ ("vmovapd %0, %%ymm12 \n\t" + "vaddsubpd %%ymm8, %%ymm5, %%ymm5 \n\t" + "vaddsubpd %%ymm9, %%ymm6, %%ymm6 \n\t" + "vaddsubpd %%ymm10, %%ymm7, %%ymm7 \n\t" + "vinsertf128 $0x1, %4, %%ymm13, %%ymm13 \n\t" + "vmovddup %6, %%ymm0 \n\t" + "vmovddup %7, %%ymm1 \n\t" + "vaddpd %%ymm14, %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5, %%ymm11, %%ymm8 \n\t" + "vpermilpd $0x5, %%ymm12, %%ymm9 \n\t" + "vpermilpd $0x5, %%ymm13, %%ymm10" + : + : + "m" ((*m).u[16]), + "m" ((*m).u[17]), + "m" ((*m).u[18]), + "m" ((*m).u[19]), + "m" ((*m).u[24]), + "m" ((*m).u[25]), + "m" ((*s).c1.c3.re), + "m" ((*s).c1.c3.im), + "m" ((*s).c2.c1.re), + "m" ((*s).c2.c1.im) + : + "xmm0", "xmm1", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", + "xmm12", "xmm13"); + + __asm__ __volatile__ ("vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" + "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" + "vaddpd %%ymm11, %%ymm2, %%ymm2 \n\t" + "vaddpd %%ymm12, %%ymm3, %%ymm3 \n\t" + "vaddpd %%ymm13, %%ymm4, %%ymm4 \n\t" + "vmovsd %0, %%xmm11 \n\t" + "vmovapd %1, %%xmm12 \n\t" + "vmovapd %3, %%xmm13 \n\t" + "vmulpd %%ymm1, %%ymm8, %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm9, %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm10, %%ymm10" + : + : + "m" ((*m).u[3]), + "m" ((*m).u[26]), + "m" ((*m).u[27]), + "m" ((*m).u[28]), + "m" ((*m).u[29]) + : + "xmm2", "xmm3", "xmm4", "xmm8", + "xmm9", "xmm10", "xmm11", "xmm12", + "xmm13"); + + __asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm11, %%ymm11 \n\t" + "vinsertf128 $0x1, %2, %%ymm12, %%ymm12 \n\t" + "vinsertf128 $0x1, %4, %%ymm13, %%ymm13 \n\t" + "vsubpd %%ymm14, %%ymm11, %%ymm11 \n\t" + "vaddsubpd %%ymm8, %%ymm2, %%ymm2 \n\t" + "vaddsubpd %%ymm9, %%ymm3, %%ymm3 \n\t" + "vaddsubpd %%ymm10, %%ymm4, %%ymm4" + : + : + "m" ((*m).u[24]), + 
"m" ((*m).u[25]), + "m" ((*m).u[30]), + "m" ((*m).u[31]), + "m" ((*m).u[32]), + "m" ((*m).u[33]) + : + "xmm2", "xmm3", "xmm4", "xmm11", + "xmm12", "xmm13"); + + __asm__ __volatile__ ("vperm2f128 $0x1, %%ymm11, %%ymm11, %%ymm11\n\t" + "vpermilpd $0x5, %%ymm11, %%ymm8 \n\t" + "vpermilpd $0x5, %%ymm12, %%ymm9 \n\t" + "vpermilpd $0x5, %%ymm13, %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm11, %%ymm11 \n\t" + "vmulpd %%ymm1, %%ymm12, %%ymm12 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13 \n\t" + "vaddpd %%ymm11, %%ymm5, %%ymm5 \n\t" + "vaddpd %%ymm12, %%ymm6, %%ymm6 \n\t" + "vaddpd %%ymm13, %%ymm7, %%ymm7 \n\t" + "vmulpd %%ymm0, %%ymm8, %%ymm8 \n\t" + "vmulpd %%ymm0, %%ymm9, %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm10, %%ymm10" + : + : + : + "xmm5", "xmm6","xmm7", "xmm8", + "xmm9", "xmm10", "xmm11", "xmm12", + "xmm13"); + + __asm__ __volatile__ ("vmovapd %0, %%ymm11 \n\t" + "vmovapd %4, %%ymm12" + : + : + "m" ((*m).u[12]), + "m" ((*m).u[13]), + "m" ((*m).u[14]), + "m" ((*m).u[15]), + "m" ((*m).u[20]), + "m" ((*m).u[21]), + "m" ((*m).u[22]), + "m" ((*m).u[23]) + : + "xmm11", "xmm12"); + + __asm__ __volatile__ ("vmovupd %0, %%ymm13 \n\t" + "vaddsubpd %%ymm8, %%ymm5, %%ymm5 \n\t" + "vaddsubpd %%ymm9, %%ymm6, %%ymm6 \n\t" + "vaddsubpd %%ymm10, %%ymm7, %%ymm7 \n\t" + "vmovddup %4, %%ymm0 \n\t" + "vmovddup %5, %%ymm1 \n\t" + "vpermilpd $0x5, %%ymm11, %%ymm8 \n\t" + "vpermilpd $0x5, %%ymm12, %%ymm9 \n\t" + "vpermilpd $0x5, %%ymm13, %%ymm10" + : + : + "m" ((*m).u[26]), + "m" ((*m).u[27]), + "m" ((*m).u[28]), + "m" ((*m).u[29]), + "m" ((*s).c2.c2.re), + "m" ((*s).c2.c2.im), + "m" ((*s).c2.c3.re), + "m" ((*s).c2.c3.im) + + : + "xmm0", "xmm1", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", + "xmm13"); + + __asm__ __volatile__ ("vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" + "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulpd %%ymm0, %%ymm13, %%ymm13 \n\t" + "vaddpd %%ymm11, %%ymm2, %%ymm2 \n\t" + "vaddpd %%ymm12, %%ymm3, %%ymm3 \n\t" + "vaddpd %%ymm13, %%ymm4, %%ymm4 \n\t" + "vmovupd %0, %%ymm11 \n\t" + 
"vmovsd %4, %%xmm12 \n\t" + "vmovsd %5, %%xmm13 \n\t" + "vmulpd %%ymm1, %%ymm8, %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm9, %%ymm9 \n\t" + "vmulpd %%ymm1, %%ymm10, %%ymm10" + : + : + "m" ((*m).u[30]), + "m" ((*m).u[31]), + "m" ((*m).u[32]), + "m" ((*m).u[33]), + "m" ((*m).u[4]), + "m" ((*m).u[5]) + : + "xmm2", "xmm3", "xmm4", "xmm8", + "xmm9", "xmm10", "xmm11", "xmm12", + "xmm13"); + + __asm__ __volatile__ ("vinsertf128 $0x1, %0, %%ymm12, %%ymm12 \n\t" + "vinsertf128 $0x1, %0, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm8, %%ymm2, %%ymm2 \n\t" + "vaddsubpd %%ymm9, %%ymm3, %%ymm3 \n\t" + "vaddpd %%ymm14, %%ymm12, %%ymm12 \n\t" + "vsubpd %%ymm14, %%ymm13, %%ymm13 \n\t" + "vaddsubpd %%ymm10, %%ymm4, %%ymm4" + : + : + "m" ((*m).u[34]), + "m" ((*m).u[35]) + : + "xmm2", "xmm3", "xmm4", "xmm12", + "xmm13"); + + __asm__ __volatile__ ("vperm2f128 $0x1, %%ymm13, %%ymm13, %%ymm13 \n\t" + "vpermilpd $0x5, %%ymm11, %%ymm8 \n\t" + "vpermilpd $0x5, %%ymm12, %%ymm9 \n\t" + "vpermilpd $0x5, %%ymm13, %%ymm10 \n\t" + "vmulpd %%ymm1, %%ymm8, %%ymm8 \n\t" + "vmulpd %%ymm1, %%ymm9, %%ymm9 \n\t" + "vmulpd %%ymm0, %%ymm10, %%ymm10 \n\t" + "vpermilpd $0x5, %%ymm5, %%ymm5 \n\t" + "vpermilpd $0x5, %%ymm6, %%ymm6 \n\t" + "vmulpd %%ymm0, %%ymm11, %%ymm11 \n\t" + "vmulpd %%ymm0, %%ymm12, %%ymm12 \n\t" + "vmulpd %%ymm1, %%ymm13, %%ymm13" + : + : + : + "xmm5", "xmm6", "xmm8", "xmm9", + "xmm10", "xmm11", "xmm12", "xmm13"); + + __asm__ __volatile__ ("vaddsubpd %%ymm8, %%ymm5, %%ymm5 \n\t" + "vaddsubpd %%ymm9, %%ymm6, %%ymm6 \n\t" + "vaddsubpd %%ymm10, %%ymm7, %%ymm7 \n\t" + "vaddpd %%ymm11, %%ymm5, %%ymm5 \n\t" + "vaddpd %%ymm12, %%ymm6, %%ymm6 \n\t" + "vaddpd %%ymm13, %%ymm7, %%ymm7 \n\t" + "vpermilpd $0x5, %%ymm7, %%ymm7 \n\t" + "vblendpd $0x3, %%ymm2, %%ymm3, %%ymm8 \n\t" + "vblendpd $0x3, %%ymm3, %%ymm2, %%ymm9 \n\t" + "vblendpd $0x3, %%ymm4, %%ymm5, %%ymm10 \n\t" + "vblendpd $0x3, %%ymm5, %%ymm4, %%ymm11 \n\t" + "vblendpd $0x3, %%ymm6, %%ymm7, %%ymm12 \n\t" + "vblendpd $0x3, %%ymm7, %%ymm6, %%ymm13 
\n\t" + "vperm2f128 $0x1, %%ymm9, %%ymm9, %%ymm9 \n\t" + "vperm2f128 $0x1, %%ymm11, %%ymm11, %%ymm11 \n\t" + "vperm2f128 $0x1, %%ymm13, %%ymm13, %%ymm13 \n\t" + "vaddpd %%ymm8, %%ymm9, %%ymm2 \n\t" + "vaddpd %%ymm10, %%ymm11, %%ymm4 \n\t" + "vaddpd %%ymm12, %%ymm13, %%ymm6 \n\t" + "vmovapd %%ymm2, %0 \n\t" + "vmovapd %%ymm4, %2 \n\t" + "vmovapd %%ymm6, %4" + : + "=m" ((*r).c1.c1), + "=m" ((*r).c1.c2), + "=m" ((*r).c1.c3), + "=m" ((*r).c2.c1), + "=m" ((*r).c2.c2), + "=m" ((*r).c2.c3) + : + : + "xmm2", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm8", "xmm9", "xmm10", + "xmm11", "xmm12", "xmm13"); + + _avx_zeroupper(); +} + +#else + +void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r) +{ + m+=2; + _prefetch_pauli_dble(m); + m-=2; + + __asm__ __volatile__ ("movddup %0, %%xmm10 \n\t" + "movapd %1, %%xmm13 \n\t" + "movapd %%xmm10, %%xmm11 \n\t" + "movapd %%xmm13, %%xmm14 \n\t" + "movapd %%xmm10, %%xmm12 \n\t" + "movapd %%xmm13, %%xmm15" + : + : + "m" (mu), + "m" (_sse_sgn2_dble) + : + "xmm10", "xmm11", "xmm12", "xmm13", + "xmm14", "xmm15"); + + __asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" + "movapd %1, %%xmm1 \n\t" + "movapd %2, %%xmm2 \n\t" + "movapd %3, %%xmm3 \n\t" + "movapd %4, %%xmm4 \n\t" + "movapd %5, %%xmm5 \n\t" + "mulpd %%xmm10, %%xmm0 \n\t" + "mulpd %%xmm11, %%xmm1 \n\t" + "mulpd %%xmm12, %%xmm2 \n\t" + "mulpd %%xmm10, %%xmm3 \n\t" + "mulpd %%xmm11, %%xmm4 \n\t" + "mulpd %%xmm12, %%xmm5" + : + : + "m" ((*s).c1.c1), + "m" ((*s).c1.c2), + "m" ((*s).c1.c3), + "m" ((*s).c2.c1), + "m" ((*s).c2.c2), + "m" ((*s).c2.c3) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5"); + + __asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" + "movddup %1, %%xmm7 \n\t" + "movddup %2, %%xmm8 \n\t" + "movddup %3, %%xmm9 \n\t" + "movddup %4, %%xmm10 \n\t" + "movddup %5, %%xmm11" + : + : + "m" ((*m).u[7]), + "m" ((*m).u[17]), + "m" ((*m).u[25]), + "m" ((*m).u[31]), + "m" ((*m).u[35]), + "m" ((*m).u[15]) + : + "xmm6", "xmm7", "xmm8", "xmm9", + "xmm10", "xmm11"); + + s+=2; + 
_prefetch_weyl_dble(s); + s-=2; + + __asm__ __volatile__ ("mulpd %0, %%xmm6 \n\t" + "mulpd %1, %%xmm7 \n\t" + "mulpd %2, %%xmm8 \n\t" + "mulpd %3, %%xmm9 \n\t" + "mulpd %4, %%xmm10 \n\t" + "mulpd %5, %%xmm11 \n\t" + "addpd %%xmm6, %%xmm0 \n\t" + "addpd %%xmm7, %%xmm1 \n\t" + "addpd %%xmm8, %%xmm2 \n\t" + "addpd %%xmm9, %%xmm3 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "subpd %%xmm11, %%xmm5" + : + : + "m" ((*s).c1.c2), + "m" ((*s).c1.c3), + "m" ((*s).c2.c1), + "m" ((*s).c2.c2), + "m" ((*s).c2.c3), + "m" ((*s).c1.c1) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11"); + + __asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" + "movddup %1, %%xmm7 \n\t" + "movddup %2, %%xmm8 \n\t" + "movddup %3, %%xmm9 \n\t" + "movddup %4, %%xmm10 \n\t" + "movddup %5, %%xmm11" + : + : + "m" ((*m).u[9]), + "m" ((*m).u[19]), + "m" ((*m).u[27]), + "m" ((*m).u[33]), + "m" ((*m).u[13]), + "m" ((*m).u[23]) + : + "xmm6", "xmm7", "xmm8", "xmm9", + "xmm10", "xmm11"); + + __asm__ __volatile__ ("mulpd %0, %%xmm6 \n\t" + "mulpd %1, %%xmm7 \n\t" + "mulpd %2, %%xmm8 \n\t" + "mulpd %3, %%xmm9 \n\t" + "mulpd %4, %%xmm10 \n\t" + "mulpd %5, %%xmm11 \n\t" + "addpd %%xmm6, %%xmm0 \n\t" + "addpd %%xmm7, %%xmm1 \n\t" + "addpd %%xmm8, %%xmm2 \n\t" + "addpd %%xmm9, %%xmm3 \n\t" + "subpd %%xmm10, %%xmm4 \n\t" + "subpd %%xmm11, %%xmm5" + : + : + "m" ((*s).c1.c3), + "m" ((*s).c2.c1), + "m" ((*s).c2.c2), + "m" ((*s).c2.c3), + "m" ((*s).c1.c1), + "m" ((*s).c1.c2) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11"); + + __asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" + "movddup %1, %%xmm7 \n\t" + "movddup %2, %%xmm8 \n\t" + "movddup %3, %%xmm9 \n\t" + "movddup %4, %%xmm10 \n\t" + "movddup %5, %%xmm11" + : + : + "m" ((*m).u[11]), + "m" ((*m).u[21]), + "m" ((*m).u[29]), + "m" ((*m).u[11]), + "m" ((*m).u[21]), + "m" ((*m).u[29]) + : + "xmm6", "xmm7", "xmm8", "xmm9", + "xmm10", "xmm11"); + + __asm__ __volatile__ 
("mulpd %0, %%xmm6 \n\t" + "mulpd %1, %%xmm7 \n\t" + "mulpd %2, %%xmm8 \n\t" + "mulpd %3, %%xmm9 \n\t" + "mulpd %4, %%xmm10 \n\t" + "mulpd %5, %%xmm11 \n\t" + "addpd %%xmm6, %%xmm0 \n\t" + "addpd %%xmm7, %%xmm1 \n\t" + "addpd %%xmm8, %%xmm2 \n\t" + "subpd %%xmm9, %%xmm3 \n\t" + "subpd %%xmm10, %%xmm4 \n\t" + "subpd %%xmm11, %%xmm5" + : + : + "m" ((*s).c2.c1), + "m" ((*s).c2.c2), + "m" ((*s).c2.c3), + "m" ((*s).c1.c1), + "m" ((*s).c1.c2), + "m" ((*s).c1.c3) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11"); + + __asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" + "movddup %1, %%xmm7 \n\t" + "movddup %2, %%xmm8 \n\t" + "movddup %3, %%xmm9 \n\t" + "movddup %4, %%xmm10 \n\t" + "movddup %5, %%xmm11" + : + : + "m" ((*m).u[13]), + "m" ((*m).u[23]), + "m" ((*m).u[9]), + "m" ((*m).u[19]), + "m" ((*m).u[27]), + "m" ((*m).u[33]) + : + "xmm6", "xmm7", "xmm8", "xmm9", + "xmm10", "xmm11"); + + __asm__ __volatile__ ("mulpd %0, %%xmm6 \n\t" + "mulpd %1, %%xmm7 \n\t" + "mulpd %2, %%xmm8 \n\t" + "mulpd %3, %%xmm9 \n\t" + "mulpd %4, %%xmm10 \n\t" + "mulpd %5, %%xmm11 \n\t" + "addpd %%xmm6, %%xmm0 \n\t" + "addpd %%xmm7, %%xmm1 \n\t" + "subpd %%xmm8, %%xmm2 \n\t" + "subpd %%xmm9, %%xmm3 \n\t" + "subpd %%xmm10, %%xmm4 \n\t" + "subpd %%xmm11, %%xmm5" + : + : + "m" ((*s).c2.c2), + "m" ((*s).c2.c3), + "m" ((*s).c1.c1), + "m" ((*s).c1.c2), + "m" ((*s).c1.c3), + "m" ((*s).c2.c1) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11"); + + __asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" + "movddup %1, %%xmm7 \n\t" + "movddup %2, %%xmm8 \n\t" + "movddup %3, %%xmm9 \n\t" + "movddup %4, %%xmm10 \n\t" + "movddup %5, %%xmm11" + : + : + "m" ((*m).u[15]), + "m" ((*m).u[7]), + "m" ((*m).u[17]), + "m" ((*m).u[25]), + "m" ((*m).u[31]), + "m" ((*m).u[35]) + : + "xmm6", "xmm7", "xmm8", "xmm9", + "xmm10", "xmm11"); + + __asm__ __volatile__ ("mulpd %0, %%xmm6 \n\t" + "mulpd %1, %%xmm7 \n\t" + "mulpd 
%2, %%xmm8 \n\t" + "mulpd %3, %%xmm9 \n\t" + "mulpd %4, %%xmm10 \n\t" + "mulpd %5, %%xmm11 \n\t" + "addpd %%xmm6, %%xmm0 \n\t" + "subpd %%xmm7, %%xmm1 \n\t" + "subpd %%xmm8, %%xmm2 \n\t" + "subpd %%xmm9, %%xmm3 \n\t" + "subpd %%xmm10, %%xmm4 \n\t" + "subpd %%xmm11, %%xmm5" + : + : + "m" ((*s).c2.c3), + "m" ((*s).c1.c1), + "m" ((*s).c1.c2), + "m" ((*s).c1.c3), + "m" ((*s).c2.c1), + "m" ((*s).c2.c2) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11"); + + __asm__ __volatile__ ("mulpd %%xmm13, %%xmm0 \n\t" + "mulpd %%xmm14, %%xmm1 \n\t" + "mulpd %%xmm15, %%xmm2 \n\t" + "mulpd %%xmm13, %%xmm3 \n\t" + "mulpd %%xmm14, %%xmm4 \n\t" + "mulpd %%xmm15, %%xmm5 \n\t" + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" + "shufpd $0x1, %%xmm3, %%xmm3 \n\t" + "shufpd $0x1, %%xmm4, %%xmm4 \n\t" + "shufpd $0x1, %%xmm5, %%xmm5" + : + : + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5"); + + __asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" + "movddup %1, %%xmm7 \n\t" + "movddup %2, %%xmm8 \n\t" + "movddup %3, %%xmm9 \n\t" + "movddup %4, %%xmm10 \n\t" + "movddup %5, %%xmm11" + : + : + "m" ((*m).u[0]), + "m" ((*m).u[1]), + "m" ((*m).u[2]), + "m" ((*m).u[3]), + "m" ((*m).u[4]), + "m" ((*m).u[5]) + : + "xmm6", "xmm7", "xmm8", "xmm9", + "xmm10", "xmm11"); + + __asm__ __volatile__ ("mulpd %0, %%xmm6 \n\t" + "mulpd %1, %%xmm7 \n\t" + "mulpd %2, %%xmm8 \n\t" + "mulpd %3, %%xmm9 \n\t" + "mulpd %4, %%xmm10 \n\t" + "mulpd %5, %%xmm11 \n\t" + "addpd %%xmm6, %%xmm0 \n\t" + "addpd %%xmm7, %%xmm1 \n\t" + "addpd %%xmm8, %%xmm2 \n\t" + "addpd %%xmm9, %%xmm3 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5" + : + : + "m" ((*s).c1.c1), + "m" ((*s).c1.c2), + "m" ((*s).c1.c3), + "m" ((*s).c2.c1), + "m" ((*s).c2.c2), + "m" ((*s).c2.c3) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11"); + + __asm__ __volatile__ 
("movddup %0, %%xmm6 \n\t" + "movddup %1, %%xmm7 \n\t" + "movddup %2, %%xmm8 \n\t" + "movddup %3, %%xmm9 \n\t" + "movddup %4, %%xmm10 \n\t" + "movddup %5, %%xmm11" + : + : + "m" ((*m).u[6]), + "m" ((*m).u[16]), + "m" ((*m).u[24]), + "m" ((*m).u[30]), + "m" ((*m).u[34]), + "m" ((*m).u[14]) + : + "xmm6", "xmm7", "xmm8", "xmm9", + "xmm10", "xmm11"); + + __asm__ __volatile__ ("mulpd %0, %%xmm6 \n\t" + "mulpd %1, %%xmm7 \n\t" + "mulpd %2, %%xmm8 \n\t" + "mulpd %3, %%xmm9 \n\t" + "mulpd %4, %%xmm10 \n\t" + "mulpd %5, %%xmm11 \n\t" + "addpd %%xmm6, %%xmm0 \n\t" + "addpd %%xmm7, %%xmm1 \n\t" + "addpd %%xmm8, %%xmm2 \n\t" + "addpd %%xmm9, %%xmm3 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5" + : + : + "m" ((*s).c1.c2), + "m" ((*s).c1.c3), + "m" ((*s).c2.c1), + "m" ((*s).c2.c2), + "m" ((*s).c2.c3), + "m" ((*s).c1.c1) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11"); + + __asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" + "movddup %1, %%xmm7 \n\t" + "movddup %2, %%xmm8 \n\t" + "movddup %3, %%xmm9 \n\t" + "movddup %4, %%xmm10 \n\t" + "movddup %5, %%xmm11" + : + : + "m" ((*m).u[8]), + "m" ((*m).u[18]), + "m" ((*m).u[26]), + "m" ((*m).u[32]), + "m" ((*m).u[12]), + "m" ((*m).u[22]) + : + "xmm6", "xmm7", "xmm8", "xmm9", + "xmm10", "xmm11"); + + __asm__ __volatile__ ("mulpd %0, %%xmm6 \n\t" + "mulpd %1, %%xmm7 \n\t" + "mulpd %2, %%xmm8 \n\t" + "mulpd %3, %%xmm9 \n\t" + "mulpd %4, %%xmm10 \n\t" + "mulpd %5, %%xmm11 \n\t" + "addpd %%xmm6, %%xmm0 \n\t" + "addpd %%xmm7, %%xmm1 \n\t" + "addpd %%xmm8, %%xmm2 \n\t" + "addpd %%xmm9, %%xmm3 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5" + : + : + "m" ((*s).c1.c3), + "m" ((*s).c2.c1), + "m" ((*s).c2.c2), + "m" ((*s).c2.c3), + "m" ((*s).c1.c1), + "m" ((*s).c1.c2) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11"); + + __asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" + "movddup %1, %%xmm7 \n\t" + 
"movddup %2, %%xmm8 \n\t" + "movddup %3, %%xmm9 \n\t" + "movddup %4, %%xmm10 \n\t" + "movddup %5, %%xmm11" + : + : + "m" ((*m).u[10]), + "m" ((*m).u[20]), + "m" ((*m).u[28]), + "m" ((*m).u[10]), + "m" ((*m).u[20]), + "m" ((*m).u[28]) + : + "xmm6", "xmm7", "xmm8", "xmm9", + "xmm10", "xmm11"); + + __asm__ __volatile__ ("mulpd %0, %%xmm6 \n\t" + "mulpd %1, %%xmm7 \n\t" + "mulpd %2, %%xmm8 \n\t" + "mulpd %3, %%xmm9 \n\t" + "mulpd %4, %%xmm10 \n\t" + "mulpd %5, %%xmm11 \n\t" + "addpd %%xmm6, %%xmm0 \n\t" + "addpd %%xmm7, %%xmm1 \n\t" + "addpd %%xmm8, %%xmm2 \n\t" + "addpd %%xmm9, %%xmm3 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5" + : + : + "m" ((*s).c2.c1), + "m" ((*s).c2.c2), + "m" ((*s).c2.c3), + "m" ((*s).c1.c1), + "m" ((*s).c1.c2), + "m" ((*s).c1.c3) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11"); + + __asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" + "movddup %1, %%xmm7 \n\t" + "movddup %2, %%xmm8 \n\t" + "movddup %3, %%xmm9 \n\t" + "movddup %4, %%xmm10 \n\t" + "movddup %5, %%xmm11" + : + : + "m" ((*m).u[12]), + "m" ((*m).u[22]), + "m" ((*m).u[8]), + "m" ((*m).u[18]), + "m" ((*m).u[26]), + "m" ((*m).u[32]) + : + "xmm6", "xmm7", "xmm8", "xmm9", + "xmm10", "xmm11"); + + __asm__ __volatile__ ("mulpd %0, %%xmm6 \n\t" + "mulpd %1, %%xmm7 \n\t" + "mulpd %2, %%xmm8 \n\t" + "mulpd %3, %%xmm9 \n\t" + "mulpd %4, %%xmm10 \n\t" + "mulpd %5, %%xmm11 \n\t" + "addpd %%xmm6, %%xmm0 \n\t" + "addpd %%xmm7, %%xmm1 \n\t" + "addpd %%xmm8, %%xmm2 \n\t" + "addpd %%xmm9, %%xmm3 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5" + : + : + "m" ((*s).c2.c2), + "m" ((*s).c2.c3), + "m" ((*s).c1.c1), + "m" ((*s).c1.c2), + "m" ((*s).c1.c3), + "m" ((*s).c2.c1) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11"); + + __asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" + "movddup %1, %%xmm7 \n\t" + "movddup %2, %%xmm8 \n\t" + "movddup %3, %%xmm9 \n\t" + 
"movddup %4, %%xmm10 \n\t" + "movddup %5, %%xmm11" + : + : + "m" ((*m).u[14]), + "m" ((*m).u[6]), + "m" ((*m).u[16]), + "m" ((*m).u[24]), + "m" ((*m).u[30]), + "m" ((*m).u[34]) + : + "xmm6", "xmm7", "xmm8", "xmm9", + "xmm10", "xmm11"); + + __asm__ __volatile__ ("mulpd %0, %%xmm6 \n\t" + "mulpd %1, %%xmm7 \n\t" + "mulpd %2, %%xmm8 \n\t" + "mulpd %3, %%xmm9 \n\t" + "mulpd %4, %%xmm10 \n\t" + "mulpd %5, %%xmm11 \n\t" + "addpd %%xmm6, %%xmm0 \n\t" + "addpd %%xmm7, %%xmm1 \n\t" + "addpd %%xmm8, %%xmm2 \n\t" + "addpd %%xmm9, %%xmm3 \n\t" + "addpd %%xmm10, %%xmm4 \n\t" + "addpd %%xmm11, %%xmm5" + : + : + "m" ((*s).c2.c3), + "m" ((*s).c1.c1), + "m" ((*s).c1.c2), + "m" ((*s).c1.c3), + "m" ((*s).c2.c1), + "m" ((*s).c2.c2) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11"); + + __asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" + "movapd %%xmm1, %1 \n\t" + "movapd %%xmm2, %2 \n\t" + "movapd %%xmm3, %3 \n\t" + "movapd %%xmm4, %4 \n\t" + "movapd %%xmm5, %5" + : + "=m" ((*r).c1.c1), + "=m" ((*r).c1.c2), + "=m" ((*r).c1.c3), + "=m" ((*r).c2.c1), + "=m" ((*r).c2.c2), + "=m" ((*r).c2.c3)); +} + +#endif + +static int fwd_house(double eps) +{ + int i,j,k,ifail; + double r1,r2,r3; + complex_dble z,*ak,*aj; + + ifail=0; + + for (k=0;k<5;k++) + { + r1=aa[6*k+k].re*aa[6*k+k].re+aa[6*k+k].im*aa[6*k+k].im; + r2=sqrt(r1); + + for (j=(k+1);j<6;j++) + r1+=(aa[6*j+k].re*aa[6*j+k].re+aa[6*j+k].im*aa[6*j+k].im); + + if (r1>=eps) + r1=sqrt(r1); + else + { + ifail=1; + r1=1.0; + } + + if (r2>=(DBL_EPSILON*r1)) + { + r3=1.0/r2; + z.re=r3*aa[6*k+k].re; + z.im=r3*aa[6*k+k].im; + } + else + { + z.re=1.0; + z.im=0.0; + } + + aa[6*k+k].re+=r1*z.re; + aa[6*k+k].im+=r1*z.im; + + r3=1.0/(r1*(r1+r2)); + rr[k]=r3; + dd[k].re=-(r1+r2)*r3*z.re; + dd[k].im= (r1+r2)*r3*z.im; + + for (j=(k+1);j<6;j++) + { + __asm__ __volatile__ ("xorpd %%xmm7, %%xmm7" + : + : + : + "xmm7"); + + ak=aa+6*k+k; + aj=aa+6*k+j; + + for (i=k;i<6;i++) + { + __asm__ __volatile__ 
("movddup %0, %%xmm0 \n\t" + "movddup %1, %%xmm1 \n\t" + "mulpd %2, %%xmm0 \n\t" + "mulpd %2, %%xmm1 \n\t" + "addpd %%xmm0, %%xmm7 \n\t" + "mulpd %3, %%xmm1 \n\t" + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" + "addpd %%xmm1, %%xmm7" + : + : + "m" (ak[0].re), + "m" (ak[0].im), + "m" (aj[0]), + "m" (_sse_sgn1_dble) + : + "xmm0", "xmm1", "xmm7"); + + ak+=6; + aj+=6; + } + + __asm__ __volatile__ ("movddup %0, %%xmm5 \n\t" + "mulpd %%xmm5, %%xmm7 \n\t" + "movddup %%xmm7, %%xmm6 \n\t" + "unpckhpd %%xmm7, %%xmm7 \n\t" + "mulpd %1, %%xmm7" + : + : + "m" (rr[k]), + "m" (_sse_sgn1_dble) + : + "xmm5", "xmm6", "xmm7"); + + ak=aa+6*k+k; + aj=aa+6*k+j; + + for (i=k;i<6;i++) + { + __asm__ __volatile__ ("movapd %%xmm7, %%xmm5 \n\t" + "movapd %%xmm6, %%xmm4 \n\t" + "mulpd %1, %%xmm5 \n\t" + "mulpd %1, %%xmm4 \n\t" + "shufpd $0x1, %%xmm5, %%xmm5 \n\t" + "subpd %2, %%xmm4 \n\t" + "subpd %%xmm4, %%xmm5 \n\t" + "movapd %%xmm5, %0" + : + "=m" (aj[0]) + : + "m" (ak[0]), + "m" (aj[0]) + : + "xmm4", "xmm5"); + + ak+=6; + aj+=6; + } + } + } + + r1=aa[35].re*aa[35].re+aa[35].im*aa[35].im; + + if (r1>=eps) + r1=1.0/r1; + else + { + ifail=1; + r1=1.0; + } + + dd[5].re= r1*aa[35].re; + dd[5].im=-r1*aa[35].im; + + return ifail; +} + + +static void solv_sys(void) +{ + int i,j,k; + + for (k=5;k>0;k--) + { + for (i=(k-1);i>=0;i--) + { + __asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" + "movddup %1, %%xmm7 \n\t" + "mulpd %2, %%xmm6 \n\t" + "mulpd %2, %%xmm7 \n\t" + "shufpd $0x1, %%xmm6, %%xmm6 \n\t" + "addsubpd %%xmm6, %%xmm7" + : + : + "m" (aa[6*i+k].im), + "m" (aa[6*i+k].re), + "m" (dd[k]) + : + "xmm6", "xmm7"); + + for (j=(k-1);j>i;j--) + { + __asm__ __volatile__ ("movddup %0, %%xmm0 \n\t" + "movddup %1, %%xmm1 \n\t" + "mulpd %2, %%xmm0 \n\t" + "mulpd %2, %%xmm1 \n\t" + "addpd %%xmm0, %%xmm7 \n\t" + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" + "addsubpd %%xmm1, %%xmm7" + : + : + "m" (aa[6*j+k].re), + "m" (aa[6*j+k].im), + "m" (aa[6*i+j]) + : + "xmm0", "xmm1", "xmm7"); + } + + __asm__ __volatile__ ("movddup 
%%xmm7, %%xmm6 \n\t" + "unpckhpd %%xmm7, %%xmm7 \n\t" + "mulpd %1, %%xmm7 \n\t" + "mulpd %1, %%xmm6 \n\t" + "mulpd %2, %%xmm7 \n\t" + "shufpd $0x1, %%xmm7, %%xmm7 \n\t" + "subpd %%xmm6, %%xmm7 \n\t" + "movapd %%xmm7, %0" + : + "=m" (aa[6*i+k]) + : + "m" (dd[i]), + "m" (_sse_sgn1_dble) + : + "xmm6", "xmm7"); + } + } +} + + +static void bck_house(void) +{ + int i,j,k; + complex_dble z,*d,*a; + + aa[35].re=dd[5].re; + aa[35].im=dd[5].im; + + for (k=4;k>=0;k--) + { + z.re=dd[k].re; + z.im=dd[k].im; + dd[k].re=aa[6*k+k].re; + dd[k].im=aa[6*k+k].im; + aa[6*k+k].re=z.re; + aa[6*k+k].im=z.im; + + for (j=(k+1);j<6;j++) + { + dd[j].re=aa[6*j+k].re; + dd[j].im=aa[6*j+k].im; + aa[6*j+k].re=0.0; + aa[6*j+k].im=0.0; + } + + for (i=0;i<6;i+=2) + { + __asm__ __volatile__ ("xorpd %%xmm6, %%xmm6 \n\t" + "xorpd %%xmm7, %%xmm7" + : + : + : + "xmm6", "xmm7"); + + d=dd+k; + a=aa+6*i+k; + + for (j=k;j<6;j++) + { + __asm__ __volatile__ ("movddup %0, %%xmm0 \n\t" + "movddup %1, %%xmm1 \n\t" + "movapd %%xmm0, %%xmm2 \n\t" + "movapd %%xmm1, %%xmm3 \n\t" + "mulpd %2, %%xmm0 \n\t" + "mulpd %2, %%xmm1 \n\t" + "mulpd %3, %%xmm2 \n\t" + "mulpd %3, %%xmm3 \n\t" + "addpd %%xmm0, %%xmm6 \n\t" + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" + "addpd %%xmm2, %%xmm7 \n\t" + "shufpd $0x1, %%xmm3, %%xmm3 \n\t" + "addsubpd %%xmm1, %%xmm6 \n\t" + "addsubpd %%xmm3, %%xmm7 \n\t" + : + : + "m" (d[0].re), + "m" (d[0].im), + "m" (a[0]), + "m" (a[6]) + : + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm6", "xmm7"); + + d+=1; + a+=1; + } + + __asm__ __volatile__ ("movddup %0, %%xmm0 \n\t" + "mulpd %%xmm0, %%xmm6 \n\t" + "mulpd %%xmm0, %%xmm7 \n\t" + "movddup %%xmm6, %%xmm4 \n\t" + "movddup %%xmm7, %%xmm5 \n\t" + "unpckhpd %%xmm6, %%xmm6 \n\t" + "unpckhpd %%xmm7, %%xmm7 \n\t" + "mulpd %1, %%xmm4 \n\t" + "mulpd %1, %%xmm5" + : + : + "m" (rr[k]), + "m" (_sse_sgn1_dble) + : + "xmm0", "xmm4", "xmm5", + "xmm6", "xmm7"); + + d=dd+k; + a=aa+6*i+k; + + for (j=k;j<6;j++) + { + __asm__ __volatile__ ("movapd %%xmm6, %%xmm2 \n\t" + "movapd 
%%xmm7, %%xmm3 \n\t" + "movapd %%xmm4, %%xmm0 \n\t" + "movapd %%xmm5, %%xmm1 \n\t" + "mulpd %2, %%xmm2 \n\t" + "mulpd %2, %%xmm3 \n\t" + "mulpd %2, %%xmm0 \n\t" + "mulpd %2, %%xmm1 \n\t" + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" + "shufpd $0x1, %%xmm3, %%xmm3 \n\t" + "addpd %3, %%xmm0 \n\t" + "addpd %4, %%xmm1 \n\t" + "subpd %%xmm2, %%xmm0 \n\t" + "subpd %%xmm3, %%xmm1 \n\t" + "movapd %%xmm0, %0 \n\t" + "movapd %%xmm1, %1" + : + "=m" (a[0]), + "=m" (a[6]) + : + "m" (d[0]), + "m" (a[0]), + "m" (a[6]) + : + "xmm0", "xmm1", "xmm2", "xmm3"); + + d+=1; + a+=1; + } + } + } +} + +#elif (defined QPX) +#include "qpx.h" +static double rr[5]; +static complex_dble aa[36],dd[6]; + +void mul_pauli_dble_qpx(pauli_dble *m,vector4double *im1[3], vector4double *im2[3] ) +{ + vector4double s1,s2,s3,s4,s5,s6,s10,s11; + vector4double v1,v2,v3,v4,v5,v6,v7,v71,v8,v9,v10,v11,v12,v13,v14,v15,v100; + vector4double r10,r11, r12,r13,r14,r15, r100,r101,r102,r110,r111,ri1,ri2; + + s1=vec_perm(*(im1[0]),*(im1[0]),perm1); + s2=vec_perm(*(im1[0]),*(im1[0]),perm2); + s3=vec_perm(*(im1[1]),*(im1[1]),perm1); + s4=vec_perm(*(im1[1]),*(im1[1]),perm2); + s5=vec_perm(*(im1[2]),*(im1[2]),perm1); + s6=vec_perm(*(im1[2]),*(im1[2]),perm2); + v10=vec_ld2a(0,&((*m).u[0])); + v11=vec_ld2a(0,&((*m).u[2])); + v10=vec_sldw(v10,v11,2); + v11=vec_ld2a(0,&((*m).u[4])); + v1=vec_ld2a(0,&((*m).u[6])); + v2=vec_ld2a(0,&((*m).u[8])); + v3=vec_ld2a(0,&((*m).u[10])); + v2=vec_sldw(v2,v3,2); + v3=vec_ld2a(0,&((*m).u[12])); + v4=vec_ld2a(0,&((*m).u[14])); + v3=vec_sldw(v3,v4,2); + v4=vec_ld2a(0,&((*m).u[16])); + v5=vec_ld2a(0,&((*m).u[18])); + v4=vec_sldw(v4,v5,2); + v5=vec_ld2a(0,&((*m).u[20])); + v6=vec_ld2a(0,&((*m).u[22])); + v5=vec_sldw(v5,v6,2); + v6=vec_ld2a(0,&((*m).u[24])); + v7=vec_ld2a(0,&((*m).u[26])); + v71=vec_ld2a(0,&((*m).u[28])); + v8=vec_ld2a(0,&((*m).u[30])); + v9=vec_ld2a(0,&((*m).u[32])); + v7=vec_sldw(v7,v71,2); + v8=vec_sldw(v8,v9,2); + v9=vec_ld2a(0,&((*m).u[34])); + + v100=vec_perm(v10,v10,perm0011); + 
s10=vec_sldw(s1,s2,2); + s11=vec_sldw(s2,s1,2); + v12=vec_perm(v2,v4,perm1); + v13=vec_perm(v2,v4,perm2); + v14=vec_perm(v3,v5,perm1); + v15=vec_perm(v3,v5,perm2); + + r10=vec_xmul(v100,s10); + r11=vec_xxnpmadd(s11,vec_mul(sign0,v1),vec_xmadd(v1,s11,r10)); + r12=vec_xxnpmadd(v12,s3,vec_xmadd(s3,v12,r11)); + r13=vec_xxnpmadd(v13,s4,vec_xmadd(s4,v13,r12)); + r14=vec_xxnpmadd(v14,s5,vec_xmadd(s5,v14,r13)); + *(im2[0])=vec_xxnpmadd(v15,s6,vec_xmadd(s6,v15,r14)); + + v100=vec_perm(v10,v10,perm2233); + s10=vec_sldw(s3,s4,2); + s11=vec_sldw(s4,s3,2); + v12=vec_perm(v7,v8,perm1); + v13=vec_perm(v7,v8,perm2); + r10=vec_xxcpnmadd(s1,v2,vec_xmul(v2,s1)); + r11=vec_xxcpnmadd(s2,v4,vec_xmadd(v4,s2,r10)); + r12=vec_xmadd(v100,s10,r11); + r13=vec_xxnpmadd(s11,vec_mul(sign0,v6),vec_xmadd(v6,s11,r12)); + r14=vec_xxnpmadd(v12,s5,vec_xmadd(s5,v12,r13)); + *(im2[1])=vec_xxnpmadd(v13,s6,vec_xmadd(s6,v13,r14)); + + v100=vec_perm(v11,v11,perm0011); + s10=vec_sldw(s5,s6,2); + s11=vec_sldw(s6,s5,2); + r10=vec_xxcpnmadd(s1,v3,vec_xmul(v3,s1)); + r11=vec_xxcpnmadd(s2,v5,vec_xmadd(v5,s2,r10)); + r12=vec_xxcpnmadd(s3,v7,vec_xmadd(v7,s3,r11)); + r13=vec_xxcpnmadd(s4,v8,vec_xmadd(v8,s4,r12)); + r14=vec_xmadd(v100,s10,r13); + *(im2[2])=vec_xxnpmadd(s11,vec_mul(sign0,v9),vec_xmadd(v9,s11,r14)); +} + + +void mul_pauli_dble(double mu, pauli_dble *m,weyl_dble *s,weyl_dble *r) +{ + vector4double s1,s2,s3,s4,s5,s6,s10,s11; + vector4double v1,v2,v3,v4,v5,v6,v7,v71,v8,v9,v10,v11,v12,v13,v14,v15,v100,v16,v17,v18; + vector4double r10,r11, r12,r13,r14,r15, r100,r101,r102,r110,r111; + + s1=vec_ld2a(0,&((*s).c1.c1.re)); + s2=vec_ld2a(0,&((*s).c1.c2.re)); + s3=vec_ld2a(0,&((*s).c1.c3.re)); + s4=vec_ld2a(0,&((*s).c2.c1.re)); + s5=vec_ld2a(0,&((*s).c2.c2.re)); + s6=vec_ld2a(0,&((*s).c2.c3.re)); + v16=vec_splats(mu); + v10=vec_ld2a(0,&((*m).u[0])); + v11=vec_ld2a(0,&((*m).u[2])); + v10=vec_sldw(v10,v11,2); + v11=vec_ld2a(0,&((*m).u[4])); + v1=vec_ld2a(0,&((*m).u[6])); + v2=vec_ld2a(0,&((*m).u[8])); + 
v3=vec_ld2a(0,&((*m).u[10])); + v2=vec_sldw(v2,v3,2); + v3=vec_ld2a(0,&((*m).u[12])); + v4=vec_ld2a(0,&((*m).u[14])); + v3=vec_sldw(v3,v4,2); + v4=vec_ld2a(0,&((*m).u[16])); + v5=vec_ld2a(0,&((*m).u[18])); + v4=vec_sldw(v4,v5,2); + v5=vec_ld2a(0,&((*m).u[20])); + v6=vec_ld2a(0,&((*m).u[22])); + v5=vec_sldw(v5,v6,2); + v6=vec_ld2a(0,&((*m).u[24])); + v7=vec_ld2a(0,&((*m).u[26])); + v71=vec_ld2a(0,&((*m).u[28])); + v8=vec_ld2a(0,&((*m).u[30])); + v9=vec_ld2a(0,&((*m).u[32])); + v7=vec_sldw(v7,v71,2); + v8=vec_sldw(v8,v9,2); + v9=vec_ld2a(0,&((*m).u[34])); + + + v100=vec_perm(v10,v16,perml1); + v17=vec_mul(sign0,vec_perm(v100,v1,perm1)); + v18=vec_perm(v1,v100,perm2); + v12=vec_perm(v2,v4,perm1); + v13=vec_perm(v2,v4,perm2); + v14=vec_perm(v3,v5,perm1); + v15=vec_perm(v3,v5,perm2); + r10=vec_xxnpmadd(v17,s1,vec_xmul(s1,v17)); + r11=vec_xxnpmadd(v18,s2,vec_xmadd(s2,v18,r10)); + r12=vec_xxnpmadd(v12,s3,vec_xmadd(s3,v12,r11)); + r13=vec_xxnpmadd(v13,s4,vec_xmadd(s4,v13,r12)); + r14=vec_xxnpmadd(v14,s5,vec_xmadd(s5,v14,r13)); + r15=vec_xxnpmadd(v15,s6,vec_xmadd(s6,v15,r14)); + vec_sta(r15,0,&((*r).c1.c1.re)); + + v100=vec_perm(v10,v16,perml2); + v17=vec_mul(sign0,vec_perm(v100,v6,perm1)); + v18=vec_perm(v6,v100,perm2); + v12=vec_perm(v7,v8,perm1); + v13=vec_perm(v7,v8,perm2); + r10=vec_xxcpnmadd(s1,v2,vec_xmul(v2,s1)); + r11=vec_xxcpnmadd(s2,v4,vec_xmadd(v4,s2,r10)); + r12=vec_xxnpmadd(v17,s3,vec_xmadd(s3,v17,r11)); + r13=vec_xxnpmadd(v18,s4,vec_xmadd(s4,v18,r12)); + r14=vec_xxnpmadd(v12,s5,vec_xmadd(s5,v12,r13)); + r15=vec_xxnpmadd(v13,s6,vec_xmadd(s6,v13,r14)); + vec_sta(r15,0,&((*r).c1.c3.re)); + + v100=vec_perm(v11,v16,perml1); + v17=vec_mul(sign0,vec_perm(v100,v9,perm1)); + v18=vec_perm(v9,v100,perm2); + r10=vec_xxcpnmadd(s1,v3,vec_xmul(v3,s1)); + r11=vec_xxcpnmadd(s2,v5,vec_xmadd(v5,s2,r10)); + r12=vec_xxcpnmadd(s3,v7,vec_xmadd(v7,s3,r11)); + r13=vec_xxcpnmadd(s4,v8,vec_xmadd(v8,s4,r12)); + r14=vec_xxnpmadd(v17,s5,vec_xmadd(s5,v17,r13)); + 
r15=vec_xxnpmadd(v18,s6,vec_xmadd(s6,v18,r14)); + vec_sta(r15,0,&((*r).c2.c2.re)); +} + +static int fwd_house(double eps) +{ + int i,j,k,ifail; + double r1,r2,r3; + complex_dble z; + + ifail=0; + + for (k=0;k<5;k++) + { + r1=aa[6*k+k].re*aa[6*k+k].re+aa[6*k+k].im*aa[6*k+k].im; + r2=sqrt(r1); + + for (j=(k+1);j<6;j++) + r1+=(aa[6*j+k].re*aa[6*j+k].re+aa[6*j+k].im*aa[6*j+k].im); + + if (r1>=eps) + r1=sqrt(r1); + else + { + ifail=1; + r1=1.0; + } + + if (r2>=(DBL_EPSILON*r1)) + { + r3=1.0/r2; + z.re=r3*aa[6*k+k].re; + z.im=r3*aa[6*k+k].im; + } + else + { + z.re=1.0; + z.im=0.0; + } + + aa[6*k+k].re+=r1*z.re; + aa[6*k+k].im+=r1*z.im; + + r3=1.0/(r1*(r1+r2)); + rr[k]=r3; + dd[k].re=-(r1+r2)*r3*z.re; + dd[k].im= (r1+r2)*r3*z.im; + + for (j=(k+1);j<6;j++) + { + z.re=0.0; + z.im=0.0; + + for (i=k;i<6;i++) + { + z.re+=(aa[6*i+k].re*aa[6*i+j].re+aa[6*i+k].im*aa[6*i+j].im); + z.im+=(aa[6*i+k].re*aa[6*i+j].im-aa[6*i+k].im*aa[6*i+j].re); + } + + z.re*=r3; + z.im*=r3; + + for (i=k;i<6;i++) + { + aa[6*i+j].re-=(z.re*aa[6*i+k].re-z.im*aa[6*i+k].im); + aa[6*i+j].im-=(z.re*aa[6*i+k].im+z.im*aa[6*i+k].re); + } + } + } + + r1=aa[35].re*aa[35].re+aa[35].im*aa[35].im; + + if (r1>=eps) + r1=1.0/r1; + else + { + ifail=1; + r1=1.0; + } + + dd[5].re= r1*aa[35].re; + dd[5].im=-r1*aa[35].im; + + return ifail; +} + + +static void solv_sys(void) +{ + int i,j,k; + complex_dble z; + + for (k=5;k>0;k--) + { + for (i=(k-1);i>=0;i--) + { + z.re=aa[6*i+k].re*dd[k].re-aa[6*i+k].im*dd[k].im; + z.im=aa[6*i+k].re*dd[k].im+aa[6*i+k].im*dd[k].re; + + for (j=(k-1);j>i;j--) + { + z.re+=(aa[6*i+j].re*aa[6*j+k].re-aa[6*i+j].im*aa[6*j+k].im); + z.im+=(aa[6*i+j].re*aa[6*j+k].im+aa[6*i+j].im*aa[6*j+k].re); + } + + aa[6*i+k].re=-dd[i].re*z.re+dd[i].im*z.im; + aa[6*i+k].im=-dd[i].re*z.im-dd[i].im*z.re; + } + } +} + + +static void bck_house(void) +{ + int i,j,k; + complex_dble z; + + aa[35].re=dd[5].re; + aa[35].im=dd[5].im; + + for (k=4;k>=0;k--) + { + z.re=dd[k].re; + z.im=dd[k].im; + dd[k].re=aa[6*k+k].re; + 
dd[k].im=aa[6*k+k].im; + aa[6*k+k].re=z.re; + aa[6*k+k].im=z.im; + + for (j=(k+1);j<6;j++) + { + dd[j].re=aa[6*j+k].re; + dd[j].im=aa[6*j+k].im; + aa[6*j+k].re=0.0; + aa[6*j+k].im=0.0; + } + + for (i=0;i<6;i++) + { + z.re=0.0; + z.im=0.0; + + for (j=k;j<6;j++) + { + z.re+=(aa[6*i+j].re*dd[j].re-aa[6*i+j].im*dd[j].im); + z.im+=(aa[6*i+j].re*dd[j].im+aa[6*i+j].im*dd[j].re); + } + + z.re*=rr[k]; + z.im*=rr[k]; + + for (j=k;j<6;j++) + { + aa[6*i+j].re-=(z.re*dd[j].re+z.im*dd[j].im); + aa[6*i+j].im+=(z.re*dd[j].im-z.im*dd[j].re); + } + } + } +} + +#else + +static weyl_dble rs; + + +void mul_pauli_dble(double mu,pauli_dble *m,weyl_dble *s,weyl_dble *r) +{ + double *u; + + u=(*m).u; + + rs.c1.c1.re= + u[ 0]*(*s).c1.c1.re- mu*(*s).c1.c1.im+ + u[ 6]*(*s).c1.c2.re-u[ 7]*(*s).c1.c2.im+ + u[ 8]*(*s).c1.c3.re-u[ 9]*(*s).c1.c3.im+ + u[10]*(*s).c2.c1.re-u[11]*(*s).c2.c1.im+ + u[12]*(*s).c2.c2.re-u[13]*(*s).c2.c2.im+ + u[14]*(*s).c2.c3.re-u[15]*(*s).c2.c3.im; + + rs.c1.c1.im= + u[ 0]*(*s).c1.c1.im+ mu*(*s).c1.c1.re+ + u[ 6]*(*s).c1.c2.im+u[ 7]*(*s).c1.c2.re+ + u[ 8]*(*s).c1.c3.im+u[ 9]*(*s).c1.c3.re+ + u[10]*(*s).c2.c1.im+u[11]*(*s).c2.c1.re+ + u[12]*(*s).c2.c2.im+u[13]*(*s).c2.c2.re+ + u[14]*(*s).c2.c3.im+u[15]*(*s).c2.c3.re; + + rs.c1.c2.re= + u[ 6]*(*s).c1.c1.re+u[ 7]*(*s).c1.c1.im+ + u[ 1]*(*s).c1.c2.re- mu*(*s).c1.c2.im+ + u[16]*(*s).c1.c3.re-u[17]*(*s).c1.c3.im+ + u[18]*(*s).c2.c1.re-u[19]*(*s).c2.c1.im+ + u[20]*(*s).c2.c2.re-u[21]*(*s).c2.c2.im+ + u[22]*(*s).c2.c3.re-u[23]*(*s).c2.c3.im; + + rs.c1.c2.im= + u[ 6]*(*s).c1.c1.im-u[ 7]*(*s).c1.c1.re+ + u[ 1]*(*s).c1.c2.im+ mu*(*s).c1.c2.re+ + u[16]*(*s).c1.c3.im+u[17]*(*s).c1.c3.re+ + u[18]*(*s).c2.c1.im+u[19]*(*s).c2.c1.re+ + u[20]*(*s).c2.c2.im+u[21]*(*s).c2.c2.re+ + u[22]*(*s).c2.c3.im+u[23]*(*s).c2.c3.re; + + rs.c1.c3.re= + u[ 8]*(*s).c1.c1.re+u[ 9]*(*s).c1.c1.im+ + u[16]*(*s).c1.c2.re+u[17]*(*s).c1.c2.im+ + u[ 2]*(*s).c1.c3.re- mu*(*s).c1.c3.im+ + u[24]*(*s).c2.c1.re-u[25]*(*s).c2.c1.im+ + 
u[26]*(*s).c2.c2.re-u[27]*(*s).c2.c2.im+ + u[28]*(*s).c2.c3.re-u[29]*(*s).c2.c3.im; + + rs.c1.c3.im= + u[ 8]*(*s).c1.c1.im-u[ 9]*(*s).c1.c1.re+ + u[16]*(*s).c1.c2.im-u[17]*(*s).c1.c2.re+ + u[ 2]*(*s).c1.c3.im+ mu*(*s).c1.c3.re+ + u[24]*(*s).c2.c1.im+u[25]*(*s).c2.c1.re+ + u[26]*(*s).c2.c2.im+u[27]*(*s).c2.c2.re+ + u[28]*(*s).c2.c3.im+u[29]*(*s).c2.c3.re; + + rs.c2.c1.re= + u[10]*(*s).c1.c1.re+u[11]*(*s).c1.c1.im+ + u[18]*(*s).c1.c2.re+u[19]*(*s).c1.c2.im+ + u[24]*(*s).c1.c3.re+u[25]*(*s).c1.c3.im+ + u[ 3]*(*s).c2.c1.re- mu*(*s).c2.c1.im+ + u[30]*(*s).c2.c2.re-u[31]*(*s).c2.c2.im+ + u[32]*(*s).c2.c3.re-u[33]*(*s).c2.c3.im; + + rs.c2.c1.im= + u[10]*(*s).c1.c1.im-u[11]*(*s).c1.c1.re+ + u[18]*(*s).c1.c2.im-u[19]*(*s).c1.c2.re+ + u[24]*(*s).c1.c3.im-u[25]*(*s).c1.c3.re+ + u[ 3]*(*s).c2.c1.im+ mu*(*s).c2.c1.re+ + u[30]*(*s).c2.c2.im+u[31]*(*s).c2.c2.re+ + u[32]*(*s).c2.c3.im+u[33]*(*s).c2.c3.re; + + rs.c2.c2.re= + u[12]*(*s).c1.c1.re+u[13]*(*s).c1.c1.im+ + u[20]*(*s).c1.c2.re+u[21]*(*s).c1.c2.im+ + u[26]*(*s).c1.c3.re+u[27]*(*s).c1.c3.im+ + u[30]*(*s).c2.c1.re+u[31]*(*s).c2.c1.im+ + u[ 4]*(*s).c2.c2.re- mu*(*s).c2.c2.im+ + u[34]*(*s).c2.c3.re-u[35]*(*s).c2.c3.im; + + rs.c2.c2.im= + u[12]*(*s).c1.c1.im-u[13]*(*s).c1.c1.re+ + u[20]*(*s).c1.c2.im-u[21]*(*s).c1.c2.re+ + u[26]*(*s).c1.c3.im-u[27]*(*s).c1.c3.re+ + u[30]*(*s).c2.c1.im-u[31]*(*s).c2.c1.re+ + u[ 4]*(*s).c2.c2.im+ mu*(*s).c2.c2.re+ + u[34]*(*s).c2.c3.im+u[35]*(*s).c2.c3.re; + + rs.c2.c3.re= + u[14]*(*s).c1.c1.re+u[15]*(*s).c1.c1.im+ + u[22]*(*s).c1.c2.re+u[23]*(*s).c1.c2.im+ + u[28]*(*s).c1.c3.re+u[29]*(*s).c1.c3.im+ + u[32]*(*s).c2.c1.re+u[33]*(*s).c2.c1.im+ + u[34]*(*s).c2.c2.re+u[35]*(*s).c2.c2.im+ + u[ 5]*(*s).c2.c3.re- mu*(*s).c2.c3.im; + + rs.c2.c3.im= + u[14]*(*s).c1.c1.im-u[15]*(*s).c1.c1.re+ + u[22]*(*s).c1.c2.im-u[23]*(*s).c1.c2.re+ + u[28]*(*s).c1.c3.im-u[29]*(*s).c1.c3.re+ + u[32]*(*s).c2.c1.im-u[33]*(*s).c2.c1.re+ + u[34]*(*s).c2.c2.im-u[35]*(*s).c2.c2.re+ + u[ 5]*(*s).c2.c3.im+ mu*(*s).c2.c3.re; + + 
(*r)=rs; +} + + +static int fwd_house(double eps) +{ + int i,j,k,ifail; + double r1,r2,r3; + complex_dble z; + + ifail=0; + + for (k=0;k<5;k++) + { + r1=aa[6*k+k].re*aa[6*k+k].re+aa[6*k+k].im*aa[6*k+k].im; + r2=sqrt(r1); + + for (j=(k+1);j<6;j++) + r1+=(aa[6*j+k].re*aa[6*j+k].re+aa[6*j+k].im*aa[6*j+k].im); + + if (r1>=eps) + r1=sqrt(r1); + else + { + ifail=1; + r1=1.0; + } + + if (r2>=(DBL_EPSILON*r1)) + { + r3=1.0/r2; + z.re=r3*aa[6*k+k].re; + z.im=r3*aa[6*k+k].im; + } + else + { + z.re=1.0; + z.im=0.0; + } + + aa[6*k+k].re+=r1*z.re; + aa[6*k+k].im+=r1*z.im; + + r3=1.0/(r1*(r1+r2)); + rr[k]=r3; + dd[k].re=-(r1+r2)*r3*z.re; + dd[k].im= (r1+r2)*r3*z.im; + + for (j=(k+1);j<6;j++) + { + z.re=0.0; + z.im=0.0; + + for (i=k;i<6;i++) + { + z.re+=(aa[6*i+k].re*aa[6*i+j].re+aa[6*i+k].im*aa[6*i+j].im); + z.im+=(aa[6*i+k].re*aa[6*i+j].im-aa[6*i+k].im*aa[6*i+j].re); + } + + z.re*=r3; + z.im*=r3; + + for (i=k;i<6;i++) + { + aa[6*i+j].re-=(z.re*aa[6*i+k].re-z.im*aa[6*i+k].im); + aa[6*i+j].im-=(z.re*aa[6*i+k].im+z.im*aa[6*i+k].re); + } + } + } + + r1=aa[35].re*aa[35].re+aa[35].im*aa[35].im; + + if (r1>=eps) + r1=1.0/r1; + else + { + ifail=1; + r1=1.0; + } + + dd[5].re= r1*aa[35].re; + dd[5].im=-r1*aa[35].im; + + return ifail; +} + + +static void solv_sys(void) +{ + int i,j,k; + complex_dble z; + + for (k=5;k>0;k--) + { + for (i=(k-1);i>=0;i--) + { + z.re=aa[6*i+k].re*dd[k].re-aa[6*i+k].im*dd[k].im; + z.im=aa[6*i+k].re*dd[k].im+aa[6*i+k].im*dd[k].re; + + for (j=(k-1);j>i;j--) + { + z.re+=(aa[6*i+j].re*aa[6*j+k].re-aa[6*i+j].im*aa[6*j+k].im); + z.im+=(aa[6*i+j].re*aa[6*j+k].im+aa[6*i+j].im*aa[6*j+k].re); + } + + aa[6*i+k].re=-dd[i].re*z.re+dd[i].im*z.im; + aa[6*i+k].im=-dd[i].re*z.im-dd[i].im*z.re; + } + } +} + + +static void bck_house(void) +{ + int i,j,k; + complex_dble z; + + aa[35].re=dd[5].re; + aa[35].im=dd[5].im; + + for (k=4;k>=0;k--) + { + z.re=dd[k].re; + z.im=dd[k].im; + dd[k].re=aa[6*k+k].re; + dd[k].im=aa[6*k+k].im; + aa[6*k+k].re=z.re; + aa[6*k+k].im=z.im; + + for 
(j=(k+1);j<6;j++) + { + dd[j].re=aa[6*j+k].re; + dd[j].im=aa[6*j+k].im; + aa[6*j+k].re=0.0; + aa[6*j+k].im=0.0; + } + + for (i=0;i<6;i++) + { + z.re=0.0; + z.im=0.0; + + for (j=k;j<6;j++) + { + z.re+=(aa[6*i+j].re*dd[j].re-aa[6*i+j].im*dd[j].im); + z.im+=(aa[6*i+j].re*dd[j].im+aa[6*i+j].im*dd[j].re); + } + + z.re*=rr[k]; + z.im*=rr[k]; + + for (j=k;j<6;j++) + { + aa[6*i+j].re-=(z.re*dd[j].re+z.im*dd[j].im); + aa[6*i+j].im+=(z.re*dd[j].im-z.im*dd[j].re); + } + } + } +} + +#endif + +static double set_aa(double mu,pauli_dble *m) +{ + int i,j; + double sm,*u,*v; + + sm=0.0; + u=(*m).u; + v=u+6; + + for (i=0;i<6;i++) + { + sm+=u[0]*u[0]+mu*mu; + aa[6*i+i].re=u[0]; + aa[6*i+i].im=mu; + u+=1; + + for (j=i+1;j<6;j++) + { + sm+=2.0*(v[0]*v[0]+v[1]*v[1]); + aa[6*i+j].re= v[0]; + aa[6*i+j].im= v[1]; + aa[6*j+i].re= v[0]; + aa[6*j+i].im=-v[1]; + v+=2; + } + } + + return sm; +} + + +static double norm_aa(void) +{ + double sm; + complex_dble *z,*zm; + + sm=0.0; + z=aa; + zm=aa+36; + + for (;z1.0) + ifail=1; + + return ifail; +} + +complex_dble det_pauli_dble(double mu,pauli_dble *m) +{ + int i,j,k; + double eps,r1,r2,r3; + complex_dble det,z,w; + + eps=DBL_EPSILON*sqrt(set_aa(mu,m)); + det.re=1.0; + det.im=0.0; + + for (k=0;k<5;k++) + { + r1=aa[6*k+k].re*aa[6*k+k].re+aa[6*k+k].im*aa[6*k+k].im; + r2=sqrt(r1); + + for (j=(k+1);j<6;j++) + r1+=(aa[6*j+k].re*aa[6*j+k].re+aa[6*j+k].im*aa[6*j+k].im); + + r1=sqrt(r1); + + if (r1<=eps) + { + w.re=0.0; + w.im=0.0; + + return w; + } + + if (r2>=(DBL_EPSILON*r1)) + { + r3=1.0/r2; + z.re=r1*r3*aa[6*k+k].re; + z.im=r1*r3*aa[6*k+k].im; + } + else + { + z.re=r1; + z.im=0.0; + } + + w.re=det.re*z.re-det.im*z.im; + w.im=det.re*z.im+det.im*z.re; + det.re=w.re; + det.im=w.im; + + aa[6*k+k].re+=z.re; + aa[6*k+k].im+=z.im; + r3=1.0/(r1*(r1+r2)); + + for (j=(k+1);j<6;j++) + { + z.re=0.0; + z.im=0.0; + + for (i=k;i<6;i++) + { + z.re+=(aa[6*i+k].re*aa[6*i+j].re+aa[6*i+k].im*aa[6*i+j].im); + z.im+=(aa[6*i+k].re*aa[6*i+j].im-aa[6*i+k].im*aa[6*i+j].re); + 
} + + z.re*=r3; + z.im*=r3; + + for (i=(k+1);i<6;i++) + { + aa[6*i+j].re-=(z.re*aa[6*i+k].re-z.im*aa[6*i+k].im); + aa[6*i+j].im-=(z.re*aa[6*i+k].im+z.im*aa[6*i+k].re); + } + } + } + + w.re=det.re*aa[35].re-det.im*aa[35].im; + w.im=det.re*aa[35].im+det.im*aa[35].re; + + return w; +} + + +void apply_sw_dble(int vol,double mu,pauli_dble *m,spinor_dble *s, + spinor_dble *r) +{ + spin_t *ps,*pr,*pm; + + ps=(spin_t*)(s); + pr=(spin_t*)(r); + pm=ps+vol; + + for (;ps1.0) + ifail=1; + apply_aa((*ps).c,(*pr).c); + m+=1; + + eps=DELTA*set_aa(-mu,m); + ifail|=fwd_house(eps); + solv_sys(); + bck_house(); + if ((eps*norm_aa())>1.0) + ifail=1; + apply_aa((*ps).c+6,(*pr).c+6); + m+=1; + pr+=1; + } + + return ifail; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sw_term/sw_term.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sw_term/sw_term.c new file mode 100644 index 0000000000000000000000000000000000000000..aee8c871e84c84841e36a11e4698900e0d39fbfd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sw_term/sw_term.c @@ -0,0 +1,324 @@ + +/******************************************************************************* +* +* File sw_term.c +* +* Copyright (C) 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Computation of the SW term. +* +* The externally accessible functions are +* +* int sw_term(ptset_t set) +* Computes the SW term for the current double-precision gauge field +* and assigns the matrix to the global double-precision SW field. The +* matrices on the specified point set are then inverted and 0 or 1 +* is returned depending on whether all inversions were safe or not. +* +* Notes: +* +* The program sets the SW term to unity at global time +* +* x0=0 (open, SF and open-SF boundary conditions), +* +* x0=NPROC0*L0-1 (open boundary conditions). 
+* +* In all other cases, it is given by +* +* c(x0)+csw*(i/4)*sigma_{mu nu}*Fhat_{mu nu}(x) +* +* where +* +* c(x0) = 4+m0+cF[0]-1 if x0=1 (open, SF or open-SF bc), +* 4+m0+cF[1]-1 if x0=NPROC0*L0-2 (open bc), +* or x0=NPROC0*L0-1 (SF or open-SF bc), +* 4+m0 otherwise, +* +* sigma_{mu nu}=(i/2)*[gamma_mu,gamma_nu], +* +* and Fhat_{mu nu} is the standard (clover) expression for the gauge field +* tensor as computed by the program ftensor() [tcharge/ftensor.c]. The upper +* and lower 6x6 blocks of the matrix are stored in the pauli_dble structures +* swd[2*ix] and swd[2*ix+1], where ix is the label of the point x. +* +* The quark mass m0 and the improvement coefficients csw and cF are obtained +* from the parameter data base by calling sw_parms() [flags/lat_parms.c]. Note +* that this program checks the flags data base and only computes those parts +* of the SW array that do not already have the correct values. +* +* This program performs global operations and must be called simultaneously +* on all processes. 
+* +*******************************************************************************/ + +#define SW_TERM_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "tcharge.h" +#include "sw_term.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +static double c1,c2,c3[2]; +static u3_alg_dble X ALIGNED16; +static const pauli_dble sw0={{1.0,1.0,1.0,1.0,1.0,1.0, + 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, + 0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0, + 0.0,0.0,0.0,0.0,0.0,0.0}}; + + +static void u3_alg2pauli1(pauli_dble *m) +{ + (*m).u[10]=-X.c1; + + (*m).u[12]=-X.c5; + (*m).u[13]= X.c4; + (*m).u[14]=-X.c7; + (*m).u[15]= X.c6; + + (*m).u[18]=-X.c5; + (*m).u[19]=-X.c4; + (*m).u[20]=-X.c2; + + (*m).u[22]=-X.c9; + (*m).u[23]= X.c8; + (*m).u[24]=-X.c7; + (*m).u[25]=-X.c6; + (*m).u[26]=-X.c9; + (*m).u[27]=-X.c8; + (*m).u[28]=-X.c3; +} + + +static void u3_alg2pauli2(pauli_dble *m) +{ + (*m).u[11] =X.c1; + (*m).u[12]+=X.c4; + (*m).u[13]+=X.c5; + (*m).u[14]+=X.c6; + (*m).u[15]+=X.c7; + + (*m).u[18]-=X.c4; + (*m).u[19]+=X.c5; + + (*m).u[21] =X.c2; + (*m).u[22]+=X.c8; + (*m).u[23]+=X.c9; + (*m).u[24]-=X.c6; + (*m).u[25]+=X.c7; + (*m).u[26]-=X.c8; + (*m).u[27]+=X.c9; + + (*m).u[29] =X.c3; +} + + +static void u3_alg2pauli3(pauli_dble *m) +{ + (*m).u[ 0]=-X.c1; + (*m).u[ 1]=-X.c2; + (*m).u[ 2]=-X.c3; + (*m).u[ 3]= X.c1; + (*m).u[ 4]= X.c2; + (*m).u[ 5]= X.c3; + (*m).u[ 6]=-X.c5; + (*m).u[ 7]= X.c4; + (*m).u[ 8]=-X.c7; + (*m).u[ 9]= X.c6; + + (*m).u[16]=-X.c9; + (*m).u[17]= X.c8; + + (*m).u[30]= X.c5; + (*m).u[31]=-X.c4; + (*m).u[32]= X.c7; + (*m).u[33]=-X.c6; + (*m).u[34]= X.c9; + (*m).u[35]=-X.c8; +} + + +static void set_swd(int vol,int ofs,u3_alg_dble **ft,pauli_dble *sw) +{ + int bc,ix,t; + double c,*u; + u3_alg_dble *ft0,*ft1,*ft2,*ft3,*ft4,*ft5; + + bc=bc_type(); + vol+=ofs; + sw+=2*ofs; + ft0=ft[0]+ofs; + ft1=ft[1]+ofs; + ft2=ft[2]+ofs; + ft3=ft[3]+ofs; + ft4=ft[4]+ofs; + 
ft5=ft[5]+ofs; + + for (ix=ofs;ix1) + MPI_Allreduce(&ifail,&n,1,MPI_INT,MPI_MAX,MPI_COMM_WORLD); + else + n=ifail; + + return n; +} + + +int sw_term(ptset_t set) +{ + int iprms[1],ie,io,ifail; + pauli_dble *sw; + u3_alg_dble **ft; + sw_parms_t swp; + + if (NPROC>1) + { + iprms[0]=(int)(set); + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=(int)(set),1,"sw_term [sw_term.c]", + "Parameter is not global"); + } + + swp=sw_parms(); + c1=4.0+swp.m0; + c2=-0.5*swp.csw; + c3[0]=c1+swp.cF[0]-1.0; + c3[1]=c1+swp.cF[1]-1.0; + + if (query_flags(SWD_UP2DATE)!=1) + { + ft=ftensor(); + sw=swdfld(); + set_swd(VOLUME,0,ft,sw); + set_flags(COMPUTED_SWD); + } + + ie=query_flags(SWD_E_INVERTED); + io=query_flags(SWD_O_INVERTED); + + if ((ie==1)&&((set==NO_PTS)||(set==ODD_PTS))) + { + ft=ftensor(); + sw=swdfld(); + set_swd(VOLUME/2,0,ft,sw); + ie=0; + } + + if ((io==1)&&((set==NO_PTS)||(set==EVEN_PTS))) + { + ft=ftensor(); + sw=swdfld(); + set_swd(VOLUME/2,VOLUME/2,ft,sw); + io=0; + } + + ifail=0; + + if ((ie==0)&&((set==ALL_PTS)||(set==EVEN_PTS))) + { + sw=swdfld(); + ifail|=iswd(VOLUME,sw); + ie=1; + } + + if ((io==0)&&((set==ALL_PTS)||(set==ODD_PTS))) + { + sw=swdfld()+VOLUME; + ifail|=iswd(VOLUME,sw); + io=1; + } + + set_flags(COMPUTED_SWD); + + if (ie==1) + set_flags(INVERTED_SWD_E); + + if (io==1) + set_flags(INVERTED_SWD_O); + + return ifail; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sw_term/swflds.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sw_term/swflds.c new file mode 100644 index 0000000000000000000000000000000000000000..9693faac944f4819b25b1aa4398b069e1a141f33 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/sw_term/swflds.c @@ -0,0 +1,135 @@ + +/******************************************************************************* +* +* File swflds.c +* +* Copyright (C) 2006, 2011, 2013 Martin Luescher +* +* This software is distributed under the 
terms of the GNU General Public +* License (GPL) +* +* Allocation and initialization of the global SW fields +* +* The externally accessible functions are +* +* pauli *swfld(void) +* Returns the base address of the single-precision SW field. If it +* is not already allocated, the field is allocated and initialized +* to unity. +* +* pauli_dble *swdfld(void) +* Returns the base address of the double-precision SW field. If it +* is not already allocated, the field is allocated and initialized +* to unity. +* +* void assign_swd2sw(void) +* Assigns the double-precision to the single-precision SW field. +* +* Notes: +* +* All these programs act globally and must be called simultaneously on all +* processes. +* +*******************************************************************************/ + +#define SWFLDS_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "sw_term.h" +#include "global.h" + +static const pauli sw0={{0.0f}}; +static const pauli_dble swd0={{0.0}}; +static pauli *swb=NULL; +static pauli_dble *swdb=NULL; + + +static void alloc_sw(void) +{ + pauli *sw,*sm,unity; + + error_root(sizeof(pauli)!=(36*sizeof(float)),1,"alloc_sw [swflds.c]", + "The pauli structures are not properly packed"); + + swb=amalloc(2*VOLUME*sizeof(*swb),ALIGN); + error(swb==NULL,1,"alloc_sw [swflds.c]", + "Unable to allocate the global single-precision SW field"); + + unity=sw0; + unity.u[0]=1.0f; + unity.u[1]=1.0f; + unity.u[2]=1.0f; + unity.u[3]=1.0f; + unity.u[4]=1.0f; + unity.u[5]=1.0f; + + sw=swb; + sm=sw+2*VOLUME; + + for (;sw +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "tcharge.h" +#include "global.h" + +static const int plns[6][2]={{0,1},{0,2},{0,3},{2,3},{3,1},{1,2}}; +static const u3_alg_dble ft0={0.0}; +static u3_alg_dble *ftbuf; +static ftidx_t *idx=NULL; + + +static void alloc_ftbuf(void) +{ + int n,nft,nbf; + + idx=ftidx(); 
+ nbf=0; + + for (n=0;n<6;n++) + { + nft=idx[n].nft[0]; + if (nft>nbf) + nbf=nft; + + nft=idx[n].nft[1]; + if (nft>nbf) + nbf=nft; + } + + ftbuf=amalloc(nbf*sizeof(*ftbuf),ALIGN); + error(ftbuf==NULL,1,"alloc_ftbuf [ftcom.c]", + "Unable to allocate communication buffers"); +} + + +static void pack_buf(int n,int dir,u3_alg_dble *ft) +{ + int bc,mu,nft; + int *ift,*ifm; + u3_alg_dble *fb; + + nft=idx[n].nft[dir]; + + if (nft>0) + { + bc=bc_type(); + mu=plns[n][dir]; + + if ((mu>0)||(cpr[0]>0)||(bc==3)) + { + ift=idx[n].ift[dir]; + ifm=ift+nft; + fb=ftbuf; + + for (;ift0) + { + bc=bc_type(); + mu=plns[n][dir]; + tag=mpi_tag(); + saddr=npr[2*mu]; + raddr=npr[2*mu+1]; + sbuf=ftbuf; + rbuf=ft+VOLUME; + if (dir==1) + rbuf+=idx[n].nft[0]; + nbf=9*nft; + + if (np==0) + { + if ((mu>0)||(cpr[0]>0)||(bc==3)) + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + if ((mu>0)||(cpr[0]<(NPROC0-1))||(bc==3)) + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + if ((mu>0)||(cpr[0]<(NPROC0-1))||(bc==3)) + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + if ((mu>0)||(cpr[0]>0)||(bc==3)) + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } +} + + +void copy_bnd_ft(int n,u3_alg_dble *ft) +{ + if (NPROC>1) + { + if (idx==NULL) + alloc_ftbuf(); + + pack_buf(n,1,ft); + fwd_send(n,1,ft); + pack_buf(n,0,ft); + fwd_send(n,0,ft); + } +} + + +static void bck_send(int n,int dir,u3_alg_dble *ft) +{ + int bc,mu,nft,nbf; + int tag,saddr,raddr,np; + u3_alg_dble *sbuf,*rbuf; + MPI_Status stat; + + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; + nft=idx[n].nft[dir]; + + if (nft>0) + { + bc=bc_type(); + mu=plns[n][dir]; + tag=mpi_tag(); + saddr=npr[2*mu+1]; + raddr=npr[2*mu]; + sbuf=ft+VOLUME; + if (dir==1) + sbuf+=idx[n].nft[0]; + rbuf=ftbuf; + nbf=9*nft; + + if (np==0) + { + if ((mu>0)||(cpr[0]<(NPROC0-1))||(bc==3)) + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + if ((mu>0)||(cpr[0]>0)||(bc==3)) + 
MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + if ((mu>0)||(cpr[0]>0)||(bc==3)) + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + if ((mu>0)||(cpr[0]<(NPROC0-1))||(bc==3)) + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } +} + + +static void unpack_buf(int n,int dir,u3_alg_dble *ft) +{ + int bc,mu,nft; + int *ift,*ifm; + u3_alg_dble *f,*fb; + + nft=idx[n].nft[dir]; + + if (nft>0) + { + bc=bc_type(); + mu=plns[n][dir]; + + if ((mu>0)||(cpr[0]>0)||(bc==3)) + { + ift=idx[n].ift[dir]; + ifm=ift+nft; + fb=ftbuf; + + for (;ift1) + { + if (idx==NULL) + alloc_ftbuf(); + + bck_send(n,0,ft); + unpack_buf(n,0,ft); + bck_send(n,1,ft); + unpack_buf(n,1,ft); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/tcharge/ftensor.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/tcharge/ftensor.c new file mode 100644 index 0000000000000000000000000000000000000000..2ec94c1a7a2769cb5830a517ce5bae155d7d8bde --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/tcharge/ftensor.c @@ -0,0 +1,219 @@ + +/******************************************************************************* +* +* File ftensor.c +* +* Copyright (C) 2010-2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Computation of the symmetric field tensor. +* +* The externally accessible function is +* +* u3_alg_dble **ftensor(void) +* Computes the symmetric field tensor of the global double-precision +* gauge field and returns the pointers ft[0],..,ft[5] to the field +* components with the Lorentz indices (0,1),(0,2),(0,3),(2,3),(3,1), +* (1,2). The arrays are automatically allocated if needed. Along the +* boundaries of the lattice (if any), the program sets the field to +* zero. 
+* +* Notes: +* +* At all points x in the interior of the lattice, the (mu,nu)-component of +* the field tensor is defined by +* +* F_{mu,nu}(x) = (1/8)*[Q_{mu,nu}(x)-Q_{nu,mu}(x)] +* +* where +* +* Q_{mu,nu}(x) = U(x,mu)*U(x+mu,nu)*U(x+nu,mu)^dag*U(x,nu)^dag + (3 more) +* +* denotes the sum of the four plaquette loops at x in the (mu,nu)-plane (the +* same as in the case of the SW term). Elsewhere the elements of the field +* arrays are set to zero. The interior points are those at global time x0 +* in the range +* +* 0 +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "linalg.h" +#include "tcharge.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +static u3_alg_dble **fts=NULL,**ft,X; +static su3_dble w1,w2 ALIGNED16; +static ftidx_t *idx; + + +static void alloc_fts(void) +{ + int n,nbf; + u3_alg_dble **pp,*p; + + error_root(sizeof(u3_alg_dble)!=(9*sizeof(double)),1, + "alloc_fts [ftensor.c]", + "The u3_alg_dble structures are not properly packed"); + + idx=ftidx(); + nbf=0; + + for (n=0;n<6;n++) + nbf+=idx[n].nft[0]+idx[n].nft[1]; + + pp=malloc(12*sizeof(*pp)); + p=amalloc((6*VOLUME+nbf)*sizeof(*p),ALIGN); + error((pp==NULL)||(p==NULL),1,"alloc_fts [ftensor.c]", + "Unable to allocate field tensor arrays"); + + fts=pp; + ft=pp+6; + + for (n=0;n<6;n++) + { + (*pp)=p; + pp+=1; + p+=VOLUME+idx[n].nft[0]+idx[n].nft[1]; + } +} + + +static void add_X2ft(u3_alg_dble *f) +{ + double r; + + r=0.125; + (*f).c1+=r*X.c1; + (*f).c2+=r*X.c2; + (*f).c3+=r*X.c3; + (*f).c4+=r*X.c4; + (*f).c5+=r*X.c5; + (*f).c6+=r*X.c6; + (*f).c7+=r*X.c7; + (*f).c8+=r*X.c8; + (*f).c9+=r*X.c9; +} + + +static void build_fts(void) +{ + int bc,n,ix,t,ip[4],ipf[4]; + int tmx; + su3_dble *ub; + u3_alg_dble *ftn; + + bc=bc_type(); + ub=udfld(); + + for (n=0;n<6;n++) + { + ftn=fts[n]; + set_ualg2zero(VOLUME+idx[n].nft[0]+idx[n].nft[1],ftn); + tmx=N0; + if (bc==0) + tmx-=1; + if (n<3) + 
tmx-=1; + + for (ix=0;ix0)&&(t +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "tcharge.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define MAX_LEVELS 8 +#define BLK_LENGTH 8 + +static int cnt[L0][MAX_LEVELS]; +static double smx[L0][MAX_LEVELS],qsl0[N0]; +static u3_alg_dble **ft; + + +static double prodXY(u3_alg_dble *X,u3_alg_dble *Y) +{ + double sm; + + sm=(-2.0/3.0)*((*X).c1+(*X).c2+(*X).c3)*((*Y).c1+(*Y).c2+(*Y).c3)+ + 2.0*((*X).c1*(*Y).c1+(*X).c2*(*Y).c2+(*X).c3*(*Y).c3)+ + 4.0*((*X).c4*(*Y).c4+(*X).c5*(*Y).c5+(*X).c6*(*Y).c6+ + (*X).c7*(*Y).c7+(*X).c8*(*Y).c8+(*X).c9*(*Y).c9); + + return sm; +} + + +static double density(int ix) +{ + double sm; + + sm=prodXY(ft[0]+ix,ft[3]+ix)+ + prodXY(ft[1]+ix,ft[4]+ix)+ + prodXY(ft[2]+ix,ft[5]+ix); + + return sm; +} + + +double tcharge(void) +{ + int bc,tmx; + int n,ix,t,*cnt0; + double pi,Q,*smx0; + + ft=ftensor(); + cnt0=cnt[0]; + smx0=smx[0]; + + for (n=0;n0)&&(t=BLK_LENGTH)&&(n1) + { + MPI_Reduce(smx0,&Q,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&Q,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + Q=smx0[0]; + + pi=4.0*atan(1.0); + + return Q/(8.0*pi*pi); +} + + +double tcharge_slices(double *qsl) +{ + int bc,tmx; + int n,ix,t,t0; + double pi,fact,Q; + + ft=ftensor(); + bc=bc_type(); + if (bc==0) + tmx=N0-1; + else + tmx=N0; + + for (t=0;t0)&&(t=BLK_LENGTH)&&(n1) + { + MPI_Reduce(qsl0,qsl,N0,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(qsl,N0,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + for (t=0;t +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "linalg.h" +#include "tcharge.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define MAX_LEVELS 12 +#define BLK_LENGTH 8 + +static int cnt[L0][MAX_LEVELS]; +static double smx[L0][MAX_LEVELS],asl0[N0]; +static u3_alg_dble **ft; + + +static double 
prodXX(u3_alg_dble *X) +{ + double sm; + + sm=(-2.0/3.0)*((*X).c1+(*X).c2+(*X).c3)*((*X).c1+(*X).c2+(*X).c3)+ + 2.0*((*X).c1*(*X).c1+(*X).c2*(*X).c2+(*X).c3*(*X).c3)+ + 4.0*((*X).c4*(*X).c4+(*X).c5*(*X).c5+(*X).c6*(*X).c6+ + (*X).c7*(*X).c7+(*X).c8*(*X).c8+(*X).c9*(*X).c9); + + return sm; +} + + +static double density(int ix) +{ + double sm; + + sm=prodXX(ft[0]+ix)+prodXX(ft[1]+ix)+prodXX(ft[2]+ix)+ + prodXX(ft[3]+ix)+prodXX(ft[4]+ix)+prodXX(ft[5]+ix); + + return sm; +} + + +double ym_action(void) +{ + int bc,tmx; + int n,ix,t,*cnt0; + double s,*smx0; + + ft=ftensor(); + cnt0=cnt[0]; + smx0=smx[0]; + + for (n=0;n0)&&(t=BLK_LENGTH)&&(n1) + { + MPI_Reduce(smx0,&s,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&s,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + s=smx0[0]; + + return 0.5*s; +} + + +double ym_action_slices(double *asl) +{ + int bc,tmx; + int n,ix,t,t0; + double s; + + ft=ftensor(); + bc=bc_type(); + if (bc==0) + tmx=N0-1; + else + tmx=N0; + + for (t=0;t0)&&(t=BLK_LENGTH)&&(n1) + { + MPI_Reduce(asl0,asl,N0,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(asl,N0,MPI_DOUBLE,0,MPI_COMM_WORLD); + } + else + { + for (t=0;t +#include +#include +#include "mpi.h" +#include "su3.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "global.h" + +static const int plns[6][2]={{0,1},{0,2},{0,3},{2,3},{3,1},{1,2}}; +static int bc,np,nfc[8],ofs[8],hofs[8],tags[8],nmu[8]; +static const su3_dble ud0={{0.0}}; +static su3_dble wd ALIGNED16; +static su3_dble *hdb=NULL; + + +static void set_ofs(void) +{ + int ifc; + + bc=bc_type(); + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; + + nfc[0]=FACE0/2; + nfc[1]=FACE0/2; + nfc[2]=FACE1/2; + nfc[3]=FACE1/2; + nfc[4]=FACE2/2; + nfc[5]=FACE2/2; + nfc[6]=FACE3/2; + nfc[7]=FACE3/2; + + ofs[0]=0; + ofs[1]=ofs[0]+(FACE0/2); + ofs[2]=ofs[1]+(FACE0/2); + ofs[3]=ofs[2]+(FACE1/2); + ofs[4]=ofs[3]+(FACE1/2); + ofs[5]=ofs[4]+(FACE2/2); + ofs[6]=ofs[5]+(FACE2/2); + 
ofs[7]=ofs[6]+(FACE3/2); + + hofs[0]=0; + hofs[1]=hofs[0]+3*FACE0; + hofs[2]=hofs[1]+3*FACE0; + hofs[3]=hofs[2]+3*FACE1; + hofs[4]=hofs[3]+3*FACE1; + hofs[5]=hofs[4]+3*FACE2; + hofs[6]=hofs[5]+3*FACE2; + hofs[7]=hofs[6]+3*FACE3; + + for (ifc=0;ifc<8;ifc++) + { + nmu[ifc]=cpr[ifc/2]&0x1; + tags[ifc]=mpi_permanent_tag(); + } +} + + +static void alloc_hdb(void) +{ + int ifc,n,ib; + su3_dble unity; + + error(iup[0][0]==0,1,"alloc_hdb [bstap.c]", + "Geometry arrays are not set"); + + set_ofs(); + n=0; + + for (ifc=0;ifc<8;ifc+=2) + { + if (n1)&&(hdb==NULL)) + alloc_hdb(); + + return hdb; +} + + +static void get_ofs(int mu,int nu,int ix,int *ip) +{ + int n,is; + + for (n=0;n<6;n++) + { + if (((plns[n][0]==mu)&&(plns[n][1]==nu))|| + ((plns[n][0]==nu)&&(plns[n][1]==mu))) + { + plaq_uidx(n,ix,ip); + + if (mu==plns[n][0]) + { + is=ip[0]; + ip[0]=ip[2]; + ip[2]=is; + + is=ip[1]; + ip[1]=ip[3]; + ip[3]=is; + } + + return; + } + } +} + + +static void get_staples(int ifc) +{ + int ib,ix,mu,nu,k,ip[4]; + su3_dble *udb,*sbuf; + + udb=udfld(); + sbuf=hdb+3*BNDRY; + mu=ifc/2; + + for (ib=0;ib<(2*nfc[ifc]);ib++) + { + if (ib<(nfc[ifc])) + ix=map[ofs[ifc]+ib]; + else + ix=map[(BNDRY/2)+ofs[ifc]+ib-nfc[ifc]]; + + for (k=0;k<3;k++) + { + nu=k+(k>=mu); + get_ofs(mu,nu,ix,ip); + + if (ifc&0x1) + { + if ((mu>0)||(cpr[0]>0)||(bc==3)) + { + su3xsu3dag(udb+ip[3],udb+ip[1],&wd); + su3xsu3(udb+ip[2],&wd,sbuf); + } + } + else + { + if ((mu>0)||(cpr[0]<(NPROC0-1))||(bc==3)) + { + su3xsu3(udb+ip[0],udb+ip[1],&wd); + su3dagxsu3(udb+ip[2],&wd,sbuf); + } + } + + sbuf+=1; + } + } +} + + +static void send_staples(int ifc,int tag) +{ + int saddr,raddr,nbf,ib; + su3_dble *sbuf,*rbuf; + MPI_Status stat; + + saddr=npr[ifc^0x1]; + raddr=saddr; + sbuf=hdb+3*BNDRY; + rbuf=hdb+hofs[ifc^0x1]; + nbf=108*nfc[ifc]; + + if ((ifc>1)||(bc==3)|| + ((ifc==1)&&(cpr[0]>0))||((ifc==0)&&(cpr[0]<(NPROC0-1)))) + { + if (np==0) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + 
MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } + else + { + for (ib=0;ib<(3*FACE0);ib++) + rbuf[ib]=ud0; + } +} + + +void set_bstap(void) +{ + int ifc,sfc; + + if (query_flags(UDBUF_UP2DATE)!=1) + copy_bnd_ud(); + + if (NPROC>1) + { + if (hdb==NULL) + alloc_hdb(); + + for (ifc=0;ifc<8;ifc++) + { + sfc=ifc^nmu[ifc]; + + if (nfc[sfc]>0) + { + get_staples(sfc); + send_staples(sfc,tags[ifc]); + } + } + } + + set_flags(SET_BSTAP); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/uflds/plaq_sum.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/uflds/plaq_sum.c new file mode 100644 index 0000000000000000000000000000000000000000..8ef0c970fb506312dbdd64e830881e39d394f2f4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/uflds/plaq_sum.c @@ -0,0 +1,307 @@ + +/******************************************************************************* +* +* File plaq_sum.c +* +* Copyright (C) 2005, 2011, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Calculation of plaquette sums. +* +* The externally accessible functions are +* +* double plaq_sum_dble(int icom) +* Returns the sum of Re[tr{U(p)}] over all unoriented plaquettes p, +* where U(p) is the product of the double-precision link variables +* around p. If icom=1 the global sum of the local sums is returned +* and otherwise just the local sum. +* +* double plaq_wsum_dble(int icom) +* Same as plaq_sum_dble(), but giving weight 1/2 to the contribution +* of the space-like plaquettes at the boundaries of the lattice if +* boundary conditions of type 0,1 or 2 are chosen. 
+* +* double plaq_action_slices(double *asl) +* Computes the time-slice sums asl[x0] of the tree-level O(a)-improved +* plaquette action density of the double-precision gauge field. The +* factor 1/g0^2 is omitted and the time x0 runs from 0 to NPROC0*L0-1. +* The program returns the total action. +* +* Notes: +* +* The Wilson plaquette action density is defined so that it converges to the +* Yang-Mills action in the classical continuum limit with a rate proportional +* to a^2. In particular, at the boundaries of the lattice (if there are any), +* the space-like plaquettes are given the weight 1/2 and the contribution of +* a plaquette p in the bulk is 2*Re[tr{1-U(p)}]. +* +* The time-slice sum asl[x0] computed by plaq_action_slices() includes the +* full contribution to the action of the space-like plaquettes at time x0 and +* 1/2 of the contribution of the time-like plaquettes at time x0 and x0-1. +* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. 
+* +*******************************************************************************/ + +#define PLAQ_SUM_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "flags.h" +#include "su3fcts.h" +#include "lattice.h" +#include "uflds.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define MAX_LEVELS 8 +#define BLK_LENGTH 8 + +static int cnt[L0][MAX_LEVELS]; +static double smE[L0][MAX_LEVELS],smB[L0][MAX_LEVELS]; +static double aslE[N0],aslB[N0]; +static su3_dble *udb; +static su3_dble wd1,wd2 ALIGNED16; + + +static double plaq_dble(int n,int ix) +{ + int ip[4]; + double sm; + + plaq_uidx(n,ix,ip); + + su3xsu3(udb+ip[0],udb+ip[1],&wd1); + su3dagxsu3dag(udb+ip[3],udb+ip[2],&wd2); + cm3x3_retr(&wd1,&wd2,&sm); + + return sm; +} + + +static double local_plaq_sum_dble(int iw) +{ + int bc,n,ix,t,*cnt0; + double wp,pa,*smx0; + + bc=bc_type(); + + if (iw==0) + wp=1.0; + else + wp=0.5; + + udb=udfld(); + cnt0=cnt[0]; + smx0=smE[0]; + + for (n=0;n0)||(bc==3))&&((t<(N0-1))||(bc!=0))) + { + for (n=3;n<6;n++) + pa+=plaq_dble(n,ix); + } + else + { + for (n=3;n<6;n++) + pa+=wp*plaq_dble(n,ix); + } + + if ((t==(N0-1))&&((bc==1)||(bc==2))) + pa+=9.0*wp; + + cnt0[0]+=1; + smx0[0]+=pa; + + for (n=1;(cnt0[n-1]>=BLK_LENGTH)&&(n1)&&(icom==1)) + { + MPI_Reduce(&p,&pa,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&pa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + p=pa; + } + + return p; +} + + +double plaq_wsum_dble(int icom) +{ + double p,pa; + + if (query_flags(UDBUF_UP2DATE)!=1) + copy_bnd_ud(); + + p=local_plaq_sum_dble(1); + + if ((NPROC>1)&&(icom==1)) + { + MPI_Reduce(&p,&pa,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + MPI_Bcast(&pa,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + p=pa; + } + + return p; +} + + +double plaq_action_slices(double *asl) +{ + int bc,n,ix,t,t0; + double sE,sB,A; + + if (query_flags(UDBUF_UP2DATE)!=1) + copy_bnd_ud(); + + bc=bc_type(); + t0=cpr[0]*L0; + udb=udfld(); + + for (t=0;t0)||(bc!=1)) + { + for (n=3;n<6;n++) + 
sB+=(3.0-plaq_dble(n,ix)); + } + + t-=t0; + smE[t][0]+=sE; + smB[t][0]+=sB; + cnt[t][0]+=1; + + for (n=1;(cnt[t][n-1]>=BLK_LENGTH)&&(n +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "flags.h" +#include "lattice.h" +#include "uflds.h" +#include "global.h" + +typedef struct +{ + int saddr,raddr; + int iu,*idx; +} comlink_t; + +typedef struct +{ + int saddr,raddr; + int **idx; +} comstar_t; + +static int init=0,bs[4],np[4],nlk[4],npt[4],ofs[4]; +static su3_dble *sdbuf=NULL,*rdbuf; +static comlink_t comlink[4]; +static comstar_t comstar[8]; + + +static void set_const(void) +{ + int mu,ifc,iu; + comlink_t *cl; + comstar_t *cs; + + bs[0]=L0; + bs[1]=L1; + bs[2]=L2; + bs[3]=L3; + + np[0]=NPROC0; + np[1]=NPROC1; + np[2]=NPROC2; + np[3]=NPROC3; + + nlk[0]=FACE0/2; + nlk[1]=FACE1/2; + nlk[2]=FACE2/2; + nlk[3]=FACE3/2; + + ofs[0]=FACE0/2; + ofs[1]=ofs[0]+(FACE0+FACE1)/2; + ofs[2]=ofs[1]+(FACE1+FACE2)/2; + ofs[3]=ofs[2]+(FACE2+FACE3)/2; + + npt[0]=VOLUME/L0; + npt[1]=VOLUME/L1; + npt[2]=VOLUME/L2; + npt[3]=VOLUME/L3; + + cl=comlink; + iu=4*VOLUME; + + for (mu=0;mu<4;mu++) + { + (*cl).saddr=npr[2*mu]; + (*cl).raddr=npr[2*mu+1]; + (*cl).iu=iu; + iu+=nlk[mu]; + cl+=1; + } + + cs=comstar; + + for (ifc=0;ifc<8;ifc++) + { + (*cs).saddr=npr[ifc]; + (*cs).raddr=npr[ifc^0x1]; + cs+=1; + } +} + + +static void alloc_idx(void) +{ + int mu,ifc,t,**id,*ib; + comlink_t *cl; + comstar_t *cs; + + cl=comlink; + + if (NPROC>1) + { + ib=malloc((BNDRY/4)*sizeof(*ib)); + error(ib==NULL,1,"alloc_idx [shift.c]", + "Unable to allocate index arrays"); + } + else + ib=NULL; + + for (mu=0;mu<4;mu++) + { + if (np[mu]>1) + { + (*cl).idx=ib; + ib+=nlk[mu]; + } + else + (*cl).idx=NULL; + + cl+=1; + } + + cs=comstar; + id=malloc(2*(L0+L1+L2+L3)*sizeof(*id)); + ib=malloc(16*VOLUME*sizeof(*ib)); + error((id==NULL)||(ib==NULL),1,"alloc_idx [shift.c]", + "Unable to allocate index arrays"); + + for (ifc=0;ifc<8;ifc++) + { + mu=ifc/2; + (*cs).idx=id; + id+=bs[mu]; + + if 
((ifc&0x1)==0) + { + for (t=0;t=(VOLUME/2)) + return 8*(ix-(VOLUME/2))+2*mu; + + iy=iup[ix][mu]; + + if (iy1) + { + idx=(comlink[mu]).idx; + + for (ix=0;ixn) + n=npt[mu]; + } + + sdbuf=amalloc(8*n*sizeof(su3_dble),ALIGN); + error(sdbuf==NULL,1,"alloc_udbufs [shift.c]", + "Unable to allocate communication buffers"); + + rdbuf=sdbuf+4*n; +} + + +static void get_udlinks(void) +{ + int mu,*idx,*idm; + int tag,ip,saddr,raddr,nbf; + su3_dble *ub,*u,*sb,*rb; + comlink_t *cl; + MPI_Status stat; + + ub=udfld(); + ip=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; + cl=comlink; + + for (mu=0;mu<4;mu++) + { + if (np[mu]>1) + { + u=sdbuf; + idx=(*cl).idx; + idm=idx+nlk[mu]; + + for (;idx1) + { + tag=mpi_tag(); + nbf=18*nlk[mu]; + saddr=(*cl).raddr; + raddr=(*cl).saddr; + sb=ub+(*cl).iu; + rb=sdbuf; + + if (ip==0) + { + MPI_Send(sb,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rb,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rb,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sb,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + + u=sdbuf; + idx=(*cl).idx; + idm=idx+nlk[mu]; + + for (;idx1) + u=sdbuf; + else + u=rdbuf; + + idx=(*cs).idx[0]; + idm=idx+4*npt[mu]; + + for (;idx1) + { + tag=mpi_tag(); + nbf=72*npt[mu]; + saddr=(*cs).saddr; + raddr=(*cs).raddr; + sb=sdbuf; + rb=rdbuf; + + if (ip==0) + { + MPI_Send(sb,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rb,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + MPI_Recv(rb,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sb,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + } +} + + +static void shift_udstars(int ifc) +{ + int mu,t; + int *id0,*id1,*idm; + su3_dble *ub,*u; + comstar_t *cs; + + get_udstars(ifc); + + ub=udfld(); + cs=comstar+ifc; + mu=ifc/2; + + for (t=0;t<(bs[mu]-1);t++) + { + id0=(*cs).idx[t]; + id1=(*cs).idx[t+1]; + idm=id0+4*npt[mu]; + + for (;id01) + { + iprms[0]=s[0]; + iprms[1]=s[1]; + iprms[2]=s[2]; + iprms[3]=s[3]; + + 
MPI_Bcast(iprms,4,MPI_INT,0,MPI_COMM_WORLD); + + error((iprms[0]!=s[0])||(iprms[1]!=s[1])|| + (iprms[2]!=s[2])||(iprms[3]!=s[3]),1, + "shift_ud [shift.c]","Shift vector is not global"); + } + + if (sdbuf==NULL) + alloc_udbufs(); + + for (mu=0;mu<4;mu++) + { + n=np[mu]*bs[mu]; + + if (abs(s[mu])>(n/2)) + { + sr[mu]=safe_mod(s[mu],n); + + if (sr[mu]>(n/2)) + sr[mu]-=n; + } + else + sr[mu]=s[mu]; + } + + if ((sr[0]==0)&&(sr[1]==0)&&(sr[2]==0)&&(sr[3]==0)) + return 0; + + error_root((sr[0]!=0)&&(bc_type()!=3),1,"shift_ud [shift.c]", + "Shifts in time are only permitted for periodic bc"); + + if (sr[0]!=0) + { + ie=chs_ubnd(1); + error_root(ie==1,1,"shift_ud [shift.c]", + "Attempt to move sign-changed link variables in time"); + } + + get_udlinks(); + n=0; + + for (mu=0;mu<4;mu++) + { + if (sr[mu]>=0) + ifc=2*mu+1; + else + ifc=2*mu; + + for (t=0;t +#include +#include +#include "mpi.h" +#include "su3.h" +#include "utils.h" +#include "flags.h" +#include "lattice.h" +#include "uflds.h" +#include "global.h" + +static int bc,np; +static su3_dble *sbuf=NULL,*rbuf; +static uidx_t *idx; + + +static void alloc_sbuf(void) +{ + int mu,nuk,n; + + bc=bc_type(); + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; + idx=uidx(); + n=0; + + for (mu=0;mu<4;mu++) + { + nuk=idx[mu].nuk; + + if (nuk>n) + n=nuk; + } + + sbuf=amalloc(n*sizeof(*sbuf),ALIGN); + error(sbuf==NULL,1,"alloc_sbuf [udcom.c]", + "Unable to allocate send buffer"); +} + + +static void pack_ud0(int mu) +{ + int nu0,*iu,*ium; + su3_dble *u,*udb; + + udb=udfld(); + nu0=idx[mu].nu0; + + if (nu0>0) + { + u=sbuf; + iu=idx[mu].iu0; + ium=iu+nu0; + + for (;iu0)&&((mu>0)||(cpr[0]>0)||(bc==3))) + { + u=sbuf; + iu=idx[mu].iuk; + ium=iu+nuk; + + for (;iu0) + { + tag=mpi_tag(); + saddr=npr[2*mu]; + raddr=npr[2*mu+1]; + nbf=18*nu0; + + if (np==0) + { + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + 
MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + + rbuf+=nu0; + } +} + + +static void send_udk(int mu) +{ + int nuk,nbf; + int tag,saddr,raddr; + MPI_Status stat; + + nuk=idx[mu].nuk; + + if (nuk>0) + { + tag=mpi_tag(); + saddr=npr[2*mu]; + raddr=npr[2*mu+1]; + nbf=18*nuk; + + if (np==0) + { + if ((mu>0)||(cpr[0]>0)||(bc==3)) + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + if ((mu>0)||(cpr[0]<(NPROC0-1))||(bc==3)) + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + } + else + { + if ((mu>0)||(cpr[0]<(NPROC0-1))||(bc==3)) + MPI_Recv(rbuf,nbf,MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&stat); + if ((mu>0)||(cpr[0]>0)||(bc==3)) + MPI_Send(sbuf,nbf,MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD); + } + + rbuf+=nuk; + } +} + + +void copy_bnd_ud(void) +{ + int mu; + + if (NPROC>1) + { + if (sbuf==NULL) + alloc_sbuf(); + + rbuf=udfld()+4*VOLUME; + + for (mu=0;mu<4;mu++) + { + pack_ud0(mu); + send_ud0(mu); + } + + for (mu=0;mu<4;mu++) + { + pack_udk(mu); + send_udk(mu); + } + } + + set_flags(COPIED_BND_UD); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/uflds/uflds.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/uflds/uflds.c new file mode 100644 index 0000000000000000000000000000000000000000..33d21a1855a0ba2531a3e2d65e806c92a739adc2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/uflds/uflds.c @@ -0,0 +1,299 @@ + +/******************************************************************************* +* +* File uflds.c +* +* Copyright (C) 2006, 2010, 2011, 2012, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Allocation and initialization of the global gauge fields. +* +* The externally accessible functions are +* +* su3 *ufld(void) +* Returns the base address of the single-precision gauge field. 
If it +* is not already allocated, the field is allocated and initialized to +* unity. +* +* su3_dble *udfld(void) +* Returns the base address of the double-precision gauge field. If it +* is not already allocated, the field is allocated and initialized to +* unity. Then the boundary conditions are set according to the data +* base by calling set_bc() [bcnds.c]. +* +* void random_ud(void) +* Initializes the active double-precision link variables to uniformly +* distributed random SU(3) matrices. The static link variables are +* left untouched. +* +* void renormalize_ud(void) +* Projects the active double-precision link variables back to SU(3). +* The static link variables are left untouched. +* +* void assign_ud2u(void) +* Assigns the double-precision gauge field to the single-precision +* gauge field. All link variables in the local field, including the +* static ones, are copied. +* +* Notes: +* +* The double-precision field can only be allocated after the geometry arrays +* are set up. All programs in this module act globally and must be called on +* all MPI processes simultaneously. 
+* +*******************************************************************************/ + +#define UFLDS_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "global.h" + +#define N0 (NPROC0*L0) + +static const su3 u0={{0.0f}}; +static const su3_dble ud0={{0.0}}; +static su3 *ub=NULL; +static su3_dble *udb=NULL; + + +static void alloc_u(void) +{ + size_t n; + su3 unity,*u,*um; + + error_root(sizeof(su3)!=(18*sizeof(float)),1,"alloc_u [uflds.c]", + "The su3 structures are not properly packed"); + + n=4*VOLUME; + ub=amalloc(n*sizeof(*ub),ALIGN); + error(ub==NULL,1,"alloc_u [uflds.c]", + "Unable to allocate memory space for the gauge field"); + + unity=u0; + unity.c11.re=1.0f; + unity.c22.re=1.0f; + unity.c33.re=1.0f; + u=ub; + um=ub+n; + + for (;u P-eps*F,U + + TU(eps): P,U -> P,exp(eps*P)*U + +for the momentum field P and the gauge field U, where eps denotes the +integration step size and F the force field that is integrated. Explicitly, + + LPFR(eps,F) = TP(0.5*eps,F)* + TU(eps)* + TP(0.5*eps,F) + + OMF2(eps,F) = TP(r0*eps,F)* + TU(eps/2)* + TP((1-2*r0)*eps,F)* + TU(eps/2)* + TP(r0*eps,F) + + OMF4(eps,F) = TP(r1*eps,F)* + TU(r2*eps)* + TP(r3*eps,F)* + TU(r4*eps)* + TP((1/2-r1-r3)*eps,F)* + TU((1-2*(r2+r4))*eps)* + TP((1/2-r1-r3)*eps,F)* + TU(r4*eps)* + TP(r3*eps,F)* + TU(r2*eps)* + TP(r1*eps,F) + +where r0 is a tunable parameter, usually set to a value in the range from 1/6 +to 0.2, while + + r1=0.08398315262876693 + r2=0.2539785108410595 + r3=0.6822365335719091 + r4=-0.03230286765269967 + +are already fine-tuned values (in the literature, the parameter r0 is often +denoted by lambda, and this convention is also adopted by the parameter data +base in flags/mdint_parms.c). 
+ + +Hierarchical integrators +------------------------ + +If the force F=F0+F1 is a sum of two contributions, one may integrate the two +parts with different integration step sizes. This can be achieved using an +integrator with two levels. At the innermost level, the force F0 is integrated +using a power of an elementary integrator such as + + I0(n0*eps,F0) = OMF4(eps,F0)*...*OMF4(eps,F0) (n0 factors). + +The two-level integrator is then constructed starting from a power of an +elementary integrator for F1, with step size n0*eps, and by replacing all +update steps TU(dt) in that integrator by I0(dt,F0). An example of such an +integrator is + + I1(n1*n0*eps,F1,F0) = OMF2(n0*eps)|_{TU->I0}*... (n1 factors). + +The MD equations are integrated from time t to time t+n1*n0*eps in this case. + +Hierarchical integrators with any number of levels are obtained in the same +way by proceeding from the lowest to the higher levels one by one. An +integrator of this kind is specified by + + tau Trajectory length + nlv Number of levels + +and, for each level, + + integrator Elementary integrator (LPFR, OMF2 or OMF4) + lambda Parameter of the 2nd order OMF integrator + nstep Power of the elementary integrator + nfr Number of forces integrated at this level + ifr Indices of these forces + +The force indices refer to the force data base (see forces/README.forces). The +step sizes need not be given, since these can be inferred from the data +provided. + + +Parameter data base +------------------- + +The data characterizing a hierarchical integrator are administered by the +parameter data base in the directory modules/flags.
The data base consists +of various components that are managed by the following modules: + +hmc_parms.c Basic HMC parameters +mdint_parms.c Descriptions of the integrator levels +action_parms.c Action parameter sets +force_parms.c Force parameter sets +solver_parms.c Solver parameter sets + +In the last three cases, the different parameter sets are labeled by an +integer index. Detailed descriptions of the available parameters are given at +the top of these modules. + +In a main program, the desired parameter sets must first be entered into the +data base. They can then be referred to by their index (where appropriate) and +be easily retrieved in any subprogram. See main/qcd1.in for an example of a +parameter file that contains a complete specification of an MD integrator. + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/update/chrono.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/update/chrono.c new file mode 100644 index 0000000000000000000000000000000000000000..eee28edef2e192c848d1b050d7a031d45bec4a8e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/update/chrono.c @@ -0,0 +1,366 @@ + +/******************************************************************************* +* +* File chrono.c +* +* Copyright (C) 2007, 2011, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Programs needed for the propagation of solutions of the Dirac equation +* along the molecular-dynamics trajectories +* +* The externally accessible functions are +* +* void setup_chrono(void) +* Allocates the required memory space for the stacks of previous +* solutions to be used in the course of the molecular-dynamics +* trajectories. The number and size of the stacks is inferred from +* the parameter data base. +* +* double mdtime(void) +* Returns the current molecular-dynamics time. 
+* +* void step_mdtime(double dt) +* Advances the molecular-dynamics time by dt. +* +* void add_chrono(int icr,spinor_dble *psi) +* Adds the solution psi obtained at the current molecular-dynamics +* time to the stack number icr of previously calculated solutions. +* +* int get_chrono(int icr,spinor_dble *psi) +* Extrapolates the solutions stored in the stack number icr to the +* current molecular-dynamics time. The program returns 0 and leaves +* psi unchanged if the stack does not contain any previous solutions. +* Otherwise the program assigns the extrapolated solution to psi and +* returns 1. +* +* void reset_chrono(void) +* Sets the molecular-dynamics time and all counters of previously +* computed solutions to zero. +* +* Notes: +* +* The propagation of the solutions of the Dirac equation was proposed by +* +* R.C. Brower et al., "Chronological inversion method for the Dirac +* matrix in Hybrid Monte Carlo", Nucl. Phys. B484 (1997) 353 +* +* Here the solutions are propagated using a polynomial extrapolation. The +* maximal number of solutions to be kept in memory can be chosen for each +* solution stack separately. +* +* Each quark force specified in the parameter data base may have up to 4 +* solution stacks associated with it (see flags/force_parms.c). In all +* cases, the chronological propagation of the solutions can be turned off +* by setting the maximal numbers of fields to be kept in memory to zero. +* Internally the stacks are labeled by an index icr>=0, where the empty +* stack has index icr=0 and all other stacks have index icr>0. The stack +* indices are included in the force parameter sets. +* +* The module includes a clock that serves to keep track of the molecular- +* dynamics times at which the Dirac equation is solved. The clock is +* advanced by the molecular-dynamics integrator (see update/mdint.c). +* +* All programs in this module should be called simultaneously on all MPI +* processes. 
+* +*******************************************************************************/ + +#define CHRONO_C + +#include +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "sflds.h" +#include "linalg.h" +#include "update.h" +#include "global.h" + +typedef struct +{ + int ncr; + int isd,nsd; + double *ta; + spinor_dble **sd; +} stack_t; + +static int nst=0; +static double mdt=0.0; +static stack_t *st=NULL; + + +static void init_stacks(void) +{ + int ncr,icr,k; + double *ta; + spinor_dble **sd; + + for (icr=0;icr0) + { + free(st[1].ta); + afree(st[1].sd[0]); + free(st[1].sd); + free(st); + + nst=0; + st=NULL; + } +} + + +static void alloc_stacks(void) +{ + int i,j,k; + hmc_parms_t hmc; + mdint_parms_t mdp; + force_parms_t fp; + + hmc=hmc_parms(); + + for (i=0;inst) + nst=fp.icr[k]; + } + } + } + + if (nst>0) + { + nst+=1; + st=malloc(nst*sizeof(*st)); + error(st==NULL,1,"alloc_stacks [chrono.c]", + "Unable to allocate stack structures"); + + for (i=0;i0) + { + ncr=0; + + for (icr=0;icr0) + { + ta=malloc(ncr*sizeof(*ta)); + sd=malloc(ncr*sizeof(*sd)); + s=amalloc(ncr*VOLUME*sizeof(*s),ALIGN); + + error((ta==NULL)||(sd==NULL)||(s==NULL),1,"alloc_stacks [chrono.c]", + "Unable to allocate field stacks"); + } + else + { + ta=NULL; + sd=NULL; + s=NULL; + } + + for (icr=1;icr0) + { + st[icr].ta=ta; + st[icr].sd=sd; + + for (k=0;k0)&&(icr=ncr) + jsd-=ncr; + + st[icr].ta[jsd]=mdt; + assign_sd2sd(VOLUME,psi,st[icr].sd[jsd]); + st[icr].nsd+=1; + } + } + else + error_loc(1,1,"add_chrono [chrono.c]","Unknown field stack"); +} + + +int get_chrono(int icr,spinor_dble *psi) +{ + int ncr,nsd,isd; + int k,l,ksd,lsd; + double *ta,c; + spinor_dble **sd; + + if ((icr>0)&&(icr=ncr) + ksd-=ncr; + c=1.0; + + for (l=0;l=ncr) + lsd-=ncr; + + c*=((mdt-ta[lsd])/(ta[ksd]-ta[lsd])); + } + } + + mulr_spinor_add_dble(VOLUME,psi,sd[ksd],c); + } + + return 1; + } + else if (icr==0) + return 0; + else + { + error_loc(1,1,"get_chrono 
[chrono.c]","Unknown field stack"); + return 0; + } +} + + +void reset_chrono(void) +{ + int icr; + + for (icr=0;icr +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "update.h" +#include "global.h" + +typedef struct +{ + int n,ns; + int *status; +} counter_t; + +static int nac=0,nfd=0,nfr=0,nmd=0; +static counter_t *act=NULL,*fld=NULL,*frc=NULL,*mds=NULL; + + +static void free_cnt(int nc,counter_t *cnt) +{ + int i; + + for (i=0;i0) + { + free(cnt[i].status); + free(cnt); + break; + } + } +} + + +static counter_t *alloc_cnt(int nc) +{ + int i; + counter_t *cnt; + + if (nc>0) + { + cnt=malloc(nc*sizeof(*cnt)); + error(cnt==NULL,1,"alloc_cnt [counters.c]", + "Unable to allocate counters"); + + for (i=0;i=nac) + nac=j+1; + + sp=solver_parms(ap.isp[0]); + if (sp.solver==DFL_SAP_GCR) + nmd=3; + + if ((ap.action==ACF_TM2)|| + (ap.action==ACF_TM2_EO)) + { + if (ap.ipf>=nfd) + nfd=ap.ipf+1; + + sp=solver_parms(ap.isp[1]); + if (sp.solver==DFL_SAP_GCR) + nmd=3; + } + + if ((ap.action==ACF_RAT)|| + (ap.action==ACF_RAT_SDET)) + { + if (ap.ipf>=nfd) + nfd=ap.ipf+1; + } + } + } + + for (i=0;i=nfr) + nfr=k+1; + + sp=solver_parms(fp.isp[0]); + if (sp.solver==DFL_SAP_GCR) + nmd=3; + } + } + } +} + + +static void set_ns(void) +{ + int i,j,k; + hmc_parms_t hmc; + mdint_parms_t mdp; + action_parms_t ap; + force_parms_t fp; + solver_parms_t sp; + + hmc=hmc_parms(); + + for (i=0;i0) + { + mds[0].ns=1; + mds[1].ns=1; + mds[2].ns=1; + } +} + + +static void alloc_stat(int nc,counter_t *cnt) +{ + int i,ns,*stat; + + if (nc>0) + { + ns=0; + + for (i=0;i0) + { + cnt[i].status=stat; + stat+=cnt[i].ns; + } + } + } +} + + +void setup_counters(void) +{ + free_cnt(nac,act); + free_cnt(nfd,fld); + free_cnt(nfr,frc); + free_cnt(nmd,mds); + + set_nc(); + act=alloc_cnt(nac); + fld=alloc_cnt(nfd); + frc=alloc_cnt(nfr); + mds=alloc_cnt(nmd); + + set_ns(); + alloc_stat(nac,act); + alloc_stat(nfd,fld); + alloc_stat(nfr,frc); + alloc_stat(nmd,mds); + + 
clear_counters(); +} + + +static void set_cnt2zero(int nc,counter_t *cnt) +{ + int i,j,ns,*stat; + + for (i=0;i=0)&&(idx=0)&&(idx=0)&&(idx = ",idx); + } + } + else if (strcmp(type,"modes")==0) + { + if ((idx>=0)&&(idx = ",idx); + } + } + else if (strcmp(type,"action")==0) + { + if ((idx>=0)&&(idx = ",idx); + } + } + else if (strcmp(type,"field")==0) + { + if ((idx>=0)&&(idx = ",idx); + } + } + else + { + error_loc(1,1,"print_avgstat [counters.c]","Unknown counter type"); + return; + } + + if (ns>0) + { + if ((strcmp(type,"modes")==0)&&(idx==0)) + { + n+=mds[2].n; + + if (n>0) + r=1.0/(double)(n); + else + r=1.0; + + printf("%d",(int)((double)(stat[0]+mds[2].status[0])*r+0.5)); + + if (mds[2].n>0) + printf(" (no of regenerations = %d)",mds[2].n); + } + else + { + if (n>0) + r=1.0/(double)(n); + else + r=1.0; + + printf("%d",(int)((double)(stat[0])*r+0.5)); + + for (i=1;i0) + print_avgstat("action",i); + } + + for (i=0;i0) + print_avgstat("field",i); + } + + for (i=0;i0) + print_avgstat("force",i); + } + + for (i=0;i<(nmd-1);i++) + { + if (mds[i].ns>0) + print_avgstat("modes",i); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/update/hmc.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/update/hmc.c new file mode 100644 index 0000000000000000000000000000000000000000..b485aa7dc4c0e321239fecff2065671e2c29abc5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/update/hmc.c @@ -0,0 +1,791 @@ + +/******************************************************************************* +* +* File hmc.c +* +* Copyright (C) 2005, 2007, 2009-2013 Martin Luescher, Filippo Palombi, +* Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* HMC simulation algorithm. 
+* +* The externally accessible functions are +* +* void hmc_sanity_check(void) +* Performs various checks on the chosen parameters for the HMC +* algorithm and terminates with an error message if an inconsistency +* is discovered. +* +* void hmc_wsize(int *nwud,int *nws,int *nwsd,int *nwv,int *nwvd) +* Determines the minimal sizes of the workspaces required for the +* HMC algorithm based on the information in the parameter data base. +* On exit the program returns the numbers of double-precision gauge +* (nwud), spinor (nwsd) and complex vector (nwvd) fields as well as +* the numbers of single-precision spinor (nws) and complex vector +* (nwv) fields that must be allocated. +* +* int run_hmc(double *act0,double *act1) +* Generates a random momentum field, integrates the MD equations and +* applies the HMC acceptance step to the fields at the end of the MD +* trajectory (see the notes). +* The arrays act0 and act1 must have at least nact+1 elements, where +* nact is the number of actions that take part in the HMC algorithm +* (see flags/hmc_parms.c). On exit act0 and act1 contain the part of +* the actions computed on the local lattice at the beginning and the +* end of the MD evolution (see the notes). +* The program returns 1 or 0 depending on whether the field generated +* by the molecular-dynamics evolution was accepted or not. If it was +* not accepted, the gauge field is restored to its initial value. +* +* Notes: +* +* The molecular-dynamics equations are integrated using the integrator +* specified by the list of elementary operations returned by mdsteps() +* (see update/mdsteps.c and update/mdint.c). The elements of the action +* arrays act0 and act1 are +* +* actx[0] Action of the momentum field, +* actx[1] Gauge field action, +* actx[2+n] Pseudo-fermion action number n, +* +* where the pseudo-fermion actions are counted from 0 in steps of 1, as +* they appear in the action array hmc.iact returned by hmc_parms(). 
+* +* The boundary conditions are imposed as specified in the parameter data +* base (see flags/lat_parms.c). Accepted new gauge field configurations +* are renormalized to SU(3) on all active links. +* +* The programs in this module perform global communications and must be +* called simultaneously on all MPI processes. +* +*******************************************************************************/ + +#define HMC_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "random.h" +#include "su3fcts.h" +#include "flags.h" +#include "lattice.h" +#include "utils.h" +#include "uflds.h" +#include "mdflds.h" +#include "linalg.h" +#include "dfl.h" +#include "forces.h" +#include "update.h" +#include "global.h" + +#define MAX(n,m) \ + if ((n)<(m)) \ + (n)=(m) + +static int nrs=0,*rs; + + +static void init_rs(int nr) +{ + int k; + + if (nr>nrs) + { + if (nrs>0) + free(rs); + + rs=malloc(nr*sizeof(*rs)); + error_root(rs==NULL,1,"init_rs [hmc.c]", + "Unable to allocate auxiliary array"); + nrs=nr; + } + + for (k=0;k=npf); + iemu|=(ap.imu[0]<0); + iemu|=(ap.imu[0]>=nmu); + iem0|=(sea_quark_mass(ap.im0)==DBL_MAX); + + if ((ap.action==ACF_TM2)|| + (ap.action==ACF_TM2_EO)) + { + iemu|=(ap.imu[1]<0); + iemu|=(ap.imu[1]>=nmu); + } + } + else if ((ap.action==ACF_RAT)|| + (ap.action==ACF_RAT_SDET)) + { + iepf|=(ap.ipf<0); + iepf|=(ap.ipf>=npf); + iem0|=(sea_quark_mass(ap.im0)==DBL_MAX); + + rp=rat_parms(ap.irat[0]); + ierat|=(ap.irat[2]>=rp.degree); + } + } + + for (i=0;i=npf); + iemu|=(fp.imu[0]<0); + iemu|=(fp.imu[0]>=nmu); + iem0|=(sea_quark_mass(fp.im0)==DBL_MAX); + + if ((fp.force==FRF_TM2)|| + (fp.force==FRF_TM2_EO)) + { + iemu|=(fp.imu[1]<0); + iemu|=(fp.imu[1]>=nmu); + } + } + else if ((fp.force==FRF_RAT)|| + (fp.force==FRF_RAT_SDET)) + { + iepf|=(fp.ipf<0); + iepf|=(fp.ipf>=npf); + iem0|=(sea_quark_mass(fp.im0)==DBL_MAX); + + rp=rat_parms(fp.irat[0]); + ierat|=(fp.irat[2]>=rp.degree); + } + } + } + + error_root(iepf!=0,1,"hmc_sanity_check 
[hmc.c]", + "Some pseudo-fermion indices are out of range"); + error_root(iemu!=0,1,"hmc_sanity_check [hmc.c]", + "Some twisted-mass indices are out of range"); + error_root(iem0!=0,1,"hmc_sanity_check [hmc.c]", + "Some sea-quark mass indices are out of range"); + error_root(ierat!=0,1,"hmc_sanity_check [hmc.c]", + "Some rational functions are not or not correctly specified"); + error_root(iacg!=1,1,"hmc_sanity_check [hmc.c]", + "Gauge action is missing or occurs several times"); + + ie=0; + + for (k=0;k1) + { + MAX(*nwsd,nsd+np+3); + } + else + { + MAX(*nwsd,nsd+5); + } + } + else if (sp.solver==SAP_GCR) + { + MAX(*nws,2*sp.nkv+1); + MAX(*nwsd,nsd+2); + } + else if (sp.solver==DFL_SAP_GCR) + { + MAX(*nws,2*sp.nkv+2); + MAX(*nwsd,nsd+3); + dfl_wsize(nws,nwv,nwvd); + } +} + + +void hmc_wsize(int *nwud,int *nws,int *nwsd,int *nwv,int *nwvd) +{ + int nlv,nact,*iact; + int nfr,*ifr,nsd,np,i,j; + hmc_parms_t hmc; + mdint_parms_t mdp; + action_parms_t ap; + force_parms_t fp; + solver_parms_t sp; + + (*nwud)=1; + (*nws)=0; + (*nwsd)=0; + (*nwv)=0; + (*nwvd)=0; + + hmc=hmc_parms(); + nlv=hmc.nlv; + nact=hmc.nact; + iact=hmc.iact; + + for (i=0;i0)) + add2counter("modes",2,status+2); +} + + +static void start_hmc(double *act0,su3_dble *uold) +{ + int i,n,nact,*iact; + int status[3]; + double *mu; + su3_dble *udb; + dfl_parms_t dfl; + hmc_parms_t hmc; + action_parms_t ap; + + clear_counters(); + udb=udfld(); + cm3x3_assign(4*VOLUME,udb,uold); + chs_ubnd(-1); + random_mom(); + act0[0]=momentum_action(0); + + dfl=dfl_parms(); + + if (dfl.Ns) + { + dfl_modes2(status); + error_root((status[1]<0)||((status[1]==0)&&(status[0]<0)),1, + "start_hmc [hmc.c]","Deflation subspace generation " + "failed (status = %d;%d)",status[0],status[1]); + + if (status[1]==0) + add2counter("modes",0,status); + else + add2counter("modes",2,status+1); + } + + hmc=hmc_parms(); + nact=hmc.nact; + iact=hmc.iact; + mu=hmc.mu; + n=2; + + for (i=0;i1) + { + r=da; + 
MPI_Reduce(&r,&da,1,MPI_DOUBLE,MPI_SUM,0,MPI_COMM_WORLD); + } + + if (my_rank==0) + { + ranlxd(&r,1); + + if (da<=0.0) + iac=1; + else if (r<=exp(-da)) + iac=1; + } + + if (NPROC>1) + mpc_bcast_i(&iac,1); + + if (iac==0) + { + udb=udfld(); + cm3x3_assign(4*VOLUME,uold,udb); + set_flags(UPDATED_UD); + } + else + { + chs_ubnd(1); + renormalize_ud(); + } + + return iac; +} + + +int run_hmc(double *act0,double *act1) +{ + int iac; + su3_dble **uold; + + uold=reserve_wud(1); + + start_hmc(act0,uold[0]); + run_mdint(); + end_hmc(act1); + iac=accept_hmc(act0,act1,uold[0]); + + release_wud(); + + return iac; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/update/mdint.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/update/mdint.c new file mode 100644 index 0000000000000000000000000000000000000000..b122ad159c28d0ced1c96bd554019dfc63025411 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/update/mdint.c @@ -0,0 +1,451 @@ + +/******************************************************************************* +* +* File mdint.c +* +* Copyright (C) 2011-2013 Stefan Schaefer, Martin Luescher, John Bulava +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Integration of the molecular-dynamics equations. +* +* The externally accessible functions are +* +* void run_mdint(void) +* Integrates the molecular-dynamics equations using the current +* integrator (see the notes). +* +* Notes: +* +* The integrator used is the one defined by the array of elementary operations +* returned by mdsteps() (see update/mdsteps.c). It is assumed that the fields +* and the integrator have been properly initialized. +* +* In the course of the integration, the solver iteration numbers are added +* to the appropriate counters provided by the module update/counters.c. 
+* +* The program in this module performs global communications and must be +* called simultaneously on all MPI processes. +* +* Some debugging information is printed to stdout if the macro MDINT_DBG is +* defined. The norm of the forces printed is the norm per active link. +* +*******************************************************************************/ + +#define MDINT_C + +#include +#include +#include +#include +#include +#include "mpi.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "su3fcts.h" +#include "linalg.h" +#include "dfl.h" +#include "forces.h" +#include "update.h" +#include "global.h" + +#define N0 (NPROC0*L0) +#define N1 (NPROC1*L1) +#define N2 (NPROC2*L2) +#define N3 (NPROC3*L3) + +static int nsm; +static double rtau,dtau; + + +static void chk_mode_regen(int isp,int *status) +{ + int i,is; + solver_parms_t sp; + + sp=solver_parms(isp); + + if (sp.solver==DFL_SAP_GCR) + { + is=status[2]; + + for (i=2;i<4;i++) + status[i]=status[i+1]; + + status[4]=is; + + if (status[4]>0) + add2counter("modes",2,status+4); + if (status[5]>0) + add2counter("modes",2,status+5); + } +} + + +static void update_mom(void) +{ + int bc,ix,t,ifc; + su3_alg_dble *mom,*frc; + mdflds_t *mdfs; + + bc=bc_type(); + mdfs=mdflds(); + mom=(*mdfs).mom; + frc=(*mdfs).frc; + + for (ix=(VOLUME/2);ix0)&&(rtau>dtau)) + { + sp=solver_parms(isp); + + if (sp.solver==DFL_SAP_GCR) + { + dfl_update2(nsm,status); + error_root((status[1]<0)||((status[1]==0)&&(status[0]<0)),1, + "dfl_upd [mdint.c]","Deflation subspace update " + "failed (status = %d;%d)",status[0],status[1]); + + if (status[1]==0) + add2counter("modes",1,status); + else + add2counter("modes",2,status+1); + + rtau=0.0; + } + } +} + +#ifdef MDINT_DBG + +void run_mdint(void) +{ + int my_rank,nop,itu; + int iop,status[6]; + double *mu,eps,nlk,nrm; + mdflds_t *mdfs; + mdstep_t *s,*sm; + hmc_parms_t hmc; + force_parms_t fp; + double wt1, wt2; + + 
MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + mdfs=mdflds(); + hmc=hmc_parms(); + mu=hmc.mu; + reset_chrono(); + start_dfl_upd(); + + nlk=(double)(4*N0*N1)*(double)(N2*N3); + if (bc_type()==0) + nlk-=(double)(N1)*(double)(N2*N3); + else if (bc_type()==1) + nlk-=(double)(3*N1)*(double)(N2*N3); + + s=mdsteps(&nop,&itu); + sm=s+nop; + + for (;s +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "flags.h" +#include "update.h" +#include "global.h" + +static int nsmx,nmds=0,iend=1; +static mdstep_t *mds=NULL,*mdw[3]; + + +static void set_nsmx(int nlv) +{ + int ntu,ilv; + int nfr,*ifr,i; + mdint_parms_t mdp; + + iend=0; + ntu=1; + + for (ilv=0;ilviend) + iend=ifr[i]; + } + } + + iend+=2; + nsmx=(ntu+1)*iend; +} + + +static void alloc_mds(void) +{ + int k; + + if (mds!=NULL) + free(mds); + + mds=malloc(4*nsmx*sizeof(*mds)); + error(mds==NULL,1,"alloc_mds [mdsteps.c]", + "Unable to allocate mdsteps array"); + + for (k=0;k<3;k++) + mdw[k]=mds+(k+1)*nsmx; +} + + +static void set_steps2zero(int n,mdstep_t *s) +{ + int i; + + for (i=0;i=itu) + { + r[j].iop=s[i].iop; + r[j].eps=c*s[i].eps; + } + } +} + + +static int nfrc_steps(mdstep_t *s) +{ + int itu,n; + + itu=iend-1; + n=0; + + while (s[n].iop0) + swap_steps(s,s+i); + k+=1; + } + } + + error_root(k!=1,1,"sort_forces [mdsteps.c]", + "Incorrect gauge force count"); + + for (i=1;i=0;ilv--) + { + n=nall_steps(mds); + copy_steps(n,1.0,mds,mdw[0]); + expand_level(ilv,1.0,mdw[1],mdw[2]); + insert_level(mdw[1],mdw[0],mds); + } + + sort_forces(); + nmds=nall_steps(mds)+1; +} + + +mdstep_t *mdsteps(int *nop,int *itu) +{ + (*nop)=nmds; + (*itu)=iend-1; + + return mds; +} + + +static void print_ops(void) +{ + int i,itu; + + printf("List of elementary operations:\n"); + + itu=iend-1; + + for (i=0;i +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "sw_term.h" +#include "dirac.h" +#include "linalg.h" +#include 
"sap.h" +#include "dfl.h" +#include "ratfcts.h" +#include "forces.h" +#include "update.h" +#include "global.h" + +#define PRECISION_LIMIT 1.0e-10 + +static int nps=0,ns=0,*nsps; +static double *rs; +static double cfs[4]={-0.5,0.375,-0.3125,0.2734375}; + + +static void set_nsps(int n,int *np,int *isp) +{ + int k; + + if (n>ns) + { + if (ns>0) + free(nsps); + + nsps=malloc(2*n*sizeof(*nsps)); + error(nsps==NULL,1,"set_nsps [rwrat.c]", + "Unable to allocate auxiliary array"); + ns=n; + } + + for (k=0;knps) + { + if (nps>0) + free(rs); + + rs=malloc(np*sizeof(*rs)); + error(rs==NULL,1,"set_res [rwrat.c]", + "Unable to allocate auxiliary array"); + nps=np; + } + + for (k=0;k1) + { + iprms[0]=irp; + iprms[1]=n; + + MPI_Bcast(iprms,2,MPI_INT,0,MPI_COMM_WORLD); + + error((iprms[0]!=irp)||(iprms[1]!=n),1, + "rwrat [rwrat.c]","Parameter irp or n is not global"); + } + + rp=rat_parms(irp); + error_root((rp.degree==0)||(n<1),1,"rwrat [rwrat.c]", + "Undefined rational function or improper choice of n"); + + if (NPROC>1) + { + set_nsps(n,np,isp); + MPI_Bcast(nsps,2*n,MPI_INT,0,MPI_COMM_WORLD); + ie=0; + + for (k=0;k = %.4e, = %.4e", + (*sqn),r[0],r[1]); +#endif + + if ((delta*r[1])>PRECISION_LIMIT) + { + k=2; + apply_Z(n,np,isp,&rf,wsd[1],wsd[0],status); + r[0]=spinor_prod_re_dble(VOLUME/2,1,wsd[1],wsd[0]); + r[1]=norm_square_dble(VOLUME/2,1,wsd[0]); + lnr+=(cfs[2]*r[0]+cfs[3]*r[1]); + +#ifdef RWRAT_DBG + message(", = %.4e, = %.4e",r[0],r[1]); +#endif + + error_root((delta*r[1])>PRECISION_LIMIT,1,"rwrat [rwrat.c]", + "Unable to reach the required precision"); + } + +#ifdef RWRAT_DBG + message("\n"); +#endif + + avg_stat(k,n,isp,status); + release_wsd(); + +#ifdef RWRAT_DBG + message("[rwrat]: -ln(r) = %.4e\n",lnr); +#endif + + return lnr; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/update/rwtm.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/update/rwtm.c new file mode 100644 index 
0000000000000000000000000000000000000000..129aea399576a5768aa12f057015311468d96162 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/update/rwtm.c @@ -0,0 +1,326 @@ + +/******************************************************************************* +* +* File rwtm.c +* +* Copyright (C) 2012-2014 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Twisted-mass reweighting factors. +* +* The externally accessible functions are +* +* double rwtm1(double mu1,double mu2,int isp,double *sqn,int *status) +* Generates a random pseudo-fermion field with normal distribution, +* assigns its square norm to sqn and returns -ln(r1) (see the notes). +* The twisted-mass Dirac equation is solved using the solver specified +* by the parameter set number isp. +* The argument status must be an array of at least 1,1 and 3 elements, +* respectively, in the case of the CGNE, SAP_GCR and DFL_SAP_GCR solver. +* On exit the array elements contain the status values returned by the +* solver program (when the DFL_SAP_GCR solver is used, status[2] reports +* the number of deflation subspace regenerations that were required). +* +* double rwtm2(double mu1,double mu2,int isp,double *sqn,int *status) +* Generates a random pseudo-fermion field with normal distribution, +* assigns its square norm to sqn and returns -ln(r2) (see the notes). +* The twisted-mass Dirac equation is solved using the solver specified +* by the parameter set number isp. +* The argument status must be an array of at least 1,1 and 3 elements, +* respectively, in the case of the CGNE, SAP_GCR and DFL_SAP_GCR solver. +* On exit the array elements contain the average of the status values +* returned by the solver program (when the DFL_SAP_GCR solver is used, +* status[2] reports the number of deflation subspace regenerations that +* were required). 
+* +* Notes: +* +* Twisted-mass reweighting of the quark determinant was introduced in +* +* M. Luescher, F. Palombi: "Fluctuations and reweighting of the quark +* determinant on large lattices", PoS LATTICE2008 (2008) 049. +* +* The values returned by the programs in this module are stochastic estimates +* of the factors in a product decomposition of the reweighting factors. See +* section 6 of the notes +* +* M. Luescher: "Parameters of the openQCD main programs" [doc/parms.pdf]. +* +* For a given random pseudo-fermion field eta with distribution proportional +* to exp{-(eta,eta)}, the factors r1 and r2 are defined by +* +* r1=exp{-(eta,[R_1-1]*eta)}, r2=exp{-(eta,[R_2-1]*eta)}, +* +* R1=(X+mu2^2)*(X+mu1^2)^(-1), +* +* R2=R1^2*(X+2*mu1^2)*(X+2*mu2^2)^(-1), X=Dw^dag*Dw, +* +* where Dw denotes the massive O(a)-improved Wilson-Dirac operator. In +* both cases, the twisted masses must satisfy +* +* 0<=mu1 +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "sap.h" +#include "dfl.h" +#include "forces.h" +#include "update.h" +#include "global.h" + + +static void check_parms(double mu1,double mu2,int isp) +{ + int iprms[1]; + double dprms[2]; + + if (NPROC>1) + { + iprms[0]=isp; + dprms[0]=mu1; + dprms[1]=mu2; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=isp)||(dprms[0]!=mu1)||(dprms[1]!=mu2),1, + "check_parms [rwtm.c]","Parameters are not global"); + } + + error_root((mu1<0.0)||(mu2<=mu1),1,"check_parms [rwtm.c]", + "Twisted masses mu1,mu2 are out of range"); +} + + +static double set_eta(spinor_dble *eta) +{ + random_sd(VOLUME,eta,1.0); + bnd_sd2zero(ALL_PTS,eta); + + return norm_square_dble(VOLUME,1,eta); +} + + +double rwtm1(double mu1,double mu2,int isp,double *sqn,int *status) +{ + double lnr; + spinor_dble *eta,*phi,**wsd; + solver_parms_t sp; + sap_parms_t sap; + + 
check_parms(mu1,mu2,isp); + wsd=reserve_wsd(2); + eta=wsd[0]; + phi=wsd[1]; + (*sqn)=set_eta(eta); + sp=solver_parms(isp); + + if (sp.solver==CGNE) + { + tmcg(sp.nmx,sp.res,mu1,eta,phi,status); + + error_root(status[0]<0,1,"rwtm1 [rwtm.c]", + "CGNE solver failed (mu = %.2e, parameter set no %d, " + "status = %d)",mu1,isp,status[0]); + + lnr=spinor_prod_re_dble(VOLUME,1,eta,phi); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + mulg5_dble(VOLUME,eta); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu1,eta,phi,status); + + error_root(status[0]<0,1,"rwtm1 [rwtm.c]", + "SAP_GCR solver failed (mu = %.2e, parameter set no %d, " + "status = %d)",mu1,isp,status[0]); + + lnr=norm_square_dble(VOLUME,1,phi); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + mulg5_dble(VOLUME,eta); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu1,eta,phi,status); + + error_root((status[0]<0)||(status[1]<0),1, + "rwtm1 [rwtm.c]","DFL_SAP_GCR solver failed " + "(mu = %.2e, parameter set no %d, status = (%d,%d,%d))", + mu1,isp,status[0],status[1],status[2]); + + status[2]=(status[2]!=0); + lnr=norm_square_dble(VOLUME,1,phi); + } + else + { + lnr=0.0; + error_root(1,1,"rwtm1 [rwtm.c]","Unknown solver"); + } + + release_wsd(); + + return (mu2*mu2-mu1*mu1)*lnr; +} + + +double rwtm2(double mu1,double mu2,int isp,double *sqn,int *status) +{ + int stat[3]; + double lnr1,lnr2; + spinor_dble *eta,*phi,**wsd; + solver_parms_t sp; + sap_parms_t sap; + + check_parms(mu1,mu2,isp); + wsd=reserve_wsd(2); + eta=wsd[0]; + phi=wsd[1]; + (*sqn)=set_eta(eta); + sp=solver_parms(isp); + + if (sp.solver==CGNE) + { + tmcg(sp.nmx,sp.res,mu1,eta,phi,status); + + error_root(status[0]<0,1,"rwtm2 [rwtm.c]", + "CGNE solver failed (mu = %.2e, parameter set no %d, " + "status = %d)",mu1,isp,status[0]); + + tmcg(sp.nmx,sp.res,sqrt(2.0)*mu2,eta,eta,stat); + + error_root(stat[0]<0,1,"rwtm2 [rwtm.c]", + "CGNE solver failed (mu = %.2e, 
parameter set no %d, " + "status = %d)",sqrt(2.0)*mu2,isp,stat[0]); + status[0]=(status[0]+stat[0]+1)/2; + + if (mu1>0.0) + lnr1=norm_square_dble(VOLUME,1,phi); + else + lnr1=0.0; + + lnr2=spinor_prod_re_dble(VOLUME,1,eta,phi); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + mulg5_dble(VOLUME,eta); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu1,eta,phi,status); + + error_root(status[0]<0,1,"rwtm2 [rwtm.c]", + "SAP_GCR solver failed (mu = %.2e, parameter set no %d, " + "status = %d)",mu1,isp,status[0]); + + mulg5_dble(VOLUME,phi); + sap_gcr(sp.nkv,sp.nmx,sp.res,sqrt(2.0)*mu2,phi,eta,stat); + + error_root(stat[0]<0,2,"rwtm2 [rwtm.c]", + "SAP_GCR solver failed (mu = %.2e, parameter set no %d, " + "status = %d)",sqrt(2.0)*mu2,isp,stat[0]); + status[0]+=stat[0]; + + if (mu1>0.0) + { + sap_gcr(sp.nkv,sp.nmx,sp.res,mu1,phi,phi,stat); + error_root(stat[0]<0,3,"rwtm2 [rwtm.c]", + "SAP_GCR solver failed (mu = %.2e, parameter set no %d, " + "status = %d)",mu1,isp,stat[0]); + status[0]=(status[0]+stat[0]+1)/3; + + lnr1=norm_square_dble(VOLUME,1,phi); + } + else + { + status[0]=(status[0]+1)/2; + lnr1=0.0; + } + + lnr2=norm_square_dble(VOLUME,1,eta); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + mulg5_dble(VOLUME,eta); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu1,eta,phi,status); + + error_root((status[0]<0)||(status[1]<0),1, + "rwtm2 [rwtm.c]","DFL_SAP_GCR solver failed " + "(mu = %.2e, parameter set no %d, status = (%d,%d,%d))", + mu1,isp,status[0],status[1],status[2]); + status[2]=(status[2]!=0); + + mulg5_dble(VOLUME,phi); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,sqrt(2.0)*mu2,phi,eta,stat); + + error_root((stat[0]<0)||(stat[1]<0),2, + "rwtm2 [rwtm.c]","DFL_SAP_GCR solver failed " + "(mu = %.2e, parameter set no %d, status = (%d,%d,%d)", + sqrt(2.0)*mu2,isp,stat[0],stat[1],stat[2]); + status[0]+=stat[0]; + status[1]+=stat[1]; + status[2]+=(stat[2]!=0); + + if (mu1>0.0) + 
{ + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu1,phi,phi,stat); + + error_root((stat[0]<0)||(stat[1]<0),3, + "rwtm2 [rwtm.c]","DFL_SAP_GCR solver failed " + "(mu = %.2e, parameter set no %d, status = (%d,%d,%d)", + mu1,isp,stat[0],stat[1],stat[2]); + + status[0]=(status[0]+stat[0]+1)/3; + status[1]=(status[1]+stat[1]+1)/3; + status[2]+=(stat[2]!=0); + + lnr1=norm_square_dble(VOLUME,1,phi); + } + else + { + status[0]=(status[0]+1)/2; + status[1]=(status[1]+1)/2; + lnr1=0.0; + } + + lnr2=norm_square_dble(VOLUME,1,eta); + } + else + { + lnr1=0.0; + lnr2=0.0; + error_root(1,1,"rwtm2 [rwtm.c]","Unknown solver"); + } + + release_wsd(); + + mu1=mu1*mu1; + mu2=mu2*mu2; + + return ((mu2-mu1)/(2.0*mu2-mu1))*(mu1*(mu2-mu1)*lnr1+2.0*mu2*mu2*lnr2); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/update/rwtmeo.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/update/rwtmeo.c new file mode 100644 index 0000000000000000000000000000000000000000..1d0d375ae58e7f7cb60217470e4d285f939dd151 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/update/rwtmeo.c @@ -0,0 +1,343 @@ + +/******************************************************************************* +* +* File rwtmeo.c +* +* Copyright (C) 2012-2014 Martin Luescher, Stefan Schaefer +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Twisted-mass reweighting factors (even-odd preconditioned version). +* +* The externally accessible functions are +* +* double rwtm1eo(double mu1,double mu2,int isp,double *sqn,int *status) +* Generates a random pseudo-fermion field with normal distribution, +* assigns its square norm to sqn and returns -ln(r1) (see the notes). +* The twisted-mass Dirac equation is solved using the solver specified +* by the parameter set number isp. 
+* The argument status must be an array of at least 1,1 and 3 elements, +* respectively, in the case of the CGNE, SAP_GCR and DFL_SAP_GCR solver. +* On exit the array elements contain the status values returned by the +* solver program (when the DFL_SAP_GCR solver is used, status[2] reports +* the number of deflation subspace regenerations that were required). +* +* double rwtm2eo(double mu1,double mu2,int isp,double *sqn,int *status) +* Generates a random pseudo-fermion field with normal distribution, +* assigns its square norm to sqn and returns -ln(r2) (see the notes). +* The twisted-mass Dirac equation is solved using the solver specified +* by the parameter set number isp. +* The argument status must be an array of at least 1,1 and 3 elements, +* respectively, in the case of the CGNE, SAP_GCR and DFL_SAP_GCR solver. +* On exit the array elements contain the average of the status values +* returned by the solver program (when the DFL_SAP_GCR solver is used, +* status[2] reports the number of deflation subspace regenerations that +* were required). +* +* Notes: +* +* Twisted-mass reweighting of the quark determinant was introduced in +* +* M. Luescher, F. Palombi: "Fluctuations and reweighting of the quark +* determinant on large lattices", PoS LATTICE2008 (2008) 049. +* +* The values returned by the programs in this module are stochastic estimates +* of the factors in a product decomposition of the reweighting factors. See +* section 6 of the notes +* +* M. Luescher: "Parameters of the openQCD main programs" [doc/parms.pdf]. +* +* For a given random pseudo-fermion field eta with distribution proportional +* to exp{-(eta,eta)}, the factors r1 and r2 are defined by +* +* r1=exp{-(eta,[R_1-1]*eta)}, r2=exp{-(eta,[R_2-1]*eta)}, +* +* R1=(X+mu2^2)*(X+mu1^2)^(-1), +* +* R2=R1^2*(X+2*mu1^2)*(X+2*mu2^2)^(-1), X=Dwhat^dag*Dwhat, +* +* where Dwhat denotes the even-odd preconditioned, massive O(a)-improved +* Wilson-Dirac operator. 
In both cases, the twisted masses must satisfy +* +* 0<=mu1 +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "lattice.h" +#include "sflds.h" +#include "linalg.h" +#include "sap.h" +#include "dfl.h" +#include "forces.h" +#include "update.h" +#include "global.h" + + +static void check_parms(double mu1,double mu2,int isp) +{ + int iprms[1]; + double dprms[2]; + + if (NPROC>1) + { + iprms[0]=isp; + dprms[0]=mu1; + dprms[1]=mu2; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,2,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=isp)||(dprms[0]!=mu1)||(dprms[1]!=mu2),1, + "check_parms [rwtmeo.c]","Parameters are not global"); + } + + error_root((mu1<0.0)||(mu2<=mu1),1,"check_parms [rwtmeo.c]", + "Twisted masses mu1,mu2 are out of range"); +} + + +static double set_eta(spinor_dble *eta) +{ + random_sd(VOLUME/2,eta,1.0); + set_sd2zero(VOLUME/2,eta+(VOLUME/2)); + bnd_sd2zero(EVEN_PTS,eta); + + return norm_square_dble(VOLUME/2,1,eta); +} + + +double rwtm1eo(double mu1,double mu2,int isp,double *sqn,int *status) +{ + double lnr; + spinor_dble *eta,*phi,**wsd; + solver_parms_t sp; + sap_parms_t sap; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg!=1) + set_tm_parms(1); + + check_parms(mu1,mu2,isp); + wsd=reserve_wsd(2); + eta=wsd[0]; + phi=wsd[1]; + (*sqn)=set_eta(eta); + sp=solver_parms(isp); + + if (sp.solver==CGNE) + { + tmcgeo(sp.nmx,sp.res,mu1,eta,phi,status); + + error_root(status[0]<0,1,"rwtm1eo [rwtmeo.c]", + "CGNE solver failed (mu = %.2e, parameter set no %d, " + "status = %d)",mu1,isp,status[0]); + + lnr=spinor_prod_re_dble(VOLUME/2,1,eta,phi); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + mulg5_dble(VOLUME/2,eta); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu1,eta,phi,status); + + error_root(status[0]<0,1,"rwtm1eo [rwtmeo.c]", + "SAP_GCR solver failed (mu = %.2e, parameter set no %d, " + "status = %d)",mu1,isp,status[0]); + + 
lnr=norm_square_dble(VOLUME/2,1,phi); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + mulg5_dble(VOLUME/2,eta); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu1,eta,phi,status); + + error_root((status[0]<0)||(status[1]<0),1, + "rwtm1eo [rwtmeo.c]","DFL_SAP_GCR solver failed " + "(mu = %.2e, parameter set no %d, status = (%d,%d,%d))", + mu1,isp,status[0],status[1],status[2]); + status[2]=(status[2]!=0); + + lnr=norm_square_dble(VOLUME/2,1,phi); + } + else + { + lnr=0.0; + error_root(1,1,"rwtm1eo [rwtmeo.c]","Unknown solver"); + } + + release_wsd(); + + return (mu2*mu2-mu1*mu1)*lnr; +} + + +double rwtm2eo(double mu1,double mu2,int isp,double *sqn,int *status) +{ + int stat[3]; + double lnr1,lnr2; + spinor_dble *eta,*phi,**wsd; + solver_parms_t sp; + sap_parms_t sap; + tm_parms_t tm; + + tm=tm_parms(); + if (tm.eoflg!=1) + set_tm_parms(1); + + check_parms(mu1,mu2,isp); + wsd=reserve_wsd(2); + eta=wsd[0]; + phi=wsd[1]; + (*sqn)=set_eta(eta); + sp=solver_parms(isp); + + if (sp.solver==CGNE) + { + tmcgeo(sp.nmx,sp.res,mu1,eta,phi,status); + + error_root(status[0]<0,1,"rwtm2eo [rwtmeo.c]", + "CGNE solver failed (mu = %.2e, parameter set no %d, " + "status = %d)",mu1,isp,status[0]); + + tmcgeo(sp.nmx,sp.res,sqrt(2.0)*mu2,eta,eta,stat); + + error_root(stat[0]<0,1,"rwtm2eo [rwtmeo.c]", + "CGNE solver failed (mu = %.2e, parameter set no %d, " + "status = %d)",sqrt(2.0)*mu2,isp,stat[0]); + status[0]=(status[0]+stat[0]+1)/2; + + if (mu1>0.0) + lnr1=norm_square_dble(VOLUME/2,1,phi); + else + lnr1=0.0; + + lnr2=spinor_prod_re_dble(VOLUME/2,1,eta,phi); + } + else if (sp.solver==SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + mulg5_dble(VOLUME/2,eta); + sap_gcr(sp.nkv,sp.nmx,sp.res,mu1,eta,phi,status); + + error_root(status[0]<0,1,"rwtm2eo [rwtmeo.c]", + "SAP_GCR solver failed (mu = %.2e, parameter set no %d, " + "status = %d)",mu1,isp,status[0]); + + mulg5_dble(VOLUME/2,phi); + 
set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + sap_gcr(sp.nkv,sp.nmx,sp.res,sqrt(2.0)*mu2,phi,eta,stat); + + error_root(stat[0]<0,2,"rwtm2eo [rwtmeo.c]", + "SAP_GCR solver failed (mu = %.2e, parameter set no %d, " + "status = %d)",sqrt(2.0)*mu2,isp,stat[0]); + status[0]+=stat[0]; + + if (mu1>0.0) + { + sap_gcr(sp.nkv,sp.nmx,sp.res,mu1,phi,phi,stat); + + error_root(stat[0]<0,3,"rwtm2eo [rwtmeo.c]", + "SAP_GCR solver failed (mu = %.2e, parameter set no %d, " + "status = %d)",mu1,isp,stat[0]); + status[0]=(status[0]+stat[0]+1)/3; + + lnr1=norm_square_dble(VOLUME/2,1,phi); + } + else + { + status[0]=(status[0]+1)/2; + lnr1=0.0; + } + + lnr2=norm_square_dble(VOLUME/2,1,eta); + } + else if (sp.solver==DFL_SAP_GCR) + { + sap=sap_parms(); + set_sap_parms(sap.bs,sp.isolv,sp.nmr,sp.ncy); + + mulg5_dble(VOLUME/2,eta); + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu1,eta,phi,status); + + error_root((status[0]<0)||(status[1]<0),1, + "rwtm2eo [rwtmeo.c]","DFL_SAP_GCR solver failed " + "(mu = %.2e, parameter set no %d, status = (%d,%d,%d))", + mu1,isp,status[0],status[1],status[2]); + status[2]=(status[2]!=0); + + mulg5_dble(VOLUME/2,phi); + set_sd2zero(VOLUME/2,phi+(VOLUME/2)); + + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,sqrt(2.0)*mu2,phi,eta,stat); + + error_root((stat[0]<0)||(stat[1]<0),2, + "rwtm2eo [rwtmeo.c]","DFL_SAP_GCR solver failed " + "(mu = %.2e, parameter set no %d, status = (%d,%d,%d)", + sqrt(2.0)*mu2,isp,stat[0],stat[1],stat[2]); + status[0]+=stat[0]; + status[1]+=stat[1]; + status[2]+=(stat[2]!=0); + + if (mu1>0.0) + { + dfl_sap_gcr2(sp.nkv,sp.nmx,sp.res,mu1,phi,phi,stat); + + error_root((stat[0]<0)||(stat[1]<0),3, + "rwtm2eo [rwtmeo.c]","DFL_SAP_GCR solver failed " + "(mu = %.2e, parameter set no %d, status = (%d,%d,%d)", + mu1,isp,stat[0],stat[1],stat[2]); + + status[0]=(status[0]+stat[0]+1)/3; + status[1]=(status[1]+stat[1]+1)/3; + status[2]+=(stat[2]!=0); + + lnr1=norm_square_dble(VOLUME/2,1,phi); + } + else + { + status[0]=(status[0]+1)/2; + status[1]=(status[1]+1)/2; + lnr1=0.0; 
+ } + + lnr2=norm_square_dble(VOLUME/2,1,eta); + } + else + { + lnr1=0.0; + lnr2=0.0; + error_root(1,1,"rwtm2eo [rwtmeo.c]","Unknown solver"); + } + + release_wsd(); + + mu1=mu1*mu1; + mu2=mu2*mu2; + + return ((mu2-mu1)/(2.0*mu2-mu1))*(mu1*(mu2-mu1)*lnr1+2.0*mu2*mu2*lnr2); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/utils/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/utils/README new file mode 100644 index 0000000000000000000000000000000000000000..854091967e0ad5a5905ba95785b95a8da1796dcc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/utils/README @@ -0,0 +1,282 @@ + +******************************************************************************** + + Utility programs + +******************************************************************************** + + +Files +----- + +endian.c Byte swapping programs + +mutils.c Utility functions used in main programs + +utils.c Basic utility functions + +wspace.c Workspace allocation + + +Include file +------------ + +The file utils.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + + +List of functions +----------------- + +int endianness(void) + Returns LITTLE_ENDIAN if the machine is little endian and BIG_ENDIAN + if it is big endian. Otherwise the return value is UNKNOWN_ENDIAN + +void bswap_int(int n,void *a) + Inverts the byte order of the array elements a[0],..,a[n-1] + assuming these are 4 byte long + +void bswap_double(int n,void *a) + Inverts the byte order of the array elements a[0],..,a[n-1] + assuming these are 8 byte long + +int find_opt(int argc,char *argv[],char *opt) + On process 0, this program compares the string opt with the arguments + argv[1],..,argv[argc-1] and returns the position of the first argument + that matches the string. 
If there is no matching argument, or if the + program is called from another process, the return value is 0. + +int fdigits(double x) + Returns the smallest integer n such that the value of x printed with + print format %.nf coincides with x up to a relative error at most a + few times the machine precision DBL_EPSILON. + +void check_dir(char* dir) + This program checks whether the directory dir is locally accessible, + from each process, and aborts the main program with an informative + error message if this is not the case. The program must be called + simultaneously on all processes, but the argument may depend on the + process. + +void check_dir_root(char* dir) + On process 0, this program checks whether the directory dir is + accessible and aborts the main program with an informative error + message if this is not the case. When called on other processes, + the program does nothing. + +int name_size(char *format,...) + On process 0, this program returns the length of the string that + would be printed by calling sprintf(*,format,...). The format + string can be any combination of literal text and the conversion + specifiers %s, %d and %.nf (where n is a positive integer). When + called on other processes, the program does nothing and returns + the value of NAME_SIZE. + +long find_section(char *title) + On process 0, this program scans stdin for a line starting with + the string "[title]" (after any number of blanks). It terminates + with an error message if no such line is found or if there are + several of them. The program returns the offset of the line from + the beginning of the file and positions the file pointer to the + next line. On processes other than 0, the program does nothing + and returns -1L. + +long read_line(char *tag,char *format,...) + On process 0, this program reads a line of text and data from stdin + in a controlled manner, as described in the notes below. 
The tag can + be the empty string "" and must otherwise be an alpha-numeric word + that starts with a letter. If it is not empty, the program searches + for the tag in the current section. An error occurs if the tag is not + found. The program returns the offset of the line from the beginning + of the file and positions the file pointer to the next line. On + processes other than 0, the program does nothing and returns -1L. + +int count_tokens(char *tag) + On process 0, this program finds and reads a line from stdin, exactly + as read_line(tag,..) does, and returns the number of tokens found on + that line after the tag. Tokens are separated by white space (blanks, + tabs or newline characters) and comments (text beginning with #) are + ignored. On exit, the file pointer is positioned at the next line. If + called on other processes, the program does nothing and returns 0. + +void read_iprms(char *tag,int n,int *iprms) + On process 0, this program finds and reads a line from stdin, exactly + as read_line(tag,..) does, reads n integer values from that line after + the tag and assigns them to the elements of the array iprms. An error + occurs if less than n values are found on the line. The values must be + separated by white space (blanks, tabs or newline characters). On exit, + the file pointer is positioned at the next line. When called on other + processes, the program does nothing. + +void read_dprms(char *tag,int n,double *dprms) + On process 0, this program finds and reads a line from stdin, exactly + as read_line(tag,..) does, reads n double values from that line after + the tag and assigns them to the elements of the array iprms. An error + occurs if less than n values are found on the line. The values must be + separated by white space (blanks, tabs or newline characters).On exit, + the file pointer is positioned at the next line. When called on other + processes, the program does nothing. 
+ +int copy_file(char *in,char *out) + Copies the file "in" to the file "out" in binary mode. The return + value is 0 if no I/O error is detected and 1 otherwise, in which + case the error can also be detected by the error_chk() [utils.c] + function. + +int safe_mod(int x,int y) + Returns x mod y, where y is assumed positive and x can have any + sign. The return value is in the interval [0,y) + +void *amalloc(size_t size,int p) + Allocates an aligned memory area of "size" bytes, with a starting + address (the return value) that is an integer multiple of 2^p. A + NULL pointer is returned if the allocation was not successful + +void afree(void *addr) + Frees the aligned memory area at address "addr" that was previously + allocated using amalloc. If the memory space at this address was + already freed using afree, or if the address does not match an + address previously returned by amalloc, the program does not do + anything + +int mpi_permanent_tag(void) + Returns a new send tag that is guaranteed to be unique and which + is therefore suitable for use in permanent communication requests. + The available number of tags of this kind is 16384 + +int mpi_tag(void) + Returns a new send tag for use in non-permanent communications. + Note that the counter for these tags wraps around after 16384 + tags have been delivered + +void error(int test,int no,char *name,char *format,...) + Checks whether "test"=0 on all processes and, if not, aborts the + program gracefully with error number "no" after printing the "name" + of the calling program and an error message to stdout from process 0. + The message is formed on process 0 using the "format" string and any + additional arguments, exactly as in a printf statement + +void error_root(int test,int no,char *name,char *format,...) 
+ Same as the error() function except that "test" is examined on + process 0 only + +int error_loc(int test,int no,char *name,char *message) + Checks whether "test"=0 on the local process and, if not, writes + the error number "no", the program "name" and the error "message" + to an internal buffer. Only the data of the first instance where + this happens are recorded. Note that saved program names and error + messages are truncated to 127 and 511 bytes, respectively. In all + cases, the program returns the value of "test" + +void error_chk(void) + Checks the status of the data saved by error_loc() and aborts the + program gracefully, with error number 1, if an error is recorded on + some of the processes. Before abortion the error numbers, program + names and error messages saved on these processes are printed to + stdout from process 0 + +void message(char *format,...) + Prints a message from process 0 to stdout. The usage and argument + list is the same as in the case of the printf function + +void alloc_wud(int n) + Allocates a workspace of n double-precision gauge fields. + +su3_dble **reserve_wud(int n) + Reserves a new workspace of n global double-precision gauge fields + and returns the array ud[0],..,ud[n-1] of the base addresses of the + fields in the workspace. No workspace is reserved and a NULL pointer + is returned if n<=0. + +int release_wud(void) + Releases the workspace of global double-precision gauge fields that + was last reserved and returns the number of fields that are released. + +int wud_size(void) + Returns the number of global double-precision gauge fields that + are currently reserved. + +void alloc_wfd(int n) + Allocates a workspace of n double-precision force fields. + +su3_alg_dble **reserve_wfd(int n) + Reserves a new workspace of n global double-precision force fields + and returns the array fd[0],..,fd[n-1] of the base addresses of the + fields in the workspace. No workspace is reserved and a NULL pointer + is returned if n<=0.
+ +int release_wfd(void) + Releases the workspace of global double-precision force fields that + was last reserved and returns the number of fields that are released. + +int wfd_size(void) + Returns the number of global double-precision force fields that + are currently reserved. + +void alloc_ws(int n) + Allocates a workspace of n single-precision spinor fields. + +spinor **reserve_ws(int n) + Reserves a new workspace of n global single-precision spinor fields + and returns the array s[0],..,s[n-1] of the base addresses of the + fields in the workspace. No workspace is reserved and a NULL pointer + is returned if n<=0. + +int release_ws(void) + Releases the workspace of global single-precision spinor fields that + was last reserved and returns the number of fields that are released. + +int ws_size(void) + Returns the number of global single-precision spinor fields that + are currently reserved. + +void alloc_wsd(int n) + Allocates a workspace of n double-precision spinor fields. + +spinor_dble **reserve_wsd(int n) + Reserves a new workspace of n global double-precision spinor fields + and returns the array sd[0],..,sd[n-1] of the base addresses of the + fields in the workspace. No workspace is reserved and a NULL pointer + is returned if n<=0. + +int release_wsd(void) + Releases the workspace of global double-precision spinor fields that + was last reserved and returns the number of fields that are released. + +int wsd_size(void) + Returns the number of global double-precision spinor fields that + are currently reserved. + +void alloc_wv(int n) + Allocates a workspace of n single-precision vector fields. + +complex **reserve_wv(int n) + Reserves a new workspace of n global single-precision vector fields + and returns the array v[0],..,v[n-1] of the base addresses of the + fields in the workspace. No workspace is reserved and a NULL pointer + is returned if n<=0.
+ +int release_wv(void) + Releases the workspace of global single-precision vector fields that + was last reserved and returns the number of fields that are released. + +int wv_size(void) + Returns the number of global single-precision vector fields that + are currently reserved. + +void alloc_wvd(int n) + Allocates a workspace of n double-precision vector fields. + +complex_dble **reserve_wvd(int n) + Reserves a new workspace of n global double-precision vector fields + and returns the array vd[0],..,vd[n-1] of the base addresses of the + fields in the workspace. No workspace is reserved and a NULL pointer + is returned if n<=0. + +int release_wvd(void) + Releases the workspace of global double-precision vector fields that + was last reserved and returns the number of fields that are released. + +int wvd_size(void) + Returns the number of global double-precision vector fields that + are currently reserved. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/utils/endian.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/utils/endian.c new file mode 100644 index 0000000000000000000000000000000000000000..ea1be7bffbd682d30477fe18479b2c40eb8ee077 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/utils/endian.c @@ -0,0 +1,110 @@ + +/******************************************************************************* +* +* File endian.c +* +* Copyright (C) 2007, 2009, 2010 Bjoern Leder, Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Byte swapping programs +* +* The externally accessible functions are +* +* int endianness(void) +* Returns LITTLE_ENDIAN if the machine is little endian and BIG_ENDIAN +* if it is big endian. 
Otherwise the return value is UNKNOWN_ENDIAN +* +* void bswap_int(int n,void *a) +* Inverts the byte order of the array elements a[0],..,a[n-1] +* assuming these are 4 byte long +* +* void bswap_double(int n,void *a) +* Inverts the byte order of the array elements a[0],..,a[n-1] +* assuming these are 8 byte long +* +* Notes: +* +* The integer types that are guaranteed to be 4 byte long are stdint_t +* and stduint_t. These are defined in the header file misc.h. +* +* On machines complying with the IEEE-754 standard, double precision +* floating-point numbers are 8 byte long. When the header file misc.h +* is read, the compiler checks whether the machine complies with the +* standard. + +* The programs in this module do not involve any communications and can +* be called locally. +* +*******************************************************************************/ + +#define ENDIAN_C + +#include +#include +#include "utils.h" + + +int endianness(void) +{ + stduint_t i; + unsigned char *b; + + i=0x04030201; + b=(unsigned char*)(&i); + + if ((b[0]==1u)&&(b[1]==2u)&&(b[2]==3u)&&(b[3]==4u)) + return LITTLE_ENDIAN; + else if ((b[0]==4u)&&(b[1]==3u)&&(b[2]==2u)&&(b[3]==1u)) + return BIG_ENDIAN; + else return UNKNOWN_ENDIAN; +} + + +void bswap_int(int n,void *a) +{ + unsigned char *ba,*bam,bas; + + ba=(unsigned char*)(a); + bam=ba+4*n; + + for (;ba +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "global.h" + +static char text[512]; +static char line[NAME_SIZE+1]; +static char inum[3*sizeof(int)+4]; + + +int find_opt(int argc,char *argv[],char *opt) +{ + int my_rank,k; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + for (k=1;kpl)) + { + pl+=1; + pr[0]='\0'; + + if (cmp_text(pl,title)==1) + { + error_root(sofs>=0L,1,"find_section [mutils.c]", + "Section [%s] occurs more than once",title); + sofs=ofs; + } + } + + ofs=ftell(stdin); + s=get_line(); + } + + error_root(sofs==-1L,1,"find_section [mutils.c]", + "Section 
[%s] not found",title); + ie=fseek(stdin,sofs,SEEK_SET); + error_root(ie!=0,1,"find_section [mutils.c]", + "Unable to go to section [%s]",title); + get_line(); + + return sofs; + } + else + return -1L; +} + + +static void check_tag(char *tag) +{ + if (tag[0]=='\0') + return; + + error_root((strspn(tag," 0123456789.")!=0L)|| + (strcspn(tag," \n")!=strlen(tag)),1, + "check_tag [mutils.c]","Improper tag %s",tag); +} + + +static long find_tag(char *tag) +{ + int ie; + long tofs,lofs,ofs; + char *s,*pl,*pr; + + ie=0; + tofs=-1L; + lofs=ftell(stdin); + rewind(stdin); + ofs=ftell(stdin); + s=get_line(); + + while (s!=NULL) + { + pl=strchr(line,'['); + pr=strchr(line,']'); + + if ((pl==(line+strspn(line," \t")))&&(pr>pl)) + { + if (ofs +#include +#include +#include +#include +#include "mpi.h" +#include "utils.h" +#include "global.h" + +#define MAX_TAG 32767 +#define MAX_PERMANENT_TAG MAX_TAG/2 + +#define MPC_BUF_LEN 2048 + +static int mpcBuf[MPC_BUF_LEN]; +static int mpcRank = -1; + +static int pcmn_cnt=-1,cmn_cnt=MAX_TAG; +static int err_no,err_flg=0; +static char prog_name[128],err_msg[512]; + +static long long int amem_use = 0; +static long long int amem_max = 0; + +struct addr_t +{ + char *addr; + char *true_addr; + size_t true_size; + struct addr_t *next; +}; + +static struct addr_t *first=NULL; + + +int safe_mod(int x,int y) +{ + if (x>=0) + return x%y; + else + return (y-(abs(x)%y))%y; +} + + +void *amalloc(size_t size,int p) +{ + int shift; + char *true_addr,*addr; + unsigned long mask; + struct addr_t *new; + + if ((size<=0)||(p<0)) + return(NULL); + + shift=1<0) + { + MPI_Barrier(MPI_COMM_WORLD); + + tag1=mpi_tag(); + tag2=mpi_tag(); + tag3=mpi_tag(); + + if (my_rank==n) + { + MPI_Send(err,2,MPI_INT,0,tag1,MPI_COMM_WORLD); + MPI_Send(prog_name,127,MPI_CHAR,0,tag2,MPI_COMM_WORLD); + MPI_Send(err_msg,511,MPI_CHAR,0,tag3,MPI_COMM_WORLD); + } + + if (my_rank==0) + { + MPI_Recv(err,2,MPI_INT,n,tag1,MPI_COMM_WORLD,&stat); + 
MPI_Recv(prog_name,127,MPI_CHAR,n,tag2,MPI_COMM_WORLD,&stat); + MPI_Recv(err_msg,511,MPI_CHAR,n,tag3,MPI_COMM_WORLD,&stat); + } + } + + if ((err[0]==1)&&(my_rank==0)) + { + printf("%3d: in %s:\n",n,prog_name); + printf(" %s (error number %d)\n",err_msg,err[1]); + } + } + + if (my_rank==0) + { + printf("\nProgram aborted\n\n"); + fflush(stdout); + + MPI_Abort(MPI_COMM_WORLD,1); + } + else + for (i=1;i<2;i=safe_mod(i,2)); +} + + +void message(char *format,...) +{ + int my_rank; + va_list args; + + MPI_Comm_rank(MPI_COMM_WORLD,&my_rank); + + if (my_rank==0) + { + va_start(args,format); + vprintf(format,args); + va_end(args); + } +} + +#undef USE_MPI_BCAST +#define USE_MPI_ALLREDUCE + +void mpc_print_info() { +#ifdef USE_MPI_BCAST + message("mpc_bcast implemented as MPI_Bcast\n"); +#else + message("mpc_bcast implemented as MPI_Allreduce\n"); +#endif + +#ifdef USE_MPI_ALLREDUCE + message("mpc_gsum_d implemented as MPI_Allreduce\n"); +#else + message("mpc_gsum_d implemented as MPI_Reduce + mpc_bcast\n"); +#endif +} + +void mpc_bcast_c(char *buf, int num) { +#ifdef USE_MPI_BCAST + MPI_Bcast(buf, num, MPI_CHAR, 0, MPI_COMM_WORLD); +#else + int i, nint; + int *pi; + char *pc; + nint = (sizeof(char)*num)/sizeof(int); + while( nint*sizeof(int) < num*sizeof(char) ) nint++; + pc = (char*) mpcBuf; + pi = (int*) mpcBuf; + if ( mpcRank < 0 ) MPI_Comm_rank(MPI_COMM_WORLD,&mpcRank); + error_root( nint > MPC_BUF_LEN, + 0, "mpc_bcast_c [utils.c]", "Too many elements: %d", num); + if ( mpcRank == 0 ) { + for(i=0; i MPC_BUF_LEN*sizeof(int), + 0, "mpc_bcast_d [utils.c]", "Too many elements: %d", num); + if ( mpcRank == 0 ) { + for(i=0; i MPC_BUF_LEN, + 0, "mpc_bcast_i [utils.c]", "Too many elements: %d", num); + if ( mpcRank == 0 ) { + for(i=0; i +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "global.h" + +static int nudt=0,iwud=0,nwudt=0,*nwud; +static su3_dble **wud0,**wud,ud0={{0.0}}; + +static int nfdt=0,iwfd=0,nwfdt=0,*nwfd; 
+static su3_alg_dble **wfd0,**wfd,fd0={0.0}; + +static int nst=0,iws=0,nwst=0,*nws; +static spinor **ws0,**ws,s0={{{0.0f}}}; + +static int nsdt=0,iwsd=0,nwsdt=0,*nwsd; +static spinor_dble **wsd0,**wsd,sd0={{{0.0}}}; + +static int nvt=0,iwv=0,nwvt=0,*nwv,nvec; +static complex **wv0,**wv,v0={0.0f}; + +static int nvdt=0,iwvd=0,nwvdt=0,*nwvd; +static complex_dble **wvd0,**wvd,vd0={0.0}; + + +void alloc_wud(int n) +{ + int i; + su3_dble *ud,*um; + + if (n==nudt) + return; + + error_root(nwudt!=0,1,"alloc_wud [wspace.c]","Fields are in use"); + + if (nudt>0) + { + free(nwud); + afree(wud0[0]); + free(wud0); + nwud=NULL; + wud0=NULL; + wud=NULL; + } + + nudt=n; + iwud=0; + nwudt=0; + + if (nudt>0) + { + nwud=malloc(nudt*sizeof(*nwud)); + wud0=malloc(2*nudt*sizeof(*wud0)); + wud=wud0+nudt; + + error((nwud==NULL)||(wud0==NULL),1,"alloc_wud [wspace.c]", + "Unable to allocate index arrays"); + + wud0[0]=amalloc(nudt*4*VOLUME*sizeof(**wud0),ALIGN); + + error(wud0[0]==NULL,1,"alloc_wud [wspace.c]", + "Unable to allocate workspace"); + + for (i=0;i0) + wud0[i]=wud0[i-1]+4*VOLUME; + + nwud[i]=0; + wud[i]=NULL; + } + + ud=wud0[0]; + um=ud+nudt*4*VOLUME; + + for (;ud1) + { + iprms[0]=n; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=n,1,"reserve_wud [wspace.c]", + "Parameter n is not global"); + } + + if (n>0) + { + error((nwudt+n)>nudt,1,"reserve_wud [wspace.c]", + "Requested too many fields (tot=%d,use=%d,req=%d)",nudt,nwudt,n); + + ia=nwudt; + nwud[iwud]=n; + nwudt+=n; + iwud+=1; + + for (i=ia;i<(ia+n);i++) + wud[i]=wud0[i]; + + return wud+ia; + } + else + return NULL; +} + + +int release_wud(void) +{ + int n,i; + + if (nwudt==0) + return 0; + else + { + iwud-=1; + n=nwud[iwud]; + nwudt-=n; + nwud[iwud]=0; + + for (i=nwudt;i<(nwudt+n);i++) + wud[i]=NULL; + + return n; + } +} + + +int wud_size(void) +{ + return nwudt; +} + + +void alloc_wfd(int n) +{ + int i; + su3_alg_dble *fd,*fm; + + if (n==nfdt) + return; + + error_root(nwfdt!=0,1,"alloc_wfd 
[wspace.c]","Fields are in use"); + + if (nfdt>0) + { + free(nwfd); + afree(wfd0[0]); + free(wfd0); + nwfd=NULL; + wfd0=NULL; + wfd=NULL; + } + + nfdt=n; + iwfd=0; + nwfdt=0; + + if (nfdt>0) + { + nwfd=malloc(nfdt*sizeof(*nwfd)); + wfd0=malloc(2*nfdt*sizeof(*wfd0)); + wfd=wfd0+nfdt; + + error((nwfd==NULL)||(wfd0==NULL),1,"alloc_wfd [wspace.c]", + "Unable to allocate index arrays"); + + wfd0[0]=amalloc(nfdt*4*VOLUME*sizeof(**wfd0),ALIGN); + + error(wfd0[0]==NULL,1,"alloc_wfd [wspace.c]", + "Unable to allocate workspace"); + + for (i=0;i0) + wfd0[i]=wfd0[i-1]+4*VOLUME; + + nwfd[i]=0; + wfd[i]=NULL; + } + + fd=wfd0[0]; + fm=fd+nfdt*4*VOLUME; + + for (;fd1) + { + iprms[0]=n; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=n,1,"reserve_wfd [wspace.c]", + "Parameter n is not global"); + } + + if (n>0) + { + error((nwfdt+n)>nfdt,1,"reserve_wfd [wspace.c]", + "Requested too many fields (tot=%d,use=%d,req=%d)",nfdt,nwfdt,n); + + ia=nwfdt; + nwfd[iwfd]=n; + nwfdt+=n; + iwfd+=1; + + for (i=ia;i<(ia+n);i++) + wfd[i]=wfd0[i]; + + return wfd+ia; + } + else + return NULL; +} + + +int release_wfd(void) +{ + int n,i; + + if (nwfdt==0) + return 0; + else + { + iwfd-=1; + n=nwfd[iwfd]; + nwfdt-=n; + nwfd[iwfd]=0; + + for (i=nwfdt;i<(nwfdt+n);i++) + wfd[i]=NULL; + + return n; + } +} + + +int wfd_size(void) +{ + return nwfdt; +} + + +void alloc_ws(int n) +{ + int i; + spinor *s,*sm; + + if (n==nst) + return; + + error_root(nwst!=0,1,"alloc_ws [wspace.c]","Fields are in use"); + + if (nst>0) + { + free(nws); + afree(ws0[0]); + free(ws0); + nws=NULL; + ws0=NULL; + ws=NULL; + } + + nst=n; + iws=0; + nwst=0; + + if (nst>0) + { + nws=malloc(nst*sizeof(*nws)); + ws0=malloc(2*nst*sizeof(*ws0)); + ws=ws0+nst; + + error((nws==NULL)||(ws0==NULL),1,"alloc_ws [wspace.c]", + "Unable to allocate index arrays"); + + ws0[0]=amalloc(nst*NSPIN*sizeof(**ws0),ALIGN); + + error(ws0[0]==NULL,1,"alloc_ws [wspace.c]", + "Unable to allocate workspace"); + + for (i=0;i0) + 
ws0[i]=ws0[i-1]+NSPIN; + + nws[i]=0; + ws[i]=NULL; + } + + s=ws0[0]; + sm=s+nst*NSPIN; + + for (;s1) + { + iprms[0]=n; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=n,1,"reserve_ws [wspace.c]", + "Parameter n is not global"); + } + + if (n>0) + { + error((nwst+n)>nst,1,"reserve_ws [wspace.c]", + "Requested too many fields (tot=%d,use=%d,req=%d)",nst,nwst,n); + + ia=nwst; + nws[iws]=n; + nwst+=n; + iws+=1; + + for (i=ia;i<(ia+n);i++) + ws[i]=ws0[i]; + + return ws+ia; + } + else + return NULL; +} + + +int release_ws(void) +{ + int n,i; + + if (nwst==0) + return 0; + else + { + iws-=1; + n=nws[iws]; + nwst-=n; + nws[iws]=0; + + for (i=nwst;i<(nwst+n);i++) + ws[i]=NULL; + + return n; + } +} + + +int ws_size(void) +{ + return nwst; +} + + +void alloc_wsd(int n) +{ + int i; + spinor_dble *sd,*sm; + + if (n==nsdt) + return; + + error_root(nwsdt!=0,1,"alloc_wsd [wspace.c]","Fields are in use"); + + if (nsdt>0) + { + free(nwsd); + afree(wsd0[0]); + free(wsd0); + nwsd=NULL; + wsd0=NULL; + wsd=NULL; + } + + nsdt=n; + iwsd=0; + nwsdt=0; + + if (nsdt>0) + { + nwsd=malloc(nsdt*sizeof(*nwsd)); + wsd0=malloc(2*nsdt*sizeof(*wsd0)); + wsd=wsd0+nsdt; + + error((nwsd==NULL)||(wsd0==NULL),1,"alloc_wsd [wspace.c]", + "Unable to allocate index arrays"); + + wsd0[0]=amalloc(nsdt*NSPIN*sizeof(**wsd0),ALIGN); + + error(wsd0[0]==NULL,1,"alloc_wsd [wspace.c]", + "Unable to allocate workspace"); + + for (i=0;i0) + wsd0[i]=wsd0[i-1]+NSPIN; + + nwsd[i]=0; + wsd[i]=NULL; + } + + sd=wsd0[0]; + sm=sd+nsdt*NSPIN; + + for (;sd1) + { + iprms[0]=n; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=n,1,"reserve_wsd [wspace.c]", + "Parameter n is not global"); + } + + if (n>0) + { + error((nwsdt+n)>nsdt,1,"reserve_wsd [wspace.c]", + "Requested too many fields (tot=%d,use=%d,req=%d)",nsdt,nwsdt,n); + + ia=nwsdt; + nwsd[iwsd]=n; + nwsdt+=n; + iwsd+=1; + + for (i=ia;i<(ia+n);i++) + wsd[i]=wsd0[i]; + + return wsd+ia; + } + else + return NULL; +} + + +int 
release_wsd(void) +{ + int n,i; + + if (nwsdt==0) + return 0; + else + { + iwsd-=1; + n=nwsd[iwsd]; + nwsdt-=n; + nwsd[iwsd]=0; + + for (i=nwsdt;i<(nwsdt+n);i++) + wsd[i]=NULL; + + return n; + } +} + + +int wsd_size(void) +{ + return nwsdt; +} + + +static void set_nvec(void) +{ + int *bs; + dfl_parms_t dfl; + + dfl=dfl_parms(); + + error_root(dfl.Ns==0,1,"set_nvec [wspace.c]", + "Deflation subspace parameters are not set"); + + bs=dfl.bs; + nvec=VOLUME+FACE0*bs[0]+FACE1*bs[1]+FACE2*bs[2]+FACE3*bs[3]; + nvec/=(bs[0]*bs[1]*bs[2]*bs[3]); + nvec*=dfl.Ns; +} + + +void alloc_wv(int n) +{ + int i; + complex *v,*vm; + + if (n==nvt) + return; + + error_root(nwvt!=0,1,"alloc_wv [wspace.c]","Fields are in use"); + + if (nvt>0) + { + free(nwv); + afree(wv0[0]); + free(wv0); + nwv=NULL; + wv0=NULL; + wv=NULL; + } + + nvt=n; + iwv=0; + nwvt=0; + + if (nvt>0) + { + set_nvec(); + nwv=malloc(nvt*sizeof(*nwv)); + wv0=malloc(2*nvt*sizeof(*wv0)); + wv=wv0+nvt; + + error((nwv==NULL)||(wv0==NULL),1,"alloc_wv [wspace.c]", + "Unable to allocate index arrays"); + + wv0[0]=amalloc(nvt*nvec*sizeof(**wv0),ALIGN); + + error(wv0[0]==NULL,1,"alloc_wv [wspace.c]", + "Unable to allocate workspace"); + + for (i=0;i0) + wv0[i]=wv0[i-1]+nvec; + + nwv[i]=0; + wv[i]=NULL; + } + + v=wv0[0]; + vm=v+nvt*nvec; + + for (;v1) + { + iprms[0]=n; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=n,1,"reserve_wv [wspace.c]", + "Parameter n is not global"); + } + + if (n>0) + { + error((nwvt+n)>nvt,1,"reserve_wv [wspace.c]", + "Requested too many fields (tot=%d,use=%d,req=%d)",nvt,nwvt,n); + + ia=nwvt; + nwv[iwv]=n; + nwvt+=n; + iwv+=1; + + for (i=ia;i<(ia+n);i++) + wv[i]=wv0[i]; + + return wv+ia; + } + else + return NULL; +} + + +int release_wv(void) +{ + int n,i; + + if (nwvt==0) + return 0; + else + { + iwv-=1; + n=nwv[iwv]; + nwvt-=n; + nwv[iwv]=0; + + for (i=nwvt;i<(nwvt+n);i++) + wv[i]=NULL; + + return n; + } +} + + +int wv_size(void) +{ + return nwvt; +} + + +void alloc_wvd(int n) +{ + 
int i; + complex_dble *vd,*vm; + + if (n==nvdt) + return; + + error_root(nwvdt!=0,1,"alloc_wvd [wspace.c]","Fields are in use"); + + if (nvdt>0) + { + free(nwvd); + afree(wvd0[0]); + free(wvd0); + nwvd=NULL; + wvd0=NULL; + wvd=NULL; + } + + nvdt=n; + iwvd=0; + nwvdt=0; + + if (nvdt>0) + { + set_nvec(); + nwvd=malloc(nvdt*sizeof(*nwvd)); + wvd0=malloc(2*nvdt*sizeof(*wvd0)); + wvd=wvd0+nvdt; + + error((nwvd==NULL)||(wvd0==NULL),1,"alloc_wvd [wspace.c]", + "Unable to allocate index arrays"); + + wvd0[0]=amalloc(nvdt*nvec*sizeof(**wvd0),ALIGN); + + error(wvd0[0]==NULL,1,"alloc_wvd [wspace.c]", + "Unable to allocate workspace"); + + for (i=0;i0) + wvd0[i]=wvd0[i-1]+nvec; + + nwvd[i]=0; + wvd[i]=NULL; + } + + vd=wvd0[0]; + vm=vd+nvdt*nvec; + + for (;vd1) + { + iprms[0]=n; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + + error(iprms[0]!=n,1,"reserve_wvd [wspace.c]", + "Parameter n is not global"); + } + + if (n>0) + { + error((nwvdt+n)>nvdt,1,"reserve_wvd [wspace.c]", + "Requested too many fields (tot=%d,use=%d,req=%d)",nvdt,nwvdt,n); + + ia=nwvdt; + nwvd[iwvd]=n; + nwvdt+=n; + iwvd+=1; + + for (i=ia;i<(ia+n);i++) + wvd[i]=wvd0[i]; + + return wvd+ia; + } + else + return NULL; +} + + +int release_wvd(void) +{ + int n,i; + + if (nwvdt==0) + return 0; + else + { + iwvd-=1; + n=nwvd[iwvd]; + nwvdt-=n; + nwvd[iwvd]=0; + + for (i=nwvdt;i<(nwvdt+n);i++) + wvd[i]=NULL; + + return n; + } +} + + +int wvd_size(void) +{ + return nwvdt; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/vflds/README b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/vflds/README new file mode 100644 index 0000000000000000000000000000000000000000..6b51050dc093150d8081e21951586cb08d353122 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/vflds/README @@ -0,0 +1,107 @@ + +******************************************************************************** + + Allocation and initialization of the global vector 
fields + +******************************************************************************** + + +Files +----- + +vcom.c Communication functions for the global single-precision + vector fields. + +vdcom.c Communication functions for the global double-precision + vector fields. + +vflds.c Allocation and initialization of the global vector fields + related to the deflation subspace. + +vinit.c Generic initialization and assignment programs for complex + single- and double-precision fields. + + +Include file +------------ + +The file vflds.h defines the prototypes for all externally accessible +functions that are defined in the *.c files listed above. + + +List of functions +----------------- + +void cpv_int_bnd(complex *v) + Copies the components of the field v on the interior boundary of + the local block lattice to the corresponding field components at + the exterior boundaries of the block lattices on the neighbouring + MPI processes. + +void cpv_ext_bnd(complex *v) + *Adds* the components of the field v on the exterior boundary of + the local block lattice to the corresponding field components on + the interior boundaries of the block lattices on the neighbouring + MPI processes. + +void cpvd_int_bnd(complex_dble *vd) + Copies the components of the field vd on the interior boundary of + the local block lattice to the corresponding field components at + the exterior boundaries of the block lattices on the neighbouring + MPI processes. + +void cpvd_ext_bnd(complex_dble *vd) + *Adds* the components of the field vd on the exterior boundary of + the local block lattice to the corresponding field components on + the interior boundaries of the block lattices on the neighbouring + MPI processes. + +complex **vflds(void) + Returns the base address of the global single-precision vector fields + (see the notes). The fields are allocated and initialized to zero if + they are not already allocated.
+ +complex_dble **vdflds(void) + Returns the base address of the global double-precision vector fields + (see the notes). The fields are allocated and initialized to zero if + they are not already allocated. + +void set_v2zero(int n,complex *v) + Sets the single-precision field v to zero. + +void set_vd2zero(int n,complex_dble *vd) + Sets the double-precision field vd to zero. + +void random_v(int n,complex *v,float sigma) + Initializes the components of the single-precision field v to + (complex) random values z with distribution proportional to + exp{-|z|^2/sigma^2}. + +void random_vd(int n,complex_dble *vd,double sigma) + Initializes the components of the double-precision field vd to + (complex) random values z with distribution proportional to + exp{-|z|^2/sigma^2}. + +void assign_v2v(int n,complex *v,complex *w) + Assigns the single-precision field v to the single-precision + field w. + +void assign_v2vd(int n,complex *v,complex_dble *wd) + Assigns the single-precision field v to the double-precision + field wd. + +void assign_vd2v(int n,complex_dble *vd,complex *w) + Assigns the double-precision field vd to the single-precision + field w. + +void assign_vd2vd(int n,complex_dble *vd,complex_dble *wd) + Assigns the double-precision field vd to the double-precision + field wd. + +void add_v2vd(int n,complex *v,complex_dble *wd) + Adds the single-precision field v to the double-precision field + wd. + +void diff_vd2v(int n,complex_dble *vd,complex_dble *wd,complex *w) + Assigns the difference vd-wd of the double-precision fields vd + and wd to the single-precision field w. 
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/vflds/vcom.c b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/vflds/vcom.c new file mode 100644 index 0000000000000000000000000000000000000000..7cc7bbd64e80bc471117c1601fe0ff8fb1b75ee1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_C/openQCD-1.4-bgopt/modules/vflds/vcom.c @@ -0,0 +1,416 @@ + +/******************************************************************************* +* +* File vcom.c +* +* Copyright (C) 2007, 2011, 2013 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Communication functions for the global single-precision vector fields. +* +* void cpv_int_bnd(complex *v) +* Copies the components of the field v on the interior boundary of +* the local block lattice to the corresponding field components at +* the exterior boundaries of the block lattices on the neighbouring +* MPI processes. +* +* void cpv_ext_bnd(complex *v) +* *Adds* the components of the field v on the exterior boundary of +* the local block lattice to the corresponding field components on +* the interior boundaries of the block lattices on the neighbouring +* MPI processes. +* +* Notes: +* +* The fields passed to cpv_int_bnd() and cpv_ext_bnd() are interpreted as +* elements of the deflation subspace spanned by the Ns local modes in the +* DFL_BLOCKS block grid. They must have at least Ns*(nb+nbb/2) elements, +* where nb and nbb are the numbers of blocks in the DFL_BLOCKS grid and its +* exterior boundary (see dfl/dfl_geometry.c for further explanations). +* +* In the case of boundary conditions of type 0,1 and 2, the programs do not +* copy any components of the fields across the boundaries of the lattice at +* global time 0 and NPROC0*L0-1. The program cpv_int_bnd() instead sets the +* field at the exterior boundaries of the block lattice at these times to +* zero.
+* +* All these programs involve global communications and must be called on all +* MPI processes simultaneously. +* +*******************************************************************************/ + +#define VCOM_C + +#include +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "dfl.h" +#include "vflds.h" +#include "global.h" + +static int bc,np,nmu[8]; +static int Ns,nb,nbb; +static int nbbe[8],nbbo[8],obbe[8],obbo[8],*ipp; +static int nsnd,sfc[8],sflg[8]; +static complex *snd_buf_int[8],*rcv_buf_int[8]; +static complex *snd_buf_ext[8],*rcv_buf_ext[8],*wb=NULL; +static MPI_Request snd_req_int[8],rcv_req_int[8]; +static MPI_Request snd_req_ext[8],rcv_req_ext[8]; + + +static void alloc_vbufs(void) +{ + int ifc,tag,saddr,raddr; + complex *w; + dfl_parms_t dfl; + dfl_grid_t dgr; + + bc=bc_type(); + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; + dfl=dfl_parms(); + Ns=dfl.Ns; + + error_root(Ns==0,1,"alloc_vbufs [vcom.c]", + "Deflation subspace parameters are not set"); + + dgr=dfl_geometry(); + nb=dgr.nb; + nbb=dgr.nbb; + nsnd=0; + + for (ifc=0;ifc<8;ifc++) + { + nmu[ifc]=cpr[ifc/2]&0x1; + nbbe[ifc]=dgr.nbbe[ifc]; + nbbo[ifc]=dgr.nbbo[ifc]; + obbe[ifc]=dgr.obbe[ifc]; + obbo[ifc]=dgr.obbo[ifc]; + + if (nbbe[ifc]+nbbo[ifc]) + { + sfc[nsnd]=ifc; + nsnd+=1; + } + + sflg[ifc]=((ifc>1)|| + ((ifc==0)&&(cpr[0]!=0))|| + ((ifc==1)&&(cpr[0]!=(NPROC0-1)))|| + (bc==3)); + } + + ipp=dgr.ipp; + + wb=amalloc(Ns*nbb*sizeof(*wb),ALIGN); + error(wb==NULL,1,"alloc_vbufs [vcom.c]", + "Unable to allocate communication buffers"); + set_v2zero(Ns*nbb,wb); + w=wb; + + for (ifc=0;ifc<8;ifc++) + { + snd_buf_int[ifc]=w; + w+=Ns*nbbo[ifc]; + rcv_buf_int[ifc]=w; + w+=Ns*nbbe[ifc^0x1]; + + tag=mpi_permanent_tag(); + saddr=npr[ifc]; + raddr=npr[ifc^0x1]; + + MPI_Send_init(snd_buf_int[ifc],2*Ns*nbbo[ifc], + MPI_FLOAT,saddr,tag,MPI_COMM_WORLD,&snd_req_int[ifc]); + MPI_Recv_init(rcv_buf_int[ifc],2*Ns*nbbe[ifc^0x1], + 
MPI_FLOAT,raddr,tag,MPI_COMM_WORLD,&rcv_req_int[ifc]); + } + + w=wb; + + for (ifc=0;ifc<8;ifc++) + { + snd_buf_ext[ifc]=w; + w+=Ns*nbbe[ifc]; + rcv_buf_ext[ifc]=w; + w+=Ns*nbbo[ifc^0x1]; + + tag=mpi_permanent_tag(); + saddr=npr[ifc]; + raddr=npr[ifc^0x1]; + + MPI_Send_init(snd_buf_ext[ifc],2*Ns*nbbe[ifc], + MPI_FLOAT,saddr,tag,MPI_COMM_WORLD,&snd_req_ext[ifc]); + MPI_Recv_init(rcv_buf_ext[ifc],2*Ns*nbbo[ifc^0x1], + MPI_FLOAT,raddr,tag,MPI_COMM_WORLD,&rcv_req_ext[ifc]); + } +} + + +static void get_int(int n,int *imb,complex *v,complex *w) +{ + int *imm; + complex *vv,*vm; + + imm=imb+n; + + for (;imb0) + send_bufs_int(sfc[m],eo); + + ifc=sfc[n]; + io=ifc^nmu[ifc]; + + if (sflg[io]) + get_int(nbbo[io],ipp+obbo[io],v,snd_buf_int[io]); + + if (n>0) + { + wait_bufs_int(sfc[m],eo); + m+=eo; + eo^=0x1; + } + } + + for (n=0;n<2;n++) + { + send_bufs_int(sfc[m],eo); + wait_bufs_int(sfc[m],eo); + m+=eo; + eo^=0x1; + } + + for (n=0;n0) + send_bufs_ext(sfc[m],eo); + + ifc=sfc[n]; + io=ifc^nmu[ifc]; + + if (sflg[io]) + assign_v2v(Ns*nbbe[io],vb+Ns*obbe[io],snd_buf_ext[io]); + + if (n>0) + { + wait_bufs_ext(sfc[m],eo); + m+=eo; + eo^=0x1; + } + } + + for (n=0;n<2;n++) + { + send_bufs_ext(sfc[m],eo); + wait_bufs_ext(sfc[m],eo); + m+=eo; + eo^=0x1; + } + + for (n=0;n +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "dfl.h" +#include "vflds.h" +#include "global.h" + +static int bc,np,nmu[8]; +static int Ns,nb,nbb; +static int nbbe[8],nbbo[8],obbe[8],obbo[8],*ipp; +static int nsnd,sfc[8],sflg[8]; +static complex_dble *snd_buf_int[8],*rcv_buf_int[8]; +static complex_dble *snd_buf_ext[8],*rcv_buf_ext[8],*wb=NULL; +static MPI_Request snd_req_int[8],rcv_req_int[8]; +static MPI_Request snd_req_ext[8],rcv_req_ext[8]; + + +static void alloc_vdbufs(void) +{ + int ifc,tag,saddr,raddr; + complex_dble *w; + dfl_parms_t dfl; + dfl_grid_t dgr; + + bc=bc_type(); + np=(cpr[0]+cpr[1]+cpr[2]+cpr[3])&0x1; + dfl=dfl_parms(); + Ns=dfl.Ns; + + 
error_root(Ns==0,1,"alloc_vdbufs [vdcom.c]", + "Deflation subspace parameters are not set"); + + dgr=dfl_geometry(); + nb=dgr.nb; + nbb=dgr.nbb; + nsnd=0; + + for (ifc=0;ifc<8;ifc++) + { + nmu[ifc]=cpr[ifc/2]&0x1; + nbbe[ifc]=dgr.nbbe[ifc]; + nbbo[ifc]=dgr.nbbo[ifc]; + obbe[ifc]=dgr.obbe[ifc]; + obbo[ifc]=dgr.obbo[ifc]; + + if (nbbe[ifc]+nbbo[ifc]) + { + sfc[nsnd]=ifc; + nsnd+=1; + } + + sflg[ifc]=((ifc>1)|| + ((ifc==0)&&(cpr[0]!=0))|| + ((ifc==1)&&(cpr[0]!=(NPROC0-1)))|| + (bc==3)); + } + + ipp=dgr.ipp; + + wb=amalloc(Ns*nbb*sizeof(*wb),ALIGN); + error(wb==NULL,1,"alloc_vdbufs [vcom.c]", + "Unable to allocate communication buffers"); + set_vd2zero(Ns*nbb,wb); + w=wb; + + for (ifc=0;ifc<8;ifc++) + { + snd_buf_int[ifc]=w; + w+=Ns*nbbo[ifc]; + rcv_buf_int[ifc]=w; + w+=Ns*nbbe[ifc^0x1]; + + tag=mpi_permanent_tag(); + saddr=npr[ifc]; + raddr=npr[ifc^0x1]; + + MPI_Send_init(snd_buf_int[ifc],2*Ns*nbbo[ifc], + MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD,&snd_req_int[ifc]); + MPI_Recv_init(rcv_buf_int[ifc],2*Ns*nbbe[ifc^0x1], + MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&rcv_req_int[ifc]); + } + + w=wb; + + for (ifc=0;ifc<8;ifc++) + { + snd_buf_ext[ifc]=w; + w+=Ns*nbbe[ifc]; + rcv_buf_ext[ifc]=w; + w+=Ns*nbbo[ifc^0x1]; + + tag=mpi_permanent_tag(); + saddr=npr[ifc]; + raddr=npr[ifc^0x1]; + + MPI_Send_init(snd_buf_ext[ifc],2*Ns*nbbe[ifc], + MPI_DOUBLE,saddr,tag,MPI_COMM_WORLD,&snd_req_ext[ifc]); + MPI_Recv_init(rcv_buf_ext[ifc],2*Ns*nbbo[ifc^0x1], + MPI_DOUBLE,raddr,tag,MPI_COMM_WORLD,&rcv_req_ext[ifc]); + } +} + + +static void get_int(int n,int *imb,complex_dble *v,complex_dble *w) +{ + int *imm; + complex_dble *vv,*vm; + + imm=imb+n; + + for (;imb0) + send_bufs_int(sfc[m],eo); + + ifc=sfc[n]; + io=ifc^nmu[ifc]; + + if (sflg[io]) + get_int(nbbo[io],ipp+obbo[io],vd,snd_buf_int[io]); + + if (n>0) + { + wait_bufs_int(sfc[m],eo); + m+=eo; + eo^=0x1; + } + } + + for (n=0;n<2;n++) + { + send_bufs_int(sfc[m],eo); + wait_bufs_int(sfc[m],eo); + m+=eo; + eo^=0x1; + } + + for (n=0;n0) + 
send_bufs_ext(sfc[m],eo); + + ifc=sfc[n]; + io=ifc^nmu[ifc]; + + if (sflg[io]) + assign_vd2vd(Ns*nbbe[io],vb+Ns*obbe[io],snd_buf_ext[io]); + + if (n>0) + { + wait_bufs_ext(sfc[m],eo); + m+=eo; + eo^=0x1; + } + } + + for (n=0;n<2;n++) + { + send_bufs_ext(sfc[m],eo); + wait_bufs_ext(sfc[m],eo); + m+=eo; + eo^=0x1; + } + + for (n=0;n +#include +#include +#include "mpi.h" +#include "su3.h" +#include "flags.h" +#include "utils.h" +#include "vflds.h" +#include "global.h" + +static int Ns,nv=0; +static complex **vs=NULL,**v; +static complex_dble **vds=NULL,**vd; + + +static void vfld_size(void) +{ + int *bs; + dfl_parms_t dfl; + + error_root(sizeof(complex)!=(2*sizeof(float)),1, + "vfld_size [vflds.c]", + "The complex structures are not properly packed"); + error_root(sizeof(complex_dble)!=(2*sizeof(double)),1, + "vfld_size [vflds.c]", + "The complex_dble structures are not properly packed"); + + dfl=dfl_parms(); + bs=dfl.bs; + Ns=dfl.Ns; + + error_root(dfl.Ns==0,1,"vfld_size [vflds.c]", + "The deflation subspace parameters are not set"); + + nv=VOLUME/(bs[0]*bs[1]*bs[2]*bs[3]); + nv*=Ns; +} + + +static void alloc_vflds(void) +{ + int n; + complex *w; + + if (nv==0) + vfld_size(); + + vs=malloc(4*Ns*sizeof(*vs)); + w=amalloc(2*Ns*nv*sizeof(*w),ALIGN); + + error((vs==NULL)||(w==NULL),1,"alloc_vflds [vflds.c]", + "Unable to allocate vector fields"); + + set_v2zero(2*Ns*nv,w); + v=vs+2*Ns; + + for (n=0;n<(2*Ns);n++) + { + v[n]=NULL; + vs[n]=w; + w+=nv; + } +} + + +static void alloc_vdflds(void) +{ + int n; + complex_dble *wd; + + if (nv==0) + vfld_size(); + + vds=malloc(2*Ns*sizeof(*vds)); + wd=amalloc(Ns*nv*sizeof(*wd),ALIGN); + + error((vds==NULL)||(wd==NULL),1,"alloc_vdflds [vflds.c]", + "Unable to allocate vector fields"); + + set_vd2zero(Ns*nv,wd); + vd=vds+Ns; + + for (n=0;n +#include +#include +#include "su3.h" +#include "random.h" +#include "vflds.h" + +static const complex v0={0.0f}; +static const complex_dble vd0={0.0}; + + +void set_v2zero(int n,complex *v) +{ + 
complex *vm; + + vm=v+n; + + for (;v +#include +#include "mpi.h" +#include "flags.h" +#include "su3fcts.h" +#include "utils.h" +#include "lattice.h" +#include "uflds.h" +#include "mdflds.h" +#include "linalg.h" +#include "forces.h" +#include "wflow.h" +#include "global.h" + +#define N0 (NPROC0*L0) + + +static void update_ud(double eps,su3_alg_dble *frc) +{ + int bc,ix,t,ifc; + su3_dble *u; + + bc=bc_type(); + u=udfld(); + + for (ix=(VOLUME/2);ix1) + { + iprms[0]=n; + dprms[0]=eps; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=n)||(dprms[0]!=eps),1, + "fwd_euler [wflow.c]","Parameters are not global"); + } + + if (n>0) + { + mdfs=mdflds(); + frc=(*mdfs).frc; + + for (k=0;k1) + { + iprms[0]=n; + dprms[0]=eps; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=n)||(dprms[0]!=eps),1, + "fwd_rk2 [wflow.c]","Parameters are not global"); + } + + if (n>0) + { + mdfs=mdflds(); + frc=(*mdfs).frc; + fsv=reserve_wfd(1); + fro=fsv[0]; + + for (k=0;k1) + { + iprms[0]=n; + dprms[0]=eps; + + MPI_Bcast(iprms,1,MPI_INT,0,MPI_COMM_WORLD); + MPI_Bcast(dprms,1,MPI_DOUBLE,0,MPI_COMM_WORLD); + + error((iprms[0]!=n)||(dprms[0]!=eps),1, + "fwd_rk3 [wflow.c]","Parameters are not global"); + } + + if (n>0) + { + mdfs=mdflds(); + frc=(*mdfs).frc; + fsv=reserve_wfd(1); + fro=fsv[0]; + + for (k=0;k fixed + +14.05.2009: bug in reread functionality fixed. + +30.05.2008: scidac checksum for gauge fields not correctly working + different result for serial and MPI run +--> fixed (Remi 02.03.2009) + problem was a wrong computation of rank for I/O writing functions only + when MPI was used + + +16.04.2007: enable-gaugecopy and disable-newdiracop is buggy, at least +in the 1-dim parallel case. -> fixed, wrong #ifdef in xchange_gauge +around MPI_Waitall + +18.1.2007: phmc does not work with halfspinor, seg fault in Hopping_Matrix. 
+--> fixed + +For x86_64 there seems to be a compiler bug in gcc 3.3.3. The code +is not working with the default sse2 optimisation done by the compiler +and now switched off with mfpmath=387. It has to be checked, whether this +is really a compiler bug, a strange interplay between our sse2 units and +the compiler units or a bug in the code (17.08.2004) --> fixed + +bug in the serial version of the code fixed. (16.08.2004) + +there are now two exchange routines for the spinor fields, since at +least on the iwarp pc cluster the version with MPI_Type_vector is not +working for small local lattices. Maybe this is a bug in the MPI +implementation, but maybe not ... +Seems to be a bug in the MPI driver for infiniband. (17.08.2004) + +write and read of spinor fields not yet tested --> tested + +--disable-mpi will be configured correctly, but the code is not correct. --> fixed diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/COPYING b/qcd/part_cpu/applications/QCD/src/kernel_D/COPYING new file mode 100644 index 0000000000000000000000000000000000000000..94a9ed024d3859793618152ea559a168bbcbb5e2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/COPYING @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users.
We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. 
The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. 
Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. 
A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. 
You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/DirectPut.c b/qcd/part_cpu/applications/QCD/src/kernel_D/DirectPut.c new file mode 100644 index 0000000000000000000000000000000000000000..9678465083ce2e3ca2028509ba2d231c874e0224 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/DirectPut.c @@ -0,0 +1,470 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "DirectPut.h" + +// actual number of directions +unsigned int spi_num_dirs = NUM_DIRS; +// total Message Size +// in bytes! 
+uint64_t totalMessageSize; +// Allocate static memory for descriptors +char SPIDescriptorsMemory[ NUM_DIRS * sizeof(MUHWI_Descriptor_t) + 64 ]; +char SPIDescriptorsMemory32[ NUM_DIRS * sizeof(MUHWI_Descriptor_t) + 64 ]; +// pointer to descriptor array +MUHWI_Descriptor_t *SPIDescriptors; +MUHWI_Descriptor_t *SPIDescriptors32; + +const int batsubgroupID = 0; +int do_dynamic = 1; +// Enable different zone routing modes +uint8_t zoneRoutingMask = 0; +unsigned zoneRoutingId = 0; +// stay on bubble bits +uint8_t stayOnBubbleMask = 0; +unsigned stayOnBubbleFlag = 0; + +// pointers to send and receive buffers +char * SPIrecvBuffers; +char * SPIsendBuffers; + +// neighbour destination cache +struct { + MUHWI_Destination_t dest; + uint8_t hintsABCD; + uint8_t hintsE; +} nb2dest[NUM_DIRS]; + +// receive counter +volatile uint64_t recvCounter; + +// counter for injected messages +uint64_t descCount[NUM_DIRS]; + +// base address table slot for receive buffer and counter +uint32_t recvBufBatId = 0, recvCntrBatId = 1; + +// physical address of send buffers +uint64_t sendBufPAddr; + +msg_InjFifoHandle_t injFifoHandle; + +// Sets up the memory regions and base address table (BAT) entries used for +// the direct puts: allocates the two BAT slots recvBufBatId and recvCntrBatId, +// points them at the physical address of SPIrecvBuffers and at the atomic +// (store-add) physical address of recvCounter, and stores the physical address +// of SPIsendBuffers in sendBufPAddr. bufferSize is the size handed to +// Kernel_CreateMemoryRegion for the receive and the send buffer. +// Any failing call terminates the program via exit(1). +void setup_mregions_bats_counters(const int bufferSize) { + const uint64_t buffersSize = bufferSize; + + // allocate bat entries for the receive buffer and the receive counter + + uint32_t batIds[2] = { recvBufBatId, recvCntrBatId }; + MUSPI_BaseAddressTableSubGroup_t batSubGrp; + + int rc = Kernel_AllocateBaseAddressTable( batsubgroupID/*subgrpId*/, + &batSubGrp, + 2,/*nbatids*/ + batIds, + 0 /* "User" use */); + + if (rc != 0) { + fprintf(stderr, "Kernel_AllocateBaseAddressTable failed with rc=%d\n", rc); + exit(1); + } + + // Receive buffer bat is set to the PA addr of the receive buffer + Kernel_MemoryRegion_t memRegion; + rc = Kernel_CreateMemoryRegion ( &memRegion, + SPIrecvBuffers, + buffersSize); + if ( rc != 0) { + printf("Kernel_CreateMemoryRegion failed with rc=%d\n",rc); + exit(1); + } + + uint64_t paAddr = + (uint64_t)SPIrecvBuffers - +
(uint64_t)memRegion.BaseVa + + (uint64_t)memRegion.BasePa; + + rc = MUSPI_SetBaseAddress ( &batSubGrp, + recvBufBatId, + paAddr ); + + if(rc != 0) { + printf("MUSPI_SetBaseAddress failed with rc=%d\n",rc); + exit(1); + } + + // Receive counter bat is set to the MU style atomic PA addr of the receive counter + if( (uint64_t)(&recvCounter) & 0x7 ) { + printf("ERROR: recv counter is not 8 byte aligned\n"); + exit(1); + } + + rc = Kernel_CreateMemoryRegion ( &memRegion, + (void *)&recvCounter, + sizeof(recvCounter)); + if(rc != 0) { + printf("Kernel_CreateMemoryRegion failed with rc=%d\n",rc); + exit(1); + } + + paAddr = + (uint64_t)&recvCounter - + (uint64_t)memRegion.BaseVa + + (uint64_t)memRegion.BasePa; + + uint64_t paAddrAtomic = MUSPI_GetAtomicAddress(paAddr,MUHWI_ATOMIC_OPCODE_STORE_ADD); + + rc = MUSPI_SetBaseAddress ( &batSubGrp, + recvCntrBatId, + paAddrAtomic ); + + if(rc != 0) { + printf("MUSPI_SetBaseAddress failed with rc=%d\n",rc); + exit(1); + } + + // Get the send buffers physical address + rc = Kernel_CreateMemoryRegion ( &memRegion, + SPIsendBuffers, + buffersSize); + if(rc != 0) { + printf("Kernel_CreateMemoryRegion failed with rc=%d\n",rc); + exit(1); + } + + sendBufPAddr = + (uint64_t)SPIsendBuffers - + (uint64_t)memRegion.BaseVa + + (uint64_t)memRegion.BasePa; + return; +} + + +void create_descriptors(MUHWI_Descriptor_t * descriptors, uint64_t * messageSizes, uint64_t * soffsets, + uint64_t * roffsets, const unsigned int num_dirs) { + uint64_t anyFifoMap = + MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_AM | + MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_AP | + MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_BM | + MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_BP | + MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_CM | + MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_CP | + MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_DM | + MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_DP | + MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_EM | + MUHWI_DESCRIPTOR_TORUS_FIFO_MAP_EP; + + uint64_t offset; + static int did_print =0; + + // loop over directions + // CHECK offset needs to be 
adjusted for QCD case + for(unsigned int i = 0; i < num_dirs; i++) { + // Injection Direct Put Descriptor Information Structure + MUSPI_Pt2PtDirectPutDescriptorInfo_t dinfo; + + memset( (void*)&dinfo, 0x00, sizeof(dinfo) ); + + dinfo.Base.Payload_Address = sendBufPAddr + soffsets[i]; + dinfo.Base.Message_Length = messageSizes[i]; + dinfo.Base.Torus_FIFO_Map = anyFifoMap; + + dinfo.Base.Dest = nb2dest[i].dest; + + dinfo.Pt2Pt.Hints_ABCD = nb2dest[i].hintsABCD; + + if(do_dynamic) { + dinfo.Pt2Pt.Misc1 = + nb2dest[i].hintsE | + MUHWI_PACKET_USE_DYNAMIC_ROUTING | + MUHWI_PACKET_DO_NOT_ROUTE_TO_IO_NODE; + + dinfo.Pt2Pt.Misc2 = + MUHWI_PACKET_VIRTUAL_CHANNEL_DYNAMIC | + zoneRoutingMask | + stayOnBubbleMask; + if ( (g_cart_id ==0) && (did_print ==0)) + printf("# SPI using dynamic routing zoneRoutingMask=%d stayOnBubbleMask=%d\n", + zoneRoutingMask, stayOnBubbleMask); + } + else { + dinfo.Pt2Pt.Misc1 = + nb2dest[i].hintsE | + MUHWI_PACKET_USE_DETERMINISTIC_ROUTING | + MUHWI_PACKET_DO_NOT_ROUTE_TO_IO_NODE; + + dinfo.Pt2Pt.Misc2 = + MUHWI_PACKET_VIRTUAL_CHANNEL_DETERMINISTIC | + zoneRoutingMask | + stayOnBubbleMask; + if ( (g_cart_id ==0) && (did_print ==0)) printf("# SPI using deterministic routing\n"); + } + did_print++; + + dinfo.Pt2Pt.Skip = 8; // for checksumming, skip the header + dinfo.DirectPut.Rec_Payload_Base_Address_Id = recvBufBatId; + dinfo.DirectPut.Rec_Payload_Offset = roffsets[i]; + dinfo.DirectPut.Rec_Counter_Base_Address_Id = recvCntrBatId; + dinfo.DirectPut.Rec_Counter_Offset = 0; + + dinfo.DirectPut.Pacing = MUHWI_PACKET_DIRECT_PUT_IS_NOT_PACED; + + int rc = MUSPI_CreatePt2PtDirectPutDescriptor(&descriptors[i], + &dinfo ); + if (rc != 0) { + fprintf(stderr, "MUSPI_CreatePt2PtDirectPutDescriptor failed with rc=%d\n",rc); + exit(1); + } + } +} + + +int get_destinations(int * mypers) { + + int tmp[6]; +#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + MPI_Status mstatus; + MPI_Sendrecv((void*)mypers, 6, MPI_INT, 
g_nb_t_up, 0, + (void*)tmp, 6, MPI_INT, g_nb_t_dn, 0, + g_cart_grid, &mstatus); + MUSPI_SetUpDestination( &nb2dest[1].dest, tmp[0], tmp[1], tmp[2], tmp[3], tmp[4] ); + MPI_Sendrecv((void*)mypers, 6, MPI_INT, g_nb_t_dn, 1, + (void*)tmp, 6, MPI_INT, g_nb_t_up, 1, + g_cart_grid, &mstatus); + MUSPI_SetUpDestination( &nb2dest[0].dest, tmp[0], tmp[1], tmp[2], tmp[3], tmp[4] ); +#endif +#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + MPI_Sendrecv((void*)mypers, 6, MPI_INT, g_nb_x_up, 2, + (void*)tmp, 6, MPI_INT, g_nb_x_dn, 2, + g_cart_grid, &mstatus); + MUSPI_SetUpDestination( &nb2dest[3].dest, tmp[0], tmp[1], tmp[2], tmp[3], tmp[4] ); + MPI_Sendrecv((void*)mypers, 6, MPI_INT, g_nb_x_dn, 3, + (void*)tmp, 6, MPI_INT, g_nb_x_up, 3, + g_cart_grid, &mstatus); + MUSPI_SetUpDestination( &nb2dest[2].dest, tmp[0], tmp[1], tmp[2], tmp[3], tmp[4] ); +#endif +#if (defined PARALLELXYT || defined PARALLELXYZT) + MPI_Sendrecv((void*)mypers, 6, MPI_INT, g_nb_y_up, 4, + (void*)tmp, 6, MPI_INT, g_nb_y_dn, 4, + g_cart_grid, &mstatus); + MUSPI_SetUpDestination( &nb2dest[5].dest, tmp[0], tmp[1], tmp[2], tmp[3], tmp[4] ); + MPI_Sendrecv((void*)mypers, 6, MPI_INT, g_nb_y_dn, 5, + (void*)tmp, 6, MPI_INT, g_nb_y_up, 5, + g_cart_grid, &mstatus); + MUSPI_SetUpDestination( &nb2dest[4].dest, tmp[0], tmp[1], tmp[2], tmp[3], tmp[4] ); +#endif +#if (defined PARALLELXYZT) + MPI_Sendrecv((void*)mypers, 6, MPI_INT, g_nb_z_up, 6, + (void*)tmp, 6, MPI_INT, g_nb_z_dn, 6, + g_cart_grid, &mstatus); + MUSPI_SetUpDestination( &nb2dest[7].dest, tmp[0], tmp[1], tmp[2], tmp[3], tmp[4] ); + MPI_Sendrecv((void*)mypers, 6, MPI_INT, g_nb_z_dn, 7, + (void*)tmp, 6, MPI_INT, g_nb_z_up, 7, + g_cart_grid, &mstatus); + MUSPI_SetUpDestination( &nb2dest[6].dest, tmp[0], tmp[1], tmp[2], tmp[3], tmp[4] ); +#endif + return(0); +} + +typedef struct msg_InjFifoInfo +{ + MUSPI_InjFifoSubGroup_t subgroup[BGQ_MU_NUM_FIFO_SUBGROUPS_PER_NODE]; + uint32_t numFifosInSubgroup[BGQ_MU_NUM_FIFO_SUBGROUPS_PER_NODE]; + 
void *fifoMemoryPtr [BGQ_MU_NUM_INJ_FIFOS_PER_SUBGROUP * + BGQ_MU_NUM_FIFO_SUBGROUPS_PER_NODE]; + void *fifoPtr [BGQ_MU_NUM_INJ_FIFOS_PER_SUBGROUP * + BGQ_MU_NUM_FIFO_SUBGROUPS_PER_NODE]; + uint32_t startingSubgroupId; + uint32_t startingFifoId; + uint32_t numFifos; + uint32_t numSubgroups; +} msg_InjFifoInfo_t; + + +uint64_t msg_InjFifoInject ( msg_InjFifoHandle_t injFifoHandle, + uint32_t relativeFifoId, + MUHWI_Descriptor_t *descPtr ) { + msg_InjFifoInfo_t *info = (msg_InjFifoInfo_t*)injFifoHandle.pOpaqueObject; + + uint32_t globalFifoId = (info->startingSubgroupId * BGQ_MU_NUM_INJ_FIFOS_PER_SUBGROUP) + + info->startingFifoId + relativeFifoId; + + uint32_t subgroupId = globalFifoId / BGQ_MU_NUM_INJ_FIFOS_PER_SUBGROUP; + uint64_t rc = MUSPI_InjFifoInject (MUSPI_IdToInjFifo( globalFifoId % BGQ_MU_NUM_INJ_FIFOS_PER_SUBGROUP, + &info->subgroup[subgroupId] ), + descPtr); + return rc; +} + +void msg_InjFifoTerm ( msg_InjFifoHandle_t injFifoHandle ) { + return; /*Simple library do nothing! */ +} + +int msg_InjFifoInit ( msg_InjFifoHandle_t *injFifoHandlePtr, + uint32_t startingSubgroupId, + uint32_t startingFifoId, + uint32_t numFifos, + size_t fifoSize, + Kernel_InjFifoAttributes_t *injFifoAttrs ) { + + void *buffer = NULL; + uint32_t endingFifoId; // Relative to a subgroup + uint32_t numFifosInSubgroup; + int rc; + uint32_t subgroupId = startingSubgroupId; + uint32_t fifoIds[BGQ_MU_NUM_INJ_FIFOS_PER_SUBGROUP]; + Kernel_InjFifoAttributes_t attrs[BGQ_MU_NUM_INJ_FIFOS_PER_SUBGROUP]; + Kernel_InjFifoAttributes_t defaultAttrs; + uint64_t lock_cache; + + memset ( &defaultAttrs, 0x00, sizeof(defaultAttrs) ); + if(injFifoAttrs == NULL) { + injFifoAttrs = &defaultAttrs; + } + + // Malloc space for the info structure + msg_InjFifoInfo_t *info; + info = (msg_InjFifoInfo_t *) memalign(32, sizeof(msg_InjFifoInfo_t)); + if( !info ) return -1; + + // Initialize the info structure + info->startingSubgroupId = startingSubgroupId; + info->startingFifoId = startingFifoId; + 
info->numFifos = numFifos; + info->numSubgroups = 0; + + // Reject requests that do not fit into the fixed-size tables of + // msg_InjFifoInfo_t (subgroup[], numFifosInSubgroup[], fifoPtr[]); without + // this check an oversized request writes past the end of those arrays. + const uint64_t maxFifos = (uint64_t)BGQ_MU_NUM_INJ_FIFOS_PER_SUBGROUP * BGQ_MU_NUM_FIFO_SUBGROUPS_PER_NODE; + const uint64_t firstGlobalFifo = (uint64_t)startingSubgroupId * BGQ_MU_NUM_INJ_FIFOS_PER_SUBGROUP + startingFifoId; + if ( startingSubgroupId >= BGQ_MU_NUM_FIFO_SUBGROUPS_PER_NODE || + startingFifoId >= BGQ_MU_NUM_INJ_FIFOS_PER_SUBGROUP || + firstGlobalFifo + numFifos > maxFifos ) { + free(info); + return -1; + } + + // Malloc space for the injection fifos. They are 64-byte aligned. + for (unsigned int i = 0; i < numFifos; i++) { + info->fifoPtr[i] = (uint64_t*)memalign(64, fifoSize); + if ( !info->fifoPtr[i] ) { + // Nothing has been handed to the MU yet, so the partial allocation can + // simply be released again instead of being leaked. + while ( i > 0 ) free(info->fifoPtr[--i]); + free(info); + return -1; + } + } + + // Process one subgroup at a time. + // - Allocate the fifos. + // - Init the MU MMIO for the fifos. + // - Activate the fifos. + while ( numFifos > 0 ) { + info->numSubgroups++; + + // startingFifoId is the starting fifo number relative to the + // subgroup we are working on. + // Determine endingFifoId, the ending fifo number relative to + // the subgroup we are working on. + endingFifoId = startingFifoId + numFifos-1; + if ( endingFifoId > (BGQ_MU_NUM_INJ_FIFOS_PER_SUBGROUP-1) ) { + endingFifoId = BGQ_MU_NUM_INJ_FIFOS_PER_SUBGROUP-1; + } + numFifosInSubgroup = endingFifoId - startingFifoId + 1; + info->numFifosInSubgroup[subgroupId] = numFifosInSubgroup; + + // Init structures for allocating the fifos... + // - fifo Ids + // - attributes + for (unsigned int i = 0; i < numFifosInSubgroup; i++) { + fifoIds[i] = startingFifoId + i; + memcpy(&attrs[i], injFifoAttrs, sizeof(attrs[i])); + } + + // Allocate the fifos + rc = Kernel_AllocateInjFifos (subgroupId, + &info->subgroup[subgroupId], + numFifosInSubgroup, + fifoIds, + attrs); + if ( rc ) { + printf("msg_InjFifoInit: Kernel_AllocateInjFifos failed with rc=%d\n",rc); + return rc; + } + + // Init the MU MMIO for the fifos.
+ for (unsigned int i = 0; i < numFifosInSubgroup; i++) { + Kernel_MemoryRegion_t memRegion; + rc = Kernel_CreateMemoryRegion ( &memRegion, + info->fifoPtr[numFifos-i-1], + fifoSize ); + if ( rc ) { + printf("msg_InjFifoInit: Kernel_CreateMemoryRegion failed with rc=%d\n",rc); + return rc; + } + + // initialise the Fifos + rc = Kernel_InjFifoInit (&info->subgroup[subgroupId], + fifoIds[i], + &memRegion, + (uint64_t)info->fifoPtr[numFifos-i-1] - + (uint64_t)memRegion.BaseVa, + fifoSize-1); + if ( rc ) { + printf("msg_InjFifoInit: Kernel_InjFifoInit failed with rc=%d\n",rc); + return rc; + } + } + + // Activate the fifos. + rc = Kernel_InjFifoActivate (&info->subgroup[subgroupId], + numFifosInSubgroup, + fifoIds, + KERNEL_INJ_FIFO_ACTIVATE); + if ( rc ) { + printf("msg_InjFifoInit: Kernel_InjFifoActivate failed with rc=%d\n",rc); + return rc; + } + + startingFifoId = 0; // Next subgroup will start at fifo 0. + + subgroupId++; // Next subgroup. + numFifos -= numFifosInSubgroup; + } + + injFifoHandlePtr->pOpaqueObject = (void *)info; + return 0; +} + + +void global_barrier() { + int rc = 0; + uint64_t timeoutCycles = 60UL * 1600000000UL; // about 60 sec at 1.6 ghz + rc = MUSPI_GIBarrierEnter ( &GIBarrier ); + if (rc) { + printf("MUSPI_GIBarrierEnter failed returned rc = %d\n", rc); + exit(1); + } + + // Poll for completion of the barrier. 
+ rc = MUSPI_GIBarrierPollWithTimeout ( &GIBarrier, timeoutCycles); + if( rc ) { + printf("MUSPI_GIBarrierPollWithTimeout failed returned rc = %d\n", rc); + DelayTimeBase (200000000000UL); + exit(1); + } + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/DirectPut.h b/qcd/part_cpu/applications/QCD/src/kernel_D/DirectPut.h new file mode 100644 index 0000000000000000000000000000000000000000..f37bc43a292390e4f6678acde8b8e3c1b28f5037 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/DirectPut.h @@ -0,0 +1,133 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _DIRECT_PUT_H +#define _DIRECT_PUT_H +# ifdef SPI +// Basic SPI and HWI includes +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +// maximal number of directions: +// we have four directions and forward/backward +# define NUM_DIRS 8 +# define INJ_MEMORY_FIFO_SIZE ((64*NUM_DIRS) -1) + +// total message size summed over all directions +extern uint64_t totalMessageSize; + +// actual number of directions +extern unsigned int spi_num_dirs; + +// pointers to send and receive buffers +extern char * SPIrecvBuffers; +extern char * SPIsendBuffers; +extern char SPIDescriptorsMemory[ NUM_DIRS * sizeof(MUHWI_Descriptor_t) + 64 ]; +extern char SPIDescriptorsMemory32[ NUM_DIRS * sizeof(MUHWI_Descriptor_t) + 64 ]; +extern MUHWI_Descriptor_t * SPIDescriptors; +extern MUHWI_Descriptor_t * SPIDescriptors32; + +// physical address of send buffers +extern uint64_t sendBufPAddr; + +// receive counter +extern volatile uint64_t recvCounter; + +// counter for injected messages +extern uint64_t descCount[NUM_DIRS]; + +// get the destinations for all neighbours +// will be saved in nb2dest +int get_destinations(int * mypers); + +// Call to create one direct-put descriptor per direction. +// The three uint64_t arrays are, in this order, the message sizes, the offsets +// into the send buffer and the offsets into the receive buffer (one entry per +// direction); the last argument is the number of directions to set up. +void create_descriptors(MUHWI_Descriptor_t * descriptors, uint64_t *, uint64_t *, uint64_t *, const unsigned int); + +// Call to set up the base address table id and memory regions +void setup_mregions_bats_counters(const int bufferSize); + +// global barrier using GIBarrier +// NOTE(review): the next line defines GIBarrier rather than declaring it (there +// is no `extern`), so every source file including this header carries its own +// tentative definition; that only links where common symbols are allowed +// (e.g. not with -fno-common). Consider `extern` here plus a single definition +// in one .c file. +MUSPI_GIBarrier_t GIBarrier; +void global_barrier(); + +/** + * \brief Injection Fifo Handle + * + * This is a "handle" returned from msg_InjFifoInit() and passed into subsequent + * calls to msg_InjFifoXXXX() functions. It is used internally within the + * msg_InjFifoXXXX() functions to anchor resources that have been allocated.
+ */ +typedef struct { + void* pOpaqueObject; +} msg_InjFifoHandle_t; + +// Fifo handles +extern msg_InjFifoHandle_t injFifoHandle; + +// Allocate, initialise and activate numFifos injection fifos of fifoSize bytes +// each, starting at fifo startingFifoId of subgroup startingSubgroupId and +// continuing into the following subgroups as needed. injFifoAttrs may be NULL, +// in which case zeroed attributes are used. On success the handle is stored in +// *injFifoHandlePtr and 0 is returned; -1 is returned if a memory allocation +// fails, otherwise the non-zero return code of the failing Kernel_* call. +int msg_InjFifoInit ( msg_InjFifoHandle_t *injFifoHandlePtr, + uint32_t startingSubgroupId, + uint32_t startingFifoId, + uint32_t numFifos, + size_t fifoSize, + Kernel_InjFifoAttributes_t *injFifoAttrs ); + +// basically a dummy routine for termination: the implementation returns +// immediately and releases nothing +void msg_InjFifoTerm ( msg_InjFifoHandle_t injFifoHandle ); + + +/** + * \brief Inject Descriptor into Injection Fifo + * + * Inject the specified descriptor into the specified injection fifo. + * + * \param [in] injFifoHandle The handle returned from msg_InjFifoInit(). + * It must be passed into this function untouched + * from when it was returned from msg_InjFifoInit(). + * \param [in] relativeFifoId The fifo number, relative to the start of + * the fifos managed by this opaque object. + * For example, if msg_InjFifoInit() was called + * to init fifos in subgroup 2, starting with + * fifo Id 3, the relativeFifoId of the + * first fifo is 0, not 3. + * \param [in] descPtr Pointer to the descriptor to be injected. + * + * \retval positiveNumber The descriptor was successfully injected. The + * returned value is the sequence number of this + * descriptor. + * \retval -1 The descriptor was not injected, most likely because + * there is no room in the fifo. As the return type is + * uint64_t, compare against (uint64_t)-1.
+ */ +uint64_t msg_InjFifoInject ( msg_InjFifoHandle_t injFifoHandle, + uint32_t relativeFifoId, + MUHWI_Descriptor_t *descPtr ); + + +# endif // SPI +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GIT-VERSION-GEN b/qcd/part_cpu/applications/QCD/src/kernel_D/GIT-VERSION-GEN new file mode 100644 index 0000000000000000000000000000000000000000..c7898f9451cfbb07bbf1efb66506c4ae3d789aef --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GIT-VERSION-GEN @@ -0,0 +1,86 @@ +####################################################################### +# Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +# Copyright (C) GIT-VERSION-GEN 2012 Bartosz Kostrzewa +# +# This file is part of tmLQCD. +# +# tmLQCD is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# tmLQCD is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with tmLQCD. If not, see . +####################################################################### + +# This file has been adapted from GIT-VERSION-GEN from the git distribution. +# The original is available from http://git-scm.com/ + +# It is run by make during the build process and generates git_hash.h with the current +# head commit as an identifier. 
(after checking that .git exists) + +#!/bin/sh + +# write the hash to git_hash.h +write_git_hash() { + echo "#ifndef _GIT_HASH_H" > git_hash.h + echo "#define _GIT_HASH_H" >> git_hash.h + echo "const char git_hash[] = {\"${GIT_HASH}\"};" >> git_hash.h + echo "#endif /* _GIT_HASH_H */" >> git_hash.h +} + +# extract default version from configure.in if it exists +if test -r configure.in +then + DEF_VER=$(grep "AC_INIT" configure.in | awk '{print $2}' | sed 's/,//') +else + DEF_VER="no_version_information" +fi + +# find git (GIT_BIN stays empty when git is not installed) +GIT_BIN=`command -v git` + +# First see if there is a version file (included in release tarballs), +# compare whether it matches the current HEAD commit, +# then try git rev-parse HEAD, then default. +# We also check whether we should leave it alone and just exit. +if test -f git_hash.h +then + # remove all unnecessary fields and characters + GIT_HASH=$( grep "const" git_hash.h | awk '{print $5}' | sed 's/[\",\{,\},;]//g' ) + # are we in a git repo and does git exist? + # (GIT_BIN must be quoted: unquoted and empty, the check collapses to the + # one-argument form `test -x', which is always true) + if test -d .git -o -f .git && test -x "${GIT_BIN}" + then + GIT_REV=$(git rev-parse HEAD) + # does the version correspond to the HEAD commit? + # (quoted so that an empty hash is compared instead of breaking the test) + if [ "${GIT_HASH}" = "${GIT_REV}" ] + then + # the versions match, let's exit to avoid changing the timestamp of git_hash.h + exit 0 + else + GIT_HASH=${GIT_REV} + write_git_hash + fi + # we are not in a git repository but git_hash.h exists. We must be building from + # a tarball!
Let's assume git_hash.h is correct and exit before we do any damage + # (this branch will also be followed if .git exists but there is no git available + # to extract version information) + else + exit 0 + fi +# git_hash.h does not exist, let's try to generate it +elif test -d .git -o -f .git && test -x "${GIT_BIN}" +then + # .git exists and we are in a git repo + GIT_HASH=$(git rev-parse HEAD) + write_git_hash +else + GIT_HASH=${DEF_VER} + write_git_hash +fi + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/ALTERNATE.cuh b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/ALTERNATE.cuh new file mode 100644 index 0000000000000000000000000000000000000000..ce81c6e3d7d46f897121d3580167b53bf8897ac3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/ALTERNATE.cuh @@ -0,0 +1,514 @@ +/************************************************************************** + * + * Copyright (C) 2010 Joseph Nagel + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see .
+ * + **************************************************************************/ + + + + +///////////////////// +// initializations // +///////////////////// + + +/* +//#ifdef MPI + #ifdef ALTERNATE_HOPPING_MATRIX + +// allocates memory for the fields for the alternative way of addressing positions in dev_Hopping_Matrix_alternate() + +void init_gpu_indexfields() { + + size_t size; + + // debug + //printf("Test: %p ?= %p\n", *g_iup, g_iup[0]); + //printf("Test: %p ?= %p\n", *g_idn, g_idn[0]); + //printf("Test: %p ?= %p\n", g_lexic2eo, &g_lexic2eo[0]); + //printf("Test: %p ?= %p\n", g_lexic2eosub, &g_lexic2eosub[0]); + //printf("Test: %p ?= %p\n", g_eo2lexic, &g_eo2lexic[0]); + //printf("Test: %p ?= %p ?= %p\n", ***g_ipt, &g_ipt[0][0][0][0], g_ipt[0][0][0]); + + size = 4*(VOLUME+RAND)*sizeof(int); + cudaMalloc((void **) &dev_g_iup, size); + cudaMalloc((void **) &dev_g_idn, size); + cudaMemcpy(dev_g_iup, g_iup[0], size, cudaMemcpyHostToDevice); + cudaMemcpy(dev_g_idn, g_idn[0], size, cudaMemcpyHostToDevice); + + size = (VOLUME+RAND)*sizeof(int); + cudaMalloc((void **) &dev_g_lexic2eo, size); + cudaMalloc((void **) &dev_g_lexic2eosub, size); + cudaMemcpy(dev_g_lexic2eo, g_lexic2eo, size, cudaMemcpyHostToDevice); + cudaMemcpy(dev_g_lexic2eosub, g_lexic2eosub, size, cudaMemcpyHostToDevice); + + size = (VOLUME+RAND)*sizeof(int); + cudaMalloc((void **) &dev_g_eo2lexic, size); + cudaMemcpy(dev_g_eo2lexic, g_eo2lexic, size, cudaMemcpyHostToDevice); + + size = VOLUME*sizeof(int); + cudaMalloc((void **) &dev_g_ipt, size); + cudaMemcpy(dev_g_ipt, g_ipt[0][0][0], size, cudaMemcpyHostToDevice); + +} + + + + +// frees the memory + +void free_gpu_indexfields() { + + cudaFree(dev_g_iup); + cudaFree(dev_g_idn); + + cudaFree(dev_g_lexic2eo); + cudaFree(dev_g_lexic2eosub); + + cudaFree(dev_g_eo2lexic); + + cudaFree(dev_g_ipt); + +} + + #endif +//#endif // MPI +*/ + + + + + + +//////////////////// +// hopping matrix // +//////////////////// + + +/* +//#ifdef MPI + #ifdef 
ALTERNATE_HOPPING_MATRIX + +// applies the Hopping Part Even-Odd ! +// the gauge field is the complete gaugefield! +// the gauge field at the local point is reconstructed by 2*pos+eo where pos is the eo-position +// from 0..VOLUME/2-1, eo = 0 or 1 +// the positions in the gauge fields are passed in "gfindex_site" for gf's that are attached at +// the actual positions and in "gfindex_nextsite" for gf's that start at a position of the +// other eo-sublattice. +// for the hopping positions of the eo-spinor field we use on of the two dedicated eo-nn fields +// the boundary conditions are implemented as in Hopping_Matrix.c +// mult with complex conjugate k0,k1,k2,k3 in positive direction because +// psi(x+mu) != exp(i theta_mu) psi(x) + +__global__ void dev_Hopping_Matrix_alternate (const dev_su3_2v * gf, const dev_spinor * sin, dev_spinor * sout, + int * dev_iup, int * dev_idn, int * dev_eo2lexic, int * dev_lexic2eosub, + int ieo) { + + + // guess: ieo = 0 corresponds to even sites ?! + + // USETEXTURE is not likely to work ... not now ... + // same for TEMPORALGAUGE ... 
+ + + int pos_eo; + int pos_global; + int hoppos_eo; + int hoppos_global; + + dev_spinor shelp1[6], ssum[6]; + __shared__ dev_su3_pad gfsmem[BLOCK]; + + + + pos_eo = threadIdx.x + blockDim.x*blockIdx.x; + int ix = threadIdx.x; + + + + + ////////// + // main // + ////////// + + + if (pos_eo < dev_VOLUME) { + + + if (ieo == 0) + pos_global = dev_eo2lexic[pos_eo]; + else + pos_global = dev_eo2lexic[dev_VOLUMEPLUSRAND/2 + pos_eo]; + + + dev_zero_spinor(&(ssum[0])); // zero sum + + + #ifdef TEMPORALGAUGE + int spatialvol = dev_LX*dev_LY*dev_LZ; + #endif + + + + + /////////////// + // l == 0, t // + /////////////// + + // positive direction + hoppos_global = dev_iup[4*pos_global + 0]; + hoppos_eo = dev_lexic2eosub[hoppos_global]; + + #ifdef TEMPORALGAUGE + // gf == ID for t != T-1 => just read the spinor + + if((gfindex_site[pos]/spatialvol) != (dev_T-1) ){ + #ifdef USETEXTURE + shelp1[0] = tex1Dfetch(spin_tex,6*hoppos); + shelp1[1] = tex1Dfetch(spin_tex,6*hoppos+1); + shelp1[2] = tex1Dfetch(spin_tex,6*hoppos+2); + shelp1[3] = tex1Dfetch(spin_tex,6*hoppos+3); + shelp1[4] = tex1Dfetch(spin_tex,6*hoppos+4); + shelp1[5] = tex1Dfetch(spin_tex,6*hoppos+5); + #else + shelp1[0] = sin[6*hoppos]; + shelp1[1] = sin[6*hoppos+1]; + shelp1[2] = sin[6*hoppos+2]; + shelp1[3] = sin[6*hoppos+3]; + shelp1[4] = sin[6*hoppos+4]; + shelp1[5] = sin[6*hoppos+5]; + #endif + } + else{ + // gf != ID for t == T-1 => mult spinor with gf + #ifdef GF_8 + dev_reconstructgf_8texref(gf, 4*(gfindex_site[pos]),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref(gf,4*(gfindex_site[pos]),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos_eo, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos_eo]), &(shelp1[0])); + #endif + } + #else + #ifdef GF_8 + dev_reconstructgf_8texref(gf, 4*hoppos_global, &(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref(gf, 4*hoppos_global, &(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, 
hoppos_eo, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos_eo]), &(shelp1[0])); + #endif + #endif + + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP0_plus(&(ssum[0]), &(shelp1[0]), dev_cconj(dev_k0)); + #else + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_mk0,&(shelp1[0]), &(ssum[0])); + dev_Gamma0(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_k0,&(shelp1[0]), &(ssum[0])); + #endif + + + + + /////////////// + // l == 0, t // + /////////////// + + // negative direction + hoppos_global = dev_idn[4*pos_global + 0]; + hoppos_eo = dev_lexic2eosub[hoppos_global]; + + //hoppos = tex1Dfetch(nn_tex,8*pos+4); + //color + #ifdef TEMPORALGAUGE + // gf == ID for t != T-1 => just read the spinor + if((gfindex_nextsite[hoppos]/spatialvol) != (dev_T-1) ){ + #ifdef USETEXTURE + shelp1[0] = tex1Dfetch(spin_tex,6*hoppos); + shelp1[1] = tex1Dfetch(spin_tex,6*hoppos+1); + shelp1[2] = tex1Dfetch(spin_tex,6*hoppos+2); + shelp1[3] = tex1Dfetch(spin_tex,6*hoppos+3); + shelp1[4] = tex1Dfetch(spin_tex,6*hoppos+4); + shelp1[5] = tex1Dfetch(spin_tex,6*hoppos+5); + #else + shelp1[0] = sin[6*hoppos]; + shelp1[1] = sin[6*hoppos+1]; + shelp1[2] = sin[6*hoppos+2]; + shelp1[3] = sin[6*hoppos+3]; + shelp1[4] = sin[6*hoppos+4]; + shelp1[5] = sin[6*hoppos+5]; + #endif + } + else{ + // gf != ID for t == T-1 => mult spinor with gf + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(gf,4*gfindex_nextsite[hoppos],&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger(gf,4*gfindex_nextsite[hoppos],&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + } + #else + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(gf, 4*hoppos_global, &(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger(gf, 4*hoppos_global, &(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos_eo, &(shelp1[0])); + #else + 
dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos_eo]), &(shelp1[0])); + #endif + #endif + + //-kappa(r + gamma_mu) + #ifdef GF_8 + dev_kappaP0_minus(&(ssum[0]), &(shelp1[0]), dev_k0); + #else + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk0,&(shelp1[0]), &(ssum[0])); + dev_Gamma0(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk0,&(shelp1[0]), &(ssum[0])); + #endif + + + + + /////////////// + // l == 3, z // + /////////////// + + // positive direction + hoppos_global = dev_iup[4*pos_global + 3]; + hoppos_eo = dev_lexic2eosub[hoppos_global]; + + //hoppos = tex1Dfetch(nn_tex,8*pos+3); + //color + #ifdef GF_8 + dev_reconstructgf_8texref(gf, 4*(hoppos_global)+(3), &(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref(gf, 4*(hoppos_global)+(3),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos_eo, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos_eo]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP3_plus(&(ssum[0]), &(shelp1[0]), dev_k3.re); + #else + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_mk3,&(shelp1[0]), &(ssum[0])); + dev_Gamma3(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_k3,&(shelp1[0]), &(ssum[0])); + #endif + + + + + /////////////// + // l == 3, z // + /////////////// + + // negative direction + hoppos_global = dev_idn[4*pos_global + 3]; + hoppos_eo = dev_lexic2eosub[hoppos_global]; + + //hoppos = tex1Dfetch(nn_tex,8*pos+7); + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(gf, 4*hoppos_global+(3), &(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger(gf, 4*hoppos_global+(3), &(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos_eo, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos_eo]), &(shelp1[0])); + #endif + //-kappa(r + gamma_mu) + #ifdef GF_8 + dev_kappaP3_minus(&(ssum[0]), &(shelp1[0]), dev_k3.re); + #else + 
dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk3,&(shelp1[0]), &(ssum[0])); + dev_Gamma3(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk3,&(shelp1[0]), &(ssum[0])); + #endif + + + + + /////////////// + // l == 2, y // + /////////////// + + // positive direction + hoppos_global = dev_iup[4*pos_global + 2]; + hoppos_eo = dev_lexic2eosub[hoppos_global]; + + //hoppos = tex1Dfetch(nn_tex,8*pos+2); + //color + #ifdef GF_8 + dev_reconstructgf_8texref(gf, 4*(hoppos_global)+(2), &(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref(gf, 4*(hoppos_global)+(2), &(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos_eo, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos_eo]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP2_plus(&(ssum[0]), &(shelp1[0]), dev_k2.re); + #else + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_mk2,&(shelp1[0]), &(ssum[0])); + dev_Gamma2(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_k2,&(shelp1[0]), &(ssum[0])); + #endif + + + + + /////////////// + // l == 2, y // + /////////////// + + // negative direction + hoppos_global = dev_idn[4*pos_global + 2]; + hoppos_eo = dev_lexic2eosub[hoppos_global]; + + //hoppos = tex1Dfetch(nn_tex,8*pos+6); + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(gf, 4*(hoppos_global)+(2), &(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger(gf, 4*(hoppos_global)+(2), &(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos_eo, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos_eo]), &(shelp1[0])); + #endif + //-kappa(r + gamma_mu) + #ifdef GF_8 + dev_kappaP2_minus(&(ssum[0]), &(shelp1[0]), dev_k2.re); + #else + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk2,&(shelp1[0]), &(ssum[0])); + dev_Gamma2(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk2,&(shelp1[0]), &(ssum[0])); + #endif + + + + + /////////////// + // 
l == 1, x // + /////////////// + + // positive direction + hoppos_global = dev_iup[4*pos_global + 1]; + hoppos_eo = dev_lexic2eosub[hoppos_global]; + + //hoppos = tex1Dfetch(nn_tex,8*pos+1); + //color + #ifdef GF_8 + dev_reconstructgf_8texref(gf, 4*(hoppos_global)+(1), &(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref(gf, 4*(hoppos_global)+(1), &(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos_eo, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos_eo]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP1_plus(&(ssum[0]), &(shelp1[0]), dev_k1.re); + #else + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_mk1,&(shelp1[0]), &(ssum[0])); + dev_Gamma1(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_k1,&(shelp1[0]), &(ssum[0])); + #endif + + + + + /////////////// + // l == 1, x // + /////////////// + + // negative direction + hoppos_global = dev_idn[4*pos_global + 1]; + hoppos_eo = dev_lexic2eosub[hoppos_global]; + + //hoppos = tex1Dfetch(nn_tex,8*pos+5); + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(gf, 4*(hoppos_global)+(1), &(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger(gf, 4*(hoppos_global)+(1), &(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos_eo, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos_eo]), &(shelp1[0])); + #endif + //-kappa(r + gamma_mu) + #ifdef GF_8 + dev_kappaP1_minus(&(ssum[0]), &(shelp1[0]), dev_k1.re); + #else + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk1,&(shelp1[0]), &(ssum[0])); + dev_Gamma1(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk1,&(shelp1[0]), &(ssum[0])); + #endif + + + + + ///////////// + // output // + //////////// + + //copy to output spinor + dev_copy_spinor(&(ssum[0]),&(sout[6*pos_eo])); + + } + + +}//dev_Hopping_Matrix_alternate<<<>>>() + + #endif +//#endif // MPI +*/ + + + + + diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/ASYNC.cuh b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/ASYNC.cuh new file mode 100644 index 0000000000000000000000000000000000000000..423c4a2a2ddc1c39526be18e04466bbfe45673c1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/ASYNC.cuh @@ -0,0 +1,2048 @@ +/************************************************************************** + * + * Copyright (C) 2010 Joseph Nagel + * 2010 Florian Burger + * + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + * + **************************************************************************/ + + + + + + +#ifndef HALF + + +//applies the Hopping Part Even-Odd ! +//the gauge field is the complete gaugefield! +//the gauge field at the local point is reconstructed by 2*pos+eo where pos is the eo-position +//from 0..VOLUME/2-1, eo = 0 or 1 +//the positions in the gauge fields are passed in "gfindex_site" for gf's that are attached at +//the actual positions and in "gfindex_nextsite" for gf's that start at a position of the +//other eo-sublattice. 
+//for the hopping positions of the eo-spinor field we use on of the two dedicated eo-nn fields +//the boundary conditions are implemented as in Hopping_Matrix.c +//mult with complex conjugate k0,k1,k2,k3 in positive direction because +// psi(x+mu) != exp(i theta_mu) psi(x) + +__global__ void dev_Hopping_Matrix_ASYNC (const dev_su3_2v * gf, + const dev_spinor * sin, dev_spinor * sout, + const int * gfindex_site, const int* gfindex_nextsite, const int * nn_evenodd, + const int eo, + int start, int size) { + + int pos, hoppos; + + + dev_spinor shelp1[6], ssum[6]; + __shared__ dev_su3_pad gfsmem[BLOCK]; + + + + pos = start + threadIdx.x + blockDim.x * blockIdx.x; + int ix = threadIdx.x; + + + if (pos < start + size) { + + + dev_zero_spinor(&(ssum[0])); // zero sum + + #ifdef TEMPORALGAUGE + int spatialvol = dev_LX*dev_LY*dev_LZ; + #endif + + +//hopping term +//l==0,t + //positive direction + hoppos = nn_evenodd[8*pos]; + //hoppos = tex1Dfetch(nn_tex,8*pos); + //color + + #ifdef TEMPORALGAUGE + // gf == ID for t != T-1 => just read the spinor + #ifdef MPI + if ( ((gfindex_site[pos]) < (dev_T-1)*spatialvol) || (dev_rank < dev_nproc-1) ) { + //if ((gfindex_site[pos]) < (dev_T-1)*spatialvol) { // FAKE TEMPORALGAUGE + #else + if ((gfindex_site[pos]/spatialvol) != (dev_T-1) ) { + #endif + + #ifdef USETEXTURE + shelp1[0] = tex1Dfetch(spin_tex,6*hoppos); + shelp1[1] = tex1Dfetch(spin_tex,6*hoppos+1); + shelp1[2] = tex1Dfetch(spin_tex,6*hoppos+2); + shelp1[3] = tex1Dfetch(spin_tex,6*hoppos+3); + shelp1[4] = tex1Dfetch(spin_tex,6*hoppos+4); + shelp1[5] = tex1Dfetch(spin_tex,6*hoppos+5); + #else + shelp1[0] = sin[6*hoppos]; + shelp1[1] = sin[6*hoppos+1]; + shelp1[2] = sin[6*hoppos+2]; + shelp1[3] = sin[6*hoppos+3]; + shelp1[4] = sin[6*hoppos+4]; + shelp1[5] = sin[6*hoppos+5]; + #endif + } + else{ + // gf != ID for t == T-1 => mult spinor with gf + #ifdef GF_8 + dev_reconstructgf_8texref(gf, 4*(gfindex_site[pos]),&(gfsmem[ix].m)); + #else + 
dev_reconstructgf_2vtexref(gf,4*(gfindex_site[pos]),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + } + #else + #ifdef GF_8 + dev_reconstructgf_8texref(gf, 4*(gfindex_site[pos]),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref(gf, 4*(gfindex_site[pos]),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + #endif + + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP0_plus(&(ssum[0]), &(shelp1[0]), dev_cconj(dev_k0)); + #else + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_mk0,&(shelp1[0]), &(ssum[0])); + dev_Gamma0(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_k0,&(shelp1[0]), &(ssum[0])); + #endif + +//l==0,t + //negative direction + hoppos = nn_evenodd[8*pos+4]; + //hoppos = tex1Dfetch(nn_tex,8*pos+4); + //color + #ifdef TEMPORALGAUGE + // gf == ID for t != T-1 => just read the spinor + #ifdef MPI + if ( ((gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol) || (dev_rank > 0) ) { + //if ((gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol) { // FAKE TEMPORALGAUGE + #else + if ((gfindex_nextsite[hoppos]/spatialvol) != (dev_T-1) ) { + #endif + + #ifdef USETEXTURE + shelp1[0] = tex1Dfetch(spin_tex,6*hoppos); + shelp1[1] = tex1Dfetch(spin_tex,6*hoppos+1); + shelp1[2] = tex1Dfetch(spin_tex,6*hoppos+2); + shelp1[3] = tex1Dfetch(spin_tex,6*hoppos+3); + shelp1[4] = tex1Dfetch(spin_tex,6*hoppos+4); + shelp1[5] = tex1Dfetch(spin_tex,6*hoppos+5); + #else + shelp1[0] = sin[6*hoppos]; + shelp1[1] = sin[6*hoppos+1]; + shelp1[2] = sin[6*hoppos+2]; + shelp1[3] = sin[6*hoppos+3]; + shelp1[4] = sin[6*hoppos+4]; + shelp1[5] = sin[6*hoppos+5]; + #endif + } + else{ + // gf != ID for t == T-1 => mult spinor with gf + #ifdef GF_8 + 
dev_reconstructgf_8texref_dagger(gf,4*gfindex_nextsite[hoppos],&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger(gf,4*gfindex_nextsite[hoppos],&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + } + #else + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(gf,4*gfindex_nextsite[hoppos],&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger(gf,4*gfindex_nextsite[hoppos],&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + #endif + + //-kappa(r + gamma_mu) + #ifdef GF_8 + dev_kappaP0_minus(&(ssum[0]), &(shelp1[0]), dev_k0); + #else + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk0,&(shelp1[0]), &(ssum[0])); + dev_Gamma0(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk0,&(shelp1[0]), &(ssum[0])); + #endif + + + + +//l==3,z + //positive direction + hoppos = nn_evenodd[8*pos+3]; + //hoppos = tex1Dfetch(nn_tex,8*pos+3); + //color + #ifdef GF_8 + dev_reconstructgf_8texref(gf,4*(gfindex_site[pos])+(3),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref(gf, 4*(gfindex_site[pos])+(3),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP3_plus(&(ssum[0]), &(shelp1[0]), dev_k3.re); + #else + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_mk3,&(shelp1[0]), &(ssum[0])); + dev_Gamma3(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_k3,&(shelp1[0]), &(ssum[0])); + #endif +//l==3,z + + //negative direction + hoppos = nn_evenodd[8*pos+7]; + //hoppos = tex1Dfetch(nn_tex,8*pos+7); + //color + #ifdef GF_8 + 
dev_reconstructgf_8texref_dagger(gf,4*gfindex_nextsite[hoppos]+(3),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger(gf,4*gfindex_nextsite[hoppos]+(3),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r + gamma_mu) + #ifdef GF_8 + dev_kappaP3_minus(&(ssum[0]), &(shelp1[0]), dev_k3.re); + #else + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk3,&(shelp1[0]), &(ssum[0])); + dev_Gamma3(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk3,&(shelp1[0]), &(ssum[0])); + #endif + + + + +//l==2,y + //positive direction + hoppos = nn_evenodd[8*pos+2]; + //hoppos = tex1Dfetch(nn_tex,8*pos+2); + //color + #ifdef GF_8 + dev_reconstructgf_8texref(gf,4*(gfindex_site[pos])+(2),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref(gf,4*(gfindex_site[pos])+(2),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP2_plus(&(ssum[0]), &(shelp1[0]), dev_k2.re); + #else + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_mk2,&(shelp1[0]), &(ssum[0])); + dev_Gamma2(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_k2,&(shelp1[0]), &(ssum[0])); + #endif + +//l==2,y + + + //negative direction + hoppos = nn_evenodd[8*pos+6]; + //hoppos = tex1Dfetch(nn_tex,8*pos+6); + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(gf,4*gfindex_nextsite[hoppos]+(2),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger(gf,4*gfindex_nextsite[hoppos]+(2),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r + gamma_mu) + #ifdef GF_8 + dev_kappaP2_minus(&(ssum[0]), &(shelp1[0]), dev_k2.re); 
+ #else + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk2,&(shelp1[0]), &(ssum[0])); + dev_Gamma2(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk2,&(shelp1[0]), &(ssum[0])); + #endif + + + +//l==1,x + //positive direction + hoppos = nn_evenodd[8*pos+1]; + //hoppos = tex1Dfetch(nn_tex,8*pos+1); + //color + #ifdef GF_8 + dev_reconstructgf_8texref(gf,4*(gfindex_site[pos])+(1),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref(gf,4*(gfindex_site[pos])+(1),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP1_plus(&(ssum[0]), &(shelp1[0]), dev_k1.re); + #else + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_mk1,&(shelp1[0]), &(ssum[0])); + dev_Gamma1(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_k1,&(shelp1[0]), &(ssum[0])); + #endif + + +//l==1,x + + //negative direction + hoppos = nn_evenodd[8*pos+5]; + //hoppos = tex1Dfetch(nn_tex,8*pos+5); + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(gf,4*gfindex_nextsite[hoppos]+(1),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger(gf,4*gfindex_nextsite[hoppos]+(1),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV(gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r + gamma_mu) + #ifdef GF_8 + dev_kappaP1_minus(&(ssum[0]), &(shelp1[0]), dev_k1.re); + #else + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk1,&(shelp1[0]), &(ssum[0])); + dev_Gamma1(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk1,&(shelp1[0]), &(ssum[0])); + #endif + + //copy to output spinor + dev_copy_spinor(&(ssum[0]),&(sout[6*pos])); + } +}//dev_Hopping_Matrix_ASYNC() + + + + + + + + + + +void HOPPING_ASYNC (dev_su3_2v * gf, + dev_spinor * spinin, dev_spinor * spinout, + int * gfindex_site, 
int * gfindex_nextsite, int * nn_evenodd, + int ieo, + int gridsize, int blocksize) { + + + // for even/odd + int tSliceEO = LX*LY*LZ/2; + int VolumeEO = VOLUME/2; + + #if defined ASYNC_OPTIMIZED && ASYNC == 3 + int offset; + if (tSliceEO % nStreams == 0) { + offset = tSliceEO / nStreams; + } + else { + printf("Error in HOPPING_ASYNC(): tSliceEO is not divisible by nStreams!\n"); + exit(-1); + } + #endif + + // gridsizes + int gridsize1; + int gridsize2; + + #ifndef ASYNC_TSLICES + if ( (VolumeEO-2*tSliceEO) % blocksize == 0 ) { + gridsize1 = (VolumeEO-2*tSliceEO) / blocksize; + } + else { + gridsize1 = (int) ( ((VolumeEO-2*tSliceEO)/blocksize) + 1); + } + + if ( (tSliceEO) % blocksize == 0 ) { + gridsize2 = (tSliceEO) / blocksize; + } + else { + gridsize2 = (int) ( ((tSliceEO)/blocksize) + 1); + } + #else + int tSlices = ASYNC_TSLICES; + if ( (VolumeEO-2*tSlices*tSliceEO) % blocksize == 0 ) { + gridsize1 = (VolumeEO-2*tSlices*tSliceEO) / blocksize; + } + else { + gridsize1 = (int) ( ((VolumeEO-2*tSlices*tSliceEO)/blocksize) + 1); + } + + if ( (tSlices*tSliceEO) % blocksize == 0 ) { + gridsize2 = (tSlices*tSliceEO) / blocksize; + } + else { + gridsize2 = (int) ( ((tSlices*tSliceEO)/blocksize) + 1); + } + #endif + + + + + #ifdef USETEXTURE + bind_texture_spin(spinin,1); + #endif + + + + + #if ASYNC == 0 // primitive version + + + // applies to the parts which don't need communication + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + //2*tSliceEO, VolumeEO-4*tSliceEO ); + tSliceEO, VolumeEO-2*tSliceEO ); + + // exchanges the boundaries + xchange_field_wrapper(spinin, ieo); // to be further optimized !! 
+ + // applies the hopping matrix to remaining parts + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + //0, 2*tSliceEO ); + 0, tSliceEO ); + + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + //VolumeEO-2*tSliceEO, 2*tSliceEO ); + VolumeEO-tSliceEO, tSliceEO ); + + + + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + + + #elif ASYNC == 1 // optimized version + + + #ifdef ASYNC_TIMING + cudaEventRecord(start_ALL, 0); + mpi_start_ALL = MPI_Wtime(); + #endif + + + // copies first FACE to host + cudaMemcpyAsync(RAND1, spinin, tSliceEO*6*sizeof(float4), cudaMemcpyDeviceToHost, stream[1]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_D2H_1, stream[1]); + #endif + + + // INTERNAL kernel + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + tSlices*tSliceEO, VolumeEO-2*tSlices*tSliceEO ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_INT_0, stream[0]); + #endif + + + // exchanges first FACE + cudaStreamSynchronize(stream[1]); // SYNCPOINT + + #ifdef ASYNC_TIMING + mpi_start_sendrecv_1 = MPI_Wtime(); + #endif + + + // copies second FACE to host + cudaMemcpyAsync(RAND2, spinin+6*(VolumeEO-tSliceEO), tSliceEO*6*sizeof(float4), cudaMemcpyDeviceToHost, stream[2]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_D2H_2, stream[2]); + #endif + + + //MPI_Irecv(RAND3, 24*tSliceEO, MPI_FLOAT, g_nb_t_up, 0, + // g_cart_grid, &recv_req[0]); + //MPI_Isend(RAND1, 24*tSliceEO, MPI_FLOAT, g_nb_t_dn, 0, + // g_cart_grid, &send_req[0]); + + MPI_Sendrecv(RAND1, 24*tSliceEO, MPI_FLOAT, g_nb_t_dn, 0, // SYNCPOINT + RAND3, 24*tSliceEO, MPI_FLOAT, g_nb_t_up, 0, + g_cart_grid, &stat[0]); + + #ifdef ASYNC_TIMING + mpi_stop_sendrecv_1 = MPI_Wtime(); + #endif + + + // copies first FACE back to device // order 
may switched + //MPI_Wait(&recv_req[0], &stat[0]); // synchronous + cudaMemcpyAsync(spinin+6*VolumeEO, RAND3, tSliceEO*6*sizeof(float4), cudaMemcpyHostToDevice, stream[1]); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_H2D_3, stream[1]); + #endif + + + // applies first FACE + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + VolumeEO-tSlices*tSliceEO, tSlices*tSliceEO ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_EXT_1, stream[1]); + #endif + + + // exchanges second FACE + cudaStreamSynchronize(stream[2]); // SYNCPOINT + + #ifdef ASYNC_TIMING + mpi_start_sendrecv_2 = MPI_Wtime(); + #endif + + //MPI_Irecv(RAND4, 24*tSliceEO, MPI_FLOAT, g_nb_t_dn, 1, + // g_cart_grid, &recv_req[1]); + //MPI_Isend(RAND2, 24*tSliceEO, MPI_FLOAT, g_nb_t_up, 1, + // g_cart_grid, &send_req[1]); + + MPI_Sendrecv(RAND2, 24*tSliceEO, MPI_FLOAT, g_nb_t_up, 1, // SYNCPOINT + RAND4, 24*tSliceEO, MPI_FLOAT, g_nb_t_dn, 1, + g_cart_grid, &stat[1]); + + #ifdef ASYNC_TIMING + mpi_stop_sendrecv_2 = MPI_Wtime(); + #endif + + + + + // copies second FACE back to device + //MPI_Wait(&recv_req[1], &stat[1]); + cudaMemcpyAsync(spinin+6*(VolumeEO+tSliceEO), RAND4, tSliceEO*6*sizeof(float4), cudaMemcpyHostToDevice, stream[2]); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_H2D_4, stream[2]); + #endif + + + // applies second FACE + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + 0, tSlices*tSliceEO ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_EXT_2, stream[2]); + #endif + + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_ALL, 0); + #endif + + + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + + + #elif ASYNC == 2 // alternate optimized version + + + #ifdef ASYNC_TIMING + cudaEventRecord(start_ALL, 0); + mpi_start_ALL = MPI_Wtime(); + #endif + + + // copies first FACE to host + 
cudaMemcpyAsync(RAND1, spinin, tSliceEO*6*sizeof(float4), cudaMemcpyDeviceToHost, stream[1]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_D2H_1, stream[1]); + #endif + + + // copies second FACE to host + cudaMemcpyAsync(RAND2, spinin+6*(VolumeEO-tSliceEO), tSliceEO*6*sizeof(float4), cudaMemcpyDeviceToHost, stream[2]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_D2H_2, stream[2]); + #endif + + + // INTERNAL kernel + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + tSlices*tSliceEO, VolumeEO-2*tSlices*tSliceEO ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_INT_0, stream[0]); + #endif + + + // first FACE + cudaStreamSynchronize(stream[1]); // SYNCPOINT + + #ifdef ASYNC_TIMING + mpi_start_sendrecv_1 = MPI_Wtime(); + #endif + + MPI_Sendrecv(RAND1, 24*tSliceEO, MPI_FLOAT, g_nb_t_dn, 0, // SYNCPOINT + RAND3, 24*tSliceEO, MPI_FLOAT, g_nb_t_up, 0, + g_cart_grid, &stat[0]); + + //MPI_Isend(RAND1, 24*tSliceEO, MPI_FLOAT, g_nb_t_dn, 0, + // g_cart_grid, &send_req[0]); + //MPI_Recv(RAND3, 24*tSliceEO, MPI_FLOAT, g_nb_t_up, 0, + // g_cart_grid, &stat[0]); + + //MPI_Wait(&recv_request1, &stat[0]); + + #ifdef ASYNC_TIMING + mpi_stop_sendrecv_1 = MPI_Wtime(); + #endif + + cudaMemcpyAsync(spinin+6*VolumeEO, RAND3, tSliceEO*6*sizeof(float4), cudaMemcpyHostToDevice, stream[1]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_H2D_3, stream[1]); + #endif + + + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + VolumeEO-tSlices*tSliceEO, tSlices*tSliceEO ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_EXT_1, stream[1]); + #endif + + + // second FACE + cudaStreamSynchronize(stream[2]); // SYNCPOINT + + #ifdef ASYNC_TIMING + mpi_start_sendrecv_2 = MPI_Wtime(); + #endif + + MPI_Sendrecv(RAND2, 24*tSliceEO, MPI_FLOAT, g_nb_t_up, 1, // SYNCPOINT + RAND4, 24*tSliceEO, MPI_FLOAT, g_nb_t_dn, 1, + g_cart_grid, &stat[1]); + + //MPI_Isend(RAND2, 24*tSliceEO, MPI_FLOAT, 
g_nb_t_up, 1, + // g_cart_grid, &send_req[1]); + //MPI_Recv(RAND4, 24*tSliceEO, MPI_FLOAT, g_nb_t_dn, 1, + // g_cart_grid, &stat[1]); + + //MPI_Wait(&recv_request2, &stat[1]); + + #ifdef ASYNC_TIMING + mpi_stop_sendrecv_2 = MPI_Wtime(); + #endif + + cudaMemcpyAsync(spinin+6*(VolumeEO+tSliceEO), RAND4, tSliceEO*6*sizeof(float4), cudaMemcpyHostToDevice, stream[2]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_H2D_4, stream[2]); + #endif + + + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + 0, tSlices*tSliceEO ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_EXT_2, stream[2]); + #endif + + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_ALL, 0); + #endif + + + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + #elif ASYNC == 3 + + + #ifdef ASYNC_TIMING + cudaEventRecord(start_ALL, 0); + mpiTime_start_ALL = MPI_Wtime(); + #endif + + + // copies first FACE to host + cudaMemcpyAsync(RAND1, spinin , tSliceEO*6*sizeof(float4), cudaMemcpyDeviceToHost, stream[1]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_D2H_1, stream[1]); + #endif + + // copies second FACE to host + cudaMemcpyAsync(RAND2, spinin+6*(VolumeEO-tSliceEO), tSliceEO*6*sizeof(float4), cudaMemcpyDeviceToHost, stream[2]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_D2H_2, stream[2]); + #endif + + + // INTERNAL kernel + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + tSliceEO, VolumeEO-2*tSliceEO ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_INT_0, stream[0]); + #endif + + + // first FACE + cudaStreamSynchronize(stream[1]); + + for (int i = 0; i < nStreams; i++) { + + #ifdef ASYNC_TIMING + mpiTime_start_sendrecv_1 = MPI_Wtime(); + #endif + + MPI_Sendrecv(RAND1+6*i*offset, 24*offset, MPI_FLOAT, g_nb_t_dn, 0, // NOT asynchronous + RAND3+6*i*offset, 24*offset, MPI_FLOAT, 
g_nb_t_up, 0, + g_cart_grid, &stat[i]); + + #ifdef ASYNC_TIMING + mpiTime_stop_sendrecv_1 = MPI_Wtime(); + #endif + + //MPI_Isend(RAND1+6*i*offset, 24*offset, MPI_FLOAT, g_nb_t_dn, i, + // g_cart_grid, &send_req[i]); + //MPI_Irecv (RAND3+6*i*offset, 24*offset, MPI_FLOAT, g_nb_t_up, i, + // g_cart_grid, &recv_req[i]); + + //MPI_Wait(&recv_req[i], &stat[i]); + + cudaMemcpyAsync(spinin+6*VolumeEO+6*i*offset, RAND3+6*i*offset, offset*6*sizeof(float4), cudaMemcpyHostToDevice, stream[1+i]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_H2D_3, stream[1]); + #endif + + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + VolumeEO-tSliceEO+i*offset, offset ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_EXT_1, stream[1]); + #endif + + } + + + // second FACE + cudaStreamSynchronize(stream[nStreams+1]); + + for (int i = 0; i < nStreams; i++) { + + #ifdef ASYNC_TIMING + mpiTime_start_sendrecv_2 = MPI_Wtime(); + #endif + + MPI_Sendrecv(RAND2+6*i*offset, 24*offset, MPI_FLOAT, g_nb_t_up, 1, + RAND4+6*i*offset, 24*offset, MPI_FLOAT, g_nb_t_dn, 1, + g_cart_grid, &stat[nStreams+i]); + + #ifdef ASYNC_TIMING + mpiTime_stop_sendrecv_2 = MPI_Wtime(); + #endif + + //MPI_Isend(RAND2+6*i*offset, 24*offset, MPI_FLOAT, g_nb_t_up, nStreams+i, + // g_cart_grid, &send_req[nStreams+i]); + //MPI_Irecv (RAND4+6*i*offset, 24*offset, MPI_FLOAT, g_nb_t_dn, nStreams+i, + // g_cart_grid, &recv_req[nStreams+i]); + + //MPI_Wait(&recv_req[nStreams+i], &stat[nStreams+i]); + + cudaMemcpyAsync(spinin+6*(VolumeEO+tSliceEO)+6*i*offset, RAND4+6*i*offset, offset*6*sizeof(float4), cudaMemcpyHostToDevice, stream[nStreams+1+i]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_H2D_4, stream[2]); + #endif + + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + 0+i*offset, offset ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_EXT_2, stream[2]); + #endif + + } + + + #ifdef ASYNC_TIMING + 
cudaEventRecord(stop_ALL, 0);
  #endif



  #endif	// different optimized and non-optimized version


  ////////////////////////////////////////////////////////////////////////////////////////////////////////////


  cudaThreadSynchronize();		// test if needed	// for timing ...


  #ifdef USETEXTURE
    unbind_texture_spin(1);
  #endif


}






///////////////////////////
// MATRIX MULTIPLICATION //
///////////////////////////

// One application of
//
//     (Q_tilde) = gamma5 * ( (M_oo) - (M_oe)(Mee^-1)(M_eo) )
//
// to the flavour pair (in_a, in_b). The result is left in
// (dev_spin_eo2_up, dev_spin_eo2_dn); dev_spin_eo1_up/dn are overwritten as scratch.
//
//   in_a, in_b           input fields; only read. They must not alias dev_spin_eo1_* or
//                        dev_spin_eo2_*, because those are written before the (M_oo)
//                        step reads the inputs again.
//   nrm                  1 / (1 + mubar^2 - epsbar^2), computed once by the caller
//   gridsize1/blocksize1 forwarded to HOPPING_ASYNC
//   gridsize2/blocksize2 launch configuration of dev_mul_one_pm_imubar_gamma5
//   gridsize3/blocksize3 launch configuration of dev_gamma5
//
// NOTE(review): the kernel launches in this copy of the file had empty execution
// configurations ("<<<>>>"), which does not compile. They are restored here from the
// otherwise unused gridsizeN/blocksizeN parameters, in the order the kernels appear.
// Confirm the pairing against the caller.
static void Q_tilde_32_mpi_ASYNC (dev_spinor * in_a, dev_spinor * in_b, double nrm,
                                  int gridsize1, int blocksize1,
                                  int gridsize2, int blocksize2,
                                  int gridsize3, int blocksize3) {

  typedef REAL RealT;

  const int N_floats = 24*VOLUME/2;		// #floats of one even/odd spinor field


  ////////////////////////////
  // (M_oe) (Mee^-1) (M_eo) //
  ////////////////////////////

  // (M_eo): dev_spin_eo1_up = (M_eo) * in_a ,  dev_spin_eo1_dn = (M_eo) * in_b
  HOPPING_ASYNC(dev_gf, in_a, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0, gridsize1, blocksize1);
  HOPPING_ASYNC(dev_gf, in_b, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0, gridsize1, blocksize1);

  // imubar, gamma5:
  //   dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up
  //   dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0);
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0);

  // flavour mixing:
  //   dev_spin_eo2_up += epsbar * dev_spin_eo1_dn
  //   dev_spin_eo2_dn += epsbar * dev_spin_eo1_up
  cublasAxpy (N_floats, g_epsbar, (RealT*)dev_spin_eo1_dn, 1, (RealT*)dev_spin_eo2_up, 1);
  cublasAxpy (N_floats, g_epsbar, (RealT*)dev_spin_eo1_up, 1, (RealT*)dev_spin_eo2_dn, 1);

  // normalisation: dev_spin_eo2_up/dn *= nrm
  cublasScal (N_floats, nrm, (RealT*)dev_spin_eo2_up, 1);
  cublasScal (N_floats, nrm, (RealT*)dev_spin_eo2_dn, 1);

  // (M_oe): dev_spin_eo1_up/dn = (M_oe) * dev_spin_eo2_up/dn
  HOPPING_ASYNC(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1, gridsize1, blocksize1);
  HOPPING_ASYNC(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1, gridsize1, blocksize1);


  ////////////
  // (M_oo) //
  ////////////

  // imubar, gamma5:
  //   dev_spin_eo2_up = (1 + imubar) * in_a
  //   dev_spin_eo2_dn = (1 - imubar) * in_b
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(in_a, dev_spin_eo2_up, +1.0);
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(in_b, dev_spin_eo2_dn, -1.0);

  // flavour mixing:
  //   dev_spin_eo2_up -= epsbar * in_b
  //   dev_spin_eo2_dn -= epsbar * in_a
  cublasAxpy (N_floats, -g_epsbar, (RealT*)in_b, 1, (RealT*)dev_spin_eo2_up, 1);
  cublasAxpy (N_floats, -g_epsbar, (RealT*)in_a, 1, (RealT*)dev_spin_eo2_dn, 1);


  ///////////////////////////////////////
  // (M_oo)  -  (M_oe) (Mee^-1) (M_eo) //
  ///////////////////////////////////////

  //   dev_spin_eo2_up -= dev_spin_eo1_up
  //   dev_spin_eo2_dn -= dev_spin_eo1_dn
  cublasAxpy (N_floats, -1.0, (RealT*)dev_spin_eo1_up, 1, (RealT*)dev_spin_eo2_up, 1);
  cublasAxpy (N_floats, -1.0, (RealT*)dev_spin_eo1_dn, 1, (RealT*)dev_spin_eo2_dn, 1);


  ////////////
  // gamma5 //
  ////////////

  // in place: dev_spin_eo2_up/dn = gamma5 * dev_spin_eo2_up/dn
  dev_gamma5<<<gridsize3, blocksize3>>>(dev_spin_eo2_up, dev_spin_eo2_up);
  dev_gamma5<<<gridsize3, blocksize3>>>(dev_spin_eo2_dn, dev_spin_eo2_dn);

}//Q_tilde_32_mpi_ASYNC()




// the GPU implementation of  Q_Qdagger_ND(...)  from Nondegenerate_Matrix.c
//	Flo's equivalent function for the standard and non-nd case is  dev_Qtm_pm_psi
//
// Computes
//
//     (a,b)                = (Q_tilde) * (dn,up)
//     (spinout_up/dn)      = (Q_tilde) * (b,a)
//
// i.e. (Q_tilde)(Q_tilde_dagger) * (up,dn), using the HOPPING_ASYNC variant of the
// hopping matrix.
//
// MATCHING with Q_Qdagger_ND:
//     _strange     = _up                 _charm       = _dn
//     DUM_MATRIX   = dev_spin_eo1_up     DUM_MATRIX+1 = dev_spin_eo1_dn
//     DUM_MATRIX+2 = dev_spin_eo2_up     DUM_MATRIX+3 = dev_spin_eo2_dn
//
//   spinout_up/dn        output fields; may be used freely as work space
//   spinin_up/dn         input fields; have to remain unchanged
//   gridsize1/blocksize1 forwarded to HOPPING_ASYNC
//   gridsize2/blocksize2 launch configuration of dev_mul_one_pm_imubar_gamma5
//   gridsize3/blocksize3 launch configuration of dev_gamma5
//   gridsize4/blocksize4 launch configuration of dev_copy_spinor_field
//
// The auxiliary fields dev_spin_eo1_up/dn and dev_spin_eo3_up/dn are overwritten, and
// dev_spin_eo2_up/dn are re-pointed at spinout_up/dn (so they need no memory of their
// own and no final copy to the output is required). The caller (dev_cg_eo_nd) has to
// use other auxiliary fields than these.

void matrix_multiplication32_mpi_ASYNC (dev_spinor * spinout_up, dev_spinor * spinout_dn,
                                        dev_spinor * spinin_up , dev_spinor * spinin_dn ,
                                        int gridsize1, int blocksize1, int gridsize2, int blocksize2,
                                        int gridsize3, int blocksize3, int gridsize4, int blocksize4) {


  ///////////////////////////////////
  // INITIALIZATIONS & ASSIGNMENTS //
  ///////////////////////////////////

  dev_spin_eo2_up = spinout_up;		// need no memory allocated
  dev_spin_eo2_dn = spinout_dn;

  const double nrm = 1.0 / (1.0 + g_mubar*g_mubar - g_epsbar*g_epsbar);

  //printf("This is matrix_multiplication32_mpi_ASYNC().\n");


  ///////////////////////////////////////
  //        Q_tilde_dagger(2x2)        //		(a,b) = (Q_tilde) * (dn,up)
  ///////////////////////////////////////

  Q_tilde_32_mpi_ASYNC(spinin_dn, spinin_up, nrm,
                       gridsize1, blocksize1, gridsize2, blocksize2, gridsize3, blocksize3);


  ////////////////////
  // (a,b) -> (b,a) //
  ////////////////////

  dev_copy_spinor_field<<<gridsize4, blocksize4>>>(dev_spin_eo2_dn, dev_spin_eo3_up);	// dev_spin_eo3_up = dev_spin_eo2_dn
  dev_copy_spinor_field<<<gridsize4, blocksize4>>>(dev_spin_eo2_up, dev_spin_eo3_dn);	// dev_spin_eo3_dn = dev_spin_eo2_up


  ///////////////////////////////////
  //        Q_tilde(2x2)           //		(Q_tilde) * (b,a)
  ///////////////////////////////////

  Q_tilde_32_mpi_ASYNC(dev_spin_eo3_up, dev_spin_eo3_dn, nrm,
                       gridsize1, blocksize1, gridsize2, blocksize2, gridsize3, blocksize3);


  // output is already done by setting  dev_spin_eo2_up/dn = spinout_up/dn

  return;

}//matrix_multiplication32_mpi_ASYNC()





#else //HALF

void matrix_multiplication32_mpi_ASYNC (dev_spinor * spinout_up, dev_spinor *
spinout_dn,
                                        dev_spinor * spinin_up , dev_spinor * spinin_dn ,
                                        int gridsize1, int blocksize1, int gridsize2, int blocksize2,
                                        int gridsize3, int blocksize3, int gridsize4, int blocksize4) {

  printf("Warning: 'matrix_multiplication32_mpi_ASYNC' has been called from HALF code part. Not impemented yet. Aborting...\n");
  exit(200);
}





// Reads the half-precision spinor at site hoppos into shelp1[0..5] and rescales it by
// the per-site norm (the path taken where the gauge link is the identity).
// With USETEXTURE the data comes from spinhalf_tex / spinnormhalf_tex and sin/sin_norm
// are not touched; otherwise it is read from sin/sin_norm and converted with sh2fl().
__device__ inline void dev_hop_read_spinor_half_ASYNC (const dev_spinor_half * sin, const float * sin_norm,
                                                       int hoppos, dev_spinor * shelp1) {

  #ifdef USETEXTURE
    const float norm = tex1Dfetch(spinnormhalf_tex, hoppos);
    //fetch and normalize
    #pragma unroll 6
    for (int i = 0; i < 6; i++) {
      shelp1[i] = tex1Dfetch(spinhalf_tex, 6*hoppos+i);
      shelp1[i].x = norm*shelp1[i].x;
      shelp1[i].y = norm*shelp1[i].y;
      shelp1[i].z = norm*shelp1[i].z;
      shelp1[i].w = norm*shelp1[i].w;
    }
  #else
    const float norm = sin_norm[hoppos];
    //read and normalize
    #pragma unroll 6
    for (int i = 0; i < 6; i++) {
      shelp1[i].x = norm*sh2fl(sin[6*hoppos+i].x);
      shelp1[i].y = norm*sh2fl(sin[6*hoppos+i].y);
      shelp1[i].z = norm*sh2fl(sin[6*hoppos+i].z);
      shelp1[i].w = norm*sh2fl(sin[6*hoppos+i].w);
    }
  #endif
}


// Reconstructs the gauge link with index gfpos into gfs->m and multiplies the spinor at
// site hoppos with it; the product is written to shelp1[0..5].
__device__ inline void dev_hop_link_half_ASYNC (const dev_su3_2v_half * gf, int gfpos, dev_su3_pad * gfs,
                                                const dev_spinor_half * sin, const float * sin_norm,
                                                int hoppos, dev_spinor * shelp1) {

  #ifdef GF_8
    dev_reconstructgf_8texref_half(gf, gfpos, &(gfs->m));
  #else
    dev_reconstructgf_2vtexref_half(gf, gfpos, &(gfs->m));
  #endif
  #ifdef USETEXTURE
    dev_su3MtV_spintex(gfs->m, hoppos, shelp1);
  #else
    dev_su3MtV_half(gfs->m, &(sin[6*hoppos]), &(sin_norm[hoppos]), shelp1);
  #endif
}


// Same as dev_hop_link_half_ASYNC, but uses the "dagger" reconstruction of the link
// (used for the hops in negative direction).
__device__ inline void dev_hop_link_dagger_half_ASYNC (const dev_su3_2v_half * gf, int gfpos, dev_su3_pad * gfs,
                                                       const dev_spinor_half * sin, const float * sin_norm,
                                                       int hoppos, dev_spinor * shelp1) {

  #ifdef GF_8
    dev_reconstructgf_8texref_dagger_half(gf, gfpos, &(gfs->m));
  #else
    dev_reconstructgf_2vtexref_dagger_half(gf, gfpos, &(gfs->m));
  #endif
  #ifdef USETEXTURE
    dev_su3MtV_spintex(gfs->m, hoppos, shelp1);
  #else
    dev_su3MtV_half(gfs->m, &(sin[6*hoppos]), &(sin_norm[hoppos]), shelp1);
  #endif
}


//applies the Hopping Part Even-Odd !
//the gauge field is the complete gaugefield!
//the gauge field at the local point is reconstructed by 2*pos+eo where pos is the eo-position
//from 0..VOLUME/2-1, eo = 0 or 1
//the positions in the gauge fields are passed in "gfindex_site" for gf's that are attached at
//the actual positions and in "gfindex_nextsite" for gf's that start at a position of the
//other eo-sublattice.
//for the hopping positions of the eo-spinor field we use one of the two dedicated eo-nn fields
//the boundary conditions are implemented as in Hopping_Matrix.c
//mult with complex conjugate k0,k1,k2,k3 in positive direction because
// psi(x+mu) != exp(i theta_mu) psi(x)
//
//ASYNC variant: each thread handles the single site
//    pos = start + threadIdx.x + blockDim.x*blockIdx.x
//and only sites with pos < start + size are processed, so the kernel can be launched on
//a sub-range [start, start+size) of the eo-lattice.
//
//  gf                          gauge field (half precision)
//  sin, sin_norm               input spinor field and its per-site norms
//  sout, sout_norm             output spinor field and its per-site norms
//  gfindex_site/_nextsite      gauge-field positions, see above
//  nn_evenodd                  neighbour table, 8 entries per site:
//                              +t,+x,+y,+z at offsets 0..3 and -t,-x,-y,-z at offsets 4..7
//  eo                          not referenced in this kernel body
//  start, size                 first site and number of sites of the sub-range
//
//Fix: "norm" and the loop index "i" used in the TEMPORALGAUGE branches were never
//declared in this kernel; they now live in dev_hop_read_spinor_half_ASYNC().

__global__ void dev_Hopping_Matrix_half_ASYNC (const dev_su3_2v_half * gf,
                                               const dev_spinor_half * sin, const float* sin_norm, dev_spinor_half * sout,
                                               float* sout_norm, const int * gfindex_site,
                                               const int* gfindex_nextsite, const int * nn_evenodd,
                                               const int eo,
                                               int start, int size) {

  int hoppos;

  dev_spinor shelp1[6], ssum[6];
  __shared__ dev_su3_pad gfsmem[BLOCK];

  const int pos = start + threadIdx.x + blockDim.x * blockIdx.x;
  const int ix  = threadIdx.x;
  dev_su3_pad * const gfs = &(gfsmem[ix]);		// this thread's slot for the reconstructed link


  if (pos < start + size) {


    dev_zero_spinor(&(ssum[0]));		// zero sum

    #ifdef TEMPORALGAUGE
      int spatialvol = dev_LX*dev_LY*dev_LZ;
    #endif


    //hopping term
    //l==0,t
    //positive direction
    hoppos = nn_evenodd[8*pos];
    //hoppos = tex1Dfetch(nn_tex,8*pos);
    //color
    #ifdef TEMPORALGAUGE
    {
      // gf == ID for t != T-1 => just read the spinor
      #ifdef MPI
        const bool gf_is_id = ((gfindex_site[pos]) < (dev_T-1)*spatialvol) || (dev_rank < dev_nproc-1);
        //const bool gf_is_id = (gfindex_site[pos]) < (dev_T-1)*spatialvol;	// FAKE TEMPORALGAUGE
      #else
        const bool gf_is_id = (gfindex_site[pos]/spatialvol) != (dev_T-1);
      #endif

      if (gf_is_id) {
        dev_hop_read_spinor_half_ASYNC(sin, sin_norm, hoppos, &(shelp1[0]));
      }
      else {
        // gf != ID for t == T-1 => mult spinor with gf
        dev_hop_link_half_ASYNC(gf, 4*(gfindex_site[pos]), gfs, sin, sin_norm, hoppos, &(shelp1[0]));
      }
    }
    #else
      dev_hop_link_half_ASYNC(gf, 4*(gfindex_site[pos]), gfs, sin, sin_norm, hoppos, &(shelp1[0]));
    #endif

    //-kappa(r - gamma_mu)
    #ifdef GF_8
      dev_kappaP0_plus(&(ssum[0]), &(shelp1[0]), dev_cconj(dev_k0));
    #else
      dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_mk0,&(shelp1[0]), &(ssum[0]));
      dev_Gamma0(&(shelp1[0]));
      dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_k0,&(shelp1[0]), &(ssum[0]));
    #endif


    //l==0,t
    //negative direction
    hoppos = nn_evenodd[8*pos+4];
    //hoppos = tex1Dfetch(nn_tex,8*pos+4);
    //color
    #ifdef TEMPORALGAUGE
    {
      // gf == ID for t != T-1 => just read the spinor
      #ifdef MPI
        const bool gf_is_id = ((gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol) || (dev_rank > 0);
        //const bool gf_is_id = (gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol;	// FAKE TEMPORALGAUGE
      #else
        const bool gf_is_id = (gfindex_nextsite[hoppos]/spatialvol) != (dev_T-1);
      #endif

      if (gf_is_id) {
        dev_hop_read_spinor_half_ASYNC(sin, sin_norm, hoppos, &(shelp1[0]));
      }
      else {
        // gf != ID for t == T-1 => mult spinor with gf
        dev_hop_link_dagger_half_ASYNC(gf, 4*gfindex_nextsite[hoppos], gfs, sin, sin_norm, hoppos, &(shelp1[0]));
      }
    }
    #else
      dev_hop_link_dagger_half_ASYNC(gf, 4*gfindex_nextsite[hoppos], gfs, sin, sin_norm, hoppos, &(shelp1[0]));
    #endif

    //-kappa(r + gamma_mu)
    #ifdef GF_8
      dev_kappaP0_minus(&(ssum[0]), &(shelp1[0]), dev_k0);
    #else
      dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk0,&(shelp1[0]), &(ssum[0]));
      dev_Gamma0(&(shelp1[0]));
      dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk0,&(shelp1[0]), &(ssum[0]));
    #endif


    //l==3,z
    //positive direction
    hoppos = nn_evenodd[8*pos+3];
    //hoppos = tex1Dfetch(nn_tex,8*pos+3);
    //color
    dev_hop_link_half_ASYNC(gf, 4*(gfindex_site[pos])+(3), gfs, sin, sin_norm, hoppos, &(shelp1[0]));
    //-kappa(r - gamma_mu)
    #ifdef GF_8
      dev_kappaP3_plus(&(ssum[0]), &(shelp1[0]), dev_k3.re);
    #else
      dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_mk3,&(shelp1[0]), &(ssum[0]));
      dev_Gamma3(&(shelp1[0]));
      dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_k3,&(shelp1[0]), &(ssum[0]));
    #endif

    //l==3,z
    //negative direction
    hoppos = nn_evenodd[8*pos+7];
    //hoppos = tex1Dfetch(nn_tex,8*pos+7);
    //color
    dev_hop_link_dagger_half_ASYNC(gf, 4*gfindex_nextsite[hoppos]+(3), gfs, sin, sin_norm, hoppos, &(shelp1[0]));
    //-kappa(r + gamma_mu)
    #ifdef GF_8
      dev_kappaP3_minus(&(ssum[0]), &(shelp1[0]), dev_k3.re);
    #else
      dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk3,&(shelp1[0]), &(ssum[0]));
      dev_Gamma3(&(shelp1[0]));
      dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk3,&(shelp1[0]), &(ssum[0]));
    #endif


    //l==2,y
    //positive direction
    hoppos = nn_evenodd[8*pos+2];
    //hoppos = tex1Dfetch(nn_tex,8*pos+2);
    //color
    dev_hop_link_half_ASYNC(gf, 4*(gfindex_site[pos])+(2), gfs, sin, sin_norm, hoppos, &(shelp1[0]));
    //-kappa(r - gamma_mu)
    #ifdef GF_8
      dev_kappaP2_plus(&(ssum[0]), &(shelp1[0]), dev_k2.re);
    #else
      dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_mk2,&(shelp1[0]), &(ssum[0]));
      dev_Gamma2(&(shelp1[0]));
      dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_k2,&(shelp1[0]), &(ssum[0]));
    #endif

    //l==2,y
    //negative direction
    hoppos = nn_evenodd[8*pos+6];
    //hoppos = tex1Dfetch(nn_tex,8*pos+6);
    //color
    dev_hop_link_dagger_half_ASYNC(gf, 4*gfindex_nextsite[hoppos]+(2), gfs, sin, sin_norm, hoppos, &(shelp1[0]));
    //-kappa(r + gamma_mu)
    #ifdef GF_8
      dev_kappaP2_minus(&(ssum[0]), &(shelp1[0]), dev_k2.re);
    #else
      dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk2,&(shelp1[0]), &(ssum[0]));
      dev_Gamma2(&(shelp1[0]));
      dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk2,&(shelp1[0]), &(ssum[0]));
    #endif


    //l==1,x
    //positive direction
    hoppos = nn_evenodd[8*pos+1];
    //hoppos = tex1Dfetch(nn_tex,8*pos+1);
    //color
    dev_hop_link_half_ASYNC(gf, 4*(gfindex_site[pos])+(1), gfs, sin, sin_norm, hoppos, &(shelp1[0]));
    //-kappa(r - gamma_mu)
    #ifdef GF_8
      dev_kappaP1_plus(&(ssum[0]), &(shelp1[0]), dev_k1.re);
    #else
      dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_mk1,&(shelp1[0]), &(ssum[0]));
      dev_Gamma1(&(shelp1[0]));
      dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_k1,&(shelp1[0]), &(ssum[0]));
    #endif

    //l==1,x
    //negative direction
    hoppos = nn_evenodd[8*pos+5];
    //hoppos = tex1Dfetch(nn_tex,8*pos+5);
    //color
    dev_hop_link_dagger_half_ASYNC(gf, 4*gfindex_nextsite[hoppos]+(1), gfs, sin, sin_norm, hoppos, &(shelp1[0]));
    //-kappa(r + gamma_mu)
    #ifdef GF_8
      dev_kappaP1_minus(&(ssum[0]), &(shelp1[0]), dev_k1.re);
    #else
      dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk1,&(shelp1[0]), &(ssum[0]));
      dev_Gamma1(&(shelp1[0]));
      dev_complexmult_add_assign_spinor(&(ssum[0]),dev_mk1,&(shelp1[0]), &(ssum[0]));
    #endif


    //copy to output spinor
    dev_write_spinor_half(&(ssum[0]),&(sout[6*pos]), &(sout_norm[pos]));
  }
}//dev_Hopping_Matrix_half_ASYNC()










void HOPPING_HALF_ASYNC (dev_su3_2v_half * gf,
                         dev_spinor_half * spinin, float* spinin_norm, dev_spinor_half * spinout,
                         float* spinout_norm, int * gfindex_site, int * gfindex_nextsite,
                         int * nn_evenodd, int ieo,
                         int gridsize, int blocksize) {


  // for
even/odd + int tSliceEO = LX*LY*LZ/2; + int VolumeEO = VOLUME/2; + + #if defined ASYNC_OPTIMIZED && ASYNC == 3 + int offset; + if (tSliceEO % nStreams == 0) { + offset = tSliceEO / nStreams; + } + else { + printf("Error in HOPPING_ASYNC(): tSliceEO is not divisible by nStreams!\n"); + exit(-1); + } + #endif + + // gridsizes + int gridsize1; + int gridsize2; + + #ifndef ASYNC_TSLICES + if ( (VolumeEO-2*tSliceEO) % blocksize == 0 ) { + gridsize1 = (VolumeEO-2*tSliceEO) / blocksize; + } + else { + gridsize1 = (int) ( ((VolumeEO-2*tSliceEO)/blocksize) + 1); + } + + if ( (tSliceEO) % blocksize == 0 ) { + gridsize2 = (tSliceEO) / blocksize; + } + else { + gridsize2 = (int) ( ((tSliceEO)/blocksize) + 1); + } + #else + int tSlices = ASYNC_TSLICES; + if ( (VolumeEO-2*tSlices*tSliceEO) % blocksize == 0 ) { + gridsize1 = (VolumeEO-2*tSlices*tSliceEO) / blocksize; + } + else { + gridsize1 = (int) ( ((VolumeEO-2*tSlices*tSliceEO)/blocksize) + 1); + } + + if ( (tSlices*tSliceEO) % blocksize == 0 ) { + gridsize2 = (tSlices*tSliceEO) / blocksize; + } + else { + gridsize2 = (int) ( ((tSlices*tSliceEO)/blocksize) + 1); + } + #endif + + + + + #ifdef USETEXTURE + bind_halfspinor_texture(spinin, spinin_norm); + #endif + + + + + #if ASYNC == 0 // primitive version + + + /* + + // applies to the parts which don't need communication + dev_Hopping_Matrix_half_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + //2*tSliceEO, VolumeEO-4*tSliceEO ); + tSliceEO, VolumeEO-2*tSliceEO ); + + // exchanges the boundaries + xchange_field_wrapper(spinin, ieo); // to be further optimized !! 
+ + // applies the hopping matrix to remaining parts + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + //0, 2*tSliceEO ); + 0, tSliceEO ); + + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + //VolumeEO-2*tSliceEO, 2*tSliceEO ); + VolumeEO-tSliceEO, tSliceEO ); + + */ + + +//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + + + #elif ASYNC == 1 // optimized version + + + #ifdef ASYNC_TIMING + cudaEventRecord(start_ALL, 0); + mpi_start_ALL = MPI_Wtime(); + #endif + + + // copies first FACE to host + cudaMemcpyAsync(RAND1, spinin, tSliceEO*6*sizeof(short4), cudaMemcpyDeviceToHost, stream[1]); + cudaMemcpyAsync(RAND1_norm, spinin_norm, tSliceEO*sizeof(float), cudaMemcpyDeviceToHost, stream[1]); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_D2H_1, stream[1]); + #endif + + + // INTERNAL kernel + dev_Hopping_Matrix_half_ASYNC <<>> ( gf, + spinin, spinin_norm, + spinout, spinout_norm, + gfindex_site, gfindex_nextsite, + nn_evenodd, ieo, + tSlices*tSliceEO, + VolumeEO-2*tSlices*tSliceEO ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_INT_0, stream[0]); + #endif + + + // exchanges first FACE + cudaStreamSynchronize(stream[1]); // SYNCPOINT + + #ifdef ASYNC_TIMING + mpi_start_sendrecv_1 = MPI_Wtime(); + #endif + + + // copies second FACE to host + cudaMemcpyAsync(RAND2, spinin+6*(VolumeEO-tSliceEO), tSliceEO*6*sizeof(short4), cudaMemcpyDeviceToHost, stream[2]); + cudaMemcpyAsync(RAND2_norm, spinin_norm+(VolumeEO-tSliceEO), tSliceEO*sizeof(float), cudaMemcpyDeviceToHost, stream[2]); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_D2H_2, stream[2]); + #endif + + + //MPI_Irecv(RAND3, 24*tSliceEO, MPI_FLOAT, g_nb_t_up, 0, + // g_cart_grid, &recv_req[0]); + //MPI_Isend(RAND1, 24*tSliceEO, MPI_FLOAT, g_nb_t_dn, 0, + // g_cart_grid, &send_req[0]); + + 
MPI_Sendrecv(RAND1, 24*tSliceEO, MPI_SHORT, g_nb_t_dn, 0, // SYNCPOINT + RAND3, 24*tSliceEO, MPI_SHORT, g_nb_t_up, 0, + g_cart_grid, &stat[0]); + // send norm + MPI_Sendrecv(RAND1_norm, tSliceEO, MPI_FLOAT, g_nb_t_dn, 0, // SYNCPOINT + RAND3_norm, tSliceEO, MPI_FLOAT, g_nb_t_up, 0, + g_cart_grid, &stat[0]); + #ifdef ASYNC_TIMING + mpi_stop_sendrecv_1 = MPI_Wtime(); + #endif + + + // copies first FACE back to device // order may switched + //MPI_Wait(&recv_req[0], &stat[0]); // synchronous + cudaMemcpyAsync(spinin+6*VolumeEO, RAND3, tSliceEO*6*sizeof(short4), cudaMemcpyHostToDevice, stream[1]); + cudaMemcpyAsync(spinin_norm+VolumeEO, RAND3_norm, tSliceEO*sizeof(float), cudaMemcpyHostToDevice, stream[1]); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_H2D_3, stream[1]); + #endif + + + // applies first FACE + dev_Hopping_Matrix_half_ASYNC <<>> ( gf, + spinin, spinin_norm, + spinout, spinout_norm, + gfindex_site, gfindex_nextsite, + nn_evenodd, ieo, + VolumeEO-tSlices*tSliceEO, + tSlices*tSliceEO ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_EXT_1, stream[1]); + #endif + + + // exchanges second FACE + cudaStreamSynchronize(stream[2]); // SYNCPOINT + + #ifdef ASYNC_TIMING + mpi_start_sendrecv_2 = MPI_Wtime(); + #endif + + MPI_Sendrecv(RAND2, 24*tSliceEO, MPI_SHORT, g_nb_t_up, 1, // SYNCPOINT + RAND4, 24*tSliceEO, MPI_SHORT, g_nb_t_dn, 1, + g_cart_grid, &stat[1]); + //send norms + MPI_Sendrecv(RAND2_norm, tSliceEO, MPI_FLOAT, g_nb_t_up, 1, // SYNCPOINT + RAND4_norm, tSliceEO, MPI_FLOAT, g_nb_t_dn, 1, + g_cart_grid, &stat[1]); + + #ifdef ASYNC_TIMING + mpi_stop_sendrecv_2 = MPI_Wtime(); + #endif + + + // copies second FACE back to device + //MPI_Wait(&recv_req[1], &stat[1]); + cudaMemcpyAsync(spinin+6*(VolumeEO+tSliceEO), RAND4, tSliceEO*6*sizeof(short4), cudaMemcpyHostToDevice, stream[2]); + cudaMemcpyAsync(spinin_norm+(VolumeEO+tSliceEO), RAND4_norm, tSliceEO*sizeof(float), cudaMemcpyHostToDevice, stream[2]); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_H2D_4, 
stream[2]); + #endif + + + // applies second FACE + dev_Hopping_Matrix_half_ASYNC <<>> ( gf, + spinin, spinin_norm, + spinout, spinout_norm, + gfindex_site, gfindex_nextsite, + nn_evenodd, ieo, + 0, tSlices*tSliceEO ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_EXT_2, stream[2]); + #endif + + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_ALL, 0); + #endif + + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + /* + + #elif ASYNC == 2 // alternate optimized version + + + #ifdef ASYNC_TIMING + cudaEventRecord(start_ALL, 0); + mpi_start_ALL = MPI_Wtime(); + #endif + + + // copies first FACE to host + cudaMemcpyAsync(RAND1, spinin, tSliceEO*6*sizeof(float4), cudaMemcpyDeviceToHost, stream[1]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_D2H_1, stream[1]); + #endif + + + // copies second FACE to host + cudaMemcpyAsync(RAND2, spinin+6*(VolumeEO-tSliceEO), tSliceEO*6*sizeof(float4), cudaMemcpyDeviceToHost, stream[2]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_D2H_2, stream[2]); + #endif + + + // INTERNAL kernel + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + tSlices*tSliceEO, VolumeEO-2*tSlices*tSliceEO ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_INT_0, stream[0]); + #endif + + + // first FACE + cudaStreamSynchronize(stream[1]); // SYNCPOINT + + #ifdef ASYNC_TIMING + mpi_start_sendrecv_1 = MPI_Wtime(); + #endif + + MPI_Sendrecv(RAND1, 24*tSliceEO, MPI_FLOAT, g_nb_t_dn, 0, // SYNCPOINT + RAND3, 24*tSliceEO, MPI_FLOAT, g_nb_t_up, 0, + g_cart_grid, &stat[0]); + + //MPI_Isend(RAND1, 24*tSliceEO, MPI_FLOAT, g_nb_t_dn, 0, + // g_cart_grid, &send_req[0]); + //MPI_Recv(RAND3, 24*tSliceEO, MPI_FLOAT, g_nb_t_up, 0, + // g_cart_grid, &stat[0]); + + //MPI_Wait(&recv_request1, &stat[0]); + + #ifdef ASYNC_TIMING + mpi_stop_sendrecv_1 = MPI_Wtime(); + #endif + + cudaMemcpyAsync(spinin+6*VolumeEO, 
RAND3, tSliceEO*6*sizeof(float4), cudaMemcpyHostToDevice, stream[1]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_H2D_3, stream[1]); + #endif + + + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + VolumeEO-tSlices*tSliceEO, tSlices*tSliceEO ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_EXT_1, stream[1]); + #endif + + + // second FACE + cudaStreamSynchronize(stream[2]); // SYNCPOINT + + #ifdef ASYNC_TIMING + mpi_start_sendrecv_2 = MPI_Wtime(); + #endif + + MPI_Sendrecv(RAND2, 24*tSliceEO, MPI_FLOAT, g_nb_t_up, 1, // SYNCPOINT + RAND4, 24*tSliceEO, MPI_FLOAT, g_nb_t_dn, 1, + g_cart_grid, &stat[1]); + + //MPI_Isend(RAND2, 24*tSliceEO, MPI_FLOAT, g_nb_t_up, 1, + // g_cart_grid, &send_req[1]); + //MPI_Recv(RAND4, 24*tSliceEO, MPI_FLOAT, g_nb_t_dn, 1, + // g_cart_grid, &stat[1]); + + //MPI_Wait(&recv_request2, &stat[1]); + + #ifdef ASYNC_TIMING + mpi_stop_sendrecv_2 = MPI_Wtime(); + #endif + + cudaMemcpyAsync(spinin+6*(VolumeEO+tSliceEO), RAND4, tSliceEO*6*sizeof(float4), cudaMemcpyHostToDevice, stream[2]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_H2D_4, stream[2]); + #endif + + + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + 0, tSlices*tSliceEO ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_EXT_2, stream[2]); + #endif + + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_ALL, 0); + #endif + + + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + #elif ASYNC == 3 + + + #ifdef ASYNC_TIMING + cudaEventRecord(start_ALL, 0); + mpiTime_start_ALL = MPI_Wtime(); + #endif + + + // copies first FACE to host + cudaMemcpyAsync(RAND1, spinin , tSliceEO*6*sizeof(float4), cudaMemcpyDeviceToHost, stream[1]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_D2H_1, stream[1]); + #endif + + // copies second FACE to host + cudaMemcpyAsync(RAND2, 
spinin+6*(VolumeEO-tSliceEO), tSliceEO*6*sizeof(float4), cudaMemcpyDeviceToHost, stream[2]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_D2H_2, stream[2]); + #endif + + + // INTERNAL kernel + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + tSliceEO, VolumeEO-2*tSliceEO ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_INT_0, stream[0]); + #endif + + + // first FACE + cudaStreamSynchronize(stream[1]); + + for (int i = 0; i < nStreams; i++) { + + #ifdef ASYNC_TIMING + mpiTime_start_sendrecv_1 = MPI_Wtime(); + #endif + + MPI_Sendrecv(RAND1+6*i*offset, 24*offset, MPI_FLOAT, g_nb_t_dn, 0, // NOT asynchronous + RAND3+6*i*offset, 24*offset, MPI_FLOAT, g_nb_t_up, 0, + g_cart_grid, &stat[i]); + + #ifdef ASYNC_TIMING + mpiTime_stop_sendrecv_1 = MPI_Wtime(); + #endif + + //MPI_Isend(RAND1+6*i*offset, 24*offset, MPI_FLOAT, g_nb_t_dn, i, + // g_cart_grid, &send_req[i]); + //MPI_Irecv (RAND3+6*i*offset, 24*offset, MPI_FLOAT, g_nb_t_up, i, + // g_cart_grid, &recv_req[i]); + + //MPI_Wait(&recv_req[i], &stat[i]); + + cudaMemcpyAsync(spinin+6*VolumeEO+6*i*offset, RAND3+6*i*offset, offset*6*sizeof(float4), cudaMemcpyHostToDevice, stream[1+i]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_H2D_3, stream[1]); + #endif + + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + VolumeEO-tSliceEO+i*offset, offset ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_EXT_1, stream[1]); + #endif + + } + + + // second FACE + cudaStreamSynchronize(stream[nStreams+1]); + + for (int i = 0; i < nStreams; i++) { + + #ifdef ASYNC_TIMING + mpiTime_start_sendrecv_2 = MPI_Wtime(); + #endif + + MPI_Sendrecv(RAND2+6*i*offset, 24*offset, MPI_FLOAT, g_nb_t_up, 1, + RAND4+6*i*offset, 24*offset, MPI_FLOAT, g_nb_t_dn, 1, + g_cart_grid, &stat[nStreams+i]); + + #ifdef ASYNC_TIMING + mpiTime_stop_sendrecv_2 = MPI_Wtime(); + #endif + + //MPI_Isend(RAND2+6*i*offset, 24*offset, MPI_FLOAT, g_nb_t_up, 
nStreams+i, + // g_cart_grid, &send_req[nStreams+i]); + //MPI_Irecv (RAND4+6*i*offset, 24*offset, MPI_FLOAT, g_nb_t_dn, nStreams+i, + // g_cart_grid, &recv_req[nStreams+i]); + + //MPI_Wait(&recv_req[nStreams+i], &stat[nStreams+i]); + + cudaMemcpyAsync(spinin+6*(VolumeEO+tSliceEO)+6*i*offset, RAND4+6*i*offset, offset*6*sizeof(float4), cudaMemcpyHostToDevice, stream[nStreams+1+i]); + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_H2D_4, stream[2]); + #endif + + dev_Hopping_Matrix_ASYNC <<>> ( gf, + spinin, spinout, + gfindex_site, gfindex_nextsite, nn_evenodd, + ieo, + 0+i*offset, offset ); + #ifdef ASYNC_TIMING + cudaEventRecord(stop_EXT_2, stream[2]); + #endif + + } + + + #ifdef ASYNC_TIMING + cudaEventRecord(stop_ALL, 0); + #endif + */ + + + #endif // different optimized and non-optimized version + + + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + cudaThreadSynchronize(); // test if needed // for timing ... 
#ifdef USETEXTURE
    unbind_halfspinor_texture();
  #endif

}





#endif //HALF














diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/DEBUG/CG.cuh b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/DEBUG/CG.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..824b5cbd407764dd0bf4345fff3ce3e24f33a95d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/DEBUG/CG.cuh
@@ -0,0 +1,431 @@

// this is an exact implementation of the CG according to cg_her_nd()


////////////////////////
// CONJUGATE GRADIENT //
////////////////////////

// for the odd field after even/odd-preconditioning
// single precision on GPU

/*
 * alt_cg_eo_nd -- single-precision conjugate gradient on the device.
 *
 * Solves A * x = Q with the starting vector x(0) = 0, where A is whatever
 * matrix_multiplication32() applies (or Q_Qdagger_ND() on the host when
 * MATRIX_DEBUG is defined).
 *
 * Parameters
 *   gf          gauge field; it is bound to a texture via bind_texture_gf().
 *   P_up, P_dn  output fields. They are zeroed on entry and receive the last
 *               iterate x(k) before the function returns.
 *   Q_up, Q_dn  right-hand side; only read by this function.
 *   eps_sq      absolute tolerance: the iteration stops once the squared
 *               residue is <= eps_sq/2 (or <= eps_rel times the squared norm of Q).
 *
 * Returns
 *   the number of iterations performed (>= 1) when the requested precision was
 *   reached, -1 otherwise (maxit exhausted, or the initial residue could not be
 *   set up).
 *
 * Side effects
 *   - writes the dev_k{0-3}c / dev_mk{0-3}c constants and launches the
 *     he_cg_init / he_cg_init_nd_additional kernels,
 *   - uses the file-level fields dev_spin1_up/dn and dev_spin2_up/dn as r(k) and d(k),
 *   - sets g_sloppy_precision = 0 before returning,
 *   - calls exit(-1) if the residue becomes NaN.
 *
 * Field usage inside the algorithm:
 *   dev_spin  = x(k)     dev_spin1 = r(k)     dev_spin2 = d(k)     dev_spin4 = A*d(k)
 *
 * NOTE(review): cublasInit() is only executed when CUDA_DEBUG is defined (the plain
 *               call is commented out) -- presumably CUBLAS is initialised by the
 *               caller; confirm.
 * NOTE(review): cudaMemcpyToSymbol() is called with the symbol name as a string.
 *               That form is not accepted by newer CUDA toolkits; the symbols are
 *               declared outside this block, so the calls are left untouched here.
 */
int alt_cg_eo_nd (dev_su3_2v * gf,
                  dev_spinor * P_up, dev_spinor * P_dn,
                  dev_spinor * Q_up, dev_spinor * Q_dn,
                  double eps_sq) {

  // P_up/dn can be used as auxiliary field to work on, as it is not later used
  // Q_up/dn can be used as feedback or initial guess, or if not, also as auxiliary field

  printf("This is the ALTERNATIVE CG on the device!\n");


  /////////////////////
  // LOCAL VARIABLES //           // single precision
  /////////////////////

  int iteration;                  // iteration counter
  int result = -1;                // return value; stays -1 unless the precision is reached
  cudaError_t cudaerr;            // presumably used by the CUDA_CHECK macros -- defined outside this block
  cublasStatus cublasstatus;      // presumably used by the CUBLAS_*_CHECK macros -- defined outside this block

  int maxit = 1000;               // maximal number of iterations
  float eps_rel = 0.0001;         // relative tolerance w.r.t. the squared norm of Q
  // int maxit = max_innersolver_it;                 // maximal number of inner iterations per one outer iteration
  // float eps_rel = (float) innersolver_precision;  // precision for the inner solver
  float eps_abs = (float) eps_sq/2.0;                // absolute tolerance

  int N_sites  = VOLUME/2;
  int N_floats = 24*VOLUME/2;     // (single precision) CUBLAS functions get the number of floats as input

  int N_recalcres = 40;

  size_t dev_spinsize = 6*VOLUME/2*sizeof(dev_spinor);


  /////////////////////////////////////////////
  // CUDA block- and gridsize specifications //
  /////////////////////////////////////////////
  // griddim2..5 / blockdim2..5 are only handed on to matrix_multiplication32().

  int gridsize;                   // auxiliary
  int blocksize;                  // auxiliary

  blocksize = 128;
  int blockdim1 = blocksize;      // here: dev_zero_spinor_field , dev_copy_spinor_field
  int griddim1  = (int) (VOLUME/2/blocksize) + 1;

  blocksize = 128;
  int blockdim2 = blocksize;      // passed: dev_Hopping_Matrix
  int griddim2  = (int) (VOLUME/2/blocksize) + 1;

  blocksize = 128;
  int blockdim3 = blocksize;      // passed: dev_mul_one_pm_imubar_gamma5
  int griddim3  = (int) (VOLUME/2/blocksize) + 1;

  blocksize = 128;
  int blockdim4 = blocksize;      // passed: dev_gamma5
  int griddim4  = (int) (VOLUME/2/blocksize) + 1;

  blocksize = 128;
  int blockdim5 = blocksize;      // passed: dev_copy_spinor_field
  int griddim5  = (int) (VOLUME/2/blocksize) + 1;


  /////////////////////
  // INITIALIZATIONS //
  /////////////////////

  // h{0-3} are the complex conjugates of ka{0-3}, mh{0-3} their negatives
  dev_complex h0, h1, h2, h3, mh0, mh1, mh2, mh3;

  h0.re = (REAL) ka0.re;    h0.im = -(REAL) ka0.im;   // ka{0-3} are defined in boundary.c (per original comment)
  h1.re = (REAL) ka1.re;    h1.im = -(REAL) ka1.im;
  h2.re = (REAL) ka2.re;    h2.im = -(REAL) ka2.im;
  h3.re = (REAL) ka3.re;    h3.im = -(REAL) ka3.im;

  mh0.re = -(REAL) ka0.re;  mh0.im = (REAL) ka0.im;
  mh1.re = -(REAL) ka1.re;  mh1.im = (REAL) ka1.im;
  mh2.re = -(REAL) ka2.re;  mh2.im = (REAL) ka2.im;
  mh3.re = -(REAL) ka3.re;  mh3.im = (REAL) ka3.im;

  // try using constant mem for kappas    // constant memory is cached!
  cudaMemcpyToSymbol("dev_k0c", &h0, sizeof(dev_complex));
  cudaMemcpyToSymbol("dev_k1c", &h1, sizeof(dev_complex));
  cudaMemcpyToSymbol("dev_k2c", &h2, sizeof(dev_complex));
  cudaMemcpyToSymbol("dev_k3c", &h3, sizeof(dev_complex));

  cudaMemcpyToSymbol("dev_mk0c", &mh0, sizeof(dev_complex));
  cudaMemcpyToSymbol("dev_mk1c", &mh1, sizeof(dev_complex));
  cudaMemcpyToSymbol("dev_mk2c", &mh2, sizeof(dev_complex));
  cudaMemcpyToSymbol("dev_mk3c", &mh3, sizeof(dev_complex));

  // debug    // CUDA
  #ifdef CUDA_DEBUG
    CUDA_CHECK("CUDA error in cg_eo_nd(). Trying to use constant memory for strange stuff failed.", "Using constant memory for strange stuff.");
  #endif


  // bind texture gf
  bind_texture_gf(gf);            // needed for subfunctions of dev_Hopping_Matrix(...)
                                  //   e.g. dev_reconstructgf_2vtexref(...), dev_reconstructgf_8texref(...),
                                  //   in general in functions dev_reconstructgf[...] with "tex1Dfetch(gf_tex[...]"
  // debug    // CUDA
  #ifdef CUDA_DEBUG
    CUDA_CHECK("CUDA error in bind_texture_gf(). Binding GF to texture failed.", "GF bound to texture.");
  #endif


  // "he" = "host entry"
  he_cg_init<<< 1, 1 >>> (dev_grid, (REAL) g_kappa, (REAL)(g_mu/(2.0*g_kappa)), h0, h1, h2, h3);
  // BEWARE in dev_tm_dirac_kappa we need the true mu (not 2 kappa mu!)

  // per original comments:
  //   dev_LX, dev_LY, dev_LZ, dev_T, dev_VOLUME = grid[5] = dev_grid[5]
  //   dev_VOLUME is necessary for many kernel functions as for instance dev_gamma5()
  //   initializes mu, kappa and twokappamu on the device
  //   initializes dev_k{0-3}, dev_mk{0-3} as derived from the ka{0-3} from boundary.c

  // debug    // kernel
  #ifdef CUDA_DEBUG
    CUDA_KERNEL_CHECK("Kernel error in he_cg_init(). Couldn't initialize some stuff.", "he_cg_init() succeeded.");
  #endif

  // debug    // check stuff on device
  #ifdef STUFF_DEBUG
    int host_check_LX, host_check_LY, host_check_LZ, host_check_T, host_check_VOLUME;
    cudaMemcpyFromSymbol(&host_check_LX, dev_LX, sizeof(int));
    cudaMemcpyFromSymbol(&host_check_LY, dev_LY, sizeof(int));
    cudaMemcpyFromSymbol(&host_check_LZ, dev_LZ, sizeof(int));
    cudaMemcpyFromSymbol(&host_check_T, dev_T, sizeof(int));
    cudaMemcpyFromSymbol(&host_check_VOLUME, dev_VOLUME, sizeof(int));
    printf("\teven_odd_flag = %i\n", even_odd_flag);
    printf("\tOn device:\n");
    printf("\tdev_LX = %i\n", host_check_LX);
    printf("\tdev_LY = %i\n", host_check_LY);
    printf("\tdev_LZ = %i\n", host_check_LZ);
    printf("\tdev_T = %i\n", host_check_T);
    printf("\tdev_VOLUME = %i/2 ?!= %i\n", host_check_LX*host_check_LY*host_check_LZ*host_check_T, host_check_VOLUME);

    float host_check_mu, host_check_kappa, host_check_twokappamu;
    cudaMemcpyFromSymbol(&host_check_mu, mu, sizeof(float));
    cudaMemcpyFromSymbol(&host_check_kappa, kappa, sizeof(float));
    cudaMemcpyFromSymbol(&host_check_twokappamu, twokappamu, sizeof(float));
    // printf("\tOn device:\n");
    // printf("\tmu = %f\n", host_check_mu);           // not needed for the nd case
    printf("\tkappa = %f\n", host_check_kappa);
    // printf("\ttwokappamu = %f\n", host_twokappamu);
  #endif

  // additional device fields owned by this function (released before returning)
  dev_spinor * dev_spin_up;       // x(k)
  dev_spinor * dev_spin_dn;
  dev_spinor * dev_spin4_up;      // A*d(k)
  dev_spinor * dev_spin4_dn;
  dev_spinor * dev_spin5_up;      // copy of Q
  dev_spinor * dev_spin5_dn;
  cudaMalloc((void **) &dev_spin_up, dev_spinsize);
  cudaMalloc((void **) &dev_spin_dn, dev_spinsize);
  cudaMalloc((void **) &dev_spin4_up, dev_spinsize);
  cudaMalloc((void **) &dev_spin4_dn, dev_spinsize);
  cudaMalloc((void **) &dev_spin5_up, dev_spinsize);
  cudaMalloc((void **) &dev_spin5_dn, dev_spinsize);


  he_cg_init_nd_additional<<<1,1>>> (g_mubar, g_epsbar);

  // debug    // kernel
  #ifdef CUDA_DEBUG
    CUDA_KERNEL_CHECK("Kernel error in he_cg_init_nd_additional(). Couldn't initialize some stuff.", "he_cg_init_nd_additional() succeeded.");
  #endif

  // debug    // check mubar and epsbar on host and device
  #ifdef STUFF_DEBUG
    // printf("\tOn host:\n");
    // printf("\tg_mubar = %f\n", g_mubar);
    // printf("\tg_epsbar = %f\n", g_epsbar);

    float host_check_mubar, host_check_epsbar;
    cudaMemcpyFromSymbol(&host_check_mubar, mubar, sizeof(float));
    cudaMemcpyFromSymbol(&host_check_epsbar, epsbar, sizeof(float));
    printf("\tOn device:\n");
    printf("\tmubar = %f\n", host_check_mubar);
    printf("\tepsbar = %f\n", host_check_epsbar);
  #endif


  // init CUBLAS
  // cublasInit();

  // debug    // CUBLAS helper function
  #ifdef CUDA_DEBUG
    CUBLAS_HELPER_CHECK(cublasInit(), "CUBLAS error in cublasInit(). Couldn't initialize CUBLAS.", "CUBLAS initialized.");
  #endif


  float squarenorm_up, squarenorm_dn, squarenorm;   // squared norm of Q, reference for eps_rel
  float normsp_up, normsp_dn, normsp;               // just to check if zero
  float normsq_up, normsq_dn, normsq = 0.0f;        // r(k)*r(k) of the previous step
  float pro_up, pro_dn, pro;                        // d*A*d
  float alpha_cg, beta_cg;                          // alpha, beta
  float err_up, err_dn, err;                        // r(k+1)*r(k+1)


  ///////////////
  // ALGORITHM //
  ///////////////

  // P = 0
  // (execution configuration restored: griddim1/blockdim1 are declared above for exactly these kernels)
  dev_zero_spinor_field<<<griddim1, blockdim1>>>(P_up);                      // P_up = 0
  dev_zero_spinor_field<<<griddim1, blockdim1>>>(P_dn);                      // P_dn = 0


  // squarenorm = (Q_up)^2 + (Q_dn)^2
  squarenorm_up = cublasSdot(N_floats, (float *) Q_up, 1, (float *) Q_up, 1);
  squarenorm_dn = cublasSdot(N_floats, (float *) Q_dn, 1, (float *) Q_dn, 1);
  squarenorm    = squarenorm_up + squarenorm_dn;

  // x(0) = P = 0
  dev_copy_spinor_field<<<griddim1, blockdim1>>>(P_up, dev_spin_up);         // dev_spin_up = P_up
  dev_copy_spinor_field<<<griddim1, blockdim1>>>(P_dn, dev_spin_dn);         // dev_spin_dn = P_dn

  // normsp = (P_up)^2 + (P_dn)^2
  normsp_up = cublasSdot(N_floats, (float *) P_up, 1, (float *) P_up, 1);
  normsp_dn = cublasSdot(N_floats, (float *) P_dn, 1, (float *) P_dn, 1);
  normsp    = normsp_up + normsp_dn;

  // keep a copy of the right-hand side
  dev_copy_spinor_field<<<griddim1, blockdim1>>>(Q_up, dev_spin5_up);        // dev_spin5_up = Q_up
  dev_copy_spinor_field<<<griddim1, blockdim1>>>(Q_dn, dev_spin5_dn);        // dev_spin5_dn = Q_dn


  if (normsp == 0) {

    printf("Yes, normsp = 0!\n");

    // r(0) = d(0) = Q
    dev_copy_spinor_field<<<griddim1, blockdim1>>>(dev_spin5_up, dev_spin1_up);   // dev_spin1_up = dev_spin5_up = Q_up
    dev_copy_spinor_field<<<griddim1, blockdim1>>>(dev_spin5_dn, dev_spin1_dn);   // dev_spin1_dn = dev_spin5_dn = Q_dn
    dev_copy_spinor_field<<<griddim1, blockdim1>>>(dev_spin5_up, dev_spin2_up);   // dev_spin2_up = dev_spin5_up = Q_up
    dev_copy_spinor_field<<<griddim1, blockdim1>>>(dev_spin5_dn, dev_spin2_dn);   // dev_spin2_dn = dev_spin5_dn = Q_dn

    // normsq = (Q_up)^2 + (Q_dn)^2       // first residual for (initial guess = 0)
    normsq_up = cublasSdot(N_floats, (float *) Q_up, 1, (float *) Q_up, 1);
    normsq_dn = cublasSdot(N_floats, (float *) Q_dn, 1, (float *) Q_dn, 1);
    normsq    = normsq_up + normsq_dn;

  }
  else {

    // P was zeroed above, so a non-zero (or NaN) norm means zeroing or the dot product failed.
    // r(0), d(0) and normsq are not set up in that case: do not iterate on undefined data.
    printf("Error in alt_cg_eo_nd(). Norm of the zeroed start vector is %.8e, skipping the iteration.\n", normsp);
    maxit = 0;

  }

  // debug    // CUBLAS core function
  #ifdef CUDA_DEBUG
    CUBLAS_CORE_CHECK("CUBLAS error in alt_cg_eo_nd(). Calculating initial residue failed.", "Initial residue calculated.");
  #endif

  // debug
  printf("Initial inner residue: %.8e\n", squarenorm);


  //////////
  // LOOP //
  //////////

  // debug
  printf("Entering inner CG loop.\n");


  for (iteration = 0; iteration < maxit; iteration++) {


    // A*d(k)
    #ifndef MATRIX_DEBUG

      matrix_multiplication32(dev_spin4_up, dev_spin4_dn,
                              dev_spin2_up, dev_spin2_dn,
                              griddim2, blockdim2,        // for calling some kernels as subfunctions
                              griddim3, blockdim3,        // ...
                              griddim4, blockdim4,
                              griddim5, blockdim5);


      // debug    // CUDA
      #ifdef CUDA_DEBUG
        CUDA_CHECK("CUDA error in matrix_muliplication32(). Applying the matrix on GPU failed.", "The matrix was applied on GPU.");
      #endif

    #else

      // debug    // apply the host matrix on trial

      // host/device interaction
      cudaMemcpy(h2d_spin_up, dev_spin2_up, dev_spinsize, cudaMemcpyDeviceToHost);
      cudaMemcpy(h2d_spin_dn, dev_spin2_dn, dev_spinsize, cudaMemcpyDeviceToHost);
      convert2double_spin(h2d_spin_up, g_chi_up_spinor_field[DUM_SOLVER+3]);
      convert2double_spin(h2d_spin_dn, g_chi_dn_spinor_field[DUM_SOLVER+3]);   // fixed: the dn field was converted from h2d_spin_up

      // matrix multiplication
      Q_Qdagger_ND(g_chi_up_spinor_field[DUM_SOLVER+4], g_chi_dn_spinor_field[DUM_SOLVER+4],
                   g_chi_up_spinor_field[DUM_SOLVER+3], g_chi_dn_spinor_field[DUM_SOLVER+3] );

      // host/device interaction
      convert2REAL4_spin(g_chi_up_spinor_field[DUM_SOLVER+4], h2d_spin_up);
      convert2REAL4_spin(g_chi_dn_spinor_field[DUM_SOLVER+4], h2d_spin_dn);
      cudaMemcpy(dev_spin4_up, h2d_spin_up, dev_spinsize, cudaMemcpyHostToDevice);
      cudaMemcpy(dev_spin4_dn, h2d_spin_dn, dev_spinsize, cudaMemcpyHostToDevice);

      // debug    // CUDA
      #ifdef CUDA_DEBUG
        CUDA_CHECK("CUDA error in cg_eo_nd(). Applying the matrix on CPU failed.", "The matrix was applied on CPU.");
      #endif

    #endif


    // pro = d*A*d
    // NOTE(review): a real dot product over all 24 floats per site -- presumably this yields
    //               the real part of the complex scalar product; confirm.
    pro_up = cublasSdot(N_floats, (float *) dev_spin2_up, 1, (float *) dev_spin4_up, 1);
    pro_dn = cublasSdot(N_floats, (float *) dev_spin2_dn, 1, (float *) dev_spin4_dn, 1);
    pro    = pro_up + pro_dn;

    // alpha = r(k-1)*r(k-1) / d*A*d
    alpha_cg = normsq / pro;

    // x(k+1) = x(k) + alpha*d(k)
    cublasSaxpy(N_floats, alpha_cg, (float *) dev_spin2_up, 1, (float *) dev_spin_up, 1);        // dev_spin_up = dev_spin_up + alpha * dev_spin2_up
    cublasSaxpy(N_floats, alpha_cg, (float *) dev_spin2_dn, 1, (float *) dev_spin_dn, 1);        // dev_spin_dn = dev_spin_dn + alpha * dev_spin2_dn

    // r(k+1) = r(k) - alpha*A*d(k)
    cublasSaxpy(N_floats, -1.0*alpha_cg, (float *) dev_spin4_up, 1, (float *) dev_spin1_up, 1);  // dev_spin1_up = dev_spin1_up - alpha*dev_spin4_up
    cublasSaxpy(N_floats, -1.0*alpha_cg, (float *) dev_spin4_dn, 1, (float *) dev_spin1_dn, 1);  // dev_spin1_dn = dev_spin1_dn - alpha*dev_spin4_dn

    // err = r(k+1) * r(k+1)
    err_up = cublasSdot(N_floats, (float *) dev_spin1_up, 1, (float *) dev_spin1_up, 1);         // err_up = (dev_spin1_up)^2
    err_dn = cublasSdot(N_floats, (float *) dev_spin1_dn, 1, (float *) dev_spin1_dn, 1);         // err_dn = (dev_spin1_dn)^2
    err    = err_up + err_dn;

    // debug    // CUBLAS core function
    #ifdef CUDA_DEBUG
      CUBLAS_CORE_CHECK_NO_SUCCESS_MSG("CUBLAS error in cg_eo_nd(). CUBLAS function failed.");
    #endif

    // debug
    printf("inner iteration j = %i: err = %.8e\n", iteration, err);

    // debug    // is NaN ?
    // (condition parenthesised: the bare "if isnan(err)" only parses when isnan is a suitable macro)
    if (isnan(err)) {
      printf("Error in cg_eo_nd(). Inner residue is NaN.\n");
      exit(-1);
    }

    // check whether precision is reached
    if ( (err <= eps_rel*squarenorm) || (err <= eps_abs) ) {
      // leave through the common exit below, so that the result is copied to P
      // and the device buffers are released on this path as well
      result = iteration+1;
      break;
    }

    // beta = r(k+1)*r(k+1) / r(k)*r(k)
    beta_cg = err / normsq;

    // d(k+1) = r(k+1) + beta*d(k)
    cublasSscal (N_floats, beta_cg, (float *) dev_spin2_up, 1);                                  // dev_spin2_up  = beta * dev_spin2_up
    cublasSaxpy (N_floats, 1.0    , (float *) dev_spin1_up, 1, (float *) dev_spin2_up, 1);       // dev_spin2_up += dev_spin1_up
    cublasSscal (N_floats, beta_cg, (float *) dev_spin2_dn, 1);                                  // dev_spin2_dn  = beta * dev_spin2_dn
    cublasSaxpy (N_floats, 1.0    , (float *) dev_spin1_dn, 1, (float *) dev_spin2_dn, 1);       // dev_spin2_dn += dev_spin1_dn

    normsq = err;     // for the next iteration


  }//LOOP


  // output (converged or not: hand back the last iterate)
  dev_copy_spinor_field<<<griddim1, blockdim1>>>(dev_spin_up, P_up);         // P_up = dev_spin_up
  dev_copy_spinor_field<<<griddim1, blockdim1>>>(dev_spin_dn, P_dn);         // P_dn = dev_spin_dn

  g_sloppy_precision = 0;

  // release the fields allocated above; cudaFree() is issued after the copy kernels
  // (assumes default-stream ordering -- the launches above specify no stream)
  cudaFree(dev_spin_up);
  cudaFree(dev_spin_dn);
  cudaFree(dev_spin4_up);
  cudaFree(dev_spin4_dn);
  cudaFree(dev_spin5_up);
  cudaFree(dev_spin5_dn);

  return(result);


}//alt_cg_eo_nd()






diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/DEBUG/MATRIX_DEBUG.cuh b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/DEBUG/MATRIX_DEBUG.cuh
new file mode 100644
index 0000000000000000000000000000000000000000..33bbea249a1a573f0982fdd81aab89c209cbe5c7
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/DEBUG/MATRIX_DEBUG.cuh
@@ -0,0 +1,2136 @@

// matrix_debug1() replaces matrix_multiplication32()
// matrix_debug2(), Zwitter1(), Zwitter2() and Zwitter3() replace Q_Qdagger_ND()



extern "C" {
//#ifdef HAVE_CONFIG_H
//# include
//#endif
//#include
//#include
//#include "../global.h"
//#include 
"../su3.h" +#include "../Hopping_Matrix.h" +#include "../phmc.h" +#include "../gamma.h" +//#include "../linsolve.h" +//#include "../linalg_eo.h" +//#include "../Nondegenerate_Matrix.h" +} + +/* +#define CHECK_HOPPING_MATRIX +#define CHECK_IMUGAMMA5 +#define CHECK_GAMMA5 +#define CHECK_CUBLAS1 +#define CHECK_CUBLAS2 +#define CHECK_CUBLAS3 +#define CHECK_COPY +//#define CHECK_MAXEV +*/ + + + + + +/////////////////////////// +// MATRIX MULTIPLICATION // +/////////////////////////// + + +// this replaces matrix_multiplication32() for debugging !! + +void matrix_debug1 (dev_spinor * spinout_up, dev_spinor * spinout_dn, + dev_spinor * spinin_up , dev_spinor * spinin_dn , + int gridsize1, int blocksize1, int gridsize2, int blocksize2, + int gridsize3, int blocksize3, int gridsize4, int blocksize4) { + + + + + ///////////////////// + // LOCAL VARIABLES // + ///////////////////// + + int N_sites = VOLUME/2; // #lattice sites + int N_floats = 24*VOLUME/2; // #floats + + // additional for the debugging purposes + size_t dev_spinsize = 6*VOLUME/2*sizeof(dev_spinor); + + + + + //////////////////////////////////// + // MATCHING with Q_Qdagger_ND // + //////////////////////////////////// + // // + // _strange = _up // + // _charm = _dn // + // // + // DUM_MATRIX = dev_spin_eo1_up // + // DUM_MATRIX+1 = dev_spin_eo1_dn // + // // + // DUM_MATRIX+2 = dev_spin_eo2_up // + // DUM_MATRIX+3 = dev_spin_eo2_dn // + // // + //////////////////////////////////// + + // can savely use the following spinors on host: g_spinor_field[DUM_MATRIX{ , +1, ... 
, +7}] + + + + /////////////////////////////////// + // INITIALIZATIONS & ASSIGNMENTS // // have to use (one) other auxiliary field(s) than the calling function dev_cg_eo_nd + /////////////////////////////////// + + //cudaMalloc((void **) &dev_spin_eo2_up, dev_spinsize); + //cudaMalloc((void **) &dev_spin_eo2_dn, dev_spinsize); + + dev_spin_eo2_up = spinout_up; // need no memory allocated + dev_spin_eo2_dn = spinout_dn; + + //dev_spin_eo2_up = dev_spin3_up; + //dev_spin_eo2_dn = dev_spin3_dn; + ///////////// THEORY //////////////////////////////////////////////////////////////// + // // + // (Q_tilde) = gamma5 * ((M_oo) - (M_oe)(Mee^-1)(M_eo)) // + // (Q_tilde)(Q_tilde_dagger) * (up,dn) = (Q_tilde) * (b,a) // + /////////////// // (a,b) = (Q_tilde) * (dn,up) // + // MAIN BODY // // // + /////////////// ///////////////////////////////////////////////////////////////////////////////////// + + + double nrm = 1. / (1. + g_mubar*g_mubar - g_epsbar*g_epsbar); + + + spinor * l_strange = (spinor *) malloc(6*VOLUME*sizeof(dev_spinor)); + spinor * l_charm = (spinor *) malloc(6*VOLUME*sizeof(dev_spinor)); + spinor * k_strange = (spinor *) malloc(6*VOLUME*sizeof(dev_spinor)); + spinor * k_charm = (spinor *) malloc(6*VOLUME*sizeof(dev_spinor)); + + +/* + #ifdef CHECK_HOPPING_MATRIX + printf("\tCHECK_HOPPING_MATRIX\n"); + #endif + + #ifdef CHECK_IMUGAMMA5 + printf("\tCHECK_IMUGAMMA5\n"); + #endif + + #ifdef CHECK_GAMMA5 + printf("\tCHECK_GAMMA5\n"); + #endif + + #ifdef CHECK_CUBLAS1 + printf("\tCHECK_CUBLAS1\n"); + #endif + + #ifdef CHECK_CUBLAS2 + printf("\tCHECK_CUBLAS2\n"); + #endif + + #ifdef CHECK_CUBLAS3 + printf("\tCHECK_CUBLAS3\n"); + #endif + + #ifdef CHECK_COPY + printf("\tCHECK_COPY\n"); + #endif +*/ + + + + + printf("This is matrix_debug1(). 
"); + + + + + /////////////////////////////////////// ///////////////////////////////// + // Q_tilde_dagger(2x2) // // (a,b) = (Q_tilde) * (dn,up) // + /////////////////////////////////////// ///////////////////////////////// + + + //////////////////////////// + // (M_oe) (Mee^-1) (M_eo) // + //////////////////////////// + + #ifndef CHECK_HOPPING_MATRIX + bind_texture_spin(spinin_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_dn, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * spinin_dn + unbind_texture_spin(1); + + bind_texture_spin(spinin_up,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_up, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = (M_eo) * spinin_up + unbind_texture_spin(1); + #else + to_host(k_charm, spinin_dn, h2d_spin_up, dev_spinsize); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], k_charm); // g_spinor_field[DUM_MATRIX] = (M_eo) * k_charm + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + + to_host(k_strange, spinin_up, h2d_spin_dn, dev_spinsize); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k_strange); // g_spinor_field[DUM_MATRIX+1] = (M_eo) * k_strange + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_IMUGAMMA5 + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up = (1 - imubar)*(M_eo) * spinin_dn + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn = (1 + imubar)*(M_eo) * spinin_up + #else + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo1_up, h2d_spin_up, dev_spinsize); + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX]); // g_spinor_field[DUM_MATRIX+2] = (1 - imubar)*(M_eo) * g_spinor_field[DUM_MATRIX] + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, 
dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo1_dn, h2d_spin_dn, dev_spinsize); + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1]); // g_spinor_field[DUM_MATRIX+3] = (1 + imubar)*(M_eo) * g_spinor_field[DUM_MATRIX+1] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_CUBLAS1 + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn = (1 - imubar)*(M_eo) * spinin_dn + epsbar * (M_eo) * spinin_up + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up = (1 + imubar)*(M_eo) * spinin_up + epsbar * (M_eo) * spinin_dn + + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = nrm * dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * spinin_dn + nrm*epsbar*(M_eo) * spinin_up + + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * spinin_up + nrm*epsbar*(M_eo) * spinin_dn + #else + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo1_dn, h2d_spin_up, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + assign_add_mul_r(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+1], g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+2] = g_spinor_field[DUM_MATRIX+2] + epsbar*g_spinor_field[DUM_MATRIX+1] + mul_r(g_spinor_field[DUM_MATRIX+2], nrm, g_spinor_field[DUM_MATRIX+2], VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = nrm * g_spinor_field[DUM_MATRIX+2] + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + + + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo1_up, h2d_spin_dn, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + 
assign_add_mul_r(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX], g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+3] = g_spinor_field[DUM_MATRIX+3] + epsbar*g_spinor_field[DUM_MATRIX] + mul_r(g_spinor_field[DUM_MATRIX+3], nrm, g_spinor_field[DUM_MATRIX+3], VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = nrm * g_spinor_field[DUM_MATRIX+3] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3] , h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_HOPPING_MATRIX // remember: this is ((M_oe)(Mee^-1)(M_eo)) * (spinin_dn, spinin_up): + bind_texture_spin(dev_spin_eo2_up,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); + // dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up = (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_dn + (M_oe)*nrm*epsbar*(M_eo) * spinin_up + bind_texture_spin(dev_spin_eo2_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); + // dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn = (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_up + (M_oe)*nrm*epsbar*(M_eo) * spinin_dn + #else + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+2]); // g_spinor_field[DUM_MATRIX] = (M_oe) * g_spinor_field[DUM_MATRIX+2] + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+3]); // g_spinor_field[DUM_MATRIX+1] = (M_oe) * g_spinor_field[DUM_MATRIX+3] + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + #endif + + + + + //////////// + // (M_oo) // + //////////// + + #ifndef CHECK_IMUGAMMA5 + dev_mul_one_pm_imubar_gamma5<<>>(spinin_dn, dev_spin_eo2_up, +1.0); // 
dev_spin_eo2_up = (1 + imubar) * spinin_dn + dev_mul_one_pm_imubar_gamma5<<>>(spinin_up, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * spinin_up + #else + to_host(k_charm, spinin_dn, h2d_spin_up, dev_spinsize); + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX+2], k_charm); // g_spinor_field[DUM_MATRIX+2] = (1 + imubar) * k_charm + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + + to_host(k_strange, spinin_up, h2d_spin_dn, dev_spinsize); + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+3], k_strange); // g_spinor_field[DUM_MATRIX+3] = (1 - imubar) * k_strange + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + #endif + + + + #ifndef CHECK_CUBLAS2 + // remember: this is (M_oo) * (spinin_dn, spinin_up): + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*spinin_up = (1+imubar)*spinin_dn - epsbar*spinin_up + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*spinin_dn = (1-imubar)*spinin_up - epsbar*spinin_dn + #else + to_host(k_strange, spinin_up, h2d_spin_up, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + assign_add_mul_r(g_spinor_field[DUM_MATRIX+2], k_strange, -g_epsbar, VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = g_spinor_field[DUM_MATRIX+2] - epsbar*k_strange + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + + to_host(k_charm, spinin_dn, h2d_spin_dn, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + assign_add_mul_r(g_spinor_field[DUM_MATRIX+3], k_charm , -g_epsbar, VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = g_spinor_field[DUM_MATRIX+3] - epsbar*k_charm + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + #endif + + + + + 
/////////////////////////////////////// + // (M_oo) - (M_oe) (Mee^-1) (M_eo) // + /////////////////////////////////////// + + #ifndef CHECK_CUBLAS3 + // this is ((M_oo) - (M_oe)(Mee^-1)(M_eo)) * (spinin_dn, spinin_up): + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up = (1+imubar) * spinin_dn - epsbar * spinin_up + // - (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_dn - (M_oe)*nrm*epsbar*(M_eo) * spinin_up + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn = (1-imubar) * spinin_up - epsbar * spinin_dn + // - (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_up - (M_oe)*nrm*epsbar*(M_eo) * spinin_dn + #else + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo1_up, h2d_spin_up, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + diff(g_spinor_field[DUM_MATRIX+4], g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX] , VOLUME/2); + // g_spinor_field[DUM_MATRIX+4] = g_spinor_field[DUM_MATRIX+2] - g_spinor_field[DUM_MATRIX] + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+4], h2d_spin_up, dev_spinsize); + + + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo1_dn, h2d_spin_dn, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + diff(g_spinor_field[DUM_MATRIX+5], g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1], VOLUME/2); + // g_spinor_field[DUM_MATRIX+5] = g_spinor_field[DUM_MATRIX+3] - g_spinor_field[DUM_MATRIX+1] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+5], h2d_spin_dn, dev_spinsize); + #endif + + + + + //////////// + // gamma5 // + //////////// + + #ifndef CHECK_GAMMA5 + dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up + dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn + 
#else + to_host(g_spinor_field[DUM_MATRIX+4], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + gamma5(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+4], VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = gamma5 * g_spinor_field[DUM_MATRIX+4] + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+5], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + gamma5(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+5], VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = gamma5 * g_spinor_field[DUM_MATRIX+5] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + #endif + + + + + + + //////////////////// + // (a,b) -> (b,a) // + //////////////////// + + #ifndef CHECK_COPY + dev_copy_spinor_field<<>>(dev_spin_eo2_dn, spinin_up); // spinin_up = dev_spin_eo2_dn + dev_copy_spinor_field<<>>(dev_spin_eo2_up, spinin_dn); // spinin_dn = dev_spin_eo2_up + #else + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_dn, h2d_spin_up, dev_spinsize); + assign(g_spinor_field[DUM_MATRIX+6], g_spinor_field[DUM_MATRIX+2], VOLUME/2); // g_spinor_field[DUM_MATRIX+6] = g_spinor_field[DUM_MATRIX+2] + to_device(spinin_up, g_spinor_field[DUM_MATRIX+6], h2d_spin_up, dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_up, h2d_spin_dn, dev_spinsize); + assign(g_spinor_field[DUM_MATRIX+7], g_spinor_field[DUM_MATRIX+3], VOLUME/2); // g_spinor_field[DUM_MATRIX+7] = g_spinor_field[DUM_MATRIX+3] + to_device(spinin_dn, g_spinor_field[DUM_MATRIX+7], h2d_spin_dn, dev_spinsize); + #endif + + + + + + + /////////////////////////////////// /////////////////////// + // Q_tilde(2x2) // // (Q_tilde) * (b,a) // + /////////////////////////////////// /////////////////////// + + + //////////////////////////// + // (M_oe) (Mee^-1) (M_eo) // + //////////////////////////// + + #ifndef CHECK_HOPPING_MATRIX + bind_texture_spin(spinin_up,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_up, dev_spin_eo1_up, 
dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * spinin_up + unbind_texture_spin(1); + + bind_texture_spin(spinin_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_dn, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = (M_eo) * spinin_dn + unbind_texture_spin(1); + #else + to_host(g_spinor_field[DUM_MATRIX+7], spinin_up, h2d_spin_up, dev_spinsize); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+7]); // g_spinor_field[DUM_MATRIX] = (M_eo) * g_spinor_field[DUM_MATRIX+7] + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+6], spinin_dn, h2d_spin_dn, dev_spinsize); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+6]); // g_spinor_field[DUM_MATRIX+1] = (M_eo) * g_spinor_field[DUM_MATRIX+6] + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_IMUGAMMA5 + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up = (1 - imubar)*(M_eo) * spinin_up + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn = (1 + imubar)*(M_eo) * spinin_dn + #else + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo1_up, h2d_spin_up, dev_spinsize); + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX]); // g_spinor_field[DUM_MATRIX+2] = (1 - imubar) * g_spinor_field[DUM_MATRIX] + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo1_dn, h2d_spin_dn, dev_spinsize); + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1]); // g_spinor_field[DUM_MATRIX+3] = (1 + imubar) * g_spinor_field[DUM_MATRIX+1] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, 
dev_spinsize); + #endif + + + + + #ifndef CHECK_CUBLAS1 + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn = (1 - imubar)*(M_eo) * spinin_up + epsbar * (M_eo) * spinin_dn + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up = (1 + imubar)*(M_eo) * spinin_dn + epsbar * (M_eo) * spinin_up + + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = nrm * dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * spinin_up + nrm*epsbar*(M_eo) * spinin_dn + + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * spinin_dn + nrm*epsbar*(M_eo) * spinin_up + #else + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo1_dn, h2d_spin_up, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + assign_add_mul_r(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+1], g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+2] = g_spinor_field[DUM_MATRIX+2] + epsbar * g_spinor_field[DUM_MATRIX+1] + mul_r(g_spinor_field[DUM_MATRIX+2], nrm, g_spinor_field[DUM_MATRIX+2], VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = nrm * g_spinor_field[DUM_MATRIX+2] + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + + + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo1_up, h2d_spin_dn, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + assign_add_mul_r(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX] , g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+3] = g_spinor_field[DUM_MATRIX+3] + epsbar * g_spinor_field[DUM_MATRIX] + mul_r(g_spinor_field[DUM_MATRIX+3], nrm, g_spinor_field[DUM_MATRIX+3], VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = nrm * 
g_spinor_field[DUM_MATRIX+3] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3] , h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_HOPPING_MATRIX // remember: this is ((M_oe) (Mee^-1) (M_eo)) * (spinin_up, spinin_dn): + bind_texture_spin(dev_spin_eo2_up,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); + // dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up = (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_up + (M_oe)*nrm*epsbar*(M_eo) * spinin_dn + bind_texture_spin(dev_spin_eo2_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); + // dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn = (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_dn + (M_oe)*nrm*epsbar*(M_eo) * spinin_up + #else + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + Hopping_Matrix(OE, l_strange, g_spinor_field[DUM_MATRIX+2]); // l_strange = (M_oe) * g_spinor_field[DUM_MATRIX+2] + to_device(dev_spin_eo1_up, l_strange, h2d_spin_up, dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + Hopping_Matrix(OE, l_charm , g_spinor_field[DUM_MATRIX+3]); // l_charm = (M_oe) * g_spinor_field[DUM_MATRIX+3] + to_device(dev_spin_eo1_dn, l_charm, h2d_spin_dn, dev_spinsize); + #endif + + + + + //////////// + // (M_oo) // + //////////// + + #ifndef CHECK_IMUGAMMA5 + dev_mul_one_pm_imubar_gamma5<<>>(spinin_up, dev_spin_eo2_up, +1.0); // dev_spin_eo2_up = (1 + imubar) * spinin_up + dev_mul_one_pm_imubar_gamma5<<>>(spinin_dn, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * spinin_dn + #else + to_host(g_spinor_field[DUM_MATRIX+7], spinin_up, h2d_spin_up, dev_spinsize); + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+7]); // g_spinor_field[DUM_MATRIX] = (1 + imubar) * g_spinor_field[DUM_MATRIX+7] + to_device(dev_spin_eo2_up, 
g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+6], spinin_dn, h2d_spin_dn, dev_spinsize); + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+6]); // g_spinor_field[DUM_MATRIX+1] = (1 - imubar) * g_spinor_field[DUM_MATRIX+6] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_CUBLAS2 + // remember: this is (M_oo) * (spinin_up, spinin_dn): + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*spinin_dn = (1+imubar)*spinin_up - epsbar*spinin_dn + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*spinin_up = (1-imubar)*spinin_dn - epsbar*spinin_up + #else + to_host(g_spinor_field[DUM_MATRIX+6], spinin_dn, h2d_spin_up, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + assign_add_mul_r(g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+6], -g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX] = g_spinor_field[DUM_MATRIX] - epsbar * g_spinor_field[DUM_MATRIX+6] + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + + + to_host(g_spinor_field[DUM_MATRIX+7], spinin_up, h2d_spin_dn, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + assign_add_mul_r(g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+7], -g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+1] = g_spinor_field[DUM_MATRIX+1] - epsbar * g_spinor_field[DUM_MATRIX+7] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + #endif + + + + + /////////////////////////////////////// + // (M_oo) - (M_oe) (Mee^-1) (M_eo) // + /////////////////////////////////////// + + #ifndef CHECK_CUBLAS3 + // this is ( (M_oo) - (M_oe) (Mee^-1) (M_eo) ) * 
(spinin_up, spinin_dn) + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up = (1+imubar) * spinin_up - epsbar * spinin_dn + // - (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_up - (M_oe)*nrm*epsbar*(M_eo) * spinin_dn + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn = (1-imubar) * spinin_dn - epsbar * spinin_up + // - (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_dn - (M_oe)*nrm*epsbar*(M_eo) * spinin_up + #else + to_host(l_strange, dev_spin_eo1_up, h2d_spin_up, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + diff(l_strange, g_spinor_field[DUM_MATRIX], l_strange, VOLUME/2); // l_strange = g_spinor_field[DUM_MATRIX] - l_strange + to_device(dev_spin_eo2_up, l_strange, h2d_spin_up, dev_spinsize); + + to_host(l_charm, dev_spin_eo1_dn, h2d_spin_dn, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + diff(l_charm, g_spinor_field[DUM_MATRIX+1], l_charm , VOLUME/2); // l_charm = g_spinor_field[DUM_MATRIX+1] - l_charm + to_device(dev_spin_eo2_dn, l_charm, h2d_spin_dn, dev_spinsize); + #endif + + + + + //////////// + // gamma5 // + //////////// + + #ifndef CHECK_GAMMA5 + dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up + dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn + #else + to_host(l_strange, dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + gamma5(l_strange, l_strange, VOLUME/2); // l_strange = gamma5 * l_strange + to_device(dev_spin_eo2_up, l_strange, h2d_spin_up, dev_spinsize); + + to_host(l_charm, dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + gamma5(l_charm , l_charm , VOLUME/2); // l_charm = gamma5 * l_charm + to_device(dev_spin_eo2_dn, l_charm, h2d_spin_dn, dev_spinsize); + #endif + + + + + /* + 
//////////// + // output // // output is already done by setting dev_spin_eo2_up/dn = spinout_up/dn + //////////// + + dev_copy_spinor_field<<>>(dev_spin_eo2_up, spinout_up); // spinin_up = dev_spin_eo2_up + dev_copy_spinor_field<<>>(dev_spin_eo2_dn, spinout_dn); // spinin_dn = dev_spin_eo2_dn + */ + + + + + /* At the end, the normalisation by the max. eigenvalue */ + /* Twice phmc_invmaxev since we consider here D Ddag !!! */ + /* + #ifndef CHECK_MAXEV + + #else + to_host(l_charm, dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + mul_r(l_charm, phmc_invmaxev*phmc_invmaxev, l_charm, VOLUME/2); + to_device(dev_spin_eo2_up, l_charm, h2d_spin_up, dev_spinsize); + + to_host(l_strange, dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + mul_r(l_strange, phmc_invmaxev*phmc_invmaxev, l_strange, VOLUME/2); + to_device(dev_spin_eo2_dn, l_strange, h2d_spin_dn, dev_spinsize); + #endif + */ + + + + + return; + +}//matrix_debug1() + + + + + + + + +// this replaces Q_Qdagger_ND() for debugging !! + +// RESULT: ALL parts on the GPU are working +// the error has to be in the structure connecting the individual parts + +void matrix_debug2 (spinor * const l_strange, spinor * const l_charm, // output + spinor * const k_strange, spinor * const k_charm) { // input + + + int N_sites = VOLUME/2; // #lattice sites + int N_floats = 24*VOLUME/2; // #floats + + // additional for the debugging purposes + size_t dev_spinsize = 6*VOLUME/2*sizeof(dev_spinor); + + + int gridsize; // auxiliary + int blocksize; // auxiliary + + blocksize = 128; + int blocksize1 = blocksize; // here: dev_zero_spinor_field , dev_copy_spinor_field + int gridsize1 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize2 = blocksize; // passed: dev_Hopping_Matrix + int gridsize2 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize3 = blocksize; // passed: dev_mul_one_pm_imubar_gamma5 + int gridsize3 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize4 = blocksize; // passed: 
dev_gamma5 + int gridsize4 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize5 = blocksize; // passed: dev_copy_spinor_field + int gridsize5 = (int) (VOLUME/2/blocksize) + 1; + + dev_spinor * spinin_up; + dev_spinor * spinin_dn; + dev_spinor * spinout_up; + dev_spinor * spinout_dn; + + + cudaMalloc((void **) &dev_spin_eo2_up, dev_spinsize); + cudaMalloc((void **) &dev_spin_eo2_dn, dev_spinsize); + cudaMalloc((void **) &spinin_up, dev_spinsize); + cudaMalloc((void **) &spinin_dn, dev_spinsize); + cudaMalloc((void **) &spinout_up, dev_spinsize); + cudaMalloc((void **) &spinout_dn, dev_spinsize); + + + + double nrm = 1./(1. + g_mubar*g_mubar - g_epsbar*g_epsbar); // nrm = (1 + mubar^2 - epsbar^2)^-1 + + + + + printf("This is matrix_debug2(). "); + + + + + /* FIRST THE Qhat(2x2)^dagger PART */ // we will apply Qhat(2x2) with charme and strange interchanged + // which is equivalent to apply Qhat(2x2)^dagger + + /* Here the M_oe Mee^-1 M_eo implementation */ + + #ifndef CHECK_HOPPING_MATRIX + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX] , k_charm); // g_spinor_field[DUM_MATRIX] = (M_eo) * k_charm // notice the order + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k_strange); // g_spinor_field[DUM_MATRIX+1] = (M_eo) * k_strange // of k_charm and k_strange + #else + to_device(spinin_dn, k_charm, h2d_spin_up, dev_spinsize); + bind_texture_spin(spinin_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_dn, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * spinin_dn + unbind_texture_spin(1); + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo1_up, h2d_spin_up, dev_spinsize); + + to_device(spinin_up, k_strange, h2d_spin_up, dev_spinsize); + bind_texture_spin(spinin_up,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_up, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = (M_eo) * spinin_up + unbind_texture_spin(1); + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo1_dn, h2d_spin_dn, 
dev_spinsize); + #endif + + + + + #ifndef CHECK_IMUGAMMA5 + // remark: here the factor GAMMA5 is not written: + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX]); // g_spinor_field[DUM_MATRIX+2] = (1 - imubar)*(M_eo) * g_spinor_field[DUM_MATRIX] + // = (1 - imubar)*(M_eo) * k_charm + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1]); // g_spinor_field[DUM_MATRIX+3] = (1 + imubar)*(M_eo) * g_spinor_field[DUM_MATRIX+1] + // = (1 + imubar)*(M_eo) * k_strange + #else + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_CUBLAS1 + assign_add_mul_r(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+1], g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+2] = g_spinor_field[DUM_MATRIX+2] + epsbar*g_spinor_field[DUM_MATRIX+1] + // = (1 - imubar)*(M_eo) * k_charm + epsbar*(M_eo) * k_strange + assign_add_mul_r(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX] , g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+3] = g_spinor_field[DUM_MATRIX+3] + epsbar*g_spinor_field[DUM_MATRIX] + // = (1 + imubar)*(M_eo) * k_strange + epsbar*(M_eo) * k_charm + + mul_r(g_spinor_field[DUM_MATRIX+2], nrm, g_spinor_field[DUM_MATRIX+2], VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = nrm * g_spinor_field[DUM_MATRIX+2] + // = nrm * ( (1 - imubar)*(M_eo) * k_charm + epsbar*(M_eo) * k_strange ) + // = nrm*(1 - imubar)*(M_eo)*k_charm + 
nrm*epsbar*(M_eo)*k_strange + + mul_r(g_spinor_field[DUM_MATRIX+3], nrm, g_spinor_field[DUM_MATRIX+3], VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = nrm * g_spinor_field[DUM_MATRIX+3] + // = nrm * ( (1 + imubar)*(M_eo) * k_strange + epsbar*(M_eo) * k_charm ) + // = nrm*(1 + imubar)*(M_eo)*k_strange + nrm*epsbar*(M_eo)*k_charm + #else + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_up, dev_spinsize); + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = nrm * dev_spin_eo2_up + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], h2d_spin_dn, dev_spinsize); + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_HOPPING_MATRIX + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+2]); // g_spinor_field[DUM_MATRIX] = (M_oe) * g_spinor_field[DUM_MATRIX+2] + // = (M_oe)*nrm*(1 - imubar)*(M_eo)*k_charm + (M_oe)*nrm*epsbar*(M_eo)*k_strange + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+3]); // g_spinor_field[DUM_MATRIX+1] = (M_oe) * g_spinor_field[DUM_MATRIX+3] + // = (M_oe)*nrm*(1 + imubar)*(M_eo)*k_strange + (M_oe)*nrm*epsbar*(M_eo) * k_charm + #else + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, 
dev_spinsize); + bind_texture_spin(dev_spin_eo2_up,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); // dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo1_up, h2d_spin_up, dev_spinsize); + + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + bind_texture_spin(dev_spin_eo2_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); // dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo1_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + /* Here the M_oo implementation */ + + #ifndef CHECK_IMUGAMMA5 + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX+2], k_charm); // g_spinor_field[DUM_MATRIX+2] = (1 + imubar) * k_charm + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+3], k_strange); // g_spinor_field[DUM_MATRIX+3] = (1 - imubar) * k_strange + #else + to_device(spinin_dn, k_charm, h2d_spin_up, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(spinin_dn, dev_spin_eo2_up, +1.0); // dev_spin_eo2_up = (1 + imubar) * spinin_dn + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(spinin_up, k_strange, h2d_spin_dn, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(spinin_up, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * spinin_up + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_CUBLAS2 + assign_add_mul_r(g_spinor_field[DUM_MATRIX+2], k_strange, -g_epsbar, VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = g_spinor_field[DUM_MATRIX+2] - epsbar*k_strange + // = (1 + imubar) * k_charm - epsbar*k_strange + assign_add_mul_r(g_spinor_field[DUM_MATRIX+3], k_charm , -g_epsbar, VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = g_spinor_field[DUM_MATRIX+3] - epsbar*k_charm + 
// = (1 - imubar) * k_strange - epsbar*k_charm + #else + to_device(spinin_up, k_strange, h2d_spin_up, dev_spinsize); + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*spinin_up + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(spinin_dn, k_charm, h2d_spin_dn, dev_spinsize); + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*spinin_dn + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + // here the (M_oo - M_oe Mee^-1 M_eo) implementation + + #ifndef CHECK_CUBLAS3 + diff(g_spinor_field[DUM_MATRIX+4], g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX] , VOLUME/2); + // g_spinor_field[DUM_MATRIX+4] = g_spinor_field[DUM_MATRIX+2] - g_spinor_field[DUM_MATRIX] + // = (1 + imubar) * k_charm - epsbar * k_strange + // - (M_oe)*nrm*(1 - imubar)*(M_eo) * k_charm - (M_oe)*nrm*epsbar*(M_eo) * k_strange + diff(g_spinor_field[DUM_MATRIX+5], g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1], VOLUME/2); + // g_spinor_field[DUM_MATRIX+5] = g_spinor_field[DUM_MATRIX+3] - g_spinor_field[DUM_MATRIX+1] + // = (1 - imubar) * k_strange - epsbar * k_charm + // - (M_oe)*nrm*(1 + imubar)*(M_eo) * k_strange - (M_oe)*nrm*epsbar*(M_eo) * k_charm + #else + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up + to_host(g_spinor_field[DUM_MATRIX+4], dev_spin_eo2_up, h2d_spin_up, 
dev_spinsize); + + + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn + to_host(g_spinor_field[DUM_MATRIX+5], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + /* and finally the GAMMA5 multiplication */ + + #ifndef CHECK_GAMMA5 + gamma5(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+4], VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = gamma5 * g_spinor_field[DUM_MATRIX+4] ?!= l_charm' + gamma5(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+5], VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = gamma5 * g_spinor_field[DUM_MATRIX+5] ?!= l_strange' + #else + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+4], h2d_spin_up, dev_spinsize); + dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+5], h2d_spin_dn, dev_spinsize); + dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + /* The normalisation by the max. eigenvalue is done twice at the end */ // what ?? 
+ + + /* We have to reassigin as follows to avoid overwriting */ + /* Recall in fact that Q^hat = tau_1 Q tau_1 , hence */ + + /* ABOVE: dum_matrix+2 is l_charm goes to dum_matrix+6 :BELOW */ + /* ABOVE: dum_matrix+3 is l_strange goes to dum_matrix+7 :BELOW */ + + + + + #ifndef CHECK_COPY + assign(g_spinor_field[DUM_MATRIX+6], g_spinor_field[DUM_MATRIX+2], VOLUME/2); // g_spinor_field[DUM_MATRIX+6] = g_spinor_field[DUM_MATRIX+2] ?!= l_charm' + assign(g_spinor_field[DUM_MATRIX+7], g_spinor_field[DUM_MATRIX+3], VOLUME/2); // g_spinor_field[DUM_MATRIX+7] = g_spinor_field[DUM_MATRIX+3] ?!= l_strange' + #else + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + dev_copy_spinor_field<<>>(dev_spin_eo2_dn, spinin_up); // spinin_up = dev_spin_eo2_dn + to_host(g_spinor_field[DUM_MATRIX+6], spinin_up, h2d_spin_up, dev_spinsize); + + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + dev_copy_spinor_field<<>>(dev_spin_eo2_up, spinin_dn); // spinin_dn = dev_spin_eo2_up + to_host(g_spinor_field[DUM_MATRIX+7], spinin_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + /* AND THEN THE Qhat(2x2) PART */ // notice the swapping ! 
+ + + /* Here the M_oe Mee^-1 M_eo implementation */ // SWAP: + + #ifndef CHECK_HOPPING_MATRIX + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+7]); // g_spinor_field[DUM_MATRIX] = (M_eo) * g_spinor_field[DUM_MATRIX+7] = (M_eo) * l_strange' // notice the order + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+6]); // g_spinor_field[DUM_MATRIX+1] = (M_eo) * g_spinor_field[DUM_MATRIX+6] = (M_eo) * l_charm' // of l_strange and l_charm + #else + to_device(spinin_up, g_spinor_field[DUM_MATRIX+7], h2d_spin_up, dev_spinsize); + bind_texture_spin(spinin_up,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_up, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * spinin_up + unbind_texture_spin(1); + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo1_up, h2d_spin_up, dev_spinsize); + + to_device(spinin_dn, g_spinor_field[DUM_MATRIX+6], h2d_spin_dn, dev_spinsize); + bind_texture_spin(spinin_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_dn, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = (M_eo) * spinin_dn + unbind_texture_spin(1); + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo1_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_IMUGAMMA5 + // remark: here we don't need g_mu = -g_mu // remark: here the factor GAMMA5 is not written: + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX]); // g_spinor_field[DUM_MATRIX+2] = (1 - imubar) * g_spinor_field[DUM_MATRIX] = (1 - imubar)*(M_eo) * l_strange + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1]); // g_spinor_field[DUM_MATRIX+3] = (1 + imubar) * g_spinor_field[DUM_MATRIX+1] = (1 + imubar)*(M_eo) * l_charm + #else + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up + 
to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + #ifndef CHECK_CUBLAS1 + assign_add_mul_r(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+1], g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+2] = g_spinor_field[DUM_MATRIX+2] + epsbar * g_spinor_field[DUM_MATRIX+1] + // = (1 - imubar)*(M_eo)*l_strange + epsbar * (M_eo) * l_charm + assign_add_mul_r(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX] , g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+3] = g_spinor_field[DUM_MATRIX+3] + epsbar * g_spinor_field[DUM_MATRIX] + // = (1 + imubar)*(M_eo)*l_charm + epsbar * (M_eo) * l_strange + + mul_r(g_spinor_field[DUM_MATRIX+2], nrm, g_spinor_field[DUM_MATRIX+2], VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = nrm * g_spinor_field[DUM_MATRIX+2] = nrm*(1 - imubar)*(M_eo)*l_strange + nrm*epsbar*(M_eo)*l_charm + mul_r(g_spinor_field[DUM_MATRIX+3], nrm, g_spinor_field[DUM_MATRIX+3], VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = nrm * g_spinor_field[DUM_MATRIX+3] = nrm*(1 + imubar)*(M_eo)*l_charm + nrm*epsbar*(M_eo)*l_strange + #else + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_up, dev_spinsize); + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = nrm * dev_spin_eo2_up + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], 
h2d_spin_dn, dev_spinsize); + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_HOPPING_MATRIX + Hopping_Matrix(OE, l_strange, g_spinor_field[DUM_MATRIX+2]); // l_strange = (M_oe) * g_spinor_field[DUM_MATRIX+2] = (M_oe)*nrm*(1 - imubar)*(M_eo)*l_strange + (M_oe)*nrm*epsbar*(M_eo)*l_charm + Hopping_Matrix(OE, l_charm , g_spinor_field[DUM_MATRIX+3]); // l_charm = (M_oe) * g_spinor_field[DUM_MATRIX+3] = (M_oe)*nrm*(1 + imubar)*(M_eo)*l_charm + (M_oe)*nrm*epsbar*(M_eo)*l_strange + #else + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + bind_texture_spin(dev_spin_eo2_up,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); // dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up + to_host(l_strange, dev_spin_eo1_up, h2d_spin_up, dev_spinsize); + + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + bind_texture_spin(dev_spin_eo2_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); // dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn + to_host(l_charm, dev_spin_eo1_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + /* Here the M_oo implementation */ + + #ifndef CHECK_IMUGAMMA5 + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+7]); // g_spinor_field[DUM_MATRIX] = (1 + imubar) * g_spinor_field[DUM_MATRIX+7] = (1 + imubar) * l_strange + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+1], 
g_spinor_field[DUM_MATRIX+6]); // g_spinor_field[DUM_MATRIX+1] = (1 - imubar) * g_spinor_field[DUM_MATRIX+6] = (1 - imubar) * l_charm + #else + to_device(spinin_up, g_spinor_field[DUM_MATRIX+7], h2d_spin_up, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(spinin_up, dev_spin_eo2_up, +1.0); // dev_spin_eo2_up = (1 + imubar) * spinin_up + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(spinin_dn, g_spinor_field[DUM_MATRIX+6], h2d_spin_dn, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(spinin_dn, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * spinin_dn + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_CUBLAS2 + assign_add_mul_r(g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+6], -g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX] = g_spinor_field[DUM_MATRIX] - epsbar * g_spinor_field[DUM_MATRIX+6] + // = (1 + imubar) * l_strange - epsbar * l_charm + assign_add_mul_r(g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+7], -g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+1] = g_spinor_field[DUM_MATRIX+1] - epsbar * g_spinor_field[DUM_MATRIX+7] + // = (1 - imubar) * l_charm - epsbar * l_strange + #else + to_device(spinin_dn, g_spinor_field[DUM_MATRIX+6], h2d_spin_up, dev_spinsize); + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*spinin_dn + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(spinin_up, g_spinor_field[DUM_MATRIX+7], h2d_spin_dn, dev_spinsize); + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*spinin_up + 
to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + // here the (M_oo - M_oe Mee^-1 M_eo) implementation + + #ifndef CHECK_CUBLAS3 + diff(l_strange, g_spinor_field[DUM_MATRIX] , l_strange, VOLUME/2); // l_strange = g_spinor_field[DUM_MATRIX] - l_strange + // = (1 + imubar) * l_strange - epsbar * l_charm + // - (M_oe)*nrm*(1 - imubar)*(M_eo) * l_strange + (M_oe)*nrm*epsbar*(M_eo) * l_charm + + diff(l_charm , g_spinor_field[DUM_MATRIX+1], l_charm , VOLUME/2); // l_charm = g_spinor_field[DUM_MATRIX+1] - l_charm + // = (1 - imubar) * l_charm - epsbar * l_strange + // - (M_oe)*nrm*(1 + imubar)*(M_eo) * l_charm + (M_oe)*nrm*epsbar*(M_eo) * l_strange + #else + to_device(dev_spin_eo1_up, l_strange, h2d_spin_up, dev_spinsize); + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up + to_host(l_strange, dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(dev_spin_eo1_dn, l_charm, h2d_spin_dn, dev_spinsize); + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn + to_host(l_charm, dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + /* and finally the GAMMA5 multiplication */ + + #ifndef CHECK_GAMMA5 + gamma5(l_strange, l_strange, VOLUME/2); // l_strange = gamma5 * l_strange + gamma5(l_charm , l_charm , VOLUME/2); // l_charm = gamma5 * l_charm + #else + to_device(dev_spin_eo2_up, l_strange, h2d_spin_up, dev_spinsize); + dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up + to_host(l_strange, dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(dev_spin_eo2_dn, l_charm, h2d_spin_dn, dev_spinsize); + 
dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn + to_host(l_charm, dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + /* At the end, the normalisation by the max. eigenvalue */ + /* Twice phmc_invmaxev since we consider here D Ddag !!! */ + mul_r(l_charm, phmc_invmaxev*phmc_invmaxev, l_charm, VOLUME/2); + mul_r(l_strange, phmc_invmaxev*phmc_invmaxev, l_strange, VOLUME/2); + return; + +}//matrix_debug2() + + + + + + + +
// this replaces Q_Qdagger_ND() for debugging !!
//
// Zwitter1(): mixed GPU/CPU evaluation.
//   1. The first operator application runs on the device (announced by the
//      "GPU. " print) on the pair (spinin_dn, spinin_up) = (k_charm, k_strange).
//   2. The two results are swapped, (a,b) -> (b,a), and copied back to the host
//      into g_spinor_field[DUM_MATRIX+7] / g_spinor_field[DUM_MATRIX+6].
//   3. The second application (the "Qhat(2x2)" part) runs on the host ("CPU. ").
//   4. The result is scaled by phmc_invmaxev^2.
//
//   l_strange, l_charm : output spinor fields (host)
//   k_strange, k_charm : input  spinor fields (host)
//
// Side effects visible in this function:
//   - g_spinor_field[DUM_MATRIX], [+1], [+2], [+3], [+6] and [+7] are overwritten
//   - dev_spin_eo1_up/dn and dev_spin_eo2_up/dn are used as device work buffers
//   - progress text is printed to stdout
//
// NOTE(review): dev_spin_eo2_up/dn are not declared in this function, yet they
// are (re)allocated here on every call. They are deliberately NOT freed at the
// end because other code may keep using them; if they are already allocated
// elsewhere, the cudaMalloc() calls below should be removed -- confirm.
//
// NOTE(review): the <<<grid, block>>> launch configurations were empty ("<<>>")
// in this copy of the source. They are restored from the per-kernel grid/block
// sizes this function computes for exactly these kernels (all of them are
// 128 threads per block) -- confirm against the upstream file.

void Zwitter1 (spinor * const l_strange, spinor * const l_charm,      // output
               spinor * const k_strange, spinor * const k_charm) {    // input

  const int    N_floats     = 24*VOLUME/2;                       // #floats handed to the CUBLAS calls
  const size_t dev_spinsize = 6*VOLUME/2*sizeof(dev_spinor);     // size in bytes of one device buffer

  // launch configurations: 128 threads per block, (VOLUME/2)/128 + 1 blocks
  const int blocksize2 = 128;                                    // dev_Hopping_Matrix
  const int gridsize2  = (int) (VOLUME/2/blocksize2) + 1;
  const int blocksize3 = 128;                                    // dev_mul_one_pm_imubar_gamma5
  const int gridsize3  = (int) (VOLUME/2/blocksize3) + 1;
  const int blocksize4 = 128;                                    // dev_gamma5
  const int gridsize4  = (int) (VOLUME/2/blocksize4) + 1;
  const int blocksize5 = 128;                                    // dev_copy_spinor_field
  const int gridsize5  = (int) (VOLUME/2/blocksize5) + 1;

  // device copies of the input; owned by this function and released before returning
  dev_spinor * spinin_up = NULL;
  dev_spinor * spinin_dn = NULL;

  // Launching kernels on unallocated buffers would only hide the real problem,
  // so bail out when any allocation fails.
  if (   cudaMalloc((void **) &dev_spin_eo2_up, dev_spinsize) != cudaSuccess
      || cudaMalloc((void **) &dev_spin_eo2_dn, dev_spinsize) != cudaSuccess
      || cudaMalloc((void **) &spinin_up,       dev_spinsize) != cudaSuccess
      || cudaMalloc((void **) &spinin_dn,       dev_spinsize) != cudaSuccess) {
    fprintf(stderr, "Zwitter1(): cudaMalloc of the device spinor buffers failed.\n");
    cudaFree(spinin_up);                                         // cudaFree(NULL) is a no-op
    cudaFree(spinin_dn);
    return;
  }

  double nrm = 1./(1. + g_mubar*g_mubar - g_epsbar*g_epsbar);    // nrm = (1 + mubar^2 - epsbar^2)^-1


  printf("This is Zwitter1(). ");


  // notice the order: spinin_dn receives k_charm, spinin_up receives k_strange
  // (the h2d_spin_* arguments are presumably host staging buffers -- confirm)
  to_device(spinin_dn, k_charm,   h2d_spin_up, dev_spinsize);
  to_device(spinin_up, k_strange, h2d_spin_dn, dev_spinsize);


  ////////////////////////////
  // (M_oe) (Mee^-1) (M_eo) //
  ////////////////////////////

  printf("GPU. ");

  // dev_spin_eo1_up = (M_eo) * spinin_dn
  bind_texture_spin(spinin_dn,1);
  dev_Hopping_Matrix<<<gridsize2, blocksize2>>>(dev_gf, spinin_dn, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0);
  unbind_texture_spin(1);

  // dev_spin_eo1_dn = (M_eo) * spinin_up
  bind_texture_spin(spinin_up,1);
  dev_Hopping_Matrix<<<gridsize2, blocksize2>>>(dev_gf, spinin_up, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0);
  unbind_texture_spin(1);

  // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up
  // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn
  dev_mul_one_pm_imubar_gamma5<<<gridsize3, blocksize3>>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0);
  dev_mul_one_pm_imubar_gamma5<<<gridsize3, blocksize3>>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0);

  // dev_spin_eo2_up += epsbar * dev_spin_eo1_dn
  // dev_spin_eo2_dn += epsbar * dev_spin_eo1_up
  cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1);
  cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1);

  // dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * spinin_dn + nrm*epsbar*(M_eo) * spinin_up
  // dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * spinin_up + nrm*epsbar*(M_eo) * spinin_dn
  cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1);
  cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1);

  // this is ((M_oe)(Mee^-1)(M_eo)) * (spinin_dn, spinin_up):
  // dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up
  bind_texture_spin(dev_spin_eo2_up,1);
  dev_Hopping_Matrix<<<gridsize2, blocksize2>>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1);
  unbind_texture_spin(1);

  // dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn
  bind_texture_spin(dev_spin_eo2_dn,1);
  dev_Hopping_Matrix<<<gridsize2, blocksize2>>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1);
  unbind_texture_spin(1);


  ////////////
  // (M_oo) //
  ////////////

  // dev_spin_eo2_up = (1 + imubar) * spinin_dn
  // dev_spin_eo2_dn = (1 - imubar) * spinin_up
  dev_mul_one_pm_imubar_gamma5<<<gridsize3, blocksize3>>>(spinin_dn, dev_spin_eo2_up, +1.0);
  dev_mul_one_pm_imubar_gamma5<<<gridsize3, blocksize3>>>(spinin_up, dev_spin_eo2_dn, -1.0);

  // this is (M_oo) * (spinin_dn, spinin_up):
  // dev_spin_eo2_up = (1+imubar)*spinin_dn - epsbar*spinin_up
  // dev_spin_eo2_dn = (1-imubar)*spinin_up - epsbar*spinin_dn
  cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_up, 1);
  cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_dn, 1);


  ///////////////////////////////////////
  // (M_oo) - (M_oe) (Mee^-1) (M_eo)   //
  ///////////////////////////////////////

  // dev_spin_eo2_up = (1+imubar) * spinin_dn - epsbar * spinin_up
  //                 - (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_dn - (M_oe)*nrm*epsbar*(M_eo) * spinin_up
  cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1);
  // dev_spin_eo2_dn = (1-imubar) * spinin_up - epsbar * spinin_dn
  //                 - (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_up - (M_oe)*nrm*epsbar*(M_eo) * spinin_dn
  cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1);


  ////////////
  // gamma5 //
  ////////////

  dev_gamma5<<<gridsize4, blocksize4>>>(dev_spin_eo2_up, dev_spin_eo2_up);   // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up
  dev_gamma5<<<gridsize4, blocksize4>>>(dev_spin_eo2_dn, dev_spin_eo2_dn);   // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn


  ////////////////////
  // (a,b) -> (b,a) //
  ////////////////////
  dev_copy_spinor_field<<<gridsize5, blocksize5>>>(dev_spin_eo2_dn, spinin_up);   // spinin_up = dev_spin_eo2_dn
  dev_copy_spinor_field<<<gridsize5, blocksize5>>>(dev_spin_eo2_up, spinin_dn);   // spinin_dn = dev_spin_eo2_up


  // hand the swapped device result to the host part: [+7] plays l_strange', [+6] plays l_charm'
  to_host(g_spinor_field[DUM_MATRIX+7], spinin_up, h2d_spin_up, dev_spinsize);
  to_host(g_spinor_field[DUM_MATRIX+6], spinin_dn, h2d_spin_dn, dev_spinsize);


  /* AND THEN THE Qhat(2x2) PART */                      // notice the swapping !

  printf("CPU. ");

  /* Here the M_oe Mee^-1 M_eo implementation */
  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX]  , g_spinor_field[DUM_MATRIX+7]);   // [0] = (M_eo) * l_strange'
  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+6]);   // [1] = (M_eo) * l_charm'

  // remark: here the factor GAMMA5 is not written
  mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX]);   // [2] = (1 - imubar) * [0]
  mul_one_plus_imubar (g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1]); // [3] = (1 + imubar) * [1]

  assign_add_mul_r(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+1], g_epsbar, VOLUME/2);   // [2] += epsbar * [1]
  assign_add_mul_r(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX]  , g_epsbar, VOLUME/2);   // [3] += epsbar * [0]

  mul_r(g_spinor_field[DUM_MATRIX+2], nrm, g_spinor_field[DUM_MATRIX+2], VOLUME/2);   // [2] = nrm * [2]
  mul_r(g_spinor_field[DUM_MATRIX+3], nrm, g_spinor_field[DUM_MATRIX+3], VOLUME/2);   // [3] = nrm * [3]

  // l_strange = (M_oe)*nrm*(1 - imubar)*(M_eo)*l_strange' + (M_oe)*nrm*epsbar*(M_eo)*l_charm'
  Hopping_Matrix(OE, l_strange, g_spinor_field[DUM_MATRIX+2]);
  // l_charm   = (M_oe)*nrm*(1 + imubar)*(M_eo)*l_charm'   + (M_oe)*nrm*epsbar*(M_eo)*l_strange'
  Hopping_Matrix(OE, l_charm  , g_spinor_field[DUM_MATRIX+3]);

  /* Here the M_oo implementation */
  mul_one_plus_imubar (g_spinor_field[DUM_MATRIX]  , g_spinor_field[DUM_MATRIX+7]);   // [0] = (1 + imubar) * l_strange'
  mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+6]);   // [1] = (1 - imubar) * l_charm'

  assign_add_mul_r(g_spinor_field[DUM_MATRIX]  , g_spinor_field[DUM_MATRIX+6], -g_epsbar, VOLUME/2);   // [0] -= epsbar * l_charm'
  assign_add_mul_r(g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+7], -g_epsbar, VOLUME/2);   // [1] -= epsbar * l_strange'

  // here the (M_oo - M_oe Mee^-1 M_eo) implementation
  // l_strange = (1 + imubar) * l_strange' - epsbar * l_charm'
  //           - (M_oe)*nrm*(1 - imubar)*(M_eo) * l_strange' - (M_oe)*nrm*epsbar*(M_eo) * l_charm'
  diff(l_strange, g_spinor_field[DUM_MATRIX]  , l_strange, VOLUME/2);
  // l_charm   = (1 - imubar) * l_charm' - epsbar * l_strange'
  //           - (M_oe)*nrm*(1 + imubar)*(M_eo) * l_charm' - (M_oe)*nrm*epsbar*(M_eo) * l_strange'
  diff(l_charm  , g_spinor_field[DUM_MATRIX+1], l_charm  , VOLUME/2);

  /* and finally the GAMMA5 multiplication */
  gamma5(l_strange, l_strange, VOLUME/2);   // l_strange = gamma5 * l_strange
  gamma5(l_charm  , l_charm  , VOLUME/2);   // l_charm   = gamma5 * l_charm

  /* At the end, the normalisation by the max. eigenvalue */
  /* Twice phmc_invmaxev since we consider here D Ddag !!! */
  mul_r(l_charm,   phmc_invmaxev*phmc_invmaxev, l_charm,   VOLUME/2);
  mul_r(l_strange, phmc_invmaxev*phmc_invmaxev, l_strange, VOLUME/2);

  // release the device buffers owned by this call (they used to leak on every call)
  cudaFree(spinin_up);
  cudaFree(spinin_dn);

  return;
}//Zwitter1()
+ + + + + + + + +// this replaces Q_Qdagger_ND() for debugging !! + +void Zwitter2 (spinor * const l_strange, spinor * const l_charm, // output + spinor * const k_strange, spinor * const k_charm) { // input + + + int N_sites = VOLUME/2; // #lattice sites + int N_floats = 24*VOLUME/2; // #floats + + // additional for the debugging purposes + size_t dev_spinsize = 6*VOLUME/2*sizeof(dev_spinor); + + + int gridsize; // auxiliary + int blocksize; // auxiliary + + blocksize = 128; + int blocksize1 = blocksize; // here: dev_zero_spinor_field , dev_copy_spinor_field + int gridsize1 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize2 = blocksize; // passed: dev_Hopping_Matrix + int gridsize2 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize3 = blocksize; // passed: dev_mul_one_pm_imubar_gamma5 + int gridsize3 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize4 = blocksize; // passed: dev_gamma5 + int gridsize4 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize5 = blocksize; // passed: dev_copy_spinor_field + int gridsize5 = (int) (VOLUME/2/blocksize) + 1; + + dev_spinor * spinin_up; + dev_spinor * spinin_dn; + dev_spinor * spinout_up; + dev_spinor * spinout_dn; + + + cudaMalloc((void **) &dev_spin_eo2_up, dev_spinsize); + cudaMalloc((void **) &dev_spin_eo2_dn, dev_spinsize); + cudaMalloc((void **) &spinin_up, dev_spinsize); + cudaMalloc((void **) &spinin_dn, dev_spinsize); + cudaMalloc((void **) &spinout_up, dev_spinsize); + cudaMalloc((void **) &spinout_dn, dev_spinsize); + + + double nrm = 1./(1.
+ g_mubar*g_mubar - g_epsbar*g_epsbar);   // nrm = (1 + mubar^2 - epsbar^2)^-1

  // Remainder of Zwitter2() (debug stand-in for Q_Qdagger_ND()):
  //   1. The first operator application runs on the host ("CPU. ") with
  //      k_charm and k_strange interchanged.
  //   2. Its result is handed to the device through
  //      g_spinor_field[DUM_MATRIX+6] / g_spinor_field[DUM_MATRIX+7].
  //   3. The second application runs on the device ("GPU. ").
  //   4. The device result is copied into l_strange / l_charm.
  //
  // Side effects visible here: g_spinor_field[DUM_MATRIX] .. [DUM_MATRIX+7] are
  // overwritten, dev_spin_eo1_up/dn and dev_spin_eo2_up/dn are used as device
  // work buffers, and progress text is printed to stdout.
  //
  // NOTE(review): the <<<grid, block>>> launch configurations were empty ("<<>>")
  // in this copy of the source. They are restored from the per-kernel
  // gridsizeN/blocksizeN locals declared at the top of this function -- confirm
  // against the upstream file.


  printf("This is Zwitter2(). ");


  /* FIRST THE Qhat(2x2)^dagger PART */      // we will apply Qhat(2x2) with charm and strange interchanged
                                             // which is equivalent to apply Qhat(2x2)^dagger

  printf("CPU. ");

  /* Here the M_oe Mee^-1 M_eo implementation */              // notice the order of k_charm and k_strange
  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX]  , k_charm);     // [0] = (M_eo) * k_charm
  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k_strange);   // [1] = (M_eo) * k_strange

  // remark: here the factor GAMMA5 is not written
  mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX]);     // [2] = (1 - imubar) * [0] = (1 - imubar)*(M_eo) * k_charm
  mul_one_plus_imubar (g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1]);   // [3] = (1 + imubar) * [1] = (1 + imubar)*(M_eo) * k_strange

  assign_add_mul_r(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+1], g_epsbar, VOLUME/2);   // [2] += epsbar * [1]
  assign_add_mul_r(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX]  , g_epsbar, VOLUME/2);   // [3] += epsbar * [0]

  // [2] = nrm*(1 - imubar)*(M_eo)*k_charm   + nrm*epsbar*(M_eo)*k_strange
  mul_r(g_spinor_field[DUM_MATRIX+2], nrm, g_spinor_field[DUM_MATRIX+2], VOLUME/2);
  // [3] = nrm*(1 + imubar)*(M_eo)*k_strange + nrm*epsbar*(M_eo)*k_charm
  mul_r(g_spinor_field[DUM_MATRIX+3], nrm, g_spinor_field[DUM_MATRIX+3], VOLUME/2);

  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX]  , g_spinor_field[DUM_MATRIX+2]);   // [0] = (M_oe) * [2]
  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+3]);   // [1] = (M_oe) * [3]

  /* Here the M_oo implementation */
  mul_one_plus_imubar (g_spinor_field[DUM_MATRIX+2], k_charm);     // [2] = (1 + imubar) * k_charm
  mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+3], k_strange);   // [3] = (1 - imubar) * k_strange

  assign_add_mul_r(g_spinor_field[DUM_MATRIX+2], k_strange, -g_epsbar, VOLUME/2);   // [2] = (1 + imubar) * k_charm   - epsbar*k_strange
  assign_add_mul_r(g_spinor_field[DUM_MATRIX+3], k_charm  , -g_epsbar, VOLUME/2);   // [3] = (1 - imubar) * k_strange - epsbar*k_charm

  // here the (M_oo - M_oe Mee^-1 M_eo) implementation
  // [4] = (1 + imubar) * k_charm - epsbar * k_strange
  //     - (M_oe)*nrm*(1 - imubar)*(M_eo) * k_charm - (M_oe)*nrm*epsbar*(M_eo) * k_strange
  diff(g_spinor_field[DUM_MATRIX+4], g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX]  , VOLUME/2);
  // [5] = (1 - imubar) * k_strange - epsbar * k_charm
  //     - (M_oe)*nrm*(1 + imubar)*(M_eo) * k_strange - (M_oe)*nrm*epsbar*(M_eo) * k_charm
  diff(g_spinor_field[DUM_MATRIX+5], g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1], VOLUME/2);

  /* and finally the GAMMA5 multiplication */
  gamma5(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+4], VOLUME/2);   // [2] = gamma5 * [4]   ?!= l_charm'
  gamma5(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+5], VOLUME/2);   // [3] = gamma5 * [5]   ?!= l_strange'

  /* The normalisation by the max. eigenvalue is done twice at the end */         // what ??

  /* We have to reassign as follows to avoid overwriting   */
  /* Recall in fact that   Q^hat = tau_1 Q tau_1 , hence   */

  /* ABOVE: dum_matrix+2  is  l_charm    goes to  dum_matrix+6 :BELOW */
  /* ABOVE: dum_matrix+3  is  l_strange  goes to  dum_matrix+7 :BELOW */
  assign(g_spinor_field[DUM_MATRIX+6], g_spinor_field[DUM_MATRIX+2], VOLUME/2);   // [6] = [2]   ?!= l_charm'
  assign(g_spinor_field[DUM_MATRIX+7], g_spinor_field[DUM_MATRIX+3], VOLUME/2);   // [7] = [3]   ?!= l_strange'


  // hand the host result to the device part
  to_device(spinin_up, g_spinor_field[DUM_MATRIX+7], h2d_spin_up, dev_spinsize);
  to_device(spinin_dn, g_spinor_field[DUM_MATRIX+6], h2d_spin_dn, dev_spinsize);


  ///////////////////////////////////        ///////////////////////
  //        Q_tilde(2x2)           //        // (Q_tilde) * (b,a) //
  ///////////////////////////////////        ///////////////////////

  ////////////////////////////
  // (M_oe) (Mee^-1) (M_eo) //
  ////////////////////////////

  printf("GPU. ");

  // dev_spin_eo1_up = (M_eo) * spinin_up
  bind_texture_spin(spinin_up,1);
  dev_Hopping_Matrix<<<gridsize2, blocksize2>>>(dev_gf, spinin_up, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0);
  unbind_texture_spin(1);

  // dev_spin_eo1_dn = (M_eo) * spinin_dn
  bind_texture_spin(spinin_dn,1);
  dev_Hopping_Matrix<<<gridsize2, blocksize2>>>(dev_gf, spinin_dn, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0);
  unbind_texture_spin(1);

  // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up
  // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn
  dev_mul_one_pm_imubar_gamma5<<<gridsize3, blocksize3>>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0);
  dev_mul_one_pm_imubar_gamma5<<<gridsize3, blocksize3>>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0);

  // dev_spin_eo2_up += epsbar * dev_spin_eo1_dn
  // dev_spin_eo2_dn += epsbar * dev_spin_eo1_up
  cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1);
  cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1);

  // dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * spinin_up + nrm*epsbar*(M_eo) * spinin_dn
  // dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * spinin_dn + nrm*epsbar*(M_eo) * spinin_up
  cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1);
  cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1);

  // this is ((M_oe) (Mee^-1) (M_eo)) * (spinin_up, spinin_dn):
  // dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up
  bind_texture_spin(dev_spin_eo2_up,1);
  dev_Hopping_Matrix<<<gridsize2, blocksize2>>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1);
  unbind_texture_spin(1);

  // dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn
  bind_texture_spin(dev_spin_eo2_dn,1);
  dev_Hopping_Matrix<<<gridsize2, blocksize2>>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1);
  unbind_texture_spin(1);


  ////////////
  // (M_oo) //
  ////////////

  // dev_spin_eo2_up = (1 + imubar) * spinin_up
  // dev_spin_eo2_dn = (1 - imubar) * spinin_dn
  dev_mul_one_pm_imubar_gamma5<<<gridsize3, blocksize3>>>(spinin_up, dev_spin_eo2_up, +1.0);
  dev_mul_one_pm_imubar_gamma5<<<gridsize3, blocksize3>>>(spinin_dn, dev_spin_eo2_dn, -1.0);

  // this is (M_oo) * (spinin_up, spinin_dn):
  // dev_spin_eo2_up = (1+imubar)*spinin_up - epsbar*spinin_dn
  // dev_spin_eo2_dn = (1-imubar)*spinin_dn - epsbar*spinin_up
  cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_up, 1);
  cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_dn, 1);


  ///////////////////////////////////////
  // (M_oo) - (M_oe) (Mee^-1) (M_eo)   //
  ///////////////////////////////////////

  // dev_spin_eo2_up = (1+imubar) * spinin_up - epsbar * spinin_dn
  //                 - (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_up - (M_oe)*nrm*epsbar*(M_eo) * spinin_dn
  cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1);
  // dev_spin_eo2_dn = (1-imubar) * spinin_dn - epsbar * spinin_up
  //                 - (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_dn - (M_oe)*nrm*epsbar*(M_eo) * spinin_up
  cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1);


  ////////////
  // gamma5 //
  ////////////

  dev_gamma5<<<gridsize4, blocksize4>>>(dev_spin_eo2_up, dev_spin_eo2_up);   // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up
  dev_gamma5<<<gridsize4, blocksize4>>>(dev_spin_eo2_dn, dev_spin_eo2_dn);   // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn


  to_host(l_strange, dev_spin_eo2_up, h2d_spin_up, dev_spinsize);
  to_host(l_charm,   dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize);

  // NOTE(review): unlike Zwitter1(), no phmc_invmaxev*phmc_invmaxev normalisation is
  // applied to l_strange / l_charm here, although the comment in the CPU part
  // announces one "at the end" -- confirm whether that is intended.

  // Release the device buffers allocated at the top of this function (they used to
  // leak on every call). dev_spin_eo2_up/dn are not declared in this function and
  // may still be used by other code, so they are left alone.
  cudaFree(spinin_up);
  cudaFree(spinin_dn);
  cudaFree(spinout_up);
  cudaFree(spinout_dn);

  return;
}//Zwitter2()
+ + + + + + + + + +// this replaces Q_Qdagger_ND() for debugging !! + +void Zwitter3 (spinor * const l_strange, spinor * const l_charm, // output + spinor * const k_strange, spinor * const k_charm) { // input + + + int N_sites = VOLUME/2; // #lattice sites + int N_floats = 24*VOLUME/2; // #floats + + // additional for the debugging purposes + size_t dev_spinsize = 6*VOLUME/2*sizeof(dev_spinor); + + + int gridsize; // auxiliary + int blocksize; // auxiliary + + blocksize = 128; + int blocksize1 = blocksize; // here: dev_zero_spinor_field , dev_copy_spinor_field + int gridsize1 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize2 = blocksize; // passed: dev_Hopping_Matrix + int gridsize2 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize3 = blocksize; // passed: dev_mul_one_pm_imubar_gamma5 + int gridsize3 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize4 = blocksize; // passed: dev_gamma5 + int gridsize4 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize5 = blocksize; // passed: dev_copy_spinor_field + int gridsize5 = (int) (VOLUME/2/blocksize) + 1; + + /* + printf("gridsize1 = %i, blocksize1 = %i\n", gridsize1, blocksize1); + printf("gridsize2 = %i, blocksize2 = %i\n", gridsize2, blocksize2); + printf("gridsize3 = %i, blocksize3 = %i\n", gridsize3, blocksize3); + printf("gridsize4 = %i, blocksize4 = %i\n", gridsize4, blocksize4); + printf("gridsize5 = %i, blocksize5 = %i\n", gridsize5, blocksize5); + */ + + dev_spinor * spinin_up; + dev_spinor * spinin_dn; + dev_spinor * spinout_up; + dev_spinor * spinout_dn; + + + cudaMalloc((void **) &dev_spin_eo2_up, dev_spinsize); + cudaMalloc((void **)
&dev_spin_eo2_dn, dev_spinsize); + cudaMalloc((void **) &spinin_up, dev_spinsize); + cudaMalloc((void **) &spinin_dn, dev_spinsize); + cudaMalloc((void **) &spinout_up, dev_spinsize); + cudaMalloc((void **) &spinout_dn, dev_spinsize); + + + double nrm = 1./(1. + g_mubar*g_mubar - g_epsbar*g_epsbar); // nrm = (1 + mubar^2 - epsbar^2)^-1 + + + + + printf("This is Zwitter3(). "); + + + + + to_device(spinin_up, k_strange, h2d_spin_dn, dev_spinsize); + to_device(spinin_dn, k_charm , h2d_spin_up, dev_spinsize); + + + + + /////////////////////////////////////// ///////////////////////////////// + // Q_tilde_dagger(2x2) // // (a,b) = (Q_tilde) * (dn,up) // + /////////////////////////////////////// ///////////////////////////////// + + + //////////////////////////// + // (M_oe) (Mee^-1) (M_eo) // + //////////////////////////// + + printf("GPU. "); + + // Flo: + bind_texture_spin(spinin_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_dn, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * spinin_dn + unbind_texture_spin(1); + + bind_texture_spin(spinin_up,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_up, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = (M_eo) * spinin_up + unbind_texture_spin(1); + + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up = (1 - imubar)*(M_eo) * spinin_dn + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn = (1 + imubar)*(M_eo) * spinin_up + + + // CUBLAS: + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn = (1 - imubar)*(M_eo) * spinin_dn + epsbar * (M_eo) * spinin_up + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn 
+ epsbar * dev_spin_eo1_up = (1 + imubar)*(M_eo) * spinin_up + epsbar * (M_eo) * spinin_dn + + // CUBLAS: + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = nrm * dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * spinin_dn + nrm*epsbar*(M_eo) * spinin_up + + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * spinin_up + nrm*epsbar*(M_eo) * spinin_dn + + + // Flo: // remember: this is ((M_oe)(Mee^-1)(M_eo)) * (spinin_dn, spinin_up): + bind_texture_spin(dev_spin_eo2_up,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); + // dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up = (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_dn + (M_oe)*nrm*epsbar*(M_eo) * spinin_up + bind_texture_spin(dev_spin_eo2_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); + // dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn = (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_up + (M_oe)*nrm*epsbar*(M_eo) * spinin_dn + + + + //////////// + // (M_oo) // + //////////// + + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(spinin_dn, dev_spin_eo2_up, +1.0); // dev_spin_eo2_up = (1 + imubar) * spinin_dn + dev_mul_one_pm_imubar_gamma5<<>>(spinin_up, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * spinin_up + + + // CUBLAS: // remember: this is (M_oo) * (spinin_dn, spinin_up): + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*spinin_up = (1+imubar)*spinin_dn - epsbar*spinin_up + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*spinin_dn = (1-imubar)*spinin_up - epsbar*spinin_dn + + + + /////////////////////////////////////// + // (M_oo) - (M_oe) (Mee^-1) (M_eo) // + 
/////////////////////////////////////// + + // CUBLAS: // this is ((M_oo) - (M_oe)(Mee^-1)(M_eo)) * (spinin_dn, spinin_up): + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up = (1+imubar) * spinin_dn - epsbar * spinin_up + // - (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_dn - (M_oe)*nrm*epsbar*(M_eo) * spinin_up + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn = (1-imubar) * spinin_up - epsbar * spinin_dn + // - (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_up - (M_oe)*nrm*epsbar*(M_eo) * spinin_dn + + + //////////// + // gamma5 // + //////////// + + // Flo: + dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up + dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn + + + + + + + //////////////////// + // (a,b) -> (b,a) // + //////////////////// + dev_copy_spinor_field<<>>(dev_spin_eo2_dn, spinin_up); // spinin_up = dev_spin_eo2_dn + dev_copy_spinor_field<<>>(dev_spin_eo2_up, spinin_dn); // spinin_dn = dev_spin_eo2_up + + + + + + + /////////////////////////////////// /////////////////////// + // Q_tilde(2x2) // // (Q_tilde) * (b,a) // + /////////////////////////////////// /////////////////////// + + + //////////////////////////// + // (M_oe) (Mee^-1) (M_eo) // + //////////////////////////// + + printf("GPU. 
"); + + // Flo: + bind_texture_spin(spinin_up,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_up, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * spinin_up + unbind_texture_spin(1); + + bind_texture_spin(spinin_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_dn, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = (M_eo) * spinin_dn + unbind_texture_spin(1); + + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up = (1 - imubar)*(M_eo) * spinin_up + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn = (1 + imubar)*(M_eo) * spinin_dn + + + // CUBLAS: + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn = (1 - imubar)*(M_eo) * spinin_up + epsbar * (M_eo) * spinin_dn + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up = (1 + imubar)*(M_eo) * spinin_dn + epsbar * (M_eo) * spinin_up + + // CUBLAS: + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = nrm * dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * spinin_up + nrm*epsbar*(M_eo) * spinin_dn + + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * spinin_dn + nrm*epsbar*(M_eo) * spinin_up + + + // Flo: // remember: this is ((M_oe) (Mee^-1) (M_eo)) * (spinin_up, spinin_dn): + bind_texture_spin(dev_spin_eo2_up,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); + // dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up = (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_up + (M_oe)*nrm*epsbar*(M_eo) * spinin_dn 
+ bind_texture_spin(dev_spin_eo2_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); + // dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn = (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_dn + (M_oe)*nrm*epsbar*(M_eo) * spinin_up + + + + //////////// + // (M_oo) // + //////////// + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(spinin_up, dev_spin_eo2_up, +1.0); // dev_spin_eo2_up = (1 + imubar) * spinin_up + dev_mul_one_pm_imubar_gamma5<<>>(spinin_dn, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * spinin_dn + + + // CUBLAS: // remember: this is (M_oo) * (spinin_up, spinin_dn): + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*spinin_dn = (1+imubar)*spinin_up - epsbar*spinin_dn + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*spinin_up = (1-imubar)*spinin_dn - epsbar*spinin_up + + + + /////////////////////////////////////// + // (M_oo) - (M_oe) (Mee^-1) (M_eo) // + /////////////////////////////////////// + + // CUBLAS: // this is ( (M_oo) - (M_oe) (Mee^-1) (M_eo) ) * (spinin_up, spinin_dn) + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up = (1+imubar) * spinin_up - epsbar * spinin_dn + // - (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_up - (M_oe)*nrm*epsbar*(M_eo) * spinin_dn + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn = (1-imubar) * spinin_dn - epsbar * spinin_up + // - (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_dn - (M_oe)*nrm*epsbar*(M_eo) * spinin_up + + + //////////// + // gamma5 // + //////////// + + // Flo: + dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * 
dev_spin_eo2_up + dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn + + + + + to_host(l_strange, dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + to_host(l_charm , dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + + + + + return; + +}//Zwitter3() + + + + + + + + + + +// replaces matrix_multiplication32() for debugging: applies (Q_tilde)(Q_tilde_dagger) to (spinin_up, spinin_dn), first the Q_tilde_dagger(2x2) pass, then the Q_tilde(2x2) pass, and leaves the result in spinout_up/spinout_dn +// WARNING: spinin_up/dn are overwritten by the intermediate (a,b) -> (b,a) copy, so the caller's input fields are not preserved (see RESULT below) +// RESULT: d_up = spinin_up and +// d_dn = spinin_dn have to be wrapped +// the assignment dev_spin_eo2_up = spinout_up is legal +// apparently spinin_up/dn is no longer the same after the matrix application as it was before +// NOTE(review): the kernel launch configurations read <<>> in this copy of the file; presumably they take the gridsize/blocksize arguments -- confirm against the upstream source +void matrix_multiplication_test (dev_spinor * spinout_up, dev_spinor * spinout_dn, // Ad_up = dev_spin3_up, Ad_dn = dev_spin3_dn + dev_spinor * spinin_up , dev_spinor * spinin_dn , // d_up = dev_spin2_up, d_dn = dev_spin2_dn + int gridsize1, int blocksize1, int gridsize2, int blocksize2, + int gridsize3, int blocksize3, int gridsize4, int blocksize4) { + + + + + ///////////////////// + // LOCAL VARIABLES // + ///////////////////// + + int N_sites = VOLUME/2; // #lattice sites + int N_floats = 24*VOLUME/2; // #floats + + // additional for debugging purposes; in the visible code only the commented-out cudaMalloc calls below reference it + size_t dev_spinsize = 6*VOLUME/2*sizeof(dev_spinor); + + + /* + printf("gridsize1 = %i, blocksize1 = %i\n", gridsize1, blocksize1); + printf("gridsize2 = %i, blocksize2 = %i\n", gridsize2, blocksize2); + printf("gridsize3 = %i, blocksize3 = %i\n", gridsize3, blocksize3); + printf("gridsize4 = %i, blocksize4 = %i\n", gridsize4, blocksize4); + */ + /* + printf("%p ?= %p ?= %p\n", dev_spin3_up, spinout_up, dev_spin_eo2_up); + printf("%p ?= %p ?= %p\n", dev_spin3_dn, spinout_dn, dev_spin_eo2_dn); + printf("%p ?= %p\n", dev_spin2_up, spinin_up); + printf("%p ?= %p\n", dev_spin2_dn, spinin_dn); + */ + + + + + /////////////////////////////////// + // INITIALIZATIONS & ASSIGNMENTS // + /////////////////////////////////// + + + + // re-point the work buffers dev_spin_eo2_up/dn (not declared in this function, presumably file-scope pointers) at the output fields, so the final result lands directly in spinout_up/dn + dev_spin_eo2_up = spinout_up; + dev_spin_eo2_dn = spinout_dn; + //cudaMalloc((void **) &dev_spin_eo2_up, 
dev_spinsize); + //cudaMalloc((void **) &dev_spin_eo2_dn, dev_spinsize); + + + + +///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + ///////////// THEORY //////////////////////////////////////////////////////////////// + // // + // (Q_tilde) = gamma5 * ((M_oo) - (M_oe)(Mee^-1)(M_eo)) // + // (Q_tilde)(Q_tilde_dagger) * (up,dn) = (Q_tilde) * (b,a) // + /////////////// // (a,b) = (Q_tilde) * (dn,up) // + // MAIN BODY // // // + /////////////// ///////////////////////////////////////////////////////////////////////////////////// + + + double nrm = 1.0 / (1.0 + g_mubar*g_mubar - g_epsbar*g_epsbar); + + + /////////////////////////////////////// ///////////////////////////////// + // Q_tilde_dagger(2x2) // // (a,b) = (Q_tilde) * (dn,up) // + /////////////////////////////////////// ///////////////////////////////// + + + //////////////////////////// + // (M_oe) (Mee^-1) (M_eo) // + //////////////////////////// + + printf("This is matrix_multiplication_test(). 
"); + + // Flo: + bind_texture_spin(spinin_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_dn, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * spinin_dn + unbind_texture_spin(1); + + bind_texture_spin(spinin_up,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_up, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = (M_eo) * spinin_up + unbind_texture_spin(1); + + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up = (1 - imubar)*(M_eo) * spinin_dn + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn = (1 + imubar)*(M_eo) * spinin_up + + + // CUBLAS: + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn = (1 - imubar)*(M_eo) * spinin_dn + epsbar * (M_eo) * spinin_up + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up = (1 + imubar)*(M_eo) * spinin_up + epsbar * (M_eo) * spinin_dn + + // CUBLAS: + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = nrm * dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * spinin_dn + nrm*epsbar*(M_eo) * spinin_up + + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * spinin_up + nrm*epsbar*(M_eo) * spinin_dn + + + // Flo: // remember: this is ((M_oe)(Mee^-1)(M_eo)) * (spinin_dn, spinin_up): + bind_texture_spin(dev_spin_eo2_up,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); + // dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up = (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_dn + (M_oe)*nrm*epsbar*(M_eo) * spinin_up + 
bind_texture_spin(dev_spin_eo2_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); + // dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn = (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_up + (M_oe)*nrm*epsbar*(M_eo) * spinin_dn + + + + //////////// + // (M_oo) // + //////////// + + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(spinin_dn, dev_spin_eo2_up, +1.0); // dev_spin_eo2_up = (1 + imubar) * spinin_dn + dev_mul_one_pm_imubar_gamma5<<>>(spinin_up, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * spinin_up + + + // CUBLAS: // remember: this is (M_oo) * (spinin_dn, spinin_up): + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*spinin_up = (1+imubar)*spinin_dn - epsbar*spinin_up + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*spinin_dn = (1-imubar)*spinin_up - epsbar*spinin_dn + + + + /////////////////////////////////////// + // (M_oo) - (M_oe) (Mee^-1) (M_eo) // + /////////////////////////////////////// + + // CUBLAS: // this is ((M_oo) - (M_oe)(Mee^-1)(M_eo)) * (spinin_dn, spinin_up): + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up = (1+imubar) * spinin_dn - epsbar * spinin_up + // - (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_dn - (M_oe)*nrm*epsbar*(M_eo) * spinin_up + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn = (1-imubar) * spinin_up - epsbar * spinin_dn + // - (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_up - (M_oe)*nrm*epsbar*(M_eo) * spinin_dn + + + //////////// + // gamma5 // + //////////// + + // Flo: + dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up + 
dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn + + + + + + + //////////////////// // HERE IS THE MISTAKE !!! + // (a,b) -> (b,a) // // spinin_up/dn is changed + //////////////////// + dev_copy_spinor_field<<>>(dev_spin_eo2_dn, spinin_up); // spinin_up = dev_spin_eo2_dn + dev_copy_spinor_field<<>>(dev_spin_eo2_up, spinin_dn); // spinin_dn = dev_spin_eo2_up + + + + + + + /////////////////////////////////// /////////////////////// + // Q_tilde(2x2) // // (Q_tilde) * (b,a) // + /////////////////////////////////// /////////////////////// + + + //////////////////////////// + // (M_oe) (Mee^-1) (M_eo) // + //////////////////////////// + + // Flo: + bind_texture_spin(spinin_up,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_up, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * spinin_up + unbind_texture_spin(1); + + bind_texture_spin(spinin_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, spinin_dn, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = (M_eo) * spinin_dn + unbind_texture_spin(1); + + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up = (1 - imubar)*(M_eo) * spinin_up + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn = (1 + imubar)*(M_eo) * spinin_dn + + + // CUBLAS: + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn = (1 - imubar)*(M_eo) * spinin_up + epsbar * (M_eo) * spinin_dn + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up = (1 + imubar)*(M_eo) * spinin_dn + epsbar * (M_eo) * spinin_up + + // CUBLAS: + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1); 
// dev_spin_eo2_up = nrm * dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * spinin_up + nrm*epsbar*(M_eo) * spinin_dn + + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * spinin_dn + nrm*epsbar*(M_eo) * spinin_up + + + // Flo: // remember: this is ((M_oe) (Mee^-1) (M_eo)) * (spinin_up, spinin_dn): + bind_texture_spin(dev_spin_eo2_up,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); + // dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up = (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_up + (M_oe)*nrm*epsbar*(M_eo) * spinin_dn + bind_texture_spin(dev_spin_eo2_dn,1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + unbind_texture_spin(1); + // dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn = (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_dn + (M_oe)*nrm*epsbar*(M_eo) * spinin_up + + + + //////////// + // (M_oo) // + //////////// + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(spinin_up, dev_spin_eo2_up, +1.0); // dev_spin_eo2_up = (1 + imubar) * spinin_up + dev_mul_one_pm_imubar_gamma5<<>>(spinin_dn, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * spinin_dn + + + // CUBLAS: // remember: this is (M_oo) * (spinin_up, spinin_dn): + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*spinin_dn = (1+imubar)*spinin_up - epsbar*spinin_dn + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*spinin_up = (1-imubar)*spinin_dn - epsbar*spinin_up + + + + /////////////////////////////////////// + // (M_oo) - (M_oe) (Mee^-1) (M_eo) // + /////////////////////////////////////// + + // CUBLAS: // this is ( (M_oo) - (M_oe) (Mee^-1) (M_eo) ) * (spinin_up, spinin_dn) + cublasSaxpy (N_floats, -1.0, (float *) 
dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up = (1+imubar) * spinin_up - epsbar * spinin_dn + // - (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_up - (M_oe)*nrm*epsbar*(M_eo) * spinin_dn + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn = (1-imubar) * spinin_dn - epsbar * spinin_up + // - (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_dn - (M_oe)*nrm*epsbar*(M_eo) * spinin_up + + + //////////// + // gamma5 // + //////////// + + // Flo: + dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up + dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn + + + + + /* + //////////// + // output // // output is already done by setting dev_spin_eo2_up/dn = spinout_up/dn + //////////// + + dev_copy_spinor_field<<>>(dev_spin_eo2_up, spinout_up); // spinout_up = dev_spin_eo2_up + dev_copy_spinor_field<<>>(dev_spin_eo2_dn, spinout_dn); // spinout_dn = dev_spin_eo2_dn + */ + + return; + + +}//matrix_multiplication_test() + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/DEBUG/MATRIX_MPI_DEBUG.cuh b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/DEBUG/MATRIX_MPI_DEBUG.cuh new file mode 100644 index 0000000000000000000000000000000000000000..9668aece0c20aef1d563eac0c246bfd2cbe1fd36 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/DEBUG/MATRIX_MPI_DEBUG.cuh @@ -0,0 +1,2249 @@ + +// matrix_mpi_debug1() and matrix_mpi_debug1() replace matrix_multiplication32_mpi() +// matrix_mpi_debug3() replaces Q_Qdagger_ND() + + + +extern "C" { + //#ifdef HAVE_CONFIG_H + //# include + //#endif + //#include + //#include + //#include "../global.h" + //#include "../su3.h" + #include "../../Hopping_Matrix.h" + #include "../../phmc.h" + #include "../../gamma.h" + //#include "../linsolve.h" + //#include "../linalg_eo.h" + //#include 
"../Nondegenerate_Matrix.h" +} + + + + + + +//#define CHECK_HOPPING_MATRIX +//#define CHECK_IMUGAMMA5 +//#define CHECK_GAMMA5 +//#define CHECK_CUBLAS1 +//#define CHECK_CUBLAS2 +//#define CHECK_CUBLAS3 +//#define CHECK_COPY +//#define CHECK_MAXEV + + + + + + + + + +// replaces matrix_multiplication32_mpi() for debugging !! + +void matrix_mpi_debug1 (dev_spinor * spinout_up, dev_spinor * spinout_dn, + dev_spinor * spinin_up , dev_spinor * spinin_dn , + int gridsize1, int blocksize1, int gridsize2, int blocksize2, + int gridsize3, int blocksize3, int gridsize4, int blocksize4) { + + + // we will use the auxiliary fields dev_spin_eo{1,2}_up/dn for working on and buffering + // and set dev_spin_eo2_up/dn equal spinout_up/dn + // spinin_up/dn have to remain unchanged !! + // spinout_up/dn can be freely used + + + + + ///////////////////// + // LOCAL VARIABLES // + ///////////////////// + + int N_sites = VOLUME/2; // #lattice sites + int N_floats = 24*VOLUME/2; // #floats + + // additional for the debugging purposes + size_t dev_spinsize = 6*(VOLUME+RAND)/2*sizeof(dev_spinor); + + + + + //////////////////////////////////// + // MATCHING with Q_Qdagger_ND // + //////////////////////////////////// + // // + // _strange = _up // + // _charm = _dn // + // // + // DUM_MATRIX = dev_spin_eo1_up // + // DUM_MATRIX+1 = dev_spin_eo1_dn // + // // + // DUM_MATRIX+2 = dev_spin_eo2_up // + // DUM_MATRIX+3 = dev_spin_eo2_dn // + // // + //////////////////////////////////// + + // can savely use the following spinors on host: g_spinor_field[DUM_MATRIX{ , +1, ... 
, +7}] + + + + + /////////////////////////////////// + // INITIALIZATIONS & ASSIGNMENTS // // have to use (one) other auxiliary field(s) than the calling function dev_cg_eo_nd + /////////////////////////////////// + + dev_spin_eo2_up = spinout_up; // need no memory allocated + dev_spin_eo2_dn = spinout_dn; + ///////////// THEORY //////////////////////////////////////////////////////////////// + // // + // (Q_tilde) = gamma5 * ((M_oo) - (M_oe)(Mee^-1)(M_eo)) // + // (Q_tilde)(Q_tilde_dagger) * (up,dn) = (Q_tilde) * (b,a) // + /////////////// // (a,b) = (Q_tilde) * (dn,up) // + // MAIN BODY // // // + /////////////// ///////////////////////////////////////////////////////////////////////////////////// + + + double nrm = 1.0 / (1.0 + g_mubar*g_mubar - g_epsbar*g_epsbar); + + + if (g_proc_id == 0) + printf("This is matrix_mpi_debug1(). "); + + + spinor * l_strange = (spinor *) malloc(2*dev_spinsize); + spinor * l_charm = (spinor *) malloc(2*dev_spinsize); + spinor * k_strange = (spinor *) malloc(2*dev_spinsize); + spinor * k_charm = (spinor *) malloc(2*dev_spinsize); + + + /////////////////////////////////////// ///////////////////////////////// + // Q_tilde_dagger(2x2) // // (a,b) = (Q_tilde) * (dn,up) // + /////////////////////////////////////// ///////////////////////////////// + + + //////////////////////////// + // (M_oe) (Mee^-1) (M_eo) // + //////////////////////////// + + #ifndef CHECK_HOPPING_MATRIX + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(spinin_dn, 0); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, spinin_dn, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * spinin_dn + #else + Hopping_Matrix_wrapper(0, dev_spin_eo1_up, spinin_dn); + #endif + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(spinin_up, 0); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, spinin_up, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = 
(M_eo) * spinin_up + #else + Hopping_Matrix_wrapper(0, dev_spin_eo1_dn, spinin_up); + #endif + #else + to_host(k_charm, spinin_dn, h2d_spin_up, dev_spinsize); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], k_charm); // g_spinor_field[DUM_MATRIX] = (M_eo) * k_charm + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + + to_host(k_strange, spinin_up, h2d_spin_dn, dev_spinsize); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k_strange); // g_spinor_field[DUM_MATRIX+1] = (M_eo) * k_strange + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_IMUGAMMA5 + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up = (1 - imubar)*(M_eo) * spinin_dn + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn = (1 + imubar)*(M_eo) * spinin_up + #else + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo1_up, h2d_spin_up, dev_spinsize); + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX]); // g_spinor_field[DUM_MATRIX+2] = (1 - imubar)*(M_eo) * g_spinor_field[DUM_MATRIX] + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo1_dn, h2d_spin_dn, dev_spinsize); + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1]); // g_spinor_field[DUM_MATRIX+3] = (1 + imubar)*(M_eo) * g_spinor_field[DUM_MATRIX+1] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_CUBLAS1 + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn = (1 - imubar)*(M_eo) * spinin_dn + epsbar * (M_eo) * spinin_up + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 
1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up = (1 + imubar)*(M_eo) * spinin_up + epsbar * (M_eo) * spinin_dn + + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = nrm * dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * spinin_dn + nrm*epsbar*(M_eo) * spinin_up + + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * spinin_up + nrm*epsbar*(M_eo) * spinin_dn + #else + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo1_dn, h2d_spin_up, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + assign_add_mul_r(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+1], g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+2] = g_spinor_field[DUM_MATRIX+2] + epsbar*g_spinor_field[DUM_MATRIX+1] + mul_r(g_spinor_field[DUM_MATRIX+2], nrm, g_spinor_field[DUM_MATRIX+2], VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = nrm * g_spinor_field[DUM_MATRIX+2] + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + + + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo1_up, h2d_spin_dn, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + assign_add_mul_r(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX], g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+3] = g_spinor_field[DUM_MATRIX+3] + epsbar*g_spinor_field[DUM_MATRIX] + mul_r(g_spinor_field[DUM_MATRIX+3], nrm, g_spinor_field[DUM_MATRIX+3], VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = nrm * g_spinor_field[DUM_MATRIX+3] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3] , h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_HOPPING_MATRIX + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo2_up, 1); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, 
dev_eoidx_even, dev_nn_oe, 1); + #else + Hopping_Matrix_wrapper(1, dev_spin_eo1_up, dev_spin_eo2_up); + #endif + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo2_dn, 1); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #else + Hopping_Matrix_wrapper(1, dev_spin_eo1_dn, dev_spin_eo2_dn); + #endif + #else + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+2]); // g_spinor_field[DUM_MATRIX] = (M_oe) * g_spinor_field[DUM_MATRIX+2] + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+3]); // g_spinor_field[DUM_MATRIX+1] = (M_oe) * g_spinor_field[DUM_MATRIX+3] + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + #endif + + + + + //////////// + // (M_oo) // + //////////// + + + #ifndef CHECK_IMUGAMMA5 + dev_mul_one_pm_imubar_gamma5<<>>(spinin_dn, dev_spin_eo2_up, +1.0); // dev_spin_eo2_up = (1 + imubar) * spinin_dn + dev_mul_one_pm_imubar_gamma5<<>>(spinin_up, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * spinin_up + #else + to_host(k_charm, spinin_dn, h2d_spin_up, dev_spinsize); + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX+2], k_charm); // g_spinor_field[DUM_MATRIX+2] = (1 + imubar) * k_charm + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + + to_host(k_strange, spinin_up, h2d_spin_dn, dev_spinsize); + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+3], k_strange); // g_spinor_field[DUM_MATRIX+3] = (1 - imubar) * k_strange + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_CUBLAS2 // remember: 
this is (M_oo) * (spinin_dn, spinin_up): + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*spinin_up = (1+imubar)*spinin_dn - epsbar*spinin_up + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*spinin_dn = (1-imubar)*spinin_up - epsbar*spinin_dn + #else + to_host(k_strange, spinin_up, h2d_spin_up, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + assign_add_mul_r(g_spinor_field[DUM_MATRIX+2], k_strange, -g_epsbar, VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = g_spinor_field[DUM_MATRIX+2] - epsbar*k_strange + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + + to_host(k_charm, spinin_dn, h2d_spin_dn, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + assign_add_mul_r(g_spinor_field[DUM_MATRIX+3], k_charm , -g_epsbar, VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = g_spinor_field[DUM_MATRIX+3] - epsbar*k_charm + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + #endif + + + + + /////////////////////////////////////// + // (M_oo) - (M_oe) (Mee^-1) (M_eo) // + /////////////////////////////////////// + + #ifndef CHECK_CUBLAS3 // this is ((M_oo) - (M_oe)(Mee^-1)(M_eo)) * (spinin_dn, spinin_up): + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up = (1+imubar) * spinin_dn - epsbar * spinin_up + // - (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_dn - (M_oe)*nrm*epsbar*(M_eo) * spinin_up + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn = (1-imubar) * spinin_up - epsbar * spinin_dn + // - (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_up - 
(M_oe)*nrm*epsbar*(M_eo) * spinin_dn + #else + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo1_up, h2d_spin_up, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + diff(g_spinor_field[DUM_MATRIX+4], g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX] , VOLUME/2); + // g_spinor_field[DUM_MATRIX+4] = g_spinor_field[DUM_MATRIX+2] - g_spinor_field[DUM_MATRIX] + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+4], h2d_spin_up, dev_spinsize); + + + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo1_dn, h2d_spin_dn, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + diff(g_spinor_field[DUM_MATRIX+5], g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1], VOLUME/2); + // g_spinor_field[DUM_MATRIX+5] = g_spinor_field[DUM_MATRIX+3] - g_spinor_field[DUM_MATRIX+1] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+5], h2d_spin_dn, dev_spinsize); + #endif + + + + + //////////// + // gamma5 // + //////////// + + #ifndef CHECK_GAMMA5 + dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up + dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn + #else + to_host(g_spinor_field[DUM_MATRIX+4], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + gamma5(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+4], VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = gamma5 * g_spinor_field[DUM_MATRIX+4] + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+5], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + gamma5(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+5], VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = gamma5 * g_spinor_field[DUM_MATRIX+5] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + #endif + + + + + + + //////////////////// + // (a,b) -> (b,a) // + //////////////////// + 
+ #ifndef CHECK_COPY + dev_copy_spinor_field<<>>(dev_spin_eo2_dn, dev_spin_eo3_up); // dev_spin_eo3_up = dev_spin_eo2_dn + dev_copy_spinor_field<<>>(dev_spin_eo2_up, dev_spin_eo3_dn); // dev_spin_eo3_dn = dev_spin_eo2_up + #else + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_dn, h2d_spin_up, dev_spinsize); + assign(g_spinor_field[DUM_MATRIX+6], g_spinor_field[DUM_MATRIX+2], VOLUME/2); // g_spinor_field[DUM_MATRIX+6] = g_spinor_field[DUM_MATRIX+2] + to_device(dev_spin_eo3_up, g_spinor_field[DUM_MATRIX+6], h2d_spin_up, dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_up, h2d_spin_dn, dev_spinsize); + assign(g_spinor_field[DUM_MATRIX+7], g_spinor_field[DUM_MATRIX+3], VOLUME/2); // g_spinor_field[DUM_MATRIX+7] = g_spinor_field[DUM_MATRIX+3] + to_device(dev_spin_eo3_dn, g_spinor_field[DUM_MATRIX+7], h2d_spin_dn, dev_spinsize); + #endif + + + + + + + /////////////////////////////////// /////////////////////// + // Q_tilde(2x2) // // (Q_tilde) * (b,a) // + /////////////////////////////////// /////////////////////// + + + //////////////////////////// + // (M_oe) (Mee^-1) (M_eo) // + //////////////////////////// + + #ifndef CHECK_HOPPING_MATRIX + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo3_up, 0); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo3_up, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * dev_spin_eo3_up + #else + Hopping_Matrix_wrapper(0, dev_spin_eo1_up, dev_spin_eo3_up); + #endif + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo3_dn, 0); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo3_dn, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = (M_eo) * dev_spin_eo3_dn + #else + Hopping_Matrix_wrapper(0, dev_spin_eo1_dn, dev_spin_eo3_dn); + #endif + #else + to_host(g_spinor_field[DUM_MATRIX+7], dev_spin_eo3_up, h2d_spin_up, dev_spinsize); + 
Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+7]); // g_spinor_field[DUM_MATRIX] = (M_eo) * g_spinor_field[DUM_MATRIX+7] + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+6], dev_spin_eo3_dn, h2d_spin_dn, dev_spinsize); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+6]); // g_spinor_field[DUM_MATRIX+1] = (M_eo) * g_spinor_field[DUM_MATRIX+6] + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_IMUGAMMA5 + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up = (1 - imubar)*(M_eo) * dev_spin_eo3_up + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn = (1 + imubar)*(M_eo) * dev_spin_eo3_dn + #else + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo1_up, h2d_spin_up, dev_spinsize); + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX]); // g_spinor_field[DUM_MATRIX+2] = (1 - imubar) * g_spinor_field[DUM_MATRIX] + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo1_dn, h2d_spin_dn, dev_spinsize); + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1]); // g_spinor_field[DUM_MATRIX+3] = (1 + imubar) * g_spinor_field[DUM_MATRIX+1] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_CUBLAS1 + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn = (1 - imubar)*(M_eo) * dev_spin_eo3_up + epsbar * (M_eo) * dev_spin_eo3_dn + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1); + // 
dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up = (1 + imubar)*(M_eo) * dev_spin_eo3_dn + epsbar * (M_eo) * dev_spin_eo3_up + + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = nrm * dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * dev_spin_eo3_up + nrm*epsbar*(M_eo) * dev_spin_eo3_dn + + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * dev_spin_eo3_dn + nrm*epsbar*(M_eo) * dev_spin_eo3_up + #else + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo1_dn, h2d_spin_up, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + assign_add_mul_r(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+1], g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+2] = g_spinor_field[DUM_MATRIX+2] + epsbar * g_spinor_field[DUM_MATRIX+1] + mul_r(g_spinor_field[DUM_MATRIX+2], nrm, g_spinor_field[DUM_MATRIX+2], VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = nrm * g_spinor_field[DUM_MATRIX+2] + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + + + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo1_up, h2d_spin_dn, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + assign_add_mul_r(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX] , g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+3] = g_spinor_field[DUM_MATRIX+3] + epsbar * g_spinor_field[DUM_MATRIX] + mul_r(g_spinor_field[DUM_MATRIX+3], nrm, g_spinor_field[DUM_MATRIX+3], VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = nrm * g_spinor_field[DUM_MATRIX+3] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3] , h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_HOPPING_MATRIX + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo2_up, 1); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, 
dev_eoidx_even, dev_nn_oe, 1); + #else + Hopping_Matrix_wrapper(1, dev_spin_eo1_up, dev_spin_eo2_up); + #endif + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo2_dn, 1); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #else + Hopping_Matrix_wrapper(1, dev_spin_eo1_dn, dev_spin_eo2_dn); + #endif + #else + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + Hopping_Matrix(OE, l_strange, g_spinor_field[DUM_MATRIX+2]); // l_strange = (M_oe) * g_spinor_field[DUM_MATRIX+2] + to_device(dev_spin_eo1_up, l_strange, h2d_spin_up, dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + Hopping_Matrix(OE, l_charm , g_spinor_field[DUM_MATRIX+3]); // l_charm = (M_oe) * g_spinor_field[DUM_MATRIX+3] + to_device(dev_spin_eo1_dn, l_charm, h2d_spin_dn, dev_spinsize); + #endif + + + + + //////////// + // (M_oo) // + //////////// + + #ifndef CHECK_IMUGAMMA5 + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo3_up, dev_spin_eo2_up, +1.0); // dev_spin_eo2_up = (1 + imubar) * dev_spin_eo3_up + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo3_dn, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * dev_spin_eo3_dn + #else + to_host(g_spinor_field[DUM_MATRIX+7], dev_spin_eo3_up, h2d_spin_up, dev_spinsize); + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+7]); // g_spinor_field[DUM_MATRIX] = (1 + imubar) * g_spinor_field[DUM_MATRIX+7] + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + + to_host(g_spinor_field[DUM_MATRIX+6], dev_spin_eo3_dn, h2d_spin_dn, dev_spinsize); + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+6]); // g_spinor_field[DUM_MATRIX+1] = (1 - imubar) * g_spinor_field[DUM_MATRIX+6] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + #endif + + + + + 
#ifndef CHECK_CUBLAS2 // remember: this is (M_oo) * (dev_spin_eo3_up, dev_spin_eo3_dn): + cublasSaxpy (N_floats, -g_epsbar, (float *) dev_spin_eo3_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*dev_spin_eo3_dn = (1+imubar)*dev_spin_eo3_up - epsbar*dev_spin_eo3_dn + cublasSaxpy (N_floats, -g_epsbar, (float *) dev_spin_eo3_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*dev_spin_eo3_up = (1-imubar)*dev_spin_eo3_dn - epsbar*dev_spin_eo3_up + #else + to_host(g_spinor_field[DUM_MATRIX+6], dev_spin_eo3_dn, h2d_spin_up, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + assign_add_mul_r(g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+6], -g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX] = g_spinor_field[DUM_MATRIX] - epsbar * g_spinor_field[DUM_MATRIX+6] + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + + + to_host(g_spinor_field[DUM_MATRIX+7], dev_spin_eo3_up, h2d_spin_dn, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + assign_add_mul_r(g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+7], -g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+1] = g_spinor_field[DUM_MATRIX+1] - epsbar * g_spinor_field[DUM_MATRIX+7] + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + #endif + + + + + /////////////////////////////////////// + // (M_oo) - (M_oe) (Mee^-1) (M_eo) // + /////////////////////////////////////// + + #ifndef CHECK_CUBLAS3 // this is ( (M_oo) - (M_oe) (Mee^-1) (M_eo) ) * (dev_spin_eo3_up, dev_spin_eo3_dn) + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up = (1+imubar) * dev_spin_eo3_up - epsbar * dev_spin_eo3_dn + // - (M_oe)*nrm*(1-imubar)*(M_eo) * dev_spin_eo3_up - (M_oe)*nrm*epsbar*(M_eo) * 
dev_spin_eo3_dn + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn = (1-imubar) * dev_spin_eo3_dn - epsbar * dev_spin_eo3_up + // - (M_oe)*nrm*(1+imubar)*(M_eo) * dev_spin_eo3_dn - (M_oe)*nrm*epsbar*(M_eo) * dev_spin_eo3_up + #else + to_host(l_strange, dev_spin_eo1_up, h2d_spin_up, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + diff(l_strange, g_spinor_field[DUM_MATRIX], l_strange, VOLUME/2); // l_strange = g_spinor_field[DUM_MATRIX] - l_strange + to_device(dev_spin_eo2_up, l_strange, h2d_spin_up, dev_spinsize); + + to_host(l_charm, dev_spin_eo1_dn, h2d_spin_dn, dev_spinsize); + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + diff(l_charm, g_spinor_field[DUM_MATRIX+1], l_charm , VOLUME/2); // l_charm = g_spinor_field[DUM_MATRIX+1] - l_charm + to_device(dev_spin_eo2_dn, l_charm, h2d_spin_dn, dev_spinsize); + #endif + + + + //////////// + // gamma5 // + //////////// + + #ifndef CHECK_GAMMA5 + dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up + dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn + #else + to_host(l_strange, dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + gamma5(l_strange, l_strange, VOLUME/2); // l_strange = gamma5 * l_strange + to_device(dev_spin_eo2_up, l_strange, h2d_spin_up, dev_spinsize); + + to_host(l_charm, dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + gamma5(l_charm , l_charm , VOLUME/2); // l_charm = gamma5 * l_charm + to_device(dev_spin_eo2_dn, l_charm, h2d_spin_dn, dev_spinsize); + #endif + + + + /* + //////////// + // output // // output is already done by setting dev_spin_eo2_up/dn = spinout_up/dn + //////////// + + dev_copy_spinor_field<<>>(dev_spin_eo2_up, spinout_up); // spinout_up = dev_spin_eo2_up + dev_copy_spinor_field<<>>(dev_spin_eo2_dn, spinout_dn); // 
spinout_dn = dev_spin_eo2_dn
  */
  
  
  return;
  
}//matrix_mpi_debug1()




// replaces matrix_multiplication32_mpi() for debugging !!

// RESULT: twice the memory requirements for some of the auxiliary host/device-interaction fields !!

/*
 * matrix_mpi_debug2
 *
 * Debug variant of the even/odd matrix multiplication. It computes
 *
 *     (spinout_up, spinout_dn) = (Q_tilde)(Q_tilde_dagger) * (spinin_up, spinin_dn)
 *
 * with (Q_tilde) = gamma5 * ((M_oo) - (M_oe)(Mee^-1)(M_eo)), evaluated as two
 * successive applications of Q_tilde with the two components swapped in between:
 *
 *     (a,b) = (Q_tilde) * (dn,up)      and then      (Q_tilde) * (b,a)
 *
 * Parameters:
 *   spinout_up, spinout_dn   device fields that receive the result; the globals
 *                            dev_spin_eo2_up/dn are pointed at them and they are
 *                            used as work space during the computation
 *   spinin_up,  spinin_dn    device input fields; this function only reads them
 *                            directly (they are also handed to xchange_field_wrapper(),
 *                            whose effect is not visible here)
 *   gridsize1, blocksize1    launch configuration for dev_Hopping_Matrix
 *   gridsize2, blocksize2    launch configuration for dev_mul_one_pm_imubar_gamma5
 *   gridsize3, blocksize3    launch configuration for dev_gamma5
 *   gridsize4, blocksize4    launch configuration for dev_copy_spinor_field
 *
 * NOTE(review): in the copy of this file under review every kernel launch had an
 * empty execution configuration ("<<>>") and none of the gridsize/blocksize
 * parameters was referenced. The configurations are restored here by pairing the
 * K-th (gridsizeK, blocksizeK) with the K-th distinct kernel in order of first use.
 * Please confirm this mapping against matrix_multiplication32_mpi().
 *
 * Side effects:
 *   - reassigns the global pointers dev_spin_eo2_up/dn
 *   - overwrites the contents of dev_spin_eo1_up/dn and dev_spin_eo3_up/dn
 *   - prints a marker message on the process with g_proc_id == 0
 *
 * Compile-time switches:
 *   CHECK_HOPPING_MATRIX   the hopping matrix is applied by the host routine
 *                          Hopping_Matrix() on data copied with to_host()/to_device()
 *   HOPPING_DEBUG          Hopping_Matrix_wrapper() is used instead of the kernel and
 *                          the preceding xchange_field_wrapper() call is skipped
 *   USETEXTURE             the input field is bound/unbound around each hopping call
 */
void matrix_mpi_debug2 (dev_spinor * spinout_up, dev_spinor * spinout_dn,
                        dev_spinor * spinin_up , dev_spinor * spinin_dn ,
                        int gridsize1, int blocksize1, int gridsize2, int blocksize2,
                        int gridsize3, int blocksize3, int gridsize4, int blocksize4) {
  
  
  // we will use the auxiliary fields  dev_spin_eo{1,2}_up/dn  for working on and buffering
  // and set  dev_spin_eo2_up/dn  equal  spinout_up/dn
  // spinin_up/dn  have to remain unchanged !!
  // spinout_up/dn  can be freely used
  
  
  /////////////////////
  // LOCAL VARIABLES //
  /////////////////////
  
  int N_floats = 24*VOLUME/2;		// #floats handed to the CUBLAS calls
  
  
  ////////////////////////////////////
  //   MATCHING with Q_Qdagger_ND   //
  ////////////////////////////////////
  //                                //
  // _strange = _up                 //
  // _charm   = _dn                 //
  //                                //
  // DUM_MATRIX   = dev_spin_eo1_up //
  // DUM_MATRIX+1 = dev_spin_eo1_dn //
  //                                //
  // DUM_MATRIX+2 = dev_spin_eo2_up //
  // DUM_MATRIX+3 = dev_spin_eo2_dn //
  //                                //
  ////////////////////////////////////
  
  
  ///////////////////////////////////
  // INITIALIZATIONS & ASSIGNMENTS //	// have to use (one) other auxiliary field(s) than the calling function dev_cg_eo_nd
  ///////////////////////////////////
  
  dev_spin_eo2_up = spinout_up;		// need no memory allocated
  dev_spin_eo2_dn = spinout_dn;
  
  
  double nrm = 1.0 / (1.0 + g_mubar*g_mubar - g_epsbar*g_epsbar);
  
  
  if (g_proc_id == 0)
    printf("This is matrix_mpi_debug2(). ");
  
  
  #ifdef CHECK_HOPPING_MATRIX
    // Host scratch fields are needed only when the hopping matrix is checked on the CPU.
    // They used to be allocated unconditionally and never released, which leaked all
    // four buffers on every call.
    
    // number of bytes copied per field by to_host()/to_device()
    size_t dev_spinsize = VOLUME/2 * 6*sizeof(dev_spinor);
    
    // original author's note on this size:  "/2" is missing because of "double"
    size_t help_bytes = (VOLUME+RAND) * 6*sizeof(dev_spinor);
    
    spinor * help1 = (spinor *) malloc(help_bytes);
    spinor * help2 = (spinor *) malloc(help_bytes);
    spinor * help3 = (spinor *) malloc(help_bytes);
    spinor * help4 = (spinor *) malloc(help_bytes);
    
    if (help1 == NULL || help2 == NULL || help3 == NULL || help4 == NULL) {
      fprintf(stderr, "matrix_mpi_debug2(): could not allocate the host help fields.\n");
      free(help1);			// free(NULL) is a no-op
      free(help2);
      free(help3);
      free(help4);
      exit(EXIT_FAILURE);
    }
  #endif
  
  
  
  ///////////////////////////////////////							/////////////////////////////////
  //        Q_tilde_dagger(2x2)        //							// (a,b) = (Q_tilde) * (dn,up) //
  ///////////////////////////////////////							/////////////////////////////////
  
  
  ////////////////////////////
  // (M_oe) (Mee^-1) (M_eo) //
  ////////////////////////////
  
  #ifndef CHECK_HOPPING_MATRIX
    
    // dev_spin_eo1_up = (M_eo) * spinin_dn
    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(spinin_dn, 0);
    #endif
    #ifdef USETEXTURE
      bind_texture_spin(spinin_dn,1);
    #endif
    #ifndef HOPPING_DEBUG
      dev_Hopping_Matrix<<<gridsize1, blocksize1>>>(dev_gf, spinin_dn, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0);
    #else
      Hopping_Matrix_wrapper(0, dev_spin_eo1_up, spinin_dn);
    #endif
    #ifdef USETEXTURE
      unbind_texture_spin(1);
    #endif
    
    // dev_spin_eo1_dn = (M_eo) * spinin_up
    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(spinin_up, 0);
    #endif
    #ifdef USETEXTURE
      bind_texture_spin(spinin_up,1);
    #endif
    #ifndef HOPPING_DEBUG
      dev_Hopping_Matrix<<<gridsize1, blocksize1>>>(dev_gf, spinin_up, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0);
    #else
      Hopping_Matrix_wrapper(0, dev_spin_eo1_dn, spinin_up);
    #endif
    #ifdef USETEXTURE
      unbind_texture_spin(1);
    #endif
    
  #else
    
    to_host(help1, spinin_dn, h2d_spin_up, dev_spinsize);
    Hopping_Matrix(EO, help2, help1);					// help2 = (M_eo) * help1
    to_device(dev_spin_eo1_up, help2, h2d_spin_up, dev_spinsize);	// dev_spin_eo1_up = (M_eo) * spinin_dn
    
    to_host(help3, spinin_up, h2d_spin_dn, dev_spinsize);
    Hopping_Matrix(EO, help4, help3);					// help4 = (M_eo) * help3
    to_device(dev_spin_eo1_dn, help4, h2d_spin_dn, dev_spinsize);	// dev_spin_eo1_dn = (M_eo) * spinin_up
    
  #endif
  
  
  // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up  =  (1 - imubar)*(M_eo) * spinin_dn
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0);
  // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn  =  (1 + imubar)*(M_eo) * spinin_up
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0);
  
  
  // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn
  cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1);
  // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up
  cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1);
  
  // dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * spinin_dn  +  nrm*epsbar*(M_eo) * spinin_up
  cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1);
  // dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * spinin_up  +  nrm*epsbar*(M_eo) * spinin_dn
  cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1);
  
  
  // remember: this is ((M_oe)(Mee^-1)(M_eo)) * (spinin_dn, spinin_up):
  #ifndef CHECK_HOPPING_MATRIX
    
    // dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up
    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(dev_spin_eo2_up, 1);
    #endif
    #ifdef USETEXTURE
      bind_texture_spin(dev_spin_eo2_up,1);
    #endif
    #ifndef HOPPING_DEBUG
      dev_Hopping_Matrix<<<gridsize1, blocksize1>>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1);
    #else
      Hopping_Matrix_wrapper(1, dev_spin_eo1_up, dev_spin_eo2_up);
    #endif
    #ifdef USETEXTURE
      unbind_texture_spin(1);
    #endif
    
    // dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn
    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(dev_spin_eo2_dn, 1);
    #endif
    #ifdef USETEXTURE
      bind_texture_spin(dev_spin_eo2_dn,1);
    #endif
    #ifndef HOPPING_DEBUG
      dev_Hopping_Matrix<<<gridsize1, blocksize1>>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1);
    #else
      Hopping_Matrix_wrapper(1, dev_spin_eo1_dn, dev_spin_eo2_dn);
    #endif
    #ifdef USETEXTURE
      unbind_texture_spin(1);
    #endif
    
  #else
    
    to_host(help1, dev_spin_eo2_up, h2d_spin_up, dev_spinsize);
    Hopping_Matrix(OE, help2, help1);					// help2 = (M_oe) * help1
    to_device(dev_spin_eo1_up, help2, h2d_spin_up, dev_spinsize);	// dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up
    
    to_host(help3, dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize);
    Hopping_Matrix(OE, help4, help3);					// help4 = (M_oe) * help3
    to_device(dev_spin_eo1_dn, help4, h2d_spin_dn, dev_spinsize);	// dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn
    
  #endif
  
  
  
  ////////////
  // (M_oo) //
  ////////////
  
  // dev_spin_eo2_up = (1 + imubar) * spinin_dn
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(spinin_dn, dev_spin_eo2_up, +1.0);
  // dev_spin_eo2_dn = (1 - imubar) * spinin_up
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(spinin_up, dev_spin_eo2_dn, -1.0);
  
  
  // remember: this is (M_oo) * (spinin_dn, spinin_up):
  // dev_spin_eo2_up = (1+imubar)*spinin_dn - epsbar*spinin_up
  cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_up, 1);
  // dev_spin_eo2_dn = (1-imubar)*spinin_up - epsbar*spinin_dn
  cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_dn, 1);
  
  
  
  ///////////////////////////////////////
  // (M_oo)  -  (M_oe) (Mee^-1) (M_eo) //
  ///////////////////////////////////////
  
  // this is ((M_oo) - (M_oe)(Mee^-1)(M_eo)) * (spinin_dn, spinin_up):
  // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up
  cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1);
  // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn
  cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1);
  
  
  ////////////
  // gamma5 //
  ////////////
  
  dev_gamma5<<<gridsize3, blocksize3>>>(dev_spin_eo2_up, dev_spin_eo2_up);	// dev_spin_eo2_up = gamma5 * dev_spin_eo2_up
  dev_gamma5<<<gridsize3, blocksize3>>>(dev_spin_eo2_dn, dev_spin_eo2_dn);	// dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn
  
  
  
  ////////////////////
  // (a,b) -> (b,a) //
  ////////////////////
  
  dev_copy_spinor_field<<<gridsize4, blocksize4>>>(dev_spin_eo2_dn, dev_spin_eo3_up);	// dev_spin_eo3_up = dev_spin_eo2_dn
  dev_copy_spinor_field<<<gridsize4, blocksize4>>>(dev_spin_eo2_up, dev_spin_eo3_dn);	// dev_spin_eo3_dn = dev_spin_eo2_up
  
  
  
  ///////////////////////////////////								///////////////////////
  //        Q_tilde(2x2)           //								// (Q_tilde) * (b,a) //
  ///////////////////////////////////								///////////////////////
  
  
  ////////////////////////////
  // (M_oe) (Mee^-1) (M_eo) //
  ////////////////////////////
  
  #ifndef CHECK_HOPPING_MATRIX
    
    // dev_spin_eo1_up = (M_eo) * dev_spin_eo3_up
    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(dev_spin_eo3_up, 0);
    #endif
    #ifdef USETEXTURE
      bind_texture_spin(dev_spin_eo3_up,1);
    #endif
    #ifndef HOPPING_DEBUG
      dev_Hopping_Matrix<<<gridsize1, blocksize1>>>(dev_gf, dev_spin_eo3_up, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0);
    #else
      Hopping_Matrix_wrapper(0, dev_spin_eo1_up, dev_spin_eo3_up);
    #endif
    #ifdef USETEXTURE
      unbind_texture_spin(1);
    #endif
    
    // dev_spin_eo1_dn = (M_eo) * dev_spin_eo3_dn
    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(dev_spin_eo3_dn, 0);
    #endif
    #ifdef USETEXTURE
      bind_texture_spin(dev_spin_eo3_dn,1);
    #endif
    #ifndef HOPPING_DEBUG
      dev_Hopping_Matrix<<<gridsize1, blocksize1>>>(dev_gf, dev_spin_eo3_dn, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0);
    #else
      Hopping_Matrix_wrapper(0, dev_spin_eo1_dn, dev_spin_eo3_dn);
    #endif
    #ifdef USETEXTURE
      unbind_texture_spin(1);
    #endif
    
  #else
    
    to_host(help1, dev_spin_eo3_up, h2d_spin_up, dev_spinsize);
    Hopping_Matrix(EO, help2, help1);					// help2 = (M_eo) * help1
    to_device(dev_spin_eo1_up, help2, h2d_spin_up, dev_spinsize);	// dev_spin_eo1_up = (M_eo) * dev_spin_eo3_up
    
    to_host(help3, dev_spin_eo3_dn, h2d_spin_dn, dev_spinsize);
    Hopping_Matrix(EO, help4, help3);					// help4 = (M_eo) * help3
    to_device(dev_spin_eo1_dn, help4, h2d_spin_dn, dev_spinsize);	// dev_spin_eo1_dn = (M_eo) * dev_spin_eo3_dn
    
  #endif
  
  
  // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up  =  (1 - imubar)*(M_eo) * dev_spin_eo3_up
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0);
  // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn  =  (1 + imubar)*(M_eo) * dev_spin_eo3_dn
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0);
  
  
  // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn
  cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1);
  // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up
  cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1);
  
  // dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * dev_spin_eo3_up  +  nrm*epsbar*(M_eo) * dev_spin_eo3_dn
  cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1);
  // dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * dev_spin_eo3_dn  +  nrm*epsbar*(M_eo) * dev_spin_eo3_up
  cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1);
  
  
  // remember: this is ((M_oe) (Mee^-1) (M_eo)) * (dev_spin_eo3_up, dev_spin_eo3_dn):
  #ifndef CHECK_HOPPING_MATRIX
    
    // dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up
    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(dev_spin_eo2_up, 1);
    #endif
    #ifdef USETEXTURE
      bind_texture_spin(dev_spin_eo2_up,1);
    #endif
    #ifndef HOPPING_DEBUG
      dev_Hopping_Matrix<<<gridsize1, blocksize1>>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1);
    #else
      Hopping_Matrix_wrapper(1, dev_spin_eo1_up, dev_spin_eo2_up);
    #endif
    #ifdef USETEXTURE
      unbind_texture_spin(1);
    #endif
    
    // dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn
    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(dev_spin_eo2_dn, 1);
    #endif
    #ifdef USETEXTURE
      bind_texture_spin(dev_spin_eo2_dn,1);
    #endif
    #ifndef HOPPING_DEBUG
      dev_Hopping_Matrix<<<gridsize1, blocksize1>>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1);
    #else
      Hopping_Matrix_wrapper(1, dev_spin_eo1_dn, dev_spin_eo2_dn);
    #endif
    #ifdef USETEXTURE
      unbind_texture_spin(1);
    #endif
    
  #else
    
    to_host(help1, dev_spin_eo2_up, h2d_spin_up, dev_spinsize);
    Hopping_Matrix(OE, help2, help1);					// help2 = (M_oe) * help1
    to_device(dev_spin_eo1_up, help2, h2d_spin_up, dev_spinsize);	// dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up
    
    to_host(help3, dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize);
    Hopping_Matrix(OE, help4, help3);					// help4 = (M_oe) * help3
    to_device(dev_spin_eo1_dn, help4, h2d_spin_dn, dev_spinsize);	// dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn
    
  #endif
  
  
  ////////////
  // (M_oo) //
  ////////////
  
  // dev_spin_eo2_up = (1 + imubar) * dev_spin_eo3_up
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(dev_spin_eo3_up, dev_spin_eo2_up, +1.0);
  // dev_spin_eo2_dn = (1 - imubar) * dev_spin_eo3_dn
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(dev_spin_eo3_dn, dev_spin_eo2_dn, -1.0);
  
  
  // remember: this is (M_oo) * (dev_spin_eo3_up, dev_spin_eo3_dn):
  // dev_spin_eo2_up = (1+imubar)*dev_spin_eo3_up - epsbar*dev_spin_eo3_dn
  cublasSaxpy (N_floats, -g_epsbar, (float *) dev_spin_eo3_dn, 1, (float *) dev_spin_eo2_up, 1);
  // dev_spin_eo2_dn = (1-imubar)*dev_spin_eo3_dn - epsbar*dev_spin_eo3_up
  cublasSaxpy (N_floats, -g_epsbar, (float *) dev_spin_eo3_up, 1, (float *) dev_spin_eo2_dn, 1);
  
  
  
  ///////////////////////////////////////
  // (M_oo)  -  (M_oe) (Mee^-1) (M_eo) //
  ///////////////////////////////////////
  
  // this is ( (M_oo) - (M_oe) (Mee^-1) (M_eo) ) * (dev_spin_eo3_up, dev_spin_eo3_dn)
  // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up
  cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1);
  // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn
  cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1);
  
  
  ////////////
  // gamma5 //
  ////////////
  
  dev_gamma5<<<gridsize3, blocksize3>>>(dev_spin_eo2_up, dev_spin_eo2_up);	// dev_spin_eo2_up = gamma5 * dev_spin_eo2_up
  dev_gamma5<<<gridsize3, blocksize3>>>(dev_spin_eo2_dn, dev_spin_eo2_dn);	// dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn
  
  
  
  ////////////
  // output //	// no copy is needed: dev_spin_eo2_up/dn were set to spinout_up/dn at the top
  ////////////
  
  
  #ifdef CHECK_HOPPING_MATRIX
    // Release the host scratch fields. Each buffer is already overwritten by the next
    // to_host()/Hopping_Matrix() call above, so the transfers are assumed to have
    // finished when to_host()/to_device() return -- TODO confirm.
    free(help1);
    free(help2);
    free(help3);
    free(help4);
  #endif
  
  
  return;
  
}//matrix_mpi_debug2()








// replaces matrix_multiplication32_mpi() for debugging !!

// is for the moment an identical copy of matrix_mpi_debug2()
// will be changed for testing the alternative version of the hopping matrix ...
// will for now test the xchange_fields_mpi() procedure
//
// one therefore has to turn of the xchange_fields() in the CPU's Hopping_Matrix() routine !!
// we are most likely use version 5 of Hopping_Matrix() in Hopping_Matrix.c

#define check_xchange

void matrix_mpi_debug3 (dev_spinor * spinout_up, dev_spinor * spinout_dn,
                        dev_spinor * spinin_up , dev_spinor * spinin_dn ,
                        int gridsize1, int blocksize1, int gridsize2, int blocksize2,
                        int gridsize3, int blocksize3, int gridsize4, int blocksize4) {
  
  
  // we will use the auxiliary fields  dev_spin_eo{1,2}_up/dn  for working on and buffering
  // and set  dev_spin_eo2_up/dn  equal  spinout_up/dn
  // spinin_up/dn  have to remain unchanged !!
+ // spinout_up/dn can be freely used + + + + + ///////////////////// + // LOCAL VARIABLES // + ///////////////////// + + int N_sites = VOLUME/2; // #lattice sites + int N_floats = 24*VOLUME/2; // #floats + + // additional for the debugging purposes + size_t dev_spinsize_int = VOLUME/2 * 6*sizeof(dev_spinor); + size_t dev_spinsize_ext = (VOLUME+RAND)/2 * 6*sizeof(dev_spinor); + size_t dev_spinsize = dev_spinsize_int; + + + + //////////////////////////////////// + // MATCHING with Q_Qdagger_ND // + //////////////////////////////////// + // // + // _strange = _up // + // _charm = _dn // + // // + // DUM_MATRIX = dev_spin_eo1_up // + // DUM_MATRIX+1 = dev_spin_eo1_dn // + // // + // DUM_MATRIX+2 = dev_spin_eo2_up // + // DUM_MATRIX+3 = dev_spin_eo2_dn // + // // + //////////////////////////////////// + + + + + /////////////////////////////////// + // INITIALIZATIONS & ASSIGNMENTS // // have to use (one) other auxiliary field(s) than the calling function dev_cg_eo_nd + /////////////////////////////////// + + dev_spin_eo2_up = spinout_up; // need no memory allocated + dev_spin_eo2_dn = spinout_dn; + ///////////// THEORY //////////////////////////////////////////////////////////////// + // // + // (Q_tilde) = gamma5 * ((M_oo) - (M_oe)(Mee^-1)(M_eo)) // + // (Q_tilde)(Q_tilde_dagger) * (up,dn) = (Q_tilde) * (b,a) // + /////////////// // (a,b) = (Q_tilde) * (dn,up) // + // MAIN BODY // // // + /////////////// ///////////////////////////////////////////////////////////////////////////////////// + + + double nrm = 1.0 / (1.0 + g_mubar*g_mubar - g_epsbar*g_epsbar); + + + if (g_proc_id == 0) + printf("This is matrix_mpi_debug3(). 
"); + + + spinor * help1 = (spinor *) malloc((VOLUME+RAND) * 6*sizeof(dev_spinor)); // "/2" is missing because of "double" + spinor * help2 = (spinor *) malloc((VOLUME+RAND) * 6*sizeof(dev_spinor)); + spinor * help3 = (spinor *) malloc((VOLUME+RAND) * 6*sizeof(dev_spinor)); + spinor * help4 = (spinor *) malloc((VOLUME+RAND) * 6*sizeof(dev_spinor)); + + + + /////////////////////////////////////// ///////////////////////////////// + // Q_tilde_dagger(2x2) // // (a,b) = (Q_tilde) * (dn,up) // + /////////////////////////////////////// ///////////////////////////////// + + + //////////////////////////// + // (M_oe) (Mee^-1) (M_eo) // + //////////////////////////// + + + #ifndef CHECK_HOPPING_MATRIX + + #ifndef HOPPING_DEBUG + xchange_field_wrapper(spinin_dn, 0); + dev_Hopping_Matrix<<>>(dev_gf, spinin_dn, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * spinin_dn + #else + printf("Does it work?\n"); + xchange_field_wrapper(spinin_dn, 0); + Hopping_Matrix_wrapper(0, dev_spin_eo1_up, spinin_dn); + #endif + + #ifndef HOPPING_DEBUG + xchange_field_wrapper(spinin_up, 0); + dev_Hopping_Matrix<<>>(dev_gf, spinin_up, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = (M_eo) * spinin_up + #else + xchange_field_wrapper(spinin_up, 0); + Hopping_Matrix_wrapper(0, dev_spin_eo1_dn, spinin_up); + #endif + #else + to_host(help1, spinin_dn, h2d_spin_up, dev_spinsize); + Hopping_Matrix(EO, help2, help1); // g_spinor_field[DUM_MATRIX] = (M_eo) * k_charm + to_device(dev_spin_eo1_up, help2, h2d_spin_up, dev_spinsize); + + to_host(help3, spinin_up, h2d_spin_dn, dev_spinsize); + Hopping_Matrix(EO, help4, help3); // g_spinor_field[DUM_MATRIX+1] = (M_eo) * k_strange + to_device(dev_spin_eo1_dn, help4, h2d_spin_dn, dev_spinsize); + #endif + + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up = (1 - imubar)*(M_eo) * spinin_dn + 
dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn = (1 + imubar)*(M_eo) * spinin_up + + + // CUBLAS: + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn = (1 - imubar)*(M_eo) * spinin_dn + epsbar * (M_eo) * spinin_up + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up = (1 + imubar)*(M_eo) * spinin_up + epsbar * (M_eo) * spinin_dn + + // CUBLAS: + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = nrm * dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * spinin_dn + nrm*epsbar*(M_eo) * spinin_up + + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * spinin_up + nrm*epsbar*(M_eo) * spinin_dn + + + #ifndef CHECK_HOPPING_MATRIX + + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo2_up, 1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #else + xchange_field_wrapper(dev_spin_eo2_up, 1); + Hopping_Matrix_wrapper(1, dev_spin_eo1_up, dev_spin_eo2_up); + #endif + + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo2_dn, 1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #else + xchange_field_wrapper(dev_spin_eo2_dn, 1); + Hopping_Matrix_wrapper(1, dev_spin_eo1_dn, dev_spin_eo2_dn); + #endif + #else + to_host(help1, dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + Hopping_Matrix(OE, help2, help1); // g_spinor_field[DUM_MATRIX] = (M_oe) * g_spinor_field[DUM_MATRIX+2] + to_device(dev_spin_eo1_up, help2, h2d_spin_up, dev_spinsize); + + to_host(help3, dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + Hopping_Matrix(OE, help4, help3); // 
g_spinor_field[DUM_MATRIX+1] = (M_oe) * g_spinor_field[DUM_MATRIX+3] + to_device(dev_spin_eo1_dn, help4, h2d_spin_dn, dev_spinsize); + #endif + + + + //////////// + // (M_oo) // + //////////// + + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(spinin_dn, dev_spin_eo2_up, +1.0); // dev_spin_eo2_up = (1 + imubar) * spinin_dn + dev_mul_one_pm_imubar_gamma5<<>>(spinin_up, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * spinin_up + + + // CUBLAS: // remember: this is (M_oo) * (spinin_dn, spinin_up): + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*spinin_up = (1+imubar)*spinin_dn - epsbar*spinin_up + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*spinin_dn = (1-imubar)*spinin_up - epsbar*spinin_dn + + + + /////////////////////////////////////// + // (M_oo) - (M_oe) (Mee^-1) (M_eo) // + /////////////////////////////////////// + + // CUBLAS: // this is ((M_oo) - (M_oe)(Mee^-1)(M_eo)) * (spinin_dn, spinin_up): + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up = (1+imubar) * spinin_dn - epsbar * spinin_up + // - (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_dn - (M_oe)*nrm*epsbar*(M_eo) * spinin_up + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn = (1-imubar) * spinin_up - epsbar * spinin_dn + // - (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_up - (M_oe)*nrm*epsbar*(M_eo) * spinin_dn + + + //////////// + // gamma5 // + //////////// + + // Flo: + dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up + dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn + + + + + + + //////////////////// + // (a,b) -> (b,a) // + 
//////////////////// + dev_copy_spinor_field<<>>(dev_spin_eo2_dn, dev_spin_eo3_up); // dev_spin_eo3_up = dev_spin_eo2_dn + dev_copy_spinor_field<<>>(dev_spin_eo2_up, dev_spin_eo3_dn); // dev_spin_eo3_dn = dev_spin_eo2_up + + + + + + + /////////////////////////////////// /////////////////////// + // Q_tilde(2x2) // // (Q_tilde) * (b,a) // + /////////////////////////////////// /////////////////////// + + + //////////////////////////// + // (M_oe) (Mee^-1) (M_eo) // + //////////////////////////// + + #ifndef CHECK_HOPPING_MATRIX + + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo3_up, 0); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo3_up, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * dev_spin_eo3_up + #else + xchange_field_wrapper(dev_spin_eo3_up, 0); + Hopping_Matrix_wrapper(0, dev_spin_eo1_up, dev_spin_eo3_up); + #endif + + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo3_dn, 0); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo3_dn, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = (M_eo) * dev_spin_eo3_dn + #else + xchange_field_wrapper(dev_spin_eo3_dn, 0); + Hopping_Matrix_wrapper(0, dev_spin_eo1_dn, dev_spin_eo3_dn); + #endif + #else + to_host(help1, dev_spin_eo3_up, h2d_spin_up, dev_spinsize); + Hopping_Matrix(EO, help2, help1); // g_spinor_field[DUM_MATRIX] = (M_eo) * g_spinor_field[DUM_MATRIX+7] + to_device(dev_spin_eo1_up, help2, h2d_spin_up, dev_spinsize); + + to_host(help3, dev_spin_eo3_dn, h2d_spin_dn, dev_spinsize); + Hopping_Matrix(EO, help4, help3); // g_spinor_field[DUM_MATRIX+1] = (M_eo) * g_spinor_field[DUM_MATRIX+6] + to_device(dev_spin_eo1_dn, help4, h2d_spin_dn, dev_spinsize); + #endif + + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up = (1 - imubar)*(M_eo) * dev_spin_eo3_up + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // 
dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn = (1 + imubar)*(M_eo) * dev_spin_eo3_dn + + + // CUBLAS: + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn = (1 - imubar)*(M_eo) * dev_spin_eo3_up + epsbar * (M_eo) * dev_spin_eo3_dn + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up = (1 + imubar)*(M_eo) * dev_spin_eo3_dn + epsbar * (M_eo) * dev_spin_eo3_up + + // CUBLAS: + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = nrm * dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * dev_spin_eo3_up + nrm*epsbar*(M_eo) * dev_spin_eo3_dn + + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * dev_spin_eo3_dn + nrm*epsbar*(M_eo) * dev_spin_eo3_up + + + #ifndef CHECK_HOPPING_MATRIX + + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo2_up, 1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #else + xchange_field_wrapper(dev_spin_eo2_up, 1); + Hopping_Matrix_wrapper(1, dev_spin_eo1_up, dev_spin_eo2_up); + #endif + + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo2_dn, 1); + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #else + xchange_field_wrapper(dev_spin_eo2_dn, 1); + Hopping_Matrix_wrapper(1, dev_spin_eo1_dn, dev_spin_eo2_dn); + #endif + #else + to_host(help1, dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + Hopping_Matrix(OE, help2, help1); // l_strange = (M_oe) * g_spinor_field[DUM_MATRIX+2] + to_device(dev_spin_eo1_up, help2, h2d_spin_up, dev_spinsize); + + to_host(help3, dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + Hopping_Matrix(OE, help4, help3); // l_charm = (M_oe) * g_spinor_field[DUM_MATRIX+3] + 
to_device(dev_spin_eo1_dn, help4, h2d_spin_dn, dev_spinsize); + #endif + + + //////////// + // (M_oo) // + //////////// + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo3_up, dev_spin_eo2_up, +1.0); // dev_spin_eo2_up = (1 + imubar) * dev_spin_eo3_up + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo3_dn, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * dev_spin_eo3_dn + + + // CUBLAS: // remember: this is (M_oo) * (dev_spin_eo3_up, dev_spin_eo3_dn): + cublasSaxpy (N_floats, -g_epsbar, (float *) dev_spin_eo3_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*dev_spin_eo3_dn = (1+imubar)*dev_spin_eo3_up - epsbar*dev_spin_eo3_dn + cublasSaxpy (N_floats, -g_epsbar, (float *) dev_spin_eo3_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*dev_spin_eo3_up = (1-imubar)*dev_spin_eo3_dn - epsbar*dev_spin_eo3_up + + + + /////////////////////////////////////// + // (M_oo) - (M_oe) (Mee^-1) (M_eo) // + /////////////////////////////////////// + + // CUBLAS: // this is ( (M_oo) - (M_oe) (Mee^-1) (M_eo) ) * (dev_spin_eo3_up, dev_spin_eo3_dn) + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up = (1+imubar) * dev_spin_eo3_up - epsbar * dev_spin_eo3_dn + // - (M_oe)*nrm*(1-imubar)*(M_eo) * dev_spin_eo3_up - (M_oe)*nrm*epsbar*(M_eo) * dev_spin_eo3_dn + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn = (1-imubar) * dev_spin_eo3_dn - epsbar * dev_spin_eo3_up + // - (M_oe)*nrm*(1+imubar)*(M_eo) * dev_spin_eo3_dn - (M_oe)*nrm*epsbar*(M_eo) * dev_spin_eo3_up + + + //////////// + // gamma5 // + //////////// + + // Flo: + dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up + dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * 
dev_spin_eo2_dn + + + + + /* + //////////// + // output // // output is already done by setting dev_spin_eo2_up/dn = spinout_up/dn + //////////// + + dev_copy_spinor_field<<>>(dev_spin_eo2_up, spinout_up); // spinout_up = dev_spin_eo2_up + dev_copy_spinor_field<<>>(dev_spin_eo2_dn, spinout_dn); // spinout_dn = dev_spin_eo2_dn + */ + + + return; + +}//matrix_mpi_debug3()







// matrix_mpi_debug4()
//
// Debugging replacement for matrix_multiplication32_mpi().
//
// Original author's notes (kept, wording tidied):
//   - for the moment this is an identical copy of matrix_mpi_debug2()
//   - it will be changed for testing the alternative version of the hopping matrix
//   - for now it tests the xchange_fields_mpi() procedure
//   - one therefore has to turn OFF the xchange_fields() in the CPU's Hopping_Matrix() routine;
//     this is done via the global flag xchange_test (set to 1 on entry to the main body, reset to 0 before returning)
//   - we will most likely use version 5 of Hopping_Matrix() in Hopping_Matrix.c
//
// What the code does (see the THEORY box below, taken from the original source):
//   (Q_tilde) = gamma5 * ((M_oo) - (M_oe)(Mee^-1)(M_eo))
//   (a,b)     = (Q_tilde) * (dn,up)                         -- first half  ("Q_tilde_dagger(2x2)")
//   result    = (Q_tilde) * (b,a)                           -- second half ("Q_tilde(2x2)")
//
// Parameters:
//   spinout_up, spinout_dn   output fields; the global pointers dev_spin_eo2_up/dn are re-pointed
//                            at them, so the result is written in place and no final copy is needed
//   spinin_up,  spinin_dn    input fields; only ever passed as the source operand in this function
//   gridsizeN, blocksizeN    kernel launch parameters
//
// Side effects:
//   - overwrites the global pointers dev_spin_eo2_up / dev_spin_eo2_dn (not restored on exit)
//   - uses dev_spin_eo1_up/dn and dev_spin_eo3_up/dn as scratch fields
//   - toggles the global xchange_test (1 while running, 0 on return)
//   - rank 0 prints a banner on every call
//
// NOTE(review): in this copy of the source every kernel launch configuration reads "<<>>".
//               Presumably the gridsizeN/blocksizeN parameters belong between the chevrons;
//               restore them from the upstream file -- they are reproduced here exactly as found.
//
// Changes versus the previous revision:
//   - the four host staging buffers help1..help4 were malloc'ed on every call and never freed
//     (a leak of 4 * (VOLUME+RAND)*6*sizeof(dev_spinor) bytes per operator application);
//     they are now released before returning
//   - the buffers (and dev_spinsize) are only needed on the CHECK_HOPPING_MATRIX path,
//     so they are only created there
//   - removed the unused locals N_sites and dev_spinsize_ext

void matrix_mpi_debug4 (dev_spinor * spinout_up, dev_spinor * spinout_dn,
                        dev_spinor * spinin_up , dev_spinor * spinin_dn ,
                        int gridsize1, int blocksize1, int gridsize2, int blocksize2,
                        int gridsize3, int blocksize3, int gridsize4, int blocksize4) {


  // we will use the auxiliary fields  dev_spin_eo{1,2}_up/dn  for working on and buffering
  // and set  dev_spin_eo2_up/dn  equal  spinout_up/dn
  // spinin_up/dn  have to remain unchanged !!
  // spinout_up/dn  can be freely used


  /////////////////////
  // LOCAL VARIABLES //
  /////////////////////

  int N_floats = 24*VOLUME/2;        // #floats handed to the cuBLAS level-1 calls


  ////////////////////////////////////
  //   MATCHING with Q_Qdagger_ND   //
  ////////////////////////////////////
  //                                //
  // _strange = _up                 //
  // _charm   = _dn                 //
  //                                //
  // DUM_MATRIX   = dev_spin_eo1_up //
  // DUM_MATRIX+1 = dev_spin_eo1_dn //
  //                                //
  // DUM_MATRIX+2 = dev_spin_eo2_up //
  // DUM_MATRIX+3 = dev_spin_eo2_dn //
  //                                //
  ////////////////////////////////////


  ///////////////////////////////////
  // INITIALIZATIONS & ASSIGNMENTS //   // have to use (one) other auxiliary field(s) than the calling function dev_cg_eo_nd
  ///////////////////////////////////

  dev_spin_eo2_up = spinout_up;      // need no memory allocated: the output fields double as eo2 work fields
  dev_spin_eo2_dn = spinout_dn;


  ///////////// THEORY ///////////////////////////////////////////////
  //                                                                //
  //  (Q_tilde) = gamma5 * ((M_oo) - (M_oe)(Mee^-1)(M_eo))          //
  //  (Q_tilde)(Q_tilde_dagger) * (up,dn) = (Q_tilde) * (b,a)       //
  //                                (a,b) = (Q_tilde) * (dn,up)     //
  //                                                                //
  ////////////////////////////////////////////////////////////////////


  double nrm = 1.0 / (1.0 + g_mubar*g_mubar - g_epsbar*g_epsbar);


  if (g_proc_id == 0)
    printf("This is matrix_mpi_debug4(). \n");


  #ifdef CHECK_HOPPING_MATRIX
    // Host staging buffers: only the host Hopping_Matrix() path below touches them.
    size_t dev_spinsize = VOLUME/2 * 6*sizeof(dev_spinor);

    spinor * help1 = (spinor *) malloc((VOLUME+RAND) * 6*sizeof(dev_spinor));    // "/2" is missing because of "double"
    spinor * help2 = (spinor *) malloc((VOLUME+RAND) * 6*sizeof(dev_spinor));
    spinor * help3 = (spinor *) malloc((VOLUME+RAND) * 6*sizeof(dev_spinor));
    spinor * help4 = (spinor *) malloc((VOLUME+RAND) * 6*sizeof(dev_spinor));
  #endif


  ///////////////////////////////////////                                 /////////////////////////////////
  //        Q_tilde_dagger(2x2)        //                                 // (a,b) = (Q_tilde) * (dn,up) //
  ///////////////////////////////////////                                 /////////////////////////////////


  ////////////////////////////
  // (M_oe) (Mee^-1) (M_eo) //
  ////////////////////////////


  xchange_test = 1;      // is set to "1", the xchange_field() in the host Hopping_Matrix() will be turned off
                         //   procedure: define global variable in Hopping_Matrix.c which can disable the xchange of the fields
                         //              set this variable appropriately ...

  #ifndef CHECK_HOPPING_MATRIX

    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(spinin_dn, 0);
      dev_Hopping_Matrix<<>>(dev_gf, spinin_dn, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0);    // dev_spin_eo1_up = (M_eo) * spinin_dn
    #else
      //printf("Does it work?\n");
      xchange_field_wrapper(spinin_dn, 0);
      Hopping_Matrix_wrapper(0, dev_spin_eo1_up, spinin_dn);
    #endif

    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(spinin_up, 0);
      dev_Hopping_Matrix<<>>(dev_gf, spinin_up, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0);    // dev_spin_eo1_dn = (M_eo) * spinin_up
    #else
      xchange_field_wrapper(spinin_up, 0);
      Hopping_Matrix_wrapper(0, dev_spin_eo1_dn, spinin_up);
    #endif

  #else

    to_host(help1, spinin_dn, h2d_spin_up, dev_spinsize);
    Hopping_Matrix(EO, help2, help1);                                  // g_spinor_field[DUM_MATRIX]   = (M_eo) * k_charm
    to_device(dev_spin_eo1_up, help2, h2d_spin_up, dev_spinsize);

    to_host(help3, spinin_up, h2d_spin_dn, dev_spinsize);
    Hopping_Matrix(EO, help4, help3);                                  // g_spinor_field[DUM_MATRIX+1] = (M_eo) * k_strange
    to_device(dev_spin_eo1_dn, help4, h2d_spin_dn, dev_spinsize);

  #endif


  // (1 -/+ imubar) applied to the hopping result
  dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0);    // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up = (1 - imubar)*(M_eo) * spinin_dn
  dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0);    // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn = (1 + imubar)*(M_eo) * spinin_up


  // CUBLAS: add the epsbar cross terms
  cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1);
                                     // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn = (1 - imubar)*(M_eo) * spinin_dn + epsbar * (M_eo) * spinin_up
  cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1);
                                     // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up = (1 + imubar)*(M_eo) * spinin_up + epsbar * (M_eo) * spinin_dn

  // CUBLAS: scale by nrm
  cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1);           // dev_spin_eo2_up = nrm * dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * spinin_dn + nrm*epsbar*(M_eo) * spinin_up
  cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1);           // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * spinin_up + nrm*epsbar*(M_eo) * spinin_dn


  #ifndef CHECK_HOPPING_MATRIX

    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(dev_spin_eo2_up, 1);
      dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1);
    #else
      xchange_field_wrapper(dev_spin_eo2_up, 1);
      Hopping_Matrix_wrapper(1, dev_spin_eo1_up, dev_spin_eo2_up);
    #endif

    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(dev_spin_eo2_dn, 1);
      dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1);
    #else
      xchange_field_wrapper(dev_spin_eo2_dn, 1);
      Hopping_Matrix_wrapper(1, dev_spin_eo1_dn, dev_spin_eo2_dn);
    #endif

  #else

    to_host(help1, dev_spin_eo2_up, h2d_spin_up, dev_spinsize);
    Hopping_Matrix(OE, help2, help1);                                  // g_spinor_field[DUM_MATRIX]   = (M_oe) * g_spinor_field[DUM_MATRIX+2]
    to_device(dev_spin_eo1_up, help2, h2d_spin_up, dev_spinsize);

    to_host(help3, dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize);
    Hopping_Matrix(OE, help4, help3);                                  // g_spinor_field[DUM_MATRIX+1] = (M_oe) * g_spinor_field[DUM_MATRIX+3]
    to_device(dev_spin_eo1_dn, help4, h2d_spin_dn, dev_spinsize);

  #endif


  ////////////
  // (M_oo) //
  ////////////

  dev_mul_one_pm_imubar_gamma5<<>>(spinin_dn, dev_spin_eo2_up, +1.0);  // dev_spin_eo2_up = (1 + imubar) * spinin_dn
  dev_mul_one_pm_imubar_gamma5<<>>(spinin_up, dev_spin_eo2_dn, -1.0);  // dev_spin_eo2_dn = (1 - imubar) * spinin_up


  // CUBLAS:                                                           // remember: this is (M_oo) * (spinin_dn, spinin_up):
  cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_up, 1);
                                     // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*spinin_up = (1+imubar)*spinin_dn - epsbar*spinin_up
  cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_dn, 1);
                                     // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*spinin_dn = (1-imubar)*spinin_up - epsbar*spinin_dn


  ///////////////////////////////////////
  // (M_oo)  -  (M_oe) (Mee^-1) (M_eo) //
  ///////////////////////////////////////

  // CUBLAS:                                                           // this is ((M_oo) - (M_oe)(Mee^-1)(M_eo)) * (spinin_dn, spinin_up):
  cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1);
                                     // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up = (1+imubar) * spinin_dn - epsbar * spinin_up
                                     //                                   - (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_dn - (M_oe)*nrm*epsbar*(M_eo) * spinin_up
  cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1);
                                     // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn = (1-imubar) * spinin_up - epsbar * spinin_dn
                                     //                                   - (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_up - (M_oe)*nrm*epsbar*(M_eo) * spinin_dn


  ////////////
  // gamma5 //
  ////////////

  dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up);                    // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up
  dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn);                    // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn


  ////////////////////
  // (a,b) -> (b,a) //
  ////////////////////

  dev_copy_spinor_field<<>>(dev_spin_eo2_dn, dev_spin_eo3_up);         // dev_spin_eo3_up = dev_spin_eo2_dn
  dev_copy_spinor_field<<>>(dev_spin_eo2_up, dev_spin_eo3_dn);         // dev_spin_eo3_dn = dev_spin_eo2_up


  ///////////////////////////////////                                   ///////////////////////
  //        Q_tilde(2x2)           //                                   // (Q_tilde) * (b,a) //
  ///////////////////////////////////                                   ///////////////////////


  ////////////////////////////
  // (M_oe) (Mee^-1) (M_eo) //
  ////////////////////////////

  #ifndef CHECK_HOPPING_MATRIX

    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(dev_spin_eo3_up, 0);
      dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo3_up, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0);    // dev_spin_eo1_up = (M_eo) * dev_spin_eo3_up
    #else
      xchange_field_wrapper(dev_spin_eo3_up, 0);
      Hopping_Matrix_wrapper(0, dev_spin_eo1_up, dev_spin_eo3_up);
    #endif

    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(dev_spin_eo3_dn, 0);
      dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo3_dn, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0);    // dev_spin_eo1_dn = (M_eo) * dev_spin_eo3_dn
    #else
      xchange_field_wrapper(dev_spin_eo3_dn, 0);
      Hopping_Matrix_wrapper(0, dev_spin_eo1_dn, dev_spin_eo3_dn);
    #endif

  #else

    to_host(help1, dev_spin_eo3_up, h2d_spin_up, dev_spinsize);
    Hopping_Matrix(EO, help2, help1);                                  // g_spinor_field[DUM_MATRIX]   = (M_eo) * g_spinor_field[DUM_MATRIX+7]
    to_device(dev_spin_eo1_up, help2, h2d_spin_up, dev_spinsize);

    to_host(help3, dev_spin_eo3_dn, h2d_spin_dn, dev_spinsize);
    Hopping_Matrix(EO, help4, help3);                                  // g_spinor_field[DUM_MATRIX+1] = (M_eo) * g_spinor_field[DUM_MATRIX+6]
    to_device(dev_spin_eo1_dn, help4, h2d_spin_dn, dev_spinsize);

  #endif


  // (1 -/+ imubar) applied to the hopping result
  dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0);    // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up = (1 - imubar)*(M_eo) * dev_spin_eo3_up
  dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0);    // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn = (1 + imubar)*(M_eo) * dev_spin_eo3_dn


  // CUBLAS: add the epsbar cross terms
  cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1);
                                     // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn = (1 - imubar)*(M_eo) * dev_spin_eo3_up + epsbar * (M_eo) * dev_spin_eo3_dn
  cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1);
                                     // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up = (1 + imubar)*(M_eo) * dev_spin_eo3_dn + epsbar * (M_eo) * dev_spin_eo3_up

  // CUBLAS: scale by nrm
  cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1);           // dev_spin_eo2_up = nrm * dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * dev_spin_eo3_up + nrm*epsbar*(M_eo) * dev_spin_eo3_dn
  cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1);           // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * dev_spin_eo3_dn + nrm*epsbar*(M_eo) * dev_spin_eo3_up


  #ifndef CHECK_HOPPING_MATRIX

    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(dev_spin_eo2_up, 1);
      dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1);
    #else
      xchange_field_wrapper(dev_spin_eo2_up, 1);
      Hopping_Matrix_wrapper(1, dev_spin_eo1_up, dev_spin_eo2_up);
    #endif

    #ifndef HOPPING_DEBUG
      xchange_field_wrapper(dev_spin_eo2_dn, 1);
      dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1);
    #else
      xchange_field_wrapper(dev_spin_eo2_dn, 1);
      Hopping_Matrix_wrapper(1, dev_spin_eo1_dn, dev_spin_eo2_dn);
    #endif

  #else

    to_host(help1, dev_spin_eo2_up, h2d_spin_up, dev_spinsize);
    Hopping_Matrix(OE, help2, help1);                                  // l_strange = (M_oe) * g_spinor_field[DUM_MATRIX+2]
    to_device(dev_spin_eo1_up, help2, h2d_spin_up, dev_spinsize);

    to_host(help3, dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize);
    Hopping_Matrix(OE, help4, help3);                                  // l_charm   = (M_oe) * g_spinor_field[DUM_MATRIX+3]
    to_device(dev_spin_eo1_dn, help4, h2d_spin_dn, dev_spinsize);

  #endif


  ////////////
  // (M_oo) //
  ////////////

  dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo3_up, dev_spin_eo2_up, +1.0);    // dev_spin_eo2_up = (1 + imubar) * dev_spin_eo3_up
  dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo3_dn, dev_spin_eo2_dn, -1.0);    // dev_spin_eo2_dn = (1 - imubar) * dev_spin_eo3_dn


  // CUBLAS:                                                           // remember: this is (M_oo) * (dev_spin_eo3_up, dev_spin_eo3_dn):
  cublasSaxpy (N_floats, -g_epsbar, (float *) dev_spin_eo3_dn, 1, (float *) dev_spin_eo2_up, 1);
                                     // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*dev_spin_eo3_dn = (1+imubar)*dev_spin_eo3_up - epsbar*dev_spin_eo3_dn
  cublasSaxpy (N_floats, -g_epsbar, (float *) dev_spin_eo3_up, 1, (float *) dev_spin_eo2_dn, 1);
                                     // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*dev_spin_eo3_up = (1-imubar)*dev_spin_eo3_dn - epsbar*dev_spin_eo3_up


  ///////////////////////////////////////
  // (M_oo)  -  (M_oe) (Mee^-1) (M_eo) //
  ///////////////////////////////////////

  // CUBLAS:                                                           // this is ( (M_oo) - (M_oe) (Mee^-1) (M_eo) ) * (dev_spin_eo3_up, dev_spin_eo3_dn)
  cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1);
                                     // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up = (1+imubar) * dev_spin_eo3_up - epsbar * dev_spin_eo3_dn
                                     //                                   - (M_oe)*nrm*(1-imubar)*(M_eo) * dev_spin_eo3_up - (M_oe)*nrm*epsbar*(M_eo) * dev_spin_eo3_dn
  cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1);
                                     // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn = (1-imubar) * dev_spin_eo3_dn - epsbar * dev_spin_eo3_up
                                     //                                   - (M_oe)*nrm*(1+imubar)*(M_eo) * dev_spin_eo3_dn - (M_oe)*nrm*epsbar*(M_eo) * dev_spin_eo3_up


  ////////////
  // gamma5 //
  ////////////

  dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up);                    // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up
  dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn);                    // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn


  /*
  ////////////
  // output //   // output is already done by setting  dev_spin_eo2_up/dn = spinout_up/dn
  ////////////

  dev_copy_spinor_field<<>>(dev_spin_eo2_up, spinout_up);              // spinout_up = dev_spin_eo2_up
  dev_copy_spinor_field<<>>(dev_spin_eo2_dn, spinout_dn);              // spinout_dn = dev_spin_eo2_dn
  */


  #ifdef CHECK_HOPPING_MATRIX
    // Release the host staging buffers; their last use is the final to_device() above.
    // NOTE(review): assumes to_host()/to_device() have finished with the host buffer when they return -- confirm.
    free(help1);
    free(help2);
    free(help3);
    free(help4);
  #endif


  xchange_test = 0;      // re-enable the field exchange in the host Hopping_Matrix()


  return;

}//matrix_mpi_debug4()
+ + + + + + + + + +// this replaces Q_Qdagger_ND() for debugging purposes in the parallel case !! + +// first debugging RESULT: +// the host Hopping_Matrix() works for the halfspinor/gaugecopy-option enabled or disabled !! +// contrary to the use of Hopping_Matrix() in the original code +// fatal error or what?! :D +// the error has to concern the fields (lengths, usage, order etc.)
+ +// the Hopping_Matrix on device is not implemented here + +void matrix_mpi_debug10 (spinor * const l_strange, spinor * const l_charm, // output + spinor * const k_strange, spinor * const k_charm) { // input + + + int N_sites = VOLUME/2; // #lattice sites + int N_floats = 24*VOLUME/2; // #floats + + // additional for the debugging purposes + size_t dev_spinsize = 6*(VOLUME+RAND)/2*sizeof(dev_spinor); + + + int gridsize; // auxiliary + int blocksize; // auxiliary + + blocksize = 128; + int blocksize1 = blocksize; // here: dev_zero_spinor_field , dev_copy_spinor_field + int gridsize1 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize2 = blocksize; // passed: dev_Hopping_Matrix + int gridsize2 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize3 = blocksize; // passed: dev_mul_one_pm_imubar_gamma5 + int gridsize3 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize4 = blocksize; // passed: dev_gamma5 + int gridsize4 = (int) (VOLUME/2/blocksize) + 1; + + blocksize = 128; + int blocksize5 = blocksize; // passed: dev_copy_spinor_field + int gridsize5 = (int) (VOLUME/2/blocksize) + 1; + + dev_spinor * spinin_up; + dev_spinor * spinin_dn; + dev_spinor * spinout_up; + dev_spinor * spinout_dn; + + + cudaMalloc((void **) &dev_spin_eo2_up, dev_spinsize); + cudaMalloc((void **) &dev_spin_eo2_dn, dev_spinsize); + cudaMalloc((void **) &spinin_up, dev_spinsize); + cudaMalloc((void **) &spinin_dn, dev_spinsize); + cudaMalloc((void **) &spinout_up, dev_spinsize); + cudaMalloc((void **) &spinout_dn, dev_spinsize); + + + + double nrm = 1./(1. + g_mubar*g_mubar - g_epsbar*g_epsbar); // nrm = (1 + mubar^2 - epsbar^2)^-1 + + + + + if (g_proc_id == 0) + printf("This is matrix_mpi_debug10(). 
"); + + + + + /* FIRST THE Qhat(2x2)^dagger PART */ // we will apply Qhat(2x2) with charme and strange interchanged + // which is equivalent to apply Qhat(2x2)^dagger + + /* Here the M_oe Mee^-1 M_eo implementation */ + + #ifndef CHECK_HOPPING_MATRIX + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX] , k_charm); // g_spinor_field[DUM_MATRIX] = (M_eo) * k_charm // notice the order + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k_strange); // g_spinor_field[DUM_MATRIX+1] = (M_eo) * k_strange // of k_charm and k_strange + #else + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(spinin_dn, 0); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, spinin_dn, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * spinin_dn + #else + Hopping_Matrix_wrapper(0, dev_spin_eo1_up, spinin_dn); + #endif + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(spinin_up, 0); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, spinin_up, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = (M_eo) * spinin_up + #else + Hopping_Matrix_wrapper(0, dev_spin_eo1_dn, spinin_up); + #endif + #endif + + + + + #ifndef CHECK_IMUGAMMA5 + // remark: here the factor GAMMA5 is not written: + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX]); // g_spinor_field[DUM_MATRIX+2] = (1 - imubar)*(M_eo) * g_spinor_field[DUM_MATRIX] + // = (1 - imubar)*(M_eo) * k_charm + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1]); // g_spinor_field[DUM_MATRIX+3] = (1 + imubar)*(M_eo) * g_spinor_field[DUM_MATRIX+1] + // = (1 + imubar)*(M_eo) * k_strange + #else + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, 
dev_spinsize); + + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_CUBLAS1 + assign_add_mul_r(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+1], g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+2] = g_spinor_field[DUM_MATRIX+2] + epsbar*g_spinor_field[DUM_MATRIX+1] + // = (1 - imubar)*(M_eo) * k_charm + epsbar*(M_eo) * k_strange + assign_add_mul_r(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX] , g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+3] = g_spinor_field[DUM_MATRIX+3] + epsbar*g_spinor_field[DUM_MATRIX] + // = (1 + imubar)*(M_eo) * k_strange + epsbar*(M_eo) * k_charm + + mul_r(g_spinor_field[DUM_MATRIX+2], nrm, g_spinor_field[DUM_MATRIX+2], VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = nrm * g_spinor_field[DUM_MATRIX+2] + // = nrm * ( (1 - imubar)*(M_eo) * k_charm + epsbar*(M_eo) * k_strange ) + // = nrm*(1 - imubar)*(M_eo)*k_charm + nrm*epsbar*(M_eo)*k_strange + + mul_r(g_spinor_field[DUM_MATRIX+3], nrm, g_spinor_field[DUM_MATRIX+3], VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = nrm * g_spinor_field[DUM_MATRIX+3] + // = nrm * ( (1 + imubar)*(M_eo) * k_strange + epsbar*(M_eo) * k_charm ) + // = nrm*(1 + imubar)*(M_eo)*k_strange + nrm*epsbar*(M_eo)*k_charm + #else + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_up, dev_spinsize); + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = nrm * dev_spin_eo2_up + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, 
h2d_spin_up, dev_spinsize); + + + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], h2d_spin_dn, dev_spinsize); + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_HOPPING_MATRIX + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+2]); // g_spinor_field[DUM_MATRIX] = (M_oe) * g_spinor_field[DUM_MATRIX+2] + // = (M_oe)*nrm*(1 - imubar)*(M_eo)*k_charm + (M_oe)*nrm*epsbar*(M_eo)*k_strange + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+3]); // g_spinor_field[DUM_MATRIX+1] = (M_oe) * g_spinor_field[DUM_MATRIX+3] + // = (M_oe)*nrm*(1 + imubar)*(M_eo)*k_strange + (M_oe)*nrm*epsbar*(M_eo) * k_charm + #else + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo2_up, 1); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #else + Hopping_Matrix_wrapper(1, dev_spin_eo1_up, dev_spin_eo2_up); + #endif + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo2_dn, 1); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #else + Hopping_Matrix_wrapper(1, dev_spin_eo1_dn, dev_spin_eo2_dn); + #endif + #endif + + + + + /* Here the M_oo implementation */ + + #ifndef CHECK_IMUGAMMA5 + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX+2], k_charm); // g_spinor_field[DUM_MATRIX+2] = (1 + imubar) * k_charm + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+3], k_strange); // g_spinor_field[DUM_MATRIX+3] 
= (1 - imubar) * k_strange + #else + to_device(spinin_dn, k_charm, h2d_spin_up, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(spinin_dn, dev_spin_eo2_up, +1.0); // dev_spin_eo2_up = (1 + imubar) * spinin_dn + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(spinin_up, k_strange, h2d_spin_dn, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(spinin_up, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * spinin_up + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_CUBLAS2 + assign_add_mul_r(g_spinor_field[DUM_MATRIX+2], k_strange, -g_epsbar, VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = g_spinor_field[DUM_MATRIX+2] - epsbar*k_strange + // = (1 + imubar) * k_charm - epsbar*k_strange + assign_add_mul_r(g_spinor_field[DUM_MATRIX+3], k_charm , -g_epsbar, VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = g_spinor_field[DUM_MATRIX+3] - epsbar*k_charm + // = (1 - imubar) * k_strange - epsbar*k_charm + #else + to_device(spinin_up, k_strange, h2d_spin_up, dev_spinsize); + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*spinin_up + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(spinin_dn, k_charm, h2d_spin_dn, dev_spinsize); + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*spinin_dn + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + // here the (M_oo - M_oe Mee^-1 M_eo) implementation + + #ifndef CHECK_CUBLAS3 + diff(g_spinor_field[DUM_MATRIX+4], g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX] , VOLUME/2); 
+ // g_spinor_field[DUM_MATRIX+4] = g_spinor_field[DUM_MATRIX+2] - g_spinor_field[DUM_MATRIX] + // = (1 + imubar) * k_charm - epsbar * k_strange + // - (M_oe)*nrm*(1 - imubar)*(M_eo) * k_charm - (M_oe)*nrm*epsbar*(M_eo) * k_strange + diff(g_spinor_field[DUM_MATRIX+5], g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1], VOLUME/2); + // g_spinor_field[DUM_MATRIX+5] = g_spinor_field[DUM_MATRIX+3] - g_spinor_field[DUM_MATRIX+1] + // = (1 - imubar) * k_strange - epsbar * k_charm + // - (M_oe)*nrm*(1 + imubar)*(M_eo) * k_strange - (M_oe)*nrm*epsbar*(M_eo) * k_charm + #else + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up + to_host(g_spinor_field[DUM_MATRIX+4], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn + to_host(g_spinor_field[DUM_MATRIX+5], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + /* and finally the GAMMA5 multiplication */ + + #ifndef CHECK_GAMMA5 + gamma5(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+4], VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = gamma5 * g_spinor_field[DUM_MATRIX+4] ?!= l_charm' + gamma5(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+5], VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = gamma5 * g_spinor_field[DUM_MATRIX+5] ?!= l_strange' + #else + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+4], h2d_spin_up, dev_spinsize); + dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * 
dev_spin_eo2_up + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+5], h2d_spin_dn, dev_spinsize); + dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + /* The normalisation by the max. eigenvalue is done twice at the end */ // what ?? + + + /* We have to reassigin as follows to avoid overwriting */ + /* Recall in fact that Q^hat = tau_1 Q tau_1 , hence */ + + /* ABOVE: dum_matrix+2 is l_charm goes to dum_matrix+6 :BELOW */ + /* ABOVE: dum_matrix+3 is l_strange goes to dum_matrix+7 :BELOW */ + + + + + #ifndef CHECK_COPY + assign(g_spinor_field[DUM_MATRIX+6], g_spinor_field[DUM_MATRIX+2], VOLUME/2); // g_spinor_field[DUM_MATRIX+6] = g_spinor_field[DUM_MATRIX+2] ?!= l_charm' + assign(g_spinor_field[DUM_MATRIX+7], g_spinor_field[DUM_MATRIX+3], VOLUME/2); // g_spinor_field[DUM_MATRIX+7] = g_spinor_field[DUM_MATRIX+3] ?!= l_strange' + #else + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + dev_copy_spinor_field<<>>(dev_spin_eo2_dn, spinin_up); // spinin_up = dev_spin_eo2_dn + to_host(g_spinor_field[DUM_MATRIX+6], spinin_up, h2d_spin_up, dev_spinsize); + + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + dev_copy_spinor_field<<>>(dev_spin_eo2_up, spinin_dn); // spinin_dn = dev_spin_eo2_up + to_host(g_spinor_field[DUM_MATRIX+7], spinin_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + /* AND THEN THE Qhat(2x2) PART */ // notice the swapping ! 
+ + + /* Here the M_oe Mee^-1 M_eo implementation */ // SWAP: + + #ifndef CHECK_HOPPING_MATRIX + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+7]); // g_spinor_field[DUM_MATRIX] = (M_eo) * g_spinor_field[DUM_MATRIX+7] = (M_eo) * l_strange' // notice the order + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+6]); // g_spinor_field[DUM_MATRIX+1] = (M_eo) * g_spinor_field[DUM_MATRIX+6] = (M_eo) * l_charm' // of l_strange and l_charm + #else + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo3_up, 0); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo3_up, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * dev_spin_eo3_up + #else + Hopping_Matrix_wrapper(0, dev_spin_eo1_up, dev_spin_eo3_up); + #endif + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo3_dn, 0); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo3_dn, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = (M_eo) * dev_spin_eo3_dn + #else + Hopping_Matrix_wrapper(0, dev_spin_eo1_dn, dev_spin_eo3_dn); + #endif + #endif + + + + + #ifndef CHECK_IMUGAMMA5 + // remark: here we don't need g_mu = -g_mu // remark: here the factor GAMMA5 is not written: + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX]); // g_spinor_field[DUM_MATRIX+2] = (1 - imubar) * g_spinor_field[DUM_MATRIX] = (1 - imubar)*(M_eo) * l_strange + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1]); // g_spinor_field[DUM_MATRIX+3] = (1 + imubar) * g_spinor_field[DUM_MATRIX+1] = (1 + imubar)*(M_eo) * l_charm + #else + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up + to_host(g_spinor_field[DUM_MATRIX+2], 
dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + #ifndef CHECK_CUBLAS1 + assign_add_mul_r(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+1], g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+2] = g_spinor_field[DUM_MATRIX+2] + epsbar * g_spinor_field[DUM_MATRIX+1] + // = (1 - imubar)*(M_eo)*l_strange + epsbar * (M_eo) * l_charm + assign_add_mul_r(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX] , g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+3] = g_spinor_field[DUM_MATRIX+3] + epsbar * g_spinor_field[DUM_MATRIX] + // = (1 + imubar)*(M_eo)*l_charm + epsbar * (M_eo) * l_strange + + mul_r(g_spinor_field[DUM_MATRIX+2], nrm, g_spinor_field[DUM_MATRIX+2], VOLUME/2); // g_spinor_field[DUM_MATRIX+2] = nrm * g_spinor_field[DUM_MATRIX+2] = nrm*(1 - imubar)*(M_eo)*l_strange + nrm*epsbar*(M_eo)*l_charm + mul_r(g_spinor_field[DUM_MATRIX+3], nrm, g_spinor_field[DUM_MATRIX+3], VOLUME/2); // g_spinor_field[DUM_MATRIX+3] = nrm * g_spinor_field[DUM_MATRIX+3] = nrm*(1 + imubar)*(M_eo)*l_charm + nrm*epsbar*(M_eo)*l_strange + #else + to_device(dev_spin_eo1_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_up, dev_spinsize); + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX+2], h2d_spin_up, dev_spinsize); + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = nrm * dev_spin_eo2_up + to_host(g_spinor_field[DUM_MATRIX+2], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + + to_device(dev_spin_eo1_up, g_spinor_field[DUM_MATRIX], h2d_spin_dn, dev_spinsize); + 
to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+3], h2d_spin_dn, dev_spinsize); + cublasSaxpy (N_floats, g_epsbar, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up + cublasSscal (N_floats, nrm, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn + to_host(g_spinor_field[DUM_MATRIX+3], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_HOPPING_MATRIX + Hopping_Matrix(OE, l_strange, g_spinor_field[DUM_MATRIX+2]); // l_strange = (M_oe) * g_spinor_field[DUM_MATRIX+2] = (M_oe)*nrm*(1 - imubar)*(M_eo)*l_strange + (M_oe)*nrm*epsbar*(M_eo)*l_charm + Hopping_Matrix(OE, l_charm , g_spinor_field[DUM_MATRIX+3]); // l_charm = (M_oe) * g_spinor_field[DUM_MATRIX+3] = (M_oe)*nrm*(1 + imubar)*(M_eo)*l_charm + (M_oe)*nrm*epsbar*(M_eo)*l_strange + #else + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo2_up, 1); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #else + Hopping_Matrix_wrapper(1, dev_spin_eo1_up, dev_spin_eo2_up); + #endif + // xchange + #ifndef HOPPING_DEBUG + xchange_field_wrapper(dev_spin_eo2_dn, 1); + #endif + #ifndef HOPPING_DEBUG + dev_Hopping_Matrix<<>>(dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #else + Hopping_Matrix_wrapper(1, dev_spin_eo1_dn, dev_spin_eo2_dn); + #endif + #endif + + + + + /* Here the M_oo implementation */ + + #ifndef CHECK_IMUGAMMA5 + mul_one_plus_imubar (g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+7]); // g_spinor_field[DUM_MATRIX] = (1 + imubar) * g_spinor_field[DUM_MATRIX+7] = (1 + imubar) * l_strange + mul_one_minus_imubar(g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+6]); // g_spinor_field[DUM_MATRIX+1] = (1 - imubar) * g_spinor_field[DUM_MATRIX+6] = (1 - imubar) * l_charm + #else + to_device(spinin_up, 
g_spinor_field[DUM_MATRIX+7], h2d_spin_up, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(spinin_up, dev_spin_eo2_up, +1.0); // dev_spin_eo2_up = (1 + imubar) * spinin_up + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(spinin_dn, g_spinor_field[DUM_MATRIX+6], h2d_spin_dn, dev_spinsize); + dev_mul_one_pm_imubar_gamma5<<>>(spinin_dn, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * spinin_dn + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + #ifndef CHECK_CUBLAS2 + assign_add_mul_r(g_spinor_field[DUM_MATRIX] , g_spinor_field[DUM_MATRIX+6], -g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX] = g_spinor_field[DUM_MATRIX] - epsbar * g_spinor_field[DUM_MATRIX+6] + // = (1 + imubar) * l_strange - epsbar * l_charm + assign_add_mul_r(g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+7], -g_epsbar, VOLUME/2); + // g_spinor_field[DUM_MATRIX+1] = g_spinor_field[DUM_MATRIX+1] - epsbar * g_spinor_field[DUM_MATRIX+7] + // = (1 - imubar) * l_charm - epsbar * l_strange + #else + to_device(spinin_dn, g_spinor_field[DUM_MATRIX+6], h2d_spin_up, dev_spinsize); + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_dn, 1, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*spinin_dn + to_host(g_spinor_field[DUM_MATRIX], dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(spinin_up, g_spinor_field[DUM_MATRIX+7], h2d_spin_dn, dev_spinsize); + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + cublasSaxpy (N_floats, -g_epsbar, (float *) spinin_up, 1, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*spinin_up + to_host(g_spinor_field[DUM_MATRIX+1], dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + // here the (M_oo - M_oe Mee^-1 M_eo) implementation + + #ifndef CHECK_CUBLAS3 + 
diff(l_strange, g_spinor_field[DUM_MATRIX] , l_strange, VOLUME/2); // l_strange = g_spinor_field[DUM_MATRIX] - l_strange + // = (1 + imubar) * l_strange - epsbar * l_charm + // - (M_oe)*nrm*(1 - imubar)*(M_eo) * l_strange + (M_oe)*nrm*epsbar*(M_eo) * l_charm + + diff(l_charm , g_spinor_field[DUM_MATRIX+1], l_charm , VOLUME/2); // l_charm = g_spinor_field[DUM_MATRIX+1] - l_charm + // = (1 - imubar) * l_charm - epsbar * l_strange + // - (M_oe)*nrm*(1 + imubar)*(M_eo) * l_charm + (M_oe)*nrm*epsbar*(M_eo) * l_strange + #else + to_device(dev_spin_eo1_up, l_strange, h2d_spin_up, dev_spinsize); + to_device(dev_spin_eo2_up, g_spinor_field[DUM_MATRIX], h2d_spin_up, dev_spinsize); + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_up, 1, (float *) dev_spin_eo2_up, 1); // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up + to_host(l_strange, dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(dev_spin_eo1_dn, l_charm, h2d_spin_dn, dev_spinsize); + to_device(dev_spin_eo2_dn, g_spinor_field[DUM_MATRIX+1], h2d_spin_dn, dev_spinsize); + cublasSaxpy (N_floats, -1.0, (float *) dev_spin_eo1_dn, 1, (float *) dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn + to_host(l_charm, dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + /* and finally the GAMMA5 multiplication */ + + #ifndef CHECK_GAMMA5 + gamma5(l_strange, l_strange, VOLUME/2); // l_strange = gamma5 * l_strange + gamma5(l_charm , l_charm , VOLUME/2); // l_charm = gamma5 * l_charm + #else + to_device(dev_spin_eo2_up, l_strange, h2d_spin_up, dev_spinsize); + dev_gamma5<<>>(dev_spin_eo2_up, dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up + to_host(l_strange, dev_spin_eo2_up, h2d_spin_up, dev_spinsize); + + to_device(dev_spin_eo2_dn, l_charm, h2d_spin_dn, dev_spinsize); + dev_gamma5<<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn + to_host(l_charm, dev_spin_eo2_dn, h2d_spin_dn, dev_spinsize); + #endif + + + + + /* At the end, 
the normalisation by the max. eigenvalue */ + /* Twice phmc_invmaxev since we consider here D Ddag !!! */ + mul_r(l_charm, phmc_invmaxev*phmc_invmaxev, l_charm, VOLUME/2); + mul_r(l_strange, phmc_invmaxev*phmc_invmaxev, l_strange, VOLUME/2); + return; + +}//matrix__mpi_debug10() + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/HEADER.h b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/HEADER.h new file mode 100644 index 0000000000000000000000000000000000000000..4123f3c6ccd18233c9a023d8cc1c9af08352118a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/HEADER.h @@ -0,0 +1,330 @@ + +#ifndef _MIXED_SOLVE_H_ + + + + +/////////////////// +// own functions // +/////////////////// + + +// eo, nd + +void to_device (dev_spinor * device, spinor * host, dev_spinor * auxiliary, int size); + +void to_host (spinor * host, dev_spinor * device, dev_spinor * auxiliary, int size); + +__global__ void he_cg_init_nd_additional (float param_mubar, float param_epsbar); + +__global__ void dev_mul_one_pm_imubar_gamma5 (dev_spinor * sin, dev_spinor * sout, REAL sign); + +void init_mixedsolve_eo_nd(su3** gf); + +void finalize_mixedsolve_eo_nd(void); + +void matrix_multiplication32 (dev_spinor * , dev_spinor * , dev_spinor * , dev_spinor * , int, int, int, int, int, int, int, int); + +void flopcount(unsigned long long int& total, int add); + +extern "C" void benchmark_eo_nd (spinor * const Q_up, spinor * const Q_dn, int N); + +int cg_eo_nd (dev_su3_2v * gf, + dev_spinor * P_up, dev_spinor * P_dn, + dev_spinor * Q_up, dev_spinor * Q_dn, + int max_iter, + int check_abs , int check_rel, + double eps_abs, double eps_rel ); + +extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn, + spinor * Q_up, spinor * Q_dn, + int max_iter, double eps_sq, int rel_prec); + +void set_global_sizes(); + +float cublasDot_wrapper(int size, float * A, int incx, float * B, int incy); + + +// eo, nd, MPI + +void convert2double_spin_mpi (dev_spinor* spin, spinor* 
h2d, int start, int end); +void convert2REAL4_spin_mpi (spinor* spin, dev_spinor* h2d, int start, int end); +void to_device_mpi (dev_spinor * device, spinor * host, dev_spinor * auxiliary, int size, int start, int end); +void to_host_mpi (spinor * host, dev_spinor * device, dev_spinor * auxiliary, int size, int start, int end); +void xchange_field_wrapper (dev_spinor * dev_spin, int ieo); +void Hopping_Matrix_wrapper (int ieo, dev_spinor * out, dev_spinor * in); +void su3to2vf4_mpi(su3** gf, dev_su3_2v* h2d_gf); +void su3to8_mpi(su3** gf, dev_su3_8* h2d_gf); +void init_iseven(); +void init_nnspinor_eo_mpi(); +void init_idxgauge_mpi(); +void init_gpu_indexfields(); +void free_gpu_indexfields(); +__global__ void he_cg_init_nd_additional_mpi (int param_VOLUMEPLUSRAND, int param_RAND, int rank, int nproc); +void init_mixedsolve_eo_nd_mpi(su3** gf); +void finalize_mixedsolve_eo_nd_mpi(void); + +__global__ void dev_Hopping_Matrix_mpi (const dev_su3_2v * gf, const dev_spinor * sin, dev_spinor * sout, + int * dev_iup, int * dev_idn, int * dev_eo2lexic, int * dev_lexic2eosub, + int ieo); + +void matrix_multiplication32_mpi (dev_spinor * spinout_up, dev_spinor * spinout_dn, + dev_spinor * spinin_up , dev_spinor * spinin_dn , + int gridsize1, int blocksize1, int gridsize2, int blocksize2, + int gridsize3, int blocksize3, int gridsize4, int blocksize4); + +int cg_eo_nd_mpi (dev_su3_2v * gf, + dev_spinor * P_up, dev_spinor * P_dn, + dev_spinor * Q_up, dev_spinor * Q_dn, + int max_iter, + int check_abs , int check_rel, + double eps_abs, double eps_rel ); + +extern "C" int mixedsolve_eo_nd_mpi (spinor * P_up, spinor * P_dn, + spinor * Q_up, spinor * Q_dn, + int max_iter, double eps_sq, int rel_prec); + + + + +// ASYNC + +__global__ void dev_Hopping_Matrix_ASYNC (const dev_su3_2v * gf, + const dev_spinor * sin, dev_spinor * sout, + const int * gfindex_site, const int* gfindex_nextsite, const int * nn_evenodd, + const int eo, + int start, int size); + +void HOPPING_ASYNC 
(dev_su3_2v * gf, + dev_spinor * spinin, dev_spinor * spinout, + int * gfindex_site, int * gfindex_nextsite, int * nn_evenodd, + int ieo, + int gridsize, int blocksize); + +void matrix_multiplication32_mpi_ASYNC (dev_spinor * spinout_up, dev_spinor * spinout_dn, + dev_spinor * spinin_up , dev_spinor * spinin_dn , + int gridsize1, int blocksize1, int gridsize2, int blocksize2, + int gridsize3, int blocksize3, int gridsize4, int blocksize4); + + + + + + + + + +///////////////////////// +// Florian's functions // +///////////////////////// + +// eo, non-eo, non-nd +templatestruct MixedsolveParameter; +// MixedsolveOperator: abstract interface class. gpu() and check() are declared pure virtual (=0) and must be overridden by every concrete operator; +// gpuInit()/gpuDeinit() and checkInit()/checkDeinit() are ordinary virtual members that are only declared here (no body in this header). +// The gpu* members take dev_spinorM(RealT)/dev_su3_2vM(RealT) pointers plus dim3 launch dimensions; the check* members take spinor pointers and a volume. +// NOTE(review): presumably this is the base for the MixedsolveOperatorT template arguments of dev_cg() and mixed_solveT() declared further down - confirm against the implementation. +templateclass MixedsolveOperator // interface class +{ +public: + virtual ~MixedsolveOperator(); + + virtual void gpuInit (dev_spinorM(RealT)* spinin,dev_spinorM(RealT)* spinTmp,dev_spinorM(RealT)* spinout,dev_su3_2vM(RealT)* gf,int* dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim); + virtual void gpu (dev_spinorM(RealT)* spinin,dev_spinorM(RealT)* spinTmp,dev_spinorM(RealT)* spinout,dev_su3_2vM(RealT)* gf,int* dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim)=0; //linkage error due to pure virtual functions with gcc 4.3.2 + virtual void gpuDeinit(dev_spinorM(RealT)* spininout,dev_spinorM(RealT)* spinTmp,dev_su3_2vM(RealT)* gf,int* dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim,const RealT scaleparam); + + virtual void checkInit (spinor* const spinin,spinor* const spinTmp,spinor* const spinout,int volume); + virtual void check (spinor* const conjungateBasisPSpinin,spinor* const spinout,const int volume)=0; + virtual void checkDeinit(spinor* const spinin,spinor* const spinTmp,spinor* const spinout,int volume); +}; + +// cublasWrapper +float cublasDot(int n,const float* x,int incx,const float* y,int incy); +double cublasDot(int n,const double* x,int incx,const double* y,int incy); +void cublasAxpy(int n,float alpha,const float* x,int incx,float* y,int incy); +void cublasAxpy(int n,double alpha,const double* x,int incx,double* y,int
incy); +void cublasScal(int n,float alpha,float* x,int incx); +void cublasScal(int n,double alpha,double* x,int incx); +void cublasCopy(int n,const float* x,int incx,float* y,int incy); +void cublasCopy(int n,const double* x,int incx,double* y,int incy); + + +//__device__ inline dev_complex dev_cconj (dev_complex c); +template__device__ inline dev_complexT dev_cconj (dev_complexT c); +//__device__ inline void dev_ccopy(dev_complex* von, dev_complex* nach); +template__device__ inline void dev_ccopy(dev_complexT* von, dev_complexT* nach); +//__device__ inline REAL dev_cabssquare (dev_complex c); +template__device__ inline RealT dev_cabssquare (dev_complexT c); +//__device__ inline REAL dev_cabsolute (dev_complex c); +template__device__ inline RealT dev_cabsolute (dev_complexT c); +//__device__ inline dev_complex dev_crealmult(dev_complex c1, REAL real); +template__device__ inline dev_complexT dev_crealmult(dev_complexT c1, RealT real); +//__device__ inline dev_complex dev_cmult (dev_complex c1, dev_complex c2); +template__device__ inline dev_complexT dev_cmult (dev_complexT c1, dev_complexT c2); +//__device__ inline dev_complex dev_cadd (dev_complex c1, dev_complex c2); +template__device__ inline dev_complexT dev_cadd (dev_complexT c1, dev_complexT c2); +//__device__ inline dev_complex dev_cdiv(dev_complex c1, dev_complex c2); +template__device__ inline dev_complexT dev_cdiv(dev_complexT c1, dev_complexT c2); +//__device__ inline dev_complex dev_csub(dev_complex c1, dev_complex c2); +template__device__ inline dev_complexT dev_csub(dev_complexT c1, dev_complexT c2); +//__device__ inline dev_complex dev_initcomplex(REAL re, REAL im); +template__device__ inline dev_complexT dev_initcomplex(RealT re, RealT im); +//__device__ inline void dev_copy_spinor(dev_spinor *i1, dev_spinor *i2); +template__device__ inline void dev_copy_spinor(typename dev_spinorT::type *i1, typename dev_spinorT::type *i2); +//__device__ inline void dev_zero_spinor(dev_spinor *sin); 
+template__device__ inline void dev_zero_spinor(typename dev_spinorT::type *sin); +//__device__ inline void dev_skalarmult_add_assign_spinor(dev_spinor *in, REAL lambda,dev_spinor * in2, dev_spinor * out); +template__device__ inline void dev_skalarmult_add_assign_spinor(typename dev_spinorT::type *in, RealT lambda, typename dev_spinorT::type * in2, typename dev_spinorT::type * out); +//__device__ inline void dev_complexmult_add_assign_spinor(dev_spinor * in, dev_complex lambda,dev_spinor * in2, dev_spinor * out); +template__device__ inline void dev_complexmult_add_assign_spinor( typename dev_spinorT::type* in, dev_complexT lambda, typename dev_spinorT::type* in2, typename dev_spinorT::type* out); +//__device__ inline void dev_complexcgmult_add_assign_spinor(dev_spinor * in, dev_complex lambda,dev_spinor * in2, dev_spinor * out); +template__device__ inline void dev_complexcgmult_add_assign_spinor( typename dev_spinorT::type * in, dev_complexT lambda, typename dev_spinorT::type* in2, typename dev_spinorT::type* out); +//__device__ void inline dev_skalarmult_spinor(dev_spinor * in, dev_complex lambda, dev_spinor * out); +template__device__ void inline dev_skalarmult_spinora( typename dev_spinorT::type* in, dev_complexT lambda, typename dev_spinorT::type* out); +//__device__ void inline dev_skalarmult_gamma5_spinor(dev_spinor * out, dev_complex lambda, dev_spinor * in); +template__device__ void inline dev_skalarmult_gamma5_spinor(typename dev_spinorT::type* out, dev_complexT lambda, typename dev_spinorT::type* in); +//__device__ void inline dev_realmult_spinor(dev_spinor * in, REAL lambda); +template__device__ void inline dev_realmult_spinor(typename dev_spinorT::type* in, RealT lambda); +//__device__ void inline dev_realmult_spinor_assign(dev_spinor* out, REAL lambda, dev_spinor* in); +template__device__ void inline dev_realmult_spinor_assign(typename dev_spinorT::type* out, RealT lambda, typename dev_spinorT::type* in); +//__device__ void 
dev_assign_realmult_add_spinor(dev_spinor* out, REAL lambda, dev_spinor* in1, dev_spinor* in2); +template__device__ void dev_assign_realmult_add_spinor( typename dev_spinorT::type* out, RealT lambda, typename dev_spinorT::type* in1, typename dev_spinorT::type* in2); +//__device__ inline void dev_add_spinor_assign(dev_spinor * i1, dev_spinor * i2); +template__device__ inline void dev_add_spinor_assign(typename dev_spinorT::type * i1, typename dev_spinorT::type * i2); +//__device__ inline void dev_sub_spinor_assign(dev_spinor * i1, dev_spinor * i2); +template__device__ inline void dev_sub_spinor_assign(typename dev_spinorT::type * i1, typename dev_spinorT::type * i2); +//__device__ void dev_su3MtV_spintex(dev_su3 M, int pos, dev_spinor * out); +template __device__ void dev_su3MtV_spintex(dev_su3M(RealT) M, int pos, dev_spinorM(RealT) * out); +//__device__ void dev_su3MtV(dev_su3 M, const dev_spinor * s, dev_spinor * out); +template__device__ void dev_su3MtV(typename dev_su3T::type M, const typename dev_spinorT::type * s, typename dev_spinorT::type * out); +//__device__ void dev_su3MdaggertV(dev_su3 M, dev_spinor * s, dev_spinor * out); +template__device__ void dev_su3MdaggertV(typename dev_su3T::type M, typename dev_spinorT::type * s, typename dev_spinorT::type * out); +//__device__ void dev_Gamma0(dev_spinor * in); +template__device__ void dev_Gamma0(typename dev_spinorT::type * in); +//__device__ void dev_Gamma3(dev_spinor * in); +template__device__ void dev_Gamma3(typename dev_spinorT::type * in); +//__device__ void dev_Gamma2(dev_spinor * in); +template__device__ void dev_Gamma2(typename dev_spinorT::type * in); +//__device__ void dev_Gamma1(dev_spinor * in); +template__device__ void dev_Gamma1(typename dev_spinorT::type * in); +//__device__ void dev_Gamma5(dev_spinor * in); +template__device__ void dev_Gamma5(typename dev_spinorT::type * in); +//__device__ void dev_Gamma5_assign(dev_spinor* out, dev_spinor* in); +template__device__ void 
dev_Gamma5_assign(typename dev_spinorT::type* out, typename dev_spinorT::type* in); +//__device__ void dev_GammatV(int mu, dev_spinor * in); +template__device__ void dev_GammatV(int mu, typename dev_spinorT::type * in); +//__device__ void dev_reconstructgf_2vtexref (const dev_su3_2v* field, int pos, dev_su3* gf); +//__device__ void dev_reconstructgf_2vtexref_dagger (const dev_su3_2v* field, int pos, dev_su3* gf); +//__device__ void dev_reconstructgf_8texref (const dev_su3_2v * field, int pos, dev_su3* gf); +//__device__ void dev_reconstructgf_8texref_dagger (const dev_su3_2v* field,int pos, dev_su3* gf); +template __device__ void dev_reconstructgf_2vtexref (const typename dev_su3_2vT::type* field, int pos, typename dev_su3T::type* gf); +template __device__ void dev_reconstructgf_2vtexref_dagger (const typename dev_su3_2vT::type* field, int pos, typename dev_su3T::type* gf); +template __device__ void dev_reconstructgf_8texref (const typename dev_su3_2vT::type* field, int pos, typename dev_su3T::type* gf); +template __device__ void dev_reconstructgf_8texref_dagger (const typename dev_su3_2vT::type* field, int pos, typename dev_su3T::type* gf); +template __global__ void dev_gamma5(typename dev_spinorT::type * sin, typename dev_spinorT::type * sout); +__global__ void dev_swapmu(); +//__global__ void dev_mul_one_pm_imu_inv(dev_spinor* sin, dev_spinor* sout, const REAL sign); +template__global__ void dev_mul_one_pm_imu_inv(dev_spinorM(RealT)* sin, dev_spinorM(RealT)* sout, const RealT sign); +//__global__ void dev_mul_one_pm_imu_sub_mul_gamma5(dev_spinor* sin1, dev_spinor* sin2, dev_spinor* sout, const REAL sign); +template__global__ void dev_mul_one_pm_imu_sub_mul_gamma5(dev_spinorM(RealT)* sin1, dev_spinorM(RealT)* sin2, dev_spinorM(RealT)* sout, const RealT sign); +template__device__ void dev_kappaP1_plus (dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, RealT kappa); +template__device__ void dev_kappaP1_minus(dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, RealT 
kappa); +template__device__ void dev_kappaP2_plus (dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, RealT kappa); +template__device__ void dev_kappaP2_minus(dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, RealT kappa); +template__device__ void dev_kappaP3_plus (dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, RealT kappa); +template__device__ void dev_kappaP3_minus(dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, RealT kappa); +template__device__ void dev_kappaP0_plus (dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, dev_complexM(RealT) kappa); +template__device__ void dev_kappaP0_minus(dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, dev_complexM(RealT) kappa); +//__global__ void dev_Hopping_Matrix(const dev_su3_2v * gf, const dev_spinor * sin, dev_spinor * sout, const int * gfindex_site, const int* gfindex_nextsite, const int * nn_evenodd, const int eo); +template__global__ void dev_Hopping_Matrix(const dev_su3_2vM(RealT) * gf, const dev_spinorM(RealT) * sin, dev_spinorM(RealT) * sout, const int * gfindex_site, const int* gfindex_nextsite, const int * nn_evenodd, const int eo); +//extern "C" void dev_Qtm_pm_psi(dev_spinor* spinin, dev_spinor* spinout, int gridsize, int blocksize, int gridsize2, int blocksize2); +templatevoid dev_Qtm_pm_psi(dev_spinorM(RealT)* spinin, dev_spinorM(RealT)* spinout, int gridsize, int blocksize, int gridsize2, int blocksize2, MixedsolveParameter& mixedsolveParameter); +//__global__ void dev_tm_dirac_kappa(dev_su3_2v * gf, dev_spinor * sin, dev_spinor * sout, int * dev_nn); +template +__global__ void dev_tm_dirac_kappa +( + typename dev_su3_2vT::type * gf, + typename dev_spinorT::type * sin, + typename dev_spinorT::type * sout, + int * dev_nn +); +//extern "C" void dev_tm_dirac_dagger_kappa(dev_su3_2v * gf,dev_spinor* spinin, dev_spinor* spinout, int *grid, int * nn_grid, REAL* output,REAL* erg, int xsize, int ysize); +templatevoid dev_tm_dirac_dagger_kappa(typename dev_su3_2vT::type * gf,typename dev_spinorT::type* 
spinin,typename dev_spinorT::type* spinout, int *grid, int * nn_grid, RealT* output,RealT* erg, int xsize, int ysize); +__device__ inline REAL dev_skalarprod_spinor(dev_spinor * s1, dev_spinor * s2); +__device__ inline REAL dev_squarenorm_spinor(dev_spinor * s1); +__device__ inline REAL dev_squarenorm_spinor_tex(int pos); +__global__ void dev_skalarprod_spinor_field2(dev_spinor* s1, dev_spinor* s2, REAL* erg); +__global__ void dev_squarenorm_spinor_field(dev_spinor* s1, REAL* erg); +__global__ void dev_skalarprod_spinor_field(dev_spinor* s1, dev_spinor* s2, REAL* erg); +template__global__ void dev_zero_spinor_field(typename dev_spinorT::type* s1); +//__global__ void dev_copy_spinor_field(dev_spinor* s1, dev_spinor* s2); +template__global__ void dev_copy_spinor_field(dev_spinorM(RealT1)* s1, dev_spinorM(RealT2)* s2); +//__global__ void dev_skalarmult_add_assign_spinor_field(dev_spinor* s1, REAL lambda, dev_spinor* s2, dev_spinor* so); +template__global__ void dev_skalarmult_add_assign_spinor_field(dev_spinorM(RealT)* s1, RealT lambda, dev_spinorM(RealT)* s2, dev_spinorM(RealT)* so); +//__global__ void dev_skalarmult_spinor_field(dev_spinor* s1, REAL lambda, dev_spinor* so); +template__global__ void dev_skalarmult_spinor_field(dev_spinorM(RealT)* s1, RealT lambda, dev_spinorM(RealT)* so); +//__global__ void dev_complexmult_spinor_field(dev_spinor* s1, dev_complex lambda, dev_spinor* so); +template__global__ void dev_complexmult_spinor_field(dev_spinorM(RealT)* s1, dev_complexM(RealT) lambda, dev_spinorM(RealT)* so); +__global__ void he_cg_init (int* grid, REAL param_kappa, REAL param_mu, dev_complex k0, dev_complex k1, dev_complex k2, dev_complex k3); +extern "C" int find_devices(); +extern "C" int bind_texture_spin(dev_spinor* s, int i); +extern "C" int unbind_texture_spin(int i); +extern "C" int bind_texture_gf(dev_su3_2v * gf); +extern "C" int unbind_texture_gf(); +extern "C" int bind_texture_nn(int* nn); +extern "C" int unbind_texture_nn(); +extern "C" void 
test_operator(dev_su3_2v * gf,dev_spinor* spinin, dev_spinor* spinout, dev_spinor* spin0, dev_spinor* spin1, dev_spinor* spin2, dev_spinor* spin3, dev_spinor* spin4, int *grid, int * nn_grid, REAL* output,REAL* erg, int xsize, int ysize); +//extern "C" int dev_cg(dev_su3_2v * gf,dev_spinor* spinin, dev_spinor* spinout, dev_spinor* spin0, dev_spinor* spin1, dev_spinor* spin2, dev_spinor* spin3, dev_spinor* spin4, int *grid, int * nn_grid, int rescalekappa); +templateclass MixedsolveOperatorT>int dev_cg (dev_su3_2vM(RealT)* gf, dev_spinorM(RealT)* spinin, dev_spinorM(RealT)* spinout, dev_spinorM(RealT)* spin0, dev_spinorM(RealT)* spin1, dev_spinorM(RealT)* spin2, dev_spinorM(RealT)* spin3, dev_spinorM(RealT)* spin4, int* grid, int* nn_grid, int rescalekappa,MixedsolveOperatorT& mixedsolveOperator, REALD initial_sourcesquarenorm, bool rel_prec, double finalEps/*, bool& reachedFinalPrecision*/); +//extern "C" int dev_cg_eo(dev_su3_2v * gf,dev_spinor* spinin, dev_spinor* spinout, dev_spinor* spin0, dev_spinor* spin1, dev_spinor* spin2, dev_spinor* spin3, dev_spinor* spin4, int *grid, int * nn_grid, REAL epsfinal); +templateint dev_cg_eo(dev_su3_2vM(RealT)* gf, dev_spinorM(RealT)* spinin, dev_spinorM(RealT)* spinout, dev_spinorM(RealT)* spin0, dev_spinorM(RealT)* spin1, dev_spinorM(RealT)* spin2, dev_spinorM(RealT)* spin3, dev_spinorM(RealT)* spin4, int* grid, int* nn_grid, RealT epsfinal, MixedsolveParameter& mixedsolveParameter); +void initnn(); +void initnn_eo(); +void shownn_eo(); +void show_su3(su3 gf1); +void show_dev_su3(dev_su3 gf1); +void lptovec(int k); +void shownn(); +//void su3to2vf4(su3** gf, dev_su3_2v* h2d_gf); +template void su3to2vf4(su3** gf, typename dev_su3_2vT::type* h2d_gf); +//void su3to8(su3** gf, dev_su3_8* h2d_gf); +template void su3to8(su3** gf, typename dev_su3_8T::type* h2d_gf); +void reconstructgf_2v (dev_su3* gf); +template__global__ void dev_check_gauge_reconstruction_8(typename dev_su3_2vT::type* gf, int pos, typename dev_su3T::type * 
outgf1, typename dev_su3T::type* outgf2); +//void check_gauge_reconstruction_8(su3 ** gf1, dev_su3_2v * gf2, int ind1, int mu); +templatevoid check_gauge_reconstruction_8(su3 ** gf1, dev_su3_2vM(RealT) * gf2, int ind1, int mu, MixedsolveParameter& mixedsolveParameter); +void reconstructgf_8 (dev_su3_8 * h2d_gf, dev_su3* gf); +//void showcompare_gf(int t, int x, int y, int z, int mu); +templatevoid showcompare_gf(int t, int x, int y, int z, int mu, MixedsolveParameter& mixedsolveParameter); +//void convert2double_spin(dev_spinor* spin, spinor* h2d); +template void convert2double_spin(typename dev_spinorT::type* spin, spinor* h2d); +//void convert2REAL4_spin(spinor* spin, dev_spinor* h2d); +template void convert2REAL4_spin(spinor* spin, typename dev_spinorT::type* h2d); +//void init_mixedsolve(su3** gf); +templateMixedsolveParameter* init_mixedsolve(su3** gf); +//void init_mixedsolve_eo(su3** gf); +templateMixedsolveParameter* init_mixedsolve_eo(su3** gf); +//void finalize_mixedsolve(); +templatevoid finalize_mixedsolve(MixedsolveParameter* mixedsolveParameterP); +//extern "C" int mixed_solve (spinor * const P, spinor * const Q, const int max_iter, double eps, const int rel_prec,const int N); +templateclass MixedsolveOperatorT>int mixed_solveT (spinor* const P, spinor* const Q, const int max_iter, double eps, const int rel_prec,const int N, MixedsolveOperatorT& mixedsolveOperator); +extern "C" int mixed_solve (spinor* const P, spinor* const Q, const int max_iter, double eps, const int rel_prec,const int N); +extern "C" int mixed_solveD (spinor* const P, spinor* const Q, const int max_iter, double eps, const int rel_prec,const int N); +extern "C" int mixed_solve_DiracDaggerDirac (spinor* const P, spinor* const Q, const int max_iter, double eps, const int rel_prec,const int N); +extern "C" int mixed_solve_DiracDaggerDiracD (spinor* const P, spinor* const Q, const int max_iter, double eps, const int rel_prec,const int N); +extern "C" int 
mixed_solve_DiracDaggerDiracDiracDaggerDirac (spinor* const P, spinor* const Q, const int max_iter, double eps, const int rel_prec,const int N); +extern "C" int mixed_solve_DiracDaggerDiracDiracDaggerDiracD(spinor* const P, spinor* const Q, const int max_iter, double eps, const int rel_prec,const int N); + +void dummy (dev_spinor* a, dev_spinor* b); +//void benchmark(spinor * const Q); +templatevoid benchmark(spinor * const Q,MixedsolveParameter& mixedsolveParameter); +//extern "C" int mixed_solve_eo (spinor * const P, spinor * const Q, const int max_iter, double eps, const int rel_prec, const int N); +templateint mixed_solve_eoT (spinor * const P, spinor * const Q, const int max_iter, double eps, const int rel_prec, const int N); +extern "C" int mixed_solve_eo (spinor * const P, spinor * const Q, const int max_iter, double eps, const int rel_prec, const int N); +extern "C" int mixed_solve_eoD (spinor * const P, spinor * const Q, const int max_iter, double eps, const int rel_prec, const int N); + + + +#define _MIXED_SOLVE_H_ + +#endif + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/HOWTO b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/HOWTO new file mode 100644 index 0000000000000000000000000000000000000000..33bec68e51eea9baba7c1693db57d362aaa61343 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/HOWTO @@ -0,0 +1,30 @@ + + + +The CUDA code can be switched on via ./configure by adding the following +arguments to it: + +--enable-gpu=yes +--with-cuda=/usr/local/cuda/lib or any other path, where libcuda.so, + libcudart.so etc. are located +--with-cudacompileargs=string additional arguments to nvcc + +Examples to --with-cudacompileargs: + "--gpu-architecture sm_13 --use_fast_math -O3" for devices with compute capability < 2.0 + "-c -prec-sqrt=false -prec-div=false -Xptxas -dlcm=ca -O3" for devices with compute capability 2.0 + + +A proper installation of CUDA and nvcc is required. 
+ +For devices with compute capability = 2.0 (Fermi cards) the definition +#define USETEXTURE in GPU/cudadefs.h should be commented out in order to +gain more performance. + +By commenting out #define GF_8 (#define TEMPORALGAUGE) in GPU/cudadefs.h +the reconstruction of the gauge field (the usage of temporal gauge for the +gauge fields) can be switched off. This results in lower performance. + +A sample input file can be found in sample-invert0_gpu.input. + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/Hopping_Matrix.cuh b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/Hopping_Matrix.cuh new file mode 100644 index 0000000000000000000000000000000000000000..2ed19dd57274ccf3f8107df45ec3198e448a8515 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/Hopping_Matrix.cuh @@ -0,0 +1,1298 @@ +/*********************************************************************** + * + * Copyright (C) 2010 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. 
+ * + * + * File: Hopping_Matrix.cuh + * + * CUDA Hopping_Matrix and associated functions + * + * + * + **************************************************************************/ + + + + + + +//-kappa(r - gamma_mu) +template +__device__ void dev_kappaP1_plus(dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, RealT kappa){ + + (*(out+0)).x -= kappa*( (*(in+0)).x - (*(in+4)).w); + (*(out+0)).y -= kappa*( (*(in+0)).y + (*(in+4)).z); + (*(out+0)).z -= kappa*( (*(in+0)).z - (*(in+5)).y); + (*(out+0)).w -= kappa*( (*(in+0)).w + (*(in+5)).x); + + (*(out+1)).x -= kappa*((*(in+1)).x - (*(in+5)).w); + (*(out+1)).y -= kappa*((*(in+1)).y + (*(in+5)).z); + (*(out+1)).z -= kappa*((*(in+1)).z - (*(in+3)).y); + (*(out+1)).w -= kappa*((*(in+1)).w + (*(in+3)).x); + + (*(out+2)).x -= kappa*((*(in+2)).x - (*(in+3)).w); + (*(out+2)).y -= kappa*((*(in+2)).y + (*(in+3)).z); + (*(out+2)).z -= kappa*((*(in+2)).z - (*(in+4)).y); + (*(out+2)).w -= kappa*((*(in+2)).w + (*(in+4)).x); + + (*(out+3)).x -= kappa*((*(in+3)).x + (*(in+1)).w); + (*(out+3)).y -= kappa*((*(in+3)).y - (*(in+1)).z); + (*(out+3)).z -= kappa*((*(in+3)).z + (*(in+2)).y); + (*(out+3)).w -= kappa*((*(in+3)).w - (*(in+2)).x); + + (*(out+4)).z -= kappa*( (*(in+4)).z + (*(in+0)).y); + (*(out+4)).w -= kappa*( (*(in+4)).w - (*(in+0)).x); + (*(out+4)).x -= kappa*((*(in+4)).x + (*(in+2)).w); + (*(out+4)).y -= kappa*((*(in+4)).y - (*(in+2)).z); + + (*(out+5)).x -= kappa*( (*(in+5)).x + (*(in+0)).w); + (*(out+5)).y -= kappa*( (*(in+5)).y - (*(in+0)).z); + (*(out+5)).z -= kappa*((*(in+5)).z + (*(in+1)).y); + (*(out+5)).w -= kappa*((*(in+5)).w - (*(in+1)).x); + +} + + +//-kappa(r + gamma_mu) +template +__device__ void dev_kappaP1_minus(dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, RealT kappa){ + + (*(out+0)).x -= kappa*( (*(in+0)).x + (*(in+4)).w); + (*(out+0)).y -= kappa*( (*(in+0)).y - (*(in+4)).z); + (*(out+0)).z -= kappa*( (*(in+0)).z + (*(in+5)).y); + (*(out+0)).w -= kappa*( (*(in+0)).w - (*(in+5)).x); + + (*(out+1)).x -= 
kappa*((*(in+1)).x + (*(in+5)).w); + (*(out+1)).y -= kappa*((*(in+1)).y - (*(in+5)).z); + (*(out+1)).z -= kappa*((*(in+1)).z + (*(in+3)).y); + (*(out+1)).w -= kappa*((*(in+1)).w - (*(in+3)).x); + + (*(out+2)).x -= kappa*((*(in+2)).x + (*(in+3)).w); + (*(out+2)).y -= kappa*((*(in+2)).y - (*(in+3)).z); + (*(out+2)).z -= kappa*((*(in+2)).z + (*(in+4)).y); + (*(out+2)).w -= kappa*((*(in+2)).w - (*(in+4)).x); + + (*(out+3)).x -= kappa*((*(in+3)).x - (*(in+1)).w); + (*(out+3)).y -= kappa*((*(in+3)).y + (*(in+1)).z); + (*(out+3)).z -= kappa*((*(in+3)).z - (*(in+2)).y); + (*(out+3)).w -= kappa*((*(in+3)).w + (*(in+2)).x); + + (*(out+4)).z -= kappa*( (*(in+4)).z - (*(in+0)).y); + (*(out+4)).w -= kappa*( (*(in+4)).w + (*(in+0)).x); + (*(out+4)).x -= kappa*((*(in+4)).x - (*(in+2)).w); + (*(out+4)).y -= kappa*((*(in+4)).y + (*(in+2)).z); + + (*(out+5)).x -= kappa*( (*(in+5)).x - (*(in+0)).w); + (*(out+5)).y -= kappa*( (*(in+5)).y + (*(in+0)).z); + (*(out+5)).z -= kappa*((*(in+5)).z - (*(in+1)).y); + (*(out+5)).w -= kappa*((*(in+5)).w + (*(in+1)).x); + +} + + + + + +//-kappa(r - gamma_mu) +template +__device__ void dev_kappaP2_plus(dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, RealT kappa){ + + + (*(out+0)).x -= kappa*( (*(in+0)).x + (*(in+4)).z); + (*(out+0)).y -= kappa*( (*(in+0)).y + (*(in+4)).w); + (*(out+4)).z -= kappa*( (*(in+4)).z + (*(in+0)).x); + (*(out+4)).w -= kappa*( (*(in+4)).w + (*(in+0)).y); + + + (*(out+0)).z -= kappa*( (*(in+0)).z + (*(in+5)).x); + (*(out+0)).w -= kappa*( (*(in+0)).w + (*(in+5)).y); + (*(out+5)).x -= kappa*( (*(in+5)).x + (*(in+0)).z); + (*(out+5)).y -= kappa*( (*(in+5)).y + (*(in+0)).w); + + + (*(out+1)).x -= kappa*( (*(in+1)).x + (*(in+5)).z); + (*(out+1)).y -= kappa*( (*(in+1)).y + (*(in+5)).w); + (*(out+5)).z -= kappa*( (*(in+5)).z + (*(in+1)).x); + (*(out+5)).w -= kappa*( (*(in+5)).w + (*(in+1)).y); + + + (*(out+1)).z -= kappa*( (*(in+1)).z - (*(in+3)).x); + (*(out+1)).w -= kappa*( (*(in+1)).w - (*(in+3)).y); + (*(out+3)).x -= kappa*( 
(*(in+3)).x - (*(in+1)).z); + (*(out+3)).y -= kappa*( (*(in+3)).y - (*(in+1)).w); + + + (*(out+2)).x -= kappa*( (*(in+2)).x - (*(in+3)).z); + (*(out+2)).y -= kappa*( (*(in+2)).y - (*(in+3)).w); + (*(out+3)).z -= kappa*( (*(in+3)).z - (*(in+2)).x); + (*(out+3)).w -= kappa*( (*(in+3)).w - (*(in+2)).y); + + + (*(out+2)).z -= kappa*( (*(in+2)).z - (*(in+4)).x); + (*(out+2)).w -= kappa*( (*(in+2)).w - (*(in+4)).y); + (*(out+4)).x -= kappa*( (*(in+4)).x - (*(in+2)).z); + (*(out+4)).y -= kappa*( (*(in+4)).y - (*(in+2)).w); + + +} + + +//-kappa(r + gamma_mu) kappa reell !!!! +template +__device__ void dev_kappaP2_minus(dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, RealT kappa){ + + + (*(out+0)).x -= kappa*( (*(in+0)).x - (*(in+4)).z); + (*(out+0)).y -= kappa*( (*(in+0)).y - (*(in+4)).w); + (*(out+4)).z -= kappa*( (*(in+4)).z - (*(in+0)).x); + (*(out+4)).w -= kappa*( (*(in+4)).w - (*(in+0)).y); + + + (*(out+0)).z -= kappa*( (*(in+0)).z - (*(in+5)).x); + (*(out+0)).w -= kappa*( (*(in+0)).w - (*(in+5)).y); + (*(out+5)).x -= kappa*( (*(in+5)).x - (*(in+0)).z); + (*(out+5)).y -= kappa*( (*(in+5)).y - (*(in+0)).w); + + + (*(out+1)).x -= kappa*( (*(in+1)).x - (*(in+5)).z); + (*(out+1)).y -= kappa*( (*(in+1)).y - (*(in+5)).w); + (*(out+5)).z -= kappa*( (*(in+5)).z - (*(in+1)).x); + (*(out+5)).w -= kappa*( (*(in+5)).w - (*(in+1)).y); + + + (*(out+1)).z -= kappa*( (*(in+1)).z + (*(in+3)).x); + (*(out+1)).w -= kappa*( (*(in+1)).w + (*(in+3)).y); + (*(out+3)).x -= kappa*( (*(in+3)).x + (*(in+1)).z); + (*(out+3)).y -= kappa*( (*(in+3)).y + (*(in+1)).w); + + + (*(out+2)).x -= kappa*( (*(in+2)).x + (*(in+3)).z); + (*(out+2)).y -= kappa*( (*(in+2)).y + (*(in+3)).w); + (*(out+3)).z -= kappa*( (*(in+3)).z + (*(in+2)).x); + (*(out+3)).w -= kappa*( (*(in+3)).w + (*(in+2)).y); + + + (*(out+2)).z -= kappa*( (*(in+2)).z + (*(in+4)).x); + (*(out+2)).w -= kappa*( (*(in+2)).w + (*(in+4)).y); + (*(out+4)).x -= kappa*( (*(in+4)).x + (*(in+2)).z); + (*(out+4)).y -= kappa*( (*(in+4)).y + 
(*(in+2)).w); + + +} + + + +//-kappa(r - gamma_mu) kappa reell !!!! +template +__device__ void dev_kappaP3_plus(dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, RealT kappa){ + + (*(out+0)).x -= kappa*( (*(in+0)).x - (*(in+3)).y); + (*(out+0)).y -= kappa*( (*(in+0)).y + (*(in+3)).x); + (*(out+3)).x -= kappa*( (*(in+3)).x + (*(in+0)).y); + (*(out+3)).y -= kappa*( (*(in+3)).y - (*(in+0)).x); + + + (*(out+0)).z -= kappa*( (*(in+0)).z - (*(in+3)).w); + (*(out+0)).w -= kappa*( (*(in+0)).w + (*(in+3)).z); + (*(out+3)).z -= kappa*( (*(in+3)).z + (*(in+0)).w); + (*(out+3)).w -= kappa*( (*(in+3)).w - (*(in+0)).z); + + + (*(out+1)).x -= kappa*( (*(in+1)).x - (*(in+4)).y); + (*(out+1)).y -= kappa*( (*(in+1)).y + (*(in+4)).x); + (*(out+4)).x -= kappa*( (*(in+4)).x + (*(in+1)).y); + (*(out+4)).y -= kappa*( (*(in+4)).y - (*(in+1)).x); + + + (*(out+1)).z -= kappa*( (*(in+1)).z + (*(in+4)).w); + (*(out+1)).w -= kappa*( (*(in+1)).w - (*(in+4)).z); + (*(out+4)).z -= kappa*( (*(in+4)).z - (*(in+1)).w); + (*(out+4)).w -= kappa*( (*(in+4)).w + (*(in+1)).z); + + + (*(out+2)).x -= kappa*( (*(in+2)).x + (*(in+5)).y); + (*(out+2)).y -= kappa*( (*(in+2)).y - (*(in+5)).x); + (*(out+5)).x -= kappa*( (*(in+5)).x - (*(in+2)).y); + (*(out+5)).y -= kappa*( (*(in+5)).y + (*(in+2)).x); + + + (*(out+2)).z -= kappa*( (*(in+2)).z + (*(in+5)).w); + (*(out+2)).w -= kappa*( (*(in+2)).w - (*(in+5)).z); + (*(out+5)).z -= kappa*( (*(in+5)).z - (*(in+2)).w); + (*(out+5)).w -= kappa*( (*(in+5)).w + (*(in+2)).z); + +} + + +//-kappa(r + gamma_mu) kappa reell !!! 
+template +__device__ void dev_kappaP3_minus(dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, RealT kappa){ + + (*(out+0)).x -= kappa*( (*(in+0)).x + (*(in+3)).y); + (*(out+0)).y -= kappa*( (*(in+0)).y - (*(in+3)).x); + (*(out+3)).x -= kappa*( (*(in+3)).x - (*(in+0)).y); + (*(out+3)).y -= kappa*( (*(in+3)).y + (*(in+0)).x); + + + (*(out+0)).z -= kappa*( (*(in+0)).z + (*(in+3)).w); + (*(out+0)).w -= kappa*( (*(in+0)).w - (*(in+3)).z); + (*(out+3)).z -= kappa*( (*(in+3)).z - (*(in+0)).w); + (*(out+3)).w -= kappa*( (*(in+3)).w + (*(in+0)).z); + + + (*(out+1)).x -= kappa*( (*(in+1)).x + (*(in+4)).y); + (*(out+1)).y -= kappa*( (*(in+1)).y - (*(in+4)).x); + (*(out+4)).x -= kappa*( (*(in+4)).x - (*(in+1)).y); + (*(out+4)).y -= kappa*( (*(in+4)).y + (*(in+1)).x); + + + (*(out+1)).z -= kappa*( (*(in+1)).z - (*(in+4)).w); + (*(out+1)).w -= kappa*( (*(in+1)).w + (*(in+4)).z); + (*(out+4)).z -= kappa*( (*(in+4)).z + (*(in+1)).w); + (*(out+4)).w -= kappa*( (*(in+4)).w - (*(in+1)).z); + + + (*(out+2)).x -= kappa*( (*(in+2)).x - (*(in+5)).y); + (*(out+2)).y -= kappa*( (*(in+2)).y + (*(in+5)).x); + (*(out+5)).x -= kappa*( (*(in+5)).x + (*(in+2)).y); + (*(out+5)).y -= kappa*( (*(in+5)).y - (*(in+2)).x); + + + (*(out+2)).z -= kappa*( (*(in+2)).z - (*(in+5)).w); + (*(out+2)).w -= kappa*( (*(in+2)).w + (*(in+5)).z); + (*(out+5)).z -= kappa*( (*(in+5)).z + (*(in+2)).w); + (*(out+5)).w -= kappa*( (*(in+5)).w - (*(in+2)).z); + +} + + + + + + + +//-kappa(r - gamma_mu) +template +__device__ void dev_kappaP0_plus(dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, dev_complexM(RealT) kappa){ + + + (*(out+0)).x -= (*(in+0)).x*kappa.re - (*(in+0)).y*kappa.im; + (*(out+0)).y -= (*(in+0)).y*kappa.re + (*(in+0)).x*kappa.im; + (*(out+0)).x -= (*(in+3)).x*kappa.re - (*(in+3)).y*kappa.im; + (*(out+0)).y -= (*(in+3)).y*kappa.re + (*(in+3)).x*kappa.im; + + (*(out+3)).x -= (*(in+3)).x*kappa.re - (*(in+3)).y*kappa.im; + (*(out+3)).y -= (*(in+3)).y*kappa.re + (*(in+3)).x*kappa.im; + (*(out+3)).x -= 
(*(in+0)).x*kappa.re - (*(in+0)).y*kappa.im; + (*(out+3)).y -= (*(in+0)).y*kappa.re + (*(in+0)).x*kappa.im; + + + + (*(out+0)).z -= (*(in+0)).z*kappa.re - (*(in+0)).w*kappa.im; + (*(out+0)).w -= (*(in+0)).w*kappa.re + (*(in+0)).z*kappa.im; + (*(out+0)).z -= (*(in+3)).z*kappa.re - (*(in+3)).w*kappa.im; + (*(out+0)).w -= (*(in+3)).w*kappa.re + (*(in+3)).z*kappa.im; + + (*(out+3)).z -= (*(in+3)).z*kappa.re - (*(in+3)).w*kappa.im; + (*(out+3)).w -= (*(in+3)).w*kappa.re + (*(in+3)).z*kappa.im; + (*(out+3)).z -= (*(in+0)).z*kappa.re - (*(in+0)).w*kappa.im; + (*(out+3)).w -= (*(in+0)).w*kappa.re + (*(in+0)).z*kappa.im; + + + + (*(out+1)).x -= (*(in+1)).x*kappa.re - (*(in+1)).y*kappa.im; + (*(out+1)).y -= (*(in+1)).y*kappa.re + (*(in+1)).x*kappa.im; + (*(out+1)).x -= (*(in+4)).x*kappa.re - (*(in+4)).y*kappa.im; + (*(out+1)).y -= (*(in+4)).y*kappa.re + (*(in+4)).x*kappa.im; + + (*(out+4)).x -= (*(in+4)).x*kappa.re - (*(in+4)).y*kappa.im; + (*(out+4)).y -= (*(in+4)).y*kappa.re + (*(in+4)).x*kappa.im; + (*(out+4)).x -= (*(in+1)).x*kappa.re - (*(in+1)).y*kappa.im; + (*(out+4)).y -= (*(in+1)).y*kappa.re + (*(in+1)).x*kappa.im; + + + + (*(out+1)).z -= (*(in+1)).z*kappa.re - (*(in+1)).w*kappa.im; + (*(out+1)).w -= (*(in+1)).w*kappa.re + (*(in+1)).z*kappa.im; + (*(out+1)).z -= (*(in+4)).z*kappa.re - (*(in+4)).w*kappa.im; + (*(out+1)).w -= (*(in+4)).w*kappa.re + (*(in+4)).z*kappa.im; + + (*(out+4)).z -= (*(in+4)).z*kappa.re - (*(in+4)).w*kappa.im; + (*(out+4)).w -= (*(in+4)).w*kappa.re + (*(in+4)).z*kappa.im; + (*(out+4)).z -= (*(in+1)).z*kappa.re - (*(in+1)).w*kappa.im; + (*(out+4)).w -= (*(in+1)).w*kappa.re + (*(in+1)).z*kappa.im; + + + + (*(out+2)).x -= (*(in+2)).x*kappa.re - (*(in+2)).y*kappa.im; + (*(out+2)).y -= (*(in+2)).y*kappa.re + (*(in+2)).x*kappa.im; + (*(out+2)).x -= (*(in+5)).x*kappa.re - (*(in+5)).y*kappa.im; + (*(out+2)).y -= (*(in+5)).y*kappa.re + (*(in+5)).x*kappa.im; + + (*(out+5)).x -= (*(in+5)).x*kappa.re - (*(in+5)).y*kappa.im; + (*(out+5)).y -= 
(*(in+5)).y*kappa.re + (*(in+5)).x*kappa.im; + (*(out+5)).x -= (*(in+2)).x*kappa.re - (*(in+2)).y*kappa.im; + (*(out+5)).y -= (*(in+2)).y*kappa.re + (*(in+2)).x*kappa.im; + + + + (*(out+2)).z -= (*(in+2)).z*kappa.re - (*(in+2)).w*kappa.im; + (*(out+2)).w -= (*(in+2)).w*kappa.re + (*(in+2)).z*kappa.im; + (*(out+2)).z -= (*(in+5)).z*kappa.re - (*(in+5)).w*kappa.im; + (*(out+2)).w -= (*(in+5)).w*kappa.re + (*(in+5)).z*kappa.im; + + (*(out+5)).z -= (*(in+5)).z*kappa.re - (*(in+5)).w*kappa.im; + (*(out+5)).w -= (*(in+5)).w*kappa.re + (*(in+5)).z*kappa.im; + (*(out+5)).z -= (*(in+2)).z*kappa.re - (*(in+2)).w*kappa.im; + (*(out+5)).w -= (*(in+2)).w*kappa.re + (*(in+2)).z*kappa.im; + +} + + + + + + +//-kappa(r + gamma_mu) +template +__device__ void dev_kappaP0_minus(dev_spinorM(RealT) * out, dev_spinorM(RealT) * in, dev_complexM(RealT) kappa){ + + + (*(out+0)).x -= (*(in+0)).x*kappa.re - (*(in+0)).y*kappa.im; + (*(out+0)).y -= (*(in+0)).y*kappa.re + (*(in+0)).x*kappa.im; + (*(out+0)).x += (*(in+3)).x*kappa.re - (*(in+3)).y*kappa.im; + (*(out+0)).y += (*(in+3)).y*kappa.re + (*(in+3)).x*kappa.im; + + (*(out+3)).x -= (*(in+3)).x*kappa.re - (*(in+3)).y*kappa.im; + (*(out+3)).y -= (*(in+3)).y*kappa.re + (*(in+3)).x*kappa.im; + (*(out+3)).x += (*(in+0)).x*kappa.re - (*(in+0)).y*kappa.im; + (*(out+3)).y += (*(in+0)).y*kappa.re + (*(in+0)).x*kappa.im; + + + + (*(out+0)).z -= (*(in+0)).z*kappa.re - (*(in+0)).w*kappa.im; + (*(out+0)).w -= (*(in+0)).w*kappa.re + (*(in+0)).z*kappa.im; + (*(out+0)).z += (*(in+3)).z*kappa.re - (*(in+3)).w*kappa.im; + (*(out+0)).w += (*(in+3)).w*kappa.re + (*(in+3)).z*kappa.im; + + (*(out+3)).z -= (*(in+3)).z*kappa.re - (*(in+3)).w*kappa.im; + (*(out+3)).w -= (*(in+3)).w*kappa.re + (*(in+3)).z*kappa.im; + (*(out+3)).z += (*(in+0)).z*kappa.re - (*(in+0)).w*kappa.im; + (*(out+3)).w += (*(in+0)).w*kappa.re + (*(in+0)).z*kappa.im; + + + + (*(out+1)).x -= (*(in+1)).x*kappa.re - (*(in+1)).y*kappa.im; + (*(out+1)).y -= (*(in+1)).y*kappa.re + 
(*(in+1)).x*kappa.im; + (*(out+1)).x += (*(in+4)).x*kappa.re - (*(in+4)).y*kappa.im; + (*(out+1)).y += (*(in+4)).y*kappa.re + (*(in+4)).x*kappa.im; + + (*(out+4)).x -= (*(in+4)).x*kappa.re - (*(in+4)).y*kappa.im; + (*(out+4)).y -= (*(in+4)).y*kappa.re + (*(in+4)).x*kappa.im; + (*(out+4)).x += (*(in+1)).x*kappa.re - (*(in+1)).y*kappa.im; + (*(out+4)).y += (*(in+1)).y*kappa.re + (*(in+1)).x*kappa.im; + + + + (*(out+1)).z -= (*(in+1)).z*kappa.re - (*(in+1)).w*kappa.im; + (*(out+1)).w -= (*(in+1)).w*kappa.re + (*(in+1)).z*kappa.im; + (*(out+1)).z += (*(in+4)).z*kappa.re - (*(in+4)).w*kappa.im; + (*(out+1)).w += (*(in+4)).w*kappa.re + (*(in+4)).z*kappa.im; + + (*(out+4)).z -= (*(in+4)).z*kappa.re - (*(in+4)).w*kappa.im; + (*(out+4)).w -= (*(in+4)).w*kappa.re + (*(in+4)).z*kappa.im; + (*(out+4)).z += (*(in+1)).z*kappa.re - (*(in+1)).w*kappa.im; + (*(out+4)).w += (*(in+1)).w*kappa.re + (*(in+1)).z*kappa.im; + + + + (*(out+2)).x -= (*(in+2)).x*kappa.re - (*(in+2)).y*kappa.im; + (*(out+2)).y -= (*(in+2)).y*kappa.re + (*(in+2)).x*kappa.im; + (*(out+2)).x += (*(in+5)).x*kappa.re - (*(in+5)).y*kappa.im; + (*(out+2)).y += (*(in+5)).y*kappa.re + (*(in+5)).x*kappa.im; + + (*(out+5)).x -= (*(in+5)).x*kappa.re - (*(in+5)).y*kappa.im; + (*(out+5)).y -= (*(in+5)).y*kappa.re + (*(in+5)).x*kappa.im; + (*(out+5)).x += (*(in+2)).x*kappa.re - (*(in+2)).y*kappa.im; + (*(out+5)).y += (*(in+2)).y*kappa.re + (*(in+2)).x*kappa.im; + + + + (*(out+2)).z -= (*(in+2)).z*kappa.re - (*(in+2)).w*kappa.im; + (*(out+2)).w -= (*(in+2)).w*kappa.re + (*(in+2)).z*kappa.im; + (*(out+2)).z += (*(in+5)).z*kappa.re - (*(in+5)).w*kappa.im; + (*(out+2)).w += (*(in+5)).w*kappa.re + (*(in+5)).z*kappa.im; + + (*(out+5)).z -= (*(in+5)).z*kappa.re - (*(in+5)).w*kappa.im; + (*(out+5)).w -= (*(in+5)).w*kappa.re + (*(in+5)).z*kappa.im; + (*(out+5)).z += (*(in+2)).z*kappa.re - (*(in+2)).w*kappa.im; + (*(out+5)).w += (*(in+2)).w*kappa.re + (*(in+2)).z*kappa.im; + +} + + + + + + + + + + + +//applies the Hopping Part 
Even-Odd ! +//the gauge field is the complete gaugefield! +//the gauge field at the local point is reconstructed by 2*pos+eo where pos is the eo-position +//from 0..VOLUME/2-1, eo = 0 or 1 +//the positions in the gauge fields are passed in "gfindex_site" for gf's that are attached at +//the actual positions and in "gfindex_nextsite" for gf's that start at a position of the +//other eo-sublattice. +//for the hopping positions of the eo-spinor field we use on of the two dedicated eo-nn fields +//the boundary conditions are implemented as in Hopping_Matrix.c +//mult with complex conjugate k0,k1,k2,k3 in positive direction because +// psi(x+mu) != exp(i theta_mu) psi(x) +template +__global__ void dev_Hopping_Matrix(const dev_su3_2vM(RealT) * gf, const dev_spinorM(RealT) * sin, dev_spinorM(RealT) * sout, const int * gfindex_site, const int* gfindex_nextsite, const int * nn_evenodd, const int eo){ + + int pos,hoppos; + dev_spinorM(RealT) shelp1[6], ssum[6]; + __shared__ dev_su3_padM(RealT) gfsmem[BLOCK]; + + + + pos= threadIdx.x + blockDim.x*blockIdx.x; + int ix = threadIdx.x; + + + if(pos < dev_VOLUME){ + + + dev_zero_spinor(&(ssum[0])); // zero sum + #ifdef TEMPORALGAUGE + int spatialvol = dev_LX*dev_LY*dev_LZ; + #endif + + +//hopping term +//l==0,t + //positive direction + hoppos = nn_evenodd[8*pos]; + //hoppos = tex1Dfetch(nn_tex,8*pos); + //color + + #ifdef TEMPORALGAUGE + // gf == ID for t != T-1 => just read the spinor + #ifdef MPI + if ( ((gfindex_site[pos]) < (dev_T-1)*spatialvol) || (dev_rank < dev_nproc-1) ) { + //if ((gfindex_site[pos]) < (dev_T-1)*spatialvol) { // FAKE TEMPORALGAUGE + #else + if ((gfindex_site[pos]/spatialvol) != (dev_T-1) ) { + #endif + + #ifdef USETEXTURE + shelp1[0] = tex1Dfetch(spin_tex,6*hoppos); + shelp1[1] = tex1Dfetch(spin_tex,6*hoppos+1); + shelp1[2] = tex1Dfetch(spin_tex,6*hoppos+2); + shelp1[3] = tex1Dfetch(spin_tex,6*hoppos+3); + shelp1[4] = tex1Dfetch(spin_tex,6*hoppos+4); + shelp1[5] = tex1Dfetch(spin_tex,6*hoppos+5); + #else + 
shelp1[0] = sin[6*hoppos]; + shelp1[1] = sin[6*hoppos+1]; + shelp1[2] = sin[6*hoppos+2]; + shelp1[3] = sin[6*hoppos+3]; + shelp1[4] = sin[6*hoppos+4]; + shelp1[5] = sin[6*hoppos+5]; + #endif + } + else{ + // gf != ID for t == T-1 => mult spinor with gf + #ifdef GF_8 + dev_reconstructgf_8texref (gf, 4*(gfindex_site[pos]),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref(gf,4*(gfindex_site[pos]),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + } + #else + #ifdef GF_8 + dev_reconstructgf_8texref (gf, 4*(gfindex_site[pos]),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref(gf, 4*(gfindex_site[pos]),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + #endif + + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP0_plus(&(ssum[0]), &(shelp1[0]), dev_cconj(dev_complexT(dev_k0))); + #else + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk0),&(shelp1[0]), &(ssum[0])); + dev_Gamma0(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k0),&(shelp1[0]), &(ssum[0])); + #endif + +//l==0,t + //negative direction + hoppos = nn_evenodd[8*pos+4]; + //hoppos = tex1Dfetch(nn_tex,8*pos+4); + //color + #ifdef TEMPORALGAUGE + // gf == ID for t != T-1 => just read the spinor + #ifdef MPI + if ( ((gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol) || (dev_rank > 0) ) { + //if ((gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol) { // FAKE TEMPORALGAUGE + #else + if ((gfindex_nextsite[hoppos]/spatialvol) != (dev_T-1) ) { + #endif + + #ifdef USETEXTURE + shelp1[0] = tex1Dfetch(spin_tex,6*hoppos); + shelp1[1] = tex1Dfetch(spin_tex,6*hoppos+1); + shelp1[2] = tex1Dfetch(spin_tex,6*hoppos+2); + shelp1[3] = tex1Dfetch(spin_tex,6*hoppos+3); + shelp1[4] = 
tex1Dfetch(spin_tex,6*hoppos+4); + shelp1[5] = tex1Dfetch(spin_tex,6*hoppos+5); + #else + shelp1[0] = sin[6*hoppos]; + shelp1[1] = sin[6*hoppos+1]; + shelp1[2] = sin[6*hoppos+2]; + shelp1[3] = sin[6*hoppos+3]; + shelp1[4] = sin[6*hoppos+4]; + shelp1[5] = sin[6*hoppos+5]; + #endif + } + else{ + // gf != ID for t == T-1 => mult spinor with gf + #ifdef GF_8 + dev_reconstructgf_8texref_dagger (gf,4*gfindex_nextsite[hoppos],&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger(gf,4*gfindex_nextsite[hoppos],&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + } + #else + #ifdef GF_8 + dev_reconstructgf_8texref_dagger (gf,4*gfindex_nextsite[hoppos],&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger(gf,4*gfindex_nextsite[hoppos],&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + #endif + + //-kappa(r + gamma_mu) + #ifdef GF_8 + dev_kappaP0_minus(&(ssum[0]), &(shelp1[0]), dev_complexT(dev_k0)); + #else + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk0),&(shelp1[0]), &(ssum[0])); + dev_Gamma0(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk0),&(shelp1[0]), &(ssum[0])); + #endif + + + + +//l==3,z + //positive direction + hoppos = nn_evenodd[8*pos+3]; + //hoppos = tex1Dfetch(nn_tex,8*pos+3); + //color + #ifdef GF_8 + dev_reconstructgf_8texref (gf,4*(gfindex_site[pos])+(3),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref(gf, 4*(gfindex_site[pos])+(3),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP3_plus(&(ssum[0]), &(shelp1[0]), RealT(dev_k3.re)); + #else + 
dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk3),&(shelp1[0]), &(ssum[0])); + dev_Gamma3(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k3),&(shelp1[0]), &(ssum[0])); + #endif +//l==3,z + + //negative direction + hoppos = nn_evenodd[8*pos+7]; + //hoppos = tex1Dfetch(nn_tex,8*pos+7); + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger (gf,4*gfindex_nextsite[hoppos]+(3),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger(gf,4*gfindex_nextsite[hoppos]+(3),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r + gamma_mu) + #ifdef GF_8 + dev_kappaP3_minus(&(ssum[0]), &(shelp1[0]), RealT(dev_k3.re)); + #else + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk3),&(shelp1[0]), &(ssum[0])); + dev_Gamma3(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk3),&(shelp1[0]), &(ssum[0])); + #endif + + + + +//l==2,y + //positive direction + hoppos = nn_evenodd[8*pos+2]; + //hoppos = tex1Dfetch(nn_tex,8*pos+2); + //color + #ifdef GF_8 + dev_reconstructgf_8texref (gf,4*(gfindex_site[pos])+(2),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref(gf,4*(gfindex_site[pos])+(2),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP2_plus(&(ssum[0]), &(shelp1[0]), RealT(dev_k2.re)); + #else + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk2),&(shelp1[0]), &(ssum[0])); + dev_Gamma2(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k2),&(shelp1[0]), &(ssum[0])); + #endif + +//l==2,y + + + //negative direction + hoppos = nn_evenodd[8*pos+6]; + //hoppos = tex1Dfetch(nn_tex,8*pos+6); + //color + #ifdef GF_8 + 
dev_reconstructgf_8texref_dagger (gf,4*gfindex_nextsite[hoppos]+(2),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger(gf,4*gfindex_nextsite[hoppos]+(2),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r + gamma_mu) + #ifdef GF_8 + dev_kappaP2_minus(&(ssum[0]), &(shelp1[0]), RealT(dev_k2.re)); + #else + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk2),&(shelp1[0]), &(ssum[0])); + dev_Gamma2(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk2),&(shelp1[0]), &(ssum[0])); + #endif + + + +//l==1,x + //positive direction + hoppos = nn_evenodd[8*pos+1]; + //hoppos = tex1Dfetch(nn_tex,8*pos+1); + //color + #ifdef GF_8 + dev_reconstructgf_8texref (gf,4*(gfindex_site[pos])+(1),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref(gf,4*(gfindex_site[pos])+(1),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP1_plus(&(ssum[0]), &(shelp1[0]), RealT(dev_k1.re)); + #else + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk1),&(shelp1[0]), &(ssum[0])); + dev_Gamma1(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k1),&(shelp1[0]), &(ssum[0])); + #endif + + +//l==1,x + + //negative direction + hoppos = nn_evenodd[8*pos+5]; + //hoppos = tex1Dfetch(nn_tex,8*pos+5); + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger (gf,4*gfindex_nextsite[hoppos]+(1),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger(gf,4*gfindex_nextsite[hoppos]+(1),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix].m, &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r + 
gamma_mu) + #ifdef GF_8 + dev_kappaP1_minus(&(ssum[0]), &(shelp1[0]), RealT(dev_k1.re)); + #else + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk1),&(shelp1[0]), &(ssum[0])); + dev_Gamma1(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk1),&(shelp1[0]), &(ssum[0])); + #endif + + //copy to output spinor + dev_copy_spinor(&(ssum[0]),&(sout[6*pos])); + } +} + + + +/* + +//applies the Hopping Part Even-Odd ! +//the gauge field is the complete gaugefield! +//the gauge field at the local point is reconstructed by 2*pos+eo where pos is the eo-position +//from 0..VOLUME/2-1, eo = 0 or 1 +//the positions in the gauge fields are passed in "gfindex_site" for gf's that are attached at +//the actual positions and in "gfindex_nextsite" for gf's that start at a position of the +//other eo-sublattice. +//for the hopping positions of the eo-spinor field we use on of the two dedicated eo-nn fields +//the boundary conditions are implemented as in Hopping_Matrix.c +//mult with complex conjugate k0,k1,k2,k3 in positive direction because +// psi(x+mu) != exp(i theta_mu) psi(x) +__global__ void dev_Hopping_Matrix(dev_su3_2v * gf, dev_spinor * sin, dev_spinor * sout, int * gfindex_site,int* gfindex_nextsite, int * nn_evenodd, const int eo){ + + int pos,hoppos; + dev_spinor shelp1[6], ssum[6]; + __shared__ dev_su3 gfsmem[BLOCK]; + + + pos= threadIdx.x + blockDim.x*blockIdx.x; + int ix = threadIdx.x; + if(pos < dev_VOLUME){ + + dev_zero_spinor(&(ssum[0])); // zero sum +//hopping term +//l==0,t + //positive direction + hoppos = nn_evenodd[8*pos]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref(4*(gfindex_site[pos]),&(gfsmem[ix])); + #else + dev_reconstructgf_2vtexref(4*(gfindex_site[pos]),&(gfsmem[ix])); + #endif + + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + //-kappa(r - gamma_mu) + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk0),&(shelp1[0]), &(ssum[0])); + dev_Gamma0(&(shelp1[0])); + 
dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k0),&(shelp1[0]), &(ssum[0])); + + //negative direction + hoppos = nn_evenodd[8*pos+4]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(4*gfindex_nextsite[hoppos],&(gfsmem[ix])); + #else + dev_reconstructgf_2vtexref_dagger(4*gfindex_nextsite[hoppos],&(gfsmem[ix])); + #endif + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + //-kappa(r + gamma_mu) + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk0),&(shelp1[0]), &(ssum[0])); + dev_Gamma0(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk0),&(shelp1[0]), &(ssum[0])); + + +//l==3,z + //positive direction + hoppos = nn_evenodd[8*pos+3]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref(4*(gfindex_site[pos])+(3),&(gfsmem[ix])); + #else + dev_reconstructgf_2vtexref(4*(gfindex_site[pos])+(3),&(gfsmem[ix])); + #endif + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + //-kappa(r - gamma_mu) + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk3),&(shelp1[0]), &(ssum[0])); + dev_Gamma3(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k3),&(shelp1[0]), &(ssum[0])); + + //negative direction + hoppos = nn_evenodd[8*pos+7]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(4*gfindex_nextsite[hoppos]+(3),&(gfsmem[ix])); + #else + dev_reconstructgf_2vtexref_dagger(4*gfindex_nextsite[hoppos]+(3),&(gfsmem[ix])); + #endif + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + //-kappa(r + gamma_mu) + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk3),&(shelp1[0]), &(ssum[0])); + dev_Gamma3(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk3),&(shelp1[0]), &(ssum[0])); + + +//l==2,y + //positive direction + hoppos = nn_evenodd[8*pos+2]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref(4*(gfindex_site[pos])+(2),&(gfsmem[ix])); + #else + 
dev_reconstructgf_2vtexref(4*(gfindex_site[pos])+(2),&(gfsmem[ix])); + #endif + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + //-kappa(r - gamma_mu) + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk2),&(shelp1[0]), &(ssum[0])); + dev_Gamma2(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k2),&(shelp1[0]), &(ssum[0])); + + //negative direction + hoppos = nn_evenodd[8*pos+6]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(4*gfindex_nextsite[hoppos]+(2),&(gfsmem[ix])); + #else + dev_reconstructgf_2vtexref_dagger(4*gfindex_nextsite[hoppos]+(2),&(gfsmem[ix])); + #endif + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + //-kappa(r + gamma_mu) + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk2),&(shelp1[0]), &(ssum[0])); + dev_Gamma2(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk2),&(shelp1[0]), &(ssum[0])); + + +//l==1,x + //positive direction + hoppos = nn_evenodd[8*pos+1]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref(4*(gfindex_site[pos])+(1),&(gfsmem[ix])); + #else + dev_reconstructgf_2vtexref(4*(gfindex_site[pos])+(1),&(gfsmem[ix])); + #endif + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + //-kappa(r - gamma_mu) + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk1),&(shelp1[0]), &(ssum[0])); + dev_Gamma1(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k1),&(shelp1[0]), &(ssum[0])); + + //negative direction + hoppos = nn_evenodd[8*pos+5]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(4*gfindex_nextsite[hoppos]+(1),&(gfsmem[ix])); + #else + dev_reconstructgf_2vtexref_dagger(4*gfindex_nextsite[hoppos]+(1),&(gfsmem[ix])); + #endif + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + //-kappa(r + gamma_mu) + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk1),&(shelp1[0]), &(ssum[0])); + dev_Gamma1(&(shelp1[0])); + 
dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk1),&(shelp1[0]), &(ssum[0])); + + + //copy to output spinor + dev_copy_spinor(&(ssum[0]),&(sout[6*pos])); + } +} + + + +*/ + + + + + +#ifdef HALF + +//applies the Hopping Part Even-Odd HALF PRECISION ! +//otherwise equivalent to the version above; sin/sout hold short4 spinors that are rescaled with the per-site norms in sin_norm/sout_norm +__global__ void dev_Hopping_Matrix_half(const dev_su3_2v_half * gf, const dev_spinor_half * sin, const float* sin_norm, dev_spinor_half * sout, float* sout_norm, const int * gfindex_site, const int* gfindex_nextsite, const int * nn_evenodd, const int eo){ + + typedef REAL RealT; + int pos,hoppos; + dev_spinor shelp1[6], ssum[6]; + __shared__ dev_su3_pad gfsmem[BLOCK]; + + + pos= threadIdx.x + blockDim.x*blockIdx.x; + int ix = threadIdx.x; + + + if(pos < dev_VOLUME){ + + + dev_zero_spinor(&(ssum[0])); // zero sum + #ifdef TEMPORALGAUGE + int spatialvol = dev_LX*dev_LY*dev_LZ; + #endif + + +//hopping term +//l==0,t + //positive direction + hoppos = nn_evenodd[8*pos]; + //hoppos = tex1Dfetch(nn_tex,8*pos); + //color + + #ifdef TEMPORALGAUGE + // gf == ID for t != T-1 => just read the spinor + #ifdef MPI + if ( ((gfindex_site[pos]) < (dev_T-1)*spatialvol) || (dev_rank < dev_nproc-1) ) { + //if ((gfindex_site[pos]) < (dev_T-1)*spatialvol) { // FAKE TEMPORALGAUGE + #else + if ((gfindex_site[pos]/spatialvol) != (dev_T-1) ) { + #endif + + #ifdef USETEXTURE + double norm = tex1Dfetch(spinnormhalf_tex, hoppos); + shelp1[0] = tex1Dfetch(spinhalf_tex,6*hoppos); + shelp1[1] = tex1Dfetch(spinhalf_tex,6*hoppos+1); + shelp1[2] = tex1Dfetch(spinhalf_tex,6*hoppos+2); + shelp1[3] = tex1Dfetch(spinhalf_tex,6*hoppos+3); + shelp1[4] = tex1Dfetch(spinhalf_tex,6*hoppos+4); + shelp1[5] = tex1Dfetch(spinhalf_tex,6*hoppos+5); + //normalize + #pragma unroll 6 + for(int i=0; i<6; i++){ + shelp1[i].x = norm*shelp1[i].x; + shelp1[i].y = norm*shelp1[i].y; + shelp1[i].z = norm*shelp1[i].z; + shelp1[i].w = norm*shelp1[i].w; + } + #else + float norm = sin_norm[hoppos]; + //read and normalize (norm and i are declared here; the USETEXTURE branch has its own declarations) + #pragma
unroll 6 + for(int i=0; i<6; i++){ + shelp1[i].x = norm*sh2fl(sin[6*hoppos+i].x); + shelp1[i].y = norm*sh2fl(sin[6*hoppos+i].y); + shelp1[i].z = norm*sh2fl(sin[6*hoppos+i].z); + shelp1[i].w = norm*sh2fl(sin[6*hoppos+i].w); + } + #endif + } + else{ + // gf != ID for t == T-1 => mult spinor with gf + #ifdef GF_8 + dev_reconstructgf_8texref_half (gf, 4*(gfindex_site[pos]),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_half(gf,4*(gfindex_site[pos]),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV_half(gfsmem[ix].m, &(sin[6*hoppos]), &(sin_norm[hoppos]), &(shelp1[0])); + #endif + } + #else + #ifdef GF_8 + dev_reconstructgf_8texref_half (gf, 4*(gfindex_site[pos]),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_half(gf, 4*(gfindex_site[pos]),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV_half(gfsmem[ix].m, &(sin[6*hoppos]), &(sin_norm[hoppos]), &(shelp1[0])); + #endif + #endif + + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP0_plus(&(ssum[0]), &(shelp1[0]), dev_cconj(dev_complexT(dev_k0))); + #else + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk0),&(shelp1[0]), &(ssum[0])); + dev_Gamma0(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k0),&(shelp1[0]), &(ssum[0])); + #endif + +//l==0,t + //negative direction + hoppos = nn_evenodd[8*pos+4]; + //hoppos = tex1Dfetch(nn_tex,8*pos+4); + //color + #ifdef TEMPORALGAUGE + // gf == ID for t != T-1 => just read the spinor + #ifdef MPI + if ( ((gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol) || (dev_rank > 0) ) { + //if ((gfindex_nextsite[hoppos]) < (dev_T-1)*spatialvol) { // FAKE TEMPORALGAUGE + #else + if ((gfindex_nextsite[hoppos]/spatialvol) != (dev_T-1) ) { + #endif + + #ifdef USETEXTURE + double norm = tex1Dfetch(spinnormhalf_tex, hoppos); + shelp1[0] = tex1Dfetch(spinhalf_tex,6*hoppos); + shelp1[1] = 
tex1Dfetch(spinhalf_tex,6*hoppos+1); + shelp1[2] = tex1Dfetch(spinhalf_tex,6*hoppos+2); + shelp1[3] = tex1Dfetch(spinhalf_tex,6*hoppos+3); + shelp1[4] = tex1Dfetch(spinhalf_tex,6*hoppos+4); + shelp1[5] = tex1Dfetch(spinhalf_tex,6*hoppos+5); + //normalize + #pragma unroll 6 + for(int i=0; i<6; i++){ + shelp1[i].x = norm*shelp1[i].x; + shelp1[i].y = norm*shelp1[i].y; + shelp1[i].z = norm*shelp1[i].z; + shelp1[i].w = norm*shelp1[i].w; + } + #else + float norm = sin_norm[hoppos]; + //read and normalize (norm and i are declared here; the USETEXTURE branch has its own declarations) + #pragma unroll 6 + for(int i=0; i<6; i++){ + shelp1[i].x = norm*sh2fl(sin[6*hoppos+i].x); + shelp1[i].y = norm*sh2fl(sin[6*hoppos+i].y); + shelp1[i].z = norm*sh2fl(sin[6*hoppos+i].z); + shelp1[i].w = norm*sh2fl(sin[6*hoppos+i].w); + } + #endif + } + else{ + // gf != ID for t == T-1 => mult spinor with gf + #ifdef GF_8 + dev_reconstructgf_8texref_dagger_half (gf,4*gfindex_nextsite[hoppos],&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger_half(gf,4*gfindex_nextsite[hoppos],&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV_half(gfsmem[ix].m, &(sin[6*hoppos]),&(sin_norm[hoppos]), &(shelp1[0])); + #endif + } + #else + #ifdef GF_8 + dev_reconstructgf_8texref_dagger_half (gf,4*gfindex_nextsite[hoppos],&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger_half(gf,4*gfindex_nextsite[hoppos],&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV_half(gfsmem[ix].m, &(sin[6*hoppos]),&(sin_norm[hoppos]), &(shelp1[0])); + #endif + #endif + + //-kappa(r + gamma_mu) + #ifdef GF_8 + dev_kappaP0_minus(&(ssum[0]), &(shelp1[0]), dev_complexT(dev_k0)); + #else + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk0),&(shelp1[0]), &(ssum[0])); + dev_Gamma0(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk0),&(shelp1[0]), &(ssum[0])); + #endif + + + + +//l==3,z + //positive direction + hoppos = 
nn_evenodd[8*pos+3]; + //hoppos = tex1Dfetch(nn_tex,8*pos+3); + //color + #ifdef GF_8 + dev_reconstructgf_8texref_half (gf,4*(gfindex_site[pos])+(3),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_half(gf, 4*(gfindex_site[pos])+(3),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV_half(gfsmem[ix].m, &(sin[6*hoppos]),&(sin_norm[hoppos]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP3_plus(&(ssum[0]), &(shelp1[0]), RealT(dev_k3.re)); + #else + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk3),&(shelp1[0]), &(ssum[0])); + dev_Gamma3(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k3),&(shelp1[0]), &(ssum[0])); + #endif +//l==3,z + + //negative direction + hoppos = nn_evenodd[8*pos+7]; + //hoppos = tex1Dfetch(nn_tex,8*pos+7); + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger_half (gf,4*gfindex_nextsite[hoppos]+(3),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger_half(gf,4*gfindex_nextsite[hoppos]+(3),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV_half(gfsmem[ix].m, &(sin[6*hoppos]),&(sin_norm[hoppos]), &(shelp1[0])); + #endif + //-kappa(r + gamma_mu) + #ifdef GF_8 + dev_kappaP3_minus(&(ssum[0]), &(shelp1[0]), RealT(dev_k3.re)); + #else + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk3),&(shelp1[0]), &(ssum[0])); + dev_Gamma3(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk3),&(shelp1[0]), &(ssum[0])); + #endif + + + + +//l==2,y + //positive direction + hoppos = nn_evenodd[8*pos+2]; + //hoppos = tex1Dfetch(nn_tex,8*pos+2); + //color + #ifdef GF_8 + dev_reconstructgf_8texref_half (gf,4*(gfindex_site[pos])+(2),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_half(gf,4*(gfindex_site[pos])+(2),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + 
dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV_half(gfsmem[ix].m, &(sin[6*hoppos]),&(sin_norm[hoppos]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP2_plus(&(ssum[0]), &(shelp1[0]), RealT(dev_k2.re)); + #else + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk2),&(shelp1[0]), &(ssum[0])); + dev_Gamma2(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k2),&(shelp1[0]), &(ssum[0])); + #endif + +//l==2,y + + + //negative direction + hoppos = nn_evenodd[8*pos+6]; + //hoppos = tex1Dfetch(nn_tex,8*pos+6); + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger_half (gf,4*gfindex_nextsite[hoppos]+(2),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger_half(gf,4*gfindex_nextsite[hoppos]+(2),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV_half(gfsmem[ix].m, &(sin[6*hoppos]), &(sin_norm[hoppos]), &(shelp1[0])); + #endif + //-kappa(r + gamma_mu) + #ifdef GF_8 + dev_kappaP2_minus(&(ssum[0]), &(shelp1[0]), RealT(dev_k2.re)); + #else + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk2),&(shelp1[0]), &(ssum[0])); + dev_Gamma2(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk2),&(shelp1[0]), &(ssum[0])); + #endif + + + +//l==1,x + //positive direction + hoppos = nn_evenodd[8*pos+1]; + //hoppos = tex1Dfetch(nn_tex,8*pos+1); + //color + #ifdef GF_8 + dev_reconstructgf_8texref_half (gf,4*(gfindex_site[pos])+(1),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_half(gf,4*(gfindex_site[pos])+(1),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV_half(gfsmem[ix].m, &(sin[6*hoppos]),&(sin_norm[hoppos]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + #ifdef GF_8 + dev_kappaP1_plus(&(ssum[0]), &(shelp1[0]), RealT(dev_k1.re)); + #else + 
dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk1),&(shelp1[0]), &(ssum[0])); + dev_Gamma1(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k1),&(shelp1[0]), &(ssum[0])); + #endif + + +//l==1,x + + //negative direction + hoppos = nn_evenodd[8*pos+5]; + //hoppos = tex1Dfetch(nn_tex,8*pos+5); + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger_half (gf,4*gfindex_nextsite[hoppos]+(1),&(gfsmem[ix].m)); + #else + dev_reconstructgf_2vtexref_dagger_half(gf,4*gfindex_nextsite[hoppos]+(1),&(gfsmem[ix].m)); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix].m, hoppos, &(shelp1[0])); + #else + dev_su3MtV_half(gfsmem[ix].m, &(sin[6*hoppos]),&(sin_norm[hoppos]), &(shelp1[0])); + #endif + //-kappa(r + gamma_mu) + #ifdef GF_8 + dev_kappaP1_minus(&(ssum[0]), &(shelp1[0]), RealT(dev_k1.re)); + #else + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk1),&(shelp1[0]), &(ssum[0])); + dev_Gamma1(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk1),&(shelp1[0]), &(ssum[0])); + #endif + + //write to output spinor and write the norm + dev_write_spinor_half(&(ssum[0]),&(sout[6*pos]), &(sout_norm[pos])); + } +} + + +#endif + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/MACROS.cuh b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/MACROS.cuh new file mode 100644 index 0000000000000000000000000000000000000000..a1cf010eb2a4735723090590c1e3c75cb9259e04 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/MACROS.cuh @@ -0,0 +1,284 @@ + + +///////////// +// general // +///////////// + + + + +// output & debug + +#define CUDA_DEBUG // provides some tests and output specific to the used CUDA code +#define STUFF_DEBUG // some stuff +//#define HOPPING_DEBUG // enables the Hopping Matrix on the CPU (inside matrix_multiplication32_mpi()) +//#define MATRIX_DEBUG // enables the matrix multiplication on the CPU (in the inner CG solver) +//#define 
CG_DEBUG // enables the CG on the CPU + + + + +// conjugate gradient +// ... // to come ... + + + + +// benchmarks + +#define OPERATOR_BENCHMARK 100 // refers to only matrix applications +#define ALGORITHM_BENCHMARK // counts the number of effective flops + + + +// alternative code + +#define ALTERNATE_FIELD_XCHANGE // provides a better communication, without ASYNC +//#define ALTERNATE_HOPPING_MATRIX // at the moment: provides an alternative way of passing the (nn-)positions to dev_Hopping_Matrix() + // does not work properly yet, ALTERNATE.cuh + + + +// CUDA + MPI + +#define DEVICE_EQUAL_RANK // for MPI: cudaSetDevice(mpi-rank) +#define ASYNC 1 // overlaps computation and communication // 0, 1, 2, 3 +#define ASYNC_TSLICES 1 // determines workload of kernels +#define ASYNC_OPTIMIZED 1 // CUDA streams // needs ASYNC == 3 +//#define ASYNC_TIMING // profiling the ASYNC_OPTIMIZED code // needs ASYNC == 1,2 + + + + +// CUDA parameters + +#define BLOCKSIZE1 64 // here: dev_zero_spinor_field , dev_copy_spinor_field +#define BLOCKSIZE2 64 // passed: dev_Hopping_Matrix +#define BLOCKSIZE3 64 // passed: dev_mul_one_pm_imubar_gamma5 +#define BLOCKSIZE4 64 // passed: dev_gamma5 +#define BLOCKSIZE5 64 // passed: dev_copy_spinor_field + + + + + + +//////////////////////////////////////////////////////////// +// debugging macros for CUDA, CUBLAS and kernel functions // +//////////////////////////////////////////////////////////// + + + + +#ifndef MPI // non-MPI //////////////////////////////////////////////////////////// + + + + +// debug // CUDA + +#define CUDA_CHECK(errorMessage, successMessage) { \ + if ( (cudaerr = cudaGetLastError()) != cudaSuccess ) { \ + printf("%s: %s\n", errorMessage, cudaGetErrorString(cudaerr)); \ + exit(-1); \ + } \ + else printf("%s%s", successMessage, "\n"); \ + } + +#define CUDA_CHECK_NO_SUCCESS_MSG(errorMessage) { \ + if ( (cudaerr = cudaGetLastError()) != cudaSuccess ) { \ + printf("%s: %s\n", errorMessage, cudaGetErrorString(cudaerr)); \ + 
exit(-1); \ + } \ + } + + + + +// debug // CUBLAS core function + +#define CUBLAS_CORE_CHECK(errorMessage, successMessage) { \ + if ( (cublasstatus = cublasGetError()) != CUBLAS_STATUS_SUCCESS ) { \ + printf("%s%s", errorMessage, "\n"); \ + exit(-1); \ + } \ + else printf("%s%s", successMessage, "\n"); \ + } + +#define CUBLAS_CORE_CHECK_NO_SUCCESS_MSG(errorMessage) { \ + if ( (cublasstatus = cublasGetError()) != CUBLAS_STATUS_SUCCESS ) { \ + printf("%s%s", errorMessage, "\n"); \ + exit(-1); \ + } \ + } + + + + +// debug // CUBLAS helper function + +#define CUBLAS_HELPER_CHECK(function, errorMessage, successMessage) { \ + if ( (cublasstatus = function) != CUBLAS_STATUS_SUCCESS ) { \ + printf("%s%s", errorMessage, "\n"); \ + exit(-1); \ + } \ + else printf("%s%s", successMessage, "\n"); \ + } + +#define CUBLAS_HELPER_CHECK_NO_SUCCESS_MSG(function, errorMessage) { \ + if ( (cublasstatus = function) != CUBLAS_STATUS_SUCCESS ) { \ + printf("%s%s", errorMessage, "\n"); \ + exit(-1); \ + } \ + } + + + + +// debug // kernel function + +#define CUDA_KERNEL_CHECK(errorMessage, successMessage) { \ + if ( (cudaerr = cudaThreadSynchronize()) != cudaSuccess ) { \ + printf("%s: %s\n", errorMessage, cudaGetErrorString(cudaGetLastError())); \ + exit(-1); \ + } \ + else printf("%s%s", successMessage, "\n"); \ + } + +#define CUDA_KERNEL_CHECK_NO_SUCCESS_MSG(errorMessage) { \ + if ( (cudaerr = cudaThreadSynchronize()) != cudaSuccess ) { \ + printf("%s: %s\n", errorMessage, cudaGetErrorString(cudaGetLastError())); \ + exit(-1); \ + } \ + } + + + + +#else // MPI //////////////////////////////////////////////////////////////////////// + + + + +// debug // CUDA + +#define CUDA_CHECK(errorMessage, successMessage) { \ + if ( (cudaerr = cudaGetLastError()) != cudaSuccess ) { \ + printf("Process %d of %d: ", g_cart_id, g_nproc); \ + printf("%s: %s\n", errorMessage, cudaGetErrorString(cudaerr)); \ + exit(-1); \ + } \ + else if (g_cart_id == 0) printf("%s%s", successMessage, "\n"); \ + } + 
+#define CUDA_CHECK_NO_SUCCESS_MSG(errorMessage) { \ + if ( (cudaerr = cudaGetLastError()) != cudaSuccess ) { \ + printf("Process %d of %d: ", g_cart_id, g_nproc); \ + printf("%s: %s\n", errorMessage, cudaGetErrorString(cudaerr)); \ + exit(-1); \ + } \ + } + + + + +// debug // CUBLAS core function + +#define CUBLAS_CORE_CHECK(errorMessage, successMessage) { \ + if ( (cublasstatus = cublasGetError()) != CUBLAS_STATUS_SUCCESS ) { \ + printf("Process %d of %d: ", g_cart_id, g_nproc); \ + printf("%s%s", errorMessage, "\n"); \ + exit(-1); \ + } \ + else if (g_cart_id == 0) printf("%s%s", successMessage, "\n"); \ + } + +#define CUBLAS_CORE_CHECK_NO_SUCCESS_MSG(errorMessage) { \ + if ( (cublasstatus = cublasGetError()) != CUBLAS_STATUS_SUCCESS ) { \ + printf("Process %d of %d: ", g_cart_id, g_nproc); \ + printf("%s%s", errorMessage, "\n"); \ + exit(-1); \ + } \ + } + + + + +// debug // CUBLAS helper function + +#define CUBLAS_HELPER_CHECK(function, errorMessage, successMessage) { \ + if ( (cublasstatus = function) != CUBLAS_STATUS_SUCCESS ) { \ + printf("Process %d of %d: ", g_cart_id, g_nproc); \ + printf("%s%s", errorMessage, "\n"); \ + exit(-1); \ + } \ + else if (g_cart_id == 0) printf("%s%s", successMessage, "\n"); \ + } + +#define CUBLAS_HELPER_CHECK_NO_SUCCESS_MSG(function, errorMessage) { \ + if ( (cublasstatus = function) != CUBLAS_STATUS_SUCCESS ) { \ + printf("Process %d of %d: ", g_cart_id, g_nproc); \ + printf("%s%s", errorMessage, "\n"); \ + exit(-1); \ + } \ + } + + + + +// debug // kernel function + +#define CUDA_KERNEL_CHECK(errorMessage, successMessage) { \ + if ( (cudaerr = cudaThreadSynchronize()) != cudaSuccess ) { \ + printf("Process %d of %d: ", g_cart_id, g_nproc); \ + printf("%s: %s\n", errorMessage, cudaGetErrorString(cudaGetLastError())); \ + exit(-1); \ + } \ + else if (g_cart_id == 0) printf("%s%s", successMessage, "\n"); \ + } + +#define CUDA_KERNEL_CHECK_NO_SUCCESS_MSG(errorMessage) { \ + if ( (cudaerr = cudaThreadSynchronize()) != 
cudaSuccess ) { \ + printf("Process %d of %d: ", g_cart_id, g_nproc); \ + printf("%s: %s\n", errorMessage, cudaGetErrorString(cudaGetLastError())); \ + exit(-1); \ + } \ + } + + + + +#endif ///////////////////////////////////////////////////////////////////////////////// + + + + + + + + +////////////////////////////// EXAMPLES //////////////////////////////////////////////////////////////////////////////////////////////////////// +// +// +// // debug // CUDA +// #ifdef CUDA_DEBUG +// CUDA_CHECK("CUDA error in mixedsolve_eo_nd(). Host to device interaction failed.", "Fields initializedhallo on device."); +// #endif +// +// +// // debug // CUBLAS helper function +// #ifdef CUDA_DEBUG +// CUBLAS_HELPER_CHECK(cublasInit(), "Error in cublasInit(). Couldn't initialize CUBLAS.", "CUBLAS is initialized."); +// #endif +// +// +// // debug // kernel +// #ifdef CUDA_DEBUG +// CUDA_KERNEL_CHECK("Error in cg_eo_nd(): Initializing spinor fields on device failed.", "Spinor fields initialized on device."); +// #endif +// +// +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..c856221ffdb2aec111110ba7ed1a50f4904519d5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/Makefile.in @@ -0,0 +1,79 @@ +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +abs_top_srcdir = @abs_top_srcdir@ +top_builddir = . +abs_top_builddir = @abs_top_builddir@ +builddir = @builddir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ +bindir = @bindir@ +program_transform_name = @program_transform_name@ +subdir = . 
+ +AR = @AR@ +RANLIB = @RANLIB@ +CC = +CCDEP = +CFLAGS = @GPUCFLAGS@ +LDFLAGS = @LDFLAGS@ +DEPFLAGS = +CPPFLAGS = @CPPFLAGS@ +CCLD = @CCLD@ +LEX = @LEX@ +AUTOCONF = @AUTOCONF@ +LIBS = @LIBS@ +SHELL = @SHELL@ +OPTARGS = @OPTARGS@ +SOPTARGS = @SOPTARGS@ +DEFS = @DEFS@ +GPUOBJECTS = @GPUDIR@ +USESUBDIRS = @USESUBDIRS@ +NVCC = @NVCC@ +GPUMPICOMPILER = @GPUMPICOMPILER@ +INCLUDES = @INCLUDES@ + +COMPILE = ${NVCC} -c ${DEFS} ${GPUMPICOMPILER} ${INCLUDES} -o $@ ${CFLAGS} + + +GPUSOURCES := $(wildcard ${srcdir}/*.cu) +GPUOBJECTS := $(patsubst ${srcdir}/%.cu, %.o, $(GPUSOURCES)) +DEPS := $(patsubst %.o,%.d,$(GPUOBJECTS)) + +.SUFFIXES: + +all: Makefile dummy + +#ifneq (,$(findstring lapack,${LIBS})) +#all: Makefile all-recursive dep hmc_tm invert invert_doublet +#else +#all: Makefile all-recursive dep hmc_tm invert invert_doublet +#endif + + +.NOTPARALLEL: + +-include $(addsuffix .d,$(GPUTARGETS)) +-include $(DEPS) + +include ${top_srcdir}/Makefile.global + +# COMPILE already names the object via -o $@, so compiler stdout must not be redirected onto that file +%.o: ${srcdir}/%.cu Makefile ${srcdir}/*.h ${srcdir}/*.cuh + @$(COMPILE) ${INCLUDES} $< + +#%.o: ${srcdir}/%.c Makefile +# $(NVCC) -c $(DEFS) --compiler-bindir mpicc ${INCLUDES} -o $@ $(CFLAGS) $< > $@ +# mpicc -c ${CUBLAS} $< > $@ + +dummy: ${GPUOBJECTS} Makefile + @echo "Have generated all %.o files" + +compile-clean: Makefile + rm -f ${GPUOBJECTS} *.d + +clean: compile-clean + +distclean: compile-clean + rm -f Makefile + +.PHONY: all dummy compile-clean clean distclean diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/cublasWrapper.cuh b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/cublasWrapper.cuh new file mode 100644 index 0000000000000000000000000000000000000000..be85b9b806d31043db110d28095c99325502c4f3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/cublasWrapper.cuh @@ -0,0 +1,39 @@ +//prepare cublas[typeId el. 
{s,S,d,D,c,C,z,Z}][function]([parameterList]) for template usage by defining +//overloaded cublas [Function]([parameterList]) + +#include "cublas.h" +#include "cudaglobal.h" + + +//#ifdef OLD_CUBLAS + + float cublasDot(int n,const float* x,int incx,const float* y,int incy) + { return cublasSdot(n,x,incx,y,incy); } + double cublasDot(int n,const double* x,int incx,const double* y,int incy) + { return cublasDdot(n,x,incx,y,incy); } + + void cublasAxpy(int n,float alpha,const float* x,int incx,float* y,int incy) + { cublasSaxpy(n,alpha,x,incx,y,incy); } + void cublasAxpy(int n,double alpha,const double* x,int incx,double* y,int incy) + { cublasDaxpy(n,alpha,x,incx,y,incy); } + + void cublasScal(int n,float alpha,float* x,int incx) + { cublasSscal(n,alpha,x,incx); } + void cublasScal(int n,double alpha,double* x,int incx) + { cublasDscal(n,alpha,x,incx); } + + void cublasCopy(int n,const float* x,int incx,float* y,int incy) + { cublasScopy(n,x,incx,y,incy); } + void cublasCopy(int n,const double* x,int incx,double* y,int incy) + { cublasDcopy(n,x,incx,y,incy); } + +/*#else + + template inline cublasStatus_t cublasDot (cublasHandle_t handle,int n,const RealT* x,int incx,const RealT* y,int incy,RealT* result) + { return RealT.cublasWrapperError(); } //produces an error when called with wrong template type + template< > inline cublasStatus_t cublasDot(cublasHandle_t handle,int n,const RealT* x,int incx,const RealT* y,int incy,RealT* result) + { return cublasSdot(handle,n,x,incx,y,incy,result); } + template< > inline cublasStatus_t cublasDot(cublasHandle_t handle,int n,const RealT* x,int incx,const RealT* y,int incy,RealT* result) + { return cublasDdot(handle,n,x,incx,y,incy,result); } +#endif*/ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/cudadefs.h b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/cudadefs.h new file mode 100644 index 0000000000000000000000000000000000000000..ffba1935666a08f8c8866e6ece6e55bb96ba211f --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/cudadefs.h @@ -0,0 +1,33 @@ +#ifndef _CUDADEF_H +#define _CUDADEF_H + +#define cublasStatus_t cublasStatus//necessary for cublas>=4.0 + +#define ACCUM_N 2048 +#define DOTPROD_DIM 128 + +//#define GF_8 +//#define TEMPORALGAUGE +//#define USETEXTURE +//#define HALF +#define OLD_CUBLAS //cublas older than 4.0 ? + +#define REAL float +#define REALD double +#define REAL4 float4 +#define REAL4D double4 + + +#define BLOCK 64//192 // Block Size // dev_Hopping_Matrix<<<>>>() +#define BLOCK2 64//320 // Block Size 2 for dev_mul_one_pm... // dev_mul_one_pm_imu_inv<<<>>>() +#define BLOCK3 64//128 // dev_copy_spinor_field<<<>>>(), dev_zero_spinor_field<<<>>>() +#define REDUCTION_N 64//512 // Block size for reduction operations //old: 512 + + + +#define maxblockdim 64//512 + + +#endif + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/cudaglobal.h b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/cudaglobal.h new file mode 100644 index 0000000000000000000000000000000000000000..a35a6d6ed28ed6c00052950824f5285342630ba2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/cudaglobal.h @@ -0,0 +1,147 @@ +#include "cudadefs.h" + + +#ifndef __CUDADEFS_H + #define __CUDADEFS_H + +#ifndef __cplusplus + #error "GPU code needs C++ due to templates" +#endif + +#ifdef OLD_CUBLAS + #define cublasStatus_t cublasStatus +#else + #define cublasStatus cublasStatus_t +#endif +/* GPU Stuff */ +template struct dev_complexT +{ + RealT re; + RealT im; + + templateoperator dev_complexT() const //enables conversions between dev_complexT instances with different template arguments + { + dev_complexT tmp; + tmp.re=TargetRealT(re); tmp.im=TargetRealT(im); + + return tmp; + } +}; +//template<>operator dev_complexT::dev_complexT() const {return *this} +#define dev_complexM(RealT) dev_complexT +#define dev_complex dev_complexT +#define dev_complexD dev_complexT +//typedef dev_complexT dev_complex ; +//typedef dev_complexT dev_complexD; 
+ + +/* non-scalar types x: usage xT::type */ + +templatestruct REAL4T //useful to select REAL4-Type according to RealT +{ + struct type { RealT w,x,y,z; }; +}; +template<> struct REAL4T //template specialisation to select the nvidia-based type, which may offer some optimisation +{ typedef REAL4 type; }; +template<> struct REAL4T +{ typedef REAL4D type; }; +#define REAL4M(RealT) typename REAL4T::type + + +/* Device Gauge Fields */ +// Typedef dev_su3 [3][3]; /* su(3) matrix, 3x3 complex entries, DEVICE */ + +template struct dev_su3T //no template typedef in c++ yet; structure will be useful in function templates +{ typedef dev_complexT type[3][3]; };/* su(3) matrix, 3x3 complex entries, DEVICE */ +#define dev_su3M(RealT) typename dev_su3T::type +#define dev_su3 dev_su3T::type +#define dev_su3D dev_su3T::type +//typedef dev_su3T::type dev_su3; +//typedef dev_su3T::type dev_su3D; + +template struct dev_su3_padT +{ + struct type //only for consistency + { + typename dev_su3T::type m; + RealT pad; + }; +}; +#define dev_su3_padM(RealT) typename dev_su3_padT::type +#define dev_su3_pad dev_su3_padT::type +#define dev_su3_padD dev_su3_padT::type +//typedef dev_su3_padT::type dev_su3_pad ; +//typedef dev_su3_padT::type dev_su3_padD; + +//#define su3_2vT REAL4T /* 2 rows of the su(3) matrix, 6 complex entries, HOST 3*4*VOLUME in array -> texture */ +template struct su3_2vT:REAL4T {}; +#define su3_2vM(RealT) typename su3_2vT::type +#define su3_2v su3_2vT::type +#define su3_2vD su3_2vT::type +//typedef su3_2vT::type su3_2v ; +//typedef su3_2vT::type su3_2vD; + +//#define dev_su3_2vT REAL4T /* 2 rows of the su(3) matrix, 3*2 complex entries, DEVICE 3*4*VOLUME in array -> texture*/ +template struct dev_su3_2vT:REAL4T {}; +#define dev_su3_2vM(RealT) typename dev_su3_2vT::type +#define dev_su3_2v dev_su3_2vT::type +#define dev_su3_2vD dev_su3_2vT::type +//typedef dev_su3_2vT::type dev_su3_2v ; +//typedef dev_su3_2vT::type dev_su3_2vD; + +//#define dev_su3_8T REAL4T /* 8 numbers to 
reconstruct the gauge field as described in M. Clark */ +template struct dev_su3_8T:REAL4T {}; +#define dev_su3_8M(RealT) typename dev_su3_8T::type +#define dev_su3_8 dev_su3_8T::type +#define dev_su3_8D dev_su3_8T::type +//typedef dev_su3_8T::type dev_su3_8; +//typedef dev_su3_8T::type dev_su3_8D; + + +/* Device Spinor Fields */ +//#define dev_spinorT REAL4T +template struct dev_spinorT:REAL4T {}; +#define dev_spinorM(RealT) typename dev_spinorT::type +#define dev_spinor dev_spinorT::type +#define dev_spinorD dev_spinorT::type +//typedef REAL4T::type dev_spinor; +//typedef REAL4T::type dev_spinorD; + +template struct dev_spinor_smemT +{ + struct type + { + dev_spinorT spin; + RealT dummy; // used to fit memory usage to GPU architecture? - then we probably need template specialisation - otherwise delete this comment + }; +}; +#define dev_spinor_smemM(RealT) typename dev_spinor_smemT::type +#define dev_spinor_smem dev_spinor_smemT::type +#define dev_spinor_smemD dev_spinor_smemT::type +//typedef dev_spinor_smemT::type dev_spinor_smem ; +//typedef dev_spinor_smemT::type dev_spinor_smemD; + +template struct dev_propmatrixT { typedef dev_complexT type[12][12]; }; +#define dev_propmatrixM(RealT) typename dev_propmatrixT::type +#define dev_propmatrix dev_propmatrixT::type +#define dev_propmatrixD dev_propmatrixT::type +//typedef dev_propmatrixT::type dev_propmatrix ; +//typedef dev_propmatrixT::type dev_propmatrixD; + +template struct dev_fbyfT { typedef dev_complexT type[4][4]; }; +#define dev_fbyfM(RealT) typename dev_fbyfT::type +#define dev_fbyf dev_fbyfT::type +#define dev_fbyfD dev_fbyfT::type +//typedef dev_fbyfT::type dev_fbyf ; +//typedef dev_fbyfT::type dev_fbyfD; + + +#ifdef HALF + typedef short4 dev_spinor_half; + typedef short4 dev_su3_2v_half; + typedef short4 dev_su3_8_half; +#endif + + +#endif +/* END GPU Stuff */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/gauge_reconstruction.cuh 
b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/gauge_reconstruction.cuh new file mode 100644 index 0000000000000000000000000000000000000000..fca0f0d2464468c831845649e906da13a5eb3489 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/gauge_reconstruction.cuh @@ -0,0 +1,1385 @@ +/*********************************************************************** + * + * Copyright (C) 2010 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ * + * + * File: gauge_reconstruction.cuh + * + * CUDA gauge reconstruction functions + * + * + * + **************************************************************************/ + + + + + +////////////////////// DEVICE FUNCTIONS FOR GAUGE RECONSTRUCTION //////////////////// + + +#ifdef HALF + #define pi_float 3.141592654f + #define sh4tofl4(fl) make_float4(sh2fl(fl.x), sh2fl(fl.y), sh2fl(fl.z), sh2fl(fl.w)) +#else + #define sh4tofl4(fl) (fl) +#endif + + + + + + + + + +// reconstruction of the link fields from two rows of the su3 matrix +// numbers are fetched from texture cache +template +__device__ void dev_reconstructgf_2vtexref (const typename dev_su3_2vT::type* field, int pos, typename dev_su3T::type* gf){ + typename REAL4T::type gfin; + + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,3*pos); + #else + gfin = field[3*pos]; + #endif + //first row + (*gf)[0][0].re = gfin.x; + (*gf)[0][0].im = gfin.y; + (*gf)[0][1].re = gfin.z; + (*gf)[0][1].im = gfin.w; + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,3*pos+1); + #else + gfin = field[3*pos + 1]; + #endif + (*gf)[0][2].re = gfin.x; + (*gf)[0][2].im = gfin.y; + //second row + (*gf)[1][0].re = gfin.z; + (*gf)[1][0].im = gfin.w; + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,3*pos+2); + #else + gfin = field[3*pos + 2]; + #endif + (*gf)[1][1].re = gfin.x; + (*gf)[1][1].im = gfin.y; + (*gf)[1][2].re = gfin.z; + (*gf)[1][2].im = gfin.w; + + //third row from cconj(cross product of first and second row) + + (*gf)[2][0].re = (*gf)[0][1].re * (*gf)[1][2].re; + (*gf)[2][0].re -= (*gf)[0][1].im * (*gf)[1][2].im; + (*gf)[2][0].re -= (*gf)[0][2].re * (*gf)[1][1].re; + (*gf)[2][0].re += (*gf)[0][2].im * (*gf)[1][1].im; + + (*gf)[2][0].im = -(*gf)[0][1].re * (*gf)[1][2].im; + (*gf)[2][0].im -= (*gf)[0][1].im * (*gf)[1][2].re; + (*gf)[2][0].im += (*gf)[0][2].re * (*gf)[1][1].im; + (*gf)[2][0].im += (*gf)[0][2].im * (*gf)[1][1].re; + + + + (*gf)[2][1].re = (*gf)[0][2].re * (*gf)[1][0].re; + (*gf)[2][1].re -= (*gf)[0][2].im * 
(*gf)[1][0].im; + (*gf)[2][1].re -= (*gf)[0][0].re * (*gf)[1][2].re; + (*gf)[2][1].re += (*gf)[0][0].im * (*gf)[1][2].im; + + (*gf)[2][1].im = -(*gf)[0][2].re * (*gf)[1][0].im; + (*gf)[2][1].im -= (*gf)[0][2].im * (*gf)[1][0].re; + (*gf)[2][1].im += (*gf)[0][0].re * (*gf)[1][2].im; + (*gf)[2][1].im += (*gf)[0][0].im * (*gf)[1][2].re; + + + + (*gf)[2][2].re = (*gf)[0][0].re * (*gf)[1][1].re; + (*gf)[2][2].re -= (*gf)[0][0].im * (*gf)[1][1].im; + (*gf)[2][2].re -= (*gf)[0][1].re * (*gf)[1][0].re; + (*gf)[2][2].re += (*gf)[0][1].im * (*gf)[1][0].im; + + (*gf)[2][2].im = -(*gf)[0][0].re * (*gf)[1][1].im; + (*gf)[2][2].im -= (*gf)[0][0].im * (*gf)[1][1].re; + (*gf)[2][2].im += (*gf)[0][1].re * (*gf)[1][0].im; + (*gf)[2][2].im += (*gf)[0][1].im * (*gf)[1][0].re; + + + return; +} + + + + +// su3 - dagger reconstruction from two rows +template +__device__ void dev_reconstructgf_2vtexref_dagger (const typename dev_su3_2vT::type* field, int pos, typename dev_su3T::type* gf){ + //dev_complex help1; + //dev_complex help2; + typename REAL4T::type gfin; + + + //first column (minus in im for complex conj.) + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,3*pos); + #else + gfin = field[3*pos]; + #endif + (*gf)[0][0].re = gfin.x; + (*gf)[0][0].im = -gfin.y; + (*gf)[1][0].re = gfin.z; + (*gf)[1][0].im = -gfin.w; + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,3*pos+1); + #else + gfin = field[3*pos +1]; + #endif + (*gf)[2][0].re = gfin.x; + (*gf)[2][0].im = -gfin.y; + + //second column (minus in im for complex conj.) 
+ (*gf)[0][1].re = gfin.z; + (*gf)[0][1].im = -gfin.w; + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,3*pos+2); + #else + gfin = field[3*pos +2]; + #endif + (*gf)[1][1].re = gfin.x; + (*gf)[1][1].im = -gfin.y; + (*gf)[2][1].re = gfin.z; + (*gf)[2][1].im = -gfin.w; + + + + (*gf)[0][2].re = (*gf)[1][0].re * (*gf)[2][1].re; + (*gf)[0][2].re -= (*gf)[1][0].im * (*gf)[2][1].im; + (*gf)[0][2].re -= (*gf)[2][0].re * (*gf)[1][1].re; + (*gf)[0][2].re += (*gf)[2][0].im * (*gf)[1][1].im; + + (*gf)[0][2].im = -(*gf)[1][0].re* (*gf)[2][1].im; + (*gf)[0][2].im -= (*gf)[1][0].im* (*gf)[2][1].re; + (*gf)[0][2].im += (*gf)[2][0].re*(*gf)[1][1].im; + (*gf)[0][2].im += (*gf)[2][0].im*(*gf)[1][1].re; + + + (*gf)[1][2].re = (*gf)[2][0].re*(*gf)[0][1].re; + (*gf)[1][2].re -= (*gf)[2][0].im*(*gf)[0][1].im; + (*gf)[1][2].re -= (*gf)[0][0].re*(*gf)[2][1].re; + (*gf)[1][2].re += (*gf)[0][0].im*(*gf)[2][1].im; + + (*gf)[1][2].im = -(*gf)[2][0].re * (*gf)[0][1].im; + (*gf)[1][2].im -= (*gf)[2][0].im * (*gf)[0][1].re; + (*gf)[1][2].im += (*gf)[0][0].re * (*gf)[2][1].im; + (*gf)[1][2].im += (*gf)[0][0].im * (*gf)[2][1].re; + + (*gf)[2][2].re = (*gf)[0][0].re * (*gf)[1][1].re; + (*gf)[2][2].re -= (*gf)[0][0].im * (*gf)[1][1].im; + (*gf)[2][2].re -= (*gf)[1][0].re * (*gf)[0][1].re; + (*gf)[2][2].re += (*gf)[1][0].im * (*gf)[0][1].im; + + (*gf)[2][2].im = -(*gf)[0][0].re * (*gf)[1][1].im; + (*gf)[2][2].im -= (*gf)[0][0].im * (*gf)[1][1].re; + (*gf)[2][2].im += (*gf)[1][0].re * (*gf)[0][1].im; + (*gf)[2][2].im += (*gf)[1][0].im * (*gf)[0][1].re; + +} + + + + + + +// reconstruction of the gf using 8 real parameters as +// described in the appendix of hep-lat 0911.3191 (M.Clark et al.) 
+// optimized once +template +__device__ void dev_reconstructgf_8texref (const typename dev_su3_2vT::type* field, int pos, typename dev_su3T::type* gf){ + + typename REAL4T::type gfin; + RealT one_over_N, help; + dev_complexT p1,p2; + + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,2*pos); + #else + gfin = field[2*pos]; + #endif + // read a2 a3 + (*gf)[0][1].re = gfin.x; + (*gf)[0][1].im = gfin.y; + (*gf)[0][2].re = gfin.z; + (*gf)[0][2].im = gfin.w; + + p1.re = gfin.x*gfin.x + gfin.y*gfin.y + gfin.z*gfin.z + gfin.w*gfin.w; // use later on + one_over_N = rsqrt(p1.re); //reciprocal sqrt + + // read theta_a1, theta_c1, b1 + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,2*pos + 1); + #else + gfin = field[2*pos + 1]; + #endif + + // reconstruct a1 use sqrt instead of sin + help = 1.0f - p1.re; + if(help > 0.0f){ + p1.re = sqrt(help); + } + else{ + p1.re = 0.0f; + } + #ifdef HALF + // we have to multiply by two pi because normalization to -1..1 + gfin.x = gfin.x*pi_float; + gfin.y = gfin.y*pi_float; + #endif + sincos(gfin.x, &(*gf)[0][0].im, &(*gf)[0][0].re); + (*gf)[0][0].re = (*gf)[0][0].re * p1.re; + (*gf)[0][0].im = (*gf)[0][0].im * p1.re; + + + + // assign b1 + (*gf)[1][0].re = gfin.z; + (*gf)[1][0].im = gfin.w; + + // p2 = 1/N b1 + p2.re = one_over_N*(*gf)[1][0].re; + p2.im = one_over_N*(*gf)[1][0].im; + + + // reconstruct c1 use sqrt instead of sin + help =1.0f - + (*gf)[0][0].re * (*gf)[0][0].re - (*gf)[0][0].im * (*gf)[0][0].im - + (*gf)[1][0].re * (*gf)[1][0].re - (*gf)[1][0].im * (*gf)[1][0].im; + if(help > 0.0f){ + p1.re = sqrt(help); + } + else{ + p1.re = 0.0f; + } + sincos(gfin.y, &(*gf)[2][0].im, &(*gf)[2][0].re); + (*gf)[2][0].re = (*gf)[2][0].re * p1.re; + (*gf)[2][0].im = (*gf)[2][0].im * p1.re; + + + + // p1 = 1/N*cconj(c1) + p1.re = one_over_N*(*gf)[2][0].re; + p1.im = - one_over_N*(*gf)[2][0].im; + + + + //use the last reconstructed gf component gf[2][2] (c3) as a help variable for b2,b3 and c2 + //this is in order to save registers and to prevent 
extra loading and storing from global mem + // calculate b2 + + (*gf)[1][1].re = p1.re*(*gf)[0][2].re; + (*gf)[1][1].re += p1.im*(*gf)[0][2].im; + (*gf)[1][1].im = p1.im*(*gf)[0][2].re; + (*gf)[1][1].im -= p1.re*(*gf)[0][2].im; + + (*gf)[2][2].re = (*gf)[0][0].re * (*gf)[0][1].re; + (*gf)[2][2].re += (*gf)[0][0].im * (*gf)[0][1].im; + + (*gf)[2][2].im = (*gf)[0][0].re * (*gf)[0][1].im; + (*gf)[2][2].im -= (*gf)[0][0].im * (*gf)[0][1].re; + (*gf)[2][2] = dev_cmult(p2, (*gf)[2][2]); + + (*gf)[1][1].re = -one_over_N*( (*gf)[1][1].re + (*gf)[2][2].re); + (*gf)[1][1].im = -one_over_N*((*gf)[1][1].im + (*gf)[2][2].im); + + + + + + // calculate b3 + (*gf)[1][2].re = p1.re*(*gf)[0][1].re; + (*gf)[1][2].re += p1.im*(*gf)[0][1].im; + (*gf)[1][2].im = p1.im*(*gf)[0][1].re; + (*gf)[1][2].im -= p1.re*(*gf)[0][1].im; + + (*gf)[2][2].re = (*gf)[0][0].re*(*gf)[0][2].re; + (*gf)[2][2].re += (*gf)[0][0].im*(*gf)[0][2].im; + (*gf)[2][2].im = (*gf)[0][0].re*(*gf)[0][2].im; + (*gf)[2][2].im -= (*gf)[0][0].im*(*gf)[0][2].re; + (*gf)[2][2] = dev_cmult(p2,(*gf)[2][2]); + + (*gf)[1][2].re = one_over_N*( (*gf)[1][2].re - (*gf)[2][2].re); + (*gf)[1][2].im = one_over_N*( (*gf)[1][2].im - (*gf)[2][2].im); + + + // calculate c2 + (*gf)[2][1].re = p2.re*(*gf)[0][2].re; + (*gf)[2][1].re -= p2.im*(*gf)[0][2].im; + (*gf)[2][1].im = -p2.re*(*gf)[0][2].im; + (*gf)[2][1].im -= p2.im*(*gf)[0][2].re; + + + + (*gf)[2][2].re = (*gf)[0][0].re*(*gf)[0][1].re; + (*gf)[2][2].re += (*gf)[0][0].im*(*gf)[0][1].im; + (*gf)[2][2].im = (*gf)[0][0].re* (*gf)[0][1].im; + (*gf)[2][2].im -= (*gf)[0][0].im* (*gf)[0][1].re; + help = (*gf)[2][2].re; + (*gf)[2][2].re = p1.re*(*gf)[2][2].re; + (*gf)[2][2].re += p1.im*(*gf)[2][2].im; + (*gf)[2][2].im = p1.re*(*gf)[2][2].im - p1.im*help; + + + (*gf)[2][1].re = one_over_N*((*gf)[2][1].re - (*gf)[2][2].re); + (*gf)[2][1].im = one_over_N*((*gf)[2][1].im - (*gf)[2][2].im); + + // now we have to use p2 and p1 as a help variable, as this is not + // needed any more after the first 
+ // step + // calculate c3 + (*gf)[2][2].re = p2.re * (*gf)[0][1].re; + (*gf)[2][2].re -= p2.im * (*gf)[0][1].im; + (*gf)[2][2].im = - p2.im*(*gf)[0][1].re; + (*gf)[2][2].im -= p2.re*(*gf)[0][1].im; + + p2.re = (*gf)[0][0].re * (*gf)[0][2].re; + p2.re += (*gf)[0][0].im * (*gf)[0][2].im; + p2.im = (*gf)[0][0].re * (*gf)[0][2].im; + p2.im -= (*gf)[0][0].im * (*gf)[0][2].re; + p2 = dev_cmult( dev_cconj(p1) , p2); + + (*gf)[2][2] = dev_cadd((*gf)[2][2], p2); + (*gf)[2][2] = dev_crealmult((*gf)[2][2], -one_over_N); + +} + + + + + + + +template +__device__ void dev_reconstructgf_8texref_dagger (const typename dev_su3_2vT::type* field,int pos, typename dev_su3T::type* gf){ + + + typename REAL4T::type gfin; + RealT one_over_N, help; + dev_complexT p1,p2; + + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,2*pos); + #else + gfin = field[2*pos]; + #endif + // read a2 a3 + (*gf)[1][0].re = gfin.x; + (*gf)[1][0].im = -gfin.y; + (*gf)[2][0].re = gfin.z; + (*gf)[2][0].im = -gfin.w; + + p1.re = gfin.x*gfin.x + gfin.y*gfin.y + gfin.z*gfin.z + gfin.w*gfin.w; // use later on + one_over_N = rsqrt(p1.re); // reciprocal sqrt + + + // read theta_a1, theta_c1, b1 + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,2*pos + 1); + #else + gfin = field[2*pos + 1]; + #endif + + // reconstruct a1 + help = 1.0f - p1.re; + if(help > 0.0f){ + p1.re = sqrt(help); + } + else{ + p1.re = 0.0f; + } + //(*gf)[0][0].re = p1.re*cosf(gfin.x); + //(*gf)[0][0].im = -p1.re*sinf(gfin.x); + + #ifdef HALF + // we have to multiply by two pi because normalization to -1..1 + gfin.x = gfin.x*pi_float; + gfin.y = gfin.y*pi_float; + #endif + + sincos(gfin.x, &(*gf)[0][0].im, &(*gf)[0][0].re); + (*gf)[0][0].re = (*gf)[0][0].re * p1.re; + (*gf)[0][0].im = -(*gf)[0][0].im * p1.re; + + + // assign b1 + (*gf)[0][1].re = gfin.z; + (*gf)[0][1].im = -gfin.w; + + // p2 = 1/N b1 + p2.re = one_over_N*(*gf)[0][1].re; + p2.im = -one_over_N*(*gf)[0][1].im; + + + // reconstruct c1 + help = 1.0f - + (*gf)[0][0].re * (*gf)[0][0].re - 
(*gf)[0][0].im * (*gf)[0][0].im - + (*gf)[0][1].re * (*gf)[0][1].re - (*gf)[0][1].im * (*gf)[0][1].im; + if(help > 0.0f){ + p1.re = sqrt(help); + } + else{ + p1.re = 0.0f; + } + //(*gf)[0][2].re = p1.re*cosf(gfin.y); + //(*gf)[0][2].im = -p1.re*sinf(gfin.y); + + sincos(gfin.y, &(*gf)[0][2].im, &(*gf)[0][2].re); + (*gf)[0][2].re = (*gf)[0][2].re * p1.re; + (*gf)[0][2].im = -(*gf)[0][2].im * p1.re; + + + // p1 = 1/N*cconj(c1) + p1.re = one_over_N*(*gf)[0][2].re; + p1.im = one_over_N*(*gf)[0][2].im; + + //use the last reconstructed gf component gf[2][2] (c3) as a help variable for b2,b3 and c2 + //this is in order to save registers and to prevent extra loading and storing from global mem + // calculate b2 + (*gf)[1][1] = dev_cmult(p1, (*gf)[2][0] ); + (*gf)[2][2] = dev_cmult(p2, dev_cmult( (*gf)[0][0] , dev_cconj((*gf)[1][0] )) ); + (*gf)[1][1] = dev_cadd((*gf)[1][1], (*gf)[2][2]); + (*gf)[1][1] = dev_cconj(dev_crealmult((*gf)[1][1], -one_over_N)); + + // calculate b3 + (*gf)[2][1] = dev_cmult(p1, (*gf)[1][0] ); + (*gf)[2][2] = dev_cmult(p2, dev_cmult( (*gf)[0][0] , dev_cconj((*gf)[2][0] )) ); + (*gf)[2][1] = dev_csub((*gf)[2][1], (*gf)[2][2]); + (*gf)[2][1] = dev_cconj(dev_crealmult((*gf)[2][1], one_over_N)); + + // calculate c2 + (*gf)[1][2] = dev_cmult( dev_cconj(p2) , (*gf)[2][0] ); + (*gf)[2][2] = dev_cmult( dev_cconj(p1) , + dev_cmult( (*gf)[0][0] , dev_cconj( (*gf)[1][0]) ) + ); + (*gf)[1][2] = dev_csub((*gf)[1][2], (*gf)[2][2]); + (*gf)[1][2] = dev_cconj(dev_crealmult((*gf)[1][2], one_over_N)); + + // use p2 as help variable after the first step + // calculate c3 + (*gf)[2][2] = dev_cmult( dev_cconj(p2) , (*gf)[1][0] ); + p2 = dev_cmult( dev_cconj(p1) , + dev_cmult( (*gf)[0][0] , dev_cconj((*gf)[2][0] ) ) + ); + (*gf)[2][2] = dev_cadd((*gf)[2][2], p2); + (*gf)[2][2] = dev_cconj(dev_crealmult((*gf)[2][2], -one_over_N)); + +} + + + + +#ifdef HALF // for half precision + +// reconstruction of the link fields from two rows of the su3 matrix +// numbers are fetched 
from texture cache +__device__ void dev_reconstructgf_2vtexref_half (const dev_su3_2v_half* field, int pos, dev_su3* gf){ + float4 gfin; + + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,3*pos); + #else + gfin = sh4tofl4(field[3*pos]); + #endif + //first row + (*gf)[0][0].re = gfin.x; + (*gf)[0][0].im = gfin.y; + (*gf)[0][1].re = gfin.z; + (*gf)[0][1].im = gfin.w; + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,3*pos+1); + #else + gfin = sh4tofl4(field[3*pos + 1]); + #endif + (*gf)[0][2].re = gfin.x; + (*gf)[0][2].im = gfin.y; + //second row + (*gf)[1][0].re = gfin.z; + (*gf)[1][0].im = gfin.w; + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,3*pos+2); + #else + gfin = sh4tofl4(field[3*pos + 2]); + #endif + (*gf)[1][1].re = gfin.x; + (*gf)[1][1].im = gfin.y; + (*gf)[1][2].re = gfin.z; + (*gf)[1][2].im = gfin.w; + + //third row from cconj(cross product of first and second row) + + (*gf)[2][0].re = (*gf)[0][1].re * (*gf)[1][2].re; + (*gf)[2][0].re -= (*gf)[0][1].im * (*gf)[1][2].im; + (*gf)[2][0].re -= (*gf)[0][2].re * (*gf)[1][1].re; + (*gf)[2][0].re += (*gf)[0][2].im * (*gf)[1][1].im; + + (*gf)[2][0].im = -(*gf)[0][1].re * (*gf)[1][2].im; + (*gf)[2][0].im -= (*gf)[0][1].im * (*gf)[1][2].re; + (*gf)[2][0].im += (*gf)[0][2].re * (*gf)[1][1].im; + (*gf)[2][0].im += (*gf)[0][2].im * (*gf)[1][1].re; + + + + (*gf)[2][1].re = (*gf)[0][2].re * (*gf)[1][0].re; + (*gf)[2][1].re -= (*gf)[0][2].im * (*gf)[1][0].im; + (*gf)[2][1].re -= (*gf)[0][0].re * (*gf)[1][2].re; + (*gf)[2][1].re += (*gf)[0][0].im * (*gf)[1][2].im; + + (*gf)[2][1].im = -(*gf)[0][2].re * (*gf)[1][0].im; + (*gf)[2][1].im -= (*gf)[0][2].im * (*gf)[1][0].re; + (*gf)[2][1].im += (*gf)[0][0].re * (*gf)[1][2].im; + (*gf)[2][1].im += (*gf)[0][0].im * (*gf)[1][2].re; + + + + (*gf)[2][2].re = (*gf)[0][0].re * (*gf)[1][1].re; + (*gf)[2][2].re -= (*gf)[0][0].im * (*gf)[1][1].im; + (*gf)[2][2].re -= (*gf)[0][1].re * (*gf)[1][0].re; + (*gf)[2][2].re += (*gf)[0][1].im * (*gf)[1][0].im; + + (*gf)[2][2].im = -(*gf)[0][0].re * 
(*gf)[1][1].im; + (*gf)[2][2].im -= (*gf)[0][0].im * (*gf)[1][1].re; + (*gf)[2][2].im += (*gf)[0][1].re * (*gf)[1][0].im; + (*gf)[2][2].im += (*gf)[0][1].im * (*gf)[1][0].re; + + + return; +} + + + + +// su3 - dagger reconstruction from two rows +__device__ void dev_reconstructgf_2vtexref_dagger_half (const dev_su3_2v_half* field, int pos, dev_su3* gf){ + //dev_complex help1; + //dev_complex help2; + float4 gfin; + + + //first column (minus in im for complex conj.) + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,3*pos); + #else + gfin = sh4tofl4(field[3*pos]); + #endif + (*gf)[0][0].re = gfin.x; + (*gf)[0][0].im = -gfin.y; + (*gf)[1][0].re = gfin.z; + (*gf)[1][0].im = -gfin.w; + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,3*pos+1); + #else + gfin = sh4tofl4(field[3*pos +1]); + #endif + (*gf)[2][0].re = gfin.x; + (*gf)[2][0].im = -gfin.y; + + //second column (minus in im for complex conj.) + (*gf)[0][1].re = gfin.z; + (*gf)[0][1].im = -gfin.w; + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,3*pos+2); + #else + gfin = sh4tofl4(field[3*pos +2]); + #endif + (*gf)[1][1].re = gfin.x; + (*gf)[1][1].im = -gfin.y; + (*gf)[2][1].re = gfin.z; + (*gf)[2][1].im = -gfin.w; + + + + (*gf)[0][2].re = (*gf)[1][0].re * (*gf)[2][1].re; + (*gf)[0][2].re -= (*gf)[1][0].im * (*gf)[2][1].im; + (*gf)[0][2].re -= (*gf)[2][0].re * (*gf)[1][1].re; + (*gf)[0][2].re += (*gf)[2][0].im * (*gf)[1][1].im; + + (*gf)[0][2].im = -(*gf)[1][0].re* (*gf)[2][1].im; + (*gf)[0][2].im -= (*gf)[1][0].im* (*gf)[2][1].re; + (*gf)[0][2].im += (*gf)[2][0].re*(*gf)[1][1].im; + (*gf)[0][2].im += (*gf)[2][0].im*(*gf)[1][1].re; + + + (*gf)[1][2].re = (*gf)[2][0].re*(*gf)[0][1].re; + (*gf)[1][2].re -= (*gf)[2][0].im*(*gf)[0][1].im; + (*gf)[1][2].re -= (*gf)[0][0].re*(*gf)[2][1].re; + (*gf)[1][2].re += (*gf)[0][0].im*(*gf)[2][1].im; + + (*gf)[1][2].im = -(*gf)[2][0].re * (*gf)[0][1].im; + (*gf)[1][2].im -= (*gf)[2][0].im * (*gf)[0][1].re; + (*gf)[1][2].im += (*gf)[0][0].re * (*gf)[2][1].im; + (*gf)[1][2].im += 
(*gf)[0][0].im * (*gf)[2][1].re; + + (*gf)[2][2].re = (*gf)[0][0].re * (*gf)[1][1].re; + (*gf)[2][2].re -= (*gf)[0][0].im * (*gf)[1][1].im; + (*gf)[2][2].re -= (*gf)[1][0].re * (*gf)[0][1].re; + (*gf)[2][2].re += (*gf)[1][0].im * (*gf)[0][1].im; + + (*gf)[2][2].im = -(*gf)[0][0].re * (*gf)[1][1].im; + (*gf)[2][2].im -= (*gf)[0][0].im * (*gf)[1][1].re; + (*gf)[2][2].im += (*gf)[1][0].re * (*gf)[0][1].im; + (*gf)[2][2].im += (*gf)[1][0].im * (*gf)[0][1].re; + +} + + + + + + +// reconstruction of the gf using 8 real parameters as +// described in the appendix of hep-lat 0911.3191 (M.Clark et al.) +// optimized once +__device__ void dev_reconstructgf_8texref_half (const dev_su3_2v_half * field, int pos, dev_su3* gf){ + + float4 gfin; + REAL one_over_N, help; + dev_complex p1,p2; + + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,2*pos); + #else + gfin = sh4tofl4(field[2*pos]); + #endif + // read a2 a3 + (*gf)[0][1].re = gfin.x; + (*gf)[0][1].im = gfin.y; + (*gf)[0][2].re = gfin.z; + (*gf)[0][2].im = gfin.w; + + p1.re = gfin.x*gfin.x + gfin.y*gfin.y + gfin.z*gfin.z + gfin.w*gfin.w; // use later on + one_over_N = rsqrtf(p1.re); //reciprocal sqrt + + // read theta_a1, theta_c1, b1 + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,2*pos + 1); + #else + gfin = sh4tofl4(field[2*pos + 1]); + #endif + + // reconstruct a1 use sqrt instead of sin + help = 1.0f - p1.re; + if(help > 0.0f){ + p1.re = sqrtf(help); + } + else{ + p1.re = 0.0f; + } + #ifdef HALF + // we have to multiply by two pi because normalization to -1..1 + gfin.x = gfin.x*pi_float; + gfin.y = gfin.y*pi_float; + #endif + sincos(gfin.x, &(*gf)[0][0].im, &(*gf)[0][0].re); + (*gf)[0][0].re = (*gf)[0][0].re * p1.re; + (*gf)[0][0].im = (*gf)[0][0].im * p1.re; + + + + // assign b1 + (*gf)[1][0].re = gfin.z; + (*gf)[1][0].im = gfin.w; + + // p2 = 1/N b1 + p2.re = one_over_N*(*gf)[1][0].re; + p2.im = one_over_N*(*gf)[1][0].im; + + + // reconstruct c1 use sqrt instead of sin + help =1.0f - + (*gf)[0][0].re * (*gf)[0][0].re - 
(*gf)[0][0].im * (*gf)[0][0].im - + (*gf)[1][0].re * (*gf)[1][0].re - (*gf)[1][0].im * (*gf)[1][0].im; + if(help > 0.0f){ + p1.re = sqrtf(help); + } + else{ + p1.re = 0.0f; + } + sincos(gfin.y, &(*gf)[2][0].im, &(*gf)[2][0].re); + (*gf)[2][0].re = (*gf)[2][0].re * p1.re; + (*gf)[2][0].im = (*gf)[2][0].im * p1.re; + + + + // p1 = 1/N*cconj(c1) + p1.re = one_over_N*(*gf)[2][0].re; + p1.im = - one_over_N*(*gf)[2][0].im; + + + + //use the last reconstructed gf component gf[2][2] (c3) as a help variable for b2,b3 and c2 + //this is in order to save registers and to prevent extra loading and storing from global mem + // calculate b2 + + (*gf)[1][1].re = p1.re*(*gf)[0][2].re; + (*gf)[1][1].re += p1.im*(*gf)[0][2].im; + (*gf)[1][1].im = p1.im*(*gf)[0][2].re; + (*gf)[1][1].im -= p1.re*(*gf)[0][2].im; + + (*gf)[2][2].re = (*gf)[0][0].re * (*gf)[0][1].re; + (*gf)[2][2].re += (*gf)[0][0].im * (*gf)[0][1].im; + + (*gf)[2][2].im = (*gf)[0][0].re * (*gf)[0][1].im; + (*gf)[2][2].im -= (*gf)[0][0].im * (*gf)[0][1].re; + (*gf)[2][2] = dev_cmult(p2, (*gf)[2][2]); + + (*gf)[1][1].re = -one_over_N*( (*gf)[1][1].re + (*gf)[2][2].re); + (*gf)[1][1].im = -one_over_N*((*gf)[1][1].im + (*gf)[2][2].im); + + + + + + // calculate b3 + (*gf)[1][2].re = p1.re*(*gf)[0][1].re; + (*gf)[1][2].re += p1.im*(*gf)[0][1].im; + (*gf)[1][2].im = p1.im*(*gf)[0][1].re; + (*gf)[1][2].im -= p1.re*(*gf)[0][1].im; + + (*gf)[2][2].re = (*gf)[0][0].re*(*gf)[0][2].re; + (*gf)[2][2].re += (*gf)[0][0].im*(*gf)[0][2].im; + (*gf)[2][2].im = (*gf)[0][0].re*(*gf)[0][2].im; + (*gf)[2][2].im -= (*gf)[0][0].im*(*gf)[0][2].re; + (*gf)[2][2] = dev_cmult(p2,(*gf)[2][2]); + + (*gf)[1][2].re = one_over_N*( (*gf)[1][2].re - (*gf)[2][2].re); + (*gf)[1][2].im = one_over_N*( (*gf)[1][2].im - (*gf)[2][2].im); + + + // calculate c2 + (*gf)[2][1].re = p2.re*(*gf)[0][2].re; + (*gf)[2][1].re -= p2.im*(*gf)[0][2].im; + (*gf)[2][1].im = -p2.re*(*gf)[0][2].im; + (*gf)[2][1].im -= p2.im*(*gf)[0][2].re; + + + + (*gf)[2][2].re = 
(*gf)[0][0].re*(*gf)[0][1].re; + (*gf)[2][2].re += (*gf)[0][0].im*(*gf)[0][1].im; + (*gf)[2][2].im = (*gf)[0][0].re* (*gf)[0][1].im; + (*gf)[2][2].im -= (*gf)[0][0].im* (*gf)[0][1].re; + help = (*gf)[2][2].re; + (*gf)[2][2].re = p1.re*(*gf)[2][2].re; + (*gf)[2][2].re += p1.im*(*gf)[2][2].im; + (*gf)[2][2].im = p1.re*(*gf)[2][2].im - p1.im*help; + + + (*gf)[2][1].re = one_over_N*((*gf)[2][1].re - (*gf)[2][2].re); + (*gf)[2][1].im = one_over_N*((*gf)[2][1].im - (*gf)[2][2].im); + + // now we have to use p2 and p1 as a help variable, as this is not + // needed any more after the first + // step + // calculate c3 + (*gf)[2][2].re = p2.re * (*gf)[0][1].re; + (*gf)[2][2].re -= p2.im * (*gf)[0][1].im; + (*gf)[2][2].im = - p2.im*(*gf)[0][1].re; + (*gf)[2][2].im -= p2.re*(*gf)[0][1].im; + + p2.re = (*gf)[0][0].re * (*gf)[0][2].re; + p2.re += (*gf)[0][0].im * (*gf)[0][2].im; + p2.im = (*gf)[0][0].re * (*gf)[0][2].im; + p2.im -= (*gf)[0][0].im * (*gf)[0][2].re; + p2 = dev_cmult( dev_cconj(p1) , p2); + + (*gf)[2][2] = dev_cadd((*gf)[2][2], p2); + (*gf)[2][2] = dev_crealmult((*gf)[2][2], -one_over_N); + +} + + + + + + + + +__device__ void dev_reconstructgf_8texref_dagger_half (const dev_su3_2v_half* field,int pos, dev_su3* gf){ + + + float4 gfin; + REAL one_over_N, help; + dev_complex p1,p2; + + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,2*pos); + #else + gfin = sh4tofl4(field[2*pos]); + #endif + // read a2 a3 + (*gf)[1][0].re = gfin.x; + (*gf)[1][0].im = -gfin.y; + (*gf)[2][0].re = gfin.z; + (*gf)[2][0].im = -gfin.w; + + p1.re = gfin.x*gfin.x + gfin.y*gfin.y + gfin.z*gfin.z + gfin.w*gfin.w; // use later on + one_over_N = rsqrtf(p1.re); // reciprocal sqrt + + + // read theta_a1, theta_c1, b1 + #ifdef USETEXTURE + gfin = tex1Dfetch(gf_tex,2*pos + 1); + #else + gfin = sh4tofl4(field[2*pos + 1]); + #endif + + // reconstruct a1 + help = 1.0f - p1.re; + if(help > 0.0f){ + p1.re = sqrtf(help); + } + else{ + p1.re = 0.0f; + } + //(*gf)[0][0].re = p1.re*cosf(gfin.x); + 
//(*gf)[0][0].im = -p1.re*sinf(gfin.x); + + #ifdef HALF + // we have to multiply by two pi because normalization to -1..1 + gfin.x = gfin.x*pi_float; + gfin.y = gfin.y*pi_float; + #endif + + sincos(gfin.x, &(*gf)[0][0].im, &(*gf)[0][0].re); + (*gf)[0][0].re = (*gf)[0][0].re * p1.re; + (*gf)[0][0].im = -(*gf)[0][0].im * p1.re; + + + // assign b1 + (*gf)[0][1].re = gfin.z; + (*gf)[0][1].im = -gfin.w; + + // p2 = 1/N b1 + p2.re = one_over_N*(*gf)[0][1].re; + p2.im = -one_over_N*(*gf)[0][1].im; + + + // reconstruct c1 + help = 1.0f - + (*gf)[0][0].re * (*gf)[0][0].re - (*gf)[0][0].im * (*gf)[0][0].im - + (*gf)[0][1].re * (*gf)[0][1].re - (*gf)[0][1].im * (*gf)[0][1].im; + if(help > 0.0f){ + p1.re = sqrtf(help); + } + else{ + p1.re = 0.0f; + } + //(*gf)[0][2].re = p1.re*cosf(gfin.y); + //(*gf)[0][2].im = -p1.re*sinf(gfin.y); + + sincos(gfin.y, &(*gf)[0][2].im, &(*gf)[0][2].re); + (*gf)[0][2].re = (*gf)[0][2].re * p1.re; + (*gf)[0][2].im = -(*gf)[0][2].im * p1.re; + + + // p1 = 1/N*cconj(c1) + p1.re = one_over_N*(*gf)[0][2].re; + p1.im = one_over_N*(*gf)[0][2].im; + + //use the last reconstructed gf component gf[2][2] (c3) as a help variable for b2,b3 and c2 + //this is in order to save registers and to prevent extra loading and storing from global mem + // calculate b2 + (*gf)[1][1] = dev_cmult(p1, (*gf)[2][0] ); + (*gf)[2][2] = dev_cmult(p2, dev_cmult( (*gf)[0][0] , dev_cconj((*gf)[1][0] )) ); + (*gf)[1][1] = dev_cadd((*gf)[1][1], (*gf)[2][2]); + (*gf)[1][1] = dev_cconj(dev_crealmult((*gf)[1][1], -one_over_N)); + + // calculate b3 + (*gf)[2][1] = dev_cmult(p1, (*gf)[1][0] ); + (*gf)[2][2] = dev_cmult(p2, dev_cmult( (*gf)[0][0] , dev_cconj((*gf)[2][0] )) ); + (*gf)[2][1] = dev_csub((*gf)[2][1], (*gf)[2][2]); + (*gf)[2][1] = dev_cconj(dev_crealmult((*gf)[2][1], one_over_N)); + + // calculate c2 + (*gf)[1][2] = dev_cmult( dev_cconj(p2) , (*gf)[2][0] ); + (*gf)[2][2] = dev_cmult( dev_cconj(p1) , + dev_cmult( (*gf)[0][0] , dev_cconj( (*gf)[1][0]) ) + ); + (*gf)[1][2] = 
dev_csub((*gf)[1][2], (*gf)[2][2]); + (*gf)[1][2] = dev_cconj(dev_crealmult((*gf)[1][2], one_over_N)); + + // use p2 as help variable after the first step + // calculate c3 + (*gf)[2][2] = dev_cmult( dev_cconj(p2) , (*gf)[1][0] ); + p2 = dev_cmult( dev_cconj(p1) , + dev_cmult( (*gf)[0][0] , dev_cconj((*gf)[2][0] ) ) + ); + (*gf)[2][2] = dev_cadd((*gf)[2][2], p2); + (*gf)[2][2] = dev_cconj(dev_crealmult((*gf)[2][2], -one_over_N)); + +} + + +#endif // HALF + + + + + + + + + +template +__global__ void dev_check_gauge_reconstruction_8(typename dev_su3_2vT::type* gf, int pos, typename dev_su3T::type * outgf1, typename dev_su3T::type* outgf2){ + dev_reconstructgf_8texref (gf,pos, outgf1); + dev_reconstructgf_8texref_dagger (gf,pos, outgf2); +} + + + + + + +////////////////////// HOST FUNCTIONS FOR GAUGE RECONSTRUCTION //////////////////// + + +// get 2 first rows of gf float4 type +// +// +template +void su3to2vf4(su3** gf, typename dev_su3_2vT::type* h2d_gf){ + int i,j; + #ifndef MPI + for (i = 0; i < VOLUME; i++) { + #else + for (i = 0; i < (VOLUME+RAND); i++) { + #endif + for(j=0;j<4;j++){ + //first row + h2d_gf[3*(4*i+j)].x = (RealT) gf[i][j].c00.re; + h2d_gf[3*(4*i+j)].y = (RealT) gf[i][j].c00.im; + h2d_gf[3*(4*i+j)].z = (RealT) gf[i][j].c01.re; + h2d_gf[3*(4*i+j)].w = (RealT) gf[i][j].c01.im; + h2d_gf[3*(4*i+j)+1].x = (RealT) gf[i][j].c02.re; + h2d_gf[3*(4*i+j)+1].y = (RealT) gf[i][j].c02.im; + //second row + h2d_gf[3*(4*i+j)+1].z = (RealT) gf[i][j].c10.re; + h2d_gf[3*(4*i+j)+1].w = (RealT) gf[i][j].c10.im; + h2d_gf[3*(4*i+j)+2].x = (RealT) gf[i][j].c11.re; + h2d_gf[3*(4*i+j)+2].y = (RealT) gf[i][j].c11.im; + h2d_gf[3*(4*i+j)+2].z = (RealT) gf[i][j].c12.re; + h2d_gf[3*(4*i+j)+2].w = (RealT) gf[i][j].c12.im; + } + } +} + + + + +// bring gf into the form +// a2 a3, theta_a1, theta_c1, b1 +// +template +void su3to8(su3** gf, typename dev_su3_8T::type* h2d_gf){ + int i,j; + #ifndef MPI + for (i = 0; i < VOLUME; i++) { + #else + for (i = 0; i < (VOLUME+RAND); i++) { + 
#endif + for(j=0;j<4;j++){ + // a2, a3 + h2d_gf[2*(4*i+j)].x = (RealT) gf[i][j].c01.re; + h2d_gf[2*(4*i+j)].y = (RealT) gf[i][j].c01.im; + h2d_gf[2*(4*i+j)].z = (RealT) gf[i][j].c02.re; + h2d_gf[2*(4*i+j)].w = (RealT) gf[i][j].c02.im; + + // theta_a1, theta_c1 + // use atan2 for this: following the reference, atan2 should give an angle -pi < phi < +pi + h2d_gf[2*(4*i+j)+1].x = (RealT)( atan2((RealT) gf[i][j].c00.im,(RealT) gf[i][j].c00.re )); + h2d_gf[2*(4*i+j)+1].y = (RealT) ( atan2((RealT) gf[i][j].c20.im,(RealT)gf[i][j].c20.re )); + + // b1 + h2d_gf[2*(4*i+j)+1].z = (RealT) gf[i][j].c10.re ; + h2d_gf[2*(4*i+j)+1].w = (RealT) gf[i][j].c10.im ; + } + } +} + + + + + + +// this is to reconstruct the gf on the host from 2 rows of the link +// may be used for tests +void reconstructgf_2v (dev_su3* gf){ + complex help1; + complex help2; + //third row from cconj(cross product of first and second row) + _mult_assign_complex(help1,(*gf)[0][1],(*gf)[1][2]); + _mult_assign_complex(help2,(*gf)[0][2],(*gf)[1][1]); + _diff_complex(help1,help2); + help1.im = -help1.im; + (*gf)[2][0].re = help1.re; + (*gf)[2][0].im = help1.im; + + _mult_assign_complex(help1,(*gf)[0][2],(*gf)[1][0]); + _mult_assign_complex(help2,(*gf)[0][0],(*gf)[1][2]); + _diff_complex(help1,help2); + help1.im = -help1.im; + (*gf)[2][1].re = help1.re; + (*gf)[2][1].im = help1.im; + + _mult_assign_complex(help1,(*gf)[0][0],(*gf)[1][1]); + _mult_assign_complex(help2,(*gf)[0][1],(*gf)[1][0]); + _diff_complex(help1,help2); + help1.im = -help1.im; + (*gf)[2][2].re = help1.re; + (*gf)[2][2].im = help1.im; + return; +} + + + + +// this is to reconstruct the gf on the host from 2 rows of the link +// may be used for tests +void reconstructgf_8 (dev_su3_8 * h2d_gf, dev_su3* gf){ + + float4 gfin; + REAL N, one_over_N, help; + complex p1,p2, chelp1, chelp2, chelp3, chelpconj, chelpconj2; + + gfin = h2d_gf[0]; + // read a2 a3 + (*gf)[0][1].re = gfin.x; + (*gf)[0][1].im = gfin.y; + (*gf)[0][2].re = gfin.z; + (*gf)[0][2].im = 
gfin.w; + + help = gfin.x*gfin.x + gfin.y*gfin.y + gfin.z*gfin.z + gfin.w*gfin.w; // use later on + N = sqrt(help); + one_over_N = 1.0f/N; + + // read theta_a1, theta_c1, b1 + gfin = h2d_gf[1]; + + // reconstruct a1 + help = sqrt(1.0f - help); + (*gf)[0][0].re = help*cos(gfin.x); + (*gf)[0][0].im = help*sin(gfin.x); + + // assign b1 + (*gf)[1][0].re = gfin.z; + (*gf)[1][0].im = gfin.w; + + // p2 = 1/N b1 + p2.re = one_over_N*(*gf)[1][0].re; + p2.im = one_over_N*(*gf)[1][0].im; + + + // reconstruct c1 + help = sqrt(1.0f - + (*gf)[0][0].re * (*gf)[0][0].re - (*gf)[0][0].im * (*gf)[0][0].im - + (*gf)[1][0].re * (*gf)[1][0].re - (*gf)[1][0].im * (*gf)[1][0].im + ); + (*gf)[2][0].re = help*cos(gfin.y); + (*gf)[2][0].im = help*sin(gfin.y); + + + // p1 = 1/N*cconj(c1) + p1.re = one_over_N*(*gf)[2][0].re; + p1.im = - one_over_N*(*gf)[2][0].im; + + + float temp = p1.re*p1.re + p1.im*p1.im + p2.re*p2.re + p2.im*p2.im; + printf("p1**2 + p2**2 = %f\n", temp); + + + // calculate b2 + _complex_conj(chelpconj, (*gf)[0][2] ); + _mult_assign_complex(chelp1, p1, chelpconj ); + _complex_conj(chelpconj, (*gf)[0][0]); + _mult_assign_complex(chelp3, chelpconj , (*gf)[0][1] ); + _mult_assign_complex(chelp2, p2, chelp3); + _add_complex(chelp1, chelp2); + _mult_real((*gf)[1][1], chelp1, -one_over_N); + + + // calculate b3 + _complex_conj(chelpconj, (*gf)[0][1] ); + _mult_assign_complex(chelp1, p1, chelpconj ); + _complex_conj(chelpconj, (*gf)[0][0]); + _mult_assign_complex(chelp3, chelpconj , (*gf)[0][2] ); + _mult_assign_complex(chelp2, p2, chelp3 ); + _diff_complex(chelp1, chelp2); + _mult_real((*gf)[1][2],chelp1, one_over_N); + + + // calculate c2 + _complex_conj(chelpconj, p2); + _complex_conj(chelpconj2, (*gf)[0][2]); + _mult_assign_complex(chelp1, chelpconj , chelpconj2 ); + _complex_conj(chelpconj,(*gf)[0][0]); + _mult_assign_complex(chelp3, chelpconj , (*gf)[0][1] ); + _complex_conj(chelpconj2,p1); + _mult_assign_complex(chelp2, chelpconj2 , chelp3); + _diff_complex(chelp1, 
chelp2); + _mult_real((*gf)[2][1],chelp1, one_over_N); + + + // calculate c3 + _complex_conj(chelpconj, p2); + _complex_conj(chelpconj2, (*gf)[0][1] ); + _mult_assign_complex(chelp1, chelpconj , chelpconj2 ); + _complex_conj(chelpconj,(*gf)[0][0]); + _mult_assign_complex(chelp3, chelpconj ,(*gf)[0][2]); + _complex_conj(chelpconj,p1); + _mult_assign_complex( chelp2, chelpconj , chelp3 ); + _add_complex(chelp1, chelp2); + _mult_real((*gf)[2][2], chelp1, -one_over_N); + +} + + + + + + + + +void show_su3(su3 gf1){ + printf("(%f,%f)\t(%f,%f)\t(%f,%f)\n",gf1.c00.re, + gf1.c00.im, + gf1.c01.re, + gf1.c01.im, + gf1.c02.re, + gf1.c02.im + ); + printf("(%f,%f)\t(%f,%f)\t(%f,%f)\n",gf1.c10.re, + gf1.c10.im, + gf1.c11.re, + gf1.c11.im, + gf1.c12.re, + gf1.c12.im + ); + printf("(%f,%f)\t(%f,%f)\t(%f,%f)\n",gf1.c20.re, + gf1.c20.im, + gf1.c21.re, + gf1.c21.im, + gf1.c22.re, + gf1.c22.im + ); +} + + +void show_dev_su3(dev_su3 gf1){ + printf("(%f,%f)\t(%f,%f)\t(%f,%f)\n",gf1[0][0].re, + gf1[0][0].im, + gf1[0][1].re, + gf1[0][1].im, + gf1[0][2].re, + gf1[0][2].im + ); + printf("(%f,%f)\t(%f,%f)\t(%f,%f)\n",gf1[1][0].re, + gf1[1][0].im, + gf1[1][1].re, + gf1[1][1].im, + gf1[1][2].re, + gf1[1][2].im + ); + printf("(%f,%f)\t(%f,%f)\t(%f,%f)\n",gf1[2][0].re, + gf1[2][0].im, + gf1[2][1].re, + gf1[2][1].im, + gf1[2][2].re, + gf1[2][2].im + ); + +} + + + +template +void check_gauge_reconstruction_8(su3 ** gf1, dev_su3_2vM(RealT) * gf2, int ind1, int mu, MixedsolveParameter& mixedsolveParameter){ + dev_su3M(RealT) * reconst_g , * reconst_g_dagger; + dev_su3M(RealT) result, result_dagger; + printf("Checking 8 paramater reconstruction of gauge field:\n"); + su3 gfdagger; + #ifdef USETEXTURE + bind_texture_gf(gf2); + #endif + printf("\n"); + size_t cpsize = sizeof(dev_su3M(RealT)); // parallel in t and z direction + cudaMalloc((void **) &reconst_g, cpsize); + cudaMalloc((void **) &reconst_g_dagger, cpsize); + + show_su3(gf1[ind1][mu]); + printf("\n"); + + dev_check_gauge_reconstruction_8 <<< 
1 , 1 >>> (mixedsolveParameter.dev_gf,4*ind1 + mu, reconst_g, reconst_g_dagger); + cudaMemcpy(&result, reconst_g, cpsize, cudaMemcpyDeviceToHost); + cudaMemcpy(&result_dagger, reconst_g_dagger, cpsize, cudaMemcpyDeviceToHost); + + show_dev_su3(result); + printf("\n"); + + _su3_dagger(gfdagger,gf1[ind1][mu]); + show_su3(gfdagger); + printf("\n"); + show_dev_su3(result_dagger); + + + + #ifdef USETEXTURE + unbind_texture_gf(); + #endif + cudaFree(reconst_g); +} + + + + + + + + + + + +// compare host gauge-field with gauge-field that is reconstructed (on host) +template +void showcompare_gf(int t, int x, int y, int z, int mu, MixedsolveParameter& mixedsolveParameter){ + int ind1 = g_ipt[t][x][y][z]; + su3 ** gf1 = g_gauge_field; + + printf("(%f,%f)\t(%f,%f)\t(%f,%f)\n",gf1[ind1][mu].c00.re, + gf1[ind1][mu].c00.im, + gf1[ind1][mu].c01.re, + gf1[ind1][mu].c01.im, + gf1[ind1][mu].c02.re, + gf1[ind1][mu].c02.im + ); + printf("(%f,%f)\t(%f,%f)\t(%f,%f)\n",gf1[ind1][mu].c10.re, + gf1[ind1][mu].c10.im, + gf1[ind1][mu].c11.re, + gf1[ind1][mu].c11.im, + gf1[ind1][mu].c12.re, + gf1[ind1][mu].c12.im + ); + printf("(%f,%f)\t(%f,%f)\t(%f,%f)\n",gf1[ind1][mu].c20.re, + gf1[ind1][mu].c20.im, + gf1[ind1][mu].c21.re, + gf1[ind1][mu].c21.im, + gf1[ind1][mu].c22.re, + gf1[ind1][mu].c22.im + ); + printf("\n\n"); + + int ind2 = z + LZ*(y + LY*(x + LX*t)); +#ifdef GF_8 + printf("8-field:\t(%f,%f,%f,%f) (%f,%f,%f,%f)\n", + mixedsolveParameter.h2d_gf[2*(4*ind2+mu)].x, + mixedsolveParameter.h2d_gf[2*(4*ind2+mu)].y, + mixedsolveParameter.h2d_gf[2*(4*ind2+mu)].z, + mixedsolveParameter.h2d_gf[2*(4*ind2+mu)].w, + mixedsolveParameter.h2d_gf[2*(4*ind2+mu)+1].x, + mixedsolveParameter.h2d_gf[2*(4*ind2+mu)+1].y, + mixedsolveParameter.h2d_gf[2*(4*ind2+mu)+1].z, + mixedsolveParameter.h2d_gf[2*(4*ind2+mu)+1].w + ); + dev_su3M(RealT) help; + reconstructgf_8( &(mixedsolveParameter.h2d_gf[2*(4*ind2+mu)]) , &help ); + printf("(%f,%f)\t(%f,%f)\t(%f,%f)\n",help[0][0].re, + help[0][0].im, + help[0][1].re, + 
help[0][1].im, + help[0][2].re, + help[0][2].im + ); + printf("(%f,%f)\t(%f,%f)\t(%f,%f)\n",help[1][0].re, + help[1][0].im, + help[1][1].re, + help[1][1].im, + help[1][2].re, + help[1][2].im + ); + printf("(%f,%f)\t(%f,%f)\t(%f,%f)\n",help[2][0].re, + help[2][0].im, + help[2][1].re, + help[2][1].im, + help[2][2].re, + help[2][2].im + ); + +#else + printf("(%f,%f)\t(%f,%f)\t(%f,%f)\n",mixedsolveParameter.h2d_gf[3*(4*ind2+mu)].x, + mixedsolveParameter.h2d_gf[3*(4*ind2+mu)].y, + mixedsolveParameter.h2d_gf[3*(4*ind2+mu)].z, + mixedsolveParameter.h2d_gf[3*(4*ind2+mu)].w, + mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+1].x, + mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+1].y + ); + printf("(%f,%f)\t(%f,%f)\t(%f,%f)\n",mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+1].z, + mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+1].w, + mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+2].x, + mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+2].y, + mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+2].z, + mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+2].w + ); + + dev_su3M(RealT) help; + + help[0][0].re = mixedsolveParameter.h2d_gf[3*(4*ind2+mu)].x; + help[0][0].im = mixedsolveParameter.h2d_gf[3*(4*ind2+mu)].y; + help[0][1].re = mixedsolveParameter.h2d_gf[3*(4*ind2+mu)].z; + help[0][1].im = mixedsolveParameter.h2d_gf[3*(4*ind2+mu)].w; + + help[0][2].re = mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+1].x; + help[0][2].im = mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+1].y; + help[1][0].re = mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+1].z; + help[1][0].im = mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+1].w; + + help[1][1].re = mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+2].x; + help[1][1].im = mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+2].y; + help[1][2].re = mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+2].z; + help[1][2].im = mixedsolveParameter.h2d_gf[3*(4*ind2+mu)+2].w; + + reconstructgf_2v (&help); + + printf("(%f,%f)\t(%f,%f)\t(%f,%f)\n",help[2][0].re, + help[2][0].im, + help[2][1].re, + help[2][1].im, + help[2][2].re, + help[2][2].im + ); +#endif +} 
+ + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/half.cuh b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/half.cuh new file mode 100644 index 0000000000000000000000000000000000000000..1261f240dd790dd535412348521f843cc6b8bc9e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/half.cuh @@ -0,0 +1,1157 @@ +/*********************************************************************** + * + * Copyright (C) 2010 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + * + * File: half.cuh + * + * CUDA half precision conversions and BLAS kernels + * + * + * + **************************************************************************/ + + +#define pi_float 3.141592654f + + + + +//////// short <-> float conversion ///////////// +#define SHORT_LEN 65536 +#define SCALE ((SHORT_LEN-1) * 0.5) +#define SHIFT (-1.0f/(SHORT_LEN-1)) + +/* +__device__ short fl2sh(float f) { + short ret = (short)((f+SHIFT)*SCALE); + return ret; +} + +__device__ float sh2fl(short s) { + return ((float)(s/SCALE) - SHIFT); +} + + +short fl2sh_host(float f) { + short ret = (short)((f+SHIFT)*SCALE); + return ret; +} + +float sh2fl_host(short s) { + return ((float)(s/SCALE) - SHIFT); +} + + +float half2float_host(short in, float innorm){ + return(sh2fl_host(in)*innorm); +} + +*/ + + +#define fl2sh(f) ((short)(((f)+SHIFT)*SCALE)) +#define sh2fl(s) ((float)((s)/SCALE) - SHIFT) +#define half2fl(s,norm) (norm*((float)((s)/SCALE) - SHIFT)) + +short fl2sh_host(float f) { + short ret = (short)((f+SHIFT)*SCALE); + return ret; +} + +float sh2fl_host(short s) { + return ((float)(s/SCALE) - SHIFT); +} + + +float half2float_host(short in, float innorm){ + return(sh2fl_host(in)*innorm); +} + + + +#define construct_spinor_fromhalf(sf, sh, shn, pos){ \ + (sf)[0].x = shn*sh2fl(sh[6*(pos)].x); \ + (sf)[0].y = shn*sh2fl(sh[6*(pos)].y); \ + (sf)[0].z = shn*sh2fl(sh[6*(pos)].z); \ + (sf)[0].w = shn*sh2fl(sh[6*(pos)].w); \ + (sf)[1].x = shn*sh2fl(sh[6*(pos)+1].x); \ + (sf)[1].y = shn*sh2fl(sh[6*(pos)+1].y); \ + (sf)[1].z = shn*sh2fl(sh[6*(pos)+1].z); \ + (sf)[1].w = shn*sh2fl(sh[6*(pos)+1].w); \ + (sf)[2].x = shn*sh2fl(sh[6*(pos)+2].x); \ + (sf)[2].y = shn*sh2fl(sh[6*(pos)+2].y); \ + (sf)[2].z = shn*sh2fl(sh[6*(pos)+2].z); \ + (sf)[2].w = shn*sh2fl(sh[6*(pos)+2].w); \ + (sf)[3].x = shn*sh2fl(sh[6*(pos)+3].x); \ + (sf)[3].y = shn*sh2fl(sh[6*(pos)+3].y); \ + (sf)[3].z = shn*sh2fl(sh[6*(pos)+3].z); \ + (sf)[3].w = shn*sh2fl(sh[6*(pos)+3].w); \ + (sf)[4].x = 
shn*sh2fl(sh[6*(pos)+4].x); \ + (sf)[4].y = shn*sh2fl(sh[6*(pos)+4].y); \ + (sf)[4].z = shn*sh2fl(sh[6*(pos)+4].z); \ + (sf)[4].w = shn*sh2fl(sh[6*(pos)+4].w); \ + (sf)[5].x = shn*sh2fl(sh[6*(pos)+5].x); \ + (sf)[5].y = shn*sh2fl(sh[6*(pos)+5].y); \ + (sf)[5].z = shn*sh2fl(sh[6*(pos)+5].z); \ + (sf)[5].w = shn*sh2fl(sh[6*(pos)+5].w); }\ + + +#define get_half_norm(n,s){ \ + float c0 = fmaxf(fabsf((s[0]).x), fabsf((s[0]).y)); \ + float c1 = fmaxf(fabsf((s[0]).z), fabsf((s[0]).w)); \ + float c2 = fmaxf(fabsf((s[1]).x), fabsf((s[1]).y)); \ + float c3 = fmaxf(fabsf((s[1]).z), fabsf((s[1]).w)); \ + float c4 = fmaxf(fabsf((s[2]).x), fabsf((s[2]).y)); \ + float c5 = fmaxf(fabsf((s[2]).z), fabsf((s[2]).w)); \ + float c6 = fmaxf(fabsf((s[3]).x), fabsf((s[3]).y)); \ + float c7 = fmaxf(fabsf((s[3]).z), fabsf((s[3]).w)); \ + float c8 = fmaxf(fabsf((s[4]).x), fabsf((s[4]).y)); \ + float c9 = fmaxf(fabsf((s[4]).z), fabsf((s[4]).w)); \ + float c10 = fmaxf(fabsf((s[5]).x), fabsf((s[5]).y)); \ + float c11 = fmaxf(fabsf((s[5]).z), fabsf((s[5]).w)); \ + c0 = fmaxf(c0, c1); c1 = fmaxf(c2, c3); c2 = fmaxf(c4, c5); \ + c3 = fmaxf(c6, c7); c4 = fmaxf(c8, c9); c5 = fmaxf(c10, c11); \ + c0 = fmaxf(c0, c1); c1 = fmaxf(c2, c3); c2 = fmaxf(c4, c5); \ + c0 = fmaxf(c0, c1); c0 = fmaxf(c0, c2); \ + n = c0; \ +}\ + + +#define get_half_norm_from_pos(n,s,pos){ \ + float c0 = fmaxf(fabsf((s[6*pos+0]).x), fabsf((s[6*pos+0]).y)); \ + float c1 = fmaxf(fabsf((s[6*pos+0]).z), fabsf((s[6*pos+0]).w)); \ + float c2 = fmaxf(fabsf((s[6*pos+1]).x), fabsf((s[6*pos+1]).y)); \ + float c3 = fmaxf(fabsf((s[6*pos+1]).z), fabsf((s[6*pos+1]).w)); \ + float c4 = fmaxf(fabsf((s[6*pos+2]).x), fabsf((s[6*pos+2]).y)); \ + float c5 = fmaxf(fabsf((s[6*pos+2]).z), fabsf((s[6*pos+2]).w)); \ + float c6 = fmaxf(fabsf((s[6*pos+3]).x), fabsf((s[6*pos+3]).y)); \ + float c7 = fmaxf(fabsf((s[6*pos+3]).z), fabsf((s[6*pos+3]).w)); \ + float c8 = fmaxf(fabsf((s[6*pos+4]).x), fabsf((s[6*pos+4]).y)); \ + float c9 = 
fmaxf(fabsf((s[6*pos+4]).z), fabsf((s[6*pos+4]).w)); \ + float c10 = fmaxf(fabsf((s[6*pos+5]).x), fabsf((s[6*pos+5]).y)); \ + float c11 = fmaxf(fabsf((s[6*pos+5]).z), fabsf((s[6*pos+5]).w)); \ + c0 = fmaxf(c0, c1); c1 = fmaxf(c2, c3); c2 = fmaxf(c4, c5); \ + c3 = fmaxf(c6, c7); c4 = fmaxf(c8, c9); c5 = fmaxf(c10, c11); \ + c0 = fmaxf(c0, c1); c1 = fmaxf(c2, c3); c2 = fmaxf(c4, c5); \ + c0 = fmaxf(c0, c1); c0 = fmaxf(c0, c2); \ + n = c0; \ +}\ + + + +#define get_half_norm_from_pos_host(n,s,pos){ \ + float c0 = fmaxf(fabsf((s[pos].s0.c0.re)), fabsf((s[pos].s0.c0.im))); \ + float c1 = fmaxf(fabsf((s[pos].s0.c1.re)), fabsf((s[pos].s0.c1.im))); \ + float c2 = fmaxf(fabsf((s[pos].s0.c2.re)), fabsf((s[pos].s0.c2.im))); \ + float c3 = fmaxf(fabsf((s[pos].s1.c0.re)), fabsf((s[pos].s1.c0.im))); \ + float c4 = fmaxf(fabsf((s[pos].s1.c1.re)), fabsf((s[pos].s1.c1.im))); \ + float c5 = fmaxf(fabsf((s[pos].s1.c2.re)), fabsf((s[pos].s1.c2.im))); \ + float c6 = fmaxf(fabsf((s[pos].s2.c0.re)), fabsf((s[pos].s2.c0.im))); \ + float c7 = fmaxf(fabsf((s[pos].s2.c1.re)), fabsf((s[pos].s2.c1.im))); \ + float c8 = fmaxf(fabsf((s[pos].s2.c2.re)), fabsf((s[pos].s2.c2.im))); \ + float c9 = fmaxf(fabsf((s[pos].s3.c0.re)), fabsf((s[pos].s3.c0.im))); \ + float c10 = fmaxf(fabsf((s[pos].s3.c1.re)), fabsf((s[pos].s3.c1.im))); \ + float c11 = fmaxf(fabsf((s[pos].s3.c2.re)), fabsf((s[pos].s3.c2.im))); \ + c0 = fmaxf(c0, c1); c1 = fmaxf(c2, c3); c2 = fmaxf(c4, c5); \ + c3 = fmaxf(c6, c7); c4 = fmaxf(c8, c9); c5 = fmaxf(c10, c11); \ + c0 = fmaxf(c0, c1); c1 = fmaxf(c2, c3); c2 = fmaxf(c4, c5); \ + c0 = fmaxf(c0, c1); c0 = fmaxf(c0, c2); \ + n = c0; \ +}\ + + +////////////////////////////// + + + + +// these are textures for the half spinor fields - maybe move to textures.h if possible + + /* texture for spinor field */ + texture spinhalf_tex; + + /* texture for norm of spinor field 1*/ + texture spinnormhalf_tex; + + + + int blas_half_gridsize; + int blas_half_blocksize; // kernel parameters for the 
half_dot and axpy kernels + float * dev_blas_half_redfield; //this is the reduction field for the + //blas reduction kernels + float * dev_blas_half_sredfield; // this is the small reduction field after one sweep of reduction + float * blas_half_sredfield; + int blas_half_redblocks; // the number of blocks of the reduction kernel + // VOLUME/REDUCTION_N + // also the size of the final sum (of reduction) + // performed on host + + + + +// write float spinor in to half spinor out and out_norm +__device__ void dev_write_spinor_half(dev_spinor* in, dev_spinor_half* out, float* out_norm){ + float norm = 0.0f; + int i; + + get_half_norm(norm, in); + + //store unit direction vector + *out_norm = norm; + if (norm != 0.0f){ + //store norm + #pragma unroll 6 + for(i=0; i<6; i++){ + out[i].x = fl2sh(in[i].x/norm); + out[i].y = fl2sh(in[i].y/norm); + out[i].z = fl2sh(in[i].z/norm); + out[i].w = fl2sh(in[i].w/norm); + } + } + else{ + //store norm + #pragma unroll 6 + for(i=0; i<6; i++){ + out[i].x = fl2sh(0.0f); + out[i].y = fl2sh(0.0f); + out[i].z = fl2sh(0.0f); + out[i].w = fl2sh(0.0f); + } + } +} + + + + + + + +// stores the float spinor field in s into the half spinor field sh and the norm into shnorm +__global__ void float2half_spinorfield(dev_spinor* s, dev_spinor_half* sh, float* shnorm){ + int pos=threadIdx.x + blockDim.x*blockIdx.x; + //__shared__ float4 slocal[6]; + int i; + float norm = 0.0; + if(pos < dev_VOLUME){ + + /* BEWARE THIS IS NOT WORKING FOR SOME REASON dev_copy_spinor fails because slocal is shared??? 
+ dev_copy_spinor(&(s[6*pos]), &(slocal[0])); + // calculate norm + + + for(i=0; i<6; i++){ + norm += slocal[i].x*slocal[i].x + slocal[i].y*slocal[i].y + + slocal[i].z*slocal[i].z + slocal[i].w*slocal[i].w; + } + + */ + + get_half_norm_from_pos(norm, s, pos); + + shnorm[pos] = norm; + //store unit direction vector + if (norm != 0.0f){ + //store norm + #pragma unroll 6 + for(i=0; i<6; i++){ + sh[6*pos+i].x = fl2sh(s[6*pos+i].x/norm); + sh[6*pos+i].y = fl2sh(s[6*pos+i].y/norm); + sh[6*pos+i].z = fl2sh(s[6*pos+i].z/norm); + sh[6*pos+i].w = fl2sh(s[6*pos+i].w/norm); + } + } + else{ + //store norm + #pragma unroll 6 + for(i=0; i<6; i++){ + sh[6*pos+i].x = fl2sh(0.0f); + sh[6*pos+i].y = fl2sh(0.0f); + sh[6*pos+i].z = fl2sh(0.0f); + sh[6*pos+i].w = fl2sh(0.0f); + } + } + + + } +} + + +// reads half spinor from texture "spinhalf_tex" and the norm from "spinnorm_tex" and stores it into float spinor to +__global__ void half2float_spinorfield_tex(dev_spinor* to){ + +int pos=threadIdx.x + blockDim.x*blockIdx.x; + int i; + float norm = 0.0; + float4 help; + if(pos < dev_VOLUME){ + norm = tex1Dfetch(spinnormhalf_tex,pos); + for(i=0; i<6; i++){ + help = tex1Dfetch(spinhalf_tex,6*pos+i); + to[6*pos+i].x = help.x*norm; + to[6*pos+i].y = help.y*norm; + to[6*pos+i].z = help.z*norm; + to[6*pos+i].w = help.w*norm; + } + + } +} + + +// stores the float4 gauge field gf into the half gauge field gfh +// for GF_8 we have to be careful, as we have two angles in -Pi .. 
Pi +// so we have to divide them by (Pi) This is taken care of in the gauge +// reconstruction routines +// the volume is given explicitly (vol) here, to make sure alway the complete +// VOLUME is processed and not VOLUME/2 (eo) +__global__ void float2half_gaugefield(dev_su3_2v* gf, dev_su3_2v_half* gfh, int vol){ + + int pos=threadIdx.x + blockDim.x*blockIdx.x; + int nf4,mu; + if(pos < vol){ + for(mu=0; mu<4; mu++){ + #ifdef GF_8 + nf4 = 2; + gfh[nf4*(4*pos+mu)].x = fl2sh(gf[nf4*(4*pos+mu)].x); + gfh[nf4*(4*pos+mu)].y = fl2sh(gf[nf4*(4*pos+mu)].y); + gfh[nf4*(4*pos+mu)].z = fl2sh(gf[nf4*(4*pos+mu)].z); + gfh[nf4*(4*pos+mu)].w = fl2sh(gf[nf4*(4*pos+mu)].w); + + gfh[nf4*(4*pos+mu)+1].x = fl2sh(gf[nf4*(4*pos+mu)+1].x/pi_float); + gfh[nf4*(4*pos+mu)+1].y = fl2sh(gf[nf4*(4*pos+mu)+1].y/pi_float); + gfh[nf4*(4*pos+mu)+1].z = fl2sh(gf[nf4*(4*pos+mu)+1].z); + gfh[nf4*(4*pos+mu)+1].w = fl2sh(gf[nf4*(4*pos+mu)+1].w); + #else + int i; + nf4 = 3; + for(i=0; i= BLOCK2){ + gridsize = (int) (VOLUME/BLOCK2) +1; + } + else{ + gridsize=1; + } + + + //DEBUG_FLO + int i; + + float4 blub[6]; + for(i=0; i<6; i++){ + blub[i].x = (float) 0.1; + blub[i].y = (float) 0.2; + blub[i].z = (float) 0.3; + blub[i].w = (float) 0.4; + } + for(i=0; i<6; i++){ + printf("%d:x of float test vector: %f\n",i,blub[i].x); + printf("%d:y of float test vector: %f\n",i,blub[i].y); + printf("%d:z of float test vector: %f\n",i,blub[i].z); + printf("%d:w of float test vector: %f\n",i,blub[i].w); + } + + + cudaMemcpy(s, &(blub[0]) , 6*sizeof(float4), cudaMemcpyHostToDevice); + + //END DEBUG_FLO + + + //printf("Converting spinor to half precision... 
"); + float2half_spinorfield <<< gridsize, BLOCK2 >>>(s, sh, shnorm); + //printf("Done\n"); + cudaError_t cudaerr; + if((cudaerr=cudaGetLastError()) != cudaSuccess){ + printf("%s\n", cudaGetErrorString(cudaerr)); + exit(200); + } + + + //DEBUG_FLO + short4 testnorm_half[6]; + float thenorm; + float4 testnorm[6]; + + cudaMemcpy(&(testnorm_half), sh, 6*sizeof(short4), cudaMemcpyDeviceToHost); + cudaMemcpy(&(thenorm), shnorm, sizeof(float), cudaMemcpyDeviceToHost); + + printf("norm of float test vector: %f\n",thenorm); + printf("%f\n", sh2fl_host((short)(-32767))); + printf("%f\n", sh2fl_host((short)(32767))); + + printf("%d\n", fl2sh_host(-1.0)); + printf("%d\n", fl2sh_host(1.0)); + for(i=0; i<6; i++){ + testnorm[i].x = half2float_host(testnorm_half[i].x, thenorm); + testnorm[i].y = half2float_host(testnorm_half[i].y, thenorm); + testnorm[i].z = half2float_host(testnorm_half[i].z, thenorm); + testnorm[i].w = half2float_host(testnorm_half[i].w, thenorm); + printf("%d:x of float test vector: %f\n",i,testnorm[i].x); + printf("%d:y of float test vector: %f\n",i,testnorm[i].y); + printf("%d:z of float test vector: %f\n",i,testnorm[i].z); + printf("%d:w of float test vector: %f\n",i,testnorm[i].w); + } + cudaBindTexture(0, spinhalf_tex,sh, size); + // bind texture for norm + cudaBindTexture(0, spinnormhalf_tex, shnorm, sizenorm); + + if((cudaerr=cudaGetLastError()) != cudaSuccess){ + printf("%s\n", cudaGetErrorString(cudaerr)); + exit(200); + } + + half2float_spinorfield_tex <<< gridsize, BLOCK2 >>>(dev_spin4); + + if((cudaerr=cudaGetLastError()) != cudaSuccess){ + printf("%s\n", cudaGetErrorString(cudaerr)); + exit(200); + } + + cudaUnbindTexture(spinhalf_tex); + cudaUnbindTexture(spinnormhalf_tex); + + cudaMemcpy(&(testnorm), dev_spin4, 6*sizeof(float4), cudaMemcpyDeviceToHost); + for(i=0; i<6; i++){ + printf("%d:x of float test vector: %f\n",i,testnorm[i].x); + printf("%d:y of float test vector: %f\n",i,testnorm[i].y); + printf("%d:z of float test vector: 
%f\n",i,testnorm[i].z); + printf("%d:w of float test vector: %f\n",i,testnorm[i].w); + } + + //exit(100); + //END DEBUG + */ + + //printf("Binding textures to half spinorfield\n"); + // bind texture for vector + cudaBindTexture(0, spinhalf_tex, sh, size); + + // bind texture for norm + cudaBindTexture(0, spinnormhalf_tex, shnorm, sizenorm); + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + +return(0); +} + + +extern "C" int unbind_halfspinor_texture(){ + //printf("Unbinding textures of half spinorfield\n"); + cudaUnbindTexture(spinhalf_tex); + cudaUnbindTexture(spinnormhalf_tex); + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); +return(0); +} + + + + +extern "C" int bind_texture_gf_half(dev_su3_2v_half * gf){ + //printf("Binding texture to gaugefield\n"); + + #ifdef MPI + #ifdef GF_8 + size_t size = sizeof(short4)*2*(VOLUME+RAND)*4; + #else + size_t size = sizeof(short4)*3*(VOLUME+RAND)*4; + #endif + #else + #ifdef GF_8 + size_t size = sizeof(short4)*2*VOLUME*4; + #else + size_t size = sizeof(short4)*3*VOLUME*4; + #endif + #endif + + cudaGetTextureReference(&gf_texRefPtr, "gf_tex"); + gf_channelDesc = cudaCreateChannelDesc(); + cudaBindTexture(0, gf_texRefPtr, gf, &gf_channelDesc, size); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + return(0); +} + + + +extern "C" int unbind_texture_gf_half(){ + //printf("Unbinding texture to gaugefield\n"); + cudaUnbindTexture(gf_texRefPtr); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + return(0); +} + + + +// convert spinor to REAL4 (float4, double4) +void convert2REAL4_spin_half(spinor* spin, dev_spinor_half* h2d, float* h2d_norm){ + int i,Vol; + float norm; + if(even_odd_flag){ + Vol = VOLUME/2; + } + else{ + Vol = VOLUME; + } + for (i=0;i= BLOCK2){ + blas_half_gridsize = (int)(vol/BLOCK2) + 1; + } + else{ + blas_half_gridsize=1; + } + + size_t size = vol * sizeof(float); + + if((cudaerr=cudaMalloc((void **) &dev_blas_half_redfield, size)) != cudaSuccess){ + printf("Error in 
init_blas_half(): Memory allocation of reduction field failed. Aborting...\n"); + exit(200); + } // Allocate array on device + else{ + printf("Allocated blas reduction field on device\n"); + } + + + // IMPLEMENT THIS FOR ALL LATTICE SIZES !!!!!!!!!!!!!!!!!!!! + if((vol%REDUCTION_N) == 0){ + blas_half_redblocks = vol/REDUCTION_N; + } + else{ + fprintf(stderr,"Error: Volume is not a multiple of REDUCTION_N (%d). Aborting...\n", REDUCTION_N); + exit(100); + } + + // initialize small redfields + size = blas_half_redblocks * sizeof(float); + if((cudaerr=cudaMalloc((void **) &dev_blas_half_sredfield, size)) != cudaSuccess){ + printf("Error in init_blas_half(): Memory allocation of small reduction field failed. Aborting...\n"); + exit(200); + } // Allocate array on device + else{ + printf("Allocated blas small reduction field on device\n"); + } + + if((void*)(blas_half_sredfield = (float *)malloc(size)) == NULL){ + printf("Could not allocate memory for blas small redfield on host. Aborting...\n"); + exit(200); + } + + + +} + + +void finalize_blas_half(){ + cudaFree(dev_blas_half_redfield); + cudaFree(dev_blas_half_sredfield); + free(blas_half_sredfield); +} + + + + + + + +// this is a reduction algorithm for float based on the CUDA SDK +__global__ void reduce_float(float *g_idata, float *g_odata, unsigned int n) +{ + extern __shared__ float sdata[]; + + // load shared mem + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; + + sdata[tid] = (i < n) ? 
g_idata[i] : 0; + + __syncthreads(); + + // do reduction in shared mem + for(unsigned int s=blockDim.x/2; s>0; s>>=1) + { + if (tid < s) + { + sdata[tid] += sdata[tid + s]; + } + __syncthreads(); + } + + // write result for this block to global mem + if (tid == 0) g_odata[blockIdx.x] = sdata[0]; +} + + + + +// this is the version for float2 +__global__ void reduce_float2(float2 *g_idata, float2 *g_odata, unsigned int n) +{ + extern __shared__ float2 sdata2[]; + + // load shared mem + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; + + sdata2[tid].x = (i < n) ? g_idata[i].x : 0; + sdata2[tid].y = (i < n) ? g_idata[i].y : 0; + + __syncthreads(); + + // do reduction in shared mem + for(unsigned int s=blockDim.x/2; s>0; s>>=1) + { + if (tid < s) + { + sdata2[tid].x += sdata2[tid + s].x; + sdata2[tid].y += sdata2[tid + s].y; + } + __syncthreads(); + } + + // write result for this block to global mem + if (tid == 0) { + g_odata[blockIdx.x].x = sdata2[0].x; + g_odata[blockIdx.x].y = sdata2[0].y; + } +} + + + + + +__global__ void dot_half ( float* redfield, dev_spinor_half* x, float* x_norm, dev_spinor_half* y, float* y_norm){ + int pos= threadIdx.x + blockDim.x*blockIdx.x; + float4 xhelp,yhelp; + int i; + float xnhelp, ynhelp; + float dotp = 0.0f; + + if(pos < dev_VOLUME){ + // this is the loop over the 6 float4 forming one spinor + #pragma unroll 6 + for(i=0; i<6; i++){ + //xhelp = tex1Dfetch(spinhalf_tex, 6*pos+i); + //xnhelp = tex1Dfetch(spinnormhalf_tex, pos); + + xnhelp = x_norm[pos]; + xhelp.x = sh2fl(x[6*pos+i].x)*xnhelp; + xhelp.y = sh2fl(x[6*pos+i].y)*xnhelp; + xhelp.z = sh2fl(x[6*pos+i].z)*xnhelp; + xhelp.w = sh2fl(x[6*pos+i].w)*xnhelp; + + ynhelp = y_norm[pos]; + yhelp.x = sh2fl(y[6*pos+i].x)*ynhelp; + yhelp.y = sh2fl(y[6*pos+i].y)*ynhelp; + yhelp.z = sh2fl(y[6*pos+i].z)*ynhelp; + yhelp.w = sh2fl(y[6*pos+i].w)*ynhelp; + + dotp += xhelp.x * yhelp.x; + dotp += xhelp.y * yhelp.y; + dotp += xhelp.z * yhelp.z; + dotp += 
xhelp.w * yhelp.w; + } + // write sum_i (x_i y_i) to reduction field + redfield[pos] = dotp; + }//dev_VOLUME +} + + + +// kernel for the square of the norm of a half spinor +// local squared norms are written to reduction field redfield +__global__ void sqnorm_half (float* redfield, dev_spinor_half* x, float* x_norm){ + int pos= threadIdx.x + blockDim.x*blockIdx.x; + float xnhelp; + float dotp = 0.0; + int i; + float4 xhelp; + if(pos < dev_VOLUME){ + #pragma unroll 6 + for(i=0; i<6; i++){ + xnhelp = x_norm[pos]; + xhelp.x = sh2fl(x[6*pos+i].x)*xnhelp; + xhelp.y = sh2fl(x[6*pos+i].y)*xnhelp; + xhelp.z = sh2fl(x[6*pos+i].z)*xnhelp; + xhelp.w = sh2fl(x[6*pos+i].w)*xnhelp; + + dotp += xhelp.x * xhelp.x; + dotp += xhelp.y * xhelp.y; + dotp += xhelp.z * xhelp.z; + dotp += xhelp.w * xhelp.w; + } + redfield[pos] = dotp; + }//dev_VOLUME +} + + + + + + + +// calculates the dot product of x and y +float dotprod_half(dev_spinor_half* x, float* x_norm, dev_spinor_half* y, float* y_norm){ + int i; + float result; + cudaError_t cudaerr; + + dot_half <<< blas_half_gridsize, blas_half_blocksize >>> + (dev_blas_half_redfield, x, x_norm, y, y_norm); + if((cudaerr=cudaGetLastError()) != cudaSuccess){ + printf("%s\n", cudaGetErrorString(cudaerr)); + exit(200); + } + //reduce reductionfield on device + reduce_float <<< blas_half_redblocks, REDUCTION_N, + REDUCTION_N*sizeof(float) >>> + ( dev_blas_half_redfield, dev_blas_half_sredfield, VOLUME); + //this reduction always takes the VOLUME (also for mpi) + + //copy back + cudaMemcpy(blas_half_sredfield, dev_blas_half_sredfield, (size_t)(blas_half_redblocks*sizeof(float)), cudaMemcpyDeviceToHost); + + //do final reduction on host + float finalsum=0.0f; + for(i=0; i>> + (dev_blas_half_redfield, x, xnorm); + if((cudaerr=cudaGetLastError()) != cudaSuccess){ + printf("%s\n", cudaGetErrorString(cudaerr)); + exit(200); + } + //reduce reductionfield on device + reduce_float <<< blas_half_redblocks, REDUCTION_N, + REDUCTION_N*sizeof(float) >>> + ( 
dev_blas_half_redfield, dev_blas_half_sredfield, VOLUME); + //this reduction always takes the VOLUME (also for mpi) + + //copy back + cudaMemcpy(blas_half_sredfield, dev_blas_half_sredfield, (size_t)(blas_half_redblocks*sizeof(float)), cudaMemcpyDeviceToHost); + + //do final reduction on host + float finalsum=0.0f; + for(i=0; i(gf, 2 , to); + #else + dev_reconstructgf_2vtexref(gf, 2 , to); + #endif + } +} + + +void testhalf_gf(dev_su3_2v_half * gf,MixedsolveParameter& mixedsolveParameter){ + dev_su3 * testfield; + dev_su3 hosttestfield; + size_t size = sizeof(dev_su3); + dev_su3 hostmatrix; + + + cudaMalloc((void **) &testfield, size); + #ifdef USETEXTURE + //Bind texture gf + bind_texture_gf_half(mixedsolveParameter.dev_gf_half); + #endif + testhalf <<< 1, 1 >>> (mixedsolveParameter.dev_gf, testfield); + cudaMemcpy(&(hosttestfield), testfield, size, cudaMemcpyDeviceToHost); + show_dev_su3(hosttestfield); + + + #ifdef USETEXTURE + unbind_texture_gf_half(); + #endif +} + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/half_solvers.cuh b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/half_solvers.cuh new file mode 100644 index 0000000000000000000000000000000000000000..33b74005397e46ce165bcb158b2a811a60f7f766 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/half_solvers.cuh @@ -0,0 +1,713 @@ + + +void test_spinor_normalization(dev_spinor_half* s, float* sn){ + dev_spinor_half s_host[6]; + + size_t size = 6*sizeof(dev_spinor_half); + cudaMemcpy( &(s_host[0]),s , 6*sizeof(short4), cudaMemcpyDeviceToHost); + + int i; + + for(i=0; i<6; i++){ + float helpx = sh2fl_host(s_host[i].x); + float helpy = sh2fl_host(s_host[i].y); + float helpz = sh2fl_host(s_host[i].z); + float helpw = sh2fl_host(s_host[i].w); + + printf("%f, %f, %f, %f\n", helpx, helpy, helpz, helpw); + } + +} + + + +void showspinor_half(dev_spinor_half* s, float* snorm){ + int i,j; + + dev_spinor_half help[6]; + dev_spinor help2[6]; + float norm; + + size_t size = 
6*sizeof(dev_spinor_half); + + for(i=0; i& mixedsolveParameter){ + + double timeelapsed = 0.0; + clock_t start, stop; + int i; + + + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + printf("Applying H 1000 times\n"); + for(i=0; i<1000; i++){ + //Q_{-} + #ifdef USETEXTURE + bind_halfspinor_texture(spin1, spin1_norm); + #endif + //cudaFuncSetCacheConfig(dev_Hopping_Matrix_half, cudaFuncCachePreferL1); + dev_Hopping_Matrix_half<<>> + (mixedsolveParameter.dev_gf_half, spin1, spin1_norm, spin2, spin2_norm, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); //dev_spin_eo1 == even -> 0 + #ifdef USETEXTURE + unbind_halfspinor_texture(); + #endif + + #ifdef USETEXTURE + bind_halfspinor_texture(spin2, spin2_norm); + #endif + //cudaFuncSetCacheConfig(dev_Hopping_Matrix_half, cudaFuncCachePreferL1); + dev_Hopping_Matrix_half<<>> + (mixedsolveParameter.dev_gf_half, spin2, spin2_norm, spin1, spin1_norm, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #ifdef USETEXTURE + unbind_halfspinor_texture(); + #endif + + } + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + printf("Done\n"); + + assert((stop = clock())!=-1); + timeelapsed = (double) (stop-start)/CLOCKS_PER_SEC; + // x2 because 2x Hopping per iteration + double benchres = 1400.0*2*(VOLUME/2)* 1000 / timeelapsed / 1.0e9; + printf("Benchmark: %f Gflops\n", benchres); +} + + + + + + + + + + + +// this is the HALF eo version of the device cg inner solver +// we invert the hermitean Q_{-} Q_{+} +extern "C" int dev_cg_eo_half( + dev_su3_2v * gf, + dev_spinor_half* spinin, float* spinin_norm, + dev_spinor_half* spinout, float* spinout_norm, + dev_spinor_half* spin0, float* spin0_norm, + dev_spinor_half* spin1, float* spin1_norm, + dev_spinor_half* spin2, float* spin2_norm, + dev_spinor_half* spin3, float* spin3_norm, + dev_spinor_half* spin4, float* spin4_norm, + int *grid, int * nn_grid, + REAL epsfinal, + MixedsolveParameter& mixedsolveParameter){ + + + REAL host_alpha, host_beta, host_dotprod, host_rk, 
sourcesquarenorm; + REAL * dotprod, * dotprod2, * rk, * alpha, *beta; + + + + int i, gridsize; + int maxit = max_innersolver_it; + REAL eps = (REAL) innersolver_precision; + int N_recalcres = 20; // after N_recalcres iterations calculate r = A x_k - b + + cudaError_t cudaerr; + // this is the partitioning for the copying of fields + dim3 blockdim(1,1); + dim3 blockdim2(128,1,1); + if( VOLUME/2 >= 128){ + gridsize = (int) VOLUME/2/128 + 1; + } + else{ + gridsize=1; + } + dim3 griddim2(gridsize,1,1); + + + //this is the partitioning for the HoppingMatrix kernel + int blockdim3=BLOCK; + if( VOLUME/2 >= BLOCK){ + gridsize = (int)(VOLUME/2/BLOCK) + 1; + } + else{ + gridsize=1; + } + printf("gridsize = %d\n", gridsize); + int griddim3=gridsize; + + //this is the partitioning for dev_mul_one_pm... + int blockdim4=BLOCK2; + if( VOLUME/2 >= BLOCK2){ + gridsize = (int)(VOLUME/2/BLOCK2) + 1; + } + else{ + gridsize=1; + } + int griddim4=gridsize; + + + + //Initialize some stuff + printf("mu = %f\n", g_mu); + dev_complex h0,h1,h2,h3,mh0, mh1, mh2, mh3; + h0.re = (REAL)ka0.re; h0.im = -(REAL)ka0.im; + h1.re = (REAL)ka1.re; h1.im = -(REAL)ka1.im; + h2.re = (REAL)ka2.re; h2.im = -(REAL)ka2.im; + h3.re = (REAL)ka3.re; h3.im = -(REAL)ka3.im; + + mh0.re = -(REAL)ka0.re; mh0.im = (REAL)ka0.im; + mh1.re = -(REAL)ka1.re; mh1.im = (REAL)ka1.im; + mh2.re = -(REAL)ka2.re; mh2.im = (REAL)ka2.im; + mh3.re = -(REAL)ka3.re; mh3.im = (REAL)ka3.im; + + // try using constant mem for kappas + cudaMemcpyToSymbol("dev_k0c", &h0, sizeof(dev_complex)) ; + cudaMemcpyToSymbol("dev_k1c", &h1, sizeof(dev_complex)) ; + cudaMemcpyToSymbol("dev_k2c", &h2, sizeof(dev_complex)) ; + cudaMemcpyToSymbol("dev_k3c", &h3, sizeof(dev_complex)) ; + + cudaMemcpyToSymbol("dev_mk0c", &mh0, sizeof(dev_complex)) ; + cudaMemcpyToSymbol("dev_mk1c", &mh1, sizeof(dev_complex)) ; + cudaMemcpyToSymbol("dev_mk2c", &mh2, sizeof(dev_complex)) ; + cudaMemcpyToSymbol("dev_mk3c", &mh3, sizeof(dev_complex)) ; + + he_cg_init<<< 1, 1 >>> 
(grid, (REAL) g_kappa, (REAL)(g_mu/(2.0*g_kappa)), h0,h1,h2,h3); + // BEWARE in dev_tm_dirac_kappa we need the true mu (not 2 kappa mu!) + + + //use full volume here as we need the complete gauge field!!! + int Vol; + #ifndef MPI + Vol = VOLUME; + #else + Vol = VOLUME+RAND; + #endif + + if( Vol >= BLOCK2){ + gridsize = (int)(Vol/BLOCK2) + 1; + } + else{ + gridsize=1; + } + + printf("Converting gauge to half precision... "); + + + float2half_gaugefield <<< gridsize, BLOCK2 >>>(mixedsolveParameter.dev_gf, mixedsolveParameter.dev_gf_half, Vol); + printf("Done\n"); + + //testhalf_gf(dev_gf_half); + + + + #ifdef USETEXTURE + //Bind texture gf + bind_texture_gf_half(mixedsolveParameter.dev_gf_half); + #endif + + + // Init x,p,r for k=0 + // Allocate some numbers for host <-> device interaction + cudaMalloc((void **) &dotprod, sizeof(REAL)); + cudaMalloc((void **) &dotprod2, sizeof(REAL)); + cudaMalloc((void **) &rk, sizeof(REAL)); + cudaMalloc((void **) &alpha, sizeof(REAL)); + cudaMalloc((void **) &beta, sizeof(REAL)); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + + //init blas + init_blas_half(VOLUME/2); + printf("Have initialized blas for half precision\n"); + + + + dev_copy_spinor_field_half + <<>>(spinin, spinin_norm, spin0, spin0_norm); + dev_zero_spinor_field_half + <<>>(spin1,spin1_norm); // x_0 = 0 + dev_copy_spinor_field_half + <<>>(spinin, spinin_norm, spin2, spin2_norm); + dev_zero_spinor_field_half + <<>>(spin3, spin3_norm); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + + //test_spinor_normalization(spin2, spin2_norm); + //showspinor_half(spinin, spinin_norm); + + + //relative precision -> get initial residue + sourcesquarenorm = squarenorm_half(spinin, spinin_norm); + printf("with squarenorm: %f\n", sourcesquarenorm); + sourcesquarenorm = dotprod_half(spinin, spinin_norm,spinin, spinin_norm); + printf("with dotprod: %f\n", sourcesquarenorm); + + host_rk = sourcesquarenorm; //for use in main loop + printf("Squarenorm 
Source:\t%.8e\n", sourcesquarenorm); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + + /* + // small benchmark for half ///////////// + benchmark_half(spin2, spin2_norm, spin3, spin3_norm, griddim3,blockdim3); + exit(0); + ///////////////////////////////////////// + */ + + + printf("Entering inner solver cg-loop\n"); + for(i=0;i>> + (-1.0*host_alpha, spin3, spin3_norm, spin0, spin0_norm); + + //showspinor_half(spin0, spin0_norm); + //exit(200); + + //printf("r(k+1)\n"); + //test_spinor_normalization(spin3, spin3_norm); + //test_spinor_normalization(spin0, spin0_norm); + //x(k+1); + //cublasSaxpy (24*VOLUME/2, host_alpha, (const float *) spin2, 1, (float *) spin1, 1); + axpy_half<<>> + (host_alpha, spin2, spin2_norm, spin1, spin1_norm); + + //printf("x(k+1)\n"); + //test_spinor_normalization(spin1, spin1_norm); + //test_spinor_normalization(spin2, spin2_norm); + + if((cudaerr=cudaGetLastError()) != cudaSuccess){ + printf("%s\n", cudaGetErrorString(cudaerr)); + exit(200); + } + + //Abbruch? 
+ host_dotprod = squarenorm_half(spin0, spin0_norm); + + if (((host_dotprod <= eps*sourcesquarenorm) && (i > maxit / 4) ) || ( host_dotprod <= epsfinal/2.)){//error-limit erreicht (epsfinal/2 sollte ausreichen um auch in double precision zu bestehen) + break; + } + printf("iter %d: err = %.8e\n", i, host_dotprod); + + //beta + host_beta =host_dotprod/host_rk; + //printf("beta = %f\n",host_beta); + //p(k+1) + //cublasSscal (24*VOLUME/2, host_beta, (float *)spin2, 1); + scal_half<<>> + (host_beta, spin2, spin2_norm); + //printf("scal p\n"); + //test_spinor_normalization(spin2, spin2_norm); + + //cublasSaxpy (24*VOLUME/2, 1.0, (const float *) spin0, 1, (float *) spin2, 1); + axpy_half<<>> + (1.0, spin0, spin0_norm, spin2, spin2_norm); + //printf("axpy p\n"); + //test_spinor_normalization(spin2, spin2_norm); + + host_rk = host_dotprod; + + // recalculate residue frome r = b - Ax + if(((i+1) % N_recalcres) == 0){ + // r_(k+1) = Ax -b + printf("Recalculating residue\n"); + + // D Ddagger -- Ddagger = gamma5 D gamma5 for Wilson Dirac Operator + // DO NOT USE tm_dirac_dagger_kappa here, otherwise spin2 will be overwritten!!! 
+ + // Q_{-}Q{+} + #ifndef MPI + dev_Qtm_pm_psi_half(spin1, spin1_norm, spin3, spin3_norm, griddim3, blockdim3, griddim4, blockdim4); + #else + dev_Qtm_pm_psi_half_mpi(spin1, spin1_norm, spin3, spin3_norm, griddim3, blockdim3, griddim4, blockdim4); + #endif + if((cudaerr=cudaGetLastError()) != cudaSuccess){ + printf("%s\n", cudaGetErrorString(cudaerr)); + exit(200); + } + + + // r = b - Ax + //cublasSscal (24*VOLUME/2, -1.0, (float *)spin3, 1); + scal_half<<>> + (-1.0, spin3, spin3_norm); + + //cublasSaxpy (24*VOLUME/2, 1.0, (const float *) spinin, 1, (float *) spin3, 1); + axpy_half<<>> + (1.0, spinin, spinin_norm, spin3, spin3_norm); + + //cublasScopy (24*VOLUME/2, (const float *)spin3, 1, (float *)spin0, 1); + dev_copy_spinor_field_half + <<>>(spin3, spin3_norm, spin0, spin0_norm); + + }//recalculate residue + + }//MAIN LOOP cg + + + printf("Final residue: %.6e\n",host_dotprod); + // x_result = spin1 ! + + //no multiplication with D^{dagger} here and no return to non-kappa basis as in dev_cg! 
+ dev_copy_spinor_field_half<<>>(spin1, spin1_norm,spinout, spinout_norm); + + #ifdef USETEXTURE + unbind_texture_gf_half(); + #endif + cudaFree(dotprod); + cudaFree(dotprod2); + cudaFree(rk); + cudaFree(alpha); + cudaFree(beta); + finalize_blas_half(); + + return(i); +} + + + + + + + + + + + + +// this is the HALF eo version of the device cg inner solver +// we invert the hermitean Q_{-} Q_{+} +extern "C" int dev_cg_half_reliable_update( + dev_su3_2v * gf, + dev_spinor_half* spinin, float* spinin_norm, + dev_spinor_half* spinout, float* spinout_norm, + dev_spinor_half* spin0, float* spin0_norm, + dev_spinor_half* spin1, float* spin1_norm, + dev_spinor_half* spin2, float* spin2_norm, + dev_spinor_half* spin3, float* spin3_norm, + dev_spinor_half* spin4, float* spin4_norm, + int *grid, int * nn_grid, + REAL epsfinal, + MixedsolveParameter& mixedsolveParameter){ + + + REAL host_alpha, host_beta, host_dotprod, host_rk, sourcesquarenorm; + REAL * dotprod, * dotprod2, * rk, * alpha, *beta; + + + + int i, gridsize; + int maxit = max_innersolver_it; + REAL eps = (REAL) innersolver_precision; + int N_recalcres = 1000; // after N_recalcres iterations calculate r = A x_k - b + + cudaError_t cudaerr; + // this is the partitioning for the copying of fields + dim3 blockdim(1,1); + dim3 blockdim2(128,1,1); + if( VOLUME/2 >= 128){ + gridsize = (int) VOLUME/2/128 + 1; + } + else{ + gridsize=1; + } + dim3 griddim2(gridsize,1,1); + + + //this is the partitioning for the HoppingMatrix kernel + int blockdim3=BLOCK; + if( VOLUME/2 >= BLOCK){ + gridsize = (int)(VOLUME/2/BLOCK) + 1; + } + else{ + gridsize=1; + } + printf("gridsize = %d\n", gridsize); + int griddim3=gridsize; + + //this is the partitioning for dev_mul_one_pm... 
+ int blockdim4=BLOCK2; + if( VOLUME/2 >= BLOCK2){ + gridsize = (int)(VOLUME/2/BLOCK2) + 1; + } + else{ + gridsize=1; + } + int griddim4=gridsize; + + + + //Initialize some stuff + printf("mu = %f\n", g_mu); + dev_complex h0,h1,h2,h3,mh0, mh1, mh2, mh3; + h0.re = (REAL)ka0.re; h0.im = -(REAL)ka0.im; + h1.re = (REAL)ka1.re; h1.im = -(REAL)ka1.im; + h2.re = (REAL)ka2.re; h2.im = -(REAL)ka2.im; + h3.re = (REAL)ka3.re; h3.im = -(REAL)ka3.im; + + mh0.re = -(REAL)ka0.re; mh0.im = (REAL)ka0.im; + mh1.re = -(REAL)ka1.re; mh1.im = (REAL)ka1.im; + mh2.re = -(REAL)ka2.re; mh2.im = (REAL)ka2.im; + mh3.re = -(REAL)ka3.re; mh3.im = (REAL)ka3.im; + + // try using constant mem for kappas + cudaMemcpyToSymbol("dev_k0c", &h0, sizeof(dev_complex)) ; + cudaMemcpyToSymbol("dev_k1c", &h1, sizeof(dev_complex)) ; + cudaMemcpyToSymbol("dev_k2c", &h2, sizeof(dev_complex)) ; + cudaMemcpyToSymbol("dev_k3c", &h3, sizeof(dev_complex)) ; + + cudaMemcpyToSymbol("dev_mk0c", &mh0, sizeof(dev_complex)) ; + cudaMemcpyToSymbol("dev_mk1c", &mh1, sizeof(dev_complex)) ; + cudaMemcpyToSymbol("dev_mk2c", &mh2, sizeof(dev_complex)) ; + cudaMemcpyToSymbol("dev_mk3c", &mh3, sizeof(dev_complex)) ; + + he_cg_init<<< 1, 1 >>> (grid, (REAL) g_kappa, (REAL)(g_mu/(2.0*g_kappa)), h0,h1,h2,h3); + // BEWARE in dev_tm_dirac_kappa we need the true mu (not 2 kappa mu!) + + + //use full volume here as we need the complete gauge field!!! + if( VOLUME >= BLOCK2){ + gridsize = (int)(VOLUME/BLOCK2) + 1; + } + else{ + gridsize=1; + } + + printf("Converting gauge to half precision... 
"); + float2half_gaugefield <<< gridsize, BLOCK2 >>>(mixedsolveParameter.dev_gf, mixedsolveParameter.dev_gf_half, VOLUME); + printf("Done\n"); + + //testhalf_gf(dev_gf_half); + + + + #ifdef USETEXTURE + //Bind texture gf + bind_texture_gf_half(mixedsolveParameter.dev_gf_half); + #endif + + + // Init x,p,r for k=0 + // Allocate some numbers for host <-> device interaction + cudaMalloc((void **) &dotprod, sizeof(REAL)); + cudaMalloc((void **) &dotprod2, sizeof(REAL)); + cudaMalloc((void **) &rk, sizeof(REAL)); + cudaMalloc((void **) &alpha, sizeof(REAL)); + cudaMalloc((void **) &beta, sizeof(REAL)); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + + //init blas + init_blas_half(VOLUME/2); + printf("Have initialized blas for half precision\n"); + + + + dev_copy_spinor_field_half + <<>>(spinin, spinin_norm, spin0, spin0_norm); + dev_zero_spinor_field_half + <<>>(spin1,spin1_norm); // x_0 = 0 + dev_copy_spinor_field_half + <<>>(spinin, spinin_norm, spin2, spin2_norm); + dev_zero_spinor_field_half + <<>>(spin3, spin3_norm); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + + //test_spinor_normalization(spin2, spin2_norm); + //showspinor_half(spinin, spinin_norm); + + + //relative precision -> get initial residue + sourcesquarenorm = squarenorm_half(spinin, spinin_norm); + printf("with squarenorm: %f\n", sourcesquarenorm); + sourcesquarenorm = dotprod_half(spinin, spinin_norm,spinin, spinin_norm); + printf("with dotprod: %f\n", sourcesquarenorm); + + host_rk = sourcesquarenorm; //for use in main loop + printf("Squarenorm Source:\t%.8e\n", sourcesquarenorm); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + + + printf("Entering inner solver cg-loop\n"); + for(i=0;i>> + (-1.0*host_alpha, spin3, spin3_norm, spin0, spin0_norm); + + //showspinor_half(spin0, spin0_norm); + //exit(200); + + //printf("r(k+1)\n"); + //test_spinor_normalization(spin3, spin3_norm); + //test_spinor_normalization(spin0, spin0_norm); + //x(k+1); + //cublasSaxpy 
(24*VOLUME/2, host_alpha, (const float *) spin2, 1, (float *) spin1, 1); + axpy_half<<>> + (host_alpha, spin2, spin2_norm, spin1, spin1_norm); + + //printf("x(k+1)\n"); + //test_spinor_normalization(spin1, spin1_norm); + //test_spinor_normalization(spin2, spin2_norm); + + if((cudaerr=cudaGetLastError()) != cudaSuccess){ + printf("%s\n", cudaGetErrorString(cudaerr)); + exit(200); + } + + //Abbruch? + host_dotprod = squarenorm_half(spin0, spin0_norm); + + if (((host_dotprod <= eps*sourcesquarenorm) && (i > maxit / 4) ) || ( host_dotprod <= epsfinal/2.)){//error-limit erreicht (epsfinal/2 sollte ausreichen um auch in double precision zu bestehen) + break; + } + printf("iter %d: err = %.8e\n", i, host_dotprod); + + //beta + host_beta =host_dotprod/host_rk; + //printf("beta = %f\n",host_beta); + //p(k+1) + //cublasSscal (24*VOLUME/2, host_beta, (float *)spin2, 1); + scal_half<<>> + (host_beta, spin2, spin2_norm); + //printf("scal p\n"); + //test_spinor_normalization(spin2, spin2_norm); + + //cublasSaxpy (24*VOLUME/2, 1.0, (const float *) spin0, 1, (float *) spin2, 1); + axpy_half<<>> + (1.0, spin0, spin0_norm, spin2, spin2_norm); + //printf("axpy p\n"); + //test_spinor_normalization(spin2, spin2_norm); + + host_rk = host_dotprod; + + // recalculate residue frome r = b - Ax + if(((i+1) % N_recalcres) == 0){ + // r_(k+1) = Ax -b + printf("Recalculating residue\n"); + + // D Ddagger -- Ddagger = gamma5 D gamma5 for Wilson Dirac Operator + // DO NOT USE tm_dirac_dagger_kappa here, otherwise spin2 will be overwritten!!! 
+ + // Q_{-}Q{+} + dev_Qtm_pm_psi_half(spin1, spin1_norm, spin3, spin3_norm, griddim3, blockdim3, griddim4, blockdim4); + if((cudaerr=cudaGetLastError()) != cudaSuccess){ + printf("%s\n", cudaGetErrorString(cudaerr)); + exit(200); + } + + + // r = b - Ax + //cublasSscal (24*VOLUME/2, -1.0, (float *)spin3, 1); + scal_half<<>> + (-1.0, spin3, spin3_norm); + + //cublasSaxpy (24*VOLUME/2, 1.0, (const float *) spinin, 1, (float *) spin3, 1); + axpy_half<<>> + (1.0, spinin, spinin_norm, spin3, spin3_norm); + + //cublasScopy (24*VOLUME/2, (const float *)spin3, 1, (float *)spin0, 1); + dev_copy_spinor_field_half + <<>>(spin3, spin3_norm, spin0, spin0_norm); + + }//recalculate residue + + }//MAIN LOOP cg + + + printf("Final residue: %.6e\n",host_dotprod); + // x_result = spin1 ! + + //no multiplication with D^{dagger} here and no return to non-kappa basis as in dev_cg! + dev_copy_spinor_field_half<<>>(spin1, spin1_norm,spinout, spinout_norm); + + #ifdef USETEXTURE + unbind_texture_gf_half(); + #endif + cudaFree(dotprod); + cudaFree(dotprod2); + cudaFree(rk); + cudaFree(alpha); + cudaFree(beta); + finalize_blas_half(); + + return(i); +} + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/linalg.cuh b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/linalg.cuh new file mode 100644 index 0000000000000000000000000000000000000000..a75713c7e5dc6635d1cf9aa9a6518ffd1641e496 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/linalg.cuh @@ -0,0 +1,1537 @@ +/*********************************************************************** + * + * Copyright (C) 2010 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * + * File: linalg.cuh + * + * CUDA linear algebra functions and implementation of gamma-multiplication + * + * + * + **************************************************************************/ + + + +template +__device__ inline dev_complexT dev_cconj (dev_complexT c){ /*konjugiert komplexe Zahl*/ + dev_complexT erg; + erg.re = c.re; + erg.im = -1.0*c.im; +return erg; +} + +template +__device__ inline void dev_ccopy(dev_complexT* von, dev_complexT* nach){/*kopiert complex von nach complex nach*/ + nach->re = RealTNach(von->re); + nach->im = RealTNach(von->im); +} + +template +__device__ inline RealT dev_cabssquare (dev_complexT c){ /*gibt abs^2 einer komplexen Zahl zurück*/ + return c.re*c.re + c.im*c.im; +} + +template +__device__ inline RealT dev_cabsolute (dev_complexT c){/*gibt Betrag einer kompl. 
zahl zurück*/ + return sqrt(c.re*c.re + c.im*c.im); +} + + +template +__device__ inline dev_complexT dev_crealmult(dev_complexT c1, RealT real){ /*multipliziert c1 mit reeller zahl re*/ + dev_complexT erg; + erg.re = real*c1.re; + erg.im = real*c1.im; +return erg; +} + +template +__device__ inline dev_complexT dev_cmult (dev_complexT c1, dev_complexT c2){ /*multiplizier zwei komplexe Zahlen*/ + dev_complexT erg; + erg.re = c1.re * c2.re - c1.im * c2.im; + erg.im = c1.re * c2.im + c1.im * c2.re; +return erg; +} + +template +__device__ inline dev_complexT dev_cadd (dev_complexT c1, dev_complexT c2){ /*addiert zwei komplexe Zahlen */ + dev_complexT erg; + erg.re = c1.re + c2.re; + erg.im = c1.im + c2.im; +return erg; +} + + +template +__device__ inline dev_complexT dev_cdiv(dev_complexT c1, dev_complexT c2) { /* dividiert c1 durch c2 */ + dev_complexT erg; + RealT oneovernenner = 1.0/(c2.re*c2.re + c2.im*c2.im); + erg.re = oneovernenner*(c1.re*c2.re + c1.im*c2.im); + erg.im = oneovernenner*(c1.im*c2.re - c1.re*c2.im); +return erg; +} + + +template +__device__ inline dev_complexT dev_csub(dev_complexT c1, dev_complexT c2){ + dev_complexT erg; + erg.re = c1.re - c2.re; + erg.im = c1.im - c2.im; +return erg; +} + + +template +__device__ inline dev_complexT dev_initcomplex(RealT re, RealT im){/* gibt komplexe Zahl mit Realt re und Imt im zurück*/ + dev_complexT erg; + erg.re = re; + erg.im = im; +return (erg); +} + + + + + +template +__device__ inline void dev_copy_spinor(typename dev_spinorT::type *i1, typename dev_spinorT::type *i2){ + int i; + #pragma unroll 6 + for(i=0;i<6;i++){ //color + spin + (*(i2+i)).x = RealT2((*(i1+i)).x); + (*(i2+i)).y = RealT2((*(i1+i)).y); + (*(i2+i)).z = RealT2((*(i1+i)).z); + (*(i2+i)).w = RealT2((*(i1+i)).w); + } +} + +template +__device__ inline void dev_zero_spinor(typename dev_spinorT::type *sin){ + int i; + #pragma unroll 6 + for(i=0;i<6;i++){ //color + spin + (*(sin+i)).x = 0.0; + (*(sin+i)).y = 0.0; + (*(sin+i)).z = 0.0; + 
(*(sin+i)).w = 0.0; + } +} + + + + + + +//out = in + lambda in2 +template +__device__ inline void dev_skalarmult_add_assign_spinor +( + typename dev_spinorT::type *in, + RealT lambda, + typename dev_spinorT::type * in2, + typename dev_spinorT::type * out +){ + int i; + #pragma unroll 6 +for(i=0;i<6;i++){ //color + spin + (*(out+i)).x = (*(in+i)).x + lambda* (*(in2+i)).x; + (*(out+i)).y = (*(in+i)).y + lambda* (*(in2+i)).y; + (*(out+i)).z = (*(in+i)).z + lambda* (*(in2+i)).z; + (*(out+i)).w = (*(in+i)).w + lambda* (*(in2+i)).w; + } +} + + + + +//out = in + lambda in2 +template +__device__ inline void dev_complexmult_add_assign_spinor +( + typename dev_spinorT::type* in, + dev_complexT lambda, + typename dev_spinorT::type* in2, + typename dev_spinorT::type* out +){ + int i; + #pragma unroll 6 + for(i=0;i<6;i++){ //color + spin + (*(out+i)).x = (*(in+i)).x + ((*(in2+i)).x*lambda.re - (*(in2+i)).y*lambda.im); + (*(out+i)).y = (*(in+i)).y + ((*(in2+i)).x*lambda.im + (*(in2+i)).y*lambda.re); + (*(out+i)).z = (*(in+i)).z + ((*(in2+i)).z*lambda.re - (*(in2+i)).w*lambda.im); + (*(out+i)).w = (*(in+i)).w + ((*(in2+i)).z*lambda.im + (*(in2+i)).w*lambda.re); + } +} + + + + +//out = in + (lambda)* in2 +template +__device__ inline void dev_complexcgmult_add_assign_spinor +( + typename dev_spinorT::type * in, + dev_complexT lambda, + typename dev_spinorT::type* in2, + typename dev_spinorT::type* out +){ + int i; + #pragma unroll 6 + for(i=0;i<6;i++){ //color + spin + (*(out+i)).x = (*(in+i)).x + ((*(in2+i)).x*lambda.re + (*(in2+i)).y*lambda.im); + (*(out+i)).y = (*(in+i)).y + (-(*(in2+i)).x*lambda.im + (*(in2+i)).y*lambda.re); + (*(out+i)).z = (*(in+i)).z + ((*(in2+i)).z*lambda.re + (*(in2+i)).w*lambda.im); + (*(out+i)).w = (*(in+i)).w + (-(*(in2+i)).z*lambda.im + (*(in2+i)).w*lambda.re); + } +} + + + +template +__device__ void inline dev_skalarmult_spinor +( + typename dev_spinorT::type* in, + dev_complexT lambda, + typename dev_spinorT::type* out +){ + int i; + #pragma unroll 6 
+ for(i=0;i<6;i++){ //color + spin + //out[i] = dev_cmult(in[i],lambda); + + (*(out+i)).x = (*(in+i)).x*lambda.re - (*(in+i)).y*lambda.im; + (*(out+i)).y = (*(in+i)).y*lambda.re + (*(in+i)).x*lambda.im; + + (*(out+i)).z = (*(in+i)).z*lambda.re - (*(in+i)).w*lambda.im; + (*(out+i)).w = (*(in+i)).w*lambda.re + (*(in+i)).z*lambda.im; + } +} + + + +/* +__device__ void inline dev_skalarmult_gamma5_spinor(dev_spinor * out, const dev_complex lambda, dev_spinor * in){ + + + (*(out)).x = (*(in)).x*lambda.re; + (*(out)).x -= (*(in)).y*lambda.im; + + (*(out)).y = (*(in)).y*lambda.re; + (*(out)).y += (*(in)).x*lambda.im; + + (*(out)).z = (*(in)).z*lambda.re; + (*(out)).z -= (*(in)).w*lambda.im; + + (*(out)).w = (*(in)).w*lambda.re; + (*(out)).w += (*(in)).z*lambda.im; + + + (*(out+1)).x = (*(in+1)).x*lambda.re; + (*(out+1)).x -= (*(in+1)).y*lambda.im; + + (*(out+1)).y = (*(in+1)).y*lambda.re; + (*(out+1)).y += (*(in+1)).x*lambda.im; + + (*(out+1)).z = (*(in+1)).z*lambda.re; + (*(out+1)).z -= (*(in+1)).w*lambda.im; + + (*(out+1)).w = (*(in+1)).w*lambda.re; + (*(out+1)).w += (*(in+1)).z*lambda.im; + + + (*(out+2)).x = (*(in+2)).x*lambda.re; + (*(out+2)).x -= (*(in+2)).y*lambda.im; + + (*(out+2)).y = (*(in+2)).y*lambda.re; + (*(out+2)).y += (*(in+2)).x*lambda.im; + + (*(out+2)).z = (*(in+2)).z*lambda.re; + (*(out+2)).z -= (*(in+2)).w*lambda.im; + + (*(out+2)).w = (*(in+2)).w*lambda.re; + (*(out+2)).w += (*(in+2)).z*lambda.im; + + + (*(out+3)).x = (*(in+3)).y*lambda.im; + (*(out+3)).x -= (*(in+3)).x*lambda.re; + + (*(out+3)).y = - (*(in+3)).x*lambda.im; + (*(out+3)).y -= (*(in+3)).y*lambda.re; + + (*(out+3)).z = (*(in+3)).w*lambda.im; + (*(out+3)).z -= (*(in+3)).z*lambda.re; + + (*(out+3)).w = -(*(in+3)).z*lambda.im; + (*(out+3)).w -= (*(in+3)).w*lambda.re; + + + (*(out+4)).x = (*(in+4)).y*lambda.im; + (*(out+4)).x -= (*(in+4)).x*lambda.re; + + (*(out+4)).y = - (*(in+4)).x*lambda.im; + (*(out+4)).y -= (*(in+4)).y*lambda.re; + + (*(out+4)).z = (*(in+4)).w*lambda.im; + (*(out+4)).z 
-= (*(in+4)).z*lambda.re; + + (*(out+4)).w = -(*(in+4)).z*lambda.im; + (*(out+4)).w -= (*(in+4)).w*lambda.re; + + + (*(out+5)).x = (*(in+5)).y*lambda.im; + (*(out+5)).x -= (*(in+5)).x*lambda.re; + + (*(out+5)).y = - (*(in+5)).x*lambda.im; + (*(out+5)).y -= (*(in+5)).y*lambda.re; + + (*(out+5)).z = (*(in+5)).w*lambda.im; + (*(out+5)).z -= (*(in+5)).z*lambda.re; + + (*(out+5)).w = -(*(in+5)).z*lambda.im; + (*(out+5)).w -= (*(in+5)).w*lambda.re; + +} +*/ + + +template +__device__ void inline dev_skalarmult_gamma5_spinor(typename dev_spinorT::type* out, dev_complexT lambda, typename dev_spinorT::type* in){ +int i; + typename dev_spinorT::type shelp, tempout; + +shelp = *(in); + tempout.x = shelp.x*lambda.re; + tempout.x -= shelp.y*lambda.im; + + tempout.y = shelp.y*lambda.re; + tempout.y += shelp.x*lambda.im; + + tempout.z = shelp.z*lambda.re; + tempout.z -= shelp.w*lambda.im; + + tempout.w = shelp.w*lambda.re; + tempout.w += shelp.z*lambda.im; +(*(out)) = tempout; + + +shelp = *(in+1); + tempout.x = shelp.x*lambda.re; + tempout.x -= shelp.y*lambda.im; + + tempout.y = shelp.y*lambda.re; + tempout.y += shelp.x*lambda.im; + + tempout.z = shelp.z*lambda.re; + tempout.z -= shelp.w*lambda.im; + + tempout.w = shelp.w*lambda.re; + tempout.w += shelp.z*lambda.im; +(*(out+1)) = tempout; + + +shelp = *(in+2); + tempout.x = shelp.x*lambda.re; + tempout.x -= shelp.y*lambda.im; + + tempout.y = shelp.y*lambda.re; + tempout.y += shelp.x*lambda.im; + + tempout.z = shelp.z*lambda.re; + tempout.z -= shelp.w*lambda.im; + + tempout.w = shelp.w*lambda.re; + tempout.w += shelp.z*lambda.im; +(*(out+2)) = tempout; + + +shelp = *(in+3); + tempout.x = shelp.y*lambda.im; + tempout.x -= shelp.x*lambda.re; + + tempout.y = - shelp.x*lambda.im; + tempout.y -= shelp.y*lambda.re; + + tempout.z = shelp.w*lambda.im; + tempout.z -= shelp.z*lambda.re; + + tempout.w = -shelp.z*lambda.im; + tempout.w -= shelp.w*lambda.re; +(*(out+3)) = tempout; + +shelp = *(in+4); + tempout.x = shelp.y*lambda.im; + 
tempout.x -= shelp.x*lambda.re; + + tempout.y = - shelp.x*lambda.im; + tempout.y -= shelp.y*lambda.re; + + tempout.z = shelp.w*lambda.im; + tempout.z -= shelp.z*lambda.re; + + tempout.w = -shelp.z*lambda.im; + tempout.w -= shelp.w*lambda.re; +(*(out+4)) = tempout; + +shelp = *(in+5); + tempout.x = shelp.y*lambda.im; + tempout.x -= shelp.x*lambda.re; + + tempout.y = - shelp.x*lambda.im; + tempout.y -= shelp.y*lambda.re; + + tempout.z = shelp.w*lambda.im; + tempout.z -= shelp.z*lambda.re; + + tempout.w = -shelp.z*lambda.im; + tempout.w -= shelp.w*lambda.re; +(*(out+5)) = tempout; +} + + + +template +__device__ void inline dev_realmult_spinor(typename dev_spinorT::type* in, RealT lambda){ + int i; + #pragma unroll 6 + for(i=0;i<6;i++){ //color + spin + //in[i] = in[i]*lambda; + (*(in+i)).x = (*(in+i)).x*lambda; + (*(in+i)).y = (*(in+i)).y*lambda; + + (*(in+i)).z = (*(in+i)).z*lambda; + (*(in+i)).w = (*(in+i)).w*lambda; + } +} + + +template +__device__ void inline dev_realmult_spinor_assign(typename dev_spinorT::type* out, RealT lambda, typename dev_spinorT::type* in){ +int i; +#pragma unroll 6 + for(i=0;i<6;i++){ //color + spin + //out[i] = in[i]*lambda; + (*(out+i)).x = (*(in+i)).x*lambda; + (*(out+i)).y = (*(in+i)).y*lambda; + + (*(out+i)).z = (*(in+i)).z*lambda; + (*(out+i)).w = (*(in+i)).w*lambda; + } +} + + + + +template +__device__ void dev_assign_realmult_add_spinor +( + typename dev_spinorT::type* out, + RealT lambda, + typename dev_spinorT::type* in1, + typename dev_spinorT::type* in2 +){ +int i; +RealT help; +//out = lambda*(in1 + in2) +#pragma unroll 6 + for(i=0;i<6;i++){ //color + spin + + help = (*(in1+i)).x*lambda; + help += (*(in2+i)).x*lambda; + (*(out+i)).x = help; + + help = (*(in1+i)).y*lambda; + help += (*(in2+i)).y*lambda; + (*(out+i)).y = help; + + help = (*(in1+i)).z*lambda; + help += (*(in2+i)).z*lambda; + (*(out+i)).z = help; + + help = (*(in1+i)).w*lambda; + help += (*(in2+i)).w*lambda; + (*(out+i)).w = help; + } +} + + +template +__device__ 
inline void dev_add_spinor_assign(typename dev_spinorT::type * i1, typename dev_spinorT::type * i2){ + int i; + #pragma unroll 6 + for(i=0;i<6;i++){ //color + spin + (*(i1+i)).x = (*(i1+i)).x + (*(i2+i)).x; + (*(i1+i)).y = (*(i1+i)).y + (*(i2+i)).y; + (*(i1+i)).z = (*(i1+i)).z + (*(i2+i)).z; + (*(i1+i)).w = (*(i1+i)).w + (*(i2+i)).w; + } +} + + + +template +__device__ inline void dev_sub_spinor_assign(typename dev_spinorT::type * i1, typename dev_spinorT::type * i2){ + int i; + #pragma unroll 6 + for(i=0;i<6;i++){ //color + spin + (*(i1+i)).x = (*(i1+i)).x - (*(i2+i)).x; + (*(i1+i)).y = (*(i1+i)).y - (*(i2+i)).y; + (*(i1+i)).z = (*(i1+i)).z - (*(i2+i)).z; + (*(i1+i)).w = (*(i1+i)).w - (*(i2+i)).w; + } +} + + + + +/* +//multipliziert su3-Matrix mal Spinor im Dirac-Raum +//code in su3_MtV.txt -- generated with codegen +__device__ void dev_su3MtV_spintex(dev_su3 M, int pos, dev_spinor * out){ + +dev_spinor s1, s2; + +s1 = tex1Dfetch(spin_tex,6*pos); +s2 = tex1Dfetch(spin_tex,6*pos+1); + + +//(*(out+0)).x = ( M[0][0].re*s1.x - M[0][0].im*s1.y ) + ( M[0][1].re*s1.z - M[0][1].im*s1.w ) + ( M[0][2].re*s2.x - M[0][2].im*s2.y ); +//(*(out+0)).y = ( M[0][0].re*s1.y + M[0][0].im*s1.x ) + ( M[0][1].re*s1.w + M[0][1].im*s1.z ) + ( M[0][2].re*s2.y + M[0][2].im*s2.x ); + +(*(out+0)).x = M[0][0].re*s1.x; + (*(out+0)).y = M[0][0].re*s1.y; +(*(out+0)).x -= M[0][0].im*s1.y; + (*(out+0)).y += M[0][0].im*s1.x; +(*(out+0)).x += M[0][1].re*s1.z; + (*(out+0)).y += M[0][1].re*s1.w; +(*(out+0)).x -= M[0][1].im*s1.w; + (*(out+0)).y += M[0][1].im*s1.z; +(*(out+0)).x += M[0][2].re*s2.x; + (*(out+0)).y += M[0][2].re*s2.y; +(*(out+0)).x -= M[0][2].im*s2.y; + (*(out+0)).y += M[0][2].im*s2.x; + + + +//(*(out+0)).z = ( M[1][0].re*s1.x - M[1][0].im*s1.y ) + ( M[1][1].re*s1.z - M[1][1].im*s1.w ) + ( M[1][2].re*s2.x - M[1][2].im*s2.y ); +//(*(out+0)).w = ( M[1][0].re*s1.y + M[1][0].im*s1.x ) + ( M[1][1].re*s1.w + M[1][1].im*s1.z ) + ( M[1][2].re*s2.y + M[1][2].im*s2.x ); + + +(*(out+0)).z = 
M[1][0].re*s1.x; + (*(out+0)).w = M[1][0].re*s1.y; +(*(out+0)).z -= M[1][0].im*s1.y; + (*(out+0)).w += M[1][0].im*s1.x; +(*(out+0)).z += M[1][1].re*s1.z; + (*(out+0)).w += M[1][1].re*s1.w; +(*(out+0)).z -= M[1][1].im*s1.w; + (*(out+0)).w += M[1][1].im*s1.z; +(*(out+0)).z += M[1][2].re*s2.x; + (*(out+0)).w += M[1][2].re*s2.y; +(*(out+0)).z -= M[1][2].im*s2.y; + (*(out+0)).w += M[1][2].im*s2.x; + + + +//(*(out+1)).x = ( M[2][0].re*s1.x - M[2][0].im*s1.y ) + ( M[2][1].re*s1.z - M[2][1].im*s1.w ) + ( M[2][2].re*s2.x - M[2][2].im*s2.y ); +//(*(out+1)).y = ( M[2][0].re*s1.y + M[2][0].im*s1.x ) + ( M[2][1].re*s1.w + M[2][1].im*s1.z ) + ( M[2][2].re*s2.y + M[2][2].im*s2.x ); + + +(*(out+1)).x = M[2][0].re*s1.x; + (*(out+1)).y = M[2][0].re*s1.y; +(*(out+1)).x -= M[2][0].im*s1.y; + (*(out+1)).y += M[2][0].im*s1.x; +(*(out+1)).x += M[2][1].re*s1.z; + (*(out+1)).y += M[2][1].re*s1.w; +(*(out+1)).x -= M[2][1].im*s1.w; + (*(out+1)).y += M[2][1].im*s1.z; +(*(out+1)).x += M[2][2].re*s2.x; + (*(out+1)).y += M[2][2].re*s2.y; +(*(out+1)).x -= M[2][2].im*s2.y; + (*(out+1)).y += M[2][2].im*s2.x; + + + + + +s1 = tex1Dfetch(spin_tex,6*pos+2); +(*(out+1)).z = ( M[0][0].re*s2.z - M[0][0].im*s2.w ) + ( M[0][1].re*s1.x - M[0][1].im*s1.y ) + ( M[0][2].re*s1.z - M[0][2].im*s1.w ); +(*(out+1)).w = ( M[0][0].re*s2.w + M[0][0].im*s2.z ) + ( M[0][1].re*s1.y + M[0][1].im*s1.x ) + ( M[0][2].re*s1.w + M[0][2].im*s1.z ); + + +(*(out+2)).x = ( M[1][0].re*s2.z - M[1][0].im*s2.w ) + ( M[1][1].re*s1.x - M[1][1].im*s1.y ) + ( M[1][2].re*s1.z - M[1][2].im*s1.w ); +(*(out+2)).y = ( M[1][0].re*s2.w + M[1][0].im*s2.z ) + ( M[1][1].re*s1.y + M[1][1].im*s1.x ) + ( M[1][2].re*s1.w + M[1][2].im*s1.z ); + + +(*(out+2)).z = ( M[2][0].re*s2.z - M[2][0].im*s2.w ) + ( M[2][1].re*s1.x - M[2][1].im*s1.y ) + ( M[2][2].re*s1.z - M[2][2].im*s1.w ); +(*(out+2)).w = ( M[2][0].re*s2.w + M[2][0].im*s2.z ) + ( M[2][1].re*s1.y + M[2][1].im*s1.x ) + ( M[2][2].re*s1.w + M[2][2].im*s1.z ); + + + +s1 = tex1Dfetch(spin_tex,6*pos+3); 
+s2 = tex1Dfetch(spin_tex,6*pos+4); +(*(out+3)).x = ( M[0][0].re*s1.x - M[0][0].im*s1.y ) + ( M[0][1].re*s1.z - M[0][1].im*s1.w ) + ( M[0][2].re*s2.x - M[0][2].im*s2.y ); +(*(out+3)).y = ( M[0][0].re*s1.y + M[0][0].im*s1.x ) + ( M[0][1].re*s1.w + M[0][1].im*s1.z ) + ( M[0][2].re*s2.y + M[0][2].im*s2.x ); + + +(*(out+3)).z = ( M[1][0].re*s1.x - M[1][0].im*s1.y ) + ( M[1][1].re*s1.z - M[1][1].im*s1.w ) + ( M[1][2].re*s2.x - M[1][2].im*s2.y ); +(*(out+3)).w = ( M[1][0].re*s1.y + M[1][0].im*s1.x ) + ( M[1][1].re*s1.w + M[1][1].im*s1.z ) + ( M[1][2].re*s2.y + M[1][2].im*s2.x ); + + +(*(out+4)).x = ( M[2][0].re*s1.x - M[2][0].im*s1.y ) + ( M[2][1].re*s1.z - M[2][1].im*s1.w ) + ( M[2][2].re*s2.x - M[2][2].im*s2.y ); +(*(out+4)).y = ( M[2][0].re*s1.y + M[2][0].im*s1.x ) + ( M[2][1].re*s1.w + M[2][1].im*s1.z ) + ( M[2][2].re*s2.y + M[2][2].im*s2.x ); + + + +s1 = tex1Dfetch(spin_tex,6*pos+5); +(*(out+4)).z = ( M[0][0].re*s2.z - M[0][0].im*s2.w ) + ( M[0][1].re*s1.x - M[0][1].im*s1.y ) + ( M[0][2].re*s1.z - M[0][2].im*s1.w ); +(*(out+4)).w = ( M[0][0].re*s2.w + M[0][0].im*s2.z ) + ( M[0][1].re*s1.y + M[0][1].im*s1.x ) + ( M[0][2].re*s1.w + M[0][2].im*s1.z ); + + +(*(out+5)).x = ( M[1][0].re*s2.z - M[1][0].im*s2.w ) + ( M[1][1].re*s1.x - M[1][1].im*s1.y ) + ( M[1][2].re*s1.z - M[1][2].im*s1.w ); +(*(out+5)).y = ( M[1][0].re*s2.w + M[1][0].im*s2.z ) + ( M[1][1].re*s1.y + M[1][1].im*s1.x ) + ( M[1][2].re*s1.w + M[1][2].im*s1.z ); + + +(*(out+5)).z = ( M[2][0].re*s2.z - M[2][0].im*s2.w ) + ( M[2][1].re*s1.x - M[2][1].im*s1.y ) + ( M[2][2].re*s1.z - M[2][2].im*s1.w ); +(*(out+5)).w = ( M[2][0].re*s2.w + M[2][0].im*s2.z ) + ( M[2][1].re*s1.y + M[2][1].im*s1.x ) + ( M[2][2].re*s1.w + M[2][2].im*s1.z ); + + +} +*/ + + + + + +//multipliziert su3-Matrix mal Spinor im Dirac-Raum +//code in su3_MtV.txt -- generated with codegen +template +__device__ void dev_su3MtV_spintex(dev_su3M(RealT) M, int pos, dev_spinorM(RealT) * out){ + +dev_spinorM(RealT) s1, s2; + +#ifndef HALF + s1 = 
tex1Dfetch(spin_tex,6*pos); +#else + s1 = tex1Dfetch(spinhalf_tex,6*pos); + float norm = tex1Dfetch(spinnormhalf_tex,pos); + s1.x *= norm; + s1.y *= norm; + s1.z *= norm; + s1.w *= norm; +#endif + +#ifndef HALF + s2 = tex1Dfetch(spin_tex,6*pos+1); +#else + s2 = tex1Dfetch(spinhalf_tex,6*pos+1); + s2.x *= norm; + s2.y *= norm; + s2.z *= norm; + s2.w *= norm; +#endif + +(*(out+0)).x = ( M[0][0].re*s1.x - M[0][0].im*s1.y ) + ( M[0][1].re*s1.z - M[0][1].im*s1.w ) + ( M[0][2].re*s2.x - M[0][2].im*s2.y ); +(*(out+0)).y = ( M[0][0].re*s1.y + M[0][0].im*s1.x ) + ( M[0][1].re*s1.w + M[0][1].im*s1.z ) + ( M[0][2].re*s2.y + M[0][2].im*s2.x ); + + + +(*(out+0)).z = ( M[1][0].re*s1.x - M[1][0].im*s1.y ) + ( M[1][1].re*s1.z - M[1][1].im*s1.w ) + ( M[1][2].re*s2.x - M[1][2].im*s2.y ); +(*(out+0)).w = ( M[1][0].re*s1.y + M[1][0].im*s1.x ) + ( M[1][1].re*s1.w + M[1][1].im*s1.z ) + ( M[1][2].re*s2.y + M[1][2].im*s2.x ); + + +(*(out+1)).x = ( M[2][0].re*s1.x - M[2][0].im*s1.y ) + ( M[2][1].re*s1.z - M[2][1].im*s1.w ) + ( M[2][2].re*s2.x - M[2][2].im*s2.y ); +(*(out+1)).y = ( M[2][0].re*s1.y + M[2][0].im*s1.x ) + ( M[2][1].re*s1.w + M[2][1].im*s1.z ) + ( M[2][2].re*s2.y + M[2][2].im*s2.x ); + + +#ifndef HALF + s1 = tex1Dfetch(spin_tex,6*pos+2); +#else + s1 = tex1Dfetch(spinhalf_tex,6*pos+2); + s1.x *= norm; + s1.y *= norm; + s1.z *= norm; + s1.w *= norm; +#endif + +(*(out+1)).z = ( M[0][0].re*s2.z - M[0][0].im*s2.w ) + ( M[0][1].re*s1.x - M[0][1].im*s1.y ) + ( M[0][2].re*s1.z - M[0][2].im*s1.w ); +(*(out+1)).w = ( M[0][0].re*s2.w + M[0][0].im*s2.z ) + ( M[0][1].re*s1.y + M[0][1].im*s1.x ) + ( M[0][2].re*s1.w + M[0][2].im*s1.z ); + + +(*(out+2)).x = ( M[1][0].re*s2.z - M[1][0].im*s2.w ) + ( M[1][1].re*s1.x - M[1][1].im*s1.y ) + ( M[1][2].re*s1.z - M[1][2].im*s1.w ); +(*(out+2)).y = ( M[1][0].re*s2.w + M[1][0].im*s2.z ) + ( M[1][1].re*s1.y + M[1][1].im*s1.x ) + ( M[1][2].re*s1.w + M[1][2].im*s1.z ); + + +(*(out+2)).z = ( M[2][0].re*s2.z - M[2][0].im*s2.w ) + ( M[2][1].re*s1.x - 
M[2][1].im*s1.y ) + ( M[2][2].re*s1.z - M[2][2].im*s1.w ); +(*(out+2)).w = ( M[2][0].re*s2.w + M[2][0].im*s2.z ) + ( M[2][1].re*s1.y + M[2][1].im*s1.x ) + ( M[2][2].re*s1.w + M[2][2].im*s1.z ); + + +#ifndef HALF + s1 = tex1Dfetch(spin_tex,6*pos+3); +#else + s1 = tex1Dfetch(spinhalf_tex,6*pos+3); + s1.x *= norm; + s1.y *= norm; + s1.z *= norm; + s1.w *= norm; +#endif + +#ifndef HALF + s2 = tex1Dfetch(spin_tex,6*pos+4); +#else + s2 = tex1Dfetch(spinhalf_tex,6*pos+4); + s2.x *= norm; + s2.y *= norm; + s2.z *= norm; + s2.w *= norm; +#endif +(*(out+3)).x = ( M[0][0].re*s1.x - M[0][0].im*s1.y ) + ( M[0][1].re*s1.z - M[0][1].im*s1.w ) + ( M[0][2].re*s2.x - M[0][2].im*s2.y ); +(*(out+3)).y = ( M[0][0].re*s1.y + M[0][0].im*s1.x ) + ( M[0][1].re*s1.w + M[0][1].im*s1.z ) + ( M[0][2].re*s2.y + M[0][2].im*s2.x ); + + +(*(out+3)).z = ( M[1][0].re*s1.x - M[1][0].im*s1.y ) + ( M[1][1].re*s1.z - M[1][1].im*s1.w ) + ( M[1][2].re*s2.x - M[1][2].im*s2.y ); +(*(out+3)).w = ( M[1][0].re*s1.y + M[1][0].im*s1.x ) + ( M[1][1].re*s1.w + M[1][1].im*s1.z ) + ( M[1][2].re*s2.y + M[1][2].im*s2.x ); + + +(*(out+4)).x = ( M[2][0].re*s1.x - M[2][0].im*s1.y ) + ( M[2][1].re*s1.z - M[2][1].im*s1.w ) + ( M[2][2].re*s2.x - M[2][2].im*s2.y ); +(*(out+4)).y = ( M[2][0].re*s1.y + M[2][0].im*s1.x ) + ( M[2][1].re*s1.w + M[2][1].im*s1.z ) + ( M[2][2].re*s2.y + M[2][2].im*s2.x ); + + +#ifndef HALF + s1 = tex1Dfetch(spin_tex,6*pos+5); +#else + s1 = tex1Dfetch(spinhalf_tex,6*pos+5); + s1.x *= norm; + s1.y *= norm; + s1.z *= norm; + s1.w *= norm; +#endif +(*(out+4)).z = ( M[0][0].re*s2.z - M[0][0].im*s2.w ) + ( M[0][1].re*s1.x - M[0][1].im*s1.y ) + ( M[0][2].re*s1.z - M[0][2].im*s1.w ); +(*(out+4)).w = ( M[0][0].re*s2.w + M[0][0].im*s2.z ) + ( M[0][1].re*s1.y + M[0][1].im*s1.x ) + ( M[0][2].re*s1.w + M[0][2].im*s1.z ); + + +(*(out+5)).x = ( M[1][0].re*s2.z - M[1][0].im*s2.w ) + ( M[1][1].re*s1.x - M[1][1].im*s1.y ) + ( M[1][2].re*s1.z - M[1][2].im*s1.w ); +(*(out+5)).y = ( M[1][0].re*s2.w + M[1][0].im*s2.z ) + 
( M[1][1].re*s1.y + M[1][1].im*s1.x ) + ( M[1][2].re*s1.w + M[1][2].im*s1.z ); + + +(*(out+5)).z = ( M[2][0].re*s2.z - M[2][0].im*s2.w ) + ( M[2][1].re*s1.x - M[2][1].im*s1.y ) + ( M[2][2].re*s1.z - M[2][2].im*s1.w ); +(*(out+5)).w = ( M[2][0].re*s2.w + M[2][0].im*s2.z ) + ( M[2][1].re*s1.y + M[2][1].im*s1.x ) + ( M[2][2].re*s1.w + M[2][2].im*s1.z ); + + +} + + + + + + + + + + +//multipliziert su3-Matrix mal Spinor im Dirac-Raum +//code in su3_MtV.txt -- generated with codegen +template +__device__ void dev_su3MtV(typename dev_su3T::type M, const typename dev_spinorT::type * s, typename dev_spinorT::type * out){ + +(*(out+0)).x = ( M[0][0].re*(*(s+0)).x - M[0][0].im*(*(s+0)).y ) + ( M[0][1].re*(*(s+0)).z - M[0][1].im*(*(s+0)).w ) + ( M[0][2].re*(*(s+1)).x - M[0][2].im*(*(s+1)).y ); +(*(out+0)).y = ( M[0][0].re*(*(s+0)).y + M[0][0].im*(*(s+0)).x ) + ( M[0][1].re*(*(s+0)).w + M[0][1].im*(*(s+0)).z ) + ( M[0][2].re*(*(s+1)).y + M[0][2].im*(*(s+1)).x ); + + +(*(out+0)).z = ( M[1][0].re*(*(s+0)).x - M[1][0].im*(*(s+0)).y ) + ( M[1][1].re*(*(s+0)).z - M[1][1].im*(*(s+0)).w ) + ( M[1][2].re*(*(s+1)).x - M[1][2].im*(*(s+1)).y ); +(*(out+0)).w = ( M[1][0].re*(*(s+0)).y + M[1][0].im*(*(s+0)).x ) + ( M[1][1].re*(*(s+0)).w + M[1][1].im*(*(s+0)).z ) + ( M[1][2].re*(*(s+1)).y + M[1][2].im*(*(s+1)).x ); + + +(*(out+1)).x = ( M[2][0].re*(*(s+0)).x - M[2][0].im*(*(s+0)).y ) + ( M[2][1].re*(*(s+0)).z - M[2][1].im*(*(s+0)).w ) + ( M[2][2].re*(*(s+1)).x - M[2][2].im*(*(s+1)).y ); +(*(out+1)).y = ( M[2][0].re*(*(s+0)).y + M[2][0].im*(*(s+0)).x ) + ( M[2][1].re*(*(s+0)).w + M[2][1].im*(*(s+0)).z ) + ( M[2][2].re*(*(s+1)).y + M[2][2].im*(*(s+1)).x ); + + +(*(out+1)).z = ( M[0][0].re*(*(s+1)).z - M[0][0].im*(*(s+1)).w ) + ( M[0][1].re*(*(s+2)).x - M[0][1].im*(*(s+2)).y ) + ( M[0][2].re*(*(s+2)).z - M[0][2].im*(*(s+2)).w ); +(*(out+1)).w = ( M[0][0].re*(*(s+1)).w + M[0][0].im*(*(s+1)).z ) + ( M[0][1].re*(*(s+2)).y + M[0][1].im*(*(s+2)).x ) + ( M[0][2].re*(*(s+2)).w + M[0][2].im*(*(s+2)).z 
); + + +(*(out+2)).x = ( M[1][0].re*(*(s+1)).z - M[1][0].im*(*(s+1)).w ) + ( M[1][1].re*(*(s+2)).x - M[1][1].im*(*(s+2)).y ) + ( M[1][2].re*(*(s+2)).z - M[1][2].im*(*(s+2)).w ); +(*(out+2)).y = ( M[1][0].re*(*(s+1)).w + M[1][0].im*(*(s+1)).z ) + ( M[1][1].re*(*(s+2)).y + M[1][1].im*(*(s+2)).x ) + ( M[1][2].re*(*(s+2)).w + M[1][2].im*(*(s+2)).z ); + + +(*(out+2)).z = ( M[2][0].re*(*(s+1)).z - M[2][0].im*(*(s+1)).w ) + ( M[2][1].re*(*(s+2)).x - M[2][1].im*(*(s+2)).y ) + ( M[2][2].re*(*(s+2)).z - M[2][2].im*(*(s+2)).w ); +(*(out+2)).w = ( M[2][0].re*(*(s+1)).w + M[2][0].im*(*(s+1)).z ) + ( M[2][1].re*(*(s+2)).y + M[2][1].im*(*(s+2)).x ) + ( M[2][2].re*(*(s+2)).w + M[2][2].im*(*(s+2)).z ); + + +(*(out+3)).x = ( M[0][0].re*(*(s+3)).x - M[0][0].im*(*(s+3)).y ) + ( M[0][1].re*(*(s+3)).z - M[0][1].im*(*(s+3)).w ) + ( M[0][2].re*(*(s+4)).x - M[0][2].im*(*(s+4)).y ); +(*(out+3)).y = ( M[0][0].re*(*(s+3)).y + M[0][0].im*(*(s+3)).x ) + ( M[0][1].re*(*(s+3)).w + M[0][1].im*(*(s+3)).z ) + ( M[0][2].re*(*(s+4)).y + M[0][2].im*(*(s+4)).x ); + + +(*(out+3)).z = ( M[1][0].re*(*(s+3)).x - M[1][0].im*(*(s+3)).y ) + ( M[1][1].re*(*(s+3)).z - M[1][1].im*(*(s+3)).w ) + ( M[1][2].re*(*(s+4)).x - M[1][2].im*(*(s+4)).y ); +(*(out+3)).w = ( M[1][0].re*(*(s+3)).y + M[1][0].im*(*(s+3)).x ) + ( M[1][1].re*(*(s+3)).w + M[1][1].im*(*(s+3)).z ) + ( M[1][2].re*(*(s+4)).y + M[1][2].im*(*(s+4)).x ); + + +(*(out+4)).x = ( M[2][0].re*(*(s+3)).x - M[2][0].im*(*(s+3)).y ) + ( M[2][1].re*(*(s+3)).z - M[2][1].im*(*(s+3)).w ) + ( M[2][2].re*(*(s+4)).x - M[2][2].im*(*(s+4)).y ); +(*(out+4)).y = ( M[2][0].re*(*(s+3)).y + M[2][0].im*(*(s+3)).x ) + ( M[2][1].re*(*(s+3)).w + M[2][1].im*(*(s+3)).z ) + ( M[2][2].re*(*(s+4)).y + M[2][2].im*(*(s+4)).x ); + + +(*(out+4)).z = ( M[0][0].re*(*(s+4)).z - M[0][0].im*(*(s+4)).w ) + ( M[0][1].re*(*(s+5)).x - M[0][1].im*(*(s+5)).y ) + ( M[0][2].re*(*(s+5)).z - M[0][2].im*(*(s+5)).w ); +(*(out+4)).w = ( M[0][0].re*(*(s+4)).w + M[0][0].im*(*(s+4)).z ) + ( M[0][1].re*(*(s+5)).y 
+ M[0][1].im*(*(s+5)).x ) + ( M[0][2].re*(*(s+5)).w + M[0][2].im*(*(s+5)).z ); + + +(*(out+5)).x = ( M[1][0].re*(*(s+4)).z - M[1][0].im*(*(s+4)).w ) + ( M[1][1].re*(*(s+5)).x - M[1][1].im*(*(s+5)).y ) + ( M[1][2].re*(*(s+5)).z - M[1][2].im*(*(s+5)).w ); +(*(out+5)).y = ( M[1][0].re*(*(s+4)).w + M[1][0].im*(*(s+4)).z ) + ( M[1][1].re*(*(s+5)).y + M[1][1].im*(*(s+5)).x ) + ( M[1][2].re*(*(s+5)).w + M[1][2].im*(*(s+5)).z ); + + +(*(out+5)).z = ( M[2][0].re*(*(s+4)).z - M[2][0].im*(*(s+4)).w ) + ( M[2][1].re*(*(s+5)).x - M[2][1].im*(*(s+5)).y ) + ( M[2][2].re*(*(s+5)).z - M[2][2].im*(*(s+5)).w ); +(*(out+5)).w = ( M[2][0].re*(*(s+4)).w + M[2][0].im*(*(s+4)).z ) + ( M[2][1].re*(*(s+5)).y + M[2][1].im*(*(s+5)).x ) + ( M[2][2].re*(*(s+5)).w + M[2][2].im*(*(s+5)).z ); +} + + + + + + +#ifdef HALF +//multipliziert su3-Matrix mal Spinor im Dirac-Raum +//code in su3_MtV.txt -- generated with codegen +__device__ void dev_su3MtV_half(dev_su3 M, const dev_spinor_half * s, const float * s_norm, dev_spinor * out){ +float norm = * s_norm; + +(*(out+0)).x = ( M[0][0].re*half2fl((*(s+0)).x,norm) - + M[0][0].im*half2fl((*(s+0)).y,norm) ) + + ( M[0][1].re*half2fl((*(s+0)).z,norm) - + M[0][1].im*half2fl((*(s+0)).w,norm) ) + + ( M[0][2].re*half2fl((*(s+1)).x,norm) - + M[0][2].im*half2fl((*(s+1)).y,norm) ); +(*(out+0)).y = ( M[0][0].re*half2fl((*(s+0)).y,norm) + + M[0][0].im*half2fl((*(s+0)).x,norm) ) + + ( M[0][1].re*half2fl((*(s+0)).w,norm) + + M[0][1].im*half2fl((*(s+0)).z,norm) ) + + ( M[0][2].re*half2fl((*(s+1)).y,norm) + + M[0][2].im*half2fl((*(s+1)).x,norm) ); + + +(*(out+0)).z = ( M[1][0].re*half2fl((*(s+0)).x,norm) - + M[1][0].im*half2fl((*(s+0)).y,norm) ) + + ( M[1][1].re*half2fl((*(s+0)).z,norm) - + M[1][1].im*half2fl((*(s+0)).w,norm) ) + + ( M[1][2].re*half2fl((*(s+1)).x,norm) - + M[1][2].im*half2fl((*(s+1)).y,norm) ); +(*(out+0)).w = ( M[1][0].re*half2fl((*(s+0)).y,norm) + + M[1][0].im*half2fl((*(s+0)).x,norm) ) + + ( M[1][1].re*half2fl((*(s+0)).w,norm) + + 
M[1][1].im*half2fl((*(s+0)).z,norm) ) + + ( M[1][2].re*half2fl((*(s+1)).y,norm) + + M[1][2].im*half2fl((*(s+1)).x,norm) ); + + +(*(out+1)).x = ( M[2][0].re*half2fl((*(s+0)).x, norm) - + M[2][0].im*half2fl((*(s+0)).y, norm) ) + + ( M[2][1].re*half2fl((*(s+0)).z, norm) - + M[2][1].im*half2fl((*(s+0)).w, norm) ) + + ( M[2][2].re*half2fl((*(s+1)).x, norm) - + M[2][2].im*half2fl((*(s+1)).y, norm) ); +(*(out+1)).y = ( M[2][0].re*half2fl((*(s+0)).y, norm) + + M[2][0].im*half2fl((*(s+0)).x, norm) ) + + ( M[2][1].re*half2fl((*(s+0)).w, norm) + + M[2][1].im*half2fl((*(s+0)).z, norm) ) + + ( M[2][2].re*half2fl((*(s+1)).y, norm) + + M[2][2].im*half2fl((*(s+1)).x, norm) ); + + +(*(out+1)).z = ( M[0][0].re*half2fl((*(s+1)).z, norm) - + M[0][0].im*half2fl((*(s+1)).w, norm) ) + + ( M[0][1].re*half2fl((*(s+2)).x, norm) - + M[0][1].im*half2fl((*(s+2)).y, norm) ) + + ( M[0][2].re*half2fl((*(s+2)).z, norm) - + M[0][2].im*half2fl((*(s+2)).w, norm) ); +(*(out+1)).w = ( M[0][0].re*half2fl((*(s+1)).w, norm) + + M[0][0].im*half2fl((*(s+1)).z, norm) ) + + ( M[0][1].re*half2fl((*(s+2)).y, norm) + + M[0][1].im*half2fl((*(s+2)).x, norm) ) + + ( M[0][2].re*half2fl((*(s+2)).w, norm) + + M[0][2].im*half2fl((*(s+2)).z, norm) ); + + +(*(out+2)).x = ( M[1][0].re*half2fl((*(s+1)).z, norm) - + M[1][0].im*half2fl((*(s+1)).w, norm) ) + + ( M[1][1].re*half2fl((*(s+2)).x, norm) - + M[1][1].im*half2fl((*(s+2)).y, norm) ) + + ( M[1][2].re*half2fl((*(s+2)).z, norm) - + M[1][2].im*half2fl((*(s+2)).w, norm) ); +(*(out+2)).y = ( M[1][0].re*half2fl((*(s+1)).w, norm) + + M[1][0].im*half2fl((*(s+1)).z, norm) ) + + ( M[1][1].re*half2fl((*(s+2)).y, norm) + + M[1][1].im*half2fl((*(s+2)).x, norm) ) + + ( M[1][2].re*half2fl((*(s+2)).w, norm) + + M[1][2].im*half2fl((*(s+2)).z, norm) ); + + +(*(out+2)).z = ( M[2][0].re*half2fl((*(s+1)).z, norm) - + M[2][0].im*half2fl((*(s+1)).w, norm) ) + + ( M[2][1].re*half2fl((*(s+2)).x, norm) - + M[2][1].im*half2fl((*(s+2)).y, norm) ) + + ( M[2][2].re*half2fl((*(s+2)).z, norm) - + 
M[2][2].im*half2fl((*(s+2)).w, norm) ); +(*(out+2)).w = ( M[2][0].re*half2fl((*(s+1)).w, norm) + + M[2][0].im*half2fl((*(s+1)).z, norm) ) + + ( M[2][1].re*half2fl((*(s+2)).y, norm) + + M[2][1].im*half2fl((*(s+2)).x, norm) ) + + ( M[2][2].re*half2fl((*(s+2)).w, norm) + + M[2][2].im*half2fl((*(s+2)).z, norm) ); + + +(*(out+3)).x = ( M[0][0].re*half2fl((*(s+3)).x, norm) - + M[0][0].im*half2fl((*(s+3)).y, norm) ) + + ( M[0][1].re*half2fl((*(s+3)).z, norm) - + M[0][1].im*half2fl((*(s+3)).w, norm) ) + + ( M[0][2].re*half2fl((*(s+4)).x, norm) - + M[0][2].im*half2fl((*(s+4)).y, norm) ); +(*(out+3)).y = ( M[0][0].re*half2fl((*(s+3)).y, norm) + + M[0][0].im*half2fl((*(s+3)).x, norm) ) + + ( M[0][1].re*half2fl((*(s+3)).w, norm) + + M[0][1].im*half2fl((*(s+3)).z, norm) ) + + ( M[0][2].re*half2fl((*(s+4)).y, norm) + + M[0][2].im*half2fl((*(s+4)).x, norm) ); + + +(*(out+3)).z = ( M[1][0].re*half2fl((*(s+3)).x, norm) - + M[1][0].im*half2fl((*(s+3)).y, norm) ) + + ( M[1][1].re*half2fl((*(s+3)).z, norm) - + M[1][1].im*half2fl((*(s+3)).w, norm) ) + + ( M[1][2].re*half2fl((*(s+4)).x, norm) - + M[1][2].im*half2fl((*(s+4)).y, norm) ); +(*(out+3)).w = ( M[1][0].re*half2fl((*(s+3)).y, norm) + + M[1][0].im*half2fl((*(s+3)).x, norm) ) + + ( M[1][1].re*half2fl((*(s+3)).w, norm) + + M[1][1].im*half2fl((*(s+3)).z, norm) ) + + ( M[1][2].re*half2fl((*(s+4)).y, norm) + + M[1][2].im*half2fl((*(s+4)).x, norm) ); + + +(*(out+4)).x = ( M[2][0].re*half2fl((*(s+3)).x, norm) - + M[2][0].im*half2fl((*(s+3)).y, norm) ) + + ( M[2][1].re*half2fl((*(s+3)).z, norm) - + M[2][1].im*half2fl((*(s+3)).w, norm) ) + + ( M[2][2].re*half2fl((*(s+4)).x, norm) - + M[2][2].im*half2fl((*(s+4)).y, norm) ); +(*(out+4)).y = ( M[2][0].re*half2fl((*(s+3)).y, norm) + + M[2][0].im*half2fl((*(s+3)).x, norm) ) + + ( M[2][1].re*half2fl((*(s+3)).w, norm) + + M[2][1].im*half2fl((*(s+3)).z, norm) ) + + ( M[2][2].re*half2fl((*(s+4)).y, norm) + + M[2][2].im*half2fl((*(s+4)).x, norm) ); + + +(*(out+4)).z = ( 
M[0][0].re*half2fl((*(s+4)).z, norm) - + M[0][0].im*half2fl((*(s+4)).w, norm) ) + + ( M[0][1].re*half2fl((*(s+5)).x, norm) - + M[0][1].im*half2fl((*(s+5)).y, norm) ) + + ( M[0][2].re*half2fl((*(s+5)).z, norm) - + M[0][2].im*half2fl((*(s+5)).w, norm) ); +(*(out+4)).w = ( M[0][0].re*half2fl((*(s+4)).w, norm) + + M[0][0].im*half2fl((*(s+4)).z, norm) ) + + ( M[0][1].re*half2fl((*(s+5)).y, norm) + + M[0][1].im*half2fl((*(s+5)).x, norm) ) + + ( M[0][2].re*half2fl((*(s+5)).w, norm) + + M[0][2].im*half2fl((*(s+5)).z, norm) ); + + +(*(out+5)).x = ( M[1][0].re*half2fl((*(s+4)).z, norm) - + M[1][0].im*half2fl((*(s+4)).w, norm) ) + + ( M[1][1].re*half2fl((*(s+5)).x, norm) - + M[1][1].im*half2fl((*(s+5)).y, norm) ) + + ( M[1][2].re*half2fl((*(s+5)).z, norm) - + M[1][2].im*half2fl((*(s+5)).w, norm) ); +(*(out+5)).y = ( M[1][0].re*half2fl((*(s+4)).w, norm) + + M[1][0].im*half2fl((*(s+4)).z, norm) ) + + ( M[1][1].re*half2fl((*(s+5)).y, norm) + + M[1][1].im*half2fl((*(s+5)).x, norm) ) + + ( M[1][2].re*half2fl((*(s+5)).w, norm) + + M[1][2].im*half2fl((*(s+5)).z, norm) ); + + +(*(out+5)).z = ( M[2][0].re*half2fl((*(s+4)).z, norm) - + M[2][0].im*half2fl((*(s+4)).w, norm) ) + + ( M[2][1].re*half2fl((*(s+5)).x, norm) - + M[2][1].im*half2fl((*(s+5)).y, norm) ) + + ( M[2][2].re*half2fl((*(s+5)).z, norm) - + M[2][2].im*half2fl((*(s+5)).w, norm) ); +(*(out+5)).w = ( M[2][0].re*half2fl((*(s+4)).w, norm) + + M[2][0].im*half2fl((*(s+4)).z, norm) ) + + ( M[2][1].re*half2fl((*(s+5)).y, norm) + + M[2][1].im*half2fl((*(s+5)).x, norm) ) + + ( M[2][2].re*half2fl((*(s+5)).w, norm) + + M[2][2].im*half2fl((*(s+5)).z, norm) ); +} +#endif + + + + + + + +//multipliziert gedaggerte su3-Matrix mal Spinor im Dirac-Raum -- generated with codegen +template +__device__ void dev_su3MdaggertV(typename dev_su3T::type M, typename dev_spinorT::type * s, typename dev_spinorT::type * out){ + dev_complexT help1; +help1.re = M[0][0].re*(*(s+0)).x + M[0][0].im*(*(s+0)).y + M[1][0].re*(*(s+0)).z + M[1][0].im*(*(s+0)).w + 
M[2][0].re*(*(s+1)).x + M[2][0].im*(*(s+1)).y; +(*(out+0)).x = help1.re; +help1.im = M[0][0].re*(*(s+0)).y - M[0][0].im*(*(s+0)).x + M[1][0].re*(*(s+0)).w - M[1][0].im*(*(s+0)).z + M[2][0].re*(*(s+1)).y - M[2][0].im*(*(s+1)).x; +(*(out+0)).y = help1.im; + +help1.re = M[0][1].re*(*(s+0)).x + M[0][1].im*(*(s+0)).y + M[1][1].re*(*(s+0)).z + M[1][1].im*(*(s+0)).w + M[2][1].re*(*(s+1)).x + M[2][1].im*(*(s+1)).y; +(*(out+0)).z = help1.re; +help1.im = M[0][1].re*(*(s+0)).y - M[0][1].im*(*(s+0)).x + M[1][1].re*(*(s+0)).w - M[1][1].im*(*(s+0)).z + M[2][1].re*(*(s+1)).y - M[2][1].im*(*(s+1)).x; +(*(out+0)).w = help1.im; + +help1.re = M[0][2].re*(*(s+0)).x + M[0][2].im*(*(s+0)).y + M[1][2].re*(*(s+0)).z + M[1][2].im*(*(s+0)).w + M[2][2].re*(*(s+1)).x + M[2][2].im*(*(s+1)).y; +(*(out+1)).x = help1.re; +help1.im = M[0][2].re*(*(s+0)).y - M[0][2].im*(*(s+0)).x + M[1][2].re*(*(s+0)).w - M[1][2].im*(*(s+0)).z + M[2][2].re*(*(s+1)).y - M[2][2].im*(*(s+1)).x; +(*(out+1)).y = help1.im; + +help1.re = M[0][0].re*(*(s+1)).z + M[0][0].im*(*(s+1)).w + M[1][0].re*(*(s+2)).x + M[1][0].im*(*(s+2)).y + M[2][0].re*(*(s+2)).z + M[2][0].im*(*(s+2)).w; +(*(out+1)).z = help1.re; +help1.im = M[0][0].re*(*(s+1)).w - M[0][0].im*(*(s+1)).z + M[1][0].re*(*(s+2)).y - M[1][0].im*(*(s+2)).x + M[2][0].re*(*(s+2)).w - M[2][0].im*(*(s+2)).z; +(*(out+1)).w = help1.im; + +help1.re = M[0][1].re*(*(s+1)).z + M[0][1].im*(*(s+1)).w + M[1][1].re*(*(s+2)).x + M[1][1].im*(*(s+2)).y + M[2][1].re*(*(s+2)).z + M[2][1].im*(*(s+2)).w; +(*(out+2)).x = help1.re; +help1.im = M[0][1].re*(*(s+1)).w - M[0][1].im*(*(s+1)).z + M[1][1].re*(*(s+2)).y - M[1][1].im*(*(s+2)).x + M[2][1].re*(*(s+2)).w - M[2][1].im*(*(s+2)).z; +(*(out+2)).y = help1.im; + +help1.re = M[0][2].re*(*(s+1)).z + M[0][2].im*(*(s+1)).w + M[1][2].re*(*(s+2)).x + M[1][2].im*(*(s+2)).y + M[2][2].re*(*(s+2)).z + M[2][2].im*(*(s+2)).w; +(*(out+2)).z = help1.re; +help1.im = M[0][2].re*(*(s+1)).w - M[0][2].im*(*(s+1)).z + M[1][2].re*(*(s+2)).y - M[1][2].im*(*(s+2)).x 
+ M[2][2].re*(*(s+2)).w - M[2][2].im*(*(s+2)).z; +(*(out+2)).w = help1.im; + +help1.re = M[0][0].re*(*(s+3)).x + M[0][0].im*(*(s+3)).y + M[1][0].re*(*(s+3)).z + M[1][0].im*(*(s+3)).w + M[2][0].re*(*(s+4)).x + M[2][0].im*(*(s+4)).y; +(*(out+3)).x = help1.re; +help1.im = M[0][0].re*(*(s+3)).y - M[0][0].im*(*(s+3)).x + M[1][0].re*(*(s+3)).w - M[1][0].im*(*(s+3)).z + M[2][0].re*(*(s+4)).y - M[2][0].im*(*(s+4)).x; +(*(out+3)).y = help1.im; + +help1.re = M[0][1].re*(*(s+3)).x + M[0][1].im*(*(s+3)).y + M[1][1].re*(*(s+3)).z + M[1][1].im*(*(s+3)).w + M[2][1].re*(*(s+4)).x + M[2][1].im*(*(s+4)).y; +(*(out+3)).z = help1.re; +help1.im = M[0][1].re*(*(s+3)).y - M[0][1].im*(*(s+3)).x + M[1][1].re*(*(s+3)).w - M[1][1].im*(*(s+3)).z + M[2][1].re*(*(s+4)).y - M[2][1].im*(*(s+4)).x; +(*(out+3)).w = help1.im; + +help1.re = M[0][2].re*(*(s+3)).x + M[0][2].im*(*(s+3)).y + M[1][2].re*(*(s+3)).z + M[1][2].im*(*(s+3)).w + M[2][2].re*(*(s+4)).x + M[2][2].im*(*(s+4)).y; +(*(out+4)).x = help1.re; +help1.im = M[0][2].re*(*(s+3)).y - M[0][2].im*(*(s+3)).x + M[1][2].re*(*(s+3)).w - M[1][2].im*(*(s+3)).z + M[2][2].re*(*(s+4)).y - M[2][2].im*(*(s+4)).x; +(*(out+4)).y = help1.im; + +help1.re = M[0][0].re*(*(s+4)).z + M[0][0].im*(*(s+4)).w + M[1][0].re*(*(s+5)).x + M[1][0].im*(*(s+5)).y + M[2][0].re*(*(s+5)).z + M[2][0].im*(*(s+5)).w; +(*(out+4)).z = help1.re; +help1.im = M[0][0].re*(*(s+4)).w - M[0][0].im*(*(s+4)).z + M[1][0].re*(*(s+5)).y - M[1][0].im*(*(s+5)).x + M[2][0].re*(*(s+5)).w - M[2][0].im*(*(s+5)).z; +(*(out+4)).w = help1.im; + +help1.re = M[0][1].re*(*(s+4)).z + M[0][1].im*(*(s+4)).w + M[1][1].re*(*(s+5)).x + M[1][1].im*(*(s+5)).y + M[2][1].re*(*(s+5)).z + M[2][1].im*(*(s+5)).w; +(*(out+5)).x = help1.re; +help1.im = M[0][1].re*(*(s+4)).w - M[0][1].im*(*(s+4)).z + M[1][1].re*(*(s+5)).y - M[1][1].im*(*(s+5)).x + M[2][1].re*(*(s+5)).w - M[2][1].im*(*(s+5)).z; +(*(out+5)).y = help1.im; + +help1.re = M[0][2].re*(*(s+4)).z + M[0][2].im*(*(s+4)).w + M[1][2].re*(*(s+5)).x + 
M[1][2].im*(*(s+5)).y + M[2][2].re*(*(s+5)).z + M[2][2].im*(*(s+5)).w; +(*(out+5)).z = help1.re; +help1.im = M[0][2].re*(*(s+4)).w - M[0][2].im*(*(s+4)).z + M[1][2].re*(*(s+5)).y - M[1][2].im*(*(s+5)).x + M[2][2].re*(*(s+5)).w - M[2][2].im*(*(s+5)).z; +(*(out+5)).w = help1.im; +}
+
+
+
+
+// Gamma t
+// Applies Gamma(t) to the spinor in place: for k = 0, 1, 2 every member
+// (x, y, z, w) of in[k] is exchanged with the same member of in[k+3],
+// and both values change sign.
+//
+// in : array of 6 spinor elements, read and overwritten.
+template
+__device__ void dev_Gamma0(typename dev_spinorT::type * in){
+  RealT tmp;
+  #pragma unroll 3
+  for(int k=0; k<3; k++){
+    // Unary minus instead of "-1.0*value": -1.0 is a double literal, so the
+    // product would be evaluated in double precision when RealT is float.
+    // The stored results are the same.
+    tmp = in[k].x;  in[k].x = -in[k+3].x;  in[k+3].x = -tmp;
+    tmp = in[k].y;  in[k].y = -in[k+3].y;  in[k+3].y = -tmp;
+    tmp = in[k].z;  in[k].z = -in[k+3].z;  in[k+3].z = -tmp;
+    tmp = in[k].w;  in[k].w = -in[k+3].w;  in[k+3].w = -tmp;
+  }
+}
+
+
+
+//Gamma z +template +__device__ void dev_Gamma3(typename dev_spinorT::type * in){ + RealT tempre,tempim; + tempre = (*(in+0)).x; + tempim = (*(in+0)).y; + (*(in+0)).x = (*(in+3)).y; + (*(in+0)).y = -1.0*(*(in+3)).x; + (*(in+3)).x = -1.0*tempim; + (*(in+3)).y = tempre; + + tempre = (*(in+0)).z; + tempim = (*(in+0)).w; + (*(in+0)).z = (*(in+3)).w; + (*(in+0)).w = -1.0*(*(in+3)).z; + (*(in+3)).z = -1.0*tempim; + (*(in+3)).w = tempre; + + + tempre = (*(in+1)).x; + tempim = (*(in+1)).y; + (*(in+1)).x = (*(in+4)).y; + (*(in+1)).y = 
-1.0*(*(in+4)).x; + (*(in+4)).x = -1.0*tempim; + (*(in+4)).y = tempre; + + + tempre = (*(in+1)).z; + tempim = (*(in+1)).w; + (*(in+1)).z = -1.0*(*(in+4)).w; + (*(in+1)).w = (*(in+4)).z; + (*(in+4)).z = tempim; + (*(in+4)).w = -1.0*tempre; + + + + tempre = (*(in+2)).x; + tempim = (*(in+2)).y; + (*(in+2)).x = -1.0*(*(in+5)).y; + (*(in+2)).y = (*(in+5)).x; + (*(in+5)).x = tempim; + (*(in+5)).y = -1.0*tempre; + + + tempre = (*(in+2)).z; + tempim = (*(in+2)).w; + (*(in+2)).z = -1.0*(*(in+5)).w; + (*(in+2)).w = (*(in+5)).z; + (*(in+5)).z = tempim; + (*(in+5)).w = -1.0*tempre; + +}
+
+
+
+//Gamma y
+// Applies Gamma(y) to the spinor in place. The twelve (x,y)/(z,w) pairs are
+// exchanged two by two:
+//   in[0].xy <-> in[4].zw, in[0].zw <-> in[5].xy, in[1].xy <-> in[5].zw  (both values change sign)
+//   in[1].zw <-> in[3].xy, in[2].xy <-> in[3].zw, in[2].zw <-> in[4].xy  (plain exchange)
+//
+// in : array of 6 spinor elements, read and overwritten.
+template
+__device__ void dev_Gamma2(typename dev_spinorT::type * in){
+  RealT tmp;
+
+  // Exchange with sign change. Unary minus is used instead of "-1.0*value":
+  // -1.0 is a double literal, so the product would be evaluated in double
+  // precision when RealT is float. The stored results are the same.
+  tmp = in[0].x;  in[0].x = -in[4].z;  in[4].z = -tmp;
+  tmp = in[0].y;  in[0].y = -in[4].w;  in[4].w = -tmp;
+
+  tmp = in[0].z;  in[0].z = -in[5].x;  in[5].x = -tmp;
+  tmp = in[0].w;  in[0].w = -in[5].y;  in[5].y = -tmp;
+
+  tmp = in[1].x;  in[1].x = -in[5].z;  in[5].z = -tmp;
+  tmp = in[1].y;  in[1].y = -in[5].w;  in[5].w = -tmp;
+
+  // Plain exchange, no sign change.
+  tmp = in[1].z;  in[1].z = in[3].x;  in[3].x = tmp;
+  tmp = in[1].w;  in[1].w = in[3].y;  in[3].y = tmp;
+
+  tmp = in[2].x;  in[2].x = in[3].z;  in[3].z = tmp;
+  tmp = in[2].y;  in[2].y = in[3].w;  in[3].w = tmp;
+
+  tmp = in[2].z;  in[2].z = in[4].x;  in[4].x = tmp;
+  tmp = in[2].w;  in[2].w = in[4].y;  in[4].y = tmp;
+}
+
+
+
+//Gamma x +template +__device__ void dev_Gamma1(typename dev_spinorT::type * in){ + RealT tempre,tempim; + tempre = (*(in+0)).x; + tempim = (*(in+0)).y; + (*(in+0)).x = (*(in+4)).w; + (*(in+0)).y = -1.0*(*(in+4)).z; + (*(in+4)).z = -1.0*tempim; + (*(in+4)).w = tempre; + + tempre = (*(in+0)).z; + 
tempim = (*(in+0)).w; + (*(in+0)).z = (*(in+5)).y; + (*(in+0)).w = -1.0*(*(in+5)).x; + (*(in+5)).x = -1.0*tempim; + (*(in+5)).y = tempre; + + tempre = (*(in+1)).x; + tempim = (*(in+1)).y; + (*(in+1)).x = (*(in+5)).w; + (*(in+1)).y = -1.0*(*(in+5)).z; + (*(in+5)).z = -1.0*tempim; + (*(in+5)).w = tempre; + + tempre = (*(in+1)).z; + tempim = (*(in+1)).w; + (*(in+1)).z = (*(in+3)).y; + (*(in+1)).w = -1.0*(*(in+3)).x; + (*(in+3)).x = -1.0*tempim; + (*(in+3)).y = tempre; + + tempre = (*(in+2)).x; + tempim = (*(in+2)).y; + (*(in+2)).x = (*(in+3)).w; + (*(in+2)).y = -1.0*(*(in+3)).z; + (*(in+3)).z = -1.0*tempim; + (*(in+3)).w = tempre; + + + tempre = (*(in+2)).z; + tempim = (*(in+2)).w; + (*(in+2)).z = (*(in+4)).y; + (*(in+2)).w = -1.0*(*(in+4)).x; + (*(in+4)).x = -1.0*tempim; + (*(in+4)).y = tempre; + +}
+
+
+
+// Gamma5 applied in place: every member (x, y, z, w) of in[3], in[4] and
+// in[5] changes sign; in[0], in[1] and in[2] are not touched.
+//
+// in : array of 6 spinor elements, elements 3..5 are overwritten.
+template
+__device__ void dev_Gamma5(typename dev_spinorT::type * in){
+  #pragma unroll 3
+  for(int k=3; k<6; k++){
+    // Unary minus instead of "-1.0*value": -1.0 is a double literal, so the
+    // product would be evaluated in double precision when the members are
+    // float. The stored results are the same.
+    in[k].x = -in[k].x;
+    in[k].y = -in[k].y;
+    in[k].z = -in[k].z;
+    in[k].w = -in[k].w;
+  }
+}
+
+
+template +__device__ void dev_Gamma5_assign(typename dev_spinorT::type* out, typename dev_spinorT::type* in){ + (*(out)).x = (*(in)).x; + (*(out)).y = (*(in)).y; + (*(out)).z = (*(in)).z; + (*(out)).w = (*(in)).w; + (*(out+1)).x = (*(in+1)).x; + (*(out+1)).y = (*(in+1)).y; + + (*(out+1)).z = (*(in+1)).z; + (*(out+1)).w = (*(in+1)).w; + (*(out+2)).x = (*(in+2)).x; + (*(out+2)).y = (*(in+2)).y; + (*(out+2)).z = (*(in+2)).z; + (*(out+2)).w = (*(in+2)).w; + + (*(out+3)).x = -1.0*(*(in+3)).x; + (*(out+3)).y = -1.0*(*(in+3)).y; + (*(out+3)).z = -1.0*(*(in+3)).z; + (*(out+3)).w = -1.0*(*(in+3)).w; + (*(out+4)).x = -1.0*(*(in+4)).x; + (*(out+4)).y = -1.0*(*(in+4)).y; + + (*(out+4)).z = 
-1.0*(*(in+4)).z; + (*(out+4)).w = -1.0*(*(in+4)).w; + (*(out+5)).x = -1.0*(*(in+5)).x; + (*(out+5)).y = -1.0*(*(in+5)).y; + (*(out+5)).z = -1.0*(*(in+5)).z; + (*(out+5)).w = -1.0*(*(in+5)).w; +}
+
+
+
+
+// older single-entry-point version
+// Multiplies Gamma(mu)*V in place, exploiting the zeros of the matrix.
+//
+// mu : selects the matrix; the order is t, z, y, x followed by Gamma5:
+//        0 -> dev_Gamma0, 1 -> dev_Gamma3, 2 -> dev_Gamma2,
+//        3 -> dev_Gamma1, 4 -> dev_Gamma5.
+//      Any other value leaves the spinor unchanged.
+// in : array of 6 spinor elements, read and overwritten.
+//
+// Each case used to carry its own copy of the statements that also make up
+// the per-direction routines defined above; it now forwards to them so there
+// is a single implementation per matrix.
+template
+__device__ void dev_GammatV(int mu, typename dev_spinorT::type * in){
+  /* ORDER: t, z, y, x*/
+  // RealT is passed explicitly: it only occurs inside a nested
+  // "typename ...::type" parameter and therefore cannot be deduced.
+  // NOTE(review): assumes dev_Gamma0/1/2/3/5 take RealT as their only
+  // template parameter -- confirm against their declarations.
+  switch (mu){
+    case 0:
+      dev_Gamma0<RealT>(in);
+      break;
+
+    case 1:
+      dev_Gamma3<RealT>(in);
+      break;
+
+    case 2:
+      dev_Gamma2<RealT>(in);
+      break;
+
+    case 3:
+      dev_Gamma1<RealT>(in);
+      break;
+
+    case 4:
+      dev_Gamma5<RealT>(in);
+      break;
+  }
+}
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/mixed_solve.cu b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/mixed_solve.cu new file mode 100644 index 0000000000000000000000000000000000000000..5a33e4e37b5e4e048ca487484ad89c86526fca1e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/mixed_solve.cu @@ -0,0 +1,3132 @@ +/*********************************************************************** + * + * Copyright (C) 2010 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * + * File: mixed_solve.cu + * + * CUDA GPU mixed_solver for EO and non-EO + * CUDA kernels for Hopping-Matrix and D_tm + * + * The externally accessible functions are + * + * + * extern "C" int mixed_solve_eo (spinor * const P, spinor * const Q, const int max_iter, + double eps, const int rel_prec, const int N) + * + * extern "C" int mixed_solve (spinor * const P, spinor * const Q, const int max_iter, + double eps, const int rel_prec,const int N) + * + * input: + * Q: source + * inout: + * P: initial guess and result + * + * + **************************************************************************/ + + + +#include +#include +#include "cublas.h" +#include +#include +#include +#include + +#include "../global.h" +#include "cudaglobal.h" +//#include "mixed_solve.h" +#include "HEADER.h" +#include "cudadefs.h" +#include + + +extern "C" { +#include "../tm_operators.h" +#include "../linalg_eo.h" +#include "../start.h" +#include "../complex.h" +#include "../read_input.h" +#include "../geometry_eo.h" +#include "../boundary.h" +#include "../su3.h" +#include "../temporalgauge.h" +#include "../observables.h" +#include "../measure_rectangles.h" +#include "../polyakov_loop.h" +#include "../su3spinor.h" +#include "../solver/solver_field.h" + +#ifdef MPI + #include "../xchange.h" +#endif + +} + + + +#ifdef HAVE_CONFIG_H + #include +#endif + + +#ifdef MPI + #undef MPI + #undef REAL + #include + #define MPI + #define REAL float +#endif + +#include "MACROS.cuh" +#include "cublasWrapper.cuh" + + + +int g_numofgpu; + +template +struct MixedsolveParameter +{//internal variables of mixed solver routine corresponding to fields 
on the device + #ifdef GF_8 + dev_su3_8M(RealT)* dev_gf; + dev_su3_8M(RealT)* h2d_gf; + #else + dev_su3_2vM(RealT)* dev_gf; + dev_su3_2vM(RealT)* h2d_gf; + #endif + + #ifndef HALF + dev_spinorM(RealT)* dev_spin1; + dev_spinorM(RealT)* dev_spin2; + dev_spinorM(RealT)* dev_spin3; + dev_spinorM(RealT)* dev_spin4; + dev_spinorM(RealT)* dev_spin5; + dev_spinorM(RealT)* dev_spinin; + dev_spinorM(RealT)* dev_spinout; + dev_spinorM(RealT)* h2d_spin; + + //additional spinors for even-odd + dev_spinorM(RealT)* dev_spin_eo1; + dev_spinorM(RealT)* dev_spin_eo2; + #else + + dev_spinor_half* dev_spin1; + dev_spinor_half* dev_spin2; + dev_spinor_half* dev_spin3; + dev_spinor_half* dev_spin4; + dev_spinor_half* dev_spin5; + dev_spinor_half* dev_spinin; + dev_spinor_half* dev_spinout; + dev_spinor_half* h2d_spin; + //additional spinors for even-odd + dev_spinor_half* dev_spin_eo1; + dev_spinor_half* dev_spin_eo2; + + + RealT* dev_spin1_norm; + RealT* dev_spin2_norm; + RealT* dev_spin3_norm; + RealT* dev_spin4_norm; + RealT* dev_spin5_norm; + RealT* dev_spinin_norm; + RealT* dev_spinout_norm; + RealT* h2d_spin_norm; + + RealT* dev_spin_eo1_norm; + RealT* dev_spin_eo2_norm; + + + // a half precsion gauge field + #ifdef GF_8 + dev_su3_8_half* dev_gf_half; + #else + dev_su3_2v_half* dev_gf_half; + #endif + #endif + + + + // selects global instance of this structure depending on template parameter RealT to determine precision + static MixedsolveParameter* getGlobalP(); +}; +MixedsolveParameter mixedsolveParameter ; +MixedsolveParameter mixedsolveParameterD; + +template inline MixedsolveParameter* MixedsolveParameter::getGlobalP() { printf("WARNING: MixedsolveParameter::getGlobal() called with invalid template argument.\n"); return NULL; } +template< > inline MixedsolveParameter* MixedsolveParameter::getGlobalP() { return &mixedsolveParameter ; } +template< > inline MixedsolveParameter* MixedsolveParameter::getGlobalP() { return &mixedsolveParameterD; } + + +//{ + int * nn; + int * 
nn_eo; + int * nn_oe; + int * eoidx_even; + int * eoidx_odd; + + int * dev_nn; + int * dev_nn_eo; + int * dev_nn_oe; + + int * dev_eoidx_even; + int * dev_eoidx_odd; + + + size_t output_size; + int* dev_grid; + float* dev_output; + + + REALD hostr; + REALD hostkappa; + REALD hostm; + REALD hostmu; +//} + + +int havedevice = 0; + + +__device__ REAL m; +__device__ REAL mu; +__device__ REAL r=1.0; // this is implicitly assumed to be 1.0 in the host code!!! +__device__ REAL kappa; +__device__ REAL twokappamu; + +__device__ dev_complex dev_k0; +__device__ dev_complex dev_k1; +__device__ dev_complex dev_k2; +__device__ dev_complex dev_k3; + +__device__ dev_complex dev_mk0; +__device__ dev_complex dev_mk1; +__device__ dev_complex dev_mk2; +__device__ dev_complex dev_mk3; + + + +__constant__ __device__ dev_complex dev_k0c; +__constant__ __device__ dev_complex dev_k1c; +__constant__ __device__ dev_complex dev_k2c; +__constant__ __device__ dev_complex dev_k3c; + +__constant__ __device__ dev_complex dev_mk0c; +__constant__ __device__ dev_complex dev_mk1c; +__constant__ __device__ dev_complex dev_mk2c; +__constant__ __device__ dev_complex dev_mk3c; + + + +__device__ int dev_LX,dev_LY,dev_LZ,dev_T,dev_VOLUME; + + + + + + + + + + + + + +// include files with other GPU code as all GPU code has to reside in one file +// the texture references and functions +#include "textures.cuh" +// if we want to use half precision +#ifdef HALF + #include "half.cuh" +#endif +// linear algebra functions and gamma-multiplications +#include "linalg.cuh" +// reconstruction of the gauge field +#include "gauge_reconstruction.cuh" +// the device su3 functions +#include "su3.cuh" +// the plaquette and rectangle routines +#include "observables.cuh" + + + + +#ifdef MPI + + +// from mixed_solve_eo_nd.cuh +__device__ int dev_RAND; // not used, maybe later ... 
+__device__ int dev_VOLUMEPLUSRAND; // is now used in dev_Hopping_Matrix_mpi() +__device__ int dev_rank; +__device__ int dev_nproc; + + + #ifndef ALTERNATE_FIELD_XCHANGE + spinor * spinor_xchange; // for xchange_field_wrapper() + #else + dev_spinor * R1; + dev_spinor * R2; + dev_spinor * R3; + dev_spinor * R4; + #endif + + +#if ASYNC > 0 + int nStreams = ASYNC_OPTIMIZED; + cudaStream_t stream[2*ASYNC_OPTIMIZED+1]; + + #ifndef HALF + dev_spinor * RAND1; // for exchanging the boundaries in ASYNC.cuh + dev_spinor * RAND2; + dev_spinor * RAND3; // page-locked memory + dev_spinor * RAND4; + #else + dev_spinor_half * RAND1; // for exchanging the boundaries in ASYNC.cuh + dev_spinor_half * RAND2; + dev_spinor_half * RAND3; // page-locked memory + dev_spinor_half * RAND4; + //we also need page-locked norms + float * RAND1_norm; + float * RAND2_norm; + float * RAND3_norm; + float * RAND4_norm; + #endif +#endif + + + +#if defined(ALTERNATE_FIELD_XCHANGE) || defined(ASYNC_OPTIMIZED) + MPI_Status stat[2]; + MPI_Request send_req[2]; + MPI_Request recv_req[2]; +#endif + + +#define EXTERN extern + // taken from global.h +EXTERN MPI_Status status; +EXTERN MPI_Request req1,req2,req3,req4; +EXTERN MPI_Comm g_cart_grid; +EXTERN MPI_Comm g_mpi_time_slices; +EXTERN MPI_Comm g_mpi_SV_slices; +EXTERN MPI_Comm g_mpi_z_slices; +EXTERN MPI_Comm g_mpi_ST_slices; + +/* the next neighbours for MPI */ +EXTERN int g_nb_x_up, g_nb_x_dn; +EXTERN int g_nb_y_up, g_nb_y_dn; +EXTERN int g_nb_t_up, g_nb_t_dn; +EXTERN int g_nb_z_up, g_nb_z_dn; + +#endif //MPI + + + +// the device Hopping_Matrix +#include "Hopping_Matrix.cuh" +// the non-EO twisted mass dirac operator +#include "tm_diracoperator.cuh" +// mixed solver, even/odd, non-degenerate two flavour +#include "mixed_solve_eo_nd.cuh" + +#ifdef MPI +// optimization of the communication + #include "ASYNC.cuh" +#endif + + + + + +#ifndef HALF +// computes sout = 1/(1 +- mutilde gamma5) sin = (1 -+ i mutilde gamma5)/(1+mutilde^2) sin +// mutilde = 2 kappa 
mu +// works on the per-thread array slocal[6] (declared without __shared__); one thread handles the site pos = threadIdx.x + blockDim.x*blockIdx.x +template +__global__ void dev_mul_one_pm_imu_inv(dev_spinorM(RealT)* sin, dev_spinorM(RealT)* sout, const RealT sign){ + dev_spinorM(RealT) slocal[6]; + //need the inverse sign in the numerator because of inverse + dev_complexM(RealT) pm_imu = dev_initcomplex(0.0,-1.0*sign*twokappamu); + + RealT one_plus_musquare_inv = 1.0/(1.0 + twokappamu*twokappamu); + int pos; + pos= threadIdx.x + blockDim.x*blockIdx.x; + //not referenced: int ix = threadIdx.x; + if(pos < dev_VOLUME){ + //dev_skalarmult_spinor(&(sin[6*pos]), pm_imu, &(slocal[0])); + //dev_Gamma5(&(slocal[0])); + dev_skalarmult_gamma5_spinor(&(slocal[0]), pm_imu, &(sin[6*pos]) ); + dev_add_spinor_assign(&(slocal[0]), &(sin[6*pos])); + //dev_realmult_spinor(&(slocal[0]), one_plus_musquare_inv); + //dev_copy_spinor(&(slocal[0]), &(sout[6*pos])); + dev_realmult_spinor_assign(&(sout[6*pos]), one_plus_musquare_inv, &(slocal[0]) ); + } +} + + + + + +// sout = gamma_5*((1\pm i\mutilde \gamma_5)*sin1 - sin2) +// works on the per-thread array slocal[6] (declared without __shared__); threads with pos >= dev_VOLUME do nothing +template +__global__ void dev_mul_one_pm_imu_sub_mul_gamma5(dev_spinorM(RealT)* sin1, dev_spinorM(RealT)* sin2, dev_spinorM(RealT)* sout, const RealT sign){ + dev_spinorM(RealT) slocal[6]; + dev_complexM(RealT) pm_imu = dev_initcomplex(0.0, sign*twokappamu); // i mutilde + int pos; + pos= threadIdx.x + blockDim.x*blockIdx.x; + //not referenced: int ix = threadIdx.x; + if(pos < dev_VOLUME){ + //dev_skalarmult_spinor(&(sin1[6*pos]), pm_imu, &(slocal[0])); + //dev_Gamma5(&(slocal[0])); + dev_skalarmult_gamma5_spinor(&(slocal[0]),pm_imu,&(sin1[6*pos])); + dev_add_spinor_assign(&(slocal[0]), &(sin1[6*pos])); + dev_sub_spinor_assign(&(slocal[0]), &(sin2[6*pos])); + //dev_Gamma5(&(slocal[0])); + //dev_copy_spinor(&(slocal[0]), &(sout[6*pos])); + dev_Gamma5_assign(&(sout[6*pos]), &(slocal[0])); + } +} + + + + + + + + + + + + + +// equivalent to Qtm_pm_psi in tm_operators.c: the Q_{-} part runs Hopping, mul_one_pm_imu_inv(-1.), Hopping, mul_one_pm_imu_sub_mul_gamma5(-1.); the Q_{+} part repeats the four steps with sign 1. and writes spinout; dev_spin_eo1 and dev_spin_eo2 hold the intermediates +template +void dev_Qtm_pm_psi(dev_spinorM(RealT)* 
spinin, dev_spinorM(RealT)* spinout, int gridsize, int blocksize, int gridsize2, int blocksize2, MixedsolveParameter& mixedsolveParameter){ + //spinin == odd + //spinout == odd + + //Q_{-} + #ifdef MPI + xchange_field_wrapper(spinin, 0); + #endif + #ifdef USETEXTURE + bind_texture_spin(spinin,1); + #endif + //bind_texture_nn(dev_nn_eo); + //cudaFuncSetCacheConfig(dev_Hopping_Matrix, cudaFuncCachePreferL1); + dev_Hopping_Matrix <<>> + (mixedsolveParameter.dev_gf, spinin, mixedsolveParameter.dev_spin_eo1, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); //mixedsolveParameter.dev_spin_eo1 == even -> 0 + //unbind_texture_nn(); + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + dev_mul_one_pm_imu_inv <<>>(mixedsolveParameter.dev_spin_eo1,mixedsolveParameter.dev_spin_eo2, -1.); + + #ifdef MPI + xchange_field_wrapper(mixedsolveParameter.dev_spin_eo2, 1); + #endif + #ifdef USETEXTURE + bind_texture_spin(mixedsolveParameter.dev_spin_eo2,1); + #endif + //bind_texture_nn(dev_nn_oe); + //cudaFuncSetCacheConfig(dev_Hopping_Matrix, cudaFuncCachePreferL1); + dev_Hopping_Matrix <<>> + (mixedsolveParameter.dev_gf, mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo1, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + //unbind_texture_nn(); + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + dev_mul_one_pm_imu_sub_mul_gamma5 <<>>(spinin, mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo2, -1.); + + + //Q_{+} + #ifdef MPI + xchange_field_wrapper(mixedsolveParameter.dev_spin_eo2, 0); + #endif + #ifdef USETEXTURE + bind_texture_spin(mixedsolveParameter.dev_spin_eo2,1); + #endif + //bind_texture_nn(dev_nn_eo); + //cudaFuncSetCacheConfig(dev_Hopping_Matrix, cudaFuncCachePreferL1); + dev_Hopping_Matrix <<>> + (mixedsolveParameter.dev_gf, mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo1, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); //mixedsolveParameter.dev_spin_eo1 == even -> 0 + //unbind_texture_nn(); + #ifdef USETEXTURE + 
unbind_texture_spin(1); + #endif + dev_mul_one_pm_imu_inv <<>>(mixedsolveParameter.dev_spin_eo1,spinout, +1.); + + #ifdef MPI + xchange_field_wrapper(spinout, 1); + #endif + #ifdef USETEXTURE + bind_texture_spin(spinout,1); + #endif + //bind_texture_nn(dev_nn_oe); + //cudaFuncSetCacheConfig(dev_Hopping_Matrix, cudaFuncCachePreferL1); + dev_Hopping_Matrix <<>> + (mixedsolveParameter.dev_gf, spinout, mixedsolveParameter.dev_spin_eo1, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + //unbind_texture_nn(); + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + dev_mul_one_pm_imu_sub_mul_gamma5 <<>>(mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo1, spinout , +1.); +} + + + + +#ifdef MPI +// equivalent to Qtm_pm_psi in tm_operators.c; same step order as dev_Qtm_pm_psi +// using HOPPING_ASYNC for mpi (it replaces the xchange_field_wrapper call plus dev_Hopping_Matrix launch of the non-MPI variant) +template +void dev_Qtm_pm_psi_mpi(dev_spinorM(RealT)* spinin, dev_spinorM(RealT)* spinout, int gridsize, int blocksize, int gridsize2, int blocksize2,MixedsolveParameter& mixedsolveParameter){ + //spinin == odd + //spinout == odd + + //Q_{-} + + //cudaFuncSetCacheConfig(dev_Hopping_Matrix, cudaFuncCachePreferL1); + HOPPING_ASYNC(mixedsolveParameter.dev_gf, spinin, mixedsolveParameter.dev_spin_eo1, dev_eoidx_even, + dev_eoidx_odd, dev_nn_eo, 0,gridsize, blocksize); //mixedsolveParameter.dev_spin_eo1 == even -> 0 + + + + dev_mul_one_pm_imu_inv <<>>(mixedsolveParameter.dev_spin_eo1,mixedsolveParameter.dev_spin_eo2, -1.); + + + + + //cudaFuncSetCacheConfig(dev_Hopping_Matrix, cudaFuncCachePreferL1); + HOPPING_ASYNC(mixedsolveParameter.dev_gf, mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo1, + dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1,gridsize, + blocksize); + + dev_mul_one_pm_imu_sub_mul_gamma5 <<>>(spinin, mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo2, -1.); + + + //Q_{+} + + //cudaFuncSetCacheConfig(dev_Hopping_Matrix, cudaFuncCachePreferL1); + HOPPING_ASYNC(mixedsolveParameter.dev_gf, mixedsolveParameter.dev_spin_eo2, 
mixedsolveParameter.dev_spin_eo1, + dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0, gridsize, + blocksize); //mixedsolveParameter.dev_spin_eo1 == even -> 0 + + dev_mul_one_pm_imu_inv <<>>(mixedsolveParameter.dev_spin_eo1,spinout, +1.); + + + //cudaFuncSetCacheConfig(dev_Hopping_Matrix, cudaFuncCachePreferL1); + HOPPING_ASYNC(mixedsolveParameter.dev_gf, spinout, mixedsolveParameter.dev_spin_eo1, dev_eoidx_odd, + dev_eoidx_even, dev_nn_oe, 1,gridsize, blocksize); + + dev_mul_one_pm_imu_sub_mul_gamma5 <<>>(mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo1, spinout , +1.); +} +#endif + + + + +#else // HALF + +// computes sout = 1/(1 +- mutilde gamma5) sin = (1 -+ i mutilde gamma5)/(1+mutilde^2) sin +// mutilde = 2 kappa mu +// works on the per-thread arrays slocal[6] and s[6] (declared without __shared__); the input is expanded with construct_spinor_fromhalf and the result is stored with dev_write_spinor_half +__global__ void dev_mul_one_pm_imu_inv_half(dev_spinor_half* sin, float* sin_norm, dev_spinor_half* sout, float* sout_norm, const REAL sign){ + + typedef REAL RealT; + dev_spinor slocal[6]; + dev_spinor s[6]; + float norm; + + //need the inverse sign in the numerator because of inverse + dev_complex pm_imu = dev_initcomplex(0.0,-1.0*sign*twokappamu); + + REAL one_plus_musquare_inv = 1.0/(1.0 + twokappamu*twokappamu); + int pos; + pos= threadIdx.x + blockDim.x*blockIdx.x; + int ix = threadIdx.x; + if(pos < dev_VOLUME){ + norm = sin_norm[pos]; + construct_spinor_fromhalf(s, sin, norm, pos); + + dev_skalarmult_gamma5_spinor(&(slocal[0]), pm_imu, &(s[0]) ); + dev_add_spinor_assign(&(slocal[0]), &(s[0])); + + dev_realmult_spinor_assign(&(s[0]), one_plus_musquare_inv, &(slocal[0]) ); + + dev_write_spinor_half(&(s[0]),&(sout[6*pos]), &(sout_norm[pos])); + } +} + + + + + +// sout = gamma_5*((1\pm i\mutilde \gamma_5)*sin1 - sin2) +// works on the per-thread arrays slocal[6], s1[6] and s2[6] (declared without __shared__); both inputs are expanded with construct_spinor_fromhalf using their per-site norms +__global__ void dev_mul_one_pm_imu_sub_mul_gamma5_half(dev_spinor_half* sin1, float* sin1_norm, dev_spinor_half* sin2, float* sin2_norm, dev_spinor_half* sout, float* sout_norm, const REAL sign){ + typedef REAL RealT; + 
dev_spinor slocal[6]; + dev_spinor s1[6]; + dev_spinor s2[6]; + float norm; + dev_complex pm_imu = dev_initcomplex(0.0, sign*twokappamu); // i mutilde + int pos; + pos= threadIdx.x + blockDim.x*blockIdx.x; + int ix = threadIdx.x; + if(pos < dev_VOLUME){ + norm = sin1_norm[pos]; + construct_spinor_fromhalf(s1, sin1,norm, pos); + norm = sin2_norm[pos]; + construct_spinor_fromhalf(s2, sin2, norm, pos); + + dev_skalarmult_gamma5_spinor(&(slocal[0]),pm_imu,&(s1[0])); + dev_add_spinor_assign(&(slocal[0]), &(s1[0])); + dev_sub_spinor_assign(&(slocal[0]), &(s2[0])); + dev_Gamma5_assign(&(s1[0]), &(slocal[0])); + dev_write_spinor_half(&(s1[0]),&(sout[6*pos]), &(sout_norm[pos])); + } +} + + + + + +// equivalent to Qtm_pm_psi in tm_operators.c for half precision; every spinor is passed together with its norm array, and the file-scope object mixedsolveParameter supplies the gauge field and the dev_spin_eo1/dev_spin_eo2 intermediates +extern "C" void dev_Qtm_pm_psi_half(dev_spinor_half* spinin, float* spinin_norm, dev_spinor_half* spinout, float* spinout_norm, int gridsize, int blocksize, int gridsize2, int blocksize2){ + //spinin == odd + //spinout == odd + + //Q_{-} + #ifdef USETEXTURE + bind_halfspinor_texture(spinin, spinin_norm); + #endif + //cudaFuncSetCacheConfig(dev_Hopping_Matrix_half, cudaFuncCachePreferL1); + dev_Hopping_Matrix_half<<>> + (mixedsolveParameter.dev_gf_half, spinin, spinin_norm, mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); //mixedsolveParameter.dev_spin_eo1 == even -> 0 + #ifdef USETEXTURE + unbind_halfspinor_texture(); + #endif + dev_mul_one_pm_imu_inv_half<<>>(mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm ,mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo2_norm, -1.); + + #ifdef USETEXTURE + bind_halfspinor_texture(mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo2_norm); + #endif + //cudaFuncSetCacheConfig(dev_Hopping_Matrix_half, cudaFuncCachePreferL1); + dev_Hopping_Matrix_half<<>> + (mixedsolveParameter.dev_gf_half, mixedsolveParameter.dev_spin_eo2, 
mixedsolveParameter.dev_spin_eo2_norm, mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #ifdef USETEXTURE + unbind_halfspinor_texture(); + #endif + dev_mul_one_pm_imu_sub_mul_gamma5_half<<>>(spinin, spinin_norm, mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm, mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo2_norm, -1.); + + //Q_{+} + #ifdef USETEXTURE + bind_halfspinor_texture(mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo2_norm); + #endif + //cudaFuncSetCacheConfig(dev_Hopping_Matrix_half, cudaFuncCachePreferL1); + dev_Hopping_Matrix_half<<>> + (mixedsolveParameter.dev_gf_half, mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo2_norm, mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); //mixedsolveParameter.dev_spin_eo1 == even -> 0 + #ifdef USETEXTURE + unbind_halfspinor_texture(); + #endif + dev_mul_one_pm_imu_inv_half<<>>(mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm,spinout, spinout_norm, +1.); + + #ifdef USETEXTURE + bind_halfspinor_texture(spinout, spinout_norm); + #endif + //cudaFuncSetCacheConfig(dev_Hopping_Matrix_half, cudaFuncCachePreferL1); + dev_Hopping_Matrix_half<<>> + (mixedsolveParameter.dev_gf_half, spinout, spinout_norm, mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #ifdef USETEXTURE + unbind_halfspinor_texture(); + #endif + dev_mul_one_pm_imu_sub_mul_gamma5_half<<>>(mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo2_norm, mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm, spinout, spinout_norm , +1.); +} + + +#ifdef MPI + +// equivalent to Qtm_pm_psi in tm_operators.c for half precision; same step order as dev_Qtm_pm_psi_half with HOPPING_HALF_ASYNC in place of the texture binding and dev_Hopping_Matrix_half launch +extern "C" void dev_Qtm_pm_psi_half_mpi(dev_spinor_half* spinin, float* spinin_norm, dev_spinor_half* spinout, 
float* spinout_norm, int gridsize, int blocksize, int gridsize2, int blocksize2){ + //spinin == odd + //spinout == odd + + //Q_{-} + HOPPING_HALF_ASYNC(mixedsolveParameter.dev_gf_half, spinin, spinin_norm, mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0,gridsize, blocksize); //mixedsolveParameter.dev_spin_eo1 == even -> 0 + + dev_mul_one_pm_imu_inv_half<<>>(mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm ,mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo2_norm, -1.); + + + HOPPING_HALF_ASYNC(mixedsolveParameter.dev_gf_half, mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo2_norm, mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1,gridsize, blocksize); + + dev_mul_one_pm_imu_sub_mul_gamma5_half<<>>(spinin, spinin_norm, mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm, mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo2_norm, -1.); + + //Q_{+} + HOPPING_HALF_ASYNC (mixedsolveParameter.dev_gf_half, mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo2_norm, mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0,gridsize, blocksize); //mixedsolveParameter.dev_spin_eo1 == even -> 0 + + dev_mul_one_pm_imu_inv_half<<>>(mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm,spinout, spinout_norm, +1.); + + HOPPING_HALF_ASYNC (mixedsolveParameter.dev_gf_half, spinout, spinout_norm, mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1,gridsize, blocksize); + + dev_mul_one_pm_imu_sub_mul_gamma5_half<<>>(mixedsolveParameter.dev_spin_eo2, mixedsolveParameter.dev_spin_eo2_norm, mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spin_eo1_norm, spinout, spinout_norm , +1.); +} +#endif // MPI + + 
+ + + +/* +extern "C" void dev_Qtm_pm_psi(dev_spinor* spinin, dev_spinor* spinout, int gridsize, int blocksize, int gridsize2, int blocksize2){ + + printf("WARNING: dummy function 'dev_Qtm_pm_psi' was called\n"); + +} +*/ + + + + + +#endif //HALF + + + +/* Field kernel: the thread with index pos = threadIdx.x + blockDim.x*blockIdx.x calls dev_zero_spinor on the 6-element site block starting at s1[6*pos]; threads with pos >= dev_VOLUME do nothing. */ template +__global__ void dev_zero_spinor_field(typename dev_spinorT::type* s1){ + int pos; + pos= threadIdx.x + blockDim.x*blockIdx.x; + if(pos < dev_VOLUME){ + dev_zero_spinor(&(s1[6*pos])); + } +} + + + + +/* Field kernel: per-site call of dev_copy_spinor(s1 site, s2 site); s1 and s2 may have different real types. Presumably s1 is the source and s2 the destination (dev_cg passes (spin1, spinout) to export its result) - confirm against dev_copy_spinor. */ template +__global__ void dev_copy_spinor_field(dev_spinorM(RealT1)* s1, dev_spinorM(RealT2)* s2){ + int pos; + pos= threadIdx.x + blockDim.x*blockIdx.x; + if(pos < dev_VOLUME){ + dev_copy_spinor(&(s1[6*pos]),&(s2[6*pos])); + } +} + + + +/* Field kernel: per-site call of dev_skalarmult_add_assign_spinor(s1, lambda, s2, so) with a real lambda. Presumably so = s1 + lambda*s2 - TODO confirm against the helper's definition. */ template +__global__ void dev_skalarmult_add_assign_spinor_field(dev_spinorM(RealT)* s1, RealT lambda, dev_spinorM(RealT)* s2, dev_spinorM(RealT)* so){ + int pos; + pos= threadIdx.x + blockDim.x*blockIdx.x; + if(pos < dev_VOLUME){ + dev_skalarmult_add_assign_spinor(&(s1[6*pos]), lambda ,&(s2[6*pos]), &(so[6*pos]) ); + } +} + + + +/* Field kernel: per-site call of dev_skalarmult_spinor(s1, c, so), where the real lambda is promoted to the complex value c = dev_initcomplex(lambda, 0.0). */ template +__global__ void dev_skalarmult_spinor_field(dev_spinorM(RealT)* s1, RealT lambda, dev_spinorM(RealT)* so){ + int pos; + pos= threadIdx.x + blockDim.x*blockIdx.x; + if(pos < dev_VOLUME){ + dev_skalarmult_spinor(&(s1[6*pos]), dev_initcomplex(lambda,0.0) , &(so[6*pos]) ); + } +} + + + +/* Field kernel: same as dev_skalarmult_spinor_field but lambda is already complex and is passed to dev_skalarmult_spinor unchanged. */ template +__global__ void dev_complexmult_spinor_field(dev_spinorM(RealT)* s1, dev_complexM(RealT) lambda, dev_spinorM(RealT)* so){ + int pos; + pos= threadIdx.x + blockDim.x*blockIdx.x; + if(pos < dev_VOLUME){ + dev_skalarmult_spinor(&(s1[6*pos]), lambda , &(so[6*pos]) ); + } +} + + + + + + +// init the gpu inner solver, assign constants etc. 
+__global__ void he_cg_init (int* grid, REAL param_kappa, REAL param_mu, dev_complex k0, dev_complex k1, dev_complex k2, dev_complex k3){ /* Copies grid[0..4] into dev_LX, dev_LY, dev_LZ, dev_T, dev_VOLUME, stores kappa, mu and twokappamu = 2*kappa*mu, and fills dev_k0..dev_k3 plus their negated copies dev_mk0..dev_mk3. Callers in this file launch it with a single thread (1, 1). */ + dev_LX = grid[0]; + dev_LY = grid[1]; + dev_LZ = grid[2]; + dev_T = grid[3]; + dev_VOLUME = grid[4]; // grid[4] is initialized 1/2 VOLUME for eo + + kappa = param_kappa; + mu = param_mu; + twokappamu = 2.0*param_kappa*param_mu; + + dev_k0.re = k0.re; + dev_k0.im = k0.im; + dev_mk0.re = -k0.re; + dev_mk0.im = -k0.im; + + dev_k1.re = k1.re; + dev_k1.im = k1.im; + dev_mk1.re = -k1.re; + dev_mk1.im = -k1.im; + + dev_k2.re = k2.re; + dev_k2.im = k2.im; + dev_mk2.re = -k2.re; + dev_mk2.im = -k2.im; + + dev_k3.re = k3.re; + dev_k3.im = k3.im; + dev_mk3.re = -k3.re; + dev_mk3.im = -k3.im; +} + + + + + +// init the gpu, assign dimensions: copies grid[0..4] into dev_LX, dev_LY, dev_LZ, dev_T, dev_VOLUME and touches nothing else +__global__ void dev_init_grid (int* grid){ + dev_LX = grid[0]; + dev_LY = grid[1]; + dev_LZ = grid[2]; + dev_T = grid[3]; + dev_VOLUME = grid[4]; // grid[4] is initialized 1/2 VOLUME for eo +} + + + + + +// code to list available devices, not yet included in main code +// this is copied from the CUDA sdk; prints the properties of every device (under MPI only when g_cart_id == 0) and returns the device count on every rank +extern "C" int find_devices() { + + int deviceCount, dev; + + cudaGetDeviceCount(&deviceCount); + + #ifdef MPI + if (g_cart_id == 0) { + #endif + + if (deviceCount == 0) + printf("There is no device supporting CUDA\n"); + for (dev = 0; dev < deviceCount; ++dev) { + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, dev); + if (dev == 0) { + if (deviceProp.major == 9999 && deviceProp.minor == 9999) + printf("There is no device supporting CUDA.\n"); + else if (deviceCount == 1) + printf("There is 1 device supporting CUDA\n"); + else + printf("There are %d devices supporting CUDA\n", deviceCount); + } + printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name); + printf(" Major revision number: %d\n", + deviceProp.major); + printf(" Minor revision number: %d\n", + deviceProp.minor); + printf(" Total amount of global memory: %u bytes\n", + deviceProp.totalGlobalMem); /* NOTE(review): this and the other byte-sized fields below are printed with %u - confirm the field types are not wider than unsigned int */ + #if CUDART_VERSION >= 
2000 + printf(" Number of multiprocessors: %d\n", + deviceProp.multiProcessorCount); + printf(" Number of cores: %d\n", + 8 * deviceProp.multiProcessorCount); /* NOTE(review): hard-coded factor of 8 cores per multiprocessor - confirm for the target GPUs */ + #endif + printf(" Total amount of constant memory: %u bytes\n", + deviceProp.totalConstMem); + printf(" Total amount of shared memory per block: %u bytes\n", + deviceProp.sharedMemPerBlock); + printf(" Total number of registers available per block: %d\n", + deviceProp.regsPerBlock); + printf(" Warp size: %d\n", + deviceProp.warpSize); + printf(" Maximum number of threads per block: %d\n", + deviceProp.maxThreadsPerBlock); + printf(" Maximum sizes of each dimension of a block: %d x %d x %d\n", + deviceProp.maxThreadsDim[0], + deviceProp.maxThreadsDim[1], + deviceProp.maxThreadsDim[2]); + printf(" Maximum sizes of each dimension of a grid: %d x %d x %d\n", + deviceProp.maxGridSize[0], + deviceProp.maxGridSize[1], + deviceProp.maxGridSize[2]); + printf(" Maximum memory pitch: %u bytes\n", + deviceProp.memPitch); + printf(" Texture alignment: %u bytes\n", + deviceProp.textureAlignment); + printf(" Clock rate: %.2f GHz\n", + deviceProp.clockRate * 1e-6f); + #if CUDART_VERSION >= 
"Yes" : "No"); + #endif + } + + #ifdef MPI + } + #endif + + return(deviceCount); +} + + + + + + + + + +extern "C" void test_operator(dev_su3_2v * gf,dev_spinor* spinin, dev_spinor* spinout, +dev_spinor* spin0, dev_spinor* spin1, dev_spinor* spin2, dev_spinor* spin3, dev_spinor* spin4, int *grid, int * nn_grid, REAL* output,REAL* erg, int xsize, int ysize){ /* Test driver: initialises the device constants with he_cg_init, writes spinin scaled by scaleparam*scaleparam = 1/(2*hostkappa) into spin4, then applies dev_tm_dirac_kappa to spin4 with the result in spinout. spin0..spin3, output, erg, xsize and ysize are not referenced in the visible body. */ + + typedef REAL RealT; + int gridsize; + + dim3 blockdim(1,1); + dim3 blockdim2(128,1,1); + if( VOLUME >= 128){ + gridsize =VOLUME/128; /* NOTE(review): rounds down, unlike the BLOCK-based grid size below - confirm VOLUME is a multiple of 128 */ + } + else{ + gridsize=1; + } + dim3 griddim2(gridsize,1,1); + + + dim3 blockdim3(BLOCK,1,1); + if( VOLUME >= BLOCK){ + gridsize = (int) VOLUME/BLOCK + 1; + } + else{ + gridsize=1; + } + dim3 griddim3(gridsize,1,1); + + + dev_complex h0,h1,h2,h3; + h0.re = (REAL)ka0.re; h0.im = (REAL)ka0.im; + h1.re = (REAL)ka1.re; h1.im = (REAL)ka1.im; + h2.re = (REAL)ka2.re; h2.im = (REAL)ka2.im; + h3.re = (REAL)ka3.re; h3.im = (REAL)ka3.im; + he_cg_init<<< 1, 1 >>> (grid, (REAL) g_kappa, (REAL)(g_mu/(2.0*g_kappa)), h0,h1,h2,h3); + + + REAL scaleparam = sqrt(1.0/(2.0 * (REAL) hostkappa)); + dev_skalarmult_spinor_field<<>>(spinin,scaleparam*scaleparam, spin4); + + #ifdef USETEXTURE + bind_texture_gf(gf); + bind_texture_spin(spin4,1); + #endif + // apply D_tm + dev_tm_dirac_kappa <<>>(gf, spin4, spinout, nn_grid); + + #ifdef USETEXTURE + unbind_texture_gf(); + unbind_texture_spin(1); + #endif +} + + + + + +// this is the eo version of the device cg inner solver +// we invert the hermitean D_tm D_tm^{+} + + +/* +/// member definition of CG-interface class /// + +templateclass MixedsolveOperator // interface class +{ +public: + virtual ~MixedsolveOperator() { } + + virtual void gpuInit (dev_spinorM(RealT)* spinin,dev_spinorM(RealT)* spinTmp,dev_spinorM(RealT)* spinout,dev_su3_2vM(RealT)* gf,int* dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim) { } + virtual void gpu (dev_spinorM(RealT)* spinin,dev_spinorM(RealT)* spinTmp,dev_spinorM(RealT)* spinout,dev_su3_2vM(RealT)* gf,int* 
dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim) =0; + virtual void gpuDeinit(dev_spinorM(RealT)* spininout,dev_spinorM(RealT)* spinTmp,dev_su3_2vM(RealT)* gf,int* dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim,const RealT scaleparam) { } + + virtual void checkInit (spinor* const spinin,spinor* const spinTmp,spinor* const spinout,int Volume) { } + virtual void check (spinor* const conjungateBasisPSpinin,spinor* const spinout,const int Volume) =0; + virtual void checkDeinit(spinor* const spinin,spinor* const spinTmp,spinor* const spinout,int Volume) { } +}; +*/ +templateMixedsolveOperator::~MixedsolveOperator() { } + +templatevoid MixedsolveOperator::gpuInit (dev_spinorM(RealT)* spinin,dev_spinorM(RealT)* spinTmp,dev_spinorM(RealT)* spinout,dev_su3_2vM(RealT)* gf,int* dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim) { } +templatevoid MixedsolveOperator::gpuDeinit(dev_spinorM(RealT)* spininout,dev_spinorM(RealT)* spinTmp,dev_su3_2vM(RealT)* gf,int* dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim,const RealT scaleparam) { } + +templatevoid MixedsolveOperator::checkInit (spinor* const spinin,spinor* const spinTmp,spinor* const spinout,int Volume) { } +templatevoid MixedsolveOperator::checkDeinit(spinor* const spinin,spinor* const spinTmp,spinor* const spinout,int Volume) { } + + + +templateclass MixedsolveOperatorT> +int dev_cg( + dev_su3_2vM(RealT)* gf, + dev_spinorM(RealT)* spinin, + dev_spinorM(RealT)* spinout, + dev_spinorM(RealT)* spin0, + dev_spinorM(RealT)* spin1, + dev_spinorM(RealT)* spin2, + dev_spinorM(RealT)* spin3, + dev_spinorM(RealT)* spin4, + int* grid, int* nn_grid, MixedsolveOperatorT& mixedsolveOperator, + REALD initial_sourcesquarenorm,bool rel_prec,double finalEps/*,bool& reachedFinalPrecision*/){ + + + RealT host_alpha, host_beta, host_dotprod, host_rk, sourcesquarenorm; + RealT * dotprod, * dotprod2, * rk, * alpha, *beta; + + + cudaError_t cudaerr; + int i, gridsize; + int maxit = 
max_innersolver_it; + RealT eps = (RealT) innersolver_precision; + int N_recalcres = 30; // after N_recalcres iterations calculate r = A x_k - b + + + // initialize grid and block, make sure VOLUME is a multiple of blocksize + if(VOLUME%DOTPROD_DIM != 0){ + printf("Error: VOLUME is not a multiple of DOTPROD_DIM. Aborting...\n"); + exit(100); + } + + // this is the partitioning for the copying of fields + dim3 blockdim(1,1); + dim3 blockdim2(128,1,1); + if( VOLUME >= 128){ + gridsize = (int) VOLUME/128 + 1; + } + else{ + gridsize=1; + } + dim3 griddim2(gridsize,1,1); + + mixedsolveOperator.gpuInit(spin2,spin4,spin3,gf,dev_nn,griddim2,blockdim2); + + + //Initialize some stuff + printf("mu = %f\n", g_mu); + dev_complexM(RealT) h0,h1,h2,h3; + h0.re = (RealT)ka0.re; h0.im = (RealT)ka0.im; + h1.re = (RealT)ka1.re; h1.im = (RealT)ka1.im; + h2.re = (RealT)ka2.re; h2.im = (RealT)ka2.im; + h3.re = (RealT)ka3.re; h3.im = (RealT)ka3.im; + he_cg_init<<< 1, 1 >>> (grid, (RealT) g_kappa, (RealT)(g_mu/(2.0*g_kappa)), h0,h1,h2,h3); + // BEWARE in dev_tm_dirac_kappa we need the true mu (not 2 kappa mu!) 
+ + // Init x,p,r for k=0 + // Allocate some numbers for host <-> device interaction + cudaMalloc((void **) &dotprod, sizeof(RealT)); + cudaMalloc((void **) &dotprod2, sizeof(RealT)); + cudaMalloc((void **) &rk, sizeof(RealT)); + cudaMalloc((void **) &alpha, sizeof(RealT)); + cudaMalloc((void **) &beta, sizeof(RealT)); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + + //init blas + cublasInit(); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + printf("have initialized cublas\n"); + + + // go over to kappa (if wanted) + RealT scaleparam = sqrt(1.0/(2.0 * (RealT)hostkappa)); + printf("1/2kappa = %.16f\n",scaleparam); + //dev_skalarmult_spinor_field<<>>(spinin,scaleparam, spin1); + //dev_copy_spinor_field<<>>(spin1, spinin); + + + dev_copy_spinor_field <<>>(spinin, spin0); + dev_zero_spinor_field <<>>(spin1); // x_0 = 0 + dev_copy_spinor_field <<>>(spinin, spin2); + dev_zero_spinor_field <<>>(spin3); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + + + + //relative precision -> get initial residue + sourcesquarenorm = cublasDot (24*VOLUME, (const RealT*)spinin, 1, (const RealT*)spinin, 1); + host_rk = sourcesquarenorm; //for use in main loop + printf("Squarenorm Source:\t%.16e\n", sourcesquarenorm); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + printf("Entering cg-loop\n"); + for(i=0;i=sizeof(REALD) && ((host_rk<=eps&&rel_prec==0) || (host_rk<=finalEps*initial_sourcesquarenorm&&rel_prec==1)) )//different from abort criterium some lines above: here we check wether we reached the final desired precision, which only works in double precision + {//the final precision is reached + printf("inner solver: Reached precision of eps=%.2e\n",( rel_prec==0 ? eps : finalEps )); + break;//escape innner solver if desired prec. is reached: should not happen with singele precision - here only the double prec. 
outer solver is reliable + } + }//recalculate residue + }//MAIN LOOP cg + + + printf("Final residue: %.6e\n",host_dotprod); + // x_result = spin1 ! + + mixedsolveOperator.gpuDeinit(spin1,spin4,gf,dev_nn,griddim2,blockdim2,scaleparam); + dev_copy_spinor_field <<>>(spin1,spinout); + + #ifdef USETEXTURE + unbind_texture_gf(); + #endif + cudaFree(dotprod); + cudaFree(dotprod2); + cudaFree(rk); + cudaFree(alpha); + cudaFree(beta); + cublasShutdown(); + return(i); +} + + + +template +void showspinor(dev_spinorM(RealT)* s){ + int i,j; + dev_spinor help[6]; + size_t size = 6*sizeof(dev_spinorM(RealT)); + + for(i=0; i +int dev_cg_eo( + dev_su3_2vM(RealT)* gf, + dev_spinorM(RealT)* spinin, + dev_spinorM(RealT)* spinout, + dev_spinorM(RealT)* spin0, + dev_spinorM(RealT)* spin1, + dev_spinorM(RealT)* spin2, + dev_spinorM(RealT)* spin3, + dev_spinorM(RealT)* spin4, + int* grid, int* nn_grid, RealT epsfinal, MixedsolveParameter& mixedsolveParameter){ + + + RealT host_alpha, host_beta, host_dotprod, host_rk, sourcesquarenorm; + RealT * dotprod, * dotprod2, * rk, * alpha, *beta; + + + + int i, gridsize; + int maxit = max_innersolver_it; + RealT eps = (RealT) innersolver_precision; + int N_recalcres = 40; // after N_recalcres iterations calculate r = A x_k - b + + cudaError_t cudaerr; + + // this is the partitioning for the copying of fields + dim3 blockdim(1,1); + //dim3 blockdim2(128,1,1); + + int blockdim2 = BLOCK3; + if( VOLUME/2 % blockdim2 == 0){ + gridsize = (int) VOLUME/2/blockdim2; + } + else{ + gridsize = (int) VOLUME/2/blockdim2 + 1; + } + int griddim2 = gridsize; + + + //this is the partitioning for the HoppingMatrix kernel + /* + int blockdim3=BLOCK; + if( VOLUME/2 >= BLOCK){ + gridsize = (int)(VOLUME/2/BLOCK) + 1; + } + else{ + gridsize=1; + } + int griddim3=gridsize; + */ + int blockdim3 = BLOCK; + if( VOLUME/2 % blockdim3 == 0){ + gridsize = (int) VOLUME/2/blockdim3; + } + else{ + gridsize = (int) VOLUME/2/blockdim3 + 1; + } + int griddim3 = gridsize; + + + if 
(g_proc_id == 0) { printf("gridsize = %d\nsizeof(Real) = %hi\n", gridsize, sizeof(RealT)); } + + + + //this is the partitioning for dev_mul_one_pm... + /* + int blockdim4=BLOCK2; + if( VOLUME/2 >= BLOCK2){ + gridsize = (int)(VOLUME/2/BLOCK2) + 1; + } + else{ + gridsize=1; + } + int griddim4=gridsize; + */ + int blockdim4 = BLOCK2; + if( VOLUME/2 % blockdim4 == 0){ + gridsize = (int) VOLUME/2/blockdim4; + } + else{ + gridsize = (int) VOLUME/2/blockdim4 + 1; + } + int griddim4 = gridsize; + + + //never referenced: size_t size2 = sizeof(dev_spinorM(RealT))*6*VOLUME/2; + + + //Initialize some stuff + + + if (g_proc_id == 0) printf("mu = %f\n", g_mu); + + + + + dev_complex h0,h1,h2,h3,mh0, mh1, mh2, mh3; + h0.re = (RealT)ka0.re; h0.im = -(RealT)ka0.im; + h1.re = (RealT)ka1.re; h1.im = -(RealT)ka1.im; + h2.re = (RealT)ka2.re; h2.im = -(RealT)ka2.im; + h3.re = (RealT)ka3.re; h3.im = -(RealT)ka3.im; + + mh0.re = -(RealT)ka0.re; mh0.im = (RealT)ka0.im; + mh1.re = -(RealT)ka1.re; mh1.im = (RealT)ka1.im; + mh2.re = -(RealT)ka2.re; mh2.im = (RealT)ka2.im; + mh3.re = -(RealT)ka3.re; mh3.im = (RealT)ka3.im; + + // try using constant mem for kappas + cudaMemcpyToSymbol("dev_k0c", &h0, sizeof(h0)) ; + cudaMemcpyToSymbol("dev_k1c", &h1, sizeof(h1)) ; + cudaMemcpyToSymbol("dev_k2c", &h2, sizeof(h2)) ; + cudaMemcpyToSymbol("dev_k3c", &h3, sizeof(h3)) ; + + cudaMemcpyToSymbol("dev_mk0c", &mh0, sizeof(mh0)) ; + cudaMemcpyToSymbol("dev_mk1c", &mh1, sizeof(mh1)) ; + cudaMemcpyToSymbol("dev_mk2c", &mh2, sizeof(mh2)) ; + cudaMemcpyToSymbol("dev_mk3c", &mh3, sizeof(mh3)) ; + + he_cg_init<<< 1, 1 >>> (grid, (REAL) g_kappa, (REAL)(g_mu/(2.0*g_kappa)), h0,h1,h2,h3); + // BEWARE in dev_tm_dirac_kappa we need the true mu (not 2 kappa mu!) 
+ + #ifdef MPI + he_cg_init_nd_additional_mpi<<<1,1>>>(VOLUMEPLUSRAND, RAND, g_cart_id, g_nproc); + // debug // check dev_VOLUMEPLUSRAND and dev_RAND on device + if (g_proc_id == 0) { + int host_check_VOLUMEPLUSRAND, host_check_RAND; + int host_check_rank, host_check_nproc; + cudaMemcpyFromSymbol(&host_check_VOLUMEPLUSRAND, dev_VOLUMEPLUSRAND, sizeof(int)); + cudaMemcpyFromSymbol(&host_check_RAND, dev_RAND, sizeof(int)); + printf("\tOn device:\n"); + printf("\tdev_VOLUMEPLUSRAND = %i\n", host_check_VOLUMEPLUSRAND); + printf("\tdev_RAND = %i\n", host_check_RAND); + cudaMemcpyFromSymbol(&host_check_rank, dev_rank, sizeof(int)); + cudaMemcpyFromSymbol(&host_check_nproc, dev_nproc, sizeof(int)); + printf("\tdev_rank = %i\n", host_check_rank); + printf("\tdev_nproc = %i\n", host_check_nproc); + } + #endif + + + #ifdef USETEXTURE + //Bind texture gf + bind_texture_gf(gf); + #endif + + + // Init x,p,r for k=0 + // Allocate some numbers for host <-> device interaction + cudaMalloc((void **) &dotprod, sizeof(RealT)); + cudaMalloc((void **) &dotprod2, sizeof(RealT)); + cudaMalloc((void **) &rk, sizeof(RealT)); + cudaMalloc((void **) &alpha, sizeof(RealT)); + cudaMalloc((void **) &beta, sizeof(RealT)); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + + //init blas + cublasInit(); + + + if (g_proc_id == 0) { + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + printf("have initialized cublas\n"); + } + + + + + + + //dev_skalarmult_spinor_field<<>>(spinin,scaleparam, spin1); + //dev_copy_spinor_field<<>>(spin1, spinin); + + + dev_copy_spinor_field <<>>(spinin, spin0); + dev_zero_spinor_field <<>>(spin1); // x_0 = 0 + dev_copy_spinor_field <<>>(spinin, spin2); + dev_zero_spinor_field <<>>(spin3); + + + if (g_proc_id == 0) printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + + + + + //relative precision -> get initial residue + #ifndef MPI + sourcesquarenorm = cublasDot (24*VOLUME/2, (const RealT*)spinin, 1, (const RealT*)spinin, 1); + #else + 
sourcesquarenorm = cublasDot_wrapper (24*VOLUME/2, (RealT*)spinin, 1, (RealT*)spinin, 1); + #endif + host_rk = sourcesquarenorm; //for use in main loop + + + if (g_proc_id == 0) { + printf("Squarenorm Source:\t%.16e\n", sourcesquarenorm); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + printf("Entering inner solver cg-loop\n"); + } + + + + + + + for(i=0;i(spin2, spin3, griddim3, blockdim3, griddim4, blockdim4, mixedsolveParameter); + #else + dev_Qtm_pm_psi_mpi(spin2, spin3, griddim3, blockdim3, griddim4, blockdim4, mixedsolveParameter); + #endif + + if((cudaerr=cudaGetLastError()) != cudaSuccess){ + printf("%s\n", cudaGetErrorString(cudaerr)); + exit(200); + } + + + //alpha + #ifndef MPI + host_dotprod = cublasDot (24*VOLUME/2, (const RealT*) spin2, 1, (const RealT*) spin3, 1); + #else + host_dotprod = cublasDot_wrapper (24*VOLUME/2, (RealT*) spin2, 1, (RealT*) spin3, 1); + #endif + + host_alpha = (host_rk / host_dotprod); // alpha = r*r/ p M p + + //r(k+1) + cublasAxpy (24*VOLUME/2,-1.0*host_alpha, (const RealT*)spin3, 1, (RealT*)spin0, 1); + + + //x(k+1); + cublasAxpy (24*VOLUME/2, host_alpha, (const RealT*)spin2, 1, (RealT*)spin1, 1); + + if((cudaerr=cudaGetLastError()) != cudaSuccess){ + printf("%s\n", cudaGetErrorString(cudaerr)); + exit(200); + } + + //Abbruch? 
+ #ifndef MPI + host_dotprod = cublasDot (24*VOLUME/2, (const RealT*) spin0, 1,(const RealT*) spin0, 1); + #else + host_dotprod = cublasDot_wrapper (24*VOLUME/2, (RealT*) spin0, 1,(RealT*) spin0, 1); + #endif + + if (((host_dotprod <= eps*sourcesquarenorm) && (i > maxit / 4) ) || ( host_dotprod <= epsfinal/2.)){//error-limit erreicht (epsfinal/2 sollte ausreichen um auch in double precision zu bestehen) + break; + } + + + if (g_proc_id == 0) printf("iter %d: err = %.16e\n", i, host_dotprod); + + + + //beta + host_beta =host_dotprod/host_rk; + //p(k+1) + cublasScal (24*VOLUME/2, host_beta, (RealT*)spin2, 1); + cublasAxpy (24*VOLUME/2, 1.0, (const RealT*)spin0, 1, (RealT*)spin2, 1); + + host_rk = host_dotprod; + + // recalculate residue frome r = b - Ax + if(((i+1) % N_recalcres) == 0){ + // r_(k+1) = Ax -b + + if (g_proc_id == 0) printf("Recalculating residue\n"); + + // D Ddagger -- Ddagger = gamma5 D gamma5 for Wilson Dirac Operator + // DO NOT USE tm_dirac_dagger_kappa here, otherwise spin2 will be overwritten!!! + + // Q_{-}Q{+} + #ifndef MPI + dev_Qtm_pm_psi (spin1, spin3, griddim3, blockdim3, griddim4, blockdim4, mixedsolveParameter); + #else + dev_Qtm_pm_psi_mpi(spin1, spin3, griddim3, blockdim3, griddim4, blockdim4, mixedsolveParameter); + #endif + if((cudaerr=cudaGetLastError()) != cudaSuccess){ + printf("%s\n", cudaGetErrorString(cudaerr)); + exit(200); + } + + + // r = b - Ax + cublasScal (24*VOLUME/2, -1.0, (RealT*)spin3, 1); + cublasAxpy (24*VOLUME/2, 1.0, (const RealT*)spinin, 1, (RealT*)spin3, 1); + cublasCopy (24*VOLUME/2, (const RealT*)spin3, 1, (RealT*)spin0, 1); + //dev_skalarmult_add_assign_spinor_field<<>>(spinin, -1.0, spin3, spin0); + }//recalculate residue + + }//MAIN LOOP cg + + + if (g_proc_id == 0) printf("Final residue: %.16e\n",host_dotprod); + + + // x_result = spin1 ! + + //no multiplication with D^{dagger} here and no return to non-kappa basis as in dev_cg! 
+ dev_copy_spinor_field <<>>(spin1,spinout); + + #ifdef USETEXTURE + unbind_texture_gf(); + #endif + cudaFree(dotprod); + cudaFree(dotprod2); + cudaFree(rk); + cudaFree(alpha); + cudaFree(beta); + cublasShutdown(); + return(i); +} + +#endif + + + + + + + + + + +//initialize nearest-neighbour table for gpu +void initnn(){ + int t,x,y,z,pos; + for(t=0;t +void convert2double_spin (typename dev_spinorT::type* spin, spinor* h2d) { + + int i, Vol; + + //#ifndef MPI + if (even_odd_flag) { + Vol = VOLUME/2; + } + else { + Vol = VOLUME; + } + //#else + // Vol = (VOLUME+RAND)/2; + //#endif + + + for (i = 0; i < Vol; i++) { + + h2d[i].s0.c0.re = (double) spin[6*i+0].x; + h2d[i].s0.c0.im = (double) spin[6*i+0].y; + h2d[i].s0.c1.re = (double) spin[6*i+0].z; + h2d[i].s0.c1.im = (double) spin[6*i+0].w; + + h2d[i].s0.c2.re = (double) spin[6*i+1].x; + h2d[i].s0.c2.im = (double) spin[6*i+1].y; + h2d[i].s1.c0.re = (double) spin[6*i+1].z; + h2d[i].s1.c0.im = (double) spin[6*i+1].w; + + h2d[i].s1.c1.re = (double) spin[6*i+2].x; + h2d[i].s1.c1.im = (double) spin[6*i+2].y; + h2d[i].s1.c2.re = (double) spin[6*i+2].z; + h2d[i].s1.c2.im = (double) spin[6*i+2].w; + + h2d[i].s2.c0.re = (double) spin[6*i+3].x; + h2d[i].s2.c0.im = (double) spin[6*i+3].y; + h2d[i].s2.c1.re = (double) spin[6*i+3].z; + h2d[i].s2.c1.im = (double) spin[6*i+3].w; + + h2d[i].s2.c2.re = (double) spin[6*i+4].x; + h2d[i].s2.c2.im = (double) spin[6*i+4].y; + h2d[i].s3.c0.re = (double) spin[6*i+4].z; + h2d[i].s3.c0.im = (double) spin[6*i+4].w; + + h2d[i].s3.c1.re = (double) spin[6*i+5].x; + h2d[i].s3.c1.im = (double) spin[6*i+5].y; + h2d[i].s3.c2.re = (double) spin[6*i+5].z; + h2d[i].s3.c2.im = (double) spin[6*i+5].w; + + } +} + + + + + +// convert spinor to REAL4 (float4, double4) +template +void convert2REAL4_spin(spinor* spin, typename dev_spinorT::type* h2d){ + + int i, Vol; + + //#ifndef MPI + if (even_odd_flag) { + Vol = VOLUME/2; + } + else { + Vol = VOLUME; + } + //#else + // Vol = (VOLUME+RAND)/2; + //#endif + + 
for (i = 0; i < Vol; i++) { + + h2d[6*i+0].x = (RealT) spin[i].s0.c0.re; + h2d[6*i+0].y = (RealT) spin[i].s0.c0.im; + h2d[6*i+0].z = (RealT) spin[i].s0.c1.re; + h2d[6*i+0].w = (RealT) spin[i].s0.c1.im; + + h2d[6*i+1].x = (RealT) spin[i].s0.c2.re; + h2d[6*i+1].y = (RealT) spin[i].s0.c2.im; + h2d[6*i+1].z = (RealT) spin[i].s1.c0.re; + h2d[6*i+1].w = (RealT) spin[i].s1.c0.im; + + h2d[6*i+2].x = (RealT) spin[i].s1.c1.re; + h2d[6*i+2].y = (RealT) spin[i].s1.c1.im; + h2d[6*i+2].z = (RealT) spin[i].s1.c2.re; + h2d[6*i+2].w = (RealT) spin[i].s1.c2.im; + + h2d[6*i+3].x = (RealT) spin[i].s2.c0.re; + h2d[6*i+3].y = (RealT) spin[i].s2.c0.im; + h2d[6*i+3].z = (RealT) spin[i].s2.c1.re; + h2d[6*i+3].w = (RealT) spin[i].s2.c1.im; + + h2d[6*i+4].x = (RealT) spin[i].s2.c2.re; + h2d[6*i+4].y = (RealT) spin[i].s2.c2.im; + h2d[6*i+4].z = (RealT) spin[i].s3.c0.re; + h2d[6*i+4].w = (RealT) spin[i].s3.c0.im; + + h2d[6*i+5].x = (RealT) spin[i].s3.c1.re; + h2d[6*i+5].y = (RealT) spin[i].s3.c1.im; + h2d[6*i+5].z = (RealT) spin[i].s3.c2.re; + h2d[6*i+5].w = (RealT) spin[i].s3.c2.im; + + } +} + + + + + + + + + + +template +MixedsolveParameter* init_mixedsolve(su3** gf){ + + cudaError_t cudaerr; + MixedsolveParameter& mixedsolveParameter=*MixedsolveParameter::getGlobalP(); + + // get number of devices + if(havedevice == 0){ + int ndev = find_devices(); + if(ndev == 0){ + fprintf(stderr, "Error: no CUDA devices found. Aborting...\n"); + exit(300); + } + // only if device_num is not the default (-1) + if(device_num > -1){ + // try to set active device to device_num given in input file + if(device_num < ndev){ + printf("Setting active device to: %d\n", device_num); + cudaSetDevice(device_num); + } + else{ + fprintf(stderr, "Error: There is no CUDA device with No. %d. Aborting...\n",device_num); + exit(301); + } + if((cudaerr=cudaGetLastError())!=cudaSuccess){ + printf("Error in init_mixedsolve_eo(): Could not set active device. 
Aborting...\n"); + exit(302); + } + } + else{ + printf("Not setting any active device. Let the driver choose.\n"); + int device=-1;cudaGetDevice(&device);printf("device=%i",device); + } + havedevice = 1; + } + #ifdef GF_8 + /* allocate 8 floats of gf = 2*4*VOLUME float4's*/ + printf("Using GF 8 reconstruction\n"); + size_t dev_gfsize = 2*4*VOLUME * sizeof(dev_su3_8M(RealT)); + #else + /* allocate 2 rows of gf = 3*4*VOLUME float4's*/ + printf("Using GF 12 reconstruction\n"); + size_t dev_gfsize = 3*4*VOLUME * sizeof(dev_su3_2vM(RealT)); + #endif + + #ifdef USETEXTURE + printf("Using texture references\n"); + #else + printf("NOT using texture references\n"); + #endif + if((cudaerr=cudaMalloc((void **) &mixedsolveParameter.dev_gf, dev_gfsize)) != cudaSuccess){ + printf("Error in init_mixedsolve(): Memory allocation of gauge field failed. Aborting...\n"); + exit(200); + } // Allocate array on device + else{ + printf("Allocated gauge field on device\n"); + } + + #ifdef GF_8 + mixedsolveParameter.h2d_gf = (dev_su3_8M(RealT)*)malloc(dev_gfsize); // Allocate REAL conversion gf on host + su3to8(gf,mixedsolveParameter.h2d_gf); + #else + mixedsolveParameter.h2d_gf = (dev_su3_2vM(RealT)*)malloc(dev_gfsize); // Allocate REAL conversion gf on host + su3to2vf4(gf,mixedsolveParameter.h2d_gf); + #endif + cudaMemcpy(mixedsolveParameter.dev_gf, mixedsolveParameter.h2d_gf, dev_gfsize, cudaMemcpyHostToDevice); + + +//grid + size_t nnsize = 8*VOLUME*sizeof(int); + nn = (int *) malloc(nnsize); + cudaMalloc((void **) &dev_nn, nnsize); + + initnn(); + //shownn(); + //showcompare_gf(T-1, LX-1, LY-1, LZ-1, 3); + cudaMemcpy(dev_nn, nn, nnsize, cudaMemcpyHostToDevice); + + //free again + free(nn); + + + +// Spinors + #ifndef HALF + size_t dev_spinsize = 6*VOLUME * sizeof(dev_spinorM(RealT)); /* float4 */ + if((void*)(mixedsolveParameter.h2d_spin = (dev_spinorM(RealT)*)malloc(dev_spinsize)) == NULL){ + printf("Could not allocate memory for mixedsolveParameter.h2d_spin. 
Aborting...\n"); + exit(200); + } // Allocate float conversion spinor on host + #else + size_t dev_spinsize = 6*VOLUME * sizeof(dev_spinor_half); /*short4*/ + if((void*)(mixedsolveParameter.h2d_spin = (dev_spinor_half *)malloc(dev_spinsize)) == NULL){ + printf("Could not allocate memory for mixedsolveParameter.h2d_spin. Aborting...\n"); + exit(200); + } // Allocate float conversion spinor on host + size_t dev_normsize = VOLUME/2 * sizeof(float); + if((void*)(mixedsolveParameter.h2d_spin_norm = (float*)malloc(dev_normsize)) == NULL){ + printf("Could not allocate memory for mixedsolveParameter.h2d_spin_norm. Aborting...\n"); + exit(200); + } // Allocate float conversion norm on host + #endif + + + cudaMalloc((void **) &mixedsolveParameter.dev_spin1, dev_spinsize); // Allocate array spin1 on device + cudaMalloc((void **) &mixedsolveParameter.dev_spin2, dev_spinsize); // Allocate array spin2 on device + cudaMalloc((void **) &mixedsolveParameter.dev_spin3, dev_spinsize); // Allocate array spin3 on device + cudaMalloc((void **) &mixedsolveParameter.dev_spin4, dev_spinsize); + cudaMalloc((void **) &mixedsolveParameter.dev_spin5, dev_spinsize); + cudaMalloc((void **) &mixedsolveParameter.dev_spinin, dev_spinsize); + cudaMalloc((void **) &mixedsolveParameter.dev_spinout, dev_spinsize); + + #ifdef HALF + dev_spinsize = VOLUME/2*sizeof(float); + cudaMalloc((void **) &mixedsolveParameter.dev_spin1_norm, dev_spinsize); // Allocate norm spin1 on device + cudaMalloc((void **) &mixedsolveParameter.dev_spin2_norm, dev_spinsize); // Allocate norm spin2 on device + cudaMalloc((void **) &mixedsolveParameter.dev_spin3_norm, dev_spinsize); // Allocate norm spin3 on device + cudaMalloc((void **) &mixedsolveParameter.dev_spin4_norm, dev_spinsize); + cudaMalloc((void **) &mixedsolveParameter.dev_spin5_norm, dev_spinsize); + cudaMalloc((void **) &mixedsolveParameter.dev_spinin_norm, dev_spinsize); + cudaMalloc((void **) &mixedsolveParameter.dev_spinout_norm, dev_spinsize); + #endif + + + 
if((cudaerr=cudaGetLastError())!=cudaSuccess){ + printf("Error in init_mixedsolve(): Memory allocation of spinor fields failed. Aborting...\n"); + exit(200); + } + else{ + printf("Allocated spinor fields on device\n"); + } + + + output_size = LZ*T*sizeof(float); // parallel in t and z direction + cudaMalloc((void **) &dev_output, output_size); // output array + float * host_output = (float*) malloc(output_size); + + int grid[5]; + grid[0]=LX; grid[1]=LY; grid[2]=LZ; grid[3]=T; grid[4]=VOLUME; + + cudaMalloc((void **) &dev_grid, 5*sizeof(int)); + cudaMemcpy(dev_grid, &(grid[0]), 5*sizeof(int), cudaMemcpyHostToDevice); + + + return &mixedsolveParameter; +} + + + + + +template +MixedsolveParameter* init_mixedsolve_eo(su3** gf){ + + cudaError_t cudaerr; + MixedsolveParameter& mixedsolveParameter=*MixedsolveParameter::getGlobalP(); + + if (havedevice == 0) { + + // get number of devices + int ndev = find_devices(); + if(ndev == 0){ + fprintf(stderr, "Error: no CUDA devices found. Aborting...\n"); + exit(300); + } + + // try to set active device to device_num given in input file (or mpi rank) + #ifndef MPI + // only if device_num is not the default (-1) + if(device_num > -1){ + if(device_num < ndev){ + printf("Setting active device to: %d\n", device_num); + //cudaSetDevice(device_num); + } + else{ + fprintf(stderr, "Error: There is no CUDA device with No. %d. Aborting...\n",device_num); + exit(301); + } + if((cudaerr=cudaGetLastError())!=cudaSuccess){ + printf("Error in init_mixedsolve_eo(): Could not set active device. Aborting...\n"); + exit(302); + } + } + else{ + printf("Not setting any active device. 
Let the driver choose.\n"); + } + #else + #ifndef DEVICE_EQUAL_RANK + // try to set active device to device_num given in input file + // each process gets bounded to the same GPU + if(device_num > -1){ + if (device_num < ndev) { + printf("Process %d of %d: Setting active device to: %d\n", g_proc_id, g_nproc, device_num); + cudaSetDevice(device_num); + } + else { + fprintf(stderr, "Process %d of %d: Error: There is no CUDA device with No. %d. Aborting...\n", g_proc_id, g_nproc, device_num); + exit(301); + } + } + else{ + printf("Not setting any active device. Let the driver choose.\n"); + } + #else + // device number = mpi rank + if (g_cart_id < ndev) { + printf("Process %d of %d: Setting active device to: %d\n", g_proc_id, g_nproc, g_cart_id); + cudaSetDevice(g_cart_id); + } + else { + fprintf(stderr, "Process %d of %d: Error: There is no CUDA device with No. %d. Aborting...\n", g_proc_id, g_nproc, g_cart_id); + exit(301); + } + #endif + if ((cudaerr=cudaGetLastError()) != cudaSuccess) { + printf("Process %d of %d: Error in init_mixedsolve_eo_nd(): Could not set active device. 
Aborting...\n", g_proc_id, g_nproc); + exit(302); + } + #endif + + havedevice=1; + + } + + // output + #ifdef MPI + if (g_cart_id == 0) { + #endif + + #ifdef USETEXTURE + printf("Using texture references.\n"); + #else + printf("NOT using texture references.\n"); + #endif + + #ifdef GF_8 + printf("Using GF 8 reconstruction.\n"); + #else + printf("Using GF 12 reconstruction.\n"); + #endif + + #ifdef MPI + } + #endif + + #ifndef MPI + #ifdef GF_8 + /* allocate 8 floats for gf = 2*4*VOLUME float4's*/ + size_t dev_gfsize = 2*4*VOLUME * sizeof(dev_su3_8M(RealT)); + #else + /* allocate 2 rows of gf = 3*4*VOLUME float4's*/ + size_t dev_gfsize = 3*4*VOLUME * sizeof(dev_su3_2vM(RealT)); + #endif + #else + #ifdef GF_8 + /* allocate 8 floats for gf = 2*4*VOLUME float4's*/ + size_t dev_gfsize = 2*4*(VOLUME+RAND) * sizeof(dev_su3_8M(RealT)); + #else + /* allocate 2 rows of gf = 3*4*VOLUME float4's*/ + size_t dev_gfsize = 3*4*(VOLUME+RAND) * sizeof(dev_su3_2vM(RealT)); + #endif + + #endif + + if((cudaerr=cudaMalloc((void **) &mixedsolveParameter.dev_gf, dev_gfsize)) != cudaSuccess){ + printf("Error in init_mixedsolve(): Memory allocation of gauge field failed. 
Aborting...\n"); + exit(200); + } // Allocate array on device + else { + #ifndef MPI + printf("Allocated memory for gauge field on device.\n"); + #else + if (g_cart_id == 0) printf("Allocated memory for gauge field on devices.\n"); + #endif + } + + #ifdef GF_8 + mixedsolveParameter.h2d_gf = (dev_su3_8M(RealT)*)malloc(dev_gfsize); // Allocate REAL conversion gf on host + su3to8(gf,mixedsolveParameter.h2d_gf); + #else + mixedsolveParameter.h2d_gf = (dev_su3_2vM(RealT)*)malloc(dev_gfsize); // Allocate REAL conversion gf on host + su3to2vf4(gf,mixedsolveParameter.h2d_gf); + #endif + //bring to device + cudaMemcpy(mixedsolveParameter.dev_gf, mixedsolveParameter.h2d_gf, dev_gfsize, cudaMemcpyHostToDevice); + + + #ifdef HALF + #ifndef MPI + #ifdef GF_8 + /* allocate 8 floats for gf = 2*4*VOLUME float4's*/ + printf("Using half precision GF 8 reconstruction\n"); + dev_gfsize = 2*4*VOLUME * sizeof(dev_su3_8_half); + #else + /* allocate 2 rows of gf = 3*4*VOLUME float4's*/ + printf("Using half precision GF 12 reconstruction\n"); + dev_gfsize = 3*4*VOLUME * sizeof(dev_su3_2v_half); + #endif + #else // MPI + #ifdef GF_8 + /* allocate 8 floats for gf = 2*4*VOLUME float4's*/ + printf("Using half precision GF 8 reconstruction\n"); + dev_gfsize = 2*4*(VOLUME+RAND) * sizeof(dev_su3_8_half); + #else + /* allocate 2 rows of gf = 3*4*VOLUME float4's*/ + printf("Using half precision GF 12 reconstruction\n"); + dev_gfsize = 3*4*(VOLUME+RAND) * sizeof(dev_su3_2v_half); + #endif + #endif //MPI + if((cudaerr=cudaMalloc((void **) &mixedsolveParameter.dev_gf_half, dev_gfsize)) != cudaSuccess){ + printf("Error in init_mixedsolve(): Memory allocation of half precsion gauge field failed. 
Aborting...\n"); + exit(200); + } // Allocate array on device + else{ + printf("Allocated half precision gauge field on device\n"); + } + + #endif // HALF + + +//grid + size_t nnsize = 8*VOLUME*sizeof(int); + nn = (int *) malloc(nnsize); + + //nn grid for even-odd + nn_eo = (int *) malloc(nnsize/2); + nn_oe = (int *) malloc(nnsize/2); + + cudaMalloc((void **) &dev_nn, nnsize); + cudaMalloc((void **) &dev_nn_eo, nnsize/2); + cudaMalloc((void **) &dev_nn_oe, nnsize/2); + + #ifndef MPI + size_t idxsize = VOLUME/2*sizeof(int); + #else + size_t idxsize = (VOLUME+RAND)/2*sizeof(int); + #endif + eoidx_even = (int *) malloc(idxsize); + eoidx_odd = (int *) malloc(idxsize); + cudaMalloc((void **) &dev_eoidx_even, idxsize); + cudaMalloc((void **) &dev_eoidx_odd, idxsize); + + #ifndef MPI + initnn(); + initnn_eo(); + //shownn_eo(); + #else + init_nnspinor_eo_mpi(); + init_idxgauge_mpi(); + #endif + + //shownn(); + //showcompare_gf(T-1, LX-1, LY-1, LZ-1, 3); + //check_gauge_reconstruction_8(gf, mixedsolveParameter.dev_gf, 0, 0); + cudaMemcpy(dev_nn, nn, nnsize, cudaMemcpyHostToDevice); + cudaMemcpy(dev_nn_eo, nn_eo, nnsize/2, cudaMemcpyHostToDevice); + cudaMemcpy(dev_nn_oe, nn_oe, nnsize/2, cudaMemcpyHostToDevice); + cudaMemcpy(dev_eoidx_even, eoidx_even, idxsize, cudaMemcpyHostToDevice); + cudaMemcpy(dev_eoidx_odd, eoidx_odd, idxsize, cudaMemcpyHostToDevice); + + //free again + free(eoidx_odd); + free(eoidx_even); + free(nn_oe); + free(nn_eo); + free(nn); + + +// Spinors + #ifndef HALF + size_t dev_spinsize = 6*VOLUME/2 * sizeof(dev_spinorM(RealT)); /* float4 */ + if((void*)(mixedsolveParameter.h2d_spin = (dev_spinorM(RealT)*)malloc(dev_spinsize)) == NULL){ + printf("Could not allocate memory for mixedsolveParameter.h2d_spin. 
Aborting...\n"); + exit(200); + } // Allocate float conversion spinor on host + #ifdef MPI + size_t dev_spinsize_ext = 6*(VOLUME+RAND)/2*sizeof(dev_spinorM(RealT)); + #endif + #else + size_t dev_spinsize = 6*VOLUME/2 * sizeof(dev_spinor_half);/*short4*/ + if((void*)(mixedsolveParameter.h2d_spin = (dev_spinor_half *)malloc(dev_spinsize)) == NULL){ + printf("Could not allocate memory for mixedsolveParameter.h2d_spin. Aborting...\n"); + exit(200); + } // Allocate float conversion spinor on host + size_t dev_normsize = VOLUME/2 * sizeof(RealT); + if((void*)(mixedsolveParameter.h2d_spin_norm = (RealT *)malloc(dev_normsize)) == NULL){ + printf("Could not allocate memory for mixedsolveParameter.h2d_spin_norm. Aborting...\n"); + exit(200); + } // Allocate float conversion norm on host + #ifdef MPI + size_t dev_spinsize_ext = 6*(VOLUME+RAND)/2*sizeof(dev_spinor_half); + size_t dev_normsize_ext = (VOLUME+RAND)/2*sizeof(float); + #endif + #endif + + + #ifndef MPI + cudaMalloc((void **) &mixedsolveParameter.dev_spin1, dev_spinsize); // Allocate array spin1 on device + cudaMalloc((void **) &mixedsolveParameter.dev_spin2, dev_spinsize); // Allocate array spin2 on device + cudaMalloc((void **) &mixedsolveParameter.dev_spin3, dev_spinsize); // Allocate array spin3 on device + cudaMalloc((void **) &mixedsolveParameter.dev_spin4, dev_spinsize); + cudaMalloc((void **) &mixedsolveParameter.dev_spin5, dev_spinsize); + cudaMalloc((void **) &mixedsolveParameter.dev_spinin, dev_spinsize); + cudaMalloc((void **) &mixedsolveParameter.dev_spinout, dev_spinsize); + + cudaMalloc((void **) &mixedsolveParameter.dev_spin_eo1, dev_spinsize); + cudaMalloc((void **) &mixedsolveParameter.dev_spin_eo2, dev_spinsize); + + + #ifdef HALF + cudaMalloc((void **) &mixedsolveParameter.dev_spin1_norm, dev_spinsize); // Allocate norm spin1 on device + cudaMalloc((void **) &mixedsolveParameter.dev_spin2_norm, dev_spinsize); // Allocate norm spin2 on device + cudaMalloc((void **) 
&mixedsolveParameter.dev_spin3_norm, dev_spinsize); // Allocate norm spin3 on device + cudaMalloc((void **) &mixedsolveParameter.dev_spin4_norm, dev_spinsize); + cudaMalloc((void **) &mixedsolveParameter.dev_spin5_norm, dev_spinsize); + cudaMalloc((void **) &mixedsolveParameter.dev_spinin_norm, dev_spinsize); + cudaMalloc((void **) &mixedsolveParameter.dev_spinout_norm, dev_spinsize); + + cudaMalloc((void **) &mixedsolveParameter.dev_spin_eo1_norm, dev_spinsize); + cudaMalloc((void **) &mixedsolveParameter.dev_spin_eo2_norm, dev_spinsize); + #endif + + + #else + cudaMalloc((void **) &mixedsolveParameter.dev_spin1, dev_spinsize_ext); + cudaMalloc((void **) &mixedsolveParameter.dev_spin2, dev_spinsize_ext); + cudaMalloc((void **) &mixedsolveParameter.dev_spin3, dev_spinsize_ext); + cudaMalloc((void **) &mixedsolveParameter.dev_spin4, dev_spinsize_ext); + cudaMalloc((void **) &mixedsolveParameter.dev_spin5, dev_spinsize_ext); + cudaMalloc((void **) &mixedsolveParameter.dev_spinin, dev_spinsize_ext); + cudaMalloc((void **) &mixedsolveParameter.dev_spinout, dev_spinsize_ext); + + cudaMalloc((void **) &mixedsolveParameter.dev_spin_eo1, dev_spinsize_ext); + cudaMalloc((void **) &mixedsolveParameter.dev_spin_eo2, dev_spinsize_ext); + + #ifdef HALF + cudaMalloc((void **) &mixedsolveParameter.dev_spin1_norm, dev_normsize_ext); // Allocate norm spin1 on device + cudaMalloc((void **) &mixedsolveParameter.dev_spin2_norm, dev_normsize_ext); // Allocate norm spin2 on device + cudaMalloc((void **) &mixedsolveParameter.dev_spin3_norm, dev_normsize_ext); // Allocate norm spin3 on device + cudaMalloc((void **) &mixedsolveParameter.dev_spin4_norm, dev_normsize_ext); + cudaMalloc((void **) &mixedsolveParameter.dev_spin5_norm, dev_normsize_ext); + cudaMalloc((void **) &mixedsolveParameter.dev_spinin_norm, dev_normsize_ext); + cudaMalloc((void **) &mixedsolveParameter.dev_spinout_norm, dev_normsize_ext); + + cudaMalloc((void **) &mixedsolveParameter.dev_spin_eo1_norm, dev_normsize_ext); 
+ cudaMalloc((void **) &mixedsolveParameter.dev_spin_eo2_norm, dev_normsize_ext); + #endif + + int tSliceEO = LX*LY*LZ/2; + #ifndef HALF + R1 = (dev_spinor *) malloc(2*tSliceEO*24*sizeof(float)); + R2 = R1 + 6*tSliceEO; + R3 = (dev_spinor *) malloc(2*tSliceEO*24*sizeof(float)); + R4 = R3 + 6*tSliceEO; + #else + + // implement this for half? + // -> ALTERNATE_FIELD_EXCHANGE + #endif + + #endif + + + + + + if((cudaerr=cudaGetLastError())!=cudaSuccess){ + printf("Error in init_mixedsolve(): Memory allocation of spinor fields failed. Aborting...\n"); + exit(200); + } + else{ + printf("Allocated spinor fields on device\n"); + } + + + #ifdef MPI + /* for async communication */ + // page-locked memory + #ifndef HALF + cudaMallocHost(&RAND3, 2*tSliceEO*6*sizeof(REAL4M(RealT))); + RAND4 = RAND3 + 6*tSliceEO; + cudaMallocHost(&RAND1, 2*tSliceEO*6*sizeof(REAL4M(RealT))); + RAND2 = RAND1 + 6*tSliceEO; + #else + cudaMallocHost(&RAND3, 2*tSliceEO*6*sizeof(short4)); + RAND4 = RAND3 + 6*tSliceEO; + cudaMallocHost(&RAND1, 2*tSliceEO*6*sizeof(short4)); + RAND2 = RAND1 + 6*tSliceEO; + //norm page-locked mem + cudaMallocHost(&RAND3_norm, 2*tSliceEO*sizeof(float)); + RAND4_norm = RAND3_norm + tSliceEO; + cudaMallocHost(&RAND1_norm, 2*tSliceEO*sizeof(float)); + RAND2_norm = RAND1_norm + tSliceEO; + #endif + + // CUDA streams and events + for (int i = 0; i < 3; i++) { + cudaStreamCreate(&stream[i]); + } + /* end for async communication */ + #endif + + output_size = LZ*T*sizeof(float); // parallel in t and z direction + cudaMalloc((void **) &dev_output, output_size); // output array + float * host_output = (float*) malloc(output_size); + + int grid[5]; + grid[0]=LX; grid[1]=LY; grid[2]=LZ; grid[3]=T; grid[4]=VOLUME/2; + // dev_VOLUME is half of VOLUME for eo + + cudaMalloc((void **) &dev_grid, 5*sizeof(int)); + cudaMemcpy(dev_grid, &(grid[0]), 5*sizeof(int), cudaMemcpyHostToDevice); + + + /* + init_dev_observables(); + + clock_t start, stop; + double timeelapsed = 0.0; + int count; + + 
assert((start = clock())!=-1); + float devplaq; + //for(count=0; count<1; count++){ + devplaq = calc_plaquette(mixedsolveParameter.dev_gf, dev_nn); + //} + assert((stop = clock())!=-1); + timeelapsed = (double) (stop-start)/CLOCKS_PER_SEC; + printf("Calculating Plaquette on device: plaq(device) = %.8f\n", devplaq); + printf("Time spent calculating: %f sec\n", timeelapsed); + + assert((start = clock())!=-1); + float hostplaq; + int a = 0; + //for(count=0; count<1; count++){ + hostplaq = (float) measure_gauge_action()/(6.*VOLUME*g_nproc); + //} + assert((stop = clock())!=-1); + timeelapsed = (double) (stop-start)/CLOCKS_PER_SEC; + printf("Calculating Plaquette on host: plaq(host) = %.8f\n", hostplaq); + printf("Time spent calculating: %f sec\n", timeelapsed); + + float devrect; + assert((start = clock())!=-1); + //for(count=0; count<100; count++){ + devrect = calc_rectangle(mixedsolveParameter.dev_gf, dev_nn); + //} + assert((stop = clock())!=-1); + timeelapsed = (double) (stop-start)/CLOCKS_PER_SEC; + printf("Calculating Rectangles on device: rectangle(device) = %.8f\n", devrect); + printf("Time spent calculating: %f sec\n", timeelapsed); + + float hostrect; + assert((start = clock())!=-1); + //for(count=0; count<100; count++){ + hostrect = (float) measure_rectangles()/(12.*VOLUME*g_nproc); + //} + assert((stop = clock())!=-1); + timeelapsed = (double) (stop-start)/CLOCKS_PER_SEC; + printf("Calculating Rectangles on host: rectangle(host) = %.8f\n", hostrect); + printf("Time spent calculating: %f sec\n", timeelapsed); + + + float2 ret; + + calc_polyakov_0(&ret, mixedsolveParameter.dev_gf, dev_nn); + printf("Calculating Polyakov loop on device:\n"); + printf("pl_0 (Re) = %.8e\n",ret.x); + printf("pl_0 (Im) = %.8e\n",ret.y); + + //polyakov_loop_dir(1, 0); + //printf("Calculating Polyakov loop on host:\n"); + + finalize_dev_observables(); + + exit(100); + */ + + + return &mixedsolveParameter; +} + + + +template +void finalize_mixedsolve(MixedsolveParameter* 
mixedsolveParameterP){ + + MixedsolveParameter& mixedsolveParameter=*mixedsolveParameterP;//use pointer in interface so we can delete mix\.solv\.Param later here + + cudaFree(mixedsolveParameter.dev_spin1); + cudaFree(mixedsolveParameter.dev_spin2); + cudaFree(mixedsolveParameter.dev_spin3); + cudaFree(mixedsolveParameter.dev_spin4); + cudaFree(mixedsolveParameter.dev_spin5); + cudaFree(mixedsolveParameter.dev_spinin); + cudaFree(mixedsolveParameter.dev_spinout); + cudaFree(mixedsolveParameter.dev_gf); + cudaFree(dev_grid); + cudaFree(dev_output); + cudaFree(dev_nn); + + if(even_odd_flag){ + cudaFree(mixedsolveParameter.dev_spin_eo1); + cudaFree(mixedsolveParameter.dev_spin_eo2); + cudaFree(dev_eoidx_even); + cudaFree(dev_eoidx_odd); + cudaFree(dev_nn_eo); + cudaFree(dev_nn_oe); + } + + #ifdef HALF + cudaFree(mixedsolveParameter.dev_gf_half); + + cudaFree(mixedsolveParameter.dev_spin1_norm); + cudaFree(mixedsolveParameter.dev_spin2_norm); + cudaFree(mixedsolveParameter.dev_spin3_norm); + cudaFree(mixedsolveParameter.dev_spin4_norm); + cudaFree(mixedsolveParameter.dev_spin5_norm); + cudaFree(mixedsolveParameter.dev_spinin_norm); + cudaFree(mixedsolveParameter.dev_spinout_norm); + + if(even_odd_flag){ + cudaFree(mixedsolveParameter.dev_spin_eo1_norm); + cudaFree(mixedsolveParameter.dev_spin_eo2_norm); + } + + + #endif + +#ifdef MPI + cudaFreeHost(RAND1); + cudaFreeHost(RAND3); + + #ifdef HALF + cudaFreeHost(RAND1_norm); + cudaFreeHost(RAND3_norm); + #endif + + for (int i = 0; i < 3; i++) { + cudaStreamDestroy(stream[i]); + } +#endif + + + + free(mixedsolveParameter.h2d_spin); + free(mixedsolveParameter.h2d_gf); +} + + +// include half versions of dev_cg - solvers +#ifdef HALF + #include "half_solvers.cuh" +#endif + + + + + +#ifndef HALF +templateclass MixedsolveOperatorT> +int mixed_solveT(spinor * const P, spinor * const Q, const int max_iter, + double eps, const int rel_prec,const int N, MixedsolveOperatorT& mixedsolveOperator){ + + // source in Q, initial solution 
in P (not yet implemented) + double rk; + int outercount=0; + int totalcount=0; + clock_t start, stop, startinner, stopinner; + double timeelapsed = 0.0; + double sourcesquarenorm; + int iter; + spinor ** solver_field = NULL; + const int nr_sf = 4; + + init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); + + size_t dev_spinsize = 6*VOLUME * sizeof(dev_spinorM(RealT)); // float4 + MixedsolveParameter& mixedsolveParameter=*init_mixedsolve(g_gauge_field); + + // Start timer + assert((start = clock())!=-1); + + rk = square_norm(Q, N, 0); + sourcesquarenorm = rk; // for relative precision + assign(solver_field[0],Q,N); + printf("Initial residue: %.16e\n",rk); + zero_spinor_field(solver_field[1], N);//spin2 = x_k + zero_spinor_field(solver_field[2], N); + printf("The VOLUME is: %d\n",N); + + mixedsolveOperator.checkInit(solver_field[2],solver_field[3],solver_field[0],N); + + + for(iter=0; iter(solver_field[0],mixedsolveParameter.h2d_spin); + + cudaMemcpy(mixedsolveParameter.dev_spinin, mixedsolveParameter.h2d_spin, dev_spinsize, cudaMemcpyHostToDevice); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + // solve in single prec on device + // D p_k = r_k + printf("Entering inner solver\n"); + assert((startinner = clock())!=-1); + totalcount += dev_cg(mixedsolveParameter.dev_gf, mixedsolveParameter.dev_spinin, mixedsolveParameter.dev_spinout, mixedsolveParameter.dev_spin1, mixedsolveParameter.dev_spin2, mixedsolveParameter.dev_spin3, mixedsolveParameter.dev_spin4, mixedsolveParameter.dev_spin5, dev_grid,dev_nn, mixedsolveOperator, sourcesquarenorm, rel_prec, eps); + stopinner = clock(); + timeelapsed = (double) (stopinner-startinner)/CLOCKS_PER_SEC; + printf("Inner solver done\nTime elapsed: %.6e sec\n", timeelapsed); + + + // copy back + cudaMemcpy(mixedsolveParameter.h2d_spin, mixedsolveParameter.dev_spinout, dev_spinsize, cudaMemcpyDeviceToHost); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + 
convert2double_spin(mixedsolveParameter.h2d_spin, solver_field[2]); + + add(solver_field[1],solver_field[1],solver_field[2],N); + // x_(k+1) = x_k + p_k + + outercount ++; + + }// outer loop + + printf("Did NOT reach solver precision of eps=%.2e\n",eps); + //multiply with D^dagger + mixedsolveOperator.checkDeinit(solver_field[1],solver_field[3],P,N); + finalize_mixedsolve(&mixedsolveParameter); + + + stop = clock(); + timeelapsed = (double) (stop-start)/CLOCKS_PER_SEC; + printf("Inversion done in mixed precision.\n Number of iterations in outer solver: %d\n Squared residue: %.16e\n Time elapsed: %.6e sec\n", outercount, rk, timeelapsed); + + finalize_solver(solver_field, nr_sf); + return(-1); +} + + +#include "mixedsolveOperator.cuh" + + +extern "C" int mixed_solve (spinor* const P, spinor* const Q, const int max_iter, double eps, const int rel_prec,const int N) +{ + MixedsolveOperatorDirac mixedsolveOperator(0); + return mixed_solveT(P,Q,max_iter,eps,rel_prec,N,mixedsolveOperator); +} +extern "C" int mixed_solveD(spinor* const P, spinor* const Q, const int max_iter, double eps, const int rel_prec,const int N) +{ + #ifndef USETEXTURE + MixedsolveOperatorDirac mixedsolveOperator(0); + return mixed_solveT(P,Q,max_iter,eps,rel_prec,N,mixedsolveOperator); + #else + printf("WARNING: Using GPU/mixed_solve instead of double precision version."); + return mixed_solve(P,Q,max_iter,eps,rel_prec,N); + #endif +} + +extern "C" int mixed_solve_DiracDaggerDirac (spinor* const P, spinor* const Q, const int max_iter, double eps, const int rel_prec,const int N) +{ + MixedsolveOperatorDiracDaggerDirac mixedsolveOperator; + return mixed_solveT(P,Q,max_iter,eps,rel_prec,N,mixedsolveOperator); +} +extern "C" int mixed_solve_DiracDaggerDiracD(spinor* const P, spinor* const Q, const int max_iter, double eps, const int rel_prec,const int N) +{ + #ifndef USETEXTURE + MixedsolveOperatorDiracDaggerDirac mixedsolveOperator; + return mixed_solveT(P,Q,max_iter,eps,rel_prec,N,mixedsolveOperator); 
+ #else + printf("WARNING: Using GPU/mixed_solve_DiracDaggerDirac instead of double precision version."); + return mixed_solve_DiracDaggerDirac(P,Q,max_iter,eps,rel_prec,N); + #endif +} + +extern "C" int mixed_solve_DiracDaggerDiracDiracDaggerDirac (spinor* const P, spinor* const Q, const int max_iter, double eps, const int rel_prec,const int N) +{ + MixedsolveOperatorDiracDaggerDiracDiracDaggerDirac mixedsolveOperator; + return mixed_solveT(P,Q,max_iter,eps,rel_prec,N,mixedsolveOperator); +} +extern "C" int mixed_solve_DiracDaggerDiracDiracDaggerDiracD(spinor* const P, spinor* const Q, const int max_iter, double eps, const int rel_prec,const int N) +{ + #ifndef USETEXTURE + MixedsolveOperatorDiracDaggerDiracDiracDaggerDirac mixedsolveOperator; + return mixed_solveT(P,Q,max_iter,eps,rel_prec,N,mixedsolveOperator); + #else + printf("WARNING: Using GPU/mixed_solve_DiracDaggerDiracDiracDaggerDirac instead of double precision version."); + return mixed_solve_DiracDaggerDiracDiracDaggerDirac(P,Q,max_iter,eps,rel_prec,N); + #endif +} + + + + + + + + + +template +void benchmark(spinor * const Q,MixedsolveParameter& mixedsolveParameter){ // copies Q to the device, applies the even-odd hopping matrix pair 1000 times and prints the resulting Gflops + + double timeelapsed = 0.0; + double start, stop; // double, not clock_t: under MPI these receive MPI_Wtime() (seconds as double) and a clock_t would truncate them to whole seconds; clock() ticks still fit exactly + int i; + + size_t dev_spinsize = 6*VOLUME/2 * sizeof(dev_spinorM(RealT)); // float4 even-odd ! 
+ convert2REAL4_spin(Q,mixedsolveParameter.h2d_spin); + cudaMemcpy(mixedsolveParameter.dev_spinin, mixedsolveParameter.h2d_spin, dev_spinsize, cudaMemcpyHostToDevice); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + #ifndef MPI + assert((start = clock())!=-1); + #else + start = MPI_Wtime(); + #endif + + + + + #ifdef USETEXTURE + //Bind texture gf + bind_texture_gf(mixedsolveParameter.dev_gf); + #endif + + //Initialize some stuff + printf("mu = %f\n", g_mu); + dev_complex h0,h1,h2,h3,mh0, mh1, mh2, mh3; + h0.re = (REAL)ka0.re; h0.im = -(REAL)ka0.im; + h1.re = (REAL)ka1.re; h1.im = -(REAL)ka1.im; + h2.re = (REAL)ka2.re; h2.im = -(REAL)ka2.im; + h3.re = (REAL)ka3.re; h3.im = -(REAL)ka3.im; + + mh0.re = -(REAL)ka0.re; mh0.im = (REAL)ka0.im; + mh1.re = -(REAL)ka1.re; mh1.im = (REAL)ka1.im; + mh2.re = -(REAL)ka2.re; mh2.im = (REAL)ka2.im; + mh3.re = -(REAL)ka3.re; mh3.im = (REAL)ka3.im; + + // try using constant mem for kappas + cudaMemcpyToSymbol("dev_k0c", &h0, sizeof(h0)) ; + cudaMemcpyToSymbol("dev_k1c", &h1, sizeof(h1)) ; + cudaMemcpyToSymbol("dev_k2c", &h2, sizeof(h2)) ; + cudaMemcpyToSymbol("dev_k3c", &h3, sizeof(h3)) ; + + cudaMemcpyToSymbol("dev_mk0c", &mh0, sizeof(mh0)) ; + cudaMemcpyToSymbol("dev_mk1c", &mh1, sizeof(mh1)) ; + cudaMemcpyToSymbol("dev_mk2c", &mh2, sizeof(mh2)) ; + cudaMemcpyToSymbol("dev_mk3c", &mh3, sizeof(mh3)) ; + + + int blockdim3=BLOCK; + int gridsize; + if( VOLUME/2 >= BLOCK){ + gridsize = (int)(VOLUME/2/BLOCK) + 1; + } + else{ + gridsize=1; + } + printf("gridsize = %d\n", gridsize); + int griddim3=gridsize; + + + he_cg_init<<< 1, 1 >>> (dev_grid, (REAL) g_kappa, (REAL)(g_mu/(2.0*g_kappa)), h0,h1,h2,h3); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + printf("Applying H 1000 times\n"); + for(i=0; i<1000; i++){ + + #ifdef MPI + xchange_field_wrapper(mixedsolveParameter.dev_spinin, 0); + #endif + #ifdef USETEXTURE + bind_texture_spin(mixedsolveParameter.dev_spinin,1); + #endif + //bind_texture_nn(dev_nn_eo); + 
//cudaFuncSetCacheConfig(dev_Hopping_Matrix, cudaFuncCachePreferL1); + dev_Hopping_Matrix <<>> + (mixedsolveParameter.dev_gf, mixedsolveParameter.dev_spinin, mixedsolveParameter.dev_spin_eo1, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); //mixedsolveParameter.dev_spin_eo1 == even -> 0 + //unbind_texture_nn(); + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + + #ifdef MPI + xchange_field_wrapper(mixedsolveParameter.dev_spin_eo1, 0); + #endif + bind_texture_spin(mixedsolveParameter.dev_spin_eo1,1); // NOTE(review): unlike the first bind in this loop, this one is not wrapped in #ifdef USETEXTURE although the matching unbind below is -- confirm whether the guard is missing + //bind_texture_nn(dev_nn_oe); + // cudaFuncSetCacheConfig(dev_Hopping_Matrix, cudaFuncCachePreferL1); + dev_Hopping_Matrix <<>> + (mixedsolveParameter.dev_gf, mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spinin, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + //unbind_texture_nn(); + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + + } + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + printf("Done\n"); + + + + #ifndef MPI + assert((stop = clock())!=-1); + timeelapsed = (double) (stop-start)/CLOCKS_PER_SEC; + // x2 because 2x Hopping per iteration + double benchres = 1608.0*2*(VOLUME/2)* 1000 / timeelapsed / 1.0e9; + printf("Benchmark: %f Gflops\n", benchres); + #else + stop = MPI_Wtime(); + timeelapsed = (double) (stop-start); + // x2 because 2x Hopping per iteration + double benchres = 1608.0*2*(g_nproc*VOLUME/2)* 1000 / timeelapsed / 1.0e9; + if (g_proc_id == 0) { + printf("Benchmark: %f Gflops\n", benchres); + } + #endif + + + + + + #ifdef USETEXTURE + unbind_texture_gf(); + #endif +} + + + +#ifdef MPI +template +void benchmark2(spinor * const Q,MixedsolveParameter& mixedsolveParameter){ // MPI-only benchmark: applies dev_Qtm_pm_psi_mpi back and forth 100 times and prints the resulting Gflops + + double timeelapsed = 0.0; + double start, stop; // double, not clock_t: these receive MPI_Wtime() (seconds as double) and a clock_t would truncate them to whole seconds + int i; + + size_t dev_spinsize = 6*VOLUME/2 * sizeof(dev_spinorM(RealT)); // float4 even-odd ! 
+ convert2REAL4_spin(Q,mixedsolveParameter.h2d_spin); + cudaMemcpy(mixedsolveParameter.dev_spinin, mixedsolveParameter.h2d_spin, dev_spinsize, cudaMemcpyHostToDevice); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + #ifndef MPI + assert((start = clock())!=-1); + #else + start = MPI_Wtime(); + #endif + + + + + #ifdef USETEXTURE + //Bind texture gf + bind_texture_gf(mixedsolveParameter.dev_gf); + #endif + + //Initialize some stuff + printf("mu = %f\n", g_mu); + dev_complex h0,h1,h2,h3,mh0, mh1, mh2, mh3; + h0.re = (REAL)ka0.re; h0.im = -(REAL)ka0.im; + h1.re = (REAL)ka1.re; h1.im = -(REAL)ka1.im; + h2.re = (REAL)ka2.re; h2.im = -(REAL)ka2.im; + h3.re = (REAL)ka3.re; h3.im = -(REAL)ka3.im; + + mh0.re = -(REAL)ka0.re; mh0.im = (REAL)ka0.im; + mh1.re = -(REAL)ka1.re; mh1.im = (REAL)ka1.im; + mh2.re = -(REAL)ka2.re; mh2.im = (REAL)ka2.im; + mh3.re = -(REAL)ka3.re; mh3.im = (REAL)ka3.im; + + // try using constant mem for kappas + cudaMemcpyToSymbol("dev_k0c", &h0, sizeof(h0)) ; + cudaMemcpyToSymbol("dev_k1c", &h1, sizeof(h1)) ; + cudaMemcpyToSymbol("dev_k2c", &h2, sizeof(h2)) ; + cudaMemcpyToSymbol("dev_k3c", &h3, sizeof(h3)) ; + + cudaMemcpyToSymbol("dev_mk0c", &mh0, sizeof(mh0)) ; + cudaMemcpyToSymbol("dev_mk1c", &mh1, sizeof(mh1)) ; + cudaMemcpyToSymbol("dev_mk2c", &mh2, sizeof(mh2)) ; + cudaMemcpyToSymbol("dev_mk3c", &mh3, sizeof(mh3)) ; + + + int blockdim3=BLOCK; + int gridsize; + if( VOLUME/2 >= BLOCK){ + gridsize = (int)(VOLUME/2/BLOCK) + 1; + } + else{ + gridsize=1; + } + printf("gridsize = %d\n", gridsize); + int griddim3=gridsize; + + + int blockdim4 = BLOCK2; + if( VOLUME/2 % blockdim4 == 0){ + gridsize = (int) VOLUME/2/blockdim4; + } + else{ + gridsize = (int) VOLUME/2/blockdim4 + 1; + } + int griddim4 = gridsize; + + + + he_cg_init<<< 1, 1 >>> (dev_grid, (REAL) g_kappa, (REAL)(g_mu/(2.0*g_kappa)), h0,h1,h2,h3); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + printf("Applying dev_Qtm_pm_psi 100 times\n"); + + for(i=0; i<100; i++){ + + 
+ dev_Qtm_pm_psi_mpi(mixedsolveParameter.dev_spinin, mixedsolveParameter.dev_spin_eo1, griddim3,blockdim3, griddim4, blockdim4); + + dev_Qtm_pm_psi_mpi(mixedsolveParameter.dev_spin_eo1, mixedsolveParameter.dev_spinin, griddim3,blockdim3, griddim4, blockdim4); + + } + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + printf("Done\n"); + + + + #ifndef MPI + assert((stop = clock())!=-1); + timeelapsed = (double) (stop-start)/CLOCKS_PER_SEC; + // x8 because 8x Hopping per iteration + double benchres = 1608.0*8*(VOLUME/2)* 100 / timeelapsed / 1.0e9; + printf("Benchmark: %f Gflops\n", benchres); + #else + stop = MPI_Wtime(); + timeelapsed = (double) (stop-start); + // 8 because 8x Hopping per iteration + double benchres = 1608.0*8*(g_nproc*VOLUME/2)* 100 / timeelapsed / 1.0e9; + if (g_proc_id == 0) { + printf("Benchmark: %f Gflops\n", benchres); + } + #endif + + + + + + #ifdef USETEXTURE + unbind_texture_gf(); + #endif +} + +#endif + + + + + + + + + +#else +extern "C" int mixed_solve (spinor * const P, spinor * const Q, const int max_iter, + double eps, const int rel_prec,const int N){ + printf("WARNING dummy function mixed_solve called\n"); + return(0); +} + +#endif +// WORK TO DO: +// Separate half and non-half inner solvers in a more transparent way!! + + + + +template +int mixed_solve_eoT (spinor * const P, spinor * const Q, const int max_iter, + double eps, const int rel_prec, const int N){ + + // source in Q, initial solution in P (not yet implemented) + double rk; + int outercount=0; + int totalcount=0; + clock_t start, stop, startinner, stopinner; + double timeelapsed = 0.0; + double sourcesquarenorm; + int iter;//never referenced: , retval; + spinor ** solver_field = NULL; + const int nr_sf = 4; + + init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); + size_t dev_spinsize; + #ifndef HALF + dev_spinsize = 6*VOLUME/2 * sizeof(dev_spinorM(RealT)); // float4 even-odd ! + #else + dev_spinsize = 6*VOLUME/2 * sizeof(dev_spinor_half); //short4 eo ! 
+ size_t dev_normsize = VOLUME/2 * sizeof(float); + #endif + MixedsolveParameter& mixedsolveParameter=*init_mixedsolve_eo(g_gauge_field); + + + /* + #ifndef HALF + // small benchmark + assign(solver_field[0],Q,N); + #ifndef MPI + benchmark(solver_field[0]); + #else + benchmark2(solver_field[0]); + #endif + // end small benchmark + + //exit(100); + + #endif //not HALF + */ + + + // Start timer + assert((start = clock())!=-1); + rk = square_norm(Q, N, 1); + sourcesquarenorm=rk; // for relative prec + double finaleps; + if(rel_prec == 1){ + finaleps = eps * sourcesquarenorm; + } + else{ + finaleps = eps; + } + assign(solver_field[0],Q,N); + printf("Initial residue: %.16e\n",rk); + zero_spinor_field(solver_field[1], N);//spin2 = x_k + zero_spinor_field(solver_field[2], N); + printf("The VOLUME/2 is: %d\n",N); + + + double norm = sqrt(_spinor_prod_re(Q[0],Q[0])); + printf("norm source(0): %f\n", norm); + + + //#include "test.sqz" + +for(iter=0; iter(solver_field[0],mixedsolveParameter.h2d_spin); + #else + convert2REAL4_spin_half(solver_field[0],mixedsolveParameter.h2d_spin, mixedsolveParameter.h2d_spin_norm); + #endif + cudaMemcpy(mixedsolveParameter.dev_spinin, mixedsolveParameter.h2d_spin, dev_spinsize, cudaMemcpyHostToDevice); + + // also copy half spinor norm + #ifdef HALF + cudaMemcpy(mixedsolveParameter.dev_spinin_norm, mixedsolveParameter.h2d_spin_norm, dev_normsize, cudaMemcpyHostToDevice); + #endif + + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + // solve in single prec on device + // D p_k = r_k + printf("Entering inner solver\n"); + assert((startinner = clock())!=-1); + #ifndef HALF + totalcount += dev_cg_eo(mixedsolveParameter.dev_gf, mixedsolveParameter.dev_spinin, mixedsolveParameter.dev_spinout, mixedsolveParameter.dev_spin1, mixedsolveParameter.dev_spin2, mixedsolveParameter.dev_spin3, mixedsolveParameter.dev_spin4, mixedsolveParameter.dev_spin5, dev_grid,dev_nn, (RealT) finaleps, mixedsolveParameter); + #else + totalcount += 
dev_cg_eo_half(mixedsolveParameter.dev_gf, + mixedsolveParameter.dev_spinin, mixedsolveParameter.dev_spinin_norm, + mixedsolveParameter.dev_spinout,mixedsolveParameter.dev_spinout_norm, + mixedsolveParameter.dev_spin1, mixedsolveParameter.dev_spin1_norm, + mixedsolveParameter.dev_spin2, mixedsolveParameter.dev_spin2_norm, + mixedsolveParameter.dev_spin3, mixedsolveParameter.dev_spin3_norm, + mixedsolveParameter.dev_spin4, mixedsolveParameter.dev_spin4_norm, + mixedsolveParameter.dev_spin5, mixedsolveParameter.dev_spin5_norm, + dev_grid,dev_nn, (RealT) finaleps, + mixedsolveParameter); + #endif + stopinner = clock(); + timeelapsed = (double) (stopinner-startinner)/CLOCKS_PER_SEC; + printf("Inner solver done\nTime elapsed: %.6e sec\n", timeelapsed); + + // copy back + cudaMemcpy(mixedsolveParameter.h2d_spin, mixedsolveParameter.dev_spinout, dev_spinsize, cudaMemcpyDeviceToHost); + #ifdef HALF + cudaMemcpy(mixedsolveParameter.h2d_spin_norm, mixedsolveParameter.dev_spinout_norm, dev_normsize, cudaMemcpyDeviceToHost); + #endif + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + + #ifndef HALF + convert2double_spin(mixedsolveParameter.h2d_spin, solver_field[2]); + #else + convert2double_spin_half(mixedsolveParameter.h2d_spin, mixedsolveParameter.h2d_spin_norm, solver_field[2]); + #endif + + // x_(k+1) = x_k + p_k + add(solver_field[1],solver_field[1],solver_field[2],N); + + outercount ++; +}// outer loop + + printf("Did NOT reach solver precision of eps=%.2e\n",eps); + //multiply with Qtm_minus_psi (for non gpu done in invert_eo.c) + Qtm_minus_psi(solver_field[3], solver_field[1]); + assign(P, solver_field[3], N); + + + assert((stop = clock())!=-1); + timeelapsed = (double) (stop-start)/CLOCKS_PER_SEC; + printf("Inversion done in mixed precision.\n Number of iterations in outer solver: %d\n Squared residue: %.16e\n Time elapsed: %.6e sec\n", outercount, rk, timeelapsed); + + finalize_mixedsolve(&mixedsolveParameter); + finalize_solver(solver_field, nr_sf); + 
return(-1); +} + +extern "C" int mixed_solve_eo (spinor * const P, spinor * const Q, const int max_iter, double eps, const int rel_prec, const int N) +{ return mixed_solve_eoT(P,Q,max_iter,eps,rel_prec,N); }; +#ifndef HALF + #ifndef USETEXTURE + extern "C" int mixed_solve_eoD (spinor * const P, spinor * const Q, const int max_iter, double eps, const int rel_prec, const int N) + { return mixed_solve_eoT(P,Q,max_iter,eps,rel_prec,N); } + #endif +#endif + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/mixed_solve.h b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/mixed_solve.h new file mode 100644 index 0000000000000000000000000000000000000000..2735116c437252049db6f37390728fe82affb81f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/mixed_solve.h @@ -0,0 +1,23 @@ +#ifndef _MIXED_SOLVE_H_ + +void initnn(); + +extern "C" int mixed_solve (spinor * const P, spinor * const Q, const int max_iter, + double eps,const int rel_prec, const int N); +extern "C" int mixed_solveD (spinor * const P, spinor * const Q, const int max_iter, + double eps,const int rel_prec, const int N); + +extern "C" int mixed_solve_eo (spinor * const P, spinor * const Q, const int max_iter, + double eps, const int rel_prec, const int N); +extern "C" int mixed_solve_eoD (spinor * const P, spinor * const Q, const int max_iter, + double eps, const int rel_prec, const int N); + + +extern "C" int bind_texture_spin(dev_spinor* s, int i); +extern "C" int unbind_texture_spin(int i); + +extern "C" int bind_texture_nn(int* nn); +extern "C" int unbind_texture_nn(); + +#define _MIXED_SOLVE_H_ +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/mixed_solve_eo_nd.cuh b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/mixed_solve_eo_nd.cuh new file mode 100644 index 0000000000000000000000000000000000000000..fdeac4b5f25594c36cdd8de790633f01f65faab6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/mixed_solve_eo_nd.cuh @@ -0,0 +1,4299 @@ 
+/************************************************************************** + * + * Copyright (C) 2010 Joseph Nagel + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + ************************************************************************** + * + * inspired by: Florian Burger + * Carsten Urbach + * + **************************************************************************/ + + + + + ////////////////////////////////////////////////////////////////// + // // + // this is the implementation of the EO, ND mixed solver // + // // + ////////////////////////////////////////////////////////////////// + + ////////////////////////////////////////////////////////////////// + // // + // and the MPI implementation of the EO, ND mixed solver // + // // + // PARALLELT parallelization // + // no _GAUGE_COPY and no _USE_HALFSPINOR // + // // + ////////////////////////////////////////////////////////////////// + + + + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + + +// the debugging functions can be included here via: #include "./DEBUG/MATRIX_DEBUG.cuh" +// #include "./DEBUG/MATRIX_MPI_DEBUG.cuh" + + + 
+/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + + +#ifdef HAVE_CONFIG_H + #include <config.h> +#endif + +extern "C" { +#include "../Nondegenerate_Matrix.h" +#include "../Hopping_Matrix.h" +#include "../solver/cg_her_nd.h" +#include "../solver/solver_field.h" +} +#include "../global.h" + +#ifdef MPI + #undef MPI + #undef REAL + #include <mpi.h> + #define MPI + #define REAL float +#endif + + + + + + + +// global formal parameters +size_t dev_gfsize; +size_t dev_spinsize_int; // making the structure transparent: +int N_sites_int; // _int: internal sites +int N_floats_int; // _ext: internal sites + additional boundaries +#ifdef MPI + size_t dev_spinsize_ext; + int N_sites_ext; + int N_floats_ext; +#endif + + +// spinor fields (pointing to device) +dev_spinor * dev_spin1_up; // auxiliary fields for cg_eo_nd() +dev_spinor * dev_spin1_dn; +dev_spinor * dev_spin2_up; +dev_spinor * dev_spin2_dn; +dev_spinor * dev_spin3_up; +dev_spinor * dev_spin3_dn; +/* +dev_spinor * dev_spin4_up; +dev_spinor * dev_spin4_dn; +dev_spinor * dev_spin5_up; +dev_spinor * dev_spin5_dn; +*/ + +dev_spinor * dev_spinin_up; // host/device interaction // mixedsolve_eo_nd() <--> cg_eo_nd() +dev_spinor * dev_spinin_dn; // inner/outer interaction +dev_spinor * dev_spinout_up; +dev_spinor * dev_spinout_dn; + +dev_spinor * h2d_spin_up; // for transferring in double precision on host to single precision on device (pointing to host) +dev_spinor * h2d_spin_dn; + +dev_spinor * dev_spin_eo1_up; // auxiliary for matrix_multiplication32() called by dev_cg_eo_nd() +dev_spinor * dev_spin_eo1_dn; +dev_spinor * dev_spin_eo2_up; +dev_spinor * dev_spin_eo2_dn; +dev_spinor * dev_spin_eo3_up; +dev_spinor * dev_spin_eo3_dn; + + +// physical parameters (on device) +__device__ float mubar, epsbar; + + +#ifdef MPI 
// collecting variables for the MPI implementation + // put to mixed_solve.cu + /* + __device__ int dev_RAND; // not used, maybe later ... + __device__ int dev_VOLUMEPLUSRAND; // is now used in dev_Hopping_Matrix_mpi() + __device__ int dev_rank; // was for the moment put to mixed_solve.cu ... + __device__ int dev_nproc; + */ + + int * iseven; + int * dev_g_iup; + int * dev_g_idn; + int * dev_g_lexic2eo; + int * dev_g_lexic2eosub; + int * dev_g_eo2lexic; + int * dev_g_ipt; + + #ifdef HOPPING_DEBUG + spinor * spinor_debug_in; // for Hopping_Matrix_wrapper() + spinor * spinor_debug_out; // for Hopping_Matrix_wrapper() + #endif + + + #if ASYNC > 0 + #ifdef ASYNC_TIMING + cudaEvent_t start_ALL; // CUDA events for timing and profiling + cudaEvent_t stop_ALL; + cudaEvent_t stop_D2H_1; + cudaEvent_t stop_D2H_2; + cudaEvent_t stop_INT_0; + cudaEvent_t stop_H2D_3; + cudaEvent_t stop_H2D_4; + cudaEvent_t stop_EXT_1; + cudaEvent_t stop_EXT_2; + float time_stop_D2H_1; // CUDA times in milliseconds + float time_stop_D2H_2; + float time_stop_INT_0; + float time_stop_H2D_3; + float time_stop_H2D_4; + float time_stop_EXT_1; + float time_stop_EXT_2; + float time_stop_ALL; + double mpi_start_ALL; // MPI times with arbitrary zero-point for timing and profiling + double mpi_start_sendrecv_1; + double mpi_stop_sendrecv_1; + double mpi_start_sendrecv_2; + double mpi_stop_sendrecv_2; + double mpiTime_start_sendrecv_1; // MPI times in seconds + double mpiTime_stop_sendrecv_1; + double mpiTime_start_sendrecv_2; + double mpiTime_stop_sendrecv_2; + #endif + #endif + +#endif + + + + + +//#include "communication.cuh" +//#include "index_fields.cuh" + + + + + + +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// +/////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + + + + + + + /////////////////////////// + // // + // 
INITIALIZATIONS // + // // + /////////////////////////// + + + + +//////////////////// +// GPU parameters // +//////////////////// + + +// puts the additional nd parameters mubar and epsbar on the device +__global__ void he_cg_init_nd_additional (float param_mubar, float param_epsbar) { + + mubar = param_mubar; + epsbar = param_epsbar; + +} + + + + + + +#ifdef MPI + +// puts the additional variables VOLUMEPLUSRAND and RAND as well as the process rank and nproc on the device (dev_VOLUMEPLUSRAND, dev_RAND, dev_rank, dev_nproc) +__global__ void he_cg_init_nd_additional_mpi (int param_VOLUMEPLUSRAND, int param_RAND, int rank, int nproc) { + + dev_VOLUMEPLUSRAND = param_VOLUMEPLUSRAND; + dev_RAND = param_RAND; + + dev_rank = rank; + dev_nproc = nproc; + +} + +#endif + + + + + + +///////////////////////////////////////////// +// geometry- and nearest-neighbour indices // +///////////////////////////////////////////// + + +#ifdef MPI + +// builds an array iseven[global position] to check whether a site is even (1) or odd (0); the t-loop runs from -1 to T and the parity includes the g_proc_coords offsets + +void init_iseven() { + + int x0, x1, x2, x3; + int ix; + + for (x0 = -1; x0 < T+1; x0++) { + for (x1 = 0; x1 < LX; x1++) { + for (x2 = 0; x2 < LY; x2++) { + for (x3 = 0; x3 < LZ; x3++) { + + ix = Index(x0, x1, x2, x3); + + if ((x0 + x1 + x2 + x3 + g_proc_coords[0]*T + g_proc_coords[1]*LX + + g_proc_coords[2]*LY + g_proc_coords[3]*LZ) % 2 == 0) { + iseven[ix] = 1; + } + else { + iseven[ix] = 0; + } + + }}}} + +} + + + + + + +// initialize nearest-neighbour table for gpu with even-odd enabled + +void init_nnspinor_eo_mpi() { + + int x, y, z, t, ind, nnpos, j; // mixed_solve_eo(...) allocates 8 integers per even or odd lattice site: size_t nnsize = 8*VOLUME*sizeof(int); + + for (t = 0; t < T; t++) { // loop goes over all INTERN lattice sites !! + for (x = 0; x < LX; x++) { // doesn't refer to any EXTERN BOUNDARIES !! -> CORRESPONDS TO THE WHOLE LATTICE (I.E. WHEN NO SUBLATTICES ARE ASSIGNED) !! 
+ for (y = 0; y < LY; y++) { // because of the behaviour of g_iup[][] in the non-parallel case + for (z = 0; z < LZ; z++) { + // NOTICE: g_ipt, g_iup, g_idn, and g_lexic2eosub refer to pos. of lin. proj. pos. of the lattice + ind = g_ipt[t][x][y][z]; // g_ipt[t][x][y][z] returns the linearly projected position of (t,x,y,z) of the lattice + // indexes computed in geometry_eo() from geometry_eo.c + // memory for the index array allocated by init_geometry_indices() from init_geometry_indices.c + if ((t+x+y+z)%2 == 0) { // EVEN + //if ((t + x + y + z + g_proc_coords[0]*T + g_proc_coords[1]*LX + + // g_proc_coords[2]*LY + g_proc_coords[3]*LZ) % 2 == 0) { + + nnpos = g_lexic2eosub[ind]; // g_lexic2eosub[ind] returns the position of [ind] in the sub-eo-notation + //////////////// + for (j = 0; j < 4; j++) { // plus direction // here are also the // BOUNDARIES // included and properly mapped: + // //////////////// + nn_eo[8*nnpos+j] = g_lexic2eosub[ g_iup[ind][j] ]; // g_iup[ind][j] returns the position of the nearest neighbour of [ind] in direction +[j] + } // --> for the non-parallized code g_iup[][] maps INTERN !! + for (j = 0; j < 4; j++) { // minus direction + nn_eo[8*nnpos+4+j] = g_lexic2eosub[ g_idn[ind][j] ]; // g_idn[ind][j] returns the position of the nearest neighbour of [ind] in direction -[j] + } + } + + else { // ODD + + nnpos = g_lexic2eosub[ind]; + + for (j = 0; j < 4; j++) { // plus direction + nn_oe[8*nnpos+j] = g_lexic2eosub[ g_iup[ind][j] ]; // nn_oe will return the nearest neigbours + } // nn_eo and nn_oe strictly refer to the 4d-spacetime lattice + + for (j = 0; j < 4; j++) { // minus direction + nn_oe[8*nnpos+4+j] = g_lexic2eosub[ g_idn[ind][j] ]; + } + } + }}}} // for loops +} + + + + + + +// the following functions can all be used to properly initialize the fields eoidx_even[] and eoidx_odd[] for addressing the gauge fields: + + +void init_idxgauge_mpi() { // works! 
+ + int t, x, y, z; + int pos_eo, pos_global; + + for (t = -1; t < T+1; t++) { + for (x = 0; x < LX; x++) { + for (y = 0; y < LY; y++) { + for (z = 0; z < LZ; z++) { + + //pos_global = g_ipt[t][x][y][z]; + pos_global = Index(t,x,y,z); + pos_eo = g_lexic2eosub[pos_global]; + + //if ((t+x+y+z)%2 == 0) { // EVEN + if ((t + x + y + z + g_proc_coords[0]*T + g_proc_coords[1]*LX + + g_proc_coords[2]*LY + g_proc_coords[3]*LZ) % 2 == 0) { + eoidx_even[pos_eo] = g_eo2lexic[pos_eo]; + } + else { // ODD + eoidx_odd[pos_eo] = g_eo2lexic[(VOLUME+RAND)/2+pos_eo]; + } + }}}} // for loop over the INTERN lattice + + //printf("This was init_idxgauge_mpi().\n"); + +} + + + +/* +void init_idxgauge_mpi() { // works! + + int t, x, y, z; + int pos_eo, pos_global; + + for (t = -1; t < T+1; t++) { + for (x = 0; x < LX; x++) { + for (y = 0; y < LY; y++) { + for (z = 0; z < LZ; z++) { + + //pos_global = g_ipt[t][x][y][z]; + pos_global = Index(t,x,y,z); + pos_eo = g_lexic2eosub[pos_global]; + + //if ((t+x+y+z)%2 == 0) { // EVEN + if ((t + x + y + z + g_proc_coords[0]*T + g_proc_coords[1]*LX + + g_proc_coords[2]*LY + g_proc_coords[3]*LZ) % 2 == 0) { + eoidx_even[pos_eo] = pos_global; + } + else { // ODD + eoidx_odd[pos_eo] = pos_global; + } + }}}} // for loop over the INTERN lattice + + //printf("This was init_idxgauge_mpi().\n"); + +} +*/ + + +/* +void init_idxgauge_mpi() { // works! + + int pos_eo, pos_global_even, pos_global_odd; + + for (pos_eo = 0; pos_eo < (VOLUME+RAND)/2; pos_eo++) { + // even + pos_global_even = g_eo2lexic[pos_eo]; + eoidx_even[pos_eo] = pos_global_even; + // odd + pos_global_odd = g_eo2lexic[(VOLUME+RAND)/2 + pos_eo]; + eoidx_odd[pos_eo] = pos_global_odd; + } + + //printf("This was init_idxgauge_mpi().\n"); + +} +*/ + + +/* +void init_idxgauge_mpi() { // works! 
+ + int pos_eo, pos_global; + + for (pos_global = 0; pos_global < (VOLUME+RAND); pos_global++) { + + pos_eo = g_lexic2eosub[pos_global]; + + if (iseven[pos_global] == 1) { + //if (pos_global%2 == 0) { + eoidx_even[pos_eo] = pos_global; + } + else { + eoidx_odd[pos_eo] = pos_global; + } + + } + + //printf("This was init_idxgauge_mpi().\n"); + +} +*/ + + +/* +void init_idxgauge_mpi() { // works! + + int x, y, z, t; + int ind; + int evenpos = 0; + int oddpos = 0; + + for (t = 0; t < T; t++) { + for (x = 0; x < LX; x++) { + for (y = 0; y < LY; y++) { + for (z = 0; z < LZ; z++) { + ind = g_ipt[t][x][y][z]; + if ((t+x+y+z) % 2 == 0) { + eoidx_even[evenpos] = ind; + evenpos++; + } + else { + eoidx_odd[oddpos] = ind; + oddpos++; + } + }}}} // INTERN + + + t = T; + for (x = 0; x < LX; x++) { + for (y = 0; y < LY; y++) { + for (z = 0; z < LZ; z++) { + ind = VOLUME + z + LZ*y + LZ*LY*x; + //if (iseven[ind] == 1) { + if ((t+x+y+z) % 2 == 0) { + eoidx_even[evenpos] = ind; + evenpos++; + } + else { + eoidx_odd[oddpos] = ind; + oddpos++; + } + }}} // EXTERN + + + t = -1; + for (x = 0; x < LX; x++) { + for (y = 0; y < LY; y++) { + for (z = 0; z < LZ; z++) { + ind = VOLUME + LX*LY*LZ + z + LZ*y + LZ*LY*x; + //if (iseven[ind] == 1) { + if ((t+x+y+z) % 2 == 0) { + eoidx_even[evenpos] = ind; + evenpos++; + } + else { + eoidx_odd[oddpos] = ind; + oddpos++; + } + }}} // EXTERN + + //printf("This was init_idxgauge_mpi().\n"); + +} +*/ + + +#endif // MPI + + + + + + +void set_global_sizes() { + + #ifndef MPI + #ifdef GF_8 + // allocate 8 floats for gf = 2*4*VOLUME float4's // dev_su3_8 = float4 + dev_gfsize = 4*VOLUME * 2*sizeof(dev_su3_8); // allocates for each lattice site and for 4 directions 2*float4 = 8 floats = 8 real parameters + #else + // allocate 2 rows of gf = 3*4*VOLUME float4's // dev_su3_2v = float4 + dev_gfsize = 4*VOLUME * 3*sizeof(dev_su3_2v); // allocates for each lattice site and for 4 directions 3*float4 = 12 floats = 2 rows of complex 3-vectors + #endif + #else + 
#ifdef GF_8 // dev_su3_8 = float4 + dev_gfsize = 4*(VOLUME+RAND) * 2*sizeof(dev_su3_8); // allocates for each lattice site and RAND for 4 directions 2*float4 = 8 floats = 8 real parameters + #else // dev_su3_2v = float4 + dev_gfsize = 4*(VOLUME+RAND) * 3*sizeof(dev_su3_2v); // allocates for each lattice site and RAND for 4 directions 3*float4 = 12 floats = 2 rows of complex 3-vectors + #endif + #endif + + dev_spinsize_int = 6*VOLUME/2*sizeof(dev_spinor); // 24 floats per lattice site + N_sites_int = VOLUME/2; + N_floats_int = 24*VOLUME/2; + #ifdef MPI + dev_spinsize_ext = 6*(VOLUME+RAND)/2*sizeof(dev_spinor); + N_sites_ext = (VOLUME+RAND)/2; + N_floats_ext = 24*(VOLUME+RAND)/2; + #endif + +} + + + + + + +//////////////// +// ALLOCATING // +//////////////// + +// initializes and allocates all quantities for the mixed solver +// more precise: +// puts the gauge field on device as "2 rows" or "8 floats" per SU(3)-matrix +// allocates memory for all spinor fields +// puts the nn- and eoidx-fields on device memory + +void init_mixedsolve_eo_nd (su3** gf) { // gf is the full gauge field + + + + + typedef REAL RealT; + + ////////////////////// + // GLOBAL VARIABLES // + ////////////////////// + + /* + #ifndef MPI + #ifdef GF_8 + // allocate 8 floats for gf = 2*4*VOLUME float4's // dev_su3_8 = float4 + dev_gfsize = 4*VOLUME * 2*sizeof(dev_su3_8); // allocates for each lattice site and for 4 directions 2*float4 = 8 floats = 8 real parameters + #else + // allocate 2 rows of gf = 3*4*VOLUME float4's // dev_su3_2v = float4 + dev_gfsize = 4*VOLUME * 3*sizeof(dev_su3_2v); // allocates for each lattice site and for 4 directions 3*float4 = 12 floats = 2 rows of complex 3-vectors + #endif + #else + #ifdef GF_8 // dev_su3_8 = float4 + dev_gfsize = 4*(VOLUME+RAND) * 2*sizeof(dev_su3_8); // allocates for each lattice site and RAND for 4 directions 2*float4 = 8 floats = 8 real parameters + #else // dev_su3_2v = float4 + dev_gfsize = 4*(VOLUME+RAND) * 3*sizeof(dev_su3_2v); // allocates 
for each lattice site and RAND for 4 directions 3*float4 = 12 floats = 2 rows of complex 3-vectors + #endif + #endif + + dev_spinsize_int = 6*VOLUME/2*sizeof(dev_spinor); // 24 floats per lattice site + N_sites_int = VOLUME/2; + N_floats_int = 24*VOLUME/2; + #ifdef MPI + dev_spinsize_ext = 6*(VOLUME+RAND)/2*sizeof(dev_spinor); + N_sites_ext = (VOLUME+RAND)/2; + N_floats_ext = 24*(VOLUME+RAND)/2; + #endif + */ + + set_global_sizes(); + + + + ///////////////////// + // LOCAL VARIABLES // + ///////////////////// + + cudaError_t cudaerr; // CUDA errors + int ndev; // number of devices + //size_t dev_gfsize; // size of the gauge-field on device memory // put to global // non-MPI: VOLUME/2 // MPI: (VOLUME+RAND)/2 + size_t nnsize; // size of memory for nn-table + size_t idxsize; // size of memory for even/odd-positions + //size_t dev_spinsize; // size of memory for spinors // put to global + int grid[5]; // array for grid specifications + float * host_output; // ?? + + + + + // get number of devices + + if (havedevice == 0) { + + ndev = find_devices(); + if (ndev == 0) { + fprintf(stderr, "Error: no CUDA devices found. Aborting...\n"); + exit(300); + } + + #ifndef MPI + // only if device_num is not the default (-1) + if(device_num > -1){ + // try to set active device to device_num given in input file + if (device_num < ndev) { + printf("Setting active device to: %d\n", device_num); + cudaSetDevice(device_num); + } + else { + fprintf(stderr, "Error: There is no CUDA device with No. %d. Aborting...\n",device_num); + exit(301); + } + if ((cudaerr=cudaGetLastError())!=cudaSuccess) { + printf("Error in init_mixedsolve_eo_nd(): Could not set active device. Aborting...\n"); + exit(302); + } + } + else{ + printf("Not setting any active device. 
Let the driver choose.\n"); + } + #else + #ifndef DEVICE_EQUAL_RANK + // try to set active device to device_num given in input file + // each process gets bounded to the same GPU + if (device_num < ndev) { + printf("Process %d of %d: Setting active device to: %d\n", g_proc_id, g_nproc, device_num); + //cudaSetDevice(device_num); + } + else { + fprintf(stderr, "Process %d of %d: Error: There is no CUDA device with No. %d. Aborting...\n", g_proc_id, g_nproc, device_num); + exit(301); + } + #else + // device number = mpi rank + if (g_cart_id < ndev) { + printf("Process %d of %d: Setting active device to: %d\n", g_proc_id, g_nproc, g_cart_id); + //cudaSetDevice(g_cart_id); + } + else { + fprintf(stderr, "Process %d of %d: Error: There is no CUDA device with No. %d. Aborting...\n", g_proc_id, g_nproc, g_cart_id); + exit(301); + } + #endif + if ((cudaerr=cudaGetLastError()) != cudaSuccess) { + printf("Process %d of %d: Error in init_mixedsolve_eo_nd(): Could not set active device. Aborting...\n", g_proc_id, g_nproc); + exit(302); + } + #endif + + havedevice = 1; + } + + + + + // output + #ifdef MPI + if (g_cart_id == 0) { + #endif + + #ifdef USETEXTURE + printf("Using texture references.\n"); + #else + printf("NOT using texture references.\n"); + #endif + + #ifdef GF_8 + printf("Using GF 8 reconstruction.\n"); + #else + printf("Using GF 12 reconstruction.\n"); + #endif + + #ifdef MPI + } + #endif + + + + + ///////////////// + // GAUGE FIELD // + ///////////////// + + /* // put to global + #ifndef MPI + #ifdef GF_8 + // allocate 8 floats for gf = 2*4*VOLUME float4's // dev_su3_8 = float4 + dev_gfsize = 4*VOLUME * 2*sizeof(dev_su3_8); // allocates for each lattice site and for 4 directions 2*float4 = 8 floats = 8 real parameters + #else + // allocate 2 rows of gf = 3*4*VOLUME float4's // dev_su3_2v = float4 + dev_gfsize = 4*VOLUME * 3*sizeof(dev_su3_2v); // allocates for each lattice site and for 4 directions 3*float4 = 12 floats = 2 rows of complex 3-vectors + #endif + 
#else + #ifdef GF_8 // dev_su3_8 = float4 + dev_gfsize = 4*(VOLUME+RAND) * 2*sizeof(dev_su3_8); // allocates for each lattice site and RAND for 4 directions 2*float4 = 8 floats = 8 real parameters + #else // dev_su3_2v = float4 + dev_gfsize = 4*(VOLUME+RAND) * 3*sizeof(dev_su3_2v); // allocates for each lattice site and RAND for 4 directions 3*float4 = 12 floats = 2 rows of complex 3-vectors + #endif + #endif + */ + + + if ( (cudaerr = cudaMalloc((void **) &MixedsolveParameter::getGlobalP()->dev_gf, dev_gfsize)) != cudaSuccess ) { // allocates memory for the gauge field MixedsolveParameter::getGlobalP()->dev_gf on device + printf("Error in init_mixedsolve_eo_nd(): Memory allocation of gauge field failed. Aborting...\n"); + printf("%s\n", cudaGetErrorString(cudaGetLastError())); + exit(200); + } + else { + #ifndef MPI + printf("Allocated memory for gauge field on device.\n"); + #else + if (g_cart_id == 0) printf("Allocated memory for gauge gauge field on devices.\n"); + #endif + } + + + #ifdef GF_8 + MixedsolveParameter::getGlobalP()->h2d_gf = (dev_su3_8 *) malloc(dev_gfsize); // allocates on host + su3to8(gf, MixedsolveParameter::getGlobalP()->h2d_gf); // MixedsolveParameter::getGlobalP()->h2d_gf is the gauge field gf with the 8-real-parameter-representation (according to M. Clark, p. 28) + #else + MixedsolveParameter::getGlobalP()->h2d_gf = (dev_su3_2v *) malloc(dev_gfsize); // allocates on host + su3to2vf4(gf, MixedsolveParameter::getGlobalP()->h2d_gf); // MixedsolveParameter::getGlobalP()->h2d_gf is the gauge field gf with the first two rows stored + #endif + + cudaMemcpy(MixedsolveParameter::getGlobalP()->dev_gf, MixedsolveParameter::getGlobalP()->h2d_gf, dev_gfsize, cudaMemcpyHostToDevice); + // MixedsolveParameter::getGlobalP()->dev_gf = MixedsolveParameter::getGlobalP()->h2d_gf on device memory + + // debug // CUDA + #ifdef CUDA_DEBUG + #ifndef MPI + CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). 
Copying MixedsolveParameter::getGlobalP()->dev_gf to device failed.", "Copied MixedsolveParameter::getGlobalP()->dev_gf to device."); + #else + CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Copying MixedsolveParameter::getGlobalP()->dev_gf to device failed.", "Copied MixedsolveParameter::getGlobalP()->dev_gf to devices."); + #endif + #endif + + + + + ////////// + // GRID // + ////////// + + nnsize = 8*VOLUME*sizeof(int); // size of memory for 8*VOLUME integers + nn = (int *) malloc(nnsize); // allocate this memory on host + nn_eo = (int *) malloc(nnsize/2); // allocate half this memory + nn_oe = (int *) malloc(nnsize/2); // allocate half this memory + cudaMalloc((void **) &dev_nn, nnsize); // memory on device + cudaMalloc((void **) &dev_nn_eo, nnsize/2); // half the memory on device + cudaMalloc((void **) &dev_nn_oe, nnsize/2); // half the memory on device + + + #ifndef MPI + idxsize = VOLUME/2*sizeof(int); // size of memory necessary for VOLUME/2 integers + #else + idxsize = (VOLUME+RAND)/2*sizeof(int); + #endif + eoidx_even = (int *) malloc(idxsize); // allocate on host + eoidx_odd = (int *) malloc(idxsize); // allocate on host + cudaMalloc((void **) &dev_eoidx_even, idxsize); // allocate on device + cudaMalloc((void **) &dev_eoidx_odd, idxsize); // allocate on device + + + #ifndef MPI + initnn(); // initialize nearest-neighbour table for gpu + initnn_eo(); // initialize nearest-neighbour table for gpu with even-odd enabled + #else + init_nnspinor_eo_mpi(); // initialize nearest-neighbour table for gpu with even-odd enabled + init_idxgauge_mpi(); + #endif + + + cudaMemcpy(dev_nn, nn, nnsize, cudaMemcpyHostToDevice); // copies the previous initialized index-arrays from host to device memory + cudaMemcpy(dev_nn_eo, nn_eo, nnsize/2, cudaMemcpyHostToDevice); + cudaMemcpy(dev_nn_oe, nn_oe, nnsize/2, cudaMemcpyHostToDevice); + cudaMemcpy(dev_eoidx_even, eoidx_even, idxsize, cudaMemcpyHostToDevice); + cudaMemcpy(dev_eoidx_odd, eoidx_odd, idxsize, 
cudaMemcpyHostToDevice); + + + + free(eoidx_odd); // deallocates the host memory for the field + free(eoidx_even); // they are only on the device + free(nn_oe); + free(nn_eo); // not necessary for locally defined variables ?? + free(nn); + + + // debug // CUDA + #ifdef CUDA_DEBUG + #ifndef MPI + CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Memory allocation of grid stuff failed.", "Allocated grid stuff on device."); + #else + CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Memory allocation of grid stuff failed.", "Allocated grid stuff on devices."); + #endif + #endif + + + + + ///////////// + // SPINORS // // allocates device memory for the odd part of the spinor fields (24 = 4*6 floats per odd lattice sites) + ///////////// // now we have to consider 2 flavors: up, dn + + /* + #ifndef MPI + dev_spinsize = 6*VOLUME/2*sizeof(dev_spinor); // remember: dev_spinor = float4 + #else + dev_spinsize = (VOLUME+RAND)/2 * 6*sizeof(dev_spinor); // NOTICE: this refers to the memory requirements for the device, host needs twice the memory !! + #endif + */ + + + #ifndef MPI + + cudaMalloc((void **) &dev_spin1_up, dev_spinsize_int); // allocates device memory for the fields spinor fields used in dev_cg_eo_nd(...) + cudaMalloc((void **) &dev_spin1_dn, dev_spinsize_int); // pointing to device + cudaMalloc((void **) &dev_spin2_up, dev_spinsize_int); // ... 
+ cudaMalloc((void **) &dev_spin2_dn, dev_spinsize_int); + cudaMalloc((void **) &dev_spin3_up, dev_spinsize_int); + cudaMalloc((void **) &dev_spin3_dn, dev_spinsize_int); + /* + cudaMalloc((void **) &dev_spin4_up, dev_spinsize_int); // not needed + cudaMalloc((void **) &dev_spin4_dn, dev_spinsize_int); + cudaMalloc((void **) &dev_spin5_up, dev_spinsize_int); + cudaMalloc((void **) &dev_spin5_dn, dev_spinsize_int); + */ + cudaMalloc((void **) &dev_spinin_up , dev_spinsize_int); // host/device interaction + cudaMalloc((void **) &dev_spinin_dn , dev_spinsize_int); // inner/outer interaction + cudaMalloc((void **) &dev_spinout_up, dev_spinsize_int); + cudaMalloc((void **) &dev_spinout_dn, dev_spinsize_int); + + #else + + cudaMalloc((void **) &dev_spin1_up, dev_spinsize_ext); + cudaMalloc((void **) &dev_spin1_dn, dev_spinsize_ext); + cudaMalloc((void **) &dev_spin2_up, dev_spinsize_ext); + cudaMalloc((void **) &dev_spin2_dn, dev_spinsize_ext); + cudaMalloc((void **) &dev_spin3_up, dev_spinsize_ext); + cudaMalloc((void **) &dev_spin3_dn, dev_spinsize_ext); + /* + cudaMalloc((void **) &dev_spin4_up, dev_spinsize_ext); + cudaMalloc((void **) &dev_spin4_dn, dev_spinsize_ext); + cudaMalloc((void **) &dev_spin5_up, dev_spinsize_ext); + cudaMalloc((void **) &dev_spin5_dn, dev_spinsize_ext); + */ + cudaMalloc((void **) &dev_spinin_up , dev_spinsize_ext); + cudaMalloc((void **) &dev_spinin_dn , dev_spinsize_ext); + cudaMalloc((void **) &dev_spinout_up, dev_spinsize_ext); + cudaMalloc((void **) &dev_spinout_dn, dev_spinsize_ext); + + #endif + + + #ifndef MPI + // debug // host code + if ( (void *) (h2d_spin_up = (dev_spinor *) malloc(dev_spinsize_int) ) == NULL) { + printf("Could not allocate memory for h2d_spin_up. Aborting...\n"); + exit(200); + } + + if ( (void *) (h2d_spin_dn = (dev_spinor *) malloc(dev_spinsize_int) ) == NULL) { + printf("Could not allocate memory for h2d_spin_dn. 
Aborting...\n"); + exit(200); + } + #else + // debug // host code + if ( (void *) (h2d_spin_up = (dev_spinor *) malloc(dev_spinsize_ext) ) == NULL) { // MEMORY REQUIREMENTS: these are auxiliary fields for to_host() and to_device() + printf("Process %d of %d: Could not allocate memory for h2d_spin_up. Aborting...\n", g_proc_id, g_nproc); // they have to store floats (not doubles) + exit(200); // can use "_int" ... + } // must use "_ext" when used with to_host_mpi as in xchange_field_wrapper() + + if ( (void *) (h2d_spin_dn = (dev_spinor *) malloc(dev_spinsize_ext) ) == NULL) { + printf("Process %d of %d: Could not allocate memory for h2d_spin_dn. Aborting...\n", g_proc_id, g_nproc); + exit(200); + } + #endif + + + #ifndef MPI + + cudaMalloc((void **) &dev_spin_eo1_up, dev_spinsize_int); // used for matrix_multiplication32(...) + cudaMalloc((void **) &dev_spin_eo1_dn, dev_spinsize_int); + cudaMalloc((void **) &dev_spin_eo3_up, dev_spinsize_int); + cudaMalloc((void **) &dev_spin_eo3_dn, dev_spinsize_int); + /* + cudaMalloc((void **) &dev_spin_eo2_up, dev_spinsize_int); // no memory allocation needed + cudaMalloc((void **) &dev_spin_eo2_dn, dev_spinsize_int); // will point to already allocated memory when used in matrix_multiplication + */ + + #else + + cudaMalloc((void **) &dev_spin_eo1_up, dev_spinsize_ext); + cudaMalloc((void **) &dev_spin_eo1_dn, dev_spinsize_ext); + cudaMalloc((void **) &dev_spin_eo3_up, dev_spinsize_ext); + cudaMalloc((void **) &dev_spin_eo3_dn, dev_spinsize_ext); + /* + cudaMalloc((void **) &dev_spin_eo2_up, dev_spinsize_ext); + cudaMalloc((void **) &dev_spin_eo2_dn, dev_spinsize_ext); + */ + + #endif + + // debug // CUDA + #ifdef CUDA_DEBUG + #ifndef MPI + CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Memory allocation of spinor fields failed.", "Allocated spinor fields on device."); + #else + CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). 
Memory allocation of spinor fields failed.", "Allocated spinor fields on devices."); + #endif + #endif + + + + + + #ifdef MPI + + #ifdef HOPPING_DEBUG // Hopping_Matrix() is applied upon these spinor fields + // debug // host code + if ( (void *) (spinor_debug_in = (spinor *) malloc(2*dev_spinsize_ext) ) == NULL) { + printf("Process %d of %d: Could not allocate memory for spinor_debug_in. Aborting...\n", g_proc_id, g_nproc); + exit(200); + } + // debug // host code + if ( (void *) (spinor_debug_out = (spinor *) malloc(2*dev_spinsize_ext) ) == NULL) { + printf("Process %d of %d: Could not allocate memory for spinor_debug_out. Aborting...\n", g_proc_id, g_nproc); + exit(200); + } + #endif + + + + + #if defined(ALTERNATE_FIELD_XCHANGE) || ASYNC > 0 + int tSliceEO = LX*LY*LZ/2; + #endif + + + #ifndef ALTERNATE_FIELD_XCHANGE // xchange_field() acts on this spinor field + // debug // host code // MEMORY REQUIREMENTS: + if ( (void *) (spinor_xchange = (spinor *) malloc(2*dev_spinsize_ext) ) == NULL) { // auxiliary fields for xchange_field_wrapper() and Hopping_Matrix_wrapper() + printf("Process %d of %d: Could not allocate memory for spinor_xchange. Aborting...\n", g_proc_id, g_nproc); // have to store doubles --> 2*dev_spinsize !! 
+ exit(200); + } + #else // xchange procedure comparable to ASYNC + R1 = (dev_spinor *) malloc(2*tSliceEO*24*sizeof(float)); + R2 = R1 + 6*tSliceEO; + R3 = (dev_spinor *) malloc(2*tSliceEO*24*sizeof(float)); + R4 = R3 + 6*tSliceEO; + #endif + + + #if ASYNC > 0 // asynchronous communication and computation + + // page-locked memory + cudaMallocHost(&RAND3, 2*tSliceEO*6*sizeof(float4)); + RAND4 = RAND3 + 6*tSliceEO; + cudaMallocHost(&RAND1, 2*tSliceEO*6*sizeof(float4)); + RAND2 = RAND1 + 6*tSliceEO; + + // CUDA streams and events + for (int i = 0; i < 2*nStreams+1; i++) { + cudaStreamCreate(&stream[i]); + } + + #ifdef ASYNC_TIMING + cudaEventCreate(&start_ALL); + cudaEventCreate(&stop_ALL); + cudaEventCreate(&stop_D2H_1); + cudaEventCreate(&stop_D2H_2); + cudaEventCreate(&stop_INT_0); + cudaEventCreate(&stop_H2D_3); + cudaEventCreate(&stop_H2D_4); + cudaEventCreate(&stop_EXT_1); + cudaEventCreate(&stop_EXT_2); + #endif + #endif + + #endif // MPI + + + + + //////////// + // output // // ?? + //////////// + /* + output_size = LZ*T*sizeof(float); // parallel in t and z direction + cudaMalloc((void **) &dev_output, output_size); // output array + host_output = (float *) malloc(output_size); + + + // debug // CUDA + #ifdef CUDA_DEBUG + #ifndef MPI + CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Memory allocation output stuff failed.", "Allocated output stuff on device."); + #else + CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Memory allocation output stuff failed.", "Allocated output stuff on devices."); + #endif + #endif + */ + + + + // HAVE TO: maybe set grid[5] = (VOLUME+RAND)/2 ?? // no because refers to INTERN lattice sites !! 
+ //////////////////////////// + // grid[ ] specifications // // allocate and initializes the array grid[5] on device + //////////////////////////// + + grid[0] = LX; // it contains the dimensions of the lattice and the volume of the eo-sublattice + grid[1] = LY; + grid[2] = LZ; + grid[3] = T; + grid[4] = VOLUME/2; // will be used to set dev_VOLUME: dev_VOLUME is half of VOLUME for eo + + cudaMalloc((void **) &dev_grid, 5*sizeof(int)); // dev_grid + cudaMemcpy(dev_grid, &(grid[0]), 5*sizeof(int), cudaMemcpyHostToDevice); + + + // debug // CUDA + #ifdef CUDA_DEBUG + #ifndef MPI + CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Memory allocation of grid[] specifications failed.", "Allocated grid[] specifications on device."); + #else + CUDA_CHECK("CUDA error in init_mixedsolve_eo_nd(). Memory allocation of grid[] specifications failed.", "Allocated grid[] specifications on devices."); + #endif + #endif + + + + + // MPI_Barrier(g_cart_grid); + + + + +}//init_mixedsolve_eo_nd() + + + + + + + //////////////////////// + // // + // FINALIZATION // + // // + //////////////////////// + + + + +// deallocates the previous allocated memory + +void finalize_mixedsolve_eo_nd(void) { + + typedef REAL RealT; + cudaError_t cudaerr; + + cudaFree(dev_spin1_up); + cudaFree(dev_spin1_dn); + cudaFree(dev_spin2_up); + cudaFree(dev_spin2_dn); + cudaFree(dev_spin3_up); + cudaFree(dev_spin3_dn); + /* + cudaFree(dev_spin4_up); + cudaFree(dev_spin4_dn); + cudaFree(dev_spin5_up); + cudaFree(dev_spin5_dn); + */ + + cudaFree(dev_spinin_up); + cudaFree(dev_spinin_dn); + cudaFree(dev_spinout_up); + cudaFree(dev_spinout_dn); + + free(h2d_spin_up); + free(h2d_spin_dn); + + #ifdef MPI + #ifndef ALTERNATE_FIELD_XCHANGE + free(spinor_xchange); + #else + free(R1); + free(R3); + #endif + + #ifdef HOPPING_DEBUG + free(spinor_debug_in); + free(spinor_debug_out); + #endif + #endif + + cudaFree(dev_spin_eo1_up); + cudaFree(dev_spin_eo1_dn); + cudaFree(dev_spin_eo3_up); + cudaFree(dev_spin_eo3_dn); + /* + 
cudaFree(dev_spin_eo2_up); + cudaFree(dev_spin_eo2_dn); + */ + + + cudaFree(dev_nn); + cudaFree(dev_nn_eo); + cudaFree(dev_nn_oe); + cudaFree(dev_eoidx_even); + cudaFree(dev_eoidx_odd); + + + cudaFree(MixedsolveParameter::getGlobalP()->dev_gf); + //cudaFree(dev_output); + cudaFree(dev_grid); + + + free(MixedsolveParameter::getGlobalP()->h2d_gf); + + + #ifdef MPI + #ifdef ALTERNATE_HOPPING_MATRIX + free_gpu_indexfields(); + #endif + + #if ASYNC > 0 + cudaFreeHost(RAND1); + cudaFreeHost(RAND3); + + for (int i = 0; i < 2*nStreams+1; i++) { + cudaStreamDestroy(stream[i]); + } + + #ifdef ASYNC_TIMING + cudaEventDestroy(start_ALL); + cudaEventDestroy(stop_ALL); + cudaEventDestroy(stop_D2H_1); + cudaEventDestroy(stop_D2H_2); + cudaEventDestroy(stop_INT_0); + cudaEventDestroy(stop_H2D_3); + cudaEventDestroy(stop_H2D_4); + cudaEventDestroy(stop_EXT_1); + cudaEventDestroy(stop_EXT_2); + #endif + #endif + #endif + + + // Clean up CUDA API for calling thread // ?? + cudaThreadExit(); // is essential + + + // debug // CUDA + #ifdef CUDA_DEBUG + CUDA_CHECK("CUDA error in finalize_mixedsolve_eo_nd(). 
Device memory deallocation failed", "Device memory deallocated."); + #endif + + +} + + + + + + + ///////////////////////////////// + // // + // H <--> D interactions // + // // + ///////////////////////////////// + + + + +///////// +// MPI // +///////// + +#ifdef MPI + +// convert spinor to double + +void convert2double_spin_mpi (dev_spinor * spin, spinor * h2d, int start, int end) { + + int i; + + for (i = start; i < end; i++) { + + h2d[i].s0.c0.re = (double) spin[6*i+0].x; + h2d[i].s0.c0.im = (double) spin[6*i+0].y; + h2d[i].s0.c1.re = (double) spin[6*i+0].z; + h2d[i].s0.c1.im = (double) spin[6*i+0].w; + + h2d[i].s0.c2.re = (double) spin[6*i+1].x; + h2d[i].s0.c2.im = (double) spin[6*i+1].y; + h2d[i].s1.c0.re = (double) spin[6*i+1].z; + h2d[i].s1.c0.im = (double) spin[6*i+1].w; + + h2d[i].s1.c1.re = (double) spin[6*i+2].x; + h2d[i].s1.c1.im = (double) spin[6*i+2].y; + h2d[i].s1.c2.re = (double) spin[6*i+2].z; + h2d[i].s1.c2.im = (double) spin[6*i+2].w; + + h2d[i].s2.c0.re = (double) spin[6*i+3].x; + h2d[i].s2.c0.im = (double) spin[6*i+3].y; + h2d[i].s2.c1.re = (double) spin[6*i+3].z; + h2d[i].s2.c1.im = (double) spin[6*i+3].w; + + h2d[i].s2.c2.re = (double) spin[6*i+4].x; + h2d[i].s2.c2.im = (double) spin[6*i+4].y; + h2d[i].s3.c0.re = (double) spin[6*i+4].z; + h2d[i].s3.c0.im = (double) spin[6*i+4].w; + + h2d[i].s3.c1.re = (double) spin[6*i+5].x; + h2d[i].s3.c1.im = (double) spin[6*i+5].y; + h2d[i].s3.c2.re = (double) spin[6*i+5].z; + h2d[i].s3.c2.im = (double) spin[6*i+5].w; + + } +} + + + +// convert spinor to REAL4 (float4, double4) + +void convert2REAL4_spin_mpi (spinor * spin, dev_spinor * h2d, int start, int end) { + + int i; + + for (i = start; i < end; i++) { + + h2d[6*i+0].x = (float) spin[i].s0.c0.re; + h2d[6*i+0].y = (float) spin[i].s0.c0.im; + h2d[6*i+0].z = (float) spin[i].s0.c1.re; + h2d[6*i+0].w = (float) spin[i].s0.c1.im; + + h2d[6*i+1].x = (float) spin[i].s0.c2.re; + h2d[6*i+1].y = (float) spin[i].s0.c2.im; + h2d[6*i+1].z = (float) 
spin[i].s1.c0.re; + h2d[6*i+1].w = (float) spin[i].s1.c0.im; + + h2d[6*i+2].x = (float) spin[i].s1.c1.re; + h2d[6*i+2].y = (float) spin[i].s1.c1.im; + h2d[6*i+2].z = (float) spin[i].s1.c2.re; + h2d[6*i+2].w = (float) spin[i].s1.c2.im; + + h2d[6*i+3].x = (float) spin[i].s2.c0.re; + h2d[6*i+3].y = (float) spin[i].s2.c0.im; + h2d[6*i+3].z = (float) spin[i].s2.c1.re; + h2d[6*i+3].w = (float) spin[i].s2.c1.im; + + h2d[6*i+4].x = (float) spin[i].s2.c2.re; + h2d[6*i+4].y = (float) spin[i].s2.c2.im; + h2d[6*i+4].z = (float) spin[i].s3.c0.re; + h2d[6*i+4].w = (float) spin[i].s3.c0.im; + + h2d[6*i+5].x = (float) spin[i].s3.c1.re; + h2d[6*i+5].y = (float) spin[i].s3.c1.im; + h2d[6*i+5].z = (float) spin[i].s3.c2.re; + h2d[6*i+5].w = (float) spin[i].s3.c2.im; + + } +} + + + + + + +// cudaMemcpy gets "spinor+6*offset" because of pointer to float4 and there are 24 floats per site + +void to_device_mpi (dev_spinor * device, spinor * host, dev_spinor * auxiliary, int size, int start, int end) { + + convert2REAL4_spin_mpi(host, auxiliary, start, end); // auxiliary = (float) host + cudaMemcpy(device+6*start, auxiliary+6*start, size, cudaMemcpyHostToDevice); // device = auxiliary (on device) + +} + + +void to_host_mpi (spinor * host, dev_spinor * device, dev_spinor * auxiliary, int size, int start, int end) { + + cudaMemcpy(auxiliary+6*start, device+6*start, size, cudaMemcpyDeviceToHost); // auxiliary = device (on device) + convert2double_spin_mpi(auxiliary, host, start, end); // host = (double) auxiliary + +} + +#endif // MPI + + + + + + +///////////////////////////// +// host/device interaction // +///////////////////////////// + +// remark: the host spinors are double precision and therefore need twice the memory !! +// dev_spinor * device: dev_spinsize +// spinor * host: 2*dev_spinsize +// dev_spinor * auxiliary: dev_spinsize +// the parameter "size" specifies the memory needed for the spinor n the device !! 
//

// Uploads a host spinor field to the device.
//   device    : destination buffer (device memory)
//   host      : source spinor field on the host
//   auxiliary : host-side staging buffer of dev_spinor elements; it receives the
//               converted image of `host` and is the actual source of the copy
//   size      : byte count handed to cudaMemcpy
void to_device (dev_spinor * device, spinor * host, dev_spinor * auxiliary, int size) {

  typedef REAL RealT;

  const size_t nBytes = size;

  // step 1: convert the host field into the staging buffer
  convert2REAL4_spin(host, auxiliary);

  // step 2: staging buffer (host) --> device
  cudaMemcpy(device, auxiliary, nBytes, cudaMemcpyHostToDevice);

}


// Downloads a device spinor field to the host; the mirror image of to_device().
//   host      : destination spinor field on the host
//   device    : source buffer (device memory)
//   auxiliary : host-side staging buffer that receives the raw device data
//   size      : byte count handed to cudaMemcpy
void to_host (spinor * host, dev_spinor * device, dev_spinor * auxiliary, int size) {

  typedef REAL RealT;

  const size_t nBytes = size;

  // step 1: device --> staging buffer (host)
  cudaMemcpy(auxiliary, device, nBytes, cudaMemcpyDeviceToHost);

  // step 2: convert the staging buffer into the host field
  convert2double_spin(auxiliary, host);

}






///////////////////////
// boundary exchange //
///////////////////////

#ifdef MPI

// all three versions do work:

/*
// preliminarily exchanges the full spinor field instead of only the boundaries

void xchange_field_wrapper (dev_spinor * dev_spin, int ieo) {

  size_t size = (VOLUME+RAND)/2 * 6*sizeof(dev_spinor);

  to_host_mpi(spinor_xchange, dev_spin, h2d_spin_up, size, 0, (VOLUME+RAND)/2);
  xchange_field(spinor_xchange, ieo);
  to_device_mpi(dev_spin, spinor_xchange, h2d_spin_dn, size, 0, (VOLUME+RAND)/2);

}
*/




/*
// copies VOLUME to host, exchanges, copies RAND back to device

void xchange_field_wrapper (dev_spinor * dev_spin, int ieo) {

  size_t size_Volume = VOLUME/2 * 6*sizeof(dev_spinor);
  size_t size_Rand   = RAND/2   * 6*sizeof(dev_spinor);

  to_host_mpi(spinor_xchange, dev_spin, h2d_spin_up, size_Volume, 0, VOLUME/2);
  xchange_field(spinor_xchange, ieo);
  to_device_mpi(dev_spin, spinor_xchange, h2d_spin_dn, size_Rand, VOLUME/2, (VOLUME+RAND)/2);

}
*/




// Synchronous boundary exchange for a device spinor field (the variant in use).
//
// Only the two boundary t-slices (t = 0 and t = T-1) leave the device; the
// received data is written to the region behind the VOLUME/2 interior sites.
//
//   default path            : stage through spinor_xchange and call xchange_field()
//   ALTERNATE_FIELD_XCHANGE : exchange the raw float data directly with
//                             MPI_Sendrecv via the host buffers R1..R4
//
// `ieo` is only consumed by the default path (forwarded to xchange_field()).

void xchange_field_wrapper (dev_spinor * dev_spin, int ieo) {

  #ifdef ALTERNATE_FIELD_XCHANGE

    const int    sliceSites = LX*LY*LZ/2;                    // sites of one t-slice
    const int    bulkSites  = VOLUME/2;                      // interior sites
    const size_t sliceBytes = sliceSites*6*sizeof(float4);   // bytes of one t-slice

    // device --> host: first and last interior t-slice
    cudaMemcpy(R1, dev_spin, sliceBytes, cudaMemcpyDeviceToHost);
    cudaMemcpy(R2, dev_spin+6*(bulkSites-sliceSites), sliceBytes, cudaMemcpyDeviceToHost);

    // R1 goes to the t-down neighbour while R3 arrives from the t-up neighbour
    MPI_Sendrecv(R1, 24*sliceSites, MPI_FLOAT, g_nb_t_dn, 0,
                 R3, 24*sliceSites, MPI_FLOAT, g_nb_t_up, 0,
                 g_cart_grid, &stat[0]);

    // R2 goes to the t-up neighbour while R4 arrives from the t-down neighbour
    MPI_Sendrecv(R2, 24*sliceSites, MPI_FLOAT, g_nb_t_up, 1,
                 R4, 24*sliceSites, MPI_FLOAT, g_nb_t_dn, 1,
                 g_cart_grid, &stat[1]);

    // host --> device: received slices are stored directly behind the interior
    cudaMemcpy(dev_spin+6*bulkSites, R3, sliceBytes, cudaMemcpyHostToDevice);
    cudaMemcpy(dev_spin+6*(bulkSites+sliceSites), R4, sliceBytes, cudaMemcpyHostToDevice);

  #else

    const size_t sliceBytes = LX*LY*LZ/2 * 6*sizeof(dev_spinor);   // one t-slice
    const size_t haloBytes  = RAND/2     * 6*sizeof(dev_spinor);   // the RAND region

    // device --> host: sites [0, LX*LY*LZ/2) and [(T-1)*LX*LY*LZ/2, VOLUME/2)
    to_host_mpi(spinor_xchange, dev_spin, h2d_spin_up, sliceBytes, 0, LX*LY*LZ/2);
    to_host_mpi(spinor_xchange, dev_spin, h2d_spin_dn, sliceBytes, (T-1)*LX*LY*LZ/2, (VOLUME)/2);

    // exchange on the host
    xchange_field(spinor_xchange, ieo);

    // host --> device: sites [VOLUME/2, (VOLUME+RAND)/2)
    to_device_mpi(dev_spin, spinor_xchange, h2d_spin_up, haloBytes, VOLUME/2, (VOLUME+RAND)/2);

  #endif

}

#endif // MPI






////////////////////
// hopping matrix //
////////////////////

#ifdef MPI	// implemented for checking the MPI implementation of the hopping matrix
  #ifdef HOPPING_DEBUG

  // Debug helper: runs the host routine Hopping_Matrix() on a device field.
  // The input is downloaded into spinor_debug_in, the host operator writes
  // spinor_debug_out, and that result is uploaded into `out`.

  void Hopping_Matrix_wrapper (int ieo, dev_spinor * out, dev_spinor * in) {

      // earlier variant, kept for reference:
      //size_t size = (VOLUME+RAND)/2 * 6*sizeof(dev_spinor);
      //to_host(g_chi_up_spinor_field[DUM_OLVER+3], in, h2d_spin_up, size);
      //Hopping_Matrix(ieo, g_chi_dn_spinor_field[DUM_OLVER+3], g_chi_up_spinor_field[DUM_OLVER+3]);
      //to_device(out, g_chi_dn_spinor_field[DUM_OLVER+3], h2d_spin_up, size);

      // device --> host
      to_host(spinor_debug_in, in, h2d_spin_up, dev_spinsize_int);

      // apply the operator on the host
      Hopping_Matrix(ieo, spinor_debug_out, spinor_debug_in);

      // host --> device
      to_device(out, spinor_debug_out, h2d_spin_dn, dev_spinsize_int);

  }

  #endif // HOPPING_DEBUG
#endif // MPI






////////////////////
// linear algebra //
////////////////////

#ifdef MPI

// Linear algebra routines that involve global communication have to be rebuilt
// for MPI; this is done with thin wrappers around the matching CUBLAS routines.



// Drop-in replacement for cublasDot() (identical argument list).
// Each process evaluates cublasDot() on its own data; the per-process values
// are then summed over MPI_COMM_WORLD with MPI_Allreduce(), so every process
// returns the same total.

float cublasDot_wrapper(int size, float * A, int incx, float * B, int incy) {

  // contribution of this process
  float localDot = cublasDot(size, A, incx, B, incy);

  // sum of the contributions of all processes
  float globalDot;
  MPI_Allreduce(&localDot, &globalDot, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);

  return globalDot;

}

#endif






		//////////////////////////////////
		//                              //
		//    MATRIX MULTIPLICATIONS    //
		//                              //
		//////////////////////////////////




/////////////
// KERNELS //
/////////////

// derived from Flo's function  dev_mul_one_pm_imu_inv
//	order of the arguments also like Flo's convention: (spinin, spinout)

// applies (1 +- imubar*gamma5)
// uses shared local memory for manipulation	// really ?? where ??
+// one thread per lattice site + + +__global__ void dev_mul_one_pm_imubar_gamma5 (dev_spinor * sin, + dev_spinor * sout, + float sign ) { + + dev_spinor slocal[6]; // dev_spinor = float4 // 6*float4 = 24 floats // auxiliary for each thread + + dev_complex pm_imu = dev_initcomplex(0.0, sign * mubar); // dev_complex = struct { REAL re; REAL im; } // pm_imu.re = 0.0 + // pm_imu.im = sign * mubar + int pos = threadIdx.x + blockDim.x*blockIdx.x; + + if (pos < dev_VOLUME) { + dev_skalarmult_gamma5_spinor(&(slocal[0]), pm_imu, &(sin[6*pos]) ); // slocal = pm_imu * (gamma5) * sin + dev_add_spinor_assign(&(slocal[0]), &(sin[6*pos])); // slocal = slocal + sin = pm_imu * (gamma5) * sin + sin + dev_realmult_spinor_assign(&(sout[6*pos]), 1.0, &(slocal[0]) ); // sout = slocal + } +} + + + + + + +/////////////////////////// +// MATRIX MULTIPLICATION // +/////////////////////////// + +// the GPU implementation of Q_Qdagger_ND(...) from Nondegenerate_Matrix.c +// Flo's equivalent function for the standard and non-nd case is dev_Qtm_pm_psi + +void matrix_multiplication32 (dev_spinor * spinout_up, dev_spinor * spinout_dn, + dev_spinor * spinin_up , dev_spinor * spinin_dn , + int gridsize1, int blocksize1, int gridsize2, int blocksize2, + int gridsize3, int blocksize3, int gridsize4, int blocksize4) { + typedef REAL RealT; + + + // we will use the auxiliary fields dev_spin_eo{1,2}_up/dn for working on and buffering + // and set dev_spin_eo2_up/dn equal spinout_up/dn + // spinin_up/dn have to remain unchanged !! 
+ // spinout_up/dn can be freely used + + + + + ///////////////////// + // LOCAL VARIABLES // + ///////////////////// + + int N_sites = VOLUME/2; // #lattice sites + int N_floats = 24*VOLUME/2; // #floats + + + + + //////////////////////////////////// + // MATCHING with Q_Qdagger_ND // + //////////////////////////////////// + // // + // _strange = _up // + // _charm = _dn // + // // + // DUM_MATRIX = dev_spin_eo1_up // + // DUM_MATRIX+1 = dev_spin_eo1_dn // + // // + // DUM_MATRIX+2 = dev_spin_eo2_up // + // DUM_MATRIX+3 = dev_spin_eo2_dn // + // // + //////////////////////////////////// + + + + + /////////////////////////////////// + // INITIALIZATIONS & ASSIGNMENTS // // have to use (one) other auxiliary field(s) than the calling function dev_cg_eo_nd + /////////////////////////////////// + + dev_spin_eo2_up = spinout_up; // need no memory allocated + dev_spin_eo2_dn = spinout_dn; + ///////////// THEORY //////////////////////////////////////////////////////////////// + // // + // (Q_tilde) = gamma5 * ((M_oo) - (M_oe)(Mee^-1)(M_eo)) // + // (Q_tilde)(Q_tilde_dagger) * (up,dn) = (Q_tilde) * (b,a) // + /////////////// // (a,b) = (Q_tilde) * (dn,up) // + // MAIN BODY // // // + /////////////// ///////////////////////////////////////////////////////////////////////////////////// + + + double nrm = 1.0 / (1.0 + g_mubar*g_mubar - g_epsbar*g_epsbar); + + + /////////////////////////////////////// ///////////////////////////////// + // Q_tilde_dagger(2x2) // // (a,b) = (Q_tilde) * (dn,up) // + /////////////////////////////////////// ///////////////////////////////// + + + //////////////////////////// + // (M_oe) (Mee^-1) (M_eo) // + //////////////////////////// + + // Flo: + #ifdef USETEXTURE + bind_texture_spin(spinin_dn,1); + #endif + dev_Hopping_Matrix <<>>(MixedsolveParameter::getGlobalP()->dev_gf, spinin_dn, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * spinin_dn + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + + 
#ifdef USETEXTURE + bind_texture_spin(spinin_up,1); + #endif + dev_Hopping_Matrix <<>>(MixedsolveParameter::getGlobalP()->dev_gf, spinin_up, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = (M_eo) * spinin_up + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up = (1 - imubar)*(M_eo) * spinin_dn + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn = (1 + imubar)*(M_eo) * spinin_up + + + // CUBLAS: + cublasAxpy (N_floats, g_epsbar, (RealT*)dev_spin_eo1_dn, 1, (RealT*)dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn = (1 - imubar)*(M_eo) * spinin_dn + epsbar * (M_eo) * spinin_up + cublasAxpy (N_floats, g_epsbar, (RealT*)dev_spin_eo1_up, 1, (RealT*)dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up = (1 + imubar)*(M_eo) * spinin_up + epsbar * (M_eo) * spinin_dn + + // CUBLAS: + cublasScal (N_floats, nrm, (RealT*)dev_spin_eo2_up, 1); // dev_spin_eo2_up = nrm * dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * spinin_dn + nrm*epsbar*(M_eo) * spinin_up + + cublasScal (N_floats, nrm, (RealT*)dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * spinin_up + nrm*epsbar*(M_eo) * spinin_dn + + + // Flo: + #ifdef USETEXTURE + bind_texture_spin(dev_spin_eo2_up,1); // remember: this is ((M_oe)(Mee^-1)(M_eo)) * (spinin_dn, spinin_up): + #endif + dev_Hopping_Matrix <<>>(MixedsolveParameter::getGlobalP()->dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #ifdef USETEXTURE + unbind_texture_spin(1); // dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up = (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_dn + (M_oe)*nrm*epsbar*(M_eo) * spinin_up + #endif + + #ifdef USETEXTURE + 
bind_texture_spin(dev_spin_eo2_dn,1); + #endif + dev_Hopping_Matrix <<>>(MixedsolveParameter::getGlobalP()->dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #ifdef USETEXTURE + unbind_texture_spin(1); // dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn = (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_up + (M_oe)*nrm*epsbar*(M_eo) * spinin_dn + #endif + + + + + //////////// + // (M_oo) // + //////////// + + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(spinin_dn, dev_spin_eo2_up, +1.0); // dev_spin_eo2_up = (1 + imubar) * spinin_dn + dev_mul_one_pm_imubar_gamma5<<>>(spinin_up, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * spinin_up + + + // CUBLAS: // remember: this is (M_oo) * (spinin_dn, spinin_up): + cublasAxpy (N_floats, -g_epsbar, (RealT*)spinin_up, 1, (RealT*)dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*spinin_up = (1+imubar)*spinin_dn - epsbar*spinin_up + cublasAxpy (N_floats, -g_epsbar, (RealT*)spinin_dn, 1, (RealT*)dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*spinin_dn = (1-imubar)*spinin_up - epsbar*spinin_dn + + + + /////////////////////////////////////// + // (M_oo) - (M_oe) (Mee^-1) (M_eo) // + /////////////////////////////////////// + + // CUBLAS: // this is ((M_oo) - (M_oe)(Mee^-1)(M_eo)) * (spinin_dn, spinin_up): + cublasAxpy (N_floats, -1.0, (RealT*)dev_spin_eo1_up, 1, (RealT*)dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up = (1+imubar) * spinin_dn - epsbar * spinin_up + // - (M_oe)*nrm*(1-imubar)*(M_eo) * spinin_dn - (M_oe)*nrm*epsbar*(M_eo) * spinin_up + cublasAxpy (N_floats, -1.0, (RealT*)dev_spin_eo1_dn, 1, (RealT*)dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn = (1-imubar) * spinin_up - epsbar * spinin_dn + // - (M_oe)*nrm*(1+imubar)*(M_eo) * spinin_up - (M_oe)*nrm*epsbar*(M_eo) * spinin_dn + + + //////////// + // gamma5 // + //////////// + + // Flo: + dev_gamma5 <<>>(dev_spin_eo2_up, 
dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up + dev_gamma5 <<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn + + + + + + + //////////////////// + // (a,b) -> (b,a) // + //////////////////// + dev_copy_spinor_field <<>>(dev_spin_eo2_dn, dev_spin_eo3_up); // dev_spin_eo3_up = dev_spin_eo2_dn + dev_copy_spinor_field <<>>(dev_spin_eo2_up, dev_spin_eo3_dn); // dev_spin_eo3_dn = dev_spin_eo2_up + + + + + + + /////////////////////////////////// /////////////////////// + // Q_tilde(2x2) // // (Q_tilde) * (b,a) // + /////////////////////////////////// /////////////////////// + + + //////////////////////////// + // (M_oe) (Mee^-1) (M_eo) // + //////////////////////////// + + // Flo: + #ifdef USETEXTURE + bind_texture_spin(dev_spin_eo3_up,1); + #endif + dev_Hopping_Matrix <<>>(MixedsolveParameter::getGlobalP()->dev_gf, dev_spin_eo3_up, dev_spin_eo1_up, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_up = (M_eo) * dev_spin_eo3_up + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + + #ifdef USETEXTURE + bind_texture_spin(dev_spin_eo3_dn,1); + #endif + dev_Hopping_Matrix <<>>(MixedsolveParameter::getGlobalP()->dev_gf, dev_spin_eo3_dn, dev_spin_eo1_dn, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0); // dev_spin_eo1_dn = (M_eo) * dev_spin_eo3_dn + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0); // dev_spin_eo2_up = (1 - imubar) * dev_spin_eo1_up = (1 - imubar)*(M_eo) * dev_spin_eo3_up + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0); // dev_spin_eo2_dn = (1 + imubar) * dev_spin_eo1_dn = (1 + imubar)*(M_eo) * dev_spin_eo3_dn + + + // CUBLAS: + cublasAxpy (N_floats, g_epsbar, (RealT*)dev_spin_eo1_dn, 1, (RealT*)dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up + epsbar * dev_spin_eo1_dn = (1 - imubar)*(M_eo) * dev_spin_eo3_up + epsbar * (M_eo) * dev_spin_eo3_dn + cublasAxpy 
(N_floats, g_epsbar, (RealT*)dev_spin_eo1_up, 1, (RealT*)dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn + epsbar * dev_spin_eo1_up = (1 + imubar)*(M_eo) * dev_spin_eo3_dn + epsbar * (M_eo) * dev_spin_eo3_up + + // CUBLAS: + cublasScal (N_floats, nrm, (RealT*)dev_spin_eo2_up, 1); // dev_spin_eo2_up = nrm * dev_spin_eo2_up = nrm*(1-imubar)*(M_eo) * dev_spin_eo3_up + nrm*epsbar*(M_eo) * dev_spin_eo3_dn + + cublasScal (N_floats, nrm, (RealT*)dev_spin_eo2_dn, 1); // dev_spin_eo2_dn = nrm * dev_spin_eo2_dn = nrm*(1+imubar)*(M_eo) * dev_spin_eo3_dn + nrm*epsbar*(M_eo) * dev_spin_eo3_up + + + // Flo: + #ifdef USETEXTURE + bind_texture_spin(dev_spin_eo2_up,1); // remember: this is ((M_oe) (Mee^-1) (M_eo)) * (dev_spin_eo3_up, dev_spin_eo3_dn): + #endif + dev_Hopping_Matrix <<>>(MixedsolveParameter::getGlobalP()->dev_gf, dev_spin_eo2_up, dev_spin_eo1_up, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #ifdef USETEXTURE + unbind_texture_spin(1); // dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up = (M_oe)*nrm*(1-imubar)*(M_eo) * dev_spin_eo3_up + (M_oe)*nrm*epsbar*(M_eo) * dev_spin_eo3_dn + #endif + + #ifdef USETEXTURE + bind_texture_spin(dev_spin_eo2_dn,1); + #endif + dev_Hopping_Matrix <<>>(MixedsolveParameter::getGlobalP()->dev_gf, dev_spin_eo2_dn, dev_spin_eo1_dn, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1); + #ifdef USETEXTURE + unbind_texture_spin(1); // dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn = (M_oe)*nrm*(1+imubar)*(M_eo) * dev_spin_eo3_dn + (M_oe)*nrm*epsbar*(M_eo) * dev_spin_eo3_up + #endif + + + + //////////// + // (M_oo) // + //////////// + + // written: + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo3_up, dev_spin_eo2_up, +1.0); // dev_spin_eo2_up = (1 + imubar) * dev_spin_eo3_up + dev_mul_one_pm_imubar_gamma5<<>>(dev_spin_eo3_dn, dev_spin_eo2_dn, -1.0); // dev_spin_eo2_dn = (1 - imubar) * dev_spin_eo3_dn + + + // CUBLAS: // remember: this is (M_oo) * (dev_spin_eo3_up, dev_spin_eo3_dn): + cublasAxpy (N_floats, -g_epsbar, (RealT*)dev_spin_eo3_dn, 1, 
(RealT*)dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - epsbar*dev_spin_eo3_dn = (1+imubar)*dev_spin_eo3_up - epsbar*dev_spin_eo3_dn + cublasAxpy (N_floats, -g_epsbar, (RealT*)dev_spin_eo3_up, 1, (RealT*)dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - epsbar*dev_spin_eo3_up = (1-imubar)*dev_spin_eo3_dn - epsbar*dev_spin_eo3_up + + + + /////////////////////////////////////// + // (M_oo) - (M_oe) (Mee^-1) (M_eo) // + /////////////////////////////////////// + + // CUBLAS: // this is ( (M_oo) - (M_oe) (Mee^-1) (M_eo) ) * (dev_spin_eo3_up, dev_spin_eo3_dn) + cublasAxpy (N_floats, -1.0, (RealT*)dev_spin_eo1_up, 1, (RealT*)dev_spin_eo2_up, 1); + // dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up = (1+imubar) * dev_spin_eo3_up - epsbar * dev_spin_eo3_dn + // - (M_oe)*nrm*(1-imubar)*(M_eo) * dev_spin_eo3_up - (M_oe)*nrm*epsbar*(M_eo) * dev_spin_eo3_dn + cublasAxpy (N_floats, -1.0, (RealT*)dev_spin_eo1_dn, 1, (RealT*)dev_spin_eo2_dn, 1); + // dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn = (1-imubar) * dev_spin_eo3_dn - epsbar * dev_spin_eo3_up + // - (M_oe)*nrm*(1+imubar)*(M_eo) * dev_spin_eo3_dn - (M_oe)*nrm*epsbar*(M_eo) * dev_spin_eo3_up + + + //////////// + // gamma5 // + //////////// + + // Flo: + dev_gamma5 <<>>(dev_spin_eo2_up, dev_spin_eo2_up); // dev_spin_eo2_up = gamma5 * dev_spin_eo2_up + dev_gamma5 <<>>(dev_spin_eo2_dn, dev_spin_eo2_dn); // dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn + + + + + /* + //////////// + // output // // output is already done by setting dev_spin_eo2_up/dn = spinout_up/dn + //////////// + + dev_copy_spinor_field<<>>(dev_spin_eo2_up, spinout_up); // spinout_up = dev_spin_eo2_up + dev_copy_spinor_field<<>>(dev_spin_eo2_dn, spinout_dn); // spinout_dn = dev_spin_eo2_dn + */ + + + return; + +}//matrix_multiplication32() + + + + + + +#ifdef MPI + +/////////////////////////// +// MATRIX MULTIPLICATION // +/////////////////////////// + +// the GPU implementation of Q_Qdagger_ND(...) 
//   ... from Nondegenerate_Matrix.c   (continuation of the comment line directly above)
// Flo's equivalent function for the standard and non-nd case is dev_Qtm_pm_psi


// NOTE(review): in this copy of the file every kernel launch had an empty execution
//               configuration ("<<>>"), which is not valid CUDA, and none of the
//               gridsizeN/blocksizeN parameters was referenced. The configurations below
//               are restored in order of first use:
//                   gridsize1/blocksize1 -> dev_Hopping_Matrix (and _alternate)
//                   gridsize2/blocksize2 -> dev_mul_one_pm_imubar_gamma5
//                   gridsize3/blocksize3 -> dev_gamma5
//                   gridsize4/blocksize4 -> dev_copy_spinor_field
//               This matches the "passed:" comments on griddim2..5 in benchmark_eo_nd();
//               confirm against the upstream source.
// NOTE(review): the same text damage may also have removed template arguments (e.g. on
//               MixedsolveParameter); that expression is kept exactly as found - verify.


// One halo exchange followed by one hopping-matrix application:  out = (M_eo or M_oe) * in
//
//   in        source field; it is handed to xchange_field_wrapper() before the hopping matrix
//   out       destination field
//   ieo       0: uses dev_eoidx_even, dev_eoidx_odd, dev_nn_eo   (the (M_eo) calls of the caller)
//             1: uses dev_eoidx_odd, dev_eoidx_even, dev_nn_oe   (the (M_oe) calls of the caller)
//   gridsize, blocksize   launch configuration of the hopping kernel
//                         (not used when HOPPING_DEBUG is defined)
static void mm32_mpi_hopping (dev_spinor * in, dev_spinor * out, int ieo,
                              int gridsize, int blocksize) {

  // xchange
  #ifndef HOPPING_DEBUG
    xchange_field_wrapper(in, ieo);
  #endif

  // hopping matrix
  #ifdef USETEXTURE
    bind_texture_spin(in,1);
  #endif

  #ifndef HOPPING_DEBUG
    #ifndef ALTERNATE_HOPPING_MATRIX
      // two explicit calls so that each one passes exactly the index arrays of the original code
      if (ieo == 0) {
        dev_Hopping_Matrix<<<gridsize, blocksize>>>(MixedsolveParameter::getGlobalP()->dev_gf, in, out, dev_eoidx_even, dev_eoidx_odd, dev_nn_eo, 0);
      }
      else {
        dev_Hopping_Matrix<<<gridsize, blocksize>>>(MixedsolveParameter::getGlobalP()->dev_gf, in, out, dev_eoidx_odd, dev_eoidx_even, dev_nn_oe, 1);
      }
    #else
      dev_Hopping_Matrix_alternate<<<gridsize, blocksize>>>(MixedsolveParameter::getGlobalP()->dev_gf, in, out, dev_g_iup, dev_g_idn, dev_g_eo2lexic, dev_g_lexic2eosub, ieo);
    #endif
  #else
    Hopping_Matrix_wrapper(ieo, out, in);
  #endif

  #ifdef USETEXTURE
    unbind_texture_spin(1);
  #endif

}//mm32_mpi_hopping()


// One application of
//
//     (Q_tilde) = gamma5 * ((M_oo) - (M_oe)(Mee^-1)(M_eo))
//
// to the pair (in_a, in_b). The result is written to dev_spin_eo2_up/dn,
// dev_spin_eo1_up/dn are overwritten as work space.
//
//   in_a, in_b   source fields; must be different from dev_spin_eo{1,2}_up/dn
//                because those are overwritten while in_a/in_b are still read
//   N_floats     vector length handed to cublasAxpy/cublasScal
//   nrm          1.0 / (1.0 + mubar^2 - epsbar^2)
//   gridsize1/blocksize1   launch configuration of the hopping matrix
//   gridsize2/blocksize2   launch configuration of dev_mul_one_pm_imubar_gamma5
//   gridsize3/blocksize3   launch configuration of dev_gamma5
static void mm32_mpi_apply_Q_tilde (dev_spinor * in_a, dev_spinor * in_b,
                                    int N_floats, double nrm,
                                    int gridsize1, int blocksize1,
                                    int gridsize2, int blocksize2,
                                    int gridsize3, int blocksize3) {

  ////////////////////////////
  // (M_oe) (Mee^-1) (M_eo) //
  ////////////////////////////

  mm32_mpi_hopping(in_a, dev_spin_eo1_up, 0, gridsize1, blocksize1);		// dev_spin_eo1_up = (M_eo) * in_a
  mm32_mpi_hopping(in_b, dev_spin_eo1_dn, 0, gridsize1, blocksize1);		// dev_spin_eo1_dn = (M_eo) * in_b

  // imubar, gamma5
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(dev_spin_eo1_up, dev_spin_eo2_up, -1.0);	// dev_spin_eo2_up = (1 - imubar)*(M_eo) * in_a
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(dev_spin_eo1_dn, dev_spin_eo2_dn, +1.0);	// dev_spin_eo2_dn = (1 + imubar)*(M_eo) * in_b

  // linear algebra
  cublasAxpy (N_floats, g_epsbar, (RealT*)dev_spin_eo1_dn, 1, (RealT*)dev_spin_eo2_up, 1);
  										// dev_spin_eo2_up = (1 - imubar)*(M_eo) * in_a  +  epsbar*(M_eo) * in_b
  cublasAxpy (N_floats, g_epsbar, (RealT*)dev_spin_eo1_up, 1, (RealT*)dev_spin_eo2_dn, 1);
  										// dev_spin_eo2_dn = (1 + imubar)*(M_eo) * in_b  +  epsbar*(M_eo) * in_a

  // linear algebra
  cublasScal (N_floats, nrm, (RealT*)dev_spin_eo2_up, 1);			// dev_spin_eo2_up = nrm * dev_spin_eo2_up
  cublasScal (N_floats, nrm, (RealT*)dev_spin_eo2_dn, 1);			// dev_spin_eo2_dn = nrm * dev_spin_eo2_dn

  mm32_mpi_hopping(dev_spin_eo2_up, dev_spin_eo1_up, 1, gridsize1, blocksize1);	// dev_spin_eo1_up = (M_oe) * dev_spin_eo2_up
  mm32_mpi_hopping(dev_spin_eo2_dn, dev_spin_eo1_dn, 1, gridsize1, blocksize1);	// dev_spin_eo1_dn = (M_oe) * dev_spin_eo2_dn
  										// now: dev_spin_eo1_up/dn = ((M_oe)(Mee^-1)(M_eo)) * (in_a, in_b)


  ////////////
  // (M_oo) //
  ////////////

  // imubar, gamma5
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(in_a, dev_spin_eo2_up, +1.0);	// dev_spin_eo2_up = (1 + imubar) * in_a
  dev_mul_one_pm_imubar_gamma5<<<gridsize2, blocksize2>>>(in_b, dev_spin_eo2_dn, -1.0);	// dev_spin_eo2_dn = (1 - imubar) * in_b

  // linear algebra								// this is (M_oo) * (in_a, in_b):
  cublasAxpy (N_floats, -g_epsbar, (RealT*)in_b, 1, (RealT*)dev_spin_eo2_up, 1);	// dev_spin_eo2_up = (1+imubar)*in_a - epsbar*in_b
  cublasAxpy (N_floats, -g_epsbar, (RealT*)in_a, 1, (RealT*)dev_spin_eo2_dn, 1);	// dev_spin_eo2_dn = (1-imubar)*in_b - epsbar*in_a


  ///////////////////////////////////////
  // (M_oo)  -  (M_oe) (Mee^-1) (M_eo) //
  ///////////////////////////////////////

  // linear algebra
  cublasAxpy (N_floats, -1.0, (RealT*)dev_spin_eo1_up, 1, (RealT*)dev_spin_eo2_up, 1);	// dev_spin_eo2_up = dev_spin_eo2_up - dev_spin_eo1_up
  cublasAxpy (N_floats, -1.0, (RealT*)dev_spin_eo1_dn, 1, (RealT*)dev_spin_eo2_dn, 1);	// dev_spin_eo2_dn = dev_spin_eo2_dn - dev_spin_eo1_dn


  ////////////
  // gamma5 //
  ////////////

  dev_gamma5<<<gridsize3, blocksize3>>>(dev_spin_eo2_up, dev_spin_eo2_up);	// dev_spin_eo2_up = gamma5 * dev_spin_eo2_up
  dev_gamma5<<<gridsize3, blocksize3>>>(dev_spin_eo2_dn, dev_spin_eo2_dn);	// dev_spin_eo2_dn = gamma5 * dev_spin_eo2_dn

}//mm32_mpi_apply_Q_tilde()


// (spinout_up, spinout_dn) = (Q_tilde)(Q_tilde_dagger) * (spinin_up, spinin_dn)      -- MPI version
//
/////////////  THEORY  ////////////////////////////////////////////////////////////////
//                                                                                   //
//  (Q_tilde) = gamma5 * ((M_oo) - (M_oe)(Mee^-1)(M_eo))                             //
//  (Q_tilde)(Q_tilde_dagger) * (up,dn) = (Q_tilde) * (b,a)                          //
//                                (a,b) = (Q_tilde) * (dn,up)                        //
//                                                                                   //
///////////////////////////////////////////////////////////////////////////////////////
//
// MATCHING with Q_Qdagger_ND:
//     _strange = _up                       _charm = _dn
//     DUM_MATRIX   = dev_spin_eo1_up       DUM_MATRIX+1 = dev_spin_eo1_dn
//     DUM_MATRIX+2 = dev_spin_eo2_up       DUM_MATRIX+3 = dev_spin_eo2_dn
//
//   spinout_up/dn   receive the result; may be used freely as work space
//   spinin_up/dn    sources; not written by the kernels/cublas calls issued here
//                   (they are handed to xchange_field_wrapper(), though)
//   gridsizeN/blocksizeN   launch configurations, see the NOTE(review) at the top of this block
//
// Side effects: the file-level pointers dev_spin_eo2_up/dn are redirected to spinout_up/dn
// (so no copy to the output is needed at the end), dev_spin_eo1_up/dn and dev_spin_eo3_up/dn
// are overwritten. The caller (dev_cg_eo_nd) therefore has to use other auxiliary fields.
void matrix_multiplication32_mpi (dev_spinor * spinout_up, dev_spinor * spinout_dn,
                                  dev_spinor * spinin_up , dev_spinor * spinin_dn ,
                                  int gridsize1, int blocksize1, int gridsize2, int blocksize2,
                                  int gridsize3, int blocksize3, int gridsize4, int blocksize4) {

  /////////////////////
  // LOCAL VARIABLES //
  /////////////////////

  int N_floats = 24*VOLUME/2;		// #floats  (24 per lattice site, VOLUME/2 sites)

  double nrm = 1.0 / (1.0 + g_mubar*g_mubar - g_epsbar*g_epsbar);


  ///////////////////////////////////
  // INITIALIZATIONS & ASSIGNMENTS //
  ///////////////////////////////////

  dev_spin_eo2_up = spinout_up;		// need no memory allocated
  dev_spin_eo2_dn = spinout_dn;


  ///////////////////////////////////////				/////////////////////////////////
  //        Q_tilde_dagger(2x2)        //				// (a,b) = (Q_tilde) * (dn,up) //
  ///////////////////////////////////////				/////////////////////////////////

  mm32_mpi_apply_Q_tilde(spinin_dn, spinin_up, N_floats, nrm,
                         gridsize1, blocksize1, gridsize2, blocksize2, gridsize3, blocksize3);
  										// dev_spin_eo2_up = a ,  dev_spin_eo2_dn = b


  ////////////////////
  // (a,b) -> (b,a) //
  ////////////////////

  dev_copy_spinor_field<<<gridsize4, blocksize4>>>(dev_spin_eo2_dn, dev_spin_eo3_up);	// dev_spin_eo3_up = dev_spin_eo2_dn
  dev_copy_spinor_field<<<gridsize4, blocksize4>>>(dev_spin_eo2_up, dev_spin_eo3_dn);	// dev_spin_eo3_dn = dev_spin_eo2_up


  ///////////////////////////////////					///////////////////////
  //        Q_tilde(2x2)           //					// (Q_tilde) * (b,a) //
  ///////////////////////////////////					///////////////////////

  mm32_mpi_apply_Q_tilde(dev_spin_eo3_up, dev_spin_eo3_dn, N_floats, nrm,
                         gridsize1, blocksize1, gridsize2, blocksize2, gridsize3, blocksize3);


  ////////////
  // output //		// already in place: dev_spin_eo2_up/dn point to spinout_up/dn
  ////////////

  return;

}//matrix_multiplication32_mpi()


#endif // MPI






		/////////////////////
		//                 //
		//    BENCHMARK    //
		//                 //
		/////////////////////




// benchmark_eo_nd()
//
// Times N applications of the even/odd-preconditioned non-degenerate matrix on the device
// and prints the elapsed time, the flop count and the resulting Gflop/s.
//
//   Q_up, Q_dn : host spinor fields; copied to the device once (to_device) and used as the
//                first source
//   N          : number of matrix applications that are timed
//
// Which matrix routine is called depends on the build: matrix_multiplication32() without MPI,
// matrix_multiplication32_mpi() with MPI, matrix_multiplication32_mpi_ASYNC() with MPI and ASYNC.
//
// NOTE(review): the commented-out upstream code marked "only when externally called"
// (init_mixedsolve_eo_nd(), he_cg_init(), he_cg_init_nd_additional(), texture binding,
// cublasInit()/cublasShutdown()) suggests that the caller is expected to have initialised the
// device state before calling this function -- confirm against the call site.

extern "C" void benchmark_eo_nd (spinor * Q_up, spinor * Q_dn, int N) {
  
  
  
  ////////////////////////////////////////////////////////////////////////////////////////////////////////
  //                                                                                                    //
  //    total FLOPS  =  (#iterations) * (FLOPS/matrix application) * (#lattice sites)                   //
  //                                                                                                    //
  //                                                                                                    //
  //    FLOPS per lattice site and application of the function,                                         //
  //    count the floating point op's on device:                                                        //
  //                                                                                                    //
  //    dev_Hopping_Matrix            =  4136                                                           //
  //    dev_mul_one_pm_imubar_gamma5  =  120                                                            //
  //    dev_gamma5                    =  12                                                             //
  //                                                                                                    //
  //    cublasSaxpy                   =  24*2  =  48                                                    //
  //    cublasSscal                   =  24*1  =  24                                                    //
  //                                                                                                    //
  //                                                                                                    //
  //    (FLOPS/matrix application)  =  2 * (4*4136 + 4*120 + 6*48 + 2*24 + 2*12)  =  2 * 17384  =  34768  //
  //                                                                                                    //
  ////////////////////////////////////////////////////////////////////////////////////////////////////////
  
  
  // timing
  #ifndef MPI
    double timeElapsed;
  #else
    double singleTimeElapsed;			// this process
    double maxTimeElapsed;			// slowest process (MPI_MAX over all ranks)
  #endif
  double startBenchmark;
  double stopBenchmark;
  
  // counter
  int i;
  
  // flop counting
  //   only the "effective" count is reported; the "real" count of 34768 flops per application
  //   (see the table above) was disabled upstream
  //   an alternative effective value noted upstream: 23984.0 (hopping = 1488)
  double effectiveFlopsPerApp = 21296.0;	// per lattice site
  
  double effectiveDeviceFlops;			// flops done by this process
  #ifdef MPI
    double allEffectiveDeviceFlops;		// flops summed over all processes
  #endif
  double effectiveFlops;			// Gflop/s
  
  // CUDA errors
  //   not referenced directly here; presumably used by the CUDA_*/CUBLAS_* check macros -- TODO confirm
  cudaError_t cudaerr;
  cublasStatus cublasstatus;
  
  // formal parameters
  int staticsource = 0;		// 1: applies matrix every time on the same source
  				// 0: applies matrix consecutively ...
  
  
  dev_spinor * A_up;		// A = (matrix)*B
  dev_spinor * A_dn;
  dev_spinor * B_up;
  dev_spinor * B_dn;
  
  dev_spinor * C_up;		// scratch pointers for swapping A and B
  dev_spinor * C_dn;
  
  #ifndef MPI
    cudaMalloc((void **) &A_up, dev_spinsize_int);
    cudaMalloc((void **) &A_dn, dev_spinsize_int);
    cudaMalloc((void **) &B_up, dev_spinsize_int);
    cudaMalloc((void **) &B_dn, dev_spinsize_int);
  #else
    cudaMalloc((void **) &A_up, dev_spinsize_ext);
    cudaMalloc((void **) &A_dn, dev_spinsize_ext);
    cudaMalloc((void **) &B_up, dev_spinsize_ext);
    cudaMalloc((void **) &B_dn, dev_spinsize_ext);
  #endif
  
  		// debug	// CUDA
  		#ifdef CUDA_DEBUG
  		  CUDA_CHECK_NO_SUCCESS_MSG("CUDA error in benchmark_eo_nd(). Memory allocation of spinor fields failed.");
  		#endif
  
  
  // CUDA block- and gridsize specifications
  //   one thread per site of the half lattice; the grid size is VOLUME/2 divided by the
  //   block size, rounded up
  const int halfVolume = VOLUME/2;
  
  const int blockdim2 = BLOCKSIZE2;						// passed:	dev_Hopping_Matrix
  const int griddim2  = (halfVolume + blockdim2 - 1) / blockdim2;
  
  const int blockdim3 = BLOCKSIZE3;						// passed:	dev_mul_one_pm_imubar_gamma5
  const int griddim3  = (halfVolume + blockdim3 - 1) / blockdim3;
  
  const int blockdim4 = BLOCKSIZE4;						// passed:	dev_gamma5
  const int griddim4  = (halfVolume + blockdim4 - 1) / blockdim4;
  
  const int blockdim5 = BLOCKSIZE5;						// passed:	dev_copy_spinor_field
  const int griddim5  = (halfVolume + blockdim5 - 1) / blockdim5;
  
  
  		//debug
  		#ifndef MPI
  		  printf("\nStarting a little BENCHMARK. benchmark_eo_nd().\n");
  		#else
  		  if (g_proc_id == 0) printf("\nStarting a little BENCHMARK. benchmark_eo_nd_mpi().\n");
  		#endif
  
  
  		// debug
  		#ifndef MPI
  		  printf("Applying the eo-preconditioned matrix %i times.\n", N);
  		#else
  		  if (g_proc_id == 0) printf("Applying the eo-preconditioned matrix %i times.\n", N);
  		#endif
  
  
  to_device(B_up, Q_up, h2d_spin_up, dev_spinsize_int);
  to_device(B_dn, Q_dn, h2d_spin_dn, dev_spinsize_int);
  
  
  // timer
  #ifndef MPI
    startBenchmark = double(clock()) / double(CLOCKS_PER_SEC);
  #else
    startBenchmark = MPI_Wtime();
  #endif
  
  
  for (i = 0; i < N; i++) {
  
  
    #ifndef MPI
    	matrix_multiplication32(A_up, A_dn,					// A = (matrix)*B
    	                        B_up, B_dn,
    	                        griddim2, blockdim2,
    	                        griddim3, blockdim3,
    	                        griddim4, blockdim4,
    	                        griddim5, blockdim5);
    #else
    	#ifndef ASYNC
    	  matrix_multiplication32_mpi(A_up, A_dn,				// A = (matrix)*B
    	                              B_up, B_dn,
    	                              griddim2, blockdim2,
    	                              griddim3, blockdim3,
    	                              griddim4, blockdim4,
    	                              griddim5, blockdim5);
    	#else
    	  matrix_multiplication32_mpi_ASYNC(A_up, A_dn,				// A = (matrix)*B
    	                                    B_up, B_dn,
    	                                    griddim2, blockdim2,
    	                                    griddim3, blockdim3,
    	                                    griddim4, blockdim4,
    	                                    griddim5, blockdim5);
    	#endif
    #endif
    
    
    if (staticsource == 0) {
      // swaps A and B, so the next application acts on the result of this one
      C_up = B_up;
      C_dn = B_dn;
      B_up = A_up;
      B_dn = A_dn;
      A_up = C_up;
      A_dn = C_dn;
    }
    // else: keep applying the matrix to the same source
    
  }
  
  
  // kernel launches return before the device has finished; wait for all queued work so that
  // the stop time really includes the last matrix applications
  cudaDeviceSynchronize();
  
  		// debug	// CUDA
  		#ifdef CUDA_DEBUG
  		  CUDA_CHECK_NO_SUCCESS_MSG("CUDA error in matrix_muliplication32(). Applying the matrix on GPU failed.");
  		#endif
  
  
  // timer
  #ifndef MPI
    stopBenchmark = double(clock()) / double(CLOCKS_PER_SEC);
  #else
    stopBenchmark = MPI_Wtime();
  #endif
  
  
  // flops of this process
  //   computed in double: the integer product N*VOLUME overflows for long runs on large lattices
  effectiveDeviceFlops = double(N) * double(VOLUME/2) * effectiveFlopsPerApp;
  
  
  #ifndef MPI
  
  	timeElapsed = stopBenchmark - startBenchmark;
  	effectiveFlops = effectiveDeviceFlops / timeElapsed / 1.0e9;
  	
  	printf("EFFECTIVE:\n");
  	printf("\ttime: %.4e sec\n", timeElapsed);
  	printf("\tflop's: %.4e flops\n", effectiveDeviceFlops);
  	printf("\tperformance: %.4e Gflop/s\n\n", effectiveFlops);
  
  #else
  
  	// the slowest process determines the time, the flops of all processes are summed up
  	singleTimeElapsed = stopBenchmark - startBenchmark;
  	MPI_Allreduce(&singleTimeElapsed, &maxTimeElapsed, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD);
  	MPI_Allreduce(&effectiveDeviceFlops, &allEffectiveDeviceFlops, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
  	effectiveFlops = allEffectiveDeviceFlops / maxTimeElapsed / 1.0e9;
  	
  	
  	if (g_proc_id == 0) {
  	
  	  printf("\tEFFECTIVE:\n");
  	  printf("\ttime: %.4e sec\n", maxTimeElapsed);
  	  printf("\tflop's: %.4e flops\n", allEffectiveDeviceFlops);
  	  printf("\tperformance: %.4e Gflop/s\n\n", effectiveFlops);
  	  
  	  #if ASYNC > 0 && defined(ASYNC_TIMING)
  	    // calculate the times from the "beginning"
  	    //   cudaEventElapsedTime() reports milliseconds, hence the /1000 in the output below
  	    cudaEventElapsedTime(&time_stop_D2H_1, start_ALL, stop_D2H_1);
  	    cudaEventElapsedTime(&time_stop_D2H_2, start_ALL, stop_D2H_2);
  	    cudaEventElapsedTime(&time_stop_INT_0, start_ALL, stop_INT_0);
  	    cudaEventElapsedTime(&time_stop_H2D_3, start_ALL, stop_H2D_3);
  	    cudaEventElapsedTime(&time_stop_H2D_4, start_ALL, stop_H2D_4);
  	    cudaEventElapsedTime(&time_stop_EXT_1, start_ALL, stop_EXT_1);
  	    cudaEventElapsedTime(&time_stop_EXT_2, start_ALL, stop_EXT_2);
  	    cudaEventElapsedTime(&time_stop_ALL  , start_ALL, stop_ALL);
  	    mpiTime_start_sendrecv_1 = mpi_start_sendrecv_1 - mpi_start_ALL;
  	    mpiTime_stop_sendrecv_1  = mpi_stop_sendrecv_1  - mpi_start_ALL;
  	    mpiTime_start_sendrecv_2 = mpi_start_sendrecv_2 - mpi_start_ALL;
  	    mpiTime_stop_sendrecv_2  = mpi_stop_sendrecv_2  - mpi_start_ALL;
  	    // outputting the times
  	    #if ASYNC == 1
  	      printf("\tTIMING[sec]:\n");
  	      printf("\tSTART: %.2e - \n", 0.0);
  	      printf("\tD2H_1: - %.2e\n", time_stop_D2H_1/1000);
  	      printf("\tINT_0: - %.2e\n", time_stop_INT_0/1000);
  	      printf("\tSENDRECV_1: %.2e - %.2e\n", mpiTime_start_sendrecv_1, mpiTime_stop_sendrecv_1);
  	      printf("\tH2D_3: %.2e - %.2e\n", mpiTime_stop_sendrecv_1, time_stop_H2D_3/1000);
  	      printf("\tEXT_1: %.2e - %.2e\n", time_stop_H2D_3/1000, time_stop_EXT_1/1000);
  	      printf("\tD2H_2: %.2e - %.2e\n", mpiTime_stop_sendrecv_1, time_stop_D2H_2/1000);
  	      printf("\tSENDRECV_2: %.2e - %.2e\n", mpiTime_start_sendrecv_2, mpiTime_stop_sendrecv_2);
  	      printf("\tH2D_4: %.2e - %.2e\n", mpiTime_stop_sendrecv_2, time_stop_H2D_4/1000);
  	      printf("\tEXT_2: %.2e - %.2e\n", time_stop_H2D_4/1000, time_stop_EXT_2/1000);
  	      printf("\tSTOP: - %.2e\n", time_stop_ALL/1000);
  	    #elif ASYNC == 2
  	      printf("\tTIMING[sec]:\n");
  	      printf("\tSTART: %.2e - \n", 0.0);
  	      printf("\tD2H_1: - %.2e\n", time_stop_D2H_1/1000);
  	      printf("\tD2H_2: - %.2e\n", time_stop_D2H_2/1000);
  	      printf("\tINT_0: - %.2e\n", time_stop_INT_0/1000);
  	      printf("\tSENDRECV_1: %.2e - %.2e\n", mpiTime_start_sendrecv_1, mpiTime_stop_sendrecv_1);
  	      printf("\tH2D_3: %.2e - %.2e\n", mpiTime_stop_sendrecv_1, time_stop_H2D_3/1000);
  	      printf("\tEXT_1: %.2e - %.2e\n", time_stop_H2D_3/1000, time_stop_EXT_1/1000);
  	      printf("\tSENDRECV_2: %.2e - %.2e\n", mpiTime_start_sendrecv_2, mpiTime_stop_sendrecv_2);
  	      printf("\tH2D_4: %.2e - %.2e\n", mpiTime_stop_sendrecv_2, time_stop_H2D_4/1000);
  	      printf("\tEXT_2: %.2e - %.2e\n", time_stop_H2D_4/1000, time_stop_EXT_2/1000);
  	      printf("\tSTOP: - %.2e\n", time_stop_ALL/1000);
  	    #endif
  	  #endif
  	
  	}
  
  #endif	// MPI
  
  
  // the pointers may have been swapped an odd number of times, but all four buffers are still
  // exactly the ones allocated above
  cudaFree(A_up);
  cudaFree(A_dn);
  cudaFree(B_up);
  cudaFree(B_dn);
  
  
}//benchmark_eo_nd()






		////////////////////////
		//                    //
		//    MIXED SOLVER    //
		//                    //
		////////////////////////




////////////////////////
// CONJUGATE GRADIENT //
////////////////////////

// for the odd field after even/odd-preconditioning
// single precision on GPU
//
// Solves  A*x = b  with the conjugate gradient method, where A is the matrix applied by
// matrix_multiplication32[_mpi[_ASYNC]]() and b = (Q_up, Q_dn). The iteration starts from x(0) = 0.
//
//   gf               : gauge field on the device; not referenced by the live code of this function
//   P_up, P_dn       : device fields that receive the solution x
//   Q_up, Q_dn       : device fields holding the right-hand side b; only read
//   max_iter         : maximal number of iterations
//   check_abs        : if non-zero, stop as soon as  rr <= eps_abs
//   check_rel        : if non-zero, stop as soon as  rr <= eps_rel * r0r0
//   eps_abs, eps_rel : the corresponding thresholds for the squared residue rr
//
// returns: j+1, where j is the loop counter at exit, i.e. the number of iterations done when the
//          precision was reached
//          NOTE(review): when the loop runs out, j == max_iter and max_iter+1 is returned -- confirm
//          that the caller expects this
//
// The residue r, the search direction d and A*d live in dev_spin1/2/3_up/dn, which according to
// the upstream note are allocated by init_mixedsolve_eo_nd().
// The program is terminated with exit(-1) if dAd or rr becomes NaN.

int cg_eo_nd (dev_su3_2v * gf,
              dev_spinor * P_up, dev_spinor * P_dn,
              dev_spinor * Q_up, dev_spinor * Q_dn,
              int max_iter,
              int check_abs , int check_rel,
              double eps_abs, double eps_rel       ) {
  
  // P_up/dn  can be used as auxiliary field to work on, as it is not later used (could be used as initial guess at the very start)
  // Q_up/dn  can be used as feedback, or if not, also as auxiliary field
  
  
  
  typedef REAL RealT;
  
  /////////////////////
  // LOCAL VARIABLES //
  /////////////////////
  
  // CUDA
  //   not referenced directly here; presumably used by the CUDA_*/CUBLAS_* check macros -- TODO confirm
  cudaError_t cudaerr;
  cublasStatus cublasstatus;
  
  // algorithm
  float rr_up;
  float rr_dn;
  float rr;			// squared residue r(k+1)*r(k+1)
  float rr_old;			// squared residue of the previous iteration
  float r0r0;			// squared initial residue, reference for the relative check
  
  float dAd_up;
  float dAd_dn;
  float dAd;
  
  float alpha;
  float beta;
  
  // (auxiliary) device fields
  dev_spinor *  r_up, *  r_dn,
             * Ad_up, * Ad_dn,
             *  x_up, *  x_dn,
             *  d_up, *  d_dn,
             * Ax_up, * Ax_dn;		// for recalculating the residue
  
  // counting
  int j;				// iteration counter
  
  // algorithm control parameters
  //   recalculate residue r(k+1) = b - A*x(k+1) each N_recalc_res iteration
  //   (upstream also noted 10 as a possible value)
  int N_recalc_res = 1000;
  
  // host fields, only needed when the host matrix is applied on trial
  //   they used to be allocated in every build although only the MATRIX_DEBUG code touches them
  //   NOTE(review): they are still never released in the MATRIX_DEBUG build; the matching release
  //                 function is not visible from here
  #ifdef MATRIX_DEBUG
    spinor ** up_field = NULL;
    spinor ** dn_field = NULL;
    const int nr_sf = 5;
    
    init_solver_field(&up_field, VOLUMEPLUSRAND/2, nr_sf);
    init_solver_field(&dn_field, VOLUMEPLUSRAND/2, nr_sf);
  #endif
  
  /////////////////////////////////////////////
  // CUDA block- and gridsize specifications //
  /////////////////////////////////////////////
  
  // one thread per site of the half lattice; the grid size is VOLUME/2 divided by the
  // block size, rounded up
  const int halfVolume = VOLUME/2;
  
  const int blockdim1 = BLOCKSIZE1;						// here:	dev_zero_spinor_field , dev_copy_spinor_field
  const int griddim1  = (halfVolume + blockdim1 - 1) / blockdim1;
  
  const int blockdim2 = BLOCKSIZE2;						// passed:	dev_Hopping_Matrix
  const int griddim2  = (halfVolume + blockdim2 - 1) / blockdim2;
  
  const int blockdim3 = BLOCKSIZE3;						// passed:	dev_mul_one_pm_imubar_gamma5
  const int griddim3  = (halfVolume + blockdim3 - 1) / blockdim3;
  
  const int blockdim4 = BLOCKSIZE4;						// passed:	dev_gamma5
  const int griddim4  = (halfVolume + blockdim4 - 1) / blockdim4;
  
  const int blockdim5 = BLOCKSIZE5;						// passed:	dev_copy_spinor_field
  const int griddim5  = (halfVolume + blockdim5 - 1) / blockdim5;
  
  
  
  
  /////////////////
  // ASSIGNMENTS //
  /////////////////
  
  x_up  = P_up;							// can use the output spinors also as auxiliary fields
  x_dn  = P_dn;							//	saves copying the output spinor field
  // r_up/dn = Q_up/dn could possibly be done if Q_up/dn is not used as feedback,
  //	would save one field and one copying the field
  r_up  = dev_spin1_up;						// use these pointers to the allocated space on device memory (allocated by init_mixedsolve_eo_nd)
  r_dn  = dev_spin1_dn;
  d_up  = dev_spin2_up;
  d_dn  = dev_spin2_dn;
  Ad_up = dev_spin3_up;
  Ad_dn = dev_spin3_dn;
  Ax_up = Ad_up;						// works as long as no initial guess vector x(0) is passed to cg_eo_nd()
  Ax_dn = Ad_dn;
  
  
  
  
  /////////////////////
  // INITIALIZATIONS //
  /////////////////////
  
  // relocated to mixedsolve_eo_nd(): initializing the kappas, binding the gauge field texture,
  // he_cg_init(), he_cg_init_nd_additional(); cublasInit() is not called here either
  
  
  
  
  ///////////////
  // ALGORITHM //
  ///////////////
  
  // launch geometry of the three kernels below: griddim1/blockdim1, as noted at their declaration
  
  // initialize x(0) = 0	// will be added up
  dev_zero_spinor_field<<<griddim1, blockdim1>>>(x_up);
  dev_zero_spinor_field<<<griddim1, blockdim1>>>(x_dn);
  
  
  // r(0) = b - A*x(0) = b
  dev_copy_spinor_field<<<griddim1, blockdim1>>>(Q_up, r_up);
  dev_copy_spinor_field<<<griddim1, blockdim1>>>(Q_dn, r_dn);
  
  
  // d(0) = r(0)
  dev_copy_spinor_field<<<griddim1, blockdim1>>>(r_up, d_up);
  dev_copy_spinor_field<<<griddim1, blockdim1>>>(r_dn, d_dn);
  
  
  		// debug	// kernel
  		#ifdef CUDA_DEBUG
  		  CUDA_KERNEL_CHECK("Kernel error in cg_eo_nd(). Initializing spinor fields on device failed.", "Spinor fields initialized on device.");
  		#endif
  
  
  
  
  // rr = (r_up)^2 + (r_dn)^2
  #ifndef MPI
    rr_up = cublasDot(N_floats_int, (float *) r_up, 1, (float *) r_up, 1);
    rr_dn = cublasDot(N_floats_int, (float *) r_dn, 1, (float *) r_dn, 1);
  #else
    rr_up = cublasDot_wrapper(N_floats_int, (float *) r_up, 1, (float *) r_up, 1);
    rr_dn = cublasDot_wrapper(N_floats_int, (float *) r_dn, 1, (float *) r_dn, 1);
  #endif
  rr = rr_up + rr_dn;
  
  
  
  
  r0r0   = rr;	// for relative precision
  rr_old = rr;	// for the first iteration
  
  
  
  
  //////////
  // LOOP //
  //////////
  
  
  		// debug
  		#ifndef MPI
  		  printf("\nEntering inner loop.\n");
  		#else
  		  if (g_cart_id == 0) printf("\nEntering inner loop.\n");
  		#endif
  
  		// debug	// CUBLAS core function
  		#ifdef CUDA_DEBUG
  		  CUBLAS_CORE_CHECK_NO_SUCCESS_MSG("CUBLAS error in cg_eo_nd(). Calculating initial residue failed.");
  		#endif
  
  		// debug
  		#ifndef MPI
  		  printf("Initial inner residue: %.6e\n", r0r0);
  		#else
  		  if (g_cart_id == 0) printf("Initial inner residue: %.6e\n", r0r0);
  		#endif
  
  
  
  
  for (j = 0; j < max_iter; j++) {
    
    
    #ifndef MATRIX_DEBUG
    
      // A*d(k)
      #ifndef MPI
      	matrix_multiplication32(Ad_up, Ad_dn,				// normally:  matrix_multiplication32()
      	                         d_up,  d_dn,				// debugging: matrix_debug1(), matrix_multiplication_test()
      	                        griddim2, blockdim2,
      	                        griddim3, blockdim3,
      	                        griddim4, blockdim4,
      	                        griddim5, blockdim5);
      #else
      	#ifndef ASYNC
      	  matrix_multiplication32_mpi(Ad_up, Ad_dn,			// normally:  matrix_multiplication32_mpi()
      	                               d_up,  d_dn,			// debugging: matrix_mpi_debug1/2/3/4()
      	                              griddim2, blockdim2,
      	                              griddim3, blockdim3,
      	                              griddim4, blockdim4,
      	                              griddim5, blockdim5);
      	#else								// tries to overlap computation and communication
      	  matrix_multiplication32_mpi_ASYNC(Ad_up, Ad_dn,
      	                                     d_up,  d_dn,
      	                                    griddim2, blockdim2,
      	                                    griddim3, blockdim3,
      	                                    griddim4, blockdim4,
      	                                    griddim5, blockdim5);
      	#endif
      #endif	// MPI
      
      
  		// debug	// CUDA		// also other stuff ?!
  		#ifdef CUDA_DEBUG
  		  CUDA_CHECK_NO_SUCCESS_MSG("CUDA error in matrix_muliplication32(). Applying the matrix on GPU failed.");
  		#endif
    
    
    #else
    
      		// debug	// apply the host matrix on trial
      		
      		// host/device interaction
      		to_host(up_field[3], d_up, h2d_spin_up, dev_spinsize_int);
      		to_host(dn_field[3], d_dn, h2d_spin_dn, dev_spinsize_int);
      		
      		// matrix multiplication
      		#ifndef MPI
      		  printf("This is Q_Qdagger_ND(). ");
      		#else
      		  if (g_proc_id == 0) printf("This is Q_Qdagger_ND(). ");
      		#endif
      		Q_Qdagger_ND(up_field[4], dn_field[4],			// normally:  Q_Qdagger_ND()
      		             up_field[3], dn_field[3] );		// debugging: matrix_debug2(), Zwitter1(), Zwitter2(), Zwitter3()
      		             						//       mpi: matrix_mpi_debug10()
      		// host/device interaction
      		to_device(Ad_up, up_field[4], h2d_spin_up, dev_spinsize_int);
      		to_device(Ad_dn, dn_field[4], h2d_spin_dn, dev_spinsize_int);
      		
      		
  		// debug	// CUDA
  		#ifdef CUDA_DEBUG
  		  CUDA_CHECK_NO_SUCCESS_MSG("CUDA error in cg_eo_nd(). Applying the matrix on CPU failed.");
  		#endif
    
    #endif	// MATRIX_DEBUG
    
    
    // alpha = r(k)*r(k) / d(k)*A*d(k)
    #ifndef MPI
      dAd_up = cublasDot(N_floats_int, (float *) d_up, 1, (float *) Ad_up, 1);
      dAd_dn = cublasDot(N_floats_int, (float *) d_dn, 1, (float *) Ad_dn, 1);
    #else
      dAd_up = cublasDot_wrapper(N_floats_int, (float *) d_up, 1, (float *) Ad_up, 1);
      dAd_dn = cublasDot_wrapper(N_floats_int, (float *) d_dn, 1, (float *) Ad_dn, 1);
    #endif
    dAd    = dAd_up + dAd_dn;
    
    		// debug	// is NaN ?
    		//   the condition is parenthesized so that it also compiles when isnan is a function
    		//   and not a macro
    		if (isnan(dAd)) {
    		  printf("Error in cg_eo_nd(). dAd is NaN.\n");
    		  exit(-1);
    		}
    
    alpha  = rr_old / dAd;	// rr_old is taken from the last iteration respectively
    
    
    
    
    // x(k+1) = x(k) + alpha*d(k)
    cublasAxpy(N_floats_int, alpha, (RealT*)d_up, 1, (RealT*)x_up, 1);
    cublasAxpy(N_floats_int, alpha, (RealT*)d_dn, 1, (RealT*)x_dn, 1);
    
    
    
    
    // r(k+1)
    if ( (j+1) % N_recalc_res != 0 ) {	// r(k+1) = r(k) - alpha*A*d(k)
      cublasAxpy(N_floats_int, -1.0*alpha, (RealT*)Ad_up, 1, (RealT*)r_up, 1);
      cublasAxpy(N_floats_int, -1.0*alpha, (RealT*)Ad_dn, 1, (RealT*)r_dn, 1);
    }
    
    else {				// recalculate residue r(k+1) = b - A*x(k+1)
    					//	"feedback"
      		// debug
      		#ifndef MPI
      		  printf("Recalculating the inner residue.\n");
      		#else
      		  if (g_proc_id == 0) printf("Recalculating the inner residue.\n");
      		#endif
      
      
      // A*x(k+1)
      //   Ax_up/dn alias Ad_up/dn (see ASSIGNMENTS), so A*d(k) is overwritten here
      
      #ifndef MATRIX_DEBUG
      
        #ifndef MPI
        	matrix_multiplication32(Ax_up, Ax_dn,
        	                         x_up,  x_dn,
        	                        griddim2, blockdim2,
        	                        griddim3, blockdim3,
        	                        griddim4, blockdim4,
        	                        griddim5, blockdim5);
        #else
        	#ifndef ASYNC
        	  matrix_multiplication32_mpi(Ax_up, Ax_dn,			// normally:  matrix_multiplication32_mpi()
        	                               x_up,  x_dn,			// debugging: matrix_mpi_debug1/2/3/4()
        	                              griddim2, blockdim2,
        	                              griddim3, blockdim3,
        	                              griddim4, blockdim4,
        	                              griddim5, blockdim5);
        	#else
        	  matrix_multiplication32_mpi_ASYNC(Ax_up, Ax_dn,		// normally:  matrix_multiplication32_mpi()
        	                                     x_up,  x_dn,		// debugging: matrix_mpi_debug1/2/3/4()
        	                                    griddim2, blockdim2,
        	                                    griddim3, blockdim3,
        	                                    griddim4, blockdim4,
        	                                    griddim5, blockdim5);
        	#endif
        #endif	// MPI
      
      #else
      
        		// debug	// apply the host matrix on trial
        		
        		// host/device interaction
        		to_host(up_field[3], x_up, h2d_spin_up, dev_spinsize_int);
        		to_host(dn_field[3], x_dn, h2d_spin_dn, dev_spinsize_int);
        		
        		// matrix multiplication
        		#ifndef MPI
        		  printf("This is Q_Qdagger_ND(). ");
        		#else
        		  if (g_proc_id == 0) printf("This is Q_Qdagger_ND(). ");
        		#endif
        		Q_Qdagger_ND(up_field[4], dn_field[4],			// normally:       Q_Qdagger_ND()
        		             up_field[3], dn_field[3] );		// debugging, mpi: matrix_mpi_debug10()
        		
        		// host/device interaction
        		to_device(Ax_up, up_field[4], h2d_spin_up, dev_spinsize_int);
        		to_device(Ax_dn, dn_field[4], h2d_spin_dn, dev_spinsize_int);
        		
        		
  		// debug	// CUDA
  		#ifdef CUDA_DEBUG
  		  CUDA_CHECK_NO_SUCCESS_MSG("CUDA error in cg_eo_nd(). Applying the matrix on CPU failed.");
  		#endif
      
      #endif	// MATRIX_DEBUG
      
      
      
      
      // r(k+1) = b - A*x(k+1)
      cublasCopy(N_floats_int, (RealT*)Q_up, 1, (RealT*)r_up, 1);		// r_up = Q_up
      cublasCopy(N_floats_int, (RealT*)Q_dn, 1, (RealT*)r_dn, 1);		// r_dn = Q_dn
      cublasAxpy(N_floats_int, -1.0, (RealT*)Ax_up, 1, (RealT*)r_up, 1);	// r_up = Q_up - Ax_up
      cublasAxpy(N_floats_int, -1.0, (RealT*)Ax_dn, 1, (RealT*)r_dn, 1);	// r_dn = Q_dn - Ax_dn
      
      
    } // recalculate residue
    
    
    
    
    // r(k+1)*r(k+1)
    #ifndef MPI
      rr_up  = cublasDot(N_floats_int, (float *) r_up, 1, (float *) r_up, 1);
      rr_dn  = cublasDot(N_floats_int, (float *) r_dn, 1, (float *) r_dn, 1);
    #else
      rr_up  = cublasDot_wrapper(N_floats_int, (float *) r_up, 1, (float *) r_up, 1);
      rr_dn  = cublasDot_wrapper(N_floats_int, (float *) r_dn, 1, (float *) r_dn, 1);
    #endif
    rr     = rr_up + rr_dn;
    
  		// debug	// CUBLAS core function
  		#ifdef CUDA_DEBUG
  		  CUBLAS_CORE_CHECK_NO_SUCCESS_MSG("CUBLAS error in cg_eo_nd(). CUBLAS function failed.");
  		#endif
    
    
    		// debug
    		#ifndef MPI
    		  printf("inner iteration j = %i: rr = %.6e\n", j, rr);
    		#else
    		  if (g_proc_id == 0) printf("inner iteration j = %i: rr = %.6e\n", j, rr);
    		#endif
    
    		// debug	// is NaN ?
    		if (isnan(rr)) {
    		  printf("Error in cg_eo_nd(). Inner residue is NaN.\n");
    		  exit(-1);
    		}
    
    
    // aborting ?? // check whether precision is reached ...
    if ( ((check_abs) && (rr <= eps_abs)) || ((check_rel) && (rr <= eps_rel*r0r0)) ) {
    
      #ifdef MPI
        if (g_cart_id == 0) {
      #endif
      
      		// debug
      		printf("Finished inner loop because of reached precision.\n");
      
      if ((check_rel)&&(rr <= eps_rel*r0r0)) {
      		// debug
      		printf("Reached relative inner solver precision of eps_rel = %.2e\n", eps_rel);
      }
      if ((check_abs)&&(rr <= eps_abs)) {
      		// debug
      		printf("Reached absolute inner solver precision of eps_abs = %.2e\n", eps_abs);
      }
      
      		//debug
      		printf("Final inner residue: %.6e\n", rr);
      
      #ifdef MPI
        }
      #endif
      
      // unbinding the gauge field texture and cublasShutdown() are not done here
      
      return(j+1);
    }
    
    
    // beta = r(k+1)*r(k+1) / r(k)*r(k)
    beta = rr / rr_old;
    
    
    rr_old = rr;  // for next iteration
    
    
    // d(k+1) = r(k+1) + beta*d(k)
    cublasScal (N_floats_int, beta, (RealT*)d_up, 1);
    cublasAxpy (N_floats_int, 1.0 , (RealT*)r_up, 1, (RealT*)d_up, 1);
    
    cublasScal (N_floats_int, beta, (RealT*)d_dn, 1);
    cublasAxpy (N_floats_int, 1.0 , (RealT*)r_dn, 1, (RealT*)d_dn, 1);
    
  		// debug	// CUBLAS core function
  		#ifdef CUDA_DEBUG
  		  CUBLAS_CORE_CHECK_NO_SUCCESS_MSG("CUBLAS error in cg_eo_nd(). Error in CUBLAS function.");
  		#endif
    
  
  }//LOOP
  
  
  		// debug
  		#ifndef MPI
  		  printf("Finished inner loop beacuse of maximal number of inner iterations.\n");
  		  printf("Final inner residue: %.6e\n", rr);
  		#else
  		  if (g_cart_id == 0) printf("Finished inner loop beacuse of maximal number of inner iterations.\n");
  		  if (g_cart_id == 0) printf("Final inner residue: %.6e\n", rr);
  		#endif
  
  // unbinding the gauge field texture and cublasShutdown() are not done here
  
  return(j+1);		// j == max_iter at this point
  
}//cg_eo_nd()






//////////////////
// OUTER SOLVER //
//////////////////

// iterative refinement, defect correction
// that function is to replace the call of cg_her_nd() in invert_doublet_eo.c
// solves the odd part of the full eo and nd problem
//	more precisely we have to invert Qhat(2x2)*Qhat(2x2)^dagger
//	multiplying by Qhat(2x2)^dagger is done in invert_doublet_eo.c

extern "C" int mixedsolve_eo_nd (spinor * P_up, spinor * P_dn,
                                 spinor * Q_up, spinor * Q_dn,
                                 int max_iter, double eps_sq, int rel_prec) {
  
  typedef REAL RealT;
  
  // basically  P_up/dn  and  Q_up/dn  could be used as auxiliary fields
  //	P_up/dn  is the output field (and can be used as initial guess)
  //	Q_up/dn  is not used later in the calling  invert_doublet_eo.c
  //		but will be used as feedback in r(k+1) = b - A*x(k+1)
  
  
  		// debug
  		#ifdef MPI
  		  if (g_proc_id == 0) {
  		#endif
  		
  		printf("\n\nmixedsolve_eo_nd():\n");
  		
  		printf("SOLVER PARAMETERS:\n");
  		
  		printf("outer:");
  		printf("\tmaximal iterations: %i\n", max_iter);
  		printf("\trelative check?: %i\n", bool(rel_prec));
  		printf("\tprecision: 
%.8e\n", eps_sq); + + printf("inner:"); + printf("\tmaximal iterations: %i\n", max_innersolver_it); + printf("\tabsolute check?: %i\n", bool(innersolver_precision_check_abs)); + printf("\trelative check?: %i\n", bool(innersolver_precision_check_rel)); + printf("\tabsolute precision: %.8e\n", innersolver_precision_abs); + printf("\trelative precision: %.8e\n", innersolver_precision_rel); + + #ifdef MPI + } + #endif + + + + + ///////////////////// + // LOCAL VARIABLES // + ///////////////////// + + // CUDA + cudaError_t cudaerr; + cublasStatus cublasstatus; + + // algorithm + double rr_up; + double rr_dn; + double rr; + double rr_old; + double r0r0; + double bb; + + // counting + int i = 0; // iteration counter + int innercount; // latest inner solver iterations + int outercount = 0; // total inner solver iterations + double flops; + #ifdef ALGORITHM_BENCHMARK + double effectiveflops; // will used to count the "effective" flop's (from the algorithmic perspective) + // double hoppingflops = 1488.0; + double hoppingflops = 1608.0; + double matrixflops = 2 * ( 2 * ( (2*hoppingflops+12+3) + (2*hoppingflops+3) + (12+2) + 12 ) ); + #ifdef MPI + double allflops; // flops added for all processes + #endif + #endif + + // timing + clock_t startouter, stopouter; + clock_t startinner, stopinner; + // double timeelapsed; + clock_t innerclocks; + clock_t totalinnerclocks = 0; + clock_t totalouterclocks = 0; + + #ifdef ALGORITHM_BENCHMARK + #ifndef MPI + clock_t starteffective; + clock_t stopeffective; + #else + double starteffective; + double stopeffective; + double singletime; // time for each process = stopeffective - starteffective + double maxtime; // max. 
parallel process time + #endif + #endif + + // (auxiliary) fields + spinor * r_up, * r_dn, + * Ad_up, * Ad_dn, + * x_up, * x_dn, + * d_up, * d_dn, + * Ax_up, * Ax_dn; + + spinor ** up_field = NULL; + spinor ** dn_field = NULL; + const int nr_sf = 5; + + init_solver_field(&up_field, VOLUMEPLUSRAND/2, nr_sf); + init_solver_field(&dn_field, VOLUMEPLUSRAND/2, nr_sf); + + // formal parameters + /* + size_t dev_spinsize_int = 6*VOLUME/2*sizeof(dev_spinor); // 24 floats per spinor per even lattice site + int N_sites_int = VOLUME/2; // Carsten's functions get the number of lattice points as input + int N_floats_int = 24*VOLUME/2; + #ifdef MPI + size_t dev_spinsize_ext = 6*(VOLUME+RAND)/2*sizeof(dev_spinor); + int N_sites_ext = (VOLUME+RAND)/2; + int N_floats_ext = 24*(VOLUME+RAND)/2; + #endif + */ + + // algorithm control parameters + bool rbAx = true; // choose how to calculate r(k+1) + bool initial_guess = false; // choose if initial guess + + + + + ////////////////// + // INITIALIZING // + ////////////////// + + + //debug + #ifndef MPI + printf("init_mixedsolve_eo_nd():\n"); + #else + if (g_cart_id == 0) printf("init_mixedsolve_eo_nd_mpi():\n"); + #endif + + + init_mixedsolve_eo_nd(g_gauge_field); // initializes and allocates all quantities for the mixed solver + // more precise: + // puts the gauge field on device as "2 rows" or "8 floats" per SU(3)-matrix + // allocates memory for all spinor fields + // puts the nn- and eoidx-fields on device memory + + //debug + #ifndef MPI + printf("mixedsolve_eo_nd():\n"); + #else + if (g_cart_id == 0) printf("mixedsolve_eo_nd_mpi():\n"); + #endif + + + // the following initializations are moved from cg_eo_nd(): + + // Initialize some stuff + dev_complex h0, h1, h2, h3, mh0, mh1, mh2, mh3; + + h0.re = (float) ka0.re; h0.im = -(float) ka0.im; // ka{0-4} are defined in boundary.c + h1.re = (float) ka1.re; h1.im = -(float) ka1.im; // what is the meaning? 
+ h2.re = (float) ka2.re; h2.im = -(float) ka2.im; + h3.re = (float) ka3.re; h3.im = -(float) ka3.im; + + mh0.re = -(float) ka0.re; mh0.im = (float) ka0.im; + mh1.re = -(float) ka1.re; mh1.im = (float) ka1.im; + mh2.re = -(float) ka2.re; mh2.im = (float) ka2.im; + mh3.re = -(float) ka3.re; mh3.im = (float) ka3.im; + /* + // try using constant mem for kappas // constant memory is cached! + cudaMemcpyToSymbol("dev_k0c", &h0, sizeof(dev_complex)); + cudaMemcpyToSymbol("dev_k1c", &h1, sizeof(dev_complex)); + cudaMemcpyToSymbol("dev_k2c", &h2, sizeof(dev_complex)); + cudaMemcpyToSymbol("dev_k3c", &h3, sizeof(dev_complex)); + + cudaMemcpyToSymbol("dev_mk0c", &mh0, sizeof(dev_complex)); + cudaMemcpyToSymbol("dev_mk1c", &mh1, sizeof(dev_complex)); + cudaMemcpyToSymbol("dev_mk2c", &mh2, sizeof(dev_complex)); + cudaMemcpyToSymbol("dev_mk3c", &mh3, sizeof(dev_complex)); + */ + + + // bind texture gf + #ifdef USETEXTURE // needed for subfunctions of dev_Hopping_Matrix(...) + bind_texture_gf(MixedsolveParameter::getGlobalP()->dev_gf); // e.g. dev_reconstructgf_2vtexref(...), dev_reconstructgf_8texref(...), + #endif // in general in functions dev_reconstructgf[...] with "tex1Dfetch(gf_tex[...]" + + // debug // CUDA + #ifdef CUDA_DEBUG + CUDA_CHECK("CUDA error in bind_texture_gf(). Binding GF to texture failed.", "GF bound to texture."); + #endif + + + he_cg_init<<< 1, 1 >>> (dev_grid, (float) g_kappa, (float)(g_mu/(2.0*g_kappa)), h0, h1, h2, h3); + // "he" = "host entry" + // BEWARE in dev_tm_dirac_kappa we need the true mu (not 2 kappa mu!) // ?? + + // dev_LX, dev_LY, dev_LZ, dev_T, dev_VOLUME = grid[5] = dev_grid[5] + // dev_VOLUME is necessary for many kernel functions as for instance dev_gamma5() + // initializes mu, kappa and twokappamu on the device + // initializes the strange dev_k{0-3}, dev_mk{0-3} as derived from the ka{0-3} from boundary.c + + // debug // kernel + #ifdef CUDA_DEBUG + CUDA_KERNEL_CHECK("Kernel error in he_cg_init(). 
Couldn't initialize some stuff.", "he_cg_init() succeeded."); + #endif + + // debug // check stuff on device + #ifdef STUFF_DEBUG + + #ifdef MPI + if (g_proc_id == 0) { + #endif + + #ifdef MPI + printf("\tOn host:\n"); + printf("\tVOLUME = %i\n", VOLUME); // checking VOLUME and RAND in the parallel case + printf("\tRAND = %i\n", RAND); + printf("\tVOLUME + RAND = %i\n", VOLUME+RAND); + #endif + + int host_check_LX, host_check_LY, host_check_LZ, host_check_T, host_check_VOLUME; + cudaMemcpyFromSymbol(&host_check_LX, dev_LX, sizeof(int)); + cudaMemcpyFromSymbol(&host_check_LY, dev_LY, sizeof(int)); + cudaMemcpyFromSymbol(&host_check_LZ, dev_LZ, sizeof(int)); + cudaMemcpyFromSymbol(&host_check_T, dev_T, sizeof(int)); + cudaMemcpyFromSymbol(&host_check_VOLUME, dev_VOLUME, sizeof(int)); + // printf("\teven_odd_flag = %i\n", even_odd_flag); + printf("\tOn device:\n"); + printf("\tdev_LX = %i\n", host_check_LX); + printf("\tdev_LY = %i\n", host_check_LY); + printf("\tdev_LZ = %i\n", host_check_LZ); + printf("\tdev_T = %i\n", host_check_T); + printf("\tdev_VOLUME = %i/2 ?!= %i\n", host_check_LX*host_check_LY*host_check_LZ*host_check_T, host_check_VOLUME); + + float host_check_mu, host_check_kappa, host_check_twokappamu; + cudaMemcpyFromSymbol(&host_check_mu, mu, sizeof(float)); + cudaMemcpyFromSymbol(&host_check_kappa, kappa, sizeof(float)); + cudaMemcpyFromSymbol(&host_check_twokappamu, twokappamu, sizeof(float)); + // printf("\tOn device:\n"); + // printf("\tmu = %f\n", host_check_mu); // not needed for the nd case + printf("\tkappa = %f\n", host_check_kappa); + // printf("\ttwokappamu = %f\n", host_check_twokappamu); + + #ifdef MPI + } + #endif + + #endif + + + he_cg_init_nd_additional<<<1,1>>> (g_mubar, g_epsbar); + + // debug // kernel + #ifdef CUDA_DEBUG + CUDA_KERNEL_CHECK("Kernel error in he_cg_init_nd_additional(). 
Couldn't initialize some stuff.", "he_cg_init_nd_additional() succeeded."); + #endif + + // debug // check mubar and epsbar on host and device + #ifdef STUFF_DEBUG + + #ifdef MPI + if (g_proc_id == 0) { + #endif + + // printf("\tOn host:\n"); + // printf("\tg_mubar = %f\n", g_mubar); + // printf("\tg_epsbar = %f\n", g_epsbar); + + float host_check_mubar, host_check_epsbar; + cudaMemcpyFromSymbol(&host_check_mubar, mubar, sizeof(float)); + cudaMemcpyFromSymbol(&host_check_epsbar, epsbar, sizeof(float)); + printf("\tOn device:\n"); + printf("\tmubar = %f\n", host_check_mubar); + printf("\tepsbar = %f\n", host_check_epsbar); + + #ifdef MPI + } + #endif + + #endif + + + #ifdef MPI + + he_cg_init_nd_additional_mpi<<<1,1>>>(VOLUMEPLUSRAND, RAND, g_cart_id, g_nproc); + + // debug // kernel + #ifdef CUDA_DEBUG + CUDA_KERNEL_CHECK("Kernel error in he_cg_init_nd_additional_mpi(). Couldn't initialize some stuff.", "he_cg_init_nd_additional_mpi() succeeded."); + #endif + + // debug + #ifdef STUFF_DEBUG + + // debug // check dev_VOLUMEPLUSRAND and dev_RAND on device + #ifdef STUFF_DEBUG + if (g_proc_id == 0) { + int host_check_VOLUMEPLUSRAND, host_check_RAND; + cudaMemcpyFromSymbol(&host_check_VOLUMEPLUSRAND, dev_VOLUMEPLUSRAND, sizeof(int)); + cudaMemcpyFromSymbol(&host_check_RAND, dev_RAND, sizeof(int)); + printf("\tOn device:\n"); + printf("\tdev_VOLUMEPLUSRAND = %i\n", host_check_VOLUMEPLUSRAND); + printf("\tdev_RAND = %i\n", host_check_RAND); + } + #endif + + #endif + + #endif + + + + + /* // necessary ?? + // cublasInit(); + + // debug // CUBLAS helper function + #ifdef CUDA_DEBUG + CUBLAS_HELPER_CHECK(cublasInit(), "CUBLAS error in cublasInit(). 
Couldn't initialize CUBLAS.", "CUBLAS initialized."); + #else + cublasInit(); + #endif + */ + + + + #ifdef OPERATOR_BENCHMARK + benchmark_eo_nd(Q_up, Q_dn, OPERATOR_BENCHMARK); + #endif + + + + + ///////////////// + // ASSIGNMENTS // + ///////////////// + + + x_up = P_up; // can use the output spinors also as auxiliary fields + x_dn = P_dn; // can use as initial guess at the same time + + + #ifndef CG_DEBUG + + r_up = up_field[0]; // use the pre-allocated memory on host memory + r_dn = dn_field[0]; // allocated by init_chi_spinor_field.c and invert_doublet.c !? + d_up = up_field[1]; // the fields g_chi_up/dn_spinor_field[DUM_SOLVER{ , +1, ... , +5}] are used in cg_her_nd() + d_dn = dn_field[1]; + Ad_up = up_field[2]; + Ad_dn = dn_field[2]; + Ax_up = Ad_up; + Ax_dn = Ad_dn; + + // debug + #ifndef MPI + printf("Now using the fields g_chi_up/dn_spinor_field[DUM_SOLVER{ , +1, +2}] in the mixedsolve_eo_nd().\n"); + #else + if (g_cart_id == 0) printf("Now using the fields g_chi_up/dn_spinor_field[DUM_SOLVER{ , +1, +2}] in the mixedsolve_eo_nd().\n"); + #endif + + #else + + r_up = (spinor *) malloc(24*N_sites_int*sizeof(double)); // if using cg_her_nd() as the CG, we cannot use the g_chi_up/dn-fields at the same time + r_dn = (spinor *) malloc(24*N_sites_int*sizeof(double)); + d_up = (spinor *) malloc(24*N_sites_int*sizeof(double)); + d_dn = (spinor *) malloc(24*N_sites_int*sizeof(double)); + Ad_up = (spinor *) malloc(24*N_sites_int*sizeof(double)); + Ad_dn = (spinor *) malloc(24*N_sites_int*sizeof(double)); + Ax_up = Ad_up; + Ax_dn = Ad_dn; + // debug + #ifndef MPI + printf("Now allocating new host space for the fields in mixedsolve_eo_nd().\n"); + #else + if (g_cart_id == 0) printf("Now allocating new host space for the fields in mixedsolve_eo_nd().\n"); + #endif + + #endif + + + + + /////////////// + // ALGORITHM // + /////////////// + + // timer + startouter = clock(); + + #ifdef ALGORITHM_BENCHMARK + #ifndef MPI + starteffective = ((double)clock()) / 
((double)(CLOCKS_PER_SEC)); + #else + starteffective = MPI_Wtime(); + #endif + #endif + + + // r(0) + if (!initial_guess) { // r(0) = b = Q // for x(0) = 0 + assign(r_up, Q_up, N_sites_int); + assign(r_dn, Q_dn, N_sites_int); + #ifndef MPI + printf("x(0) = 0\n"); + #else + if (g_cart_id == 0) printf("x(0) = 0\n"); + #endif + } + else { // r(0) = b - A*x(0) = Q - A*P + bb = square_norm(P_up, N_sites_int, 1) + square_norm(P_dn, N_sites_int, 1); + #ifndef MPI + printf("bb = %.10e\n", bb); + #else + if (g_cart_id == 0) printf("bb = %.10e\n", bb); + #endif + if (bb == 0) { + assign(r_up, Q_up, N_sites_int); + assign(r_dn, Q_dn, N_sites_int); + #ifndef MPI + printf("x(0) = 0\n"); + #else + if (g_cart_id == 0) printf("x(0) = 0\n"); + #endif + } + else { + Q_Qdagger_ND(Ax_up, Ax_dn, P_up, P_dn); + diff(r_up, Q_up, Ax_up, N_sites_int); + diff(r_dn, Q_dn, Ax_dn, N_sites_int); + #ifndef MPI + printf("x(0) != 0\n"); + #else + if (g_cart_id == 0) printf("x(0) != 0\n"); + #endif + } + } + + + // rr = (r_up)^2 + (r_dn)^2 + rr_up = square_norm(r_up, N_sites_int, 1); + rr_dn = square_norm(r_dn, N_sites_int, 1); + rr = rr_up + rr_dn; + + + r0r0 = rr; // for relative precision + rr_old = rr; // for the first iteration + + // debug + #ifndef MPI + printf("Initial outer residue: %.10e\n", rr_old); + #else + if (g_cart_id == 0) printf("Initial outer residue: %.10e\n", rr_old); + #endif + + + // set to zero // x_up, x_dn will be added up // as x_up/dn = P_up/dn up to here P_up/dn was not changed + zero_spinor_field(x_up, N_sites_int); + zero_spinor_field(x_dn, N_sites_int); + + + + + //////////////// + // OUTER LOOP // + //////////////// + + // debug + #ifndef MPI + printf("\nEntering outer loop."); + #else + if (g_cart_id == 0) printf("\nEntering outer loop."); + #endif + + + do { // for (i = 0; i < max_iter; i++) { + + i++; + + // debug + #ifndef MPI + printf("\nouter iteration i = %i\n", i); + #else + if (g_cart_id == 0) printf("\nouter iteration i = %i\n", i); + #endif + + + + + 
#ifndef CG_DEBUG + + // host/device interaction + to_device(dev_spinin_up, r_up, h2d_spin_up, dev_spinsize_int); // notice: for MPI communicateion the boundary exchange takes place when the hopping matrix is applied + to_device(dev_spinin_dn, r_dn, h2d_spin_dn, dev_spinsize_int); + + // debug // CUDA + #ifdef CUDA_DEBUG + // CUDA_CHECK("CUDA error in mixedsolve_eo_nd(). Host to device interaction failed.", "Fields copied to device."); + CUDA_CHECK_NO_SUCCESS_MSG("CUDA error in mixedsolve_eo_nd(). Host to device interaction failed."); + #endif + + + + + //////////////////////////////////// + // INNER LOOP, CONJUGATE GRADIENT // + //////////////////////////////////// + + // timer + startinner = clock(); + + // debug + #ifndef MPI + printf("cg_eo_nd():\n"); + #else + if (g_cart_id == 0) printf("cg_eo_nd():\n"); + #endif + + + // solves A*p(k+1) = r(k) + // A*p(0) = r(0) = b + innercount = cg_eo_nd(MixedsolveParameter::getGlobalP()->dev_gf, + dev_spinout_up, dev_spinout_dn, + dev_spinin_up , dev_spinin_dn, + max_innersolver_it, + innersolver_precision_check_abs, innersolver_precision_check_rel, + innersolver_precision_abs , innersolver_precision_rel ); + + outercount = outercount + innercount; + + // timer + stopinner = clock(); + innerclocks = stopinner-startinner; + totalinnerclocks = totalinnerclocks + innerclocks; + + // debug + #ifndef MPI + printf("Inner solver done in: %.4e sec\n", double(innerclocks) / double(CLOCKS_PER_SEC)); + #else + if (g_cart_id == 0) printf("Inner solver done in: %.4e sec\n", double(innerclocks) / double(CLOCKS_PER_SEC)); + #endif + + + // host/device interaction + to_host(d_up, dev_spinout_up, h2d_spin_up, dev_spinsize_int); + to_host(d_dn, dev_spinout_dn, h2d_spin_dn, dev_spinsize_int); + + // debug // CUDA + #ifdef CUDA_DEBUG + // CUDA_CHECK("CUDA error in mixedsolve_eo_nd(). Device to host interaction failed.", "Fields copied back to device."); + CUDA_CHECK_NO_SUCCESS_MSG("CUDA error in mixedsolve_eo_nd(). 
Device to host interaction failed."); + #endif + + + #else + + + // debug + #ifndef MPI + printf("cg_her_nd():\n"); + #else + if (g_cart_id == 0) printf("cg_her_nd():\n"); + #endif + + innercount = cg_her_nd(d_up, d_dn, r_up, r_dn, // MISTAKE, was: r_up, r_dn, d_up, d_dn, + 1000, eps_sq/2, 0, + VOLUME/2, &Q_Qdagger_ND, 0, 1000); + + outercount = outercount + innercount; + + // debug + #ifndef MPI + printf("cg_her_nd() on host was used for debugging purposes.\n"); + #else + if (g_cart_id == 0) printf("cg_her_nd() on host was used for debugging purposes.\n"); + #endif + + + #endif + + + // debug + #ifndef MPI + printf("mixedsolve_eo_nd():\n"); + #else + if (g_cart_id == 0) printf("mixedsolve_eo_nd():\n"); + #endif + + + // x(k+1) = x(k) + d(k+1) + add(x_up, x_up, d_up, N_sites_int); + add(x_dn, x_dn, d_dn, N_sites_int); + + + + + // r(k+1) + if (rbAx) { // r(k+1) = b - A*x(k+1) + // A*x(k+1) + Q_Qdagger_ND(Ax_up, Ax_dn, x_up, x_dn); + // debug + #ifndef MPI + printf("The matrix was applied on CPU in double precision. r = b - Ax\n"); + #else + if (g_cart_id == 0) printf("The matrix was applied on CPU in double precision. r = b - Ax\n"); + #endif + diff(r_up, Q_up, Ax_up, N_sites_int); + diff(r_dn, Q_dn, Ax_dn, N_sites_int); + } + else { // r(k+1) = r(k) - A*d(k+1) // makes actually no sense ;) + // A*d(k+1) + Q_Qdagger_ND(Ad_up, Ad_dn, d_up, d_dn); + // debug + #ifndef MPI + printf("The matrix was applied on CPU in double precision. r = r - Ad\n"); + #else + if (g_cart_id == 0) printf("The matrix was applied on CPU in double precision. 
r = r - Ad\n"); + #endif + // r(k+1) = r(k) - A*d(k+1) + diff(r_up, r_up, Ad_up, N_sites_int); + diff(r_dn, r_dn, Ad_dn, N_sites_int); + } + + + + + // rr = (rr_up)^2 + (r_dn)^2 + rr_up = square_norm(r_up, N_sites_int, 1); + rr_dn = square_norm(r_dn, N_sites_int, 1); + rr = rr_up + rr_dn; + + // debug + #ifndef MPI + printf("Outer residue in the outer iteration i = %i after %i total inner iterations : %.10e\n", i, outercount, rr); + #else + if (g_cart_id == 0) printf("Outer residue in the outer iteration i = %i after %i total inner iterations : %.10e\n", i, outercount, rr); + #endif + + // debug // is NaN ? + if isnan(rr) { + printf("Error in mixedsolve_eo_nd(). Outer residue is NaN.\n"); + exit(-1); + } + + + + + // aborting ?? // check wether precision is reached ... + if ( ((rr <= eps_sq) && (rel_prec == 0)) || ((rr <= eps_sq*r0r0) && (rel_prec == 1)) ) { + + // timer + stopouter = clock(); + totalouterclocks = stopouter-startouter - totalinnerclocks; + + #ifdef ALGORITHM_BENCHMARK + #ifndef MPI + stopeffective = ((double)clock()) / ((double)(CLOCKS_PER_SEC)); + #else + stopeffective = MPI_Wtime(); + #endif + #endif + + + // debug + #ifdef MPI + if (g_cart_id == 0) { + #endif + printf("\nEO inversion done in mixed precision.\n"); + if (rel_prec == 0) printf("Finished outer loop because of reached absolute outer solver precision.\n"); + if (rel_prec == 1) printf("Finished outer loop because of reached relative outer solver precision.\n"); + printf("Total number of inner iterations: %i\n", outercount); + printf("Total number of outer iterations: %i\n", i+1); + printf("Squared residue: %.10e\n", rr); + printf("Outer solver done in: %.4e sec\n", double(stopouter-startouter) / double(CLOCKS_PER_SEC)); + #ifdef MPI + } + #endif + + // benchmark + #ifdef ALGORITHM_BENCHMARK + // will now count the number of effective flops + // effectiveflops = #(inner iterations)*(matrixflops+linalgflops)*VOLUME/2 + #(outer iterations)*(matrixflops+linalgflops)*VOLUME/2 + // outer 
loop: linalg = flops for calculating r(k+1) and x(k+1) + // inner loop: linalg = flops for calculating alpha, x(k+1), r(k+1), beta, d(k+1) + #ifndef MPI + effectiveflops = outercount*(matrixflops + 2*2*2*24 + 2*2*24 + 2*2*24 + 2*2*2*24 + 2*2*24)*VOLUME/2 + i*(matrixflops + 2*24 + 2*24)*VOLUME/2; + printf("effective BENCHMARK:\n"); + printf("\ttotal mixed solver time: %.4e sec\n", double(stopeffective-starteffective)); + printf("\tfloating point operations: %.4e flops\n", effectiveflops); + printf("\tinner solver performance: %.4e Gflop/s\n", double(effectiveflops) / double(stopeffective-starteffective) / 1.0e9); + #else + singletime = double(stopeffective-starteffective); + effectiveflops = outercount*(matrixflops + 2*2*2*24 + 2*2*24 + 2*2*24 + 2*2*2*24 + 2*2*24)*VOLUME/2 + i*(matrixflops + 2*24 + 2*24)*VOLUME/2; + MPI_Allreduce(&singletime, &maxtime, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + MPI_Allreduce(&effectiveflops, &allflops, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + if (g_proc_id == 0) printf("effective BENCHMARK:\n"); + if (g_proc_id == 0) printf("\ttotal mixed solver time: %.4e sec\n", double(maxtime)); + if (g_proc_id == 0) printf("\tfloating point operations: %.4e flops\n", double(allflops)); + if (g_proc_id == 0) printf("\tinner solver performance: %.4e Gflop/s\n", double(allflops) / double(maxtime) / 1.0e9); + /* + printf("this is for checking:\n"); + printf("\ttotal mixed solver time: %.2e sec\n", double(stopeffective-starteffective)); + printf("\tfloating point operations: %.2e flops\n", effectiveflops); + printf("\tinner solver performance: %.2e Gflop/s\n", double(effectiveflops) / double(stopeffective-starteffective) / 1.0e9); + */ + #endif + #endif + + + #ifdef USETEXTURE + unbind_texture_gf(); + #endif + + // debug // CUDA + #ifdef CUDA_DEBUG + CUDA_CHECK("CUDA error in unbind_texture(). 
Unbindung the GF texture failed.", "GF texture unbound."); + #endif + + /* + // cublasShutdown(); + + // debug // CUBLAS helper function + #ifdef CUDA_DEBUG + CUBLAS_HELPER_CHECK(cublasShutdown(), "CUBLAS error in cublasShutdown(). Couldn't shut down CUBLAS.", "CUBLAS is shutted down."); + #else + cublasShutdown(); + #endif + */ + + // debug + #ifndef MPI + printf("finalize_mixedsolve_eo_nd():\n"); + #else + if (g_cart_id == 0) printf("finalize_mixedsolve_eo_nd():\n"); + #endif + + finalize_mixedsolve_eo_nd(); + + // debug + #ifndef MPI + printf("\n"); + #else + if (g_cart_id == 0) printf("\n"); + #endif + finalize_solver(up_field, nr_sf); + finalize_solver(dn_field, nr_sf); + return(outercount); + + } + + + + + }//OUTER LOOP + while (outercount <= max_iter); + + + // multiplying with Qhat(2x2)^dagger is done in invert_doublet_eo.c + + + // timer + stopouter = clock(); + totalouterclocks = stopouter-startouter - totalinnerclocks; + + #ifdef ALGORITHM_BENCHMARK + #ifndef MPI + stopeffective = ((double)clock()) / ((double)(CLOCKS_PER_SEC)); + #else + stopeffective = MPI_Wtime(); + #endif + #endif + + + // debug + #ifdef MPI + if (g_cart_id == 0) { + #endif + printf("\nEO inversion done in mixed precision.\n"); + printf("Finished outer loop, because of maximal number of outer iterations.\n"); + printf("Total number of inner iterations: %i\n", outercount); + printf("Total number of outer iterations: %i\n", i+1); + printf("Squared residue: %.10e\n", rr); + printf("Outer solver done in: %.4e sec\n", double(stopouter-startouter)/CLOCKS_PER_SEC); + #ifdef MPI + } + #endif + + // benchmark + #ifdef ALGORITHM_BENCHMARK + // will now count the number of effective flops + // effectiveflops = #(inner iterations)*(matrixflops+linalgflops)*VOLUME/2 + #(outer iterations)*(matrixflops+linalgflops)*VOLUME/2 + // outer loop: linalg = flops for calculating r(k+1) and x(k+1) + // inner loop: linalg = flops for calculating alpha, x(k+1), r(k+1), beta, d(k+1) + #ifndef MPI + 
effectiveflops = outercount*(matrixflops + 2*2*2*24 + 2*2*24 + 2*2*24 + 2*2*2*24 + 2*2*24)*VOLUME/2 + i*(matrixflops + 2*24 + 2*24)*VOLUME/2; + printf("effective BENCHMARK:\n"); + printf("\ttotal mixed solver time: %.4e sec\n", double(stopeffective-starteffective)); + printf("\tfloating point operations: %.4e flops\n", effectiveflops); + printf("\tinner solver performance: %.4e Gflop/s\n", double(effectiveflops) / double(stopeffective-starteffective) / 1.0e9); + #else + singletime = double(stopeffective-starteffective); + effectiveflops = outercount*(matrixflops + 2*2*2*24 + 2*2*24 + 2*2*24 + 2*2*2*24 + 2*2*24)*VOLUME/2 + i*(matrixflops + 2*24 + 2*24)*VOLUME/2; + MPI_Allreduce(&singletime, &maxtime, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD); + MPI_Allreduce(&effectiveflops, &allflops, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + if (g_proc_id == 0) printf("effective BENCHMARK:\n"); + if (g_proc_id == 0) printf("\ttotal mixed solver time: %.4e sec\n", double(maxtime)); + if (g_proc_id == 0) printf("\tfloating point operations: %.4e flops\n", double(allflops)); + if (g_proc_id == 0) printf("\tinner solver performance: %.4e Gflop/s\n", double(allflops) / double(maxtime) / 1.0e9); + /* + printf("this is for checking:\n"); + printf("\ttotal mixed solver time: %.2e sec\n", double(stopeffective-starteffective)); + printf("\tfloating point operations: %.2e flops\n", effectiveflops); + printf("\tinner solver performance: %.2e Gflop/s\n", double(effectiveflops) / double(stopeffective-starteffective) / 1.0e9); + */ + #endif + #endif + + + #ifdef USETEXTURE + unbind_texture_gf(); + #endif + + // debug // CUDA + #ifdef CUDA_DEBUG + CUDA_CHECK("CUDA error in unbind_texture(). Unbindung the GF texture failed.", "GF texture unbound."); + #endif + + /* + // cublasShutdown(); + + // debug // CUBLAS helper function + #ifdef CUDA_DEBUG + CUBLAS_HELPER_CHECK(cublasShutdown(), "CUBLAS error in cublasShutdown(). 
Couldn't shut down CUBLAS.", "CUBLAS is shutted down."); + #else + cublasShutdown(); + #endif + */ + + // debug + #ifndef MPI + printf("finalize_mixedsolve_eo_nd():\n"); + #else + if (g_cart_id == 0) printf("finalize_mixedsolve_eo_nd():\n"); + #endif + + finalize_mixedsolve_eo_nd(); + + // debug + #ifndef MPI + printf("\n"); + #else + if (g_cart_id == 0) printf("\n"); + #endif + + finalize_solver(up_field, nr_sf); + finalize_solver(dn_field, nr_sf); + + return(outercount); + + +}//mixedsolve_eo_nd() + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/mixedsolveOperator.cuh b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/mixedsolveOperator.cuh new file mode 100644 index 0000000000000000000000000000000000000000..18cd1100dd0c3759d261128d9c8581a776cb72cd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/mixedsolveOperator.cuh @@ -0,0 +1,290 @@ +/*********************************************************************** + * + * Copyright (C) + * original code from Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + * + * File: mixedsolveOperator.cuh + * + * operators for (non-EO) mixed_solver in mixedsolve.cu, + * derived from template interface class templateclass mixedsolveOperator; + * in mixedsolve.cu + * + * + **************************************************************************/ + + + +templateclass MixedsolveOperatorDirac:public MixedsolveOperator +{ +protected: + templatefriend class MixedsolveOperatorDiracDaggerDirac; + dim3 tm_dirac_kappaBlockdim; + dim3 tm_dirac_kappaGriddim; + +public: + int rescalekappa; + + MixedsolveOperatorDirac(int rescalekappaD=0) + : tm_dirac_kappaBlockdim(BLOCK,1,1),tm_dirac_kappaGriddim(( VOLUME>=BLOCK ? int(VOLUME/BLOCK)+1 : 1 ),1,1), // this is the partitioning for the Dirac-Kernel + rescalekappa(rescalekappaD) + { } + + + virtual void gpuInit(dev_spinorM(RealT)* spinin,dev_spinorM(RealT)* spinTmp,dev_spinorM(RealT)* spinout,dev_su3_2vM(RealT)* gf,int* dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim) + { + #ifdef USETEXTURE + //Bind texture gf + bind_texture_gf(gf); + //Bind texture spinor to spin4 (D_tm is always applied to spin4) + bind_texture_spin(spinTmp,1); + #endif + } + + virtual void gpu(dev_spinorM(RealT)* spinin,dev_spinorM(RealT)* spinTmp,dev_spinorM(RealT)* spinout,dev_su3_2vM(RealT)* gf,int* dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim) + { + // D Ddagger -- Ddagger = gamma5 D gamma5 for Wilson Dirac Operator + // mu -> -mu for twisted term + // DO NOT USE tm_dirac_dagger_kappa here, otherwise spin2 will be overwritten!!! 
+ + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + // GAMMA5, mu -> -mu + dev_gamma5 <<>> (spinin,spinTmp); + dev_swapmu <<<1,1>>> (); + #ifdef USETEXTURE + bind_texture_spin(spinTmp,1); + #endif + //D_tm + dev_tm_dirac_kappa <<>> (gf, spinTmp, spinout, dev_nn); + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + //GAMMA5 mu -> -mu + dev_gamma5 <<>>(spinout,spinTmp); + dev_swapmu <<<1,1>>> (); + #ifdef USETEXTURE + bind_texture_spin(spinTmp,1); + #endif + //D_tm + dev_tm_dirac_kappa <<>> (gf, spinTmp, spinout, dev_nn); + } + + virtual void gpuDeinit(dev_spinorM(RealT)* spininout,dev_spinorM(RealT)* spinTmp,dev_su3_2vM(RealT)* gf,int* dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim,const RealT scaleparam) + {//we have to invert D^+D instead of D because first operator is hermitian , which is a requirement for conjugated gradient algorithm + if(rescalekappa == 1) + { //want D^-1 rescaled by 2*kappa + /// maybe move this block into mixedsolveFunction::gpuDeinit(...) ? 
- "rescalekappa" can be a public member, which has to set before the dev_cg call + + //multiply with D^dagger + + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + dev_gamma5 <<>>(spininout,spinTmp); + dev_swapmu <<<1,1>>> (); + #ifdef USETEXTURE + bind_texture_spin(spinTmp,1); + #endif + dev_tm_dirac_kappa <<>> (gf, spinTmp, spininout, dev_nn); + dev_gamma5 <<>>(spininout,spinTmp); + dev_swapmu <<<1,1>>> (); + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + + + //go over to non-kappa, Ddagger = g5 D g5 + dev_skalarmult_spinor_field <<>>(spinTmp,RealT(1.0/(scaleparam*scaleparam)), spininout); + //dev_tm_dirac_kappa<<>>(gf, spin3, spinout, nn_grid); + } + } + + + virtual void check(spinor* const conjungateBasisPSpininTmp,spinor* const spinout,const int volume) + { + printf("Applying double precision Dirac-Op...\n"); + + Q_pm_psi_gpu(spinout, conjungateBasisPSpininTmp); + //diff(residueRSpininout, residueRSpininout, spinTmp ,volume); + } + + virtual void checkDeinit(spinor* const spinin,spinor* const spinTmp,spinor* const spinout,int volume) + {//=^ multiplication D^+ of inverted (D^+D)^-1 => D^-1 + Q_minus_psi_gpu(spinTmp, spinin); + assign(spinout, spinTmp, volume); + } +}; + + +/*templateclass MixedsolveOperatorDiracDaggerDirac:public MixedsolveOperator +{ +protected: + MixedsolveOperatorDirac operatorDirac; + +public: + + MixedsolveOperatorDiracDaggerDirac() + : operatorDirac(0) + { } + + + virtual void gpuInit(dev_spinorM(RealT)* spinin,dev_spinorM(RealT)* spinTmp,dev_spinorM(RealT)* spinout,dev_su3_2vM(RealT)* gf,int* dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim) + { operatorDirac.gpuInit(spinin,spinTmp,spinout,gf,dev_nn,linAlgGriddim,linAlgBlockdim); } + + + virtual void gpu(dev_spinorM(RealT)* spinin,dev_spinorM(RealT)* spinTmp,dev_spinorM(RealT)* spinout,dev_su3_2vM(RealT)* gf,int* dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim) + { + // D Ddagger -- Ddagger = gamma5 D gamma5 for Wilson Dirac Operator + // mu -> -mu 
for twisted term + // DO NOT USE tm_dirac_dagger_kappa here, otherwise spin2 will be overwritten!!! + + #ifdef USETEXTURE + unbind_texture_spin(1);//because it is bind to spin2==spinTmp + #endif + #ifdef USETEXTURE + bind_texture_spin(spinin,1); + #endif + //D_tm + dev_tm_dirac_kappa <<>> (gf, spinin, spinTmp, dev_nn); + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + // GAMMA5, mu -> -mu + dev_gamma5 <<>> (spinTmp,spinout); + dev_swapmu <<<1,1>>> (); + #ifdef USETEXTURE + bind_texture_spin(spinout,1); + #endif + //D_tm + dev_tm_dirac_kappa <<>> (gf, spinout, spinTmp, dev_nn); + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + //GAMMA5 mu -> -mu + dev_gamma5 <<>>(spinTmp,spinout); + dev_swapmu <<<1,1>>> (); + #ifdef USETEXTURE + bind_texture_spin(spinin,1); + #endif + + //dev_skalarmult_add_assign_spinor_field<<>>(spinout,mStarSquare,spinin,spinout); + } + + + virtual void check(spinor* const conjungateBasisPSpininTmp,spinor* const spinout,const int volume) + { + printf("Applying double precision Dirac-Op...\n"); + Q_pm_psi(spinout, conjungateBasisPSpininTmp);//D^+D statt DD^+ wie ~OperatorDirac + + //assign_add_mul_r(spinout, conjungateBasisPSpininTmp, mStarSquare, volume); + //operatorDirac.check(conjungateBasisPSpininTmp,spinout,volume); + } + + virtual void checkDeinit(spinor* const spinin,spinor* const spinTmp,spinor* const spinout,int volume) + { + assign(spinout, spinin, volume); + } +};*/ + + +templateclass MixedsolveOperatorDiracDaggerDirac:public MixedsolveOperator +{ +protected: + MixedsolveOperatorDirac operatorDirac; + +public: + + MixedsolveOperatorDiracDaggerDirac() + : operatorDirac(0) + { } + + + #ifdef USETEXTURE + virtual void gpuInit(dev_spinorM(RealT)* spinin,dev_spinorM(RealT)* spinTmp,dev_spinorM(RealT)* spinout,dev_su3_2vM(RealT)* gf,int* dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim) + { bind_texture_gf(gf); }//Bind texture gf + #endif + + virtual void gpu(dev_spinorM(RealT)* spinin,dev_spinorM(RealT)* 
spinTmp,dev_spinorM(RealT)* spinout,dev_su3_2vM(RealT)* gf,int* dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim) + { + // D Ddagger -- Ddagger = gamma5 D gamma5 for Wilson Dirac Operator + // mu -> -mu for twisted term + // DO NOT USE tm_dirac_dagger_kappa here, otherwise spin2 will be overwritten!!! + + #ifdef USETEXTURE + bind_texture_spin(spinin,1);//correct? + #endif + //D_tm + dev_tm_dirac_kappa <<>> (gf, spinin, spinTmp, dev_nn); + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + // GAMMA5, mu -> -mu + dev_gamma5 <<>> (spinTmp,spinout); + dev_swapmu <<<1,1>>> (); + #ifdef USETEXTURE + bind_texture_spin(spinout,1); + #endif + //D_tm + dev_tm_dirac_kappa <<>> (gf, spinout, spinTmp, dev_nn); + #ifdef USETEXTURE + unbind_texture_spin(1); + #endif + //GAMMA5 mu -> -mu + dev_gamma5 <<>>(spinTmp,spinout); + dev_swapmu <<<1,1>>> (); + + //dev_skalarmult_add_assign_spinor_field<<>>(spinout,mStarSquare,spinin,spinout); + } + + + virtual void check(spinor* const conjungateBasisPSpininTmp,spinor* const spinout,const int volume) + { + printf("Applying double precision Dirac-Op...\n"); + Q_pm_psi(spinout, conjungateBasisPSpininTmp);//D^+D statt DD^+ wie ~OperatorDirac + + //assign_add_mul_r(spinout, conjungateBasisPSpininTmp, mStarSquare, volume); + //operatorDirac.check(conjungateBasisPSpininTmp,spinout,volume); + } + + virtual void checkDeinit(spinor* const spinin,spinor* const spinTmp,spinor* const spinout,int volume) + { assign(spinout, spinin, volume); } +}; + + +templateclass MixedsolveOperatorDiracDaggerDiracDiracDaggerDirac:public MixedsolveOperatorDiracDaggerDirac +{ +public: + + virtual void gpu(dev_spinorM(RealT)* spinin,dev_spinorM(RealT)* spinTmp,dev_spinorM(RealT)* spinout,dev_su3_2vM(RealT)* gf,int* dev_nn,const dim3& linAlgGriddim,const dim3& linAlgBlockdim) + { + MixedsolveOperatorDiracDaggerDirac::gpu(spinin ,spinTmp,spinout,gf,dev_nn,linAlgGriddim,linAlgBlockdim); + 
MixedsolveOperatorDiracDaggerDirac::gpu(spinout,spinTmp,spinout,gf,dev_nn,linAlgGriddim,linAlgBlockdim); + } + + + virtual void check(spinor* const conjungateBasisPSpininTmp,spinor* const spinout,const int volume) + { + printf("Applying double precision Dirac-Op 2x...\n"); + Q_pm_psi(spinout, conjungateBasisPSpininTmp);//D^+D statt DD^+ wie ~OperatorDirac + assign(conjungateBasisPSpininTmp, spinout, volume); + Q_pm_psi(spinout, conjungateBasisPSpininTmp);//D^+D statt DD^+ wie ~OperatorDirac + } +}; + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/observables.cuh b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/observables.cuh new file mode 100755 index 0000000000000000000000000000000000000000..366a60913658ab845eeeb0ea1e19ca003df1d6fe --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/observables.cuh @@ -0,0 +1,769 @@ + + + + + + +/// This uses all the single precision device gauge field +/// Think of porting this to double ! + + +// reduction field on host and device +float* redfield; +float * dev_redfield; +float * dev_sredfield; +float * dev_ssredfield; + +// same for polyakov loop +float2 * dev_polyredfield; +float2 * dev_polysredfield; +float2 * dev_polyssredfield; +float2 * polyredfield; + +// size of the first (small) and second (smallsmall) reduction fields +int sredsize, ssredsize; + +// same for polyakov loop +int polysredsize, polyssredsize; + + +int init_dev_observables(){ + cudaError_t cudaerr; + + + // IMPLEMENT THIS FOR ALL LATTICE SIZES !!!!!!!!!!!!!!!!!!!! + if((VOLUME%REDUCTION_N) == 0){ + sredsize = VOLUME/REDUCTION_N; + } + else{ + fprintf(stderr,"Error: Volume is not a multiple of REDUCTION_N (%d). 
Aborting...\n", REDUCTION_N); + exit(100); + } + + if(sredsize < REDUCTION_N){ + ssredsize = 1; + } + else{ + if(sredsize%REDUCTION_N == 0){ + ssredsize = sredsize/REDUCTION_N; + } + else{ + ssredsize = sredsize/REDUCTION_N + 1; + } + } + + + //VOLUME * float on device + cudaMalloc((void **) &dev_redfield, VOLUME*sizeof(float)); + + if((redfield = (float*)malloc(sredsize*sizeof(float)))==(void*)NULL){ + fprintf(stderr,"Error in init_dev_observables: malloc error(plaq)\n"); + return(1); + } + cudaMalloc((void **) &dev_sredfield, sredsize*sizeof(float)); + cudaMalloc((void **) &dev_ssredfield, ssredsize*sizeof(float)); + + if((cudaerr=cudaGetLastError())!=cudaSuccess){ + fprintf(stderr, "Error in init_dev_observables(): GPU memory allocation of reduction fields failed. Aborting...\n"); + return(2); + } + + + + int spatialvol = LX*LY*LZ; + if((spatialvol%REDUCTION_N) == 0){ + polysredsize = spatialvol/REDUCTION_N; + } + else{ + fprintf(stderr,"Error: spatial Volume is not a multiple of REDUCTION_N (%d). Aborting...\n", REDUCTION_N); + exit(100); + } + + if(polysredsize < REDUCTION_N){ + polyssredsize = 1; + } + else{ + if(polysredsize%REDUCTION_N == 0){ + polyssredsize = polysredsize/REDUCTION_N; + } + else{ + polyssredsize = polysredsize/REDUCTION_N + 1; + } + } + + + // spatial volume*2 (->complex) field for Polyakov loop data + cudaMalloc((void **) &dev_polyredfield, spatialvol*sizeof(float2)); + if((polyredfield = (float2*)malloc(spatialvol*sizeof(float2)))==(void*)NULL){ + fprintf(stderr,"Error in init_dev_observables: malloc error(poly)\n"); + return(1); + } + cudaMalloc((void **) &dev_polysredfield, polysredsize*sizeof(float2));//complex !! + cudaMalloc((void **) &dev_polyssredfield, polyssredsize*sizeof(float2));//complex !! + + if((cudaerr=cudaGetLastError())!=cudaSuccess){ + fprintf(stderr, "Error in init_dev_observables(): GPU memory allocation of poly reduction fields failed. 
Aborting...\n"); + return(2); + } + + + cudaMemcpyToSymbol("dev_VOLUME", &VOLUME, sizeof(int)) ; + cudaMemcpyToSymbol("dev_LX", &LX, sizeof(int)) ; + cudaMemcpyToSymbol("dev_LY", &LY, sizeof(int)) ; + cudaMemcpyToSymbol("dev_LZ", &LZ, sizeof(int)) ; + cudaMemcpyToSymbol("dev_T", &T, sizeof(int)) ; + + + return(0); +} + + + + + +void finalize_dev_observables(){ + free(redfield); + cudaFree(dev_redfield); + cudaFree(dev_sredfield); + cudaFree(dev_ssredfield); + cudaFree(dev_polyredfield); + cudaFree(dev_polysredfield); + cudaFree(dev_polyssredfield); +} + + + + +#ifndef HALF +// this is a reduction algorithm for float based on the CUDA SDK +__global__ void reduce_float(float *g_idata, float *g_odata, unsigned int n) +{ + extern __shared__ float sdata[]; + + // load shared mem + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; + + sdata[tid] = (i < n) ? g_idata[i] : 0; + + __syncthreads(); + + // do reduction in shared mem + for(unsigned int s=blockDim.x/2; s>0; s>>=1) + { + if (tid < s) + { + sdata[tid] += sdata[tid + s]; + } + __syncthreads(); + } + + // write result for this block to global mem + if (tid == 0) g_odata[blockIdx.x] = sdata[0]; +} + + + + +// this is the version for float2 +__global__ void reduce_float2(float2 *g_idata, float2 *g_odata, unsigned int n) +{ + extern __shared__ float2 sdata2[]; + + // load shared mem + unsigned int tid = threadIdx.x; + unsigned int i = blockIdx.x*blockDim.x + threadIdx.x; + + sdata2[tid].x = (i < n) ? g_idata[i].x : 0; + sdata2[tid].y = (i < n) ? 
g_idata[i].y : 0; + + __syncthreads(); + + // do reduction in shared mem + for(unsigned int s=blockDim.x/2; s>0; s>>=1) + { + if (tid < s) + { + sdata2[tid].x += sdata2[tid + s].x; + sdata2[tid].y += sdata2[tid + s].y; + } + __syncthreads(); + } + + // write result for this block to global mem + if (tid == 0) { + g_odata[blockIdx.x].x = sdata2[0].x; + g_odata[blockIdx.x].y = sdata2[0].y; + } +} + +#endif + + + + + +// update the global device gaugefield with host gaugefield given by gf +// this is all single precision! +template +void update_dev_gaugefield(su3** gf,MixedsolveParameter& mixedsolveParameter){ + + size_t dev_gfsize; + #ifdef GF_8 + dev_gfsize = 2*4*VOLUME * sizeof(typename dev_su3_8T::type); + su3to8(gf,mixedsolveParameter.h2d_gf); + #else + dev_gfsize = 3*4*VOLUME * sizeof(typename dev_su3_2vT::type); + su3to2vf4(gf,mixedsolveParameter.h2d_gf); + #endif + cudaMemcpy(mixedsolveParameter.dev_gf, mixedsolveParameter.h2d_gf, dev_gfsize, cudaMemcpyHostToDevice); +} + + + + + + + + + + + + +// calculates the mean plaquette of the gauge field +__global__ void dev_mean_plaq(float* reductionfield, int * dev_nn, dev_su3_2v * gf){ + typedef REAL RealT; + float mplaq = 0.0; + int x0pos, x1pos, x2pos ; /* x0pos = basepoint of plaquette, x1pos = x0pos + e_mu, x2pos = x0pos + e_nu */ + int t,mu,nu; + dev_su3 su3matrix,su3matrix2, M1,M2,M3,M4; + + x0pos = threadIdx.x + blockDim.x*blockIdx.x; + #ifdef TEMPORALGAUGE + int spatialvol = dev_LX*dev_LY*dev_LZ; + #endif + + + if(x0pos < dev_VOLUME){ + + + //nu == T-direction -> beware temporal gauge and GF8 + nu = 0; + for(mu =nu+1; mu < 4; mu++){ + x1pos = dev_nn[8*x0pos + mu]; + x2pos = dev_nn[8*x0pos + nu]; + +/* U_mu(x) */ + #ifdef GF_8 + dev_reconstructgf_8texref(gf, (4*x0pos+mu),&M1); + #else + dev_reconstructgf_2vtexref(gf, (4*x0pos+mu),&M1); + #endif +/* U_nu(x+e_mu) */ + #ifdef TEMPORALGAUGE + t = x0pos/spatialvol; // are we on timeslice T-1? 
no -> U==ID + if(t != (dev_T-1) ){ + dev_unit_su3(&M2); + } + else{ + #ifdef GF_8 + dev_reconstructgf_8texref(gf, (4*x1pos+nu),&M2); + #else + dev_reconstructgf_2vtexref(gf, (4*x1pos+nu),&M2); + #endif + } + #else + #ifdef GF_8 + dev_reconstructgf_8texref(gf, (4*x1pos+nu),&M2); + #else + dev_reconstructgf_2vtexref(gf, (4*x1pos+nu),&M2); + #endif + #endif /*TEMPORALGAUGE*/ + dev_su3_ti_su3(&su3matrix, &M1,&M2); + +/* Udagger_mu(x+e_nu) */ + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(gf, (4*x2pos+mu),&M3); + #else + dev_reconstructgf_2vtexref_dagger(gf, (4*x2pos+mu),&M3); + #endif + dev_su3_ti_su3(&su3matrix2, &su3matrix,&M3); +/* Udagger_nu(x)*/ + #ifdef TEMPORALGAUGE + if(t != (dev_T-1) ){ + dev_unit_su3(&M4); + } + else{ + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(gf, (4*x0pos+nu),&M4); + #else + dev_reconstructgf_2vtexref_dagger(gf, (4*x0pos+nu),&M4); + #endif + } + #else + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(gf, (4*x0pos+nu),&M4); + #else + dev_reconstructgf_2vtexref_dagger(gf, (4*x0pos+nu),&M4); + #endif + #endif /*TEMPORALGAUGE*/ + dev_su3_ti_su3(&su3matrix, &su3matrix2,&M4); + mplaq += dev_su3Retrace(&su3matrix)/3.0; + + /* multiply these and store in su3matrix*/ + /* + dev_su3_ti_su3(&su3matrix, &M3,&M4); + dev_su3_ti_su3(&su3matrix2, &M2,&su3matrix); + dev_su3_ti_su3(&su3matrix, &M1,&su3matrix2); + + mplaq += dev_su3Retrace(&su3matrix)/3.0; + */ + } + + + // nu != T-direction -> no problem with temporal gauge and GF8 + for(nu=1;nu <3; nu++){ + for(mu =nu+1; mu < 4; mu++){ + x1pos = dev_nn[8*x0pos + mu]; + x2pos = dev_nn[8*x0pos + nu]; + +/* U_nu(x) */ + #ifdef GF_8 + dev_reconstructgf_8texref(gf, (4*x0pos+mu),&M1); + #else + dev_reconstructgf_2vtexref(gf, (4*x0pos+mu),&M1); + #endif +/* U_mu(x+e_mu) */ + #ifdef GF_8 + dev_reconstructgf_8texref(gf, (4*x1pos+nu),&M2); + #else + dev_reconstructgf_2vtexref(gf, (4*x1pos+nu),&M2); + #endif + dev_su3_ti_su3(&su3matrix, &M1,&M2); +/* Udagger_nu(x+e_nu) */ + #ifdef GF_8 + 
dev_reconstructgf_8texref_dagger(gf, (4*x2pos+mu),&M3); + #else + dev_reconstructgf_2vtexref_dagger(gf, (4*x2pos+mu),&M3); + #endif + dev_su3_ti_su3(&su3matrix2, &su3matrix,&M3); +/* Udagger_mu(x)*/ + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(gf, (4*x0pos+nu),&M4); + #else + dev_reconstructgf_2vtexref_dagger(gf, (4*x0pos+nu),&M4); + #endif + dev_su3_ti_su3(&su3matrix, &su3matrix2,&M4); + mplaq += dev_su3Retrace(&su3matrix)/3.0; + /* multiply these and store in su3matrix*/ + /* + dev_su3_ti_su3(&su3matrix, &M3,&M4); + dev_su3_ti_su3(&su3matrix2, &M2,&su3matrix); + dev_su3_ti_su3(&su3matrix, &M1,&su3matrix2); + + mplaq += dev_su3Retrace(&su3matrix)/3.0; + */ + } + } + + reductionfield[x0pos] = mplaq; + } +} + + + + + + + +float calc_plaquette(dev_su3_2v * U, int* nn){ + float erg=0.0; + int j; + #ifdef USETEXTURE + //Bind texture gf + bind_texture_gf(U); + #endif + + int gridsize; + int blocksize=BLOCK2; + if( VOLUME >= BLOCK2){ + gridsize = (int)(VOLUME/BLOCK2) + 1; + } + else{ + gridsize=1; + } + + dev_mean_plaq <<< gridsize , blocksize >>> (dev_redfield, nn, U) ; + printf("Plaquette calculation on device: %s\n", cudaGetErrorString(cudaGetLastError())); + + #ifdef USETEXTURE + unbind_texture_gf(); + #endif + + int redblocks; + if(sredsize > 1){ + redblocks = sredsize; // VOLUME/REDUCTION_N + reduce_float <<< redblocks, REDUCTION_N, REDUCTION_N*sizeof(float) >>> + ( dev_redfield, dev_sredfield, VOLUME); + printf("Reduction 1 of data: %s\n", cudaGetErrorString(cudaGetLastError())); + } + if(ssredsize > 1){ + redblocks = ssredsize; + reduce_float <<< redblocks, REDUCTION_N, REDUCTION_N*sizeof(float) >>> + ( dev_sredfield, dev_ssredfield, sredsize ); + printf("Reduction 2 of data: %s\n", cudaGetErrorString(cudaGetLastError())); + + cudaMemcpy(redfield, dev_ssredfield, (size_t)(redblocks*sizeof(float)), cudaMemcpyDeviceToHost); + } + else{ + cudaMemcpy(redfield, dev_sredfield, (size_t)(redblocks*sizeof(float)), cudaMemcpyDeviceToHost); + } + + // we have to add up 
the final sum on host + for(j=0; j nu +// ^ +// | +// +// ^ mu +// | +// x +__device__ float dev_onerect(int * dev_nn, dev_su3_2v * gf, int tid, + int x0pos, int mu, int nu ){ + typedef REAL RealT; + int x1pos, x2pos; + dev_su3 M1,M2, su3matrix; + + __shared__ dev_su3 su3matrix2[BLOCK]; + + + x1pos = dev_nn[8*x0pos + mu]; + x2pos = dev_nn[8*x1pos + mu]; + +/* U_mu(x) */ + #ifdef GF_8 + dev_reconstructgf_8texref(gf, (4*x0pos+mu),&M1); + #else + dev_reconstructgf_2vtexref(gf, (4*x0pos+mu),&M1); + #endif +/* U_mu(x+e_mu) */ + #ifdef GF_8 + dev_reconstructgf_8texref(gf, (4*x1pos+mu),&M2); + #else + dev_reconstructgf_2vtexref(gf, (4*x1pos+mu),&M2); + #endif + dev_su3_ti_su3(&su3matrix2[tid], &M1,&M2); + +/* U_nu(x+ 2 e_mu) */ + #ifdef GF_8 + dev_reconstructgf_8texref(gf, (4*x2pos+nu),&M1); + #else + dev_reconstructgf_2vtexref(gf, (4*x2pos+nu),&M1); + #endif + dev_su3_ti_su3(&su3matrix, &su3matrix2[tid], &M1); + + + + //x0pos = x + dev_LX*(y + dev_LY*(z + dev_LZ*t)); + x1pos = dev_nn[8*x0pos + mu]; + x2pos = dev_nn[8*x1pos + nu]; + + x1pos = dev_nn[8*x0pos + nu]; + +/* Udagger_mu(x+e_nu+e_mu)*/ + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(gf, (4*x2pos+mu),&M1); + #else + dev_reconstructgf_2vtexref_dagger(gf, (4*x2pos+mu),&M1); + #endif + dev_su3_ti_su3(&su3matrix2[tid], &su3matrix , &M1); + /* Udagger_mu(x+e_nu) */ + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(gf, (4*x1pos+mu),&M2); + #else + dev_reconstructgf_2vtexref_dagger(gf, (4*x1pos+mu),&M2); + #endif + dev_su3_ti_su3(&su3matrix, &su3matrix2[tid] , &M2); +/* Udagger_nu(x)*/ + #ifdef GF_8 + dev_reconstructgf_8texref_dagger(gf, (4*x0pos+nu),&M1); + #else + dev_reconstructgf_2vtexref_dagger(gf, (4*x0pos+nu),&M1); + #endif + dev_su3_ti_su3(&su3matrix2[tid], &su3matrix , &M1); + + + + float help = dev_su3Retrace(&su3matrix2[tid])/3.0; +return(help); +} + + + + +// calculates the rectangles of the gauge field +// uses 2d parallelization: +__global__ void dev_rectangle(float* reductionfield, + int * dev_nn, 
dev_su3_2v * gf){ + float mrect = 0.0; + int x0pos,mu,nu, ix; + + ix = threadIdx.x; + x0pos = threadIdx.x + blockDim.x*blockIdx.x; + + if(x0pos < dev_VOLUME){ + + + mrect += dev_onerect(dev_nn, gf, ix, x0pos, 0, 1); + mrect += dev_onerect(dev_nn, gf, ix, x0pos, 0, 2); + mrect += dev_onerect(dev_nn, gf, ix, x0pos, 0, 3); + + mrect += dev_onerect(dev_nn, gf, ix, x0pos, 1, 0); + mrect += dev_onerect(dev_nn, gf, ix, x0pos, 1, 2); + mrect += dev_onerect(dev_nn, gf, ix, x0pos, 1, 3); + + mrect += dev_onerect(dev_nn, gf, ix, x0pos, 2, 0); + mrect += dev_onerect(dev_nn, gf, ix, x0pos, 2, 1); + mrect += dev_onerect(dev_nn, gf, ix, x0pos, 2, 3); + + mrect += dev_onerect(dev_nn, gf, ix, x0pos, 3, 0); + mrect += dev_onerect(dev_nn, gf, ix, x0pos, 3, 1); + mrect += dev_onerect(dev_nn, gf, ix, x0pos, 3, 2); + + + + reductionfield[x0pos] = mrect; + + } + +} + + + + + +float calc_rectangle(dev_su3_2v * U, int* nn){ + float erg=0.0; + int j; + #ifdef USETEXTURE + //Bind texture gf + bind_texture_gf(U); + #endif + + int gridsize; + int blocksize=BLOCK; + if( VOLUME >= BLOCK){ + gridsize = (int)(VOLUME/BLOCK) + 1; + } + else{ + gridsize=1; + } + + dev_rectangle <<< gridsize , blocksize >>> (dev_redfield, nn, U) ; + printf("Rectangle calculation on device: %s\n", cudaGetErrorString(cudaGetLastError())); + + #ifdef USETEXTURE + unbind_texture_gf(); + #endif + + int redblocks; + if(sredsize > 0){ + redblocks = sredsize; // VOLUME/REDUCTION_N + + reduce_float <<< redblocks, REDUCTION_N, REDUCTION_N*sizeof(float) >>> + ( dev_redfield, dev_sredfield, VOLUME); + printf("Reduction 1 of data: %s\n", cudaGetErrorString(cudaGetLastError())); + } + if(ssredsize > 0){ + redblocks = ssredsize; + reduce_float <<< redblocks, REDUCTION_N, REDUCTION_N*sizeof(float) >>> + ( dev_sredfield, dev_ssredfield, sredsize ); + printf("Reduction 2 of data: %s\n", cudaGetErrorString(cudaGetLastError())); + + cudaMemcpy(redfield, dev_ssredfield, (size_t)(redblocks*sizeof(float)), cudaMemcpyDeviceToHost); + } + 
else{ + cudaMemcpy(redfield, dev_sredfield, (size_t)(redblocks*sizeof(float)), cudaMemcpyDeviceToHost); + } + // we have to add up the final sum on host + for(j=0; j set actualpos to spatialpos + int actualpos = spatialpos; + + for(t=0; t < dev_T; t++){ + +/* U_0(x) */ + #ifdef GF_8 + dev_reconstructgf_8texref(gf, (4*actualpos),&M1); + #else + dev_reconstructgf_2vtexref(gf, (4*actualpos),&M1); + #endif + + //multiply + dev_su3_ti_su3(&tmp, &gather, &M1); + //store again gather + dev_su3_assign(&gather, &tmp); + + //go one step in 0-direction + actualpos = dev_nn[8*actualpos]; + } + dev_su3trace(&poly, &gather); + reductionfield[spatialpos].x = poly.x/3.0; + reductionfield[spatialpos].y = poly.y/3.0; + } +} + + + + + + + + + +void calc_polyakov_0(float2* ret, dev_su3_2v * U, int* nn){ + int j; + float2 erg; + #ifdef USETEXTURE + //Bind texture gf + bind_texture_gf(U); + #endif + + int gridsize; + int blocksize=BLOCK2; + int spatialvol = LX*LY*LZ; + if( spatialvol >= BLOCK2){ + gridsize = (int)(spatialvol/BLOCK2) + 1; + } + else{ + gridsize=1; + } + + dev_polyakov_0 <<< gridsize , blocksize >>> (dev_polyredfield, nn, U) ; + printf("Polyakov loop calculation on device: %s\n", cudaGetErrorString(cudaGetLastError())); + + #ifdef USETEXTURE + unbind_texture_gf(); + #endif + + + + + int redblocks = polysredsize; // VOLUME/REDUCTION_N + cudaMemcpy(polyredfield, dev_polyredfield, (size_t)(redblocks*sizeof(float2)), cudaMemcpyDeviceToHost); + /* write to file */ + + if(polysredsize > 1){ + reduce_float2 <<< redblocks, REDUCTION_N, REDUCTION_N*sizeof(float2) >>> + ( dev_polyredfield, dev_polysredfield, spatialvol); + printf("Reduction 1 of data: %s\n", cudaGetErrorString(cudaGetLastError())); + } + if(polyssredsize > 1){ + redblocks = polyssredsize; + reduce_float2 <<< redblocks, REDUCTION_N, REDUCTION_N*sizeof(float2) >>> + ( dev_polysredfield, dev_polyssredfield, polysredsize ); + printf("Reduction 2 of data: %s\n", cudaGetErrorString(cudaGetLastError())); + + 
cudaMemcpy(polyredfield, dev_polyssredfield, (size_t)(redblocks*sizeof(float2)), cudaMemcpyDeviceToHost); + } + else{ + cudaMemcpy(polyredfield, dev_polysredfield, (size_t)(redblocks*sizeof(float2)), cudaMemcpyDeviceToHost); + } + + + // we have to add up the final sum on host + for(j=0; j maxblockdim + + ks=0.0; + kc=0.0; + + if(blockDim.x > dev_VOLUME){ + stepwidth = 1; + } + else{ + stepwidth = dev_VOLUME/(gridDim.x*blockDim.x); + } + + int start = (blockIdx.x*blockDim.x + threadIdx.x)*stepwidth; + int end = (blockIdx.x*blockDim.x + threadIdx.x+1)*stepwidth; + + for(pos=start;pos maxblockdim + + ks=0.0; + kc=0.0; + + if(blockDim.x > dev_VOLUME){ + stepwidth = 1; + } + else{ + stepwidth = dev_VOLUME/(gridDim.x*blockDim.x); + } + + int start = (blockIdx.x*blockDim.x + threadIdx.x)*stepwidth; + int end = (blockIdx.x*blockDim.x + threadIdx.x+1)*stepwidth; + + for(pos=start;pos maxblockdim + + ks=0.0; + kc=0.0; + + if(ACCUM_N > dev_VOLUME){ + stepwidth = 1; + sweepsperthread = 1; + } + else{ + stepwidth = dev_VOLUME/ACCUM_N; + sweepsperthread = ACCUM_N/blockDim.x; + } + + + + for(int j = 0; j < sweepsperthread; j++){ + + int start = (threadIdx.x + j*blockDim.x)*stepwidth; + int end = (threadIdx.x+j*blockDim.x+1)*stepwidth; + ks=0.0; + kc=0.0; + + for(pos=start;pos 0; stride >>= 1){ + __syncthreads(); + for(int iAccum = threadIdx.x; iAccum < stride; iAccum += blockDim.x) + shrinkarray[iAccum] += shrinkarray[stride + iAccum]; + } + + if(threadIdx.x == 0) (*erg) = shrinkarray[0]; + + + /* + if(threadIdx.x==0){ + ks=0.0; + kc=0.0; + int k; + for(k=0; k. 
+ * + * + * File: textures.cuh + * + * CUDA texture functions and references + * + * + * + **************************************************************************/ + + +#ifdef HAVE_CONFIG_H + #include +#endif + + /* texture for nearest neighbours*/ + texture nn_tex; + const textureReference* nn_texRefPtr = NULL; + cudaChannelFormatDesc nn_channelDesc; + + /* texture for spinor field */ + texture spin_tex; + const textureReference* spin_texRefPtr = NULL; + cudaChannelFormatDesc spin_channelDesc; + + /* texture for spinor field 2*/ + texture spin_tex2; + const textureReference* spin_texRefPtr2 = NULL; + cudaChannelFormatDesc spin_channelDesc2; + + +#ifndef HALF + /* texture for gauge field */ + texture gf_tex; + const textureReference* gf_texRefPtr = NULL; + cudaChannelFormatDesc gf_channelDesc; + + +extern "C" int bind_texture_spin(dev_spinor* s, int i){ + + size_t size; + + #ifdef MPI + if(even_odd_flag){ + size = sizeof(float4)*6*(VOLUME+RAND)/2; + } + else{ + size = sizeof(float4)*6*(VOLUME+RAND); + } + #else + if(even_odd_flag){ + size = sizeof(float4)*6*VOLUME/2; + } + else{ + size = sizeof(float4)*6*VOLUME; + } + #endif + + + switch(i){ + case 1: + //printf("Binding texture to spinorfield 1\n"); + spin_texRefPtr = NULL; + cudaGetTextureReference(&spin_texRefPtr, "spin_tex"); + spin_channelDesc = cudaCreateChannelDesc(); + cudaBindTexture(0, spin_texRefPtr, s, &spin_channelDesc, size); + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + return(0); + + case 2: + //printf("Binding texture to spinorfield 2\n"); + spin_texRefPtr2 = NULL; + cudaGetTextureReference(&spin_texRefPtr2, "spin_tex2"); + spin_channelDesc2 = cudaCreateChannelDesc(); + cudaBindTexture(0, spin_texRefPtr2, s, &spin_channelDesc2, size); + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + return(0); + } +return(1); +} + + +extern "C" int unbind_texture_spin(int i){ + switch(i){ + case 1: + //printf("Unbinding texture of spinorfield 1\n"); + 
cudaUnbindTexture(spin_texRefPtr); + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + return(0); + case 2: + //printf("Unbinding texture of spinorfield 2\n"); + cudaUnbindTexture(spin_texRefPtr2); + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + return(0); + } + +return(1); +} + + + + +#else + + /* texture for gauge field */ + texture gf_tex; + const textureReference* gf_texRefPtr = NULL; + cudaChannelFormatDesc gf_channelDesc; + + // the textures for the half spinors are defined in half.cuh + +#endif // NOT HALF + + + + + + + +extern "C" int bind_texture_gf(dev_su3_2v * gf){ + //printf("Binding texture to gaugefield\n"); + + #ifdef MPI + #ifdef GF_8 + size_t size = sizeof(float4)*2*(VOLUME+RAND)*4; + #else + size_t size = sizeof(float4)*3*(VOLUME+RAND)*4; + #endif + #else + #ifdef GF_8 + size_t size = sizeof(float4)*2*VOLUME*4; + #else + size_t size = sizeof(float4)*3*VOLUME*4; + #endif + #endif + + cudaGetTextureReference(&gf_texRefPtr, "gf_tex"); + gf_channelDesc = cudaCreateChannelDesc(); + cudaBindTexture(0, gf_texRefPtr, gf, &gf_channelDesc, size); + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + return(0); +} + + +extern "C" int unbind_texture_gf(){ + //printf("Unbinding texture to gaugefield\n"); + cudaUnbindTexture(gf_texRefPtr); + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + return(0); +} + + + + + + + +extern "C" int bind_texture_nn(int* nn){ + //printf("Binding texture to nn field\n"); + size_t size; + + #ifdef MPI + if(even_odd_flag){ + size = sizeof(int)*8*(VOLUME+RAND)/2; + } + else{ + size = sizeof(int)*8*(VOLUME+RAND); + } + #else + if(even_odd_flag){ + size = sizeof(int)*8*VOLUME/2; + } + else{ + size = sizeof(int)*8*VOLUME; + } + #endif + + + cudaGetTextureReference(&nn_texRefPtr, "nn_tex"); + nn_channelDesc = cudaCreateChannelDesc(); + cudaBindTexture(0, nn_texRefPtr, nn, &nn_channelDesc, size); + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + return(0); +} + + +extern "C" int 
unbind_texture_nn(){ + //printf("Unbinding texture to nn field\n"); + cudaUnbindTexture(nn_texRefPtr); + //printf("%s\n", cudaGetErrorString(cudaGetLastError())); + return(0); +} + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/tm_diracoperator.cuh b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/tm_diracoperator.cuh new file mode 100644 index 0000000000000000000000000000000000000000..c8f8edffd67b644f97e987a1ec855e6acabb470a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/GPU/tm_diracoperator.cuh @@ -0,0 +1,347 @@ +/*********************************************************************** + * + * Copyright (C) 2010 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + * + * File: tm_diracoperator.cuh + * + * CUDA twisted mass dirac operator and its adjoint + * + * + * + **************************************************************************/ + + + + + + +//applies the full tm Operator +// uses texture cache (spin_tex) for input spinor +// runs through whole lattice for output spinor +// D_psi uses phase_mu and not ka_mu for the boundary conds (vice versa in HoppingMatrix) +// -> thats why complexmult and complexcgmult are interchanged in dev_HoppingMatrix and in +// dev_tm_dirac_kappa + + + +template +__global__ void dev_tm_dirac_kappa +( + typename dev_su3_2vT::type * gf, + typename dev_spinorT::type * sin, + typename dev_spinorT::type * sout, + int * dev_nn +){ + int pos,hoppos; + typename dev_spinorT::type shelp1[6], ssum[6]; + __shared__ typename dev_su3T::type gfsmem[BLOCK]; + + + pos= threadIdx.x + blockDim.x*blockIdx.x; + int ix = threadIdx.x; + if(pos < dev_VOLUME){ + + //dev_zero_spinor(&(ssum[0])); // zero sum + //skalarer Term + #ifdef USETEXTURE + ssum[0] = tex1Dfetch(spin_tex,6*pos); + ssum[1] = tex1Dfetch(spin_tex,6*pos+1); + ssum[2] = tex1Dfetch(spin_tex,6*pos+2); + ssum[3] = tex1Dfetch(spin_tex,6*pos+3); + ssum[4] = tex1Dfetch(spin_tex,6*pos+4); + ssum[5] = tex1Dfetch(spin_tex,6*pos+5); + #else + ssum[0] = sin[6*pos]; + ssum[1] = sin[6*pos+1]; + ssum[2] = sin[6*pos+2]; + ssum[3] = sin[6*pos+3]; + ssum[4] = sin[6*pos+4]; + ssum[5] = sin[6*pos+5]; + #endif + +//hopping term +//l==0,t + //positive direction + hoppos = dev_nn[8*pos]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref (gf,4*pos,&(gfsmem[ix])); + #else + dev_reconstructgf_2vtexref(gf,4*pos,&(gfsmem[ix])); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix], &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk0),&(shelp1[0]), &(ssum[0])); + //dev_GammatV(0,&(shelp1[0])); + dev_Gamma0(&(shelp1[0])); 
+ dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k0),&(shelp1[0]), &(ssum[0])); + + //negative direction + hoppos = dev_nn[8*pos+4]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger (gf,4*hoppos,&(gfsmem[ix])); + #else + dev_reconstructgf_2vtexref_dagger(gf,4*hoppos,&(gfsmem[ix])); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix], &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r + gamma_mu) + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk0),&(shelp1[0]), &(ssum[0])); + //dev_GammatV(0,&(shelp1[0])); + dev_Gamma0(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk0),&(shelp1[0]), &(ssum[0])); + + +//l==3,z + //positive direction + hoppos = dev_nn[8*pos+3]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref (gf,4*pos+(3),&(gfsmem[ix])); + #else + dev_reconstructgf_2vtexref(gf,4*pos+(3),&(gfsmem[ix])); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix], &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk3),&(shelp1[0]), &(ssum[0])); + //dev_GammatV(3,&(shelp1[0])); + dev_Gamma3(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k3),&(shelp1[0]), &(ssum[0])); + + //negative direction + hoppos = dev_nn[8*pos+7]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger (gf,4*hoppos+(3),&(gfsmem[ix])); + #else + dev_reconstructgf_2vtexref_dagger(gf,4*hoppos+(3),&(gfsmem[ix])); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix], &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r + gamma_mu) + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk3),&(shelp1[0]), &(ssum[0])); + //dev_GammatV(3,&(shelp1[0])); + dev_Gamma3(&(shelp1[0])); + 
dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk3),&(shelp1[0]), &(ssum[0])); + + +//l==2,y + //positive direction + hoppos = dev_nn[8*pos+2]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref (gf,4*pos+(2),&(gfsmem[ix])); + #else + dev_reconstructgf_2vtexref(gf,4*pos+(2),&(gfsmem[ix])); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix], &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk2),&(shelp1[0]), &(ssum[0])); + //dev_GammatV(2,&(shelp1[0])); + dev_Gamma2(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k2),&(shelp1[0]), &(ssum[0])); + + //negative direction + hoppos = dev_nn[8*pos+6]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger (gf,4*hoppos+(2),&(gfsmem[ix])); + #else + dev_reconstructgf_2vtexref_dagger(gf,4*hoppos+(2),&(gfsmem[ix])); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix], &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r + gamma_mu) + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk2),&(shelp1[0]), &(ssum[0])); + //dev_GammatV(2,&(shelp1[0])); + dev_Gamma2(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk2),&(shelp1[0]), &(ssum[0])); + + +//l==1,x + //positive direction + hoppos = dev_nn[8*pos+1]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref (gf,4*pos+(1),&(gfsmem[ix])); + #else + dev_reconstructgf_2vtexref(gf,4*pos+(1),&(gfsmem[ix])); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix], &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r - gamma_mu) + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk1),&(shelp1[0]), &(ssum[0])); + //dev_GammatV(1,&(shelp1[0])); + dev_Gamma1(&(shelp1[0])); + 
dev_complexmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_k1),&(shelp1[0]), &(ssum[0])); + + //negative direction + hoppos = dev_nn[8*pos+5]; + //color + #ifdef GF_8 + dev_reconstructgf_8texref_dagger (gf,4*hoppos+(1),&(gfsmem[ix])); + #else + dev_reconstructgf_2vtexref_dagger(gf,4*hoppos+(1),&(gfsmem[ix])); + #endif + #ifdef USETEXTURE + dev_su3MtV_spintex(gfsmem[ix], hoppos, &(shelp1[0])); + #else + dev_su3MtV (gfsmem[ix], &(sin[6*hoppos]), &(shelp1[0])); + #endif + //-kappa(r + gamma_mu) + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk1),&(shelp1[0]), &(ssum[0])); + //dev_GammatV(1,&(shelp1[0])); + dev_Gamma1(&(shelp1[0])); + dev_complexcgmult_add_assign_spinor(&(ssum[0]),dev_complexT(dev_mk1),&(shelp1[0]), &(ssum[0])); + + + + //gamma5 term + #ifdef USETEXTURE + shelp1[0] = tex1Dfetch(spin_tex,6*pos); + shelp1[1] = tex1Dfetch(spin_tex,6*pos+1); + shelp1[2] = tex1Dfetch(spin_tex,6*pos+2); + shelp1[3] = tex1Dfetch(spin_tex,6*pos+3); + shelp1[4] = tex1Dfetch(spin_tex,6*pos+4); + shelp1[5] = tex1Dfetch(spin_tex,6*pos+5); + #else + shelp1[0] = sin[6*pos]; + shelp1[1] = sin[6*pos+1]; + shelp1[2] = sin[6*pos+2]; + shelp1[3] = sin[6*pos+3]; + shelp1[4] = sin[6*pos+4]; + shelp1[5] = sin[6*pos+5]; + #endif + + + //dev_GammatV(4,&(shelp1[0])); + dev_Gamma5(&(shelp1[0])); + dev_complexmult_add_assign_spinor(&(ssum[0]),dev_initcomplex(0.0,2.0*kappa*mu),&(shelp1[0]), &(sout[6*pos])); + } +} + + + + + +template +__global__ void dev_gamma5(typename dev_spinorT::type * sin,typename dev_spinorT::type * sout){ + int pos; + pos= threadIdx.x + blockDim.x*blockIdx.x; + if(pos < dev_VOLUME){ + sout[6*pos+0].x = sin[6*pos+0].x; + sout[6*pos+0].y = sin[6*pos+0].y; + sout[6*pos+0].z = sin[6*pos+0].z; + sout[6*pos+0].w = sin[6*pos+0].w; + sout[6*pos+1].x = sin[6*pos+1].x; + sout[6*pos+1].y = sin[6*pos+1].y; + + sout[6*pos+1].z = sin[6*pos+1].z; + sout[6*pos+1].w = sin[6*pos+1].w; + sout[6*pos+2].x = sin[6*pos+2].x; + sout[6*pos+2].y = sin[6*pos+2].y; + 
sout[6*pos+2].z = sin[6*pos+2].z; + sout[6*pos+2].w = sin[6*pos+2].w; + + sout[6*pos+3].x = -1.0*sin[6*pos+3].x; + sout[6*pos+3].y = -1.0*sin[6*pos+3].y; + sout[6*pos+3].z = -1.0*sin[6*pos+3].z; + sout[6*pos+3].w = -1.0*sin[6*pos+3].w; + sout[6*pos+4].x = -1.0*sin[6*pos+4].x; + sout[6*pos+4].y = -1.0*sin[6*pos+4].y; + + sout[6*pos+4].z = -1.0*sin[6*pos+4].z; + sout[6*pos+4].w = -1.0*sin[6*pos+4].w; + sout[6*pos+5].x = -1.0*sin[6*pos+5].x; + sout[6*pos+5].y = -1.0*sin[6*pos+5].y; + sout[6*pos+5].z = -1.0*sin[6*pos+5].z; + sout[6*pos+5].w = -1.0*sin[6*pos+5].w; + } +} + + + + + +template +void dev_tm_dirac_dagger_kappa +( + typename dev_su3_2vT::type * gf, + typename dev_spinorT::type* spinin, + typename dev_spinorT::type* spinout, + int *grid, int * nn_grid, RealT* output,RealT* erg, int xsize, int ysize +){ + int gridsize; + if( VOLUME >= 128){ + gridsize =VOLUME/128; + } + else{ + gridsize=1; + } + dim3 griddim2(gridsize,1,1); + dim3 blockdim2(128,1,1); + dim3 blockdim(xsize,ysize); + + dim3 blockdim3(BLOCK,1,1); + if( VOLUME >= BLOCK){ + gridsize = (int)(VOLUME/BLOCK) + 1; + } + else{ + gridsize=1; + } + dim3 griddim3(gridsize,1,1); + dev_gamma5 <<>> (spinin,spinout); + dev_tm_dirac_kappa <<>> (gf, spinout, spinin, dev_nn); + dev_gamma5 <<>>(spinin,spinout); +} + + + + +__global__ void dev_swapmu(){ + if(blockIdx.x == 0 && threadIdx.x == 0){ + mu = - mu; + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/HOWTO-benchmark b/qcd/part_cpu/applications/QCD/src/kernel_D/HOWTO-benchmark new file mode 100644 index 0000000000000000000000000000000000000000..ec591d91eeeb43512f59bd36ba8317b83ba469fb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/HOWTO-benchmark @@ -0,0 +1,91 @@ +Benchmark of the even-odd preconditioned Dirac operator + +Example commands to compile the program you can find at the end of this +file. You may paste them into a file ccc and use it to compile the +benchmark tool. 
+ +The examples from below need to be edited in order to give the proper path for +the mpicc or cc. The compile options may also have to be adapted. +In particular +-DSSE2 -DP4 +should only be given for a Pentium4 (R) system, +-DSSE3 -DP4 +only for Pentium4 prescott (R). +And +-DSSE2 -DOPTERON +should only be given for an AMD Opteron (R) system. -DSSE2|3 is tested +only to work with the gnu compiler. SSE3 works for gcc version >= 3.3.4 +(3.3.3 on x86_64). + +You should always set -D_GAUGE_COPY and -D_NEW_GEOMERY. + +There are two different parallelisations available, a one dimensional +parallelisation (set -DMPI -DPARALLELT) and a two dimensional +parallelisation (set -DMPI -DPARALLELXT). +If none of them are used, you will get a serial version of the program. + +The local lattice size in the case of the one dimensional +parallelisation is controlled by the parameters in the file +benchmark.input: + +T = 32 +L = 16 + +which will give a 32 x 16^3 global lattice. + +NrXProcs = 2 + +needs only to be set in case of a 2-dim. parallelisation and sets +the number of processes in x-direction. The number of processes in +t-direction is computed from NrXProcs and the total number of processes. +You should only take care that all this fits with the lattice size. + + + +The package size of the data that are sent and received is +192 * (1/2) * L^3 Byte in case of the one dimensional parallelisation. +In case of the two dimensional parallelisation it is +192 * (1/2) ((L*L*L/N_PROC_X)+(T*L*L)) Byte. + +A run of the benchmark takes about one minute. 
+ +The out-put of the program is something like this: (T=2,L=16) + +The number of processes is 12 +The local lattice size is 2 x 16 ^3 +total time 4.681349e+00 sec, Variance of the time 6.314982e-03 sec + + (297 Mflops [64 bit arithmetic]) + +communication switched off + (577 Mflops [64 bit arithmetic]) + +The size of the package is 393216 Byte +The bandwidth is 84.49 + 84.49 MB/sec + + +If you use the serial version of course the part depending on the +parallel setup will be missing. + + +Compilation commands (you need a c-compiler with c99 standard, otherwise you may need to define inline, restrict etc. to nothing): + +in general (gcc) +gcc -std=c99 -I. -I./ -I.. -o benchmark -D_GAUGE_COPY -O Hopping_Matrix.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c -lm + +gcc and OPTERON (64 Bit architecture): +gcc -std=c99 -I. -I./ -I.. -o benchmark -DOPTERON -DSSE2 -mfpmath=387 -fomit-frame-pointer -ffloat-store -D_GAUGE_COPY -O Hopping_Matrix.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c -lm + +gcc and pentium4: +gcc -std=c99 -I. -I./ -I.. -o benchmark -DSSE2 -DP4 -march=pentium4 -malign-double -fomit-frame-pointer -ffloat-store -D_GAUGE_COPY -O Hopping_Matrix.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c -lm + +mpicc (gcc) general, four dimensional parallelisation: +mpicc -std=c99 -I. -I./ -I.. 
-o benchmark -O3 -DMPI -DPARALLELXYZT -D_GAUGE_COPY -O Hopping_Matrix.c Hopping_Matrix_nocom.c xchange_deri.c xchange_field.c xchange_gauge.c xchange_halffield.c xchange_lexicfield.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c init_dirac_halfspinor.c -lm + + +xlc and IBM powerpc (threadsave: xlc_r): +xlc_r -I. -I./ -I.. -o benchmark -q64 -qsrcmsg -DXLC -D_GAUGE_COPY -O3 -qhot Hopping_Matrix.c Hopping_Matrix_nocom.c xchange.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c -lm + +mpcc and IBM powerpc and _one_ dimensional parallelisation (threadsave:mpcc_r): +mpcc_r -I. -I./ -I.. -o benchmark -q64 -qsrcmsg -DXLC -DMPI -DPARALLELT -D_GAUGE_COPY -O3 -qhot Hopping_Matrix.c Hopping_Matrix_nocom.c xchange.c mpi_init.c geometry_eo.c test/check_xchange.c test/check_geometry.c boundary.c start.c ranlxd.c init_gauge_field.c init_geometry_indices.c init_moment_field.c init_spinor_field.c read_input.c benchmark.c update_backward_gauge.c D_psi.c ranlxs.c -lm + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/LapH_ev.c b/qcd/part_cpu/applications/QCD/src/kernel_D/LapH_ev.c new file mode 100644 index 0000000000000000000000000000000000000000..4f121d876feada24d574f71f2ec17581a1f0de7e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/LapH_ev.c @@ -0,0 +1,210 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/* + * Program for computing the eigensystem of the Laplacian operator + * Authors Luigi Scorzato, Marco Cristoforetti + * + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#else +#error "no config.h" +#endif +#include +#include +#include +#include +#include +#if (defined BGL && !defined BGP) +# include +#endif +#ifdef MPI +# include +#endif +#include "global.h" +#include +#include +#include "su3.h" +#include "ranlxd.h" +#include "geometry_eo.h" +#include "read_input.h" +#include "start.h" +#include "xchange/xchange.h" +#include "init/init.h" +#include "mpi_init.h" +#include "solver/eigenvalues_Jacobi.h" + +int main(int argc,char *argv[]) +{ + int tslice,j,k; + char conf_filename[50]; + +#ifdef MPI + MPI_Init(&argc, &argv); +#endif + + /* Read the input file */ + read_input("LapH.input"); + + tmlqcd_mpi_init(argc, argv); + + if(g_proc_id==0) { +#ifdef SSE + printf("# The code was compiled with SSE instructions\n"); +#endif +#ifdef SSE2 + printf("# The code was compiled with SSE2 instructions\n"); +#endif +#ifdef SSE3 + printf("# The code was compiled with SSE3 instructions\n"); +#endif +#ifdef P4 + printf("# The code was compiled for Pentium4\n"); +#endif +#ifdef OPTERON + printf("# The code was compiled for 
AMD Opteron\n"); +#endif +#ifdef _GAUGE_COPY + printf("# The code was compiled with -D_GAUGE_COPY\n"); +#endif +#ifdef BGL + printf("# The code was compiled for Blue Gene/L\n"); +#endif +#ifdef BGP + printf("# The code was compiled for Blue Gene/P\n"); +#endif +#ifdef _USE_HALFSPINOR + printf("# The code was compiled with -D_USE_HALFSPINOR\n"); +#endif +#ifdef _USE_SHMEM + printf("# the code was compiled with -D_USE_SHMEM\n"); +# ifdef _PERSISTENT + printf("# the code was compiled for persistent MPI calls (halfspinor only)\n"); +# endif +#endif +#ifdef MPI +# ifdef _NON_BLOCKING + printf("# the code was compiled for non-blocking MPI calls (spinor and gauge)\n"); +# endif +#endif + printf("\n"); + fflush(stdout); + } + + +#ifndef WITHLAPH + printf(" Error: WITHLAPH not defined"); + exit(0); +#endif +#ifdef MPI +#ifndef _INDEX_INDEP_GEOM + printf(" Error: _INDEX_INDEP_GEOM not defined"); + exit(0); +#endif +#ifndef _USE_TSPLITPAR + printf(" Error: _USE_TSPLITPAR not defined"); + exit(0); +#endif +#endif +#ifdef FIXEDVOLUME + printf(" Error: FIXEDVOLUME not allowed"); + exit(0); +#endif + + + init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0); + init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand); + + if(g_proc_id == 0) { + fprintf(stdout,"The number of processes is %d \n",g_nproc); + printf("# The lattice size is %d x %d x %d x %d\n", + (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ)); + printf("# The local lattice size is %d x %d x %d x %d\n", + (int)(T), (int)(LX), (int)(LY),(int) LZ); + printf("# Computing LapH eigensystem \n"); + + fflush(stdout); + } + + /* define the geometry */ + geometry(); + + start_ranlux_KD(1, 123456); + + /* Read Gauge field */ + sprintf(conf_filename, "%s.%.4d", gauge_input_filename, nstore); + if (g_cart_id == 0) { + printf("#\n# Trying to read gauge field from file %s in %s precision.\n", + conf_filename, (gauge_precision_read_flag == 32 ? 
"single" : "double")); + fflush(stdout); + } + if( (j = read_gauge_field(conf_filename,g_gauge_field)) !=0) { + fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", j, conf_filename); + exit(-2); + } + + + if (g_cart_id == 0) { + printf("# Finished reading gauge field.\n"); + fflush(stdout); + } + +#ifdef MPI + /*For parallelization: exchange the gaugefield */ + xchange_gauge(g_gauge_field); +#endif + + /* Init Jacobi field */ + init_jacobi_field(SPACEVOLUME+SPACERAND,3); + +#ifdef MPI + { + /* for debugging in parallel set i_gdb = 0 */ + volatile int i_gdb = 8; + char hostname[256]; + gethostname(hostname, sizeof(hostname)); + printf("PID %d on %s ready for attach\n", getpid(), hostname); + fflush(stdout); + if(g_cart_id == 0){ + while (0 == i_gdb){ + sleep(5); + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); +#endif + + for (k=0 ; k<3 ; k++) + random_jacobi_field(g_jacobi_field[k],SPACEVOLUME); + + + /* Compute LapH Eigensystem */ + + for(tslice=0; tslice $@ + +# dirty hack to prevent make from entering an infinite loop because a phony target is given as a real +# dependency (make will build invert.d and hmc_tm.d indefinitely) +# when git_hash.h does not exist (as checked using wildcard) it is given as a dependency of invert.d and hmc_tm.d +# once it exists, this is no longer the case +# while this does break updating of git_hash.h while the dependencies are built, this is quite +# irrelevant because it will be rebuilt during the compilation of either invert or hmc_tm +ifneq (git_hash.h, $(findstring git_hash.h,$(wildcard $(top_srcdir)/git_has*.h))) +$(addsuffix .d, $(filter ${PROGRAMS_WITH_GIT_HASH},${ALLOBJ})): %.d: ${srcdir}/%.c ${top_srcdir}/git_hash.h Makefile + @ $(CCDEP) ${DEPFLAGS} ${DEFS} ${INCLUDES} $< > $@ +else +$(addsuffix .d, $(filter ${PROGRAMS_WITH_GIT_HASH},${ALLOBJ})): %.d: ${srcdir}/%.c Makefile + @ $(CCDEP) ${DEPFLAGS} ${DEFS} ${INCLUDES} $< > $@ +endif + +${top_builddir}/fixed_volume.h: ${top_srcdir}/fixed_volume.h.in 
${top_builddir}/config.status + cd ${abs_top_builddir} && CONFIG_FILES=fixed_volume.h CONFIG_HEADERS= $(SHELL) ${top_builddir}/config.status + +all-recursive all-debug-recursive all-profile-recursive clean-recursive distclean-recursive compile-clean-recursive: Makefile + @set fnord ${MAKEFLAGS}; amf=$$2; \ + dot_seen=no; \ + target=`echo $@ | sed s/-recursive//`; \ + list='$(SUBDIRS)'; for subdir in $$list; do \ + echo "Making $$target in $$subdir"; \ + local_target="$$target"; \ + ( cd $$subdir && $(MAKE) $$local_target ) \ + || case "$$amf" in *=*) exit 1;; *k*) fail=yes;; *) exit 1;; esac; \ + done; test -z "$$fail"; diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_D/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..58859f1eaec3997ade85193a617ac3869c183dd6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/Makefile.in @@ -0,0 +1,158 @@ +srcdir = @srcdir@ +top_srcdir = @top_srcdir@ +abs_top_srcdir = @abs_top_srcdir@ +top_builddir = . +abs_top_builddir = @abs_top_builddir@ +builddir = @builddir@ +prefix = @prefix@ +exec_prefix = @exec_prefix@ +bindir = @bindir@ +program_transform_name = @program_transform_name@ +subdir = . 
+ +AR = @AR@ +RANLIB = @RANLIB@ +CC = @CC@ +CCDEP = @CCDEP@ +CFLAGS = @CFLAGS@ +LDFLAGS = @LDFLAGS@ +DEPFLAGS = @DEPFLAGS@ +CPPFLAGS = @CPPFLAGS@ +CCLD = @CCLD@ +LEX = @LEX@ +AUTOCONF = @AUTOCONF@ +LIBS = @LIBS@ +SHELL = @SHELL@ +OPTARGS = @OPTARGS@ +SOPTARGS = @SOPTARGS@ +DEFS = @DEFS@ +GPUDIR = @GPUDIR@ +USESUBDIRS = @USESUBDIRS@ +NVCC = @NVCC@ +GPUMPICOMPILER = @GPUMPICOMPILER@ + +INCLUDES = @INCLUDES@ +LINK = $(CCLD) -o $@ ${LDFLAGS} + +COMPILE = ${CC} ${DEFS} ${INCLUDES} -o $@ ${CFLAGS} + +SMODULES = + +MODULES = read_input gamma measure_gauge_action start \ + expo matrix_utils get_staples update_backward_gauge \ + measure_rectangles get_rectangle_staples \ + test/check_geometry test/check_xchange \ + test/overlaptests \ + invert_eo invert_doublet_eo update_gauge \ + getopt sighandler reweighting_factor \ + source_generation boundary update_tm ranlxd \ + mpi_init deriv_Sb deriv_Sb_D_psi ranlxs \ + geometry_eo invert_overlap aligned_malloc \ + prepare_source chebyshev_polynomial_nd Ptilde_nd \ + reweighting_factor_nd rnd_gauge_trafo \ + update_momenta integrator phmc \ + little_D block operator \ + temporalgauge spinor_fft X_psi P_M_eta \ + jacobi fatal_error invert_clover_eo gettime @SPI_FILES@ \ + @QUDA_INTERFACE@ + +## the GPU modules (all .cu files in $GPUDIR) +GPUSOURCES := $(wildcard $(srcdir)/$(GPUDIR)/*.cu) +GPUOBJECTS := $(patsubst $(srcdir)/$(GPUDIR)/%.cu, $(GPUDIR)/%.o, $(GPUSOURCES)) + +#GPUSOURCES_C := $(wildcard $(srcdir)/$(GPUDIR)/*.c) +#GPUOBJECTS_C := $(patsubst $(srcdir)/$(GPUDIR)/%.c, $(GPUDIR)/%.o, $(GPUSOURCES_C)) + +NOOPTMOD = test/check_xchange test/check_geometry + +PROGRAMS = hmc_tm benchmark invert gen_sources \ + check_locallity test_lemon hopping_test LapH_ev \ + offline_measurement + +ALLOBJ = ${MODULES} ${PROGRAMS} ${SMODULES} +SUBDIRS = ${USESUBDIRS} + +# delete the default suffix rules +.SUFFIXES: + +# need to build modules before subdirs! 
+all: Makefile dep $(SUBDIRS) hmc_tm invert benchmark offline_measurement + +$(SUBDIRS): + $(MAKE) --directory=$@ + +# run the GIT-VERSION-GEN script to generate version information in git_hash.h +# making sure that we run in the correct directory +${top_srcdir}/git_hash.h: + @ ( cd @srcdir@ && sh GIT-VERSION-GEN ) + +-include $(addsuffix .d,$(ALLOBJ)) + +include ${top_srcdir}/Makefile.global + +${top_srcdir}/read_input.c: ${top_srcdir}/read_input.l +ifneq (,$(findstring lex,${LEX})) + ${LEX} -Ptmlqcd -i -t ${top_srcdir}/read_input.l > ${top_srcdir}/read_input.c +else + $(error Unable to find (f)lex, read_input.c not built. Please install (f)lex!) +endif + +libhmc.a: ${addsuffix .o, ${MODULES} ${SMODULES}} Makefile + @rm -f libhmc.a + @${AR} cru libhmc.a ${addsuffix .o, ${MODULES} ${SMODULES}} + @$(RANLIB) libhmc.a + @cp libhmc.a ${top_builddir}/lib/libhmc.a + +$(addsuffix .o,$(filter-out ${NOOPTMOD},${MODULES})): %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/config.h + ${COMPILE} ${OPTARGS} -c $< + +#here we don't need optimisation +$(addsuffix .o,$(filter ${NOOPTMOD},${MODULES})): %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/config.h + ${COMPILE} -c $< + +${addsuffix .o, ${SMODULES}}: %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/config.h + ${COMPILE} ${SOPTARGS} -c $< + +${addsuffix .o, ${PROGRAMS}}: %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/config.h ${top_srcdir}/git_hash.h + ${COMPILE} ${OPTARGS} -c $< + +${PROGRAMS}: %: %.o libhmc.a $(SUBDIRS) + ${LINK} $@.o $(GPUOBJECTS) $(GPUOBJECTS_C) $(LIBS) + +# The rules for unit tests are kept in a separate file for tidyness +include ${top_srcdir}/Makefile.tests + +dep: $(addsuffix .d,$(ALLOBJ)) + @ echo "...dependency files built" + +install: Makefile + @mkdir -p $(bindir); \ + for p in hmc_tm invert; do \ + progname=`echo $$p | sed '$(program_transform_name)'`; \ + echo "Installing $$p as $$progname in $(bindir)..."; \ + cp $$p $(bindir)/$$progname; \ + done; \ + echo "done"; + 
+uninstall: Makefile + for p in hmc_tm invert; do \ + progname=`echo $$p | sed '$(program_transform_name)'`; \ + echo "Un-Installing $$progname in $(bindir)..."; \ + rm $(bindir)/$$progname; \ + done; \ + echo "done"; + +compile-clean: compile-clean-recursive Makefile + rm -f *.o *.d test/*.o test/*.d tests/*.o tests/*.d + +clean: clean-recursive Makefile + rm -f benchmark hmc_tm invert *.o *.d test/*.o test/*.d tests/*.o tests/*.d + +distclean: distclean-recursive Makefile + rm -f benchmark hmc_tm invert *.o *.d *~ Makefile config.log config.status fixed_volume.h + rm -f config.h + +.PHONY: all ${SUBDIRS} ${top_srcdir}/git_hash.h clean compile-clean distclean dep install \ + all-recursive all-debug-recursive all-profile-recursive \ + clean-recursive distclean-recursive \ + compile-clean-recursive diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/Makefile.old b/qcd/part_cpu/applications/QCD/src/kernel_D/Makefile.old new file mode 100644 index 0000000000000000000000000000000000000000..67b471a73039c0944da171ade55c08f102e87d6f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/Makefile.old @@ -0,0 +1,37 @@ +include Makefile.defs.in_local + +.c.o: + $(CC) $(CFLAGS) -I. -I.. -lm -c -o $@ $< + +OBJS = \ + benchmark.o \ + geometry_eo.o \ + init_dirac_halfspinor.o \ + init_moment_field.o \ + ranlxd.o \ + start.o \ + xchange_field.o \ + xchange_lexicfield.o \ + boundary.o \ + Hopping_Matrix.o \ + init_gauge_field.o \ + init_spinor_field.o \ + ranlxs.o \ + update_backward_gauge.o \ + xchange_gauge.o \ + D_psi.o \ + Hopping_Matrix_nocom.o \ + init_geometry_indices.o \ + mpi_init.o \ + read_input.o \ + xchange_deri.o \ + xchange_halffield.o \ + test/check_geometry.o \ + test/check_xchange.o + + +benchmark: $(OBJS) + $(CC) $(CFLAGS) $(OBJS) -I. -I.. 
-lm -o benchmark + +clean: + $(RM) $(OBJS) ../kernel_D.a benchmark \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/Makefile.tests b/qcd/part_cpu/applications/QCD/src/kernel_D/Makefile.tests new file mode 100644 index 0000000000000000000000000000000000000000..a9a393ac658208c5c92fb7d6a45e9c42caea6793 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/Makefile.tests @@ -0,0 +1,64 @@ +TESTS = tests/test_sample tests/test_su3 tests/test_buffers tests/test_qpx tests/test_linalg tests/test_clover tests/test_rat + +TEMP = $(patsubst %.c,%,$(wildcard $(top_srcdir)/tests/*.c)) +TESTMODULES = $(patsubst $(top_srcdir)/%,%,$(TEMP)) + +TESTFLAGS = -L$(top_builddir)/cu/ -lcu + +$(addsuffix .o,$(TESTMODULES)): %.o : $(top_srcdir)/%.c + ${COMPILE} -c $(OPTARGS) ${DEFS} $< + +# The linking stage needs to be differentiated because different tests rely on +# different modules from the codebase +# Each test itself consists of a number of modules that need to be linked. 
+ +# when used as a prerequisite, the wildcard with "tests/test_sample*.c" replaced by "$@*.c" is not evaluated +# correctly, even though it works perfectly in an echo statement, it results in make +# trying to compile all objects in top_srcdir +# we therefore evaluate the wildcard into a variable + +TEST_SAMPLE_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_sample*.c)) +TEST_SAMPLE_FLAGS:= +TEST_SAMPLE_LIBS:=$(top_builddir)/cu/libcu.a +tests/test_sample: $(TEST_SAMPLE_OBJECTS) $(TEST_SAMPLE_LIBS) + ${LINK} $(TEST_SAMPLE_OBJECTS) $(TESTFLAGS) $(TEST_SAMPLE_FLAGS) + +TEST_SU3_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_su3*.c)) expo.o +TEST_SU3_FLAGS:=-lm +TEST_SU3_LIBS:=$(top_builddir)/cu/libcu.a +tests/test_su3: $(TEST_SU3_OBJECTS) $(TEST_SU3_LIBS) + ${LINK} $(TEST_SU3_OBJECTS) $(TESTFLAGS) $(TEST_SU3_FLAGS) + +TEST_QPX_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_qpx*.c)) +TEST_QPX_FLAGS:=-lm +TEST_QPX_LIBS:=$(top_builddir)/cu/libcu.a +tests/test_qpx: $(TEST_QPX_OBJECTS) $(TEST_QPX_LIBS) + ${LINK} $(TEST_QPX_OBJECTS) $(TESTFLAGS) $(TEST_QPX_FLAGS) + +TEST_LINALG_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_linalg*.c)) +TEST_LINALG_FLAGS:=-lm +TEST_LINALG_LIBS:=$(top_builddir)/cu/libcu.a $(top_builddir)/linalg/liblinalg.a +tests/test_linalg: $(TEST_LINALG_OBJECTS) $(TEST_LINALG_LIBS) + ${LINK} $(TEST_LINALG_OBJECTS) $(TEST_LINALG_LIBS) $(TESTFLAGS) $(TEST_LINALG_FLAGS) + +TEST_BUFFERS_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_buffers*.c)) fatal_error.o +TEST_BUFFERS_FLAGS:=-lbuffers -L$(top_builddir)/buffers/ +TEST_BUFFERS_LIBS:=$(top_builddir)/cu/libcu.a $(top_builddir)/buffers/libbuffers.a +tests/test_buffers: $(TEST_BUFFERS_OBJECTS) $(TEST_BUFFERS_LIBS) + ${LINK} $(TEST_BUFFERS_OBJECTS) $(TESTFLAGS) $(TEST_BUFFERS_FLAGS) + +TEST_CLOVER_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard 
$(top_srcdir)/tests/test_clover*.c)) operator/clover_leaf.o +TEST_CLOVER_FLAGS:=-lm -lhmc -llinalg +TEST_CLOVER_LIBS:=$(top_builddir)/cu/libcu.a +tests/test_clover: $(TEST_CLOVER_OBJECTS) $(TEST_CLOVER_LIBS) + ${LINK} $(TEST_CLOVER_OBJECTS) $(TESTFLAGS) $(TEST_CLOVER_FLAGS) + +TEST_RAT_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_rat*.c)) +TEST_RAT_FLAGS:=-lm -lrational +TEST_RAT_LIBS:=$(top_builddir)/cu/libcu.a +tests/test_rat: $(TEST_RAT_OBJECTS) $(TEST_RAT_LIBS) + ${LINK} $(TEST_RAT_OBJECTS) $(TESTFLAGS) $(TEST_RAT_FLAGS) + + +tests: ${TESTS} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/Makefile_main b/qcd/part_cpu/applications/QCD/src/kernel_D/Makefile_main new file mode 100644 index 0000000000000000000000000000000000000000..f79629df10b128d778cf2e9c4ce7e8b5b65f065f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/Makefile_main @@ -0,0 +1,39 @@ +include Makefile.defs.in_local + +.c.o: + $(CC) $(CFLAGS) -I. -I.. -I/usr/lib/openmpi/include -lm -c -o $@ $< + +OBJS = \ + benchmark_main.o \ + gettime.o \ + geometry_eo.o \ + init/init_dirac_halfspinor.o \ + init/init_moment_field.o \ + ranlxd.o \ + start.o \ + xchange/xchange_field.o \ + xchange/xchange_lexicfield.o \ + boundary.o \ + operator/Hopping_Matrix.o \ + init/init_gauge_field.o \ + init/init_spinor_field.o \ + ranlxs.o \ + update_backward_gauge.o \ + xchange/xchange_gauge.o \ + operator/D_psi.o \ + operator/Hopping_Matrix_nocom.o \ + init/init_geometry_indices.o \ + mpi_init.o \ + read_input.o \ + fatal_error.o \ + xchange/xchange_deri.o \ + xchange/xchange_halffield.o \ + test/check_xchange.o \ + test/check_geometry.o + + +benchmark_main: $(OBJS) + $(CC) $(CFLAGS) $(OBJS) -I. -I.. 
-I/usr/lib/openmpi/include -lm -o benchmark_main + +clean: + $(RM) $(OBJS) benchmark_main \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/Makefile_tmp b/qcd/part_cpu/applications/QCD/src/kernel_D/Makefile_tmp new file mode 100644 index 0000000000000000000000000000000000000000..36b9206c51b1cee87ee89f549c0297932e9b1f3a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/Makefile_tmp @@ -0,0 +1,133 @@ +srcdir = . +top_srcdir = . +abs_top_srcdir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +top_builddir = . +abs_top_builddir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +builddir = . +prefix = /home/jacob +exec_prefix = ${prefix} +bindir = ${exec_prefix}/bin +program_transform_name = s,x,x, +subdir = . + +AR = ar +RANLIB = ranlib +CC = mpicc +CCDEP = gcc +CFLAGS = -std=c99 -fopenmp -pedantic -Wall +LDFLAGS = -L${HOME}/lib -L${top_builddir}/lib +DEPFLAGS = -MM +CPPFLAGS = +CCLD = mpicc +LEX = flex +AUTOCONF = autoconf +LIBS = -loperator -linit -llinalg /usr/lib/lapack/liblapack.so.3 /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath -lm +SHELL = /bin/bash +OPTARGS = -O +SOPTARGS = -O +DEFS = -DHAVE_CONFIG_H +GPUDIR = +USESUBDIRS = operator linalg +NVCC = +GPUMPICOMPILER = + +INCLUDES = -I$(HOME)/include/ -I. 
-I${abs_top_builddir}/ -I${abs_top_srcdir}/ -I/include/ -I/include/ +LINK = $(CCLD) -o $@ ${LDFLAGS} + +COMPILE = ${CC} ${DEFS} ${INCLUDES} -o $@ ${CFLAGS} + +SMODULES = + +MODULES = read_input gamma measure_gauge_action start \ + expo matrix_utils get_staples update_backward_gauge \ + measure_rectangles get_rectangle_staples \ + test/check_geometry test/check_xchange \ + test/overlaptests \ + invert_eo invert_doublet_eo update_gauge \ + getopt sighandler reweighting_factor \ + source_generation boundary update_tm ranlxd \ + mpi_init deriv_Sb deriv_Sb_D_psi ranlxs \ + geometry_eo invert_overlap aligned_malloc \ + prepare_source chebyshev_polynomial_nd Ptilde_nd \ + reweighting_factor_nd rnd_gauge_trafo \ + update_momenta integrator phmc \ + little_D block operator \ + temporalgauge spinor_fft X_psi P_M_eta \ + jacobi fatal_error invert_clover_eo gettime \ + + +## the GPU modules (all .cu files in $GPUDIR) +GPUSOURCES := $(wildcard $(srcdir)/$(GPUDIR)/*.cu) +GPUOBJECTS := $(patsubst $(srcdir)/$(GPUDIR)/%.cu, $(GPUDIR)/%.o, $(GPUSOURCES)) + +#GPUSOURCES_C := $(wildcard $(srcdir)/$(GPUDIR)/*.c) +#GPUOBJECTS_C := $(patsubst $(srcdir)/$(GPUDIR)/%.c, $(GPUDIR)/%.o, $(GPUSOURCES_C)) + +NOOPTMOD = test/check_xchange test/check_geometry + +PROGRAMS = benchmark + +ALLOBJ = ${MODULES} ${PROGRAMS} ${SMODULES} +SUBDIRS = ${USESUBDIRS} + +# delete the default suffix rules +.SUFFIXES: + +# need to build modules before subdirs! +all: Makefile dep $(SUBDIRS) benchmark + +$(SUBDIRS): + $(MAKE) --directory=$@ + +# run the GIT-VERSION-GEN script to generate version information in git_hash.h +# making sure that we run in the correct directory +${top_srcdir}/git_hash.h: + @ ( cd . 
&& sh GIT-VERSION-GEN ) + +-include $(addsuffix .d,$(ALLOBJ)) + +include ${top_srcdir}/Makefile.global + +${top_srcdir}/read_input.c: ${top_srcdir}/read_input.l +ifneq (,$(findstring lex,${LEX})) + ${LEX} -Ptmlqcd -i -t ${top_srcdir}/read_input.l > ${top_srcdir}/read_input.c +else + $(error Unable to find (f)lex, read_input.c not built. Please install (f)lex!) +endif + +$(addsuffix .o,$(filter-out ${NOOPTMOD},${MODULES})): %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/config.h + ${COMPILE} ${OPTARGS} -c $< + +#here we don't need optimisation +$(addsuffix .o,$(filter ${NOOPTMOD},${MODULES})): %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/config.h + ${COMPILE} -c $< + +${addsuffix .o, ${SMODULES}}: %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/config.h + ${COMPILE} ${SOPTARGS} -c $< + +${addsuffix .o, ${PROGRAMS}}: %.o: ${srcdir}/%.c %.d Makefile $(abs_top_builddir)/config.h ${top_srcdir}/git_hash.h + ${COMPILE} ${OPTARGS} -c $< + +${PROGRAMS}: %: %.o $(SUBDIRS) + ${LINK} $@.o $(GPUOBJECTS) $(GPUOBJECTS_C) $(LIBS) + +# The rules for unit tests are kept in a separate file for tidyness +include ${top_srcdir}/Makefile.tests + +dep: $(addsuffix .d,$(ALLOBJ)) + @ echo "...dependency files built" + +compile-clean: compile-clean-recursive Makefile + rm -f *.o *.d test/*.o test/*.d tests/*.o tests/*.d + +clean: clean-recursive Makefile + rm -f benchmark *.o *.d test/*.o test/*.d tests/*.o tests/*.d + +distclean: distclean-recursive Makefile + rm -f benchmark *.o *.d *~ Makefile config.log config.status fixed_volume.h + rm -f config.h + +.PHONY: all ${SUBDIRS} ${top_srcdir}/git_hash.h clean compile-clean distclean dep install \ + all-recursive all-debug-recursive all-profile-recursive \ + clean-recursive distclean-recursive \ + compile-clean-recursive diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/NEWS b/qcd/part_cpu/applications/QCD/src/kernel_D/NEWS new file mode 100644 index 
0000000000000000000000000000000000000000..561e76ff864340720a007d2d85ad374106457b3c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/NEWS @@ -0,0 +1,403 @@ +08.09.2011: added code LapH_ev for the computation of the eigensystem + for the Laplacian Heaviside method (0905.2160 [hep-lat]). + In order to use it configure with --disable-halfspinor --enable-laph (serial), + and also with --enable-indexindepgeom --enable-tsplitpar --with-mpidimension=XYZ + --with-lemondir=${lemondir} (parallel). + Compile with make LapH_ev. + +23.08.2010: indexindependentgeom (see below) extended to naive + communications (implemented with MPI_Sendrecv) and halfspinor. + +16.04.2010: mixed precision CG implemented for invert and HMC + determination of the Ptilde degree optimised + new input parameter for NDPOLY monomial: MaxPtildeDegree + +16.04.2010: as online measurements are available CORRELATORS, PIONNORM, POLYAKOV + syntax + BeginMeasurement TYPE + option = value + EndMeasurement + +March.2010: parallel IO implemented and tested to work. + new operator syntax in input file + Begin Operator TYPE + option=value + EndOperator + +10.02.2010: New hopping_test.c routine to test the parallel version of + a Hopping_Matrix + +10.02.2010: when compiling with --enable-tsplitpar, the communications in the + Hopping Matrix are split and performed after each timeslice. + This allows the overlap of communications and computations. + tsplitpar needs indexindepgeom, SSE2/3, gaugecopy + +10.02.2010: when compiling with --enable-indexindepgeom, the xchange routines do not + rely on absolute addressing anymore, but on addresses gI_?_?_?_? which are + defined based on the function Index. + This enables the parallelization in X, XY, XYZ directions + which is obtained --with-mpidimension=XYZ ... + indexindepgeom is still not compatible with blocking-mpi of halfspinors + +13.11.2009: parallel IO for gauge fields tested and working. 
+ parallel reading of Checksum still buggy, but not essential + +09.11.2009: MMS successfully checked. Little code for checking + in contractions/check_propagators/check_mms_props.cc + +04.06.2009: bug in eigensolver routine for ND case fixed + parallel IO included using lemon library + Schroedinger functional boundary conditions for + gauge case included + +14.05.2009: multiple mass solver (CG) for twisted mass + implemented, input is solverflag = cgmms + the number of additional masses has to be specified with + CGMMSNoExtraMasses = N + and the extra masses have to be given line by line in a file called + "extra_masses.input" + The CGMMS solver writes for every mass (extra + the original one) one + propagator storing the result of (Q^\dagger Q)^(-1) \gamma_5 \phi + where \phi is the source. + Hence the result for +\mu and -\mu can be extracted at any later + stage, respectively. + +14.05.2009: in the propagator files we now store additional info, namely + the xlf-info message is copied from the original gauge field, if + existing, as well as the scidac-checksum and the ildg-data-lfn + messages. This should ease the mapping from a propagator to the corresponding + gauge configuration file. + +01.08.2008: online measurements implemented for + PP and PA correlators. Input variables are + PerformOnlineMeasurements = yes|no and + OnlineMeasuremntsFreq = n + Result will be written to a file called + onlinemeas.trajno + tested to work scalar and parallel + +30.05.2008: support now for three formats for source and propagator, for compatibility: + cmi, GWC, ETMC. The latter is the standard and recommended. + +07.12.2007: inversion for the flavour split doublet implemented and tested + new spinor field IO format implemented and tested + scidac checksums implemented + +19.10.2007: hmc runs now also without even/odd preconditioning. + Set UseEvenOdd=no in the input file (thanks to Jan Volkholz). + +05.09.2007: invert supports now also inversions using D_psi. 
This is + triggered by UseEvenOdd (default = yes) as input parameter. + +04.09.2007: D_psi in three versions tested, also parallel + versions. BG/L version of deriv_Sb still not checked. + +31.08.2007: D_psi for full spinor field (no even/odd) implemented and tested + for SSE2 and normal version. new deriv_Sb implemented and tested. + BG/L versions still need testing. + +08.08.2007: all integration schemes available for the PHMC + new input parameter + TimeScaleHeavyDoublet + which must be set to an integer >=0 specifying the timescale + on which to integrate the heavy doublet. Note that 0 is + the smallest possible timescale, which is the smallest + timescale available for pseudo fermion fields. (the gauge + field is one scale below) + not completely tested yet. + +15.04.2007: eigenvalue computation for squared one flavour operator + implemented, few new input parameters connected to this, + see documentation. + Preconditioned CG added and tested with eigenvalues + estimates. However, CG with exact eigenvector subspace + projected out is faster. + + Sloppy precision now also available for CG. + +03.04.2007: Addition of 2 input parameters: + SplittedPropagator and SourceLocation + New function in start.c to use the SourceLocation + when its value is different from 0. + The same modification was done in the GWC code. + +01.04.2007: many changes to PHMC + new input parameters: + PhmcNoFlavours (2+1+1 or 1+1) + PhmcComputeOnlyEVs (yes or no: compute EVs only and exit then) + PhmcStildeMax (float, upper bound for appr. interval) + PhmcStildeMin (float, lower bound for appr. interval) + PhmcDegreeOfP (int, degree of P, the less precise polynomial) + PhmcRecEVInterval (int, recompute EV's every n trajectories) + + parallel Eigenvalue computation works now, also on BG/L + + PHMC for 1+1 flavours exists and works + +18.01.2007: towards version 4.0 + merged with branch phmc. hmc_tm and phmc_tm + are both compiling and running. phmc needs + lapack. 
+ phmc has so far only one integration scheme + available. The PHMC code is tested against a + code of I. Montvay. + Credits to T. Chiarappa for the PHMC, see + doc directory for more details. + Tested so far only on PC's + +15.01.2007: stout smearing for invert implemented by Craig McNeile + input parameters are: UseStoutSmearing, StoutRho and + StoutNoIterations (hopefully self explaining) + Tested against Chroma-3.17.0 + + stouting for hmc not yet implemented + +12.12.2006: run time call for dram window on BG/L implemented (--with-bgldram) + persistent MPI for halfspinor available (--with-persistentmpi) + non-blocking MPI calls also for gauge fields implemented + now generally available with --with-nonblockingmpi for all platforms + The BG/L performance is now close to 18% peak. + + Improved performance for opteron CPU's + +02.09.2006: New Dirac operator implemented and working + On BG/L this brings a 20% improvement + it can be switched on with --enable-newdiraop + + The exchange routines are now also with shmem API + available. Usable with --enable-shmem. this + feature is not yet completely tested. + +15.04.2006: Various new input parameters, see doc/input.tex. + removing of hmc.reread now also works on BG/L. + The file conf.save is now almost always safe: a new + configuration is first stored in .conf.temp and then moved + to conf.save. There might be still a problem in case the + job crashes when .nstore_counter is written... + +28.03.2006: chronological solver guess now independent of lapack + the current filename is now saved in .nstore_counter + last_configuration and last_state are therefore obsolete. + random number state now saved in lime format in the gauge file + rlxd_state files are not any longer produced, but will still be + read. + +14.02.2006: --enable-gaugecopy will work now also without + --enable-eogeom . But it requires two additional + copies of the gauge fields instead of one. 
+ + A reasonably well tuned BG/L Dirac operator available + about 10% of peak scaling up to 2048 processors. + +13.02.2006: four dimensional parallelisation more or less tested + +12.02.2006: Bug in all but four dimensional parallelisation + fixed. + +10.02.2006: package will compile only with lime >= 1.2.3! + +10.02.2006: Build in separate directory possible now. + +09.02.2006: Bug in io.c fixed + +08.02.2006: Bug in write ILDG configs fixed + +07.02.2006: In case of 4-dim. parallelisation the product + T*LX*LY _must_ be even, otherwise the field exchange + in z-direction does not work. + +07.02.2006: 3 and 4 dimensional parallelisation implemented + working both so far _only_ with --disable-eogeom + (maybe configure then also --disable-gaugecopy) + New input parameter NrZProcs + local LZ _must_ be even + + everything is not yet finally tested. + +06.02.2006: Added the possibility for fixed volume at compiletime + --with-fixedvolume + please edit fixed_volume.h accordingly + not at all tested! + +30.01.2006: New input parameter NrYProcs + +29.01.2006: running and partially tested on the BGL in Juelich + configure with + ./configure --host=powerpc64-bgl-linux-gnu --without-lapack CC=/opt/ibmcmp/vac/7.0/bin/blrts_xlc + and other options. Also lime needs to be configured + with ./configure CC=/opt/ibmcmp/vac/7.0/bin/blrts_xlc + please use lime-1.2.3 _at least_! Earlier versions might not + work. + +30.01.2006: at least for BGL lime needs to be configured with + ./configure CC=/opt/ibmcmp/vac/7.0/bin/blrts_xlc --enable-largefile + and (remember) lime-1.2.3. + +25.01.2006: Trying to setup a versioning system: + last digit -> bug fixes + second digit odd -> development Version + second digit even -> stable release + + +25.01.2006: changed --disable-lapack to --without-lapack ... + +05.01.2006: By setting the history parameters for the CSG to zero one gets + now a zero spinor as trial guess for the solvers. 
+ Moreover, by specifying --disable-lapack to configure it is + possible to compile without the need of the external libs lapack + and blas and the fortran-lib. + +14.11.2005: For the flavour non-degenerate eigenvalues computation, a new + structure, called bispinor, has been introduced. Consequently, + a serial and a parallel new Jacoby-Davidson routine working + with bispinors has been implemented, as well as two new solvers, + bicgstab_complex and cg_her, respectively. A bunch of linear + algebra files have been adapted to work with bispinors. All these + files are distinguished by the suffix "_bi" (e.g: "file_bi.c"). + +14.11.2005: New global parameters (g_mubar, g_epsbar) have been introduced. + These are the mass parameters needed in the eigenvalues + computation of the flavour non-degenerate case Dirac operator. + +11.11.2005: gcc-4.x does not use libg2c anymore. One has to link + against gfortran, which is done now. + +26.10.2005: Added gauge file format conversion programs for + gwc -> ildg and ildg -> gwc + + All solver have additional parameter now. It is now possible + to invert with realtive precision. Moreover, the propagator + (or source) format of Chris Michael can be read in. + The new input parameter is: + SourceFormat = cmi (otherwise gwc assumed) + other related input parameters are + ReadSource = yes + SolverPrecision = 1.e-10 + SourceInputFilename = random_test + UseRelativePrecision = yes + +11.08.2005: 2MN integrator implemented and tested. Two versions + available: velocity and position version (hep-lat/0505020). + Integrator=2MN or 2MNposition + +29.06.2005: ILDG LIME file format introduced. + old file format deprecated. But it will be still + automatically detected and read in. + + trajectory counter introduced which will now allow + to correctly keep the Nskip's between the confs. + this is as well as the plaquette value stored as + xlf-info record in the new LIME format. 
+ +10.03.2005: Precisions in the solver for force and Acceptance are input + parameter now for each mu parameter: + ForcePrecisionMu, ForcePrecisionMu2, ForcePrecisionMu3 + AcceptancePrecisionMu, + AcceptancePrecisionMu2, AcceptancePrecisionMu3 + + ExtIntStepsMu0 is now called IntegrationStepsMu, + ExtIntStepsMu1 is now called IntegrationStepsMu2, + ExtIntStepsMu2 is now called IntegrationStepsMu3, + matchin the input names for the mu parameter. + The old one are still usable. + + Added an input parameter DebugLevel to control the + debug output. Setting it to one will cause the program + to compute and print out the norms of the forces. + +17.02.2005: Reversibility check implemented. Input parameters are + ReversibilityCheck = yes|no (default no) + ReversibilityCheckIntervall = 100 (default 100) + + Precisions in the solver for force and Acceptance are input + parameter now: + ForcePrecision = float (default 1.e-7) + AcceptancePrecision = float (default 16.e-7) + + One can choose to have relative precision: + UseRelativePrecision = yes|no (default no) + +13.02.2005: Integration scheme with error cancellation implemented. + It is only implemented for the highest level and should + have errors in \delta\tau^5 only. + +07.02.2005: LX,LY,LZ now possible as input parameter. + +07.01.2005: Extended leap-frog and extended Sexton-Weingarten + integration schemes (multiple time scales) implemented + and tested. + +17.12.2004: Possibility for rereading some parameters added. + If there is a file hmc.reread, it will be parsed + automatically and deleted afterwards. It is not possible + to change T, L, RGIC1 from zero to a non zero value, NrXProcs. + +13.12.2004: Extended leapfrog integration scheme implemented + and tested. + +23.11.2004: The lattice size must be now set in the input file. + Recompilation is only needed for one or two dimensional + parallelisation. 
+ + For invert there is a new input parameter: + ReadSource = yes|no + SourceInputFilename = filename + This let's you read in a generalised source for the + inversion. The real filename must be of the form + filename${massnumber}.is${is}ic${ic}.${nstore} + +22.11.2004: DBW2 implemented for the serial code and the parallel + code with new and old geometry. + Input parameter BoundaryCond is supplemented by + BCAngleT, like in the GWC code. BoundaryCond is + deprecated now. + +28.09.2004: input parameter added: + input parameter MaxSolverIterations + and SolverPrecision available. + + history_hmc_tm file added with history of + written configurations and corresponding + Plaquette values and timestamp + +20.08.2004: SSE3 Version of the most important macros added and + tested. SSE3 versus SSE2: 1.84 Gflops versus 1.64 Gflops. + (P4 3.20GHz prescott) + +17.08.2004: 64 Bit Version of the code running and tested + Cache Optimisation for Opteron added. + New configure options: + --enable-opteron : Enables cache optimisation + for Opteron [default=no] + --enable-gaugecopy : Enables usage of a copy of + the gauge field [default=yes] + --enable-eogeom : Enables usage of EO geometry + also for the gauge fields [default=yes] + +13.08.2004 New EO geometry for gauge fields implemented also + for the 2-dim parallelisation and tested. + +04.05.2004: additional parallelisation in x-direction added + and tested. IO added as well and tested, apart + from write and read for spinor fields. + extended test functions written (hmc/test) + +05.04.2004: Bug fix. serial version without MPI running now. + Bug was in geometry_eo.c + program for thermal cycles added. + +11.03.2004: third pseudo fermion field added and tested. + +09.03.2004: .nstore_counter file introduced to easily restart the + programm. Just set InitialStoreCounter = readin. Also a + sighandler was added to savely finish the program. + +04.03.2004: Hasenbusch trick tested and working. 
+ New version of the Hopping matrix for the IBM implemented + with special improvements. Even Odd ordering translated + also to the gauge fields and tested, but not yet default. + +03.03.2004: second pseudo fermion (trick of Martin Hasenbusch) + implemented also for tmQCD + +02.03.2004: Release 1.0.1 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/P_M_eta.c b/qcd/part_cpu/applications/QCD/src/kernel_D/P_M_eta.c new file mode 100644 index 0000000000000000000000000000000000000000..6e44bbbdc6afe93054dbb9f46fd179be750ba909 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/P_M_eta.c @@ -0,0 +1,433 @@ +/*********************************************************************** + * + * Copyright (C) 2011 Elena Garcia-Ramos + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include "global.h" +#include "start.h" +#include "su3.h" +#include "linalg_eo.h" +#include "chebyshev_polynomial_nd.h" +#include +#include "solver/solver.h" +#include "solver/jdher.h" +#include "solver/eigenvalues.h" +#include "X_psi.h" +#include "gamma.h" + +double rnorm=-1; + +/* |R>=rnorm^2 Q^2 |S> */ +void norm_X_sqr_psi(spinor * const R, spinor * const S, + double const mstar); + +/* |R>=rnorm Q|S> */ +void norm_X_n_psi(spinor * const R, spinor * const S, + const int n, double const mstar); + +/* Construct the sign function of the operator X */ +/* X/sqrt(X^2) ,, X = 1-(2M^2/(DdaggeraD+M^2))*/ +void X_over_sqrt_X_sqr(spinor * const R, double * const c, + const int n, spinor * const S, + const double minev, double const mstar); + + +double * x_cheby_coef = NULL; +double epsilon=0.01; +int x_n_cheby = 32; + + +void h_X_sqr_eta(spinor * const R1,spinor * const R2,spinor * const S, double const mstar){ + int i; + double mode_n; + spinor **s, *s_; + static int n_cheby = 0; + static int rec_coefs = 1; + + /* Compute Chebyshev coefficients */ + /* c[j] ,, j=0..n, n=degree of the polynomial*/ + + if(g_proc_id == 0) { + printf("Degree of Polynomial set to %d\n", x_n_cheby); + } + + if(n_cheby != x_n_cheby || rec_coefs) { + if(x_cheby_coef != NULL) free(x_cheby_coef); + x_cheby_coef = (double*)malloc(x_n_cheby*sizeof(double)); + chebyshev_coefs(epsilon, 1., x_cheby_coef, x_n_cheby, -0.5);//coefs for f(x)=x^(-0.5) // represents P(y)=1/sqrt(y) in paper "Chiral symmetry breaking an the Banks-Casher relation in lattice QCD with Wilson quarks" page 12. 
+ rec_coefs = 0; + n_cheby = x_n_cheby; + } + + if(g_proc_id == 0) { + printf("mstar= %f \n",mstar); + } + + /*Evaluate X_over_sqrt_X_sqr*/ + X_over_sqrt_X_sqr(R1, x_cheby_coef, x_n_cheby, S, epsilon, mstar); + + /* Construct h(x)=1/2-1/2 X/sqrt(X^2) */ + /* this routine makes (*R)=c1*(*R)+c2*(*S) , c1 and c2 are real constants */ + assign_mul_add_mul_r(R1,S, 0.5, 0.5, VOLUME); + + + /*we need h(X)^2|nu>*/ + X_over_sqrt_X_sqr(R2, x_cheby_coef, x_n_cheby, R1, epsilon, mstar); + assign_mul_add_mul_r(R2,R1,0.5, 0.5, VOLUME); + + return; +} + +void h_X_eta(spinor * const R,spinor * const S, double const mstar){ + int i; + double mode_n; + spinor **s, *s_; + static int n_cheby = 0; + static int rec_coefs = 1; + + /* Compute Chebyshev coefficients */ + /* c[j] ,, j=0..n, n=degree of the polynomial*/ + + if(g_proc_id == 0) { + printf("Degree of Polynomial set to %d\n", x_n_cheby); + } + + if(n_cheby != x_n_cheby || rec_coefs) { + if(x_cheby_coef != NULL) free(x_cheby_coef); + x_cheby_coef = (double*)malloc(x_n_cheby*sizeof(double)); + chebyshev_coefs(epsilon, 1., x_cheby_coef, x_n_cheby, -0.5); + rec_coefs = 0; + n_cheby = x_n_cheby; + } + + /*Evaluate X_over_sqrt_X_sqr*/ + X_over_sqrt_X_sqr(R, x_cheby_coef, x_n_cheby, S, epsilon, mstar); + + /* Construct h(x)=1/2-1/2 X/sqrt(X^2) */ + /* this routine makes (*R)=c1*(*R)+c2*(*S) , c1 and c2 are real constants */ + assign_mul_add_mul_r(R,S, 0.5, 0.5, VOLUME); + + return; +} + + +void h_X_4_eta(spinor * const R1,spinor * const R2,spinor * const S, double const mstar){ + int i; + double mode_n; + spinor **s, *s_; + static int n_cheby = 0; + static int rec_coefs = 1; + + /* Compute Chebyshev coefficients */ + /* c[j] ,, j=0..n, n=degree of the polynomial*/ + if(g_proc_id == 0) { + printf("Degree of Polynomial set to %d\n", x_n_cheby); + } + + if(n_cheby != x_n_cheby || rec_coefs) { + if(x_cheby_coef != NULL) free(x_cheby_coef); + x_cheby_coef = (double*)malloc(x_n_cheby*sizeof(double)); + chebyshev_coefs(epsilon, 1., 
x_cheby_coef, x_n_cheby, -0.5); + rec_coefs = 0; + n_cheby = x_n_cheby; + } + s_ = calloc(3*VOLUMEPLUSRAND+1, sizeof(spinor)); + s = calloc(3, sizeof(spinor*)); + + for(i = 0; i < 3; i++) { +#if (defined SSE3 || defined SSE2 || defined SSE) + s[i] = (spinor*)(((unsigned long int)(s_)+ALIGN_BASE)&~ALIGN_BASE)+i*VOLUMEPLUSRAND; +#else + s[i] = s_+i*VOLUMEPLUSRAND; +#endif + } + + /* print on rank 0 only, consistent with h_X_sqr_eta */ + if(g_proc_id == 0) { + printf("mstar= %f \n",mstar); + } + /* Evaluate X_over_sqrt_X_sqr */ + X_over_sqrt_X_sqr(s[0], x_cheby_coef, x_n_cheby, S, epsilon, mstar); + + /* Construct h(x)=1/2-1/2 X/sqrt(X^2) */ + /* this routine makes (*R)=c1*(*R)+c2*(*S) , c1 and c2 are real constants */ + assign_mul_add_mul_r(s[0],S, 0.5, 0.5, VOLUME); + + /* second application: R1 = h(X) s[0] */ + X_over_sqrt_X_sqr(R1, x_cheby_coef, x_n_cheby, s[0], epsilon, mstar); + assign_mul_add_mul_r(R1,s[0],0.5, 0.5, VOLUME); + + /* third application: s[2] = h(X) R1 */ + X_over_sqrt_X_sqr(s[2], x_cheby_coef, x_n_cheby, R1, epsilon, mstar); + assign_mul_add_mul_r(s[2],R1,0.5, 0.5, VOLUME); + + /* fourth application: R2 = h(X) s[2], i.e. h(X)^4 applied to S */ + X_over_sqrt_X_sqr(R2, x_cheby_coef, x_n_cheby, s[2], epsilon, mstar); + assign_mul_add_mul_r(R2,s[2],0.5, 0.5, VOLUME); + + free(s); + free(s_); + + return; +} + + + +/* |R> = rnorm^2 X^2 |S>: applies X_psi twice through a temporary field + and scales the result by rnorm*rnorm. R and S are caller-owned fields. */ +void norm_X_sqr_psi(spinor * const R, spinor * const S, double const mstar) { + + /* aux_ is the raw allocation that gets freed; aux is the (possibly aligned) pointer that is used */ + spinor *aux_,*aux; +#if ( defined SSE || defined SSE2 || defined SSE3 ) + aux_=calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + aux = (spinor *)(((unsigned long int)(aux_)+ALIGN_BASE)&~ALIGN_BASE); +#else + aux_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + aux = aux_; +#endif + + /* Here is where we have to include our operator which in this case is + X = 1 - (2M^2)/(D_m^dagger*D_m + mu^2 + M^2) */ + + /* the else branch below is compiled but never taken (condition is the constant 1) */ + if(1) + { + X_psi(aux, S, mstar); + X_psi(R, aux, mstar); + } + else + { + printf("using X_psiSquare.\n"); + X_psiSquare(R, S, mstar); + } + mul_r(R, rnorm*rnorm, R, VOLUME); + + + free(aux_); + return; +} + + +void norm_X_n_psi(spinor * const R, spinor * const S, + const int n, double const mstar) { + + int i; + double npar = 1.; + spinor *aux_,*aux; +#if (defined SSE || defined 
SSE2 || defined SSE3) + aux_=calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + aux = (spinor *)(((unsigned long int)(aux_)+ALIGN_BASE)&~ALIGN_BASE); +#else + aux_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + aux = aux_; +#endif + /* work on a private copy of the input so that S is never modified */ + assign(aux, S, VOLUME); + + for(i=0; i < n; i++){ + /* Here is where we have to include our operator which in this case is + X = 1 - (2M^2)/(D_m^dagger*D_m + M^2) */ + /* from the second pass on, feed the previous result back in so that + n passes apply X n times; without this every pass re-applied X to S + and only the scale factor depended on n. n = 1 is unaffected. */ + if(i > 0) assign(aux, R, VOLUME); + X_psi(R, aux, mstar); + npar *= rnorm; + } + /* npar = rnorm^n after the loop: scale once at the end */ + mul_r(R, npar, R, VOLUME); + + free(aux_); + return; +} + +void X_over_sqrt_X_sqr(spinor * const R, double * const c, + const int n, spinor * const S, const double minev, double const mstar) { +//x/sqrt(x*x) <=> normalisation <= reasoned by Clenshaw recurrence: maps X to [-1,1] + + int j; + double fact1, fact2, temp1, temp2, temp3, temp4, maxev; + spinor *sv_, *sv, *d_, *d, *dd_, *dd, *aux_, *aux, *aux3_, *aux3;// *_ holds the address of the sse-unaligned memory block + +#if ( defined SSE || defined SSE2 || defined SSE3) + sv_ = calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + sv = (spinor *)(((unsigned long int)(sv_)+ALIGN_BASE)&~ALIGN_BASE); + d_ = calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + d = (spinor *)(((unsigned long int)(d_)+ALIGN_BASE)&~ALIGN_BASE); + dd_ = calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + dd = (spinor *)(((unsigned long int)(dd_)+ALIGN_BASE)&~ALIGN_BASE); + aux_ = calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + aux = (spinor *)(((unsigned long int)(aux_)+ALIGN_BASE)&~ALIGN_BASE); + aux3_= calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + aux3 = (spinor *)(((unsigned long int)(aux3_)+ALIGN_BASE)&~ALIGN_BASE); +#else + sv_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + sv = sv_; + d_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + d = d_; + dd_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + dd = dd_; + aux_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + aux = aux_; + aux3_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + aux3 = aux3_; +#endif + + /*EVALUATE THE APPROXIMATION USING THE CLENSHAW'S RECURRENCE FORMULA*/ + + maxev=1.0; + + /*interval = [minev,maxev] = 
[epsilon,1]*/ + fact1=4/(maxev-minev); + fact2=-2*(maxev+minev)/(maxev-minev); + /* d=0 , dd=0 */ + zero_spinor_field(d, VOLUME); + zero_spinor_field(dd, VOLUME); + + + /*input S = aux3*/ + if(0) assign_sub_lowest_eigenvalues(aux3, S, no_eigenvalues-1, VOLUME); + else assign(aux3, S, VOLUME); + + + /*starting the loop*/ + if(1) { + for (j = n-1; j >= 1; j--) { + + /*sv=d = d_j+1*/ + assign(sv, d, VOLUME); + + /*aux= our random field S =0(j=n-1)*/ + assign(aux, d, VOLUME); + + if(j == n-1){ + assign(R, aux, VOLUME);//=0 + } + else{ + /*|R>=rnorm^2 X^2|aux> -> since aux=d -> |R>=rnorm^2 Q^2|d>*/ + norm_X_sqr_psi(R, aux, mstar);//WARNING: - maybe we have to pass this point only when j=n-2, because R is not manipulated in the loop body. + // - seems to setup d_n-1=0 + } + temp1=-1.0; + temp2=c[j]; /*Chebyshev coefficients*/ + + /* d = d*fact2 + R*fact1 + dd*temp1 + aux3*temp2 + d = -2*(maxev+minev)/(maxev-minev)*d + 4/(maxev-minev)*R + -1*dd + c[j]*aux3 */ + /* y = (2*x-a-b)/(b-a) , y2=2*y + d = y2*d - dd + c[j] = -2*(a+b)*d/(b-a) + 4*x*d/(b-a) -dd + c[j] */ + assign_mul_add_mul_add_mul_add_mul_r(d, R, dd, aux3, fact2, fact1, temp1, temp2, VOLUME);// =d_j+1 + /* dd = sv */ + assign(dd, sv, VOLUME);// = d_j+2 + } + + /* R = d */ + if(0) assign_sub_lowest_eigenvalues(R, d, no_eigenvalues-1, VOLUME); + else assign(R, d, VOLUME); + + /*|aux>=rnorm^2 Q^2|R> */ + norm_X_sqr_psi(aux, R, mstar); + temp1=-1.0; + temp2=c[0]/2.; + temp3=fact1/2.; + temp4=fact2/2.; + + /* aux = aux*temp3 + d*temp4 + dd*temp1 + aux3*temp2 + aux = 2/(maxev-minev)*aux + -(maxev+minev)/(maxev-minev)d + -1*dd + 0.5*c[j]*aux3 */ + /* P(X^2)|_x = y*d -dd + 0.5*c[0] */ + assign_mul_add_mul_add_mul_add_mul_r(aux, d, dd, aux3, temp3, temp4, temp1, temp2, VOLUME); + /* ONCE WE HAVE THE EVALUATION OF P(X^2) = 1/SQRT(X^2) + WE CONSTRUCT -X/SQRT(X^2) --> -X*P(X^2) */ + norm_X_n_psi(R, aux, 1, mstar); + } + + free(sv_); + free(d_); + free(dd_); + free(aux_); + free(aux3_); + return; +} + + +void 
Check_Approximation(double const mstar, const int repro) { + + if(g_proc_id == 0) { + printf("Checking the approximation of X/sqrt(X^2) in the mode number: \n"); + } + + int i; + double res = 0; + spinor **s, *s_; + spinor *Sin = NULL; + spinor *Sin_ = NULL; + static int n_cheby = 0; + static int rec_coefs = 1; + + printf("epsilon= %f \n", epsilon); + printf("M*^2= %f \n", mstar); + printf("x_n_cheby= %d \n", x_n_cheby); + if(n_cheby != x_n_cheby || rec_coefs) { + if(x_cheby_coef != NULL) free(x_cheby_coef); + x_cheby_coef = (double*)malloc(x_n_cheby*sizeof(double)); + chebyshev_coefs(epsilon, 1., x_cheby_coef, x_n_cheby, -0.5); + rec_coefs = 0; + n_cheby = x_n_cheby; + } + +#if (defined SSE3 || defined SSE2 || defined SSE) + Sin_ = calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + Sin = (spinor *)(((unsigned long int)(Sin_)+ALIGN_BASE)&~ALIGN_BASE); +#else + Sin =calloc(VOLUMEPLUSRAND, sizeof(spinor)); +#endif + + random_spinor_field_lexic(Sin, repro, RN_GAUSS); + + s_ = calloc(4*VOLUMEPLUSRAND+1, sizeof(spinor)); + s = calloc(4, sizeof(spinor*)); + + for(i = 0; i < 4; i++) { +#if (defined SSE3 || defined SSE2 || defined SSE) + s[i] = (spinor*)(((unsigned long int)(s_)+ALIGN_BASE)&~ALIGN_BASE)+i*VOLUMEPLUSRAND; +#else + s[i] = s_+i*VOLUMEPLUSRAND; +#endif + } + + X_over_sqrt_X_sqr(s[0], x_cheby_coef, x_n_cheby, Sin, epsilon, mstar); + + diff(s[2], Sin, s[0], VOLUME); + diff(s[2], Sin, s[0], VOLUME); + + X_over_sqrt_X_sqr(s[1], x_cheby_coef, x_n_cheby, s[0], epsilon, mstar); + + diff(s[3], s[1], Sin, VOLUME); + res = square_norm(s[3],VOLUME,0); + + if(g_proc_id == 0) { + printf("\n"); + printf("Deviation from the real value : \n"); + printf("||X^2/sqrt(X^2)|psi> - |nu>||^2 = %1.4e \n",res); + printf("\n"); + } + +#if (defined SSE3 || defined SSE2 || defined SSE) + free(Sin_); +#else + free(Sin); +#endif + free(s); + free(s_); + return; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/P_M_eta.h b/qcd/part_cpu/applications/QCD/src/kernel_D/P_M_eta.h new file 
mode 100644 index 0000000000000000000000000000000000000000..5b0067f696fa3ec646620a504b7cbdd9cdb5a894 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/P_M_eta.h @@ -0,0 +1,44 @@ +/*********************************************************************** + * + * Copyright (C) 2011 Elena Garcia-Ramos + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _P_M_ETA_H +#define _P_M_ETA_H + +#include "su3.h" + +extern int x_n_cheby; +extern double * x_cheby_coef; + +void norm_X_sqr_psi(spinor * const R, spinor * const S, double const mstar); + +void norm_X_n_psi(spinor * const R, spinor * const S, const int n, double const mstar); + +void X_over_sqrt_X_sqr(spinor * const R, double * const c, const int n, spinor * const S, const double minev, double const mstar); + +void h_X_sqr_eta(spinor * const R1,spinor * const R2,spinor * const S, double const mstar); + +void h_X_eta(spinor * const R,spinor * const S, double const mstar); + +void h_X_4_eta(spinor * const R1, spinor * const R2, spinor * const S, double const mstar); + +void Check_Approximation(double const mstar, const int repro); + +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/Ptilde_nd.c b/qcd/part_cpu/applications/QCD/src/kernel_D/Ptilde_nd.c new file mode 100644 index 
0000000000000000000000000000000000000000..cc0e5b9a4ab2343422ed1d939cdab02b0c2cda72 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/Ptilde_nd.c @@ -0,0 +1,351 @@ +/*********************************************************************** + * + * Copyright (C) 2006 Thomas Chiarappa + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include "linalg_eo.h" +#include "start.h" +#include "operator/tm_operators.h" +#include "operator/tm_operators_nd.h" +#include "chebyshev_polynomial_nd.h" +#include "phmc.h" +#include "solver/matrix_mult_typedef_nd.h" +#include "Ptilde_nd.h" + + +#define PI 3.141592653589793 + + +double func_tilde(double u, double exponent){ + + double ff=0.0; + double d=0,ddd=0, sv, z, z2; + int j; + double res=0.0; + + z = (2.0*u - phmc_cheb_evmin - phmc_cheb_evmax)/(double)(phmc_cheb_evmax - phmc_cheb_evmin); + z2 = 2.0*z; + + for(j=phmc_dop_n_cheby-1; j>=1; j--){ + sv = d; + d = z2*d - ddd + phmc_dop_cheby_coef[j]; + ddd = sv; + } + + res = z*d - ddd + 0.5*phmc_dop_cheby_coef[0]; + + ff = (double)(res * sqrt(u)); + + return(pow(ff,exponent)); +} + +void Ptilde_cheb_coefs(double aa, double bb, double dd[], int n, double exponent){ + int k,j; + double 
fac,bpa,bma,*f; + double inv_n; + + inv_n=1./(double)n; + f=calloc(n,sizeof(double));/*vector(0,n-1);*/ + + fflush(stdout); + bma=0.5*(bb-aa); + bpa=0.5*(bb+aa); + for (k=0;k=1; j--) { + assign(&svs[0],&ds[0],VOLUME/2); + assign(&svc[0],&dc[0],VOLUME/2); + + /* + * if ( (j%10) == 0 ) { + * sub_low_ev(&aux[0], &d[0]); + * } else { */ + assign(&auxs[0], &ds[0], VOLUME/2); + assign(&auxc[0], &dc[0], VOLUME/2); + /* } */ + + + Qsq(&R_s[0], &R_c[0], &auxs[0], &auxc[0]); + + temp1=-1.0; + temp2=dd[j]; + assign_mul_add_mul_add_mul_add_mul_r(&ds[0] , &R_s[0], &dds[0], &aux3s[0], fact2, fact1, temp1, temp2,VOLUME/2); + assign_mul_add_mul_add_mul_add_mul_r(&dc[0] , &R_c[0], &ddc[0], &aux3c[0], fact2, fact1, temp1, temp2,VOLUME/2); + assign(&dds[0], &svs[0],VOLUME/2); + assign(&ddc[0], &svc[0],VOLUME/2); + } + + assign(&R_s[0], &ds[0],VOLUME/2); + assign(&R_c[0], &dc[0],VOLUME/2); + + Qsq(&auxs[0], &auxc[0], &R_s[0], &R_c[0]); + + temp1=-1.0; + temp2=dd[0]/2; + temp3=fact1/2; + temp4=fact2/2; + assign_mul_add_mul_add_mul_add_mul_r(&auxs[0], &ds[0], &dds[0], &aux3s[0], temp3, temp4, temp1, temp2,VOLUME/2); + assign_mul_add_mul_add_mul_add_mul_r(&auxc[0], &dc[0], &ddc[0], &aux3c[0], temp3, temp4, temp1, temp2,VOLUME/2); + assign(&R_s[0], &auxs[0],VOLUME/2); + assign(&R_c[0], &auxc[0],VOLUME/2); + + free(svs_); + free(ds_); + free(dds_); + free(auxs_); + free(aux2s_); + free(aux3s_); + free(svc_); + free(dc_); + free(ddc_); + free(auxc_); + free(aux2c_); + free(aux3c_); +} + +double chebtilde_eval(int M, double *dd, double s){ + + double d=0,ddd=0, sv, z, z2, res; + int j; + + z = (2.0*s - phmc_cheb_evmin - phmc_cheb_evmax)/(double)(phmc_cheb_evmax - phmc_cheb_evmin); + z2 = 2.0*z; + + for(j=M-1; j>=1; j--){ + sv = d; + d = z2*d - ddd + dd[j]; + ddd = sv; + } + + res = z*d - ddd + 0.5*dd[0]; + + return(res); +} + +/************************************************************************** + * + * The externally accessible function is + * + * void degree_of_Ptilde + * Computation 
of (QdaggerQ)^1/4 + * by using the chebyshev approximation for the function ()^1/4 + * + * Author: Thomas Chiarappa May 2006 + * + *****************************************************************************/ + + + +/* degree_of_Ptilde: chooses the polynomial degree and returns it through *_degree. + * - *coefs is allocated here with calloc(phmc_max_ptilde_degree, sizeof(double)) and filled by + * Ptilde_cheb_coefs(EVMin, EVMax, *coefs, phmc_max_ptilde_degree, -1.0); it is not freed in this function. + * - degree starts at 2*sloppy_degree and is multiplied by 1.2 (at most 100 passes) until sum < acc. + * - If degree exceeds phmc_max_ptilde_degree the process terminates with exit(-5) (after MPI_Finalize when MPI is defined). + * - Qsq and repro are only used in the g_debug_level > 2 branch, which runs a check on random spinor fields. + * - The six spinor scratch buffers allocated here are freed before returning. + * NOTE(review): none of the calloc results are checked for NULL. */ +void degree_of_Ptilde(int * _degree, double ** coefs, + const double EVMin, const double EVMax, + const int sloppy_degree, const double acc, + matrix_mult_nd Qsq, const int repro) { + int i, j; + double temp, temp2; + int degree; + double sum=0.0; + + spinor *ss=NULL, *ss_=NULL, *sc=NULL, *sc_=NULL; + spinor *auxs=NULL, *auxs_=NULL, *auxc=NULL, *auxc_=NULL; + spinor *aux2s=NULL, *aux2s_=NULL, *aux2c=NULL, *aux2c_=NULL; + + *coefs = calloc(phmc_max_ptilde_degree, sizeof(double)); + + ss_ = calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + auxs_ = calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + aux2s_= calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + sc_ = calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + auxc_ = calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + aux2c_= calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + + ss = (spinor *)(((unsigned long int)(ss_)+ALIGN_BASE)&~ALIGN_BASE); + auxs = (spinor *)(((unsigned long int)(auxs_)+ALIGN_BASE)&~ALIGN_BASE); + aux2s = (spinor *)(((unsigned long int)(aux2s_)+ALIGN_BASE)&~ALIGN_BASE); + sc = (spinor *)(((unsigned long int)(sc_)+ALIGN_BASE)&~ALIGN_BASE); + auxc = (spinor *)(((unsigned long int)(auxc_)+ALIGN_BASE)&~ALIGN_BASE); + aux2c = (spinor *)(((unsigned long int)(aux2c_)+ALIGN_BASE)&~ALIGN_BASE); + + Ptilde_cheb_coefs(EVMin, EVMax, *coefs, phmc_max_ptilde_degree, -1.0); + + if(g_proc_id == g_stdio_proc && g_debug_level > 0){ + printf("# NDPOLY Acceptance Polynomial: EVmin = %f EVmax = %f\n", EVMin, EVMax); + printf("# NDPOLY ACceptance Polynomial: desired accuracy is %e \n", acc); + fflush(stdout); + } + + degree = 2*sloppy_degree; + + for(i = 0; i < 100 ; i++) { + if (degree > phmc_max_ptilde_degree) { + fprintf(stderr, "Error: n_cheby=%d > phmc_max_ptilde_degree=%d in ptilde\n", + degree, phmc_max_ptilde_degree); + 
fprintf(stderr, "Increase n_chebymax\n"); +#ifdef MPI + MPI_Finalize(); +#endif + exit(-5); + } + + sum=0; + /* NOTE(review): the loop header on the next line is truncated in this copy (the text between '<' and '>' is missing); restore it from the upstream file before use. */ + for(j=degree; j 0)) { + printf("# NDPOLY Acceptance Polynomial: Sum remaining | d_n | = %e for degree=%d\n", sum, degree); + /* NOTE(review): the guard above only rejects degree > phmc_max_ptilde_degree, so degree == phmc_max_ptilde_degree reaches this point and (*coefs)[degree] would index one element past the calloc'ed array - confirm. */ + printf("# NDPOLY Acceptance Polynomial: coef[degree] = %e\n", (*coefs)[degree]); + } + if(sum < acc) { + break; + } + degree= (int)(degree*1.2); + } + + if(g_debug_level > 2) { + /* Ptilde P S P Ptilde X - X */ + /* for random spinor X */ + random_spinor_field_eo(ss, repro, RN_GAUSS); + random_spinor_field_eo(sc, repro, RN_GAUSS); + + Ptilde_ndpsi(&auxs[0], &auxc[0], *coefs, degree, &ss[0], &sc[0], Qsq); + Ptilde_ndpsi(&aux2s[0], &aux2c[0], phmc_dop_cheby_coef, phmc_dop_n_cheby, &auxs[0], &auxc[0], Qsq); + Qsq(&auxs[0], &auxc[0], &aux2s[0], &aux2c[0]); + Ptilde_ndpsi(&aux2s[0], &aux2c[0], phmc_dop_cheby_coef, phmc_dop_n_cheby, &auxs[0], &auxc[0], Qsq); + Ptilde_ndpsi(&auxs[0], &auxc[0], *coefs, degree, &aux2s[0], &aux2c[0], Qsq); + + diff(&aux2s[0],&auxs[0], &ss[0], VOLUME/2); + temp = square_norm(&aux2s[0], VOLUME/2, 1) / square_norm(&ss[0], VOLUME/2, 1) / 4.0; + + diff(&aux2c[0],&auxc[0], &sc[0], VOLUME/2); + temp2 = square_norm(&aux2c[0], VOLUME/2, 1)/square_norm(&sc[0], VOLUME/2, 1) / 4.0; + + if(g_epsbar == 0){ + temp2 = 0.0; + } + /* || (Ptilde P S P Ptilde - 1)X ||^2 / || 2X ||^2 */ + if(g_proc_id == g_stdio_proc) { + printf("# NDPOLY Acceptance Polynomial: relative squared accuracy in components:\n# UP=%e DN=%e \n", temp, temp2); + } + + temp = chebtilde_eval(degree, *coefs, EVMin); + temp *= cheb_eval(phmc_dop_n_cheby, phmc_dop_cheby_coef, EVMin); + temp *= EVMin; + temp *= cheb_eval(phmc_dop_n_cheby, phmc_dop_cheby_coef, EVMin); + temp *= chebtilde_eval(degree, *coefs, EVMin); + temp = 0.5*fabs(temp - 1); + if(g_proc_id == g_stdio_proc) { + printf("# NDPOLY Acceptance Polynomial: Delta_IR at s=%f: | Ptilde P s_low P Ptilde - 1 |/2 = %e \n", EVMin, temp); + } + } + if(g_proc_id == g_stdio_proc) { + printf("# NDPOLY Acceptance 
Polynomial degree set to %d\n\n", degree); + } + + *_degree = degree; + free(ss_); + free(auxs_); + free(aux2s_); + free(sc_); + free(auxc_); + free(aux2c_); + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/Ptilde_nd.h b/qcd/part_cpu/applications/QCD/src/kernel_D/Ptilde_nd.h new file mode 100644 index 0000000000000000000000000000000000000000..dc1cb72000c50d043caca076caa44ec0716fa335 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/Ptilde_nd.h @@ -0,0 +1,39 @@ +/*********************************************************************** + * Copyright (C) 2006 Thomas Chiarappa + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _PTILDE_ND_H +#define _PTILDE_ND_H + +#include "solver/matrix_mult_typedef_nd.h" + +double func_tilde(double u, double exponent); + +void Ptilde_cheb_coefs(double a, double b, double dd[], int n, double exponent); + +void Ptilde_ndpsi(spinor *R_s, spinor *R_c, double *dd, int n, + spinor *S_s, spinor *S_c, matrix_mult_nd Qsq); + +double chebtilde_eval(int M, double *dd, double s); + +void degree_of_Ptilde(int * _degree, double ** coefs, + const double EVMin, const double EVMax, + const int sloppy_degree, const double acc, + matrix_mult_nd Qsw, const int repro); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/README b/qcd/part_cpu/applications/QCD/src/kernel_D/README new file mode 100644 index 0000000000000000000000000000000000000000..01d557c4e4f56d05b18ff8f4101e794c37ac37f7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/README @@ -0,0 +1,371 @@ +Here are some remarks collected in order to configure, compile and +install the tmLQCD programme suit. For more information, also about running +the code please read the documentation in the doc sub-directory. + +CONFIGURE and COMPILE + +It is recommended to build the code not in the source directory but in +a separate directory. + +The lime library (tested with version 1.2.3) is needed to compile the +program. Please download it at + +http://usqcd.jlab.org/usqcd-software/c-lime/ + +Configure and compile lime (for documentation see +http://usqcd.jlab.org/usqcd-docs/c-lime/) first. +Then you should use the configure option --with-lime=dir for the +tmLQCD to set the correct directory where to find lime (see below). + +For more documentation please change into the doc directory and type +latex main.tex +and see the sections for configuring, installing and testing the code. + +Here we have gathered some examples for some standard architectures. 
+Building the tmLQCD executables is a three step procedure: + +**************************************************************************** + +1) configure: + +In your build directory type + +path-to-the-sources/configure --help + +to get an overview of the available options and switches. In +particular check out the prefix option for your installation path. +What follows now are some examples for a few standard architectures. + +- a scalar build on a P4 machine would look like: + +path-to-the-sources/configure --disable-mpi --enable-sse2 --enable-p4 \ + --enable-gaugecopy --disable-newdiracop --with-limedir= \ + --with-lapack="" \ + CC= + +- Opteron with SSE2: + +path-to-the-sources/configure --disable-mpi --enable-sse2 --enable-opteron \ + --enable-gaugecopy --disable-newdiracop --with-limedir= \ + --with-lapack="" \ + CC= + +- A MPI parallel (4dims) build on a P4 cluster: + +path-to-the-sources/configure --enable-mpi --enable-sse2 --enable-p4 \ + --with-mpidimension=4 --enable-gaugecopy --disable-newdiracop \ + --with-limedir= --with-lapack="" \ + CC= + +- on the Munich Altix machine: + +path-to-the-sources/configure --enable-mpi --with-mpidimension=4 \ + --with-limedir= --enable-newdiracop \ + --disable-shmem --with-lapack="" \ + CC=mpicc CFLAGS="-mcpu=itanium2 -O3 -g -c99 -mtune=itanium2" + +for lapack on this machine please type +module load mkl + + +- on the HLRB ice installation use + +path-to-the-sources/configure --enable-mpi --with-mpidimension=4 \ + --disable-sse2 --disable-p4 --with-limedir= \ + --enable-newdiracop --with-lapack="" \ + CC="mpicc -std=c99" CFLAGS="-g" \ + +where it is again important to use the Intel C compiler! + +for lapack first load the module mkl and then use + +--with-lapack="-L$LIBRARY_PATH -llapack -lblas" + +- on Blue Gene installations + +For the Blue Gene L and P see the README.bg? files + +For BG/Q you can enable QPX intrinsics with --enable-qpx, which will have +effect only with the XLC compiler. 
+ +You may enable or disable other configure options as needed. See the +documentation for more details. + +**************************************************************************** + +2) make + +type `make` in your build directory. + +If there appears no error message during compilation you should end up +with a few executable in the build directory, namely `hmc_tm`, +`invert` and `invert_doublet`. + +**************************************************************************** + +3) make install + +type `make install` + +to get the executables installed. + + + +**************************************************************************** +**************************************************************************** + +in the following we provide a "codemap", giving a short explanation +for the contents of each c-file: + +**************************************************************************** +top directory: apart from the main routines all routines are compiled into + the run-time library libhmc. 
+ +DML_crc32.c: invert, invert_doublet, hmc_tm + some helper functions to compute the SCIDAC + checksum +D_psi.c: invert, invert_doublet, hmc_tm + Wilson twisted mass Dirac operator, not even/odd + preconditioned +Hopping_Matrix.c: invert, invert_doublet, hmc_tm + Hopping matrix for the even/odd preconditioned + Dirac operator +Hopping_Matrix_nocom.c: benchmark + Hopping matrix for the even/odd preconditioned + Dirac operator, communication switched off +Nondegenerate_Matrix.c: invert_doublet, hmc_tm + operators needed for even/odd preconditioning + the non-degenerate flavour doublet Dirac operator +Ptilde_nd.c: hmc_tm + the more precise polynomial $\tilde P$ needed for + the PHMC for the non-degenerate flavour doublet +benchmark.c: main routine + benchmark code for D_psi and Hopping_Matrix +block.c: experimental +boundary.c: invert, invert_doublet, hmc_tm + implements the twisted boundary conditions for the + spinor fields +chebyshev_polynomial.c: experimental +chebyshev_polynomial_nd.c: hmc_tm + implements the generation of coefficients for the + chebyshev polynomial using the clenshaw recursion + relation +deriv_Sb.c: hmc_tm + the variation of Q=gamma_5 D with respect to the + gauge fields in the even/odd case +deriv_Sb_D_psi.c: hmc_tm + the variation of Q=gamma_5 D with respect to the + gauge fields in the non even/odd case +det_monomial.c: hmc_tm + implements the functions needed for a det monomial +detratio_monomial.c: hmc_tm + implements the functions needed for a detratio monomial +poly_monomial.c: hmc_tm + implements function needed for a POLY monomial + (PHMC for light degenerate quarks) +dml.c: invert, invert_doublet, hmc_tm + some helper functions to compute the SCIDAC + checksum +double2single.c: main routine + can convert a gauge field from double to single precision +single2double.c: main routine + can convert a gauge field from single to double precision +eigenvalues_bi.c: hmc_tm + computes eigenvalues of the mass non-degenerate two flavour + Dirac 
operator +expo.c: hmc_tm + implements the exponential function of an su(3) element +gamma.c: invert, invert_doublet, hmc_tm + implements multiplication of gamma matrices and some useful + combination of those with a spinor field +gauge_io.c: invert, invert_doublet, hmc_tm + IO routines for gauge fields +gauge_monomial.c: hmc_tm + implements the functions needed for a gauge monomial +gen_sources.c: invert, invert_doublet, hmc_tm + implements the generation of source spinor fields +geometry_eo.c: invert, invert_doublet, hmc_tm + anything related to gauge and spinor field geometry +get_rectangle_staples.c: hmc_tm + computes rectangular staples of gauge links as needed for + e.g. the Iwasaki gauge action and its derivative +get_staples.c: hmc_tm + computes plaquette staples of gauge links as needed for + all gauge actions and their derivatives +getopt.c: invert, invert_doublet, hmc_tm + needed for command line options +hmc_tm.c: main routine + hmc_tm executable +hybrid_update.c: hmc_tm + implements the functions for the gauge field update and + the momenta update +init_bispinor_field.c +init_chi_copy.c +init_chi_spinor_field.c +init_dirac_halfspinor.c +init_gauge_field.c +init_gauge_tmp.c +init_geometry_indices.c +init_moment_field.c +init_spinor_field.c +init_stout_smear_vars.c: invert, invert_doublet, hmc_tm + provide routines to allocate memory for the corresponding + objects +integrator.c: hmc_tm + implements the routines needed for the integrator in the + MD update +invert.c: main routine + invert executable +invert_doublet.c: main routine + invert_doublet executable +invert_doublet_eo.c: invert_doublet + performs an inversion of the flavour doublet operator using + even/odd preconditioning and the CG solver +invert_eo.c: invert + performs an inversion of the Wilson twisted mass Dirac operator + using a solver as specified in the input file. 
Depending on the + input file even/odd preconditioning is used or not +io.c: invert, invert_doublet, hmc_tm + helper routines: some deprecated IO routines for gauge and + spinor fields, and the routine writing the initial stdout message + of the executables +io_utils.c: invert, invert_doublet, hmc_tm + IO helper routines related to swap endian and checksums +linsolve.c: hmc_tm + CG and bicgstab solvers as used only in the HMC +little_D.c: experimental +measure_rectangles.c: hmc_tm + computes the gauge action related to the rectangular part +monomial.c: hmc_tm + provides the definition for monomials and initialisation functions +mpi_init.c: invert, invert_doublet, hmc_tm, benchmark + MPI initialisation routine +ndpoly_monomial.c: hmc_tm + implements the functions needed for a ndpoly monomial +observables.c: hmc_tm, invert, invert_doublet + computes the gauge action related to the Wilson plaquette part +online_measurement.c: hmc_tm + anything related to online measurements +phmc.c: hmc_tm + functions and variables as needed for the PHMC +polyakov_loop.c: hmc_tm + measures the polyakov loop +propagator_io.c: invert, invert_doublet, hmc_tm + functions related to spinor field IO +ranlxd.c: invert, invert_doublet, hmc_tm + RANLUX random number generator (64 Bit) +ranlxs.c: invert, invert_doublet, hmc_tm + RANLUX random number generator (32 Bit) +read_input.l: invert, invert_doublet, hmc_tm + definition of the input file parser (flex) +reweighting_factor.c: experimental +reweighting_factor_nd.c: experimental +sighandler.c: invert, invert_doublet, hmc_tm + handles signals related to illegal instructions +start.c: invert, invert_doublet, hmc_tm + functions needed to give initial values to gauge and spinor fields +stout_smear.c: invert, invert_doublet + functions to stout smear a given gauge configuration +stout_smear_force.c: experimental +tm_operators.c: invert, invert_doublet, hmc_tm + operators needed for even/odd preconditioning the Wilson + twisted mass Dirac 
operator +update_backward_gauge.c: invert, invert_doublet, hmc_tm + functions to update the gauge copy +update_momenta.c: hmc_tm + function to update the momenta in the HMC MD part +update_tm.c: hmc_tm + the HMC MD part +xchange_2fields.c: invert, invert_doublet, hmc_tm + implements the MPI communication of two even/odd spinor fields + at once +xchange_deri.c: hmc_tm + implements the MPI communication of derivatives +xchange_field.c: invert, invert_doublet, hmc_tm + implements the MPI communication of a single even/odd spinor + field +xchange_gauge.c: invert, invert_doublet, hmc_tm + implements the MPI communication of the gauge field +xchange_halffield.c: invert, invert_doublet, hmc_tm + implements the MPI communication of a half spinor field +xchange_lexicfield.c: invert, invert_doublet, hmc_tm + implements the MPI communication of a single (full) spinor + field + +**************************************************************************** +the linalg directory: all routines here are compiled into the liblinalg + runtime library + capital letters are spinor fields, others scalars +add.c: Q = R + S +assign.c: R = S +assign_add_mul.c: P = P + c Q with c complex +assign_add_mul_r.c: P = P + c Q with c real +assign_add_mul_add_mul.c: R = R + c1*S + c2*U with c1 and c2 complex variables +assign_add_mul_add_mul_r.c: R = R + c1*S + c2*U with c1 and c2 real variables +assign_diff_mul.c: S=S-c*Q +assign_mul_add_mul_add_mul_add_mul_r.c: R = c1*R + c2*S + c3*U + c4*V + with c1, c2, c3, c4 real variables +assign_mul_add_mul_add_mul_r.c: R = c1*R + c2*S + c3*U + with c1, c2 and c3 real variables +assign_mul_add_mul_r.c: R = c1*R + c2*S , c1 and c2 are real constants +assign_mul_add_r.c: R = c*R + S c is a real constant +assign_mul_bra_add_mul_ket_add.c: R = c2*(R + c1*S) + (*U) + with c1 and c2 complex variables +assign_mul_bra_add_mul_ket_add_r.c: R = c2*(R + c1*S) + (*U) + with c1 and c2 complex variables +assign_mul_bra_add_mul_r.c: R = c1*(R + c2*S) + with c1 and c2 
complex variables +comp_decomp.c: Splits the Bi-spinor R into the spinors S and T +convert_eo_to_lexic.c: converts two even/odd spinors to one full spinor +diff.c: Q = R - S +diff_and_square_norm.c: Q = R - S and ||Q||^2 +mattimesvec.c: w = M*v for complex vectors w,v and a complex square matrix M +mul.c: R = c*S, for complex c +mul_r.c: R = c*S, for real c +mul_add_mul.c: R = c1*S + c2*U , c1 and c2 are complex constants +mul_add_mul_r.c: R = c1*S + c2*U , c1 and c2 are real constants +mul_diff_mul.c: R = c1*S - c2*U , c1 and c2 are complex constants +mul_diff_mul_r.c: R = c1*S - c2*U , c1 and c2 are real constants +mul_diff_r.c: R = c1*S - U , c1 is a real constant +scalar_prod.c: c = (R, S) +scalar_prod_i.c: c = Im(R, S) +scalar_prod_r.c: c = Re(R, S) +square_and_prod_r.c: Returns Re(R,S) and the square norm of S +square_norm.c: c = ||Q||^2 + +**************************************************************************** +solver directory: all routines here are compiled into the libsolver + runtime library + the solvers are for spinor fields, if not indicated + otherwise. + +Msap.c: experimental SAP preconditioner +bicgstab_complex.c: BiCGstab for complex fields +bicgstabell.c: experimental +cg_her.c: CG solver for hermitian operators +cg_her_nd.c: CG solver for hermitian heavy doublet operators +cgs_real.c: CGS solver +chrono_guess.c: routines for the chronological solver +dfl_projector.c: experimental +diagonalise_general_matrix.c: subroutine to diagonalise a complex n times n + matrix. Input is a complex matrix in _C_ like + order. Output is again _C_ like. 
Uses lapack +eigenvalues.c compute the nr_of_eigenvalues lowest eigenvalues + of (gamma5*D)^2 +fgmres.c: FGMRES (flexible GMRES) solver +gcr.c: GCR solver +gcr4complex.c: GCR solver for complex fields +generate_dfl_subspace.c: experimental +gmres.c: GMRES solver +gmres_dr.c: GMRES-DR solver +gmres_precon.c: GMRES usable for preconditioning other solvers (experimental) +gram-schmidt.c: Gram-Schmidt orthonormalisation routines +jdher.c: Jacobi Davidson for hermitian matrices (to compute EVs) +lu_solve.c: compute the inverse of a matrix with LU decomposition +mr.c: MR solver +pcg_her.c: PCG solver +poly_precon.c: polynomial preconditioner using Chebysheff polynomials + with complex argument +quicksort.c: a quicksort routine +sub_low_ev.c: routines to subtract exactly computed eigenvectors from + a given spinor field diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/README.bgl b/qcd/part_cpu/applications/QCD/src/kernel_D/README.bgl new file mode 100644 index 0000000000000000000000000000000000000000..45d1348f4050f30dc68a71939db20d40ba84ae9e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/README.bgl @@ -0,0 +1,33 @@ +This summarises some info for the Blue Gene light system + +The configure command should be something like the following: + +path-to-the-sources/configure --host=ppc-ibm-blrts --build=ppc64-ibm-linux \ +--without-lapack --enable-mpi --with-mpidimension=4 \ +--enable-gaugecopy --with-limedir=path-to-lime/ \ +--enable-newdiracop CC=/usr/bin/blrts_xlc CPPFLAGS=-I/bgl/BlueLight/ppcfloor/bglsys/include/ \ +--with-nonblockingmpi --with-persistentmpi --with-bgldram + +some comments are important: +- if option --with-bgldram is used, then the executables have to be + relinked with /bgl/local/bin/blrts_gcc -Xlinker --script=./elf32ppcblrts.x . + The linker is now replaced automatically. 
+ the file elf32ppcblrts.x can be obtained from + blrts-gnu/powerpc-bgl-blrts-gnu/lib/ldscripts/elf32ppcblrts.x + and it needs the change of the line + PROVIDE (__executable_start = 0x00200000); . = 0x00200000 + SIZEF_HEADERS; + to + PROVIDE (__executable_start = 0x01000000); . = 0x01000000 + SIZEF_HEADERS; + + otherwise the code will stop with the message that there is not enough memory + for halfspinor fields. + + note that this option is default for the BGL build! + +- the new Dirac operator implementation is useful for bad boundary to volume + ratios. In case of large local volumes it might be better to use + --disable-newdiraop + +- running should be done with something like + -env "BGLMPI_EAGER=500000000 BGLMPI_PACING=n" + as options to mpirun. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/README.bgp b/qcd/part_cpu/applications/QCD/src/kernel_D/README.bgp new file mode 100644 index 0000000000000000000000000000000000000000..94ce64eb2527ee6995df44d6e993bb0e636369a2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/README.bgp @@ -0,0 +1,20 @@ +Configure with + +../hmc/configure --with-lapack --enable-mpi --with-mpidimension=4 --enable-gaugecopy --with-limedir=../../c-lime/ --host=ppc-ibm-bprts --build=ppc64-ibm-linux --enable-largefile --with-lapack="-L/bgsys/local/lib/ -lesslbg -llapack -lesslbg -lxlf90_r" CC="mpixlc_r" CCFLAGS="-I/bgsys/drivers/ppcfloor/arch/include/ -I/bgsys/drivers/ppcfloor/comm/include" F77="bgf77" + +may need some path adjustment, in particular where to find ESSL for +BG. + +Running should be done with +NrZProcs = 4 +always. Matching the physical torus works with + : T*LX*LY*LZ +midplane: 8*8*8*4 +rack: 16*8*8*4 +2rack: 32*8*8*4 +4rack: 32*16*8*4 + +number of processors in time direction is chosen automatically. + +please read README.bgl as well. 
+ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/README.phmc b/qcd/part_cpu/applications/QCD/src/kernel_D/README.phmc new file mode 100644 index 0000000000000000000000000000000000000000..80dde747f350481f63a48a73ca645992069dfb06 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/README.phmc @@ -0,0 +1,42 @@ +See doc/2+1+1_howto.text for more details + +as a test the following parameter set can be used (see also +sample-input/sample-hmc2.input): +Kappa = 0.170 +2KappaMu = 0.01 +2Kappamubar = 0.1105 +2Kappaepsbar = 0.0935 +beta=3.3 +tlsym gauge action +antiperiodic boundary conditions for quark fields + +P = 0.53347(17) +P_rec = 0.30393(22) + + +How it roughly works: +- Decide for a fixed order n of the less accurate polynomial + P +- Decide also for an approximation interval +- compute the n roots of the polynomial using the programme + in util/laguer/ + This will store the monomials in Square_root_BR_roots.dat + and the normalisation factor in normierungLocal.dat +- the normalisation factor in the latter file corresponds + to n-th root of C_total. In the monomial repr. the square + root of this is needed. This rooting is done in phmc_tm.c +- the polynomial approximates 1/sqrt(x) +- copy the files normierungLocal.dat and Square_root_BR_roots.dat + to your running directory +- the order used in the PHMC for P must be fixed to the same + order as used for the root computation (or vice versa) + + +todo: +- fixed order of P as input parameter +- maybe more meaningful names for input/output files? +- generic way to use n_f=2 PHMC degenerate and non-degenerate +- eigenvalue computation - in parallel? + - on BG/L? +- reduce memory usage? 
+ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/README.unit-testing b/qcd/part_cpu/applications/QCD/src/kernel_D/README.unit-testing new file mode 100644 index 0000000000000000000000000000000000000000..2f1a6b77113cbdc2309c73204e865831db9fcbde --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/README.unit-testing @@ -0,0 +1,114 @@ +This document explains how to run and develop test suites for the tmLQCD +program suite. tmLQCD uses the CU unit testing framework by +Daniel Fiser [http://cu.danfis.cz/] + +########### +# COMPILE # +########### + +The unit tests reside in the tests directory and are currently not built +automatically. To compile them, enter your build directory and call +'configure' if necessary and then 'make tests': + + $ cd build + $ ../configure [--options ..] + $ make tests + +The build system is configured so that make will build the modules +which are used by the tests. + + +########### +# EXECUTE # +########### + +To run the unit tests, change into the tests directory in your build +directory and run the tests as any executable. Make sure that you are +actually in the tests directory because the tests require a 'regressions' +folder to be in the current working directory. + + $ cd build/tests + $ ./test_sample + +Any output of the unit tests is redirected into the files in the +'regressions' directory. The python script 'check-regressions' in +the 'cu' directory in the source tree can be used to perform +automated regression testing on these output files. (ie. compare +known-good ouput to the current output after a change has been +made) + + +###################### +# REGRESSION TESTING # +###################### + +CU implements a rudimentary form of regression testing based on differences +in output files. + +CU redirects stdout and stderr to two files in the $builddir/tests/regressions +directory which are named + + tmp.NAME_TESTSUITENAME.[out,err] . 
+ +By moving tmp.*.[out,err] to *.[out,err] , you can create 'reference' output +from a known-good test-run. + +To run a regression test, after running the tests you want to regression +check, run the 'check-regressions' script from the 'cu' directory with +the $builddir/tests/regressions folder as an argument. E.g.: + + ~/tmLQCD $ cu/check-regressions build/tests/regressions + +The script will compare output and show differences in case the outputs +diverge. When dealing with floating point numbers the script compares +up to a given precision which you can specify with the --eps option. + +See 'cu/check-regressions -h' for more information and further options. + +Don't forget that you have to update the reference output if you change the +output of your test harnesses. + +########### +# DEVELOP # +########### + +The process of adding a unit test begins with the creation of three files +in the test directory. + +tests/test_name.c +tests/test_name_testsuitename.h +tests/test_name_testsuitename.c + +Where 'name' should be a descriptive name of what the test harness does. + +CU supports adding multiple testsuites to one test harness to create +further thematic links. For instance, the test harness for the buffers +framework is test_buffers, and the test suite for the "gauge" buffers +is test_buffers_gauge. + +There is a sample test harness in test_sample*.[c,h] which clarifies how +to write test suites. In principle the stem file runs the tests. The +test suite header declares the tests and adds them together into a +test suite and the test suite C file defines the different tests. + +Finally, in order to build the test harness, 'Makefile.tests' has to be edited +as hinted at by the existing tests. 
+ +1) add the stem of your test name to the TESTS variable +2) add the five-line rule for building the test harness, adjusting the + lines as required + +TEST_SAMPLE_OBJECTS:=$(patsubst $(top_srcdir)/%.c,%.o,$(wildcard $(top_srcdir)/tests/test_sample*.c)) +TEST_SAMPLE_FLAGS:= +TEST_SAMPLE_LIBS:= $(top_builddir)/cu/libcu.a +tests/test_sample: $(TEST_SAMPLE_OBJECTS) $(TEST_SAMPLE_LIBS) + ${LINK} $(TEST_SAMPLE_OBJECTS) $(TESTFLAGS) $(TEST_SAMPLE_FLAGS) + +Object files of the modules under test shoud be added to *_OBJECTS variable. +For example, the su3 test requires 'expo.o'. If a module is built into a +library the object can also be added to the compilation and prerequisites +by adding it to the *_LIBS variable in addition to $(top_builddir)/cu/libcu.a +(see test_buffers for an example which has both an object file and a library +added) + +Bartosz Kostrzewa, 2012/02/03 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/Releases b/qcd/part_cpu/applications/QCD/src/kernel_D/Releases new file mode 100644 index 0000000000000000000000000000000000000000..38084379b9c486af9720709e1f9fe9e4a224c246 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/Releases @@ -0,0 +1,111 @@ +Release 5.1.1: +Multi Mass solver added +overlap operator added +parallel IO added + +Release 5.0.3: +bug fix release, release published in computer +physics communications + +Release 5.0.0: +new input file structure (monomials) allowing for any even +number of quarks + +Release 4.0.0: +new IO format +inversion for split doublet +checksums +improvements for the PHMC +preconditioning for the CG + +Release 4.0.0-rc2: +Improvements for the PHMC +preconditioning for the CG + +Release 4.0.0-rc1: +PHMC merged into main trunk and first running version, +also for 1+1 flavours + +Release 3.2.2: +performance improvements for BG/L and opteron +target machines + +Release 3.2.1: +Bugfix release. Basically bug fixed on the altix for +shmem version. 
+ +Release 3.2.0: +New Dirac Operator where only half of the spinor is +exchanged. This brings a 20% improvement on the BG/L. +Moreover, shmem versions of the exchange routines +are available. + +shmem not yet completely tested! + +Release 3.1.0: +Running with an extra BG/L Dirac operator + +Release 2.3.6: +Bug fix release and some minor important new features. + +Release 2.3.5: +Bug fix release and some new input parameters, see +NEWS for details. + +Release 2.3.4: +Integration scheme with error cancellation +Reversibility check implemented +Relative precision possible +Precisions are input parameters now + +Release 2.3.3: +some bug fixes +extended leap frog and Sexton-Weingarten integration +scheme (multiple time scales) implemented and tested + +Release 2.3.2: +some bug fixes, extended leap frog integrator + +Release 2.3.1: +some bug fixes + +Release 2.3: +T and L can be set in the input file now. +The number of processors in x direction +is also set now in the input file. +T= integer +L= integer +NrXProcs = integer + +Release 2.2: +DBW2 implemented and working. New input parameter: +RGIC1 = floating point number +memory dynamically allocated now. + +Release 2.1: +working and optimised for x86_64 and AMD Opteron. benchmark tool +added. Even Odd geometry also for the gauge fields and the possibility +to use a copy of the gauge fields for a better cache usage. +New configure options: +--enable-opteron : Enables cache optimisation for Opteron [default=no] +--enable-gaugecopy : Enables usage of a copy of the gauge field [default=yes] +--enable-eogeom : Enables usage of EO geometry also for the gauge fields [default=yes] + +Release 2.0: +second parallel direction introduced + +Release 1.2.0: +Implementation of the Hasenbusch trick with a third pseudo fermion +field. + +Release 1.1.0: +Implementation of the Hasenbusch trick with two pseudo fermion +fields. This is also tested. + +Release 1.0.1: +fully parallelized tm, Wilson and clover impr. 
dynamical QCD code with +SSE2 improvement for P4 and improvement for IBM power4. Serial version +does not give the correct results. EO order implemented also for the +gauge fields, but not yet switched on per default. To do so one has to +undef OlD in geometry_eo.c and xchange.c. This order will not yet work +with clover improved Wilson. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/Todo b/qcd/part_cpu/applications/QCD/src/kernel_D/Todo new file mode 100644 index 0000000000000000000000000000000000000000..cb8f0ac99fca0c024a28bea8b8a291255f0643b5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/Todo @@ -0,0 +1,36 @@ +- source filename for heavy-light operator not consistent + +- read_spinor does not distinguish between a source and a +propagator. In case there is source and sink in the same file, one +cannot read the source, probably. To be fixed! + +- even/odd is not consistently implemented in invert + if there is one operator without we need to get the memory + for even_odd_flag == 0!! + +- SourceSink_Pairs propagator format possibly broken for 2 fl. tmwilson +- residuum check for 2 fl nd twisted fails +- do we need basenames per operator +- introduce source types +- volume sources +- spin shifts in sources +- adaptive precision for overlap +- adapt online measurements for new operator structure +- incorporate stouting into new operator structure +- Remove the DUM_* variables and g_spinor_fields +- incorporate overlap operator +- append/not append feature for spinor fields +- source generation inside the code +- deflation +- mixed precision solver, single precision Dirac operator +- GPU implementation... +- test RHMC? 
+- dynamical overlap code +- create simpler test suite + +- Add error reporting in spinor_write_binary for the lime functions +- Organize error (exit) codes + - Make a list of all existing error exits + - Create a scheme for these error messages with enums + - Implement scheme everywhere in the code + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/X_psi.c b/qcd/part_cpu/applications/QCD/src/kernel_D/X_psi.c new file mode 100644 index 0000000000000000000000000000000000000000..f35bfdf5177e59b6051345fd25a2bece12929499 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/X_psi.c @@ -0,0 +1,188 @@ +/*********************************************************************** + * + * Copyright (C) 2011 Elena Garcia-Ramos + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#ifdef HAVE_CONFIG_H +# include +#endif +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "operator/D_psi.h" +#include "gamma.h" +#include "X_psi.h" +#include "operator/tm_operators.h" +#include "solver/solver.h" +#include "read_input.h" + +/* DdaggerD_plus_M: writes Q_pm_psi(R, S) evaluated with the global g_mu temporarily + * replaced by sqrt(g_mu*g_mu + mstarsq); g_mu is restored before returning. + * mstarsq is not a parameter here: it is read from a variable declared outside this function. + * The block comment inside the body is an older, disabled implementation kept for reference. */ +void DdaggerD_plus_M(spinor * const R, spinor * const S) +{ + double g_muWithoutMStarSquare=g_mu; + g_mu=sqrt(g_mu*g_mu+mstarsq); + Q_pm_psi(R, S); + g_mu=g_muWithoutMStarSquare; + +/* spinor *aux_ = NULL, *aux; + spinor *aux2_ = NULL, *aux2; + int N = VOLUMEPLUSRAND; + double twokmu, g_musq; + +#if ( defined SSE || defined SSE2 || defined SSE3) + aux_=calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + aux = (spinor *)(((unsigned long int)(aux_)+ALIGN_BASE)&~ALIGN_BASE); + aux2_=calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + aux2 = (spinor *)(((unsigned long int)(aux2_)+ALIGN_BASE)&~ALIGN_BASE); +#else + aux_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + aux = aux_; + aux2_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + aux2 = aux2_; +#endif + + assign(aux2, S, VOLUME); + + // we have to apply DdagerD and M*^2 to the same field S + twokmu=g_mu; + g_mu=0.; + //org: + //D_psi(R, S); + //gamma5(aux, R, VOLUME); + //D_psi(R, aux); + //gamma5(R, R, VOLUME); + D_psi(aux, S); + gamma5(R, aux, VOLUME); + D_psi(aux,R); + gamma5(R, aux, VOLUME); + + g_mu=twokmu; + g_musq=g_mu*g_mu; + assign_add_mul_r(R, aux2, mstarsq, VOLUME); + if(g_musq!=0) assign_add_mul_r(R, aux2, g_musq, VOLUME); + + free(aux_); + free(aux2_);*/ +} + +#define X_psiSIterations 5000 +#define X_psiSPrecision 1.e-6 + + +/* X_psi: solves into R with right-hand side S (GPU solver when HAVE_GPU and usegpu_flag, + * otherwise cg_her with DdaggerD_plus_M as operator), using at most X_psiSIterations iterations + * and precision X_psiSPrecision, then finishes with assign_mul_add_mul_r(R, S, a, b, VOLUME) + * where a = -2*mstarsq and b = 1. + * NOTE(review): the parameter mstarsq shadows the outer variable of the same name that + * DdaggerD_plus_M reads, so the CPU path uses the outer value inside the operator - confirm both agree. */ +void X_psi(spinor * const R, spinor * const S, double const mstarsq){ + + // double a = -2*mstar*mstar; + double a = -2*mstarsq; + double b = 1.; + double g_muWithoutMStarSquare=g_mu; + + /*cg_her(out spinor, in spinor, max iter, solver precision, flag relative precision default 0, volume, operator to invert)*/ + 
#ifdef HAVE_GPU + if(usegpu_flag) + { + if(g_proc_id == 0) printf("Using GPU for inversion\n"); + // call mixed_solve_DiracDaggerDaggerD for double precision calculations on gpu - may be faster if the device supports compute capability >= 2.0 (fermi generation) + {//include M^{*2} into twisted mass g_mu => saves one assign_multiply_add; + g_mu=sqrt(g_mu*g_mu+mstarsq); + mixed_solve_DiracDaggerDirac ( R, S, X_psiSIterations, X_psiSPrecision, 0/*!rel_prec*/, VOLUME); + //mixed_solve_DiracDaggerDiracD( R, S, X_psiSIterations, X_psiSPrecision, 0/*!rel_prec*/, VOLUME); + g_mu=g_muWithoutMStarSquare; + } + } + else + #endif + { + if(g_proc_id == 0) printf("Using CPU for inversion\n"); + cg_her( R, S, X_psiSIterations, X_psiSPrecision, 0, VOLUME, &DdaggerD_plus_M); /* NOTE(review): DdaggerD_plus_M takes no mass argument and reads a mstarsq declared outside this function, while the GPU branch uses the mstarsq parameter -- confirm both always hold the same value */ + } + fflush(stdout); + +/*//// Test + spinor *aux_ = NULL, *aux; + int N = VOLUMEPLUSRAND; + +#if ( defined SSE || defined SSE2 || defined SSE3) + aux_=calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + aux = (spinor *)(((unsigned long int)(aux_)+ALIGN_BASE)&~ALIGN_BASE); +#else + aux_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + aux = aux_; +#endif + + //Q_pm_psi_gpu(aux,R); + DdaggerD_plus_M(aux,R); + diff(aux,S,aux,N); + double t=square_norm(aux,N,1); + printf("TestMStar %lf\n",t); + exit(1); +*//// Test + assign_mul_add_mul_r( R, S, a, b, VOLUME); +} + +/* X_psiSquare: allocates one scratch spinor field aux (VOLUMEPLUSRAND spinors; one extra element and an ALIGN_BASE-masked start address when SSE/SSE2/SSE3 is defined). With HAVE_GPU and usegpu_flag set it runs the GPU solver with g_mu temporarily set to sqrt(g_mu*g_mu + mstarsq) and combines the result with S; otherwise it applies X_psi twice (S -> aux -> R). The scratch field is released exactly once, at the end of the function. */ +void X_psiSquare(spinor * const R, spinor * const S, double const mstarsq) +{//inverts DD^+DD^+ instead of DD^+ but performs poorly + spinor *aux_,*aux; + { + #if ( defined SSE || defined SSE2 || defined SSE3 ) + aux_=calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + aux = (spinor *)(((unsigned long int)(aux_)+ALIGN_BASE)&~ALIGN_BASE); + #else + aux_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + aux = aux_; + #endif + } + + #ifdef HAVE_GPU + if(usegpu_flag) + { + if(g_proc_id == 0) printf("Using GPU for inversion\n"); + // call mixed_solve_DiracDaggerDaggerD for double precision calculations on gpu - may be faster if the device supports compute capability >= 2.0 (fermi
generation) + + {//include M^{*2} into twisted mass g_mu => saves one assign_multiply_add; + double g_muWithoutMStarSquare=g_mu; + + g_mu=sqrt(g_mu*g_mu+mstarsq); + mixed_solve_DiracDaggerDiracDiracDaggerDirac ( R, S, X_psiSIterations, X_psiSPrecision, 0/*!rel_prec*/, VOLUME); + //mixed_solve_DiracDaggerDiracDiracDagerDiracD( R, S, X_psiSIterations, X_psiSPrecision, 0/*!rel_prec*/, VOLUME); + g_mu=g_muWithoutMStarSquare; + }//R holds now the value of (D^+DD^+D)^-1 ! + + DdaggerD_plus_M(aux,R); + + assign_mul_add_mul_r( R, aux, mstarsq, -1, VOLUME); + assign_mul_add_mul_r( R, S, 4*mstarsq, 1, VOLUME);//1-4mstarsq(-(D^+D+mstarsq)^-1 + mstarsq*(D^+D+mstarsq)^-2) + + + /* aux_ is not freed here: the single free(aux_) after the #endif block releases it (freeing it in this branch as well was a double free) */ + fflush(stdout); + } + else + #endif + { + X_psi(aux, S, mstarsq); + X_psi(R, aux, mstarsq); + } + + free(aux_); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/X_psi.h b/qcd/part_cpu/applications/QCD/src/kernel_D/X_psi.h new file mode 100644 index 0000000000000000000000000000000000000000..929c2fa90c72c8b28fcb06756a93fc981f320645 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/X_psi.h @@ -0,0 +1,32 @@ +/*********************************************************************** + * + * Copyright (C) 2011 Elena Garcia-Ramos + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/ + +#ifndef _X_PSI_H +#define _X_PSI_H + +#include "su3.h" + +extern double mstar; +/* The last argument of X_psi and X_psiSquare is the squared value (X_psi.c names it mstarsq and uses it where mstar*mstar was used before), so the prototypes use that name instead of shadowing the global mstar. */ +void DdaggerD_plus_M(spinor * const R, spinor * const S); +void X_psi(spinor * const R, spinor * const S, double const mstarsq); +void X_psiSquare(spinor * const R, spinor * const S, double const mstarsq); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/aligned_malloc.c b/qcd/part_cpu/applications/QCD/src/kernel_D/aligned_malloc.c new file mode 100644 index 0000000000000000000000000000000000000000..c8e3703b57bd801b423fc336a41e71ffe80358a9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/aligned_malloc.c @@ -0,0 +1,101 @@ +/*********************************************************************** + * Copyright (C) 2015 Bartosz Kostrzewa + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/ + +#if HAVE_CONFIG_H +#include<config.h> +#endif + +#include <stdlib.h> +#include <stdio.h> +#include <stdint.h> +#include "aligned_malloc.h" +#include "su3.h" +#include "su3adj.h" + +#include "fatal_error.h" +/* aligned_malloc: mallocs size + ALIGN_BASE + sizeof(void*) bytes and returns an address inside that block with the ALIGN_BASE bits cleared (ALIGN_BASE is used as a bit mask). The pointer returned by malloc is stored in the slot just below the returned address so aligned_free can recover it. Returns NULL when malloc fails. */ +void *aligned_malloc(size_t const size) { + void *mem = malloc(size+ALIGN_BASE+sizeof(void*)); + void ** ptr; + + if(mem == NULL) { + return(mem); + } + + ptr = (void**)(((uintptr_t)mem+(uintptr_t)ALIGN_BASE+sizeof(void*)) & ~ (uintptr_t)(ALIGN_BASE)); + ptr[-1] = mem; + + return ptr; +} +/* aligned_free: releases a block obtained from aligned_malloc through the malloc pointer saved at ptr[-1]. A NULL ptr is ignored, like free(NULL). */ +void aligned_free(void *ptr) { + if(ptr != NULL) free(((void**)ptr)[-1]); +} +/* aligned_su3_field_alloc: allocates a table of V su3* entries and one block of 4*V+1 su3 elements; field[i] is set to mem + 4*i. Calls fatal_error when either allocation fails. */ +aligned_su3_field_t aligned_su3_field_alloc(const unsigned int V) { + aligned_su3_field_t f_struct; + + su3** field = (su3**) aligned_malloc(V*sizeof(su3*)); + su3* mem = (su3*)aligned_malloc((4*(size_t)V+1)*sizeof(su3)); + + if( (void*)field == (void*)NULL || (void*)mem == (void*)NULL ) { + fatal_error("Memory allocation error!","aligned_su3_field_alloc"); + } + + if(V > 0) field[0] = mem; /* the table has no entries when V == 0 */ + for(unsigned int i = 1; i < V; ++i) { + field[i] = field[i-1]+4; + } + + f_struct.field = field; + f_struct.mem = mem; + + return(f_struct); +} +/* aligned_su3adj_field_alloc: same layout as aligned_su3_field_alloc, for su3adj elements. */ +aligned_su3adj_field_t aligned_su3adj_field_alloc(const unsigned int V) { + aligned_su3adj_field_t f_struct; + su3adj** field = (su3adj**) aligned_malloc(V*sizeof(su3adj*)); + su3adj* mem = (su3adj*)aligned_malloc((4*(size_t)V+1)*sizeof(su3adj)); + + if( (void*)field == (void*)NULL || (void*)mem == (void*)NULL ) { + fatal_error("Memory allocation error!","aligned_su3adj_field_alloc"); + } + + if(V > 0) field[0] = mem; /* the table has no entries when V == 0 */ + for(unsigned int i = 1; i < V; ++i) { + field[i] = field[i-1]+4; + } + + f_struct.field = field; + f_struct.mem = mem; + + return(f_struct); +} +/* aligned_su3_field_free: releases both allocations made by aligned_su3_field_alloc. */ +void aligned_su3_field_free(const aligned_su3_field_t* f_struct) { + aligned_free((void*)f_struct->field); + aligned_free((void*)f_struct->mem); +} +/* aligned_su3adj_field_free: releases both allocations made by aligned_su3adj_field_alloc. */ +void aligned_su3adj_field_free(const aligned_su3adj_field_t* f_struct) { + aligned_free((void*)f_struct->field); + aligned_free((void*)f_struct->mem); +} + diff --git
a/qcd/part_cpu/applications/QCD/src/kernel_D/aligned_malloc.h b/qcd/part_cpu/applications/QCD/src/kernel_D/aligned_malloc.h new file mode 100644 index 0000000000000000000000000000000000000000..ffbebaa51d512e73e97ab9a4e46d46861dbf176f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/aligned_malloc.h @@ -0,0 +1,42 @@ +/*********************************************************************** + * Copyright (C) 2015 Bartosz Kostrzewa + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/ + +#ifndef _ALIGNED_MALLOC_H +#define _ALIGNED_MALLOC_H + +#include "su3.h" +#include "su3adj.h" +/* Each struct pairs a table of element pointers (field) with the storage block the table points into (mem); both members are released by the matching *_field_free function. */ +typedef struct { + su3** field; + su3* mem; +} aligned_su3_field_t; + +typedef struct { + su3adj** field; + su3adj* mem; +} aligned_su3adj_field_t; + +aligned_su3_field_t aligned_su3_field_alloc(const unsigned int V); +aligned_su3adj_field_t aligned_su3adj_field_alloc(const unsigned int V); + +void aligned_su3_field_free(const aligned_su3_field_t* f_struct); +void aligned_su3adj_field_free(const aligned_su3adj_field_t* f_struct); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/autom4te.cache/output.0 b/qcd/part_cpu/applications/QCD/src/kernel_D/autom4te.cache/output.0 new file mode 100644 index 0000000000000000000000000000000000000000..46b1adcde9c87c43d3f64df9a81979376ddc8012 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/autom4te.cache/output.0 @@ -0,0 +1,9823 @@ +@%:@! /bin/sh +@%:@ Guess values for system-dependent variables and create Makefiles. +@%:@ Generated by GNU Autoconf 2.69 for tmLQCD 5.2.0. +@%:@ +@%:@ Report bugs to . +@%:@ +@%:@ +@%:@ Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. +@%:@ +@%:@ +@%:@ This configure script is free software; the Free Software Foundation +@%:@ gives unlimited permission to copy, distribute and modify it. +## -------------------- ## +## M4sh Initialization. ## +## -------------------- ## + +# Be more Bourne compatible +DUALCASE=1; export DUALCASE # for MKS sh +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature.
+ alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in @%:@( + *posix*) : + set -o posix ;; @%:@( + *) : + ;; +esac +fi + + +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. +as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +# Prefer a ksh shell builtin over an external printf program on Solaris, +# but without wasting forks for bash or zsh. +if test -z "$BASH_VERSION$ZSH_VERSION" \ + && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='print -r --' + as_echo_n='print -rn --' +elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in @%:@( + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + +# The user is always right. +if test "${PATH_SEPARATOR+set}" != set; then + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { + (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || + PATH_SEPARATOR=';' + } +fi + + +# IFS +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent editors from complaining about space-tab. +# (If _AS_PATH_WALK were called with IFS unset, it would disable word +# splitting by setting IFS to empty value.) +IFS=" "" $as_nl" + +# Find who we are. 
Look in the path if we contain no directory separator. +as_myself= +case $0 in @%:@(( + *[\\/]* ) as_myself=$0 ;; + *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + done +IFS=$as_save_IFS + + ;; +esac +# We did not find ourselves, most probably we were run as `sh COMMAND' +# in which case we are not to be found in the path. +if test "x$as_myself" = x; then + as_myself=$0 +fi +if test ! -f "$as_myself"; then + $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + exit 1 +fi + +# Unset variables that we do not need and which cause bugs (e.g. in +# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" +# suppresses any "Segmentation fault" message there. '((' could +# trigger a bug in pdksh 5.2.14. +for as_var in BASH_ENV ENV MAIL MAILPATH +do eval test x\${$as_var+set} = xset \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done +PS1='$ ' +PS2='> ' +PS4='+ ' + +# NLS nuisances. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# CDPATH. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +# Use a proper internal environment variable to ensure we don't fall + # into an infinite loop, continuously re-executing ourselves. + if test x"${_as_can_reexec}" != xno && test "x$CONFIG_SHELL" != x; then + _as_can_reexec=no; export _as_can_reexec; + # We cannot yet assume a decent shell, so we have to provide a +# neutralization value for shells without unset; and this also +# works around shells that cannot unset nonexistent variables. +# Preserve -v and -x to the replacement shell. 
+BASH_ENV=/dev/null +ENV=/dev/null +(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV +case $- in @%:@ (((( + *v*x* | *x*v* ) as_opts=-vx ;; + *v* ) as_opts=-v ;; + *x* ) as_opts=-x ;; + * ) as_opts= ;; +esac +exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} +# Admittedly, this is quite paranoid, since all the known shells bail +# out after a failed `exec'. +$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 +as_fn_exit 255 + fi + # We don't want this to propagate to other subprocesses. + { _as_can_reexec=; unset _as_can_reexec;} +if test "x$CONFIG_SHELL" = x; then + as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which + # is contrary to our usage. Disable this feature. + alias -g '\${1+\"\$@\"}'='\"\$@\"' + setopt NO_GLOB_SUBST +else + case \`(set -o) 2>/dev/null\` in @%:@( + *posix*) : + set -o posix ;; @%:@( + *) : + ;; +esac +fi +" + as_required="as_fn_return () { (exit \$1); } +as_fn_success () { as_fn_return 0; } +as_fn_failure () { as_fn_return 1; } +as_fn_ret_success () { return 0; } +as_fn_ret_failure () { return 1; } + +exitcode=0 +as_fn_success || { exitcode=1; echo as_fn_success failed.; } +as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; } +as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; } +as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; } +if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then : + +else + exitcode=1; echo positional parameters were not saved. 
+fi +test x\$exitcode = x0 || exit 1 +test -x / || exit 1" + as_suggested=" as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO + as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO + eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" && + test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1 +test \$(( 1 + 1 )) = 2 || exit 1" + if (eval "$as_required") 2>/dev/null; then : + as_have_required=yes +else + as_have_required=no +fi + if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then : + +else + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +as_found=false +for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + as_found=: + case $as_dir in @%:@( + /*) + for as_base in sh bash ksh sh5; do + # Try only shells that exist, to save several forks. + as_shell=$as_dir/$as_base + if { test -f "$as_shell" || test -f "$as_shell.exe"; } && + { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then : + CONFIG_SHELL=$as_shell as_have_required=yes + if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then : + break 2 +fi +fi + done;; + esac + as_found=false +done +$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } && + { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then : + CONFIG_SHELL=$SHELL as_have_required=yes +fi; } +IFS=$as_save_IFS + + + if test "x$CONFIG_SHELL" != x; then : + export CONFIG_SHELL + # We cannot yet assume a decent shell, so we have to provide a +# neutralization value for shells without unset; and this also +# works around shells that cannot unset nonexistent variables. +# Preserve -v and -x to the replacement shell. 
+BASH_ENV=/dev/null +ENV=/dev/null +(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV +case $- in @%:@ (((( + *v*x* | *x*v* ) as_opts=-vx ;; + *v* ) as_opts=-v ;; + *x* ) as_opts=-x ;; + * ) as_opts= ;; +esac +exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} +# Admittedly, this is quite paranoid, since all the known shells bail +# out after a failed `exec'. +$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 +exit 255 +fi + + if test x$as_have_required = xno; then : + $as_echo "$0: This script requires a shell more modern than all" + $as_echo "$0: the shells that I found on your system." + if test x${ZSH_VERSION+set} = xset ; then + $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should" + $as_echo "$0: be upgraded to zsh 4.3.4 or later." + else + $as_echo "$0: Please tell bug-autoconf@gnu.org and curbach@gmx.de +$0: about your system, including any error possibly output +$0: before this message. Then install a modern shell, or +$0: manually run the script under such a shell if you do +$0: have one." + fi + exit 1 +fi +fi +fi +SHELL=${CONFIG_SHELL-/bin/sh} +export SHELL +# Unset more variables known to interfere with behavior of common tools. +CLICOLOR_FORCE= GREP_OPTIONS= +unset CLICOLOR_FORCE GREP_OPTIONS + +## --------------------- ## +## M4sh Shell Functions. ## +## --------------------- ## +@%:@ as_fn_unset VAR +@%:@ --------------- +@%:@ Portably unset VAR. +as_fn_unset () +{ + { eval $1=; unset $1;} +} +as_unset=as_fn_unset + +@%:@ as_fn_set_status STATUS +@%:@ ----------------------- +@%:@ Set @S|@? to STATUS, without forking. +as_fn_set_status () +{ + return $1 +} @%:@ as_fn_set_status + +@%:@ as_fn_exit STATUS +@%:@ ----------------- +@%:@ Exit the shell with STATUS, even in a "trap 0" or "set -e" context. +as_fn_exit () +{ + set +e + as_fn_set_status $1 + exit $1 +} @%:@ as_fn_exit + +@%:@ as_fn_mkdir_p +@%:@ ------------- +@%:@ Create "@S|@as_dir" as a directory, including parents if necessary. 
+as_fn_mkdir_p () +{ + + case $as_dir in #( + -*) as_dir=./$as_dir;; + esac + test -d "$as_dir" || eval $as_mkdir_p || { + as_dirs= + while :; do + case $as_dir in #( + *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *) as_qdir=$as_dir;; + esac + as_dirs="'$as_qdir' $as_dirs" + as_dir=`$as_dirname -- "$as_dir" || +$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_dir" : 'X\(//\)[^/]' \| \ + X"$as_dir" : 'X\(//\)$' \| \ + X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$as_dir" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + test -d "$as_dir" && break + done + test -z "$as_dirs" || eval "mkdir $as_dirs" + } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" + + +} @%:@ as_fn_mkdir_p + +@%:@ as_fn_executable_p FILE +@%:@ ----------------------- +@%:@ Test if FILE is an executable regular file. +as_fn_executable_p () +{ + test -f "$1" && test -x "$1" +} @%:@ as_fn_executable_p +@%:@ as_fn_append VAR VALUE +@%:@ ---------------------- +@%:@ Append the text in VALUE to the end of the definition contained in VAR. Take +@%:@ advantage of any shell optimizations that allow amortized linear growth over +@%:@ repeated appends, instead of the typical quadratic growth present in naive +@%:@ implementations. +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : + eval 'as_fn_append () + { + eval $1+=\$2 + }' +else + as_fn_append () + { + eval $1=\$$1\$2 + } +fi # as_fn_append + +@%:@ as_fn_arith ARG... +@%:@ ------------------ +@%:@ Perform arithmetic evaluation on the ARGs, and store the result in the +@%:@ global @S|@as_val. Take advantage of shells that can avoid forks. The arguments +@%:@ must be portable across @S|@(()) and expr. 
+if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : + eval 'as_fn_arith () + { + as_val=$(( $* )) + }' +else + as_fn_arith () + { + as_val=`expr "$@" || test $? -eq 1` + } +fi # as_fn_arith + + +@%:@ as_fn_error STATUS ERROR [LINENO LOG_FD] +@%:@ ---------------------------------------- +@%:@ Output "`basename @S|@0`: error: ERROR" to stderr. If LINENO and LOG_FD are +@%:@ provided, also output the error to LOG_FD, referencing LINENO. Then exit the +@%:@ script with STATUS, using 1 if that was 0. +as_fn_error () +{ + as_status=$1; test $as_status -eq 0 && as_status=1 + if test "$4"; then + as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + fi + $as_echo "$as_me: error: $2" >&2 + as_fn_exit $as_status +} @%:@ as_fn_error + +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + +# Avoid depending upon Character Ranges. 
+as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + + + as_lineno_1=$LINENO as_lineno_1a=$LINENO + as_lineno_2=$LINENO as_lineno_2a=$LINENO + eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" && + test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || { + # Blame Lee E. McMahon (1931-1989) for sed's syntax. :-) + sed -n ' + p + /[$]LINENO/= + ' <$as_myself | + sed ' + s/[$]LINENO.*/&-/ + t lineno + b + :lineno + N + :loop + s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ + t loop + s/-\n.*// + ' >$as_me.lineno && + chmod +x "$as_me.lineno" || + { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; } + + # If we had to re-execute with $CONFIG_SHELL, we're ensured to have + # already done that, so ensure we don't try to do so again and fall + # in an infinite loop. This has already happened in practice. + _as_can_reexec=no; export _as_can_reexec + # Don't try to exec as it changes $[0], causing all sort of problems + # (the dirname of $[0] is not the place where we might find the + # original and so on. Autoconf is especially sensitive to this). + . "./$as_me.lineno" + # Exit status is that of the last command. + exit +} + +ECHO_C= ECHO_N= ECHO_T= +case `echo -n x` in @%:@((((( +-n*) + case `echo 'xy\c'` in + *c*) ECHO_T=' ';; # ECHO_T is single tab character. + xy) ECHO_C='\c';; + *) echo `echo ksh88 bug on AIX 6.1` > /dev/null + ECHO_T=' ';; + esac;; +*) + ECHO_N='-n';; +esac + +rm -f conf$$ conf$$.exe conf$$.file +if test -d conf$$.dir; then + rm -f conf$$.dir/conf$$.file +else + rm -f conf$$.dir + mkdir conf$$.dir 2>/dev/null +fi +if (echo >conf$$.file) 2>/dev/null; then + if ln -s conf$$.file conf$$ 2>/dev/null; then + as_ln_s='ln -s' + # ... but there are two gotchas: + # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. 
+ # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. + # In both cases, we have to default to `cp -pR'. + ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || + as_ln_s='cp -pR' + elif ln conf$$.file conf$$ 2>/dev/null; then + as_ln_s=ln + else + as_ln_s='cp -pR' + fi +else + as_ln_s='cp -pR' +fi +rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file +rmdir conf$$.dir 2>/dev/null + +if mkdir -p . 2>/dev/null; then + as_mkdir_p='mkdir -p "$as_dir"' +else + test -d ./-p && rmdir ./-p + as_mkdir_p=false +fi + +as_test_x='test -x' +as_executable_p=as_fn_executable_p + +# Sed expression to map a string onto a valid CPP name. +as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" + +# Sed expression to map a string onto a valid variable name. +as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" + + +test -n "$DJDIR" || exec 7<&0 &1 + +# Name of the host. +# hostname on some systems (SVR3.2, old GNU/Linux) returns a bogus exit status, +# so uname gets run too. +ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q` + +# +# Initializations. +# +ac_default_prefix=/usr/local +ac_clean_files= +ac_config_libobj_dir=. +LIB@&t@OBJS= +cross_compiling=no +subdirs= +MFLAGS= +MAKEFLAGS= + +# Identity of this package. +PACKAGE_NAME='tmLQCD' +PACKAGE_TARNAME='tmlqcd' +PACKAGE_VERSION='5.2.0' +PACKAGE_STRING='tmLQCD 5.2.0' +PACKAGE_BUGREPORT='curbach@gmx.de' +PACKAGE_URL='' + +ac_unique_file="hmc_tm.c" +ac_default_prefix=$HOME +# Factoring default headers for most tests. 
+ac_includes_default="\ +#include +#ifdef HAVE_SYS_TYPES_H +# include +#endif +#ifdef HAVE_SYS_STAT_H +# include +#endif +#ifdef STDC_HEADERS +# include +# include +#else +# ifdef HAVE_STDLIB_H +# include +# endif +#endif +#ifdef HAVE_STRING_H +# if !defined STDC_HEADERS && defined HAVE_MEMORY_H +# include +# endif +# include +#endif +#ifdef HAVE_STRINGS_H +# include +#endif +#ifdef HAVE_INTTYPES_H +# include +#endif +#ifdef HAVE_STDINT_H +# include +#endif +#ifdef HAVE_UNISTD_H +# include +#endif" + +ac_subst_vars='LTLIBOBJS +QUDA_AVAILABLE +ac_ct_CXX +CXXFLAGS +CXX +GPUMPICOMPILER +GPUCFLAGS +GPUDIR +NVCC +USESUBDIRS +QUDA_INTERFACE +SPI_FILES +LEMON_AVAILABLE +XLIB +MEASDIR +XCHANGEDIR +XCHANGELIB +PROFILE_FLAG +DEBUG_FLAG +DEPFLAGS +CCLD +SOLVEROUT +AUTOCONF +INCLUDES +SOPTARGS +OPTARGS +LIB@&t@OBJS +OPENMP_CFLAGS +EGREP +GREP +CPP +CCDEP +RANLIB +SET_MAKE +LEXLIB +LEX_OUTPUT_ROOT +LEX +AR +FLIBS +ac_ct_F77 +FFLAGS +F77 +OBJEXT +EXEEXT +ac_ct_CC +CPPFLAGS +LDFLAGS +CFLAGS +CC +host_os +host_vendor +host_cpu +host +build_os +build_vendor +build_cpu +build +target_alias +host_alias +build_alias +LIBS +ECHO_T +ECHO_N +ECHO_C +DEFS +mandir +localedir +libdir +psdir +pdfdir +dvidir +htmldir +infodir +docdir +oldincludedir +includedir +localstatedir +sharedstatedir +sysconfdir +datadir +datarootdir +libexecdir +sbindir +bindir +program_transform_name +prefix +exec_prefix +PACKAGE_URL +PACKAGE_BUGREPORT +PACKAGE_STRING +PACKAGE_VERSION +PACKAGE_TARNAME +PACKAGE_NAME +PATH_SEPARATOR +SHELL' +ac_subst_files='' +ac_user_opts=' +enable_option_checking +enable_benchmark +with_limedir +with_lemondir +enable_indexindepgeom +enable_mpi +enable_qpx +enable_spi +enable_omp +enable_openmp +enable_fftw +with_mpidimension +with_persistentmpi +with_nonblockingmpi +with_fixedvolume +with_kojakinst +with_lapack +enable_largefile +enable_alignment +enable_p4 +enable_opteron +enable_sse2 +enable_sse3 +with_gprof +with_bgldram +enable_optimize +enable_gaugecopy +enable_halfspinor 
+enable_shmem +enable_tsplitpar +enable_laph +enable_gpu +with_cuda +with_cudacompileargs +with_qudadir +with_cudadir +' + ac_precious_vars='build_alias +host_alias +target_alias +CC +CFLAGS +LDFLAGS +LIBS +CPPFLAGS +F77 +FFLAGS +CPP +CXX +CXXFLAGS +CCC' + + +# Initialize some variables set by options. +ac_init_help= +ac_init_version=false +ac_unrecognized_opts= +ac_unrecognized_sep= +# The variables have the same names as the options, with +# dashes changed to underlines. +cache_file=/dev/null +exec_prefix=NONE +no_create= +no_recursion= +prefix=NONE +program_prefix=NONE +program_suffix=NONE +program_transform_name=s,x,x, +silent= +site= +srcdir= +verbose= +x_includes=NONE +x_libraries=NONE + +# Installation directory options. +# These are left unexpanded so users can "make install exec_prefix=/foo" +# and all the variables that are supposed to be based on exec_prefix +# by default will actually change. +# Use braces instead of parens because sh, perl, etc. also accept them. +# (The list follows the same order as the GNU Coding Standards.) +bindir='${exec_prefix}/bin' +sbindir='${exec_prefix}/sbin' +libexecdir='${exec_prefix}/libexec' +datarootdir='${prefix}/share' +datadir='${datarootdir}' +sysconfdir='${prefix}/etc' +sharedstatedir='${prefix}/com' +localstatedir='${prefix}/var' +includedir='${prefix}/include' +oldincludedir='/usr/include' +docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' +infodir='${datarootdir}/info' +htmldir='${docdir}' +dvidir='${docdir}' +pdfdir='${docdir}' +psdir='${docdir}' +libdir='${exec_prefix}/lib' +localedir='${datarootdir}/locale' +mandir='${datarootdir}/man' + +ac_prev= +ac_dashdash= +for ac_option +do + # If the previous option needs an argument, assign it. 
+ if test -n "$ac_prev"; then + eval $ac_prev=\$ac_option + ac_prev= + continue + fi + + case $ac_option in + *=?*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;; + *=) ac_optarg= ;; + *) ac_optarg=yes ;; + esac + + # Accept the important Cygnus configure options, so we can diagnose typos. + + case $ac_dashdash$ac_option in + --) + ac_dashdash=yes ;; + + -bindir | --bindir | --bindi | --bind | --bin | --bi) + ac_prev=bindir ;; + -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) + bindir=$ac_optarg ;; + + -build | --build | --buil | --bui | --bu) + ac_prev=build_alias ;; + -build=* | --build=* | --buil=* | --bui=* | --bu=*) + build_alias=$ac_optarg ;; + + -cache-file | --cache-file | --cache-fil | --cache-fi \ + | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) + ac_prev=cache_file ;; + -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ + | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) + cache_file=$ac_optarg ;; + + --config-cache | -C) + cache_file=config.cache ;; + + -datadir | --datadir | --datadi | --datad) + ac_prev=datadir ;; + -datadir=* | --datadir=* | --datadi=* | --datad=*) + datadir=$ac_optarg ;; + + -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \ + | --dataroo | --dataro | --datar) + ac_prev=datarootdir ;; + -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \ + | --dataroot=* | --dataroo=* | --dataro=* | --datar=*) + datarootdir=$ac_optarg ;; + + -disable-* | --disable-*) + ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? 
"invalid feature name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"enable_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval enable_$ac_useropt=no ;; + + -docdir | --docdir | --docdi | --doc | --do) + ac_prev=docdir ;; + -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*) + docdir=$ac_optarg ;; + + -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv) + ac_prev=dvidir ;; + -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*) + dvidir=$ac_optarg ;; + + -enable-* | --enable-*) + ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid feature name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"enable_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval enable_$ac_useropt=\$ac_optarg ;; + + -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ + | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ + | --exec | --exe | --ex) + ac_prev=exec_prefix ;; + -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ + | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ + | --exec=* | --exe=* | --ex=*) + exec_prefix=$ac_optarg ;; + + -gas | --gas | --ga | --g) + # Obsolete; use --with-gas. 
+ with_gas=yes ;; + + -help | --help | --hel | --he | -h) + ac_init_help=long ;; + -help=r* | --help=r* | --hel=r* | --he=r* | -hr*) + ac_init_help=recursive ;; + -help=s* | --help=s* | --hel=s* | --he=s* | -hs*) + ac_init_help=short ;; + + -host | --host | --hos | --ho) + ac_prev=host_alias ;; + -host=* | --host=* | --hos=* | --ho=*) + host_alias=$ac_optarg ;; + + -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht) + ac_prev=htmldir ;; + -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \ + | --ht=*) + htmldir=$ac_optarg ;; + + -includedir | --includedir | --includedi | --included | --include \ + | --includ | --inclu | --incl | --inc) + ac_prev=includedir ;; + -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ + | --includ=* | --inclu=* | --incl=* | --inc=*) + includedir=$ac_optarg ;; + + -infodir | --infodir | --infodi | --infod | --info | --inf) + ac_prev=infodir ;; + -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) + infodir=$ac_optarg ;; + + -libdir | --libdir | --libdi | --libd) + ac_prev=libdir ;; + -libdir=* | --libdir=* | --libdi=* | --libd=*) + libdir=$ac_optarg ;; + + -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ + | --libexe | --libex | --libe) + ac_prev=libexecdir ;; + -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ + | --libexe=* | --libex=* | --libe=*) + libexecdir=$ac_optarg ;; + + -localedir | --localedir | --localedi | --localed | --locale) + ac_prev=localedir ;; + -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*) + localedir=$ac_optarg ;; + + -localstatedir | --localstatedir | --localstatedi | --localstated \ + | --localstate | --localstat | --localsta | --localst | --locals) + ac_prev=localstatedir ;; + -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ + | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*) + localstatedir=$ac_optarg ;; + + 
-mandir | --mandir | --mandi | --mand | --man | --ma | --m) + ac_prev=mandir ;; + -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) + mandir=$ac_optarg ;; + + -nfp | --nfp | --nf) + # Obsolete; use --without-fp. + with_fp=no ;; + + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c | -n) + no_create=yes ;; + + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) + no_recursion=yes ;; + + -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ + | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ + | --oldin | --oldi | --old | --ol | --o) + ac_prev=oldincludedir ;; + -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ + | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ + | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) + oldincludedir=$ac_optarg ;; + + -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) + ac_prev=prefix ;; + -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) + prefix=$ac_optarg ;; + + -program-prefix | --program-prefix | --program-prefi | --program-pref \ + | --program-pre | --program-pr | --program-p) + ac_prev=program_prefix ;; + -program-prefix=* | --program-prefix=* | --program-prefi=* \ + | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) + program_prefix=$ac_optarg ;; + + -program-suffix | --program-suffix | --program-suffi | --program-suff \ + | --program-suf | --program-su | --program-s) + ac_prev=program_suffix ;; + -program-suffix=* | --program-suffix=* | --program-suffi=* \ + | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) + program_suffix=$ac_optarg ;; + + -program-transform-name | --program-transform-name \ + | --program-transform-nam | --program-transform-na \ + | --program-transform-n | --program-transform- \ + | --program-transform | 
--program-transfor \ + | --program-transfo | --program-transf \ + | --program-trans | --program-tran \ + | --progr-tra | --program-tr | --program-t) + ac_prev=program_transform_name ;; + -program-transform-name=* | --program-transform-name=* \ + | --program-transform-nam=* | --program-transform-na=* \ + | --program-transform-n=* | --program-transform-=* \ + | --program-transform=* | --program-transfor=* \ + | --program-transfo=* | --program-transf=* \ + | --program-trans=* | --program-tran=* \ + | --progr-tra=* | --program-tr=* | --program-t=*) + program_transform_name=$ac_optarg ;; + + -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd) + ac_prev=pdfdir ;; + -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*) + pdfdir=$ac_optarg ;; + + -psdir | --psdir | --psdi | --psd | --ps) + ac_prev=psdir ;; + -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*) + psdir=$ac_optarg ;; + + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + silent=yes ;; + + -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) + ac_prev=sbindir ;; + -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ + | --sbi=* | --sb=*) + sbindir=$ac_optarg ;; + + -sharedstatedir | --sharedstatedir | --sharedstatedi \ + | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ + | --sharedst | --shareds | --shared | --share | --shar \ + | --sha | --sh) + ac_prev=sharedstatedir ;; + -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ + | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ + | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ + | --sha=* | --sh=*) + sharedstatedir=$ac_optarg ;; + + -site | --site | --sit) + ac_prev=site ;; + -site=* | --site=* | --sit=*) + site=$ac_optarg ;; + + -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) + ac_prev=srcdir ;; + -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) + srcdir=$ac_optarg ;; + + 
-sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ + | --syscon | --sysco | --sysc | --sys | --sy) + ac_prev=sysconfdir ;; + -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ + | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*) + sysconfdir=$ac_optarg ;; + + -target | --target | --targe | --targ | --tar | --ta | --t) + ac_prev=target_alias ;; + -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) + target_alias=$ac_optarg ;; + + -v | -verbose | --verbose | --verbos | --verbo | --verb) + verbose=yes ;; + + -version | --version | --versio | --versi | --vers | -V) + ac_init_version=: ;; + + -with-* | --with-*) + ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid package name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"with_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval with_$ac_useropt=\$ac_optarg ;; + + -without-* | --without-*) + ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid package name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"with_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval with_$ac_useropt=no ;; + + --x) + # Obsolete; use --with-x. 
+ with_x=yes ;; + + -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ + | --x-incl | --x-inc | --x-in | --x-i) + ac_prev=x_includes ;; + -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ + | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) + x_includes=$ac_optarg ;; + + -x-libraries | --x-libraries | --x-librarie | --x-librari \ + | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) + ac_prev=x_libraries ;; + -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ + | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) + x_libraries=$ac_optarg ;; + + -*) as_fn_error $? "unrecognized option: \`$ac_option' +Try \`$0 --help' for more information" + ;; + + *=*) + ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='` + # Reject names that are not valid shell variable names. + case $ac_envvar in #( + '' | [0-9]* | *[!_$as_cr_alnum]* ) + as_fn_error $? "invalid variable name: \`$ac_envvar'" ;; + esac + eval $ac_envvar=\$ac_optarg + export $ac_envvar ;; + + *) + # FIXME: should be removed in autoconf 3.0. + $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2 + expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && + $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2 + : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}" + ;; + + esac +done + +if test -n "$ac_prev"; then + ac_option=--`echo $ac_prev | sed 's/_/-/g'` + as_fn_error $? "missing argument to $ac_option" +fi + +if test -n "$ac_unrecognized_opts"; then + case $enable_option_checking in + no) ;; + fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;; + *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;; + esac +fi + +# Check all directory arguments for consistency. 
+for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ + datadir sysconfdir sharedstatedir localstatedir includedir \ + oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ + libdir localedir mandir +do + eval ac_val=\$$ac_var + # Remove trailing slashes. + case $ac_val in + */ ) + ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'` + eval $ac_var=\$ac_val;; + esac + # Be sure to have absolute directory names. + case $ac_val in + [\\/$]* | ?:[\\/]* ) continue;; + NONE | '' ) case $ac_var in *prefix ) continue;; esac;; + esac + as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val" +done + +# There might be people who depend on the old broken behavior: `$host' +# used to hold the argument of --host etc. +# FIXME: To remove some day. +build=$build_alias +host=$host_alias +target=$target_alias + +# FIXME: To remove some day. +if test "x$host_alias" != x; then + if test "x$build_alias" = x; then + cross_compiling=maybe + elif test "x$build_alias" != "x$host_alias"; then + cross_compiling=yes + fi +fi + +ac_tool_prefix= +test -n "$host_alias" && ac_tool_prefix=$host_alias- + +test "$silent" = yes && exec 6>/dev/null + + +ac_pwd=`pwd` && test -n "$ac_pwd" && +ac_ls_di=`ls -di .` && +ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` || + as_fn_error $? "working directory cannot be determined" +test "X$ac_ls_di" = "X$ac_pwd_ls_di" || + as_fn_error $? "pwd does not report name of working directory" + + +# Find the source files, if location was not specified. +if test -z "$srcdir"; then + ac_srcdir_defaulted=yes + # Try the directory containing this script, then the parent directory. + ac_confdir=`$as_dirname -- "$as_myself" || +$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_myself" : 'X\(//\)[^/]' \| \ + X"$as_myself" : 'X\(//\)$' \| \ + X"$as_myself" : 'X\(/\)' \| . 
2>/dev/null || +$as_echo X"$as_myself" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + srcdir=$ac_confdir + if test ! -r "$srcdir/$ac_unique_file"; then + srcdir=.. + fi +else + ac_srcdir_defaulted=no +fi +if test ! -r "$srcdir/$ac_unique_file"; then + test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .." + as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir" +fi +ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work" +ac_abs_confdir=`( + cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg" + pwd)` +# When building in place, set srcdir=. +if test "$ac_abs_confdir" = "$ac_pwd"; then + srcdir=. +fi +# Remove unnecessary trailing slashes from srcdir. +# Double slashes in file names in object file debugging info +# mess up M-x gdb in Emacs. +case $srcdir in +*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;; +esac +for ac_var in $ac_precious_vars; do + eval ac_env_${ac_var}_set=\${${ac_var}+set} + eval ac_env_${ac_var}_value=\$${ac_var} + eval ac_cv_env_${ac_var}_set=\${${ac_var}+set} + eval ac_cv_env_${ac_var}_value=\$${ac_var} +done + +# +# Report the --help message. +# +if test "$ac_init_help" = "long"; then + # Omit some internal or obsolete options to make the list less imposing. + # This message is too long to be a string in the A/UX 3.1 sh. + cat <<_ACEOF +\`configure' configures tmLQCD 5.2.0 to adapt to many kinds of systems. + +Usage: $0 [OPTION]... [VAR=VALUE]... + +To assign environment variables (e.g., CC, CFLAGS...), specify them as +VAR=VALUE. See below for descriptions of some of the useful variables. + +Defaults for the options are specified in brackets. 
+ +Configuration: + -h, --help display this help and exit + --help=short display options specific to this package + --help=recursive display the short help of all the included packages + -V, --version display version information and exit + -q, --quiet, --silent do not print \`checking ...' messages + --cache-file=FILE cache test results in FILE [disabled] + -C, --config-cache alias for \`--cache-file=config.cache' + -n, --no-create do not create output files + --srcdir=DIR find the sources in DIR [configure dir or \`..'] + +Installation directories: + --prefix=PREFIX install architecture-independent files in PREFIX + @<:@@S|@ac_default_prefix@:>@ + --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX + @<:@PREFIX@:>@ + +By default, \`make install' will install all the files in +\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify +an installation prefix other than \`$ac_default_prefix' using \`--prefix', +for instance \`--prefix=\$HOME'. + +For better control, use the options below. 
+ +Fine tuning of the installation directories: + --bindir=DIR user executables [EPREFIX/bin] + --sbindir=DIR system admin executables [EPREFIX/sbin] + --libexecdir=DIR program executables [EPREFIX/libexec] + --sysconfdir=DIR read-only single-machine data [PREFIX/etc] + --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] + --localstatedir=DIR modifiable single-machine data [PREFIX/var] + --libdir=DIR object code libraries [EPREFIX/lib] + --includedir=DIR C header files [PREFIX/include] + --oldincludedir=DIR C header files for non-gcc [/usr/include] + --datarootdir=DIR read-only arch.-independent data root [PREFIX/share] + --datadir=DIR read-only architecture-independent data [DATAROOTDIR] + --infodir=DIR info documentation [DATAROOTDIR/info] + --localedir=DIR locale-dependent data [DATAROOTDIR/locale] + --mandir=DIR man documentation [DATAROOTDIR/man] + --docdir=DIR documentation root @<:@DATAROOTDIR/doc/tmlqcd@:>@ + --htmldir=DIR html documentation [DOCDIR] + --dvidir=DIR dvi documentation [DOCDIR] + --pdfdir=DIR pdf documentation [DOCDIR] + --psdir=DIR ps documentation [DOCDIR] +_ACEOF + + cat <<\_ACEOF + +Program names: + --program-prefix=PREFIX prepend PREFIX to installed program names + --program-suffix=SUFFIX append SUFFIX to installed program names + --program-transform-name=PROGRAM run sed PROGRAM on installed program names + +System types: + --build=BUILD configure for building on BUILD [guessed] + --host=HOST cross-compile to build programs to run on HOST [BUILD] +_ACEOF +fi + +if test -n "$ac_init_help"; then + case $ac_init_help in + short | recursive ) echo "Configuration of tmLQCD 5.2.0:";; + esac + cat <<\_ACEOF + +Optional Features: + --disable-option-checking ignore unrecognized --enable/--with options + --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no) + --enable-FEATURE[=ARG] include FEATURE [ARG=yes] + --enable-benchmark enable use of benchmark @<:@default=yes@:>@ + --enable-indexindepgeom enable Index 
independent addressing @<:@default=no@:>@ + --enable-mpi enable use of mpi @<:@default=yes@:>@ + --enable-qpx enable use of qpx intrinsics @<:@default=no@:>@ + --enable-spi enable use of SPI @<:@default=no@:>@ + --enable-omp enable use of OpenMP @<:@default=yes@:>@ + --disable-openmp do not use OpenMP + --enable-fftw enable use of fftw @<:@default=no@:>@ + --disable-largefile omit support for large files + --enable-alignment=n Automatically or expliclty align arrays to byte + number: auto, none, 16, 32 @<:@default=auto@:>@ + --enable-p4 enable use of P4 instructions @<:@default=no@:>@ + --enable-opteron enable use of Opteron instructions @<:@default=no@:>@ + --enable-sse2 enable use of SSE2 instructions @<:@default=no@:>@ + --enable-sse3 enable use of SSE3 instructions @<:@default=no@:>@ + --enable-optimize enable optimisation @<:@default=yes@:>@ + --enable-gaugecopy enable use of a copy of the gauge field + @<:@default=yes@:>@ + --enable-halfspinor use a Dirac Op. with halfspinor exchange + @<:@default=yes@:>@ + --enable-shmem use shmem API @<:@default=no@:>@ + --enable-tsplitpar enable timeslice-splitted communications + @<:@default=no@:>@ + --enable-laph enable computation of LapH eigensystem @<:@default=no@:>@ + --enable-gpu use GPU @<:@default=no@:>@ + +Optional Packages: + --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] + --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) + --with-limedir=dir search lime in dir @<:@default=./lime@:>@ + --with-lemondir=dir use lemon, to be found in dir + --with-mpidimension=n use n dimensional parallelisation @<:@default=1@:>@ + --with-persistentmpi use persistent MPI calls for halfspinor @<:@default=no@:>@ + --with-nonblockingmpi use non-blocking MPI calls for spinor and gauge + @<:@default=yes@:>@ + --with-fixedvolume fix volume at compiletime @<:@default=no@:>@ + --with-kojakinst instrumentalise for KOJAK @<:@default=no@:>@ + --with-lapack enable use of lapack @<:@default=yes@:>@ + --with-gprof use of gprof 
profiler @<:@default=no@:>@ + --with-bgldram use BGL dram window (BGL only!) @<:@default=yes@:>@ + --with-cuda=dir use CUDA GPU with lib dir + @<:@default=/usr/local/cuda/lib@:>@ + --with-cudacompileargs=string + use CUDA compile args @<:@default="--gpu-architecture + sm_13 --use_fast_math -O3"@:>@ + --with-qudadir=dir use QUDA, to be found in dir + --with-cudadir=dir if using QUDA, then set CUDA lib dir + @<:@default=/usr/local/cuda/lib@:>@ + +Some influential environment variables: + CC C compiler command + CFLAGS C compiler flags + LDFLAGS linker flags, e.g. -L if you have libraries in a + nonstandard directory + LIBS libraries to pass to the linker, e.g. -l + CPPFLAGS (Objective) C/C++ preprocessor flags, e.g. -I if + you have headers in a nonstandard directory + F77 Fortran 77 compiler command + FFLAGS Fortran 77 compiler flags + CPP C preprocessor + CXX C++ compiler command + CXXFLAGS C++ compiler flags + +Use these variables to override the choices made by `configure' or to help +it to find libraries and programs with nonstandard names/locations. + +Report bugs to . +_ACEOF +ac_status=$? +fi + +if test "$ac_init_help" = "recursive"; then + # If there are subdirs, report their specific --help. + for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue + test -d "$ac_dir" || + { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } || + continue + ac_builddir=. + +case "$ac_dir" in +.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; +*) + ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + # A ".." for each directory in $ac_dir_suffix. + ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + case $ac_top_builddir_sub in + "") ac_top_builddir_sub=. 
ac_top_build_prefix= ;; + *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; + esac ;; +esac +ac_abs_top_builddir=$ac_pwd +ac_abs_builddir=$ac_pwd$ac_dir_suffix +# for backward compatibility: +ac_top_builddir=$ac_top_build_prefix + +case $srcdir in + .) # We are building in place. + ac_srcdir=. + ac_top_srcdir=$ac_top_builddir_sub + ac_abs_top_srcdir=$ac_pwd ;; + [\\/]* | ?:[\\/]* ) # Absolute name. + ac_srcdir=$srcdir$ac_dir_suffix; + ac_top_srcdir=$srcdir + ac_abs_top_srcdir=$srcdir ;; + *) # Relative name. + ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix + ac_top_srcdir=$ac_top_build_prefix$srcdir + ac_abs_top_srcdir=$ac_pwd/$srcdir ;; +esac +ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix + + cd "$ac_dir" || { ac_status=$?; continue; } + # Check for guested configure. + if test -f "$ac_srcdir/configure.gnu"; then + echo && + $SHELL "$ac_srcdir/configure.gnu" --help=recursive + elif test -f "$ac_srcdir/configure"; then + echo && + $SHELL "$ac_srcdir/configure" --help=recursive + else + $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2 + fi || ac_status=$? + cd "$ac_pwd" || { ac_status=$?; break; } + done +fi + +test -n "$ac_init_help" && exit $ac_status +if $ac_init_version; then + cat <<\_ACEOF +tmLQCD configure 5.2.0 +generated by GNU Autoconf 2.69 + +Copyright (C) 2012 Free Software Foundation, Inc. +This configure script is free software; the Free Software Foundation +gives unlimited permission to copy, distribute and modify it. +_ACEOF + exit +fi + +## ------------------------ ## +## Autoconf initialization. ## +## ------------------------ ## + +@%:@ ac_fn_c_try_compile LINENO +@%:@ -------------------------- +@%:@ Try to compile conftest.@S|@ac_ext, and return whether this succeeded. 
+ac_fn_c_try_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext + if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} @%:@ ac_fn_c_try_compile + +@%:@ ac_fn_f77_try_compile LINENO +@%:@ ---------------------------- +@%:@ Try to compile conftest.@S|@ac_ext, and return whether this succeeded. +ac_fn_f77_try_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext + if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_f77_werror_flag" || + test ! 
-s conftest.err + } && test -s conftest.$ac_objext; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} @%:@ ac_fn_f77_try_compile + +@%:@ ac_fn_c_try_link LINENO +@%:@ ----------------------- +@%:@ Try to link conftest.@S|@ac_ext, and return whether this succeeded. +ac_fn_c_try_link () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext conftest$ac_exeext + if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest$ac_exeext && { + test "$cross_compiling" = yes || + test -x conftest$ac_exeext + }; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information + # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would + # interfere with the next link command; also delete a directory that is + # left behind by Apple's compiler. We do this before executing the actions. 
+ rm -rf conftest.dSYM conftest_ipa8_conftest.oo + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} @%:@ ac_fn_c_try_link + +@%:@ ac_fn_c_try_cpp LINENO +@%:@ ---------------------- +@%:@ Try to preprocess conftest.@S|@ac_ext, and return whether this succeeded. +ac_fn_c_try_cpp () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if { { ac_try="$ac_cpp conftest.$ac_ext" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_cpp conftest.$ac_ext") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } > conftest.i && { + test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" || + test ! -s conftest.err + }; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} @%:@ ac_fn_c_try_cpp + +@%:@ ac_fn_c_check_header_mongrel LINENO HEADER VAR INCLUDES +@%:@ ------------------------------------------------------- +@%:@ Tests whether HEADER exists, giving a warning if it cannot be compiled using +@%:@ the include files in INCLUDES and setting the cache variable VAR +@%:@ accordingly. +ac_fn_c_check_header_mongrel () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if eval \${$3+:} false; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... 
" >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +else + # Is the header compilable? +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 usability" >&5 +$as_echo_n "checking $2 usability... " >&6; } +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +@%:@include <$2> +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_header_compiler=yes +else + ac_header_compiler=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_compiler" >&5 +$as_echo "$ac_header_compiler" >&6; } + +# Is the header present? +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 presence" >&5 +$as_echo_n "checking $2 presence... " >&6; } +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +@%:@include <$2> +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + ac_header_preproc=yes +else + ac_header_preproc=no +fi +rm -f conftest.err conftest.i conftest.$ac_ext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_preproc" >&5 +$as_echo "$ac_header_preproc" >&6; } + +# So? What about this header? +case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in #(( + yes:no: ) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&5 +$as_echo "$as_me: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5 +$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;} + ;; + no:yes:* ) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: present but cannot be compiled" >&5 +$as_echo "$as_me: WARNING: $2: present but cannot be compiled" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: check for missing prerequisite headers?" 
>&5 +$as_echo "$as_me: WARNING: $2: check for missing prerequisite headers?" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: see the Autoconf documentation" >&5 +$as_echo "$as_me: WARNING: $2: see the Autoconf documentation" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: section \"Present But Cannot Be Compiled\"" >&5 +$as_echo "$as_me: WARNING: $2: section \"Present But Cannot Be Compiled\"" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5 +$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;} +( $as_echo "## ----------------------------- ## +## Report this to curbach@gmx.de ## +## ----------------------------- ##" + ) | sed "s/^/$as_me: WARNING: /" >&2 + ;; +esac + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + eval "$3=\$ac_header_compiler" +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} @%:@ ac_fn_c_check_header_mongrel + +@%:@ ac_fn_c_try_run LINENO +@%:@ ---------------------- +@%:@ Try to link conftest.@S|@ac_ext, and return whether this succeeded. Assumes +@%:@ that executables *can* be run. +ac_fn_c_try_run () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 + test $ac_status = 0; } && { ac_try='./conftest$ac_exeext' + { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then : + ac_retval=0 +else + $as_echo "$as_me: program exited with status $ac_status" >&5 + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=$ac_status +fi + rm -rf conftest.dSYM conftest_ipa8_conftest.oo + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} @%:@ ac_fn_c_try_run + +@%:@ ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES +@%:@ ------------------------------------------------------- +@%:@ Tests whether HEADER exists and can be compiled using the include files in +@%:@ INCLUDES, setting the cache variable VAR accordingly. +ac_fn_c_check_header_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +$4 +@%:@include <$2> +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + eval "$3=yes" +else + eval "$3=no" +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} @%:@ ac_fn_c_check_header_compile + +@%:@ ac_fn_c_check_type LINENO TYPE VAR INCLUDES +@%:@ ------------------------------------------- +@%:@ Tests whether TYPE exists after having included INCLUDES, setting cache +@%:@ variable VAR accordingly. +ac_fn_c_check_type () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + eval "$3=no" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +int +main () +{ +if (sizeof ($2)) + return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +int +main () +{ +if (sizeof (($2))) + return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + +else + eval "$3=yes" +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} @%:@ ac_fn_c_check_type + +@%:@ ac_fn_c_compute_int LINENO EXPR VAR INCLUDES +@%:@ -------------------------------------------- +@%:@ Tries to find the compile-time value of EXPR in a program that includes +@%:@ INCLUDES, setting VAR accordingly. 
Returns whether the value could be +@%:@ computed +ac_fn_c_compute_int () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if test "$cross_compiling" = yes; then + # Depending upon the size, compute the lo and hi bounds. +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +int +main () +{ +static int test_array @<:@1 - 2 * !(($2) >= 0)@:>@; +test_array @<:@0@:>@ = 0; +return test_array @<:@0@:>@; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_lo=0 ac_mid=0 + while :; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +int +main () +{ +static int test_array @<:@1 - 2 * !(($2) <= $ac_mid)@:>@; +test_array @<:@0@:>@ = 0; +return test_array @<:@0@:>@; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_hi=$ac_mid; break +else + as_fn_arith $ac_mid + 1 && ac_lo=$as_val + if test $ac_lo -le $ac_mid; then + ac_lo= ac_hi= + break + fi + as_fn_arith 2 '*' $ac_mid + 1 && ac_mid=$as_val +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + done +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +int +main () +{ +static int test_array @<:@1 - 2 * !(($2) < 0)@:>@; +test_array @<:@0@:>@ = 0; +return test_array @<:@0@:>@; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_hi=-1 ac_mid=-1 + while :; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +$4 +int +main () +{ +static int test_array @<:@1 - 2 * !(($2) >= $ac_mid)@:>@; +test_array @<:@0@:>@ = 0; +return test_array @<:@0@:>@; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_lo=$ac_mid; break +else + as_fn_arith '(' $ac_mid ')' - 1 && ac_hi=$as_val + if test $ac_mid -le $ac_hi; then + ac_lo= ac_hi= + break + fi + as_fn_arith 2 '*' $ac_mid && ac_mid=$as_val +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + done +else + ac_lo= ac_hi= +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +# Binary search between lo and hi bounds. +while test "x$ac_lo" != "x$ac_hi"; do + as_fn_arith '(' $ac_hi - $ac_lo ')' / 2 + $ac_lo && ac_mid=$as_val + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +int +main () +{ +static int test_array @<:@1 - 2 * !(($2) <= $ac_mid)@:>@; +test_array @<:@0@:>@ = 0; +return test_array @<:@0@:>@; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_hi=$ac_mid +else + as_fn_arith '(' $ac_mid ')' + 1 && ac_lo=$as_val +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +done +case $ac_lo in @%:@(( +?*) eval "$3=\$ac_lo"; ac_retval=0 ;; +'') ac_retval=1 ;; +esac + else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +static long int longval () { return $2; } +static unsigned long int ulongval () { return $2; } +@%:@include +@%:@include +int +main () +{ + + FILE *f = fopen ("conftest.val", "w"); + if (! f) + return 1; + if (($2) < 0) + { + long int i = longval (); + if (i != ($2)) + return 1; + fprintf (f, "%ld", i); + } + else + { + unsigned long int i = ulongval (); + if (i != ($2)) + return 1; + fprintf (f, "%lu", i); + } + /* Do not output a trailing newline, as this causes \r\n confusion + on some platforms. 
*/ + return ferror (f) || fclose (f) != 0; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + echo >>conftest.val; read $3 &5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +/* Define $2 to an innocuous variant, in case declares $2. + For example, HP-UX 11i declares gettimeofday. */ +#define $2 innocuous_$2 + +/* System header to define __stub macros and hopefully few prototypes, + which can conflict with char $2 (); below. + Prefer to if __STDC__ is defined, since + exists even on freestanding compilers. */ + +#ifdef __STDC__ +# include +#else +# include +#endif + +#undef $2 + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $2 (); +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. */ +#if defined __stub_$2 || defined __stub___$2 +choke me +#endif + +int +main () +{ +return $2 (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + eval "$3=yes" +else + eval "$3=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} @%:@ ac_fn_c_check_func + +@%:@ ac_fn_cxx_try_compile LINENO +@%:@ ---------------------------- +@%:@ Try to compile conftest.@S|@ac_ext, and return whether this succeeded. 
+ac_fn_cxx_try_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext + if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_cxx_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} @%:@ ac_fn_cxx_try_compile + +@%:@ ac_fn_cxx_try_link LINENO +@%:@ ------------------------- +@%:@ Try to link conftest.@S|@ac_ext, and return whether this succeeded. +ac_fn_cxx_try_link () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext conftest$ac_exeext + if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_cxx_werror_flag" || + test ! 
-s conftest.err + } && test -s conftest$ac_exeext && { + test "$cross_compiling" = yes || + test -x conftest$ac_exeext + }; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information + # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would + # interfere with the next link command; also delete a directory that is + # left behind by Apple's compiler. We do this before executing the actions. + rm -rf conftest.dSYM conftest_ipa8_conftest.oo + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} @%:@ ac_fn_cxx_try_link +cat >config.log <<_ACEOF +This file contains any messages produced by compilers while +running configure, to aid debugging if configure makes a mistake. + +It was created by tmLQCD $as_me 5.2.0, which was +generated by GNU Autoconf 2.69. Invocation command line was + + $ $0 $@ + +_ACEOF +exec 5>>config.log +{ +cat <<_ASUNAME +## --------- ## +## Platform. 
## +## --------- ## + +hostname = `(hostname || uname -n) 2>/dev/null | sed 1q` +uname -m = `(uname -m) 2>/dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown` + +/bin/arch = `(/bin/arch) 2>/dev/null || echo unknown` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown` +/usr/bin/hostinfo = `(/usr/bin/hostinfo) 2>/dev/null || echo unknown` +/bin/machine = `(/bin/machine) 2>/dev/null || echo unknown` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown` +/bin/universe = `(/bin/universe) 2>/dev/null || echo unknown` + +_ASUNAME + +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + $as_echo "PATH: $as_dir" + done +IFS=$as_save_IFS + +} >&5 + +cat >&5 <<_ACEOF + + +## ----------- ## +## Core tests. ## +## ----------- ## + +_ACEOF + + +# Keep a trace of the command line. +# Strip out --no-create and --no-recursion so they do not pile up. +# Strip out --silent because we don't want to record it for future runs. +# Also quote any args containing shell meta-characters. +# Make two passes to allow for proper duplicate-argument suppression. 
+ac_configure_args= +ac_configure_args0= +ac_configure_args1= +ac_must_keep_next=false +for ac_pass in 1 2 +do + for ac_arg + do + case $ac_arg in + -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;; + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + continue ;; + *\'*) + ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; + esac + case $ac_pass in + 1) as_fn_append ac_configure_args0 " '$ac_arg'" ;; + 2) + as_fn_append ac_configure_args1 " '$ac_arg'" + if test $ac_must_keep_next = true; then + ac_must_keep_next=false # Got value, back to normal. + else + case $ac_arg in + *=* | --config-cache | -C | -disable-* | --disable-* \ + | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \ + | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \ + | -with-* | --with-* | -without-* | --without-* | --x) + case "$ac_configure_args0 " in + "$ac_configure_args1"*" '$ac_arg' "* ) continue ;; + esac + ;; + -* ) ac_must_keep_next=true ;; + esac + fi + as_fn_append ac_configure_args " '$ac_arg'" + ;; + esac + done +done +{ ac_configure_args0=; unset ac_configure_args0;} +{ ac_configure_args1=; unset ac_configure_args1;} + +# When interrupted or exit'd, cleanup temporary files, and complete +# config.log. We remove comments because anyway the quotes in there +# would cause problems or look ugly. +# WARNING: Use '\'' to represent an apostrophe within the trap. +# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug. +trap 'exit_status=$? + # Save into config.log some information that might help in debugging. + { + echo + + $as_echo "## ---------------- ## +## Cache variables. 
## +## ---------------- ##" + echo + # The following way of writing the cache mishandles newlines in values, +( + for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do + eval ac_val=\$$ac_var + case $ac_val in #( + *${as_nl}*) + case $ac_var in #( + *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 +$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + esac + case $ac_var in #( + _ | IFS | as_nl) ;; #( + BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( + *) { eval $ac_var=; unset $ac_var;} ;; + esac ;; + esac + done + (set) 2>&1 | + case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #( + *${as_nl}ac_space=\ *) + sed -n \ + "s/'\''/'\''\\\\'\'''\''/g; + s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p" + ;; #( + *) + sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" + ;; + esac | + sort +) + echo + + $as_echo "## ----------------- ## +## Output variables. ## +## ----------------- ##" + echo + for ac_var in $ac_subst_vars + do + eval ac_val=\$$ac_var + case $ac_val in + *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + esac + $as_echo "$ac_var='\''$ac_val'\''" + done | sort + echo + + if test -n "$ac_subst_files"; then + $as_echo "## ------------------- ## +## File substitutions. ## +## ------------------- ##" + echo + for ac_var in $ac_subst_files + do + eval ac_val=\$$ac_var + case $ac_val in + *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + esac + $as_echo "$ac_var='\''$ac_val'\''" + done | sort + echo + fi + + if test -s confdefs.h; then + $as_echo "## ----------- ## +## confdefs.h. 
## +## ----------- ##" + echo + cat confdefs.h + echo + fi + test "$ac_signal" != 0 && + $as_echo "$as_me: caught signal $ac_signal" + $as_echo "$as_me: exit $exit_status" + } >&5 + rm -f core *.core core.conftest.* && + rm -f -r conftest* confdefs* conf$$* $ac_clean_files && + exit $exit_status +' 0 +for ac_signal in 1 2 13 15; do + trap 'ac_signal='$ac_signal'; as_fn_exit 1' $ac_signal +done +ac_signal=0 + +# confdefs.h avoids OS command line length limits that DEFS can exceed. +rm -f -r conftest* confdefs.h + +$as_echo "/* confdefs.h */" > confdefs.h + +# Predefined preprocessor variables. + +cat >>confdefs.h <<_ACEOF +@%:@define PACKAGE_NAME "$PACKAGE_NAME" +_ACEOF + +cat >>confdefs.h <<_ACEOF +@%:@define PACKAGE_TARNAME "$PACKAGE_TARNAME" +_ACEOF + +cat >>confdefs.h <<_ACEOF +@%:@define PACKAGE_VERSION "$PACKAGE_VERSION" +_ACEOF + +cat >>confdefs.h <<_ACEOF +@%:@define PACKAGE_STRING "$PACKAGE_STRING" +_ACEOF + +cat >>confdefs.h <<_ACEOF +@%:@define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" +_ACEOF + +cat >>confdefs.h <<_ACEOF +@%:@define PACKAGE_URL "$PACKAGE_URL" +_ACEOF + + +# Let the site file select an alternate cache file if it wants to. +# Prefer an explicitly selected file to automatically selected ones. +ac_site_file1=NONE +ac_site_file2=NONE +if test -n "$CONFIG_SITE"; then + # We do not want a PATH search for config.site. 
+ case $CONFIG_SITE in @%:@(( + -*) ac_site_file1=./$CONFIG_SITE;; + */*) ac_site_file1=$CONFIG_SITE;; + *) ac_site_file1=./$CONFIG_SITE;; + esac +elif test "x$prefix" != xNONE; then + ac_site_file1=$prefix/share/config.site + ac_site_file2=$prefix/etc/config.site +else + ac_site_file1=$ac_default_prefix/share/config.site + ac_site_file2=$ac_default_prefix/etc/config.site +fi +for ac_site_file in "$ac_site_file1" "$ac_site_file2" +do + test "x$ac_site_file" = xNONE && continue + if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5 +$as_echo "$as_me: loading site script $ac_site_file" >&6;} + sed 's/^/| /' "$ac_site_file" >&5 + . "$ac_site_file" \ + || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "failed to load site script $ac_site_file +See \`config.log' for more details" "$LINENO" 5; } + fi +done + +if test -r "$cache_file"; then + # Some versions of bash will fail to source /dev/null (special files + # actually), so we avoid doing that. DJGPP emulates it as a regular file. + if test /dev/null != "$cache_file" && test -f "$cache_file"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5 +$as_echo "$as_me: loading cache $cache_file" >&6;} + case $cache_file in + [\\/]* | ?:[\\/]* ) . "$cache_file";; + *) . "./$cache_file";; + esac + fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5 +$as_echo "$as_me: creating cache $cache_file" >&6;} + >$cache_file +fi + +# Check that the precious variables saved in the cache have kept the same +# value. 
+ac_cache_corrupted=false +for ac_var in $ac_precious_vars; do + eval ac_old_set=\$ac_cv_env_${ac_var}_set + eval ac_new_set=\$ac_env_${ac_var}_set + eval ac_old_val=\$ac_cv_env_${ac_var}_value + eval ac_new_val=\$ac_env_${ac_var}_value + case $ac_old_set,$ac_new_set in + set,) + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,set) + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,);; + *) + if test "x$ac_old_val" != "x$ac_new_val"; then + # differences in whitespace do not lead to failure. + ac_old_val_w=`echo x $ac_old_val` + ac_new_val_w=`echo x $ac_new_val` + if test "$ac_old_val_w" != "$ac_new_val_w"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5 +$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} + ac_cache_corrupted=: + else + { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 +$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} + eval $ac_var=\$ac_old_val + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: former value: \`$ac_old_val'" >&5 +$as_echo "$as_me: former value: \`$ac_old_val'" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: current value: \`$ac_new_val'" >&5 +$as_echo "$as_me: current value: \`$ac_new_val'" >&2;} + fi;; + esac + # Pass precious variables to config.status. + if test "$ac_new_set" = set; then + case $ac_new_val in + *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; + *) ac_arg=$ac_var=$ac_new_val ;; + esac + case " $ac_configure_args " in + *" '$ac_arg' "*) ;; # Avoid dups. 
Use of quotes ensures accuracy. + *) as_fn_append ac_configure_args " '$ac_arg'" ;; + esac + fi +done +if $ac_cache_corrupted; then + { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5 +$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;} + as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5 +fi +## -------------------- ## +## Main body of script. ## +## -------------------- ## + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + +ac_config_headers="$ac_config_headers config.h" + + +ac_aux_dir= +for ac_dir in "$srcdir" "$srcdir/.." "$srcdir/../.."; do + if test -f "$ac_dir/install-sh"; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install-sh -c" + break + elif test -f "$ac_dir/install.sh"; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install.sh -c" + break + elif test -f "$ac_dir/shtool"; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/shtool install -c" + break + fi +done +if test -z "$ac_aux_dir"; then + as_fn_error $? "cannot find install-sh, install.sh, or shtool in \"$srcdir\" \"$srcdir/..\" \"$srcdir/../..\"" "$LINENO" 5 +fi + +# These three variables are undocumented and unsupported, +# and are intended to be withdrawn in a future Autoconf release. +# They can cause serious problems if a builder's source tree is in a directory +# whose full name contains unusual characters. +ac_config_guess="$SHELL $ac_aux_dir/config.guess" # Please don't use this var. +ac_config_sub="$SHELL $ac_aux_dir/config.sub" # Please don't use this var. +ac_configure="$SHELL $ac_aux_dir/configure" # Please don't use this var. 
+ + +# Make sure we can run config.sub. +$SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 || + as_fn_error $? "cannot run $SHELL $ac_aux_dir/config.sub" "$LINENO" 5 + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking build system type" >&5 +$as_echo_n "checking build system type... " >&6; } +if ${ac_cv_build+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_build_alias=$build_alias +test "x$ac_build_alias" = x && + ac_build_alias=`$SHELL "$ac_aux_dir/config.guess"` +test "x$ac_build_alias" = x && + as_fn_error $? "cannot guess build type; you must specify one" "$LINENO" 5 +ac_cv_build=`$SHELL "$ac_aux_dir/config.sub" $ac_build_alias` || + as_fn_error $? "$SHELL $ac_aux_dir/config.sub $ac_build_alias failed" "$LINENO" 5 + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_build" >&5 +$as_echo "$ac_cv_build" >&6; } +case $ac_cv_build in +*-*-*) ;; +*) as_fn_error $? "invalid value of canonical build" "$LINENO" 5;; +esac +build=$ac_cv_build +ac_save_IFS=$IFS; IFS='-' +set x $ac_cv_build +shift +build_cpu=$1 +build_vendor=$2 +shift; shift +# Remember, the first character of IFS is used to create $*, +# except with old shells: +build_os=$* +IFS=$ac_save_IFS +case $build_os in *\ *) build_os=`echo "$build_os" | sed 's/ /-/g'`;; esac + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking host system type" >&5 +$as_echo_n "checking host system type... " >&6; } +if ${ac_cv_host+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "x$host_alias" = x; then + ac_cv_host=$ac_cv_build +else + ac_cv_host=`$SHELL "$ac_aux_dir/config.sub" $host_alias` || + as_fn_error $? "$SHELL $ac_aux_dir/config.sub $host_alias failed" "$LINENO" 5 +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_host" >&5 +$as_echo "$ac_cv_host" >&6; } +case $ac_cv_host in +*-*-*) ;; +*) as_fn_error $? 
"invalid value of canonical host" "$LINENO" 5;; +esac +host=$ac_cv_host +ac_save_IFS=$IFS; IFS='-' +set x $ac_cv_host +shift +host_cpu=$1 +host_vendor=$2 +shift; shift +# Remember, the first character of IFS is used to create $*, +# except with old shells: +host_os=$* +IFS=$ac_save_IFS +case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac + + + +test "$program_prefix" != NONE && + program_transform_name="s&^&$program_prefix&;$program_transform_name" +# Use a double $ so make ignores it. +test "$program_suffix" != NONE && + program_transform_name="s&\$&$program_suffix&;$program_transform_name" +# Double any \ or $. +# By default was `s,x,x', remove it if useless. +ac_script='s/[\\$]/&&/g;s/;s,x,x,$//' +program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"` + + +if test "$host_vendor" = "cray"; then + ac_cv_c_bigendian=yes +fi + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args. +set dummy ${ac_tool_prefix}gcc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CC="${ac_tool_prefix}gcc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_CC"; then + ac_ct_CC=$CC + # Extract the first word of "gcc", so it can be a program name with args. +set dummy gcc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CC"; then + ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_CC="gcc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_CC=$ac_cv_prog_ac_ct_CC +if test -n "$ac_ct_CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 +$as_echo "$ac_ct_CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_ct_CC" = x; then + CC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CC=$ac_ct_CC + fi +else + CC="$ac_cv_prog_CC" +fi + +if test -z "$CC"; then + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args. +set dummy ${ac_tool_prefix}cc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CC="${ac_tool_prefix}cc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + fi +fi +if test -z "$CC"; then + # Extract the first word of "cc", so it can be a program name with args. +set dummy cc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + ac_prog_rejected=no +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then + ac_prog_rejected=yes + continue + fi + ac_cv_prog_CC="cc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +if test $ac_prog_rejected = yes; then + # We found a bogon in the path, so make sure we never use it. + set dummy $ac_cv_prog_CC + shift + if test $@%:@ != 0; then + # We chose a different compiler from the bogus one. + # However, it has the same basename, so the bogon will be chosen + # first if we set CC to just the basename; use the full file name. 
+ shift + ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@" + fi +fi +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$CC"; then + if test -n "$ac_tool_prefix"; then + for ac_prog in cl.exe + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CC="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CC" && break + done +fi +if test -z "$CC"; then + ac_ct_CC=$CC + for ac_prog in cl.exe +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CC"; then + ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. 
+else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_CC="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_CC=$ac_cv_prog_ac_ct_CC +if test -n "$ac_ct_CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 +$as_echo "$ac_ct_CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_CC" && break +done + + if test "x$ac_ct_CC" = x; then + CC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CC=$ac_ct_CC + fi +fi + +fi + + +test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "no acceptable C compiler found in \$PATH +See \`config.log' for more details" "$LINENO" 5; } + +# Provide some information about the compiler. +$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5 +set X $ac_compile +ac_compiler=$2 +for ac_option in --version -v -V -qversion; do + { { ac_try="$ac_compiler $ac_option >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compiler $ac_option >&5") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + sed '10a\ +... rest of stderr output deleted ... 
+ 10q' conftest.err >conftest.er1 + cat conftest.er1 >&5 + fi + rm -f conftest.er1 conftest.err + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +done + +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +ac_clean_files_save=$ac_clean_files +ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out" +# Try to create an executable without -o first, disregard a.out. +# It will help us diagnose broken compilers, and finding out an intuition +# of exeext. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler works" >&5 +$as_echo_n "checking whether the C compiler works... " >&6; } +ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'` + +# The possible output files: +ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*" + +ac_rmfiles= +for ac_file in $ac_files +do + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; + * ) ac_rmfiles="$ac_rmfiles $ac_file";; + esac +done +rm -f $ac_rmfiles + +if { { ac_try="$ac_link_default" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link_default") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then : + # Autoconf-2.13 could set the ac_cv_exeext variable to `no'. +# So ignore a value of `no', otherwise this would lead to `EXEEXT = no' +# in a Makefile. We should not override ac_cv_exeext if it was cached, +# so that the user can short-circuit this test for compilers unknown to +# Autoconf. 
+for ac_file in $ac_files '' +do + test -f "$ac_file" || continue + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) + ;; + [ab].out ) + # We found the default executable, but exeext='' is most + # certainly right. + break;; + *.* ) + if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no; + then :; else + ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` + fi + # We set ac_cv_exeext here because the later test for it is not + # safe: cross compilers may not add the suffix if given an `-o' + # argument, so we may need to know it at that point already. + # Even if this section looks crufty: it has the advantage of + # actually working. + break;; + * ) + break;; + esac +done +test "$ac_cv_exeext" = no && ac_cv_exeext= + +else + ac_file='' +fi +if test -z "$ac_file"; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +$as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "C compiler cannot create executables +See \`config.log' for more details" "$LINENO" 5; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler default output file name" >&5 +$as_echo_n "checking for C compiler default output file name... " >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5 +$as_echo "$ac_file" >&6; } +ac_exeext=$ac_cv_exeext + +rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out +ac_clean_files=$ac_clean_files_save +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5 +$as_echo_n "checking for suffix of executables... 
" >&6; } +if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then : + # If both `conftest.exe' and `conftest' are `present' (well, observable) +# catch `conftest.exe'. For instance with Cygwin, `ls conftest' will +# work properly (i.e., refer to `conftest.exe'), while it won't with +# `rm'. +for ac_file in conftest.exe conftest conftest.*; do + test -f "$ac_file" || continue + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; + *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` + break;; + * ) break;; + esac +done +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot compute suffix of executables: cannot compile and link +See \`config.log' for more details" "$LINENO" 5; } +fi +rm -f conftest conftest$ac_cv_exeext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5 +$as_echo "$ac_cv_exeext" >&6; } + +rm -f conftest.$ac_ext +EXEEXT=$ac_cv_exeext +ac_exeext=$EXEEXT +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +@%:@include +int +main () +{ +FILE *f = fopen ("conftest.out", "w"); + return ferror (f) || fclose (f) != 0; + + ; + return 0; +} +_ACEOF +ac_clean_files="$ac_clean_files conftest.out" +# Check that the compiler produces executables we can run. If not, either +# the compiler is broken, or we cross compile. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5 +$as_echo_n "checking whether we are cross compiling... 
" >&6; } +if test "$cross_compiling" != yes; then + { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + if { ac_try='./conftest$ac_cv_exeext' + { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then + cross_compiling=no + else + if test "$cross_compiling" = maybe; then + cross_compiling=yes + else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot run C compiled programs. +If you meant to cross compile, use \`--host'. +See \`config.log' for more details" "$LINENO" 5; } + fi + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5 +$as_echo "$cross_compiling" >&6; } + +rm -f conftest.$ac_ext conftest$ac_cv_exeext conftest.out +ac_clean_files=$ac_clean_files_save +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5 +$as_echo_n "checking for suffix of object files... " >&6; } +if ${ac_cv_objext+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +rm -f conftest.o conftest.obj +if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>&5 + ac_status=$? 
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then : + for ac_file in conftest.o conftest.obj conftest.*; do + test -f "$ac_file" || continue; + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;; + *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'` + break;; + esac +done +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot compute suffix of object files: cannot compile +See \`config.log' for more details" "$LINENO" 5; } +fi +rm -f conftest.$ac_cv_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5 +$as_echo "$ac_cv_objext" >&6; } +OBJEXT=$ac_cv_objext +ac_objext=$OBJEXT +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5 +$as_echo_n "checking whether we are using the GNU C compiler... " >&6; } +if ${ac_cv_c_compiler_gnu+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +#ifndef __GNUC__ + choke me +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_compiler_gnu=yes +else + ac_compiler_gnu=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_cv_c_compiler_gnu=$ac_compiler_gnu + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5 +$as_echo "$ac_cv_c_compiler_gnu" >&6; } +if test $ac_compiler_gnu = yes; then + GCC=yes +else + GCC= +fi +ac_test_CFLAGS=${CFLAGS+set} +ac_save_CFLAGS=$CFLAGS +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5 +$as_echo_n "checking whether $CC accepts -g... 
" >&6; } +if ${ac_cv_prog_cc_g+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_save_c_werror_flag=$ac_c_werror_flag + ac_c_werror_flag=yes + ac_cv_prog_cc_g=no + CFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_g=yes +else + CFLAGS="" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + +else + ac_c_werror_flag=$ac_save_c_werror_flag + CFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_g=yes +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_c_werror_flag=$ac_save_c_werror_flag +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5 +$as_echo "$ac_cv_prog_cc_g" >&6; } +if test "$ac_test_CFLAGS" = set; then + CFLAGS=$ac_save_CFLAGS +elif test $ac_cv_prog_cc_g = yes; then + if test "$GCC" = yes; then + CFLAGS="-g -O2" + else + CFLAGS="-g" + fi +else + if test "$GCC" = yes; then + CFLAGS="-O2" + else + CFLAGS= + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5 +$as_echo_n "checking for $CC option to accept ISO C89... " >&6; } +if ${ac_cv_prog_cc_c89+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_prog_cc_c89=no +ac_save_CC=$CC +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +struct stat; +/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. 
*/ +struct buf { int x; }; +FILE * (*rcsopen) (struct buf *, struct stat *, int); +static char *e (p, i) + char **p; + int i; +{ + return p[i]; +} +static char *f (char * (*g) (char **, int), char **p, ...) +{ + char *s; + va_list v; + va_start (v,p); + s = g (p, va_arg (v,int)); + va_end (v); + return s; +} + +/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has + function prototypes and stuff, but not '\xHH' hex character constants. + These don't provoke an error unfortunately, instead are silently treated + as 'x'. The following induces an error, until -std is added to get + proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an + array size at least. It's necessary to write '\x00'==0 to get something + that's true only with -std. */ +int osf4_cc_array ['\x00' == 0 ? 1 : -1]; + +/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters + inside strings and character constants. */ +#define FOO(x) 'x' +int xlc6_cc_array[FOO(a) == 'x' ? 
1 : -1]; + +int test (int i, double x); +struct s1 {int (*f) (int a);}; +struct s2 {int (*f) (double a);}; +int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int); +int argc; +char **argv; +int +main () +{ +return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1]; + ; + return 0; +} +_ACEOF +for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \ + -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__" +do + CC="$ac_save_CC $ac_arg" + if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_c89=$ac_arg +fi +rm -f core conftest.err conftest.$ac_objext + test "x$ac_cv_prog_cc_c89" != "xno" && break +done +rm -f conftest.$ac_ext +CC=$ac_save_CC + +fi +# AC_CACHE_VAL +case "x$ac_cv_prog_cc_c89" in + x) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 +$as_echo "none needed" >&6; } ;; + xno) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 +$as_echo "unsupported" >&6; } ;; + *) + CC="$CC $ac_cv_prog_cc_c89" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5 +$as_echo "$ac_cv_prog_cc_c89" >&6; } ;; +esac +if test "x$ac_cv_prog_cc_c89" != xno; then : + +fi + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C99" >&5 +$as_echo_n "checking for $CC option to accept ISO C99... " >&6; } +if ${ac_cv_prog_cc_c99+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_prog_cc_c99=no +ac_save_CC=$CC +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#include +#include +#include + +// Check varargs macros. These examples are taken from C99 6.10.3.5. +#define debug(...) fprintf (stderr, __VA_ARGS__) +#define showlist(...) puts (#__VA_ARGS__) +#define report(test,...) ((test) ? 
puts (#test) : printf (__VA_ARGS__)) +static void +test_varargs_macros (void) +{ + int x = 1234; + int y = 5678; + debug ("Flag"); + debug ("X = %d\n", x); + showlist (The first, second, and third items.); + report (x>y, "x is %d but y is %d", x, y); +} + +// Check long long types. +#define BIG64 18446744073709551615ull +#define BIG32 4294967295ul +#define BIG_OK (BIG64 / BIG32 == 4294967297ull && BIG64 % BIG32 == 0) +#if !BIG_OK + your preprocessor is broken; +#endif +#if BIG_OK +#else + your preprocessor is broken; +#endif +static long long int bignum = -9223372036854775807LL; +static unsigned long long int ubignum = BIG64; + +struct incomplete_array +{ + int datasize; + double data[]; +}; + +struct named_init { + int number; + const wchar_t *name; + double average; +}; + +typedef const char *ccp; + +static inline int +test_restrict (ccp restrict text) +{ + // See if C++-style comments work. + // Iterate through items via the restricted pointer. + // Also check for declarations in for loops. + for (unsigned int i = 0; *(text+i) != '\0'; ++i) + continue; + return 0; +} + +// Check varargs and va_copy. +static void +test_varargs (const char *format, ...) +{ + va_list args; + va_start (args, format); + va_list args_copy; + va_copy (args_copy, args); + + const char *str; + int number; + float fnumber; + + while (*format) + { + switch (*format++) + { + case 's': // string + str = va_arg (args_copy, const char *); + break; + case 'd': // int + number = va_arg (args_copy, int); + break; + case 'f': // float + fnumber = va_arg (args_copy, double); + break; + default: + break; + } + } + va_end (args_copy); + va_end (args); +} + +int +main () +{ + + // Check bool. + _Bool success = false; + + // Check restrict. + if (test_restrict ("String literal") == 0) + success = true; + char *restrict newvar = "Another string"; + + // Check varargs. + test_varargs ("s, d' f .", "string", 65, 34.234); + test_varargs_macros (); + + // Check flexible array members. 
+ struct incomplete_array *ia = + malloc (sizeof (struct incomplete_array) + (sizeof (double) * 10)); + ia->datasize = 10; + for (int i = 0; i < ia->datasize; ++i) + ia->data[i] = i * 1.234; + + // Check named initializers. + struct named_init ni = { + .number = 34, + .name = L"Test wide string", + .average = 543.34343, + }; + + ni.number = 58; + + int dynamic_array[ni.number]; + dynamic_array[ni.number - 1] = 543; + + // work around unused variable warnings + return (!success || bignum == 0LL || ubignum == 0uLL || newvar[0] == 'x' + || dynamic_array[ni.number - 1] != 543); + + ; + return 0; +} +_ACEOF +for ac_arg in '' -std=gnu99 -std=c99 -c99 -AC99 -D_STDC_C99= -qlanglvl=extc99 +do + CC="$ac_save_CC $ac_arg" + if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_c99=$ac_arg +fi +rm -f core conftest.err conftest.$ac_objext + test "x$ac_cv_prog_cc_c99" != "xno" && break +done +rm -f conftest.$ac_ext +CC=$ac_save_CC + +fi +# AC_CACHE_VAL +case "x$ac_cv_prog_cc_c99" in + x) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 +$as_echo "none needed" >&6; } ;; + xno) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 +$as_echo "unsupported" >&6; } ;; + *) + CC="$CC $ac_cv_prog_cc_c99" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c99" >&5 +$as_echo "$ac_cv_prog_cc_c99" >&6; } ;; +esac +if test "x$ac_cv_prog_cc_c99" != xno; then : + +fi + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for an ANSI C-conforming const" >&5 +$as_echo_n "checking for an ANSI C-conforming const... " >&6; } +if ${ac_cv_c_const+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + +#ifndef __cplusplus + /* Ultrix mips cc rejects this sort of thing. */ + typedef int charset[2]; + const charset cs = { 0, 0 }; + /* SunOS 4.1.1 cc rejects this. */ + char const *const *pcpcc; + char **ppc; + /* NEC SVR4.0.2 mips cc rejects this. 
*/ + struct point {int x, y;}; + static struct point const zero = {0,0}; + /* AIX XL C 1.02.0.0 rejects this. + It does not let you subtract one const X* pointer from another in + an arm of an if-expression whose if-part is not a constant + expression */ + const char *g = "string"; + pcpcc = &g + (g ? g-g : 0); + /* HPUX 7.0 cc rejects these. */ + ++pcpcc; + ppc = (char**) pcpcc; + pcpcc = (char const *const *) ppc; + { /* SCO 3.2v4 cc rejects this sort of thing. */ + char tx; + char *t = &tx; + char const *s = 0 ? (char *) 0 : (char const *) 0; + + *t++ = 0; + if (s) return 0; + } + { /* Someone thinks the Sun supposedly-ANSI compiler will reject this. */ + int x[] = {25, 17}; + const int *foo = &x[0]; + ++foo; + } + { /* Sun SC1.0 ANSI compiler rejects this -- but not the above. */ + typedef const int *iptr; + iptr p = 0; + ++p; + } + { /* AIX XL C 1.02.0.0 rejects this sort of thing, saying + "k.c", line 2.27: 1506-025 (S) Operand must be a modifiable lvalue. */ + struct s { int j; const int *ap[3]; } bx; + struct s *b = &bx; b->j = 5; + } + { /* ULTRIX-32 V3.1 (Rev 9) vcc rejects this */ + const int foo = 10; + if (!foo) return 0; + } + return !cs[0] && !zero.x; +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_c_const=yes +else + ac_cv_c_const=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_const" >&5 +$as_echo "$ac_cv_c_const" >&6; } +if test $ac_cv_c_const = no; then + +$as_echo "@%:@define const /**/" >>confdefs.h + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for inline" >&5 +$as_echo_n "checking for inline... " >&6; } +if ${ac_cv_c_inline+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_c_inline=no +for ac_kw in inline __inline__ __inline; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#ifndef __cplusplus +typedef int foo_t; +static $ac_kw foo_t static_foo () {return 0; } +$ac_kw foo_t foo () {return 0; } +#endif + +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_c_inline=$ac_kw +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + test "$ac_cv_c_inline" != no && break +done + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_inline" >&5 +$as_echo "$ac_cv_c_inline" >&6; } + +case $ac_cv_c_inline in + inline | yes) ;; + *) + case $ac_cv_c_inline in + no) ac_val=;; + *) ac_val=$ac_cv_c_inline;; + esac + cat >>confdefs.h <<_ACEOF +#ifndef __cplusplus +#define inline $ac_val +#endif +_ACEOF + ;; +esac + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C/C++ restrict keyword" >&5 +$as_echo_n "checking for C/C++ restrict keyword... " >&6; } +if ${ac_cv_c_restrict+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_c_restrict=no + # The order here caters to the fact that C++ does not require restrict. + for ac_kw in __restrict __restrict__ _Restrict restrict; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +typedef int * int_ptr; + int foo (int_ptr $ac_kw ip) { + return ip[0]; + } +int +main () +{ +int s[1]; + int * $ac_kw t = s; + t[0] = 0; + return foo(t) + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_c_restrict=$ac_kw +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + test "$ac_cv_c_restrict" != no && break + done + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_restrict" >&5 +$as_echo "$ac_cv_c_restrict" >&6; } + + case $ac_cv_c_restrict in + restrict) ;; + no) $as_echo "@%:@define restrict /**/" >>confdefs.h + ;; + *) cat >>confdefs.h <<_ACEOF +@%:@define restrict $ac_cv_c_restrict +_ACEOF + ;; + esac + +ac_ext=f +ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu +if test -n "$ac_tool_prefix"; then + for ac_prog in g77 xlf f77 frt pgf77 cf77 fort77 fl32 af77 xlf90 f90 pgf90 pghpf epcf90 gfortran g95 xlf95 f95 fort ifort ifc efc pgfortran pgf95 lf95 ftn nagfor + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_F77+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$F77"; then + ac_cv_prog_F77="$F77" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_F77="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +F77=$ac_cv_prog_F77 +if test -n "$F77"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $F77" >&5 +$as_echo "$F77" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$F77" && break + done +fi +if test -z "$F77"; then + ac_ct_F77=$F77 + for ac_prog in g77 xlf f77 frt pgf77 cf77 fort77 fl32 af77 xlf90 f90 pgf90 pghpf epcf90 gfortran g95 xlf95 f95 fort ifort ifc efc pgfortran pgf95 lf95 ftn nagfor +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_F77+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_F77"; then + ac_cv_prog_ac_ct_F77="$ac_ct_F77" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_F77="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_F77=$ac_cv_prog_ac_ct_F77 +if test -n "$ac_ct_F77"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_F77" >&5 +$as_echo "$ac_ct_F77" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_F77" && break +done + + if test "x$ac_ct_F77" = x; then + F77="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + F77=$ac_ct_F77 + fi +fi + + +# Provide some information about the compiler. +$as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran 77 compiler version" >&5 +set X $ac_compile +ac_compiler=$2 +for ac_option in --version -v -V -qversion; do + { { ac_try="$ac_compiler $ac_option >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compiler $ac_option >&5") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + sed '10a\ +... rest of stderr output deleted ... + 10q' conftest.err >conftest.er1 + cat conftest.er1 >&5 + fi + rm -f conftest.er1 conftest.err + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +done +rm -f a.out + +# If we don't use `.F' as extension, the preprocessor is not run on the +# input file. (Note that this only needs to work for GNU compilers.) 
+ac_save_ext=$ac_ext +ac_ext=F +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU Fortran 77 compiler" >&5 +$as_echo_n "checking whether we are using the GNU Fortran 77 compiler... " >&6; } +if ${ac_cv_f77_compiler_gnu+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.$ac_ext <<_ACEOF + program main +#ifndef __GNUC__ + choke me +#endif + + end +_ACEOF +if ac_fn_f77_try_compile "$LINENO"; then : + ac_compiler_gnu=yes +else + ac_compiler_gnu=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_cv_f77_compiler_gnu=$ac_compiler_gnu + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_f77_compiler_gnu" >&5 +$as_echo "$ac_cv_f77_compiler_gnu" >&6; } +ac_ext=$ac_save_ext +ac_test_FFLAGS=${FFLAGS+set} +ac_save_FFLAGS=$FFLAGS +FFLAGS= +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $F77 accepts -g" >&5 +$as_echo_n "checking whether $F77 accepts -g... " >&6; } +if ${ac_cv_prog_f77_g+:} false; then : + $as_echo_n "(cached) " >&6 +else + FFLAGS=-g +cat > conftest.$ac_ext <<_ACEOF + program main + + end +_ACEOF +if ac_fn_f77_try_compile "$LINENO"; then : + ac_cv_prog_f77_g=yes +else + ac_cv_prog_f77_g=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_f77_g" >&5 +$as_echo "$ac_cv_prog_f77_g" >&6; } +if test "$ac_test_FFLAGS" = set; then + FFLAGS=$ac_save_FFLAGS +elif test $ac_cv_prog_f77_g = yes; then + if test "x$ac_cv_f77_compiler_gnu" = xyes; then + FFLAGS="-g -O2" + else + FFLAGS="-g" + fi +else + if test "x$ac_cv_f77_compiler_gnu" = xyes; then + FFLAGS="-O2" + else + FFLAGS= + fi +fi + +if test $ac_compiler_gnu = yes; then + G77=yes +else + G77= +fi +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + +ac_ext=f +ac_compile='$F77 
-c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to get verbose linking output from $F77" >&5 +$as_echo_n "checking how to get verbose linking output from $F77... " >&6; } +if ${ac_cv_prog_f77_v+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.$ac_ext <<_ACEOF + program main + + end +_ACEOF +if ac_fn_f77_try_compile "$LINENO"; then : + ac_cv_prog_f77_v= +# Try some options frequently used verbose output +for ac_verb in -v -verbose --verbose -V -\#\#\#; do + cat > conftest.$ac_ext <<_ACEOF + program main + + end +_ACEOF + +# Compile and link our simple test program by passing a flag (argument +# 1 to this macro) to the Fortran compiler in order to get +# "verbose" output that we can then parse for the Fortran linker +# flags. +ac_save_FFLAGS=$FFLAGS +FFLAGS="$FFLAGS $ac_verb" +eval "set x $ac_link" +shift +$as_echo "$as_me:${as_lineno-$LINENO}: $*" >&5 +# gfortran 4.3 outputs lines setting COLLECT_GCC_OPTIONS, COMPILER_PATH, +# LIBRARY_PATH; skip all such settings. +ac_f77_v_output=`eval $ac_link 5>&1 2>&1 | + sed '/^Driving:/d; /^Configured with:/d; + '"/^[_$as_cr_Letters][_$as_cr_alnum]*=/d"` +$as_echo "$ac_f77_v_output" >&5 +FFLAGS=$ac_save_FFLAGS + +rm -rf conftest* + +# On HP/UX there is a line like: "LPATH is: /foo:/bar:/baz" where +# /foo, /bar, and /baz are search directories for the Fortran linker. +# Here, we change these into -L/foo -L/bar -L/baz (and put it first): +ac_f77_v_output="`echo $ac_f77_v_output | + grep 'LPATH is:' | + sed 's|.*LPATH is\(: *[^ ]*\).*|\1|;s|: */| -L/|g'` $ac_f77_v_output" + +# FIXME: we keep getting bitten by quoted arguments; a more general fix +# that detects unbalanced quotes in FLIBS should be implemented +# and (ugh) tested at some point. 
+case $ac_f77_v_output in + # With xlf replace commas with spaces, + # and remove "-link" and closing parenthesis. + *xlfentry*) + ac_f77_v_output=`echo $ac_f77_v_output | + sed ' + s/,/ /g + s/ -link / /g + s/) *$// + ' + ` ;; + + # With Intel ifc, ignore the quoted -mGLOB_options_string stuff (quoted + # $LIBS confuse us, and the libraries appear later in the output anyway). + *mGLOB_options_string*) + ac_f77_v_output=`echo $ac_f77_v_output | sed 's/"-mGLOB[^"]*"/ /g'` ;; + + # Portland Group compiler has singly- or doubly-quoted -cmdline argument + # Singly-quoted arguments were reported for versions 5.2-4 and 6.0-4. + # Doubly-quoted arguments were reported for "PGF90/x86 Linux/x86 5.0-2". + *-cmdline\ * | *-ignore\ * | *-def\ *) + ac_f77_v_output=`echo $ac_f77_v_output | sed "\ + s/-cmdline *'[^']*'/ /g; s/-cmdline *\"[^\"]*\"/ /g + s/-ignore *'[^']*'/ /g; s/-ignore *\"[^\"]*\"/ /g + s/-def *'[^']*'/ /g; s/-def *\"[^\"]*\"/ /g"` ;; + + # If we are using fort77 (the f2c wrapper) then filter output and delete quotes. + *fort77*f2c*gcc*) + ac_f77_v_output=`echo "$ac_f77_v_output" | sed -n ' + /:[ ]\+Running[ ]\{1,\}"gcc"/{ + /"-c"/d + /[.]c"*/d + s/^.*"gcc"/"gcc"/ + s/"//gp + }'` ;; + + # If we are using Cray Fortran then delete quotes. 
+ *cft90*) + ac_f77_v_output=`echo $ac_f77_v_output | sed 's/"//g'` ;; +esac + + + # look for -l* and *.a constructs in the output + for ac_arg in $ac_f77_v_output; do + case $ac_arg in + [\\/]*.a | ?:[\\/]*.a | -[lLRu]*) + ac_cv_prog_f77_v=$ac_verb + break 2 ;; + esac + done +done +if test -z "$ac_cv_prog_f77_v"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cannot determine how to obtain linking information from $F77" >&5 +$as_echo "$as_me: WARNING: cannot determine how to obtain linking information from $F77" >&2;} +fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: compilation failed" >&5 +$as_echo "$as_me: WARNING: compilation failed" >&2;} +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_f77_v" >&5 +$as_echo "$ac_cv_prog_f77_v" >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran 77 libraries of $F77" >&5 +$as_echo_n "checking for Fortran 77 libraries of $F77... " >&6; } +if ${ac_cv_f77_libs+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "x$FLIBS" != "x"; then + ac_cv_f77_libs="$FLIBS" # Let the user override the test. +else + +cat > conftest.$ac_ext <<_ACEOF + program main + + end +_ACEOF + +# Compile and link our simple test program by passing a flag (argument +# 1 to this macro) to the Fortran compiler in order to get +# "verbose" output that we can then parse for the Fortran linker +# flags. +ac_save_FFLAGS=$FFLAGS +FFLAGS="$FFLAGS $ac_cv_prog_f77_v" +eval "set x $ac_link" +shift +$as_echo "$as_me:${as_lineno-$LINENO}: $*" >&5 +# gfortran 4.3 outputs lines setting COLLECT_GCC_OPTIONS, COMPILER_PATH, +# LIBRARY_PATH; skip all such settings. 
+ac_f77_v_output=`eval $ac_link 5>&1 2>&1 | + sed '/^Driving:/d; /^Configured with:/d; + '"/^[_$as_cr_Letters][_$as_cr_alnum]*=/d"` +$as_echo "$ac_f77_v_output" >&5 +FFLAGS=$ac_save_FFLAGS + +rm -rf conftest* + +# On HP/UX there is a line like: "LPATH is: /foo:/bar:/baz" where +# /foo, /bar, and /baz are search directories for the Fortran linker. +# Here, we change these into -L/foo -L/bar -L/baz (and put it first): +ac_f77_v_output="`echo $ac_f77_v_output | + grep 'LPATH is:' | + sed 's|.*LPATH is\(: *[^ ]*\).*|\1|;s|: */| -L/|g'` $ac_f77_v_output" + +# FIXME: we keep getting bitten by quoted arguments; a more general fix +# that detects unbalanced quotes in FLIBS should be implemented +# and (ugh) tested at some point. +case $ac_f77_v_output in + # With xlf replace commas with spaces, + # and remove "-link" and closing parenthesis. + *xlfentry*) + ac_f77_v_output=`echo $ac_f77_v_output | + sed ' + s/,/ /g + s/ -link / /g + s/) *$// + ' + ` ;; + + # With Intel ifc, ignore the quoted -mGLOB_options_string stuff (quoted + # $LIBS confuse us, and the libraries appear later in the output anyway). + *mGLOB_options_string*) + ac_f77_v_output=`echo $ac_f77_v_output | sed 's/"-mGLOB[^"]*"/ /g'` ;; + + # Portland Group compiler has singly- or doubly-quoted -cmdline argument + # Singly-quoted arguments were reported for versions 5.2-4 and 6.0-4. + # Doubly-quoted arguments were reported for "PGF90/x86 Linux/x86 5.0-2". + *-cmdline\ * | *-ignore\ * | *-def\ *) + ac_f77_v_output=`echo $ac_f77_v_output | sed "\ + s/-cmdline *'[^']*'/ /g; s/-cmdline *\"[^\"]*\"/ /g + s/-ignore *'[^']*'/ /g; s/-ignore *\"[^\"]*\"/ /g + s/-def *'[^']*'/ /g; s/-def *\"[^\"]*\"/ /g"` ;; + + # If we are using fort77 (the f2c wrapper) then filter output and delete quotes. + *fort77*f2c*gcc*) + ac_f77_v_output=`echo "$ac_f77_v_output" | sed -n ' + /:[ ]\+Running[ ]\{1,\}"gcc"/{ + /"-c"/d + /[.]c"*/d + s/^.*"gcc"/"gcc"/ + s/"//gp + }'` ;; + + # If we are using Cray Fortran then delete quotes. 
+ *cft90*) + ac_f77_v_output=`echo $ac_f77_v_output | sed 's/"//g'` ;; +esac + + + +ac_cv_f77_libs= + +# Save positional arguments (if any) +ac_save_positional="$@" + +set X $ac_f77_v_output +while test $@%:@ != 1; do + shift + ac_arg=$1 + case $ac_arg in + [\\/]*.a | ?:[\\/]*.a) + ac_exists=false + for ac_i in $ac_cv_f77_libs; do + if test x"$ac_arg" = x"$ac_i"; then + ac_exists=true + break + fi + done + + if test x"$ac_exists" = xtrue; then : + +else + ac_cv_f77_libs="$ac_cv_f77_libs $ac_arg" +fi + ;; + -bI:*) + ac_exists=false + for ac_i in $ac_cv_f77_libs; do + if test x"$ac_arg" = x"$ac_i"; then + ac_exists=true + break + fi + done + + if test x"$ac_exists" = xtrue; then : + +else + if test "$ac_compiler_gnu" = yes; then + for ac_link_opt in $ac_arg; do + ac_cv_f77_libs="$ac_cv_f77_libs -Xlinker $ac_link_opt" + done +else + ac_cv_f77_libs="$ac_cv_f77_libs $ac_arg" +fi +fi + ;; + # Ignore these flags. + -lang* | -lcrt*.o | -lc | -lgcc* | -lSystem | -libmil | -little \ + |-LANG:=* | -LIST:* | -LNO:* | -link) + ;; + -lkernel32) + case $host_os in + *cygwin*) ;; + *) ac_cv_f77_libs="$ac_cv_f77_libs $ac_arg" + ;; + esac + ;; + -[LRuYz]) + # These flags, when seen by themselves, take an argument. 
+ # We remove the space between option and argument and re-iterate + # unless we find an empty arg or a new option (starting with -) + case $2 in + "" | -*);; + *) + ac_arg="$ac_arg$2" + shift; shift + set X $ac_arg "$@" + ;; + esac + ;; + -YP,*) + for ac_j in `$as_echo "$ac_arg" | sed -e 's/-YP,/-L/;s/:/ -L/g'`; do + ac_exists=false + for ac_i in $ac_cv_f77_libs; do + if test x"$ac_j" = x"$ac_i"; then + ac_exists=true + break + fi + done + + if test x"$ac_exists" = xtrue; then : + +else + ac_arg="$ac_arg $ac_j" + ac_cv_f77_libs="$ac_cv_f77_libs $ac_j" +fi + done + ;; + -[lLR]*) + ac_exists=false + for ac_i in $ac_cv_f77_libs; do + if test x"$ac_arg" = x"$ac_i"; then + ac_exists=true + break + fi + done + + if test x"$ac_exists" = xtrue; then : + +else + ac_cv_f77_libs="$ac_cv_f77_libs $ac_arg" +fi + ;; + -zallextract*| -zdefaultextract) + ac_cv_f77_libs="$ac_cv_f77_libs $ac_arg" + ;; + # Ignore everything else. + esac +done +# restore positional arguments +set X $ac_save_positional; shift + +# We only consider "LD_RUN_PATH" on Solaris systems. If this is seen, +# then we insist that the "run path" must be an absolute path (i.e. it +# must begin with a "/"). 
+case `(uname -sr) 2>/dev/null` in + "SunOS 5"*) + ac_ld_run_path=`$as_echo "$ac_f77_v_output" | + sed -n 's,^.*LD_RUN_PATH *= *\(/[^ ]*\).*$,-R\1,p'` + test "x$ac_ld_run_path" != x && + if test "$ac_compiler_gnu" = yes; then + for ac_link_opt in $ac_ld_run_path; do + ac_cv_f77_libs="$ac_cv_f77_libs -Xlinker $ac_link_opt" + done +else + ac_cv_f77_libs="$ac_cv_f77_libs $ac_ld_run_path" +fi + ;; +esac +fi # test "x$[]_AC_LANG_PREFIX[]LIBS" = "x" + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_f77_libs" >&5 +$as_echo "$ac_cv_f77_libs" >&6; } +FLIBS="$ac_cv_f77_libs" + + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}ar", so it can be a program name with args. +set dummy ${ac_tool_prefix}ar; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_AR+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$AR"; then + ac_cv_prog_AR="$AR" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_AR="${ac_tool_prefix}ar" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +AR=$ac_cv_prog_AR +if test -n "$AR"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AR" >&5 +$as_echo "$AR" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_AR"; then + ac_ct_AR=$AR + # Extract the first word of "ar", so it can be a program name with args. +set dummy ar; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_AR+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_AR"; then + ac_cv_prog_ac_ct_AR="$ac_ct_AR" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_AR="ar" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_AR=$ac_cv_prog_ac_ct_AR +if test -n "$ac_ct_AR"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_AR" >&5 +$as_echo "$ac_ct_AR" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_ct_AR" = x; then + AR="ar" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + AR=$ac_ct_AR + fi +else + AR="$ac_cv_prog_AR" +fi + +LIBS="$LIBS $FLIBS -lm" + +for ac_prog in flex lex +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_LEX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$LEX"; then + ac_cv_prog_LEX="$LEX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_LEX="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +LEX=$ac_cv_prog_LEX +if test -n "$LEX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LEX" >&5 +$as_echo "$LEX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$LEX" && break +done +test -n "$LEX" || LEX=":" + +if test "x$LEX" != "x:"; then + cat >conftest.l <<_ACEOF +%% +a { ECHO; } +b { REJECT; } +c { yymore (); } +d { yyless (1); } +e { /* IRIX 6.5 flex 2.5.4 underquotes its yyless argument. */ + yyless ((input () != 0)); } +f { unput (yytext[0]); } +. { BEGIN INITIAL; } +%% +#ifdef YYTEXT_POINTER +extern char *yytext; +#endif +int +main (void) +{ + return ! yylex () + ! yywrap (); +} +_ACEOF +{ { ac_try="$LEX conftest.l" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$LEX conftest.l") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking lex output file root" >&5 +$as_echo_n "checking lex output file root... " >&6; } +if ${ac_cv_prog_lex_root+:} false; then : + $as_echo_n "(cached) " >&6 +else + +if test -f lex.yy.c; then + ac_cv_prog_lex_root=lex.yy +elif test -f lexyy.c; then + ac_cv_prog_lex_root=lexyy +else + as_fn_error $? 
"cannot find output from $LEX; giving up" "$LINENO" 5 +fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_lex_root" >&5 +$as_echo "$ac_cv_prog_lex_root" >&6; } +LEX_OUTPUT_ROOT=$ac_cv_prog_lex_root + +if test -z "${LEXLIB+set}"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking lex library" >&5 +$as_echo_n "checking lex library... " >&6; } +if ${ac_cv_lib_lex+:} false; then : + $as_echo_n "(cached) " >&6 +else + + ac_save_LIBS=$LIBS + ac_cv_lib_lex='none needed' + for ac_lib in '' -lfl -ll; do + LIBS="$ac_lib $ac_save_LIBS" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +`cat $LEX_OUTPUT_ROOT.c` +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_lex=$ac_lib +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + test "$ac_cv_lib_lex" != 'none needed' && break + done + LIBS=$ac_save_LIBS + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_lex" >&5 +$as_echo "$ac_cv_lib_lex" >&6; } + test "$ac_cv_lib_lex" != 'none needed' && LEXLIB=$ac_cv_lib_lex +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether yytext is a pointer" >&5 +$as_echo_n "checking whether yytext is a pointer... " >&6; } +if ${ac_cv_prog_lex_yytext_pointer+:} false; then : + $as_echo_n "(cached) " >&6 +else + # POSIX says lex can declare yytext either as a pointer or an array; the +# default is implementation-dependent. Figure out which it is, since +# not all implementations provide the %pointer and %array declarations. +ac_cv_prog_lex_yytext_pointer=no +ac_save_LIBS=$LIBS +LIBS="$LEXLIB $ac_save_LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + + #define YYTEXT_POINTER 1 +`cat $LEX_OUTPUT_ROOT.c` +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_prog_lex_yytext_pointer=yes +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_save_LIBS + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_lex_yytext_pointer" >&5 +$as_echo "$ac_cv_prog_lex_yytext_pointer" >&6; } +if test $ac_cv_prog_lex_yytext_pointer = yes; then + +$as_echo "@%:@define YYTEXT_POINTER 1" >>confdefs.h + +fi +rm -f conftest.l $LEX_OUTPUT_ROOT.c + +fi +if test "$LEX" = ":"; then + as_fn_error $? "(F)LEX is required for building read_input.c. Please install it and run configure again." "$LINENO" 5 +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} sets \$(MAKE)" >&5 +$as_echo_n "checking whether ${MAKE-make} sets \$(MAKE)... " >&6; } +set x ${MAKE-make} +ac_make=`$as_echo "$2" | sed 's/+/p/g; s/[^a-zA-Z0-9_]/_/g'` +if eval \${ac_cv_prog_make_${ac_make}_set+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat >conftest.make <<\_ACEOF +SHELL = /bin/sh +all: + @echo '@@@%%%=$(MAKE)=@@@%%%' +_ACEOF +# GNU make sometimes prints "make[1]: Entering ...", which would confuse us. +case `${MAKE-make} -f conftest.make 2>/dev/null` in + *@@@%%%=?*=@@@%%%*) + eval ac_cv_prog_make_${ac_make}_set=yes;; + *) + eval ac_cv_prog_make_${ac_make}_set=no;; +esac +rm -f conftest.make +fi +if eval test \$ac_cv_prog_make_${ac_make}_set = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + SET_MAKE= +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + SET_MAKE="MAKE=${MAKE-make}" +fi + +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args. +set dummy ${ac_tool_prefix}ranlib; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... 
" >&6; } +if ${ac_cv_prog_RANLIB+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$RANLIB"; then + ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +RANLIB=$ac_cv_prog_RANLIB +if test -n "$RANLIB"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $RANLIB" >&5 +$as_echo "$RANLIB" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_RANLIB"; then + ac_ct_RANLIB=$RANLIB + # Extract the first word of "ranlib", so it can be a program name with args. +set dummy ranlib; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_RANLIB+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_RANLIB"; then + ac_cv_prog_ac_ct_RANLIB="$ac_ct_RANLIB" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_RANLIB="ranlib" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_RANLIB=$ac_cv_prog_ac_ct_RANLIB +if test -n "$ac_ct_RANLIB"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_RANLIB" >&5 +$as_echo "$ac_ct_RANLIB" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_ct_RANLIB" = x; then + RANLIB=":" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + RANLIB=$ac_ct_RANLIB + fi +else + RANLIB="$ac_cv_prog_RANLIB" +fi + +# Extract the first word of "gcc", so it can be a program name with args. +set dummy gcc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CCDEP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CCDEP"; then + ac_cv_prog_CCDEP="$CCDEP" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CCDEP=""gcc"" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + test -z "$ac_cv_prog_CCDEP" && ac_cv_prog_CCDEP=""$CC"" +fi +fi +CCDEP=$ac_cv_prog_CCDEP +if test -n "$CCDEP"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CCDEP" >&5 +$as_echo "$CCDEP" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +#(endian="", AC_DEFINE(LITTLE_ENDIAN,1,The endian of the architechture)) + +# AC_PROG_FC([ifort gfortran]) +# AC_FC_FUNC(testfunc, ) + +LDFLAGS="$LDFLAGS -L\${HOME}/lib -L\${top_builddir}/lib" +CCLD=${CC} + +# compilation in operator is slowest so we do it first, saves time in parallel compiles +USESUBDIRS="operator linalg solver monomial buffers cu io meas xchange init rational wrapper" + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C preprocessor" >&5 +$as_echo_n "checking how to run the C preprocessor... " >&6; } +# On Suns, sometimes $CPP names a directory. +if test -n "$CPP" && test -d "$CPP"; then + CPP= +fi +if test -z "$CPP"; then + if ${ac_cv_prog_CPP+:} false; then : + $as_echo_n "(cached) " >&6 +else + # Double quotes because CPP needs to be expanded + for CPP in "$CC -E" "$CC -E -traditional-cpp" "/lib/cpp" + do + ac_preproc_ok=false +for ac_c_preproc_warn_flag in '' yes +do + # Use a header file that comes with gcc, so configuring glibc + # with a fresh cross-compiler works. + # Prefer to if __STDC__ is defined, since + # exists even on freestanding compilers. 
+ # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. "Syntax error" is here to catch this case. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +@%:@ifdef __STDC__ +@%:@ include +@%:@else +@%:@ include +@%:@endif + Syntax error +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + +else + # Broken: fails on valid input. +continue +fi +rm -f conftest.err conftest.i conftest.$ac_ext + + # OK, works on sane cases. Now check whether nonexistent headers + # can be detected and how. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +@%:@include +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + # Broken: success on invalid input. +continue +else + # Passes both tests. +ac_preproc_ok=: +break +fi +rm -f conftest.err conftest.i conftest.$ac_ext + +done +# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. +rm -f conftest.i conftest.err conftest.$ac_ext +if $ac_preproc_ok; then : + break +fi + + done + ac_cv_prog_CPP=$CPP + +fi + CPP=$ac_cv_prog_CPP +else + ac_cv_prog_CPP=$CPP +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CPP" >&5 +$as_echo "$CPP" >&6; } +ac_preproc_ok=false +for ac_c_preproc_warn_flag in '' yes +do + # Use a header file that comes with gcc, so configuring glibc + # with a fresh cross-compiler works. + # Prefer to if __STDC__ is defined, since + # exists even on freestanding compilers. + # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. "Syntax error" is here to catch this case. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +@%:@ifdef __STDC__ +@%:@ include +@%:@else +@%:@ include +@%:@endif + Syntax error +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + +else + # Broken: fails on valid input. +continue +fi +rm -f conftest.err conftest.i conftest.$ac_ext + + # OK, works on sane cases. Now check whether nonexistent headers + # can be detected and how. 
+ cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +@%:@include +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + # Broken: success on invalid input. +continue +else + # Passes both tests. +ac_preproc_ok=: +break +fi +rm -f conftest.err conftest.i conftest.$ac_ext + +done +# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. +rm -f conftest.i conftest.err conftest.$ac_ext +if $ac_preproc_ok; then : + +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "C preprocessor \"$CPP\" fails sanity check +See \`config.log' for more details" "$LINENO" 5; } +fi + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5 +$as_echo_n "checking for grep that handles long lines and -e... " >&6; } +if ${ac_cv_path_GREP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -z "$GREP"; then + ac_path_GREP_found=false + # Loop through the user's path and test for each of PROGNAME-LIST + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_prog in grep ggrep; do + for ac_exec_ext in '' $ac_executable_extensions; do + ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext" + as_fn_executable_p "$ac_path_GREP" || continue +# Check for GNU ac_path_GREP and select it if it is found. 
+ # Check for GNU $ac_path_GREP +case `"$ac_path_GREP" --version 2>&1` in +*GNU*) + ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;; +*) + ac_count=0 + $as_echo_n 0123456789 >"conftest.in" + while : + do + cat "conftest.in" "conftest.in" >"conftest.tmp" + mv "conftest.tmp" "conftest.in" + cp "conftest.in" "conftest.nl" + $as_echo 'GREP' >> "conftest.nl" + "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break + diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break + as_fn_arith $ac_count + 1 && ac_count=$as_val + if test $ac_count -gt ${ac_path_GREP_max-0}; then + # Best one so far, save it but keep looking for a better one + ac_cv_path_GREP="$ac_path_GREP" + ac_path_GREP_max=$ac_count + fi + # 10*(2^10) chars as input seems more than enough + test $ac_count -gt 10 && break + done + rm -f conftest.in conftest.tmp conftest.nl conftest.out;; +esac + + $ac_path_GREP_found && break 3 + done + done + done +IFS=$as_save_IFS + if test -z "$ac_cv_path_GREP"; then + as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 + fi +else + ac_cv_path_GREP=$GREP +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5 +$as_echo "$ac_cv_path_GREP" >&6; } + GREP="$ac_cv_path_GREP" + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5 +$as_echo_n "checking for egrep... " >&6; } +if ${ac_cv_path_EGREP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if echo a | $GREP -E '(a|b)' >/dev/null 2>&1 + then ac_cv_path_EGREP="$GREP -E" + else + if test -z "$EGREP"; then + ac_path_EGREP_found=false + # Loop through the user's path and test for each of PROGNAME-LIST + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_prog in egrep; do + for ac_exec_ext in '' $ac_executable_extensions; do + ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext" + as_fn_executable_p "$ac_path_EGREP" || continue +# Check for GNU ac_path_EGREP and select it if it is found. + # Check for GNU $ac_path_EGREP +case `"$ac_path_EGREP" --version 2>&1` in +*GNU*) + ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;; +*) + ac_count=0 + $as_echo_n 0123456789 >"conftest.in" + while : + do + cat "conftest.in" "conftest.in" >"conftest.tmp" + mv "conftest.tmp" "conftest.in" + cp "conftest.in" "conftest.nl" + $as_echo 'EGREP' >> "conftest.nl" + "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break + diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break + as_fn_arith $ac_count + 1 && ac_count=$as_val + if test $ac_count -gt ${ac_path_EGREP_max-0}; then + # Best one so far, save it but keep looking for a better one + ac_cv_path_EGREP="$ac_path_EGREP" + ac_path_EGREP_max=$ac_count + fi + # 10*(2^10) chars as input seems more than enough + test $ac_count -gt 10 && break + done + rm -f conftest.in conftest.tmp conftest.nl conftest.out;; +esac + + $ac_path_EGREP_found && break 3 + done + done + done +IFS=$as_save_IFS + if test -z "$ac_cv_path_EGREP"; then + as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 + fi +else + ac_cv_path_EGREP=$EGREP +fi + + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5 +$as_echo "$ac_cv_path_EGREP" >&6; } + EGREP="$ac_cv_path_EGREP" + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5 +$as_echo_n "checking for ANSI C header files... " >&6; } +if ${ac_cv_header_stdc+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include +#include +#include +#include + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_header_stdc=yes +else + ac_cv_header_stdc=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +if test $ac_cv_header_stdc = yes; then + # SunOS 4.x string.h does not declare mem*, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "memchr" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "free" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. + if test "$cross_compiling" = yes; then : + : +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#if ((' ' & 0x0FF) == 0x020) +# define ISLOWER(c) ('a' <= (c) && (c) <= 'z') +# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) +#else +# define ISLOWER(c) \ + (('a' <= (c) && (c) <= 'i') \ + || ('j' <= (c) && (c) <= 'r') \ + || ('s' <= (c) && (c) <= 'z')) +# define TOUPPER(c) (ISLOWER(c) ? 
((c) | 0x40) : (c)) +#endif + +#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) +int +main () +{ + int i; + for (i = 0; i < 256; i++) + if (XOR (islower (i), ISLOWER (i)) + || toupper (i) != TOUPPER (i)) + return 2; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + +else + ac_cv_header_stdc=no +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + +fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5 +$as_echo "$ac_cv_header_stdc" >&6; } +if test $ac_cv_header_stdc = yes; then + +$as_echo "@%:@define STDC_HEADERS 1" >>confdefs.h + +fi + +# On IRIX 5.3, sys/types and inttypes.h are conflicting. +for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \ + inttypes.h stdint.h unistd.h +do : + as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` +ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default +" +if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : + cat >>confdefs.h <<_ACEOF +@%:@define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 +_ACEOF + +fi + +done + + +for ac_header in stdint.h +do : + ac_fn_c_check_header_mongrel "$LINENO" "stdint.h" "ac_cv_header_stdint_h" "$ac_includes_default" +if test "x$ac_cv_header_stdint_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +@%:@define HAVE_STDINT_H 1 +_ACEOF + ac_fn_c_check_type "$LINENO" "uint16_t" "ac_cv_type_uint16_t" "$ac_includes_default" +if test "x$ac_cv_type_uint16_t" = xyes; then : + +cat >>confdefs.h <<_ACEOF +@%:@define HAVE_UINT16_T 1 +_ACEOF + + +else + as_fn_error $? "stdint.h found but either uint16_t, uint32_t or uint64_t not found" "$LINENO" 5 + +fi +ac_fn_c_check_type "$LINENO" "uint32_t" "ac_cv_type_uint32_t" "$ac_includes_default" +if test "x$ac_cv_type_uint32_t" = xyes; then : + +cat >>confdefs.h <<_ACEOF +@%:@define HAVE_UINT32_T 1 +_ACEOF + + +else + as_fn_error $? 
"stdint.h found but either uint16_t, uint32_t or uint64_t not found" "$LINENO" 5 + +fi +ac_fn_c_check_type "$LINENO" "uint64_t" "ac_cv_type_uint64_t" "$ac_includes_default" +if test "x$ac_cv_type_uint64_t" = xyes; then : + +cat >>confdefs.h <<_ACEOF +@%:@define HAVE_UINT64_T 1 +_ACEOF + + +else + as_fn_error $? "stdint.h found but either uint16_t, uint32_t or uint64_t not found" "$LINENO" 5 + +fi + + +else + + # The cast to long int works around a bug in the HP C Compiler +# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects +# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'. +# This bug is HP SR number 8606223364. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of unsigned char" >&5 +$as_echo_n "checking size of unsigned char... " >&6; } +if ${ac_cv_sizeof_unsigned_char+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (unsigned char))" "ac_cv_sizeof_unsigned_char" "$ac_includes_default"; then : + +else + if test "$ac_cv_type_unsigned_char" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute sizeof (unsigned char) +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_sizeof_unsigned_char=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_unsigned_char" >&5 +$as_echo "$ac_cv_sizeof_unsigned_char" >&6; } + + + +cat >>confdefs.h <<_ACEOF +@%:@define SIZEOF_UNSIGNED_CHAR $ac_cv_sizeof_unsigned_char +_ACEOF + + + # The cast to long int works around a bug in the HP C Compiler +# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects +# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'. +# This bug is HP SR number 8606223364. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of unsigned short" >&5 +$as_echo_n "checking size of unsigned short... 
" >&6; } +if ${ac_cv_sizeof_unsigned_short+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (unsigned short))" "ac_cv_sizeof_unsigned_short" "$ac_includes_default"; then : + +else + if test "$ac_cv_type_unsigned_short" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute sizeof (unsigned short) +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_sizeof_unsigned_short=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_unsigned_short" >&5 +$as_echo "$ac_cv_sizeof_unsigned_short" >&6; } + + + +cat >>confdefs.h <<_ACEOF +@%:@define SIZEOF_UNSIGNED_SHORT $ac_cv_sizeof_unsigned_short +_ACEOF + + + # The cast to long int works around a bug in the HP C Compiler +# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects +# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'. +# This bug is HP SR number 8606223364. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of unsigned int" >&5 +$as_echo_n "checking size of unsigned int... 
" >&6; } +if ${ac_cv_sizeof_unsigned_int+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (unsigned int))" "ac_cv_sizeof_unsigned_int" "$ac_includes_default"; then : + +else + if test "$ac_cv_type_unsigned_int" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute sizeof (unsigned int) +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_sizeof_unsigned_int=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_unsigned_int" >&5 +$as_echo "$ac_cv_sizeof_unsigned_int" >&6; } + + + +cat >>confdefs.h <<_ACEOF +@%:@define SIZEOF_UNSIGNED_INT $ac_cv_sizeof_unsigned_int +_ACEOF + + + # The cast to long int works around a bug in the HP C Compiler +# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects +# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'. +# This bug is HP SR number 8606223364. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of unsigned long" >&5 +$as_echo_n "checking size of unsigned long... 
" >&6; } +if ${ac_cv_sizeof_unsigned_long+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (unsigned long))" "ac_cv_sizeof_unsigned_long" "$ac_includes_default"; then : + +else + if test "$ac_cv_type_unsigned_long" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute sizeof (unsigned long) +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_sizeof_unsigned_long=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_unsigned_long" >&5 +$as_echo "$ac_cv_sizeof_unsigned_long" >&6; } + + + +cat >>confdefs.h <<_ACEOF +@%:@define SIZEOF_UNSIGNED_LONG $ac_cv_sizeof_unsigned_long +_ACEOF + + + # The cast to long int works around a bug in the HP C Compiler +# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects +# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'. +# This bug is HP SR number 8606223364. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of unsigned long long" >&5 +$as_echo_n "checking size of unsigned long long... 
" >&6; } +if ${ac_cv_sizeof_unsigned_long_long+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (unsigned long long))" "ac_cv_sizeof_unsigned_long_long" "$ac_includes_default"; then : + +else + if test "$ac_cv_type_unsigned_long_long" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute sizeof (unsigned long long) +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_sizeof_unsigned_long_long=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_unsigned_long_long" >&5 +$as_echo "$ac_cv_sizeof_unsigned_long_long" >&6; } + + + +cat >>confdefs.h <<_ACEOF +@%:@define SIZEOF_UNSIGNED_LONG_LONG $ac_cv_sizeof_unsigned_long_long +_ACEOF + + + + +fi + +done + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use only Benchmark" >&5 +$as_echo_n "checking whether we want to use only Benchmark... " >&6; } +@%:@ Check whether --enable-benchmark was given. +if test "${enable_benchmark+set}" = set; then : + enableval=$enable_benchmark; enable_benchmark=$enableval +else + enable_benchmark=yes +fi + +if test $enable_benchmark = no; then + +@%:@ Check whether --with-limedir was given. +if test "${with_limedir+set}" = set; then : + withval=$with_limedir; lime_dir=$withval +else + lime_dir="./c-lime" +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $lime_dir" >&5 +$as_echo "$lime_dir" >&6; } + LDFLAGS="$LDFLAGS -L${lime_dir}/lib/" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for limeReaderNextRecord in -llime" >&5 +$as_echo_n "checking for limeReaderNextRecord in -llime... " >&6; } +if ${ac_cv_lib_lime_limeReaderNextRecord+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-llime $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char limeReaderNextRecord (); +int +main () +{ +return limeReaderNextRecord (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_lime_limeReaderNextRecord=yes +else + ac_cv_lib_lime_limeReaderNextRecord=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_lime_limeReaderNextRecord" >&5 +$as_echo "$ac_cv_lib_lime_limeReaderNextRecord" >&6; } +if test "x$ac_cv_lib_lime_limeReaderNextRecord" = xyes; then : + cat >>confdefs.h <<_ACEOF +@%:@define HAVE_LIBLIME 1 +_ACEOF + + LIBS="-llime $LIBS" + +else + as_fn_error $? "library liblime is missing or needed function is not available" "$LINENO" 5 +fi + +else + +$as_echo "@%:@define BENCHMARK 1" >>confdefs.h + +fi + + + +#LIBS="$LIBS $FLIBS -lm" + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use lemon" >&5 +$as_echo_n "checking whether we want to use lemon... " >&6; } + +@%:@ Check whether --with-lemondir was given. +if test "${with_lemondir+set}" = set; then : + withval=$with_lemondir; echo yes + LEMON_AVAILABLE=1 + lemon_dir=$withval + LDFLAGS="$LDFLAGS -L${lemon_dir}/lib" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for lemonReaderNextRecord in -llemon" >&5 +$as_echo_n "checking for lemonReaderNextRecord in -llemon... " >&6; } +if ${ac_cv_lib_lemon_lemonReaderNextRecord+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-llemon $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. 
+ Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char lemonReaderNextRecord (); +int +main () +{ +return lemonReaderNextRecord (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_lemon_lemonReaderNextRecord=yes +else + ac_cv_lib_lemon_lemonReaderNextRecord=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_lemon_lemonReaderNextRecord" >&5 +$as_echo "$ac_cv_lib_lemon_lemonReaderNextRecord" >&6; } +if test "x$ac_cv_lib_lemon_lemonReaderNextRecord" = xyes; then : + cat >>confdefs.h <<_ACEOF +@%:@define HAVE_LIBLEMON 1 +_ACEOF + + LIBS="-llemon $LIBS" + +else + as_fn_error $? "library liblemon was not found" "$LINENO" 5 +fi + +else + echo no + LEMON_AVAILABLE=0 +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we use the general geometry" >&5 +$as_echo_n "checking whether we use the general geometry... " >&6; } +@%:@ Check whether --enable-indexindepgeom was given. +if test "${enable_indexindepgeom+set}" = set; then : + enableval=$enable_indexindepgeom; enable_iig=$enableval +else + enable_iig=no +fi + +if test $enable_iig = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define _INDEX_INDEP_GEOM 1" >>confdefs.h + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use MPI" >&5 +$as_echo_n "checking whether we want to use MPI... " >&6; } +@%:@ Check whether --enable-mpi was given. 
+if test "${enable_mpi+set}" = set; then : + enableval=$enable_mpi; enable_mpi=$enableval +else + enable_mpi=yes +fi + +if test $enable_mpi = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define MPI 1" >>confdefs.h + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to use QPX intrinsics" >&5 +$as_echo_n "checking whether to use QPX intrinsics... " >&6; } +@%:@ Check whether --enable-qpx was given. +if test "${enable_qpx+set}" = set; then : + enableval=$enable_qpx; enable_qpx=$enableval +else + enable_qpx=no +fi + +if test $enable_qpx = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define BGQ 1" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: Compiling with QPX intrinsics on BGQ, enabling compiler optimizations for XLC." >&5 +$as_echo "$as_me: Compiling with QPX intrinsics on BGQ, enabling compiler optimizations for XLC." >&6;} + OPTARGS="-O2 -qstrict=all -qtune=qp -qarch=qp -qmaxmem=-1" + SOPTARGS="$OPTARGS" +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to use IBM BG/Q SPI for communications" >&5 +$as_echo_n "checking whether to use IBM BG/Q SPI for communications... " >&6; } +@%:@ Check whether --enable-spi was given. 
+if test "${enable_spi+set}" = set; then : + enableval=$enable_spi; enable_spi=$enableval +else + enable_spi=no +fi + +if test $enable_spi = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define SPI 1" >>confdefs.h + + SPI_FILES="DirectPut" +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + SPI_FILES="" +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use OpenMP" >&5 +$as_echo_n "checking whether we want to use OpenMP... " >&6; } +@%:@ Check whether --enable-omp was given. +if test "${enable_omp+set}" = set; then : + enableval=$enable_omp; enable_omp=$enableval +else + enable_omp=yes +fi + +if test $enable_omp = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define OMP 1" >>confdefs.h + + for ac_header in omp.h +do : + ac_fn_c_check_header_mongrel "$LINENO" "omp.h" "ac_cv_header_omp_h" "$ac_includes_default" +if test "x$ac_cv_header_omp_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +@%:@define HAVE_OMP_H 1 +_ACEOF + +else + as_fn_error $? "Cannot find OpenMP headers!" "$LINENO" 5 +fi + +done + + + OPENMP_CFLAGS= + @%:@ Check whether --enable-openmp was given. +if test "${enable_openmp+set}" = set; then : + enableval=$enable_openmp; +fi + + if test "$enable_openmp" != no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to support OpenMP" >&5 +$as_echo_n "checking for $CC option to support OpenMP... " >&6; } +if ${ac_cv_prog_c_openmp+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +#ifndef _OPENMP + choke me +#endif +#include +int main () { return omp_get_num_threads (); } + +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_prog_c_openmp='none needed' +else + ac_cv_prog_c_openmp='unsupported' + for ac_option in -fopenmp -xopenmp -openmp -mp -omp -qsmp=omp -homp \ + -Popenmp --openmp; do + ac_save_CFLAGS=$CFLAGS + CFLAGS="$CFLAGS $ac_option" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifndef _OPENMP + choke me +#endif +#include +int main () { return omp_get_num_threads (); } + +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_prog_c_openmp=$ac_option +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + CFLAGS=$ac_save_CFLAGS + if test "$ac_cv_prog_c_openmp" != unsupported; then + break + fi + done +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_c_openmp" >&5 +$as_echo "$ac_cv_prog_c_openmp" >&6; } + case $ac_cv_prog_c_openmp in #( + "none needed" | unsupported) + ;; #( + *) + OPENMP_CFLAGS=$ac_cv_prog_c_openmp ;; + esac + fi + + +# -- AC_OPENMP provides a compiler-dependent OPENMP_CFLAGS so we can set it here +# on the BG/Q with XLC we force a special set of options for OpenMP support + if test $enable_qpx = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: Using OpenMP with XLC on BG/Q. Compiling with \"-qsmp=omp:noauto:schedule=static -qthreaded\"." >&5 +$as_echo "$as_me: Using OpenMP with XLC on BG/Q. Compiling with \"-qsmp=omp:noauto:schedule=static -qthreaded\"." 
>&6;} + CFLAGS="$CFLAGS -qsmp=omp:noauto:schedule=static -qthreaded" + CPPFLAGS="$CPPFLAGS -qsmp=omp:noauto:schedule=static -qthreaded" + LDFLAGS="$LDFLAGS -qsmp=omp:noauto:schedule=static -qthreaded" + else + CFLAGS="$CFLAGS $OPENMP_CFLAGS" + CPPFLAGS="$CPPFLAGS $OPENMP_CFLAGS" + LDFLAGS="$LDFLAGS $OPENMP_CFLAGS" + fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +fftw_lib=/usr +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use FFTW" >&5 +$as_echo_n "checking whether we want to use FFTW... " >&6; } +@%:@ Check whether --enable-fftw was given. +if test "${enable_fftw+set}" = set; then : + enableval=$enable_fftw; enable_fftw=$enableval +else + enable_fftw=no +fi + +if test $enable_fftw = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define HAVE_FFTW 1" >>confdefs.h + + LIBS="-lfftw3 ${LIBS}" +elif test $enable_fftw = no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define HAVE_FFTW 1" >>confdefs.h + + fftw_lib=${enable_fftw} + LDFLAGS="$LDFLAGS -L${fftw_lib}/lib64" + LIBS="-lfftw3 ${LIBS}" + INCLUDES="-I${fftw_lib}/include ${INCLUDES}" +fi + +if test $enable_mpi = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking which parallelisation to use for MPI" >&5 +$as_echo_n "checking which parallelisation to use for MPI... " >&6; } + +@%:@ Check whether --with-mpidimension was given. 
+if test "${with_mpidimension+set}" = set; then : + withval=$with_mpidimension; withmpidimension=$withval +else + withmpidimension=1 +fi + + if test $withmpidimension = 1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=1 t" >&5 +$as_echo "n=1 t" >&6; } + +$as_echo "@%:@define PARALLELT 1" >>confdefs.h + + elif test $withmpidimension = 2; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=2 xt" >&5 +$as_echo "n=2 xt" >&6; } + +$as_echo "@%:@define PARALLELXT 1" >>confdefs.h + + elif test $withmpidimension = 3; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=3 xyt" >&5 +$as_echo "n=3 xyt" >&6; } + +$as_echo "@%:@define PARALLELXYT 1" >>confdefs.h + + elif test $withmpidimension = 4; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=4 xyzt" >&5 +$as_echo "n=4 xyzt" >&6; } + +$as_echo "@%:@define PARALLELXYZT 1" >>confdefs.h + + elif test $withmpidimension = X; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=1 x" >&5 +$as_echo "n=1 x" >&6; } + +$as_echo "@%:@define PARALLELX 1" >>confdefs.h + + elif test $withmpidimension = XY; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=2 xy" >&5 +$as_echo "n=2 xy" >&6; } + +$as_echo "@%:@define PARALLELXY 1" >>confdefs.h + + elif test $withmpidimension = XYZ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=3 xyz" >&5 +$as_echo "n=3 xyz" >&6; } + +$as_echo "@%:@define PARALLELXYZ 1" >>confdefs.h + + elif test $withmpidimension = T; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=1 t" >&5 +$as_echo "n=1 t" >&6; } + +$as_echo "@%:@define PARALLELT 1" >>confdefs.h + + elif test $withmpidimension = XT; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=2 xt" >&5 +$as_echo "n=2 xt" >&6; } + +$as_echo "@%:@define PARALLELXT 1" >>confdefs.h + + elif test $withmpidimension = XYT; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=3 xyt" >&5 +$as_echo "n=3 xyt" >&6; } + +$as_echo "@%:@define PARALLELXYT 1" >>confdefs.h + + elif test $withmpidimension 
= XYZT; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=4 xyzt" >&5 +$as_echo "n=4 xyzt" >&6; } + +$as_echo "@%:@define PARALLELXYZT 1" >>confdefs.h + + else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: unknown" >&5 +$as_echo "unknown" >&6; } + as_fn_error $? "Only t, xt, xyt, xyzt, x, xy, xyz parallelisation available" "$LINENO" 5 + fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we shall use persistent MPI calls for halfspinor" >&5 +$as_echo_n "checking whether we shall use persistent MPI calls for halfspinor... " >&6; } + +@%:@ Check whether --with-persistentmpi was given. +if test "${with_persistentmpi+set}" = set; then : + withval=$with_persistentmpi; withpersistent=$withval +else + withpersistent=no +fi + + if test $withpersistent = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define _PERSISTENT 1" >>confdefs.h + + else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we shall use non-blocking MPI calls" >&5 +$as_echo_n "checking whether we shall use non-blocking MPI calls... " >&6; } + +@%:@ Check whether --with-nonblockingmpi was given. +if test "${with_nonblockingmpi+set}" = set; then : + withval=$with_nonblockingmpi; withnonblock=$withval +else + withnonblock=yes +fi + + if test $withnonblock = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define _NON_BLOCKING 1" >>confdefs.h + + else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + fi +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to fix volume at compiletime" >&5 +$as_echo_n "checking whether we want to fix volume at compiletime... " >&6; } + +@%:@ Check whether --with-fixedvolume was given. 
+if test "${with_fixedvolume+set}" = set; then : + withval=$with_fixedvolume; with_fixvol=$withval +else + with_fixvol=no +fi + +if test $with_fixvol = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define FIXEDVOLUME 1" >>confdefs.h + + ac_config_files="$ac_config_files fixed_volume.h" + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use KOJAK instrumentalisation" >&5 +$as_echo_n "checking whether we want to use KOJAK instrumentalisation... " >&6; } + +@%:@ Check whether --with-kojakinst was given. +if test "${with_kojakinst+set}" = set; then : + withval=$with_kojakinst; with_kojakinst=$withval +else + with_kojakinst=no +fi + +if test $with_kojakinst = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + CC="kinst-pomp ${CC}" +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use lapack and blas" >&5 +$as_echo_n "checking whether we want to use lapack and blas... " >&6; } + +@%:@ Check whether --with-lapack was given. +if test "${with_lapack+set}" = set; then : + withval=$with_lapack; with_lapack=$withval +else + with_lapack=yes +fi + +if test "$with_lapack" = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + LAPACKLIB= + +$as_echo "@%:@define HAVE_LAPACK 1" >>confdefs.h + +elif test "$with_lapack" != no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + LIBS="$withval $LIBS" + with_lapack=yes + +$as_echo "@%:@define HAVE_LAPACK 1" >>confdefs.h + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + as_fn_error $? "lapack is needed! Will stop here." 
"$LINENO" 5 +fi + +if test $enable_mpi = yes; then + if test "$host_vendor" != "cray"; then + cross_compiling=yes + fi +fi + + +for ac_func in clock_gettime +do : + ac_fn_c_check_func "$LINENO" "clock_gettime" "ac_cv_func_clock_gettime" +if test "x$ac_cv_func_clock_gettime" = xyes; then : + cat >>confdefs.h <<_ACEOF +@%:@define HAVE_CLOCK_GETTIME 1 +_ACEOF + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for clock_gettime in -lrt" >&5 +$as_echo_n "checking for clock_gettime in -lrt... " >&6; } +if ${ac_cv_lib_rt_clock_gettime+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lrt $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char clock_gettime (); +int +main () +{ +return clock_gettime (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_rt_clock_gettime=yes +else + ac_cv_lib_rt_clock_gettime=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_rt_clock_gettime" >&5 +$as_echo "$ac_cv_lib_rt_clock_gettime" >&6; } +if test "x$ac_cv_lib_rt_clock_gettime" = xyes; then : + cat >>confdefs.h <<_ACEOF +@%:@define HAVE_LIBRT 1 +_ACEOF + + LIBS="-lrt $LIBS" + +fi + +fi +done + + +if ( test "$ac_cv_lib_rt_clock_gettime" = "yes" || test "$ac_cv_func_clock_gettime" = "yes" ); then + $as_echo "@%:@define HAVE_CLOCK_GETTIME 1" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: Instructing the compiler to use POSIX 199309L" >&5 +$as_echo "$as_me: Instructing the compiler to use POSIX 199309L" >&6;} +fi + +ac_ext=f +ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS 
$LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for dummy main to link with Fortran 77 libraries" >&5 +$as_echo_n "checking for dummy main to link with Fortran 77 libraries... " >&6; } +if ${ac_cv_f77_dummy_main+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_f77_dm_save_LIBS=$LIBS + LIBS="$LIBS $FLIBS" + ac_fortran_dm_var=F77_DUMMY_MAIN + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + # First, try linking without a dummy main: + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_fortran_dummy_main=none +else + ac_cv_fortran_dummy_main=unknown +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + + if test $ac_cv_fortran_dummy_main = unknown; then + for ac_func in MAIN__ MAIN_ __main MAIN _MAIN __MAIN main_ main__ _main; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +@%:@define $ac_fortran_dm_var $ac_func +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_fortran_dummy_main=$ac_func; break +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + done + fi + ac_ext=f +ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu + ac_cv_f77_dummy_main=$ac_cv_fortran_dummy_main + rm -rf conftest* + LIBS=$ac_f77_dm_save_LIBS + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_f77_dummy_main" >&5 +$as_echo "$ac_cv_f77_dummy_main" >&6; } +F77_DUMMY_MAIN=$ac_cv_f77_dummy_main +if test "$F77_DUMMY_MAIN" != unknown; then : + if test $F77_DUMMY_MAIN != none; then + +cat >>confdefs.h <<_ACEOF +@%:@define F77_DUMMY_MAIN $F77_DUMMY_MAIN +_ACEOF + + if test "x$ac_cv_fc_dummy_main" = "x$ac_cv_f77_dummy_main"; then + +$as_echo "@%:@define FC_DUMMY_MAIN_EQ_F77 1" >>confdefs.h + + fi +fi +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "linking to Fortran libraries from C fails +See \`config.log' for more details" "$LINENO" 5; } +fi + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +ac_ext=f +ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran 77 name-mangling scheme" >&5 +$as_echo_n "checking for Fortran 77 name-mangling scheme... 
" >&6; } +if ${ac_cv_f77_mangling+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.$ac_ext <<_ACEOF + subroutine foobar() + return + end + subroutine foo_bar() + return + end +_ACEOF +if ac_fn_f77_try_compile "$LINENO"; then : + mv conftest.$ac_objext cfortran_test.$ac_objext + + ac_save_LIBS=$LIBS + LIBS="cfortran_test.$ac_objext $LIBS $FLIBS" + + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + ac_success=no + for ac_foobar in foobar FOOBAR; do + for ac_underscore in "" "_"; do + ac_func="$ac_foobar$ac_underscore" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $ac_func (); +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +return $ac_func (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_success=yes; break 2 +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + done + done + ac_ext=f +ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu + + if test "$ac_success" = "yes"; then + case $ac_foobar in + foobar) + ac_case=lower + ac_foo_bar=foo_bar + ;; + FOOBAR) + ac_case=upper + ac_foo_bar=FOO_BAR + ;; + esac + + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + ac_success_extra=no + for 
ac_extra in "" "_"; do + ac_func="$ac_foo_bar$ac_underscore$ac_extra" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $ac_func (); +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +return $ac_func (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_success_extra=yes; break +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + done + ac_ext=f +ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu + + if test "$ac_success_extra" = "yes"; then + ac_cv_f77_mangling="$ac_case case" + if test -z "$ac_underscore"; then + ac_cv_f77_mangling="$ac_cv_f77_mangling, no underscore" + else + ac_cv_f77_mangling="$ac_cv_f77_mangling, underscore" + fi + if test -z "$ac_extra"; then + ac_cv_f77_mangling="$ac_cv_f77_mangling, no extra underscore" + else + ac_cv_f77_mangling="$ac_cv_f77_mangling, extra underscore" + fi + else + ac_cv_f77_mangling="unknown" + fi + else + ac_cv_f77_mangling="unknown" + fi + + LIBS=$ac_save_LIBS + rm -rf conftest* + rm -f cfortran_test* +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? 
"cannot compile a simple Fortran program +See \`config.log' for more details" "$LINENO" 5; } +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_f77_mangling" >&5 +$as_echo "$ac_cv_f77_mangling" >&6; } + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +ac_ext=f +ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu +case $ac_cv_f77_mangling in + upper*) ac_val="ZHEEV" ;; + lower*) ac_val="zheev" ;; + *) ac_val="unknown" ;; +esac +case $ac_cv_f77_mangling in *," underscore"*) ac_val="$ac_val"_ ;; esac + +zheev="$ac_val" + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +if test "$zheev" = "zheev"; then + +$as_echo "@%:@define NOF77_ 1" >>confdefs.h + +fi +as_ac_Search=`$as_echo "ac_cv_search_$zheev" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing $zheev" >&5 +$as_echo_n "checking for library containing $zheev... " >&6; } +if eval \${$as_ac_Search+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. 
*/ +#ifdef __cplusplus +extern "C" +#endif +char $zheev (); +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +return $zheev (); + ; + return 0; +} +_ACEOF +for ac_lib in '' lapack; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" + fi + if ac_fn_c_try_link "$LINENO"; then : + eval "$as_ac_Search=\$ac_res" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext + if eval \${$as_ac_Search+:} false; then : + break +fi +done +if eval \${$as_ac_Search+:} false; then : + +else + eval "$as_ac_Search=no" +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +eval ac_res=\$$as_ac_Search + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +eval ac_res=\$$as_ac_Search +if test "$ac_res" != no; then : + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + +else + as_fn_error $? "Cannot find lapack" "$LINENO" 5 +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5 +$as_echo_n "checking for ANSI C header files... " >&6; } +if ${ac_cv_header_stdc+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#include +#include + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_header_stdc=yes +else + ac_cv_header_stdc=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +if test $ac_cv_header_stdc = yes; then + # SunOS 4.x string.h does not declare mem*, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "memchr" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "free" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. + if test "$cross_compiling" = yes; then : + : +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#if ((' ' & 0x0FF) == 0x020) +# define ISLOWER(c) ('a' <= (c) && (c) <= 'z') +# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) +#else +# define ISLOWER(c) \ + (('a' <= (c) && (c) <= 'i') \ + || ('j' <= (c) && (c) <= 'r') \ + || ('s' <= (c) && (c) <= 'z')) +# define TOUPPER(c) (ISLOWER(c) ? 
((c) | 0x40) : (c)) +#endif + +#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) +int +main () +{ + int i; + for (i = 0; i < 256; i++) + if (XOR (islower (i), ISLOWER (i)) + || toupper (i) != TOUPPER (i)) + return 2; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + +else + ac_cv_header_stdc=no +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + +fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5 +$as_echo "$ac_cv_header_stdc" >&6; } +if test $ac_cv_header_stdc = yes; then + +$as_echo "@%:@define STDC_HEADERS 1" >>confdefs.h + +fi + +for ac_header in float.h libintl.h limits.h stdint.h stdlib.h string.h strings.h sys/time.h unistd.h endian.h +do : + as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` +ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" +if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : + cat >>confdefs.h <<_ACEOF +@%:@define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 +_ACEOF + +fi + +done + +ac_fn_c_check_header_mongrel "$LINENO" "getopt.h" "ac_cv_header_getopt_h" "$ac_includes_default" +if test "x$ac_cv_header_getopt_h" = xyes; then : + +fi + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for an ANSI C-conforming const" >&5 +$as_echo_n "checking for an ANSI C-conforming const... " >&6; } +if ${ac_cv_c_const+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + +#ifndef __cplusplus + /* Ultrix mips cc rejects this sort of thing. */ + typedef int charset[2]; + const charset cs = { 0, 0 }; + /* SunOS 4.1.1 cc rejects this. */ + char const *const *pcpcc; + char **ppc; + /* NEC SVR4.0.2 mips cc rejects this. 
*/ + struct point {int x, y;}; + static struct point const zero = {0,0}; + /* AIX XL C 1.02.0.0 rejects this. + It does not let you subtract one const X* pointer from another in + an arm of an if-expression whose if-part is not a constant + expression */ + const char *g = "string"; + pcpcc = &g + (g ? g-g : 0); + /* HPUX 7.0 cc rejects these. */ + ++pcpcc; + ppc = (char**) pcpcc; + pcpcc = (char const *const *) ppc; + { /* SCO 3.2v4 cc rejects this sort of thing. */ + char tx; + char *t = &tx; + char const *s = 0 ? (char *) 0 : (char const *) 0; + + *t++ = 0; + if (s) return 0; + } + { /* Someone thinks the Sun supposedly-ANSI compiler will reject this. */ + int x[] = {25, 17}; + const int *foo = &x[0]; + ++foo; + } + { /* Sun SC1.0 ANSI compiler rejects this -- but not the above. */ + typedef const int *iptr; + iptr p = 0; + ++p; + } + { /* AIX XL C 1.02.0.0 rejects this sort of thing, saying + "k.c", line 2.27: 1506-025 (S) Operand must be a modifiable lvalue. */ + struct s { int j; const int *ap[3]; } bx; + struct s *b = &bx; b->j = 5; + } + { /* ULTRIX-32 V3.1 (Rev 9) vcc rejects this */ + const int foo = 10; + if (!foo) return 0; + } + return !cs[0] && !zero.x; +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_c_const=yes +else + ac_cv_c_const=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_const" >&5 +$as_echo "$ac_cv_c_const" >&6; } +if test $ac_cv_c_const = no; then + +$as_echo "@%:@define const /**/" >>confdefs.h + +fi + +ac_fn_c_check_type "$LINENO" "off_t" "ac_cv_type_off_t" "$ac_includes_default" +if test "x$ac_cv_type_off_t" = xyes; then : + +else + +cat >>confdefs.h <<_ACEOF +@%:@define off_t long int +_ACEOF + +fi + +ac_fn_c_check_type "$LINENO" "size_t" "ac_cv_type_size_t" "$ac_includes_default" +if test "x$ac_cv_type_size_t" = xyes; then : + +else + +cat >>confdefs.h <<_ACEOF +@%:@define size_t unsigned int +_ACEOF + +fi + +{ 
$as_echo "$as_me:${as_lineno-$LINENO}: checking whether time.h and sys/time.h may both be included" >&5 +$as_echo_n "checking whether time.h and sys/time.h may both be included... " >&6; } +if ${ac_cv_header_time+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#include + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +if ((struct tm *) 0) +return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_header_time=yes +else + ac_cv_header_time=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_time" >&5 +$as_echo "$ac_cv_header_time" >&6; } +if test $ac_cv_header_time = yes; then + +$as_echo "@%:@define TIME_WITH_SYS_TIME 1" >>confdefs.h + +fi + + +@%:@ Check whether --enable-largefile was given. +if test "${enable_largefile+set}" = set; then : + enableval=$enable_largefile; +fi + +if test "$enable_largefile" != no; then + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for special C compiler options needed for large files" >&5 +$as_echo_n "checking for special C compiler options needed for large files... " >&6; } +if ${ac_cv_sys_largefile_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_sys_largefile_CC=no + if test "$GCC" != yes; then + ac_save_CC=$CC + while :; do + # IRIX 6.2 and later do not support large files by default, + # so use the C compiler's -n32 option if that helps. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +@%:@include + /* Check that off_t can represent 2**63 - 1 correctly. + We can't simply define LARGE_OFF_T to be 9223372036854775807, + since some C++ compilers masquerading as C compilers + incorrectly reject 9223372036854775807. 
*/ +@%:@define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) + int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 + && LARGE_OFF_T % 2147483647 == 1) + ? 1 : -1]; +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF + if ac_fn_c_try_compile "$LINENO"; then : + break +fi +rm -f core conftest.err conftest.$ac_objext + CC="$CC -n32" + if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_sys_largefile_CC=' -n32'; break +fi +rm -f core conftest.err conftest.$ac_objext + break + done + CC=$ac_save_CC + rm -f conftest.$ac_ext + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sys_largefile_CC" >&5 +$as_echo "$ac_cv_sys_largefile_CC" >&6; } + if test "$ac_cv_sys_largefile_CC" != no; then + CC=$CC$ac_cv_sys_largefile_CC + fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _FILE_OFFSET_BITS value needed for large files" >&5 +$as_echo_n "checking for _FILE_OFFSET_BITS value needed for large files... " >&6; } +if ${ac_cv_sys_file_offset_bits+:} false; then : + $as_echo_n "(cached) " >&6 +else + while :; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +@%:@include + /* Check that off_t can represent 2**63 - 1 correctly. + We can't simply define LARGE_OFF_T to be 9223372036854775807, + since some C++ compilers masquerading as C compilers + incorrectly reject 9223372036854775807. */ +@%:@define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) + int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 + && LARGE_OFF_T % 2147483647 == 1) + ? 
1 : -1]; +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_sys_file_offset_bits=no; break +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +@%:@define _FILE_OFFSET_BITS 64 +@%:@include + /* Check that off_t can represent 2**63 - 1 correctly. + We can't simply define LARGE_OFF_T to be 9223372036854775807, + since some C++ compilers masquerading as C compilers + incorrectly reject 9223372036854775807. */ +@%:@define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) + int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 + && LARGE_OFF_T % 2147483647 == 1) + ? 1 : -1]; +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_sys_file_offset_bits=64; break +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_cv_sys_file_offset_bits=unknown + break +done +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sys_file_offset_bits" >&5 +$as_echo "$ac_cv_sys_file_offset_bits" >&6; } +case $ac_cv_sys_file_offset_bits in #( + no | unknown) ;; + *) +cat >>confdefs.h <<_ACEOF +@%:@define _FILE_OFFSET_BITS $ac_cv_sys_file_offset_bits +_ACEOF +;; +esac +rm -rf conftest* + if test $ac_cv_sys_file_offset_bits = unknown; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _LARGE_FILES value needed for large files" >&5 +$as_echo_n "checking for _LARGE_FILES value needed for large files... " >&6; } +if ${ac_cv_sys_large_files+:} false; then : + $as_echo_n "(cached) " >&6 +else + while :; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +@%:@include + /* Check that off_t can represent 2**63 - 1 correctly. 
+ We can't simply define LARGE_OFF_T to be 9223372036854775807, + since some C++ compilers masquerading as C compilers + incorrectly reject 9223372036854775807. */ +@%:@define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) + int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 + && LARGE_OFF_T % 2147483647 == 1) + ? 1 : -1]; +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_sys_large_files=no; break +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +@%:@define _LARGE_FILES 1 +@%:@include + /* Check that off_t can represent 2**63 - 1 correctly. + We can't simply define LARGE_OFF_T to be 9223372036854775807, + since some C++ compilers masquerading as C compilers + incorrectly reject 9223372036854775807. */ +@%:@define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) + int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 + && LARGE_OFF_T % 2147483647 == 1) + ? 
1 : -1]; +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_sys_large_files=1; break +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_cv_sys_large_files=unknown + break +done +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sys_large_files" >&5 +$as_echo "$ac_cv_sys_large_files" >&6; } +case $ac_cv_sys_large_files in #( + no | unknown) ;; + *) +cat >>confdefs.h <<_ACEOF +@%:@define _LARGE_FILES $ac_cv_sys_large_files +_ACEOF +;; +esac +rm -rf conftest* + fi + + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _LARGEFILE_SOURCE value needed for large files" >&5 +$as_echo_n "checking for _LARGEFILE_SOURCE value needed for large files... " >&6; } +if ${ac_cv_sys_largefile_source+:} false; then : + $as_echo_n "(cached) " >&6 +else + while :; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include /* for off_t */ + #include +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +int (*fp) (FILE *, off_t, int) = fseeko; + return fseeko (stdin, 0, 0) && fp (stdin, 0, 0); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_sys_largefile_source=no; break +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +@%:@define _LARGEFILE_SOURCE 1 +#include /* for off_t */ + #include +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +int (*fp) (FILE *, off_t, int) = fseeko; + return fseeko (stdin, 0, 0) && fp (stdin, 0, 0); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_sys_largefile_source=1; break +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + ac_cv_sys_largefile_source=unknown + break +done +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sys_largefile_source" >&5 +$as_echo "$ac_cv_sys_largefile_source" >&6; } +case $ac_cv_sys_largefile_source in #( + no | unknown) ;; + *) +cat >>confdefs.h <<_ACEOF +@%:@define _LARGEFILE_SOURCE $ac_cv_sys_largefile_source +_ACEOF +;; +esac +rm -rf conftest* + +# We used to try defining _XOPEN_SOURCE=500 too, to work around a bug +# in glibc 2.1.3, but that breaks too many other things. +# If you want fseeko and ftello with glibc, upgrade to a fixed glibc. +if test $ac_cv_sys_largefile_source != unknown; then + +$as_echo "@%:@define HAVE_FSEEKO 1" >>confdefs.h + +fi + +for ac_header in stdlib.h +do : + ac_fn_c_check_header_mongrel "$LINENO" "stdlib.h" "ac_cv_header_stdlib_h" "$ac_includes_default" +if test "x$ac_cv_header_stdlib_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +@%:@define HAVE_STDLIB_H 1 +_ACEOF + +fi + +done + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for GNU libc compatible malloc" >&5 +$as_echo_n "checking for GNU libc compatible malloc... " >&6; } +if ${ac_cv_func_malloc_0_nonnull+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "$cross_compiling" = yes; then : + ac_cv_func_malloc_0_nonnull=no +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#if defined STDC_HEADERS || defined HAVE_STDLIB_H +# include +#else +char *malloc (); +#endif + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +return ! malloc (0); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + ac_cv_func_malloc_0_nonnull=yes +else + ac_cv_func_malloc_0_nonnull=no +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_malloc_0_nonnull" >&5 +$as_echo "$ac_cv_func_malloc_0_nonnull" >&6; } +if test $ac_cv_func_malloc_0_nonnull = yes; then : + +$as_echo "@%:@define HAVE_MALLOC 1" >>confdefs.h + +else + $as_echo "@%:@define HAVE_MALLOC 0" >>confdefs.h + + case " $LIB@&t@OBJS " in + *" malloc.$ac_objext "* ) ;; + *) LIB@&t@OBJS="$LIB@&t@OBJS malloc.$ac_objext" + ;; +esac + + +$as_echo "@%:@define malloc rpl_malloc" >>confdefs.h + +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking return type of signal handlers" >&5 +$as_echo_n "checking return type of signal handlers... " >&6; } +if ${ac_cv_type_signal+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include +#include + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +return *(signal (0, 0)) (0) == 1; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_type_signal=int +else + ac_cv_type_signal=void +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_type_signal" >&5 +$as_echo "$ac_cv_type_signal" >&6; } + +cat >>confdefs.h <<_ACEOF +@%:@define RETSIGTYPE $ac_cv_type_signal +_ACEOF + + +for ac_func in gettimeofday pow sqrt +do : + as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh` +ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var" +if eval test \"x\$"$as_ac_var"\" = x"yes"; then : + cat >>confdefs.h <<_ACEOF +@%:@define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1 +_ACEOF + +fi +done + + + + + + + + + + + + + + + + + + + + +INCLUDES="$INCLUDES -I\$(HOME)/include/ -I. -I\${abs_top_builddir}/ -I\${abs_top_srcdir}/ -I${lime_dir}/include/ -I${lemon_dir}/include/" +DEPFLAGS="$DEPFLAGS" + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking what alignment we want for arrays" >&5 +$as_echo_n "checking what alignment we want for arrays... " >&6; } +@%:@ Check whether --enable-alignment was given. 
+if test "${enable_alignment+set}" = set; then : + enableval=$enable_alignment; withalign=$enableval +else + withalign=auto +fi + +if test "$withalign" = "none"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5 +$as_echo "none" >&6; } + withalign=1 + +$as_echo "@%:@define ALIGN_BASE 0x00" >>confdefs.h + + $as_echo "@%:@define ALIGN /**/" >>confdefs.h + + +$as_echo "@%:@define ALIGN_BASE32 0x00" >>confdefs.h + + $as_echo "@%:@define ALIGN32 /**/" >>confdefs.h + +elif test $withalign = 16; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: 16 bytes" >&5 +$as_echo "16 bytes" >&6; } + +$as_echo "@%:@define ALIGN_BASE 0x0F" >>confdefs.h + + $as_echo "@%:@define ALIGN __attribute__ ((aligned (16)))" >>confdefs.h + + +$as_echo "@%:@define ALIGN_BASE32 0x0F" >>confdefs.h + + $as_echo "@%:@define ALIGN32 __attribute__ ((aligned (16)))" >>confdefs.h + +elif test $withalign = 32; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: 32 bytes" >&5 +$as_echo "32 bytes" >&6; } + +$as_echo "@%:@define ALIGN_BASE 0x1F" >>confdefs.h + + $as_echo "@%:@define ALIGN __attribute__ ((aligned (32)))" >>confdefs.h + + +$as_echo "@%:@define ALIGN_BASE32 0x1F" >>confdefs.h + + $as_echo "@%:@define ALIGN32 __attribute__ ((aligned (32)))" >>confdefs.h + +elif test $withalign = auto; then + withautoalign=1 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: auto" >&5 +$as_echo "auto" >&6; } + +$as_echo "@%:@define ALIGN_BASE 0x00" >>confdefs.h + + $as_echo "@%:@define ALIGN /**/" >>confdefs.h + + +$as_echo "@%:@define ALIGN_BASE32 0x00" >>confdefs.h + + $as_echo "@%:@define ALIGN32 /**/" >>confdefs.h + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: Unusable value for array alignment" >&5 +$as_echo "Unusable value for array alignment" >&6; } + as_fn_error $? 
"Allowed values are: auto, none, 16, 32" "$LINENO" 5 +fi + +if test "$host_cpu" = "i686" || test "$host_cpu" = "x86_64"; then + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use P4 instructions" >&5 +$as_echo_n "checking whether we want to use P4 instructions... " >&6; } + @%:@ Check whether --enable-p4 was given. +if test "${enable_p4+set}" = set; then : + enableval=$enable_p4; enable_p4=$enableval +else + enable_p4=no +fi + + if test $enable_p4 = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define P4 1" >>confdefs.h + + if test $withalign = auto; then + if test $withautoalign -lt 16; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array alignment to 16 bytes for P4 instructions" >&5 +$as_echo "increasing array alignment to 16 bytes for P4 instructions" >&6; } + +$as_echo "@%:@define ALIGN_BASE 0x0F" >>confdefs.h + + $as_echo "@%:@define ALIGN __attribute__ ((aligned (16)))" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array 32 bit alignment to 16 bytes for P4 instructions" >&5 +$as_echo "increasing array 32 bit alignment to 16 bytes for P4 instructions" >&6; } + +$as_echo "@%:@define ALIGN_BASE32 0x0F" >>confdefs.h + + $as_echo "@%:@define ALIGN32 __attribute__ ((aligned (16)))" >>confdefs.h + + withautoalign=16 + fi + elif test $withalign -lt 16; then + as_fn_error $? "alignment incompatible with P4 instructions (16 bytes required)!" "$LINENO" 5 + fi + else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use Opteron instructions" >&5 +$as_echo_n "checking whether we want to use Opteron instructions... " >&6; } + @%:@ Check whether --enable-opteron was given. 
+if test "${enable_opteron+set}" = set; then : + enableval=$enable_opteron; enable_opteron=$enableval +else + enable_opteron=no +fi + + if test $enable_opteron = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define OPTERON 1" >>confdefs.h + + if test $withalign = auto; then + if test $withautoalign -lt 16; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array alignment to 16 bytes for Opteron instructions" >&5 +$as_echo "increasing array alignment to 16 bytes for Opteron instructions" >&6; } + +$as_echo "@%:@define ALIGN_BASE 0x0F" >>confdefs.h + + $as_echo "@%:@define ALIGN __attribute__ ((aligned (16)))" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array 32 bit alignment to 16 bytes for Opteron instructions" >&5 +$as_echo "increasing array 32 bit alignment to 16 bytes for Opteron instructions" >&6; } + +$as_echo "@%:@define ALIGN_BASE32 0x0F" >>confdefs.h + + $as_echo "@%:@define ALIGN32 __attribute__ ((aligned (16)))" >>confdefs.h + + withautoalign=16 + fi + elif test $withalign -lt 16; then + as_fn_error $? "alignment incompatible with Opteron instructions (16 bytes required)!" "$LINENO" 5 + fi + else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use SSE2 instructions" >&5 +$as_echo_n "checking whether we want to use SSE2 instructions... " >&6; } + @%:@ Check whether --enable-sse2 was given. +if test "${enable_sse2+set}" = set; then : + enableval=$enable_sse2; enable_sse2=$enableval +else + enable_sse2=no +fi + + if test $enable_sse2 = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + if test $withalign != auto && test $withalign -lt 16; then + as_fn_error $?
"alignment incompatible with SSE2 instructions (16 bytes required)" "$LINENO" 5 + fi + else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use SSE3 instructions" >&5 +$as_echo_n "checking whether we want to use SSE3 instructions... " >&6; } + @%:@ Check whether --enable-sse3 was given. +if test "${enable_sse3+set}" = set; then : + enableval=$enable_sse3; enable_sse3=$enableval +else + enable_sse3=no +fi + + if test $enable_sse3 = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + if test $withalign != auto && test $withalign -lt 16; then + as_fn_error $? "alignment incompatible with SSE3 instructions (16 bytes required)" "$LINENO" 5 + fi + else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + fi + + if test "$enable_sse2" = "yes" || test "$enable_sse3" = "yes"; then + if test $withalign = auto; then + if test $withautoalign -lt 16; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array alignment to 16 bytes for SSE instructions" >&5 +$as_echo "increasing array alignment to 16 bytes for SSE instructions" >&6; } + +$as_echo "@%:@define ALIGN_BASE 0x0F" >>confdefs.h + + $as_echo "@%:@define ALIGN __attribute__ ((aligned (16)))" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing 32bit array alignment to 16 bytes for SSE instructions" >&5 +$as_echo "increasing 32bit array alignment to 16 bytes for SSE instructions" >&6; } + +$as_echo "@%:@define ALIGN_BASE32 0x0F" >>confdefs.h + + $as_echo "@%:@define ALIGN32 __attribute__ ((aligned (16)))" >>confdefs.h + + withautoalign=16 + fi + fi + fi +fi + +if test $enable_qpx = yes; then + if test $withalign = auto; then + if test $withautoalign -lt 32; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array alignment to 32 bytes for use of QPX instructions on BG/Q" >&5 +$as_echo
"increasing array alignment to 32 bytes for use of QPX instructions on BG/Q" >&6; } + +$as_echo "@%:@define ALIGN_BASE 0x1F" >>confdefs.h + + $as_echo "@%:@define ALIGN __attribute__ ((aligned (32)))" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing 32bit array alignment to 16 bytes for use of QPX instructions on BG/Q" >&5 +$as_echo "increasing 32bit array alignment to 16 bytes for use of QPX instructions on BG/Q" >&6; } + +$as_echo "@%:@define ALIGN_BASE32 0x0F" >>confdefs.h + + $as_echo "@%:@define ALIGN32 __attribute__ ((aligned (16)))" >>confdefs.h + + withautoalign=32 + fi + elif test $withalign -lt 32; then + as_fn_error $? "alignment incompatible with QPX instructions (32 bytes required)" "$LINENO" 5 + fi +fi + +if test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "blrts"; then + if test $withalign = auto; then + if test $withautoalign -lt 16; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array alignment to 16 bytes for BG/L optimization" >&5 +$as_echo "increasing array alignment to 16 bytes for BG/L optimization" >&6; } + +$as_echo "@%:@define ALIGN_BASE 0x0F" >>confdefs.h + + +$as_echo "@%:@define ALIGN __attribute__ ((aligned (16)))" >>confdefs.h + + withautoalign=16 + fi + fi +elif test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "bprts"; then + if test $withalign = auto; then + if test $withautoalign -lt 16; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array alignment to 16 bytes for BG/P optimization" >&5 +$as_echo "increasing array alignment to 16 bytes for BG/P optimization" >&6; } + +$as_echo "@%:@define ALIGN_BASE 0x0F" >>confdefs.h + + +$as_echo "@%:@define ALIGN __attribute__ ((aligned (16)))" >>confdefs.h + + withautoalign=16 + fi + fi +elif test "$host_cpu" = "powerpc64" && test "$host_vendor" = "unknown" && test "$host_os" = "linux-gnu"; then + if test $withalign = auto; then + if test $withautoalign -lt 32; 
then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array alignment to 32 bytes for BG/Q and generic POWER optimization" >&5 +$as_echo "increasing array alignment to 32 bytes for BG/Q and generic POWER optimization" >&6; } + +$as_echo "@%:@define ALIGN_BASE 0x1F" >>confdefs.h + + $as_echo "@%:@define ALIGN __attribute__ ((aligned (32)))" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array 32 bit alignment to 16 bytes for BG/Q and generic POWER optimization" >&5 +$as_echo "increasing array 32 bit alignment to 16 bytes for BG/Q and generic POWER optimization" >&6; } + +$as_echo "@%:@define ALIGN_BASE32 0x0F" >>confdefs.h + + $as_echo "@%:@define ALIGN32 __attribute__ ((aligned (16)))" >>confdefs.h + + withautoalign=32 + fi + fi +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use gprof as profiler" >&5 +$as_echo_n "checking whether we want to use gprof as profiler... " >&6; } + +@%:@ Check whether --with-gprof was given. +if test "${with_gprof+set}" = set; then : + withval=$with_gprof; enable_gprof=$withval +else + enable_gprof=no +fi + +if test $enable_gprof = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + if test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm"; then + PROFILE_FLAG="-pg -qfullpath -g" + else + PROFILE_FLAG="-pg -g" + fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + PROFILE_FLAG= +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we shall use rts dram window" >&5 +$as_echo_n "checking whether we shall use rts dram window... " >&6; } + +@%:@ Check whether --with-bgldram was given. 
+# --with-bgldram: an explicit command-line value wins, otherwise default to yes.
+if test "${with_bgldram+set}" = set; then :
+  withval=$with_bgldram; with_bgldram=$withval
+else
+  with_bgldram=yes
+fi
+
+# Operands are quoted so that an empty value selects the "no" branch
+# quietly instead of making test(1) fail with a syntax error.
+if test "$with_bgldram" = yes; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+
+$as_echo "@%:@define _USE_BGLDRAM 1" >>confdefs.h
+
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+# Classify the compiler by grepping the banner printed by "$CC -V".
+XLCGREP=`$CC -V 2>&1 | grep -i xlc`
+if test "$XLCGREP" != ""; then
+  XLC="yes"
+
+$as_echo "@%:@define XLC 1" >>confdefs.h
+
+fi
+PGCC=`$CC -V 2>&1 | grep pgcc`
+ICC=`$CC -V 2>&1 | grep -i intel`
+
+# Per-platform selection of dependency, optimisation and debug flags.
+if test "$host_cpu" = "i686" || test "$host_cpu" = "x86_64"; then
+  # GCC proper: $GCC is yes and the banner does not mention Intel.
+  if test "$GCC" = yes && test "$ICC" = ""; then
+    DEPFLAGS="-MM"
+    CFLAGS="$CFLAGS -pedantic -Wall"
+    OPTARGS='-O'
+    SOPTARGS='-O'
+
+    if test "$enable_sse3" = yes; then
+      echo Using SSE3 and SSE2 macros!
+
+$as_echo "@%:@define SSE3 1" >>confdefs.h
+
+      DEPFLAGS="$DEPFLAGS -DSSE3"
+      if test "$host_cpu" = "x86_64"; then
+        CFLAGS="$CFLAGS -mfpmath=387"
+      fi
+    elif test "$enable_sse2" = yes; then
+      DEPFLAGS="$DEPFLAGS -DSSE2"
+
+$as_echo "@%:@define SSE2 1" >>confdefs.h
+
+      if test "$host_cpu" = "x86_64"; then
+        CFLAGS="$CFLAGS -mfpmath=387"
+      fi
+    fi
+
+    if test "$host_cpu" = "x86_64"; then
+
+$as_echo "@%:@define _x86_64 1" >>confdefs.h
+
+    fi
+    CCDEP="$CC"
+    # With MPI enabled, dependencies are generated with plain gcc instead of $CC.
+    if test "$enable_mpi" = yes; then
+      CCDEP="gcc"
+    fi
+    DEBUG_FLAG="-g"
+  else
+    if test "$PGCC" != ""; then
+      DEPFLAGS="-M"
+      echo "We are using the Portland Group C compiler!"
+      OPTARGS="-O2"
+      SOPTARGS="-O2"
+      DEBUG_FLAG="-g"
+      PROFILE_FLAG="-p -g"
+      CCDEP="$CC"
+
+    elif test "$ICC" != ""; then
+      echo "We are using the Intel C compiler!"
+      DEPFLAGS="-M"
+      OPTARGS="-O3"
+      SOPTARGS="-O3"
+      DEBUG_FLAG="-g"
+      PROFILE_FLAG="-p -g"
+      CCDEP="$CC"
+
+    else
+      # other compilers might support SSE inline assembly too
+      # (the cray compiler, for example)
+      if test "$enable_sse3" = yes; then
+        echo Using SSE3 and SSE2 macros!
+
+$as_echo "@%:@define SSE3 1" >>confdefs.h
+
+      elif test "$enable_sse2" = yes; then
+        echo Using SSE2 macros only!
+
+$as_echo "@%:@define SSE2 1" >>confdefs.h
+
+      fi
+
+      DEPFLAGS="-M"
+      CFLAGS="$CFLAGS -O"
+      DEBUG_FLAG="-g"
+      CCDEP="$CC"
+    fi
+  fi
+
+# The MareNostrum: powerpc on a linux system
+# this will also evaluate to "true" on BG/Q with XLC
+elif test "$host_cpu" = "powerpc64" && test "$host_vendor" = "unknown" && test "$host_os" = "linux-gnu"; then
+
+  DEBUGFLAG="-g"
+  if test "$XLC" = "yes"; then
+    CFLAGS="-qsrcmsg $CFLAGS"
+    DEBUGFLAG="$DEBUGFLAG -qfullpath"
+  fi
+  # Every other platform branch stores the debug switches in DEBUG_FLAG;
+  # mirror the value so both spellings agree on this platform as well.
+  DEBUG_FLAG="$DEBUGFLAG"
+
+  OPTARGS="$OPTARGS"
+  SOPTARGS="$OPTARGS"
+  if test "$CCDEP" = "gcc"; then
+    DEPFLAGS="-MM"
+  else
+    DEPFLAGS="-M"
+  fi
+
+# The Blue Gene/L
+elif test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "blrts"; then
+  if test "$with_bgldram" = yes; then
+    # Locate the blrts cross gcc that is used as the linker driver.
+    if (test -e /bgl/local/bin/blrts_gcc); then
+      BLRTSGCC=/bgl/local/bin/blrts_gcc
+    elif (test -e /bgl/BlueLight/ppcfloor/blrts-gnu/bin/powerpc-bgl-blrts-gnu-gcc); then
+      BLRTSGCC=/bgl/BlueLight/ppcfloor/blrts-gnu/bin/powerpc-bgl-blrts-gnu-gcc
+    else
+      as_fn_error $? "Sorry, don't know where to find blrts_gcc, see README.bgl!" "$LINENO" 5
+    fi
+    CCLD="$BLRTSGCC -Xlinker --script=./elf32ppcblrts.x"
+    if (!(test -s ./elf32ppcblrts.x)); then
+      as_fn_error $? "Sorry, elf32ppcblrts.x is missing, see README.bgl!" "$LINENO" 5
+    fi
+  fi
+  DEBUGFLAG="-g"
+  OPTARGS="-O3"
+  SOPTARGS="-O3"
+
+$as_echo "@%:@define BGL 1" >>confdefs.h
+
+
+  if test "$XLC" = "yes"; then
+    CFLAGS="-qsrcmsg $CFLAGS"
+    OPTARGS="$OPTARGS -qarch=440d -qtune=440"
+    SOPTARGS="$SOPTARGS -qarch=440d -qtune=440"
+    DEBUGFLAG="$DEBUGFLAG -qfullpath"
+# OPTARGS="-qhot" leads to wrong code
+  fi
+  # Keep DEBUG_FLAG (the spelling used by the other branches) in sync.
+  DEBUG_FLAG="$DEBUGFLAG"
+  LIBS="-lmpich.rts -lfmpich.rts -lmsglayer.rts -lrts.rts -ldevices.rts $LIBS"
+  LDFLAGS="$LDFLAGS -L/bgl/BlueLight/ppcfloor/bglsys/lib"
+  if test "$with_lapack" = yes; then
+    LIBS="-lesslbg -llapack.rts -lesslbg -lxlf90 -lxlfmath -lxl -lxlopt $LIBS"
+    LDFLAGS="$LDFLAGS -L/opt/ibmcmp/xlf/bg/10.1/blrts_lib -L/bgl/local/lib/ -L/opt/ibmmath/lib/"
+  fi
+
+  if test "$CCDEP" = "gcc"; then
+    DEPFLAGS="-MM"
+  else
+    DEPFLAGS="-M"
+  fi
+  CPPFLAGS="-I/bgl/BlueLight/ppcfloor/bglsys/include"
+  INCLUDES="$INCLUDES -I/bgl/BlueLight/ppcfloor/bglsys/include/"
+
+# The Blue Gene/P
+elif test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "bprts"; then
+  CFLAGS="$CFLAGS"
+  DEBUGFLAG="-g"
+  OPTARGS="-O3"
+  SOPTARGS="-O3"
+
+$as_echo "@%:@define BGL 1" >>confdefs.h
+
+
+$as_echo "@%:@define BGP 1" >>confdefs.h
+
+
+  if test "$XLC" = "yes"; then
+    CFLAGS="-qsrcmsg $CFLAGS"
+    OPTARGS="$OPTARGS -qarch=450d -qtune=450"
+    SOPTARGS="$SOPTARGS -qarch=450d -qtune=450"
+    DEBUGFLAG="$DEBUGFLAG -qfullpath"
+# OPTARGS="-qhot" leads to wrong code
+  fi
+  # Keep DEBUG_FLAG (the spelling used by the other branches) in sync.
+  DEBUG_FLAG="$DEBUGFLAG"
+# LIBS="-lxlf90_r -lxlomp_ser -lxl -lxlopt -lxlfmath -ldl -lrt -lpthread $LIBS"
+# LDFLAGS="$LDFLAGS -L/bgsys/local/lib/ -L/opt/ibmcmp/xlf/bg/11.1/lib -L/bgsys/drivers/ppcfloor/comm/"
+# if test $with_lapack = yes; then
+# LIBS="-lesslbg -llapack -lesslbg $LIBS"
+# LDFLAGS="$LDFLAGS -L/opt/ibmmath/lib/"
+# fi
+
+  if test "$CCDEP" = "gcc"; then
+    DEPFLAGS="-MM"
+  else
+    DEPFLAGS="-M"
+  fi
+  CPPFLAGS="-I/bgsys/drivers/ppcfloor/arch/include/ -I/bgsys/drivers/ppcfloor/comm/include"
+  INCLUDES="$INCLUDES -I/bgsys/local/include/ -I/bgsys/drivers/ppcfloor/arch/include/ -I/bgsys/drivers/ppcfloor/comm/include"
+
+
+
+# The IBM Power PC
+elif test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm"; then
+  CFLAGS="$CFLAGS -q64 -qsrcmsg"
+  LDFLAGS="$LDFLAGS -q64"
+  OPTARGS="-O2"
+  SOPTARGS="-O2"
+  DEBUG_FLAG="-qfullpath -g"
+  if test "$CCDEP" = "gcc"; then
+    DEPFLAGS="-MM"
+  else
+    DEPFLAGS="-M"
+  fi
+
+# The CRAY
+elif test "$host_vendor" = "cray"; then
+  echo
+  echo "Hey, we are on a cray, you should take some time for this..."
+  echo "get yourself a coffee or so!"
+  echo
+  CFLAGS="$CFLAGS -dp"
+
+$as_echo "@%:@define CRAY 1" >>confdefs.h
+
+  OPTARGS="-O3"
+  SOPTARGS="-O3"
+  DEBUG_FLAG="-g"
+  CCDEP="$CC"
+  DEPFLAGS="-M"
+
+# Any other host: use gcc for dependency generation when it is on PATH,
+# otherwise fall back to $CC; no optimisation flags are preset.
+else
+  # Extract the first word of "gcc", so it can be a program name with args.
+set dummy gcc; ac_word=$2
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
+$as_echo_n "checking for $ac_word... " >&6; }
+if ${ac_cv_prog_CCDEP+:} false; then :
+  $as_echo_n "(cached) " >&6
+else
+  if test -n "$CCDEP"; then
+  ac_cv_prog_CCDEP="$CCDEP" # Let the user override the test.
+else
+as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
+for as_dir in $PATH
+do
+  IFS=$as_save_IFS
+  test -z "$as_dir" && as_dir=.
+    for ac_exec_ext in '' $ac_executable_extensions; do
+  if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
+    ac_cv_prog_CCDEP=""gcc""
+    $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
+    break 2
+  fi
+done
+  done
+IFS=$as_save_IFS
+
+  test -z "$ac_cv_prog_CCDEP" && ac_cv_prog_CCDEP=""$CC""
+fi
+fi
+CCDEP=$ac_cv_prog_CCDEP
+if test -n "$CCDEP"; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CCDEP" >&5
+$as_echo "$CCDEP" >&6; }
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+fi
+
+
+  if test "$CCDEP" = "gcc"; then
+    DEPFLAGS="-MM"
+  else
+    DEPFLAGS="-M"
+  fi
+  OPTARGS=
+  SOPTARGS=
+fi
+
+
+# --enable-optimize (default yes): "no" clears the optimisation flags chosen above.
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to switch on optimisation" >&5
+$as_echo_n "checking whether we want to switch on optimisation... " >&6; }
+@%:@ Check whether --enable-optimize was given.
+if test "${enable_optimize+set}" = set; then :
+  enableval=$enable_optimize; enable_optimize=$enableval
+else
+  enable_optimize=yes
+fi
+
+if test "$enable_optimize" = no; then
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
+$as_echo "no" >&6; }
+  OPTARGS=
+  SOPTARGS=
+else
+  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
+$as_echo "yes" >&6; }
+fi
+
+{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use a copy of the gauge field" >&5
+$as_echo_n "checking whether we want to use a copy of the gauge field... " >&6; }
+@%:@ Check whether --enable-gaugecopy was given.
+if test "${enable_gaugecopy+set}" = set; then : + enableval=$enable_gaugecopy; enable_gaugecopy=$enableval +else + enable_gaugecopy=yes +fi + +if test $enable_gaugecopy = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define _GAUGE_COPY 1" >>confdefs.h + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use a Dirac Op. with halfspinor exchange" >&5 +$as_echo_n "checking whether we want to use a Dirac Op. with halfspinor exchange... " >&6; } +@%:@ Check whether --enable-halfspinor was given. +if test "${enable_halfspinor+set}" = set; then : + enableval=$enable_halfspinor; enable_halfspinor=$enableval +else + enable_halfspinor=yes +fi + +if test $enable_halfspinor = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define _USE_HALFSPINOR 1" >>confdefs.h + + if test $enable_gaugecopy = no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: switching on gaugecopy for Dirac operator with halfspinor!" >&5 +$as_echo "$as_me: WARNING: switching on gaugecopy for Dirac operator with halfspinor!" >&2;} + +$as_echo "@%:@define _GAUGE_COPY 1" >>confdefs.h + + fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use shmem API" >&5 +$as_echo_n "checking whether we want to use shmem API... " >&6; } +@%:@ Check whether --enable-shmem was given. 
+if test "${enable_shmem+set}" = set; then : + enableval=$enable_shmem; enable_shmem=$enableval +else + enable_shmem=no +fi + +if test $enable_shmem = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define _USE_SHMEM 1" >>confdefs.h + + LIBS="$LIBS -lsma" +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use timeslice-splitted communications" >&5 +$as_echo_n "checking whether we want to use timeslice-splitted communications... " >&6; } +@%:@ Check whether --enable-tsplitpar was given. +if test "${enable_tsplitpar+set}" = set; then : + enableval=$enable_tsplitpar; enable_tsp=$enableval +else + enable_tsp=no +fi + +if test $enable_tsp = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define _USE_TSPLITPAR 1" >>confdefs.h + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to compute the LapH eigenvalues" >&5 +$as_echo_n "checking whether we want to compute the LapH eigenvalues... " >&6; } +@%:@ Check whether --enable-laph was given. +if test "${enable_laph+set}" = set; then : + enableval=$enable_laph; enable_laph=$enableval +else + enable_laph=no +fi + +if test $enable_laph = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define WITHLAPH 1" >>confdefs.h + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use CUDA GPU" >&5 +$as_echo_n "checking whether we want to use CUDA GPU... " >&6; } +@%:@ Check whether --enable-gpu was given. 
+if test "${enable_gpu+set}" = set; then : + enableval=$enable_gpu; usegpu=$enableval +else + usegpu=no +fi + +if test $usegpu = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "@%:@define HAVE_GPU 1" >>confdefs.h + + NVCC="nvcc" + USESUBDIRS="$USESUBDIRS GPU" + GPUDIR="GPU" + LIBS="$LIBS -lcuda -lcudart -lcublas" + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking where to search for CUDA libs" >&5 +$as_echo_n "checking where to search for CUDA libs... " >&6; } + +@%:@ Check whether --with-cuda was given. +if test "${with_cuda+set}" = set; then : + withval=$with_cuda; cuda_dir=$withval +else + cuda_dir="/usr/local/cuda/lib" +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $cuda_dir" >&5 +$as_echo "$cuda_dir" >&6; } + if test $usegpu = yes; then + LDFLAGS="$LDFLAGS -L$cuda_dir" + fi + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking CUDA compile args" >&5 +$as_echo_n "checking CUDA compile args... " >&6; } + +@%:@ Check whether --with-cudacompileargs was given. +if test "${with_cudacompileargs+set}" = set; then : + withval=$with_cudacompileargs; cuda_compileargs=$withval +else + cuda_compileargs="--gpu-architecture sm_13 --use_fast_math -O3" +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $cuda_compileargs" >&5 +$as_echo "$cuda_compileargs" >&6; } + if test $usegpu = yes; then + GPUCFLAGS="$GPUCFLAGS $cuda_compileargs" + fi + if test $enable_mpi = yes; then + GPUMPICOMPILER="--compiler-bindir mpicc" + if test $withmpidimension != 1; then + as_fn_error $? "ERROR! The GPU Code is only parallelized in t-direction so far!" "$LINENO" 5 + fi + else + GPUMPICOMPILER="" + fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + NVCC="" +fi + + + + + + + + + +# QUDA library for GPUs +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use QUDA GPU" >&5 +$as_echo_n "checking whether we want to use QUDA GPU... 
" >&6; } +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu +if test -z "$CXX"; then + if test -n "$CCC"; then + CXX=$CCC + else + if test -n "$ac_tool_prefix"; then + for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CXX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CXX"; then + ac_cv_prog_CXX="$CXX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CXX="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CXX=$ac_cv_prog_CXX +if test -n "$CXX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5 +$as_echo "$CXX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CXX" && break + done +fi +if test -z "$CXX"; then + ac_ct_CXX=$CXX + for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... 
" >&6; } +if ${ac_cv_prog_ac_ct_CXX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CXX"; then + ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_CXX="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_CXX=$ac_cv_prog_ac_ct_CXX +if test -n "$ac_ct_CXX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CXX" >&5 +$as_echo "$ac_ct_CXX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_CXX" && break +done + + if test "x$ac_ct_CXX" = x; then + CXX="g++" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CXX=$ac_ct_CXX + fi +fi + + fi +fi +# Provide some information about the compiler. +$as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler version" >&5 +set X $ac_compile +ac_compiler=$2 +for ac_option in --version -v -V -qversion; do + { { ac_try="$ac_compiler $ac_option >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compiler $ac_option >&5") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + sed '10a\ +... rest of stderr output deleted ... + 10q' conftest.err >conftest.er1 + cat conftest.er1 >&5 + fi + rm -f conftest.er1 conftest.err + $as_echo "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 + test $ac_status = 0; } +done + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C++ compiler" >&5 +$as_echo_n "checking whether we are using the GNU C++ compiler... " >&6; } +if ${ac_cv_cxx_compiler_gnu+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +#ifndef __GNUC__ + choke me +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_compiler_gnu=yes +else + ac_compiler_gnu=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_cv_cxx_compiler_gnu=$ac_compiler_gnu + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_cxx_compiler_gnu" >&5 +$as_echo "$ac_cv_cxx_compiler_gnu" >&6; } +if test $ac_compiler_gnu = yes; then + GXX=yes +else + GXX= +fi +ac_test_CXXFLAGS=${CXXFLAGS+set} +ac_save_CXXFLAGS=$CXXFLAGS +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX accepts -g" >&5 +$as_echo_n "checking whether $CXX accepts -g... " >&6; } +if ${ac_cv_prog_cxx_g+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_save_cxx_werror_flag=$ac_cxx_werror_flag + ac_cxx_werror_flag=yes + ac_cv_prog_cxx_g=no + CXXFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_cv_prog_cxx_g=yes +else + CXXFLAGS="" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + +else + ac_cxx_werror_flag=$ac_save_cxx_werror_flag + CXXFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_cv_prog_cxx_g=yes +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_cxx_werror_flag=$ac_save_cxx_werror_flag +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_g" >&5 +$as_echo "$ac_cv_prog_cxx_g" >&6; } +if test "$ac_test_CXXFLAGS" = set; then + CXXFLAGS=$ac_save_CXXFLAGS +elif test $ac_cv_prog_cxx_g = yes; then + if test "$GXX" = yes; then + CXXFLAGS="-g -O2" + else + CXXFLAGS="-g" + fi +else + if test "$GXX" = yes; then + CXXFLAGS="-O2" + else + CXXFLAGS= + fi +fi +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + + + +@%:@ Check whether --with-qudadir was given. +if test "${with_qudadir+set}" = set; then : + withval=$with_qudadir; echo yes + QUDA_AVAILABLE=1 + +$as_echo "@%:@define QUDA 1" >>confdefs.h + + quda_dir=$withval + LDFLAGS="$LDFLAGS -L${quda_dir}/lib" + INCLUDES="$INCLUDES -I${quda_dir}/include/" + QUDA_INTERFACE="quda_interface" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking where to search for CUDA libs" >&5 +$as_echo_n "checking where to search for CUDA libs... " >&6; } + +@%:@ Check whether --with-cudadir was given. 
+if test "${with_cudadir+set}" = set; then : + withval=$with_cudadir; cuda_dir=$withval +else + cuda_dir="/usr/local/cuda/lib" +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $cuda_dir" >&5 +$as_echo "$cuda_dir" >&6; } + LDFLAGS="$LDFLAGS -L$cuda_dir" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for cudaMalloc in -lcudart" >&5 +$as_echo_n "checking for cudaMalloc in -lcudart... " >&6; } +if ${ac_cv_lib_cudart_cudaMalloc+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lcudart $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char cudaMalloc (); +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +return cudaMalloc (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_cudart_cudaMalloc=yes +else + ac_cv_lib_cudart_cudaMalloc=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_cudart_cudaMalloc" >&5 +$as_echo "$ac_cv_lib_cudart_cudaMalloc" >&6; } +if test "x$ac_cv_lib_cudart_cudaMalloc" = xyes; then : + cat >>confdefs.h <<_ACEOF +@%:@define HAVE_LIBCUDART 1 +_ACEOF + + LIBS="-lcudart $LIBS" + +else + as_fn_error $? "Can't link a simple program against library cudart." 
"$LINENO" 5 + +fi + + # Perform test in C++ + ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for freeGaugeQuda in -lquda" >&5 +$as_echo_n "checking for freeGaugeQuda in -lquda... " >&6; } +if ${ac_cv_lib_quda_freeGaugeQuda+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lquda $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char freeGaugeQuda (); +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +return freeGaugeQuda (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + ac_cv_lib_quda_freeGaugeQuda=yes +else + ac_cv_lib_quda_freeGaugeQuda=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_quda_freeGaugeQuda" >&5 +$as_echo "$ac_cv_lib_quda_freeGaugeQuda" >&6; } +if test "x$ac_cv_lib_quda_freeGaugeQuda" = xyes; then : + cat >>confdefs.h <<_ACEOF +@%:@define HAVE_LIBQUDA 1 +_ACEOF + + LIBS="-lquda $LIBS" + +else + as_fn_error $? "Can't link a simple program against library libquda. 
(Did you set CXX properly?)" "$LINENO" 5 + +fi + + ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + + #QUDA needs to be linked with C++ linker + CCLD=${CXX} + +else + echo no + QUDA_AVAILABLE=0 + QUDA_INTERFACE="" + + +fi + + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking checking consistency" >&5 +$as_echo_n "checking checking consistency... " >&6; } +if test $enable_mpi = yes ; then + if test $enable_iig = yes && test $withpersistent = yes ; then + as_fn_error $? "ERROR! indexindepgeom is not compatible with persistent communications " "$LINENO" 5 + fi + if test $enable_iig = yes && test $enable_shmem = yes ; then + as_fn_error $? "ERROR! indexindepgeom is not compatible with shmem API " "$LINENO" 5 + fi + if test $enable_tsp = yes && test $enable_iig = no; then + as_fn_error $? "ERROR! tsplitpar needs indexindepgeom" "$LINENO" 5 + fi + if test $enable_tsp = yes && test $enable_sse2 != yes ; then + as_fn_error $? "ERROR! tsplitpar needs at least SSE2 " "$LINENO" 5 + fi + if test $enable_tsp = yes && test $enable_gaugecopy != yes ; then + as_fn_error $? "ERROR! tsplitpar needs gaugecopy" "$LINENO" 5 + fi + if test $enable_laph = yes && test $enable_tsp != yes ; then + as_fn_error $? "ERROR! laph needs tsplitpar" "$LINENO" 5 + fi +fi + +if test ! -e lib; then + mkdir lib +fi + +if test ! -e test; then + mkdir test +fi + +if test ! -e tests; then + mkdir tests +fi + +if test ! 
-e tests/regressions; then + mkdir tests/regressions +fi + + +LIBS="-lhmc -lmonomial -loperator -lsolver -linit -lmeas -llinalg -lhmc -lxchange -lrational -lio $LIBS" +AUTOCONF=autoconf + +for i in $USESUBDIRS +do + make_files="$make_files $i/Makefile" +done + +ac_config_files="$ac_config_files Makefile $make_files" + + +cat >confcache <<\_ACEOF +# This file is a shell script that caches the results of configure +# tests run on this system so they can be shared between configure +# scripts and configure runs, see configure's option --config-cache. +# It is not useful on other systems. If it contains results you don't +# want to keep, you may remove or edit it. +# +# config.status only pays attention to the cache file if you give it +# the --recheck option to rerun configure. +# +# `ac_cv_env_foo' variables (set or unset) will be overridden when +# loading this file, other *unset* `ac_cv_foo' will be assigned the +# following values. + +_ACEOF + +# The following way of writing the cache mishandles newlines in values, +# but we know of no workaround that is simple, portable, and efficient. +# So, we kill variables containing newlines. +# Ultrix sh set writes to stderr and can't be redirected directly, +# and sets the high bit in the cache file unless we assign to the vars. 
+# Append the current *_cv_* variables to confcache, then replace
+# $cache_file with it when the two differ and the cache file is writable.
+(
+  # First pass: clear or unset every shell variable whose value contains a
+  # newline, warning when such a variable is a cache (*_cv_*) variable.
+  for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do
+    eval ac_val=\$$ac_var
+    case $ac_val in #(
+    *${as_nl}*)
+      case $ac_var in #(
+      *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5
+$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;;
+      esac
+      case $ac_var in #(
+      _ | IFS | as_nl) ;; #(
+      BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #(
+      *) { eval $ac_var=; unset $ac_var;} ;;
+      esac ;;
+    esac
+  done
+
+  # Second pass: print the *_cv_* assignments, sorted. The case probes how
+  # this shell's `set' prints a value containing a space to decide whether
+  # quotes have to be added.
+  (set) 2>&1 |
+    case $as_nl`(ac_space=' '; set) 2>&1` in #(
+    *${as_nl}ac_space=\ *)
+      # `set' does not quote correctly, so add quotes: double-quote
+      # substitution turns \\\\ into \\, and sed turns \\ into \.
+      sed -n \
+        "s/'/'\\\\''/g;
+          s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p"
+      ;; #(
+    *)
+      # `set' quotes correctly as required by POSIX, so do not add quotes.
+      sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p"
+      ;;
+    esac |
+    sort
+) |
+  # Rewrite each assignment so that loading the cache does not override a
+  # value that is already set; ac_cv_env_* lines are passed through as is.
+  sed '
+     /^ac_cv_env_/b end
+     t clear
+     :clear
+     s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/
+     t end
+     s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/
+     :end' >>confcache
+# Only touch the cache file when its content would change.
+if diff "$cache_file" confcache >/dev/null 2>&1; then :; else
+  if test -w "$cache_file"; then
+    if test "x$cache_file" != "x/dev/null"; then
+      { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5
+$as_echo "$as_me: updating cache $cache_file" >&6;}
+      # A missing file or a symlink is written in place; a regular file with
+      # a directory part in its name is replaced via a temporary "$$" name.
+      if test ! -f "$cache_file" || test -h "$cache_file"; then
+        cat confcache >"$cache_file"
+      else
+        case $cache_file in #(
+        */* | ?:*)
+          mv -f confcache "$cache_file"$$ &&
+          mv -f "$cache_file"$$ "$cache_file" ;; #(
+        *)
+          mv -f confcache "$cache_file" ;;
+        esac
+      fi
+    fi
+  else
+    { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5
+$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;}
+  fi
+fi
+rm -f confcache
+
+# Fall back to the default installation prefix when none was given.
+test "x$prefix" = xNONE && prefix=$ac_default_prefix
+# Let make expand exec_prefix.
+test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' + +DEFS=-DHAVE_CONFIG_H + +ac_libobjs= +ac_ltlibobjs= +U= +for ac_i in : $LIB@&t@OBJS; do test "x$ac_i" = x: && continue + # 1. Remove the extension, and $U if already installed. + ac_script='s/\$U\././;s/\.o$//;s/\.obj$//' + ac_i=`$as_echo "$ac_i" | sed "$ac_script"` + # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR + # will be set to the directory where LIBOBJS objects are built. + as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext" + as_fn_append ac_ltlibobjs " \${LIBOBJDIR}$ac_i"'$U.lo' +done +LIB@&t@OBJS=$ac_libobjs + +LTLIBOBJS=$ac_ltlibobjs + + + +: "${CONFIG_STATUS=./config.status}" +ac_write_fail=0 +ac_clean_files_save=$ac_clean_files +ac_clean_files="$ac_clean_files $CONFIG_STATUS" +{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5 +$as_echo "$as_me: creating $CONFIG_STATUS" >&6;} +as_write_fail=0 +cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1 +#! $SHELL +# Generated by $as_me. +# Run this file to recreate the current configuration. +# Compiler output produced by configure, useful for debugging +# configure, is in config.log if it exists. + +debug=false +ac_cs_recheck=false +ac_cs_silent=false + +SHELL=\${CONFIG_SHELL-$SHELL} +export SHELL +_ASEOF +cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1 +## -------------------- ## +## M4sh Initialization. ## +## -------------------- ## + +# Be more Bourne compatible +DUALCASE=1; export DUALCASE # for MKS sh +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in @%:@( + *posix*) : + set -o posix ;; @%:@( + *) : + ;; +esac +fi + + +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. 
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +# Prefer a ksh shell builtin over an external printf program on Solaris, +# but without wasting forks for bash or zsh. +if test -z "$BASH_VERSION$ZSH_VERSION" \ + && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='print -r --' + as_echo_n='print -rn --' +elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in @%:@( + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + +# The user is always right. +if test "${PATH_SEPARATOR+set}" != set; then + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { + (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || + PATH_SEPARATOR=';' + } +fi + + +# IFS +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent editors from complaining about space-tab. +# (If _AS_PATH_WALK were called with IFS unset, it would disable word +# splitting by setting IFS to empty value.) +IFS=" "" $as_nl" + +# Find who we are. Look in the path if we contain no directory separator. +as_myself= +case $0 in @%:@(( + *[\\/]* ) as_myself=$0 ;; + *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + done +IFS=$as_save_IFS + + ;; +esac +# We did not find ourselves, most probably we were run as `sh COMMAND' +# in which case we are not to be found in the path. +if test "x$as_myself" = x; then + as_myself=$0 +fi +if test ! -f "$as_myself"; then + $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + exit 1 +fi + +# Unset variables that we do not need and which cause bugs (e.g. in +# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" +# suppresses any "Segmentation fault" message there. '((' could +# trigger a bug in pdksh 5.2.14. +for as_var in BASH_ENV ENV MAIL MAILPATH +do eval test x\${$as_var+set} = xset \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done +PS1='$ ' +PS2='> ' +PS4='+ ' + +# NLS nuisances. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# CDPATH. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + + +@%:@ as_fn_error STATUS ERROR [LINENO LOG_FD] +@%:@ ---------------------------------------- +@%:@ Output "`basename @S|@0`: error: ERROR" to stderr. If LINENO and LOG_FD are +@%:@ provided, also output the error to LOG_FD, referencing LINENO. Then exit the +@%:@ script with STATUS, using 1 if that was 0. +as_fn_error () +{ + as_status=$1; test $as_status -eq 0 && as_status=1 + if test "$4"; then + as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + fi + $as_echo "$as_me: error: $2" >&2 + as_fn_exit $as_status +} @%:@ as_fn_error + + +@%:@ as_fn_set_status STATUS +@%:@ ----------------------- +@%:@ Set @S|@? to STATUS, without forking. +as_fn_set_status () +{ + return $1 +} @%:@ as_fn_set_status + +@%:@ as_fn_exit STATUS +@%:@ ----------------- +@%:@ Exit the shell with STATUS, even in a "trap 0" or "set -e" context. 
+as_fn_exit () +{ + set +e + as_fn_set_status $1 + exit $1 +} @%:@ as_fn_exit + +@%:@ as_fn_unset VAR +@%:@ --------------- +@%:@ Portably unset VAR. +as_fn_unset () +{ + { eval $1=; unset $1;} +} +as_unset=as_fn_unset +@%:@ as_fn_append VAR VALUE +@%:@ ---------------------- +@%:@ Append the text in VALUE to the end of the definition contained in VAR. Take +@%:@ advantage of any shell optimizations that allow amortized linear growth over +@%:@ repeated appends, instead of the typical quadratic growth present in naive +@%:@ implementations. +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : + eval 'as_fn_append () + { + eval $1+=\$2 + }' +else + as_fn_append () + { + eval $1=\$$1\$2 + } +fi # as_fn_append + +@%:@ as_fn_arith ARG... +@%:@ ------------------ +@%:@ Perform arithmetic evaluation on the ARGs, and store the result in the +@%:@ global @S|@as_val. Take advantage of shells that can avoid forks. The arguments +@%:@ must be portable across @S|@(()) and expr. +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : + eval 'as_fn_arith () + { + as_val=$(( $* )) + }' +else + as_fn_arith () + { + as_val=`expr "$@" || test $? -eq 1` + } +fi # as_fn_arith + + +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + +# Avoid depending upon Character Ranges. 
+as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + +ECHO_C= ECHO_N= ECHO_T= +case `echo -n x` in @%:@((((( +-n*) + case `echo 'xy\c'` in + *c*) ECHO_T=' ';; # ECHO_T is single tab character. + xy) ECHO_C='\c';; + *) echo `echo ksh88 bug on AIX 6.1` > /dev/null + ECHO_T=' ';; + esac;; +*) + ECHO_N='-n';; +esac + +rm -f conf$$ conf$$.exe conf$$.file +if test -d conf$$.dir; then + rm -f conf$$.dir/conf$$.file +else + rm -f conf$$.dir + mkdir conf$$.dir 2>/dev/null +fi +if (echo >conf$$.file) 2>/dev/null; then + if ln -s conf$$.file conf$$ 2>/dev/null; then + as_ln_s='ln -s' + # ... but there are two gotchas: + # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. + # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. + # In both cases, we have to default to `cp -pR'. + ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || + as_ln_s='cp -pR' + elif ln conf$$.file conf$$ 2>/dev/null; then + as_ln_s=ln + else + as_ln_s='cp -pR' + fi +else + as_ln_s='cp -pR' +fi +rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file +rmdir conf$$.dir 2>/dev/null + + +@%:@ as_fn_mkdir_p +@%:@ ------------- +@%:@ Create "@S|@as_dir" as a directory, including parents if necessary. +as_fn_mkdir_p () +{ + + case $as_dir in #( + -*) as_dir=./$as_dir;; + esac + test -d "$as_dir" || eval $as_mkdir_p || { + as_dirs= + while :; do + case $as_dir in #( + *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *) as_qdir=$as_dir;; + esac + as_dirs="'$as_qdir' $as_dirs" + as_dir=`$as_dirname -- "$as_dir" || +$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_dir" : 'X\(//\)[^/]' \| \ + X"$as_dir" : 'X\(//\)$' \| \ + X"$as_dir" : 'X\(/\)' \| . 
2>/dev/null || +$as_echo X"$as_dir" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + test -d "$as_dir" && break + done + test -z "$as_dirs" || eval "mkdir $as_dirs" + } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" + + +} @%:@ as_fn_mkdir_p +if mkdir -p . 2>/dev/null; then + as_mkdir_p='mkdir -p "$as_dir"' +else + test -d ./-p && rmdir ./-p + as_mkdir_p=false +fi + + +@%:@ as_fn_executable_p FILE +@%:@ ----------------------- +@%:@ Test if FILE is an executable regular file. +as_fn_executable_p () +{ + test -f "$1" && test -x "$1" +} @%:@ as_fn_executable_p +as_test_x='test -x' +as_executable_p=as_fn_executable_p + +# Sed expression to map a string onto a valid CPP name. +as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" + +# Sed expression to map a string onto a valid variable name. +as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" + + +exec 6>&1 +## ----------------------------------- ## +## Main body of $CONFIG_STATUS script. ## +## ----------------------------------- ## +_ASEOF +test $as_write_fail = 0 && chmod +x $CONFIG_STATUS || ac_write_fail=1 + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# Save the log message, to keep $0 and so on meaningful, and to +# report actual input values of CONFIG_FILES etc. instead of their +# values after options handling. +ac_log=" +This file was extended by tmLQCD $as_me 5.2.0, which was +generated by GNU Autoconf 2.69. 
Invocation command line was + + CONFIG_FILES = $CONFIG_FILES + CONFIG_HEADERS = $CONFIG_HEADERS + CONFIG_LINKS = $CONFIG_LINKS + CONFIG_COMMANDS = $CONFIG_COMMANDS + $ $0 $@ + +on `(hostname || uname -n) 2>/dev/null | sed 1q` +" + +_ACEOF + +case $ac_config_files in *" +"*) set x $ac_config_files; shift; ac_config_files=$*;; +esac + +case $ac_config_headers in *" +"*) set x $ac_config_headers; shift; ac_config_headers=$*;; +esac + + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +# Files that config.status was made for. +config_files="$ac_config_files" +config_headers="$ac_config_headers" + +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +ac_cs_usage="\ +\`$as_me' instantiates files and other configuration actions +from templates according to the current configuration. Unless the files +and actions are specified as TAGs, all are instantiated by default. + +Usage: $0 [OPTION]... [TAG]... + + -h, --help print this help, then exit + -V, --version print version number and configuration settings, then exit + --config print configuration, then exit + -q, --quiet, --silent + do not print progress messages + -d, --debug don't remove temporary files + --recheck update $as_me by reconfiguring in the same conditions + --file=FILE[:TEMPLATE] + instantiate the configuration file FILE + --header=FILE[:TEMPLATE] + instantiate the configuration header FILE + +Configuration files: +$config_files + +Configuration headers: +$config_headers + +Report bugs to ." + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" +ac_cs_version="\\ +tmLQCD config.status 5.2.0 +configured by $0, generated by GNU Autoconf 2.69, + with options \\"\$ac_cs_config\\" + +Copyright (C) 2012 Free Software Foundation, Inc. +This config.status script is free software; the Free Software Foundation +gives unlimited permission to copy, distribute and modify it." 
+ +ac_pwd='$ac_pwd' +srcdir='$srcdir' +test -n "\$AWK" || AWK=awk +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# The default lists apply if the user does not specify any file. +ac_need_defaults=: +while test $# != 0 +do + case $1 in + --*=?*) + ac_option=`expr "X$1" : 'X\([^=]*\)='` + ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'` + ac_shift=: + ;; + --*=) + ac_option=`expr "X$1" : 'X\([^=]*\)='` + ac_optarg= + ac_shift=: + ;; + *) + ac_option=$1 + ac_optarg=$2 + ac_shift=shift + ;; + esac + + case $ac_option in + # Handling of the options. + -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + ac_cs_recheck=: ;; + --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) + $as_echo "$ac_cs_version"; exit ;; + --config | --confi | --conf | --con | --co | --c ) + $as_echo "$ac_cs_config"; exit ;; + --debug | --debu | --deb | --de | --d | -d ) + debug=: ;; + --file | --fil | --fi | --f ) + $ac_shift + case $ac_optarg in + *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; + '') as_fn_error $? "missing file argument" ;; + esac + as_fn_append CONFIG_FILES " '$ac_optarg'" + ac_need_defaults=false;; + --header | --heade | --head | --hea ) + $ac_shift + case $ac_optarg in + *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; + esac + as_fn_append CONFIG_HEADERS " '$ac_optarg'" + ac_need_defaults=false;; + --he | --h) + # Conflict between --help and --header + as_fn_error $? "ambiguous option: \`$1' +Try \`$0 --help' for more information.";; + --help | --hel | -h ) + $as_echo "$ac_cs_usage"; exit ;; + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil | --si | --s) + ac_cs_silent=: ;; + + # This is an error. + -*) as_fn_error $? "unrecognized option: \`$1' +Try \`$0 --help' for more information." 
;; + + *) as_fn_append ac_config_targets " $1" + ac_need_defaults=false ;; + + esac + shift +done + +ac_configure_extra_args= + +if $ac_cs_silent; then + exec 6>/dev/null + ac_configure_extra_args="$ac_configure_extra_args --silent" +fi + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +if \$ac_cs_recheck; then + set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion + shift + \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6 + CONFIG_SHELL='$SHELL' + export CONFIG_SHELL + exec "\$@" +fi + +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +exec 5>>config.log +{ + echo + sed 'h;s/./-/g;s/^.../@%:@@%:@ /;s/...$/ @%:@@%:@/;p;x;p;x' <<_ASBOX +@%:@@%:@ Running $as_me. @%:@@%:@ +_ASBOX + $as_echo "$ac_log" +} >&5 + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 + +# Handling of arguments. +for ac_config_target in $ac_config_targets +do + case $ac_config_target in + "config.h") CONFIG_HEADERS="$CONFIG_HEADERS config.h" ;; + "fixed_volume.h") CONFIG_FILES="$CONFIG_FILES fixed_volume.h" ;; + "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;; + "$make_files") CONFIG_FILES="$CONFIG_FILES $make_files" ;; + + *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;; + esac +done + + +# If the user did not use the arguments to specify the items to instantiate, +# then the envvar interface is used. Set only those that are not. +# We use the long form for the default assignment because of an extremely +# bizarre bug on SunOS 4.1.3. +if $ac_need_defaults; then + test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files + test "${CONFIG_HEADERS+set}" = set || CONFIG_HEADERS=$config_headers +fi + +# Have a temporary directory for convenience. Make it in the build tree +# simply because there is no reason against having it here, and in addition, +# creating and moving files from /tmp can sometimes cause problems. 
+# Hook for its removal unless debugging. +# Note that there is a small window in which the directory will not be cleaned: +# after its creation but before its name has been assigned to `$tmp'. +$debug || +{ + tmp= ac_tmp= + trap 'exit_status=$? + : "${ac_tmp:=$tmp}" + { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status +' 0 + trap 'as_fn_exit 1' 1 2 13 15 +} +# Create a (secure) tmp directory for tmp files. + +{ + tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` && + test -d "$tmp" +} || +{ + tmp=./conf$$-$RANDOM + (umask 077 && mkdir "$tmp") +} || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5 +ac_tmp=$tmp + +# Set up the scripts for CONFIG_FILES section. +# No need to generate them if there are no CONFIG_FILES. +# This happens for instance with `./config.status config.h'. +if test -n "$CONFIG_FILES"; then + + +ac_cr=`echo X | tr X '\015'` +# On cygwin, bash can eat \r inside `` if the user requested igncr. +# But we know of no other shell where ac_cr would be empty at this +# point, so we can use a bashism as a fallback. +if test "x$ac_cr" = x; then + eval ac_cr=\$\'\\r\' +fi +ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' /dev/null` +if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then + ac_cs_awk_cr='\\r' +else + ac_cs_awk_cr=$ac_cr +fi + +echo 'BEGIN {' >"$ac_tmp/subs1.awk" && +_ACEOF + + +{ + echo "cat >conf$$subs.awk <<_ACEOF" && + echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' && + echo "_ACEOF" +} >conf$$subs.sh || + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 +ac_delim_num=`echo "$ac_subst_vars" | grep -c '^'` +ac_delim='%!_!# ' +for ac_last_try in false false false false false :; do + . ./conf$$subs.sh || + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 + + ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X` + if test $ac_delim_n = $ac_delim_num; then + break + elif $ac_last_try; then + as_fn_error $? 
"could not make $CONFIG_STATUS" "$LINENO" 5 + else + ac_delim="$ac_delim!$ac_delim _$ac_delim!! " + fi +done +rm -f conf$$subs.sh + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +cat >>"\$ac_tmp/subs1.awk" <<\\_ACAWK && +_ACEOF +sed -n ' +h +s/^/S["/; s/!.*/"]=/ +p +g +s/^[^!]*!// +:repl +t repl +s/'"$ac_delim"'$// +t delim +:nl +h +s/\(.\{148\}\)..*/\1/ +t more1 +s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/ +p +n +b repl +:more1 +s/["\\]/\\&/g; s/^/"/; s/$/"\\/ +p +g +s/.\{148\}// +t nl +:delim +h +s/\(.\{148\}\)..*/\1/ +t more2 +s/["\\]/\\&/g; s/^/"/; s/$/"/ +p +b +:more2 +s/["\\]/\\&/g; s/^/"/; s/$/"\\/ +p +g +s/.\{148\}// +t delim +' >$CONFIG_STATUS || ac_write_fail=1 +rm -f conf$$subs.awk +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +_ACAWK +cat >>"\$ac_tmp/subs1.awk" <<_ACAWK && + for (key in S) S_is_set[key] = 1 + FS = "" + +} +{ + line = $ 0 + nfields = split(line, field, "@") + substed = 0 + len = length(field[1]) + for (i = 2; i < nfields; i++) { + key = field[i] + keylen = length(key) + if (S_is_set[key]) { + value = S[key] + line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3) + len += length(value) + length(field[++i]) + substed = 1 + } else + len += 1 + keylen + } + + print line +} + +_ACAWK +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then + sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g" +else + cat +fi < "$ac_tmp/subs1.awk" > "$ac_tmp/subs.awk" \ + || as_fn_error $? "could not setup config files machinery" "$LINENO" 5 +_ACEOF + +# VPATH may cause trouble with some makes, so we remove sole $(srcdir), +# ${srcdir} and @srcdir@ entries from VPATH if srcdir is ".", strip leading and +# trailing colons and then remove the whole line if VPATH becomes empty +# (actually we leave an empty line to preserve line numbers). 
+if test "x$srcdir" = x.; then + ac_vpsub='/^[ ]*VPATH[ ]*=[ ]*/{ +h +s/// +s/^/:/ +s/[ ]*$/:/ +s/:\$(srcdir):/:/g +s/:\${srcdir}:/:/g +s/:@srcdir@:/:/g +s/^:*// +s/:*$// +x +s/\(=[ ]*\).*/\1/ +G +s/\n// +s/^[^=]*=[ ]*$// +}' +fi + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +fi # test -n "$CONFIG_FILES" + +# Set up the scripts for CONFIG_HEADERS section. +# No need to generate them if there are no CONFIG_HEADERS. +# This happens for instance with `./config.status Makefile'. +if test -n "$CONFIG_HEADERS"; then +cat >"$ac_tmp/defines.awk" <<\_ACAWK || +BEGIN { +_ACEOF + +# Transform confdefs.h into an awk script `defines.awk', embedded as +# here-document in config.status, that substitutes the proper values into +# config.h.in to produce config.h. + +# Create a delimiter string that does not exist in confdefs.h, to ease +# handling of long lines. +ac_delim='%!_!# ' +for ac_last_try in false false :; do + ac_tt=`sed -n "/$ac_delim/p" confdefs.h` + if test -z "$ac_tt"; then + break + elif $ac_last_try; then + as_fn_error $? "could not make $CONFIG_HEADERS" "$LINENO" 5 + else + ac_delim="$ac_delim!$ac_delim _$ac_delim!! " + fi +done + +# For the awk script, D is an array of macro values keyed by name, +# likewise P contains macro parameters if any. Preserve backslash +# newline sequences. 
+ +ac_word_re=[_$as_cr_Letters][_$as_cr_alnum]* +sed -n ' +s/.\{148\}/&'"$ac_delim"'/g +t rset +:rset +s/^[ ]*#[ ]*define[ ][ ]*/ / +t def +d +:def +s/\\$// +t bsnl +s/["\\]/\\&/g +s/^ \('"$ac_word_re"'\)\(([^()]*)\)[ ]*\(.*\)/P["\1"]="\2"\ +D["\1"]=" \3"/p +s/^ \('"$ac_word_re"'\)[ ]*\(.*\)/D["\1"]=" \2"/p +d +:bsnl +s/["\\]/\\&/g +s/^ \('"$ac_word_re"'\)\(([^()]*)\)[ ]*\(.*\)/P["\1"]="\2"\ +D["\1"]=" \3\\\\\\n"\\/p +t cont +s/^ \('"$ac_word_re"'\)[ ]*\(.*\)/D["\1"]=" \2\\\\\\n"\\/p +t cont +d +:cont +n +s/.\{148\}/&'"$ac_delim"'/g +t clear +:clear +s/\\$// +t bsnlc +s/["\\]/\\&/g; s/^/"/; s/$/"/p +d +:bsnlc +s/["\\]/\\&/g; s/^/"/; s/$/\\\\\\n"\\/p +b cont +' >$CONFIG_STATUS || ac_write_fail=1 + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 + for (key in D) D_is_set[key] = 1 + FS = "" +} +/^[\t ]*#[\t ]*(define|undef)[\t ]+$ac_word_re([\t (]|\$)/ { + line = \$ 0 + split(line, arg, " ") + if (arg[1] == "#") { + defundef = arg[2] + mac1 = arg[3] + } else { + defundef = substr(arg[1], 2) + mac1 = arg[2] + } + split(mac1, mac2, "(") #) + macro = mac2[1] + prefix = substr(line, 1, index(line, defundef) - 1) + if (D_is_set[macro]) { + # Preserve the white space surrounding the "#". + print prefix "define", macro P[macro] D[macro] + next + } else { + # Replace #undef with comments. This is necessary, for example, + # in the case of _POSIX_SOURCE, which is predefined and required + # on some systems where configure will not decide to define it. + if (defundef == "undef") { + print "/*", prefix defundef, macro, "*/" + next + } + } +} +{ print } +_ACAWK +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 + as_fn_error $? "could not setup config headers machinery" "$LINENO" 5 +fi # test -n "$CONFIG_HEADERS" + + +eval set X " :F $CONFIG_FILES :H $CONFIG_HEADERS " +shift +for ac_tag +do + case $ac_tag in + :[FHLC]) ac_mode=$ac_tag; continue;; + esac + case $ac_mode$ac_tag in + :[FHL]*:*);; + :L* | :C*:*) as_fn_error $? 
"invalid tag \`$ac_tag'" "$LINENO" 5;; + :[FH]-) ac_tag=-:-;; + :[FH]*) ac_tag=$ac_tag:$ac_tag.in;; + esac + ac_save_IFS=$IFS + IFS=: + set x $ac_tag + IFS=$ac_save_IFS + shift + ac_file=$1 + shift + + case $ac_mode in + :L) ac_source=$1;; + :[FH]) + ac_file_inputs= + for ac_f + do + case $ac_f in + -) ac_f="$ac_tmp/stdin";; + *) # Look for the file first in the build tree, then in the source tree + # (if the path is not absolute). The absolute path cannot be DOS-style, + # because $ac_f cannot contain `:'. + test -f "$ac_f" || + case $ac_f in + [\\/$]*) false;; + *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";; + esac || + as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;; + esac + case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac + as_fn_append ac_file_inputs " '$ac_f'" + done + + # Let's still pretend it is `configure' which instantiates (i.e., don't + # use $as_me), people would be surprised to read: + # /* config.h. Generated by config.status. */ + configure_input='Generated from '` + $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g' + `' by configure.' + if test x"$ac_file" != x-; then + configure_input="$ac_file. $configure_input" + { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5 +$as_echo "$as_me: creating $ac_file" >&6;} + fi + # Neutralize special characters interpreted by sed in replacement strings. + case $configure_input in #( + *\&* | *\|* | *\\* ) + ac_sed_conf_input=`$as_echo "$configure_input" | + sed 's/[\\\\&|]/\\\\&/g'`;; #( + *) ac_sed_conf_input=$configure_input;; + esac + + case $ac_tag in + *:-:* | *:-) cat >"$ac_tmp/stdin" \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;; + esac + ;; + esac + + ac_dir=`$as_dirname -- "$ac_file" || +$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$ac_file" : 'X\(//\)[^/]' \| \ + X"$ac_file" : 'X\(//\)$' \| \ + X"$ac_file" : 'X\(/\)' \| . 
2>/dev/null || +$as_echo X"$ac_file" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + as_dir="$ac_dir"; as_fn_mkdir_p + ac_builddir=. + +case "$ac_dir" in +.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; +*) + ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + # A ".." for each directory in $ac_dir_suffix. + ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + case $ac_top_builddir_sub in + "") ac_top_builddir_sub=. ac_top_build_prefix= ;; + *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; + esac ;; +esac +ac_abs_top_builddir=$ac_pwd +ac_abs_builddir=$ac_pwd$ac_dir_suffix +# for backward compatibility: +ac_top_builddir=$ac_top_build_prefix + +case $srcdir in + .) # We are building in place. + ac_srcdir=. + ac_top_srcdir=$ac_top_builddir_sub + ac_abs_top_srcdir=$ac_pwd ;; + [\\/]* | ?:[\\/]* ) # Absolute name. + ac_srcdir=$srcdir$ac_dir_suffix; + ac_top_srcdir=$srcdir + ac_abs_top_srcdir=$srcdir ;; + *) # Relative name. + ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix + ac_top_srcdir=$ac_top_build_prefix$srcdir + ac_abs_top_srcdir=$ac_pwd/$srcdir ;; +esac +ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix + + + case $ac_mode in + :F) + # + # CONFIG_FILE + # + +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# If the template does not know about datarootdir, expand it. +# FIXME: This hack should be removed a few years after 2.60. 
+ac_datarootdir_hack=; ac_datarootdir_seen= +ac_sed_dataroot=' +/datarootdir/ { + p + q +} +/@datadir@/p +/@docdir@/p +/@infodir@/p +/@localedir@/p +/@mandir@/p' +case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in +*datarootdir*) ac_datarootdir_seen=yes;; +*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 +$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 + ac_datarootdir_hack=' + s&@datadir@&$datadir&g + s&@docdir@&$docdir&g + s&@infodir@&$infodir&g + s&@localedir@&$localedir&g + s&@mandir@&$mandir&g + s&\\\${datarootdir}&$datarootdir&g' ;; +esac +_ACEOF + +# Neutralize VPATH when `$srcdir' = `.'. +# Shell code in configure.ac might set extrasub. +# FIXME: do we really want to maintain this feature? +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +ac_sed_extra="$ac_vpsub +$extrasub +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +:t +/@[a-zA-Z_][a-zA-Z_0-9]*@/!b +s|@configure_input@|$ac_sed_conf_input|;t t +s&@top_builddir@&$ac_top_builddir_sub&;t t +s&@top_build_prefix@&$ac_top_build_prefix&;t t +s&@srcdir@&$ac_srcdir&;t t +s&@abs_srcdir@&$ac_abs_srcdir&;t t +s&@top_srcdir@&$ac_top_srcdir&;t t +s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t +s&@builddir@&$ac_builddir&;t t +s&@abs_builddir@&$ac_abs_builddir&;t t +s&@abs_top_builddir@&$ac_abs_top_builddir&;t t +$ac_datarootdir_hack +" +eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$ac_tmp/subs.awk" \ + >$ac_tmp/out || as_fn_error $? 
"could not create $ac_file" "$LINENO" 5 + +test -z "$ac_datarootdir_hack$ac_datarootdir_seen" && + { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } && + { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' \ + "$ac_tmp/out"`; test -z "$ac_out"; } && + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined" >&5 +$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined" >&2;} + + rm -f "$ac_tmp/stdin" + case $ac_file in + -) cat "$ac_tmp/out" && rm -f "$ac_tmp/out";; + *) rm -f "$ac_file" && mv "$ac_tmp/out" "$ac_file";; + esac \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + ;; + :H) + # + # CONFIG_HEADER + # + if test x"$ac_file" != x-; then + { + $as_echo "/* $configure_input */" \ + && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" + } >"$ac_tmp/config.h" \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + if diff "$ac_file" "$ac_tmp/config.h" >/dev/null 2>&1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: $ac_file is unchanged" >&5 +$as_echo "$as_me: $ac_file is unchanged" >&6;} + else + rm -f "$ac_file" + mv "$ac_tmp/config.h" "$ac_file" \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + fi + else + $as_echo "/* $configure_input */" \ + && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" \ + || as_fn_error $? "could not create -" "$LINENO" 5 + fi + ;; + + + esac + +done # for ac_tag + + +as_fn_exit 0 +_ACEOF +ac_clean_files=$ac_clean_files_save + +test $ac_write_fail = 0 || + as_fn_error $? "write failure creating $CONFIG_STATUS" "$LINENO" 5 + + +# configure is writing to config.log, and then calls config.status. +# config.status does its own redirection, appending to config.log. 
+# Unfortunately, on DOS this fails, as config.log is still kept open +# by configure, so config.status won't be able to write to it; its +# output is simply discarded. So we exec the FD to /dev/null, +# effectively closing config.log, so it can be properly (re)opened and +# appended to by config.status. When coming back to configure, we +# need to make the FD available again. +if test "$no_create" != yes; then + ac_cs_success=: + ac_config_status_args= + test "$silent" = yes && + ac_config_status_args="$ac_config_status_args --quiet" + exec 5>/dev/null + $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false + exec 5>>config.log + # Use ||, not &&, to avoid exiting from the if with $? = 1, which + # would make configure fail if this is the last instruction. + $ac_cs_success || as_fn_exit 1 +fi +if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 +$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} +fi + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/autom4te.cache/requests b/qcd/part_cpu/applications/QCD/src/kernel_D/autom4te.cache/requests new file mode 100644 index 0000000000000000000000000000000000000000..99449c3805aa583ad0930a3f951c8c90dfd0b0ea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/autom4te.cache/requests @@ -0,0 +1,77 @@ +# This file was generated by Autom4te Thu Apr 10 10:06:43 UTC 2014. +# It contains the lists of macros which have been traced. +# It can be safely removed. 
+ +@request = ( + bless( [ + '0', + 1, + [ + '/usr/share/autoconf' + ], + [ + '/usr/share/autoconf/autoconf/autoconf.m4f', + 'configure.in' + ], + { + '_AM_COND_ENDIF' => 1, + 'AC_SUBST' => 1, + 'AM_MAINTAINER_MODE' => 1, + 'AC_PROG_LIBTOOL' => 1, + 'AM_PROG_FC_C_O' => 1, + 'AC_CONFIG_HEADERS' => 1, + 'AC_FC_PP_SRCEXT' => 1, + 'AM_NLS' => 1, + 'AM_XGETTEXT_OPTION' => 1, + '_LT_AC_TAGCONFIG' => 1, + 'AM_PROG_F77_C_O' => 1, + 'AC_CONFIG_AUX_DIR' => 1, + 'AC_LIBSOURCE' => 1, + 'AM_PROG_CXX_C_O' => 1, + '_AM_MAKEFILE_INCLUDE' => 1, + 'AC_CANONICAL_SYSTEM' => 1, + 'AM_SILENT_RULES' => 1, + 'AM_PATH_GUILE' => 1, + 'LT_INIT' => 1, + 'AM_CONDITIONAL' => 1, + 'AM_AUTOMAKE_VERSION' => 1, + 'LT_SUPPORTED_TAG' => 1, + 'AC_DEFINE_TRACE_LITERAL' => 1, + 'AC_FC_SRCEXT' => 1, + 'AM_PROG_AR' => 1, + 'AC_CANONICAL_TARGET' => 1, + 'AC_CANONICAL_BUILD' => 1, + 'AM_GNU_GETTEXT' => 1, + 'm4_include' => 1, + 'AC_CONFIG_LIBOBJ_DIR' => 1, + 'AM_GNU_GETTEXT_INTL_SUBDIR' => 1, + 'include' => 1, + 'AM_PROG_CC_C_O' => 1, + 'AM_INIT_AUTOMAKE' => 1, + 'AC_CONFIG_LINKS' => 1, + 'AC_INIT' => 1, + 'AC_SUBST_TRACE' => 1, + 'AC_FC_FREEFORM' => 1, + 'AC_FC_PP_DEFINE' => 1, + 'AC_CONFIG_SUBDIRS' => 1, + 'm4_pattern_allow' => 1, + 'AM_ENABLE_MULTILIB' => 1, + 'AM_POT_TOOLS' => 1, + 'AC_CONFIG_FILES' => 1, + '_AM_COND_IF' => 1, + '_AM_COND_ELSE' => 1, + 'm4_sinclude' => 1, + 'm4_pattern_forbid' => 1, + '_AM_SUBST_NOTMAKE' => 1, + 'AC_CANONICAL_HOST' => 1, + '_m4_warn' => 1, + 'LT_CONFIG_LTDL_DIR' => 1, + 'AH_OUTPUT' => 1, + 'AC_REQUIRE_AUX_FILE' => 1, + 'AM_PROG_MOC' => 1, + 'sinclude' => 1, + 'AM_MAKEFILE_INCLUDE' => 1 + } + ], 'Autom4te::Request' ) + ); + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/autom4te.cache/traces.0 b/qcd/part_cpu/applications/QCD/src/kernel_D/autom4te.cache/traces.0 new file mode 100644 index 0000000000000000000000000000000000000000..5a20f8e3c49c5c179b81048addc369819f406ab5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/autom4te.cache/traces.0 @@ -0,0 
+1,911 @@ +m4trace:configure.in:5: -1- AC_INIT([tmLQCD], [5.2.0], [curbach@gmx.de]) +m4trace:configure.in:5: -1- m4_pattern_forbid([^_?A[CHUM]_]) +m4trace:configure.in:5: -1- m4_pattern_forbid([_AC_]) +m4trace:configure.in:5: -1- m4_pattern_forbid([^LIBOBJS$], [do not use LIBOBJS directly, use AC_LIBOBJ (see section `AC_LIBOBJ vs LIBOBJS']) +m4trace:configure.in:5: -1- m4_pattern_allow([^AS_FLAGS$]) +m4trace:configure.in:5: -1- m4_pattern_forbid([^_?m4_]) +m4trace:configure.in:5: -1- m4_pattern_forbid([^dnl$]) +m4trace:configure.in:5: -1- m4_pattern_forbid([^_?AS_]) +m4trace:configure.in:5: -1- AC_SUBST([SHELL]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([SHELL]) +m4trace:configure.in:5: -1- m4_pattern_allow([^SHELL$]) +m4trace:configure.in:5: -1- AC_SUBST([PATH_SEPARATOR]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([PATH_SEPARATOR]) +m4trace:configure.in:5: -1- m4_pattern_allow([^PATH_SEPARATOR$]) +m4trace:configure.in:5: -1- AC_SUBST([PACKAGE_NAME], [m4_ifdef([AC_PACKAGE_NAME], ['AC_PACKAGE_NAME'])]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([PACKAGE_NAME]) +m4trace:configure.in:5: -1- m4_pattern_allow([^PACKAGE_NAME$]) +m4trace:configure.in:5: -1- AC_SUBST([PACKAGE_TARNAME], [m4_ifdef([AC_PACKAGE_TARNAME], ['AC_PACKAGE_TARNAME'])]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([PACKAGE_TARNAME]) +m4trace:configure.in:5: -1- m4_pattern_allow([^PACKAGE_TARNAME$]) +m4trace:configure.in:5: -1- AC_SUBST([PACKAGE_VERSION], [m4_ifdef([AC_PACKAGE_VERSION], ['AC_PACKAGE_VERSION'])]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([PACKAGE_VERSION]) +m4trace:configure.in:5: -1- m4_pattern_allow([^PACKAGE_VERSION$]) +m4trace:configure.in:5: -1- AC_SUBST([PACKAGE_STRING], [m4_ifdef([AC_PACKAGE_STRING], ['AC_PACKAGE_STRING'])]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([PACKAGE_STRING]) +m4trace:configure.in:5: -1- m4_pattern_allow([^PACKAGE_STRING$]) +m4trace:configure.in:5: -1- AC_SUBST([PACKAGE_BUGREPORT], [m4_ifdef([AC_PACKAGE_BUGREPORT], ['AC_PACKAGE_BUGREPORT'])]) 
+m4trace:configure.in:5: -1- AC_SUBST_TRACE([PACKAGE_BUGREPORT]) +m4trace:configure.in:5: -1- m4_pattern_allow([^PACKAGE_BUGREPORT$]) +m4trace:configure.in:5: -1- AC_SUBST([PACKAGE_URL], [m4_ifdef([AC_PACKAGE_URL], ['AC_PACKAGE_URL'])]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([PACKAGE_URL]) +m4trace:configure.in:5: -1- m4_pattern_allow([^PACKAGE_URL$]) +m4trace:configure.in:5: -1- AC_SUBST([exec_prefix], [NONE]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([exec_prefix]) +m4trace:configure.in:5: -1- m4_pattern_allow([^exec_prefix$]) +m4trace:configure.in:5: -1- AC_SUBST([prefix], [NONE]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([prefix]) +m4trace:configure.in:5: -1- m4_pattern_allow([^prefix$]) +m4trace:configure.in:5: -1- AC_SUBST([program_transform_name], [s,x,x,]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([program_transform_name]) +m4trace:configure.in:5: -1- m4_pattern_allow([^program_transform_name$]) +m4trace:configure.in:5: -1- AC_SUBST([bindir], ['${exec_prefix}/bin']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([bindir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^bindir$]) +m4trace:configure.in:5: -1- AC_SUBST([sbindir], ['${exec_prefix}/sbin']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([sbindir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^sbindir$]) +m4trace:configure.in:5: -1- AC_SUBST([libexecdir], ['${exec_prefix}/libexec']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([libexecdir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^libexecdir$]) +m4trace:configure.in:5: -1- AC_SUBST([datarootdir], ['${prefix}/share']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([datarootdir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^datarootdir$]) +m4trace:configure.in:5: -1- AC_SUBST([datadir], ['${datarootdir}']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([datadir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^datadir$]) +m4trace:configure.in:5: -1- AC_SUBST([sysconfdir], ['${prefix}/etc']) +m4trace:configure.in:5: -1- 
AC_SUBST_TRACE([sysconfdir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^sysconfdir$]) +m4trace:configure.in:5: -1- AC_SUBST([sharedstatedir], ['${prefix}/com']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([sharedstatedir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^sharedstatedir$]) +m4trace:configure.in:5: -1- AC_SUBST([localstatedir], ['${prefix}/var']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([localstatedir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^localstatedir$]) +m4trace:configure.in:5: -1- AC_SUBST([includedir], ['${prefix}/include']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([includedir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^includedir$]) +m4trace:configure.in:5: -1- AC_SUBST([oldincludedir], ['/usr/include']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([oldincludedir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^oldincludedir$]) +m4trace:configure.in:5: -1- AC_SUBST([docdir], [m4_ifset([AC_PACKAGE_TARNAME], + ['${datarootdir}/doc/${PACKAGE_TARNAME}'], + ['${datarootdir}/doc/${PACKAGE}'])]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([docdir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^docdir$]) +m4trace:configure.in:5: -1- AC_SUBST([infodir], ['${datarootdir}/info']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([infodir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^infodir$]) +m4trace:configure.in:5: -1- AC_SUBST([htmldir], ['${docdir}']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([htmldir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^htmldir$]) +m4trace:configure.in:5: -1- AC_SUBST([dvidir], ['${docdir}']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([dvidir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^dvidir$]) +m4trace:configure.in:5: -1- AC_SUBST([pdfdir], ['${docdir}']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([pdfdir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^pdfdir$]) +m4trace:configure.in:5: -1- AC_SUBST([psdir], ['${docdir}']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([psdir]) 
+m4trace:configure.in:5: -1- m4_pattern_allow([^psdir$]) +m4trace:configure.in:5: -1- AC_SUBST([libdir], ['${exec_prefix}/lib']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([libdir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^libdir$]) +m4trace:configure.in:5: -1- AC_SUBST([localedir], ['${datarootdir}/locale']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([localedir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^localedir$]) +m4trace:configure.in:5: -1- AC_SUBST([mandir], ['${datarootdir}/man']) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([mandir]) +m4trace:configure.in:5: -1- m4_pattern_allow([^mandir$]) +m4trace:configure.in:5: -1- AC_DEFINE_TRACE_LITERAL([PACKAGE_NAME]) +m4trace:configure.in:5: -1- m4_pattern_allow([^PACKAGE_NAME$]) +m4trace:configure.in:5: -1- AH_OUTPUT([PACKAGE_NAME], [/* Define to the full name of this package. */ +@%:@undef PACKAGE_NAME]) +m4trace:configure.in:5: -1- AC_DEFINE_TRACE_LITERAL([PACKAGE_TARNAME]) +m4trace:configure.in:5: -1- m4_pattern_allow([^PACKAGE_TARNAME$]) +m4trace:configure.in:5: -1- AH_OUTPUT([PACKAGE_TARNAME], [/* Define to the one symbol short name of this package. */ +@%:@undef PACKAGE_TARNAME]) +m4trace:configure.in:5: -1- AC_DEFINE_TRACE_LITERAL([PACKAGE_VERSION]) +m4trace:configure.in:5: -1- m4_pattern_allow([^PACKAGE_VERSION$]) +m4trace:configure.in:5: -1- AH_OUTPUT([PACKAGE_VERSION], [/* Define to the version of this package. */ +@%:@undef PACKAGE_VERSION]) +m4trace:configure.in:5: -1- AC_DEFINE_TRACE_LITERAL([PACKAGE_STRING]) +m4trace:configure.in:5: -1- m4_pattern_allow([^PACKAGE_STRING$]) +m4trace:configure.in:5: -1- AH_OUTPUT([PACKAGE_STRING], [/* Define to the full name and version of this package. 
*/ +@%:@undef PACKAGE_STRING]) +m4trace:configure.in:5: -1- AC_DEFINE_TRACE_LITERAL([PACKAGE_BUGREPORT]) +m4trace:configure.in:5: -1- m4_pattern_allow([^PACKAGE_BUGREPORT$]) +m4trace:configure.in:5: -1- AH_OUTPUT([PACKAGE_BUGREPORT], [/* Define to the address where bug reports for this package should be sent. */ +@%:@undef PACKAGE_BUGREPORT]) +m4trace:configure.in:5: -1- AC_DEFINE_TRACE_LITERAL([PACKAGE_URL]) +m4trace:configure.in:5: -1- m4_pattern_allow([^PACKAGE_URL$]) +m4trace:configure.in:5: -1- AH_OUTPUT([PACKAGE_URL], [/* Define to the home page for this package. */ +@%:@undef PACKAGE_URL]) +m4trace:configure.in:5: -1- AC_SUBST([DEFS]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([DEFS]) +m4trace:configure.in:5: -1- m4_pattern_allow([^DEFS$]) +m4trace:configure.in:5: -1- AC_SUBST([ECHO_C]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([ECHO_C]) +m4trace:configure.in:5: -1- m4_pattern_allow([^ECHO_C$]) +m4trace:configure.in:5: -1- AC_SUBST([ECHO_N]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([ECHO_N]) +m4trace:configure.in:5: -1- m4_pattern_allow([^ECHO_N$]) +m4trace:configure.in:5: -1- AC_SUBST([ECHO_T]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([ECHO_T]) +m4trace:configure.in:5: -1- m4_pattern_allow([^ECHO_T$]) +m4trace:configure.in:5: -1- AC_SUBST([LIBS]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([LIBS]) +m4trace:configure.in:5: -1- m4_pattern_allow([^LIBS$]) +m4trace:configure.in:5: -1- AC_SUBST([build_alias]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([build_alias]) +m4trace:configure.in:5: -1- m4_pattern_allow([^build_alias$]) +m4trace:configure.in:5: -1- AC_SUBST([host_alias]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([host_alias]) +m4trace:configure.in:5: -1- m4_pattern_allow([^host_alias$]) +m4trace:configure.in:5: -1- AC_SUBST([target_alias]) +m4trace:configure.in:5: -1- AC_SUBST_TRACE([target_alias]) +m4trace:configure.in:5: -1- m4_pattern_allow([^target_alias$]) +m4trace:configure.in:6: -1- AC_CONFIG_HEADERS([config.h]) 
+m4trace:configure.in:8: -1- AC_CANONICAL_HOST([]) +m4trace:configure.in:8: -1- AC_CANONICAL_BUILD +m4trace:configure.in:8: -1- AC_REQUIRE_AUX_FILE([config.sub]) +m4trace:configure.in:8: -1- AC_REQUIRE_AUX_FILE([config.guess]) +m4trace:configure.in:8: -1- AC_SUBST([build], [$ac_cv_build]) +m4trace:configure.in:8: -1- AC_SUBST_TRACE([build]) +m4trace:configure.in:8: -1- m4_pattern_allow([^build$]) +m4trace:configure.in:8: -1- AC_SUBST([build_cpu], [$[1]]) +m4trace:configure.in:8: -1- AC_SUBST_TRACE([build_cpu]) +m4trace:configure.in:8: -1- m4_pattern_allow([^build_cpu$]) +m4trace:configure.in:8: -1- AC_SUBST([build_vendor], [$[2]]) +m4trace:configure.in:8: -1- AC_SUBST_TRACE([build_vendor]) +m4trace:configure.in:8: -1- m4_pattern_allow([^build_vendor$]) +m4trace:configure.in:8: -1- AC_SUBST([build_os]) +m4trace:configure.in:8: -1- AC_SUBST_TRACE([build_os]) +m4trace:configure.in:8: -1- m4_pattern_allow([^build_os$]) +m4trace:configure.in:8: -1- AC_SUBST([host], [$ac_cv_host]) +m4trace:configure.in:8: -1- AC_SUBST_TRACE([host]) +m4trace:configure.in:8: -1- m4_pattern_allow([^host$]) +m4trace:configure.in:8: -1- AC_SUBST([host_cpu], [$[1]]) +m4trace:configure.in:8: -1- AC_SUBST_TRACE([host_cpu]) +m4trace:configure.in:8: -1- m4_pattern_allow([^host_cpu$]) +m4trace:configure.in:8: -1- AC_SUBST([host_vendor], [$[2]]) +m4trace:configure.in:8: -1- AC_SUBST_TRACE([host_vendor]) +m4trace:configure.in:8: -1- m4_pattern_allow([^host_vendor$]) +m4trace:configure.in:8: -1- AC_SUBST([host_os]) +m4trace:configure.in:8: -1- AC_SUBST_TRACE([host_os]) +m4trace:configure.in:8: -1- m4_pattern_allow([^host_os$]) +m4trace:configure.in:16: -1- AC_SUBST([CC]) +m4trace:configure.in:16: -1- AC_SUBST_TRACE([CC]) +m4trace:configure.in:16: -1- m4_pattern_allow([^CC$]) +m4trace:configure.in:16: -1- AC_SUBST([CFLAGS]) +m4trace:configure.in:16: -1- AC_SUBST_TRACE([CFLAGS]) +m4trace:configure.in:16: -1- m4_pattern_allow([^CFLAGS$]) +m4trace:configure.in:16: -1- AC_SUBST([LDFLAGS]) 
+m4trace:configure.in:16: -1- AC_SUBST_TRACE([LDFLAGS]) +m4trace:configure.in:16: -1- m4_pattern_allow([^LDFLAGS$]) +m4trace:configure.in:16: -1- AC_SUBST([LIBS]) +m4trace:configure.in:16: -1- AC_SUBST_TRACE([LIBS]) +m4trace:configure.in:16: -1- m4_pattern_allow([^LIBS$]) +m4trace:configure.in:16: -1- AC_SUBST([CPPFLAGS]) +m4trace:configure.in:16: -1- AC_SUBST_TRACE([CPPFLAGS]) +m4trace:configure.in:16: -1- m4_pattern_allow([^CPPFLAGS$]) +m4trace:configure.in:16: -1- AC_SUBST([CC]) +m4trace:configure.in:16: -1- AC_SUBST_TRACE([CC]) +m4trace:configure.in:16: -1- m4_pattern_allow([^CC$]) +m4trace:configure.in:16: -1- AC_SUBST([CC]) +m4trace:configure.in:16: -1- AC_SUBST_TRACE([CC]) +m4trace:configure.in:16: -1- m4_pattern_allow([^CC$]) +m4trace:configure.in:16: -1- AC_SUBST([CC]) +m4trace:configure.in:16: -1- AC_SUBST_TRACE([CC]) +m4trace:configure.in:16: -1- m4_pattern_allow([^CC$]) +m4trace:configure.in:16: -1- AC_SUBST([CC]) +m4trace:configure.in:16: -1- AC_SUBST_TRACE([CC]) +m4trace:configure.in:16: -1- m4_pattern_allow([^CC$]) +m4trace:configure.in:16: -1- AC_SUBST([ac_ct_CC]) +m4trace:configure.in:16: -1- AC_SUBST_TRACE([ac_ct_CC]) +m4trace:configure.in:16: -1- m4_pattern_allow([^ac_ct_CC$]) +m4trace:configure.in:16: -1- AC_SUBST([EXEEXT], [$ac_cv_exeext]) +m4trace:configure.in:16: -1- AC_SUBST_TRACE([EXEEXT]) +m4trace:configure.in:16: -1- m4_pattern_allow([^EXEEXT$]) +m4trace:configure.in:16: -1- AC_SUBST([OBJEXT], [$ac_cv_objext]) +m4trace:configure.in:16: -1- AC_SUBST_TRACE([OBJEXT]) +m4trace:configure.in:16: -1- m4_pattern_allow([^OBJEXT$]) +m4trace:configure.in:19: -1- AC_DEFINE_TRACE_LITERAL([const]) +m4trace:configure.in:19: -1- m4_pattern_allow([^const$]) +m4trace:configure.in:19: -1- AH_OUTPUT([const], [/* Define to empty if `const\' does not conform to ANSI C. 
*/ +@%:@undef const]) +m4trace:configure.in:20: -1- AH_OUTPUT([inline], [/* Define to `__inline__\' or `__inline\' if that\'s what the C compiler + calls it, or to nothing if \'inline\' is not supported under any name. */ +#ifndef __cplusplus +#undef inline +#endif]) +m4trace:configure.in:21: -1- AH_OUTPUT([restrict], [/* Define to the equivalent of the C99 \'restrict\' keyword, or to + nothing if this is not supported. Do not define if restrict is + supported directly. */ +#undef restrict +/* Work around a bug in Sun C++: it does not support _Restrict or + __restrict__, even though the corresponding Sun C compiler ends up with + "#define restrict _Restrict" or "#define restrict __restrict__" in the + previous line. Perhaps some future version of Sun C++ will work with + restrict; if so, hopefully it defines __RESTRICT like Sun C does. */ +#if defined __SUNPRO_CC && !defined __RESTRICT +# define _Restrict +# define __restrict__ +#endif]) +m4trace:configure.in:21: -1- AC_DEFINE_TRACE_LITERAL([restrict]) +m4trace:configure.in:21: -1- m4_pattern_allow([^restrict$]) +m4trace:configure.in:21: -1- AC_DEFINE_TRACE_LITERAL([restrict]) +m4trace:configure.in:21: -1- m4_pattern_allow([^restrict$]) +m4trace:configure.in:22: -1- AC_SUBST([F77]) +m4trace:configure.in:22: -1- AC_SUBST_TRACE([F77]) +m4trace:configure.in:22: -1- m4_pattern_allow([^F77$]) +m4trace:configure.in:22: -1- AC_SUBST([FFLAGS]) +m4trace:configure.in:22: -1- AC_SUBST_TRACE([FFLAGS]) +m4trace:configure.in:22: -1- m4_pattern_allow([^FFLAGS$]) +m4trace:configure.in:22: -1- AC_SUBST([LDFLAGS]) +m4trace:configure.in:22: -1- AC_SUBST_TRACE([LDFLAGS]) +m4trace:configure.in:22: -1- m4_pattern_allow([^LDFLAGS$]) +m4trace:configure.in:22: -1- AC_SUBST([LIBS]) +m4trace:configure.in:22: -1- AC_SUBST_TRACE([LIBS]) +m4trace:configure.in:22: -1- m4_pattern_allow([^LIBS$]) +m4trace:configure.in:22: -1- AC_SUBST([F77]) +m4trace:configure.in:22: -1- AC_SUBST_TRACE([F77]) +m4trace:configure.in:22: -1- m4_pattern_allow([^F77$]) 
+m4trace:configure.in:22: -1- AC_SUBST([ac_ct_F77]) +m4trace:configure.in:22: -1- AC_SUBST_TRACE([ac_ct_F77]) +m4trace:configure.in:22: -1- m4_pattern_allow([^ac_ct_F77$]) +m4trace:configure.in:22: -1- AC_SUBST([FLIBS]) +m4trace:configure.in:22: -1- AC_SUBST_TRACE([FLIBS]) +m4trace:configure.in:22: -1- m4_pattern_allow([^FLIBS$]) +m4trace:configure.in:23: -1- AC_SUBST([AR]) +m4trace:configure.in:23: -1- AC_SUBST_TRACE([AR]) +m4trace:configure.in:23: -1- m4_pattern_allow([^AR$]) +m4trace:configure.in:26: -1- AC_SUBST([LEX]) +m4trace:configure.in:26: -1- AC_SUBST_TRACE([LEX]) +m4trace:configure.in:26: -1- m4_pattern_allow([^LEX$]) +m4trace:configure.in:26: -1- AC_SUBST([LEX_OUTPUT_ROOT], [$ac_cv_prog_lex_root]) +m4trace:configure.in:26: -1- AC_SUBST_TRACE([LEX_OUTPUT_ROOT]) +m4trace:configure.in:26: -1- m4_pattern_allow([^LEX_OUTPUT_ROOT$]) +m4trace:configure.in:26: -1- AC_SUBST([LEXLIB]) +m4trace:configure.in:26: -1- AC_SUBST_TRACE([LEXLIB]) +m4trace:configure.in:26: -1- m4_pattern_allow([^LEXLIB$]) +m4trace:configure.in:26: -1- AC_DEFINE_TRACE_LITERAL([YYTEXT_POINTER]) +m4trace:configure.in:26: -1- m4_pattern_allow([^YYTEXT_POINTER$]) +m4trace:configure.in:26: -1- AH_OUTPUT([YYTEXT_POINTER], [/* Define to 1 if `lex\' declares `yytext\' as a `char *\' by default, not a + `char@<:@@:>@\'. */ +@%:@undef YYTEXT_POINTER]) +m4trace:configure.in:32: -1- AC_SUBST([SET_MAKE]) +m4trace:configure.in:32: -1- AC_SUBST_TRACE([SET_MAKE]) +m4trace:configure.in:32: -1- m4_pattern_allow([^SET_MAKE$]) +m4trace:configure.in:33: -1- AC_SUBST([RANLIB]) +m4trace:configure.in:33: -1- AC_SUBST_TRACE([RANLIB]) +m4trace:configure.in:33: -1- m4_pattern_allow([^RANLIB$]) +m4trace:configure.in:34: -1- AC_SUBST([CCDEP]) +m4trace:configure.in:34: -1- AC_SUBST_TRACE([CCDEP]) +m4trace:configure.in:34: -1- m4_pattern_allow([^CCDEP$]) +m4trace:configure.in:46: -1- AH_OUTPUT([HAVE_STDINT_H], [/* Define to 1 if you have the header file. 
*/ +@%:@undef HAVE_STDINT_H]) +m4trace:configure.in:46: -1- AC_SUBST([CPP]) +m4trace:configure.in:46: -1- AC_SUBST_TRACE([CPP]) +m4trace:configure.in:46: -1- m4_pattern_allow([^CPP$]) +m4trace:configure.in:46: -1- AC_SUBST([CPPFLAGS]) +m4trace:configure.in:46: -1- AC_SUBST_TRACE([CPPFLAGS]) +m4trace:configure.in:46: -1- m4_pattern_allow([^CPPFLAGS$]) +m4trace:configure.in:46: -1- AC_SUBST([CPP]) +m4trace:configure.in:46: -1- AC_SUBST_TRACE([CPP]) +m4trace:configure.in:46: -1- m4_pattern_allow([^CPP$]) +m4trace:configure.in:46: -1- AC_SUBST([GREP]) +m4trace:configure.in:46: -1- AC_SUBST_TRACE([GREP]) +m4trace:configure.in:46: -1- m4_pattern_allow([^GREP$]) +m4trace:configure.in:46: -1- AC_SUBST([EGREP]) +m4trace:configure.in:46: -1- AC_SUBST_TRACE([EGREP]) +m4trace:configure.in:46: -1- m4_pattern_allow([^EGREP$]) +m4trace:configure.in:46: -1- AC_DEFINE_TRACE_LITERAL([STDC_HEADERS]) +m4trace:configure.in:46: -1- m4_pattern_allow([^STDC_HEADERS$]) +m4trace:configure.in:46: -1- AH_OUTPUT([STDC_HEADERS], [/* Define to 1 if you have the ANSI C header files. */ +@%:@undef STDC_HEADERS]) +m4trace:configure.in:46: -1- AH_OUTPUT([HAVE_SYS_TYPES_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_SYS_TYPES_H]) +m4trace:configure.in:46: -1- AH_OUTPUT([HAVE_SYS_STAT_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_SYS_STAT_H]) +m4trace:configure.in:46: -1- AH_OUTPUT([HAVE_STDLIB_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_STDLIB_H]) +m4trace:configure.in:46: -1- AH_OUTPUT([HAVE_STRING_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_STRING_H]) +m4trace:configure.in:46: -1- AH_OUTPUT([HAVE_MEMORY_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_MEMORY_H]) +m4trace:configure.in:46: -1- AH_OUTPUT([HAVE_STRINGS_H], [/* Define to 1 if you have the header file. 
*/ +@%:@undef HAVE_STRINGS_H]) +m4trace:configure.in:46: -1- AH_OUTPUT([HAVE_INTTYPES_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_INTTYPES_H]) +m4trace:configure.in:46: -1- AH_OUTPUT([HAVE_STDINT_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_STDINT_H]) +m4trace:configure.in:46: -1- AH_OUTPUT([HAVE_UNISTD_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_UNISTD_H]) +m4trace:configure.in:46: -1- AC_DEFINE_TRACE_LITERAL([HAVE_STDINT_H]) +m4trace:configure.in:46: -1- m4_pattern_allow([^HAVE_STDINT_H$]) +m4trace:configure.in:46: -1- AC_DEFINE_TRACE_LITERAL([HAVE_UINT16_T]) +m4trace:configure.in:46: -1- m4_pattern_allow([^HAVE_UINT16_T$]) +m4trace:configure.in:46: -1- AH_OUTPUT([HAVE_UINT16_T], [/* Define to 1 if the system has the type `uint16_t\'. */ +@%:@undef HAVE_UINT16_T]) +m4trace:configure.in:46: -1- AC_DEFINE_TRACE_LITERAL([HAVE_UINT32_T]) +m4trace:configure.in:46: -1- m4_pattern_allow([^HAVE_UINT32_T$]) +m4trace:configure.in:46: -1- AH_OUTPUT([HAVE_UINT32_T], [/* Define to 1 if the system has the type `uint32_t\'. */ +@%:@undef HAVE_UINT32_T]) +m4trace:configure.in:46: -1- AC_DEFINE_TRACE_LITERAL([HAVE_UINT64_T]) +m4trace:configure.in:46: -1- m4_pattern_allow([^HAVE_UINT64_T$]) +m4trace:configure.in:46: -1- AH_OUTPUT([HAVE_UINT64_T], [/* Define to 1 if the system has the type `uint64_t\'. */ +@%:@undef HAVE_UINT64_T]) +m4trace:configure.in:46: -1- AC_DEFINE_TRACE_LITERAL([SIZEOF_UNSIGNED_CHAR]) +m4trace:configure.in:46: -1- m4_pattern_allow([^SIZEOF_UNSIGNED_CHAR$]) +m4trace:configure.in:46: -1- AH_OUTPUT([SIZEOF_UNSIGNED_CHAR], [/* The size of `unsigned char\', as computed by sizeof. */ +@%:@undef SIZEOF_UNSIGNED_CHAR]) +m4trace:configure.in:46: -1- AC_DEFINE_TRACE_LITERAL([SIZEOF_UNSIGNED_SHORT]) +m4trace:configure.in:46: -1- m4_pattern_allow([^SIZEOF_UNSIGNED_SHORT$]) +m4trace:configure.in:46: -1- AH_OUTPUT([SIZEOF_UNSIGNED_SHORT], [/* The size of `unsigned short\', as computed by sizeof. 
*/ +@%:@undef SIZEOF_UNSIGNED_SHORT]) +m4trace:configure.in:46: -1- AC_DEFINE_TRACE_LITERAL([SIZEOF_UNSIGNED_INT]) +m4trace:configure.in:46: -1- m4_pattern_allow([^SIZEOF_UNSIGNED_INT$]) +m4trace:configure.in:46: -1- AH_OUTPUT([SIZEOF_UNSIGNED_INT], [/* The size of `unsigned int\', as computed by sizeof. */ +@%:@undef SIZEOF_UNSIGNED_INT]) +m4trace:configure.in:46: -1- AC_DEFINE_TRACE_LITERAL([SIZEOF_UNSIGNED_LONG]) +m4trace:configure.in:46: -1- m4_pattern_allow([^SIZEOF_UNSIGNED_LONG$]) +m4trace:configure.in:46: -1- AH_OUTPUT([SIZEOF_UNSIGNED_LONG], [/* The size of `unsigned long\', as computed by sizeof. */ +@%:@undef SIZEOF_UNSIGNED_LONG]) +m4trace:configure.in:46: -1- AC_DEFINE_TRACE_LITERAL([SIZEOF_UNSIGNED_LONG_LONG]) +m4trace:configure.in:46: -1- m4_pattern_allow([^SIZEOF_UNSIGNED_LONG_LONG$]) +m4trace:configure.in:46: -1- AH_OUTPUT([SIZEOF_UNSIGNED_LONG_LONG], [/* The size of `unsigned long long\', as computed by sizeof. */ +@%:@undef SIZEOF_UNSIGNED_LONG_LONG]) +m4trace:configure.in:75: -1- AH_OUTPUT([HAVE_LIBLIME], [/* Define to 1 if you have the `lime\' library (-llime). */ +@%:@undef HAVE_LIBLIME]) +m4trace:configure.in:75: -1- AC_DEFINE_TRACE_LITERAL([HAVE_LIBLIME]) +m4trace:configure.in:75: -1- m4_pattern_allow([^HAVE_LIBLIME$]) +m4trace:configure.in:78: -1- AC_DEFINE_TRACE_LITERAL([BENCHMARK]) +m4trace:configure.in:78: -1- m4_pattern_allow([^BENCHMARK$]) +m4trace:configure.in:78: -1- AH_OUTPUT([BENCHMARK], [/* Using Benchmarking no c-lime */ +@%:@undef BENCHMARK]) +m4trace:configure.in:86: -1- AH_OUTPUT([HAVE_LIBLEMON], [/* Define to 1 if you have the `lemon\' library (-llemon). 
*/ +@%:@undef HAVE_LIBLEMON]) +m4trace:configure.in:86: -1- AC_DEFINE_TRACE_LITERAL([HAVE_LIBLEMON]) +m4trace:configure.in:86: -1- m4_pattern_allow([^HAVE_LIBLEMON$]) +m4trace:configure.in:105: -1- AC_DEFINE_TRACE_LITERAL([_INDEX_INDEP_GEOM]) +m4trace:configure.in:105: -1- m4_pattern_allow([^_INDEX_INDEP_GEOM$]) +m4trace:configure.in:105: -1- AH_OUTPUT([_INDEX_INDEP_GEOM], [/* Index independent addressing */ +@%:@undef _INDEX_INDEP_GEOM]) +m4trace:configure.in:116: -1- AC_DEFINE_TRACE_LITERAL([MPI]) +m4trace:configure.in:116: -1- m4_pattern_allow([^MPI$]) +m4trace:configure.in:116: -1- AH_OUTPUT([MPI], [/* Compile with MPI support */ +@%:@undef MPI]) +m4trace:configure.in:127: -1- AC_DEFINE_TRACE_LITERAL([BGQ]) +m4trace:configure.in:127: -1- m4_pattern_allow([^BGQ$]) +m4trace:configure.in:127: -1- AH_OUTPUT([BGQ], [/* Compile with QPX intrinsics */ +@%:@undef BGQ]) +m4trace:configure.in:141: -1- AC_DEFINE_TRACE_LITERAL([SPI]) +m4trace:configure.in:141: -1- m4_pattern_allow([^SPI$]) +m4trace:configure.in:141: -1- AH_OUTPUT([SPI], [/* Compile with SPI for communications */ +@%:@undef SPI]) +m4trace:configure.in:155: -1- AC_DEFINE_TRACE_LITERAL([OMP]) +m4trace:configure.in:155: -1- m4_pattern_allow([^OMP$]) +m4trace:configure.in:155: -1- AH_OUTPUT([OMP], [/* Compile with OpenMP support */ +@%:@undef OMP]) +m4trace:configure.in:156: -1- AH_OUTPUT([HAVE_OMP_H], [/* Define to 1 if you have the header file. 
*/ +@%:@undef HAVE_OMP_H]) +m4trace:configure.in:156: -1- AC_DEFINE_TRACE_LITERAL([HAVE_OMP_H]) +m4trace:configure.in:156: -1- m4_pattern_allow([^HAVE_OMP_H$]) +m4trace:configure.in:157: -1- AC_SUBST([OPENMP_CFLAGS]) +m4trace:configure.in:157: -1- AC_SUBST_TRACE([OPENMP_CFLAGS]) +m4trace:configure.in:157: -1- m4_pattern_allow([^OPENMP_CFLAGS$]) +m4trace:configure.in:181: -1- AC_DEFINE_TRACE_LITERAL([HAVE_FFTW]) +m4trace:configure.in:181: -1- m4_pattern_allow([^HAVE_FFTW$]) +m4trace:configure.in:181: -1- AH_OUTPUT([HAVE_FFTW], [/* Compile with FFTW support */ +@%:@undef HAVE_FFTW]) +m4trace:configure.in:187: -1- AC_DEFINE_TRACE_LITERAL([HAVE_FFTW]) +m4trace:configure.in:187: -1- m4_pattern_allow([^HAVE_FFTW$]) +m4trace:configure.in:187: -1- AH_OUTPUT([HAVE_FFTW], [/* Compile with FFTW support */ +@%:@undef HAVE_FFTW]) +m4trace:configure.in:201: -1- AC_DEFINE_TRACE_LITERAL([PARALLELT]) +m4trace:configure.in:201: -1- m4_pattern_allow([^PARALLELT$]) +m4trace:configure.in:201: -1- AH_OUTPUT([PARALLELT], [/* One dimensional parallelisation */ +@%:@undef PARALLELT]) +m4trace:configure.in:204: -1- AC_DEFINE_TRACE_LITERAL([PARALLELXT]) +m4trace:configure.in:204: -1- m4_pattern_allow([^PARALLELXT$]) +m4trace:configure.in:204: -1- AH_OUTPUT([PARALLELXT], [/* Two dimensional parallelisation */ +@%:@undef PARALLELXT]) +m4trace:configure.in:207: -1- AC_DEFINE_TRACE_LITERAL([PARALLELXYT]) +m4trace:configure.in:207: -1- m4_pattern_allow([^PARALLELXYT$]) +m4trace:configure.in:207: -1- AH_OUTPUT([PARALLELXYT], [/* Three dimensional parallelisation */ +@%:@undef PARALLELXYT]) +m4trace:configure.in:210: -1- AC_DEFINE_TRACE_LITERAL([PARALLELXYZT]) +m4trace:configure.in:210: -1- m4_pattern_allow([^PARALLELXYZT$]) +m4trace:configure.in:210: -1- AH_OUTPUT([PARALLELXYZT], [/* Four dimensional parallelisation */ +@%:@undef PARALLELXYZT]) +m4trace:configure.in:213: -1- AC_DEFINE_TRACE_LITERAL([PARALLELX]) +m4trace:configure.in:213: -1- m4_pattern_allow([^PARALLELX$]) 
+m4trace:configure.in:213: -1- AH_OUTPUT([PARALLELX], [/* X parallelisation */ +@%:@undef PARALLELX]) +m4trace:configure.in:216: -1- AC_DEFINE_TRACE_LITERAL([PARALLELXY]) +m4trace:configure.in:216: -1- m4_pattern_allow([^PARALLELXY$]) +m4trace:configure.in:216: -1- AH_OUTPUT([PARALLELXY], [/* XY parallelisation */ +@%:@undef PARALLELXY]) +m4trace:configure.in:219: -1- AC_DEFINE_TRACE_LITERAL([PARALLELXYZ]) +m4trace:configure.in:219: -1- m4_pattern_allow([^PARALLELXYZ$]) +m4trace:configure.in:219: -1- AH_OUTPUT([PARALLELXYZ], [/* XYZ parallelisation */ +@%:@undef PARALLELXYZ]) +m4trace:configure.in:222: -1- AC_DEFINE_TRACE_LITERAL([PARALLELT]) +m4trace:configure.in:222: -1- m4_pattern_allow([^PARALLELT$]) +m4trace:configure.in:222: -1- AH_OUTPUT([PARALLELT], [/* T parallelisation */ +@%:@undef PARALLELT]) +m4trace:configure.in:225: -1- AC_DEFINE_TRACE_LITERAL([PARALLELXT]) +m4trace:configure.in:225: -1- m4_pattern_allow([^PARALLELXT$]) +m4trace:configure.in:225: -1- AH_OUTPUT([PARALLELXT], [/* XT parallelisation */ +@%:@undef PARALLELXT]) +m4trace:configure.in:228: -1- AC_DEFINE_TRACE_LITERAL([PARALLELXYT]) +m4trace:configure.in:228: -1- m4_pattern_allow([^PARALLELXYT$]) +m4trace:configure.in:228: -1- AH_OUTPUT([PARALLELXYT], [/* XYT parallelisation */ +@%:@undef PARALLELXYT]) +m4trace:configure.in:231: -1- AC_DEFINE_TRACE_LITERAL([PARALLELXYZT]) +m4trace:configure.in:231: -1- m4_pattern_allow([^PARALLELXYZT$]) +m4trace:configure.in:231: -1- AH_OUTPUT([PARALLELXYZT], [/* XYZT parallelisation */ +@%:@undef PARALLELXYZT]) +m4trace:configure.in:243: -1- AC_DEFINE_TRACE_LITERAL([_PERSISTENT]) +m4trace:configure.in:243: -1- m4_pattern_allow([^_PERSISTENT$]) +m4trace:configure.in:243: -1- AH_OUTPUT([_PERSISTENT], [/* use persistent MPI calls for halfspinor */ +@%:@undef _PERSISTENT]) +m4trace:configure.in:254: -1- AC_DEFINE_TRACE_LITERAL([_NON_BLOCKING]) +m4trace:configure.in:254: -1- m4_pattern_allow([^_NON_BLOCKING$]) +m4trace:configure.in:254: -1- 
AH_OUTPUT([_NON_BLOCKING], [/* use non-blocking MPI calls for spinor ang gauge */ +@%:@undef _NON_BLOCKING]) +m4trace:configure.in:266: -1- AC_DEFINE_TRACE_LITERAL([FIXEDVOLUME]) +m4trace:configure.in:266: -1- m4_pattern_allow([^FIXEDVOLUME$]) +m4trace:configure.in:266: -1- AH_OUTPUT([FIXEDVOLUME], [/* Fixed volume at compiletime */ +@%:@undef FIXEDVOLUME]) +m4trace:configure.in:267: -1- AC_CONFIG_FILES([fixed_volume.h]) +m4trace:configure.in:290: -1- AC_DEFINE_TRACE_LITERAL([HAVE_LAPACK]) +m4trace:configure.in:290: -1- m4_pattern_allow([^HAVE_LAPACK$]) +m4trace:configure.in:290: -1- AH_OUTPUT([HAVE_LAPACK], [/* lapack available */ +@%:@undef HAVE_LAPACK]) +m4trace:configure.in:295: -1- AC_DEFINE_TRACE_LITERAL([HAVE_LAPACK]) +m4trace:configure.in:295: -1- m4_pattern_allow([^HAVE_LAPACK$]) +m4trace:configure.in:295: -1- AH_OUTPUT([HAVE_LAPACK], [/* lapack available */ +@%:@undef HAVE_LAPACK]) +m4trace:configure.in:315: -1- AH_OUTPUT([HAVE_CLOCK_GETTIME], [/* Define to 1 if you have the `clock_gettime\' function. */ +@%:@undef HAVE_CLOCK_GETTIME]) +m4trace:configure.in:315: -1- AC_DEFINE_TRACE_LITERAL([HAVE_CLOCK_GETTIME]) +m4trace:configure.in:315: -1- m4_pattern_allow([^HAVE_CLOCK_GETTIME$]) +m4trace:configure.in:315: -1- AH_OUTPUT([HAVE_LIBRT], [/* Define to 1 if you have the `rt\' library (-lrt). */ +@%:@undef HAVE_LIBRT]) +m4trace:configure.in:315: -1- AC_DEFINE_TRACE_LITERAL([HAVE_LIBRT]) +m4trace:configure.in:315: -1- m4_pattern_allow([^HAVE_LIBRT$]) +m4trace:configure.in:321: -1- AC_DEFINE_TRACE_LITERAL([HAVE_CLOCK_GETTIME]) +m4trace:configure.in:321: -1- m4_pattern_allow([^HAVE_CLOCK_GETTIME$]) +m4trace:configure.in:330: -1- AC_DEFINE_TRACE_LITERAL([F77_DUMMY_MAIN]) +m4trace:configure.in:330: -1- m4_pattern_allow([^F77_DUMMY_MAIN$]) +m4trace:configure.in:330: -1- AH_OUTPUT([F77_DUMMY_MAIN], [/* Define to dummy `main\' function (if any) required to link to the Fortran + libraries. 
*/ +@%:@undef F77_DUMMY_MAIN]) +m4trace:configure.in:330: -1- AC_DEFINE_TRACE_LITERAL([FC_DUMMY_MAIN_EQ_F77]) +m4trace:configure.in:330: -1- m4_pattern_allow([^FC_DUMMY_MAIN_EQ_F77$]) +m4trace:configure.in:330: -1- AH_OUTPUT([FC_DUMMY_MAIN_EQ_F77], [/* Define if F77 and FC dummy `main\' functions are identical. */ +@%:@undef FC_DUMMY_MAIN_EQ_F77]) +m4trace:configure.in:332: -1- AC_DEFINE_TRACE_LITERAL([NOF77_]) +m4trace:configure.in:332: -1- m4_pattern_allow([^NOF77_$]) +m4trace:configure.in:332: -1- AH_OUTPUT([NOF77_], [/* Fortran has no extra _ */ +@%:@undef NOF77_]) +m4trace:configure.in:337: -1- AC_DEFINE_TRACE_LITERAL([STDC_HEADERS]) +m4trace:configure.in:337: -1- m4_pattern_allow([^STDC_HEADERS$]) +m4trace:configure.in:337: -1- AH_OUTPUT([STDC_HEADERS], [/* Define to 1 if you have the ANSI C header files. */ +@%:@undef STDC_HEADERS]) +m4trace:configure.in:338: -1- AH_OUTPUT([HAVE_FLOAT_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_FLOAT_H]) +m4trace:configure.in:338: -1- AH_OUTPUT([HAVE_LIBINTL_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_LIBINTL_H]) +m4trace:configure.in:338: -1- AH_OUTPUT([HAVE_LIMITS_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_LIMITS_H]) +m4trace:configure.in:338: -1- AH_OUTPUT([HAVE_STDINT_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_STDINT_H]) +m4trace:configure.in:338: -1- AH_OUTPUT([HAVE_STDLIB_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_STDLIB_H]) +m4trace:configure.in:338: -1- AH_OUTPUT([HAVE_STRING_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_STRING_H]) +m4trace:configure.in:338: -1- AH_OUTPUT([HAVE_STRINGS_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_STRINGS_H]) +m4trace:configure.in:338: -1- AH_OUTPUT([HAVE_SYS_TIME_H], [/* Define to 1 if you have the header file. 
*/ +@%:@undef HAVE_SYS_TIME_H]) +m4trace:configure.in:338: -1- AH_OUTPUT([HAVE_UNISTD_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_UNISTD_H]) +m4trace:configure.in:338: -1- AH_OUTPUT([HAVE_ENDIAN_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_ENDIAN_H]) +m4trace:configure.in:342: -1- AC_DEFINE_TRACE_LITERAL([const]) +m4trace:configure.in:342: -1- m4_pattern_allow([^const$]) +m4trace:configure.in:342: -1- AH_OUTPUT([const], [/* Define to empty if `const\' does not conform to ANSI C. */ +@%:@undef const]) +m4trace:configure.in:343: -1- AC_DEFINE_TRACE_LITERAL([off_t]) +m4trace:configure.in:343: -1- m4_pattern_allow([^off_t$]) +m4trace:configure.in:343: -1- AH_OUTPUT([off_t], [/* Define to `long int\' if does not define. */ +@%:@undef off_t]) +m4trace:configure.in:344: -1- AC_DEFINE_TRACE_LITERAL([size_t]) +m4trace:configure.in:344: -1- m4_pattern_allow([^size_t$]) +m4trace:configure.in:344: -1- AH_OUTPUT([size_t], [/* Define to `unsigned int\' if does not define. */ +@%:@undef size_t]) +m4trace:configure.in:345: -1- AC_DEFINE_TRACE_LITERAL([TIME_WITH_SYS_TIME]) +m4trace:configure.in:345: -1- m4_pattern_allow([^TIME_WITH_SYS_TIME$]) +m4trace:configure.in:345: -1- AH_OUTPUT([TIME_WITH_SYS_TIME], [/* Define to 1 if you can safely include both and . */ +@%:@undef TIME_WITH_SYS_TIME]) +m4trace:configure.in:348: -1- AC_DEFINE_TRACE_LITERAL([_FILE_OFFSET_BITS]) +m4trace:configure.in:348: -1- m4_pattern_allow([^_FILE_OFFSET_BITS$]) +m4trace:configure.in:348: -1- AH_OUTPUT([_FILE_OFFSET_BITS], [/* Number of bits in a file offset, on hosts where this is settable. */ +@%:@undef _FILE_OFFSET_BITS]) +m4trace:configure.in:348: -1- AC_DEFINE_TRACE_LITERAL([_LARGE_FILES]) +m4trace:configure.in:348: -1- m4_pattern_allow([^_LARGE_FILES$]) +m4trace:configure.in:348: -1- AH_OUTPUT([_LARGE_FILES], [/* Define for large files, on AIX-style hosts. 
*/ +@%:@undef _LARGE_FILES]) +m4trace:configure.in:348: -1- AH_OUTPUT([_DARWIN_USE_64_BIT_INODE], [/* Enable large inode numbers on Mac OS X 10.5. */ +#ifndef _DARWIN_USE_64_BIT_INODE +# define _DARWIN_USE_64_BIT_INODE 1 +#endif]) +m4trace:configure.in:349: -1- AC_DEFINE_TRACE_LITERAL([_LARGEFILE_SOURCE]) +m4trace:configure.in:349: -1- m4_pattern_allow([^_LARGEFILE_SOURCE$]) +m4trace:configure.in:349: -1- AH_OUTPUT([_LARGEFILE_SOURCE], [/* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */ +@%:@undef _LARGEFILE_SOURCE]) +m4trace:configure.in:349: -1- AC_DEFINE_TRACE_LITERAL([HAVE_FSEEKO]) +m4trace:configure.in:349: -1- m4_pattern_allow([^HAVE_FSEEKO$]) +m4trace:configure.in:349: -1- AH_OUTPUT([HAVE_FSEEKO], [/* Define to 1 if fseeko (and presumably ftello) exists and is declared. */ +@%:@undef HAVE_FSEEKO]) +m4trace:configure.in:350: -1- AH_OUTPUT([HAVE_STDLIB_H], [/* Define to 1 if you have the header file. */ +@%:@undef HAVE_STDLIB_H]) +m4trace:configure.in:350: -1- AC_DEFINE_TRACE_LITERAL([HAVE_STDLIB_H]) +m4trace:configure.in:350: -1- m4_pattern_allow([^HAVE_STDLIB_H$]) +m4trace:configure.in:350: -1- AC_DEFINE_TRACE_LITERAL([HAVE_MALLOC]) +m4trace:configure.in:350: -1- m4_pattern_allow([^HAVE_MALLOC$]) +m4trace:configure.in:350: -1- AH_OUTPUT([HAVE_MALLOC], [/* Define to 1 if your system has a GNU libc compatible `malloc\' function, and + to 0 otherwise. 
*/ +@%:@undef HAVE_MALLOC]) +m4trace:configure.in:350: -1- AC_DEFINE_TRACE_LITERAL([HAVE_MALLOC]) +m4trace:configure.in:350: -1- m4_pattern_allow([^HAVE_MALLOC$]) +m4trace:configure.in:350: -1- AC_SUBST([LIB@&t@OBJS], ["$LIB@&t@OBJS malloc.$ac_objext"]) +m4trace:configure.in:350: -1- AC_SUBST_TRACE([LIB@&t@OBJS]) +m4trace:configure.in:350: -1- m4_pattern_allow([^LIB@&t@OBJS$]) +m4trace:configure.in:350: -1- AC_LIBSOURCE([malloc.c]) +m4trace:configure.in:350: -1- AC_DEFINE_TRACE_LITERAL([malloc]) +m4trace:configure.in:350: -1- m4_pattern_allow([^malloc$]) +m4trace:configure.in:350: -1- AH_OUTPUT([malloc], [/* Define to rpl_malloc if the replacement function should be used. */ +@%:@undef malloc]) +m4trace:configure.in:351: -1- _m4_warn([obsolete], [The macro `AC_TYPE_SIGNAL' is obsolete. +You should run autoupdate.], [../../lib/autoconf/types.m4:746: AC_TYPE_SIGNAL is expanded from... +configure.in:351: the top level]) +m4trace:configure.in:351: -1- AC_DEFINE_TRACE_LITERAL([RETSIGTYPE]) +m4trace:configure.in:351: -1- m4_pattern_allow([^RETSIGTYPE$]) +m4trace:configure.in:351: -1- AH_OUTPUT([RETSIGTYPE], [/* Define as the return type of signal handlers (`int\' or `void\'). */ +@%:@undef RETSIGTYPE]) +m4trace:configure.in:352: -1- AH_OUTPUT([HAVE_GETTIMEOFDAY], [/* Define to 1 if you have the `gettimeofday\' function. */ +@%:@undef HAVE_GETTIMEOFDAY]) +m4trace:configure.in:352: -1- AH_OUTPUT([HAVE_POW], [/* Define to 1 if you have the `pow\' function. */ +@%:@undef HAVE_POW]) +m4trace:configure.in:352: -1- AH_OUTPUT([HAVE_SQRT], [/* Define to 1 if you have the `sqrt\' function. 
*/ +@%:@undef HAVE_SQRT]) +m4trace:configure.in:355: -1- AC_SUBST([OPTARGS]) +m4trace:configure.in:355: -1- AC_SUBST_TRACE([OPTARGS]) +m4trace:configure.in:355: -1- m4_pattern_allow([^OPTARGS$]) +m4trace:configure.in:356: -1- AC_SUBST([SOPTARGS]) +m4trace:configure.in:356: -1- AC_SUBST_TRACE([SOPTARGS]) +m4trace:configure.in:356: -1- m4_pattern_allow([^SOPTARGS$]) +m4trace:configure.in:357: -1- AC_SUBST([INCLUDES]) +m4trace:configure.in:357: -1- AC_SUBST_TRACE([INCLUDES]) +m4trace:configure.in:357: -1- m4_pattern_allow([^INCLUDES$]) +m4trace:configure.in:358: -1- AC_SUBST([AUTOCONF]) +m4trace:configure.in:358: -1- AC_SUBST_TRACE([AUTOCONF]) +m4trace:configure.in:358: -1- m4_pattern_allow([^AUTOCONF$]) +m4trace:configure.in:359: -1- AC_SUBST([SOLVEROUT]) +m4trace:configure.in:359: -1- AC_SUBST_TRACE([SOLVEROUT]) +m4trace:configure.in:359: -1- m4_pattern_allow([^SOLVEROUT$]) +m4trace:configure.in:360: -1- AC_SUBST([CCDEP]) +m4trace:configure.in:360: -1- AC_SUBST_TRACE([CCDEP]) +m4trace:configure.in:360: -1- m4_pattern_allow([^CCDEP$]) +m4trace:configure.in:361: -1- AC_SUBST([CCLD]) +m4trace:configure.in:361: -1- AC_SUBST_TRACE([CCLD]) +m4trace:configure.in:361: -1- m4_pattern_allow([^CCLD$]) +m4trace:configure.in:362: -1- AC_SUBST([DEPFLAGS]) +m4trace:configure.in:362: -1- AC_SUBST_TRACE([DEPFLAGS]) +m4trace:configure.in:362: -1- m4_pattern_allow([^DEPFLAGS$]) +m4trace:configure.in:363: -1- AC_SUBST([DEBUG_FLAG]) +m4trace:configure.in:363: -1- AC_SUBST_TRACE([DEBUG_FLAG]) +m4trace:configure.in:363: -1- m4_pattern_allow([^DEBUG_FLAG$]) +m4trace:configure.in:364: -1- AC_SUBST([PROFILE_FLAG]) +m4trace:configure.in:364: -1- AC_SUBST_TRACE([PROFILE_FLAG]) +m4trace:configure.in:364: -1- m4_pattern_allow([^PROFILE_FLAG$]) +m4trace:configure.in:365: -1- AC_SUBST([XCHANGELIB]) +m4trace:configure.in:365: -1- AC_SUBST_TRACE([XCHANGELIB]) +m4trace:configure.in:365: -1- m4_pattern_allow([^XCHANGELIB$]) +m4trace:configure.in:366: -1- AC_SUBST([XCHANGEDIR]) 
+m4trace:configure.in:366: -1- AC_SUBST_TRACE([XCHANGEDIR]) +m4trace:configure.in:366: -1- m4_pattern_allow([^XCHANGEDIR$]) +m4trace:configure.in:367: -1- AC_SUBST([MEASDIR]) +m4trace:configure.in:367: -1- AC_SUBST_TRACE([MEASDIR]) +m4trace:configure.in:367: -1- m4_pattern_allow([^MEASDIR$]) +m4trace:configure.in:368: -1- AC_SUBST([XLIB]) +m4trace:configure.in:368: -1- AC_SUBST_TRACE([XLIB]) +m4trace:configure.in:368: -1- m4_pattern_allow([^XLIB$]) +m4trace:configure.in:369: -1- AC_SUBST([LEMON_AVAILABLE]) +m4trace:configure.in:369: -1- AC_SUBST_TRACE([LEMON_AVAILABLE]) +m4trace:configure.in:369: -1- m4_pattern_allow([^LEMON_AVAILABLE$]) +m4trace:configure.in:370: -1- AC_SUBST([SPI_FILES]) +m4trace:configure.in:370: -1- AC_SUBST_TRACE([SPI_FILES]) +m4trace:configure.in:370: -1- m4_pattern_allow([^SPI_FILES$]) +m4trace:configure.in:371: -1- AC_SUBST([QUDA_INTERFACE]) +m4trace:configure.in:371: -1- AC_SUBST_TRACE([QUDA_INTERFACE]) +m4trace:configure.in:371: -1- m4_pattern_allow([^QUDA_INTERFACE$]) +m4trace:configure.in:383: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE]) +m4trace:configure.in:383: -1- m4_pattern_allow([^ALIGN_BASE$]) +m4trace:configure.in:383: -1- AH_OUTPUT([ALIGN_BASE], [/* Align base */ +@%:@undef ALIGN_BASE]) +m4trace:configure.in:384: -1- AC_DEFINE_TRACE_LITERAL([ALIGN]) +m4trace:configure.in:384: -1- m4_pattern_allow([^ALIGN$]) +m4trace:configure.in:385: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE32]) +m4trace:configure.in:385: -1- m4_pattern_allow([^ALIGN_BASE32$]) +m4trace:configure.in:385: -1- AH_OUTPUT([ALIGN_BASE32], [/* Align base32 */ +@%:@undef ALIGN_BASE32]) +m4trace:configure.in:386: -1- AC_DEFINE_TRACE_LITERAL([ALIGN32]) +m4trace:configure.in:386: -1- m4_pattern_allow([^ALIGN32$]) +m4trace:configure.in:389: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE]) +m4trace:configure.in:389: -1- m4_pattern_allow([^ALIGN_BASE$]) +m4trace:configure.in:389: -1- AH_OUTPUT([ALIGN_BASE], [/* Align base */ +@%:@undef ALIGN_BASE]) +m4trace:configure.in:390: -1- 
AC_DEFINE_TRACE_LITERAL([ALIGN]) +m4trace:configure.in:390: -1- m4_pattern_allow([^ALIGN$]) +m4trace:configure.in:391: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE32]) +m4trace:configure.in:391: -1- m4_pattern_allow([^ALIGN_BASE32$]) +m4trace:configure.in:391: -1- AH_OUTPUT([ALIGN_BASE32], [/* Align base32 */ +@%:@undef ALIGN_BASE32]) +m4trace:configure.in:392: -1- AC_DEFINE_TRACE_LITERAL([ALIGN32]) +m4trace:configure.in:392: -1- m4_pattern_allow([^ALIGN32$]) +m4trace:configure.in:395: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE]) +m4trace:configure.in:395: -1- m4_pattern_allow([^ALIGN_BASE$]) +m4trace:configure.in:395: -1- AH_OUTPUT([ALIGN_BASE], [/* Align base */ +@%:@undef ALIGN_BASE]) +m4trace:configure.in:396: -1- AC_DEFINE_TRACE_LITERAL([ALIGN]) +m4trace:configure.in:396: -1- m4_pattern_allow([^ALIGN$]) +m4trace:configure.in:397: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE32]) +m4trace:configure.in:397: -1- m4_pattern_allow([^ALIGN_BASE32$]) +m4trace:configure.in:397: -1- AH_OUTPUT([ALIGN_BASE32], [/* Align base32 */ +@%:@undef ALIGN_BASE32]) +m4trace:configure.in:398: -1- AC_DEFINE_TRACE_LITERAL([ALIGN32]) +m4trace:configure.in:398: -1- m4_pattern_allow([^ALIGN32$]) +m4trace:configure.in:402: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE]) +m4trace:configure.in:402: -1- m4_pattern_allow([^ALIGN_BASE$]) +m4trace:configure.in:402: -1- AH_OUTPUT([ALIGN_BASE], [/* Align base */ +@%:@undef ALIGN_BASE]) +m4trace:configure.in:403: -1- AC_DEFINE_TRACE_LITERAL([ALIGN]) +m4trace:configure.in:403: -1- m4_pattern_allow([^ALIGN$]) +m4trace:configure.in:404: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE32]) +m4trace:configure.in:404: -1- m4_pattern_allow([^ALIGN_BASE32$]) +m4trace:configure.in:404: -1- AH_OUTPUT([ALIGN_BASE32], [/* Align base32 */ +@%:@undef ALIGN_BASE32]) +m4trace:configure.in:405: -1- AC_DEFINE_TRACE_LITERAL([ALIGN32]) +m4trace:configure.in:405: -1- m4_pattern_allow([^ALIGN32$]) +m4trace:configure.in:420: -1- AC_DEFINE_TRACE_LITERAL([P4]) +m4trace:configure.in:420: -1- 
m4_pattern_allow([^P4$]) +m4trace:configure.in:420: -1- AH_OUTPUT([P4], [/* Use Pentium4 instructions */ +@%:@undef P4]) +m4trace:configure.in:424: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE]) +m4trace:configure.in:424: -1- m4_pattern_allow([^ALIGN_BASE$]) +m4trace:configure.in:424: -1- AH_OUTPUT([ALIGN_BASE], [/* Align base */ +@%:@undef ALIGN_BASE]) +m4trace:configure.in:425: -1- AC_DEFINE_TRACE_LITERAL([ALIGN]) +m4trace:configure.in:425: -1- m4_pattern_allow([^ALIGN$]) +m4trace:configure.in:427: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE32]) +m4trace:configure.in:427: -1- m4_pattern_allow([^ALIGN_BASE32$]) +m4trace:configure.in:427: -1- AH_OUTPUT([ALIGN_BASE32], [/* Align base */ +@%:@undef ALIGN_BASE32]) +m4trace:configure.in:428: -1- AC_DEFINE_TRACE_LITERAL([ALIGN32]) +m4trace:configure.in:428: -1- m4_pattern_allow([^ALIGN32$]) +m4trace:configure.in:444: -1- AC_DEFINE_TRACE_LITERAL([OPTERON]) +m4trace:configure.in:444: -1- m4_pattern_allow([^OPTERON$]) +m4trace:configure.in:444: -1- AH_OUTPUT([OPTERON], [/* Use Opteron instructions */ +@%:@undef OPTERON]) +m4trace:configure.in:448: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE]) +m4trace:configure.in:448: -1- m4_pattern_allow([^ALIGN_BASE$]) +m4trace:configure.in:448: -1- AH_OUTPUT([ALIGN_BASE], [/* Align base */ +@%:@undef ALIGN_BASE]) +m4trace:configure.in:449: -1- AC_DEFINE_TRACE_LITERAL([ALIGN]) +m4trace:configure.in:449: -1- m4_pattern_allow([^ALIGN$]) +m4trace:configure.in:451: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE32]) +m4trace:configure.in:451: -1- m4_pattern_allow([^ALIGN_BASE32$]) +m4trace:configure.in:451: -1- AH_OUTPUT([ALIGN_BASE32], [/* Align base32 */ +@%:@undef ALIGN_BASE32]) +m4trace:configure.in:452: -1- AC_DEFINE_TRACE_LITERAL([ALIGN32]) +m4trace:configure.in:452: -1- m4_pattern_allow([^ALIGN32$]) +m4trace:configure.in:492: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE]) +m4trace:configure.in:492: -1- m4_pattern_allow([^ALIGN_BASE$]) +m4trace:configure.in:492: -1- AH_OUTPUT([ALIGN_BASE], [/* Align base */ 
+@%:@undef ALIGN_BASE]) +m4trace:configure.in:493: -1- AC_DEFINE_TRACE_LITERAL([ALIGN]) +m4trace:configure.in:493: -1- m4_pattern_allow([^ALIGN$]) +m4trace:configure.in:495: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE32]) +m4trace:configure.in:495: -1- m4_pattern_allow([^ALIGN_BASE32$]) +m4trace:configure.in:495: -1- AH_OUTPUT([ALIGN_BASE32], [/* Align base32 */ +@%:@undef ALIGN_BASE32]) +m4trace:configure.in:496: -1- AC_DEFINE_TRACE_LITERAL([ALIGN32]) +m4trace:configure.in:496: -1- m4_pattern_allow([^ALIGN32$]) +m4trace:configure.in:508: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE]) +m4trace:configure.in:508: -1- m4_pattern_allow([^ALIGN_BASE$]) +m4trace:configure.in:508: -1- AH_OUTPUT([ALIGN_BASE], [/* Align base */ +@%:@undef ALIGN_BASE]) +m4trace:configure.in:509: -1- AC_DEFINE_TRACE_LITERAL([ALIGN]) +m4trace:configure.in:509: -1- m4_pattern_allow([^ALIGN$]) +m4trace:configure.in:511: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE32]) +m4trace:configure.in:511: -1- m4_pattern_allow([^ALIGN_BASE32$]) +m4trace:configure.in:511: -1- AH_OUTPUT([ALIGN_BASE32], [/* Align base32 */ +@%:@undef ALIGN_BASE32]) +m4trace:configure.in:512: -1- AC_DEFINE_TRACE_LITERAL([ALIGN32]) +m4trace:configure.in:512: -1- m4_pattern_allow([^ALIGN32$]) +m4trace:configure.in:526: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE]) +m4trace:configure.in:526: -1- m4_pattern_allow([^ALIGN_BASE$]) +m4trace:configure.in:526: -1- AH_OUTPUT([ALIGN_BASE], [/* Align base */ +@%:@undef ALIGN_BASE]) +m4trace:configure.in:527: -1- AC_DEFINE_TRACE_LITERAL([ALIGN]) +m4trace:configure.in:527: -1- m4_pattern_allow([^ALIGN$]) +m4trace:configure.in:527: -1- AH_OUTPUT([ALIGN], [/* Align base */ +@%:@undef ALIGN]) +m4trace:configure.in:535: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE]) +m4trace:configure.in:535: -1- m4_pattern_allow([^ALIGN_BASE$]) +m4trace:configure.in:535: -1- AH_OUTPUT([ALIGN_BASE], [/* Align base */ +@%:@undef ALIGN_BASE]) +m4trace:configure.in:536: -1- AC_DEFINE_TRACE_LITERAL([ALIGN]) +m4trace:configure.in:536: -1- 
m4_pattern_allow([^ALIGN$]) +m4trace:configure.in:536: -1- AH_OUTPUT([ALIGN], [/* Align base */ +@%:@undef ALIGN]) +m4trace:configure.in:544: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE]) +m4trace:configure.in:544: -1- m4_pattern_allow([^ALIGN_BASE$]) +m4trace:configure.in:544: -1- AH_OUTPUT([ALIGN_BASE], [/* Align base */ +@%:@undef ALIGN_BASE]) +m4trace:configure.in:545: -1- AC_DEFINE_TRACE_LITERAL([ALIGN]) +m4trace:configure.in:545: -1- m4_pattern_allow([^ALIGN$]) +m4trace:configure.in:547: -1- AC_DEFINE_TRACE_LITERAL([ALIGN_BASE32]) +m4trace:configure.in:547: -1- m4_pattern_allow([^ALIGN_BASE32$]) +m4trace:configure.in:547: -1- AH_OUTPUT([ALIGN_BASE32], [/* Align base */ +@%:@undef ALIGN_BASE32]) +m4trace:configure.in:548: -1- AC_DEFINE_TRACE_LITERAL([ALIGN32]) +m4trace:configure.in:548: -1- m4_pattern_allow([^ALIGN32$]) +m4trace:configure.in:576: -1- AC_DEFINE_TRACE_LITERAL([_USE_BGLDRAM]) +m4trace:configure.in:576: -1- m4_pattern_allow([^_USE_BGLDRAM$]) +m4trace:configure.in:576: -1- AH_OUTPUT([_USE_BGLDRAM], [/* use BGL dram window */ +@%:@undef _USE_BGLDRAM]) +m4trace:configure.in:585: -1- AC_DEFINE_TRACE_LITERAL([XLC]) +m4trace:configure.in:585: -1- m4_pattern_allow([^XLC$]) +m4trace:configure.in:585: -1- AH_OUTPUT([XLC], [/* Are we using the IBM xlc compiler? 
*/ +@%:@undef XLC]) +m4trace:configure.in:601: -1- AC_DEFINE_TRACE_LITERAL([SSE3]) +m4trace:configure.in:601: -1- m4_pattern_allow([^SSE3$]) +m4trace:configure.in:601: -1- AH_OUTPUT([SSE3], [/* Compile with SSE3 support */ +@%:@undef SSE3]) +m4trace:configure.in:608: -1- AC_DEFINE_TRACE_LITERAL([SSE2]) +m4trace:configure.in:608: -1- m4_pattern_allow([^SSE2$]) +m4trace:configure.in:608: -1- AH_OUTPUT([SSE2], [/* Compile with SSE2 support */ +@%:@undef SSE2]) +m4trace:configure.in:615: -1- AC_DEFINE_TRACE_LITERAL([_x86_64]) +m4trace:configure.in:615: -1- m4_pattern_allow([^_x86_64$]) +m4trace:configure.in:615: -1- AH_OUTPUT([_x86_64], [/* x86 64 Bit architecture */ +@%:@undef _x86_64]) +m4trace:configure.in:649: -1- AC_DEFINE_TRACE_LITERAL([SSE3]) +m4trace:configure.in:649: -1- m4_pattern_allow([^SSE3$]) +m4trace:configure.in:649: -1- AH_OUTPUT([SSE3], [/* Compile with SSE3 support */ +@%:@undef SSE3]) +m4trace:configure.in:652: -1- AC_DEFINE_TRACE_LITERAL([SSE2]) +m4trace:configure.in:652: -1- m4_pattern_allow([^SSE2$]) +m4trace:configure.in:652: -1- AH_OUTPUT([SSE2], [/* Compile with SSE2 support */ +@%:@undef SSE2]) +m4trace:configure.in:698: -1- AC_DEFINE_TRACE_LITERAL([BGL]) +m4trace:configure.in:698: -1- m4_pattern_allow([^BGL$]) +m4trace:configure.in:698: -1- AH_OUTPUT([BGL], [/* Optimize for Blue Gene/L */ +@%:@undef BGL]) +m4trace:configure.in:728: -1- AC_DEFINE_TRACE_LITERAL([BGL]) +m4trace:configure.in:728: -1- m4_pattern_allow([^BGL$]) +m4trace:configure.in:728: -1- AH_OUTPUT([BGL], [/* Optimize for Blue Gene/L */ +@%:@undef BGL]) +m4trace:configure.in:729: -1- AC_DEFINE_TRACE_LITERAL([BGP]) +m4trace:configure.in:729: -1- m4_pattern_allow([^BGP$]) +m4trace:configure.in:729: -1- AH_OUTPUT([BGP], [/* Optimize for Blue Gene/P */ +@%:@undef BGP]) +m4trace:configure.in:775: -1- AC_DEFINE_TRACE_LITERAL([CRAY]) +m4trace:configure.in:775: -1- m4_pattern_allow([^CRAY$]) +m4trace:configure.in:775: -1- AH_OUTPUT([CRAY], [/* We are on a CRAY */ +@%:@undef CRAY]) 
+m4trace:configure.in:783: -1- AC_SUBST([CCDEP]) +m4trace:configure.in:783: -1- AC_SUBST_TRACE([CCDEP]) +m4trace:configure.in:783: -1- m4_pattern_allow([^CCDEP$]) +m4trace:configure.in:812: -1- AC_DEFINE_TRACE_LITERAL([_GAUGE_COPY]) +m4trace:configure.in:812: -1- m4_pattern_allow([^_GAUGE_COPY$]) +m4trace:configure.in:812: -1- AH_OUTPUT([_GAUGE_COPY], [/* Construct an extra copy of the gauge fields */ +@%:@undef _GAUGE_COPY]) +m4trace:configure.in:823: -1- AC_DEFINE_TRACE_LITERAL([_USE_HALFSPINOR]) +m4trace:configure.in:823: -1- m4_pattern_allow([^_USE_HALFSPINOR$]) +m4trace:configure.in:823: -1- AH_OUTPUT([_USE_HALFSPINOR], [/* Exchange only a halfspinor in the Dirac Operator */ +@%:@undef _USE_HALFSPINOR]) +m4trace:configure.in:826: -1- AC_DEFINE_TRACE_LITERAL([_GAUGE_COPY]) +m4trace:configure.in:826: -1- m4_pattern_allow([^_GAUGE_COPY$]) +m4trace:configure.in:826: -1- AH_OUTPUT([_GAUGE_COPY], [/* Construct an extra copy of the gauge fields */ +@%:@undef _GAUGE_COPY]) +m4trace:configure.in:838: -1- AC_DEFINE_TRACE_LITERAL([_USE_SHMEM]) +m4trace:configure.in:838: -1- m4_pattern_allow([^_USE_SHMEM$]) +m4trace:configure.in:838: -1- AH_OUTPUT([_USE_SHMEM], [/* Use shmem API */ +@%:@undef _USE_SHMEM]) +m4trace:configure.in:850: -1- AC_DEFINE_TRACE_LITERAL([_USE_TSPLITPAR]) +m4trace:configure.in:850: -1- m4_pattern_allow([^_USE_TSPLITPAR$]) +m4trace:configure.in:850: -1- AH_OUTPUT([_USE_TSPLITPAR], [/* timeslice-splitted communications */ +@%:@undef _USE_TSPLITPAR]) +m4trace:configure.in:861: -1- AC_DEFINE_TRACE_LITERAL([WITHLAPH]) +m4trace:configure.in:861: -1- m4_pattern_allow([^WITHLAPH$]) +m4trace:configure.in:861: -1- AH_OUTPUT([WITHLAPH], [/* LapH eigensystem */ +@%:@undef WITHLAPH]) +m4trace:configure.in:873: -1- AC_DEFINE_TRACE_LITERAL([HAVE_GPU]) +m4trace:configure.in:873: -1- m4_pattern_allow([^HAVE_GPU$]) +m4trace:configure.in:873: -1- AH_OUTPUT([HAVE_GPU], [/* Using CUDA GPU */ +@%:@undef HAVE_GPU]) +m4trace:configure.in:911: -1- AC_SUBST([USESUBDIRS]) 
+m4trace:configure.in:911: -1- AC_SUBST_TRACE([USESUBDIRS]) +m4trace:configure.in:911: -1- m4_pattern_allow([^USESUBDIRS$]) +m4trace:configure.in:912: -1- AC_SUBST([NVCC]) +m4trace:configure.in:912: -1- AC_SUBST_TRACE([NVCC]) +m4trace:configure.in:912: -1- m4_pattern_allow([^NVCC$]) +m4trace:configure.in:913: -1- AC_SUBST([GPUDIR]) +m4trace:configure.in:913: -1- AC_SUBST_TRACE([GPUDIR]) +m4trace:configure.in:913: -1- m4_pattern_allow([^GPUDIR$]) +m4trace:configure.in:914: -1- AC_SUBST([GPUCFLAGS]) +m4trace:configure.in:914: -1- AC_SUBST_TRACE([GPUCFLAGS]) +m4trace:configure.in:914: -1- m4_pattern_allow([^GPUCFLAGS$]) +m4trace:configure.in:915: -1- AC_SUBST([GPUMPICOMPILER]) +m4trace:configure.in:915: -1- AC_SUBST_TRACE([GPUMPICOMPILER]) +m4trace:configure.in:915: -1- m4_pattern_allow([^GPUMPICOMPILER$]) +m4trace:configure.in:920: -1- AC_DEFINE_TRACE_LITERAL([QUDA]) +m4trace:configure.in:920: -1- m4_pattern_allow([^QUDA$]) +m4trace:configure.in:920: -1- AH_OUTPUT([QUDA], [/* Using QUDA GPU */ +@%:@undef QUDA]) +m4trace:configure.in:920: -1- AH_OUTPUT([HAVE_LIBCUDART], [/* Define to 1 if you have the `cudart\' library (-lcudart). */ +@%:@undef HAVE_LIBCUDART]) +m4trace:configure.in:920: -1- AC_DEFINE_TRACE_LITERAL([HAVE_LIBCUDART]) +m4trace:configure.in:920: -1- m4_pattern_allow([^HAVE_LIBCUDART$]) +m4trace:configure.in:920: -1- AH_OUTPUT([HAVE_LIBQUDA], [/* Define to 1 if you have the `quda\' library (-lquda). 
*/ +@%:@undef HAVE_LIBQUDA]) +m4trace:configure.in:920: -1- AC_SUBST([CXX]) +m4trace:configure.in:920: -1- AC_SUBST_TRACE([CXX]) +m4trace:configure.in:920: -1- m4_pattern_allow([^CXX$]) +m4trace:configure.in:920: -1- AC_SUBST([CXXFLAGS]) +m4trace:configure.in:920: -1- AC_SUBST_TRACE([CXXFLAGS]) +m4trace:configure.in:920: -1- m4_pattern_allow([^CXXFLAGS$]) +m4trace:configure.in:920: -1- AC_SUBST([LDFLAGS]) +m4trace:configure.in:920: -1- AC_SUBST_TRACE([LDFLAGS]) +m4trace:configure.in:920: -1- m4_pattern_allow([^LDFLAGS$]) +m4trace:configure.in:920: -1- AC_SUBST([LIBS]) +m4trace:configure.in:920: -1- AC_SUBST_TRACE([LIBS]) +m4trace:configure.in:920: -1- m4_pattern_allow([^LIBS$]) +m4trace:configure.in:920: -1- AC_SUBST([CPPFLAGS]) +m4trace:configure.in:920: -1- AC_SUBST_TRACE([CPPFLAGS]) +m4trace:configure.in:920: -1- m4_pattern_allow([^CPPFLAGS$]) +m4trace:configure.in:920: -1- AC_SUBST([CXX]) +m4trace:configure.in:920: -1- AC_SUBST_TRACE([CXX]) +m4trace:configure.in:920: -1- m4_pattern_allow([^CXX$]) +m4trace:configure.in:920: -1- AC_SUBST([ac_ct_CXX]) +m4trace:configure.in:920: -1- AC_SUBST_TRACE([ac_ct_CXX]) +m4trace:configure.in:920: -1- m4_pattern_allow([^ac_ct_CXX$]) +m4trace:configure.in:920: -1- AC_DEFINE_TRACE_LITERAL([HAVE_LIBQUDA]) +m4trace:configure.in:920: -1- m4_pattern_allow([^HAVE_LIBQUDA$]) +m4trace:configure.in:956: -1- AC_SUBST([QUDA_AVAILABLE]) +m4trace:configure.in:956: -1- AC_SUBST_TRACE([QUDA_AVAILABLE]) +m4trace:configure.in:956: -1- m4_pattern_allow([^QUDA_AVAILABLE$]) +m4trace:configure.in:1007: -1- AC_CONFIG_FILES([Makefile $make_files]) +m4trace:configure.in:1009: -1- AC_SUBST([LIB@&t@OBJS], [$ac_libobjs]) +m4trace:configure.in:1009: -1- AC_SUBST_TRACE([LIB@&t@OBJS]) +m4trace:configure.in:1009: -1- m4_pattern_allow([^LIB@&t@OBJS$]) +m4trace:configure.in:1009: -1- AC_SUBST([LTLIBOBJS], [$ac_ltlibobjs]) +m4trace:configure.in:1009: -1- AC_SUBST_TRACE([LTLIBOBJS]) +m4trace:configure.in:1009: -1- m4_pattern_allow([^LTLIBOBJS$]) 
+m4trace:configure.in:1009: -1- AC_SUBST_TRACE([top_builddir]) +m4trace:configure.in:1009: -1- AC_SUBST_TRACE([top_build_prefix]) +m4trace:configure.in:1009: -1- AC_SUBST_TRACE([srcdir]) +m4trace:configure.in:1009: -1- AC_SUBST_TRACE([abs_srcdir]) +m4trace:configure.in:1009: -1- AC_SUBST_TRACE([top_srcdir]) +m4trace:configure.in:1009: -1- AC_SUBST_TRACE([abs_top_srcdir]) +m4trace:configure.in:1009: -1- AC_SUBST_TRACE([builddir]) +m4trace:configure.in:1009: -1- AC_SUBST_TRACE([abs_builddir]) +m4trace:configure.in:1009: -1- AC_SUBST_TRACE([abs_top_builddir]) diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/benchmark.c b/qcd/part_cpu/applications/QCD/src/kernel_D/benchmark.c new file mode 100644 index 0000000000000000000000000000000000000000..c5fe2ba3f1bcbc4f7e5e5a14d59093efb88ff68d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/benchmark.c @@ -0,0 +1,494 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +/******************************************************************************* +* +* Benchmark program for the even-odd preconditioned Wilson-Dirac operator +* +* +*******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include + +#include "global.h" + +#include "mpi_init.h" +#if (defined BGL && !defined BGP) +# include +#endif +#ifdef MPI +# include +# ifdef HAVE_LIBLEMON +# include +# include +# endif +#endif +#ifdef OMP +# include +# include "init/init_openmp.h" +#endif +#include "gettime.h" +#include "su3.h" +#include "su3adj.h" +#include "ranlxd.h" +#include "geometry_eo.h" +#include "read_input.h" +#include "start.h" +#include "boundary.h" +#include "operator/Hopping_Matrix.h" +#include "operator/Hopping_Matrix_nocom.h" +#include "operator/tm_operators.h" +#include "xchange/xchange.h" +#include "init/init.h" +#include "test/check_geometry.h" +#include "operator/D_psi.h" +//#include "../qcd-diag.h" + +#ifndef BENCHMARK +#include "phmc.h" +#endif + + +#ifdef PARALLELT +# define SLICE (LX*LY*LZ/2) +#elif defined PARALLELXT +# define SLICE ((LX*LY*LZ/2)+(T*LY*LZ/2)) +#elif defined PARALLELXYT +# define SLICE ((LX*LY*LZ/2)+(T*LY*LZ/2) + (T*LX*LZ/2)) +#elif defined PARALLELXYZT +# define SLICE ((LX*LY*LZ/2)+(T*LY*LZ/2) + (T*LX*LZ/2) + (T*LX*LY/2)) +#elif defined PARALLELX +# define SLICE ((LY*LZ*T/2)) +#elif defined PARALLELXY +# define SLICE ((LY*LZ*T/2) + (LX*LZ*T/2)) +#elif defined PARALLELXYZ +# define SLICE ((LY*LZ*T/2) + (LX*LZ*T/2) + (LX*LY*T/2)) +#endif + +int check_xchange(); + +int kernel_d() +{ + int j,j_max,k,k_max = 1; +#ifdef HAVE_LIBLEMON + paramsXlfInfo *xlfInfo; +#endif + int status = 0; + int jube_kernel_number = 3; + static double t1,t2,dt,sdt,dts,qdt,sqdt; + double antioptaway=0.0; + +#ifdef MPI + static double dt2; + +/* JuBE: */ + 
jube_kernel_init(&jube_kernel_number); + + DUM_DERI = 6; + DUM_SOLVER = DUM_DERI+2; + DUM_MATRIX = DUM_SOLVER+6; + NO_OF_SPINORFIELDS = DUM_MATRIX+2; + +# ifdef OMP + int mpi_thread_provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided); +# else + + /* JuBE: no mpi init needed */ + /* MPI_Init(&argc, &argv); */ +# endif + MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id); + +#else + g_proc_id = 0; +#endif + + g_rgi_C1 = 1.; + + /* Read the input file */ + if((status = read_input("kernel_D.input")) != 0) { + fprintf(stderr, "Could not find input file: kernel_D.input\nAborting...\n"); + exit(-1); + } + +#ifdef OMP + init_openmp(); +#endif + + tmlqcd_mpi_init(); + + + + if(g_proc_id==0) { +#ifdef SSE + printf("# The code was compiled with SSE instructions\n"); +#endif +#ifdef SSE2 + printf("# The code was compiled with SSE2 instructions\n"); +#endif +#ifdef SSE3 + printf("# The code was compiled with SSE3 instructions\n"); +#endif +#ifdef P4 + printf("# The code was compiled for Pentium4\n"); +#endif +#ifdef OPTERON + printf("# The code was compiled for AMD Opteron\n"); +#endif +#ifdef _GAUGE_COPY + printf("# The code was compiled with -D_GAUGE_COPY\n"); +#endif +#ifdef BGL + printf("# The code was compiled for Blue Gene/L\n"); +#endif +#ifdef BGP + printf("# The code was compiled for Blue Gene/P\n"); +#endif +#ifdef _USE_HALFSPINOR + printf("# The code was compiled with -D_USE_HALFSPINOR\n"); +#endif +#ifdef _USE_SHMEM + printf("# The code was compiled with -D_USE_SHMEM\n"); +# ifdef _PERSISTENT + printf("# The code was compiled for persistent MPI calls (halfspinor only)\n"); +# endif +#endif +#ifdef MPI +# ifdef _NON_BLOCKING + printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n"); +# endif +#endif + printf("\n"); + fflush(stdout); + } + + +#ifdef _GAUGE_COPY + init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1); +#else + init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0); +#endif + init_geometry_indices(VOLUMEPLUSRAND 
+ g_dbw2rand); + + if(even_odd_flag) { + j = init_spinor_field(VOLUMEPLUSRAND/2, 2*k_max+1); + } + else { + j = init_spinor_field(VOLUMEPLUSRAND, 2*k_max); + } + + if ( j!= 0) { + fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n"); + exit(0); + } + j = init_moment_field(VOLUME, VOLUMEPLUSRAND + g_dbw2rand); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for moment fields! Aborting...\n"); + exit(0); + } + + if(g_proc_id == 0) { + fprintf(stdout,"# The number of processes is %d \n",g_nproc); + printf("# The lattice size is %d x %d x %d x %d\n", + (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ)); + printf("# The local lattice size is %d x %d x %d x %d\n", + (int)(T), (int)(LX), (int)(LY),(int) LZ); + if(even_odd_flag) { + printf("# benchmarking the even/odd preconditioned Dirac operator\n"); + } + else { + printf("# benchmarking the standard Dirac operator\n"); + } + fflush(stdout); + } + + /* define the geometry */ + geometry_KD(); + /* define the boundary conditions for the fermion fields */ + boundary(g_kappa); + +#ifdef _USE_HALFSPINOR + j = init_dirac_halfspinor(); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for halfspinor fields! Aborting...\n"); + exit(0); + } + if(g_sloppy_precision_flag == 1) { + g_sloppy_precision = 1; + j = init_dirac_halfspinor32(); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for 32-Bit halfspinor fields! Aborting...\n"); + exit(0); + } + } +# if (defined _PERSISTENT) + init_xchange_halffield(); +# endif +#endif + + status = check_geometry(); + if (status != 0) { + fprintf(stderr, "Checking of geometry failed. 
Unable to proceed.\nAborting....\n"); + exit(1); + } +#if (defined MPI && !(defined _USE_SHMEM)) + check_xchange(); +#endif + + start_ranlux_KD(1, 123456); + random_gauge_field(reproduce_randomnumber_flag, g_gauge_field); + +#ifdef MPI + /*For parallelization: exchange the gaugefield */ + xchange_gauge(g_gauge_field); +#endif +/* JuBE */ + jube_kernel_run(); + + if(even_odd_flag) { + sdt=0.; sqdt=0.0; + /*initialize the pseudo-fermion fields*/ + for (k = 0; k < k_max; k++) { + random_spinor_field_eo(g_spinor_field[k], reproduce_randomnumber_flag, RN_GAUSS); + } + + j_max=512; + antioptaway=0.0; + /* compute approximately how many applications we need to do to get a reliable measurement */ +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + t1 = gettime(); + for (j=0;j. + ***********************************************************************/ +/******************************************************************************* +* +* Benchmark program for the even-odd preconditioned Wilson-Dirac operator +* +* +*******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include + +#include "global.h" + +#include "mpi_init.h" +#if (defined BGL && !defined BGP) +# include +#endif +#ifdef MPI +# include +# ifdef HAVE_LIBLEMON +# include +# include +# endif +#endif +#ifdef OMP +# include +# include "init/init_openmp.h" +#endif +#include "gettime.h" +#include "su3.h" +#include "su3adj.h" +#include "ranlxd.h" +#include "geometry_eo.h" +#include "read_input.h" +#include "start.h" +#include "boundary.h" +#include "operator/Hopping_Matrix.h" +#include "operator/Hopping_Matrix_nocom.h" +#include "operator/tm_operators.h" +#include "xchange/xchange.h" +#include "init/init.h" +#include "test/check_geometry.h" +#include "operator/D_psi.h" + +#ifndef BENCHMARK +#include "phmc.h" +#endif + + +#ifdef PARALLELT +# define SLICE (LX*LY*LZ/2) +#elif defined PARALLELXT +# define SLICE 
((LX*LY*LZ/2)+(T*LY*LZ/2)) +#elif defined PARALLELXYT +# define SLICE ((LX*LY*LZ/2)+(T*LY*LZ/2) + (T*LX*LZ/2)) +#elif defined PARALLELXYZT +# define SLICE ((LX*LY*LZ/2)+(T*LY*LZ/2) + (T*LX*LZ/2) + (T*LX*LY/2)) +#elif defined PARALLELX +# define SLICE ((LY*LZ*T/2)) +#elif defined PARALLELXY +# define SLICE ((LY*LZ*T/2) + (LX*LZ*T/2)) +#elif defined PARALLELXYZ +# define SLICE ((LY*LZ*T/2) + (LX*LZ*T/2) + (LX*LY*T/2)) +#endif + +int check_xchange(); + +int main(int argc,char *argv[]) +{ + int j,j_max,k,k_max = 1; +#ifdef HAVE_LIBLEMON + paramsXlfInfo *xlfInfo; +#endif + int status = 0; + + static double t1,t2,dt,sdt,dts,qdt,sqdt; + double antioptaway=0.0; + +#ifdef MPI + static double dt2; + + DUM_DERI = 6; + DUM_SOLVER = DUM_DERI+2; + DUM_MATRIX = DUM_SOLVER+6; + NO_OF_SPINORFIELDS = DUM_MATRIX+2; + +# ifdef OMP + int mpi_thread_provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided); +# else + MPI_Init(&argc, &argv); +# endif + MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id); + +#else + g_proc_id = 0; +#endif + + g_rgi_C1 = 1.; + + /* Read the input file */ + if((status = read_input("benchmark.input")) != 0) { + fprintf(stderr, "Could not find input file: benchmark.input\nAborting...\n"); + exit(-1); + } + +#ifdef OMP + init_openmp(); +#endif + + tmlqcd_mpi_init(); + + + + if(g_proc_id==0) { +#ifdef SSE + printf("# The code was compiled with SSE instructions\n"); +#endif +#ifdef SSE2 + printf("# The code was compiled with SSE2 instructions\n"); +#endif +#ifdef SSE3 + printf("# The code was compiled with SSE3 instructions\n"); +#endif +#ifdef P4 + printf("# The code was compiled for Pentium4\n"); +#endif +#ifdef OPTERON + printf("# The code was compiled for AMD Opteron\n"); +#endif +#ifdef _GAUGE_COPY + printf("# The code was compiled with -D_GAUGE_COPY\n"); +#endif +#ifdef BGL + printf("# The code was compiled for Blue Gene/L\n"); +#endif +#ifdef BGP + printf("# The code was compiled for Blue Gene/P\n"); +#endif +#ifdef _USE_HALFSPINOR + 
printf("# The code was compiled with -D_USE_HALFSPINOR\n"); +#endif +#ifdef _USE_SHMEM + printf("# The code was compiled with -D_USE_SHMEM\n"); +# ifdef _PERSISTENT + printf("# The code was compiled for persistent MPI calls (halfspinor only)\n"); +# endif +#endif +#ifdef MPI +# ifdef _NON_BLOCKING + printf("# The code was compiled for non-blocking MPI calls (spinor and gauge)\n"); +# endif +#endif + printf("\n"); + fflush(stdout); + } + + +#ifdef _GAUGE_COPY + init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1); +#else + init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0); +#endif + init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand); + + if(even_odd_flag) { + j = init_spinor_field(VOLUMEPLUSRAND/2, 2*k_max+1); + } + else { + j = init_spinor_field(VOLUMEPLUSRAND, 2*k_max); + } + + if ( j!= 0) { + fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n"); + exit(0); + } + j = init_moment_field(VOLUME, VOLUMEPLUSRAND + g_dbw2rand); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for moment fields! Aborting...\n"); + exit(0); + } + + if(g_proc_id == 0) { + fprintf(stdout,"# The number of processes is %d \n",g_nproc); + printf("# The lattice size is %d x %d x %d x %d\n", + (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ)); + printf("# The local lattice size is %d x %d x %d x %d\n", + (int)(T), (int)(LX), (int)(LY),(int) LZ); + if(even_odd_flag) { + printf("# benchmarking the even/odd preconditioned Dirac operator\n"); + } + else { + printf("# benchmarking the standard Dirac operator\n"); + } + fflush(stdout); + } + + /* define the geometry */ + geometry_KD(); + /* define the boundary conditions for the fermion fields */ + boundary(g_kappa); + +#ifdef _USE_HALFSPINOR + j = init_dirac_halfspinor(); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for halfspinor fields! 
Aborting...\n"); + exit(0); + } + if(g_sloppy_precision_flag == 1) { + g_sloppy_precision = 1; + j = init_dirac_halfspinor32(); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for 32-Bit halfspinor fields! Aborting...\n"); + exit(0); + } + } +# if (defined _PERSISTENT) + init_xchange_halffield(); +# endif +#endif + + status = check_geometry(); + if (status != 0) { + fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n"); + exit(1); + } +#if (defined MPI && !(defined _USE_SHMEM)) + check_xchange(); +#endif + + start_ranlux_KD(1, 123456); + random_gauge_field(reproduce_randomnumber_flag, g_gauge_field); + +#ifdef MPI + /*For parallelization: exchange the gaugefield */ + xchange_gauge(g_gauge_field); +#endif + + if(even_odd_flag) { + sdt=0.; sqdt=0.0; + /*initialize the pseudo-fermion fields*/ + for (k = 0; k < k_max; k++) { + random_spinor_field_eo(g_spinor_field[k], reproduce_randomnumber_flag, RN_GAUSS); + } + + j_max=512; + antioptaway=0.0; + /* compute approximately how many applications we need to do to get a reliable measurement */ +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + t1 = gettime(); + for (j=0;j. + ***********************************************************************/ + +#ifndef _BGL_H +#define _BGL_H + +/*********************************************** + * + * some macros for optimising on the Blue Gene/L + * + * In the functions where they are to be used + * there must be declared + * double _Complex reg00, reg01,...,reg07; + * double _Complex reg10, ... 
+ * double _Complex rs00, ..., rs32 + * + * Author: Carsten Urbach + * carsten.urbach@liverpool.ac.uk + * + ***********************************************/ + +#define _bgl_load_reg0(s) \ + reg00 = __lfpd((double*)&(s).c0); \ + reg01 = __lfpd((double*)&(s).c1); \ + reg02 = __lfpd((double*)&(s).c2); + +#define _bgl_load_reg0_32(s) \ + reg00 = __lfps((float*)&(s).c0); \ + reg01 = __lfps((float*)&(s).c1); \ + reg02 = __lfps((float*)&(s).c2); + +#define _bgl_load_reg1(s) \ + reg10 = __lfpd((double*)&(s).c0); \ + reg11 = __lfpd((double*)&(s).c1); \ + reg12 = __lfpd((double*)&(s).c2); + +#define _bgl_load_reg1_32(s) \ + reg10 = __lfps((float*)&(s).c0); \ + reg11 = __lfps((float*)&(s).c1); \ + reg12 = __lfps((float*)&(s).c2); + +#define _bgl_load_reg0_up(s) \ + reg03 = __lfpd((double*)&(s).c0); \ + reg04 = __lfpd((double*)&(s).c1); \ + reg05 = __lfpd((double*)&(s).c2); + +#define _bgl_load_reg0_up_32(s) \ + reg03 = __lfps((float*)&(s).c0); \ + reg04 = __lfps((float*)&(s).c1); \ + reg05 = __lfps((float*)&(s).c2); + +#define _bgl_load_reg1_up(s) \ + reg13 = __lfpd((double*)&(s).c0); \ + reg14 = __lfpd((double*)&(s).c1); \ + reg15 = __lfpd((double*)&(s).c2); + +#define _bgl_load_reg1_up_32(s) \ + reg13 = __lfps((float*)&(s).c0); \ + reg14 = __lfps((float*)&(s).c1); \ + reg15 = __lfps((float*)&(s).c2); + +#define _bgl_store_reg0(s) \ + __stfpd((double*)&(s).c0, reg00); \ + __stfpd((double*)&(s).c1, reg01); \ + __stfpd((double*)&(s).c2, reg02); + +#define _bgl_store_reg0_32(s) \ + __stfps((float*)&(s).c0, reg00); \ + __stfps((float*)&(s).c1, reg01); \ + __stfps((float*)&(s).c2, reg02); + +#define _bgl_store_reg1(s) \ + __stfpd((double*)&(s).c0, reg10); \ + __stfpd((double*)&(s).c1, reg11); \ + __stfpd((double*)&(s).c2, reg12); + +#define _bgl_store_reg1_32(s) \ + __stfps((float*)&(s).c0, reg10); \ + __stfps((float*)&(s).c1, reg11); \ + __stfps((float*)&(s).c2, reg12); + +#define _bgl_store_reg0_up(s) \ + __stfpd((double*)&(s).c0, reg03); \ + __stfpd((double*)&(s).c1, reg04); \ 
+ __stfpd((double*)&(s).c2, reg05); + +#define _bgl_store_reg0_up_32(s) \ + __stfps((float*)&(s).c0, reg03); \ + __stfps((float*)&(s).c1, reg04); \ + __stfps((float*)&(s).c2, reg05); + +#define _bgl_store_reg1_up(s) \ + __stfpd((double*)&(s).c0, reg13); \ + __stfpd((double*)&(s).c1, reg14); \ + __stfpd((double*)&(s).c2, reg15); + +#define _bgl_store_reg1_up_32(s) \ + __stfps((float*)&(s).c0, reg13); \ + __stfps((float*)&(s).c1, reg14); \ + __stfps((float*)&(s).c2, reg15); + +#define _bgl_load_rs0(s) \ + rs00 = __lfpd((double*)&(s).c0); \ + rs01 = __lfpd((double*)&(s).c1); \ + rs02 = __lfpd((double*)&(s).c2); + +#define _bgl_load_rs0_32(s) \ + rs00 = __lfps((float*)&(s).c0); \ + rs01 = __lfps((float*)&(s).c1); \ + rs02 = __lfps((float*)&(s).c2); + +#define _bgl_load_rs1(s) \ + rs10 = __lfpd((double*)&(s).c0); \ + rs11 = __lfpd((double*)&(s).c1); \ + rs12 = __lfpd((double*)&(s).c2); + +#define _bgl_load_rs1_32(s) \ + rs10 = __lfps((float*)&(s).c0); \ + rs11 = __lfps((float*)&(s).c1); \ + rs12 = __lfps((float*)&(s).c2); + +#define _bgl_load_rs2(s) \ + rs20 = __lfpd((double*)&(s).c0); \ + rs21 = __lfpd((double*)&(s).c1); \ + rs22 = __lfpd((double*)&(s).c2); + +#define _bgl_load_rs2_32(s) \ + rs20 = __lfps((float*)&(s).c0); \ + rs21 = __lfps((float*)&(s).c1); \ + rs22 = __lfps((float*)&(s).c2); + +#define _bgl_load_rs3(s) \ + rs30 = __lfpd((double*)&(s).c0); \ + rs31 = __lfpd((double*)&(s).c1); \ + rs32 = __lfpd((double*)&(s).c2); + +#define _bgl_load_rs3_32(s) \ + rs30 = __lfps((float*)&(s).c0); \ + rs31 = __lfps((float*)&(s).c1); \ + rs32 = __lfps((float*)&(s).c2); + +#define _bgl_store_rs0(s) \ + __stfpd((double*)&(s).c0, rs00); \ + __stfpd((double*)&(s).c1, rs01); \ + __stfpd((double*)&(s).c2, rs02); + +#define _bgl_store_rs1(s) \ + __stfpd((double*)&(s).c0, rs10); \ + __stfpd((double*)&(s).c1, rs11); \ + __stfpd((double*)&(s).c2, rs12); + +#define _bgl_store_rs2(s) \ + __stfpd((double*)&(s).c0, rs20); \ + __stfpd((double*)&(s).c1, rs21); \ + 
__stfpd((double*)&(s).c2, rs22); + +#define _bgl_store_rs3(s) \ + __stfpd((double*)&(s).c0, rs30); \ + __stfpd((double*)&(s).c1, rs31); \ + __stfpd((double*)&(s).c2, rs32); + + +#define _bgl_store_reg0_up_rs1() \ + rs10 = reg03; \ + rs11 = reg04; \ + rs12 = reg05; + +#define _bgl_store_reg1_up_rs1() \ + rs10 = reg13; \ + rs11 = reg14; \ + rs12 = reg15; + +#define _bgl_store_reg0_up_rs3() \ + rs30 = reg03; \ + rs31 = reg04; \ + rs32 = reg05; + +#define _bgl_store_reg1_up_rs3() \ + rs30 = reg13; \ + rs31 = reg14; \ + rs32 = reg15; + +#define _bgl_store_reg0_up_rs0() \ + rs00 = reg03; \ + rs01 = reg04; \ + rs02 = reg05; + +#define _bgl_store_reg1_up_rs0() \ + rs00 = reg13; \ + rs01 = reg14; \ + rs02 = reg15; + +#define _bgl_store_reg0_up_rs2() \ + rs20 = reg03; \ + rs21 = reg04; \ + rs22 = reg05; + +#define _bgl_store_reg1_up_rs2() \ + rs20 = reg13; \ + rs21 = reg14; \ + rs22 = reg15; + +#define _bgl_add_to_rs0_reg0() \ + rs00 = __fpadd(reg03, rs00); \ + rs01 = __fpadd(reg04, rs01); \ + rs02 = __fpadd(reg05, rs02); + +#define _bgl_add_to_rs0_reg1() \ + rs00 = __fpadd(reg13, rs00); \ + rs01 = __fpadd(reg14, rs01); \ + rs02 = __fpadd(reg15, rs02); + +#define _bgl_i_mul_add_to_rs0_reg0() \ + rs00 = __fxcxnpma(rs00, reg03, 1.); \ + rs01 = __fxcxnpma(rs01, reg04, 1.); \ + rs02 = __fxcxnpma(rs02, reg05, 1.); + +#define _bgl_i_mul_add_to_rs0_reg1() \ + rs00 = __fxcxnpma(rs00, reg13, 1.); \ + rs01 = __fxcxnpma(rs01, reg14, 1.); \ + rs02 = __fxcxnpma(rs02, reg15, 1.); + +#define _bgl_add_to_rs1_reg0() \ + rs10 = __fpadd(reg03, rs10); \ + rs11 = __fpadd(reg04, rs11); \ + rs12 = __fpadd(reg05, rs12); + +#define _bgl_add_to_rs1_reg1() \ + rs10 = __fpadd(reg13, rs10); \ + rs11 = __fpadd(reg14, rs11); \ + rs12 = __fpadd(reg15, rs12); + +#define _bgl_i_mul_add_to_rs1_reg0() \ + rs10 = __fxcxnpma(rs10, reg03, 1.); \ + rs11 = __fxcxnpma(rs11, reg04, 1.); \ + rs12 = __fxcxnpma(rs12, reg05, 1.); + +#define _bgl_i_mul_add_to_rs1_reg1() \ + rs10 = __fxcxnpma(rs10, reg13, 1.); \ + rs11 = 
__fxcxnpma(rs11, reg14, 1.); \ + rs12 = __fxcxnpma(rs12, reg15, 1.); + +#define _bgl_add_to_rs2_reg0() \ + rs20 = __fpadd(reg03, rs20); \ + rs21 = __fpadd(reg04, rs21); \ + rs22 = __fpadd(reg05, rs22); + +#define _bgl_add_to_rs2_reg1() \ + rs20 = __fpadd(reg13, rs20); \ + rs21 = __fpadd(reg14, rs21); \ + rs22 = __fpadd(reg15, rs22); + +#define _bgl_i_mul_add_to_rs2_reg0() \ + rs20 = __fxcxnpma(rs20, reg03, 1.); \ + rs21 = __fxcxnpma(rs21, reg04, 1.); \ + rs22 = __fxcxnpma(rs22, reg05, 1.); + +#define _bgl_i_mul_add_to_rs2_reg1() \ + rs20 = __fxcxnpma(rs20, reg13, 1.); \ + rs21 = __fxcxnpma(rs21, reg14, 1.); \ + rs22 = __fxcxnpma(rs22, reg15, 1.); + +#define _bgl_add_to_rs3_reg0() \ + rs30 = __fpadd(reg03, rs30); \ + rs31 = __fpadd(reg04, rs31); \ + rs32 = __fpadd(reg05, rs32); + +#define _bgl_add_to_rs3_reg1() \ + rs30 = __fpadd(reg13, rs30); \ + rs31 = __fpadd(reg14, rs31); \ + rs32 = __fpadd(reg15, rs32); + +#define _bgl_i_mul_add_to_rs3_reg0() \ + rs30 = __fxcxnpma(rs30, reg03, 1.); \ + rs31 = __fxcxnpma(rs31, reg04, 1.); \ + rs32 = __fxcxnpma(rs32, reg05, 1.); + +#define _bgl_i_mul_add_to_rs3_reg1() \ + rs30 = __fxcxnpma(rs30, reg13, 1.); \ + rs31 = __fxcxnpma(rs31, reg14, 1.); \ + rs32 = __fxcxnpma(rs32, reg15, 1.); + +#define _bgl_sub_from_rs0_reg0() \ + rs00 = __fpsub(rs00, reg03); \ + rs01 = __fpsub(rs01, reg04); \ + rs02 = __fpsub(rs02, reg05); \ + +#define _bgl_sub_from_rs0_reg1() \ + rs00 = __fpsub(rs00, reg13); \ + rs01 = __fpsub(rs01, reg14); \ + rs02 = __fpsub(rs02, reg15); \ + +#define _bgl_i_mul_sub_from_rs0_reg0() \ + rs00 = __fxcxnpma(rs00, reg03, -1.); \ + rs01 = __fxcxnpma(rs01, reg04, -1.); \ + rs02 = __fxcxnpma(rs02, reg05, -1.); + +#define _bgl_i_mul_sub_from_rs0_reg1() \ + rs00 = __fxcxnpma(rs00, reg13, -1.); \ + rs01 = __fxcxnpma(rs01, reg14, -1.); \ + rs02 = __fxcxnpma(rs02, reg15, -1.); + +#define _bgl_sub_from_rs1_reg0() \ + rs10 = __fpsub(rs10, reg03); \ + rs11 = __fpsub(rs11, reg04); \ + rs12 = __fpsub(rs12, reg05); \ + +#define 
_bgl_sub_from_rs1_reg1() \ + rs10 = __fpsub(rs10, reg13); \ + rs11 = __fpsub(rs11, reg14); \ + rs12 = __fpsub(rs12, reg15); \ + +#define _bgl_i_mul_sub_from_rs1_reg0() \ + rs10 = __fxcxnpma(rs10, reg03, -1.); \ + rs11 = __fxcxnpma(rs11, reg04, -1.); \ + rs12 = __fxcxnpma(rs12, reg05, -1.); + +#define _bgl_i_mul_sub_from_rs1_reg1() \ + rs10 = __fxcxnpma(rs10, reg13, -1.); \ + rs11 = __fxcxnpma(rs11, reg14, -1.); \ + rs12 = __fxcxnpma(rs12, reg15, -1.); + +#define _bgl_sub_from_rs2_reg0() \ + rs20 = __fpsub(rs20, reg03); \ + rs21 = __fpsub(rs21, reg04); \ + rs22 = __fpsub(rs22, reg05); \ + +#define _bgl_sub_from_rs2_reg1() \ + rs20 = __fpsub(rs20, reg13); \ + rs21 = __fpsub(rs21, reg14); \ + rs22 = __fpsub(rs22, reg15); \ + +#define _bgl_i_mul_sub_from_rs2_reg0() \ + rs20 = __fxcxnpma(rs20, reg03, -1.); \ + rs21 = __fxcxnpma(rs21, reg04, -1.); \ + rs22 = __fxcxnpma(rs22, reg05, -1.); + +#define _bgl_i_mul_sub_from_rs2_reg1() \ + rs20 = __fxcxnpma(rs20, reg13, -1.); \ + rs21 = __fxcxnpma(rs21, reg14, -1.); \ + rs22 = __fxcxnpma(rs22, reg15, -1.); + +#define _bgl_sub_from_rs3_reg0() \ + rs30 = __fpsub(rs30, reg03); \ + rs31 = __fpsub(rs31, reg04); \ + rs32 = __fpsub(rs32, reg05); \ + +#define _bgl_sub_from_rs3_reg1() \ + rs30 = __fpsub(rs30, reg13); \ + rs31 = __fpsub(rs31, reg14); \ + rs32 = __fpsub(rs32, reg15); \ + +#define _bgl_i_mul_sub_from_rs3_reg0() \ + rs30 = __fxcxnpma(rs30, reg03, -1.); \ + rs31 = __fxcxnpma(rs31, reg04, -1.); \ + rs32 = __fxcxnpma(rs32, reg05, -1.); + +#define _bgl_i_mul_sub_from_rs3_reg1() \ + rs30 = __fxcxnpma(rs30, reg13, -1.); \ + rs31 = __fxcxnpma(rs31, reg14, -1.); \ + rs32 = __fxcxnpma(rs32, reg15, -1.); + +#define _bgl_vector_add_reg0() \ + reg00 = __fpadd(reg00, reg03); \ + reg01 = __fpadd(reg01, reg04); \ + reg02 = __fpadd(reg02, reg05); + +#define _bgl_vector_sub_reg0() \ + reg00 = __fpsub(reg00, reg03); \ + reg01 = __fpsub(reg01, reg04); \ + reg02 = __fpsub(reg02, reg05); + +#define _bgl_vector_sub_reg0_up() \ + reg00 = 
__fpsub(reg03, reg00); \ + reg01 = __fpsub(reg04, reg01); \ + reg02 = __fpsub(reg05, reg02); + +#define _bgl_vector_add_reg1() \ + reg10 = __fpadd(reg10, reg13); \ + reg11 = __fpadd(reg11, reg14); \ + reg12 = __fpadd(reg12, reg15); + +#define _bgl_vector_sub_reg1() \ + reg10 = __fpsub(reg10, reg13); \ + reg11 = __fpsub(reg11, reg14); \ + reg12 = __fpsub(reg12, reg15); + +#define _bgl_vector_sub_reg1_up() \ + reg10 = __fpsub(reg13, reg10); \ + reg11 = __fpsub(reg14, reg11); \ + reg12 = __fpsub(reg15, reg12); + +#define _bgl_vector_sub_rs2_from_rs1_reg1() \ + reg10 = __fpsub(rs10, rs20); \ + reg11 = __fpsub(rs11, rs21); \ + reg12 = __fpsub(rs12, rs22); + +#define _bgl_vector_sub_rs3_from_rs1_reg1() \ + reg10 = __fpsub(rs10, rs30); \ + reg11 = __fpsub(rs11, rs31); \ + reg12 = __fpsub(rs12, rs32); + +#define _bgl_vector_sub_rs2_from_rs0_reg1() \ + reg10 = __fpsub(rs00, rs20); \ + reg11 = __fpsub(rs01, rs21); \ + reg12 = __fpsub(rs02, rs22); + +#define _bgl_vector_sub_rs2_from_rs0_reg0() \ + reg00 = __fpsub(rs00, rs20); \ + reg01 = __fpsub(rs01, rs21); \ + reg02 = __fpsub(rs02, rs22); + +#define _bgl_vector_sub_rs3_from_rs0_reg0() \ + reg00 = __fpsub(rs00, rs30); \ + reg01 = __fpsub(rs01, rs31); \ + reg02 = __fpsub(rs02, rs32); + +/* + * Multiplies reg3, reg4, reg5 with + * a complex number c + * + */ + +#define _bgl_vector_cmplx_mul(c) \ + reg00 = __fxpmul(reg03, __creal(c)); \ + reg01 = __fxpmul(reg04, __creal(c)); \ + reg02 = __fxpmul(reg05, __creal(c)); \ + reg03 = __fxcxnpma(reg00, reg03, __cimag(c)); \ + reg04 = __fxcxnpma(reg01, reg04, __cimag(c)); \ + reg05 = __fxcxnpma(reg02, reg05, __cimag(c)); + +#define _bgl_vector_cmplx_mul_double(c) \ + reg00 = __fxpmul(reg03, __creal(c)); \ + reg10 = __fxpmul(reg13, __creal(c)); \ + reg01 = __fxpmul(reg04, __creal(c)); \ + reg11 = __fxpmul(reg14, __creal(c)); \ + reg02 = __fxpmul(reg05, __creal(c)); \ + reg12 = __fxpmul(reg15, __creal(c)); \ + reg03 = __fxcxnpma(reg00, reg03, __cimag(c)); \ + reg13 = __fxcxnpma(reg10, 
reg13, __cimag(c)); \ + reg04 = __fxcxnpma(reg01, reg04, __cimag(c)); \ + reg14 = __fxcxnpma(reg11, reg14, __cimag(c)); \ + reg05 = __fxcxnpma(reg02, reg05, __cimag(c)); \ + reg15 = __fxcxnpma(reg12, reg15, __cimag(c)); + +/* complex number c times rs0 and rs1 */ +/* complex number c times cjgt of rs2 and rs3 */ +/* for gamma_5 mu multiplication */ + +#define _bgl_vector_cmplx_mul_rs(c) \ + reg20 = __lfpd((double*)&c); \ + reg00 = __fxpmul(rs00, __creal(reg20)); \ + reg01 = __fxpmul(rs01, __creal(reg20)); \ + reg02 = __fxpmul(rs02, __creal(reg20)); \ + reg10 = __fxpmul(rs10, __creal(reg20)); \ + reg11 = __fxpmul(rs11, __creal(reg20)); \ + reg12 = __fxpmul(rs12, __creal(reg20)); \ + rs00 = __fxcxnpma(reg00, rs00, __cimag(reg20)); \ + rs01 = __fxcxnpma(reg01, rs01, __cimag(reg20)); \ + rs02 = __fxcxnpma(reg02, rs02, __cimag(reg20)); \ + rs10 = __fxcxnpma(reg10, rs10, __cimag(reg20)); \ + rs11 = __fxcxnpma(reg11, rs11, __cimag(reg20)); \ + rs12 = __fxcxnpma(reg12, rs12, __cimag(reg20)); \ + reg00 = __fxpmul(rs20, __creal(reg20)); \ + reg01 = __fxpmul(rs21, __creal(reg20)); \ + reg02 = __fxpmul(rs22, __creal(reg20)); \ + reg10 = __fxpmul(rs30, __creal(reg20)); \ + reg11 = __fxpmul(rs31, __creal(reg20)); \ + reg12 = __fxpmul(rs32, __creal(reg20)); \ + rs20 = __fxcxnsma(reg00, rs20, __cimag(reg20)); \ + rs21 = __fxcxnsma(reg01, rs21, __cimag(reg20)); \ + rs22 = __fxcxnsma(reg02, rs22, __cimag(reg20)); \ + rs30 = __fxcxnsma(reg10, rs30, __cimag(reg20)); \ + rs31 = __fxcxnsma(reg11, rs31, __cimag(reg20)); \ + rs32 = __fxcxnsma(reg12, rs32, __cimag(reg20)); + + +#define _bgl_vector_cmplxcg_mul(c) \ + reg00 = __fxpmul(reg03, __creal(c)); \ + reg01 = __fxpmul(reg04, __creal(c)); \ + reg02 = __fxpmul(reg05, __creal(c)); \ + reg03 = __fxcxnsma(reg00, reg03, __cimag(c)); \ + reg04 = __fxcxnsma(reg01, reg04, __cimag(c)); \ + reg05 = __fxcxnsma(reg02, reg05, __cimag(c)); + +#define _bgl_vector_cmplxcg_mul_double(c) \ + reg00 = __fxpmul(reg03, __creal(c)); \ + reg10 = 
__fxpmul(reg13, __creal(c)); \ + reg01 = __fxpmul(reg04, __creal(c)); \ + reg11 = __fxpmul(reg14, __creal(c)); \ + reg02 = __fxpmul(reg05, __creal(c)); \ + reg12 = __fxpmul(reg15, __creal(c)); \ + reg03 = __fxcxnsma(reg00, reg03, __cimag(c)); \ + reg13 = __fxcxnsma(reg10, reg13, __cimag(c)); \ + reg04 = __fxcxnsma(reg01, reg04, __cimag(c)); \ + reg14 = __fxcxnsma(reg11, reg14, __cimag(c)); \ + reg05 = __fxcxnsma(reg02, reg05, __cimag(c)); \ + reg15 = __fxcxnsma(reg12, reg15, __cimag(c)); + + +/* + * Variant of _bgl_vector_cmplx_mul built from __cmplx/__fpmul/__fxmul: + * intended to multiply reg03, reg04, reg05 with the complex number c. + * Each component's two partial products are summed back into its own + * register (reg03, reg04, reg05); reg00..reg02, reg10 and reg20 are + * used as scratch and are overwritten. + */ +#define _bgl_vector_cmplx_mul1(c) \ + reg10 = __cmplx(__creal(c),__creal(c)); \ + reg20 = __cmplx(__cimag(c),-__cimag(c)); \ + reg00 = __fpmul(reg03, reg10); \ + reg03 = __fxmul(reg03, reg20); \ + reg01 = __fpmul(reg04, reg10); \ + reg04 = __fxmul(reg04, reg20); \ + reg02 = __fpmul(reg05, reg10); \ + reg05 = __fxmul(reg05, reg20); \ + reg03 = __fpadd(reg00, reg03); \ + reg04 = __fpadd(reg01, reg04); \ + reg05 = __fpadd(reg02, reg05); + +/* + * Multiplies reg3, reg4, reg5 with + * a complex conjugate of c; each sum is + * written back to its own register + * (reg03, reg04, reg05). Scratch registers + * reg00..reg02, reg10, reg20 are overwritten. + * + */ + +#define _bgl_vector_cmplxcg_mul1(c) \ + reg10 = __cmplx(__creal(c),__creal(c)); \ + reg20 = __cmplx(-__cimag(c),__cimag(c)); \ + reg00 = __fpmul(reg03, reg10); \ + reg03 = __fxmul(reg03, reg20); \ + reg01 = __fpmul(reg04, reg10); \ + reg04 = __fxmul(reg04, reg20); \ + reg02 = __fpmul(reg05, reg10); \ + reg05 = __fxmul(reg05, reg20); \ + reg03 = __fpadd(reg00, reg03); \ + reg04 = __fpadd(reg01, reg04); \ + reg05 = __fpadd(reg02, reg05); + +#define _bgl_vector_i_mul_reg0() \ + reg03 = __cmplx(-__cimag(reg03), __creal(reg03)); \ + reg04 = __cmplx(-__cimag(reg04), __creal(reg04)); \ + reg05 = __cmplx(-__cimag(reg05), __creal(reg05)); + +#define _bgl_vector_i_mul_reg1() \ + reg13 = __cmplx(-__cimag(reg13), __creal(reg13)); \ + reg14 = __cmplx(-__cimag(reg14), __creal(reg14)); \ + reg15 = __cmplx(-__cimag(reg15), __creal(reg15)); + +#define _bgl_vector_i_mul_add_reg0() \ + reg00 = __fxcxnpma(reg00, reg03, 1.); \ + reg01 = __fxcxnpma(reg01, reg04, 1.); \ + reg02 = 
__fxcxnpma(reg02, reg05, 1.); + +#define _bgl_vector_i_mul_add_rs3_to_rs0_reg0() \ + reg00 = __fxcxnpma(rs00, rs30, 1.); \ + reg01 = __fxcxnpma(rs01, rs31, 1.); \ + reg02 = __fxcxnpma(rs02, rs32, 1.); + +#define _bgl_vector_i_mul_add_rs2_to_rs0_reg0() \ + reg00 = __fxcxnpma(rs00, rs20, 1.); \ + reg01 = __fxcxnpma(rs01, rs21, 1.); \ + reg02 = __fxcxnpma(rs02, rs22, 1.); + +#define _bgl_vector_i_mul_add_rs2_to_rs1_reg1() \ + reg10 = __fxcxnpma(rs10, rs20, 1.); \ + reg11 = __fxcxnpma(rs11, rs21, 1.); \ + reg12 = __fxcxnpma(rs12, rs22, 1.); + +#define _bgl_vector_i_mul_add_rs3_to_rs1_reg1() \ + reg10 = __fxcxnpma(rs10, rs30, 1.); \ + reg11 = __fxcxnpma(rs11, rs31, 1.); \ + reg12 = __fxcxnpma(rs12, rs32, 1.); + +#define _bgl_vector_i_mul_add_reg1() \ + reg10 = __fxcxnpma(reg10, reg13, 1.); \ + reg11 = __fxcxnpma(reg11, reg14, 1.); \ + reg12 = __fxcxnpma(reg12, reg15, 1.); + +#define _bgl_vector_i_mul_sub_reg0() \ + reg00 = __fxcxnpma(reg00, reg03, -1.); \ + reg01 = __fxcxnpma(reg01, reg04, -1.); \ + reg02 = __fxcxnpma(reg02, reg05, -1.); + +#define _bgl_vector_i_mul_sub_reg1() \ + reg10 = __fxcxnpma(reg10, reg13, -1.); \ + reg11 = __fxcxnpma(reg11, reg14, -1.); \ + reg12 = __fxcxnpma(reg12, reg15, -1.); + +#define _bgl_vector_i_mul_sub_rs3_from_rs1_reg1() \ + reg10 = __fxcxnpma(rs10, rs30, -1.); \ + reg11 = __fxcxnpma(rs11, rs31, -1.); \ + reg12 = __fxcxnpma(rs12, rs32, -1.); + +#define _bgl_vector_i_mul_sub_rs2_from_rs1_reg1() \ + reg10 = __fxcxnpma(rs10, rs20, -1.); \ + reg11 = __fxcxnpma(rs11, rs21, -1.); \ + reg12 = __fxcxnpma(rs12, rs22, -1.); + +#define _bgl_vector_i_mul_sub_rs2_from_rs0_reg1() \ + reg10 = __fxcxnpma(rs00, rs20, -1.); \ + reg11 = __fxcxnpma(rs01, rs21, -1.); \ + reg12 = __fxcxnpma(rs02, rs22, -1.); + +#define _bgl_vector_i_mul_sub_rs2_from_rs0_reg0() \ + reg00 = __fxcxnpma(rs00, rs20, -1.); \ + reg01 = __fxcxnpma(rs01, rs21, -1.); \ + reg02 = __fxcxnpma(rs02, rs22, -1.); + +#define _bgl_vector_i_mul_sub_rs3_from_rs0_reg0() \ + reg00 = 
__fxcxnpma(rs00, rs30, -1.); \ + reg01 = __fxcxnpma(rs01, rs31, -1.); \ + reg02 = __fxcxnpma(rs02, rs32, -1.); + +#define _bgl_vector_i_mul1() \ + reg10 = __cmplx(1., -1.); \ + reg03 = __fxmul(reg03, reg10); \ + reg04 = __fxmul(reg04, reg10); \ + reg05 = __fxmul(reg05, reg10); + +#define _bgl_su3_multiply(u) \ + reg03 = __fxpmul(reg00, __creal((u).c00)); \ + reg06 = __fxpmul(reg01, __creal((u).c01)); \ + reg04 = __fxpmul(reg00, __creal((u).c10)); \ + reg03 = __fpadd(reg06, reg03); \ + reg07 = __fxpmul(reg02, __creal((u).c12)); \ + reg05 = __fxpmul(reg00, __creal((u).c20)); \ + reg04 = __fpadd(reg07, reg04); \ + reg06 = __fxpmul(reg01, __creal((u).c21)); \ + reg07 = __fxpmul(reg02, __creal((u).c02)); \ + reg05 = __fpadd(reg06, reg05); \ + reg03 = __fpadd(reg07, reg03); \ + reg06 = __fxpmul(reg01, __creal((u).c11)); \ + reg07 = __fxpmul(reg02, __creal((u).c22)); \ + reg04 = __fpadd(reg06, reg04); \ + reg05 = __fpadd(reg07, reg05); \ + \ + reg03 = __fxcxnpma(reg03, reg00, __cimag((u).c00)); \ + reg04 = __fxcxnpma(reg04, reg01, __cimag((u).c11)); \ + reg05 = __fxcxnpma(reg05, reg02, __cimag((u).c22)); \ + reg04 = __fxcxnpma(reg04, reg00, __cimag((u).c10)); \ + reg03 = __fxcxnpma(reg03, reg01, __cimag((u).c01)); \ + reg05 = __fxcxnpma(reg05, reg00, __cimag((u).c20)); \ + reg03 = __fxcxnpma(reg03, reg02, __cimag((u).c02)); \ + reg05 = __fxcxnpma(reg05, reg01, __cimag((u).c21)); \ + reg04 = __fxcxnpma(reg04, reg02, __cimag((u).c12)); + +#define _bgl_su3_multiply_double(u) \ + u00 = __lfpd((double*)&(u).c00); \ + u01 = __lfpd((double*)&(u).c01); \ + u02 = __lfpd((double*)&(u).c02); \ + u10 = __lfpd((double*)&(u).c10); \ + u11 = __lfpd((double*)&(u).c11); \ + u12 = __lfpd((double*)&(u).c12); \ + reg20 = __lfpd((double*)&(u).c20); \ + reg03 = __fxpmul(reg00, __creal(u00)); \ + reg13 = __fxpmul(reg10, __creal(u00)); \ + reg04 = __fxpmul(reg00, __creal(u10)); \ + reg14 = __fxpmul(reg10, __creal(u10)); \ + reg05 = __fxpmul(reg00, __creal(reg20)); \ + reg15 = __fxpmul(reg10, 
__creal(reg20)); \ + reg03 = __fxcxnpma(reg03, reg00, __cimag(u00)); \ + reg13 = __fxcxnpma(reg13, reg10, __cimag(u00)); \ + reg04 = __fxcxnpma(reg04, reg00, __cimag(u10)); \ + reg14 = __fxcxnpma(reg14, reg10, __cimag(u10)); \ + reg05 = __fxcxnpma(reg05, reg00, __cimag(reg20)); \ + reg15 = __fxcxnpma(reg15, reg10, __cimag(reg20)); \ + reg21 = __lfpd((double*)&(u).c21); \ + reg03 = __fxcpmadd(reg03, reg01, __creal(u01)); \ + reg13 = __fxcpmadd(reg13, reg11, __creal(u01)); \ + reg04 = __fxcpmadd(reg04, reg01, __creal(u11)); \ + reg14 = __fxcpmadd(reg14, reg11, __creal(u11)); \ + reg05 = __fxcpmadd(reg05, reg01, __creal(reg21)); \ + reg15 = __fxcpmadd(reg15, reg11, __creal(reg21)); \ + reg03 = __fxcxnpma(reg03, reg01, __cimag(u01)); \ + reg13 = __fxcxnpma(reg13, reg11, __cimag(u01)); \ + reg04 = __fxcxnpma(reg04, reg01, __cimag(u11)); \ + reg14 = __fxcxnpma(reg14, reg11, __cimag(u11)); \ + reg05 = __fxcxnpma(reg05, reg01, __cimag(reg21)); \ + reg15 = __fxcxnpma(reg15, reg11, __cimag(reg21)); \ + u00 = __lfpd((double*)&(u).c22); \ + reg03 = __fxcpmadd(reg03, reg02, __creal(u02)); \ + reg13 = __fxcpmadd(reg13, reg12, __creal(u02)); \ + reg04 = __fxcpmadd(reg04, reg02, __creal(u12)); \ + reg14 = __fxcpmadd(reg14, reg12, __creal(u12)); \ + reg05 = __fxcpmadd(reg05, reg02, __creal(u00)); \ + reg15 = __fxcpmadd(reg15, reg12, __creal(u00)); \ + reg03 = __fxcxnpma(reg03, reg02, __cimag(u02)); \ + reg13 = __fxcxnpma(reg13, reg12, __cimag(u02)); \ + reg04 = __fxcxnpma(reg04, reg02, __cimag(u12)); \ + reg14 = __fxcxnpma(reg14, reg12, __cimag(u12)); \ + reg05 = __fxcxnpma(reg05, reg02, __cimag(u00)); \ + reg15 = __fxcxnpma(reg15, reg12, __cimag(u00)); + + +#define _bgl_su3_inverse_multiply(u) \ + reg03 = __fxpmul(reg00, __creal((u).c00)); \ + reg06 = __fxpmul(reg01, __creal((u).c10)); \ + reg04 = __fxpmul(reg00, __creal((u).c01)); \ + reg03 = __fpadd(reg06, reg03); \ + reg07 = __fxpmul(reg02, __creal((u).c21)); \ + reg05 = __fxpmul(reg00, __creal((u).c02)); \ + reg04 = 
__fpadd(reg07, reg04); \ + reg06 = __fxpmul(reg01, __creal((u).c12)); \ + reg07 = __fxpmul(reg02, __creal((u).c20)); \ + reg05 = __fpadd(reg06, reg05); \ + reg03 = __fpadd(reg07, reg03); \ + reg06 = __fxpmul(reg01, __creal((u).c11)); \ + reg07 = __fxpmul(reg02, __creal((u).c22)); \ + reg04 = __fpadd(reg06, reg04); \ + reg05 = __fpadd(reg07, reg05); \ + \ + reg03 = __fxcxnsma(reg03, reg00, __cimag((u).c00)); \ + reg04 = __fxcxnsma(reg04, reg01, __cimag((u).c11)); \ + reg05 = __fxcxnsma(reg05, reg02, __cimag((u).c22)); \ + reg04 = __fxcxnsma(reg04, reg00, __cimag((u).c01)); \ + reg03 = __fxcxnsma(reg03, reg01, __cimag((u).c10)); \ + reg05 = __fxcxnsma(reg05, reg00, __cimag((u).c02)); \ + reg03 = __fxcxnsma(reg03, reg02, __cimag((u).c20)); \ + reg05 = __fxcxnsma(reg05, reg01, __cimag((u).c12)); \ + reg04 = __fxcxnsma(reg04, reg02, __cimag((u).c21)); + +#define _bgl_su3_inverse_multiply_double(u) \ + u00 = __lfpd((double*)&(u).c00); \ + u01 = __lfpd((double*)&(u).c01); \ + u02 = __lfpd((double*)&(u).c02); \ + reg03 = __fxpmul(reg00, __creal(u00)); \ + reg13 = __fxpmul(reg10, __creal(u00)); \ + reg04 = __fxpmul(reg00, __creal(u01)); \ + reg14 = __fxpmul(reg10, __creal(u01)); \ + reg05 = __fxpmul(reg00, __creal(u02)); \ + reg15 = __fxpmul(reg10, __creal(u02)); \ + reg03 = __fxcxnsma(reg03, reg00, __cimag(u00)); \ + reg13 = __fxcxnsma(reg13, reg10, __cimag(u00)); \ + reg04 = __fxcxnsma(reg04, reg00, __cimag(u01)); \ + reg14 = __fxcxnsma(reg14, reg10, __cimag(u01)); \ + reg05 = __fxcxnsma(reg05, reg00, __cimag(u02)); \ + reg15 = __fxcxnsma(reg15, reg10, __cimag(u02)); \ + u10 = __lfpd((double*)&(u).c10); \ + u11 = __lfpd((double*)&(u).c11); \ + u12 = __lfpd((double*)&(u).c12); \ + reg03 = __fxcpmadd(reg03, reg01, __creal(u10)); \ + reg13 = __fxcpmadd(reg13, reg11, __creal(u10)); \ + reg04 = __fxcpmadd(reg04, reg01, __creal(u11)); \ + reg14 = __fxcpmadd(reg14, reg11, __creal(u11)); \ + reg05 = __fxcpmadd(reg05, reg01, __creal(u12)); \ + reg15 = __fxcpmadd(reg15, reg11, 
__creal(u12)); \ + reg03 = __fxcxnsma(reg03, reg01, __cimag(u10)); \ + reg13 = __fxcxnsma(reg13, reg11, __cimag(u10)); \ + reg04 = __fxcxnsma(reg04, reg01, __cimag(u11)); \ + reg14 = __fxcxnsma(reg14, reg11, __cimag(u11)); \ + reg05 = __fxcxnsma(reg05, reg01, __cimag(u12)); \ + reg15 = __fxcxnsma(reg15, reg11, __cimag(u12)); \ + u00 = __lfpd((double*)&(u).c20); \ + u01 = __lfpd((double*)&(u).c21); \ + u02 = __lfpd((double*)&(u).c22); \ + reg03 = __fxcpmadd(reg03, reg02, __creal(u00)); \ + reg13 = __fxcpmadd(reg13, reg12, __creal(u00)); \ + reg04 = __fxcpmadd(reg04, reg02, __creal(u01)); \ + reg14 = __fxcpmadd(reg14, reg12, __creal(u01)); \ + reg05 = __fxcpmadd(reg05, reg02, __creal(u02)); \ + reg15 = __fxcpmadd(reg15, reg12, __creal(u02)); \ + reg03 = __fxcxnsma(reg03, reg02, __cimag(u00)); \ + reg13 = __fxcxnsma(reg13, reg12, __cimag(u00)); \ + reg04 = __fxcxnsma(reg04, reg02, __cimag(u01)); \ + reg14 = __fxcxnsma(reg14, reg12, __cimag(u01)); \ + reg05 = __fxcxnsma(reg05, reg02, __cimag(u02)); \ + reg15 = __fxcxnsma(reg15, reg12, __cimag(u02)); + + +/* 35 cycles ! 
*/ +#define _prefetch_spinor(addr) \ + __dcbt(((char*)((unsigned long int)(addr)))); \ + __dcbt(((char*)((unsigned long int)(addr)))+32); \ + __dcbt(((char*)((unsigned long int)(addr)))+64); \ + __dcbt(((char*)((unsigned long int)(addr)))+96); \ + __dcbt(((char*)((unsigned long int)(addr)))+128); \ + __dcbt(((char*)((unsigned long int)(addr)))+164); + +#define _prefetch_spinor_for_store(addr) \ + __dcbz(((char*)((unsigned long int)(addr)))); \ + __dcbz(((char*)((unsigned long int)(addr)))+32); \ + __dcbz(((char*)((unsigned long int)(addr)))+64); \ + __dcbz(((char*)((unsigned long int)(addr)))+96); \ + __dcbz(((char*)((unsigned long int)(addr)))+128); \ + __dcbz(((char*)((unsigned long int)(addr)))+164); + +#define _prefetch_halfspinor_for_store(addr) \ + __dcbz(((char*)((unsigned long int)(addr)))); \ + __dcbz(((char*)((unsigned long int)(addr)))+32); \ + __dcbz(((char*)((unsigned long int)(addr)))+64); + +#define _prefetch_halfspinor(addr) \ + __dcbt(((char*)((unsigned long int)(addr)))); \ + __dcbt(((char*)((unsigned long int)(addr)))+32); \ + __dcbt(((char*)((unsigned long int)(addr)))+64); + +#define _prefetch_spinor2(addr) \ + __prefetch_by_load(((char*)((unsigned long int)(addr)))); \ + __prefetch_by_load(((char*)((unsigned long int)(addr)))+32); \ + __prefetch_by_load(((char*)((unsigned long int)(addr)))+64); \ + __prefetch_by_load(((char*)((unsigned long int)(addr)))+96); \ + __prefetch_by_load(((char*)((unsigned long int)(addr)))+128); \ + __prefetch_by_load(((char*)((unsigned long int)(addr)))+164); + + +#define _prefetch_su3(addr) \ + __dcbt(((char*)((unsigned long int)(addr)))); \ + __dcbt(((char*)((unsigned long int)(addr)))+32); \ + __dcbt(((char*)((unsigned long int)(addr)))+64); \ + __dcbt(((char*)((unsigned long int)(addr)))+96); \ + __dcbt(((char*)((unsigned long int)(addr)))+128); + +#define _prefetch_su32(addr) \ + __prefetch_by_load(((char*)((unsigned long int)(addr)))); \ + __prefetch_by_load(((char*)((unsigned long int)(addr)))+32); \ + 
__prefetch_by_load(((char*)((unsigned long int)(addr)))+64); \ + __prefetch_by_load(((char*)((unsigned long int)(addr)))+96); \ + __prefetch_by_load(((char*)((unsigned long int)(addr)))+128); + +#define _prefetch_spinor3(addr) \ + __prefetch_by_stream(1,((char*)((unsigned long int)(addr)))); \ + __prefetch_by_stream(1,((char*)((unsigned long int)(addr)))+32); \ + __prefetch_by_stream(1,((char*)((unsigned long int)(addr)))+64); \ + __prefetch_by_stream(1,((char*)((unsigned long int)(addr)))+96); \ + __prefetch_by_stream(1,((char*)((unsigned long int)(addr)))+128); \ + __prefetch_by_stream(1,((char*)((unsigned long int)(addr)))+164); + + +#define _prefetch_su33(addr) \ + __prefetch_by_stream(1,((char*)((unsigned long int)(addr)))); \ + __prefetch_by_stream(1,((char*)((unsigned long int)(addr)))+32); \ + __prefetch_by_stream(1,((char*)((unsigned long int)(addr)))+64); \ + __prefetch_by_stream(1,((char*)((unsigned long int)(addr)))+96); \ + __prefetch_by_stream(1,((char*)((unsigned long int)(addr)))+128); + + +/* computers u*w and stores result in regxx */ +#define _bgl_su3_times_su3(u, w) \ + u00 = __lfpd((double*)&(u).c00); \ + u01 = __lfpd((double*)&(u).c01); \ + u02 = __lfpd((double*)&(u).c02); \ + u10 = __lfpd((double*)&(u).c10); \ + u11 = __lfpd((double*)&(u).c11); \ + u12 = __lfpd((double*)&(u).c12); \ + u20 = __lfpd((double*)&(u).c20); \ + w00 = __lfpd((double*)&(w).c00); \ + w01 = __lfpd((double*)&(w).c01); \ + w02 = __lfpd((double*)&(w).c02); \ + reg00 = __fxpmul(w00, __creal(u00)); \ + reg10 = __fxpmul(w01, __creal(u00)); \ + reg20 = __fxpmul(w02, __creal(u00)); \ + reg01 = __fxpmul(w00, __creal(u10)); \ + reg11 = __fxpmul(w01, __creal(u10)); \ + reg21 = __fxpmul(w02, __creal(u10)); \ + reg02 = __fxpmul(w00, __creal(u20)); \ + reg12 = __fxpmul(w01, __creal(u20)); \ + reg22 = __fxpmul(w02, __creal(u20)); \ + w10 = __lfpd((double*)&(w).c10); \ + w11 = __lfpd((double*)&(w).c11); \ + w12 = __lfpd((double*)&(w).c12); \ + reg00 = __fxcxnpma(reg00, w00, 
__cimag(u00)); \ + reg10 = __fxcxnpma(reg10, w01, __cimag(u00)); \ + reg20 = __fxcxnpma(reg20, w02, __cimag(u00)); \ + reg01 = __fxcxnpma(reg01, w00, __cimag(u10)); \ + reg11 = __fxcxnpma(reg11, w01, __cimag(u10)); \ + reg21 = __fxcxnpma(reg21, w02, __cimag(u10)); \ + reg02 = __fxcxnpma(reg02, w00, __cimag(u20)); \ + reg12 = __fxcxnpma(reg12, w01, __cimag(u20)); \ + reg22 = __fxcxnpma(reg22, w02, __cimag(u20)); \ + u00 = __lfpd((double*)&(u).c21); \ + u10 = __lfpd((double*)&(u).c22); \ + reg00 = __fxcpmadd(reg00, w10, __creal(u01)); \ + reg10 = __fxcpmadd(reg10, w11, __creal(u01)); \ + reg20 = __fxcpmadd(reg20, w12, __creal(u01)); \ + reg01 = __fxcpmadd(reg01, w10, __creal(u11)); \ + reg11 = __fxcpmadd(reg11, w11, __creal(u11)); \ + reg21 = __fxcpmadd(reg21, w12, __creal(u11)); \ + reg02 = __fxcpmadd(reg02, w10, __creal(u00)); \ + reg12 = __fxcpmadd(reg12, w11, __creal(u00)); \ + reg22 = __fxcpmadd(reg22, w12, __creal(u00)); \ + w20 = __lfpd((double*)&(w).c20); \ + w01 = __lfpd((double*)&(w).c21); \ + w02 = __lfpd((double*)&(w).c22); \ + reg00 = __fxcxnpma(reg00, w10, __cimag(u01)); \ + reg10 = __fxcxnpma(reg10, w11, __cimag(u01)); \ + reg20 = __fxcxnpma(reg20, w12, __cimag(u01)); \ + reg01 = __fxcxnpma(reg01, w10, __cimag(u11)); \ + reg11 = __fxcxnpma(reg11, w11, __cimag(u11)); \ + reg21 = __fxcxnpma(reg21, w12, __cimag(u11)); \ + reg02 = __fxcxnpma(reg02, w10, __cimag(u00)); \ + reg12 = __fxcxnpma(reg12, w11, __cimag(u00)); \ + reg22 = __fxcxnpma(reg22, w12, __cimag(u00)); \ + reg00 = __fxcpmadd(reg00, w20, __creal(u02)); \ + reg10 = __fxcpmadd(reg10, w01, __creal(u02)); \ + reg20 = __fxcpmadd(reg20, w02, __creal(u02)); \ + reg01 = __fxcpmadd(reg01, w20, __creal(u12)); \ + reg11 = __fxcpmadd(reg11, w01, __creal(u12)); \ + reg21 = __fxcpmadd(reg21, w02, __creal(u12)); \ + reg02 = __fxcpmadd(reg02, w20, __creal(u10)); \ + reg12 = __fxcpmadd(reg12, w01, __creal(u10)); \ + reg22 = __fxcpmadd(reg22, w02, __creal(u10)); \ + reg00 = __fxcxnpma(reg00, w20, __cimag(u02)); 
\ + reg10 = __fxcxnpma(reg10, w01, __cimag(u02)); \ + reg20 = __fxcxnpma(reg20, w02, __cimag(u02)); \ + reg01 = __fxcxnpma(reg01, w20, __cimag(u12)); \ + reg11 = __fxcxnpma(reg11, w01, __cimag(u12)); \ + reg21 = __fxcxnpma(reg21, w02, __cimag(u12)); \ + reg02 = __fxcxnpma(reg02, w20, __cimag(u10)); \ + reg12 = __fxcxnpma(reg12, w01, __cimag(u10)); \ + reg22 = __fxcxnpma(reg22, w02, __cimag(u10)); + +/* computes u*w^{dag} and stores result in regxx */ + + + +/* + * computes u*regxx and adds the result to vxx. + * u00..u20 are scratch registers: once the (u).c00/(u).c10/(u).c20 terms + * are consumed, u00 and u10 are reloaded with (u).c21 and (u).c22 + * (same scheme as _bgl_su3_times_su3). u01 must stay intact at that + * point because the following terms still need (u).c01. + */ + +#define _bgl_su3_times_su3_acc(u) \ + u00 = __lfpd((double*)&(u).c00); \ + u01 = __lfpd((double*)&(u).c01); \ + u02 = __lfpd((double*)&(u).c02); \ + u10 = __lfpd((double*)&(u).c10); \ + u11 = __lfpd((double*)&(u).c11); \ + u12 = __lfpd((double*)&(u).c12); \ + u20 = __lfpd((double*)&(u).c20); \ + v00 = __fxcpmadd(v00, reg00, __creal(u00)); \ + v10 = __fxcpmadd(v10, reg01, __creal(u00)); \ + v20 = __fxcpmadd(v20, reg02, __creal(u00)); \ + v01 = __fxcpmadd(v01, reg00, __creal(u10)); \ + v11 = __fxcpmadd(v11, reg01, __creal(u10)); \ + v21 = __fxcpmadd(v21, reg02, __creal(u10)); \ + v02 = __fxcpmadd(v02, reg00, __creal(u20)); \ + v12 = __fxcpmadd(v12, reg01, __creal(u20)); \ + v22 = __fxcpmadd(v22, reg02, __creal(u20)); \ + v00 = __fxcxnpma(v00, reg00, __cimag(u00)); \ + v10 = __fxcxnpma(v10, reg01, __cimag(u00)); \ + v20 = __fxcxnpma(v20, reg02, __cimag(u00)); \ + v01 = __fxcxnpma(v01, reg00, __cimag(u10)); \ + v11 = __fxcxnpma(v11, reg01, __cimag(u10)); \ + v21 = __fxcxnpma(v21, reg02, __cimag(u10)); \ + v02 = __fxcxnpma(v02, reg00, __cimag(u20)); \ + v12 = __fxcxnpma(v12, reg01, __cimag(u20)); \ + v22 = __fxcxnpma(v22, reg02, __cimag(u20)); \ + u00 = __lfpd((double*)&(u).c21); \ + u10 = __lfpd((double*)&(u).c22); \ + v00 = __fxcpmadd(v00, reg10, __creal(u01)); \ + v10 = __fxcpmadd(v10, reg11, __creal(u01)); \ + v20 = __fxcpmadd(v20, reg12, __creal(u01)); \ + v01 = __fxcpmadd(v01, reg10, __creal(u11)); \ + v11 = __fxcpmadd(v11, reg11, __creal(u11)); \ + v21 = 
__fxcpmadd(v21, reg12, __creal(u11)); \ + v02 = __fxcpmadd(v02, reg10, __creal(u00)); \ + v12 = __fxcpmadd(v12, reg11, __creal(u00)); \ + v22 = __fxcpmadd(v22, reg12, __creal(u00)); \ + v00 = __fxcxnpma(v00, reg10, __cimag(u01)); \ + v10 = __fxcxnpma(v10, reg11, __cimag(u01)); \ + v20 = __fxcxnpma(v20, reg12, __cimag(u01)); \ + v01 = __fxcxnpma(v01, reg10, __cimag(u11)); \ + v11 = __fxcxnpma(v11, reg11, __cimag(u11)); \ + v21 = __fxcxnpma(v21, reg12, __cimag(u11)); \ + v02 = __fxcxnpma(v02, reg10, __cimag(u00)); \ + v12 = __fxcxnpma(v12, reg11, __cimag(u00)); \ + v22 = __fxcxnpma(v22, reg12, __cimag(u00)); \ + v00 = __fxcpmadd(v00, reg20, __creal(u02)); \ + v10 = __fxcpmadd(v10, reg21, __creal(u02)); \ + v20 = __fxcpmadd(v20, reg22, __creal(u02)); \ + v01 = __fxcpmadd(v01, reg20, __creal(u12)); \ + v11 = __fxcpmadd(v11, reg21, __creal(u12)); \ + v21 = __fxcpmadd(v21, reg22, __creal(u12)); \ + v02 = __fxcpmadd(v02, reg20, __creal(u10)); \ + v12 = __fxcpmadd(v12, reg21, __creal(u10)); \ + v22 = __fxcpmadd(v22, reg22, __creal(u10)); \ + v00 = __fxcxnpma(v00, reg20, __cimag(u02)); \ + v10 = __fxcxnpma(v10, reg21, __cimag(u02)); \ + v20 = __fxcxnpma(v20, reg22, __cimag(u02)); \ + v01 = __fxcxnpma(v01, reg20, __cimag(u12)); \ + v11 = __fxcxnpma(v11, reg21, __cimag(u12)); \ + v21 = __fxcxnpma(v21, reg22, __cimag(u12)); \ + v02 = __fxcxnpma(v02, reg20, __cimag(u10)); \ + v12 = __fxcxnpma(v12, reg21, __cimag(u10)); \ + v22 = __fxcxnpma(v22, reg22, __cimag(u10)); + +#define _bgl_store_vxx(v) \ + __stfpd((double*)&(v).c00, v00); \ + __stfpd((double*)&(v).c01, v01); \ + __stfpd((double*)&(v).c02, v02); \ + __stfpd((double*)&(v).c10, v10); \ + __stfpd((double*)&(v).c11, v11); \ + __stfpd((double*)&(v).c12, v12); \ + __stfpd((double*)&(v).c20, v20); \ + __stfpd((double*)&(v).c21, v21); \ + __stfpd((double*)&(v).c22, v22); \ + +#define _bgl_assign_rs0_to_reg0() \ + reg00 = rs00; \ + reg01 = rs01; \ + reg02 = rs02; + +#define _bgl_assign_rs0_to_reg1() \ + reg10 = rs00; \ + reg11 = 
rs01; \ + reg12 = rs02; + +#define _bgl_assign_rs1_to_reg0() \ + reg00 = rs10; \ + reg01 = rs11; \ + reg02 = rs12; + +#define _bgl_assign_rs1_to_reg1() \ + reg10 = rs10; \ + reg11 = rs11; \ + reg12 = rs12; + +#define _bgl_assign_rs2_to_reg0() \ + reg00 = rs20; \ + reg01 = rs21; \ + reg02 = rs22; + +#define _bgl_assign_rs2_to_reg1() \ + reg10 = rs20; \ + reg11 = rs21; \ + reg12 = rs22; + +#define _bgl_assign_rs3_to_reg0() \ + reg00 = rs30; \ + reg01 = rs31; \ + reg02 = rs32; + +#define _bgl_assign_rs3_to_reg1() \ + reg10 = rs30; \ + reg11 = rs31; \ + reg12 = rs32; + +#define _bgl_vector_add_rs2_to_rs0_reg0() \ + reg00 = __fpadd(rs00, rs20); \ + reg01 = __fpadd(rs01, rs21); \ + reg02 = __fpadd(rs02, rs22); + +#define _bgl_vector_add_rs3_to_rs0_reg0() \ + reg00 = __fpadd(rs00, rs30); \ + reg01 = __fpadd(rs01, rs31); \ + reg02 = __fpadd(rs02, rs32); + +#define _bgl_vector_add_rs3_to_rs1_reg1() \ + reg10 = __fpadd(rs10, rs30); \ + reg11 = __fpadd(rs11, rs31); \ + reg12 = __fpadd(rs12, rs32); + +#define _bgl_vector_add_rs2_to_rs1_reg1() \ + reg10 = __fpadd(rs10, rs20); \ + reg11 = __fpadd(rs11, rs21); \ + reg12 = __fpadd(rs12, rs22); + + +/* for deriv_Sb */ +#define _bgl_load_r0(s) \ + r00 = __lfpd((double*)&(s).c0); \ + r01 = __lfpd((double*)&(s).c1); \ + r02 = __lfpd((double*)&(s).c2); + +#define _bgl_load_r1(s) \ + r10 = __lfpd((double*)&(s).c0); \ + r11 = __lfpd((double*)&(s).c1); \ + r12 = __lfpd((double*)&(s).c2); + +#define _bgl_load_minus_r2(s) \ + r20 = -__lfpd((double*)&(s).c0); \ + r21 = -__lfpd((double*)&(s).c1); \ + r22 = -__lfpd((double*)&(s).c2); + +#define _bgl_load_minus_r3(s) \ + r30 = -__lfpd((double*)&(s).c0); \ + r31 = -__lfpd((double*)&(s).c1); \ + r32 = -__lfpd((double*)&(s).c2); + +/*c*/ +#define _bgl_add_to_reg0_reg1() \ + reg00 = __fpadd(reg10, reg00); \ + reg01 = __fpadd(reg11, reg01); \ + reg02 = __fpadd(reg12, reg02); + +/*c*/ +#define _bgl_add_to_reg0_reg1_up() \ + reg00 = __fpadd(reg13, reg00); \ + reg01 = __fpadd(reg14, reg01); \ + reg02 = 
__fpadd(reg15, reg02); + +/*c*/ +#define _bgl_sub_from_reg0_reg1() \ + reg00 = __fpsub(reg00, reg10); \ + reg01 = __fpsub(reg01, reg11); \ + reg02 = __fpsub(reg02, reg12); + +/*c*/ +#define _bgl_sub_from_reg0_reg1_up() \ + reg00 = __fpsub(reg00, reg13); \ + reg01 = __fpsub(reg01, reg14); \ + reg02 = __fpsub(reg02, reg15); + +/*c*/ +#define _bgl_sub_from_reg0_up_reg1() \ + reg03 = __fpsub(reg03, reg10); \ + reg04 = __fpsub(reg04, reg11); \ + reg05 = __fpsub(reg05, reg12); + +/*c*/ +#define _bgl_i_mul_add_to_reg0_reg1_up() \ + reg00 = __fxcxnpma(reg00, reg13, 1.); \ + reg01 = __fxcxnpma(reg01, reg14, 1.); \ + reg02 = __fxcxnpma(reg02, reg15, 1.); + +/*c*/ +#define _bgl_i_mul_add_to_reg0_reg1() \ + reg00 = __fxcxnpma(reg00, reg10, 1.); \ + reg01 = __fxcxnpma(reg01, reg11, 1.); \ + reg02 = __fxcxnpma(reg02, reg12, 1.); + +/*c*/ +#define _bgl_i_mul_add_to_reg0_up_reg1() \ + reg03 = __fxcxnpma(reg03, reg10, 1.); \ + reg04 = __fxcxnpma(reg04, reg11, 1.); \ + reg05 = __fxcxnpma(reg05, reg12, 1.); + +/*c*/ +#define _bgl_i_mul_add_to_reg0_up_reg1_up() \ + reg03 = __fxcxnpma(reg03, reg13, 1.); \ + reg04 = __fxcxnpma(reg04, reg14, 1.); \ + reg05 = __fxcxnpma(reg05, reg15, 1.); + +/*c*/ +#define _bgl_i_mul_sub_from_reg0_reg1_up() \ + reg00 = __fxcxnpma(reg00, reg13, -1.); \ + reg01 = __fxcxnpma(reg01, reg14, -1.); \ + reg02 = __fxcxnpma(reg02, reg15, -1.); + +/*c*/ +#define _bgl_i_mul_sub_from_reg0_reg1() \ + reg00 = __fxcxnpma(reg00, reg10, -1.); \ + reg01 = __fxcxnpma(reg01, reg11, -1.); \ + reg02 = __fxcxnpma(reg02, reg12, -1.); + +/*c*/ +#define _bgl_i_mul_sub_from_reg0_up_reg1_up() \ + reg03 = __fxcxnpma(reg03, reg13, -1.); \ + reg04 = __fxcxnpma(reg04, reg14, -1.); \ + reg05 = __fxcxnpma(reg05, reg15, -1.); + +/*c*/ +#define _bgl_i_mul_sub_from_reg0_up_reg1() \ + reg03 = __fxcxnpma(reg03, reg10, -1.); \ + reg04 = __fxcxnpma(reg04, reg11, -1.); \ + reg05 = __fxcxnpma(reg05, reg12, -1.); + +/*c*/ +#define _bgl_add_to_reg0_up_reg1_up() \ + reg03 = __fpadd(reg13, reg03); \ + 
reg04 = __fpadd(reg14, reg04); \ + reg05 = __fpadd(reg15, reg05); + +/*c*/ +#define _bgl_add_to_reg0_up_reg1() \ + reg03 = __fpadd(reg10, reg03); \ + reg04 = __fpadd(reg11, reg04); \ + reg05 = __fpadd(reg12, reg05); + +/*c*/ +#define _bgl_sub_from_reg0_up_reg1_up() \ + reg03 = __fpsub(reg03, reg13); \ + reg04 = __fpsub(reg04, reg14); \ + reg05 = __fpsub(reg05, reg15); + +#define _bgl_add_r0_to_r2_reg1() \ + reg10 = __fpadd(r00, r20); \ + reg11 = __fpadd(r01, r21); \ + reg12 = __fpadd(r02, r22); + +/*c*/ +#define _bgl_add_r0_to_r3_reg1() \ + reg10 = __fpadd(r00, r30); \ + reg11 = __fpadd(r01, r31); \ + reg12 = __fpadd(r02, r32); + +/*c*/ +#define _bgl_add_r1_to_r2_reg1_up() \ + reg13 = __fpadd(r10, r20); \ + reg14 = __fpadd(r11, r21); \ + reg15 = __fpadd(r12, r22); + +/*c*/ +#define _bgl_i_mul_add_r0_to_r3_reg1() \ + reg10 = __fxcxnpma(r00, r30, 1.); \ + reg11 = __fxcxnpma(r01, r31, 1.); \ + reg12 = __fxcxnpma(r02, r32, 1.); + +/*c*/ +#define _bgl_i_mul_add_r1_to_r2_reg1_up() \ + reg13 = __fxcxnpma(r10, r20, 1.); \ + reg14 = __fxcxnpma(r11, r21, 1.); \ + reg15 = __fxcxnpma(r12, r22, 1.); + +/*c*/ +#define _bgl_i_mul_add_r1_to_r3_reg1_up() \ + reg13 = __fxcxnpma(r10, r30, 1.); \ + reg14 = __fxcxnpma(r11, r31, 1.); \ + reg15 = __fxcxnpma(r12, r32, 1.); + +/*c*/ +#define _bgl_i_mul_add_r0_to_r2_reg1() \ + reg10 = __fxcxnpma(r00, r20, 1.); \ + reg11 = __fxcxnpma(r01, r21, 1.); \ + reg12 = __fxcxnpma(r02, r22, 1.); + +/*c*/ +#define _bgl_i_mul_sub_from_r0_r3_reg1() \ + reg10 = __fxcxnpma(r00, r30, -1.); \ + reg11 = __fxcxnpma(r01, r31, -1.); \ + reg12 = __fxcxnpma(r02, r32, -1.); + +/*c*/ +#define _bgl_i_mul_sub_from_r0_r2_reg1() \ + reg10 = __fxcxnpma(r00, r20, -1.); \ + reg11 = __fxcxnpma(r01, r21, -1.); \ + reg12 = __fxcxnpma(r02, r22, -1.); + +/*c*/ +#define _bgl_i_mul_sub_from_r1_r3_reg1_up() \ + reg13 = __fxcxnpma(r10, r30, -1.); \ + reg14 = __fxcxnpma(r11, r31, -1.); \ + reg15 = __fxcxnpma(r12, r32, -1.); + +/*c*/ +#define _bgl_i_mul_sub_from_r1_r2_reg1_up() \ + 
reg13 = __fxcxnpma(r10, r20, -1.); \ + reg14 = __fxcxnpma(r11, r21, -1.); \ + reg15 = __fxcxnpma(r12, r22, -1.); + +/*c*/ +#define _bgl_sub_from_r0_r2_reg1() \ + reg10 = __fpsub(r00, r20); \ + reg11 = __fpsub(r01, r21); \ + reg12 = __fpsub(r02, r22); + +/*c*/ +#define _bgl_sub_from_r0_r3_reg1() \ + reg10 = __fpsub(r00, r30); \ + reg11 = __fpsub(r01, r31); \ + reg12 = __fpsub(r02, r32); + + +#define _bgl_sub_from_r1_r2_reg1_up() \ + reg13 = __fpsub(r10, r20); \ + reg14 = __fpsub(r11, r21); \ + reg15 = __fpsub(r12, r22); + +#define _bgl_add_r1_to_r3_reg1_up() \ + reg13 = __fpadd(r10, r30); \ + reg14 = __fpadd(r11, r31); \ + reg15 = __fpadd(r12, r32); + +/*c*/ +#define _bgl_sub_from_r1_r3_reg1_up() \ + reg13 = __fpsub(r10, r30); \ + reg14 = __fpsub(r11, r31); \ + reg15 = __fpsub(r12, r32); + +/* reg1 tensor reg0^dagger */ +/* computes tensor product of reg0y with reg1x, x=0,1,2 */ +/* and tensor product of reg0y with reg1x, x,y=3,4,5 */ +/* adds the results and stores them in vxy */ +/* 9th element is stored in reg00 */ + +#define _bgl_tensor_product_and_add() \ + v00 = __fxpmul(reg00, __creal(reg10)); \ + v01 = __fxpmul(reg00, __creal(reg11)); \ + v02 = __fxpmul(reg00, __creal(reg12)); \ + v00 = __fxcxnsma(v00, reg00, __cimag(reg10)); \ + v01 = __fxcxnsma(v01, reg00, __cimag(reg11)); \ + v02 = __fxcxnsma(v02, reg00, __cimag(reg12)); \ + v00 = __fxcpmadd(v00, reg03, __creal(reg13)); \ + v01 = __fxcpmadd(v01, reg03, __creal(reg14)); \ + v02 = __fxcpmadd(v02, reg03, __creal(reg15)); \ + v00 = __fxcxnsma(v00, reg03, __cimag(reg13)); \ + v01 = __fxcxnsma(v01, reg03, __cimag(reg14)); \ + v02 = __fxcxnsma(v02, reg03, __cimag(reg15)); \ + v10 = __fxpmul(reg01, __creal(reg10)); \ + v11 = __fxpmul(reg01, __creal(reg11)); \ + v12 = __fxpmul(reg01, __creal(reg12)); \ + v10 = __fxcxnsma(v10, reg01, __cimag(reg10)); \ + v11 = __fxcxnsma(v11, reg01, __cimag(reg11)); \ + v12 = __fxcxnsma(v12, reg01, __cimag(reg12)); \ + v10 = __fxcpmadd(v10, reg04, __creal(reg13)); \ + v11 = 
__fxcpmadd(v11, reg04, __creal(reg14)); \ + v12 = __fxcpmadd(v12, reg04, __creal(reg15)); \ + v10 = __fxcxnsma(v10, reg04, __cimag(reg13)); \ + v11 = __fxcxnsma(v11, reg04, __cimag(reg14)); \ + v12 = __fxcxnsma(v12, reg04, __cimag(reg15)); \ + v20 = __fxpmul(reg02, __creal(reg10)); \ + v21 = __fxpmul(reg02, __creal(reg11)); \ + reg00 = __fxpmul(reg02, __creal(reg12)); \ + v20 = __fxcxnsma(v20, reg02, __cimag(reg10)); \ + v21 = __fxcxnsma(v21, reg02, __cimag(reg11)); \ + reg00 = __fxcxnsma(reg00, reg02, __cimag(reg12)); \ + v20 = __fxcpmadd(v20, reg05, __creal(reg13)); \ + v21 = __fxcpmadd(v21, reg05, __creal(reg14)); \ + reg00 = __fxcpmadd(reg00, reg05, __creal(reg15)); \ + v20 = __fxcxnsma(v20, reg05, __cimag(reg13)); \ + v21 = __fxcxnsma(v21, reg05, __cimag(reg14)); \ + reg00 = __fxcxnsma(reg00, reg05, __cimag(reg15)); \ + +/* reg0 tensor reg1^dagger */ +/* computes tensor product of reg1y with reg0x, x,y=0,1,2 */ +/* and tensor product of reg1y with reg0x, x,y=3,4,5 */ +/* adds the results and stores them in vxy */ +/* not that the result is the same as in the non-bgl case, but daggered */ +/* 9th element is stored in reg00 */ + +#define _bgl_tensor_product_and_add_d() \ + v00 = __fxpmul(reg10, __creal(reg00)); \ + v01 = __fxpmul(reg10, __creal(reg01)); \ + v02 = __fxpmul(reg10, __creal(reg02)); \ + v00 = __fxcxnsma(v00, reg10, __cimag(reg00)); \ + v01 = __fxcxnsma(v01, reg10, __cimag(reg01)); \ + v02 = __fxcxnsma(v02, reg10, __cimag(reg02)); \ + v00 = __fxcpmadd(v00, reg13, __creal(reg03)); \ + v01 = __fxcpmadd(v01, reg13, __creal(reg04)); \ + v02 = __fxcpmadd(v02, reg13, __creal(reg05)); \ + v00 = __fxcxnsma(v00, reg13, __cimag(reg03)); \ + v01 = __fxcxnsma(v01, reg13, __cimag(reg04)); \ + v02 = __fxcxnsma(v02, reg13, __cimag(reg05)); \ + v10 = __fxpmul(reg11, __creal(reg00)); \ + v11 = __fxpmul(reg11, __creal(reg01)); \ + v12 = __fxpmul(reg11, __creal(reg02)); \ + v10 = __fxcxnsma(v10, reg11, __cimag(reg00)); \ + v11 = __fxcxnsma(v11, reg11, __cimag(reg01)); 
\ + v12 = __fxcxnsma(v12, reg11, __cimag(reg02)); \ + v10 = __fxcpmadd(v10, reg14, __creal(reg03)); \ + v11 = __fxcpmadd(v11, reg14, __creal(reg04)); \ + v12 = __fxcpmadd(v12, reg14, __creal(reg05)); \ + v10 = __fxcxnsma(v10, reg14, __cimag(reg03)); \ + v11 = __fxcxnsma(v11, reg14, __cimag(reg04)); \ + v12 = __fxcxnsma(v12, reg14, __cimag(reg05)); \ + v20 = __fxpmul(reg12, __creal(reg00)); \ + v21 = __fxpmul(reg12, __creal(reg01)); \ + reg10 = __fxpmul(reg12, __creal(reg02)); \ + v20 = __fxcxnsma(v20, reg12, __cimag(reg00)); \ + v21 = __fxcxnsma(v21, reg12, __cimag(reg01)); \ + reg00 = __fxcxnsma(reg10, reg12, __cimag(reg02)); \ + v20 = __fxcpmadd(v20, reg15, __creal(reg03)); \ + v21 = __fxcpmadd(v21, reg15, __creal(reg04)); \ + reg00 = __fxcpmadd(reg00, reg15, __creal(reg05)); \ + v20 = __fxcxnsma(v20, reg15, __cimag(reg03)); \ + v21 = __fxcxnsma(v21, reg15, __cimag(reg04)); \ + reg00 = __fxcxnsma(reg00, reg15, __cimag(reg05)); \ + +/* computes tensor product of reg0x with reg1x, x=0,1,2 */ +/* and tensor product of reg0x with reg1x, x=3,4,5 */ +/* adds the results and stores their complex */ +/* conjugate in vxy transposed */ +/* 9th element is stored in reg00 */ + +#define _bgl_tensor_product_and_add_dagger() \ + v00 = __fxpmul(reg10, __creal(reg00)); \ + v01 = __fxpmul(reg10, __creal(reg01)); \ + v02 = __fxpmul(reg10, __creal(reg02)); \ + v00 = __fxcxnsma(v00, reg10, __cimag(reg00)); \ + v01 = __fxcxnsma(v01, reg10, __cimag(reg01)); \ + v02 = __fxcxnsma(v02, reg10, __cimag(reg02)); \ + v00 = __fxcpmadd(v00, reg13, __creal(reg03)); \ + v01 = __fxcpmadd(v01, reg13, __creal(reg04)); \ + v02 = __fxcpmadd(v02, reg13, __creal(reg05)); \ + v00 = __fxcxnsma(v00, reg13, __cimag(reg03)); \ + v01 = __fxcxnsma(v01, reg13, __cimag(reg04)); \ + v02 = __fxcxnsma(v02, reg13, __cimag(reg05)); \ + v10 = __fxpmul(reg11, __creal(reg00)); \ + v11 = __fxpmul(reg11, __creal(reg01)); \ + v12 = __fxpmul(reg11, __creal(reg02)); \ + v10 = __fxcxnsma(v10, reg11, __cimag(reg00)); \ + v11 = 
__fxcxnsma(v11, reg11, __cimag(reg01)); \ + v12 = __fxcxnsma(v12, reg11, __cimag(reg02)); \ + v10 = __fxcpmadd(v10, reg14, __creal(reg03)); \ + v11 = __fxcpmadd(v11, reg14, __creal(reg04)); \ + v12 = __fxcpmadd(v12, reg14, __creal(reg05)); \ + v10 = __fxcxnsma(v10, reg14, __cimag(reg03)); \ + v11 = __fxcxnsma(v11, reg14, __cimag(reg04)); \ + v12 = __fxcxnsma(v12, reg14, __cimag(reg05)); \ + v20 = __fxpmul(reg12, __creal(reg00)); \ + v21 = __fxpmul(reg12, __creal(reg01)); \ + v20 = __fxcxnsma(v20, reg12, __cimag(reg00)); /* read Im(reg00) before reg00 is overwritten below */ \ + v21 = __fxcxnsma(v21, reg12, __cimag(reg01)); \ + reg00 = __fxpmul(reg12, __creal(reg02)); /* from here on reg00 accumulates the 9th element */ \ + reg00 = __fxcxnsma(reg00, reg12, __cimag(reg02)); \ + v20 = __fxcpmadd(v20, reg15, __creal(reg03)); \ + v21 = __fxcpmadd(v21, reg15, __creal(reg04)); \ + reg00 = __fxcpmadd(reg00, reg15, __creal(reg05)); \ + v20 = __fxcxnsma(v20, reg15, __cimag(reg03)); \ + v21 = __fxcxnsma(v21, reg15, __cimag(reg04)); \ + reg00 = __fxcxnsma(reg00, reg15, __cimag(reg05)); \ + +/* computes u*v^dagger */ +/* result is written to r00..r22; v22 is read from reg00 */ +#define _bgl_su3_times_v_dagger(u) \ + r00 = __lfpd((double*)&(u).c00); \ + r01 = __lfpd((double*)&(u).c01); \ + r02 = __lfpd((double*)&(u).c02); \ + r10 = __lfpd((double*)&(u).c10); \ + r11 = __lfpd((double*)&(u).c11); \ + r12 = __lfpd((double*)&(u).c12); \ + r20 = __lfpd((double*)&(u).c20); \ + r21 = __lfpd((double*)&(u).c21); \ + r22 = __lfpd((double*)&(u).c22); \ + reg03 = __fxpmul(r00, __creal(v00)); \ + reg10 = __fxpmul(r10, __creal(v00)); \ + reg13 = __fxpmul(r20, __creal(v00)); \ + reg04 = __fxpmul(r00, __creal(v10)); \ + reg11 = __fxpmul(r10, __creal(v10)); \ + reg14 = __fxpmul(r20, __creal(v10)); \ + reg05 = __fxpmul(r00, __creal(v20)); \ + reg12 = __fxpmul(r10, __creal(v20)); \ + reg15 = __fxpmul(r20, __creal(v20)); \ + reg03 = __fxcxnsma(reg03, r00, __cimag(v00)); \ + reg10 = __fxcxnsma(reg10, r10, __cimag(v00)); \ + reg13 = __fxcxnsma(reg13, r20, __cimag(v00)); \ + reg04 = __fxcxnsma(reg04, r00, __cimag(v10)); \ + reg11 = 
__fxcxnsma(reg11, r10, __cimag(v10)); \ + reg14 = __fxcxnsma(reg14, r20, __cimag(v10)); \ + reg05 = __fxcxnsma(reg05, r00, __cimag(v20)); \ + reg12 = __fxcxnsma(reg12, r10, __cimag(v20)); \ + reg15 = __fxcxnsma(reg15, r20, __cimag(v20)); \ + reg03 = __fxcpmadd(reg03, r01, __creal(v01)); \ + reg10 = __fxcpmadd(reg10, r11, __creal(v01)); \ + reg13 = __fxcpmadd(reg13, r21, __creal(v01)); \ + reg04 = __fxcpmadd(reg04, r01, __creal(v11)); \ + reg11 = __fxcpmadd(reg11, r11, __creal(v11)); \ + reg14 = __fxcpmadd(reg14, r21, __creal(v11)); \ + reg05 = __fxcpmadd(reg05, r01, __creal(v21)); \ + reg12 = __fxcpmadd(reg12, r11, __creal(v21)); \ + reg15 = __fxcpmadd(reg15, r21, __creal(v21)); \ + reg03 = __fxcxnsma(reg03, r01, __cimag(v01)); \ + reg10 = __fxcxnsma(reg10, r11, __cimag(v01)); \ + reg13 = __fxcxnsma(reg13, r21, __cimag(v01)); \ + reg04 = __fxcxnsma(reg04, r01, __cimag(v11)); \ + reg11 = __fxcxnsma(reg11, r11, __cimag(v11)); \ + reg14 = __fxcxnsma(reg14, r21, __cimag(v11)); \ + reg05 = __fxcxnsma(reg05, r01, __cimag(v21)); \ + reg12 = __fxcxnsma(reg12, r11, __cimag(v21)); \ + reg15 = __fxcxnsma(reg15, r21, __cimag(v21)); \ + reg03 = __fxcpmadd(reg03, r02, __creal(v02)); \ + reg10 = __fxcpmadd(reg10, r12, __creal(v02)); \ + reg13 = __fxcpmadd(reg13, r22, __creal(v02)); \ + reg04 = __fxcpmadd(reg04, r02, __creal(v12)); \ + reg11 = __fxcpmadd(reg11, r12, __creal(v12)); \ + reg14 = __fxcpmadd(reg14, r22, __creal(v12)); \ + reg05 = __fxcpmadd(reg05, r02, __creal(reg00)); \ + reg12 = __fxcpmadd(reg12, r12, __creal(reg00)); \ + reg15 = __fxcpmadd(reg15, r22, __creal(reg00)); \ + r00 = __fxcxnsma(reg03, r02, __cimag(v02)); \ + r10 = __fxcxnsma(reg10, r12, __cimag(v02)); \ + r20 = __fxcxnsma(reg13, r22, __cimag(v02)); \ + r01 = __fxcxnsma(reg04, r02, __cimag(v12)); \ + r11 = __fxcxnsma(reg11, r12, __cimag(v12)); \ + r21 = __fxcxnsma(reg14, r22, __cimag(v12)); \ + r02 = __fxcxnsma(reg05, r02, __cimag(reg00)); \ + r12 = __fxcxnsma(reg12, r12, __cimag(reg00)); \ + r22 = 
__fxcxnsma(reg15, r22, __cimag(reg00)); + +/* computes u*v (no dagger) */ +/* result is written to r00..r22; v22 is read from reg00 */ +#define _bgl_su3_times_v(u) \ + r00 = __lfpd((double*)&(u).c00); \ + r01 = __lfpd((double*)&(u).c01); \ + r02 = __lfpd((double*)&(u).c02); \ + r10 = __lfpd((double*)&(u).c10); \ + r11 = __lfpd((double*)&(u).c11); \ + r12 = __lfpd((double*)&(u).c12); \ + r20 = __lfpd((double*)&(u).c20); \ + r21 = __lfpd((double*)&(u).c21); \ + r22 = __lfpd((double*)&(u).c22); \ + reg03 = __fxpmul(r00, __creal(v00)); \ + reg10 = __fxpmul(r10, __creal(v00)); \ + reg13 = __fxpmul(r20, __creal(v00)); \ + reg04 = __fxpmul(r00, __creal(v01)); \ + reg11 = __fxpmul(r10, __creal(v01)); \ + reg14 = __fxpmul(r20, __creal(v01)); \ + reg05 = __fxpmul(r00, __creal(v02)); \ + reg12 = __fxpmul(r10, __creal(v02)); \ + reg15 = __fxpmul(r20, __creal(v02)); \ + reg03 = __fxcxnpma(reg03, r00, __cimag(v00)); \ + reg10 = __fxcxnpma(reg10, r10, __cimag(v00)); \ + reg13 = __fxcxnpma(reg13, r20, __cimag(v00)); \ + reg04 = __fxcxnpma(reg04, r00, __cimag(v01)); \ + reg11 = __fxcxnpma(reg11, r10, __cimag(v01)); \ + reg14 = __fxcxnpma(reg14, r20, __cimag(v01)); \ + reg05 = __fxcxnpma(reg05, r00, __cimag(v02)); \ + reg12 = __fxcxnpma(reg12, r10, __cimag(v02)); \ + reg15 = __fxcxnpma(reg15, r20, __cimag(v02)); \ + reg03 = __fxcpmadd(reg03, r01, __creal(v10)); \ + reg10 = __fxcpmadd(reg10, r11, __creal(v10)); \ + reg13 = __fxcpmadd(reg13, r21, __creal(v10)); \ + reg04 = __fxcpmadd(reg04, r01, __creal(v11)); \ + reg11 = __fxcpmadd(reg11, r11, __creal(v11)); \ + reg14 = __fxcpmadd(reg14, r21, __creal(v11)); \ + reg05 = __fxcpmadd(reg05, r01, __creal(v12)); \ + reg12 = __fxcpmadd(reg12, r11, __creal(v12)); \ + reg15 = __fxcpmadd(reg15, r21, __creal(v12)); \ + reg03 = __fxcxnpma(reg03, r01, __cimag(v10)); \ + reg10 = __fxcxnpma(reg10, r11, __cimag(v10)); \ + reg13 = __fxcxnpma(reg13, r21, __cimag(v10)); \ + reg04 = __fxcxnpma(reg04, r01, __cimag(v11)); \ + reg11 = __fxcxnpma(reg11, r11, __cimag(v11)); \ + reg14 = 
__fxcxnpma(reg14, r21, __cimag(v11)); \ + reg05 = __fxcxnpma(reg05, r01, __cimag(v12)); \ + reg12 = __fxcxnpma(reg12, r11, __cimag(v12)); \ + reg15 = __fxcxnpma(reg15, r21, __cimag(v12)); \ + reg03 = __fxcpmadd(reg03, r02, __creal(v20)); \ + reg10 = __fxcpmadd(reg10, r12, __creal(v20)); \ + reg13 = __fxcpmadd(reg13, r22, __creal(v20)); \ + reg04 = __fxcpmadd(reg04, r02, __creal(v21)); \ + reg11 = __fxcpmadd(reg11, r12, __creal(v21)); \ + reg14 = __fxcpmadd(reg14, r22, __creal(v21)); \ + reg05 = __fxcpmadd(reg05, r02, __creal(reg00)); \ + reg12 = __fxcpmadd(reg12, r12, __creal(reg00)); \ + reg15 = __fxcpmadd(reg15, r22, __creal(reg00)); \ + r00 = __fxcxnpma(reg03, r02, __cimag(v20)); \ + r10 = __fxcxnpma(reg10, r12, __cimag(v20)); \ + r20 = __fxcxnpma(reg13, r22, __cimag(v20)); \ + r01 = __fxcxnpma(reg04, r02, __cimag(v21)); \ + r11 = __fxcxnpma(reg11, r12, __cimag(v21)); \ + r21 = __fxcxnpma(reg14, r22, __cimag(v21)); \ + r02 = __fxcxnpma(reg05, r02, __cimag(reg00)); \ + r12 = __fxcxnpma(reg12, r12, __cimag(reg00)); \ + r22 = __fxcxnpma(reg15, r22, __cimag(reg00)); + +/* scales r00..r22 in place by the complex value c; c is loaded into reg00, and reg03-reg05, reg10-reg15 are used as scratch */ +#define _bgl_complex_times_r(c) \ + reg00 = __lfpd((double*)&(c)); \ + reg03 = __fxpmul(r00, __creal(reg00)); \ + reg10 = __fxpmul(r10, __creal(reg00)); \ + reg13 = __fxpmul(r20, __creal(reg00)); \ + reg04 = __fxpmul(r01, __creal(reg00)); \ + reg11 = __fxpmul(r11, __creal(reg00)); \ + reg14 = __fxpmul(r21, __creal(reg00)); \ + reg05 = __fxpmul(r02, __creal(reg00)); \ + reg12 = __fxpmul(r12, __creal(reg00)); \ + reg15 = __fxpmul(r22, __creal(reg00)); \ + r00 = __fxcxnpma(reg03, r00, __cimag(reg00)); \ + r10 = __fxcxnpma(reg10, r10, __cimag(reg00)); \ + r20 = __fxcxnpma(reg13, r20, __cimag(reg00)); \ + r01 = __fxcxnpma(reg04, r01, __cimag(reg00)); \ + r11 = __fxcxnpma(reg11, r11, __cimag(reg00)); \ + r21 = __fxcxnpma(reg14, r21, __cimag(reg00)); \ + r02 = __fxcxnpma(reg05, r02, __cimag(reg00)); \ + r12 = __fxcxnpma(reg12, r12, __cimag(reg00)); \ + r22 = __fxcxnpma(reg15, r22, __cimag(reg00)); + +#define 
_bgl_trace_lambda_add_assign(r) \ + (r).d1+= (-__cimag(r10) - __cimag(r01)); \ + (r).d2+= (+__creal(r10) - __creal(r01)); \ + (r).d3+= (-__cimag(r00) + __cimag(r11)); \ + (r).d4+= (-__cimag(r20) - __cimag(r02)); \ + (r).d5+= (+__creal(r20) - __creal(r02)); \ + (r).d6+= (-__cimag(r21) - __cimag(r12)); \ + (r).d7+= (+__creal(r21) - __creal(r12)); \ + (r).d8+= ((-__cimag(r00) - __cimag(r11) + 2.*__cimag(r22))*0.577350269189625); + +#define _bgl_trace_lambda_mul_add_assign(r, c) /* same terms as _bgl_trace_lambda_add_assign, each scaled by (c) */ \ + (r).d1+= (c) * (-__cimag(r10) - __cimag(r01)); \ + (r).d2+= (c) * (+__creal(r10) - __creal(r01)); \ + (r).d3+= (c) * (-__cimag(r00) + __cimag(r11)); \ + (r).d4+= (c) * (-__cimag(r20) - __cimag(r02)); \ + (r).d5+= (c) * (+__creal(r20) - __creal(r02)); \ + (r).d6+= (c) * (-__cimag(r21) - __cimag(r12)); \ + (r).d7+= (c) * (+__creal(r21) - __creal(r12)); \ + (r).d8+= (c) * ((-__cimag(r00) - __cimag(r11) + 2.*__cimag(r22))*0.577350269189625); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/bgq.h b/qcd/part_cpu/applications/QCD/src/kernel_D/bgq.h new file mode 100644 index 0000000000000000000000000000000000000000..552e69761d267d74839ccdd37a18db00e7c5c7f0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/bgq.h @@ -0,0 +1,138 @@ +#ifndef _BGQ_H +#define _BGQ_H + +#include "bgq2.h" + +#define _vec_load_spinor(r0, r1, r2, r3, r4, r5, phi) \ + r0 = vec_ld(0L, (double*) &(phi).c0); \ + r1 = vec_ld(32L, (double*) &(phi).c0); \ + r2 = vec_ld(64L, (double*) &(phi).c0); \ + r3 = vec_ld(96L, (double*) &(phi).c0); \ + r4 = vec_ld(128L, (double*) &(phi).c0); \ + r5 = vec_ld(160L, (double*) &(phi).c0); + +#define _vec_load_halfspinor(r0, r1, r2, phi) \ + r0 = vec_ld(0L, (double*) &(phi).c0); \ + r1 = vec_ld(32L, (double*) &(phi).c0); \ + r2 = vec_ld(64L, (double*) &(phi).c0); + +#define _vec_load_halfspinor_32(r0, r1, r2, phi) \ + r0 = vec_ld(0L, (float*) &(phi).c0); \ + r1 = vec_ld(16L, (float*) &(phi).c0); \ + r2 = vec_ld(32L, (float*) &(phi).c0); + + +#define _vec_store_spinor(phi, r0, r1, 
r2, r3, r4, r5) \ + vec_st(r0, 0L, (double*) &(phi).c0); \ + vec_st(r1, 32L, (double*) &(phi).c0); \ + vec_st(r2, 64L, (double*) &(phi).c0); \ + vec_st(r3, 96L, (double*) &(phi).c0); \ + vec_st(r4, 128L, (double*) &(phi).c0); \ + vec_st(r5, 160L, (double*) &(phi).c0); + +#define _vec_add_ul_spinor(rs0, rs1, rs2, r0, r1, r2, r3, r4, r5) \ + rs0 = vec_add(r0, r3); \ + rs1 = vec_add(r1, r4); \ + rs2 = vec_add(r2, r5); + +#define _vec_sub_ul_spinor(rs0, rs1, rs2, r0, r1, r2, r3, r4, r5) \ + rs0 = vec_sub(r0, r3); \ + rs1 = vec_sub(r1, r4); \ + rs2 = vec_sub(r2, r5); + +// requires 32 byte alignment of phi +#define _vec_load(r0, r1, phi) \ + r0 = vec_ld(0L, (double*) &(phi).c0); \ + r1 = vec_ld2(0L, (double*) &(phi).c2); + +#define _vec_load_32(r0, r1, phi) \ + r0 = vec_ld(0L, (float*) &(phi).c0); \ + r1 = vec_ld2(0L, (float*) &(phi).c2); + + +// works also with 16 byte alignement of phi +#define _vec_load16(r0, r1, phi, tmp) \ + r0 = vec_ld2(0L, (double*) &(phi).c0); \ + r1 = vec_ld(0L, (double*) &(phi).c1); \ + tmp = vec_gpci(00145); \ + r0 = vec_perm(r0, r1, tmp); \ + tmp = vec_gpci(02301); \ + r1 = vec_perm(r1, r0, tmp); + +#define _vec_load16_32(r0, r1, phi, tmp) \ + r0 = vec_ld2(0L, (float*) &(phi).c0); \ + r1 = vec_ld(0L, (float*) &(phi).c1); \ + tmp = vec_gpci(00145); \ + r0 = vec_perm(r0, r1, tmp); \ + tmp = vec_gpci(02301); \ + r1 = vec_perm(r1, r0, tmp); + + +// alternative +#define _vec_load16c(r0, r1, phi, tmp) \ + r0 = vec_ld2(0L, (double*) &(phi).c0); \ + r1 = vec_ld(0L, (double*) &(phi).c1); \ + tmp = vec_gpci(00145); \ + r0 = vec_perm(r0, r1, tmp); \ + r1 = vec_ld2(0L, (double*) &(phi).c2); + +// requires 32 byte alignment of phi +#define _vec_store(phi, r0, r1) \ + vec_st((r0), 0L, (double*) &(phi).c0); \ + vec_st2((r1), 0L, (double*) &(phi).c2); + + +// requires 16 byte alignment of phi +#define _vec_store_32(phi, r0, r1) \ + vec_st((r0), 0L, (float*) &(phi).c0); \ + vec_st2((r1), 0L, (float*) &(phi).c2); + + +// requires 16 (and must not be 32) byte 
alignment of phi +#define _vec_store16(phi, r0, r1, tmp) \ + vec_st2((r0), 0L, (double*) &(phi).c0); \ + tmp = vec_gpci(02345); \ + r0 = vec_perm(r0, r1, tmp); \ + vec_st((r0), 0L, (double *) &(phi).c1); + + +// requires 8 (and must not be 16) byte alignment of phi +#define _vec_store16_32(phi, r0, r1, tmp) \ + vec_st2((r0), 0L, (float*) &(phi).c0); \ + tmp = vec_gpci(02345); \ + r0 = vec_perm(r0, r1, tmp); \ + vec_st((r0), 0L, (float *) &(phi).c1); + + +// requires 32 byte alignment of phi +#define _vec_store_halfspinor(phi, r0, r1, r2) \ + vec_st((r0), 0L, (double*) &(phi).c0); \ + vec_st((r1), 32L, (double*) &(phi).c0); \ + vec_st((r2), 64L, (double*) &(phi).c0); + + // requires 16 byte alignment of phi +#define _vec_store_halfspinor_32(phi, r0, r1, r2) \ + vec_st((r0), 0L, (float*) &(phi).c0); \ + vec_st((r1), 16L, (float*) &(phi).c0); \ + vec_st((r2), 32L, (float*) &(phi).c0); + + +#define _vec_add(rs0, rs1, r0, r1, s0, s1) \ + rs0 = vec_add(r0, s0); \ + rs1 = vec_add(r1, s1); + +#define _vec_sub(rs0, rs1, r0, r1, s0, s1) \ + rs0 = vec_sub(r0, s0); \ + rs1 = vec_sub(r1, s1); + +#define _vec_i_mul_add(rs0, rs1, r0, r1, s0, s1, tmp) \ + tmp = vec_splats(1.); \ + rs0 = vec_xxnpmadd(s0, tmp, r0); \ + rs1 = vec_xxnpmadd(s1, tmp, r1); + +#define _vec_i_mul_sub(rs0, rs1, r0, r1, s0, s1, tmp) \ + tmp = vec_splats(-1.); \ + rs0 = vec_xxnpmadd(s0, tmp, r0); \ + rs1 = vec_xxnpmadd(s1, tmp, r1); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/bgq2.h b/qcd/part_cpu/applications/QCD/src/kernel_D/bgq2.h new file mode 100644 index 0000000000000000000000000000000000000000..49c01bfca2d7bf45d2db870fc54c660232a17c98 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/bgq2.h @@ -0,0 +1,844 @@ +/********************************************************************** + * + * Copyright (C) 2012 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + * + * These are routines and macros for using half of the available + * four floating point units of the BG/Q processor + * + **********************************************************************/ + +#ifndef _BGQ2_H +#define _BGQ2_H + +//#define regtype vector4double + +#define _vec_load2(r0, r1, r2, phi) \ + (r0) = vec_ld2(0L, (double*) &(phi).c0); \ + (r1) = vec_ld2(0L, (double*) &(phi).c1); \ + (r2) = vec_ld2(0L, (double*) &(phi).c2); + +#define _vec_load2_32(r0, r1, r2, phi) \ + (r0) = vec_ld2(0L, (float*) &(phi).c0); \ + (r1) = vec_ld2(0L, (float*) &(phi).c1); \ + (r2) = vec_ld2(0L, (float*) &(phi).c2); + +#define _vec_load2c(r0, r1, phi) \ + r0 = vec_ld(0L, (double*) &(phi).c0); \ + r1 = vec_ld2(0L, (double*) &(phi).c2); + +#define _vec_store2(phi, r0, r1, r2) /* phi is parenthesized so that expression arguments such as *p expand correctly */ \ + vec_st2((r0), 0, (double*) &(phi).c0); \ + vec_st2((r1), 0, (double*) &(phi).c1); \ + vec_st2((r2), 0, (double*) &(phi).c2); + +#define _vec_store2_32(phi, r0, r1, r2) /* single-precision variant of _vec_store2 */ \ + vec_st2((r0), 0, (float*) &(phi).c0); \ + vec_st2((r1), 0, (float*) &(phi).c1); \ + vec_st2((r2), 0, (float*) &(phi).c2); + +// r = r + s +#define _vec_add2(r0, r1, r2, s0, s1, s2) \ + (r0) = vec_add((r0), (s0)); \ + (r1) = vec_add((r1), (s1)); \ + (r2) = vec_add((r2), (s2)); + +#define _vec_add_to2(rs0, rs1, rs2, r0, r1, r2, s0, s1, s2) \ + (rs0) = vec_add((r0), (s0)); \ + (rs1) = vec_add((r1), (s1)); \ + (rs2) = vec_add((r2), (s2)); + 
+// r = r + s +#define _vec_add_double2(r0, r1, r2, r3, r4, r5, s0, s1, s2, s3, s4, s5) \ + (r0) = vec_add((r0), (s0)); \ + (r1) = vec_add((r1), (s1)); \ + (r2) = vec_add((r2), (s2)); \ + (r3) = vec_add((r3), (s3)); \ + (r4) = vec_add((r4), (s4)); \ + (r5) = vec_add((r5), (s5)); + +#define _vec_add_double_to2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5, s0, s1, s2, s3, s4, s5) \ + (rs0) = vec_add((r0), (s0)); \ + (rs1) = vec_add((r1), (s1)); \ + (rs2) = vec_add((r2), (s2)); \ + (rs3) = vec_add((r3), (s3)); \ + (rs4) = vec_add((r4), (s4)); \ + (rs5) = vec_add((r5), (s5)); + +// r = r - s +#define _vec_sub2(r0, r1, r2, s0, s1, s2) \ + (r0) = vec_sub((r0), (s0)); \ + (r1) = vec_sub((r1), (s1)); \ + (r2) = vec_sub((r2), (s2)); + +#define _vec_sub_to2(rs0, rs1, rs2, r0, r1, r2, s0, s1, s2) \ + (rs0) = vec_sub((r0), (s0)); \ + (rs1) = vec_sub((r1), (s1)); \ + (rs2) = vec_sub((r2), (s2)); + +// r = r - s +#define _vec_sub_double2(r0, r1, r2, r3, r4, r5, s0, s1, s2, s3, s4, s5) \ + (r0) = vec_sub((r0), (s0)); \ + (r1) = vec_sub((r1), (s1)); \ + (r2) = vec_sub((r2), (s2)); \ + (r3) = vec_sub((r3), (s3)); \ + (r4) = vec_sub((r4), (s4)); \ + (r5) = vec_sub((r5), (s5)); + +#define _vec_sub_to_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5, s0, s1, s2, s3, s4, s5) \ + (rs0) = vec_sub((r0), (s0)); \ + (rs1) = vec_sub((r1), (s1)); \ + (rs2) = vec_sub((r2), (s2)); \ + (rs3) = vec_sub((r3), (s3)); \ + (rs4) = vec_sub((r4), (s4)); \ + (rs5) = vec_sub((r5), (s5)); + +// r = r + i*s +#define _vec_i_mul_add2(r0, r1, r2, s0, s1, s2, tmp) \ + tmp = vec_splats(1.); \ + r0 = vec_xxnpmadd(s0, tmp, r0); \ + r1 = vec_xxnpmadd(s1, tmp, r1); \ + r2 = vec_xxnpmadd(s2, tmp, r2); + +#define _vec_i_mul_add_to2(rs0, rs1, rs2, r0, r1, r2, s0, s1, s2, tmp) \ + tmp = vec_splats(1.); \ + rs0 = vec_xxnpmadd(s0, tmp, r0); \ + rs1 = vec_xxnpmadd(s1, tmp, r1); \ + rs2 = vec_xxnpmadd(s2, tmp, r2); + +// r = r + i*s +#define _vec_i_mul_add_double2(r0, r1, r2, r3, r4, r5, s0, s1, s2, s3, 
s4, s5, tmp) \ + tmp = vec_splats(1.); \ + r0 = vec_xxnpmadd(s0, tmp, r0); \ + r1 = vec_xxnpmadd(s1, tmp, r1); \ + r2 = vec_xxnpmadd(s2, tmp, r2); \ + r3 = vec_xxnpmadd(s3, tmp, r3); \ + r4 = vec_xxnpmadd(s4, tmp, r4); \ + r5 = vec_xxnpmadd(s5, tmp, r5); + +#define _vec_i_mul_add_double_to2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5, s0, s1, s2, s3, s4, s5, tmp) \ + tmp = vec_splats(1.); \ + rs0 = vec_xxnpmadd(s0, tmp, r0); \ + rs1 = vec_xxnpmadd(s1, tmp, r1); \ + rs2 = vec_xxnpmadd(s2, tmp, r2); \ + rs3 = vec_xxnpmadd(s3, tmp, r3); \ + rs4 = vec_xxnpmadd(s4, tmp, r4); \ + rs5 = vec_xxnpmadd(s5, tmp, r5); + +// r = r - i*s +#define _vec_i_mul_sub2(r0, r1, r2, s0, s1, s2, tmp) \ + tmp = vec_splats(-1.); \ + r0 = vec_xxnpmadd(s0, tmp, r0); \ + r1 = vec_xxnpmadd(s1, tmp, r1); \ + r2 = vec_xxnpmadd(s2, tmp, r2); + +#define _vec_i_mul_sub_to2(rs0, rs1, rs2, r0, r1, r2, s0, s1, s2, tmp) \ + tmp = vec_splats(-1.); \ + rs0 = vec_xxnpmadd(s0, tmp, r0); \ + rs1 = vec_xxnpmadd(s1, tmp, r1); \ + rs2 = vec_xxnpmadd(s2, tmp, r2); + +// r = r - i*s +#define _vec_i_mul_sub_double2(r0, r1, r2, r3, r4, r5, s0, s1, s2, s3, s4, s5, tmp) \ + tmp = vec_splats(-1.); \ + r0 = vec_xxnpmadd(s0, tmp, r0); \ + r1 = vec_xxnpmadd(s1, tmp, r1); \ + r2 = vec_xxnpmadd(s2, tmp, r2); \ + r3 = vec_xxnpmadd(s3, tmp, r3); \ + r4 = vec_xxnpmadd(s4, tmp, r4); \ + r5 = vec_xxnpmadd(s5, tmp, r5); + +#define _vec_i_mul_sub_double_to2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5, s0, s1, s2, s3, s4, s5, tmp) \ + tmp = vec_splats(-1.); \ + rs0 = vec_xxnpmadd(s0, tmp, r0); \ + rs1 = vec_xxnpmadd(s1, tmp, r1); \ + rs2 = vec_xxnpmadd(s2, tmp, r2); \ + rs3 = vec_xxnpmadd(s3, tmp, r3); \ + rs4 = vec_xxnpmadd(s4, tmp, r4); \ + rs5 = vec_xxnpmadd(s5, tmp, r5); + +#define _vec_cmplx_mul_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5, tmp) \ + rs0 = vec_xmul(r0, tmp); \ + rs1 = vec_xmul(r1, tmp); \ + rs2 = vec_xmul(r2, tmp); \ + rs3 = vec_xmul(r3, tmp); \ + rs4 = vec_xmul(r4, tmp); \ + 
rs5 = vec_xmul(r5, tmp); \ + rs0 = vec_xxnpmadd(tmp, r0, rs0); \ + rs1 = vec_xxnpmadd(tmp, r1, rs1); \ + rs2 = vec_xxnpmadd(tmp, r2, rs2); \ + rs3 = vec_xxnpmadd(tmp, r3, rs3); \ + rs4 = vec_xxnpmadd(tmp, r4, rs4); \ + rs5 = vec_xxnpmadd(tmp, r5, rs5); + +#define _vec_cmplx_mul_double2c(rs0, rs1, rs2, r0, r1, r2, tmp) \ + rs0 = vec_xmul(r0, tmp); \ + rs1 = vec_xmul(r1, tmp); \ + rs2 = vec_xmul(r2, tmp); \ + rs0 = vec_xxnpmadd(tmp, r0, rs0); \ + rs1 = vec_xxnpmadd(tmp, r1, rs1); \ + rs2 = vec_xxnpmadd(tmp, r2, rs2); + +#define _vec_cmplxcg_mul_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5, tmp) \ + rs0 = vec_xmul(tmp, r0); \ + rs1 = vec_xmul(tmp, r1); \ + rs2 = vec_xmul(tmp, r2); \ + rs3 = vec_xmul(tmp, r3); \ + rs4 = vec_xmul(tmp, r4); \ + rs5 = vec_xmul(tmp, r5); \ + rs0 = vec_xxcpnmadd(r0, tmp, rs0); \ + rs1 = vec_xxcpnmadd(r1, tmp, rs1); \ + rs2 = vec_xxcpnmadd(r2, tmp, rs2); \ + rs3 = vec_xxcpnmadd(r3, tmp, rs3); \ + rs4 = vec_xxcpnmadd(r4, tmp, rs4); \ + rs5 = vec_xxcpnmadd(r5, tmp, rs5); \ + +#define _vec_cmplxcg_mul_double2c(rs0, rs1, rs2, r0, r1, r2, tmp) \ + rs0 = vec_xmul(tmp, r0); \ + rs1 = vec_xmul(tmp, r1); \ + rs2 = vec_xmul(tmp, r2); \ + rs0 = vec_xxcpnmadd(r0, tmp, rs0); \ + rs1 = vec_xxcpnmadd(r1, tmp, rs1); \ + rs2 = vec_xxcpnmadd(r2, tmp, rs2); + +// pushes the second quadword from r0, r1, r2 +// int the first quadword of r3, r4, r5 +#define _vec_unfuse(r0, r1, r2, r3, r4, r5) \ + r3 = vec_sldw(r0, r0, 2); \ + r4 = vec_sldw(r1, r1, 2); \ + r5 = vec_sldw(r2, r2, 2); + +// multiplies one su3 matrix with two su3_vectors +// the first of which stored in r[0-2] +// and the second one in r[3-5] +// +// the resulting two vectors are stored in +// r[6-11] +// +// this routine uses only half of the 4 doubles in vector4double +#define _vec_su3_multiply_double2b(u) \ + U[0] = vec_ld2(0, (double*) &(u)->c00); \ + U[3] = vec_ld2(0, (double*) &(u)->c01); \ + U[6] = vec_ld2(0, (double*) &(u)->c02); \ + U[1] = vec_ld2(0, (double*) &(u)->c10); \ + 
U[4] = vec_ld2(0, (double*) &(u)->c11); \ + U[7] = vec_ld2(0, (double*) &(u)->c12); \ + U[2] = vec_ld2(0, (double*) &(u)->c20); \ + r[6] = vec_xmul(r[0], U[0]); \ + r[7] = vec_xmul(r[0], U[1]); \ + r[8] = vec_xmul(r[0], U[2]); \ + r[9] = vec_xmul(r[3], U[0]); \ + r[10] = vec_xmul(r[3], U[1]); \ + r[11] = vec_xmul(r[3], U[2]); \ + \ + r[6] = vec_xxnpmadd(U[0], r[0], r[6]); \ + r[7] = vec_xxnpmadd(U[1], r[0], r[7]); \ + r[8] = vec_xxnpmadd(U[2], r[0], r[8]); \ + r[9] = vec_xxnpmadd(U[0], r[3], r[9]); \ + r[10] = vec_xxnpmadd(U[1], r[3], r[10]); \ + r[11] = vec_xxnpmadd(U[2], r[3], r[11]); \ + U[5] = vec_ld2(0, (double*) &(u)->c21); \ + \ + r[6] = vec_xmadd(r[1], U[3], r[6]); \ + r[7] = vec_xmadd(r[1], U[4], r[7]); \ + r[8] = vec_xmadd(r[1], U[5], r[8]); \ + r[9] = vec_xmadd(r[4], U[3], r[9]); \ + r[10] = vec_xmadd(r[4], U[4], r[10]); \ + r[11] = vec_xmadd(r[4], U[5], r[11]); \ + \ + r[6] = vec_xxnpmadd(U[3], r[1], r[6]); \ + r[7] = vec_xxnpmadd(U[4], r[1], r[7]); \ + r[8] = vec_xxnpmadd(U[5], r[1], r[8]); \ + r[9] = vec_xxnpmadd(U[3], r[4], r[9]); \ + r[10] = vec_xxnpmadd(U[4], r[4], r[10]); \ + r[11] = vec_xxnpmadd(U[5], r[4], r[11]); \ + U[8] = vec_ld2(0, (double*) &(u)->c22); \ + \ + r[6] = vec_xmadd(r[2], U[6], r[6]); \ + r[7] = vec_xmadd(r[2], U[7], r[7]); \ + r[8] = vec_xmadd(r[2], U[8], r[8]); \ + r[9] = vec_xmadd(r[5], U[6], r[9]); \ + r[10] = vec_xmadd(r[5], U[7], r[10]); \ + r[11] = vec_xmadd(r[5], U[8], r[11]); \ + \ + r[6] = vec_xxnpmadd(U[6], r[2], r[6]); \ + r[7] = vec_xxnpmadd(U[7], r[2], r[7]); \ + r[8] = vec_xxnpmadd(U[8], r[2], r[8]); \ + r[9] = vec_xxnpmadd(U[6], r[5], r[9]); \ + r[10] = vec_xxnpmadd(U[7], r[5], r[10]); \ + r[11] = vec_xxnpmadd(U[8], r[5], r[11]); + +#define _vec_su3_multiply_double2(u) \ + U0 = vec_ld2(0, (double*) &(u)->c00); \ + U3 = vec_ld2(0, (double*) &(u)->c01); \ + U6 = vec_ld2(0, (double*) &(u)->c02); \ + U1 = vec_ld2(0, (double*) &(u)->c10); \ + U4 = vec_ld2(0, (double*) &(u)->c11); \ + U7 = vec_ld2(0, (double*) 
&(u)->c12); \ + U2 = vec_ld2(0, (double*) &(u)->c20); \ + r6 = vec_xmul(r0, U0); \ + r7 = vec_xmul(r0, U1); \ + r8 = vec_xmul(r0, U2); \ + r9 = vec_xmul(r3, U0); \ + r10= vec_xmul(r3, U1); \ + r11= vec_xmul(r3, U2); \ + \ + r6 = vec_xxnpmadd(U0, r0, r6); \ + r7 = vec_xxnpmadd(U1, r0, r7); \ + r8 = vec_xxnpmadd(U2, r0, r8); \ + r9 = vec_xxnpmadd(U0, r3, r9); \ + r10= vec_xxnpmadd(U1, r3, r10); \ + r11= vec_xxnpmadd(U2, r3, r11); \ + U0 = vec_ld2(0, (double*) &(u)->c21); \ + \ + r6 = vec_xmadd(r1, U3, r6); \ + r7 = vec_xmadd(r1, U4, r7); \ + r8 = vec_xmadd(r1, U0, r8); \ + r9 = vec_xmadd(r4, U3, r9); \ + r10= vec_xmadd(r4, U4, r10); \ + r11= vec_xmadd(r4, U0, r11); \ + \ + r6 = vec_xxnpmadd(U3, r1, r6); \ + r7 = vec_xxnpmadd(U4, r1, r7); \ + r8 = vec_xxnpmadd(U0, r1, r8); \ + r9 = vec_xxnpmadd(U3, r4, r9); \ + r10= vec_xxnpmadd(U4, r4, r10); \ + r11= vec_xxnpmadd(U0, r4, r11); \ + U1 = vec_ld2(0, (double*) &(u)->c22); \ + \ + r6 = vec_xmadd(r2, U6, r6); \ + r7 = vec_xmadd(r2, U7, r7); \ + r8 = vec_xmadd(r2, U1, r8); \ + r9 = vec_xmadd(r5, U6, r9); \ + r10= vec_xmadd(r5, U7, r10); \ + r11= vec_xmadd(r5, U1, r11); \ + \ + r6 = vec_xxnpmadd(U6, r2, r6); \ + r7 = vec_xxnpmadd(U7, r2, r7); \ + r8 = vec_xxnpmadd(U1, r2, r8); \ + r9 = vec_xxnpmadd(U6, r5, r9); \ + r10= vec_xxnpmadd(U7, r5, r10); \ + r11= vec_xxnpmadd(U1, r5, r11); + + +//same as _vec_su3_multiply_double2 but loading a 32bit gauge field +#define _vec_su3_multiply_double2_32(u) \ + U0 = vec_ld2(0, (float*) &(u)->c00); \ + U3 = vec_ld2(0, (float*) &(u)->c01); \ + U6 = vec_ld2(0, (float*) &(u)->c02); \ + U1 = vec_ld2(0, (float*) &(u)->c10); \ + U4 = vec_ld2(0, (float*) &(u)->c11); \ + U7 = vec_ld2(0, (float*) &(u)->c12); \ + U2 = vec_ld2(0, (float*) &(u)->c20); \ + r6 = vec_xmul(r0, U0); \ + r7 = vec_xmul(r0, U1); \ + r8 = vec_xmul(r0, U2); \ + r9 = vec_xmul(r3, U0); \ + r10= vec_xmul(r3, U1); \ + r11= vec_xmul(r3, U2); \ + \ + r6 = vec_xxnpmadd(U0, r0, r6); \ + r7 = vec_xxnpmadd(U1, r0, r7); \ + r8 = 
vec_xxnpmadd(U2, r0, r8); \ + r9 = vec_xxnpmadd(U0, r3, r9); \ + r10= vec_xxnpmadd(U1, r3, r10); \ + r11= vec_xxnpmadd(U2, r3, r11); \ + U0 = vec_ld2(0, (float*) &(u)->c21); \ + \ + r6 = vec_xmadd(r1, U3, r6); \ + r7 = vec_xmadd(r1, U4, r7); \ + r8 = vec_xmadd(r1, U0, r8); \ + r9 = vec_xmadd(r4, U3, r9); \ + r10= vec_xmadd(r4, U4, r10); \ + r11= vec_xmadd(r4, U0, r11); \ + \ + r6 = vec_xxnpmadd(U3, r1, r6); \ + r7 = vec_xxnpmadd(U4, r1, r7); \ + r8 = vec_xxnpmadd(U0, r1, r8); \ + r9 = vec_xxnpmadd(U3, r4, r9); \ + r10= vec_xxnpmadd(U4, r4, r10); \ + r11= vec_xxnpmadd(U0, r4, r11); \ + U1 = vec_ld2(0, (float*) &(u)->c22); \ + \ + r6 = vec_xmadd(r2, U6, r6); \ + r7 = vec_xmadd(r2, U7, r7); \ + r8 = vec_xmadd(r2, U1, r8); \ + r9 = vec_xmadd(r5, U6, r9); \ + r10= vec_xmadd(r5, U7, r10); \ + r11= vec_xmadd(r5, U1, r11); \ + \ + r6 = vec_xxnpmadd(U6, r2, r6); \ + r7 = vec_xxnpmadd(U7, r2, r7); \ + r8 = vec_xxnpmadd(U1, r2, r8); \ + r9 = vec_xxnpmadd(U6, r5, r9); \ + r10= vec_xxnpmadd(U7, r5, r10); \ + r11= vec_xxnpmadd(U1, r5, r11); + + + + +#define _vec_su3_multiply(u) \ + U0 = vec_ld2(0, (double*) &(u)->c00); \ + U3 = vec_ld2(0, (double*) &(u)->c01); \ + U6 = vec_ld2(0, (double*) &(u)->c02); \ + U1 = vec_ld2(0, (double*) &(u)->c10); \ + U4 = vec_ld2(0, (double*) &(u)->c11); \ + U7 = vec_ld2(0, (double*) &(u)->c12); \ + U2 = vec_ld2(0, (double*) &(u)->c20); \ + r6 = vec_xmul(r0, U0); \ + r7 = vec_xmul(r0, U1); \ + r8 = vec_xmul(r0, U2); \ + \ + r6 = vec_xxnpmadd(U0, r0, r6); \ + r7 = vec_xxnpmadd(U1, r0, r7); \ + r8 = vec_xxnpmadd(U2, r0, r8); \ + U0 = vec_ld2(0, (double*) &(u)->c21); \ + \ + r6 = vec_xmadd(r1, U3, r6); \ + r7 = vec_xmadd(r1, U4, r7); \ + r8 = vec_xmadd(r1, U0, r8); \ + \ + r6 = vec_xxnpmadd(U3, r1, r6); \ + r7 = vec_xxnpmadd(U4, r1, r7); \ + r8 = vec_xxnpmadd(U0, r1, r8); \ + U1 = vec_ld2(0, (double*) &(u)->c22); \ + \ + r6 = vec_xmadd(r2, U6, r6); \ + r7 = vec_xmadd(r2, U7, r7); \ + r8 = vec_xmadd(r2, U1, r8); \ + \ + r6 = vec_xxnpmadd(U6, r2, r6); \ 
+ r7 = vec_xxnpmadd(U7, r2, r7); \ + r8 = vec_xxnpmadd(U1, r2, r8); \ + + + +#define _vec_su3_inverse_multiply(u) \ + U0 = vec_ld2(0, (double*) &(u)->c00); \ + U1 = vec_ld2(0, (double*) &(u)->c01); \ + U2 = vec_ld2(0, (double*) &(u)->c02); \ + \ + r6 = vec_xmul(U0, r0); \ + r7 = vec_xmul(U1, r0); \ + r8 = vec_xmul(U2, r0); \ + \ + r6 = vec_xxcpnmadd(r0, U0, r6); \ + r7 = vec_xxcpnmadd(r0, U1, r7); \ + r8 = vec_xxcpnmadd(r0, U2, r8); \ + \ + U3 = vec_ld2(0, (double*) &(u)->c10); \ + U4 = vec_ld2(0, (double*) &(u)->c11); \ + U6 = vec_ld2(0, (double*) &(u)->c12); \ + \ + r6 = vec_xmadd(U3, r1, r6); \ + r7 = vec_xmadd(U4, r1, r7); \ + r8 = vec_xmadd(U6, r1, r8); \ + \ + r6 = vec_xxcpnmadd(r1, U3, r6); \ + r7 = vec_xxcpnmadd(r1, U4, r7); \ + r8 = vec_xxcpnmadd(r1, U6, r8); \ + \ + U0 = vec_ld2(0, (double*) &(u)->c20); \ + U1 = vec_ld2(0, (double*) &(u)->c21); \ + U2 = vec_ld2(0, (double*) &(u)->c22); \ + \ + r6 = vec_xmadd(U0, r2, r6); \ + r7 = vec_xmadd(U1, r2, r7); \ + r8 = vec_xmadd(U2, r2, r8); \ + \ + r6 = vec_xxcpnmadd(r2, U0, r6); \ + r7 = vec_xxcpnmadd(r2, U1, r7); \ + r8 = vec_xxcpnmadd(r2, U2, r8); \ + + + + +// expects the spinor to act on in +// r0, r1 -> s0 +// r2, r3 -> s1 +#define _vec_su3_multiply_double2c(u) \ + r8 = vec_gpci(00145); \ + r9 = vec_gpci(02367); \ + U0 = vec_ld2(0, (double*) &(u)->c00); \ + U3 = vec_ld2(0, (double*) &(u)->c01); \ + U6 = vec_ld2(0, (double*) &(u)->c02); \ + U1 = vec_ld2(0, (double*) &(u)->c10); \ + r7 = vec_perm(r0, r2, r8); \ + U4 = vec_ld2(0, (double*) &(u)->c11); \ + U7 = vec_ld2(0, (double*) &(u)->c12); \ + U2 = vec_ld2(0, (double*) &(u)->c20); \ + r4 = vec_xmul(r7, U0); \ + r5 = vec_xmul(r7, U1); \ + r6 = vec_xmul(r7, U2); \ + \ + r4 = vec_xxnpmadd(U0, r7, r4); \ + r5 = vec_xxnpmadd(U1, r7, r5); \ + r6 = vec_xxnpmadd(U2, r7, r6); \ + r7 = vec_perm(r0, r2, r9); \ + U0 = vec_ld2(0, (double*) &(u)->c21); \ + \ + r4 = vec_xmadd(r7, U3, r4); \ + r5 = vec_xmadd(r7, U4, r5); \ + r6 = vec_xmadd(r7, U0, r6); \ + \ + r4 = 
vec_xxnpmadd(U3, r7, r4); \ + r5 = vec_xxnpmadd(U4, r7, r5); \ + r6 = vec_xxnpmadd(U0, r7, r6); \ + r7 = vec_perm(r1, r3, r8); \ + U1 = vec_ld2(0, (double*) &(u)->c22); \ + \ + r4 = vec_xmadd(r7, U6, r4); \ + r5 = vec_xmadd(r7, U7, r5); \ + r6 = vec_xmadd(r7, U1, r6); \ + \ + r4 = vec_xxnpmadd(U6, r7, r4); \ + r5 = vec_xxnpmadd(U7, r7, r5); \ + r6 = vec_xxnpmadd(U1, r7, r6); + + + +#define _vec_su3_multiply_double2c_32(u) \ + r8 = vec_gpci(00145); \ + r9 = vec_gpci(02367); \ + U0 = vec_ld2(0, (float*) &(u)->c00); \ + U3 = vec_ld2(0, (float*) &(u)->c01); \ + U6 = vec_ld2(0, (float*) &(u)->c02); \ + U1 = vec_ld2(0, (float*) &(u)->c10); \ + r7 = vec_perm(r0, r2, r8); \ + U4 = vec_ld2(0, (float*) &(u)->c11); \ + U7 = vec_ld2(0, (float*) &(u)->c12); \ + U2 = vec_ld2(0, (float*) &(u)->c20); \ + r4 = vec_xmul(r7, U0); \ + r5 = vec_xmul(r7, U1); \ + r6 = vec_xmul(r7, U2); \ + \ + r4 = vec_xxnpmadd(U0, r7, r4); \ + r5 = vec_xxnpmadd(U1, r7, r5); \ + r6 = vec_xxnpmadd(U2, r7, r6); \ + r7 = vec_perm(r0, r2, r9); \ + U0 = vec_ld2(0, (float*) &(u)->c21); \ + \ + r4 = vec_xmadd(r7, U3, r4); \ + r5 = vec_xmadd(r7, U4, r5); \ + r6 = vec_xmadd(r7, U0, r6); \ + \ + r4 = vec_xxnpmadd(U3, r7, r4); \ + r5 = vec_xxnpmadd(U4, r7, r5); \ + r6 = vec_xxnpmadd(U0, r7, r6); \ + r7 = vec_perm(r1, r3, r8); \ + U1 = vec_ld2(0, (float*) &(u)->c22); \ + \ + r4 = vec_xmadd(r7, U6, r4); \ + r5 = vec_xmadd(r7, U7, r5); \ + r6 = vec_xmadd(r7, U1, r6); \ + \ + r4 = vec_xxnpmadd(U6, r7, r4); \ + r5 = vec_xxnpmadd(U7, r7, r5); \ + r6 = vec_xxnpmadd(U1, r7, r6); + + + +#define _vec_su3_multiply_double2ct(u) \ + r8 = vec_gpci(00167); \ + U0 = vec_ld2(0, (double*) &(u)->c00); \ + U3 = vec_ld2(0, (double*) &(u)->c01); \ + U6 = vec_ld2(0, (double*) &(u)->c02); \ + U1 = vec_ld2(0, (double*) &(u)->c10); \ + r7 = vec_perm(r0, r1, r8); \ + U4 = vec_ld2(0, (double*) &(u)->c11); \ + U7 = vec_ld2(0, (double*) &(u)->c12); \ + U2 = vec_ld2(0, (double*) &(u)->c20); \ + r4 = vec_xmul(r7, U0); \ + r5 = vec_xmul(r7, U1); 
\ + r6 = vec_xmul(r7, U2); \ + \ + r4 = vec_xxnpmadd(U0, r7, r4); \ + r5 = vec_xxnpmadd(U1, r7, r5); \ + r6 = vec_xxnpmadd(U2, r7, r6); \ + r7 = vec_sldw(r0, r2, 2); \ + U0 = vec_ld2(0, (double*) &(u)->c21); \ + \ + r4 = vec_xmadd(r7, U3, r4); \ + r5 = vec_xmadd(r7, U4, r5); \ + r6 = vec_xmadd(r7, U0, r6); \ + \ + r4 = vec_xxnpmadd(U3, r7, r4); \ + r5 = vec_xxnpmadd(U4, r7, r5); \ + r6 = vec_xxnpmadd(U0, r7, r6); \ + r7 = vec_perm(r1, r2, r8); \ + U1 = vec_ld2(0, (double*) &(u)->c22); \ + \ + r4 = vec_xmadd(r7, U6, r4); \ + r5 = vec_xmadd(r7, U7, r5); \ + r6 = vec_xmadd(r7, U1, r6); \ + \ + r4 = vec_xxnpmadd(U6, r7, r4); \ + r5 = vec_xxnpmadd(U7, r7, r5); \ + r6 = vec_xxnpmadd(U1, r7, r6); + +// multiplies the inverse of one su3 matrix with two su3_vectors +// the first of which stored in r[0-2] +// and the second one in r[3-5] +// +// the resulting two vectors are stored in +// r[6-11] +// +// this routine uses only half of the 4 doubles in vector4double +#define _vec_su3_inverse_multiply_double2(u) \ + U0 = vec_ld2(0, (double*) &(u)->c00); \ + U1 = vec_ld2(0, (double*) &(u)->c01); \ + U2 = vec_ld2(0, (double*) &(u)->c02); \ + \ + r6 = vec_xmul(U0, r0); \ + r7 = vec_xmul(U1, r0); \ + r8 = vec_xmul(U2, r0); \ + r9 = vec_xmul(U0, r3); \ + r10= vec_xmul(U1, r3); \ + r11= vec_xmul(U2, r3); \ + \ + r6 = vec_xxcpnmadd(r0, U0, r6); \ + r7 = vec_xxcpnmadd(r0, U1, r7); \ + r8 = vec_xxcpnmadd(r0, U2, r8); \ + r9 = vec_xxcpnmadd(r3, U0, r9); \ + r10= vec_xxcpnmadd(r3, U1, r10); \ + r11= vec_xxcpnmadd(r3, U2, r11); \ + \ + U3 = vec_ld2(0, (double*) &(u)->c10); \ + U4 = vec_ld2(0, (double*) &(u)->c11); \ + U6 = vec_ld2(0, (double*) &(u)->c12); \ + \ + r6 = vec_xmadd(U3, r1, r6); \ + r7 = vec_xmadd(U4, r1, r7); \ + r8 = vec_xmadd(U6, r1, r8); \ + r9 = vec_xmadd(U3, r4, r9); \ + r10= vec_xmadd(U4, r4, r10); \ + r11= vec_xmadd(U6, r4, r11); \ + \ + r6 = vec_xxcpnmadd(r1, U3, r6); \ + r7 = vec_xxcpnmadd(r1, U4, r7); \ + r8 = vec_xxcpnmadd(r1, U6, r8); \ + r9 = vec_xxcpnmadd(r4, 
U3, r9); \ + r10= vec_xxcpnmadd(r4, U4, r10); \ + r11= vec_xxcpnmadd(r4, U6, r11); \ + \ + U0 = vec_ld2(0, (double*) &(u)->c20); \ + U1 = vec_ld2(0, (double*) &(u)->c21); \ + U2 = vec_ld2(0, (double*) &(u)->c22); \ + \ + r6 = vec_xmadd(U0, r2, r6); \ + r7 = vec_xmadd(U1, r2, r7); \ + r8 = vec_xmadd(U2, r2, r8); \ + r9 = vec_xmadd(U0, r5, r9); \ + r10= vec_xmadd(U1, r5, r10); \ + r11= vec_xmadd(U2, r5, r11); \ + \ + r6 = vec_xxcpnmadd(r2, U0, r6); \ + r7 = vec_xxcpnmadd(r2, U1, r7); \ + r8 = vec_xxcpnmadd(r2, U2, r8); \ + r9 = vec_xxcpnmadd(r5, U0, r9); \ + r10= vec_xxcpnmadd(r5, U1, r10); \ + r11= vec_xxcpnmadd(r5, U2, r11); + + +//same as _vec_su3_inverse_multiply_double2 but for 32bit gauge field +#define _vec_su3_inverse_multiply_double2_32(u) \ + U0 = vec_ld2(0, (float*) &(u)->c00); \ + U1 = vec_ld2(0, (float*) &(u)->c01); \ + U2 = vec_ld2(0, (float*) &(u)->c02); \ + \ + r6 = vec_xmul(U0, r0); \ + r7 = vec_xmul(U1, r0); \ + r8 = vec_xmul(U2, r0); \ + r9 = vec_xmul(U0, r3); \ + r10= vec_xmul(U1, r3); \ + r11= vec_xmul(U2, r3); \ + \ + r6 = vec_xxcpnmadd(r0, U0, r6); \ + r7 = vec_xxcpnmadd(r0, U1, r7); \ + r8 = vec_xxcpnmadd(r0, U2, r8); \ + r9 = vec_xxcpnmadd(r3, U0, r9); \ + r10= vec_xxcpnmadd(r3, U1, r10); \ + r11= vec_xxcpnmadd(r3, U2, r11); \ + \ + U3 = vec_ld2(0, (float*) &(u)->c10); \ + U4 = vec_ld2(0, (float*) &(u)->c11); \ + U6 = vec_ld2(0, (float*) &(u)->c12); \ + \ + r6 = vec_xmadd(U3, r1, r6); \ + r7 = vec_xmadd(U4, r1, r7); \ + r8 = vec_xmadd(U6, r1, r8); \ + r9 = vec_xmadd(U3, r4, r9); \ + r10= vec_xmadd(U4, r4, r10); \ + r11= vec_xmadd(U6, r4, r11); \ + \ + r6 = vec_xxcpnmadd(r1, U3, r6); \ + r7 = vec_xxcpnmadd(r1, U4, r7); \ + r8 = vec_xxcpnmadd(r1, U6, r8); \ + r9 = vec_xxcpnmadd(r4, U3, r9); \ + r10= vec_xxcpnmadd(r4, U4, r10); \ + r11= vec_xxcpnmadd(r4, U6, r11); \ + \ + U0 = vec_ld2(0, (float*) &(u)->c20); \ + U1 = vec_ld2(0, (float*) &(u)->c21); \ + U2 = vec_ld2(0, (float*) &(u)->c22); \ + \ + r6 = vec_xmadd(U0, r2, r6); \ + r7 = 
vec_xmadd(U1, r2, r7); \ + r8 = vec_xmadd(U2, r2, r8); \ + r9 = vec_xmadd(U0, r5, r9); \ + r10= vec_xmadd(U1, r5, r10); \ + r11= vec_xmadd(U2, r5, r11); \ + \ + r6 = vec_xxcpnmadd(r2, U0, r6); \ + r7 = vec_xxcpnmadd(r2, U1, r7); \ + r8 = vec_xxcpnmadd(r2, U2, r8); \ + r9 = vec_xxcpnmadd(r5, U0, r9); \ + r10= vec_xxcpnmadd(r5, U1, r10); \ + r11= vec_xxcpnmadd(r5, U2, r11); + + + +#define _vec_su3_inverse_multiply_double2c(u) \ + U0 = vec_ld2(0, (double*) &(u)->c00); \ + r8 = vec_gpci(00145); \ + r9 = vec_gpci(02367); \ + U1 = vec_ld2(0, (double*) &(u)->c01); \ + r7 = vec_perm(r0, r2, r8); \ + U2 = vec_ld2(0, (double*) &(u)->c02); \ + \ + r4 = vec_xmul(U0, r7); \ + r5 = vec_xmul(U1, r7); \ + r6 = vec_xmul(U2, r7); \ + \ + r4 = vec_xxcpnmadd(r7, U0, r4); \ + r5 = vec_xxcpnmadd(r7, U1, r5); \ + r6 = vec_xxcpnmadd(r7, U2, r6); \ + \ + r7 = vec_perm(r0, r2, r9); \ + U3 = vec_ld2(0, (double*) &(u)->c10); \ + U4 = vec_ld2(0, (double*) &(u)->c11); \ + U6 = vec_ld2(0, (double*) &(u)->c12); \ + \ + r4 = vec_xmadd(U3, r7, r4); \ + r5 = vec_xmadd(U4, r7, r5); \ + r6 = vec_xmadd(U6, r7, r6); \ + \ + r4 = vec_xxcpnmadd(r7, U3, r4); \ + r5 = vec_xxcpnmadd(r7, U4, r5); \ + r6 = vec_xxcpnmadd(r7, U6, r6); \ + \ + r7 = vec_perm(r1, r3, r8); \ + U0 = vec_ld2(0, (double*) &(u)->c20); \ + U1 = vec_ld2(0, (double*) &(u)->c21); \ + U2 = vec_ld2(0, (double*) &(u)->c22); \ + \ + r4 = vec_xmadd(U0, r7, r4); \ + r5 = vec_xmadd(U1, r7, r5); \ + r6 = vec_xmadd(U2, r7, r6); \ + \ + r4 = vec_xxcpnmadd(r7, U0, r4); \ + r5 = vec_xxcpnmadd(r7, U1, r5); \ + r6 = vec_xxcpnmadd(r7, U2, r6); + + +#define _vec_su3_inverse_multiply_double2c_32(u) \ + U0 = vec_ld2(0, (float*) &(u)->c00); \ + r8 = vec_gpci(00145); \ + r9 = vec_gpci(02367); \ + U1 = vec_ld2(0, (float*) &(u)->c01); \ + r7 = vec_perm(r0, r2, r8); \ + U2 = vec_ld2(0, (float*) &(u)->c02); \ + \ + r4 = vec_xmul(U0, r7); \ + r5 = vec_xmul(U1, r7); \ + r6 = vec_xmul(U2, r7); \ + \ + r4 = vec_xxcpnmadd(r7, U0, r4); \ + r5 = vec_xxcpnmadd(r7, U1, 
r5); \ + r6 = vec_xxcpnmadd(r7, U2, r6); \ + \ + r7 = vec_perm(r0, r2, r9); \ + U3 = vec_ld2(0, (float*) &(u)->c10); \ + U4 = vec_ld2(0, (float*) &(u)->c11); \ + U6 = vec_ld2(0, (float*) &(u)->c12); \ + \ + r4 = vec_xmadd(U3, r7, r4); \ + r5 = vec_xmadd(U4, r7, r5); \ + r6 = vec_xmadd(U6, r7, r6); \ + \ + r4 = vec_xxcpnmadd(r7, U3, r4); \ + r5 = vec_xxcpnmadd(r7, U4, r5); \ + r6 = vec_xxcpnmadd(r7, U6, r6); \ + \ + r7 = vec_perm(r1, r3, r8); \ + U0 = vec_ld2(0, (float*) &(u)->c20); \ + U1 = vec_ld2(0, (float*) &(u)->c21); \ + U2 = vec_ld2(0, (float*) &(u)->c22); \ + \ + r4 = vec_xmadd(U0, r7, r4); \ + r5 = vec_xmadd(U1, r7, r5); \ + r6 = vec_xmadd(U2, r7, r6); \ + \ + r4 = vec_xxcpnmadd(r7, U0, r4); \ + r5 = vec_xxcpnmadd(r7, U1, r5); \ + r6 = vec_xxcpnmadd(r7, U2, r6); + + + + +#define _vec_su3_inverse_multiply_double2ct(u) \ + U0 = vec_ld2(0, (double*) &(u)->c00); \ + r8 = vec_gpci(00167); \ + U1 = vec_ld2(0, (double*) &(u)->c01); \ + r7 = vec_perm(r0, r1, r8); \ + U2 = vec_ld2(0, (double*) &(u)->c02); \ + \ + r4 = vec_xmul(U0, r7); \ + r5 = vec_xmul(U1, r7); \ + r6 = vec_xmul(U2, r7); \ + \ + r4 = vec_xxcpnmadd(r7, U0, r4); \ + r5 = vec_xxcpnmadd(r7, U1, r5); \ + r6 = vec_xxcpnmadd(r7, U2, r6); \ + \ + r7 = vec_sldw(r0, r2, 2); \ + U3 = vec_ld2(0, (double*) &(u)->c10); \ + U4 = vec_ld2(0, (double*) &(u)->c11); \ + U6 = vec_ld2(0, (double*) &(u)->c12); \ + \ + r4 = vec_xmadd(U3, r7, r4); \ + r5 = vec_xmadd(U4, r7, r5); \ + r6 = vec_xmadd(U6, r7, r6); \ + \ + r4 = vec_xxcpnmadd(r7, U3, r4); \ + r5 = vec_xxcpnmadd(r7, U4, r5); \ + r6 = vec_xxcpnmadd(r7, U6, r6); \ + \ + r7 = vec_perm(r1, r2, r8); \ + U0 = vec_ld2(0, (double*) &(u)->c20); \ + U1 = vec_ld2(0, (double*) &(u)->c21); \ + U2 = vec_ld2(0, (double*) &(u)->c22); \ + \ + r4 = vec_xmadd(U0, r7, r4); \ + r5 = vec_xmadd(U1, r7, r5); \ + r6 = vec_xmadd(U2, r7, r6); \ + \ + r4 = vec_xxcpnmadd(r7, U0, r4); \ + r5 = vec_xxcpnmadd(r7, U1, r5); \ + r6 = vec_xxcpnmadd(r7, U2, r6); + +#endif diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_D/block.c b/qcd/part_cpu/applications/QCD/src/kernel_D/block.c new file mode 100644 index 0000000000000000000000000000000000000000..f6644ac3f1b93623adab731b585c5bb67d0ffd80 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/block.c @@ -0,0 +1,1586 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Albert Deuzeman, Siebren Reker, Carsten Urbach + * 2010 Claude Tadonki, Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include "global.h" +#include "operator/D_psi.h" +#include "linalg_eo.h" +#include "start.h" +#include "xchange/xchange.h" +#include "block.h" +#include "solver/lu_solve.h" +#include "su3.h" + +#define CALLOC_ERROR_CRASH {printf ("calloc errno : %d\n", errno); errno = 0; return 1;} + + +int init_blocks_geometry(); + +int **** block_ipt; +int *** bipt__; +int ** bipt_; +int * bipt; +_Complex double * little_A = NULL; +_Complex float * little_A32 = NULL; +_Complex double * little_A_eo = NULL; +_Complex float * little_A32_eo = NULL; +int * block_idx; +int * block_evenidx; +int * block_oddidx; +enum{ + NONE = 0, + T_UP = 1, + T_DN = 2, + X_UP = 3, + X_DN = 4, + Y_UP = 5, + Y_DN = 6, + Z_UP = 7, + Z_DN = 8 +} Direction; + +static void (*boundary_D[8])(spinor * const r, spinor * const s, su3 *u) = +{boundary_D_0, boundary_D_1, boundary_D_2, boundary_D_3, boundary_D_4, boundary_D_5, boundary_D_6, boundary_D_7}; + + + +block * block_list = NULL; +static spinor * basis = NULL; +static su3 * u = NULL; +const int spinpad = 1; +static int block_init = 0; + +int dT, dX, dY, dZ; /* Block dimension */ + + +int index_a(int t, int x, int y, int z){ + /* Provides the absolute lexicographic index of (t, x, y, z) + Useful to walk over the blocks, maybe could be just g_ipt[t][x][y][z] + Claude Tadonki (claude.tadonki@u-psud.fr) + */ + return ((t*LX + x)*LY + y)*(LZ) + z; +} +int index_b(int t, int x, int y, int z){ + /* Provides the block lexicographic index of (t, x, y, z) + Useful to walk inside a block + Claude Tadonki (claude.tadonki@u-psud.fr) + */ + return ((t*dX + x)*dY + y)*(dZ) + z; +} +int block_index(int t, int x, int y, int z){ + /* Provides the lexicographic index of the block (t, x, y, z) + Useful to walk over the blocks + Claude Tadonki (claude.tadonki@u-psud.fr) + */ + return ((t*nblks_x + x)*nblks_y + 
y)*(nblks_z) + z; +} + +int init_blocks(const int nt, const int nx, const int ny, const int nz) { + int i,j; + /* Initialization of block-global variables for blocks */ + nb_blocks = 1; + nblks_t = nt; + nblks_x = nx; + nblks_y = ny; + nblks_z = nz; + blk_gauge_eo = -1; + nblks_dir[0] = nblks_t; + nblks_dir[1] = nblks_x; + nblks_dir[2] = nblks_y; + nblks_dir[3] = nblks_z; + nb_blocks = nblks_t*nblks_x*nblks_y*nblks_z; + dT = T/nblks_t; + dX = LX/nblks_x; + dY = LY/nblks_y; + dZ = LZ/nblks_z; + if(g_proc_id == 0 && g_debug_level > 0) { + printf("# Number of deflation blocks = %d\n n_block_t = %d\n n_block_x = %d\n n_block_y = %d\n n_block_z = %d\n", + nb_blocks, nblks_t, nblks_x, nblks_y, nblks_z); + /* printf("# Number of iteration with the polynomial preconditioner = %d \n", dfl_field_iter); */ + /* printf("# Number of iteration in the polynomial preconditioner = %d \n", dfl_poly_iter); */ + } + + free_blocks(); + block_init = 1; + block_list = calloc(nb_blocks, sizeof(block)); + if((void*)(basis = (spinor*)calloc((nb_blocks + 1) * g_N_s * (VOLUME/nb_blocks + spinpad) + 1, sizeof(spinor))) == NULL) { + CALLOC_ERROR_CRASH; + } + if((void*)(u = (su3*)calloc(1+8*VOLUME, sizeof(su3))) == NULL) { + CALLOC_ERROR_CRASH; + } + for(i = 0; i < nb_blocks; i++) { + block_list[i].basis = (spinor**)calloc(g_N_s, sizeof(spinor*)); + } + +#if ( defined SSE || defined SSE2 || defined SSE3) + block_list[0].basis[0] = (spinor*)(((unsigned long int)(basis)+ALIGN_BASE)&~ALIGN_BASE); + block_list[0].u = (su3*)(((unsigned long int)(u)+ALIGN_BASE)&~ALIGN_BASE); +#else + block_list[0].basis[0] = basis; + block_list[0].u = u; +#endif + for(j = 1; j < nb_blocks; j++) { + block_list[j].basis[0] = block_list[j-1].basis[0] + g_N_s*((VOLUME/nb_blocks) + spinpad) ; + block_list[j].u = block_list[j-1].u + 8*(VOLUME/nb_blocks); + } + for(j = 0; j < nb_blocks; j++) { + for(i = 1 ; i < g_N_s ; i ++ ) { + block_list[j].basis[i] = block_list[j].basis[i-1] + (VOLUME/nb_blocks + spinpad); + } + } + + 
if((void*)(block_ipt = (int****)calloc(T/nblks_t+2,sizeof(int*))) == NULL) return(5); + if((void*)(bipt__ = (int***)calloc ((T/nblks_t+2)*(LX/nblks_x+2), sizeof(int*))) == NULL) return(4); + if((void*)(bipt_ = (int**)calloc((T/nblks_t+2)*(LX/nblks_x+2)*(LY/nblks_y+2), sizeof(int*))) == NULL) return(3); + if((void*)(bipt = (int*)calloc((T/nblks_t+2)*(LX/nblks_x+2)*(LY/nblks_y+2)*(LZ/nblks_z+2), sizeof(int))) == NULL) return(8); + if((void*)(index_block_eo = (int*)calloc(nblks_t*nblks_x*nblks_y*nblks_z, sizeof(int))) == NULL) return(8); + bipt_[0] = bipt; + bipt__[0] = bipt_; + block_ipt[0] = bipt__; + for(i = 1; i < (T/nblks_t+2)*(LX/nblks_x+2)*(LY/nblks_y+2); i++) { + bipt_[i] = bipt_[i-1]+(LZ/nblks_z+2); + } + for(i = 1; i < (T/nblks_t+2)*(LX/nblks_x+2); i++) { + bipt__[i] = bipt__[i-1]+(LY/nblks_y+2); + } + for(i = 1; i < (T/nblks_t+2); i++) { + block_ipt[i] = block_ipt[i-1]+(LX/nblks_x+2); + } + + for (i = 0; i < nb_blocks; ++i) { + block_list[i].id = i; + block_list[i].volume = VOLUME/nb_blocks; + block_list[i].BLX = LX/nblks_x; + block_list[i].BLY = LY/nblks_y; + block_list[i].BLZ = LZ/nblks_z; + block_list[i].BT = T/nblks_t; + block_list[i].ns = g_N_s; + block_list[i].spinpad = spinpad; + + /* The following has not yet been adapted for */ + /* new block geometry right? (C.U.) */ + for (j = 0 ; j < 6; ++j) { +#ifdef MPI + block_list[i].mpilocal_neighbour[j] = (g_nb_list[j] == g_cart_id) ? i : -1; +#else + block_list[i].mpilocal_neighbour[j] = i; +#endif + } +#ifdef MPI + block_list[i].mpilocal_neighbour[6] = (i == 0 ? 1 : (g_nb_list[j] == g_cart_id) ? 0 : -1); + block_list[i].mpilocal_neighbour[7] = (i == 1 ? 0 : (g_nb_list[j] == g_cart_id) ? 1 : -1); +#else + block_list[i].mpilocal_neighbour[6] = (i == 0 ? 1 : 0); + block_list[i].mpilocal_neighbour[7] = (i == 0 ? 
1 : 0); +#endif + if(g_debug_level > 4 && g_proc_id == 0) { + for(j = 0; j < 8; j++) { + printf("block %d mpilocal_neighbour[%d] = %d\n", i, j, block_list[i].mpilocal_neighbour[j]); + } + } + /* till here... (C.U.) */ + + /* block coordinate on the mpilocal processor */ + block_list[i].mpilocal_coordinate[0] = (i / (nblks_x * nblks_y * nblks_z)); + block_list[i].mpilocal_coordinate[1] = (i / (nblks_y * nblks_z)) % nblks_x; + block_list[i].mpilocal_coordinate[2] = (i / (nblks_z)) % nblks_y; + block_list[i].mpilocal_coordinate[3] = i % nblks_z; + + /* global block coordinate */ + for(j = 0; j < 4; j++) { + block_list[i].coordinate[j] = nblks_dir[j] * g_proc_coords[j] + block_list[i].mpilocal_coordinate[j]; + } + /* even/odd id of block coordinate */ + block_list[i].evenodd = (block_list[i].coordinate[0] + block_list[i].coordinate[1] + + block_list[i].coordinate[2] + block_list[i].coordinate[3]) % 2; + + /* block_list[i].evenodd = i % 2; */ + if(g_proc_id == 0 && g_debug_level > 1) { + printf("%d %d (%d %d %d %d)\n", i, block_list[i].evenodd, block_list[i].coordinate[0], block_list[i].coordinate[1], block_list[i].coordinate[2], block_list[i].coordinate[3]); + } + if ((void*)(block_idx = calloc(8 * (VOLUME/nb_blocks), sizeof(int))) == NULL) + CALLOC_ERROR_CRASH; + + if ((void*)(block_evenidx = calloc(8 * (VOLUME/nb_blocks/2), sizeof(int))) == NULL) + CALLOC_ERROR_CRASH; + + if ((void*)(block_oddidx = calloc(8 * (VOLUME/nb_blocks/2), sizeof(int))) == NULL) + CALLOC_ERROR_CRASH; + + for (j = 0; j < g_N_s; j++) { /* write a zero element at the end of every spinor */ + _spinor_null(block_list[i].basis[j][VOLUME/nb_blocks]); + } + + if ((void*)(block_list[i].little_dirac_operator = calloc(9 * g_N_s * g_N_s, sizeof(_Complex double))) == NULL) + CALLOC_ERROR_CRASH; + if ((void*)(block_list[i].little_dirac_operator32 = calloc(9 * g_N_s * g_N_s, sizeof(_Complex float))) == NULL) + CALLOC_ERROR_CRASH; + if ((void*)(block_list[i].little_dirac_operator_eo = calloc(9*g_N_s * g_N_s, 
sizeof(_Complex double))) == NULL) + CALLOC_ERROR_CRASH; + for (j = 0; j < 9 * g_N_s * g_N_s; ++j) { + block_list[i].little_dirac_operator[j] = 0.0; + block_list[i].little_dirac_operator32[j] = 0.0; + block_list[i].little_dirac_operator_eo[j] = 0.0; + } + } + + + + init_blocks_geometry(); + init_blocks_gaugefield(); + + return 0; +} + +int free_blocks() { + int i; + if(block_init == 1) { + for(i = 0; i < nb_blocks; ++i) { + free(block_list[i].basis); + free(block_list[i].little_dirac_operator); + free(block_list[i].little_dirac_operator32); + free(block_list[i].little_dirac_operator_eo); + } + free(block_ipt); + free(bipt__); + free(bipt_); + free(bipt); + free(index_block_eo); + free(u); + free(basis); + free(block_list); + block_init = 0; + } + return 0; +} +int init_blocks_gaugefield() { + /* + Copies the existing gauge field on the processor into the separate blocks in a form + that is readable by the block Dirac operator. Specifically, in consecutive memory + now +t,-t,+x,-x,+y,-y,+z,-z gauge links are stored. This requires double the storage in + memory. 
+ */ + + int i, x, y, z, t, ix, ix_new = 0; + int bx, by, bz, bt; + + for (t = 0; t < dT; t++) { + for (x = 0; x < dX; x++) { + for (y = 0; y < dY; y++) { + for (z = 0; z < dZ; z++) { + i = 0; + for(bt = 0; bt < nblks_t; bt ++) { + for(bx = 0; bx < nblks_x; bx ++) { + for(by = 0; by < nblks_y; by ++) { + for(bz = 0; bz < nblks_z; bz ++) { + ix = g_ipt[t + bt*dT][x + bx*dX][y + by*dY][z + bz*dZ]; + memcpy(block_list[i].u + ix_new, &g_gauge_field[ ix ][0], sizeof(su3)); + memcpy(block_list[i].u + ix_new + 1, &g_gauge_field[ g_idn[ix][0] ][0], sizeof(su3)); + memcpy(block_list[i].u + ix_new + 2, &g_gauge_field[ ix ][1], sizeof(su3)); + memcpy(block_list[i].u + ix_new + 3, &g_gauge_field[ g_idn[ix][1] ][1], sizeof(su3)); + memcpy(block_list[i].u + ix_new + 4, &g_gauge_field[ ix ][2], sizeof(su3)); + memcpy(block_list[i].u + ix_new + 5, &g_gauge_field[ g_idn[ix][2] ][2], sizeof(su3)); + memcpy(block_list[i].u + ix_new + 6, &g_gauge_field[ ix ][3], sizeof(su3)); + memcpy(block_list[i].u + ix_new + 7, &g_gauge_field[ g_idn[ix][3] ][3], sizeof(su3)); + i++; + } + } + } + } + ix_new += 8; + } + } + } + } + blk_gauge_eo = 0; + return(0); +} + +int init_blocks_eo_gaugefield() { + /* + Copies the existing gauge field on the processor into the separate blocks in a form + that is readable by the block Hopping matrix. Specifically, in consecutive memory + now +t,-t,+x,-x,+y,-y,+z,-z gauge links are stored. This requires double the storage in + memory. 
+ */ + + int i, x, y, z, t, ix, ix_even = 0, ix_odd = (dT*dX*dY*dZ*8)/2, ixeo; + int bx, by, bz, bt, even=0; + + for (t = 0; t < dT; t++) { + for (x = 0; x < dX; x++) { + for (y = 0; y < dY; y++) { + for (z = 0; z < dZ; z++) { + if((t+x+y+z)%2 == 0) { + even = 1; + ixeo = ix_even; + } + else { + even = 0; + ixeo = ix_odd; + } + i = 0; + for(bt = 0; bt < nblks_t; bt ++) { + for(bx = 0; bx < nblks_x; bx ++) { + for(by = 0; by < nblks_y; by ++) { + for(bz = 0; bz < nblks_z; bz ++) { + ix = g_ipt[t + bt*dT][x + bx*dX][y + by*dY][z + bz*dZ]; + memcpy(block_list[i].u + ixeo, &g_gauge_field[ ix ][0], sizeof(su3)); + memcpy(block_list[i].u + ixeo + 1, &g_gauge_field[ g_idn[ix][0] ][0], sizeof(su3)); + memcpy(block_list[i].u + ixeo + 2, &g_gauge_field[ ix ][1], sizeof(su3)); + memcpy(block_list[i].u + ixeo + 3, &g_gauge_field[ g_idn[ix][1] ][1], sizeof(su3)); + memcpy(block_list[i].u + ixeo + 4, &g_gauge_field[ ix ][2], sizeof(su3)); + memcpy(block_list[i].u + ixeo + 5, &g_gauge_field[ g_idn[ix][2] ][2], sizeof(su3)); + memcpy(block_list[i].u + ixeo + 6, &g_gauge_field[ ix ][3], sizeof(su3)); + memcpy(block_list[i].u + ixeo + 7, &g_gauge_field[ g_idn[ix][3] ][3], sizeof(su3)); + i++; + } + } + } + } + if(even) ix_even += 8; + else ix_odd += 8; + } + } + } + } + blk_gauge_eo = 1; + return(0); +} + + +int check_blocks_geometry(block * blk) { + int i, k=0, x, y, z, t; + int * itest; + int * ipt; + ipt = blk->idx; + itest = (int*)calloc(blk->volume + blk->spinpad, sizeof(int)); + for(i = 0; i < 8*blk->volume; i++) { + if(*ipt > blk->volume + blk->spinpad-1 || *ipt < 0) { + if(g_proc_id == 0) { + printf("error in block geometry! 
ipt = %d dir = %d i = %d of %d\n", + (*ipt), i%8, i/8, blk->volume + blk->spinpad); + } + } + + itest[*(ipt++)]++; + } + + for(i = 0; i < blk->volume; i++) { + k += itest[i]; + if(itest[i] < 1 || itest[i] > 8) { + if(g_proc_id == 0) { + printf("error in block geometry, itest[%d] = %d\n", i, itest[i]); + } + } + } + + if(itest[blk->volume + blk->spinpad-1] != 2*(blk->BLX*blk->BLY*blk->BLZ+blk->BT*blk->BLX*blk->BLY+blk->BT*blk->BLY*blk->BLZ+blk->BT*blk->BLX*blk->BLZ)) { + if(g_proc_id == 0){ + printf("error in block geometry, boundary points wrong %d != %d\n", + itest[blk->volume + blk->spinpad-1], 2*(blk->BLX*blk->BLY*blk->BLZ+blk->BT*blk->BLX*blk->BLY+blk->BT*blk->BLY*blk->BLZ+blk->BT*blk->BLX*blk->BLZ)); + } + } + k+= itest[blk->volume + blk->spinpad-1]; + if(k != 8*blk->volume) { + if(g_proc_id == 0){ + printf("error in block geometry, total number of points wrong %d != %d\n", + k, 8*blk->volume); + } + } + + ipt = blk->idx; + for(t = 0; t < T/nblks_t; t++) { + for(x = 0; x < LX/nblks_x; x++) { + for(y = 0; y < LY/nblks_y; y++) { + for(z = 0; z < LZ/nblks_z; z++) { + i = block_ipt[t][x][y][z]; + if(t != T/nblks_t-1) { + if(*ipt != block_ipt[t+1][x][y][z] && g_proc_id == 0) + printf("Shit +t! %d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t+1][x][y][z], i); + } + else if(*ipt != VOLUME/nb_blocks) + printf("Shit +t! %d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks, i); + ipt++; + if(t != 0) { + if(*ipt != block_ipt[t-1][x][y][z] && g_proc_id == 0) + printf("Shit -t! %d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t+1][x][y][z], i); + } + else if(*ipt != VOLUME/nb_blocks) + printf("Shit -t! %d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks, i); + ipt++; + if(x != LX/nblks_x-1) { + if(*ipt != block_ipt[t][x+1][y][z] && g_proc_id == 0) + printf("Shit +x! %d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t][x+1][y][z], i); + } + else if(*ipt != VOLUME/nb_blocks) + printf("Shit +x! 
%d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks, i); + ipt++; + if(x != 0) { + if(*ipt != block_ipt[t][x-1][y][z] && g_proc_id == 0) + printf("Shit -x! %d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t][x-1][y][z], i); + } + else if(*ipt != VOLUME/nb_blocks) + printf("Shit -x! %d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks, i); + ipt++; + if(y != LY/nblks_y-1) { + if(*ipt != block_ipt[t][x][y+1][z] && g_proc_id == 0) + printf("Shit +y! %d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t][x][y+1][z], i); + } + else if(*ipt != VOLUME/nb_blocks) + printf("Shit +y! %d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks, i); + ipt++; + if(y != 0) { + if(*ipt != block_ipt[t][x][y-1][z] && g_proc_id == 0) + printf("Shit -y! %d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t][x][y-1][z], i); + } + else if(*ipt != VOLUME/nb_blocks) + printf("Shit -y! %d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks, i); + ipt++; + if(z != LZ/nblks_z-1) { + if(*ipt != block_ipt[t][x][y][z+1] && g_proc_id == 0) + printf("Shit +z! %d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t][x][y][z+1], i); + } + else if(*ipt != VOLUME/nb_blocks) + printf("Shit +z! %d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks, i); + ipt++; + if(z != 0) { + if(*ipt != block_ipt[t][x][y][z-1] && g_proc_id == 0) + printf("Shit -z! %d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t][x][y][z-1], i); + } + else if(*ipt != VOLUME/nb_blocks) + printf("Shit -z! 
%d %d %d %d %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks, i); + ipt++; + } + } + } + } + + if(g_proc_id == 0 && g_debug_level > 1) { + printf("# block geometry checked successfully for block %d !\n", blk->id); + } + for(i = 0; i < blk->volume; i++) { + itest[i] = 0; + } + ipt = blk->evenidx; + for(i = 0; i < 8*blk->volume/2; i++) { + if(*ipt > (blk->volume/2 + blk->spinpad)-1 || *ipt < 0) { + if(g_proc_id == 0) { + printf("error in block eo geometry! ipt = %d dir = %d i = %d of %d\n", + (*ipt), i%8, i/8, (blk->volume/2 + blk->spinpad)); + } + } + + itest[*(ipt++)]++; + } + + k = 0; + for(i = 0; i < blk->volume/2; i++) { + k += itest[i]; + if(itest[i] < 1 || itest[i] > 8) { + if(g_proc_id == 0) { + printf("error in block eo geometry, itest[%d] = %d\n", i, itest[i]); + } + } + } + k += itest[blk->volume/2 + blk->spinpad-1]; + if(k != 8*blk->volume/2) { + if(g_proc_id == 0) { + printf("error in block eo geometry, total number of points wrong %d != %d\n", + k, 8*blk->volume/2); + } + } + + ipt = blk->evenidx; + for(t = 0; t < T/nblks_t; t++) { + for(x = 0; x < LX/nblks_x; x++) { + for(y = 0; y < LY/nblks_y; y++) { + for(z = 0; z < LZ/nblks_z; z++) { + if((x + y + z + t)%2 == 0) { + i = block_ipt[t][x][y][z]/2; + if(t != T/nblks_t-1) { + if(*ipt != block_ipt[t+1][x][y][z]/2 && g_proc_id == 0) + printf("Shit +t! (%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t+1][x][y][z]/2, i); + } + else if(*ipt != VOLUME/nb_blocks/2) + printf("Shit +t! (%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks/2, i); + ipt++; + if(t != 0) { + if(*ipt != block_ipt[t-1][x][y][z]/2 && g_proc_id == 0) + printf("Shit -t! (%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t+1][x][y][z]/2, i); + } + else if(*ipt != VOLUME/nb_blocks/2) + printf("Shit -t! (%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks/2, i); + ipt++; + if(x != LX/nblks_x-1) { + if(*ipt != block_ipt[t][x+1][y][z]/2 && g_proc_id == 0) + printf("Shit +x! 
(%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t][x+1][y][z]/2, i); + } + else if(*ipt != VOLUME/nb_blocks/2) + printf("Shit +x! (%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks/2, i); + ipt++; + if(x != 0) { + if(*ipt != block_ipt[t][x-1][y][z]/2 && g_proc_id == 0) + printf("Shit -x! (%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t][x-1][y][z]/2, i); + } + else if(*ipt != VOLUME/nb_blocks/2) + printf("Shit -x! (%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks/2, i); /* print the even/odd boundary index that the test compares against, as the other seven directions do */ + ipt++; + if(y != LY/nblks_y-1) { + if(*ipt != block_ipt[t][x][y+1][z]/2 && g_proc_id == 0) + printf("Shit +y! (%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t][x][y+1][z]/2, i); + } + else if(*ipt != VOLUME/nb_blocks/2) + printf("Shit +y! (%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks/2, i); + ipt++; + if(y != 0) { + if(*ipt != block_ipt[t][x][y-1][z]/2 && g_proc_id == 0) + printf("Shit -y! (%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t][x][y-1][z]/2, i); + } + else if(*ipt != VOLUME/nb_blocks/2) + printf("Shit -y! (%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks/2, i); + ipt++; + if(z != LZ/nblks_z-1) { + if(*ipt != block_ipt[t][x][y][z+1]/2 && g_proc_id == 0) + printf("Shit +z! (%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t][x][y][z+1]/2, i); + } + else if(*ipt != VOLUME/nb_blocks/2) + printf("Shit +z! (%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks/2, i); + ipt++; + if(z != 0) { + if(*ipt != block_ipt[t][x][y][z-1]/2 && g_proc_id == 0) + printf("Shit -z! (%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, block_ipt[t][x][y][z-1]/2, i); + } + else if(*ipt != VOLUME/nb_blocks/2) + printf("Shit -z! 
(%d %d %d %d): %d != %d at %d\n", + t, x, y, z, *ipt, VOLUME/nb_blocks/2, i); + ipt++; + } + } + } + } + } + + if(g_proc_id == 0 && g_debug_level > 1) { + printf("# block eo geometry checked successfully for block %d !\n", blk->id); + } + + free(itest); + return(0); +} + +int init_blocks_geometry() { + int i, ix, x, y, z, t, eo, i_even, i_odd; + int zstride = 1; + int ystride = dZ; + int xstride = dY * dZ; + int tstride = dX * dY * dZ; + int boundidx = VOLUME/nb_blocks; + for (ix = 0; ix < VOLUME/nb_blocks; ++ix) { + block_idx[8 * ix + 0] = ix >= VOLUME/nb_blocks - tstride ? boundidx : ix + tstride;/* +t */ + block_idx[8 * ix + 1] = ix < tstride ? boundidx : ix - tstride;/* -t */ + block_idx[8 * ix + 2] = (ix % tstride >= dZ * dY * (dX - 1) ? boundidx : ix + xstride);/* +x */ + block_idx[8 * ix + 3] = ix % tstride < dZ * dY ? boundidx : ix - xstride;/* -x */ + block_idx[8 * ix + 4] = (ix % xstride >= dZ * (dY - 1) ? boundidx : ix + ystride);/* +y */ + block_idx[8 * ix + 5] = ix % xstride < dZ ? boundidx : ix - ystride;/* -y */ + block_idx[8 * ix + 6] = ix % ystride == dZ - 1 ? boundidx : ix + zstride;/* +z */ + block_idx[8 * ix + 7] = ix % ystride == 0 ? 
boundidx : ix - zstride;/* -z */ + /* Assume that all directions have even extension */ + /* even and odd versions should be equal */ + eo = ((ix%dZ)+(ix/ystride)%dY+(ix/(xstride))%dX + +ix/(tstride))%2; + if(eo == 0) { + block_evenidx[8*(ix/2) + 0] = block_idx[8 * ix + 0] / 2; + block_evenidx[8*(ix/2) + 1] = block_idx[8 * ix + 1] / 2; + block_evenidx[8*(ix/2) + 2] = block_idx[8 * ix + 2] / 2; + block_evenidx[8*(ix/2) + 3] = block_idx[8 * ix + 3] / 2; + block_evenidx[8*(ix/2) + 4] = block_idx[8 * ix + 4] / 2; + block_evenidx[8*(ix/2) + 5] = block_idx[8 * ix + 5] / 2; + block_evenidx[8*(ix/2) + 6] = block_idx[8 * ix + 6] / 2; + block_evenidx[8*(ix/2) + 7] = block_idx[8 * ix + 7] / 2; + } + else { + block_oddidx[8*(ix/2) + 0] = block_idx[8 * ix + 0] / 2; + block_oddidx[8*(ix/2) + 1] = block_idx[8 * ix + 1] / 2; + block_oddidx[8*(ix/2) + 2] = block_idx[8 * ix + 2] / 2; + block_oddidx[8*(ix/2) + 3] = block_idx[8 * ix + 3] / 2; + block_oddidx[8*(ix/2) + 4] = block_idx[8 * ix + 4] / 2; + block_oddidx[8*(ix/2) + 5] = block_idx[8 * ix + 5] / 2; + block_oddidx[8*(ix/2) + 6] = block_idx[8 * ix + 6] / 2; + block_oddidx[8*(ix/2) + 7] = block_idx[8 * ix + 7] / 2; + } + } + for(i = 0; i < nb_blocks; i++) { + block_list[i].idx = block_idx; + block_list[i].evenidx = block_evenidx; + block_list[i].oddidx = block_oddidx; + } + ix = 0; + for(t = 0; t < dT; t++) { + for(x = 0; x < dX; x++) { + for(y = 0; y < dY; y++) { + for(z = 0; z < dZ; z++) { + block_ipt[t][x][y][z] = ix; + ix++; + } + } + } + } + + i_even = 0; + i_odd = 0; + for (t=0;tbasis[i], parent->volume, 0)); + mul_r(parent->basis[i], scale, parent->basis[i], parent->volume); + + /* rescaling done, now subtract this direction from all vectors that follow */ + for(j = i + 1; j < g_N_s; ++j){ + coeff = scalar_prod(parent->basis[i], parent->basis[j], parent->volume, 0); + assign_diff_mul(parent->basis[j], parent->basis[i], coeff, parent->volume); + } + } + + if(g_debug_level > 4) { + for(i = 0; i < g_N_s; i++) { + for(j = 0; j 
< g_N_s; j++) { + coeff = scalar_prod(parent->basis[i], parent->basis[j], parent->volume, 0); + if(g_proc_id == 0) printf("basis id = %d <%d, %d> = %1.3e +i %1.3e\n", parent->id, j, i, creal(coeff), cimag(coeff)); + } + } + } + return; +} + +void block_orthonormalize_free(block *parent) { + int i, j; + _Complex double coeff; + double scale; + + for(i = 0; i < 12; i++){ // CHECK THIS !!!!!! 12 + /* rescale the current vector */ + constant_spinor_field(parent->basis[i], i, parent->volume); + scale = 1. / sqrt(square_norm(parent->basis[i], parent->volume, 0)); + mul_r(parent->basis[i], scale, parent->basis[i], parent->volume); + } + + if(g_debug_level > 4 && g_proc_id == 0) { + for(i = 0; i < g_N_s; i++) { + for(j = 0; j < g_N_s; j++) { + coeff = scalar_prod(parent->basis[i], parent->basis[j], parent->volume, 0); + if(g_proc_id == 0) printf("basis id = %d <%d, %d> = %1.3e +i %1.3e\n", parent->id, j, i, creal(coeff), cimag(coeff)); + } + } + } + return; +} + + + +/* the following 2 functions are reference functions for computing little_d */ +/* but much slower than block_compute_little_D_diagonal and */ +/* block_compute_little_D_offdiagonal */ +void block_contract_basis(int const idx, int const vecnum, int const dir, spinor * const psi){ + int l; + for(l = 0; l < g_N_s; ++l) { + block_list[idx].little_dirac_operator[dir * g_N_s * g_N_s + vecnum * g_N_s + l] = + scalar_prod(block_list[idx].basis[l], psi + idx * (VOLUME/nb_blocks+1), VOLUME/nb_blocks, 0); + } +} + +void alt_block_compute_little_D() { + int i, j, k, l; + spinor *_rec, *rec, *_app, *app, *zero; + spinor *psi, **psi_blocks; + + _rec = calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); +#if ( defined SSE || defined SSE2 || defined SSE3) + rec = (spinor*)(((unsigned long int)(_rec)+ALIGN_BASE)&~ALIGN_BASE); +#else + rec = _rec; +#endif + _app = calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); +#if ( defined SSE || defined SSE2 || defined SSE3) + app = (spinor*)(((unsigned long int)(_app)+ALIGN_BASE)&~ALIGN_BASE); +#else + 
app = _app; +#endif + zero = calloc(VOLUMEPLUSRAND, sizeof(spinor)); + psi = calloc(VOLUME+nb_blocks, sizeof(spinor)); + psi_blocks = (spinor**)calloc(nb_blocks, sizeof(spinor*)); + for(i=0;i -1) { + if (g_N_s <= 5 && g_cart_id == 0){ + printf("\n\n *** CHECKING LITTLE D ***\n"); + printf("\n ** node 0, lower block **\n"); + for (i = 0*g_N_s; i < 9 * g_N_s; ++i){ + printf(" [ "); + for (j = 0; j < g_N_s; ++j){ + printf("%s%1.3e %s %1.3e i", creal(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? " " : "- ", creal(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? creal(block_list[0].little_dirac_operator[i * g_N_s + j]) : -creal(block_list[0].little_dirac_operator[i * g_N_s + j]), cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? "+" : "-", cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) : -cimag(block_list[0].little_dirac_operator[i * g_N_s + j])); + if (j != g_N_s - 1){ + printf(",\t"); + } + } + printf(" ]\n"); + if ((i % g_N_s) == (g_N_s - 1)) + printf("\n"); + } + + printf("\n\n *** CHECKING LITTLE D ***\n"); + printf("\n ** node 0, upper block **\n"); + for (i = 0*g_N_s; i < 9 * g_N_s; ++i){ + printf(" [ "); + for (j = 0; j < g_N_s; ++j){ + printf("%s%1.3e %s %1.3e i", creal(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? " " : "- ", creal(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? creal(block_list[1].little_dirac_operator[i * g_N_s + j]) : -creal(block_list[1].little_dirac_operator[i * g_N_s + j]), cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? "+" : "-", cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? 
cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) : -cimag(block_list[1].little_dirac_operator[i * g_N_s + j])); + if (j != g_N_s - 1){ + printf(",\t"); + } + } + printf(" ]\n"); + if ((i % g_N_s) == (g_N_s - 1)) + printf("\n"); + } + } + } + + free(_rec); + free(_app); + free(zero); + free(psi); free(psi_blocks); /* release the per-block pointer table obtained from calloc above; it was previously leaked on every call */ +} + + +/* checked CU */ +void compute_little_D_diagonal() { + int i,j, blk; + spinor * tmp, * _tmp; + _Complex double * M; + _tmp = calloc( block_list[0].volume + block_list[0].spinpad + 1, sizeof(spinor)); +#if ( defined SSE || defined SSE2 || defined SSE3) + tmp = (spinor*)(((unsigned long int)(_tmp)+ALIGN_BASE)&~ALIGN_BASE); +#else + tmp = _tmp; +#endif + + for(blk = 0; blk < nb_blocks; blk++) { + M = block_list[blk].little_dirac_operator; + for(i = 0; i < g_N_s; i++) { + Block_D_psi(&block_list[blk], tmp, block_list[blk].basis[i]); + for(j = 0; j < g_N_s; j++) { + M[i * g_N_s + j] = scalar_prod(block_list[blk].basis[j], tmp, block_list[blk].volume, 0); + block_list[blk].little_dirac_operator32[i*g_N_s + j] = M[i * g_N_s + j]; + } + } + } + free(_tmp); + return; +} + + +/* what happens if this routine is called in a one dimensional parallelisation? */ +/* or even serially ? 
*/ +/* checked CU */ +void compute_little_D() { + /* + This is the little dirac routine rewritten according to multidimensional blocking + Adaptation by Claude Tadonki (claude.tadonki@u-psud.fr) + Date: May 2010 + */ + spinor *scratch, * temp, *_scratch; + spinor *r, *s; + su3 * u; + int x, y, z=0, t, ix, iy=0, i, j, pm, mu=0, blk; + int t_start, t_end, x_start, x_end, y_start, y_end, z_start, z_end; + _Complex double c, *M; + int count=0; + int bx, by, bz, bt, block_id = 0, block_id_e, block_id_o,is_up = 0, ib; + int dT, dX, dY, dZ; + dT = T/nblks_t; dX = LX/nblks_x; dY = LY/nblks_y; dZ = LZ/nblks_z; + + if(g_proc_id == 0) printf("||-----------------------\n||compute_little_D\n||-----------------------\n"); + + + /* for a full spinor field we need VOLUMEPLUSRAND */ + /* because we use the same geometry as for the */ + /* gauge field */ + /* It is VOLUME + 2*LZ*(LY*LX + T*LY + T*LX) + 4*LZ*(LY + T + LX) */ + _scratch = calloc(2*VOLUMEPLUSRAND+1, sizeof(spinor)); +#if ( defined SSE || defined SSE2 || defined SSE3) + scratch = (spinor*)(((unsigned long int)(_scratch)+ALIGN_BASE)&~ALIGN_BASE); +#else + scratch = _scratch; +#endif + temp = scratch + VOLUMEPLUSRAND; + // NEED TO BE REWRITTEN + block_id_e = 0; + block_id_o = 0; + for(blk = 0; blk < nb_blocks; blk++) { + M = block_list[blk].little_dirac_operator; + for(i = 0; i < g_N_s; i++) { + Block_D_psi(&block_list[blk], scratch, block_list[blk].basis[i]); + for(j = 0; j < g_N_s; j++) { + M[i * g_N_s + j] = scalar_prod(block_list[blk].basis[j], scratch, block_list[blk].volume, 0); + + if (block_list[blk].evenodd==0) { + block_list[block_id_e].little_dirac_operator_eo[i * g_N_s + j] = M[i * g_N_s + j]; + } + if (block_list[blk].evenodd==1) { + block_list[(nb_blocks/2)+block_id_o].little_dirac_operator_eo[i * g_N_s + j] = M[i * g_N_s + j]; + } + } + } + if (block_list[blk].evenodd==0) block_id_e++; + if (block_list[blk].evenodd==1) block_id_o++; + } + + /* computation of little_Dhat^{-1}_ee */ + + for(blk = 0; blk < 
nb_blocks/2; blk++) { + LUInvert(g_N_s,block_list[blk].little_dirac_operator_eo,g_N_s); + } + for (i = 0; i < g_N_s; i++) { + if(i==0) count = 0; + reconstruct_global_field_GEN_ID(scratch, block_list, i , nb_blocks); + +#ifdef MPI + xchange_lexicfield(scratch); +#endif + + /* the initialisation causes troubles on a single processor */ + if(g_nproc == -1) zero_spinor_field(scratch, VOLUME); + /* +-t +-x +-y +-z */ + for(pm = 0; pm < 8; pm++) { + /* We set up the generic bounds */ + t_start = 0; t_end = dT; + x_start = 0; x_end = dX; + y_start = 0; y_end = dY; + z_start = 0; z_end = dZ; + switch(pm){ + case 0: t_start = dT - 1; t_end = t_start + 1; mu = 0; is_up = 1; break; /* Boundary in direction +t */ + case 1: t_start = 0; t_end = t_start + 1; mu = 0; is_up = 0; break; /* Boundary in direction -t */ + case 2: x_start = dX - 1; x_end = x_start + 1; mu = 1; is_up = 1; break; /* Boundary in direction +x */ + case 3: x_start = 0; x_end = x_start + 1; mu = 1; is_up = 0; break; /* Boundary in direction -x */ + case 4: y_start = dY - 1; y_end = y_start + 1; mu = 2; is_up = 1; break; /* Boundary in direction +y */ + case 5: y_start = 0; y_end = y_start + 1; mu = 2; is_up = 0; break; /* Boundary in direction -y */ + case 6: z_start = dZ - 1; z_end = z_start + 1; mu = 3; is_up = 1; break; /* Boundary in direction +z */ + case 7: z_start = 0; z_end = z_start + 1; mu = 3; is_up = 0; break; /* Boundary in direction -z */ + default: ; + } + /* Dirac operator on the boundaries */ + r = temp; + for(bt = 0; bt < nblks_t; bt++) { + for(bx = 0; bx < nblks_x; bx++) { + for(by = 0; by < nblks_y; by++) { + for(bz = 0; bz < nblks_z; bz++) { + for(t = t_start; t < t_end; t++) { + for(x = x_start; x < x_end; x++) { + for(y = y_start; y < y_end; y++) { + for(z = z_start; z < z_end; z++) { + /* We treat the case when we need to cross between blocks */ + /* We are in block (bt, bx, by, bz) and compute direction pm */ + /* We check inner block statement by ( b_ > 0 )&&( b_ < nblks_ - 1 ) */ 
+ /* Other cases are threated in a standard way using the boundary of the scracth array */ + ib = -1; /* ib is the index of the selected block if any */ + if((pm==0)&&(bt0)&&(t==0)){ //direction -t + iy = index_b(dT - 1, x, y, z); /* highest edge of lower block needed */ + ib = block_index(bt-1, bx, by, bz); + } + else if((pm==2)&&(bx0)&&(x==0)){ //direction -x + iy = index_b(t, dX - 1, y, z); /* highest edge of lower block needed */ + ib = block_index(bt, bx-1, by, bz); + } + else if((pm==4)&&(by0)&&(y==0)){ //direction -y + iy = index_b(t, x, dY - 1, z); /* highest edge of lower block needed */ + ib = block_index(bt, bx, by-1, bz); + } + else if((pm==6)&&(bz0)&&(z==0)){ //direction -z + iy = index_b(t, x, y, dZ - 1); /* highest edge of lower block needed */ + ib = block_index(bt, bx, by, bz-1); + } + ix = index_a(dT*bt + t, dX*bx + x, dY*by + y, dZ*bz + z);// GAFFE ICI + if(is_up == 1) { + s = &scratch[ g_iup[ ix ][mu] ]; + u = &g_gauge_field[ ix ][mu]; + } + else { + s = &scratch[ g_idn[ ix ][mu] ]; + u = &g_gauge_field[ g_idn[ix][mu] ][mu]; + } + if(ib >= 0) s = &block_list[ib].basis[ i ][ iy ] ; + boundary_D[pm](r, s, u); + r++; + } + } + } + } + } + } + } + } + + /* Now all the scalar products */ + for(j = 0; j < g_N_s; j++) { + iy = i * g_N_s + j + (pm + 1) * g_N_s * g_N_s; + block_id = 0; + block_id_e=0; + block_id_o=0; + r = temp; + for(bt = 0; bt < nblks_t; bt++) { + for(bx = 0; bx < nblks_x; bx++) { + for(by = 0; by < nblks_y; by++) { + for(bz = 0; bz < nblks_z; bz++){ + block_list[block_id].little_dirac_operator[ iy ] = 0.0; + if (block_list[block_id].evenodd==0) {block_list[block_id_e].little_dirac_operator_eo[ iy ] = 0.0;} + if (block_list[block_id].evenodd==1) {block_list[block_id_o+nb_blocks/2].little_dirac_operator_eo[ iy ] = 0.0;} + /* We need to contract g_N_s times with the same set of fields */ + for(t = t_start; t < t_end; t++) { + for(x = x_start; x < x_end; x++) { + for(y = y_start; y < y_end; y++) { + for(z = z_start; z < z_end; z++) { + ix 
= index_b(t, x, y, z); // TO BE INLINED + s = &block_list[block_id].basis[j][ ix ]; + c = scalar_prod(s, r, 1, 0);// TO BE INLINED + block_list[block_id].little_dirac_operator[ iy ] += c; + if (block_list[block_id].evenodd==0) { + block_list[block_id_e].little_dirac_operator_eo[ iy ] += c; + } + if (block_list[block_id].evenodd==1) { + block_list[block_id_o+nb_blocks/2].little_dirac_operator_eo[ iy ] += c; + } + r++; + } + + } + } + } + if (block_list[block_id].evenodd==0) block_id_e++; + if (block_list[block_id].evenodd==1) block_id_o++; + block_id++; + } + } + } + } + } + } + } + for(i = 0; i < nb_blocks; i++) + for(j = 0; j < 9 * g_N_s * g_N_s; j++) + block_list[i].little_dirac_operator32[j] = (_Complex float)block_list[i].little_dirac_operator[ j ]; /* element-wise single-precision copy: index with j; iy is a stale offset left over from the contraction loops above */ + + if(g_debug_level > 3) { + if (g_N_s <= 5 && !g_cart_id){ + printf("\n\n *** CHECKING LITTLE D ***\n"); + printf("\n ** node 0, lower block **\n"); + for (i = 0*g_N_s; i < 9 * g_N_s; ++i){ + printf(" [ "); + for (j = 0; j < g_N_s; ++j){ + printf("%s%1.3e %s %1.3e i", creal(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? " " : "- ", creal(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? creal(block_list[0].little_dirac_operator[i * g_N_s + j]) : -creal(block_list[0].little_dirac_operator[i * g_N_s + j]), cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? "+" : "-", cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) >= 0 ? cimag(block_list[0].little_dirac_operator[i * g_N_s + j]) : -cimag(block_list[0].little_dirac_operator[i * g_N_s + j])); + if (j != g_N_s - 1){ + printf(",\t"); + } + } + printf(" ]\n"); + if ((i % g_N_s) == (g_N_s - 1)) + printf("\n"); + } + + printf("\n\n *** CHECKING LITTLE D ***\n"); + printf("\n ** node 0, upper block **\n"); + for (i = 0*g_N_s; i < 9 * g_N_s; ++i){ + printf(" [ "); + for (j = 0; j < g_N_s; ++j){ + printf("%s%1.3e %s %1.3e i", creal(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? 
" " : "- ", creal(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? creal(block_list[1].little_dirac_operator[i * g_N_s + j]) : -creal(block_list[1].little_dirac_operator[i * g_N_s + j]), cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? "+" : "-", cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) >= 0 ? cimag(block_list[1].little_dirac_operator[i * g_N_s + j]) : -cimag(block_list[1].little_dirac_operator[i * g_N_s + j])); + if (j != g_N_s - 1){ + printf(",\t"); + } + } + printf(" ]\n"); + if ((i % g_N_s) == (g_N_s - 1)) + printf("\n"); + + } + } + } + + free(_scratch); + return; +} + + +int split_global_field_GEN(spinor ** const psi, spinor * const field, const int nb_blocks) { + int j,ctr_t=0; + int x, y, z, t; + int bx, by, bz, bt, block_id; + for (t = 0; t < dT; t++) { + for (x = 0; x < dX; x++) { + for (y = 0; y < dY; y++) { + for (z = 0; z < dZ; z++) { + block_id = 0; + for(bt = 0; bt < nblks_t; bt++) { + for(bx = 0; bx < nblks_x; bx++) { + for(by = 0; by < nblks_y; by++) { + for(bz = 0; bz < nblks_z; bz++) { + _spinor_assign(*(psi[block_id] + ctr_t), + *(field + index_a(dT*bt + t, dX*bx + x, dY*by + y, dZ*bz + z))); + block_id++; + } + } + } + } + ctr_t++; + } + } + } + } + + if(g_proc_id == 0 && g_debug_level > 8) { + for(j = 0; j < nb_blocks; j++) + printf("Basis norm %2d = %1.3e\n", j, square_norm(psi[j], VOLUME / nb_blocks, 0)); + } + return 0; +} + +int split_global_field_GEN_ID(block * const block_list, const int id, spinor * const field, const int nb_blocks){ + int j,ctr_t=0; + int x, y, z, t; + int bx, by, bz, bt, block_id; + for (t = 0; t < dT; t++) { + for (x = 0; x < dX; x++) { + for (y = 0; y < dY; y++) { + for (z = 0; z < dZ; z++) { + block_id = 0; + for(bt = 0; bt < nblks_t; bt++) { + for(bx = 0; bx < nblks_x; bx++) { + for(by = 0; by < nblks_y; by++) { + for(bz = 0; bz < nblks_z; bz++) { + _spinor_assign(*(block_list[block_id].basis[id] + ctr_t), + *(field + index_a(dT*bt + t, dX*bx + x, dY*by + y, dZ*bz + z))); 
+ block_id++; + } + } + } + } + ctr_t++; + } + } + } + } + + if(g_proc_id == 0 && g_debug_level > 8) { + for(j = 0; j < nb_blocks; j++) + printf("Basis norm %2d = %1.3e\n", j, square_norm(block_list[j].basis[id], VOLUME / nb_blocks, 0)); + } + return 0; +} + +/* copies the part of globalfields corresponding to block blk */ +/* to the block field blockfield */ +void copy_global_to_block(spinor * const blockfield, spinor * const globalfield, const int blk) { + int i,it,ix,iy,iz; + int ibt,ibx,iby,ibz; + int itb,ixb,iyb,izb; + int ixcurrent; + + ibz = blk%nblks_z; + iby = (blk / nblks_z)%nblks_y; + ibx = (blk / (nblks_y * nblks_z))%nblks_x; + ibt = blk / (nblks_x * nblks_y*nblks_z); + + ixcurrent=0; + for (i = 0; i < VOLUME; i++) { + + /* global coordinates */ + iz = i%LZ; + iy = (i / LZ)%LY; + ix = (i / (LY * LZ))%LX; + it = i / (LX * LY * LZ); + + /* block coordinates */ + izb = iz / block_list[blk].BLZ; + iyb = iy / block_list[blk].BLY; + ixb = ix / block_list[blk].BLX; + itb = it / block_list[blk].BT; + + if ((ibz == izb) && (iby == iyb) && (ibx == ixb) && (ibt==itb)) { + memcpy(blockfield+ixcurrent, globalfield+i, sizeof(spinor)); + ixcurrent++; + } + } + return; +} + +/* copies the part of globalfields corresponding to block blk */ +/* to the even and odd block fields */ +void copy_global_to_block_eo(spinor * const beven, spinor * const bodd, spinor * const globalfield, const int blk) { + int t, x, y, z; + int i,it,ix,iy,iz; + int even = 0, odd = 0; + + for(t = 0; t < block_list[blk].BT; t++) { + it = t + block_list[blk].mpilocal_coordinate[0]*block_list[blk].BT; + for(x = 0; x < block_list[blk].BLX; x++) { + ix = x + block_list[blk].mpilocal_coordinate[1]*block_list[blk].BLX; + for(y = 0; y < block_list[blk].BLY; y++) { + iy = y + block_list[blk].mpilocal_coordinate[2]*block_list[blk].BLY; + for(z = 0; z < block_list[blk].BLZ; z++) { + iz = z + block_list[blk].mpilocal_coordinate[3]*block_list[blk].BLZ; + i = g_ipt[it][ix][iy][iz]; + if((t+x+y+z)%2 == 0) { + 
memcpy(beven + even, globalfield + i, sizeof(spinor)); + even++; + } + else { + memcpy(bodd + odd, globalfield + i, sizeof(spinor)); + odd++; + } + } + } + } + } + return; +} + +/* reverts copy_global_to_block_eo */ +void copy_block_eo_to_global(spinor * const globalfield, spinor * const beven, spinor * const bodd, const int blk) { + int t, x, y, z; + int i,it,ix,iy,iz; + int even = 0, odd = 0; + + for(t = 0; t < block_list[blk].BT; t++) { + it = t + block_list[blk].mpilocal_coordinate[0]*block_list[blk].BT; + for(x = 0; x < block_list[blk].BLX; x++) { + ix = x + block_list[blk].mpilocal_coordinate[1]*block_list[blk].BLX; + for(y = 0; y < block_list[blk].BLY; y++) { + iy = y + block_list[blk].mpilocal_coordinate[2]*block_list[blk].BLY; + for(z = 0; z < block_list[blk].BLZ; z++) { + iz = z + block_list[blk].mpilocal_coordinate[3]*block_list[blk].BLZ; + i = g_ipt[it][ix][iy][iz]; + if((t+x+y+z)%2 == 0) { + memcpy(globalfield + i, beven + even, sizeof(spinor)); + even++; + } + else { + memcpy(globalfield + i, bodd + odd, sizeof(spinor)); + odd++; + } + } + } + } + } + return; +} + + +/* reconstructs the parts of globalfield corresponding to block blk */ +/* from block field blockfield */ +void copy_block_to_global(spinor * const globalfield, spinor * const blockfield, const int blk) { + int i,it,ix,iy,iz; + int ibt,ibx,iby,ibz; + int itb,ixb,iyb,izb; + int ixcurrent; + + ibz = blk%nblks_z; + iby = (blk / nblks_z)%nblks_y; + ibx = (blk / (nblks_y * nblks_z))%nblks_x; + ibt = blk / (nblks_x * nblks_y*nblks_z); + + ixcurrent=0; + for (i = 0; i < VOLUME; i++) { + + /* global coordinates */ + iz = i%LZ; + iy = (i / LZ)%LY; + ix = (i / (LY * LZ))%LX; + it = i / (LX * LY * LZ); + + /* block coordinates */ + izb = iz / block_list[blk].BLZ; + iyb = iy / block_list[blk].BLY; + ixb = ix / block_list[blk].BLX; + itb = it / block_list[blk].BT; + + if ((ibz == izb) && (iby == iyb) && (ibx == ixb) && (ibt==itb)) { + memcpy(globalfield+i, blockfield+ixcurrent, sizeof(spinor)); + 
ixcurrent++; + } + } + + return; +} + + + + +/* Reconstructs a global field from the little basis of nb_blocks blocks */ +void reconstruct_global_field_GEN(spinor * const rec_field, spinor ** const psi, const int nb_blocks) { + int ctr_t=0; + int x, y, z, t; + int bx, by, bz, bt, block_id; + for (t = 0; t < dT; t++) { + for (x = 0; x < dX; x++) { + for (y = 0; y < dY; y++) { + for (z = 0; z < dZ; z++) { + block_id = 0; + for(bt = 0; bt < nblks_t; bt++) { + for(bx = 0; bx < nblks_x; bx++) { + for(by = 0; by < nblks_y; by++) { + for(bz = 0; bz < nblks_z; bz++) { + _spinor_assign(*(rec_field + index_a(dT*bt + t, dX*bx + x, dY*by + y, dZ*bz + z)), + *(psi[block_id] + ctr_t)); + block_id++; + } + } + } + } + ctr_t++; + } + } + } + } + return; +} + +/* Reconstructs a global field from the little basis of nb_blocks blocks taken from block_list[*].basis[id] */ +void reconstruct_global_field_GEN_ID(spinor * const rec_field, block * const block_list, const int id, const int nb_blocks) { + int ctr_t=0; + int x, y, z, t; + int bx, by, bz, bt, block_id; + for (t = 0; t < dT; t++) { + for (x = 0; x < dX; x++) { + for (y = 0; y < dY; y++) { + for (z = 0; z < dZ; z++) { + block_id = 0; + for(bt = 0; bt < nblks_t; bt++) { + for(bx = 0; bx < nblks_x; bx++) { + for(by = 0; by < nblks_y; by++) { + for(bz = 0; bz < nblks_z; bz++) { + _spinor_assign(*(rec_field + index_a(dT*bt + t, dX*bx + x, dY*by + y, dZ*bz + z)), + *(block_list[block_id].basis[id] + ctr_t)); + block_id++; + } + } + } + } + ctr_t++; + } + } + } + } + return; +} + +void add_eo_block_to_global(spinor * const globalfield, spinor * const beven, spinor * const bodd, const int blk) { + int t, x, y, z; + int i,it,ix,iy,iz; + int even = 0, odd = 0; + + for(t = 0; t < block_list[blk].BT; t++) { + it = t + block_list[blk].mpilocal_coordinate[0]*block_list[blk].BT; + for(x = 0; x < block_list[blk].BLX; x++) { + ix = x + block_list[blk].mpilocal_coordinate[1]*block_list[blk].BLX; + for(y = 0; y < block_list[blk].BLY; y++) { + iy 
= y + block_list[blk].mpilocal_coordinate[2]*block_list[blk].BLY; + for(z = 0; z < block_list[blk].BLZ; z++) { + iz = z + block_list[blk].mpilocal_coordinate[3]*block_list[blk].BLZ; + i = g_ipt[it][ix][iy][iz]; + if((t+x+y+z)%2 == 0) { + add(globalfield + i, globalfield + i, beven + even, 1); + even++; + } + else { + add(globalfield + i, globalfield + i, bodd + odd, 1); + odd++; + } + } + } + } + } + return; +} + +void add_block_to_global(spinor * const globalfield, spinor * const blockfield, const int blk) { + int i; + spinor * r, * s; + int it,ix,iy,iz; + int ibt,ibx,iby,ibz; + int itb,ixb,iyb,izb; + int ixcurrent; + + ibz = blk%nblks_z; + iby = (blk / nblks_z)%nblks_y; + ibx = (blk / (nblks_y * nblks_z))%nblks_x; + ibt = blk / (nblks_x * nblks_y * nblks_z); + + ixcurrent = 0; + for (i = 0; i < VOLUME; i++) { + + iz = i%LZ; + iy = (i / LZ)%LY; + ix = (i / (LY * LZ))%LX; + it = i / (LX * LY * LZ); + + + izb = iz / block_list[blk].BLZ; + iyb = iy / block_list[blk].BLY; + ixb = ix / block_list[blk].BLX; + itb = it / block_list[blk].BT; + + if ((ibz == izb) && (iby == iyb) && (ibx == ixb) && (ibt == itb)) { + r = globalfield + i; + s = blockfield + ixcurrent; + add(r, r, s, 1); + ixcurrent++; + } + } + return; +} + +/* eo -> lexic + * P: new spinor with full volume + * s: source spinor even + * r: source spinor odd + */ +void block_convert_eo_to_lexic(spinor * const P, spinor * const s, spinor * const r) { + int x, y, z, t, i, ix; + spinor * p = NULL; + + for(x = 0; x < dX; x++) { + for(y = 0; y < dY; y++) { + for(z = 0; z < dZ; z++) { + for(t = 0; t < dT; t++) { + ix = block_ipt[t][x][y][z]; + i = ix / 2; + if((x + y + z + t)%2 == 0) { + p = s; + } + else { + p = r; + } + memcpy((P+ix), (p+i), sizeof(spinor)); + } + } + } + } + return; +} + +/* lexic -> eo + * P: source spinor with full volume + * s: new spinor even + * r: new spinor odd + */ +void block_convert_lexic_to_eo(spinor * const s, spinor * const r, spinor * const P) { + int x, y, z, t, i, ix; + spinor * p 
= NULL; + + for(x = 0; x < dX; x++) { + for(y = 0; y < dY; y++) { + for(z = 0; z < dZ; z++) { + for(t = 0; t < dT; t++) { + ix = block_ipt[t][x][y][z]; + i = ix / 2; + if((x + y + z + t)%2 == 0) { + p = s; + } + else { + p = r; + } + memcpy((p+i), (P+ix), sizeof(spinor)); + } + } + } + } + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/block.h b/qcd/part_cpu/applications/QCD/src/kernel_D/block.h new file mode 100644 index 0000000000000000000000000000000000000000..913420fe156fbc408381e2ac8e06d76abceae52e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/block.h @@ -0,0 +1,105 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Albert Deuzeman, Siebren Reker, Carsten Urbach + * 2010 Claude Tadonki, Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + + +#ifndef _BLOCK_H +#define _BLOCK_H + +#include "su3.h" +#include "su3spinor.h" + +_Complex double * little_A; +_Complex float * little_A32; +_Complex double * little_A_eo; +_Complex float * little_A32_eo; + + +typedef struct { + /**** Data members ****/ + int volume; /* the block local 4 volume */ + int id; /* mpilocal block id */ + int BLX, BLY, BLZ, BT; /* block local sizes */ + int ns; /* the number of basis fields, which is needed almost everywhere */ + int coordinate[4]; /* global block coordinate */ + int mpilocal_coordinate[4]; /* mpi process local coordinate */ + int mpilocal_neighbour[8]; /* contains the block id of mpilocal neighbours, or -1 if non-mpilocal */ + int *idx; /* provides the next neighbours for spinors on the block */ + int *evenidx; /* provides the next neighbours for spinors on the block even/odd case */ + int *oddidx; /* provides the next neighbours for spinors on the block even/odd case */ + spinor **basis; /* generated orthonormal basis for little D [Ns x local_volume] */ + su3 * u; /* block local gauge field, for use in D */ + int spinpad; /* number of elements needed to store the boundaries of the spinor */ + int evenodd; /* block even or odd (0 or 1) */ + + /* storage will be g_Ns x (9 * g_Ns) */ + /* build_little_diraclocal g_Ns x g_Ns block first (the diagonal part) */ + /* then +t, -t, +x, -x, +y, -y, +z, -z */ + _Complex double *little_dirac_operator; /* full dense representation of the little D */ + _Complex float *little_dirac_operator32; + _Complex double *little_dirac_operator_eo; /* full dense representation of the little D in e/o order */ +} block; + +int init_blocks(const int nt, const int nx, const int ny, const int nz); +int free_blocks(); + +int init_blocks_gaugefield(); +int init_blocks_eo_gaugefield(); + +void copy_global_to_block(spinor * const blockfield, spinor * const globalfield, const int blk); +void copy_block_to_global(spinor * 
const globalfield, spinor * const blockfield, const int blk); +void copy_global_to_block_eo(spinor * const beven, spinor * const bodd, spinor * const globalfield, const int blk); +void copy_block_eo_to_global(spinor * const globalfield, spinor * const beven, spinor * const bodd, const int blk); +void add_block_to_global(spinor * const globalfield, spinor * const blockfield, const int blk); +void add_eo_block_to_global(spinor * const globalfield, spinor * const beven, spinor * const bodd, const int blk); + +void block_convert_lexic_to_eo(spinor * const s, spinor * const r, spinor * const P); +void block_convert_eo_to_lexic(spinor * const P, spinor * const s, spinor * const r); + +void block_orthonormalize(block *parent); +void block_orthonormalize_free(block *parent); + +void compute_little_D(); +void compute_little_D_diagonal(); +void alt_block_compute_little_D(); + +extern int dfl_field_iter; +extern int dfl_poly_iter; + +int nb_blocks; +int nblks_t; +int nblks_x; +int nblks_y; +int nblks_z; +int nblks_dir[4]; +int blk_gauge_eo; +void reconstruct_global_field_GEN(spinor * const rec_field, spinor ** const psi, int nb_blocks); +void reconstruct_global_field_GEN_ID(spinor * const rec_field, block * const block_list, const int id, const int nb_blocks); +int split_global_field_GEN(spinor ** const psi, spinor * const field, int nb_blocks); +int split_global_field_GEN_ID(block * const block_list, const int id, spinor * const field, const int nb_blocks); + +/* Functions for index manipulation related to blocks, C. 
Tadonki */ +int index_a(int t, int x, int y, int z); +int index_b(int t, int x, int y, int z); +int block_index(int t, int x, int y, int z); + +extern block * block_list; + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/boundary.c b/qcd/part_cpu/applications/QCD/src/kernel_D/boundary.c new file mode 100644 index 0000000000000000000000000000000000000000..04dd3fd4c017f591ae95d3b048717d0dedc27b9a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/boundary.c @@ -0,0 +1,55 @@ +/*********************************************************************** + * Copyright (C) 2001 Martin Hasenbusch + * Copyright (C) 2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * This function defines the boundary cond. 
+ * with arbitrary angle in all directions + * + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "boundary.h" + +_Complex double ALIGN ka0, ka1, ka2, ka3; +_Complex double ALIGN phase_0, phase_1, phase_2, phase_3; +const double PI_ = 3.14159265358979; +double X0, X1, X2, X3; + +void boundary(const double kappa) +{ + double x0,x1,x2,x3; + x0 = X0 * PI_/((T)*g_nproc_t); + x1 = X1 * PI_/((LX)*g_nproc_x); + x2 = X2 * PI_/((LY)*g_nproc_y); + x3 = X3 * PI_/((LZ)*g_nproc_z); + ka0 = kappa * cexp(x0 * I); + ka1 = kappa * cexp(x1 * I); + ka2 = kappa * cexp(x2 * I); + ka3 = kappa * cexp(x3 * I); + phase_0 = -ka0; + phase_1 = -ka1; + phase_2 = -ka2; + phase_3 = -ka3; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/boundary.h b/qcd/part_cpu/applications/QCD/src/kernel_D/boundary.h new file mode 100644 index 0000000000000000000000000000000000000000..aadb5821ad12f92ffad1cf4305ae4a78cf1bff34 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/boundary.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _BOUNDARY_H +#define _BOUNDARY_H + +#include "su3.h" + +extern _Complex double ka0, ka1, ka2, ka3; +extern _Complex double phase_0, phase_1, phase_2, phase_3; +void boundary(const double kappa); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..84f1228618e6c48a6868c67031791131f1c815d4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/Makefile @@ -0,0 +1,100 @@ + +srcdir = . +top_builddir = .. +abs_top_builddir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +top_srcdir = .. +abs_top_srcdir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +subdir = buffers +builddir = . + +CFLAGS = -std=c99 -fopenmp -pedantic -Wall +DEPFLAGS = -MM +LDFLAGS = -L${HOME}/lib -L${top_builddir}/lib +DEFS = -DHAVE_CONFIG_H +OPTARGS = -O + +AR = ar +RANLIB = ranlib +CC = mpicc +CCDEP = gcc +CCLD = $(CC) +LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) ${OPTARGS} -o $@ +LEX = flex +AUTOCONF = autoconf +DEFS = -DHAVE_CONFIG_H + +INCLUDES = -I$(HOME)/include/ -I. 
-I${abs_top_builddir}/ -I${abs_top_srcdir}/ -I/include/ -I/include/ +LDADD = +COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} ${OPTARGS} + +LIBRARIES = libbuffers + +libbuffers_TARGETS = gauge \ + gauge_allocate_gauge_buffers \ + gauge_finalize_gauge_buffers \ + gauge_free_unused_gauge_buffers \ + gauge_get_gauge_field \ + gauge_get_gauge_field_array \ + gauge_initialize_gauge_buffers \ + gauge_return_gauge_field \ + gauge_return_gauge_field_array \ + utils_generic_exchange + +libbuffers_OBJECTS = $(addsuffix .o, ${libbuffers_TARGETS}) + +# default rule + +all: Makefile dep libbuffers.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) -g +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) +profile all-profile: all + + +#include dep rules + + +-include $(addsuffix .d,${libbuffers_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) -c $< + + +# rule to make libbuffers +libbuffers.a: ${libbuffers_OBJECTS} Makefile + @rm -f libbuffers.a + @${AR} cru libbuffers.a $(libbuffers_OBJECTS) + @$(RANLIB) libbuffers.a + @cp libbuffers.a ${top_builddir}/lib/libbuffers.a + +# rule to generate .d files + +$(addsuffix .d,$(libbuffers_TARGETS)): %.d: ${srcdir}/%.c Makefile + @$(CCDEP) ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libbuffers_TARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libbuffers.a + +distclean: clean + rm -f Makefile + + +.PHONY: all dep clean compile-clean distclean debug all-debug profile all-profile diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/Makefile.in new file mode 100644 index 
0000000000000000000000000000000000000000..4c9a2806cba2f4c2736e5f8c7e57b3569143bc1a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/Makefile.in @@ -0,0 +1,100 @@ + +srcdir = @srcdir@ +top_builddir = @top_builddir@ +abs_top_builddir = @abs_top_builddir@ +top_srcdir = @top_srcdir@ +abs_top_srcdir = @abs_top_srcdir@ +subdir = buffers +builddir = @builddir@ + +CFLAGS = @CFLAGS@ +DEPFLAGS = @DEPFLAGS@ +LDFLAGS = @LDFLAGS@ +DEFS = @DEFS@ +OPTARGS = @OPTARGS@ + +AR = @AR@ +RANLIB = @RANLIB@ +CC = @CC@ +CCDEP = @CCDEP@ +CCLD = $(CC) +LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) ${OPTARGS} -o $@ +LEX = @LEX@ +AUTOCONF = @AUTOCONF@ +DEFS = @DEFS@ + +INCLUDES = @INCLUDES@ +LDADD = +COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} ${OPTARGS} + +LIBRARIES = libbuffers + +libbuffers_TARGETS = gauge \ + gauge_allocate_gauge_buffers \ + gauge_finalize_gauge_buffers \ + gauge_free_unused_gauge_buffers \ + gauge_get_gauge_field \ + gauge_get_gauge_field_array \ + gauge_initialize_gauge_buffers \ + gauge_return_gauge_field \ + gauge_return_gauge_field_array \ + utils_generic_exchange + +libbuffers_OBJECTS = $(addsuffix .o, ${libbuffers_TARGETS}) + +# default rule + +all: Makefile dep libbuffers.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@ +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@ +profile all-profile: all + + +#include dep rules + + +-include $(addsuffix .d,${libbuffers_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) -c $< + + +# rule to make libbuffers +libbuffers.a: ${libbuffers_OBJECTS} Makefile + @rm -f libbuffers.a + @${AR} cru libbuffers.a $(libbuffers_OBJECTS) + @$(RANLIB) libbuffers.a + @cp libbuffers.a ${top_builddir}/lib/libbuffers.a + +# rule to generate .d files + +$(addsuffix .d,$(libbuffers_TARGETS)): %.d: 
${srcdir}/%.c Makefile + @$(CCDEP) ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libbuffers_TARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libbuffers.a + +distclean: clean + rm -f Makefile + + +.PHONY: all dep clean compile-clean distclean debug all-debug profile all-profile diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge.c b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge.c new file mode 100644 index 0000000000000000000000000000000000000000..f7d2e24fc6fe1af5f5147c41a1958b39567e3f41 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge.c @@ -0,0 +1,3 @@ +#include "gauge.ih" + +gauge_buffers_t g_gauge_buffers; diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge.h b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge.h new file mode 100644 index 0000000000000000000000000000000000000000..8ac5a238938b66fe16394e5f27150a5afa04eb6b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge.h @@ -0,0 +1,42 @@ +#pragma once + +#include "su3.h" + +#ifndef ALIGN_BASE +# define ALIGN_BASE 0x0f +#endif + +typedef su3 su3_tuple[4]; + +typedef struct +{ + su3_tuple **reserve; + unsigned int max; + unsigned int allocated; + unsigned int free; +} gauge_buffers_t; + +typedef struct +{ + su3_tuple *field; +} gauge_field_t; + +typedef struct +{ + gauge_field_t *field_array; + unsigned int length; +} gauge_field_array_t; + +extern gauge_buffers_t g_gauge_buffers; + +void initialize_gauge_buffers(unsigned int max); +void finalize_gauge_buffers(); + +void allocate_gauge_buffers(unsigned int count); +void free_unused_gauge_buffers(); + +gauge_field_t get_gauge_field(); +void return_gauge_field(gauge_field_t *gauge_field); + +gauge_field_array_t get_gauge_field_array(unsigned int length); +void 
return_gauge_field_array(gauge_field_array_t *gauge_field_array);
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge.ih b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge.ih
new file mode 100644
index 0000000000000000000000000000000000000000..9b66819bd79e64df82991ef195be94d451ec7388
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge.ih
@@ -0,0 +1,10 @@
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+
+#include
+#include
+
+#include
+#include
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_allocate_gauge_buffers.c b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_allocate_gauge_buffers.c
new file mode 100644
index 0000000000000000000000000000000000000000..b45968adead9e992632c643540157026e923192e
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_allocate_gauge_buffers.c
@@ -0,0 +1,26 @@
+#include "gauge.ih"
+
+/* Grow the global reserve by `count` freshly malloc'ed gauge fields.
+   Each buffer is over-allocated by sizeof(void*) + ALIGN_BASE bytes so its start
+   can be rounded up until the ALIGN_BASE mask bits are clear; the pointer that
+   malloc returned is stored in the slot directly in front of the aligned start. */
+void allocate_gauge_buffers(unsigned int count)
+{
+  /* Compare against the remaining headroom: "allocated + count" could wrap around */
+  if (count > g_gauge_buffers.max - g_gauge_buffers.allocated)
+    fatal_error("Maximum number of allocated gauge fields exceeded.", "allocate_gauge_buffers");
+
+  for (unsigned int ctr = 0; ctr < count; ++ctr)
+  {
+    void *raw = malloc(sizeof(void*) + ALIGN_BASE + sizeof(su3_tuple) * VOLUMEPLUSRAND + 1);
+    if (raw == NULL)
+      fatal_error("Could not allocate the requested amount of memory.", "allocate_gauge_buffers");
+    size_t p = (size_t)raw + sizeof(void*);       /* leave room for the back-pointer */
+    p = ((p + ALIGN_BASE) & ~(size_t)ALIGN_BASE); /* build the mask in size_t width, whatever type ALIGN_BASE has */
+    ((void**)p)[-1] = raw;                        /* read back by free_unused_gauge_buffers() */
+
+    g_gauge_buffers.reserve[g_gauge_buffers.free] = (su3_tuple*)p;
+    ++g_gauge_buffers.allocated;
+    ++g_gauge_buffers.free;
+  }
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_finalize_gauge_buffers.c b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_finalize_gauge_buffers.c
new file mode 100644
index 0000000000000000000000000000000000000000..dd8906492f24514cb2ce599dc0830a5d17a1cbe8
--- /dev/null
+++
b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_finalize_gauge_buffers.c
@@ -0,0 +1,14 @@
+#include "gauge.ih"
+
+/* Tear down the global gauge buffer pool.  Every field must have been
+   returned beforehand; outstanding fields are treated as a fatal error. */
+void finalize_gauge_buffers()
+{
+  if (g_gauge_buffers.free != g_gauge_buffers.allocated)
+    fatal_error("Finalized g_gauge_buffers with unreturned fields!", "finalize_gauge_buffers");
+
+  free_unused_gauge_buffers();
+  free(g_gauge_buffers.reserve);
+  g_gauge_buffers.reserve = NULL; /* a repeated finalize then frees NULL instead of a stale pointer */
+  g_gauge_buffers.max = 0;
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_free_unused_gauge_buffers.c b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_free_unused_gauge_buffers.c
new file mode 100644
index 0000000000000000000000000000000000000000..7492775dc9f593946bad5df162f8f06bdc3c2338
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_free_unused_gauge_buffers.c
@@ -0,0 +1,10 @@
+#include "gauge.ih"
+
+void free_unused_gauge_buffers()
+{
+  for ( ; g_gauge_buffers.free > 0; --g_gauge_buffers.free, --g_gauge_buffers.allocated)
+  {
+    void* ptr = ((void**)g_gauge_buffers.reserve[g_gauge_buffers.free - 1])[-1];
+    free(ptr);
+  }
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_get_gauge_field.c b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_get_gauge_field.c
new file mode 100644
index 0000000000000000000000000000000000000000..d555ee29ce5fb8961418f2230a5998fe897459bc
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_get_gauge_field.c
@@ -0,0 +1,21 @@
+#include "gauge.ih"
+
+/* This routine not only malloc's a field, but immediately aligns it.
+   To keep track of the original address to free the field eventually,
+   we store that address _before_ the actual buffer.
+   The end user should never have to see the alignment after this.
*/
+
+gauge_field_t get_gauge_field()
+{
+  gauge_field_t gauge_field;
+
+  if (g_gauge_buffers.free == 0) /* Need to allocate a new buffer */
+    allocate_gauge_buffers(1);
+  --g_gauge_buffers.free;
+
+  gauge_field.field = g_gauge_buffers.reserve[g_gauge_buffers.free];
+  g_gauge_buffers.reserve[g_gauge_buffers.free] = NULL;
+
+  return gauge_field;
+}
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_get_gauge_field_array.c b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_get_gauge_field_array.c
new file mode 100644
index 0000000000000000000000000000000000000000..139c4f6c165d660c45a7787c11dd6367bc027c6c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_get_gauge_field_array.c
@@ -0,0 +1,28 @@
+#include "gauge.ih"
+
+/* Hand out `length` gauge fields from the global reserve as one array,
+   allocating additional buffers first if the reserve holds too few.
+   The descriptor array is heap-allocated here and released again by
+   return_gauge_field_array(). */
+gauge_field_array_t get_gauge_field_array(unsigned int length)
+{
+  gauge_field_array_t gauge_field_array;
+  gauge_field_array.length = length;
+  gauge_field_array.field_array = (gauge_field_t*)calloc(length, sizeof(gauge_field_t));
+  /* calloc(0, ...) may legitimately return NULL, so only a failed non-empty request is fatal */
+  if (length > 0 && gauge_field_array.field_array == NULL)
+    fatal_error("Could not allocate the requested amount of memory.", "get_gauge_field_array");
+
+  if (g_gauge_buffers.free < length) /* Need to allocate more buffers */
+    allocate_gauge_buffers(length - g_gauge_buffers.free);
+
+  for (unsigned int ctr = 0; ctr < length; ++ctr)
+  {
+    --g_gauge_buffers.free;
+    gauge_field_array.field_array[ctr].field = g_gauge_buffers.reserve[g_gauge_buffers.free];
+    g_gauge_buffers.reserve[g_gauge_buffers.free] = NULL;
+  }
+
+  return gauge_field_array;
+}
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_initialize_gauge_buffers.c b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_initialize_gauge_buffers.c
new file mode 100644
index 0000000000000000000000000000000000000000..daac0ccb1457505e7108a6633fad47cc74abe108
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_initialize_gauge_buffers.c
@@ -0,0 +1,10 @@
+#include "gauge.ih"
+
+void initialize_gauge_buffers(unsigned int max)
+{
+  g_gauge_buffers.max = max;
+  g_gauge_buffers.allocated = 0;
+  g_gauge_buffers.free = 0;
+
g_gauge_buffers.reserve = (su3_tuple**)calloc(max, sizeof(su3_tuple*));
+}
+
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_return_gauge_field.c b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_return_gauge_field.c
new file mode 100644
index 0000000000000000000000000000000000000000..5df67f86cc833590233817cd58c5f797aca78c6d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_return_gauge_field.c
@@ -0,0 +1,16 @@
+#include "gauge.ih"
+
+/* Hand a field obtained from get_gauge_field() back to the global reserve.
+   The caller's handle is cleared so the buffer cannot be reused through it. */
+void return_gauge_field(gauge_field_t *gauge_field)
+{
+  /* Only fields that are currently handed out may come back; a surplus
+     return would store past the reserve entries that are accounted for. */
+  if (g_gauge_buffers.free >= g_gauge_buffers.allocated)
+    fatal_error("Returned more gauge fields than were handed out.", "return_gauge_field");
+
+  g_gauge_buffers.reserve[g_gauge_buffers.free] = gauge_field->field;
+  ++g_gauge_buffers.free;
+  gauge_field->field = NULL;
+}
+
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_return_gauge_field_array.c b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_return_gauge_field_array.c
new file mode 100644
index 0000000000000000000000000000000000000000..9089e899862827a2c64a23436fad6524c365f1ee
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/gauge_return_gauge_field_array.c
@@ -0,0 +1,21 @@
+#include "gauge.ih"
+
+/* Hand every field of an array obtained from get_gauge_field_array() back to
+   the global reserve and release the descriptor array itself. */
+void return_gauge_field_array(gauge_field_array_t *gauge_field_array)
+{
+  for (unsigned int ctr = 0; ctr < gauge_field_array->length; ++ctr)
+  {
+    /* Same guard as in return_gauge_field(): never store more fields than are handed out */
+    if (g_gauge_buffers.free >= g_gauge_buffers.allocated)
+      fatal_error("Returned more gauge fields than were handed out.", "return_gauge_field_array");
+
+    g_gauge_buffers.reserve[g_gauge_buffers.free] = gauge_field_array->field_array[ctr].field;
+    ++g_gauge_buffers.free;
+    gauge_field_array->field_array[ctr].field = NULL;
+  }
+  free(gauge_field_array->field_array);
+  /* Clear the descriptor so a stale handle cannot reach the freed storage */
+  gauge_field_array->field_array = NULL;
+  gauge_field_array->length = 0;
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils.h b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..49cb7585f4fee8b87ea487bfb1f930c81c0d3592
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include
+
+/* The helpers defined in this header are declared static inline so that every
+   translation unit including it gets its own private copy.  With a plain
+   (non-inline) prototype in front of an inline definition, C99 treats the
+   definition as an external one, which clashes at link time as soon as two
+   source files include this header. */
+static inline void copy_gauge_field(gauge_field_t dest, gauge_field_t orig);
+
+void generic_exchange(void *field_in, int bytes_per_site);
+static inline void exchange_gauge_field(gauge_field_t target);
+static inline void exchange_gauge_field_array(gauge_field_array_t target);
+
+/* Copy the contents of orig into dest; memmove tolerates overlapping buffers. */
+static inline void copy_gauge_field(gauge_field_t dest, gauge_field_t orig)
+{
+  memmove((void*)dest.field, (void*)orig.field, sizeof(su3_tuple) * VOLUMEPLUSRAND + 1);
+}
+
+/* Run generic_exchange() on a single gauge field, sizeof(su3_tuple) bytes per site. */
+static inline void exchange_gauge_field(gauge_field_t target)
+{
+  generic_exchange((void*)target.field, sizeof(su3_tuple));
+}
+
+/* Run exchange_gauge_field() on every field of the array, in index order. */
+static inline void exchange_gauge_field_array(gauge_field_array_t target)
+{
+  for (unsigned int idx = 0; idx < target.length; ++idx)
+    exchange_gauge_field(target.field_array[idx]);
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils.ih b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils.ih
new file mode 100644
index 0000000000000000000000000000000000000000..939ff5e0c818e5e31683d23633edb3584459cc74
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils.ih
@@ -0,0 +1,10 @@
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+
+#include
+
+#include
+#include
+
+#include
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils_generic_exchange.1.inc b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils_generic_exchange.1.inc
new file mode 100644
index 0000000000000000000000000000000000000000..68117563dc03e04abdcb6a1a5d2f3bcd6fe754dd
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils_generic_exchange.1.inc
@@ -0,0 +1,547 @@
+# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
+  /* send the data to the neighbour on the left */
+  /* recieve the data from the neighbour on the right */
+
+  MPI_Isend(buffer[gI_0_0_0_0], 1, slice_T_cont_type, g_nb_t_dn, 83,
+            g_cart_grid, &request[cntr]);
+  MPI_Irecv(buffer[gI_L_0_0_0], 1, slice_T_cont_type, g_nb_t_up, 83,
+            g_cart_grid, &request[cntr+1]);
+  cntr=cntr+2;
+
+  /* send the data to the neighbour on the right */
+  /* recieve the data from the neighbour on the left */
+  MPI_Isend(buffer[gI_Lm1_0_0_0], 1,
slice_T_cont_type, g_nb_t_up, 84, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_m1_0_0_0], 1, slice_T_cont_type, g_nb_t_dn, 84, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + /* t2-Rand */ + MPI_Isend(buffer[gI_p1_0_0_0], 1, slice_T_cont_type, g_nb_t_dn, 85, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_Lp1_0_0_0], 1, slice_T_cont_type, g_nb_t_up, 85, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + /* t2-Rand */ + MPI_Isend(buffer[gI_Lm2_0_0_0], 1, slice_T_cont_type, g_nb_t_up, 86, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_m2_0_0_0], 1, slice_T_cont_type, g_nb_t_dn, 86, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } +# endif +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Isend(buffer[gI_0_0_0_0], 1, slice_X_gath_type, g_nb_x_dn, 87, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_L_0_0], 1, slice_X_cont_type, g_nb_x_up, 87, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* x-Rand */ + MPI_Isend(buffer[gI_0_Lm1_0_0], 1, slice_X_gath_type, g_nb_x_up, 88, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_m1_0_0], 1, slice_X_cont_type, g_nb_x_dn, 88, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* x2-Rand */ + 
MPI_Isend(buffer[gI_0_p1_0_0], 1, slice_X_gath_type, g_nb_x_dn, 89, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_Lp1_0_0], 1, slice_X_cont_type, g_nb_x_up, 89, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* x2-Rand */ + MPI_Isend(buffer[gI_0_Lm2_0_0], 1, slice_X_gath_type, g_nb_x_up, 90, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_m2_0_0], 1, slice_X_cont_type, g_nb_x_dn, 90, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } +# endif + + MPI_Waitall(cntr, request, status); + cntr=0; + + /* Communications of the xt (x2t and t2x) edges are done by using the previously + communicated x-borders whose t-borders are now exchanged in t directions [ORD!] */ + /* In this case the code cannot be completely independent of the definition in Index, + since edge_XT_gath_type are defined by joining together the x=L and the x=-1 parts. + For this reason we need to know that x=L comes before x=-1 in the definition of + Index() and hence we need to refer to the starting point gI_0_L_0_0 . [DEP!] 
*/ + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the left in t direction */ + /* recieve the data from the neighbour on the right in t direction */ + /* is on the x-Rand: xt-edge */ + MPI_Isend(buffer[gI_0_L_0_0], 1, edge_XT_gath_type, g_nb_t_dn, 100, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_L_L_0_0], 1, edge_XT_cont_type, g_nb_t_up, 100, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + /* xt-edge */ + MPI_Isend(buffer[gI_Lm1_L_0_0], 1, edge_XT_gath_type, g_nb_t_up, 101, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_m1_L_0_0], 1, edge_XT_cont_type, g_nb_t_dn, 101, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in t direction */ + /* recieve the data from the neighbour on the right in t direction */ + /* t2x-edge */ + MPI_Isend(buffer[gI_p1_L_0_0], 1, edge_XT_gath_type, g_nb_t_dn, 102, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_Lp1_L_0_0], 1, edge_XT_cont_type, g_nb_t_up, 102, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + /* t2x-edge */ + MPI_Isend(buffer[gI_Lm2_L_0_0], 1, edge_XT_gath_type, g_nb_t_up, 103, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_m2_L_0_0], 1, edge_XT_cont_type, g_nb_t_dn, 103, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in t direction */ + /* recieve the data from the neighbour on the right in t direction */ + /* x2t-edge */ /* x=L+1 comes before x=-2. see [DEP!] 
*/ + MPI_Isend(buffer[gI_0_Lp1_0_0], 1, edge_XT_gath_type, g_nb_t_dn, 104, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_L_Lp1_0_0], 1, edge_XT_cont_type, g_nb_t_up, 104, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + /* x2t-edge */ + MPI_Isend(buffer[gI_Lm1_Lp1_0_0], 1, edge_XT_gath_type, g_nb_t_up, 105, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_m1_Lp1_0_0], 1, edge_XT_cont_type, g_nb_t_dn, 105, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Isend(buffer[gI_0_0_0_0], 1, slice_Y_gath_type, g_nb_y_dn, 106, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_0_L_0], 1, slice_Y_cont_type, g_nb_y_up, 106, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Isend(buffer[gI_0_0_Lm1_0], 1, slice_Y_gath_type, g_nb_y_up, 107, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_0_m1_0], 1, slice_Y_cont_type, g_nb_y_dn, 107, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* y2-Rand */ + MPI_Isend(buffer[gI_0_0_p1_0], 1, slice_Y_gath_type, g_nb_y_dn, 108, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_0_Lp1_0], 1, slice_Y_cont_type, g_nb_y_up, 108, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* 
y2-Rand */ + MPI_Isend(buffer[gI_0_0_Lm2_0], 1, slice_Y_gath_type, g_nb_y_up, 109, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_0_m2_0], 1, slice_Y_cont_type, g_nb_y_dn, 109, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } +# endif + + MPI_Waitall(cntr, request, status); + cntr=0; + + /* see [ORD!] above, where now x plays the role of t and y the role of x */ + /* see [DEP!] above, where now y=L comes before y=-1 */ + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* is on the y-Rand -> yx-edge*/ + MPI_Isend(buffer[gI_0_0_L_0], 1, edge_XY_gath_type, g_nb_x_dn, 110, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_L_L_0], 1, edge_XY_cont_type, g_nb_x_up, 110, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* yx-edge */ + MPI_Isend(buffer[gI_0_Lm1_L_0], 1, edge_XY_gath_type, g_nb_x_up, 111, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_m1_L_0], 1, edge_XY_cont_type, g_nb_x_dn, 111, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + +# endif + + /* see [ORD!] above, where now y plays the role of t and t the role of x */ + /* see [DEP!] 
above, where now t=L comes before t=-1 */ + +# if (defined PARALLELXYT || defined PARALLELXYZT ) + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* is on the t-Rand -> ty-edge*/ + MPI_Isend(buffer[gI_L_0_0_0], 1, edge_YT_gath_type, g_nb_y_dn, 112, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_L_0_L_0], 1, edge_YT_cont_type, g_nb_y_up, 112, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* ty-edge */ + MPI_Isend(buffer[gI_L_0_Lm1_0], 1, edge_YT_gath_type, g_nb_y_up, 113, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_L_0_m1_0], 1, edge_YT_cont_type, g_nb_y_dn, 113, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + +# endif + + if(g_dbw2rand > 0) { + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* x2y edge */ /* y=L comes before y=-1 */ + MPI_Isend(buffer[gI_0_p1_L_0], 1, edge_XY_gath_type, g_nb_x_dn, 114, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_Lp1_L_0], 1, edge_XY_cont_type, g_nb_x_up, 114, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* x2y-edge */ + MPI_Isend(buffer[gI_0_Lm2_L_0], 1, edge_XY_gath_type, g_nb_x_up, 115, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_m2_L_0], 1, edge_XY_cont_type, g_nb_x_dn, 115, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* y2x -edge */ + MPI_Isend(buffer[gI_0_0_Lp1_0], 1, 
edge_XY_gath_type, g_nb_x_dn, 116, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_L_Lp1_0], 1, edge_XY_cont_type, g_nb_x_up, 116, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* y2x edge */ + MPI_Isend(buffer[gI_0_Lm1_Lp1_0], 1, edge_XY_gath_type, g_nb_x_up, 117, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_m1_Lp1_0], 1, edge_XY_cont_type, g_nb_x_dn, 117, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + +# endif +# if (defined PARALLELXYT || defined PARALLELXYZT ) + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* t2y-edge */ + MPI_Isend(buffer[gI_Lp1_0_0_0], 1, edge_YT_gath_type, g_nb_y_dn, 118, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_Lp1_0_L_0], 1, edge_YT_cont_type, g_nb_y_up, 118, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* t2y edge */ + MPI_Isend(buffer[gI_Lp1_0_Lm1_0], 1, edge_YT_gath_type, g_nb_y_up, 119, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_Lp1_0_m1_0], 1, edge_YT_cont_type, g_nb_y_dn, 119, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* y2t edge */ + MPI_Isend(buffer[gI_L_0_p1_0], 1, edge_YT_gath_type, g_nb_y_dn, 120, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_L_0_Lp1_0], 1, edge_YT_cont_type, g_nb_y_up, 120, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* y2t-edge */ + MPI_Isend(buffer[gI_L_0_Lm2_0], 1, edge_YT_gath_type, 
g_nb_y_up, 121, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_L_0_m2_0], 1, edge_YT_cont_type, g_nb_y_dn, 121, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; +# endif + } +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* z-Rand */ + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Isend(buffer[gI_0_0_0_0], 1, slice_Z_gath_type, g_nb_z_dn, 122, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_0_0_L], 1, slice_Z_cont_type, g_nb_z_up, 122, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + MPI_Isend(buffer[gI_0_0_0_Lm1], 1, slice_Z_gath_type, g_nb_z_up, 123, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_0_0_m1], 1, slice_Z_cont_type, g_nb_z_dn, 123, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + /* z2-Rand */ + MPI_Isend(buffer[gI_0_0_0_p1], 1, slice_Z_gath_type, g_nb_z_dn, 124, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_0_0_Lp1], 1, slice_Z_cont_type, g_nb_z_up, 124, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* z2-Rand */ + MPI_Isend(buffer[gI_0_0_0_Lm2], 1, slice_Z_gath_type, g_nb_z_up, 125, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_0_0_m2], 1, slice_Z_cont_type, g_nb_z_dn, 125, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } +# endif + MPI_Waitall(cntr, request, status); + cntr=0; + + /* see [ORD!] above, where now x plays the role of t and z the role of x */ + /* see [DEP!] 
above, where now z=L comes before z=-1 */ + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* is on the z-Rand -> zx-edge*/ + MPI_Isend(buffer[gI_0_0_0_L], 1, edge_XZ_gath_type, g_nb_x_dn, 126, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_L_0_L], 1, edge_XZ_cont_type, g_nb_x_up, 126, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* zx-edge */ + MPI_Isend(buffer[gI_0_Lm1_0_L], 1, edge_XZ_gath_type, g_nb_x_up, 127, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_m1_0_L], + 1, edge_XZ_cont_type, g_nb_x_dn, 127, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + +# endif + + /* see [ORD!] above, where now z plays the role of t and t the role of x */ + /* see [DEP!] above, where now t=L comes before t=-1 */ + +# if (defined PARALLELXYZT) + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + /* is on the t-Rand -> tz-edge*/ + MPI_Isend(buffer[gI_L_0_0_0], 1, edge_ZT_gath_type, g_nb_z_dn, 128, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_L_0_0_L], 1, edge_ZT_cont_type, g_nb_z_up, 128, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* tz-edge */ + MPI_Isend(buffer[gI_L_0_0_Lm1], 1, edge_ZT_gath_type, g_nb_z_up, 129, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_L_0_0_m1], 1, edge_ZT_cont_type, g_nb_z_dn, 129, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + +# endif + + /* see [ORD!] above, where now y plays the role of t and z the role of x */ + /* see [DEP!] 
above, where now z=L comes before z=-1 */ + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* is on the z-Rand -> zy-edge*/ + MPI_Isend(buffer[gI_0_0_0_L], 1, edge_YZ_gath_type, g_nb_y_dn, 130, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_0_L_L], 1, edge_YZ_cont_type, g_nb_y_up, 130, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* zy-edge */ + MPI_Isend(buffer[gI_0_0_Lm1_L], 1, edge_YZ_gath_type, g_nb_y_up, 131, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_0_m1_L], 1, edge_YZ_cont_type, g_nb_y_dn, 131, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + +# endif + + if(g_dbw2rand > 0) { + +# if (defined PARALLELXYZT) + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + /* t2z edge */ /* t=L+1 comes before t=-2*/ + MPI_Isend(buffer[gI_Lp1_0_0_0], 1, edge_ZT_gath_type, g_nb_z_dn, 132, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_Lp1_0_0_L], 1, edge_ZT_cont_type, g_nb_z_up, 132, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* t2z-edge */ + MPI_Isend(buffer[gI_Lp1_0_0_Lm1], 1, edge_ZT_gath_type, g_nb_z_up, 133, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_Lp1_0_0_m1], 1, edge_ZT_cont_type, g_nb_z_dn, 133, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + /* z2t -edge */ + MPI_Isend(buffer[gI_L_0_0_p1], 1, edge_ZT_gath_type, g_nb_z_dn, 134, + g_cart_grid, &request[cntr]); + 
MPI_Irecv(buffer[gI_L_0_0_Lp1], 1, edge_ZT_cont_type, g_nb_z_up, 134, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* z2t edge */ + MPI_Isend(buffer[gI_L_0_0_Lm2], 1, edge_ZT_gath_type, g_nb_z_up, 135, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_L_0_0_m2], 1, edge_ZT_cont_type, g_nb_z_dn, 135, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + +# endif +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* z2x-edge */ + MPI_Isend(buffer[gI_0_0_0_Lp1], 1, edge_XZ_gath_type, g_nb_x_dn, 136, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_L_0_Lp1], 1, edge_XZ_cont_type, g_nb_x_up, 136, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* z2x edge */ + MPI_Isend(buffer[gI_0_Lm1_0_Lp1], 1, edge_XZ_gath_type, g_nb_x_up, 137, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_m1_0_Lp1], 1, edge_XZ_cont_type, g_nb_x_dn, 137, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* x2z edge */ + MPI_Isend(buffer[gI_0_p1_0_L], 1, edge_XZ_gath_type, g_nb_x_dn, 138, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_Lp1_0_L], 1, edge_XZ_cont_type, g_nb_x_up, 138, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* x2z-edge */ + MPI_Isend(buffer[gI_0_Lm2_0_L], 1, edge_XZ_gath_type, g_nb_x_up, 139, + g_cart_grid, &request[cntr]); + 
MPI_Irecv(buffer[gI_0_m2_0_L], 1, edge_XZ_cont_type, g_nb_x_dn, 139, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + +# endif +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* z2y-edge */ /* z=L+1 comes before z=-2 */ + MPI_Isend(buffer[gI_0_0_0_Lp1], 1, edge_YZ_gath_type, g_nb_y_dn, 140, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_0_L_Lp1], 1, edge_YZ_cont_type, g_nb_y_up, 140, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* z2y edge */ + MPI_Isend(buffer[gI_0_0_Lm1_Lp1], 1, edge_YZ_gath_type, g_nb_y_up, 141, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_0_m1_Lp1], 1, edge_YZ_cont_type, g_nb_y_dn, 141, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* y2z edge */ /* z=L comes before z=-1 */ + MPI_Isend(buffer[gI_0_0_p1_L], 1, edge_YZ_gath_type, g_nb_y_dn, 142, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_0_Lp1_L], 1, edge_YZ_cont_type, g_nb_y_up, 142, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* y2z-edge */ + MPI_Isend(buffer[gI_0_0_Lm2_L], 1, edge_YZ_gath_type, g_nb_y_up, 143, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[gI_0_0_m2_L], 1, edge_YZ_cont_type, g_nb_y_dn, 143, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; +# endif + } + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + MPI_Waitall(cntr, request, status); +# endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils_generic_exchange.2.inc 
b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils_generic_exchange.2.inc new file mode 100644 index 0000000000000000000000000000000000000000..0789a490f114351fe80d2ae0acbf9a9954ca38d5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils_generic_exchange.2.inc @@ -0,0 +1,568 @@ + /* Exchanges the boundary slices and edges of buffer with the neighbouring processes. Every MPI_Isend/MPI_Irecv pair fills request[cntr] and request[cntr+1] and advances cntr by 2; each MPI_Waitall completes the requests counted by cntr (cntr is presumably 0 on entry - set by the including code, not shown here). First pair: send to g_nb_t_dn, receive from g_nb_t_up (tag 83). */ MPI_Isend(buffer[0], 1, slice_T_cont_type, g_nb_t_dn, 83, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME], 1, slice_T_cont_type, g_nb_t_up, 83, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right */ + /* receive the data from the neighbour on the left */ + MPI_Isend(buffer[(T-1)*LX*LY*LZ], 1, slice_T_cont_type, g_nb_t_up, 84, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[(T+1)*LX*LY*LZ], 1, slice_T_cont_type, g_nb_t_dn, 84, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left */ + /* receive the data from the neighbour on the right */ + /* t2 boundary */ + MPI_Isend(buffer[1*LX*LY*LZ], 1, slice_T_cont_type, g_nb_t_dn, 85, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND], 1, slice_T_cont_type, g_nb_t_up, 85, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right */ + /* receive the data from the neighbour on the left */ + /* t2 boundary */ + MPI_Isend(buffer[(T-2)*LX*LY*LZ], 1, slice_T_cont_type, g_nb_t_up, 86, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND+LX*LY*LZ], 1, slice_T_cont_type, g_nb_t_dn, 86, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + MPI_Isend(buffer[0], 1, slice_X_gath_type, g_nb_x_dn, 87, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[(T+2)*LX*LY*LZ], 1, slice_X_cont_type, g_nb_x_up, 87, + g_cart_grid,
&request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* x boundary */ + MPI_Isend(buffer[(LX-1)*LY*LZ], 1, slice_X_gath_type, g_nb_x_up, 88, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[(T+2)*LX*LY*LZ + T*LY*LZ], 1, slice_X_cont_type, g_nb_x_dn, 88, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* x2 boundary */ + MPI_Isend(buffer[LY*LZ], 1, slice_X_gath_type, g_nb_x_dn, 89, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND+2*LX*LY*LZ], 1, slice_X_cont_type, g_nb_x_up, 89, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* x2 boundary */ + MPI_Isend(buffer[(LX-2)*LY*LZ], 1, slice_X_gath_type, g_nb_x_up, 90, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND+2*LX*LY*LZ + T*LY*LZ], 1, slice_X_cont_type, g_nb_x_dn, 90, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } +# endif + MPI_Waitall(cntr, request, status); /* the xt-edges below are sent from buffer[(T+2)*LX*LY*LZ], the region filled by the x-slice receive (tag 87) above, so that receive has to be complete first */ + cntr=0; +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* The edges */ + + /* send the data to the neighbour on the left in t direction */ + /* receive the data from the neighbour on the right in t direction */ + /* is on the x boundary: xt-edge */ + MPI_Isend(buffer[(T+2)*LX*LY*LZ], 1, edge_XT_gath_type, g_nb_t_dn, 100, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + RAND], 1, edge_XT_cont_type, g_nb_t_up, 100, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in t direction */ + /* receive the data from the neighbour on the left in t direction */ + /* xt-edge */ + MPI_Isend(buffer[(T+2)*LX*LY*LZ + (T-1)*LY*LZ], 1,
edge_XT_gath_type, g_nb_t_up, 101, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + RAND + 2*LY*LZ], 1, edge_XT_cont_type, g_nb_t_dn, 101, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in t direction */ + /* receive the data from the neighbour on the right in t direction */ + /* t2x-edge */ + MPI_Isend(buffer[(T+2)*LX*LY*LZ + LY*LZ], + 1, edge_XT_gath_type, g_nb_t_dn, 102, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND], + 1, edge_XT_cont_type, g_nb_t_up, 102, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in t direction */ + /* receive the data from the neighbour on the left in t direction */ + /* t2x-edge */ + MPI_Isend(buffer[(T+2)*LX*LY*LZ + (T-2)*LY*LZ], + 1, edge_XT_gath_type, g_nb_t_up, 103, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 2*LY*LZ], + 1, edge_XT_cont_type, g_nb_t_dn, 103, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in t direction */ + /* receive the data from the neighbour on the right in t direction */ + /* x2t-edge */ + MPI_Isend(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ], + 1, edge_XT_gath_type, g_nb_t_dn, 104, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 4*LY*LZ], + 1, edge_XT_cont_type, g_nb_t_up, 104, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in t direction */ + /* receive the data from the neighbour on the left in t direction */ + /* x2t-edge */ + MPI_Isend(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ + (T-1)*LY*LZ], + 1, edge_XT_gath_type, g_nb_t_up, 105, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 6*LY*LZ], + 1, edge_XT_cont_type, g_nb_t_dn, 105, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } + /* end of if defined PARALLELXT || PARALLELXYT || PARALLELXYZT*/ +# endif + +# if (defined
PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + MPI_Isend(buffer[0], 1, slice_Y_gath_type, g_nb_y_dn, 106, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + 2*LZ*(LX*LY + T*LY)], 1, slice_Y_cont_type, g_nb_y_up, 106, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + MPI_Isend(buffer[(LY-1)*LZ], 1, slice_Y_gath_type, g_nb_y_up, 107, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + 2*LZ*(LX*LY + T*LY) + T*LX*LZ], 1, slice_Y_cont_type, g_nb_y_dn, 107, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* y2 boundary */ + MPI_Isend(buffer[LZ], 1, slice_Y_gath_type, g_nb_y_dn, 108, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ], 1, slice_Y_cont_type, g_nb_y_up, 108, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + /* y2 boundary */ + MPI_Isend(buffer[(LY-2)*LZ], 1, slice_Y_gath_type, g_nb_y_up, 109, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ + T*LX*LZ], 1, slice_Y_cont_type, g_nb_y_dn, 109, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } +# endif + MPI_Waitall(cntr, request, status); /* the yx-edges below are sent from the region filled by the y-slice receive (tag 106) above, so that receive has to be complete first */ + cntr=0; +# if (defined PARALLELXYT || defined PARALLELXYZT) + + /* now it gets really nasty ...
*/ + + /* edges */ + + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* is on the y boundary -> yx-edge */ + MPI_Isend(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ], 1, edge_XY_gath_type, g_nb_x_dn, 110, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + RAND + 4*LY*LZ], 1, edge_XY_cont_type, g_nb_x_up, 110, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* yx-edge */ + MPI_Isend(buffer[VOLUME + 2*LZ*(LX*LY + T*LY) + (LX-1)*LZ], 1, edge_XY_gath_type, g_nb_x_up, 111, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + RAND + 4*LY*LZ + 2*T*LZ], 1, edge_XY_cont_type, g_nb_x_dn, 111, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* is on the t boundary -> ty-edge */ + MPI_Isend(buffer[VOLUME], 1, edge_YT_gath_type, g_nb_y_dn, 112, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ], 1, edge_YT_cont_type, g_nb_y_up, 112, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + /* ty-edge */ + MPI_Isend(buffer[VOLUME + (LY-1)*LZ], 1, edge_YT_gath_type, g_nb_y_up, 113, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 2*LX*LZ], 1, edge_YT_cont_type, g_nb_y_dn, 113, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* x2y edge */ + MPI_Isend(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + LZ], + 1, edge_XY_gath_type, g_nb_x_dn, 114, +
g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ], + 1, edge_XY_cont_type, g_nb_x_up, 114, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* x2y-edge */ + MPI_Isend(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + (LX-2)*LZ], + 1, edge_XY_gath_type, g_nb_x_up, 115, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 2*T*LZ], + 1, edge_XY_cont_type, g_nb_x_dn, 115, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* y2x-edge */ + MPI_Isend(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ], + 1, edge_XY_gath_type, g_nb_x_dn, 116, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 4*T*LZ], + 1, edge_XY_cont_type, g_nb_x_up, 116, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* y2x edge */ + MPI_Isend(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + (LX-1)*LZ], + 1, edge_XY_gath_type, g_nb_x_up, 117, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 6*T*LZ], + 1, edge_XY_cont_type, g_nb_x_dn, 117, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* t2y-edge */ + MPI_Isend(buffer[VOLUMEPLUSRAND], + 1, edge_YT_gath_type, g_nb_y_dn, 118, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ], + 1, edge_YT_cont_type, g_nb_y_up, 118, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */
+ /* receive the data from the neighbour on the left in y direction */ + /* t2y edge */ + MPI_Isend(buffer[VOLUMEPLUSRAND + (LY-1)*LZ], + 1, edge_YT_gath_type, g_nb_y_up, 119, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 2*LX*LZ], + 1, edge_YT_cont_type, g_nb_y_dn, 119, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* y2t edge */ + MPI_Isend(buffer[VOLUME + LZ], + 1, edge_YT_gath_type, g_nb_y_dn, 120, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 4*LX*LZ], + 1, edge_YT_cont_type, g_nb_y_up, 120, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + /* y2t-edge */ + MPI_Isend(buffer[VOLUME + (LY-2)*LZ], + 1, edge_YT_gath_type, g_nb_y_up, 121, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 6*LX*LZ], + 1, edge_YT_cont_type, g_nb_y_dn, 121, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } + + /* end of if defined PARALLELXYT || PARALLELXYZT */ +# endif +# if defined PARALLELXYZT + /* z boundary */ + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + MPI_Isend(buffer[0], + 1, slice_Z_gath_type, g_nb_z_dn, 122, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*LZ*T*LX], + 1, slice_Z_cont_type, g_nb_z_up, 122, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + MPI_Isend(buffer[LZ-1], + 1, slice_Z_gath_type, g_nb_z_up, 123, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + 2*LZ*(LX*LY +
T*LY) + 2*T*LX*LZ + T*LX*LY], + 1, slice_Z_cont_type, g_nb_z_dn, 123, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + /* z2 boundary */ + MPI_Isend(buffer[1], + 1, slice_Z_gath_type, g_nb_z_dn, 124, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ + 2*T*LX*LZ], + 1, slice_Z_cont_type, g_nb_z_up, 124, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + /* z2 boundary */ + MPI_Isend(buffer[LZ-2], + 1, slice_Z_gath_type, g_nb_z_up, 125, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ + 2*T*LX*LZ + T*LX*LY], + 1, slice_Z_cont_type, g_nb_z_dn, 125, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } +# endif + MPI_Waitall(cntr, request, status); /* completes the xy/yt edge requests and the z-slice requests posted since the last reset; the zx-edges below are sent from the region filled by the z-slice receive (tag 122) above */ +# if defined PARALLELXYZT + cntr=0; + /* edges */ + + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* is on the z boundary -> zx-edge */ + MPI_Isend(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ], + 1, edge_XZ_gath_type, g_nb_x_dn, 126, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ], + 1, edge_XZ_cont_type, g_nb_x_up, 126, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* zx-edge */ + MPI_Isend(buffer[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ + (LX-1)*LY], + 1, edge_XZ_gath_type, g_nb_x_up, 127, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 2*T*LY], + 1, edge_XZ_cont_type, g_nb_x_dn, 127, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + +
/* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + /* is on the t boundary -> tz-edge */ + MPI_Isend(buffer[VOLUME], + 1, edge_ZT_gath_type, g_nb_z_dn, 128, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY], + 1, edge_ZT_cont_type, g_nb_z_up, 128, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + /* tz-edge */ + MPI_Isend(buffer[VOLUME + (LZ-1)], + 1, edge_ZT_gath_type, g_nb_z_up, 129, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 2*LX*LY], + 1, edge_ZT_cont_type, g_nb_z_dn, 129, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* is on the z boundary -> zy-edge */ + MPI_Isend(buffer[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ], + 1, edge_YZ_gath_type, g_nb_y_dn, 130, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY], + 1, edge_YZ_cont_type, g_nb_y_up, 130, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + /* zy-edge */ + MPI_Isend(buffer[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ + (LY-1)], + 1, edge_YZ_gath_type, g_nb_y_up, 131, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY + 2*T*LX], + 1, edge_YZ_cont_type, g_nb_y_dn, 131, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* rectangular gauge action Stuff!
*/ + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + /* t2z edge */ + MPI_Isend(buffer[VOLUMEPLUSRAND], + 1, edge_ZT_gath_type, g_nb_z_dn, 132, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ], + 1, edge_ZT_cont_type, g_nb_z_up, 132, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + /* t2z-edge */ + MPI_Isend(buffer[VOLUMEPLUSRAND + (LZ-1)], + 1, edge_ZT_gath_type, g_nb_z_up, 133, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 2*LX*LY], + 1, edge_ZT_cont_type, g_nb_z_dn, 133, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + /* z2t-edge */ + MPI_Isend(buffer[VOLUME + 1], + 1, edge_ZT_gath_type, g_nb_z_dn, 134, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 4*LX*LY], + 1, edge_ZT_cont_type, g_nb_z_up, 134, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + /* z2t edge */ + MPI_Isend(buffer[VOLUME + (LZ-2)], + 1, edge_ZT_gath_type, g_nb_z_up, 135, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 6*LX*LY], + 1, edge_ZT_cont_type, g_nb_z_dn, 135, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* z2x-edge */ + MPI_Isend(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ], +
1, edge_XZ_gath_type, g_nb_x_dn, 136, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY], + 1, edge_XZ_cont_type, g_nb_x_up, 136, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* z2x edge */ + MPI_Isend(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LX-1)*LY], + 1, edge_XZ_gath_type, g_nb_x_up, 137, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 2*T*LY], + 1, edge_XZ_cont_type, g_nb_x_dn, 137, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* x2z edge */ + MPI_Isend(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + LY], + 1, edge_XZ_gath_type, g_nb_x_dn, 138, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 4*T*LY], + 1, edge_XZ_cont_type, g_nb_x_up, 138, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* x2z-edge */ + MPI_Isend(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LX-2)*LY], + 1, edge_XZ_gath_type, g_nb_x_up, 139, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 6*T*LY], + 1, edge_XZ_cont_type, g_nb_x_dn, 139, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* z2y-edge */ + MPI_Isend(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ], + 1, edge_YZ_gath_type, g_nb_y_dn, 140, + g_cart_grid,
&request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY], + 1, edge_YZ_cont_type, g_nb_y_up, 140, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + /* z2y edge */ + MPI_Isend(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LY-1)], + 1, edge_YZ_gath_type, g_nb_y_up, 141, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 2*T*LX], + 1, edge_YZ_cont_type, g_nb_y_dn, 141, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* y2z edge */ + MPI_Isend(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + 1], + 1, edge_YZ_gath_type, g_nb_y_dn, 142, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 4*T*LX], + 1, edge_YZ_cont_type, g_nb_y_up, 142, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + /* y2z-edge */ + MPI_Isend(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LY-2)], + 1, edge_YZ_gath_type, g_nb_y_up, 143, + g_cart_grid, &request[cntr]); + MPI_Irecv(buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 6*T*LX], + 1, edge_YZ_cont_type, g_nb_y_dn, 143, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } + MPI_Waitall(cntr, request, status); /* completes the z-related edge requests posted since cntr was reset above */ +#endif \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils_generic_exchange.3.inc b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils_generic_exchange.3.inc new file mode 100644 index
0000000000000000000000000000000000000000..80bcfebfd0c35ae5d42d1a8625e320ac76f20f2e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils_generic_exchange.3.inc @@ -0,0 +1,415 @@ +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* Every MPI_Sendrecv in this fragment sends one slice_* or edge_* element of buffer to one neighbour and receives one from the opposite neighbour, using the same tag for both halves. First: send the data to the neighbour on the left in t direction */ + /* receive the data from the neighbour on the right in t direction */ + MPI_Sendrecv(buffer[gI_0_0_0_0], 1, slice_T_cont_type, g_nb_t_dn, 83, + buffer[gI_L_0_0_0], 1, slice_T_cont_type, g_nb_t_up, 83, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in t direction */ + /* receive the data from the neighbour on the left in t direction */ + MPI_Sendrecv(buffer[gI_Lm1_0_0_0], 1, slice_T_cont_type, g_nb_t_up, 84, + buffer[gI_m1_0_0_0], 1, slice_T_cont_type, g_nb_t_dn, 84, + g_cart_grid, &status); + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in t direction */ + /* receive the data from the neighbour on the right in t direction */ + /* t2 boundary */ + MPI_Sendrecv(buffer[gI_p1_0_0_0], 1, slice_T_cont_type, g_nb_t_dn, 85, + buffer[gI_Lp1_0_0_0], 1, slice_T_cont_type, g_nb_t_up, 85, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in t direction */ + /* receive the data from the neighbour on the left in t direction */ + /* t2 boundary */ + MPI_Sendrecv(buffer[gI_Lm2_0_0_0], 1, slice_T_cont_type, g_nb_t_up, 86, + buffer[gI_m2_0_0_0], 1, slice_T_cont_type, g_nb_t_dn, 86, + g_cart_grid, &status); + } + +# endif +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + MPI_Sendrecv(buffer[gI_0_0_0_0], 1, slice_X_gath_type, g_nb_x_dn, 87, + buffer[gI_0_L_0_0], 1, slice_X_cont_type, g_nb_x_up, 87, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x 
direction */ + /* x boundary */ + MPI_Sendrecv(buffer[gI_0_Lm1_0_0], 1, slice_X_gath_type, g_nb_x_up, 88, + buffer[gI_0_m1_0_0], 1, slice_X_cont_type, g_nb_x_dn, 88, + g_cart_grid, &status); + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* x2 boundary */ + MPI_Sendrecv(buffer[gI_0_p1_0_0], 1, slice_X_gath_type, g_nb_x_dn, 89, + buffer[gI_0_Lp1_0_0], 1, slice_X_cont_type, g_nb_x_up, 89, + g_cart_grid, &status); + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* x2 boundary */ + MPI_Sendrecv(buffer[gI_0_Lm2_0_0], 1, slice_X_gath_type, g_nb_x_up, 90, + buffer[gI_0_m2_0_0], 1, slice_X_cont_type, g_nb_x_dn, 90, + g_cart_grid, &status); + } +# endif + /* The edges */ +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the left in t direction */ + /* receive the data from the neighbour on the right in t direction */ + /* is on the x boundary: xt-edge */ + MPI_Sendrecv(buffer[gI_0_L_0_0], 1, edge_XT_gath_type, g_nb_t_dn, 100, + buffer[gI_L_L_0_0], 1, edge_XT_cont_type, g_nb_t_up, 100, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in t direction */ + /* receive the data from the neighbour on the left in t direction */ + /* xt-edge */ + MPI_Sendrecv(buffer[gI_Lm1_L_0_0], 1, edge_XT_gath_type, g_nb_t_up, 101, + buffer[gI_m1_L_0_0], 1, edge_XT_cont_type, g_nb_t_dn, 101, + g_cart_grid, &status); + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in t direction */ + /* receive the data from the neighbour on the right in t direction */ + /* t2x-edge */ + MPI_Sendrecv(buffer[gI_p1_L_0_0], 1, edge_XT_gath_type, g_nb_t_dn, 102, + buffer[gI_Lp1_L_0_0], 1, edge_XT_cont_type, g_nb_t_up, 102, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in t direction 
*/ + /* receive the data from the neighbour on the left in t direction */ + /* t2x-edge */ + MPI_Sendrecv(buffer[gI_Lm2_L_0_0], 1, edge_XT_gath_type, g_nb_t_up, 103, + buffer[gI_m2_L_0_0], 1, edge_XT_cont_type, g_nb_t_dn, 103, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in t direction */ + /* receive the data from the neighbour on the right in t direction */ + /* x2t-edge */ + MPI_Sendrecv(buffer[gI_0_Lp1_0_0], 1, edge_XT_gath_type, g_nb_t_dn, 104, + buffer[gI_L_Lp1_0_0], 1, edge_XT_cont_type, g_nb_t_up, 104, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in t direction */ + /* receive the data from the neighbour on the left in t direction */ + /* x2t-edge */ + MPI_Sendrecv(buffer[gI_Lm1_Lp1_0_0], 1, edge_XT_gath_type, g_nb_t_up, 105, + buffer[gI_m1_Lp1_0_0], 1, edge_XT_cont_type, g_nb_t_dn, 105, + g_cart_grid, &status); + } + /* end of if defined PARALLELXT || PARALLELXYT || PARALLELXYZT */ +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + MPI_Sendrecv(buffer[gI_0_0_0_0], 1, slice_Y_gath_type, g_nb_y_dn, 106, + buffer[gI_0_0_L_0], 1, slice_Y_cont_type, g_nb_y_up, 106, + g_cart_grid, &status); + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + MPI_Sendrecv(buffer[gI_0_0_Lm1_0], 1, slice_Y_gath_type, g_nb_y_up, 107, + buffer[gI_0_0_m1_0], 1, slice_Y_cont_type, g_nb_y_dn, 107, + g_cart_grid, &status); + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* y2 boundary */ + MPI_Sendrecv(buffer[gI_0_0_p1_0], 1, slice_Y_gath_type, g_nb_y_dn, 108, + buffer[gI_0_0_Lp1_0], 1, slice_Y_cont_type, g_nb_y_up, 108, + g_cart_grid, 
&status); + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + /* y2 boundary */ + MPI_Sendrecv(buffer[gI_0_0_Lm2_0], 1, slice_Y_gath_type, g_nb_y_up, 109, + buffer[gI_0_0_m2_0], 1, slice_Y_cont_type, g_nb_y_dn, 109, + g_cart_grid, &status); + } +# endif + /* now it gets really ugly ... */ + + /* edges */ +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* is on the y boundary -> yx-edge */ + MPI_Sendrecv(buffer[gI_0_0_L_0], 1, edge_XY_gath_type, g_nb_x_dn, 110, + buffer[gI_0_L_L_0], 1, edge_XY_cont_type, g_nb_x_up, 110, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* yx-edge */ + MPI_Sendrecv(buffer[gI_0_Lm1_L_0], 1, edge_XY_gath_type, g_nb_x_up, 111, + buffer[gI_0_m1_L_0], 1, edge_XY_cont_type, g_nb_x_dn, 111, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* is on the t boundary -> ty-edge */ + MPI_Sendrecv(buffer[gI_L_0_0_0], 1, edge_YT_gath_type, g_nb_y_dn, 112, + buffer[gI_L_0_L_0], 1, edge_YT_cont_type, g_nb_y_up, 112, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + /* ty-edge */ + MPI_Sendrecv(buffer[gI_L_0_Lm1_0], 1, edge_YT_gath_type, g_nb_y_up, 113, + buffer[gI_L_0_m1_0], 1, edge_YT_cont_type, g_nb_y_dn, 113, + g_cart_grid, &status); +# endif + + + if(g_dbw2rand > 0) { + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send 
the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* x2y edge */ + MPI_Sendrecv(buffer[gI_0_p1_L_0], 1, edge_XY_gath_type, g_nb_x_dn, 114, + buffer[gI_0_Lp1_L_0], 1, edge_XY_cont_type, g_nb_x_up, 114, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* x2y-edge */ + MPI_Sendrecv(buffer[gI_0_Lm2_L_0], 1, edge_XY_gath_type, g_nb_x_up, 115, + buffer[gI_0_m2_L_0], 1, edge_XY_cont_type, g_nb_x_dn, 115, + g_cart_grid, &status); + + + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* y2x -edge */ + MPI_Sendrecv(buffer[gI_0_0_Lp1_0], 1, edge_XY_gath_type, g_nb_x_dn, 116, + buffer[gI_0_L_Lp1_0], 1, edge_XY_cont_type, g_nb_x_up, 116, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* y2x edge */ + MPI_Sendrecv(buffer[gI_0_Lm1_Lp1_0], 1, edge_XY_gath_type, g_nb_x_up, 117, + buffer[gI_0_m1_Lp1_0], 1, edge_XY_cont_type, g_nb_x_dn, 117, + g_cart_grid, &status); + +# endif +# if (defined PARALLELXYT || defined PARALLELXYZT ) + + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* t2y-edge */ + MPI_Sendrecv(buffer[gI_Lp1_0_0_0], 1, edge_YT_gath_type, g_nb_y_dn, 118, + buffer[gI_Lp1_0_L_0], 1, edge_YT_cont_type, g_nb_y_up, 118, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + /* t2y edge */ + MPI_Sendrecv(buffer[gI_Lp1_0_Lm1_0], 1, edge_YT_gath_type, g_nb_y_up, 119, + buffer[gI_Lp1_0_m1_0], 1, edge_YT_cont_type, g_nb_y_dn, 119, + g_cart_grid, &status); + + /* send the data to 
the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* y2t edge */ + MPI_Sendrecv(buffer[gI_L_0_p1_0], 1, edge_YT_gath_type, g_nb_y_dn, 120, + buffer[gI_L_0_Lp1_0], 1, edge_YT_cont_type, g_nb_y_up, 120, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + /* y2t-edge */ + MPI_Sendrecv(buffer[gI_L_0_Lm2_0], 1, edge_YT_gath_type, g_nb_y_up, 121, + buffer[gI_L_0_m2_0], 1, edge_YT_cont_type, g_nb_y_dn, 121, + g_cart_grid, &status); +# endif /* end of if defined PARALLELXYT || PARALLELXYZT */ + } + + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* z boundary */ + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + MPI_Sendrecv(buffer[gI_0_0_0_0], 1, slice_Z_gath_type, g_nb_z_dn, 122, + buffer[gI_0_0_0_L], 1, slice_Z_cont_type, g_nb_z_up, 122, + g_cart_grid, &status); + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + MPI_Sendrecv(buffer[gI_0_0_0_Lm1], 1, slice_Z_gath_type, g_nb_z_up, 123, + buffer[gI_0_0_0_m1], 1, slice_Z_cont_type, g_nb_z_dn, 123, + g_cart_grid, &status); + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + /* z2 boundary */ + MPI_Sendrecv(buffer[gI_0_0_0_p1], 1, slice_Z_gath_type, g_nb_z_dn, 124, + buffer[gI_0_0_0_Lp1], 1, slice_Z_cont_type, g_nb_z_up, 124, + g_cart_grid, &status); + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + /* z2 boundary */ + MPI_Sendrecv(buffer[gI_0_0_0_Lm2], 1, slice_Z_gath_type, g_nb_z_up, 125, + buffer[gI_0_0_0_m2], 1, slice_Z_cont_type, g_nb_z_dn, 125, + g_cart_grid, &status); 
+ } + +# endif + /* edges */ + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* is on the z boundary -> zx-edge */ + MPI_Sendrecv(buffer[gI_0_0_0_L], 1, edge_XZ_gath_type, g_nb_x_dn, 126, + buffer[gI_0_L_0_L], 1, edge_XZ_cont_type, g_nb_x_up, 126, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* zx-edge */ + MPI_Sendrecv(buffer[gI_0_Lm1_0_L], 1, edge_XZ_gath_type, g_nb_x_up, 127, + buffer[gI_0_m1_0_L], 1, edge_XZ_cont_type, g_nb_x_dn, 127, + g_cart_grid, &status); +# endif + +# if (defined PARALLELXYZT) + + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + /* is on the t boundary -> tz-edge */ + MPI_Sendrecv(buffer[gI_L_0_0_0], 1, edge_ZT_gath_type, g_nb_z_dn, 128, + buffer[gI_L_0_0_L], 1, edge_ZT_cont_type, g_nb_z_up, 128, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + /* tz-edge */ + MPI_Sendrecv(buffer[gI_L_0_0_Lm1], 1, edge_ZT_gath_type, g_nb_z_up, 129, + buffer[gI_L_0_0_m1], 1, edge_ZT_cont_type, g_nb_z_dn, 129, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* is on the z boundary -> zy-edge */ + MPI_Sendrecv(buffer[gI_0_0_0_L], 1, edge_YZ_gath_type, g_nb_y_dn, 130, + buffer[gI_0_0_L_L], 1, edge_YZ_cont_type, g_nb_y_up, 130, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + /* zy-edge */ + 
MPI_Sendrecv(buffer[gI_0_0_Lm1_L], 1, edge_YZ_gath_type, g_nb_y_up, 131, + buffer[gI_0_0_m1_L], 1, edge_YZ_cont_type, g_nb_y_dn, 131, + g_cart_grid, &status); + +# endif + + /* rectangular gauge action stuff: the exchanges below run only when g_dbw2rand > 0 */ + if(g_dbw2rand > 0) { + +# if (defined PARALLELXYZT) + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + /* t2z edge */ + MPI_Sendrecv(buffer[gI_Lp1_0_0_0], 1, edge_ZT_gath_type, g_nb_z_dn, 132, + buffer[gI_Lp1_0_0_L], 1, edge_ZT_cont_type, g_nb_z_up, 132, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + /* t2z-edge */ + MPI_Sendrecv(buffer[gI_Lp1_0_0_Lm1], 1, edge_ZT_gath_type, g_nb_z_up, 133, + buffer[gI_Lp1_0_0_m1], 1, edge_ZT_cont_type, g_nb_z_dn, 133, + g_cart_grid, &status); + + + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + /* z2t -edge */ + MPI_Sendrecv(buffer[gI_L_0_0_p1], 1, edge_ZT_gath_type, g_nb_z_dn, 134, + buffer[gI_L_0_0_Lp1], 1, edge_ZT_cont_type, g_nb_z_up, 134, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + /* z2t edge */ + MPI_Sendrecv(buffer[gI_L_0_0_Lm2], 1, edge_ZT_gath_type, g_nb_z_up, 135, + buffer[gI_L_0_0_m2], 1, edge_ZT_cont_type, g_nb_z_dn, 135, + g_cart_grid, &status); + +# endif +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* z2x-edge */ + MPI_Sendrecv(buffer[gI_0_0_0_Lp1], 1, edge_XZ_gath_type, g_nb_x_dn, 136, + buffer[gI_0_L_0_Lp1], 1, edge_XZ_cont_type, g_nb_x_up, 136, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* 
receive the data from the neighbour on the left in x direction */ + /* z2x edge */ + MPI_Sendrecv(buffer[gI_0_Lm1_0_Lp1], 1, edge_XZ_gath_type, g_nb_x_up, 137, + buffer[gI_0_m1_0_Lp1], 1, edge_XZ_cont_type, g_nb_x_dn, 137, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* x2z edge */ + MPI_Sendrecv(buffer[gI_0_p1_0_L], 1, edge_XZ_gath_type, g_nb_x_dn, 138, + buffer[gI_0_Lp1_0_L], 1, edge_XZ_cont_type, g_nb_x_up, 138, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* x2z-edge */ + MPI_Sendrecv(buffer[gI_0_Lm2_0_L], 1, edge_XZ_gath_type, g_nb_x_up, 139, + buffer[gI_0_m2_0_L], 1, edge_XZ_cont_type, g_nb_x_dn, 139, + g_cart_grid, &status); + +# endif +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* z2y-edge */ + MPI_Sendrecv(buffer[gI_0_0_0_Lp1], 1, edge_YZ_gath_type, g_nb_y_dn, 140, + buffer[gI_0_0_L_Lp1], 1, edge_YZ_cont_type, g_nb_y_up, 140, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + /* z2y edge */ + MPI_Sendrecv(buffer[gI_0_0_Lm1_Lp1], 1, edge_YZ_gath_type, g_nb_y_up, 141, + buffer[gI_0_0_m1_Lp1], 1, edge_YZ_cont_type, g_nb_y_dn, 141, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* y2z edge */ + MPI_Sendrecv(buffer[gI_0_0_p1_L], 1, edge_YZ_gath_type, g_nb_y_dn, 142, + buffer[gI_0_0_Lp1_L], 1, edge_YZ_cont_type, g_nb_y_up, 142, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* receive the data 
from the neighbour on the left in y direction */ + /* y2z-edge */ + MPI_Sendrecv(buffer[gI_0_0_Lm2_L], 1, edge_YZ_gath_type, g_nb_y_up, 143, + buffer[gI_0_0_m2_L], 1, edge_YZ_cont_type, g_nb_y_dn, 143, + g_cart_grid, &status); + +# endif /* end of if defined PARALLELXYZT or PARALLELXYZ */ + } diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils_generic_exchange.4.inc b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils_generic_exchange.4.inc new file mode 100644 index 0000000000000000000000000000000000000000..e6e5f975c1d56475a1ae96678b6b68ddaceba10a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils_generic_exchange.4.inc @@ -0,0 +1,457 @@ + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + MPI_Sendrecv(buffer[0], 1, slice_T_cont_type, g_nb_t_dn, 83, + buffer[VOLUME], 1, slice_T_cont_type, g_nb_t_up, 83, + g_cart_grid, &status); + + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Sendrecv(buffer[(T-1)*LX*LY*LZ], 1, slice_T_cont_type, g_nb_t_up, 84, + buffer[(T+1)*LX*LY*LZ], 1, slice_T_cont_type, g_nb_t_dn, 84, + g_cart_grid, &status); + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + /* t2-Rand */ + MPI_Sendrecv(buffer[1*LX*LY*LZ], 1, slice_T_cont_type, g_nb_t_dn, 85, + buffer[VOLUMEPLUSRAND], 1, slice_T_cont_type, g_nb_t_up, 85, + g_cart_grid, &status); + + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + /* t2-Rand */ + MPI_Sendrecv(buffer[(T-2)*LX*LY*LZ], 1, slice_T_cont_type, g_nb_t_up, 86, + buffer[VOLUMEPLUSRAND+LX*LY*LZ], 1, slice_T_cont_type, g_nb_t_dn, 86, + g_cart_grid, &status); + } + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data 
from the neighbour on the right in x direction */ + MPI_Sendrecv(buffer[0], 1, slice_X_gath_type, g_nb_x_dn, 93, + buffer[(T+2)*LX*LY*LZ], 1, slice_X_cont_type, g_nb_x_up, 93, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* x2-Rand */ + MPI_Sendrecv(buffer[(LX-1)*LY*LZ], 1, slice_X_gath_type, g_nb_x_up, 94, + buffer[(T+2)*LX*LY*LZ + T*LY*LZ], 1, slice_X_cont_type, g_nb_x_dn, 94, + g_cart_grid, &status); + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* x2-Rand */ + MPI_Sendrecv(buffer[LY*LZ], 1, slice_X_gath_type, g_nb_x_dn, 95, + buffer[VOLUMEPLUSRAND+2*LX*LY*LZ], 1, slice_X_cont_type, g_nb_x_up, 95, + g_cart_grid, &status); + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* x2-Rand */ + MPI_Sendrecv(buffer[(LX-2)*LY*LZ], 1, slice_X_gath_type, g_nb_x_up, 96, + buffer[VOLUMEPLUSRAND+2*LX*LY*LZ + T*LY*LZ], 1, slice_X_cont_type, g_nb_x_dn, 96, + g_cart_grid, &status); + } + + /* The edges */ + + /* send the data to the neighbour on the left in t direction */ + /* recieve the data from the neighbour on the right in t direction */ + /* is on the x-Rand: xt-edge */ + MPI_Sendrecv(buffer[(T+2)*LX*LY*LZ], 1, edge_XT_gath_type, g_nb_t_dn, 95, + buffer[VOLUME + RAND], 1, edge_XT_cont_type, g_nb_t_up, 95, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + /* xt-edge */ + MPI_Sendrecv(buffer[(T+2)*LX*LY*LZ + (T-1)*LY*LZ], 1, edge_XT_gath_type, g_nb_t_up, 96, + buffer[VOLUME + RAND + 2*LY*LZ], 1, edge_XT_cont_type, g_nb_t_dn, 96, + g_cart_grid, &status); + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in t 
direction */ + /* recieve the data from the neighbour on the right in t direction */ + /* t2x-edge */ + MPI_Sendrecv(buffer[(T+2)*LX*LY*LZ + LY*LZ], + 1, edge_XT_gath_type, g_nb_t_dn, 97, + buffer[VOLUMEPLUSRAND + RAND], + 1, edge_XT_cont_type, g_nb_t_up, 97, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + /* t2x-edge */ + MPI_Sendrecv(buffer[(T+2)*LX*LY*LZ + (T-2)*LY*LZ], + 1, edge_XT_gath_type, g_nb_t_up, 98, + buffer[VOLUMEPLUSRAND + RAND + 2*LY*LZ], + 1, edge_XT_cont_type, g_nb_t_dn, 98, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in t direction */ + /* recieve the data from the neighbour on the right in t direction */ + /* x2t-edge */ + MPI_Sendrecv(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ], + 1, edge_XT_gath_type, g_nb_t_dn, 97, + buffer[VOLUMEPLUSRAND + RAND + 4*LY*LZ], + 1, edge_XT_cont_type, g_nb_t_up, 97, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + /* x2t-edge */ + MPI_Sendrecv(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ + (T-1)*LY*LZ], + 1, edge_XT_gath_type, g_nb_t_up, 98, + buffer[VOLUMEPLUSRAND + RAND + 6*LY*LZ], + 1, edge_XT_cont_type, g_nb_t_dn, 98, + g_cart_grid, &status); + } + /* end of if defined PARALLELXT || PARALLELXYT || PARALLELXYZT*/ +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Sendrecv(buffer[0], 1, slice_Y_gath_type, g_nb_y_dn, 103, + buffer[VOLUME + 2*LZ*(LX*LY + T*LY)], 1, slice_Y_cont_type, g_nb_y_up, 103, + g_cart_grid, &status); + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Sendrecv(buffer[(LY-1)*LZ], 1, slice_Y_gath_type, 
g_nb_y_up, 104, + buffer[VOLUME + 2*LZ*(LX*LY + T*LY) + T*LX*LZ], 1, slice_Y_cont_type, g_nb_y_dn, 104, + g_cart_grid, &status); + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* y2-Rand */ + MPI_Sendrecv(buffer[LZ], 1, slice_Y_gath_type, g_nb_y_dn, 105, + buffer[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ], 1, slice_Y_cont_type, g_nb_y_up, 105, + g_cart_grid, &status); + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* y2-Rand */ + MPI_Sendrecv(buffer[(LY-2)*LZ], 1, slice_Y_gath_type, g_nb_y_up, 106, + buffer[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ + T*LX*LZ], 1, slice_Y_cont_type, g_nb_y_dn, 106, + g_cart_grid, &status); + } + + /* jetzt wirds richtig eklig ... */ + + /* edges */ + + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* is on the y-Rand -> yx-edge*/ + MPI_Sendrecv(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ], 1, edge_XY_gath_type, g_nb_x_dn, 107, + buffer[VOLUME + RAND + 4*LY*LZ], 1, edge_XY_cont_type, g_nb_x_up, 107, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* yx-edge */ + MPI_Sendrecv(buffer[VOLUME + 2*LZ*(LX*LY + T*LY) + (LX-1)*LZ], 1, edge_XY_gath_type, g_nb_x_up, 108, + buffer[VOLUME + RAND + 4*LY*LZ + 2*T*LZ], 1, edge_XY_cont_type, g_nb_x_dn, 108, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* is on the t-Rand -> ty-edge*/ + MPI_Sendrecv(buffer[VOLUME], 1, edge_YT_gath_type, g_nb_y_dn, 109, + buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ], 1, edge_YT_cont_type, g_nb_y_up, 109, + g_cart_grid, &status); + + /* send the data to the 
neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* ty-edge */ + MPI_Sendrecv(buffer[VOLUME + (LY-1)*LZ], 1, edge_YT_gath_type, g_nb_y_up, 110, + buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 2*LX*LZ], 1, edge_YT_cont_type, g_nb_y_dn, 110, + g_cart_grid, &status); + + + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* x2y edge */ + MPI_Sendrecv(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + LZ], + 1, edge_XY_gath_type, g_nb_x_dn, 97, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ], + 1, edge_XY_cont_type, g_nb_x_up, 97, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* x2y-edge */ + MPI_Sendrecv(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + (LX-2)*LZ], + 1, edge_XY_gath_type, g_nb_x_up, 98, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 2*T*LZ], + 1, edge_XY_cont_type, g_nb_x_dn, 98, + g_cart_grid, &status); + + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* y2x -edge */ + MPI_Sendrecv(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ], + 1, edge_XY_gath_type, g_nb_x_dn, 97, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 4*T*LZ], + 1, edge_XY_cont_type, g_nb_x_up, 97, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* y2x edge */ + MPI_Sendrecv(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + (LX-1)*LZ], + 1, edge_XY_gath_type, g_nb_x_up, 98, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 6*T*LZ], + 1, edge_XY_cont_type, g_nb_x_dn, 98, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y 
direction */ + /* t2y-edge */ + MPI_Sendrecv(buffer[VOLUMEPLUSRAND], + 1, edge_YT_gath_type, g_nb_y_dn, 197, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ], + 1, edge_YT_cont_type, g_nb_y_up, 197, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* t2y edge */ + MPI_Sendrecv(buffer[VOLUMEPLUSRAND + (LY-1)*LZ], + 1, edge_YT_gath_type, g_nb_y_up, 198, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 2*LX*LZ], + 1, edge_YT_cont_type, g_nb_y_dn, 198, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* y2t edge */ + MPI_Sendrecv(buffer[VOLUME + LZ], + 1, edge_YT_gath_type, g_nb_y_dn, 297, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 4*LX*LZ], + 1, edge_YT_cont_type, g_nb_y_up, 297, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* y2t-edge */ + MPI_Sendrecv(buffer[VOLUME + (LY-2)*LZ], + 1, edge_YT_gath_type, g_nb_y_up, 298, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 6*LX*LZ], + 1, edge_YT_cont_type, g_nb_y_dn, 298, + g_cart_grid, &status); + } + + /* end of if defined PARALLELXYT || PARALLELXYZT */ +# endif +# if defined PARALLELXYZT + /* z-Rand */ + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Sendrecv(buffer[0], + 1, slice_Z_gath_type, g_nb_z_dn, 303, + buffer[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*LZ*T*LX], + 1, slice_Z_cont_type, g_nb_z_up, 303, + g_cart_grid, &status); + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + MPI_Sendrecv(buffer[LZ-1], + 1, slice_Z_gath_type, g_nb_z_up, 304, + buffer[VOLUME + 2*LZ*(LX*LY + 
T*LY) + 2*T*LX*LZ + T*LX*LY], + 1, slice_Z_cont_type, g_nb_z_dn, 304, + g_cart_grid, &status); + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + /* z2-Rand */ + MPI_Sendrecv(buffer[1], + 1, slice_Z_gath_type, g_nb_z_dn, 305, + buffer[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ + 2*T*LX*LZ], + 1, slice_Z_cont_type, g_nb_z_up, 305, + g_cart_grid, &status); + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* z2-Rand */ + MPI_Sendrecv(buffer[LZ-2], + 1, slice_Z_gath_type, g_nb_z_up, 306, + buffer[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ + 2*T*LX*LZ + T*LX*LY], + 1, slice_Z_cont_type, g_nb_z_dn, 306, + g_cart_grid, &status); + } + + /* edges */ + + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* is on the z-Rand -> zx-edge*/ + MPI_Sendrecv(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ], + 1, edge_XZ_gath_type, g_nb_x_dn, 307, + buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ], + 1, edge_XZ_cont_type, g_nb_x_up, 307, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* zx-edge */ + MPI_Sendrecv(buffer[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ + (LX-1)*LY], + 1, edge_XZ_gath_type, g_nb_x_up, 308, + buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 2*T*LY], + 1, edge_XZ_cont_type, g_nb_x_dn, 308, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + /* is on the t-Rand -> tz-edge*/ + MPI_Sendrecv(buffer[VOLUME], + 1, edge_ZT_gath_type, g_nb_z_dn, 309, + buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY], + 1, edge_ZT_cont_type, g_nb_z_up, 309, + g_cart_grid, 
&status); + + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* tz-edge */ + MPI_Sendrecv(buffer[VOLUME + (LZ-1)], + 1, edge_ZT_gath_type, g_nb_z_up, 310, + buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 2*LX*LY], + 1, edge_ZT_cont_type, g_nb_z_dn, 310, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* is on the z-Rand -> zy-edge*/ + MPI_Sendrecv(buffer[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ], + 1, edge_YZ_gath_type, g_nb_y_dn, 310, + buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY], + 1, edge_YZ_cont_type, g_nb_y_up, 310, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* zy-edge */ + MPI_Sendrecv(buffer[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ + (LY-1)], + 1, edge_YZ_gath_type, g_nb_y_up, 310, + buffer[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY + 2*T*LX], + 1, edge_YZ_cont_type, g_nb_y_dn, 310, + g_cart_grid, &status); + + /* rectangular gauge action Stuff! 
*/ + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + /* t2z edge */ + MPI_Sendrecv(buffer[VOLUMEPLUSRAND], + 1, edge_ZT_gath_type, g_nb_z_dn, 500, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ], + 1, edge_ZT_cont_type, g_nb_z_up, 500, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* t2z-edge */ + MPI_Sendrecv(buffer[VOLUMEPLUSRAND + (LZ-1)], + 1, edge_ZT_gath_type, g_nb_z_up, 501, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 2*LX*LY], + 1, edge_ZT_cont_type, g_nb_z_dn, 501, + g_cart_grid, &status); + + + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + /* z2t -edge */ + MPI_Sendrecv(buffer[VOLUME + 1], + 1, edge_ZT_gath_type, g_nb_z_dn, 502, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 4*LX*LY], + 1, edge_ZT_cont_type, g_nb_z_up, 502, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* z2t edge */ + MPI_Sendrecv(buffer[VOLUME + (LZ-2)], + 1, edge_ZT_gath_type, g_nb_z_up, 503, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 6*LX*LY], + 1, edge_ZT_cont_type, g_nb_z_dn, 503, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* z2x-edge */ + MPI_Sendrecv(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ], + 1, edge_XZ_gath_type, g_nb_x_dn, 504, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY], + 1, edge_XZ_cont_type, g_nb_x_up, 504, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* 
recieve the data from the neighbour on the left in x direction */ + /* z2x edge */ + MPI_Sendrecv(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LX-1)*LY], + 1, edge_XZ_gath_type, g_nb_x_up, 504, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 2*T*LY], + 1, edge_XZ_cont_type, g_nb_x_dn, 504, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* x2z edge */ + MPI_Sendrecv(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + LY], + 1, edge_XZ_gath_type, g_nb_x_dn, 505, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 4*T*LY], + 1, edge_XZ_cont_type, g_nb_x_up, 505, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* x2z-edge */ + MPI_Sendrecv(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LX-2)*LY], + 1, edge_XZ_gath_type, g_nb_x_up, 506, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 6*T*LY], + 1, edge_XZ_cont_type, g_nb_x_dn, 506, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* z2y-edge */ + MPI_Sendrecv(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ], + 1, edge_YZ_gath_type, g_nb_y_dn, 507, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY], + 1, edge_YZ_cont_type, g_nb_y_up, 507, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* z2y edge */ + MPI_Sendrecv(buffer[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LY-1)], + 1, edge_YZ_gath_type, g_nb_y_up, 508, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 2*T*LX], + 1, 
edge_YZ_cont_type, g_nb_y_dn, 508, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* y2z edge */ + MPI_Sendrecv(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + 1], + 1, edge_YZ_gath_type, g_nb_y_dn, 509, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 4*T*LX], + 1, edge_YZ_cont_type, g_nb_y_up, 509, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + /* y2z-edge */ + MPI_Sendrecv(buffer[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LY-2)], + 1, edge_YZ_gath_type, g_nb_y_up, 510, + buffer[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 6*T*LX], + 1, edge_YZ_cont_type, g_nb_y_dn, 510, + g_cart_grid, &status); + + } + +#endif /* PARALLELXYZT */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils_generic_exchange.c b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils_generic_exchange.c new file mode 100644 index 0000000000000000000000000000000000000000..d1566a9fb5fbaebff55adc83ef464cdee80ae17f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/buffers/utils_generic_exchange.c @@ -0,0 +1,146 @@ +#include "utils.ih" +/* generic_exchange(field_in, bytes_per_site): field_in points to the site data and bytes_per_site is the size of one site in bytes. The function builds the MPI datatypes for slices and edges and then runs the communication code that is textually included at its end. */ +#ifndef MPI /* Without MPI there is nothing to exchange: provide an empty stub. */ +void generic_exchange(void *field_in, int bytes_per_site) +{} +#else /* MPI */ +void generic_exchange(void *field_in, int bytes_per_site) +{ +#if defined _NON_BLOCKING + int cntr=0; /* presumably the running request index used by the included non-blocking code - TODO confirm */ + MPI_Request request[108]; /* fixed capacity: 108 request slots ... */ + MPI_Status status[108]; /* ... and 108 matching status slots */ +#else /* _NON_BLOCKING */ + MPI_Status status; /* single status object for the blocking variants */ +#endif /* _NON_BLOCKING */ + static int initialized = 0; /* 0: datatypes not built yet; otherwise the bytes_per_site value they were built for */ + + /* All MPI datatypes are static, so they persist between calls and are rebuilt only when bytes_per_site changes */ + static MPI_Datatype site_type; + + static MPI_Datatype slice_X_cont_type, slice_Y_cont_type, slice_Z_cont_type, slice_T_cont_type; + static MPI_Datatype 
slice_X_subs_type, slice_Y_subs_type; + static MPI_Datatype slice_X_gath_type, slice_Y_gath_type, slice_Z_gath_type; + + static MPI_Datatype edge_XY_cont_type, edge_XZ_cont_type, edge_XT_cont_type, edge_YZ_cont_type, edge_YT_cont_type, edge_ZT_cont_type; + static MPI_Datatype edge_XY_gath_type, edge_XZ_gath_type, edge_XT_gath_type, edge_YZ_gath_type, edge_YT_gath_type, edge_ZT_gath_type; + + unsigned char(*buffer)[bytes_per_site] = field_in; /* View the field as an array of bytes_per_site-byte elements, so that buffer[i] is the address of site i */ + + // To avoid continuous MPI operations on these local variables, let's declare them static. + // That means we should only initialize if this is the first use of the function, or if + // the existing initialization is for the wrong number of bytes per site! + if (initialized && (initialized != bytes_per_site)) + { + MPI_Type_free(&site_type); + + MPI_Type_free(&slice_T_cont_type); + MPI_Type_free(&slice_X_cont_type); + MPI_Type_free(&slice_Y_cont_type); + MPI_Type_free(&slice_Z_cont_type); + + MPI_Type_free(&slice_X_subs_type); + MPI_Type_free(&slice_Y_subs_type); + + MPI_Type_free(&slice_X_gath_type); + MPI_Type_free(&slice_Y_gath_type); + MPI_Type_free(&slice_Z_gath_type); + + MPI_Type_free(&edge_XY_cont_type); + MPI_Type_free(&edge_XZ_cont_type); + MPI_Type_free(&edge_XT_cont_type); + MPI_Type_free(&edge_YZ_cont_type); + MPI_Type_free(&edge_YT_cont_type); + MPI_Type_free(&edge_ZT_cont_type); + + MPI_Type_free(&edge_XY_gath_type); + MPI_Type_free(&edge_XZ_gath_type); + MPI_Type_free(&edge_XT_gath_type); + MPI_Type_free(&edge_YZ_gath_type); + MPI_Type_free(&edge_YT_gath_type); + MPI_Type_free(&edge_ZT_gath_type); + + /* All types built for the previous site size are released; fall through to the 
(re)initialization below */ + initialized = 0; + } + + if (!initialized) + { + /* Initialization of the datatypes - adapted from mpi_init.c */ + MPI_Type_contiguous(bytes_per_site, MPI_BYTE, &site_type); + MPI_Type_commit(&site_type); + /* Contiguous slice types: LX*LY*LZ, T*LY*LZ, T*LX*LZ and T*LX*LY sites for the T, X, Y and Z slices */ + MPI_Type_contiguous(LX * LY *LZ, site_type, &slice_T_cont_type); + MPI_Type_contiguous( T * LY *LZ, site_type, &slice_X_cont_type); + MPI_Type_contiguous( T * LX *LZ, site_type, &slice_Y_cont_type); + MPI_Type_contiguous( T * LX *LY, site_type, &slice_Z_cont_type); + + MPI_Type_commit(&slice_T_cont_type); + MPI_Type_commit(&slice_X_cont_type); + MPI_Type_commit(&slice_Y_cont_type); + MPI_Type_commit(&slice_Z_cont_type); + /* Sub-blocks of LY*LZ and LZ consecutive sites; they are the elements of the strided X and Y slice types below */ + MPI_Type_contiguous(LY * LZ, site_type, &slice_X_subs_type); + MPI_Type_contiguous(LZ, site_type, &slice_Y_subs_type); + + MPI_Type_commit(&slice_X_subs_type); + MPI_Type_commit(&slice_Y_subs_type); + /* Strided slice types: single-element blocks separated by a stride of LX, LY or LZ elements of the respective base type */ + MPI_Type_vector(T, 1, LX, slice_X_subs_type, &slice_X_gath_type); + MPI_Type_vector(T * LX, 1, LY, slice_Y_subs_type, &slice_Y_gath_type); + MPI_Type_vector(T * LX * LY, 1, LZ, site_type, &slice_Z_gath_type); + + MPI_Type_commit(&slice_X_gath_type); + MPI_Type_commit(&slice_Y_gath_type); + MPI_Type_commit(&slice_Z_gath_type); + /* Contiguous edge types: each holds twice the product of the two extents not named in the type */ + MPI_Type_contiguous(2 * T * LZ, site_type, &edge_XY_cont_type); + MPI_Type_contiguous(2 * T * LY, site_type, &edge_XZ_cont_type); + MPI_Type_contiguous(2 * LY * LZ, site_type, &edge_XT_cont_type); + MPI_Type_contiguous(2 * T * LX, site_type, &edge_YZ_cont_type); + MPI_Type_contiguous(2 * LX * LZ, site_type, &edge_YT_cont_type); + + MPI_Type_contiguous(2 * LX * LY, site_type, &edge_ZT_cont_type); + + MPI_Type_commit(&edge_XY_cont_type); + MPI_Type_commit(&edge_XZ_cont_type); + MPI_Type_commit(&edge_XT_cont_type); + MPI_Type_commit(&edge_YZ_cont_type); + MPI_Type_commit(&edge_YT_cont_type); + MPI_Type_commit(&edge_ZT_cont_type); + /* Strided edge types, the non-contiguous counterparts of the edge types above */ + MPI_Type_vector(2 * T, LZ, LX * LZ, site_type, &edge_XY_gath_type); + MPI_Type_vector(2 * T, LY, LY * LX, site_type, &edge_XZ_gath_type); + MPI_Type_vector(2, 1, T, slice_X_subs_type, 
&edge_XT_gath_type); + MPI_Type_vector(2 * T * LX, 1, LY, site_type, &edge_YZ_gath_type); + MPI_Type_vector(2 * LX, LZ, LY * LZ, site_type, &edge_YT_gath_type); + MPI_Type_vector(2 * LX * LY, 1, LZ, site_type, &edge_ZT_gath_type); + + MPI_Type_commit(&edge_XY_gath_type); + MPI_Type_commit(&edge_XZ_gath_type); + MPI_Type_commit(&edge_XT_gath_type); + MPI_Type_commit(&edge_YZ_gath_type); + MPI_Type_commit(&edge_YT_gath_type); + MPI_Type_commit(&edge_ZT_gath_type); + /* Remember the site size the types were built for; a later call with a different size triggers a rebuild */ + initialized = bytes_per_site; + } + + /* The communication code is textually included here; _NON_BLOCKING and _INDEX_INDEP_GEOM select one of four variants */ +#if defined _NON_BLOCKING +# if defined _INDEX_INDEP_GEOM +# include "utils_generic_exchange.1.inc" +# else /* _INDEX_INDEP_GEOM */ +# include "utils_generic_exchange.2.inc" +# endif /* _INDEX_INDEP_GEOM */ +#else /* _NON_BLOCKING */ +# if defined _INDEX_INDEP_GEOM +# include "utils_generic_exchange.3.inc" +# else /* _INDEX_INDEP_GEOM */ +# include "utils_generic_exchange.4.inc" +# endif /* _INDEX_INDEP_GEOM */ +#endif /* _NON_BLOCKING */ +} + +#endif /* MPI */ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/chebyshev_polynomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/chebyshev_polynomial.c new file mode 100644 index 0000000000000000000000000000000000000000..bbf6e42187868a4828b0941087309d287c7e3f43 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/chebyshev_polynomial.c @@ -0,0 +1,366 @@ +/*********************************************************************** + * + * Copyright (C) 2003,2005,2006,2007,2008 Mauro Papinutto, Ines Wetzorke, + * Karl Jansen, Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "linalg_eo.h" +#include "start.h" +#include "operator/tm_operators.h" +#include "operator/tm_operators_nd.h" +#include "chebyshev_polynomial.h" + +#define PI 3.141592653589793 + +double cheb_evmin, cheb_evmax; + +double func(double u, double exponent){ + return pow(u,exponent); +} + +void chebyshev_polynomial(double aa, double bb, double c[], int n, double exponent){ + int k,j; + double fac,bpa,bma,*f; + double inv_n; + + inv_n=1./(double)n; + f=calloc(n,sizeof(double));/*vector(0,n-1);*/ + fflush(stdout); + bma=0.5*(bb-aa); + bpa=0.5*(bb+aa); + for (k=0;k=1; j--) { + assign(&svs[0],&ds[0],VOLUME/2); + assign(&svc[0],&dc[0],VOLUME/2); + +/* if ( (j%10) == 0 ) { + sub_low_ev(&aux[0], &d[0]); + } + else { */ + assign(&auxs[0], &ds[0], VOLUME/2); + assign(&auxc[0], &dc[0], VOLUME/2); +/* } */ + + Qtm_dagger_ndpsi(&aux2s[0], &aux2c[0], &auxs[0], &auxc[0]); + Qtm_ndpsi(&R_s[0], &R_c[0], &aux2s[0], &aux2c[0]); + temp1=-1.0; + temp2=c[j]; + assign_mul_add_mul_add_mul_add_mul_r(&ds[0] , &R_s[0], &dds[0], &aux3s[0], fact2, fact1, temp1, temp2,VOLUME/2); + assign_mul_add_mul_add_mul_add_mul_r(&dc[0] , &R_c[0], &ddc[0], &aux3c[0], fact2, fact1, temp1, temp2,VOLUME/2); + assign(&dds[0], &svs[0],VOLUME/2); + assign(&ddc[0], &svc[0],VOLUME/2); + } + +/* sub_low_ev(&R[0],&d[0]); */ + assign(&R_s[0], &ds[0],VOLUME/2); + assign(&R_c[0], &dc[0],VOLUME/2); + + Qtm_dagger_ndpsi(&aux2s[0], &aux2c[0], &R_s[0], &R_c[0]); + Qtm_ndpsi(&auxs[0], &auxc[0], 
&aux2s[0], &aux2c[0]); + + temp1=-1.0; + temp2=c[0]/2; + temp3=fact1/2; + temp4=fact2/2; + assign_mul_add_mul_add_mul_add_mul_r(&auxs[0], &ds[0], &dds[0], &aux3s[0], temp3, temp4, temp1, temp2,VOLUME/2); + assign_mul_add_mul_add_mul_add_mul_r(&auxc[0], &dc[0], &ddc[0], &aux3c[0], temp3, temp4, temp1, temp2,VOLUME/2); + assign(&R_s[0], &auxs[0],VOLUME/2); + assign(&R_c[0], &auxc[0],VOLUME/2); + +/* addproj_q_invsqrt(&R[0], &S[0]); */ + +/* +#ifndef _SOLVER_OUTPUT + if(g_proc_id == g_stdio_proc){ + printf("Order of Chebysheff approximation = %d\n",j); + fflush( stdout);}; +#endif +*/ + + + free(svs_); + free(ds_); + free(dds_); + free(auxs_); + free(aux2s_); + free(aux3s_); + free(svc_); + free(dc_); + free(ddc_); + free(auxc_); + free(aux2c_); + free(aux3c_); + +} + + + +/************************************************************************** + * + * The externally accessible function is + * + * void degree_of_polynomial(const int repro) + * Computes the Chebyshev coefficients for the exponent 1/4 on the interval + * [cheb_evmin, cheb_evmax] into dop_cheby_coef, applies QdaggerQ_power four + * times to random fields (drawn with the repro flag) and compares the result + * with Qtm_ndpsi(Qtm_dagger_ndpsi(.)); the deviations are printed. + *******************************************************************************/ + +double stopeps=5.0e-16; + +int dop_n_cheby=0; +double * dop_cheby_coef; + +void degree_of_polynomial(const int repro){ + int i; + double temp; + static int ini=0; + + spinor *ss=NULL, *ss_=NULL, *auxs=NULL, *auxs_=NULL, + *aux2s=NULL, *aux2s_=NULL, *aux3s=NULL, *aux3s_=NULL; + spinor *sc=NULL, *sc_=NULL, *auxc=NULL, *auxc_=NULL, *aux2c=NULL, + *aux2c_=NULL, *aux3c=NULL, *aux3c_=NULL; + + /* the coefficient array is allocated on the first call only and kept afterwards */ + + if(ini==0){ + dop_cheby_coef = calloc(N_CHEBYMAX,sizeof(double)); + ini=1; + } + + /* Work fields: the names with a trailing underscore hold the pointers returned by calloc */ + /* (these are the ones passed to free at the end); the plain names are used for the arithmetic. */ + + +#if ( defined SSE || defined SSE2 || defined SSE3) + ss_ = calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + auxs_ = calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + aux2s_= calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + aux3s_= calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + ss = (spinor *)(((unsigned long int)(ss_)+ALIGN_BASE)&~ALIGN_BASE); + auxs = (spinor *)(((unsigned long 
int)(auxs_)+ALIGN_BASE)&~ALIGN_BASE); + aux2s = (spinor *)(((unsigned long int)(aux2s_)+ALIGN_BASE)&~ALIGN_BASE); + aux3s = (spinor *)(((unsigned long int)(aux3s_)+ALIGN_BASE)&~ALIGN_BASE); + sc_ = calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + auxc_ = calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + aux2c_= calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + aux3c_= calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + sc = (spinor *)(((unsigned long int)(sc_)+ALIGN_BASE)&~ALIGN_BASE); + auxc = (spinor *)(((unsigned long int)(auxc_)+ALIGN_BASE)&~ALIGN_BASE); + aux2c = (spinor *)(((unsigned long int)(aux2c_)+ALIGN_BASE)&~ALIGN_BASE); + aux3c = (spinor *)(((unsigned long int)(aux3c_)+ALIGN_BASE)&~ALIGN_BASE); +#else + ss = ss_ = calloc(VOLUMEPLUSRAND/2, sizeof(spinor)); /* also record the raw pointer: it is the one freed below */ + auxs = auxs_ = calloc(VOLUMEPLUSRAND/2, sizeof(spinor)); + aux2s = aux2s_ = calloc(VOLUMEPLUSRAND/2, sizeof(spinor)); + aux3s = aux3s_ = calloc(VOLUMEPLUSRAND/2, sizeof(spinor)); + sc = sc_ = calloc(VOLUMEPLUSRAND/2, sizeof(spinor)); + auxc = auxc_ = calloc(VOLUMEPLUSRAND/2, sizeof(spinor)); + aux2c = aux2c_ = calloc(VOLUMEPLUSRAND/2, sizeof(spinor)); + aux3c = aux3c_ = calloc(VOLUMEPLUSRAND/2, sizeof(spinor)); +#endif + + chebyshev_polynomial(cheb_evmin, cheb_evmax, dop_cheby_coef, N_CHEBYMAX, 0.25); + + temp=1.0; + random_spinor_field_eo(ss, repro, RN_GAUSS); + random_spinor_field_eo(sc, repro, RN_GAUSS); +/* assign(&sc[0], &ss[0],VOLUME/2); + + Qtm_pm_psi(&auxs[0], &ss[0]); + temp=square_norm(&auxs[0],VOLUME/2, 1); + printf("||auxs Carsten||=%e\n",temp); + + Qtm_dagger_ndpsi(&aux3s[0], &aux3c[0], &ss[0], &sc[0]); + Qtm_ndpsi(&auxs[0], &auxc[0], &aux3s[0], &aux3c[0]); + temp=square_norm(&auxs[0],VOLUME/2, 1); + printf("||auxs own||=%e\n",temp); + temp=square_norm(&auxc[0],VOLUME/2, 1); + printf("||auxc own||=%e\n",temp); */ + + +/* if(g_proc_id == g_stdio_proc) { + printf("\ndetermine the degree of the polynomial:\n"); + fflush(stdout); + } */ + + dop_n_cheby=(int)(5./sqrt(cheb_evmin)); /* same value as before; the parentheses make the truncation explicit */ + for(i = 0;i < 1 ; i++){ +/* printf("n_cheby=%d i=%d\n", dop_n_cheby, i); */ + + if (dop_n_cheby >= 
N_CHEBYMAX) { + if(g_proc_id == g_stdio_proc){ + printf("Error: n_cheby=%d > N_CHEBYMAX=%d\n",dop_n_cheby,N_CHEBYMAX); + printf("Increase n_chebymax\n"); + } +/* errorhandler(35,"degree_of_polynomial"); NOTE(review): execution continues after this message - confirm that QdaggerQ_power never reads beyond the N_CHEBYMAX coefficients */ + } + + QdaggerQ_power(&aux3s[0], &aux3c[0], dop_cheby_coef, dop_n_cheby, &ss[0], &sc[0]); + QdaggerQ_power(&auxs[0], &auxc[0], dop_cheby_coef, dop_n_cheby, &aux3s[0], &aux3c[0]); + QdaggerQ_power(&aux3s[0], &aux3c[0], dop_cheby_coef, dop_n_cheby, &auxs[0], &auxc[0]); + QdaggerQ_power(&auxs[0], &auxc[0], dop_cheby_coef, dop_n_cheby, &aux3s[0], &aux3c[0]); +/* temp=square_norm(&auxs[0],VOLUME/2, 1); + printf("||auxs||=%e\n",temp); + temp=square_norm(&auxc[0],VOLUME/2, 1); + printf("||auxc||=%e\n",temp); */ + + + Qtm_dagger_ndpsi(&aux2s[0], &aux2c[0], &ss[0], &sc[0]); + Qtm_ndpsi(&aux3s[0], &aux3c[0], &aux2s[0], &aux2c[0]); + +/* temp=square_norm(&aux3s[0],VOLUME/2, 1); + printf("||auxs_3||=%e\n",temp); + temp=square_norm(&aux3c[0],VOLUME/2, 1); + printf("||auxc_3||=%e\n",temp); */ + + diff(&auxs[0],&auxs[0],&aux3s[0],VOLUME/2); + temp=square_norm(&auxs[0],VOLUME/2, 1)/square_norm(&aux3s[0],VOLUME/2, 1)/4.0; + /* Both components are evaluated on every process; only the stdio process prints. */ + if(g_proc_id == g_stdio_proc) printf("difference=%e\n",temp); + diff(&auxc[0],&auxc[0],&aux3c[0],VOLUME/2); + temp=square_norm(&auxc[0],VOLUME/2, 1)/square_norm(&aux3c[0],VOLUME/2, 1)/4.0; + if(g_proc_id == g_stdio_proc) printf("difference=%e\n",temp); + /* Assuming square_norm(.., 1) returns the globally reduced norm, temp now agrees on all processes. */ + if(temp < stopeps ) break; + dop_n_cheby*=1.05; + } + + free(ss_); + free(auxs_); + free(aux2s_); + free(aux3s_); + free(sc_); + free(auxc_); + free(aux2c_); + free(aux3c_); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/chebyshev_polynomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/chebyshev_polynomial.h new file mode 100644 index 0000000000000000000000000000000000000000..71c85f17bfcee9e8859a008b5cb59614c0efcc99 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/chebyshev_polynomial.h @@ -0,0 +1,36 @@ +/*********************************************************************** + * + * Copyright (C) 
2006,2007,2008 Karl Jansen, Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _CHEBYSHEV_POLYNOMIAL_H +#define _CHEBYSHEV_POLYNOMIAL_H + +extern double cheb_evmin, cheb_evmax; +extern int dop_n_cheby; +extern double * dop_cheby_coef; + + +double func(double u, double exponent); +void chebyshev_polynomial(double a, double b, double c[], int n, double exponent); + +void QdaggerQ_power(spinor *R_s, spinor *R_c, double *c, int n, spinor *S_s, spinor *S_c); + +void degree_of_polynomial(const int repro); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/chebyshev_polynomial_nd.c b/qcd/part_cpu/applications/QCD/src/kernel_D/chebyshev_polynomial_nd.c new file mode 100644 index 0000000000000000000000000000000000000000..05b8a81a2a39a675b68d427c951d91aa4f415b1e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/chebyshev_polynomial_nd.c @@ -0,0 +1,183 @@ +/*********************************************************************** + * + * Copyright (C) 2006,2007,2008 Thomas Chiarappa, Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "linalg_eo.h" +#include "start.h" +#include "operator/tm_operators.h" +#include "operator/tm_operators_nd.h" +#include "phmc.h" +#include "Ptilde_nd.h" +#include "chebyshev_polynomial_nd.h" + + + +#define PI 3.141592653589793 +/* func: the function that is approximated, u raised to the given exponent */ +double func(double u, double exponent){ + return pow(u,exponent); +} + + +void chebyshev_coefs(double aa, double bb, double c[], int n, double exponent){ + int k,j; + double fac,bpa,bma,*f; + double inv_n; + + + inv_n=1./(double)n; + f=calloc(n,sizeof(double));/*vector(0,n-1);*/ + fflush(stdout); + bma=0.5*(bb-aa); + bpa=0.5*(bb+aa); + for (k=0;k=1; j--){ + sv = d; + d = z2*d - dd + c[j]; + dd = sv; + } + + res = z*d - dd + 0.5*c[0]; + + return(res); +} + +/************************************************************************** + * + * The externally accessible function is + * + * void degree_of_polynomial_nd(_degree_of_p, coefs, EVMin, EVMax, Qsq, repro) + * Allocates *coefs (not freed here) and fills it through chebyshev_coefs() + * with exponent -0.5 on [EVMin, EVMax]. *_degree_of_p is increased by one + * on return. For g_debug_level > 1 the accuracy of the polynomial is + * checked on random spinor fields and printed by the stdio process. + *****************************************************************************/ + + +void degree_of_polynomial_nd(int * _degree_of_p, double ** coefs, + const double EVMin, const double EVMax, + matrix_mult_nd Qsq, const int repro) { + double 
temp, temp2; + int degree_of_p = *_degree_of_p + 1; /* one more than the value passed in; this is the number of coefficients allocated below */ + + spinor *ss=NULL, *ss_=NULL, *sc=NULL, *sc_=NULL; + spinor *auxs=NULL, *auxs_=NULL, *auxc=NULL, *auxc_=NULL; + spinor *aux2s=NULL, *aux2s_=NULL, *aux2c=NULL, *aux2c_=NULL; + + *coefs = calloc(degree_of_p, sizeof(double)); + /* Raw allocations (trailing underscore) carry one extra spinor; the working pointers are derived from them with ALIGN_BASE - presumably an alignment mask, TODO confirm */ + ss_ = calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + auxs_ = calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + aux2s_= calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + sc_ = calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + auxc_ = calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + aux2c_= calloc(VOLUMEPLUSRAND/2+1, sizeof(spinor)); + + ss = (spinor *)(((unsigned long int)(ss_)+ALIGN_BASE)&~ALIGN_BASE); + auxs = (spinor *)(((unsigned long int)(auxs_)+ALIGN_BASE)&~ALIGN_BASE); + aux2s = (spinor *)(((unsigned long int)(aux2s_)+ALIGN_BASE)&~ALIGN_BASE); + sc = (spinor *)(((unsigned long int)(sc_)+ALIGN_BASE)&~ALIGN_BASE); + auxc = (spinor *)(((unsigned long int)(auxc_)+ALIGN_BASE)&~ALIGN_BASE); + aux2c = (spinor *)(((unsigned long int)(aux2c_)+ALIGN_BASE)&~ALIGN_BASE); + /* coefficients for the exponent -0.5 on the interval [EVMin, EVMax] */ + chebyshev_coefs(EVMin, EVMax, *coefs, degree_of_p, -0.5); + /* random test fields; they are only read inside the debug check below */ + random_spinor_field_eo(ss, repro, RN_GAUSS); + random_spinor_field_eo(sc, repro, RN_GAUSS); + + if((g_proc_id == g_stdio_proc) && (g_debug_level > 0)){ + printf("# NDPOLY MD Polynomial: EVmin = %e EVmax = %e \n", EVMin, EVMax); + printf("# NDPOLY MD Polynomial: the degree was set to: %d\n", degree_of_p); + fflush(stdout); + } + + if(g_debug_level > 1) { + /* Accuracy check: apply the polynomial, then Qsq, then the polynomial again, and compare with the input fields */ + Ptilde_ndpsi(&auxs[0], &auxc[0], *coefs, degree_of_p, &ss[0], &sc[0], Qsq); + Qsq(&aux2s[0], &aux2c[0], &auxs[0], &auxc[0]); + Ptilde_ndpsi(&auxs[0], &auxc[0], *coefs, degree_of_p, &aux2s[0], &aux2c[0], Qsq); + + diff(&aux2s[0],&auxs[0],&ss[0],VOLUME/2); + temp=square_norm(&aux2s[0],VOLUME/2, 1)/square_norm(&ss[0],VOLUME/2, 1)/4.0; + + diff(&aux2c[0],&auxc[0],&sc[0],VOLUME/2); + temp2 = square_norm(&aux2c[0],VOLUME/2, 1)/square_norm(&sc[0],VOLUME/2, 1)/4.0; + /* with g_epsbar equal to zero the second component is reported as 0 */ + if(g_epsbar == 0.){ + temp2 = 
0.0; + } + + if(g_proc_id == g_stdio_proc){ + /* this is || (P S P - 1)X ||^2 /|| 2X ||^2 */ + /* where X is a random spinor field */ + printf("# NDPOLY MD Polynomial: relative squared accuracy in components:\n# UP=%e DN=%e \n", temp, temp2); + fflush(stdout); + } + /* scalar check at the lower interval end: | P(EVMin) * EVMin * P(EVMin) - 1 | / 2 */ + temp = cheb_eval(degree_of_p, *coefs, EVMin); + temp *= EVMin; + temp *= cheb_eval(degree_of_p, *coefs, EVMin); + temp = 0.5*fabs(temp - 1); + if(g_proc_id == g_stdio_proc) { + printf("# NDPOLY MD Polynomial: Delta_IR at s=%f: | P s_low P - 1 |/2 = %e \n", EVMin, temp); + } + } + /* RECALL THAT WE NEED AN EVEN DEGREE !!!! */ + *_degree_of_p = degree_of_p; + /* release the raw allocations; *coefs is handed back to the caller */ + free(ss_); + free(auxs_); + free(aux2s_); + free(sc_); + free(auxc_); + free(aux2c_); + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/chebyshev_polynomial_nd.h b/qcd/part_cpu/applications/QCD/src/kernel_D/chebyshev_polynomial_nd.h new file mode 100644 index 0000000000000000000000000000000000000000..7eb0916633114208ee45307e1174d860e036ef47 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/chebyshev_polynomial_nd.h @@ -0,0 +1,34 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. 
+ ***********************************************************************/ +#ifndef _CHEBYSHEV_POLYNOMIAL_ND_H +#define _CHEBYSHEV_POLYNOMIAL_ND_H + +#include "solver/matrix_mult_typedef_nd.h" + +double func(double u, double exponent); + +void chebyshev_coefs(double a, double b, double c[], int n, double exponent); + +double cheb_eval(int M, double *c, double s); + +void degree_of_polynomial_nd(int * _degree_of_p, double ** coefs, + const double EVMin, const double EVMax, + matrix_mult_nd Qsq, const int repro); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/check_locallity.c b/qcd/part_cpu/applications/QCD/src/kernel_D/check_locallity.c new file mode 100644 index 0000000000000000000000000000000000000000..920a6b6decad6f5fa9c7d765bb287252a4bee267 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/check_locallity.c @@ -0,0 +1,313 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef BENCHMARK +#include <./c-lime/include/lime.h> +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#ifdef MPI +#include +#endif +#include "global.h" +#include "getopt.h" +#include "linalg_eo.h" +#include "geometry_eo.h" +#include "start.h" +#include "measure_gauge_action.h" +#ifdef MPI +#include "xchange/xchange.h" +#endif +#include "read_input.h" +#include "mpi_init.h" +#include "sighandler.h" +#include "boundary.h" +#include "solver/solver.h" +#include "init/init.h" +#include "smearing/stout.h" +#include "su3spinor.h" +#include "invert_eo.h" +#include "operator/D_psi.h" +#include "linalg/convert_eo_to_lexic.h" + + + +void usage(){ + fprintf(stdout, "Code for locallity check of the Dirac operator\n"); + fprintf(stdout, "Version %s \n\n", PACKAGE_VERSION); + fprintf(stdout, "Please send bug reports to %s\n", PACKAGE_BUGREPORT); + fprintf(stdout, "Usage: invert [options]\n"); + fprintf(stdout, "Options: [-f input-filename]\n"); + fprintf(stdout, " [-o output-filename]\n"); + fprintf(stdout, " [-h|-? 
this help]\n"); + exit(0); +} + +extern int nstore; + +int check_geometry(); +/* main: parses -f (input file) and -o (output base name), reads the input, initialises the fields and the geometry, and prints for every distance sum = X+Y+Z+tt the largest spinor norm found at that distance. */ +int main(int argc,char *argv[]) { + + FILE *parameterfile=NULL; + int c, j, is=0, ic=0; + int x, X, y, Y, z, Z, t, tt, i, sum; + char * filename = NULL; + char datafilename[50]; + char parameterfilename[50]; + char conf_filename[50]; + char * input_filename = NULL; + double plaquette_energy, nrm; + double * norm; + struct stout_parameters params_smear; + +#ifdef _GAUGE_COPY + int kb=0; +#endif +#ifdef MPI + double atime=0., etime=0.; +#endif +#ifdef _KOJAK_INST +#pragma pomp inst init +#pragma pomp inst begin(main) +#endif + + DUM_DERI = 6; + /* DUM_DERI + 2 is enough (not 7) */ + DUM_SOLVER = DUM_DERI+2; + DUM_MATRIX = DUM_SOLVER+6; + /* DUM_MATRIX + 2 is enough (not 6) */ + NO_OF_SPINORFIELDS = DUM_MATRIX+2; + + verbose = 0; + g_use_clover_flag = 0; + g_nr_of_psf = 1; + +#ifdef MPI + MPI_Init(&argc, &argv); +#endif + + while ((c = getopt(argc, argv, "h?f:o:")) != -1) { + switch (c) { + case 'f': + input_filename = calloc(strlen(optarg)+1, sizeof(char)); /* sized to the argument instead of a fixed 200 bytes */ + strcpy(input_filename,optarg); + break; + case 'o': + filename = calloc(strlen(optarg)+1, sizeof(char)); /* sized to the argument instead of a fixed 200 bytes */ + strcpy(filename,optarg); + break; + case 'h': + case '?': + default: + usage(); + break; + } + } + if(input_filename == NULL){ + input_filename = "hmc.input"; + } + if(filename == NULL){ + filename = "output"; + } + + /* Read the input file */ + read_input(input_filename); + /* here we want no even/odd preconditioning */ + even_odd_flag = 0; + + /* this DBW2 stuff is not needed for the inversion ! */ + g_rgi_C1 = 0; + if(Nsave == 0){ + Nsave = 1; + } + tmlqcd_mpi_init(argc, argv); + + g_dbw2rand = 0; + +#ifndef MPI + g_dbw2rand = 0; +#endif + +#ifdef _GAUGE_COPY + j = init_gauge_field(VOLUMEPLUSRAND, 1); +#else + j = init_gauge_field(VOLUMEPLUSRAND, 0); +#endif + if ( j!= 0) { + fprintf(stderr, "Not enough memory for gauge_fields! 
Aborting...\n"); + exit(-1); + } + j = init_geometry_indices(VOLUMEPLUSRAND); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for geometry indices! Aborting...\n"); + exit(-1); + } + if(even_odd_flag) { + j = init_spinor_field(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS); + } + else { + j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS); + } + if ( j!= 0) { + fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n"); + exit(-1); + } + + g_mu = g_mu1; + if(g_proc_id == 0){ + /* construct the filenames for the observables and the parameters; snprintf truncates instead of overrunning the 50-byte buffers */ + snprintf(datafilename, sizeof(datafilename), "%s.data", filename); + snprintf(parameterfilename, sizeof(parameterfilename), "%s.para", filename); + + parameterfile=fopen(parameterfilename, "w"); + write_first_messages(parameterfile, "check_locality", "NA"); + } + + /* define the geometry */ + geometry(); + + /* define the boundary conditions for the fermion fields */ + boundary(); + +#ifdef _USE_HALFSPINOR + j = init_dirac_halfspinor(); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for halffield! Aborting...\n"); + exit(-1); + } + if(g_sloppy_precision_flag == 1) { + j = init_dirac_halfspinor32(); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for 32-Bit halffield! 
Aborting...\n"); + exit(-1); + } + } +# if (defined _PERSISTENT) + init_xchange_halffield(); +# endif +#endif + norm = (double*)calloc(LX/2+LY/2+LZ/2+T/2+1, sizeof(double)); /* one slot per distance sum = X+Y+Z+tt, including the largest value LX/2+LY/2+LZ/2+T/2 */ + + for(j=0;j LX/2) X = LX-x; + else X = x; + for(y = 0; y < LY; y++){ + if(y > LY/2) Y = LY-y; + else Y = y; + for(z = 0; z < LZ; z++){ + if(z > LZ/2) Z = LZ-z; + else Z = z; + for(t = 0; t < T; t++){ + if(t > T/2) tt = T - t; + else tt = t; + sum = X + Y + Z + tt; + _spinor_norm_sq(nrm, g_spinor_field[DUM_DERI+1][ g_ipt[t][x][y][z] ]); +/* _spinor_norm_sq(nrm, qprop[0][0][1][ g_ipt[t][x][y][z] ]); */ + printf("%e %e\n", creal(g_spinor_field[DUM_DERI+1][ g_ipt[t][x][y][z] ].s0.c0), cimag(g_spinor_field[DUM_DERI+1][ g_ipt[t][x][y][z] ].s0.c0)); + nrm = sqrt( nrm ); + printf("%1.12e\n", nrm); + if(nrm > norm[sum]) norm[sum] = nrm; + } + } + } + } + + for(i = 0; i <= LX/2+LY/2+LZ/2+T/2; i++){ + printf("%d %1.12e\n", i, norm[i]); + } + printf("\n"); + + nstore+=Nsave; + } + +#ifdef MPI + MPI_Finalize(); +#endif + free_gauge_field(); + free_geometry_indices(); + free_spinor_field(); + free_moment_field(); + return(0); +#ifdef _KOJAK_INST +#pragma pomp inst end(main) +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/clenshaw_coef.c b/qcd/part_cpu/applications/QCD/src/kernel_D/clenshaw_coef.c new file mode 100644 index 0000000000000000000000000000000000000000..85441ea5f8e74f646b98bec5dcd6fa45c774cf40 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/clenshaw_coef.c @@ -0,0 +1,278 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#include +#include +#include +#include "phmc.h" +#include "clenshaw_coef.h" +#define Pi 3.141592653589793 + +extern long double c[3000]; + +extern double a, b; + +long double D[300]; + +void clenscoef(int M){ + + long long int j, jmax, k, N2, N, ij, i, imax; + + long double s[500][500]; + long double l[500][500]; + long double sgn; + long double sum; + long double snew, sold, lnew, lold; + + long double jj; + + long long int j2; + long double A, B, A2, B2; + + + FILE *coeroot; + char *filename_stub7 = "Cheby_coeff_for_roots_"; + char *filename7; + char buf7[100]; + + + FILE *factors; + char *filename_stub9 = "Pre-factors_"; + char *filename9; + char buf9[100]; + + + filename7=buf7; + sprintf(filename7,"%s%d.dat", filename_stub7,M); + + filename9=buf9; + sprintf(filename9,"%s%d.dat", filename_stub9,M); + + coeroot = fopen(filename7,"w"); + fprintf(coeroot,"### Chebishev coeff. in ascending order (pow. coef.) \n"); + /* fprintf(coeroot,"### power j coeff. 
\n"); */ + fclose(coeroot); + + + N = (long long int)(M - 1); + N2 = (long long int)(N/2); + + A = (long double)(2./(long double)(b-a)); + B = (long double)((b+a)/(long double)(b-a)); + A2 = (long double)(2*A); + B2 = (long double)(2*B); + + /* Initialisation */ + for(k=0; k=1; j--){ + + sgn = -1.0; + + j2=2*j; + + ij = (long long int)((j+1)/2) - (long long int)(j/2); + /* + printf(" ij=%lld \n", ij); + printf(" j=%lld j2=%lld \n", j, j2); + */ + if(ij == 0) sgn = -sgn; + /* + printf(" sgn=%llf \n", sgn); + printf(" C=%llf \n", c[j2]); + */ + + D[0]+= (long double)(c[j2]*sgn); + /* + printf(" D=%llf \n", D[0]); + */ + } + + + D[0] = (long double)(D[0] + 0.5*c[0]); + /* + printf(" Pre final D=%llf \n", D[0]); + */ + + /* + printf(" D0 = %llf \n", D[0]); + */ + + /* Evaluate first the coefficient of x^0 */ + for(i=1; i 1) sum = (long double)(sum*powl(B2,(i-1))); + + + D[0] = (long double)(sum + D[0]); + /* + printf("At i=%lld Sum=%llf D=%llf D=%20.18lle\n", i, sum, D[0], D[0]); + */ + } + + + + + /* Evaluate the Block of coefficients [1, N-1] */ + + for(k=1; k 1 LOOP over inner loop */ + for(i=1; i<=imax; i++){ + + sgn = 1.0; + ij = (long long int)(i/2) - (long long int)((i-1)/2); + if(ij == 0) sgn = -sgn; + sum = 0.0; + jmax = (long long int)((N-k+3-i)/2); + + /* printf(" \n At i=%d ij=%d jmax=%d \n", i, ij, jmax); */ + for(j=1; j<=jmax; j++){ + + j2 = k + 2*j + i - 3; + sgn = -sgn; + /* + printf("At k=%d i=%d jmax=%d j=%d j2=%d \n", k, i, jmax, j, j2); + */ + sum += (long double)(c[j2]*sgn*s[k+i-1][j]); + /* + printf("s=%d sgn=%llf sum=%llf \n", s[k+i-1][j], sgn, sum); + */ + } + + /* printf(" At k=%d and i=%d Value is %d \n", k, i, l[k][i]); */ + /* D[k] += sum * l[k][i]; */ + /* printf(" At degree %d The value is %12.10e \n", k,D[k]); */ + + sum = (long double)(sum*l[k][i]*powl(B2,(i-1))); + + D[k] = (long double)((sum + D[k])); + /* + printf(" At k=%d i=%d, l=%d sum=%llf D=%llf \n", k,i,l[k][i], sum, D[k]); + */ + } + D[k] = (long double)(D[k]*powl(A2,k)/2); + } 
+ + /* And finally the highest degree coefficient k=N */ + + D[N] = (long double)(powl(A2,(N-1))*A*c[N]); + + /* If normalisation is required */ + /* + for(k=0; k. + ***********************************************************************/ +#ifndef _CLENSHAW_COEF_H +#define _CLENSHAW_COEF_H + +void clenscoef(int M); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/config.guess b/qcd/part_cpu/applications/QCD/src/kernel_D/config.guess new file mode 100755 index 0000000000000000000000000000000000000000..c08cebd35dd2f5d4dbead5cb578bc1dacb7d1a76 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/config.guess @@ -0,0 +1,1507 @@ +#! /bin/sh +# Attempt to guess a canonical system name. +# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, +# 2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation, +# Inc. + +timestamp='2006-11-30' + +# This file is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA +# 02110-1301, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + + +# Originally written by Per Bothner . +# Please send patches to . 
Submit a context +# diff and a properly formatted ChangeLog entry. +# +# This script attempts to guess a canonical system name similar to +# config.sub. If it succeeds, it prints the system name on stdout, and +# exits with 0. Otherwise, it exits with 1. +# +# The plan is that this can be called by configure scripts if you +# don't specify an explicit build system type. + +me=`echo "$0" | sed -e 's,.*/,,'` + +usage="\ +Usage: $0 [OPTION] + +Output the configuration name of the system \`$me' is run on. + +Operation modes: + -h, --help print this help, then exit + -t, --time-stamp print date of last modification, then exit + -v, --version print version number, then exit + +Report bugs and patches to ." + +version="\ +GNU config.guess ($timestamp) + +Originally written by Per Bothner. +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005 +Free Software Foundation, Inc. + +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." + +help=" +Try \`$me --help' for more information." + +# Parse command line +while test $# -gt 0 ; do + case $1 in + --time-stamp | --time* | -t ) + echo "$timestamp" ; exit ;; + --version | -v ) + echo "$version" ; exit ;; + --help | --h* | -h ) + echo "$usage"; exit ;; + -- ) # Stop option processing + shift; break ;; + - ) # Use stdin as input. + break ;; + -* ) + echo "$me: invalid option $1$help" >&2 + exit 1 ;; + * ) + break ;; + esac +done + +if test $# != 0; then + echo "$me: too many arguments$help" >&2 + exit 1 +fi + +trap 'exit 1' 1 2 15 + +# CC_FOR_BUILD -- compiler used by this script. Note that the use of a +# compiler to aid in system detection is discouraged as it requires +# temporary files to be created and, as you can see below, it is a +# headache to deal with in a portable fashion. + +# Historically, `CC_FOR_BUILD' used to be named `HOST_CC'. 
We still +# use `HOST_CC' if defined, but it is deprecated. + +# Portable tmp directory creation inspired by the Autoconf team. + +set_cc_for_build=' +trap "exitcode=\$?; (rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null) && exit \$exitcode" 0 ; +trap "rm -f \$tmpfiles 2>/dev/null; rmdir \$tmp 2>/dev/null; exit 1" 1 2 13 15 ; +: ${TMPDIR=/tmp} ; + { tmp=`(umask 077 && mktemp -d "$TMPDIR/cgXXXXXX") 2>/dev/null` && test -n "$tmp" && test -d "$tmp" ; } || + { test -n "$RANDOM" && tmp=$TMPDIR/cg$$-$RANDOM && (umask 077 && mkdir $tmp) ; } || + { tmp=$TMPDIR/cg-$$ && (umask 077 && mkdir $tmp) && echo "Warning: creating insecure temp directory" >&2 ; } || + { echo "$me: cannot create a temporary directory in $TMPDIR" >&2 ; exit 1 ; } ; +dummy=$tmp/dummy ; +tmpfiles="$dummy.c $dummy.o $dummy.rel $dummy" ; +case $CC_FOR_BUILD,$HOST_CC,$CC in + ,,) echo "int x;" > $dummy.c ; + for c in cc gcc c89 c99 ; do + if ($c -c -o $dummy.o $dummy.c) >/dev/null 2>&1 ; then + CC_FOR_BUILD="$c"; break ; + fi ; + done ; + if test x"$CC_FOR_BUILD" = x ; then + CC_FOR_BUILD=no_compiler_found ; + fi + ;; + ,,*) CC_FOR_BUILD=$CC ;; + ,*,*) CC_FOR_BUILD=$HOST_CC ;; +esac ; set_cc_for_build= ;' + +# This is needed to find uname on a Pyramid OSx when run in the BSD universe. +# (ghazi@noc.rutgers.edu 1994-08-24) +if (test -f /.attbin/uname) >/dev/null 2>&1 ; then + PATH=$PATH:/.attbin ; export PATH +fi + +UNAME_MACHINE=`(uname -m) 2>/dev/null` || UNAME_MACHINE=unknown +UNAME_RELEASE=`(uname -r) 2>/dev/null` || UNAME_RELEASE=unknown +UNAME_SYSTEM=`(uname -s) 2>/dev/null` || UNAME_SYSTEM=unknown +UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown + +# Note: order is significant - the case branches are not exclusive. + +case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in + *:NetBSD:*:*) + # NetBSD (nbsd) targets should (where applicable) match one or + # more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*, + # *-*-netbsdecoff* and *-*-netbsd*. 
For targets that recently + # switched to ELF, *-*-netbsd* would select the old + # object file format. This provides both forward + # compatibility and a consistent mechanism for selecting the + # object file format. + # + # Note: NetBSD doesn't particularly care about the vendor + # portion of the name. We always set it to "unknown". + sysctl="sysctl -n hw.machine_arch" + UNAME_MACHINE_ARCH=`(/sbin/$sysctl 2>/dev/null || \ + /usr/sbin/$sysctl 2>/dev/null || echo unknown)` + case "${UNAME_MACHINE_ARCH}" in + armeb) machine=armeb-unknown ;; + arm*) machine=arm-unknown ;; + sh3el) machine=shl-unknown ;; + sh3eb) machine=sh-unknown ;; + sh5el) machine=sh5le-unknown ;; + *) machine=${UNAME_MACHINE_ARCH}-unknown ;; + esac + # The Operating System including object format, if it has switched + # to ELF recently, or will in the future. + case "${UNAME_MACHINE_ARCH}" in + arm*|i386|m68k|ns32k|sh3*|sparc|vax) + eval $set_cc_for_build + if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep __ELF__ >/dev/null + then + # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). + # Return netbsd for either. FIX? + os=netbsd + else + os=netbsdelf + fi + ;; + *) + os=netbsd + ;; + esac + # The OS release + # Debian GNU/NetBSD machines have a different userland, and + # thus, need a distinct triplet. However, they do not need + # kernel version information, so it can be replaced with a + # suitable tag, in the style of linux-gnu. + case "${UNAME_VERSION}" in + Debian*) + release='-gnu' + ;; + *) + release=`echo ${UNAME_RELEASE}|sed -e 's/[-_].*/\./'` + ;; + esac + # Since CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM: + # contains redundant information, the shorter form: + # CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM is used. 
+ echo "${machine}-${os}${release}" + exit ;; + *:OpenBSD:*:*) + UNAME_MACHINE_ARCH=`arch | sed 's/OpenBSD.//'` + echo ${UNAME_MACHINE_ARCH}-unknown-openbsd${UNAME_RELEASE} + exit ;; + *:ekkoBSD:*:*) + echo ${UNAME_MACHINE}-unknown-ekkobsd${UNAME_RELEASE} + exit ;; + *:SolidBSD:*:*) + echo ${UNAME_MACHINE}-unknown-solidbsd${UNAME_RELEASE} + exit ;; + macppc:MirBSD:*:*) + echo powerpc-unknown-mirbsd${UNAME_RELEASE} + exit ;; + *:MirBSD:*:*) + echo ${UNAME_MACHINE}-unknown-mirbsd${UNAME_RELEASE} + exit ;; + alpha:OSF1:*:*) + case $UNAME_RELEASE in + *4.0) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` + ;; + *5.*) + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` + ;; + esac + # According to Compaq, /usr/sbin/psrinfo has been available on + # OSF/1 and Tru64 systems produced since 1995. I hope that + # covers most systems running today. This code pipes the CPU + # types through head -n 1, so we only detect the type of CPU 0. + ALPHA_CPU_TYPE=`/usr/sbin/psrinfo -v | sed -n -e 's/^ The alpha \(.*\) processor.*$/\1/p' | head -n 1` + case "$ALPHA_CPU_TYPE" in + "EV4 (21064)") + UNAME_MACHINE="alpha" ;; + "EV4.5 (21064)") + UNAME_MACHINE="alpha" ;; + "LCA4 (21066/21068)") + UNAME_MACHINE="alpha" ;; + "EV5 (21164)") + UNAME_MACHINE="alphaev5" ;; + "EV5.6 (21164A)") + UNAME_MACHINE="alphaev56" ;; + "EV5.6 (21164PC)") + UNAME_MACHINE="alphapca56" ;; + "EV5.7 (21164PC)") + UNAME_MACHINE="alphapca57" ;; + "EV6 (21264)") + UNAME_MACHINE="alphaev6" ;; + "EV6.7 (21264A)") + UNAME_MACHINE="alphaev67" ;; + "EV6.8CB (21264C)") + UNAME_MACHINE="alphaev68" ;; + "EV6.8AL (21264B)") + UNAME_MACHINE="alphaev68" ;; + "EV6.8CX (21264D)") + UNAME_MACHINE="alphaev68" ;; + "EV6.9A (21264/EV69A)") + UNAME_MACHINE="alphaev69" ;; + "EV7 (21364)") + UNAME_MACHINE="alphaev7" ;; + "EV7.9 (21364A)") + UNAME_MACHINE="alphaev79" ;; + esac + # A Pn.n version is a patched version. + # A Vn.n version is a released version. + # A Tn.n version is a released field test version. 
+ # A Xn.n version is an unreleased experimental baselevel. + # 1.2 uses "1.2" for uname -r. + echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + exit ;; + Alpha\ *:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # Should we change UNAME_MACHINE based on the output of uname instead + # of the specific Alpha model? + echo alpha-pc-interix + exit ;; + 21064:Windows_NT:50:3) + echo alpha-dec-winnt3.5 + exit ;; + Amiga*:UNIX_System_V:4.0:*) + echo m68k-unknown-sysv4 + exit ;; + *:[Aa]miga[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-amigaos + exit ;; + *:[Mm]orph[Oo][Ss]:*:*) + echo ${UNAME_MACHINE}-unknown-morphos + exit ;; + *:OS/390:*:*) + echo i370-ibm-openedition + exit ;; + *:z/VM:*:*) + echo s390-ibm-zvmoe + exit ;; + *:OS400:*:*) + echo powerpc-ibm-os400 + exit ;; + arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) + echo arm-acorn-riscix${UNAME_RELEASE} + exit ;; + arm:riscos:*:*|arm:RISCOS:*:*) + echo arm-unknown-riscos + exit ;; + SR2?01:HI-UX/MPP:*:* | SR8000:HI-UX/MPP:*:*) + echo hppa1.1-hitachi-hiuxmpp + exit ;; + Pyramid*:OSx*:*:* | MIS*:OSx*:*:* | MIS*:SMP_DC-OSx*:*:*) + # akee@wpdis03.wpafb.af.mil (Earle F. Ake) contributed MIS and NILE. 
+ if test "`(/bin/universe) 2>/dev/null`" = att ; then + echo pyramid-pyramid-sysv3 + else + echo pyramid-pyramid-bsd + fi + exit ;; + NILE*:*:*:dcosx) + echo pyramid-pyramid-svr4 + exit ;; + DRS?6000:unix:4.0:6*) + echo sparc-icl-nx6 + exit ;; + DRS?6000:UNIX_SV:4.2*:7* | DRS?6000:isis:4.2*:7*) + case `/usr/bin/uname -p` in + sparc) echo sparc-icl-nx7; exit ;; + esac ;; + sun4H:SunOS:5.*:*) + echo sparc-hal-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) + echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + i86pc:SunOS:5.*:*) + echo i386-pc-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:6*:*) + # According to config.sub, this is the proper way to canonicalize + # SunOS6. Hard to guess exactly what SunOS6 will be like, but + # it's likely to be more like Solaris than SunOS4. + echo sparc-sun-solaris3`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + sun4*:SunOS:*:*) + case "`/usr/bin/arch -k`" in + Series*|S4*) + UNAME_RELEASE=`uname -v` + ;; + esac + # Japanese Language versions have a version number like `4.1.3-JL'. + echo sparc-sun-sunos`echo ${UNAME_RELEASE}|sed -e 's/-/_/'` + exit ;; + sun3*:SunOS:*:*) + echo m68k-sun-sunos${UNAME_RELEASE} + exit ;; + sun*:*:4.2BSD:*) + UNAME_RELEASE=`(sed 1q /etc/motd | awk '{print substr($5,1,3)}') 2>/dev/null` + test "x${UNAME_RELEASE}" = "x" && UNAME_RELEASE=3 + case "`/bin/arch`" in + sun3) + echo m68k-sun-sunos${UNAME_RELEASE} + ;; + sun4) + echo sparc-sun-sunos${UNAME_RELEASE} + ;; + esac + exit ;; + aushp:SunOS:*:*) + echo sparc-auspex-sunos${UNAME_RELEASE} + exit ;; + # The situation for MiNT is a little confusing. The machine name + # can be virtually everything (everything which is not + # "atarist" or "atariste" at least should have a processor + # > m68000). The system name ranges from "MiNT" over "FreeMiNT" + # to the lowercase version "mint" (or "freemint"). 
Finally + # the system name "TOS" denotes a system which is actually not + # MiNT. But MiNT is downward compatible to TOS, so this should + # be no problem. + atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) + echo m68k-atari-mint${UNAME_RELEASE} + exit ;; + milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) + echo m68k-milan-mint${UNAME_RELEASE} + exit ;; + hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) + echo m68k-hades-mint${UNAME_RELEASE} + exit ;; + *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) + echo m68k-unknown-mint${UNAME_RELEASE} + exit ;; + m68k:machten:*:*) + echo m68k-apple-machten${UNAME_RELEASE} + exit ;; + powerpc:machten:*:*) + echo powerpc-apple-machten${UNAME_RELEASE} + exit ;; + RISC*:Mach:*:*) + echo mips-dec-mach_bsd4.3 + exit ;; + RISC*:ULTRIX:*:*) + echo mips-dec-ultrix${UNAME_RELEASE} + exit ;; + VAX*:ULTRIX*:*:*) + echo vax-dec-ultrix${UNAME_RELEASE} + exit ;; + 2020:CLIX:*:* | 2430:CLIX:*:*) + echo clipper-intergraph-clix${UNAME_RELEASE} + exit ;; + mips:*:*:UMIPS | mips:*:*:RISCos) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c +#ifdef __cplusplus +#include /* for printf() prototype */ + int main (int argc, char *argv[]) { +#else + int main (argc, argv) int argc; char *argv[]; { +#endif + #if defined (host_mips) && defined (MIPSEB) + #if defined (SYSTYPE_SYSV) + printf ("mips-mips-riscos%ssysv\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_SVR4) + printf ("mips-mips-riscos%ssvr4\n", argv[1]); exit (0); + #endif + #if defined (SYSTYPE_BSD43) || defined(SYSTYPE_BSD) + printf ("mips-mips-riscos%sbsd\n", argv[1]); exit (0); + #endif + #endif + exit (-1); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && + dummyarg=`echo "${UNAME_RELEASE}" | sed -n 's/\([0-9]*\).*/\1/p'` && + 
SYSTEM_NAME=`$dummy $dummyarg` && + { echo "$SYSTEM_NAME"; exit; } + echo mips-mips-riscos${UNAME_RELEASE} + exit ;; + Motorola:PowerMAX_OS:*:*) + echo powerpc-motorola-powermax + exit ;; + Motorola:*:4.3:PL8-*) + echo powerpc-harris-powermax + exit ;; + Night_Hawk:*:*:PowerMAX_OS | Synergy:PowerMAX_OS:*:*) + echo powerpc-harris-powermax + exit ;; + Night_Hawk:Power_UNIX:*:*) + echo powerpc-harris-powerunix + exit ;; + m88k:CX/UX:7*:*) + echo m88k-harris-cxux7 + exit ;; + m88k:*:4*:R4*) + echo m88k-motorola-sysv4 + exit ;; + m88k:*:3*:R3*) + echo m88k-motorola-sysv3 + exit ;; + AViiON:dgux:*:*) + # DG/UX returns AViiON for all architectures + UNAME_PROCESSOR=`/usr/bin/uname -p` + if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] + then + if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ + [ ${TARGET_BINARY_INTERFACE}x = x ] + then + echo m88k-dg-dgux${UNAME_RELEASE} + else + echo m88k-dg-dguxbcs${UNAME_RELEASE} + fi + else + echo i586-dg-dgux${UNAME_RELEASE} + fi + exit ;; + M88*:DolphinOS:*:*) # DolphinOS (SVR3) + echo m88k-dolphin-sysv3 + exit ;; + M88*:*:R3*:*) + # Delta 88k system running SVR3 + echo m88k-motorola-sysv3 + exit ;; + XD88*:*:*:*) # Tektronix XD88 system running UTekV (SVR3) + echo m88k-tektronix-sysv3 + exit ;; + Tek43[0-9][0-9]:UTek:*:*) # Tektronix 4300 system running UTek (BSD) + echo m68k-tektronix-bsd + exit ;; + *:IRIX*:*:*) + echo mips-sgi-irix`echo ${UNAME_RELEASE}|sed -e 's/-/_/g'` + exit ;; + ????????:AIX?:[12].1:2) # AIX 2.2.1 or AIX 2.1.1 is RT/PC AIX. 
+ echo romp-ibm-aix # uname -m gives an 8 hex-code CPU id + exit ;; # Note that: echo "'`uname -s`'" gives 'AIX ' + i*86:AIX:*:*) + echo i386-ibm-aix + exit ;; + ia64:AIX:*:*) + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${UNAME_MACHINE}-ibm-aix${IBM_REV} + exit ;; + *:AIX:2:3) + if grep bos325 /usr/include/stdio.h >/dev/null 2>&1; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include + + main() + { + if (!__power_pc()) + exit(1); + puts("powerpc-ibm-aix3.2.5"); + exit(0); + } +EOF + if $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` + then + echo "$SYSTEM_NAME" + else + echo rs6000-ibm-aix3.2.5 + fi + elif grep bos324 /usr/include/stdio.h >/dev/null 2>&1; then + echo rs6000-ibm-aix3.2.4 + else + echo rs6000-ibm-aix3.2 + fi + exit ;; + *:AIX:*:[45]) + IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` + if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then + IBM_ARCH=rs6000 + else + IBM_ARCH=powerpc + fi + if [ -x /usr/bin/oslevel ] ; then + IBM_REV=`/usr/bin/oslevel` + else + IBM_REV=${UNAME_VERSION}.${UNAME_RELEASE} + fi + echo ${IBM_ARCH}-ibm-aix${IBM_REV} + exit ;; + *:AIX:*:*) + echo rs6000-ibm-aix + exit ;; + ibmrt:4.4BSD:*|romp-ibm:BSD:*) + echo romp-ibm-bsd4.4 + exit ;; + ibmrt:*BSD:*|romp-ibm:BSD:*) # covers RT/PC BSD and + echo romp-ibm-bsd${UNAME_RELEASE} # 4.3 with uname added to + exit ;; # report: romp-ibm BSD 4.3 + *:BOSX:*:*) + echo rs6000-bull-bosx + exit ;; + DPX/2?00:B.O.S.:*:*) + echo m68k-bull-sysv3 + exit ;; + 9000/[34]??:4.3bsd:1.*:*) + echo m68k-hp-bsd + exit ;; + hp300:4.4BSD:*:* | 9000/[34]??:4.3bsd:2.*:*) + echo m68k-hp-bsd4.4 + exit ;; + 9000/[34678]??:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + case "${UNAME_MACHINE}" in + 9000/31? ) HP_ARCH=m68000 ;; + 9000/[34]?? 
) HP_ARCH=m68k ;; + 9000/[678][0-9][0-9]) + if [ -x /usr/bin/getconf ]; then + sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` + sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` + case "${sc_cpu_version}" in + 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 + 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 + 532) # CPU_PA_RISC2_0 + case "${sc_kernel_bits}" in + 32) HP_ARCH="hppa2.0n" ;; + 64) HP_ARCH="hppa2.0w" ;; + '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 + esac ;; + esac + fi + if [ "${HP_ARCH}" = "" ]; then + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + + #define _HPUX_SOURCE + #include + #include + + int main () + { + #if defined(_SC_KERNEL_BITS) + long bits = sysconf(_SC_KERNEL_BITS); + #endif + long cpu = sysconf (_SC_CPU_VERSION); + + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1"); break; + case CPU_PA_RISC2_0: + #if defined(_SC_KERNEL_BITS) + switch (bits) + { + case 64: puts ("hppa2.0w"); break; + case 32: puts ("hppa2.0n"); break; + default: puts ("hppa2.0"); break; + } break; + #else /* !defined(_SC_KERNEL_BITS) */ + puts ("hppa2.0"); break; + #endif + default: puts ("hppa1.0"); break; + } + exit (0); + } +EOF + (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` + test -z "$HP_ARCH" && HP_ARCH=hppa + fi ;; + esac + if [ ${HP_ARCH} = "hppa2.0w" ] + then + eval $set_cc_for_build + + # hppa2.0w-hp-hpux* has a 64-bit kernel and a compiler generating + # 32-bit code. hppa64-hp-hpux* has the same kernel and a compiler + # generating 64-bit code. 
GNU and HP use different nomenclature: + # + # $ CC_FOR_BUILD=cc ./config.guess + # => hppa2.0w-hp-hpux11.23 + # $ CC_FOR_BUILD="cc +DA2.0w" ./config.guess + # => hppa64-hp-hpux11.23 + + if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | + grep __LP64__ >/dev/null + then + HP_ARCH="hppa2.0w" + else + HP_ARCH="hppa64" + fi + fi + echo ${HP_ARCH}-hp-hpux${HPUX_REV} + exit ;; + ia64:HP-UX:*:*) + HPUX_REV=`echo ${UNAME_RELEASE}|sed -e 's/[^.]*.[0B]*//'` + echo ia64-hp-hpux${HPUX_REV} + exit ;; + 3050*:HI-UX:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include + int + main () + { + long cpu = sysconf (_SC_CPU_VERSION); + /* The order matters, because CPU_IS_HP_MC68K erroneously returns + true for CPU_PA_RISC1_0. CPU_IS_PA_RISC returns correct + results, however. */ + if (CPU_IS_PA_RISC (cpu)) + { + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0-hitachi-hiuxwe2"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1-hitachi-hiuxwe2"); break; + case CPU_PA_RISC2_0: puts ("hppa2.0-hitachi-hiuxwe2"); break; + default: puts ("hppa-hitachi-hiuxwe2"); break; + } + } + else if (CPU_IS_HP_MC68K (cpu)) + puts ("m68k-hitachi-hiuxwe2"); + else puts ("unknown-hitachi-hiuxwe2"); + exit (0); + } +EOF + $CC_FOR_BUILD -o $dummy $dummy.c && SYSTEM_NAME=`$dummy` && + { echo "$SYSTEM_NAME"; exit; } + echo unknown-hitachi-hiuxwe2 + exit ;; + 9000/7??:4.3bsd:*:* | 9000/8?[79]:4.3bsd:*:* ) + echo hppa1.1-hp-bsd + exit ;; + 9000/8??:4.3bsd:*:*) + echo hppa1.0-hp-bsd + exit ;; + *9??*:MPE/iX:*:* | *3000*:MPE/iX:*:*) + echo hppa1.0-hp-mpeix + exit ;; + hp7??:OSF1:*:* | hp8?[79]:OSF1:*:* ) + echo hppa1.1-hp-osf + exit ;; + hp8??:OSF1:*:*) + echo hppa1.0-hp-osf + exit ;; + i*86:OSF1:*:*) + if [ -x /usr/sbin/sysversion ] ; then + echo ${UNAME_MACHINE}-unknown-osf1mk + else + echo ${UNAME_MACHINE}-unknown-osf1 + fi + exit ;; + parisc*:Lites*:*:*) + echo hppa1.1-hp-lites + exit ;; + C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) + echo c1-convex-bsd + exit ;; + 
C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit ;; + C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) + echo c34-convex-bsd + exit ;; + C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) + echo c38-convex-bsd + exit ;; + C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) + echo c4-convex-bsd + exit ;; + CRAY*Y-MP:*:*:*) + echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*[A-Z]90:*:*:*) + echo ${UNAME_MACHINE}-cray-unicos${UNAME_RELEASE} \ + | sed -e 's/CRAY.*\([A-Z]90\)/\1/' \ + -e y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/ \ + -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*TS:*:*:*) + echo t90-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*T3E:*:*:*) + echo alphaev5-cray-unicosmk${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + CRAY*SV1:*:*:*) + echo sv1-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + *:UNICOS/mp:*:*) + echo craynv-cray-unicosmp${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' + exit ;; + F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) + FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` + echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit ;; + 5000:UNIX_System_V:4.*:*) + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` + echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit ;; + i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) + echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} + exit ;; + sparc*:BSD/OS:*:*) + echo sparc-unknown-bsdi${UNAME_RELEASE} + exit ;; + *:BSD/OS:*:*) + echo 
${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} + exit ;; + *:FreeBSD:*:*) + case ${UNAME_MACHINE} in + pc98) + echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + amd64) + echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + *) + echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + esac + exit ;; + i*:CYGWIN*:*) + echo ${UNAME_MACHINE}-pc-cygwin + exit ;; + i*:MINGW*:*) + echo ${UNAME_MACHINE}-pc-mingw32 + exit ;; + i*:windows32*:*) + # uname -m includes "-pc" on this system. + echo ${UNAME_MACHINE}-mingw32 + exit ;; + i*:PW*:*) + echo ${UNAME_MACHINE}-pc-pw32 + exit ;; + x86:Interix*:[3456]*) + echo i586-pc-interix${UNAME_RELEASE} + exit ;; + EM64T:Interix*:[3456]* | authenticamd:Interix*:[3456]*) + echo x86_64-unknown-interix${UNAME_RELEASE} + exit ;; + [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) + echo i${UNAME_MACHINE}-pc-mks + exit ;; + i*:Windows_NT*:* | Pentium*:Windows_NT*:*) + # How do we know it's Interix rather than the generic POSIX subsystem? + # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we + # UNAME_MACHINE based on the output of uname instead of i386? 
+ echo i586-pc-interix + exit ;; + i*:UWIN*:*) + echo ${UNAME_MACHINE}-pc-uwin + exit ;; + amd64:CYGWIN*:*:* | x86_64:CYGWIN*:*:*) + echo x86_64-unknown-cygwin + exit ;; + p*:CYGWIN*:*) + echo powerpcle-unknown-cygwin + exit ;; + prep*:SunOS:5.*:*) + echo powerpcle-unknown-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` + exit ;; + *:GNU:*:*) + # the GNU system + echo `echo ${UNAME_MACHINE}|sed -e 's,[-/].*$,,'`-unknown-gnu`echo ${UNAME_RELEASE}|sed -e 's,/.*$,,'` + exit ;; + *:GNU/*:*:*) + # other systems with GNU libc and userland + echo ${UNAME_MACHINE}-unknown-`echo ${UNAME_SYSTEM} | sed 's,^[^/]*/,,' | tr '[A-Z]' '[a-z]'``echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`-gnu + exit ;; + i*86:Minix:*:*) + echo ${UNAME_MACHINE}-pc-minix + exit ;; + arm*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + avr32*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + cris:Linux:*:*) + echo cris-axis-linux-gnu + exit ;; + crisv32:Linux:*:*) + echo crisv32-axis-linux-gnu + exit ;; + frv:Linux:*:*) + echo frv-unknown-linux-gnu + exit ;; + ia64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + m32r*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + m68*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + mips:Linux:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #undef CPU + #undef mips + #undef mipsel + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) + CPU=mipsel + #else + #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + CPU=mips + #else + CPU= + #endif + #endif +EOF + eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' + /^CPU/{ + s: ::g + p + }'`" + test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } + ;; + mips64:Linux:*:*) + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #undef CPU + #undef mips64 + #undef mips64el + #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || 
defined(MIPSEL) + CPU=mips64el + #else + #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) + CPU=mips64 + #else + CPU= + #endif + #endif +EOF + eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' + /^CPU/{ + s: ::g + p + }'`" + test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } + ;; + or32:Linux:*:*) + echo or32-unknown-linux-gnu + exit ;; + ppc:Linux:*:*) + echo powerpc-unknown-linux-gnu + exit ;; + ppc64:Linux:*:*) + echo powerpc64-unknown-linux-gnu + exit ;; + alpha:Linux:*:*) + case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in + EV5) UNAME_MACHINE=alphaev5 ;; + EV56) UNAME_MACHINE=alphaev56 ;; + PCA56) UNAME_MACHINE=alphapca56 ;; + PCA57) UNAME_MACHINE=alphapca56 ;; + EV6) UNAME_MACHINE=alphaev6 ;; + EV67) UNAME_MACHINE=alphaev67 ;; + EV68*) UNAME_MACHINE=alphaev68 ;; + esac + objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null + if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi + echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} + exit ;; + parisc:Linux:*:* | hppa:Linux:*:*) + # Look for CPU level + case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in + PA7*) echo hppa1.1-unknown-linux-gnu ;; + PA8*) echo hppa2.0-unknown-linux-gnu ;; + *) echo hppa-unknown-linux-gnu ;; + esac + exit ;; + parisc64:Linux:*:* | hppa64:Linux:*:*) + echo hppa64-unknown-linux-gnu + exit ;; + s390:Linux:*:* | s390x:Linux:*:*) + echo ${UNAME_MACHINE}-ibm-linux + exit ;; + sh64*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + sh*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + sparc:Linux:*:* | sparc64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + vax:Linux:*:*) + echo ${UNAME_MACHINE}-dec-linux-gnu + exit ;; + x86_64:Linux:*:*) + echo x86_64-unknown-linux-gnu + exit ;; + i*86:Linux:*:*) + # The BFD linker knows what the default object file format is, so + # first see if it will tell us. 
cd to the root directory to prevent + # problems with other programs or directories called `ld' in the path. + # Set LC_ALL=C to ensure ld outputs messages in English. + ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \ + | sed -ne '/supported targets:/!d + s/[ ][ ]*/ /g + s/.*supported targets: *// + s/ .*// + p'` + case "$ld_supported_targets" in + elf32-i386) + TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu" + ;; + a.out-i386-linux) + echo "${UNAME_MACHINE}-pc-linux-gnuaout" + exit ;; + coff-i386) + echo "${UNAME_MACHINE}-pc-linux-gnucoff" + exit ;; + "") + # Either a pre-BFD a.out linker (linux-gnuoldld) or + # one that does not give us useful --help. + echo "${UNAME_MACHINE}-pc-linux-gnuoldld" + exit ;; + esac + # Determine whether the default compiler is a.out or elf + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #include <features.h> + #ifdef __ELF__ + # ifdef __GLIBC__ + # if __GLIBC__ >= 2 + LIBC=gnu + # else + LIBC=gnulibc1 + # endif + # else + LIBC=gnulibc1 + # endif + #else + #if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) + LIBC=gnu + #else + LIBC=gnuaout + #endif + #endif + #ifdef __dietlibc__ + LIBC=dietlibc + #endif +EOF + eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' + /^LIBC/{ + s: ::g + p + }'`" + test x"${LIBC}" != x && { + echo "${UNAME_MACHINE}-pc-linux-${LIBC}" + exit + } + test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; } + ;; + i*86:DYNIX/ptx:4*:*) + # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. + # earlier versions are messed up and put the nodename in both + # sysname and nodename. + echo i386-sequent-sysv4 + exit ;; + i*86:UNIX_SV:4.2MP:2.*) + # Unixware is an offshoot of SVR4, but it has its own version + # number series starting with 2... + # I am not positive that other SVR4 systems won't match this, + # I just have to hope. -- rms. + # Use sysv4.2uw... so that sysv4* matches it. 
+ echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} + exit ;; + i*86:OS/2:*:*) + # If we were able to find `uname', then EMX Unix compatibility + # is probably installed. + echo ${UNAME_MACHINE}-pc-os2-emx + exit ;; + i*86:XTS-300:*:STOP) + echo ${UNAME_MACHINE}-unknown-stop + exit ;; + i*86:atheos:*:*) + echo ${UNAME_MACHINE}-unknown-atheos + exit ;; + i*86:syllable:*:*) + echo ${UNAME_MACHINE}-pc-syllable + exit ;; + i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*) + echo i386-unknown-lynxos${UNAME_RELEASE} + exit ;; + i*86:*DOS:*:*) + echo ${UNAME_MACHINE}-pc-msdosdjgpp + exit ;; + i*86:*:4.*:* | i*86:SYSTEM_V:4.*:*) + UNAME_REL=`echo ${UNAME_RELEASE} | sed 's/\/MP$//'` + if grep Novell /usr/include/link.h >/dev/null 2>/dev/null; then + echo ${UNAME_MACHINE}-univel-sysv${UNAME_REL} + else + echo ${UNAME_MACHINE}-pc-sysv${UNAME_REL} + fi + exit ;; + i*86:*:5:[678]*) + # UnixWare 7.x, OpenUNIX and OpenServer 6. + case `/bin/uname -X | grep "^Machine"` in + *486*) UNAME_MACHINE=i486 ;; + *Pentium) UNAME_MACHINE=i586 ;; + *Pent*|*Celeron) UNAME_MACHINE=i686 ;; + esac + echo ${UNAME_MACHINE}-unknown-sysv${UNAME_RELEASE}${UNAME_SYSTEM}${UNAME_VERSION} + exit ;; + i*86:*:3.2:*) + if test -f /usr/options/cb.name; then + UNAME_REL=`sed -n 's/.*Version //p' </usr/options/cb.name` + echo ${UNAME_MACHINE}-pc-isc$UNAME_REL + elif /bin/uname -X 2>/dev/null >/dev/null ; then + UNAME_REL=`(/bin/uname -X|grep Release|sed -e 's/.*= //')` + (/bin/uname -X|grep i80486 >/dev/null) && UNAME_MACHINE=i486 + (/bin/uname -X|grep '^Machine.*Pentium' >/dev/null) \ + && UNAME_MACHINE=i586 + (/bin/uname -X|grep '^Machine.*Pent *II' >/dev/null) \ + && UNAME_MACHINE=i686 + (/bin/uname -X|grep '^Machine.*Pentium Pro' >/dev/null) \ + && UNAME_MACHINE=i686 + echo ${UNAME_MACHINE}-pc-sco$UNAME_REL + else + echo ${UNAME_MACHINE}-pc-sysv32 + fi + exit ;; + pc:*:*:*) + # Left here for compatibility: + # uname -m prints for DJGPP always 'pc', but it prints nothing about + # the processor, so we play safe by assuming i386. 
+ echo i386-pc-msdosdjgpp + exit ;; + Intel:Mach:3*:*) + echo i386-pc-mach3 + exit ;; + paragon:*:*:*) + echo i860-intel-osf1 + exit ;; + i860:*:4.*:*) # i860-SVR4 + if grep Stardent /usr/include/sys/uadmin.h >/dev/null 2>&1 ; then + echo i860-stardent-sysv${UNAME_RELEASE} # Stardent Vistra i860-SVR4 + else # Add other i860-SVR4 vendors below as they are discovered. + echo i860-unknown-sysv${UNAME_RELEASE} # Unknown i860-SVR4 + fi + exit ;; + mini*:CTIX:SYS*5:*) + # "miniframe" + echo m68010-convergent-sysv + exit ;; + mc68k:UNIX:SYSTEM5:3.51m) + echo m68k-convergent-sysv + exit ;; + M680?0:D-NIX:5.3:*) + echo m68k-diab-dnix + exit ;; + M68*:*:R3V[5678]*:*) + test -r /sysV68 && { echo 'm68k-motorola-sysv'; exit; } ;; + 3[345]??:*:4.0:3.0 | 3[34]??A:*:4.0:3.0 | 3[34]??,*:*:4.0:3.0 | 3[34]??/*:*:4.0:3.0 | 4400:*:4.0:3.0 | 4850:*:4.0:3.0 | SKA40:*:4.0:3.0 | SDS2:*:4.0:3.0 | SHG2:*:4.0:3.0 | S7501*:*:4.0:3.0) + OS_REL='' + test -r /etc/.relid \ + && OS_REL=.`sed -n 's/[^ ]* [^ ]* \([0-9][0-9]\).*/\1/p' < /etc/.relid` + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4.3${OS_REL}; exit; } + /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ + && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; + 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4; exit; } ;; + m68*:LynxOS:2.*:* | m68*:LynxOS:3.0*:*) + echo m68k-unknown-lynxos${UNAME_RELEASE} + exit ;; + mc68030:UNIX_System_V:4.*:*) + echo m68k-atari-sysv4 + exit ;; + TSUNAMI:LynxOS:2.*:*) + echo sparc-unknown-lynxos${UNAME_RELEASE} + exit ;; + rs6000:LynxOS:2.*:*) + echo rs6000-unknown-lynxos${UNAME_RELEASE} + exit ;; + PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*) + echo powerpc-unknown-lynxos${UNAME_RELEASE} + exit ;; + SM[BE]S:UNIX_SV:*:*) + echo mips-dde-sysv${UNAME_RELEASE} + exit ;; + RM*:ReliantUNIX-*:*:*) + echo mips-sni-sysv4 + exit ;; + RM*:SINIX-*:*:*) + echo mips-sni-sysv4 + exit ;; + 
*:SINIX-*:*:*) + if uname -p 2>/dev/null >/dev/null ; then + UNAME_MACHINE=`(uname -p) 2>/dev/null` + echo ${UNAME_MACHINE}-sni-sysv4 + else + echo ns32k-sni-sysv + fi + exit ;; + PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort + # says <Richard.M.Bartel@ccMail.Census.GOV> + echo i586-unisys-sysv4 + exit ;; + *:UNIX_System_V:4*:FTX*) + # From Gerald Hewes <hewes@openmarket.com>. + # How about differentiating between stratus architectures? -djm + echo hppa1.1-stratus-sysv4 + exit ;; + *:*:*:FTX*) + # From seanf@swdc.stratus.com. + echo i860-stratus-sysv4 + exit ;; + i*86:VOS:*:*) + # From Paul.Green@stratus.com. + echo ${UNAME_MACHINE}-stratus-vos + exit ;; + *:VOS:*:*) + # From Paul.Green@stratus.com. + echo hppa1.1-stratus-vos + exit ;; + mc68*:A/UX:*:*) + echo m68k-apple-aux${UNAME_RELEASE} + exit ;; + news*:NEWS-OS:6*:*) + echo mips-sony-newsos6 + exit ;; + R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) + if [ -d /usr/nec ]; then + echo mips-nec-sysv${UNAME_RELEASE} + else + echo mips-unknown-sysv${UNAME_RELEASE} + fi + exit ;; + BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. + echo powerpc-be-beos + exit ;; + BeMac:BeOS:*:*) # BeOS running on Mac or Mac clone, PPC only. + echo powerpc-apple-beos + exit ;; + BePC:BeOS:*:*) # BeOS running on Intel PC compatible. 
+ echo i586-pc-beos + exit ;; + SX-4:SUPER-UX:*:*) + echo sx4-nec-superux${UNAME_RELEASE} + exit ;; + SX-5:SUPER-UX:*:*) + echo sx5-nec-superux${UNAME_RELEASE} + exit ;; + SX-6:SUPER-UX:*:*) + echo sx6-nec-superux${UNAME_RELEASE} + exit ;; + SX-7:SUPER-UX:*:*) + echo sx7-nec-superux${UNAME_RELEASE} + exit ;; + SX-8:SUPER-UX:*:*) + echo sx8-nec-superux${UNAME_RELEASE} + exit ;; + Power*:Rhapsody:*:*) + echo powerpc-apple-rhapsody${UNAME_RELEASE} + exit ;; + *:Rhapsody:*:*) + echo ${UNAME_MACHINE}-apple-rhapsody${UNAME_RELEASE} + exit ;; + *:Darwin:*:*) + UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown + case $UNAME_PROCESSOR in + unknown) UNAME_PROCESSOR=powerpc ;; + esac + echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} + exit ;; + *:procnto*:*:* | *:QNX:[0123456789]*:*) + UNAME_PROCESSOR=`uname -p` + if test "$UNAME_PROCESSOR" = "x86"; then + UNAME_PROCESSOR=i386 + UNAME_MACHINE=pc + fi + echo ${UNAME_PROCESSOR}-${UNAME_MACHINE}-nto-qnx${UNAME_RELEASE} + exit ;; + *:QNX:*:4*) + echo i386-pc-qnx + exit ;; + NSE-?:NONSTOP_KERNEL:*:*) + echo nse-tandem-nsk${UNAME_RELEASE} + exit ;; + NSR-?:NONSTOP_KERNEL:*:*) + echo nsr-tandem-nsk${UNAME_RELEASE} + exit ;; + *:NonStop-UX:*:*) + echo mips-compaq-nonstopux + exit ;; + BS2000:POSIX*:*:*) + echo bs2000-siemens-sysv + exit ;; + DS/*:UNIX_System_V:*:*) + echo ${UNAME_MACHINE}-${UNAME_SYSTEM}-${UNAME_RELEASE} + exit ;; + *:Plan9:*:*) + # "uname -m" is not consistent, so use $cputype instead. 386 + # is converted to i386 for consistency with other x86 + # operating systems. 
+ if test "$cputype" = "386"; then + UNAME_MACHINE=i386 + else + UNAME_MACHINE="$cputype" + fi + echo ${UNAME_MACHINE}-unknown-plan9 + exit ;; + *:TOPS-10:*:*) + echo pdp10-unknown-tops10 + exit ;; + *:TENEX:*:*) + echo pdp10-unknown-tenex + exit ;; + KS10:TOPS-20:*:* | KL10:TOPS-20:*:* | TYPE4:TOPS-20:*:*) + echo pdp10-dec-tops20 + exit ;; + XKL-1:TOPS-20:*:* | TYPE5:TOPS-20:*:*) + echo pdp10-xkl-tops20 + exit ;; + *:TOPS-20:*:*) + echo pdp10-unknown-tops20 + exit ;; + *:ITS:*:*) + echo pdp10-unknown-its + exit ;; + SEI:*:*:SEIUX) + echo mips-sei-seiux${UNAME_RELEASE} + exit ;; + *:DragonFly:*:*) + echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` + exit ;; + *:*VMS:*:*) + UNAME_MACHINE=`(uname -p) 2>/dev/null` + case "${UNAME_MACHINE}" in + A*) echo alpha-dec-vms ; exit ;; + I*) echo ia64-dec-vms ; exit ;; + V*) echo vax-dec-vms ; exit ;; + esac ;; + *:XENIX:*:SysV) + echo i386-pc-xenix + exit ;; + i*86:skyos:*:*) + echo ${UNAME_MACHINE}-pc-skyos`echo ${UNAME_RELEASE}` | sed -e 's/ .*$//' + exit ;; + i*86:rdos:*:*) + echo ${UNAME_MACHINE}-pc-rdos + exit ;; +esac + +#echo '(No uname command or uname output not recognized.)' 1>&2 +#echo "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" 1>&2 + +eval $set_cc_for_build +cat >$dummy.c <<EOF +#ifdef _SEQUENT_ +# include <sys/types.h> +# include <sys/utsname.h> +#endif +main () +{ +#if defined (sony) +#if defined (MIPSEB) + /* BFD wants "bsd" instead of "newsos". Perhaps BFD should be changed, + I don't know.... 
*/ + printf ("mips-sony-bsd\n"); exit (0); +#else +#include <sys/param.h> + printf ("m68k-sony-newsos%s\n", +#ifdef NEWSOS4 + "4" +#else + "" +#endif + ); exit (0); +#endif +#endif + +#if defined (__arm) && defined (__acorn) && defined (__unix) + printf ("arm-acorn-riscix\n"); exit (0); +#endif + +#if defined (hp300) && !defined (hpux) + printf ("m68k-hp-bsd\n"); exit (0); +#endif + +#if defined (NeXT) +#if !defined (__ARCHITECTURE__) +#define __ARCHITECTURE__ "m68k" +#endif + int version; + version=`(hostinfo | sed -n 's/.*NeXT Mach \([0-9]*\).*/\1/p') 2>/dev/null`; + if (version < 4) + printf ("%s-next-nextstep%d\n", __ARCHITECTURE__, version); + else + printf ("%s-next-openstep%d\n", __ARCHITECTURE__, version); + exit (0); +#endif + +#if defined (MULTIMAX) || defined (n16) +#if defined (UMAXV) + printf ("ns32k-encore-sysv\n"); exit (0); +#else +#if defined (CMU) + printf ("ns32k-encore-mach\n"); exit (0); +#else + printf ("ns32k-encore-bsd\n"); exit (0); +#endif +#endif +#endif + +#if defined (__386BSD__) + printf ("i386-pc-bsd\n"); exit (0); +#endif + +#if defined (sequent) +#if defined (i386) + printf ("i386-sequent-dynix\n"); exit (0); +#endif +#if defined (ns32000) + printf ("ns32k-sequent-dynix\n"); exit (0); +#endif +#endif + +#if defined (_SEQUENT_) + struct utsname un; + + uname(&un); + + if (strncmp(un.version, "V2", 2) == 0) { + printf ("i386-sequent-ptx2\n"); exit (0); + } + if (strncmp(un.version, "V1", 2) == 0) { /* XXX is V1 correct? 
*/ + printf ("i386-sequent-ptx1\n"); exit (0); + } + printf ("i386-sequent-ptx\n"); exit (0); + +#endif + +#if defined (vax) +# if !defined (ultrix) +# include <sys/param.h> +# if defined (BSD) +# if BSD == 43 + printf ("vax-dec-bsd4.3\n"); exit (0); +# else +# if BSD == 199006 + printf ("vax-dec-bsd4.3reno\n"); exit (0); +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# endif +# else + printf ("vax-dec-bsd\n"); exit (0); +# endif +# else + printf ("vax-dec-ultrix\n"); exit (0); +# endif +#endif + +#if defined (alliant) && defined (i860) + printf ("i860-alliant-bsd\n"); exit (0); +#endif + + exit (1); +} +EOF + +$CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null && SYSTEM_NAME=`$dummy` && + { echo "$SYSTEM_NAME"; exit; } + +# Apollos put the system type in the environment. + +test -d /usr/apollo && { echo ${ISP}-apollo-${SYSTYPE}; exit; } + +# Convex versions that predate uname can use getsysinfo(1) + +if [ -x /usr/convex/getsysinfo ] +then + case `getsysinfo -f cpu_type` in + c1*) + echo c1-convex-bsd + exit ;; + c2*) + if getsysinfo -f scalar_acc + then echo c32-convex-bsd + else echo c2-convex-bsd + fi + exit ;; + c34*) + echo c34-convex-bsd + exit ;; + c38*) + echo c38-convex-bsd + exit ;; + c4*) + echo c4-convex-bsd + exit ;; + esac +fi + +cat >&2 <<EOF +$0: unable to guess system type + +This script, last modified $timestamp, has failed to recognize +the operating system you are using. It is advised that you +download the most up to date version of the config scripts from + + http://savannah.gnu.org/cgi-bin/viewcvs/*checkout*/config/config/config.guess +and + http://savannah.gnu.org/cgi-bin/viewcvs/*checkout*/config/config/config.sub + +If the version you run ($0) is already up to date, please +send the following data and any information you think might be +pertinent to <config-patches@gnu.org> in order to provide the needed +information to handle your system. 
+ +config.guess timestamp = $timestamp + +uname -m = `(uname -m) 2>/dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null` + +hostinfo = `(hostinfo) 2>/dev/null` +/bin/universe = `(/bin/universe) 2>/dev/null` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null` +/bin/arch = `(/bin/arch) 2>/dev/null` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null` + +UNAME_MACHINE = ${UNAME_MACHINE} +UNAME_RELEASE = ${UNAME_RELEASE} +UNAME_SYSTEM = ${UNAME_SYSTEM} +UNAME_VERSION = ${UNAME_VERSION} +EOF + +exit 1 + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/config.h b/qcd/part_cpu/applications/QCD/src/kernel_D/config.h new file mode 100644 index 0000000000000000000000000000000000000000..ffd74d2b6da31ec449a7bdbb62adafc170de5fdb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/config.h @@ -0,0 +1,211 @@ +/* config.h. Generated from config.h.in by configure. */ +/* config.h.in. Generated from configure.in by autoheader. */ +#ifndef _CONFIG_H +#define _CONFIG_H + +/* We are on a CRAY */ +/* #undef CRAY */ + +/* lapack available */ +#define HAVE_LAPACK 1 + +/* Define to 1 if you have the `lime' library (-llime). */ +/* #undef HAVE_LIBLIME */ + +/* Define to 1 if you have the `lemon' library (-llemon). 
*/ +/* #undef HAVE_LIBLEMON */ + +/* 1 if clock_gettime is available for use in benchmark */ +#define HAVE_CLOCK_GETTIME 1 + +/* Compile with MPI support */ +#define MPI 1 + +/* Compile with OpenMP support +#define OMP 1*/ + +/* Compile with FFTW support */ +/* #undef HAVE_FFTW */ + +/* Fortran has not extra _ */ +/* #undef NOF77_ */ + +/* Use Opteron instructions */ +/* #undef OPTERON */ + +/* Use Pentium4 instructions */ +/* #undef P4 */ + +/* Define to the address where bug reports for this package should be sent. */ +#define PACKAGE_BUGREPORT "curbach@gmx.de" + +/* Define to the full name of this package. */ +#define PACKAGE_NAME "tmLQCD" + +/* Define to the full name and version of this package. */ +#define PACKAGE_STRING "tmLQCD 5.2.0" + +/* Define to the one symbol short name of this package. */ +#define PACKAGE_TARNAME "tmlqcd" + +/* Define to the version of this package. */ +#define PACKAGE_VERSION "5.2.0" + +/* Index independent addressing */ +/* #undef _INDEX_INDEP_GEOM */ + +/* X parallelisation */ +/* #undef PARALLELX */ + +/* XY parallelisation */ +/* #undef PARALLELXY */ + +/* XYZ parallelisation */ +/* #undef PARALLELXYZ */ + +/* One dimensional parallelisation */ +/* #undef PARALLELT */ + +/* Two dimensional parallelisation */ +/* #undef PARALLELXT */ + +/* Three dimensional parallelisation */ +/* #undef PARALLELXYT */ + +/* Four dimensional parallelisation */ +#define PARALLELXYZT 1 + +/* timeslice-splitted communications */ +/* #undef _USE_TSPLITPAR */ + +/* Fixed volume at compiletime */ +/* #undef FIXEDVOLUME */ + +/* Define to 1 if fseeko (and presumably ftello) exists and is declared. 
*/ +#define HAVE_FSEEKO 1 + +/* Alignment for arrays -- necessary for SSE and automated vectorization */ +#define ALIGN_BASE 0x00 + +/* Alignment compiler hint macro */ +#define ALIGN /**/ + +/* Alignment for 32bit arrays -- necessary for SSE and automated vectorization */ +#define ALIGN_BASE32 0x00 + +/* Alignment of 32bit fields, compiler hint macro */ +#define ALIGN32 /**/ + +/* Compile with SSE2 support */ +/* #undef SSE2 */ + +/* Compile with SSE3 support */ +/* #undef SSE3 */ + +/* Optimize for Blue Gene/L */ +/* #undef BGL */ + +/* Optimize for Blue Gene/P */ +/* #undef BGP */ + +/* Compile with QPX intrinsics */ +/* #undef BGQ */ + +/* Compile with SPI for communications */ +/* #undef SPI */ + +/* Are we using the IBM xlc compiler? */ +/* #undef XLC */ + +/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a + `char[]'. */ +#define YYTEXT_POINTER 1 + +/* Number of bits in a file offset, on hosts where this is settable. */ +/* #undef _FILE_OFFSET_BITS */ + +/* Construct an extra copy of the gauge fields */ +#define _GAUGE_COPY 1 + +/* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */ +#define _LARGEFILE_SOURCE 1 + +/* Define for large files, on AIX-style hosts. */ +/* #undef _LARGE_FILES */ + +/* Use even/odd geometry in the gauge fields */ +/* #undef _NEW_GEOMETRY */ + +/* x86 64 Bit architecture */ +#define _x86_64 1 + +/* Define to empty if `const' does not conform to ANSI C. */ +/* #undef const */ + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +/* #undef inline */ +#endif + +/* Define to `long' if <sys/types.h> does not define. */ +/* #undef off_t */ + +/* Define to `unsigned' if <sys/types.h> does not define. */ +/* #undef size_t */ + +/* Define to 1 if you have the <stdint.h> header file. */ +#define HAVE_STDINT_H 1 + +/* Define to 1 if you have the <sys/types.h> header file. 
*/ +#define HAVE_SYS_TYPES_H 1 + +/* Define to 1 if the system has the type `uint16_t'. */ +#define HAVE_UINT16_T 1 + +/* Define to 1 if the system has the type `uint32_t'. */ +#define HAVE_UINT32_T 1 + +/* Define to 1 if the system has the type `uint64_t'. */ +#define HAVE_UINT64_T 1 + +/* Define to 1 if you have the <unistd.h> header file. */ +#define HAVE_UNISTD_H 1 + +/* Define to 1 if Dirac operator with halfspinor should be used */ +#define _USE_HALFSPINOR 1 + +/* Define to 1 if shmem API should be used */ +/* #undef _USE_SHMEM */ + +/* Define to 1 if KOJAK instrumentalisation should be done*/ +/* #undef _KOJAK_INST */ + +/* Define to equivalent of C99 restrict keyword, or to nothing if this is not + supported. Do not define if restrict is supported directly. */ +#define restrict __restrict + +/* Define to 1 if persistent MPI calls for halfspinor should be used */ +/* #undef _PERSISTENT */ + +/* Define to 1 if non-blocking MPI calls for spinor and gauge should be used */ +#define _NON_BLOCKING 1 + +/* Define if we want to use CUDA GPU */ +/* #undef HAVE_GPU */ + +/* Define if we want to compute the LapH eigenvectors */ +/* #undef WITHLAPH */ + +/* Define to 1 if you have the `quda' library (-lquda). */ +/* #undef HAVE_LIBQUDA */ + +/* Using QUDA GPU */ +/* #undef QUDA */ + +/* Using Benchmark - no c-lime needed */ +#define BENCHMARK 1 +#define INIT_GLOBALS 1 +#define _BENCH_ONLY 1 +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/config.h.in b/qcd/part_cpu/applications/QCD/src/kernel_D/config.h.in new file mode 100644 index 0000000000000000000000000000000000000000..83613d3380a90e8b697331381ea99185f5dee051 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/config.h.in @@ -0,0 +1,209 @@ +/* config.h.in. Generated from configure.in by autoheader. */ +#ifndef _CONFIG_H +#define _CONFIG_H + +/* We are on a CRAY */ +#undef CRAY + +/* lapack available */ +#undef HAVE_LAPACK + +/* Define to 1 if you have the `lime' library (-llime). 
*/ +#undef HAVE_LIBLIME + +/* Define to 1 if you have the `lemon' library (-llemon). */ +#undef HAVE_LIBLEMON + +/* 1 if clock_gettime is available for use in benchmark */ +#undef HAVE_CLOCK_GETTIME + +/* Compile with MPI support */ +#undef MPI + +/* Compile with OpenMP support */ +#undef OMP + +/* Compile with FFTW support */ +#undef HAVE_FFTW + +/* Fortran has not extra _ */ +#undef NOF77_ + +/* Use Opteron instructions */ +#undef OPTERON + +/* Use Pentium4 instructions */ +#undef P4 + +/* Define to the address where bug reports for this package should be sent. */ +#undef PACKAGE_BUGREPORT + +/* Define to the full name of this package. */ +#undef PACKAGE_NAME + +/* Define to the full name and version of this package. */ +#undef PACKAGE_STRING + +/* Define to the one symbol short name of this package. */ +#undef PACKAGE_TARNAME + +/* Define to the version of this package. */ +#undef PACKAGE_VERSION + +/* Index independent addressing */ +#undef _INDEX_INDEP_GEOM + +/* X parallelisation */ +#undef PARALLELX + +/* XY parallelisation */ +#undef PARALLELXY + +/* XYZ parallelisation */ +#undef PARALLELXYZ + +/* One dimensional parallelisation */ +#undef PARALLELT + +/* Two dimensional parallelisation */ +#undef PARALLELXT + +/* Three dimensional parallelisation */ +#undef PARALLELXYT + +/* Four dimensional parallelisation */ +#undef PARALLELXYZT + +/* timeslice-splitted communications */ +#undef _USE_TSPLITPAR + +/* Fixed volume at compiletime */ +#undef FIXEDVOLUME + +/* Define to 1 if fseeko (and presumably ftello) exists and is declared. 
*/ +#undef HAVE_FSEEKO + +/* Alignment for arrays -- necessary for SSE and automated vectorization */ +#undef ALIGN_BASE + +/* Alignment compiler hint macro */ +#undef ALIGN + +/* Alignment for 32bit arrays -- necessary for SSE and automated vectorization */ +#undef ALIGN_BASE32 + +/* Alignment of 32bit fields, compiler hint macro */ +#undef ALIGN32 + +/* Compile with SSE2 support */ +#undef SSE2 + +/* Compile with SSE3 support */ +#undef SSE3 + +/* Optimize for Blue Gene/L */ +#undef BGL + +/* Optimize for Blue Gene/P */ +#undef BGP + +/* Compile with QPX intrinsics */ +#undef BGQ + +/* Compile with SPI for communications */ +#undef SPI + +/* Are we using the IBM xlc compiler? */ +#undef XLC + +/* Define to 1 if `lex' declares `yytext' as a `char *' by default, not a + `char[]'. */ +#undef YYTEXT_POINTER + +/* Number of bits in a file offset, on hosts where this is settable. */ +#undef _FILE_OFFSET_BITS + +/* Construct an extra copy of the gauge fields */ +#undef _GAUGE_COPY + +/* Define to 1 to make fseeko visible on some hosts (e.g. glibc 2.2). */ +#undef _LARGEFILE_SOURCE + +/* Define for large files, on AIX-style hosts. */ +#undef _LARGE_FILES + +/* Use even/odd geometry in the gauge fields */ +#undef _NEW_GEOMETRY + +/* x86 64 Bit architecture */ +#undef _x86_64 + +/* Define to empty if `const' does not conform to ANSI C. */ +#undef const + +/* Define to `__inline__' or `__inline' if that's what the C compiler + calls it, or to nothing if 'inline' is not supported under any name. */ +#ifndef __cplusplus +#undef inline +#endif + +/* Define to `long' if <sys/types.h> does not define. */ +#undef off_t + +/* Define to `unsigned' if <sys/types.h> does not define. */ +#undef size_t + +/* Define to 1 if you have the <stdint.h> header file. */ +#undef HAVE_STDINT_H + +/* Define to 1 if you have the <sys/types.h> header file. */ +#undef HAVE_SYS_TYPES_H + +/* Define to 1 if the system has the type `uint16_t'. 
*/ +#undef HAVE_UINT16_T + +/* Define to 1 if the system has the type `uint32_t'. 
*/ +#undef HAVE_UINT32_T + +/* Define to 1 if the system has the type `uint64_t'. */ +#undef HAVE_UINT64_T + +/* Define to 1 if you have the <unistd.h> header file. */ +#undef HAVE_UNISTD_H + +/* Define to 1 if Dirac operator with halfspinor should be used */ +#undef _USE_HALFSPINOR + +/* Define to 1 if shmem API should be used */ +#undef _USE_SHMEM + +/* Define to 1 if KOJAK instrumentalisation should be done*/ +#undef _KOJAK_INST + +/* Define to equivalent of C99 restrict keyword, or to nothing if this is not + supported. Do not define if restrict is supported directly. */ +#undef restrict + +/* Define to 1 if persistent MPI calls for halfspinor should be used */ +#undef _PERSISTENT + +/* Define to 1 if non-blocking MPI calls for spinor and gauge should be used */ +#undef _NON_BLOCKING + +/* Define if we want to use CUDA GPU */ +#undef HAVE_GPU + +/* Define if we want to compute the LapH eigenvectors */ +#undef WITHLAPH + +/* Define to 1 if you have the `quda' library (-lquda). */ +#undef HAVE_LIBQUDA + +/* Using QUDA GPU */ +#undef QUDA + +/* Using Benchmark - no c-lime needed */ +#undef BENCHMARK + +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/config.log b/qcd/part_cpu/applications/QCD/src/kernel_D/config.log new file mode 100644 index 0000000000000000000000000000000000000000..64b4807fac83d8dc89313ca8852032d03f076ec9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/config.log @@ -0,0 +1,3302 @@ +This file contains any messages produced by compilers while +running configure, to aid debugging if configure makes a mistake. + +It was created by tmLQCD configure 5.2.0, which was +generated by GNU Autoconf 2.69. Invocation command line was + + $ ./configure --enable-mpi --with-mpidimension=4 --enable-gaugecopy CC=mpicc CFLAGS=-std=c99 -fopenmp F77=f95 --enable-omp LIBS=/usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 --with-lapack=/usr/lib/lapack/liblapack.so.3 --enable-benchmark + +## --------- ## +## Platform. 
## +## --------- ## + +hostname = jacob-All-Series +uname -m = x86_64 +uname -r = 3.16.0-62-generic +uname -s = Linux +uname -v = #83~14.04.1-Ubuntu SMP Fri Feb 26 22:52:39 UTC 2016 + +/usr/bin/uname -p = unknown +/bin/uname -X = unknown + +/bin/arch = unknown +/usr/bin/arch -k = unknown +/usr/convex/getsysinfo = unknown +/usr/bin/hostinfo = unknown +/bin/machine = unknown +/usr/bin/oslevel = unknown +/bin/universe = unknown + +PATH: /usr/local/sbin +PATH: /usr/local/bin +PATH: /usr/sbin +PATH: /usr/bin +PATH: /sbin +PATH: /bin +PATH: /usr/games +PATH: /usr/local/games + + +## ----------- ## +## Core tests. ## +## ----------- ## + +configure:2633: checking build system type +configure:2647: result: x86_64-unknown-linux-gnu +configure:2667: checking host system type +configure:2680: result: x86_64-unknown-linux-gnu +configure:2764: checking for gcc +configure:2791: result: mpicc +configure:3020: checking for C compiler version +configure:3029: mpicc --version >&5 +gcc (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4 +Copyright (C) 2013 Free Software Foundation, Inc. +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +configure:3040: $? = 0 +configure:3029: mpicc -v >&5 +Using built-in specs. 
+COLLECT_GCC=/usr/bin/gcc +COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.8/lto-wrapper +Target: x86_64-linux-gnu +Configured with: ../src/configure -v --with-pkgversion='Ubuntu 4.8.4-2ubuntu1~14.04.1' --with-bugurl=file:///usr/share/doc/gcc-4.8/README.Bugs --enable-languages=c,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-4.8 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.8 --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --enable-gnu-unique-object --disable-libmudflap --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-4.8-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-4.8-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-4.8-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu +Thread model: posix +gcc version 4.8.4 (Ubuntu 4.8.4-2ubuntu1~14.04.1) +configure:3040: $? = 0 +configure:3029: mpicc -V >&5 +gcc: error: unrecognized command line option '-V' +gcc: fatal error: no input files +compilation terminated. +configure:3040: $? = 4 +configure:3029: mpicc -qversion >&5 +gcc: error: unrecognized command line option '-qversion' +gcc: fatal error: no input files +compilation terminated. +configure:3040: $? = 4 +configure:3060: checking whether the C compiler works +configure:3082: mpicc -std=c99 -fopenmp conftest.c /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 >&5 +configure:3086: $? 
= 0 +configure:3134: result: yes +configure:3137: checking for C compiler default output file name +configure:3139: result: a.out +configure:3145: checking for suffix of executables +configure:3152: mpicc -o conftest -std=c99 -fopenmp conftest.c /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 >&5 +configure:3156: $? = 0 +configure:3178: result: +configure:3200: checking whether we are cross compiling +configure:3208: mpicc -o conftest -std=c99 -fopenmp conftest.c /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 >&5 +configure:3212: $? = 0 +configure:3219: ./conftest +configure:3223: $? = 0 +configure:3238: result: no +configure:3243: checking for suffix of object files +configure:3265: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:3269: $? = 0 +configure:3290: result: o +configure:3294: checking whether we are using the GNU C compiler +configure:3313: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:3313: $? = 0 +configure:3322: result: yes +configure:3331: checking whether mpicc accepts -g +configure:3351: mpicc -c -g conftest.c >&5 +configure:3351: $? = 0 +configure:3392: result: yes +configure:3409: checking for mpicc option to accept ISO C89 +configure:3472: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:3472: $? = 0 +configure:3485: result: none needed +configure:3505: checking for mpicc option to accept ISO C99 +configure:3654: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:3654: $? = 0 +configure:3667: result: none needed +configure:3683: checking for an ANSI C-conforming const +configure:3749: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:3749: $? = 0 +configure:3756: result: yes +configure:3764: checking for inline +configure:3780: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:3780: $? = 0 +configure:3788: result: inline +configure:3806: checking for C/C++ restrict keyword +configure:3831: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:3831: $? 
= 0 +configure:3839: result: __restrict +configure:3958: checking for Fortran 77 compiler version +configure:3967: f95 --version >&5 +GNU Fortran (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4 +Copyright (C) 2013 Free Software Foundation, Inc. + +GNU Fortran comes with NO WARRANTY, to the extent permitted by law. +You may redistribute copies of GNU Fortran +under the terms of the GNU General Public License. +For more information about these matters, see the file named COPYING + +configure:3978: $? = 0 +configure:3967: f95 -v >&5 +Using built-in specs. +COLLECT_GCC=f95 +COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.8/lto-wrapper +Target: x86_64-linux-gnu +Configured with: ../src/configure -v --with-pkgversion='Ubuntu 4.8.4-2ubuntu1~14.04.1' --with-bugurl=file:///usr/share/doc/gcc-4.8/README.Bugs --enable-languages=c,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-4.8 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.8 --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --enable-gnu-unique-object --disable-libmudflap --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-4.8-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-4.8-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-4.8-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu +Thread model: posix +gcc version 4.8.4 (Ubuntu 4.8.4-2ubuntu1~14.04.1) +configure:3978: $? 
= 0 +configure:3967: f95 -V >&5 +f95: error: unrecognized command line option '-V' +f95: fatal error: no input files +compilation terminated. +configure:3978: $? = 4 +configure:3967: f95 -qversion >&5 +f95: error: unrecognized command line option '-qversion' +f95: fatal error: no input files +compilation terminated. +configure:3978: $? = 4 +configure:3987: checking whether we are using the GNU Fortran 77 compiler +configure:4000: f95 -c conftest.F >&5 +configure:4000: $? = 0 +configure:4009: result: yes +configure:4015: checking whether f95 accepts -g +configure:4026: f95 -c -g conftest.f >&5 +configure:4026: $? = 0 +configure:4034: result: yes +configure:4068: checking how to get verbose linking output from f95 +configure:4078: f95 -c -g -O2 conftest.f >&5 +configure:4078: $? = 0 +configure:4096: f95 -o conftest -g -O2 -v conftest.f /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 +Using built-in specs. +Target: x86_64-linux-gnu +Thread model: posix +gcc version 4.8.4 (Ubuntu 4.8.4-2ubuntu1~14.04.1) + /usr/lib/gcc/x86_64-linux-gnu/4.8/f951 conftest.f -ffixed-form -quiet -dumpbase conftest.f -mtune=generic -march=x86-64 -auxbase conftest -g -O2 -version -fintrinsic-modules-path /usr/lib/gcc/x86_64-linux-gnu/4.8/finclude -o /tmp/ccQkW0en.s +GNU Fortran (Ubuntu 4.8.4-2ubuntu1~14.04.1) version 4.8.4 (x86_64-linux-gnu) + compiled by GNU C version 4.8.4, GMP version 5.1.3, MPFR version 3.1.2-p3, MPC version 1.0.1 +GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 +GNU Fortran (Ubuntu 4.8.4-2ubuntu1~14.04.1) version 4.8.4 (x86_64-linux-gnu) + compiled by GNU C version 4.8.4, GMP version 5.1.3, MPFR version 3.1.2-p3, MPC version 1.0.1 +GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 + as -v --64 -o /tmp/ccqxzcif.o /tmp/ccQkW0en.s +GNU assembler version 2.24 (x86_64-linux-gnu) using BFD version (GNU Binutils for Ubuntu) 2.24 +Reading specs from /usr/lib/gcc/x86_64-linux-gnu/4.8/libgfortran.spec +rename spec 
lib to liborig + /usr/lib/gcc/x86_64-linux-gnu/4.8/collect2 --sysroot=/ --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -z relro -o conftest /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu/crt1.o /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/4.8/crtbegin.o -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. /tmp/ccqxzcif.o /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -lgfortran -lm -lgcc_s -lgcc -lquadmath -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/4.8/crtend.o /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu/crtn.o +configure:4179: result: -v +configure:4181: checking for Fortran 77 libraries of f95 +configure:4204: f95 -o conftest -g -O2 -v conftest.f /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 +Using built-in specs. 
+Target: x86_64-linux-gnu +Thread model: posix +gcc version 4.8.4 (Ubuntu 4.8.4-2ubuntu1~14.04.1) + /usr/lib/gcc/x86_64-linux-gnu/4.8/f951 conftest.f -ffixed-form -quiet -dumpbase conftest.f -mtune=generic -march=x86-64 -auxbase conftest -g -O2 -version -fintrinsic-modules-path /usr/lib/gcc/x86_64-linux-gnu/4.8/finclude -o /tmp/ccgGH0gp.s +GNU Fortran (Ubuntu 4.8.4-2ubuntu1~14.04.1) version 4.8.4 (x86_64-linux-gnu) + compiled by GNU C version 4.8.4, GMP version 5.1.3, MPFR version 3.1.2-p3, MPC version 1.0.1 +GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 +GNU Fortran (Ubuntu 4.8.4-2ubuntu1~14.04.1) version 4.8.4 (x86_64-linux-gnu) + compiled by GNU C version 4.8.4, GMP version 5.1.3, MPFR version 3.1.2-p3, MPC version 1.0.1 +GGC heuristics: --param ggc-min-expand=100 --param ggc-min-heapsize=131072 + as -v --64 -o /tmp/ccqlmwrh.o /tmp/ccgGH0gp.s +GNU assembler version 2.24 (x86_64-linux-gnu) using BFD version (GNU Binutils for Ubuntu) 2.24 +Reading specs from /usr/lib/gcc/x86_64-linux-gnu/4.8/libgfortran.spec +rename spec lib to liborig + /usr/lib/gcc/x86_64-linux-gnu/4.8/collect2 --sysroot=/ --build-id --eh-frame-hdr -m elf_x86_64 --hash-style=gnu --as-needed -dynamic-linker /lib64/ld-linux-x86-64.so.2 -z relro -o conftest /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu/crt1.o /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu/crti.o /usr/lib/gcc/x86_64-linux-gnu/4.8/crtbegin.o -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. 
/tmp/ccqlmwrh.o /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -lgfortran -lm -lgcc_s -lgcc -lquadmath -lm -lgcc_s -lgcc -lc -lgcc_s -lgcc /usr/lib/gcc/x86_64-linux-gnu/4.8/crtend.o /usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu/crtn.o +configure:4400: result: -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath +configure:4454: checking for ar +configure:4470: found /usr/bin/ar +configure:4481: result: ar +configure:4509: checking for flex +configure:4525: found /usr/bin/flex +configure:4536: result: flex +configure:4574: flex conftest.l +configure:4578: $? = 0 +configure:4580: checking lex output file root +configure:4594: result: lex.yy +configure:4599: checking lex library +configure:4613: mpicc -o conftest -std=c99 -fopenmp conftest.c /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath -lm >&5 +lex.yy.c: In function 'yy_init_buffer': +lex.yy.c:1347:9: warning: implicit declaration of function 'fileno' [-Wimplicit-function-declaration] + b->yy_is_interactive = file ? (isatty( fileno(file) ) > 0) : 0; + ^ +/tmp/cc0r2Dfj.o: In function `yylex': +conftest.c:(.text+0x676): undefined reference to `yywrap' +/tmp/cc0r2Dfj.o: In function `input': +conftest.c:(.text+0x10d2): undefined reference to `yywrap' +/tmp/cc0r2Dfj.o: In function `main': +conftest.c:(.text+0x1d8d): undefined reference to `yywrap' +collect2: error: ld returned 1 exit status +configure:4613: $? 
= 1 +configure: failed program was: +| /* confdefs.h */ +| #define PACKAGE_NAME "tmLQCD" +| #define PACKAGE_TARNAME "tmlqcd" +| #define PACKAGE_VERSION "5.2.0" +| #define PACKAGE_STRING "tmLQCD 5.2.0" +| #define PACKAGE_BUGREPORT "curbach@gmx.de" +| #define PACKAGE_URL "" +| #define restrict __restrict +| /* end confdefs.h. */ +| +| #line 3 "lex.yy.c" +| +| #define YY_INT_ALIGNED short int +| +| /* A lexical scanner generated by flex */ +| +| #define FLEX_SCANNER +| #define YY_FLEX_MAJOR_VERSION 2 +| #define YY_FLEX_MINOR_VERSION 5 +| #define YY_FLEX_SUBMINOR_VERSION 35 +| #if YY_FLEX_SUBMINOR_VERSION > 0 +| #define FLEX_BETA +| #endif +| +| /* First, we deal with platform-specific or compiler-specific issues. */ +| +| /* begin standard C headers. */ +| #include +| #include +| #include +| #include +| +| /* end standard C headers. */ +| +| /* flex integer type definitions */ +| +| #ifndef FLEXINT_H +| #define FLEXINT_H +| +| /* C99 systems have . Non-C99 systems may or may not. */ +| +| #if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L +| +| /* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, +| * if you want the limit (max/min) macros for int types. +| */ +| #ifndef __STDC_LIMIT_MACROS +| #define __STDC_LIMIT_MACROS 1 +| #endif +| +| #include +| typedef int8_t flex_int8_t; +| typedef uint8_t flex_uint8_t; +| typedef int16_t flex_int16_t; +| typedef uint16_t flex_uint16_t; +| typedef int32_t flex_int32_t; +| typedef uint32_t flex_uint32_t; +| #else +| typedef signed char flex_int8_t; +| typedef short int flex_int16_t; +| typedef int flex_int32_t; +| typedef unsigned char flex_uint8_t; +| typedef unsigned short int flex_uint16_t; +| typedef unsigned int flex_uint32_t; +| +| /* Limits of integral types. 
*/ +| #ifndef INT8_MIN +| #define INT8_MIN (-128) +| #endif +| #ifndef INT16_MIN +| #define INT16_MIN (-32767-1) +| #endif +| #ifndef INT32_MIN +| #define INT32_MIN (-2147483647-1) +| #endif +| #ifndef INT8_MAX +| #define INT8_MAX (127) +| #endif +| #ifndef INT16_MAX +| #define INT16_MAX (32767) +| #endif +| #ifndef INT32_MAX +| #define INT32_MAX (2147483647) +| #endif +| #ifndef UINT8_MAX +| #define UINT8_MAX (255U) +| #endif +| #ifndef UINT16_MAX +| #define UINT16_MAX (65535U) +| #endif +| #ifndef UINT32_MAX +| #define UINT32_MAX (4294967295U) +| #endif +| +| #endif /* ! C99 */ +| +| #endif /* ! FLEXINT_H */ +| +| #ifdef __cplusplus +| +| /* The "const" storage-class-modifier is valid. */ +| #define YY_USE_CONST +| +| #else /* ! __cplusplus */ +| +| /* C99 requires __STDC__ to be defined as 1. */ +| #if defined (__STDC__) +| +| #define YY_USE_CONST +| +| #endif /* defined (__STDC__) */ +| #endif /* ! __cplusplus */ +| +| #ifdef YY_USE_CONST +| #define yyconst const +| #else +| #define yyconst +| #endif +| +| /* Returned upon end-of-file. */ +| #define YY_NULL 0 +| +| /* Promotes a possibly negative, possibly signed char to an unsigned +| * integer for use as an array index. If the signed char is negative, +| * we want to instead treat it as an 8-bit unsigned char, hence the +| * double cast. +| */ +| #define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c) +| +| /* Enter a start condition. This macro really ought to take a parameter, +| * but we do it the disgusting crufty way forced on us by the ()-less +| * definition of BEGIN. +| */ +| #define BEGIN (yy_start) = 1 + 2 * +| +| /* Translate the current start state into a value that can be later handed +| * to BEGIN to return to the state. The YYSTATE alias is for lex +| * compatibility. +| */ +| #define YY_START (((yy_start) - 1) / 2) +| #define YYSTATE YY_START +| +| /* Action number for EOF rule of a given start state. 
*/ +| #define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) +| +| /* Special action meaning "start processing a new file". */ +| #define YY_NEW_FILE yyrestart(yyin ) +| +| #define YY_END_OF_BUFFER_CHAR 0 +| +| /* Size of default input buffer. */ +| #ifndef YY_BUF_SIZE +| #ifdef __ia64__ +| /* On IA-64, the buffer size is 16k, not 8k. +| * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. +| * Ditto for the __ia64__ case accordingly. +| */ +| #define YY_BUF_SIZE 32768 +| #else +| #define YY_BUF_SIZE 16384 +| #endif /* __ia64__ */ +| #endif +| +| /* The state buf must be large enough to hold one state per character in the main buffer. +| */ +| #define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) +| +| #ifndef YY_TYPEDEF_YY_BUFFER_STATE +| #define YY_TYPEDEF_YY_BUFFER_STATE +| typedef struct yy_buffer_state *YY_BUFFER_STATE; +| #endif +| +| extern int yyleng; +| +| extern FILE *yyin, *yyout; +| +| #define EOB_ACT_CONTINUE_SCAN 0 +| #define EOB_ACT_END_OF_FILE 1 +| #define EOB_ACT_LAST_MATCH 2 +| +| #define YY_LESS_LINENO(n) +| +| /* Return all but the first "n" matched characters back to the input stream. */ +| #define yyless(n) \ +| do \ +| { \ +| /* Undo effects of setting up yytext. */ \ +| int yyless_macro_arg = (n); \ +| YY_LESS_LINENO(yyless_macro_arg);\ +| *yy_cp = (yy_hold_char); \ +| YY_RESTORE_YY_MORE_OFFSET \ +| (yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ +| YY_DO_BEFORE_ACTION; /* set up yytext again */ \ +| } \ +| while ( 0 ) +| +| #define unput(c) yyunput( c, (yytext_ptr) ) +| +| #ifndef YY_TYPEDEF_YY_SIZE_T +| #define YY_TYPEDEF_YY_SIZE_T +| typedef size_t yy_size_t; +| #endif +| +| #ifndef YY_STRUCT_YY_BUFFER_STATE +| #define YY_STRUCT_YY_BUFFER_STATE +| struct yy_buffer_state +| { +| FILE *yy_input_file; +| +| char *yy_ch_buf; /* input buffer */ +| char *yy_buf_pos; /* current position in input buffer */ +| +| /* Size of input buffer in bytes, not including room for EOB +| * characters. 
+| */ +| yy_size_t yy_buf_size; +| +| /* Number of characters read into yy_ch_buf, not including EOB +| * characters. +| */ +| int yy_n_chars; +| +| /* Whether we "own" the buffer - i.e., we know we created it, +| * and can realloc() it to grow it, and should free() it to +| * delete it. +| */ +| int yy_is_our_buffer; +| +| /* Whether this is an "interactive" input source; if so, and +| * if we're using stdio for input, then we want to use getc() +| * instead of fread(), to make sure we stop fetching input after +| * each newline. +| */ +| int yy_is_interactive; +| +| /* Whether we're considered to be at the beginning of a line. +| * If so, '^' rules will be active on the next match, otherwise +| * not. +| */ +| int yy_at_bol; +| +| int yy_bs_lineno; /**< The line count. */ +| int yy_bs_column; /**< The column count. */ +| +| /* Whether to try to fill the input buffer when we reach the +| * end of it. +| */ +| int yy_fill_buffer; +| +| int yy_buffer_status; +| +| #define YY_BUFFER_NEW 0 +| #define YY_BUFFER_NORMAL 1 +| /* When an EOF's been seen but there's still some text to process +| * then we mark the buffer as YY_EOF_PENDING, to indicate that we +| * shouldn't try reading from the input source any more. We might +| * still have a bunch of tokens to match, though, because of +| * possible backing-up. +| * +| * When we actually see the EOF, we change the status to "new" +| * (via yyrestart()), so that the user can continue scanning by +| * just pointing yyin at a new input file. +| */ +| #define YY_BUFFER_EOF_PENDING 2 +| +| }; +| #endif /* !YY_STRUCT_YY_BUFFER_STATE */ +| +| /* Stack of input buffers. */ +| static size_t yy_buffer_stack_top = 0; /**< index of top of stack. */ +| static size_t yy_buffer_stack_max = 0; /**< capacity of stack. */ +| static YY_BUFFER_STATE * yy_buffer_stack = 0; /**< Stack as an array. 
*/ +| +| /* We provide macros for accessing buffer states in case in the +| * future we want to put the buffer states in a more general +| * "scanner state". +| * +| * Returns the top of the stack, or NULL. +| */ +| #define YY_CURRENT_BUFFER ( (yy_buffer_stack) \ +| ? (yy_buffer_stack)[(yy_buffer_stack_top)] \ +| : NULL) +| +| /* Same as previous macro, but useful when we know that the buffer stack is not +| * NULL or when we need an lvalue. For internal use only. +| */ +| #define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)] +| +| /* yy_hold_char holds the character lost when yytext is formed. */ +| static char yy_hold_char; +| static int yy_n_chars; /* number of characters read into yy_ch_buf */ +| int yyleng; +| +| /* Points to current character in buffer. */ +| static char *yy_c_buf_p = (char *) 0; +| static int yy_init = 0; /* whether we need to initialize */ +| static int yy_start = 0; /* start state number */ +| +| /* Flag which is used to allow yywrap()'s to do buffer switches +| * instead of setting up a fresh yyin. A bit of a hack ... 
+| */ +| static int yy_did_buffer_switch_on_eof; +| +| void yyrestart (FILE *input_file ); +| void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ); +| YY_BUFFER_STATE yy_create_buffer (FILE *file,int size ); +| void yy_delete_buffer (YY_BUFFER_STATE b ); +| void yy_flush_buffer (YY_BUFFER_STATE b ); +| void yypush_buffer_state (YY_BUFFER_STATE new_buffer ); +| void yypop_buffer_state (void ); +| +| static void yyensure_buffer_stack (void ); +| static void yy_load_buffer_state (void ); +| static void yy_init_buffer (YY_BUFFER_STATE b,FILE *file ); +| +| #define YY_FLUSH_BUFFER yy_flush_buffer(YY_CURRENT_BUFFER ) +| +| YY_BUFFER_STATE yy_scan_buffer (char *base,yy_size_t size ); +| YY_BUFFER_STATE yy_scan_string (yyconst char *yy_str ); +| YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,int len ); +| +| void *yyalloc (yy_size_t ); +| void *yyrealloc (void *,yy_size_t ); +| void yyfree (void * ); +| +| #define yy_new_buffer yy_create_buffer +| +| #define yy_set_interactive(is_interactive) \ +| { \ +| if ( ! YY_CURRENT_BUFFER ){ \ +| yyensure_buffer_stack (); \ +| YY_CURRENT_BUFFER_LVALUE = \ +| yy_create_buffer(yyin,YY_BUF_SIZE ); \ +| } \ +| YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ +| } +| +| #define yy_set_bol(at_bol) \ +| { \ +| if ( ! 
YY_CURRENT_BUFFER ){\ +| yyensure_buffer_stack (); \ +| YY_CURRENT_BUFFER_LVALUE = \ +| yy_create_buffer(yyin,YY_BUF_SIZE ); \ +| } \ +| YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ +| } +| +| #define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) +| +| /* Begin user sect3 */ +| +| typedef unsigned char YY_CHAR; +| +| FILE *yyin = (FILE *) 0, *yyout = (FILE *) 0; +| +| typedef int yy_state_type; +| +| extern int yylineno; +| +| int yylineno = 1; +| +| extern char *yytext; +| #define yytext_ptr yytext +| +| static yy_state_type yy_get_previous_state (void ); +| static yy_state_type yy_try_NUL_trans (yy_state_type current_state ); +| static int yy_get_next_buffer (void ); +| static void yy_fatal_error (yyconst char msg[] ); +| +| /* Done after the current pattern has been matched and before the +| * corresponding action - sets up yytext. +| */ +| #define YY_DO_BEFORE_ACTION \ +| (yytext_ptr) = yy_bp; \ +| (yytext_ptr) -= (yy_more_len); \ +| yyleng = (size_t) (yy_cp - (yytext_ptr)); \ +| (yy_hold_char) = *yy_cp; \ +| *yy_cp = '\0'; \ +| (yy_c_buf_p) = yy_cp; +| +| #define YY_NUM_RULES 8 +| #define YY_END_OF_BUFFER 9 +| /* This struct is not used in this scanner, +| but its presence is necessary. 
*/ +| struct yy_trans_info +| { +| flex_int32_t yy_verify; +| flex_int32_t yy_nxt; +| }; +| static yyconst flex_int16_t yy_acclist[23] = +| { 0, +| 9, 7, 8, 8, 1, 7, 8, 2, 7, 8, +| 3, 7, 8, 4, 7, 8, 5, 7, 8, 6, +| 7, 8 +| } ; +| +| static yyconst flex_int16_t yy_accept[14] = +| { 0, +| 1, 1, 1, 2, 4, 5, 8, 11, 14, 17, +| 20, 23, 23 +| } ; +| +| static yyconst flex_int32_t yy_ec[256] = +| { 0, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 3, 4, 5, 6, +| +| 7, 8, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, +| 1, 1, 1, 1, 1 +| } ; +| +| static yyconst flex_int32_t yy_meta[9] = +| { 0, +| 1, 1, 1, 1, 1, 1, 1, 1 +| } ; +| +| static yyconst flex_int16_t yy_base[13] = +| { 0, +| 0, 0, 9, 10, 10, 10, 10, 10, 10, 10, +| 10, 10 +| } ; +| +| static yyconst flex_int16_t yy_def[13] = +| { 0, +| 12, 1, 12, 12, 12, 12, 12, 12, 12, 12, +| 12, 0 +| } ; +| +| static yyconst flex_int16_t yy_nxt[19] = +| { 0, +| 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, +| 12, 12, 12, 12, 12, 12, 12, 12 +| } ; +| +| static yyconst flex_int16_t yy_chk[19] = +| { 0, +| 1, 1, 1, 1, 1, 1, 1, 1, 3, 12, +| 12, 12, 12, 12, 12, 12, 12, 12 +| } ; +| +| extern int yy_flex_debug; +| int yy_flex_debug = 0; +| +| static yy_state_type *yy_state_buf=0, *yy_state_ptr=0; +| static char *yy_full_match; +| static int yy_lp; 
+| #define REJECT \ +| { \ +| *yy_cp = (yy_hold_char); /* undo effects of setting up yytext */ \ +| yy_cp = (yy_full_match); /* restore poss. backed-over text */ \ +| ++(yy_lp); \ +| goto find_rule; \ +| } +| +| static int yy_more_flag = 0; +| static int yy_more_len = 0; +| #define yymore() ((yy_more_flag) = 1) +| #define YY_MORE_ADJ (yy_more_len) +| #define YY_RESTORE_YY_MORE_OFFSET +| char *yytext; +| #line 1 "conftest.l" +| #line 477 "lex.yy.c" +| +| #define INITIAL 0 +| +| #ifndef YY_NO_UNISTD_H +| /* Special case for "unistd.h", since it is non-ANSI. We include it way +| * down here because we want the user's section 1 to have been scanned first. +| * The user has a chance to override it with an option. +| */ +| #include +| #endif +| +| #ifndef YY_EXTRA_TYPE +| #define YY_EXTRA_TYPE void * +| #endif +| +| static int yy_init_globals (void ); +| +| /* Accessor methods to globals. +| These are made visible to non-reentrant scanners for convenience. */ +| +| int yylex_destroy (void ); +| +| int yyget_debug (void ); +| +| void yyset_debug (int debug_flag ); +| +| YY_EXTRA_TYPE yyget_extra (void ); +| +| void yyset_extra (YY_EXTRA_TYPE user_defined ); +| +| FILE *yyget_in (void ); +| +| void yyset_in (FILE * in_str ); +| +| FILE *yyget_out (void ); +| +| void yyset_out (FILE * out_str ); +| +| int yyget_leng (void ); +| +| char *yyget_text (void ); +| +| int yyget_lineno (void ); +| +| void yyset_lineno (int line_number ); +| +| /* Macros after this point can all be overridden by user definitions in +| * section 1. 
+| */ +| +| #ifndef YY_SKIP_YYWRAP +| #ifdef __cplusplus +| extern "C" int yywrap (void ); +| #else +| extern int yywrap (void ); +| #endif +| #endif +| +| static void yyunput (int c,char *buf_ptr ); +| +| #ifndef yytext_ptr +| static void yy_flex_strncpy (char *,yyconst char *,int ); +| #endif +| +| #ifdef YY_NEED_STRLEN +| static int yy_flex_strlen (yyconst char * ); +| #endif +| +| #ifndef YY_NO_INPUT +| +| #ifdef __cplusplus +| static int yyinput (void ); +| #else +| static int input (void ); +| #endif +| +| #endif +| +| /* Amount of stuff to slurp up with each read. */ +| #ifndef YY_READ_BUF_SIZE +| #ifdef __ia64__ +| /* On IA-64, the buffer size is 16k, not 8k */ +| #define YY_READ_BUF_SIZE 16384 +| #else +| #define YY_READ_BUF_SIZE 8192 +| #endif /* __ia64__ */ +| #endif +| +| /* Copy whatever the last rule matched to the standard output. */ +| #ifndef ECHO +| /* This used to be an fputs(), but since the string might contain NUL's, +| * we now use fwrite(). +| */ +| #define ECHO do { if (fwrite( yytext, yyleng, 1, yyout )) {} } while (0) +| #endif +| +| /* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, +| * is returned in "result". 
+| */ +| #ifndef YY_INPUT +| #define YY_INPUT(buf,result,max_size) \ +| if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ +| { \ +| int c = '*'; \ +| size_t n; \ +| for ( n = 0; n < max_size && \ +| (c = getc( yyin )) != EOF && c != '\n'; ++n ) \ +| buf[n] = (char) c; \ +| if ( c == '\n' ) \ +| buf[n++] = (char) c; \ +| if ( c == EOF && ferror( yyin ) ) \ +| YY_FATAL_ERROR( "input in flex scanner failed" ); \ +| result = n; \ +| } \ +| else \ +| { \ +| errno=0; \ +| while ( (result = fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \ +| { \ +| if( errno != EINTR) \ +| { \ +| YY_FATAL_ERROR( "input in flex scanner failed" ); \ +| break; \ +| } \ +| errno=0; \ +| clearerr(yyin); \ +| } \ +| }\ +| \ +| +| #endif +| +| /* No semi-colon after return; correct usage is to write "yyterminate();" - +| * we don't want an extra ';' after the "return" because that will cause +| * some compilers to complain about unreachable statements. +| */ +| #ifndef yyterminate +| #define yyterminate() return YY_NULL +| #endif +| +| /* Number of entries by which start-condition stack grows. */ +| #ifndef YY_START_STACK_INCR +| #define YY_START_STACK_INCR 25 +| #endif +| +| /* Report a fatal error. */ +| #ifndef YY_FATAL_ERROR +| #define YY_FATAL_ERROR(msg) yy_fatal_error( msg ) +| #endif +| +| /* end tables serialization structures and prototypes */ +| +| /* Default declaration of generated scanner - a define so the user can +| * easily add parameters. +| */ +| #ifndef YY_DECL +| #define YY_DECL_IS_OURS 1 +| +| extern int yylex (void); +| +| #define YY_DECL int yylex (void) +| #endif /* !YY_DECL */ +| +| /* Code executed at the beginning of each rule, after yytext and yyleng +| * have been set up. +| */ +| #ifndef YY_USER_ACTION +| #define YY_USER_ACTION +| #endif +| +| /* Code executed at the end of each rule. */ +| #ifndef YY_BREAK +| #define YY_BREAK break; +| #endif +| +| #define YY_RULE_SETUP \ +| YY_USER_ACTION +| +| /** The main scanner function which does all the work. 
+| */ +| YY_DECL +| { +| register yy_state_type yy_current_state; +| register char *yy_cp, *yy_bp; +| register int yy_act; +| +| #line 1 "conftest.l" +| +| #line 666 "lex.yy.c" +| +| if ( !(yy_init) ) +| { +| (yy_init) = 1; +| +| #ifdef YY_USER_INIT +| YY_USER_INIT; +| #endif +| +| /* Create the reject buffer large enough to save one state per allowed character. */ +| if ( ! (yy_state_buf) ) +| (yy_state_buf) = (yy_state_type *)yyalloc(YY_STATE_BUF_SIZE ); +| if ( ! (yy_state_buf) ) +| YY_FATAL_ERROR( "out of dynamic memory in yylex()" ); +| +| if ( ! (yy_start) ) +| (yy_start) = 1; /* first start state */ +| +| if ( ! yyin ) +| yyin = stdin; +| +| if ( ! yyout ) +| yyout = stdout; +| +| if ( ! YY_CURRENT_BUFFER ) { +| yyensure_buffer_stack (); +| YY_CURRENT_BUFFER_LVALUE = +| yy_create_buffer(yyin,YY_BUF_SIZE ); +| } +| +| yy_load_buffer_state( ); +| } +| +| while ( 1 ) /* loops until end-of-file is reached */ +| { +| (yy_more_len) = 0; +| if ( (yy_more_flag) ) +| { +| (yy_more_len) = (yy_c_buf_p) - (yytext_ptr); +| (yy_more_flag) = 0; +| } +| yy_cp = (yy_c_buf_p); +| +| /* Support of yytext. */ +| *yy_cp = (yy_hold_char); +| +| /* yy_bp points to the position in yy_ch_buf of the start of +| * the current run. 
+| */ +| yy_bp = yy_cp; +| +| yy_current_state = (yy_start); +| +| (yy_state_ptr) = (yy_state_buf); +| *(yy_state_ptr)++ = yy_current_state; +| +| yy_match: +| do +| { +| register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)]; +| while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) +| { +| yy_current_state = (int) yy_def[yy_current_state]; +| if ( yy_current_state >= 13 ) +| yy_c = yy_meta[(unsigned int) yy_c]; +| } +| yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; +| *(yy_state_ptr)++ = yy_current_state; +| ++yy_cp; +| } +| while ( yy_base[yy_current_state] != 10 ); +| +| yy_find_action: +| yy_current_state = *--(yy_state_ptr); +| (yy_lp) = yy_accept[yy_current_state]; +| find_rule: /* we branch to this label when backing up */ +| for ( ; ; ) /* until we find what rule we matched */ +| { +| if ( (yy_lp) && (yy_lp) < yy_accept[yy_current_state + 1] ) +| { +| yy_act = yy_acclist[(yy_lp)]; +| { +| (yy_full_match) = yy_cp; +| break; +| } +| } +| --yy_cp; +| yy_current_state = *--(yy_state_ptr); +| (yy_lp) = yy_accept[yy_current_state]; +| } +| +| YY_DO_BEFORE_ACTION; +| +| do_action: /* This label is used only to access EOF actions. */ +| +| switch ( yy_act ) +| { /* beginning of action switch */ +| case 1: +| YY_RULE_SETUP +| #line 2 "conftest.l" +| { ECHO; } +| YY_BREAK +| case 2: +| YY_RULE_SETUP +| #line 3 "conftest.l" +| { REJECT; } +| YY_BREAK +| case 3: +| YY_RULE_SETUP +| #line 4 "conftest.l" +| { yymore (); } +| YY_BREAK +| case 4: +| YY_RULE_SETUP +| #line 5 "conftest.l" +| { yyless (1); } +| YY_BREAK +| case 5: +| YY_RULE_SETUP +| #line 6 "conftest.l" +| { /* IRIX 6.5 flex 2.5.4 underquotes its yyless argument. 
*/ +| yyless ((input () != 0)); } +| YY_BREAK +| case 6: +| YY_RULE_SETUP +| #line 8 "conftest.l" +| { unput (yytext[0]); } +| YY_BREAK +| case 7: +| YY_RULE_SETUP +| #line 9 "conftest.l" +| { BEGIN INITIAL; } +| YY_BREAK +| case 8: +| YY_RULE_SETUP +| #line 10 "conftest.l" +| ECHO; +| YY_BREAK +| #line 805 "lex.yy.c" +| case YY_STATE_EOF(INITIAL): +| yyterminate(); +| +| case YY_END_OF_BUFFER: +| { +| /* Amount of text matched not including the EOB char. */ +| int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1; +| +| /* Undo the effects of YY_DO_BEFORE_ACTION. */ +| *yy_cp = (yy_hold_char); +| YY_RESTORE_YY_MORE_OFFSET +| +| if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) +| { +| /* We're scanning a new file or input source. It's +| * possible that this happened because the user +| * just pointed yyin at a new source and called +| * yylex(). If so, then we have to assure +| * consistency between YY_CURRENT_BUFFER and our +| * globals. Here is the right place to do so, because +| * this is the first action (other than possibly a +| * back-up) that will match for the new input source. +| */ +| (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; +| YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; +| YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; +| } +| +| /* Note that here we test for yy_c_buf_p "<=" to the position +| * of the first EOB in the buffer, since yy_c_buf_p will +| * already have been incremented past the NUL character +| * (since all states make transitions on EOB to the +| * end-of-buffer state). Contrast this with the test +| * in input(). +| */ +| if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) +| { /* This was really a NUL. */ +| yy_state_type yy_next_state; +| +| (yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text; +| +| yy_current_state = yy_get_previous_state( ); +| +| /* Okay, we're now positioned to make the NUL +| * transition. 
We couldn't have +| * yy_get_previous_state() go ahead and do it +| * for us because it doesn't know how to deal +| * with the possibility of jamming (and we don't +| * want to build jamming into it because then it +| * will run more slowly). +| */ +| +| yy_next_state = yy_try_NUL_trans( yy_current_state ); +| +| yy_bp = (yytext_ptr) + YY_MORE_ADJ; +| +| if ( yy_next_state ) +| { +| /* Consume the NUL. */ +| yy_cp = ++(yy_c_buf_p); +| yy_current_state = yy_next_state; +| goto yy_match; +| } +| +| else +| { +| yy_cp = (yy_c_buf_p); +| goto yy_find_action; +| } +| } +| +| else switch ( yy_get_next_buffer( ) ) +| { +| case EOB_ACT_END_OF_FILE: +| { +| (yy_did_buffer_switch_on_eof) = 0; +| +| if ( yywrap( ) ) +| { +| /* Note: because we've taken care in +| * yy_get_next_buffer() to have set up +| * yytext, we can now set up +| * yy_c_buf_p so that if some total +| * hoser (like flex itself) wants to +| * call the scanner after we return the +| * YY_NULL, it'll still work - another +| * YY_NULL will get returned. +| */ +| (yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ; +| +| yy_act = YY_STATE_EOF(YY_START); +| goto do_action; +| } +| +| else +| { +| if ( ! 
(yy_did_buffer_switch_on_eof) ) +| YY_NEW_FILE; +| } +| break; +| } +| +| case EOB_ACT_CONTINUE_SCAN: +| (yy_c_buf_p) = +| (yytext_ptr) + yy_amount_of_matched_text; +| +| yy_current_state = yy_get_previous_state( ); +| +| yy_cp = (yy_c_buf_p); +| yy_bp = (yytext_ptr) + YY_MORE_ADJ; +| goto yy_match; +| +| case EOB_ACT_LAST_MATCH: +| (yy_c_buf_p) = +| &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)]; +| +| yy_current_state = yy_get_previous_state( ); +| +| yy_cp = (yy_c_buf_p); +| yy_bp = (yytext_ptr) + YY_MORE_ADJ; +| goto yy_find_action; +| } +| break; +| } +| +| default: +| YY_FATAL_ERROR( +| "fatal flex scanner internal error--no action found" ); +| } /* end of action switch */ +| } /* end of scanning one token */ +| } /* end of yylex */ +| +| /* yy_get_next_buffer - try to read in a new buffer +| * +| * Returns a code representing an action: +| * EOB_ACT_LAST_MATCH - +| * EOB_ACT_CONTINUE_SCAN - continue scanning from current position +| * EOB_ACT_END_OF_FILE - end of file +| */ +| static int yy_get_next_buffer (void) +| { +| register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; +| register char *source = (yytext_ptr); +| register int number_to_move, i; +| int ret_val; +| +| if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] ) +| YY_FATAL_ERROR( +| "fatal flex scanner internal error--end of buffer missed" ); +| +| if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) +| { /* Don't try to fill the buffer, so this is an EOF. */ +| if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 ) +| { +| /* We matched a single character, the EOB, so +| * treat this as a final EOF. +| */ +| return EOB_ACT_END_OF_FILE; +| } +| +| else +| { +| /* We matched some text prior to the EOB, first +| * process it. +| */ +| return EOB_ACT_LAST_MATCH; +| } +| } +| +| /* Try to read more data. */ +| +| /* First move last chars to start of buffer. 
*/ +| number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr)) - 1; +| +| for ( i = 0; i < number_to_move; ++i ) +| *(dest++) = *(source++); +| +| if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) +| /* don't do the read, it's not guaranteed to return an EOF, +| * just force an EOF +| */ +| YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0; +| +| else +| { +| int num_to_read = +| YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; +| +| while ( num_to_read <= 0 ) +| { /* Not enough room in the buffer - grow it. */ +| +| YY_FATAL_ERROR( +| "input buffer overflow, can't enlarge buffer because scanner uses REJECT" ); +| +| } +| +| if ( num_to_read > YY_READ_BUF_SIZE ) +| num_to_read = YY_READ_BUF_SIZE; +| +| /* Read in more data. */ +| YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), +| (yy_n_chars), (size_t) num_to_read ); +| +| YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); +| } +| +| if ( (yy_n_chars) == 0 ) +| { +| if ( number_to_move == YY_MORE_ADJ ) +| { +| ret_val = EOB_ACT_END_OF_FILE; +| yyrestart(yyin ); +| } +| +| else +| { +| ret_val = EOB_ACT_LAST_MATCH; +| YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = +| YY_BUFFER_EOF_PENDING; +| } +| } +| +| else +| ret_val = EOB_ACT_CONTINUE_SCAN; +| +| if ((yy_size_t) ((yy_n_chars) + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { +| /* Extend the array by 50%, plus the number we really need. */ +| yy_size_t new_size = (yy_n_chars) + number_to_move + ((yy_n_chars) >> 1); +| YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) yyrealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ); +| if ( ! 
YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) +| YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); +| } +| +| (yy_n_chars) += number_to_move; +| YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR; +| YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR; +| +| (yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; +| +| return ret_val; +| } +| +| /* yy_get_previous_state - get the state just before the EOB char was reached */ +| +| static yy_state_type yy_get_previous_state (void) +| { +| register yy_state_type yy_current_state; +| register char *yy_cp; +| +| yy_current_state = (yy_start); +| +| (yy_state_ptr) = (yy_state_buf); +| *(yy_state_ptr)++ = yy_current_state; +| +| for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp ) +| { +| register YY_CHAR yy_c = (*yy_cp ? yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); +| while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) +| { +| yy_current_state = (int) yy_def[yy_current_state]; +| if ( yy_current_state >= 13 ) +| yy_c = yy_meta[(unsigned int) yy_c]; +| } +| yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; +| *(yy_state_ptr)++ = yy_current_state; +| } +| +| return yy_current_state; +| } +| +| /* yy_try_NUL_trans - try to make a transition on the NUL character +| * +| * synopsis +| * next_state = yy_try_NUL_trans( current_state ); +| */ +| static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state ) +| { +| register int yy_is_jam; +| +| register YY_CHAR yy_c = 1; +| while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) +| { +| yy_current_state = (int) yy_def[yy_current_state]; +| if ( yy_current_state >= 13 ) +| yy_c = yy_meta[(unsigned int) yy_c]; +| } +| yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; +| yy_is_jam = (yy_current_state == 12); +| if ( ! yy_is_jam ) +| *(yy_state_ptr)++ = yy_current_state; +| +| return yy_is_jam ? 
0 : yy_current_state; +| } +| +| static void yyunput (int c, register char * yy_bp ) +| { +| register char *yy_cp; +| +| yy_cp = (yy_c_buf_p); +| +| /* undo effects of setting up yytext */ +| *yy_cp = (yy_hold_char); +| +| if ( yy_cp < YY_CURRENT_BUFFER_LVALUE->yy_ch_buf + 2 ) +| { /* need to shift things up to make room */ +| /* +2 for EOB chars. */ +| register int number_to_move = (yy_n_chars) + 2; +| register char *dest = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[ +| YY_CURRENT_BUFFER_LVALUE->yy_buf_size + 2]; +| register char *source = +| &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]; +| +| while ( source > YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) +| *--dest = *--source; +| +| yy_cp += (int) (dest - source); +| yy_bp += (int) (dest - source); +| YY_CURRENT_BUFFER_LVALUE->yy_n_chars = +| (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_buf_size; +| +| if ( yy_cp < YY_CURRENT_BUFFER_LVALUE->yy_ch_buf + 2 ) +| YY_FATAL_ERROR( "flex scanner push-back overflow" ); +| } +| +| *--yy_cp = (char) c; +| +| (yytext_ptr) = yy_bp; +| (yy_hold_char) = *yy_cp; +| (yy_c_buf_p) = yy_cp; +| } +| +| #ifndef YY_NO_INPUT +| #ifdef __cplusplus +| static int yyinput (void) +| #else +| static int input (void) +| #endif +| +| { +| int c; +| +| *(yy_c_buf_p) = (yy_hold_char); +| +| if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR ) +| { +| /* yy_c_buf_p now points to the character we want to return. +| * If this occurs *before* the EOB characters, then it's a +| * valid NUL; if not, then we've hit the end of the buffer. +| */ +| if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) +| /* This was really a NUL. */ +| *(yy_c_buf_p) = '\0'; +| +| else +| { /* need more input */ +| int offset = (yy_c_buf_p) - (yytext_ptr); +| ++(yy_c_buf_p); +| +| switch ( yy_get_next_buffer( ) ) +| { +| case EOB_ACT_LAST_MATCH: +| /* This happens because yy_g_n_b() +| * sees that we've accumulated a +| * token and flags that we need to +| * try matching the token before +| * proceeding. 
But for input(), +| * there's no matching to consider. +| * So convert the EOB_ACT_LAST_MATCH +| * to EOB_ACT_END_OF_FILE. +| */ +| +| /* Reset buffer status. */ +| yyrestart(yyin ); +| +| /*FALLTHROUGH*/ +| +| case EOB_ACT_END_OF_FILE: +| { +| if ( yywrap( ) ) +| return EOF; +| +| if ( ! (yy_did_buffer_switch_on_eof) ) +| YY_NEW_FILE; +| #ifdef __cplusplus +| return yyinput(); +| #else +| return input(); +| #endif +| } +| +| case EOB_ACT_CONTINUE_SCAN: +| (yy_c_buf_p) = (yytext_ptr) + offset; +| break; +| } +| } +| } +| +| c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */ +| *(yy_c_buf_p) = '\0'; /* preserve yytext */ +| (yy_hold_char) = *++(yy_c_buf_p); +| +| return c; +| } +| #endif /* ifndef YY_NO_INPUT */ +| +| /** Immediately switch to a different input stream. +| * @param input_file A readable stream. +| * +| * @note This function does not reset the start condition to @c INITIAL . +| */ +| void yyrestart (FILE * input_file ) +| { +| +| if ( ! YY_CURRENT_BUFFER ){ +| yyensure_buffer_stack (); +| YY_CURRENT_BUFFER_LVALUE = +| yy_create_buffer(yyin,YY_BUF_SIZE ); +| } +| +| yy_init_buffer(YY_CURRENT_BUFFER,input_file ); +| yy_load_buffer_state( ); +| } +| +| /** Switch to a different input buffer. +| * @param new_buffer The new input buffer. +| * +| */ +| void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ) +| { +| +| /* TODO. We should be able to replace this entire function body +| * with +| * yypop_buffer_state(); +| * yypush_buffer_state(new_buffer); +| */ +| yyensure_buffer_stack (); +| if ( YY_CURRENT_BUFFER == new_buffer ) +| return; +| +| if ( YY_CURRENT_BUFFER ) +| { +| /* Flush out information for old buffer. 
*/ +| *(yy_c_buf_p) = (yy_hold_char); +| YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); +| YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); +| } +| +| YY_CURRENT_BUFFER_LVALUE = new_buffer; +| yy_load_buffer_state( ); +| +| /* We don't actually know whether we did this switch during +| * EOF (yywrap()) processing, but the only time this flag +| * is looked at is after yywrap() is called, so it's safe +| * to go ahead and always set it. +| */ +| (yy_did_buffer_switch_on_eof) = 1; +| } +| +| static void yy_load_buffer_state (void) +| { +| (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; +| (yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; +| yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; +| (yy_hold_char) = *(yy_c_buf_p); +| } +| +| /** Allocate and initialize an input buffer state. +| * @param file A readable stream. +| * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. +| * +| * @return the allocated buffer state. +| */ +| YY_BUFFER_STATE yy_create_buffer (FILE * file, int size ) +| { +| YY_BUFFER_STATE b; +| +| b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) ); +| if ( ! b ) +| YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); +| +| b->yy_buf_size = size; +| +| /* yy_ch_buf has to be 2 characters longer than the size given because +| * we need to put in 2 end-of-buffer characters. +| */ +| b->yy_ch_buf = (char *) yyalloc(b->yy_buf_size + 2 ); +| if ( ! b->yy_ch_buf ) +| YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); +| +| b->yy_is_our_buffer = 1; +| +| yy_init_buffer(b,file ); +| +| return b; +| } +| +| /** Destroy the buffer. +| * @param b a buffer created with yy_create_buffer() +| * +| */ +| void yy_delete_buffer (YY_BUFFER_STATE b ) +| { +| +| if ( ! b ) +| return; +| +| if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. 
*/ +| YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; +| +| if ( b->yy_is_our_buffer ) +| yyfree((void *) b->yy_ch_buf ); +| +| yyfree((void *) b ); +| } +| +| #ifndef __cplusplus +| extern int isatty (int ); +| #endif /* __cplusplus */ +| +| /* Initializes or reinitializes a buffer. +| * This function is sometimes called more than once on the same buffer, +| * such as during a yyrestart() or at EOF. +| */ +| static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file ) +| +| { +| int oerrno = errno; +| +| yy_flush_buffer(b ); +| +| b->yy_input_file = file; +| b->yy_fill_buffer = 1; +| +| /* If b is the current buffer, then yy_init_buffer was _probably_ +| * called from yyrestart() or through yy_get_next_buffer. +| * In that case, we don't want to reset the lineno or column. +| */ +| if (b != YY_CURRENT_BUFFER){ +| b->yy_bs_lineno = 1; +| b->yy_bs_column = 0; +| } +| +| b->yy_is_interactive = file ? (isatty( fileno(file) ) > 0) : 0; +| +| errno = oerrno; +| } +| +| /** Discard all buffered characters. On the next scan, YY_INPUT will be called. +| * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. +| * +| */ +| void yy_flush_buffer (YY_BUFFER_STATE b ) +| { +| if ( ! b ) +| return; +| +| b->yy_n_chars = 0; +| +| /* We always need two end-of-buffer characters. The first causes +| * a transition to the end-of-buffer state. The second causes +| * a jam in that state. +| */ +| b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; +| b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; +| +| b->yy_buf_pos = &b->yy_ch_buf[0]; +| +| b->yy_at_bol = 1; +| b->yy_buffer_status = YY_BUFFER_NEW; +| +| if ( b == YY_CURRENT_BUFFER ) +| yy_load_buffer_state( ); +| } +| +| /** Pushes the new state onto the stack. The new state becomes +| * the current state. This function will allocate the stack +| * if necessary. +| * @param new_buffer The new state. 
+| * +| */ +| void yypush_buffer_state (YY_BUFFER_STATE new_buffer ) +| { +| if (new_buffer == NULL) +| return; +| +| yyensure_buffer_stack(); +| +| /* This block is copied from yy_switch_to_buffer. */ +| if ( YY_CURRENT_BUFFER ) +| { +| /* Flush out information for old buffer. */ +| *(yy_c_buf_p) = (yy_hold_char); +| YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); +| YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); +| } +| +| /* Only push if top exists. Otherwise, replace top. */ +| if (YY_CURRENT_BUFFER) +| (yy_buffer_stack_top)++; +| YY_CURRENT_BUFFER_LVALUE = new_buffer; +| +| /* copied from yy_switch_to_buffer. */ +| yy_load_buffer_state( ); +| (yy_did_buffer_switch_on_eof) = 1; +| } +| +| /** Removes and deletes the top of the stack, if present. +| * The next element becomes the new top. +| * +| */ +| void yypop_buffer_state (void) +| { +| if (!YY_CURRENT_BUFFER) +| return; +| +| yy_delete_buffer(YY_CURRENT_BUFFER ); +| YY_CURRENT_BUFFER_LVALUE = NULL; +| if ((yy_buffer_stack_top) > 0) +| --(yy_buffer_stack_top); +| +| if (YY_CURRENT_BUFFER) { +| yy_load_buffer_state( ); +| (yy_did_buffer_switch_on_eof) = 1; +| } +| } +| +| /* Allocates the stack if it does not exist. +| * Guarantees space for at least one push. +| */ +| static void yyensure_buffer_stack (void) +| { +| int num_to_alloc; +| +| if (!(yy_buffer_stack)) { +| +| /* First allocation is just for 2 elements, since we don't know if this +| * scanner will even need a stack. We use 2 instead of 1 to avoid an +| * immediate realloc on the next call. +| */ +| num_to_alloc = 1; +| (yy_buffer_stack) = (struct yy_buffer_state**)yyalloc +| (num_to_alloc * sizeof(struct yy_buffer_state*) +| ); +| if ( ! 
(yy_buffer_stack) ) +| YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); +| +| memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*)); +| +| (yy_buffer_stack_max) = num_to_alloc; +| (yy_buffer_stack_top) = 0; +| return; +| } +| +| if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){ +| +| /* Increase the buffer to prepare for a possible push. */ +| int grow_size = 8 /* arbitrary grow size */; +| +| num_to_alloc = (yy_buffer_stack_max) + grow_size; +| (yy_buffer_stack) = (struct yy_buffer_state**)yyrealloc +| ((yy_buffer_stack), +| num_to_alloc * sizeof(struct yy_buffer_state*) +| ); +| if ( ! (yy_buffer_stack) ) +| YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" ); +| +| /* zero only the new slots.*/ +| memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*)); +| (yy_buffer_stack_max) = num_to_alloc; +| } +| } +| +| /** Setup the input buffer state to scan directly from a user-specified character buffer. +| * @param base the character buffer +| * @param size the size in bytes of the character buffer +| * +| * @return the newly allocated buffer state object. +| */ +| YY_BUFFER_STATE yy_scan_buffer (char * base, yy_size_t size ) +| { +| YY_BUFFER_STATE b; +| +| if ( size < 2 || +| base[size-2] != YY_END_OF_BUFFER_CHAR || +| base[size-1] != YY_END_OF_BUFFER_CHAR ) +| /* They forgot to leave room for the EOB's. */ +| return 0; +| +| b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) ); +| if ( ! 
b ) +| YY_FATAL_ERROR( "out of dynamic memory in yy_scan_buffer()" ); +| +| b->yy_buf_size = size - 2; /* "- 2" to take care of EOB's */ +| b->yy_buf_pos = b->yy_ch_buf = base; +| b->yy_is_our_buffer = 0; +| b->yy_input_file = 0; +| b->yy_n_chars = b->yy_buf_size; +| b->yy_is_interactive = 0; +| b->yy_at_bol = 1; +| b->yy_fill_buffer = 0; +| b->yy_buffer_status = YY_BUFFER_NEW; +| +| yy_switch_to_buffer(b ); +| +| return b; +| } +| +| /** Setup the input buffer state to scan a string. The next call to yylex() will +| * scan from a @e copy of @a str. +| * @param yystr a NUL-terminated string to scan +| * +| * @return the newly allocated buffer state object. +| * @note If you want to scan bytes that may contain NUL values, then use +| * yy_scan_bytes() instead. +| */ +| YY_BUFFER_STATE yy_scan_string (yyconst char * yystr ) +| { +| +| return yy_scan_bytes(yystr,strlen(yystr) ); +| } +| +| /** Setup the input buffer state to scan the given bytes. The next call to yylex() will +| * scan from a @e copy of @a bytes. +| * @param yybytes the byte buffer to scan +| * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes. +| * +| * @return the newly allocated buffer state object. +| */ +| YY_BUFFER_STATE yy_scan_bytes (yyconst char * yybytes, int _yybytes_len ) +| { +| YY_BUFFER_STATE b; +| char *buf; +| yy_size_t n; +| int i; +| +| /* Get memory for full buffer, including space for trailing EOB's. */ +| n = _yybytes_len + 2; +| buf = (char *) yyalloc(n ); +| if ( ! buf ) +| YY_FATAL_ERROR( "out of dynamic memory in yy_scan_bytes()" ); +| +| for ( i = 0; i < _yybytes_len; ++i ) +| buf[i] = yybytes[i]; +| +| buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR; +| +| b = yy_scan_buffer(buf,n ); +| if ( ! b ) +| YY_FATAL_ERROR( "bad buffer in yy_scan_bytes()" ); +| +| /* It's okay to grow etc. this buffer, and we should throw it +| * away when we're done. 
+| */ +| b->yy_is_our_buffer = 1; +| +| return b; +| } +| +| #ifndef YY_EXIT_FAILURE +| #define YY_EXIT_FAILURE 2 +| #endif +| +| static void yy_fatal_error (yyconst char* msg ) +| { +| (void) fprintf( stderr, "%s\n", msg ); +| exit( YY_EXIT_FAILURE ); +| } +| +| /* Redefine yyless() so it works in section 3 code. */ +| +| #undef yyless +| #define yyless(n) \ +| do \ +| { \ +| /* Undo effects of setting up yytext. */ \ +| int yyless_macro_arg = (n); \ +| YY_LESS_LINENO(yyless_macro_arg);\ +| yytext[yyleng] = (yy_hold_char); \ +| (yy_c_buf_p) = yytext + yyless_macro_arg; \ +| (yy_hold_char) = *(yy_c_buf_p); \ +| *(yy_c_buf_p) = '\0'; \ +| yyleng = yyless_macro_arg; \ +| } \ +| while ( 0 ) +| +| /* Accessor methods (get/set functions) to struct members. */ +| +| /** Get the current line number. +| * +| */ +| int yyget_lineno (void) +| { +| +| return yylineno; +| } +| +| /** Get the input stream. +| * +| */ +| FILE *yyget_in (void) +| { +| return yyin; +| } +| +| /** Get the output stream. +| * +| */ +| FILE *yyget_out (void) +| { +| return yyout; +| } +| +| /** Get the length of the current token. +| * +| */ +| int yyget_leng (void) +| { +| return yyleng; +| } +| +| /** Get the current token. +| * +| */ +| +| char *yyget_text (void) +| { +| return yytext; +| } +| +| /** Set the current line number. +| * @param line_number +| * +| */ +| void yyset_lineno (int line_number ) +| { +| +| yylineno = line_number; +| } +| +| /** Set the input stream. This does not discard the current +| * input buffer. +| * @param in_str A readable stream. +| * +| * @see yy_switch_to_buffer +| */ +| void yyset_in (FILE * in_str ) +| { +| yyin = in_str ; +| } +| +| void yyset_out (FILE * out_str ) +| { +| yyout = out_str ; +| } +| +| int yyget_debug (void) +| { +| return yy_flex_debug; +| } +| +| void yyset_debug (int bdebug ) +| { +| yy_flex_debug = bdebug ; +| } +| +| static int yy_init_globals (void) +| { +| /* Initialization is the same as for the non-reentrant scanner. 
+| * This function is called from yylex_destroy(), so don't allocate here. +| */ +| +| (yy_buffer_stack) = 0; +| (yy_buffer_stack_top) = 0; +| (yy_buffer_stack_max) = 0; +| (yy_c_buf_p) = (char *) 0; +| (yy_init) = 0; +| (yy_start) = 0; +| +| (yy_state_buf) = 0; +| (yy_state_ptr) = 0; +| (yy_full_match) = 0; +| (yy_lp) = 0; +| +| /* Defined in main.c */ +| #ifdef YY_STDINIT +| yyin = stdin; +| yyout = stdout; +| #else +| yyin = (FILE *) 0; +| yyout = (FILE *) 0; +| #endif +| +| /* For future reference: Set errno on error, since we are called by +| * yylex_init() +| */ +| return 0; +| } +| +| /* yylex_destroy is for both reentrant and non-reentrant scanners. */ +| int yylex_destroy (void) +| { +| +| /* Pop the buffer stack, destroying each element. */ +| while(YY_CURRENT_BUFFER){ +| yy_delete_buffer(YY_CURRENT_BUFFER ); +| YY_CURRENT_BUFFER_LVALUE = NULL; +| yypop_buffer_state(); +| } +| +| /* Destroy the stack itself. */ +| yyfree((yy_buffer_stack) ); +| (yy_buffer_stack) = NULL; +| +| yyfree ( (yy_state_buf) ); +| (yy_state_buf) = NULL; +| +| /* Reset the globals. This is important in a non-reentrant scanner so the next time +| * yylex() is called, initialization will occur. */ +| yy_init_globals( ); +| +| return 0; +| } +| +| /* +| * Internal utility routines. +| */ +| +| #ifndef yytext_ptr +| static void yy_flex_strncpy (char* s1, yyconst char * s2, int n ) +| { +| register int i; +| for ( i = 0; i < n; ++i ) +| s1[i] = s2[i]; +| } +| #endif +| +| #ifdef YY_NEED_STRLEN +| static int yy_flex_strlen (yyconst char * s ) +| { +| register int n; +| for ( n = 0; s[n]; ++n ) +| ; +| +| return n; +| } +| #endif +| +| void *yyalloc (yy_size_t size ) +| { +| return (void *) malloc( size ); +| } +| +| void *yyrealloc (void * ptr, yy_size_t size ) +| { +| /* The cast to (char *) in the following accommodates both +| * implementations that use char* generic pointers, and those +| * that use void* generic pointers. 
It works with the latter +| * because both ANSI C and C++ allow castless assignment from +| * any pointer type to void*, and deal with argument conversions +| * as though doing an assignment. +| */ +| return (void *) realloc( (char *) ptr, size ); +| } +| +| void yyfree (void * ptr ) +| { +| free( (char *) ptr ); /* see yyrealloc() for (char *) cast */ +| } +| +| #define YYTABLES_NAME "yytables" +| +| #line 10 "conftest.l" +| +| +| #ifdef YYTEXT_POINTER +| extern char *yytext; +| #endif +| int +| main (void) +| { +| return ! yylex () + ! yywrap (); +| } +configure:4613: mpicc -o conftest -std=c99 -fopenmp conftest.c -lfl /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath -lm >&5 +lex.yy.c: In function 'yy_init_buffer': +lex.yy.c:1347:9: warning: implicit declaration of function 'fileno' [-Wimplicit-function-declaration] + b->yy_is_interactive = file ? (isatty( fileno(file) ) > 0) : 0; + ^ +configure:4613: $? = 0 +configure:4623: result: -lfl +configure:4629: checking whether yytext is a pointer +configure:4646: mpicc -o conftest -std=c99 -fopenmp conftest.c -lfl /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath -lm >&5 +lex.yy.c: In function 'yy_init_buffer': +lex.yy.c:1347:9: warning: implicit declaration of function 'fileno' [-Wimplicit-function-declaration] + b->yy_is_interactive = file ? 
(isatty( fileno(file) ) > 0) : 0; + ^ +configure:4646: $? = 0 +configure:4654: result: yes +configure:4668: checking whether make sets $(MAKE) +configure:4690: result: yes +configure:4742: checking for ranlib +configure:4758: found /usr/bin/ranlib +configure:4769: result: ranlib +configure:4793: checking for gcc +configure:4809: found /usr/bin/gcc +configure:4821: result: gcc +configure:4845: checking how to run the C preprocessor +configure:4876: mpicc -E conftest.c +configure:4876: $? = 0 +configure:4890: mpicc -E conftest.c +conftest.c:11:28: fatal error: ac_nonexistent.h: No such file or directory + #include + ^ +compilation terminated. +configure:4890: $? = 1 +configure: failed program was: +| /* confdefs.h */ +| #define PACKAGE_NAME "tmLQCD" +| #define PACKAGE_TARNAME "tmlqcd" +| #define PACKAGE_VERSION "5.2.0" +| #define PACKAGE_STRING "tmLQCD 5.2.0" +| #define PACKAGE_BUGREPORT "curbach@gmx.de" +| #define PACKAGE_URL "" +| #define restrict __restrict +| #define YYTEXT_POINTER 1 +| /* end confdefs.h. */ +| #include +configure:4915: result: mpicc -E +configure:4935: mpicc -E conftest.c +configure:4935: $? = 0 +configure:4949: mpicc -E conftest.c +conftest.c:11:28: fatal error: ac_nonexistent.h: No such file or directory + #include + ^ +compilation terminated. +configure:4949: $? = 1 +configure: failed program was: +| /* confdefs.h */ +| #define PACKAGE_NAME "tmLQCD" +| #define PACKAGE_TARNAME "tmlqcd" +| #define PACKAGE_VERSION "5.2.0" +| #define PACKAGE_STRING "tmLQCD 5.2.0" +| #define PACKAGE_BUGREPORT "curbach@gmx.de" +| #define PACKAGE_URL "" +| #define restrict __restrict +| #define YYTEXT_POINTER 1 +| /* end confdefs.h. */ +| #include +configure:4978: checking for grep that handles long lines and -e +configure:5036: result: /bin/grep +configure:5041: checking for egrep +configure:5103: result: /bin/grep -E +configure:5108: checking for ANSI C header files +configure:5128: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:5128: $? 
= 0 +configure:5201: mpicc -o conftest -std=c99 -fopenmp -L${HOME}/lib -L${top_builddir}/lib conftest.c /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath -lm >&5 +configure:5201: $? = 0 +configure:5201: ./conftest +configure:5201: $? = 0 +configure:5212: result: yes +configure:5225: checking for sys/types.h +configure:5225: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:5225: $? = 0 +configure:5225: result: yes +configure:5225: checking for sys/stat.h +configure:5225: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:5225: $? = 0 +configure:5225: result: yes +configure:5225: checking for stdlib.h +configure:5225: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:5225: $? = 0 +configure:5225: result: yes +configure:5225: checking for string.h +configure:5225: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:5225: $? = 0 +configure:5225: result: yes +configure:5225: checking for memory.h +configure:5225: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:5225: $? = 0 +configure:5225: result: yes +configure:5225: checking for strings.h +configure:5225: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:5225: $? = 0 +configure:5225: result: yes +configure:5225: checking for inttypes.h +configure:5225: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:5225: $? = 0 +configure:5225: result: yes +configure:5225: checking for stdint.h +configure:5225: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:5225: $? = 0 +configure:5225: result: yes +configure:5225: checking for unistd.h +configure:5225: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:5225: $? 
= 0 +configure:5225: result: yes +configure:5239: checking for stdint.h +configure:5239: result: yes +configure:5244: checking for uint16_t +configure:5244: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:5244: $? = 0 +configure:5244: mpicc -c -std=c99 -fopenmp conftest.c >&5 +conftest.c: In function 'main': +conftest.c:58:23: error: expected expression before ')' token + if (sizeof ((uint16_t))) + ^ +configure:5244: $? = 1 +configure: failed program was: +| /* confdefs.h */ +| #define PACKAGE_NAME "tmLQCD" +| #define PACKAGE_TARNAME "tmlqcd" +| #define PACKAGE_VERSION "5.2.0" +| #define PACKAGE_STRING "tmLQCD 5.2.0" +| #define PACKAGE_BUGREPORT "curbach@gmx.de" +| #define PACKAGE_URL "" +| #define restrict __restrict +| #define YYTEXT_POINTER 1 +| #define STDC_HEADERS 1 +| #define HAVE_SYS_TYPES_H 1 +| #define HAVE_SYS_STAT_H 1 +| #define HAVE_STDLIB_H 1 +| #define HAVE_STRING_H 1 +| #define HAVE_MEMORY_H 1 +| #define HAVE_STRINGS_H 1 +| #define HAVE_INTTYPES_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_UNISTD_H 1 +| #define HAVE_STDINT_H 1 +| /* end confdefs.h. */ +| #include +| #ifdef HAVE_SYS_TYPES_H +| # include +| #endif +| #ifdef HAVE_SYS_STAT_H +| # include +| #endif +| #ifdef STDC_HEADERS +| # include +| # include +| #else +| # ifdef HAVE_STDLIB_H +| # include +| # endif +| #endif +| #ifdef HAVE_STRING_H +| # if !defined STDC_HEADERS && defined HAVE_MEMORY_H +| # include +| # endif +| # include +| #endif +| #ifdef HAVE_STRINGS_H +| # include +| #endif +| #ifdef HAVE_INTTYPES_H +| # include +| #endif +| #ifdef HAVE_STDINT_H +| # include +| #endif +| #ifdef HAVE_UNISTD_H +| # include +| #endif +| int +| main () +| { +| if (sizeof ((uint16_t))) +| return 0; +| ; +| return 0; +| } +configure:5244: result: yes +configure:5256: checking for uint32_t +configure:5256: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:5256: $? 
= 0 +configure:5256: mpicc -c -std=c99 -fopenmp conftest.c >&5 +conftest.c: In function 'main': +conftest.c:59:23: error: expected expression before ')' token + if (sizeof ((uint32_t))) + ^ +configure:5256: $? = 1 +configure: failed program was: +| /* confdefs.h */ +| #define PACKAGE_NAME "tmLQCD" +| #define PACKAGE_TARNAME "tmlqcd" +| #define PACKAGE_VERSION "5.2.0" +| #define PACKAGE_STRING "tmLQCD 5.2.0" +| #define PACKAGE_BUGREPORT "curbach@gmx.de" +| #define PACKAGE_URL "" +| #define restrict __restrict +| #define YYTEXT_POINTER 1 +| #define STDC_HEADERS 1 +| #define HAVE_SYS_TYPES_H 1 +| #define HAVE_SYS_STAT_H 1 +| #define HAVE_STDLIB_H 1 +| #define HAVE_STRING_H 1 +| #define HAVE_MEMORY_H 1 +| #define HAVE_STRINGS_H 1 +| #define HAVE_INTTYPES_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_UNISTD_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_UINT16_T 1 +| /* end confdefs.h. */ +| #include +| #ifdef HAVE_SYS_TYPES_H +| # include +| #endif +| #ifdef HAVE_SYS_STAT_H +| # include +| #endif +| #ifdef STDC_HEADERS +| # include +| # include +| #else +| # ifdef HAVE_STDLIB_H +| # include +| # endif +| #endif +| #ifdef HAVE_STRING_H +| # if !defined STDC_HEADERS && defined HAVE_MEMORY_H +| # include +| # endif +| # include +| #endif +| #ifdef HAVE_STRINGS_H +| # include +| #endif +| #ifdef HAVE_INTTYPES_H +| # include +| #endif +| #ifdef HAVE_STDINT_H +| # include +| #endif +| #ifdef HAVE_UNISTD_H +| # include +| #endif +| int +| main () +| { +| if (sizeof ((uint32_t))) +| return 0; +| ; +| return 0; +| } +configure:5256: result: yes +configure:5268: checking for uint64_t +configure:5268: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:5268: $? = 0 +configure:5268: mpicc -c -std=c99 -fopenmp conftest.c >&5 +conftest.c: In function 'main': +conftest.c:60:23: error: expected expression before ')' token + if (sizeof ((uint64_t))) + ^ +configure:5268: $? 
= 1 +configure: failed program was: +| /* confdefs.h */ +| #define PACKAGE_NAME "tmLQCD" +| #define PACKAGE_TARNAME "tmlqcd" +| #define PACKAGE_VERSION "5.2.0" +| #define PACKAGE_STRING "tmLQCD 5.2.0" +| #define PACKAGE_BUGREPORT "curbach@gmx.de" +| #define PACKAGE_URL "" +| #define restrict __restrict +| #define YYTEXT_POINTER 1 +| #define STDC_HEADERS 1 +| #define HAVE_SYS_TYPES_H 1 +| #define HAVE_SYS_STAT_H 1 +| #define HAVE_STDLIB_H 1 +| #define HAVE_STRING_H 1 +| #define HAVE_MEMORY_H 1 +| #define HAVE_STRINGS_H 1 +| #define HAVE_INTTYPES_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_UNISTD_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_UINT16_T 1 +| #define HAVE_UINT32_T 1 +| /* end confdefs.h. */ +| #include +| #ifdef HAVE_SYS_TYPES_H +| # include +| #endif +| #ifdef HAVE_SYS_STAT_H +| # include +| #endif +| #ifdef STDC_HEADERS +| # include +| # include +| #else +| # ifdef HAVE_STDLIB_H +| # include +| # endif +| #endif +| #ifdef HAVE_STRING_H +| # if !defined STDC_HEADERS && defined HAVE_MEMORY_H +| # include +| # endif +| # include +| #endif +| #ifdef HAVE_STRINGS_H +| # include +| #endif +| #ifdef HAVE_INTTYPES_H +| # include +| #endif +| #ifdef HAVE_STDINT_H +| # include +| #endif +| #ifdef HAVE_UNISTD_H +| # include +| #endif +| int +| main () +| { +| if (sizeof ((uint64_t))) +| return 0; +| ; +| return 0; +| } +configure:5268: result: yes +configure:5456: checking whether we want to use only Benchmark +configure:5534: checking whether we want to use lemon +configure:5596: checking whether we use the general geometry +configure:5612: result: no +configure:5616: checking whether we want to use MPI +configure:5626: result: yes +configure:5636: checking whether to use QPX intrinsics +configure:5656: result: no +configure:5660: checking whether to use IBM BG/Q SPI for communications +configure:5677: result: no +configure:5683: checking whether we want to use OpenMP +configure:5693: result: yes +configure:5700: checking omp.h usability +configure:5700: 
mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:5700: $? = 0 +configure:5700: result: yes +configure:5700: checking omp.h presence +configure:5700: mpicc -E conftest.c +configure:5700: $? = 0 +configure:5700: result: yes +configure:5700: checking for omp.h +configure:5700: result: yes +configure:5720: checking for mpicc option to support OpenMP +configure:5735: mpicc -o conftest -std=c99 -fopenmp -L${HOME}/lib -L${top_builddir}/lib conftest.c /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath -lm >&5 +configure:5735: $? = 0 +configure:5767: result: none needed +configure:5797: checking whether we want to use FFTW +configure:5814: result: no +configure:5829: checking which parallelisation to use for MPI +configure:5858: result: n=4 xyzt +configure:5911: checking whether we shall use persistent MPI calls for halfspinor +configure:5928: result: no +configure:5932: checking whether we shall use non-blocking MPI calls +configure:5943: result: yes +configure:5954: checking whether we want to fix volume at compiletime +configure:5973: result: no +configure:5977: checking whether we want to use KOJAK instrumentalisation +configure:5992: result: no +configure:5996: checking whether we want to use lapack and blas +configure:6014: result: yes +configure:6036: checking for clock_gettime +configure:6036: mpicc -o conftest -std=c99 -fopenmp -L${HOME}/lib -L${top_builddir}/lib conftest.c /usr/lib/lapack/liblapack.so.3 /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib 
-L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath -lm >&5 +configure:6036: $? = 0 +configure:6036: result: yes +configure:6095: Instructing the compiler to use POSIX 199309L +configure:6104: checking for dummy main to link with Fortran 77 libraries +configure:6138: mpicc -o conftest -std=c99 -fopenmp -L${HOME}/lib -L${top_builddir}/lib conftest.c /usr/lib/lapack/liblapack.so.3 /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath -lm -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath >&5 +configure:6138: $? = 0 +configure:6183: result: none +configure:6216: checking for Fortran 77 name-mangling scheme +configure:6229: f95 -c -g -O2 conftest.f >&5 +configure:6229: $? = 0 +configure:6270: mpicc -o conftest -std=c99 -fopenmp -L${HOME}/lib -L${top_builddir}/lib conftest.c cfortran_test.o /usr/lib/lapack/liblapack.so.3 /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. 
-lgfortran -lm -lquadmath -lm -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath >&5 +/tmp/ccjp6OY8.o: In function `main': +conftest.c:(.text+0xa): undefined reference to `foobar' +collect2: error: ld returned 1 exit status +configure:6270: $? = 1 +configure: failed program was: +| /* confdefs.h */ +| #define PACKAGE_NAME "tmLQCD" +| #define PACKAGE_TARNAME "tmlqcd" +| #define PACKAGE_VERSION "5.2.0" +| #define PACKAGE_STRING "tmLQCD 5.2.0" +| #define PACKAGE_BUGREPORT "curbach@gmx.de" +| #define PACKAGE_URL "" +| #define restrict __restrict +| #define YYTEXT_POINTER 1 +| #define STDC_HEADERS 1 +| #define HAVE_SYS_TYPES_H 1 +| #define HAVE_SYS_STAT_H 1 +| #define HAVE_STDLIB_H 1 +| #define HAVE_STRING_H 1 +| #define HAVE_MEMORY_H 1 +| #define HAVE_STRINGS_H 1 +| #define HAVE_INTTYPES_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_UNISTD_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_UINT16_T 1 +| #define HAVE_UINT32_T 1 +| #define HAVE_UINT64_T 1 +| #define BENCHMARK 1 +| #define MPI 1 +| #define OMP 1 +| #define HAVE_OMP_H 1 +| #define PARALLELXYZT 1 +| #define _NON_BLOCKING 1 +| #define HAVE_LAPACK 1 +| #define HAVE_CLOCK_GETTIME 1 +| #define HAVE_CLOCK_GETTIME 1 +| /* end confdefs.h. */ +| +| /* Override any GCC internal prototype to avoid an error. +| Use char because int might match the return type of a GCC +| builtin and then its argument prototype would still apply. 
*/ +| #ifdef __cplusplus +| extern "C" +| #endif +| char foobar (); +| #ifdef F77_DUMMY_MAIN +| +| # ifdef __cplusplus +| extern "C" +| # endif +| int F77_DUMMY_MAIN() { return 1; } +| +| #endif +| int +| main () +| { +| return foobar (); +| ; +| return 0; +| } +configure:6270: mpicc -o conftest -std=c99 -fopenmp -L${HOME}/lib -L${top_builddir}/lib conftest.c cfortran_test.o /usr/lib/lapack/liblapack.so.3 /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath -lm -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath >&5 +configure:6270: $? = 0 +configure:6328: mpicc -o conftest -std=c99 -fopenmp -L${HOME}/lib -L${top_builddir}/lib conftest.c cfortran_test.o /usr/lib/lapack/liblapack.so.3 /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath -lm -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath >&5 +configure:6328: $? 
= 0 +configure:6370: result: lower case, underscore, no extra underscore +configure:6404: checking for library containing zheev_ +configure:6443: mpicc -o conftest -std=c99 -fopenmp -L${HOME}/lib -L${top_builddir}/lib conftest.c /usr/lib/lapack/liblapack.so.3 /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath -lm >&5 +configure:6443: $? = 0 +configure:6461: result: none required +configure:6472: checking for ANSI C header files +configure:6584: result: yes +configure:6595: checking float.h usability +configure:6595: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:6595: $? = 0 +configure:6595: result: yes +configure:6595: checking float.h presence +configure:6595: mpicc -E conftest.c +configure:6595: $? = 0 +configure:6595: result: yes +configure:6595: checking for float.h +configure:6595: result: yes +configure:6595: checking libintl.h usability +configure:6595: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:6595: $? = 0 +configure:6595: result: yes +configure:6595: checking libintl.h presence +configure:6595: mpicc -E conftest.c +configure:6595: $? = 0 +configure:6595: result: yes +configure:6595: checking for libintl.h +configure:6595: result: yes +configure:6595: checking limits.h usability +configure:6595: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:6595: $? = 0 +configure:6595: result: yes +configure:6595: checking limits.h presence +configure:6595: mpicc -E conftest.c +configure:6595: $? 
= 0 +configure:6595: result: yes +configure:6595: checking for limits.h +configure:6595: result: yes +configure:6595: checking for stdint.h +configure:6595: result: yes +configure:6595: checking for stdlib.h +configure:6595: result: yes +configure:6595: checking for string.h +configure:6595: result: yes +configure:6595: checking for strings.h +configure:6595: result: yes +configure:6595: checking sys/time.h usability +configure:6595: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:6595: $? = 0 +configure:6595: result: yes +configure:6595: checking sys/time.h presence +configure:6595: mpicc -E conftest.c +configure:6595: $? = 0 +configure:6595: result: yes +configure:6595: checking for sys/time.h +configure:6595: result: yes +configure:6595: checking for unistd.h +configure:6595: result: yes +configure:6595: checking endian.h usability +configure:6595: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:6595: $? = 0 +configure:6595: result: yes +configure:6595: checking endian.h presence +configure:6595: mpicc -E conftest.c +configure:6595: $? = 0 +configure:6595: result: yes +configure:6595: checking for endian.h +configure:6595: result: yes +configure:6605: checking getopt.h usability +configure:6605: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:6605: $? = 0 +configure:6605: result: yes +configure:6605: checking getopt.h presence +configure:6605: mpicc -E conftest.c +configure:6605: $? = 0 +configure:6605: result: yes +configure:6605: checking for getopt.h +configure:6605: result: yes +configure:6612: checking for an ANSI C-conforming const +configure:6693: result: yes +configure:6701: checking for off_t +configure:6701: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:6701: $? = 0 +configure:6701: mpicc -c -std=c99 -fopenmp conftest.c >&5 +conftest.c: In function 'main': +conftest.c:81:20: error: expected expression before ')' token + if (sizeof ((off_t))) + ^ +configure:6701: $? 
= 1 +configure: failed program was: +| /* confdefs.h */ +| #define PACKAGE_NAME "tmLQCD" +| #define PACKAGE_TARNAME "tmlqcd" +| #define PACKAGE_VERSION "5.2.0" +| #define PACKAGE_STRING "tmLQCD 5.2.0" +| #define PACKAGE_BUGREPORT "curbach@gmx.de" +| #define PACKAGE_URL "" +| #define restrict __restrict +| #define YYTEXT_POINTER 1 +| #define STDC_HEADERS 1 +| #define HAVE_SYS_TYPES_H 1 +| #define HAVE_SYS_STAT_H 1 +| #define HAVE_STDLIB_H 1 +| #define HAVE_STRING_H 1 +| #define HAVE_MEMORY_H 1 +| #define HAVE_STRINGS_H 1 +| #define HAVE_INTTYPES_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_UNISTD_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_UINT16_T 1 +| #define HAVE_UINT32_T 1 +| #define HAVE_UINT64_T 1 +| #define BENCHMARK 1 +| #define MPI 1 +| #define OMP 1 +| #define HAVE_OMP_H 1 +| #define PARALLELXYZT 1 +| #define _NON_BLOCKING 1 +| #define HAVE_LAPACK 1 +| #define HAVE_CLOCK_GETTIME 1 +| #define HAVE_CLOCK_GETTIME 1 +| #define STDC_HEADERS 1 +| #define HAVE_FLOAT_H 1 +| #define HAVE_LIBINTL_H 1 +| #define HAVE_LIMITS_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_STDLIB_H 1 +| #define HAVE_STRING_H 1 +| #define HAVE_STRINGS_H 1 +| #define HAVE_SYS_TIME_H 1 +| #define HAVE_UNISTD_H 1 +| #define HAVE_ENDIAN_H 1 +| /* end confdefs.h. 
*/ +| #include +| #ifdef HAVE_SYS_TYPES_H +| # include +| #endif +| #ifdef HAVE_SYS_STAT_H +| # include +| #endif +| #ifdef STDC_HEADERS +| # include +| # include +| #else +| # ifdef HAVE_STDLIB_H +| # include +| # endif +| #endif +| #ifdef HAVE_STRING_H +| # if !defined STDC_HEADERS && defined HAVE_MEMORY_H +| # include +| # endif +| # include +| #endif +| #ifdef HAVE_STRINGS_H +| # include +| #endif +| #ifdef HAVE_INTTYPES_H +| # include +| #endif +| #ifdef HAVE_STDINT_H +| # include +| #endif +| #ifdef HAVE_UNISTD_H +| # include +| #endif +| int +| main () +| { +| if (sizeof ((off_t))) +| return 0; +| ; +| return 0; +| } +configure:6701: result: yes +configure:6712: checking for size_t +configure:6712: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:6712: $? = 0 +configure:6712: mpicc -c -std=c99 -fopenmp conftest.c >&5 +conftest.c: In function 'main': +conftest.c:81:21: error: expected expression before ')' token + if (sizeof ((size_t))) + ^ +configure:6712: $? = 1 +configure: failed program was: +| /* confdefs.h */ +| #define PACKAGE_NAME "tmLQCD" +| #define PACKAGE_TARNAME "tmlqcd" +| #define PACKAGE_VERSION "5.2.0" +| #define PACKAGE_STRING "tmLQCD 5.2.0" +| #define PACKAGE_BUGREPORT "curbach@gmx.de" +| #define PACKAGE_URL "" +| #define restrict __restrict +| #define YYTEXT_POINTER 1 +| #define STDC_HEADERS 1 +| #define HAVE_SYS_TYPES_H 1 +| #define HAVE_SYS_STAT_H 1 +| #define HAVE_STDLIB_H 1 +| #define HAVE_STRING_H 1 +| #define HAVE_MEMORY_H 1 +| #define HAVE_STRINGS_H 1 +| #define HAVE_INTTYPES_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_UNISTD_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_UINT16_T 1 +| #define HAVE_UINT32_T 1 +| #define HAVE_UINT64_T 1 +| #define BENCHMARK 1 +| #define MPI 1 +| #define OMP 1 +| #define HAVE_OMP_H 1 +| #define PARALLELXYZT 1 +| #define _NON_BLOCKING 1 +| #define HAVE_LAPACK 1 +| #define HAVE_CLOCK_GETTIME 1 +| #define HAVE_CLOCK_GETTIME 1 +| #define STDC_HEADERS 1 +| #define HAVE_FLOAT_H 1 +| #define 
HAVE_LIBINTL_H 1 +| #define HAVE_LIMITS_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_STDLIB_H 1 +| #define HAVE_STRING_H 1 +| #define HAVE_STRINGS_H 1 +| #define HAVE_SYS_TIME_H 1 +| #define HAVE_UNISTD_H 1 +| #define HAVE_ENDIAN_H 1 +| /* end confdefs.h. */ +| #include +| #ifdef HAVE_SYS_TYPES_H +| # include +| #endif +| #ifdef HAVE_SYS_STAT_H +| # include +| #endif +| #ifdef STDC_HEADERS +| # include +| # include +| #else +| # ifdef HAVE_STDLIB_H +| # include +| # endif +| #endif +| #ifdef HAVE_STRING_H +| # if !defined STDC_HEADERS && defined HAVE_MEMORY_H +| # include +| # endif +| # include +| #endif +| #ifdef HAVE_STRINGS_H +| # include +| #endif +| #ifdef HAVE_INTTYPES_H +| # include +| #endif +| #ifdef HAVE_STDINT_H +| # include +| #endif +| #ifdef HAVE_UNISTD_H +| # include +| #endif +| int +| main () +| { +| if (sizeof ((size_t))) +| return 0; +| ; +| return 0; +| } +configure:6712: result: yes +configure:6723: checking whether time.h and sys/time.h may both be included +configure:6751: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:6751: $? = 0 +configure:6758: result: yes +configure:6774: checking for special C compiler options needed for large files +configure:6827: result: no +configure:6833: checking for _FILE_OFFSET_BITS value needed for large files +configure:6866: mpicc -c -std=c99 -fopenmp conftest.c >&5 +configure:6866: $? = 0 +configure:6906: result: no +configure:7007: checking for _LARGEFILE_SOURCE value needed for large files +configure:7034: mpicc -o conftest -std=c99 -fopenmp -L${HOME}/lib -L${top_builddir}/lib conftest.c /usr/lib/lapack/liblapack.so.3 /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. 
-lgfortran -lm -lquadmath -lm >&5 +conftest.c: In function 'main': +conftest.c:59:34: error: 'fseeko' undeclared (first use in this function) + int (*fp) (FILE *, off_t, int) = fseeko; + ^ +conftest.c:59:34: note: each undeclared identifier is reported only once for each function it appears in +conftest.c:60:6: warning: implicit declaration of function 'fseeko' [-Wimplicit-function-declaration] + return fseeko (stdin, 0, 0) && fp (stdin, 0, 0); + ^ +configure:7034: $? = 1 +configure: failed program was: +| /* confdefs.h */ +| #define PACKAGE_NAME "tmLQCD" +| #define PACKAGE_TARNAME "tmlqcd" +| #define PACKAGE_VERSION "5.2.0" +| #define PACKAGE_STRING "tmLQCD 5.2.0" +| #define PACKAGE_BUGREPORT "curbach@gmx.de" +| #define PACKAGE_URL "" +| #define restrict __restrict +| #define YYTEXT_POINTER 1 +| #define STDC_HEADERS 1 +| #define HAVE_SYS_TYPES_H 1 +| #define HAVE_SYS_STAT_H 1 +| #define HAVE_STDLIB_H 1 +| #define HAVE_STRING_H 1 +| #define HAVE_MEMORY_H 1 +| #define HAVE_STRINGS_H 1 +| #define HAVE_INTTYPES_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_UNISTD_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_UINT16_T 1 +| #define HAVE_UINT32_T 1 +| #define HAVE_UINT64_T 1 +| #define BENCHMARK 1 +| #define MPI 1 +| #define OMP 1 +| #define HAVE_OMP_H 1 +| #define PARALLELXYZT 1 +| #define _NON_BLOCKING 1 +| #define HAVE_LAPACK 1 +| #define HAVE_CLOCK_GETTIME 1 +| #define HAVE_CLOCK_GETTIME 1 +| #define STDC_HEADERS 1 +| #define HAVE_FLOAT_H 1 +| #define HAVE_LIBINTL_H 1 +| #define HAVE_LIMITS_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_STDLIB_H 1 +| #define HAVE_STRING_H 1 +| #define HAVE_STRINGS_H 1 +| #define HAVE_SYS_TIME_H 1 +| #define HAVE_UNISTD_H 1 +| #define HAVE_ENDIAN_H 1 +| #define TIME_WITH_SYS_TIME 1 +| /* end confdefs.h. 
*/ +| #include /* for off_t */ +| #include +| #ifdef F77_DUMMY_MAIN +| +| # ifdef __cplusplus +| extern "C" +| # endif +| int F77_DUMMY_MAIN() { return 1; } +| +| #endif +| int +| main () +| { +| int (*fp) (FILE *, off_t, int) = fseeko; +| return fseeko (stdin, 0, 0) && fp (stdin, 0, 0); +| ; +| return 0; +| } +configure:7061: mpicc -o conftest -std=c99 -fopenmp -L${HOME}/lib -L${top_builddir}/lib conftest.c /usr/lib/lapack/liblapack.so.3 /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath -lm >&5 +configure:7061: $? = 0 +configure:7070: result: 1 +configure:7093: checking for stdlib.h +configure:7093: result: yes +configure:7103: checking for GNU libc compatible malloc +configure:7145: result: no +configure:7166: checking return type of signal handlers +configure:7192: mpicc -c -std=c99 -fopenmp conftest.c >&5 +conftest.c: In function 'main': +conftest.c:65:1: error: void value not ignored as it ought to be + return *(signal (0, 0)) (0) == 1; + ^ +configure:7192: $? 
= 1 +configure: failed program was: +| /* confdefs.h */ +| #define PACKAGE_NAME "tmLQCD" +| #define PACKAGE_TARNAME "tmlqcd" +| #define PACKAGE_VERSION "5.2.0" +| #define PACKAGE_STRING "tmLQCD 5.2.0" +| #define PACKAGE_BUGREPORT "curbach@gmx.de" +| #define PACKAGE_URL "" +| #define restrict __restrict +| #define YYTEXT_POINTER 1 +| #define STDC_HEADERS 1 +| #define HAVE_SYS_TYPES_H 1 +| #define HAVE_SYS_STAT_H 1 +| #define HAVE_STDLIB_H 1 +| #define HAVE_STRING_H 1 +| #define HAVE_MEMORY_H 1 +| #define HAVE_STRINGS_H 1 +| #define HAVE_INTTYPES_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_UNISTD_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_UINT16_T 1 +| #define HAVE_UINT32_T 1 +| #define HAVE_UINT64_T 1 +| #define BENCHMARK 1 +| #define MPI 1 +| #define OMP 1 +| #define HAVE_OMP_H 1 +| #define PARALLELXYZT 1 +| #define _NON_BLOCKING 1 +| #define HAVE_LAPACK 1 +| #define HAVE_CLOCK_GETTIME 1 +| #define HAVE_CLOCK_GETTIME 1 +| #define STDC_HEADERS 1 +| #define HAVE_FLOAT_H 1 +| #define HAVE_LIBINTL_H 1 +| #define HAVE_LIMITS_H 1 +| #define HAVE_STDINT_H 1 +| #define HAVE_STDLIB_H 1 +| #define HAVE_STRING_H 1 +| #define HAVE_STRINGS_H 1 +| #define HAVE_SYS_TIME_H 1 +| #define HAVE_UNISTD_H 1 +| #define HAVE_ENDIAN_H 1 +| #define TIME_WITH_SYS_TIME 1 +| #define _LARGEFILE_SOURCE 1 +| #define HAVE_FSEEKO 1 +| #define HAVE_STDLIB_H 1 +| #define HAVE_MALLOC 0 +| #define malloc rpl_malloc +| /* end confdefs.h. 
*/ +| #include +| #include +| +| #ifdef F77_DUMMY_MAIN +| +| # ifdef __cplusplus +| extern "C" +| # endif +| int F77_DUMMY_MAIN() { return 1; } +| +| #endif +| int +| main () +| { +| return *(signal (0, 0)) (0) == 1; +| ; +| return 0; +| } +configure:7199: result: void +configure:7210: checking for gettimeofday +configure:7210: mpicc -o conftest -std=c99 -fopenmp -L${HOME}/lib -L${top_builddir}/lib conftest.c /usr/lib/lapack/liblapack.so.3 /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath -lm >&5 +configure:7210: $? = 0 +configure:7210: result: yes +configure:7210: checking for pow +configure:7210: mpicc -o conftest -std=c99 -fopenmp -L${HOME}/lib -L${top_builddir}/lib conftest.c /usr/lib/lapack/liblapack.so.3 /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath -lm >&5 +conftest.c:76:6: warning: conflicting types for built-in function 'pow' [enabled by default] + char pow (); + ^ +configure:7210: $? 
= 0 +configure:7210: result: yes +configure:7210: checking for sqrt +configure:7210: mpicc -o conftest -std=c99 -fopenmp -L${HOME}/lib -L${top_builddir}/lib conftest.c /usr/lib/lapack/liblapack.so.3 /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath -lm >&5 +conftest.c:77:6: warning: conflicting types for built-in function 'sqrt' [enabled by default] + char sqrt (); + ^ +configure:7210: $? = 0 +configure:7210: result: yes +configure:7241: checking what alignment we want for arrays +configure:7292: result: auto +configure:7312: checking whether we want to use P4 instructions +configure:7349: result: no +configure:7353: checking whether we want to use Opteron instructions +configure:7390: result: no +configure:7394: checking whether we want to use SSE2 instructions +configure:7410: result: no +configure:7414: checking whether we want to use SSE3 instructions +configure:7430: result: no +configure:7531: checking whether we want to use gprof as profiler +configure:7550: result: no +configure:7555: checking whether we shall use rts dram window +configure:7566: result: yes +configure:7840: checking whether we want to switch on optimisation +configure:7855: result: yes +configure:7859: checking whether we want to use a copy of the gauge field +configure:7869: result: yes +configure:7879: checking whether we want to use a Dirac Op. 
with halfspinor exchange +configure:7889: result: yes +configure:7906: checking whether we want to use shmem API +configure:7923: result: no +configure:7927: checking whether we want to use timeslice-splitted communications +configure:7943: result: no +configure:7947: checking whether we want to compute the LapH eigenvalues +configure:7963: result: no +configure:7968: checking whether we want to use CUDA GPU +configure:8029: result: no +configure:8043: checking whether we want to use QUDA GPU +configure:8103: checking for g++ +configure:8119: found /usr/bin/g++ +configure:8130: result: g++ +configure:8146: WARNING: using cross tools not prefixed with host triplet +configure:8157: checking for C++ compiler version +configure:8166: g++ --version >&5 +g++ (Ubuntu 4.8.4-2ubuntu1~14.04.1) 4.8.4 +Copyright (C) 2013 Free Software Foundation, Inc. +This is free software; see the source for copying conditions. There is NO +warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + +configure:8177: $? = 0 +configure:8166: g++ -v >&5 +Using built-in specs. 
+COLLECT_GCC=g++ +COLLECT_LTO_WRAPPER=/usr/lib/gcc/x86_64-linux-gnu/4.8/lto-wrapper +Target: x86_64-linux-gnu +Configured with: ../src/configure -v --with-pkgversion='Ubuntu 4.8.4-2ubuntu1~14.04.1' --with-bugurl=file:///usr/share/doc/gcc-4.8/README.Bugs --enable-languages=c,c++,java,go,d,fortran,objc,obj-c++ --prefix=/usr --program-suffix=-4.8 --enable-shared --enable-linker-build-id --libexecdir=/usr/lib --without-included-gettext --enable-threads=posix --with-gxx-include-dir=/usr/include/c++/4.8 --libdir=/usr/lib --enable-nls --with-sysroot=/ --enable-clocale=gnu --enable-libstdcxx-debug --enable-libstdcxx-time=yes --enable-gnu-unique-object --disable-libmudflap --enable-plugin --with-system-zlib --disable-browser-plugin --enable-java-awt=gtk --enable-gtk-cairo --with-java-home=/usr/lib/jvm/java-1.5.0-gcj-4.8-amd64/jre --enable-java-home --with-jvm-root-dir=/usr/lib/jvm/java-1.5.0-gcj-4.8-amd64 --with-jvm-jar-dir=/usr/lib/jvm-exports/java-1.5.0-gcj-4.8-amd64 --with-arch-directory=amd64 --with-ecj-jar=/usr/share/java/eclipse-ecj.jar --enable-objc-gc --enable-multiarch --disable-werror --with-arch-32=i686 --with-abi=m64 --with-multilib-list=m32,m64,mx32 --with-tune=generic --enable-checking=release --build=x86_64-linux-gnu --host=x86_64-linux-gnu --target=x86_64-linux-gnu +Thread model: posix +gcc version 4.8.4 (Ubuntu 4.8.4-2ubuntu1~14.04.1) +configure:8177: $? = 0 +configure:8166: g++ -V >&5 +g++: error: unrecognized command line option '-V' +g++: fatal error: no input files +compilation terminated. +configure:8177: $? = 4 +configure:8166: g++ -qversion >&5 +g++: error: unrecognized command line option '-qversion' +g++: fatal error: no input files +compilation terminated. +configure:8177: $? = 4 +configure:8181: checking whether we are using the GNU C++ compiler +configure:8208: g++ -c conftest.cpp >&5 +configure:8208: $? 
= 0 +configure:8217: result: yes +configure:8226: checking whether g++ accepts -g +configure:8254: g++ -c -g conftest.cpp >&5 +configure:8254: $? = 0 +configure:8311: result: yes +configure:8499: checking checking consistency +configure:8664: creating ./config.status + +## ---------------------- ## +## Running config.status. ## +## ---------------------- ## + +This file was extended by tmLQCD config.status 5.2.0, which was +generated by GNU Autoconf 2.69. Invocation command line was + + CONFIG_FILES = + CONFIG_HEADERS = + CONFIG_LINKS = + CONFIG_COMMANDS = + $ ./config.status + +on jacob-All-Series + +config.status:893: creating Makefile +config.status:893: creating operator/Makefile +config.status:893: creating linalg/Makefile +config.status:893: creating solver/Makefile +config.status:893: creating monomial/Makefile +config.status:893: creating buffers/Makefile +config.status:893: creating cu/Makefile +config.status:893: creating io/Makefile +config.status:893: creating meas/Makefile +config.status:893: creating xchange/Makefile +config.status:893: creating init/Makefile +config.status:893: creating rational/Makefile +config.status:893: creating wrapper/Makefile +config.status:893: creating config.h + +## ---------------- ## +## Cache variables. 
## +## ---------------- ## + +ac_cv_build=x86_64-unknown-linux-gnu +ac_cv_c_compiler_gnu=yes +ac_cv_c_const=yes +ac_cv_c_inline=inline +ac_cv_c_restrict=__restrict +ac_cv_cxx_compiler_gnu=yes +ac_cv_env_CCC_set= +ac_cv_env_CCC_value= +ac_cv_env_CC_set=set +ac_cv_env_CC_value=mpicc +ac_cv_env_CFLAGS_set=set +ac_cv_env_CFLAGS_value='-std=c99 -fopenmp' +ac_cv_env_CPPFLAGS_set= +ac_cv_env_CPPFLAGS_value= +ac_cv_env_CPP_set= +ac_cv_env_CPP_value= +ac_cv_env_CXXFLAGS_set= +ac_cv_env_CXXFLAGS_value= +ac_cv_env_CXX_set= +ac_cv_env_CXX_value= +ac_cv_env_F77_set=set +ac_cv_env_F77_value=f95 +ac_cv_env_FFLAGS_set= +ac_cv_env_FFLAGS_value= +ac_cv_env_LDFLAGS_set= +ac_cv_env_LDFLAGS_value= +ac_cv_env_LIBS_set=set +ac_cv_env_LIBS_value='/usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1' +ac_cv_env_build_alias_set= +ac_cv_env_build_alias_value= +ac_cv_env_host_alias_set= +ac_cv_env_host_alias_value= +ac_cv_env_target_alias_set= +ac_cv_env_target_alias_value= +ac_cv_f77_compiler_gnu=yes +ac_cv_f77_dummy_main=none +ac_cv_f77_libs=' -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. 
-lgfortran -lm -lquadmath' +ac_cv_f77_mangling='lower case, underscore, no extra underscore' +ac_cv_fortran_dummy_main=none +ac_cv_func_clock_gettime=yes +ac_cv_func_gettimeofday=yes +ac_cv_func_malloc_0_nonnull=no +ac_cv_func_pow=yes +ac_cv_func_sqrt=yes +ac_cv_header_endian_h=yes +ac_cv_header_float_h=yes +ac_cv_header_getopt_h=yes +ac_cv_header_inttypes_h=yes +ac_cv_header_libintl_h=yes +ac_cv_header_limits_h=yes +ac_cv_header_memory_h=yes +ac_cv_header_omp_h=yes +ac_cv_header_stdc=yes +ac_cv_header_stdint_h=yes +ac_cv_header_stdlib_h=yes +ac_cv_header_string_h=yes +ac_cv_header_strings_h=yes +ac_cv_header_sys_stat_h=yes +ac_cv_header_sys_time_h=yes +ac_cv_header_sys_types_h=yes +ac_cv_header_time=yes +ac_cv_header_unistd_h=yes +ac_cv_host=x86_64-unknown-linux-gnu +ac_cv_lib_lex=-lfl +ac_cv_objext=o +ac_cv_path_EGREP='/bin/grep -E' +ac_cv_path_GREP=/bin/grep +ac_cv_prog_CCDEP=gcc +ac_cv_prog_CPP='mpicc -E' +ac_cv_prog_LEX=flex +ac_cv_prog_ac_ct_AR=ar +ac_cv_prog_ac_ct_CC=mpicc +ac_cv_prog_ac_ct_CXX=g++ +ac_cv_prog_ac_ct_RANLIB=ranlib +ac_cv_prog_c_openmp='none needed' +ac_cv_prog_cc_c89= +ac_cv_prog_cc_c99= +ac_cv_prog_cc_g=yes +ac_cv_prog_cxx_g=yes +ac_cv_prog_f77_g=yes +ac_cv_prog_f77_v=-v +ac_cv_prog_lex_root=lex.yy +ac_cv_prog_lex_yytext_pointer=yes +ac_cv_prog_make_make_set=yes +ac_cv_search_zheev_='none required' +ac_cv_sys_file_offset_bits=no +ac_cv_sys_largefile_CC=no +ac_cv_sys_largefile_source=1 +ac_cv_type_off_t=yes +ac_cv_type_signal=void +ac_cv_type_size_t=yes +ac_cv_type_uint16_t=yes +ac_cv_type_uint32_t=yes +ac_cv_type_uint64_t=yes + +## ----------------- ## +## Output variables. 
## +## ----------------- ## + +AR='ar' +AUTOCONF='autoconf' +CC='mpicc' +CCDEP='gcc' +CCLD='mpicc' +CFLAGS='-std=c99 -fopenmp -pedantic -Wall' +CPP='mpicc -E' +CPPFLAGS=' ' +CXX='g++' +CXXFLAGS='-g -O2' +DEBUG_FLAG='-g' +DEFS='-DHAVE_CONFIG_H' +DEPFLAGS='-MM' +ECHO_C='' +ECHO_N='-n' +ECHO_T='' +EGREP='/bin/grep -E' +EXEEXT='' +F77='f95' +FFLAGS='-g -O2' +FLIBS=' -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -lquadmath' +GPUCFLAGS='' +GPUDIR='' +GPUMPICOMPILER='' +GREP='/bin/grep' +INCLUDES=' -I$(HOME)/include/ -I. -I${abs_top_builddir}/ -I${abs_top_srcdir}/ -I/include/ -I/include/' +LDFLAGS=' -L${HOME}/lib -L${top_builddir}/lib ' +LEMON_AVAILABLE='0' +LEX='flex' +LEXLIB='-lfl' +LEX_OUTPUT_ROOT='lex.yy' +LIBOBJS=' ${LIBOBJDIR}malloc$U.o' +LIBS='-lhmc -lmonomial -loperator -lsolver -linit -lmeas -llinalg -lhmc -lxchange -lrational -lio /usr/lib/lapack/liblapack.so.3 /usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. 
-lgfortran -lm -lquadmath -lm' +LTLIBOBJS=' ${LIBOBJDIR}malloc$U.lo' +MEASDIR='' +NVCC='' +OBJEXT='o' +OPENMP_CFLAGS='' +OPTARGS='-O' +PACKAGE_BUGREPORT='curbach@gmx.de' +PACKAGE_NAME='tmLQCD' +PACKAGE_STRING='tmLQCD 5.2.0' +PACKAGE_TARNAME='tmlqcd' +PACKAGE_URL='' +PACKAGE_VERSION='5.2.0' +PATH_SEPARATOR=':' +PROFILE_FLAG='' +QUDA_AVAILABLE='0' +QUDA_INTERFACE='' +RANLIB='ranlib' +SET_MAKE='' +SHELL='/bin/bash' +SOLVEROUT='' +SOPTARGS='-O' +SPI_FILES='' +USESUBDIRS='operator linalg solver monomial buffers cu io meas xchange init rational wrapper' +XCHANGEDIR='' +XCHANGELIB='' +XLIB='' +ac_ct_CC='mpicc' +ac_ct_CXX='g++' +ac_ct_F77='' +bindir='${exec_prefix}/bin' +build='x86_64-unknown-linux-gnu' +build_alias='' +build_cpu='x86_64' +build_os='linux-gnu' +build_vendor='unknown' +datadir='${datarootdir}' +datarootdir='${prefix}/share' +docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' +dvidir='${docdir}' +exec_prefix='${prefix}' +host='x86_64-unknown-linux-gnu' +host_alias='' +host_cpu='x86_64' +host_os='linux-gnu' +host_vendor='unknown' +htmldir='${docdir}' +includedir='${prefix}/include' +infodir='${datarootdir}/info' +libdir='${exec_prefix}/lib' +libexecdir='${exec_prefix}/libexec' +localedir='${datarootdir}/locale' +localstatedir='${prefix}/var' +mandir='${datarootdir}/man' +oldincludedir='/usr/include' +pdfdir='${docdir}' +prefix='/home/jacob' +program_transform_name='s,x,x,' +psdir='${docdir}' +sbindir='${exec_prefix}/sbin' +sharedstatedir='${prefix}/com' +sysconfdir='${prefix}/etc' +target_alias='' + +## ----------- ## +## confdefs.h. 
## +## ----------- ## + +/* confdefs.h */ +#define PACKAGE_NAME "tmLQCD" +#define PACKAGE_TARNAME "tmlqcd" +#define PACKAGE_VERSION "5.2.0" +#define PACKAGE_STRING "tmLQCD 5.2.0" +#define PACKAGE_BUGREPORT "curbach@gmx.de" +#define PACKAGE_URL "" +#define restrict __restrict +#define YYTEXT_POINTER 1 +#define STDC_HEADERS 1 +#define HAVE_SYS_TYPES_H 1 +#define HAVE_SYS_STAT_H 1 +#define HAVE_STDLIB_H 1 +#define HAVE_STRING_H 1 +#define HAVE_MEMORY_H 1 +#define HAVE_STRINGS_H 1 +#define HAVE_INTTYPES_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_UNISTD_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_UINT16_T 1 +#define HAVE_UINT32_T 1 +#define HAVE_UINT64_T 1 +#define BENCHMARK 1 +#define MPI 1 +#define OMP 1 +#define HAVE_OMP_H 1 +#define PARALLELXYZT 1 +#define _NON_BLOCKING 1 +#define HAVE_LAPACK 1 +#define HAVE_CLOCK_GETTIME 1 +#define HAVE_CLOCK_GETTIME 1 +#define STDC_HEADERS 1 +#define HAVE_FLOAT_H 1 +#define HAVE_LIBINTL_H 1 +#define HAVE_LIMITS_H 1 +#define HAVE_STDINT_H 1 +#define HAVE_STDLIB_H 1 +#define HAVE_STRING_H 1 +#define HAVE_STRINGS_H 1 +#define HAVE_SYS_TIME_H 1 +#define HAVE_UNISTD_H 1 +#define HAVE_ENDIAN_H 1 +#define TIME_WITH_SYS_TIME 1 +#define _LARGEFILE_SOURCE 1 +#define HAVE_FSEEKO 1 +#define HAVE_STDLIB_H 1 +#define HAVE_MALLOC 0 +#define malloc rpl_malloc +#define RETSIGTYPE void +#define HAVE_GETTIMEOFDAY 1 +#define HAVE_POW 1 +#define HAVE_SQRT 1 +#define ALIGN_BASE 0x00 +#define ALIGN /**/ +#define ALIGN_BASE32 0x00 +#define ALIGN32 /**/ +#define _USE_BGLDRAM 1 +#define _x86_64 1 +#define _GAUGE_COPY 1 +#define _USE_HALFSPINOR 1 + +configure: exit 0 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/config.status b/qcd/part_cpu/applications/QCD/src/kernel_D/config.status new file mode 100755 index 0000000000000000000000000000000000000000..1930093258107e56034503d824b26ddcb29a4dd4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/config.status @@ -0,0 +1,1083 @@ +#! /bin/bash +# Generated by configure. 
+# Run this file to recreate the current configuration. +# Compiler output produced by configure, useful for debugging +# configure, is in config.log if it exists. + +debug=false +ac_cs_recheck=false +ac_cs_silent=false + +SHELL=${CONFIG_SHELL-/bin/bash} +export SHELL +## -------------------- ## +## M4sh Initialization. ## +## -------------------- ## + +# Be more Bourne compatible +DUALCASE=1; export DUALCASE # for MKS sh +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi + + +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. +as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +# Prefer a ksh shell builtin over an external printf program on Solaris, +# but without wasting forks for bash or zsh. 
+if test -z "$BASH_VERSION$ZSH_VERSION" \ + && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='print -r --' + as_echo_n='print -rn --' +elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in #( + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + +# The user is always right. +if test "${PATH_SEPARATOR+set}" != set; then + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { + (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || + PATH_SEPARATOR=';' + } +fi + + +# IFS +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent editors from complaining about space-tab. +# (If _AS_PATH_WALK were called with IFS unset, it would disable word +# splitting by setting IFS to empty value.) +IFS=" "" $as_nl" + +# Find who we are. Look in the path if we contain no directory separator. +as_myself= +case $0 in #(( + *[\\/]* ) as_myself=$0 ;; + *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + done +IFS=$as_save_IFS + + ;; +esac +# We did not find ourselves, most probably we were run as `sh COMMAND' +# in which case we are not to be found in the path. +if test "x$as_myself" = x; then + as_myself=$0 +fi +if test ! 
-f "$as_myself"; then + $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + exit 1 +fi + +# Unset variables that we do not need and which cause bugs (e.g. in +# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" +# suppresses any "Segmentation fault" message there. '((' could +# trigger a bug in pdksh 5.2.14. +for as_var in BASH_ENV ENV MAIL MAILPATH +do eval test x\${$as_var+set} = xset \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done +PS1='$ ' +PS2='> ' +PS4='+ ' + +# NLS nuisances. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# CDPATH. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + + +# as_fn_error STATUS ERROR [LINENO LOG_FD] +# ---------------------------------------- +# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are +# provided, also output the error to LOG_FD, referencing LINENO. Then exit the +# script with STATUS, using 1 if that was 0. +as_fn_error () +{ + as_status=$1; test $as_status -eq 0 && as_status=1 + if test "$4"; then + as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + fi + $as_echo "$as_me: error: $2" >&2 + as_fn_exit $as_status +} # as_fn_error + + +# as_fn_set_status STATUS +# ----------------------- +# Set $? to STATUS, without forking. +as_fn_set_status () +{ + return $1 +} # as_fn_set_status + +# as_fn_exit STATUS +# ----------------- +# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. +as_fn_exit () +{ + set +e + as_fn_set_status $1 + exit $1 +} # as_fn_exit + +# as_fn_unset VAR +# --------------- +# Portably unset VAR. +as_fn_unset () +{ + { eval $1=; unset $1;} +} +as_unset=as_fn_unset +# as_fn_append VAR VALUE +# ---------------------- +# Append the text in VALUE to the end of the definition contained in VAR. 
Take +# advantage of any shell optimizations that allow amortized linear growth over +# repeated appends, instead of the typical quadratic growth present in naive +# implementations. +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : + eval 'as_fn_append () + { + eval $1+=\$2 + }' +else + as_fn_append () + { + eval $1=\$$1\$2 + } +fi # as_fn_append + +# as_fn_arith ARG... +# ------------------ +# Perform arithmetic evaluation on the ARGs, and store the result in the +# global $as_val. Take advantage of shells that can avoid forks. The arguments +# must be portable across $(()) and expr. +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : + eval 'as_fn_arith () + { + as_val=$(( $* )) + }' +else + as_fn_arith () + { + as_val=`expr "$@" || test $? -eq 1` + } +fi # as_fn_arith + + +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + +# Avoid depending upon Character Ranges. +as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + +ECHO_C= ECHO_N= ECHO_T= +case `echo -n x` in #((((( +-n*) + case `echo 'xy\c'` in + *c*) ECHO_T=' ';; # ECHO_T is single tab character. 
+ xy) ECHO_C='\c';; + *) echo `echo ksh88 bug on AIX 6.1` > /dev/null + ECHO_T=' ';; + esac;; +*) + ECHO_N='-n';; +esac + +rm -f conf$$ conf$$.exe conf$$.file +if test -d conf$$.dir; then + rm -f conf$$.dir/conf$$.file +else + rm -f conf$$.dir + mkdir conf$$.dir 2>/dev/null +fi +if (echo >conf$$.file) 2>/dev/null; then + if ln -s conf$$.file conf$$ 2>/dev/null; then + as_ln_s='ln -s' + # ... but there are two gotchas: + # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. + # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. + # In both cases, we have to default to `cp -pR'. + ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || + as_ln_s='cp -pR' + elif ln conf$$.file conf$$ 2>/dev/null; then + as_ln_s=ln + else + as_ln_s='cp -pR' + fi +else + as_ln_s='cp -pR' +fi +rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file +rmdir conf$$.dir 2>/dev/null + + +# as_fn_mkdir_p +# ------------- +# Create "$as_dir" as a directory, including parents if necessary. +as_fn_mkdir_p () +{ + + case $as_dir in #( + -*) as_dir=./$as_dir;; + esac + test -d "$as_dir" || eval $as_mkdir_p || { + as_dirs= + while :; do + case $as_dir in #( + *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *) as_qdir=$as_dir;; + esac + as_dirs="'$as_qdir' $as_dirs" + as_dir=`$as_dirname -- "$as_dir" || +$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_dir" : 'X\(//\)[^/]' \| \ + X"$as_dir" : 'X\(//\)$' \| \ + X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$as_dir" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + test -d "$as_dir" && break + done + test -z "$as_dirs" || eval "mkdir $as_dirs" + } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" + + +} # as_fn_mkdir_p +if mkdir -p . 
2>/dev/null; then + as_mkdir_p='mkdir -p "$as_dir"' +else + test -d ./-p && rmdir ./-p + as_mkdir_p=false +fi + + +# as_fn_executable_p FILE +# ----------------------- +# Test if FILE is an executable regular file. +as_fn_executable_p () +{ + test -f "$1" && test -x "$1" +} # as_fn_executable_p +as_test_x='test -x' +as_executable_p=as_fn_executable_p + +# Sed expression to map a string onto a valid CPP name. +as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" + +# Sed expression to map a string onto a valid variable name. +as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" + + +exec 6>&1 +## ----------------------------------- ## +## Main body of $CONFIG_STATUS script. ## +## ----------------------------------- ## +# Save the log message, to keep $0 and so on meaningful, and to +# report actual input values of CONFIG_FILES etc. instead of their +# values after options handling. +ac_log=" +This file was extended by tmLQCD $as_me 5.2.0, which was +generated by GNU Autoconf 2.69. Invocation command line was + + CONFIG_FILES = $CONFIG_FILES + CONFIG_HEADERS = $CONFIG_HEADERS + CONFIG_LINKS = $CONFIG_LINKS + CONFIG_COMMANDS = $CONFIG_COMMANDS + $ $0 $@ + +on `(hostname || uname -n) 2>/dev/null | sed 1q` +" + +# Files that config.status was made for. +config_files=" Makefile operator/Makefile linalg/Makefile solver/Makefile monomial/Makefile buffers/Makefile cu/Makefile io/Makefile meas/Makefile xchange/Makefile init/Makefile rational/Makefile wrapper/Makefile" +config_headers=" config.h" + +ac_cs_usage="\ +\`$as_me' instantiates files and other configuration actions +from templates according to the current configuration. Unless the files +and actions are specified as TAGs, all are instantiated by default. + +Usage: $0 [OPTION]... [TAG]... 
+ + -h, --help print this help, then exit + -V, --version print version number and configuration settings, then exit + --config print configuration, then exit + -q, --quiet, --silent + do not print progress messages + -d, --debug don't remove temporary files + --recheck update $as_me by reconfiguring in the same conditions + --file=FILE[:TEMPLATE] + instantiate the configuration file FILE + --header=FILE[:TEMPLATE] + instantiate the configuration header FILE + +Configuration files: +$config_files + +Configuration headers: +$config_headers + +Report bugs to ." + +ac_cs_config="'--enable-mpi' '--with-mpidimension=4' '--enable-gaugecopy' 'CC=mpicc' 'CFLAGS=-std=c99 -fopenmp' 'F77=f95' '--enable-omp' 'LIBS=/usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1' '--with-lapack=/usr/lib/lapack/liblapack.so.3' '--enable-benchmark'" +ac_cs_version="\ +tmLQCD config.status 5.2.0 +configured by ./configure, generated by GNU Autoconf 2.69, + with options \"$ac_cs_config\" + +Copyright (C) 2012 Free Software Foundation, Inc. +This config.status script is free software; the Free Software Foundation +gives unlimited permission to copy, distribute and modify it." + +ac_pwd='/home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2' +srcdir='.' +test -n "$AWK" || AWK=awk +# The default lists apply if the user does not specify any file. +ac_need_defaults=: +while test $# != 0 +do + case $1 in + --*=?*) + ac_option=`expr "X$1" : 'X\([^=]*\)='` + ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'` + ac_shift=: + ;; + --*=) + ac_option=`expr "X$1" : 'X\([^=]*\)='` + ac_optarg= + ac_shift=: + ;; + *) + ac_option=$1 + ac_optarg=$2 + ac_shift=shift + ;; + esac + + case $ac_option in + # Handling of the options. 
+ -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + ac_cs_recheck=: ;; + --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) + $as_echo "$ac_cs_version"; exit ;; + --config | --confi | --conf | --con | --co | --c ) + $as_echo "$ac_cs_config"; exit ;; + --debug | --debu | --deb | --de | --d | -d ) + debug=: ;; + --file | --fil | --fi | --f ) + $ac_shift + case $ac_optarg in + *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; + '') as_fn_error $? "missing file argument" ;; + esac + as_fn_append CONFIG_FILES " '$ac_optarg'" + ac_need_defaults=false;; + --header | --heade | --head | --hea ) + $ac_shift + case $ac_optarg in + *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; + esac + as_fn_append CONFIG_HEADERS " '$ac_optarg'" + ac_need_defaults=false;; + --he | --h) + # Conflict between --help and --header + as_fn_error $? "ambiguous option: \`$1' +Try \`$0 --help' for more information.";; + --help | --hel | -h ) + $as_echo "$ac_cs_usage"; exit ;; + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil | --si | --s) + ac_cs_silent=: ;; + + # This is an error. + -*) as_fn_error $? "unrecognized option: \`$1' +Try \`$0 --help' for more information." 
;; + + *) as_fn_append ac_config_targets " $1" + ac_need_defaults=false ;; + + esac + shift +done + +ac_configure_extra_args= + +if $ac_cs_silent; then + exec 6>/dev/null + ac_configure_extra_args="$ac_configure_extra_args --silent" +fi + +if $ac_cs_recheck; then + set X /bin/bash './configure' '--enable-mpi' '--with-mpidimension=4' '--enable-gaugecopy' 'CC=mpicc' 'CFLAGS=-std=c99 -fopenmp' 'F77=f95' '--enable-omp' 'LIBS=/usr/lib/libblas.so.3 /usr/lib/x86_64-linux-gnu/libgomp.so.1' '--with-lapack=/usr/lib/lapack/liblapack.so.3' '--enable-benchmark' $ac_configure_extra_args --no-create --no-recursion + shift + $as_echo "running CONFIG_SHELL=/bin/bash $*" >&6 + CONFIG_SHELL='/bin/bash' + export CONFIG_SHELL + exec "$@" +fi + +exec 5>>config.log +{ + echo + sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX +## Running $as_me. ## +_ASBOX + $as_echo "$ac_log" +} >&5 + + +# Handling of arguments. +for ac_config_target in $ac_config_targets +do + case $ac_config_target in + "config.h") CONFIG_HEADERS="$CONFIG_HEADERS config.h" ;; + "fixed_volume.h") CONFIG_FILES="$CONFIG_FILES fixed_volume.h" ;; + "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;; + "$make_files") CONFIG_FILES="$CONFIG_FILES $make_files" ;; + + *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;; + esac +done + + +# If the user did not use the arguments to specify the items to instantiate, +# then the envvar interface is used. Set only those that are not. +# We use the long form for the default assignment because of an extremely +# bizarre bug on SunOS 4.1.3. +if $ac_need_defaults; then + test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files + test "${CONFIG_HEADERS+set}" = set || CONFIG_HEADERS=$config_headers +fi + +# Have a temporary directory for convenience. Make it in the build tree +# simply because there is no reason against having it here, and in addition, +# creating and moving files from /tmp can sometimes cause problems. 
+# Hook for its removal unless debugging. +# Note that there is a small window in which the directory will not be cleaned: +# after its creation but before its name has been assigned to `$tmp'. +$debug || +{ + tmp= ac_tmp= + trap 'exit_status=$? + : "${ac_tmp:=$tmp}" + { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status +' 0 + trap 'as_fn_exit 1' 1 2 13 15 +} +# Create a (secure) tmp directory for tmp files. + +{ + tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` && + test -d "$tmp" +} || +{ + tmp=./conf$$-$RANDOM + (umask 077 && mkdir "$tmp") +} || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5 +ac_tmp=$tmp + +# Set up the scripts for CONFIG_FILES section. +# No need to generate them if there are no CONFIG_FILES. +# This happens for instance with `./config.status config.h'. +if test -n "$CONFIG_FILES"; then + + +ac_cr=`echo X | tr X '\015'` +# On cygwin, bash can eat \r inside `` if the user requested igncr. +# But we know of no other shell where ac_cr would be empty at this +# point, so we can use a bashism as a fallback. +if test "x$ac_cr" = x; then + eval ac_cr=\$\'\\r\' +fi +ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' /dev/null` +if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then + ac_cs_awk_cr='\\r' +else + ac_cs_awk_cr=$ac_cr +fi + +echo 'BEGIN {' >"$ac_tmp/subs1.awk" && +cat >>"$ac_tmp/subs1.awk" <<\_ACAWK && +S["LTLIBOBJS"]=" ${LIBOBJDIR}malloc$U.lo" +S["QUDA_AVAILABLE"]="0" +S["ac_ct_CXX"]="g++" +S["CXXFLAGS"]="-g -O2" +S["CXX"]="g++" +S["GPUMPICOMPILER"]="" +S["GPUCFLAGS"]="" +S["GPUDIR"]="" +S["NVCC"]="" +S["USESUBDIRS"]="operator linalg solver monomial buffers cu io meas xchange init rational wrapper" +S["QUDA_INTERFACE"]="" +S["SPI_FILES"]="" +S["LEMON_AVAILABLE"]="0" +S["XLIB"]="" +S["MEASDIR"]="" +S["XCHANGEDIR"]="" +S["XCHANGELIB"]="" +S["PROFILE_FLAG"]="" +S["DEBUG_FLAG"]="-g" +S["DEPFLAGS"]="-MM" +S["CCLD"]="mpicc" +S["SOLVEROUT"]="" +S["AUTOCONF"]="autoconf" +S["INCLUDES"]=" -I$(HOME)/include/ -I. 
-I${abs_top_builddir}/ -I${abs_top_srcdir}/ -I/include/ -I/include/" +S["SOPTARGS"]="-O" +S["OPTARGS"]="-O" +S["LIBOBJS"]=" ${LIBOBJDIR}malloc$U.o" +S["OPENMP_CFLAGS"]="" +S["EGREP"]="/bin/grep -E" +S["GREP"]="/bin/grep" +S["CPP"]="mpicc -E" +S["CCDEP"]="gcc" +S["RANLIB"]="ranlib" +S["SET_MAKE"]="" +S["LEXLIB"]="-lfl" +S["LEX_OUTPUT_ROOT"]="lex.yy" +S["LEX"]="flex" +S["AR"]="ar" +S["FLIBS"]=" -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../../l"\ +"ib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../.. -lgfortran -lm -"\ +"lquadmath" +S["ac_ct_F77"]="" +S["FFLAGS"]="-g -O2" +S["F77"]="f95" +S["OBJEXT"]="o" +S["EXEEXT"]="" +S["ac_ct_CC"]="mpicc" +S["CPPFLAGS"]=" " +S["LDFLAGS"]=" -L${HOME}/lib -L${top_builddir}/lib " +S["CFLAGS"]="-std=c99 -fopenmp -pedantic -Wall" +S["CC"]="mpicc" +S["host_os"]="linux-gnu" +S["host_vendor"]="unknown" +S["host_cpu"]="x86_64" +S["host"]="x86_64-unknown-linux-gnu" +S["build_os"]="linux-gnu" +S["build_vendor"]="unknown" +S["build_cpu"]="x86_64" +S["build"]="x86_64-unknown-linux-gnu" +S["target_alias"]="" +S["host_alias"]="" +S["build_alias"]="" +S["LIBS"]="-lhmc -lmonomial -loperator -lsolver -linit -lmeas -llinalg -lhmc -lxchange -lrational -lio /usr/lib/lapack/liblapack.so.3 /usr/lib/libblas.so.3 /us"\ +"r/lib/x86_64-linux-gnu/libgomp.so.1 -L/usr/lib/gcc/x86_64-linux-gnu/4.8 -L/usr/lib/gcc/x86_64-linux-gnu/4.8/../../../x86_64-linux-gnu -L/usr/lib/gc"\ +"c/x86_64-linux-gnu/4.8/../../../../lib -L/lib/x86_64-linux-gnu -L/lib/../lib -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib -L/usr/lib/gcc/x86_64-lin"\ +"ux-gnu/4.8/../../.. 
-lgfortran -lm -lquadmath -lm" +S["ECHO_T"]="" +S["ECHO_N"]="-n" +S["ECHO_C"]="" +S["DEFS"]="-DHAVE_CONFIG_H" +S["mandir"]="${datarootdir}/man" +S["localedir"]="${datarootdir}/locale" +S["libdir"]="${exec_prefix}/lib" +S["psdir"]="${docdir}" +S["pdfdir"]="${docdir}" +S["dvidir"]="${docdir}" +S["htmldir"]="${docdir}" +S["infodir"]="${datarootdir}/info" +S["docdir"]="${datarootdir}/doc/${PACKAGE_TARNAME}" +S["oldincludedir"]="/usr/include" +S["includedir"]="${prefix}/include" +S["localstatedir"]="${prefix}/var" +S["sharedstatedir"]="${prefix}/com" +S["sysconfdir"]="${prefix}/etc" +S["datadir"]="${datarootdir}" +S["datarootdir"]="${prefix}/share" +S["libexecdir"]="${exec_prefix}/libexec" +S["sbindir"]="${exec_prefix}/sbin" +S["bindir"]="${exec_prefix}/bin" +S["program_transform_name"]="s,x,x," +S["prefix"]="/home/jacob" +S["exec_prefix"]="${prefix}" +S["PACKAGE_URL"]="" +S["PACKAGE_BUGREPORT"]="curbach@gmx.de" +S["PACKAGE_STRING"]="tmLQCD 5.2.0" +S["PACKAGE_VERSION"]="5.2.0" +S["PACKAGE_TARNAME"]="tmlqcd" +S["PACKAGE_NAME"]="tmLQCD" +S["PATH_SEPARATOR"]=":" +S["SHELL"]="/bin/bash" +_ACAWK +cat >>"$ac_tmp/subs1.awk" <<_ACAWK && + for (key in S) S_is_set[key] = 1 + FS = "" + +} +{ + line = $ 0 + nfields = split(line, field, "@") + substed = 0 + len = length(field[1]) + for (i = 2; i < nfields; i++) { + key = field[i] + keylen = length(key) + if (S_is_set[key]) { + value = S[key] + line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3) + len += length(value) + length(field[++i]) + substed = 1 + } else + len += 1 + keylen + } + + print line +} + +_ACAWK +if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then + sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g" +else + cat +fi < "$ac_tmp/subs1.awk" > "$ac_tmp/subs.awk" \ + || as_fn_error $? "could not setup config files machinery" "$LINENO" 5 +fi # test -n "$CONFIG_FILES" + +# Set up the scripts for CONFIG_HEADERS section. +# No need to generate them if there are no CONFIG_HEADERS. 
+# This happens for instance with `./config.status Makefile'. +if test -n "$CONFIG_HEADERS"; then +cat >"$ac_tmp/defines.awk" <<\_ACAWK || +BEGIN { +D["PACKAGE_NAME"]=" \"tmLQCD\"" +D["PACKAGE_TARNAME"]=" \"tmlqcd\"" +D["PACKAGE_VERSION"]=" \"5.2.0\"" +D["PACKAGE_STRING"]=" \"tmLQCD 5.2.0\"" +D["PACKAGE_BUGREPORT"]=" \"curbach@gmx.de\"" +D["PACKAGE_URL"]=" \"\"" +D["restrict"]=" __restrict" +D["YYTEXT_POINTER"]=" 1" +D["STDC_HEADERS"]=" 1" +D["HAVE_SYS_TYPES_H"]=" 1" +D["HAVE_SYS_STAT_H"]=" 1" +D["HAVE_STDLIB_H"]=" 1" +D["HAVE_STRING_H"]=" 1" +D["HAVE_MEMORY_H"]=" 1" +D["HAVE_STRINGS_H"]=" 1" +D["HAVE_INTTYPES_H"]=" 1" +D["HAVE_STDINT_H"]=" 1" +D["HAVE_UNISTD_H"]=" 1" +D["HAVE_STDINT_H"]=" 1" +D["HAVE_UINT16_T"]=" 1" +D["HAVE_UINT32_T"]=" 1" +D["HAVE_UINT64_T"]=" 1" +D["BENCHMARK"]=" 1" +D["MPI"]=" 1" +D["OMP"]=" 1" +D["HAVE_OMP_H"]=" 1" +D["PARALLELXYZT"]=" 1" +D["_NON_BLOCKING"]=" 1" +D["HAVE_LAPACK"]=" 1" +D["HAVE_CLOCK_GETTIME"]=" 1" +D["HAVE_CLOCK_GETTIME"]=" 1" +D["STDC_HEADERS"]=" 1" +D["HAVE_FLOAT_H"]=" 1" +D["HAVE_LIBINTL_H"]=" 1" +D["HAVE_LIMITS_H"]=" 1" +D["HAVE_STDINT_H"]=" 1" +D["HAVE_STDLIB_H"]=" 1" +D["HAVE_STRING_H"]=" 1" +D["HAVE_STRINGS_H"]=" 1" +D["HAVE_SYS_TIME_H"]=" 1" +D["HAVE_UNISTD_H"]=" 1" +D["HAVE_ENDIAN_H"]=" 1" +D["TIME_WITH_SYS_TIME"]=" 1" +D["_LARGEFILE_SOURCE"]=" 1" +D["HAVE_FSEEKO"]=" 1" +D["HAVE_STDLIB_H"]=" 1" +D["HAVE_MALLOC"]=" 0" +D["malloc"]=" rpl_malloc" +D["RETSIGTYPE"]=" void" +D["HAVE_GETTIMEOFDAY"]=" 1" +D["HAVE_POW"]=" 1" +D["HAVE_SQRT"]=" 1" +D["ALIGN_BASE"]=" 0x00" +D["ALIGN"]=" /**/" +D["ALIGN_BASE32"]=" 0x00" +D["ALIGN32"]=" /**/" +D["_USE_BGLDRAM"]=" 1" +D["_x86_64"]=" 1" +D["_GAUGE_COPY"]=" 1" +D["_USE_HALFSPINOR"]=" 1" + for (key in D) D_is_set[key] = 1 + FS = "" +} +/^[\t ]*#[\t ]*(define|undef)[\t ]+[_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ][_abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789]*([\t (]|$)/ { + line = $ 0 + split(line, arg, " ") + if (arg[1] == "#") { + defundef = arg[2] + 
mac1 = arg[3] + } else { + defundef = substr(arg[1], 2) + mac1 = arg[2] + } + split(mac1, mac2, "(") #) + macro = mac2[1] + prefix = substr(line, 1, index(line, defundef) - 1) + if (D_is_set[macro]) { + # Preserve the white space surrounding the "#". + print prefix "define", macro P[macro] D[macro] + next + } else { + # Replace #undef with comments. This is necessary, for example, + # in the case of _POSIX_SOURCE, which is predefined and required + # on some systems where configure will not decide to define it. + if (defundef == "undef") { + print "/*", prefix defundef, macro, "*/" + next + } + } +} +{ print } +_ACAWK + as_fn_error $? "could not setup config headers machinery" "$LINENO" 5 +fi # test -n "$CONFIG_HEADERS" + + +eval set X " :F $CONFIG_FILES :H $CONFIG_HEADERS " +shift +for ac_tag +do + case $ac_tag in + :[FHLC]) ac_mode=$ac_tag; continue;; + esac + case $ac_mode$ac_tag in + :[FHL]*:*);; + :L* | :C*:*) as_fn_error $? "invalid tag \`$ac_tag'" "$LINENO" 5;; + :[FH]-) ac_tag=-:-;; + :[FH]*) ac_tag=$ac_tag:$ac_tag.in;; + esac + ac_save_IFS=$IFS + IFS=: + set x $ac_tag + IFS=$ac_save_IFS + shift + ac_file=$1 + shift + + case $ac_mode in + :L) ac_source=$1;; + :[FH]) + ac_file_inputs= + for ac_f + do + case $ac_f in + -) ac_f="$ac_tmp/stdin";; + *) # Look for the file first in the build tree, then in the source tree + # (if the path is not absolute). The absolute path cannot be DOS-style, + # because $ac_f cannot contain `:'. + test -f "$ac_f" || + case $ac_f in + [\\/$]*) false;; + *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";; + esac || + as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;; + esac + case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac + as_fn_append ac_file_inputs " '$ac_f'" + done + + # Let's still pretend it is `configure' which instantiates (i.e., don't + # use $as_me), people would be surprised to read: + # /* config.h. Generated by config.status. 
*/ + configure_input='Generated from '` + $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g' + `' by configure.' + if test x"$ac_file" != x-; then + configure_input="$ac_file. $configure_input" + { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5 +$as_echo "$as_me: creating $ac_file" >&6;} + fi + # Neutralize special characters interpreted by sed in replacement strings. + case $configure_input in #( + *\&* | *\|* | *\\* ) + ac_sed_conf_input=`$as_echo "$configure_input" | + sed 's/[\\\\&|]/\\\\&/g'`;; #( + *) ac_sed_conf_input=$configure_input;; + esac + + case $ac_tag in + *:-:* | *:-) cat >"$ac_tmp/stdin" \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;; + esac + ;; + esac + + ac_dir=`$as_dirname -- "$ac_file" || +$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$ac_file" : 'X\(//\)[^/]' \| \ + X"$ac_file" : 'X\(//\)$' \| \ + X"$ac_file" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$ac_file" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + as_dir="$ac_dir"; as_fn_mkdir_p + ac_builddir=. + +case "$ac_dir" in +.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; +*) + ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + # A ".." for each directory in $ac_dir_suffix. + ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + case $ac_top_builddir_sub in + "") ac_top_builddir_sub=. ac_top_build_prefix= ;; + *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; + esac ;; +esac +ac_abs_top_builddir=$ac_pwd +ac_abs_builddir=$ac_pwd$ac_dir_suffix +# for backward compatibility: +ac_top_builddir=$ac_top_build_prefix + +case $srcdir in + .) # We are building in place. + ac_srcdir=. + ac_top_srcdir=$ac_top_builddir_sub + ac_abs_top_srcdir=$ac_pwd ;; + [\\/]* | ?:[\\/]* ) # Absolute name. 
+ ac_srcdir=$srcdir$ac_dir_suffix; + ac_top_srcdir=$srcdir + ac_abs_top_srcdir=$srcdir ;; + *) # Relative name. + ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix + ac_top_srcdir=$ac_top_build_prefix$srcdir + ac_abs_top_srcdir=$ac_pwd/$srcdir ;; +esac +ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix + + + case $ac_mode in + :F) + # + # CONFIG_FILE + # + +# If the template does not know about datarootdir, expand it. +# FIXME: This hack should be removed a few years after 2.60. +ac_datarootdir_hack=; ac_datarootdir_seen= +ac_sed_dataroot=' +/datarootdir/ { + p + q +} +/@datadir@/p +/@docdir@/p +/@infodir@/p +/@localedir@/p +/@mandir@/p' +case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in +*datarootdir*) ac_datarootdir_seen=yes;; +*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 +$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} + ac_datarootdir_hack=' + s&@datadir@&${datarootdir}&g + s&@docdir@&${datarootdir}/doc/${PACKAGE_TARNAME}&g + s&@infodir@&${datarootdir}/info&g + s&@localedir@&${datarootdir}/locale&g + s&@mandir@&${datarootdir}/man&g + s&\${datarootdir}&${prefix}/share&g' ;; +esac +ac_sed_extra="/^[ ]*VPATH[ ]*=[ ]*/{ +h +s/// +s/^/:/ +s/[ ]*$/:/ +s/:\$(srcdir):/:/g +s/:\${srcdir}:/:/g +s/:@srcdir@:/:/g +s/^:*// +s/:*$// +x +s/\(=[ ]*\).*/\1/ +G +s/\n// +s/^[^=]*=[ ]*$// +} + +:t +/@[a-zA-Z_][a-zA-Z_0-9]*@/!b +s|@configure_input@|$ac_sed_conf_input|;t t +s&@top_builddir@&$ac_top_builddir_sub&;t t +s&@top_build_prefix@&$ac_top_build_prefix&;t t +s&@srcdir@&$ac_srcdir&;t t +s&@abs_srcdir@&$ac_abs_srcdir&;t t +s&@top_srcdir@&$ac_top_srcdir&;t t +s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t +s&@builddir@&$ac_builddir&;t t +s&@abs_builddir@&$ac_abs_builddir&;t t +s&@abs_top_builddir@&$ac_abs_top_builddir&;t t +$ac_datarootdir_hack +" +eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK 
-f "$ac_tmp/subs.awk" \ + >$ac_tmp/out || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + +test -z "$ac_datarootdir_hack$ac_datarootdir_seen" && + { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } && + { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' \ + "$ac_tmp/out"`; test -z "$ac_out"; } && + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined" >&5 +$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined" >&2;} + + rm -f "$ac_tmp/stdin" + case $ac_file in + -) cat "$ac_tmp/out" && rm -f "$ac_tmp/out";; + *) rm -f "$ac_file" && mv "$ac_tmp/out" "$ac_file";; + esac \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + ;; + :H) + # + # CONFIG_HEADER + # + if test x"$ac_file" != x-; then + { + $as_echo "/* $configure_input */" \ + && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" + } >"$ac_tmp/config.h" \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + if diff "$ac_file" "$ac_tmp/config.h" >/dev/null 2>&1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: $ac_file is unchanged" >&5 +$as_echo "$as_me: $ac_file is unchanged" >&6;} + else + rm -f "$ac_file" + mv "$ac_tmp/config.h" "$ac_file" \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + fi + else + $as_echo "/* $configure_input */" \ + && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" \ + || as_fn_error $? "could not create -" "$LINENO" 5 + fi + ;; + + + esac + +done # for ac_tag + + +as_fn_exit 0 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/config.sub b/qcd/part_cpu/applications/QCD/src/kernel_D/config.sub new file mode 100755 index 0000000000000000000000000000000000000000..5caee604dc1c927b1a9541bfff3e93e59eb65d5f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/config.sub @@ -0,0 +1,1625 @@ +#! 
/bin/sh +# Configuration validation subroutine script. +# Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, +# 2000, 2001, 2002, 2003, 2004, 2005, 2006 Free Software Foundation, +# Inc. + +timestamp='2006-11-07' + +# This file is (in principle) common to ALL GNU software. +# The presence of a machine in this file suggests that SOME GNU software +# can handle that machine. It does not imply ALL GNU software can. +# +# This file is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA +# 02110-1301, USA. +# +# As a special exception to the GNU General Public License, if you +# distribute this file as part of a program that contains a +# configuration script generated by Autoconf, you may include it under +# the same distribution terms that you use for the rest of that program. + + +# Please send patches to <config-patches@gnu.org>. Submit a context +# diff and a properly formatted ChangeLog entry. +# +# Configuration subroutine to validate and canonicalize a configuration type. +# Supply the specified configuration type as an argument. +# If it is invalid, we print an error message on stderr and exit with code 1. +# Otherwise, we print the canonical config type on stdout and succeed. + +# This file is supposed to be the same for all GNU packages +# and recognize all the CPU types, system types and aliases +# that are meaningful with *any* GNU software.
# Each package is responsible for reporting which valid configurations
# it does not support.  The user should be able to distinguish
# a failure to support a valid configuration from a meaningless
# configuration.

# The goal of this file is to map all the various variations of a given
# machine specification into a single specification in the form:
#	CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM
# or in some cases, the newer four-part form:
#	CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM
# It is wrong to echo any other type of specification.

# Name this script was invoked as, with any leading directory part removed.
# It prefixes every diagnostic printed below.
me=`echo "$0" | sed -e 's,.*/,,'`

# Text printed for -h / --help.
usage="\
Usage: $0 [OPTION] CPU-MFR-OPSYS
       $0 [OPTION] ALIAS

Canonicalize a configuration name.

Operation modes:
  -h, --help         print this help, then exit
  -t, --time-stamp   print date of last modification, then exit
  -v, --version      print version number, then exit

Report bugs and patches to <config-patches@gnu.org>."

# Text printed for -v / --version.  $timestamp is assigned earlier in
# this script.
version="\
GNU config.sub ($timestamp)

Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005
Free Software Foundation, Inc.

This is free software; see the source for copying conditions.  There is NO
warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."

# Hint appended to every usage error.  It starts with a newline so that it
# lands on its own line after the diagnostic it is appended to.
help="
Try \`$me --help' for more information."

# Parse command line.
# Informational options print their text and exit.  The loop is left, with
# the configuration name still in $1, at `--', at `-', or at the first
# argument that is not an option.
while test $# -gt 0 ; do
  case $1 in
    --time-stamp | --time* | -t )
       echo "$timestamp" ; exit ;;
    --version | -v )
       echo "$version" ; exit ;;
    --help | --h* | -h )
       echo "$usage"; exit ;;
    -- )     # Stop option processing
       shift; break ;;
    - )	# Use stdin as input.
       break ;;
    -* )
       # Diagnostics belong on stderr, exactly like the argument-count
       # errors below: stdout carries the result (see the *local* arm), so
       # a caller capturing it must never receive an error message there.
       echo "$me: invalid option $1$help" >&2
       exit 1 ;;

    *local*)
       # First pass through any local machine types.
       # Quoted so the name is printed verbatim instead of being
       # field-split and glob-expanded by the shell.
       echo "$1"
       exit ;;

    * )
       break ;;
  esac
done

# Exactly one configuration name must remain after option processing.
case $# in
 0) echo "$me: missing argument$help" >&2
    exit 1;;
 1) ;;
 *) echo "$me: too many arguments$help" >&2
    exit 1;;
esac

# Separate what the user gave into CPU-COMPANY and OS or KERNEL-OS (if any).
+# Here we must recognize all the valid KERNEL-OS combinations. +maybe_os=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\2/'` +case $maybe_os in + nto-qnx* | linux-gnu* | linux-dietlibc | linux-newlib* | linux-uclibc* | \ + uclinux-uclibc* | uclinux-gnu* | kfreebsd*-gnu* | knetbsd*-gnu* | netbsd*-gnu* | \ + storm-chaos* | os2-emx* | rtmk-nova*) + os=-$maybe_os + basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` + ;; + *) + basic_machine=`echo $1 | sed 's/-[^-]*$//'` + if [ $basic_machine != $1 ] + then os=`echo $1 | sed 's/.*-/-/'` + else os=; fi + ;; +esac + +### Let's recognize common machines as not being operating systems so +### that things like config.sub decstation-3100 work. We also +### recognize some manufacturers as not being operating systems, so we +### can provide default operating systems below. +case $os in + -sun*os*) + # Prevent following clause from handling this invalid input. + ;; + -dec* | -mips* | -sequent* | -encore* | -pc532* | -sgi* | -sony* | \ + -att* | -7300* | -3300* | -delta* | -motorola* | -sun[234]* | \ + -unicom* | -ibm* | -next | -hp | -isi* | -apollo | -altos* | \ + -convergent* | -ncr* | -news | -32* | -3600* | -3100* | -hitachi* |\ + -c[123]* | -convex* | -sun | -crds | -omron* | -dg | -ultra | -tti* | \ + -harris | -dolphin | -highlevel | -gould | -cbm | -ns | -masscomp | \ + -apple | -axis | -knuth | -cray) + os= + basic_machine=$1 + ;; + -sim | -cisco | -oki | -wec | -winbond) + os= + basic_machine=$1 + ;; + -scout) + ;; + -wrs) + os=-vxworks + basic_machine=$1 + ;; + -chorusos*) + os=-chorusos + basic_machine=$1 + ;; + -chorusrdb) + os=-chorusrdb + basic_machine=$1 + ;; + -hiux*) + os=-hiuxwe2 + ;; + -sco6) + os=-sco5v6 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco5) + os=-sco3.2v5 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco4) + os=-sco3.2v4 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco3.2.[4-9]*) + os=`echo $os | sed -e 's/sco3.2./sco3.2v/'` + 
basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco3.2v[4-9]*) + # Don't forget version if it is 3.2v4 or newer. + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco5v6*) + # Don't forget version if it is 3.2v4 or newer. + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -sco*) + os=-sco3.2v2 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -udk*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -isc) + os=-isc2.2 + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -clix*) + basic_machine=clipper-intergraph + ;; + -isc*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-pc/'` + ;; + -lynx*) + os=-lynxos + ;; + -ptx*) + basic_machine=`echo $1 | sed -e 's/86-.*/86-sequent/'` + ;; + -windowsnt*) + os=`echo $os | sed -e 's/windowsnt/winnt/'` + ;; + -psos*) + os=-psos + ;; + -mint | -mint[0-9]*) + basic_machine=m68k-atari + os=-mint + ;; +esac + +# Decode aliases for certain CPU-COMPANY combinations. +case $basic_machine in + # Recognize the basic CPU types without company name. + # Some are omitted here because they have special meanings below. 
+ 1750a | 580 \ + | a29k \ + | alpha | alphaev[4-8] | alphaev56 | alphaev6[78] | alphapca5[67] \ + | alpha64 | alpha64ev[4-8] | alpha64ev56 | alpha64ev6[78] | alpha64pca5[67] \ + | am33_2.0 \ + | arc | arm | arm[bl]e | arme[lb] | armv[2345] | armv[345][lb] | avr | avr32 \ + | bfin \ + | c4x | clipper \ + | d10v | d30v | dlx | dsp16xx \ + | fr30 | frv \ + | h8300 | h8500 | hppa | hppa1.[01] | hppa2.0 | hppa2.0[nw] | hppa64 \ + | i370 | i860 | i960 | ia64 \ + | ip2k | iq2000 \ + | m32c | m32r | m32rle | m68000 | m68k | m88k \ + | maxq | mb | microblaze | mcore \ + | mips | mipsbe | mipseb | mipsel | mipsle \ + | mips16 \ + | mips64 | mips64el \ + | mips64vr | mips64vrel \ + | mips64orion | mips64orionel \ + | mips64vr4100 | mips64vr4100el \ + | mips64vr4300 | mips64vr4300el \ + | mips64vr5000 | mips64vr5000el \ + | mips64vr5900 | mips64vr5900el \ + | mipsisa32 | mipsisa32el \ + | mipsisa32r2 | mipsisa32r2el \ + | mipsisa64 | mipsisa64el \ + | mipsisa64r2 | mipsisa64r2el \ + | mipsisa64sb1 | mipsisa64sb1el \ + | mipsisa64sr71k | mipsisa64sr71kel \ + | mipstx39 | mipstx39el \ + | mn10200 | mn10300 \ + | mt \ + | msp430 \ + | nios | nios2 \ + | ns16k | ns32k \ + | or32 \ + | pdp10 | pdp11 | pj | pjl \ + | powerpc | powerpc64 | powerpc64le | powerpcle | ppcbe \ + | pyramid \ + | score \ + | sh | sh[1234] | sh[24]a | sh[23]e | sh[34]eb | sheb | shbe | shle | sh[1234]le | sh3ele \ + | sh64 | sh64le \ + | sparc | sparc64 | sparc64b | sparc64v | sparc86x | sparclet | sparclite \ + | sparcv8 | sparcv9 | sparcv9b | sparcv9v \ + | spu | strongarm \ + | tahoe | thumb | tic4x | tic80 | tron \ + | v850 | v850e \ + | we32k \ + | x86 | xc16x | xscale | xscalee[bl] | xstormy16 | xtensa \ + | z8k) + basic_machine=$basic_machine-unknown + ;; + m6811 | m68hc11 | m6812 | m68hc12) + # Motorola 68HC11/12. 
+ basic_machine=$basic_machine-unknown + os=-none + ;; + m88110 | m680[12346]0 | m683?2 | m68360 | m5200 | v70 | w65 | z8k) + ;; + ms1) + basic_machine=mt-unknown + ;; + + # We use `pc' rather than `unknown' + # because (1) that's what they normally are, and + # (2) the word "unknown" tends to confuse beginning users. + i*86 | x86_64) + basic_machine=$basic_machine-pc + ;; + # Object if more than one company name word. + *-*-*) + echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + exit 1 + ;; + # Recognize the basic CPU types with company name. + 580-* \ + | a29k-* \ + | alpha-* | alphaev[4-8]-* | alphaev56-* | alphaev6[78]-* \ + | alpha64-* | alpha64ev[4-8]-* | alpha64ev56-* | alpha64ev6[78]-* \ + | alphapca5[67]-* | alpha64pca5[67]-* | arc-* \ + | arm-* | armbe-* | armle-* | armeb-* | armv*-* \ + | avr-* | avr32-* \ + | bfin-* | bs2000-* \ + | c[123]* | c30-* | [cjt]90-* | c4x-* | c54x-* | c55x-* | c6x-* \ + | clipper-* | craynv-* | cydra-* \ + | d10v-* | d30v-* | dlx-* \ + | elxsi-* \ + | f30[01]-* | f700-* | fr30-* | frv-* | fx80-* \ + | h8300-* | h8500-* \ + | hppa-* | hppa1.[01]-* | hppa2.0-* | hppa2.0[nw]-* | hppa64-* \ + | i*86-* | i860-* | i960-* | ia64-* \ + | ip2k-* | iq2000-* \ + | m32c-* | m32r-* | m32rle-* \ + | m68000-* | m680[012346]0-* | m68360-* | m683?2-* | m68k-* \ + | m88110-* | m88k-* | maxq-* | mcore-* \ + | mips-* | mipsbe-* | mipseb-* | mipsel-* | mipsle-* \ + | mips16-* \ + | mips64-* | mips64el-* \ + | mips64vr-* | mips64vrel-* \ + | mips64orion-* | mips64orionel-* \ + | mips64vr4100-* | mips64vr4100el-* \ + | mips64vr4300-* | mips64vr4300el-* \ + | mips64vr5000-* | mips64vr5000el-* \ + | mips64vr5900-* | mips64vr5900el-* \ + | mipsisa32-* | mipsisa32el-* \ + | mipsisa32r2-* | mipsisa32r2el-* \ + | mipsisa64-* | mipsisa64el-* \ + | mipsisa64r2-* | mipsisa64r2el-* \ + | mipsisa64sb1-* | mipsisa64sb1el-* \ + | mipsisa64sr71k-* | mipsisa64sr71kel-* \ + | mipstx39-* | mipstx39el-* \ + | mmix-* \ + | mt-* \ + | 
msp430-* \ + | nios-* | nios2-* \ + | none-* | np1-* | ns16k-* | ns32k-* \ + | orion-* \ + | pdp10-* | pdp11-* | pj-* | pjl-* | pn-* | power-* \ + | powerpc-* | powerpc64-* | powerpc64le-* | powerpcle-* | ppcbe-* \ + | pyramid-* \ + | romp-* | rs6000-* \ + | sh-* | sh[1234]-* | sh[24]a-* | sh[23]e-* | sh[34]eb-* | sheb-* | shbe-* \ + | shle-* | sh[1234]le-* | sh3ele-* | sh64-* | sh64le-* \ + | sparc-* | sparc64-* | sparc64b-* | sparc64v-* | sparc86x-* | sparclet-* \ + | sparclite-* \ + | sparcv8-* | sparcv9-* | sparcv9b-* | sparcv9v-* | strongarm-* | sv1-* | sx?-* \ + | tahoe-* | thumb-* \ + | tic30-* | tic4x-* | tic54x-* | tic55x-* | tic6x-* | tic80-* \ + | tron-* \ + | v850-* | v850e-* | vax-* \ + | we32k-* \ + | x86-* | x86_64-* | xc16x-* | xps100-* | xscale-* | xscalee[bl]-* \ + | xstormy16-* | xtensa-* \ + | ymp-* \ + | z8k-*) + ;; + # Recognize the various machine names and aliases which stand + # for a CPU type and a company and sometimes even an OS. + 386bsd) + basic_machine=i386-unknown + os=-bsd + ;; + 3b1 | 7300 | 7300-att | att-7300 | pc7300 | safari | unixpc) + basic_machine=m68000-att + ;; + 3b*) + basic_machine=we32k-att + ;; + a29khif) + basic_machine=a29k-amd + os=-udi + ;; + abacus) + basic_machine=abacus-unknown + ;; + adobe68k) + basic_machine=m68010-adobe + os=-scout + ;; + alliant | fx80) + basic_machine=fx80-alliant + ;; + altos | altos3068) + basic_machine=m68k-altos + ;; + am29k) + basic_machine=a29k-none + os=-bsd + ;; + amd64) + basic_machine=x86_64-pc + ;; + amd64-*) + basic_machine=x86_64-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + amdahl) + basic_machine=580-amdahl + os=-sysv + ;; + amiga | amiga-*) + basic_machine=m68k-unknown + ;; + amigaos | amigados) + basic_machine=m68k-unknown + os=-amigaos + ;; + amigaunix | amix) + basic_machine=m68k-unknown + os=-sysv4 + ;; + apollo68) + basic_machine=m68k-apollo + os=-sysv + ;; + apollo68bsd) + basic_machine=m68k-apollo + os=-bsd + ;; + aux) + basic_machine=m68k-apple + os=-aux + ;; + 
balance) + basic_machine=ns32k-sequent + os=-dynix + ;; + c90) + basic_machine=c90-cray + os=-unicos + ;; + convex-c1) + basic_machine=c1-convex + os=-bsd + ;; + convex-c2) + basic_machine=c2-convex + os=-bsd + ;; + convex-c32) + basic_machine=c32-convex + os=-bsd + ;; + convex-c34) + basic_machine=c34-convex + os=-bsd + ;; + convex-c38) + basic_machine=c38-convex + os=-bsd + ;; + cray | j90) + basic_machine=j90-cray + os=-unicos + ;; + craynv) + basic_machine=craynv-cray + os=-unicosmp + ;; + cr16c) + basic_machine=cr16c-unknown + os=-elf + ;; + crds | unos) + basic_machine=m68k-crds + ;; + crisv32 | crisv32-* | etraxfs*) + basic_machine=crisv32-axis + ;; + cris | cris-* | etrax*) + basic_machine=cris-axis + ;; + crx) + basic_machine=crx-unknown + os=-elf + ;; + da30 | da30-*) + basic_machine=m68k-da30 + ;; + decstation | decstation-3100 | pmax | pmax-* | pmin | dec3100 | decstatn) + basic_machine=mips-dec + ;; + decsystem10* | dec10*) + basic_machine=pdp10-dec + os=-tops10 + ;; + decsystem20* | dec20*) + basic_machine=pdp10-dec + os=-tops20 + ;; + delta | 3300 | motorola-3300 | motorola-delta \ + | 3300-motorola | delta-motorola) + basic_machine=m68k-motorola + ;; + delta88) + basic_machine=m88k-motorola + os=-sysv3 + ;; + djgpp) + basic_machine=i586-pc + os=-msdosdjgpp + ;; + dpx20 | dpx20-*) + basic_machine=rs6000-bull + os=-bosx + ;; + dpx2* | dpx2*-bull) + basic_machine=m68k-bull + os=-sysv3 + ;; + ebmon29k) + basic_machine=a29k-amd + os=-ebmon + ;; + elxsi) + basic_machine=elxsi-elxsi + os=-bsd + ;; + encore | umax | mmax) + basic_machine=ns32k-encore + ;; + es1800 | OSE68k | ose68k | ose | OSE) + basic_machine=m68k-ericsson + os=-ose + ;; + fx2800) + basic_machine=i860-alliant + ;; + genix) + basic_machine=ns32k-ns + ;; + gmicro) + basic_machine=tron-gmicro + os=-sysv + ;; + go32) + basic_machine=i386-pc + os=-go32 + ;; + h3050r* | hiux*) + basic_machine=hppa1.1-hitachi + os=-hiuxwe2 + ;; + h8300hms) + basic_machine=h8300-hitachi + os=-hms + ;; + h8300xray) 
+ basic_machine=h8300-hitachi + os=-xray + ;; + h8500hms) + basic_machine=h8500-hitachi + os=-hms + ;; + harris) + basic_machine=m88k-harris + os=-sysv3 + ;; + hp300-*) + basic_machine=m68k-hp + ;; + hp300bsd) + basic_machine=m68k-hp + os=-bsd + ;; + hp300hpux) + basic_machine=m68k-hp + os=-hpux + ;; + hp3k9[0-9][0-9] | hp9[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hp9k2[0-9][0-9] | hp9k31[0-9]) + basic_machine=m68000-hp + ;; + hp9k3[2-9][0-9]) + basic_machine=m68k-hp + ;; + hp9k6[0-9][0-9] | hp6[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hp9k7[0-79][0-9] | hp7[0-79][0-9]) + basic_machine=hppa1.1-hp + ;; + hp9k78[0-9] | hp78[0-9]) + # FIXME: really hppa2.0-hp + basic_machine=hppa1.1-hp + ;; + hp9k8[67]1 | hp8[67]1 | hp9k80[24] | hp80[24] | hp9k8[78]9 | hp8[78]9 | hp9k893 | hp893) + # FIXME: really hppa2.0-hp + basic_machine=hppa1.1-hp + ;; + hp9k8[0-9][13679] | hp8[0-9][13679]) + basic_machine=hppa1.1-hp + ;; + hp9k8[0-9][0-9] | hp8[0-9][0-9]) + basic_machine=hppa1.0-hp + ;; + hppa-next) + os=-nextstep3 + ;; + hppaosf) + basic_machine=hppa1.1-hp + os=-osf + ;; + hppro) + basic_machine=hppa1.1-hp + os=-proelf + ;; + i370-ibm* | ibm*) + basic_machine=i370-ibm + ;; +# I'm not sure what "Sysv32" means. Should this be sysv3.2? 
+ i*86v32) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv32 + ;; + i*86v4*) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv4 + ;; + i*86v) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-sysv + ;; + i*86sol2) + basic_machine=`echo $1 | sed -e 's/86.*/86-pc/'` + os=-solaris2 + ;; + i386mach) + basic_machine=i386-mach + os=-mach + ;; + i386-vsta | vsta) + basic_machine=i386-unknown + os=-vsta + ;; + iris | iris4d) + basic_machine=mips-sgi + case $os in + -irix*) + ;; + *) + os=-irix4 + ;; + esac + ;; + isi68 | isi) + basic_machine=m68k-isi + os=-sysv + ;; + m88k-omron*) + basic_machine=m88k-omron + ;; + magnum | m3230) + basic_machine=mips-mips + os=-sysv + ;; + merlin) + basic_machine=ns32k-utek + os=-sysv + ;; + mingw32) + basic_machine=i386-pc + os=-mingw32 + ;; + miniframe) + basic_machine=m68000-convergent + ;; + *mint | -mint[0-9]* | *MiNT | *MiNT[0-9]*) + basic_machine=m68k-atari + os=-mint + ;; + mips3*-*) + basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'` + ;; + mips3*) + basic_machine=`echo $basic_machine | sed -e 's/mips3/mips64/'`-unknown + ;; + monitor) + basic_machine=m68k-rom68k + os=-coff + ;; + morphos) + basic_machine=powerpc-unknown + os=-morphos + ;; + msdos) + basic_machine=i386-pc + os=-msdos + ;; + ms1-*) + basic_machine=`echo $basic_machine | sed -e 's/ms1-/mt-/'` + ;; + mvs) + basic_machine=i370-ibm + os=-mvs + ;; + ncr3000) + basic_machine=i486-ncr + os=-sysv4 + ;; + netbsd386) + basic_machine=i386-unknown + os=-netbsd + ;; + netwinder) + basic_machine=armv4l-rebel + os=-linux + ;; + news | news700 | news800 | news900) + basic_machine=m68k-sony + os=-newsos + ;; + news1000) + basic_machine=m68030-sony + os=-newsos + ;; + news-3600 | risc-news) + basic_machine=mips-sony + os=-newsos + ;; + necv70) + basic_machine=v70-nec + os=-sysv + ;; + next | m*-next ) + basic_machine=m68k-next + case $os in + -nextstep* ) + ;; + -ns2*) + os=-nextstep2 + ;; + *) + os=-nextstep3 + ;; + esac + ;; + nh3000) 
+ basic_machine=m68k-harris + os=-cxux + ;; + nh[45]000) + basic_machine=m88k-harris + os=-cxux + ;; + nindy960) + basic_machine=i960-intel + os=-nindy + ;; + mon960) + basic_machine=i960-intel + os=-mon960 + ;; + nonstopux) + basic_machine=mips-compaq + os=-nonstopux + ;; + np1) + basic_machine=np1-gould + ;; + nsr-tandem) + basic_machine=nsr-tandem + ;; + op50n-* | op60c-*) + basic_machine=hppa1.1-oki + os=-proelf + ;; + openrisc | openrisc-*) + basic_machine=or32-unknown + ;; + os400) + basic_machine=powerpc-ibm + os=-os400 + ;; + OSE68000 | ose68000) + basic_machine=m68000-ericsson + os=-ose + ;; + os68k) + basic_machine=m68k-none + os=-os68k + ;; + pa-hitachi) + basic_machine=hppa1.1-hitachi + os=-hiuxwe2 + ;; + paragon) + basic_machine=i860-intel + os=-osf + ;; + pbd) + basic_machine=sparc-tti + ;; + pbb) + basic_machine=m68k-tti + ;; + pc532 | pc532-*) + basic_machine=ns32k-pc532 + ;; + pc98) + basic_machine=i386-pc + ;; + pc98-*) + basic_machine=i386-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentium | p5 | k5 | k6 | nexgen | viac3) + basic_machine=i586-pc + ;; + pentiumpro | p6 | 6x86 | athlon | athlon_*) + basic_machine=i686-pc + ;; + pentiumii | pentium2 | pentiumiii | pentium3) + basic_machine=i686-pc + ;; + pentium4) + basic_machine=i786-pc + ;; + pentium-* | p5-* | k5-* | k6-* | nexgen-* | viac3-*) + basic_machine=i586-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentiumpro-* | p6-* | 6x86-* | athlon-*) + basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentiumii-* | pentium2-* | pentiumiii-* | pentium3-*) + basic_machine=i686-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pentium4-*) + basic_machine=i786-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + pn) + basic_machine=pn-gould + ;; + power) basic_machine=power-ibm + ;; + ppc) basic_machine=powerpc-unknown + ;; + ppc-*) basic_machine=powerpc-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ppcle | powerpclittle | ppc-le | powerpc-little) + 
basic_machine=powerpcle-unknown + ;; + ppcle-* | powerpclittle-*) + basic_machine=powerpcle-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ppc64) basic_machine=powerpc64-unknown + ;; + ppc64-*) basic_machine=powerpc64-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ppc64le | powerpc64little | ppc64-le | powerpc64-little) + basic_machine=powerpc64le-unknown + ;; + ppc64le-* | powerpc64little-*) + basic_machine=powerpc64le-`echo $basic_machine | sed 's/^[^-]*-//'` + ;; + ps2) + basic_machine=i386-ibm + ;; + pw32) + basic_machine=i586-unknown + os=-pw32 + ;; + rdos) + basic_machine=i386-pc + os=-rdos + ;; + rom68k) + basic_machine=m68k-rom68k + os=-coff + ;; + rm[46]00) + basic_machine=mips-siemens + ;; + rtpc | rtpc-*) + basic_machine=romp-ibm + ;; + s390 | s390-*) + basic_machine=s390-ibm + ;; + s390x | s390x-*) + basic_machine=s390x-ibm + ;; + sa29200) + basic_machine=a29k-amd + os=-udi + ;; + sb1) + basic_machine=mipsisa64sb1-unknown + ;; + sb1el) + basic_machine=mipsisa64sb1el-unknown + ;; + sde) + basic_machine=mipsisa32-sde + os=-elf + ;; + sei) + basic_machine=mips-sei + os=-seiux + ;; + sequent) + basic_machine=i386-sequent + ;; + sh) + basic_machine=sh-hitachi + os=-hms + ;; + sh5el) + basic_machine=sh5le-unknown + ;; + sh64) + basic_machine=sh64-unknown + ;; + sparclite-wrs | simso-wrs) + basic_machine=sparclite-wrs + os=-vxworks + ;; + sps7) + basic_machine=m68k-bull + os=-sysv2 + ;; + spur) + basic_machine=spur-unknown + ;; + st2000) + basic_machine=m68k-tandem + ;; + stratus) + basic_machine=i860-stratus + os=-sysv4 + ;; + sun2) + basic_machine=m68000-sun + ;; + sun2os3) + basic_machine=m68000-sun + os=-sunos3 + ;; + sun2os4) + basic_machine=m68000-sun + os=-sunos4 + ;; + sun3os3) + basic_machine=m68k-sun + os=-sunos3 + ;; + sun3os4) + basic_machine=m68k-sun + os=-sunos4 + ;; + sun4os3) + basic_machine=sparc-sun + os=-sunos3 + ;; + sun4os4) + basic_machine=sparc-sun + os=-sunos4 + ;; + sun4sol2) + basic_machine=sparc-sun + os=-solaris2 + ;; + sun3 | 
sun3-*) + basic_machine=m68k-sun + ;; + sun4) + basic_machine=sparc-sun + ;; + sun386 | sun386i | roadrunner) + basic_machine=i386-sun + ;; + sv1) + basic_machine=sv1-cray + os=-unicos + ;; + symmetry) + basic_machine=i386-sequent + os=-dynix + ;; + t3e) + basic_machine=alphaev5-cray + os=-unicos + ;; + t90) + basic_machine=t90-cray + os=-unicos + ;; + tic54x | c54x*) + basic_machine=tic54x-unknown + os=-coff + ;; + tic55x | c55x*) + basic_machine=tic55x-unknown + os=-coff + ;; + tic6x | c6x*) + basic_machine=tic6x-unknown + os=-coff + ;; + tx39) + basic_machine=mipstx39-unknown + ;; + tx39el) + basic_machine=mipstx39el-unknown + ;; + toad1) + basic_machine=pdp10-xkl + os=-tops20 + ;; + tower | tower-32) + basic_machine=m68k-ncr + ;; + tpf) + basic_machine=s390x-ibm + os=-tpf + ;; + udi29k) + basic_machine=a29k-amd + os=-udi + ;; + ultra3) + basic_machine=a29k-nyu + os=-sym1 + ;; + v810 | necv810) + basic_machine=v810-nec + os=-none + ;; + vaxv) + basic_machine=vax-dec + os=-sysv + ;; + vms) + basic_machine=vax-dec + os=-vms + ;; + vpp*|vx|vx-*) + basic_machine=f301-fujitsu + ;; + vxworks960) + basic_machine=i960-wrs + os=-vxworks + ;; + vxworks68) + basic_machine=m68k-wrs + os=-vxworks + ;; + vxworks29k) + basic_machine=a29k-wrs + os=-vxworks + ;; + w65*) + basic_machine=w65-wdc + os=-none + ;; + w89k-*) + basic_machine=hppa1.1-winbond + os=-proelf + ;; + xbox) + basic_machine=i686-pc + os=-mingw32 + ;; + xps | xps100) + basic_machine=xps100-honeywell + ;; + ymp) + basic_machine=ymp-cray + os=-unicos + ;; + z8k-*-coff) + basic_machine=z8k-unknown + os=-sim + ;; + none) + basic_machine=none-none + os=-none + ;; + +# Here we handle the default manufacturer of certain CPU types. It is in +# some cases the only manufacturer, in others, it is the most popular. 
+ w89k) + basic_machine=hppa1.1-winbond + ;; + op50n) + basic_machine=hppa1.1-oki + ;; + op60c) + basic_machine=hppa1.1-oki + ;; + romp) + basic_machine=romp-ibm + ;; + mmix) + basic_machine=mmix-knuth + ;; + rs6000) + basic_machine=rs6000-ibm + ;; + vax) + basic_machine=vax-dec + ;; + pdp10) + # there are many clones, so DEC is not a safe bet + basic_machine=pdp10-unknown + ;; + pdp11) + basic_machine=pdp11-dec + ;; + we32k) + basic_machine=we32k-att + ;; + sh[1234] | sh[24]a | sh[34]eb | sh[1234]le | sh[23]ele) + basic_machine=sh-unknown + ;; + sparc | sparcv8 | sparcv9 | sparcv9b | sparcv9v) + basic_machine=sparc-sun + ;; + cydra) + basic_machine=cydra-cydrome + ;; + orion) + basic_machine=orion-highlevel + ;; + orion105) + basic_machine=clipper-highlevel + ;; + mac | mpw | mac-mpw) + basic_machine=m68k-apple + ;; + pmac | pmac-mpw) + basic_machine=powerpc-apple + ;; + *-unknown) + # Make sure to match an already-canonicalized machine name. + ;; + *) + echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 + exit 1 + ;; +esac + +# Here we canonicalize certain aliases for manufacturers. +case $basic_machine in + *-digital*) + basic_machine=`echo $basic_machine | sed 's/digital.*/dec/'` + ;; + *-commodore*) + basic_machine=`echo $basic_machine | sed 's/commodore.*/cbm/'` + ;; + *) + ;; +esac + +# Decode manufacturer-specific aliases for certain operating systems. + +if [ x"$os" != x"" ] +then +case $os in + # First match some system type aliases + # that might get confused with valid system types. + # -solaris* is a basic system type, with this one exception. + -solaris1 | -solaris1.*) + os=`echo $os | sed -e 's|solaris1|sunos4|'` + ;; + -solaris) + os=-solaris2 + ;; + -svr4*) + os=-sysv4 + ;; + -unixware*) + os=-sysv4.2uw + ;; + -gnu/linux*) + os=`echo $os | sed -e 's|gnu/linux|linux-gnu|'` + ;; + # First accept the basic system types. + # The portable systems comes first. + # Each alternative MUST END IN A *, to match a version number. 
+ # -sysv* is not here because it comes later, after sysvr4. + -gnu* | -bsd* | -mach* | -minix* | -genix* | -ultrix* | -irix* \ + | -*vms* | -sco* | -esix* | -isc* | -aix* | -sunos | -sunos[34]*\ + | -hpux* | -unos* | -osf* | -luna* | -dgux* | -solaris* | -sym* \ + | -amigaos* | -amigados* | -msdos* | -newsos* | -unicos* | -aof* \ + | -aos* \ + | -nindy* | -vxsim* | -vxworks* | -ebmon* | -hms* | -mvs* \ + | -clix* | -riscos* | -uniplus* | -iris* | -rtu* | -xenix* \ + | -hiux* | -386bsd* | -knetbsd* | -mirbsd* | -netbsd* \ + | -openbsd* | -solidbsd* \ + | -ekkobsd* | -kfreebsd* | -freebsd* | -riscix* | -lynxos* \ + | -bosx* | -nextstep* | -cxux* | -aout* | -elf* | -oabi* \ + | -ptx* | -coff* | -ecoff* | -winnt* | -domain* | -vsta* \ + | -udi* | -eabi* | -lites* | -ieee* | -go32* | -aux* \ + | -chorusos* | -chorusrdb* \ + | -cygwin* | -pe* | -psos* | -moss* | -proelf* | -rtems* \ + | -mingw32* | -linux-gnu* | -linux-newlib* | -linux-uclibc* \ + | -uxpv* | -beos* | -mpeix* | -udk* \ + | -interix* | -uwin* | -mks* | -rhapsody* | -darwin* | -opened* \ + | -openstep* | -oskit* | -conix* | -pw32* | -nonstopux* \ + | -storm-chaos* | -tops10* | -tenex* | -tops20* | -its* \ + | -os2* | -vos* | -palmos* | -uclinux* | -nucleus* \ + | -morphos* | -superux* | -rtmk* | -rtmk-nova* | -windiss* \ + | -powermax* | -dnix* | -nx6 | -nx7 | -sei* | -dragonfly* \ + | -skyos* | -haiku* | -rdos* | -toppers*) + # Remember, each alternative MUST END IN *, to match a version number. 
+ ;; + -qnx*) + case $basic_machine in + x86-* | i*86-*) + ;; + *) + os=-nto$os + ;; + esac + ;; + -nto-qnx*) + ;; + -nto*) + os=`echo $os | sed -e 's|nto|nto-qnx|'` + ;; + -sim | -es1800* | -hms* | -xray | -os68k* | -none* | -v88r* \ + | -windows* | -osx | -abug | -netware* | -os9* | -beos* | -haiku* \ + | -macos* | -mpw* | -magic* | -mmixware* | -mon960* | -lnews*) + ;; + -mac*) + os=`echo $os | sed -e 's|mac|macos|'` + ;; + -linux-dietlibc) + os=-linux-dietlibc + ;; + -linux*) + os=`echo $os | sed -e 's|linux|linux-gnu|'` + ;; + -sunos5*) + os=`echo $os | sed -e 's|sunos5|solaris2|'` + ;; + -sunos6*) + os=`echo $os | sed -e 's|sunos6|solaris3|'` + ;; + -opened*) + os=-openedition + ;; + -os400*) + os=-os400 + ;; + -wince*) + os=-wince + ;; + -osfrose*) + os=-osfrose + ;; + -osf*) + os=-osf + ;; + -utek*) + os=-bsd + ;; + -dynix*) + os=-bsd + ;; + -acis*) + os=-aos + ;; + -atheos*) + os=-atheos + ;; + -syllable*) + os=-syllable + ;; + -386bsd) + os=-bsd + ;; + -ctix* | -uts*) + os=-sysv + ;; + -nova*) + os=-rtmk-nova + ;; + -ns2 ) + os=-nextstep2 + ;; + -nsk*) + os=-nsk + ;; + # Preserve the version number of sinix5. + -sinix5.*) + os=`echo $os | sed -e 's|sinix|sysv|'` + ;; + -sinix*) + os=-sysv4 + ;; + -tpf*) + os=-tpf + ;; + -triton*) + os=-sysv3 + ;; + -oss*) + os=-sysv3 + ;; + -svr4) + os=-sysv4 + ;; + -svr3) + os=-sysv3 + ;; + -sysvr4) + os=-sysv4 + ;; + # This must come after -sysvr4. + -sysv*) + ;; + -ose*) + os=-ose + ;; + -es1800*) + os=-ose + ;; + -xenix) + os=-xenix + ;; + -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) + os=-mint + ;; + -aros*) + os=-aros + ;; + -kaos*) + os=-kaos + ;; + -zvmoe) + os=-zvmoe + ;; + -blrts) + os=-blrts + ;; + -bprts) + os=-bprts + ;; + -none) + ;; + *) + # Get rid of the `-' at the beginning of $os. + os=`echo $os | sed 's/[^-]*-//'` + echo Invalid configuration \`$1\': system \`$os\' not recognized 1>&2 + exit 1 + ;; +esac +else + +# Here we handle the default operating systems that come with various machines. 
+# The value should be what the vendor currently ships out the door with their +# machine or put another way, the most popular os provided with the machine. + +# Note that if you're going to try to match "-MANUFACTURER" here (say, +# "-sun"), then you have to tell the case statement up towards the top +# that MANUFACTURER isn't an operating system. Otherwise, code above +# will signal an error saying that MANUFACTURER isn't an operating +# system, and we'll never get to this point. + +case $basic_machine in + score-*) + os=-elf + ;; + spu-*) + os=-elf + ;; + *-acorn) + os=-riscix1.2 + ;; + arm*-rebel) + os=-linux + ;; + arm*-semi) + os=-aout + ;; + c4x-* | tic4x-*) + os=-coff + ;; + # This must come before the *-dec entry. + pdp10-*) + os=-tops20 + ;; + pdp11-*) + os=-none + ;; + *-dec | vax-*) + os=-ultrix4.2 + ;; + m68*-apollo) + os=-domain + ;; + i386-sun) + os=-sunos4.0.2 + ;; + m68000-sun) + os=-sunos3 + # This also exists in the configure program, but was not the + # default. + # os=-sunos4 + ;; + m68*-cisco) + os=-aout + ;; + mips*-cisco) + os=-elf + ;; + mips*-*) + os=-elf + ;; + or32-*) + os=-coff + ;; + *-tti) # must be before sparc entry or we get the wrong os. 
+ os=-sysv3 + ;; + sparc-* | *-sun) + os=-sunos4.1.1 + ;; + *-be) + os=-beos + ;; + *-haiku) + os=-haiku + ;; + *-ibm) + os=-aix + ;; + *-knuth) + os=-mmixware + ;; + *-wec) + os=-proelf + ;; + *-winbond) + os=-proelf + ;; + *-oki) + os=-proelf + ;; + *-hp) + os=-hpux + ;; + *-hitachi) + os=-hiux + ;; + i860-* | *-att | *-ncr | *-altos | *-motorola | *-convergent) + os=-sysv + ;; + *-cbm) + os=-amigaos + ;; + *-dg) + os=-dgux + ;; + *-dolphin) + os=-sysv3 + ;; + m68k-ccur) + os=-rtu + ;; + m88k-omron*) + os=-luna + ;; + *-next ) + os=-nextstep + ;; + *-sequent) + os=-ptx + ;; + *-crds) + os=-unos + ;; + *-ns) + os=-genix + ;; + i370-*) + os=-mvs + ;; + *-next) + os=-nextstep3 + ;; + *-gould) + os=-sysv + ;; + *-highlevel) + os=-bsd + ;; + *-encore) + os=-bsd + ;; + *-sgi) + os=-irix + ;; + *-siemens) + os=-sysv4 + ;; + *-masscomp) + os=-rtu + ;; + f30[01]-fujitsu | f700-fujitsu) + os=-uxpv + ;; + *-rom68k) + os=-coff + ;; + *-*bug) + os=-coff + ;; + *-apple) + os=-macos + ;; + *-atari*) + os=-mint + ;; + *) + os=-none + ;; +esac +fi + +# Here we handle the case where we know the os, and the CPU type, but not the +# manufacturer. We pick the logical manufacturer. 
+vendor=unknown +case $basic_machine in + *-unknown) + case $os in + -riscix*) + vendor=acorn + ;; + -sunos*) + vendor=sun + ;; + -aix*) + vendor=ibm + ;; + -beos*) + vendor=be + ;; + -hpux*) + vendor=hp + ;; + -mpeix*) + vendor=hp + ;; + -hiux*) + vendor=hitachi + ;; + -unos*) + vendor=crds + ;; + -dgux*) + vendor=dg + ;; + -luna*) + vendor=omron + ;; + -genix*) + vendor=ns + ;; + -mvs* | -opened*) + vendor=ibm + ;; + -os400*) + vendor=ibm + ;; + -ptx*) + vendor=sequent + ;; + -tpf*) + vendor=ibm + ;; + -vxsim* | -vxworks* | -windiss*) + vendor=wrs + ;; + -aux*) + vendor=apple + ;; + -hms*) + vendor=hitachi + ;; + -mpw* | -macos*) + vendor=apple + ;; + -*mint | -mint[0-9]* | -*MiNT | -MiNT[0-9]*) + vendor=atari + ;; + -vos*) + vendor=stratus + ;; + esac + basic_machine=`echo $basic_machine | sed "s/unknown/$vendor/"` + ;; +esac + +echo $basic_machine$os +exit + +# Local variables: +# eval: (add-hook 'write-file-hooks 'time-stamp) +# time-stamp-start: "timestamp='" +# time-stamp-format: "%:y-%02m-%02d" +# time-stamp-end: "'" +# End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/configure b/qcd/part_cpu/applications/QCD/src/kernel_D/configure new file mode 100755 index 0000000000000000000000000000000000000000..6e3c67065735402ae1615b3bd4dee8b2a286dcfa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/configure @@ -0,0 +1,9823 @@ +#! /bin/sh +# Guess values for system-dependent variables and create Makefiles. +# Generated by GNU Autoconf 2.69 for tmLQCD 5.2.0. +# +# Report bugs to <curbach@gmx.de>. +# +# +# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. +# +# +# This configure script is free software; the Free Software Foundation +# gives unlimited permission to copy, distribute and modify it. +## -------------------- ## +## M4sh Initialization.
## +## -------------------- ## + +# Be more Bourne compatible +DUALCASE=1; export DUALCASE # for MKS sh +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi + + +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. +as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +# Prefer a ksh shell builtin over an external printf program on Solaris, +# but without wasting forks for bash or zsh. +if test -z "$BASH_VERSION$ZSH_VERSION" \ + && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='print -r --' + as_echo_n='print -rn --' +elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in #( + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + +# The user is always right. 
+if test "${PATH_SEPARATOR+set}" != set; then + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { + (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || + PATH_SEPARATOR=';' + } +fi + + +# IFS +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent editors from complaining about space-tab. +# (If _AS_PATH_WALK were called with IFS unset, it would disable word +# splitting by setting IFS to empty value.) +IFS=" "" $as_nl" + +# Find who we are. Look in the path if we contain no directory separator. +as_myself= +case $0 in #(( + *[\\/]* ) as_myself=$0 ;; + *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + done +IFS=$as_save_IFS + + ;; +esac +# We did not find ourselves, most probably we were run as `sh COMMAND' +# in which case we are not to be found in the path. +if test "x$as_myself" = x; then + as_myself=$0 +fi +if test ! -f "$as_myself"; then + $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + exit 1 +fi + +# Unset variables that we do not need and which cause bugs (e.g. in +# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" +# suppresses any "Segmentation fault" message there. '((' could +# trigger a bug in pdksh 5.2.14. +for as_var in BASH_ENV ENV MAIL MAILPATH +do eval test x\${$as_var+set} = xset \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done +PS1='$ ' +PS2='> ' +PS4='+ ' + +# NLS nuisances. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# CDPATH. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + +# Use a proper internal environment variable to ensure we don't fall + # into an infinite loop, continuously re-executing ourselves. 
+ if test x"${_as_can_reexec}" != xno && test "x$CONFIG_SHELL" != x; then + _as_can_reexec=no; export _as_can_reexec; + # We cannot yet assume a decent shell, so we have to provide a +# neutralization value for shells without unset; and this also +# works around shells that cannot unset nonexistent variables. +# Preserve -v and -x to the replacement shell. +BASH_ENV=/dev/null +ENV=/dev/null +(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV +case $- in # (((( + *v*x* | *x*v* ) as_opts=-vx ;; + *v* ) as_opts=-v ;; + *x* ) as_opts=-x ;; + * ) as_opts= ;; +esac +exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} +# Admittedly, this is quite paranoid, since all the known shells bail +# out after a failed `exec'. +$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 +as_fn_exit 255 + fi + # We don't want this to propagate to other subprocesses. + { _as_can_reexec=; unset _as_can_reexec;} +if test "x$CONFIG_SHELL" = x; then + as_bourne_compatible="if test -n \"\${ZSH_VERSION+set}\" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on \${1+\"\$@\"}, which + # is contrary to our usage. Disable this feature. + alias -g '\${1+\"\$@\"}'='\"\$@\"' + setopt NO_GLOB_SUBST +else + case \`(set -o) 2>/dev/null\` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi +" + as_required="as_fn_return () { (exit \$1); } +as_fn_success () { as_fn_return 0; } +as_fn_failure () { as_fn_return 1; } +as_fn_ret_success () { return 0; } +as_fn_ret_failure () { return 1; } + +exitcode=0 +as_fn_success || { exitcode=1; echo as_fn_success failed.; } +as_fn_failure && { exitcode=1; echo as_fn_failure succeeded.; } +as_fn_ret_success || { exitcode=1; echo as_fn_ret_success failed.; } +as_fn_ret_failure && { exitcode=1; echo as_fn_ret_failure succeeded.; } +if ( set x; as_fn_ret_success y && test x = \"\$1\" ); then : + +else + exitcode=1; echo positional parameters were not saved. 
+fi +test x\$exitcode = x0 || exit 1 +test -x / || exit 1" + as_suggested=" as_lineno_1=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_1a=\$LINENO + as_lineno_2=";as_suggested=$as_suggested$LINENO;as_suggested=$as_suggested" as_lineno_2a=\$LINENO + eval 'test \"x\$as_lineno_1'\$as_run'\" != \"x\$as_lineno_2'\$as_run'\" && + test \"x\`expr \$as_lineno_1'\$as_run' + 1\`\" = \"x\$as_lineno_2'\$as_run'\"' || exit 1 +test \$(( 1 + 1 )) = 2 || exit 1" + if (eval "$as_required") 2>/dev/null; then : + as_have_required=yes +else + as_have_required=no +fi + if test x$as_have_required = xyes && (eval "$as_suggested") 2>/dev/null; then : + +else + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +as_found=false +for as_dir in /bin$PATH_SEPARATOR/usr/bin$PATH_SEPARATOR$PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + as_found=: + case $as_dir in #( + /*) + for as_base in sh bash ksh sh5; do + # Try only shells that exist, to save several forks. + as_shell=$as_dir/$as_base + if { test -f "$as_shell" || test -f "$as_shell.exe"; } && + { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$as_shell"; } 2>/dev/null; then : + CONFIG_SHELL=$as_shell as_have_required=yes + if { $as_echo "$as_bourne_compatible""$as_suggested" | as_run=a "$as_shell"; } 2>/dev/null; then : + break 2 +fi +fi + done;; + esac + as_found=false +done +$as_found || { if { test -f "$SHELL" || test -f "$SHELL.exe"; } && + { $as_echo "$as_bourne_compatible""$as_required" | as_run=a "$SHELL"; } 2>/dev/null; then : + CONFIG_SHELL=$SHELL as_have_required=yes +fi; } +IFS=$as_save_IFS + + + if test "x$CONFIG_SHELL" != x; then : + export CONFIG_SHELL + # We cannot yet assume a decent shell, so we have to provide a +# neutralization value for shells without unset; and this also +# works around shells that cannot unset nonexistent variables. +# Preserve -v and -x to the replacement shell. 
+BASH_ENV=/dev/null +ENV=/dev/null +(unset BASH_ENV) >/dev/null 2>&1 && unset BASH_ENV ENV +case $- in # (((( + *v*x* | *x*v* ) as_opts=-vx ;; + *v* ) as_opts=-v ;; + *x* ) as_opts=-x ;; + * ) as_opts= ;; +esac +exec $CONFIG_SHELL $as_opts "$as_myself" ${1+"$@"} +# Admittedly, this is quite paranoid, since all the known shells bail +# out after a failed `exec'. +$as_echo "$0: could not re-execute with $CONFIG_SHELL" >&2 +exit 255 +fi + + if test x$as_have_required = xno; then : + $as_echo "$0: This script requires a shell more modern than all" + $as_echo "$0: the shells that I found on your system." + if test x${ZSH_VERSION+set} = xset ; then + $as_echo "$0: In particular, zsh $ZSH_VERSION has bugs and should" + $as_echo "$0: be upgraded to zsh 4.3.4 or later." + else + $as_echo "$0: Please tell bug-autoconf@gnu.org and curbach@gmx.de +$0: about your system, including any error possibly output +$0: before this message. Then install a modern shell, or +$0: manually run the script under such a shell if you do +$0: have one." + fi + exit 1 +fi +fi +fi +SHELL=${CONFIG_SHELL-/bin/sh} +export SHELL +# Unset more variables known to interfere with behavior of common tools. +CLICOLOR_FORCE= GREP_OPTIONS= +unset CLICOLOR_FORCE GREP_OPTIONS + +## --------------------- ## +## M4sh Shell Functions. ## +## --------------------- ## +# as_fn_unset VAR +# --------------- +# Portably unset VAR. +as_fn_unset () +{ + { eval $1=; unset $1;} +} +as_unset=as_fn_unset + +# as_fn_set_status STATUS +# ----------------------- +# Set $? to STATUS, without forking. +as_fn_set_status () +{ + return $1 +} # as_fn_set_status + +# as_fn_exit STATUS +# ----------------- +# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. +as_fn_exit () +{ + set +e + as_fn_set_status $1 + exit $1 +} # as_fn_exit + +# as_fn_mkdir_p +# ------------- +# Create "$as_dir" as a directory, including parents if necessary. 
+as_fn_mkdir_p () +{ + + case $as_dir in #( + -*) as_dir=./$as_dir;; + esac + test -d "$as_dir" || eval $as_mkdir_p || { + as_dirs= + while :; do + case $as_dir in #( + *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *) as_qdir=$as_dir;; + esac + as_dirs="'$as_qdir' $as_dirs" + as_dir=`$as_dirname -- "$as_dir" || +$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_dir" : 'X\(//\)[^/]' \| \ + X"$as_dir" : 'X\(//\)$' \| \ + X"$as_dir" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X"$as_dir" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + test -d "$as_dir" && break + done + test -z "$as_dirs" || eval "mkdir $as_dirs" + } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" + + +} # as_fn_mkdir_p + +# as_fn_executable_p FILE +# ----------------------- +# Test if FILE is an executable regular file. +as_fn_executable_p () +{ + test -f "$1" && test -x "$1" +} # as_fn_executable_p +# as_fn_append VAR VALUE +# ---------------------- +# Append the text in VALUE to the end of the definition contained in VAR. Take +# advantage of any shell optimizations that allow amortized linear growth over +# repeated appends, instead of the typical quadratic growth present in naive +# implementations. +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : + eval 'as_fn_append () + { + eval $1+=\$2 + }' +else + as_fn_append () + { + eval $1=\$$1\$2 + } +fi # as_fn_append + +# as_fn_arith ARG... +# ------------------ +# Perform arithmetic evaluation on the ARGs, and store the result in the +# global $as_val. Take advantage of shells that can avoid forks. The arguments +# must be portable across $(()) and expr. +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : + eval 'as_fn_arith () + { + as_val=$(( $* )) + }' +else + as_fn_arith () + { + as_val=`expr "$@" || test $? 
-eq 1` + } +fi # as_fn_arith + + +# as_fn_error STATUS ERROR [LINENO LOG_FD] +# ---------------------------------------- +# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are +# provided, also output the error to LOG_FD, referencing LINENO. Then exit the +# script with STATUS, using 1 if that was 0. +as_fn_error () +{ + as_status=$1; test $as_status -eq 0 && as_status=1 + if test "$4"; then + as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + fi + $as_echo "$as_me: error: $2" >&2 + as_fn_exit $as_status +} # as_fn_error + +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + +# Avoid depending upon Character Ranges. +as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + + + as_lineno_1=$LINENO as_lineno_1a=$LINENO + as_lineno_2=$LINENO as_lineno_2a=$LINENO + eval 'test "x$as_lineno_1'$as_run'" != "x$as_lineno_2'$as_run'" && + test "x`expr $as_lineno_1'$as_run' + 1`" = "x$as_lineno_2'$as_run'"' || { + # Blame Lee E. McMahon (1931-1989) for sed's syntax. 
:-) + sed -n ' + p + /[$]LINENO/= + ' <$as_myself | + sed ' + s/[$]LINENO.*/&-/ + t lineno + b + :lineno + N + :loop + s/[$]LINENO\([^'$as_cr_alnum'_].*\n\)\(.*\)/\2\1\2/ + t loop + s/-\n.*// + ' >$as_me.lineno && + chmod +x "$as_me.lineno" || + { $as_echo "$as_me: error: cannot create $as_me.lineno; rerun with a POSIX shell" >&2; as_fn_exit 1; } + + # If we had to re-execute with $CONFIG_SHELL, we're ensured to have + # already done that, so ensure we don't try to do so again and fall + # in an infinite loop. This has already happened in practice. + _as_can_reexec=no; export _as_can_reexec + # Don't try to exec as it changes $[0], causing all sort of problems + # (the dirname of $[0] is not the place where we might find the + # original and so on. Autoconf is especially sensitive to this). + . "./$as_me.lineno" + # Exit status is that of the last command. + exit +} + +ECHO_C= ECHO_N= ECHO_T= +case `echo -n x` in #((((( +-n*) + case `echo 'xy\c'` in + *c*) ECHO_T=' ';; # ECHO_T is single tab character. + xy) ECHO_C='\c';; + *) echo `echo ksh88 bug on AIX 6.1` > /dev/null + ECHO_T=' ';; + esac;; +*) + ECHO_N='-n';; +esac + +rm -f conf$$ conf$$.exe conf$$.file +if test -d conf$$.dir; then + rm -f conf$$.dir/conf$$.file +else + rm -f conf$$.dir + mkdir conf$$.dir 2>/dev/null +fi +if (echo >conf$$.file) 2>/dev/null; then + if ln -s conf$$.file conf$$ 2>/dev/null; then + as_ln_s='ln -s' + # ... but there are two gotchas: + # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. + # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. + # In both cases, we have to default to `cp -pR'. + ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || + as_ln_s='cp -pR' + elif ln conf$$.file conf$$ 2>/dev/null; then + as_ln_s=ln + else + as_ln_s='cp -pR' + fi +else + as_ln_s='cp -pR' +fi +rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file +rmdir conf$$.dir 2>/dev/null + +if mkdir -p . 
2>/dev/null; then + as_mkdir_p='mkdir -p "$as_dir"' +else + test -d ./-p && rmdir ./-p + as_mkdir_p=false +fi + +as_test_x='test -x' +as_executable_p=as_fn_executable_p + +# Sed expression to map a string onto a valid CPP name. +as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" + +# Sed expression to map a string onto a valid variable name. +as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" + + +test -n "$DJDIR" || exec 7<&0 </dev/null +exec 6>&1 + +# Name of the host. +# hostname on some systems (SVR3.2, old GNU/Linux) returns a bogus exit status, +# so uname gets run too. +ac_hostname=`(hostname || uname -n) 2>/dev/null | sed 1q` + +# +# Initializations. +# +ac_default_prefix=/usr/local +ac_clean_files= +ac_config_libobj_dir=. +LIBOBJS= +cross_compiling=no +subdirs= +MFLAGS= +MAKEFLAGS= + +# Identity of this package. +PACKAGE_NAME='tmLQCD' +PACKAGE_TARNAME='tmlqcd' +PACKAGE_VERSION='5.2.0' +PACKAGE_STRING='tmLQCD 5.2.0' +PACKAGE_BUGREPORT='curbach@gmx.de' +PACKAGE_URL='' + +ac_unique_file="hmc_tm.c" +ac_default_prefix=$HOME +# Factoring default headers for most tests.
+ac_includes_default="\ +#include <stdio.h> +#ifdef HAVE_SYS_TYPES_H +# include <sys/types.h> +#endif +#ifdef HAVE_SYS_STAT_H +# include <sys/stat.h> +#endif +#ifdef STDC_HEADERS +# include <stdlib.h> +# include <stddef.h> +#else +# ifdef HAVE_STDLIB_H +# include <stdlib.h> +# endif +#endif +#ifdef HAVE_STRING_H +# if !defined STDC_HEADERS && defined HAVE_MEMORY_H +# include <memory.h> +# endif +# include <string.h> +#endif +#ifdef HAVE_STRINGS_H +# include <strings.h> +#endif +#ifdef HAVE_INTTYPES_H +# include <inttypes.h> +#endif +#ifdef HAVE_STDINT_H +# include <stdint.h> +#endif +#ifdef HAVE_UNISTD_H +# include <unistd.h> +#endif" + +ac_subst_vars='LTLIBOBJS +QUDA_AVAILABLE +ac_ct_CXX +CXXFLAGS +CXX +GPUMPICOMPILER +GPUCFLAGS +GPUDIR +NVCC +USESUBDIRS +QUDA_INTERFACE +SPI_FILES +LEMON_AVAILABLE +XLIB +MEASDIR +XCHANGEDIR +XCHANGELIB +PROFILE_FLAG +DEBUG_FLAG +DEPFLAGS +CCLD +SOLVEROUT +AUTOCONF +INCLUDES +SOPTARGS +OPTARGS +LIBOBJS +OPENMP_CFLAGS +EGREP +GREP +CPP +CCDEP +RANLIB +SET_MAKE +LEXLIB +LEX_OUTPUT_ROOT +LEX +AR +FLIBS +ac_ct_F77 +FFLAGS +F77 +OBJEXT +EXEEXT +ac_ct_CC +CPPFLAGS +LDFLAGS +CFLAGS +CC +host_os +host_vendor +host_cpu +host +build_os +build_vendor +build_cpu +build +target_alias +host_alias +build_alias +LIBS +ECHO_T +ECHO_N +ECHO_C +DEFS +mandir +localedir +libdir +psdir +pdfdir +dvidir +htmldir +infodir +docdir +oldincludedir +includedir +localstatedir +sharedstatedir +sysconfdir +datadir +datarootdir +libexecdir +sbindir +bindir +program_transform_name +prefix +exec_prefix +PACKAGE_URL +PACKAGE_BUGREPORT +PACKAGE_STRING +PACKAGE_VERSION +PACKAGE_TARNAME +PACKAGE_NAME +PATH_SEPARATOR +SHELL' +ac_subst_files='' +ac_user_opts=' +enable_option_checking +enable_benchmark +with_limedir +with_lemondir +enable_indexindepgeom +enable_mpi +enable_qpx +enable_spi +enable_omp +enable_openmp +enable_fftw +with_mpidimension +with_persistentmpi +with_nonblockingmpi +with_fixedvolume +with_kojakinst +with_lapack +enable_largefile +enable_alignment +enable_p4 +enable_opteron +enable_sse2 +enable_sse3 +with_gprof +with_bgldram +enable_optimize +enable_gaugecopy +enable_halfspinor
+enable_shmem +enable_tsplitpar +enable_laph +enable_gpu +with_cuda +with_cudacompileargs +with_qudadir +with_cudadir +' + ac_precious_vars='build_alias +host_alias +target_alias +CC +CFLAGS +LDFLAGS +LIBS +CPPFLAGS +F77 +FFLAGS +CPP +CXX +CXXFLAGS +CCC' + + +# Initialize some variables set by options. +ac_init_help= +ac_init_version=false +ac_unrecognized_opts= +ac_unrecognized_sep= +# The variables have the same names as the options, with +# dashes changed to underlines. +cache_file=/dev/null +exec_prefix=NONE +no_create= +no_recursion= +prefix=NONE +program_prefix=NONE +program_suffix=NONE +program_transform_name=s,x,x, +silent= +site= +srcdir= +verbose= +x_includes=NONE +x_libraries=NONE + +# Installation directory options. +# These are left unexpanded so users can "make install exec_prefix=/foo" +# and all the variables that are supposed to be based on exec_prefix +# by default will actually change. +# Use braces instead of parens because sh, perl, etc. also accept them. +# (The list follows the same order as the GNU Coding Standards.) +bindir='${exec_prefix}/bin' +sbindir='${exec_prefix}/sbin' +libexecdir='${exec_prefix}/libexec' +datarootdir='${prefix}/share' +datadir='${datarootdir}' +sysconfdir='${prefix}/etc' +sharedstatedir='${prefix}/com' +localstatedir='${prefix}/var' +includedir='${prefix}/include' +oldincludedir='/usr/include' +docdir='${datarootdir}/doc/${PACKAGE_TARNAME}' +infodir='${datarootdir}/info' +htmldir='${docdir}' +dvidir='${docdir}' +pdfdir='${docdir}' +psdir='${docdir}' +libdir='${exec_prefix}/lib' +localedir='${datarootdir}/locale' +mandir='${datarootdir}/man' + +ac_prev= +ac_dashdash= +for ac_option +do + # If the previous option needs an argument, assign it. 
+ if test -n "$ac_prev"; then + eval $ac_prev=\$ac_option + ac_prev= + continue + fi + + case $ac_option in + *=?*) ac_optarg=`expr "X$ac_option" : '[^=]*=\(.*\)'` ;; + *=) ac_optarg= ;; + *) ac_optarg=yes ;; + esac + + # Accept the important Cygnus configure options, so we can diagnose typos. + + case $ac_dashdash$ac_option in + --) + ac_dashdash=yes ;; + + -bindir | --bindir | --bindi | --bind | --bin | --bi) + ac_prev=bindir ;; + -bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*) + bindir=$ac_optarg ;; + + -build | --build | --buil | --bui | --bu) + ac_prev=build_alias ;; + -build=* | --build=* | --buil=* | --bui=* | --bu=*) + build_alias=$ac_optarg ;; + + -cache-file | --cache-file | --cache-fil | --cache-fi \ + | --cache-f | --cache- | --cache | --cach | --cac | --ca | --c) + ac_prev=cache_file ;; + -cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \ + | --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*) + cache_file=$ac_optarg ;; + + --config-cache | -C) + cache_file=config.cache ;; + + -datadir | --datadir | --datadi | --datad) + ac_prev=datadir ;; + -datadir=* | --datadir=* | --datadi=* | --datad=*) + datadir=$ac_optarg ;; + + -datarootdir | --datarootdir | --datarootdi | --datarootd | --dataroot \ + | --dataroo | --dataro | --datar) + ac_prev=datarootdir ;; + -datarootdir=* | --datarootdir=* | --datarootdi=* | --datarootd=* \ + | --dataroot=* | --dataroo=* | --dataro=* | --datar=*) + datarootdir=$ac_optarg ;; + + -disable-* | --disable-*) + ac_useropt=`expr "x$ac_option" : 'x-*disable-\(.*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? 
"invalid feature name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"enable_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--disable-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval enable_$ac_useropt=no ;; + + -docdir | --docdir | --docdi | --doc | --do) + ac_prev=docdir ;; + -docdir=* | --docdir=* | --docdi=* | --doc=* | --do=*) + docdir=$ac_optarg ;; + + -dvidir | --dvidir | --dvidi | --dvid | --dvi | --dv) + ac_prev=dvidir ;; + -dvidir=* | --dvidir=* | --dvidi=* | --dvid=* | --dvi=* | --dv=*) + dvidir=$ac_optarg ;; + + -enable-* | --enable-*) + ac_useropt=`expr "x$ac_option" : 'x-*enable-\([^=]*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid feature name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"enable_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--enable-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval enable_$ac_useropt=\$ac_optarg ;; + + -exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \ + | --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \ + | --exec | --exe | --ex) + ac_prev=exec_prefix ;; + -exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \ + | --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \ + | --exec=* | --exe=* | --ex=*) + exec_prefix=$ac_optarg ;; + + -gas | --gas | --ga | --g) + # Obsolete; use --with-gas. 
+ with_gas=yes ;; + + -help | --help | --hel | --he | -h) + ac_init_help=long ;; + -help=r* | --help=r* | --hel=r* | --he=r* | -hr*) + ac_init_help=recursive ;; + -help=s* | --help=s* | --hel=s* | --he=s* | -hs*) + ac_init_help=short ;; + + -host | --host | --hos | --ho) + ac_prev=host_alias ;; + -host=* | --host=* | --hos=* | --ho=*) + host_alias=$ac_optarg ;; + + -htmldir | --htmldir | --htmldi | --htmld | --html | --htm | --ht) + ac_prev=htmldir ;; + -htmldir=* | --htmldir=* | --htmldi=* | --htmld=* | --html=* | --htm=* \ + | --ht=*) + htmldir=$ac_optarg ;; + + -includedir | --includedir | --includedi | --included | --include \ + | --includ | --inclu | --incl | --inc) + ac_prev=includedir ;; + -includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \ + | --includ=* | --inclu=* | --incl=* | --inc=*) + includedir=$ac_optarg ;; + + -infodir | --infodir | --infodi | --infod | --info | --inf) + ac_prev=infodir ;; + -infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*) + infodir=$ac_optarg ;; + + -libdir | --libdir | --libdi | --libd) + ac_prev=libdir ;; + -libdir=* | --libdir=* | --libdi=* | --libd=*) + libdir=$ac_optarg ;; + + -libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \ + | --libexe | --libex | --libe) + ac_prev=libexecdir ;; + -libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \ + | --libexe=* | --libex=* | --libe=*) + libexecdir=$ac_optarg ;; + + -localedir | --localedir | --localedi | --localed | --locale) + ac_prev=localedir ;; + -localedir=* | --localedir=* | --localedi=* | --localed=* | --locale=*) + localedir=$ac_optarg ;; + + -localstatedir | --localstatedir | --localstatedi | --localstated \ + | --localstate | --localstat | --localsta | --localst | --locals) + ac_prev=localstatedir ;; + -localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \ + | --localstate=* | --localstat=* | --localsta=* | --localst=* | --locals=*) + localstatedir=$ac_optarg ;; + + 
-mandir | --mandir | --mandi | --mand | --man | --ma | --m) + ac_prev=mandir ;; + -mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*) + mandir=$ac_optarg ;; + + -nfp | --nfp | --nf) + # Obsolete; use --without-fp. + with_fp=no ;; + + -no-create | --no-create | --no-creat | --no-crea | --no-cre \ + | --no-cr | --no-c | -n) + no_create=yes ;; + + -no-recursion | --no-recursion | --no-recursio | --no-recursi \ + | --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) + no_recursion=yes ;; + + -oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \ + | --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \ + | --oldin | --oldi | --old | --ol | --o) + ac_prev=oldincludedir ;; + -oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \ + | --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \ + | --oldin=* | --oldi=* | --old=* | --ol=* | --o=*) + oldincludedir=$ac_optarg ;; + + -prefix | --prefix | --prefi | --pref | --pre | --pr | --p) + ac_prev=prefix ;; + -prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*) + prefix=$ac_optarg ;; + + -program-prefix | --program-prefix | --program-prefi | --program-pref \ + | --program-pre | --program-pr | --program-p) + ac_prev=program_prefix ;; + -program-prefix=* | --program-prefix=* | --program-prefi=* \ + | --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*) + program_prefix=$ac_optarg ;; + + -program-suffix | --program-suffix | --program-suffi | --program-suff \ + | --program-suf | --program-su | --program-s) + ac_prev=program_suffix ;; + -program-suffix=* | --program-suffix=* | --program-suffi=* \ + | --program-suff=* | --program-suf=* | --program-su=* | --program-s=*) + program_suffix=$ac_optarg ;; + + -program-transform-name | --program-transform-name \ + | --program-transform-nam | --program-transform-na \ + | --program-transform-n | --program-transform- \ + | --program-transform | 
--program-transfor \ + | --program-transfo | --program-transf \ + | --program-trans | --program-tran \ + | --progr-tra | --program-tr | --program-t) + ac_prev=program_transform_name ;; + -program-transform-name=* | --program-transform-name=* \ + | --program-transform-nam=* | --program-transform-na=* \ + | --program-transform-n=* | --program-transform-=* \ + | --program-transform=* | --program-transfor=* \ + | --program-transfo=* | --program-transf=* \ + | --program-trans=* | --program-tran=* \ + | --progr-tra=* | --program-tr=* | --program-t=*) + program_transform_name=$ac_optarg ;; + + -pdfdir | --pdfdir | --pdfdi | --pdfd | --pdf | --pd) + ac_prev=pdfdir ;; + -pdfdir=* | --pdfdir=* | --pdfdi=* | --pdfd=* | --pdf=* | --pd=*) + pdfdir=$ac_optarg ;; + + -psdir | --psdir | --psdi | --psd | --ps) + ac_prev=psdir ;; + -psdir=* | --psdir=* | --psdi=* | --psd=* | --ps=*) + psdir=$ac_optarg ;; + + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + silent=yes ;; + + -sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb) + ac_prev=sbindir ;; + -sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \ + | --sbi=* | --sb=*) + sbindir=$ac_optarg ;; + + -sharedstatedir | --sharedstatedir | --sharedstatedi \ + | --sharedstated | --sharedstate | --sharedstat | --sharedsta \ + | --sharedst | --shareds | --shared | --share | --shar \ + | --sha | --sh) + ac_prev=sharedstatedir ;; + -sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \ + | --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \ + | --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \ + | --sha=* | --sh=*) + sharedstatedir=$ac_optarg ;; + + -site | --site | --sit) + ac_prev=site ;; + -site=* | --site=* | --sit=*) + site=$ac_optarg ;; + + -srcdir | --srcdir | --srcdi | --srcd | --src | --sr) + ac_prev=srcdir ;; + -srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*) + srcdir=$ac_optarg ;; + + 
-sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \ + | --syscon | --sysco | --sysc | --sys | --sy) + ac_prev=sysconfdir ;; + -sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \ + | --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*) + sysconfdir=$ac_optarg ;; + + -target | --target | --targe | --targ | --tar | --ta | --t) + ac_prev=target_alias ;; + -target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*) + target_alias=$ac_optarg ;; + + -v | -verbose | --verbose | --verbos | --verbo | --verb) + verbose=yes ;; + + -version | --version | --versio | --versi | --vers | -V) + ac_init_version=: ;; + + -with-* | --with-*) + ac_useropt=`expr "x$ac_option" : 'x-*with-\([^=]*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid package name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"with_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--with-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval with_$ac_useropt=\$ac_optarg ;; + + -without-* | --without-*) + ac_useropt=`expr "x$ac_option" : 'x-*without-\(.*\)'` + # Reject names that are not valid shell variable names. + expr "x$ac_useropt" : ".*[^-+._$as_cr_alnum]" >/dev/null && + as_fn_error $? "invalid package name: $ac_useropt" + ac_useropt_orig=$ac_useropt + ac_useropt=`$as_echo "$ac_useropt" | sed 's/[-+.]/_/g'` + case $ac_user_opts in + *" +"with_$ac_useropt" +"*) ;; + *) ac_unrecognized_opts="$ac_unrecognized_opts$ac_unrecognized_sep--without-$ac_useropt_orig" + ac_unrecognized_sep=', ';; + esac + eval with_$ac_useropt=no ;; + + --x) + # Obsolete; use --with-x. 
+ with_x=yes ;; + + -x-includes | --x-includes | --x-include | --x-includ | --x-inclu \ + | --x-incl | --x-inc | --x-in | --x-i) + ac_prev=x_includes ;; + -x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \ + | --x-incl=* | --x-inc=* | --x-in=* | --x-i=*) + x_includes=$ac_optarg ;; + + -x-libraries | --x-libraries | --x-librarie | --x-librari \ + | --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l) + ac_prev=x_libraries ;; + -x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \ + | --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*) + x_libraries=$ac_optarg ;; + + -*) as_fn_error $? "unrecognized option: \`$ac_option' +Try \`$0 --help' for more information" + ;; + + *=*) + ac_envvar=`expr "x$ac_option" : 'x\([^=]*\)='` + # Reject names that are not valid shell variable names. + case $ac_envvar in #( + '' | [0-9]* | *[!_$as_cr_alnum]* ) + as_fn_error $? "invalid variable name: \`$ac_envvar'" ;; + esac + eval $ac_envvar=\$ac_optarg + export $ac_envvar ;; + + *) + # FIXME: should be removed in autoconf 3.0. + $as_echo "$as_me: WARNING: you should use --build, --host, --target" >&2 + expr "x$ac_option" : ".*[^-._$as_cr_alnum]" >/dev/null && + $as_echo "$as_me: WARNING: invalid host type: $ac_option" >&2 + : "${build_alias=$ac_option} ${host_alias=$ac_option} ${target_alias=$ac_option}" + ;; + + esac +done + +if test -n "$ac_prev"; then + ac_option=--`echo $ac_prev | sed 's/_/-/g'` + as_fn_error $? "missing argument to $ac_option" +fi + +if test -n "$ac_unrecognized_opts"; then + case $enable_option_checking in + no) ;; + fatal) as_fn_error $? "unrecognized options: $ac_unrecognized_opts" ;; + *) $as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2 ;; + esac +fi + +# Check all directory arguments for consistency. 
+for ac_var in exec_prefix prefix bindir sbindir libexecdir datarootdir \ + datadir sysconfdir sharedstatedir localstatedir includedir \ + oldincludedir docdir infodir htmldir dvidir pdfdir psdir \ + libdir localedir mandir +do + eval ac_val=\$$ac_var + # Remove trailing slashes. + case $ac_val in + */ ) + ac_val=`expr "X$ac_val" : 'X\(.*[^/]\)' \| "X$ac_val" : 'X\(.*\)'` + eval $ac_var=\$ac_val;; + esac + # Be sure to have absolute directory names. + case $ac_val in + [\\/$]* | ?:[\\/]* ) continue;; + NONE | '' ) case $ac_var in *prefix ) continue;; esac;; + esac + as_fn_error $? "expected an absolute directory name for --$ac_var: $ac_val" +done + +# There might be people who depend on the old broken behavior: `$host' +# used to hold the argument of --host etc. +# FIXME: To remove some day. +build=$build_alias +host=$host_alias +target=$target_alias + +# FIXME: To remove some day. +if test "x$host_alias" != x; then + if test "x$build_alias" = x; then + cross_compiling=maybe + elif test "x$build_alias" != "x$host_alias"; then + cross_compiling=yes + fi +fi + +ac_tool_prefix= +test -n "$host_alias" && ac_tool_prefix=$host_alias- + +test "$silent" = yes && exec 6>/dev/null + + +ac_pwd=`pwd` && test -n "$ac_pwd" && +ac_ls_di=`ls -di .` && +ac_pwd_ls_di=`cd "$ac_pwd" && ls -di .` || + as_fn_error $? "working directory cannot be determined" +test "X$ac_ls_di" = "X$ac_pwd_ls_di" || + as_fn_error $? "pwd does not report name of working directory" + + +# Find the source files, if location was not specified. +if test -z "$srcdir"; then + ac_srcdir_defaulted=yes + # Try the directory containing this script, then the parent directory. + ac_confdir=`$as_dirname -- "$as_myself" || +$as_expr X"$as_myself" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_myself" : 'X\(//\)[^/]' \| \ + X"$as_myself" : 'X\(//\)$' \| \ + X"$as_myself" : 'X\(/\)' \| . 
2>/dev/null || +$as_echo X"$as_myself" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + srcdir=$ac_confdir + if test ! -r "$srcdir/$ac_unique_file"; then + srcdir=.. + fi +else + ac_srcdir_defaulted=no +fi +if test ! -r "$srcdir/$ac_unique_file"; then + test "$ac_srcdir_defaulted" = yes && srcdir="$ac_confdir or .." + as_fn_error $? "cannot find sources ($ac_unique_file) in $srcdir" +fi +ac_msg="sources are in $srcdir, but \`cd $srcdir' does not work" +ac_abs_confdir=`( + cd "$srcdir" && test -r "./$ac_unique_file" || as_fn_error $? "$ac_msg" + pwd)` +# When building in place, set srcdir=. +if test "$ac_abs_confdir" = "$ac_pwd"; then + srcdir=. +fi +# Remove unnecessary trailing slashes from srcdir. +# Double slashes in file names in object file debugging info +# mess up M-x gdb in Emacs. +case $srcdir in +*/) srcdir=`expr "X$srcdir" : 'X\(.*[^/]\)' \| "X$srcdir" : 'X\(.*\)'`;; +esac +for ac_var in $ac_precious_vars; do + eval ac_env_${ac_var}_set=\${${ac_var}+set} + eval ac_env_${ac_var}_value=\$${ac_var} + eval ac_cv_env_${ac_var}_set=\${${ac_var}+set} + eval ac_cv_env_${ac_var}_value=\$${ac_var} +done + +# +# Report the --help message. +# +if test "$ac_init_help" = "long"; then + # Omit some internal or obsolete options to make the list less imposing. + # This message is too long to be a string in the A/UX 3.1 sh. + cat <<_ACEOF +\`configure' configures tmLQCD 5.2.0 to adapt to many kinds of systems. + +Usage: $0 [OPTION]... [VAR=VALUE]... + +To assign environment variables (e.g., CC, CFLAGS...), specify them as +VAR=VALUE. See below for descriptions of some of the useful variables. + +Defaults for the options are specified in brackets. 
+ +Configuration: + -h, --help display this help and exit + --help=short display options specific to this package + --help=recursive display the short help of all the included packages + -V, --version display version information and exit + -q, --quiet, --silent do not print \`checking ...' messages + --cache-file=FILE cache test results in FILE [disabled] + -C, --config-cache alias for \`--cache-file=config.cache' + -n, --no-create do not create output files + --srcdir=DIR find the sources in DIR [configure dir or \`..'] + +Installation directories: + --prefix=PREFIX install architecture-independent files in PREFIX + [$ac_default_prefix] + --exec-prefix=EPREFIX install architecture-dependent files in EPREFIX + [PREFIX] + +By default, \`make install' will install all the files in +\`$ac_default_prefix/bin', \`$ac_default_prefix/lib' etc. You can specify +an installation prefix other than \`$ac_default_prefix' using \`--prefix', +for instance \`--prefix=\$HOME'. + +For better control, use the options below. 
+ +Fine tuning of the installation directories: + --bindir=DIR user executables [EPREFIX/bin] + --sbindir=DIR system admin executables [EPREFIX/sbin] + --libexecdir=DIR program executables [EPREFIX/libexec] + --sysconfdir=DIR read-only single-machine data [PREFIX/etc] + --sharedstatedir=DIR modifiable architecture-independent data [PREFIX/com] + --localstatedir=DIR modifiable single-machine data [PREFIX/var] + --libdir=DIR object code libraries [EPREFIX/lib] + --includedir=DIR C header files [PREFIX/include] + --oldincludedir=DIR C header files for non-gcc [/usr/include] + --datarootdir=DIR read-only arch.-independent data root [PREFIX/share] + --datadir=DIR read-only architecture-independent data [DATAROOTDIR] + --infodir=DIR info documentation [DATAROOTDIR/info] + --localedir=DIR locale-dependent data [DATAROOTDIR/locale] + --mandir=DIR man documentation [DATAROOTDIR/man] + --docdir=DIR documentation root [DATAROOTDIR/doc/tmlqcd] + --htmldir=DIR html documentation [DOCDIR] + --dvidir=DIR dvi documentation [DOCDIR] + --pdfdir=DIR pdf documentation [DOCDIR] + --psdir=DIR ps documentation [DOCDIR] +_ACEOF + + cat <<\_ACEOF + +Program names: + --program-prefix=PREFIX prepend PREFIX to installed program names + --program-suffix=SUFFIX append SUFFIX to installed program names + --program-transform-name=PROGRAM run sed PROGRAM on installed program names + +System types: + --build=BUILD configure for building on BUILD [guessed] + --host=HOST cross-compile to build programs to run on HOST [BUILD] +_ACEOF +fi + +if test -n "$ac_init_help"; then + case $ac_init_help in + short | recursive ) echo "Configuration of tmLQCD 5.2.0:";; + esac + cat <<\_ACEOF + +Optional Features: + --disable-option-checking ignore unrecognized --enable/--with options + --disable-FEATURE do not include FEATURE (same as --enable-FEATURE=no) + --enable-FEATURE[=ARG] include FEATURE [ARG=yes] + --enable-benchmark enable use of benchmark [default=yes] + --enable-indexindepgeom enable Index independent 
addressing [default=no] + --enable-mpi enable use of mpi [default=yes] + --enable-qpx enable use of qpx intrinsics [default=no] + --enable-spi enable use of SPI [default=no] + --enable-omp enable use of OpenMP [default=yes] + --disable-openmp do not use OpenMP + --enable-fftw enable use of fftw [default=no] + --disable-largefile omit support for large files + --enable-alignment=n Automatically or expliclty align arrays to byte + number: auto, none, 16, 32 [default=auto] + --enable-p4 enable use of P4 instructions [default=no] + --enable-opteron enable use of Opteron instructions [default=no] + --enable-sse2 enable use of SSE2 instructions [default=no] + --enable-sse3 enable use of SSE3 instructions [default=no] + --enable-optimize enable optimisation [default=yes] + --enable-gaugecopy enable use of a copy of the gauge field + [default=yes] + --enable-halfspinor use a Dirac Op. with halfspinor exchange + [default=yes] + --enable-shmem use shmem API [default=no] + --enable-tsplitpar enable timeslice-splitted communications + [default=no] + --enable-laph enable computation of LapH eigensystem [default=no] + --enable-gpu use GPU [default=no] + +Optional Packages: + --with-PACKAGE[=ARG] use PACKAGE [ARG=yes] + --without-PACKAGE do not use PACKAGE (same as --with-PACKAGE=no) + --with-limedir=dir search lime in dir [default=./lime] + --with-lemondir=dir use lemon, to be found in dir + --with-mpidimension=n use n dimensional parallelisation [default=1] + --with-persistentmpi use persistent MPI calls for halfspinor [default=no] + --with-nonblockingmpi use non-blocking MPI calls for spinor and gauge + [default=yes] + --with-fixedvolume fix volume at compiletime [default=no] + --with-kojakinst instrumentalise for KOJAK [default=no] + --with-lapack enable use of lapack [default=yes] + --with-gprof use of gprof profiler [default=no] + --with-bgldram use BGL dram window (BGL only!) 
[default=yes] + --with-cuda=dir use CUDA GPU with lib dir + [default=/usr/local/cuda/lib] + --with-cudacompileargs=string + use CUDA compile args [default="--gpu-architecture + sm_13 --use_fast_math -O3"] + --with-qudadir=dir use QUDA, to be found in dir + --with-cudadir=dir if using QUDA, then set CUDA lib dir + [default=/usr/local/cuda/lib] + +Some influential environment variables: + CC C compiler command + CFLAGS C compiler flags + LDFLAGS linker flags, e.g. -L if you have libraries in a + nonstandard directory + LIBS libraries to pass to the linker, e.g. -l + CPPFLAGS (Objective) C/C++ preprocessor flags, e.g. -I if + you have headers in a nonstandard directory + F77 Fortran 77 compiler command + FFLAGS Fortran 77 compiler flags + CPP C preprocessor + CXX C++ compiler command + CXXFLAGS C++ compiler flags + +Use these variables to override the choices made by `configure' or to help +it to find libraries and programs with nonstandard names/locations. + +Report bugs to . +_ACEOF +ac_status=$? +fi + +if test "$ac_init_help" = "recursive"; then + # If there are subdirs, report their specific --help. + for ac_dir in : $ac_subdirs_all; do test "x$ac_dir" = x: && continue + test -d "$ac_dir" || + { cd "$srcdir" && ac_pwd=`pwd` && srcdir=. && test -d "$ac_dir"; } || + continue + ac_builddir=. + +case "$ac_dir" in +.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; +*) + ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + # A ".." for each directory in $ac_dir_suffix. + ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + case $ac_top_builddir_sub in + "") ac_top_builddir_sub=. ac_top_build_prefix= ;; + *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; + esac ;; +esac +ac_abs_top_builddir=$ac_pwd +ac_abs_builddir=$ac_pwd$ac_dir_suffix +# for backward compatibility: +ac_top_builddir=$ac_top_build_prefix + +case $srcdir in + .) # We are building in place. + ac_srcdir=. 
+ ac_top_srcdir=$ac_top_builddir_sub + ac_abs_top_srcdir=$ac_pwd ;; + [\\/]* | ?:[\\/]* ) # Absolute name. + ac_srcdir=$srcdir$ac_dir_suffix; + ac_top_srcdir=$srcdir + ac_abs_top_srcdir=$srcdir ;; + *) # Relative name. + ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix + ac_top_srcdir=$ac_top_build_prefix$srcdir + ac_abs_top_srcdir=$ac_pwd/$srcdir ;; +esac +ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix + + cd "$ac_dir" || { ac_status=$?; continue; } + # Check for guested configure. + if test -f "$ac_srcdir/configure.gnu"; then + echo && + $SHELL "$ac_srcdir/configure.gnu" --help=recursive + elif test -f "$ac_srcdir/configure"; then + echo && + $SHELL "$ac_srcdir/configure" --help=recursive + else + $as_echo "$as_me: WARNING: no configuration information is in $ac_dir" >&2 + fi || ac_status=$? + cd "$ac_pwd" || { ac_status=$?; break; } + done +fi + +test -n "$ac_init_help" && exit $ac_status +if $ac_init_version; then + cat <<\_ACEOF +tmLQCD configure 5.2.0 +generated by GNU Autoconf 2.69 + +Copyright (C) 2012 Free Software Foundation, Inc. +This configure script is free software; the Free Software Foundation +gives unlimited permission to copy, distribute and modify it. +_ACEOF + exit +fi + +## ------------------------ ## +## Autoconf initialization. ## +## ------------------------ ## + +# ac_fn_c_try_compile LINENO +# -------------------------- +# Try to compile conftest.$ac_ext, and return whether this succeeded. +ac_fn_c_try_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext + if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>conftest.err + ac_status=$? 
+ if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_c_try_compile + +# ac_fn_f77_try_compile LINENO +# ---------------------------- +# Try to compile conftest.$ac_ext, and return whether this succeeded. +ac_fn_f77_try_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext + if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_f77_werror_flag" || + test ! -s conftest.err + } && test -s conftest.$ac_objext; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_f77_try_compile + +# ac_fn_c_try_link LINENO +# ----------------------- +# Try to link conftest.$ac_ext, and return whether this succeeded. 
+ac_fn_c_try_link () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext conftest$ac_exeext + if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_c_werror_flag" || + test ! -s conftest.err + } && test -s conftest$ac_exeext && { + test "$cross_compiling" = yes || + test -x conftest$ac_exeext + }; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information + # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would + # interfere with the next link command; also delete a directory that is + # left behind by Apple's compiler. We do this before executing the actions. + rm -rf conftest.dSYM conftest_ipa8_conftest.oo + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_c_try_link + +# ac_fn_c_try_cpp LINENO +# ---------------------- +# Try to preprocess conftest.$ac_ext, and return whether this succeeded. +ac_fn_c_try_cpp () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if { { ac_try="$ac_cpp conftest.$ac_ext" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_cpp conftest.$ac_ext") 2>conftest.err + ac_status=$? 
+ if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } > conftest.i && { + test -z "$ac_c_preproc_warn_flag$ac_c_werror_flag" || + test ! -s conftest.err + }; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_c_try_cpp + +# ac_fn_c_check_header_mongrel LINENO HEADER VAR INCLUDES +# ------------------------------------------------------- +# Tests whether HEADER exists, giving a warning if it cannot be compiled using +# the include files in INCLUDES and setting the cache variable VAR +# accordingly. +ac_fn_c_check_header_mongrel () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if eval \${$3+:} false; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +else + # Is the header compilable? +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 usability" >&5 +$as_echo_n "checking $2 usability... " >&6; } +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +#include <$2> +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_header_compiler=yes +else + ac_header_compiler=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_compiler" >&5 +$as_echo "$ac_header_compiler" >&6; } + +# Is the header present? +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking $2 presence" >&5 +$as_echo_n "checking $2 presence... 
" >&6; } +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include <$2> +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + ac_header_preproc=yes +else + ac_header_preproc=no +fi +rm -f conftest.err conftest.i conftest.$ac_ext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_header_preproc" >&5 +$as_echo "$ac_header_preproc" >&6; } + +# So? What about this header? +case $ac_header_compiler:$ac_header_preproc:$ac_c_preproc_warn_flag in #(( + yes:no: ) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&5 +$as_echo "$as_me: WARNING: $2: accepted by the compiler, rejected by the preprocessor!" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5 +$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;} + ;; + no:yes:* ) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: present but cannot be compiled" >&5 +$as_echo "$as_me: WARNING: $2: present but cannot be compiled" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: check for missing prerequisite headers?" >&5 +$as_echo "$as_me: WARNING: $2: check for missing prerequisite headers?" 
>&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: see the Autoconf documentation" >&5 +$as_echo "$as_me: WARNING: $2: see the Autoconf documentation" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: section \"Present But Cannot Be Compiled\"" >&5 +$as_echo "$as_me: WARNING: $2: section \"Present But Cannot Be Compiled\"" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $2: proceeding with the compiler's result" >&5 +$as_echo "$as_me: WARNING: $2: proceeding with the compiler's result" >&2;} +( $as_echo "## ----------------------------- ## +## Report this to curbach@gmx.de ## +## ----------------------------- ##" + ) | sed "s/^/$as_me: WARNING: /" >&2 + ;; +esac + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + eval "$3=\$ac_header_compiler" +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_c_check_header_mongrel + +# ac_fn_c_try_run LINENO +# ---------------------- +# Try to link conftest.$ac_ext, and return whether this succeeded. Assumes +# that executables *can* be run. +ac_fn_c_try_run () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 + test $ac_status = 0; } && { ac_try='./conftest$ac_exeext' + { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then : + ac_retval=0 +else + $as_echo "$as_me: program exited with status $ac_status" >&5 + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=$ac_status +fi + rm -rf conftest.dSYM conftest_ipa8_conftest.oo + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_c_try_run + +# ac_fn_c_check_header_compile LINENO HEADER VAR INCLUDES +# ------------------------------------------------------- +# Tests whether HEADER exists and can be compiled using the include files in +# INCLUDES, setting the cache variable VAR accordingly. +ac_fn_c_check_header_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +#include <$2> +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + eval "$3=yes" +else + eval "$3=no" +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_c_check_header_compile + +# ac_fn_c_check_type LINENO TYPE VAR INCLUDES +# ------------------------------------------- +# Tests whether TYPE exists after having included INCLUDES, setting cache +# variable VAR accordingly. 
+ac_fn_c_check_type () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $2" >&5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + eval "$3=no" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +int +main () +{ +if (sizeof ($2)) + return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +int +main () +{ +if (sizeof (($2))) + return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + +else + eval "$3=yes" +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_c_check_type + +# ac_fn_c_compute_int LINENO EXPR VAR INCLUDES +# -------------------------------------------- +# Tries to find the compile-time value of EXPR in a program that includes +# INCLUDES, setting VAR accordingly. Returns whether the value could be +# computed +ac_fn_c_compute_int () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + if test "$cross_compiling" = yes; then + # Depending upon the size, compute the lo and hi bounds. +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +int +main () +{ +static int test_array [1 - 2 * !(($2) >= 0)]; +test_array [0] = 0; +return test_array [0]; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_lo=0 ac_mid=0 + while :; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +$4 +int +main () +{ +static int test_array [1 - 2 * !(($2) <= $ac_mid)]; +test_array [0] = 0; +return test_array [0]; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_hi=$ac_mid; break +else + as_fn_arith $ac_mid + 1 && ac_lo=$as_val + if test $ac_lo -le $ac_mid; then + ac_lo= ac_hi= + break + fi + as_fn_arith 2 '*' $ac_mid + 1 && ac_mid=$as_val +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + done +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +int +main () +{ +static int test_array [1 - 2 * !(($2) < 0)]; +test_array [0] = 0; +return test_array [0]; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_hi=-1 ac_mid=-1 + while :; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +int +main () +{ +static int test_array [1 - 2 * !(($2) >= $ac_mid)]; +test_array [0] = 0; +return test_array [0]; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_lo=$ac_mid; break +else + as_fn_arith '(' $ac_mid ')' - 1 && ac_hi=$as_val + if test $ac_mid -le $ac_hi; then + ac_lo= ac_hi= + break + fi + as_fn_arith 2 '*' $ac_mid && ac_mid=$as_val +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + done +else + ac_lo= ac_hi= +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +# Binary search between lo and hi bounds. +while test "x$ac_lo" != "x$ac_hi"; do + as_fn_arith '(' $ac_hi - $ac_lo ')' / 2 + $ac_lo && ac_mid=$as_val + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +$4 +int +main () +{ +static int test_array [1 - 2 * !(($2) <= $ac_mid)]; +test_array [0] = 0; +return test_array [0]; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_hi=$ac_mid +else + as_fn_arith '(' $ac_mid ')' + 1 && ac_lo=$as_val +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +done +case $ac_lo in #(( +?*) eval "$3=\$ac_lo"; ac_retval=0 ;; +'') ac_retval=1 ;; +esac + else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +$4 +static long int longval () { return $2; } +static unsigned long int ulongval () { return $2; } +#include +#include +int +main () +{ + + FILE *f = fopen ("conftest.val", "w"); + if (! f) + return 1; + if (($2) < 0) + { + long int i = longval (); + if (i != ($2)) + return 1; + fprintf (f, "%ld", i); + } + else + { + unsigned long int i = ulongval (); + if (i != ($2)) + return 1; + fprintf (f, "%lu", i); + } + /* Do not output a trailing newline, as this causes \r\n confusion + on some platforms. */ + return ferror (f) || fclose (f) != 0; + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + echo >>conftest.val; read $3 &5 +$as_echo_n "checking for $2... " >&6; } +if eval \${$3+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +/* Define $2 to an innocuous variant, in case declares $2. + For example, HP-UX 11i declares gettimeofday. */ +#define $2 innocuous_$2 + +/* System header to define __stub macros and hopefully few prototypes, + which can conflict with char $2 (); below. + Prefer to if __STDC__ is defined, since + exists even on freestanding compilers. */ + +#ifdef __STDC__ +# include +#else +# include +#endif + +#undef $2 + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. 
*/ +#ifdef __cplusplus +extern "C" +#endif +char $2 (); +/* The GNU C library defines this for functions which it implements + to always fail with ENOSYS. Some functions are actually named + something starting with __ and the normal name is an alias. */ +#if defined __stub_$2 || defined __stub___$2 +choke me +#endif + +int +main () +{ +return $2 (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + eval "$3=yes" +else + eval "$3=no" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +eval ac_res=\$$3 + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + +} # ac_fn_c_check_func + +# ac_fn_cxx_try_compile LINENO +# ---------------------------- +# Try to compile conftest.$ac_ext, and return whether this succeeded. +ac_fn_cxx_try_compile () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext + if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_cxx_werror_flag" || + test ! 
-s conftest.err + } && test -s conftest.$ac_objext; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_cxx_try_compile + +# ac_fn_cxx_try_link LINENO +# ------------------------- +# Try to link conftest.$ac_ext, and return whether this succeeded. +ac_fn_cxx_try_link () +{ + as_lineno=${as_lineno-"$1"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + rm -f conftest.$ac_objext conftest$ac_exeext + if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + grep -v '^ *+' conftest.err >conftest.er1 + cat conftest.er1 >&5 + mv -f conftest.er1 conftest.err + fi + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } && { + test -z "$ac_cxx_werror_flag" || + test ! -s conftest.err + } && test -s conftest$ac_exeext && { + test "$cross_compiling" = yes || + test -x conftest$ac_exeext + }; then : + ac_retval=0 +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + + ac_retval=1 +fi + # Delete the IPA/IPO (Inter Procedural Analysis/Optimization) information + # created by the PGI compiler (conftest_ipa8_conftest.oo), as it would + # interfere with the next link command; also delete a directory that is + # left behind by Apple's compiler. We do this before executing the actions. 
+ rm -rf conftest.dSYM conftest_ipa8_conftest.oo + eval $as_lineno_stack; ${as_lineno_stack:+:} unset as_lineno + as_fn_set_status $ac_retval + +} # ac_fn_cxx_try_link +cat >config.log <<_ACEOF +This file contains any messages produced by compilers while +running configure, to aid debugging if configure makes a mistake. + +It was created by tmLQCD $as_me 5.2.0, which was +generated by GNU Autoconf 2.69. Invocation command line was + + $ $0 $@ + +_ACEOF +exec 5>>config.log +{ +cat <<_ASUNAME +## --------- ## +## Platform. ## +## --------- ## + +hostname = `(hostname || uname -n) 2>/dev/null | sed 1q` +uname -m = `(uname -m) 2>/dev/null || echo unknown` +uname -r = `(uname -r) 2>/dev/null || echo unknown` +uname -s = `(uname -s) 2>/dev/null || echo unknown` +uname -v = `(uname -v) 2>/dev/null || echo unknown` + +/usr/bin/uname -p = `(/usr/bin/uname -p) 2>/dev/null || echo unknown` +/bin/uname -X = `(/bin/uname -X) 2>/dev/null || echo unknown` + +/bin/arch = `(/bin/arch) 2>/dev/null || echo unknown` +/usr/bin/arch -k = `(/usr/bin/arch -k) 2>/dev/null || echo unknown` +/usr/convex/getsysinfo = `(/usr/convex/getsysinfo) 2>/dev/null || echo unknown` +/usr/bin/hostinfo = `(/usr/bin/hostinfo) 2>/dev/null || echo unknown` +/bin/machine = `(/bin/machine) 2>/dev/null || echo unknown` +/usr/bin/oslevel = `(/usr/bin/oslevel) 2>/dev/null || echo unknown` +/bin/universe = `(/bin/universe) 2>/dev/null || echo unknown` + +_ASUNAME + +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + $as_echo "PATH: $as_dir" + done +IFS=$as_save_IFS + +} >&5 + +cat >&5 <<_ACEOF + + +## ----------- ## +## Core tests. ## +## ----------- ## + +_ACEOF + + +# Keep a trace of the command line. +# Strip out --no-create and --no-recursion so they do not pile up. +# Strip out --silent because we don't want to record it for future runs. +# Also quote any args containing shell meta-characters. 
+# Make two passes to allow for proper duplicate-argument suppression. +ac_configure_args= +ac_configure_args0= +ac_configure_args1= +ac_must_keep_next=false +for ac_pass in 1 2 +do + for ac_arg + do + case $ac_arg in + -no-create | --no-c* | -n | -no-recursion | --no-r*) continue ;; + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil) + continue ;; + *\'*) + ac_arg=`$as_echo "$ac_arg" | sed "s/'/'\\\\\\\\''/g"` ;; + esac + case $ac_pass in + 1) as_fn_append ac_configure_args0 " '$ac_arg'" ;; + 2) + as_fn_append ac_configure_args1 " '$ac_arg'" + if test $ac_must_keep_next = true; then + ac_must_keep_next=false # Got value, back to normal. + else + case $ac_arg in + *=* | --config-cache | -C | -disable-* | --disable-* \ + | -enable-* | --enable-* | -gas | --g* | -nfp | --nf* \ + | -q | -quiet | --q* | -silent | --sil* | -v | -verb* \ + | -with-* | --with-* | -without-* | --without-* | --x) + case "$ac_configure_args0 " in + "$ac_configure_args1"*" '$ac_arg' "* ) continue ;; + esac + ;; + -* ) ac_must_keep_next=true ;; + esac + fi + as_fn_append ac_configure_args " '$ac_arg'" + ;; + esac + done +done +{ ac_configure_args0=; unset ac_configure_args0;} +{ ac_configure_args1=; unset ac_configure_args1;} + +# When interrupted or exit'd, cleanup temporary files, and complete +# config.log. We remove comments because anyway the quotes in there +# would cause problems or look ugly. +# WARNING: Use '\'' to represent an apostrophe within the trap. +# WARNING: Do not start the trap code with a newline, due to a FreeBSD 4.0 bug. +trap 'exit_status=$? + # Save into config.log some information that might help in debugging. + { + echo + + $as_echo "## ---------------- ## +## Cache variables. 
## +## ---------------- ##" + echo + # The following way of writing the cache mishandles newlines in values, +( + for ac_var in `(set) 2>&1 | sed -n '\''s/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'\''`; do + eval ac_val=\$$ac_var + case $ac_val in #( + *${as_nl}*) + case $ac_var in #( + *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 +$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + esac + case $ac_var in #( + _ | IFS | as_nl) ;; #( + BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( + *) { eval $ac_var=; unset $ac_var;} ;; + esac ;; + esac + done + (set) 2>&1 | + case $as_nl`(ac_space='\'' '\''; set) 2>&1` in #( + *${as_nl}ac_space=\ *) + sed -n \ + "s/'\''/'\''\\\\'\'''\''/g; + s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\''\\2'\''/p" + ;; #( + *) + sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" + ;; + esac | + sort +) + echo + + $as_echo "## ----------------- ## +## Output variables. ## +## ----------------- ##" + echo + for ac_var in $ac_subst_vars + do + eval ac_val=\$$ac_var + case $ac_val in + *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + esac + $as_echo "$ac_var='\''$ac_val'\''" + done | sort + echo + + if test -n "$ac_subst_files"; then + $as_echo "## ------------------- ## +## File substitutions. ## +## ------------------- ##" + echo + for ac_var in $ac_subst_files + do + eval ac_val=\$$ac_var + case $ac_val in + *\'\''*) ac_val=`$as_echo "$ac_val" | sed "s/'\''/'\''\\\\\\\\'\'''\''/g"`;; + esac + $as_echo "$ac_var='\''$ac_val'\''" + done | sort + echo + fi + + if test -s confdefs.h; then + $as_echo "## ----------- ## +## confdefs.h. 
## +## ----------- ##" + echo + cat confdefs.h + echo + fi + test "$ac_signal" != 0 && + $as_echo "$as_me: caught signal $ac_signal" + $as_echo "$as_me: exit $exit_status" + } >&5 + rm -f core *.core core.conftest.* && + rm -f -r conftest* confdefs* conf$$* $ac_clean_files && + exit $exit_status +' 0 +for ac_signal in 1 2 13 15; do + trap 'ac_signal='$ac_signal'; as_fn_exit 1' $ac_signal +done +ac_signal=0 + +# confdefs.h avoids OS command line length limits that DEFS can exceed. +rm -f -r conftest* confdefs.h + +$as_echo "/* confdefs.h */" > confdefs.h + +# Predefined preprocessor variables. + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_NAME "$PACKAGE_NAME" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_TARNAME "$PACKAGE_TARNAME" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_VERSION "$PACKAGE_VERSION" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_STRING "$PACKAGE_STRING" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_BUGREPORT "$PACKAGE_BUGREPORT" +_ACEOF + +cat >>confdefs.h <<_ACEOF +#define PACKAGE_URL "$PACKAGE_URL" +_ACEOF + + +# Let the site file select an alternate cache file if it wants to. +# Prefer an explicitly selected file to automatically selected ones. +ac_site_file1=NONE +ac_site_file2=NONE +if test -n "$CONFIG_SITE"; then + # We do not want a PATH search for config.site. 
+ case $CONFIG_SITE in #(( + -*) ac_site_file1=./$CONFIG_SITE;; + */*) ac_site_file1=$CONFIG_SITE;; + *) ac_site_file1=./$CONFIG_SITE;; + esac +elif test "x$prefix" != xNONE; then + ac_site_file1=$prefix/share/config.site + ac_site_file2=$prefix/etc/config.site +else + ac_site_file1=$ac_default_prefix/share/config.site + ac_site_file2=$ac_default_prefix/etc/config.site +fi +for ac_site_file in "$ac_site_file1" "$ac_site_file2" +do + test "x$ac_site_file" = xNONE && continue + if test /dev/null != "$ac_site_file" && test -r "$ac_site_file"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: loading site script $ac_site_file" >&5 +$as_echo "$as_me: loading site script $ac_site_file" >&6;} + sed 's/^/| /' "$ac_site_file" >&5 + . "$ac_site_file" \ + || { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "failed to load site script $ac_site_file +See \`config.log' for more details" "$LINENO" 5; } + fi +done + +if test -r "$cache_file"; then + # Some versions of bash will fail to source /dev/null (special files + # actually), so we avoid doing that. DJGPP emulates it as a regular file. + if test /dev/null != "$cache_file" && test -f "$cache_file"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: loading cache $cache_file" >&5 +$as_echo "$as_me: loading cache $cache_file" >&6;} + case $cache_file in + [\\/]* | ?:[\\/]* ) . "$cache_file";; + *) . "./$cache_file";; + esac + fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: creating cache $cache_file" >&5 +$as_echo "$as_me: creating cache $cache_file" >&6;} + >$cache_file +fi + +# Check that the precious variables saved in the cache have kept the same +# value. 
+ac_cache_corrupted=false +for ac_var in $ac_precious_vars; do + eval ac_old_set=\$ac_cv_env_${ac_var}_set + eval ac_new_set=\$ac_env_${ac_var}_set + eval ac_old_val=\$ac_cv_env_${ac_var}_value + eval ac_new_val=\$ac_env_${ac_var}_value + case $ac_old_set,$ac_new_set in + set,) + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was set to \`$ac_old_val' in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,set) + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' was not set in the previous run" >&5 +$as_echo "$as_me: error: \`$ac_var' was not set in the previous run" >&2;} + ac_cache_corrupted=: ;; + ,);; + *) + if test "x$ac_old_val" != "x$ac_new_val"; then + # differences in whitespace do not lead to failure. + ac_old_val_w=`echo x $ac_old_val` + ac_new_val_w=`echo x $ac_new_val` + if test "$ac_old_val_w" != "$ac_new_val_w"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: error: \`$ac_var' has changed since the previous run:" >&5 +$as_echo "$as_me: error: \`$ac_var' has changed since the previous run:" >&2;} + ac_cache_corrupted=: + else + { $as_echo "$as_me:${as_lineno-$LINENO}: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&5 +$as_echo "$as_me: warning: ignoring whitespace changes in \`$ac_var' since the previous run:" >&2;} + eval $ac_var=\$ac_old_val + fi + { $as_echo "$as_me:${as_lineno-$LINENO}: former value: \`$ac_old_val'" >&5 +$as_echo "$as_me: former value: \`$ac_old_val'" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: current value: \`$ac_new_val'" >&5 +$as_echo "$as_me: current value: \`$ac_new_val'" >&2;} + fi;; + esac + # Pass precious variables to config.status. + if test "$ac_new_set" = set; then + case $ac_new_val in + *\'*) ac_arg=$ac_var=`$as_echo "$ac_new_val" | sed "s/'/'\\\\\\\\''/g"` ;; + *) ac_arg=$ac_var=$ac_new_val ;; + esac + case " $ac_configure_args " in + *" '$ac_arg' "*) ;; # Avoid dups. 
Use of quotes ensures accuracy. + *) as_fn_append ac_configure_args " '$ac_arg'" ;; + esac + fi +done +if $ac_cache_corrupted; then + { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} + { $as_echo "$as_me:${as_lineno-$LINENO}: error: changes in the environment can compromise the build" >&5 +$as_echo "$as_me: error: changes in the environment can compromise the build" >&2;} + as_fn_error $? "run \`make distclean' and/or \`rm $cache_file' and start over" "$LINENO" 5 +fi +## -------------------- ## +## Main body of script. ## +## -------------------- ## + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + +ac_config_headers="$ac_config_headers config.h" + + +ac_aux_dir= +for ac_dir in "$srcdir" "$srcdir/.." "$srcdir/../.."; do + if test -f "$ac_dir/install-sh"; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install-sh -c" + break + elif test -f "$ac_dir/install.sh"; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/install.sh -c" + break + elif test -f "$ac_dir/shtool"; then + ac_aux_dir=$ac_dir + ac_install_sh="$ac_aux_dir/shtool install -c" + break + fi +done +if test -z "$ac_aux_dir"; then + as_fn_error $? "cannot find install-sh, install.sh, or shtool in \"$srcdir\" \"$srcdir/..\" \"$srcdir/../..\"" "$LINENO" 5 +fi + +# These three variables are undocumented and unsupported, +# and are intended to be withdrawn in a future Autoconf release. +# They can cause serious problems if a builder's source tree is in a directory +# whose full name contains unusual characters. +ac_config_guess="$SHELL $ac_aux_dir/config.guess" # Please don't use this var. +ac_config_sub="$SHELL $ac_aux_dir/config.sub" # Please don't use this var. +ac_configure="$SHELL $ac_aux_dir/configure" # Please don't use this var. 
+ + +# Make sure we can run config.sub. +$SHELL "$ac_aux_dir/config.sub" sun4 >/dev/null 2>&1 || + as_fn_error $? "cannot run $SHELL $ac_aux_dir/config.sub" "$LINENO" 5 + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking build system type" >&5 +$as_echo_n "checking build system type... " >&6; } +if ${ac_cv_build+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_build_alias=$build_alias +test "x$ac_build_alias" = x && + ac_build_alias=`$SHELL "$ac_aux_dir/config.guess"` +test "x$ac_build_alias" = x && + as_fn_error $? "cannot guess build type; you must specify one" "$LINENO" 5 +ac_cv_build=`$SHELL "$ac_aux_dir/config.sub" $ac_build_alias` || + as_fn_error $? "$SHELL $ac_aux_dir/config.sub $ac_build_alias failed" "$LINENO" 5 + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_build" >&5 +$as_echo "$ac_cv_build" >&6; } +case $ac_cv_build in +*-*-*) ;; +*) as_fn_error $? "invalid value of canonical build" "$LINENO" 5;; +esac +build=$ac_cv_build +ac_save_IFS=$IFS; IFS='-' +set x $ac_cv_build +shift +build_cpu=$1 +build_vendor=$2 +shift; shift +# Remember, the first character of IFS is used to create $*, +# except with old shells: +build_os=$* +IFS=$ac_save_IFS +case $build_os in *\ *) build_os=`echo "$build_os" | sed 's/ /-/g'`;; esac + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking host system type" >&5 +$as_echo_n "checking host system type... " >&6; } +if ${ac_cv_host+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "x$host_alias" = x; then + ac_cv_host=$ac_cv_build +else + ac_cv_host=`$SHELL "$ac_aux_dir/config.sub" $host_alias` || + as_fn_error $? "$SHELL $ac_aux_dir/config.sub $host_alias failed" "$LINENO" 5 +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_host" >&5 +$as_echo "$ac_cv_host" >&6; } +case $ac_cv_host in +*-*-*) ;; +*) as_fn_error $? 
"invalid value of canonical host" "$LINENO" 5;; +esac +host=$ac_cv_host +ac_save_IFS=$IFS; IFS='-' +set x $ac_cv_host +shift +host_cpu=$1 +host_vendor=$2 +shift; shift +# Remember, the first character of IFS is used to create $*, +# except with old shells: +host_os=$* +IFS=$ac_save_IFS +case $host_os in *\ *) host_os=`echo "$host_os" | sed 's/ /-/g'`;; esac + + + +test "$program_prefix" != NONE && + program_transform_name="s&^&$program_prefix&;$program_transform_name" +# Use a double $ so make ignores it. +test "$program_suffix" != NONE && + program_transform_name="s&\$&$program_suffix&;$program_transform_name" +# Double any \ or $. +# By default was `s,x,x', remove it if useless. +ac_script='s/[\\$]/&&/g;s/;s,x,x,$//' +program_transform_name=`$as_echo "$program_transform_name" | sed "$ac_script"` + + +if test "$host_vendor" = "cray"; then + ac_cv_c_bigendian=yes +fi + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}gcc", so it can be a program name with args. +set dummy ${ac_tool_prefix}gcc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CC="${ac_tool_prefix}gcc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_CC"; then + ac_ct_CC=$CC + # Extract the first word of "gcc", so it can be a program name with args. +set dummy gcc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CC"; then + ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_CC="gcc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_CC=$ac_cv_prog_ac_ct_CC +if test -n "$ac_ct_CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 +$as_echo "$ac_ct_CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_ct_CC" = x; then + CC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CC=$ac_ct_CC + fi +else + CC="$ac_cv_prog_CC" +fi + +if test -z "$CC"; then + if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}cc", so it can be a program name with args. +set dummy ${ac_tool_prefix}cc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CC="${ac_tool_prefix}cc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + fi +fi +if test -z "$CC"; then + # Extract the first word of "cc", so it can be a program name with args. +set dummy cc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else + ac_prog_rejected=no +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + if test "$as_dir/$ac_word$ac_exec_ext" = "/usr/ucb/cc"; then + ac_prog_rejected=yes + continue + fi + ac_cv_prog_CC="cc" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +if test $ac_prog_rejected = yes; then + # We found a bogon in the path, so make sure we never use it. + set dummy $ac_cv_prog_CC + shift + if test $# != 0; then + # We chose a different compiler from the bogus one. + # However, it has the same basename, so the bogon will be chosen + # first if we set CC to just the basename; use the full file name. 
+ shift + ac_cv_prog_CC="$as_dir/$ac_word${1+' '}$@" + fi +fi +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$CC"; then + if test -n "$ac_tool_prefix"; then + for ac_prog in cl.exe + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CC"; then + ac_cv_prog_CC="$CC" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CC="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CC=$ac_cv_prog_CC +if test -n "$CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CC" >&5 +$as_echo "$CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CC" && break + done +fi +if test -z "$CC"; then + ac_ct_CC=$CC + for ac_prog in cl.exe +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CC"; then + ac_cv_prog_ac_ct_CC="$ac_ct_CC" # Let the user override the test. 
+else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_CC="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_CC=$ac_cv_prog_ac_ct_CC +if test -n "$ac_ct_CC"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CC" >&5 +$as_echo "$ac_ct_CC" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_CC" && break +done + + if test "x$ac_ct_CC" = x; then + CC="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CC=$ac_ct_CC + fi +fi + +fi + + +test -z "$CC" && { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "no acceptable C compiler found in \$PATH +See \`config.log' for more details" "$LINENO" 5; } + +# Provide some information about the compiler. +$as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler version" >&5 +set X $ac_compile +ac_compiler=$2 +for ac_option in --version -v -V -qversion; do + { { ac_try="$ac_compiler $ac_option >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compiler $ac_option >&5") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + sed '10a\ +... rest of stderr output deleted ... 
+ 10q' conftest.err >conftest.er1 + cat conftest.er1 >&5 + fi + rm -f conftest.er1 conftest.err + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +done + +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +ac_clean_files_save=$ac_clean_files +ac_clean_files="$ac_clean_files a.out a.out.dSYM a.exe b.out" +# Try to create an executable without -o first, disregard a.out. +# It will help us diagnose broken compilers, and finding out an intuition +# of exeext. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether the C compiler works" >&5 +$as_echo_n "checking whether the C compiler works... " >&6; } +ac_link_default=`$as_echo "$ac_link" | sed 's/ -o *conftest[^ ]*//'` + +# The possible output files: +ac_files="a.out conftest.exe conftest a.exe a_out.exe b.out conftest.*" + +ac_rmfiles= +for ac_file in $ac_files +do + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; + * ) ac_rmfiles="$ac_rmfiles $ac_file";; + esac +done +rm -f $ac_rmfiles + +if { { ac_try="$ac_link_default" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link_default") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then : + # Autoconf-2.13 could set the ac_cv_exeext variable to `no'. +# So ignore a value of `no', otherwise this would lead to `EXEEXT = no' +# in a Makefile. We should not override ac_cv_exeext if it was cached, +# so that the user can short-circuit this test for compilers unknown to +# Autoconf. 
+for ac_file in $ac_files '' +do + test -f "$ac_file" || continue + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) + ;; + [ab].out ) + # We found the default executable, but exeext='' is most + # certainly right. + break;; + *.* ) + if test "${ac_cv_exeext+set}" = set && test "$ac_cv_exeext" != no; + then :; else + ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` + fi + # We set ac_cv_exeext here because the later test for it is not + # safe: cross compilers may not add the suffix if given an `-o' + # argument, so we may need to know it at that point already. + # Even if this section looks crufty: it has the advantage of + # actually working. + break;; + * ) + break;; + esac +done +test "$ac_cv_exeext" = no && ac_cv_exeext= + +else + ac_file='' +fi +if test -z "$ac_file"; then : + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +$as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "C compiler cannot create executables +See \`config.log' for more details" "$LINENO" 5; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C compiler default output file name" >&5 +$as_echo_n "checking for C compiler default output file name... " >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_file" >&5 +$as_echo "$ac_file" >&6; } +ac_exeext=$ac_cv_exeext + +rm -f -r a.out a.out.dSYM a.exe conftest$ac_cv_exeext b.out +ac_clean_files=$ac_clean_files_save +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of executables" >&5 +$as_echo_n "checking for suffix of executables... 
" >&6; } +if { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then : + # If both `conftest.exe' and `conftest' are `present' (well, observable) +# catch `conftest.exe'. For instance with Cygwin, `ls conftest' will +# work properly (i.e., refer to `conftest.exe'), while it won't with +# `rm'. +for ac_file in conftest.exe conftest conftest.*; do + test -f "$ac_file" || continue + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM | *.o | *.obj ) ;; + *.* ) ac_cv_exeext=`expr "$ac_file" : '[^.]*\(\..*\)'` + break;; + * ) break;; + esac +done +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot compute suffix of executables: cannot compile and link +See \`config.log' for more details" "$LINENO" 5; } +fi +rm -f conftest conftest$ac_cv_exeext +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_exeext" >&5 +$as_echo "$ac_cv_exeext" >&6; } + +rm -f conftest.$ac_ext +EXEEXT=$ac_cv_exeext +ac_exeext=$EXEEXT +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +int +main () +{ +FILE *f = fopen ("conftest.out", "w"); + return ferror (f) || fclose (f) != 0; + + ; + return 0; +} +_ACEOF +ac_clean_files="$ac_clean_files conftest.out" +# Check that the compiler produces executables we can run. If not, either +# the compiler is broken, or we cross compile. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are cross compiling" >&5 +$as_echo_n "checking whether we are cross compiling... 
" >&6; } +if test "$cross_compiling" != yes; then + { { ac_try="$ac_link" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_link") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } + if { ac_try='./conftest$ac_cv_exeext' + { { case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_try") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; }; then + cross_compiling=no + else + if test "$cross_compiling" = maybe; then + cross_compiling=yes + else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot run C compiled programs. +If you meant to cross compile, use \`--host'. +See \`config.log' for more details" "$LINENO" 5; } + fi + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $cross_compiling" >&5 +$as_echo "$cross_compiling" >&6; } + +rm -f conftest.$ac_ext conftest$ac_cv_exeext conftest.out +ac_clean_files=$ac_clean_files_save +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for suffix of object files" >&5 +$as_echo_n "checking for suffix of object files... " >&6; } +if ${ac_cv_objext+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +rm -f conftest.o conftest.obj +if { { ac_try="$ac_compile" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compile") 2>&5 + ac_status=$? 
+ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; }; then : + for ac_file in conftest.o conftest.obj conftest.*; do + test -f "$ac_file" || continue; + case $ac_file in + *.$ac_ext | *.xcoff | *.tds | *.d | *.pdb | *.xSYM | *.bb | *.bbg | *.map | *.inf | *.dSYM ) ;; + *) ac_cv_objext=`expr "$ac_file" : '.*\.\(.*\)'` + break;; + esac +done +else + $as_echo "$as_me: failed program was:" >&5 +sed 's/^/| /' conftest.$ac_ext >&5 + +{ { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "cannot compute suffix of object files: cannot compile +See \`config.log' for more details" "$LINENO" 5; } +fi +rm -f conftest.$ac_cv_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_objext" >&5 +$as_echo "$ac_cv_objext" >&6; } +OBJEXT=$ac_cv_objext +ac_objext=$OBJEXT +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C compiler" >&5 +$as_echo_n "checking whether we are using the GNU C compiler... " >&6; } +if ${ac_cv_c_compiler_gnu+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ +#ifndef __GNUC__ + choke me +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_compiler_gnu=yes +else + ac_compiler_gnu=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_cv_c_compiler_gnu=$ac_compiler_gnu + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_compiler_gnu" >&5 +$as_echo "$ac_cv_c_compiler_gnu" >&6; } +if test $ac_compiler_gnu = yes; then + GCC=yes +else + GCC= +fi +ac_test_CFLAGS=${CFLAGS+set} +ac_save_CFLAGS=$CFLAGS +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CC accepts -g" >&5 +$as_echo_n "checking whether $CC accepts -g... 
" >&6; } +if ${ac_cv_prog_cc_g+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_save_c_werror_flag=$ac_c_werror_flag + ac_c_werror_flag=yes + ac_cv_prog_cc_g=no + CFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_g=yes +else + CFLAGS="" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + +else + ac_c_werror_flag=$ac_save_c_werror_flag + CFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_g=yes +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_c_werror_flag=$ac_save_c_werror_flag +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_g" >&5 +$as_echo "$ac_cv_prog_cc_g" >&6; } +if test "$ac_test_CFLAGS" = set; then + CFLAGS=$ac_save_CFLAGS +elif test $ac_cv_prog_cc_g = yes; then + if test "$GCC" = yes; then + CFLAGS="-g -O2" + else + CFLAGS="-g" + fi +else + if test "$GCC" = yes; then + CFLAGS="-O2" + else + CFLAGS= + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C89" >&5 +$as_echo_n "checking for $CC option to accept ISO C89... " >&6; } +if ${ac_cv_prog_cc_c89+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_prog_cc_c89=no +ac_save_CC=$CC +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +struct stat; +/* Most of the following tests are stolen from RCS 5.7's src/conf.sh. 
*/ +struct buf { int x; }; +FILE * (*rcsopen) (struct buf *, struct stat *, int); +static char *e (p, i) + char **p; + int i; +{ + return p[i]; +} +static char *f (char * (*g) (char **, int), char **p, ...) +{ + char *s; + va_list v; + va_start (v,p); + s = g (p, va_arg (v,int)); + va_end (v); + return s; +} + +/* OSF 4.0 Compaq cc is some sort of almost-ANSI by default. It has + function prototypes and stuff, but not '\xHH' hex character constants. + These don't provoke an error unfortunately, instead are silently treated + as 'x'. The following induces an error, until -std is added to get + proper ANSI mode. Curiously '\x00'!='x' always comes out true, for an + array size at least. It's necessary to write '\x00'==0 to get something + that's true only with -std. */ +int osf4_cc_array ['\x00' == 0 ? 1 : -1]; + +/* IBM C 6 for AIX is almost-ANSI by default, but it replaces macro parameters + inside strings and character constants. */ +#define FOO(x) 'x' +int xlc6_cc_array[FOO(a) == 'x' ? 
1 : -1]; + +int test (int i, double x); +struct s1 {int (*f) (int a);}; +struct s2 {int (*f) (double a);}; +int pairnames (int, char **, FILE *(*)(struct buf *, struct stat *, int), int, int); +int argc; +char **argv; +int +main () +{ +return f (e, argv, 0) != argv[0] || f (e, argv, 1) != argv[1]; + ; + return 0; +} +_ACEOF +for ac_arg in '' -qlanglvl=extc89 -qlanglvl=ansi -std \ + -Ae "-Aa -D_HPUX_SOURCE" "-Xc -D__EXTENSIONS__" +do + CC="$ac_save_CC $ac_arg" + if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_c89=$ac_arg +fi +rm -f core conftest.err conftest.$ac_objext + test "x$ac_cv_prog_cc_c89" != "xno" && break +done +rm -f conftest.$ac_ext +CC=$ac_save_CC + +fi +# AC_CACHE_VAL +case "x$ac_cv_prog_cc_c89" in + x) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 +$as_echo "none needed" >&6; } ;; + xno) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 +$as_echo "unsupported" >&6; } ;; + *) + CC="$CC $ac_cv_prog_cc_c89" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c89" >&5 +$as_echo "$ac_cv_prog_cc_c89" >&6; } ;; +esac +if test "x$ac_cv_prog_cc_c89" != xno; then : + +fi + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to accept ISO C99" >&5 +$as_echo_n "checking for $CC option to accept ISO C99... " >&6; } +if ${ac_cv_prog_cc_c99+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_prog_cc_c99=no +ac_save_CC=$CC +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include <stdarg.h> +#include <stdbool.h> +#include <stdlib.h> +#include <wchar.h> +#include <stdio.h> + +// Check varargs macros. These examples are taken from C99 6.10.3.5. +#define debug(...) fprintf (stderr, __VA_ARGS__) +#define showlist(...) puts (#__VA_ARGS__) +#define report(test,...) ((test) ? 
puts (#test) : printf (__VA_ARGS__)) +static void +test_varargs_macros (void) +{ + int x = 1234; + int y = 5678; + debug ("Flag"); + debug ("X = %d\n", x); + showlist (The first, second, and third items.); + report (x>y, "x is %d but y is %d", x, y); +} + +// Check long long types. +#define BIG64 18446744073709551615ull +#define BIG32 4294967295ul +#define BIG_OK (BIG64 / BIG32 == 4294967297ull && BIG64 % BIG32 == 0) +#if !BIG_OK + your preprocessor is broken; +#endif +#if BIG_OK +#else + your preprocessor is broken; +#endif +static long long int bignum = -9223372036854775807LL; +static unsigned long long int ubignum = BIG64; + +struct incomplete_array +{ + int datasize; + double data[]; +}; + +struct named_init { + int number; + const wchar_t *name; + double average; +}; + +typedef const char *ccp; + +static inline int +test_restrict (ccp restrict text) +{ + // See if C++-style comments work. + // Iterate through items via the restricted pointer. + // Also check for declarations in for loops. + for (unsigned int i = 0; *(text+i) != '\0'; ++i) + continue; + return 0; +} + +// Check varargs and va_copy. +static void +test_varargs (const char *format, ...) +{ + va_list args; + va_start (args, format); + va_list args_copy; + va_copy (args_copy, args); + + const char *str; + int number; + float fnumber; + + while (*format) + { + switch (*format++) + { + case 's': // string + str = va_arg (args_copy, const char *); + break; + case 'd': // int + number = va_arg (args_copy, int); + break; + case 'f': // float + fnumber = va_arg (args_copy, double); + break; + default: + break; + } + } + va_end (args_copy); + va_end (args); +} + +int +main () +{ + + // Check bool. + _Bool success = false; + + // Check restrict. + if (test_restrict ("String literal") == 0) + success = true; + char *restrict newvar = "Another string"; + + // Check varargs. + test_varargs ("s, d' f .", "string", 65, 34.234); + test_varargs_macros (); + + // Check flexible array members. 
+ struct incomplete_array *ia = + malloc (sizeof (struct incomplete_array) + (sizeof (double) * 10)); + ia->datasize = 10; + for (int i = 0; i < ia->datasize; ++i) + ia->data[i] = i * 1.234; + + // Check named initializers. + struct named_init ni = { + .number = 34, + .name = L"Test wide string", + .average = 543.34343, + }; + + ni.number = 58; + + int dynamic_array[ni.number]; + dynamic_array[ni.number - 1] = 543; + + // work around unused variable warnings + return (!success || bignum == 0LL || ubignum == 0uLL || newvar[0] == 'x' + || dynamic_array[ni.number - 1] != 543); + + ; + return 0; +} +_ACEOF +for ac_arg in '' -std=gnu99 -std=c99 -c99 -AC99 -D_STDC_C99= -qlanglvl=extc99 +do + CC="$ac_save_CC $ac_arg" + if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_prog_cc_c99=$ac_arg +fi +rm -f core conftest.err conftest.$ac_objext + test "x$ac_cv_prog_cc_c99" != "xno" && break +done +rm -f conftest.$ac_ext +CC=$ac_save_CC + +fi +# AC_CACHE_VAL +case "x$ac_cv_prog_cc_c99" in + x) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: none needed" >&5 +$as_echo "none needed" >&6; } ;; + xno) + { $as_echo "$as_me:${as_lineno-$LINENO}: result: unsupported" >&5 +$as_echo "unsupported" >&6; } ;; + *) + CC="$CC $ac_cv_prog_cc_c99" + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cc_c99" >&5 +$as_echo "$ac_cv_prog_cc_c99" >&6; } ;; +esac +if test "x$ac_cv_prog_cc_c99" != xno; then : + +fi + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for an ANSI C-conforming const" >&5 +$as_echo_n "checking for an ANSI C-conforming const... " >&6; } +if ${ac_cv_c_const+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +int +main () +{ + +#ifndef __cplusplus + /* Ultrix mips cc rejects this sort of thing. */ + typedef int charset[2]; + const charset cs = { 0, 0 }; + /* SunOS 4.1.1 cc rejects this. */ + char const *const *pcpcc; + char **ppc; + /* NEC SVR4.0.2 mips cc rejects this. 
*/ + struct point {int x, y;}; + static struct point const zero = {0,0}; + /* AIX XL C 1.02.0.0 rejects this. + It does not let you subtract one const X* pointer from another in + an arm of an if-expression whose if-part is not a constant + expression */ + const char *g = "string"; + pcpcc = &g + (g ? g-g : 0); + /* HPUX 7.0 cc rejects these. */ + ++pcpcc; + ppc = (char**) pcpcc; + pcpcc = (char const *const *) ppc; + { /* SCO 3.2v4 cc rejects this sort of thing. */ + char tx; + char *t = &tx; + char const *s = 0 ? (char *) 0 : (char const *) 0; + + *t++ = 0; + if (s) return 0; + } + { /* Someone thinks the Sun supposedly-ANSI compiler will reject this. */ + int x[] = {25, 17}; + const int *foo = &x[0]; + ++foo; + } + { /* Sun SC1.0 ANSI compiler rejects this -- but not the above. */ + typedef const int *iptr; + iptr p = 0; + ++p; + } + { /* AIX XL C 1.02.0.0 rejects this sort of thing, saying + "k.c", line 2.27: 1506-025 (S) Operand must be a modifiable lvalue. */ + struct s { int j; const int *ap[3]; } bx; + struct s *b = &bx; b->j = 5; + } + { /* ULTRIX-32 V3.1 (Rev 9) vcc rejects this */ + const int foo = 10; + if (!foo) return 0; + } + return !cs[0] && !zero.x; +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_c_const=yes +else + ac_cv_c_const=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_const" >&5 +$as_echo "$ac_cv_c_const" >&6; } +if test $ac_cv_c_const = no; then + +$as_echo "#define const /**/" >>confdefs.h + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for inline" >&5 +$as_echo_n "checking for inline... " >&6; } +if ${ac_cv_c_inline+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_c_inline=no +for ac_kw in inline __inline__ __inline; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#ifndef __cplusplus +typedef int foo_t; +static $ac_kw foo_t static_foo () {return 0; } +$ac_kw foo_t foo () {return 0; } +#endif + +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_c_inline=$ac_kw +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + test "$ac_cv_c_inline" != no && break +done + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_inline" >&5 +$as_echo "$ac_cv_c_inline" >&6; } + +case $ac_cv_c_inline in + inline | yes) ;; + *) + case $ac_cv_c_inline in + no) ac_val=;; + *) ac_val=$ac_cv_c_inline;; + esac + cat >>confdefs.h <<_ACEOF +#ifndef __cplusplus +#define inline $ac_val +#endif +_ACEOF + ;; +esac + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for C/C++ restrict keyword" >&5 +$as_echo_n "checking for C/C++ restrict keyword... " >&6; } +if ${ac_cv_c_restrict+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_c_restrict=no + # The order here caters to the fact that C++ does not require restrict. + for ac_kw in __restrict __restrict__ _Restrict restrict; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +typedef int * int_ptr; + int foo (int_ptr $ac_kw ip) { + return ip[0]; + } +int +main () +{ +int s[1]; + int * $ac_kw t = s; + t[0] = 0; + return foo(t) + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_c_restrict=$ac_kw +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + test "$ac_cv_c_restrict" != no && break + done + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_restrict" >&5 +$as_echo "$ac_cv_c_restrict" >&6; } + + case $ac_cv_c_restrict in + restrict) ;; + no) $as_echo "#define restrict /**/" >>confdefs.h + ;; + *) cat >>confdefs.h <<_ACEOF +#define restrict $ac_cv_c_restrict +_ACEOF + ;; + esac + +ac_ext=f +ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu +if test -n "$ac_tool_prefix"; then + for ac_prog in g77 xlf f77 frt pgf77 cf77 fort77 fl32 af77 xlf90 f90 pgf90 pghpf epcf90 gfortran g95 xlf95 f95 fort ifort ifc efc pgfortran pgf95 lf95 ftn nagfor + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_F77+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$F77"; then + ac_cv_prog_F77="$F77" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_F77="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +F77=$ac_cv_prog_F77 +if test -n "$F77"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $F77" >&5 +$as_echo "$F77" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$F77" && break + done +fi +if test -z "$F77"; then + ac_ct_F77=$F77 + for ac_prog in g77 xlf f77 frt pgf77 cf77 fort77 fl32 af77 xlf90 f90 pgf90 pghpf epcf90 gfortran g95 xlf95 f95 fort ifort ifc efc pgfortran pgf95 lf95 ftn nagfor +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_F77+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_F77"; then + ac_cv_prog_ac_ct_F77="$ac_ct_F77" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_F77="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_F77=$ac_cv_prog_ac_ct_F77 +if test -n "$ac_ct_F77"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_F77" >&5 +$as_echo "$ac_ct_F77" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_F77" && break +done + + if test "x$ac_ct_F77" = x; then + F77="" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + F77=$ac_ct_F77 + fi +fi + + +# Provide some information about the compiler. +$as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran 77 compiler version" >&5 +set X $ac_compile +ac_compiler=$2 +for ac_option in --version -v -V -qversion; do + { { ac_try="$ac_compiler $ac_option >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compiler $ac_option >&5") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + sed '10a\ +... rest of stderr output deleted ... + 10q' conftest.err >conftest.er1 + cat conftest.er1 >&5 + fi + rm -f conftest.er1 conftest.err + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +done +rm -f a.out + +# If we don't use `.F' as extension, the preprocessor is not run on the +# input file. (Note that this only needs to work for GNU compilers.) 
+ac_save_ext=$ac_ext +ac_ext=F +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU Fortran 77 compiler" >&5 +$as_echo_n "checking whether we are using the GNU Fortran 77 compiler... " >&6; } +if ${ac_cv_f77_compiler_gnu+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.$ac_ext <<_ACEOF + program main +#ifndef __GNUC__ + choke me +#endif + + end +_ACEOF +if ac_fn_f77_try_compile "$LINENO"; then : + ac_compiler_gnu=yes +else + ac_compiler_gnu=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_cv_f77_compiler_gnu=$ac_compiler_gnu + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_f77_compiler_gnu" >&5 +$as_echo "$ac_cv_f77_compiler_gnu" >&6; } +ac_ext=$ac_save_ext +ac_test_FFLAGS=${FFLAGS+set} +ac_save_FFLAGS=$FFLAGS +FFLAGS= +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $F77 accepts -g" >&5 +$as_echo_n "checking whether $F77 accepts -g... " >&6; } +if ${ac_cv_prog_f77_g+:} false; then : + $as_echo_n "(cached) " >&6 +else + FFLAGS=-g +cat > conftest.$ac_ext <<_ACEOF + program main + + end +_ACEOF +if ac_fn_f77_try_compile "$LINENO"; then : + ac_cv_prog_f77_g=yes +else + ac_cv_prog_f77_g=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_f77_g" >&5 +$as_echo "$ac_cv_prog_f77_g" >&6; } +if test "$ac_test_FFLAGS" = set; then + FFLAGS=$ac_save_FFLAGS +elif test $ac_cv_prog_f77_g = yes; then + if test "x$ac_cv_f77_compiler_gnu" = xyes; then + FFLAGS="-g -O2" + else + FFLAGS="-g" + fi +else + if test "x$ac_cv_f77_compiler_gnu" = xyes; then + FFLAGS="-O2" + else + FFLAGS= + fi +fi + +if test $ac_compiler_gnu = yes; then + G77=yes +else + G77= +fi +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + +ac_ext=f +ac_compile='$F77 
-c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to get verbose linking output from $F77" >&5 +$as_echo_n "checking how to get verbose linking output from $F77... " >&6; } +if ${ac_cv_prog_f77_v+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.$ac_ext <<_ACEOF + program main + + end +_ACEOF +if ac_fn_f77_try_compile "$LINENO"; then : + ac_cv_prog_f77_v= +# Try some options frequently used verbose output +for ac_verb in -v -verbose --verbose -V -\#\#\#; do + cat > conftest.$ac_ext <<_ACEOF + program main + + end +_ACEOF + +# Compile and link our simple test program by passing a flag (argument +# 1 to this macro) to the Fortran compiler in order to get +# "verbose" output that we can then parse for the Fortran linker +# flags. +ac_save_FFLAGS=$FFLAGS +FFLAGS="$FFLAGS $ac_verb" +eval "set x $ac_link" +shift +$as_echo "$as_me:${as_lineno-$LINENO}: $*" >&5 +# gfortran 4.3 outputs lines setting COLLECT_GCC_OPTIONS, COMPILER_PATH, +# LIBRARY_PATH; skip all such settings. +ac_f77_v_output=`eval $ac_link 5>&1 2>&1 | + sed '/^Driving:/d; /^Configured with:/d; + '"/^[_$as_cr_Letters][_$as_cr_alnum]*=/d"` +$as_echo "$ac_f77_v_output" >&5 +FFLAGS=$ac_save_FFLAGS + +rm -rf conftest* + +# On HP/UX there is a line like: "LPATH is: /foo:/bar:/baz" where +# /foo, /bar, and /baz are search directories for the Fortran linker. +# Here, we change these into -L/foo -L/bar -L/baz (and put it first): +ac_f77_v_output="`echo $ac_f77_v_output | + grep 'LPATH is:' | + sed 's|.*LPATH is\(: *[^ ]*\).*|\1|;s|: */| -L/|g'` $ac_f77_v_output" + +# FIXME: we keep getting bitten by quoted arguments; a more general fix +# that detects unbalanced quotes in FLIBS should be implemented +# and (ugh) tested at some point. 
+case $ac_f77_v_output in + # With xlf replace commas with spaces, + # and remove "-link" and closing parenthesis. + *xlfentry*) + ac_f77_v_output=`echo $ac_f77_v_output | + sed ' + s/,/ /g + s/ -link / /g + s/) *$// + ' + ` ;; + + # With Intel ifc, ignore the quoted -mGLOB_options_string stuff (quoted + # $LIBS confuse us, and the libraries appear later in the output anyway). + *mGLOB_options_string*) + ac_f77_v_output=`echo $ac_f77_v_output | sed 's/"-mGLOB[^"]*"/ /g'` ;; + + # Portland Group compiler has singly- or doubly-quoted -cmdline argument + # Singly-quoted arguments were reported for versions 5.2-4 and 6.0-4. + # Doubly-quoted arguments were reported for "PGF90/x86 Linux/x86 5.0-2". + *-cmdline\ * | *-ignore\ * | *-def\ *) + ac_f77_v_output=`echo $ac_f77_v_output | sed "\ + s/-cmdline *'[^']*'/ /g; s/-cmdline *\"[^\"]*\"/ /g + s/-ignore *'[^']*'/ /g; s/-ignore *\"[^\"]*\"/ /g + s/-def *'[^']*'/ /g; s/-def *\"[^\"]*\"/ /g"` ;; + + # If we are using fort77 (the f2c wrapper) then filter output and delete quotes. + *fort77*f2c*gcc*) + ac_f77_v_output=`echo "$ac_f77_v_output" | sed -n ' + /:[ ]\+Running[ ]\{1,\}"gcc"/{ + /"-c"/d + /[.]c"*/d + s/^.*"gcc"/"gcc"/ + s/"//gp + }'` ;; + + # If we are using Cray Fortran then delete quotes. 
+ *cft90*) + ac_f77_v_output=`echo $ac_f77_v_output | sed 's/"//g'` ;; +esac + + + # look for -l* and *.a constructs in the output + for ac_arg in $ac_f77_v_output; do + case $ac_arg in + [\\/]*.a | ?:[\\/]*.a | -[lLRu]*) + ac_cv_prog_f77_v=$ac_verb + break 2 ;; + esac + done +done +if test -z "$ac_cv_prog_f77_v"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cannot determine how to obtain linking information from $F77" >&5 +$as_echo "$as_me: WARNING: cannot determine how to obtain linking information from $F77" >&2;} +fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: compilation failed" >&5 +$as_echo "$as_me: WARNING: compilation failed" >&2;} +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_f77_v" >&5 +$as_echo "$ac_cv_prog_f77_v" >&6; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran 77 libraries of $F77" >&5 +$as_echo_n "checking for Fortran 77 libraries of $F77... " >&6; } +if ${ac_cv_f77_libs+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test "x$FLIBS" != "x"; then + ac_cv_f77_libs="$FLIBS" # Let the user override the test. +else + +cat > conftest.$ac_ext <<_ACEOF + program main + + end +_ACEOF + +# Compile and link our simple test program by passing a flag (argument +# 1 to this macro) to the Fortran compiler in order to get +# "verbose" output that we can then parse for the Fortran linker +# flags. +ac_save_FFLAGS=$FFLAGS +FFLAGS="$FFLAGS $ac_cv_prog_f77_v" +eval "set x $ac_link" +shift +$as_echo "$as_me:${as_lineno-$LINENO}: $*" >&5 +# gfortran 4.3 outputs lines setting COLLECT_GCC_OPTIONS, COMPILER_PATH, +# LIBRARY_PATH; skip all such settings. 
+ac_f77_v_output=`eval $ac_link 5>&1 2>&1 | + sed '/^Driving:/d; /^Configured with:/d; + '"/^[_$as_cr_Letters][_$as_cr_alnum]*=/d"` +$as_echo "$ac_f77_v_output" >&5 +FFLAGS=$ac_save_FFLAGS + +rm -rf conftest* + +# On HP/UX there is a line like: "LPATH is: /foo:/bar:/baz" where +# /foo, /bar, and /baz are search directories for the Fortran linker. +# Here, we change these into -L/foo -L/bar -L/baz (and put it first): +ac_f77_v_output="`echo $ac_f77_v_output | + grep 'LPATH is:' | + sed 's|.*LPATH is\(: *[^ ]*\).*|\1|;s|: */| -L/|g'` $ac_f77_v_output" + +# FIXME: we keep getting bitten by quoted arguments; a more general fix +# that detects unbalanced quotes in FLIBS should be implemented +# and (ugh) tested at some point. +case $ac_f77_v_output in + # With xlf replace commas with spaces, + # and remove "-link" and closing parenthesis. + *xlfentry*) + ac_f77_v_output=`echo $ac_f77_v_output | + sed ' + s/,/ /g + s/ -link / /g + s/) *$// + ' + ` ;; + + # With Intel ifc, ignore the quoted -mGLOB_options_string stuff (quoted + # $LIBS confuse us, and the libraries appear later in the output anyway). + *mGLOB_options_string*) + ac_f77_v_output=`echo $ac_f77_v_output | sed 's/"-mGLOB[^"]*"/ /g'` ;; + + # Portland Group compiler has singly- or doubly-quoted -cmdline argument + # Singly-quoted arguments were reported for versions 5.2-4 and 6.0-4. + # Doubly-quoted arguments were reported for "PGF90/x86 Linux/x86 5.0-2". + *-cmdline\ * | *-ignore\ * | *-def\ *) + ac_f77_v_output=`echo $ac_f77_v_output | sed "\ + s/-cmdline *'[^']*'/ /g; s/-cmdline *\"[^\"]*\"/ /g + s/-ignore *'[^']*'/ /g; s/-ignore *\"[^\"]*\"/ /g + s/-def *'[^']*'/ /g; s/-def *\"[^\"]*\"/ /g"` ;; + + # If we are using fort77 (the f2c wrapper) then filter output and delete quotes. + *fort77*f2c*gcc*) + ac_f77_v_output=`echo "$ac_f77_v_output" | sed -n ' + /:[ ]\+Running[ ]\{1,\}"gcc"/{ + /"-c"/d + /[.]c"*/d + s/^.*"gcc"/"gcc"/ + s/"//gp + }'` ;; + + # If we are using Cray Fortran then delete quotes. 
+ *cft90*) + ac_f77_v_output=`echo $ac_f77_v_output | sed 's/"//g'` ;; +esac + + + +ac_cv_f77_libs= + +# Save positional arguments (if any) +ac_save_positional="$@" + +set X $ac_f77_v_output +while test $# != 1; do + shift + ac_arg=$1 + case $ac_arg in + [\\/]*.a | ?:[\\/]*.a) + ac_exists=false + for ac_i in $ac_cv_f77_libs; do + if test x"$ac_arg" = x"$ac_i"; then + ac_exists=true + break + fi + done + + if test x"$ac_exists" = xtrue; then : + +else + ac_cv_f77_libs="$ac_cv_f77_libs $ac_arg" +fi + ;; + -bI:*) + ac_exists=false + for ac_i in $ac_cv_f77_libs; do + if test x"$ac_arg" = x"$ac_i"; then + ac_exists=true + break + fi + done + + if test x"$ac_exists" = xtrue; then : + +else + if test "$ac_compiler_gnu" = yes; then + for ac_link_opt in $ac_arg; do + ac_cv_f77_libs="$ac_cv_f77_libs -Xlinker $ac_link_opt" + done +else + ac_cv_f77_libs="$ac_cv_f77_libs $ac_arg" +fi +fi + ;; + # Ignore these flags. + -lang* | -lcrt*.o | -lc | -lgcc* | -lSystem | -libmil | -little \ + |-LANG:=* | -LIST:* | -LNO:* | -link) + ;; + -lkernel32) + case $host_os in + *cygwin*) ;; + *) ac_cv_f77_libs="$ac_cv_f77_libs $ac_arg" + ;; + esac + ;; + -[LRuYz]) + # These flags, when seen by themselves, take an argument. 
+ # We remove the space between option and argument and re-iterate + # unless we find an empty arg or a new option (starting with -) + case $2 in + "" | -*);; + *) + ac_arg="$ac_arg$2" + shift; shift + set X $ac_arg "$@" + ;; + esac + ;; + -YP,*) + for ac_j in `$as_echo "$ac_arg" | sed -e 's/-YP,/-L/;s/:/ -L/g'`; do + ac_exists=false + for ac_i in $ac_cv_f77_libs; do + if test x"$ac_j" = x"$ac_i"; then + ac_exists=true + break + fi + done + + if test x"$ac_exists" = xtrue; then : + +else + ac_arg="$ac_arg $ac_j" + ac_cv_f77_libs="$ac_cv_f77_libs $ac_j" +fi + done + ;; + -[lLR]*) + ac_exists=false + for ac_i in $ac_cv_f77_libs; do + if test x"$ac_arg" = x"$ac_i"; then + ac_exists=true + break + fi + done + + if test x"$ac_exists" = xtrue; then : + +else + ac_cv_f77_libs="$ac_cv_f77_libs $ac_arg" +fi + ;; + -zallextract*| -zdefaultextract) + ac_cv_f77_libs="$ac_cv_f77_libs $ac_arg" + ;; + # Ignore everything else. + esac +done +# restore positional arguments +set X $ac_save_positional; shift + +# We only consider "LD_RUN_PATH" on Solaris systems. If this is seen, +# then we insist that the "run path" must be an absolute path (i.e. it +# must begin with a "/"). 
+case `(uname -sr) 2>/dev/null` in + "SunOS 5"*) + ac_ld_run_path=`$as_echo "$ac_f77_v_output" | + sed -n 's,^.*LD_RUN_PATH *= *\(/[^ ]*\).*$,-R\1,p'` + test "x$ac_ld_run_path" != x && + if test "$ac_compiler_gnu" = yes; then + for ac_link_opt in $ac_ld_run_path; do + ac_cv_f77_libs="$ac_cv_f77_libs -Xlinker $ac_link_opt" + done +else + ac_cv_f77_libs="$ac_cv_f77_libs $ac_ld_run_path" +fi + ;; +esac +fi # test "x$[]_AC_LANG_PREFIX[]LIBS" = "x" + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_f77_libs" >&5 +$as_echo "$ac_cv_f77_libs" >&6; } +FLIBS="$ac_cv_f77_libs" + + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}ar", so it can be a program name with args. +set dummy ${ac_tool_prefix}ar; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_AR+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$AR"; then + ac_cv_prog_AR="$AR" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_AR="${ac_tool_prefix}ar" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +AR=$ac_cv_prog_AR +if test -n "$AR"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $AR" >&5 +$as_echo "$AR" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_AR"; then + ac_ct_AR=$AR + # Extract the first word of "ar", so it can be a program name with args. +set dummy ar; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_AR+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_AR"; then + ac_cv_prog_ac_ct_AR="$ac_ct_AR" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_AR="ar" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_AR=$ac_cv_prog_ac_ct_AR +if test -n "$ac_ct_AR"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_AR" >&5 +$as_echo "$ac_ct_AR" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_ct_AR" = x; then + AR="ar" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + AR=$ac_ct_AR + fi +else + AR="$ac_cv_prog_AR" +fi + +LIBS="$LIBS $FLIBS -lm" + +for ac_prog in flex lex +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_LEX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$LEX"; then + ac_cv_prog_LEX="$LEX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_LEX="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +LEX=$ac_cv_prog_LEX +if test -n "$LEX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $LEX" >&5 +$as_echo "$LEX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$LEX" && break +done +test -n "$LEX" || LEX=":" + +if test "x$LEX" != "x:"; then + cat >conftest.l <<_ACEOF +%% +a { ECHO; } +b { REJECT; } +c { yymore (); } +d { yyless (1); } +e { /* IRIX 6.5 flex 2.5.4 underquotes its yyless argument. */ + yyless ((input () != 0)); } +f { unput (yytext[0]); } +. { BEGIN INITIAL; } +%% +#ifdef YYTEXT_POINTER +extern char *yytext; +#endif +int +main (void) +{ + return ! yylex () + ! yywrap (); +} +_ACEOF +{ { ac_try="$LEX conftest.l" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$LEX conftest.l") 2>&5 + ac_status=$? + $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 + test $ac_status = 0; } +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking lex output file root" >&5 +$as_echo_n "checking lex output file root... " >&6; } +if ${ac_cv_prog_lex_root+:} false; then : + $as_echo_n "(cached) " >&6 +else + +if test -f lex.yy.c; then + ac_cv_prog_lex_root=lex.yy +elif test -f lexyy.c; then + ac_cv_prog_lex_root=lexyy +else + as_fn_error $? 
"cannot find output from $LEX; giving up" "$LINENO" 5 +fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_lex_root" >&5 +$as_echo "$ac_cv_prog_lex_root" >&6; } +LEX_OUTPUT_ROOT=$ac_cv_prog_lex_root + +if test -z "${LEXLIB+set}"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking lex library" >&5 +$as_echo_n "checking lex library... " >&6; } +if ${ac_cv_lib_lex+:} false; then : + $as_echo_n "(cached) " >&6 +else + + ac_save_LIBS=$LIBS + ac_cv_lib_lex='none needed' + for ac_lib in '' -lfl -ll; do + LIBS="$ac_lib $ac_save_LIBS" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +`cat $LEX_OUTPUT_ROOT.c` +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_lex=$ac_lib +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + test "$ac_cv_lib_lex" != 'none needed' && break + done + LIBS=$ac_save_LIBS + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_lex" >&5 +$as_echo "$ac_cv_lib_lex" >&6; } + test "$ac_cv_lib_lex" != 'none needed' && LEXLIB=$ac_cv_lib_lex +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether yytext is a pointer" >&5 +$as_echo_n "checking whether yytext is a pointer... " >&6; } +if ${ac_cv_prog_lex_yytext_pointer+:} false; then : + $as_echo_n "(cached) " >&6 +else + # POSIX says lex can declare yytext either as a pointer or an array; the +# default is implementation-dependent. Figure out which it is, since +# not all implementations provide the %pointer and %array declarations. +ac_cv_prog_lex_yytext_pointer=no +ac_save_LIBS=$LIBS +LIBS="$LEXLIB $ac_save_LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + + #define YYTEXT_POINTER 1 +`cat $LEX_OUTPUT_ROOT.c` +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_prog_lex_yytext_pointer=yes +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_save_LIBS + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_lex_yytext_pointer" >&5 +$as_echo "$ac_cv_prog_lex_yytext_pointer" >&6; } +if test $ac_cv_prog_lex_yytext_pointer = yes; then + +$as_echo "#define YYTEXT_POINTER 1" >>confdefs.h + +fi +rm -f conftest.l $LEX_OUTPUT_ROOT.c + +fi +if test "$LEX" = ":"; then + as_fn_error $? "(F)LEX is required for building read_input.c. Please install it and run configure again." "$LINENO" 5 +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether ${MAKE-make} sets \$(MAKE)" >&5 +$as_echo_n "checking whether ${MAKE-make} sets \$(MAKE)... " >&6; } +set x ${MAKE-make} +ac_make=`$as_echo "$2" | sed 's/+/p/g; s/[^a-zA-Z0-9_]/_/g'` +if eval \${ac_cv_prog_make_${ac_make}_set+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat >conftest.make <<\_ACEOF +SHELL = /bin/sh +all: + @echo '@@@%%%=$(MAKE)=@@@%%%' +_ACEOF +# GNU make sometimes prints "make[1]: Entering ...", which would confuse us. +case `${MAKE-make} -f conftest.make 2>/dev/null` in + *@@@%%%=?*=@@@%%%*) + eval ac_cv_prog_make_${ac_make}_set=yes;; + *) + eval ac_cv_prog_make_${ac_make}_set=no;; +esac +rm -f conftest.make +fi +if eval test \$ac_cv_prog_make_${ac_make}_set = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + SET_MAKE= +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + SET_MAKE="MAKE=${MAKE-make}" +fi + +if test -n "$ac_tool_prefix"; then + # Extract the first word of "${ac_tool_prefix}ranlib", so it can be a program name with args. +set dummy ${ac_tool_prefix}ranlib; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... 
" >&6; } +if ${ac_cv_prog_RANLIB+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$RANLIB"; then + ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_RANLIB="${ac_tool_prefix}ranlib" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +RANLIB=$ac_cv_prog_RANLIB +if test -n "$RANLIB"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $RANLIB" >&5 +$as_echo "$RANLIB" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +fi +if test -z "$ac_cv_prog_RANLIB"; then + ac_ct_RANLIB=$RANLIB + # Extract the first word of "ranlib", so it can be a program name with args. +set dummy ranlib; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_ac_ct_RANLIB+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_RANLIB"; then + ac_cv_prog_ac_ct_RANLIB="$ac_ct_RANLIB" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_RANLIB="ranlib" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_RANLIB=$ac_cv_prog_ac_ct_RANLIB +if test -n "$ac_ct_RANLIB"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_RANLIB" >&5 +$as_echo "$ac_ct_RANLIB" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + if test "x$ac_ct_RANLIB" = x; then + RANLIB=":" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + RANLIB=$ac_ct_RANLIB + fi +else + RANLIB="$ac_cv_prog_RANLIB" +fi + +# Extract the first word of "gcc", so it can be a program name with args. +set dummy gcc; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CCDEP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CCDEP"; then + ac_cv_prog_CCDEP="$CCDEP" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CCDEP=""gcc"" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + + test -z "$ac_cv_prog_CCDEP" && ac_cv_prog_CCDEP=""$CC"" +fi +fi +CCDEP=$ac_cv_prog_CCDEP +if test -n "$CCDEP"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CCDEP" >&5 +$as_echo "$CCDEP" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + +#(endian="", AC_DEFINE(LITTLE_ENDIAN,1,The endian of the architechture)) + +# AC_PROG_FC([ifort gfortran]) +# AC_FC_FUNC(testfunc, ) + +LDFLAGS="$LDFLAGS -L\${HOME}/lib -L\${top_builddir}/lib" +CCLD=${CC} + +# compilation in operator is slowest so we do it first, saves time in parallel compiles +USESUBDIRS="operator linalg solver monomial buffers cu io meas xchange init rational wrapper" + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking how to run the C preprocessor" >&5 +$as_echo_n "checking how to run the C preprocessor... " >&6; } +# On Suns, sometimes $CPP names a directory. +if test -n "$CPP" && test -d "$CPP"; then + CPP= +fi +if test -z "$CPP"; then + if ${ac_cv_prog_CPP+:} false; then : + $as_echo_n "(cached) " >&6 +else + # Double quotes because CPP needs to be expanded + for CPP in "$CC -E" "$CC -E -traditional-cpp" "/lib/cpp" + do + ac_preproc_ok=false +for ac_c_preproc_warn_flag in '' yes +do + # Use a header file that comes with gcc, so configuring glibc + # with a fresh cross-compiler works. + # Prefer to if __STDC__ is defined, since + # exists even on freestanding compilers. 
+ # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. "Syntax error" is here to catch this case. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#ifdef __STDC__ +# include +#else +# include +#endif + Syntax error +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + +else + # Broken: fails on valid input. +continue +fi +rm -f conftest.err conftest.i conftest.$ac_ext + + # OK, works on sane cases. Now check whether nonexistent headers + # can be detected and how. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + # Broken: success on invalid input. +continue +else + # Passes both tests. +ac_preproc_ok=: +break +fi +rm -f conftest.err conftest.i conftest.$ac_ext + +done +# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. +rm -f conftest.i conftest.err conftest.$ac_ext +if $ac_preproc_ok; then : + break +fi + + done + ac_cv_prog_CPP=$CPP + +fi + CPP=$ac_cv_prog_CPP +else + ac_cv_prog_CPP=$CPP +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $CPP" >&5 +$as_echo "$CPP" >&6; } +ac_preproc_ok=false +for ac_c_preproc_warn_flag in '' yes +do + # Use a header file that comes with gcc, so configuring glibc + # with a fresh cross-compiler works. + # Prefer to if __STDC__ is defined, since + # exists even on freestanding compilers. + # On the NeXT, cc -E runs the code through the compiler's parser, + # not just through cpp. "Syntax error" is here to catch this case. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#ifdef __STDC__ +# include +#else +# include +#endif + Syntax error +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + +else + # Broken: fails on valid input. +continue +fi +rm -f conftest.err conftest.i conftest.$ac_ext + + # OK, works on sane cases. Now check whether nonexistent headers + # can be detected and how. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include +_ACEOF +if ac_fn_c_try_cpp "$LINENO"; then : + # Broken: success on invalid input. +continue +else + # Passes both tests. +ac_preproc_ok=: +break +fi +rm -f conftest.err conftest.i conftest.$ac_ext + +done +# Because of `break', _AC_PREPROC_IFELSE's cleaning code was skipped. +rm -f conftest.i conftest.err conftest.$ac_ext +if $ac_preproc_ok; then : + +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "C preprocessor \"$CPP\" fails sanity check +See \`config.log' for more details" "$LINENO" 5; } +fi + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for grep that handles long lines and -e" >&5 +$as_echo_n "checking for grep that handles long lines and -e... " >&6; } +if ${ac_cv_path_GREP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -z "$GREP"; then + ac_path_GREP_found=false + # Loop through the user's path and test for each of PROGNAME-LIST + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_prog in grep ggrep; do + for ac_exec_ext in '' $ac_executable_extensions; do + ac_path_GREP="$as_dir/$ac_prog$ac_exec_ext" + as_fn_executable_p "$ac_path_GREP" || continue +# Check for GNU ac_path_GREP and select it if it is found. 
+ # Check for GNU $ac_path_GREP +case `"$ac_path_GREP" --version 2>&1` in +*GNU*) + ac_cv_path_GREP="$ac_path_GREP" ac_path_GREP_found=:;; +*) + ac_count=0 + $as_echo_n 0123456789 >"conftest.in" + while : + do + cat "conftest.in" "conftest.in" >"conftest.tmp" + mv "conftest.tmp" "conftest.in" + cp "conftest.in" "conftest.nl" + $as_echo 'GREP' >> "conftest.nl" + "$ac_path_GREP" -e 'GREP$' -e '-(cannot match)-' < "conftest.nl" >"conftest.out" 2>/dev/null || break + diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break + as_fn_arith $ac_count + 1 && ac_count=$as_val + if test $ac_count -gt ${ac_path_GREP_max-0}; then + # Best one so far, save it but keep looking for a better one + ac_cv_path_GREP="$ac_path_GREP" + ac_path_GREP_max=$ac_count + fi + # 10*(2^10) chars as input seems more than enough + test $ac_count -gt 10 && break + done + rm -f conftest.in conftest.tmp conftest.nl conftest.out;; +esac + + $ac_path_GREP_found && break 3 + done + done + done +IFS=$as_save_IFS + if test -z "$ac_cv_path_GREP"; then + as_fn_error $? "no acceptable grep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 + fi +else + ac_cv_path_GREP=$GREP +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_GREP" >&5 +$as_echo "$ac_cv_path_GREP" >&6; } + GREP="$ac_cv_path_GREP" + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for egrep" >&5 +$as_echo_n "checking for egrep... " >&6; } +if ${ac_cv_path_EGREP+:} false; then : + $as_echo_n "(cached) " >&6 +else + if echo a | $GREP -E '(a|b)' >/dev/null 2>&1 + then ac_cv_path_EGREP="$GREP -E" + else + if test -z "$EGREP"; then + ac_path_EGREP_found=false + # Loop through the user's path and test for each of PROGNAME-LIST + as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH$PATH_SEPARATOR/usr/xpg4/bin +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ for ac_prog in egrep; do + for ac_exec_ext in '' $ac_executable_extensions; do + ac_path_EGREP="$as_dir/$ac_prog$ac_exec_ext" + as_fn_executable_p "$ac_path_EGREP" || continue +# Check for GNU ac_path_EGREP and select it if it is found. + # Check for GNU $ac_path_EGREP +case `"$ac_path_EGREP" --version 2>&1` in +*GNU*) + ac_cv_path_EGREP="$ac_path_EGREP" ac_path_EGREP_found=:;; +*) + ac_count=0 + $as_echo_n 0123456789 >"conftest.in" + while : + do + cat "conftest.in" "conftest.in" >"conftest.tmp" + mv "conftest.tmp" "conftest.in" + cp "conftest.in" "conftest.nl" + $as_echo 'EGREP' >> "conftest.nl" + "$ac_path_EGREP" 'EGREP$' < "conftest.nl" >"conftest.out" 2>/dev/null || break + diff "conftest.out" "conftest.nl" >/dev/null 2>&1 || break + as_fn_arith $ac_count + 1 && ac_count=$as_val + if test $ac_count -gt ${ac_path_EGREP_max-0}; then + # Best one so far, save it but keep looking for a better one + ac_cv_path_EGREP="$ac_path_EGREP" + ac_path_EGREP_max=$ac_count + fi + # 10*(2^10) chars as input seems more than enough + test $ac_count -gt 10 && break + done + rm -f conftest.in conftest.tmp conftest.nl conftest.out;; +esac + + $ac_path_EGREP_found && break 3 + done + done + done +IFS=$as_save_IFS + if test -z "$ac_cv_path_EGREP"; then + as_fn_error $? "no acceptable egrep could be found in $PATH$PATH_SEPARATOR/usr/xpg4/bin" "$LINENO" 5 + fi +else + ac_cv_path_EGREP=$EGREP +fi + + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_path_EGREP" >&5 +$as_echo "$ac_cv_path_EGREP" >&6; } + EGREP="$ac_cv_path_EGREP" + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5 +$as_echo_n "checking for ANSI C header files... " >&6; } +if ${ac_cv_header_stdc+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include +#include +#include +#include + +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_header_stdc=yes +else + ac_cv_header_stdc=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +if test $ac_cv_header_stdc = yes; then + # SunOS 4.x string.h does not declare mem*, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "memchr" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "free" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. + if test "$cross_compiling" = yes; then : + : +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#if ((' ' & 0x0FF) == 0x020) +# define ISLOWER(c) ('a' <= (c) && (c) <= 'z') +# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) +#else +# define ISLOWER(c) \ + (('a' <= (c) && (c) <= 'i') \ + || ('j' <= (c) && (c) <= 'r') \ + || ('s' <= (c) && (c) <= 'z')) +# define TOUPPER(c) (ISLOWER(c) ? 
((c) | 0x40) : (c)) +#endif + +#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) +int +main () +{ + int i; + for (i = 0; i < 256; i++) + if (XOR (islower (i), ISLOWER (i)) + || toupper (i) != TOUPPER (i)) + return 2; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + +else + ac_cv_header_stdc=no +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + +fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5 +$as_echo "$ac_cv_header_stdc" >&6; } +if test $ac_cv_header_stdc = yes; then + +$as_echo "#define STDC_HEADERS 1" >>confdefs.h + +fi + +# On IRIX 5.3, sys/types and inttypes.h are conflicting. +for ac_header in sys/types.h sys/stat.h stdlib.h string.h memory.h strings.h \ + inttypes.h stdint.h unistd.h +do : + as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` +ac_fn_c_check_header_compile "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default +" +if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : + cat >>confdefs.h <<_ACEOF +#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 +_ACEOF + +fi + +done + + +for ac_header in stdint.h +do : + ac_fn_c_check_header_mongrel "$LINENO" "stdint.h" "ac_cv_header_stdint_h" "$ac_includes_default" +if test "x$ac_cv_header_stdint_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_STDINT_H 1 +_ACEOF + ac_fn_c_check_type "$LINENO" "uint16_t" "ac_cv_type_uint16_t" "$ac_includes_default" +if test "x$ac_cv_type_uint16_t" = xyes; then : + +cat >>confdefs.h <<_ACEOF +#define HAVE_UINT16_T 1 +_ACEOF + + +else + as_fn_error $? "stdint.h found but either uint16_t, uint32_t or uint64_t not found" "$LINENO" 5 + +fi +ac_fn_c_check_type "$LINENO" "uint32_t" "ac_cv_type_uint32_t" "$ac_includes_default" +if test "x$ac_cv_type_uint32_t" = xyes; then : + +cat >>confdefs.h <<_ACEOF +#define HAVE_UINT32_T 1 +_ACEOF + + +else + as_fn_error $? 
"stdint.h found but either uint16_t, uint32_t or uint64_t not found" "$LINENO" 5 + +fi +ac_fn_c_check_type "$LINENO" "uint64_t" "ac_cv_type_uint64_t" "$ac_includes_default" +if test "x$ac_cv_type_uint64_t" = xyes; then : + +cat >>confdefs.h <<_ACEOF +#define HAVE_UINT64_T 1 +_ACEOF + + +else + as_fn_error $? "stdint.h found but either uint16_t, uint32_t or uint64_t not found" "$LINENO" 5 + +fi + + +else + + # The cast to long int works around a bug in the HP C Compiler +# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects +# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'. +# This bug is HP SR number 8606223364. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of unsigned char" >&5 +$as_echo_n "checking size of unsigned char... " >&6; } +if ${ac_cv_sizeof_unsigned_char+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (unsigned char))" "ac_cv_sizeof_unsigned_char" "$ac_includes_default"; then : + +else + if test "$ac_cv_type_unsigned_char" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute sizeof (unsigned char) +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_sizeof_unsigned_char=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_unsigned_char" >&5 +$as_echo "$ac_cv_sizeof_unsigned_char" >&6; } + + + +cat >>confdefs.h <<_ACEOF +#define SIZEOF_UNSIGNED_CHAR $ac_cv_sizeof_unsigned_char +_ACEOF + + + # The cast to long int works around a bug in the HP C Compiler +# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects +# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'. +# This bug is HP SR number 8606223364. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of unsigned short" >&5 +$as_echo_n "checking size of unsigned short... 
" >&6; } +if ${ac_cv_sizeof_unsigned_short+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (unsigned short))" "ac_cv_sizeof_unsigned_short" "$ac_includes_default"; then : + +else + if test "$ac_cv_type_unsigned_short" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute sizeof (unsigned short) +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_sizeof_unsigned_short=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_unsigned_short" >&5 +$as_echo "$ac_cv_sizeof_unsigned_short" >&6; } + + + +cat >>confdefs.h <<_ACEOF +#define SIZEOF_UNSIGNED_SHORT $ac_cv_sizeof_unsigned_short +_ACEOF + + + # The cast to long int works around a bug in the HP C Compiler +# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects +# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'. +# This bug is HP SR number 8606223364. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of unsigned int" >&5 +$as_echo_n "checking size of unsigned int... 
" >&6; } +if ${ac_cv_sizeof_unsigned_int+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (unsigned int))" "ac_cv_sizeof_unsigned_int" "$ac_includes_default"; then : + +else + if test "$ac_cv_type_unsigned_int" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute sizeof (unsigned int) +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_sizeof_unsigned_int=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_unsigned_int" >&5 +$as_echo "$ac_cv_sizeof_unsigned_int" >&6; } + + + +cat >>confdefs.h <<_ACEOF +#define SIZEOF_UNSIGNED_INT $ac_cv_sizeof_unsigned_int +_ACEOF + + + # The cast to long int works around a bug in the HP C Compiler +# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects +# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'. +# This bug is HP SR number 8606223364. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of unsigned long" >&5 +$as_echo_n "checking size of unsigned long... 
" >&6; } +if ${ac_cv_sizeof_unsigned_long+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (unsigned long))" "ac_cv_sizeof_unsigned_long" "$ac_includes_default"; then : + +else + if test "$ac_cv_type_unsigned_long" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute sizeof (unsigned long) +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_sizeof_unsigned_long=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_unsigned_long" >&5 +$as_echo "$ac_cv_sizeof_unsigned_long" >&6; } + + + +cat >>confdefs.h <<_ACEOF +#define SIZEOF_UNSIGNED_LONG $ac_cv_sizeof_unsigned_long +_ACEOF + + + # The cast to long int works around a bug in the HP C Compiler +# version HP92453-01 B.11.11.23709.GP, which incorrectly rejects +# declarations like `int a3[[(sizeof (unsigned char)) >= 0]];'. +# This bug is HP SR number 8606223364. +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking size of unsigned long long" >&5 +$as_echo_n "checking size of unsigned long long... 
" >&6; } +if ${ac_cv_sizeof_unsigned_long_long+:} false; then : + $as_echo_n "(cached) " >&6 +else + if ac_fn_c_compute_int "$LINENO" "(long int) (sizeof (unsigned long long))" "ac_cv_sizeof_unsigned_long_long" "$ac_includes_default"; then : + +else + if test "$ac_cv_type_unsigned_long_long" = yes; then + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error 77 "cannot compute sizeof (unsigned long long) +See \`config.log' for more details" "$LINENO" 5; } + else + ac_cv_sizeof_unsigned_long_long=0 + fi +fi + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sizeof_unsigned_long_long" >&5 +$as_echo "$ac_cv_sizeof_unsigned_long_long" >&6; } + + + +cat >>confdefs.h <<_ACEOF +#define SIZEOF_UNSIGNED_LONG_LONG $ac_cv_sizeof_unsigned_long_long +_ACEOF + + + + +fi + +done + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use only Benchmark" >&5 +$as_echo_n "checking whether we want to use only Benchmark... " >&6; } +# Check whether --enable-benchmark was given. +if test "${enable_benchmark+set}" = set; then : + enableval=$enable_benchmark; enable_benchmark=$enableval +else + enable_benchmark=yes +fi + +if test $enable_benchmark = no; then + +# Check whether --with-limedir was given. +if test "${with_limedir+set}" = set; then : + withval=$with_limedir; lime_dir=$withval +else + lime_dir="./c-lime" +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $lime_dir" >&5 +$as_echo "$lime_dir" >&6; } + LDFLAGS="$LDFLAGS -L${lime_dir}/lib/" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for limeReaderNextRecord in -llime" >&5 +$as_echo_n "checking for limeReaderNextRecord in -llime... " >&6; } +if ${ac_cv_lib_lime_limeReaderNextRecord+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-llime $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. 
+ Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char limeReaderNextRecord (); +int +main () +{ +return limeReaderNextRecord (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_lime_limeReaderNextRecord=yes +else + ac_cv_lib_lime_limeReaderNextRecord=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_lime_limeReaderNextRecord" >&5 +$as_echo "$ac_cv_lib_lime_limeReaderNextRecord" >&6; } +if test "x$ac_cv_lib_lime_limeReaderNextRecord" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBLIME 1 +_ACEOF + + LIBS="-llime $LIBS" + +else + as_fn_error $? "library liblime is missing or needed function is not available" "$LINENO" 5 +fi + +else + +$as_echo "#define BENCHMARK 1" >>confdefs.h + +fi + + + +#LIBS="$LIBS $FLIBS -lm" + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use lemon" >&5 +$as_echo_n "checking whether we want to use lemon... " >&6; } + +# Check whether --with-lemondir was given. +if test "${with_lemondir+set}" = set; then : + withval=$with_lemondir; echo yes + LEMON_AVAILABLE=1 + lemon_dir=$withval + LDFLAGS="$LDFLAGS -L${lemon_dir}/lib" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for lemonReaderNextRecord in -llemon" >&5 +$as_echo_n "checking for lemonReaderNextRecord in -llemon... " >&6; } +if ${ac_cv_lib_lemon_lemonReaderNextRecord+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-llemon $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. 
*/ +#ifdef __cplusplus +extern "C" +#endif +char lemonReaderNextRecord (); +int +main () +{ +return lemonReaderNextRecord (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_lemon_lemonReaderNextRecord=yes +else + ac_cv_lib_lemon_lemonReaderNextRecord=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_lemon_lemonReaderNextRecord" >&5 +$as_echo "$ac_cv_lib_lemon_lemonReaderNextRecord" >&6; } +if test "x$ac_cv_lib_lemon_lemonReaderNextRecord" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBLEMON 1 +_ACEOF + + LIBS="-llemon $LIBS" + +else + as_fn_error $? "library liblemon was not found" "$LINENO" 5 +fi + +else + echo no + LEMON_AVAILABLE=0 +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we use the general geometry" >&5 +$as_echo_n "checking whether we use the general geometry... " >&6; } +# Check whether --enable-indexindepgeom was given. +if test "${enable_indexindepgeom+set}" = set; then : + enableval=$enable_indexindepgeom; enable_iig=$enableval +else + enable_iig=no +fi + +if test $enable_iig = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "#define _INDEX_INDEP_GEOM 1" >>confdefs.h + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use MPI" >&5 +$as_echo_n "checking whether we want to use MPI... " >&6; } +# Check whether --enable-mpi was given. 
+if test "${enable_mpi+set}" = set; then : + enableval=$enable_mpi; enable_mpi=$enableval +else + enable_mpi=yes +fi + +if test $enable_mpi = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "#define MPI 1" >>confdefs.h + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to use QPX intrinsics" >&5 +$as_echo_n "checking whether to use QPX intrinsics... " >&6; } +# Check whether --enable-qpx was given. +if test "${enable_qpx+set}" = set; then : + enableval=$enable_qpx; enable_qpx=$enableval +else + enable_qpx=no +fi + +if test $enable_qpx = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "#define BGQ 1" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: Compiling with QPX intrinsics on BGQ, enabling compiler optimizations for XLC." >&5 +$as_echo "$as_me: Compiling with QPX intrinsics on BGQ, enabling compiler optimizations for XLC." >&6;} + OPTARGS="-O2 -qstrict=all -qtune=qp -qarch=qp -qmaxmem=-1" + SOPTARGS="$OPTARGS" +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to use IBM BG/Q SPI for communications" >&5 +$as_echo_n "checking whether to use IBM BG/Q SPI for communications... " >&6; } +# Check whether --enable-spi was given. 
+if test "${enable_spi+set}" = set; then : + enableval=$enable_spi; enable_spi=$enableval +else + enable_spi=no +fi + +if test $enable_spi = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "#define SPI 1" >>confdefs.h + + SPI_FILES="DirectPut" +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + SPI_FILES="" +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use OpenMP" >&5 +$as_echo_n "checking whether we want to use OpenMP... " >&6; } +# Check whether --enable-omp was given. +if test "${enable_omp+set}" = set; then : + enableval=$enable_omp; enable_omp=$enableval +else + enable_omp=yes +fi + +if test $enable_omp = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "#define OMP 1" >>confdefs.h + + for ac_header in omp.h +do : + ac_fn_c_check_header_mongrel "$LINENO" "omp.h" "ac_cv_header_omp_h" "$ac_includes_default" +if test "x$ac_cv_header_omp_h" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_OMP_H 1 +_ACEOF + +else + as_fn_error $? "Cannot find OpenMP headers!" "$LINENO" 5 +fi + +done + + + OPENMP_CFLAGS= + # Check whether --enable-openmp was given. +if test "${enable_openmp+set}" = set; then : + enableval=$enable_openmp; +fi + + if test "$enable_openmp" != no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $CC option to support OpenMP" >&5 +$as_echo_n "checking for $CC option to support OpenMP... " >&6; } +if ${ac_cv_prog_c_openmp+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +#ifndef _OPENMP + choke me +#endif +#include +int main () { return omp_get_num_threads (); } + +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_prog_c_openmp='none needed' +else + ac_cv_prog_c_openmp='unsupported' + for ac_option in -fopenmp -xopenmp -openmp -mp -omp -qsmp=omp -homp \ + -Popenmp --openmp; do + ac_save_CFLAGS=$CFLAGS + CFLAGS="$CFLAGS $ac_option" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifndef _OPENMP + choke me +#endif +#include +int main () { return omp_get_num_threads (); } + +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_prog_c_openmp=$ac_option +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + CFLAGS=$ac_save_CFLAGS + if test "$ac_cv_prog_c_openmp" != unsupported; then + break + fi + done +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_c_openmp" >&5 +$as_echo "$ac_cv_prog_c_openmp" >&6; } + case $ac_cv_prog_c_openmp in #( + "none needed" | unsupported) + ;; #( + *) + OPENMP_CFLAGS=$ac_cv_prog_c_openmp ;; + esac + fi + + +# -- AC_OPENMP provides a compiler-dependent OPENMP_CFLAGS so we can set it here +# on the BG/Q with XLC we force a special set of options for OpenMP support + if test $enable_qpx = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: Using OpenMP with XLC on BG/Q. Compiling with \"-qsmp=omp:noauto:schedule=static -qthreaded\"." >&5 +$as_echo "$as_me: Using OpenMP with XLC on BG/Q. Compiling with \"-qsmp=omp:noauto:schedule=static -qthreaded\"." 
>&6;} + CFLAGS="$CFLAGS -qsmp=omp:noauto:schedule=static -qthreaded" + CPPFLAGS="$CPPFLAGS -qsmp=omp:noauto:schedule=static -qthreaded" + LDFLAGS="$LDFLAGS -qsmp=omp:noauto:schedule=static -qthreaded" + else + CFLAGS="$CFLAGS $OPENMP_CFLAGS" + CPPFLAGS="$CPPFLAGS $OPENMP_CFLAGS" + LDFLAGS="$LDFLAGS $OPENMP_CFLAGS" + fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +fftw_lib=/usr +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use FFTW" >&5 +$as_echo_n "checking whether we want to use FFTW... " >&6; } +# Check whether --enable-fftw was given. +if test "${enable_fftw+set}" = set; then : + enableval=$enable_fftw; enable_fftw=$enableval +else + enable_fftw=no +fi + +if test $enable_fftw = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "#define HAVE_FFTW 1" >>confdefs.h + + LIBS="-lfftw3 ${LIBS}" +elif test $enable_fftw = no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "#define HAVE_FFTW 1" >>confdefs.h + + fftw_lib=${enable_fftw} + LDFLAGS="$LDFLAGS -L${fftw_lib}/lib64" + LIBS="-lfftw3 ${LIBS}" + INCLUDES="-I${fftw_lib}/include ${INCLUDES}" +fi + +if test $enable_mpi = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking which parallelisation to use for MPI" >&5 +$as_echo_n "checking which parallelisation to use for MPI... " >&6; } + +# Check whether --with-mpidimension was given. 
+if test "${with_mpidimension+set}" = set; then : + withval=$with_mpidimension; withmpidimension=$withval +else + withmpidimension=1 +fi + + if test $withmpidimension = 1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=1 t" >&5 +$as_echo "n=1 t" >&6; } + +$as_echo "#define PARALLELT 1" >>confdefs.h + + elif test $withmpidimension = 2; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=2 xt" >&5 +$as_echo "n=2 xt" >&6; } + +$as_echo "#define PARALLELXT 1" >>confdefs.h + + elif test $withmpidimension = 3; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=3 xyt" >&5 +$as_echo "n=3 xyt" >&6; } + +$as_echo "#define PARALLELXYT 1" >>confdefs.h + + elif test $withmpidimension = 4; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=4 xyzt" >&5 +$as_echo "n=4 xyzt" >&6; } + +$as_echo "#define PARALLELXYZT 1" >>confdefs.h + + elif test $withmpidimension = X; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=1 x" >&5 +$as_echo "n=1 x" >&6; } + +$as_echo "#define PARALLELX 1" >>confdefs.h + + elif test $withmpidimension = XY; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=2 xy" >&5 +$as_echo "n=2 xy" >&6; } + +$as_echo "#define PARALLELXY 1" >>confdefs.h + + elif test $withmpidimension = XYZ; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=3 xyz" >&5 +$as_echo "n=3 xyz" >&6; } + +$as_echo "#define PARALLELXYZ 1" >>confdefs.h + + elif test $withmpidimension = T; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=1 t" >&5 +$as_echo "n=1 t" >&6; } + +$as_echo "#define PARALLELT 1" >>confdefs.h + + elif test $withmpidimension = XT; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=2 xt" >&5 +$as_echo "n=2 xt" >&6; } + +$as_echo "#define PARALLELXT 1" >>confdefs.h + + elif test $withmpidimension = XYT; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: n=3 xyt" >&5 +$as_echo "n=3 xyt" >&6; } + +$as_echo "#define PARALLELXYT 1" >>confdefs.h + + elif test $withmpidimension = XYZT; then + { $as_echo 
"$as_me:${as_lineno-$LINENO}: result: n=4 xyzt" >&5 +$as_echo "n=4 xyzt" >&6; } + +$as_echo "#define PARALLELXYZT 1" >>confdefs.h + + else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: unknown" >&5 +$as_echo "unknown" >&6; } + as_fn_error $? "Only t, xt, xyt, xyzt, x, xy, xyz parallelisation available" "$LINENO" 5 + fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we shall use persistent MPI calls for halfspinor" >&5 +$as_echo_n "checking whether we shall use persistent MPI calls for halfspinor... " >&6; } + +# Check whether --with-persistentmpi was given. +if test "${with_persistentmpi+set}" = set; then : + withval=$with_persistentmpi; withpersistent=$withval +else + withpersistent=no +fi + + if test $withpersistent = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "#define _PERSISTENT 1" >>confdefs.h + + else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we shall use non-blocking MPI calls" >&5 +$as_echo_n "checking whether we shall use non-blocking MPI calls... " >&6; } + +# Check whether --with-nonblockingmpi was given. +if test "${with_nonblockingmpi+set}" = set; then : + withval=$with_nonblockingmpi; withnonblock=$withval +else + withnonblock=yes +fi + + if test $withnonblock = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "#define _NON_BLOCKING 1" >>confdefs.h + + else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + fi +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to fix volume at compiletime" >&5 +$as_echo_n "checking whether we want to fix volume at compiletime... " >&6; } + +# Check whether --with-fixedvolume was given. 
+if test "${with_fixedvolume+set}" = set; then : + withval=$with_fixedvolume; with_fixvol=$withval +else + with_fixvol=no +fi + +if test $with_fixvol = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "#define FIXEDVOLUME 1" >>confdefs.h + + ac_config_files="$ac_config_files fixed_volume.h" + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use KOJAK instrumentalisation" >&5 +$as_echo_n "checking whether we want to use KOJAK instrumentalisation... " >&6; } + +# Check whether --with-kojakinst was given. +if test "${with_kojakinst+set}" = set; then : + withval=$with_kojakinst; with_kojakinst=$withval +else + with_kojakinst=no +fi + +if test $with_kojakinst = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + CC="kinst-pomp ${CC}" +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use lapack and blas" >&5 +$as_echo_n "checking whether we want to use lapack and blas... " >&6; } + +# Check whether --with-lapack was given. +if test "${with_lapack+set}" = set; then : + withval=$with_lapack; with_lapack=$withval +else + with_lapack=yes +fi + +if test "$with_lapack" = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + LAPACKLIB= + +$as_echo "#define HAVE_LAPACK 1" >>confdefs.h + +elif test "$with_lapack" != no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + LIBS="$withval $LIBS" + with_lapack=yes + +$as_echo "#define HAVE_LAPACK 1" >>confdefs.h + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + as_fn_error $? "lapack is needed! Will stop here." 
"$LINENO" 5 +fi + +if test $enable_mpi = yes; then + if test "$host_vendor" != "cray"; then + cross_compiling=yes + fi +fi + + +for ac_func in clock_gettime +do : + ac_fn_c_check_func "$LINENO" "clock_gettime" "ac_cv_func_clock_gettime" +if test "x$ac_cv_func_clock_gettime" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_CLOCK_GETTIME 1 +_ACEOF + +else + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for clock_gettime in -lrt" >&5 +$as_echo_n "checking for clock_gettime in -lrt... " >&6; } +if ${ac_cv_lib_rt_clock_gettime+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lrt $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char clock_gettime (); +int +main () +{ +return clock_gettime (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_rt_clock_gettime=yes +else + ac_cv_lib_rt_clock_gettime=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_rt_clock_gettime" >&5 +$as_echo "$ac_cv_lib_rt_clock_gettime" >&6; } +if test "x$ac_cv_lib_rt_clock_gettime" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBRT 1 +_ACEOF + + LIBS="-lrt $LIBS" + +fi + +fi +done + + +if ( test "$ac_cv_lib_rt_clock_gettime" = "yes" || test "$ac_cv_func_clock_gettime" = "yes" ); then + $as_echo "#define HAVE_CLOCK_GETTIME 1" >>confdefs.h + + { $as_echo "$as_me:${as_lineno-$LINENO}: Instructing the compiler to use POSIX 199309L" >&5 +$as_echo "$as_me: Instructing the compiler to use POSIX 199309L" >&6;} +fi + +ac_ext=f +ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS 
conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for dummy main to link with Fortran 77 libraries" >&5 +$as_echo_n "checking for dummy main to link with Fortran 77 libraries... " >&6; } +if ${ac_cv_f77_dummy_main+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_f77_dm_save_LIBS=$LIBS + LIBS="$LIBS $FLIBS" + ac_fortran_dm_var=F77_DUMMY_MAIN + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + + # First, try linking without a dummy main: + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_fortran_dummy_main=none +else + ac_cv_fortran_dummy_main=unknown +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + + if test $ac_cv_fortran_dummy_main = unknown; then + for ac_func in MAIN__ MAIN_ __main MAIN _MAIN __MAIN main_ main__ _main; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#define $ac_fortran_dm_var $ac_func +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_fortran_dummy_main=$ac_func; break +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + done + fi + ac_ext=f +ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu + ac_cv_f77_dummy_main=$ac_cv_fortran_dummy_main + rm -rf conftest* + LIBS=$ac_f77_dm_save_LIBS + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_f77_dummy_main" >&5 +$as_echo "$ac_cv_f77_dummy_main" >&6; } +F77_DUMMY_MAIN=$ac_cv_f77_dummy_main +if test "$F77_DUMMY_MAIN" != unknown; then : + if test $F77_DUMMY_MAIN != none; then + +cat >>confdefs.h <<_ACEOF +#define F77_DUMMY_MAIN $F77_DUMMY_MAIN +_ACEOF + + if test "x$ac_cv_fc_dummy_main" = "x$ac_cv_f77_dummy_main"; then + +$as_echo "#define FC_DUMMY_MAIN_EQ_F77 1" >>confdefs.h + + fi +fi +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? "linking to Fortran libraries from C fails +See \`config.log' for more details" "$LINENO" 5; } +fi + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +ac_ext=f +ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for Fortran 77 name-mangling scheme" >&5 +$as_echo_n "checking for Fortran 77 name-mangling scheme... 
" >&6; } +if ${ac_cv_f77_mangling+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat > conftest.$ac_ext <<_ACEOF + subroutine foobar() + return + end + subroutine foo_bar() + return + end +_ACEOF +if ac_fn_f77_try_compile "$LINENO"; then : + mv conftest.$ac_objext cfortran_test.$ac_objext + + ac_save_LIBS=$LIBS + LIBS="cfortran_test.$ac_objext $LIBS $FLIBS" + + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + ac_success=no + for ac_foobar in foobar FOOBAR; do + for ac_underscore in "" "_"; do + ac_func="$ac_foobar$ac_underscore" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $ac_func (); +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +return $ac_func (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_success=yes; break 2 +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + done + done + ac_ext=f +ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu + + if test "$ac_success" = "yes"; then + case $ac_foobar in + foobar) + ac_case=lower + ac_foo_bar=foo_bar + ;; + FOOBAR) + ac_case=upper + ac_foo_bar=FOO_BAR + ;; + esac + + ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + ac_success_extra=no + for 
ac_extra in "" "_"; do + ac_func="$ac_foo_bar$ac_underscore$ac_extra" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char $ac_func (); +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +return $ac_func (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_success_extra=yes; break +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + done + ac_ext=f +ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu + + if test "$ac_success_extra" = "yes"; then + ac_cv_f77_mangling="$ac_case case" + if test -z "$ac_underscore"; then + ac_cv_f77_mangling="$ac_cv_f77_mangling, no underscore" + else + ac_cv_f77_mangling="$ac_cv_f77_mangling, underscore" + fi + if test -z "$ac_extra"; then + ac_cv_f77_mangling="$ac_cv_f77_mangling, no extra underscore" + else + ac_cv_f77_mangling="$ac_cv_f77_mangling, extra underscore" + fi + else + ac_cv_f77_mangling="unknown" + fi + else + ac_cv_f77_mangling="unknown" + fi + + LIBS=$ac_save_LIBS + rm -rf conftest* + rm -f cfortran_test* +else + { { $as_echo "$as_me:${as_lineno-$LINENO}: error: in \`$ac_pwd':" >&5 +$as_echo "$as_me: error: in \`$ac_pwd':" >&2;} +as_fn_error $? 
"cannot compile a simple Fortran program +See \`config.log' for more details" "$LINENO" 5; } +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_f77_mangling" >&5 +$as_echo "$ac_cv_f77_mangling" >&6; } + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +ac_ext=f +ac_compile='$F77 -c $FFLAGS conftest.$ac_ext >&5' +ac_link='$F77 -o conftest$ac_exeext $FFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_f77_compiler_gnu +case $ac_cv_f77_mangling in + upper*) ac_val="ZHEEV" ;; + lower*) ac_val="zheev" ;; + *) ac_val="unknown" ;; +esac +case $ac_cv_f77_mangling in *," underscore"*) ac_val="$ac_val"_ ;; esac + +zheev="$ac_val" + +ac_ext=c +ac_cpp='$CPP $CPPFLAGS' +ac_compile='$CC -c $CFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CC -o conftest$ac_exeext $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_c_compiler_gnu + +if test "$zheev" = "zheev"; then + +$as_echo "#define NOF77_ 1" >>confdefs.h + +fi +as_ac_Search=`$as_echo "ac_cv_search_$zheev" | $as_tr_sh` +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing $zheev" >&5 +$as_echo_n "checking for library containing $zheev... " >&6; } +if eval \${$as_ac_Search+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_func_search_save_LIBS=$LIBS +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. 
*/ +#ifdef __cplusplus +extern "C" +#endif +char $zheev (); +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +return $zheev (); + ; + return 0; +} +_ACEOF +for ac_lib in '' lapack; do + if test -z "$ac_lib"; then + ac_res="none required" + else + ac_res=-l$ac_lib + LIBS="-l$ac_lib $ac_func_search_save_LIBS" + fi + if ac_fn_c_try_link "$LINENO"; then : + eval "$as_ac_Search=\$ac_res" +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext + if eval \${$as_ac_Search+:} false; then : + break +fi +done +if eval \${$as_ac_Search+:} false; then : + +else + eval "$as_ac_Search=no" +fi +rm conftest.$ac_ext +LIBS=$ac_func_search_save_LIBS +fi +eval ac_res=\$$as_ac_Search + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 +$as_echo "$ac_res" >&6; } +eval ac_res=\$$as_ac_Search +if test "$ac_res" != no; then : + test "$ac_res" = "none required" || LIBS="$ac_res $LIBS" + +else + as_fn_error $? "Cannot find lapack" "$LINENO" 5 +fi + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for ANSI C header files" >&5 +$as_echo_n "checking for ANSI C header files... " >&6; } +if ${ac_cv_header_stdc+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#include +#include + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_header_stdc=yes +else + ac_cv_header_stdc=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + +if test $ac_cv_header_stdc = yes; then + # SunOS 4.x string.h does not declare mem*, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "memchr" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + +_ACEOF +if (eval "$ac_cpp conftest.$ac_ext") 2>&5 | + $EGREP "free" >/dev/null 2>&1; then : + +else + ac_cv_header_stdc=no +fi +rm -f conftest* + +fi + +if test $ac_cv_header_stdc = yes; then + # /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi. + if test "$cross_compiling" = yes; then : + : +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#if ((' ' & 0x0FF) == 0x020) +# define ISLOWER(c) ('a' <= (c) && (c) <= 'z') +# define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c)) +#else +# define ISLOWER(c) \ + (('a' <= (c) && (c) <= 'i') \ + || ('j' <= (c) && (c) <= 'r') \ + || ('s' <= (c) && (c) <= 'z')) +# define TOUPPER(c) (ISLOWER(c) ? 
((c) | 0x40) : (c)) +#endif + +#define XOR(e, f) (((e) && !(f)) || (!(e) && (f))) +int +main () +{ + int i; + for (i = 0; i < 256; i++) + if (XOR (islower (i), ISLOWER (i)) + || toupper (i) != TOUPPER (i)) + return 2; + return 0; +} +_ACEOF +if ac_fn_c_try_run "$LINENO"; then : + +else + ac_cv_header_stdc=no +fi +rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \ + conftest.$ac_objext conftest.beam conftest.$ac_ext +fi + +fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_stdc" >&5 +$as_echo "$ac_cv_header_stdc" >&6; } +if test $ac_cv_header_stdc = yes; then + +$as_echo "#define STDC_HEADERS 1" >>confdefs.h + +fi + +for ac_header in float.h libintl.h limits.h stdint.h stdlib.h string.h strings.h sys/time.h unistd.h endian.h +do : + as_ac_Header=`$as_echo "ac_cv_header_$ac_header" | $as_tr_sh` +ac_fn_c_check_header_mongrel "$LINENO" "$ac_header" "$as_ac_Header" "$ac_includes_default" +if eval test \"x\$"$as_ac_Header"\" = x"yes"; then : + cat >>confdefs.h <<_ACEOF +#define `$as_echo "HAVE_$ac_header" | $as_tr_cpp` 1 +_ACEOF + +fi + +done + +ac_fn_c_check_header_mongrel "$LINENO" "getopt.h" "ac_cv_header_getopt_h" "$ac_includes_default" +if test "x$ac_cv_header_getopt_h" = xyes; then : + +fi + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for an ANSI C-conforming const" >&5 +$as_echo_n "checking for an ANSI C-conforming const... " >&6; } +if ${ac_cv_c_const+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + +#ifndef __cplusplus + /* Ultrix mips cc rejects this sort of thing. */ + typedef int charset[2]; + const charset cs = { 0, 0 }; + /* SunOS 4.1.1 cc rejects this. */ + char const *const *pcpcc; + char **ppc; + /* NEC SVR4.0.2 mips cc rejects this. 
*/ + struct point {int x, y;}; + static struct point const zero = {0,0}; + /* AIX XL C 1.02.0.0 rejects this. + It does not let you subtract one const X* pointer from another in + an arm of an if-expression whose if-part is not a constant + expression */ + const char *g = "string"; + pcpcc = &g + (g ? g-g : 0); + /* HPUX 7.0 cc rejects these. */ + ++pcpcc; + ppc = (char**) pcpcc; + pcpcc = (char const *const *) ppc; + { /* SCO 3.2v4 cc rejects this sort of thing. */ + char tx; + char *t = &tx; + char const *s = 0 ? (char *) 0 : (char const *) 0; + + *t++ = 0; + if (s) return 0; + } + { /* Someone thinks the Sun supposedly-ANSI compiler will reject this. */ + int x[] = {25, 17}; + const int *foo = &x[0]; + ++foo; + } + { /* Sun SC1.0 ANSI compiler rejects this -- but not the above. */ + typedef const int *iptr; + iptr p = 0; + ++p; + } + { /* AIX XL C 1.02.0.0 rejects this sort of thing, saying + "k.c", line 2.27: 1506-025 (S) Operand must be a modifiable lvalue. */ + struct s { int j; const int *ap[3]; } bx; + struct s *b = &bx; b->j = 5; + } + { /* ULTRIX-32 V3.1 (Rev 9) vcc rejects this */ + const int foo = 10; + if (!foo) return 0; + } + return !cs[0] && !zero.x; +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_c_const=yes +else + ac_cv_c_const=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_c_const" >&5 +$as_echo "$ac_cv_c_const" >&6; } +if test $ac_cv_c_const = no; then + +$as_echo "#define const /**/" >>confdefs.h + +fi + +ac_fn_c_check_type "$LINENO" "off_t" "ac_cv_type_off_t" "$ac_includes_default" +if test "x$ac_cv_type_off_t" = xyes; then : + +else + +cat >>confdefs.h <<_ACEOF +#define off_t long int +_ACEOF + +fi + +ac_fn_c_check_type "$LINENO" "size_t" "ac_cv_type_size_t" "$ac_includes_default" +if test "x$ac_cv_type_size_t" = xyes; then : + +else + +cat >>confdefs.h <<_ACEOF +#define size_t unsigned int +_ACEOF + +fi + +{ $as_echo 
"$as_me:${as_lineno-$LINENO}: checking whether time.h and sys/time.h may both be included" >&5 +$as_echo_n "checking whether time.h and sys/time.h may both be included... " >&6; } +if ${ac_cv_header_time+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include +#include +#include + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +if ((struct tm *) 0) +return 0; + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_header_time=yes +else + ac_cv_header_time=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_header_time" >&5 +$as_echo "$ac_cv_header_time" >&6; } +if test $ac_cv_header_time = yes; then + +$as_echo "#define TIME_WITH_SYS_TIME 1" >>confdefs.h + +fi + + +# Check whether --enable-largefile was given. +if test "${enable_largefile+set}" = set; then : + enableval=$enable_largefile; +fi + +if test "$enable_largefile" != no; then + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for special C compiler options needed for large files" >&5 +$as_echo_n "checking for special C compiler options needed for large files... " >&6; } +if ${ac_cv_sys_largefile_CC+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_cv_sys_largefile_CC=no + if test "$GCC" != yes; then + ac_save_CC=$CC + while :; do + # IRIX 6.2 and later do not support large files by default, + # so use the C compiler's -n32 option if that helps. + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + /* Check that off_t can represent 2**63 - 1 correctly. + We can't simply define LARGE_OFF_T to be 9223372036854775807, + since some C++ compilers masquerading as C compilers + incorrectly reject 9223372036854775807. 
*/ +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) + int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 + && LARGE_OFF_T % 2147483647 == 1) + ? 1 : -1]; +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF + if ac_fn_c_try_compile "$LINENO"; then : + break +fi +rm -f core conftest.err conftest.$ac_objext + CC="$CC -n32" + if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_sys_largefile_CC=' -n32'; break +fi +rm -f core conftest.err conftest.$ac_objext + break + done + CC=$ac_save_CC + rm -f conftest.$ac_ext + fi +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sys_largefile_CC" >&5 +$as_echo "$ac_cv_sys_largefile_CC" >&6; } + if test "$ac_cv_sys_largefile_CC" != no; then + CC=$CC$ac_cv_sys_largefile_CC + fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _FILE_OFFSET_BITS value needed for large files" >&5 +$as_echo_n "checking for _FILE_OFFSET_BITS value needed for large files... " >&6; } +if ${ac_cv_sys_file_offset_bits+:} false; then : + $as_echo_n "(cached) " >&6 +else + while :; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + /* Check that off_t can represent 2**63 - 1 correctly. + We can't simply define LARGE_OFF_T to be 9223372036854775807, + since some C++ compilers masquerading as C compilers + incorrectly reject 9223372036854775807. */ +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) + int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 + && LARGE_OFF_T % 2147483647 == 1) + ? 
1 : -1]; +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_sys_file_offset_bits=no; break +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#define _FILE_OFFSET_BITS 64 +#include + /* Check that off_t can represent 2**63 - 1 correctly. + We can't simply define LARGE_OFF_T to be 9223372036854775807, + since some C++ compilers masquerading as C compilers + incorrectly reject 9223372036854775807. */ +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) + int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 + && LARGE_OFF_T % 2147483647 == 1) + ? 1 : -1]; +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_sys_file_offset_bits=64; break +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_cv_sys_file_offset_bits=unknown + break +done +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sys_file_offset_bits" >&5 +$as_echo "$ac_cv_sys_file_offset_bits" >&6; } +case $ac_cv_sys_file_offset_bits in #( + no | unknown) ;; + *) +cat >>confdefs.h <<_ACEOF +#define _FILE_OFFSET_BITS $ac_cv_sys_file_offset_bits +_ACEOF +;; +esac +rm -rf conftest* + if test $ac_cv_sys_file_offset_bits = unknown; then + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for _LARGE_FILES value needed for large files" >&5 +$as_echo_n "checking for _LARGE_FILES value needed for large files... " >&6; } +if ${ac_cv_sys_large_files+:} false; then : + $as_echo_n "(cached) " >&6 +else + while :; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include + /* Check that off_t can represent 2**63 - 1 correctly. 
+ We can't simply define LARGE_OFF_T to be 9223372036854775807, + since some C++ compilers masquerading as C compilers + incorrectly reject 9223372036854775807. */ +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) + int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 + && LARGE_OFF_T % 2147483647 == 1) + ? 1 : -1]; +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_sys_large_files=no; break +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#define _LARGE_FILES 1 +#include + /* Check that off_t can represent 2**63 - 1 correctly. + We can't simply define LARGE_OFF_T to be 9223372036854775807, + since some C++ compilers masquerading as C compilers + incorrectly reject 9223372036854775807. */ +#define LARGE_OFF_T ((((off_t) 1 << 31) << 31) - 1 + (((off_t) 1 << 31) << 31)) + int off_t_is_large[(LARGE_OFF_T % 2147483629 == 721 + && LARGE_OFF_T % 2147483647 == 1) + ? 1 : -1]; +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_c_try_compile "$LINENO"; then : + ac_cv_sys_large_files=1; break +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_cv_sys_large_files=unknown + break +done +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sys_large_files" >&5 +$as_echo "$ac_cv_sys_large_files" >&6; } +case $ac_cv_sys_large_files in #( + no | unknown) ;; + *) +cat >>confdefs.h <<_ACEOF +#define _LARGE_FILES $ac_cv_sys_large_files +_ACEOF +;; +esac +rm -rf conftest* + fi + + +fi + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for _LARGEFILE_SOURCE value needed for large files" >&5 +$as_echo_n "checking for _LARGEFILE_SOURCE value needed for large files... 
" >&6; } +if ${ac_cv_sys_largefile_source+:} false; then : + $as_echo_n "(cached) " >&6 +else + while :; do + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#include /* for off_t */ + #include +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +int (*fp) (FILE *, off_t, int) = fseeko; + return fseeko (stdin, 0, 0) && fp (stdin, 0, 0); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_sys_largefile_source=no; break +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ +#define _LARGEFILE_SOURCE 1 +#include /* for off_t */ + #include +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +int (*fp) (FILE *, off_t, int) = fseeko; + return fseeko (stdin, 0, 0) && fp (stdin, 0, 0); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_sys_largefile_source=1; break +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext + ac_cv_sys_largefile_source=unknown + break +done +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_sys_largefile_source" >&5 +$as_echo "$ac_cv_sys_largefile_source" >&6; } +case $ac_cv_sys_largefile_source in #( + no | unknown) ;; + *) +cat >>confdefs.h <<_ACEOF +#define _LARGEFILE_SOURCE $ac_cv_sys_largefile_source +_ACEOF +;; +esac +rm -rf conftest* + +# We used to try defining _XOPEN_SOURCE=500 too, to work around a bug +# in glibc 2.1.3, but that breaks too many other things. +# If you want fseeko and ftello with glibc, upgrade to a fixed glibc. 
# HAVE_FSEEKO is advertised unless the _LARGEFILE_SOURCE probe result is
# "unknown".
if test $ac_cv_sys_largefile_source != unknown; then

$as_echo "#define HAVE_FSEEKO 1" >>confdefs.h

fi

# stdlib.h: record HAVE_STDLIB_H when the header check succeeds; the malloc
# probe below keys its #include on this macro.
for ac_header in stdlib.h
do :
  ac_fn_c_check_header_mongrel "$LINENO" "stdlib.h" "ac_cv_header_stdlib_h" "$ac_includes_default"
  if test "x$ac_cv_header_stdlib_h" = xyes; then :
    cat >>confdefs.h <<_ACEOF
#define HAVE_STDLIB_H 1
_ACEOF

  fi

done

# malloc(0) probe: the test program exits 0 only when malloc(0) returns a
# non-null pointer.  The program has to be run, so when cross compiling the
# answer is pessimistically "no".
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for GNU libc compatible malloc" >&5
$as_echo_n "checking for GNU libc compatible malloc... " >&6; }
if ${ac_cv_func_malloc_0_nonnull+:} false; then :
  $as_echo_n "(cached) " >&6
else
  if test "$cross_compiling" = yes; then :
    ac_cv_func_malloc_0_nonnull=no
  else
    # The header name after "# include" had been lost, leaving a bare
    # directive that no preprocessor accepts; <stdlib.h> is what declares
    # malloc.
    cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h.  */
#if defined STDC_HEADERS || defined HAVE_STDLIB_H
# include <stdlib.h>
#else
char *malloc ();
#endif

#ifdef F77_DUMMY_MAIN

#  ifdef __cplusplus
     extern "C"
#  endif
   int F77_DUMMY_MAIN() { return 1; }

#endif
int
main ()
{
return ! malloc (0);
  ;
  return 0;
}
_ACEOF
    if ac_fn_c_try_run "$LINENO"; then :
      ac_cv_func_malloc_0_nonnull=yes
    else
      ac_cv_func_malloc_0_nonnull=no
    fi
    rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext \
      conftest.$ac_objext conftest.beam conftest.$ac_ext
  fi

fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_func_malloc_0_nonnull" >&5
$as_echo "$ac_cv_func_malloc_0_nonnull" >&6; }
if test $ac_cv_func_malloc_0_nonnull = yes; then :

$as_echo "#define HAVE_MALLOC 1" >>confdefs.h

else
  # Fallback: request the replacement object (once) and redirect every
  # malloc call to rpl_malloc.
  $as_echo "#define HAVE_MALLOC 0" >>confdefs.h

  case " $LIBOBJS " in
    *" malloc.$ac_objext "* ) ;;
    *) LIBOBJS="$LIBOBJS malloc.$ac_objext"
       ;;
  esac


$as_echo "#define malloc rpl_malloc" >>confdefs.h

fi


# Signal handler return type: the expression in main() only compiles when
# signal() yields a pointer to a function returning a value, i.e. "int";
# a compile failure is taken to mean "void".
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking return type of signal handlers" >&5
$as_echo_n "checking return type of signal handlers... " >&6; }
if ${ac_cv_type_signal+:} false; then :
  $as_echo_n "(cached) " >&6
else
  # Both #include operands had been lost; without <signal.h> the probe can
  # never compile and the result would always be "void".
  cat confdefs.h - <<_ACEOF >conftest.$ac_ext
/* end confdefs.h.  */
#include <sys/types.h>
#include <signal.h>

#ifdef F77_DUMMY_MAIN

#  ifdef __cplusplus
     extern "C"
#  endif
   int F77_DUMMY_MAIN() { return 1; }

#endif
int
main ()
{
return *(signal (0, 0)) (0) == 1;
  ;
  return 0;
}
_ACEOF
  if ac_fn_c_try_compile "$LINENO"; then :
    ac_cv_type_signal=int
  else
    ac_cv_type_signal=void
  fi
  rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext
fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_type_signal" >&5
$as_echo "$ac_cv_type_signal" >&6; }

cat >>confdefs.h <<_ACEOF
#define RETSIGTYPE $ac_cv_type_signal
_ACEOF


# Library functions: for each one found, append "#define HAVE_<NAME> 1"
# (the name is passed through $as_tr_cpp to form the macro name).
for ac_func in gettimeofday pow sqrt
do :
  as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
  ac_fn_c_check_func "$LINENO" "$ac_func" "$as_ac_var"
  if eval test \"x\$"$as_ac_var"\" = x"yes"; then :
    cat >>confdefs.h <<_ACEOF
#define `$as_echo "HAVE_$ac_func" | $as_tr_cpp` 1
_ACEOF

  fi
done

# Include search path.  The backslash-escaped references are stored
# literally (left for make to expand); lime_dir and lemon_dir are expanded
# right here.
INCLUDES="$INCLUDES -I\$(HOME)/include/ -I. -I\${abs_top_builddir}/ -I\${abs_top_srcdir}/ -I${lime_dir}/include/ -I${lemon_dir}/include/"
DEPFLAGS="$DEPFLAGS"

{ $as_echo "$as_me:${as_lineno-$LINENO}: checking what alignment we want for arrays" >&5
$as_echo_n "checking what alignment we want for arrays... " >&6; }
# Check whether --enable-alignment was given.
# ---------------------------------------------------------------------------
# Array alignment.
#
# withalign      user request: auto (default), none (stored as 1), 16 or 32.
# withautoalign  only meaningful when withalign=auto: the alignment chosen so
#                far; starts at 1 and is raised by the instruction-set and
#                platform sections below.
#
# Every choice appends ALIGN_BASE/ALIGN (and ALIGN_BASE32/ALIGN32) #defines to
# confdefs.h.  In auto mode a later section may append a second, larger
# definition after the initial 0x00 one.
# ---------------------------------------------------------------------------
if test "${enable_alignment+set}" = set; then :
  enableval=$enable_alignment; withalign=$enableval
else
  withalign=auto
fi

if test "$withalign" = "none"; then
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: none" >&5
$as_echo "none" >&6; }
  # "none" becomes the number 1 so the numeric -lt comparisons below work.
  withalign=1

$as_echo "#define ALIGN_BASE 0x00" >>confdefs.h

  $as_echo "#define ALIGN /**/" >>confdefs.h


$as_echo "#define ALIGN_BASE32 0x00" >>confdefs.h

  $as_echo "#define ALIGN32 /**/" >>confdefs.h

elif test $withalign = 16; then
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: 16 bytes" >&5
$as_echo "16 bytes" >&6; }

$as_echo "#define ALIGN_BASE 0x0F" >>confdefs.h

  $as_echo "#define ALIGN __attribute__ ((aligned (16)))" >>confdefs.h


$as_echo "#define ALIGN_BASE32 0x0F" >>confdefs.h

  $as_echo "#define ALIGN32 __attribute__ ((aligned (16)))" >>confdefs.h

elif test $withalign = 32; then
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: 32 bytes" >&5
$as_echo "32 bytes" >&6; }

$as_echo "#define ALIGN_BASE 0x1F" >>confdefs.h

  $as_echo "#define ALIGN __attribute__ ((aligned (32)))" >>confdefs.h


$as_echo "#define ALIGN_BASE32 0x1F" >>confdefs.h

  $as_echo "#define ALIGN32 __attribute__ ((aligned (32)))" >>confdefs.h

elif test $withalign = auto; then
  withautoalign=1
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: auto" >&5
$as_echo "auto" >&6; }

$as_echo "#define ALIGN_BASE 0x00" >>confdefs.h

  $as_echo "#define ALIGN /**/" >>confdefs.h


$as_echo "#define ALIGN_BASE32 0x00" >>confdefs.h

  $as_echo "#define ALIGN32 /**/" >>confdefs.h

else
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: Unusable value for array alignment" >&5
$as_echo "Unusable value for array alignment" >&6; }
  as_fn_error $? "Allowed values are: auto, none, 16, 32" "$LINENO" 5
fi

# ---------------------------------------------------------------------------
# x86 instruction-set switches.  Each one needs 16-byte alignment: in auto
# mode the alignment is raised, an explicit smaller alignment is an error.
# ---------------------------------------------------------------------------
if test "$host_cpu" = "i686" || test "$host_cpu" = "x86_64"; then

  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use P4 instructions" >&5
$as_echo_n "checking whether we want to use P4 instructions... " >&6; }
  # Check whether --enable-p4 was given.
  if test "${enable_p4+set}" = set; then :
    enableval=$enable_p4; enable_p4=$enableval
  else
    enable_p4=no
  fi

  if test $enable_p4 = yes; then
    { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }

$as_echo "#define P4 1" >>confdefs.h

    if test $withalign = auto; then
      if test $withautoalign -lt 16; then
        { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array alignment to 16 bytes for P4 instructions" >&5
$as_echo "increasing array alignment to 16 bytes for P4 instructions" >&6; }

$as_echo "#define ALIGN_BASE 0x0F" >>confdefs.h

        $as_echo "#define ALIGN __attribute__ ((aligned (16)))" >>confdefs.h

        { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array 32 bit alignment to 16 bytes for P4 instructions" >&5
$as_echo "increasing array 32 bit alignment to 16 bytes for P4 instructions" >&6; }

$as_echo "#define ALIGN_BASE32 0x0F" >>confdefs.h

        $as_echo "#define ALIGN32 __attribute__ ((aligned (16)))" >>confdefs.h

        withautoalign=16
      fi
    elif test $withalign -lt 16; then
      as_fn_error $? "alignment incompatible with P4 instructions (16 bytes required)!" "$LINENO" 5
    fi
  else
    { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
  fi

  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use Opteron instructions" >&5
$as_echo_n "checking whether we want to use Opteron instructions... " >&6; }
  # Check whether --enable-opteron was given.
  if test "${enable_opteron+set}" = set; then :
    enableval=$enable_opteron; enable_opteron=$enableval
  else
    enable_opteron=no
  fi

  if test $enable_opteron = yes; then
    { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }

$as_echo "#define OPTERON 1" >>confdefs.h

    if test $withalign = auto; then
      if test $withautoalign -lt 16; then
        { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array alignment to 16 bytes for Opteron instructions" >&5
$as_echo "increasing array alignment to 16 bytes for Opteron instructions" >&6; }

$as_echo "#define ALIGN_BASE 0x0F" >>confdefs.h

        $as_echo "#define ALIGN __attribute__ ((aligned (16)))" >>confdefs.h

        { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array 32 bit alignment to 16 bytes for Opteron instructions" >&5
$as_echo "increasing array 32 bit alignment to 16 bytes for Opteron instructions" >&6; }

$as_echo "#define ALIGN_BASE32 0x0F" >>confdefs.h

        $as_echo "#define ALIGN32 __attribute__ ((aligned (16)))" >>confdefs.h

        withautoalign=16
      fi
    elif test $withalign -lt 16; then
      as_fn_error $? "alignment incompatible with Opteron instructions (16 bytes required)!" "$LINENO" 5
    fi
  else
    { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
  fi

  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use SSE2 instructions" >&5
$as_echo_n "checking whether we want to use SSE2 instructions... " >&6; }
  # Check whether --enable-sse2 was given.
  if test "${enable_sse2+set}" = set; then :
    enableval=$enable_sse2; enable_sse2=$enableval
  else
    enable_sse2=no
  fi

  if test $enable_sse2 = yes; then
    { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
    if test $withalign != auto && test $withalign -lt 16; then
      as_fn_error $? "alignment incompatible with SSE2 instructions (16 bytes required)" "$LINENO" 5
    fi
  else
    { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
  fi

  { $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use SSE3 instructions" >&5
$as_echo_n "checking whether we want to use SSE3 instructions... " >&6; }
  # Check whether --enable-sse3 was given.
  if test "${enable_sse3+set}" = set; then :
    enableval=$enable_sse3; enable_sse3=$enableval
  else
    enable_sse3=no
  fi

  if test $enable_sse3 = yes; then
    { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
    # Both operands of && must be `test` commands.  The second `test` used to
    # be missing, so the shell tried to execute the alignment value itself
    # ("1 -lt 16") as a command; that always failed and an incompatible
    # alignment was silently accepted.
    if test $withalign != auto && test $withalign -lt 16; then
      as_fn_error $? "alignment incompatible with SSE3 instructions (16 bytes required)" "$LINENO" 5
    fi
  else
    { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
  fi

  # SSE2 and SSE3 share one automatic alignment bump.
  if test "$enable_sse2" = "yes" || test "$enable_sse3" = "yes"; then
    if test $withalign = auto; then
      if test $withautoalign -lt 16; then
        { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array alignment to 16 bytes for SSE instructions" >&5
$as_echo "increasing array alignment to 16 bytes for SSE instructions" >&6; }

$as_echo "#define ALIGN_BASE 0x0F" >>confdefs.h

        $as_echo "#define ALIGN __attribute__ ((aligned (16)))" >>confdefs.h

        { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing 32bit array alignment to 16 bytes for SSE instructions" >&5
$as_echo "increasing 32bit array alignment to 16 bytes for SSE instructions" >&6; }

$as_echo "#define ALIGN_BASE32 0x0F" >>confdefs.h

        $as_echo "#define ALIGN32 __attribute__ ((aligned (16)))" >>confdefs.h

        withautoalign=16
      fi
    fi
  fi
fi

# ---------------------------------------------------------------------------
# QPX: 32-byte alignment for ALIGN, 16-byte for ALIGN32.
# enable_qpx is not assigned anywhere in this section, so it is quoted to keep
# `test` well-formed should it be empty or unset.
# ---------------------------------------------------------------------------
if test "$enable_qpx" = yes; then
  if test $withalign = auto; then
    if test $withautoalign -lt 32; then
      { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array alignment to 32 bytes for use of QPX instructions on BG/Q" >&5
$as_echo "increasing array alignment to 32 bytes for use of QPX instructions on BG/Q" >&6; }

$as_echo "#define ALIGN_BASE 0x1F" >>confdefs.h

      $as_echo "#define ALIGN __attribute__ ((aligned (32)))" >>confdefs.h

      { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing 32bit array alignment to 16 bytes for use of QPX instructions on BG/Q" >&5
$as_echo "increasing 32bit array alignment to 16 bytes for use of QPX instructions on BG/Q" >&6; }

$as_echo "#define ALIGN_BASE32 0x0F" >>confdefs.h

      $as_echo "#define ALIGN32 __attribute__ ((aligned (16)))" >>confdefs.h

      withautoalign=32
    fi
  elif test $withalign -lt 32; then
    as_fn_error $? "alignment incompatible with QPX instructions (32 bytes required)" "$LINENO" 5
  fi
fi

# ---------------------------------------------------------------------------
# Platform-driven bumps.  These only act in auto mode; an explicit user
# alignment is left untouched and is not validated here.
# ---------------------------------------------------------------------------
if test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "blrts"; then
  if test $withalign = auto; then
    if test $withautoalign -lt 16; then
      { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array alignment to 16 bytes for BG/L optimization" >&5
$as_echo "increasing array alignment to 16 bytes for BG/L optimization" >&6; }

$as_echo "#define ALIGN_BASE 0x0F" >>confdefs.h


$as_echo "#define ALIGN __attribute__ ((aligned (16)))" >>confdefs.h

      withautoalign=16
    fi
  fi
elif test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "bprts"; then
  if test $withalign = auto; then
    if test $withautoalign -lt 16; then
      { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array alignment to 16 bytes for BG/P optimization" >&5
$as_echo "increasing array alignment to 16 bytes for BG/P optimization" >&6; }

$as_echo "#define ALIGN_BASE 0x0F" >>confdefs.h


$as_echo "#define ALIGN __attribute__ ((aligned (16)))" >>confdefs.h

      withautoalign=16
    fi
  fi
elif test "$host_cpu" = "powerpc64" && test "$host_vendor" = "unknown" && test "$host_os" = "linux-gnu"; then
  if test $withalign = auto; then
    if test $withautoalign -lt 32; then
      { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array alignment to 32 bytes for BG/Q and generic POWER optimization" >&5
$as_echo "increasing array alignment to 32 bytes for BG/Q and generic POWER optimization" >&6; }

$as_echo "#define ALIGN_BASE 0x1F" >>confdefs.h

      $as_echo "#define ALIGN __attribute__ ((aligned (32)))" >>confdefs.h

      { $as_echo "$as_me:${as_lineno-$LINENO}: result: increasing array 32 bit alignment to 16 bytes for BG/Q and generic POWER optimization" >&5
$as_echo "increasing array 32 bit alignment to 16 bytes for BG/Q and generic POWER optimization" >&6; }

$as_echo "#define ALIGN_BASE32 0x0F" >>confdefs.h

      $as_echo "#define ALIGN32 __attribute__ ((aligned (16)))" >>confdefs.h

      withautoalign=32
    fi
  fi
fi

# ---------------------------------------------------------------------------
# gprof profiling: --with-gprof selects PROFILE_FLAG (IBM PowerPC hosts get an
# extra -qfullpath); without it PROFILE_FLAG is emptied.
# ---------------------------------------------------------------------------
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use gprof as profiler" >&5
$as_echo_n "checking whether we want to use gprof as profiler... " >&6; }

# Check whether --with-gprof was given.
if test "${with_gprof+set}" = set; then :
  withval=$with_gprof; enable_gprof=$withval
else
  enable_gprof=no
fi

if test $enable_gprof = yes; then
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
  if test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm"; then
    PROFILE_FLAG="-pg -qfullpath -g"
  else
    PROFILE_FLAG="-pg -g"
  fi
else
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
  PROFILE_FLAG=
fi

{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we shall use rts dram window" >&5
$as_echo_n "checking whether we shall use rts dram window... " >&6; }

# Check whether --with-bgldram was given.
# ---------------------------------------------------------------------------
# --with-bgldram (default: yes) -> _USE_BGLDRAM
# ---------------------------------------------------------------------------
if test "${with_bgldram+set}" = set; then :
  withval=$with_bgldram; with_bgldram=$withval
else
  with_bgldram=yes
fi

if test $with_bgldram = yes; then
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }

$as_echo "#define _USE_BGLDRAM 1" >>confdefs.h

else
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
fi

# ---------------------------------------------------------------------------
# Compiler identification by grepping the output of "$CC -V".  A non-empty
# XLCGREP / PGCC / ICC string means the respective compiler was recognised.
# ---------------------------------------------------------------------------
XLCGREP=`$CC -V 2>&1 | grep -i xlc`
if test "$XLCGREP" != ""; then
  XLC="yes"

$as_echo "#define XLC 1" >>confdefs.h

fi
PGCC=`$CC -V 2>&1 | grep pgcc`
ICC=`$CC -V 2>&1 | grep -i intel`

# ---------------------------------------------------------------------------
# Per-platform flags.  Each branch chooses the dependency generator (CCDEP,
# DEPFLAGS), optimisation flags (OPTARGS, SOPTARGS) and the debug flag.
#
# NOTE(review): the powerpc64, BG/L and BG/P branches store the debug flag in
# DEBUGFLAG while every other branch uses DEBUG_FLAG.  Which name is consumed
# later is not visible from this section; the three branches therefore keep
# DEBUGFLAG and additionally mirror it into DEBUG_FLAG so that all platforms
# agree.  Confirm against the substitution list and drop the unused name.
# ---------------------------------------------------------------------------
if test "$host_cpu" = "i686" || test "$host_cpu" = "x86_64"; then
  if test "$GCC" = yes && test "$ICC" = ""; then
    # Genuine gcc (GCC=yes and not recognised as the Intel compiler).
    DEPFLAGS="-MM"
    CFLAGS="$CFLAGS -pedantic -Wall"
    OPTARGS='-O'
    SOPTARGS='-O'

    if test $enable_sse3 = yes; then
      echo Using SSE3 and SSE2 macros!

$as_echo "#define SSE3 1" >>confdefs.h

      DEPFLAGS="$DEPFLAGS -DSSE3"
      if test "$host_cpu" = "x86_64"; then
        CFLAGS="$CFLAGS -mfpmath=387"
      fi
    elif test $enable_sse2 = yes; then
      DEPFLAGS="$DEPFLAGS -DSSE2"

$as_echo "#define SSE2 1" >>confdefs.h

      if test "$host_cpu" = "x86_64"; then
        CFLAGS="$CFLAGS -mfpmath=387"
      fi
    fi

    if test "$host_cpu" = "x86_64"; then

$as_echo "#define _x86_64 1" >>confdefs.h

    fi
    CCDEP="$CC"
    # With MPI enabled, dependencies are generated with plain gcc instead
    # of $CC.
    if test $enable_mpi = yes; then
      CCDEP="gcc"
    fi
    DEBUG_FLAG="-g"
  else
    if test "$PGCC" != ""; then
      DEPFLAGS="-M"
      echo "We are using the Portland Group C compiler!"
      OPTARGS="-O2"
      SOPTARGS="-O2"
      DEBUG_FLAG="-g"
      # Overwrites any PROFILE_FLAG value assigned earlier in the script.
      PROFILE_FLAG="-p -g"
      CCDEP="$CC"

    elif test "$ICC" != ""; then
      echo "We are using the Intel C compiler!"
      DEPFLAGS="-M"
      OPTARGS="-O3"
      SOPTARGS="-O3"
      DEBUG_FLAG="-g"
      # Overwrites any PROFILE_FLAG value assigned earlier in the script.
      PROFILE_FLAG="-p -g"
      CCDEP="$CC"

    else
      # other compilers might support SSE inline assembly too
      # (the cray compiler, for example)
      if test $enable_sse3 = yes; then
        echo Using SSE3 and SSE2 macros!

$as_echo "#define SSE3 1" >>confdefs.h

      elif test $enable_sse2 = yes; then
        echo Using SSE2 macros only!

$as_echo "#define SSE2 1" >>confdefs.h

      fi

      DEPFLAGS="-M"
      CFLAGS="$CFLAGS -O"
      DEBUG_FLAG="-g"
      CCDEP="$CC"
    fi
  fi

# The MareNostrum: powerpc on a linux system
# this will also evaluate to "true" on BG/Q with XLC
elif test "$host_cpu" = "powerpc64" && test "$host_vendor" = "unknown" && test "$host_os" = "linux-gnu"; then

  DEBUGFLAG="-g"
  if test "$XLC" = "yes"; then
    CFLAGS="-qsrcmsg $CFLAGS"
    DEBUGFLAG="$DEBUGFLAG -qfullpath"
  fi
  DEBUG_FLAG="$DEBUGFLAG"

  OPTARGS="$OPTARGS"
  SOPTARGS="$OPTARGS"
  if test "$CCDEP" = "gcc"; then
    DEPFLAGS="-MM"
  else
    DEPFLAGS="-M"
  fi

#The BLue Gene/L
elif test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "blrts"; then
  if test "$with_bgldram" = yes; then
    # Locate the blrts gcc that is used as the linker driver.  Plain `test`
    # is used; the former subshell wrappers added nothing.
    if test -e /bgl/local/bin/blrts_gcc; then
      BLRTSGCC=/bgl/local/bin/blrts_gcc
    elif test -e /bgl/BlueLight/ppcfloor/blrts-gnu/bin/powerpc-bgl-blrts-gnu-gcc; then
      BLRTSGCC=/bgl/BlueLight/ppcfloor/blrts-gnu/bin/powerpc-bgl-blrts-gnu-gcc
    else
      as_fn_error $? "Sorry, don't know where to find blrts_gcc, see README.bgl!" "$LINENO" 5
    fi
    CCLD="$BLRTSGCC -Xlinker --script=./elf32ppcblrts.x"
    # The linker script must exist and be non-empty.
    if test ! -s ./elf32ppcblrts.x; then
      as_fn_error $? "Sorry, elf32ppcblrts.x is missing, see README.bgl!" "$LINENO" 5
    fi
  fi
  DEBUGFLAG="-g"
  OPTARGS="-O3"
  SOPTARGS="-O3"

$as_echo "#define BGL 1" >>confdefs.h


  if test "$XLC" = "yes"; then
    CFLAGS="-qsrcmsg $CFLAGS"
    OPTARGS="$OPTARGS -qarch=440d -qtune=440"
    SOPTARGS="$SOPTARGS -qarch=440d -qtune=440"
    DEBUGFLAG="$DEBUGFLAG -qfullpath"
#    OPTARGS="-qhot" leads to wrong code
  fi
  DEBUG_FLAG="$DEBUGFLAG"
  LIBS="-lmpich.rts -lfmpich.rts -lmsglayer.rts -lrts.rts -ldevices.rts $LIBS"
  LDFLAGS="$LDFLAGS -L/bgl/BlueLight/ppcfloor/bglsys/lib"
  if test $with_lapack = yes; then
    LIBS="-lesslbg -llapack.rts -lesslbg -lxlf90 -lxlfmath -lxl -lxlopt $LIBS"
    LDFLAGS="$LDFLAGS -L/opt/ibmcmp/xlf/bg/10.1/blrts_lib -L/bgl/local/lib/ -L/opt/ibmmath/lib/"
  fi

  if test "$CCDEP" = "gcc"; then
    DEPFLAGS="-MM"
  else
    DEPFLAGS="-M"
  fi
  CPPFLAGS="-I/bgl/BlueLight/ppcfloor/bglsys/include"
  INCLUDES="$INCLUDES -I/bgl/BlueLight/ppcfloor/bglsys/include/"

#The BLue Gene/P
elif test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "bprts"; then
  CFLAGS="$CFLAGS"
  DEBUGFLAG="-g"
  OPTARGS="-O3"
  SOPTARGS="-O3"

  # BG/P defines BGL as well as BGP.
$as_echo "#define BGL 1" >>confdefs.h


$as_echo "#define BGP 1" >>confdefs.h


  if test "$XLC" = "yes"; then
    CFLAGS="-qsrcmsg $CFLAGS"
    OPTARGS="$OPTARGS -qarch=450d -qtune=450"
    SOPTARGS="$SOPTARGS -qarch=450d -qtune=450"
    DEBUGFLAG="$DEBUGFLAG -qfullpath"
#    OPTARGS="-qhot" leads to wrong code
  fi
  DEBUG_FLAG="$DEBUGFLAG"
#  LIBS="-lxlf90_r -lxlomp_ser -lxl -lxlopt -lxlfmath -ldl -lrt -lpthread $LIBS"
#  LDFLAGS="$LDFLAGS -L/bgsys/local/lib/ -L/opt/ibmcmp/xlf/bg/11.1/lib -L/bgsys/drivers/ppcfloor/comm/"
#  if test $with_lapack = yes; then
#    LIBS="-lesslbg -llapack -lesslbg $LIBS"
#    LDFLAGS="$LDFLAGS -L/opt/ibmmath/lib/"
#  fi

  if test "$CCDEP" = "gcc"; then
    DEPFLAGS="-MM"
  else
    DEPFLAGS="-M"
  fi
  CPPFLAGS="-I/bgsys/drivers/ppcfloor/arch/include/ -I/bgsys/drivers/ppcfloor/comm/include"
  INCLUDES="$INCLUDES -I/bgsys/local/include/ -I/bgsys/drivers/ppcfloor/arch/include/ -I/bgsys/drivers/ppcfloor/comm/include"



# The IBM Power PC
elif test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm"; then
  CFLAGS="$CFLAGS -q64 -qsrcmsg"
  LDFLAGS="$LDFLAGS -q64"
  OPTARGS="-O2"
  SOPTARGS="-O2"
  DEBUG_FLAG="-qfullpath -g"
  if test "$CCDEP" = "gcc"; then
    DEPFLAGS="-MM"
  else
    DEPFLAGS="-M"
  fi

# The CRAY
elif test "$host_vendor" = "cray"; then
  echo
  echo "Hey, we are on a cray, you should take some time for this..."
  echo "get yourself a coffee or so!"
  echo
  CFLAGS="$CFLAGS -dp"

$as_echo "#define CRAY 1" >>confdefs.h

  OPTARGS="-O3"
  SOPTARGS="-O3"
  DEBUG_FLAG="-g"
  CCDEP="$CC"
  DEPFLAGS="-M"

else
  # Any other host: use gcc for dependency generation when a "gcc" executable
  # is found on PATH, otherwise fall back to $CC.  A preset CCDEP wins.
  # Extract the first word of "gcc", so it can be a program name with args.
  set dummy gcc; ac_word=$2
  { $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5
$as_echo_n "checking for $ac_word... " >&6; }
  if ${ac_cv_prog_CCDEP+:} false; then :
    $as_echo_n "(cached) " >&6
  else
    if test -n "$CCDEP"; then
      ac_cv_prog_CCDEP="$CCDEP" # Let the user override the test.
    else
      as_save_IFS=$IFS; IFS=$PATH_SEPARATOR
      for as_dir in $PATH
      do
        IFS=$as_save_IFS
        test -z "$as_dir" && as_dir=.
        for ac_exec_ext in '' $ac_executable_extensions; do
          if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then
            ac_cv_prog_CCDEP=""gcc""
            $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5
            break 2
          fi
        done
      done
      IFS=$as_save_IFS

      test -z "$ac_cv_prog_CCDEP" && ac_cv_prog_CCDEP=""$CC""
    fi
  fi
  CCDEP=$ac_cv_prog_CCDEP
  if test -n "$CCDEP"; then
    { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CCDEP" >&5
$as_echo "$CCDEP" >&6; }
  else
    { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
  fi


  if test "$CCDEP" = "gcc"; then
    DEPFLAGS="-MM"
  else
    DEPFLAGS="-M"
  fi
  OPTARGS=
  SOPTARGS=
fi


# ---------------------------------------------------------------------------
# --enable-optimize (default: yes).  Disabling it clears whatever OPTARGS and
# SOPTARGS the platform branch above selected.
# ---------------------------------------------------------------------------
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to switch on optimisation" >&5
$as_echo_n "checking whether we want to switch on optimisation... " >&6; }
# Check whether --enable-optimize was given.
if test "${enable_optimize+set}" = set; then :
  enableval=$enable_optimize; enable_optimize=$enableval
else
  enable_optimize=yes
fi

if test $enable_optimize = no; then
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5
$as_echo "no" >&6; }
  OPTARGS=
  SOPTARGS=
else
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
fi

{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use a copy of the gauge field" >&5
$as_echo_n "checking whether we want to use a copy of the gauge field... " >&6; }
# Check whether --enable-gaugecopy was given.
# Gauge-field copy switch (--enable-gaugecopy); on unless the user said
# otherwise.  enableval mirrors the user-supplied value when there is one.
if test "${enable_gaugecopy+set}" = set; then
  enableval=$enable_gaugecopy
else
  enable_gaugecopy=yes
fi

if test $enable_gaugecopy = yes; then
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5; $as_echo "yes" >&6; }
  $as_echo "#define _GAUGE_COPY 1" >>confdefs.h
else
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5; $as_echo "no" >&6; }
fi

# Half-spinor exchange in the Dirac operator (--enable-halfspinor); also on
# by default.
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use a Dirac Op. with halfspinor exchange" >&5; $as_echo_n "checking whether we want to use a Dirac Op. with halfspinor exchange... " >&6; }
if test "${enable_halfspinor+set}" = set; then
  enableval=$enable_halfspinor
else
  enable_halfspinor=yes
fi

if test $enable_halfspinor = yes; then
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5; $as_echo "yes" >&6; }
  $as_echo "#define _USE_HALFSPINOR 1" >>confdefs.h

  # The half-spinor operator is built with the gauge copy: if the user
  # disabled it, warn and define _GAUGE_COPY regardless.  The shell variable
  # enable_gaugecopy itself is left at "no".
  if test $enable_gaugecopy = no; then
    { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: switching on gaugecopy for Dirac operator with halfspinor!" >&5; $as_echo "$as_me: WARNING: switching on gaugecopy for Dirac operator with halfspinor!" >&2; }
    $as_echo "#define _GAUGE_COPY 1" >>confdefs.h
  fi
else
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5; $as_echo "no" >&6; }
fi

{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use shmem API" >&5; $as_echo_n "checking whether we want to use shmem API... " >&6; }
# Check whether --enable-shmem was given.
# shmem API (--enable-shmem, off by default): defines _USE_SHMEM and appends
# -lsma to LIBS.
if test "${enable_shmem+set}" = set; then
  enableval=$enable_shmem
else
  enable_shmem=no
fi

if test $enable_shmem = yes; then
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5; $as_echo "yes" >&6; }
  $as_echo "#define _USE_SHMEM 1" >>confdefs.h
  LIBS="$LIBS -lsma"
else
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5; $as_echo "no" >&6; }
fi

# Timeslice-split communication (--enable-tsplitpar, off by default).  The
# decision is kept in enable_tsp, not in enable_tsplitpar.
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use timeslice-splitted communications" >&5; $as_echo_n "checking whether we want to use timeslice-splitted communications... " >&6; }
if test "${enable_tsplitpar+set}" = set; then
  enableval=$enable_tsplitpar
  enable_tsp=$enableval
else
  enable_tsp=no
fi

if test $enable_tsp = yes; then
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5; $as_echo "yes" >&6; }
  $as_echo "#define _USE_TSPLITPAR 1" >>confdefs.h
else
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5; $as_echo "no" >&6; }
fi

# LapH eigenvalue computation (--enable-laph, off by default).
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to compute the LapH eigenvalues" >&5; $as_echo_n "checking whether we want to compute the LapH eigenvalues... " >&6; }
if test "${enable_laph+set}" = set; then
  enableval=$enable_laph
else
  enable_laph=no
fi

if test $enable_laph = yes; then
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5; $as_echo "yes" >&6; }
  $as_echo "#define WITHLAPH 1" >>confdefs.h
else
  { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5; $as_echo "no" >&6; }
fi


{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use CUDA GPU" >&5; $as_echo_n "checking whether we want to use CUDA GPU... " >&6; }
# Check whether --enable-gpu was given.
+if test "${enable_gpu+set}" = set; then : + enableval=$enable_gpu; usegpu=$enableval +else + usegpu=no +fi + +if test $usegpu = yes; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5 +$as_echo "yes" >&6; } + +$as_echo "#define HAVE_GPU 1" >>confdefs.h + + NVCC="nvcc" + USESUBDIRS="$USESUBDIRS GPU" + GPUDIR="GPU" + LIBS="$LIBS -lcuda -lcudart -lcublas" + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking where to search for CUDA libs" >&5 +$as_echo_n "checking where to search for CUDA libs... " >&6; } + +# Check whether --with-cuda was given. +if test "${with_cuda+set}" = set; then : + withval=$with_cuda; cuda_dir=$withval +else + cuda_dir="/usr/local/cuda/lib" +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $cuda_dir" >&5 +$as_echo "$cuda_dir" >&6; } + if test $usegpu = yes; then + LDFLAGS="$LDFLAGS -L$cuda_dir" + fi + + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking CUDA compile args" >&5 +$as_echo_n "checking CUDA compile args... " >&6; } + +# Check whether --with-cudacompileargs was given. +if test "${with_cudacompileargs+set}" = set; then : + withval=$with_cudacompileargs; cuda_compileargs=$withval +else + cuda_compileargs="--gpu-architecture sm_13 --use_fast_math -O3" +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $cuda_compileargs" >&5 +$as_echo "$cuda_compileargs" >&6; } + if test $usegpu = yes; then + GPUCFLAGS="$GPUCFLAGS $cuda_compileargs" + fi + if test $enable_mpi = yes; then + GPUMPICOMPILER="--compiler-bindir mpicc" + if test $withmpidimension != 1; then + as_fn_error $? "ERROR! The GPU Code is only parallelized in t-direction so far!" "$LINENO" 5 + fi + else + GPUMPICOMPILER="" + fi +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } + NVCC="" +fi + + + + + + + + + +# QUDA library for GPUs +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we want to use QUDA GPU" >&5 +$as_echo_n "checking whether we want to use QUDA GPU... 
" >&6; } +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu +if test -z "$CXX"; then + if test -n "$CCC"; then + CXX=$CCC + else + if test -n "$ac_tool_prefix"; then + for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC + do + # Extract the first word of "$ac_tool_prefix$ac_prog", so it can be a program name with args. +set dummy $ac_tool_prefix$ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... " >&6; } +if ${ac_cv_prog_CXX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$CXX"; then + ac_cv_prog_CXX="$CXX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_CXX="$ac_tool_prefix$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +CXX=$ac_cv_prog_CXX +if test -n "$CXX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $CXX" >&5 +$as_echo "$CXX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$CXX" && break + done +fi +if test -z "$CXX"; then + ac_ct_CXX=$CXX + for ac_prog in g++ c++ gpp aCC CC cxx cc++ cl.exe FCC KCC RCC xlC_r xlC +do + # Extract the first word of "$ac_prog", so it can be a program name with args. +set dummy $ac_prog; ac_word=$2 +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for $ac_word" >&5 +$as_echo_n "checking for $ac_word... 
" >&6; } +if ${ac_cv_prog_ac_ct_CXX+:} false; then : + $as_echo_n "(cached) " >&6 +else + if test -n "$ac_ct_CXX"; then + ac_cv_prog_ac_ct_CXX="$ac_ct_CXX" # Let the user override the test. +else +as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if as_fn_executable_p "$as_dir/$ac_word$ac_exec_ext"; then + ac_cv_prog_ac_ct_CXX="$ac_prog" + $as_echo "$as_me:${as_lineno-$LINENO}: found $as_dir/$ac_word$ac_exec_ext" >&5 + break 2 + fi +done + done +IFS=$as_save_IFS + +fi +fi +ac_ct_CXX=$ac_cv_prog_ac_ct_CXX +if test -n "$ac_ct_CXX"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_ct_CXX" >&5 +$as_echo "$ac_ct_CXX" >&6; } +else + { $as_echo "$as_me:${as_lineno-$LINENO}: result: no" >&5 +$as_echo "no" >&6; } +fi + + + test -n "$ac_ct_CXX" && break +done + + if test "x$ac_ct_CXX" = x; then + CXX="g++" + else + case $cross_compiling:$ac_tool_warned in +yes:) +{ $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: using cross tools not prefixed with host triplet" >&5 +$as_echo "$as_me: WARNING: using cross tools not prefixed with host triplet" >&2;} +ac_tool_warned=yes ;; +esac + CXX=$ac_ct_CXX + fi +fi + + fi +fi +# Provide some information about the compiler. +$as_echo "$as_me:${as_lineno-$LINENO}: checking for C++ compiler version" >&5 +set X $ac_compile +ac_compiler=$2 +for ac_option in --version -v -V -qversion; do + { { ac_try="$ac_compiler $ac_option >&5" +case "(($ac_try" in + *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;; + *) ac_try_echo=$ac_try;; +esac +eval ac_try_echo="\"\$as_me:${as_lineno-$LINENO}: $ac_try_echo\"" +$as_echo "$ac_try_echo"; } >&5 + (eval "$ac_compiler $ac_option >&5") 2>conftest.err + ac_status=$? + if test -s conftest.err; then + sed '10a\ +... rest of stderr output deleted ... + 10q' conftest.err >conftest.er1 + cat conftest.er1 >&5 + fi + rm -f conftest.er1 conftest.err + $as_echo "$as_me:${as_lineno-$LINENO}: \$? 
= $ac_status" >&5 + test $ac_status = 0; } +done + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether we are using the GNU C++ compiler" >&5 +$as_echo_n "checking whether we are using the GNU C++ compiler... " >&6; } +if ${ac_cv_cxx_compiler_gnu+:} false; then : + $as_echo_n "(cached) " >&6 +else + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +#ifndef __GNUC__ + choke me +#endif + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_compiler_gnu=yes +else + ac_compiler_gnu=no +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +ac_cv_cxx_compiler_gnu=$ac_compiler_gnu + +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_cxx_compiler_gnu" >&5 +$as_echo "$ac_cv_cxx_compiler_gnu" >&6; } +if test $ac_compiler_gnu = yes; then + GXX=yes +else + GXX= +fi +ac_test_CXXFLAGS=${CXXFLAGS+set} +ac_save_CXXFLAGS=$CXXFLAGS +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether $CXX accepts -g" >&5 +$as_echo_n "checking whether $CXX accepts -g... " >&6; } +if ${ac_cv_prog_cxx_g+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_save_cxx_werror_flag=$ac_cxx_werror_flag + ac_cxx_werror_flag=yes + ac_cv_prog_cxx_g=no + CXXFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_cv_prog_cxx_g=yes +else + CXXFLAGS="" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. 
*/ + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + +else + ac_cxx_werror_flag=$ac_save_cxx_werror_flag + CXXFLAGS="-g" + cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ + + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_compile "$LINENO"; then : + ac_cv_prog_cxx_g=yes +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext +fi +rm -f core conftest.err conftest.$ac_objext conftest.$ac_ext + ac_cxx_werror_flag=$ac_save_cxx_werror_flag +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_prog_cxx_g" >&5 +$as_echo "$ac_cv_prog_cxx_g" >&6; } +if test "$ac_test_CXXFLAGS" = set; then + CXXFLAGS=$ac_save_CXXFLAGS +elif test $ac_cv_prog_cxx_g = yes; then + if test "$GXX" = yes; then + CXXFLAGS="-g -O2" + else + CXXFLAGS="-g" + fi +else + if test "$GXX" = yes; then + CXXFLAGS="-O2" + else + CXXFLAGS= + fi +fi +ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + + + +# Check whether --with-qudadir was given. +if test "${with_qudadir+set}" = set; then : + withval=$with_qudadir; echo yes + QUDA_AVAILABLE=1 + +$as_echo "#define QUDA 1" >>confdefs.h + + quda_dir=$withval + LDFLAGS="$LDFLAGS -L${quda_dir}/lib" + INCLUDES="$INCLUDES -I${quda_dir}/include/" + QUDA_INTERFACE="quda_interface" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking where to search for CUDA libs" >&5 +$as_echo_n "checking where to search for CUDA libs... " >&6; } + +# Check whether --with-cudadir was given. 
+if test "${with_cudadir+set}" = set; then : + withval=$with_cudadir; cuda_dir=$withval +else + cuda_dir="/usr/local/cuda/lib" +fi + + { $as_echo "$as_me:${as_lineno-$LINENO}: result: $cuda_dir" >&5 +$as_echo "$cuda_dir" >&6; } + LDFLAGS="$LDFLAGS -L$cuda_dir" + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for cudaMalloc in -lcudart" >&5 +$as_echo_n "checking for cudaMalloc in -lcudart... " >&6; } +if ${ac_cv_lib_cudart_cudaMalloc+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lcudart $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char cudaMalloc (); +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +return cudaMalloc (); + ; + return 0; +} +_ACEOF +if ac_fn_c_try_link "$LINENO"; then : + ac_cv_lib_cudart_cudaMalloc=yes +else + ac_cv_lib_cudart_cudaMalloc=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_cudart_cudaMalloc" >&5 +$as_echo "$ac_cv_lib_cudart_cudaMalloc" >&6; } +if test "x$ac_cv_lib_cudart_cudaMalloc" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBCUDART 1 +_ACEOF + + LIBS="-lcudart $LIBS" + +else + as_fn_error $? "Can't link a simple program against library cudart." 
"$LINENO" 5 + +fi + + # Perform test in C++ + ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + + { $as_echo "$as_me:${as_lineno-$LINENO}: checking for freeGaugeQuda in -lquda" >&5 +$as_echo_n "checking for freeGaugeQuda in -lquda... " >&6; } +if ${ac_cv_lib_quda_freeGaugeQuda+:} false; then : + $as_echo_n "(cached) " >&6 +else + ac_check_lib_save_LIBS=$LIBS +LIBS="-lquda $LIBS" +cat confdefs.h - <<_ACEOF >conftest.$ac_ext +/* end confdefs.h. */ + +/* Override any GCC internal prototype to avoid an error. + Use char because int might match the return type of a GCC + builtin and then its argument prototype would still apply. */ +#ifdef __cplusplus +extern "C" +#endif +char freeGaugeQuda (); +#ifdef F77_DUMMY_MAIN + +# ifdef __cplusplus + extern "C" +# endif + int F77_DUMMY_MAIN() { return 1; } + +#endif +int +main () +{ +return freeGaugeQuda (); + ; + return 0; +} +_ACEOF +if ac_fn_cxx_try_link "$LINENO"; then : + ac_cv_lib_quda_freeGaugeQuda=yes +else + ac_cv_lib_quda_freeGaugeQuda=no +fi +rm -f core conftest.err conftest.$ac_objext \ + conftest$ac_exeext conftest.$ac_ext +LIBS=$ac_check_lib_save_LIBS +fi +{ $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_cv_lib_quda_freeGaugeQuda" >&5 +$as_echo "$ac_cv_lib_quda_freeGaugeQuda" >&6; } +if test "x$ac_cv_lib_quda_freeGaugeQuda" = xyes; then : + cat >>confdefs.h <<_ACEOF +#define HAVE_LIBQUDA 1 +_ACEOF + + LIBS="-lquda $LIBS" + +else + as_fn_error $? "Can't link a simple program against library libquda. 
(Did you set CXX properly?)" "$LINENO" 5 + +fi + + ac_ext=cpp +ac_cpp='$CXXCPP $CPPFLAGS' +ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5' +ac_link='$CXX -o conftest$ac_exeext $CXXFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS >&5' +ac_compiler_gnu=$ac_cv_cxx_compiler_gnu + + #QUDA needs to be linked with C++ linker + CCLD=${CXX} + +else + echo no + QUDA_AVAILABLE=0 + QUDA_INTERFACE="" + + +fi + + + + +{ $as_echo "$as_me:${as_lineno-$LINENO}: checking checking consistency" >&5 +$as_echo_n "checking checking consistency... " >&6; } +if test $enable_mpi = yes ; then + if test $enable_iig = yes && test $withpersistent = yes ; then + as_fn_error $? "ERROR! indexindepgeom is not compatible with persistent communications " "$LINENO" 5 + fi + if test $enable_iig = yes && test $enable_shmem = yes ; then + as_fn_error $? "ERROR! indexindepgeom is not compatible with shmem API " "$LINENO" 5 + fi + if test $enable_tsp = yes && test $enable_iig = no; then + as_fn_error $? "ERROR! tsplitpar needs indexindepgeom" "$LINENO" 5 + fi + if test $enable_tsp = yes && test $enable_sse2 != yes ; then + as_fn_error $? "ERROR! tsplitpar needs at least SSE2 " "$LINENO" 5 + fi + if test $enable_tsp = yes && test $enable_gaugecopy != yes ; then + as_fn_error $? "ERROR! tsplitpar needs gaugecopy" "$LINENO" 5 + fi + if test $enable_laph = yes && test $enable_tsp != yes ; then + as_fn_error $? "ERROR! laph needs tsplitpar" "$LINENO" 5 + fi +fi + +if test ! -e lib; then + mkdir lib +fi + +if test ! -e test; then + mkdir test +fi + +if test ! -e tests; then + mkdir tests +fi + +if test ! 
-e tests/regressions; then + mkdir tests/regressions +fi + + +LIBS="-lhmc -lmonomial -loperator -lsolver -linit -lmeas -llinalg -lhmc -lxchange -lrational -lio $LIBS" +AUTOCONF=autoconf + +for i in $USESUBDIRS +do + make_files="$make_files $i/Makefile" +done + +ac_config_files="$ac_config_files Makefile $make_files" + + +cat >confcache <<\_ACEOF +# This file is a shell script that caches the results of configure +# tests run on this system so they can be shared between configure +# scripts and configure runs, see configure's option --config-cache. +# It is not useful on other systems. If it contains results you don't +# want to keep, you may remove or edit it. +# +# config.status only pays attention to the cache file if you give it +# the --recheck option to rerun configure. +# +# `ac_cv_env_foo' variables (set or unset) will be overridden when +# loading this file, other *unset* `ac_cv_foo' will be assigned the +# following values. + +_ACEOF + +# The following way of writing the cache mishandles newlines in values, +# but we know of no workaround that is simple, portable, and efficient. +# So, we kill variables containing newlines. +# Ultrix sh set writes to stderr and can't be redirected directly, +# and sets the high bit in the cache file unless we assign to the vars. 
+( + for ac_var in `(set) 2>&1 | sed -n 's/^\([a-zA-Z_][a-zA-Z0-9_]*\)=.*/\1/p'`; do + eval ac_val=\$$ac_var + case $ac_val in #( + *${as_nl}*) + case $ac_var in #( + *_cv_*) { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: cache variable $ac_var contains a newline" >&5 +$as_echo "$as_me: WARNING: cache variable $ac_var contains a newline" >&2;} ;; + esac + case $ac_var in #( + _ | IFS | as_nl) ;; #( + BASH_ARGV | BASH_SOURCE) eval $ac_var= ;; #( + *) { eval $ac_var=; unset $ac_var;} ;; + esac ;; + esac + done + + (set) 2>&1 | + case $as_nl`(ac_space=' '; set) 2>&1` in #( + *${as_nl}ac_space=\ *) + # `set' does not quote correctly, so add quotes: double-quote + # substitution turns \\\\ into \\, and sed turns \\ into \. + sed -n \ + "s/'/'\\\\''/g; + s/^\\([_$as_cr_alnum]*_cv_[_$as_cr_alnum]*\\)=\\(.*\\)/\\1='\\2'/p" + ;; #( + *) + # `set' quotes correctly as required by POSIX, so do not add quotes. + sed -n "/^[_$as_cr_alnum]*_cv_[_$as_cr_alnum]*=/p" + ;; + esac | + sort +) | + sed ' + /^ac_cv_env_/b end + t clear + :clear + s/^\([^=]*\)=\(.*[{}].*\)$/test "${\1+set}" = set || &/ + t end + s/^\([^=]*\)=\(.*\)$/\1=${\1=\2}/ + :end' >>confcache +if diff "$cache_file" confcache >/dev/null 2>&1; then :; else + if test -w "$cache_file"; then + if test "x$cache_file" != "x/dev/null"; then + { $as_echo "$as_me:${as_lineno-$LINENO}: updating cache $cache_file" >&5 +$as_echo "$as_me: updating cache $cache_file" >&6;} + if test ! -f "$cache_file" || test -h "$cache_file"; then + cat confcache >"$cache_file" + else + case $cache_file in #( + */* | ?:*) + mv -f confcache "$cache_file"$$ && + mv -f "$cache_file"$$ "$cache_file" ;; #( + *) + mv -f confcache "$cache_file" ;; + esac + fi + fi + else + { $as_echo "$as_me:${as_lineno-$LINENO}: not updating unwritable cache $cache_file" >&5 +$as_echo "$as_me: not updating unwritable cache $cache_file" >&6;} + fi +fi +rm -f confcache + +test "x$prefix" = xNONE && prefix=$ac_default_prefix +# Let make expand exec_prefix. 
+test "x$exec_prefix" = xNONE && exec_prefix='${prefix}' + +DEFS=-DHAVE_CONFIG_H + +ac_libobjs= +ac_ltlibobjs= +U= +for ac_i in : $LIBOBJS; do test "x$ac_i" = x: && continue + # 1. Remove the extension, and $U if already installed. + ac_script='s/\$U\././;s/\.o$//;s/\.obj$//' + ac_i=`$as_echo "$ac_i" | sed "$ac_script"` + # 2. Prepend LIBOBJDIR. When used with automake>=1.10 LIBOBJDIR + # will be set to the directory where LIBOBJS objects are built. + as_fn_append ac_libobjs " \${LIBOBJDIR}$ac_i\$U.$ac_objext" + as_fn_append ac_ltlibobjs " \${LIBOBJDIR}$ac_i"'$U.lo' +done +LIBOBJS=$ac_libobjs + +LTLIBOBJS=$ac_ltlibobjs + + + +: "${CONFIG_STATUS=./config.status}" +ac_write_fail=0 +ac_clean_files_save=$ac_clean_files +ac_clean_files="$ac_clean_files $CONFIG_STATUS" +{ $as_echo "$as_me:${as_lineno-$LINENO}: creating $CONFIG_STATUS" >&5 +$as_echo "$as_me: creating $CONFIG_STATUS" >&6;} +as_write_fail=0 +cat >$CONFIG_STATUS <<_ASEOF || as_write_fail=1 +#! $SHELL +# Generated by $as_me. +# Run this file to recreate the current configuration. +# Compiler output produced by configure, useful for debugging +# configure, is in config.log if it exists. + +debug=false +ac_cs_recheck=false +ac_cs_silent=false + +SHELL=\${CONFIG_SHELL-$SHELL} +export SHELL +_ASEOF +cat >>$CONFIG_STATUS <<\_ASEOF || as_write_fail=1 +## -------------------- ## +## M4sh Initialization. ## +## -------------------- ## + +# Be more Bourne compatible +DUALCASE=1; export DUALCASE # for MKS sh +if test -n "${ZSH_VERSION+set}" && (emulate sh) >/dev/null 2>&1; then : + emulate sh + NULLCMD=: + # Pre-4.2 versions of Zsh do word splitting on ${1+"$@"}, which + # is contrary to our usage. Disable this feature. + alias -g '${1+"$@"}'='"$@"' + setopt NO_GLOB_SUBST +else + case `(set -o) 2>/dev/null` in #( + *posix*) : + set -o posix ;; #( + *) : + ;; +esac +fi + + +as_nl=' +' +export as_nl +# Printing a long string crashes Solaris 7 /usr/bin/printf. 
+as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\' +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo +as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo +# Prefer a ksh shell builtin over an external printf program on Solaris, +# but without wasting forks for bash or zsh. +if test -z "$BASH_VERSION$ZSH_VERSION" \ + && (test "X`print -r -- $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='print -r --' + as_echo_n='print -rn --' +elif (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then + as_echo='printf %s\n' + as_echo_n='printf %s' +else + if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then + as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"' + as_echo_n='/usr/ucb/echo -n' + else + as_echo_body='eval expr "X$1" : "X\\(.*\\)"' + as_echo_n_body='eval + arg=$1; + case $arg in #( + *"$as_nl"*) + expr "X$arg" : "X\\(.*\\)$as_nl"; + arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;; + esac; + expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl" + ' + export as_echo_n_body + as_echo_n='sh -c $as_echo_n_body as_echo' + fi + export as_echo_body + as_echo='sh -c $as_echo_body as_echo' +fi + +# The user is always right. +if test "${PATH_SEPARATOR+set}" != set; then + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 && { + (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 || + PATH_SEPARATOR=';' + } +fi + + +# IFS +# We need space, tab and new line, in precisely that order. Quoting is +# there to prevent editors from complaining about space-tab. +# (If _AS_PATH_WALK were called with IFS unset, it would disable word +# splitting by setting IFS to empty value.) +IFS=" "" $as_nl" + +# Find who we are. Look in the path if we contain no directory separator. +as_myself= +case $0 in #(( + *[\\/]* ) as_myself=$0 ;; + *) as_save_IFS=$IFS; IFS=$PATH_SEPARATOR +for as_dir in $PATH +do + IFS=$as_save_IFS + test -z "$as_dir" && as_dir=. 
+ test -r "$as_dir/$0" && as_myself=$as_dir/$0 && break + done +IFS=$as_save_IFS + + ;; +esac +# We did not find ourselves, most probably we were run as `sh COMMAND' +# in which case we are not to be found in the path. +if test "x$as_myself" = x; then + as_myself=$0 +fi +if test ! -f "$as_myself"; then + $as_echo "$as_myself: error: cannot find myself; rerun with an absolute file name" >&2 + exit 1 +fi + +# Unset variables that we do not need and which cause bugs (e.g. in +# pre-3.0 UWIN ksh). But do not cause bugs in bash 2.01; the "|| exit 1" +# suppresses any "Segmentation fault" message there. '((' could +# trigger a bug in pdksh 5.2.14. +for as_var in BASH_ENV ENV MAIL MAILPATH +do eval test x\${$as_var+set} = xset \ + && ( (unset $as_var) || exit 1) >/dev/null 2>&1 && unset $as_var || : +done +PS1='$ ' +PS2='> ' +PS4='+ ' + +# NLS nuisances. +LC_ALL=C +export LC_ALL +LANGUAGE=C +export LANGUAGE + +# CDPATH. +(unset CDPATH) >/dev/null 2>&1 && unset CDPATH + + +# as_fn_error STATUS ERROR [LINENO LOG_FD] +# ---------------------------------------- +# Output "`basename $0`: error: ERROR" to stderr. If LINENO and LOG_FD are +# provided, also output the error to LOG_FD, referencing LINENO. Then exit the +# script with STATUS, using 1 if that was 0. +as_fn_error () +{ + as_status=$1; test $as_status -eq 0 && as_status=1 + if test "$4"; then + as_lineno=${as_lineno-"$3"} as_lineno_stack=as_lineno_stack=$as_lineno_stack + $as_echo "$as_me:${as_lineno-$LINENO}: error: $2" >&$4 + fi + $as_echo "$as_me: error: $2" >&2 + as_fn_exit $as_status +} # as_fn_error + + +# as_fn_set_status STATUS +# ----------------------- +# Set $? to STATUS, without forking. +as_fn_set_status () +{ + return $1 +} # as_fn_set_status + +# as_fn_exit STATUS +# ----------------- +# Exit the shell with STATUS, even in a "trap 0" or "set -e" context. +as_fn_exit () +{ + set +e + as_fn_set_status $1 + exit $1 +} # as_fn_exit + +# as_fn_unset VAR +# --------------- +# Portably unset VAR. 
+as_fn_unset () +{ + { eval $1=; unset $1;} +} +as_unset=as_fn_unset +# as_fn_append VAR VALUE +# ---------------------- +# Append the text in VALUE to the end of the definition contained in VAR. Take +# advantage of any shell optimizations that allow amortized linear growth over +# repeated appends, instead of the typical quadratic growth present in naive +# implementations. +if (eval "as_var=1; as_var+=2; test x\$as_var = x12") 2>/dev/null; then : + eval 'as_fn_append () + { + eval $1+=\$2 + }' +else + as_fn_append () + { + eval $1=\$$1\$2 + } +fi # as_fn_append + +# as_fn_arith ARG... +# ------------------ +# Perform arithmetic evaluation on the ARGs, and store the result in the +# global $as_val. Take advantage of shells that can avoid forks. The arguments +# must be portable across $(()) and expr. +if (eval "test \$(( 1 + 1 )) = 2") 2>/dev/null; then : + eval 'as_fn_arith () + { + as_val=$(( $* )) + }' +else + as_fn_arith () + { + as_val=`expr "$@" || test $? -eq 1` + } +fi # as_fn_arith + + +if expr a : '\(a\)' >/dev/null 2>&1 && + test "X`expr 00001 : '.*\(...\)'`" = X001; then + as_expr=expr +else + as_expr=false +fi + +if (basename -- /) >/dev/null 2>&1 && test "X`basename -- / 2>&1`" = "X/"; then + as_basename=basename +else + as_basename=false +fi + +if (as_dir=`dirname -- /` && test "X$as_dir" = X/) >/dev/null 2>&1; then + as_dirname=dirname +else + as_dirname=false +fi + +as_me=`$as_basename -- "$0" || +$as_expr X/"$0" : '.*/\([^/][^/]*\)/*$' \| \ + X"$0" : 'X\(//\)$' \| \ + X"$0" : 'X\(/\)' \| . 2>/dev/null || +$as_echo X/"$0" | + sed '/^.*\/\([^/][^/]*\)\/*$/{ + s//\1/ + q + } + /^X\/\(\/\/\)$/{ + s//\1/ + q + } + /^X\/\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + +# Avoid depending upon Character Ranges. 
+as_cr_letters='abcdefghijklmnopqrstuvwxyz' +as_cr_LETTERS='ABCDEFGHIJKLMNOPQRSTUVWXYZ' +as_cr_Letters=$as_cr_letters$as_cr_LETTERS +as_cr_digits='0123456789' +as_cr_alnum=$as_cr_Letters$as_cr_digits + +ECHO_C= ECHO_N= ECHO_T= +case `echo -n x` in #((((( +-n*) + case `echo 'xy\c'` in + *c*) ECHO_T=' ';; # ECHO_T is single tab character. + xy) ECHO_C='\c';; + *) echo `echo ksh88 bug on AIX 6.1` > /dev/null + ECHO_T=' ';; + esac;; +*) + ECHO_N='-n';; +esac + +rm -f conf$$ conf$$.exe conf$$.file +if test -d conf$$.dir; then + rm -f conf$$.dir/conf$$.file +else + rm -f conf$$.dir + mkdir conf$$.dir 2>/dev/null +fi +if (echo >conf$$.file) 2>/dev/null; then + if ln -s conf$$.file conf$$ 2>/dev/null; then + as_ln_s='ln -s' + # ... but there are two gotchas: + # 1) On MSYS, both `ln -s file dir' and `ln file dir' fail. + # 2) DJGPP < 2.04 has no symlinks; `ln -s' creates a wrapper executable. + # In both cases, we have to default to `cp -pR'. + ln -s conf$$.file conf$$.dir 2>/dev/null && test ! -f conf$$.exe || + as_ln_s='cp -pR' + elif ln conf$$.file conf$$ 2>/dev/null; then + as_ln_s=ln + else + as_ln_s='cp -pR' + fi +else + as_ln_s='cp -pR' +fi +rm -f conf$$ conf$$.exe conf$$.dir/conf$$.file conf$$.file +rmdir conf$$.dir 2>/dev/null + + +# as_fn_mkdir_p +# ------------- +# Create "$as_dir" as a directory, including parents if necessary. +as_fn_mkdir_p () +{ + + case $as_dir in #( + -*) as_dir=./$as_dir;; + esac + test -d "$as_dir" || eval $as_mkdir_p || { + as_dirs= + while :; do + case $as_dir in #( + *\'*) as_qdir=`$as_echo "$as_dir" | sed "s/'/'\\\\\\\\''/g"`;; #'( + *) as_qdir=$as_dir;; + esac + as_dirs="'$as_qdir' $as_dirs" + as_dir=`$as_dirname -- "$as_dir" || +$as_expr X"$as_dir" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$as_dir" : 'X\(//\)[^/]' \| \ + X"$as_dir" : 'X\(//\)$' \| \ + X"$as_dir" : 'X\(/\)' \| . 
2>/dev/null || +$as_echo X"$as_dir" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + test -d "$as_dir" && break + done + test -z "$as_dirs" || eval "mkdir $as_dirs" + } || test -d "$as_dir" || as_fn_error $? "cannot create directory $as_dir" + + +} # as_fn_mkdir_p +if mkdir -p . 2>/dev/null; then + as_mkdir_p='mkdir -p "$as_dir"' +else + test -d ./-p && rmdir ./-p + as_mkdir_p=false +fi + + +# as_fn_executable_p FILE +# ----------------------- +# Test if FILE is an executable regular file. +as_fn_executable_p () +{ + test -f "$1" && test -x "$1" +} # as_fn_executable_p +as_test_x='test -x' +as_executable_p=as_fn_executable_p + +# Sed expression to map a string onto a valid CPP name. +as_tr_cpp="eval sed 'y%*$as_cr_letters%P$as_cr_LETTERS%;s%[^_$as_cr_alnum]%_%g'" + +# Sed expression to map a string onto a valid variable name. +as_tr_sh="eval sed 'y%*+%pp%;s%[^_$as_cr_alnum]%_%g'" + + +exec 6>&1 +## ----------------------------------- ## +## Main body of $CONFIG_STATUS script. ## +## ----------------------------------- ## +_ASEOF +test $as_write_fail = 0 && chmod +x $CONFIG_STATUS || ac_write_fail=1 + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# Save the log message, to keep $0 and so on meaningful, and to +# report actual input values of CONFIG_FILES etc. instead of their +# values after options handling. +ac_log=" +This file was extended by tmLQCD $as_me 5.2.0, which was +generated by GNU Autoconf 2.69. 
Invocation command line was + + CONFIG_FILES = $CONFIG_FILES + CONFIG_HEADERS = $CONFIG_HEADERS + CONFIG_LINKS = $CONFIG_LINKS + CONFIG_COMMANDS = $CONFIG_COMMANDS + $ $0 $@ + +on `(hostname || uname -n) 2>/dev/null | sed 1q` +" + +_ACEOF + +case $ac_config_files in *" +"*) set x $ac_config_files; shift; ac_config_files=$*;; +esac + +case $ac_config_headers in *" +"*) set x $ac_config_headers; shift; ac_config_headers=$*;; +esac + + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +# Files that config.status was made for. +config_files="$ac_config_files" +config_headers="$ac_config_headers" + +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +ac_cs_usage="\ +\`$as_me' instantiates files and other configuration actions +from templates according to the current configuration. Unless the files +and actions are specified as TAGs, all are instantiated by default. + +Usage: $0 [OPTION]... [TAG]... + + -h, --help print this help, then exit + -V, --version print version number and configuration settings, then exit + --config print configuration, then exit + -q, --quiet, --silent + do not print progress messages + -d, --debug don't remove temporary files + --recheck update $as_me by reconfiguring in the same conditions + --file=FILE[:TEMPLATE] + instantiate the configuration file FILE + --header=FILE[:TEMPLATE] + instantiate the configuration header FILE + +Configuration files: +$config_files + +Configuration headers: +$config_headers + +Report bugs to ." + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" +ac_cs_version="\\ +tmLQCD config.status 5.2.0 +configured by $0, generated by GNU Autoconf 2.69, + with options \\"\$ac_cs_config\\" + +Copyright (C) 2012 Free Software Foundation, Inc. +This config.status script is free software; the Free Software Foundation +gives unlimited permission to copy, distribute and modify it." 
+ +ac_pwd='$ac_pwd' +srcdir='$srcdir' +test -n "\$AWK" || AWK=awk +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# The default lists apply if the user does not specify any file. +ac_need_defaults=: +while test $# != 0 +do + case $1 in + --*=?*) + ac_option=`expr "X$1" : 'X\([^=]*\)='` + ac_optarg=`expr "X$1" : 'X[^=]*=\(.*\)'` + ac_shift=: + ;; + --*=) + ac_option=`expr "X$1" : 'X\([^=]*\)='` + ac_optarg= + ac_shift=: + ;; + *) + ac_option=$1 + ac_optarg=$2 + ac_shift=shift + ;; + esac + + case $ac_option in + # Handling of the options. + -recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r) + ac_cs_recheck=: ;; + --version | --versio | --versi | --vers | --ver | --ve | --v | -V ) + $as_echo "$ac_cs_version"; exit ;; + --config | --confi | --conf | --con | --co | --c ) + $as_echo "$ac_cs_config"; exit ;; + --debug | --debu | --deb | --de | --d | -d ) + debug=: ;; + --file | --fil | --fi | --f ) + $ac_shift + case $ac_optarg in + *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; + '') as_fn_error $? "missing file argument" ;; + esac + as_fn_append CONFIG_FILES " '$ac_optarg'" + ac_need_defaults=false;; + --header | --heade | --head | --hea ) + $ac_shift + case $ac_optarg in + *\'*) ac_optarg=`$as_echo "$ac_optarg" | sed "s/'/'\\\\\\\\''/g"` ;; + esac + as_fn_append CONFIG_HEADERS " '$ac_optarg'" + ac_need_defaults=false;; + --he | --h) + # Conflict between --help and --header + as_fn_error $? "ambiguous option: \`$1' +Try \`$0 --help' for more information.";; + --help | --hel | -h ) + $as_echo "$ac_cs_usage"; exit ;; + -q | -quiet | --quiet | --quie | --qui | --qu | --q \ + | -silent | --silent | --silen | --sile | --sil | --si | --s) + ac_cs_silent=: ;; + + # This is an error. + -*) as_fn_error $? "unrecognized option: \`$1' +Try \`$0 --help' for more information." 
;; + + *) as_fn_append ac_config_targets " $1" + ac_need_defaults=false ;; + + esac + shift +done + +ac_configure_extra_args= + +if $ac_cs_silent; then + exec 6>/dev/null + ac_configure_extra_args="$ac_configure_extra_args --silent" +fi + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +if \$ac_cs_recheck; then + set X $SHELL '$0' $ac_configure_args \$ac_configure_extra_args --no-create --no-recursion + shift + \$as_echo "running CONFIG_SHELL=$SHELL \$*" >&6 + CONFIG_SHELL='$SHELL' + export CONFIG_SHELL + exec "\$@" +fi + +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +exec 5>>config.log +{ + echo + sed 'h;s/./-/g;s/^.../## /;s/...$/ ##/;p;x;p;x' <<_ASBOX +## Running $as_me. ## +_ASBOX + $as_echo "$ac_log" +} >&5 + +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 + +# Handling of arguments. +for ac_config_target in $ac_config_targets +do + case $ac_config_target in + "config.h") CONFIG_HEADERS="$CONFIG_HEADERS config.h" ;; + "fixed_volume.h") CONFIG_FILES="$CONFIG_FILES fixed_volume.h" ;; + "Makefile") CONFIG_FILES="$CONFIG_FILES Makefile" ;; + "$make_files") CONFIG_FILES="$CONFIG_FILES $make_files" ;; + + *) as_fn_error $? "invalid argument: \`$ac_config_target'" "$LINENO" 5;; + esac +done + + +# If the user did not use the arguments to specify the items to instantiate, +# then the envvar interface is used. Set only those that are not. +# We use the long form for the default assignment because of an extremely +# bizarre bug on SunOS 4.1.3. +if $ac_need_defaults; then + test "${CONFIG_FILES+set}" = set || CONFIG_FILES=$config_files + test "${CONFIG_HEADERS+set}" = set || CONFIG_HEADERS=$config_headers +fi + +# Have a temporary directory for convenience. Make it in the build tree +# simply because there is no reason against having it here, and in addition, +# creating and moving files from /tmp can sometimes cause problems. +# Hook for its removal unless debugging. 
+# Note that there is a small window in which the directory will not be cleaned: +# after its creation but before its name has been assigned to `$tmp'. +$debug || +{ + tmp= ac_tmp= + trap 'exit_status=$? + : "${ac_tmp:=$tmp}" + { test ! -d "$ac_tmp" || rm -fr "$ac_tmp"; } && exit $exit_status +' 0 + trap 'as_fn_exit 1' 1 2 13 15 +} +# Create a (secure) tmp directory for tmp files. + +{ + tmp=`(umask 077 && mktemp -d "./confXXXXXX") 2>/dev/null` && + test -d "$tmp" +} || +{ + tmp=./conf$$-$RANDOM + (umask 077 && mkdir "$tmp") +} || as_fn_error $? "cannot create a temporary directory in ." "$LINENO" 5 +ac_tmp=$tmp + +# Set up the scripts for CONFIG_FILES section. +# No need to generate them if there are no CONFIG_FILES. +# This happens for instance with `./config.status config.h'. +if test -n "$CONFIG_FILES"; then + + +ac_cr=`echo X | tr X '\015'` +# On cygwin, bash can eat \r inside `` if the user requested igncr. +# But we know of no other shell where ac_cr would be empty at this +# point, so we can use a bashism as a fallback. +if test "x$ac_cr" = x; then + eval ac_cr=\$\'\\r\' +fi +ac_cs_awk_cr=`$AWK 'BEGIN { print "a\rb" }' /dev/null` +if test "$ac_cs_awk_cr" = "a${ac_cr}b"; then + ac_cs_awk_cr='\\r' +else + ac_cs_awk_cr=$ac_cr +fi + +echo 'BEGIN {' >"$ac_tmp/subs1.awk" && +_ACEOF + + +{ + echo "cat >conf$$subs.awk <<_ACEOF" && + echo "$ac_subst_vars" | sed 's/.*/&!$&$ac_delim/' && + echo "_ACEOF" +} >conf$$subs.sh || + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 +ac_delim_num=`echo "$ac_subst_vars" | grep -c '^'` +ac_delim='%!_!# ' +for ac_last_try in false false false false false :; do + . ./conf$$subs.sh || + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 + + ac_delim_n=`sed -n "s/.*$ac_delim\$/X/p" conf$$subs.awk | grep -c X` + if test $ac_delim_n = $ac_delim_num; then + break + elif $ac_last_try; then + as_fn_error $? "could not make $CONFIG_STATUS" "$LINENO" 5 + else + ac_delim="$ac_delim!$ac_delim _$ac_delim!! 
" + fi +done +rm -f conf$$subs.sh + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +cat >>"\$ac_tmp/subs1.awk" <<\\_ACAWK && +_ACEOF +sed -n ' +h +s/^/S["/; s/!.*/"]=/ +p +g +s/^[^!]*!// +:repl +t repl +s/'"$ac_delim"'$// +t delim +:nl +h +s/\(.\{148\}\)..*/\1/ +t more1 +s/["\\]/\\&/g; s/^/"/; s/$/\\n"\\/ +p +n +b repl +:more1 +s/["\\]/\\&/g; s/^/"/; s/$/"\\/ +p +g +s/.\{148\}// +t nl +:delim +h +s/\(.\{148\}\)..*/\1/ +t more2 +s/["\\]/\\&/g; s/^/"/; s/$/"/ +p +b +:more2 +s/["\\]/\\&/g; s/^/"/; s/$/"\\/ +p +g +s/.\{148\}// +t delim +' >$CONFIG_STATUS || ac_write_fail=1 +rm -f conf$$subs.awk +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +_ACAWK +cat >>"\$ac_tmp/subs1.awk" <<_ACAWK && + for (key in S) S_is_set[key] = 1 + FS = "" + +} +{ + line = $ 0 + nfields = split(line, field, "@") + substed = 0 + len = length(field[1]) + for (i = 2; i < nfields; i++) { + key = field[i] + keylen = length(key) + if (S_is_set[key]) { + value = S[key] + line = substr(line, 1, len) "" value "" substr(line, len + keylen + 3) + len += length(value) + length(field[++i]) + substed = 1 + } else + len += 1 + keylen + } + + print line +} + +_ACAWK +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +if sed "s/$ac_cr//" < /dev/null > /dev/null 2>&1; then + sed "s/$ac_cr\$//; s/$ac_cr/$ac_cs_awk_cr/g" +else + cat +fi < "$ac_tmp/subs1.awk" > "$ac_tmp/subs.awk" \ + || as_fn_error $? "could not setup config files machinery" "$LINENO" 5 +_ACEOF + +# VPATH may cause trouble with some makes, so we remove sole $(srcdir), +# ${srcdir} and @srcdir@ entries from VPATH if srcdir is ".", strip leading and +# trailing colons and then remove the whole line if VPATH becomes empty +# (actually we leave an empty line to preserve line numbers). 
+if test "x$srcdir" = x.; then + ac_vpsub='/^[ ]*VPATH[ ]*=[ ]*/{ +h +s/// +s/^/:/ +s/[ ]*$/:/ +s/:\$(srcdir):/:/g +s/:\${srcdir}:/:/g +s/:@srcdir@:/:/g +s/^:*// +s/:*$// +x +s/\(=[ ]*\).*/\1/ +G +s/\n// +s/^[^=]*=[ ]*$// +}' +fi + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +fi # test -n "$CONFIG_FILES" + +# Set up the scripts for CONFIG_HEADERS section. +# No need to generate them if there are no CONFIG_HEADERS. +# This happens for instance with `./config.status Makefile'. +if test -n "$CONFIG_HEADERS"; then +cat >"$ac_tmp/defines.awk" <<\_ACAWK || +BEGIN { +_ACEOF + +# Transform confdefs.h into an awk script `defines.awk', embedded as +# here-document in config.status, that substitutes the proper values into +# config.h.in to produce config.h. + +# Create a delimiter string that does not exist in confdefs.h, to ease +# handling of long lines. +ac_delim='%!_!# ' +for ac_last_try in false false :; do + ac_tt=`sed -n "/$ac_delim/p" confdefs.h` + if test -z "$ac_tt"; then + break + elif $ac_last_try; then + as_fn_error $? "could not make $CONFIG_HEADERS" "$LINENO" 5 + else + ac_delim="$ac_delim!$ac_delim _$ac_delim!! " + fi +done + +# For the awk script, D is an array of macro values keyed by name, +# likewise P contains macro parameters if any. Preserve backslash +# newline sequences. 
+ +ac_word_re=[_$as_cr_Letters][_$as_cr_alnum]* +sed -n ' +s/.\{148\}/&'"$ac_delim"'/g +t rset +:rset +s/^[ ]*#[ ]*define[ ][ ]*/ / +t def +d +:def +s/\\$// +t bsnl +s/["\\]/\\&/g +s/^ \('"$ac_word_re"'\)\(([^()]*)\)[ ]*\(.*\)/P["\1"]="\2"\ +D["\1"]=" \3"/p +s/^ \('"$ac_word_re"'\)[ ]*\(.*\)/D["\1"]=" \2"/p +d +:bsnl +s/["\\]/\\&/g +s/^ \('"$ac_word_re"'\)\(([^()]*)\)[ ]*\(.*\)/P["\1"]="\2"\ +D["\1"]=" \3\\\\\\n"\\/p +t cont +s/^ \('"$ac_word_re"'\)[ ]*\(.*\)/D["\1"]=" \2\\\\\\n"\\/p +t cont +d +:cont +n +s/.\{148\}/&'"$ac_delim"'/g +t clear +:clear +s/\\$// +t bsnlc +s/["\\]/\\&/g; s/^/"/; s/$/"/p +d +:bsnlc +s/["\\]/\\&/g; s/^/"/; s/$/\\\\\\n"\\/p +b cont +' >$CONFIG_STATUS || ac_write_fail=1 + +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 + for (key in D) D_is_set[key] = 1 + FS = "" +} +/^[\t ]*#[\t ]*(define|undef)[\t ]+$ac_word_re([\t (]|\$)/ { + line = \$ 0 + split(line, arg, " ") + if (arg[1] == "#") { + defundef = arg[2] + mac1 = arg[3] + } else { + defundef = substr(arg[1], 2) + mac1 = arg[2] + } + split(mac1, mac2, "(") #) + macro = mac2[1] + prefix = substr(line, 1, index(line, defundef) - 1) + if (D_is_set[macro]) { + # Preserve the white space surrounding the "#". + print prefix "define", macro P[macro] D[macro] + next + } else { + # Replace #undef with comments. This is necessary, for example, + # in the case of _POSIX_SOURCE, which is predefined and required + # on some systems where configure will not decide to define it. + if (defundef == "undef") { + print "/*", prefix defundef, macro, "*/" + next + } + } +} +{ print } +_ACAWK +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 + as_fn_error $? "could not setup config headers machinery" "$LINENO" 5 +fi # test -n "$CONFIG_HEADERS" + + +eval set X " :F $CONFIG_FILES :H $CONFIG_HEADERS " +shift +for ac_tag +do + case $ac_tag in + :[FHLC]) ac_mode=$ac_tag; continue;; + esac + case $ac_mode$ac_tag in + :[FHL]*:*);; + :L* | :C*:*) as_fn_error $? 
"invalid tag \`$ac_tag'" "$LINENO" 5;; + :[FH]-) ac_tag=-:-;; + :[FH]*) ac_tag=$ac_tag:$ac_tag.in;; + esac + ac_save_IFS=$IFS + IFS=: + set x $ac_tag + IFS=$ac_save_IFS + shift + ac_file=$1 + shift + + case $ac_mode in + :L) ac_source=$1;; + :[FH]) + ac_file_inputs= + for ac_f + do + case $ac_f in + -) ac_f="$ac_tmp/stdin";; + *) # Look for the file first in the build tree, then in the source tree + # (if the path is not absolute). The absolute path cannot be DOS-style, + # because $ac_f cannot contain `:'. + test -f "$ac_f" || + case $ac_f in + [\\/$]*) false;; + *) test -f "$srcdir/$ac_f" && ac_f="$srcdir/$ac_f";; + esac || + as_fn_error 1 "cannot find input file: \`$ac_f'" "$LINENO" 5;; + esac + case $ac_f in *\'*) ac_f=`$as_echo "$ac_f" | sed "s/'/'\\\\\\\\''/g"`;; esac + as_fn_append ac_file_inputs " '$ac_f'" + done + + # Let's still pretend it is `configure' which instantiates (i.e., don't + # use $as_me), people would be surprised to read: + # /* config.h. Generated by config.status. */ + configure_input='Generated from '` + $as_echo "$*" | sed 's|^[^:]*/||;s|:[^:]*/|, |g' + `' by configure.' + if test x"$ac_file" != x-; then + configure_input="$ac_file. $configure_input" + { $as_echo "$as_me:${as_lineno-$LINENO}: creating $ac_file" >&5 +$as_echo "$as_me: creating $ac_file" >&6;} + fi + # Neutralize special characters interpreted by sed in replacement strings. + case $configure_input in #( + *\&* | *\|* | *\\* ) + ac_sed_conf_input=`$as_echo "$configure_input" | + sed 's/[\\\\&|]/\\\\&/g'`;; #( + *) ac_sed_conf_input=$configure_input;; + esac + + case $ac_tag in + *:-:* | *:-) cat >"$ac_tmp/stdin" \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 ;; + esac + ;; + esac + + ac_dir=`$as_dirname -- "$ac_file" || +$as_expr X"$ac_file" : 'X\(.*[^/]\)//*[^/][^/]*/*$' \| \ + X"$ac_file" : 'X\(//\)[^/]' \| \ + X"$ac_file" : 'X\(//\)$' \| \ + X"$ac_file" : 'X\(/\)' \| . 
2>/dev/null || +$as_echo X"$ac_file" | + sed '/^X\(.*[^/]\)\/\/*[^/][^/]*\/*$/{ + s//\1/ + q + } + /^X\(\/\/\)[^/].*/{ + s//\1/ + q + } + /^X\(\/\/\)$/{ + s//\1/ + q + } + /^X\(\/\).*/{ + s//\1/ + q + } + s/.*/./; q'` + as_dir="$ac_dir"; as_fn_mkdir_p + ac_builddir=. + +case "$ac_dir" in +.) ac_dir_suffix= ac_top_builddir_sub=. ac_top_build_prefix= ;; +*) + ac_dir_suffix=/`$as_echo "$ac_dir" | sed 's|^\.[\\/]||'` + # A ".." for each directory in $ac_dir_suffix. + ac_top_builddir_sub=`$as_echo "$ac_dir_suffix" | sed 's|/[^\\/]*|/..|g;s|/||'` + case $ac_top_builddir_sub in + "") ac_top_builddir_sub=. ac_top_build_prefix= ;; + *) ac_top_build_prefix=$ac_top_builddir_sub/ ;; + esac ;; +esac +ac_abs_top_builddir=$ac_pwd +ac_abs_builddir=$ac_pwd$ac_dir_suffix +# for backward compatibility: +ac_top_builddir=$ac_top_build_prefix + +case $srcdir in + .) # We are building in place. + ac_srcdir=. + ac_top_srcdir=$ac_top_builddir_sub + ac_abs_top_srcdir=$ac_pwd ;; + [\\/]* | ?:[\\/]* ) # Absolute name. + ac_srcdir=$srcdir$ac_dir_suffix; + ac_top_srcdir=$srcdir + ac_abs_top_srcdir=$srcdir ;; + *) # Relative name. + ac_srcdir=$ac_top_build_prefix$srcdir$ac_dir_suffix + ac_top_srcdir=$ac_top_build_prefix$srcdir + ac_abs_top_srcdir=$ac_pwd/$srcdir ;; +esac +ac_abs_srcdir=$ac_abs_top_srcdir$ac_dir_suffix + + + case $ac_mode in + :F) + # + # CONFIG_FILE + # + +_ACEOF + +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +# If the template does not know about datarootdir, expand it. +# FIXME: This hack should be removed a few years after 2.60. 
+ac_datarootdir_hack=; ac_datarootdir_seen= +ac_sed_dataroot=' +/datarootdir/ { + p + q +} +/@datadir@/p +/@docdir@/p +/@infodir@/p +/@localedir@/p +/@mandir@/p' +case `eval "sed -n \"\$ac_sed_dataroot\" $ac_file_inputs"` in +*datarootdir*) ac_datarootdir_seen=yes;; +*@datadir@*|*@docdir@*|*@infodir@*|*@localedir@*|*@mandir@*) + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&5 +$as_echo "$as_me: WARNING: $ac_file_inputs seems to ignore the --datarootdir setting" >&2;} +_ACEOF +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 + ac_datarootdir_hack=' + s&@datadir@&$datadir&g + s&@docdir@&$docdir&g + s&@infodir@&$infodir&g + s&@localedir@&$localedir&g + s&@mandir@&$mandir&g + s&\\\${datarootdir}&$datarootdir&g' ;; +esac +_ACEOF + +# Neutralize VPATH when `$srcdir' = `.'. +# Shell code in configure.ac might set extrasub. +# FIXME: do we really want to maintain this feature? +cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 +ac_sed_extra="$ac_vpsub +$extrasub +_ACEOF +cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 +:t +/@[a-zA-Z_][a-zA-Z_0-9]*@/!b +s|@configure_input@|$ac_sed_conf_input|;t t +s&@top_builddir@&$ac_top_builddir_sub&;t t +s&@top_build_prefix@&$ac_top_build_prefix&;t t +s&@srcdir@&$ac_srcdir&;t t +s&@abs_srcdir@&$ac_abs_srcdir&;t t +s&@top_srcdir@&$ac_top_srcdir&;t t +s&@abs_top_srcdir@&$ac_abs_top_srcdir&;t t +s&@builddir@&$ac_builddir&;t t +s&@abs_builddir@&$ac_abs_builddir&;t t +s&@abs_top_builddir@&$ac_abs_top_builddir&;t t +$ac_datarootdir_hack +" +eval sed \"\$ac_sed_extra\" "$ac_file_inputs" | $AWK -f "$ac_tmp/subs.awk" \ + >$ac_tmp/out || as_fn_error $? 
"could not create $ac_file" "$LINENO" 5 + +test -z "$ac_datarootdir_hack$ac_datarootdir_seen" && + { ac_out=`sed -n '/\${datarootdir}/p' "$ac_tmp/out"`; test -n "$ac_out"; } && + { ac_out=`sed -n '/^[ ]*datarootdir[ ]*:*=/p' \ + "$ac_tmp/out"`; test -z "$ac_out"; } && + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined" >&5 +$as_echo "$as_me: WARNING: $ac_file contains a reference to the variable \`datarootdir' +which seems to be undefined. Please make sure it is defined" >&2;} + + rm -f "$ac_tmp/stdin" + case $ac_file in + -) cat "$ac_tmp/out" && rm -f "$ac_tmp/out";; + *) rm -f "$ac_file" && mv "$ac_tmp/out" "$ac_file";; + esac \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + ;; + :H) + # + # CONFIG_HEADER + # + if test x"$ac_file" != x-; then + { + $as_echo "/* $configure_input */" \ + && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" + } >"$ac_tmp/config.h" \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + if diff "$ac_file" "$ac_tmp/config.h" >/dev/null 2>&1; then + { $as_echo "$as_me:${as_lineno-$LINENO}: $ac_file is unchanged" >&5 +$as_echo "$as_me: $ac_file is unchanged" >&6;} + else + rm -f "$ac_file" + mv "$ac_tmp/config.h" "$ac_file" \ + || as_fn_error $? "could not create $ac_file" "$LINENO" 5 + fi + else + $as_echo "/* $configure_input */" \ + && eval '$AWK -f "$ac_tmp/defines.awk"' "$ac_file_inputs" \ + || as_fn_error $? "could not create -" "$LINENO" 5 + fi + ;; + + + esac + +done # for ac_tag + + +as_fn_exit 0 +_ACEOF +ac_clean_files=$ac_clean_files_save + +test $ac_write_fail = 0 || + as_fn_error $? "write failure creating $CONFIG_STATUS" "$LINENO" 5 + + +# configure is writing to config.log, and then calls config.status. +# config.status does its own redirection, appending to config.log. 
+# Unfortunately, on DOS this fails, as config.log is still kept open +# by configure, so config.status won't be able to write to it; its +# output is simply discarded. So we exec the FD to /dev/null, +# effectively closing config.log, so it can be properly (re)opened and +# appended to by config.status. When coming back to configure, we +# need to make the FD available again. +if test "$no_create" != yes; then + ac_cs_success=: + ac_config_status_args= + test "$silent" = yes && + ac_config_status_args="$ac_config_status_args --quiet" + exec 5>/dev/null + $SHELL $CONFIG_STATUS $ac_config_status_args || ac_cs_success=false + exec 5>>config.log + # Use ||, not &&, to avoid exiting from the if with $? = 1, which + # would make configure fail if this is the last instruction. + $ac_cs_success || as_fn_exit 1 +fi +if test -n "$ac_unrecognized_opts" && test "$enable_option_checking" != no; then + { $as_echo "$as_me:${as_lineno-$LINENO}: WARNING: unrecognized options: $ac_unrecognized_opts" >&5 +$as_echo "$as_me: WARNING: unrecognized options: $ac_unrecognized_opts" >&2;} +fi + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/configure.in b/qcd/part_cpu/applications/QCD/src/kernel_D/configure.in new file mode 100644 index 0000000000000000000000000000000000000000..1d9ddb496db2fe668d31a947f40d609cf49cf9ee --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/configure.in @@ -0,0 +1,1009 @@ +# +# Process this file with autoconf to produce a configure script +# +AC_PREREQ(2.59) +AC_INIT(tmLQCD, 5.2.0, curbach@gmx.de) +AC_CONFIG_HEADER(config.h) +AC_CONFIG_SRCDIR([hmc_tm.c]) +AC_CANONICAL_HOST() +AC_PREFIX_DEFAULT($HOME) +AC_ARG_PROGRAM + +if test "$host_vendor" = "cray"; then + ac_cv_c_bigendian=yes +fi + +AC_PROG_CC +AC_PROG_CC_C99 +dnl AC_PROG_CC_STDC +AC_C_CONST +AC_C_INLINE +AC_C_RESTRICT +AC_F77_LIBRARY_LDFLAGS +AC_CHECK_TOOL(AR, ar, [ar]) +LIBS="$LIBS $FLIBS -lm" + +AC_PROG_LEX +dnl AC_PROG_LEX sets $LEX to ":" if neither lex nor flex are found! 
+if test "$LEX" = ":"; then + AC_MSG_ERROR([(F)LEX is required for building read_input.c. Please install it and run configure again.]) +fi + +AC_PROG_MAKE_SET +AC_PROG_RANLIB +AC_CHECK_PROG(CCDEP, gcc, "gcc", "$CC") +#(endian="", AC_DEFINE(LITTLE_ENDIAN,1,The endian of the architecture)) + +# AC_PROG_FC([ifort gfortran]) +# AC_FC_FUNC(testfunc, ) + +LDFLAGS="$LDFLAGS -L\${HOME}/lib -L\${top_builddir}/lib" +CCLD=${CC} + +# compilation in operator is slowest so we do it first, saves time in parallel compiles +USESUBDIRS="operator linalg solver monomial buffers cu io meas xchange init rational wrapper" + +AC_CHECK_HEADERS([stdint.h], +[ dnl for inttypes.h and stdint.h for uint_xxx types + dnl if successful check for the actual types too + AC_CHECK_TYPES([uint16_t, uint32_t, uint64_t], + [], + [AC_MSG_ERROR([stdint.h found but either uint16_t, uint32_t or uint64_t not found]) ] + ) +], +[ + dnl no inttypes.h or stdint.h found check common unsigned types + dnl for sizes and make appropriate decisions in the lime_fixed_types.h file + AC_CHECK_SIZEOF(unsigned char) + AC_CHECK_SIZEOF(unsigned short) + AC_CHECK_SIZEOF(unsigned int) + AC_CHECK_SIZEOF(unsigned long) + AC_CHECK_SIZEOF(unsigned long long) +] +) + +AC_MSG_CHECKING(whether we want to use only Benchmark) +AC_ARG_ENABLE(benchmark, + AS_HELP_STRING([--enable-benchmark], [enable use of benchmark [default=yes]]), + enable_benchmark=$enableval, enable_benchmark=yes) +if test $enable_benchmark = no; then + dnl full build: c-lime is required, the help text states the default actually used below + AC_ARG_WITH(limedir, + AS_HELP_STRING([--with-limedir[=dir]], [search lime in dir [default=./c-lime]]), + lime_dir=$withval, lime_dir="./c-lime") + AC_MSG_RESULT($lime_dir) + LDFLAGS="$LDFLAGS -L${lime_dir}/lib/" + AC_CHECK_LIB([lime], [limeReaderNextRecord],[], + [AC_MSG_ERROR([library liblime is missing or needed function is not available])]) +else + dnl benchmark-only build: answer the pending AC_MSG_CHECKING before defining BENCHMARK + AC_MSG_RESULT(yes) + AC_DEFINE(BENCHMARK,1,Using Benchmarking no c-lime) +fi + + + +#LIBS="$LIBS $FLIBS -lm" + +AC_MSG_CHECKING(whether we want to use lemon) +AC_ARG_WITH(lemondir, + 
AS_HELP_STRING([--with-lemondir[=dir]], [use lemon, to be found in dir]), + [dnl answer the pending check with AC_MSG_RESULT instead of echo so that it is logged and honours --quiet + AC_MSG_RESULT(yes) + LEMON_AVAILABLE=1 + lemon_dir=$withval + LDFLAGS="$LDFLAGS -L${lemon_dir}/lib" + AC_CHECK_LIB([lemon], + [lemonReaderNextRecord], + [], + [AC_MSG_ERROR([library liblemon was not found])])], + [AC_MSG_RESULT(no) + LEMON_AVAILABLE=0]) + +AC_MSG_CHECKING(whether we use the general geometry) +AC_ARG_ENABLE(indexindepgeom, + AS_HELP_STRING([--enable-indexindepgeom], [enable Index independent addressing [default=no]]), + enable_iig=$enableval, enable_iig=no) +if test $enable_iig = yes; then + AC_MSG_RESULT(yes) + AC_DEFINE(_INDEX_INDEP_GEOM,1,Index independent addressing) +else + AC_MSG_RESULT(no) +fi + +AC_MSG_CHECKING(whether we want to use MPI) +AC_ARG_ENABLE(mpi, + AS_HELP_STRING([--enable-mpi], [enable use of mpi [default=yes]]), + enable_mpi=$enableval, enable_mpi=yes) +if test $enable_mpi = yes; then + AC_MSG_RESULT(yes) + AC_DEFINE(MPI,1,Compile with MPI support) +else + AC_MSG_RESULT(no) +fi + +AC_MSG_CHECKING(whether to use QPX intrinsics) +AC_ARG_ENABLE(qpx, + AS_HELP_STRING([--enable-qpx], [enable use of qpx intrinsics [default=no]]), + enable_qpx=$enableval, enable_qpx=no) +if test $enable_qpx = yes; then + AC_MSG_RESULT(yes) + AC_DEFINE(BGQ,1,Compile with QPX intrinsics) + AC_MSG_NOTICE([Compiling with QPX intrinsics on BGQ, enabling compiler optimizations for XLC.]) + OPTARGS="-O2 -qstrict=all -qtune=qp -qarch=qp -qmaxmem=-1" + SOPTARGS="$OPTARGS" +else + AC_MSG_RESULT(no) +fi + +AC_MSG_CHECKING(whether to use IBM BG/Q SPI for communications) +AC_ARG_ENABLE(spi, + AS_HELP_STRING([--enable-spi], [enable use of SPI [default=no]]), + enable_spi=$enableval, enable_spi=no) +if test $enable_spi = yes; then + AC_MSG_RESULT(yes) + AC_DEFINE(SPI,1,Compile with SPI for communications) + SPI_FILES="DirectPut" +else + AC_MSG_RESULT(no) + SPI_FILES="" +fi + + +AC_MSG_CHECKING(whether we want to use OpenMP) +AC_ARG_ENABLE(omp, + AS_HELP_STRING([--enable-omp], [enable use of OpenMP 
[default=yes]]), + enable_omp=$enableval, enable_omp=yes) +if test $enable_omp = yes; then + AC_MSG_RESULT(yes) + AC_DEFINE(OMP,1,Compile with OpenMP support) + AC_CHECK_HEADERS([omp.h],,[AC_MSG_ERROR([Cannot find OpenMP headers!])]) + AC_OPENMP +# -- AC_OPENMP provides a compiler-dependent OPENMP_CFLAGS so we can set it here +# on the BG/Q with XLC we force a special set of options for OpenMP support + if test $enable_qpx = yes; then + AC_MSG_NOTICE([Using OpenMP with XLC on BG/Q. Compiling with "-qsmp=omp:noauto:schedule=static -qthreaded".]) + CFLAGS="$CFLAGS -qsmp=omp:noauto:schedule=static -qthreaded" + CPPFLAGS="$CPPFLAGS -qsmp=omp:noauto:schedule=static -qthreaded" + LDFLAGS="$LDFLAGS -qsmp=omp:noauto:schedule=static -qthreaded" + else + CFLAGS="$CFLAGS $OPENMP_CFLAGS" + CPPFLAGS="$CPPFLAGS $OPENMP_CFLAGS" + LDFLAGS="$LDFLAGS $OPENMP_CFLAGS" + fi +else + AC_MSG_RESULT(no) +fi + +fftw_lib=/usr +AC_MSG_CHECKING(whether we want to use FFTW) +AC_ARG_ENABLE(fftw, + AS_HELP_STRING([--enable-fftw], [enable use of fftw [default=no]]), + enable_fftw=$enableval, enable_fftw=no) +if test $enable_fftw = yes; then + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FFTW,1,Compile with FFTW support) + LIBS="-lfftw3 ${LIBS}" +elif test $enable_fftw = no; then + AC_MSG_RESULT(no) +else + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FFTW,1,Compile with FFTW support) + fftw_lib=${enable_fftw} + LDFLAGS="$LDFLAGS -L${fftw_lib}/lib64" + LIBS="-lfftw3 ${LIBS}" + INCLUDES="-I${fftw_lib}/include ${INCLUDES}" +fi + +if test $enable_mpi = yes; then + AC_MSG_CHECKING(which parallelisation to use for MPI) + AC_ARG_WITH(mpidimension, + AS_HELP_STRING([--with-mpidimension[=n]], [use n dimensional parallelisation [default=1]]), + withmpidimension=$withval, withmpidimension=1) + if test $withmpidimension = 1; then + AC_MSG_RESULT(n=1 [t]) + AC_DEFINE(PARALLELT,1,One dimensional parallelisation) + elif test $withmpidimension = 2; then + AC_MSG_RESULT(n=2 [xt]) + AC_DEFINE(PARALLELXT,1,Two dimensional 
parallelisation) + elif test $withmpidimension = 3; then + AC_MSG_RESULT(n=3 [xyt]) + AC_DEFINE(PARALLELXYT,1,Three dimensional parallelisation) + elif test $withmpidimension = 4; then + AC_MSG_RESULT(n=4 [xyzt]) + AC_DEFINE(PARALLELXYZT,1,Four dimensional parallelisation) + elif test $withmpidimension = X; then + AC_MSG_RESULT(n=1 [x]) + AC_DEFINE(PARALLELX,1, X parallelisation) + elif test $withmpidimension = XY; then + AC_MSG_RESULT(n=2 [xy]) + AC_DEFINE(PARALLELXY,1, XY parallelisation) + elif test $withmpidimension = XYZ; then + AC_MSG_RESULT(n=3 [xyz]) + AC_DEFINE(PARALLELXYZ,1, XYZ parallelisation) + elif test $withmpidimension = T; then + AC_MSG_RESULT(n=1 [t]) + AC_DEFINE(PARALLELT,1, T parallelisation) + elif test $withmpidimension = XT; then + AC_MSG_RESULT(n=2 [xt]) + AC_DEFINE(PARALLELXT,1, XT parallelisation) + elif test $withmpidimension = XYT; then + AC_MSG_RESULT(n=3 [xyt]) + AC_DEFINE(PARALLELXYT,1, XYT parallelisation) + elif test $withmpidimension = XYZT; then + AC_MSG_RESULT(n=4 [xyzt]) + AC_DEFINE(PARALLELXYZT,1, XYZT parallelisation) + else + AC_MSG_RESULT(unknown) + AC_MSG_ERROR([Only t, xt, xyt, xyzt, x, xy, xyz parallelisation available]) + fi + + AC_MSG_CHECKING(whether we shall use persistent MPI calls for halfspinor) + AC_ARG_WITH([persistentmpi], + AS_HELP_STRING([--with-persistentmpi], [use persistent MPI calls for halfspinor [default=no]]), + withpersistent=$withval, withpersistent=no) + if test $withpersistent = yes; then + AC_MSG_RESULT(yes) + AC_DEFINE(_PERSISTENT,1,use persistent MPI calls for halfspinor) + else + AC_MSG_RESULT(no) + fi + + AC_MSG_CHECKING(whether we shall use non-blocking MPI calls) + AC_ARG_WITH([nonblockingmpi], + AS_HELP_STRING([--with-nonblockingmpi], [use non-blocking MPI calls for spinor and gauge [default=yes]]), + withnonblock=$withval, withnonblock=yes) + if test $withnonblock = yes; then + AC_MSG_RESULT(yes) + AC_DEFINE(_NON_BLOCKING,1,use non-blocking MPI calls for spinor ang gauge) + else + 
AC_MSG_RESULT(no) + fi +fi + +AC_MSG_CHECKING([whether we want to fix volume at compiletime]) +AC_ARG_WITH([fixedvolume], + AS_HELP_STRING([--with-fixedvolume], [fix volume at compiletime [default=no]]), + with_fixvol=$withval, with_fixvol=no) +if test $with_fixvol = yes; then + AC_MSG_RESULT(yes) + AC_DEFINE(FIXEDVOLUME,1,Fixed volume at compiletime) + AC_CONFIG_FILES([fixed_volume.h]) +else + AC_MSG_RESULT(no) +fi + +AC_MSG_CHECKING([whether we want to use KOJAK instrumentalisation]) +AC_ARG_WITH([kojakinst], + AS_HELP_STRING([--with-kojakinst], [instrumentalise for KOJAK [default=no]]), + with_kojakinst=$withval, with_kojakinst=no) +if test $with_kojakinst = yes; then + AC_MSG_RESULT(yes) + CC="kinst-pomp ${CC}" +else + AC_MSG_RESULT(no) +fi + +AC_MSG_CHECKING(whether we want to use lapack and blas) +AC_ARG_WITH(lapack, + AS_HELP_STRING([--with-lapack], [enable use of lapack [default=yes]]), + with_lapack=$withval, with_lapack=yes) +if test "$with_lapack" = yes; then + AC_MSG_RESULT(yes) + LAPACKLIB= + AC_DEFINE(HAVE_LAPACK,1,lapack available) +elif test "$with_lapack" != no; then + AC_MSG_RESULT(yes) + LIBS="$withval $LIBS" + with_lapack=yes + AC_DEFINE(HAVE_LAPACK,1,lapack available) +else + AC_MSG_RESULT(no) + AC_MSG_ERROR([lapack is needed! 
Will stop here.]) +fi + +if test $enable_mpi = yes; then + dnl In general one cannot run mpi programs directly + dnl thats why we need here cross_compiling=yes + dnl for non CRAY + if test "$host_vendor" != "cray"; then + cross_compiling=yes + fi +fi + +dnl for the case of other configure scripts +dnl AC_CONFIG_SUBDIRS( rng ) + +dnl check for clock_gettime and set correct library flag if one is required +dnl (this is done by AC_CHECK_LIB) +AC_CHECK_FUNCS(clock_gettime, [], [AC_CHECK_LIB(rt, clock_gettime)]) + +dnl in principle clock_gettime and CLOCK_MONOTONIC/CLOCK_REALTIME should be available +dnl only when using POSIX 199309, we set this explicitly here +dnl this should not cause problems on any relatively modern (post y2k) machine! +if ( test "$ac_cv_lib_rt_clock_gettime" = "yes" || test "$ac_cv_func_clock_gettime" = "yes" ); then + AC_DEFINE(HAVE_CLOCK_GETTIME,1) +dnl we set this in gettime.c explicitly for the time being +dnl due to endian problem on BG/Q +dnl CFLAGS="$CFLAGS -D_POSIX_C_SOURCE=199309L" + AC_MSG_NOTICE([Instructing the compiler to use POSIX 199309L]) +fi + +dnl Checks for lapack and defines proper name mangling scheme for +dnl linking with f77 code +AC_F77_FUNC(zheev) +if test "$zheev" = "zheev"; then + AC_DEFINE(NOF77_,1,Fortran has no extra _) +fi +AC_SEARCH_LIBS([$zheev],[lapack], [], [AC_MSG_ERROR([Cannot find lapack])]) + +dnl Checks for header files. +AC_HEADER_STDC +AC_CHECK_HEADERS([float.h libintl.h limits.h stdint.h stdlib.h string.h strings.h sys/time.h unistd.h endian.h]) +AC_CHECK_HEADER( getopt.h, []) + +dnl Checks for typedefs, structures, and compiler characteristics. +AC_C_CONST +AC_TYPE_OFF_T +AC_TYPE_SIZE_T +AC_HEADER_TIME + +dnl Checks for library functions. 
+AC_SYS_LARGEFILE +AC_FUNC_FSEEKO +AC_FUNC_MALLOC +AC_TYPE_SIGNAL +AC_CHECK_FUNCS([gettimeofday pow sqrt]) + +dnl We now define some replacement variables +AC_SUBST(OPTARGS) +AC_SUBST(SOPTARGS) +AC_SUBST(INCLUDES) +AC_SUBST(AUTOCONF) +AC_SUBST(SOLVEROUT) +AC_SUBST(CCDEP) +AC_SUBST(CCLD) +AC_SUBST(DEPFLAGS) +AC_SUBST(DEBUG_FLAG) +AC_SUBST(PROFILE_FLAG) +AC_SUBST(XCHANGELIB) +AC_SUBST(XCHANGEDIR) +AC_SUBST(MEASDIR) +AC_SUBST(XLIB) +AC_SUBST([LEMON_AVAILABLE]) +AC_SUBST(SPI_FILES) +AC_SUBST(QUDA_INTERFACE) + +INCLUDES="$INCLUDES -I\$(HOME)/include/ -I. -I\${abs_top_builddir}/ -I\${abs_top_srcdir}/ -I${lime_dir}/include/ -I${lemon_dir}/include/" +DEPFLAGS="$DEPFLAGS" + +AC_MSG_CHECKING(what alignment we want for arrays) +AC_ARG_ENABLE(alignment, + [AS_HELP_STRING([--enable-alignment[=n]], [Automatically or expliclty align arrays to byte number: auto, none, 16, 32 [default=auto]])], + withalign=$enableval, withalign=auto) +if test "$withalign" = "none"; then + AC_MSG_RESULT(none) + withalign=1 + AC_DEFINE(ALIGN_BASE, 0x00, [Align base]) + AC_DEFINE(ALIGN, []) + AC_DEFINE(ALIGN_BASE32, 0x00, [Align base32]) + AC_DEFINE(ALIGN32, [], []) +elif test $withalign = 16; then + AC_MSG_RESULT(16 bytes) + AC_DEFINE(ALIGN_BASE, 0x0F, [Align base]) + AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))]) + AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base32]) + AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))], []) +elif test $withalign = 32; then + AC_MSG_RESULT(32 bytes) + AC_DEFINE(ALIGN_BASE, 0x1F, [Align base]) + AC_DEFINE(ALIGN, [__attribute__ ((aligned (32)))]) + AC_DEFINE(ALIGN_BASE32, 0x1F, [Align base32]) + AC_DEFINE(ALIGN32, [__attribute__ ((aligned (32)))], []) +elif test $withalign = auto; then + withautoalign=1 + AC_MSG_RESULT(auto) + AC_DEFINE(ALIGN_BASE, 0x00, [Align base]) + AC_DEFINE(ALIGN, [], []) + AC_DEFINE(ALIGN_BASE32, 0x00, [Align base32]) + AC_DEFINE(ALIGN32, [], []) +else + AC_MSG_RESULT(Unusable value for array alignment) + AC_MSG_ERROR([Allowed values are: auto, 
none, 16, 32])
+fi
+
+dnl in the following we check for extra options
+dnl This part runs on x86 hosts only. P4, Opteron, SSE2 and SSE3 code needs
+dnl 16 byte alignment: when $withalign is auto the automatic alignment is
+dnl raised to 16, an explicitly requested smaller value aborts configure.
+if test "$host_cpu" = "i686" || test "$host_cpu" = "x86_64"; then
+
+  AC_MSG_CHECKING(whether we want to use P4 instructions)
+  AC_ARG_ENABLE(p4,
+    AS_HELP_STRING([--enable-p4],[enable use of P4 instructions [default=no]]),
+    enable_p4=$enableval, enable_p4=no)
+  if test $enable_p4 = yes; then
+    AC_MSG_RESULT(yes)
+    AC_DEFINE(P4,1,Use Pentium4 instructions)
+    if test $withalign = auto; then
+      if test $withautoalign -lt 16; then
+        AC_MSG_RESULT(increasing array alignment to 16 bytes for P4 instructions)
+        AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
+        AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))])
+        AC_MSG_RESULT(increasing array 32 bit alignment to 16 bytes for P4 instructions)
+        AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base])
+        AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))])
+        withautoalign=16
+      fi
+    elif test $withalign -lt 16; then
+      AC_MSG_ERROR([alignment incompatible with P4 instructions (16 bytes required)!])
+    fi
+  else
+    AC_MSG_RESULT(no)
+  fi
+
+  AC_MSG_CHECKING(whether we want to use Opteron instructions)
+  AC_ARG_ENABLE(opteron,
+    AS_HELP_STRING([--enable-opteron], [enable use of Opteron instructions [default=no]]),
+    enable_opteron=$enableval, enable_opteron=no)
+  if test $enable_opteron = yes; then
+    AC_MSG_RESULT(yes)
+    AC_DEFINE(OPTERON,1,Use Opteron instructions)
+    if test $withalign = auto; then
+      if test $withautoalign -lt 16; then
+        AC_MSG_RESULT(increasing array alignment to 16 bytes for Opteron instructions)
+        AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
+        AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))])
+        AC_MSG_RESULT(increasing array 32 bit alignment to 16 bytes for Opteron instructions)
+        AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base32])
+        AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))])
+        withautoalign=16
+      fi
+    elif test $withalign -lt 16; then
+      AC_MSG_ERROR([alignment incompatible with Opteron instructions (16 bytes required)!])
+    fi
+  else
+    AC_MSG_RESULT(no)
+  fi
+
+  AC_MSG_CHECKING(whether we want to use SSE2 instructions)
+  AC_ARG_ENABLE(sse2,
+    AS_HELP_STRING([--enable-sse2], [enable use of SSE2 instructions [default=no]]),
+    enable_sse2=$enableval, enable_sse2=no)
+  if test $enable_sse2 = yes; then
+    AC_MSG_RESULT(yes)
+    if test $withalign != auto && test $withalign -lt 16; then
+      AC_MSG_ERROR([alignment incompatible with SSE2 instructions (16 bytes required)])
+    fi
+  else
+    AC_MSG_RESULT(no)
+  fi
+
+  AC_MSG_CHECKING(whether we want to use SSE3 instructions)
+  AC_ARG_ENABLE(sse3,
+    AS_HELP_STRING([--enable-sse3], [enable use of SSE3 instructions [default=no]]),
+    enable_sse3=$enableval, enable_sse3=no)
+  if test $enable_sse3 = yes; then
+    AC_MSG_RESULT(yes)
+dnl The second condition needs its own "test". Without it the shell tried to
+dnl execute the alignment value as a command, so this error never fired.
+    if test $withalign != auto && test $withalign -lt 16; then
+      AC_MSG_ERROR([alignment incompatible with SSE3 instructions (16 bytes required)])
+    fi
+  else
+    AC_MSG_RESULT(no)
+  fi
+
+dnl SSE2 and SSE3 share one alignment bump for the automatic case
+  if test "$enable_sse2" = "yes" || test "$enable_sse3" = "yes"; then
+    if test $withalign = auto; then
+      if test $withautoalign -lt 16; then
+        AC_MSG_RESULT(increasing array alignment to 16 bytes for SSE instructions)
+        AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
+        AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))])
+        AC_MSG_RESULT(increasing 32bit array alignment to 16 bytes for SSE instructions)
+        AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base32])
+        AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))])
+        withautoalign=16
+      fi
+    fi
+  fi
+fi
+
+dnl We here check for alignment issues with QPX instructions -- this flag has been set earlier
+dnl The variable is quoted so that an unset value simply counts as "not yes".
+if test "$enable_qpx" = yes; then
+  if test $withalign = auto; then
+    if test $withautoalign -lt 32; then
+      AC_MSG_RESULT(increasing array alignment to 32 bytes for use of QPX instructions on BG/Q)
+      AC_DEFINE(ALIGN_BASE, 0x1F, [Align base])
+      AC_DEFINE(ALIGN, [__attribute__ ((aligned (32)))])
+      AC_MSG_RESULT(increasing 32bit array alignment to 16 bytes for use of QPX instructions on BG/Q)
+      AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base32])
+      AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))])
+      withautoalign=32
+    fi
+  elif test $withalign -lt 32; then
+    AC_MSG_ERROR([alignment incompatible with QPX instructions (32 bytes required)])
+  fi
+fi
+
+dnl Check for alignment associated with (non-QPX) BG optimization.
+dnl This will also result in using 32 byte alignment on MareNostrum, but that should be fairly innocuous.
+if test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "blrts"; then
+  if test $withalign = auto; then
+    if test $withautoalign -lt 16; then
+      AC_MSG_RESULT(increasing array alignment to 16 bytes for BG/L optimization)
+      AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
+      AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))], [Align base])
+      withautoalign=16
+    fi
+  fi
+elif test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "bprts"; then
+  if test $withalign = auto; then
+    if test $withautoalign -lt 16; then
+      AC_MSG_RESULT(increasing array alignment to 16 bytes for BG/P optimization)
+      AC_DEFINE(ALIGN_BASE, 0x0F, [Align base])
+      AC_DEFINE(ALIGN, [__attribute__ ((aligned (16)))], [Align base])
+      withautoalign=16
+    fi
+  fi
+elif test "$host_cpu" = "powerpc64" && test "$host_vendor" = "unknown" && test "$host_os" = "linux-gnu"; then
+  if test $withalign = auto; then
+    if test $withautoalign -lt 32; then
+      AC_MSG_RESULT(increasing array alignment to 32 bytes for BG/Q and generic POWER optimization)
+      AC_DEFINE(ALIGN_BASE, 0x1F, [Align base])
+      AC_DEFINE(ALIGN, [__attribute__ ((aligned (32)))])
+      AC_MSG_RESULT(increasing array 32 bit alignment to 16 bytes for BG/Q and generic POWER optimization)
+      AC_DEFINE(ALIGN_BASE32, 0x0F, [Align base])
+      AC_DEFINE(ALIGN32, [__attribute__ ((aligned (16)))])
+      withautoalign=32
+    fi
+  fi
+fi
+
+dnl Profiling: PROFILE_FLAG carries the gprof options, xlc on IBM powerpc
+dnl additionally gets -qfullpath. It is left empty when profiling is off.
+AC_MSG_CHECKING(whether we want to use gprof as profiler)
+AC_ARG_WITH(gprof,
+  AS_HELP_STRING([--with-gprof], [use of gprof profiler [default=no]]),
+  enable_gprof=$withval, enable_gprof=no)
+if test $enable_gprof = yes; then
+  AC_MSG_RESULT(yes)
+  if test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm"; then
+    PROFILE_FLAG="-pg -qfullpath -g"
+  else
+    PROFILE_FLAG="-pg -g"
+  fi
+else
+  AC_MSG_RESULT(no)
+  PROFILE_FLAG=
+fi
+
+AC_MSG_CHECKING(whether we shall use rts dram window)
+AC_ARG_WITH([bgldram],
+  AS_HELP_STRING([--with-bgldram], [use BGL dram window (BGL only!) [default=yes]]),
+  with_bgldram=$withval, with_bgldram=yes)
+if test $with_bgldram = yes; then
+  AC_MSG_RESULT(yes)
+  AC_DEFINE(_USE_BGLDRAM,1,use BGL dram window)
+else
+  AC_MSG_RESULT(no)
+fi
+
+dnl Now we have to set all Flags and compiler properly
+dnl The compiler family is guessed by grepping the output of "$CC -V".
+XLCGREP=`$CC -V 2>&1 | grep -i xlc`
+if test "$XLCGREP" != ""; then
+  XLC="yes"
+  AC_DEFINE(XLC,1,Are we using the IBM xlc compiler?)
+fi
+PGCC=`$CC -V 2>&1 | grep pgcc`
+ICC=`$CC -V 2>&1 | grep -i intel`
+
+dnl first for PC's
+if test "$host_cpu" = "i686" || test "$host_cpu" = "x86_64"; then
+dnl the GNU compiler
+  if test "$GCC" = yes && test "$ICC" = ""; then
+    DEPFLAGS="-MM"
+    CFLAGS="$CFLAGS -pedantic -Wall"
+    OPTARGS='-O'
+    SOPTARGS='-O'
+
+dnl SSE3 takes precedence over SSE2 when both are requested
+    if test $enable_sse3 = yes; then
+      echo Using SSE3 and SSE2 macros!
+      AC_DEFINE(SSE3,1,Compile with SSE3 support)
+      DEPFLAGS="$DEPFLAGS -DSSE3"
+      if test "$host_cpu" = "x86_64"; then
+        CFLAGS="$CFLAGS -mfpmath=387"
+      fi
+    elif test $enable_sse2 = yes; then
+      DEPFLAGS="$DEPFLAGS -DSSE2"
+      AC_DEFINE(SSE2,1,Compile with SSE2 support)
+      if test "$host_cpu" = "x86_64"; then
+        CFLAGS="$CFLAGS -mfpmath=387"
+      fi
+    fi
+
+    if test "$host_cpu" = "x86_64"; then
+      AC_DEFINE(_x86_64,1,x86 64 Bit architecture)
+    fi
+dnl dependencies are generated with plain gcc when compiling with MPI
+    CCDEP="$CC"
+    if test $enable_mpi = yes; then
+      CCDEP="gcc"
+    fi
+    DEBUG_FLAG="-g"
+dnl other compilers
+  else
+dnl check for pgcc
+    if test "$PGCC" != ""; then
+      DEPFLAGS="-M"
+      echo "We are using the Portland Group C compiler!"
+      OPTARGS="-O2"
+      SOPTARGS="-O2"
+      DEBUG_FLAG="-g"
+      PROFILE_FLAG="-p -g"
+      CCDEP="$CC"
+
+dnl check for icc
+    elif test "$ICC" != ""; then
+      echo "We are using the Intel C compiler!"
+      DEPFLAGS="-M"
+      OPTARGS="-O3"
+      SOPTARGS="-O3"
+      DEBUG_FLAG="-g"
+      PROFILE_FLAG="-p -g"
+      CCDEP="$CC"
+
+    else
+      # other compilers might support SSE inline assembly too
+      # (the cray compiler, for example)
+      if test $enable_sse3 = yes; then
+        echo Using SSE3 and SSE2 macros!
+        AC_DEFINE(SSE3,1,Compile with SSE3 support)
+      elif test $enable_sse2 = yes; then
+        echo Using SSE2 macros only!
+        AC_DEFINE(SSE2,1,Compile with SSE2 support)
+      fi
+
+      DEPFLAGS="-M"
+      CFLAGS="$CFLAGS -O"
+      DEBUG_FLAG="-g"
+      CCDEP="$CC"
+    fi
+  fi
+
+# The MareNostrum: powerpc on a linux system
+# this will also evaluate to "true" on BG/Q with XLC
+elif test "$host_cpu" = "powerpc64" && test "$host_vendor" = "unknown" && test "$host_os" = "linux-gnu"; then
+
+  # All other branches store the debug options in DEBUG_FLAG; this branch
+  # used the spelling DEBUGFLAG. Set DEBUG_FLAG and mirror it into the old
+  # name so that either spelling keeps working.
+  # NOTE(review): the AC_SUBST for the debug flag is outside this part of
+  # the file -- confirm that DEBUG_FLAG is the substituted name.
+  DEBUG_FLAG="-g"
+  if test "$XLC" = "yes"; then
+    CFLAGS="-qsrcmsg $CFLAGS"
+    DEBUG_FLAG="$DEBUG_FLAG -qfullpath"
+  fi
+  DEBUGFLAG="$DEBUG_FLAG"
+
+  OPTARGS="$OPTARGS"
+  SOPTARGS="$OPTARGS"
+  if test "$CCDEP" = "gcc"; then
+    DEPFLAGS="-MM"
+  else
+    DEPFLAGS="-M"
+  fi
+
+#The BLue Gene/L
+elif test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "blrts"; then
+  # The dram window needs the blrts gcc as linker driver together with the
+  # linker script elf32ppcblrts.x in the build directory.
+  if test "$with_bgldram" = yes; then
+    if (test -e /bgl/local/bin/blrts_gcc); then
+      BLRTSGCC=/bgl/local/bin/blrts_gcc
+    elif (test -e /bgl/BlueLight/ppcfloor/blrts-gnu/bin/powerpc-bgl-blrts-gnu-gcc); then
+      BLRTSGCC=/bgl/BlueLight/ppcfloor/blrts-gnu/bin/powerpc-bgl-blrts-gnu-gcc
+    else
+      AC_MSG_ERROR([Sorry, don't know where to find blrts_gcc, see README.bgl!])
+    fi
+    CCLD="$BLRTSGCC -Xlinker --script=./elf32ppcblrts.x"
+    # portable negation: true when the script is missing or empty
+    if test ! -s ./elf32ppcblrts.x; then
+      AC_MSG_ERROR([Sorry, elf32ppcblrts.x is missing, see README.bgl!])
+    fi
+  fi
+  DEBUG_FLAG="-g"
+  OPTARGS="-O3"
+  SOPTARGS="-O3"
+  AC_DEFINE(BGL,1,[Optimize for Blue Gene/L])
+
+  if test "$XLC" = "yes"; then
+    CFLAGS="-qsrcmsg $CFLAGS"
+    OPTARGS="$OPTARGS -qarch=440d -qtune=440"
+    SOPTARGS="$SOPTARGS -qarch=440d -qtune=440"
+    DEBUG_FLAG="$DEBUG_FLAG -qfullpath"
+#    OPTARGS="-qhot" leads to wrong code
+  fi
+  # keep the old spelling in sync, see the note in the powerpc64 branch
+  DEBUGFLAG="$DEBUG_FLAG"
+  LIBS="-lmpich.rts -lfmpich.rts -lmsglayer.rts -lrts.rts -ldevices.rts $LIBS"
+  LDFLAGS="$LDFLAGS -L/bgl/BlueLight/ppcfloor/bglsys/lib"
+  if test $with_lapack = yes; then
+    LIBS="-lesslbg -llapack.rts -lesslbg -lxlf90 -lxlfmath -lxl -lxlopt $LIBS"
+    LDFLAGS="$LDFLAGS -L/opt/ibmcmp/xlf/bg/10.1/blrts_lib -L/bgl/local/lib/ -L/opt/ibmmath/lib/"
+  fi
+
+  if test "$CCDEP" = "gcc"; then
+    DEPFLAGS="-MM"
+  else
+    DEPFLAGS="-M"
+  fi
+  CPPFLAGS="-I/bgl/BlueLight/ppcfloor/bglsys/include"
+  INCLUDES="$INCLUDES -I/bgl/BlueLight/ppcfloor/bglsys/include/"
+
+#The BLue Gene/P
+elif test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm" && test "$host_os" = "bprts"; then
+  CFLAGS="$CFLAGS"
+  DEBUG_FLAG="-g"
+  OPTARGS="-O3"
+  SOPTARGS="-O3"
+  # BG/P builds define BGL as well as BGP
+  AC_DEFINE(BGL,1,[Optimize for Blue Gene/L])
+  AC_DEFINE(BGP,1,[Optimize for Blue Gene/P])
+
+  if test "$XLC" = "yes"; then
+    CFLAGS="-qsrcmsg $CFLAGS"
+    OPTARGS="$OPTARGS -qarch=450d -qtune=450"
+    SOPTARGS="$SOPTARGS -qarch=450d -qtune=450"
+    DEBUG_FLAG="$DEBUG_FLAG -qfullpath"
+#    OPTARGS="-qhot" leads to wrong code
+  fi
+  # keep the old spelling in sync, see the note in the powerpc64 branch
+  DEBUGFLAG="$DEBUG_FLAG"
+#   LIBS="-lxlf90_r -lxlomp_ser -lxl -lxlopt -lxlfmath -ldl -lrt -lpthread $LIBS"
+#   LDFLAGS="$LDFLAGS -L/bgsys/local/lib/ -L/opt/ibmcmp/xlf/bg/11.1/lib -L/bgsys/drivers/ppcfloor/comm/"
+#   if test $with_lapack = yes; then
+#     LIBS="-lesslbg -llapack -lesslbg $LIBS"
+#     LDFLAGS="$LDFLAGS -L/opt/ibmmath/lib/"
+#   fi
+
+  if test "$CCDEP" = "gcc"; then
+    DEPFLAGS="-MM"
+  else
+    DEPFLAGS="-M"
+  fi
+  CPPFLAGS="-I/bgsys/drivers/ppcfloor/arch/include/ -I/bgsys/drivers/ppcfloor/comm/include"
+  INCLUDES="$INCLUDES -I/bgsys/local/include/ -I/bgsys/drivers/ppcfloor/arch/include/ -I/bgsys/drivers/ppcfloor/comm/include"
+
+
+
+# The IBM Power PC
+elif test "$host_cpu" = "powerpc" && test "$host_vendor" = "ibm"; then
+  CFLAGS="$CFLAGS -q64 -qsrcmsg"
+  LDFLAGS="$LDFLAGS -q64"
+  OPTARGS="-O2"
+  SOPTARGS="-O2"
+  DEBUG_FLAG="-qfullpath -g"
+  if test "$CCDEP" = "gcc"; then
+    DEPFLAGS="-MM"
+  else
+    DEPFLAGS="-M"
+  fi
+
+# The CRAY
+elif test "$host_vendor" = "cray"; then
+  echo
+  echo "Hey, we are on a cray, you should take some time for this..."
+  echo "get yourself a coffee or so!"
+  echo
+  CFLAGS="$CFLAGS -dp"
+  AC_DEFINE(CRAY,1,We are on a CRAY)
+  OPTARGS="-O3"
+  SOPTARGS="-O3"
+  DEBUG_FLAG="-g"
+  CCDEP="$CC"
+  DEPFLAGS="-M"
+
+# Any other platform: use gcc for dependency generation when it is found
+# and leave the optimisation arguments empty.
+else
+  AC_CHECK_PROG(CCDEP, gcc, "gcc", "$CC")
+  if test "$CCDEP" = "gcc"; then
+    DEPFLAGS="-MM"
+  else
+    DEPFLAGS="-M"
+  fi
+  OPTARGS=
+  SOPTARGS=
+fi
+
+
+dnl --disable-optimize drops the optimisation arguments chosen above
+AC_MSG_CHECKING(whether we want to switch on optimisation)
+AC_ARG_ENABLE(optimize,
+  AS_HELP_STRING([--enable-optimize], [enable optimisation [default=yes]]),
+  enable_optimize=$enableval, enable_optimize=yes)
+if test $enable_optimize = no; then
+  AC_MSG_RESULT(no)
+  OPTARGS=
+  SOPTARGS=
+else
+  AC_MSG_RESULT(yes)
+fi
+
+AC_MSG_CHECKING(whether we want to use a copy of the gauge field)
+AC_ARG_ENABLE(gaugecopy,
+  AS_HELP_STRING([--enable-gaugecopy], [enable use of a copy of the gauge field [default=yes]]),
+  enable_gaugecopy=$enableval, enable_gaugecopy=yes)
+if test $enable_gaugecopy = yes; then
+  AC_MSG_RESULT(yes)
+  AC_DEFINE(_GAUGE_COPY,1,Construct an extra copy of the gauge fields)
+else
+  AC_MSG_RESULT(no)
+fi
+
+AC_MSG_CHECKING(whether we want to use a Dirac Op. with halfspinor exchange)
+AC_ARG_ENABLE(halfspinor,
+  AS_HELP_STRING([--enable-halfspinor], [use a Dirac Op.
with halfspinor exchange [default=yes]]),
+  enable_halfspinor=$enableval, enable_halfspinor=yes)
+if test $enable_halfspinor = yes; then
+  AC_MSG_RESULT(yes)
+  AC_DEFINE(_USE_HALFSPINOR,1,Exchange only a halfspinor in the Dirac Operator)
+  if test $enable_gaugecopy = no; then
+    AC_MSG_WARN([switching on gaugecopy for Dirac operator with halfspinor!])
+    AC_DEFINE(_GAUGE_COPY,1,Construct an extra copy of the gauge fields)
+dnl Record the forced setting, otherwise the consistency check further down
+dnl still sees "no" and rejects tsplitpar although _GAUGE_COPY is defined.
+    enable_gaugecopy=yes
+  fi
+else
+  AC_MSG_RESULT(no)
+fi
+
+AC_MSG_CHECKING(whether we want to use shmem API)
+AC_ARG_ENABLE(shmem,
+  AS_HELP_STRING([--enable-shmem],[use shmem API [default=no]]),
+  enable_shmem=$enableval, enable_shmem=no)
+if test $enable_shmem = yes; then
+  AC_MSG_RESULT(yes)
+  AC_DEFINE(_USE_SHMEM,1,Use shmem API)
+  LIBS="$LIBS -lsma"
+else
+  AC_MSG_RESULT(no)
+fi
+
+AC_MSG_CHECKING(whether we want to use timeslice-splitted communications)
+AC_ARG_ENABLE(tsplitpar,
+  AS_HELP_STRING([--enable-tsplitpar],[enable timeslice-splitted communications [default=no]]),
+  enable_tsp=$enableval, enable_tsp=no)
+if test $enable_tsp = yes; then
+  AC_MSG_RESULT(yes)
+  AC_DEFINE(_USE_TSPLITPAR,1,timeslice-splitted communications)
+else
+  AC_MSG_RESULT(no)
+fi
+
+AC_MSG_CHECKING(whether we want to compute the LapH eigenvalues)
+AC_ARG_ENABLE(laph,
+  AS_HELP_STRING([--enable-laph], [enable computation of LapH eigensystem [default=no]]),
+  enable_laph=$enableval, enable_laph=no)
+if test $enable_laph = yes; then
+  AC_MSG_RESULT(yes)
+  AC_DEFINE(WITHLAPH,1,LapH eigensystem)
+else
+  AC_MSG_RESULT(no)
+fi
+
+
+dnl Native CUDA code: adds the GPU subdirectory, the CUDA libraries and the
+dnl nvcc settings. With MPI only a one dimensional parallelisation is accepted.
+AC_MSG_CHECKING(whether we want to use CUDA GPU)
+AC_ARG_ENABLE(gpu,
+  AS_HELP_STRING([--enable-gpu],[use GPU [default=no]]),
+  usegpu=$enableval, usegpu=no)
+if test $usegpu = yes; then
+  AC_MSG_RESULT(yes)
+  AC_DEFINE(HAVE_GPU,1,Using CUDA GPU)
+  NVCC="nvcc"
+  USESUBDIRS="$USESUBDIRS GPU"
+  GPUDIR="GPU"
+  LIBS="$LIBS -lcuda -lcudart -lcublas"
+
+  AC_MSG_CHECKING([where to search for CUDA libs])
+  AC_ARG_WITH(cuda,
+    AS_HELP_STRING([--with-cuda[=dir]], [use CUDA GPU with lib dir [default=/usr/local/cuda/lib]]),
+    cuda_dir=$withval, cuda_dir="/usr/local/cuda/lib")
+  AC_MSG_RESULT($cuda_dir)
+  if test $usegpu = yes; then
+    LDFLAGS="$LDFLAGS -L$cuda_dir"
+  fi
+
+
+  AC_MSG_CHECKING([CUDA compile args])
+  AC_ARG_WITH(cudacompileargs,
+    AS_HELP_STRING([--with-cudacompileargs[=string]], [use CUDA compile args [default="--gpu-architecture sm_13 --use_fast_math -O3"]]),
+    cuda_compileargs=$withval, cuda_compileargs="--gpu-architecture sm_13 --use_fast_math -O3")
+  AC_MSG_RESULT($cuda_compileargs)
+  if test $usegpu = yes; then
+    GPUCFLAGS="$GPUCFLAGS $cuda_compileargs"
+  fi
+  if test $enable_mpi = yes; then
+    GPUMPICOMPILER="--compiler-bindir mpicc"
+    if test $withmpidimension != 1; then
+      AC_MSG_ERROR(ERROR! The GPU Code is only parallelized in t-direction so far!)
+    fi
+  else
+    GPUMPICOMPILER=""
+  fi
+else
+  AC_MSG_RESULT(no)
+  NVCC=""
+fi
+
+
+AC_SUBST(USESUBDIRS)
+AC_SUBST(NVCC)
+AC_SUBST(GPUDIR)
+AC_SUBST(GPUCFLAGS)
+AC_SUBST(GPUMPICOMPILER)
+
+
+# QUDA library for GPUs
+# The answer to the check is reported with AC_MSG_RESULT so that it follows
+# the usual configure output rules instead of a bare echo.
+AC_MSG_CHECKING(whether we want to use QUDA GPU)
+AC_ARG_WITH(qudadir,
+  AS_HELP_STRING([--with-qudadir[=dir]], [use QUDA, to be found in dir]),
+  [AC_MSG_RESULT(yes)
+   QUDA_AVAILABLE=1
+   AC_DEFINE(QUDA,1,Using QUDA GPU)
+   quda_dir=$withval
+   LDFLAGS="$LDFLAGS -L${quda_dir}/lib"
+   INCLUDES="$INCLUDES -I${quda_dir}/include/"
+   QUDA_INTERFACE="quda_interface"
+   AC_MSG_CHECKING([where to search for CUDA libs])
+   AC_ARG_WITH(cudadir,
+     AS_HELP_STRING([--with-cudadir[=dir]], [if using QUDA, then set CUDA lib dir [default=/usr/local/cuda/lib]]),
+     cuda_dir=$withval, cuda_dir="/usr/local/cuda/lib")
+   AC_MSG_RESULT($cuda_dir)
+   LDFLAGS="$LDFLAGS -L$cuda_dir"
+   AC_CHECK_LIB([cudart],
+     [cudaMalloc],
+     [],
+     [AC_MSG_ERROR([Can't link a simple program against library cudart.])]
+     )
+   # Perform test in C++
+   AC_LANG_PUSH([C++])
+   AC_CHECK_LIB([quda],
+     [freeGaugeQuda],
+     [],
+     [AC_MSG_ERROR([Can't link a simple program against library libquda. (Did you set CXX properly?)])]
+     )
+   # Switch back to the previous language. This used to be a second push
+   # which left every later check running with the C++ compiler.
+   AC_LANG_POP([C++])
+   #QUDA needs to be linked with C++ linker
+   CCLD=${CXX}
+  ],
+  [AC_MSG_RESULT(no)
+   QUDA_AVAILABLE=0
+   QUDA_INTERFACE=""
+  ]
+  )
+AC_SUBST([QUDA_AVAILABLE])
+
+
+dnl Reject option combinations that cannot work together. Only the MPI
+dnl build is checked.
+AC_MSG_CHECKING(checking consistency)
+if test $enable_mpi = yes ; then
+  if test $enable_iig = yes && test $withpersistent = yes ; then
+    AC_MSG_ERROR(ERROR! indexindepgeom is not compatible with persistent communications )
+  fi
+  if test $enable_iig = yes && test $enable_shmem = yes ; then
+    AC_MSG_ERROR(ERROR! indexindepgeom is not compatible with shmem API )
+  fi
+  if test $enable_tsp = yes && test $enable_iig = no; then
+    AC_MSG_ERROR(ERROR! tsplitpar needs indexindepgeom)
+  fi
+  if test $enable_tsp = yes && test $enable_sse2 != yes ; then
+    AC_MSG_ERROR(ERROR! tsplitpar needs at least SSE2 )
+  fi
+  if test $enable_tsp = yes && test $enable_gaugecopy != yes ; then
+    AC_MSG_ERROR(ERROR! tsplitpar needs gaugecopy)
+  fi
+  if test $enable_laph = yes && test $enable_tsp != yes ; then
+    AC_MSG_ERROR(ERROR! laph needs tsplitpar)
+  fi
+fi
+dnl close the "checking ..." line opened above
+AC_MSG_RESULT(ok)
+
+if test ! -e lib; then
+  mkdir lib
+fi
+
+dnl create the test and tests directory here
+if test ! -e test; then
+  mkdir test
+fi
+
+if test ! -e tests; then
+  mkdir tests
+fi
+
+if test ! -e tests/regressions; then
+  mkdir tests/regressions
+fi
+
+
+LIBS="-lhmc -lmonomial -loperator -lsolver -linit -lmeas -llinalg -lhmc -lxchange -lrational -lio $LIBS"
+AUTOCONF=autoconf
+
+dnl one Makefile is generated per entry in USESUBDIRS
+for i in $USESUBDIRS
+do
+  make_files="$make_files $i/Makefile"
+done
+
+AC_CONFIG_FILES([Makefile $make_files])
+
+AC_OUTPUT
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/cu/COPYING b/qcd/part_cpu/applications/QCD/src/kernel_D/cu/COPYING new file mode 100644 index 0000000000000000000000000000000000000000..94a9ed024d3859793618152ea559a168bbcbb5e2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/cu/COPYING @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. 
+ Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. 
+ + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. 
+ + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. 
+ + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. 
The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. 
+ + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. 
+ + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. 
This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. 
For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. 
Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. 
+ + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. 
+ + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. 
Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. 
+ + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. 
+ + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. 
You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. 
The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. 
THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. 
+ + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<http://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/cu/COPYING.LESSER b/qcd/part_cpu/applications/QCD/src/kernel_D/cu/COPYING.LESSER new file mode 100644 index 0000000000000000000000000000000000000000..fc8a5de7edf437cdc98a216370faf7c757279bcb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/cu/COPYING.LESSER @@ -0,0 +1,165 @@ + GNU LESSER GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + + This version of the GNU Lesser General Public License incorporates +the terms and conditions of version 3 of the GNU General Public +License, supplemented by the additional permissions listed below. + + 0. Additional Definitions. + + As used herein, "this License" refers to version 3 of the GNU Lesser +General Public License, and the "GNU GPL" refers to version 3 of the GNU +General Public License. + + "The Library" refers to a covered work governed by this License, +other than an Application or a Combined Work as defined below. + + An "Application" is any work that makes use of an interface provided +by the Library, but which is not otherwise based on the Library. +Defining a subclass of a class defined by the Library is deemed a mode +of using an interface provided by the Library. + + A "Combined Work" is a work produced by combining or linking an +Application with the Library. The particular version of the Library +with which the Combined Work was made is also called the "Linked +Version". + + The "Minimal Corresponding Source" for a Combined Work means the +Corresponding Source for the Combined Work, excluding any source code +for portions of the Combined Work that, considered in isolation, are +based on the Application, and not on the Linked Version. 
+ + The "Corresponding Application Code" for a Combined Work means the +object code and/or source code for the Application, including any data +and utility programs needed for reproducing the Combined Work from the +Application, but excluding the System Libraries of the Combined Work. + + 1. Exception to Section 3 of the GNU GPL. + + You may convey a covered work under sections 3 and 4 of this License +without being bound by section 3 of the GNU GPL. + + 2. Conveying Modified Versions. + + If you modify a copy of the Library, and, in your modifications, a +facility refers to a function or data to be supplied by an Application +that uses the facility (other than as an argument passed when the +facility is invoked), then you may convey a copy of the modified +version: + + a) under this License, provided that you make a good faith effort to + ensure that, in the event an Application does not supply the + function or data, the facility still operates, and performs + whatever part of its purpose remains meaningful, or + + b) under the GNU GPL, with none of the additional permissions of + this License applicable to that copy. + + 3. Object Code Incorporating Material from Library Header Files. + + The object code form of an Application may incorporate material from +a header file that is part of the Library. You may convey such object +code under terms of your choice, provided that, if the incorporated +material is not limited to numerical parameters, data structure +layouts and accessors, or small macros, inline functions and templates +(ten or fewer lines in length), you do both of the following: + + a) Give prominent notice with each copy of the object code that the + Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the object code with a copy of the GNU GPL and this license + document. + + 4. Combined Works. 
+ + You may convey a Combined Work under terms of your choice that, +taken together, effectively do not restrict modification of the +portions of the Library contained in the Combined Work and reverse +engineering for debugging such modifications, if you also do each of +the following: + + a) Give prominent notice with each copy of the Combined Work that + the Library is used in it and that the Library and its use are + covered by this License. + + b) Accompany the Combined Work with a copy of the GNU GPL and this license + document. + + c) For a Combined Work that displays copyright notices during + execution, include the copyright notice for the Library among + these notices, as well as a reference directing the user to the + copies of the GNU GPL and this license document. + + d) Do one of the following: + + 0) Convey the Minimal Corresponding Source under the terms of this + License, and the Corresponding Application Code in a form + suitable for, and under terms that permit, the user to + recombine or relink the Application with a modified version of + the Linked Version to produce a modified Combined Work, in the + manner specified by section 6 of the GNU GPL for conveying + Corresponding Source. + + 1) Use a suitable shared library mechanism for linking with the + Library. A suitable mechanism is one that (a) uses at run time + a copy of the Library already present on the user's computer + system, and (b) will operate properly with a modified version + of the Library that is interface-compatible with the Linked + Version. + + e) Provide Installation Information, but only if you would otherwise + be required to provide such information under section 6 of the + GNU GPL, and only to the extent that such information is + necessary to install and execute a modified version of the + Combined Work produced by recombining or relinking the + Application with a modified version of the Linked Version. 
(If + you use option 4d0, the Installation Information must accompany + the Minimal Corresponding Source and Corresponding Application + Code. If you use option 4d1, you must provide the Installation + Information in the manner specified by section 6 of the GNU GPL + for conveying Corresponding Source.) + + 5. Combined Libraries. + + You may place library facilities that are a work based on the +Library side by side in a single library together with other library +facilities that are not Applications and are not covered by this +License, and convey such a combined library under terms of your +choice, if you do both of the following: + + a) Accompany the combined library with a copy of the same work based + on the Library, uncombined with any other library facilities, + conveyed under the terms of this License. + + b) Give prominent notice with the combined library that part of it + is a work based on the Library, and explaining where to find the + accompanying uncombined form of the same work. + + 6. Revised Versions of the GNU Lesser General Public License. + + The Free Software Foundation may publish revised and/or new versions +of the GNU Lesser General Public License from time to time. Such new +versions will be similar in spirit to the present version, but may +differ in detail to address new problems or concerns. + + Each version is given a distinguishing version number. If the +Library as you received it specifies that a certain numbered version +of the GNU Lesser General Public License "or any later version" +applies to it, you have the option of following the terms and +conditions either of that published version or of any later version +published by the Free Software Foundation. If the Library as you +received it does not specify a version number of the GNU Lesser +General Public License, you may choose any version of the GNU Lesser +General Public License ever published by the Free Software Foundation. 
+ + If the Library as you received it specifies that a proxy can decide +whether future versions of the GNU Lesser General Public License shall +apply, that proxy's public statement of acceptance of any version is +permanent authorization for you to choose that version for the +Library. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/cu/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_D/cu/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..877c397a1f09d29c0a85cd5943f7d000ccf6bd5c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/cu/Makefile @@ -0,0 +1,80 @@ + +srcdir = . +top_builddir = .. +abs_top_builddir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +top_srcdir = .. +abs_top_srcdir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +subdir = cu +builddir = . + +CFLAGS = -std=c99 -fopenmp -pedantic -Wall +DEPFLAGS = -MM +LDFLAGS = -L${HOME}/lib -L${top_builddir}/lib +DEFS = -DHAVE_CONFIG_H +OPTARGS = -O + +AR = ar +RANLIB = ranlib +CC = mpicc +CCDEP = gcc +CCLD = $(CC) +LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) ${OPTARGS} -o $@ +LEX = flex +AUTOCONF = autoconf +DEFS = -DHAVE_CONFIG_H + +INCLUDES = -I$(HOME)/include/ -I. 
-I${abs_top_builddir}/ -I${abs_top_srcdir}/ -I/include/ -I/include/ +LDADD = +COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} ${OPTARGS} + +LIBRARIES = libcu + +libcu_TARGETS = cu + +libcu_OBJECTS = $(addsuffix .o, ${libcu_TARGETS}) + +# default rule: refresh the .d dependency files, then build the static archive libcu.a +all: Makefile dep libcu.a + +# include the generated .d dependency rules (the leading '-' ignores files that do not exist yet) + +-include $(addsuffix .d,${libcu_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects: static pattern rule; every object depends on its source, its .d file, this Makefile and config.h + +${libcu_OBJECTS}: %.o : ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) -c $< + + +# rule to make libcu: delete any old archive, rebuild it from the objects, then index it with ranlib + +libcu.a: ${libcu_OBJECTS} Makefile + @rm -f libcu.a + @${AR} cru libcu.a $(libcu_OBJECTS) + @$(RANLIB) libcu.a +# @cp libcu.a ${top_builddir}/cu/libcu.a + +# rule to generate .d files: run the dependency compiler (CCDEP with DEPFLAGS) on the source and capture its output in the .d file + +$(addsuffix .d,$(libcu_TARGETS)): %.d: ${srcdir}/%.c Makefile + @$(CCDEP) ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies: convenience target that builds every .d file + +dep: ${addsuffix .d, ${libcu_TARGETS}} + +# rules to clean: compile-clean removes objects and .d files, clean also removes the archive, distclean also removes the Makefile + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + +distclean: clean + rm -f Makefile + + +.PHONY: all dep clean compile-clean distclean diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/cu/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_D/cu/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..7d63019cc383b98095daea0babc63454d407f78f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/cu/Makefile.in @@ -0,0 +1,80 @@ + +srcdir = @srcdir@ +top_builddir = @top_builddir@ +abs_top_builddir = @abs_top_builddir@ +top_srcdir = @top_srcdir@ +abs_top_srcdir = @abs_top_srcdir@ +subdir = cu +builddir = @builddir@ + +CFLAGS = @CFLAGS@ +DEPFLAGS = @DEPFLAGS@ +LDFLAGS = @LDFLAGS@ +DEFS = @DEFS@ +OPTARGS = @OPTARGS@ + +AR = @AR@ +RANLIB = @RANLIB@ +CC = @CC@ +CCDEP = @CCDEP@ +CCLD = $(CC) +LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) ${OPTARGS} -o $@ +LEX = @LEX@ +AUTOCONF = @AUTOCONF@ +DEFS = @DEFS@ + +INCLUDES = @INCLUDES@
+LDADD = +COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} ${OPTARGS} + +LIBRARIES = libcu + +libcu_TARGETS = cu + +libcu_OBJECTS = $(addsuffix .o, ${libcu_TARGETS}) + +# default rule: refresh the .d dependency files, then build the static archive libcu.a +all: Makefile dep libcu.a + +# include the generated .d dependency rules (the leading '-' ignores files that do not exist yet) + +-include $(addsuffix .d,${libcu_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects: static pattern rule; every object depends on its source, its .d file, this Makefile and config.h + +${libcu_OBJECTS}: %.o : ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) -c $< + + +# rule to make libcu: delete any old archive, rebuild it from the objects, then index it with ranlib + +libcu.a: ${libcu_OBJECTS} Makefile + @rm -f libcu.a + @${AR} cru libcu.a $(libcu_OBJECTS) + @$(RANLIB) libcu.a +# @cp libcu.a ${top_builddir}/cu/libcu.a + +# rule to generate .d files: run the dependency compiler (CCDEP with DEPFLAGS) on the source and capture its output in the .d file + +$(addsuffix .d,$(libcu_TARGETS)): %.d: ${srcdir}/%.c Makefile + @$(CCDEP) ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies: convenience target that builds every .d file + +dep: ${addsuffix .d, ${libcu_TARGETS}} + +# rules to clean: compile-clean removes objects and .d files, clean also removes the archive, distclean also removes the Makefile + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + +distclean: clean + rm -f Makefile + + +.PHONY: all dep clean compile-clean distclean diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/cu/check-regressions b/qcd/part_cpu/applications/QCD/src/kernel_D/cu/check-regressions new file mode 100755 index 0000000000000000000000000000000000000000..07aec7d5b495912ff9a02b48272c3c8b976c8a75 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/cu/check-regressions @@ -0,0 +1,413 @@ +#!/usr/bin/python +## +# CU - C unit testing framework +# --------------------------------- +# Copyright (c)2007,2008 Daniel Fiser +# +# +# This file is part of CU. +# +# CU is free software; you can redistribute it and/or modify +# it under the terms of the GNU Lesser General Public License as +# published by the Free Software Foundation; either version 3 of +# the License, or (at your option) any later version.
class Hunk:
    """ This class represents one hunk from diff.

    A hunk keeps three lists: every raw line that belongs to it (`lines`),
    the payload of lines marked as added (`added`) and the payload of lines
    marked as deleted (`deleted`).  checkFloats() decides whether the added
    and deleted lines differ only in the printed value of floating point
    numbers.
    """

    def __init__(self):
        self.added = []
        self.deleted = []
        self.lines = []

        # to identify lines with floating point numbers
        self.re_is_num = re.compile("^.*[0-9].*$")

        # pattern to match floating point number; the exponent may carry
        # either sign ("1e-5" as well as "1e+05"), otherwise "1.0e+05" would
        # be split into the two numbers "1.0" and "05"
        self.num_pattern = r"-?(?:(?:[0-9]+(?:\.[0-9]*)?)|(?:\.[0-9]+))(?:[eE][-+]?[0-9]+)?"
        self.re_num = re.compile(self.num_pattern)

    def numLines(self):
        return len(self.lines)
    def numLinesAdded(self):
        return len(self.added)
    def numLinesDeleted(self):
        return len(self.deleted)

    def addLineAdded(self, line):
        self.added.append(line)
    def addLineDeleted(self, line):
        self.deleted.append(line)
    def addLine(self, line):
        self.lines.append(line)

    def getLines(self):
        return self.lines
    def getLinesAdded(self):
        return self.added
    def getLinesDeleted(self):
        return self.deleted

    def __eq(self, num1, num2):
        """ Returns True if num1 equals to num2 with respect to EPS
            (module level constant) """
        return math.fabs(num1 - num2) < EPS

    def checkFloats(self):
        """ Returns True if the only difference between the added and the
            deleted lines of this hunk is the printed value of floating
            point numbers (compared with tolerance EPS), False otherwise.
        """

        # If number of added and deleted lines differs, then there are more
        # differences than precision of floating point numbers
        if self.numLinesAdded() != self.numLinesDeleted():
            return False

        for line1, line2 in zip(self.added, self.deleted):
            # if any line does not contain a number - return False because
            # there must be more differences than in numbers
            if not self.re_is_num.match(line1) \
                    or not self.re_is_num.match(line2):
                return False

            # Extract all floating point numbers from each line, dropping
            # empty matches.  Real lists are built (not filter objects) so
            # that len() works on every Python version.
            nums1 = [x for x in self.re_num.findall(line1) if len(x) > 0]
            nums2 = [x for x in self.re_num.findall(line2) if len(x) > 0]

            # different count of numbers means a structural difference
            if len(nums1) != len(nums2):
                return False

            # numbers must pairwise agree within EPS
            for num1, num2 in zip(nums1, nums2):
                if not self.__eq(float(num1), float(num2)):
                    return False

            # whatever is left once the numbers are removed must be identical
            if self.re_num.sub("", line1) != self.re_num.sub("", line2):
                return False

        # If it does not fail anywhere, added and deleted lines must be
        # same
        return True
""" + + def __init__(self): + self.hunks = [] + self.lines = 0 + self.omitted_lines = 0 + + def addHunk(self, hunk): + self.hunks.append(hunk) + self.lines += hunk.numLines() + + def numLines(self): + return self.lines + def numOmittedLines(self): + return self.omitted_lines + + def getHunks(self): + return self.hunks + def numHunks(self): + return len(self.hunks) + + def checkFloats(self): + """ Will call method checkFloats on each hunk """ + hks = self.hunks[:] + self.hunks = [] + self.lines = 0 + for h in hks: + if not h.checkFloats(): + self.hunks.append(h) + self.lines += h.numLines() + else: + self.omitted_lines += h.numLines() + + + +class Parser: + def __init__(self, fin): + self.fin = fin + self.line = "" + self.diff = Diff() + self.cur_hunk = None + + # to recognize beginning of hunk: + self.re_hunk = re.compile(r"^[0-9]*(,[0-9]*){0,1}[a-zA-Z]?[0-9]*(,[0-9]*){0,1}$") + + self.re_added = re.compile(r"^> (.*)$") + self.re_deleted = re.compile(r"^< (.*)$") + + def __readNextLine(self): + self.line = self.fin.readline() + if len(self.line) == 0: + return False + return True + + def parse(self): + global PROGRESS_ON + global MSG_BASE + + num_lines = 0 + while self.__readNextLine(): + # beggining of hunk + if self.re_hunk.match(self.line): + if self.cur_hunk is not None: + self.diff.addHunk(self.cur_hunk) + self.cur_hunk = Hunk() + self.cur_hunk.addLine(self.line) + + # line added + match = self.re_added.match(self.line) + if match is not None: + self.cur_hunk.addLine(self.line) + self.cur_hunk.addLineAdded(match.group(1)) + + # line deleted + match = self.re_deleted.match(self.line) + if match is not None: + self.cur_hunk.addLine(self.line) + self.cur_hunk.addLineDeleted(match.group(1)) + + num_lines += 1 + + if PROGRESS_ON and num_lines % 50 == 0: + print MSG_BASE, "[ %08d ]" % num_lines, "\r", + sys.stdout.flush() + + # last push to list of hunks + if self.cur_hunk is not None: + self.diff.addHunk(self.cur_hunk) + + if PROGRESS_ON: + print MSG_BASE, " ", 
def dumpLines(lines, prefix = "", wait = False, max_lines = -1):
    """ Writes each item of `lines` to stdout as "<prefix> <line>".

    lines     -- iterable of strings to print
    prefix    -- text put in front of every line, separated by one space
    wait      -- if True no newline is appended (use it for lines that
                 already end with a newline, e.g. raw diff lines);
                 if False a newline is appended to every line
    max_lines -- print at most this many lines; a negative value means
                 no limit
    """
    terminator = "" if wait else "\n"

    for line_num, line in enumerate(lines):
        # Check the limit BEFORE printing, so that exactly max_lines lines
        # are emitted (checking afterwards printed one line too many).
        if max_lines >= 0 and line_num >= max_lines:
            break
        # sys.stdout.write behaves the same on every Python version; for
        # newline-terminated input it produces the same bytes as the
        # print statement would.
        sys.stdout.write("%s %s%s" % (prefix, line, terminator))
% \ + (filenames[0], filenames[0][4:], filenames[0][4:])) + continue + + cmd = ["diff", filenames[0], filenames[1]] + MSG_BASE = "Comparing %s and %s" % \ + (filenames[0].ljust(len1) ,filenames[1].ljust(len2)) + if not PROGRESS_ON: + print MSG_BASE, + sys.stdout.flush() + + pipe = Popen(cmd, stdout=PIPE) + parser = Parser(pipe.stdout) + parser.parse() + diff = parser.getDiff() + if not EXACT: + diff.checkFloats() + + if PROGRESS_ON: + print MSG_BASE, + + if diff.numHunks() == 0: + MSGOK(" [", "OK", "]") + if diff.numOmittedLines() > 0: + MSGINFO(" -->", str(diff.numOmittedLines()) + " lines from diff omitted") + else: + MSGFAIL(" [", "FAILED", "]") + if diff.numOmittedLines() > 0: + MSGINFO(" -->", str(diff.numOmittedLines()) + " lines from diff omitted") + MSGINFO(" -->", "Diff has " + str(diff.numLines()) + " lines") + + if diff.numLines() <= MAX_DIFF_LINES: + MSGINFO(" -->", "Diff:") + for h in diff.getHunks(): + dumpLines(h.getLines(), " |", True) + else: + MSGINFO(" -->", "Printing only first " + str(MAX_DIFF_LINES) + " lines:") + lines = [] + for h in diff.getHunks(): + lines += h.getLines() + if len(lines) > MAX_DIFF_LINES: + break; + dumpLines(lines, " |", True, MAX_DIFF_LINES) + +def usage(): + print "Usage: " + sys.argv[0] + " [ OPTIONS ] [ directory, [ directory, [ ... ] ] ]" + print "" + print " OPTIONS:" + print " --help / -h none Print this help" + print " --exact / -e none Switch do exact comparasion of files" + print " --not-exact / -n none Switch do non exact comparasion of files (default behaviour)" + print " --max-diff-lines int Maximum of lines of diff which can be printed (default " + str(MAX_DIFF_LINES) + ")" + print " --eps float Precision of floating point numbers (epsilon) (default " + str(EPS) + ")" + print " --no-progress none Turn off progress bar" + print " --progress none Turn on progress bar (default)" + print "" + print " This program is able to compare files with regressions generated by CU testsuites." 
# Init:

# Remember the directory the script was started from, so that relative
# directory arguments are always resolved against it.
BASE_DIR = os.getcwd()

# Parse command line options; an unknown option or a missing argument is
# reported together with the usage text instead of a traceback.
try:
    optlist, args = gnu_getopt(sys.argv[1:],
                               "hen",
                               ["help", "max-diff-lines=", "eps=", \
                                "exact", "not-exact", \
                                "no-progress", "progress"])
except GetoptError as err:
    sys.stderr.write("Error: " + str(err) + "\n")
    usage()

for name, value in optlist:
    if name == "--help" or name == "-h":
        usage()
    elif name == "--exact" or name == "-e":
        EXACT = True
    elif name == "--not-exact" or name == "-n":
        EXACT = False
    elif name == "--max-diff-lines" or name == "--eps":
        # both options take a numeric argument; reject garbage politely
        try:
            if name == "--max-diff-lines":
                MAX_DIFF_LINES = int(value)
            else:
                EPS = float(value)
        except ValueError:
            sys.stderr.write("Error: invalid value '%s' for option %s\n" \
                             % (value, name))
            usage()
    elif name == "--no-progress":
        PROGRESS_ON = False
    elif name == "--progress":
        PROGRESS_ON = True

if len(args) == 0:
    # no directory given: check the current one
    files = regressionFilesInDir()
    main(files)
else:
    for directory in args:
        os.chdir(BASE_DIR)

        MSGINFO()
        MSGINFO("", "Processing directory '" + directory + "':")
        MSGINFO()
        try:
            os.chdir(directory)
        except OSError:
            MSGFAIL(" -->", "Directory '" + directory + "' does not exist.")
            # Skip it: otherwise the files of BASE_DIR would be compared
            # and reported under the name of the missing directory.
            continue
        files = regressionFilesInDir()
        main(files)

sys.exit(0)
+ * + * CU is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 3 of + * the License, or (at your option) any later version. + * + * CU is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include + +#include "cu.h" + +/** Declared here, because I didn't find header file where it is declared */ +char *strsignal(int sig); + +const char *cu_current_test; +const char *cu_current_test_suite; +int cu_success_test_suites = 0; +int cu_fail_test_suites = 0; +int cu_success_tests = 0; +int cu_fail_tests = 0; +int cu_success_checks = 0; +int cu_fail_checks = 0; + +char cu_out_prefix[CU_OUT_PREFIX_LENGTH+1] = ""; + + +/* globally used file descriptor for reading/writing messages */ +int fd; + +/* indicate if test was failed */ +int test_failed; + +/* codes of messages */ +#define CHECK_FAILED '0' +#define CHECK_SUCCEED '1' +#define TEST_FAILED '2' +#define TEST_SUCCEED '3' +#define TEST_SUITE_FAILED '4' +#define TEST_SUITE_SUCCEED '5' +#define END '6' +#define TEST_NAME '7' + +/* predefined messages */ +#define MSG_CHECK_SUCCEED write(fd, "1\n", 2) +#define MSG_TEST_FAILED write(fd, "2\n", 2) +#define MSG_TEST_SUCCEED write(fd, "3\n", 2) +#define MSG_TEST_SUITE_FAILED write(fd, "4\n", 2) +#define MSG_TEST_SUITE_SUCCEED write(fd, "5\n", 2) +#define MSG_END write(fd, "6\n", 2) + +/* length of buffers */ +#define BUF_LEN 1000 +#define MSGBUF_LEN 300 + + +static void redirect_out_err(const char *testName); +static void close_out_err(void); +static void run_test_suite(const char *ts_name, cu_test_suite_t 
*ts); +static void receive_messages(void); + +static void cu_run_fork(const char *ts_name, cu_test_suite_t *test_suite); +static void cu_print_results(void); + +void cu_run(int argc, char *argv[]) +{ + cu_test_suites_t *tss; + int i; + char found = 0; + + if (argc > 1){ + for (i=1; i < argc; i++){ + tss = cu_test_suites; + while (tss->name != NULL && tss->test_suite != NULL){ + if (strcmp(argv[i], tss->name) == 0){ + found = 1; + cu_run_fork(tss->name, tss->test_suite); + break; + } + tss++; + } + + if (tss->name == NULL || tss->test_suite == NULL){ + fprintf(stderr, "ERROR: Could not find test suite '%s'\n", argv[i]); + } + } + + if (found == 1) + cu_print_results(); + + }else{ + tss = cu_test_suites; + while (tss->name != NULL && tss->test_suite != NULL){ + cu_run_fork(tss->name, tss->test_suite); + tss++; + } + cu_print_results(); + } + + +} + +static void cu_run_fork(const char *ts_name, cu_test_suite_t *ts) +{ + int pipefd[2]; + int pid; + int status; + + if (pipe(pipefd) == -1){ + perror("Pipe error"); + exit(-1); + } + + fprintf(stdout, " -> %s [IN PROGESS]\n", ts_name); + fflush(stdout); + + pid = fork(); + if (pid < 0){ + perror("Fork error"); + exit(-1); + } + + if (pid == 0){ + /* close read end of pipe */ + close(pipefd[0]); + + fd = pipefd[1]; + + /* run testsuite, messages go to fd */ + run_test_suite(ts_name, ts); + + MSG_END; + close(fd); + + /* stop process where running testsuite */ + exit(0); + }else{ + /* close write end of pipe */ + close(pipefd[1]); + + fd = pipefd[0]; + + /* receive and interpret all messages */ + receive_messages(); + + /* wait for children */ + wait(&status); + if (!WIFEXITED(status)){ /* if child process ends up abnormaly */ + if (WIFSIGNALED(status)){ + fprintf(stdout, "Test suite was terminated by signal %d (%s).\n", + WTERMSIG(status), strsignal(WTERMSIG(status))); + }else{ + fprintf(stdout, "Test suite terminated abnormaly!\n"); + } + + /* mark this test suite as failed, because was terminated + * prematurely */ + 
cu_fail_test_suites++; + } + + close(fd); + + fprintf(stdout, " -> %s [DONE]\n\n", ts_name); + fflush(stdout); + } + +} + +static void run_test_suite(const char *ts_name, cu_test_suite_t *ts) +{ + int test_suite_failed = 0; + char buffer[MSGBUF_LEN]; + int len; + + /* set up current test suite name for later messaging... */ + cu_current_test_suite = ts_name; + + /* redirect stdout and stderr */ + redirect_out_err(cu_current_test_suite); + + while (ts->name != NULL && ts->func != NULL){ + test_failed = 0; + + /* set up name of test for later messaging */ + cu_current_test = ts->name; + + /* send message what test is currently running */ + len = snprintf(buffer, MSGBUF_LEN, "%c --> Running %s...\n", + TEST_NAME, cu_current_test); + write(fd, buffer, len); + + /* run test */ + (*(ts->func))(); + + if (test_failed){ + MSG_TEST_FAILED; + test_suite_failed = 1; + }else{ + MSG_TEST_SUCCEED; + } + + ts++; /* next test in test suite */ + } + + if (test_suite_failed){ + MSG_TEST_SUITE_FAILED; + }else{ + MSG_TEST_SUITE_SUCCEED; + } + + /* close redirected stdout and stderr */ + close_out_err(); +} + + +static void receive_messages(void) +{ + char buf[BUF_LEN]; /* buffer */ + int buf_len; /* how many chars stored in buf */ + char bufout[MSGBUF_LEN]; /* buffer which can be printed out */ + int bufout_len; + int state = 0; /* 0 - waiting for code, 1 - copy msg to stdout */ + int i; + int end = 0; /* end of messages? 
*/ + + bufout_len = 0; + while((buf_len = read(fd, buf, BUF_LEN)) > 0 && !end){ + for (i=0; i < buf_len; i++){ + + /* Prepare message for printing out */ + if (state == 1 || state == 2){ + if (bufout_len < MSGBUF_LEN) + bufout[bufout_len++] = buf[i]; + } + + /* reset state on '\n' in msg */ + if (buf[i] == '\n'){ + /* copy messages out */ + if (state == 1) + write(1, bufout, bufout_len); + if (state == 2) + write(2, bufout, bufout_len); + + state = 0; + bufout_len = 0; + continue; + } + + if (state == 0){ + if (buf[i] == CHECK_FAILED){ + cu_fail_checks++; + state = 2; + }else if (buf[i] == TEST_NAME){ + state = 1; + }else if (buf[i] == CHECK_SUCCEED){ + cu_success_checks++; + }else if (buf[i] == TEST_FAILED){ + cu_fail_tests++; + }else if (buf[i] == TEST_SUCCEED){ + cu_success_tests++; + }else if (buf[i] == TEST_SUITE_FAILED){ + cu_fail_test_suites++; + }else if (buf[i] == TEST_SUITE_SUCCEED){ + cu_success_test_suites++; + }else if (buf[i] == END){ + end = 1; + break; + } + } + } + } +} + +void cu_success_assertation(void) +{ + MSG_CHECK_SUCCEED; +} + +void cu_fail_assertation(const char *file, int line, const char *msg) +{ + char buf[MSGBUF_LEN]; + int len; + + len = snprintf(buf, MSGBUF_LEN, "%c%s:%d (%s::%s) :: %s\n", + CHECK_FAILED, + file, line, cu_current_test_suite, cu_current_test, msg); + write(fd, buf, len); + + /* enable test_failed flag */ + test_failed = 1; +} + +static void cu_print_results(void) +{ + fprintf(stdout, "\n"); + fprintf(stdout, "==================================================\n"); + fprintf(stdout, "| | failed | succeed | total |\n"); + fprintf(stdout, "|------------------------------------------------|\n"); + fprintf(stdout, "| assertations: | %6d | %7d | %5d |\n", + cu_fail_checks, cu_success_checks, + cu_success_checks+cu_fail_checks); + fprintf(stdout, "| tests: | %6d | %7d | %5d |\n", + cu_fail_tests, cu_success_tests, + cu_success_tests+cu_fail_tests); + fprintf(stdout, "| tests suites: | %6d | %7d | %5d |\n", + 
cu_fail_test_suites, cu_success_test_suites, + cu_success_test_suites+cu_fail_test_suites); + fprintf(stdout, "==================================================\n"); +} + +void cu_set_out_prefix(const char *str) +{ + strncpy(cu_out_prefix, str, CU_OUT_PREFIX_LENGTH); +} + +static void redirect_out_err(const char *test_name) +{ + char buf[100]; + + snprintf(buf, 99, "%stmp.%s.out", cu_out_prefix, test_name); + if (freopen(buf, "w", stdout) == NULL){ + perror("Redirecting of stdout failed"); + exit(-1); + } + + snprintf(buf, 99, "%stmp.%s.err", cu_out_prefix, test_name); + if (freopen(buf, "w", stderr) == NULL){ + perror("Redirecting of stderr failed"); + exit(-1); + } +} + +static void close_out_err(void) +{ + fclose(stdout); + fclose(stderr); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/cu/cu.h b/qcd/part_cpu/applications/QCD/src/kernel_D/cu/cu.h new file mode 100644 index 0000000000000000000000000000000000000000..6c61c8a2bd73dbed4d09b33150f340b7f4fc18b1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/cu/cu.h @@ -0,0 +1,131 @@ +/*** + * CU - C unit testing framework + * --------------------------------- + * Copyright (c)2007,2008,2009 Daniel Fiser + * + * + * This file is part of CU. + * + * CU is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as + * published by the Free Software Foundation; either version 3 of + * the License, or (at your option) any later version. + * + * CU is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program. If not, see . 
+ */ + +#ifndef _CU_H_ +#define _CU_H_ + +/***** PUBLIC API *****/ +/** + * Define test + */ +#define TEST(name) \ + void name(void) + +/** + * Define testsuite + */ +#define TEST_SUITE(name) \ + cu_test_suite_t test_suite_##name[] = +/** + * Must be on the end of list of tests. + */ +#define TEST_SUITE_CLOSURE \ + { NULL, NULL } + +#define TEST_SUITES \ + cu_test_suites_t cu_test_suites[] = +#define TEST_SUITES_CLOSURE \ + { NULL, NULL } +#define TEST_SUITE_ADD(name) \ + { #name, test_suite_##name } + +/** + * Add test to testsuite + */ +#define TEST_ADD(name) \ + { #name, name } + +#define CU_RUN(argc, argv) \ + cu_run(argc, argv) + +/** + * Set prefix for files printed out. Must contain trailing /. + */ +#define CU_SET_OUT_PREFIX(str) \ + cu_set_out_prefix(str) + +/** + * Assertations + * Assertations with suffix 'M' (e.g. assertTrueM) is variation of macro + * where is possible to specify error message. + */ +#define assertTrueM(a, message) \ + if (a){ \ + cu_success_assertation(); \ + }else{ \ + cu_fail_assertation(__FILE__, __LINE__, message); \ + } +#define assertTrue(a) \ + assertTrueM((a), #a " is not true") + +#define assertFalseM(a, message) \ + assertTrueM(!(a), message) +#define assertFalse(a) \ + assertFalseM((a), #a " is not false") + +#define assertEqualsM(a,b,message) \ + assertTrueM((a) == (b), message) +#define assertEquals(a,b) \ + assertEqualsM((a), (b), #a " not equals " #b) + +#define assertNotEqualsM(a,b,message) \ + assertTrueM((a) != (b), message) +#define assertNotEquals(a,b) \ + assertNotEqualsM((a), (b), #a " equals " #b) +/***** PUBLIC API END *****/ + + +#include + +#define CU_MAX_NAME_LENGTH 30 + +typedef void (*cu_test_func_t)(void); +typedef struct _cu_test_suite_t { + const char *name; + cu_test_func_t func; +} cu_test_suite_t; +typedef struct _cu_test_suites_t { + const char *name; + cu_test_suite_t *test_suite; +} cu_test_suites_t; + +extern cu_test_suites_t cu_test_suites[]; + +extern const char *cu_current_test; +extern const 
char *cu_current_test_suite; + +extern int cu_success_test_suites; +extern int cu_fail_test_suites; +extern int cu_success_tests; +extern int cu_fail_tests; +extern int cu_success_checks; +extern int cu_fail_checks; + +#define CU_OUT_PREFIX_LENGTH 30 +extern char cu_out_prefix[CU_OUT_PREFIX_LENGTH+1]; + +void cu_run(int argc, char *argv[]); +void cu_success_assertation(void); +void cu_fail_assertation(const char *file, int line, const char *msg); +void cu_set_out_prefix(const char *str); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/default_input_values.h b/qcd/part_cpu/applications/QCD/src/kernel_D/default_input_values.h new file mode 100644 index 0000000000000000000000000000000000000000..328b9180ab606bc1c8d27357f66b9a33334c9ef2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/default_input_values.h @@ -0,0 +1,198 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * Modified by Jenifer Gonzalez Lopez 01/04/2009 + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +/************************************************* + * + * this header file contains default values + * for all input parameter, set in + * read_input.c + * + * Autor: Carsten Urbach + * urbach@desy.de + *************************************************/ + +#ifndef _DEFAULT_INPUT_VALUES_H +#define _DEFAULT_INPUT_VALUES_H + +#define _default_T_global 4 +#define _default_L 4 +#define _default_LX 0 +#define _default_LY 0 +#define _default_LZ 0 +#define _default_N_PROC_X 1 +#define _default_N_PROC_Y 1 +#define _default_N_PROC_Z 1 +#define _default_g_kappa 0.125 +#define _default_g_acc_Ptilde 1.e-06 +#define _default_g_acc_Hfin 1.e-04 +#define _default_g_rec_ev 0 +#define _default_g_mubar 0.0 +#define _default_g_epsbar 0.0 +#define _default_g_mu 0.0 +#define _default_g_mu1 0.0 +#define _default_g_mu2 0.0 +#define _default_g_mu3 0.0 +#define _default_c_sw -1.0 +#define _default_g_beta 6.0 +#define _default_g_N_s 20 +#define _default_g_dflgcr_flag 0 +#define _default_random_seed 123456 +#define _default_rlxd_level 1 +//this is CG +#define _default_solver_flag 1 +//this is CGMMSND +#define _default_nd_solver_flag 14 +#define _default_startoption 0 +#define _default_Ntherm 0 +#define _default_Nmeas 1 +#define _default_Nsave 9 +#define _default_write_cp_flag 1 +#define _default_cp_interval 5 +#define _default_nstore 0 +#define _default_rlxd_input_filename "last_state" +#define _default_gauge_input_filename "conf" +#define _default_read_source_flag 0 +#define _default_source_filename "source" +#define _default_g_stdio_proc 0 +#define _default_index_start 0 +#define _default_index_end 12 +#define _default_X0 0. +#define _default_X1 0. +#define _default_X2 0. +#define _default_X3 0. +#define _default_max_solver_iterations 5000 +#define _default_solver_precision 1.e-15 +#define _default_g_rgi_C1 0. 
+#define _default_g_eps_sq_force 1.0e-7 +#define _default_g_eps_sq_acc 1.0e-16 +#define _default_g_eps_sq_force1 -1. +#define _default_g_eps_sq_acc1 -1. +#define _default_g_eps_sq_force2 -1. +#define _default_g_eps_sq_acc2 -1. +#define _default_g_eps_sq_force3 -1. +#define _default_g_eps_sq_acc3 -1. +#define _default_g_relative_precision_flag 0 +#define _default_return_check_flag 0 +#define _default_return_check_interval 100 +#define _default_g_debug_level 1 +#define _default_g_csg_N 0 +#define _default_2mn_lambda 0.1938 +#define _default_source_format_flag 0 +#define _default_source_time_slice 0 +#define _default_automaticTS 0 +#define _default_gmres_m_parameter 10 +#define _default_gmresdr_nr_ev 0 +#define _default_gauge_precision_read_flag 64 +#define _default_gauge_precision_write_flag 64 +#define _default_g_disable_IO_checks 0 +#define _default_prop_precision_flag 32 +#define _default_reproduce_randomnumber_flag 1 +#define _default_g_sloppy_precision_flag 0 +#define _default_operator_sloppy_precision_flag 1 +#define _default_compression_type 18 +#define _default_stout_rho 0.1 +#define _default_rho 0. +#define _default_rho2 0. +#define _default_stout_no_iter 1 +#define _default_use_stout_flag 0 +#define _default_phmc_no_flavours 4 +#define _default_compute_evs 0 +#define _default_phmc_compute_evs 0 +#define _default_phmc_pure_phmc 0 +#define _default_stilde_max 3. 
+#define _default_stilde_min 0.01 +#define _default_degree_of_p 48 +#define _default_propagator_splitted 1 +#define _default_source_splitted 1 +#define _default_source_location 0 +#define _default_no_eigenvalues 10 +#define _default_eigenvalue_precision 1.e-5 +#define _default_sub_evs_cg_flag 0 +#define _default_phmc_heavy_timescale 0 +#define _default_phmc_exact_poly 0 +#define _default_even_odd_flag 1 +#define _default_measurement_freq 10 +#define _default_timescale 1 +#define _default_reweighting_flag 0 +#define _default_reweighting_samples 10 +#define _default_source_type_flag 0 +#define _default_no_samples 1 +#define _default_online_measurement_flag 1 +#define _default_online_measurement_freq 5 +#define _default_compute_modenumber 0 +#define _default_compute_topsus 0 +#define _default_mstarsq 0.01 +#define _default_no_sources_z2 1 + +/* sf default values */ +#define _default_g_eta 0. +#define _default_g_Tbsf 3 +#define _default_g_Ct 1. +#define _default_g_Cs 0.5 +#define _default_g_C1 0. +#define _default_g_C1ss 0. +#define _default_g_C1tss 0. +#define _default_g_C1tts 0. +#define _default_bc_flag 0 +/* default poly monomial values */ +#define _default_MDPolyDegree 123 +#define _default_MDPolyLmin 0.1 +#define _default_MDPolyLmax 3.0 +#define _default_MDPolyRootsFile "Square_root_BR_roots.dat" +#define _default_MDPolyLocNormConst -1.0 +#define _default_MDPolyDetRatio 0 + +/* default GPU values */ +#define _default_device_num -1 + +#define _default_min_innersolver_it 10 +#define _default_max_mms_shifts 6 + +/* default OpenMP values */ +#define _default_omp_num_threads 0 + +/* default mixed precision solver values */ +#define _default_mixcg_innereps 1.0e-6 +#define _default_mixcg_maxinnersolverit 5000 + +#define _default_use_preconditioning 0 + +#define _default_use_qudainverter 0 + +/* Benchmark default values ??? */ +#define _default_g_c_sw 0. 
+#define _default_dtau 0.1 +#define _default_tau 0.5 +#define _default_integtyp 2 +#define _default_matrix_element_flag 0 +#define _default_operator_flag 0 +#define _default_Nskip 0 +#define _default_Nsteps 0 +#define _default_nsmall 0 +#define _default_save_config_flag 0 +#define _default_save_prop_flag 0 +#define _default_save_prop_g2_flag 0 +#define _default_first_prop_flag 0 +#define _default_mass_number 0 +#define _default_ITER_MAX_BCG 5000 +#define _default_ITER_MAX_CG 5000 + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/deriv_Sb.c b/qcd/part_cpu/applications/QCD/src/kernel_D/deriv_Sb.c new file mode 100644 index 0000000000000000000000000000000000000000..ddb6b453b90e8d069697aef129b17f9fdad91d64 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/deriv_Sb.c @@ -0,0 +1,649 @@ +/*********************************************************************** + * + * Copyright (C) 2001 Martin Hasenbusch + * + * some changes to initial version by Carsten Urbach + * + * BG version Copyright (C) 2006, 2007 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * deriv_Sb: function to compute the derivative + * of the phi^{\dag} Q psi with respect + * to the generators of the gauge group. + * without the clover part. 
+ * + * Author: Martin Hasenbusch + * Date: Fri Oct 26 15:06:27 MEST 2001 + * + * both l and k are input + * for ieo = 0 + * l resides on even lattice points and k on odd lattice points + * for ieo = 1 + * l resides on odd lattice points and k on even lattice points + * the output is a su3adj field that is written to df0[][] + * + ************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "boundary.h" +#include "xchange/xchange.h" +#include "sse.h" +#include "update_backward_gauge.h" +#include "hamiltonian_field.h" +#include "deriv_Sb.h" + + +#if (defined BGL && defined XLC) + +void deriv_Sb(const int ieo, spinor * const l, spinor * const k, + hamiltonian_field_t * const hf, const double factor) { + + int ix,iy, iz; + int ioff, icx, icy, icz; + su3 * restrict up ALIGN; + su3 * restrict um ALIGN; + su3adj * restrict ddd; + static su3adj der; + static su3 v1,v2; + static su3_vector psia,psib,phia,phib; + static spinor rr; + spinor * restrict r ALIGN; + spinor * restrict sp ALIGN; + spinor * restrict sm ALIGN; + + /* We have 32 registers available */ + double _Complex reg00, reg01, reg02, reg03, reg04, reg05; + double _Complex reg10, reg11, reg12, reg13, reg14, reg15; + /* For su3 matrix, use reg00 for missing register */ + double _Complex v00, v01, v02, v10, v11, v12, v20, v21; + /* The following contains the left spinor (12 regs) and the final */ + /* su3 matrix to trace over */ + double _Complex r00, r01, r02, r10, r11, r12, r20, r21, r22, + r30, r31, r32; + +#ifdef _KOJAK_INST +# pragma pomp inst begin(derivSb) +#endif + +#pragma disjoint(*r, *sp, *sm, *up, *um, *ddd) + __alignx(16, l); + __alignx(16, k); + + if(ieo==0) { + ioff=0; + } + else { + ioff=(VOLUME+RAND)/2; + } + + /* for parallelization */ +#ifdef MPI + xchange_2fields(k, l, ieo); +#endif + /************** loop over all lattice sites ****************/ + + 
ix=g_eo2lexic[ioff]; + iy=g_iup[ix][0]; icy=g_lexic2eosub[iy]; + sp = k + icy; + _prefetch_spinor(sp); + up=&hf->gaugefield[ix][0]; + _prefetch_su3(up); + + for(icx = ioff; icx < (VOLUME/2+ioff); icx++){ +/* rr = (*(l + (icx-ioff))); */ + /* rr=g_spinor_field[l][icx-ioff]; */ +/* r=&rr; */ + + /* load left vector r and */ + /* multiply with gamma5 */ + r = l + (icx-ioff); + ix=g_eo2lexic[icx]; + + /*********************** direction +0 ********************/ + + ddd = &hf->derivative[ix][0]; + _bgl_load_r0((*r).s0); + _bgl_load_r1((*r).s1); + _bgl_load_minus_r2((*r).s2); + _bgl_load_minus_r3((*r).s3); + + _bgl_load_reg0((*sp).s0); + _bgl_load_reg0_up((*sp).s1); + _bgl_load_reg1((*sp).s2); + _bgl_load_reg1_up((*sp).s3); + + _bgl_add_to_reg0_reg1(); + _bgl_add_to_reg0_up_reg1_up(); + + _bgl_add_r0_to_r2_reg1(); + _bgl_add_r1_to_r3_reg1_up(); + + iy=g_idn[ix][0]; icy=g_lexic2eosub[iy]; + sm = k + icy; + _prefetch_spinor(sm); + um=&hf->gaugefield[iy][0]; + _prefetch_su3(um); + + _bgl_tensor_product_and_add(); + /* result in v now */ + /* v is daggered as compared to non-bgl version */ + _bgl_su3_times_v(*up); + /* result in r now */ + _bgl_complex_times_r(ka0); + _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); + + + /************** direction -0 ****************************/ + + ddd = &hf->derivative[iy][0]; + _bgl_load_r0(r->s0); + _bgl_load_r1(r->s1); + _bgl_load_minus_r2(r->s2); + _bgl_load_minus_r3(r->s3); + + _bgl_load_reg0(sm->s0); + _bgl_load_reg0_up(sm->s1); + _bgl_load_reg1(sm->s2); + _bgl_load_reg1_up(sm->s3); + + _bgl_sub_from_reg0_reg1(); + _bgl_sub_from_reg0_up_reg1_up(); + + _bgl_sub_from_r0_r2_reg1(); + _bgl_sub_from_r1_r3_reg1_up(); + + iy=g_iup[ix][1]; icy=g_lexic2eosub[iy]; + + sp = k + icy; + _prefetch_spinor(sp); + up=&hf->gaugefield[ix][1]; + _prefetch_su3(up); + + _bgl_tensor_product_and_add_d(); + /* result in v now */ + _bgl_su3_times_v(*um); + + /* result in r now */ + _bgl_complex_times_r(ka0); + _bgl_trace_lambda_mul_add_assign((*ddd), 
2.*factor); + + /*************** direction +1 **************************/ + + ddd = &hf->derivative[ix][1]; + _bgl_load_r0(r->s0); + _bgl_load_r1(r->s1); + _bgl_load_minus_r2(r->s2); + _bgl_load_minus_r3(r->s3); + + _bgl_load_reg0(sp->s0); + _bgl_load_reg0_up(sp->s1); + _bgl_load_reg1(sp->s2); + _bgl_load_reg1_up(sp->s3); + + _bgl_i_mul_add_to_reg0_reg1_up(); + _bgl_i_mul_add_to_reg0_up_reg1(); + + _bgl_i_mul_add_r0_to_r3_reg1(); + _bgl_i_mul_add_r1_to_r2_reg1_up(); + + iy=g_idn[ix][1]; icy=g_lexic2eosub[iy]; + + sm = k + icy; + _prefetch_spinor(sm); + um=&hf->gaugefield[iy][1]; + _prefetch_su3(um); + + _bgl_tensor_product_and_add(); + /* result in v now */ + _bgl_su3_times_v(*up); + /* result in r now */ + _bgl_complex_times_r(ka1); + _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); + + /**************** direction -1 *************************/ + + ddd = &hf->derivative[iy][1]; + _bgl_load_r0(r->s0); + _bgl_load_r1(r->s1); + _bgl_load_minus_r2(r->s2); + _bgl_load_minus_r3(r->s3); + + _bgl_load_reg0(sm->s0); + _bgl_load_reg0_up(sm->s1); + _bgl_load_reg1(sm->s2); + _bgl_load_reg1_up(sm->s3); + + _bgl_i_mul_sub_from_reg0_reg1_up(); + _bgl_i_mul_sub_from_reg0_up_reg1(); + + _bgl_i_mul_sub_from_r0_r3_reg1(); + _bgl_i_mul_sub_from_r1_r2_reg1_up(); + + iy=g_iup[ix][2]; icy=g_lexic2eosub[iy]; + + sp = k + icy; + _prefetch_spinor(sp); + up=&hf->gaugefield[ix][2]; + _prefetch_su3(up); + + _bgl_tensor_product_and_add_d(); + /* result in v now */ + _bgl_su3_times_v(*um); + /* result in r now */ + _bgl_complex_times_r(ka1); + _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); + + /*************** direction +2 **************************/ + + ddd = &hf->derivative[ix][2]; + _bgl_load_r0(r->s0); + _bgl_load_r1(r->s1); + _bgl_load_minus_r2(r->s2); + _bgl_load_minus_r3(r->s3); + + _bgl_load_reg0(sp->s0); + _bgl_load_reg0_up(sp->s1); + _bgl_load_reg1(sp->s2); + _bgl_load_reg1_up(sp->s3); + + _bgl_add_to_reg0_reg1_up(); + _bgl_sub_from_reg0_up_reg1(); + + 
_bgl_add_r0_to_r3_reg1(); + _bgl_sub_from_r1_r2_reg1_up(); + + iy=g_idn[ix][2]; icy=g_lexic2eosub[iy]; + + sm = k + icy; + _prefetch_spinor(sm); + um=&hf->gaugefield[iy][2]; + _prefetch_su3(um); + + _bgl_tensor_product_and_add(); + /* result in v now */ + _bgl_su3_times_v(*up); + /* result in r now */ + _bgl_complex_times_r(ka2); + _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); + + /***************** direction -2 ************************/ + + ddd = &hf->derivative[iy][2]; + _bgl_load_r0(r->s0); + _bgl_load_r1(r->s1); + _bgl_load_minus_r2(r->s2); + _bgl_load_minus_r3(r->s3); + + _bgl_load_reg0(sm->s0); + _bgl_load_reg0_up(sm->s1); + _bgl_load_reg1(sm->s2); + _bgl_load_reg1_up(sm->s3); + + _bgl_sub_from_reg0_reg1_up(); + _bgl_add_to_reg0_up_reg1(); + + _bgl_sub_from_r0_r3_reg1(); + _bgl_add_r1_to_r2_reg1_up(); + + iy=g_iup[ix][3]; icy=g_lexic2eosub[iy]; + + sp = k + icy; + _prefetch_spinor(sp); + up=&hf->gaugefield[ix][3]; + _prefetch_su3(up); + + _bgl_tensor_product_and_add_d(); + /* result in v now */ + _bgl_su3_times_v(*um); + /* result in r now */ + _bgl_complex_times_r(ka2); /* direction 2 is scaled by ka2, as in direction +2 (was ka1) */ + _bgl_trace_lambda_mul_add_assign(*ddd, 2.*factor); + + /****************** direction +3 ***********************/ + + ddd = &hf->derivative[ix][3]; + _bgl_load_r0(r->s0); + _bgl_load_r1(r->s1); + _bgl_load_minus_r2(r->s2); + _bgl_load_minus_r3(r->s3); + + _bgl_load_reg0(sp->s0); + _bgl_load_reg0_up(sp->s1); + _bgl_load_reg1(sp->s2); + _bgl_load_reg1_up(sp->s3); + + _bgl_i_mul_add_to_reg0_reg1(); + _bgl_i_mul_sub_from_reg0_up_reg1_up(); + + _bgl_i_mul_add_r0_to_r2_reg1(); + _bgl_i_mul_sub_from_r1_r3_reg1_up(); + + iy=g_idn[ix][3]; icy=g_lexic2eosub[iy]; + + sm = k + icy; + _prefetch_spinor(sm); + um=&hf->gaugefield[iy][3]; + _prefetch_su3(um); + + _bgl_tensor_product_and_add(); + /* result in v now */ + _bgl_su3_times_v(*up); + /* result in r now */ + _bgl_complex_times_r(ka3); + _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); + + /***************** direction -3 
************************/ + + ddd = &hf->derivative[iy][3]; + _bgl_load_r0(r->s0); + _bgl_load_r1(r->s1); + _bgl_load_minus_r2(r->s2); + _bgl_load_minus_r3(r->s3); + + _bgl_load_reg0(sm->s0); + _bgl_load_reg0_up(sm->s1); + _bgl_load_reg1(sm->s2); + _bgl_load_reg1_up(sm->s3); + + _bgl_i_mul_sub_from_reg0_reg1(); + _bgl_i_mul_add_to_reg0_up_reg1_up(); + + _bgl_i_mul_sub_from_r0_r2_reg1(); + _bgl_i_mul_add_r1_to_r3_reg1_up(); + + /* something wrong here...*/ + icz=icx+1; + if(icz==((VOLUME+RAND)/2+ioff)) icz=ioff; + iz=g_eo2lexic[icz]; + iy=g_iup[iz][0]; icy=g_lexic2eosub[iy]; + + sp = k + icy; + _prefetch_spinor(sp); + up=&hf->gaugefield[iz][0]; + _prefetch_su3(up); + + _bgl_tensor_product_and_add_d(); + /* result in v now */ + _bgl_su3_times_v(*um); + /* result in r now */ + _bgl_complex_times_r(ka3); + _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); + + /****************** end of loop ************************/ + } +#ifdef _KOJAK_INST +#pragma pomp inst end(derivSb) +#endif +} + +#else + +void deriv_Sb(const int ieo, spinor * const l, spinor * const k, + hamiltonian_field_t * const hf, const double factor) { + +#ifdef _GAUGE_COPY + if(g_update_gauge_copy) { + update_backward_gauge(hf->gaugefield); + } +#endif + /* for parallelization */ +#ifdef MPI + xchange_2fields(k, l, ieo); +#endif + +#ifdef OMP +#define static +#pragma omp parallel + { +#endif + int ix,iy; + int ioff, icx, icy; + su3 * restrict up ALIGN; + su3 * restrict um ALIGN; + static su3 v1,v2; + static su3_vector psia,psib,phia,phib; + static spinor rr; + spinor * restrict sp ALIGN; + spinor * restrict sm ALIGN; + +#ifdef OMP +#undef static +#endif + +#ifdef _KOJAK_INST +#pragma pomp inst begin(derivSb) +#endif +#ifdef XLC +#pragma disjoint(*sp, *sm, *up, *um) +#endif + +#ifdef BGL + __alignx(16, l); + __alignx(16, k); +#endif + + if(ieo==0) { + ioff=0; + } + else { + ioff=(VOLUME+RAND)/2; + } + + /************** loop over all lattice sites ****************/ +#ifdef OMP +#pragma omp for +#endif + 
for(icx = ioff; icx < (VOLUME/2+ioff); icx++){ + ix=g_eo2lexic[icx]; + rr = (*(l + (icx-ioff))); + /* rr=g_spinor_field[l][icx-ioff]; */ + + /*multiply the left vector with gamma5*/ + _vector_minus_assign(rr.s2, rr.s2); + _vector_minus_assign(rr.s3, rr.s3); + + /*********************** direction +0 ********************/ + + iy=g_iup[ix][0]; icy=g_lexic2eosub[iy]; + + sp = k + icy; +#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR && !defined _USE_TSPLITPAR) + up=&g_gauge_field_copy[icx][0]; +#else + up=&hf->gaugefield[ix][0]; +#endif + _vector_add(psia,sp->s0,sp->s2); + _vector_add(psib,sp->s1,sp->s3); + + _vector_add(phia,rr.s0,rr.s2); + _vector_add(phib,rr.s1,rr.s3); + + _vector_tensor_vector_add(v1, phia, psia, phib, psib); + _su3_times_su3d(v2,*up,v1); + _complex_times_su3(v1, ka0, v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][0], 2.*factor, v1); + + /************** direction -0 ****************************/ + + iy=g_idn[ix][0]; icy=g_lexic2eosub[iy]; + + sm = k + icy; +#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR && !defined _USE_TSPLITPAR) + um = up+1; +#else + um=&hf->gaugefield[iy][0]; +#endif + + _vector_sub(psia,sm->s0,sm->s2); + _vector_sub(psib,sm->s1,sm->s3); + + _vector_sub(phia,rr.s0,rr.s2); + _vector_sub(phib,rr.s1,rr.s3); + + + _vector_tensor_vector_add(v1, psia, phia, psib, phib); + _su3_times_su3d(v2,*um,v1); + _complex_times_su3(v1,ka0,v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][0], 2.*factor, v1); + + /*************** direction +1 **************************/ + + iy=g_iup[ix][1]; icy=g_lexic2eosub[iy]; + + sp = k + icy; +#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR && !defined _USE_TSPLITPAR) + up=um+1; +#else + up=&hf->gaugefield[ix][1]; +#endif + _vector_i_add(psia,sp->s0,sp->s3); + _vector_i_add(psib,sp->s1,sp->s2); + + _vector_i_add(phia,rr.s0,rr.s3); + _vector_i_add(phib,rr.s1,rr.s2); + + _vector_tensor_vector_add(v1, phia, psia, phib, psib); + _su3_times_su3d(v2,*up,v1); + 
_complex_times_su3(v1,ka1,v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][1], 2.*factor, v1); + + /**************** direction -1 *************************/ + + iy=g_idn[ix][1]; icy=g_lexic2eosub[iy]; + + sm = k + icy; +#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR && !defined _USE_TSPLITPAR) + um=up+1; +#else + um=&hf->gaugefield[iy][1]; +#endif + _vector_i_sub(psia,sm->s0,sm->s3); + _vector_i_sub(psib,sm->s1,sm->s2); + + _vector_i_sub(phia,rr.s0,rr.s3); + _vector_i_sub(phib,rr.s1,rr.s2); + + _vector_tensor_vector_add(v1, psia, phia, psib, phib); + _su3_times_su3d(v2,*um,v1); + _complex_times_su3(v1,ka1,v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][1], 2.*factor, v1); + + /*************** direction +2 **************************/ + + iy=g_iup[ix][2]; icy=g_lexic2eosub[iy]; + + sp = k + icy; +#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR && !defined _USE_TSPLITPAR) + up=um+1; +#else + up=&hf->gaugefield[ix][2]; +#endif + _vector_add(psia,sp->s0,sp->s3); + _vector_sub(psib,sp->s1,sp->s2); + + _vector_add(phia,rr.s0,rr.s3); + _vector_sub(phib,rr.s1,rr.s2); + + _vector_tensor_vector_add(v1, phia, psia, phib, psib); + _su3_times_su3d(v2,*up,v1); + _complex_times_su3(v1,ka2,v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][2], 2.*factor, v1); + + /***************** direction -2 ************************/ + + iy=g_idn[ix][2]; icy=g_lexic2eosub[iy]; + + sm = k + icy; +#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR && !defined _USE_TSPLITPAR) + um = up +1; +#else + um=&hf->gaugefield[iy][2]; +#endif + _vector_sub(psia,sm->s0,sm->s3); + _vector_add(psib,sm->s1,sm->s2); + + _vector_sub(phia,rr.s0,rr.s3); + _vector_add(phib,rr.s1,rr.s2); + + _vector_tensor_vector_add(v1, psia, phia, psib, phib); + _su3_times_su3d(v2,*um,v1); + _complex_times_su3(v1,ka2,v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][2], 2.*factor, v1); + + /****************** direction +3 ***********************/ + + 
iy=g_iup[ix][3]; icy=g_lexic2eosub[iy]; + + sp = k + icy; +#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR && !defined _USE_TSPLITPAR) + up=um+1; +#else + up=&hf->gaugefield[ix][3]; +#endif + _vector_i_add(psia,sp->s0,sp->s2); + _vector_i_sub(psib,sp->s1,sp->s3); + + _vector_i_add(phia,rr.s0,rr.s2); + _vector_i_sub(phib,rr.s1,rr.s3); + + _vector_tensor_vector_add(v1, phia, psia, phib, psib); + _su3_times_su3d(v2,*up,v1); + _complex_times_su3(v1, ka3, v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][3], 2.*factor, v1); + + /***************** direction -3 ************************/ + + iy=g_idn[ix][3]; icy=g_lexic2eosub[iy]; + + sm = k + icy; +#if (defined _GAUGE_COPY && !defined _USE_HALFSPINOR && !defined _USE_TSPLITPAR) + um = up+1; +#else + um=&hf->gaugefield[iy][3]; +#endif + _vector_i_sub(psia,sm->s0,sm->s2); + _vector_i_add(psib,sm->s1,sm->s3); + + _vector_i_sub(phia,rr.s0,rr.s2); + _vector_i_add(phib,rr.s1,rr.s3); + + _vector_tensor_vector_add(v1, psia, phia, psib, phib); + _su3_times_su3d(v2,*um,v1); + _complex_times_su3(v1,ka3,v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][3], 2.*factor, v1); + + /****************** end of loop ************************/ + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + +#ifdef _KOJAK_INST +#pragma pomp inst end(derivSb) +#endif +} + +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/deriv_Sb.h b/qcd/part_cpu/applications/QCD/src/kernel_D/deriv_Sb.h new file mode 100644 index 0000000000000000000000000000000000000000..86bcb266dc17793e04fc46ac662de15f37483250 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/deriv_Sb.h @@ -0,0 +1,27 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _DERIV_SB_H +#define _DERIV_SB_H + +#include "hamiltonian_field.h" + +void deriv_Sb(const int ieo, spinor * const l, spinor * const k, + hamiltonian_field_t * const hf, const double factor); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/deriv_Sb_D_psi.c b/qcd/part_cpu/applications/QCD/src/kernel_D/deriv_Sb_D_psi.c new file mode 100644 index 0000000000000000000000000000000000000000..f1febc8c95ae34854f4bd620ab57790cb78b4a79 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/deriv_Sb_D_psi.c @@ -0,0 +1,587 @@ +/*********************************************************************** + * + * Copyright (C) 2007,2008 Jan Volkholz, Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "boundary.h" +#include "xchange/xchange.h" +#include "sse.h" +#include "hamiltonian_field.h" +#include "deriv_Sb_D_psi.h" + + +#if (defined BGLnotchecked && defined XLC) + +void deriv_Sb_D_psi(spinor * const l, spinor * const k, + hamiltonian_field_t * const hf, const double factor) { + + int ix,iy, iz; + int ioff,ioff2,icx,icy, icz; + su3 * restrict up ALIGN; + su3 * restrict um ALIGN; + su3adj * restrict ddd; + static su3adj der; + static su3 v1,v2; + static su3_vector psia,psib,phia,phib; + static spinor rr; + spinor * restrict r ALIGN; + spinor * restrict sp ALIGN; + spinor * restrict sm ALIGN; + + /* We have 32 registers available */ + double _Complex reg00, reg01, reg02, reg03, reg04, reg05; + double _Complex reg10, reg11, reg12, reg13, reg14, reg15; + /* For su3 matrix, use reg00 for missing register */ + double _Complex v00, v01, v02, v10, v11, v12, v20, v21; + /* The following contains the left spinor (12 regs) and the final */ + /* su3 matrix to trace over */ + double _Complex r00, r01, r02, r10, r11, r12, r20, r21, r22, + r30, r31, r32; + +#ifdef _KOJAK_INST +# pragma pomp inst begin(derivSb) +#endif + +#pragma disjoint(*r, *sp, *sm, *up, *um, *ddd) + __alignx(16, l); + __alignx(16, k); + + if(ieo==0) { + ioff=0; + } + else { + ioff=(VOLUME+RAND)/2; + } + ioff2=(VOLUME+RAND)/2-ioff; + + /* for parallelization */ +#ifdef MPI + xchange_field(k, ieo); + xchange_field(l, (ieo+1)%2); +#endif + /************** loop over all lattice sites ****************/ + + ix=ioff; + iy=g_iup[ix][0]; icy=iy; + sp = k + icy; + _prefetch_spinor(sp); + up=&hf->gaugefield[ix][0]; + _prefetch_su3(up); + + for(icx = ioff; icx < (VOLUME+ioff); icx++){ + + /* load left vector r 
and */ + /* multiply with gamma5 */ + r = l + (icx-ioff); + ix=icx; + + /*********************** direction +0 ********************/ + + ddd = &hf->derivative[ix][0]; + _bgl_load_r0((*r).s0); + _bgl_load_r1((*r).s1); + _bgl_load_minus_r2((*r).s2); + _bgl_load_minus_r3((*r).s3); + + _bgl_load_reg0((*sp).s0); + _bgl_load_reg0_up((*sp).s1); + _bgl_load_reg1((*sp).s2); + _bgl_load_reg1_up((*sp).s3); + + _bgl_add_to_reg0_reg1(); + _bgl_add_to_reg0_up_reg1_up(); + + _bgl_add_r0_to_r2_reg1(); + _bgl_add_r1_to_r3_reg1_up(); + + iy=g_idn[ix][0]; icy=iy; + sm = k + icy; + _prefetch_spinor(sm); + um=&hf->gaugefield[iy][0]; + _prefetch_su3(um); + + _bgl_tensor_product_and_add(); + /* result in v now */ + _bgl_su3_times_v_dagger(*up); + /* result in r now */ + _bgl_complex_times_r(ka0); + _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); + + /************** direction -0 ****************************/ + + ddd = &hf->derivative[iy][0]; + _bgl_load_r0((*r).s0); + _bgl_load_r1((*r).s1); + _bgl_load_minus_r2((*r).s2); + _bgl_load_minus_r3((*r).s3); + + _bgl_load_reg0((*sm).s0); + _bgl_load_reg0_up((*sm).s1); + _bgl_load_reg1((*sm).s2); + _bgl_load_reg1_up((*sm).s3); + + _bgl_sub_from_reg0_reg1(); + _bgl_sub_from_reg0_up_reg1_up(); + + _bgl_sub_from_r0_r2_reg1(); + _bgl_sub_from_r1_r3_reg1_up(); + + iy=g_iup[ix][1]; icy=iy; /* lexicographic field: the neighbour index is used directly */ + + sp = k + icy; + _prefetch_spinor(sp); + up=&hf->gaugefield[ix][1]; + _prefetch_su3(up); + + _bgl_tensor_product_and_add_d(); + /* result in v now */ + _bgl_su3_times_v_dagger(*um); + /* result in r now */ + _bgl_complex_times_r(ka0); + _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); + + /*************** direction +1 **************************/ + + ddd = &hf->derivative[ix][1]; + _bgl_load_r0((*r).s0); + _bgl_load_r1((*r).s1); + _bgl_load_minus_r2((*r).s2); + _bgl_load_minus_r3((*r).s3); + + _bgl_load_reg0((*sp).s0); + _bgl_load_reg0_up((*sp).s1); + _bgl_load_reg1((*sp).s2); + _bgl_load_reg1_up((*sp).s3); + + _bgl_i_mul_add_to_reg0_reg1_up(); + 
_bgl_i_mul_add_to_reg0_up_reg1(); + + _bgl_i_mul_add_r0_to_r3_reg1(); + _bgl_i_mul_add_r1_to_r2_reg1_up(); + + iy=g_idn[ix][1]; icy=iy; + + sm = k + icy; + _prefetch_spinor(sm); + um=&hf->gaugefield[iy][1]; + _prefetch_su3(um); + + _bgl_tensor_product_and_add(); + /* result in v now */ + _bgl_su3_times_v_dagger(*up); + /* result in r now */ + _bgl_complex_times_r(ka1); + _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); + + /**************** direction -1 *************************/ + + ddd = &hf->derivative[iy][1]; + _bgl_load_r0((*r).s0); + _bgl_load_r1((*r).s1); + _bgl_load_minus_r2((*r).s2); + _bgl_load_minus_r3((*r).s3); + + _bgl_load_reg0((*sm).s0); /* backward neighbour sm (was sp), as in direction -0 */ + _bgl_load_reg0_up((*sm).s1); + _bgl_load_reg1((*sm).s2); + _bgl_load_reg1_up((*sm).s3); + + _bgl_i_mul_sub_from_reg0_reg1_up(); + _bgl_i_mul_sub_from_reg0_up_reg1(); + + _bgl_i_mul_sub_from_r0_r3_reg1(); + _bgl_i_mul_sub_from_r1_r2_reg1_up(); + + iy=g_iup[ix][2]; icy=iy; + + sp = k + icy; + _prefetch_spinor(sp); + up=&hf->gaugefield[ix][2]; + _prefetch_su3(up); + + _bgl_tensor_product_and_add_d(); + /* result in v now */ + _bgl_su3_times_v_dagger(*um); + /* result in r now */ + _bgl_complex_times_r(ka1); + _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); + + /*************** direction +2 **************************/ + + ddd = &hf->derivative[ix][2]; + _bgl_load_r0((*r).s0); + _bgl_load_r1((*r).s1); + _bgl_load_minus_r2((*r).s2); + _bgl_load_minus_r3((*r).s3); + + _bgl_load_reg0((*sp).s0); + _bgl_load_reg0_up((*sp).s1); + _bgl_load_reg1((*sp).s2); + _bgl_load_reg1_up((*sp).s3); + + _bgl_add_to_reg0_reg1_up(); + _bgl_sub_from_reg0_up_reg1(); + + _bgl_add_r0_to_r3_reg1(); + _bgl_sub_from_r1_r2_reg1_up(); + + iy=g_idn[ix][2]; icy=iy; + + sm = k + icy; + _prefetch_spinor(sm); + um=&hf->gaugefield[iy][2]; + _prefetch_su3(um); + + _bgl_tensor_product_and_add(); + /* result in v now */ + _bgl_su3_times_v_dagger(*up); + /* result in r now */ + _bgl_complex_times_r(ka2); + _bgl_trace_lambda_mul_add_assign((*ddd), 
2.*factor); + + /***************** direction -2 ************************/ + + ddd = &hf->derivative[iy][2]; + _bgl_load_r0((*r).s0); + _bgl_load_r1((*r).s1); + _bgl_load_minus_r2((*r).s2); + _bgl_load_minus_r3((*r).s3); + + _bgl_load_reg0((*sm).s0); /* backward neighbour sm (was sp), as in direction -0 */ + _bgl_load_reg0_up((*sm).s1); + _bgl_load_reg1((*sm).s2); + _bgl_load_reg1_up((*sm).s3); + + _bgl_sub_from_reg0_reg1_up(); + _bgl_add_to_reg0_up_reg1(); + + _bgl_sub_from_r0_r3_reg1(); + _bgl_add_r1_to_r2_reg1_up(); + + iy=g_iup[ix][3]; icy=iy; + + sp = k + icy; + _prefetch_spinor(sp); + up=&hf->gaugefield[ix][3]; + _prefetch_su3(up); + + _bgl_tensor_product_and_add_d(); + /* result in v now */ + _bgl_su3_times_v_dagger(*um); + /* result in r now */ + _bgl_complex_times_r(ka2); /* direction 2 is scaled by ka2 (was ka1) */ + _bgl_trace_lambda_mul_add_assign(*ddd, 2.*factor); + + /****************** direction +3 ***********************/ + + ddd = &hf->derivative[ix][3]; + _bgl_load_r0((*r).s0); + _bgl_load_r1((*r).s1); + _bgl_load_minus_r2((*r).s2); + _bgl_load_minus_r3((*r).s3); + + _bgl_load_reg0((*sp).s0); + _bgl_load_reg0_up((*sp).s1); + _bgl_load_reg1((*sp).s2); + _bgl_load_reg1_up((*sp).s3); + + _bgl_i_mul_add_to_reg0_reg1(); + _bgl_i_mul_sub_from_reg0_up_reg1_up(); + + _bgl_i_mul_add_r0_to_r2_reg1(); + _bgl_i_mul_sub_from_r1_r3_reg1_up(); + + iy=g_idn[ix][3]; icy=iy; + + sm = k + icy; + _prefetch_spinor(sm); + um=&hf->gaugefield[iy][3]; + _prefetch_su3(um); + + _bgl_tensor_product_and_add(); + /* result in v now */ + _bgl_su3_times_v_dagger(*up); + /* result in r now */ + _bgl_complex_times_r(ka3); + _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); + + /***************** direction -3 ************************/ + + ddd = &hf->derivative[iy][3]; + _bgl_load_r0((*r).s0); + _bgl_load_r1((*r).s1); + _bgl_load_minus_r2((*r).s2); + _bgl_load_minus_r3((*r).s3); + + _bgl_load_reg0((*sm).s0); /* backward neighbour sm (was sp) */ + _bgl_load_reg0_up((*sm).s1); + _bgl_load_reg1((*sm).s2); + _bgl_load_reg1_up((*sm).s3); + + _bgl_i_mul_sub_from_reg0_reg1(); + _bgl_i_mul_add_to_reg0_up_reg1_up(); + + 
_bgl_i_mul_sub_from_r0_r2_reg1(); + _bgl_i_mul_add_r1_to_r3_reg1_up(); + + /* something wrong here...*/ + icz=icx+1; + if(icz==((VOLUME+RAND)/2+ioff)) icz=ioff; + iz=icz; + iy=g_iup[iz][0]; icy=iy; + + sp = k + icy; + _prefetch_spinor(sp); + up=&hf->gaugefield[iz][0]; + _prefetch_su3(up); + + _bgl_tensor_product_and_add_d(); + /* result in v now */ + _bgl_su3_times_v_dagger(*um); + /* result in r now */ + _bgl_complex_times_r(ka3); + _bgl_trace_lambda_mul_add_assign((*ddd), 2.*factor); + + /****************** end of loop ************************/ + } +#ifdef _KOJAK_INST +#pragma pomp inst end(derivSb) +#endif +} + +#else + +/*----------------------------------------------------------------------------*/ + +void deriv_Sb_D_psi(spinor * const l, spinor * const k, + hamiltonian_field_t * const hf, const double factor) { +#ifdef BGL + __alignx(16, l); + __alignx(16, k); +#endif + + /* for parallelization */ +#ifdef MPI + xchange_lexicfield(k); + xchange_lexicfield(l); +#endif + +#ifdef OMP +#define static +#pragma omp parallel + { +#endif + + int ix,iy; + su3 * restrict up ALIGN; + su3 * restrict um ALIGN; + static su3 v1,v2; + static su3_vector psia,psib,phia,phib; + static spinor rr; +/* spinor * restrict r ALIGN; */ + spinor * restrict sp ALIGN; + spinor * restrict sm ALIGN; + +#ifdef OMP +#undef static +#endif + +#ifdef _KOJAK_INST +#pragma pomp inst begin(derivSb) +#endif +#ifdef XLC +#pragma disjoint(*sp, *sm, *up, *um) +#endif + + /************** loop over all lattice sites ****************/ +#ifdef OMP +#pragma omp for +#endif + for(ix = 0; ix < (VOLUME); ix++){ + rr = (*(l + ix)); + /* rr=g_spinor_field[l][icx-ioff]; */ + + /*multiply the left vector with gamma5*/ + _vector_minus_assign(rr.s2, rr.s2); + _vector_minus_assign(rr.s3, rr.s3); + + /*********************** direction +0 ********************/ + + iy=g_iup[ix][0]; + + sp = k + iy; + up=&hf->gaugefield[ix][0]; + + _vector_add(psia,(*sp).s0,(*sp).s2); + _vector_add(psib,(*sp).s1,(*sp).s3); + + 
_vector_add(phia,rr.s0,rr.s2); + _vector_add(phib,rr.s1,rr.s3); + + _vector_tensor_vector_add(v1, phia, psia, phib, psib); + _su3_times_su3d(v2,*up,v1); + _complex_times_su3(v1,ka0,v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][0], 2.*factor, v1); + + /************** direction -0 ****************************/ + + iy=g_idn[ix][0]; + + sm = k + iy; + um=&hf->gaugefield[iy][0]; + + _vector_sub(psia,(*sm).s0,(*sm).s2); + _vector_sub(psib,(*sm).s1,(*sm).s3); + + _vector_sub(phia,rr.s0,rr.s2); + _vector_sub(phib,rr.s1,rr.s3); + + _vector_tensor_vector_add(v1, psia, phia, psib, phib); + _su3_times_su3d(v2,*um,v1); + _complex_times_su3(v1,ka0,v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][0], 2.*factor, v1); + + /*************** direction +1 **************************/ + + iy=g_iup[ix][1]; + + sp = k + iy; + up=&hf->gaugefield[ix][1]; + + _vector_i_add(psia,(*sp).s0,(*sp).s3); + _vector_i_add(psib,(*sp).s1,(*sp).s2); + + _vector_i_add(phia,rr.s0,rr.s3); + _vector_i_add(phib,rr.s1,rr.s2); + + _vector_tensor_vector_add(v1, phia, psia, phib, psib); + _su3_times_su3d(v2,*up,v1); + _complex_times_su3(v1,ka1,v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][1], 2.*factor, v1); + + /**************** direction -1 *************************/ + + iy=g_idn[ix][1]; + + sm = k + iy; + um=&hf->gaugefield[iy][1]; + + _vector_i_sub(psia,(*sm).s0,(*sm).s3); + _vector_i_sub(psib,(*sm).s1,(*sm).s2); + + _vector_i_sub(phia,rr.s0,rr.s3); + _vector_i_sub(phib,rr.s1,rr.s2); + + _vector_tensor_vector_add(v1, psia, phia, psib, phib); + _su3_times_su3d(v2,*um,v1); + _complex_times_su3(v1,ka1,v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][1], 2.*factor, v1); + + /*************** direction +2 **************************/ + + iy=g_iup[ix][2]; + + sp = k + iy; + up=&hf->gaugefield[ix][2]; + + _vector_add(psia,(*sp).s0,(*sp).s3); + _vector_sub(psib,(*sp).s1,(*sp).s2); + + _vector_add(phia,rr.s0,rr.s3); + _vector_sub(phib,rr.s1,rr.s2); + + 
_vector_tensor_vector_add(v1, phia, psia, phib, psib); + _su3_times_su3d(v2,*up,v1); + _complex_times_su3(v1,ka2,v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][2], 2.*factor, v1); + + /***************** direction -2 ************************/ + + iy=g_idn[ix][2]; + + sm = k + iy; + um=&hf->gaugefield[iy][2]; + + _vector_sub(psia,(*sm).s0,(*sm).s3); + _vector_add(psib,(*sm).s1,(*sm).s2); + + _vector_sub(phia,rr.s0,rr.s3); + _vector_add(phib,rr.s1,rr.s2); + + _vector_tensor_vector_add(v1, psia, phia, psib, phib); + _su3_times_su3d(v2,*um,v1); + _complex_times_su3(v1,ka2,v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][2], 2.*factor, v1); + + /****************** direction +3 ***********************/ + + iy=g_iup[ix][3]; + + sp = k + iy; + up=&hf->gaugefield[ix][3]; + + _vector_i_add(psia,(*sp).s0,(*sp).s2); + _vector_i_sub(psib,(*sp).s1,(*sp).s3); + + _vector_i_add(phia,rr.s0,rr.s2); + _vector_i_sub(phib,rr.s1,rr.s3); + + _vector_tensor_vector_add(v1, phia, psia, phib, psib); + _su3_times_su3d(v2,*up,v1); + _complex_times_su3(v1,ka3,v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[ix][3], 2.*factor, v1); + + /***************** direction -3 ************************/ + + iy=g_idn[ix][3]; + + sm = k + iy; + um=&hf->gaugefield[iy][3]; + + _vector_i_sub(psia,(*sm).s0,(*sm).s2); + _vector_i_add(psib,(*sm).s1,(*sm).s3); + + _vector_i_sub(phia,rr.s0,rr.s2); + _vector_i_add(phib,rr.s1,rr.s3); + + _vector_tensor_vector_add(v1, psia, phia, psib, phib); + _su3_times_su3d(v2,*um,v1); + _complex_times_su3(v1,ka3,v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[iy][3], 2.*factor, v1); + + /****************** end of loop ************************/ + } +#ifdef _KOJAK_INST +#pragma pomp inst end(derivSb) +#endif + +#ifdef OMP + } /*OpenMP closing brace */ +#endif +} + +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/deriv_Sb_D_psi.h b/qcd/part_cpu/applications/QCD/src/kernel_D/deriv_Sb_D_psi.h new file mode 100644 
index 0000000000000000000000000000000000000000..0e9d70188a97cb2989d97c0cb08814d09661ef99 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/deriv_Sb_D_psi.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * + * Copyright (C) 2007,2008 Jan Volkholz, Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _DERIV_SB_D_PSI_H +#define _DERIV_SB_D_PSI_H + +#include "hamiltonian_field.h" + +void deriv_Sb_D_psi(spinor * const l, spinor * const k, + hamiltonian_field_t * const hf, const double factor); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/2+1+1_howto.text b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/2+1+1_howto.text new file mode 100644 index 0000000000000000000000000000000000000000..2f9b3e15b7bf5b6169a6392eff2d4172aadd6c95 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/2+1+1_howto.text @@ -0,0 +1,180 @@ + + ******************************** + * "2+1+1" HOWTO **** + *** Written by Andreas Nube *** + ******************************** + +Introduction: + +In this HOWTO i shall describe how to run the PHMC code with "2+1+1" +setup. "2+1+1" in this context means that we have two degenerate LIGHT +quarks and two non-degenerate HEAVY quarks. 
This HOWTO addresses the +setup of all necessary input files and also the monitoring of the +thermalization progress due to necessary re-adjustments of the +polynomial. + + 0. Building the code + ------------------------------------ + +Well, this should be quite automatic. However, you need lapack and +blas in order to build the executable. Make sure that you have +both available and configure/make the hmc code accordingly. If +successful, you will find the executable hmc_tm in your build +directory. + + 1. Set up the "hmc.input" file + ------------------------------------ + +Set the parameters for your desired simulation (only 1+1 specific parameters are +discussed here): +hmc.input: + +# hopping parameter +# set this to kappa_critical for maximum twist +kappa = 0.1234 + +# twisted mass, set this parameter to +# 2*kappa*mu_desired +# where mu_desired is the quark mass you want to simulate +# in the LIGHT quark sector +2KappaMu = 0.00005 + +# twisted mass for the heavy sector +# set this to the mean value of the mass of the HEAVY quarks +# you want to simulate e.g. M_s=0.02 M_c=0.2 ==> +2KappaMubar = 0.11 + +# split of the mass in the HEAVY quark sector +# set this to (M_c-M_s)/2 +# such that you get finally: +# M_s=PhmcMubar - PhmcEpsbar +# M_c=PhmcMubar + PhmcEpsbar +2KappaEpsbar = 0.09 + +# Setting the polynomial parameters: +# create a NDPOLY monomial first (see docu) +BeginMonomial NDPOLY + +# If you want to calculate the eigenvalues every n'th trajectory +# then set this parameter to n; if you want no eigenvalues, set this to 0. +# During thermalization you should set this to 1 or 2 to follow the evolution +# of the smallest and largest eigenvalues to adjust the approximation interval +# of the polynomial (see below) + ComputeEVFreq = 1 + +# --------------------------------- +# you should have a first estimate for the smallest and largest eigenvalue +# of the HEAVY quark operator. 
As an estimate you can take +# StildeMin = 2*(PhmcMubar^2-PhmcEpsbar^2) and +# StildeMax = 1/2/sqrt(PhmcMubar^2 + PhmcEpsbar^2) (taken from +# [1] ) +# set the following parameter to your estimate of the smallest eigenvalue +# or a little below +# + StildeMin = 0.008 + +# and this to an upper bound of all eigenvalues +# + StildeMax = 3.6 + +# an estimate for the error of an approximation of 1/sqrt(x) by a chebycheff +# polynomial is given by (found in [2]): +# delta = 2*((1-sqrt(ratio))/(1+sqrt(ratio)))^(n+1) +# where ratio = PhmcStildeMin/PhmcStildeMax +# and "n" is the degree of the chebycheff polynomial +# e.g.: if you take the values from above and want a delta of 0.0001 +# you have to set the degree to 104: + DegreeOfMDPolynomial = 104 + +# To make the calculation of the hamiltonian more precise the program uses +# a second polynomial Ptilde. +# with the following parameter you can adjust the +# precision of the whole approximation of 1/sqrt(x)= Ptilde(x)*P(x)*(1+Rtilde(x)) +# with |Rtilde(x)| ~ PhmcPrecisionPtilde (see also [1]) + + PrecisionPtilde = 1.0e-9 + +# and you have to specify on which timescale to integrate the 1+1 part + Timescale = 1 + +EndMonomial +# you will need other monomials and an integrator, see docu +# end of hmc.input + +Now you should have a proper hmc.input file. For a complete example +see the file +hmc/sample-input/sample-hmc2.input +and the comments therein. + +2. Creating a polynomial +----------------------------------- + +Change to the directory hmc/util/laguer ! +Edit the file "chebyRoot.H" ! +Find the definition of "EPSILON" (not the commented-out ones) +and set it to the "ratio" defined above: +e.g. if you have PhmcStildeMin = 0.008 , PhmcStildeMax=3.6 +then ratio = 0.002222222222222 +and you have to set EPSILON to this value in CLN notation: + + EPSILON = "0.00222222222222222e+0_700" + +Moreover, you have to set the degree of the polynomial to the correct +value, e.g.
+ +int MAXPOW = 104; + +This _must_ be identical to the degree used in the hmc code. If not, +the PHMC will fail. + +To actually compile the code the CLN library is needed, which is +available from + http://www.ginac.de/CLN/ +Please download it and install it in a convenient place. CLN provides +arbitrary precision. Edit the Makefile accordingly and run on the command +line + + make + +to make the "chebyRoot" program! +Run this program! +Now the program should have created these files + + Square_root_BR_roots.dat + normierungLocal.dat + +Attention: The files that might exist before will be overwritten. +So make a copy of these files if they took a lot of time to be computed. +(you might guess what happened to me several times ;-) + +Now copy these two files to the directory where you are going to RUN the +program together with the hmc.input file. + +Now you should be able to run the "phmc_tm" program in "2+1+1" mode. + +During the thermalization you should keep track of the lowest and largest + eigenvalue. This can be done by filtering the job output file. +cat job.12345.out|grep -A6 LAMBDA +. Alternately, you get the (actually) four smallest and four largest + eigenvalues normalized to the value of StildeMax you set in the + hmc.input file. To get the unscaled eigenvalue you have to multiply the +values by StildeMax. +If the eigenvalues you measured are outside the approximation interval + [StildeMin, StildeMax] you have to re-adjust these values (in hmc.input) and +also regenerate the files + + Square_root_BR_roots.dat + normierungLocal.dat + +according to the steps described above. + + +Have a nice simulation!
+ + + +References: + +[1] Thomas Chiarappa, "Status of the Phmc-code" found in hmc/doc +[2] Roberto Frezzotti, Karl Jansen, hep-lat/9702016, "A Polynomial Hybrid Monte Carlo Algorithm" + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/Phmc-report.pdf b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/Phmc-report.pdf new file mode 100644 index 0000000000000000000000000000000000000000..57d041efaf07e9ad2e097d11a91c0a6845466d64 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/Phmc-report.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/Polynomial-constr-notes.pdf b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/Polynomial-constr-notes.pdf new file mode 100644 index 0000000000000000000000000000000000000000..952faa951a04483a1306bd463ada977cbb642f21 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/Polynomial-constr-notes.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/Status-Phmc.pdf b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/Status-Phmc.pdf new file mode 100644 index 0000000000000000000000000000000000000000..abb00b29f25e1e22163c124529454050b3536d12 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/Status-Phmc.pdf differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/basis.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/basis.tex new file mode 100644 index 0000000000000000000000000000000000000000..87df2e0a9d39284b419dcdd0e07a848340f978ab --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/basis.tex @@ -0,0 +1,75 @@ +\subsection{QCD on a lattice} + +Quantum Chromodynamics on a hyper-cubic Euclidean space-time lattice +of size $L^3\times T$ with lattice spacing $a$ is formally described +by the action +\begin{equation} + \label{eq:action} + S = S_\mathrm{G}[U] + a^4 \sum_x \bar\psi\ D[U]\ \psi +\end{equation} +with $S_\mathrm{G}$ some suitable discretisation of the Yang-Mills +action
$F_{\mu\nu}^2/4$~\cite{Yang:1954ek}. The particular +implementation we are using can be found below in section 4.2 and +consists of plaquette and rectangular shaped Wilson loops with +particular coefficients. $D$ is a discretisation of the Dirac +operator, for which Wilson originally proposed~\cite{Wilson:1974sk} to +use the +so-called Wilson Dirac operator +\begin{equation} + \label{eq:DW} + D_W[U] = \frac{1}{2}\left[\gamma_\mu\left(\nabla_\mu + + \nabla^*_\mu\right) -a\nabla^*_\mu\nabla_\mu \right] +\end{equation} +with $\nabla_\mu$ and $\nabla_\mu^*$ +the forward and backward gauge covariant difference operators, +respectively: +\begin{equation} + \label{eq:covariant} + \begin{split} + \nabla_\mu\psi(x) &= \frac{1}{a}\Bigl[U(x,x+a\hat \mu)\psi(x+a \hat \mu) - + \psi(x)\Bigr]\, , \\ + \nabla_\mu^* \psi(x) &= + \frac{1}{a}\Bigl[\psi(x)-U^\dagger(x,x-a\hat\mu)\psi(x-a\hat\mu)\Bigr]\, ,\\ + \end{split} +\end{equation} +where we denote the $\mathrm{SU}(3)$ link variables by $U_{x,\mu}\equiv U(x,x+a\hat\mu)$. +We shall set $a\equiv 1$ in the following for convenience. +Discretising the theory is far from a unique procedure. Instead of Wilson's +original formulation one may equally well choose the +Wilson twisted mass formulation and the corresponding Dirac +operator~\cite{Frezzotti:2000nk} +\begin{equation} + \label{eq:Dtm} + D_\mathrm{tm} = (D_W[U] + m_0)\ 1_f + i \mu_q\gamma_5\tau^3 +\end{equation} +for a mass degenerate doublet of quarks. We denote by $m_0$ the bare +(Wilson) quark mass, $\mu_q$ is the bare twisted +mass parameter, $\tau^i$ the $i$-th Pauli matrix and $1_f$ the +unit matrix acting in flavour space (see appendix~\ref{sec:gammas} for +our convention). In the framework of Wilson twisted mass QCD only +flavour doublets of quarks can be simulated, however, the two quarks +do not need to be degenerate in mass.
The corresponding mass +non-degenerate flavour doublet reads~\cite{Frezzotti:2003xj} +\begin{equation} + \label{eq:Dh} + D_h(\bar\mu, \bar\epsilon) = D_\mathrm{W}\ 1_f + + i\bar\mu\gamma_5\tau^3 - \bar\epsilon \tau^1 \, . +\end{equation} +It has the property +\[ +D_h^\dagger = \tau^1\gamma_5 D_h \gamma_5 \tau^1\,. +\] +Note that this notation is not unique. Equivalently -- as used in +Ref.~\cite{Chiarappa:2006ae} -- one may write +\begin{equation} + \label{eq:altDh} + D_h'(\mu_\sigma,\mu_\delta) = D_\mathrm{W}\cdot 1_f + + i\gamma_5\mu_\sigma\tau^1 + \mu_\delta \tau^3\, , +\end{equation} +which is related to $D_h$ by $D_h' = (1+i\tau^2)D_h(1-i\tau^2)/2$ +and $(\mu_\sigma,\mu_\delta)\to(\bar\mu, -\bar\epsilon)$. + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/bibliography.bib b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/bibliography.bib new file mode 100644 index 0000000000000000000000000000000000000000..67394ce13a93bd0e3bf68242751ed86e9cc9bd32 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/bibliography.bib @@ -0,0 +1,7653 @@ +@article{Clark:2009wm, + author = "Clark, M.A. and Babich, R. and Barros, K. and Brower, + R.C. and Rebbi, C.", + title = "{Solving Lattice QCD systems of equations using mixed + precision solvers on GPUs}", + journal = "Comput.Phys.Commun.", + volume = "181", + pages = "1517-1528", + doi = "10.1016/j.cpc.2010.05.002", + year = "2010", + eprint = "0911.3191", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = ARXIV:0911.3191;%%", +} + +@article{Babich:2011np, + author = "Babich, R. and Clark, M.A. and Joo, B. and Shi, G. and + Brower, R.C. 
and others", + title = "{Scaling Lattice QCD beyond 100 GPUs}", + year = "2011", + eprint = "1109.2935", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = ARXIV:1109.2935;%%", +} + +@article{Strelchenko:2013vaa, + author = "Strelchenko, Alexei and Alexandrou, Constantia and + Koutsou, Giannis and Aviles-Casco, Alejandro Vaquero", + title = "{Implementation of the twisted mass fermion operator in + the QUDA library}", + journal = "PoS", + volume = "LATTICE2013", + pages = "415", + year = "2014", + eprint = "1311.4462", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + reportNumber = "FERMILAB-CONF-13-528-CD", + SLACcitation = "%%CITATION = ARXIV:1311.4462;%%", +} + +@article{Luscher:2012av, + author = "Luscher, Martin and Schaefer, Stefan", + title = "{Lattice QCD with open boundary conditions and + twisted-mass reweighting}", + journal = "Comput.Phys.Commun.", + volume = "184", + pages = "519-528", + doi = "10.1016/j.cpc.2012.10.003", + year = "2013", + eprint = "1206.2809", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + reportNumber = "CERN-PH-TH-2012-161", + SLACcitation = "%%CITATION = ARXIV:1206.2809;%%", +} +@article{Luscher:2010ae, + author = "Luscher, Martin", + title = "{Computational Strategies in Lattice QCD}", + pages = "331-399", + year = "2010", + eprint = "1002.4232", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + reportNumber = "CERN-PH-TH-2010-047", + SLACcitation = "%%CITATION = ARXIV:1002.4232;%%", +} +@article{Clark:2006fx, + author = "Clark, M.A. 
and Kennedy, A.D.", + title = "{Accelerating dynamical fermion computations using the + rational hybrid Monte Carlo (RHMC) algorithm with multiple + pseudofermion fields}", + journal = "Phys.Rev.Lett.", + volume = "98", + pages = "051601", + doi = "10.1103/PhysRevLett.98.051601", + year = "2007", + eprint = "hep-lat/0608015", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = HEP-LAT/0608015;%%", +} +@Article{'tHooft:1971fh, + author = "'t Hooft, G.", + title = "Renormalization of massless Yang-Mills fields", + journal = "Nucl. Phys.", + volume = "B33", + year = "1971", + pages = "173-199", + SLACcitation = "%%CITATION = NUPHA,B33,173;%%" +} +@Article{'tHooft:1971rn, + author = "'t Hooft, G.", + title = "Renormalizable lagrangians for massive Yang-Mills fields", + journal = "Nucl. Phys.", + volume = "B35", + year = "1971", + pages = "167-188", + SLACcitation = "%%CITATION = NUPHA,B35,167;%%" +} +@Unpublished{'tHooft:1972aa, + author = "'t Hooft, G.", + title = "", + note = "Unpublished remarks at the 1972 Marseille Conference + on Yang-Mills Fields" +} +@Article{'tHooft:1972fi, + author = "'t Hooft, G. and Veltman, M. J. G.", + title = "Regularization and renormalization of gauge fields", + journal = "Nucl. Phys.", + volume = "B44", + year = "1972", + pages = "189-213", + SLACcitation = "%%CITATION = NUPHA,B44,189;%%" +} +@Article{Abdel-Rehim:2004gx, + author = "Abdel-Rehim, A. M. and Lewis, R.", + title = "Twisted mass {QCD} for the pion electromagnetic form factor", + journal = "Phys. Rev.", + volume = "D71", + year = "2005", + pages = "014503", + eprint = "hep-lat/0410047", + SLACcitation = "%%CITATION = HEP-LAT 0410047;%%" +} +@Article{Abdel-Rehim:2005gz, + author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M. + ", + title = "Spectrum of quenched twisted mass lattice QCD at maximal + twist", + journal = "Phys. 
Rev.", + volume = "D71", + year = "2005", + pages = "094505", + eprint = "hep-lat/0503007", + SLACcitation = "%%CITATION = HEP-LAT/0503007;%%" +} +@Article{AbdelRehim:2004sp, + author = "Abdel-Rehim, Abdou M. and Lewis, Randy", + title = "Pion form factor with twisted mass QCD", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "140", + year = "2005", + pages = "299-301", + eprint = "hep-lat/0408033", + SLACcitation = "%%CITATION = HEP-LAT/0408033;%%" +} +@Article{AbdelRehim:2005gq, + author = "Abdel-Rehim, A. M. and Lewis, R. and Woloshyn, R. M.", + title = "Twisted mass lattice QCD and hadron phenomenology", + journal = "Int. J. Mod. Phys.", + volume = "A20", + year = "2005", + pages = "6159-6168", + SLACcitation = "%%CITATION = IMPAE,A20,6159;%%" +} +@Article{AbdelRehim:2005gz, + author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M. + ", + title = "{Spectrum of quenched twisted mass lattice QCD at maximal + twist}", + journal = "Phys. Rev.", + volume = "D71", + year = "2005", + pages = "094505", + eprint = "hep-lat/0503007", + archivePrefix = "arXiv", + doi = "10.1103/PhysRevD.71.094505", + SLACcitation = "%%CITATION = HEP-LAT/0503007;%%" +} +@Article{AbdelRehim:2005qv, + author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M. + ", + title = "The hadron spectrum from twisted mass QCD with a strange + quark", + journal = "PoS", + volume = "LAT2005", + year = "2006", + pages = "032", + eprint = "hep-lat/0509056", + SLACcitation = "%%CITATION = HEP-LAT/0509056;%%" +} +@Article{AbdelRehim:2005yx, + author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M. + ", + title = "Maximal twist and the spectrum of quenched twisted mass + lattice QCD", + journal = "PoS", + volume = "LAT2005", + year = "2006", + pages = "051", + eprint = "hep-lat/0509098", + SLACcitation = "%%CITATION = HEP-LAT/0509098;%%" +} +@Article{AbdelRehim:2006qu, + author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Petry, Robert G. + and Woloshyn, R. 
M.", + title = "The spectrum of tmLQCD with quark and link smearing", + journal = "PoS", + volume = "LAT2006", + year = "2006", + pages = "164", + eprint = "hep-lat/0610004", + SLACcitation = "%%CITATION = HEP-LAT/0610004;%%" +} +@Article{AbdelRehim:2006ra, + author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M. + and Wu, Jackson M. S.", + title = "Lattice QCD with a twisted mass term and a strange quark", + journal = "Eur. Phys. J.", + volume = "A31", + year = "2007", + pages = "773-776", + eprint = "hep-lat/0610090", + SLACcitation = "%%CITATION = HEP-LAT/0610090;%%" +} +@Article{AbdelRehim:2006ve, + author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M. + and Wu, Jackson M. S.", + title = "Strange quarks in quenched twisted mass lattice QCD", + journal = "Phys. Rev.", + volume = "D74", + year = "2006", + pages = "014507", + eprint = "hep-lat/0601036", + SLACcitation = "%%CITATION = HEP-LAT/0601036;%%" +} +@Article{Adler:1974gd, + author = "Adler, Stephen L.", + title = "{Some Simple Vacuum Polarization Phenomenology: e+ e- $\to$ + Hadrons: The mu - Mesic Atom x-Ray Discrepancy and (g-2) of + the Muon}", + journal = "Phys. Rev.", + volume = "D10", + year = "1974", + pages = "3714", + SLACcitation = "%%CITATION = PHRVA,D10,3714;%%" +} +@Article{Albanese:1987ds, + author = "Albanese, M. and others", + collaboration = "APE", + title = "Glueball masses and string tension in lattice {QCD}", + journal = "Phys. Lett.", + volume = "B192", + year = "1987", + pages = "163", + SLACcitation = "%%CITATION = PHLTA,B192,163;%%" +} +@Article{Alexandrou:2008tn, + author = "Alexandrou, C. and others", + collaboration = "ETM", + title = "{Light baryon masses with dynamical twisted mass + fermions}", + year = "2008", + eprint = "0803.3190", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0803.3190;%%" +} +@Article{AliKhan:2000iv, + author = "Ali Khan, A. 
and others", + collaboration = "CP-PACS", + title = "Chiral properties of domain-wall quarks in quenched {QCD}", + journal = "Phys. Rev.", + volume = "D63", + year = "2001", + pages = "114504", + eprint = "hep-lat/0007014", + SLACcitation = "%%CITATION = HEP-LAT 0007014;%%" +} +@Article{AliKhan:2003br, + author = "Ali Khan, A. and others", + collaboration = "QCDSF", + title = "Accelerating the hybrid Monte Carlo algorithm", + journal = "Phys. Lett.", + volume = "B564", + year = "2003", + pages = "235-240", + eprint = "hep-lat/0303026", + SLACcitation = "%%CITATION = HEP-LAT 0303026;%%" +} +@Article{AliKhan:2003mu, + author = "Ali Khan, A. and others", + title = "Accelerating Hasenbusch's acceleration of hybrid Monte + Carlo", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "129", + year = "2004", + pages = "853-855", + eprint = "hep-lat/0309078", + SLACcitation = "%%CITATION = HEP-LAT 0309078;%%" +} +@Article{Allton:1993wc, + author = "Allton, C. R. and others", + collaboration = "UK{QCD}", + title = "Gauge invariant smearing and matrix correlators using + {Wilson} fermions at Beta = 6.2", + journal = "Phys. Rev.", + volume = "D47", + year = "1993", + pages = "5128-5137", + eprint = "hep-lat/9303009", + SLACcitation = "%%CITATION = HEP-LAT 9303009;%%" +} +@Article{Allton:2004qq, + author = "Allton, C. R. and others", + collaboration = "UKQCD", + title = "Improved Wilson QCD simulations with light quark masses", + journal = "Phys. Rev.", + volume = "D70", + year = "2004", + pages = "014501", + eprint = "hep-lat/0403007", + SLACcitation = "%%CITATION = HEP-LAT/0403007;%%" +} +@Article{Aoki:1984qi, + author = "Aoki, S.", + title = "New phase structure for lattice {QCD} with {Wilson} fermions", + journal = "Phys. Rev.", + volume = "D30", + year = "1984", + pages = "2653", + SLACcitation = "%%CITATION = PHRVA,D30,2653;%%" +} +@Article{Aoki:1985jj, + author = "Aoki, S. 
and Higashijima, K.", + title = "The recovery of the chiral symmetry in lattice {Gross-Neveu} + model", + journal = "Prog. Theor. Phys.", + volume = "76", + year = "1986", + pages = "521", + SLACcitation = "%%CITATION = PTPKA,76,521;%%" +} +@Article{Aoki:1986ua, + author = "Aoki, Sinya", + title = "NUMERICAL EVIDENCE FOR A PARITY VIOLATING PHASE IN LATTICE + QCD WITH WILSON FERMION", + journal = "Phys. Lett.", + volume = "B190", + year = "1987", + pages = "140", + SLACcitation = "%%CITATION = PHLTA,B190,140;%%" +} +@Article{Aoki:1986xr, + author = "Aoki, S.", + title = "A solution to the {U(1)} problem on a lattice", + journal = "Phys. Rev. Lett.", + volume = "57", + year = "1986", + pages = "3136", + SLACcitation = "%%CITATION = PRLTA,57,3136;%%" +} +@Article{Aoki:1993vs, + author = "Aoki, S. and Boettcher, S. and Gocksch, A.", + title = "Spontaneous breaking of flavor symmetry and parity in the + Nambu-Jona-Lasinio model with {Wilson} fermions", + journal = "Phys. Lett.", + volume = "B331", + year = "1994", + pages = "157-164", + eprint = "hep-lat/9312084", + SLACcitation = "%%CITATION = HEP-LAT 9312084;%%" +} +@Article{Aoki:1995ft, + author = "Aoki, S.", + title = "On the phase structure of {QCD} with {Wilson} fermions", + journal = "Prog. Theor. Phys. Suppl.", + volume = "122", + year = "1996", + pages = "179-186", + eprint = "hep-lat/9509008", + SLACcitation = "%%CITATION = HEP-LAT 9509008;%%" +} +@Article{Aoki:1995yf, + author = "Aoki, S. and Ukawa, A. and Umemura, T.", + title = "Finite temperature phase structure of lattice {QCD} with + {Wilson} quark action", + journal = "Phys. Rev. Lett.", + volume = "76", + year = "1996", + pages = "873-876", + eprint = "hep-lat/9508008", + SLACcitation = "%%CITATION = HEP-LAT 9508008;%%" +} +@Article{Aoki:1997fm, + author = "Aoki, S.", + title = "Phase structure of lattice {QCD} with {Wilson} fermion at + finite temperature", + journal = "Nucl. Phys. Proc. 
Suppl.", + volume = "60A", + year = "1998", + pages = "206-219", + eprint = "hep-lat/9707020", + SLACcitation = "%%CITATION = HEP-LAT 9707020;%%" +} +@Article{Aoki:2001xq, + author = "Aoki, S. and others", + collaboration = "JL{QCD}", + title = "Non-trivial phase structure of {N(f)} = 3 {QCD} with {O(a)}- + improved {Wilson} fermion at zero temperature", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "263-265", + eprint = "hep-lat/0110088", + SLACcitation = "%%CITATION = HEP-LAT 0110088;%%" +} +@Article{Aoki:2002vt, + author = "Aoki, Y. and others", + title = "Domain wall fermions with improved gauge actions", + journal = "Phys. Rev.", + volume = "D69", + year = "2004", + pages = "074504", + eprint = "hep-lat/0211023", + SLACcitation = "%%CITATION = HEP-LAT 0211023;%%" +} +@Article{Aoki:2004iq, + author = "Aoki, S. and others", + collaboration = "JL{QCD}", + title = "Bulk first-order phase transition in three-flavor lattice + {QCD} with {O(a)}-improved {Wilson} fermion action at zero + temperature", + year = "2004", + eprint = "hep-lat/0409016", + SLACcitation = "%%CITATION = HEP-LAT 0409016;%%" +} +@Article{Aoki:2004ta, + author = "Aoki, Sinya and B{\"a}r, Oliver", + title = "Twisted-mass {QCD}, {O}(a) improvement and {Wilson} chiral + perturbation theory", + journal = "Phys. Rev.", + volume = "D70", + year = "2004", + pages = "116011", + eprint = "hep-lat/0409006", + SLACcitation = "%%CITATION = HEP-LAT 0409006;%%" +} +@Article{Aoki:2005ii, + author = "Aoki, S. and B{\"a}r, O.", + title = "Determining the low energy parameters of {Wilson} chiral + perturbation theory", + year = "2005", + eprint = "hep-lat/0509002", + SLACcitation = "%%CITATION = HEP-LAT 0509002;%%" +} +@Article{Arnold:2003sx, + author = "Arnold, Guido and others", + title = "Numerical methods for the QCD overlap operator. 
II: Optimal + Krylov subspace methods", + year = "2003", + eprint = "hep-lat/0311025", + SLACcitation = "%%CITATION = HEP-LAT 0311025;%%" +} +@Article{Atiyah:1971rm, + author = "Atiyah, M. F. and Singer, I. M.", + title = "The Index of elliptic operators. 5", + journal = "Annals Math.", + volume = "93", + year = "1971", + pages = "139-149", + SLACcitation = "%%CITATION = ANMAA,93,139;%%" +} +@Article{Aubin:2006cc, + author = "Aubin, C. and Blum, T.", + title = "{Hadronic contributions to the muon g-2 from the lattice}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "162", + year = "2006", + pages = "251-255", + SLACcitation = "%%CITATION = NUPHZ,162,251;%%" +} +@Article{Aubin:2006xv, + author = "Aubin, C. and Blum, T.", + title = "{Calculating the hadronic vacuum polarization and leading + hadronic contribution to the muon anomalous magnetic + moment with improved staggered quarks}", + journal = "Phys. Rev.", + volume = "D75", + year = "2007", + pages = "114502", + eprint = "hep-lat/0608011", + SLACcitation = "%%CITATION = HEP-LAT/0608011;%%" +} +@Article{BAGEL, + author="P.A. Boyle", + year=2005, + eprint=" http://www.ph.ed.ac.uk/\~{ }paboyle/bagel/Bagel.html" + } +@Article{Baikov:2004ku, + author = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.", + title = "{Vacuum polarization in pQCD: First complete O(alpha(s)**4) + result}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "135", + year = "2004", + pages = "243-246", + SLACcitation = "%%CITATION = NUPHZ,135,243;%%" +} +@Article{Baikov:2005rw, + author = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.", + title = "{Scalar correlator at O(alpha(s)**4), Higgs decay into b- + quarks and bounds on the light quark masses}", + journal = "Phys. Rev. Lett.", + volume = "96", + year = "2006", + pages = "012003", + eprint = "hep-ph/0511063", + SLACcitation = "%%CITATION = HEP-PH/0511063;%%" +} +@Article{Baikov:2008jh, + author = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. 
H.", + title = "{Hadronic Z- and tau-Decays in Order alpha_s^4}", + year = "2008", + eprint = "0801.1821", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = ARXIV:0801.1821;%%" +} +@Article{Bali:2000vr, + author = "Bali, G. S. and others", + collaboration = "TXL", + title = "Static potentials and glueball masses from {QCD} simulations + with {Wilson} sea quarks", + journal = "Phys. Rev.", + volume = "D62", + year = "2000", + pages = "054503", + eprint = "hep-lat/0003012", + SLACcitation = "%%CITATION = HEP-LAT 0003012;%%" +} +@Article{Bali:2004pb, + author = "Bali, G. S. and others", + title = "String breaking with dynamical {Wilson} fermions", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "140", + pages = "609-611", + year = "2004", + eprint = "hep-lat/0409137", + SLACcitation = "%%CITATION = HEP-LAT 0409137;%%" +} +@Article{Bali:2005fu, + author = "Bali, G. S. and Neff, H. and Duessel, T. and + Lippert, T. and Schilling, K.", + collaboration = "SESAM", + title = "Observation of string breaking in {QCD}", + journal = "Phys. Rev.", + volume = "D71", + year = "2005", + pages = "114513", + eprint = "hep-lat/0505012", + SLACcitation = "%%CITATION = HEP-LAT 0505012;%%" +} +@Article{Bar:2006zj, + author = "B{\"a}r, O. and Jansen, K. and Schaefer, S. and Scorzato, L. + and Shindler, A.", + title = "Overlap fermions on a twisted mass sea", + year = "2006", + eprint = "hep-lat/0609039", + SLACcitation = "%%CITATION = HEP-LAT 0609039;%%" +} +@Article{Baxter:1993bv, + author = "Baxter, R. M. and others", + collaboration = "UK{QCD}", + title = "Quenched heavy light decay constants", + journal = "Phys. Rev.", + volume = "D49", + year = "1994", + pages = "1594-1605", + eprint = "hep-lat/9308020", + SLACcitation = "%%CITATION = HEP-LAT 9308020;%%" +} +@Article{Beane:2004tw, + author = "Beane, Silas R.", + title = "{Nucleon masses and magnetic moments in a finite volume}", + journal = "Phys.
Rev.", + volume = "D70", + year = "2004", + pages = "034507", + eprint = "hep-lat/0403015", + archivePrefix = "arXiv", + doi = "10.1103/PhysRevD.70.034507", + SLACcitation = "%%CITATION = HEP-LAT/0403015;%%" +} +@Article{Becher:1999he, + author = "Becher, Thomas and Leutwyler, H.", + title = "Baryon chiral perturbation theory in manifestly Lorentz + invariant form", + journal = "Eur. Phys. J.", + volume = "C9", + year = "1999", + pages = "643-671", + eprint = "hep-ph/9901384", + SLACcitation = "%%CITATION = HEP-PH/9901384;%%" +} +@Article{Bietenholz:2004sa, + author = "Bietenholz, W. and others", + collaboration = "\xlf", + title = "Comparison between overlap and twisted mass fermions + towards the chiral limit", + year = "2004", + eprint = "hep-lat/0409109", + SLACcitation = "%%CITATION = HEP-LAT 0409109;%%" +} +@Article{Bietenholz:2004wv, + author = "Bietenholz, W. and others", + collaboration = "\xlf", + title = "Going chiral: Overlap versus twisted mass fermions", + journal = "JHEP", + volume = "12", + year = "2004", + pages = "044", + eprint = "hep-lat/0411001", + SLACcitation = "%%CITATION = HEP-LAT 0411001;%%" +} +@Article{Blossier:2007vv, + author = "Blossier, B. and others", + collaboration = "ETM", + title = "{Light quark masses and pseudoscalar decay constants from + Nf=2 Lattice QCD with twisted mass fermions}", + year = "2007", + eprint = "0709.4574", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = ARXIV:0709.4574;%%" +} +@Article{Blum:1994eh, + author = "Blum, Tom and others", + title = "QCD thermodynamics with Wilson quarks at large kappa", + journal = "Phys. Rev.", + volume = "D50", + year = "1994", + pages = "3377-3381", + eprint = "hep-lat/9404006", + SLACcitation = "%%CITATION = HEP-LAT 9404006;%%" +} +@Article{Blum:2000kn, + author = "Blum, T. and others", + title = "Quenched lattice {QCD} with domain wall fermions and the + chiral limit", + journal = "Phys. 
Rev.", + volume = "D69", + year = "2004", + pages = "074502", + eprint = "hep-lat/0007038", + SLACcitation = "%%CITATION = HEP-LAT 0007038;%%" +} +@Article{Bodin:2005gg, + author = "Bodin, F. and others", + collaboration = "ApeNEXT", + title = "The {apeNEXT} project", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "140", + year = "2005", + pages = "176-182", + SLACcitation = "%%CITATION = NUPHZ,140,176;%%" +} +@Article{Bolder:2000un, + author = "Bolder, B. and others", + title = "A high precision study of the Q anti-Q potential from + {Wilson} loops in the regime of string breaking", + journal = "Phys. Rev.", + volume = "D63", + year = "2001", + pages = "074504", + eprint = "hep-lat/0005018", + SLACcitation = "%%CITATION = HEP-LAT 0005018;%%" +} +@Article{Boucaud:2007uk, + author = "Boucaud, Ph. and others", + collaboration = "ETM", + title = "Dynamical twisted mass fermions with light quarks", + year = "2007", + eprint = "hep-lat/0701012", + SLACcitation = "%%CITATION = HEP-LAT 0701012;%%" +} +@Article{Boucaud:2008xu, + author = "Boucaud, Ph. and others", + collaboration = "ETM", + title = "{Dynamical Twisted Mass Fermions with Light Quarks: + Simulation and Analysis Details}", + journal = "Comput. Phys. Commun.", + volume = "179", + year = "2008", + pages = "695-715", + eprint = "0803.0224", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + doi = "10.1016/j.cpc.2008.06.013", + SLACcitation = "%%CITATION = 0803.0224;%%" +} +@Article{Boughezal:2006px, + author = "Boughezal, R. and Czakon, M. and Schutzmeier, T.", + title = "{Charm and bottom quark masses from perturbative QCD}", + journal = "Phys. Rev.", + volume = "D74", + year = "2006", + pages = "074006", + eprint = "hep-ph/0605023", + SLACcitation = "%%CITATION = HEP-PH/0605023;%%" +} +@Article{Boyle:2005fb, + author = "Boyle, P. A. and others", + title = "{QCDOC}: Project status and first results", + journal = "J. Phys. Conf. 
Ser.", + volume = "16", + year = "2005", + pages = "129-139", + SLACcitation = "%%CITATION = 00462,16,129;%%" +} + +@Article{Brower:1994er, + author = "Brower, R. C. and Levi, A. R. and Orginos, K.", + title = "Extrapolation methods for the Dirac inverter in hybrid + Monte Carlo", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "42", + year = "1995", + pages = "855-857", + eprint = "hep-lat/9412004", + SLACcitation = "%%CITATION = HEP-LAT 9412004;%%" +} + +@Article{Brower:1995vx, + author = "Brower, R. C. and Ivanenko, T. and Levi, A. R. and Orginos, + K. N.", + title = "Chronological inversion method for the Dirac matrix in + hybrid Monte Carlo", + journal = "Nucl. Phys.", + volume = "B484", + year = "1997", + pages = "353-374", + eprint = "hep-lat/9509012", + SLACcitation = "%%CITATION = HEP-LAT 9509012;%%" +} +@Article{Bunk:1995uv, + author = "Bunk, B. and others", + title = "A New simulation algorithm for lattice {QCD} with dynamical + quarks", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "42", + year = "1995", + pages = "49-55", + eprint = "hep-lat/9411016", + SLACcitation = "%%CITATION = HEP-LAT 9411016;%%" +} +@Article{Bunk:1998rm, + author = "Bunk, B. and Elser, S. and Frezzotti, R. and Jansen, + K.", + title = "Ordering monomial factors of polynomials in the product + representation", + journal = "Comput. Phys. Commun.", + volume = "118", + year = "1999", + pages = "95-109", + eprint = "hep-lat/9805026", + SLACcitation = "%%CITATION = HEP-LAT 9805026;%%" +} +@Article{Burrage:1998a, + author = " K. Burrage and J. Erhel", + title = "On the performance of various adaptive preconditioned GMRES strategies", + journal = "Num. Lin. Alg. with Appl.", + year = "1998", + volume = "5", + pages = "101-121" +} +@Article{Campbell:1987nv, + author = "Campbell, N. A. and Huntley, A. and Michael, C.", + title = "Heavy quark potentials and hybrid mesons from SU(3) lattice + gauge theory", + journal = "Nucl. 
Phys.", + volume = "B306", + year = "1988", + pages = "51", + SLACcitation = "%%CITATION = NUPHA,B306,51;%%" +} +@Article{Capitani:2005jp, + author = "Capitani, S. and others", + title = "Parton distribution functions with twisted mass fermions", + journal = "Phys. Lett.", + volume = "B639", + year = "2006", + pages = "520-526", + eprint = "hep-lat/0511013", + SLACcitation = "%%CITATION = HEP-LAT 0511013;%%" +} +@Article{Chen:2003im, + author = "Chen, Y. and others", + title = "Chiral logarithms in quenched {QCD}", + journal = "Phys. Rev.", + volume = "D70", + year = "2004", + pages = "034502", + eprint = "hep-lat/0304005", + SLACcitation = "%%CITATION = HEP-LAT 0304005;%%" +} +@Book{Cheng:2000ct, + author = "Cheng, T. P. and Li, L. F.", + title = "Gauge theory of elementary particle physics: Problems and + solutions", + publisher = "Oxford, UK: Clarendon", + year = "2000", + pages = "306", + edition = "", +} +@Article{Chetyrkin:1990kr, + author = "Chetyrkin, K. G. and K{\"u}hn, Johann H.", + title = "{Mass corrections to the Z decay rate}", + journal = "Phys. Lett.", + volume = "B248", + year = "1990", + pages = "359-364", + SLACcitation = "%%CITATION = PHLTA,B248,359;%%" +} +@Article{Chetyrkin:1996cf, + author = "Chetyrkin, K. G. and K{\"u}hn, Johann H. and Steinhauser, M.", + title = "{Three-loop polarization function and O(alpha(s)**2) + corrections to the production of heavy quarks}", + journal = "Nucl. Phys.", + volume = "B482", + year = "1996", + pages = "213-240", + eprint = "hep-ph/9606230", + SLACcitation = "%%CITATION = HEP-PH/9606230;%%" +} +@Article{Chetyrkin:1997mb, + author = "Chetyrkin, K. G. and K{\"u}hn, Johann H. and Steinhauser, M.", + title = "{Heavy quark current correlators to O(alpha(s)**2)}", + journal = "Nucl. Phys.", + volume = "B505", + year = "1997", + pages = "40-64", + eprint = "hep-ph/9705254", + SLACcitation = "%%CITATION = HEP-PH/9705254;%%" +} +@Article{Chetyrkin:1998ix, + author = "Chetyrkin, K. G. and Harlander, R. 
and Steinhauser, M.", + title = "{Singlet polarization functions at O(alpha(s)**2)}", + journal = "Phys. Rev.", + volume = "D58", + year = "1998", + pages = "014012", + eprint = "hep-ph/9801432", + SLACcitation = "%%CITATION = HEP-PH/9801432;%%" +} +@Article{Chetyrkin:2000zk, + author = "Chetyrkin, K. G. and Harlander, R. V. and K{\"u}hn, Johann H.", + title = "{Quartic mass corrections to R(had) at O(alpha(s)**3)}", + journal = "Nucl. Phys.", + volume = "B586", + year = "2000", + pages = "56-72", + eprint = "hep-ph/0005139", + SLACcitation = "%%CITATION = HEP-PH/0005139;%%" +} +@Article{Chetyrkin:2006xg, + author = "Chetyrkin, K. G. and K{\"u}hn, J. H. and Sturm, C.", + title = "{Four-loop moments of the heavy quark vacuum polarization + function in perturbative QCD}", + journal = "Eur. Phys. J.", + volume = "C48", + year = "2006", + pages = "107-110", + eprint = "hep-ph/0604234", + SLACcitation = "%%CITATION = HEP-PH/0604234;%%" +} +@Article{Chiarappa:2004ry, + author = "Chiarappa, T. and others", + title = "{Comparing iterative methods for overlap and twisted mass + fermions}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "140", + year = "2005", + pages = "853-855", + eprint = "hep-lat/0409107", + archivePrefix = "arXiv", + doi = "10.1016/j.nuclphysbps.2004.11.281", + SLACcitation = "%%CITATION = HEP-LAT/0409107;%%" +} +@Article{Chiarappa:2006ae, + author = "Chiarappa, T. and others", + title = "{Numerical simulation of {QCD} with u, d, s and c quarks in + the twisted-mass {W}ilson formulation}", + journal = "Eur. Phys. J.", + volume = "C50", + year = "2007", + pages = "373-383", + eprint = "hep-lat/0606011", + archivePrefix = "arXiv", + doi = "10.1140/epjc/s10052-006-0204-4", + SLACcitation = "%%CITATION = HEP-LAT/0606011;%%" +} +@Article{Chiarappa:2006hz, + author = "Chiarappa, T. and others", + title = "{Iterative methods for overlap and twisted mass fermions}", + year = "2008", + journal = "Comput. Sci. 
Disc.", + volume = "01", + pages = "015001", + eprint = "hep-lat/0609023", + archivePrefix = "arXiv", + SLACcitation = "%%CITATION = HEP-LAT/0609023;%%" +} +@Article{Cichy:2008gk, + author = "Cichy, K. and Gonzalez Lopez, J. and Jansen, K. and Kujawa, + A. and Shindler, A.", + title = "{Twisted Mass, Overlap and Creutz Fermions: Cut-off Effects + at Tree-level of Perturbation Theory}", + journal = "Nucl. Phys.", + volume = "B800", + year = "2008", + pages = "94-108", + eprint = "0802.3637", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + doi = "10.1016/j.nuclphysb.2008.03.004", + SLACcitation = "%%CITATION = 0802.3637;%%" +} +@Article{Clark:2004cq, + author = "Clark, M. A. and Kennedy, A. D.", + title = "Accelerating fermionic molecular dynamics", + year = "2004", + eprint = "hep-lat/0409134", + SLACcitation = "%%CITATION = HEP-LAT 0409134;%%" +} + +@Article{Clark:2005sq, + author = "Clark, M. A. and de Forcrand, Ph. and Kennedy, A. D.", + title = "Algorithm shootout: R versus RHMC", + journal = "PoS", + volume = "LAT2005", + year = "2005", + pages = "115", + eprint = "hep-lat/0510004", + SLACcitation = "%%CITATION = HEP-LAT 0510004;%%" +} +@Article{Clark:2006fx, + author = "Clark, M. A. and Kennedy, A. D.", + title = "Accelerating dynamical fermion computations using the + rational hybrid {Monte} {Carlo} ({RHMC}) algorithm with multiple + pseudofermion fields", + year = "2006", + eprint = "hep-lat/0608015", + SLACcitation = "%%CITATION = HEP-LAT 0608015;%%" +} +@Article{Colangelo:2001df, + author = "Colangelo, G. and Gasser, J. and Leutwyler, H.", + title = "{pi pi scattering}", + journal = "Nucl. Phys.", + volume = "B603", + year = "2001", + pages = "125-179", + eprint = "hep-ph/0103088", + archivePrefix = "arXiv", + doi = "10.1016/S0550-3213(01)00147-X", + SLACcitation = "%%CITATION = HEP-PH/0103088;%%" +} +@Article{Colangelo:2003hf, + author = "Colangelo, Gilberto and D{\"u}rr, Stephan", + title = "The pion mass in finite volume", + journal = "Eur. 
Phys. J.", + volume = "C33", + year = "2004", + pages = "543-553", + eprint = "hep-lat/0311023", + SLACcitation = "%%CITATION = HEP-LAT/0311023;%%" +} +@Article{Colangelo:2005gd, + author = "Colangelo, Gilberto and D{\"u}rr, Stephan and Haefeli, + Christoph", + title = "Finite volume effects for meson masses and decay + constants", + journal = "Nucl. Phys.", + volume = "B721", + year = "2005", + pages = "136-174", + eprint = "hep-lat/0503014", + SLACcitation = "%%CITATION = HEP-LAT 0503014;%%" +} +@Article{Colangelo:2006mp, + author = "Colangelo, Gilberto and Haefeli, Christoph", + title = "{Finite volume effects for the pion mass at two loops}", + journal = "Nucl. Phys.", + volume = "B744", + year = "2006", + pages = "14-33", + eprint = "hep-lat/0602017", + archivePrefix = "arXiv", + doi = "10.1016/j.nuclphysb.2006.03.010", + SLACcitation = "%%CITATION = HEP-LAT/0602017;%%" +} +@Book{Collins:1994ab, + author = "Collins, J.C.", + title = "Renormalisation", + publisher = "Cambridge University Press", + series = "Cambridge Monographs on Mathematical Physics", + year = "1994", + edition = "", +} +@Article{Creutz:1984fj, + author = "Creutz, M. and Gocksch, A. and Ogilvie, M. and + Okawa, M.", + title = "Microcanonical renormalization group", + journal = "Phys. Rev. Lett.", + volume = "53", + year = "1984", + pages = "875", + SLACcitation = "%%CITATION = PRLTA,53,875;%%" +} +@Article{Creutz:1989wt, + author = "Creutz, M. and Gocksch, A.", + title = "Higher order hybrid monte carlo algorithms", + note = "BNL-42601" +} +@Article{Creutz:1996bg, + author = "Creutz, Michael", + title = "Wilson fermions at finite temperature", + year = "1996", + eprint = "hep-lat/9608024", + SLACcitation = "%%CITATION = HEP-LAT 9608024;%%" +} +@Article{Creutz:1998ee, + author = "Creutz, M.", + title = "Evaluating Grassmann integrals", + journal = "Phys. Rev. 
Lett.", + volume = "81", + year = "1998", + pages = "3555-3558", + eprint = "hep-lat/9806037", + SLACcitation = "%%CITATION = HEP-LAT 9806037;%%" +} +@Article{Cundy:2005pi, + author = "Cundy, N. and others", + title = "Numerical Methods for the {QCD} Overlap Operator IV: Hybrid + Monte Carlo", + year = "2005", + eprint = "hep-lat/0502007", + SLACcitation = "%%CITATION = HEP-LAT 0502007;%%" +} +@Article{David:1984ys, + author = "David, F. and Hamber, H. W.", + title = "Chiral condensate with {Wilson} fermions", + journal = "Nucl. Phys.", + volume = "B248", + year = "1984", + pages = "381", + SLACcitation = "%%CITATION = NUPHA,B248,381;%%" +} +@Article{Davies:2008sw, + author = "Davies, C. T. H. and others", + collaboration = "HPQCD", + title = "{Update: Accurate Determinations of $\alpha_s$ from + Realistic Lattice QCD}", + year = "2008", + eprint = "0807.1687", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0807.1687;%%" +} +@Article{DeGrand:1990dk, + author = "DeGrand, T. A. and Rossi, P.", + title = "Conditioning techniques for dynamical fermions", + journal = "Comput. Phys. Commun.", + volume = "60", + year = "1990", + pages = "211-214", + SLACcitation = "%%CITATION = CPHCB,60,211;%%" +} +@Article{DeGrand:1990ip, + author = "DeGrand, T. A.", + title = "Resonance masses from Monte Carlo simulations (with + emphasis on the rho meson)", + journal = "Phys. Rev.", + volume = "D43", + year = "1991", + pages = "2296-2300", + SLACcitation = "%%CITATION = PHRVA,D43,2296;%%" +} +@Article{DeGrand:2002vu, + author = "DeGrand, Thomas and Hasenfratz, Anna and Kovacs, Tamas G.", + title = "Improving the chiral properties of lattice fermions", + journal = "Phys. 
Rev.", + volume = "D67", + year = "2003", + pages = "054501", + eprint = "hep-lat/0211006", + SLACcitation = "%%CITATION = HEP-LAT 0211006;%%" +} +@Article{DeTar:2007ni, + author = "DeTar, Carleton and Levkova, L.", + title = "Effects of the disconnected flavor singlet corrections on + the hyperfine splitting in charmonium", + journal = "PoS", + volume = "LAT2007", + year = "2007", + pages = "116", + eprint = "0710.1322", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = ARXIV:0710.1322;%%" +} +@Article{DelDebbio:2006cn, + author = "Del Debbio, L. and Giusti, L. and Luscher, M. and + Petronzio, R. and Tantalo, N.", + title = "QCD with light Wilson quarks on fine lattices. I: First + experiences and physics results", + journal = "JHEP", + volume = "02", + year = "2007", + pages = "056", + eprint = "hep-lat/0610059", + SLACcitation = "%%CITATION = HEP-LAT 0610059;%%" +} +@Article{DellaMorte:2000yp, + author = "Della Morte, M. and Frezzotti, R. and Heitger, J. and Sint, + S.", + title = "Non-perturbative scaling tests of twisted mass {QCD}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "94", + year = "2001", + pages = "617-621", + eprint = "hep-lat/0010091", + SLACcitation = "%%CITATION = HEP-LAT 0010091;%%" +} +@Article{DellaMorte:2001tu, + author = "Della Morte, M. and Frezzotti, R. and Heitger, J.", + title = "Quenched twisted mass {QCD} at small quark masses and in + large volume", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "260-262", + eprint = "hep-lat/0110166", + SLACcitation = "%%CITATION = HEP-LAT 0110166;%%" +} + +@Article{DellaMorte:2001ys, + author = "Della Morte, M. and Frezzotti, R. and Heitger, + J. 
and Sint, S.", + collaboration = "ALPHA", + title = "Cutoff effects in twisted mass lattice {QCD}", + journal = "JHEP", + volume = "10", + year = "2001", + pages = "041", + eprint = "hep-lat/0108019", + SLACcitation = "%%CITATION = HEP-LAT 0108019;%%" +} +@Article{DellaMorte:2003jj, + author = "Della Morte, M. and others", + collaboration = "ALPHA", + title = "Simulating the Schroedinger functional with two pseudo- + fermions", + journal = "Comput. Phys. Commun.", + volume = "156", + year = "2003", + pages = "62-72", + eprint = "hep-lat/0307008", + SLACcitation = "%%CITATION = HEP-LAT 0307008;%%" +} +@Article{DellaMorte:2003mn, + author = "Della Morte, M. and others", + collaboration = "ALPHA", + title = "Lattice HQET with exponentially improved statistical + precision", + journal = "Phys. Lett.", + volume = "B581", + year = "2004", + pages = "93-98", + eprint = "hep-lat/0307021", + SLACcitation = "%%CITATION = HEP-LAT 0307021;%%" +} +@Article{DellaMorte:2003mw, + author = "Della Morte, M. and others", + collaboration = "ALPHA", + title = "Static quarks with improved statistical precision", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "129", + year = "2004", + pages = "346-348", + eprint = "hep-lat/0309080", + SLACcitation = "%%CITATION = HEP-LAT 0309080;%%" +} +@Article{DellaMorte:2005yc, + author = "Della Morte, M. and Shindler, A. and Sommer, + R.", + title = "On lattice actions for static quarks", + year = "2005", + eprint = "hep-lat/0506008", + SLACcitation = "%%CITATION = HEP-LAT 0506008;%%" +} +@Article{Dimopoulos:2006dm, + author = "Dimopoulos, P. and others", + collaboration = "ALPHA", + title = "A precise determination of B(K) in quenched QCD", + journal = "Nucl. Phys.", + volume = "B749", + year = "2006", + pages = "69-108", + eprint = "hep-ph/0601002", + SLACcitation = "%%CITATION = HEP-PH 0601002;%%" +} +@Article{Dimopoulos:2007fn, + author = "Dimopoulos, P. 
and others", + title = "{Renormalisation of quark bilinears with Nf=2 Wilson + fermions and tree-level improved gauge action}", + journal = "PoS", + volume = "LAT2007", + year = "2007", + pages = "241", + eprint = "0710.0975", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0710.0975;%%" +} +@Article{Dimopoulos:2007qy, + author = "Dimopoulos, Petros and Frezzotti, Roberto and Herdoiza, + Gregorio and Urbach, Carsten and Wenger, Urs", + collaboration = "ETM", + title = "{Scaling and low energy constants in lattice QCD with N_f=2 + maximally twisted Wilson quarks}", + journal = "PoS", + volume = "LAT2007", + year = "2007", + pages = "102", + eprint = "0710.2498", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0710.2498;%%" +} +@Article{Dimopoulos:2008sy, + author = "Dimopoulos, Petros and others", + collaboration = "ETM", + title = "{Scaling and chiral extrapolation of pion mass and decay + constant with maximally twisted mass QCD}", + year = "2008", + eprint = "0810.2873", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0810.2873;%%" +} +@Article{Dong:2001fm, + author = "Dong, S. J. and others", + title = "Chiral properties of pseudoscalar mesons on a quenched + 20**4 lattice with overlap fermions", + journal = "Phys. Rev.", + volume = "D65", + year = "2002", + pages = "054507", + eprint = "hep-lat/0108020", + SLACcitation = "%%CITATION = HEP-LAT 0108020;%%" +} +@Article{Duane:1987de, + author = "Duane, S. and Kennedy, A. D. and Pendleton, B. J. and + Roweth, D.", + title = "{H}ybrid monte carlo", + journal = "Phys. Lett.", + volume = "B195", + year = "1987", + pages = "216-222", + SLACcitation = "%%CITATION = PHLTA,B195,216;%%" +} +@Article{Edwards:1996vs, + author = "Edwards, R. G. and Horvath, I. and Kennedy, A. D.", + title = "Instabilities and non-reversibility of molecular dynamics + trajectories", + journal = "Nucl. 
Phys.", + volume = "B484", + year = "1997", + pages = "375-402", + eprint = "hep-lat/9606004", + SLACcitation = "%%CITATION = HEP-LAT 9606004;%%" +} +@Article{Edwards:2004sx, + author = "Edwards, Robert G. and Joo, Balint", + collaboration = "SciDAC", + title = "The {Chroma} software system for lattice {QCD}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "140", + year = "2005", + pages = "832", + eprint = "hep-lat/0409003", + SLACcitation = "%%CITATION = HEP-LAT 0409003;%%" +} +@Article{Eichten:1989zv, + author = "Eichten, E. and Hill, B.", + title = "An effective field theory for the calculation of matrix + elements involving heavy quarks", + journal = "Phys. Lett.", + volume = "B234", + year = "1990", + pages = "511", + SLACcitation = "%%CITATION = PHLTA,B234,511;%%" +} +@Article{Farchioni:2002vn, + author = "Farchioni, F. and Gebert, C. and Montvay, I. + and Scorzato, L.", + title = "Numerical simulation tests with light dynamical quarks", + journal = "Eur. Phys. J.", + volume = "C26", + year = "2002", + pages = "237-251", + eprint = "hep-lat/0206008", + SLACcitation = "%%CITATION = HEP-LAT 0206008;%%" +} +@Article{Farchioni:2004fs, + author = "Farchioni, F. and others", + title = "The phase structure of lattice {QCD} with {Wilson} quarks and + renormalization group improved gluons", + journal = "Eur. Phys. J.", + volume = "C42", + year = "2005", + pages = "73-87", + eprint = "hep-lat/0410031", + SLACcitation = "%%CITATION = HEP-LAT 0410031;%%" +} +@Article{Farchioni:2004ma, + author = "Farchioni, F. and others", + title = "Exploring the phase structure of lattice {{QCD}} with twisted + mass quarks", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "140", + year = "2005", + pages = "240-245", + eprint = "hep-lat/0409098", + SLACcitation = "%%CITATION = HEP-LAT 0409098;%%" +} +@Article{Farchioni:2004us, + author = "Farchioni, F. and others", + title = "Twisted mass quarks and the phase structure of lattice + {QCD}", + journal = "Eur. Phys. 
J.", + volume = "C39", + year = "2005", + pages = "421-433", + eprint = "hep-lat/0406039", + SLACcitation = "%%CITATION = HEP-LAT 0406039;%%" +} +@Article{Farchioni:2005ec, + author = "Farchioni, Federico and others", + title = "Dynamical twisted mass fermions", + journal = "PoS", + volume = "LAT2005", + year = "2006", + pages = "072", + eprint = "hep-lat/0509131", + SLACcitation = "%%CITATION = HEP-LAT 0509131;%%" +} +@Article{Farchioni:2005hf, + author = "Farchioni, F. and others", + title = "Twisted mass fermions: Neutral pion masses from + disconnected contributions", + journal = "PoS", + volume = "LAT2005", + year = "2006", + pages = "033", + eprint = "hep-lat/0509036", + SLACcitation = "%%CITATION = HEP-LAT 0509036;%%" +} +@Article{Farchioni:2005tu, + author = "Farchioni, F. and others", + title = "Lattice spacing dependence of the first order phase + transition for dynamical twisted mass fermions", + journal = "Phys. Lett.", + volume = "B624", + year = "2005", + pages = "324-333", + eprint = "hep-lat/0506025", + SLACcitation = "%%CITATION = HEP-LAT 0506025;%%" +} +@Article{Feldmann:1999uf, + author = "Feldmann, Thorsten", + title = "{Quark structure of pseudoscalar mesons}", + journal = "Int. J. Mod. Phys.", + volume = "A15", + year = "2000", + pages = "159-207", + eprint = "hep-ph/9907491", + SLACcitation = "%%CITATION = HEP-PH/9907491;%%" +} +@Article{Feynman:1948aa, + author = "Feynman, R. P.", + title = "Space-time approach to non-relativistic quantum mechanics", + journal = "Rev. Mod. Phys.", + volume = "20", + year = "1948", + pages = "367-387", + SLACcitation = "%%CITATION = RMPHA,20,367;%%" +} +@Article{Fischer:1996th, + author = "Fischer, S. and others", + title = "A Parallel SSOR Preconditioner for Lattice {QCD}", + journal = "Comp. Phys. Commun.", + volume = "98", + year = "1996", + pages = "20-34", + eprint = "hep-lat/9602019", + SLACcitation = "%%CITATION = HEP-LAT 9602019;%%" +} +@Article{Fokkema:1998aa, + author = "Fokkema, D.~R. 
and Sleijpen, G.~L.~G. and Van~der~Vorst, H.~A.", + title = "{J}acobi-{D}avidson style {QR} and {QZ} algorithms for + the reduction of matrix pencils", + journal = "J. Sci. Comput.", + volume = "20", + year = "1998", + pages = "94-125", +} +@Article{Foster:1998vw, + author = "Foster, M. and Michael, C.", + collaboration = "UKQCD", + title = "Quark mass dependence of hadron masses from lattice {QCD}", + journal = "Phys. Rev.", + volume = "D59", + year = "1999", + pages = "074503", + eprint = "hep-lat/9810021", + SLACcitation = "%%CITATION = HEP-LAT 9810021;%%" +} +@Article{Freund, + author = "Freund, R.W.", + journal = "in Numerical Linear Algebra, L.\ Reichel, A.\ Ruttan and R.S.\ Varga (eds.)", + year = "1993", + pages = "p. 101", +} +@Article{Frezzotti:1997ym, + author = "Frezzotti, R. and Jansen, K.", + title = "A polynomial hybrid Monte Carlo algorithm", + journal = "Phys. Lett.", + volume = "B402", + year = "1997", + pages = "328-334", + eprint = "hep-lat/9702016", + SLACcitation = "%%CITATION = HEP-LAT 9702016;%%" +} +@Article{Frezzotti:1998eu, + author = "Frezzotti, R. and Jansen, K.", + title = "The {PHMC} algorithm for simulations of dynamical fermions. + {I}: Description and properties", + journal = "Nucl. Phys.", + volume = "B555", + year = "1999", + pages = "395-431", + eprint = "hep-lat/9808011", + SLACcitation = "%%CITATION = HEP-LAT 9808011;%%" +} +@Article{Frezzotti:1998yp, + author = "Frezzotti, R. and Jansen, K.", + title = "The {PHMC} algorithm for simulations of dynamical fermions. + {II}: Performance analysis", + journal = "Nucl. Phys.", + volume = "B555", + year = "1999", + pages = "432-453", + eprint = "hep-lat/9808038", + SLACcitation = "%%CITATION = HEP-LAT 9808038;%%" +} +@Article{Frezzotti:1999vv, + author = "Frezzotti, R. and Grassi, P. A. and Sint, + S. and Weisz, P.", + title = "A local formulation of lattice {QCD} without unphysical + fermion zero modes", + journal = "Nucl. Phys. Proc.
Suppl.", + volume = "83", + year = "2000", + pages = "941-946", + eprint = "hep-lat/9909003", + SLACcitation = "%%CITATION = HEP-LAT 9909003;%%" +} +@Article{Frezzotti:2000nk, + author = "Frezzotti, R. and Grassi, P. A. and Sint, + S. and Weisz, P.", + collaboration = "ALPHA", + title = "Lattice {QCD} with a chirally twisted mass term", + journal = "JHEP", + volume = "08", + year = "2001", + pages = "058", + eprint = "hep-lat/0101001", + SLACcitation = "%%CITATION = HEP-LAT 0101001;%%" +} +@Article{Frezzotti:2001du, + author = "Frezzotti, R. and Sint, S.", + title = "Some remarks on {O(a)} improved twisted mass {QCD}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "814-816", + eprint = "hep-lat/0110140", + SLACcitation = "%%CITATION = HEP-LAT 0110140;%%" +} +@Article{Frezzotti:2001ea, + author = "Frezzotti, R. and Sint, S. and Weisz, P.", + collaboration = "ALPHA", + title = "{O(a)} improved twisted mass lattice {QCD}", + journal = "JHEP", + volume = "07", + year = "2001", + pages = "048", + eprint = "hep-lat/0104014", + SLACcitation = "%%CITATION = HEP-LAT 0104014;%%" +} +@Article{Frezzotti:2003ni, + author = "Frezzotti, R. and Rossi, G. C.", + title = "Chirally improving {Wilson} fermions. {I}: {O(a)} improvement", + journal = "JHEP", + volume = "08", + year = "2004", + pages = "007", + eprint = "hep-lat/0306014", + SLACcitation = "%%CITATION = HEP-LAT 0306014;%%" +} +@Article{Frezzotti:2003xj, + author = "Frezzotti, R. and Rossi, G. C.", + title = "Twisted-mass lattice {QCD} with mass non-degenerate quarks", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "128", + year = "2004", + pages = "193-202", + eprint = "hep-lat/0311008", + SLACcitation = "%%CITATION = HEP-LAT 0311008;%%" +} +@Article{Frezzotti:2004wz, + author = "Frezzotti, R. and Rossi, G. C.", + title = "Chirally improving {Wilson} fermions. 
{II}: Four-quark + operators", + journal = "JHEP", + volume = "10", + year = "2004", + pages = "070", + eprint = "hep-lat/0407002", + SLACcitation = "%%CITATION = HEP-LAT 0407002;%%" +} +@Article{Frezzotti:2005gi, + author = "Frezzotti, R. and Martinelli, G. and Papinutto, M. and + Rossi, G. C.", + title = "Reducing cutoff effects in maximally twisted lattice {QCD} + close to the chiral limit", + journal = "JHEP", + volume = "04", + year = "2006", + pages = "038", + eprint = "hep-lat/0503034", + SLACcitation = "%%CITATION = HEP-LAT 0503034;%%" +} +@Article{Frezzotti:2007qv, + author = "Frezzotti, R. and Rossi, G.", + title = "{O(a^2) cutoff effects in Wilson fermion simulations}", + journal = "PoS", + volume = "LAT2007", + year = "2007", + pages = "277", + eprint = "0710.2492", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0710.2492;%%" +} +@Article{Frezzotti:2008dr, + author = "Frezzotti, R. and Lubicz, V. and Simula, S.", + collaboration = "ETM", + title = "{Electromagnetic form factor of the pion from twisted-mass + lattice {QCD} at {Nf}=2}", + year = "2008", + eprint = "0812.4042", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0812.4042;%%" +} +@Article{Fritzsch:1973pi, + author = "Fritzsch, H. and Gell-Mann, M. and Leutwyler, H.", + title = "Advantages of the color octet gluon picture", + journal = "Phys. Lett.", + volume = "B47", + year = "1973", + pages = "365-368", + SLACcitation = "%%CITATION = PHLTA,B47,365;%%" +} +@Article{Frommer:1994vn, + author = "Frommer, A. and Hannemann, V. and Nockel, B. and Lippert, + T. and Schilling, K.", + title = "Accelerating {Wilson} fermion matrix inversions by means of + the stabilized biconjugate gradient algorithm", + journal = "Int. J. Mod. 
Phys.", + volume = "C5", + year = "1994", + pages = "1073-1088", + eprint = "hep-lat/9404013", + SLACcitation = "%%CITATION = HEP-LAT 9404013;%%" +} +@Article{Frommer:1995ik, + author = "Frommer, Andreas and Nockel, Bertold and Gusken, Stephan + and Lippert, Thomas and Schilling, Klaus", + title = "Many masses on one stroke: Economic computation of quark + propagators", + journal = "Int. J. Mod. Phys.", + volume = "C6", + year = "1995", + pages = "627-638", + eprint = "hep-lat/9504020", + SLACcitation = "%%CITATION = HEP-LAT 9504020;%%" +} +@Article{Furman:1994ky, + author = "Furman, V. and Shamir, Y.", + title = "Axial symmetries in lattice QCD with Kaplan fermions", + journal = "Nucl. Phys.", + volume = "B439", + year = "1995", + pages = "54-78", + eprint = "hep-lat/9405004", + SLACcitation = "%%CITATION = HEP-LAT 9405004;%%" +} +@Article{Garden:1999fg, + author = "Garden, J. and Heitger, J. and Sommer, R. and + Wittig, H.", + collaboration = "ALPHA", + title = "Precision computation of the strange quark's mass in + quenched {QCD}", + journal = "Nucl. Phys.", + volume = "B571", + year = "2000", + pages = "237-256", + eprint = "hep-lat/9906013", + SLACcitation = "%%CITATION = HEP-LAT 9906013;%%" +} +@Article{Garron:2003cb, + author = "Garron, N. and Giusti, L. and Hoelbling, + C. and Lellouch, L. and Rebbi, C.", + title = "B(K) from quenched {QCD} with exact chiral symmetry", + journal = "Phys. Rev. Lett.", + volume = "92", + year = "2004", + pages = "042001", + eprint = "hep-ph/0306295", + SLACcitation = "%%CITATION = HEP-PH 0306295;%%" +} +@Article{Gasser:1982ap, + author = "Gasser, J. and Leutwyler, H.", + title = "Quark masses", + journal = "Phys. Rept.", + volume = "87", + year = "1982", + pages = "77-169", + SLACcitation = "%%CITATION = PRPLC,87,77;%%" +} + +@Article{Gasser:1983yg, + author = "Gasser, J. and Leutwyler, H.", + title = "Chiral perturbation theory to one loop", + journal = "Ann.
Phys.", + volume = "158", + year = "1984", + pages = "142", + SLACcitation = "%%CITATION = APNYA,158,142;%%" +} +@Article{Gasser:1985gg, + author = "Gasser, J. and Leutwyler, H.", + title = "Chiral perturbation theory: expansions in the mass of the + strange quark", + journal = "Nucl. Phys.", + volume = "B250", + year = "1985", + pages = "465", + SLACcitation = "%%CITATION = NUPHA,B250,465;%%" +} +@Article{Gasser:1986vb, + author = "Gasser, J. and Leutwyler, H.", + title = "LIGHT QUARKS AT LOW TEMPERATURES", + journal = "Phys. Lett.", + volume = "B184", + year = "1987", + pages = "83", + SLACcitation = "%%CITATION = PHLTA,B184,83;%%" +} +@Article{Gattringer:2003qx, + author = "Gattringer, C. and others", + collaboration = "BGR", + title = "Quenched spectroscopy with fixed-point and chirally + improved fermions", + journal = "Nucl. Phys.", + volume = "B677", + year = "2004", + pages = "3-51", + eprint = "hep-lat/0307013", + SLACcitation = "%%CITATION = HEP-LAT 0307013;%%" +} +@Article{Gell-Mann:1964nj, + author = "Gell-Mann, M.", + title = "A Schematic model of baryons and mesons", + journal = "Phys. Lett.", + volume = "8", + year = "1964", + pages = "214-215", + SLACcitation = "%%CITATION = PHLTA,8,214;%%" +} +@Article{Gell-Mann:1968rz, + author = "Gell-Mann, M. and Oakes, R. J. and Renner, B.", + title = "Behavior of current divergences under SU(3) x SU(3)", + journal = "Phys. Rev.", + volume = "175", + year = "1968", + pages = "2195-2199", + SLACcitation = "%%CITATION = PHRVA,175,2195;%%" +} +@PhdThesis{Geus:2002, + author = {R. Geus}, + title = {The Jacobi-Davidson algorithm for solving large + sparse symmetric eigenvalue problems with + application to the design of accelerator cavities}, + school = {Swiss Federal Institute Of Technology Z{\"u}rich}, + year = {2002}, + OPTkey = {DISS. ETH NO. 14734}, + OPTtype = {}, + OPTaddress = {}, + OPTmonth = {}, + OPTnote = {}, + OPTannote = {} +} +@Article{Gimenez:1998ue, + author = "Gimenez, V. and Giusti, L. 
and Rapuano, F. and Talevi, M.", + title = "Non-perturbative renormalization of quark bilinears", + journal = "Nucl. Phys.", + volume = "B531", + year = "1998", + pages = "429-445", + eprint = "hep-lat/9806006", + SLACcitation = "%%CITATION = HEP-LAT 9806006;%%" +} +@Article{Gimenez:2005nt, + author = "Gimenez, V. and Lubicz, V. and Mescia, F. and Porretti, V. + and Reyes, J.", + title = "{Operator product expansion and quark condensate from + lattice QCD in coordinate space}", + journal = "Eur. Phys. J.", + volume = "C41", + year = "2005", + pages = "535-544", + eprint = "hep-lat/0503001", + SLACcitation = "%%CITATION = HEP-LAT/0503001;%%" +} +@Article{Ginsparg:1981bj, + author = "Ginsparg, P. H. and {Wilson}, K. G.", + title = "A remnant of chiral symmetry on the lattice", + journal = "Phys. Rev.", + volume = "D25", + year = "1982", + pages = "2649", + SLACcitation = "%%CITATION = PHRVA,D25,2649;%%" +} +@Article{Giusti:1998wy, + author = "Giusti, L. and Rapuano, F. and Talevi, M. and Vladikas, A. + ", + title = "The QCD chiral condensate from the lattice", + journal = "Nucl. Phys.", + volume = "B538", + year = "1999", + pages = "249-277", + eprint = "hep-lat/9807014", + SLACcitation = "%%CITATION = HEP-LAT 9807014;%%" +} +@Article{Giusti:2001pk, + author = "Giusti, L. and Hoelbling, C. and Rebbi, C.", + title = "Light quark masses with overlap fermions in quenched {QCD}", + journal = "Phys. Rev.", + volume = "D64", + year = "2001", + pages = "114508", + eprint = "hep-lat/0108007", + note = "Erratum-ibid.D65:079903,2002", + SLACcitation = "%%CITATION = HEP-LAT 0108007;%%" +} +@Article{Giusti:2002sm, + author = "Giusti, L. and Hoelbling, C. and L{\"u}scher, M. and Wittig, H. + ", + title = "Numerical techniques for lattice QCD in the epsilon- + regime", + journal = "Comput. Phys. 
Commun.", + volume = "153", + year = "2003", + pages = "31-51", + eprint = "hep-lat/0212012", + SLACcitation = "%%CITATION = HEP-LAT 0212012;%%" +} +@Article{Giusti:2007hk, + author = "Giusti, Leonardo", + title = "Light dynamical fermions on the lattice: Toward the chiral + regime of QCD", + journal = "PoS", + volume = "LAT2006", + year = "2007", + pages = "", + eprint = "hep-lat/0702014", + SLACcitation = "%%CITATION = HEP-LAT/0702014;%%" +} +@Article{Glassner:1996gz, + author = "Gl{\"a}ssner, U. and others", + title = "How to compute {G}reen's functions for entire mass + trajectories within {K}rylov solvers", + year = "1996", + eprint = "hep-lat/9605008", + SLACcitation = "%%CITATION = HEP-LAT 9605008;%%" +} +@Article{Gockeler:1998fn, + author = "G{\"o}ckeler, M. and others", + title = "Scaling of non-perturbatively {O(a)} improved {Wilson} + fermions: Hadron spectrum, quark masses and decay + constants", + journal = "Phys. Rev.", + volume = "D57", + year = "1998", + pages = "5562-5580", + eprint = "hep-lat/9707021", + SLACcitation = "%%CITATION = HEP-LAT 9707021;%%" +} +@Article{Gorishnii:1990vf, + author = "Gorishnii, S. G. and Kataev, A. L. and Larin, S. A.", + title = "{The O (alpha-s**3) corrections to sigma-tot (e+ e- $\to$ + hadrons) and Gamma (tau- $\to$ tau-neutrino + hadrons) in + QCD}", + journal = "Phys. Lett.", + volume = "B259", + year = "1991", + pages = "144-150", + SLACcitation = "%%CITATION = PHLTA,B259,144;%%" +} +@Article{Greenberg:1964pe, + author = "Greenberg, O. W.", + title = "Spin and unitary spin independence in a paraquark model of + baryons and mesons", + journal = "Phys. Rev. Lett.", + volume = "13", + year = "1964", + pages = "598-602", + SLACcitation = "%%CITATION = PRLTA,13,598;%%" +} +@Article{Gregory:2007ce, + author = "Gregory, Eric B. and Irving, Alan and Richards, Chris M.
+ and McNeile, Craig and Hart, Alistair", + title = "Pseudoscalar Flavor-Singlet Physics with Staggered + Fermions", + year = "2007", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + eprint = "0710.1725", + SLACcitation = "%%CITATION = ARXIV:0710.1725;%%" +} +@Article{Gross:1973id, + author = "Gross, D. J. and Wilczek, F.", + title = "Ultraviolet behavior of non-Abelian gauge theories", + journal = "Phys. Rev. Lett.", + volume = "30", + year = "1973", + pages = "1343-1346", + SLACcitation = "%%CITATION = PRLTA,30,1343;%%" +} +@Article{Gross:1973ju, + author = "Gross, D. J. and Wilczek, F.", + title = "Asymptotically free gauge theories. 1", + journal = "Phys. Rev.", + volume = "D8", + year = "1973", + pages = "3633-3652", + SLACcitation = "%%CITATION = PHRVA,D8,3633;%%" +} +@Article{Gross:1974jv, + author = "Gross, D. J. and Neveu, A.", + title = "Dynamical symmetry breaking in asymptotically free field + theories", + journal = "Phys. Rev.", + volume = "D10", + year = "1974", + pages = "3235", + SLACcitation = "%%CITATION = PHRVA,D10,3235;%%" +} +@Article{Guagnelli:1998ud, + author = "Guagnelli, M. and Sommer, R. and Wittig, H.", + collaboration = "ALPHA", + title = "Precision computation of a low-energy reference scale in + quenched lattice {QCD}", + journal = "Nucl. Phys.", + volume = "B535", + year = "1998", + pages = "389-402", + eprint = "hep-lat/9806005", + SLACcitation = "%%CITATION = HEP-LAT 9806005;%%" +} +@Article{Guagnelli:2004ga, + author = "Guagnelli, M. and others", + collaboration = "Zeuthen-Rome (ZeRo)", + title = "Non-perturbative pion matrix element of a twist-2 operator + from the lattice", + journal = "Eur. Phys. J.", + volume = "C40", + year = "2005", + pages = "69-80", + eprint = "hep-lat/0405027", + SLACcitation = "%%CITATION = HEP-LAT 0405027;%%" +} +@Article{Guagnelli:2004ww, + author = "Guagnelli, M. and others", + collaboration = "Zeuthen-Rome (ZeRo)", + title = "Finite size effects of a pion matrix element", + journal = "Phys. 
Lett.", + volume = "B597", + year = "2004", + pages = "216-221", + eprint = "hep-lat/0403009", + SLACcitation = "%%CITATION = HEP-LAT 0403009;%%" +} +@Article{Guagnelli:2005zc, + author = "Guagnelli, M. and Heitger, J. and Pena, C. and Sint, S. and + Vladikas, A.", + collaboration = "ALPHA", + title = "Non-perturbative renormalization of left-left four-fermion + operators in quenched lattice QCD", + journal = "JHEP", + volume = "03", + year = "2006", + pages = "088", + eprint = "hep-lat/0505002", + SLACcitation = "%%CITATION = HEP-LAT 0505002;%%" +} +@Article{Gupta:1988js, + author = "Gupta, R. and Kilcup, G. W. and Sharpe, S. R. + ", + title = "Tuning the hybrid monte carlo algorithm", + journal = "Phys. Rev.", + volume = "D38", + year = "1988", + pages = "1278", + SLACcitation = "%%CITATION = PHRVA,D38,1278;%%" +} +@Article{Gupta:1989kx, + author = "Gupta, R. and others", + title = "{QCD} with dynamical {Wilson} fermions", + journal = "Phys. Rev.", + volume = "D40", + year = "1989", + pages = "2072", + SLACcitation = "%%CITATION = PHRVA,D40,2072;%%" +} +@Article{Gupta:1990ka, + author = "Gupta, S. and Irback, A. and Karsch, F. and + Petersson, B.", + title = "The acceptance probability in the hybrid monte carlo + method", + journal = "Phys. Lett.", + volume = "B242", + year = "1990", + pages = "437-443", + SLACcitation = "%%CITATION = PHLTA,B242,437;%%" +} +@Article{Gupta:1991sn, + author = "Gupta, R. and others", + title = "{QCD} with dynamical {Wilson} fermions. 2", + journal = "Phys. Rev.", + volume = "D44", + year = "1991", + pages = "3272-3292", + SLACcitation = "%%CITATION = PHRVA,D44,3272;%%" +} +@Unpublished{Gupta:1997nd, + author = "Gupta, R.", + title = "Introduction to lattice {QCD}", + year = "1997", + eprint = "hep-lat/9807028", + note = "Lectures given at Les Houches Summer School in Theoretical Physics, Session 68", + SLACcitation = "%%CITATION = HEP-LAT 9807028;%%" +} +@Article{Han:1965pf, + author = "Han, M. Y. 
and Nambu, Yoichiro", + title = "Three-triplet model with double SU(3) symmetry", + journal = "Phys. Rev.", + volume = "139", + year = "1965", + pages = "B1006-B1010", + SLACcitation = "%%CITATION = PHRVA,139,B1006;%%" +} +@Article{Hasenbusch:2001ne, + author = "Hasenbusch, M.", + title = "Speeding up the {H}ybrid-{M}onte-{C}arlo algorithm for dynamical + fermions", + journal = "Phys. Lett.", + volume = "B519", + year = "2001", + pages = "177-182", + eprint = "hep-lat/0107019", + SLACcitation = "%%CITATION = HEP-LAT 0107019;%%" +} +@article{Hasenbusch:2002ai, + author = "Hasenbusch, M. and Jansen, K.", + title = "{Speeding up lattice QCD simulations with clover improved + Wilson fermions}", + journal = "Nucl.Phys.", + volume = "B659", + pages = "299-320", + doi = "10.1016/S0550-3213(03)00227-X", + year = "2003", + eprint = "hep-lat/0211042", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + reportNumber = "DESY-02-200", + SLACcitation = "%%CITATION = HEP-LAT/0211042;%%", +} +@Article{Hasenbusch:2003vg, + author = "Hasenbusch, M.", + title = "Full {QCD} algorithms towards the chiral limit", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "129", + year = "2004", + pages = "27-33", + eprint = "hep-lat/0310029", + SLACcitation = "%%CITATION = HEP-LAT 0310029;%%" +} +@Article{Hasenfratz:1998jp, + author = "Hasenfratz, P.", + title = "Lattice {QCD} without tuning, mixing and current + renormalization", + journal = "Nucl. Phys.", + volume = "B525", + year = "1998", + pages = "401-409", + eprint = "hep-lat/9802007", + SLACcitation = "%%CITATION = HEP-LAT 9802007;%%" +} +@Article{Hasenfratz:1998ri, + author = "Hasenfratz, P. and Laliena, V. and Niedermayer, + F.", + title = "The index theorem in {QCD} with a finite cut-off", + journal = "Phys. Lett.", + volume = "B427", + year = "1998", + pages = "125-131", + eprint = "hep-lat/9801021", + SLACcitation = "%%CITATION = HEP-LAT 9801021;%%" +} +@Article{Hasenfratz:2001hp, + author = "Hasenfratz, A. 
and Knechtli, F.", + title = "Flavor symmetry and the static potential with hypercubic + blocking", + journal = "Phys. Rev.", + volume = "D64", + year = "2001", + pages = "034504", + eprint = "hep-lat/0103029", + SLACcitation = "%%CITATION = HEP-LAT 0103029;%%" +} +@Article{Hasenfratz:2001tw, + author = "Hasenfratz, A. and Hoffmann, R. and Knechtli, F.", + title = "The static potential with hypercubic blocking", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "418-420", + eprint = "hep-lat/0110168", + SLACcitation = "%%CITATION = HEP-LAT 0110168;%%" +} +@Article{Hashimoto:2008xg, + author = "Hashimoto, Koichi and Izubuchi, Taku", + title = "{eta' meson from two flavor dynamical domain wall + fermions}", + year = "2008", + eprint = "0803.0186", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = ARXIV:0803.0186;%%" +} +@Article{Heitger:2000ay, + author = "Heitger, J. and Sommer, R. and Wittig, H.", + collaboration = "ALPHA", + title = "Effective chiral Lagrangians and lattice {{QCD}}", + journal = "Nucl. Phys.", + volume = "B588", + year = "2000", + pages = "377-399", + eprint = "hep-lat/0006026", + note = "and references therein", + SLACcitation = "%%CITATION = HEP-LAT 0006026;%%" +} +@Article{Hernandez:1998et, + author = "Hernandez, P. and Jansen, K. and L{\"u}scher, M.", + title = "Locality properties of Neuberger's lattice Dirac operator", + journal = "Nucl. Phys.", + volume = "B552", + year = "1999", + pages = "363-378", + eprint = "hep-lat/9808010", + SLACcitation = "%%CITATION = HEP-LAT 9808010;%%" +} +@Article{Hernandez:2000sb, + author = "Hernandez, P. and Jansen, K. and Lellouch, L.", + title = "A numerical treatment of Neuberger's lattice Dirac + operator", + year = "2000", + eprint = "hep-lat/0001008", + SLACcitation = "%%CITATION = HEP-LAT 0001008;%%" +} +@Article{Hernandez:2001hq, + author = "Hernandez, P. and Jansen, K. and Lellouch, L. 
and + Wittig, H.", + title = "Scalar condensate and light quark masses from overlap + fermions", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "766-771", + eprint = "hep-lat/0110199", + SLACcitation = "%%CITATION = HEP-LAT 0110199;%%" +} +@Article{Hernandez:2001yn, + author = "Hernandez, P. and Jansen, K. and Lellouch, L. and + Wittig, H.", + title = "Non-perturbative renormalization of the quark condensate in + {Ginsparg}-{Wilson} regularizations", + journal = "JHEP", + volume = "07", + year = "2001", + pages = "018", + eprint = "hep-lat/0106011", + SLACcitation = "%%CITATION = HEP-LAT 0106011;%%" +} +@Article{Horsley:2004mx, + author = "Horsley, R. and Perlt, H. and Rakow, P. E. L. and + Schierholz, G. and Schiller, A.", + collaboration = "QCDSF", + title = "One-loop renormalisation of quark bilinears for overlap + fermions with improved gauge actions", + journal = "Nucl. Phys.", + volume = "B693", + year = "2004", + pages = "3-35", + eprint = "hep-lat/0404007", + SLACcitation = "%%CITATION = HEP-LAT 0404007;%%" +} +@Article{Ilgenfritz:2003gw, + author = "Ilgenfritz, E.-M. and Kerler, W. and + M{\"u}ller-Preu{\ss}ker, M. and Sternbeck, A. and St{\"u}ben, H.", + title = "A numerical reinvestigation of the {Aoki} phase with {N(f)} = 2 + {Wilson} fermions at zero temperature", + journal = "Phys. Rev.", + volume = "D69", + year = "2004", + pages = "074511", + eprint = "hep-lat/0309057", + SLACcitation = "%%CITATION = HEP-LAT 0309057;%%" +} +@Article{Ilgenfritz:2006tz, + author = "Ilgenfritz, E. -M. and others", + title = "Twisted mass QCD thermodynamics: First results on apeNEXT", + year = "2006", + eprint = "hep-lat/0610112", + SLACcitation = "%%CITATION = HEP-LAT 0610112;%%" +} +@Article{Iwasaki:1983ck, + author = "Iwasaki, Y.", + title = "Renormalization group analysis of lattice theories and + improved lattice action. 2. 
four-dimensional nonabelian + SU(N) gauge model", + note = "UTHEP-118" +} +@Article{Iwasaki:1985we, + author = "Iwasaki, Y.", + title = "Renormalization group analysis of lattice theories and + improved lattice action: two-dimensional nonlinear O(N) + sigma model", + journal = "Nucl. Phys.", + volume = "B258", + year = "1985", + pages = "141-156", + SLACcitation = "%%CITATION = NUPHA,B258,141;%%" +} +@Article{Iwasaki:1992hn, + author = "Iwasaki, Y. and Kanaya, K. and Sakai, S. and Yoshie, T.", + title = "Quark confinement in multi - flavor quantum + chromodynamics", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "30", + year = "1993", + pages = "327-330", + eprint = "hep-lat/9211035", + SLACcitation = "%%CITATION = HEP-LAT 9211035;%%" +} +@Article{Izubuchi:1998hy, + author = "Izubuchi, T. and Noaki, J. and Ukawa, A.", + title = "Two-dimensional lattice Gross-Neveu model with {Wilson} + fermion action at finite temperature and chemical + potential", + journal = "Phys. Rev.", + volume = "D58", + year = "1998", + pages = "114507", + eprint = "hep-lat/9805019", + SLACcitation = "%%CITATION = HEP-LAT 9805019;%%" +} +@Article{Jacobs:1983ph, + author = "Jacobs, L.", + title = "Undoubling chirally symmetric lattice fermions", + journal = "Phys. Rev. Lett.", + volume = "51", + year = "1983", + pages = "172", + SLACcitation = "%%CITATION = PRLTA,51,172;%%" +} +@Article{Jagels:1994a, + author = "Jagels, C. F. and Reichel, L.", + title = "A fast minimal residual algorithm for shifted unitary matrices", + journal = "Numer. Linear Algebra Appl.", + volume = "1(6)", + pages = "555-570", + year = "1994" +} +@Article{Jagels:1994aa, + author = "Jagels, C. F.
and Reichel, L.", + title = "A Fast Minimal Residual Algorithm for Shifted Unitary + Matrices", + journal = "Numerical Linear Algebra with Applications", + volume = "1(6)", + year = "1994", + pages = "555-570", +} +@Article{Jansen:1994ym, + author = "Jansen, K.", + title = "Domain wall fermions and chiral gauge theories", + journal = "Phys. Rept.", + volume = "273", + year = "1996", + pages = "1-54", + eprint = "hep-lat/9410018", + SLACcitation = "%%CITATION = HEP-LAT 9410018;%%" +} +@Article{Jansen:1995ck, + author = "Jansen, Karl and others", + title = "Non-perturbative renormalization of lattice QCD at all + scales", + journal = "Phys. Lett.", + volume = "B372", + year = "1996", + pages = "275-282", + eprint = "hep-lat/9512009", + SLACcitation = "%%CITATION = HEP-LAT 9512009;%%" +} +@Article{Jansen:1996cq, + author = "Jansen, K. and Liu, C.", + title = "Study of Liapunov exponents and the reversibility of + molecular dynamics algorithms", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "53", + year = "1997", + pages = "974-976", + eprint = "hep-lat/9607057", + SLACcitation = "%%CITATION = HEP-LAT 9607057;%%" +} +@Article{Jansen:1996xp, + author = "Jansen, K.", + title = "Recent developments in fermion simulation algorithms", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "53", + year = "1997", + pages = "127-133", + eprint = "hep-lat/9607051", + SLACcitation = "%%CITATION = HEP-LAT 9607051;%%" +} +@Article{Jansen:1997yt, + author = "Jansen, K. and Liu, C.", + title = "Implementation of Symanzik's improvement program for + simulations of dynamical {Wilson} fermions in lattice {QCD}", + journal = "Comput. Phys. Commun.", + volume = "99", + year = "1997", + pages = "221-234", + eprint = "hep-lat/9603008", + SLACcitation = "%%CITATION = HEP-LAT 9603008;%%" +} +@Article{Jansen:1998mx, + author = "Jansen, K. and Sommer, R.", + collaboration = "ALPHA", + title = "O(alpha) improvement of lattice {QCD} with two flavors of + {Wilson} quarks", + journal = "Nucl.
Phys.", + volume = "B530", + year = "1998", + pages = "185-203", + eprint = "hep-lat/9803017", + SLACcitation = "%%CITATION = HEP-LAT 9803017;%%" +} +@Article{Jansen:2003ir, + author = "Jansen, K. and Shindler, A. and Urbach, C. and + Wetzorke, I.", + collaboration = "\xlf", + title = "Scaling test for {Wilson} twisted mass {QCD}", + journal = "Phys. Lett.", + volume = "B586", + year = "2004", + pages = "432-438", + eprint = "hep-lat/0312013", + SLACcitation = "%%CITATION = HEP-LAT 0312013;%%" +} +@Article{Jansen:2003jq, + author = "Jansen, K. and Nagai, K.-I.", + title = "Reducing residual-mass effects for domain-wall fermions", + journal = "JHEP", + volume = "12", + year = "2003", + pages = "038", + eprint = "hep-lat/0305009", + SLACcitation = "%%CITATION = HEP-LAT 0305009;%%" +} +@Article{Jansen:2003nt, + author = "Jansen, K.", + title = "Actions for dynamical fermion simulations: Are we ready to + go?", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "129", + year = "2004", + pages = "3-16", + eprint = "hep-lat/0311039", + SLACcitation = "%%CITATION = HEP-LAT 0311039;%%" +} +@Article{Jansen:2005cg, + author = "Jansen, K. and others", + collaboration = "\xlf", + title = "Flavour breaking effects of {Wilson} twisted mass fermions", + journal = "Phys. Lett.", + volume = "B624", + year = "2005", + pages = "334-341", + eprint = "hep-lat/0507032", + SLACcitation = "%%CITATION = HEP-LAT 0507032;%%" +} +@Unpublished{Jansen:2005chi, + author = {Jansen, K. and others}, +collaborations = {\xlf}, + title = {}, + note = {in preparation}, + OPTkey = {}, + OPTmonth = {}, + year = {2005}, + OPTannote = {} +} +@Article{Jansen:2005gf, + author = "Jansen, K. and Papinutto, M. and Shindler, A. and Urbach, + C. and Wetzorke, I.", + collaboration = "\xlf", + title = "Light quarks with twisted mass fermions", + journal = "Phys. 
Lett.", + volume = "B619", + year = "2005", + pages = "184-191", + eprint = "hep-lat/0503031", + SLACcitation = "%%CITATION = HEP-LAT 0503031;%%" +} +@Article{Jansen:2005kk, + author = "Jansen, K. and Papinutto, M. and Shindler, A. and Urbach, + C. and Wetzorke, I.", + collaboration = "\xlf", + title = "Quenched scaling of {Wilson} twisted mass fermions", + journal = "JHEP", + volume = "09", + year = "2005", + pages = "071", + eprint = "hep-lat/0507010", + SLACcitation = "%%CITATION = HEP-LAT 0507010;%%" +} +@Article{Jansen:2005yp, + author = "Jansen, Karl and Shindler, Andrea and Urbach, Carsten and + Wenger, Urs", + title = "{HMC} algorithm with multiple time scale integration and mass + preconditioning", + journal = "PoS", + volume = "LAT2005", + year = "2006", + pages = "118", + eprint = "hep-lat/0510064", + SLACcitation = "%%CITATION = HEP-LAT 0510064;%%" +} +@Article{Jansen:2006ks, + author = "Jansen, Karl", + title = "Status report on ILDG activities", + year = "2006", + eprint = "hep-lat/0609012", + SLACcitation = "%%CITATION = HEP-LAT 0609012;%%" +} +@Article{Jansen:2006rf, + author = "Jansen, Karl and Urbach, Carsten", + collaboration = "ETM", + title = "First results with two light flavours of quarks with + maximally twisted mass", + year = "2006", + eprint = "hep-lat/0610015", + SLACcitation = "%%CITATION = HEP-LAT 0610015;%%" +} +@Article{Jansen:2008wv, + author = "Jansen, K. and Michael, C. and Urbach, C.", + collaboration = "ETM", + title = "The eta' meson from lattice {QCD}", + year = "2008", + eprint = "0804.3871", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0804.3871;%%" +} +@Article{Jansen:2008zz, + author = "Jansen, K. and Michael, C. and Urbach, C.", + title = "{The eta-prime meson from lattice QCD}", + journal = "Eur. Phys. 
J.", + volume = "C58", + year = "2008", + pages = "261-269", + doi = "10.1140/epjc/s10052-008-0764-6", + SLACcitation = "%%CITATION = EPHJA,C58,261;%%" +} +@Unpublished{Jegerlehner:1996pm, + author = "Jegerlehner, Beat", + title = "Krylov space solvers for shifted linear systems", + year = "1996", + eprint = "hep-lat/9612014", + note = "unpublished", + SLACcitation = "%%CITATION = HEP-LAT 9612014;%%" +} +@Article{Jegerlehner:1997rn, + author = "Jegerlehner, B.", + title = "Multiple mass solvers", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "63", + year = "1998", + pages = "958-960", + eprint = "hep-lat/9708029", + SLACcitation = "%%CITATION = HEP-LAT 9708029;%%" +} +@Article{Jegerlehner:2003qp, + author = "Jegerlehner, F.", + title = "Theoretical precision in estimates of the hadronic + contributions to (g-2)mu and alpha(QED)(M(Z))", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "126", + year = "2004", + pages = "325-334", + eprint = "hep-ph/0310234", + SLACcitation = "%%CITATION = HEP-PH 0310234;%%" +} + +@Article{Jenkins:1990jv, + author = "Jenkins, Elizabeth Ellen and Manohar, Aneesh V.", + title = "Baryon chiral perturbation theory using a heavy fermion + Lagrangian", + journal = "Phys. Lett.", + volume = "B255", + year = "1991", + pages = "558-562", + SLACcitation = "%%CITATION = PHLTA,B255,558;%%" +} +@Article{Kaiser:1998ds, + author = "Kaiser, Roland and Leutwyler, H.", + title = "{Pseudoscalar decay constants at large N(c)}", + year = "1998", + eprint = "hep-ph/9806336", + SLACcitation = "%%CITATION = HEP-PH/9806336;%%" +} +@Article{Kalkreuter:1995mm, + author = "Kalkreuter, Thomas and Simma, Hubert", + title = "An Accelerated conjugate gradient algorithm to compute low + lying eigenvalues: A Study for the Dirac operator in SU(2) + lattice QCD", + journal = "Comput. Phys. 
Commun.", + volume = "93", + year = "1996", + pages = "33-47", + eprint = "hep-lat/9507023", + SLACcitation = "%%CITATION = HEP-LAT 9507023;%%" +} +@Article{Kalkreuter:1996mm, + author = "Kalkreuter, T. and Simma, H.", + title = "An Accelerated conjugate gradient algorithm to compute low + lying eigenvalues: A Study for the Dirac operator in SU(2) + lattice {QCD}", + journal = "Comput. Phys. Commun.", + volume = "93", + year = "1996", + pages = "33-47", + eprint = "hep-lat/9507023", + SLACcitation = "%%CITATION = HEP-LAT 9507023;%%" +} +@Article{Kaplan:1992bt, + author = "Kaplan, D. B.", + title = "A Method for simulating chiral fermions on the lattice", + journal = "Phys. Lett.", + volume = "B288", + year = "1992", + pages = "342-347", + eprint = "hep-lat/9206013", + SLACcitation = "%%CITATION = HEP-LAT 9206013;%%" +} +@Article{Karsten:1980wd, + author = "Karsten, L. H. and Smit, J.", + title = "Lattice fermions: species doubling, chiral invariance, and + the triangle anomaly", + journal = "Nucl. Phys.", + volume = "B183", + year = "1981", + pages = "103", + SLACcitation = "%%CITATION = NUPHA,B183,103;%%" +} +@Article{Kennedy:1990bv, + author = "Kennedy, A. D. and Pendleton, B.", + title = "Acceptances and autocorrelations in hybrid Monte Carlo", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "20", + year = "1991", + pages = "118-121", + SLACcitation = "%%CITATION = NUPHZ,20,118;%%" +} +@Article{Knechtli:1998gf, + author = "Knechtli, F. and Sommer, R.", + collaboration = "ALPHA", + title = "String breaking in SU(2) gauge theory with scalar matter + fields", + journal = "Phys. Lett.", + volume = "B440", + year = "1998", + pages = "345-352", + eprint = "hep-lat/9807022", + SLACcitation = "%%CITATION = HEP-LAT 9807022;%%" +} +@Article{Knechtli:2000df, + author = "Knechtli, F. and Sommer, R.", + collaboration = "ALPHA", + title = "String breaking as a mixing phenomenon in the SU(2) Higgs + model", + journal = "Nucl. 
Phys.", + volume = "B590", + year = "2000", + pages = "309-328", + eprint = "hep-lat/0005021", + SLACcitation = "%%CITATION = HEP-LAT 0005021;%%" +} +@Article{Lacock:1994qx, + author = "Lacock, P. and McKerrell, A. and Michael, C. and Stopher, + I. M. and Stephenson, P. W.", + collaboration = "UKQCD", + title = "Efficient hadronic operators in lattice gauge theory", + journal = "Phys. Rev.", + volume = "D51", + year = "1995", + pages = "6403-6410", + eprint = "hep-lat/9412079", + SLACcitation = "%%CITATION = HEP-LAT 9412079;%%" +} +@Article{Lepage:1992xa, + author = "Lepage, G. Peter and Mackenzie, Paul B.", + title = "On the viability of lattice perturbation theory", + journal = "Phys. Rev.", + volume = "D48", + year = "1993", + pages = "2250-2264", + eprint = "hep-lat/9209022", + SLACcitation = "%%CITATION = HEP-LAT 9209022;%%" +} +@Article{Lepage:2001ym, + author = "Lepage, G. P. and others", + title = "{Constrained curve fitting}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "12-20", + eprint = "hep-lat/0110175", + archivePrefix = "arXiv", + doi = "10.1016/S0920-5632(01)01638-3", + SLACcitation = "%%CITATION = HEP-LAT/0110175;%%" +} +@Article{Lesk:2002gd, + author = "Lesk, V. I. and others", + collaboration = "CP-PACS", + title = "Flavor singlet meson mass in the continuum limit in two- + flavor lattice QCD", + journal = "Phys. Rev.", + volume = "D67", + year = "2003", + pages = "074503", + eprint = "hep-lat/0211040", + SLACcitation = "%%CITATION = HEP-LAT/0211040;%%" +} +@Article{Leutwyler:1997yr, + author = "Leutwyler, H.", + title = "{On the 1/N-expansion in chiral perturbation theory}", + journal = "Nucl. Phys. Proc. 
Suppl.", + volume = "64", + year = "1998", + pages = "223-231", + eprint = "hep-ph/9709408", + SLACcitation = "%%CITATION = HEP-PH/9709408;%%" +} +@Article{Leutwyler:2006qq, + author = "Leutwyler, H.", + title = "pi pi scattering", + year = "2006", + eprint = "hep-ph/0612112", + SLACcitation = "%%CITATION = HEP-PH 0612112;%%" +} +@Article{Liu:1997fs, + author = "Liu, C. and Jaster, A. and Jansen, K.", + title = "Liapunov exponents and the reversibility of molecular + dynamics algorithms", + journal = "Nucl. Phys.", + volume = "B524", + year = "1998", + pages = "603-617", + eprint = "hep-lat/9708017", + SLACcitation = "%%CITATION = HEP-LAT 9708017;%%" +} +@Article{Luscher:1985dn, + author = "Luscher, M.", + title = "{Volume Dependence of the Energy Spectrum in Massive + Quantum Field Theories. 1. Stable Particle States}", + journal = "Commun. Math. Phys.", + volume = "104", + year = "1986", + pages = "177", + doi = "10.1007/BF01211589", + SLACcitation = "%%CITATION = CMPHA,104,177;%%" +} +@Article{Luscher:1990ck, + author = "L{\"u}scher, M. and Wolff, U.", + title = "How to calculate the elastic scattering matrix in two- + dimensional quantum field theories by numerical + simulation", + journal = "Nucl. Phys.", + volume = "B339", + year = "1990", + pages = "222-252", + SLACcitation = "%%CITATION = NUPHA,B339,222;%%" +} +@Article{Luscher:1993dy, + author = "Luscher, Martin", + title = "{A Portable high quality random number generator for + lattice field theory simulations}", + journal = "Comput. Phys. Commun.", + volume = "79", + year = "1994", + pages = "100-110", + eprint = "hep-lat/9309020", + archivePrefix = "arXiv", + doi = "10.1016/0010-4655(94)90232-1", + SLACcitation = "%%CITATION = HEP-LAT/9309020;%%" +} +@Article{Luscher:1993xx, + author = "L{\"u}scher, M.", + title = "A New approach to the problem of dynamical quarks in + numerical simulations of lattice {QCD}", + journal = "Nucl. 
Phys.", + volume = "B418", + year = "1994", + pages = "637-648", + eprint = "hep-lat/9311007", + SLACcitation = "%%CITATION = HEP-LAT 9311007;%%" +} +@Article{Luscher:1996sc, + author = "L{\"u}scher, M. and Sint, S. and Sommer, R. and + Weisz, P.", + title = "Chiral symmetry and {O(a)} improvement in lattice {QCD}", + journal = "Nucl. Phys.", + volume = "B478", + year = "1996", + pages = "365-400", + eprint = "hep-lat/9605038", + SLACcitation = "%%CITATION = HEP-LAT 9605038;%%" +} +@Article{Luscher:1996ug, + author = "L{\"u}scher, M. and Sint, S. and Sommer, R. and + Weisz, P. and Wolff, U.", + title = "Non-perturbative {O(a)} improvement of lattice {QCD}", + journal = "Nucl. Phys.", + volume = "B491", + year = "1997", + pages = "323-343", + eprint = "hep-lat/9609035", + SLACcitation = "%%CITATION = HEP-LAT 9609035;%%" +} +@Article{Luscher:1998pq, + author = "L{\"u}scher, M.", + title = "Exact chiral symmetry on the lattice and the {Ginsparg}- + {Wilson} relation", + journal = "Phys. Lett.", + volume = "B428", + year = "1998", + pages = "342-345", + eprint = "hep-lat/9802011", + SLACcitation = "%%CITATION = HEP-LAT 9802011;%%" +} +@Article{Luscher:2001tx, + author = "L{\"u}scher, Martin", + title = "{Lattice QCD on PCs?}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "21-28", + eprint = "hep-lat/0110007", + archivePrefix = "arXiv", + doi = "10.1016/S0920-5632(01)01639-5", + SLACcitation = "%%CITATION = HEP-LAT/0110007;%%" +} +@Article{Luscher:2003qa, + author = "L{\"u}scher, M.", + title = "Solution of the {D}irac equation in lattice {QCD} using a + domain decomposition method", + journal = "Comput. Phys. Commun.", + volume = "156", + year = "2004", + pages = "209-220", + eprint = "hep-lat/0310048", + SLACcitation = "%%CITATION = HEP-LAT 0310048;%%" +} +@Article{Luscher:2004rx, + author = "L{\"u}scher, M.", + title = "Schwarz-preconditioned {HMC} algorithm for two-flavour + lattice {QCD}", + journal = "Comput. Phys. 
Commun.", + volume = "165", + year = "2005", + pages = "199", + eprint = "hep-lat/0409106", + SLACcitation = "%%CITATION = HEP-LAT 0409106;%%" +} + +@Article{Luscher:2005mv, + author = "L{\"u}scher, Martin", + title = "Lattice {QCD} with light {W}ilson quarks", + journal = "\href{http://pos.sissa.it/archive/conferences/020/008/LAT2005_002.pdf}{PoS(LAT2005)002}", + year = "2005", + eprint = "hep-lat/0509152", + howpublished="Talk presented at International Symposium on Lattice Field Theory (Lattice 2005)", + SLACcitation = "%%CITATION = HEP-LAT 0509152;%%" +} +@Article{Luscher:ranluxweb, + author = "L{\"u}scher, M.", + title = "Ranlux random number generator", + eprint = "http://luscher.web.cern.ch/luscher/ranlux/" +} +@Article{Luscher:sse, + author = "L{\"u}scher, M.", + title = "Lattice QCD parallel benchmark programs", + eprint = "http://luscher.web.cern.ch/luscher/QCDpbm/" +} +@Article{Madras:1988ei, + author = "Madras, N. and Sokal, A. D.", + title = "The Pivot algorithm: a highly efficient Monte Carlo method + for selfavoiding walk", + journal = "J. Statist. Phys.", + volume = "50", + year = "1988", + pages = "109-186", + SLACcitation = "%%CITATION = JSTPB,50,109;%%" +} +@Article{Martinelli:1982mw, + author = "Martinelli, G. and Zhang, Yi-Cheng", + title = "THE CONNECTION BETWEEN LOCAL OPERATORS ON THE LATTICE AND + IN THE CONTINUUM AND ITS RELATION TO MESON DECAY + CONSTANTS", + journal = "Phys. Lett.", + volume = "B123", + year = "1983", + pages = "433", + SLACcitation = "%%CITATION = PHLTA,B123,433;%%" +} +@Article{Martinelli:1994ty, + author = "Martinelli, G. and Pittori, C. and Sachrajda, Christopher + T. and Testa, M. and Vladikas, A.", + title = "{A General method for nonperturbative renormalization of + lattice operators}", + journal = "Nucl. 
Phys.", + volume = "B445", + year = "1995", + pages = "81-108", + eprint = "hep-lat/9411010", + archivePrefix = "arXiv", + doi = "10.1016/0550-3213(95)00126-D", + SLACcitation = "%%CITATION = HEP-LAT/9411010;%%" +} +@Article{McNeile:2000hf, + author = "McNeile, C. and Michael, C.", + collaboration = "UKQCD", + title = "The eta and eta' mesons in {QCD}", + journal = "Phys. Lett.", + volume = "B491", + year = "2000", + pages = "123-129", + eprint = "hep-lat/0006020", + SLACcitation = "%%CITATION = HEP-LAT 0006020;%%" +} +@Article{McNeile:2000xx, + author = "McNeile, Craig and Michael, Chris", + collaboration = "UKQCD", + title = "Mixing of scalar glueballs and flavour-singlet scalar + mesons", + journal = "Phys. Rev.", + volume = "D63", + year = "2001", + pages = "114503", + eprint = "hep-lat/0010019", + SLACcitation = "%%CITATION = HEP-LAT0010019;%%" +} +@Article{McNeile:2001cr, + author = "McNeile, C. and Michael, C. and Sharkey, K. J.", + collaboration = "UKQCD", + title = "The flavor singlet mesons in {QCD}", + journal = "Phys. Rev.", + volume = "D65", + year = "2002", + pages = "014508", + eprint = "hep-lat/0107003", + SLACcitation = "%%CITATION = HEP-LAT 0107003;%%" +} +@Article{McNeile:2002fh, + author = "McNeile, C. and Michael, C.", + collaboration = "UKQCD", + title = "Hadronic decay of a vector meson from the lattice", + journal = "Phys. Lett.", + volume = "B556", + year = "2003", + pages = "177-184", + eprint = "hep-lat/0212020", + SLACcitation = "%%CITATION = HEP-LAT 0212020;%%" +} +@Article{McNeile:2006bz, + author = "McNeile, C. and Michael, C.", + collaboration = "UKQCD", + title = "Decay width of light quark hybrid meson from the lattice", + journal = "Phys. Rev.", + volume = "D73", + year = "2006", + pages = "074506", + eprint = "hep-lat/0603007", + SLACcitation = "%%CITATION = HEP-LAT 0603007;%%" +} +@Article{Meyer:2006ty, + author = "Meyer, Harvey B. 
and others", + title = "{Exploring the HMC trajectory-length dependence of + autocorrelation times in lattice QCD}", + journal = "Comput. Phys. Commun.", + volume = "176", + year = "2007", + pages = "91-97", + eprint = "hep-lat/0606004", + archivePrefix = "arXiv", + doi = "10.1016/j.cpc.2006.08.002", + SLACcitation = "%%CITATION = HEP-LAT/0606004;%%" +} +@Article{Michael:1982gb, + author = "Michael, C. and Teasdale, I.", + title = "EXTRACTING GLUEBALL MASSES FROM LATTICE QCD", + journal = "Nucl. Phys.", + volume = "B215", + year = "1983", + pages = "433", + SLACcitation = "%%CITATION = NUPHA,B215,433;%%" +} +@Article{Michael:1989mf, + author = "Michael, C.", + title = "Particle decay in lattice gauge theory", + journal = "Nucl. Phys.", + volume = "B327", + year = "1989", + pages = "515", + SLACcitation = "%%CITATION = NUPHA,B327,515;%%" +} +@Article{Michael:1991nc, + author = "Michael, C.", + title = "Hadronic forces from the lattice", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "26", + year = "1992", + pages = "417-419", + SLACcitation = "%%CITATION = NUPHZ,26,417;%%" +} +@Article{Michael:1993yj, + author = "Michael, Christopher", + title = "{Fitting correlated data}", + journal = "Phys. Rev.", + volume = "D49", + year = "1994", + pages = "2616-2619", + eprint = "hep-lat/9310026", + archivePrefix = "arXiv", + doi = "10.1103/PhysRevD.49.2616", + SLACcitation = "%%CITATION = HEP-LAT/9310026;%%" +} +@Article{Michael:1994sz, + author = "Michael, Christopher and McKerrell, A.", + title = "{Fitting correlated hadron mass spectrum data}", + journal = "Phys. Rev.", + volume = "D51", + year = "1995", + pages = "3745-3750", + eprint = "hep-lat/9412087", + archivePrefix = "arXiv", + doi = "10.1103/PhysRevD.51.3745", + SLACcitation = "%%CITATION = HEP-LAT/9412087;%%" +} +@Article{Michael:2007vn, + author = "Michael, C. 
and Urbach, C.", + collaboration = "ETM", + title = "Neutral mesons and disconnected diagrams in Twisted Mass + QCD", + journal = "", + volume = "", + pages = "", + year = "2007", + eprint = "0709.4564", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = ARXIV:0709.4564;%%" +} +@Book{Montvay:1994cy, + author = "Montvay, I. and M{\"u}nster, G.", + title = "Quantum fields on a lattice", + publisher = "Cambridge University Press", + year = "1994", + series = "Cambridge Monographs on Mathematical Physics", +} +@Article{Montvay:1995ea, + author = "Montvay, I.", + title = "An Algorithm for Gluinos on the Lattice", + journal = "Nucl. Phys.", + volume = "B466", + year = "1996", + pages = "259-284", + eprint = "hep-lat/9510042", + SLACcitation = "%%CITATION = HEP-LAT 9510042;%%" +} +@Article{Montvay:2005tj, + author = "Montvay, I. and Scholz, E.", + title = "Updating algorithms with multi-step stochastic correction", + journal = "Phys. Lett.", + volume = "B623", + year = "2005", + pages = "73-79", + eprint = "hep-lat/0506006", + SLACcitation = "%%CITATION = HEP-LAT 0506006;%%" +} +@Article{Morgan:2002a, + author = "Morgan, R. B.", + title = "GMRES with Deflated Restarting", + journal = "SIAM J. Sci. Comput.", + volume = "24", + year = "2002", + pages = "20" +} +@Article{Morningstar:2003gk, + author = "Morningstar, Colin and Peardon, Mike J.", + title = "{Analytic smearing of SU(3) link variables in lattice + QCD}", + journal = "Phys.
Rev.", + volume = "D69", + year = "2004", + pages = "054501", + eprint = "hep-lat/0311018", + archivePrefix = "arXiv", + doi = "10.1103/PhysRevD.69.054501", + SLACcitation = "%%CITATION = HEP-LAT/0311018;%%" +} +@Article{Munster:2004am, + author = "M{\"u}nster, G.", + title = "On the phase structure of twisted mass lattice {QCD}", + journal = "JHEP", + volume = "09", + year = "2004", + pages = "035", + eprint = "hep-lat/0407006", + SLACcitation = "%%CITATION = HEP-LAT 0407006;%%" +} +@Article{Munster:2004wt, + author = "M{\"u}nster, Gernot and Schmidt, Christian and Scholz, Enno E. + ", + title = "Chiral perturbation theory for twisted mass {QCD}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "140", + year = "2005", + pages = "320-322", + eprint = "hep-lat/0409066", + SLACcitation = "%%CITATION = HEP-LAT 0409066;%%" +} +@Article{Nagai:2005mi, + author = "Nagai, Kei-ichi and Jansen, Karl", + title = "Two-dimensional lattice Gross-Neveu model with Wilson + twisted mass fermions", + journal = "Phys. Lett.", + volume = "B633", + year = "2006", + pages = "325-330", + eprint = "hep-lat/0510076", + SLACcitation = "%%CITATION = HEP-LAT 0510076;%%" +} +@Unpublished{Nagai:priv, + author = {Nagai, K}, + title = {Two-dimensional Gross-Neveu model with {Wilson} + twisted mass fermions}, + note = {private communication}, + OPTkey = {}, + OPTmonth = {}, + OPTyear = {}, + OPTannote = {} +} +@Article{Necco:2001xg, + author = "Necco, S. and Sommer, R.", + title = "The {N(f)} = 0 heavy quark potential from short to + intermediate distances", + journal = "Nucl. Phys.", + volume = "B622", + year = "2002", + pages = "328-346", + eprint = "hep-lat/0108008", + SLACcitation = "%%CITATION = HEP-LAT 0108008;%%" +} +@Article{Necco:2003vh, + author = "Necco, Silvia", + journal = "Nucl. Phys.", + volume = "B683", + year = "2004", + pages = "137-167", + eprint = "hep-lat/0309017", + SLACcitation = "%%CITATION = HEP-LAT 0309017;%%" +} +@Article{Neff:2001zr, + author = "Neff, H. 
and Eicker, N. and Lippert, T. and Negele, J. W. + and Schilling, K.", + title = "On the low fermionic eigenmode dominance in {QCD} on the + lattice", + journal = "Phys. Rev.", + volume = "D64", + year = "2001", + pages = "114509", + eprint = "hep-lat/0106016", + SLACcitation = "%%CITATION = HEP-LAT/0106016;%%" +} +@Article{Neuberger:1997fp, + author = "Neuberger, H.", + title = "Exactly massless quarks on the lattice", + journal = "Phys. Lett.", + volume = "B417", + year = "1998", + pages = "141-144", + eprint = "hep-lat/9707022", + SLACcitation = "%%CITATION = HEP-LAT 9707022;%%" +} +@Article{Neuberger:1998wv, + author = "Neuberger, H.", + title = "More about exactly massless quarks on the lattice", + journal = "Phys. Lett.", + volume = "B427", + year = "1998", + pages = "353-355", + eprint = "hep-lat/9801031", + SLACcitation = "%%CITATION = HEP-LAT 9801031;%%" +} +@Article{Niedermayer:1998bi, + author = "Niedermayer, F.", + title = "Exact chiral symmetry, topological charge and related + topics", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "73", + year = "1999", + pages = "105-119", + eprint = "hep-lat/9810026", + SLACcitation = "%%CITATION = HEP-LAT 9810026;%%" +} +@Article{Nielsen:1980rz, + author = "Nielsen, H. B. and Ninomiya, M.", + title = "Absence of neutrinos on a lattice. 1. proof by homotopy + theory", + journal = "Nucl. Phys.", + volume = "B185", + year = "1981", + pages = "20", + SLACcitation = "%%CITATION = NUPHA,B185,20;%%" +} +@Article{Nielsen:1981hk, + author = "Nielsen, H. B. and Ninomiya, M.", + title = "No go theorem for regularizing chiral fermions", + journal = "Phys. Lett.", + volume = "B105", + year = "1981", + pages = "219", + SLACcitation = "%%CITATION = PHLTA,B105,219;%%" +} +@Article{Nielsen:1981xu, + author = "Nielsen, H. B. and Ninomiya, M.", + title = "Absence of neutrinos on a lattice. 2. intuitive topological + proof", + journal = "Nucl. 
Phys.", + volume = "B193", + year = "1981", + pages = "173", + SLACcitation = "%%CITATION = NUPHA,B193,173;%%" +} +@Article{Noaki:1998zc, + author = "Noaki, J. and Izubuchi, T. and Ukawa, A.", + title = "Two-dimensional Gross-Neveu model with {Wilson} fermion + action at finite temperature and density", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "73", + year = "1999", + pages = "483-485", + eprint = "hep-lat/9809071", + SLACcitation = "%%CITATION = HEP-LAT 9809071;%%" +} +@Article{Orginos:2001xa, + author = "Orginos, K.", + collaboration = "RBC", + title = "Chiral properties of domain wall fermions with improved + gauge actions", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "721-723", + eprint = "hep-lat/0110074", + SLACcitation = "%%CITATION = HEP-LAT 0110074;%%" +} +@Article{Orth:2005kq, + author = "Orth, B. and Lippert, T. and Schilling, K.", + title = "Finite-size effects in lattice {QCD} with dynamical {Wilson} + fermions", + journal = "Phys. Rev.", + volume = "D72", + year = "2005", + pages = "014503", + eprint = "hep-lat/0503016", + SLACcitation = "%%CITATION = HEP-LAT 0503016;%%" +} +@Article{Osterwalder:1973dx, + author = "Osterwalder, K. and Schrader, R.", + title = "Axioms for euclidean Green's functions", + journal = "Commun. Math. Phys.", + volume = "31", + year = "1973", + pages = "83-112", + SLACcitation = "%%CITATION = CMPHA,31,83;%%" +} +@Article{Osterwalder:1975tc, + author = "Osterwalder, K. and Schrader, R.", + title = "Axioms for euclidean Green's functions. 2", + journal = "Commun. Math. Phys.", + volume = "42", + year = "1975", + pages = "281", + SLACcitation = "%%CITATION = CMPHA,42,281;%%" +} +@Article{Osterwalder:1977pc, + author = "Osterwalder, K. and Seiler, E.", + title = "Gauge field theories on the lattice", + journal = "Ann. Phys.", + volume = "110", + year = "1978", + pages = "440", + SLACcitation = "%%CITATION = APNYA,110,440;%%" +} +@Article{PDBook, + author = "Eidelman, S. 
and others", + title = "{Review of Particle Physics}", + journal = "{Physics Letters B}", + year = "2004", + volume = "592", + pages = {1+}, + url = {http://pdg.lbl.gov} +} +@Article{Peardon:2002wb, + author = "Peardon, M. J. and Sexton, J.", + collaboration = "TrinLat", + title = "Multiple molecular dynamics time-scales in hybrid Monte + Carlo fermion simulations", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "119", + year = "2003", + pages = "985-987", + eprint = "hep-lat/0209037", + SLACcitation = "%%CITATION = HEP-LAT 0209037;%%" +} +@Book{Peskin:1995ev, + author = {Peskin, M. E. and Schroeder, D. V.}, + title = {An Introduction to quantum field theory}, + publisher = {Westview Press}, + year = {1995}, + OPTkey = {}, + OPTvolume = {}, + OPTnumber = {}, + OPTseries = {Advanced Book Program}, + OPTaddress = {Boulder, Colorado}, + OPTedition = {}, + OPTmonth = {}, + OPTnote = {}, + OPTannote = {} +} +@Article{Politzer:1973fx, + author = "Politzer, H. D.", + title = "Reliable perturbative results for strong interactions?", + journal = "Phys. Rev. Lett.", + volume = "30", + year = "1973", + pages = "1346-1349", + SLACcitation = "%%CITATION = PRLTA,30,1346;%%" +} +@Article{Politzer:1974fr, + author = "Politzer, H. D.", + title = "Asymptotic freedom: an approach to strong interactions", + journal = "Phys. Rept.", + volume = "14", + year = "1974", + pages = "129-180", + SLACcitation = "%%CITATION = PRPLC,14,129;%%" +} +@Manual{R:2005, + title = {R: A language and environment for statistical computing}, + author = {{R Development Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2005}, + note = {{ISBN} 3-900051-07-0}, + url = {http://www.R-project.org}, +} + +@Book{Rothe:1992wy, + author = "Rothe, H.J.", + title = "Lattice gauge theories", + publisher = "World Scientific, Singapore", + year = "1992", + pages = "528", + edition = "", +} +@Article{Rupak:2002sm, + author = "Rupak, G. 
and Shoresh, N.", + title = "Chiral perturbation theory for the {Wilson} lattice action", + journal = "Phys. Rev.", + volume = "D66", + year = "2002", + pages = "054503", + eprint = "hep-lat/0201019", + SLACcitation = "%%CITATION = HEP-LAT 0201019;%%" +} + +@Article{Saad:1993a, + author = "Saad, Y.", + title = "A flexible inner-outer preconditioned GMRES algorithm", + journal = "SIAM J. Sci. Comput.", + volume = "14 (2)", + year = "1993", + pages = "461-469" +} +@Article{Sachrajda:2004mi, + author = "Sachrajda, C. T. and Villadoro, G.", + title = "{Twisted boundary conditions in lattice simulations}", + journal = "Phys. Lett.", + volume = "B609", + year = "2005", + pages = "73-85", + eprint = "hep-lat/0411033", + archivePrefix = "arXiv", + doi = "10.1016/j.physletb.2005.01.033", + SLACcitation = "%%CITATION = HEP-LAT/0411033;%%" +} +@Article{Scorzato:2004da, + author = "Scorzato, L.", + title = "Pion mass splitting and phase structure in twisted mass + {QCD}", + journal = "Eur. Phys. J.", + volume = "C37", + year = "2004", + pages = "445-455", + eprint = "hep-lat/0407023", + SLACcitation = "%%CITATION = HEP-LAT 0407023;%%" +} + +@Article{Scorzato:2005rb, + author = "Scorzato, L. and others", + title = "N(f) = 2 lattice {QCD} and chiral perturbation theory", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "153", + year = "2006", + pages = "283-290", + eprint = "hep-lat/0511036", + SLACcitation = "%%CITATION = HEP-LAT 0511036;%%" +} + +@Article{Sexton:1992nu, + author = "Sexton, J. C. and Weingarten, D. H.", + title = "Hamiltonian evolution for the hybrid monte carlo + algorithm", + journal = "Nucl. Phys.", + volume = "B380", + year = "1992", + pages = "665-678", + SLACcitation = "%%CITATION = NUPHA,B380,665;%%" +} + +@Article{Sharpe:1998xm, + author = "Sharpe, S. R. and Singleton, R., Jr.", + title = "Spontaneous flavor and parity breaking with {Wilson} + fermions", + journal = "Phys. 
Rev.", + volume = "D58", + year = "1998", + pages = "074501", + eprint = "hep-lat/9804028", + SLACcitation = "%%CITATION = HEP-LAT 9804028;%%" +} +@Article{Sharpe:2004ny, + author = "Sharpe, S. R. and Wu, Jackson M. S.", + title = "Twisted mass chiral perturbation theory at next-to-leading + order", + journal = "Phys. Rev.", + volume = "D71", + year = "2005", + pages = "074501", + eprint = "hep-lat/0411021", + SLACcitation = "%%CITATION = HEP-LAT 0411021;%%" +} +@Article{Sharpe:2004ps, + author = "Sharpe, S. R. and Wu, J. M. S.", + title = "The phase diagram of twisted mass lattice {QCD}", + journal = "Phys. Rev.", + volume = "D70", + year = "2004", + pages = "094029", + eprint = "hep-lat/0407025", + SLACcitation = "%%CITATION = HEP-LAT 0407025;%%" +} +@Article{Sharpe:2005rq, + author = "Sharpe, Stephen R.", + title = "Observations on discretization errors in twisted-mass + lattice QCD", + journal = "Phys. Rev.", + volume = "D72", + year = "2005", + pages = "074510", + eprint = "hep-lat/0509009", + SLACcitation = "%%CITATION = HEP-LAT 0509009;%%" +} +@Article{Sheikholeslami:1985ij, + author = "Sheikholeslami, B. and Wohlert, R.", + title = "Improved continuum limit lattice action for qcd with {Wilson} + fermions", + journal = "Nucl. 
Phys.", + volume = "B259", + year = "1985", + pages = "572", + SLACcitation = "%%CITATION = NUPHA,B259,572;%%" +} +@Article{Shindler:2005vj, + author = "Shindler, Andrea", + title = "Twisted mass lattice {QCD}: Recent developments and results", + journal = "PoS", + volume = "LAT2005", + year = "2006", + pages = "014", + eprint = "hep-lat/0511002", + SLACcitation = "%%CITATION = HEP-LAT 0511002;%%" +} +@Article{Shindler:2006tm, + author = "Shindler, A.", + collaboration = "ETM", + title = "Lattice QCD with light twisted quarks: First results", + year = "2006", + eprint = "hep-ph/0611264", + SLACcitation = "%%CITATION = HEP-PH 0611264;%%" +} +@Article{Shindler:2007vp, + author = "Shindler, A.", + title = "{Twisted mass lattice QCD}", + journal = "Phys. Rept.", + volume = "461", + year = "2008", + pages = "37-110", + eprint = "0707.4093", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + doi = "10.1016/j.physrep.2008.03.001", + SLACcitation = "%%CITATION = 0707.4093;%%" +} +@Article{Sleijpen:1996aa, + author = "G. L. G. Sleijpen and H. A. Van der Vorst", + title = "A Jacobi-Davidson iteration method for linear + eigenvalue problems", + journal = "SIAM Journal on Matrix Analysis and Applications", + volume = "17", + year = "1996", + pages = "401-425", +} +@Article{Sommer:1993ce, + author = "Sommer, R.", + title = "A New way to set the energy scale in lattice gauge theories + and its applications to the static force and alpha-s in + SU(2) Yang-Mills theory", + journal = "Nucl. Phys.", + volume = "B411", + year = "1994", + pages = "839-854", + eprint = "hep-lat/9310022", + SLACcitation = "%%CITATION = HEP-LAT 9310022;%%" +} +@Article{Sonneveld:1989cgs, + author = {Peter Sonneveld}, + title = {CGS, a fast Lanczos-type solver for nonsymmetric linear systems}, + journal = {SIAM J. Sci. Stat. 
Comput.}, + volume = {10}, + number = {1}, + year = {1989}, + issn = {0196-5204}, + pages = {36--52}, + publisher = {Society for Industrial and Applied Mathematics}, + address = {Philadelphia, PA, USA}, + } +@Article{Sternbeck:2003gy, + author = "Sternbeck, A. and Ilgenfritz, E.-M. and Kerler, W. + and M{\"u}ller-Preu{\ss}ker, M. and St{\"u}ben, H.", + title = "The {Aoki} phase for {N(f)} = 2 {Wilson} fermions revisited", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "129", + year = "2004", + pages = "898-900", + eprint = "hep-lat/0309059", + SLACcitation = "%%CITATION = HEP-LAT 0309059;%%" +} +@Article{Sternbeck:2005tk, + author = "Sternbeck, A. and Ilgenfritz, E. -M. and Mueller-Preussker, + M. and Schiller, A.", + title = "{Going infrared in SU(3) Landau gauge gluodynamics}", + journal = "Phys. Rev.", + volume = "D72", + year = "2005", + pages = "014507", + eprint = "hep-lat/0506007", + SLACcitation = "%%CITATION = HEP-LAT/0506007;%%" +} +@Conference{Symanzik:1981hc, + author = "Symanzik, K.", + title = "Some topics in quantum field theory", + booktitle = "Mathematical problems in theoretical physics", + journal = "Lecture Notes in Physics", + volume = "153", + year = "1981", + pages = "47-58", + editor = "R. Schrader et al.", + note = "Presented at 6th Int. Conf. on Mathematical Physics, + Berlin, West Germany" +} +@Article{Symanzik:1983dc, + author = "Symanzik, K.", + title = "Continuum limit and improved action in lattice theories. 1. + principles and phi**4 theory", + journal = "Nucl. Phys.", + volume = "B226", + year = "1983", + pages = "187", + SLACcitation = "%%CITATION = NUPHA,B226,187;%%" +} +@Article{Symanzik:1983gh, + author = "Symanzik, K.", + title = "Continuum limit and improved action in lattice theories. 2. + O(N) nonlinear sigma model in perturbation theory", + journal = "Nucl. 
Phys.", + volume = "B226", + year = "1983", + pages = "205", + SLACcitation = "%%CITATION = NUPHA,B226,205;%%" +} +@Article{Takaishi:1996xj, + author = "Takaishi, T.", + title = "Heavy quark potential and effective actions on blocked + configurations", + journal = "Phys. Rev.", + volume = "D54", + year = "1996", + pages = "1050-1053", + SLACcitation = "%%CITATION = PHRVA,D54,1050;%%" +} +@Article{Takaishi:2005tz, + author = "Takaishi, T. and de Forcrand, P.", + title = "Testing and tuning new symplectic integrators for hybrid + Monte Carlo algorithm in lattice QCD", + year = "2005", + eprint = "hep-lat/0505020", + SLACcitation = "%%CITATION = HEP-LAT 0505020;%%" +} +@Article{Takeda:2004xh, + author = "Takeda, S. and others", + title = "A scaling study of the step scaling function in SU(3) gauge + theory with improved gauge actions", + journal = "Phys. Rev.", + volume = "D70", + year = "2004", + pages = "074510", + eprint = "hep-lat/0408010", + SLACcitation = "%%CITATION = HEP-LAT 0408010;%%" +} +@Article{Ukawa:2002pc, + author = "Ukawa, A.", + collaboration = "CP-PACS and JL{QCD}", + title = "Computational cost of full {QCD} simulations experienced by + {CP-PACS and JLQCD Collaborations}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "195-196", + SLACcitation = "%%CITATION = NUPHZ,106,195;%%" +} +@Article{Urbach:2005ji, + author = "Urbach, C. and Jansen, K. and Shindler, A. and Wenger, U.", + title = "{HMC} algorithm with multiple time scale integration and mass + preconditioning", + journal = "Comput. Phys. 
Commun.", + volume = "174", + year = "2006", + pages = "87-98", + eprint = "hep-lat/0506011", + SLACcitation = "%%CITATION = HEP-LAT 0506011;%%" +} +@Article{Urbach:2007rt, + author = "Urbach, Carsten", + collaboration = "ETM", + title = "{Lattice QCD with two light Wilson quarks and maximally + twisted mass}", + journal = "PoS", + volume = "LAT2007", + year = "2007", + pages = "022", + eprint = "0710.1517", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0710.1517;%%" +} +@Article{WalkerLoud:2005bt, + author = "Walker-Loud, Andre and Wu, Jackson M. S.", + title = "{Nucleon and Delta masses in twisted mass chiral + perturbation theory}", + journal = "Phys. Rev.", + volume = "D72", + year = "2005", + pages = "014506", + eprint = "hep-lat/0504001", + archivePrefix = "arXiv", + doi = "10.1103/PhysRevD.72.014506", + SLACcitation = "%%CITATION = HEP-LAT/0504001;%%" +} +@Article{Weinberg:1973un, + author = "Weinberg, S.", + title = "Nonabelian gauge theories of the strong interactions", + journal = "Phys. Rev. Lett.", + volume = "31", + year = "1973", + pages = "494-497", + SLACcitation = "%%CITATION = PRLTA,31,494;%%" +} +@Article{Weinberg:1978kz, + author = "Weinberg, S.", + title = "Phenomenological Lagrangians", + journal = "Physica", + volume = "A96", + year = "1979", + pages = "327", + SLACcitation = "%%CITATION = PHYSA,A96,327;%%" +} +@Book{Weinberg:1995mt, + author = "Weinberg, S.", + title = "The Quantum theory of fields. Vol. 1: Foundations", + publisher = "Cambridge University Press", + year = "1995", + pages = "609", +} +@Article{Weisz:1982zw, + author = "Weisz, P.", + title = "Continuum limit improved lattice action for pure {Yang-Mills} + theory. 1", + journal = "Nucl. Phys.", + volume = "B212", + year = "1983", + pages = "1", + SLACcitation = "%%CITATION = NUPHA,B212,1;%%" +} +@Article{Weisz:1983bn, + author = "Weisz, P. 
and Wohlert, R.", + title = "Continuum limit improved lattice action for pure {Yang-Mills} + theory. 2", + journal = "Nucl. Phys.", + volume = "B236", + year = 1984, + pages = 397, + SLACcitation = "%%CITATION = NUPHA,B236,397;%%" +} +@Article{Wennekers:2005wa, + author = "Wennekers, J. and Wittig, H.", + title = "On the renormalized scalar density in quenched QCD", + year = "2005", + eprint = "hep-lat/0507026", + SLACcitation = "%%CITATION = HEP-LAT 0507026;%%" +} +@Article{Weyl:1918ib, + author = "Weyl, H.", + title = "Gravitation und Elektrizit{\"a}t", + journal = "Sitzungsber. Preuss. Akad. Wiss. Berlin (Math. Phys. )", + volume = "1918", + year = "1918", + pages = "465", + SLACcitation = "%%CITATION = SPWPA,1918,465;%%" +} +@Article{Weyl:1929fm, + author = "Weyl, H.", + title = "Electron and gravitation", + journal = "Z. Phys.", + volume = "56", + year = "1929", + pages = "330-352", + SLACcitation = "%%CITATION = ZEPYA,56,330;%%" +} +@Article{Wilson:1974sk, + author = "Wilson, K. G.", + title = "Confinement of quarks", + journal = "Phys. Rev.", + volume = "D10", + year = "1974", + pages = "2445-2459", + SLACcitation = "%%CITATION = PHRVA,D10,2445;%%" +} +@Article{Wilson:1974sk, + author = "Wilson, K. G.", + title = "Confinement of quarks", + journal = "Phys. Rev.", + volume = "D10", + year = "1974", + pages = "2445-2459", + SLACcitation = "%%CITATION = PHRVA,D10,2445;%%" +} +@Article{Wilson:1975mb, + author = "Wilson, K. G.", + title = "The renormalization group: Critical phenomena and the kondo + problem", + journal = "Rev. Mod. Phys.", + volume = "47", + year = "1975", + pages = "773", + SLACcitation = "%%CITATION = RMPHA,47,773;%%" +} +@Article{Wilson:1975mb, + author = "Wilson, K. G.", + title = "The renormalization group: Critical phenomena and the kondo + problem", + journal = "Rev. Mod. 
Phys.", + volume = "47", + year = "1975", + pages = "773", + SLACcitation = "%%CITATION = RMPHA,47,773;%%" +} +@Article{Wolff:2003sm, + author = "Wolff, U.", + collaboration = "ALPHA", + title = "Monte Carlo errors with less errors", + journal = "Comput. Phys. Commun.", + volume = "156", + year = "2004", + pages = "143-153", + eprint = "hep-lat/0306017", + SLACcitation = "%%CITATION = HEP-LAT 0306017;%%" +} +@Article{Yang:1954ek, + author = "Yang, C.-N. and Mills, R. L.", + title = "Conservation of isotopic spin and isotopic gauge + invariance", + journal = "Phys. Rev.", + volume = "96", + year = "1954", + pages = "191-195", + SLACcitation = "%%CITATION = PHRVA,96,191;%%" +} +@Article{Yoshie:2008aw, + author = "Yoshie, Tomoteru", + title = "{Making use of the International Lattice Data Grid}", + journal = "PoS", + volume = "LATTICE2008", + year = "2008", + pages = "019", + eprint = "0812.0849", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0812.0849;%%" +} +@Article{Zweig:1964jf, + author = "Zweig, G.", + title = "An SU(3) model for strong interaction symmetry and its + breaking. 2", + note = "CERN-TH-412" +} +@Article{cln:web, + author = {}, + eprint = {http://www.ginac.de/CLN/} +} +@Article{deForcrand:1995bs, + author = "de Forcrand, P.", + title = "Progress on lattice {QCD} algorithms", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "47", + year = "1996", + pages = "228-235", + eprint = "hep-lat/9509082", + SLACcitation = "%%CITATION = HEP-LAT 9509082;%%" +} +@Article{deForcrand:1996bx, + author = "de Forcrand, P. and others", + collaboration = "{QCD}-TARO", + title = "Search for effective lattice action of pure {QCD}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "53", + year = "1997", + pages = "938-941", + eprint = "hep-lat/9608094", + SLACcitation = "%%CITATION = HEP-LAT 9608094;%%" +} +@Article{deForcrand:1996ck, + author = "de Forcrand, P. 
and Takaishi, T.", + title = "Fast fermion Monte Carlo", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "53", + year = "1997", + pages = "968-970", + eprint = "hep-lat/9608093", + SLACcitation = "%%CITATION = HEP-LAT 9608093;%%" +} +@Article{etmc:asqr, + author = "Frezzotti, R. et al.", + title = "{O(a^2) cutoff effects in Wilson fermion simulations}", + journal = "PoS", + volume = "LAT2007", + year = "2007", + pages = "277", + eprint = "0710.2492", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0710.2492;%%" +} +@Article{ildg:web, + eprint = {http://cssm.sasr.edu.au/ildg/}, + author = {ILDG working groups} +} +@Book{kleinert:1, + author = "Kleinert, H.", + title = "Path integrals in quantum mechanics, statistics and polymer + physics", + publisher = "World Scientific, Singapore", + year = "1995", + edition = "2nd Edition", +} +@Article{lapack:web, + author = {}, + eprint = {http://www.netlib.org/lapack/} +} +@Article{lime:web, + author = {USQCD}, + title = {c-lime library}, + eprint = {http://usqcd.jlab.org/usqcd-docs/c-lime/} +} +@Book{meister:1999, + author = {Meister, Andreas}, + title = {Numerik linearer Gleichungssysteme}, + publisher = {vieweg}, + year = {1999}, + OPTkey = {}, + OPTvolume = {}, + OPTnumber = {}, + OPTseries = {}, + OPTaddress = {}, + OPTedition = {}, + OPTmonth = {}, + OPTnote = {}, + OPTannote = {} +} +@Manual{minuit, + title = {MINUIT home page}, + note= {\\seal.web.cern.ch/seal/snapshot/work-packages/mathlibs/minuit/home.html} +} +@Article{mpi:web, + author = {}, + title = {The message passing interface standard}, + eprint = {http://www-unix.mcs.anl.gov/mpi/} +} +@PhdThesis{orth:2004phd, + author = {Orth, B.}, + title = {Finite size effects in lattice {QCD} + with dynamical {Wilson} fermions}, + school = {Bergische Universit{\"a}t Wuppertal}, + year = {2004}, + OPTkey = {}, + OPTtype = {}, + OPTaddress = {}, + OPTmonth = {}, + OPTnote = {}, + OPTannote = {} +} +@PhdThesis{pleiter:phd, + author = 
{Pleiter, D.}, + title = {XXX}, + school = {Freie {U}niversität {B}erlin}, + year = {2001} +} +@Manual{root, + title = {The ROOT system home page}, + note = {root.cern.ch/} +} + +@Book{saad:2003a, + author = "Y. Saad", + title = "Iterative Methods for sparse linear systems", + publisher = "SIAM", + year = "2003", + edition = "2nd", +} + +@Article{scidac, + author = {}, + eprint = {http://www.scidac.gov/} +} +@MastersThesis{urbach:2002aa, + author = {Urbach, C.}, + title = {Untersuchung der {R}eversibilit{\"a}tsverletzung im {H}ybrid + {M}onte {C}arlo {A}lgorithmus}, + school = {Freie Universit{\"a}t Berlin, Fachbereich Physik}, + year = {2002} +} + +@Article{'tHooft:1971fh, + author = "'t Hooft, G.", + title = "Renormalization of massless Yang-Mills fields", + journal = "Nucl. Phys.", + volume = "B33", + year = "1971", + pages = "173-199", + SLACcitation = "%%CITATION = NUPHA,B33,173;%%" +} +@Article{'tHooft:1971rn, + author = "'t Hooft, G.", + title = "Renormalizable lagrangians for massive Yang-Mills fields", + journal = "Nucl. Phys.", + volume = "B35", + year = "1971", + pages = "167-188", + SLACcitation = "%%CITATION = NUPHA,B35,167;%%" +} +@Unpublished{'tHooft:1972aa, + author = "'t Hooft, G.", + title = "", + note = "Unpublished remarks at the 1972 Marseille Conference + on Yang-Mills Fields" +} +@Article{'tHooft:1972fi, + author = "'t Hooft, G. and Veltman, M. J. G.", + title = "Regularization and renormalization of gauge fields", + journal = "Nucl. Phys.", + volume = "B44", + year = "1972", + pages = "189-213", + SLACcitation = "%%CITATION = NUPHA,B44,189;%%" +} +@Article{Abdel-Rehim:2004gx, + author = "Abdel-Rehim, A. M. and Lewis, R.", + title = "Twisted mass {QCD} for the pion electromagnetic form factor", + journal = "Phys. Rev.", + volume = "D71", + year = "2005", + pages = "014503", + eprint = "hep-lat/0410047", + SLACcitation = "%%CITATION = HEP-LAT 0410047;%%" +} +@Article{Abdel-Rehim:2005gz, + author = "Abdel-Rehim, Abdou M. 
and Lewis, Randy and Woloshyn, R. M. + ", + title = "Spectrum of quenched twisted mass lattice QCD at maximal + twist", + journal = "Phys. Rev.", + volume = "D71", + year = "2005", + pages = "094505", + eprint = "hep-lat/0503007", + SLACcitation = "%%CITATION = HEP-LAT/0503007;%%" +} +@Article{AbdelRehim:2004sp, + author = "Abdel-Rehim, Abdou M. and Lewis, Randy", + title = "Pion form factor with twisted mass QCD", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "140", + year = "2005", + pages = "299-301", + eprint = "hep-lat/0408033", + SLACcitation = "%%CITATION = HEP-LAT/0408033;%%" +} +@Article{AbdelRehim:2005gq, + author = "Abdel-Rehim, A. M. and Lewis, R. and Woloshyn, R. M.", + title = "Twisted mass lattice QCD and hadron phenomenology", + journal = "Int. J. Mod. Phys.", + volume = "A20", + year = "2005", + pages = "6159-6168", + SLACcitation = "%%CITATION = IMPAE,A20,6159;%%" +} +@Article{AbdelRehim:2005gz, + author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M. + ", + title = "{Spectrum of quenched twisted mass lattice QCD at maximal + twist}", + journal = "Phys. Rev.", + volume = "D71", + year = "2005", + pages = "094505", + eprint = "hep-lat/0503007", + archivePrefix = "arXiv", + doi = "10.1103/PhysRevD.71.094505", + SLACcitation = "%%CITATION = HEP-LAT/0503007;%%" +} +@Article{AbdelRehim:2005qv, + author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M. + ", + title = "The hadron spectrum from twisted mass QCD with a strange + quark", + journal = "PoS", + volume = "LAT2005", + year = "2006", + pages = "032", + eprint = "hep-lat/0509056", + SLACcitation = "%%CITATION = HEP-LAT/0509056;%%" +} +@Article{AbdelRehim:2005yx, + author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M. 
+ ", + title = "Maximal twist and the spectrum of quenched twisted mass + lattice QCD", + journal = "PoS", + volume = "LAT2005", + year = "2006", + pages = "051", + eprint = "hep-lat/0509098", + SLACcitation = "%%CITATION = HEP-LAT/0509098;%%" +} +@Article{AbdelRehim:2006qu, + author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Petry, Robert G. + and Woloshyn, R. M.", + title = "The spectrum of tmLQCD with quark and link smearing", + journal = "PoS", + volume = "LAT2006", + year = "2006", + pages = "164", + eprint = "hep-lat/0610004", + SLACcitation = "%%CITATION = HEP-LAT/0610004;%%" +} +@Article{AbdelRehim:2006ra, + author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M. + and Wu, Jackson M. S.", + title = "Lattice QCD with a twisted mass term and a strange quark", + journal = "Eur. Phys. J.", + volume = "A31", + year = "2007", + pages = "773-776", + eprint = "hep-lat/0610090", + SLACcitation = "%%CITATION = HEP-LAT/0610090;%%" +} +@Article{AbdelRehim:2006ve, + author = "Abdel-Rehim, Abdou M. and Lewis, Randy and Woloshyn, R. M. + and Wu, Jackson M. S.", + title = "Strange quarks in quenched twisted mass lattice QCD", + journal = "Phys. Rev.", + volume = "D74", + year = "2006", + pages = "014507", + eprint = "hep-lat/0601036", + SLACcitation = "%%CITATION = HEP-LAT/0601036;%%" +} +@Article{Adler:1974gd, + author = "Adler, Stephen L.", + title = "{Some Simple Vacuum Polarization Phenomenology: e+ e- $\to$ + Hadrons: The mu - Mesic Atom x-Ray Discrepancy and (g-2) of + the Muon}", + journal = "Phys. Rev.", + volume = "D10", + year = "1974", + pages = "3714", + SLACcitation = "%%CITATION = PHRVA,D10,3714;%%" +} +@Article{Albanese:1987ds, + author = "Albanese, M. and others", + collaboration = "APE", + title = "Glueball masses and string tension in lattice {QCD}", + journal = "Phys. Lett.", + volume = "B192", + year = "1987", + pages = "163", + SLACcitation = "%%CITATION = PHLTA,B192,163;%%" +} +@Article{Alexandrou:2008tn, + author = "Alexandrou, C. 
and others", + collaboration = "ETM", + title = "{Light baryon masses with dynamical twisted mass + fermions}", + year = "2008", + eprint = "0803.3190", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0803.3190;%%" +} +@Article{AliKhan:2000iv, + author = "Ali Khan, A. and others", + collaboration = "CP-PACS", + title = "Chiral properties of domain-wall quarks in quenched {QCD}", + journal = "Phys. Rev.", + volume = "D63", + year = "2001", + pages = "114504", + eprint = "hep-lat/0007014", + SLACcitation = "%%CITATION = HEP-LAT 0007014;%%" +} +@Article{AliKhan:2003br, + author = "Ali Khan, A. and others", + collaboration = "QCDSF", + title = "Accelerating the hybrid Monte Carlo algorithm", + journal = "Phys. Lett.", + volume = "B564", + year = "2003", + pages = "235-240", + eprint = "hep-lat/0303026", + SLACcitation = "%%CITATION = HEP-LAT 0303026;%%" +} +@Article{AliKhan:2003mu, + author = "Ali Khan, A. and others", + title = "Accelerating Hasenbusch's acceleration of hybrid Monte + Carlo", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "129", + year = "2004", + pages = "853-855", + eprint = "hep-lat/0309078", + SLACcitation = "%%CITATION = HEP-LAT 0309078;%%" +} +@Article{Allton:1993wc, + author = "Allton, C. R. and others", + collaboration = "UK{QCD}", + title = "Gauge invariant smearing and matrix correlators using + {Wilson} fermions at Beta = 6.2", + journal = "Phys. Rev.", + volume = "D47", + year = "1993", + pages = "5128-5137", + eprint = "hep-lat/9303009", + SLACcitation = "%%CITATION = HEP-LAT 9303009;%%" +} +@Article{Allton:2004qq, + author = "Allton, C. R. and others", + collaboration = "UKQCD", + title = "Improved Wilson QCD simulations with light quark masses", + journal = "Phys. 
Rev.", + volume = "D70", + year = "2004", + pages = "014501", + eprint = "hep-lat/0403007", + SLACcitation = "%%CITATION = HEP-LAT/0403007;%%" +} +@Article{Aoki:1984qi, + author = "Aoki, S.", + title = "New phase structure for lattice {QCD} with {Wilson} fermions", + journal = "Phys. Rev.", + volume = "D30", + year = "1984", + pages = "2653", + SLACcitation = "%%CITATION = PHRVA,D30,2653;%%" +} +@Article{Aoki:1985jj, + author = "Aoki, S. and Higashijima, K.", + title = "The recovery of the chiral symmetry in lattice {Gross-Neveu} + model", + journal = "Prog. Theor. Phys.", + volume = "76", + year = "1986", + pages = "521", + SLACcitation = "%%CITATION = PTPKA,76,521;%%" +} +@Article{Aoki:1986ua, + author = "Aoki, Sinya", + title = "NUMERICAL EVIDENCE FOR A PARITY VIOLATING PHASE IN LATTICE + QCD WITH WILSON FERMION", + journal = "Phys. Lett.", + volume = "B190", + year = "1987", + pages = "140", + SLACcitation = "%%CITATION = PHLTA,B190,140;%%" +} +@Article{Aoki:1986xr, + author = "Aoki, S.", + title = "A solution to the {U(1)} problem on a lattice", + journal = "Phys. Rev. Lett.", + volume = "57", + year = "1986", + pages = "3136", + SLACcitation = "%%CITATION = PRLTA,57,3136;%%" +} +@Article{Aoki:1993vs, + author = "Aoki, S. and Boettcher, S. and Gocksch, A.", + title = "Spontaneous breaking of flavor symmetry and parity in the + Nambu-Jona-Lasinio model with {Wilson} fermions", + journal = "Phys. Lett.", + volume = "B331", + year = "1994", + pages = "157-164", + eprint = "hep-lat/9312084", + SLACcitation = "%%CITATION = HEP-LAT 9312084;%%" +} +@Article{Aoki:1995ft, + author = "Aoki, S.", + title = "On the phase structure of {QCD} with {Wilson} fermions", + journal = "Prog. Theor. Phys. Suppl.", + volume = "122", + year = "1996", + pages = "179-186", + eprint = "hep-lat/9509008", + SLACcitation = "%%CITATION = HEP-LAT 9509008;%%" +} +@Article{Aoki:1995yf, + author = "Aoki, S. and Ukawa, A. 
and Umemura, T.", + title = "Finite temperature phase structure of lattice {QCD} with + {Wilson} quark action", + journal = "Phys. Rev. Lett.", + volume = "76", + year = "1996", + pages = "873-876", + eprint = "hep-lat/9508008", + SLACcitation = "%%CITATION = HEP-LAT 9508008;%%" +} +@Article{Aoki:1997fm, + author = "Aoki, S.", + title = "Phase structure of lattice {QCD} with {Wilson} fermion at + finite temperature", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "60A", + year = "1998", + pages = "206-219", + eprint = "hep-lat/9707020", + SLACcitation = "%%CITATION = HEP-LAT 9707020;%%" +} +@Article{Aoki:2001xq, + author = "Aoki, S. and others", + collaboration = "JL{QCD}", + title = "Non-trivial phase structure of {N(f)} = 3 {QCD} with {O(a)}- + improved {Wilson} fermion at zero temperature", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "263-265", + eprint = "hep-lat/0110088", + SLACcitation = "%%CITATION = HEP-LAT 0110088;%%" +} +@Article{Aoki:2002vt, + author = "Aoki, Y. and others", + title = "Domain wall fermions with improved gauge actions", + journal = "Phys. Rev.", + volume = "D69", + year = "2004", + pages = "074504", + eprint = "hep-lat/0211023", + SLACcitation = "%%CITATION = HEP-LAT 0211023;%%" +} +@Article{Aoki:2004iq, + author = "Aoki, S. and others", + collaboration = "JL{QCD}", + title = "Bulk first-order phase transition in three-flavor lattice + {QCD} with {O(a)}-improved {Wilson} fermion action at zero + temperature", + year = "2004", + eprint = "hep-lat/0409016", + SLACcitation = "%%CITATION = HEP-LAT 0409016;%%" +} +@Article{Aoki:2004ta, + author = "Aoki, Sinya and B{\"a}r, Oliver", + title = "Twisted-mass {QCD}, {O}(a) improvement and {Wilson} chiral + perturbation theory", + journal = "Phys. Rev.", + volume = "D70", + year = "2004", + pages = "116011", + eprint = "hep-lat/0409006", + SLACcitation = "%%CITATION = HEP-LAT 0409006;%%" +} +@Article{Aoki:2005ii, + author = "Aoki, S. 
and B{\"a}r, O.", + title = "Determining the low energy parameters of {Wilson} chiral + perturbation theory", + year = "2005", + eprint = "hep-lat/0509002", + SLACcitation = "%%CITATION = HEP-LAT 0509002;%%" +} +@Article{Arnold:2003sx, + author = "Arnold, Guido and others", + title = "Numerical methods for the QCD overlap operator. II: Optimal + Krylov subspace methods", + year = "2003", + eprint = "hep-lat/0311025", + SLACcitation = "%%CITATION = HEP-LAT 0311025;%%" +} +@Article{Atiyah:1971rm, + author = "Atiyah, M. F. and Singer, I. M.", + title = "The Index of elliptic operators. 5", + journal = "Annals Math.", + volume = "93", + year = "1971", + pages = "139-149", + SLACcitation = "%%CITATION = ANMAA,93,139;%%" +} +@Article{Aubin:2006cc, + author = "Aubin, C. and Blum, T.", + title = "{Hadronic contributions to the muon g-2 from the lattice}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "162", + year = "2006", + pages = "251-255", + SLACcitation = "%%CITATION = NUPHZ,162,251;%%" +} +@Article{Aubin:2006xv, + author = "Aubin, C. and Blum, T.", + title = "{Calculating the hadronic vacuum polarization and leading + hadronic contribution to the muon anomalous magnetic + moment with improved staggered quarks}", + journal = "Phys. Rev.", + volume = "D75", + year = "2007", + pages = "114502", + eprint = "hep-lat/0608011", + SLACcitation = "%%CITATION = HEP-LAT/0608011;%%" +} +@Article{BAGEL, + author="P.A. Boyle", + year=2005, + eprint=" http://www.ph.ed.ac.uk/\~{ }paboyle/bagel/Bagel.html" + } +@Article{Baikov:2004ku, + author = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.", + title = "{Vacuum polarization in pQCD: First complete O(alpha(s)**4) + result}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "135", + year = "2004", + pages = "243-246", + SLACcitation = "%%CITATION = NUPHZ,135,243;%%" +} +@Article{Baikov:2005rw, + author = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. 
H.", + title = "{Scalar correlator at O(alpha(s)**4), Higgs decay into b- + quarks and bounds on the light quark masses}", + journal = "Phys. Rev. Lett.", + volume = "96", + year = "2006", + pages = "012003", + eprint = "hep-ph/0511063", + SLACcitation = "%%CITATION = HEP-PH/0511063;%%" +} +@Article{Baikov:2008jh, + author = "Baikov, P. A. and Chetyrkin, K. G. and K{\"u}hn, J. H.", + title = "{Hadronic Z- and tau-Decays in Order alpha_s^4}", + year = "2008", + eprint = "0801.1821", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = ARXIV:0801.1821;%%" +} +@Article{Bali:2000vr, + author = "Bali, G. S. and others", + collaboration = "TXL", + title = "Static potentials and glueball masses from {QCD} simulations + with {Wilson} sea quarks", + journal = "Phys. Rev.", + volume = "D62", + year = "2000", + pages = "054503", + eprint = "hep-lat/0003012", + SLACcitation = "%%CITATION = HEP-LAT 0003012;%%" +} +@Article{Bali:2004pb, + author = "Bali, G. S. and others", + title = "String breaking with dynamical {Wilson} fermions", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "140", + pages = "609-611", + year = "2004", + eprint = "hep-lat/0409137", + SLACcitation = "%%CITATION = HEP-LAT 0409137;%%" +} +@Article{Bali:2005fu, + author = "Bali, G. S. and Neff, H. and Duessel, T. and + Lippert, T. and Schilling, K.", + collaboration = "SESAM", + title = "Observation of string breaking in {QCD}", + journal = "Phys. Rev.", + volume = "D71", + year = "2005", + pages = "114513", + eprint = "hep-lat/0505012", + SLACcitation = "%%CITATION = HEP-LAT 0505012;%%" +} +@Article{Bar:2006zj, + author = "B{\"a}r, O. and Jansen, K. and Schaefer, S. and Scorzato, L. + and Shindler, A.", + title = "Overlap fermions on a twisted mass sea", + year = "2006", + eprint = "hep-lat/0609039", + SLACcitation = "%%CITATION = HEP-LAT 0609039;%%" +} +@Article{Baxter:1993bv, + author = "Baxter, R. M. 
and others", + collaboration = "UK{QCD}", + title = "Quenched heavy light decay constants", + journal = "Phys. Rev.", + volume = "D49", + year = "1994", + pages = "1594-1605", + eprint = "hep-lat/9308020", + SLACcitation = "%%CITATION = HEP-LAT 9308020;%%" +} +@Article{Beane:2004tw, + author = "Beane, Silas R.", + title = "{Nucleon masses and magnetic moments in a finite volume}", + journal = "Phys. Rev.", + volume = "D70", + year = "2004", + pages = "034507", + eprint = "hep-lat/0403015", + archivePrefix = "arXiv", + doi = "10.1103/PhysRevD.70.034507", + SLACcitation = "%%CITATION = HEP-LAT/0403015;%%" +} +@Article{Becher:1999he, + author = "Becher, Thomas and Leutwyler, H.", + title = "Baryon chiral perturbation theory in manifestly Lorentz + invariant form", + journal = "Eur. Phys. J.", + volume = "C9", + year = "1999", + pages = "643-671", + eprint = "hep-ph/9901384", + SLACcitation = "%%CITATION = HEP-PH/9901384;%%" +} +@Article{Bietenholz:2004sa, + author = "Bietenholz, W. and others", + collaboration = "\xlf", + title = "Comparison between overlap and twisted mass fermions + towards the chiral limit", + year = "2004", + eprint = "hep-lat/0409109", + SLACcitation = "%%CITATION = HEP-LAT 0409109;%%" +} +@Article{Bietenholz:2004wv, + author = "Bietenholz, W. and others", + collaboration = "\xlf", + title = "Going chiral: Overlap versus twisted mass fermions", + journal = "JHEP", + volume = "12", + year = "2004", + pages = "044", + eprint = "hep-lat/0411001", + SLACcitation = "%%CITATION = HEP-LAT 0411001;%%" +} +@Article{Blossier:2007vv, + author = "Blossier, B. 
and others", + collaboration = "ETM", + title = "{Light quark masses and pseudoscalar decay constants from + Nf=2 Lattice QCD with twisted mass fermions}", + year = "2007", + eprint = "0709.4574", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = ARXIV:0709.4574;%%" +} +@Article{Blum:1994eh, + author = "Blum, Tom and others", + title = "QCD thermodynamics with Wilson quarks at large kappa", + journal = "Phys. Rev.", + volume = "D50", + year = "1994", + pages = "3377-3381", + eprint = "hep-lat/9404006", + SLACcitation = "%%CITATION = HEP-LAT 9404006;%%" +} +@Article{Blum:2000kn, + author = "Blum, T. and others", + title = "Quenched lattice {QCD} with domain wall fermions and the + chiral limit", + journal = "Phys. Rev.", + volume = "D69", + year = "2004", + pages = "074502", + eprint = "hep-lat/0007038", + SLACcitation = "%%CITATION = HEP-LAT 0007038;%%" +} +@Article{Bodin:2005gg, + author = "Bodin, F. and others", + collaboration = "ApeNEXT", + title = "The {apeNEXT} project", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "140", + year = "2005", + pages = "176-182", + SLACcitation = "%%CITATION = NUPHZ,140,176;%%" +} +@Article{Bolder:2000un, + author = "Bolder, B. and others", + title = "A high precision study of the Q anti-Q potential from + {Wilson} loops in the regime of string breaking", + journal = "Phys. Rev.", + volume = "D63", + year = "2001", + pages = "074504", + eprint = "hep-lat/0005018", + SLACcitation = "%%CITATION = HEP-LAT 0005018;%%" +} +@Article{Boucaud:2007uk, + author = "Boucaud, Ph. and others", + collaboration = "ETM", + title = "Dynamical twisted mass fermions with light quarks", + year = "2007", + eprint = "hep-lat/0701012", + SLACcitation = "%%CITATION = HEP-LAT 0701012;%%" +} +@Article{Boucaud:2008xu, + author = "Boucaud, Ph. and others", + collaboration = "ETM", + title = "{Dynamical Twisted Mass Fermions with Light Quarks: + Simulation and Analysis Details}", + journal = "Comput. Phys. 
Commun.", + volume = "179", + year = "2008", + pages = "695-715", + eprint = "0803.0224", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + doi = "10.1016/j.cpc.2008.06.013", + SLACcitation = "%%CITATION = 0803.0224;%%" +} +@Article{Boughezal:2006px, + author = "Boughezal, R. and Czakon, M. and Schutzmeier, T.", + title = "{Charm and bottom quark masses from perturbative QCD}", + journal = "Phys. Rev.", + volume = "D74", + year = "2006", + pages = "074006", + eprint = "hep-ph/0605023", + SLACcitation = "%%CITATION = HEP-PH/0605023;%%" +} +@Article{Boyle:2005fb, + author = "Boyle, P. A. and others", + title = "{QCDOC}: Project status and first results", + journal = "J. Phys. Conf. Ser.", + volume = "16", + year = "2005", + pages = "129-139", + SLACcitation = "%%CITATION = 00462,16,129;%%" +} +@Article{Brower:1994er, + author = "Brower, R. C. and Levi, A. R. and Orginos, K.", + title = "Extrapolation methods for the Dirac inverter in hybrid + Monte Carlo", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "42", + year = "1995", + pages = "855-857", + eprint = "hep-lat/9412004", + SLACcitation = "%%CITATION = HEP-LAT 9412004;%%" +} + +@Article{Brower:1995vx, + author = "Brower, R. C. and Ivanenko, T. and Levi, A. R. and Orginos, + K. N.", + title = "Chronological inversion method for the Dirac matrix in + hybrid Monte Carlo", + journal = "Nucl. Phys.", + volume = "B484", + year = "1997", + pages = "353-374", + eprint = "hep-lat/9509012", + SLACcitation = "%%CITATION = HEP-LAT 9509012;%%" +} + +@Article{Bunk:1995uv, + author = "Bunk, B. and others", + title = "A New simulation algorithm for lattice {QCD} with dynamical + quarks", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "42", + year = "1995", + pages = "49-55", + eprint = "hep-lat/9411016", + SLACcitation = "%%CITATION = HEP-LAT 9411016;%%" +} +@Article{Bunk:1998rm, + author = "Bunk, B. and Elser, Stephan and Frezzotti, R. 
and Jansen, + K.", + title = "{Ordering monomial factors of polynomials in the product + representation}", + journal = "Comput. Phys. Commun.", + volume = "118", + year = "1999", + pages = "95-109", + eprint = "hep-lat/9805026", + archivePrefix = "arXiv", + doi = "10.1016/S0010-4655(99)00198-8", + SLACcitation = "%%CITATION = HEP-LAT/9805026;%%" +} +@Article{Bunk:1998rm, + author = "Bunk, B. and Elser, S. and Frezzotti, R. and Jansen, + K.", + title = "Ordering monomial factors of polynomials in the product + representation", + journal = "Comput. Phys. Commun.", + volume = "118", + year = "1999", + pages = "95-109", + eprint = "hep-lat/9805026", + SLACcitation = "%%CITATION = HEP-LAT 9805026;%%" +} +@Article{Burrage:1998a, + author = " K. Burrage and J. Erhel", + title = "On the performance of various adaptive preconditioned GMRES strategies", + journal = "Num. Lin. Alg. with Appl.", + year = "1998", + volume = "5", + pages = "101-121" +} +@Article{Campbell:1987nv, + author = "Campbell, N. A. and Huntley, A. and Michael, C.", + title = "Heavy quark potentials and hybrid mesons from SU(3) lattice + gauge theory", + journal = "Nucl. Phys.", + volume = "B306", + year = "1988", + pages = "51", + SLACcitation = "%%CITATION = NUPHA,B306,51;%%" +} +@Article{Capitani:2005jp, + author = "Capitani, S. and others", + title = "Parton distribution functions with twisted mass fermions", + journal = "Phys. Lett.", + volume = "B639", + year = "2006", + pages = "520-526", + eprint = "hep-lat/0511013", + SLACcitation = "%%CITATION = HEP-LAT 0511013;%%" +} +@Article{Chen:2003im, + author = "Chen, Y. and others", + title = "Chiral logarithms in quenched {QCD}", + journal = "Phys. Rev.", + volume = "D70", + year = "2004", + pages = "034502", + eprint = "hep-lat/0304005", + SLACcitation = "%%CITATION = HEP-LAT 0304005;%%" +} +@Book{Cheng:2000ct, + author = "Cheng, T. P. and Li, L. 
F.", + title = "Gauge theory of elementary particle physics: Problems and + solutions", + publisher = "Oxford, UK: Clarendon", + year = "2000", + pages = "306", + edition = "", +} +@Article{Chetyrkin:1990kr, + author = "Chetyrkin, K. G. and K{\"u}hn, Johann H.", + title = "{Mass corrections to the Z decay rate}", + journal = "Phys. Lett.", + volume = "B248", + year = "1990", + pages = "359-364", + SLACcitation = "%%CITATION = PHLTA,B248,359;%%" +} +@Article{Chetyrkin:1996cf, + author = "Chetyrkin, K. G. and K{\"u}hn, Johann H. and Steinhauser, M.", + title = "{Three-loop polarization function and O(alpha(s)**2) + corrections to the production of heavy quarks}", + journal = "Nucl. Phys.", + volume = "B482", + year = "1996", + pages = "213-240", + eprint = "hep-ph/9606230", + SLACcitation = "%%CITATION = HEP-PH/9606230;%%" +} +@Article{Chetyrkin:1997mb, + author = "Chetyrkin, K. G. and K{\"u}hn, Johann H. and Steinhauser, M.", + title = "{Heavy quark current correlators to O(alpha(s)**2)}", + journal = "Nucl. Phys.", + volume = "B505", + year = "1997", + pages = "40-64", + eprint = "hep-ph/9705254", + SLACcitation = "%%CITATION = HEP-PH/9705254;%%" +} +@Article{Chetyrkin:1998ix, + author = "Chetyrkin, K. G. and Harlander, R. and Steinhauser, M.", + title = "{Singlet polarization functions at O(alpha(s)**2)}", + journal = "Phys. Rev.", + volume = "D58", + year = "1998", + pages = "014012", + eprint = "hep-ph/9801432", + SLACcitation = "%%CITATION = HEP-PH/9801432;%%" +} +@Article{Chetyrkin:2000zk, + author = "Chetyrkin, K. G. and Harlander, R. V. and K{\"u}hn, Johann H.", + title = "{Quartic mass corrections to R(had) at O(alpha(s)**3)}", + journal = "Nucl. Phys.", + volume = "B586", + year = "2000", + pages = "56-72", + eprint = "hep-ph/0005139", + SLACcitation = "%%CITATION = HEP-PH/0005139;%%" +} +@Article{Chetyrkin:2006xg, + author = "Chetyrkin, K. G. and K{\"u}hn, J. H. 
and Sturm, C.", + title = "{Four-loop moments of the heavy quark vacuum polarization + function in perturbative QCD}", + journal = "Eur. Phys. J.", + volume = "C48", + year = "2006", + pages = "107-110", + eprint = "hep-ph/0604234", + SLACcitation = "%%CITATION = HEP-PH/0604234;%%" +} +@Article{Chiarappa:2004ry, + author = "Chiarappa, T. and others", + title = "{Comparing iterative methods for overlap and twisted mass + fermions}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "140", + year = "2005", + pages = "853-855", + eprint = "hep-lat/0409107", + archivePrefix = "arXiv", + doi = "10.1016/j.nuclphysbps.2004.11.281", + SLACcitation = "%%CITATION = HEP-LAT/0409107;%%" +} +@Article{Chiarappa:2006ae, + author = "Chiarappa, T. and others", + title = "{Numerical simulation of {QCD} with u, d, s and c quarks in + the twisted-mass {W}ilson formulation}", + journal = "Eur. Phys. J.", + volume = "C50", + year = "2007", + pages = "373-383", + eprint = "hep-lat/0606011", + archivePrefix = "arXiv", + doi = "10.1140/epjc/s10052-006-0204-4", + SLACcitation = "%%CITATION = HEP-LAT/0606011;%%" +} +@Article{Chiarappa:2006hz, + author = "Chiarappa, T. and others", + title = "{Iterative methods for overlap and twisted mass fermions}", + year = "2008", + journal = "Comput. Sci. Disc.", + volume = "01", + pages = "015001", + eprint = "hep-lat/0609023", + archivePrefix = "arXiv", + SLACcitation = "%%CITATION = HEP-LAT/0609023;%%" +} +@Article{Cichy:2008gk, + author = "Cichy, K. and Gonzalez Lopez, J. and Jansen, K. and Kujawa, + A. and Shindler, A.", + title = "{Twisted Mass, Overlap and Creutz Fermions: Cut-off Effects + at Tree-level of Perturbation Theory}", + journal = "Nucl. Phys.", + volume = "B800", + year = "2008", + pages = "94-108", + eprint = "0802.3637", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + doi = "10.1016/j.nuclphysb.2008.03.004", + SLACcitation = "%%CITATION = 0802.3637;%%" +} +@Article{Clark:2004cq, + author = "Clark, M. A. and Kennedy, A. 
D.", + title = "Accelerating fermionic molecular dynamics", + year = "2004", + eprint = "hep-lat/0409134", + SLACcitation = "%%CITATION = HEP-LAT 0409134;%%" +} + +@Article{Clark:2005sq, + author = "Clark, M. A. and de Forcrand, Ph. and Kennedy, A. D.", + title = "Algorithm shootout: R versus RHMC", + journal = "PoS", + volume = "LAT2005", + year = "2005", + pages = "115", + eprint = "hep-lat/0510004", + SLACcitation = "%%CITATION = HEP-LAT 0510004;%%" +} +@Article{Clark:2006fx, + author = "Clark, M. A. and Kennedy, A. D.", + title = "{Accelerating Dynamical Fermion Computations using the + Rational Hybrid Monte Carlo (RHMC) Algorithm with Multiple + Pseudofermion Fields}", + journal = "Phys. Rev. Lett.", + volume = "98", + year = "2007", + pages = "051601", + eprint = "hep-lat/0608015", + archivePrefix = "arXiv", + doi = "10.1103/PhysRevLett.98.051601", + SLACcitation = "%%CITATION = HEP-LAT/0608015;%%" +} +@Article{Clark:2006wp, + author = "Clark, M. A. and Kennedy, A. D.", + title = "{Accelerating Staggered Fermion Dynamics with the Rational + Hybrid Monte Carlo (RHMC) Algorithm}", + journal = "Phys. Rev.", + volume = "D75", + year = "2007", + pages = "011502", + eprint = "hep-lat/0610047", + archivePrefix = "arXiv", + doi = "10.1103/PhysRevD.75.011502", + SLACcitation = "%%CITATION = HEP-LAT/0610047;%%" +} +@Article{Colangelo:2001df, + author = "Colangelo, G. and Gasser, J. and Leutwyler, H.", + title = "{pi pi scattering}", + journal = "Nucl. Phys.", + volume = "B603", + year = "2001", + pages = "125-179", + eprint = "hep-ph/0103088", + archivePrefix = "arXiv", + doi = "10.1016/S0550-3213(01)00147-X", + SLACcitation = "%%CITATION = HEP-PH/0103088;%%" +} +@Article{Colangelo:2003hf, + author = "Colangelo, Gilberto and D{\"u}rr, Stephan", + title = "The pion mass in finite volume", + journal = "Eur. Phys. 
J.", + volume = "C33", + year = "2004", + pages = "543-553", + eprint = "hep-lat/0311023", + SLACcitation = "%%CITATION = HEP-LAT/0311023;%%" +} +@Article{Colangelo:2005gd, + author = "Colangelo, Gilberto and D{\"u}rr, Stephan and Haefeli, + Christoph", + title = "Finite volume effects for meson masses and decay + constants", + journal = "Nucl. Phys.", + volume = "B721", + year = "2005", + pages = "136-174", + eprint = "hep-lat/0503014", + SLACcitation = "%%CITATION = HEP-LAT 0503014;%%" +} +@Article{Colangelo:2006mp, + author = "Colangelo, Gilberto and Haefeli, Christoph", + title = "{Finite volume effects for the pion mass at two loops}", + journal = "Nucl. Phys.", + volume = "B744", + year = "2006", + pages = "14-33", + eprint = "hep-lat/0602017", + archivePrefix = "arXiv", + doi = "10.1016/j.nuclphysb.2006.03.010", + SLACcitation = "%%CITATION = HEP-LAT/0602017;%%" +} +@Book{Collins:1994ab, + author = "Collins, J.C.", + title = "Renormalisation", + publisher = "Cambridge University Press", + series = "Cambridge Monographs on Mathematical Physics", + year = "1994", + edition = "", +} +@Article{Creutz:1984fj, + author = "Creutz, M. and Gocksch, A. and Ogilvie, M. and + Okawa, M.", + title = "Microcanonical renormalization group", + journal = "Phys. Rev. Lett.", + volume = "53", + year = "1984", + pages = "875", + SLACcitation = "%%CITATION = PRLTA,53,875;%%" +} +@Article{Creutz:1989wt, + author = "Creutz, M. and Gocksch, A.", + title = "Higher order hybrid monte carlo algorithms", + note = "BNL-42601" +} +@Article{Creutz:1996bg, + author = "Creutz, Michael", + title = "Wilson fermions at finite temperature", + year = "1996", + eprint = "hep-lat/9608024", + SLACcitation = "%%CITATION = HEP-LAT 9608024;%%" +} +@Article{Creutz:1998ee, + author = "Creutz, M.", + title = "Evaluating Grassmann integrals", + journal = "Phys. Rev. 
Lett.", + volume = "81", + year = "1998", + pages = "3555-3558", + eprint = "hep-lat/9806037", + SLACcitation = "%%CITATION = HEP-LAT 9806037;%%" +} +@Article{Cundy:2005pi, + author = "Cundy, N. and others", + title = "Numerical Methods for the {QCD} Overlap Operator IV: Hybrid + Monte Carlo", + year = "2005", + eprint = "hep-lat/0502007", + SLACcitation = "%%CITATION = HEP-LAT 0502007;%%" +} +@Article{David:1984ys, + author = "David, F. and Hamber, H. W.", + title = "Chiral condensate with {Wilson} fermions", + journal = "Nucl. Phys.", + volume = "B248", + year = "1984", + pages = "381", + SLACcitation = "%%CITATION = NUPHA,B248,381;%%" +} +@Article{Davies:2008sw, + author = "Davies, C. T. H. and others", + collaboration = "HPQCD", + title = "{Update: Accurate Determinations of $\alpha_s$ from + Realistic Lattice QCD}", + year = "2008", + eprint = "0807.1687", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0807.1687;%%" +} +@Article{DeGrand:1990dk, + author = "DeGrand, T. A. and Rossi, P.", + title = "Conditioning techniques for dynamical fermions", + journal = "Comput. Phys. Commun.", + volume = "60", + year = "1990", + pages = "211-214", + SLACcitation = "%%CITATION = CPHCB,60,211;%%" +} +@Article{DeGrand:1990ip, + author = "DeGrand, T. A.", + title = "Resonance masses from Monte Carlo simulations (with + emphasis on the rho meson)", + journal = "Phys. Rev.", + volume = "D43", + year = "1991", + pages = "2296-2300", + SLACcitation = "%%CITATION = PHRVA,D43,2296;%%" +} +@Article{DeGrand:2002vu, + author = "DeGrand, Thomas and Hasenfratz, Anna and Kovacs, Tamas G.", + title = "Improving the chiral properties of lattice fermions", + journal = "Phys. 
Rev.", + volume = "D67", + year = "2003", + pages = "054501", + eprint = "hep-lat/0211006", + SLACcitation = "%%CITATION = HEP-LAT 0211006;%%" +} +@Article{DeTar:2007ni, + author = "DeTar, Carleton and Levkova, L.", + title = "Effects of the disconnected flavor singlet corrections on + the hyperfine splitting in charmonium", + journal = "PoS", + volume = "LAT2007", + year = "2007", + pages = "116", + eprint = "0710.1322", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = ARXIV:0710.1322;%%" +} +@Article{DelDebbio:2006cn, + author = "Del Debbio, L. and Giusti, L. and L{\"u}scher, M. and + Petronzio, R. and Tantalo, N.", + title = "QCD with light Wilson quarks on fine lattices. I: First + experiences and physics results", + journal = "JHEP", + volume = "02", + year = "2007", + pages = "056", + eprint = "hep-lat/0610059", + SLACcitation = "%%CITATION = HEP-LAT 0610059;%%" +} +@Article{DellaMorte:2000yp, + author = "Della Morte, M. and Frezzotti, R. and Heitger, J. and Sint, + S.", + title = "Non-perturbative scaling tests of twisted mass {QCD}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "94", + year = "2001", + pages = "617-621", + eprint = "hep-lat/0010091", + SLACcitation = "%%CITATION = HEP-LAT 0010091;%%" +} +@Article{DellaMorte:2001tu, + author = "Della Morte, M. and Frezzotti, R. and Heitger, J.", + title = "Quenched twisted mass {QCD} at small quark masses and in + large volume", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "260-262", + eprint = "hep-lat/0110166", + SLACcitation = "%%CITATION = HEP-LAT 0110166;%%" +} + +@Article{DellaMorte:2001ys, + author = "Della Morte, M. and Frezzotti, R. and Heitger, + J. 
and Sint, S.", + collaboration = "ALPHA", + title = "Cutoff effects in twisted mass lattice {QCD}", + journal = "JHEP", + volume = "10", + year = "2001", + pages = "041", + eprint = "hep-lat/0108019", + SLACcitation = "%%CITATION = HEP-LAT 0108019;%%" +} +@Article{DellaMorte:2003jj, + author = "Della Morte, M. and others", + collaboration = "ALPHA", + title = "Simulating the Schroedinger functional with two pseudo- + fermions", + journal = "Comput. Phys. Commun.", + volume = "156", + year = "2003", + pages = "62-72", + eprint = "hep-lat/0307008", + SLACcitation = "%%CITATION = HEP-LAT 0307008;%%" +} +@Article{DellaMorte:2003mn, + author = "Della Morte, M. and others", + collaboration = "ALPHA", + title = "Lattice HQET with exponentially improved statistical + precision", + journal = "Phys. Lett.", + volume = "B581", + year = "2004", + pages = "93-98", + eprint = "hep-lat/0307021", + SLACcitation = "%%CITATION = HEP-LAT 0307021;%%" +} +@Article{DellaMorte:2003mw, + author = "Della Morte, M. and others", + collaboration = "ALPHA", + title = "Static quarks with improved statistical precision", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "129", + year = "2004", + pages = "346-348", + eprint = "hep-lat/0309080", + SLACcitation = "%%CITATION = HEP-LAT 0309080;%%" +} +@Article{DellaMorte:2005yc, + author = "Della Morte, M. and Shindler, A. and Sommer, + R.", + title = "On lattice actions for static quarks", + year = "2005", + eprint = "hep-lat/0506008", + SLACcitation = "%%CITATION = HEP-LAT 0506008;%%" +} +@Article{Dimopoulos:2006dm, + author = "Dimopoulos, P. and others", + collaboration = "ALPHA", + title = "A precise determination of B(K) in quenched QCD", + journal = "Nucl. Phys.", + volume = "B749", + year = "2006", + pages = "69-108", + eprint = "hep-ph/0601002", + SLACcitation = "%%CITATION = HEP-PH 0601002;%%" +} +@Article{Dimopoulos:2007fn, + author = "Dimopoulos, P. 
and others", + title = "{Renormalisation of quark bilinears with Nf=2 Wilson + fermions and tree-level improved gauge action}", + journal = "PoS", + volume = "LAT2007", + year = "2007", + pages = "241", + eprint = "0710.0975", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0710.0975;%%" +} +@Article{Dimopoulos:2007qy, + author = "Dimopoulos, Petros and Frezzotti, Roberto and Herdoiza, + Gregorio and Urbach, Carsten and Wenger, Urs", + collaboration = "ETM", + title = "{Scaling and low energy constants in lattice QCD with N_f=2 + maximally twisted Wilson quarks}", + journal = "PoS", + volume = "LAT2007", + year = "2007", + pages = "102", + eprint = "0710.2498", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0710.2498;%%" +} +@Article{Dimopoulos:2008sy, + author = "Dimopoulos, Petros and others", + collaboration = "ETM", + title = "{Scaling and chiral extrapolation of pion mass and decay + constant with maximally twisted mass QCD}", + year = "2008", + eprint = "0810.2873", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0810.2873;%%" +} +@Article{Dong:2001fm, + author = "Dong, S. J. and others", + title = "Chiral properties of pseudoscalar mesons on a quenched + 20**4 lattice with overlap fermions", + journal = "Phys. Rev.", + volume = "D65", + year = "2002", + pages = "054507", + eprint = "hep-lat/0108020", + SLACcitation = "%%CITATION = HEP-LAT 0108020;%%" +} +@Article{Duane:1987de, + author = "Duane, S. and Kennedy, A. D. and Pendleton, B. J. and + Roweth, D.", + title = "{H}ybrid monte carlo", + journal = "Phys. Lett.", + volume = "B195", + year = "1987", + pages = "216-222", + SLACcitation = "%%CITATION = PHLTA,B195,216;%%" +} +@Article{Edwards:1996vs, + author = "Edwards, R. G. and Horvath, I. and Kennedy, A. D.", + title = "Instabilities and non-reversibility of molecular dynamics + trajectories", + journal = "Nucl. 
Phys.", + volume = "B484", + year = "1997", + pages = "375-402", + eprint = "hep-lat/9606004", + SLACcitation = "%%CITATION = HEP-LAT 9606004;%%" +} +@Article{Edwards:2004sx, + author = "Edwards, Robert G. and Joo, Balint", + collaboration = "SciDAC", + title = "The {Chroma} software system for lattice {QCD}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "140", + year = "2005", + pages = "832", + eprint = "hep-lat/0409003", + SLACcitation = "%%CITATION = HEP-LAT 0409003;%%" +} +@Article{Eichten:1989zv, + author = "Eichten, E. and Hill, B.", + title = "An effective field theory for the calculation of matrix + elements involving heavy quarks", + journal = "Phys. Lett.", + volume = "B234", + year = "1990", + pages = "511", + SLACcitation = "%%CITATION = PHLTA,B234,511;%%" +} +@Article{Farchioni:2002vn, + author = "Farchioni, F. and Gebert, C. and Montvay, I. + and Scorzato, L.", + title = "Numerical simulation tests with light dynamical quarks", + journal = "Eur. Phys. J.", + volume = "C26", + year = "2002", + pages = "237-251", + eprint = "hep-lat/0206008", + SLACcitation = "%%CITATION = HEP-LAT 0206008;%%" +} +@Article{Farchioni:2004fs, + author = "Farchioni, F. and others", + title = "The phase structure of lattice {QCD} with {Wilson} quarks and + renormalization group improved gluons", + journal = "Eur. Phys. J.", + volume = "C42", + year = "2005", + pages = "73-87", + eprint = "hep-lat/0410031", + SLACcitation = "%%CITATION = HEP-LAT 0410031;%%" +} +@Article{Farchioni:2004ma, + author = "Farchioni, F. and others", + title = "Exploring the phase structure of lattice {{QCD}} with twisted + mass quarks", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "140", + year = "2005", + pages = "240-245", + eprint = "hep-lat/0409098", + SLACcitation = "%%CITATION = HEP-LAT 0409098;%%" +} +@Article{Farchioni:2004us, + author = "Farchioni, F. and others", + title = "Twisted mass quarks and the phase structure of lattice + {QCD}", + journal = "Eur. Phys. 
J.", + volume = "C39", + year = "2005", + pages = "421-433", + eprint = "hep-lat/0406039", + SLACcitation = "%%CITATION = HEP-LAT 0406039;%%" +} +@Article{Farchioni:2005ec, + author = "Farchioni, Federico and others", + title = "Dynamical twisted mass fermions", + journal = "PoS", + volume = "LAT2005", + year = "2006", + pages = "072", + eprint = "hep-lat/0509131", + SLACcitation = "%%CITATION = HEP-LAT 0509131;%%" +} +@Article{Farchioni:2005hf, + author = "Farchioni, F. and others", + title = "Twisted mass fermions: Neutral pion masses from + disconnected contributions", + journal = "PoS", + volume = "LAT2005", + year = "2006", + pages = "033", + eprint = "hep-lat/0509036", + SLACcitation = "%%CITATION = HEP-LAT 0509036;%%" +} +@Article{Farchioni:2005tu, + author = "Farchioni, F. and others", + title = "Lattice spacing dependence of the first order phase + transition for dynamical twisted mass fermions", + journal = "Phys. Lett.", + volume = "B624", + year = "2005", + pages = "324-333", + eprint = "hep-lat/0506025", + SLACcitation = "%%CITATION = HEP-LAT 0506025;%%" +} +@Article{Feldmann:1999uf, + author = "Feldmann, Thorsten", + title = "{Quark structure of pseudoscalar mesons}", + journal = "Int. J. Mod. Phys.", + volume = "A15", + year = "2000", + pages = "159-207", + eprint = "hep-ph/9907491", + SLACcitation = "%%CITATION = HEP-PH/9907491;%%" +} +@Article{Feynman:1948aa, + author = "Feynman, R. P.", + title = "Space-time approach to non-relativistic quantum mechanics", + journal = "Rev. Mod. Phys.", + volume = "20", + year = "1948", + pages = "367-387", + SLACcitation = "%%CITATION = RMPHA,20,367;%%" +} +@Article{Fischer:1996th, + author = "Fischer, S. and others", + title = "A Parallel SSOR Preconditioner for Lattice {QCD}", + journal = "Comp. Phys. Commun.", + volume = "98", + year = "1996", + pages = "20-34", + eprint = "hep-lat/9602019", + SLACcitation = "%%CITATION = HEP-LAT 9602019;%%" +} +@Article{Fokkema:1998aa, + author = "Fokkema, D.~R. 
and Sleijpen, G.~L.~G. and Van~der~Vorst, H.~A.", + title = "{J}acobi-{D}avidson style {QR} and {QZ} algorithms for + the reduction of matrix pencils", + journal = "J. Sci. Comput.", + volume = "20", + year = "1998", + pages = "94-125", +} +@Article{Foster:1998vw, + author = "Foster, M. and Michael, C.", + collaboration = "UKQCD", + title = "Quark mass dependence of hadron masses from lattice {QCD}", + journal = "Phys. Rev.", + volume = "D59", + year = "1999", + pages = "074503", + eprint = "hep-lat/9810021", + SLACcitation = "%%CITATION = HEP-LAT 9810021;%%" +} +@Article{Freund, + author = "Freund, R.W.", + journal = "in Numerical Linear Algebra, L.\ Reichel, A.\ Ruttan and R.S.\ Varga (eds.)", + year = "1993", + pages = "p. 101", +} +@Article{Frezzotti:1997ym, + author = "Frezzotti, R. and Jansen, K.", + title = "A polynomial hybrid Monte Carlo algorithm", + journal = "Phys. Lett.", + volume = "B402", + year = "1997", + pages = "328-334", + eprint = "hep-lat/9702016", + SLACcitation = "%%CITATION = HEP-LAT 9702016;%%" +} +@Article{Frezzotti:1998eu, + author = "Frezzotti, R. and Jansen, K.", + title = "The {PHMC} algorithm for simulations of dynamical fermions. + {I}: Description and properties", + journal = "Nucl. Phys.", + volume = "B555", + year = "1999", + pages = "395-431", + eprint = "hep-lat/9808011", + SLACcitation = "%%CITATION = HEP-LAT 9808011;%%" +} +@Article{Frezzotti:1998yp, + author = "Frezzotti, R. and Jansen, K.", + title = "The {PHMC} algorithm for simulations of dynamical fermions. + {II}: Performance analysis", + journal = "Nucl. Phys.", + volume = "B555", + year = "1999", + pages = "432-453", + eprint = "hep-lat/9808038", + SLACcitation = "%%CITATION = HEP-LAT 9808038;%%" +} +@Article{Frezzotti:1999vv, + author = "Frezzotti, R. and Grassi, P. A. and Sint, + S. and Weisz, P.", + title = "A local formulation of lattice {QCD} without unphysical + fermion zero modes", + journal = "Nucl. Phys. Proc. 
Suppl.", + volume = "83", + year = "2000", + pages = "941-946", + eprint = "hep-lat/9909003", + SLACcitation = "%%CITATION = HEP-LAT 9909003;%%" +} +@Article{Frezzotti:2000nk, + author = "Frezzotti, R. and Grassi, P. A. and Sint, + S. and Weisz, P.", + collaboration = "ALPHA", + title = "Lattice {QCD} with a chirally twisted mass term", + journal = "JHEP", + volume = "08", + year = "2001", + pages = "058", + eprint = "hep-lat/0101001", + SLACcitation = "%%CITATION = HEP-LAT 0101001;%%" +} +@Article{Frezzotti:2001du, + author = "Frezzotti, R. and Sint, S.", + title = "Some remarks on {O(a)} improved twisted mass {QCD}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "814-816", + eprint = "hep-lat/0110140", + SLACcitation = "%%CITATION = HEP-LAT 0110140;%%" +} +@Article{Frezzotti:2001ea, + author = "Frezzotti, R. and Sint, S. and Weisz, P.", + collaboration = "ALPHA", + title = "{O(a)} improved twisted mass lattice {QCD}", + journal = "JHEP", + volume = "07", + year = "2001", + pages = "048", + eprint = "hep-lat/0104014", + SLACcitation = "%%CITATION = HEP-LAT 0104014;%%" +} +@Article{Frezzotti:2003ni, + author = "Frezzotti, R. and Rossi, G. C.", + title = "Chirally improving {Wilson} fermions. {I}: {O(a)} improvement", + journal = "JHEP", + volume = "08", + year = "2004", + pages = "007", + eprint = "hep-lat/0306014", + SLACcitation = "%%CITATION = HEP-LAT 0306014;%%" +} +@Article{Frezzotti:2003xj, + author = "Frezzotti, R. and Rossi, G. C.", + title = "Twisted-mass lattice {QCD} with mass non-degenerate quarks", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "128", + year = "2004", + pages = "193-202", + eprint = "hep-lat/0311008", + SLACcitation = "%%CITATION = HEP-LAT 0311008;%%" +} +@Article{Frezzotti:2004wz, + author = "Frezzotti, R. and Rossi, G. C.", + title = "Chirally improving {Wilson} fermions. 
{II}: Four-quark + operators", + journal = "JHEP", + volume = "10", + year = "2004", + pages = "070", + eprint = "hep-lat/0407002", + SLACcitation = "%%CITATION = HEP-LAT 0407002;%%" +} +@Article{Frezzotti:2005gi, + author = "Frezzotti, R. and Martinelli, G. and Papinutto, M. and + Rossi, G. C.", + title = "Reducing cutoff effects in maximally twisted lattice {QCD} + close to the chiral limit", + journal = "JHEP", + volume = "04", + year = "2006", + pages = "038", + eprint = "hep-lat/0503034", + SLACcitation = "%%CITATION = HEP-LAT 0503034;%%" +} +@Article{Frezzotti:2007qv, + author = "Frezzotti, R. and Rossi, G.", + title = "{O(a^2) cutoff effects in Wilson fermion simulations}", + journal = "PoS", + volume = "LAT2007", + year = "2007", + pages = "277", + eprint = "0710.2492", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0710.2492;%%" +} +@Article{Frezzotti:2008dr, + author = "Frezzotti, R. and Lubicz, V. and Simula, S.", + collaboration = "ETM", + title = "{Electromagnetic form factor of the pion from twisted-mass + lattice {QCD} at {Nf}=2}", + year = "2008", + eprint = "0812.4042", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0812.4042;%%" +} +@Article{Fritzsch:1973pi, + author = "Fritzsch, H. and Gell-Mann, M. and Leutwyler, H.", + title = "Advantages of the color octet gluon picture", + journal = "Phys. Lett.", + volume = "B47", + year = "1973", + pages = "365-368", + SLACcitation = "%%CITATION = PHLTA,B47,365;%%" +} +@Article{Frommer:1994vn, + author = "Frommer, A. and Hannemann, V. and Nockel, B. and Lippert, + T. and Schilling, K.", + title = "Accelerating {Wilson} fermion matrix inversions by means of + the stabilized biconjugate gradient algorithm", + journal = "Int. J. Mod. 
Phys.", + volume = "C5", + year = "1994", + pages = "1073-1088", + eprint = "hep-lat/9404013", + SLACcitation = "%%CITATION = HEP-LAT 9404013;%%" +} +@Article{Frommer:1995ik, + author = "Frommer, Andreas and Nockel, Bertold and Gusken, Stephan + and Lippert, Thomas and Schilling, Klaus", + title = "Many masses on one stroke: Economic computation of quark + propagators", + journal = "Int. J. Mod. Phys.", + volume = "C6", + year = "1995", + pages = "627-638", + eprint = "hep-lat/9504020", + SLACcitation = "%%CITATION = HEP-LAT 9504020;%%" +} +@Article{Furman:1994ky, + author = "Furman, V. and Shamir, Y.", + title = "Axial symmetries in lattice QCD with Kaplan fermions", + journal = "Nucl. Phys.", + volume = "B439", + year = "1995", + pages = "54-78", + eprint = "hep-lat/9405004", + SLACcitation = "%%CITATION = HEP-LAT 9405004;%%" +} +@Article{Garden:1999fg, + author = "Garden, J. and Heitger, J. and Sommer, R. and + Wittig, H.", + collaboration = "ALPHA", + title = "Precision computation of the strange quark's mass in + quenched {QCD}", + journal = "Nucl. Phys.", + volume = "B571", + year = "2000", + pages = "237-256", + eprint = "hep-lat/9906013", + SLACcitation = "%%CITATION = HEP-LAT 9906013;%%" +} +@Article{Garron:2003cb, + author = "Garron, N. and Giusti, L. and Hoelbling, + C. and Lellouch, L. and Rebbi, C.", + title = "B(K) from quenched {QCD} with exact chiral symmetry", + journal = "Phys. Rev. Lett.", + volume = "92", + year = "2004", + pages = "042001", + eprint = "hep-ph/0306295", + SLACcitation = "%%CITATION = HEP-PH 0306295;%%" +} +@Article{Gasser:1982ap, + author = "Gasser, J. and Leutwyler, H.", + title = "Quark masses", + journal = "Phys. Rept.", + volume = "87", + year = "1982", + pages = "77-169", + SLACcitation = "%%CITATION = PRPLC,87,77;%%" +} +@Article{Gasser:1983yg, + author = "Gasser, J. and Leutwyler, H.", + title = "Chiral perturbation theory to one loop", + journal = "Ann.
Phys.", + volume = "158", + year = "1984", + pages = "142", + SLACcitation = "%%CITATION = APNYA,158,142;%%" +} + +@Article{Gasser:1985gg, + author = "Gasser, J. and Leutwyler, H.", + title = "Chiral perturbation theory: expansions in the mass of the + strange quark", + journal = "Nucl. Phys.", + volume = "B250", + year = "1985", + pages = "465", + SLACcitation = "%%CITATION = NUPHA,B250,465;%%" +} +@Article{Gasser:1986vb, + author = "Gasser, J. and Leutwyler, H.", + title = "LIGHT QUARKS AT LOW TEMPERATURES", + journal = "Phys. Lett.", + volume = "B184", + year = "1987", + pages = "83", + SLACcitation = "%%CITATION = PHLTA,B184,83;%%" +} +@Article{Gattringer:2003qx, + author = "Gattringer, C. and others", + collaboration = "BGR", + title = "Quenched spectroscopy with fixed-point and chirally + improved fermions", + journal = "Nucl. Phys.", + volume = "B677", + year = "2004", + pages = "3-51", + eprint = "hep-lat/0307013", + SLACcitation = "%%CITATION = HEP-LAT 0307013;%%" +} +@Article{Gell-Mann:1964nj, + author = "Gell-Mann, M.", + title = "A Schematic model of baryons and mesons", + journal = "Phys. Lett.", + volume = "8", + year = "1964", + pages = "214-215", + SLACcitation = "%%CITATION = PHLTA,8,214;%%" +} +@Article{Gell-Mann:1968rz, + author = "Gell-Mann, M. and Oakes, R. J. and Renner, B.", + title = "Behavior of current divergences under SU(3) x SU(3)", + journal = "Phys. Rev.", + volume = "175", + year = "1968", + pages = "2195-2199", + SLACcitation = "%%CITATION = PHRVA,175,2195;%%" +} +@PhdThesis{Geus:2002, + author = {R. Geus}, + title = {The Jacobi-Davidson algorithm for solving large + sparse symmetric eigenvalue problems with + application to the design of accelerator cavities}, + school = {Swiss Federal Institute Of Technology Z{\"u}rich}, + year = {2002}, + OPTkey = {DISS. ETH NO. 14734}, + OPTtype = {}, + OPTaddress = {}, + OPTmonth = {}, + OPTnote = {}, + OPTannote = {} +} +@Article{Gimenez:1998ue, + author = "Gimenez, V. and Giusti, L. 
and Rapuano, F. and Talevi, M.", + title = "Non-perturbative renormalization of quark bilinears", + journal = "Nucl. Phys.", + volume = "B531", + year = "1998", + pages = "429-445", + eprint = "hep-lat/9806006", + SLACcitation = "%%CITATION = HEP-LAT 9806006;%%" +} +@Article{Gimenez:2005nt, + author = "Gimenez, V. and Lubicz, V. and Mescia, F. and Porretti, V. + and Reyes, J.", + title = "{Operator product expansion and quark condensate from + lattice QCD in coordinate space}", + journal = "Eur. Phys. J.", + volume = "C41", + year = "2005", + pages = "535-544", + eprint = "hep-lat/0503001", + SLACcitation = "%%CITATION = HEP-LAT/0503001;%%" +} +@Article{Ginsparg:1981bj, + author = "Ginsparg, P. H. and {Wilson}, K. G.", + title = "A remnant of chiral symmetry on the lattice", + journal = "Phys. Rev.", + volume = "D25", + year = "1982", + pages = "2649", + SLACcitation = "%%CITATION = PHRVA,D25,2649;%%" +} +@Article{Giusti:1998wy, + author = "Giusti, L. and Rapuano, F. and Talevi, M. and Vladikas, A. + ", + title = "The QCD chiral condensate from the lattice", + journal = "Nucl. Phys.", + volume = "B538", + year = "1999", + pages = "249-277", + eprint = "hep-lat/9807014", + SLACcitation = "%%CITATION = HEP-LAT 9807014;%%" +} +@Article{Giusti:2001pk, + author = "Giusti, L. and Hoelbling, C. and Rebbi, C.", + title = "Light quark masses with overlap fermions in quenched {QCD}", + journal = "Phys. Rev.", + volume = "D64", + year = "2001", + pages = "114508", + eprint = "hep-lat/0108007", + note = "Erratum-ibid.D65:079903,2002", + SLACcitation = "%%CITATION = HEP-LAT 0108007;%%" +} +@Article{Giusti:2002sm, + author = "Giusti, L. and Hoelbling, C. and L{\"u}scher, M. and Wittig, H. + ", + title = "Numerical techniques for lattice QCD in the epsilon- + regime", + journal = "Comput. Phys. 
Commun.", + volume = "153", + year = "2003", + pages = "31-51", + eprint = "hep-lat/0212012", + SLACcitation = "%%CITATION = HEP-LAT 0212012;%%" +} +@Article{Giusti:2007hk, + author = "Giusti, Leonardo", + title = "Light dynamical fermions on the lattice: Toward the chiral + regime of QCD", + journal = "PoS.", + volume = "LAT2006", + year = "2007", + pages = "", + eprint = "hep-lat/0702014", + SLACcitation = "%%CITATION = HEP-LAT/0702014;%%" +} +@Article{Glassner:1996gz, + author = "Gl{\"a}ssner, U. and others", + title = "How to compute {G}reen's functions for entire mass + trajectories within {K}rylov solvers", + year = "1996", + eprint = "hep-lat/9605008", + SLACcitation = "%%CITATION = HEP-LAT 9605008;%%" +} +@Article{Gockeler:1998fn, + author = "G{\"o}ckeler, M. and others", + title = "Scaling of non-perturbatively {O(a)} improved {Wilson} + fermions: Hadron spectrum, quark masses and decay + constants", + journal = "Phys. Rev.", + volume = "D57", + year = "1998", + pages = "5562-5580", + eprint = "hep-lat/9707021", + SLACcitation = "%%CITATION = HEP-LAT 9707021;%%" +} +@Article{Gorishnii:1990vf, + author = "Gorishnii, S. G. and Kataev, A. L. and Larin, S. A.", + title = "{The O (alpha-s**3) corrections to sigma-tot (e+ e- $\to$ + hadrons) and Gamma (tau- $\to$ tau-neutrino + hadrons) in + QCD}", + journal = "Phys. Lett.", + volume = "B259", + year = "1991", + pages = "144-150", + SLACcitation = "%%CITATION = PHLTA,B259,144;%%" +} +@Article{Greenberg:1964pe, + author = "Greenberg, O. W.", + title = "Spin and unitary spin independence in a paraquark model of + baryons and mesons", + journal = "Phys. Rev. Lett.", + volume = "13", + year = "1964", + pages = "598-602", + SLACcitation = "%%CITATION = PRLTA,13,598;%%" +} +@Article{Gregory:2007ce, + author = "Gregory, Eric B. and Irving, Alan and Richards, Chris M. 
+ and McNeile, Craig and Hart, Alistair", + title = "Pseudoscalar Flavor-Singlet Physics with Staggered + Fermions", + year = "2007", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + eprint = "0710.1725", + SLACcitation = "%%CITATION = ARXIV:0710.1725;%%" +} +@Article{Gross:1973id, + author = "Gross, D. J. and Wilczek, F.", + title = "Ultraviolet behavior of non-Abelian gauge theories", + journal = "Phys. Rev. Lett.", + volume = "30", + year = "1973", + pages = "1343-1346", + SLACcitation = "%%CITATION = PRLTA,30,1343;%%" +} +@Article{Gross:1973ju, + author = "Gross, D. J. and Wilczek, F.", + title = "Asymptotically free gauge theories. 1", + journal = "Phys. Rev.", + volume = "D8", + year = "1973", + pages = "3633-3652", + SLACcitation = "%%CITATION = PHRVA,D8,3633;%%" +} +@Article{Gross:1974jv, + author = "Gross, D. J. and Neveu, A.", + title = "Dynamical symmetry breaking in asymptotically free field + theories", + journal = "Phys. Rev.", + volume = "D10", + year = "1974", + pages = "3235", + SLACcitation = "%%CITATION = PHRVA,D10,3235;%%" +} +@Article{Guagnelli:1998ud, + author = "Guagnelli, M. and Sommer, R. and Wittig, H.", + collaboration = "ALPHA", + title = "Precision computation of a low-energy reference scale in + quenched lattice {QCD}", + journal = "Nucl. Phys.", + volume = "B535", + year = "1998", + pages = "389-402", + eprint = "hep-lat/9806005", + SLACcitation = "%%CITATION = HEP-LAT 9806005;%%" +} +@Article{Guagnelli:2004ga, + author = "Guagnelli, M. and others", + collaboration = "Zeuthen-Rome (ZeRo)", + title = "Non-perturbative pion matrix element of a twist-2 operator + from the lattice", + journal = "Eur. Phys. J.", + volume = "C40", + year = "2005", + pages = "69-80", + eprint = "hep-lat/0405027", + SLACcitation = "%%CITATION = HEP-LAT 0405027;%%" +} +@Article{Guagnelli:2004ww, + author = "Guagnelli, M. and others", + collaboration = "Zeuthen-Rome (ZeRo)", + title = "Finite size effects of a pion matrix element", + journal = "Phys. 
Lett.", + volume = "B597", + year = "2004", + pages = "216-221", + eprint = "hep-lat/0403009", + SLACcitation = "%%CITATION = HEP-LAT 0403009;%%" +} +@Article{Guagnelli:2005zc, + author = "Guagnelli, M. and Heitger, J. and Pena, C. and Sint, S. and + Vladikas, A.", + collaboration = "ALPHA", + title = "Non-perturbative renormalization of left-left four-fermion + operators in quenched lattice QCD", + journal = "JHEP", + volume = "03", + year = "2006", + pages = "088", + eprint = "hep-lat/0505002", + SLACcitation = "%%CITATION = HEP-LAT 0505002;%%" +} +@Article{Gupta:1988js, + author = "Gupta, R. and Kilcup, G. W. and Sharpe, S. R. + ", + title = "Tuning the hybrid monte carlo algorithm", + journal = "Phys. Rev.", + volume = "D38", + year = "1988", + pages = "1278", + SLACcitation = "%%CITATION = PHRVA,D38,1278;%%" +} +@Article{Gupta:1989kx, + author = "Gupta, R. and others", + title = "{QCD} with dynamical {Wilson} fermions", + journal = "Phys. Rev.", + volume = "D40", + year = "1989", + pages = "2072", + SLACcitation = "%%CITATION = PHRVA,D40,2072;%%" +} +@Article{Gupta:1990ka, + author = "Gupta, S. and Irback, A. and Karsch, F. and + Petersson, B.", + title = "The acceptance probability in the hybrid monte carlo + method", + journal = "Phys. Lett.", + volume = "B242", + year = "1990", + pages = "437-443", + SLACcitation = "%%CITATION = PHLTA,B242,437;%%" +} +@Article{Gupta:1991sn, + author = "Gupta, R. and others", + title = "{QCD} with dynamical {Wilson} fermions. 2", + journal = "Phys. Rev.", + volume = "D44", + year = "1991", + pages = "3272-3292", + SLACcitation = "%%CITATION = PHRVA,D44,3272;%%" +} +@Unpublished{Gupta:1997nd, + author = "Gupta, R.", + title = "Introduction to lattice {QCD}", + year = "1997", + eprint = "hep-lat/9807028", + note = "Lectures given at Les Houches Summer School in Theoretical Physics, Session 68", + SLACcitation = "%%CITATION = HEP-LAT 9807028;%%" +} +@Article{Han:1965pf, + author = "Han, M. Y. 
and Nambu, Yoichiro", + title = "Three-triplet model with double SU(3) symmetry", + journal = "Phys. Rev.", + volume = "139", + year = "1965", + pages = "B1006-B1010", + SLACcitation = "%%CITATION = PHRVA,139,B1006;%%" +} +@Article{Hasenbusch:2001ne, + author = "Hasenbusch, M.", + title = "Speeding up the {H}ybrid-{M}onte-{C}arlo algorithm for dynamical + fermions", + journal = "Phys. Lett.", + volume = "B519", + year = "2001", + pages = "177-182", + eprint = "hep-lat/0107019", + SLACcitation = "%%CITATION = HEP-LAT 0107019;%%" +} +@Article{Hasenbusch:2002ai, + author = "Hasenbusch, M. and Jansen, K.", + title = "Speeding up lattice {QCD} simulations with clover-improved + {Wilson} fermions", + journal = "Nucl. Phys.", + volume = "B659", + year = "2003", + pages = "299-320", + eprint = "hep-lat/0211042", + SLACcitation = "%%CITATION = HEP-LAT 0211042;%%" +} +@Article{Hasenbusch:2003vg, + author = "Hasenbusch, Martin", + title = "{Full QCD algorithms towards the chiral limit}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "129", + year = "2004", + pages = "27-33", + eprint = "hep-lat/0310029", + archivePrefix = "arXiv", + doi = "10.1016/S0920-5632(03)02504-0", + SLACcitation = "%%CITATION = HEP-LAT/0310029;%%" +} +@Article{Hasenfratz:1998jp, + author = "Hasenfratz, P.", + title = "Lattice {QCD} without tuning, mixing and current + renormalization", + journal = "Nucl. Phys.", + volume = "B525", + year = "1998", + pages = "401-409", + eprint = "hep-lat/9802007", + SLACcitation = "%%CITATION = HEP-LAT 9802007;%%" +} +@Article{Hasenfratz:1998ri, + author = "Hasenfratz, P. and Laliena, V. and Niedermayer, + F.", + title = "The index theorem in {QCD} with a finite cut-off", + journal = "Phys. Lett.", + volume = "B427", + year = "1998", + pages = "125-131", + eprint = "hep-lat/9801021", + SLACcitation = "%%CITATION = HEP-LAT 9801021;%%" +} +@Article{Hasenfratz:2001hp, + author = "Hasenfratz, A. 
and Knechtli, F.", + title = "Flavor symmetry and the static potential with hypercubic + blocking", + journal = "Phys. Rev.", + volume = "D64", + year = "2001", + pages = "034504", + eprint = "hep-lat/0103029", + SLACcitation = "%%CITATION = HEP-LAT 0103029;%%" +} +@Article{Hasenfratz:2001tw, + author = "Hasenfratz, A. and Hoffmann, R. and Knechtli, F.", + title = "The static potential with hypercubic blocking", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "418-420", + eprint = "hep-lat/0110168", + SLACcitation = "%%CITATION = HEP-LAT 0110168;%%" +} +@Article{Hashimoto:2008xg, + author = "Hashimoto, Koichi and Izubuchi, Taku", + title = "{eta' meson from two flavor dynamical domain wall + fermions}", + year = "2008", + eprint = "0803.0186", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = ARXIV:0803.0186;%%" +} +@Article{Heitger:2000ay, + author = "Heitger, J. and Sommer, R. and Wittig, H.", + collaboration = "ALPHA", + title = "Effective chiral Lagrangians and lattice {{QCD}}", + journal = "Nucl. Phys.", + volume = "B588", + year = "2000", + pages = "377-399", + eprint = "hep-lat/0006026", + note = "and references therein", + SLACcitation = "%%CITATION = HEP-LAT 0006026;%%" +} +@Article{Hernandez:1998et, + author = "Hernandez, P. and Jansen, K. and L{\"u}scher, M.", + title = "Locality properties of Neuberger's lattice Dirac operator", + journal = "Nucl. Phys.", + volume = "B552", + year = "1999", + pages = "363-378", + eprint = "hep-lat/9808010", + SLACcitation = "%%CITATION = HEP-LAT 9808010;%%" +} +@Article{Hernandez:2000sb, + author = "Hernandez, P. and Jansen, K. and Lellouch, L.", + title = "A numerical treatment of Neuberger's lattice Dirac + operator", + year = "2000", + eprint = "hep-lat/0001008", + SLACcitation = "%%CITATION = HEP-LAT 0001008;%%" +} +@Article{Hernandez:2001hq, + author = "Hernandez, P. and Jansen, K. and Lellouch, L. 
and + Wittig, H.", + title = "Scalar condensate and light quark masses from overlap + fermions", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "766-771", + eprint = "hep-lat/0110199", + SLACcitation = "%%CITATION = HEP-LAT 0110199;%%" +} +@Article{Hernandez:2001yn, + author = "Hernandez, P. and Jansen, K. and Lellouch, L. and + Wittig, H.", + title = "Non-perturbative renormalization of the quark condensate in + {Ginsparg}-{Wilson} regularizations", + journal = "JHEP", + volume = "07", + year = "2001", + pages = "018", + eprint = "hep-lat/0106011", + SLACcitation = "%%CITATION = HEP-LAT 0106011;%%" +} +@Article{Horsley:2004mx, + author = "Horsley, R. and Perlt, H. and Rakow, P. E. L. and + Schierholz, G. and Schiller, A.", + collaboration = "QCDSF", + title = "One-loop renormalisation of quark bilinears for overlap + fermions with improved gauge actions", + journal = "Nucl. Phys.", + volume = "B693", + year = "2004", + pages = "3-35", + eprint = "hep-lat/0404007", + SLACcitation = "%%CITATION = HEP-LAT 0404007;%%" +} +@Article{Ilgenfritz:2003gw, + author = "Ilgenfritz, E.-M. and Kerler, W. and + M{\"u}ller-Preu{\ss}ker, M. and Sternbeck, A. and St{\"u}ben, H.", + title = "A numerical reinvestigation of the {Aoki} phase with {N(f)} = 2 + {Wilson} fermions at zero temperature", + journal = "Phys. Rev.", + volume = "D69", + year = "2004", + pages = "074511", + eprint = "hep-lat/0309057", + SLACcitation = "%%CITATION = HEP-LAT 0309057;%%" +} +@Article{Ilgenfritz:2006tz, + author = "Ilgenfritz, E. -M. and others", + title = "Twisted mass QCD thermodynamics: First results on apeNEXT", + year = "2006", + eprint = "hep-lat/0610112", + SLACcitation = "%%CITATION = HEP-LAT 0610112;%%" +} +@Article{Iwasaki:1983ck, + author = "Iwasaki, Y.", + title = "Renormalization group analysis of lattice theories and + improved lattice action. 2. 
four-dimensional nonabelian + SU(N) gauge model", + note = "UTHEP-118" +} +@Article{Iwasaki:1985we, + author = "Iwasaki, Y.", + title = "Renormalization group analysis of lattice theories and + improved lattice action: two-dimensional nonlinear O(N) + sigma model", + journal = "Nucl. Phys.", + volume = "B258", + year = "1985", + pages = "141-156", + SLACcitation = "%%CITATION = NUPHA,B258,141;%%" +} +@Article{Iwasaki:1992hn, + author = "Iwasaki, Y. and Kanaya, K. and Sakai, S. and Yoshie, T.", + title = "Quark confinement in multi - flavor quantum + chromodynamics", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "30", + year = "1993", + pages = "327-330", + eprint = "hep-lat/9211035", + SLACcitation = "%%CITATION = HEP-LAT 9211035;%%" +} +@Article{Izubuchi:1998hy, + author = "Izubuchi, T. and Noaki, J. and Ukawa, A.", + title = "Two-dimensional lattice Gross-Neveu model with {Wilson} + fermion action at finite temperature and chemical + potential", + journal = "Phys. Rev.", + volume = "D58", + year = "1998", + pages = "114507", + eprint = "hep-lat/9805019", + SLACcitation = "%%CITATION = HEP-LAT 9805019;%%" +} +@Article{Jacobs:1983ph, + author = "Jacobs, L.", + title = "Undoubling chirally symmetric lattice fermions", + journal = "Phys. Rev. Lett.", + volume = "51", + year = "1983", + pages = "172", + SLACcitation = "%%CITATION = PRLTA,51,172;%%" +} +@Article{Jagels:1994a, + author = "Jagels, C. F. and Reichel, L.", + title = "A fast minimal residual algorithm for shifted unitary matrices", + journal = "Numer. Linear Algebra Appl.", + volume = "1(6)", + pages = "555-570", + year = "1994" +} +@Article{Jagels:1994aa, + author = "Jagels, C. F.
and Reichel, L.", + title = "A Fast Minimal Residual Algorithm for Shifted Unitary + Matrices", + journal = "Numerical Linear Algebra with Applications", + volume = "1(6)", + year = "1994", + pages = "555-570", +} +@Article{Jansen:1994ym, + author = "Jansen, K.", + title = "Domain wall fermions and chiral gauge theories", + journal = "Phys. Rept.", + volume = "273", + year = "1996", + pages = "1-54", + eprint = "hep-lat/9410018", + SLACcitation = "%%CITATION = HEP-LAT 9410018;%%" +} +@Article{Jansen:1995ck, + author = "Jansen, Karl and others", + title = "Non-perturbative renormalization of lattice QCD at all + scales", + journal = "Phys. Lett.", + volume = "B372", + year = "1996", + pages = "275-282", + eprint = "hep-lat/9512009", + SLACcitation = "%%CITATION = HEP-LAT 9512009;%%" +} +@Article{Jansen:1996cq, + author = "Jansen, K. and Liu, C.", + title = "Study of Liapunov exponents and the reversibility of + molecular dynamics algorithms", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "53", + year = "1997", + pages = "974-976", + eprint = "hep-lat/9607057", + SLACcitation = "%%CITATION = HEP-LAT 9607057;%%" +} +@Article{Jansen:1996xp, + author = "Jansen, K.", + title = "Recent developments in fermion simulation algorithms", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "53", + year = "1997", + pages = "127-133", + eprint = "hep-lat/9607051", + SLACcitation = "%%CITATION = HEP-LAT 9607051;%%" +} +@Article{Jansen:1997yt, + author = "Jansen, K. and Liu, C.", + title = "Implementation of Symanzik's improvement program for + simulations of dynamical {Wilson} fermions in lattice {QCD}", + journal = "Comput. Phys. Commun.", + volume = "99", + year = "1997", + pages = "221-234", + eprint = "hep-lat/9603008", + SLACcitation = "%%CITATION = HEP-LAT 9603008;%%" +} +@Article{Jansen:1998mx, + author = "Jansen, K. and Sommer, R.", + collaboration = "ALPHA", + title = "O(alpha) improvement of lattice {QCD} with two flavors of + {Wilson} quarks", + journal = "Nucl.
Phys.", + volume = "B530", + year = "1998", + pages = "185-203", + eprint = "hep-lat/9803017", + SLACcitation = "%%CITATION = HEP-LAT 9803017;%%" +} +@Article{Jansen:2003ir, + author = "Jansen, K. and Shindler, A. and Urbach, C. and + Wetzorke, I.", + collaboration = "\xlf", + title = "Scaling test for {Wilson} twisted mass {QCD}", + journal = "Phys. Lett.", + volume = "B586", + year = "2004", + pages = "432-438", + eprint = "hep-lat/0312013", + SLACcitation = "%%CITATION = HEP-LAT 0312013;%%" +} +@Article{Jansen:2003jq, + author = "Jansen, K. and Nagai, K.-I.", + title = "Reducing residual-mass effects for domain-wall fermions", + journal = "JHEP", + volume = "12", + year = "2003", + pages = "038", + eprint = "hep-lat/0305009", + SLACcitation = "%%CITATION = HEP-LAT 0305009;%%" +} +@Article{Jansen:2003nt, + author = "Jansen, K.", + title = "Actions for dynamical fermion simulations: Are we ready to + go?", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "129", + year = "2004", + pages = "3-16", + eprint = "hep-lat/0311039", + SLACcitation = "%%CITATION = HEP-LAT 0311039;%%" +} +@Article{Jansen:2005cg, + author = "Jansen, K. and others", + collaboration = "\xlf", + title = "Flavour breaking effects of {Wilson} twisted mass fermions", + journal = "Phys. Lett.", + volume = "B624", + year = "2005", + pages = "334-341", + eprint = "hep-lat/0507032", + SLACcitation = "%%CITATION = HEP-LAT 0507032;%%" +} +@Unpublished{Jansen:2005chi, + author = {Jansen, K. and others}, +collaborations = {\xlf}, + title = {}, + note = {in preparation}, + OPTkey = {}, + OPTmonth = {}, + year = {2005}, + OPTannote = {} +} +@Article{Jansen:2005gf, + author = "Jansen, K. and Papinutto, M. and Shindler, A. and Urbach, + C. and Wetzorke, I.", + collaboration = "\xlf", + title = "Light quarks with twisted mass fermions", + journal = "Phys. 
Lett.", + volume = "B619", + year = "2005", + pages = "184-191", + eprint = "hep-lat/0503031", + SLACcitation = "%%CITATION = HEP-LAT 0503031;%%" +} +@Article{Jansen:2005kk, + author = "Jansen, K. and Papinutto, M. and Shindler, A. and Urbach, + C. and Wetzorke, I.", + collaboration = "\xlf", + title = "Quenched scaling of {Wilson} twisted mass fermions", + journal = "JHEP", + volume = "09", + year = "2005", + pages = "071", + eprint = "hep-lat/0507010", + SLACcitation = "%%CITATION = HEP-LAT 0507010;%%" +} +@Article{Jansen:2005yp, + author = "Jansen, Karl and Shindler, Andrea and Urbach, Carsten and + Wenger, Urs", + title = "{HMC} algorithm with multiple time scale integration and mass + preconditioning", + journal = "PoS", + volume = "LAT2005", + year = "2006", + pages = "118", + eprint = "hep-lat/0510064", + SLACcitation = "%%CITATION = HEP-LAT 0510064;%%" +} +@Article{Jansen:2006ks, + author = "Jansen, Karl", + title = "Status report on ILDG activities", + year = "2006", + eprint = "hep-lat/0609012", + SLACcitation = "%%CITATION = HEP-LAT 0609012;%%" +} +@Article{Jansen:2006rf, + author = "Jansen, Karl and Urbach, Carsten", + collaboration = "ETM", + title = "First results with two light flavours of quarks with + maximally twisted mass", + year = "2006", + eprint = "hep-lat/0610015", + SLACcitation = "%%CITATION = HEP-LAT 0610015;%%" +} +@Article{Jansen:2008wv, + author = "Jansen, K. and Michael, C. and Urbach, C.", + collaboration = "ETM", + title = "The eta' meson from lattice {QCD}", + year = "2008", + eprint = "0804.3871", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0804.3871;%%" +} +@Article{Jansen:2008zz, + author = "Jansen, K. and Michael, C. and Urbach, C.", + title = "{The eta-prime meson from lattice QCD}", + journal = "Eur. Phys. 
J.", + volume = "C58", + year = "2008", + pages = "261-269", + doi = "10.1140/epjc/s10052-008-0764-6", + SLACcitation = "%%CITATION = EPHJA,C58,261;%%" +} +@Unpublished{Jegerlehner:1996pm, + author = "Jegerlehner, Beat", + title = "Krylov space solvers for shifted linear systems", + year = "1996", + eprint = "hep-lat/9612014", + note = "unpublished", + SLACcitation = "%%CITATION = HEP-LAT 9612014;%%" +} +@Article{Jegerlehner:1997rn, + author = "Jegerlehner, B.", + title = "Multiple mass solvers", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "63", + year = "1998", + pages = "958-960", + eprint = "hep-lat/9708029", + SLACcitation = "%%CITATION = HEP-LAT 9708029;%%" +} +@Article{Jegerlehner:2003qp, + author = "Jegerlehner, F.", + title = "Theoretical precision in estimates of the hadronic + contributions to (g-2)mu and alpha(QED)(M(Z))", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "126", + year = "2004", + pages = "325-334", + eprint = "hep-ph/0310234", + SLACcitation = "%%CITATION = HEP-PH 0310234;%%" +} +@Article{Jenkins:1990jv, + author = "Jenkins, Elizabeth Ellen and Manohar, Aneesh V.", + title = "Baryon chiral perturbation theory using a heavy fermion + Lagrangian", + journal = "Phys. Lett.", + volume = "B255", + year = "1991", + pages = "558-562", + SLACcitation = "%%CITATION = PHLTA,B255,558;%%" +} +@Article{Kaiser:1998ds, + author = "Kaiser, Roland and Leutwyler, H.", + title = "{Pseudoscalar decay constants at large N(c)}", + year = "1998", + eprint = "hep-ph/9806336", + SLACcitation = "%%CITATION = HEP-PH/9806336;%%" +} + +@Article{Kalkreuter:1995mm, + author = "Kalkreuter, Thomas and Simma, Hubert", + title = "An Accelerated conjugate gradient algorithm to compute low + lying eigenvalues: A Study for the Dirac operator in SU(2) + lattice QCD", + journal = "Comput. Phys. 
Commun.", + volume = "93", + year = "1996", + pages = "33-47", + eprint = "hep-lat/9507023", + SLACcitation = "%%CITATION = HEP-LAT 9507023;%%" +} +@Article{Kalkreuter:1996mm, + author = "Kalkreuter, T. and Simma, H.", + title = "An Accelerated conjugate gradient algorithm to compute low + lying eigenvalues: A Study for the Dirac operator in SU(2) + lattice {QCD}", + journal = "Comput. Phys. Commun.", + volume = "93", + year = "1996", + pages = "33-47", + eprint = "hep-lat/9507023", + SLACcitation = "%%CITATION = HEP-LAT 9507023;%%" +} +@Article{Kamleh:2005wg, + author = "Kamleh, W. and Peardon, M. J.", + collaboration = "TrinLat", + title = "{Polynomial filtering for HMC in lattice QCD}", + journal = "PoS", + volume = "LAT2005", + year = "2006", + pages = "106", + SLACcitation = "%%CITATION = POSCI,LAT2005,106;%%" +} +@Article{Kaplan:1992bt, + author = "Kaplan, D. B.", + title = "A Method for simulating chiral fermions on the lattice", + journal = "Phys. Lett.", + volume = "B288", + year = "1992", + pages = "342-347", + eprint = "hep-lat/9206013", + SLACcitation = "%%CITATION = HEP-LAT 9206013;%%" +} +@Article{Karsten:1980wd, + author = "Karsten, L. H. and Smit, J.", + title = "Lattice fermions: species doubling, chiral invariance, and + the triangle anomaly", + journal = "Nucl. Phys.", + volume = "B183", + year = "1981", + pages = "103", + SLACcitation = "%%CITATION = NUPHA,B183,103;%%" +} +@Article{Kennedy:1990bv, + author = "Kennedy, A. D. and Pendleton, B.", + title = "Acceptances and autocorrelations in hybrid Monte Carlo", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "20", + year = "1991", + pages = "118-121", + SLACcitation = "%%CITATION = NUPHZ,20,118;%%" +} +@Article{Knechtli:1998gf, + author = "Knechtli, F. and Sommer, R.", + collaboration = "ALPHA", + title = "String breaking in SU(2) gauge theory with scalar matter + fields", + journal = "Phys. 
Lett.", + volume = "B440", + year = "1998", + pages = "345-352", + eprint = "hep-lat/9807022", + SLACcitation = "%%CITATION = HEP-LAT 9807022;%%" +} +@Article{Knechtli:2000df, + author = "Knechtli, F. and Sommer, R.", + collaboration = "ALPHA", + title = "String breaking as a mixing phenomenon in the SU(2) Higgs + model", + journal = "Nucl. Phys.", + volume = "B590", + year = "2000", + pages = "309-328", + eprint = "hep-lat/0005021", + SLACcitation = "%%CITATION = HEP-LAT 0005021;%%" +} +@Article{Lacock:1994qx, + author = "Lacock, P. and McKerrell, A. and Michael, C. and Stopher, + I. M. and Stephenson, P. W.", + collaboration = "UKQCD", + title = "Efficient hadronic operators in lattice gauge theory", + journal = "Phys. Rev.", + volume = "D51", + year = "1995", + pages = "6403-6410", + eprint = "hep-lat/9412079", + SLACcitation = "%%CITATION = HEP-LAT 9412079;%%" +} +@Article{Lepage:1992xa, + author = "Lepage, G. Peter and Mackenzie, Paul B.", + title = "On the viability of lattice perturbation theory", + journal = "Phys. Rev.", + volume = "D48", + year = "1993", + pages = "2250-2264", + eprint = "hep-lat/9209022", + SLACcitation = "%%CITATION = HEP-LAT 9209022;%%" +} +@Article{Lepage:2001ym, + author = "Lepage, G. P. and others", + title = "{Constrained curve fitting}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "12-20", + eprint = "hep-lat/0110175", + archivePrefix = "arXiv", + doi = "10.1016/S0920-5632(01)01638-3", + SLACcitation = "%%CITATION = HEP-LAT/0110175;%%" +} +@Article{Lesk:2002gd, + author = "Lesk, V. I. and others", + collaboration = "CP-PACS", + title = "Flavor singlet meson mass in the continuum limit in two- + flavor lattice QCD", + journal = "Phys. 
Rev.", + volume = "D67", + year = "2003", + pages = "074503", + eprint = "hep-lat/0211040", + SLACcitation = "%%CITATION = HEP-LAT/0211040;%%" +} +@Article{Leutwyler:1997yr, + author = "Leutwyler, H.", + title = "{On the 1/N-expansion in chiral perturbation theory}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "64", + year = "1998", + pages = "223-231", + eprint = "hep-ph/9709408", + SLACcitation = "%%CITATION = HEP-PH/9709408;%%" +} +@Article{Leutwyler:2006qq, + author = "Leutwyler, H.", + title = "pi pi scattering", + year = "2006", + eprint = "hep-ph/0612112", + SLACcitation = "%%CITATION = HEP-PH 0612112;%%" +} +@Article{Liu:1997fs, + author = "Liu, C. and Jaster, A. and Jansen, K.", + title = "Liapunov exponents and the reversibility of molecular + dynamics algorithms", + journal = "Nucl. Phys.", + volume = "B524", + year = "1998", + pages = "603-617", + eprint = "hep-lat/9708017", + SLACcitation = "%%CITATION = HEP-LAT 9708017;%%" +} +@Article{Luscher:1985dn, + author = "L{\"u}scher, M.", + title = "{Volume Dependence of the Energy Spectrum in Massive + Quantum Field Theories. 1. Stable Particle States}", + journal = "Commun. Math. Phys.", + volume = "104", + year = "1986", + pages = "177", + doi = "10.1007/BF01211589", + SLACcitation = "%%CITATION = CMPHA,104,177;%%" +} +@Article{Luscher:1990ck, + author = "L{\"u}scher, M. and Wolff, U.", + title = "How to calculate the elastic scattering matrix in two- + dimensional quantum field theories by numerical + simulation", + journal = "Nucl. Phys.", + volume = "B339", + year = "1990", + pages = "222-252", + SLACcitation = "%%CITATION = NUPHA,B339,222;%%" +} +@Article{Luscher:1993dy, + author = "L{\"u}scher, Martin", + title = "{A Portable high quality random number generator for + lattice field theory simulations}", + journal = "Comput. Phys. 
Commun.", + volume = 79, + year = 1994, + pages = "100-110", + eprint = "hep-lat/9309020", + archivePrefix = "arXiv", + doi = "10.1016/0010-4655(94)90232-1", + SLACcitation = "%%CITATION = HEP-LAT/9309020;%%" +} +@Article{Luscher:1993xx, + author = "L{\"u}scher, Martin", + title = "A New approach to the problem of dynamical quarks in + numerical simulations of lattice {QCD}", + journal = "Nucl. Phys.", + volume = "B418", + year = "1994", + pages = "637-648", + eprint = "hep-lat/9311007", + archivePrefix = "arXiv", + doi = "10.1016/0550-3213(94)90533-9", + SLACcitation = "%%CITATION = HEP-LAT/9311007;%%" +} +@Article{Luscher:1993xx, + author = "L{\"u}scher, M.", + title = "A New approach to the problem of dynamical quarks in + numerical simulations of lattice {QCD}", + journal = "Nucl. Phys.", + volume = "B418", + year = "1994", + pages = "637-648", + eprint = "hep-lat/9311007", + SLACcitation = "%%CITATION = HEP-LAT 9311007;%%" +} +@Article{Luscher:1996sc, + author = "L{\"u}scher, M. and Sint, S. and Sommer, R. and + Weisz, P.", + title = "Chiral symmetry and {O(a)} improvement in lattice {QCD}", + journal = "Nucl. Phys.", + volume = "B478", + year = "1996", + pages = "365-400", + eprint = "hep-lat/9605038", + SLACcitation = "%%CITATION = HEP-LAT 9605038;%%" +} +@Article{Luscher:1996ug, + author = "L{\"u}scher, M. and Sint, S. and Sommer, R. and + Weisz, P. and Wolff, U.", + title = "Non-perturbative {O(a)} improvement of lattice {QCD}", + journal = "Nucl. Phys.", + volume = "B491", + year = "1997", + pages = "323-343", + eprint = "hep-lat/9609035", + SLACcitation = "%%CITATION = HEP-LAT 9609035;%%" +} +@Article{Luscher:1998pq, + author = "L{\"u}scher, M.", + title = "Exact chiral symmetry on the lattice and the {Ginsparg}- + {Wilson} relation", + journal = "Phys. 
Lett.", + volume = "B428", + year = "1998", + pages = "342-345", + eprint = "hep-lat/9802011", + SLACcitation = "%%CITATION = HEP-LAT 9802011;%%" +} +@Article{Luscher:2001tx, + author = "L{\"u}scher, Martin", + title = "{Lattice QCD on PCs?}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "21-28", + eprint = "hep-lat/0110007", + archivePrefix = "arXiv", + doi = "10.1016/S0920-5632(01)01639-5", + SLACcitation = "%%CITATION = HEP-LAT/0110007;%%" +} +@Article{Luscher:2003qa, + author = "L{\"u}scher, M.", + title = "Solution of the {D}irac equation in lattice {QCD} using a + domain decomposition method", + journal = "Comput. Phys. Commun.", + volume = "156", + year = "2004", + pages = "209-220", + eprint = "hep-lat/0310048", + SLACcitation = "%%CITATION = HEP-LAT 0310048;%%" +} +@Article{Luscher:2004rx, + author = "L{\"u}scher, M.", + title = "Schwarz-preconditioned {HMC} algorithm for two-flavour + lattice {QCD}", + journal = "Comput. Phys. Commun.", + volume = "165", + year = "2005", + pages = "199", + eprint = "hep-lat/0409106", + SLACcitation = "%%CITATION = HEP-LAT 0409106;%%" +} + +@Article{Luscher:2005mv, + author = "L{\"u}scher, Martin", + title = "Lattice {QCD} with light {W}ilson quarks", + journal = "\href{http://pos.sissa.it/archive/conferences/020/008/LAT2005_002.pdf}{PoS(LAT2005)002}", + year = "2005", + eprint = "hep-lat/0509152", + howpublished="Talk presented at International Symposium on Lattice Field Theory (Lattice 2005)", + SLACcitation = "%%CITATION = HEP-LAT 0509152;%%" +} +@Article{Luscher:2007es, + author = "L{\"u}scher, Martin", + title = "{Deflation acceleration of lattice {QCD} simulations}", + journal = "JHEP", + volume = "12", + year = "2007", + pages = "011", + eprint = "0710.5417", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + doi = "10.1088/1126-6708/2007/12/011", + SLACcitation = "%%CITATION = 0710.5417;%%" +} +@Article{Luscher:ranluxweb, + author = "L{\"u}scher, M.", + title = "Ranlux 
random number generator", + eprint = "http://luscher.web.cern.ch/luscher/ranlux/" +} +@Article{Luscher:sse, + author = "L{\"u}scher, M.", + title = "Lattice QCD parallel benchmark programs", + eprint = "http://luscher.web.cern.ch/luscher/QCDpbm/" +} +@Article{Madras:1988ei, + author = "Madras, N. and Sokal, A. D.", + title = "The Pivot algorithm: a highly efficient Monte Carlo method + for selfavoiding walk", + journal = "J. Statist. Phys.", + volume = "50", + year = "1988", + pages = "109-186", + SLACcitation = "%%CITATION = JSTPB,50,109;%%" +} +@Article{Martinelli:1982mw, + author = "Martinelli, G. and Zhang, Yi-Cheng", + title = "THE CONNECTION BETWEEN LOCAL OPERATORS ON THE LATTICE AND + IN THE CONTINUUM AND ITS RELATION TO MESON DECAY + CONSTANTS", + journal = "Phys. Lett.", + volume = "B123", + year = "1983", + pages = "433", + SLACcitation = "%%CITATION = PHLTA,B123,433;%%" +} +@Article{Martinelli:1994ty, + author = "Martinelli, G. and Pittori, C. and Sachrajda, Christopher + T. and Testa, M. and Vladikas, A.", + title = "{A General method for nonperturbative renormalization of + lattice operators}", + journal = "Nucl. Phys.", + volume = "B445", + year = "1995", + pages = "81-108", + eprint = "hep-lat/9411010", + archivePrefix = "arXiv", + doi = "10.1016/0550-3213(95)00126-D", + SLACcitation = "%%CITATION = HEP-LAT/9411010;%%" +} +@Article{McNeile:2000hf, + author = "McNeile, C. and Michael, C.", + collaboration = "UKQCD", + title = "The eta and eta' mesons in {QCD}", + journal = "Phys. Lett.", + volume = "B491", + year = "2000", + pages = "123-129", + eprint = "hep-lat/0006020", + SLACcitation = "%%CITATION = HEP-LAT 0006020;%%" +} +@Article{McNeile:2000xx, + author = "McNeile, Craig and Michael, Chris", + collaboration = "UKQCD", + title = "Mixing of scalar glueballs and flavour-singlet scalar + mesons", + journal = "Phys. 
Rev.", + volume = "D63", + year = "2001", + pages = "114503", + eprint = "hep-lat/0010019", + SLACcitation = "%%CITATION = HEP-LAT0010019;%%" +} +@Article{McNeile:2001cr, + author = "McNeile, C. and Michael, C. and Sharkey, K. J.", + collaboration = "UKQCD", + title = "The flavor singlet mesons in {QCD}", + journal = "Phys. Rev.", + volume = "D65", + year = "2002", + pages = "014508", + eprint = "hep-lat/0107003", + SLACcitation = "%%CITATION = HEP-LAT 0107003;%%" +} +@Article{McNeile:2002fh, + author = "McNeile, C. and Michael, C.", + collaboration = "UKQCD", + title = "Hadronic decay of a vector meson from the lattice", + journal = "Phys. Lett.", + volume = "B556", + year = "2003", + pages = "177-184", + eprint = "hep-lat/0212020", + SLACcitation = "%%CITATION = HEP-LAT 0212020;%%" +} +@Article{McNeile:2006bz, + author = "McNeile, C. and Michael, C.", + collaboration = "UKQCD", + title = "Decay width of light quark hybrid meson from the lattice", + journal = "Phys. Rev.", + volume = "D73", + year = "2006", + pages = "074506", + eprint = "hep-lat/0603007", + SLACcitation = "%%CITATION = HEP-LAT 0603007;%%" +} +@Article{Meyer:2006ty, + author = "Meyer, Harvey B. and others", + title = "{Exploring the HMC trajectory-length dependence of + autocorrelation times in lattice QCD}", + journal = "Comput. Phys. Commun.", + volume = "176", + year = "2007", + pages = "91-97", + eprint = "hep-lat/0606004", + archivePrefix = "arXiv", + doi = "10.1016/j.cpc.2006.08.002", + SLACcitation = "%%CITATION = HEP-LAT/0606004;%%" +} +@Article{Michael:1982gb, + author = "Michael, C. and Teasdale, I.", + title = "EXTRACTING GLUEBALL MASSES FROM LATTICE QCD", + journal = "Nucl. Phys.", + volume = "B215", + year = "1983", + pages = "433", + SLACcitation = "%%CITATION = NUPHA,B215,433;%%" +} +@Article{Michael:1989mf, + author = "Michael, C.", + title = "Particle decay in lattice gauge theory", + journal = "Nucl. 
Phys.", + volume = "B327", + year = "1989", + pages = "515", + SLACcitation = "%%CITATION = NUPHA,B327,515;%%" +} +@Article{Michael:1991nc, + author = "Michael, C.", + title = "Hadronic forces from the lattice", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "26", + year = "1992", + pages = "417-419", + SLACcitation = "%%CITATION = NUPHZ,26,417;%%" +} +@Article{Michael:1993yj, + author = "Michael, Christopher", + title = "{Fitting correlated data}", + journal = "Phys. Rev.", + volume = "D49", + year = "1994", + pages = "2616-2619", + eprint = "hep-lat/9310026", + archivePrefix = "arXiv", + doi = "10.1103/PhysRevD.49.2616", + SLACcitation = "%%CITATION = HEP-LAT/9310026;%%" +} +@Article{Michael:1994sz, + author = "Michael, Christopher and McKerrell, A.", + title = "{Fitting correlated hadron mass spectrum data}", + journal = "Phys. Rev.", + volume = "D51", + year = "1995", + pages = "3745-3750", + eprint = "hep-lat/9412087", + archivePrefix = "arXiv", + doi = "10.1103/PhysRevD.51.3745", + SLACcitation = "%%CITATION = HEP-LAT/9412087;%%" +} +@Article{Michael:2007vn, + author = "Michael, C. and Urbach, C.", + collaboration = "ETM", + title = "Neutral mesons and disconnected diagrams in Twisted Mass + QCD", + journal = "", + volume = "", + pages = "", + year = "2007", + eprint = "0709.4564", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = ARXIV:0709.4564;%%" +} +@Book{Montvay:1994cy, + author = "Montvay, I. and M{\"u}nster, G.", + title = "Quantum fields on a lattice", + publisher = "Cambridge University Press", + year = "1994", + series = "Cambridge Monographs on Mathematical Physics", +} +@Article{Montvay:1995ea, + author = "Montvay, I.", + title = "An Algorithm for Gluinos on the Lattice", + journal = "Nucl. Phys.", + volume = "B466", + year = "1996", + pages = "259-284", + eprint = "hep-lat/9510042", + SLACcitation = "%%CITATION = HEP-LAT 9510042;%%" +} +@Article{Montvay:2005tj, + author = "Montvay, I. 
and Scholz, E.", + title = "Updating algorithms with multi-step stochastic correction", + journal = "Phys. Lett.", + volume = "B623", + year = "2005", + pages = "73-79", + eprint = "hep-lat/0506006", + SLACcitation = "%%CITATION = HEP-LAT 0506006;%%" +} +@Article{Morgan:2002a, + author = "Morgan, R. B.", + title = "GMRES with Deflated Restarting", + journal = "SIAM J. Sci. Comput.", + volume = "24", + year = "2002", + pages = "20" +} +@Article{Morningstar:2003gk, + author = "Morningstar, Colin and Peardon, Mike J.", + title = "{Analytic smearing of SU(3) link variables in lattice + QCD}", + journal = "Phys. Rev.", + volume = "D69", + year = "2004", + pages = "054501", + eprint = "hep-lat/0311018", + archivePrefix = "arXiv", + doi = "10.1103/PhysRevD.69.054501", + SLACcitation = "%%CITATION = HEP-LAT/0311018;%%" +} +@Article{Munster:2004am, + author = "M{\"u}nster, G.", + title = "On the phase structure of twisted mass lattice {QCD}", + journal = "JHEP", + volume = "09", + year = "2004", + pages = "035", + eprint = "hep-lat/0407006", + SLACcitation = "%%CITATION = HEP-LAT 0407006;%%" +} +@Article{Munster:2004wt, + author = "M{\"u}nster, Gernot and Schmidt, Christian and Scholz, Enno E. + ", + title = "Chiral perturbation theory for twisted mass {QCD}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "140", + year = "2005", + pages = "320-322", + eprint = "hep-lat/0409066", + SLACcitation = "%%CITATION = HEP-LAT 0409066;%%" +} +@Article{Nagai:2005mi, + author = "Nagai, Kei-ichi and Jansen, Karl", + title = "Two-dimensional lattice Gross-Neveu model with Wilson + twisted mass fermions", + journal = "Phys. 
Lett.", + volume = "B633", + year = "2006", + pages = "325-330", + eprint = "hep-lat/0510076", + SLACcitation = "%%CITATION = HEP-LAT 0510076;%%" +} +@Unpublished{Nagai:priv, + author = {Nagai, K}, + title = {Two-dimensional Gross-Neveu model with {Wilson} + twisted mass fermions}, + note = {private communication}, + OPTkey = {}, + OPTmonth = {}, + OPTyear = {}, + OPTannote = {} +} +@Article{Necco:2001xg, + author = "Necco, S. and Sommer, R.", + title = "The {N(f)} = 0 heavy quark potential from short to + intermediate distances", + journal = "Nucl. Phys.", + volume = "B622", + year = "2002", + pages = "328-346", + eprint = "hep-lat/0108008", + SLACcitation = "%%CITATION = HEP-LAT 0108008;%%" +} +@Article{Necco:2003vh, + author = "Necco, Silvia", + journal = "Nucl. Phys.", + volume = "B683", + year = "2004", + pages = "137-167", + eprint = "hep-lat/0309017", + SLACcitation = "%%CITATION = HEP-LAT 0309017;%%" +} +@Article{Neff:2001zr, + author = "Neff, H. and Eicker, N. and Lippert, T. and Negele, J. W. + and Schilling, K.", + title = "On the low fermionic eigenmode dominance in {QCD} on the + lattice", + journal = "Phys. Rev.", + volume = "D64", + year = "2001", + pages = "114509", + eprint = "hep-lat/0106016", + SLACcitation = "%%CITATION = HEP-LAT/0106016;%%" +} +@Article{Neuberger:1997fp, + author = "Neuberger, H.", + title = "Exactly massless quarks on the lattice", + journal = "Phys. Lett.", + volume = "B417", + year = "1998", + pages = "141-144", + eprint = "hep-lat/9707022", + SLACcitation = "%%CITATION = HEP-LAT 9707022;%%" +} +@Article{Neuberger:1998wv, + author = "Neuberger, H.", + title = "More about exactly massless quarks on the lattice", + journal = "Phys. 
Lett.", + volume = "B427", + year = "1998", + pages = "353-355", + eprint = "hep-lat/9801031", + SLACcitation = "%%CITATION = HEP-LAT 9801031;%%" +} +@Article{Niedermayer:1998bi, + author = "Niedermayer, F.", + title = "Exact chiral symmetry, topological charge and related + topics", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "73", + year = "1999", + pages = "105-119", + eprint = "hep-lat/9810026", + SLACcitation = "%%CITATION = HEP-LAT 9810026;%%" +} +@Article{Nielsen:1980rz, + author = "Nielsen, H. B. and Ninomiya, M.", + title = "Absence of neutrinos on a lattice. 1. proof by homotopy + theory", + journal = "Nucl. Phys.", + volume = "B185", + year = "1981", + pages = "20", + SLACcitation = "%%CITATION = NUPHA,B185,20;%%" +} +@Article{Nielsen:1981hk, + author = "Nielsen, H. B. and Ninomiya, M.", + title = "No go theorem for regularizing chiral fermions", + journal = "Phys. Lett.", + volume = "B105", + year = "1981", + pages = "219", + SLACcitation = "%%CITATION = PHLTA,B105,219;%%" +} +@Article{Nielsen:1981xu, + author = "Nielsen, H. B. and Ninomiya, M.", + title = "Absence of neutrinos on a lattice. 2. intuitive topological + proof", + journal = "Nucl. Phys.", + volume = "B193", + year = "1981", + pages = "173", + SLACcitation = "%%CITATION = NUPHA,B193,173;%%" +} +@Article{Noaki:1998zc, + author = "Noaki, J. and Izubuchi, T. and Ukawa, A.", + title = "Two-dimensional Gross-Neveu model with {Wilson} fermion + action at finite temperature and density", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "73", + year = "1999", + pages = "483-485", + eprint = "hep-lat/9809071", + SLACcitation = "%%CITATION = HEP-LAT 9809071;%%" +} +@Article{Orginos:2001xa, + author = "Orginos, K.", + collaboration = "RBC", + title = "Chiral properties of domain wall fermions with improved + gauge actions", + journal = "Nucl. Phys. Proc. 
Suppl.", + volume = "106", + year = "2002", + pages = "721-723", + eprint = "hep-lat/0110074", + SLACcitation = "%%CITATION = HEP-LAT 0110074;%%" +} +@Article{Orth:2005kq, + author = "Orth, B. and Lippert, T. and Schilling, K.", + title = "Finite-size effects in lattice {QCD} with dynamical {Wilson} + fermions", + journal = "Phys. Rev.", + volume = "D72", + year = "2005", + pages = "014503", + eprint = "hep-lat/0503016", + SLACcitation = "%%CITATION = HEP-LAT 0503016;%%" +} +@Article{Osterwalder:1973dx, + author = "Osterwalder, K. and Schrader, R.", + title = "Axioms for euclidean Green's functions", + journal = "Commun. Math. Phys.", + volume = "31", + year = "1973", + pages = "83-112", + SLACcitation = "%%CITATION = CMPHA,31,83;%%" +} +@Article{Osterwalder:1975tc, + author = "Osterwalder, K. and Schrader, R.", + title = "Axioms for euclidean Green's functions. 2", + journal = "Commun. Math. Phys.", + volume = "42", + year = "1975", + pages = "281", + SLACcitation = "%%CITATION = CMPHA,42,281;%%" +} +@Article{Osterwalder:1977pc, + author = "Osterwalder, K. and Seiler, E.", + title = "Gauge field theories on the lattice", + journal = "Ann. Phys.", + volume = "110", + year = "1978", + pages = "440", + SLACcitation = "%%CITATION = APNYA,110,440;%%" +} +@Article{PDBook, + author = "Eidelman, S. and others", + title = "{Review of Particle Physics}", + journal = "{Physics Letters B}", + year = "2004", + volume = "592", + pages = {1+}, + url = {http://pdg.lbl.gov} +} +@Article{Peardon:2002wb, + author = "Peardon, M. J. and Sexton, J.", + collaboration = "TrinLat", + title = "Multiple molecular dynamics time-scales in hybrid Monte + Carlo fermion simulations", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "119", + year = "2003", + pages = "985-987", + eprint = "hep-lat/0209037", + SLACcitation = "%%CITATION = HEP-LAT 0209037;%%" +} +@Book{Peskin:1995ev, + author = {Peskin, M. E. and Schroeder, D. 
V.}, + title = {An Introduction to quantum field theory}, + publisher = {Westview Press}, + year = {1995}, + OPTkey = {}, + OPTvolume = {}, + OPTnumber = {}, + OPTseries = {Advanced Book Program}, + OPTaddress = {Boulder, Colorado}, + OPTedition = {}, + OPTmonth = {}, + OPTnote = {}, + OPTannote = {} +} +@Article{Politzer:1973fx, + author = "Politzer, H. D.", + title = "Reliable perturbative results for strong interactions?", + journal = "Phys. Rev. Lett.", + volume = "30", + year = "1973", + pages = "1346-1349", + SLACcitation = "%%CITATION = PRLTA,30,1346;%%" +} +@Article{Politzer:1974fr, + author = "Politzer, H. D.", + title = "Asymptotic freedom: an approach to strong interactions", + journal = "Phys. Rept.", + volume = "14", + year = "1974", + pages = "129-180", + SLACcitation = "%%CITATION = PRPLC,14,129;%%" +} +@Manual{R:2005, + title = {R: A language and environment for statistical computing}, + author = {{R Development Core Team}}, + organization = {R Foundation for Statistical Computing}, + address = {Vienna, Austria}, + year = {2005}, + note = {{ISBN} 3-900051-07-0}, + url = {http://www.R-project.org}, +} + +@Book{Rothe:1992wy, + author = "Rothe, H.J.", + title = "Lattice gauge theories", + publisher = "World Scientific, Singapore", + year = "1992", + pages = "528", + edition = "", +} +@Article{Rupak:2002sm, + author = "Rupak, G. and Shoresh, N.", + title = "Chiral perturbation theory for the {Wilson} lattice action", + journal = "Phys. Rev.", + volume = "D66", + year = "2002", + pages = "054503", + eprint = "hep-lat/0201019", + SLACcitation = "%%CITATION = HEP-LAT 0201019;%%" +} + +@Article{Saad:1993a, + author = "Saad, Y.", + title = "A flexible inner-outer preconditioned GMRES algorithm", + journal = "SIAM J. Sci. Comput.", + volume = "14 (2)", + year = "1993", + pages = "461-469" +} +@Article{Sachrajda:2004mi, + author = "Sachrajda, C. T. and Villadoro, G.", + title = "{Twisted boundary conditions in lattice simulations}", + journal = "Phys. 
Lett.", + volume = "B609", + year = "2005", + pages = "73-85", + eprint = "hep-lat/0411033", + archivePrefix = "arXiv", + doi = "10.1016/j.physletb.2005.01.033", + SLACcitation = "%%CITATION = HEP-LAT/0411033;%%" +} +@Article{Scorzato:2004da, + author = "Scorzato, L.", + title = "Pion mass splitting and phase structure in twisted mass + {QCD}", + journal = "Eur. Phys. J.", + volume = "C37", + year = "2004", + pages = "445-455", + eprint = "hep-lat/0407023", + SLACcitation = "%%CITATION = HEP-LAT 0407023;%%" +} +@Article{Scorzato:2005rb, + author = "Scorzato, L. and others", + title = "N(f) = 2 lattice {QCD} and chiral perturbation theory", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "153", + year = "2006", + pages = "283-290", + eprint = "hep-lat/0511036", + SLACcitation = "%%CITATION = HEP-LAT 0511036;%%" +} + +@Article{Sexton:1992nu, + author = "Sexton, J. C. and Weingarten, D. H.", + title = "Hamiltonian evolution for the hybrid monte carlo + algorithm", + journal = "Nucl. Phys.", + volume = "B380", + year = "1992", + pages = "665-678", + SLACcitation = "%%CITATION = NUPHA,B380,665;%%" +} + +@Article{Sharpe:1998xm, + author = "Sharpe, S. R. and Singleton, R., Jr.", + title = "Spontaneous flavor and parity breaking with {Wilson} + fermions", + journal = "Phys. Rev.", + volume = "D58", + year = "1998", + pages = "074501", + eprint = "hep-lat/9804028", + SLACcitation = "%%CITATION = HEP-LAT 9804028;%%" +} + +@Article{Sharpe:2004ny, + author = "Sharpe, S. R. and Wu, Jackson M. S.", + title = "Twisted mass chiral perturbation theory at next-to-leading + order", + journal = "Phys. Rev.", + volume = "D71", + year = "2005", + pages = "074501", + eprint = "hep-lat/0411021", + SLACcitation = "%%CITATION = HEP-LAT 0411021;%%" +} +@Article{Sharpe:2004ps, + author = "Sharpe, S. R. and Wu, J. M. S.", + title = "The phase diagram of twisted mass lattice {QCD}", + journal = "Phys. 
Rev.", + volume = "D70", + year = "2004", + pages = "094029", + eprint = "hep-lat/0407025", + SLACcitation = "%%CITATION = HEP-LAT 0407025;%%" +} +@Article{Sharpe:2005rq, + author = "Sharpe, Stephen R.", + title = "Observations on discretization errors in twisted-mass + lattice QCD", + journal = "Phys. Rev.", + volume = "D72", + year = "2005", + pages = "074510", + eprint = "hep-lat/0509009", + SLACcitation = "%%CITATION = HEP-LAT 0509009;%%" +} +@Article{Sheikholeslami:1985ij, + author = "Sheikholeslami, B. and Wohlert, R.", + title = "Improved continuum limit lattice action for qcd with {Wilson} + fermions", + journal = "Nucl. Phys.", + volume = "B259", + year = "1985", + pages = "572", + SLACcitation = "%%CITATION = NUPHA,B259,572;%%" +} +@Article{Shindler:2005vj, + author = "Shindler, Andrea", + title = "Twisted mass lattice {QCD}: Recent developments and results", + journal = "PoS", + volume = "LAT2005", + year = "2006", + pages = "014", + eprint = "hep-lat/0511002", + SLACcitation = "%%CITATION = HEP-LAT 0511002;%%" +} +@Article{Shindler:2006tm, + author = "Shindler, A.", + collaboration = "ETM", + title = "Lattice QCD with light twisted quarks: First results", + year = "2006", + eprint = "hep-ph/0611264", + SLACcitation = "%%CITATION = HEP-PH 0611264;%%" +} +@Article{Shindler:2007vp, + author = "Shindler, A.", + title = "{Twisted mass lattice QCD}", + journal = "Phys. Rept.", + volume = "461", + year = "2008", + pages = "37-110", + eprint = "0707.4093", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + doi = "10.1016/j.physrep.2008.03.001", + SLACcitation = "%%CITATION = 0707.4093;%%" +} +@Article{Sleijpen:1996aa, + author = "G. L. G. Sleijpen and H. A. 
Van der Vorst", + title = "A Jacobi-Davidson iteration method for linear + eigenvalue problems", + journal = "SIAM Journal on Matrix Analysis and Applications", + volume = "17", + year = "1996", + pages = "401-425", +} +@Article{Sommer:1993ce, + author = "Sommer, R.", + title = "A New way to set the energy scale in lattice gauge theories + and its applications to the static force and alpha-s in + SU(2) Yang-Mills theory", + journal = "Nucl. Phys.", + volume = "B411", + year = "1994", + pages = "839-854", + eprint = "hep-lat/9310022", + SLACcitation = "%%CITATION = HEP-LAT 9310022;%%" +} +@Article{Sonneveld:1989cgs, + author = {Peter Sonneveld}, + title = {CGS, a fast Lanczos-type solver for nonsymmetric linear systems}, + journal = {SIAM J. Sci. Stat. Comput.}, + volume = {10}, + number = {1}, + year = {1989}, + issn = {0196-5204}, + pages = {36--52}, + publisher = {Society for Industrial and Applied Mathematics}, + address = {Philadelphia, PA, USA}, + } +@Article{Sternbeck:2003gy, + author = "Sternbeck, A. and Ilgenfritz, E.-M. and Kerler, W. + and M{\"u}ller-Preu{\ss}ker, M. and St{\"u}ben, H.", + title = "The {Aoki} phase for {N(f)} = 2 {Wilson} fermions revisited", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "129", + year = "2004", + pages = "898-900", + eprint = "hep-lat/0309059", + SLACcitation = "%%CITATION = HEP-LAT 0309059;%%" +} +@Article{Sternbeck:2005tk, + author = "Sternbeck, A. and Ilgenfritz, E. -M. and Mueller-Preussker, + M. and Schiller, A.", + title = "{Going infrared in SU(3) Landau gauge gluodynamics}", + journal = "Phys. Rev.", + volume = "D72", + year = "2005", + pages = "014507", + eprint = "hep-lat/0506007", + SLACcitation = "%%CITATION = HEP-LAT/0506007;%%" +} +@Article{Symanzik:1983dc, + author = "Symanzik, K.", + title = "Continuum limit and improved action in lattice theories. 1. + principles and phi**4 theory", + journal = "Nucl. 
Phys.", + volume = "B226", + year = "1983", + pages = "187", + SLACcitation = "%%CITATION = NUPHA,B226,187;%%" +} +@Conference{Symanzik:1981hc, + author = "Symanzik, K.", + title = "Some topics in quantum field theory", + booktitle = "Mathematical problems in theoretical physics", + journal = "Lecture Notes in Physics", + volume = "153", + year = "1981", + pages = "47-58", + editor = "R. Schrader et al.", + note = "Presented at 6th Int. Conf. on Mathematical Physics, + Berlin, West Germany" +} +@Article{Symanzik:1983gh, + author = "Symanzik, K.", + title = "Continuum limit and improved action in lattice theories. 2. + O(N) nonlinear sigma model in perturbation theory", + journal = "Nucl. Phys.", + volume = "B226", + year = "1983", + pages = "205", + SLACcitation = "%%CITATION = NUPHA,B226,205;%%" +} +@Article{Takaishi:1996xj, + author = "Takaishi, T.", + title = "Heavy quark potential and effective actions on blocked + configurations", + journal = "Phys. Rev.", + volume = "D54", + year = "1996", + pages = "1050-1053", + SLACcitation = "%%CITATION = PHRVA,D54,1050;%%" +} +@Article{Takaishi:2005tz, + author = "Takaishi, Tetsuya and de Forcrand, Philippe", + title = "{Testing and tuning new symplectic integrators for hybrid + Monte Carlo algorithm in lattice QCD}", + journal = "Phys. Rev.", + volume = "E73", + year = "2006", + pages = "036706", + eprint = "hep-lat/0505020", + archivePrefix = "arXiv", + doi = "10.1103/PhysRevE.73.036706", + SLACcitation = "%%CITATION = HEP-LAT/0505020;%%" +} +@Article{Takeda:2004xh, + author = "Takeda, S. and others", + title = "A scaling study of the step scaling function in SU(3) gauge + theory with improved gauge actions", + journal = "Phys. 
Rev.", + volume = "D70", + year = "2004", + pages = "074510", + eprint = "hep-lat/0408010", + SLACcitation = "%%CITATION = HEP-LAT 0408010;%%" +} +@Article{Ukawa:2002pc, + author = "Ukawa, A.", + collaboration = "CP-PACS and JL{QCD}", + title = "Computational cost of full {QCD} simulations experienced by + {CP-PACS and JLQCD Collaborations}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "106", + year = "2002", + pages = "195-196", + SLACcitation = "%%CITATION = NUPHZ,106,195;%%" +} +@Article{Urbach:2005ji, + author = "Urbach, C. and Jansen, K. and Shindler, A. and Wenger, U.", + title = "{HMC} algorithm with multiple time scale integration and mass + preconditioning", + journal = "Comput. Phys. Commun.", + volume = "174", + year = "2006", + pages = "87-98", + eprint = "hep-lat/0506011", + SLACcitation = "%%CITATION = HEP-LAT 0506011;%%" +} +@Article{Urbach:2007rt, + author = "Urbach, Carsten", + collaboration = "ETM", + title = "{Lattice QCD with two light Wilson quarks and maximally + twisted mass}", + journal = "PoS", + volume = "LAT2007", + year = "2007", + pages = "022", + eprint = "0710.1517", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0710.1517;%%" +} +@Article{WalkerLoud:2005bt, + author = "Walker-Loud, Andre and Wu, Jackson M. S.", + title = "{Nucleon and Delta masses in twisted mass chiral + perturbation theory}", + journal = "Phys. Rev.", + volume = "D72", + year = "2005", + pages = "014506", + eprint = "hep-lat/0504001", + archivePrefix = "arXiv", + doi = "10.1103/PhysRevD.72.014506", + SLACcitation = "%%CITATION = HEP-LAT/0504001;%%" +} +@Article{Weinberg:1973un, + author = "Weinberg, S.", + title = "Nonabelian gauge theories of the strong interactions", + journal = "Phys. Rev. 
Lett.", + volume = "31", + year = "1973", + pages = "494-497", + SLACcitation = "%%CITATION = PRLTA,31,494;%%" +} +@Article{Weinberg:1978kz, + author = "Weinberg, S.", + title = "Phenomenological Lagrangians", + journal = "Physica", + volume = "A96", + year = "1979", + pages = "327", + SLACcitation = "%%CITATION = PHYSA,A96,327;%%" +} +@Book{Weinberg:1995mt, + author = "Weinberg, S.", + title = "The Quantum theory of fields. Vol. 1: Foundations", + publisher = "Cambridge University Press", + year = "1995", + pages = "609", +} +@Article{Weisz:1982zw, + author = "Weisz, P.", + title = "Continuum limit improved lattice action for pure {Yang-Mills} + theory. 1", + journal = "Nucl. Phys.", + volume = "B212", + year = "1983", + pages = "1", + SLACcitation = "%%CITATION = NUPHA,B212,1;%%" +} +@Article{Weisz:1983bn, + author = "Weisz, P. and Wohlert, R.", + title = "Continuum limit improved lattice action for pure {Yang-Mills} + theory. 2", + journal = "Nucl. Phys.", + volume = "B236", + year = 1984, + pages = 397, + SLACcitation = "%%CITATION = NUPHA,B236,397;%%" +} +@Article{Wennekers:2005wa, + author = "Wennekers, J. and Wittig, H.", + title = "On the renormalized scalar density in quenched QCD", + year = "2005", + eprint = "hep-lat/0507026", + SLACcitation = "%%CITATION = HEP-LAT 0507026;%%" +} +@Article{Weyl:1918ib, + author = "Weyl, H.", + title = "Gravitation und Elektrizit{\"a}t", + journal = "Sitzungsber. Preuss. Akad. Wiss. Berlin (Math. Phys. )", + volume = "1918", + year = "1918", + pages = "465", + SLACcitation = "%%CITATION = SPWPA,1918,465;%%" +} +@Article{Weyl:1929fm, + author = "Weyl, H.", + title = "Electron and gravitation", + journal = "Z. Phys.", + volume = "56", + year = "1929", + pages = "330-352", + SLACcitation = "%%CITATION = ZEPYA,56,330;%%" +} +@Article{Wilson:1974sk, + author = "Wilson, K. G.", + title = "Confinement of quarks", + journal = "Phys. 
Rev.", + volume = "D10", + year = "1974", + pages = "2445-2459", + SLACcitation = "%%CITATION = PHRVA,D10,2445;%%" +} +@Article{Wilson:1974sk, + author = "Wilson, K. G.", + title = "Confinement of quarks", + journal = "Phys. Rev.", + volume = "D10", + year = "1974", + pages = "2445-2459", + SLACcitation = "%%CITATION = PHRVA,D10,2445;%%" +} +@Article{Wilson:1975mb, + author = "Wilson, K. G.", + title = "The renormalization group: Critical phenomena and the kondo + problem", + journal = "Rev. Mod. Phys.", + volume = "47", + year = "1975", + pages = "773", + SLACcitation = "%%CITATION = RMPHA,47,773;%%" +} +@Article{Wilson:1975mb, + author = "Wilson, K. G.", + title = "The renormalization group: Critical phenomena and the kondo + problem", + journal = "Rev. Mod. Phys.", + volume = "47", + year = "1975", + pages = "773", + SLACcitation = "%%CITATION = RMPHA,47,773;%%" +} +@Article{Wolff:2003sm, + author = "Wolff, U.", + collaboration = "ALPHA", + title = "Monte Carlo errors with less errors", + journal = "Comput. Phys. Commun.", + volume = "156", + year = "2004", + pages = "143-153", + eprint = "hep-lat/0306017", + SLACcitation = "%%CITATION = HEP-LAT 0306017;%%" +} +@Article{Yang:1954ek, + author = "Yang, C.-N. and Mills, R. L.", + title = "Conservation of isotopic spin and isotopic gauge + invariance", + journal = "Phys. Rev.", + volume = "96", + year = "1954", + pages = "191-195", + SLACcitation = "%%CITATION = PHRVA,96,191;%%" +} +@Article{Yoshie:2008aw, + author = "Yoshie, Tomoteru", + title = "{Making use of the International Lattice Data Grid}", + journal = "PoS", + volume = "LATTICE2008", + year = "2008", + pages = "019", + eprint = "0812.0849", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0812.0849;%%" +} +@Article{Zweig:1964jf, + author = "Zweig, G.", + title = "An SU(3) model for strong interaction symmetry and its + breaking. 
2", + note = "CERN-TH-412" +} +@Article{cln:web, + author = {}, + eprint = {http://www.ginac.de/CLN/} +} +@Article{deForcrand:1995bs, + author = "de Forcrand, P.", + title = "Progress on lattice {QCD} algorithms", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "47", + year = "1996", + pages = "228-235", + eprint = "hep-lat/9509082", + SLACcitation = "%%CITATION = HEP-LAT 9509082;%%" +} +@Article{deForcrand:1996bx, + author = "de Forcrand, P. and others", + collaboration = "{QCD}-TARO", + title = "Search for effective lattice action of pure {QCD}", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "53", + year = "1997", + pages = "938-941", + eprint = "hep-lat/9608094", + SLACcitation = "%%CITATION = HEP-LAT 9608094;%%" +} +@Article{deForcrand:1996ck, + author = "de Forcrand, P. and Takaishi, T.", + title = "Fast fermion Monte Carlo", + journal = "Nucl. Phys. Proc. Suppl.", + volume = "53", + year = "1997", + pages = "968-970", + eprint = "hep-lat/9608093", + SLACcitation = "%%CITATION = HEP-LAT 9608093;%%" +} +@Article{etmc:asqr, + author = "Frezzotti, R. 
et al.", + title = "{O(a^2) cutoff effects in Wilson fermion simulations}", + journal = "PoS", + volume = "LAT2007", + year = "2007", + pages = "277", + eprint = "0710.2492", + archivePrefix = "arXiv", + primaryClass = "hep-lat", + SLACcitation = "%%CITATION = 0710.2492;%%" +} +@Article{ildg:web, + eprint = {http://cssm.sasr.edu.au/ildg/}, + author = {} +} +@Book{kleinert:1, + author = "Kleinert, H.", + title = "Path integrals in quantum mechanics, statistics and polymer +physics", + publisher = "World Scientific, Singapore", + year = "1995", + edition = "2nd Edition", +} +@Article{lapack:web, + author = {}, + eprint = {http://www.netlib.org/lapack/} +} +@Article{lime:web, + author = {USQCD}, + title = {c-lime library}, + eprint = {http://usqcd.jlab.org/usqcd-docs/c-lime/} +} +@Article{hmc:web, + author = {}, + title = {tmLQCD}, + eprint = {http://www.carsten-urbach.eu/} +} +@Book{meister:1999, + author = {Meister, Andreas}, + title = {Numerik linearer Gleichungssysteme}, + publisher = {vieweg}, + year = {1999}, + OPTkey = {}, + OPTvolume = {}, + OPTnumber = {}, + OPTseries = {}, + OPTaddress = {}, + OPTedition = {}, + OPTmonth = {}, + OPTnote = {}, + OPTannote = {} +} +@Manual{minuit, + title = {MINUIT home page}, + note= {\\seal.web.cern.ch/seal/snapshot/work-packages/mathlibs/minuit/home.html} +} +@Article{mpi:web, + author = {}, + title = {The message passing interface standard}, + eprint = {http://www-unix.mcs.anl.gov/mpi/} +} +@PhdThesis{orth:2004phd, + author = {Orth, B.}, + title = {Finite size effects in lattice {QCD} + with dynamical {Wilson} fermions}, + school = {Bergische Universit{\"a}t Wuppertal}, + year = {2004}, + OPTkey = {}, + OPTtype = {}, + OPTaddress = {}, + OPTmonth = {}, + OPTnote = {}, + OPTannote = {} +} +@PhdThesis{pleiter:phd, + author = {Pleiter, D.}, + title = {XXX}, + school = {Freie {U}niversit{\"a}t {B}erlin}, + year = {2001} +} +@book{press:1992, + address = {Cambridge, UK}, + author = {Press, William and Teukolsky, Saul and 
Vetterling, William and Flannery, Brian }, + citeulike-article-id = {767703}, + edition = {2nd}, + keywords = {bibtex-import}, + posted-at = {2006-07-21 00:26:35}, + priority = {0}, + publisher = {Cambridge University Press}, + title = {Numerical Recipes in C}, + year = {1992} +} +@Manual{root, + title = {The ROOT system home page}, + note = {root.cern.ch/} +} + +@Book{saad:2003a, + author = "Y. Saad", + title = "Iterative Methods for sparse linear systems", + publisher = "SIAM", + year = "2003", + edition = "2nd", +} + +@Article{scidac, + author = {}, + eprint = {http://www.scidac.gov/} +} +@MastersThesis{urbach:2002aa, + author = {Urbach, C.}, + title = {Untersuchung der {R}eversibilit{\"a}tsverletzung im {H}ybrid + {M}onte {C}arlo {A}lgorithmus}, + school = {Freie Universit{\"a}t Berlin, Fachbereich Physik}, + year = {2002} +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/c-code.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/c-code.tex new file mode 100644 index 0000000000000000000000000000000000000000..b29e1a9d2fcd361ef89371f2c9dfc6333b205d74 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/c-code.tex @@ -0,0 +1,327 @@ +\subsection{Some Useful Functions} + +\noindent {\ttfamily Hopping\_Matrix(int ieo, spinor * l, spinor * k)}:\\ +Files: {\ttfamily Hopping\_Matrix.c|h}.\\ +Here the Hopping Matrix is implemented. +\[ +\kappa\sum_\mu \delta_{x,y+\hat\mu}(1+\gamma_\mu) +\] +It connect even (odd) sites in {\ttfamily l} with odd (even) sites in {\ttfamily k} +if {\ttfamily ieo} is set to {\ttfamily EO} ({\ttfamily + OE}). {\ttfamily EO} and {\ttfamily OE} are defined in {\ttfamily + Hopping\_Matrix.h}. + +\noindent {\ttfamily mul\_one\_pm\_imu\_inv(spinor * l, double sign)}:\\ +Files: {\ttfamily tm\_operators.c}.\\ +This multiplies {\ttfamily l} with +\[ +l = (M_{ee}^{\pm})^{-1} l\, , +\] +where the sign of $M_{ee}^{\pm}$ is set corresponding to {\ttfamily + sign}. This connects even with even or odd with odd sites. 
+ +\noindent{\ttfamily mul\_one\_pm\_imu\_sub\_mul\_gamma5(spinor * l, spinor * k, int j, +double sign)}:\\ +Files: {\ttfamily tm\_operators.c}.\\ +This performs: +\[ +l = \gamma_5 ((1 \pm i \mu \gamma_5)k - j)\, , +\] +where the sign is set corresponding to {\ttfamily sign}. + +\noindent{\ttfamily mul\_one\_pm\_imu\_sub\_mul(spinor * l, spinor * k, int j, +double sign)}:\\ +Files: {\ttfamily tm\_operators.c}.\\ +This performs: +\[ +l = ((1 \pm i \mu \gamma_5)k - j)\, , +\] +where the sign is set corresponding to {\ttfamily sign}. + +\noindent{\ttfamily Qtm\_plus\_psi(spinor * l, spinor * k)}:\\ +Files: {\ttfamily tm\_operators.c|h}.\\ +This implements: +\[ +\hat Q_{+} = \gamma_5(M_{oo}^+ - M_{oe}(M_{ee}^+ )^{-1}M_{eo})\, . +\] +The Operator is applied to {\ttfamily k} and the result is stored in +{\ttfamily l}. $\hat Q_{-}$ is implemented in {\ttfamily + Qtm\_minus\_psi(spinor * l, spinor * k)}. + +\noindent{\ttfamily Mtm\_plus\_psi(spinor * l, spinor * k)}:\\ +Files: {\ttfamily tm\_operators.c|h}.\\ +This implements: +\[ +\hat \gamma_5Q_{+} = (M_{oo}^+ - M_{oe}(M_{ee}^+ )^{-1}M_{eo})\, . +\] +The Operator is applied to {\ttfamily k} and the result is stored in +{\ttfamily l}. $\hat \gamma_5Q_{-}$ is implemented in {\ttfamily + Mtm\_minus\_psi(spinor * l, spinor * k)}. + +\noindent{\ttfamily Qtm\_pm\_psi(spinor * l, spinor * k)}:\\ +Files: {\ttfamily tm\_operators.c|h}.\\ +This implements: +\[ +\hat Q_{+} \hat Q_{-}\, . +\] +The Operator is applied to {\ttfamily k} and the result is stored in +{\ttfamily l}. + +\noindent{\ttfamily H\_eo\_tm\_inv\_psi(spinor * l, spinor * k, int ieo, double + sign)}:\\ +Files: {\ttfamily tm\_operators.c|h}.\\ +This implements: +\[ +l = (M_{ee|oo}^\pm)^{-1} M_{eo|oe} k\, . +\] +The sign is set corresponding to {\ttfamily sign}. Setting {\ttfamily + ieo} to {\ttfamily EO} ({\ttfamily OE}) means using $M_{eo}$ +($M_{oe}$) and $M_{ee}$ ($M_{oo}$). 
+ +\subsubsection{Even/odd preconditioning vs.\ no even/odd preconditioning} + +There are also operators that act on ``full size'' spinors, which +store spinors on the full VOLUME of the lattice. These operators are +(like all operators) in \texttt{tm\_operators.c}. Specifically we have +so far (Oct. 5, 2007) +\begin{center} +\begin{tabular}{ll} +$\hat{Q}_+$ & \texttt{Q\_plus\_psi(spinor * const l, spinor * const + k)}\\ +$\hat{Q}_-$ & \texttt{Q\_minus\_psi(spinor * const l, spinor * const + k)}\\ +$\hat{Q}_+ \hat{Q}_-$ & \texttt{Q\_pm\_psi(spinor * const l, spinor * + const k)} +\end{tabular} +\end{center} +All these operators are based on the operator $D_\psi$ defined in +\texttt{D\_psi.c}. + +The sandwiching $\delta S_b$ with even/odd preconditioning is done in +\texttt{deriv\_Sb.c}, whereas in the non-preconditioned case this is +handled in \texttt{deriv\_Sb\_D\_psi.c}. All this is handled within +\texttt{derivative\_psf.c}, so the integration scheme code does not +need to distinguish between the two cases (even/odd +preconditioning vs.\ no even/odd preconditioning). + +\subsubsection{Further remarks} +\begin{itemize} +\item{}the conversion of SU(3) matrices to and from the adjoint + representation is not bijective. If a matrix $A$ is converted by + \texttt{\_trace\_lambda(B,A)} the result $B$ must be divided by + $-2$, if the inverse routine \texttt{\_make\_su3(A,B)} is to + reproduce the original $A$. +\item{}possibly the non even/odd operator combination $\gamma_5 + D_\psi$ should be implemented more efficiently +\end{itemize} + + +\subsection{Chronological Inverter} + +The implementation of the chronological inverter \cite{Brower:1995vx} +can be found in the directory {\ttfamily solver} in the files +{\ttfamily chrono\_guess.c|h}. 
These files basically define two +functions: + +\noindent{\ttfamily void chrono\_add\_solution(spinor * const trial, spinor + ** const v, int index\_array[], const int N, const int \_n, const + int V)}\\ +This function adds the spinor {\ttfamily trial} to the stack of +already existing solutions. These are stored in the array {\ttfamily + v}. In the integer array {\ttfamily index\_array} the current order +of the spinor fields in {\ttfamily v} is stored. {\ttfamily N} must +contain the maximal number of solutions to store in {\ttfamily v}, +{\ttfamily \_n} contains the current index and {\ttfamily V} the volume. + +The memory for {\ttfamily v} must be allocated by the user and it must +be of size {\ttfamily V*N}. The bookkeeping with {\ttfamily \_n} and +{\ttfamily index\_array} is done by the function and should not be +changed by the user elsewhere. + +\noindent{\ttfamily int chrono\_guess(spinor * const trial, spinor * + const phi, spinor ** const v, int index\_array[], + const int \_N, const int \_n, const int V, matrix\_mult f)}\\ +This function returns in the spinor {\ttfamily trial} a trial guess +computed with the chronological guess algorithm. Apart from the +parameters already explained above: {\ttfamily phi} is the source +spinor and {\ttfamily f} (of type {\ttfamily matrix\_mult}) is a pointer to the matrix to be +inverted. + +Note that the chronological solver guess uses lapack functions and is +switched off when lapack is not available. In that case, as well as when the history +length is chosen to be zero, a zero spinor field is returned by the +chrono guess function. See also input parameters concerning CSG. + +\subsection{$\gamma$ matrices} +\label{gammas} +$\gamma_5$ is defined as follows: +\[ + \gamma_5 = + \begin{pmatrix} + +1 & 0 & 0 & 0 \\ + 0 & +1 & 0 & 0 \\ + 0 & 0 & -1 & 0 \\ + 0 & 0 & 0 & -1 \\ + \end{pmatrix}\ . 
+\] + +In the operator the following notation for +the matrices is used: +\[ +\begin{split} + \gamma_0 = \begin{pmatrix} + 0 & 0 & -1 & 0 \\ + 0 & 0 & 0 & -1 \\ + -1 & 0 & 0 & 0 \\ + 0 & -1 & 0 & 0 \\ + \end{pmatrix},\quad + \gamma_1 = \begin{pmatrix} + 0 & 0 & 0 & -i \\ + 0 & 0 & -i & 0 \\ + 0 & +i & 0 & 0 \\ + +i & 0 & 0 & 0 \\ + \end{pmatrix},\\ + \gamma_2 = \begin{pmatrix} + 0 & 0 & 0 & -1 \\ + 0 & 0 & +1 & 0 \\ + 0 & +1 & 0 & 0 \\ + -1 & 0 & 0 & 0 \\ + \end{pmatrix},\quad + \gamma_3 = \begin{pmatrix} + 0 & 0 & -i & 0 \\ + 0 & 0 & 0 & +i \\ + +i & 0 & 0 & 0 \\ + 0 & -i & 0 & 0 \\ + \end{pmatrix}\ .\\ +\end{split} +\] + +\subsection{Pauli matrices} +\[ +\begin{split} + \tau^1 = + \begin{pmatrix} + 0 & 1 \\ + 1 & 0 \\ + \end{pmatrix},\quad + \tau^2 = + \begin{pmatrix} + 0 & -i \\ + i & 0 \\ + \end{pmatrix},\quad + \tau^3 = + \begin{pmatrix} + 1 & 0 \\ + 0 & -1 \\ + \end{pmatrix} +\end{split} +\] + +\subsection{Flavour Split Doublet Operator} + +The convention we use internally for the flavour split doublet Dirac +operator is +\[ +D_h = D_W + m_0 + i\bar\mu\gamma_5\tau^3 - \bar\epsilon\tau^1 +\] +The programme {\ttfamily invert\_doublet} inverts this operator, but +the source and sink are written in a convention corresponding to +\[ +D_h' = D_W + m_0 + i\bar\mu\gamma_5\tau^1 + \bar\epsilon\tau^3 +\] +The relation between the two is given by +\[ +D_h' = \frac{1}{\sqrt{2}}(1+i\tau^2)\ D_h\ \frac{1}{\sqrt{2}}(1-i\tau^2)\, . +\] +The implementation is then such that first the source $\xi$ is multiplied +with +\[ +\xi\to\xi=\frac{1}{\sqrt{2}}(1-i\tau^2)\xi +\] +on which $D_h$ is inverted and the solution $\phi$ is obtained. The +solution is then multiplied +\[ +\phi \to \phi=\frac{1}{\sqrt{2}}(1+i\tau^2)\phi +\] +to obtain the result for $D_h'$. + +The convention $D_h'$ is the one of \cite{Chiarappa:2006ae} with +$\bar\mu = \mu_\sigma$ and $\bar\epsilon = \mu_\delta$. 
{\ttfamily + invert\_doublet} inverts with the source first set for the upper +flavour and then with the same source set for the lower +flavour. The other one is set to zero. Hence, the output is a set of +four Dirac fermion spinor fields, which are stored in the order upper, +lower, upper, lower flavour. The former two correspond to the source +in the upper flavour, the latter to the source in the lower flavour. + +\subsection{Stochastic Volume Sources} + +In order to compute disconnected contributions, volume (all spin, +colour, space and time) sources are implemented. In this case only one +inversion is required. The volume sources are generated with gaussian +noise ($\sigma=1$) in real and imaginary part of the whole source +spinor. Note that the normalisation with $1/\sqrt{2}$ is \emph{not} +done and needs to be taken care of in the analysis. + +For the hopping parameter noise reduction method the following is +needed: Following the notation in Ref.~\cite{Boucaud:2008xu} the +operator can be written as +\[ +D_h' = A + H = (1+H\cdot B)\cdot A,\qquad B=1/A\, , +\] +where $H^\dagger = \gamma_5 H \gamma_5$ and +\[ +A = 1 + i\gamma_5\tau^1\tilde\mu_\sigma + \tau^3\tilde\mu_\delta +\] +where in the hopping parameter representation $\tilde\mu_\sigma = +2\kappa\mu_\sigma$ and $\tilde\mu_\delta=2\kappa\mu_\delta$. $A$ can +be inverted easily to +\[ +A^{-1}=\frac{1-i\gamma_5 \tau^1 \tilde\mu_\sigma - \tau^3 + \tilde\mu_\delta}{1+\tilde\mu_\sigma^2-\tilde\mu_\delta^2} +\] +It follows that +\[ +1/D_h' = B-BHB+B(HB)^2-B(HB)^3+1/D_h'(HB)^4\ . +\] +Then, since $\gamma_5$ commutes with $B$ one can evaluate the last +term stochastically for any $\gamma$ and/or colour matrix $X$ like +\[ +X (1/D_h')(HB)^4 = \lim_{R\to\infty}\left[(\gamma_5 (B^{\dagger} H )^4 + \gamma_5 \xi)^* X \phi\right]_R +\] +where +\[ +\phi = (D_h')^{-1}\xi +\] +and +\[ +B(\tilde\mu_\sigma)^\dagger \equiv B(-\tilde\mu_\sigma)\ . +\] +The remaining terms can be computed exactly. 
For any source $\xi$ we +therefore have to generate +\[ +\xi_r = \gamma_5 (B^{\dagger} H )^4\gamma_5 \xi. +\] + + +%(ii)-------------------------------------------- +%Use source in all flavour spin colour space time (Z2xZ2) +% +%Let u,v be flavour indices (c,s) equivalent +% +%M \phi = \xi with flavour explicit M_{uv) \phi_v = \xi_u +% +% Then required quantity is +% Tr (X M^{-1} )_{uv} where X is diagonal in flavour +% and Tr is sum over colour, spin, space (at a given time) +% = Tr( \xi^*_v X \ph_u ) +% = Tr ( {(g_5 (B^{\dag} H)^4 g_5 \xi)_v}^* X \phi_u) +% for uv element + + + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/command.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/command.tex new file mode 100644 index 0000000000000000000000000000000000000000..cf495acf098c0444eaab314119085e758adcdc01 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/command.tex @@ -0,0 +1,100 @@ +\newtheorem{theorem}{Theorem}[section] +\newtheorem{theo}{Theorem}[section] +\newtheorem{exam}{Theorem}[section] +\newtheorem{example}[exam]{Example} +\newtheorem{corollary}[theorem]{Lemma} +\newtheorem{lemma}[theorem]{Lemma} +\newtheorem{definition}[theo]{Definition} +\newtheorem{remark}[exam]{Remark} +\newtheorem{problem}[theorem]{Problem} +\newtheorem{question}[theorem]{Question} +\newtheorem{satz}[theorem]{Theorem} +\newtheorem{Aussage}[theorem]{Aussage} + +\newenvironment{proof}{\noindent{\bf Proof:~}}% +{\null\hfill $\Box$\par\medskip} +\newenvironment{fixme}{\noindent{\bf FIXME:\\~}}% +{\\{\bf ENDFIXME}\par\medskip} +\newenvironment{Bemerkung}{\noindent{\bf Remark:\\~}}% +{} + +%\renewcommand{\theequation}{\thesection-\arabic{equation}} +%\renewcommand{\thefigure}{\thesection.\arabic{figure}} +\renewcommand{\topfraction}{1} +\renewcommand{\bottomfraction}{1} +\renewcommand{\textfraction}{0} +%\renewcommand{\chaptermark}[1]% +%{\markboth{\chaptername\ \thechapter\ #1}{}} 
+\renewcommand{\sectionmark}[1]% +{\markright{\thesection\ #1}} + +%Spur, Real- und Imaginaerteil +\newcommand{\tr}{\operatorname{Tr}} +\newcommand{\re}{\operatorname{Re}} +\newcommand{\im}{\operatorname{Im}} +\newcommand{\C}{\mathbb{C}} +\newcommand{\R}{\mathbb{R}} +\newcommand{\ID}{\mathbb{I}} +\newcommand{\rf}{\mathcal{R}_5^{\mathbf{sp}}} +\newcommand{\hQpm}{\hat Q_{\pm}} +\newcommand{\hWpm}{\hat W_{\pm}} +\newcommand{\hQp}{\hat Q_{+}} +\newcommand{\hQm}{\hat Q_{-}} +\newcommand{\hWp}{\hat W_{+}} +\newcommand{\hWm}{\hat W_{-}} +\newcommand{\Wpm}{W_{\pm}} +\newcommand{\Qpm}{Q_{\pm}} +\newcommand{\Qp}{Q_{+}} +\newcommand{\Qm}{Q_{-}} +\newcommand{\Wp}{W_{+}} +\newcommand{\Wm}{W_{-}} +\newcommand{\Qnd}{Q_{\textrm{ND}}} +%\newcommand{\myinput}[1]{\def\old{\filename}% +% \def\filename{\tiny [#1.tex,\today]}% +% \cfoot{\filename}% +% \input{#1}% +% \cfoot{\old}% +%} +\newcommand{\dtau}{\Delta\hspace{-.06cm}\tau} +\newcommand{\npf}{N_\mathrm{PF}} + +\newcommand{\myinput}[1]{% + \input{#1}% +} + +%229und 230 oder 235 +%\newcommand{\totheright}{\ding{230}{\sffamily\scshape Rechts}: } +\newcommand{\totheright}{(b): } +%\newcommand{\toright}{\ding{230}{\sffamily\scshape Right}} +\newcommand{\toright}{b} +%\newcommand{\totheleft}{\reflectbox{\ding{229}}{\sffamily\scshape Links}: } +\newcommand{\totheleft}{(a): } +%\newcommand{\toleft}{\reflectbox{\ding{229}}{\sffamily\scshape Links}} +\newcommand{\toleft}{a} +\newcommand{\Mathlogo}{{\scshape Mathematica}\Pisymbol{psy}{226} } + +%Farben von root nach dessen nummern +\definecolor{eins}{rgb}{0,0,0}%black +\definecolor{zwei}{rgb}{1,0,0}%red +\definecolor{drei}{rgb}{0,1,0}%green +\definecolor{vier}{rgb}{0,0,1}%blue +\definecolor{fuenf}{rgb}{1,1,0}%yellow +\definecolor{pink}{rgb}{1,0,1} +\definecolor{sechs}{rgb}{1,0,1}%pink +\definecolor{sieben}{rgb}{0,1,1}%cyan +\definecolor{acht}{rgb}{0.35,0.83,0.33}%darkgreen +\definecolor{neun}{rgb}{0.35,0.33,0.85}%darkblue + +\def\sometext{% +Text Text Text Text Text Text Text Text Text Text 
Text Text Text Text Text Text Text Text Text Text % +Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text % +Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text % +Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text Text % +} + +\endinput + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/components.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/components.tex new file mode 100644 index 0000000000000000000000000000000000000000..a261695f76a92f0154b2b46247dfb6de0be3568d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/components.tex @@ -0,0 +1,620 @@ +\subsection{Dirac Operator} +\label{sec:dirac} + +The Dirac operator is the kernel routine of any lattice QCD +application, because its inverse is needed for the HMC update +procedure and also for computing correlation functions. The inversion +is usually performed by means of iterative solvers, like the conjugate +gradient algorithm, and hence the repeated application of the Dirac +operator to a spinor field is needed. Thus the optimisation of this +routine deserves special attention. + +At some space-time point $x$ the application of a Wilson type Dirac +operator is mainly given by +\begin{equation} + \label{eq:Dpsi} + \begin{split} + \phi(x) = & (m_0 + 4r +i\mu_q\gamma_5)\psi(x) \\ + &- \frac{1}{2}\sum_{\mu = 1}^4\Bigl[ + U_{x,\mu}(r+\gamma_\mu) \psi(x+a\hat\mu) + U^\dagger_{x-a\hat\mu,\mu} + (r-\gamma_\mu)\psi(x-a\hat\mu)\Bigr] \\ + \end{split} +\end{equation} +where $r$ is the Wilson parameter, which we set to one in the +following. The most computer time consuming part is the next-neighbour +interaction part. + +For this part it is useful to observe that +\[ +(1\pm \gamma_\mu)\psi +\] +has only two independent spinor components, the other two follow +trivially. 
So only two of the components need to be computed, then +to be +multiplied with the corresponding gauge field $U$, and then the other +two components are to be reconstructed. + +The operation in eq.~(\ref{eq:Dpsi}) must be performed for each space-time +point $x$. If the loop over $x$ is performed such that all elements +of $\phi$ are accessed sequentially (one output stream), it is clear +that the elements in $\psi$ and $U$ cannot be accessed sequentially as +well. This non-sequential access may lead to serious performance +degradations due to too many cache misses, because modern processing +units have only a very limited number of input streams available. + +While the $\psi$ field is usually different from +one application of the Dirac operator to the next, the gauge field +often stays the same for a large number of applications. This is for +instance the case in iterative solvers, where the Dirac operator is applied +$\mathcal{O}(1000)$ times with fixed gauge fields. Therefore it is +useful to construct a double copy of the original gauge field sorted +such that the elements are accessed exactly in the order needed in the +Dirac operator. For the price of additional memory, with this simple +change one can obtain large performance improvements, depending on the +architecture. The double copy must be updated whenever the gauge field +changes. This feature is available in the code at configure time; the +relevant switch is {\ttfamily --with-gaugecopy}. + +Above we were assuming that we run sequentially through the resulting +spinor field $\phi$. Another possibility is to run sequentially +through the source spinor field $\psi$. Moreover, one could split up +the operation (\ref{eq:Dpsi}) as follows, introducing intermediate +result vectors $\varphi^\pm$ with only two spinor components per lattice +site\footnote{We thank Peter Boyle for useful discussions on this + point.}. 
Concentrating on the hopping part only, we would have +\begin{equation} + \label{eq:Dsplit} + \begin{split} + \varphi^+(x, \mu) &= P_{+\mu}^{4\to2}\ U_{x,\mu}(r+\gamma_\mu) \psi(x) \\ + \varphi^-(x, \mu) &= P_{-\mu}^{4\to2}\ (r-\gamma_\mu) \psi(x)\; . \\ + \end{split} +\end{equation} +From $\varphi^\pm$ we can then reconstruct the resulting spinor field +as +\begin{equation} + \label{eq:Dunsplit} + \begin{split} + \phi(x) =& \sum_\mu P_{+\mu}^{2\to4}\varphi^+(x+a\hat\mu,\mu) \\ + & + \sum_\mu P_{-\mu}^{2\to4}U^\dagger_{x-a\hat\mu,\mu}\varphi^-(x-a\hat\mu,\mu) +% \phi(x-a\hat\mu) &= P_{+\mu}^{2\to4}\ \varphi^+(x, \mu) \\ +% \phi(x+a\hat\mu) &= P_{-\mu}^{2\to4}\ +% U^\dagger_{x-a\hat\mu,\mu}\varphi^-(x, \mu\; .) + \end{split} +\end{equation} +Here we denote with $P_{\pm\mu}^{4\to2}$ the projection to the two +independent spinor components for $1\pm\gamma_\mu$ and with +$P_{\pm\mu}^{2\to4}$ the corresponding reconstruction. +The half spinor fields $\varphi^\pm$ can be interlaced in +memory such that $\psi(x)$ as well as $\varphi^\pm(x)$ are always +accessed sequentially in memory. The same is possible for the gauge +fields, as explained above. So only for $\phi$ we cannot avoid strided +access. So far we have only introduced extra fields $\varphi^\pm$, +which need to be loaded and stored from and to main memory, and +divided the Dirac operator into two steps (\ref{eq:Dsplit}) and +(\ref{eq:Dunsplit}) which are very balanced with regard to memory +bandwidth and floating point operations. + +The advantage of this implementation of the Dirac operator comes +in the parallel case. In step (\ref{eq:Dsplit}) we need only elements +of $\psi(x)$, which are locally available on each node. So this step +can be performed without +any communication. In between step (\ref{eq:Dsplit}) and +(\ref{eq:Dunsplit}) one then needs to communicate part of +$\varphi^\pm$, however only half the amount is needed compared to a +communication of $\psi$. 
After the second step there is then no +further communication needed. Hence, one can reduce the amount of data +to be sent by a factor of two. + +There is yet another performance improvement possible with this form +of the Dirac operator, this time for the price of precision. One can +store the intermediate fields $\varphi^\pm$ with reduced precision, +e.g. in single precision when the regular spinor fields are in double +precision. This will lead to a result with reduced precision; however, +in a situation where this is not important, as for instance in the MD +update procedure, it reduces the data to be communicated by another +factor of two. The required memory bandwidth is reduced as well. +This version of the hopping matrix (currently it is only implemented +for the hopping matrix) is available at configure time with the switch +{\ttfamily --enable-halfspinor}. + +The reduced precision version (sloppy precision) is available through +the input parameter {\ttfamily UseSloppyPrecision}. It will be used in +the MD update where appropriate. Moreover, it is implemented in the CG +iterative solver following the ideas outlined in +Ref.~\cite{Chiarappa:2006hz} for the overlap operator. + +The various implementations of the Dirac operator can be found in the +file {\ttfamily D\_psi.c} and -- as needed for even/odd +preconditioning -- the hopping matrix in the file {\ttfamily + Hopping\_Matrix.c}. There are many different versions of these two +routines available, each optimised for a particular architecture, +e.g. for the Blue Gene/P double hummer processor or the streaming SIMD +extensions of modern PC processors (SSE2 and SSE3), see also +Ref.~\cite{Luscher:2001tx}. Martin L{\"u}scher has made available his +standard C and SSE/SSE2 Dirac operator~\cite{Luscher:sse} under the +GNU General Public License, which are partly included in the tmLQCD +package. 
+ +\subsubsection{Blue Gene Version} + +The IBM PowerPC 450d processor used on the Blue Gene architecture +provides a dual FPU, which supports a set of SIMD operations working +on 32 special registers useful for lattice QCD. These operations can +be accessed using built-in functions of the IBM XLC compiler. +The file {\ttfamily bgl.h} contains all macros relevant for the Blue +Gene version of the hopping matrix and the Dirac operator. + +\begin{algorithm}[t] + \caption{$\varphi^+ = \kappa\, U\, P_{+0}^{4\to2}(1+\gamma_0)\psi$} + \begin{algorithmic}[1] + \STATE // load components of $\psi$ into registers + \STATE \_bgl\_load\_rs0((*s).s0); + \STATE \_bgl\_load\_rs1((*s).s1); + \STATE \_bgl\_load\_rs2((*s).s2); + \STATE \_bgl\_load\_rs3((*s).s3); + \STATE // prefetch gauge field for next direction $(1+\gamma_1)$ + \STATE \_prefetch\_su3(U+1); + \STATE // do now first $P_{+0}^{4\to2}(1+\gamma_0)\psi$ + \STATE \_bgl\_vector\_add\_rs2\_to\_rs0\_reg0(); + \STATE \_bgl\_vector\_add\_rs3\_to\_rs1\_reg1(); + \STATE //now multiply both components at once with gauge field $U$ and $\kappa$ + \STATE \_bgl\_su3\_multiply\_double((*U)); + \STATE \_bgl\_vector\_cmplx\_mul\_double(ka0); + \STATE // store the result + \STATE \_bgl\_store\_reg0\_up((*phi[ix]).s0); + \STATE \_bgl\_store\_reg1\_up((*phi[ix]).s1); + \end{algorithmic} + \label{alg:bluegene} +\end{algorithm} + +A small fragment of the half spinor version (see above) is given in +algorithm \ref{alg:bluegene}, which represents the operation +$\varphi^+ = \kappa\, U\, P_{+0}^{4\to2}(1+\gamma_0)\psi$. After +loading the components of $\psi$ into the special registers and +prefetching the gauge field for the next direction (in this case +$1+\gamma_1$), $P_{+0}^{4\to2}(1+\gamma_0)\psi$ is performed. It is +then important to load the gauge field $U$ only once from memory to +registers and multiply both spinor components in parallel. 
+ +Finally the result is multiplied with $\kappa$ (which also inherits a +phase factor due to the way we implement the boundary conditions, see +next sub-section) and stored in memory. + + +\subsubsection{Boundary Conditions} + +As discussed previously, we allow for arbitrary phase factors in the +boundary conditions of the fermion fields. This is conveniently +implemented in the Dirac operator as a phase factor in the hopping +term +\[ +\sum_\mu \Bigl[ + e^{i\theta_\mu \pi/L_\mu}\ U_{x,\mu}(r+\gamma_\mu) + \psi(x+a\hat\mu) + e^{-i\theta_\mu \pi/L_\mu}\ + U^\dagger_{x-a\hat\mu,\mu} + (r-\gamma_\mu)\psi(x-a\hat\mu)\Bigr]\, . +\] +The relevant input parameters are {\ttfamily ThetaT}, {\ttfamily + ThetaX}, {\ttfamily ThetaY}, {\ttfamily ThetaZ}. + + + +\subsection{The HMC Update} + +We assume in the following that the action to be simulated can be +written as +\[ +S = S_\mathrm{G} + \sum_{i=1}^{N_\mathrm{monomials}} S_{\mathrm{PF}_i}\, , +\] +and we call -- following the CHROMA notation~\cite{Edwards:2004sx} -- each +term in this sum a \emph{monomial}. We require that there is exactly one +gauge monomial $S_\mathrm{G}$ (which we identify with $S_0$ in the +following) and an arbitrary number of pseudo +fermion monomials $S_{\mathrm{PF}_i}$. + +As a data type every monomial must know how to compute its +contribution to the initial Hamiltonian $\mathcal{H}$ at the beginning +of each trajectory in the heat-bath step. Then it must know how to +compute the derivative with respect to the gauge fields for given +gauge field and pseudo fermion field needed for the MD update. And finally +there must be a function to compute its contribution to the final +Hamiltonian $\mathcal{H}'$ as used in the acceptance step. 
+ +\begin{figure}[t] + \centering + \includegraphics[width=0.7\linewidth]{monomial.eps} + \caption{Data type monomial and its components} + \label{fig:monomial} +\end{figure} + +In addition for each monomial it needs to be known on which timescale +it should be integrated. The corresponding data type is sketched in +figure~\ref{fig:monomial}. The general definitions for this data type +can be found in the file {\ttfamily monomial.c}. + +There are several sorts of monomials implemented: +\begin{itemize} +\item {\ttfamily DET}: pseudo fermion representation of the (mass + degenerate) simple determinant\\ + \[ + \det(Q^2(\kappa) + \mu^2) + \] +\item {\ttfamily DETRATIO}: pseudo fermion representation of the + determinant ratio\\ + \[ + \det(Q^2(\kappa) + \mu^2)/\det(Q^2(\kappa_2) + \mu_2^2) + \] +\item {\ttfamily NDPOLY}: polynomial representation of the (possibly + non-degenerate) doublet\\ + \[ + [\det(Q_{nd}(\bar\epsilon, \bar\mu)^2)]^{1/2}\, . + \] +\item {\ttfamily GAUGE}:\\ + \[ + \frac{\beta}{3}\sum_x\left( c_0\sum_{\substack{ + \mu,\nu=1\\1\leq\mu<\nu}}^4\{1-\re\tr(U^{1\times1}_{x,\mu,\nu})\}\Bigr. + \Bigl.\ +\ + c_1\sum_{\substack{\mu,\nu=1\\\mu\neq\nu}}^4\{1 + -\re\tr(U^{1\times2}_{x,\mu,\nu})\}\right)\, , + \] + The parameter $c_1$ can be set in the input file and + $c_0=1-8c_1$. Note that $c_1=0$ corresponds to the Wilson plaquette + gauge action. +\end{itemize} +The corresponding specific functions are defined in the files +{\ttfamily det\_monomial.c}, {\ttfamily detratio\_monomial.c}, +{\ttfamily ndpoly\_monomial.c} and {\ttfamily + gauge\_monomial.c}. Additional monomials can easily be implemented +by providing the corresponding functions as discussed above. 
+ +\begin{algorithm}[t] + \caption{integrate} + \begin{algorithmic}[1] + \REQUIRE $0 < n_\mathrm{ts}\leq N_\mathrm{ts}$, $\tau > 0$ + \STATE $\dtau = \tau/$noSteps[$n_\mathrm{ts}$] + \FOR{$i$ = 0 to noSteps[$n_\mathrm{ts}$]} + \IF{$n_\mathrm{ts}$ == $1$} + \STATE updateGauge($\dtau$) + \ELSE + \STATE integrate($n_\mathrm{ts}-1$, $\dtau$) + \ENDIF + \STATE updateMomenta($\dtau$, monomialList[$n_\mathrm{ts}$]) + \ENDFOR + \end{algorithmic} + \label{alg:integrator} +\end{algorithm} + +The integration scheme is implemented recursively, as exemplified in +algorithm~\ref{alg:integrator} for the leap-frog integration scheme +(where we skipped half steps for simplicity). The updateMomenta +function simply calls the derivative functions of all monomials +that are integrated on timescale $n_\mathrm{ts}$ and updates the +momenta $P$ according to the time step $\dtau$. + +The recursive scheme for the integration can easily be extended to +more involved integration schemes. The details can be found in the +file {\ttfamily integrator.c}. We have implemented the leap-frog and +the second order minimal norm~\cite{Takaishi:2005tz} integration +schemes. They are named in the input file as {\ttfamily LEAPFROG} and +{\ttfamily 2MN}, respectively. These two can be mixed on +different timescales. In addition we have implemented a position +version of the second order minimal norm integration scheme, denoted by +{\ttfamily 2MNPOSITION} in the input file. The latter must not be mixed with +the former two. + +The MD update is summarised in +algorithm~\ref{alg:mdupdate}. It computes the initial and final +Hamiltonians and calls in between the integration function with the +total number of timescales $N_\mathrm{ts}$ and the total trajectory +length $\tau$. 
+ +\subsubsection{Reduced Precision in the MD Update} + +As briefly discussed previously, as long as the integration in the MD +update is reversible and area preserving there is large freedom in +choosing the integration scheme, but also the operator: it is not +necessary to use the Dirac operator here, it can be any approximation +to it. This is only useful if the acceptance rate is not strongly +affected by such an approximation. + +The code provides two possibilities to adapt the precision of the +Dirac operator used in the MD update: the first is to reduce the +precision in the inversions needed for the force computation. This +causes reduced iteration numbers needed for the integration of one +trajectory. The relevant input parameter is {\ttfamily + ForcePrecision} available for each monomial. The precision needed in +the acceptance and/or heatbath step can be adjusted separately using +{\ttfamily AcceptancePrecision}. It is advisable to have the +acceptance precision always close to machine precision. + + +\begin{algorithm}[t] + \caption{MD update} + \begin{algorithmic}[1] + \STATE $\mathcal{H}=\mathcal{H}'=0$ + \FOR{$i$ = 0 to $N_\mathrm{monomials}$} + \STATE $\mathcal{H}$ += monomial[$i$]$\rightarrow$heat-bath-function + \ENDFOR + + \STATE integrate($N_\mathrm{ts}$, $\tau$) + + \FOR{$i$ = 0 to $N_\mathrm{monomials}$} + \STATE $\mathcal{H}'$ += monomial[$i$]$\rightarrow$acceptance-function + \ENDFOR + \STATE accept with probability $\min\{1, \exp(-\Delta\mathcal{H})\}$ + \end{algorithmic} + \label{alg:mdupdate} +\end{algorithm} + + +The second possibility for influencing the Dirac operator is given by +the reduced precision Dirac operator described in +sub-section~\ref{sec:dirac}, which is switched on with the {\ttfamily + UseSloppyPrecision} input parameter. The two possibilities can also +be used in parallel. + +Note that one should always test for reversibility violations as +explained in sub-section \ref{sec:online}. 
+ +\subsubsection{Chronological Solver} + +The idea of the chronological solver method (or similar methods +\cite{Brower:1994er}) is to optimize the initial guess for +the solution used in the solver. To this end the history of +$N_\mathrm{CSG}$ last solutions of the equation $M^2 \chi = \phi$ is +saved and then a linear combination of the fields $\chi_i$ with +coefficients $c_i$ is used as an initial guess for the next +inversion. $M$ stands for the operator to be inverted and has to be +replaced by the different ratios of operators used in this paper. + +The coefficients $c_i$ are determined by solving +\begin{equation} + \label{eq:chrono} + \sum_i \chi_j^\dagger M^2 \chi_i c_i = \chi_j^\dagger \phi +\end{equation} +with respect to the coefficients $c_i$. This is equivalent to +minimising the functional that is minimised by the CG inverter +itself. + +The downside of this method is that the reversibility violations +increase significantly by one or two orders of magnitude in the +Hamiltonian when the CSG is switched on and all other parameters are +kept fixed. Therefore one has to adjust the residues in the solvers, +which increases the number of matrix vector multiplications again. +Our experience is that the methods described in the previous +sub-section are more effective in particular in the context of +multiple time scale integration, because the CSG is most effective for +small values of $\dtau$. + +The corresponding input parameter is {\ttfamily CSGHistory}, +available for the relevant monomials. Setting it to zero means no +chronological solver, otherwise this parameter specifies the number of +last solutions $N_\mathrm{CSG}$ to be saved. + +\subsection{Online Measurements} +\label{sec:online} + +The HMC program includes the possibility to perform a certain number +of measurements after every trajectory \emph{online}, whether or not +the configuration is stored on disk. 
Some of those are performed per +default, namely all that are written to the output file {\ttfamily + output.data}: +\begin{enumerate} +\item the plaquette expectation value, defined as: + \[ + \langle P\rangle = \frac{1}{6 V}\ \sum_{\substack{ + \mu,\nu=1\ 1\leq\mu<\nu}}^4\ \re\tr(U^{1\times1}_{x,\mu,\nu})\, , + \] + where $V$ is the global lattice volume. +\item the rectangle expectation value, defined as: + \[ + \langle R\rangle = \frac{1}{12V}\ \sum_{\substack{\mu,\nu=1\ + \mu\neq\nu}}^4\ + \re\tr(U^{1\times2}_{x,\mu,\nu}) + \] +\item $\Delta\mathcal{H} = \mathcal{H}'-\mathcal{H}$ and $\exp(-\Delta\mathcal{H})$. +\end{enumerate} +See the overview section for details about the {\ttfamily output.data} +file. These observables all come with no extra computational cost. + +Optionally, other online measurements can be performed, which -- +however -- need in general extra inversions of the Dirac +operator. First of all the computation of certain correlation +functions is implemented. They need \emph{one} extra inversion of the +Dirac operator, as discussed in Ref.~\cite{Boucaud:2008xu}, using the +one-end-trick. Define a stochastic source $\xi$ as follows +\begin{equation} + \label{eq:source} + \lim_{R\to\infty}[\xi_i^*\xi_j] = \delta_{ij},\quad + \lim_{R\to\infty}[\xi_i\xi_j] = 0\, . +\end{equation} +Here $R$ labels the number of samples and $i$ all other degrees of +freedom. Then +\begin{equation} + \label{oneend} + [\phi_i^{r*}\phi_j^r]_R = M_{ik}^{-1*}\cdot M_{jk}^{-1} + + \textrm{noise}\, , +\end{equation} +if $\phi$ was computed from +\[ +\phi_j^r = M^{-1}_{jk}\xi_k^r\, . +\] +Having in mind the $\gamma_5$-hermiticity property of the Wilson and +Wilson twisted mass Dirac propagator $G_{u,d}$, i.e. +\[ +G_u(x,y) = \gamma_5 G_d(y,x)^\dagger \gamma_5 +\] +it is clear that eq.~(\ref{oneend}) can be used to evaluate +\[ +C_\pi(t) = \langle \tr[G_u(0,t)\gamma_5 G_d(t,0)\gamma_5]\rangle = +\langle \tr[G_u(0,t) G_u(0,t)^\dagger]\rangle +\] +with only one inversion. 
But, even if the one gamma structure at the +source is fixed to be $\gamma_5$ due to the $\gamma_5$-hermiticity +trick, we are still free to insert any $\gamma$-structure $\Gamma$ at the source, +i.e. we can evaluate any correlation function of the form +\[ +C_{P\Gamma}(t) = \langle\tr[G_u(0,t) \gamma_5 G_d(t,0) \Gamma]\rangle += \langle \tr[G_u(0,t) G_u(0,t)^\dagger\gamma_5\Gamma]\rangle\, . +\] +Useful combinations of correlation functions are $\langle P P\rangle$, +$\langle PA\rangle$ and $\langle PV\rangle$, with +\[ + P^\alpha = \bar\chi \gamma_5 \frac{\tau^\alpha}{2}\chi\, ,\quad + V^\alpha_\mu = \bar\chi \gamma_\mu\frac{\tau^\alpha}{2}\chi\, ,\quad + A^\alpha_\mu = \bar\chi \gamma_5\gamma_\mu\frac{\tau^\alpha}{2}\chi +\] +From $\langle P P\rangle$ one can extract the pseudo scalar mass, and +-- in the twisted mass case -- the pseudo scalar decay +constant. $\langle PA\rangle$ can be used together with $\langle P +P\rangle$ to extract the so called PCAC quark mass and $\langle +PV\rangle$ to measure the renormalisation constant $Z_\mathrm{V}$. For +details we refer the reader to Ref.~\cite{Boucaud:2008xu}. + +These online measurements are controlled with the two following input +parameters: {\ttfamily PerformOnlineMeasurements} to switch them on or +off and to specify the frequency {\ttfamily OnlineMeasurementsFreq}. The three +correlation functions are saved in files named {\ttfamily + onlinemeas.n}, where {\ttfamily n} is the trajectory number. Every +file contains five columns, specifying the type, the operator type and the +Euclidean time $t$. The last two columns are the values of the +correlation function itself, $C(t)$ and $C(-t)$, respectively. The +type is equal to $1$, $2$ or $6$ for the $\langle P P\rangle$, the +$\langle PA\rangle$ and the $\langle PV\rangle$ correlation +functions. The operator type is for online measurements always equal +to $1$ for local source and sink (no smearing of any kind), and the +time runs from $0$ to $T/2$. 
Hence, $C(-t)= C(T-t)$. $C(-0)$ and +$C(-T/2)$ are set to zero for convenience. + +In addition to correlation functions also the minimal and the maximal +eigenvalues of the $(\gamma_5 D)^2$ can be measured. + +An online measurement not related to physics, but related to the +algorithm are checks of reversibility violations. The HMC algorithm is +exact, if +and only if the integration scheme is reversible. On a computer with +finite precision this is only guaranteed up to machine precision. +These violations can be estimated by integrating one trajectory +forward and then backward in Monte Carlo time. The difference +$\delta\Delta\mathcal{H}$ among +the original Hamiltonian $\mathcal{H}$ and the final one +$\mathcal{H}''$ after integrating back can serve as one measure for +those violations, another one is provided by the difference among the +original gauge field $U$ and the final one $U''$ +\[ +\delta\Delta U = \frac{1}{12V} +\sum_{x,\mu}\sum_{i,j} (U_{x,\mu}-U_{x,\mu}'')_{i,j}^2 +\] +where we indicate with the $\delta\Delta$ that this is obtained after +integrating a trajectory forward and backward in time. The results for +$\delta\Delta \mathcal{H}$ and $\delta\Delta U$ are +stored in the file {\ttfamily return\_check.data}. The relevant input +parameters are {\ttfamily ReversibilityCheck} and {\ttfamily + ReversibilityCheckInterval}. + +\subsection{Iterative Solver and Eigensolver} + +There are several iterative solvers implemented in the tmLQCD +package for solving +\[ +D\ \chi = \phi +\] +for $\chi$. The minimal residual (MR), the conjugate gradient (CG), the +conjugate gradient squared (CGS), the generalised minimal residual +(GMRES), the generalised conjugate residual and the stabilised +bi-conjugate gradient (BiCGstab). For details regarding these +algorithms we refer to Refs.~\cite{saad:2003a,meister:1999}. 
+ +For the {\ttfamily hmc\_tm} executable only the CG and the BiCGstab +solvers are available, while all the others can be used in the +{\ttfamily invert} executables. Most of them are both available with +and without even/odd preconditioning. For a performance comparison we +refer to Ref.~\cite{Chiarappa:2004ry,Chiarappa:2006hz}. + +The stopping criterion is implemented in two ways: the first is an +absolute stopping criterion, i.e. the solver is stopped when the +squared norm of the residual vector (depending on the solver this +might be the iterated residual or the real residual) fulfills +\[ +\|r\|^2 < \epsilon^2\, . +\] +The second is relative to the source vector, i.e. +\[ +\frac{\|r\|^2}{\|\phi\|^2} < \epsilon^2\, . +\] +The value of $\epsilon^2$ and the choice of relative or absolute precision can be +influenced via input parameters. + +The reduced precision Dirac operator, as discussed in sub-section +\ref{sec:dirac}, is available for the CG solver. In the CG solver the +full precision Dirac operator is only required at the beginning of the +CG search, because the relative size of the contribution to the +resulting vector decreases with the number of iterations. Thus, as soon +as a certain precision is achieved in the CG algorithm we can switch to +the reduced precision Dirac operator without spoiling the precision of +the final result. We switch to the lower precision operator +at a precision of $\sqrt{\epsilon}$ in the CG search, when aiming for a +final precision of $\epsilon < 1$. + +The eigensolver used to compute the eigenvalues (and vectors) of +$(\gamma_5 D)^2$ is the so called Jacobi-Davidson +method~\cite{Sleijpen:1996aa,Geus:2002}. For a discussion for the +application of this algorithm to lattice QCD we refer again to +Ref.~\cite{Chiarappa:2004ry,Chiarappa:2006hz}. + +All solver related files can be found in the sub-directory {\ttfamily + solver}. Note that there are a few more solvers implemented which +are, however, in an experimental status. 
+ +\subsection{Stout Smearing} + +Smearing techniques have become an important tool to reduce +ultraviolet fluctuations in the gauge fields. One of those techniques, +coming with the advantage of being usable in the MD update, is usually +called stout smearing~\cite{Morningstar:2003gk}. + +The $(n+1)^{\rm th}$ level of stout smeared gauge links is obtained iteratively +from the $n^{\rm th}$ level by +\begin{equation*} + U_\mu^{(n+1)}(x)\;=\;e^{i\,Q_\mu^{(n)}(x)}\,U_\mu^{(n)}(x). +\end{equation*} +We refer to the unsmeared (``thin'') gauge field as $U_\mu\equiv +U_\mu^{(0)}$. +The ${\rm SU}(3)$ matrices $Q_\mu$ are defined via the staples $C_\mu$: +\begin{eqnarray} + Q_\mu^{(n)}(x) &=& \frac{i}2\Big[U^{(n)}_\mu(x){C_\mu^{(n)}}^\dagger(x) + - {\mathrm{h.c.}}\Big]\,-\,\frac{i}{6}\tr\Big[U^{(n)}_\mu(x){C_\mu^{(n)}}^\dagger(x) + - {\mathrm{h.c.}}\Big]\,,\nonumber\\ + C_\mu^{(n)} &=& \sum_{\nu\neq\mu}\,\rho_{\mu\nu}\, + \Big(U_\nu^{(n)}(x)U_\mu^{(n)}(x+\hat\nu){U_\nu^{(n)}}^\dagger(x+\hat\mu) + \nonumber\\ + && \;\;\; + +{U_\nu^{(n)}}^\dagger(x-\hat\nu)U_\mu^{(n)}(x-\hat\nu)U_\nu^{(n)}(x-\hat\nu+\hat\mu) + \Big)\,,\nonumber +\end{eqnarray} +where in general $\rho_{\mu\nu}$ is the smearing matrix. +In the tmLQCD package we have only implemented isotropic $4$-dimensional +smearing, i.e., $\rho_{\mu\nu}=\rho$. + +Currently stout smearing is only implemented for the {\ttfamily + invert} executables. I.e. the gauge field can be stout smeared at +the beginning of an inversion. The input parameters are {\ttfamily + UseStoutSmearing}, {\ttfamily StoutRho} and {\ttfamily + StoutNoIterations}. + +\subsection{Random Number Generator} + +The random number generator used in the code is the one proposed by +Martin L{\"u}scher and usually known under the name +RANLUX~\cite{Luscher:1993dy}. A single and double precision +implementation was made available by the author under the GNU General +Public License and can be downloaded~\cite{Luscher:ranluxweb}. 
For +convenience it is also included in the tmLQCD package. + + +\endinput + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/configure.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/configure.tex new file mode 100644 index 0000000000000000000000000000000000000000..470139d74f6663aedce87abad96f8109321ecc01 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/configure.tex @@ -0,0 +1,131 @@ +The software ships with a GNU autoconf environment and a configure +script, which will generate GNU Makefiles to build the programmes. It +is supported and recommended to configure and build the executables in +a separate directory. This also allows one to have several builds with +different options from the same source code directory. + +\subsection{Prerequisites} + +In order to compile the programmes you need the {\ttfamily + LAPACK}~\cite{lapack:web} library (fortran version) installed. You also +need to find out which linker options are needed to link against +{\ttfamily LAPACK}, e.g. {\ttfamily -Lpath-to-lapack -llapack + -lblas}. In addition you need to download and compile the latest +version (tested is version 1.2.3) of {\ttfamily + C-LIME}~\cite{lime:web}, which is used as a packaging scheme to +read and write gauge configurations and propagators to files. + +If you want to use a C-compiler that is not the standard one on the +architecture you are working on, you also need to know the +C-compiler. On some platforms you may also need to know the +corresponding fortran 77 compiler, in particular in cases where you +cross-compile. + +\subsection{Configuring the hmc package} +\label{sec:config} + +In order to get a simple configuration of the hmc package it is enough +to just type +\begin{verbatim} +path-to-src-code/configure --with-lime= \ + --with-lapack= CC= \ + F77= CFLAGS= +\end{verbatim} +in your build directory. 
If +you do not specify {\ttfamily CC, F77} and {\ttfamily CFLAGS}, +{\ttfamily configure} will guess them. + +The code was successfully compiled and run at least on the following +platforms: i686 and compatible, x64 and compatible, IBM Regatta +systems, IBM Blue Gene/L, IBM Blue Gene/P, SGI Altix and SGI PC +clusters, powerpc clusters. + +The configure script accepts certain options to influence the building +procedure. One can get an overview over all supported options with +{\ttfamily configure --help}. There are {\ttfamily enable|disable} +options switching on and off optional features and {\ttfamily + with|without} switches usually related to optional packages. In the +following we describe the most important of them (check {\ttfamily + configure --help} for the defaults and more options): + +\begin{itemize} +\item {\ttfamily --enable-mpi}:\\ + This option switches on the support for MPI. On certain platforms it + automatically chooses the correct parallel compiler or searches for + a command {\ttfamily mpicc} in the searchpath. + +\item {\ttfamily --enable-p4}:\\ + Enable the use of special pentium4 instruction set and cache + management. + +\item {\ttfamily --enable-opteron}:\\ + Enable the use of special opteron instruction set and cache + management. + +%\item {\ttfamily --enable-sse}:\\ +% Enable the use of SSE instruction set. This means not much when 64 +% Bit precision is used. + +\item {\ttfamily --enable-sse2}:\\ + Enable the use of SSE2 instruction set. This is a huge improvement + on pentium4 and equivalent systems. + +\item {\ttfamily --enable-sse3}:\\ + Enable the use of SSE3 instruction set. This will give another 20\% + of speedup when compared to only SSE2. However, only a few + processors are capable of SSE3 so far. + +\item {\ttfamily --enable-gaugecopy}:\\ + See section \ref{sec:dirac} for details on this option. It will + increase the memory requirement of the code. 
+ +\item {\ttfamily --enable-halfspinor}:\\ + If this option is enabled the Dirac operator using half spinor + fields is used. See sub-section \ref{sec:dirac} for details. If this + feature is switched on, also the gauge copy feature is switched + on automatically. + +%\item {\ttfamily --enable-shmem}:\\ +% Use shared memory API instead of MPI for the communication of spinor +% fields. This is currently only usable on the Munich Altix machine. + +\item {\ttfamily --with-mpidimension=n}:\\ + This option has only effect if the preceding one is switched + on. The number of parallel directions can be specified. 1,2,3 and 4 + dimensional parallelisation is supported. + +\item {\ttfamily --with-lapack=""}:\\ + The code requires lapack to be linked. All linker flags necessary + to do so must be specified here. Note, that {\ttfamily LIBS="..."} + works similarly. + +\item {\ttfamily --with-limedir=}:\\ + Tells configure where to find the lime package, which is required for + the build of the HMC. It is used for the ILDG file format. + +\end{itemize} + +The configure script will guess at the very beginning on which +platform the build is done. In case this fails or a cross compilation +must be performed please use the option {\ttfamily --host=HOST}. For +instance in order to compile for the BG/P you have to specify +{\ttfamily --host=ppc-ibm-bprts --build=ppc64-ibm-linux}. + +For certain architectures like the Blue Gene systems there are +{\ttfamily README.arch} files in the top source directory with +example configure calls. + +\subsection{Building and Installing} + +After successfully configuring the package you can build the code by +simply typing {\ttfamily make} in the build directory. This will +compile the standard executables. Typing {\ttfamily make install} will +copy these executables into the install directory. The default install +directory is {\ttfamily \$HOME/bin}, which can be influenced e.g. with +the {\ttfamily --prefix} option to {\ttfamily configure}. 
+ + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/deflation.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/deflation.tex new file mode 100644 index 0000000000000000000000000000000000000000..12ba1a2aa869b3533a49edfd8c868f52aa92dc1e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/deflation.tex @@ -0,0 +1,129 @@ +\subsection{Implementing Deflation} + +We are aiming to solve +\[ +D\psi = \eta +\] +using L{\"u}scher's deflation method. Note that $D$ is the +non-hermitian (twisted) Wilson Dirac operator. + +Let's assume we have divided the whole lattice completely into blocks +$\Lambda(\vec b)$ on a four dimensional grid. Every block has a grid +coordinate $\vec b$ and we have a total number of $N_b$ blocks. (This +is actually what we have in the MPI environment) Let's +assume we have found $N$ approximate (global) eigenvectors $\psi_l, +l=1,...,N$ and we have restricted them to the blocks via +\[ +\phi_l^{\vec b}(x) = +\begin{cases} + \psi_l(x) & \textrm{if}\ x \in \Lambda(\vec b)\, ,\\ + 0 & \textrm{otherwise}\,. +\end{cases} +\] +obtaining in total $N_b\cdot N$ fields. And we have already +orthonormalised them using Gram-Schmidt or whatever. + +\subsubsection*{Construction of the little Dirac operator} + +The \emph{little Dirac operator} $A$ is then computed from +\[ +A_{(\vec a,k)(\vec b, l)} = \langle\phi_k^{\vec a}| D\phi_l^{\vec + b}\rangle +\] +$A_{(\vec a,k)(\vec b, l)}$ is non-zero only for $\vec a = \vec b$ or +$\vec b = \vec a \pm \vec \mu, \mu=1,...,4$, where $\vec\mu$ is a unit +vector in block space, because $D$ involves only next neighbour +interaction. + +All elements with $\vec a = \vec b$ can be computed by applying the +local Dirac operator $D^{\vec a}$ (i.e. 
all exterior boundaries set to +zero, because $\phi_l^{\vec a}$ has support only on block +$\Lambda(\vec a)$) +\[ +\varphi_l^{\vec a} = D \phi_l^{\vec a} = D^{\vec a} \phi_l^{\vec a},\quad +l=1,...,N +\] +and computing the scalar products $(\phi_l^{\vec a},\varphi_k^{\vec a})$ +for all combinations of $l,k$ then. For the terms with $\vec a +\neq \vec b$ we have to be more careful. Probably it is best done by +looping over all directions $\pm\mu$ and computing +\[ +\langle \phi_l^{\vec a}|\ \varphi_k^{\vec a+\vec\mu}\rangle_{\partial_{\mu}\Lambda(\vec a)} +\] +where $\partial_{\mu}\Lambda(\vec a)$ denotes the inner boundary in +$\mu$-direction of block $\Lambda(\vec a)$, where $\varphi_k^{\vec + a+\vec\mu}$ is non-zero. + +The action of the little Dirac operator on a \emph{little + quark field} $w$ (complex field of length $N=N_S\cdot N_b$) on block +$\Lambda(\vec a)$ reads: +\[ +\begin{split} + v_k^{\vec a} &= A_{(\vec a,k)(\vec b, l)} w_l^{\vec b}\\ + &= \sum_{l=1}^N A_{(\vec a,k)(\vec a, l)} w_l^{\vec a} + + \sum_{\mu} \sum_{l=1}^N [A_{(\vec a,k)(\vec a+\vec\mu, l)} w_l^{\vec + a+\vec\mu} + A_{(\vec a,k)(\vec a-\vec\mu, l)} w_l^{\vec a - \vec\mu}]\\ +\end{split} +\] +or in matrix notation +\[ +v^{\vec a} = A^{\vec a,\vec a} w^{\vec a} + \sum_\mu [A^{\vec a,\vec + a+\vec\mu} w^{\vec a+\vec\mu} + A^{\vec a,\vec a-\vec\mu} w^{\vec a-\vec\mu}] +\] +This involves again only next neighbour (block) interaction. +Every block matrix $A^{\vec a,\vec b}$ is a $N\times N$ complex +matrix. + +\subsubsection{global mode deflation} + +We want to use the global fields $\psi_l$ to deflate the little Dirac +operator $A$. This requires to find vectors $u_l$, which fulfill +\begin{equation} + \label{eq:llD} + B_{kl} = \langle\psi_k|D \psi_l\rangle = \langle u_k|A u_l\rangle +\end{equation} +for $k,l = 1,...,N_s$. 
We recall that the fields $\phi_i$, $i= +1,...,N_b\cdot N_s$ were obtained by restricting the fields $\psi_l$ to +the blocks $\vec a$ followed by an orthonormalisation process. So the +complex vectors $u_l^{\vec a}$ of length $N=N_s\cdot N_b$ on block +$\vec a$ are computed from +\begin{equation} + \label{eq:chis} + (u^{\vec a}_l)_i = \langle\phi_i|\psi_l^{\vec + a}\rangle,\qquad\forall \phi_i\ \textrm{has support on block}\ + \Lambda(\vec a) +\end{equation} +where the notation $(u^{\vec a})_i$ means the $i$-th component of +vector $u$ on block $\vec a$. The little little Dirac operator $B$ is +then given by +\[ +\begin{split} + B_{kl} &= \langle u_k|A u_l\rangle = + \langle(u_k)_i|\langle\phi_i|D \phi_j\rangle (u_l)_j\rangle\\ + &= \langle\psi_k|\phi_i\rangle\langle\phi_i|D + \phi_j\rangle\langle\phi_j |\psi_l\rangle = \langle\psi_k|D \psi_l\rangle \\ +\end{split} +\] +where summing over equal indices is understood. The little Dirac +operator is then deflated using the little oblique projectors $p_L$ +and $p_R$: +\[ +\begin{split} + p_L v &= v - \sum_{k,l}^{N_s} + A u_k(B^{-1})_{kl}\langle u_l|v\rangle\\ + p_R v &= v - \sum_{k,l}^{N_s} + u_k(B^{-1})_{kl}\langle u_l|A v\rangle\\ +\end{split} +\] +and the same algebra as before. + +\subsubsection*{What needs to be done} + +\begin{itemize} +\item implement a suitable preconditioner +\item ... 
+\end{itemize} +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/dsfont.sty b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/dsfont.sty new file mode 100644 index 0000000000000000000000000000000000000000..8be042232cba231647ae1d1fcffc94c198dfdcd9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/dsfont.sty @@ -0,0 +1,9 @@ +\ProvidesPackage{dsfont} + [1995/08/01 v0.1 Double stroke roman fonts] + +\def\ds@whichfont{dsrom} +\DeclareOption{sans}{\def\ds@whichfont{dsss}} +\ProcessOptions\relax + +\DeclareMathAlphabet{\mathds}{U}{\ds@whichfont}{m}{n} +\endinput diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/eo_pre.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/eo_pre.tex new file mode 100644 index 0000000000000000000000000000000000000000..7b8b9c2d90cfcf7c875e2b195c491d3ebe0ffcc1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/eo_pre.tex @@ -0,0 +1,779 @@ +\label{sec:eo} + +\subsection{HMC Update} + +In this section we describe how even/odd +\cite{DeGrand:1990dk,Jansen:1997yt} preconditioning can be used in the +HMC algorithm in +presence of a twisted mass term. Even/odd preconditioning is +implemented in the tmLQCD package in the HMC algorithm as well as in +the inversion of the Dirac operator, and can be used optionally. + +We start with the lattice fermion action in the hopping parameter +representation in the $\chi$-basis written as +\begin{equation} + \label{eq:eo0} + \begin{split} + S[\chi,\bar\chi,U] = \sum_x & \Biggl\{ \bar\chi(x)[1+2i \kappa\mu\gamma_5\tau^3]\chi(x) \Bigr. \\ + & -\kappa\bar\chi(x)\sum_{\mu = 1}^4\Bigl[ U(x,\mu)(r+\gamma_\mu)\chi(x+a\hat\mu)\bigr. \\ + & +\Bigl. \bigl. U^\dagger(x-a\hat\mu,\mu)(r-\gamma_\mu)\chi(x-a\hat\mu)\Bigr] + \Biggr\} \\ + \equiv &\sum_{x,y}\bar\chi(x) M_{xy}\chi(y)\ . + \end{split} +\end{equation} +For convenience we define +$\tilde\mu=2\kappa\mu$. 
Using the matrix $M$ one can define the +hermitian (two flavor) operator: +\begin{equation} + \label{eq:eo1} + Q\equiv \gamma_5 M = \begin{pmatrix} + \Qp & \\\ + & \Qm \\ + \end{pmatrix} +\end{equation} +where the sub-matrices $\Qpm$ can be factorised as follows (Schur +decomposition): +\begin{equation} + \label{eq:eo2} + \begin{split} + Q^\pm &= \gamma_5\begin{pmatrix} + 1\pm i\tilde\mu\gamma_5 & M_{eo} \\ + M_{oe} & 1\pm i\tilde\mu\gamma_5 \\ + \end{pmatrix} = + \gamma_5\begin{pmatrix} + M_{ee}^\pm & M_{eo} \\ + M_{oe} & M_{oo}^\pm \\ + \end{pmatrix} \\ + & = + \begin{pmatrix} + \gamma_5M_{ee}^\pm & 0 \\ + \gamma_5M_{oe} & 1 \\ + \end{pmatrix} + \begin{pmatrix} + 1 & (M_{ee}^\pm)^{-1}M_{eo}\\ + 0 & \gamma_5(M_{oo}^\pm-M_{oe}(M_{ee}^\pm)^{-1}M_{eo})\\ + \end{pmatrix}\, . +\end{split} +\end{equation} +Note that $(M_{ee}^\pm)^{-1}$ can be +computed to be +\begin{equation} + \label{eq:eo3} + (1\pm i\tilde\mu\gamma_5)^{-1} = \frac{1\mp i\tilde\mu\gamma_5}{1+\tilde\mu^2}. +\end{equation} +Using $\det(Q)=\det(\Qp)\det(\Qm)$ the following relation can be derived +\begin{equation} + \label{eq:eo4} + \begin{split} + \det(\Qpm) &\propto \det(\hQpm) \\ + \hQpm &= \gamma_5(M_{oo}^\pm - M_{oe}(M_{ee}^\pm )^{-1}M_{eo})\, , + \end{split} +\end{equation} +where $\hQpm$ is only defined on the odd sites of the lattice. In the +HMC algorithm the determinant is stochastically estimated using pseudo +fermion field $\phi_o$: Now we write the determinant with pseudo +fermion fields: +\begin{equation} + \begin{split} + \det(\hQp \hQm) &= \int \mathcal{D}\phi_o\,\mathcal{D}\phi^\dagger_o\ + \exp(-S_\mathrm{PF})\\ + S_\mathrm{PF} &\equiv\ \phi_o^\dagger\ \left(\hQp\hQm\right)^{-1}\phi_o\, , + \end{split} +\end{equation} +where the fields $\phi_o$ are defined only on the odd sites of the +lattice. 
In order to compute the force corresponding to the effective +action $S_\mathrm{PF}$ we need the variation of $S_\mathrm{PF}$ with respect to the gauge fields +(using $\delta (A^{-1})=-A^{-1}\delta A A^{-1}$): +\begin{equation} + \label{eq:eo5} + \begin{split} + \delta S_\mathrm{PF} &= -[\phi_o^\dagger (\hQp \hQm)^{-1}\delta \hQp(\hQp)^{-1}\phi_o + + \phi_o^\dagger(\hQm)^{-1}\delta \hQm (\hQp \hQm)^{-1} \phi_o ] \\ + &= -[X_o^\dagger \delta \hQp Y_o + Y_o^\dagger \delta\hQm X_o] + \end{split} +\end{equation} +with $X_o$ and $Y_o$ defined on the odd sites as +\begin{equation} + \label{eq:eo6} + X_o = (\hQp \hQm)^{-1} \phi_o,\quad Y_o = (\hQp)^{-1}\phi_o=\hat + \Qm X_o\ , +\end{equation} +where $(\hQpm)^\dagger = \hat Q^\mp$ has been used. The variation of +$\hQpm$ reads +\begin{equation} + \label{eq:eo7} + \delta \hQpm = \gamma_5\left(-\delta M_{oe}(M_{ee}^\pm )^{-1}M_{eo} - + M_{oe}(M_{ee}^\pm )^{-1}\delta M_{eo}\right), +\end{equation} +and one finds +\begin{equation} + \label{eq:eo8} + \begin{split} + \delta S_\mathrm{PF} &= -(X^\dagger\delta \Qp Y + Y^\dagger\delta \Qm X) \\ + &= -(X^\dagger\delta \Qp Y +(X^\dagger\delta \Qp Y)^\dagger) + \end{split} +\end{equation} +where $X$ and $Y$ are now defined over the full lattice as +\begin{equation} + \label{eq:eo9} + X = + \begin{pmatrix} + -(M_{ee}^-)^{-1}M_{eo}X_o \\ X_o\\ + \end{pmatrix},\quad + Y = + \begin{pmatrix} + -(M_{ee}^+)^{-1}M_{eo}Y_o \\ Y_o\\ + \end{pmatrix}. +\end{equation} +In addition $\delta\Qp = \delta\Qm= \delta Q, M_{eo}^\dagger = \gamma_5 M_{oe}\gamma_5$ and +$M_{oe}^\dagger = \gamma_5 M_{eo}\gamma_5$ has been used. Since the bosonic part +is quadratic in the $\phi_o$ fields, the $\phi_o$ are generated at the +beginning of each molecular dynamics trajectory with +\begin{equation} + \label{eq:eo10} + \phi_o = \hQp R, +\end{equation} +where $R$ is a random spinor field taken from a Gaussian distribution +with norm one. 
+ +\subsubsection{Symmetric even/odd Preconditioning} + +One may write instead of eq. (\ref{eq:eo2}) the following symmetrical +factorisation of $\Qpm$: +\begin{equation} + \label{eq:sym1} + \Qpm = + \gamma_5\begin{pmatrix} + M_{ee}^\pm & 0 \\ + M_{oe} & M_{oo}^\pm \\ + \end{pmatrix} + \begin{pmatrix} + 1 & (M_{ee}^\pm)^{-1}M_{eo}\\ + 0 & (1-(M_{oo}^\pm)^{-1} M_{oe} (M_{ee}^\pm)^{-1} M_{eo})\\ + \end{pmatrix}\, . +\end{equation} +Where we can now re-define +\begin{equation} + \label{eq:sym2} + \hat Q_\pm = \gamma_5(1-(M_{oo}^\pm)^{-1} M_{oe} (M_{ee}^\pm)^{-1} + M_{eo}) +\end{equation} +With this re-definition the procedure is analogous to what we +discussed previously. Only the vectors $X$ and $Y$ need to be modified +to +\begin{equation} + \begin{split} + \label{eq:sym9} + X &= + \begin{pmatrix} + -(M_{ee}^-)^{-1}M_{eo}(M_{oo}^-)^{-1}X_o \\ X_o\\ + \end{pmatrix},\\ + Y &= + \begin{pmatrix} + -(M_{ee}^+)^{-1}M_{eo}(M_{oo}^+)^{-1}Y_o \\ Y_o\\ + \end{pmatrix}.\\ + \end{split} +\end{equation} +Note that the variation of the action is still given by +\begin{equation} + \label{eq:sym3} + \delta S_\mathrm{PF} = -\re(X^\dagger \delta Q_+ Y)\ . +\end{equation} + +\subsubsection{Mass non-degenerate flavour doublet} + +Even/odd preconditioning can also be implemented for the mass +non-degenerate flavour doublet Dirac operator $D_h$ +eq.~(\ref{eq:Dh}). 
Denoting +\[ +Q^h = \gamma_5 D_h +\] +the even/odd decomposition is as follows +\begin{equation} + \label{eq:Dheo} + \begin{split} + Q^h &= + \begin{pmatrix} + (\gamma_5+i\bar\mu\tau^3 -\bar\epsilon\gamma_5\tau^1) & Q^h_{eo}\\ + Q^h_{oe} & (\gamma_5+i\bar\mu\tau^3 -\bar\epsilon\gamma_5\tau^1)\\ + \end{pmatrix} \\ + &= + \begin{pmatrix} + Q^h_{ee} & 0 \\ + Q^h_{oe} & 1 \\ + \end{pmatrix} + \cdot + \begin{pmatrix} + 1 & (Q^h_{ee})^{-1}Q^h_{eo} \\ + 0 & \hat Q^h_{oo} \\ + \end{pmatrix} \\ + \end{split} +\end{equation} +where $\hat Q^h_{oo}$ is given in flavour space by +\begin{equation*} + \hat Q^h_{oo} = \gamma_5 + \begin{pmatrix} + 1 + i\bar\mu\gamma_5 - + \frac{M_{oe}(1-i\bar\mu\gamma_5)M_{eo}}{1+\bar\mu^2-\bar\epsilon^2} & + -\bar\epsilon\left(1+\frac{M_{oe}M_{eo}}{1+\bar\mu^2-\bar\epsilon^2}\right) \\ + -\bar\epsilon\left(1+\frac{M_{oe}M_{eo}}{1+\bar\mu^2-\bar\epsilon^2}\right) & + 1 - i\bar\mu\gamma_5 - + \frac{M_{oe}(1+i\bar\mu\gamma_5)M_{eo}}{1+\bar\mu^2-\bar\epsilon^2}\\ + \end{pmatrix} +\end{equation*} +with the previous definitions of $M_{eo}$ etc. The implementation for +the HMC is very similar to the mass degenerate case. $\hat Q^h$ has +again a hermitian conjugate given by +\[ +(\hat Q^h)^\dagger = \tau^1\ \hat Q^h\ \tau^1 +\] + +\subsubsection{Combining Clover and Twisted mass term} \label{sec:clover_twist} + +We start again with the lattice fermion action in the hopping +parameter representation in the $\chi$-basis now including the clover +term written as +\begin{equation} + \label{eq:eosw0} + \begin{split} + S[\chi,\bar\chi,U] = \sum_x & \Biggl\{ \bar\chi(x)[1+2\kappa + c_{SW}T + 2i \kappa\mu\gamma_5\tau^3]\chi(x) \Bigr. \\ + & -\kappa\bar\chi(x)\sum_{\mu = 1}^4\Bigl[ U(x,\mu)(r+\gamma_\mu)\chi(x+a\hat\mu)\bigr. \\ + & +\Bigl. \bigl. U^\dagger(x-a\hat\mu,\mu)(r-\gamma_\mu)\chi(x-a\hat\mu)\Bigr] + \Biggr\} \\ + \equiv &\sum_{x,y}\bar\chi(x) M_{xy}\chi(y)\, , + \end{split} +\end{equation} +with the clover term $T$. 
For convenience we define +$\tilde\mu\equiv2\kappa\mu$ and $\tilde c_{SW} = 2\kappa +c_{SW}$. Using the matrix $M$ one can define the +(two flavor) operator: +\begin{equation} + \label{eq:eosw1} + Q\equiv \gamma_5 M = \begin{pmatrix} + \Qp & \\\ + & \Qm \\ + \end{pmatrix} +\end{equation} +where the sub-matrices $\Qpm$ can be factorised as follows (Schur +decomposition): +\begin{equation} + \label{eq:eosw2} + \begin{split} + Q^\pm &= \gamma_5\begin{pmatrix} + 1 + T_{ee} \pm i\tilde\mu\gamma_5 & M_{eo} \\ + M_{oe} & 1 + T_{oo} \pm i\tilde\mu\gamma_5 \\ + \end{pmatrix} = + \gamma_5\begin{pmatrix} + M_{ee}^\pm & M_{eo} \\ + M_{oe} & M_{oo}^\pm \\ + \end{pmatrix} \\ + & = + \begin{pmatrix} + \gamma_5M_{ee}^\pm & 0 \\ + \gamma_5M_{oe} & 1 \\ + \end{pmatrix} + \begin{pmatrix} + 1 & (M_{ee}^\pm)^{-1}M_{eo}\\ + 0 & \gamma_5(M_{oo}^\pm-M_{oe}(M_{ee}^\pm)^{-1}M_{eo})\\ + \end{pmatrix}\, . +\end{split} +\end{equation} +Note that $(M_{ee}^\pm)^{-1}$ cannot be computed as easily as in the +case of Twisted mass fermions without clover term. +Using $\det(Q)=\det(\Qp)\det(\Qm)$ the following relation can be derived +\begin{equation} + \label{eq:eosw4} + \begin{split} + \det(\Qpm) &\propto \det(1+T_{ee} \pm i\tilde\mu\gamma_5)\det(\hQpm) \\ + \hQpm &= \gamma_5((1 + T_{oo} \pm i\tilde\mu\gamma_5) - + M_{oe}( 1 + T_{ee} \pm i\tilde\mu\gamma_5 )^{-1}M_{eo})\, , + \end{split} +\end{equation} +where $\hQpm$ is only defined on the odd sites of the lattice. In the +HMC algorithm the second determinant is stochastically estimated using +pseudo fermion fields $\phi_o$: now we write the determinant with +pseudo fermion fields: +\begin{equation} + \begin{split} + \det(\hQp \hQm) &= \int \mathcal{D}\phi_o\,\mathcal{D}\phi^\dagger_o\ + \exp(-S_\mathrm{PF})\\ + S_\mathrm{PF} &\equiv\ \phi_o^\dagger\ \left(\hQp\hQm\right)^{-1}\phi_o\, , + \end{split} +\end{equation} +where the fields $\phi_o$ are defined only on the odd sites of the +lattice. 
From the first factor in the Schur decomposition a second +term needs to be taken into account in the effective action for the +fermion determinant, this reads +\begin{equation} + \label{eq:swdet} + \begin{split} + S_{\det} &= - \log[\det(1+T_{ee} + i\tilde\mu\gamma_5)\ \cdot\ + \det(1+T_{ee} - i\tilde\mu\gamma_5)]\\ + &= -\tr[\log(1+T_{ee} + i\tilde\mu\gamma_5) + \log(1+T_{ee} - + i\tilde\mu\gamma_5)]\, .\\ + \end{split} +\end{equation} +Note that for $\tilde\mu=0$ $\det(1+T_{ee})$ is real. For +$\tilde\mu\neq0$ however, $\det(1+T_{ee}+i\tilde\mu\gamma_5)$ is the +complex conjugate of $\det(1+T_{ee}-i\tilde\mu\gamma_5)$ as the +product of the two must be real. The latter can be seen from +\[ +\begin{split} + &(1+T_{ee} + i\tilde\mu\gamma_5)\ \cdot\ (1+T_{ee} - + i\tilde\mu\gamma_5) = \\ + &(1+T_{ee})^2 + \tilde\mu^2\, .\\ +\end{split} +\] +In order to compute the force corresponding to the effective +action $S_\mathrm{PF}$ we need the variation of $S_\mathrm{PF}$ with +respect to the gauge fields +(using $\delta (A^{-1})=-A^{-1}\delta A A^{-1}$): +\begin{equation} + \label{eq:eosw5} + \begin{split} + \delta S_\mathrm{PF} &= -[\phi_o^\dagger (\hQp \hQm)^{-1}\delta \hQp(\hQp)^{-1}\phi_o + + \phi_o^\dagger(\hQm)^{-1}\delta \hQm (\hQp \hQm)^{-1} \phi_o ] \\ + &= -[X_o^\dagger \delta \hQp Y_o + Y_o^\dagger \delta\hQm X_o] + \end{split} +\end{equation} +with $X_o$ and $Y_o$ defined on the odd sides as +\begin{equation} + \label{eq:eosw6} + X_o = (\hQp \hQm)^{-1} \phi_o,\quad Y_o = (\hQp)^{-1}\phi_o=\hat + \Qm X_o\ , +\end{equation} +where $(\hQpm)^\dagger = \hat Q^\mp$ has been used. The variation of +$\hQpm$ reads +\begin{equation} + \label{eq:eosw7} + \begin{split} + \delta \hQpm = \gamma_5 & \left( \delta T_{oo}-\delta M_{oe}(M_{ee}^\pm )^{-1}M_{eo} - + M_{oe}(M_{ee}^\pm )^{-1}\delta M_{eo}\right. \\ + &\left. 
+ M_{oe}(M_{ee}^\pm )^{-1} \delta T_{ee} (M_{ee}^\pm )^{-1} M_{eo}\right), + \end{split} +\end{equation} +and one finds +\begin{equation} + \label{eq:eosw8} + \begin{split} + \delta S_\mathrm{PF} &= -(X^\dagger\delta \Qp Y + Y^\dagger\delta \Qm X) \\ + &= -(X^\dagger\delta \Qp Y +(X^\dagger\delta \Qp Y)^\dagger) + \end{split} +\end{equation} +where $X$ and $Y$ are now defined over the full lattice as +\begin{equation} + \label{eq:eosw9} + X = + \begin{pmatrix} + -(M_{ee}^-)^{-1}M_{eo}X_o \\ X_o\\ + \end{pmatrix},\quad + Y = + \begin{pmatrix} + -(M_{ee}^+)^{-1}M_{eo}Y_o \\ Y_o\\ + \end{pmatrix}. +\end{equation} +In addition $\delta\Qp = \delta\Qm = \delta Q, M_{eo}^\dagger = +\gamma_5 M_{oe}\gamma_5$ and $M_{oe}^\dagger = \gamma_5 +M_{eo}\gamma_5$ have been used. $\delta Q$ +is now the original +\[ +\delta Q = \gamma_5 +\begin{pmatrix} + \delta T_{ee} & \delta M_{eo} \\ + \delta M_{oe} & \delta T_{oo} \\ +\end{pmatrix} +\] +defined over the full lattice. Since the bosonic part +is quadratic in the $\phi_o$ fields, the $\phi_o$ are generated at the +beginning of each molecular dynamics trajectory with +\begin{equation} + \label{eq:eosw10} + \phi_o = \hQp R, +\end{equation} +where $R$ is a random spinor field taken from a Gaussian distribution +with norm one. + +The additional bit in the action $S_{\det}$ needs to be treated +separately. The variation of this part is +\begin{equation} + \label{eq:eosw11} + \delta S_{\det} = -\tr \left\{ \left[(1+i\tilde\mu\gamma_5 + T_{ee})^{-1} + + (1-i\tilde\mu\gamma_5 + T_{ee})^{-1}\right] \delta T_{ee} \right\} \ . +\end{equation} +The main difference between pure Twisted mass fermions and Twisted +mass fermions plus clover term is that the matrices $M_{ee}$ and +$M_{oo}$ need to be inverted numerically. A stable numerical method +for this task needs to be devised. 
+ +For the implementation it is useful to compute the term +\begin{equation} + \label{eq:Tee} + 1+T_{a\alpha,b\beta} = 1 + \frac{i}{2} c_\mathrm{sw} + \kappa\sigma_{\mu\nu}^{\alpha\beta}F_{\mu\nu}^{\alpha\beta}(x) +\end{equation} +once for all $x$. This is implemented in {\ttfamily clover\_leaf.c} in +the routine {\ttfamily sw\_term}. The twisted mass term is not +included in this routine, as this would require double the storage for +plus and minus $\mu$, respectively. It is easier to add the twisted +mass term in later on. + +The term in eq.~(\ref{eq:Tee}) corresponds to a $12\times12$ matrix +in colour and spin which reduces to two complex $6\times6$ matrices +per site because it is block-diagonal in spin (one matrix for the two +upper spin components, one for the two lower ones). +For each $6\times6$ matrix the off-diagonal $3\times3$ +matrices are just hermitian conjugate to each other since $1+T$ is hermitian. +We therefore get away with storing two times three +$3\times3$ complex matrices. These are stored in the array {\ttfamily + sw[VOLUME][3][2]} of type {\ttfamily su3}. Here, {\ttfamily + sw[x][0][0]} is the upper diagonal $3\times3$ matrix, {\ttfamily + sw[x][1][0]} the upper off-diagonal $3\times3$ matrix and {\ttfamily + sw[x][2][0]} the lower diagonal matrix. The lower off-diagonal +matrix would be the hermitian conjugate of {\ttfamily sw[x][1][0]}. The second +$6\times6$ matrix is stored following the same conventions. + +For computing $S_\mathrm{det}$, we take into account the structure +of the $24 \times 24$ flavour, spin and colour matrix: +\begin{equation} + \label{eq:cloverMee} + M_{ee}(x) = + \begin{pmatrix} + A(x) + i\tilde{\mu} & 0 & 0 & 0 \\ + 0 & B(x) - i\tilde{\mu} & 0 & 0 \\ + 0 & 0 & A(x) - i\tilde{\mu} & 0 \\ + 0 & 0 & 0 & B(x) + i\tilde{\mu} \\ + \end{pmatrix}\, , +\end{equation} +where $A$ and $B$ are the $6 \times 6$ matrices mentioned above and are individually hermitian. 
+ +The implementation {\ttfamily sw\_trace} in {\ttfamily clover\_det.c} populates a temporary $6\times6$ array +from the {\ttfamily sw} array and adds $+i\tilde\mu$ to the diagonal. Using $\det(\gamma_5) = 1$, +the contribution to the effective action is then: +\begin{equation} + \label{eq:cloverdet} + \begin{aligned} + \log \det(M_{ee}) &= \log\left( |\det( A + i\tilde{\mu} )|^2 \cdot |\det( B + i\tilde{\mu} )|^2 \right) \\ + & = \log\left( |\det( A + i\tilde{\mu} )|^2 \right) + \log \left( |\det( B + i\tilde{\mu} )|^2 \right)\,, + \end{aligned} +\end{equation} +where the summands are computed individually in a loop. + +When it comes to computing the inverse of $1\pm i \mu\gamma_5 + +T_{ee}$, the dependence on the sign of $\mu$ is unavoidable. However, +it is only needed for even (odd) sites, so we can use an array +{\ttfamily sw\_inv[VOLUME][3][2]} of type {\ttfamily su3} to store +e.g. $+\mu$ at even and $-\mu$ at odd sites. + +For evaluating the force for $S_\mathrm{det}$ in the function +{\ttfamily sw\_deriv} we have to compute +\begin{equation} + \label{eq:trdiracdet} + \tr_\mathrm{dirac}[\ i\sigma_{\mu\nu}(1+T_{ee}(x)\pm + i\tilde\mu\gamma_5)^{-1}\ ]\, , +\end{equation} +with $\sigma_{\mu\nu} = i\gamma_\mu\gamma_\nu\ \forall \mu\neq\nu$. +The matrix $(1+T_{ee}(x)\pm i\tilde\mu\gamma_5)^{-1}$ has the general +structure +\[ +T_\mathrm{det} = +\begin{pmatrix} + u_0 & u_1 & 0 & 0 \\ + u_3 & u_2 & 0 & 0 \\ + 0 & 0 & l_0 & l_1 \\ + 0 & 0 & l_3 & l_2 \\ +\end{pmatrix}\,. +\] +Evaluating eq.~(\ref{eq:trdiracdet}) with matrix $T_\mathrm{det}$ for +$\mu\neq\nu$ leads to the following terms +\begin{eqnarray*} + \label{eq:trsigma} + \mu\nu & \\ + 01 & -i (( l_1 -u_1) + (l_3-u_3))\\ + 02 & (l_1 - u_1) - (l_3 - u_3)\\ + 03 & i((l_2-u_2) - (l_0-u_0))\\ + 12 & i((l_2+u_2) - (l_0+u_0))\\ + 13 & (l_3+u_3) - (l_1 + u_1)\\ + 23 & -i(l_3+u_3+l_1+u_1)\,. 
+\end{eqnarray*} +The force for $S_\mathrm{PF}$ can be computed in exactly the same way, +even if in this case the matrix $T_\mathrm{PF}$ is a full matrix +stemming from +\begin{equation} + \label{eq:trdiracpf} + \tr_\mathrm{dirac}[\ i\sigma_{\mu\nu}(\gamma_5Y(x)\otimes + X^\dagger(x) + \gamma_5X(x)\otimes Y^\dagger(x))\ ]\equiv + \tr_\mathrm{dirac}[\ i\sigma_{\mu\nu}\ T_\mathrm{PF}\ ]\, . +\end{equation} +$T_\mathrm{PF}$ is computed in the function {\ttfamily sw\_spinor}. +After multiplying with +$\sigma_{\mu\nu}$ only the upper left and lower right blocks survive +and the structure stays identical to the case discussed for +$T_\mathrm{det}$. So in both cases, in order to compute the trace, we +have to compute first in the functions {\ttfamily sw\_spinor} and +{\ttfamily sw\_deriv} only +\begin{equation} + m_i = l_i - u_i\,,\quad p_i = l_i + u_i\quad i = 0,\ldots,3\,. +\end{equation} +The $m_i$ and $p_i$ are then passed on to the function {\ttfamily + sw\_all} which combines them to the correct insertion matrices, +whereafter the traceless antihermitian part of it is +computed. Finally, $\delta T_{ee}$ is computed and combined with the +insertion matrices. + +\subsubsection{Combining Clover and Nondegenerate Twisted mass term} + +Now we have +\[ +\hat Q^h_{oo} = \gamma_5(M_{oo}^h - +M_{oe}^h\ (M_{ee}^h)^{-1}\ M_{eo}^h)\,, +\] +with +\begin{equation} + M_{oo|ee}^h = 1+T_{oo|ee}+i\bar\mu\gamma_5\tau^3-\bar\epsilon\tau^1\,. +\end{equation} + +The clover part $1+T_{ee}$ is identical to the one in the $N_f=2$ +flavour case and stored in the array {\ttfamily sw}. + +Because $1+T_{ee}$ is hermitian, we can invert $M_{ee}^h$ by +\begin{equation} + \label{eq:ndSdet} + (1+T_{ee}+i\bar\mu\gamma_5\tau^3-\bar\epsilon\tau^1)^{-1} = + \frac{(1+T_{ee}-i\bar\mu\gamma_5\tau^3+\bar\epsilon\tau^1)} + {(1+T_{ee})^2 + \bar\mu^2 - \bar\epsilon^2}\,. 
+\end{equation} +In practice we compute $((1+T_{ee})^2 + \bar\mu^2 - +\bar\epsilon^2)^{-1}$ and store the result in the first {\ttfamily + VOLUME/2} elements of the array {\ttfamily sw\_inv}. Wherever the +clover term needs to be applied we then multiply with $((1+T_{ee})^2 ++ \bar\mu^2 - \bar\epsilon^2)^{-1}$ and then with the numerator in +eq.~(\ref{eq:ndSdet}). One could save computing time here for the +price of using more memory by storing the full inverse. Actually, it +would be only slightly more than in the two flavour case: in addition +we would only have to store $\bar\epsilon((1+T_{ee})^2 ++ \bar\mu^2 - \bar\epsilon^2)^{-1}$. This would also allow us to re-use a +lot of the $N_f=2$ flavour implementation. + +The determinant we have to compute is +\[ +\det(Q^h) = +\det[\gamma_5(1+T_{ee}+i\bar\mu\gamma_5\tau^3-\bar\epsilon\tau^1)]\ +\det[\hat Q^h_{oo}]. +\] + +Again, the first factor can be computed as $S_\mathrm{det}$, for which we take into +account the structure of the $24 \times 24$ flavour, spin and colour matrix: +\begin{equation} + \label{eq:cloverMee_eps} + M^h_{ee}(x) = + \begin{pmatrix} + A(x) + i\bar{\mu} & 0 & -\bar{\epsilon} & 0 \\ + 0 & B(x) - i\bar{\mu} & 0 & -\bar{\epsilon} \\ + -\bar{\epsilon} & 0 & A(x) - i\bar{\mu} & 0 \\ + 0 & -\bar{\epsilon} & 0 & B(x) + i\bar{\mu} \\ + \end{pmatrix}\, , +\end{equation} +where $A$ and $B$ are the $6 \times 6$ matrices mentioned in sub-section +\ref{sec:clover_twist} and are individually hermitian. 
+ +The determinant of the $24 \times 24$ matrix can be simplified by writing it as follows +in $12 \times 12$ blocks in flavour: +\begin{equation*} + \begin{aligned} + \det(M^h_{ee}) &= + \det + \begin{pmatrix} + K & D \\ + D & K^\dagger + \end{pmatrix} = + \det \left[ \begin{pmatrix} + K & D - K D^{-1} K^\dagger \\ + D & 0 + \end{pmatrix} \cdot + \begin{pmatrix} + 1 & D^{-1} K^\dagger \\ + 0 & 1 + \end{pmatrix} \right] \\ + &= - \det(D) \cdot \det( D - K D^{-1} K^\dagger ) \\ + &= \det( K K^\dagger - D^2 ) \\ + &= \det( A^2 + \bar{\mu}^2 - \bar{\epsilon}^2 ) \cdot \det( B^2 + \bar{\mu}^2 - \bar{\epsilon}^2 ) \,, + \end{aligned} +\end{equation*} +where the sign in the second line comes from the first term and in the third line the +proportionality of $D$ to the identity matrix was used. + +The implementation {\ttfamily sw\_trace\_nd} in {\ttfamily clover\_det.c} populates +a temporary $6\times6$ array from the {\ttfamily sw} array, squares it and +adds $\bar{\mu}^2 - \bar{\epsilon}^2$ to the diagonal. Using $\det(\gamma_5) = 1$, +the contribution to the effective action is then: +\begin{equation} + \label{eq:cloverdet_nd} + \log \det(M^h_{ee}) = \log\left( \det( A^2 + \bar{\mu}^2 - \bar{\epsilon}^2 ) \cdot \det( B^2 + \bar{\mu}^2 - \bar{\epsilon}^2 ) \right). +\end{equation} + +For the variation of this term we have to compute now +\begin{equation} + \label{eq:ndtrdiracdet} + \tr_\mathrm{dirac, flavour}[\ i\sigma_{\mu\nu}(1+T_{ee}(x) + + i\bar\mu\gamma_5\tau^3 - \bar\epsilon\tau^1)^{-1}\ ]\, , +\end{equation} +which is equal to +\begin{equation} + \tr_\mathrm{dirac,flavour}\left[\ i\sigma_{\mu\nu} + \frac{(1+T_{ee}-i\bar\mu\gamma_5\tau^3+\bar\epsilon\tau^1)} + {(1+T_{ee})^2 + \bar\mu^2 - \bar\epsilon^2}\ \right]\,. 
+\end{equation} +This can be treated analogously to the degenerate case described +above. + +\subsection{Inversion} + +In addition to even/odd preconditioning in the HMC algorithm as +described above, it can also be used to speed up the inversion of the +fermion matrix. + +Due to the factorization (\ref{eq:eo2}) the full fermion matrix can be +inverted by inverting the two matrices appearing in the factorization +\[ +\begin{pmatrix} + M_{ee}^\pm & M_{eo} \\ + M_{oe} & M_{oo}^\pm \\ +\end{pmatrix}^{-1} += +\begin{pmatrix} + 1 & (M_{ee}^\pm)^{-1}M_{eo}\\ + 0 & (M_{oo}^\pm-M_{oe}(M_{ee}^\pm)^{-1}M_{eo})\\ +\end{pmatrix}^{-1} +\begin{pmatrix} + M_{ee}^\pm & 0 \\ + M_{oe} & 1 \\ +\end{pmatrix}^{-1}\, . +\] +The two factors can be simplified as follows: +\[ +\begin{pmatrix} + M_{ee}^\pm & 0 \\ + M_{oe} & 1 \\ +\end{pmatrix}^{-1} += +\begin{pmatrix} + (M_{ee}^\pm)^{-1} & 0 \\ + -M_{oe} (M_{ee}^{\pm})^{-1} & 1 \\ + \end{pmatrix} +\] +and +\[ +\begin{split} + &\begin{pmatrix} + 1 & (M_{ee}^\pm)^{-1}M_{eo}\\ + 0 & (M_{oo}^\pm-M_{oe}(M_{ee}^\pm)^{-1}M_{eo})\\ + \end{pmatrix}^{-1} + \\=& + \begin{pmatrix} + 1 & -(M_{ee}^\pm)^{-1}M_{eo}(M_{oo}^\pm-M_{oe}(M_{ee}^\pm)^{-1}M_{eo})^{-1} \\ + 0 & (M_{oo}^\pm-M_{oe}(M_{ee}^\pm)^{-1}M_{eo})^{-1}\\ + \end{pmatrix}\, . +\end{split} +\] +The complete inversion is now performed in two separate steps: First +we compute for a given source field $\phi=(\phi_e,\phi_o)$ an intermediate +result $\varphi=(\varphi_e,\varphi_o)$ by: +\[ +\begin{pmatrix} + \varphi_e \\ \varphi_o\\ +\end{pmatrix} += +\begin{pmatrix} + M_{ee}^\pm & 0 \\ + M_{oe} & 1 \\ +\end{pmatrix}^{-1} +\begin{pmatrix} + \phi_e \\ \phi_o \\ +\end{pmatrix} += +\begin{pmatrix} + (M_{ee}^\pm)^{-1} \phi_e \\ + -M_{oe}( M_{ee}^\pm)^{-1} \phi_e + \phi_o \\ +\end{pmatrix}\, . +\] +This step requires only the application of $M_{oe}$ and +$(M_{ee}^\pm)^{-1}$, the latter of which is given by Eq~(\ref{eq:eo3}). 
+The final solution $\psi=(\psi_e,\psi_o)$ can then be computed with +\[ +\begin{pmatrix} + \psi_e \\ \psi_o \\ +\end{pmatrix} += +\begin{pmatrix} + 1 & (M_{ee}^\pm)^{-1}M_{eo}\\ + 0 & (M_{oo}^\pm-M_{oe}(M_{ee}^\pm)^{-1}M_{eo})\\ +\end{pmatrix}^{-1} +\begin{pmatrix} + \varphi_e \\ \varphi_o \\ +\end{pmatrix} += +\begin{pmatrix} + \varphi_e - (M_{ee}^\pm)^{-1}M_{eo}\psi_o \\ \psi_o \\ +\end{pmatrix}\, , +\] +where we defined +\[ +\psi_o = (M_{oo}^\pm-M_{oe}(M_{ee}^\pm)^{-1}M_{eo})^{-1} \varphi_o\, . +\] +Therefore the only inversion that has to be performed numerically is +the one to generate $\psi_o$ from $\varphi_o$ and this inversion +involves only an operator that is better conditioned than the original +fermion operator. + +Even/odd preconditioning can also be used for the mass non-degenerate +Dirac operator $D_h$ eq.~(\ref{eq:Dh}). The corresponding equations +follow immediately from the previous discussion and the definition +from eq.~(\ref{eq:Dheo}). + +\subsubsection{Inverting $M$ on $\phi_o$} + +In case inverting the full matrix $M$ is much faster than inverting +the even/odd preconditioned matrix -- as might be the case with +deflation, one may use for symmetric even/odd preconditioning +\begin{equation} + (\hat M^\pm)^{-1}\phi_o\ =\ P_{l\to o}\ (M_\pm)^{-1}\ P_{o\to l}\ + M^\pm_{oo}\ \phi_o +\end{equation} +where $P_{l\to o}$ projects the odd sites of a full spinor and +$P_{o\to l}$ reverses this by filling up with zeros. $M_\pm$ is here just +$\gamma_5 Q_\pm$. For asymmetric even/odd preconditioning the formula +reads +\begin{equation} + (\hat M^\pm)^{-1}\phi_o\ =\ P_{l\to o}\ (M_\pm)^{-1}\ P_{o\to l}\ + \phi_o\, . 
+\end{equation} +It is based on the observation that +\[ +M^{-1} = +\begin{pmatrix} + A_{ee} & A_{eo} \\ + A_{oe} & A_{oo} \\ +\end{pmatrix} +\] +with (skipping the $\pm$ index for brevity) +\[ +\begin{split} + A_{ee}\quad &=\quad (1- M_{ee}^{-1} M_{eo} M_{oo}^{-1} M_{oe})^{-1}\ M_{ee}^{-1} \\ + A_{eo}\quad &=\quad -M_{ee}^{-1}\ M_{eo}\ A_{oo} \\ + A_{oe}\quad &=\quad -M_{oo}^{-1}\ M_{oe}\ A_{ee} \\ + A_{oo}\quad &=\quad (1- M_{oo}^{-1} M_{oe} M_{ee}^{-1} M_{eo})^{-1}\ M_{oo}^{-1} \\ +\end{split} +\] +\endinput + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/gamma.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/gamma.tex new file mode 100644 index 0000000000000000000000000000000000000000..0e678ef8ec44ccb5c90c05073c48bcd3c5b34bf3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/gamma.tex @@ -0,0 +1,82 @@ +\label{sec:gammas} + +In the following we specify our conventions for $\gamma$- and +Pauli-matrices. + +\subsection{$\gamma$-matrices} + +We use the following convention for the Dirac $\gamma$-matrices: +\[ +\begin{split} + \gamma_0 = -\begin{pmatrix} + 0 & 0 & -1 & 0 \\ + 0 & 0 & 0 & -1 \\ + -1 & 0 & 0 & 0 \\ + 0 & -1 & 0 & 0 \\ + \end{pmatrix},\quad + \gamma_1 = -\begin{pmatrix} + 0 & 0 & 0 & -i \\ + 0 & 0 & -i & 0 \\ + 0 & +i & 0 & 0 \\ + +i & 0 & 0 & 0 \\ + \end{pmatrix},\\ + \gamma_2 = -\begin{pmatrix} + 0 & 0 & 0 & -1 \\ + 0 & 0 & +1 & 0 \\ + 0 & +1 & 0 & 0 \\ + -1 & 0 & 0 & 0 \\ + \end{pmatrix},\quad + \gamma_3 = -\begin{pmatrix} + 0 & 0 & -i & 0 \\ + 0 & 0 & 0 & +i \\ + +i & 0 & 0 & 0 \\ + 0 & -i & 0 & 0 \\ + \end{pmatrix}\ .\\ +\end{split} +\] +In this representation $\gamma_5$ is diagonal and reads +\[ + \gamma_5 = + \begin{pmatrix} + +1 & 0 & 0 & 0 \\ + 0 & +1 & 0 & 0 \\ + 0 & 0 & -1 & 0 \\ + 0 & 0 & 0 & -1 \\ + \end{pmatrix}\ . 
+\] + +\subsection{Pauli-matrices} + +For the Pauli-matrices acting in flavour space we use the following +convention: +\[ +\begin{split} + 1_f = + \begin{pmatrix} + 1 & 0 \\ + 0 & 1 \\ + \end{pmatrix},\quad + \tau^1 = + \begin{pmatrix} + 0 & 1 \\ + 1 & 0 \\ + \end{pmatrix},\quad + \tau^2 = + \begin{pmatrix} + 0 & -i \\ + i & 0 \\ + \end{pmatrix},\quad + \tau^3 = + \begin{pmatrix} + 1 & 0 \\ + 0 & -1 \\ + \end{pmatrix} +\end{split} +\] + +\endinput + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/gensources.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/gensources.tex new file mode 100644 index 0000000000000000000000000000000000000000..045dc07d001190388a9438d80ae4637f0f662b01 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/gensources.tex @@ -0,0 +1,41 @@ +\subsection{Programme {\ttfamily gen\_sources}} + +The programme {\ttfamily gen\_sources} provides an interface to +generate stochastic sources for several different situations. It is +able to generate those for the nucleon case (which should not be used, +because point sources are optimal), for mesons in general and for the +special case of the pion only. + +The programme offers command line options as follows: +\begin{itemize} +\item {\ttfamily -h|?} a help. 
+\item {\ttfamily -L} the spatial lattice size +\item {\ttfamily -T} the temporal lattice size +\item {\ttfamily -o} the base filename of the sources (default is + {\ttfamily source}) +\item {\ttfamily -n} the configuration number (default is $0$) +\item {\ttfamily -s} the sample number (default is $0$) +\item {\ttfamily -t} the value of the start timeslice (default $0$) +\item {\ttfamily -S} the spatial spacing/dilution (default $1$) +\item {\ttfamily -P} the temporal spacing/dilution (default $T$) +\item {\ttfamily -N} produce nucleon sources (default meson sources) +\item {\ttfamily -p} plain output filename (see below) +\item {\ttfamily -O} the special pion only case +\item {\ttfamily -E} extended sources for pion three point + functions. Together with {\ttfamily -O} +\item {\ttfamily -d} write source in double precision (default single) +\item {\ttfamily -a} write all sources in one file rather than $12$ + (pion only is one file anyhow) +\end{itemize} +The output filename is generated like {\ttfamily + base.sampleno.gaugeno.tsno.00-11}, unless {\ttfamily -p} is chosen, +which would correspond to {\ttfamily base.00-11}. + +The special pion only case corresponds to a single timeslice source +without any dilution in spin or colour or space. 
+ +\endinput +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/h-physrev5.bst b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/h-physrev5.bst new file mode 100644 index 0000000000000000000000000000000000000000..344aa0929affe0ba0ee4e3b17c3ef98504dc7705 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/h-physrev5.bst @@ -0,0 +1,1891 @@ +%%h-physrev5.bst +%% modified to work with new eprint format +%% Carsten Urbach 07/08 + +%%h-physrev4.bst + +%%Modified by jonivar skullerud + +%%(1) eprint numbers in square brackets for published articles + +%%(2) for unpublished articles (==missing journal), only the eprint number is given + +%%(3) no comma before and in lists of names (author lists) + +%%h-physrev3.bst + +%%Modified to avoid extra comma at end of citations + +%%h-physrev2.bst + +%%Jonathan Flynn's h-physrev.bst modified to allow citation information + +%%for SPIRES processing by Heath O'Connell following suggestions by + +%%Jacques Distler. 
+ + + +ENTRY + +{ +address +eprint +author +booktitle +chapter +collaboration +SLACcitation +edition +editor +howpublished +institution +journal +key +month +note +number +organization +pages +publisher +school +series +title +type +volume +year +archive +archivePrefix +primaryClass +url +doi +} + +{} + +{ label } + +INTEGERS { output.state before.all mid.sentence after.sentence after.block } + +FUNCTION {init.state.consts} + +{ #0 'before.all := + +#1 'mid.sentence := + +#2 'after.sentence := + +#3 'after.block := + +} + +STRINGS { s t } + +FUNCTION {output.nonnull} + +{ 's := + +output.state mid.sentence = + +{ ", " * write$ } + +{ output.state after.block = + +{ "," * write$ + +newline$ + +"\newblock " write$ + +} + +{ output.state before.all = + +'write$ + +{ add.period$ " " * write$ } + +if$ + +} + +if$ + +mid.sentence 'output.state := + +} + +if$ + +s + +} + +FUNCTION {output} + +{ duplicate$ empty$ + +'pop$ + +'output.nonnull + +if$ + +} + +FUNCTION {output.check} + +{ 't := + +duplicate$ empty$ + +{ pop$ "empty " t * " in " * cite$ * warning$ } + +'output.nonnull + +if$ + +} + +FUNCTION {output.bibitem} + +{ newline$ + +"\bibitem{" write$ + +cite$ write$ + +"}" write$ + +newline$ + +"" + +before.all 'output.state := + +} + +FUNCTION {fin.entry} + +{ add.period$ + +write$ + +% modified to add SLACcitation field if present + +SLACcitation empty$ + + 'skip$ + + { newline$ SLACcitation write$ } + + if$ + +% end of modification + +newline$ + +} + +FUNCTION {new.block} + +{ output.state before.all = + +'skip$ + +{ after.block 'output.state := } + +if$ + +} + +FUNCTION {new.sentence} + +{ skip$ + +} + +FUNCTION {not} + +{ { #0 } + +{ #1 } + +if$ + +} + +FUNCTION {and} + +{ 'skip$ + +{ pop$ #0 } + +if$ + +} + +FUNCTION {or} + +{ { pop$ #1 } + +'skip$ + +if$ + +} + +FUNCTION {new.block.checka} + +{ empty$ + +'skip$ + +'new.block + +if$ + +} + +FUNCTION {new.block.checkb} + +{ empty$ + +swap$ empty$ + +and + +'skip$ + +'new.block + +if$ + +} + +FUNCTION 
{new.sentence.checka} + +{ empty$ + +'skip$ + +'new.sentence + +if$ + +} + +FUNCTION {new.sentence.checkb} + +{ empty$ + +swap$ empty$ + +and + +'skip$ + +'new.sentence + +if$ + +} + +FUNCTION {field.or.null} + +{ duplicate$ empty$ + +{ pop$ "" } + +'skip$ + +if$ + +} + +FUNCTION {emphasize} + +{ duplicate$ empty$ + +{ pop$ "" } + +{ "{\em " swap$ * "}" * } + +if$ + +} + +FUNCTION {embolden} + +{ duplicate$ empty$ + +{ pop$ "" } + +{ "{\bf " swap$ * "}" * } + +if$ + +} + +FUNCTION {paren} + +{ duplicate$ empty$ + +{ pop$ "" } + +{ "(" swap$ * ")" * } + +if$ + +} + +FUNCTION {sparen} + +{ duplicate$ empty$ + +{ pop$ "" } + +{ "[" swap$ * "]" * } + +if$ + +} + +INTEGERS { nameptr namesleft numnames } + +INTEGERS { etal } + +FUNCTION {format.names} + +{ 's := + +#1 'nameptr := + +s num.names$ 'numnames := + +numnames #5 > + +s numnames "{ll}" format.name$ "others" = numnames #1 > and + +or 'etal := + +etal + +{ #1 #1 + 'namesleft := } + +{ numnames 'namesleft := } + +if$ + +{ namesleft #0 > } + +{ s nameptr "{f.~}{vv~}{ll}{, jj}" format.name$ 't := + +nameptr #1 > + +{ namesleft #1 > + +{ ", " * t * } + +{ + +% jis: we do not want comma before and in any case. 
uncomment if you do want it + +% nameptr #2 > + +%{ "," * } + +%'skip$ + +%if$ + +t "others" = + +etal or + +{ " {\em et~al.}" * } + +{ " and " * t * } + +if$ + +} + +if$ + +} + +'t + +if$ + +nameptr #1 + 'nameptr := + +namesleft #1 - 'namesleft := + +} + +while$ + +} + +FUNCTION {format.authors} + +{ author empty$ + +{ "" } + +{ author format.names } + +if$ + +} + +FUNCTION {format.collaboration} +{ collaboration empty$ + { "" } + { "{\bf " collaboration * "} " * "Collaboration" * } + if$ +} + +FUNCTION {format.archive} +{ + archivePrefix empty$ + { "" } + { archivePrefix ":" *} + if$ +} + +FUNCTION {format.primaryClass} +{ + primaryClass empty$ + { "" } + { " [" primaryClass * "]" *} + if$ +} + +FUNCTION {format.eprint} +{ eprint empty$ + { ""} + { archive empty$ + {"\href{http://arxiv.org/abs/" eprint * "}" * + "{{\tt " * format.archive * eprint * + format.primaryClass * "}}" *} + {"\href{" archive * "/" * eprint * "}" * + "{{\tt " * format.archive * eprint * + format.primaryClass * "}}" *} + if$ + } + if$ +} + +FUNCTION {format.url} +{ url empty$ + { "" } + {"\url{" url * "}" *} + if$ +} + +FUNCTION {add.doi} +{ duplicate$ empty$ + { skip$ } + { doi empty$ + {} + {"\href{http://dx.doi.org/" doi * "}{" * swap$ * "}" *} + if$ + } + if$ +} + +FUNCTION {format.editors} + +{ editor empty$ + +{ "" } + +{ editor format.names + +editor num.names$ #1 > + +{ ", editors" * } + +{ ", editor" * } + +if$ + +} + +if$ + +} + +FUNCTION {format.edited} + +{ editor empty$ + +{ "" } + +{ "edited by " editor format.names * } + +if$ + +} + +FUNCTION {format.title} + +{ title empty$ + +{ "" } + +{ title "t" change.case$ } + +if$ + +} + +FUNCTION {n.dashify} + +{ 't := + +"" + +{ t empty$ not } + +{ t #1 #1 substring$ "-" = + +{ t #1 #2 substring$ "--" = not + +{ "--" * + +t #2 global.max$ substring$ 't := + +} + +{ { t #1 #1 substring$ "-" = } + +{ "-" * + +t #2 global.max$ substring$ 't := + +} + +while$ + +} + +if$ + +} + +{ t #1 #1 substring$ * + +t #2 global.max$ substring$ 't := 
+ +} + +if$ + +} + +while$ + +} + +FUNCTION {first.page} + +{ 't := + +"" + +{ t empty$ not t #1 #1 substring$ "-" = not and } + +{ t #1 #1 substring$ * + +t #2 global.max$ substring$ 't := + +} + +while$ + +} + +FUNCTION {format.date} + +{ year empty$ + +{ "" } + +'year + +if$ + +} + +%FUNCTION {format.SLACcitation} + +%{ SLACcitation empty$ + +% {""} + +% { SLACcitation } + +% if$ + +%} + +FUNCTION {format.btitle} + +{ title emphasize + +} + +FUNCTION {tie.or.space.connect} + +{ duplicate$ text.length$ #3 < + +{ "~" } + +{ " " } + +if$ + +swap$ * * + +} + +FUNCTION {either.or.check} + +{ empty$ + +'pop$ + +{ "can't use both " swap$ * " fields in " * cite$ * warning$ } + +if$ + +} + +FUNCTION {format.bvolume} + +{ volume empty$ + +{ "" } + +{ series empty$ + +'skip$ + +{ ", " series * } + +if$ + +" Vol." volume tie.or.space.connect * + +"volume and number" number either.or.check + +} + +if$ + +} + +FUNCTION {format.number.series} + +{ volume empty$ + +{ number empty$ + +{ series field.or.null } + +{ series empty$ + +{ "there's a number but no series in " cite$ * warning$ } + +{ ", " series * } + +if$ + +" No. " number tie.or.space.connect * + +} + +if$ + +} + +{ "" } + +if$ + +} + +FUNCTION {format.edition} + +{ edition empty$ + +{ "" } + +{ output.state mid.sentence = + +{ ", " edition "l" change.case$ * } + +{ ", " edition "t" change.case$ * } + +if$ + +" ed." * + +} + +if$ + +} + +INTEGERS { multiresult } + +FUNCTION {multi.page.check} + +{ 't := + +#0 'multiresult := + +{ multiresult not + +t empty$ not + +and + +} + +{ t #1 #1 substring$ + +duplicate$ "-" = + +swap$ duplicate$ "," = + +swap$ "+" = + +or or + +{ #1 'multiresult := } + +{ t #2 global.max$ substring$ 't := } + +if$ + +} + +while$ + +multiresult + +} + +FUNCTION {format.pages} + +{ pages empty$ + +{ "" } + +{ pages multi.page.check + +{ "pp." pages n.dashify tie.or.space.connect } + +{ "p." 
pages tie.or.space.connect } + +if$ + +} + +if$ + +} + +FUNCTION {format.pages.a} + +{ pages empty$ + +{ "" } + +{ "p." pages first.page tie.or.space.connect } + +if$ + +} + +FUNCTION {format.vol.num.pages} + +{ volume field.or.null embolden + +" " swap$ * * + +pages empty$ + +'skip$ + +{ duplicate$ empty$ + +{ pop$ format.pages.a } + +{ ", " * pages first.page * } + +if$ + +} + +if$ + +} + +FUNCTION {format.chapter.pages} + +{ chapter empty$ + +'format.pages + +{ type empty$ + +{ "chap." } + +{ type "l" change.case$ } + +if$ + +chapter tie.or.space.connect + +pages empty$ + +'skip$ + +{ ", " * format.pages * } + +if$ + +} + +if$ + +} + +FUNCTION {format.pub.addr.date} + +{ publisher empty$ + +{ "" "empty publisher in " cite$ * warning$ } + +{ publisher + +address empty$ + +'skip$ + +{ ", " * address * } + +if$ + +} + +if$ + +year empty$ + +{ "empty year in " cite$ * warning$ } + +{ ", " * year * } + +if$ + +paren " " swap$ * + +} + +FUNCTION {format.book.entry} + +{ format.btitle + +format.bvolume * + +format.number.series * + +format.edition * + +format.pub.addr.date * + +} + +FUNCTION {format.inbook.entry} + +{ format.book.entry + +", " * + +format.chapter.pages * + +} + +FUNCTION {format.in.ed.booktitle} + +{ booktitle empty$ + +{ "" } + +{ editor empty$ + +{ "in " booktitle emphasize * } + +{ "in " booktitle emphasize * ", " * format.edited * } + +if$ + +} + +if$ + +} + +FUNCTION {empty.misc.check} + +{ author empty$ title empty$ howpublished empty$ + +month empty$ year empty$ note empty$ + +and and and and and + +{ "all relevant fields are empty in " cite$ * warning$ } + +'skip$ + +if$ + +} + +FUNCTION {format.thesis.type} + +{ type empty$ + +'skip$ + +{ pop$ + +type "t" change.case$ + +} + +if$ + +} + +FUNCTION {format.inst.tr.num.date} + +{ institution empty$ + +{ "" "empty institution in " cite$ * warning$ } + +{ institution } + +if$ + +" Report No." 
* + +number empty$ + +{ "" } + +{ number tie.or.space.connect } + +if$ + +year empty$ + +{ "empty year in " cite$ * warning$ } + +{ ", " * year * " (unpublished)" * } + +if$ + +} + +FUNCTION {format.article.crossref} + +{ key empty$ + +{ journal empty$ + +{ "need key or journal for " cite$ * " to crossref " * crossref * + +warning$ + +"" + +} + +{ "In " journal * } + +if$ + +} + +{ "In " key * } + +if$ + +" \cite{" * crossref * "}" * + +} + +FUNCTION {format.crossref.editor} + +{ editor #1 "{vv~}{ll}" format.name$ + +editor num.names$ duplicate$ + +#2 > + +{ pop$ " {\em et~al.}" * } + +{ #2 < + +'skip$ + +{ editor #2 "{ff }{vv }{ll}{ jj}" format.name$ "others" = + +{ " {\em et~al.}" * } + +{ " and " * editor #2 "{vv~}{ll}" format.name$ * } + +if$ + +} + +if$ + +} + +if$ + +} + +FUNCTION {format.book.crossref} + +{ volume empty$ + +{ "empty volume in " cite$ * "'s crossref of " * crossref * warning$ + +"In " + +} + +{ "Volume" volume tie.or.space.connect + +" of " * + +} + +if$ + +editor empty$ + +editor field.or.null author field.or.null = + +or + +{ key empty$ + +{ series empty$ + +{ "need editor, key, or series for " cite$ * " to crossref " * + +crossref * warning$ + +"" * + +} + +{ "{\em " * series * "\/}" * } + +if$ + +} + +{ key * } + +if$ + +} + +{ format.crossref.editor * } + +if$ + +" \cite{" * crossref * "}" * + +} + +FUNCTION {format.incoll.inproc.crossref} + +{ editor empty$ + +editor field.or.null author field.or.null = + +or + +{ key empty$ + +{ booktitle empty$ + +{ "need editor, key, or booktitle for " cite$ * " to crossref " * + +crossref * warning$ + +"" + +} + +{ "In {\em " booktitle * "\/}" * } + +if$ + +} + +{ "In " key * } + +if$ + +} + +{ "In " format.crossref.editor * } + +if$ + +" \cite{" * crossref * "}" * + +} + +FUNCTION {article} +{ output.bibitem + format.collaboration output + format.authors "author" output.check + new.block + crossref missing$ + { journal missing$ + 'skip$ + { journal field.or.null + format.vol.num.pages + format.date 
empty$ + 'skip$ + { duplicate$ empty$ + { pop$ format.date paren } + { " " * format.date paren * } + if$ + } + if$ + output + } + if$ + } + { format.article.crossref output.nonnull + format.pages output + } + if$ + journal missing$ + { eprint missing$ +% put in the year at least... + { format.date paren output } + { format.eprint output } + if$ + } + { format.eprint output } + if$ + new.block + format.url output + new.block + note output + %format.SLACcitation output + fin.entry +} + +FUNCTION {book} +{ output.bibitem + collaboration output + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + new.block + crossref missing$ + { format.book.entry output } + { new.block + format.book.crossref output.nonnull + } + if$ + format.eprint output + new.block + note output + %format.SLACcitation output + fin.entry +} + +FUNCTION {booklet} +{ output.bibitem + format.collaboration output + format.authors output + new.block + format.title "title" output.check + howpublished address new.block.checkb + howpublished output + address output + format.date output + format.eprint output + new.block + note output + %format.SLACcitation output + fin.entry +} + +FUNCTION {inbook} +{ output.bibitem + format.collaboration output + author empty$ + { format.editors "author and editor" output.check } + { format.authors output.nonnull + crossref missing$ + { "author and editor" editor either.or.check } + 'skip$ + if$ + } + if$ + new.block + crossref missing$ + { format.inbook.entry output } + { format.chapter.pages "chapter and pages" output.check + new.block + format.book.crossref output.nonnull + } + if$ + format.eprint output + new.block + note output + %format.SLACcitation output + fin.entry +} + +FUNCTION {incollection} +{ output.bibitem + format.collaboration output + format.authors "author" output.check + new.block + format.title "title" 
output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.chapter.pages output + new.sentence + publisher "publisher" output.check + address output + format.edition output + format.date "year" output.check + } + { format.incoll.inproc.crossref output.nonnull + format.chapter.pages output + } + if$ + format.eprint output + new.block + note output + %format.SLACcitation output + fin.entry +} + +FUNCTION {inproceedings} +{ output.bibitem + format.collaboration output + format.authors "author" output.check + new.block + format.title "title" output.check + new.block + crossref missing$ + { format.in.ed.booktitle "booktitle" output.check + format.bvolume output + format.number.series output + format.pages output + address empty$ + { organization publisher new.sentence.checkb + organization output + publisher output + format.date "year" output.check + } + { address output.nonnull + format.date "year" output.check + new.sentence + organization output + publisher output + } + if$ + } + { format.incoll.inproc.crossref output.nonnull + format.pages output + } + if$ + format.eprint output + new.block + note output + %format.SLACcitation output + fin.entry +} + +FUNCTION {conference} { inproceedings } + +FUNCTION {manual} + +{ output.bibitem + +format.collaboration output + +author empty$ + +{ organization empty$ + +'skip$ + +{ organization output.nonnull + +address output + +} + +if$ + +} + +{ format.authors output.nonnull } + +if$ + +new.block + +format.btitle "title" output.check + +author empty$ + +{ organization empty$ + +{ address new.block.checka + +address output + +} + +'skip$ + +if$ + +} + +{ organization address new.block.checkb + +organization output + +address output + +} + +if$ + +format.edition output + +format.date output + +format.eprint output + +new.block + +note output + +fin.entry + +} + +FUNCTION {mastersthesis} + +{ output.bibitem + +format.authors 
"author" output.check + +new.block + +format.title "title" output.check + +new.block + +"Master's thesis" format.thesis.type output.nonnull + +school "school" output.check + +address output + +format.date "year" output.check + +format.eprint output + +new.block + +note output + +fin.entry + +} + +FUNCTION {misc} + +{ output.bibitem + +format.collaboration output + +format.authors output + +title howpublished new.block.checkb + +format.title output + +howpublished new.block.checka + +howpublished output + +format.date output + +format.eprint output + +new.block + +note output + +fin.entry + +empty.misc.check + +} + +FUNCTION {phdthesis} + +{ output.bibitem + +format.authors "author" output.check + +new.block + +format.btitle "title" output.check + +new.block + +"PhD thesis" format.thesis.type output.nonnull + +school "school" output.check + +address output + +format.date "year" output.check + +format.eprint output + +new.block +format.url output +new.block +note output + +%format.SLACcitation output + +fin.entry + +} + +FUNCTION {proceedings} + +{ output.bibitem + +format.collaboration output + +editor empty$ + +{ organization output } + +{ format.editors output.nonnull } + +if$ + +new.block + +format.btitle "title" output.check + +format.bvolume output + +format.number.series output + +address empty$ + +{ editor empty$ + +{ publisher new.sentence.checka } + +{ organization publisher new.sentence.checkb + +organization output + +} + +if$ + +publisher output + +format.date "year" output.check + +} + +{ address output.nonnull + +format.date "year" output.check + +new.sentence + +editor empty$ + +'skip$ + +{ organization output } + +if$ + +publisher output + +} + +if$ + +format.eprint output + +new.block + +note output + +%format.SLACcitation output + +fin.entry + +} + +FUNCTION {techreport} + +{ output.bibitem + +format.collaboration output + +format.authors "author" output.check + +new.block + +format.inst.tr.num.date output.nonnull + +format.eprint output + 
+new.block + +note output + +fin.entry + +} + +FUNCTION {unpublished} + +{ output.bibitem + +format.collaboration output + +format.authors "author" output.check + +new.block + +format.title "title" output.check + +format.eprint output +new.block + +note "note" output.check + +format.date output + +%format.SLACcitation output + +fin.entry + +} + +FUNCTION {default.type} { misc } + +MACRO {jan} {"Jan."} + +MACRO {feb} {"Feb."} + +MACRO {mar} {"Mar."} + +MACRO {apr} {"Apr."} + +MACRO {may} {"May"} + +MACRO {jun} {"June"} + +MACRO {jul} {"July"} + +MACRO {aug} {"Aug."} + +MACRO {sep} {"Sept."} + +MACRO {oct} {"Oct."} + +MACRO {nov} {"Nov."} + +MACRO {dec} {"Dec."} + +MACRO {acmcs} {"ACM Comput. Surv."} + +MACRO {acta} {"Acta Inf."} + +MACRO {cacm} {"Commun. ACM"} + +MACRO {ibmjrd} {"IBM J. Res. Dev."} + +MACRO {ibmsj} {"IBM Syst.~J."} + +MACRO {ieeese} {"IEEE Trans. Softw. Eng."} + +MACRO {ieeetc} {"IEEE Trans. Comput."} + +MACRO {ieeetcad} + +{"IEEE Trans. Comput.-Aided Design Integrated Circuits"} + +MACRO {ipl} {"Inf. Process. Lett."} + +MACRO {jacm} {"J.~ACM"} + +MACRO {jcss} {"J.~Comput. Syst. Sci."} + +MACRO {scp} {"Sci. Comput. Programming"} + +MACRO {sicomp} {"SIAM J. Comput."} + +MACRO {tocs} {"ACM Trans. Comput. Syst."} + +MACRO {tods} {"ACM Trans. Database Syst."} + +MACRO {tog} {"ACM Trans. Gr."} + +MACRO {toms} {"ACM Trans. Math. Softw."} + +MACRO {toois} {"ACM Trans. Office Inf. Syst."} + +MACRO {toplas} {"ACM Trans. Prog. Lang. Syst."} + +MACRO {tcs} {"Theoretical Comput. Sci."} + +MACRO {advp} {"Adv. Phys."} + +MACRO {ajp} {"Am. J. Phys."} + +MACRO {ao} {"Appl. Opt."} + +MACRO {apj} {"Astrophys. J."} + +MACRO {apl} {"Appl. Phys. Lett."} + +MACRO {arnps} {"Ann. Rev. Nucl. Sci."} + +MACRO {arns} {"Ann. Rev. Nucl. Part. Sci."} + +MACRO {baps} {"Bull. Am. Phys. Soc."} + +MACRO {cpc} {"Computer Phys. Comm."} + +MACRO {cppcf} {"Comments Plasma Phys. Controlled Fusion"} + +MACRO {fed} {"Fusion Eng. 
Design"} + +MACRO {ft} {"Fusion Tech."} + +MACRO {ieeens} {"IEEE Trans. Nucl. Sci."} + +MACRO {ieeeps} {"IEEE Trans. Plasma Sci."} + +MACRO {ijimw} {"Int. J. Infrared Millimeter Waves"} + +MACRO {ip} {"Infrared Phys."} + +MACRO {jap} {"J. Appl. Phys."} + +MACRO {jcp} {"J. Comput. Phys."} + +MACRO {jetp} {"Sov. Phys.-JETP"} + +MACRO {jfe} {"J. Fusion Energy"} + +MACRO {jfm} {"J. Fluid Mech."} + +MACRO {jgr} {"J. Geophys. Res."} + +MACRO {jmp} {"J. Math. Phys."} + +MACRO {jne} {"J. Nucl. Energy"} + +MACRO {jnm} {"J. Nucl. Mater."} + +MACRO {josa} {"J. Opt. Soc. Am."} + +MACRO {jpg} {"J. Phys.~G: Nucl. and Part. Phys."} + +MACRO {jphys} {"J. Phys"} + +MACRO {jpp} {"J. Plasma Phys."} + +MACRO {jpsj} {"J. Phys. Soc. Jpn"} + +MACRO {jvst} {"J. Vac. Sci. Technol."} + +MACRO {modphyslettA} {"Mod. Phys. Lett. A"} + +MACRO {nedf} {"Nucl. Eng. Design/Fusion"} + +MACRO {nf} {"Nucl. Fusion"} + +MACRO {nim} {"Nucl. Instrum. Methods"} + +MACRO {np} {"Nucl. Phys."} + +MACRO {npb} {"Nucl. Phys.~B"} + +MACRO {npbps} {"Nucl. Phys. B (Proc. Suppl.)"} + +MACRO {nt/f} {"Nucl. Tech./Fusion"} + +MACRO {pf} {"Phys. Fluids"} + +MACRO {pl} {"Phys. Lett."} + +MACRO {plb} {"Phys. Lett.~B"} + +MACRO {pnas} {"Proc. Nat. Acad. Sci. USA"} + +MACRO {pp} {"Plasma Phys."} + +MACRO {physrep} {"Phys. Rep."} + +MACRO {physrev} {"Phys. Rev."} + +MACRO {pr} {"Phys. Rev."} + +MACRO {prd} {"Phys. Rev.~D"} + +MACRO {prl} {"Phys. Rev. Lett."} + +MACRO {procroysoc} {"Proc. Roy. Soc"} + +MACRO {ps} {"Physica Scripta"} + +MACRO {rmp} {"Rev. Mod. Phys."} + +MACRO {rsi} {"Rev. Sci. Instrum."} + +MACRO {sjnp} {"Sov. J. Nucl. Phys."} + +MACRO {sjpp} {"Sov. J. Plasma Phys."} + +MACRO {spd} {"Sov. Phys.-Dokl."} + +MACRO {sptp} {"Sov. Phys.-Tech. Phys."} + +MACRO {spu} {"Sov. Phys.-Usp."} + +MACRO {zp} {"Z. Phys."} + +MACRO {zpc} {"Z. 
Phys.~C"} + +READ + +STRINGS { longest.label } + +INTEGERS { number.label longest.label.width } + +FUNCTION {initialize.longest.label} + +{ "" 'longest.label := + +#1 'number.label := + +#0 'longest.label.width := + +} + +FUNCTION {longest.label.pass} + +{ number.label int.to.str$ 'label := + +number.label #1 + 'number.label := + +label width$ longest.label.width > + +{ label 'longest.label := + +label width$ 'longest.label.width := + +} + +'skip$ + +if$ + +} + +EXECUTE {initialize.longest.label} + +ITERATE {longest.label.pass} + +FUNCTION {begin.bib} + +{ preamble$ empty$ + +'skip$ + +{ preamble$ write$ newline$ } + +if$ + +"\begin{thebibliography}{" longest.label * "}" * write$ newline$ + +} + +EXECUTE {begin.bib} + +EXECUTE {init.state.consts} + +ITERATE {call.type$} + +FUNCTION {end.bib} + +{ newline$ + +"\end{thebibliography}" write$ newline$ + +} + +EXECUTE {end.bib} + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/hmcflow.eps b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/hmcflow.eps new file mode 100644 index 0000000000000000000000000000000000000000..7bab4dc6b148f9631ab294b320af389e644a7fc5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/hmcflow.eps @@ -0,0 +1,3313 @@ +%!PS-Adobe-2.0 EPSF-2.0 +%%Title: /home/urbach/daten/workdir/etmc/cpc40/hmcflow.dia +%%Creator: Dia v0.96.1 +%%CreationDate: Fri Jan 30 14:39:12 2009 +%%For: urbach +%%Orientation: Portrait +%%Magnification: 1.0000 +%%BoundingBox: 0 0 657 576 +%%BeginSetup +%%EndSetup +%%EndComments +%%BeginProlog +[ /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/.notdef /.notdef /space /exclam /quotedbl /numbersign /dollar /percent /ampersand /quoteright +/parenleft /parenright /asterisk /plus /comma /hyphen /period /slash /zero /one +/two 
/three /four /five /six /seven /eight /nine /colon /semicolon +/less /equal /greater /question /at /A /B /C /D /E +/F /G /H /I /J /K /L /M /N /O +/P /Q /R /S /T /U /V /W /X /Y +/Z /bracketleft /backslash /bracketright /asciicircum /underscore /quoteleft /a /b /c +/d /e /f /g /h /i /j /k /l /m +/n /o /p /q /r /s /t /u /v /w +/x /y /z /braceleft /bar /braceright /asciitilde /.notdef /.notdef /.notdef +/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/space /exclamdown /cent /sterling /currency /yen /brokenbar /section /dieresis /copyright +/ordfeminine /guillemotleft /logicalnot /hyphen /registered /macron /degree /plusminus /twosuperior /threesuperior +/acute /mu /paragraph /periodcentered /cedilla /onesuperior /ordmasculine /guillemotright /onequarter /onehalf +/threequarters /questiondown /Agrave /Aacute /Acircumflex /Atilde /Adieresis /Aring /AE /Ccedilla +/Egrave /Eacute /Ecircumflex /Edieresis /Igrave /Iacute /Icircumflex /Idieresis /Eth /Ntilde +/Ograve /Oacute /Ocircumflex /Otilde /Odieresis /multiply /Oslash /Ugrave /Uacute /Ucircumflex +/Udieresis /Yacute /Thorn /germandbls /agrave /aacute /acircumflex /atilde /adieresis /aring +/ae /ccedilla /egrave /eacute /ecircumflex /edieresis /igrave /iacute /icircumflex /idieresis +/eth /ntilde /ograve /oacute /ocircumflex /otilde /odieresis /divide /oslash /ugrave +/uacute /ucircumflex /udieresis /yacute /thorn /ydieresis] /isolatin1encoding exch def +/cp {closepath} bind def +/c {curveto} bind def +/f {fill} bind def +/a {arc} bind def +/ef {eofill} bind def +/ex {exch} bind def +/gr {grestore} bind def +/gs {gsave} bind def +/sa {save} bind def +/rs {restore} bind def +/l {lineto} bind def +/m {moveto} bind def +/rm {rmoveto} bind def +/n {newpath} bind def +/s {stroke} bind def +/sh {show} 
bind def +/slc {setlinecap} bind def +/slj {setlinejoin} bind def +/slw {setlinewidth} bind def +/srgb {setrgbcolor} bind def +/rot {rotate} bind def +/sc {scale} bind def +/sd {setdash} bind def +/ff {findfont} bind def +/sf {setfont} bind def +/scf {scalefont} bind def +/sw {stringwidth pop} bind def +/tr {translate} bind def + +/ellipsedict 8 dict def +ellipsedict /mtrx matrix put +/ellipse +{ ellipsedict begin + /endangle exch def + /startangle exch def + /yrad exch def + /xrad exch def + /y exch def + /x exch def /savematrix mtrx currentmatrix def + x y tr xrad yrad sc + 0 0 1 startangle endangle arc + savematrix setmatrix + end +} def + +/mergeprocs { +dup length +3 -1 roll +dup +length +dup +5 1 roll +3 -1 roll +add +array cvx +dup +3 -1 roll +0 exch +putinterval +dup +4 2 roll +putinterval +} bind def +/dpi_x 300 def +/dpi_y 300 def +/conicto { + /to_y exch def + /to_x exch def + /conic_cntrl_y exch def + /conic_cntrl_x exch def + currentpoint + /p0_y exch def + /p0_x exch def + /p1_x p0_x conic_cntrl_x p0_x sub 2 3 div mul add def + /p1_y p0_y conic_cntrl_y p0_y sub 2 3 div mul add def + /p2_x p1_x to_x p0_x sub 1 3 div mul add def + /p2_y p1_y to_y p0_y sub 1 3 div mul add def + p1_x p1_y p2_x p2_y to_x to_y curveto +} bind def +/start_ol { gsave 1.1 dpi_x div dup scale} bind def +/end_ol { closepath fill grestore } bind def +28.346000 -28.346000 scale +-1.950000 -22.270000 translate +%%EndProlog + + +1.000000 1.000000 1.000000 srgb +n 2.050000 2.000000 m 2.050000 4.100000 l 25.050000 4.100000 l 25.050000 2.000000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 2.050000 2.000000 m 2.050000 4.100000 l 25.050000 4.100000 l 25.050000 2.000000 l cp s +gsave 9.827500 3.227500 translate 0.035278 -0.035278 scale +start_ol +2445 1914 moveto +2620 1854 2786 1657 conicto +2953 1461 3120 1117 conicto +3648 0 lineto +3087 0 lineto +2571 1049 lineto +2373 1460 2186 1594 conicto +2000 1728 1678 1728 conicto +1088 1728 lineto +1088 0 
lineto +512 0 lineto +512 4032 lineto +1769 4032 lineto +2458 4032 2797 3746 conicto +3136 3460 3136 2884 conicto +3136 2507 2959 2258 conicto +2782 2010 2445 1914 conicto +1088 3584 moveto +1088 2176 lineto +1769 2176 lineto +2160 2176 2360 2355 conicto +2560 2535 2560 2883 conicto +2560 3231 2360 3407 conicto +2160 3584 1769 3584 conicto +1088 3584 lineto +end_ol grestore +gsave 10.302055 3.227500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 10.749135 3.227500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 11.196214 3.227500 translate 0.035278 -0.035278 scale +start_ol +2496 2560 moveto +2496 4160 lineto +3008 4160 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2342 
188 2106 62 conicto +1870 -64 1540 -64 conicto +999 -64 659 368 conicto +320 800 320 1504 conicto +320 2208 659 2640 conicto +999 3072 1540 3072 conicto +1870 3072 2106 2946 conicto +2342 2820 2496 2560 conicto +832 1504 moveto +832 980 1053 682 conicto +1275 384 1663 384 conicto +2050 384 2273 682 conicto +2496 980 2496 1504 conicto +2496 2028 2273 2326 conicto +2050 2624 1663 2624 conicto +1275 2624 1053 2326 conicto +832 2028 832 1504 conicto +end_ol grestore +gsave 11.658276 3.227500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 11.890555 3.227500 translate 0.035278 -0.035278 scale +start_ol +512 4032 moveto +1088 4032 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 12.105348 3.227500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 12.567409 3.227500 translate 0.035278 -0.035278 scale +start_ol +1024 448 moveto +1024 -1152 lineto +512 -1152 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1178 2820 1414 2946 conicto +1650 3072 1977 3072 conicto +2521 3072 2860 2640 conicto +3200 2208 3200 1504 conicto +3200 800 2860 368 conicto +2521 -64 1977 -64 conicto +1650 -64 1414 62 conicto +1178 188 1024 448 conicto +2688 1504 moveto +2688 2028 2466 2326 conicto +2244 2624 1856 2624 conicto +1468 2624 1246 2326 conicto +1024 2028 1024 1504 conicto +1024 980 1246 682 conicto +1468 384 1856 384 conicto +2244 384 2466 682 conicto +2688 980 2688 1504 conicto +end_ol grestore +gsave 13.029471 3.227500 translate 0.035278 -0.035278 scale +start_ol +512 1177 moveto +512 3008 lineto +1024 3008 lineto +1024 1196 lineto +1024 
790 1184 587 conicto +1344 384 1664 384 conicto +2049 384 2272 628 conicto +2496 872 2496 1293 conicto +2496 3008 lineto +3008 3008 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2320 188 2087 62 conicto +1854 -64 1546 -64 conicto +1038 -64 775 252 conicto +512 568 512 1177 conicto +1744 3072 moveto +1744 3072 lineto +end_ol grestore +gsave 13.491533 3.227500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 13.776266 3.227500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 14.008545 3.227500 translate 0.035278 -0.035278 scale +start_ol +512 4032 moveto +2816 4032 lineto +2816 3584 lineto +1088 3584 lineto +1088 2368 lineto +2624 2368 lineto +2624 1920 lineto +1088 1920 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 14.375696 3.227500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 14.660429 3.227500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 
353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 15.105011 3.227500 translate 0.035278 -0.035278 scale +start_ol +2858 2449 moveto +3041 2768 3296 2920 conicto +3551 3072 3895 3072 conicto +4360 3072 4612 2748 conicto +4864 2425 4864 1829 conicto +4864 0 lineto +4352 0 lineto +4352 1813 lineto +4352 2225 4203 2424 conicto +4055 2624 3750 2624 conicto +3377 2624 3160 2379 conicto +2944 2135 2944 1713 conicto +2944 0 lineto +2432 0 lineto +2432 1813 lineto +2432 2227 2283 2425 conicto +2135 2624 1824 2624 conicto +1457 2624 1240 2378 conicto +1024 2132 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1190 2822 1423 2947 conicto +1656 3072 1975 3072 conicto +2298 3072 2524 2912 conicto +2750 2753 2858 2449 conicto +end_ol grestore +gsave 15.814343 3.227500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 16.046622 3.227500 translate 0.035278 -0.035278 scale +start_ol +512 4032 moveto +2816 4032 lineto +2816 3584 lineto +1088 3584 lineto +1088 2368 lineto +2624 2368 lineto +2624 1920 lineto +1088 1920 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 16.413773 3.227500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 16.616080 3.227500 translate 0.035278 -0.035278 scale +start_ol +512 4160 moveto +1024 4160 lineto +1024 0 lineto +512 0 lineto +512 4160 lineto +end_ol grestore +gsave 16.818387 3.227500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 
conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +1.000000 1.000000 1.000000 srgb +n 2.050000 4.950000 m 2.050000 7.050000 l 25.000000 7.050000 l 25.000000 4.950000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 2.050000 4.950000 m 2.050000 7.050000 l 25.000000 7.050000 l 25.000000 4.950000 l cp s +gsave 5.643750 6.177500 translate 0.035278 -0.035278 scale +start_ol +512 4032 moveto +1088 4032 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 5.858542 6.177500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 6.320604 6.177500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 6.522911 6.177500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 6.807644 6.177500 translate 
0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 7.009951 6.177500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 7.457031 6.177500 translate 0.035278 -0.035278 scale +start_ol +512 4160 moveto +1024 4160 lineto +1024 0 lineto +512 0 lineto +512 4160 lineto +end_ol grestore +gsave 7.659338 6.177500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 7.861645 6.177500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 
conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +gsave 8.241289 6.177500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 8.688368 6.177500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 8.920648 6.177500 translate 0.035278 -0.035278 scale +start_ol +1088 3584 moveto +1088 2048 lineto +1769 2048 lineto +2147 2048 2353 2248 conicto +2560 2448 2560 2817 conicto +2560 3184 2353 3384 conicto +2147 3584 1769 3584 conicto +1088 3584 lineto +512 4032 moveto +1769 4032 lineto +2444 4032 2790 3723 conicto +3136 3414 3136 2817 conicto +3136 2215 2790 1907 conicto +2444 1600 1769 1600 conicto +1088 1600 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 9.347743 6.177500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 9.632476 6.177500 translate 0.035278 -0.035278 scale 
+start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 10.077059 6.177500 translate 0.035278 -0.035278 scale +start_ol +2496 1535 moveto +2496 2053 2277 2338 conicto +2058 2624 1663 2624 conicto +1270 2624 1051 2338 conicto +832 2053 832 1535 conicto +832 1019 1051 733 conicto +1270 448 1663 448 conicto +2058 448 2277 733 conicto +2496 1019 2496 1535 conicto +3008 404 moveto +3008 -384 2670 -768 conicto +2332 -1152 1635 -1152 conicto +1377 -1152 1148 -1105 conicto +920 -1058 704 -960 conicto +704 -448 lineto +917 -579 1124 -641 conicto +1332 -704 1547 -704 conicto +2023 -704 2259 -452 conicto +2496 -201 2496 308 conicto +2496 512 lineto +2344 255 2107 127 conicto +1870 0 1540 0 conicto +991 0 655 420 conicto +320 841 320 1535 conicto +320 2231 655 2651 conicto +991 3072 1540 3072 conicto +1870 3072 2107 2944 conicto +2344 2817 2496 2560 conicto +2496 3008 lineto +3008 3008 lineto +3008 404 lineto +end_ol grestore +gsave 10.539120 6.177500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 10.838835 6.177500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 
384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 11.285915 6.177500 translate 0.035278 -0.035278 scale +start_ol +2858 2449 moveto +3041 2768 3296 2920 conicto +3551 3072 3895 3072 conicto +4360 3072 4612 2748 conicto +4864 2425 4864 1829 conicto +4864 0 lineto +4352 0 lineto +4352 1813 lineto +4352 2225 4203 2424 conicto +4055 2624 3750 2624 conicto +3377 2624 3160 2379 conicto +2944 2135 2944 1713 conicto +2944 0 lineto +2432 0 lineto +2432 1813 lineto +2432 2227 2283 2425 conicto +2135 2624 1824 2624 conicto +1457 2624 1240 2378 conicto +1024 2132 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1190 2822 1423 2947 conicto +1656 3072 1975 3072 conicto +2298 3072 2524 2912 conicto +2750 2753 2858 2449 conicto +end_ol grestore +gsave 11.995247 6.177500 translate 0.035278 -0.035278 scale +start_ol +2858 2449 moveto +3041 2768 3296 2920 conicto +3551 3072 3895 3072 conicto +4360 3072 4612 2748 conicto +4864 2425 4864 1829 conicto +4864 0 lineto +4352 0 lineto +4352 1813 lineto +4352 2225 4203 2424 conicto +4055 2624 3750 2624 conicto +3377 2624 3160 2379 conicto +2944 2135 2944 1713 conicto +2944 0 lineto +2432 0 lineto +2432 1813 lineto +2432 2227 2283 2425 conicto +2135 2624 1824 2624 conicto +1457 2624 1240 2378 conicto +1024 2132 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto 
+1190 2822 1423 2947 conicto +1656 3072 1975 3072 conicto +2298 3072 2524 2912 conicto +2750 2753 2858 2449 conicto +end_ol grestore +gsave 12.704578 6.177500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 13.151658 6.177500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 13.383937 6.177500 translate 0.035278 -0.035278 scale +start_ol +1399 4032 moveto +1856 4032 lineto +457 -512 lineto +0 -512 lineto +1399 4032 lineto +end_ol grestore +gsave 13.628701 6.177500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 13.860981 6.177500 translate 0.035278 -0.035278 scale +start_ol +2445 1914 moveto +2620 1854 2786 1657 conicto +2953 1461 3120 1117 conicto +3648 0 lineto +3087 0 lineto +2571 1049 lineto +2373 1460 2186 1594 conicto +2000 1728 1678 1728 conicto +1088 1728 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +1769 4032 lineto +2458 4032 2797 3746 conicto +3136 3460 3136 2884 conicto +3136 2507 2959 2258 conicto +2782 2010 2445 1914 conicto +1088 3584 moveto +1088 2176 lineto +1769 2176 lineto +2160 2176 2360 2355 conicto +2560 2535 2560 2883 conicto +2560 3231 2360 3407 conicto +2160 3584 1769 3584 conicto +1088 3584 lineto +end_ol grestore +gsave 14.335536 6.177500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 
conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 14.782615 6.177500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 15.229695 6.177500 translate 0.035278 -0.035278 scale +start_ol +2496 2560 moveto +2496 4160 lineto +3008 4160 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2342 188 2106 62 conicto +1870 -64 1540 -64 conicto +999 -64 659 368 conicto +320 800 320 1504 conicto +320 2208 659 2640 conicto +999 3072 1540 3072 conicto +1870 3072 2106 2946 conicto +2342 2820 2496 2560 conicto +832 1504 moveto +832 980 1053 682 conicto +1275 384 1663 384 conicto +2050 384 2273 682 conicto +2496 980 2496 1504 conicto +2496 2028 2273 2326 conicto +2050 2624 1663 2624 conicto +1275 2624 1053 2326 conicto +832 2028 832 1504 conicto +end_ol grestore +gsave 15.691757 6.177500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 15.924036 6.177500 translate 0.035278 
-0.035278 scale +start_ol +1088 3584 moveto +1088 448 lineto +1739 448 lineto +2563 448 2945 826 conicto +3328 1204 3328 2020 conicto +3328 2830 2945 3207 conicto +2563 3584 1739 3584 conicto +1088 3584 lineto +512 4032 moveto +1656 4032 lineto +2817 4032 3360 3544 conicto +3904 3057 3904 2020 conicto +3904 978 3358 489 conicto +2812 0 1656 0 conicto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 16.486011 6.177500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 16.933091 6.177500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 17.217824 6.177500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto 
+2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 17.664904 6.177500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 17.897183 6.177500 translate 0.035278 -0.035278 scale +start_ol +2048 4160 moveto +2048 3712 lineto +1571 3712 lineto +1299 3712 1193 3612 conicto +1088 3513 1088 3254 conicto +1088 3008 lineto +1920 3008 lineto +1920 2624 lineto +1088 2624 lineto +1088 0 lineto +576 0 lineto +576 2624 lineto +128 2624 lineto +128 3008 lineto +576 3008 lineto +576 3202 lineto +576 3702 816 3931 conicto +1056 4160 1577 4160 conicto +2048 4160 lineto +end_ol grestore +gsave 18.154441 6.177500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 18.439174 6.177500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 
3072 conicto +end_ol grestore +gsave 18.883756 6.177500 translate 0.035278 -0.035278 scale +start_ol +2858 2449 moveto +3041 2768 3296 2920 conicto +3551 3072 3895 3072 conicto +4360 3072 4612 2748 conicto +4864 2425 4864 1829 conicto +4864 0 lineto +4352 0 lineto +4352 1813 lineto +4352 2225 4203 2424 conicto +4055 2624 3750 2624 conicto +3377 2624 3160 2379 conicto +2944 2135 2944 1713 conicto +2944 0 lineto +2432 0 lineto +2432 1813 lineto +2432 2227 2283 2425 conicto +2135 2624 1824 2624 conicto +1457 2624 1240 2378 conicto +1024 2132 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1190 2822 1423 2947 conicto +1656 3072 1975 3072 conicto +2298 3072 2524 2912 conicto +2750 2753 2858 2449 conicto +end_ol grestore +gsave 19.593088 6.177500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 19.825367 6.177500 translate 0.035278 -0.035278 scale +start_ol +1088 3584 moveto +1088 448 lineto +1739 448 lineto +2563 448 2945 826 conicto +3328 1204 3328 2020 conicto +3328 2830 2945 3207 conicto +2563 3584 1739 3584 conicto +1088 3584 lineto +512 4032 moveto +1656 4032 lineto +2817 4032 3360 3544 conicto +3904 3057 3904 2020 conicto +3904 978 3358 489 conicto +2812 0 1656 0 conicto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 20.387342 6.177500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 20.589649 6.177500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto 
+611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +gsave 20.969293 6.177500 translate 0.035278 -0.035278 scale +start_ol +512 4160 moveto +1024 4160 lineto +1024 1711 lineto +2503 3008 lineto +3136 3008 lineto +1536 1601 lineto +3200 0 lineto +2554 0 lineto +1024 1469 lineto +1024 0 lineto +512 0 lineto +512 4160 lineto +end_ol grestore +1.000000 1.000000 1.000000 srgb +n 2.050000 8.000000 m 2.050000 10.100000 l 25.000000 10.100000 l 25.000000 8.000000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 2.050000 8.000000 m 2.050000 10.100000 l 25.000000 10.100000 l 25.000000 8.000000 l cp s +gsave 2.500000 9.227500 translate 0.035278 -0.035278 scale +start_ol +1088 3584 moveto +1088 448 lineto +1739 448 lineto +2563 448 2945 826 conicto +3328 1204 3328 2020 conicto +3328 2830 2945 3207 conicto +2563 3584 1739 3584 conicto +1088 3584 lineto +512 4032 moveto +1656 4032 lineto +2817 4032 3360 3544 conicto +3904 3057 3904 2020 conicto +3904 978 3358 489 conicto +2812 0 1656 0 conicto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 3.061975 9.227500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 
3.506558 9.227500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 3.738837 9.227500 translate 0.035278 -0.035278 scale +start_ol +512 4032 moveto +1279 4032 lineto +2944 659 lineto +2944 4032 lineto +3520 4032 lineto +3520 0 lineto +2753 0 lineto +1088 3373 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 4.283325 9.227500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 4.715414 9.227500 translate 0.035278 -0.035278 scale +start_ol +640 704 moveto +1216 704 lineto +1216 0 lineto +640 0 lineto +640 704 lineto +end_ol grestore +gsave 4.947693 9.227500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 5.179973 9.227500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 5.624556 9.227500 translate 0.035278 -0.035278 scale +start_ol +2048 4160 moveto +2048 3712 lineto +1571 3712 lineto +1299 3712 1193 3612 conicto +1088 3513 1088 3254 conicto +1088 3008 lineto +1920 3008 lineto +1920 2624 lineto +1088 2624 lineto +1088 0 lineto 
+576 0 lineto +576 2624 lineto +128 2624 lineto +128 3008 lineto +576 3008 lineto +576 3202 lineto +576 3702 816 3931 conicto +1056 4160 1577 4160 conicto +2048 4160 lineto +end_ol grestore +gsave 5.881813 9.227500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 6.114093 9.227500 translate 0.035278 -0.035278 scale +start_ol +0 4032 moveto +3392 4032 lineto +3392 3584 lineto +1984 3584 lineto +1984 0 lineto +1408 0 lineto +1408 3584 lineto +0 3584 lineto +0 4032 lineto +end_ol grestore +gsave 6.451271 9.227500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 6.750986 9.227500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 7.198066 9.227500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 -36 lineto +1024 -624 797 -888 conicto +571 -1152 67 -1152 conicto +-128 -1152 lineto +-128 -704 lineto +32 -704 lineto +310 -704 
411 -574 conicto +512 -445 512 -36 conicto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 7.400373 9.227500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 7.847452 9.227500 translate 0.035278 -0.035278 scale +start_ol +2688 2816 moveto +2688 2368 lineto +2479 2496 2268 2560 conicto +2058 2624 1843 2624 conicto +1363 2624 1097 2329 conicto +832 2035 832 1504 conicto +832 973 1097 678 conicto +1363 384 1843 384 conicto +2058 384 2268 448 conicto +2479 512 2688 640 conicto +2688 192 lineto +2482 64 2261 0 conicto +2041 -64 1793 -64 conicto +1116 -64 718 360 conicto +320 784 320 1504 conicto +320 2235 722 2653 conicto +1124 3072 1825 3072 conicto +2052 3072 2268 3008 conicto +2485 2944 2688 2816 conicto +end_ol grestore +gsave 8.247072 9.227500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 8.531805 9.227500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 
384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 8.976388 9.227500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 9.276103 9.227500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 9.478410 9.227500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 9.925490 9.227500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 
lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +gsave 10.305134 9.227500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 10.537413 9.227500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 10.822146 9.227500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 11.024453 9.227500 translate 0.035278 -0.035278 scale +start_ol +2858 2449 moveto +3041 2768 3296 2920 conicto +3551 3072 3895 3072 conicto +4360 3072 4612 2748 conicto +4864 2425 4864 1829 conicto +4864 0 lineto +4352 0 lineto +4352 1813 lineto +4352 2225 4203 2424 conicto +4055 2624 3750 2624 conicto +3377 2624 3160 2379 conicto +2944 2135 2944 1713 conicto +2944 0 lineto +2432 0 lineto +2432 1813 lineto +2432 2227 2283 2425 conicto +2135 2624 1824 2624 conicto +1457 2624 1240 2378 conicto +1024 2132 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1190 2822 1423 2947 conicto +1656 3072 
1975 3072 conicto +2298 3072 2524 2912 conicto +2750 2753 2858 2449 conicto +end_ol grestore +gsave 11.733785 9.227500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 12.180864 9.227500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +1.000000 1.000000 1.000000 srgb +n 3.050000 10.800000 m 3.050000 17.900000 l 25.050000 17.900000 l 25.050000 10.800000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 3.050000 10.800000 m 3.050000 17.900000 l 25.050000 17.900000 l 25.050000 10.800000 l cp s +gsave 3.500000 12.456500 translate 0.035278 -0.035278 scale +start_ol +512 4032 moveto +1357 4032 lineto +2335 1282 lineto +3317 4032 lineto 
+4160 4032 lineto +4160 0 lineto +3584 0 lineto +3584 3539 lineto +2596 768 lineto +2076 768 lineto +1088 3539 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 4.129411 12.456500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 4.573993 12.456500 translate 0.035278 -0.035278 scale +start_ol +512 4160 moveto +1024 4160 lineto +1024 0 lineto +512 0 lineto +512 4160 lineto +end_ol grestore +gsave 4.776300 12.456500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 5.223380 12.456500 translate 0.035278 -0.035278 scale +start_ol +2688 2816 moveto +2688 2368 lineto +2479 2496 2268 2560 conicto +2058 2624 1843 2624 conicto +1363 2624 1097 2329 conicto +832 2035 832 1504 conicto +832 973 1097 678 conicto +1363 384 1843 384 conicto +2058 384 2268 448 conicto +2479 512 2688 640 conicto +2688 192 lineto +2482 64 2261 0 conicto +2041 -64 1793 -64 conicto +1116 -64 718 360 conicto +320 784 320 1504 conicto 
+320 2235 722 2653 conicto +1124 3072 1825 3072 conicto +2052 3072 2268 3008 conicto +2485 2944 2688 2816 conicto +end_ol grestore +gsave 5.623000 12.456500 translate 0.035278 -0.035278 scale +start_ol +512 1177 moveto +512 3008 lineto +1024 3008 lineto +1024 1196 lineto +1024 790 1184 587 conicto +1344 384 1664 384 conicto +2049 384 2272 628 conicto +2496 872 2496 1293 conicto +2496 3008 lineto +3008 3008 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2320 188 2087 62 conicto +1854 -64 1546 -64 conicto +1038 -64 775 252 conicto +512 568 512 1177 conicto +1744 3072 moveto +1744 3072 lineto +end_ol grestore +gsave 6.085062 12.456500 translate 0.035278 -0.035278 scale +start_ol +512 4160 moveto +1024 4160 lineto +1024 0 lineto +512 0 lineto +512 4160 lineto +end_ol grestore +gsave 6.287369 12.456500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 6.734449 12.456500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 
conicto +2304 2560 lineto +end_ol grestore +gsave 7.034164 12.456500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 7.266443 12.456500 translate 0.035278 -0.035278 scale +start_ol +1088 3584 moveto +1088 448 lineto +1739 448 lineto +2563 448 2945 826 conicto +3328 1204 3328 2020 conicto +3328 2830 2945 3207 conicto +2563 3584 1739 3584 conicto +1088 3584 lineto +512 4032 moveto +1656 4032 lineto +2817 4032 3360 3544 conicto +3904 3057 3904 2020 conicto +3904 978 3358 489 conicto +2812 0 1656 0 conicto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 7.828418 12.456500 translate 0.035278 -0.035278 scale +start_ol +1799 -256 moveto +1587 -813 1386 -982 conicto +1185 -1152 848 -1152 conicto +448 -1152 lineto +448 -704 lineto +742 -704 lineto +948 -704 1062 -611 conicto +1177 -518 1315 -171 conicto +1405 54 lineto +192 3008 lineto +703 3008 lineto +1655 655 lineto +2606 3008 lineto +3136 3008 lineto +1799 -256 lineto +end_ol grestore +gsave 8.260507 12.456500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 8.722569 12.456500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 
2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 9.169649 12.456500 translate 0.035278 -0.035278 scale +start_ol +2858 2449 moveto +3041 2768 3296 2920 conicto +3551 3072 3895 3072 conicto +4360 3072 4612 2748 conicto +4864 2425 4864 1829 conicto +4864 0 lineto +4352 0 lineto +4352 1813 lineto +4352 2225 4203 2424 conicto +4055 2624 3750 2624 conicto +3377 2624 3160 2379 conicto +2944 2135 2944 1713 conicto +2944 0 lineto +2432 0 lineto +2432 1813 lineto +2432 2227 2283 2425 conicto +2135 2624 1824 2624 conicto +1457 2624 1240 2378 conicto +1024 2132 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1190 2822 1423 2947 conicto +1656 3072 1975 3072 conicto +2298 3072 2524 2912 conicto +2750 2753 2858 2449 conicto +end_ol grestore +gsave 9.878980 12.456500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 10.081287 12.456500 translate 0.035278 -0.035278 scale +start_ol +2688 2816 moveto +2688 2368 lineto +2479 2496 2268 2560 conicto +2058 2624 1843 2624 conicto +1363 2624 1097 2329 conicto +832 2035 832 1504 conicto +832 973 1097 678 conicto +1363 384 1843 384 conicto +2058 384 2268 448 conicto +2479 512 2688 640 conicto +2688 192 lineto +2482 64 2261 0 conicto +2041 -64 1793 -64 conicto +1116 -64 718 360 conicto +320 784 320 1504 conicto +320 2235 722 2653 conicto +1124 3072 1825 3072 conicto +2052 3072 2268 3008 conicto +2485 2944 2688 2816 conicto +end_ol grestore +gsave 10.480907 12.456500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 
conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +gsave 10.860551 12.456500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 11.092831 12.456500 translate 0.035278 -0.035278 scale +start_ol +448 4032 moveto +1024 4032 lineto +1024 1596 lineto +1024 951 1250 667 conicto +1477 384 1985 384 conicto +2491 384 2717 667 conicto +2944 951 2944 1596 conicto +2944 4032 lineto +3520 4032 lineto +3520 1528 lineto +3520 740 3131 338 conicto +2743 -64 1985 -64 conicto +1225 -64 836 338 conicto +448 740 448 1528 conicto +448 4032 lineto +end_ol grestore +gsave 11.624833 12.456500 translate 0.035278 -0.035278 scale +start_ol +1024 448 moveto +1024 -1152 lineto +512 -1152 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1178 2820 1414 2946 conicto +1650 3072 1977 3072 conicto +2521 3072 2860 2640 conicto +3200 2208 3200 1504 conicto +3200 800 2860 368 conicto +2521 -64 1977 -64 conicto +1650 -64 1414 62 conicto +1178 188 1024 448 conicto +2688 1504 moveto +2688 2028 2466 2326 conicto +2244 2624 1856 2624 conicto +1468 2624 1246 2326 conicto +1024 2028 1024 1504 conicto +1024 980 1246 682 conicto +1468 384 1856 384 conicto +2244 384 2466 682 conicto +2688 980 2688 1504 conicto +end_ol grestore +gsave 12.086895 12.456500 translate 0.035278 -0.035278 scale +start_ol +2496 2560 moveto +2496 4160 lineto +3008 4160 lineto +3008 0 
lineto +2496 0 lineto +2496 448 lineto +2342 188 2106 62 conicto +1870 -64 1540 -64 conicto +999 -64 659 368 conicto +320 800 320 1504 conicto +320 2208 659 2640 conicto +999 3072 1540 3072 conicto +1870 3072 2106 2946 conicto +2342 2820 2496 2560 conicto +832 1504 moveto +832 980 1053 682 conicto +1275 384 1663 384 conicto +2050 384 2273 682 conicto +2496 980 2496 1504 conicto +2496 2028 2273 2326 conicto +2050 2624 1663 2624 conicto +1275 2624 1053 2326 conicto +832 2028 832 1504 conicto +end_ol grestore +gsave 12.548957 12.456500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 12.996036 12.456500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 13.280769 12.456500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 
conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 3.500000 14.456500 translate 0.035278 -0.035278 scale +start_ol +2179 3648 moveto +1590 3648 1243 3209 conicto +896 2771 896 2015 conicto +896 1261 1243 822 conicto +1590 384 2179 384 conicto +2768 384 3112 822 conicto +3456 1261 3456 2015 conicto +3456 2771 3112 3209 conicto +2768 3648 2179 3648 conicto +2179 4096 moveto +3022 4096 3527 3530 conicto +4032 2965 4032 2015 conicto +4032 1067 3527 501 conicto +3022 -64 2179 -64 conicto +1333 -64 826 500 conicto +320 1064 320 2015 conicto +320 2965 826 3530 conicto +1333 4096 2179 4096 conicto +end_ol grestore +gsave 4.074460 14.456500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 4.536522 14.456500 translate 0.035278 -0.035278 scale +start_ol +512 4160 moveto +1024 4160 lineto +1024 0 lineto +512 0 lineto +512 4160 lineto +end_ol grestore +gsave 4.738829 14.456500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 4.941136 14.456500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 
1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 5.403198 14.456500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 5.850277 14.456500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 6.082557 14.456500 translate 0.035278 -0.035278 scale +start_ol +512 4032 moveto +1357 4032 lineto +2335 1282 lineto +3317 4032 lineto +4160 4032 lineto +4160 0 lineto +3584 0 lineto +3584 3539 lineto +2596 768 lineto +2076 768 lineto +1088 3539 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 6.711967 14.456500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 
lineto +end_ol grestore +gsave 7.159047 14.456500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 7.606127 14.456500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +gsave 7.985771 14.456500 translate 0.035278 -0.035278 scale +start_ol +512 1177 moveto +512 3008 lineto +1024 3008 lineto +1024 1196 lineto +1024 790 1184 587 conicto +1344 384 1664 384 conicto +2049 384 2272 628 conicto +2496 872 2496 1293 conicto +2496 3008 lineto +3008 3008 lineto +3008 0 lineto +2496 0 lineto +2496 448 
lineto +2320 188 2087 62 conicto +1854 -64 1546 -64 conicto +1038 -64 775 252 conicto +512 568 512 1177 conicto +1744 3072 moveto +1744 3072 lineto +end_ol grestore +gsave 8.447832 14.456500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 8.732565 14.456500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 9.179645 14.456500 translate 0.035278 -0.035278 scale +start_ol +2858 2449 moveto +3041 2768 3296 2920 conicto +3551 3072 3895 3072 conicto +4360 3072 4612 2748 conicto +4864 2425 4864 1829 conicto +4864 0 lineto +4352 0 lineto +4352 1813 lineto +4352 2225 4203 2424 conicto +4055 2624 3750 2624 conicto +3377 2624 3160 2379 conicto +2944 2135 2944 1713 conicto +2944 0 lineto +2432 0 lineto +2432 1813 lineto +2432 2227 2283 2425 conicto +2135 2624 1824 2624 conicto +1457 2624 1240 2378 conicto +1024 2132 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1190 2822 1423 2947 conicto +1656 3072 1975 3072 conicto +2298 3072 2524 2912 conicto +2750 2753 2858 2449 conicto +end_ol grestore +gsave 9.888976 
14.456500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 10.336056 14.456500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 10.798118 14.456500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 11.082851 14.456500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 
640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +gsave 3.500000 16.456500 translate 0.035278 -0.035278 scale +start_ol +192 4032 moveto +738 4032 lineto +1578 624 lineto +2415 4032 lineto +3022 4032 lineto +3862 624 lineto +4700 4032 lineto +5248 4032 lineto +4245 0 lineto +3565 0 lineto +2723 3500 lineto +1872 0 lineto +1192 0 lineto +192 4032 lineto +end_ol grestore +gsave 4.186858 16.456500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 4.486573 16.456500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 4.688880 16.456500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 4.973613 16.456500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 
384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 5.420693 16.456500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 5.652972 16.456500 translate 0.035278 -0.035278 scale +start_ol +1088 3584 moveto +1088 448 lineto +1739 448 lineto +2563 448 2945 826 conicto +3328 1204 3328 2020 conicto +3328 2830 2945 3207 conicto +2563 3584 1739 3584 conicto +1088 3584 lineto +512 4032 moveto +1656 4032 lineto +2817 4032 3360 3544 conicto +3904 3057 3904 2020 conicto +3904 978 3358 489 conicto +2812 0 1656 0 conicto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 6.214947 16.456500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 6.662027 16.456500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto 
+1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 6.946760 16.456500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 7.393840 16.456500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 7.626119 16.456500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 7.910852 16.456500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 
conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 8.355435 16.456500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 8.587714 16.456500 translate 0.035278 -0.035278 scale +start_ol +1088 3584 moveto +1088 448 lineto +1739 448 lineto +2563 448 2945 826 conicto +3328 1204 3328 2020 conicto +3328 2830 2945 3207 conicto +2563 3584 1739 3584 conicto +1088 3584 lineto +512 4032 moveto +1656 4032 lineto +2817 4032 3360 3544 conicto +3904 3057 3904 2020 conicto +3904 978 3358 489 conicto +2812 0 1656 0 conicto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 9.149689 16.456500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 9.351996 16.456500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +gsave 9.731640 16.456500 translate 0.035278 -0.035278 scale +start_ol +512 4160 moveto +1024 4160 lineto +1024 1711 lineto +2503 3008 lineto +3136 3008 lineto +1536 1601 lineto +3200 
0 lineto +2554 0 lineto +1024 1469 lineto +1024 0 lineto +512 0 lineto +512 4160 lineto +end_ol grestore +1.000000 1.000000 1.000000 srgb +n 2.050000 20.120000 m 2.050000 22.220000 l 25.050000 22.220000 l 25.050000 20.120000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 2.050000 20.120000 m 2.050000 22.220000 l 25.050000 22.220000 l 25.050000 20.120000 l cp s +gsave 12.193750 21.347500 translate 0.035278 -0.035278 scale +start_ol +512 4032 moveto +2816 4032 lineto +2816 3584 lineto +1088 3584 lineto +1088 2368 lineto +2624 2368 lineto +2624 1920 lineto +1088 1920 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 12.560901 21.347500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 12.763208 21.347500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 13.225269 21.347500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 
conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 13.672349 21.347500 translate 0.035278 -0.035278 scale +start_ol +512 4160 moveto +1024 4160 lineto +1024 0 lineto +512 0 lineto +512 4160 lineto +end_ol grestore +gsave 13.874656 21.347500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 14.076963 21.347500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +gsave 14.456607 21.347500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 
2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0 slc +n 14.050000 17.950300 m 14.050000 19.150000 l 2.450000 19.150000 l 2.450000 10.850000 l s +0 slj +n 2.700000 10.850000 m 2.450000 10.350000 l 2.200000 10.850000 l ef +showpage diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/input.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/input.tex new file mode 100644 index 0000000000000000000000000000000000000000..e46ac605f00d634dad47550149ea1f2de295979f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/input.tex @@ -0,0 +1,930 @@ +\subsection{Input parameter for main program} + +The main programs are called {\ttfamily hmc\_tm} for the HMC algorithm +and {\ttfamily invert} for even/odd preconditioned inversion. They can +be called with +\begin{itemize} +\item {\ttfamily -f filename}:\\ + where {\ttfamily filename} is the name of the input file to be + used. The default name is {\ttfamily hmc.input} for + {\ttfamily hmc\_tm} and {\ttfamily invert.input} for + {\ttfamily invert}. + +\item {\ttfamily -o name}:\\ + {\ttfamily name} will be used as the name for several output files. These + files differ by their suffix. Default for {\ttfamily name} is + {\ttfamily output}. + +\item {\ttfamily -v }:\\ + makes the code a bit more verbose. Unrelated to input parameter + {\ttfamily DebugLevel}. + +\item {\ttfamily -?|-h}:\\ + This will print the help output and then exit. + +\end{itemize} + +There are several input parameters read from an input file. The parser +is contained in the file {\ttfamily gwc/src/bin/read\_input.l}. 
The +file {\ttfamily read\_input.l} is converted to {\ttfamily + read\_input.c} using {\ttfamily flex} and defines the following +function: + +Definition:\\ +\begin{ttfamily} + int read\_input(char * conf\_file) +\end{ttfamily}\\ + +\begin{tabular}[h]{l l l} +{\ttfamily conf\_file} & in & string with input file name\\ +\end{tabular} + +The function returns $0$ if no error occurs and $2$ if the input file +could not be opened. If no input file could be opened or if there is +no value given in the input file for a parameter, default values are +used. All default values can be found in the file {\ttfamily + gwc/src/bin/default\_input\_values.h}. The syntax is mostly +{\ttfamily keyword = value} and {\ttfamily keyword} must be at the +beginning of the line. Comments starting with {\ttfamily \#} and empty +lines are allowed. The order of the lines is not important as long as +every keyword appears only once. If it appears more than once, the +last occurrence takes effect. The parser is case-insensitive. + +The following is a list of the currently +supported general input parameters: +\begin{enumerate} +\item {\ttfamily T}:\\ + The global time extension of the lattice. Default is $4$. + +\item {\ttfamily L}:\\ + The global spatial extension of the lattice. Default is $4$. + +\item {\ttfamily LX}:\\ + The global spatial x-extension of the lattice. Default is $4$. + +\item {\ttfamily LY}:\\ + The global spatial y-extension of the lattice. Default is $4$. + +\item {\ttfamily LZ}:\\ + The global spatial z-extension of the lattice. Default is $4$. + +\item {\ttfamily NrXProcs}:\\ + The number of processors in x-direction in case of two dimensional + parallelisation. This has no effect in case of one dimensional + parallelisation. In case of two dimensional parallelisation it must + be properly set. The number of processors in time direction is + automatically computed. + +\item {\ttfamily NrYProcs, NrZProcs}:\\ + See {\ttfamily NrXProcs}. 
+ +\item {\ttfamily seed}:\\ + The seed for the random number generator. Default value is $123456$. + +\item {\ttfamily kappa}:\\ + The $\kappa$ value. Default is $0.12$. For the {\ttfamily hmc\_tm} + application, this must be set to the physical value! It can have + different values in the single monomials, but here we need the + target value. + +\item {\ttfamily csw}:\\ + The value of the clover coefficient $c_\mathrm{sw}$. Must be larger + than zero to have an effect. For the {\ttfamily hmc\_tm} + application, this must be set to the physical value! It can have + different values in the single monomials, but here we need the + target value. If set to a value larger than zero, it will automatically + trigger an additional monomial in the even/odd case for the trace + log of the clover term. Default behaviour is no clover term. + +\item {\ttfamily 2KappaMu}:\\ + Twisted mass parameter (the physical one) for twisted mass + action. For internal reasons, the value given here is $2\kappa\mu$. For the {\ttfamily + hmc\_tm} application, this must be set to the physical value! It + can have different values in the single monomials, but here we need + the target value. + +\item {\ttfamily 2KappaMuBar}:\\ + The average mass of the heavy doublet multiplied by $2\kappa$. For + the {\ttfamily hmc\_tm} application, this must be set to the physical value! It + can have different values in the single monomials, but here we need + the target value. + +\item {\ttfamily 2KappaEpsBar}:\\ + The splitting mass multiplied by $2\kappa$. For the {\ttfamily + hmc\_tm} application, this must be set to the physical value! It + can have different values in the single monomials, but here we need + the target value. + +\item {\ttfamily Measurements}:\\ + Number of measurements in units of trajectories to be done. Default + value is $3$. For the {\ttfamily invert} programme this counts the + number of gauge configurations to invert on. (See {\ttfamily Nsave} + for the increment in the gauge index!) 
+ +\item {\ttfamily Nsave}:\\ + For {\ttfamily hmc\_tm}: save every n-th trajectory the + configuration to disk. + For the {\ttfamily invert} programme it means that every n-th + configuration is measured. This was formerly called {\ttfamily + Nskip}. + + For {\ttfamily invert}: if more than one measurement is performed + (see {\ttfamily Measurements} parameter), + the gauge index is incremented by {\ttfamily Nsave} for each new + measurement. + +\item {\ttfamily InitialStoreCounter}:\\ + Start with value to label measurements. Default is $0$. Can be also + set to {\ttfamily readin} which causes to let the code check for a + file {\ttfamily .nstore\_counter} and reads the initial value from + this file. If it is not existing, the counter will be set to $0$. + +\item {\ttfamily GaugeConfigInputFile}:\\ + Name of input file for the gauge field. Default is {\ttfamily conf} + +\item {\ttfamily ThetaT|X|Y|Z=x}:\\ + This sets the boundary condition angle for the fermion fields in + $t$, $x$, $y$ or $z$ direction to $\theta_t = \pi x$. Default value is + zero. A value of $1$ would mean antiperiodic boundary conditions + for the fermion fields. + +\item {\ttfamily DebugLevel}:\\ + If set to a value larger than $0$ this causes verbose output: + \begin{itemize} + \item {\ttfamily DebugLevel = 1}: forces, iteration counts and flops are printed out. + \item {\ttfamily DebugLevel = 2}: every iteration step is + printed. Chronological Solver gives details about which routines + are called, the same for the monomials. polynomial gets more verbose. + \item {\ttfamily DebugLevel > 2}: all available normal output. + \item {\ttfamily DebugLevel > 3}: all debug output. Involves extra + computations, so the code will be (significantly) slower + \end{itemize} + +\item {\ttfamily UseSloppyPrecision}:\\ + Use a reduced precision Dirac operator in the MC part of the + HMC. Possible values are yes and no, the latter being the + default. 
This could be possibly used in the invert code along the + lines of {\ttfamily hep-lat/0609023} in the future. + +\item {\ttfamily DisableIOChecks}:\\ + Defaults to no, if set to yes, this will disable several checks + performed on gauge configuration input files, such size verification or + SciDAC checksum matching. It will also disable the readback performed with + Lemon IO. + +\item {\ttfamily GaugeConfigRead|WritePrecision}:\\ + Read/Write gauge configurations in single (32) or double (64) + precision. Default is 64. + +\item {\ttfamily UseEvenOdd}:\\ + Whether or not to use even/odd preconditioning in the invert + executable. + +\item {\ttfamily OMPNumThreads}:\\ + Number of OpenMP threads to use per process when compiled with + OpenMP support. On some architectures, the {\ttfamily OMP\_NUM\_THREADS} + environment variable needs to be set to the same value for correct + operation. The default is 1. + +\end{enumerate} + +The following input parameters are {\ttfamily invert} specific: +\begin{enumerate} +\item {\ttfamily Indices=n-m}:\\ + Compute only components $n$ to $m$ of the quark propagator. $n,m$ must + be in $[0,99]$. If the start index is not zero the data will be + appended to the propagator file, unless {\ttfamily + SplittedPropagator} is chosen. The program does not take care of the + order, the data is just appended! + +\item {\ttfamily UseRelativePrecision}:\\ + Possible values {\ttfamily yes, no}. Indicates whether relative + precision is used in the inversions for the force and the acceptance + computation. Default is no. + +\item {\ttfamily GMRESMParameter}:\\ + Krylov subspace size $m$ in GMRES($m$) and such like iterative + solvers. Not yet working! + +\item {\ttfamily GMRESDRNrEv}:\\ + Number of eigenvalues to be deflated in GMRES-DR iterative + solver. Not yet working! + +\item {\ttfamily ReadSource}:\\ + If set to yes, then the source vector is read from a file. 
+ +\item {\ttfamily SourceTimeSlice}:\\ + The time slice of the source to be read. At + the moment used only for + the automatic construction of filenames. The filename will then be + constructed as {\ttfamily basefilename.nstore.ts.index}. + {\ttfamily SourceTimeSlice} can be also set to {\ttfamily detect} in + order to let the code determine the appropriate timeslice + value. (this might be slow, though, but it is unavoidable if + {\ttfamily invert} should run more than one gauge in a single run + and the timeslice value changes on a gauge basis.) + + It has only effect, if every source is in a separate file + (i.e. SourceInfo.splitted is set, which is the default). + +\item {\ttfamily NoSamples}:\\ + in case of stochastic source the number of samples. + +\item {\ttfamily SourceType}:\\ + lets you chose the source type: {\ttfamily Volume, Point, TimeSlice} + are possible here. + +\item {\ttfamily ComputeEVs}:\\ + compute eigenvalues and vectors before inversion in invert. Values + can be no, yes and readin. In the latter case the eigenvalues and + vectors are only read from disk, if possible. In case of yes it is + also tried to read them from disk, but they are also recomputed, to + a possibly higher precision. + +\item {\ttfamily NoEigenvalues}:\\ + number of eigenvalues to compute. + +\item {\ttfamily EigenvaluePrecision}:\\ + precision for eigenvalues. + +\item {\ttfamily ComputeModeNumber}:\\ + compute the topological susceptibility using the spectral projectors + method. Values can be yes or no. + +\item {\ttfamily ComputeModeNumber}:\\ + compute the average number of eigenmodes of the massive hermitian + operator $D_{tm}^{\dagger}D_{tm}+m^2$ with eigenvalues + $\alpha\leq M^2$. The value can be + yes and no. + +\item {\ttfamily MStarSq}:\\ + value of the parameter $M_*^2$ necessary in order to compute the + mode number or the topological susceptibility using the method of + the spectral projectors. 
+ +\item {\ttfamily NoSourcesZ2}:\\ + number of Z2 stochastic sources for the spectral projectors method. + +\item {\ttfamily SourceLocation}:\\ + integer indicating the location of the source. The location is computed as + {\ttfamily SourceLocation = z+L*y+L*L*x+L*L*L*t}. + +\item {\ttfamily UseStoutSmearing}:\\ + Whether or not to stout smear the configuration before inversion. + +\item {\ttfamily StoutRho} and {\ttfamily StoutNoIterations}:\\ + Stout smearing parameter. + +\item {\ttfamily WritePropagatorFormat} or {\ttfamily PropagatorType}:\\ + The type in which to store the propagator. There are + \begin{itemize} + \item {\ttfamily DiracFermion\_Sink} + \item {\ttfamily DiracFermion\_Source\_Sink\_Pairs} + \item {\ttfamily DiracFermion\_ScalarSource\_TwelveSink} + \item {\ttfamily DiracFermion\_ScalarSource\_FourSink} + \end{itemize} + available. However, only the first two are implemented so far. + +\item {\ttfamily ComputeReweightingFactor}:\\ + If enabled reweighting factors will be computed corresponding to + monomials that must be specified in the input file as well. + +\item {\ttfamily NoReweightingSamples}:\\ + Number of random samples used per gauge configuration to estimate + the reweighting factor. The default is $10$. + +\end{enumerate} + +%\noindent $n_f=2+1+1$ related input parameters, where the heavy part of the actions +%reads +%\begin{equation} +% \label{eq:haction} +% S_{F,h} = \bar\psi_h\left[ \delta_{x,y}(1+i\gamma_5\bar\mu\tau^3 + +% \bar\epsilon\tau^1) +% - \kappa\sum_\mu \delta_{x,y+\mu}(1+\gamma_\mu)U_{y,\mu}\right] \psi_h +%\end{equation} + +The following input parameters are {\ttfamily hmc\_tm} specific: +\begin{enumerate} +\item {\ttfamily ThermalisationSweeps}:\\ + As long as the number of trajectories is smaller than this number + the acceptance test will be discarded. This might help to faster + equilibrate the system. + +\item {\ttfamily Startcondition}:\\ + The starting condition for a run. 
Possible values are {\ttfamily + hot, cold, restart, continue}. Default is {\ttfamily + cold}. Restart uses the seed to reset the random number + generator. In case of {\ttfamily continue} the programme uses the + file {\ttfamily .nstore\_counter} to get the information about + where to read the gauge and the random number status from. If this file + does not exist (it is written in the course of the HMC) then the input + parameters described here are used instead. + +\item {\ttfamily ReversibilityCheck}:\\ + If set to {\ttfamily yes} the program will perform a check of + reversibility violation in the integrator by integrating back in + time. If not yet existing, the program creates a file {\ttfamily + return\_check.data} in which it stores the reversibility violation + as the difference in the Hamiltonian, the difference in the gauge + fields and the relative difference in the Hamiltonian. + +\item {\ttfamily ReversibilityCheckIntervall}:\\ + Here one can specify the interval, in terms of trajectories, at which the + program should check the reversibility violation. + +\end{enumerate} +Following the CHROMA notation we call every part in the action a +monomial. A monomial is added to the action in the input file in the +following way: +\begin{verbatim} +BeginMonomial TYPE + Option = value +EndMonomial +\end{verbatim} +{\ttfamily TYPE} can be one of the following +\begin{itemize} +\item {\ttfamily DET}: pseudo fermion representation of the (mass degenerate)\\ + \[ + \det(Q^2(\kappa) + \mu^2) + \] +\item {\ttfamily CLOVERDET}: pseudo fermion representation of the + (mass degenerate) + \[ + \det(Q_\mathrm{sw}^2(\kappa, c_\mathrm{sw})) + \] + for the clover operator without twisted mass term. This monomial is + only available with even/odd preconditioning right now. It + automatically adds another monomial for the $\tr\ln$ part of the + clover term. 
+ +\item {\ttfamily DETRATIO}: pseudo fermion representation of\\ + \[ + \det(Q^2(\kappa) + \mu^2)/\det(Q^2(\kappa_2) + \mu_2^2) + \] +\item {\ttfamily GAUGE}:\\ + \[ + \frac{\beta}{3}\sum_x\left( c_0\sum_{\substack{ + \mu,\nu=1\\1\leq\mu<\nu}}^4\{1-\re\tr(U^{1\times1}_{x,\mu,\nu})\}\Bigr. + \Bigl.\ +\ + c_1\sum_{\substack{\mu,\nu=1\\\mu\neq\nu}}^4\{1 + -\re\tr(U^{1\times2}_{x,\mu,\nu})\}\right)\, , + \] +\item {\ttfamily NDPOLY}: polynomial representation of the (possibly + non-degenerate) Wilson twisted mass doublet + \[ + [\det(\hat Q_{h}(\bar\epsilon, \bar\mu)^2)]^{1/2} \approx \det(\mathcal{R}^{-1}) + \] +\item {\ttfamily NDRAT}: rational representation of the (possibly + non-degenerate) Wilson twisted mass doublet + \[ + [\det(\hat Q_{h}(\bar\epsilon, \bar\mu)^2)]^{1/2} + \] + with an approximation + \[ + \mathcal{R}(Q_{nd}^2)\ = \ \prod_{i = 1}^N \frac{Q_{nd}^2 + + a_{2i}}{\hat Q_{h}^2 + a_{2i-1}}\approx\quad\frac{1}{\sqrt{\hat Q_{h}^2}} + \] +\item {\ttfamily NDRATCOR}: correction monomial for approximation + errors in the rational approximation used in {\ttfamily NDRAT} + \[ + \det\left( \hat Q_h \mathcal{R} \right)\,. + \] +\item {\ttfamily NDCLOVERRAT, NDCLOVERRATCOR}: clover versions of + {\ttfamily NDRAT} and {\ttfamily NDRATCOR}, respectively. 
+\item {\ttfamily NDCLOVER}: polynomial representation of the (possibly + non-degenerate) clover twisted mass doublet + \[ + [\det(Q_{nd}(\bar\epsilon, \bar\mu, c_\mathrm{sw})^2)]^{1/2} + \] +\item {\ttfamily POLY}: polynomial approximation ($P_n(x) \approx \frac{1}{x}$) of the mass degenerate determinant\\ + \[ + \left[\det(P_{n}(Q^2(\kappa) + \mu^2))\right]^{-1} + \] +\item {\ttfamily POLYDETRATIO}: pseudo fermion representation of (for PHMC + mass precondition)\\ + \[ + \left[\det(P_{n}(Q^2(\kappa) + \mu^2)) \det(Q^2(\kappa_2) + \mu^2_2)\right]^{-1} + \] +\end{itemize} +Each of them has different options: +\begin{itemize} +\item {\ttfamily DET, CLOVERDET}: + \begin{itemize} + \item {\ttfamily 2KappaMu} + \end{itemize} +\item {\ttfamily CLOVERDET}: + \begin{itemize} + \item {\ttfamily csw} + \end{itemize} +\item {\ttfamily DET, CLOVERDET}: + \begin{itemize} + \item {\ttfamily Kappa} + \item {\ttfamily Timescale}: the timescale on which to integrate + this monomial. Counting starts from zero up to the total number of + timescales minus 1. + \item {\ttfamily CSGHistory}: the maximal number of vectors to store + for the chronological predictor (for CG and BiCGstab), default $0$. + \item {\ttfamily CSGHistory2}: the maximal number of vectors to store + for the second chronological predictor (for BiCGstab only), default + $0$. + \item {\ttfamily ForcePrecision}: the solver precision used in the + force computation + \item {\ttfamily AcceptancePrecision}: the solver precision used in the + acceptance and heatbath + \item {\ttfamily MaxSolverIterations}: default is $5000$ + \item {\ttfamily Solver}: the solver to be used, either CG or + BiCGstab. Default is CG. + \item {\ttfamily Name}: a name to be assigned to the monomial. 
The + default is {\ttfamily DET} + \end{itemize} +\item {\ttfamily DETRATIO}: the same as for {\ttfamily DET}, but in + addition: + \begin{itemize} + \item {\ttfamily 2KappaMu2} + \item {\ttfamily Kappa2} + \item {\ttfamily Name}: a name to be assigned to the monomial. The + default is {\ttfamily DETRATIO} + \end{itemize} + + +\item {\ttfamily GAUGE}: + \begin{itemize} + \item {\ttfamily Timescale}: the timescale on which to integrate + this monomial. Counting starts from zero up to the total number of + timescales minus 1. + \item {\ttfamily Name}: a name to be assigned to the monomial. The + default is {\ttfamily GAUGE}. + + \item {\ttfamily beta}:\\ + The inverse coupling $\beta$. Default value is $5.2$. + + \item {\ttfamily Type}: can be one of\ {\ttfamily Wilson, tlsym, + Iwasaki, DBW2, user}. For type user you can specify also the two + following options. Default is {\ttfamily user} here. + \item {\ttfamily UseRectangleStaples}: can be yes or no, indicating + whether to use also the rectangle staples. No corresponds to pure + Wilson plaquette. Default is no. Is effective only for {\ttfamily + type = user}. + \item {\ttfamily RectangleCoefficient}: the value of the parameter + $c_1$. The coefficient $c_0$ is computed from $c_0 = 1-8c_1$. Is + effective only for {\ttfamily type = user}. + \end{itemize} + At most one instance of this type is allowed. + + +\item {\ttfamily NDPOLY}: switches + on the PHMC part for the non-degenerate heavy doublet and lets you + specify the timescale on which to integrate this and the parameters. + \begin{itemize} + \item {\ttfamily 2KappaMubar}: $2\kappa\bar\mu$ the heavy twisted mass + \item {\ttfamily 2KappaEpsbar}: $2\kappa\bar\epsilon$ the heavy + splitting + \item {\ttfamily Kappa}: the $\kappa$ value + \item {\ttfamily Timescale}: the timescale on which to integrate + this monomial. Counting starts from zero up to the total number of + timescales minus 1. 
+ + \item {\ttfamily Name}: a name to be assigned to the monomial. The + default is {\ttfamily NDPOLY} + + \item {\ttfamily ComputeEVFreq}: + If you want to calculate the eigenvalues every n'th trajectory + then set this parameter to n if you want no eigenvalues set this to 0 + during thermalization you should set this to 1 or 2 to follow the evolution + of smallest and largest eigenvalue to adjust the approximation interval + of the polynomial + + \item {\ttfamily ComputeOnlyEVs}: Computes only once at the very + beginning of the run the eigenvalues of the heavy split operator + and exits. + + \item {\ttfamily StildeMin}: lower bound for the approximation interval of the polynomial + + \item {\ttfamily StildeMax}: + upper bound for the approximation interval of the polynomial + + \item {\ttfamily DegreeOfMDPolynomial}: + degree of the less precise polynomial $P$. Must be identical to the + degree used to compute the roots. + + \item {\ttfamily LocNormConst}: + Constant (local normalisation constant) which is multiplied to each monomial (of the polynomial $P_n$). + \item {\ttfamily RootsFile}: + File name specifying a file containing the $n=$ {\ttfamily Degree} roots of the Polynomial + + \item {\ttfamily PrecisionPtilde}: + Precision of the more precise polynomial $\tilde P$ used in the + heat-bath and the acceptance step of the PHMC. + + \item {\ttfamily PrecisionHfinal}: + \end{itemize} + So far, there is maximally one instance allowed for this type. This + might change in the future. + +\item {\ttfamily NDRAT}: like {\ttfamily NDPOLY}, but with a rational + approximation. 
+ \begin{itemize} + \item {\ttfamily 2KappaMubar}: $2\kappa\bar\mu$ the heavy twisted mass + \item {\ttfamily 2KappaEpsbar}: $2\kappa\bar\epsilon$ the heavy + splitting + \item {\ttfamily Kappa}: the $\kappa$ value + \item {\ttfamily DegreeOfRational}: the order $N$ of the rational approximation + \item {\ttfamily StildeMin}: lower bound for the approximation + interval of the rational approximation + + \item {\ttfamily StildeMax}: + upper bound for the approximation interval of the rational approximation + \item {\ttfamily Cmin}: it is possible to use only pairs of coefficients + in the range from $[c_a,c_b]$ in order to introduce a frequency + splitting. {\ttfamily Cmin} corresponds to $0\leq c_a < N$. The shifts are ordered as + \[ + \mu_0 > \mu_1 > ... > \mu_{N-1}\,, + \] + and hence $c_a = N-1$ and $c_b = N-1$ would generate a rational + with only the smallest and, therefore, most expensive shift (which + one would typically integrate on a coarse timescale). $c_a + = 0$ and $c_b = k < N$ would correspond to a rational with the + $k+1$ largest shifts. + \item {\ttfamily Cmax}: $c_b\geq c_a$, see {\ttfamily Cmin}. + \item {\ttfamily ComputeOnlyEVs}: Computes only once at the very + beginning of the run the eigenvalues of the heavy split operator + and exits. + \item {\ttfamily ForcePrecision}: the CGMMS solver precision used in the + force computation + \item {\ttfamily AcceptancePrecision}: the CGMMS solver precision used in the + acceptance and heatbath + \item {\ttfamily MaxSolverIterations}: maximal number of CGMMS + solver iterations, default is $5000$. + \end{itemize} + It is important to realise that if the splitting is used, then every + partial fraction \emph{must appear once and only once}. Otherwise, the + algorithm will not describe the desired physics! Consequently, also + the different {\ttfamily NDRAT} monomials from the same rational + approximation used for frequency splitting have to have identical + order. 
+ +\item {\ttfamily NDRATCOR}: correction monomial for approximation + errors in the rational approximation for the heavy doublet. This + monomial has no derivative part and it is only used in the heatbath + and acceptance steps. + \begin{itemize} + \item {\ttfamily 2KappaMubar}: $2\kappa\bar\mu$ the heavy twisted mass + \item {\ttfamily 2KappaEpsbar}: $2\kappa\bar\epsilon$ the heavy + splitting + \item {\ttfamily Kappa}: the $\kappa$ value + \item {\ttfamily DegreeOfRational}: the order $N$ of the rational + approximation. \emph{The order must match the order of the corresponding + (splitted) {\ttfamily NDRAT} monomial(s).} + \item {\ttfamily StildeMin}: lower bound for the approximation + interval of the rational approximation + + \item {\ttfamily StildeMax}: + upper bound for the approximation interval of the rational approximation + \item {\ttfamily ComputeOnlyEVs}: Computes only once at the very + beginning of the run the eigenvalues of the heavy split operator + and exits. + \item {\ttfamily ForcePrecision}: the CGMMS solver precision used in the + force computation + \item {\ttfamily AcceptancePrecision}: the CGMMS solver precision used in the + acceptance and heatbath + \item {\ttfamily MaxSolverIterations}: maximal number of CGMMS + solver iterations, default is $5000$. + \end{itemize} + +\item {\ttfamily NDCLOVERRAT, NDCLOVERRATCOR}: + The same as {\ttfamily NDRAT, NDRATCOR}, but with the additional + parameter {\ttfamily CSW} and only for {\ttfamily NDCLOVERRAT} + \begin{itemize} + \item {\ttfamily AddTrLog =yes|no}: adds a clover trlog monomial + with the parameters of this monomial. {\ttfamily no} is + default. One needs only one trlog monomial per non-degenerate + doublet, so one needs to take care in case of frequency splitting + of the rational approximation to have this set to {\ttfamily yes} + only once. + \end{itemize} + +\item {\ttfamily POLY, POLYDETRATIO}: + \begin{itemize} + \item {\ttfamily Degree}: + Degree of the Polynomial. 
+ \item {\ttfamily Lmin}: + Lower bound of approximation interval. + \item {\ttfamily Lmax}: + Upper bound of approximation interval. + \item {\ttfamily LocNormConst}: + Constant (local normalisation constant) which is multiplied to each monomial (of the polynomial $P_n$). + \item {\ttfamily RootsFile}: + File name specifying a file containing the $n=$ {\ttfamily Degree} roots of the Polynomial + \item {\ttfamily + Parameters from DET \& DETRATIO monomial} + \end{itemize} +There can be arbitrarily many POLY monomials. But take into account that $n/2$ spinor fields will be allocated for EACH poly monomial. (Maybe in the future we should think about sharing these fields with all POLY/NDPOLY monomials as they are used only for the computation of the force and have to be updated before each successive calculation of the force.)\\ +This monomial needs a valid {\ttfamily RootsFile} and {\ttfamily LocNormConst} parameter. Both can be obtained from the {\ttfamily oox} program in the {\ttfamily util/oox} subdirectory of the hmc code. It can be invoked by the command:\\ +{\ttfamily \$ oox -d <degree> -e <epsilon>}\\ +{\ttfamily <epsilon>} is to be replaced by the ratio {\ttfamily Lmin/Lmax}. +\end{itemize} + +\subsubsection{The Integrator} + +The Integrator can be specified similarly to the monomials: +\begin{verbatim} +BeginIntegrator + Option = value +EndIntegrator +\end{verbatim} +with the following options available: +\begin{itemize} +\item {\ttfamily Tau}: total trajectory length. +\item {\ttfamily NumberOfTimescales}: total number of timescales. +\item {\ttfamily MonitorForces}: setting this to {\ttfamily yes} + enables the computation of the forces per monomial at the beginning + of each trajectory. +\item {\ttfamily IntegrationStepsN = M} where {\ttfamily N} is the + timescale (as integer value, counting starts from zero and goes up + to the number of timescales minus 1) and {\ttfamily M} is the number + of integration steps on that timescale. 
Note, that the integrators + are defined recursively. +\item {\ttfamily LambdaN = F} where {\ttfamily N} is the + timescale and {\ttfamily F} is a floating point number specifying + the $\lambda$ value to be used on this timescale in case of the + second order minimal norm integrator (2MN, 2MNPOSITION). The default + value is $0.19$. Note, that $\lambda = 1/6$ is the Sexton-Weingarten + scheme. +\item {\ttfamily TypeN = TYPE}: set the type of integrator to be used + on timescale {\ttfamily N}. The following types are available: + {\ttfamily 2MN, 2MNPOSITION, LEAPFROG} + + The position versions are not compatible with the velocity versions, + thus they must not be used together. +\end{itemize} +A timescale must not be empty. Currently the maximal number of +timescales is $10$ and there cannot be more than $10$ monomials per +timescale. But there can be more than one monomial per timescale. + +\subsubsection{Choosing the Operator for Inversions} + +\begin{verbatim} +BeginOperator TYPE + Option = value +EndOperator +\end{verbatim} +{\ttfamily TYPE} can be one of the following +\begin{itemize} +\item {\ttfamily WILSON}: simple Wilson Dirac operator, with options: + \begin{itemize} + \item {\ttfamily UseEvenOdd} + \end{itemize} +\item {\ttfamily TMWILSON}: Wilson Twisted Mass Dirac operator, with + options: + \begin{itemize} + \item {\ttfamily 2KappaMu} + \item {\ttfamily UseEvenOdd} + \end{itemize} +\item {\ttfamily CLOVER}: Clover Twisted Mass Dirac operator, with + options: + \begin{itemize} + \item {\ttfamily 2KappaMu} + \item {\ttfamily UseEvenOdd} + \item {\ttfamily CSW} + \end{itemize} +\item {\ttfamily DBTMWILSON}: two flavour mass non-degenerate Wilson + Twisted Mass Dirac operator: + \begin{itemize} + \item {\ttfamily 2KappaMubar} + \item {\ttfamily 2KappaEpsbar} + \end{itemize} +\item {\ttfamily DBCLOVER}: two flavour mass non-degenerate Clover + Twisted Mass Dirac operator: + \begin{itemize} + \item {\ttfamily CSW} + \item {\ttfamily 2KappaMubar} + \item {\ttfamily 
2KappaEpsbar} + \end{itemize} +\item {\ttfamily OVERLAP}: overlap operator: + \begin{itemize} + \item {\ttfamily m} + \item {\ttfamily s} + \item {\ttfamily DegreeOfPolynomial} + \item {\ttfamily NoKernerlEigenvalues} + \item {\ttfamily KernelEigenvaluePrecision} + \end{itemize} +\end{itemize} +All of them provide the following options available: +\begin{itemize} +\item {\ttfamily kappa}: +\item {\ttfamily Solver}:\\ + Sets the solver to be used. Possible values are among others + {\ttfamily CG, BiCGstab, CGS, GMRES, PCG and CGMMS}. +\item {\ttfamily MaxSolverIterations}: +\item {\ttfamily PropagatorPrecision}: +\item {\ttfamily SolverPrecision}: +\end{itemize} + +The {\ttfamily CGMMS} solver can be used to invert the operator for +multiple masses at the same time. To this end a list of masses needs +to be provided either as a comma-separated list or as the filename of a +text file which lists one mass per line. The masses must be provided in the +format $2 \kappa \mu_n$. The normal mass specified for the operator is +used as $\mu_0$. The masses must be ordered such that $\mu_0 < \mu_1 < ... < \mu_n$.: + +\begin{itemize} + \item{ {\ttfamily ExtraMasses = 0.12, 0.14, 0.17, 0.21, 0.30} } + \item{ {\ttfamily ExtraMasses = extra\_masses.input } } +\end{itemize} + +\subsubsection{Online Measurements} + +A number of measurements can be performed online while the hmc is +running. +\begin{verbatim} +BeginMeasurement TYPE + Option = value +EndMeasurement +\end{verbatim} +where {\ttfamily TYPE} can be currently one of the following: +\begin{itemize} +\item {\ttfamily CORRELATORS}: + \begin{itemize} + \item {\ttfamily MaxSolverIterations} + \end{itemize} + this is for zero temperature, so the stochastic source is at fixed + $t$. In addition it needs an operator defined in the input file, + otherwise it will do nothing. 
(see input keywords for {\ttfamily invert} above) +\item {\ttfamily PIONNORM}: + \begin{itemize} + \item {\ttfamily MaxSolverIterations} + \end{itemize} + this is for finite temperature, the stochastic source is at fixed $z$. + +\item {\ttfamily POLYAKOVLOOP}: + \begin{itemize} + \item {\ttfamily Directions} can be either $0$ for time- or $3$ for z-direction. + \end{itemize} +\end{itemize} +The frequency of measuring all of these can be adjusted with the +Option {\ttfamily Frequency}. + +\subsubsection{Example Input File} + +The following is a typical HMC input file: +\begin{verbatim} +L=8 +T=16 +Measurements = 1 +Startcondition = hot +2KappaMu = 0.03 +kappa = 0.090 +2KappaMubar = 1. +2KappaEpsbar = 0.2 + +#This is a comment + +PhmcRecEVInterval = 1 +Nsave = 50 +ThetaT = 1. +InitialStoreCounter = readin +UseEvenOdd = yes +ReversibilityCheck = no +ReversibilityCheckIntervall = 1 +DebugLevel = 3 + +BeginMeasurement CORRELATORS + MaxSolverIterations = 1000 + Frequency = 1 +EndMeasurement + +BeginMonomial GAUGE + beta = 3.30 + Timescale = 0 + Type = tlsym +EndMonomial + +BeginMonomial DET + Timescale = 1 + 2KappaMu = 0. + kappa = 0.125 + AcceptancePrecision = 1.e-20 + ForcePrecision = 1.e-12 + Name = det + solver = cg + CSGHistory = 10 + CSGHistory2 = 10 +EndMonomial + +BeginMonomial DETRATIO + Timescale = 2 + 2KappaMu = 0.03 + 2KappaMu2 = 0.1 + kappa = 0.125 + kappa2 = 0.125 + maxiter = 20000 + AcceptancePrecision = 1.e-20 + ForcePrecision = 1.e-12 + Name = detrat + solver = cg +EndMonomial + +# this is a NDPOLY monomial +# but commented out +#BeginMonomial NDPOLY +# Timescale = 1 +#EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + Type2 = 2MN + IntegrationSteps0 = 1 + IntegrationSteps1 = 2 + IntegrationSteps2 = 3 + tau = 1. 
+ Lambda0 = 0.19 + NumberOfTimescales = 3 +EndIntegrator + +# for the CORRELATORS online measurement +BeginOperator TMWILSON + 2kappaMu = 0.177 + kappa = 0.177 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +EndOperator +\end{verbatim} + +There are realistic small volume sample input files in the +sub-directory {\ttfamily sample-input}, which also represent test runs +for the code. For the inverter a typical file would look like +\begin{verbatim} +L=4 +T=4 +DebugLevel = 2 +InitialStoreCounter = 1 +Indices = 0-7 +ReadSource = no +Measurements = 1 +ThetaT = 1. +UseEvenOdd = no +UseRelativePrecision = yes +SplittedPropagator = yes +PropagatorType = DiracFermion_Source_Sink_Pairs +UseStoutSmearing = no +StoutRho = 0.15 +StoutNoIterations = 10 +UseSloppyPrecision = yes + +# both operators will be inverted for +BeginOperator TMWILSON + Solver = CG + 2KappaMu = 0.177 + kappa = 0.177 + SolverPrecision = 1.e-15 + UseEvenOdd = yes +EndOperator + +BeginOperator DBTMWILSON + 2KappaMubar = 0.177 + 2KappaEpsbar = 0.190 + kappa = 0.177 +EndOperator + + +# and for reweighting possibly +BeginMonomial DETRATIO + Timescale = 2 + 2KappaMu = 0.03 + 2KappaMu2 = 0.0305 + kappa = 0.15 + kappa2 = 0.15 + maxiter = 20000 + AcceptancePrecision = 1.e-20 + Name = detrat + solver = cg +EndMonomial + +\end{verbatim} + +\subsubsection{Reread functionality} + +If you store a file with name {\ttfamily hmc.reread} in the working +directory of a running HMC, the program will read in this file after +the next finished trajectory. Then it will change the parameters +accordingly without the need of restarting the program. + +One cannot change from gauge action without rectangle part to gauge +action with rectangle part. If one wants to change $\mu$-, +$\epsilon^2$- or $N_i$-parameter one has to give allways all of +them. Otherwise the internal matching does not work and the program +will do nonsense. + +The file will be deleted automatically, if it was used. 
A message will +be posted to standard output and to the file {\ttfamily + history\_hmc\_tm} to let you identify the exact point where the +parameters changed. + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/install.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/install.tex new file mode 100644 index 0000000000000000000000000000000000000000..a0334640309ea09ac7b2e0762c679dec0f23b264 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/install.tex @@ -0,0 +1,126 @@ +The software ships with a GNU autoconf environment and a configure +script, which will generate GNU Makefiles to build the programmes. It +is supported and recommended to configure and build the executables in +a separate build directory. This also allows having several builds with +different options from the same source code directory. + +\subsection{Prerequisites} + +In order to compile the programmes the {\ttfamily + LAPACK}~\cite{lapack:web} library (fortran version) needs to be +installed. In addition it must be known which linker options are +needed to link against {\ttfamily LAPACK}, e.g. {\ttfamily + -Lpath-to-lapack -llapack -lblas}. Also the latest +version (tested is version 1.2.3) of {\ttfamily + C-LIME}~\cite{lime:web} must be available, which is used as a +packaging scheme to read and write gauge configurations and +propagators to files. + +\subsection{Configuring the hmc package} +\label{sec:config} + +In order to get a simple configuration of the hmc package it is enough +to just type +\begin{verbatim} +path-to-src-code/configure --with-lime=<path-to-lime> \ + --with-lapack=<linker-flags> CC=<mycc> \ + F77=<myf77> CFLAGS=<c-compiler flags> +\end{verbatim} +in the build directory. If +{\ttfamily CC, F77} and {\ttfamily CFLAGS} are not specified, +{\ttfamily configure} will guess them. 
+ +The code was successfully compiled and run at least on the following +platforms: i686 and compatible, x64 and compatible, IBM Regatta +systems, IBM Blue Gene/L, IBM Blue Gene/P, SGI Altix and SGI PC +clusters, powerpc clusters. + +The configure script accepts certain options to influence the building +procedure. One can get an overview over all supported options with +{\ttfamily configure --help}. There are {\ttfamily enable|disable} +options switching on and off optional features and {\ttfamily + with|without} switches usually related to optional packages. In the +following we describe the most important of them (check {\ttfamily + configure --help} for the defaults and more options): + +\begin{itemize} +\item {\ttfamily --enable-mpi}:\\ + This option switches on the support for MPI. On certain platforms it + automatically chooses the correct parallel compiler or searches for + a command {\ttfamily mpicc} in the search path. + +\item {\ttfamily --enable-p4}:\\ + Enable the use of special Pentium4 instruction set and cache + management. + +\item {\ttfamily --enable-opteron}:\\ + Enable the use of special opteron instruction set and cache + management. + +%\item {\ttfamily --enable-sse}:\\ +% Enable the use of SSE instruction set. This means not much when 64 +% Bit precision is used. + +\item {\ttfamily --enable-sse2}:\\ + Enable the use of SSE2 instruction set. This is a huge improvement + on Pentium4 and equivalent systems. + +\item {\ttfamily --enable-sse3}:\\ + Enable the use of SSE3 instruction set. This will give another 20\% + of speedup when compared to only SSE2. However, only a few + processors are capable of SSE3 so far. + +\item {\ttfamily --enable-gaugecopy}:\\ + See section \ref{sec:dirac} for details on this option. It will + increase the memory requirement of the code. + +\item {\ttfamily --enable-halfspinor}:\\ + If this option is enabled the Dirac operator using half spinor + fields is used. See sub-section \ref{sec:dirac} for details. 
If this + feature is switched on, the gauge copy feature is also switched + on automatically. + +%\item {\ttfamily --enable-shmem}:\\ +% Use shared memory API instead of MPI for the communication of spinor +% fields. This is currently only usable on the Munich Altix machine. + +\item {\ttfamily --with-mpidimension=n}:\\ + This option only has an effect if {\ttfamily --enable-mpi} is switched + on. The number of parallel directions can be specified. 1, 2, 3 and 4 + dimensional parallelisation is supported. + +\item {\ttfamily --with-lapack=""}:\\ + The code requires LAPACK to be linked. All linker flags necessary + to do so must be specified here. Note that {\ttfamily LIBS="..."} + works similarly. + +\item {\ttfamily --with-limedir=}:\\ + Tells configure where to find the lime package, which is required for + the build of the HMC. It is used for the ILDG file format. + +\end{itemize} + +The configure script will guess at the very beginning on which +platform the build is done. In case this fails or a cross compilation +must be performed please use the option {\ttfamily --host=HOST}. For +instance in order to compile for the BG/P one needs to specify +{\ttfamily --host=ppc-ibm-bprts --build=ppc64-ibm-linux}. + +For certain architectures like the Blue Gene systems there are +{\ttfamily README.arch} files in the top source directory with +example configure calls. + +\subsection{Building and Installing} + +After successfully configuring the package the code can be built by +simply typing {\ttfamily make} in the build directory. This will +compile the standard executables. Typing {\ttfamily make install} will +copy these executables into the install directory. The default install +directory is {\ttfamily \$HOME/bin}, which can be changed e.g. with +the {\ttfamily --prefix} option to {\ttfamily configure}. 
+ + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/integrationschemes.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/integrationschemes.tex new file mode 100644 index 0000000000000000000000000000000000000000..1b6164e4b1d9e286010079945f2d93ba320c8326 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/integrationschemes.tex @@ -0,0 +1,37 @@ +\subsection{Integration schemes} +\label{sec:integrators} + +Most of the details for the integration schemes can be found in +{\ttfamily hep-lat/0506011}. Therefore we give here only details for +the symplectic integration schemes {\ttfamily 2MN} and {\ttfamily + 2MNposition}. + +The second order minimal norm (2MN) integration scheme is a +generalisation of the Sexton-Weingarten scheme. While the latter is +built from the basic integration step: +\begin{equation} + \label{int:0} + T_{\mathrm{SW}_0}\ =\ T_{\mathrm{S}_0}(\dtau_0/6)\ + T_\mathrm{U}(\dtau_0/2)\ T_{\mathrm{S}_0}(2\dtau_0/3)\ + T_\mathrm{U}(\dtau_0/2)\ T_{\mathrm{S}_0}(\dtau_0/6)\, , +\end{equation} +the 2MN scheme is built on +\begin{equation} + \label{int:1} + T_{\mathrm{2MN}_0}\ =\ T_{\mathrm{S}_0}(\lambda_0\dtau_0)\ + T_\mathrm{U}(\dtau_0/2)\ T_{\mathrm{S}_0}((1-2\lambda_0)\dtau_0)\ + T_\mathrm{U}(\dtau_0/2)\ T_{\mathrm{S}_0}(\lambda_0\dtau_0)\, . +\end{equation} +$\lambda_0$ is a dimensionless parameter and the 2MN scheme coincides with +the Sexton-Weingarten scheme in case $\lambda_0=1/6$. The optimal +value for $\lambda_0$ was given in Ref.~\cite{Takaishi:2005tz} to be +around $0.19$. But its value is likely to depend on the mass values +and the time scale under consideration. + +We can now introduce a parameter $\lambda_i$ for each timescale +$\dtau_i$ and tune them separately. 
+ +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/invertflow.eps b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/invertflow.eps new file mode 100644 index 0000000000000000000000000000000000000000..bbc97d2ffbc5893c54bffb5bd1b538db6c8c7165 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/invertflow.eps @@ -0,0 +1,3438 @@ +%!PS-Adobe-2.0 EPSF-2.0 +%%Title: /home/urbach/daten/workdir/etmc/cpc40/invertflow.dia +%%Creator: Dia v0.96.1 +%%CreationDate: Fri Jan 30 14:39:52 2009 +%%For: urbach +%%Orientation: Portrait +%%Magnification: 1.0000 +%%BoundingBox: 0 0 660 533 +%%BeginSetup +%%EndSetup +%%EndComments +%%BeginProlog +[ /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/.notdef /.notdef /space /exclam /quotedbl /numbersign /dollar /percent /ampersand /quoteright +/parenleft /parenright /asterisk /plus /comma /hyphen /period /slash /zero /one +/two /three /four /five /six /seven /eight /nine /colon /semicolon +/less /equal /greater /question /at /A /B /C /D /E +/F /G /H /I /J /K /L /M /N /O +/P /Q /R /S /T /U /V /W /X /Y +/Z /bracketleft /backslash /bracketright /asciicircum /underscore /quoteleft /a /b /c +/d /e /f /g /h /i /j /k /l /m +/n /o /p /q /r /s /t /u /v /w +/x /y /z /braceleft /bar /braceright /asciitilde /.notdef /.notdef /.notdef +/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/space /exclamdown /cent /sterling /currency /yen /brokenbar /section /dieresis /copyright +/ordfeminine /guillemotleft /logicalnot 
/hyphen /registered /macron /degree /plusminus /twosuperior /threesuperior +/acute /mu /paragraph /periodcentered /cedilla /onesuperior /ordmasculine /guillemotright /onequarter /onehalf +/threequarters /questiondown /Agrave /Aacute /Acircumflex /Atilde /Adieresis /Aring /AE /Ccedilla +/Egrave /Eacute /Ecircumflex /Edieresis /Igrave /Iacute /Icircumflex /Idieresis /Eth /Ntilde +/Ograve /Oacute /Ocircumflex /Otilde /Odieresis /multiply /Oslash /Ugrave /Uacute /Ucircumflex +/Udieresis /Yacute /Thorn /germandbls /agrave /aacute /acircumflex /atilde /adieresis /aring +/ae /ccedilla /egrave /eacute /ecircumflex /edieresis /igrave /iacute /icircumflex /idieresis +/eth /ntilde /ograve /oacute /ocircumflex /otilde /odieresis /divide /oslash /ugrave +/uacute /ucircumflex /udieresis /yacute /thorn /ydieresis] /isolatin1encoding exch def +/cp {closepath} bind def +/c {curveto} bind def +/f {fill} bind def +/a {arc} bind def +/ef {eofill} bind def +/ex {exch} bind def +/gr {grestore} bind def +/gs {gsave} bind def +/sa {save} bind def +/rs {restore} bind def +/l {lineto} bind def +/m {moveto} bind def +/rm {rmoveto} bind def +/n {newpath} bind def +/s {stroke} bind def +/sh {show} bind def +/slc {setlinecap} bind def +/slj {setlinejoin} bind def +/slw {setlinewidth} bind def +/srgb {setrgbcolor} bind def +/rot {rotate} bind def +/sc {scale} bind def +/sd {setdash} bind def +/ff {findfont} bind def +/sf {setfont} bind def +/scf {scalefont} bind def +/sw {stringwidth pop} bind def +/tr {translate} bind def + +/ellipsedict 8 dict def +ellipsedict /mtrx matrix put +/ellipse +{ ellipsedict begin + /endangle exch def + /startangle exch def + /yrad exch def + /xrad exch def + /y exch def + /x exch def /savematrix mtrx currentmatrix def + x y tr xrad yrad sc + 0 0 1 startangle endangle arc + savematrix setmatrix + end +} def + +/mergeprocs { +dup length +3 -1 roll +dup +length +dup +5 1 roll +3 -1 roll +add +array cvx +dup +3 -1 roll +0 exch +putinterval +dup +4 2 roll +putinterval +} 
bind def +/dpi_x 300 def +/dpi_y 300 def +/conicto { + /to_y exch def + /to_x exch def + /conic_cntrl_y exch def + /conic_cntrl_x exch def + currentpoint + /p0_y exch def + /p0_x exch def + /p1_x p0_x conic_cntrl_x p0_x sub 2 3 div mul add def + /p1_y p0_y conic_cntrl_y p0_y sub 2 3 div mul add def + /p2_x p1_x to_x p0_x sub 1 3 div mul add def + /p2_y p1_y to_y p0_y sub 1 3 div mul add def + p1_x p1_y p2_x p2_y to_x to_y curveto +} bind def +/start_ol { gsave 1.1 dpi_x div dup scale} bind def +/end_ol { closepath fill grestore } bind def +28.346000 -28.346000 scale +-1.850000 -19.850000 translate +%%EndProlog + + +1.000000 1.000000 1.000000 srgb +n 2.050000 1.100000 m 2.050000 3.200000 l 25.000000 3.200000 l 25.000000 1.100000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 2.050000 1.100000 m 2.050000 3.200000 l 25.000000 3.200000 l 25.000000 1.100000 l cp s +gsave 2.500000 2.327500 translate 0.035278 -0.035278 scale +start_ol +1088 3584 moveto +1088 448 lineto +1739 448 lineto +2563 448 2945 826 conicto +3328 1204 3328 2020 conicto +3328 2830 2945 3207 conicto +2563 3584 1739 3584 conicto +1088 3584 lineto +512 4032 moveto +1656 4032 lineto +2817 4032 3360 3544 conicto +3904 3057 3904 2020 conicto +3904 978 3358 489 conicto +2812 0 1656 0 conicto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 3.061975 2.327500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 3.506558 2.327500 translate 0.035278 -0.035278 scale +start_ol +end_ol 
grestore +gsave 3.738837 2.327500 translate 0.035278 -0.035278 scale +start_ol +512 4032 moveto +1279 4032 lineto +2944 659 lineto +2944 4032 lineto +3520 4032 lineto +3520 0 lineto +2753 0 lineto +1088 3373 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 4.283325 2.327500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 4.715414 2.327500 translate 0.035278 -0.035278 scale +start_ol +640 704 moveto +1216 704 lineto +1216 0 lineto +640 0 lineto +640 704 lineto +end_ol grestore +gsave 4.947693 2.327500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 5.179973 2.327500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 5.624556 2.327500 translate 0.035278 -0.035278 scale +start_ol +2048 4160 moveto +2048 3712 lineto +1571 3712 lineto +1299 3712 1193 3612 conicto +1088 3513 1088 3254 conicto +1088 3008 lineto +1920 3008 lineto +1920 2624 lineto +1088 2624 lineto +1088 0 lineto +576 0 lineto +576 2624 lineto +128 2624 lineto +128 3008 lineto +576 
3008 lineto +576 3202 lineto +576 3702 816 3931 conicto +1056 4160 1577 4160 conicto +2048 4160 lineto +end_ol grestore +gsave 5.881813 2.327500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 6.114093 2.327500 translate 0.035278 -0.035278 scale +start_ol +512 4032 moveto +1357 4032 lineto +2335 1282 lineto +3317 4032 lineto +4160 4032 lineto +4160 0 lineto +3584 0 lineto +3584 3539 lineto +2596 768 lineto +2076 768 lineto +1088 3539 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 6.743503 2.327500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 7.190583 2.327500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 7.637663 2.327500 
translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +gsave 8.017307 2.327500 translate 0.035278 -0.035278 scale +start_ol +512 1177 moveto +512 3008 lineto +1024 3008 lineto +1024 1196 lineto +1024 790 1184 587 conicto +1344 384 1664 384 conicto +2049 384 2272 628 conicto +2496 872 2496 1293 conicto +2496 3008 lineto +3008 3008 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2320 188 2087 62 conicto +1854 -64 1546 -64 conicto +1038 -64 775 252 conicto +512 568 512 1177 conicto +1744 3072 moveto +1744 3072 lineto +end_ol grestore +gsave 8.479368 2.327500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 8.764101 2.327500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 
1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 9.211181 2.327500 translate 0.035278 -0.035278 scale +start_ol +2858 2449 moveto +3041 2768 3296 2920 conicto +3551 3072 3895 3072 conicto +4360 3072 4612 2748 conicto +4864 2425 4864 1829 conicto +4864 0 lineto +4352 0 lineto +4352 1813 lineto +4352 2225 4203 2424 conicto +4055 2624 3750 2624 conicto +3377 2624 3160 2379 conicto +2944 2135 2944 1713 conicto +2944 0 lineto +2432 0 lineto +2432 1813 lineto +2432 2227 2283 2425 conicto +2135 2624 1824 2624 conicto +1457 2624 1240 2378 conicto +1024 2132 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1190 2822 1423 2947 conicto +1656 3072 1975 3072 conicto +2298 3072 2524 2912 conicto +2750 2753 2858 2449 conicto +end_ol grestore +gsave 9.920512 2.327500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 10.367592 2.327500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 
conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 10.829654 2.327500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 11.114387 2.327500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +gsave 11.494031 2.327500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 11.726310 2.327500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 
3840 lineto +1024 3840 lineto +end_ol grestore +gsave 12.011043 2.327500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 12.213350 2.327500 translate 0.035278 -0.035278 scale +start_ol +2858 2449 moveto +3041 2768 3296 2920 conicto +3551 3072 3895 3072 conicto +4360 3072 4612 2748 conicto +4864 2425 4864 1829 conicto +4864 0 lineto +4352 0 lineto +4352 1813 lineto +4352 2225 4203 2424 conicto +4055 2624 3750 2624 conicto +3377 2624 3160 2379 conicto +2944 2135 2944 1713 conicto +2944 0 lineto +2432 0 lineto +2432 1813 lineto +2432 2227 2283 2425 conicto +2135 2624 1824 2624 conicto +1457 2624 1240 2378 conicto +1024 2132 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1190 2822 1423 2947 conicto +1656 3072 1975 3072 conicto +2298 3072 2524 2912 conicto +2750 2753 2858 2449 conicto +end_ol grestore +gsave 12.922682 2.327500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 13.369761 2.327500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 
conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +1.000000 1.000000 1.000000 srgb +n 2.950000 3.950000 m 2.950000 6.050000 l 24.950000 6.050000 l 24.950000 3.950000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 2.950000 3.950000 m 2.950000 6.050000 l 24.950000 6.050000 l 24.950000 3.950000 l cp s +gsave 3.400000 5.177500 translate 0.035278 -0.035278 scale +start_ol +2445 1914 moveto +2620 1854 2786 1657 conicto +2953 1461 3120 1117 conicto +3648 0 lineto +3087 0 lineto +2571 1049 lineto +2373 1460 2186 1594 conicto +2000 1728 1678 1728 conicto +1088 1728 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +1769 4032 lineto +2458 4032 2797 3746 conicto +3136 3460 3136 2884 conicto +3136 2507 2959 2258 conicto +2782 2010 2445 1914 conicto +1088 3584 moveto +1088 2176 lineto +1769 2176 lineto +2160 2176 2360 2355 conicto +2560 2535 2560 2883 conicto +2560 3231 2360 3407 conicto +2160 3584 1769 3584 conicto +1088 3584 lineto +end_ol grestore +gsave 3.874555 5.177500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto 
+1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 4.321635 5.177500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 4.768714 5.177500 translate 0.035278 -0.035278 scale +start_ol +2496 2560 moveto +2496 4160 lineto +3008 4160 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2342 188 2106 62 conicto +1870 -64 1540 -64 conicto +999 -64 659 368 conicto +320 800 320 1504 conicto +320 2208 659 2640 conicto +999 3072 1540 3072 conicto +1870 3072 2106 2946 conicto +2342 2820 2496 2560 conicto +832 1504 moveto +832 980 1053 682 conicto +1275 384 1663 384 conicto +2050 384 2273 682 conicto +2496 980 2496 1504 conicto +2496 2028 2273 2326 conicto +2050 2624 1663 2624 conicto +1275 2624 1053 2326 conicto +832 2028 832 1504 conicto +end_ol grestore +gsave 5.230776 5.177500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 5.463055 5.177500 translate 0.035278 -0.035278 scale +start_ol +3264 588 moveto +3264 1664 lineto +2368 1664 lineto +2368 2112 lineto +3840 2112 lineto +3840 389 lineto +3524 165 3143 50 conicto +2763 -64 2331 -64 conicto +1386 -64 853 485 conicto +320 1034 320 2015 conicto +320 2998 849 3547 conicto +1379 4096 2318 4096 conicto +2709 4096 3061 3998 conicto 
+3414 3901 3712 3712 conicto +3712 3136 lineto +3412 3391 3075 3519 conicto +2738 3648 2366 3648 conicto +1632 3648 1264 3237 conicto +896 2827 896 2015 conicto +896 1205 1261 794 conicto +1627 384 2355 384 conicto +2639 384 2862 433 conicto +3086 483 3264 588 conicto +end_ol grestore +gsave 6.027527 5.177500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 6.474607 5.177500 translate 0.035278 -0.035278 scale +start_ol +512 1177 moveto +512 3008 lineto +1024 3008 lineto +1024 1196 lineto +1024 790 1184 587 conicto +1344 384 1664 384 conicto +2049 384 2272 628 conicto +2496 872 2496 1293 conicto +2496 3008 lineto +3008 3008 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2320 188 2087 62 conicto +1854 -64 1546 -64 conicto +1038 -64 775 252 conicto +512 568 512 1177 conicto +1744 3072 moveto +1744 3072 lineto +end_ol grestore +gsave 6.936669 5.177500 translate 0.035278 -0.035278 scale +start_ol +2496 1535 moveto +2496 2053 2277 2338 conicto +2058 2624 1663 2624 conicto +1270 2624 1051 2338 conicto +832 2053 832 1535 conicto +832 1019 1051 733 conicto +1270 448 1663 448 conicto +2058 448 2277 733 conicto +2496 1019 2496 1535 conicto +3008 404 moveto +3008 -384 2670 -768 conicto +2332 -1152 1635 -1152 conicto +1377 -1152 1148 -1105 
conicto +920 -1058 704 -960 conicto +704 -448 lineto +917 -579 1124 -641 conicto +1332 -704 1547 -704 conicto +2023 -704 2259 -452 conicto +2496 -201 2496 308 conicto +2496 512 lineto +2344 255 2107 127 conicto +1870 0 1540 0 conicto +991 0 655 420 conicto +320 841 320 1535 conicto +320 2231 655 2651 conicto +991 3072 1540 3072 conicto +1870 3072 2107 2944 conicto +2344 2817 2496 2560 conicto +2496 3008 lineto +3008 3008 lineto +3008 404 lineto +end_ol grestore +gsave 7.398731 5.177500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 7.845810 5.177500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 8.078090 5.177500 translate 0.035278 -0.035278 scale +start_ol +3584 3712 moveto +3584 3136 lineto +3307 3393 2994 3520 conicto +2681 3648 2328 3648 conicto +1634 3648 1265 3228 conicto +896 2809 896 2015 conicto +896 1223 1265 803 conicto +1634 384 2328 384 conicto +2681 384 2994 511 conicto +3307 639 3584 896 conicto +3584 320 lineto +3297 128 2975 32 conicto +2654 -64 2296 -64 conicto +1377 -64 848 493 conicto +320 1051 320 2015 conicto +320 2981 848 3538 conicto +1377 4096 2296 4096 conicto +2660 4096 2981 4000 conicto +3302 3904 3584 3712 conicto +end_ol grestore +gsave 8.587611 5.177500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 
2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 9.032194 5.177500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 9.494255 5.177500 translate 0.035278 -0.035278 scale +start_ol +2944 3008 moveto +2944 0 lineto +2432 0 lineto +2432 2624 lineto +1088 2624 lineto +1088 0 lineto +576 0 lineto +576 2624 lineto +128 2624 lineto +128 3008 lineto +576 3008 lineto +576 3214 lineto +576 3698 813 3929 conicto +1050 4160 1541 4160 conicto +2048 4160 lineto +2048 3712 lineto +1568 3712 lineto +1298 3712 1193 3612 conicto +1088 3513 1088 3254 conicto +1088 3008 lineto +2944 3008 lineto +2432 4160 moveto +2944 4160 lineto +2944 3520 lineto +2432 3520 lineto +2432 4160 lineto +end_ol grestore +gsave 9.953820 5.177500 translate 0.035278 -0.035278 scale +start_ol +2496 1535 moveto +2496 2053 2277 2338 conicto +2058 2624 1663 2624 conicto +1270 2624 1051 2338 conicto +832 2053 832 1535 conicto +832 1019 1051 733 conicto +1270 448 1663 448 conicto +2058 448 2277 733 conicto +2496 1019 2496 1535 conicto +3008 404 moveto +3008 -384 2670 -768 conicto +2332 -1152 1635 -1152 conicto +1377 -1152 1148 -1105 conicto +920 -1058 704 -960 conicto +704 -448 lineto +917 -579 1124 -641 conicto +1332 -704 1547 -704 conicto +2023 -704 2259 -452 conicto +2496 -201 2496 308 conicto +2496 512 lineto +2344 255 2107 127 conicto 
+1870 0 1540 0 conicto +991 0 655 420 conicto +320 841 320 1535 conicto +320 2231 655 2651 conicto +991 3072 1540 3072 conicto +1870 3072 2107 2944 conicto +2344 2817 2496 2560 conicto +2496 3008 lineto +3008 3008 lineto +3008 404 lineto +end_ol grestore +gsave 10.415882 5.177500 translate 0.035278 -0.035278 scale +start_ol +512 1177 moveto +512 3008 lineto +1024 3008 lineto +1024 1196 lineto +1024 790 1184 587 conicto +1344 384 1664 384 conicto +2049 384 2272 628 conicto +2496 872 2496 1293 conicto +2496 3008 lineto +3008 3008 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2320 188 2087 62 conicto +1854 -64 1546 -64 conicto +1038 -64 775 252 conicto +512 568 512 1177 conicto +1744 3072 moveto +1744 3072 lineto +end_ol grestore +gsave 10.877944 5.177500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 11.177659 5.177500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore 
+gsave 11.624738 5.177500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 11.909471 5.177500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 12.111778 5.177500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 12.556361 5.177500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 13.018423 5.177500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 13.250702 5.177500 translate 0.035278 -0.035278 scale +start_ol +2048 4160 moveto +2048 3712 lineto +1571 3712 lineto +1299 3712 1193 3612 conicto +1088 3513 1088 
3254 conicto +1088 3008 lineto +1920 3008 lineto +1920 2624 lineto +1088 2624 lineto +1088 0 lineto +576 0 lineto +576 2624 lineto +128 2624 lineto +128 3008 lineto +576 3008 lineto +576 3202 lineto +576 3702 816 3931 conicto +1056 4160 1577 4160 conicto +2048 4160 lineto +end_ol grestore +gsave 13.507960 5.177500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 13.792693 5.177500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 14.237275 5.177500 translate 0.035278 -0.035278 scale +start_ol +2858 2449 moveto +3041 2768 3296 2920 conicto +3551 3072 3895 3072 conicto +4360 3072 4612 2748 conicto +4864 2425 4864 1829 conicto +4864 0 lineto +4352 0 lineto +4352 1813 lineto +4352 2225 4203 2424 conicto +4055 2624 3750 2624 conicto +3377 2624 3160 2379 conicto +2944 2135 2944 1713 conicto +2944 0 lineto +2432 0 lineto +2432 1813 lineto +2432 2227 2283 2425 conicto +2135 2624 1824 2624 conicto +1457 2624 1240 2378 conicto +1024 2132 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1190 2822 1423 2947 conicto +1656 3072 1975 3072 conicto +2298 3072 2524 2912 conicto 
+2750 2753 2858 2449 conicto +end_ol grestore +gsave 14.946607 5.177500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 15.178886 5.177500 translate 0.035278 -0.035278 scale +start_ol +1088 3584 moveto +1088 448 lineto +1739 448 lineto +2563 448 2945 826 conicto +3328 1204 3328 2020 conicto +3328 2830 2945 3207 conicto +2563 3584 1739 3584 conicto +1088 3584 lineto +512 4032 moveto +1656 4032 lineto +2817 4032 3360 3544 conicto +3904 3057 3904 2020 conicto +3904 978 3358 489 conicto +2812 0 1656 0 conicto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 15.740861 5.177500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 15.943168 5.177500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +gsave 16.322812 5.177500 translate 0.035278 -0.035278 scale +start_ol +512 4160 moveto +1024 4160 lineto +1024 1711 lineto +2503 3008 lineto +3136 3008 lineto +1536 1601 lineto +3200 0 lineto +2554 0 lineto +1024 1469 lineto +1024 0 lineto +512 0 lineto +512 4160 lineto +end_ol grestore +0.100000 slw +[] 0 sd +[] 0 sd +0 
slj +0 slc +n 14.025000 18.925000 m 14.025000 19.800000 l 2.350000 19.800000 l 2.350000 3.950000 l s +0 slj +n 2.600000 3.950000 m 2.350000 3.450000 l 2.100000 3.950000 l ef +0.100000 slw +[] 0 sd +[] 0 sd +0 slc +n 3.000000 8.720000 m 3.000000 18.850000 l s +0.100000 slw +[] 0 sd +[] 0 sd +0 slc +n 3.000000 18.900000 m 25.050000 18.950000 l s +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0 slc +n 14.800000 17.849905 m 14.800000 18.400000 l 3.550000 18.400000 l 3.550000 9.400000 l s +0 slj +n 3.800000 9.400000 m 3.550000 8.900000 l 3.300000 9.400000 l ef +1.000000 1.000000 1.000000 srgb +n 3.000000 6.620000 m 3.000000 8.720000 l 25.000000 8.720000 l 25.000000 6.620000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 3.000000 6.620000 m 3.000000 8.720000 l 25.000000 8.720000 l 25.000000 6.620000 l cp s +gsave 3.450000 7.847500 translate 0.035278 -0.035278 scale +start_ol +1088 3584 moveto +1088 448 lineto +1739 448 lineto +2563 448 2945 826 conicto +3328 1204 3328 2020 conicto +3328 2830 2945 3207 conicto +2563 3584 1739 3584 conicto +1088 3584 lineto +512 4032 moveto +1656 4032 lineto +2817 4032 3360 3544 conicto +3904 3057 3904 2020 conicto +3904 978 3358 489 conicto +2812 0 1656 0 conicto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 4.011975 7.847500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 4.456558 7.847500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 4.688837 7.847500 translate 0.035278 -0.035278 scale 
+start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 5.150899 7.847500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 5.582988 7.847500 translate 0.035278 -0.035278 scale +start_ol +640 704 moveto +1216 704 lineto +1216 0 lineto +640 0 lineto +640 704 lineto +end_ol grestore +gsave 5.815268 7.847500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 6.047547 7.847500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 6.492130 7.847500 translate 0.035278 -0.035278 scale +start_ol +2048 4160 moveto +2048 3712 lineto +1571 3712 lineto +1299 3712 1193 3612 conicto +1088 3513 1088 3254 conicto +1088 3008 lineto +1920 3008 
lineto +1920 2624 lineto +1088 2624 lineto +1088 0 lineto +576 0 lineto +576 2624 lineto +128 2624 lineto +128 3008 lineto +576 3008 lineto +576 3202 lineto +576 3702 816 3931 conicto +1056 4160 1577 4160 conicto +2048 4160 lineto +end_ol grestore +gsave 6.749387 7.847500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 6.981667 7.847500 translate 0.035278 -0.035278 scale +start_ol +2944 3904 moveto +2944 3392 lineto +2643 3521 2376 3584 conicto +2109 3648 1860 3648 conicto +1429 3648 1194 3480 conicto +960 3313 960 3004 conicto +960 2745 1115 2613 conicto +1271 2481 1704 2400 conicto +2023 2332 lineto +2626 2215 2913 1920 conicto +3200 1626 3200 1133 conicto +3200 544 2808 240 conicto +2416 -64 1659 -64 conicto +1373 -64 1051 0 conicto +729 65 384 192 conicto +384 768 lineto +714 577 1031 480 conicto +1348 384 1654 384 conicto +2119 384 2371 568 conicto +2624 753 2624 1095 conicto +2624 1393 2447 1561 conicto +2271 1730 1869 1814 conicto +1548 1879 lineto +933 1999 658 2254 conicto +384 2509 384 2964 conicto +384 3490 755 3793 conicto +1127 4096 1780 4096 conicto +2060 4096 2350 4048 conicto +2640 4000 2944 3904 conicto +end_ol grestore +gsave 7.443728 7.847500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 7.888311 7.847500 translate 0.035278 -0.035278 scale +start_ol +512 1177 moveto +512 3008 lineto +1024 3008 lineto +1024 1196 lineto +1024 790 1184 587 conicto +1344 384 1664 384 conicto +2049 384 2272 628 conicto +2496 872 2496 1293 conicto +2496 
3008 lineto +3008 3008 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2320 188 2087 62 conicto +1854 -64 1546 -64 conicto +1038 -64 775 252 conicto +512 568 512 1177 conicto +1744 3072 moveto +1744 3072 lineto +end_ol grestore +gsave 8.350373 7.847500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 8.635106 7.847500 translate 0.035278 -0.035278 scale +start_ol +2688 2816 moveto +2688 2368 lineto +2479 2496 2268 2560 conicto +2058 2624 1843 2624 conicto +1363 2624 1097 2329 conicto +832 2035 832 1504 conicto +832 973 1097 678 conicto +1363 384 1843 384 conicto +2058 384 2268 448 conicto +2479 512 2688 640 conicto +2688 192 lineto +2482 64 2261 0 conicto +2041 -64 1793 -64 conicto +1116 -64 718 360 conicto +320 784 320 1504 conicto +320 2235 722 2653 conicto +1124 3072 1825 3072 conicto +2052 3072 2268 3008 conicto +2485 2944 2688 2816 conicto +end_ol grestore +gsave 9.034726 7.847500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 9.481805 7.847500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 
2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +gsave 9.861449 7.847500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 10.093729 7.847500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 10.378462 7.847500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 10.580769 7.847500 translate 0.035278 -0.035278 scale +start_ol +2858 2449 moveto +3041 2768 3296 2920 conicto +3551 3072 3895 3072 conicto +4360 3072 4612 2748 conicto +4864 2425 4864 1829 conicto +4864 0 lineto +4352 0 lineto +4352 1813 lineto +4352 2225 4203 2424 conicto +4055 2624 3750 2624 conicto +3377 2624 3160 2379 conicto +2944 2135 2944 1713 conicto +2944 0 lineto +2432 0 lineto +2432 1813 lineto +2432 2227 2283 2425 conicto +2135 2624 1824 2624 conicto +1457 2624 
1240 2378 conicto +1024 2132 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1190 2822 1423 2947 conicto +1656 3072 1975 3072 conicto +2298 3072 2524 2912 conicto +2750 2753 2858 2449 conicto +end_ol grestore +gsave 11.290100 7.847500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 11.737180 7.847500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +1.000000 1.000000 1.000000 srgb +n 4.600000 9.150000 m 4.600000 17.800000 l 25.000000 17.800000 l 25.000000 9.150000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 4.600000 9.150000 m 4.600000 17.800000 l 25.000000 17.800000 l 
25.000000 9.150000 l cp s +gsave 5.050000 11.581500 translate 0.035278 -0.035278 scale +start_ol +3264 588 moveto +3264 1664 lineto +2368 1664 lineto +2368 2112 lineto +3840 2112 lineto +3840 389 lineto +3524 165 3143 50 conicto +2763 -64 2331 -64 conicto +1386 -64 853 485 conicto +320 1034 320 2015 conicto +320 2998 849 3547 conicto +1379 4096 2318 4096 conicto +2709 4096 3061 3998 conicto +3414 3901 3712 3712 conicto +3712 3136 lineto +3412 3391 3075 3519 conicto +2738 3648 2366 3648 conicto +1632 3648 1264 3237 conicto +896 2827 896 2015 conicto +896 1205 1261 794 conicto +1627 384 2355 384 conicto +2639 384 2862 433 conicto +3086 483 3264 588 conicto +end_ol grestore +gsave 5.614472 11.581500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 6.061552 11.581500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 6.523613 11.581500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 
640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 6.970693 11.581500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 7.270408 11.581500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 7.717488 11.581500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 
lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 8.002221 11.581500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 8.449301 11.581500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 8.681580 11.581500 translate 0.035278 -0.035278 scale +start_ol +1399 4032 moveto +1856 4032 lineto +457 -512 lineto +0 -512 lineto +1399 4032 lineto +end_ol grestore +gsave 8.926344 11.581500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 9.158624 11.581500 translate 0.035278 -0.035278 scale +start_ol +2445 1914 moveto +2620 1854 2786 1657 conicto +2953 1461 3120 1117 conicto +3648 0 lineto +3087 0 lineto +2571 1049 lineto +2373 1460 2186 1594 conicto +2000 1728 1678 1728 conicto +1088 1728 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +1769 4032 lineto +2458 4032 2797 3746 conicto +3136 3460 3136 2884 conicto +3136 2507 2959 2258 conicto +2782 2010 2445 1914 conicto +1088 3584 moveto +1088 2176 lineto +1769 2176 lineto +2160 2176 2360 2355 conicto +2560 2535 2560 2883 conicto +2560 3231 2360 3407 conicto +2160 3584 1769 3584 conicto +1088 3584 lineto +end_ol grestore +gsave 9.633179 11.581500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto 
+2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 10.080258 11.581500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 10.527338 11.581500 translate 0.035278 -0.035278 scale +start_ol +2496 2560 moveto +2496 4160 lineto +3008 4160 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2342 188 2106 62 conicto +1870 -64 1540 -64 conicto +999 -64 659 368 conicto +320 800 320 1504 conicto +320 2208 659 2640 conicto +999 3072 1540 3072 conicto +1870 3072 2106 2946 conicto +2342 2820 2496 2560 conicto +832 1504 moveto +832 980 1053 682 conicto +1275 384 1663 384 conicto +2050 384 2273 682 conicto +2496 980 2496 1504 conicto +2496 2028 2273 2326 conicto +2050 2624 1663 2624 conicto +1275 2624 1053 2326 conicto +832 2028 832 1504 conicto +end_ol grestore +gsave 10.989400 11.581500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 11.221679 11.581500 translate 0.035278 -0.035278 scale 
+start_ol +2944 3904 moveto +2944 3392 lineto +2643 3521 2376 3584 conicto +2109 3648 1860 3648 conicto +1429 3648 1194 3480 conicto +960 3313 960 3004 conicto +960 2745 1115 2613 conicto +1271 2481 1704 2400 conicto +2023 2332 lineto +2626 2215 2913 1920 conicto +3200 1626 3200 1133 conicto +3200 544 2808 240 conicto +2416 -64 1659 -64 conicto +1373 -64 1051 0 conicto +729 65 384 192 conicto +384 768 lineto +714 577 1031 480 conicto +1348 384 1654 384 conicto +2119 384 2371 568 conicto +2624 753 2624 1095 conicto +2624 1393 2447 1561 conicto +2271 1730 1869 1814 conicto +1548 1879 lineto +933 1999 658 2254 conicto +384 2509 384 2964 conicto +384 3490 755 3793 conicto +1127 4096 1780 4096 conicto +2060 4096 2350 4048 conicto +2640 4000 2944 3904 conicto +end_ol grestore +gsave 11.683741 11.581500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 12.128324 11.581500 translate 0.035278 -0.035278 scale +start_ol +512 1177 moveto +512 3008 lineto +1024 3008 lineto +1024 1196 lineto +1024 790 1184 587 conicto +1344 384 1664 384 conicto +2049 384 2272 628 conicto +2496 872 2496 1293 conicto +2496 3008 lineto +3008 3008 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2320 188 2087 62 conicto +1854 -64 1546 -64 conicto +1038 -64 775 252 conicto +512 568 512 1177 conicto +1744 3072 moveto +1744 3072 lineto +end_ol grestore +gsave 12.590385 11.581500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 
1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 12.875118 11.581500 translate 0.035278 -0.035278 scale +start_ol +2688 2816 moveto +2688 2368 lineto +2479 2496 2268 2560 conicto +2058 2624 1843 2624 conicto +1363 2624 1097 2329 conicto +832 2035 832 1504 conicto +832 973 1097 678 conicto +1363 384 1843 384 conicto +2058 384 2268 448 conicto +2479 512 2688 640 conicto +2688 192 lineto +2482 64 2261 0 conicto +2041 -64 1793 -64 conicto +1116 -64 718 360 conicto +320 784 320 1504 conicto +320 2235 722 2653 conicto +1124 3072 1825 3072 conicto +2052 3072 2268 3008 conicto +2485 2944 2688 2816 conicto +end_ol grestore +gsave 13.274738 11.581500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 5.050000 13.581500 translate 0.035278 -0.035278 scale +start_ol +512 4032 moveto +1088 4032 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 5.264792 13.581500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto 
+1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 5.726854 13.581500 translate 0.035278 -0.035278 scale +start_ol +192 3008 moveto +719 3008 lineto +1664 483 lineto +2609 3008 lineto +3136 3008 lineto +2002 0 lineto +1326 0 lineto +192 3008 lineto +end_ol grestore +gsave 6.158943 13.581500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 6.606023 13.581500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 6.905738 13.581500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 7.190471 13.581500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 7.422750 13.581500 translate 
0.035278 -0.035278 scale +start_ol +1088 3584 moveto +1088 448 lineto +1739 448 lineto +2563 448 2945 826 conicto +3328 1204 3328 2020 conicto +3328 2830 2945 3207 conicto +2563 3584 1739 3584 conicto +1088 3584 lineto +512 4032 moveto +1656 4032 lineto +2817 4032 3360 3544 conicto +3904 3057 3904 2020 conicto +3904 978 3358 489 conicto +2812 0 1656 0 conicto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 7.984725 13.581500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 8.187032 13.581500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 8.486747 13.581500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 8.933827 13.581500 translate 0.035278 -0.035278 scale +start_ol +2688 2816 
moveto +2688 2368 lineto +2479 2496 2268 2560 conicto +2058 2624 1843 2624 conicto +1363 2624 1097 2329 conicto +832 2035 832 1504 conicto +832 973 1097 678 conicto +1363 384 1843 384 conicto +2058 384 2268 448 conicto +2479 512 2688 640 conicto +2688 192 lineto +2482 64 2261 0 conicto +2041 -64 1793 -64 conicto +1116 -64 718 360 conicto +320 784 320 1504 conicto +320 2235 722 2653 conicto +1124 3072 1825 3072 conicto +2052 3072 2268 3008 conicto +2485 2944 2688 2816 conicto +end_ol grestore +gsave 9.333447 13.581500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 9.565726 13.581500 translate 0.035278 -0.035278 scale +start_ol +2179 3648 moveto +1590 3648 1243 3209 conicto +896 2771 896 2015 conicto +896 1261 1243 822 conicto +1590 384 2179 384 conicto +2768 384 3112 822 conicto +3456 1261 3456 2015 conicto +3456 2771 3112 3209 conicto +2768 3648 2179 3648 conicto +2179 4096 moveto +3022 4096 3527 3530 conicto +4032 2965 4032 2015 conicto +4032 1067 3527 501 conicto +3022 -64 2179 -64 conicto +1333 -64 826 500 conicto +320 1064 320 2015 conicto +320 2965 826 3530 conicto +1333 4096 2179 4096 conicto +end_ol grestore +gsave 10.140186 13.581500 translate 0.035278 -0.035278 scale +start_ol +1024 448 moveto +1024 -1152 lineto +512 -1152 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1178 2820 1414 2946 conicto +1650 3072 1977 3072 conicto +2521 3072 2860 2640 conicto +3200 2208 3200 1504 conicto +3200 800 2860 368 conicto +2521 -64 1977 -64 conicto +1650 -64 1414 62 conicto +1178 188 1024 448 conicto +2688 1504 moveto +2688 2028 2466 2326 conicto +2244 2624 1856 2624 conicto +1468 2624 1246 2326 conicto +1024 2028 1024 1504 conicto +1024 980 1246 682 conicto +1468 384 1856 384 conicto +2244 384 2466 682 conicto +2688 980 2688 1504 conicto +end_ol grestore +gsave 10.602248 13.581500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto 
+2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 11.049328 13.581500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 11.349043 13.581500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 11.796123 13.581500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 
219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 12.080856 13.581500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 12.525438 13.581500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 12.825153 13.581500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 13.057433 13.581500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 13.502015 13.581500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 
1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 13.964077 13.581500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 14.196356 13.581500 translate 0.035278 -0.035278 scale +start_ol +2944 3904 moveto +2944 3392 lineto +2643 3521 2376 3584 conicto +2109 3648 1860 3648 conicto +1429 3648 1194 3480 conicto +960 3313 960 3004 conicto +960 2745 1115 2613 conicto +1271 2481 1704 2400 conicto +2023 2332 lineto +2626 2215 2913 1920 conicto +3200 1626 3200 1133 conicto +3200 544 2808 240 conicto +2416 -64 1659 -64 conicto +1373 -64 1051 0 conicto +729 65 384 192 conicto +384 768 lineto +714 577 1031 480 conicto +1348 384 1654 384 conicto +2119 384 2371 568 conicto +2624 753 2624 1095 conicto +2624 1393 2447 1561 conicto +2271 1730 1869 1814 conicto +1548 1879 lineto +933 1999 658 2254 conicto +384 2509 384 2964 conicto +384 3490 755 3793 conicto +1127 4096 1780 4096 conicto +2060 4096 2350 4048 conicto +2640 4000 2944 3904 conicto +end_ol grestore +gsave 14.658418 13.581500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 15.103001 13.581500 translate 0.035278 -0.035278 scale +start_ol +512 1177 moveto +512 3008 lineto +1024 3008 lineto +1024 1196 lineto +1024 790 1184 587 conicto +1344 384 1664 384 conicto +2049 384 
2272 628 conicto +2496 872 2496 1293 conicto +2496 3008 lineto +3008 3008 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2320 188 2087 62 conicto +1854 -64 1546 -64 conicto +1038 -64 775 252 conicto +512 568 512 1177 conicto +1744 3072 moveto +1744 3072 lineto +end_ol grestore +gsave 15.565063 13.581500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 15.849796 13.581500 translate 0.035278 -0.035278 scale +start_ol +2688 2816 moveto +2688 2368 lineto +2479 2496 2268 2560 conicto +2058 2624 1843 2624 conicto +1363 2624 1097 2329 conicto +832 2035 832 1504 conicto +832 973 1097 678 conicto +1363 384 1843 384 conicto +2058 384 2268 448 conicto +2479 512 2688 640 conicto +2688 192 lineto +2482 64 2261 0 conicto +2041 -64 1793 -64 conicto +1116 -64 718 360 conicto +320 784 320 1504 conicto +320 2235 722 2653 conicto +1124 3072 1825 3072 conicto +2052 3072 2268 3008 conicto +2485 2944 2688 2816 conicto +end_ol grestore +gsave 16.249416 13.581500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 5.050000 15.581500 translate 0.035278 -0.035278 scale 
+start_ol +2944 3904 moveto +2944 3392 lineto +2643 3521 2376 3584 conicto +2109 3648 1860 3648 conicto +1429 3648 1194 3480 conicto +960 3313 960 3004 conicto +960 2745 1115 2613 conicto +1271 2481 1704 2400 conicto +2023 2332 lineto +2626 2215 2913 1920 conicto +3200 1626 3200 1133 conicto +3200 544 2808 240 conicto +2416 -64 1659 -64 conicto +1373 -64 1051 0 conicto +729 65 384 192 conicto +384 768 lineto +714 577 1031 480 conicto +1348 384 1654 384 conicto +2119 384 2371 568 conicto +2624 753 2624 1095 conicto +2624 1393 2447 1561 conicto +2271 1730 1869 1814 conicto +1548 1879 lineto +933 1999 658 2254 conicto +384 2509 384 2964 conicto +384 3490 755 3793 conicto +1127 4096 1780 4096 conicto +2060 4096 2350 4048 conicto +2640 4000 2944 3904 conicto +end_ol grestore +gsave 5.512062 15.581500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 5.796795 15.581500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 6.241377 15.581500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 
conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 6.526110 15.581500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 6.973190 15.581500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 7.205469 15.581500 translate 0.035278 -0.035278 scale +start_ol +1088 3584 moveto +1088 2048 lineto +1769 2048 lineto +2147 2048 2353 2248 conicto +2560 2448 2560 2817 conicto +2560 3184 2353 3384 conicto +2147 3584 1769 3584 conicto +1088 3584 lineto +512 4032 moveto +1769 4032 lineto +2444 4032 2790 3723 conicto +3136 3414 3136 2817 conicto +3136 2215 2790 1907 conicto +2444 1600 1769 1600 conicto +1088 1600 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 7.632565 15.581500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 7.917298 15.581500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 
conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 8.361881 15.581500 translate 0.035278 -0.035278 scale +start_ol +1024 448 moveto +1024 -1152 lineto +512 -1152 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1178 2820 1414 2946 conicto +1650 3072 1977 3072 conicto +2521 3072 2860 2640 conicto +3200 2208 3200 1504 conicto +3200 800 2860 368 conicto +2521 -64 1977 -64 conicto +1650 -64 1414 62 conicto +1178 188 1024 448 conicto +2688 1504 moveto +2688 2028 2466 2326 conicto +2244 2624 1856 2624 conicto +1468 2624 1246 2326 conicto +1024 2028 1024 1504 conicto +1024 980 1246 682 conicto +1468 384 1856 384 conicto +2244 384 2466 682 conicto +2688 980 2688 1504 conicto +end_ol grestore +gsave 8.823942 15.581500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 9.271022 15.581500 translate 0.035278 -0.035278 scale +start_ol 
+2496 1535 moveto +2496 2053 2277 2338 conicto +2058 2624 1663 2624 conicto +1270 2624 1051 2338 conicto +832 2053 832 1535 conicto +832 1019 1051 733 conicto +1270 448 1663 448 conicto +2058 448 2277 733 conicto +2496 1019 2496 1535 conicto +3008 404 moveto +3008 -384 2670 -768 conicto +2332 -1152 1635 -1152 conicto +1377 -1152 1148 -1105 conicto +920 -1058 704 -960 conicto +704 -448 lineto +917 -579 1124 -641 conicto +1332 -704 1547 -704 conicto +2023 -704 2259 -452 conicto +2496 -201 2496 308 conicto +2496 512 lineto +2344 255 2107 127 conicto +1870 0 1540 0 conicto +991 0 655 420 conicto +320 841 320 1535 conicto +320 2231 655 2651 conicto +991 3072 1540 3072 conicto +1870 3072 2107 2944 conicto +2344 2817 2496 2560 conicto +2496 3008 lineto +3008 3008 lineto +3008 404 lineto +end_ol grestore +gsave 9.733084 15.581500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 10.180163 15.581500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto 
+512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 10.464896 15.581500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 10.909479 15.581500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 11.209194 15.581500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 11.441473 15.581500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 11.886056 15.581500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto 
+512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 12.348118 15.581500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 12.580397 15.581500 translate 0.035278 -0.035278 scale +start_ol +1088 3584 moveto +1088 448 lineto +1739 448 lineto +2563 448 2945 826 conicto +3328 1204 3328 2020 conicto +3328 2830 2945 3207 conicto +2563 3584 1739 3584 conicto +1088 3584 lineto +512 4032 moveto +1656 4032 lineto +2817 4032 3360 3544 conicto +3904 3057 3904 2020 conicto +3904 978 3358 489 conicto +2812 0 1656 0 conicto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 13.142372 15.581500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 13.344679 15.581500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +gsave 13.724323 15.581500 translate 0.035278 -0.035278 scale +start_ol +512 4160 moveto +1024 4160 lineto +1024 1711 lineto +2503 3008 lineto +3136 3008 lineto +1536 
1601 lineto +3200 0 lineto +2554 0 lineto +1024 1469 lineto +1024 0 lineto +512 0 lineto +512 4160 lineto +end_ol grestore +showpage diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/main.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/main.tex new file mode 100644 index 0000000000000000000000000000000000000000..bd73eed10610c83467132236df39ef53ccdf2331 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/main.tex @@ -0,0 +1,101 @@ +\documentclass[a4paper,12pt,dvips]{article} +%amsmath +\usepackage{amssymb} +\usepackage{amsmath} +\usepackage{amscd} +\usepackage{latexsym} +\usepackage{epsfig} +\usepackage{multirow} +\usepackage{color} +\usepackage{dsfont} +%\usepackage[bf,footnotesize]{caption} +%\setlength{\captionmargin}{15pt} +\usepackage{hyperref} +\usepackage{subfigure} +\usepackage{pifont} +\usepackage{algorithm} +\usepackage{algorithmic} + +%\setlength{\parindent}{0pt} +% Absatzabstand +\setlength{\parskip}{3pt plus 3pt} +% Hoehe der Kopfzeilen +%\setlength{\headheight}{26pt} + +\input{command} + +\date{\today} +\title{tmLQCD package documentation} + +\begin{document} +%\begin{titlepage} +\begin{center} + {\Large\bf tmLQCD Package Documentation}\\ +\end{center} + +\tableofcontents + +\begin{flushright} + Copyright \textcopyright\ 2009 Carsten Urbach +\end{flushright} + +\section{Theoretical Background} + +\myinput{basis} + +\section{Installation and Usage} + +\myinput{install} +\myinput{input} +\myinput{output} +\myinput{gensources} + +\section{Implementation} + +\myinput{overview} +\myinput{components} +\myinput{test} + +%\myinput{eo_pre} +%\myinput{martins-trick} +%\myinput{deflation} +%\myinput{c-code} +%\myinput{integrationschemes} + +%\myinput{parallel} +%\myinput{operator} +%\myinput{online} + +\section{File Formats and IO} +\myinput{prop_format.tex} + +\section{Interfaces to external QCD libraries} +\myinput{quda.tex} + +\clearpage +\bibliographystyle{h-physrev5} +\bibliography{bibliography} +\clearpage + +\begin{appendix} + 
\section{$\gamma$ and Pauli Matrices} + \myinput{gamma} + + \section{Initialising the PHMC} + \myinput{root} + \section{Even/Odd Preconditioning} + \myinput{eo_pre} + \myinput{martins-trick} + \myinput{rational} + + \section{Deflation} + \myinput{deflation} +\end{appendix} + + +\end{document} + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/martins-trick.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/martins-trick.tex new file mode 100644 index 0000000000000000000000000000000000000000..2ff9bed4c4955c53591239a99885718fea410d91 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/martins-trick.tex @@ -0,0 +1,274 @@ +\subsection{Hasenbusch trick for dynamical tmQCD} + +%The trick presented in \cite{Hasenbusch:2002ai} is based on the +%observation that writing +%\[ +%\det[Q^2] = \det[W^2]\cdot\det[W^{-2}Q^2] +%\] +%is advantagous for the HMC, if the condition number of $W^{2}$ and of +%$W^{-2}Q^{2}$ is significantly reduced compared to the condition number +%of only $Q^2$. Thus if we define +%\[ +%\begin{split} +% \Qpm = \gamma_5 D_W \pm i\mu_1\, ,\\ +% \Wpm = \gamma_5 D_W \pm i\mu_2\, ,\\ +%\end{split} +%\] +%with $\mu_2 =\mu_1+\Delta\mu$ it follows immidiatly that the condition number of +%$\Wp\Wm$ is lower than the one of $\Qp\Qm$ if for $\lambda_{\textrm{min}}$ +%and $\lambda_{\textrm{max}}$ the lowest and the largest eigenvalue of +%$\Qp\Qm$, respectively $|\lambda_{\textrm{min}}|\ll\mu_2^2\ll|\lambda_{\textrm{max}}|$ +%holds: It is $|\lambda_{\textrm{max}}|/\mu_2^2$. The condition number of +%$W^{-2}Q^{2}$ contrariwise is $\mu_2^2/|\lambda_{\textrm{min}}|^2$. We can take +%$\mu_1$ which is a lower bound for $|\lambda_{\textrm{min}}|$ to write down +%the condition numbers: +%\[ +%k_{W^2} = \frac{|\lambda_{\textrm{max}}|}{\mu_2^2}\, ,\quad +%k_{W^{-2}Q^{2}} \leq \frac{\mu_2^2}{\mu_1^2}\, . 
+%\] +%This again leads to an optimal choice for +%$\mu_2^2=\sqrt{|\lambda_{\textrm{max}}|\cdot\mu_1}$. It is also illuminating to take +%a look at the force coming from $W^{-2}Q^{2}$. Noticing that +%\[ +%\Wpm = \Qpm \pm i\Delta\mu\, , +%\] +%we see immediately that +%\begin{equation} +% \label{eq:mt01} +% \Wpm^{-1}\Qpm = \mathds{1} \mp i\Delta\mu\Wpm^{-1}\, . +%\end{equation} +%Now we again write the determinant with pseudo fermion +%fields: +%\[ +%\begin{split} +% \det[\Wp^{-1}\Qp\Qm\Wm^{-1}] &= \int D[\phi]D[\phi^\dagger] +% \exp(-\phi^\dagger\Wm\Qm^{-1}\Qp^{-1}\Wp\phi) \\ +% &= \int D[\phi]D[\phi^\dagger] \exp(-S_F)\, . +%\end{split} +%\] +%Using equation (\ref{eq:mt01}) we get the following expression for +%$S_F$: +%\begin{equation} +% \label{eq:mt02} +% S_F = \phi^\dagger(\mathds{1} +i\Delta\mu\Qp^{-1} +% -i\Delta\mu\Qm^{-1}+\Delta\mu^2(\Qp\Qm)^{-1}) \phi\, . +%\end{equation} +%Without explicitly computing the variation of $S_F$ with respect to +%the gauge fields we can see that it can only contain terms proportional +%to $\Delta\mu$ and $\Delta\mu^2$, since $S_F$ is a constant up to terms of this +%order. If we take $\Delta\mu$ to be small we expect a smaller force coming +%from $W^{-2}Q^{2}$ than it would come from $Q^2$ and therefore a +%smoother evolution of the Hamiltonian. + +%This we will now apply on top of even odd preconditioning. For this we +%start directly with the even odd preconditioned matrices $\hQpm$, but +%probably it is also possible to start from the results obtained in +%this subsection. Nevertheless the above discussion was useful to +%understand a possible gain with this trick. + +We shall now discuss the trick presented in +\cite{Hasenbusch:2002ai} (mass preconditioning) for dynamical twisted +mass lattice QCD. +Let $\hQpm$ and $\hWpm$ be two matrices as defined in (\ref{eq:eo4}) +with two parameters $\mu_1$ and $\mu_2$, respectively. The idea is to +choose $\mu_2$ bigger than $\mu_1$. 
With this we can +write +\begin{equation} + \label{eq:mt0} + \det[\hQp\hQm] = \det[\hWp\hWm]\cdot\det[\hWp^{-1}\hQp\hQm\hWm^{-1}]. +\end{equation} +The first term on the right hand side of (\ref{eq:mt0}) can be handled +as described in the previous section. The second term needs some +further investigation: we again write the determinant as an integral +over pseudo fermion fields: +\begin{equation} + \label{eq:mt1} + \begin{split} + \det[\hWp^{-1}\hQp\hQm\hWm^{-1}] &\propto \int + D[\phi_o]D[\phi_o^\dagger]\exp(-\phi_o^\dagger (\hWp^{-1}\hQp\hQm\hWm^{-1})^{-1} + \phi_o) \\ + &= \int D[\phi_o]D[\phi_o^\dagger]\exp(-\phi_o^\dagger \hWm\hQm^{-1}\hQp^{-1}\hWp\phi_o) \\ + &= \int D[\phi_o]D[\phi_o^\dagger]\exp(-S_{F_2}) + \end{split} +\end{equation} +The variation of $S_{F_2}$, needed for the HMC, then reads as follows: +\begin{equation} + \label{eq:mt2} + \begin{split} + \delta S_{F_2} &= \phi_o^\dagger[\delta\hWm(\hQp\hQm)^{-1}\hWp + +\hWm(\hQp\hQm)^{-1}\delta\hWp]\phi_o\\ + &-\phi_o^\dagger[\hWm\hQm^{-1}\delta\hQm(\hQp\hQm)^{-1}\hWp + + \hWm(\hQp\hQm)^{-1}\delta\hQp\hQp^{-1}\hWp]\phi_o + \end{split} +\end{equation} +If we define now +\begin{equation} + \label{eq:mt3} + X_W = (\hQp\hQm)^{-1}\hWp\phi_o\, ,\quad Y_W = \hQp^{-1}\hWp\phi_o = \hQm + X_W\, , +\end{equation} +we can rewrite (\ref{eq:mt2}): +\begin{equation} + \label{eq:mt4} + \begin{split} + \delta S_{F_2} &= \phi_o^\dagger\delta\hWm X_W + X_W^\dagger\delta\hWp\phi_o\\ + &-Y_W^\dagger\delta\hQm X_W - X_W^\dagger\delta\hQp Y_W\, . 
+ \end{split} +\end{equation} +Recalling the variation of $\hQpm$ (and of $\hWpm$): +\begin{equation} + \label{eq:mt5} + \begin{split} + \delta\hQpm &= \gamma_5\left(-\delta M_{oe}(1\pm i\mu_1\gamma_5 )^{-1}M_{eo} - + M_{oe}(1\pm i\mu_1\gamma_5 )^{-1}\delta M_{eo}\right)\, , \\ + \delta\hWpm &= \gamma_5\left(-\delta M_{oe}(1\pm i\mu_2\gamma_5 )^{-1}M_{eo} - + M_{oe}(1\pm i\mu_2\gamma_5)^{-1}\delta M_{eo}\right)\, , + \end{split} +\end{equation} +we find: +\begin{equation} + \label{eq:mt6} + \begin{split} + \delta S_{F_2} &= Y_2^\dagger \delta Q X_2 + X_2^\dagger\delta QY_2 -X_1^\dagger\delta Q Y_1 - + Y_1^\dagger\delta Q X_1\\ + &= 2\re\left[Y_2^\dagger \delta Q X_2 - Y_1^\dagger\delta Q X_1 \right]\, , + \end{split} +\end{equation} +where the fields $X_{1,2}$, $Y_{1,2}$ and the matrix $\delta Q$ are now +defined over the full lattice as follows: +\begin{equation} + \label{eq:mt7} + \begin{split} + Y_1 &= + \begin{pmatrix} + -(1+i\mu_{1}\gamma_5)^{-1}M_{eo}Y_W \\ Y_W\\ + \end{pmatrix}\, ,\quad + Y_2 = + \begin{pmatrix} + -(1+i\mu_{2}\gamma_5)^{-1}M_{eo}\phi_o \\ \phi_o\\ + \end{pmatrix},\\ + X_{1,2} &= + \begin{pmatrix} + -(1-i\mu_{1,2}\gamma_5)^{-1}M_{eo}X_W \\ X_W\\ + \end{pmatrix},\quad + \delta Q = \gamma_5 + \begin{pmatrix} + 0 & \delta M_{eo}\\ + \delta M_{oe} & 0\\ + \end{pmatrix}\, . + \end{split} +\end{equation} +The bosonic part is again quadratic in the fields $\phi_o$ and can +therefore be generated at the beginning of each molecular dynamics +trajectory with: +\begin{equation} + \label{eq:mt8} + \phi_o = \hWp^{-1}\hQp R +\end{equation} +where $R$ is again a random spinor field taken from a Gaussian +distribution with norm one. 
+ +This can also be used with symmetrical even/odd preconditioning +by re-defining $Y_{1,2}$ and $X_{1,2}$ +\begin{equation} + \label{eq:mt9} + \begin{split} + Y_1 &= + \begin{pmatrix} + -(1+i\mu_{1}\gamma_5)^{-1}M_{eo}(1+i\mu_{1}\gamma_5)^{-1}Y_W \\ Y_W\\ + \end{pmatrix}\, ,\\ + Y_2 &= + \begin{pmatrix} + -(1+i\mu_{2}\gamma_5)^{-1}M_{eo}(1+i\mu_{2}\gamma_5)^{-1}\phi_o \\ \phi_o\\ + \end{pmatrix},\\ + X_{1,2} &= + \begin{pmatrix} + -(1-i\mu_{1,2}\gamma_5)^{-1}M_{eo}(1-i\mu_{1,2}\gamma_5)^{-1}X_W \\ X_W\\ + \end{pmatrix}\, .\\ + \end{split} +\end{equation} + +\subsubsection{Hasenbusch-Trick and Twisted-Clover} + +In order to avoid recomputing $(1\pm i \mu\gamma_5 + T_{ee}(x))^{-1}$ +too often, the following version of mass preconditioning -- which is +close to the original paper~\cite{Hasenbusch:2002ai} -- might be best +suited for Twisted-Clover: define +\begin{equation} + \label{eq:swWhat} + \hWpm = \hQpm \pm i \rho = \gamma_5(\hat{M_\pm} \pm i \rho\gamma_5) +\end{equation} +with a real mass-shift $\rho$. Here $\hQpm$ is now the even/odd +preconditioned clover operator eq.~(\ref{eq:eosw4}) +\[ +\hQpm = \gamma_5((1 + T_{oo} \pm i\tilde\mu\gamma_5) - + M_{oe}( 1 + T_{ee} \pm i\tilde\mu\gamma_5 )^{-1}M_{eo})\,. +\] +Then we have $\hWp = \hWm^\dagger$ and $\delta \hQpm = +\delta\hWpm$. The latter is given by eq.~(\ref{eq:eosw7}) +\begin{equation*} + \begin{split} + \delta \hQpm = \gamma_5 & \left( \delta T_{oo}-\delta M_{oe}(M_{ee}^\pm )^{-1}M_{eo} - + M_{oe}(M_{ee}^\pm )^{-1}\delta M_{eo}\right. \\ + &\left. + M_{oe}(M_{ee}^\pm )^{-1} \delta T_{ee} (M_{ee}^\pm )^{-1} M_{eo}\right), + \end{split} +\end{equation*} +which is in particular independent of $\rho$. +The pseudo-fermion action of a determinant ratio is then +given by +\[ +S_\mathrm{PF} = \phi^\dagger\hWm(\hQp\hQm)^{-1}\hWp\phi +\] +and the variation of $S_\mathrm{PF}$ is again given by +eq.~(\ref{eq:mt2}). 
We again define $X_W$ and $Y_W$ as in +eq.~(\ref{eq:mt3}): +\begin{equation*} + X_W = (\hQp\hQm)^{-1}\hWp\phi_o\, ,\quad Y_W = \hQp^{-1}\hWp\phi_o = \hQm + X_W\, . +\end{equation*} +With this definition the variation of +$S_\mathrm{PF}$ reads +\begin{equation} + \label{eq:swdeltaS} + \begin{split} + \delta S_\mathrm{PF} &= \phi^\dagger\delta\hQm X_W + X_W^\dagger + \delta \hQp\phi - Y_W^\dagger\delta \hQm X_W - X_W^\dagger\delta\hQp + Y_W\\ + &= (\phi- Y_W)^\dagger\delta\hQm X_W + X_W^\dagger\delta\hQp(\phi + - Y_W) \\ + &= 2\re[ (\phi- Y_W)^\dagger\delta\hQm X_W ]\,. + \end{split} +\end{equation} +Defining analogously to eq.~(\ref{eq:mt7}) two full vectors $X,Y$ as +follows: +\begin{equation} + \label{eq:swXY} + \begin{split} + Y &= + \begin{pmatrix} + -(1+i\mu\gamma_5+T_{ee}(x))^{-1}M_{eo} (\phi-Y_W) \\ (\phi-Y_W)\\ + \end{pmatrix}\, ,\\ + X &= + \begin{pmatrix} + -(1-i\mu\gamma_5+T_{ee}(x))^{-1}M_{eo}X_W \\ X_W\\ + \end{pmatrix}\,, + \end{split} +\end{equation} +we get +\begin{equation} + \label{eq:swdS} + \delta S_\mathrm{PF} = 2 \re[Y^\dagger\, \delta Q\, X]\,, +\end{equation} +where again +\[ +\delta Q = \gamma_5 +\begin{pmatrix} + \delta T_{ee} & \delta M_{eo}\\ + \delta M_{oe} & \delta T_{oo}\\ +\end{pmatrix}\,. 
+\] + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/monomial.eps b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/monomial.eps new file mode 100644 index 0000000000000000000000000000000000000000..c397e0f40f38b600490d473f92befba610a5ab60 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/monomial.eps @@ -0,0 +1,2229 @@ +%!PS-Adobe-2.0 EPSF-2.0 +%%Title: /home/urbach/daten/workdir/etmc/cpc40/monomial.dia +%%Creator: Dia v0.96.1 +%%CreationDate: Fri Jan 30 15:51:22 2009 +%%For: urbach +%%Orientation: Portrait +%%Magnification: 1.0000 +%%BoundingBox: 0 0 817 494 +%%BeginSetup +%%EndSetup +%%EndComments +%%BeginProlog +[ /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/.notdef /.notdef /space /exclam /quotedbl /numbersign /dollar /percent /ampersand /quoteright +/parenleft /parenright /asterisk /plus /comma /hyphen /period /slash /zero /one +/two /three /four /five /six /seven /eight /nine /colon /semicolon +/less /equal /greater /question /at /A /B /C /D /E +/F /G /H /I /J /K /L /M /N /O +/P /Q /R /S /T /U /V /W /X /Y +/Z /bracketleft /backslash /bracketright /asciicircum /underscore /quoteleft /a /b /c +/d /e /f /g /h /i /j /k /l /m +/n /o /p /q /r /s /t /u /v /w +/x /y /z /braceleft /bar /braceright /asciitilde /.notdef /.notdef /.notdef +/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef /.notdef +/space /exclamdown /cent /sterling /currency /yen /brokenbar /section /dieresis /copyright +/ordfeminine /guillemotleft /logicalnot /hyphen 
/registered /macron /degree /plusminus /twosuperior /threesuperior +/acute /mu /paragraph /periodcentered /cedilla /onesuperior /ordmasculine /guillemotright /onequarter /onehalf +/threequarters /questiondown /Agrave /Aacute /Acircumflex /Atilde /Adieresis /Aring /AE /Ccedilla +/Egrave /Eacute /Ecircumflex /Edieresis /Igrave /Iacute /Icircumflex /Idieresis /Eth /Ntilde +/Ograve /Oacute /Ocircumflex /Otilde /Odieresis /multiply /Oslash /Ugrave /Uacute /Ucircumflex +/Udieresis /Yacute /Thorn /germandbls /agrave /aacute /acircumflex /atilde /adieresis /aring +/ae /ccedilla /egrave /eacute /ecircumflex /edieresis /igrave /iacute /icircumflex /idieresis +/eth /ntilde /ograve /oacute /ocircumflex /otilde /odieresis /divide /oslash /ugrave +/uacute /ucircumflex /udieresis /yacute /thorn /ydieresis] /isolatin1encoding exch def +/cp {closepath} bind def +/c {curveto} bind def +/f {fill} bind def +/a {arc} bind def +/ef {eofill} bind def +/ex {exch} bind def +/gr {grestore} bind def +/gs {gsave} bind def +/sa {save} bind def +/rs {restore} bind def +/l {lineto} bind def +/m {moveto} bind def +/rm {rmoveto} bind def +/n {newpath} bind def +/s {stroke} bind def +/sh {show} bind def +/slc {setlinecap} bind def +/slj {setlinejoin} bind def +/slw {setlinewidth} bind def +/srgb {setrgbcolor} bind def +/rot {rotate} bind def +/sc {scale} bind def +/sd {setdash} bind def +/ff {findfont} bind def +/sf {setfont} bind def +/scf {scalefont} bind def +/sw {stringwidth pop} bind def +/tr {translate} bind def + +/ellipsedict 8 dict def +ellipsedict /mtrx matrix put +/ellipse +{ ellipsedict begin + /endangle exch def + /startangle exch def + /yrad exch def + /xrad exch def + /y exch def + /x exch def /savematrix mtrx currentmatrix def + x y tr xrad yrad sc + 0 0 1 startangle endangle arc + savematrix setmatrix + end +} def + +/mergeprocs { +dup length +3 -1 roll +dup +length +dup +5 1 roll +3 -1 roll +add +array cvx +dup +3 -1 roll +0 exch +putinterval +dup +4 2 roll +putinterval +} bind 
def +/dpi_x 300 def +/dpi_y 300 def +/conicto { + /to_y exch def + /to_x exch def + /conic_cntrl_y exch def + /conic_cntrl_x exch def + currentpoint + /p0_y exch def + /p0_x exch def + /p1_x p0_x conic_cntrl_x p0_x sub 2 3 div mul add def + /p1_y p0_y conic_cntrl_y p0_y sub 2 3 div mul add def + /p2_x p1_x to_x p0_x sub 1 3 div mul add def + /p2_y p1_y to_y p0_y sub 1 3 div mul add def + p1_x p1_y p2_x p2_y to_x to_y curveto +} bind def +/start_ol { gsave 1.1 dpi_x div dup scale} bind def +/end_ol { closepath fill grestore } bind def +28.346000 -28.346000 scale +-1.950000 -20.100000 translate +%%EndProlog + + +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +1.000000 1.000000 1.000000 srgb +n 2.000000 2.750000 m 2.000000 20.050000 l 30.700000 20.050000 l 30.700000 2.750000 l f +0.000000 0.000000 0.000000 srgb +n 2.000000 2.750000 m 2.000000 20.050000 l 30.700000 20.050000 l 30.700000 2.750000 l cp s +1.000000 1.000000 1.000000 srgb +n 3.950000 3.850000 m 3.950000 6.450000 l 13.100000 6.450000 l 13.100000 3.850000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 3.950000 3.850000 m 3.950000 6.450000 l 13.100000 6.450000 l 13.100000 3.850000 l cp s +gsave 6.752500 5.327500 translate 0.035278 -0.035278 scale +start_ol +512 4032 moveto +1357 4032 lineto +2335 1282 lineto +3317 4032 lineto +4160 4032 lineto +4160 0 lineto +3584 0 lineto +3584 3539 lineto +2596 768 lineto +2076 768 lineto +1088 3539 lineto +1088 0 lineto +512 0 lineto +512 4032 lineto +end_ol grestore +gsave 7.381911 5.327500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto 
+320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 7.826493 5.327500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 8.288555 5.327500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 8.733138 5.327500 translate 0.035278 -0.035278 scale +start_ol +2858 2449 moveto +3041 2768 3296 2920 conicto +3551 3072 3895 3072 conicto +4360 3072 4612 2748 conicto +4864 2425 4864 1829 conicto +4864 0 lineto +4352 0 lineto +4352 1813 lineto +4352 2225 4203 2424 conicto +4055 2624 3750 2624 conicto +3377 2624 3160 2379 conicto +2944 2135 2944 1713 conicto +2944 0 lineto +2432 0 lineto +2432 1813 lineto +2432 2227 2283 2425 conicto +2135 2624 1824 2624 conicto +1457 2624 1240 2378 conicto +1024 2132 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1190 2822 1423 2947 conicto +1656 3072 1975 3072 conicto +2298 3072 2524 2912 conicto +2750 2753 2858 2449 conicto +end_ol grestore +gsave 9.442469 5.327500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 
lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 9.644776 5.327500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 10.091856 5.327500 translate 0.035278 -0.035278 scale +start_ol +512 4160 moveto +1024 4160 lineto +1024 0 lineto +512 0 lineto +512 4160 lineto +end_ol grestore +1.000000 1.000000 1.000000 srgb +n 18.250000 3.950000 m 18.250000 6.050000 l 29.550000 6.050000 l 29.550000 3.950000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 18.250000 3.950000 m 18.250000 6.050000 l 29.550000 6.050000 l 29.550000 3.950000 l cp s +gsave 18.700000 5.177500 translate 0.035278 -0.035278 scale +start_ol +0 4032 moveto +3392 4032 lineto +3392 3584 lineto +1984 3584 lineto +1984 0 lineto +1408 0 lineto +1408 3584 lineto +0 3584 lineto +0 4032 lineto +end_ol grestore +gsave 19.032184 5.177500 translate 0.035278 -0.035278 scale +start_ol +1799 -256 moveto +1587 -813 1386 -982 conicto +1185 -1152 848 -1152 conicto +448 -1152 lineto +448 -704 lineto +742 -704 lineto +948 -704 1062 -611 conicto +1177 -518 1315 -171 conicto +1405 54 lineto +192 3008 lineto +703 3008 lineto +1655 655 lineto +2606 3008 lineto +3136 3008 lineto +1799 -256 lineto +end_ol 
grestore +gsave 19.464274 5.177500 translate 0.035278 -0.035278 scale +start_ol +1024 448 moveto +1024 -1152 lineto +512 -1152 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1178 2820 1414 2946 conicto +1650 3072 1977 3072 conicto +2521 3072 2860 2640 conicto +3200 2208 3200 1504 conicto +3200 800 2860 368 conicto +2521 -64 1977 -64 conicto +1650 -64 1414 62 conicto +1178 188 1024 448 conicto +2688 1504 moveto +2688 2028 2466 2326 conicto +2244 2624 1856 2624 conicto +1468 2624 1246 2326 conicto +1024 2028 1024 1504 conicto +1024 980 1246 682 conicto +1468 384 1856 384 conicto +2244 384 2466 682 conicto +2688 980 2688 1504 conicto +end_ol grestore +gsave 19.926335 5.177500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +1.000000 1.000000 1.000000 srgb +n 18.247500 6.520000 m 18.247500 8.620000 l 29.547500 8.620000 l 29.547500 6.520000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 18.247500 6.520000 m 18.247500 8.620000 l 29.547500 8.620000 l 29.547500 6.520000 l cp s +gsave 18.697500 7.747500 translate 0.035278 -0.035278 scale +start_ol +1024 448 moveto +1024 -1152 lineto +512 -1152 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1178 2820 1414 2946 conicto +1650 3072 1977 3072 conicto +2521 3072 2860 2640 conicto +3200 2208 3200 1504 conicto +3200 800 2860 368 conicto +2521 -64 1977 -64 conicto +1650 -64 1414 62 conicto +1178 188 1024 448 conicto +2688 1504 
moveto +2688 2028 2466 2326 conicto +2244 2624 1856 2624 conicto +1468 2624 1246 2326 conicto +1024 2028 1024 1504 conicto +1024 980 1246 682 conicto +1468 384 1856 384 conicto +2244 384 2466 682 conicto +2688 980 2688 1504 conicto +end_ol grestore +gsave 19.159562 7.747500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +gsave 19.539206 7.747500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 19.986285 7.747500 translate 0.035278 -0.035278 scale +start_ol +512 1177 moveto +512 3008 lineto +1024 3008 lineto +1024 1196 lineto +1024 790 1184 587 conicto +1344 384 1664 384 conicto +2049 384 2272 628 conicto +2496 872 2496 1293 conicto +2496 3008 lineto +3008 3008 
lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2320 188 2087 62 conicto +1854 -64 1546 -64 conicto +1038 -64 775 252 conicto +512 568 512 1177 conicto +1744 3072 moveto +1744 3072 lineto +end_ol grestore +gsave 20.448347 7.747500 translate 0.035278 -0.035278 scale +start_ol +2496 2560 moveto +2496 4160 lineto +3008 4160 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2342 188 2106 62 conicto +1870 -64 1540 -64 conicto +999 -64 659 368 conicto +320 800 320 1504 conicto +320 2208 659 2640 conicto +999 3072 1540 3072 conicto +1870 3072 2106 2946 conicto +2342 2820 2496 2560 conicto +832 1504 moveto +832 980 1053 682 conicto +1275 384 1663 384 conicto +2050 384 2273 682 conicto +2496 980 2496 1504 conicto +2496 2028 2273 2326 conicto +2050 2624 1663 2624 conicto +1275 2624 1053 2326 conicto +832 2028 832 1504 conicto +end_ol grestore +gsave 20.910409 7.747500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 21.354992 7.747500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 21.587271 7.747500 translate 0.035278 -0.035278 scale +start_ol +2048 4160 moveto +2048 3712 lineto +1571 3712 lineto +1299 3712 1193 3612 conicto +1088 3513 1088 3254 conicto +1088 3008 lineto +1920 3008 lineto +1920 2624 lineto +1088 2624 lineto +1088 0 lineto +576 0 lineto +576 2624 lineto +128 2624 lineto +128 3008 lineto +576 3008 lineto +576 3202 lineto +576 3702 816 3931 conicto +1056 4160 1577 4160 conicto +2048 4160 lineto +end_ol grestore +gsave 21.844529 7.747500 
translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 22.291608 7.747500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 22.578838 7.747500 translate 0.035278 -0.035278 scale +start_ol +2858 2449 moveto +3041 2768 3296 2920 conicto +3551 3072 3895 3072 conicto +4360 3072 4612 2748 conicto +4864 2425 4864 1829 conicto +4864 0 lineto +4352 0 lineto +4352 1813 lineto +4352 2225 4203 2424 conicto +4055 2624 3750 2624 conicto +3377 2624 3160 2379 conicto +2944 2135 2944 1713 conicto +2944 0 lineto +2432 0 lineto +2432 1813 lineto +2432 2227 2283 2425 conicto +2135 2624 1824 2624 conicto +1457 2624 1240 2378 conicto +1024 2132 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1190 2822 1423 2947 conicto +1656 3072 1975 3072 conicto +2298 3072 2524 2912 conicto +2750 2753 2858 2449 conicto +end_ol grestore +gsave 23.288170 7.747500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto 
+512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 23.490477 7.747500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 23.935059 7.747500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 24.397121 7.747500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 24.629400 7.747500 translate 0.035278 -0.035278 scale +start_ol +2944 3008 moveto +2944 0 lineto +2432 0 lineto +2432 2624 lineto +1088 2624 lineto +1088 0 lineto +576 0 lineto +576 2624 lineto +128 2624 lineto +128 3008 lineto +576 3008 lineto +576 3214 lineto +576 3698 813 3929 conicto +1050 4160 1541 4160 conicto +2048 4160 lineto +2048 3712 lineto +1568 3712 lineto +1298 3712 1193 3612 conicto +1088 3513 1088 3254 conicto +1088 3008 lineto +2944 3008 lineto +2432 4160 moveto +2944 4160 lineto +2944 3520 lineto +2432 3520 lineto +2432 4160 lineto +end_ol grestore +gsave 25.088965 7.747500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 
lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 25.536045 7.747500 translate 0.035278 -0.035278 scale +start_ol +512 4160 moveto +1024 4160 lineto +1024 0 lineto +512 0 lineto +512 4160 lineto +end_ol grestore +gsave 25.738352 7.747500 translate 0.035278 -0.035278 scale +start_ol +2496 2560 moveto +2496 4160 lineto +3008 4160 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2342 188 2106 62 conicto +1870 -64 1540 -64 conicto +999 -64 659 368 conicto +320 800 320 1504 conicto +320 2208 659 2640 conicto +999 3072 1540 3072 conicto +1870 3072 2106 2946 conicto +2342 2820 2496 2560 conicto +832 1504 moveto +832 980 1053 682 conicto +1275 384 1663 384 conicto +2050 384 2273 682 conicto +2496 980 2496 1504 conicto +2496 2028 2273 2326 conicto +2050 2624 1663 2624 conicto +1275 2624 1053 2326 conicto +832 2028 832 1504 conicto +end_ol grestore +1.000000 1.000000 1.000000 srgb +n 18.245000 9.040000 m 18.245000 11.140000 l 29.545000 11.140000 l 29.545000 9.040000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 18.245000 9.040000 m 18.245000 11.140000 l 29.545000 11.140000 l 29.545000 9.040000 l cp s +gsave 18.695000 10.267500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 4160 lineto +1024 4160 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 19.157062 10.267500 
translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 19.604141 10.267500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 20.051221 10.267500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 20.335954 10.267500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 20.568233 10.267500 translate 
0.035278 -0.035278 scale +start_ol +2688 1504 moveto +2688 2028 2466 2326 conicto +2244 2624 1856 2624 conicto +1468 2624 1246 2326 conicto +1024 2028 1024 1504 conicto +1024 980 1246 682 conicto +1468 384 1856 384 conicto +2244 384 2466 682 conicto +2688 980 2688 1504 conicto +1024 2560 moveto +1178 2820 1414 2946 conicto +1650 3072 1977 3072 conicto +2521 3072 2860 2640 conicto +3200 2208 3200 1504 conicto +3200 800 2860 368 conicto +2521 -64 1977 -64 conicto +1650 -64 1414 62 conicto +1178 188 1024 448 conicto +1024 0 lineto +512 0 lineto +512 4160 lineto +1024 4160 lineto +1024 2560 lineto +end_ol grestore +gsave 21.030295 10.267500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 21.477375 10.267500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 21.762108 10.267500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto 
+2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 4160 lineto +1024 4160 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 22.224170 10.267500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 22.456449 10.267500 translate 0.035278 -0.035278 scale +start_ol +2048 4160 moveto +2048 3712 lineto +1571 3712 lineto +1299 3712 1193 3612 conicto +1088 3513 1088 3254 conicto +1088 3008 lineto +1920 3008 lineto +1920 2624 lineto +1088 2624 lineto +1088 0 lineto +576 0 lineto +576 2624 lineto +128 2624 lineto +128 3008 lineto +576 3008 lineto +576 3202 lineto +576 3702 816 3931 conicto +1056 4160 1577 4160 conicto +2048 4160 lineto +end_ol grestore +gsave 22.713707 10.267500 translate 0.035278 -0.035278 scale +start_ol +512 1177 moveto +512 3008 lineto +1024 3008 lineto +1024 1196 lineto +1024 790 1184 587 conicto +1344 384 1664 384 conicto +2049 384 2272 628 conicto +2496 872 2496 1293 conicto +2496 3008 lineto +3008 3008 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2320 188 2087 62 conicto +1854 -64 1546 -64 conicto +1038 -64 775 252 conicto +512 568 512 1177 conicto +1744 3072 moveto +1744 3072 lineto +end_ol grestore +gsave 23.175768 10.267500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 23.637830 10.267500 translate 0.035278 -0.035278 scale +start_ol +2688 2816 moveto +2688 2368 lineto +2479 2496 2268 2560 conicto 
+2058 2624 1843 2624 conicto +1363 2624 1097 2329 conicto +832 2035 832 1504 conicto +832 973 1097 678 conicto +1363 384 1843 384 conicto +2058 384 2268 448 conicto +2479 512 2688 640 conicto +2688 192 lineto +2482 64 2261 0 conicto +2041 -64 1793 -64 conicto +1116 -64 718 360 conicto +320 784 320 1504 conicto +320 2235 722 2653 conicto +1124 3072 1825 3072 conicto +2052 3072 2268 3008 conicto +2485 2944 2688 2816 conicto +end_ol grestore +gsave 24.037450 10.267500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 24.322183 10.267500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 24.524490 10.267500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 24.969073 10.267500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto 
+512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +1.000000 1.000000 1.000000 srgb +n 18.192500 11.660000 m 18.192500 13.760000 l 29.492500 13.760000 l 29.492500 11.660000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 18.192500 11.660000 m 18.192500 13.760000 l 29.492500 13.760000 l 29.492500 11.660000 l cp s +gsave 18.642500 12.887500 translate 0.035278 -0.035278 scale +start_ol +2496 2560 moveto +2496 4160 lineto +3008 4160 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2342 188 2106 62 conicto +1870 -64 1540 -64 conicto +999 -64 659 368 conicto +320 800 320 1504 conicto +320 2208 659 2640 conicto +999 3072 1540 3072 conicto +1870 3072 2106 2946 conicto +2342 2820 2496 2560 conicto +832 1504 moveto +832 980 1053 682 conicto +1275 384 1663 384 conicto +2050 384 2273 682 conicto +2496 980 2496 1504 conicto +2496 2028 2273 2326 conicto +2050 2624 1663 2624 conicto +1275 2624 1053 2326 conicto +832 2028 832 1504 conicto +end_ol grestore +gsave 19.104562 12.887500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 19.551641 12.887500 translate 0.035278 -0.035278 scale +start_ol +2304 2560 moveto +2220 2593 2120 2608 conicto +2021 2624 1902 2624 conicto +1478 2624 1251 2359 conicto +1024 2094 1024 1597 conicto +1024 0 lineto +512 0 lineto +512 
3008 lineto +1024 3008 lineto +1024 2560 lineto +1182 2820 1434 2946 conicto +1687 3072 2049 3072 conicto +2100 3072 2162 3072 conicto +2225 3072 2301 3072 conicto +2304 2560 lineto +end_ol grestore +gsave 19.851356 12.887500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 20.053663 12.887500 translate 0.035278 -0.035278 scale +start_ol +192 3008 moveto +719 3008 lineto +1664 483 lineto +2609 3008 lineto +3136 3008 lineto +2002 0 lineto +1326 0 lineto +192 3008 lineto +end_ol grestore +gsave 20.485753 12.887500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 20.932833 12.887500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 21.217566 12.887500 translate 0.035278 -0.035278 
scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 21.419873 12.887500 translate 0.035278 -0.035278 scale +start_ol +192 3008 moveto +719 3008 lineto +1664 483 lineto +2609 3008 lineto +3136 3008 lineto +2002 0 lineto +1326 0 lineto +192 3008 lineto +end_ol grestore +gsave 21.851962 12.887500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 22.299042 12.887500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 22.531321 12.887500 translate 0.035278 -0.035278 scale +start_ol +2048 4160 moveto +2048 3712 lineto +1571 3712 lineto +1299 3712 1193 3612 conicto +1088 3513 1088 3254 conicto +1088 3008 lineto +1920 3008 lineto +1920 2624 lineto +1088 2624 lineto +1088 0 lineto +576 0 lineto +576 2624 lineto +128 2624 lineto +128 3008 lineto +576 3008 lineto +576 3202 lineto +576 3702 816 3931 conicto +1056 4160 1577 4160 conicto +2048 4160 lineto +end_ol grestore +gsave 22.788579 12.887500 translate 0.035278 -0.035278 scale +start_ol +512 1177 moveto +512 3008 lineto +1024 3008 lineto +1024 1196 lineto +1024 790 1184 587 conicto +1344 384 1664 384 conicto +2049 384 2272 628 conicto +2496 872 2496 1293 conicto +2496 3008 lineto +3008 3008 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2320 188 2087 62 conicto +1854 -64 1546 -64 conicto +1038 
-64 775 252 conicto +512 568 512 1177 conicto +1744 3072 moveto +1744 3072 lineto +end_ol grestore +gsave 23.250640 12.887500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 23.712702 12.887500 translate 0.035278 -0.035278 scale +start_ol +2688 2816 moveto +2688 2368 lineto +2479 2496 2268 2560 conicto +2058 2624 1843 2624 conicto +1363 2624 1097 2329 conicto +832 2035 832 1504 conicto +832 973 1097 678 conicto +1363 384 1843 384 conicto +2058 384 2268 448 conicto +2479 512 2688 640 conicto +2688 192 lineto +2482 64 2261 0 conicto +2041 -64 1793 -64 conicto +1116 -64 718 360 conicto +320 784 320 1504 conicto +320 2235 722 2653 conicto +1124 3072 1825 3072 conicto +2052 3072 2268 3008 conicto +2485 2944 2688 2816 conicto +end_ol grestore +gsave 24.112322 12.887500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 24.397055 12.887500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 24.599362 12.887500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 
conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 25.043945 12.887500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +1.000000 1.000000 1.000000 srgb +n 18.190000 14.230000 m 18.190000 16.330000 l 29.490000 16.330000 l 29.490000 14.230000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 18.190000 14.230000 m 18.190000 16.330000 l 29.490000 16.330000 l 29.490000 14.230000 l cp s +gsave 18.640000 15.457500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol 
grestore +gsave 19.087080 15.457500 translate 0.035278 -0.035278 scale +start_ol +2688 2816 moveto +2688 2368 lineto +2479 2496 2268 2560 conicto +2058 2624 1843 2624 conicto +1363 2624 1097 2329 conicto +832 2035 832 1504 conicto +832 973 1097 678 conicto +1363 384 1843 384 conicto +2058 384 2268 448 conicto +2479 512 2688 640 conicto +2688 192 lineto +2482 64 2261 0 conicto +2041 -64 1793 -64 conicto +1116 -64 718 360 conicto +320 784 320 1504 conicto +320 2235 722 2653 conicto +1124 3072 1825 3072 conicto +2052 3072 2268 3008 conicto +2485 2944 2688 2816 conicto +end_ol grestore +gsave 19.486700 15.457500 translate 0.035278 -0.035278 scale +start_ol +2688 2816 moveto +2688 2368 lineto +2479 2496 2268 2560 conicto +2058 2624 1843 2624 conicto +1363 2624 1097 2329 conicto +832 2035 832 1504 conicto +832 973 1097 678 conicto +1363 384 1843 384 conicto +2058 384 2268 448 conicto +2479 512 2688 640 conicto +2688 192 lineto +2482 64 2261 0 conicto +2041 -64 1793 -64 conicto +1116 -64 718 360 conicto +320 784 320 1504 conicto +320 2235 722 2653 conicto +1124 3072 1825 3072 conicto +2052 3072 2268 3008 conicto +2485 2944 2688 2816 conicto +end_ol grestore +gsave 19.886320 15.457500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 20.333399 15.457500 translate 0.035278 -0.035278 scale +start_ol +1024 448 moveto +1024 -1152 lineto +512 -1152 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1178 2820 1414 
2946 conicto +1650 3072 1977 3072 conicto +2521 3072 2860 2640 conicto +3200 2208 3200 1504 conicto +3200 800 2860 368 conicto +2521 -64 1977 -64 conicto +1650 -64 1414 62 conicto +1178 188 1024 448 conicto +2688 1504 moveto +2688 2028 2466 2326 conicto +2244 2624 1856 2624 conicto +1468 2624 1246 2326 conicto +1024 2028 1024 1504 conicto +1024 980 1246 682 conicto +1468 384 1856 384 conicto +2244 384 2466 682 conicto +2688 980 2688 1504 conicto +end_ol grestore +gsave 20.795461 15.457500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 21.080194 15.457500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 21.527274 15.457500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto 
+512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 21.989336 15.457500 translate 0.035278 -0.035278 scale +start_ol +2688 2816 moveto +2688 2368 lineto +2479 2496 2268 2560 conicto +2058 2624 1843 2624 conicto +1363 2624 1097 2329 conicto +832 2035 832 1504 conicto +832 973 1097 678 conicto +1363 384 1843 384 conicto +2058 384 2268 448 conicto +2479 512 2688 640 conicto +2688 192 lineto +2482 64 2261 0 conicto +2041 -64 1793 -64 conicto +1116 -64 718 360 conicto +320 784 320 1504 conicto +320 2235 722 2653 conicto +1124 3072 1825 3072 conicto +2052 3072 2268 3008 conicto +2485 2944 2688 2816 conicto +end_ol grestore +gsave 22.388956 15.457500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 22.836035 15.457500 translate 0.035278 -0.035278 scale +start_ol +end_ol grestore +gsave 23.068315 15.457500 translate 0.035278 -0.035278 scale +start_ol +2048 4160 moveto +2048 3712 lineto +1571 3712 lineto +1299 3712 1193 3612 conicto +1088 3513 1088 3254 conicto +1088 3008 lineto +1920 3008 lineto +1920 2624 lineto +1088 2624 lineto +1088 0 lineto +576 0 lineto +576 2624 lineto +128 2624 lineto +128 3008 lineto +576 3008 lineto +576 3202 lineto +576 3702 816 3931 conicto +1056 4160 1577 4160 conicto +2048 4160 lineto +end_ol grestore +gsave 23.325572 15.457500 translate 
0.035278 -0.035278 scale +start_ol +512 1177 moveto +512 3008 lineto +1024 3008 lineto +1024 1196 lineto +1024 790 1184 587 conicto +1344 384 1664 384 conicto +2049 384 2272 628 conicto +2496 872 2496 1293 conicto +2496 3008 lineto +3008 3008 lineto +3008 0 lineto +2496 0 lineto +2496 448 lineto +2320 188 2087 62 conicto +1854 -64 1546 -64 conicto +1038 -64 775 252 conicto +512 568 512 1177 conicto +1744 3072 moveto +1744 3072 lineto +end_ol grestore +gsave 23.787634 15.457500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +gsave 24.249696 15.457500 translate 0.035278 -0.035278 scale +start_ol +2688 2816 moveto +2688 2368 lineto +2479 2496 2268 2560 conicto +2058 2624 1843 2624 conicto +1363 2624 1097 2329 conicto +832 2035 832 1504 conicto +832 973 1097 678 conicto +1363 384 1843 384 conicto +2058 384 2268 448 conicto +2479 512 2688 640 conicto +2688 192 lineto +2482 64 2261 0 conicto +2041 -64 1793 -64 conicto +1116 -64 718 360 conicto +320 784 320 1504 conicto +320 2235 722 2653 conicto +1124 3072 1825 3072 conicto +2052 3072 2268 3008 conicto +2485 2944 2688 2816 conicto +end_ol grestore +gsave 24.649316 15.457500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 24.934049 15.457500 translate 0.035278 
-0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 25.136356 15.457500 translate 0.035278 -0.035278 scale +start_ol +1697 2624 moveto +1297 2624 1064 2324 conicto +832 2025 832 1504 conicto +832 983 1063 683 conicto +1294 384 1697 384 conicto +2095 384 2327 684 conicto +2560 985 2560 1504 conicto +2560 2020 2327 2322 conicto +2095 2624 1697 2624 conicto +1696 3072 moveto +2338 3072 2705 2656 conicto +3072 2240 3072 1504 conicto +3072 771 2705 353 conicto +2338 -64 1696 -64 conicto +1051 -64 685 353 conicto +320 771 320 1504 conicto +320 2240 685 2656 conicto +1051 3072 1696 3072 conicto +end_ol grestore +gsave 25.580939 15.457500 translate 0.035278 -0.035278 scale +start_ol +3008 1829 moveto +3008 0 lineto +2496 0 lineto +2496 1813 lineto +2496 2220 2335 2422 conicto +2175 2624 1854 2624 conicto +1469 2624 1246 2379 conicto +1024 2135 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1198 2817 1434 2944 conicto +1670 3072 1978 3072 conicto +2486 3072 2747 2756 conicto +3008 2441 3008 1829 conicto +end_ol grestore +1.000000 1.000000 1.000000 srgb +n 18.187500 16.850000 m 18.187500 18.950000 l 29.487500 18.950000 l 29.487500 16.850000 l f +0.100000 slw +[] 0 sd +[] 0 sd +0 slj +0.000000 0.000000 0.000000 srgb +n 18.187500 16.850000 m 18.187500 18.950000 l 29.487500 18.950000 l 29.487500 16.850000 l cp s +gsave 18.637500 18.077500 translate 0.035278 -0.035278 scale +start_ol +1024 3840 moveto +1024 3008 lineto +2048 3008 lineto +2048 2624 lineto +1024 2624 lineto +1024 1016 lineto +1024 654 1125 551 conicto +1226 448 1537 448 conicto +2048 448 lineto +2048 0 lineto +1537 0 lineto +955 0 733 219 conicto +512 438 512 1016 conicto +512 2624 lineto +128 2624 lineto +128 3008 lineto +512 3008 lineto +512 3840 lineto +1024 3840 lineto +end_ol grestore +gsave 
18.922233 18.077500 translate 0.035278 -0.035278 scale +start_ol +512 3008 moveto +1024 3008 lineto +1024 0 lineto +512 0 lineto +512 3008 lineto +512 4160 moveto +1024 4160 lineto +1024 3520 lineto +512 3520 lineto +512 4160 lineto +end_ol grestore +gsave 19.124540 18.077500 translate 0.035278 -0.035278 scale +start_ol +2858 2449 moveto +3041 2768 3296 2920 conicto +3551 3072 3895 3072 conicto +4360 3072 4612 2748 conicto +4864 2425 4864 1829 conicto +4864 0 lineto +4352 0 lineto +4352 1813 lineto +4352 2225 4203 2424 conicto +4055 2624 3750 2624 conicto +3377 2624 3160 2379 conicto +2944 2135 2944 1713 conicto +2944 0 lineto +2432 0 lineto +2432 1813 lineto +2432 2227 2283 2425 conicto +2135 2624 1824 2624 conicto +1457 2624 1240 2378 conicto +1024 2132 1024 1713 conicto +1024 0 lineto +512 0 lineto +512 3008 lineto +1024 3008 lineto +1024 2560 lineto +1190 2822 1423 2947 conicto +1656 3072 1975 3072 conicto +2298 3072 2524 2912 conicto +2750 2753 2858 2449 conicto +end_ol grestore +gsave 19.833871 18.077500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +gsave 20.280951 18.077500 translate 0.035278 -0.035278 scale +start_ol +2432 2880 moveto +2432 2432 lineto +2227 2528 2007 2576 conicto +1787 2624 1551 2624 conicto +1191 2624 1011 2516 conicto +832 2408 832 2192 conicto +832 2027 962 1933 conicto +1093 1839 1487 1754 conicto +1654 1717 lineto +2181 1605 2402 1402 conicto +2624 1199 2624 834 conicto +2624 420 2291 
178 conicto +1959 -64 1378 -64 conicto +1136 -64 873 -16 conicto +611 32 320 128 conicto +320 640 lineto +594 512 860 448 conicto +1126 384 1387 384 conicto +1736 384 1924 498 conicto +2112 612 2112 820 conicto +2112 1013 1977 1115 conicto +1843 1218 1388 1313 conicto +1218 1352 lineto +743 1448 531 1646 conicto +320 1845 320 2192 conicto +320 2613 620 2842 conicto +920 3072 1472 3072 conicto +1746 3072 1987 3024 conicto +2228 2976 2432 2880 conicto +end_ol grestore +gsave 20.660595 18.077500 translate 0.035278 -0.035278 scale +start_ol +2688 2816 moveto +2688 2368 lineto +2479 2496 2268 2560 conicto +2058 2624 1843 2624 conicto +1363 2624 1097 2329 conicto +832 2035 832 1504 conicto +832 973 1097 678 conicto +1363 384 1843 384 conicto +2058 384 2268 448 conicto +2479 512 2688 640 conicto +2688 192 lineto +2482 64 2261 0 conicto +2041 -64 1793 -64 conicto +1116 -64 718 360 conicto +320 784 320 1504 conicto +320 2235 722 2653 conicto +1124 3072 1825 3072 conicto +2052 3072 2268 3008 conicto +2485 2944 2688 2816 conicto +end_ol grestore +gsave 21.060215 18.077500 translate 0.035278 -0.035278 scale +start_ol +1882 1536 moveto +1289 1536 1060 1402 conicto +832 1268 832 944 conicto +832 686 1003 535 conicto +1175 384 1470 384 conicto +1876 384 2122 667 conicto +2368 951 2368 1422 conicto +2368 1536 lineto +1882 1536 lineto +2880 1739 moveto +2880 0 lineto +2368 0 lineto +2368 448 lineto +2199 186 1946 61 conicto +1693 -64 1328 -64 conicto +866 -64 593 196 conicto +320 456 320 893 conicto +320 1402 662 1661 conicto +1004 1920 1682 1920 conicto +2368 1920 lineto +2368 1962 lineto +2368 2278 2140 2451 conicto +1912 2624 1500 2624 conicto +1238 2624 989 2560 conicto +741 2496 512 2368 conicto +512 2816 lineto +789 2944 1049 3008 conicto +1310 3072 1556 3072 conicto +2222 3072 2551 2741 conicto +2880 2411 2880 1739 conicto +end_ol grestore +gsave 21.507295 18.077500 translate 0.035278 -0.035278 scale +start_ol +512 4160 moveto +1024 4160 lineto +1024 0 lineto +512 0 lineto 
+512 4160 lineto +end_ol grestore +gsave 21.709602 18.077500 translate 0.035278 -0.035278 scale +start_ol +3136 1584 moveto +3136 1344 lineto +832 1344 lineto +865 875 1142 629 conicto +1420 384 1916 384 conicto +2203 384 2472 448 conicto +2742 512 3008 640 conicto +3008 192 lineto +2741 67 2460 1 conicto +2179 -64 1890 -64 conicto +1166 -64 743 352 conicto +320 768 320 1477 conicto +320 2211 723 2641 conicto +1126 3072 1809 3072 conicto +2423 3072 2779 2672 conicto +3136 2272 3136 1584 conicto +2624 1728 moveto +2619 2137 2395 2380 conicto +2172 2624 1804 2624 conicto +1387 2624 1136 2388 conicto +886 2153 848 1725 conicto +2624 1728 lineto +end_ol grestore +0.100000 slw +[] 0 sd +[] 0 sd +0 slc +n 13.100000 5.150000 m 17.700233 5.016013 l s +0 slj +n 17.707512 5.265907 m 18.200021 5.001456 l 17.692955 4.766119 l ef +0.100000 slw +[] 0 sd +[] 0 sd +0 slc +n 13.100000 5.150000 m 17.749762 7.335998 l s +0 slj +n 17.643398 7.562242 m 18.202251 7.548727 l 17.856127 7.109753 l ef +0.100000 slw +[] 0 sd +[] 0 sd +0 slc +n 13.100000 5.150000 m 17.848268 9.709075 l s +0 slj +n 17.675120 9.889408 m 18.208933 10.055370 l 18.021415 9.528743 l ef +0.100000 slw +[] 0 sd +[] 0 sd +0 slc +n 13.100000 5.150000 m 17.885225 12.253840 l s +0 slj +n 17.677880 12.393510 m 18.164566 12.668531 l 18.092571 12.114169 l ef +0.100000 slw +[] 0 sd +[] 0 sd +0 slc +n 13.100000 5.150000 m 17.943063 14.788551 l s +0 slj +n 17.719677 14.900795 m 18.167551 15.235323 l 18.166449 14.676307 l ef +0.100000 slw +[] 0 sd +[] 0 sd +0 slc +n 13.100000 5.150000 m 17.983667 17.389165 l s +0 slj +n 17.751469 17.481817 m 18.168970 17.853560 l 18.215865 17.296514 l ef +showpage diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/online.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/online.tex new file mode 100644 index 0000000000000000000000000000000000000000..a208c9072dc4552bcfc545e5b26e4c69ff32415f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/online.tex @@ -0,0 +1,212 @@ 
+\subsection{Online Measurements} + +The HMC program includes the possibility to perform a certain number +of measurements after every trajectory \emph{online}, whether or not +the configuration is stored on disk. Some of those are done per +default, namely all that are written to the output file {\ttfamily + output.data}: +\begin{enumerate} +\item the plaquette expectation value, defined as: + \[ + \langle P\rangle = \frac{1}{6 V}\ \sum_{\substack{ + \mu,\nu=1\\ 1\leq\mu<\nu}}^4\{1-\re\tr(U^{1\times1}_{x,\mu,\nu})\}\, , + \] + where $V$ is the global lattice volume. +\item the rectangle expectation value, defined as: + \[ + \langle R\rangle = \frac{1}{12V}\ \sum_{\substack{\mu,\nu=1\\ \mu\neq\nu}}^4\{1 + -\re\tr(U^{1\times2}_{x,\mu,\nu})\} + \] +\item $\Delta\mathcal{H}$ and $\exp(-\Delta\mathcal{H})$ defined in + the obvious way. +\end{enumerate} +See the overview section for details about the {\ttfamily output.data} +file. These observables all come with no extra computational cost. + +Optionally, other online measurements can be performed, which -- +however -- need in general extra inversions of the Dirac +operator. First of all the computation of certain correlation +functions is implemented. They need \emph{one} extra inversion of the +Dirac operator, as discussed in Ref.~\cite{Boucaud:2008xu}, using the +one-end-trick. Define a stochastic source $\xi$ as follows +\begin{equation} + \label{eq:source} + \lim_{R\to\infty}[\xi_i^*\xi_j] = \delta_{ij},\quad + \lim_{R\to\infty}[\xi_i\xi_j] = 0\, . +\end{equation} +Here $i$ labels all degrees of freedom. Then +\begin{equation} + \label{oneend} + [\phi_i^{r*}\phi_j^r]_R = M_{ik}^{-1*}\cdot M_{jk}^{-1} + + \textrm{noise}\, , +\end{equation} +if $\phi$ was computed from +\[ +\phi_j^r = M^{-1}_{jk}\xi_k^r\, . +\] +Having in mind the $\gamma_5$-hermiticity property of the Wilson and +Wilson twisted mass Dirac propagator $G_{u,d}$, i.e. 
+\[ +G_u(x,y) = \gamma_5 G_d(y,x)^\dagger \gamma_5 +\] +it is clear that eq.~(\ref{oneend}) can be used to evaluate +\[ +C_\pi(t) = \langle \tr[G_u(0,t)\gamma_5 G_d(t,0)\gamma_5]\rangle = +\langle \tr[G_u(0,t) G_u(0,t)^\dagger]\rangle +\] +with only one inversion. But, even if the one gamma structure at the +source is fixed to be $\gamma_5$ due to the $\gamma_5$-hermiticity +trick, we are still free to insert any $\gamma$-structure $\Gamma$ at the source, +i.e. we can evaluate any correlation function of the form +\[ +C_{P\Gamma}(t) = \langle\tr[G_u(0,t) \gamma_5 G_d(t,0) \Gamma]\rangle += \langle \tr[G_u(0,t) G_u(0,t)^\dagger\gamma_5\Gamma]\rangle\, . +\] +Useful combinations of correlation functions are $\langle P P\rangle$, +$\langle PA\rangle$ and $\langle PV\rangle$, with +\[ + P^\alpha = \bar\chi \gamma_5 \frac{\tau^\alpha}{2}\chi\, ,\quad + V^\alpha_\mu = \bar\chi \gamma_\mu\frac{\tau^\alpha}{2}\chi\, ,\quad + A^\alpha_\mu = \bar\chi \gamma_5\gamma_\mu\frac{\tau^\alpha}{2}\chi +\] +From $\langle P P\rangle$ one can extract the pseudo scalar mass, and +-- in the twisted mass case -- the pseudo scalar decay +constant. $\langle PA\rangle$ can be used together with $\langle P +P\rangle$ to extract the so called PCAC quark mass and $\langle +PV\rangle$ to measure the renormalisation constant $Z_\mathrm{V}$. For +details we refer the reader to Ref.~\cite{Boucaud:2008xu}. + +These online measurements are controlled with the input parameters +{\ttfamily BeginMeasurement CORRELATORS} to enable them and +{\ttfamily Frequency = n} to specify the frequency. The three +correlation functions are saved in files named {\ttfamily + onlinemeas.n}, where {\ttfamily n} is the trajectory number. Every +file contains five columns. The first three specify the type, the operator type and the +Euclidean time $t$. The last two columns are the values of the +correlation function itself, $C(t)$ and $C(-t)$, respectively. 
The +type is equal to $1$, $2$ or $6$ for the $\langle P P\rangle$, the +$\langle PA\rangle$ and the $\langle PV\rangle$ correlation +functions, respectively. The operator type is for online measurements always equal +to $1$ for local source and sink (no smearing of any kind), and the +time runs from $0$ to $T/2$. Hence, $C(-t)= C(T-t)$. $C(-0)$ and +$C(-T/2)$ are set to zero for convenience. + +In addition to correlation functions also the minimal and the maximal +eigenvalues of $(\gamma_5 D)^2$ can be measured. + +Another online measurement, related not to physics but to the +algorithm, is the check of reversibility violations. The HMC algorithm is +exact if +and only if the integration scheme is reversible. On a computer with +finite precision this is only guaranteed up to machine precision. +These violations can be estimated by integrating a trajectory +forward and then backward in Monte Carlo time. The difference between +the original Hamiltonian $\mathcal{H}$ and the final one +$\mathcal{H}''$ after integrating back can serve as one measure for +those violations; another one is provided by the difference between the +original gauge field $U$ and the final one $U''$ +\[ +\delta\Delta U = \frac{1}{12V} +\sum_{x,\mu}\sum_{i,j} (U_{x,\mu}-U_{x,\mu}'')_{i,j}^2 +\] +where we indicate with the $\delta\Delta$ that this is obtained after +integrating a trajectory forward and backward in time. The results for +$\delta\Delta \mathcal{H}$ and $\delta\Delta U$ are +stored in the file {\ttfamily return\_check.data}. The relevant input +parameters are {\ttfamily ReversibilityCheck} and {\ttfamily + ReversibilityCheckInterval}. + +\subsection{Iterative Solver and Eigensolver} + +There are several iterative solvers implemented in the tmLQCD +package for solving +\[ +D\ \chi = \phi +\] +for $\chi$. 
These are the minimal residual (MR), the conjugate gradient (CG), the +conjugate gradient squared (CGS), the generalised minimal residual +(GMRES), the generalised conjugate residual and the stabilised +bi-conjugate gradient (BiCGstab). For details regarding these +algorithms we refer to Refs.~\cite{saad:2003a,meister:1999}. + +For the {\ttfamily hmc\_tm} executable only the CG and the BiCGstab +solvers are available, while all the others can be used in the +{\ttfamily invert} executables. Most of them are available both with +and without even/odd preconditioning. For a performance comparison we +refer to Refs.~\cite{Chiarappa:2004ry,Chiarappa:2006hz}. + +The stopping criterion is implemented in two ways: the first is an +absolute stopping criterion, i.e. the solver is stopped when the +squared norm of the residual vector (depending on the solver this +might be the iterated residual or the real residual) fulfils +\[ +\|r\|^2 < \epsilon^2\, . +\] +The second is relative to the source vector, i.e. +\[ +\frac{\|r\|^2}{\|\phi\|^2} < \epsilon^2\, . +\] +The value of $\epsilon^2$ and the choice of relative or absolute precision can be +set via input parameters. + +The reduced precision Dirac operator, as discussed in sub-section +\ref{sec:dirac}, is available for the CG solver. In the CG solver the +full precision Dirac operator is only required at the beginning of the +CG search, because the relative size of the contribution to the +resulting vector decreases with the number of iterations. Thus, as soon +as a certain precision is achieved in the CG algorithm we can switch to +the reduced precision Dirac operator without spoiling the precision of +the final result. Our experience is that this switch can be performed +once a precision of $\sqrt{\epsilon}$ is reached when aiming for a +final precision of $\epsilon < 1$. + +The eigensolver used to compute the eigenvalues (and vectors) of +$(\gamma_5 D)^2$ is the so called Jacobi-Davidson +method~\cite{Sleijpen:1996aa,Geus:2002}. 
For a discussion of the +application of this algorithm to lattice QCD we refer again to +Refs.~\cite{Chiarappa:2004ry,Chiarappa:2006hz}. + +All solver related files can be found in the sub-directory {\ttfamily + solver}. Note that there are a few more solvers implemented which +are, however, still in an experimental state. + +\subsection{Stout Smearing} + +Smearing techniques have become an important tool to reduce +ultraviolet fluctuations in the gauge fields. One of those techniques, +coming with the advantage of being usable in the MD update, is usually +called stout smearing~\cite{Morningstar:2003gk}. + +The $(n+1)^{\rm th}$ level of stout smeared gauge links is obtained iteratively +from the $n^{\rm th}$ level by +\begin{equation*} + U_\mu^{(n+1)}(x)\;=\;e^{i\,Q_\mu^{(n)}(x)}\,U_\mu^{(n)}(x). +\end{equation*} +We refer to the unsmeared (``thin'') gauge field as $U_\mu\equiv +U_\mu^{(0)}$. +The ${\rm SU}(3)$ matrices $Q_\mu$ are defined via the staples $C_\mu$: +\begin{eqnarray} + Q_\mu^{(n)}(x) &=& \frac{i}2\Big[U^{(n)}_\mu(x){C_\mu^{(n)}}^\dagger(x) + - {\mathrm{h.c.}}\Big]\,-\,\frac{i}{6}\tr\Big[U^{(n)}_\mu(x){C_\mu^{(n)}}^\dagger(x) + - {\mathrm{h.c.}}\Big]\,,\nonumber\\ + C_\mu^{(n)} &=& \sum_{\nu\neq\mu}\,\rho_{\mu\nu}\, + \Big(U_\nu^{(n)}(x)U_\mu^{(n)}(x+\hat\nu){U_\nu^{(n)}}^\dagger(x+\hat\mu) + \nonumber\\ + && \;\;\; + +{U_\nu^{(n)}}^\dagger(x-\hat\nu)U_\mu^{(n)}(x-\hat\nu)U_\nu^{(n)}(x-\hat\nu+\hat\mu) + \Big)\,,\nonumber +\end{eqnarray} +where in general $\rho_{\mu\nu}$ is the smearing matrix. +In the tmLQCD package we have only implemented isotropic $4$-dimensional +smearing, i.e., $\rho_{\mu\nu}=\rho$. + +Currently stout smearing is only implemented for the {\ttfamily + invert} executables. That is, the gauge field can be stout smeared at +the beginning of an inversion. The input parameters are {\ttfamily + UseStoutSmearing}, {\ttfamily StoutRho} and {\ttfamily + StoutNoIterations}. 
+ +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/operator.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/operator.tex new file mode 100644 index 0000000000000000000000000000000000000000..9469c52694afd1b0e313d5cf8fe5c7e63b2d0c62 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/operator.tex @@ -0,0 +1,150 @@ +\subsection{Dirac Operator} +\label{sec:dirac} + +The Dirac operator is the kernel routine of any lattice QCD +application, because its inverse is needed for the HMC update +procedure and also for computing correlation functions. The inversion +is usually performed by means of iterative solvers, like the conjugate +gradient algorithm, and hence the repeated application of the Dirac +operator to a spinor field is needed. Thus the optimisation of this +routine deserves special attention. + +At some space-time point $x$ the application of a Wilson type Dirac +operator is mainly given by +\begin{equation} + \label{eq:Dpsi} + \begin{split} + \phi(x) = & (m_0 + 4r +i\mu_q\gamma_5)\psi(x) \\ + &- \frac{1}{2}\sum_{\mu = 1}^4\Bigl[ + U_{x,\mu}(r+\gamma_\mu) \psi(x+a\hat\mu) + U^\dagger_{x-a\hat\mu,\mu} + (r-\gamma_\mu)\psi(x-a\hat\mu)\Bigr] \\ + \end{split} +\end{equation} +where $r$ is the Wilson parameter, which we set to one in the +following. The most computer time consuming part is the next-neighbour +interaction part. + +For this part it is useful to observe that +\[ +(1\pm \gamma_\mu)\psi +\] +has only two independent spinor components, the other two follow +trivially. So only two of the components need to be computed, then +multiplied with the corresponding gauge field $U$, and then the other +two components are to be reconstructed. + +The operation (\ref{eq:Dpsi}) must be performed for each space-time +point $x$.
If the loop over $x$ is performed such that all elements +of $\phi$ are accessed sequentially (one output stream), it is clear +that the elements in $\psi$ and $U$ cannot be accessed sequentially as +well. This non-sequential access may lead to serious performance +degradations due to too many cache misses, because modern processing +units have only a very limited number of input streams available. + +While the $\psi$ field is usually different from +one to the next application of the Dirac operator, the gauge field +often stays the same for a large number of applications. This is for +instance so in iterative solvers, where the Dirac operator is applied +$\mathcal{O}(1000)$ times with fixed gauge fields. Therefore it is +useful to construct a double copy of the original gauge field sorted +such that the elements are accessed exactly in the order needed in the +Dirac operator. For the price of additional memory, with this simple +change one can obtain large performance improvements, depending on the +architecture. The double copy must be updated whenever the gauge field +changes. This feature is available in the code at configure time, the +relevant switch is {\ttfamily --with-gaugecopy}. + +Above we were assuming that we run sequentially through the resulting +spinor field $\phi$. Another possibility is to run sequentially +through the source spinor field $\psi$. Moreover, one could split up +the operation (\ref{eq:Dpsi}) as follows, introducing intermediate +result vectors $\varphi^\pm$ with only two spinor components per lattice +site\footnote{We thank Peter Boyle for useful discussions on this + point.}.
Concentrating on the hopping part only, we would have +\begin{equation} + \label{eq:Dsplit} + \begin{split} + \varphi^+(x, \mu) &= P_\mu^{4\to2}\ U_{x,\mu}(r+\gamma_\mu) \psi(x) \\ + \varphi^-(x, \mu) &= P_\mu^{4\to2}\ (r-\gamma_\mu) \psi(x) \\ + \end{split} +\end{equation} +From $\varphi^\pm$ we can then reconstruct the resulting spinor field +as +\begin{equation} + \label{eq:Dunsplit} + \begin{split} + \phi(x-a\hat\mu) &= P_\mu^{2\to4}\ \varphi^+(x, \mu) \\ + \phi(x+a\hat\mu) &= P_\mu^{2\to4}\ + U^\dagger_{x-a\hat\mu,\mu}\varphi^-(x, \mu) + \end{split} +\end{equation} +Here we denote with $P_\mu^{4\to2}$ the projection to the two +independent spinor components for $\gamma_\mu$ and with +$P_\mu^{2\to4}$ the reconstruction from two to four spinor +components. The half spinor fields $\varphi^\pm$ can be interleaved in +memory such that $\psi(x)$ as well as $\varphi^\pm(x)$ are always +accessed sequentially in memory. The same is possible for the gauge +fields, as explained above. However, so far we did not win much, +apart from a more balanced treatment of forward and backward +directions. + +The advantage of this implementation of the Dirac operator comes +in the parallel case. In step (\ref{eq:Dsplit}) we need only $\psi(x)$ +locally available on each node. So this step can be performed without +any communication. In between step (\ref{eq:Dsplit}) and +(\ref{eq:Dunsplit}) one then needs to communicate part of +$\varphi^\pm$, however only half the amount is needed compared to a +communication of $\psi$. After the second step there is then no +further communication needed. Hence, one can reduce the amount of data +to be sent by a factor of two. + +There is yet another performance improvement possible with this form +of the Dirac operator, this time for the price of precision. One can +store the intermediate fields $\varphi^\pm$ with reduced precision, +e.g. in single precision when the regular spinor fields are in double +precision.
This will lead to a result with reduced precision, however, +in situations where this is not important, as for instance in the MD +update procedure, it reduces the data to be communicated by another +factor of two. And the required memory bandwidth is reduced as well. +This version of the hopping matrix (currently it is only implemented +for the hopping matrix) is available at configure time with the switch +{\ttfamily --enable-halfspinor}. + +The reduced precision version (sloppy precision) is available through +the input parameter {\ttfamily UseSloppyPrecision}. It will be used in +the MD update where appropriate. Moreover, it is implemented in the CG +iterative solver following the ideas outlined in +Ref.~\cite{Chiarappa:2006hz} for the overlap operator. + +The various implementations of the Dirac operator can be found in the +file {\ttfamily D\_psi.c} and -- as needed for even/odd +preconditioning -- the hopping matrix in the file {\ttfamily + Hopping\_Matrix.c}. There are many different versions of these two +routines available, each optimised for a particular architecture, +e.g. for the Blue Gene/P double hummer processor or the streaming SIMD +extensions of modern PC processors (SSE2 and SSE3), see also +Ref.~\cite{Luscher:2001tx}. Martin L{\"u}scher has made available his +standard C and SSE/SSE2 Dirac operator~\cite{Luscher:sse} under the +GNU General Public License, which are partly included into the tmLQCD +package. + +\subsubsection{Boundary Conditions} + +As discussed previously we allow for arbitrary phase factors in the +boundary conditions of the fermion fields. This is conveniently +implemented in the Dirac operator as a phase factor in the hopping +term +\[ +\sum \Bigl[ + e^{i\theta_\mu \pi/L_\mu}\ U_{x,\mu}(r+\gamma_\mu) + \psi(x+a\hat\mu) + e^{-i\theta_\mu \pi/L_\mu}\ + U^\dagger_{x-a\hat\mu,\mu} + (r-\gamma_\mu)\psi(x-a\hat\mu)\Bigr]\, .
+\] +The relevant input parameters are {\ttfamily ThetaT}, {\ttfamily + ThetaX}, {\ttfamily ThetaY}, {\ttfamily ThetaZ}. + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/output.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/output.tex new file mode 100644 index 0000000000000000000000000000000000000000..912e5d771e402bbc149b4b746dcd82212f1acef2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/output.tex @@ -0,0 +1,48 @@ +\subsection{Output files} + +\subsubsection*{\ttfamily output.data} + +The file {\ttfamily output.data} contains lines for each performed +trajectory. Each line has entries with the following meaning: +\begin{enumerate} +\item Plaquette value. +\item $\Delta H$ +\item $\exp(-\Delta H)$ +\item number of pseudo fermion monomials times two integers. The first + is the number of CG or BiCGstab solver iterations used in the acceptance + and heatbath steps, the second is the number of CG (BiCGstab) iterations + used for the force computation. +\item Acceptance (0 is rejected, 1 is accepted). +\item Time in seconds needed for this trajectory. In case of non MPI + this is zero, because not measured. +\item Value of the rectangle part in the gauge action, if used. +\end{enumerate} +Every new run will append its numbers to an already existing file. + +\subsubsection*{\ttfamily output.para} +This file contains the parameters used in this run. Old files will be +overwritten. + +\subsubsection*{\ttfamily history\_hmc\_tm} +This file provides a mapping between the configuration number and its +plaquette and Polyakov loop values. Moreover the simulation parameters +are stored there and in case of a reread the time point can be found there. + +\subsubsection*{\ttfamily return\_check.data} +Contains the reversibility violation measurements, if they are +performed.
+ +\subsubsection*{\ttfamily conf.save} +This file is written after each trajectory, if no regular +configuration is saved. It contains the most recent gauge +configuration and the status of the random number generator for a +restart of the programme. + +\subsubsection*{\ttfamily onlinemeas.N} +Contains the online measurement for trajectory {\ttfamily N} if this +feature is switched on. + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/overview.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/overview.tex new file mode 100644 index 0000000000000000000000000000000000000000..9422032ecb27bbfac0d8b91f333bb2ac3891a5a4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/overview.tex @@ -0,0 +1,234 @@ +The general strategy of the tmLQCD package is to provide programs for +the main applications used in lattice QCD with Wilson twisted mass +fermions. The code and the algorithms are designed to be general +enough such as to compile and run efficiently on any modern computer +architecture. This is achieved code-wise by using standard C as +programming language and for parallelisation the message passing +interface (MPI) standard version 1.1. + +Performance improvements are achieved by providing dedicated code for +certain widely used architectures, like PC's or the Blue Gene family. +Dedicated code is mainly available for the kernel routine -- the +application of the Dirac operator, which will be discussed in +detail in section~\ref{sec:dirac}, and for the communication +routines. + +The tmLQCD package provides three main applications. The first is an +implementation of the (P)HMC algorithm, the second and the third are +executables to invert the Wilson twisted mass Dirac operator +(\ref{eq:Dtm}) and the non-degenerate Wilson twisted mass Dirac operator +(\ref{eq:Dh}), respectively. 
All three do have a wide range of +run-time options, which can be influenced using an input file. The +syntax of the input file is explained in the documentation which ships +with the source code. The relevant input parameters will be mentioned +in the following where appropriate, to ease usage. + +We shall firstly discuss the general layout of the three +aforementioned applications, followed by a general discussion of the +parallelisation strategy used in all three of them. + +\subsection{{\ttfamily hmc\_tm}} + +\begin{figure}[t] + \centering + \includegraphics[width=0.7\linewidth]{hmcflow.eps} + \caption{Flowchart for the {\ttfamily hmc\_tm} executable} + \label{fig:hmcflow} +\end{figure} + +In figure~\ref{fig:hmcflow} the programme flow of the {\ttfamily + hmc\_tm} executable is depicted. In the first block the input file +is parsed and parameters are set accordingly. Then the required memory +is allocated and, depending on the input parameters, data is read from +disk in order to continue a previous run. + +The main part of this application is the molecular dynamics +update. For a number of trajectories, which must be specified in the +input file, first a heat-bath is performed, then the integration +according to the equations of motion using the integrator as specified +in the input file, and finally the acceptance step. + +After each trajectory certain online measurements are performed, +such as measuring the plaquette value. Other online measurements are +optional, like measuring the pseudo scalar correlation function. + +\subsubsection{command line arguments} + +The programme offers command line options as follows: +\begin{itemize} +\item {\ttfamily -h|?} prints a help message and exits. +\item {\ttfamily -f} input file name. The default is {\ttfamily + hmc.input} +\item {\ttfamily -o} the prefix of the output filenames. The default is + {\ttfamily output}. The code will generate or append to two files, + {\ttfamily output.data} and {\ttfamily output.para}. 
+\end{itemize} + +\subsubsection{Input / Output} + +The parameters of each run are read from an input file with default +name {\ttfamily hmc.input}. If it is missing all parameters will be +set to their default values. Any parameter not set in the input file +will also be set to its default value. + +During the run the {\ttfamily hmc\_tm} program will generate two +output files, one called per default {\ttfamily output.data}, the +other one {\ttfamily output.para}. Into the latter important +parameters will be written at the beginning of the run. + +The file {\ttfamily output.data} has several columns with the +following meanings +\begin{enumerate} +\item Plaquette value. +\item $\Delta H$ +\item $\exp(-\Delta H)$ +\item number of pseudo fermion monomials times two integers. The first + of the two is the sum of solver iterations needed + in the acceptance and heatbath steps, the second is the sum of + iterations needed for the force computation of the whole trajectory. +\item Acceptance ($0$ or $1$). +\item Time in seconds needed for this trajectory. +\item Value of the rectangle part in the gauge action, if used. +\end{enumerate} +Every new run will append its numbers to an already existing file. + +In addition, the program will create a file {\ttfamily + history\_hmc\_tm}. This file provides a mapping between the +configuration number and its plaquette and Polyakov loop +values. Moreover the simulation parameters are stored there and in +case of a reread the time point can be found there. + +After every trajectory the program will save the current configuration +in the file {\ttfamily conf.save}. 
+ +\subsection{{\ttfamily invert} and {\ttfamily invert\_doublet}} + +\begin{figure}[t] + \centering + \includegraphics[width=0.7\linewidth]{invertflow.eps} + \caption{Flowchart for the main part of the {\ttfamily invert} and + {\ttfamily invert\_doublet} executables.} + \label{fig:invertflow} +\end{figure} + +The two applications {\ttfamily invert} and {\ttfamily + invert\_doublet} are very similar. The main difference is that in +{\ttfamily invert} the one flavour Wilson twisted mass Dirac operator +is inverted, whereas in {\ttfamily invert\_doublet} the non-degenerate +doublet is inverted. + +The main part of the two executables is depicted in +figure~\ref{fig:invertflow}. Each measurement corresponds to one gauge +configuration that is read from disk into memory. For each of these +gauge configurations a number of inversions will be performed. + +The sources can be either generated or read in from disk. In +the former case the programme can currently generate point sources at +random location in space time. In the latter case the name of the +source file can be specified in the input file. + +The relevant Dirac operator is then inverted on each source and the +result is stored on disk. The inversion can be performed with a number +of inversion algorithms, such as conjugate gradient (CG), BiCGstab, +and others~\cite{saad:2003a}. And optionally even/odd preconditioning +as described previously can be used. + +\subsubsection{command line arguments} + +The two programmes offer command line options as follows: +\begin{itemize} +\item {\ttfamily -h|?} prints a help message and exits. +\item {\ttfamily -f} input file name. The default is {\ttfamily + hmc.input} +\item {\ttfamily -o} the prefix of the output filenames. The default is + {\ttfamily output}. The code will generate or append to one file + called {\ttfamily output.para}. 
+\end{itemize} + +\subsubsection{Output} + +The program will create a file called {\ttfamily output.data} with +information about the parameters of the run. +Of course, also the propagators are stored on disc. The corresponding +file names can be influenced via input parameters. The file format +is discussed in some detail in sub-section~\ref{sec:io}. + +One particularity of the {\ttfamily invert\_doublet} program is that +the propagators written to disk correspond to the two flavour Dirac +operator of eq.~(\ref{eq:altDh}), i.e. +\[ +D_h'(\mu_\sigma,\mu_\delta) = D_\mathrm{W}\cdot 1_f + +i\mu_\sigma\tau^1 + \gamma_5 \mu_\delta \tau^3\, , +\] +essentially for compatibility reasons. For the two flavour components +written the first is the would be \emph{strange} component and the +second one the would be \emph{charm} one. + +\subsection{Parallelisation} + +The whole lattice can be parallelised in up to 4 space-time directions. +It is controlled with configure switches, see section~\ref{sec:config}. +The Message Passing Interface (MPI, standard version 1.1) is used to +implement the parallelisation. So for compiling the parallel +executables a working MPI implementation is needed. + +Depending on the number of parallelised space-time directions the +$t$-direction, the $t$- and $x$-direction, the $t$-, $x$- and +$y$-direction or the $t$-, $x$- and $y$- and $z$-direction are +parallelised. + +The number of processors per space direction must be specified at run time, +i.e. in the input file. The relevant parameters are {\ttfamily + NrXProcs}, {\ttfamily NrYProcs} and {\ttfamily NrZProcs}. The number +of processors in time direction is determined by the program +automatically. Note that the extension in any direction must divide by +the number of processors in this direction. + +In case of even/odd preconditioning further constraints have to be +fulfilled: the local number of lattice sites must be even and the +local $L_z$ must be even. 
Moreover, the local product $L_t\times L_x +\times L_y$ must be even in case of even/odd preconditioning. + + +\begin{figure}[htbp] +\centering +\includegraphics[width=0.65\linewidth]{partition} +\caption{Boundary exchange in a two dimensional parallel setup. One + can see that the internal boundary is sent while the external one + is received. The corners need a two step procedure.} +\label{fig:partition} +\end{figure} + +The communication is organised using boundary buffers, as sketched in +figure~\ref{fig:partition}. +%In general the order for gauge and spinor fields in memory is as +%follows: first the local fields, then the \emph{right} t-boundary fields, then +%the \emph{left} t-boundary fields, then the \emph{right} x-boundary +%fields, the \emph{left} x-boundary fields and finally the corners (see +%figure \ref{fig:partition}). +% +The MPI setup is contained in the file {\ttfamily mpi\_init.c}. The +corresponding function must be called at the beginning of a main +program just after the parameters are read in, also in case of a +serial run. In this function also +the various {\ttfamily MPI\_Datatype}s are constructed needed for the +exchange of the boundary fields. The routines performing the +communication for the various data types are located in files starting +with {\ttfamily xchange\_}. + +The communication is implemented using different types of MPI +functions. One implementation uses the {\ttfamily MPI\_Sendrecv} +function to communicate the data. A second one uses non-blocking MPI +functions and a third one persistent MPI calls. See the MPI standard +for details~\cite{mpi:web}. On machines with a network capable of +sending in several directions in parallel the non-blocking version is +the most efficient one. The relevant configure switches are {\ttfamily + --with-nonblockingmpi} and {\ttfamily --with-persistentmpi}, the +latter of which is only available for the Dirac operator with +halfspinor fields, see section~\ref{sec:dirac}.
+ + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/parallel.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/parallel.tex new file mode 100644 index 0000000000000000000000000000000000000000..f2ee00807e1ad6b9e14444aad6ac19d0c4f73f40 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/parallel.tex @@ -0,0 +1,234 @@ +\subsection{Storage of fields and index arrays} +\label{subsec:storage} + +While the spinor fields are stored in even/odd order, the gauge +fields are stored in lexicographical order. + +In order to allow for continuous +memory access of the gauge fields (without being strided) in the +hopping matrix one can install a copy of the gauge field in correct +order. Ordered exactly in the order they are accessed in the hopping +matrix. The price for this trick is more memory requirement +and the need to copy the gauge field once. The time for copying is +negligible if it is done for instance once before an inversion of the +Dirac operator, during which the gauge fields stay constant. This +trick is switched on by the configure option {\ttfamily + --enable-gaugecopy}. + +Note that the additional memory needs might also affect the cache +coherency and moreover, your application might run out of memory. + +\subsubsection{Space-time index arrays} + +In general the fastest index is the z-direction, then the y-, x- and +time-direction. The local gauge field volume has size +{\ttfamily T*LX*LY*LZ=VOLUME}. + +The spinor fields have only size {\ttfamily (VOLUME)/2}. Thus +there are either only even or only odd sites stored. The two +index arrays called {\ttfamily lexic2eo} and {\ttfamily eo2lexic} map +from lexical to even/odd order and vice versa. {\ttfamily + eo2lexic[ix]} returns the lexical index where {\ttfamily ix} is an +even/odd index and {\ttfamily lexic2eo[ix]} returns the even/odd index +where {\ttfamily ix} is now a lexical index.
{\ttfamily lexic2eosub} +maps the lexical index to the even/odd index in a spinor field of half +size, thus either with only odd or with only even points. + +This means that {\ttfamily lexic2eo} and {\ttfamily eo2lexic} return +values between {\ttfamily 0} and {\ttfamily VOLUME} in even/odd order +or in lexical order, respectively. In case of even/odd order there are +first the even points and then the odd points. Since the spinor +fields have only length {\ttfamily (VOLUME)/2} we need to subtract +{\ttfamily (VOLUME)/2} in case of odd points, which is automatically +done by {\ttfamily lexic2eosub} taking values between {\ttfamily +0} and {\ttfamily VOLUME/2}. Therefore, by using {\ttfamily lexic2eosub} +the information on whether the point is even or odd is lost, but the +index can be used immediately in the actual spinor arrays due to its +correct range. + +With the index array {\ttfamily g\_ipt[t][x][y][z]} one can map the +point $(t,x,y,z)$ to the lexical index. The indices of the next +neighbours of point with lexical index {\ttfamily ix} in forward and +backward direction $\mu$ you can get with +{\ttfamily g\_iup[ix][$\mu$]} and {\ttfamily g\_idn[ix][$\mu$]} +respectively. + +In case the gauge fields are stored with even/odd order the even +fields are stored first. + +\subsection{Parallelisation} + +The whole lattice can be parallelised in up to 4 space-time directions. +It is controlled with the configure switches {\ttfamily --enable-mpi} +and {\ttfamily --with-mpidimension=1|2|3|4}. + +\subsubsection{One dimensional parallelisation} + +In this situation only the time direction is parallelized. {\ttfamily + T} is set to the local time extension. The global time extension is +given by {\ttfamily T*g\_nproc\_t}, where {\ttfamily g\_nproc\_t} +contains the number of processors in time direction. The local time +extension must be equal on each processor. Note that in the input file +the global time extension must be specified.
+ +In case the gauge fields are stored in lexical order the local gauge +fields are located first in memory with indices {\ttfamily 0 to + T*LX*LY*LZ-1}, then come the right boundary fields ({\ttfamily t=T}) +with indices {\ttfamily T*LX*LY*LZ} to {\ttfamily (T+1)*LX*LY*LZ-1} +and then the left boundary fields with index {\ttfamily + (T+1)*LX*LY*LZ} to {\ttfamily (T+2)*LX*LY*LZ-1}. + +The amount needed for the boundary fields is defined to be {\ttfamily + RAND}, such that {\ttfamily VOLUME+RAND=VOLUMEPLUSRAND=(T+2)*LX*LY*LZ} +is the total gauge field size and {\ttfamily (VOLUME+RAND)/2} the +total size of a (half) spinor field. + +In case of even/odd ordering for the gauge fields the storage in +memory is as follows: first come the even local gauge fields, then the +right even boundary fields, then the left even boundary fields, then +the odd local fields and then the odd boundary fields correspondingly. + +The spinor fields are stored with the local fields first, then the +right and then the left boundary fields. + +Note that in the case of one dimensional parallelization the edges do +not need extra care since they are available in the boundary +automatically. + +\subsubsection{Two dimensional parallelisation} + +When the two dimensional parallelization is used the time- and +x-direction are parallelized. Now {\ttfamily T} and {\ttfamily LX} +correspond to the local time- and x-extension of the lattice, +respectively. The global extensions are obtained by multiplying with +{\ttfamily g\_nproc\_t} and {\ttfamily g\_nproc\_x}. Again, in the +input file the global time- and x-extensions must be specified. + +In this case the storage of boundaries and the exchange procedures are +more complicated. The procedure is represented schematically in figure +\ref{fig:partition}. + +\begin{figure}[htbp] +\centering +\includegraphics[width=0.65\linewidth]{partition} +\caption{Boundary exchange in a two dimensional parallel setup.
One + can see that the internal boundary is sent while the external one + is received. The edges need a two step procedure.} +\label{fig:partition} +\end{figure} + +In general the order for gauge and spinor fields in memory is as +follows: first the local fields, then the right t-boundary fields, then +the left t-boundary fields, then the right x-boundary fields, the left +x-boundary fields and finally the edges (see figure +\ref{fig:partition}). {\ttfamily RAND} is now defined to be {\ttfamily + 2*LY*LZ*(LX+T)} not including the edges, which are included only in +{\ttfamily VOLUMEPLUSRAND = LY*LZ*(T+2)*(LX+2)}. (Note that this means +{\ttfamily VOLUME+RAND} is not equal to {\ttfamily VOLUMEPLUSRAND} in +the two dimensional parallelization for historical reasons!) + +\subsubsection{MPI setup} + +The parallelization is set up using MPI. In particular, the number of +available processors is mapped to a cartesian grid using the +corresponding MPI functionality (see the MPI documentation for +details). Therefore in the input file only the number of processors in +x-direction must be specified (and only in the case of a two +dimensional parallelisation). The number of processors in +time-direction is computed automatically, which means that the +executable is independent of the number of processors used in the +actual run. + +In the case of two dimensional parallelisation the internal boundary +in x-direction is not continuous anymore, but strided. Therefore +{\ttfamily MPI\_Type\_vector} is used in order to avoid copying the data +into a send buffer before sending it. Note that the external boundary is +still continuous. + +The MPI setup is contained in the function {\ttfamily mpi\_init} that +must be called at the beginning of a main program just after the +parameters are read in, also in the serial case. In this function also +the various {\ttfamily MPI\_Datatype}s are constructed needed for the +exchange of the boundary fields.
+ +The actual setup is controlled by several variables, which are all set +in {\ttfamily mpi\_init}: +\begin{itemize} +\item {\ttfamily g\_proc\_coords[]} containing the cartesian + coordinates of the local processor in the MPI cartesian grid. +\item {\ttfamily g\_nproc} containing the global number of processors. +\item {\ttfamily g\_proc\_id} containing the processor id in the + original MPI communicator {\ttfamily MPI\_COMM\_WORLD}. +\item {\ttfamily g\_nproc\_x, g\_nproc\_t} containing the number of + processors in x- and time-direction. +\item {\ttfamily g\_cart\_id} containing the processor id in the MPI + cartesian grid. +\item {\ttfamily g\_cart\_grid} containing the MPI communicator for + the cartesian grid. +\item {\ttfamily g\_nb\_t\_dn}, {\ttfamily g\_nb\_t\_up}, {\ttfamily + g\_nb\_x\_dn}, {\ttfamily g\_nb\_x\_up} containing the processor + ids of the neighbouring processors in the corresponding direction in + the cartesian grid. +\item {\ttfamily mpi\_time\_slices} containing a MPI communicator + for a time-row in the cartesian grid. These communicators are needed + for the computation of observables in the parallel setup. +\item {\ttfamily mpi\_time\_rank} containing the processor id in the + time slice communicators. +\end{itemize} + +\subsubsection{Exchange routines} + +There are exchange routines available for the gauge fields ({\ttfamily + xchange\_gauge}), for a (half) spinor field ({\ttfamily + xchange\_spinor}) and for the derivatives ({\ttfamily xchange\_deri}), +respectively. + +In the test directory there is a routine called {\ttfamily + check\_xchange} which tests whether the exchange routines work correctly. 
+ + +\begin{table}[t] + \centering + \begin{tabular*}{1.\textwidth}{@{\extracolsep{\fill}}ccc} + \hline\hline + & start address & size \\ + \hline\hline + local volume & 0 & T*LX*LY*LZ \\ + \hline\hline + t-Rand & VOLUME & 2*LX*LY*LZ \\ + x-Rand & ...+2*LX*LY*LZ & 2*T*LY*LZ\\ + y-Rand & ...+2*T*LY*LZ & 2*T*LX*LZ \\ + z-Rand & ...+2*T*LX*LZ & 2*T*LX*LY \\ + \hline\hline + xt-edge & VOLUME+RAND & 4*LY*LZ \\ + yx-edge & ...+4*LY*LZ & 4*T*LZ \\ + ty-edge & ...+4*T*LZ & 4*LX*LZ \\ + zx-edge & ...+4*LX*LZ & 4*T*LY\\ + tz-edge & ...+4*T*LY & 4*LX*LY\\ + zy-edge & ...+4*LX*LY & 4*T*LX\\ + \hline\hline + t2-Rand & VOLUMEPLUSRAND & 2*LX*LY*LZ \\ + x2-Rand & ...+2*LX*LY*LZ & 2*T*LY*LZ \\ + y2-Rand & ...+2*T*LY*LZ & 2*T*LX*LZ \\ + z2-Rand & ...+2*T*LX*LZ & 2*T*LX*LY \\ + \hline\hline + t2x-edge & VOLUMEPLUSRAND+RAND & 4*LY*LZ \\ + x2t-edge & ...+4*LY*LZ & 4*LY*LZ \\ + x2y-edge & ...+4*LY*LZ & 4*T*LZ \\ + y2x-edge & ...+4*T*LZ & 4*T*LZ \\ + t2y-edge & ...+4*T*LZ & 4*LX*LZ \\ + y2t-edge & ...+4*LX*LZ & 4*LX*LZ \\ + t2z-edge & ...+4*LX*LZ & 4*LX*LY \\ + z2t-edge & ...+4*LX*LY & 4*LX*LY \\ + z2x-edge & ...+4*LX*LY & 4*T*LY \\ + x2z-edge & ...+4*T*LY & 4*T*LY \\ + z2y-edge & ...+4*T*LY & 4*T*LX \\ + y2z-edge & ...+4*T*LX & 4*T*LX \\ + \hline\hline + \end{tabular*} +\end{table} + +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/partition.eps b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/partition.eps new file mode 100644 index 0000000000000000000000000000000000000000..bc27d6b26c9da53fd42e4eff1926cbee3c2352ab --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/partition.eps @@ -0,0 +1,644 @@ +%!PS-Adobe-2.0 EPSF-2.0 +%%Title: partition.eps +%%Creator: fig2dev Version 3.2 Patchlevel 3c +%%CreationDate: Mon Mar 7 09:53:53 2005 +%%For: urbach@master (Carsten Urbach) +%%BoundingBox: 0 0 238 215 +%%Magnification: 1.0000 +%%EndComments +/MyAppDict 100 dict dup begin def +/$F2psDict 200 
dict def +$F2psDict begin +$F2psDict /mtrx matrix put +/col-1 {0 setgray} bind def +/col0 {0.000 0.000 0.000 srgb} bind def +/col1 {0.000 0.000 1.000 srgb} bind def +/col2 {0.000 1.000 0.000 srgb} bind def +/col3 {0.000 1.000 1.000 srgb} bind def +/col4 {1.000 0.000 0.000 srgb} bind def +/col5 {1.000 0.000 1.000 srgb} bind def +/col6 {1.000 1.000 0.000 srgb} bind def +/col7 {1.000 1.000 1.000 srgb} bind def +/col8 {0.000 0.000 0.560 srgb} bind def +/col9 {0.000 0.000 0.690 srgb} bind def +/col10 {0.000 0.000 0.820 srgb} bind def +/col11 {0.530 0.810 1.000 srgb} bind def +/col12 {0.000 0.560 0.000 srgb} bind def +/col13 {0.000 0.690 0.000 srgb} bind def +/col14 {0.000 0.820 0.000 srgb} bind def +/col15 {0.000 0.560 0.560 srgb} bind def +/col16 {0.000 0.690 0.690 srgb} bind def +/col17 {0.000 0.820 0.820 srgb} bind def +/col18 {0.560 0.000 0.000 srgb} bind def +/col19 {0.690 0.000 0.000 srgb} bind def +/col20 {0.820 0.000 0.000 srgb} bind def +/col21 {0.560 0.000 0.560 srgb} bind def +/col22 {0.690 0.000 0.690 srgb} bind def +/col23 {0.820 0.000 0.820 srgb} bind def +/col24 {0.500 0.190 0.000 srgb} bind def +/col25 {0.630 0.250 0.000 srgb} bind def +/col26 {0.750 0.380 0.000 srgb} bind def +/col27 {1.000 0.500 0.500 srgb} bind def +/col28 {1.000 0.630 0.630 srgb} bind def +/col29 {1.000 0.750 0.750 srgb} bind def +/col30 {1.000 0.880 0.880 srgb} bind def +/col31 {1.000 0.840 0.000 srgb} bind def + +end +save +newpath 0 215 moveto 0 0 lineto 238 0 lineto 238 215 lineto closepath clip newpath +-13.0 228.0 translate +1 -1 scale + +% This junk string is used by the show operators +/PATsstr 1 string def +/PATawidthshow { % cx cy cchar rx ry string + % Loop over each character in the string + { % cx cy cchar rx ry char + % Show the character + dup % cx cy cchar rx ry char char + PATsstr dup 0 4 -1 roll put % cx cy cchar rx ry char (char) + false charpath % cx cy cchar rx ry char + /clip load PATdraw + % Move past the character (charpath modified the + % current point) + 
currentpoint % cx cy cchar rx ry char x y + newpath + moveto % cx cy cchar rx ry char + % Reposition by cx,cy if the character in the string is cchar + 3 index eq { % cx cy cchar rx ry + 4 index 4 index rmoveto + } if + % Reposition all characters by rx ry + 2 copy rmoveto % cx cy cchar rx ry + } forall + pop pop pop pop pop % - + currentpoint + newpath + moveto +} bind def +/PATcg { + 7 dict dup begin + /lw currentlinewidth def + /lc currentlinecap def + /lj currentlinejoin def + /ml currentmiterlimit def + /ds [ currentdash ] def + /cc [ currentrgbcolor ] def + /cm matrix currentmatrix def + end +} bind def +% PATdraw - calculates the boundaries of the object and +% fills it with the current pattern +/PATdraw { % proc + save exch + PATpcalc % proc nw nh px py + 5 -1 roll exec % nw nh px py + newpath + PATfill % - + restore +} bind def +% PATfill - performs the tiling for the shape +/PATfill { % nw nh px py PATfill - + PATDict /CurrentPattern get dup begin + setfont + % Set the coordinate system to Pattern Space + PatternGState PATsg + % Set the color for uncolored pattezns + PaintType 2 eq { PATDict /PColor get PATsc } if + % Create the string for showing + 3 index string % nw nh px py str + % Loop for each of the pattern sources + 0 1 Multi 1 sub { % nw nh px py str source + % Move to the starting location + 3 index 3 index % nw nh px py str source px py + moveto % nw nh px py str source + % For multiple sources, set the appropriate color + Multi 1 ne { dup PC exch get PATsc } if + % Set the appropriate string for the source + 0 1 7 index 1 sub { 2 index exch 2 index put } for pop + % Loop over the number of vertical cells + 3 index % nw nh px py str nh + { % nw nh px py str + currentpoint % nw nh px py str cx cy + 2 index oldshow % nw nh px py str cx cy + YStep add moveto % nw nh px py str + } repeat % nw nh px py str + } for + 5 { pop } repeat + end +} bind def + +% PATkshow - kshow with the current pattezn +/PATkshow { % proc string + exch bind % string proc 
+ 1 index 0 get % string proc char + % Loop over all but the last character in the string + 0 1 4 index length 2 sub { + % string proc char idx + % Find the n+1th character in the string + 3 index exch 1 add get % string proe char char+1 + exch 2 copy % strinq proc char+1 char char+1 char + % Now show the nth character + PATsstr dup 0 4 -1 roll put % string proc chr+1 chr chr+1 (chr) + false charpath % string proc char+1 char char+1 + /clip load PATdraw + % Move past the character (charpath modified the current point) + currentpoint newpath moveto + % Execute the user proc (should consume char and char+1) + mark 3 1 roll % string proc char+1 mark char char+1 + 4 index exec % string proc char+1 mark... + cleartomark % string proc char+1 + } for + % Now display the last character + PATsstr dup 0 4 -1 roll put % string proc (char+1) + false charpath % string proc + /clip load PATdraw + neewath + pop pop % - +} bind def +% PATmp - the makepattern equivalent +/PATmp { % patdict patmtx PATmp patinstance + exch dup length 7 add % We will add 6 new entries plus 1 FID + dict copy % Create a new dictionary + begin + % Matrix to install when painting the pattern + TilingType PATtcalc + /PatternGState PATcg def + PatternGState /cm 3 -1 roll put + % Check for multi pattern sources (Level 1 fast color patterns) + currentdict /Multi known not { /Multi 1 def } if + % Font dictionary definitions + /FontType 3 def + % Create a dummy encoding vector + /Encoding 256 array def + 3 string 0 1 255 { + Encoding exch dup 3 index cvs cvn put } for pop + /FontMatrix matrix def + /FontBBox BBox def + /BuildChar { + mark 3 1 roll % mark dict char + exch begin + Multi 1 ne {PaintData exch get}{pop} ifelse % mark [paintdata] + PaintType 2 eq Multi 1 ne or + { XStep 0 FontBBox aload pop setcachedevice } + { XStep 0 setcharwidth } ifelse + currentdict % mark [paintdata] dict + /PaintProc load % mark [paintdata] dict paintproc + end + gsave + false PATredef exec true PATredef + grestore + 
cleartomark % - + } bind def + currentdict + end % newdict + /foo exch % /foo newlict + definefont % newfont +} bind def +% PATpcalc - calculates the starting point and width/height +% of the tile fill for the shape +/PATpcalc { % - PATpcalc nw nh px py + PATDict /CurrentPattern get begin + gsave + % Set up the coordinate system to Pattern Space + % and lock down pattern + PatternGState /cm get setmatrix + BBox aload pop pop pop translate + % Determine the bounding box of the shape + pathbbox % llx lly urx ury + grestore + % Determine (nw, nh) the # of cells to paint width and height + PatHeight div ceiling % llx lly urx qh + 4 1 roll % qh llx lly urx + PatWidth div ceiling % qh llx lly qw + 4 1 roll % qw qh llx lly + PatHeight div floor % qw qh llx ph + 4 1 roll % ph qw qh llx + PatWidth div floor % ph qw qh pw + 4 1 roll % pw ph qw qh + 2 index sub cvi abs % pw ph qs qh-ph + exch 3 index sub cvi abs exch % pw ph nw=qw-pw nh=qh-ph + % Determine the starting point of the pattern fill + %(px, py) + 4 2 roll % nw nh pw ph + PatHeight mul % nw nh pw py + exch % nw nh py pw + PatWidth mul exch % nw nh px py + end +} bind def + +% Save the original routines so that we can use them later on +/oldfill /fill load def +/oldeofill /eofill load def +/oldstroke /stroke load def +/oldshow /show load def +/oldashow /ashow load def +/oldwidthshow /widthshow load def +/oldawidthshow /awidthshow load def +/oldkshow /kshow load def + +% These defs are necessary so that subsequent procs don't bind in +% the originals +/fill { oldfill } bind def +/eofill { oldeofill } bind def +/stroke { oldstroke } bind def +/show { oldshow } bind def +/ashow { oldashow } bind def +/widthshow { oldwidthshow } bind def +/awidthshow { oldawidthshow } bind def +/kshow { oldkshow } bind def +/PATredef { + MyAppDict begin + { + /fill { /clip load PATdraw newpath } bind def + /eofill { /eoclip load PATdraw newpath } bind def + /stroke { PATstroke } bind def + /show { 0 0 null 0 0 6 -1 roll PATawidthshow } 
bind def + /ashow { 0 0 null 6 3 roll PATawidthshow } + bind def + /widthshow { 0 0 3 -1 roll PATawidthshow } + bind def + /awidthshow { PATawidthshow } bind def + /kshow { PATkshow } bind def + } { + /fill { oldfill } bind def + /eofill { oldeofill } bind def + /stroke { oldstroke } bind def + /show { oldshow } bind def + /ashow { oldashow } bind def + /widthshow { oldwidthshow } bind def + /awidthshow { oldawidthshow } bind def + /kshow { oldkshow } bind def + } ifelse + end +} bind def +false PATredef +% Conditionally define setcmykcolor if not available +/setcmykcolor where { pop } { + /setcmykcolor { + 1 sub 4 1 roll + 3 { + 3 index add neg dup 0 lt { pop 0 } if 3 1 roll + } repeat + setrgbcolor - pop + } bind def +} ifelse +/PATsc { % colorarray + aload length % c1 ... cn length + dup 1 eq { pop setgray } { 3 eq { setrgbcolor } { setcmykcolor + } ifelse } ifelse +} bind def +/PATsg { % dict + begin + lw setlinewidth + lc setlinecap + lj setlinejoin + ml setmiterlimit + ds aload pop setdash + cc aload pop setrgbcolor + cm setmatrix + end +} bind def + +/PATDict 3 dict def +/PATsp { + true PATredef + PATDict begin + /CurrentPattern exch def + % If it's an uncolored pattern, save the color + CurrentPattern /PaintType get 2 eq { + /PColor exch def + } if + /CColor [ currentrgbcolor ] def + end +} bind def +% PATstroke - stroke with the current pattern +/PATstroke { + countdictstack + save + mark + { + currentpoint strokepath moveto + PATpcalc % proc nw nh px py + clip newpath PATfill + } stopped { + (*** PATstroke Warning: Path is too complex, stroking + with gray) = + cleartomark + restore + countdictstack exch sub dup 0 gt + { { end } repeat } { pop } ifelse + gsave 0.5 setgray oldstroke grestore + } { pop restore pop } ifelse + newpath +} bind def +/PATtcalc { % modmtx tilingtype PATtcalc tilematrix + % Note: tiling types 2 and 3 are not supported + gsave + exch concat % tilingtype + matrix currentmatrix exch % cmtx tilingtype + % Tiling type 1 and 3: constant 
spacing + 2 ne { + % Distort the pattern so that it occupies + % an integral number of device pixels + dup 4 get exch dup 5 get exch % tx ty cmtx + XStep 0 dtransform + round exch round exch % tx ty cmtx dx.x dx.y + XStep div exch XStep div exch % tx ty cmtx a b + 0 YStep dtransform + round exch round exch % tx ty cmtx a b dy.x dy.y + YStep div exch YStep div exch % tx ty cmtx a b c d + 7 -3 roll astore % { a b c d tx ty } + } if + grestore +} bind def +/PATusp { + false PATredef + PATDict begin + CColor PATsc + end +} bind def + +% right30 +11 dict begin +/PaintType 1 def +/PatternType 1 def +/TilingType 1 def +/BBox [0 0 1 1] def +/XStep 1 def +/YStep 1 def +/PatWidth 1 def +/PatHeight 1 def +/Multi 2 def +/PaintData [ + { clippath } bind + { 32 16 true [ 32 0 0 -16 0 16 ] + {<00030003000c000c0030003000c000c0030003000c000c00 + 30003000c000c00000030003000c000c0030003000c000c0 + 030003000c000c0030003000c000c000>} + imagemask } bind +] def +/PaintProc { + pop + exec fill +} def +currentdict +end +/P2 exch def + +% left45 +11 dict begin +/PaintType 1 def +/PatternType 1 def +/TilingType 1 def +/BBox [0 0 1 1] def +/XStep 1 def +/YStep 1 def +/PatWidth 1 def +/PatHeight 1 def +/Multi 2 def +/PaintData [ + { clippath } bind + { 32 32 true [ 32 0 0 -32 0 32 ] + {<808080804040404020202020101010100808080804040404 + 020202020101010180808080404040402020202010101010 + 080808080404040402020202010101018080808040404040 + 202020201010101008080808040404040202020201010101 + 808080804040404020202020101010100808080804040404 + 0202020201010101>} + imagemask } bind +] def +/PaintProc { + pop + exec fill +} def +currentdict +end +/P4 exch def + +% crosshatch45 +11 dict begin +/PaintType 1 def +/PatternType 1 def +/TilingType 1 def +/BBox [0 0 1 1] def +/XStep 1 def +/YStep 1 def +/PatWidth 1 def +/PatHeight 1 def +/Multi 2 def +/PaintData [ + { clippath } bind + { 32 32 true [ 32 0 0 -32 0 32 ] + {<828282824444444428282828101010102828282844444444 + 
828282820101010182828282444444442828282810101010 + 282828284444444482828282010101018282828244444444 + 282828281010101028282828444444448282828201010101 + 828282824444444428282828101010102828282844444444 + 8282828201010101>} + imagemask } bind +] def +/PaintProc { + pop + exec fill +} def +currentdict +end +/P6 exch def + +/cp {closepath} bind def +/ef {eofill} bind def +/gr {grestore} bind def +/gs {gsave} bind def +/sa {save} bind def +/rs {restore} bind def +/l {lineto} bind def +/m {moveto} bind def +/rm {rmoveto} bind def +/n {newpath} bind def +/s {stroke} bind def +/sh {show} bind def +/slc {setlinecap} bind def +/slj {setlinejoin} bind def +/slw {setlinewidth} bind def +/srgb {setrgbcolor} bind def +/rot {rotate} bind def +/sc {scale} bind def +/sd {setdash} bind def +/ff {findfont} bind def +/sf {setfont} bind def +/scf {scalefont} bind def +/sw {stringwidth} bind def +/tr {translate} bind def +/tnt {dup dup currentrgbcolor + 4 -2 roll dup 1 exch sub 3 -1 roll mul add + 4 -2 roll dup 1 exch sub 3 -1 roll mul add + 4 -2 roll dup 1 exch sub 3 -1 roll mul add srgb} + bind def +/shd {dup dup currentrgbcolor 4 -2 roll mul 4 -2 roll mul + 4 -2 roll mul srgb} bind def +/$F2psBegin {$F2psDict begin /$F2psEnteredState save def} def +/$F2psEnd {$F2psEnteredState restore end} def + +$F2psBegin +%%Page: 1 1 +10 setmiterlimit + 0.06299 0.06299 sc +% +% Fig objects follow +% +% Polyline +7.500 slw +n 450 2475 m 2025 2475 l 2025 2700 l 450 2700 l + cp gs /PC [[1.00 1.00 1.00] [0.00 0.00 0.00]] def +15.00 15.00 sc P4 [16 0 0 -16 30.00 165.00] PATmp PATsp ef gr PATusp gs col0 s gr +% Polyline +n 450 2700 m 2025 2700 l 2025 2925 l 450 2925 l + cp gs /PC [[1.00 1.00 1.00] [0.00 0.00 0.00]] def +15.00 15.00 sc P2 [16 0 0 -8 30.00 180.00] PATmp PATsp ef gr PATusp gs col0 s gr +% Polyline +n 450 1350 m 2025 1350 l 2025 1575 l 450 1575 l + cp gs /PC [[1.00 1.00 1.00] [0.00 0.00 0.00]] def +15.00 15.00 sc P2 [16 0 0 -8 30.00 90.00] PATmp PATsp ef gr PATusp gs col0 s gr +% Polyline 
+n 450 1125 m 2025 1125 l 2025 1350 l 450 1350 l + cp gs /PC [[1.00 1.00 1.00] [0.00 0.00 0.00]] def +15.00 15.00 sc P4 [16 0 0 -16 30.00 75.00] PATmp PATsp ef gr PATusp gs col0 s gr +% Arc +15.000 slw +gs clippath +2261 2610 m 2315 2636 l 2406 2453 l 2313 2574 l 2353 2426 l cp +eoclip + [60] 0 sd +n 1068.8 1912.5 1410.7 -29.6 29.6 arc +gs col0 s gr + gr + [] 0 sd +% arrowhead +n 2353 2426 m 2313 2574 l 2406 2453 l 2366 2466 l 2353 2426 l + cp gs 0.00 setgray ef gr col0 s +% Arc +gs clippath +2132 1060 m 2160 1113 l 2341 1018 l 2195 1062 l 2313 965 l cp +eoclip + [60] 0 sd +n 2835.0 2205.0 1312.0 -59.0 -121.0 arcn +gs col0 s gr + gr + [] 0 sd +% arrowhead +n 2313 965 m 2195 1062 l 2341 1018 l 2301 1006 l 2313 965 l + cp gs 0.00 setgray ef gr col0 s +% Arc +gs clippath +2365 2582 m 2374 2641 l 2576 2610 l 2424 2604 l 2567 2551 l cp +eoclip +n 2272.0 1377.0 1238.2 0.8 84.8 arc +gs col0 s gr + gr + +% arrowhead +n 2567 2551 m 2424 2604 l 2576 2610 l 2542 2585 l 2567 2551 l + cp gs 0.00 setgray ef gr col0 s +% Arc +gs clippath +1316 2560 m 1365 2594 l 1483 2426 l 1373 2532 l 1434 2391 l cp +eoclip +n 544.5 1912.5 1036.6 -39.0 39.0 arc +gs col0 s gr + gr + +% arrowhead +n 1434 2391 m 1373 2532 l 1483 2426 l 1441 2433 l 1434 2391 l + cp gs 0.00 setgray ef gr col0 s +% Arc +gs clippath +933 1489 m 884 1455 l 766 1623 l 877 1518 l 815 1658 l cp +eoclip +n 1705.5 2137.5 1036.6 141.0 -141.0 arc +gs col0 s gr + gr + +% arrowhead +n 815 1658 m 877 1518 l 766 1623 l 808 1616 l 815 1658 l + cp gs 0.00 setgray ef gr col0 s +% Polyline +7.500 slw +n 225 2475 m 450 2475 l 450 2700 l 225 2700 l + cp gs col0 s gr +% Polyline +n 225 2700 m 450 2700 l 450 2925 l 225 2925 l + cp gs col0 s gr +% Polyline +n 2025 2700 m 2250 2700 l 2250 2925 l 2025 2925 l + cp gs col0 s gr +% Polyline +n 2025 2475 m 2250 2475 l 2250 2700 l 2025 2700 l + cp gs /PC [[1.00 1.00 1.00] [0.00 0.00 0.00]] def +15.00 15.00 sc P6 [16 0 0 -16 135.00 165.00] PATmp PATsp ef gr PATusp gs col0 s gr +% Polyline +n 225 
1350 m 450 1350 l 450 1575 l 225 1575 l + cp gs col0 s gr +% Polyline +n 2025 1125 m 2250 1125 l 2250 1350 l 2025 1350 l + cp gs /PC [[1.00 1.00 1.00] [0.00 0.00 0.00]] def +15.00 15.00 sc P6 [16 0 0 -16 135.00 75.00] PATmp PATsp ef gr PATusp gs col0 s gr +% Polyline +n 2025 1350 m 2250 1350 l 2250 1575 l 2025 1575 l + cp gs col0 s gr +% Polyline +n 3150 2700 m 3375 2700 l + 3375 2475 l gs col0 s gr +% Polyline +n 3375 1125 m 3600 1125 l 3600 1350 l 3375 1350 l + cp gs /PC [[1.00 1.00 1.00] [0.00 0.00 0.00]] def +15.00 15.00 sc P6 [16 0 0 -16 225.00 75.00] PATmp PATsp ef gr PATusp gs col0 s gr +% Polyline +n 3150 1350 m 3375 1350 l + 3375 1575 l gs col0 s gr +% Polyline +n 3150 2475 m + 3150 3150 l gs col0 s gr +% Polyline +n 3375 2700 m + 3555 2700 l gs col0 s gr +% Polyline +n 3375 2700 m + 3375 2790 l gs col0 s gr +% Polyline +n 3150 2475 m + 3645 2475 l gs col0 s gr +% Polyline +n 3150 450 m 3150 1575 l + 3960 1575 l gs col0 s gr +% Polyline +n 3825 1350 m 3375 1350 l + 3375 675 l gs col0 s gr +% Polyline +n 225 2925 m + 225 3600 l gs col0 s gr +% Polyline +n 450 2925 m + 450 3465 l gs col0 s gr +% Polyline +n 2025 2925 m + 2025 3420 l gs col0 s gr +% Polyline +n 2250 2925 m + 2250 3330 l gs col0 s gr +% Polyline +n 225 1350 m + 225 225 l gs col0 s gr +% Polyline +n 450 1125 m + 450 360 l gs col0 s gr +% Polyline +n 2025 1125 m + 2025 450 l gs col0 s gr +% Polyline +n 2250 1125 m + 2250 450 l gs col0 s gr +$F2psEnd +rs +end diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/partition.fig b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/partition.fig new file mode 100644 index 0000000000000000000000000000000000000000..f0b52956b695f2379a7b2652035eacdea0494667 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/partition.fig @@ -0,0 +1,77 @@ +#FIG 3.2 +Landscape +Center +Metric +A4 +100.00 +Single +-2 +1200 2 +6 225 225 3960 3600 +5 1 1 2 0 7 50 0 -1 4.000 0 0 1 0 1068.750 1912.500 2295 1215 2475 2025 2295 2610 + 2 1 2.00 60.00 120.00 +5 1 
1 2 0 7 50 0 -1 4.000 0 1 1 0 2835.000 2205.000 3510 1080 2700 900 2160 1080 + 2 1 2.00 60.00 120.00 +5 1 0 2 0 7 50 0 -1 0.000 0 0 1 0 2271.979 1377.017 3510 1395 3150 2250 2385 2610 + 2 1 2.00 60.00 120.00 +5 1 0 2 0 7 50 0 -1 0.000 0 0 1 0 544.500 1912.500 1350 1260 1575 2025 1350 2565 + 2 1 2.00 60.00 120.00 +5 1 0 2 0 7 50 0 -1 0.000 0 0 1 0 1705.500 2137.500 900 2790 675 2025 900 1485 + 2 1 2.00 60.00 120.00 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 225 2475 450 2475 450 2700 225 2700 225 2475 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 225 2700 450 2700 450 2925 225 2925 225 2700 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 2025 2700 2250 2700 2250 2925 2025 2925 2025 2700 +2 2 0 1 0 7 50 0 46 0.000 0 0 -1 0 0 5 + 2025 2475 2250 2475 2250 2700 2025 2700 2025 2475 +2 2 0 1 0 7 52 0 44 0.000 0 0 -1 0 0 5 + 450 2475 2025 2475 2025 2700 450 2700 450 2475 +2 2 0 1 0 7 52 0 42 0.000 0 0 -1 0 0 5 + 450 2700 2025 2700 2025 2925 450 2925 450 2700 +2 2 0 1 0 7 52 0 42 0.000 0 0 -1 0 0 5 + 450 1350 2025 1350 2025 1575 450 1575 450 1350 +2 2 0 1 0 7 52 0 44 0.000 0 0 -1 0 0 5 + 450 1125 2025 1125 2025 1350 450 1350 450 1125 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 225 1350 450 1350 450 1575 225 1575 225 1350 +2 2 0 1 0 7 50 0 46 0.000 0 0 -1 0 0 5 + 2025 1125 2250 1125 2250 1350 2025 1350 2025 1125 +2 2 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 5 + 2025 1350 2250 1350 2250 1575 2025 1575 2025 1350 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 3 + 3150 2700 3375 2700 3375 2475 +2 1 0 1 0 7 30 0 -1 0.000 0 0 -1 0 0 2 + 3150 2475 3150 3150 +2 1 0 1 0 7 30 0 -1 0.000 0 0 -1 0 0 2 + 3375 2700 3555 2700 +2 1 0 1 0 7 30 0 -1 0.000 0 0 -1 0 0 2 + 3375 2700 3375 2790 +2 1 0 1 0 7 30 0 -1 0.000 0 0 -1 0 0 2 + 3150 2475 3645 2475 +2 2 0 1 0 7 50 0 46 0.000 0 0 -1 0 0 5 + 3375 1125 3600 1125 3600 1350 3375 1350 3375 1125 +2 1 0 1 0 7 50 0 -1 0.000 0 0 -1 0 0 3 + 3150 1350 3375 1350 3375 1575 +2 1 0 1 0 7 30 0 -1 0.000 0 0 -1 0 0 3 + 3150 450 3150 1575 3960 1575 +2 1 0 1 0 7 30 0 -1 0.000 0 0 -1 0 0 3 + 
3825 1350 3375 1350 3375 675 +2 1 0 1 0 7 30 0 -1 0.000 0 0 -1 0 0 2 + 225 2925 225 3600 +2 1 0 1 0 7 30 0 -1 0.000 0 0 -1 0 0 2 + 450 2925 450 3465 +2 1 0 1 0 7 30 0 -1 0.000 0 0 -1 0 0 2 + 2025 2925 2025 3420 +2 1 0 1 0 7 30 0 -1 0.000 0 0 -1 0 0 2 + 2250 2925 2250 3330 +2 1 0 1 0 7 30 0 -1 0.000 0 0 -1 0 0 2 + 225 1350 225 225 +2 1 0 1 0 7 30 0 -1 0.000 0 0 -1 0 0 2 + 450 1125 450 360 +2 1 0 1 0 7 30 0 -1 0.000 0 0 -1 0 0 2 + 2025 1125 2025 450 +2 1 0 1 0 7 30 0 -1 0.000 0 0 -1 0 0 2 + 2250 1125 2250 450 +-6 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/prop_format.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/prop_format.tex new file mode 100644 index 0000000000000000000000000000000000000000..f2c3c2c329a4f66297007a5811aeb014ccfd60df --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/prop_format.tex @@ -0,0 +1,134 @@ +\subsection{Fermion Field File Formats} + +We note at the beginning, that we do not use a different format for +source or sink fermion fields. They are both stored using the same +lime records. The meta-data stored in the same lime-packed file is +supposed to clarify all other things. + +\subsubsection{Propagators} + +Here we mainly concentrate on storing propagators (sink). The file can +contain only sources, or both, source and sink. We (plan to) support +four different formats +\begin{enumerate} +\item (arbitrary number of) sink, no sources +\item (arbitrary number of) source/sink pairs +\item one source, 12 sink +\item one source, 4 sink +\end{enumerate} +This is very similar to the formats in use in parts of the US +community. However, they use XML as a markup language, which we don't +(yet) use. + +We adopt the SCIDAC checksum for gauge and propagator files. + +Every source and sink has to be in a separate lime record.
The order +in one file for the four formats mentioned above is supposed to be +\begin{enumerate} +\item sink, no sources: - +\item source/sink pairs: first source, then sink +\item one source, 12 sink: first source, then 12 sinks +\item one source, 4 sink: first source, then 4 sinks +\end{enumerate} +All fermion field files must have a record indicating the type. The +record itself is of type {\ttfamily propagator-type} and the +record has a single entry (ascii string) which can contain one of +\begin{itemize} +\item {\ttfamily DiracFermion\_Sink} +\item {\ttfamily DiracFermion\_Source\_Sink\_Pairs} +\item {\ttfamily DiracFermion\_ScalarSource\_TwelveSink} +\item {\ttfamily DiracFermion\_ScalarSource\_FourSink} +\end{itemize} +Those strings are also used in the input files of the hmc code for the +input parameter {\ttfamily PropagatorType}. +The binary data corresponding to one Dirac fermion field (source or +sink) is then stored with at least two (three) records. The first is +of type \\ +{\ttfamily etmc-propagator-format} \\ +and should contain the following information: +\begin{verbatim} +<?xml version="1.0" encoding="UTF-8"?> +<etmcFormat> + <field>diracFermion</field> + <precision>32</precision> + <flavours>1</flavours> + <lx>4</lx> + <ly>4</ly> + <lz>4</lz> + <lt>4</lt> +</etmcFormat> +\end{verbatim} +The {\ttfamily flavours} entry must be set to {\ttfamily 1} for a one +flavour propagator (flavour diagonal case) and to {\ttfamily 2} for a two +flavour propagator (flavour non-diagonal 2-flavour operator). In the +former case there follows one record of type +{\ttfamily scidac-binary-data}, which is identical to the SCIDAC +format, containing the fermion field. In the latter case there follow +two of such records, the first of which is the upper flavour. To be +precise, let's call the two flavours $s$ and $c$. Then we always store +the $s$ component first and then the $c$ component. +Any number of other records can be added for convenience. + +The first two types are by now supported. In the future the other two +might follow.
+ +The indices in the binary data {\ttfamily scidac-binary-data} are in +the following order: +\[ +t, z, y, x, s, c\, , +\] +where $t$ is the slowest and colour the fastest running index. +The binary data is stored big endian and either in single or in double +precision, depending on the {\ttfamily precision} parameter in the +{\ttfamily etmc-propagator-format} record. + +The $\gamma$-matrix convention is the one of the HMC code (see +subsection \ref{gammas}) and the operator is normalised to +\[ +D = +\frac{1}{2}[\gamma_\mu(\nabla_\mu+\nabla_\mu^*)-a\nabla_\mu^*\nabla_\mu] ++ m_0 + i \mu\gamma_5\tau^3\, . +\] +For the non-degenerate case with the two flavour operator the +following operator is inverted: +\[ +D_\mathrm{nd} = +\frac{1}{2}[\gamma_\mu(\nabla_\mu+\nabla_\mu^*)-a\nabla_\mu^*\nabla_\mu] ++ m_0 + i\bar\mu\gamma_5\tau_1+\bar\epsilon\tau_3 +\] + +\subsubsection{Source Fields} + +Source fields are, as mentioned before, stored with the same binary +data format. There are again several types of source files possible: +\begin{itemize} +\item {\ttfamily DiracFermion\_Source} +\item {\ttfamily DiracFermion\_ScalarSource} +\item {\ttfamily DiracFermion\_FourScalarSource} +\item {\ttfamily DiracFermion\_TwelveScalarSource} +\end{itemize} +This type is stored in a record called {\ttfamily source-type} in the +lime file. There might be several sources stored within the same +file. We add a format record {\ttfamily etmc-source-format} looking like +\begin{verbatim} +<?xml version="1.0" encoding="UTF-8"?> +<etmcFormat> + <field>diracFermion</field> + <precision>32</precision> + <flavours>1</flavours> + <lx>4</lx> + <ly>4</ly> + <lz>4</lz> + <lt>4</lt> + <spin>4</spin> + <colour>3</colour> +</etmcFormat> +\end{verbatim} +with obvious meaning for every {\ttfamily scidac-binary-data} record +within the lime packed file. This format record also allows to store a +subset of the whole field, e.g. a timeslice.
+ +%%% Local Variables: +%%% mode: latex +%%% TeX-master: "main" +%%% End: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/quda.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/quda.tex new file mode 100644 index 0000000000000000000000000000000000000000..9cf3be9f7b59f246217ab0d497e1ef491a7f0c85 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/quda.tex @@ -0,0 +1,101 @@ +%author: Mario Schroeck +%date: 04/2015 + +\subsection{QUDA: A library for QCD on GPUs} + + +The QUDA \cite{Clark:2009wm, Babich:2011np, Strelchenko:2013vaa} interface is complementary to tmLQCD's own CUDA kernels for computations on the GPU by Florian Burger. +So far it is exclusively used for inversions. + +\subsubsection{Design goals of the interface} +The QUDA interface has been designed with the following goals in mind, sorted by priority: +\begin{enumerate} + \item \emph{Safety.} Naturally, highest priority is given to the correctness of the output of the interface. + This is trivially achieved by always checking the final residual on the CPU with the default tmLQCD routines. + \item \emph{Ease of use.} Within the operator declarations of the input file (between {\ttfamily BeginOperator} and {\ttfamily EndOperator}) a simple flag {\ttfamily UseQudaInverter} is introduced which, when set to {\ttfamily yes}, will let QUDA perform the inversion of that operator. The operators {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER} are supported.\footnote{{\ttfamily DBCLOVER} is supported by the interface but not by QUDA as of version 0.7.0.} + \item \emph{Minimality.} Minimal changes in the form of {\ttfamily \#ifdef QUDA} precompiler directives to the tmLQCD code base. The main bulk of the interface lies in a single separate file {\ttfamily quda\_interface.c} (with corresponding header file). In the file {\ttfamily operators.c}, the QUDA library is initialized when an operator is initialized which has set {\ttfamily UseQudaInverter = yes}. 
There, the actual call to the inverter is conditionally replaced with a call to the QUDA interface. + \item \emph{Performance.} The higher priority of the previous items results in small performance detriments. In particular: + \begin{itemize} + \item tmLQCD's $\theta$-boundary conditions are not compatible with QUDA's 8 and 12 parameter reconstruction of the gauge fields (as of QUDA-0.7.0). Therefore reconstruction/compression is deactivated by default, although it may be activated via the input file, see below. + \item The gaugefield is transferred each time to the GPU before the inversion starts in order to ensure not to miss any modifications of the gaugefield. + \end{itemize} +\end{enumerate} + + +\subsubsection{Installation} +If not already installed, you have to install QUDA first. Download the most recent version from \url{http://lattice.github.io/quda/}. Note that QUDA version $\geq 0.7.0$ is required (chiral gamma basis). + +QUDA can be installed without any dependencies, consider, e.g., the following minimal configuration: + +\begin{verbatim} +./configure CC=mpicc CXX=mpiCC \ +CFLAGS="-O3 -std=c99" CXXFLAGS="-O3 -std=c++0x" \ +--prefix=$QUDADIR \ +--with-mpi=$MPI_PATH \ +--with-cuda=$CUDADIR \ +--enable-os=linux \ +--enable-cpu-arch=x86_64 \ +--enable-gpu-arch=sm_35 \ +--enable-multi-gpu \ +--enable-wilson-dirac \ +--enable-clover-dirac \ +--enable-twisted-mass-dirac \ +--enable-ndeg-twisted-mass-dirac \ +--enable-twisted-clover-dirac \ +--enable-device-pack +\end{verbatim} +where {\ttfamily \$CUDADIR} and {\ttfamily \$MPI\_PATH} have to be set appropriately. +{\ttfamily \$QUDADIR} is your choice for the installation directory of QUDA. +Note that if you want to use QUDA in a scalar build of tmLQCD, you should remove the lines {\ttfamily --enable-multi-gpu} and {\ttfamily --with-mpi=\$MPI\_PATH} in the configuration (and probably you want to replace the MPI compilers). 
+In order to profit from QUDA's autotuning functionality, set the environment variable {\ttfamily QUDA\_RESOURCE\_PATH} to a directory of your choice, e.g., add +\begin{verbatim} +export QUDA_RESOURCE_PATH=${HOME}/quda_resources/ +\end{verbatim} +to your {\ttfamily $\sim$/.bash\_profile}. + +Once QUDA is installed, a minimal configuration of tmLQCD could look like, e.g., +\begin{verbatim} +./configure CC=mpicc \ +--prefix=$TMLQCDDIR \ +--with-limedir=$LIMEDIR \ +--with-lapack= \ +--enable-mpi \ +--with-mpidimension=4 \ +CXX=mpiCC \ +--with-qudadir=$QUDADIR \ +--with-cudadir=${CUDADIR}/lib +\end{verbatim} +Note that a {\ttfamily C++} compiler is required for linking against the QUDA library, therefore set {\ttfamily CXX} appropriately. {\ttfamily \${QUDADIR}} is where you installed QUDA in the previous step and {\ttfamily \${CUDADIR}} is required again for linking. + + +\subsubsection{Usage} +Any main program that reads and handles the operator declaration from an input file can easily be set up to use the QUDA inverter by setting the {\ttfamily UseQudaInverter} flag to {\ttfamily yes}. For example, in the input file for the {\ttfamily invert} executable, add the flag to the operator declaration as +\begin{verbatim} +BeginOperator TMWILSON + 2kappaMu = 0.05 + kappa = 0.177 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 + UseQudaInverter = yes +EndOperator +\end{verbatim} +and the operator of interest will be inverted using QUDA. The initialization of QUDA is done automatically within the operator initialization, the QUDA library should be finalized by a call to {\ttfamily \_endQuda()} just before finalizing MPI. When you use the QUDA interface for work that is being published, don't forget to cite \cite{Clark:2009wm, Babich:2011np, Strelchenko:2013vaa}. 
+ + +\subsubsection{More advanced settings} +To achieve higher performance you may choose single (default) or even half precision as sloppy precision for the inner solver of the mixed precision inverter with reliable updates. After {\ttfamily BeginOperator} and before {\ttfamily EndOperator} set {\ttfamily UseSloppyPrecision = double|single|half}. + +To activate compression of the gauge fields (in order to save bandwidth and thus to achieve higher performance), set {\ttfamily UseCompression = 8|12|18} within {\ttfamily BeginOperator} and {\ttfamily EndOperator}. The default is 18 which corresponds to no compression. Note that if you use compression, trivial (anti)periodic boundary conditions will be applied to the gauge fields, instead of the default $\theta$-boundary conditions. As a consequence, the residual check on tmLQCD side will fail. Moreover, compression is not applicable when using general $\theta$-boundary conditions in the spatial directions. If trying to do so, compression will be deactivated automatically and the user gets informed via the standard output. + + +\subsubsection{Functionality} +The QUDA interface can currently be used to invert {\ttfamily TMWILSON, WILSON, DBTMWILSON} and {\ttfamily CLOVER} within a 4D multi-GPU (MPI) parallel environment with CG or BICGSTAB. QUDA uses even-odd preconditioning, if wanted ({\ttfamily UseEvenOdd = yes}), and the interface is set up to use a mixed precision solver by default. For more details on the QUDA settings check the function {\ttfamily \_initQuda()} in {\ttfamily quda\_interface.c}.
+ + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/doc/rational.tex b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/rational.tex new file mode 100644 index 0000000000000000000000000000000000000000..032015b557ae68ee7869561b3f93d72077d0290e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/doc/rational.tex @@ -0,0 +1,231 @@ +\subsection{Rational HMC} + +For the heavy doublet one may alternatively use a rational +approximation +\[ +\mathcal{R}(\hat Q_h^2)\ = \ \prod_{i = 1}^N \frac{\hat Q_h^2 + + a_{2i-1}}{\hat Q_h^2 + a_{2i}}\approx\quad\frac{1}{\sqrt{\hat Q_h^2}} +\] +where we used the shorthand notation +\[ +\hat Q_h^2\ =\ \gamma_5 \hat D_h \tau^1\gamma_5\hat D_h \tau^1 +\] +and $\hat Q_h=\gamma_5\hat D_h\tau^1$ is the even/odd preconditioned version +of $Q_h$ defined in Eq.~(\ref{eq:Dh}). Obviously, we have $\hat +Q_h^\dagger = \hat Q_h$. +We are using the Zolotarev solution for the +optimal rational approximation to $1/\sqrt{y}$. The coefficients $a_i$ +fulfill the property +\[ +a_1 > a_2 > ... > a_{2N} > 0\, . +\] +We use the partial fraction expansion to re-express +\[ +\mathcal{R}(\hat Q_h^2)\ = \ 1 + \sum_{i=1}^{N} \frac{q_i}{\hat Q_h^2 + + \mu_i^2}\, . +\] +The coefficients $q_i$ are given as +\[ +q_i = (a_{2i-1} - a_{2i}) \prod_{m=1, m\neq i}^N \frac{a_{2m-1} + - a_{2i}}{a_{2m} - a_{2i}}\,,\quad i = 1,...,N\,. +\] +If we define -- following L{\"u}scher -- $\mu_i = \sqrt{a_{2i}}$ and $\nu_i += \sqrt{a_{2i-1}}$, we may rewrite $q_i$ as +\[ +q_i = (\nu_i^2 - \mu_i^2)\prod_{m=1, m\neq i}^N \frac{\nu_m^2 - + \mu_i^2}{\mu_m^2 - \mu_i^2}\,,\quad i = 1,...,N\, . +\] +For the heatbath step we need to generate pseudo-fermion fields from +Gaussian random fields $R$ +\[ +R^\dagger R = \phi^\dagger \mathcal{R} \phi +\] +and, therefore, we need operators $C^\dagger, C$ with +\[ +\mathcal{R}^{-1} = C^\dagger\ \cdot\ C\,,\qquad \phi = C\cdot R\,.
+\] +$C$ is given by (inspired by twisted mass) +\[ +C\ =\ \prod_{i=1}^N \frac{\hat Q_h + i\mu_i}{\hat Q_h + i\nu_i} +\] +which can again be written as a partial fraction +\[ +C\ =\ 1 + i\sum_{i=1}^N \frac{r_i}{\hat Q_h + i\nu_i}\,, +\] +with +\[ +r_i = (\mu_i - \nu_i)\prod_{m=1, m\neq i}^N \frac{\mu_m - + \nu_i}{\nu_m - \nu_i}\,,\quad i = 1,...,N\, . +\] +The rational approximation $\mathcal{R}$ can be applied to a vector +using a multi-mass solver and the partial fraction representation. The +same works for $C$: after solving $N$ equations simultaneously for +$(\hat Q_h^2 + \nu_i^2)^{-1},\quad i = 1,...,N$, we have to multiply +every term with $(\hat Q_h - i\nu_i)$. The hermitian conjugate of $C$ +is given by +\[ +C^\dagger\ =\ 1 - i\sum_{i=1}^N \frac{r_i}{\hat Q_h - i\nu_i}\,, +\] +using $\hat Q_h^\dagger = \hat Q_h$. + +For the acceptance step one just needs an application of $\mathcal{R}$. + +\subsubsection{Force Computation} + +For the derivative and the force computation we have to consider terms +of the form +\[ +\phi^\dagger \frac{q_i}{\hat Q_h^2 + \mu_i^2}\phi\,, +\] +and its variation with respect to the gauge fields: +\[ +\begin{split} +\delta_U\ \phi^\dagger \frac{q_i}{\hat Q_h^2 + \mu_i^2}\phi &= +q_i\phi^\dagger\frac{1}{\hat Q_h + i\mu_i}\frac{1}{\hat Q_h - +i\mu_i}(-\delta_U \hat Q_h)\frac{1}{\hat Q_h -i\mu_i}\phi\ +\ +\textrm{h.c.}\\ +&= -2 \re\left( q_i\phi^\dagger\frac{1}{\hat Q_h^2 + \mu_i^2} +(\delta_U \hat Q_h)\frac{1}{\hat Q_h -i\mu_i}\phi\ \right) +\end{split} +\] + +\subsubsection{Splitting of the Rational} + +For preconditioning the fermion determinant it is useful to split the +rational into several products +\[ +\mathcal{R}(\hat Q_h^2)\ = r_{0}^{l}(\hat Q_h^2)\cdot r_{l}^{k}(\hat +Q_h^2)\cdot ... +\] +with terms +\[ +r_{c_0}^{c_1} = \ \prod_{i = c_0}^{c_1} \frac{\hat Q_h^2 + + a_{2i-1}}{\hat Q_h^2 + a_{2i}}\,. +\] +Every term $r_{c_0}^{c_1}$ can then again be written as a partial +fraction with the same coefficients as given above. 
In +Ref.~\cite{Clark:2006fx} it was shown that the different partial +fractions contribute quite differently in their magnitude of the +corresponding force to the MD evolution: the smallest shifts and, +therefore, most expensive ones contribute the least to the +force. Hence, those can be integrated on a larger timescale than the +larger shifts, which contribute significantly more to the total MD +force. + +\subsubsection{Correction Monomial} + +The rational approximation has a finite precision. In the HMC one can +account for this effect by estimating +\[ +1 - |\hat Q_h| \mathcal{R}\,, +\] +which can be done in different ways: +\begin{itemize} +\item we include an additional monomial for + \[ + \det (|\hat Q_h| \mathcal{R}) + \] + in the Hamiltonian. If the rational approximation is precise enough, + it is sufficient to only include this in the heatbath and acceptance + step and ignore the contribution to the derivative. For generating + the pseudo-fermion field for this monomial, one needs to find + \[ + B\cdot B^\dagger = |\hat Q_h| \mathcal{R}, + \] + which, following Ref.~\cite{Luscher:2010ae}, can be expanded in + terms of + \[ + Z = \hat Q_h^2\mathcal{R}^2 -1\,. + \] + The series + \[ + B = (1+Z)^{1/4} = 1 + \frac{1}{4} Z - \frac{3}{32} Z^2 + \frac{7}{128} Z^3 + ... + \] + is rapidly converging and can usually be truncated after the $Z^2$ + or latest $Z^3$ term, see + Refs.~\cite{Luscher:2010ae,Luscher:2012av}. We then obtain the + pseudo-fermion field $\phi$ by + \[ + \phi = B\cdot R\,, + \] + where $R$ is again a random Gaussian field. For the acceptance step + one needs to compute + \[ + \phi^\dagger (|\hat Q_h|\mathcal{R})^{-1}\phi\,, + \] + which, again expanding in $Z$ is obtained by + \[ + \phi^\dagger (1+Z)^{-1/2} \phi = \phi^\dagger (1 - \frac{1}{2}Z + + \frac{3}{8}Z^2 + ...) \phi\, . + \] + Also here the series can be truncated after the first few terms. +\item the second possibility is to include this correction as a + reweighting factor.
+\item the third is to use a more precise rational approximation for + the heatbath and acceptance steps. +\end{itemize} + +\subsubsection{CGMMS Solver} + +\begin{algorithm} + \caption{CGMMS algorithm} + \label{alg:cgm} + \begin{algorithmic}[1] + \vspace{.2cm} + \STATE $n=0, x_0^k = 0, r_0 = p_0 = p_0^k = b, k_\mathrm{max}, + \delta, \epsilon$ + \STATE $\biggl.\biggr.\alpha_{-1} = \zeta_{-1}^k = \zeta_0^k = 1, \beta_0^k = \beta_0 = 0$ + \REPEAT + \STATE $\alpha_n = (r_n, r_n) / (p_n, A p_n)$ + \FOR{$k = 1$ to $k_\mathrm{max}$} + \STATE $\biggl.\biggr.\zeta_{n+1}^k = (\zeta^k_n \alpha_{n-1}) / + (\alpha_n \beta_n(1 - \zeta_n^k / \zeta^k_{n-1}) + \alpha_{n-1} + (1-\sigma_k\alpha_n))$ + \STATE $\alpha^k_n = (\alpha_n \zeta_{n+1}^k)/ \zeta_n^k$ + \STATE $\biggl.\biggr.x_{n+1}^k = x_n^k + \alpha_n^k p_n^k$ + \IF{$\|\alpha^{k_\mathrm{max}} p^{k_\mathrm{max}}\| < \delta$} + \STATE $k_\mathrm{max} = k_\mathrm{max} -1$ + \ENDIF + \ENDFOR + \STATE $x_{n+1} = x_n + \alpha_n p_n$ + \STATE $\biggl.\biggr.r_{n+1} = r_n - \alpha_n Ap_n$ + \STATE $\beta_{n+1} = (r_{n+1}, r_{n+1}) / (r_n, r_n)$ + \STATE $\beta_{n+1}^k = \frac{\beta_{n+1} \zeta_{n+1}^k \alpha_n^k}{\zeta_{n}^k\alpha_n}$ + \STATE $\biggl.\biggr.p_{n+1}^k = \zeta_{n+1}^k r_{n+1} + \beta_{n+1}^k p_n^k$ + \STATE $n=n+1$ + \UNTIL{$\|r_n\|<\epsilon$} + \end{algorithmic} +\end{algorithm} + + +For evaluating the rational approximation $\mathcal{R}$ applied to a +spinor field $\psi$ a multi-mass or multi-shift solver (see +algorithm~\ref{alg:cgm}) can be used, see Ref.~\cite{Chiarappa:2006hz} +and references therein. However, a little care is needed +as the shifts vary over several orders of magnitude. + +The original Krylov space is built for the shift smallest in +modulus. This will converge slowest and, therefore, the other shifts +will have the same or better precision guaranteed.
But, if the +range in the shifts is too large, one needs to remove the highest +shifts in the course of the CG solve before the smallest shift is +converged. This will prevent the appearance of double precision +underflow and hence the appearance of exact zeros in +$\zeta^{k_\mathrm{max}}$, which would lead to +NaNs in the solution vectors. + +In order to avoid to compute the residue for all the shift frequently +during the CGMMS solve, one can rather monitor the norm of the +correction vector $p^\sigma$ of the currently biggest shift $\sigma$ +still in the process. The CG works such that the correction decreases +with decreasing residue. Therefore, one can remove the shift $\sigma$ +when +\[ +\|\alpha^\sigma p^\sigma\| < \delta\,, +\] +where one could for instance chose $\delta = +c\cdot\epsilon$. $\epsilon$ is the desired precision of the CGMMS +solve and $0. + * + * File expo.c + * + * + * The externally accessible functions are + * + * void exposu3(su3* const vr, const su3adj* const p); + * extern void exposu3_check(su3* const vr, const su3adj* const p, int im); + * extern void restoresu3(su3* const vr,const su3* const u); + * extern void restoresu3_in_place(su3* const u); + * extern void exposu3_in_place(su3* const u); + * + * Author: Martin Hasenbusch + * Tue Aug 28 10:06:56 MEST 2001 + * + ************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef SSE +# undef SSE +#endif +#ifdef SSE2 +# undef SSE2 +#endif +#ifdef SSE3 +# undef SSE3 +#endif +#include +#include +#include +#include "sse.h" +#include "su3.h" +#include "su3adj.h" +#include "expo.h" + +void exposu3(su3* const vr, const su3adj* const p) { + int i; + su3 ALIGN v,v2; + double ALIGN fac,r; + double ALIGN a,b; + _Complex double ALIGN a0,a1,a2,a1p; + + /* it writes 'p=vec(h_{j,mu})' in matrix form 'v' */ + _make_su3(v,*p); + /* calculates v^2 */ + _su3_times_su3(v2,v,v); + /* */ + a = 0.5 * (creal(v2.c00) + creal(v2.c11) + 
creal(v2.c22)); + /* 1/3 imaginary part of tr v*v2 */ + b = 0.33333333333333333 * cimag(v.c00 * v2.c00 + v.c01 * v2.c10 + v.c02 * v2.c20 + + v.c10 * v2.c01 + v.c11 * v2.c11 + v.c12 * v2.c21 + + v.c20 * v2.c02 + v.c21 * v2.c12 + v.c22 * v2.c22 ); + a0 = 0.16059043836821615e-9; + a1 = 0.11470745597729725e-10; + a2 = 0.76471637318198165e-12; + fac = 0.20876756987868099e-8; /* 1/12! */ + r = 12.0; + for(i = 3; i <= 15; ++i) + { + a1p = a0 + a * a2; + a0 = fac + b * I * a2; + a2 = a1; + a1 = a1p; + fac *= r; + r -= 1.0; + } + /* vr = a0 + a1*v + a2*v2 */ + vr->c00 = a0 + a1 * v.c00 + a2 * v2.c00; + vr->c01 = a1 * v.c01 + a2 * v2.c01; + vr->c02 = a1 * v.c02 + a2 * v2.c02; + vr->c10 = a1 * v.c10 + a2 * v2.c10; + vr->c11 = a0 + a1 * v.c11 + a2 * v2.c11; + vr->c12 = a1 * v.c12 + a2 * v2.c12; + vr->c20 = a1 * v.c20 + a2 * v2.c20; + vr->c21 = a1 * v.c21 + a2 * v2.c21; + vr->c22 = a0 + a1 * v.c22 + a2 * v2.c22; +} + +void exposu3_check(su3* const vr, const su3adj* const p, int im) { + /* compute the result by taylor series */ + su3 ALIGN v,v2,v3; + double ALIGN fac; + int i; + + _make_su3(v, *p); + _su3_one(*vr); + _su3_acc(*vr, v); + _su3_times_su3(v2, v, v); + _su3_refac_acc(*vr, 0.5, v2); + fac = 0.5; + for(i = 3; i <= im; i++) { + fac = fac/i; + _su3_times_su3(v3, v2, v); + _su3_refac_acc(*vr, fac, v3); + _su3_assign(v2, v3); + } +} + +void restoresu3(su3* const vr, const su3* const u) { + double ALIGN n0,n1; + + /* normalize rows 1 and 2 */ + n0 = 1.0 / sqrt(conj(u->c00) * u->c00 + conj(u->c01) * u->c01 + conj(u->c02) * u->c02); + n1 = 1.0 / sqrt(conj(u->c10) * u->c10 + conj(u->c11) * u->c11 + conj(u->c12) * u->c12); + + vr->c00 = n0 * u->c00; + vr->c01 = n0 * u->c01; + vr->c02 = n0 * u->c02; + + vr->c10 = n1 * u->c10; + vr->c11 = n1 * u->c11; + vr->c12 = n1 * u->c12; + + /* compute row 3 as the conjugate of the cross-product of 1 and 2 */ + vr->c20 = conj(vr->c01 * vr->c12 - vr->c02 * vr->c11); + vr->c21 = conj(vr->c02 * vr->c10 - vr->c00 * vr->c12); + vr->c22 = 
conj(vr->c00 * vr->c11 - vr->c01 * vr->c10); +} + +void restoresu3_in_place(su3* const u) { + double ALIGN n0,n1; + + /* normalize rows 1 and 2 */ + n0 = 1.0 / sqrt(conj(u->c00) * u->c00 + conj(u->c01) * u->c01 + conj(u->c02) * u->c02); + n1 = 1.0 / sqrt(conj(u->c10) * u->c10 + conj(u->c11) * u->c11 + conj(u->c12) * u->c12); + + u->c00 = n0 * u->c00; + u->c01 = n0 * u->c01; + u->c02 = n0 * u->c02; + + u->c10 = n1 * u->c10; + u->c11 = n1 * u->c11; + u->c12 = n1 * u->c12; + + /* compute row 3 as the conjugate of the cross-product of 1 and 2 */ + u->c20 = conj(u->c01 * u->c12 - u->c02 * u->c11); + u->c21 = conj(u->c02 * u->c10 - u->c00 * u->c12); + u->c22 = conj(u->c00 * u->c11 - u->c01 * u->c10); +} + +/* Exponentiates a hermitian 3x3 matrix Q */ +/* Convenience function -- wrapper around Hasenbusch's implementation */ +void exposu3_in_place(su3* const u) { + su3adj ALIGN p; + + _trace_lambda(p, *u); /* Projects onto the Gell-Mann matrices */ + /* -2.0 to get su3 to su3adjoint consistency ****/ + p.d1 *= -0.5; p.d2 *= -0.5; p.d3 *= -0.5; p.d4 *= -0.5; + p.d5 *= -0.5; p.d6 *= -0.5; p.d7 *= -0.5; p.d8 *= -0.5; + exposu3(u,&p); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/expo.h b/qcd/part_cpu/applications/QCD/src/kernel_D/expo.h new file mode 100644 index 0000000000000000000000000000000000000000..dd0c3657f0291b5abbce94358b83f9044079aef1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/expo.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _EXPO_H +#define _EXPO_H + +extern void exposu3(su3* const vr, const su3adj* const p); +extern void exposu3_check(su3* const vr, const su3adj* const p, int im); +extern void restoresu3(su3* const vr, const su3* const u); +extern void restoresu3_in_place(su3* const u); +extern void exposu3_in_place(su3* const u); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/fatal_error.c b/qcd/part_cpu/applications/QCD/src/kernel_D/fatal_error.c new file mode 100644 index 0000000000000000000000000000000000000000..f7a153b6fc0b13a120a8512c753e7073b3e5e709 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/fatal_error.c @@ -0,0 +1,57 @@ +/*********************************************************************** + * + * Copyright (C) 2012 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include + +#ifdef MPI +#include +#endif + +#include "fatal_error.h" +/* Print an optional error report to stderr, then terminate the program; this function never returns. 'error' is the message text, 'function' names the originating function; either may be NULL. */ +void fatal_error(char const *error, char const *function) +{ + if (error != NULL) /* without a message nothing is printed, but the program is still terminated below */ + { + fprintf(stderr, "FATAL ERROR\n"); + if (function != NULL) /* the "Within ..." line is only emitted when the caller names itself */ + { +#ifdef MPI + fprintf(stderr, " Within %s (reported by node %d):\n", function, g_proc_id); /* NOTE(review): g_proc_id is not declared in the visible part of this file; presumably it comes from one of the includes - confirm */ +#else + fprintf(stderr, " Within %s:\n", function); +#endif + } + fprintf(stderr, " %s\n", error); + fflush(stderr); /* flush so the report is written out before the process is terminated */ + } + +#ifdef MPI + MPI_Abort(MPI_COMM_WORLD, 1); /* NOTE(review): MPI_Abort normally does not return, so the MPI_Finalize below is presumably never reached - confirm */ + MPI_Finalize(); +#endif + + exit(500); /* NOTE(review): 500 is outside the 0-255 range of POSIX exit statuses; the parent will presumably observe 500 & 0xFF = 244 - confirm this is intended */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/fatal_error.h b/qcd/part_cpu/applications/QCD/src/kernel_D/fatal_error.h new file mode 100644 index 0000000000000000000000000000000000000000..7a1e6622acebae91ef1eb0c25088934836866eaa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/fatal_error.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * + * Copyright (C) 2012 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see .
The two parameters + are strings describing the error and identifying the originating function respectively */ + +void fatal_error(char const *error, char const *function); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/fixed_volume.h.in b/qcd/part_cpu/applications/QCD/src/kernel_D/fixed_volume.h.in new file mode 100644 index 0000000000000000000000000000000000000000..a71f01a7611ddb50ca7f0d3548a990ff7eb550cd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/fixed_volume.h.in @@ -0,0 +1,70 @@ +/******************************************************** + * + * In case the code was configure for fixed volume at + * compiletime, the parameters have to be set here! + * + * Note: + * - 4dim. parallesitaion: T*LX*LY must be even + * - always: LZ must be even. + * + * - on the Blue Gene/L try the following setup + * in oder to get the best performance: + * nodecard (64CPU): 4x4x2x2 + * 4nodecard(128CPU): 8x4x2x2 + * midplane(1024CPU): 8x8x8x2 + * rack (2048CPU): 8x8x16x2 + * + * + ********************************************************/ + +#ifndef _FIXED_VOLUME_H +#define _FIXED_VOLUME_H + +# if defined FIXEDVOLUME + +/* Set the next 8 number! 
*/ + +# define Tdef 48 +# define Xdef 24 +# define Ydef 24 +# define Zdef 24 + +# define N_PROC_T 1 +# define N_PROC_X 1 +# define N_PROC_Y 1 +# define N_PROC_Z 1 + +/* The rest is done automatically: each local extent is the global extent divided by the process count in that direction */ + +# define T (Tdef/N_PROC_T) +# define LX (Xdef/N_PROC_X) +# define LY (Ydef/N_PROC_Y) +# define LZ (Zdef/N_PROC_Z) +# define L LX +# define VOLUME (T*LX*LY*LZ) +# define SPACEVOLUME (LX*LY*LZ) +# define TEOSLICE ((LX*LY*LZ)/2) + +# ifdef PARALLELT +# define RAND (2*LX*LY*LZ) +# define EDGES 0 +# elif defined PARALLELXT +# define RAND (2*LZ*(LY*LX + T*LY)) +# define EDGES (4*LZ*LY) /* no trailing ';' here: EDGES is expanded inside the VOLUMEPLUSRAND expression below */ +# elif defined PARALLELXYT +# define RAND (2*LZ*(LY*LX + T*LY + T*LX)) +# define EDGES (4*LZ*(LY + T + LX)) +# elif defined PARALLELXYZT +# define RAND (2*LZ*(LY*LX + T*LY + T*LX) + 2*T*LX*LY) +# define EDGES (4*LZ*(LY + T + LX) + 4*LY*T + 4*LY*LX + 4*T*LX) +# else +# define RAND 0 +# define EDGES 0 +# endif + /* Note that VOLUMEPLUSRAND is in general not equal to VOLUME+RAND */ + /* VOLUMEPLUSRAND rather includes the edges */ +# define VOLUMEPLUSRAND (VOLUME + RAND + EDGES) +# define SPACERAND (RAND/T) +# endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/gamma.c b/qcd/part_cpu/applications/QCD/src/kernel_D/gamma.c new file mode 100644 index 0000000000000000000000000000000000000000..eb0fcb90b168d85e54e720b9269fbe61055147bb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/gamma.c @@ -0,0 +1,178 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version.
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/******************************************************************************* + * + * File gamma.c + * + * void gammaXY ( const Q, const P) + * Makes (*Q) = gammaXY*(*P) there are 4 gamma_mu, gamma_5 and 4 gamma_5*gamma_mu + * + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "su3spinor.h" +#include "gamma.h" +#ifdef OMP +#include +#endif + +/* (*Q) = gammaXY*(*P) */ + +void gamma0( const int Q, const int P, const int V){ +#ifdef OMP +#pragma omp parallel for +#endif + for (int ix = 0; ix < V; ix++){ + _gamma0(g_spinor_field[Q][ix], g_spinor_field[P][ix]); + } +} +void gamma1( const int Q, const int P, const int V){ +#ifdef OMP +#pragma omp parallel for +#endif + for (int ix=0;ix. 
+ ***********************************************************************/ + +#ifndef _GAMMA_H +#define _GAMMA_H + +#include "su3.h" + +/* Makes (*Q) = gammaXY*(*P) there are 4 gamma_mu, gamma_5 and 4 gamma_5*gamma_mu */ + +void gamma0(const int Q, const int P, const int V); +void gamma1( const int Q, const int P, const int V); +void gamma2( const int Q, const int P, const int V); +void gamma3( const int Q, const int P, const int V); + +void gamma5(spinor * const Q, spinor * const P, const int V); + +void gamma50( const int Q, const int P, const int V); +void gamma51( const int Q, const int P, const int V); +void gamma52( const int Q, const int P, const int V); +void gamma53( const int Q, const int P, const int V); + +void P_plus(spinor * const Q, spinor * const P, const int V); +void P_minus(spinor * const Q, spinor * const P, const int V); +void Proj(spinor * const Q, spinor * const P, const int V, const int flag); + +#endif + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/gen_sources.c b/qcd/part_cpu/applications/QCD/src/kernel_D/gen_sources.c new file mode 100644 index 0000000000000000000000000000000000000000..2b2c3fca5e36c3a1c41c30609a6861db9d3e0c0c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/gen_sources.c @@ -0,0 +1,294 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/******************************************************************************* +* +* +* source generation main file +* +* Author: Carsten Urbach +* urbach@physik.fu-berlin.de +* +*******************************************************************************/ +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef BENCHMARK +#include <./c-lime/include/lime.h> +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "getopt.h" +#include "geometry_eo.h" +#include "start.h" +#include +#include "read_input.h" +#include "mpi_init.h" +#include "source_generation.h" +#include "init/init.h" +#include "linalg_eo.h" +#include "phmc.h" + +void usage() { + fprintf(stdout, "Code to generate stochastic sources\n"); + fprintf(stdout, "Version %s \n\n", PACKAGE_VERSION); + fprintf(stdout, "Please send bug reports to %s\n", PACKAGE_BUGREPORT); + fprintf(stdout, "Usage: gen_sources [options]\n"); + fprintf(stdout, "Options: -L spatial lattice size\n"); + fprintf(stdout, " -T temporal lattice size\n"); + fprintf(stdout, " -o output-filename basename [optional, default source]\n"); + fprintf(stdout, " -n configuration number [optional, default 0]\n"); + fprintf(stdout, " -s sample number [optional, default 0]\n"); + fprintf(stdout, " -t start timslice [optional, default 0]\n"); + fprintf(stdout, " -S spatial spacing [optional, default 1]\n"); + fprintf(stdout, " -P temporal spacing [optional, default T]\n"); + fprintf(stdout, " -N produce nucleon sources [optional, default meson]\n"); + fprintf(stdout, " -p use plain output filename [default, complex]\n"); + fprintf(stdout, " -O pion only -> wallsource at start timeslice \n"); + fprintf(stdout, " -E extended source for pion only \n"); 
+ fprintf(stdout, " -d double precision \n"); + fprintf(stdout, " -a store all sources in one file\n"); + fprintf(stdout, " -h|-? this help \n\n"); + fprintf(stdout, "plain output file (-p) corresponds to basename.00 - basename.11\n"); + fprintf(stdout, "complex ones (no -p) to basename.samplenr.gaugenr.tsnr.00 - 11\n"); + exit(0); +} + +extern int nstore; +const int rlxdsize = 105; + +int main(int argc,char *argv[]) { + + char spinorfilename[100]; + char * filename = NULL; + int sample=0, ts=0, ss=1, typeflag = 1, t0=0, piononly = 0, ext_sourceflag = 0; + int is, ic, j, filenameflag = 0, appendflag = 0; + complex co; + int c; + int prec=32; + + verbose = 0; + g_use_clover_flag = 0; + nstore = 0; + L=0; + T=0; + +#ifdef MPI + MPI_Init(&argc, &argv); +#endif + +#ifdef OMP + /* FIXME: in principle this should not be set like this as it could result + in thread oversubscription when more than one process is run locally + unfortunately, there does not seem to be a standard way to determine + the number of "local" MPI processes */ + omp_num_threads = omp_get_max_threads(); + init_openmp(); +#endif + + while ((c = getopt(argc, argv, "h?NCpOEdao:L:T:n:t:s:S:P:")) != -1) { + switch (c) { + case 'L': + L = atoi(optarg); + LX = L; + LY = L; + LZ = L; + break; + case 'T': + T = atoi(optarg); + T_global = T; + break; + case 'N': + typeflag = 0; + break; + case 'd': + prec = 64; + break; + case 'O': + piononly = 1; + break; + case 'n': + nstore = atoi(optarg); + break; + case 's': + sample = atoi(optarg); + break; + case 't': + t0 = atoi(optarg); + break; + case 'S': + ss = atoi(optarg); + break; + case 'P': + ts = atoi(optarg); + break; + case 'o': + filename = calloc(200, sizeof(char)); + strcpy(filename,optarg); + break; + case 'E': + ext_sourceflag = 1; + break; + case 'p': + filenameflag = 1; + break; + case 'a': + appendflag = 1; + break; + case 'h': + case '?': + default: + usage(); + break; + } + } + if(ts == 0) { + ts = T; + } + if(filename == NULL){ + filename = 
"source"; + } + if(L==0 || T==0) { + if(g_proc_id == 0) { + fprintf(stderr, "L and T must be specified! Aborting...\n"); + fflush( stderr ); + } + exit(1); + } + + tmlqcd_mpi_init(argc, argv); + + j = init_geometry_indices(VOLUMEPLUSRAND); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for geometry_indices! Aborting...\n"); + exit(0); + } + if(!ext_sourceflag) { + j = init_spinor_field(VOLUMEPLUSRAND/2, 2); + } + else { + j = init_spinor_field(VOLUMEPLUSRAND/2, 4); + } + if ( j!= 0) { + fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n"); + exit(0); + } + + /* define the geometry */ + geometry(); + + if(!piononly) { + for(is = 0; is < 4; is ++) { + for(ic = 0; ic < 3; ic++) { + if(!filenameflag && !appendflag) { + if(T_global > 99) { + sprintf(spinorfilename, "%s.%.4d.%.4d.%.3d.%.2d", filename, nstore, sample, t0, 3*is+ic); + } + else { + sprintf(spinorfilename, "%s.%.4d.%.4d.%.2d.%.2d", filename, nstore, sample, t0, 3*is+ic); + } + } + else if(!filenameflag && appendflag) { + if(T_global > 99) sprintf(spinorfilename, "%s.%.4d.%.4d.%.3d", filename, nstore, sample, t0); + else sprintf(spinorfilename, "%s.%.4d.%.4d.%.2d", filename, nstore, sample, t0); + } + else{ + sprintf(spinorfilename, "%s.%.2d", filename, 3*is+ic); + } + if(!appendflag || (is == 0 && ic ==0)) { + printf("Generating source %s!\n", spinorfilename); + fflush(stdout); + } + + source_generation_nucleon(g_spinor_field[0], g_spinor_field[1], + is, ic, t0, ts, ss, sample, nstore, typeflag); + + co = scalar_prod(g_spinor_field[1], g_spinor_field[1], VOLUME/2, 1); + if((is == 0 && ic == 0) || appendflag == 0) { + write_source_type(0, spinorfilename); + } + write_source(g_spinor_field[0], g_spinor_field[1], spinorfilename, 1, prec); + } + } + } + else { + if(!ext_sourceflag) { + if(!filenameflag) { + if(T_global > 99) sprintf(spinorfilename, "%s.%.4d.%.4d.%.3d", filename, nstore, sample, t0); + else sprintf(spinorfilename, "%s.%.4d.%.4d.%.2d", filename, nstore, sample, t0); + } + 
else { + sprintf(spinorfilename, "%s", filename); + } + printf("Generating source %s!\n", spinorfilename); + fflush(stdout); + source_generation_pion_only(g_spinor_field[0], g_spinor_field[1], + t0, sample, nstore); + + co = scalar_prod(g_spinor_field[1], g_spinor_field[1], VOLUME/2, 1); + write_source_type(0, spinorfilename); + write_source(g_spinor_field[0], g_spinor_field[1], spinorfilename, 1, prec); + } + else { + if(!filenameflag) { + if(T_global > 99) sprintf(spinorfilename, "%s.%.4d.%.4d.%.3d.inverted", filename, nstore, sample, t0); + else sprintf(spinorfilename, "%s.%.4d.%.4d.%.2d.inverted", filename, nstore, sample, t0); + } + else { + sprintf(spinorfilename, "%s.inverted", filename); + } + read_lime_spinor(g_spinor_field[0], g_spinor_field[1], spinorfilename, 0); + + printf("Generating ext. pion source %s!\n", spinorfilename); + extended_pion_source(g_spinor_field[2], g_spinor_field[3], + g_spinor_field[0], g_spinor_field[1], + t0, 0., 0., 0.); + if(!filenameflag) { + if(T_global > 99) sprintf(spinorfilename, "g%s.%.4d.%.4d.%.3d", filename, nstore, sample, t0); + else sprintf(spinorfilename, "g%s.%.4d.%.4d.%.2d", filename, nstore, sample, t0); + } + else { + sprintf(spinorfilename, "g%s", filename); + } + write_source_type(0, spinorfilename); + write_source(g_spinor_field[2], g_spinor_field[3], spinorfilename, 1, prec); + } + } + +#ifdef MPI + MPI_Finalize(); +#endif + free_geometry_indices(); + free_spinor_field(); + return(0); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/geometry_eo.c b/qcd/part_cpu/applications/QCD/src/kernel_D/geometry_eo.c new file mode 100644 index 0000000000000000000000000000000000000000..119d2ebc6ceac8c3c7d32112ba6c1222334b1630 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/geometry_eo.c @@ -0,0 +1,1535 @@ +/***************************************************************************** + * Copyright (C) 2001 Martin Hasenbusch + * 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * Modified by 
Jenifer Gonzalez Lopez 31.03.2009 + * + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * + * Subroutines related to the lattice geometry + * + * The externally accessible function is + * + * void geometry_eo(void) + * Computes the index arrays g_ipt, g_iup, g_idn, g_lexic2eo and g_eo2lexic + * + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "su3adj.h" +#include "mpi_init.h" + +void Hopping_Matrix_Indices(void); + +#if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ)) + +/* This is the version of the function Index introduced for Aurora-like parallelizations (mainly xyz) */ +/* Interior sites get the lexicographic index ((y0*LX + y1)*LY + y2)*LZ + y3 with every coordinate wrapped modulo its extent (x0 is always wrapped in this variant). Sites one step outside in x, y or z are stored from VOLUME on, edges from VOLUME + RAND on, and sites two steps outside from VOLUMEPLUSRAND on. The tests run in sequence, so a later matching case overwrites the value of ix set by an earlier one. */ +int Index(const int x0, const int x1, const int x2, const int x3) { + /* defined for all points in the internal lattice */ + /* and for those points in the external lattice such that: */ + /* - up to 2 directions out of the lattice, */ + /* - one direction up to distance 2 out of the lattice, */ + /* - the other direction up to distance 1 out of the lattice. 
*/ + + int y0, y1, y2, y3, ix; + + y0 = (x0 + T ) % T; + y1 = (x1 + LX) % LX; + y2 = (x2 + LY) % LY; + y3 = (x3 + LZ) % LZ; + ix = ((y0*LX + y1)*LY + y2)*LZ + y3; + + /* x-Rand */ + if(x1 == LX){ + ix = VOLUME + y0*LY*LZ + y2*LZ + y3; + } + if(x1 == -1){ + ix = VOLUME + T*LY*LZ + y0*LY*LZ + y2*LZ + y3; + } + +#if (defined PARALLELXY || defined PARALLELXYZ) + /* y-Rand */ + if(x2 == LY) { + ix = VOLUME + 2*T*LY*LZ + y0*LX*LZ + y1*LZ + y3; + } + if(x2 == -1) { + ix = VOLUME + 2*T*LY*LZ + T*LX*LZ + y0*LX*LZ + y1*LZ + y3; + } + /* yx-edge */ + if(x1 == LX) { + if(x2 == LY) { + ix = VOLUME + RAND + y0*LZ + y3; + } + if(x2 == -1) { + ix = VOLUME + RAND + T*LZ + y0*LZ + y3; + } + } + if(x1 == -1) { + if(x2 == LY) { + ix = VOLUME + RAND + 2*T*LZ + y0*LZ + y3; + } + if(x2 == -1) { + ix = VOLUME + RAND + 3*T*LZ + y0*LZ + y3; + } + } +#endif /* endif of PARALLELXY || PARALLELXYZ */ + +#if defined PARALLELXYZ + /* z-Rand */ + if(x3 == LZ) { + ix = VOLUME + 2*T*LY*LZ + 2*T*LX*LZ + y0*LX*LY + y1*LY + y2; + } + if(x3 == -1) { + ix = VOLUME + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY + y0*LX*LY + y1*LY + y2; + } + /* zx-edge */ + if(x1 == LX) { + if(x3 == LZ) { + ix = VOLUME + RAND + 4*T*LZ + y0*LY + y2; + } + if(x3 == -1) { + ix = VOLUME + RAND + 4*T*LZ + T*LY + y0*LY + y2; + } + } + if(x1 == -1) { + if(x3 == LZ) { + ix = VOLUME + RAND + 4*T*LZ + 2*T*LY + y0*LY + y2; + } + if(x3 == -1) { + ix = VOLUME + RAND + 4*T*LZ + 3*T*LY + y0*LY + y2; + } + } + /* zy-edge */ + if(x3 == LZ) { + if(x2 == LY) { + ix = VOLUME + RAND + 4*T*LZ + 4*T*LY + y0*LX + y1; + } + if(x2 == -1) { + ix = VOLUME + RAND + 4*T*LZ + 4*T*LY + 2*T*LX + y0*LX + y1; + } + } + if(x3 == -1) { + if(x2 == LY) { + ix = VOLUME + RAND + 4*T*LZ + 4*T*LY + T*LX + y0*LX + y1; + } + if(x2 == -1) { + ix = VOLUME + RAND + 4*T*LZ + 4*T*LY + 3*T*LX + y0*LX + y1; + } + } + +#endif /* endif of PARALLELXYZ */ + + /* The DBW2 stuff --> second boundary slice */ + /* This is put at the very end. 
*/ + + /* x2-rand+ */ + if(x1 == LX+1) { + ix = VOLUMEPLUSRAND + y0*LY*LZ + y2*LZ + y3; +# if (defined PARALLELXY || defined PARALLELXYZ) + /* x2y */ + if(x2 == LY) { + ix = VOLUMEPLUSRAND + RAND + y0*LZ + y3; + } + else if(x2 == -1) { + ix = VOLUMEPLUSRAND + RAND + 1*T*LZ + y0*LZ + y3; + } +# endif /* endif of PARALLELXY || PARALLELXYZ */ +# if defined PARALLELXYZ + /* x2z */ + else if(x3 == LZ) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + 4*T*LY + y0*LY + y2; + } + else if(x3 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + 5*T*LY + y0*LY + y2; + } +# endif /* endif of PARALLELXYZ */ + } + /* x2-rand- */ + if(x1 == -2) { + ix = VOLUMEPLUSRAND + T*LY*LZ + y0*LY*LZ + y2*LZ + y3; +# if (defined PARALLELXY || defined PARALLELXYZ) + /* x2y */ + if(x2 == LY) { + ix = VOLUMEPLUSRAND + RAND + 2*T*LZ + y0*LZ + y3; + } + else if(x2 == -1) { + ix = VOLUMEPLUSRAND + RAND + 3*T*LZ + y0*LZ + y3; + } +# endif /* endif of PARALLELXY || PARALLELXYZ */ +# if defined PARALLELXYZ + /* x2z */ + else if(x3 == LZ) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + 6*T*LY + y0*LY + y2; + } + else if(x3 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + 7*T*LY + y0*LY + y2; + } +# endif /* endif of PARALLELXYZ */ + } +#if (defined PARALLELXY || defined PARALLELXYZ) + /* y2-rand+ */ + if(x2 == LY+1) { + ix = VOLUMEPLUSRAND + 2*T*LY*LZ + y0*LX*LZ + y1*LZ + y3; + /* y2x */ + if(x1 == LX) { + ix = VOLUMEPLUSRAND + RAND + 4*T*LZ + y0*LZ + y3; + } + else if (x1 == -1) { + ix = VOLUMEPLUSRAND + RAND + 6*T*LZ + y0*LZ + y3; + } +# if defined PARALLELXYZ + /* y2z */ + else if(x3 == LZ) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + 8*T*LY + 4*T*LX + y0*LX + y1; + } + else if(x3 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + 8*T*LY + 5*T*LX + y0*LX + y1; + } +# endif /* endif of PARALLELXYZ */ + } + /* y2-rand- */ + if(x2 == -2) { + ix = VOLUMEPLUSRAND + 2*T*LY*LZ + T*LX*LZ + y0*LX*LZ + y1*LZ + y3; + /* y2x */ + if(x1 == LX) { + ix = VOLUMEPLUSRAND + RAND + 5*T*LZ + y0*LZ + y3; + } + else if (x1 == -1) { + ix = 
VOLUMEPLUSRAND + RAND + 7*T*LZ + y0*LZ + y3; + } +# if defined PARALLELXYZ + /* y2z */ + else if(x3 == LZ) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + 8*T*LY + 6*T*LX + y0*LX + y1; + } + else if(x3 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + 8*T*LY + 7*T*LX + y0*LX + y1; + } +# endif /* endif of PARALLELXYZ */ + } +#endif /* endif of PARALLELXY || PARALLELXYZ */ +#if defined PARALLELXYZ + /* z2-rand+ */ + if(x3 == LZ+1) { + ix = VOLUMEPLUSRAND + 2*T*LY*LZ + 2*T*LX*LZ + y0*LX*LY + y1*LY + y2; + /* z2x */ + if(x1 == LX) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + y0*LY + y2; + } + else if (x1 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + 2*T*LY + y0*LY + y2; + } + /* z2y */ + else if(x2 == LY) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + 8*T*LY + y0*LX + y1; + } + else if(x2 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + 8*T*LY + 2*T*LX + y0*LX + y1; + } + } + /* z2-rand- */ + if(x3 == -2) { + ix = VOLUMEPLUSRAND + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY + y0*LX*LY + y1*LY + y2; + /* z2x */ + if(x1 == LX) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + T*LY + y0*LY + y2; + } + else if(x1 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + 3*T*LY + y0*LY + y2; + } + /* z2y */ + else if(x2 == LY) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + 8*T*LY + 1*T*LX + y0*LX + y1; + } + else if(x2 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*T*LZ + 8*T*LY + 3*T*LX + y0*LX + y1; + } + } +#endif /* endif of PARALLELXYZ */ + + return(ix); +} + +#else /* original version of Index(): used for no parallelization or PARALLEL*T */ + +/* Maps the local coordinates (x0,x1,x2,x3) to a site index. Coordinates are wrapped modulo T, LX, LY, LZ (x0 is used unwrapped when WITHLAPH is defined) and interior sites get ((y0*LX + y1)*LY + y2)*LZ + y3. Depending on which PARALLEL* macro is defined, a coordinate equal to its extent or to -1 selects a boundary slice stored from VOLUME on, two such coordinates select an edge stored from VOLUME + RAND on, and a coordinate equal to extent+1 or -2 selects the second boundary slice stored from VOLUMEPLUSRAND on. The tests run in sequence, so a later matching case overwrites the value of ix set by an earlier one. */ +int Index(const int x0, const int x1, const int x2, const int x3) { + int y0, y1, y2, y3, ix; + +#ifdef WITHLAPH + y0 = x0; +#else + y0 = (x0 + T ) % T; +#endif + y1 = (x1 + LX) % LX; + y2 = (x2 + LY) % LY; + y3 = (x3 + LZ) % LZ; + ix = ((y0*LX + y1)*LY + y2)*LZ + y3; + +#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(x0 == T) { + ix = VOLUME + y3 + LZ*y2 + LZ*LY*y1; + } + /* the slice at time -1 is put to T+1 */ + else 
if(x0 == -1) { + ix = VOLUME + LX*LY*LZ + y3 + LZ*y2 + LZ*LY*y1; + } +#endif +#if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(x1 == LX){ + ix = VOLUME + 2*LX*LY*LZ + y0*LY*LZ + y2*LZ + y3; + } + if(x1 == -1){ + ix = VOLUME + 2*LX*LY*LZ + T*LY*LZ + y0*LY*LZ + y2*LZ + y3; + } + /* The edges */ + /* xt-edge */ + if(x0 == T){ + if(x1 == LX){ + ix = VOLUME+RAND+y2*LZ+y3; + } + if(x1 == -1){ + ix = VOLUME+RAND+LY*LZ+y2*LZ+y3; + } + } + if(x0 == -1){ + if(x1 == LX){ + ix = VOLUME+RAND+2*LY*LZ+y2*LZ+y3; + } + if(x1 == -1){ + ix = VOLUME+RAND+3*LY*LZ+y2*LZ+y3; + } + } + +#endif /* endif of PARALLELXT || PARALLELXYT || PARALLELXYZT */ + +#if (defined PARALLELXYT || defined PARALLELXYZT) + /* y-Rand */ + if(x2 == LY) { + ix = VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + y0*LX*LZ + y1*LZ + y3; + } + if(x2 == -1) { + ix = VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ + y0*LX*LZ + y1*LZ + y3; + } + /* the edges */ + /* yx-edge */ + if(x1 == LX) { + if(x2 == LY) { + ix = VOLUME + RAND + 4*LY*LZ + y0*LZ + y3; + } + if(x2 == -1) { + ix = VOLUME + RAND + 4*LY*LZ + T*LZ + y0*LZ + y3; + } + } + if(x1 == -1) { + if(x2 == LY) { + ix = VOLUME + RAND + 4*LY*LZ + 2*T*LZ + y0*LZ + y3; + } + if(x2 == -1) { + ix = VOLUME + RAND + 4*LY*LZ + 3*T*LZ + y0*LZ + y3; + } + } + /* ty-edge */ + /* Be careful here! Here we need y first, then t */ + /* this is because the chain is first t dir, then y direction */ + /* this is opposite to the other edges! 
*/ + if(x2 == LY) { + if(x0 == T) { + ix = VOLUME + RAND + 4*LY*LZ + 4*T*LZ + y1*LZ + y3; + } + if(x0 == -1) { + ix = VOLUME + RAND + 4*LY*LZ + 4*T*LZ + LX*LZ + y1*LZ + y3; + } + } + if(x2 == -1) { + if(x0 == T) { + ix = VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 2*LX*LZ + y1*LZ + y3; + } + if(x0 == -1) { + ix = VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 3*LX*LZ + y1*LZ + y3; + } + } + +#endif /* endif of PARALLELXYT || PARALLELXYZT */ +#if defined PARALLELXYZT + /* z-Rand */ + if(x3 == LZ) { + ix = VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + + y0*LX*LY + y1*LY + y2; + } + if(x3 == -1) { + ix = VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY + + y0*LX*LY + y1*LY + y2; + } + /* the edges */ + /* zx-edge */ + if(x1 == LX) { + if(x3 == LZ) { + ix = VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + + y0*LY + y2; + } + if(x3 == -1) { + ix = VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + T*LY + + y0*LY + y2; + } + } + if(x1 == -1) { + if(x3 == LZ) { + ix = VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 2*T*LY + + y0*LY + y2; + } + if(x3 == -1) { + ix = VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 3*T*LY + + y0*LY + y2; + } + } + /* tz-edge */ + if(x3 == LZ) { + if(x0 == T) { + ix = VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + + y1*LY + y2; + } + if(x0 == -1) { + ix = VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + LX*LY + + y1*LY + y2; + } + } + if(x3 == -1) { + if(x0 == T) { + ix = VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 2*LX*LY + + y1*LY + y2; + } + if(x0 == -1) { + ix = VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 3*LX*LY + + y1*LY + y2; + } + } + /* zy-edge */ + if(x3 == LZ) { + if(x2 == LY) { + ix = VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY + + y0*LX + y1; + } + if(x2 == -1) { + ix = VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY + 2*T*LX + + y0*LX + y1; + } + } + if(x3 == -1) { + if(x2 == LY) { + ix = VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY + T*LX + + y0*LX + y1; + } + if(x2 == -1) { + ix 
= VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY + 3*T*LX + + y0*LX + y1; + } + } + + +#endif /* endif of PARALLELXYZT */ + + /* The DBW2 stuff --> second boundary slice */ + /* This is put at the very end. */ +#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(x0 == T+1) { + ix = VOLUMEPLUSRAND + y3 + LZ*y2 + LZ*LY*y1; +# if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + /* t2x */ + if(x1 == LX) { + ix = VOLUMEPLUSRAND + RAND + + y2*LZ + y3; + } + else if (x1 == -1) { + ix = VOLUMEPLUSRAND + RAND + 1*LY*LZ + + y2*LZ + y3; + } +# endif /* endif of PARALLELXT || PARALLELXYT || PARALLELXYZT */ +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* t2y */ + else if(x2 == LY) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + + y1*LZ + y3; + } + else if(x2 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 2*LX*LZ + + y1*LZ + y3; + } +# endif /* endif of PARALLELXYT || PARALLELXYZT */ +# if defined PARALLELXYZT + /* t2z */ + else if(x3 == LZ) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + + y1*LY + y2; + } + else if(x3 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 2*LX*LY + + y1*LY + y2; + } +# endif /* endif of PARALLELXYZT */ + } + /* the slice at time -2 is put behind the one at time T+1 */ + else if(x0 == -2) { + ix = VOLUMEPLUSRAND + LX*LY*LZ + y3 + LZ*y2 + LZ*LY*y1; +# if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + /* t2x */ + if(x1 == LX) { + ix = VOLUMEPLUSRAND + RAND + 2*LY*LZ + + y2*LZ + y3; + } + else if (x1 == -1) { + ix = VOLUMEPLUSRAND + RAND + 3*LY*LZ + + y2*LZ + y3; + } +# endif /* endif of PARALLELXT || PARALLELXYT || PARALLELXYZT */ +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* t2y */ + else if(x2 == LY) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + LX*LZ + + y1*LZ + y3; + } + else if(x2 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 3*LX*LZ + + 
y1*LZ + y3; + } +# endif /* endif of PARALLELXYT || PARALLELXYZT */ +# if defined PARALLELXYZT + /* t2z */ + else if(x3 == LZ) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + LX*LY + + y1*LY + y2; + } + else if(x3 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 3*LX*LY + + y1*LY + y2; + } +# endif /* endif of PARALLELXYZT */ + } +#endif /* endif of PARALLELT || PARALLELXT || PARALLELXYT || PARALLELXYZT */ +#if ((defined PARALLELXT) || (defined PARALLELXYT) || defined PARALLELXYZT) + if(x1 == LX+1) { + ix = VOLUMEPLUSRAND + 2*LX*LY*LZ + y0*LY*LZ + y2*LZ + y3; + /* x2t */ + if(x0 == T) { + ix = VOLUMEPLUSRAND + RAND + 4*LY*LZ + + y2*LZ + y3; + } + else if (x0 == -1) { + ix = VOLUMEPLUSRAND + RAND + 6*LY*LZ + + y2*LZ + y3; + } +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* x2y */ + else if(x2 == LY) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + + y0*LZ + y3; + } + else if(x2 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 1*T*LZ + + y0*LZ + y3; + } +# endif /* endif of PARALLELXYT || PARALLELXYZT */ +# if defined PARALLELXYZT + /* x2z */ + else if(x3 == LZ) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 4*T*LY + + y0*LY + y2; + } + else if(x3 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 5*T*LY + + y0*LY + y2; + } +# endif /* endif of PARALLELXYZT */ + } + if(x1 == -2) { + ix = VOLUMEPLUSRAND + 2*LX*LY*LZ + T*LY*LZ + y0*LY*LZ + y2*LZ + y3; + /* x2t */ + if(x0 == T) { + ix = VOLUMEPLUSRAND + RAND + 5*LY*LZ + + y2*LZ + y3; + } + else if(x0 == -1) { + ix = VOLUMEPLUSRAND + RAND + 7*LY*LZ + + y2*LZ + y3; + } +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* x2y */ + else if(x2 == LY) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 2*T*LZ + + y0*LZ + y3; + } + else if(x2 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 3*T*LZ + + y0*LZ + y3; + } +# endif /* endif of PARALLELXYT || PARALLELXYZT */ +# if defined PARALLELXYZT + /* x2z */ + else if(x3 == LZ) { + ix = 
VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 6*T*LY + + y0*LY + y2; + } + else if(x3 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 7*T*LY + + y0*LY + y2; + } +# endif /* endif of PARALLELXYZT */ + } +#endif /* endif of PARALLELXT || PARALLELXYT || PARALLELXYZT */ +#if (defined PARALLELXYT || defined PARALLELXYZT) + if(x2 == LY+1) { + ix = VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + y0*LX*LZ + y1*LZ + y3; + /* y2x */ + if(x1 == LX) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 4*T*LZ + + y0*LZ + y3; + } + else if (x1 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 6*T*LZ + + y0*LZ + y3; + } + /* y2t */ + else if(x0 == T) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 4*LX*LZ + + y1*LZ + y3; + } + else if(x0 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 5*LX*LZ + + y1*LZ + y3; + } +# if defined PARALLELXYZT + /* y2z */ + else if(x3 == LZ) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 4*T*LX + + y0*LX + y1; + } + else if(x3 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 5*T*LX + + y0*LX + y1; + } +# endif /* endif of PARALLELXYZT */ + } + if(x2 == -2) { + ix = VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ + y0*LX*LZ + y1*LZ + y3; + /* y2x */ + if(x1 == LX) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 5*T*LZ + + y0*LZ + y3; + } + else if (x1 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 7*T*LZ + + y0*LZ + y3; + } + /* y2t */ + else if(x0 == T) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 6*LX*LZ + + y1*LZ + y3; + } + else if (x0 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 7*LX*LZ + + y1*LZ + y3; + } +# if defined PARALLELXYZT + /* y2z */ + else if(x3 == LZ) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 6*T*LX + + y0*LX + y1; + } + else if(x3 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 7*T*LX + + y0*LX + y1; + } +# endif /* 
endif of PARALLELXYZT */ + } +#endif /* endif of PARALLELXYT || PARALLELXYZT */ +#if defined PARALLELXYZT + /* z2-Rand */ + if(x3 == LZ+1) { + if((x0 < T) && (x0 > -1) && (x1 < LX) && (x1 > -1) && (x2 > -1) && (x2 < LY)) { + ix = VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + y0*LX*LY + y1*LY + y2; + } + /* z2x */ + else if(x1 == LX) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + + y0*LY + y2; + } + else if (x1 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 2*T*LY + + y0*LY + y2; + } + /* z2t */ + else if(x0 == T) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 4*LX*LY + + y1*LY + y2; + } + else if(x0 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 5*LX*LY + + y1*LY + y2; + } + /* z2y */ + else if(x2 == LY) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + + y0*LX + y1; + } + else if(x2 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 2*T*LX + + y0*LX + y1; + } + } + if(x3 == -2) { + if((x0 < T) && (x0 > -1) && (x1 < LX) && (x1 > -1) && (x2 > -1) && (x2 < LY)) { + ix = VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY + y0*LX*LY + y1*LY + y2; + } + /* z2x */ + else if(x1 == LX) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + T*LY + + y0*LY + y2; + } + else if(x1 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 3*T*LY + + y0*LY + y2; + } + /* z2t */ + else if(x0 == T) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 6*LX*LY + + y1*LY + y2; + } + else if(x0 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 7*LX*LY + + y1*LY + y2; + } + /* z2y */ + else if(x2 == LY) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 1*T*LX + + y0*LX + y1; + } + else if(x2 == -1) { + ix = VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 3*T*LX + + y0*LX + y1; + } + } 
+#endif /* endif of PARALLELXYZT */ +/* if(ix == 372) { */ +/* printf("## %d %d %d %d ix = %d, %d %d %d %d\n", x0, x1, x2, x3, ix, T, LX, LY, LZ); */ +/* } */ + return(ix); +} + +#endif /* PARALLEL??? */ + +void geometry_KD(){ + + int x0,x1,x2,x3,ix; + int y0, y1, y2, y3, j; + int bndcnt=0; + int i_even,i_odd; + int startvaluet = 0; + int startvaluex = 0; + int startvaluey = 0; + int startvaluez = 0; + int * xeven; +#if defined MPI + int isp, *ones, *oneS, *oneL; + int lsliceS, lsliceL, check_struct_zt; +#endif + + xeven = malloc(VOLUMEPLUSRAND*sizeof(int)); + +#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + startvaluet = 1; +#endif +#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + startvaluex = 1; +#endif +#if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + startvaluey = 1; +#endif +#if (defined PARALLELXYZT || defined PARALLELXYZ ) + startvaluez = 1; +#endif + + /* extended for boundary slices */ + for (x0 = -startvaluet; x0 < (T+startvaluet); x0++){ + for (x1 = -startvaluex; x1 < (LX+startvaluex); x1++){ + for (x2 = -startvaluey; x2 < (LY+startvaluey); x2++){ + for (x3 = -startvaluez; x3 < (LZ+startvaluez); x3++){ + bndcnt = 0; + if(x0 < 0 || x0 > T-1) bndcnt++; + if(x1 < 0 || x1 > LX-1) bndcnt++; + if(x2 < 0 || x2 > LY-1) bndcnt++; + if(x3 < 0 || x3 > LZ-1) bndcnt++; + + y0=x0; y1=x1; y2=x2; y3=x3; + if(x0 == -1) { + y0 = T+1; + } + if(x1 == -1) { + y1 = LX+1; + } + if(x2 == -1) { + y2 = LY+1; + } + if(x3 == -1) { + y3 = LZ+1; + } + if(bndcnt > 2) { + /* Should not be needed, set it to -1 */ + g_ipt[y0][y1][y2][y3] = -1; + } + else { + ix=Index(x0, x1, x2, x3); + g_ipt[y0][y1][y2][y3] = ix; + /* g_proc_id*T|LX|LY|LZ is added to allow for odd T|LX|LY|LZ when the number of + nodes is even */ + if((x0 + x1 + x2 + x3 + + g_proc_coords[0]*T + g_proc_coords[1]*LX + 
g_proc_coords[2]*LY + g_proc_coords[3]*LZ)%2==0) { + xeven[ix]=1; + } + else { + xeven[ix]=0; + } + + g_iup[ix][0] = Index(x0+1, x1, x2, x3); + g_idn[ix][0] = Index(x0-1, x1, x2, x3); + + g_iup[ix][1] = Index(x0, x1+1, x2, x3); + g_idn[ix][1] = Index(x0, x1-1, x2, x3); + + g_iup[ix][2] = Index(x0, x1, x2+1, x3); + g_idn[ix][2] = Index(x0, x1, x2-1, x3); + + g_iup[ix][3] = Index(x0, x1, x2, x3+1); + g_idn[ix][3] = Index(x0, x1, x2, x3-1); + + if(ix LX-1) bndcnt++; + if(x2 < 0 || x2 > LY-1) bndcnt++; + if(x3 < 0 || x3 > LZ-1) bndcnt++; + if(bndcnt < 2) { + /* t2 Rand and t2x and t2y */ + x0 = -2; + ix = Index(x0, x1, x2, x3); + if(ix < VOLUMEPLUSRAND) { + printf("#### -2t %d %d %d %d\n",x0, x1, x2, x3); + } + + g_iup[ix][0] = Index(x0+1, x1, x2, x3); + g_idn[ix][0] = -1; + + if(x1 < LX) g_iup[ix][1] = Index(x0, x1+1, x2, x3); + if(x1 > -1) g_idn[ix][1] = Index(x0, x1-1, x2, x3); + + if(x2 < LY) g_iup[ix][2] = Index(x0, x1, x2+1, x3); + if(x2 > -1) g_idn[ix][2] = Index(x0, x1, x2-1, x3); + + if(x3 < LZ) g_iup[ix][3] = Index(x0, x1, x2, x3+1); + if(x3 > -1) g_idn[ix][3] = Index(x0, x1, x2, x3-1); + + x0 = T+1; + ix = Index(x0, x1, x2, x3); + if(ix < VOLUMEPLUSRAND) { + printf("#### +2t %d %d %d %d\n",x0, x1, x2, x3); + } + g_iup[ix][0] = -1; + g_idn[ix][0] = Index(x0-1, x1, x2, x3); + + if(x1 < LX) g_iup[ix][1] = Index(x0, x1+1, x2, x3); + if(x1 > -1) g_idn[ix][1] = Index(x0, x1-1, x2, x3); + + if(x2 < LY) g_iup[ix][2] = Index(x0, x1, x2+1, x3); + if(x2 > -1) g_idn[ix][2] = Index(x0, x1, x2-1, x3); + + if(x3 < LZ) g_iup[ix][3] = Index(x0, x1, x2, x3+1); + if(x3 > -1) g_idn[ix][3] = Index(x0, x1, x2, x3-1); + + } + } + } + } +#endif +#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ) + for (x0 = -startvaluet; x0 < (T+startvaluet); x0++){ + for (x2 = -startvaluey; x2 < (LY+startvaluey); x2++) { + for (x3 = -startvaluez; x3 < (LZ+startvaluez); x3++) { + bndcnt = 0; + if(x0 < 0 || x0 > 
T-1) bndcnt++; + if(x2 < 0 || x2 > LY-1) bndcnt++; + if(x3 < 0 || x3 > LZ-1) bndcnt++; + if(bndcnt < 2) { + /* x2-Rand and x2t and x2y */ + x1 = -2; + ix = Index(x0, x1, x2, x3); + if(ix < VOLUMEPLUSRAND) { + printf("#### -2x %d %d %d %d\n",x0, x1, x2, x3); + } + if(x0 < T) g_iup[ix][0] = Index(x0+1, x1, x2, x3); + if(x0 > -1) g_idn[ix][0] = Index(x0-1, x1, x2, x3); + + g_iup[ix][1] = Index(x0, x1+1, x2, x3); + g_idn[ix][1] = -1; + + if(x2 < LY) g_iup[ix][2] = Index(x0, x1, x2+1, x3); + if(x2 > -1) g_idn[ix][2] = Index(x0, x1, x2-1, x3); + + if(x3 < LZ) g_iup[ix][3] = Index(x0, x1, x2, x3+1); + if(x3 > -1) g_idn[ix][3] = Index(x0, x1, x2, x3-1); + + x1 = LX+1; + ix = Index(x0, x1, x2, x3); + if(ix < VOLUMEPLUSRAND) { + printf("#### +2x %d %d %d %d\n",x0, x1, x2, x3); + } + if(x0 < T) g_iup[ix][0] = Index(x0+1, x1, x2, x3); + if(x0 > -1) g_idn[ix][0] = Index(x0-1, x1, x2, x3); + + g_iup[ix][1] = -1; + g_idn[ix][1] = Index(x0, x1-1, x2, x3); + + if(x2 < LY) g_iup[ix][2] = Index(x0, x1, x2+1, x3); + if(x2 > -1) g_idn[ix][2] = Index(x0, x1, x2-1, x3); + + if(x3 < LZ) g_iup[ix][3] = Index(x0, x1, x2, x3+1); + if(x3 > -1) g_idn[ix][3] = Index(x0, x1, x2, x3-1); + } + } + } + } +#endif +#if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ) + for (x0 = -startvaluet; x0 < (T+startvaluet); x0++){ + for (x1 = -startvaluex; x1 < (LX+startvaluex); x1++) { + for (x3 = -startvaluez; x3 < (LZ+startvaluez); x3++) { + bndcnt = 0; + if(x0 < 0 || x0 > T-1) bndcnt++; + if(x1 < 0 || x1 > LX-1) bndcnt++; + if(x3 < 0 || x3 > LZ-1) bndcnt++; + if(bndcnt < 2) { + /* y2-Rand y2t and y2x */ + x2 = -2; + ix = Index(x0, x1, x2, x3); + if(ix < VOLUMEPLUSRAND || ix >= VOLUMEPLUSRAND + g_dbw2rand) { + printf("#### -2y %d %d %d %d\n",x0, x1, x2, x3); + } + if(x0 < T) g_iup[ix][0] = Index(x0+1, x1, x2, x3); + if(x0 > -1) g_idn[ix][0] = Index(x0-1, x1, x2, x3); + + if(x1 < LX) g_iup[ix][1] = Index(x0, x1+1, x2, x3); + if(x1 > -1) g_idn[ix][1] = Index(x0, x1-1, 
x2, x3); + + g_iup[ix][2] = Index(x0, x1, x2+1, x3); + g_idn[ix][2] = -1; + + if(x3 < LZ) g_iup[ix][3] = Index(x0, x1, x2, x3+1); + if(x3 > -1) g_idn[ix][3] = Index(x0, x1, x2, x3-1); + + x2 = LY+1; + ix = Index(x0, x1, x2, x3); + if(ix < VOLUMEPLUSRAND || ix >= VOLUMEPLUSRAND + g_dbw2rand) { + printf("#### +2y %d %d %d %d\n",x0, x1, x2, x3); + } + if(x0 < T) g_iup[ix][0] = Index(x0+1, x1, x2, x3); + if(x0 > -1) g_idn[ix][0] = Index(x0-1, x1, x2, x3); + + if(x1 < LX) g_iup[ix][1] = Index(x0, x1+1, x2, x3); + if(x1 > -1) g_idn[ix][1] = Index(x0, x1-1, x2, x3); + + g_iup[ix][2] = -1; + g_idn[ix][2] = Index(x0, x1, x2-1, x3); + + if(x3 < LZ) g_iup[ix][3] = Index(x0, x1, x2, x3+1); + if(x3 > -1) g_idn[ix][3] = Index(x0, x1, x2, x3-1); + } + } + } + } +#endif +#if (defined PARALLELXYZT || defined PARALLELXYZ) + for (x0 = -startvaluet; x0 < (T+startvaluet); x0++){ + for (x1 = -startvaluex; x1 < (LX+startvaluex); x1++) { + for (x2 = -startvaluey; x2 < (LY+startvaluey); x2++) { + bndcnt = 0; + if(x0 < 0 || x0 > T-1) bndcnt++; + if(x1 < 0 || x1 > LX-1) bndcnt++; + if(x2 < 0 || x2 > LY-1) bndcnt++; + if(bndcnt < 2) { + /* z2-Rand t2z and z2x z2y*/ + x3 = -2; + ix = Index(x0, x1, x2, x3); + if(ix < VOLUMEPLUSRAND || ix >= VOLUMEPLUSRAND + g_dbw2rand) { + printf("#### -2z %d %d %d %d %d %d %d %d %d %d %d\n",x0, x1, x2, x3, ix, + VOLUMEPLUSRAND, VOLUMEPLUSRAND + g_dbw2rand, T, LX, LY, LZ); + } + if(x0 < T) g_iup[ix][0] = Index(x0+1, x1, x2, x3); + if(x0 > -1) g_idn[ix][0] = Index(x0-1, x1, x2, x3); + + if(x1 < LX) g_iup[ix][1] = Index(x0, x1+1, x2, x3); + if(x1 > -1) g_idn[ix][1] = Index(x0, x1-1, x2, x3); + + if(x2 < LY) g_iup[ix][2] = Index(x0, x1, x2+1, x3); + if(x2 > -1) g_idn[ix][2] = Index(x0, x1, x2-1, x3); + + g_iup[ix][3] = Index(x0, x1, x2, x3+1); + g_idn[ix][3] = -1; + + x3 = LZ+1; + ix = Index(x0, x1, x2, x3); + if(ix < VOLUMEPLUSRAND || ix >= VOLUMEPLUSRAND + g_dbw2rand) { + printf("#### +2z %d %d %d %d\n",x0, x1, x2, x3); + } + if(x0 < T) g_iup[ix][0] = 
Index(x0+1, x1, x2, x3); + if(x0 > -1) g_idn[ix][0] = Index(x0-1, x1, x2, x3); + + if(x1 < LX) g_iup[ix][1] = Index(x0, x1+1, x2, x3); + if(x1 > -1) g_idn[ix][1] = Index(x0, x1-1, x2, x3); + + if(x2 < LY) g_iup[ix][2] = Index(x0, x1, x2+1, x3); + if(x2 > -1) g_idn[ix][2] = Index(x0, x1, x2-1, x3); + + g_iup[ix][3] = -1; + g_idn[ix][3] = Index(x0, x1, x2, x3-1); + } + } + } + } +#endif + } + + Hopping_Matrix_Indices(); + + free(xeven); +} + + +/* Builds the g_hi lookup table: for each icx in [0, VOLUME/2) and its partner icy = icx + (VOLUME+RAND)/2 a 16-int record is written. Slot 0 holds the site index ix taken from g_eo2lexic, slot 1 the g_lexic2eosub entry of the +0 neighbour, and slots 2..15 hold, for -0, +1, -1, +2, -2, +3, -3 in that order, the neighbour index from g_iup/g_idn followed by its g_lexic2eosub entry. Slot 0 is used as scratch for the +0 neighbour before it is overwritten with ix, so the order of the first three assignments matters. */ +void Hopping_Matrix_Indices(){ + int ix; + int ioff = (VOLUME+RAND)/2; + /**************** loop over all lattice sites ****************/ + for (int icx = 0, icy = (VOLUME+RAND)/2; icx < VOLUME/2; icx++, icy++) + { + ix=g_eo2lexic[icx]; + /*********************** direction +0 ************************/ + g_hi[(16*icx)] = g_iup[ix][0]; + g_hi[(16*icx)+1] = g_lexic2eosub[g_hi[(16*icx)]]; + g_hi[(16*icx)] = ix; + /*********************** direction -0 ************************/ + g_hi[(16*icx)+2] = g_idn[ix][0]; + g_hi[(16*icx)+3] = g_lexic2eosub[g_hi[(16*icx)+2]]; + /*********************** direction +1 ************************/ + g_hi[(16*icx)+4] = g_iup[ix][1]; + g_hi[(16*icx)+5] = g_lexic2eosub[g_hi[(16*icx)+4]]; + /*********************** direction -1 ************************/ + g_hi[(16*icx)+6] = g_idn[ix][1]; + g_hi[(16*icx)+7] = g_lexic2eosub[g_hi[(16*icx)+6]]; + /*********************** direction +2 ************************/ + g_hi[(16*icx)+8] = g_iup[ix][2]; + g_hi[(16*icx)+9] = g_lexic2eosub[g_hi[(16*icx)+8]]; + /*********************** direction -2 ************************/ + g_hi[(16*icx)+10] = g_idn[ix][2]; + g_hi[(16*icx)+11] = g_lexic2eosub[g_hi[(16*icx)+10]]; + /*********************** direction +3 ************************/ + g_hi[(16*icx)+12] = g_iup[ix][3]; + g_hi[(16*icx)+13] = g_lexic2eosub[g_hi[(16*icx)+12]]; + /*********************** direction -3 ************************/ + g_hi[(16*icx)+14] = g_idn[ix][3]; + g_hi[(16*icx)+15] = g_lexic2eosub[g_hi[(16*icx)+14]]; + /************************ end of loop 
************************/ + ix=g_eo2lexic[icx+ioff]; + /*********************** direction +0 ************************/ + g_hi[(16*icy)] = g_iup[ix][0]; + g_hi[(16*icy)+1] = g_lexic2eosub[g_hi[(16*icy)]]; + g_hi[(16*icy)] = ix; + /*********************** direction -0 ************************/ + g_hi[(16*icy)+2] = g_idn[ix][0]; + g_hi[(16*icy)+3] = g_lexic2eosub[g_hi[(16*icy)+2]]; + /*********************** direction +1 ************************/ + g_hi[(16*icy)+4] = g_iup[ix][1]; + g_hi[(16*icy)+5] = g_lexic2eosub[g_hi[(16*icy)+4]]; + /*********************** direction -1 ************************/ + g_hi[(16*icy)+6] = g_idn[ix][1]; + g_hi[(16*icy)+7] = g_lexic2eosub[g_hi[(16*icy)+6]]; + /*********************** direction +2 ************************/ + g_hi[(16*icy)+8] = g_iup[ix][2]; + g_hi[(16*icy)+9] = g_lexic2eosub[g_hi[(16*icy)+8]]; + /*********************** direction -2 ************************/ + g_hi[(16*icy)+10] = g_idn[ix][2]; + g_hi[(16*icy)+11] = g_lexic2eosub[g_hi[(16*icy)+10]]; + /*********************** direction +3 ************************/ + g_hi[(16*icy)+12] = g_iup[ix][3]; + g_hi[(16*icy)+13] = g_lexic2eosub[g_hi[(16*icy)+12]]; + /*********************** direction -3 ************************/ + g_hi[(16*icy)+14] = g_idn[ix][3]; + g_hi[(16*icy)+15] = g_lexic2eosub[g_hi[(16*icy)+14]]; + /************************ end of loop ************************/ + + } + /* zero the two entries at offset 16*(VOLUME+RAND) */ + g_hi[(16*(VOLUME+RAND))] = 0; + g_hi[(16*(VOLUME+RAND))+1] = 0; + return; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/geometry_eo.h b/qcd/part_cpu/applications/QCD/src/kernel_D/geometry_eo.h new file mode 100644 index 0000000000000000000000000000000000000000..fe56221f759459e88b18022296913f78ee42013f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/geometry_eo.h @@ -0,0 +1,25 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ +#ifndef _GEOMETRY_EO_H +#define _GEOMETRY_EO_H + +int Index(const int, const int, const int, const int); +void geometry_KD(); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/get_rectangle_staples.c b/qcd/part_cpu/applications/QCD/src/kernel_D/get_rectangle_staples.c new file mode 100644 index 0000000000000000000000000000000000000000..f4db073d1afcb438c9db8ad8f51532144e7f7c69 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/get_rectangle_staples.c @@ -0,0 +1,187 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include "global.h" +#include "su3.h" +#include "get_rectangle_staples.h" + +/* Convenience wrapper: computes the rectangle staples for the link (x, mu) using the global g_gauge_field. */ +void get_rectangle_staples(su3 * const v, const int x, const int mu) { + get_rectangle_staples_general(v,x,mu,g_gauge_field); +} + +/* Sets *v to zero and then, for every direction nu != mu, accumulates into *v six products of five links of gf around site x. Neighbouring sites are looked up through g_iup and g_idn; the small diagrams in the comments below sketch the path of each contribution, with a..e naming the links in the order they are multiplied. */ +void get_rectangle_staples_general(su3 * const v, const int x, const int mu, const su3** const gf) { + su3 ALIGN tmp1, tmp2; + int y, z, nu; + su3 * a, * b, * c, * d, * e; +#ifdef _KOJAK_INST +#pragma pomp inst begin(rectstaples) +#endif +#ifdef XLC +#pragma disjoint(*v, tmp1, tmp2, *a, *b, *c, *d, *e) +#endif + _su3_zero((*v)); + for(nu = 0; nu < 4; nu++) { + if(mu != nu) { + /* first contr. starting from x + * a b c e^+ d^+ + * c + * _ + * b| |e + * a| |d + */ + a = &gf[x][nu]; + y = g_iup[x][nu]; + b = &gf[y][nu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][nu]; + c = &gf[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &gf[y][nu]; + z = g_iup[y][nu]; + e = &gf[z][nu]; + _su3_times_su3(tmp1, *d, *e); + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[idn[x][nu]][nu] + * e^+ d^+ a b c + * + *e| |c + *d|_|b + * a + */ + y = g_idn[x][nu]; + z = g_idn[y][nu]; + d = &gf[z][nu]; + a = &gf[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &gf[y][nu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_iup[z][mu]; + b = &gf[y][nu]; + z = g_iup[y][nu]; + c = &gf[z][nu]; + _su3_times_su3(tmp1, *b, *c); + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* second contr. starting from x + * a b c e^+ d^+ + * + * bc + * __ + * a| _|e + * d + */ + a = &gf[x][nu]; + y = g_iup[x][nu]; + b = &gf[y][mu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][mu]; + c = &gf[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &gf[y][mu]; + z = g_iup[y][mu]; + e = &gf[z][nu]; + _su3_times_su3(tmp1, *d, *e); + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[x][nu] + * d^+ a b c e^+ + * + * e + * _ + * d|__|c + * ab + */ + y = g_idn[x][nu]; + d = &gf[y][nu]; + a = &gf[y][mu]; + _su3d_times_su3(tmp1, *d, *a); + z = g_iup[y][mu]; + b = &gf[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[z][mu]; + c = &gf[y][nu]; + z = g_iup[x][mu]; + e = &gf[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[idn[x][mu]][nu] + * e^+ d^+ a b c + * + * e + * _ + *d|__|c + * ab + */ + y = g_idn[x][mu]; + z = g_idn[y][nu]; + d = &gf[z][nu]; + a = &gf[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &gf[y][mu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_idn[x][nu]; + b = &gf[y][mu]; + z = g_iup[y][mu]; + c = &gf[z][nu]; + _su3_times_su3(tmp1, *b, *c); + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][mu] + * d^+ a b c e^+ + * + * bc + * __ + *a|_ |e + * d + */ + y = g_idn[x][mu]; + d = &gf[y][mu]; + z = g_iup[y][nu]; + a = &gf[y][nu]; + _su3d_times_su3(tmp1, *d, *a); + b = &gf[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[x][mu]; + e = &gf[y][nu]; + z = g_iup[x][nu]; + c = &gf[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + _su3_times_su3_acc((*v), tmp2, tmp1); + } + } +#ifdef _KOJAK_INST +#pragma pomp inst end(rectstaples) +#endif +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/get_rectangle_staples.h b/qcd/part_cpu/applications/QCD/src/kernel_D/get_rectangle_staples.h new file mode 100644 index 0000000000000000000000000000000000000000..fa718c205bcd3a6d9cef51ae2e46c609054d9057 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/get_rectangle_staples.h @@ -0,0 +1,25 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _GET_RECTANGLE_STAPLES_H +#define _GET_RECTANGLE_STAPLES_H + +void get_rectangle_staples(su3 * const v, const int x, const int mu); +void get_rectangle_staples_general(su3 * const v, const int x, const int mu, const su3** const gf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/get_staples.c b/qcd/part_cpu/applications/QCD/src/kernel_D/get_staples.c new file mode 100644 index 0000000000000000000000000000000000000000..49d32c1766cdb068701b39a2ff5104bcdcd6f41c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/get_staples.c @@ -0,0 +1,145 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "su3adj.h" +#include "start.h" +#include "get_staples.h" + +/* Compute the staples for direction mu at site x from in_gauge_field, indexed as in_gauge_field[site][direction]. *staple is set to zero and then, for each direction k = 0..3 with k != mu, two terms are added: w1 * w2 * w3^d built with the g_iup neighbours of x, and w1^d * w2 * w3 built from the g_idn neighbour of x in direction k. */ +void get_staples(su3* const staple, const int x, const int mu, const su3 ** in_gauge_field) { + + int iy; + su3 ALIGN st; + const su3 *w1,*w2,*w3; + +#ifdef _KOJAK_INST +#pragma pomp inst begin(staples) +#endif + + _su3_zero(*staple); + for(int k=0;k<4;k++) { + if(k!=mu){ + w1=&in_gauge_field[x][k]; + w2=&in_gauge_field[g_iup[x][k]][mu]; + w3=&in_gauge_field[g_iup[x][mu]][k]; + + /* st = w2 * w3^d */ + _su3_times_su3d(st,*w2,*w3); + /* staple = staple + w1 * st */ + _su3_times_su3_acc(*staple,*w1,st); + + iy=g_idn[x][k]; + w1=&in_gauge_field[iy][k]; + w2=&in_gauge_field[iy][mu]; + w3=&in_gauge_field[g_iup[iy][mu]][k]; + /* st = w2 * w3 */ + _su3_times_su3(st,*w2,*w3); + /* staple = staple + w1^d * st */ + _su3d_times_su3_acc(*staple,*w1,st); + } + } +#ifdef _KOJAK_INST +#pragma pomp inst end(staples) +#endif +} + +/* Same computation as get_staples, but k runs over 1..3 only, so direction 0 (presumably time, going by the function name) never contributes. */ +void get_spacelike_staples(su3* const staple, const int x, const int mu, const su3 ** in_gauge_field) { + + int iy; + su3 ALIGN st; + const su3 *w1,*w2,*w3; + +#ifdef _KOJAK_INST +#pragma pomp inst begin(staples) +#endif + + _su3_zero(*staple); + for(int k=1;k<4;k++) { + if(k!=mu){ + w1=&in_gauge_field[x][k]; + w2=&in_gauge_field[g_iup[x][k]][mu]; + w3=&in_gauge_field[g_iup[x][mu]][k]; + + /* st = w2 * w3^d */ + _su3_times_su3d(st,*w2,*w3); + /* staple = staple + w1 * st */ + _su3_times_su3_acc(*staple,*w1,st); + + iy=g_idn[x][k]; + w1=&in_gauge_field[iy][k]; + w2=&in_gauge_field[iy][mu]; + w3=&in_gauge_field[g_iup[iy][mu]][k]; + /* st = w2 * w3 */ + _su3_times_su3(st,*w2,*w3); + /* staple = staple + w1^d * st */ + _su3d_times_su3_acc(*staple,*w1,st); + } + } +#ifdef _KOJAK_INST +#pragma pomp inst end(staples) +#endif 
+} +/* Same computation as get_staples, but restricted to k = 0: only direction 0 contributes, and for mu == 0 nothing is added, so *staple is left at zero. */ +void get_timelike_staples(su3* const staple, const int x, const int mu, const su3 ** in_gauge_field) { + + int iy; + su3 ALIGN st; + const su3 *w1,*w2,*w3; + +#ifdef _KOJAK_INST +#pragma pomp inst begin(staples) +#endif + + _su3_zero(*staple); + int k = 0; + if(k!=mu){ + w1=&in_gauge_field[x][k]; + w2=&in_gauge_field[g_iup[x][k]][mu]; + w3=&in_gauge_field[g_iup[x][mu]][k]; + + /* st = w2 * w3^d */ + _su3_times_su3d(st,*w2,*w3); + /* staple = staple + w1 * st */ + _su3_times_su3_acc(*staple,*w1,st); + + iy=g_idn[x][k]; + w1=&in_gauge_field[iy][k]; + w2=&in_gauge_field[iy][mu]; + w3=&in_gauge_field[g_iup[iy][mu]][k]; + /* st = w2 * w3 */ + _su3_times_su3(st,*w2,*w3); + /* staple = staple + w1^d * st */ + _su3d_times_su3_acc(*staple,*w1,st); + } +#ifdef _KOJAK_INST +#pragma pomp inst end(staples) +#endif +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/get_staples.h b/qcd/part_cpu/applications/QCD/src/kernel_D/get_staples.h new file mode 100644 index 0000000000000000000000000000000000000000..5feb2e66bfa8863cb998e694e99c4e65f8dade40 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/get_staples.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _GET_STAPLES_H +#define _GET_STAPLES_H + +#include"su3.h" + +void get_staples(su3* const staple, const int x, const int mu, const su3 ** in_gauge_field); +void get_timelike_staples(su3* const staple, const int x, const int mu, const su3 ** in_gauge_field); +void get_spacelike_staples(su3* const staple, const int x, const int mu, const su3 ** in_gauge_field); +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/getopt.c b/qcd/part_cpu/applications/QCD/src/kernel_D/getopt.c new file mode 100644 index 0000000000000000000000000000000000000000..835828c657d50ff0e36a461fc787350005eec147 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/getopt.c @@ -0,0 +1,1073 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/* Getopt for GNU. + NOTE: getopt is now part of the C library, so if you don't know what + "Keep this file name-space clean" means, talk to drepper@gnu.org + before changing it! + + Copyright (C) 1987, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 2000 + Free Software Foundation, Inc. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +/* This tells Alpha OSF/1 not to define a getopt prototype in <stdio.h>. + Ditto for AIX 3.2 and <stdlib.h>. */ +#ifndef _NO_PROTO +# define _NO_PROTO +#endif + +#ifdef HAVE_CONFIG_H +# include +#endif + +#if !defined __STDC__ || !__STDC__ +/* This is a separate conditional since some stdc systems + reject `defined (const)'. */ +# ifndef const +# define const +# endif +#endif + +#include + +/* Comment out all this code if we are using the GNU C Library, and are not + actually compiling the library itself. This code is part of the GNU C + Library, but also included in many other GNU distributions. Compiling + and linking in this code is a waste when using the GNU C library + (especially if it is a shared library). Rather than having every GNU + program understand `configure --with-gnu-libc' and omit the object files, + it is simpler to just do this in the source for each such file. */ + +#define GETOPT_INTERFACE_VERSION 2 +#if !defined _LIBC && defined __GLIBC__ && __GLIBC__ >= 2 +# include +# if _GNU_GETOPT_INTERFACE_VERSION == GETOPT_INTERFACE_VERSION +# define ELIDE_CODE +# endif +#endif + +#ifndef ELIDE_CODE + + +/* This needs to come after some library #include + to get __GNU_LIBRARY__ defined. 
*/ +#ifdef __GNU_LIBRARY__ +/* Don't include stdlib.h for non-GNU C libraries because some of them + contain conflicting prototypes for getopt. */ +# include +# include +#endif /* GNU C library. */ + +#ifdef VMS +# include +# if HAVE_STRING_H - 0 +# include +# endif +#endif + +#ifndef _ +/* This is for other GNU distributions with internationalized messages. + When compiling libc, the _ macro is predefined. */ +# ifdef HAVE_LIBINTL_H +# include +# define _(msgid) gettext (msgid) +# else +# define _(msgid) (msgid) +# endif +#endif + +/* This version of `getopt' appears to the caller like standard Unix `getopt' + but it behaves differently for the user, since it allows the user + to intersperse the options with the other arguments. + + As `getopt' works, it permutes the elements of ARGV so that, + when it is done, all the options precede everything else. Thus + all application programs are extended to handle flexible argument order. + + Setting the environment variable POSIXLY_CORRECT disables permutation. + Then the behavior is completely standard. + + GNU application programs can use a third alternative mode in which + they can distinguish the relative order of options and other arguments. */ + +#include "getopt.h" + +/* For communication from `getopt' to the caller. + When `getopt' finds an option that takes an argument, + the argument value is returned here. + Also, when `ordering' is RETURN_IN_ORDER, + each non-option ARGV-element is returned here. */ + +char *optarg; + +/* Index in ARGV of the next element to be scanned. + This is used for communication to and from the caller + and for communication between successive calls to `getopt'. + + On entry to `getopt', zero means this is the first call; initialize. + + When `getopt' returns -1, this is the index of the first of the + non-option elements that the caller should itself scan. + + Otherwise, `optind' communicates from one call to the next + how much of ARGV has been scanned so far. 
*/ + +/* 1003.2 says this must be 1 before any call. */ +int optind = 1; + +/* Formerly, initialization of getopt depended on optind==0, which + causes problems with re-calling getopt as programs generally don't + know that. */ + +int __getopt_initialized; + +/* The next char to be scanned in the option-element + in which the last option character we returned was found. + This allows us to pick up the scan where we left off. + + If this is zero, or a null string, it means resume the scan + by advancing to the next ARGV-element. */ + +static char *nextchar; + +/* Callers store zero here to inhibit the error message + for unrecognized options. */ + +int opterr = 1; + +/* Set to an option character which was unrecognized. + This must be initialized on some systems to avoid linking in the + system's own getopt implementation. */ + +int optopt = '?'; + +/* Describe how to deal with options that follow non-option ARGV-elements. + + If the caller did not specify anything, + the default is REQUIRE_ORDER if the environment variable + POSIXLY_CORRECT is defined, PERMUTE otherwise. + + REQUIRE_ORDER means don't recognize them as options; + stop option processing when the first non-option is seen. + This is what Unix does. + This mode of operation is selected by either setting the environment + variable POSIXLY_CORRECT, or using `+' as the first character + of the list of option characters. + + PERMUTE is the default. We permute the contents of ARGV as we scan, + so that eventually all the non-options are at the end. This allows options + to be given in any order, even with programs that were not written to + expect this. + + RETURN_IN_ORDER is an option available to programs that were written + to expect options and other ARGV-elements in any order and that care about + the ordering of the two. We describe each non-option ARGV-element + as if it were the argument of an option with character code 1. 
+ Using `-' as the first character of the list of option characters + selects this mode of operation. + + The special argument `--' forces an end of option-scanning regardless + of the value of `ordering'. In the case of RETURN_IN_ORDER, only + `--' can cause `getopt' to return -1 with `optind' != ARGC. */ + +static enum +{ + REQUIRE_ORDER, PERMUTE, RETURN_IN_ORDER +} ordering; + +/* Value of POSIXLY_CORRECT environment variable. */ +static char *posixly_correct; + +#ifdef __GNU_LIBRARY__ +/* We want to avoid inclusion of string.h with non-GNU libraries + because there are many ways it can cause trouble. + On some systems, it contains special magic macros that don't work + in GCC. */ +# include +# define my_index strchr +#else + +# if HAVE_STRING_H +# include +# else +# include +# endif + +/* Avoid depending on library functions or files + whose names are inconsistent. */ + +#ifndef getenv +extern char *getenv (); +#endif +/* Fallback for my_index when the GNU library is not used (otherwise my_index is strchr). Returns a pointer to the first character of STR that equals CHR, or 0 if there is none. The scan stops at the terminating NUL without testing it, so unlike strchr a CHR of 0 is never found. */ +static char * +my_index (str, chr) + const char *str; + int chr; +{ + while (*str) + { + if (*str == chr) + return (char *) str; + str++; + } + return 0; +} + +/* If using GCC, we can safely declare strlen this way. + If not using GCC, it is ok not to declare it. */ +#ifdef __GNUC__ +/* Note that Motorola Delta 68k R3V7 comes with GCC but not stddef.h. + That was relevant to code that was here before. */ +# if (!defined __STDC__ || !__STDC__) && !defined strlen +/* gcc with -traditional declares the built-in strlen to return int, + and has done so at least since version 2.4.5. -- rms. */ +extern int strlen (const char *); +# endif /* not __STDC__ */ +#endif /* __GNUC__ */ + +#endif /* not __GNU_LIBRARY__ */ + +/* Handle permutation of arguments. */ + +/* Describe the part of ARGV that contains non-options that have + been skipped. `first_nonopt' is the index in ARGV of the first of them; + `last_nonopt' is the index after the last of them. 
*/ + +static int first_nonopt; +static int last_nonopt; + +#ifdef _LIBC +/* Bash 2.0 gives us an environment variable containing flags + indicating ARGV elements that should not be considered arguments. */ + +/* Defined in getopt_init.c */ +extern char *__getopt_nonoption_flags; + +static int nonoption_flags_max_len; +static int nonoption_flags_len; + +static int original_argc; +static char *const *original_argv; + +/* To make sure the environment variable bash 2.0 puts in the environment + is valid for the getopt call, we must make sure that the ARGV passed + to getopt is the one passed to the process. This function records ARGC and ARGV in original_argc and original_argv so that they can be compared later. */ +static void +__attribute__ ((unused)) +store_args_and_env (int argc, char *const *argv) +{ + /* XXX This is no good solution. We should rather copy the args so + that we can compare them later. But we must not use malloc(3). */ + original_argc = argc; + original_argv = argv; +} +# ifdef text_set_element +text_set_element (__libc_subinit, store_args_and_env); +# endif /* text_set_element */ +/* SWAP_FLAGS (CH1, CH2) swaps the entries of __getopt_nonoption_flags at indices CH1 and CH2 while nonoption_flags_len is positive; outside libc it expands to nothing. */ +# define SWAP_FLAGS(ch1, ch2) \ + if (nonoption_flags_len > 0) \ + { \ + char __tmp = __getopt_nonoption_flags[ch1]; \ + __getopt_nonoption_flags[ch1] = __getopt_nonoption_flags[ch2]; \ + __getopt_nonoption_flags[ch2] = __tmp; \ + } +#else /* !_LIBC */ +# define SWAP_FLAGS(ch1, ch2) +#endif /* _LIBC */ + +/* Exchange two adjacent subsequences of ARGV. + One subsequence is elements [first_nonopt,last_nonopt) + which contains all the non-options that have been skipped so far. + The other is elements [last_nonopt,optind), which contains all + the options processed since those non-options were skipped. + + `first_nonopt' and `last_nonopt' are relocated so that they describe + the new indices of the non-options in ARGV after they are moved. 
*/ + +#if defined __STDC__ && __STDC__ +static void exchange (char **); +#endif +/* ARGV is the argument vector, permuted in place. The segment bounds are read from first_nonopt, last_nonopt and optind; first_nonopt and last_nonopt are updated before returning. */ +static void +exchange (argv) + char **argv; +{ + int bottom = first_nonopt; + int middle = last_nonopt; + int top = optind; + char *tem; + + /* Exchange the shorter segment with the far end of the longer segment. + That puts the shorter segment into the right place. + It leaves the longer segment in the right place overall, + but it consists of two parts that need to be swapped next. */ + +#ifdef _LIBC + /* First make sure the handling of the `__getopt_nonoption_flags' + string can work normally. Our top argument must be in the range + of the string. */ + if (nonoption_flags_len > 0 && top >= nonoption_flags_max_len) + { + /* We must extend the array. The user plays games with us and + presents new arguments. NOTE(review): the previous flags buffer is not released after the copy -- confirm whether that is intended. */ + char *new_str = malloc (top + 1); + if (new_str == NULL) + nonoption_flags_len = nonoption_flags_max_len = 0; + else + { + memset (__mempcpy (new_str, __getopt_nonoption_flags, + nonoption_flags_max_len), + '\0', top + 1 - nonoption_flags_max_len); + nonoption_flags_max_len = top + 1; + __getopt_nonoption_flags = new_str; + } + } +#endif + + while (top > middle && middle > bottom) + { + if (top - middle > middle - bottom) + { + /* Bottom segment is the short one. */ + int len = middle - bottom; + register int i; + + /* Swap it with the top part of the top segment. */ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[top - (middle - bottom) + i]; + argv[top - (middle - bottom) + i] = tem; + SWAP_FLAGS (bottom + i, top - (middle - bottom) + i); + } + /* Exclude the moved bottom segment from further swapping. */ + top -= len; + } + else + { + /* Top segment is the short one. */ + int len = top - middle; + register int i; + + /* Swap it with the bottom part of the bottom segment. 
*/ + for (i = 0; i < len; i++) + { + tem = argv[bottom + i]; + argv[bottom + i] = argv[middle + i]; + argv[middle + i] = tem; + SWAP_FLAGS (bottom + i, middle + i); + } + /* Exclude the moved top segment from further swapping. */ + bottom += len; + } + } + + /* Update records for the slots the non-options now occupy. */ + + first_nonopt += (optind - last_nonopt); + last_nonopt = optind; +} + +/* Initialize the internal data when the first call is made. Sets first_nonopt, last_nonopt, nextchar, posixly_correct and ordering, and returns OPTSTRING advanced past a leading minus or plus character when one of them selects the ordering mode. */ + +#if defined __STDC__ && __STDC__ +static const char *_getopt_initialize (int, char *const *, const char *); +#endif +static const char * +_getopt_initialize (argc, argv, optstring) + int argc; + char *const *argv; + const char *optstring; +{ + /* Start processing options with ARGV-element 1 (since ARGV-element 0 + is the program name); the sequence of previously skipped + non-option ARGV-elements is empty. */ + + first_nonopt = last_nonopt = optind; + + nextchar = NULL; + + posixly_correct = getenv ("POSIXLY_CORRECT"); + + /* Determine how to handle the ordering of options and nonoptions. 
*/ + + if (optstring[0] == '-') + { + ordering = RETURN_IN_ORDER; + ++optstring; + } + else if (optstring[0] == '+') + { + ordering = REQUIRE_ORDER; + ++optstring; + } + else if (posixly_correct != NULL) + ordering = REQUIRE_ORDER; + else + ordering = PERMUTE; + /* Under libc only: when POSIXLY_CORRECT is unset and ARGC and ARGV equal the values recorded in original_argc and original_argv, the flags string is copied once into a zero-padded buffer of at least ARGC bytes (a max length of -1 marks a missing string or a failed allocation); otherwise nonoption_flags_len is set to 0. */ +#ifdef _LIBC + if (posixly_correct == NULL + && argc == original_argc && argv == original_argv) + { + if (nonoption_flags_max_len == 0) + { + if (__getopt_nonoption_flags == NULL + || __getopt_nonoption_flags[0] == '\0') + nonoption_flags_max_len = -1; + else + { + const char *orig_str = __getopt_nonoption_flags; + int len = nonoption_flags_max_len = strlen (orig_str); + if (nonoption_flags_max_len < argc) + nonoption_flags_max_len = argc; + __getopt_nonoption_flags = + (char *) malloc (nonoption_flags_max_len); + if (__getopt_nonoption_flags == NULL) + nonoption_flags_max_len = -1; + else + memset (__mempcpy (__getopt_nonoption_flags, orig_str, len), + '\0', nonoption_flags_max_len - len); + } + } + nonoption_flags_len = nonoption_flags_max_len; + } + else + nonoption_flags_len = 0; +#endif + + return optstring; +} + +/* Scan elements of ARGV (whose length is ARGC) for option characters + given in OPTSTRING. + + If an element of ARGV starts with '-', and is not exactly "-" or "--", + then it is an option element. The characters of this element + (aside from the initial '-') are option characters. If `getopt' + is called repeatedly, it returns successively each of the option characters + from each of the option elements. + + If `getopt' finds another option character, it returns that character, + updating `optind' and `nextchar' so that the next call to `getopt' can + resume the scan with the following option character or ARGV-element. + + If there are no more option characters, `getopt' returns -1. + Then `optind' is the index in ARGV of the first ARGV-element + that is not an option. (The ARGV-elements have been permuted + so that those that are not options now come last.) 
+ + OPTSTRING is a string containing the legitimate option characters. + If an option character is seen that is not listed in OPTSTRING, + return '?' after printing an error message. If you set `opterr' to + zero, the error message is suppressed but we still return '?'. + + If a char in OPTSTRING is followed by a colon, that means it wants an arg, + so the following text in the same ARGV-element, or the text of the following + ARGV-element, is returned in `optarg'. Two colons mean an option that + wants an optional arg; if there is text in the current ARGV-element, + it is returned in `optarg', otherwise `optarg' is set to zero. + + If OPTSTRING starts with `-' or `+', it requests different methods of + handling the non-option ARGV-elements. + See the comments about RETURN_IN_ORDER and REQUIRE_ORDER, above. + + Long-named options begin with `--' instead of `-'. + Their names may be abbreviated as long as the abbreviation is unique + or is an exact match for some defined option. If they have an + argument, it follows the option name in the same ARGV-element, separated + from the option name by a `=', or else in the next ARGV-element. + When `getopt' finds a long-named option, it returns 0 if that option's + `flag' field is nonzero, or the value of the option's `val' field + if the `flag' field is zero. + + The elements of ARGV aren't really const, because we permute them. + But we pretend they're const in the prototype to be compatible + with other systems. + + LONGOPTS is a vector of `struct option' terminated by an + element containing a name which is zero. + + LONGIND returns the index in LONGOPTS of the long-named option found. + It is only valid when a long-named option has been found by the most + recent call. + + If LONG_ONLY is nonzero, '-' as well as '--' can introduce + long-named options. 
*/ + +int +_getopt_internal (argc, argv, optstring, longopts, longind, long_only) + int argc; + char *const *argv; + const char *optstring; + const struct option *longopts; + int *longind; + int long_only; +{ + int print_errors = opterr; + if (optstring[0] == ':') + print_errors = 0; + + optarg = NULL; + + if (optind == 0 || !__getopt_initialized) + { + if (optind == 0) + optind = 1; /* Don't scan ARGV[0], the program name. */ + optstring = _getopt_initialize (argc, argv, optstring); + __getopt_initialized = 1; + } + + /* Test whether ARGV[optind] points to a non-option argument. + Either it does not have option syntax, or there is an environment flag + from the shell indicating it is not an option. The later information + is only used when the used in the GNU libc. */ +#ifdef _LIBC +# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0' \ + || (optind < nonoption_flags_len \ + && __getopt_nonoption_flags[optind] == '1')) +#else +# define NONOPTION_P (argv[optind][0] != '-' || argv[optind][1] == '\0') +#endif + + if (nextchar == NULL || *nextchar == '\0') + { + /* Advance to the next ARGV-element. */ + + /* Give FIRST_NONOPT & LAST_NONOPT rational values if OPTIND has been + moved back by the user (who may also have changed the arguments). */ + if (last_nonopt > optind) + last_nonopt = optind; + if (first_nonopt > optind) + first_nonopt = optind; + + if (ordering == PERMUTE) + { + /* If we have just processed some options following some non-options, + exchange them so that the options come first. */ + + if (first_nonopt != last_nonopt && last_nonopt != optind) + exchange ((char **) argv); + else if (last_nonopt != optind) + first_nonopt = optind; + + /* Skip any additional non-options + and extend the range of non-options previously skipped. */ + + while (optind < argc && NONOPTION_P) + optind++; + last_nonopt = optind; + } + + /* The special ARGV-element `--' means premature end of options. 
+ Skip it like a null option, + then exchange with previous non-options as if it were an option, + then skip everything else like a non-option. */ + + if (optind != argc && !strcmp (argv[optind], "--")) + { + optind++; + + if (first_nonopt != last_nonopt && last_nonopt != optind) + exchange ((char **) argv); + else if (first_nonopt == last_nonopt) + first_nonopt = optind; + last_nonopt = argc; + + optind = argc; + } + + /* If we have done all the ARGV-elements, stop the scan + and back over any non-options that we skipped and permuted. */ + + if (optind == argc) + { + /* Set the next-arg-index to point at the non-options + that we previously skipped, so the caller will digest them. */ + if (first_nonopt != last_nonopt) + optind = first_nonopt; + return -1; + } + + /* If we have come to a non-option and did not permute it, + either stop the scan or describe it to the caller and pass it by. */ + + if (NONOPTION_P) + { + if (ordering == REQUIRE_ORDER) + return -1; + optarg = argv[optind++]; + return 1; + } + + /* We have found another option-ARGV-element. + Skip the initial punctuation. */ + + nextchar = (argv[optind] + 1 + + (longopts != NULL && argv[optind][1] == '-')); + } + + /* Decode the current option-ARGV-element. */ + + /* Check whether the ARGV-element is a long option. + + If long_only and the ARGV-element has the form "-f", where f is + a valid short option, don't consider it an abbreviated form of + a long option that starts with f. Otherwise there would be no + way to give the -f short option. + + On the other hand, if there's a long option "fubar" and + the ARGV-element is "-fu", do consider that an abbreviation of + the long option, just like "--fu", and not "-f" with arg "u". + + This distinction seems to be the most useful approach. 
*/ + + if (longopts != NULL + && (argv[optind][1] == '-' + || (long_only && (argv[optind][2] || !my_index (optstring, argv[optind][1]))))) + { + char *nameend; + const struct option *p; + const struct option *pfound = NULL; + int exact = 0; + int ambig = 0; + int indfound = -1; + int option_index; + + for (nameend = nextchar; *nameend && *nameend != '='; nameend++) + /* Do nothing. */ ; + + /* Test all long options for either exact match + or abbreviated matches. */ + for (p = longopts, option_index = 0; p->name; p++, option_index++) + if (!strncmp (p->name, nextchar, nameend - nextchar)) + { + if ((unsigned int) (nameend - nextchar) + == (unsigned int) strlen (p->name)) + { + /* Exact match found. */ + pfound = p; + indfound = option_index; + exact = 1; + break; + } + else if (pfound == NULL) + { + /* First nonexact match found. */ + pfound = p; + indfound = option_index; + } + else + /* Second or later nonexact match found. */ + ambig = 1; + } + + if (ambig && !exact) + { + if (print_errors) + fprintf (stderr, _("%s: option `%s' is ambiguous\n"), + argv[0], argv[optind]); + nextchar += strlen (nextchar); + optind++; + optopt = 0; + return '?'; + } + + if (pfound != NULL) + { + option_index = indfound; + optind++; + if (*nameend) + { + /* Don't test has_arg with >, because some C compilers don't + allow it to be used on enums. 
*/ + if (pfound->has_arg) + optarg = nameend + 1; + else + { + if (print_errors) + { + if (argv[optind - 1][1] == '-') + /* --option */ + fprintf (stderr, + _("%s: option `--%s' doesn't allow an argument\n"), + argv[0], pfound->name); + else + /* +option or -option */ + fprintf (stderr, + _("%s: option `%c%s' doesn't allow an argument\n"), + argv[0], argv[optind - 1][0], pfound->name); + } + + nextchar += strlen (nextchar); + + optopt = pfound->val; + return '?'; + } + } + else if (pfound->has_arg == 1) + { + if (optind < argc) + optarg = argv[optind++]; + else + { + if (print_errors) + fprintf (stderr, + _("%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]); + nextchar += strlen (nextchar); + optopt = pfound->val; + return optstring[0] == ':' ? ':' : '?'; + } + } + nextchar += strlen (nextchar); + if (longind != NULL) + *longind = option_index; + if (pfound->flag) + { + *(pfound->flag) = pfound->val; + return 0; + } + return pfound->val; + } + + /* Can't find it as a long option. If this is not getopt_long_only, + or the option starts with '--' or is not a valid short + option, then it's an error. + Otherwise interpret it as a short option. */ + if (!long_only || argv[optind][1] == '-' + || my_index (optstring, *nextchar) == NULL) + { + if (print_errors) + { + if (argv[optind][1] == '-') + /* --option */ + fprintf (stderr, _("%s: unrecognized option `--%s'\n"), + argv[0], nextchar); + else + /* +option or -option */ + fprintf (stderr, _("%s: unrecognized option `%c%s'\n"), + argv[0], argv[optind][0], nextchar); + } + nextchar = (char *) ""; + optind++; + optopt = 0; + return '?'; + } + } + + /* Look at and handle the next short option-character. */ + + { + char c = *nextchar++; + char *temp = my_index (optstring, c); + + /* Increment `optind' when we start to process its last character. 
*/ + if (*nextchar == '\0') + ++optind; + + if (temp == NULL || c == ':') + { + if (print_errors) + { + if (posixly_correct) + /* 1003.2 specifies the format of this message. */ + fprintf (stderr, _("%s: illegal option -- %c\n"), + argv[0], c); + else + fprintf (stderr, _("%s: invalid option -- %c\n"), + argv[0], c); + } + optopt = c; + return '?'; + } + /* Convenience. Treat POSIX -W foo same as long option --foo */ + if (temp[0] == 'W' && temp[1] == ';') + { + char *nameend; + const struct option *p; + const struct option *pfound = NULL; + int exact = 0; + int ambig = 0; + int indfound = 0; + int option_index; + + /* This is an option that requires an argument. */ + if (*nextchar != '\0') + { + optarg = nextchar; + /* If we end this ARGV-element by taking the rest as an arg, + we must advance to the next element now. */ + optind++; + } + else if (optind == argc) + { + if (print_errors) + { + /* 1003.2 specifies the format of this message. */ + fprintf (stderr, _("%s: option requires an argument -- %c\n"), + argv[0], c); + } + optopt = c; + if (optstring[0] == ':') + c = ':'; + else + c = '?'; + return c; + } + else + /* We already incremented `optind' once; + increment it again when taking next ARGV-elt as argument. */ + optarg = argv[optind++]; + + /* optarg is now the argument, see if it's in the + table of longopts. */ + + for (nextchar = nameend = optarg; *nameend && *nameend != '='; nameend++) + /* Do nothing. */ ; + + /* Test all long options for either exact match + or abbreviated matches. */ + for (p = longopts, option_index = 0; p->name; p++, option_index++) + if (!strncmp (p->name, nextchar, nameend - nextchar)) + { + if ((unsigned int) (nameend - nextchar) == strlen (p->name)) + { + /* Exact match found. */ + pfound = p; + indfound = option_index; + exact = 1; + break; + } + else if (pfound == NULL) + { + /* First nonexact match found. */ + pfound = p; + indfound = option_index; + } + else + /* Second or later nonexact match found. 
*/ + ambig = 1; + } + if (ambig && !exact) + { + if (print_errors) + fprintf (stderr, _("%s: option `-W %s' is ambiguous\n"), + argv[0], argv[optind]); + nextchar += strlen (nextchar); + optind++; + return '?'; + } + if (pfound != NULL) + { + option_index = indfound; + if (*nameend) + { + /* Don't test has_arg with >, because some C compilers don't + allow it to be used on enums. */ + if (pfound->has_arg) + optarg = nameend + 1; + else + { + if (print_errors) + fprintf (stderr, _("\ +%s: option `-W %s' doesn't allow an argument\n"), + argv[0], pfound->name); + + nextchar += strlen (nextchar); + return '?'; + } + } + else if (pfound->has_arg == 1) + { + if (optind < argc) + optarg = argv[optind++]; + else + { + if (print_errors) + fprintf (stderr, + _("%s: option `%s' requires an argument\n"), + argv[0], argv[optind - 1]); + nextchar += strlen (nextchar); + return optstring[0] == ':' ? ':' : '?'; + } + } + nextchar += strlen (nextchar); + if (longind != NULL) + *longind = option_index; + if (pfound->flag) + { + *(pfound->flag) = pfound->val; + return 0; + } + return pfound->val; + } + nextchar = NULL; + return 'W'; /* Let the application handle it. */ + } + if (temp[1] == ':') + { + if (temp[2] == ':') + { + /* This is an option that accepts an argument optionally. */ + if (*nextchar != '\0') + { + optarg = nextchar; + optind++; + } + else + optarg = NULL; + nextchar = NULL; + } + else + { + /* This is an option that requires an argument. */ + if (*nextchar != '\0') + { + optarg = nextchar; + /* If we end this ARGV-element by taking the rest as an arg, + we must advance to the next element now. */ + optind++; + } + else if (optind == argc) + { + if (print_errors) + { + /* 1003.2 specifies the format of this message. 
*/ + fprintf (stderr, + _("%s: option requires an argument -- %c\n"), + argv[0], c); + } + optopt = c; + if (optstring[0] == ':') + c = ':'; + else + c = '?'; + } + else + /* We already incremented `optind' once; + increment it again when taking next ARGV-elt as argument. */ + optarg = argv[optind++]; + nextchar = NULL; + } + } + return c; + } +} + +int +getopt (argc, argv, optstring) + int argc; + char *const *argv; + const char *optstring; +{ + return _getopt_internal (argc, argv, optstring, + (const struct option *) 0, + (int *) 0, + 0); +} + +#endif /* Not ELIDE_CODE. */ + +#ifdef TEST + +/* Compile with -DTEST to make an executable for use in testing + the above definition of `getopt'. */ + +int +main (argc, argv) + int argc; + char **argv; +{ + int c; + int digit_optind = 0; + + while (1) + { + int this_option_optind = optind ? optind : 1; + + c = getopt (argc, argv, "abc:d:0123456789"); + if (c == -1) + break; + + switch (c) + { + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + if (digit_optind != 0 && digit_optind != this_option_optind) + printf ("digits occur in two different argv-elements.\n"); + digit_optind = this_option_optind; + printf ("option %c\n", c); + break; + + case 'a': + printf ("option a\n"); + break; + + case 'b': + printf ("option b\n"); + break; + + case 'c': + printf ("option c with value `%s'\n", optarg); + break; + + case '?': + break; + + default: + printf ("?? 
getopt returned character code 0%o ??\n", c); + } + } + + if (optind < argc) + { + printf ("non-option ARGV-elements: "); + while (optind < argc) + printf ("%s ", argv[optind++]); + printf ("\n"); + } + + exit (0); +} + +#endif /* TEST */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/getopt.h b/qcd/part_cpu/applications/QCD/src/kernel_D/getopt.h new file mode 100644 index 0000000000000000000000000000000000000000..21ec89ba990adacac0ae10738585bad43da05e51 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/getopt.h @@ -0,0 +1,187 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/* Declarations for getopt. + Copyright (C) 1989,90,91,92,93,94,96,97,98 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Library General Public License as + published by the Free Software Foundation; either version 2 of the + License, or (at your option) any later version. 
+ + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Library General Public License for more details. + + You should have received a copy of the GNU Library General Public + License along with the GNU C Library; see the file COPYING.LIB. If not, + write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + Boston, MA 02111-1307, USA. */ + +#ifndef _GETOPT_H + +#ifndef __need_getopt +# define _GETOPT_H 1 +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* For communication from `getopt' to the caller. + When `getopt' finds an option that takes an argument, + the argument value is returned here. + Also, when `ordering' is RETURN_IN_ORDER, + each non-option ARGV-element is returned here. */ + +extern char *optarg; + +/* Index in ARGV of the next element to be scanned. + This is used for communication to and from the caller + and for communication between successive calls to `getopt'. + + On entry to `getopt', zero means this is the first call; initialize. + + When `getopt' returns -1, this is the index of the first of the + non-option elements that the caller should itself scan. + + Otherwise, `optind' communicates from one call to the next + how much of ARGV has been scanned so far. */ + +extern int optind; + +/* Callers store zero here to inhibit the error message `getopt' prints + for unrecognized options. */ + +extern int opterr; + +/* Set to an option character which was unrecognized. */ + +extern int optopt; + +#ifndef __need_getopt +/* Describe the long-named options requested by the application. + The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector + of `struct option' terminated by an element containing a name which is + zero. 
/* One entry of the long-option table passed to getopt_long or
   getopt_long_only.  The table is a vector of these entries terminated
   by an element whose `name' is zero.  */
struct option
{
  /* The long option's name; zero in the terminating element.  */
# if defined __STDC__ && __STDC__
  const char *name;
# else
  char *name;
# endif
  /* has_arg can't be an enum because some compilers complain about
     type mismatches in all the code that assumes it is an int.
     Values: no_argument (0), required_argument (1),
     optional_argument (2).  */
  int has_arg;
  /* If not NULL, points to a variable that is set to `val' when the
     option is found and left unchanged otherwise.  */
  int *flag;
  /* Value stored through `flag', or returned by `getopt' when `flag'
     is zero.  */
  int val;
};
+ + The argument `--' causes premature termination of argument + scanning, explicitly telling `getopt' that there are no more + options. + + If OPTS begins with `--', then non-option arguments are treated as + arguments to the option '\0'. This behavior is specific to the GNU + `getopt'. */ + +#if defined(__cplusplus) || (defined (__STDC__) && __STDC__) +# ifdef __GNU_LIBRARY__ +/* Many other libraries have conflicting prototypes for getopt, with + differences in the consts, in stdlib.h. To avoid compilation + errors, only prototype getopt for the GNU C library. */ +extern int getopt (int __argc, char *const *__argv, const char *__shortopts); +# else /* not __GNU_LIBRARY__ */ +/*extern int getopt ();*/ +# endif /* __GNU_LIBRARY__ */ + +# ifndef __need_getopt +extern int getopt_long (int __argc, char *const *__argv, const char *__shortopts, + const struct option *__longopts, int *__longind); +extern int getopt_long_only (int __argc, char *const *__argv, + const char *__shortopts, + const struct option *__longopts, int *__longind); + +/* Internal only. Users should not call this directly. */ +extern int _getopt_internal (int __argc, char *const *__argv, + const char *__shortopts, + const struct option *__longopts, int *__longind, + int __long_only); +# endif +#else /* not __STDC__ */ +extern int getopt (); +# ifndef __need_getopt +extern int getopt_long (); +extern int getopt_long_only (); + +extern int _getopt_internal (); +# endif +#endif /* __STDC__ */ + +#ifdef __cplusplus +} +#endif + +/* Make sure we later can get all the definitions and declarations. 
/* Return a time stamp in seconds as a double.

   The time source is chosen at compile time, in this order:
     1. BGL (but not BGP): rts_get_timebase() ticks scaled to seconds,
     2. MPI:               MPI_Wtime(),
     3. clock_gettime():   when HAVE_CLOCK_GETTIME is set and BGL is not,
     4. otherwise:         clock() divided by CLOCKS_PER_SEC.

   Only differences between two return values are meaningful.  */
double gettime(void) {
#if (defined BGL && !defined BGP)

  /* Seconds per timebase tick: 1.0e-6/700.0, i.e. one tick of a 700 MHz counter. */
  const double seconds_per_tick = 1.0e-6/700.0;
  return rts_get_timebase() * seconds_per_tick;

#elif defined MPI

  return MPI_Wtime();

  /* clock_gettime is detected on BGL/BGP, but it is an unsupported system
     call there, hence the explicit !BGL guard below. */
#elif (defined HAVE_CLOCK_GETTIME && !defined BGL)

  struct timespec now;

  /* On the BGQ the monotonic clock is tied to the hardware counters and
     reports process CPU time, which is a poor measurement for threaded
     applications, so the realtime clock is used there instead. */
# ifdef BGQ
  clock_gettime(CLOCK_REALTIME, &now);
# else
  clock_gettime(CLOCK_MONOTONIC, &now);
# endif
  return now.tv_sec + 1.0e-9*now.tv_nsec;

#else
  /* Last resort, and unreliable: the operating system and other processes
     make this clock tick too.  With multiple threads the tick count is
     multiplied by roughly (but not exactly) the number of threads, which
     makes the measurement useless. */
  return (double)clock()/(CLOCKS_PER_SEC);

#endif
}
#ifndef _GIT_HASH_H
#define _GIT_HASH_H
/* Version identifier string of this source tree.

   Declared `static' so that every translation unit including this header
   gets its own private copy.  A non-static definition in a header causes
   duplicate-symbol link errors as soon as two translation units of the
   same program include it.

   NOTE(review): if some translation unit refers to git_hash through an
   `extern' declaration instead of including this header, keep a single
   non-static definition in one .c file and declare it `extern' here. */
static const char git_hash[] = "5.2.0";
#endif /* _GIT_HASH_H */
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _GLOBAL_H +#define _GLOBAL_H +/*************************************************************** + * + * File global.h + * + * Global parameters and arrays + * + * + ***************************************************************/ +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#ifdef MPI +# include +#endif +#ifdef FIXEDVOLUME +# include "fixed_volume.h" +#endif +#include "su3.h" +#include "su3adj.h" +//# include + +#define N_CHEBYMAX 49 +#define NTILDE_CHEBYMAX 2000 + +/* size of the extra_masses array for operators using the CGMMS solver */ +#define MAX_EXTRA_MASSES 30 + +#if defined INIT_GLOBALS +# define EXTERN +#else +# define EXTERN extern +#endif + +#if ((defined SSE)||(defined SSE2)||(defined SSE3)) +# include "sse.h" +#elif defined BGL +# include "bgl.h" +#endif + +EXTERN int DUM_DERI, DUM_SOLVER, DUM_MATRIX; +EXTERN int NO_OF_SPINORFIELDS; +EXTERN int NO_OF_SPINORFIELDS_32; + +EXTERN int DUM_BI_DERI, DUM_BI_SOLVER, DUM_BI_MATRIX; +EXTERN int NO_OF_BISPINORFIELDS; + +EXTERN int g_update_gauge_copy; +EXTERN int g_update_gauge_copy_32; +EXTERN int g_relative_precision_flag; +EXTERN int g_debug_level; +EXTERN int g_disable_IO_checks; + +EXTERN int T_global; +#ifndef FIXEDVOLUME +EXTERN int T, L, LX, LY, LZ, VOLUME; +EXTERN int N_PROC_T, N_PROC_X, N_PROC_Y, N_PROC_Z; +EXTERN int RAND, EDGES, VOLUMEPLUSRAND; +EXTERN int TEOSLICE; +EXTERN int SPACEVOLUME, SPACERAND; +#endif + +/* translates from lexicographic order to even/odd order */ +EXTERN int * g_lexic2eo; +/* translates from even/odd order to lexicograhic order */ +EXTERN int * g_eo2lexic; +EXTERN int * g_lexic2eosub; +EXTERN int g_sloppy_precision_flag; +EXTERN int g_sloppy_precision; + +EXTERN int **** g_ipt; +EXTERN int ** g_iup; +EXTERN int ** g_idn; +EXTERN int ** g_iup_eo; /* NEW GIUPDNEO */ 
+EXTERN int ** g_idn_eo; +EXTERN int ** g_coord; +EXTERN int * g_hi; + + +EXTERN int * g_field_z_ipt_even; +EXTERN int * g_field_z_ipt_odd; + +EXTERN spinor ** g_spinor_field; +EXTERN spinor32 ** g_spinor_field32; + +EXTERN bispinor ** g_bispinor_field; +EXTERN spinor * g_tbuff; + +/* Index independent geometry */ + +EXTERN int * g_field_z_ipt_even; +EXTERN int * g_field_z_ipt_odd; +EXTERN int * g_field_z_disp_even_dn; +EXTERN int * g_field_z_disp_even_up; +EXTERN int * g_field_z_disp_odd_dn; +EXTERN int * g_field_z_disp_odd_up; + +/* this if statement will be removed in future and _INDEX_INDEP_GEOM will be the default */ +#ifdef _INDEX_INDEP_GEOM +EXTERN int g_1st_t_int_dn,g_1st_t_int_up,g_1st_t_ext_dn,g_1st_t_ext_up; +EXTERN int g_1st_x_int_dn,g_1st_x_int_up,g_1st_x_ext_dn,g_1st_x_ext_up; +EXTERN int g_1st_y_int_dn,g_1st_y_int_up,g_1st_y_ext_dn,g_1st_y_ext_up; +EXTERN int g_1st_z_int_dn,g_1st_z_int_up,g_1st_z_ext_dn,g_1st_z_ext_up; +EXTERN int gI_0_0_0_0,gI_L_0_0_0,gI_Lm1_0_0_0,gI_m1_0_0_0,gI_p1_0_0_0,gI_Lp1_0_0_0,gI_Lm2_0_0_0,gI_m2_0_0_0; +EXTERN int gI_0_L_0_0,gI_0_Lm1_0_0,gI_0_m1_0_0,gI_0_p1_0_0,gI_0_Lp1_0_0,gI_0_Lm2_0_0,gI_0_m2_0_0,gI_L_L_0_0; +EXTERN int gI_Lm1_L_0_0,gI_m1_L_0_0,gI_p1_L_0_0,gI_Lp1_L_0_0,gI_Lm2_L_0_0,gI_m2_L_0_0,gI_L_Lp1_0_0,gI_Lm1_Lp1_0_0; +EXTERN int gI_m1_Lp1_0_0,gI_0_0_L_0,gI_0_0_Lm1_0,gI_0_0_m1_0,gI_0_0_p1_0,gI_0_0_Lp1_0,gI_0_0_Lm2_0,gI_0_0_m2_0; +EXTERN int gI_0_L_L_0,gI_0_Lm1_L_0,gI_0_m1_L_0,gI_L_0_L_0,gI_L_0_Lm1_0,gI_L_0_m1_0,gI_0_p1_L_0,gI_0_Lp1_L_0; +EXTERN int gI_0_Lm2_L_0,gI_0_m2_L_0,gI_0_L_Lp1_0,gI_0_Lm1_Lp1_0,gI_0_m1_Lp1_0,gI_Lp1_0_L_0,gI_Lp1_0_Lm1_0; +EXTERN int gI_Lp1_0_m1_0,gI_L_0_p1_0,gI_L_0_Lp1_0,gI_L_0_Lm2_0,gI_L_0_m2_0,gI_0_0_0_L,gI_0_0_0_Lm1,gI_0_0_0_m1; +EXTERN int gI_0_0_0_p1,gI_0_0_0_Lp1,gI_0_0_0_Lm2,gI_0_0_0_m2,gI_0_L_0_L,gI_0_Lm1_0_L,gI_0_m1_0_L,gI_L_0_0_L; +EXTERN int gI_L_0_0_Lm1,gI_L_0_0_m1,gI_0_L_0_L,gI_0_Lm1_0_L,gI_0_m1_0_L,gI_Lp1_0_0_L,gI_Lp1_0_0_Lm1,gI_Lp1_0_0_m1; +EXTERN int 
gI_L_0_0_p1,gI_L_0_0_Lp1,gI_L_0_0_Lm2,gI_L_0_0_m2,gI_0_L_0_Lp1,gI_0_Lm1_0_Lp1,gI_0_m1_0_Lp1,gI_0_p1_0_L; +EXTERN int gI_0_Lp1_0_L,gI_0_Lm2_0_L,gI_0_m2_0_L,gI_0_0_L_Lp1,gI_0_0_Lm1_Lp1,gI_0_0_m1_Lp1,gI_0_0_p1_L; +EXTERN int gI_0_0_Lp1_L,gI_0_0_Lm2_L,gI_0_0_m2_L,gI_Lp1_m1_0_0,gI_m2_m1_0_0,gI_m2_0_L_0,gI_m2_0_m1_0,gI_0_Lp1_m1_0; +EXTERN int gI_0_m2_m1_0,gI_m2_0_0_L,gI_m2_0_0_m1,gI_0_Lp1_0_m1,gI_0_m2_0_m1,gI_0_0_Lp1_m1,gI_0_0_m2_m1,gI_m1_0_0_m2; +EXTERN int gI_0_0_L_L, gI_0_0_m1_L, gI_0_0_Lm1_L; + +# ifdef _USE_HALFSPINOR +EXTERN int g_HS_shift_t,g_HS_shift_x,g_HS_shift_y,g_HS_shift_z; +# endif + +# ifdef _USE_TSPLITPAR +EXTERN int ** g_field_zt_disp_even_dn; +EXTERN int ** g_field_zt_disp_even_up; +EXTERN int ** g_field_zt_disp_odd_dn; +EXTERN int ** g_field_zt_disp_odd_up; +EXTERN int ** g_1st_eot; +EXTERN int * g_1st_xt_int_dn; +EXTERN int * g_1st_xt_int_up; +EXTERN int * g_1st_xt_ext_dn; +EXTERN int * g_1st_xt_ext_up; +EXTERN int * g_1st_yt_int_dn; +EXTERN int * g_1st_yt_int_up; +EXTERN int * g_1st_yt_ext_dn; +EXTERN int * g_1st_yt_ext_up; +EXTERN int * g_1st_zt_int_dn; +EXTERN int * g_1st_zt_int_up; +EXTERN int * g_1st_zt_ext_dn; +EXTERN int * g_1st_zt_ext_up; +# endif +#endif /* _INDEX_INDEP_GEOM */ + +/* IF PHMC */ +EXTERN spinor ** g_chi_up_spinor_field; +EXTERN spinor ** g_chi_dn_spinor_field; +EXTERN int g_running_phmc; +/* End IF PHMC */ + +EXTERN su3 ** g_gauge_field; +EXTERN su3_32 ** g_gauge_field_32; +#ifdef _USE_HALFSPINOR +EXTERN su3 *** g_gauge_field_copy; +EXTERN su3_32 *** g_gauge_field_copy_32; +#elif (defined _USE_TSPLITPAR ) +EXTERN su3 ** g_gauge_field_copyt; +EXTERN su3 ** g_gauge_field_copys; +#else +EXTERN su3 ** g_gauge_field_copy; +EXTERN su3_32 ** g_gauge_field_copy_32; +#endif + +/*for temporalgauge in GPU part*/ +EXTERN su3 ** g_tempgauge_field; + +EXTERN su3adj ** moment; +EXTERN su3adj ** df0; +EXTERN su3adj ** ddummy; + +EXTERN int count00,count01,count10,count11,count20,count21; +EXTERN double g_kappa, g_c_sw, g_ka_csw_8, g_beta; 
+EXTERN double g_mu, g_mu1, g_mu2, g_mu3; +EXTERN double g_rgi_C0, g_rgi_C1; + +/* Parameters for non-degenrate case */ +EXTERN double g_mubar, g_epsbar; +EXTERN int g_use_clover_flag; + +/* MPI information */ +EXTERN int g_proc_id, g_nproc, g_stdio_proc, g_nproc_t, g_nproc_x, g_nproc_y, g_nproc_z, g_cart_id; +EXTERN int g_proc_coords[4]; +EXTERN int g_dbw2rand; +EXTERN int g_mpi_time_rank; +EXTERN int g_mpi_SV_rank; +EXTERN int g_mpi_z_rank; +EXTERN int g_mpi_ST_rank; +EXTERN int g_nb_list[8]; + +/* OpenMP Kahan accumulation arrays */ +EXTERN _Complex double *g_omp_acc_cp; +EXTERN double* g_omp_acc_re; + +/* Deflation information */ +EXTERN int g_dflgcr_flag; +EXTERN int g_N_s; +EXTERN int * index_block_eo; + +#ifdef MPI +EXTERN MPI_Status status; +EXTERN MPI_Request req1,req2,req3,req4; +EXTERN MPI_Comm g_cart_grid; +EXTERN MPI_Comm g_mpi_time_slices; +EXTERN MPI_Comm g_mpi_SV_slices; +EXTERN MPI_Comm g_mpi_z_slices; +EXTERN MPI_Comm g_mpi_ST_slices; + +/* the next neighbours for MPI */ +EXTERN int g_nb_x_up, g_nb_x_dn; +EXTERN int g_nb_y_up, g_nb_y_dn; +EXTERN int g_nb_t_up, g_nb_t_dn; +EXTERN int g_nb_z_up, g_nb_z_dn; + +#endif + +#ifdef OMP +EXTERN int omp_num_threads; +#endif + +/* something to evaluate time elaps */ +EXTERN double DeltaTtot, DeltaTcd, DeltaTev; +EXTERN int counter_Spsi; +/* end of the something ... 
/* Sloppy precision of the inverter. */
typedef enum SloppyPrecision_s {
  SLOPPY_DOUBLE = 0,
  SLOPPY_SINGLE = 1,
  SLOPPY_HALF   = 2
} SloppyPrecision;

/* Compression of the inverter.
   NOTE(review): the numeric values presumably count the real numbers
   stored per gauge link -- TODO confirm against the inverter interface. */
typedef enum CompressionType_s {
  COMPRESSION_8  = 8,
  COMPRESSION_12 = 12,
  NO_COMPRESSION = 18
} CompressionType;

/* Selection of the external inverter; NO_EXT_INV is the zero value. */
typedef enum ExternalInverter_s {
  NO_EXT_INV     = 0,
  QUDA_INVERTER  = 1,
  QPHIX_INVERTER = 2
} ExternalInverter;
/* Groups the field pointers and counters that are passed around together
   as one "Hamiltonian field".
   NOTE(review): the per-member descriptions below are inferred from the
   member names only; this header does not show how they are used --
   confirm against the integrator/monomial code. */
typedef struct {
  su3 ** gaugefield;        /* presumably the gauge field -- TODO confirm */
  su3adj ** momenta;        /* presumably the conjugate momenta -- TODO confirm */
  su3adj ** derivative;     /* presumably the derivative (force) field -- TODO confirm */
  int update_gauge_copy;    /* presumably a flag requesting a refresh of the gauge copy -- TODO confirm */
  int traj_counter;         /* presumably the current trajectory number -- TODO confirm */
} hamiltonian_field_t;
= 0.125 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + Name = detrat + Solver = CG + CSGHistory = 0 +EndMonomial + +#BeginMonomial NDPOLY +# Timescale = 1 +#EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + Type2 = 2MN + IntegrationSteps0 = 1 + IntegrationSteps1 = 2 + IntegrationSteps2 = 3 + Tau = 1 + Lambda0 = 0.19 + NumberOfTimescales = 3 +EndIntegrator diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/hmc_tm.c b/qcd/part_cpu/applications/QCD/src/kernel_D/hmc_tm.c new file mode 100644 index 0000000000000000000000000000000000000000..c8866020b2c32b86245e96700a34465b7066b3bd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/hmc_tm.c @@ -0,0 +1,645 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + * + * Hybrid-Monte-Carlo for twisted mass QCD + * + * Author: Carsten Urbach + * urbach@physik.fu-berlin.de + * + *******************************************************************************/ +#if HAVE_CONFIG_H +#include +#endif +#ifdef BENCHMARK +#include <./c-lime/include/lime.h> +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "git_hash.h" +#include +#include +#include "getopt.h" +#include "ranlxd.h" +#include "geometry_eo.h" +#include "start.h" +#include "measure_gauge_action.h" +#include "measure_rectangles.h" +#ifdef MPI +# include "xchange/xchange.h" +#endif +#include "read_input.h" +#include "mpi_init.h" +#include "sighandler.h" +#include "update_tm.h" +#include "init/init.h" +#include "test/check_geometry.h" +#include "boundary.h" +#include "phmc.h" +#include "solver/solver.h" +#include "monomial/monomial.h" +#include "integrator.h" +#include "sighandler.h" +#include "meas/measurements.h" + +extern int nstore; + +int const rlxdsize = 105; + +static void usage(); +static void process_args(int argc, char *argv[], char ** input_filename, char ** filename); +static void set_default_filenames(char ** input_filename, char ** filename); + +int main(int argc,char *argv[]) { + + FILE *parameterfile=NULL, *countfile=NULL; + char *filename = NULL; + char datafilename[206]; + char parameterfilename[206]; + char gauge_filename[50]; + char nstore_filename[50]; + char tmp_filename[50]; + char *input_filename = NULL; + int status = 0, accept = 0; + int j,ix,mu, trajectory_counter=0; + unsigned int const io_max_attempts = 5; /* Make this configurable? */ + unsigned int const io_timeout = 5; /* Make this configurable? 
*/ + struct timeval t1; + + /* Energy corresponding to the Gauge part */ + double plaquette_energy = 0., rectangle_energy = 0.; + /* Acceptance rate */ + int Rate=0; + /* Do we want to perform reversibility checks */ + /* See also return_check_flag in read_input.h */ + int return_check = 0; + + paramsXlfInfo *xlfInfo; + +/* For online measurements */ + measurement * meas; + int imeas; + +#ifdef _KOJAK_INST +#pragma pomp inst init +#pragma pomp inst begin(main) +#endif + + /* Install the illegal-instruction handler whenever any SSE variant is enabled. + * Every operand carries its own "defined": a bare SSE3 would be macro-expanded + * and breaks the expression when SSE3 is defined without a value. */ +#if (defined SSE || defined SSE2 || defined SSE3) + signal(SIGILL,&catch_ill_inst); +#endif + + /* default file names; gauge_filename and tmp_filename are overwritten per trajectory below */ + strcpy(gauge_filename,"conf.save"); + strcpy(nstore_filename,".nstore_counter"); + strcpy(tmp_filename, ".conf.tmp"); + + verbose = 1; + g_use_clover_flag = 0; + +#ifdef MPI + +# ifdef OMP + int mpi_thread_provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided); +# else + MPI_Init(&argc, &argv); +# endif + + MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id); +#else + g_proc_id = 0; +#endif + + /* command line first, then fill in defaults for whatever was not given */ + process_args(argc,argv,&input_filename,&filename); + set_default_filenames(&input_filename,&filename); + + /* Read the input file */ + if( (status = read_input(input_filename)) != 0) { + fprintf(stderr, "Could not find input file: %s\nAborting...\n", input_filename); + exit(-1); + } + +#ifdef OMP + init_openmp(); +#endif + + DUM_DERI = 4; + DUM_SOLVER = DUM_DERI+1; + DUM_MATRIX = DUM_SOLVER+6; + if(g_running_phmc) { + NO_OF_SPINORFIELDS = DUM_MATRIX+8; + } + else { + NO_OF_SPINORFIELDS = DUM_MATRIX+6; + } + DUM_BI_DERI = 6; + DUM_BI_SOLVER = DUM_BI_DERI+7; + + DUM_BI_MATRIX = DUM_BI_SOLVER+6; + NO_OF_BISPINORFIELDS = DUM_BI_MATRIX+6; + + //4 extra fields (corresponding to DUM_MATRIX+0..5) for deg. and ND matrix mult. 
+ NO_OF_SPINORFIELDS_32 = 6; + + tmlqcd_mpi_init(argc, argv); + + /* nstore == -1: take the counters from the counter file; whatever cannot be parsed falls back to 0 */ + if(nstore == -1) { + countfile = fopen(nstore_filename, "r"); + if(countfile != NULL) { + j = fscanf(countfile, "%d %d %s\n", &nstore, &trajectory_counter, gauge_input_filename); + if(j < 1) nstore = 0; + if(j < 2) trajectory_counter = 0; + fclose(countfile); + } + else { + nstore = 0; + trajectory_counter = 0; + } + } + +#ifndef MPI + g_dbw2rand = 0; +#endif + + + g_mu = g_mu1; + + /* Allocate all fields. A failed allocation is fatal and is reported with a + * non-zero exit status, so that the calling script does not take it for success. */ +#ifdef _GAUGE_COPY + status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1); + status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 1); +#else + status = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0); + status += init_gauge_field_32(VOLUMEPLUSRAND + g_dbw2rand, 0); +#endif + /* need temporary gauge field for gauge reread checks and in update_tm */ + status += init_gauge_tmp(VOLUME); + + if (status != 0) { + fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n"); + exit(-1); + } + j = init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand); + if (j != 0) { + fprintf(stderr, "Not enough memory for geometry_indices! Aborting...\n"); + exit(-1); + } + if(even_odd_flag) { + j = init_spinor_field(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS); + j += init_spinor_field_32(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS_32); + } + else { + j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS); + j += init_spinor_field_32(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS_32); + } + if (j != 0) { + fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n"); + exit(-1); + } + if(even_odd_flag) { + j = init_csg_field(VOLUMEPLUSRAND/2); + } + else { + j = init_csg_field(VOLUMEPLUSRAND); + } + if (j != 0) { + fprintf(stderr, "Not enough memory for csg fields! Aborting...\n"); + exit(-1); + } + j = init_moment_field(VOLUME, VOLUMEPLUSRAND + g_dbw2rand); + if (j != 0) { + fprintf(stderr, "Not enough memory for moment fields! 
Aborting...\n"); + exit(-1); + } + + if(g_running_phmc) { + j = init_bispinor_field(VOLUME/2, NO_OF_BISPINORFIELDS); + if (j!= 0) { + fprintf(stderr, "Not enough memory for bi-spinor fields! Aborting...\n"); + exit(-1); + } + } + + /* list and initialize measurements*/ + if(g_proc_id == 0) { + printf("\n"); + for(j = 0; j < no_measurements; j++) { + printf("# measurement id %d, type = %d: Frequency %d\n", j, measurement_list[j].type, measurement_list[j].freq); + } + } + init_measurements(); + + /*construct the filenames for the observables and the parameters*/ + /* at most 200 characters of the base name plus the 5 character suffix fit the 206 byte buffers; + * snprintf always terminates the result, also when filename itself is not terminated within 200 bytes */ + snprintf(datafilename, sizeof(datafilename), "%.200s.data", filename); + snprintf(parameterfilename, sizeof(parameterfilename), "%.200s.para", filename); + + if(g_proc_id == 0){ + parameterfile = fopen(parameterfilename, "a"); + if(parameterfile == NULL) { + fprintf(stderr, "Could not open parameter file %s for appending\nAborting...\n", parameterfilename); + exit(-1); + } + write_first_messages(parameterfile, "hmc", git_hash); + } + + /* define the geometry */ + geometry(); + + /* define the boundary conditions for the fermion fields */ + boundary(g_kappa); + + status = check_geometry(); + + if (status != 0) { + fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n"); + exit(1); + } + + +#ifdef _USE_HALFSPINOR + j = init_dirac_halfspinor(); + if (j!= 0) { + fprintf(stderr, "Not enough memory for halffield! Aborting...\n"); + exit(-1); + } + + j = init_dirac_halfspinor32(); + if (j != 0) + { + fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n"); + exit(-1); + } + +# if (defined _PERSISTENT) + init_xchange_halffield(); +# endif +#endif + + /* Initialise random number generator */ + /* the seed is combined with the trajectory counter, so a continued run does not reuse the initial seed */ + start_ranlux_KD(rlxd_level, random_seed^trajectory_counter); + + /* Set up the gauge field */ + /* continue and restart */ + if(startoption==3 || startoption == 2) { + if(g_proc_id == 0) { + printf("# Trying to read gauge field from file %s in %s precision.\n", + gauge_input_filename, (gauge_precision_read_flag == 32 ? 
"single" : "double")); + fflush(stdout); + } + if( (status = read_gauge_field(gauge_input_filename,g_gauge_field)) != 0) { + fprintf(stderr, "Error %d while reading gauge field from %s\nAborting...\n", status, gauge_input_filename); + exit(-2); + } + + if (g_proc_id == 0){ + printf("# Finished reading gauge field.\n"); + fflush(stdout); + } + } + else if (startoption == 1) { + /* hot */ + random_gauge_field(reproduce_randomnumber_flag, g_gauge_field); + } + else if(startoption == 0) { + /* cold */ + unit_g_gauge_field(); + } + + /*For parallelization: exchange the gaugefield */ +#ifdef MPI + xchange_gauge(g_gauge_field); +#endif + + /*Convert to a 32 bit gauge field, after xchange*/ + convert_32_gauge_field(g_gauge_field_32, g_gauge_field, VOLUMEPLUSRAND + g_dbw2rand); + + + if(even_odd_flag) { + j = init_monomials(VOLUMEPLUSRAND/2, even_odd_flag); + } + else { + j = init_monomials(VOLUMEPLUSRAND, even_odd_flag); + } + if (j != 0) { + fprintf(stderr, "Not enough memory for monomial pseudo fermion fields! Aborting...\n"); + exit(-1); + } + + init_integrator(); + + if(g_proc_id == 0) { + for(j = 0; j < no_monomials; j++) { + printf("# monomial id %d type = %d timescale %d\n", j, monomial_list[j].type, monomial_list[j].timescale); + } + } + + plaquette_energy = measure_plaquette( (const su3**) g_gauge_field); + /* the rectangle part is only measured for a non-zero rectangle coefficient */ + if(g_rgi_C1 > 0. || g_rgi_C1 < 0.) 
{ + rectangle_energy = measure_rectangles( (const su3**) g_gauge_field); + if(g_proc_id == 0){ + fprintf(parameterfile,"# Computed rectangle value: %14.12f.\n",rectangle_energy/(12.*VOLUME*g_nproc)); + } + } + //eneg = g_rgi_C0 * plaquette_energy + g_rgi_C1 * rectangle_energy; + + if(g_proc_id == 0) { + fprintf(parameterfile,"# Computed plaquette value: %14.12f.\n", plaquette_energy/(6.*VOLUME*g_nproc)); + printf("# Computed plaquette value: %14.12f.\n", plaquette_energy/(6.*VOLUME*g_nproc)); + fclose(parameterfile); + } + + /* set ddummy to zero */ + for(ix = 0; ix < VOLUMEPLUSRAND; ix++){ + for(mu=0; mu<4; mu++){ + ddummy[ix][mu].d1=0.; + ddummy[ix][mu].d2=0.; + ddummy[ix][mu].d3=0.; + ddummy[ix][mu].d4=0.; + ddummy[ix][mu].d5=0.; + ddummy[ix][mu].d6=0.; + ddummy[ix][mu].d7=0.; + ddummy[ix][mu].d8=0.; + } + } + + if(g_proc_id == 0) { + gettimeofday(&t1,NULL); + countfile = fopen("history_hmc_tm", "a"); + fprintf(countfile, "!!! Timestamp %ld, Nsave = %d, g_mu = %e, g_mu1 = %e, g_mu_2 = %e, g_mu3 = %e, beta = %f, kappa = %f, C1 = %f, ", + t1.tv_sec, Nsave, g_mu, g_mu1, g_mu2, g_mu3, g_beta, g_kappa, g_rgi_C1); + for(j = 0; j < Integrator.no_timescales; j++) { + fprintf(countfile, "n_int[%d] = %d ", j, Integrator.no_mnls_per_ts[j]); + } + fprintf(countfile, "\n"); + fclose(countfile); + } + + + /* Loop for measurements */ + for(j = 0; j < Nmeas; j++) { + if(g_proc_id == 0) { + printf("#\n# Starting trajectory no %d\n", trajectory_counter); + } + + return_check = return_check_flag && (trajectory_counter%return_check_interval == 0); + + accept = update_tm(&plaquette_energy, &rectangle_energy, datafilename, + return_check, trajectory_counter>=Ntherm, trajectory_counter); + Rate += accept; + + /* Save gauge configuration all Nsave times */ + if((Nsave !=0) && (trajectory_counter%Nsave == 0) && (trajectory_counter!=0)) { + sprintf(gauge_filename,"conf.%.4d", nstore); + if(g_proc_id == 0) { + countfile = fopen("history_hmc_tm", "a"); + fprintf(countfile, "%.4d, 
measurement %d of %d, Nsave = %d, Plaquette = %e, trajectory nr = %d\n", + nstore, j, Nmeas, Nsave, plaquette_energy/(6.*VOLUME*g_nproc), + trajectory_counter); + fclose(countfile); + } + nstore ++; + } + else { + sprintf(gauge_filename,"conf.save"); + } + if(((Nsave !=0) && (trajectory_counter%Nsave == 0) && (trajectory_counter!=0)) || (write_cp_flag == 1) || (j >= (Nmeas - 1))) { + /* If a reversibility check was performed this trajectory, and the trajectory was accepted, + * then the configuration is currently stored in .conf.tmp, written out by update_tm. + * In that case also a readback was performed, so no need to test .conf.tmp + * In all other cases the gauge configuration still needs to be written out here. */ + + sprintf(tmp_filename,".conf.t%05d.tmp",trajectory_counter); + + if (!(return_check && accept)) + for (unsigned int attempt = 1; attempt <= io_max_attempts; ++attempt) + { + if (g_proc_id == 0) + fprintf(stdout, "# Writing gauge field to %s.\n", tmp_filename); + + xlfInfo = construct_paramsXlfInfo(plaquette_energy/(6.*VOLUME*g_nproc), trajectory_counter); + status = write_gauge_field( tmp_filename, gauge_precision_write_flag, xlfInfo); + free(xlfInfo); + + if (status) { + /* Writing the gauge field failed directly */ + fprintf(stderr, "Error %d while writing gauge field to %s\nAborting...\n", status, tmp_filename); + exit(-2); + } + + if (g_disable_IO_checks) { + if (g_proc_id == 0) + fprintf(stdout, "# Write completed successfully. 
Write not verified!\n"); + break; + } + + /* Read gauge field back to verify the writeout */ + if (g_proc_id == 0) + fprintf(stdout, "# Write completed, verifying write...\n"); + + for(int read_attempt = 0; read_attempt < 2; ++read_attempt) { + status = read_gauge_field(tmp_filename,gauge_tmp); + if (!status) { + if (g_proc_id == 0) + fprintf(stdout, "# Write successfully verified.\n"); + break; + } else { + if(g_proc_id==0) { + if(read_attempt+1 < 2) { + fprintf(stdout, "# Reread attempt %d out of %d failed, trying again in %d seconds!\n",read_attempt+1,2, 2); + } else { + fprintf(stdout, "# Reread attept %d out of %d failed, write will be reattempted!\n",read_attempt+1,2); + } + } + sleep(2); + } + } + + /* we broke out of the read attempt loop, still need to break out of the write attempt loop ! */ + if(!status) { + break; + } + + if (g_proc_id == 0) { + fprintf(stdout, "# Writeout of %s returned no error, but verification discovered errors.\n", tmp_filename); + fprintf(stdout, "# Potential disk or MPI I/O error.\n"); + fprintf(stdout, "# This was writing attempt %d out of %d.\n", attempt, io_max_attempts); + } + + if (attempt == io_max_attempts) + kill_with_error(NULL, g_proc_id, "Persistent I/O failures!\n"); + + if (g_proc_id == 0) + fprintf(stdout, "# Will attempt to write again in %d seconds.\n", io_timeout); + + sleep(io_timeout); +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + } + /* Now move .conf.tmp into place */ + if(g_proc_id == 0) { + fprintf(stdout, "# Renaming %s to %s.\n", tmp_filename, gauge_filename); + if (rename(tmp_filename, gauge_filename) != 0) { + /* Errno can be inspected here for more descriptive error reporting */ + fprintf(stderr, "Error while trying to rename temporary file %s to %s. 
Unable to proceed.\n", tmp_filename, gauge_filename); + exit(-2); + } + /* record store counter, next trajectory number and configuration name for a later restart */ + countfile = fopen(nstore_filename, "w"); + if(countfile != NULL) { + fprintf(countfile, "%d %d %s\n", nstore, trajectory_counter+1, gauge_filename); + fclose(countfile); + } + else { + fprintf(stderr, "Warning: could not open %s for writing, restart counter not updated.\n", nstore_filename); + } + } + } + + /* online measurements */ + for(imeas = 0; imeas < no_measurements; imeas++){ + meas = &measurement_list[imeas]; + /* a frequency of 0 would divide by zero in the modulo: such a measurement is never run */ + if(meas->freq != 0 && trajectory_counter%meas->freq == 0){ + if (g_proc_id == 0) { + fprintf(stdout, "#\n# Beginning online measurement.\n"); + } + meas->measurefunc(trajectory_counter, imeas, even_odd_flag); + } + } + + /* rank 0 runs reread_input with verbose set and gets verbose = 0 afterwards */ + if(g_proc_id == 0) { + verbose = 1; + } + ix = reread_input("hmc.reread"); + if(g_proc_id == 0) { + verbose = 0; + } + +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + if(ix == 0 && g_proc_id == 0) { + countfile = fopen("history_hmc_tm", "a"); + /* the history entry is informational only: skip it if the file cannot be opened */ + if(countfile != NULL) { + fprintf(countfile, "# Changed input parameters according to hmc.reread: measurement %d of %d\n", j, Nmeas); + fclose(countfile); + } + printf("# Changed input parameters according to hmc.reread (see stdout): measurement %d of %d\n", j, Nmeas); + remove("hmc.reread"); + } + trajectory_counter++; + } /* end of loop over trajectories */ + + if(g_proc_id == 0 && Nmeas != 0) { + printf("# Acceptance rate was %3.2f percent, %d out of %d trajectories accepted.\n", 100.*(double)Rate/(double)Nmeas, Rate, Nmeas); + fflush(stdout); + parameterfile = fopen(parameterfilename, "a"); + if(parameterfile != NULL) { + fprintf(parameterfile, "# Acceptance rate was %3.2f percent, %d out of %d trajectories accepted.\n", 100.*(double)Rate/(double)Nmeas, Rate, Nmeas); + fclose(parameterfile); + } + else { + fprintf(stderr, "Warning: could not open %s for appending.\n", parameterfilename); + } + } + + /* release everything that was allocated during start-up */ +#ifdef OMP + free_omp_accumulators(); +#endif + free_gauge_tmp(); + free_gauge_field(); + free_gauge_field_32(); + free_geometry_indices(); + free_spinor_field(); + free_spinor_field_32(); + free_moment_field(); + free_monomials(); + if(g_running_phmc) { + free_bispinor_field(); + free_chi_spinor_field(); + } + free(input_filename); + free(filename); +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); + MPI_Finalize(); +#endif + return(0); +#ifdef 
_KOJAK_INST +#pragma pomp inst end(main) +#endif +} + +/* Print the command line help to stdout and terminate the program with status 0. */ +static void usage(){ + fprintf(stdout, "HMC for Wilson twisted mass QCD\n"); + fprintf(stdout, "Version %s \n\n", PACKAGE_VERSION); + fprintf(stdout, "Please send bug reports to %s\n", PACKAGE_BUGREPORT); + fprintf(stdout, "Usage: hmc_tm [options]\n"); + fprintf(stdout, "Options: [-f input-filename] default: hmc.input\n"); + fprintf(stdout, " [-o output-filename] default: output\n"); + fprintf(stdout, " [-v] more verbosity\n"); + fprintf(stdout, " [-V] print version information and exit\n"); + fprintf(stdout, " [-h|-? this help]\n"); + exit(0); +} + +/* Parse the command line. + * -f / -o store a freshly allocated copy (at most 199 characters) of their argument in + * *input_filename / *filename, which must be NULL or heap pointers on entry; + * -v sets verbose; -V prints the version on rank 0 and exits on every rank; + * -h, -? and unknown options print the help on rank 0. */ +static void process_args(int argc, char *argv[], char ** input_filename, char ** filename) { + int c; + while ((c = getopt(argc, argv, "h?vVf:o:")) != -1) { + switch (c) { + case 'f': + /* release the buffer of an earlier -f so that a repeated option does not leak */ + free(*input_filename); + *input_filename = calloc(200, sizeof(char)); + if( *input_filename == NULL ) { + fprintf(stderr, "Not enough memory for the input filename! Aborting...\n"); + exit(-1); + } + /* copy at most 199 characters: the last zeroed byte always remains as terminator */ + strncpy(*input_filename, optarg, 199); + break; + case 'o': + free(*filename); + *filename = calloc(200, sizeof(char)); + if( *filename == NULL ) { + fprintf(stderr, "Not enough memory for the output filename! Aborting...\n"); + exit(-1); + } + strncpy(*filename, optarg, 199); + break; + case 'v': + verbose = 1; + break; + case 'V': + if(g_proc_id == 0) { + fprintf(stdout,"%s %s\n",PACKAGE_STRING,git_hash); + } + exit(0); + break; + case 'h': + case '?': + default: + if( g_proc_id == 0 ) { + usage(); + } + break; + } + } +} + +/* Fill in "hmc.input" / "output" for every file name that was not set on the command line. */ +static void set_default_filenames(char ** input_filename, char ** filename) { + if( *input_filename == NULL ) { + *input_filename = calloc(13, sizeof(char)); + if( *input_filename == NULL ) { + fprintf(stderr, "Not enough memory for the input filename! Aborting...\n"); + exit(-1); + } + strcpy(*input_filename,"hmc.input"); + } + + if( *filename == NULL ) { + *filename = calloc(7, sizeof(char)); + if( *filename == NULL ) { + fprintf(stderr, "Not enough memory for the output filename! Aborting...\n"); + exit(-1); + } + strcpy(*filename,"output"); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/hopping_test.c b/qcd/part_cpu/applications/QCD/src/kernel_D/hopping_test.c new file mode 100644 index 0000000000000000000000000000000000000000..d33010f2d2bdedbacdbab20398e9a7e9672897f0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/hopping_test.c @@ -0,0 +1,373 @@ +/*********************************************************************** + * Copyright (C) 
2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/******************************************************************************* +* +* Test program for the even-odd preconditioned Wilson-Dirac operator +* +* +*******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include +#if (defined BGL && !defined BGP) +# include +#endif +#ifdef MPI +# include +# ifdef HAVE_LIBLEMON +# include +# include +# endif +#endif +#include "su3.h" +#include "su3adj.h" +#include "su3spinor.h" +#include "ranlxd.h" +#include "geometry_eo.h" +#include "read_input.h" +#include "start.h" +#include "boundary.h" +#include "operator/Hopping_Matrix.h" +#include "operator/Hopping_Matrix_nocom.h" +#include "operator/tm_operators.h" +#include "global.h" +#include "xchange/xchange.h" +#include "init/init.h" +#include "test/check_geometry.h" +#include "operator/D_psi.h" +#include "phmc.h" +#include "mpi_init.h" +#include "io/io_cm.h" + +#ifdef PARALLELT +# define SLICE (LX*LY*LZ/2) +#elif defined PARALLELXT +# define SLICE ((LX*LY*LZ/2)+(T*LY*LZ/2)) +#elif defined PARALLELXYT +# define SLICE ((LX*LY*LZ/2)+(T*LY*LZ/2) + (T*LX*LZ/2)) +#elif defined PARALLELXYZT +# define 
SLICE ((LX*LY*LZ/2)+(T*LY*LZ/2) + (T*LX*LZ/2) + (T*LX*LY/2)) +#elif defined PARALLELX +# define SLICE ((LY*LZ*T/2)) +#elif defined PARALLELXY +# define SLICE ((LY*LZ*T/2) + (LX*LZ*T/2)) +#elif defined PARALLELXYZ +# define SLICE ((LY*LZ*T/2) + (LX*LZ*T/2) + (LX*LY*T/2)) +#endif + + +#define MAX(A, B) ((A) > (B) ? (A) : (B)) +#define MIN(A, B) ((A) < (B) ? (A) : (B)) + +#if (defined BGL && !defined BGP) +static double clockspeed=1.0e-6/700.0; + +double bgl_wtime() { + return ( rts_get_timebase() * clockspeed ); +} +#else +# ifdef MPI +double bgl_wtime() { return(MPI_Wtime()); } +# else +double bgl_wtime() { return(0); } +# endif +#endif + +int check_xchange(); + +int main(int argc,char *argv[]) +{ + int j,j_max,k,k_max = 2; + paramsXlfInfo *xlfInfo; + int ix, n, *nn,*mm,i; + double delta, deltamax; + spinor rsp; + int status = 0; +#ifdef MPI + DUM_DERI = 6; + DUM_SOLVER = DUM_DERI+2; + DUM_MATRIX = DUM_SOLVER+6; + NO_OF_SPINORFIELDS = DUM_MATRIX+2; + + MPI_Init(&argc, &argv); +#endif + g_rgi_C1 = 1.; + + /* Read the input file */ + read_input("hopping_test.input"); + + tmlqcd_mpi_init(argc, argv); + + if(g_proc_id==0) { +#ifdef SSE + printf("# The code was compiled with SSE instructions\n"); +#endif +#ifdef SSE2 + printf("# The code was compiled with SSE2 instructions\n"); +#endif +#ifdef SSE3 + printf("# The code was compiled with SSE3 instructions\n"); +#endif +#ifdef P4 + printf("# The code was compiled for Pentium4\n"); +#endif +#ifdef OPTERON + printf("# The code was compiled for AMD Opteron\n"); +#endif +#ifdef _GAUGE_COPY + printf("# The code was compiled with -D_GAUGE_COPY\n"); +#endif +#ifdef BGL + printf("# The code was compiled for Blue Gene/L\n"); +#endif +#ifdef BGP + printf("# The code was compiled for Blue Gene/P\n"); +#endif +#ifdef _USE_HALFSPINOR + printf("# The code was compiled with -D_USE_HALFSPINOR\n"); +#endif +#ifdef _USE_SHMEM + printf("# the code was compiled with -D_USE_SHMEM\n"); +# ifdef _PERSISTENT + printf("# the code was compiled for 
persistent MPI calls (halfspinor only)\n"); +# endif +#endif +#ifdef _INDEX_INDEP_GEOM + printf("# the code was compiled with index independent geometry\n"); +#endif +#ifdef MPI +# ifdef _NON_BLOCKING + printf("# the code was compiled for non-blocking MPI calls (spinor and gauge)\n"); +# endif +# ifdef _USE_TSPLITPAR + printf("# the code was compiled with tsplit parallelization\n"); +# endif +#endif + printf("\n"); + fflush(stdout); + } + + +#ifdef _GAUGE_COPY + init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1); +#else + init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0); +#endif + init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand); + + if(even_odd_flag) { + j = init_spinor_field(VOLUMEPLUSRAND/2, 2*k_max+1); + } + else { + j = init_spinor_field(VOLUMEPLUSRAND, 2*k_max); + } + + if ( j!= 0) { + fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n"); + exit(0); + } + j = init_moment_field(VOLUME, VOLUMEPLUSRAND); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for moment fields! Aborting...\n"); + exit(0); + } + + if(g_proc_id == 0) { + fprintf(stdout,"The number of processes is %d \n",g_nproc); + printf("# The lattice size is %d x %d x %d x %d\n", + (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ)); + printf("# The local lattice size is %d x %d x %d x %d\n", + (int)(T), (int)(LX), (int)(LY),(int) LZ); + if(even_odd_flag) { + printf("# testinging the even/odd preconditioned Dirac operator\n"); + } + else { + printf("# testinging the standard Dirac operator\n"); + } + fflush(stdout); + } + + /* define the geometry */ + geometry(); + /* define the boundary conditions for the fermion fields */ + boundary(g_kappa); + +#ifdef _USE_HALFSPINOR + j = init_dirac_halfspinor(); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for halfspinor fields! 
Aborting...\n"); + exit(0); + } + if(g_sloppy_precision_flag == 1) { + g_sloppy_precision = 1; + j = init_dirac_halfspinor32(); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for 32-Bit halfspinor fields! Aborting...\n"); + exit(0); + } + } +# if (defined _PERSISTENT) + init_xchange_halffield(); +# endif +#endif + + status = check_geometry(); + if (status != 0) { + fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n"); + exit(1); + } + +#if (defined MPI && !(defined _USE_SHMEM)) + check_xchange(); +#endif + + start_ranlux_KD(1, 123456); + + xlfInfo = construct_paramsXlfInfo(0.5, 0); + + random_gauge_field(reproduce_randomnumber_flag, g_gauge_field); + if ( startoption == 2 ) { /* restart */ + write_gauge_field(gauge_input_filename,gauge_precision_write_flag,xlfInfo); + } else if ( startoption == 0 ) { /* cold */ + unit_g_gauge_field(); + } else if (startoption == 3 ) { /* continue */ + read_gauge_field(gauge_input_filename,g_gauge_field); + } else if ( startoption == 1 ) { /* hot */ + } + + +#ifdef MPI + /*For parallelization: exchange the gaugefield */ + xchange_gauge(g_gauge_field); +#endif + + if(even_odd_flag) { + /*initialize the pseudo-fermion fields*/ + j_max=1; + for (k = 0; k < k_max; k++) { + random_spinor_field_eo(g_spinor_field[k], reproduce_randomnumber_flag, RN_GAUSS); + } + + if (read_source_flag == 2) { /* save */ + /* even first, odd second */ + write_spinorfield_cm_single(g_spinor_field[0],g_spinor_field[1],SourceInfo.basename); + } else if (read_source_flag == 1) { /* yes */ + /* even first, odd second */ + read_spinorfield_cm_single(g_spinor_field[0],g_spinor_field[1],SourceInfo.basename,-1,0); +# if (!defined MPI) + if (write_cp_flag == 1) { + strcat(SourceInfo.basename,".2"); + read_spinorfield_cm_single(g_spinor_field[2],g_spinor_field[3],SourceInfo.basename,-1,0); + + nn=(int*)calloc(VOLUME,sizeof(int)); + if((void*)nn == NULL) return(100); + mm=(int*)calloc(VOLUME,sizeof(int)); + if((void*)mm == NULL) 
return(100); + + n=0; + deltamax=0.0; + for(ix=0;ix 1.0e-12) { + nn[n] = g_eo2lexic[ix]; + mm[n]=ix; + n++; + } + if(delta>deltamax) deltamax=delta; + } + if (n>0){ + printf("mismatch in even spincolorfield in %d points:\n",n); + for(i=0; i< MIN(n,1000); i++){ + printf("%d,(%d,%d,%d,%d):%f vs. %f\n",nn[i],g_coord[nn[i]][0],g_coord[nn[i]][1],g_coord[nn[i]][2],g_coord[nn[i]][3],creal((g_spinor_field[2][mm[i]].s0).c0), creal((g_spinor_field[0][mm[i]].s0).c0));fflush(stdout); + } + } + n = 0; + for(ix=0;ix 1.0e-12) { + nn[n]=g_eo2lexic[ix+(VOLUME+RAND)/2]; + mm[n]=ix; + n++; + } + if(delta>deltamax) deltamax=delta; + } + if (n>0){ + printf("mismatch in odd spincolorfield in %d points:\n",n); + for(i=0; i< MIN(n,1000); i++){ + printf("%d,(%d,%d,%d,%d):%f vs. %f\n",nn[i],g_coord[nn[i]][0],g_coord[nn[i]][1],g_coord[nn[i]][2],g_coord[nn[i]][3],creal(g_spinor_field[3][mm[i]].s0.c0), creal(g_spinor_field[1][mm[i]].s0.c0));fflush(stdout); + } + } + printf("max delta=%e",deltamax);fflush(stdout); + } +# endif + } + + if (read_source_flag > 0 && write_cp_flag == 0) { /* read-source yes or nobutsave; checkpoint no */ + /* first spinorial arg is output, the second is input */ + Hopping_Matrix(1, g_spinor_field[1], g_spinor_field[0]); /*ieo=1 M_{eo}*/ + Hopping_Matrix(0, g_spinor_field[0], g_spinor_field[1]); /*ieo=0 M_{oe}*/ + strcat(SourceInfo.basename,".out"); + write_spinorfield_cm_single(g_spinor_field[0],g_spinor_field[1],SourceInfo.basename); + printf("Check-field printed. 
Exiting...\n"); + fflush(stdout); + } + +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); + MPI_Finalize(); +#endif + } + + free_gauge_field(); + free_geometry_indices(); + free_spinor_field(); + free_moment_field(); + return(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/include/tmLQCD.h b/qcd/part_cpu/applications/QCD/src/kernel_D/include/tmLQCD.h new file mode 100755 index 0000000000000000000000000000000000000000..edcebb5a850d9af6b21785c255a6ceb581a2868c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/include/tmLQCD.h @@ -0,0 +1,58 @@ +/*********************************************************************** + * + * Copyright (C) 2014 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + * invert wrapper for using tmLQCD as a library + * + * Author: Carsten Urbach + * curbach@gmx.de + * + *******************************************************************************/ + +#ifndef _TMLQCD_H +#define _TMLQCD_H + +#ifdef __cplusplus +extern "C" +{ +#endif /* __cplusplus */ + + typedef struct { + unsigned int LX, LY, LZ, T, nstore, nsave, no_operators; + } tmLQCD_lat_params; + + typedef struct { + unsigned int nproc, nproc_t, nproc_x, nproc_y, nproc_z, cart_id, proc_id, time_rank, omp_num_threads; + unsigned int proc_coords[4]; + } tmLQCD_mpi_params; + + int tmLQCD_invert_init(int argc, char *argv[], const int verbose); + int tmLQCD_read_gauge(const int nconfig); + int tmLQCD_invert(double * const propagator, double * const source, + const int op_id, const int write_prop); + int tmLQCD_finalise(); + + int tmLQCD_get_gauge_field_pointer(double ** gf); + int tmLQCD_get_mpi_params(tmLQCD_mpi_params * params); + int tmLQCD_get_lat_params(tmLQCD_lat_params * params); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_D/init/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..b6fe3480700c3f27cb484a947e051ee7367bb51d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/Makefile @@ -0,0 +1,99 @@ + +srcdir = . +top_builddir = .. +abs_top_builddir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +top_srcdir = .. +abs_top_srcdir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +subdir = init +builddir = . + +CFLAGS = -std=c99 -fopenmp -pedantic -Wall +DEPFLAGS = -MM +LDFLAGS = -L${HOME}/lib -L${top_builddir}/lib +DEFS = -DHAVE_CONFIG_H +OPTARGS = -O +SOPTARGS = -O + +AR = ar +RANLIB = ranlib +CC = mpicc +CCDEP = gcc +CCLD = ${CC} +LINK = ${CCLD} ${CFLAGS} ${LDFLAGS} ${OPTARGS} -o $@ +LEX = flex +AUTOCONF = autoconf +DEFS = -DHAVE_CONFIG_H + +INCLUDES = -I$(HOME)/include/ -I. 
-I${abs_top_builddir}/ -I${abs_top_srcdir}/ -I/include/ -I/include/ +LDADD = +#COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} +COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS} + +LIBRARIES = libinit +libinit_TARGETS = init_moment_field init_gauge_tmp init_gauge_field \ + init_geometry_indices init_spinor_field init_dirac_halfspinor \ + init_chi_spinor_field init_bispinor_field init_jacobi_field \ + init_omp_accumulators init_openmp + +libinit_STARGETS = + +libinit_OBJECTS = $(addsuffix .o, ${libinit_TARGETS}) +libinit_SOBJECTS = $(addsuffix .o, ${libinit_STARGETS}) + +# default rule + +all: Makefile dep libinit.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) -g +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) +profile all-profile: all + + +#include dep rules + +-include $(addsuffix .d,${libinit_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +${libinit_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${OPTARGS} -c $< + +${libinit_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${SOPTARGS} -c $< + +# rule to make libinit + +libinit.a: ${libinit_OBJECTS} ${libinit_SOBJECTS} Makefile + @rm -f libinit.a + @${AR} cru libinit.a ${libinit_OBJECTS} ${libinit_SOBJECTS} + @$(RANLIB) libinit.a + @cp libinit.a ../lib/libinit.a + +# rule to generate .d files + +$(addsuffix .d, $(libinit_TARGETS) ${libinit_STARGETS}): %.d: ${srcdir}/%.c Makefile + @${CCDEP} ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libinit_TARGETS} ${libinit_STARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} ${$(addsuffix _SOBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libinit.a + +distclean: clean + rm -f Makefile + +.PHONY: all dep clean 
compile-clean distclean profile all-profile debug all-debug diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_D/init/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..65cae4101ff818bf202dc0c78a88689774b4b844 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/Makefile.in @@ -0,0 +1,99 @@ + +srcdir = @srcdir@ +top_builddir = @top_builddir@ +abs_top_builddir = @abs_top_builddir@ +top_srcdir = @top_srcdir@ +abs_top_srcdir = @abs_top_srcdir@ +subdir = init +builddir = @builddir@ + +CFLAGS = @CFLAGS@ +DEPFLAGS = @DEPFLAGS@ +LDFLAGS = @LDFLAGS@ +DEFS = @DEFS@ +OPTARGS = @OPTARGS@ +SOPTARGS = @SOPTARGS@ + +AR = @AR@ +RANLIB = @RANLIB@ +CC = @CC@ +CCDEP = @CCDEP@ +CCLD = ${CC} +LINK = ${CCLD} ${CFLAGS} ${LDFLAGS} ${OPTARGS} -o $@ +LEX = @LEX@ +AUTOCONF = @AUTOCONF@ +DEFS = @DEFS@ + +INCLUDES = @INCLUDES@ +LDADD = +#COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} +COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS} + +LIBRARIES = libinit +libinit_TARGETS = init_moment_field init_gauge_tmp init_gauge_field \ + init_geometry_indices init_spinor_field init_dirac_halfspinor \ + init_chi_spinor_field init_bispinor_field init_jacobi_field \ + init_omp_accumulators init_openmp + +libinit_STARGETS = + +libinit_OBJECTS = $(addsuffix .o, ${libinit_TARGETS}) +libinit_SOBJECTS = $(addsuffix .o, ${libinit_STARGETS}) + +# default rule + +all: Makefile dep libinit.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@ +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@ +profile all-profile: all + + +#include dep rules + +-include $(addsuffix .d,${libinit_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +${libinit_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${OPTARGS} -c $< + 
+${libinit_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${SOPTARGS} -c $< + +# rule to make libinit + +libinit.a: ${libinit_OBJECTS} ${libinit_SOBJECTS} Makefile + @rm -f libinit.a + @${AR} cru libinit.a ${libinit_OBJECTS} ${libinit_SOBJECTS} + @$(RANLIB) libinit.a + @cp libinit.a ../lib/libinit.a + +# rule to generate .d files + +$(addsuffix .d, $(libinit_TARGETS) ${libinit_STARGETS}): %.d: ${srcdir}/%.c Makefile + @${CCDEP} ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libinit_TARGETS} ${libinit_STARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} ${$(addsuffix _SOBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libinit.a + +distclean: clean + rm -f Makefile + +.PHONY: all dep clean compile-clean distclean profile all-profile debug all-debug diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init.h b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init.h new file mode 100644 index 0000000000000000000000000000000000000000..5449774b806170174f5b0d57f82bc5965d042592 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init.h @@ -0,0 +1,41 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _INIT_H +#define _INIT_H + +#include "init/init_bispinor_field.h" +#include "init/init_chi_spinor_field.h" +#include "init/init_dirac_halfspinor.h" +#include "init/init_gauge_field.h" +#include "init/init_gauge_tmp.h" +#include "init/init_geometry_indices.h" +#ifdef WITHLAP +# include "init/init_jacobi_field.h" +#endif +#include "init/init_moment_field.h" +#include "init/init_spinor_field.h" +#include "init/init_stout_smear_vars.h" +#ifdef OMP +# include +# include "init/init_omp_accumulators.h" +# include "init/init_openmp.h" +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_bispinor_field.c b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_bispinor_field.c new file mode 100644 index 0000000000000000000000000000000000000000..518903b7c696b05760a185436d07dbd90e6eef48 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_bispinor_field.c @@ -0,0 +1,65 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "sse.h" + + +bispinor * bisp = NULL; + +int init_bispinor_field(const int V, const int nr) { + int i = 0; + + if((void*)(bisp = (bispinor*)calloc(nr*V+1, sizeof(bispinor))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(1); + } + if((void*)(g_bispinor_field = malloc(nr*sizeof(bispinor*))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(2); + } +#if ( defined SSE || defined SSE2 || defined SSE3) + g_bispinor_field[0] = (bispinor*)(((unsigned long int)(bisp)+ALIGN_BASE)&~ALIGN_BASE); +#else + g_bispinor_field[0] = bisp; +#endif + + for(i = 1; i < nr; i++){ + g_bispinor_field[i] = g_bispinor_field[i-1]+V; + } + + return(0); +} + +void free_bispinor_field() { + + free(bisp); + /* free(sp_csg); */ +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_bispinor_field.h b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_bispinor_field.h new file mode 100644 index 0000000000000000000000000000000000000000..be7df66cbf55ff947472f4ff582cb218ae58abd8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_bispinor_field.h @@ -0,0 +1,27 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _INIT_BISPINOR_FIELD_H +#define _INIT_BISPINOR_FIELD_H + +/* Allocates nr bispinor fields of V sites each and sets up g_bispinor_field[0..nr-1]; returns 0 on success, 1 if the field storage and 2 if the pointer table cannot be allocated. */ +int init_bispinor_field(const int V, const int nr); +/* Frees the bispinor field storage allocated by init_bispinor_field(). */ +void free_bispinor_field(); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_chi_spinor_field.c b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_chi_spinor_field.c new file mode 100644 index 0000000000000000000000000000000000000000..6a63ba4fb438110ceffc2437e1f91b223fb63b5e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_chi_spinor_field.c @@ -0,0 +1,80 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "sse.h" +#include "init_chi_spinor_field.h" + +spinor * sp_up = NULL; + +static int chi_initialised = 0; + +int init_chi_spinor_field(const int V, const int nr) { + int i = 0; + static int _nr = 0; + + if(!chi_initialised || nr > _nr) { + free_chi_spinor_field(); + _nr = nr; + chi_initialised = 1; + if((void*)(sp_up = (spinor*)calloc(2*nr*V+1, sizeof(spinor))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(1); + } + if((void*)(g_chi_up_spinor_field = malloc(2*nr*sizeof(spinor*))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(2); + } + if((void*)(g_chi_dn_spinor_field = malloc(nr*sizeof(spinor*))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(2); + } + g_chi_up_spinor_field[0] = (spinor*)(((unsigned long int)(sp_up)+ALIGN_BASE)&~ALIGN_BASE); + + for(int i = 1; i < 2*nr; i++){ + g_chi_up_spinor_field[i] = g_chi_up_spinor_field[i-1]+V; + } + for(int i = 0; i < nr; i++){ + g_chi_dn_spinor_field[i] = g_chi_up_spinor_field[nr+i]; + } + + } + return(0); +} + +void free_chi_spinor_field() { + if(chi_initialised) { + free(sp_up); + free(g_chi_dn_spinor_field); + free(g_chi_up_spinor_field); + } + return; +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_chi_spinor_field.h b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_chi_spinor_field.h new file mode 100644 index 0000000000000000000000000000000000000000..6634b8db52eb1c7581b06437fe98ef2c675c074e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_chi_spinor_field.h @@ -0,0 +1,25 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _INIT_CHI_SPINOR_FIELD_H +#define _INIT_CHI_SPINOR_FIELD_H +/* init_chi_spinor_field() sets up 2*nr spinor fields of V sites each behind g_chi_up_spinor_field, with g_chi_dn_spinor_field[i] aliasing entry nr+i; it (re)allocates only on the first call or when nr grows. Returns 0 on success, 1 if the field storage and 2 if a pointer table cannot be allocated. */ +int init_chi_spinor_field(const int V, const int nr); +void free_chi_spinor_field(); /* frees the field storage and both pointer tables if they have been set up */ + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_dirac_halfspinor.c b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_dirac_halfspinor.c new file mode 100644 index 0000000000000000000000000000000000000000..125981466b4d8c9b91ceb04c05c8bd7b1644c63c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_dirac_halfspinor.c @@ -0,0 +1,583 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008,2012 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#if (defined SPI) +# include "DirectPut.h" +#endif +#include "global.h" +#include "su3.h" +#include "init_dirac_halfspinor.h" +#include "fatal_error.h" + +#ifdef BGQ +# define SPI_ALIGN_BASE 0x7f +#else +# define SPI_ALIGN_BASE ALIGN_BASE +#endif + +halfspinor ** NBPointer_; +halfspinor * HalfSpinor_; +halfspinor * HalfSpinor ALIGN; +halfspinor *** NBPointer; +halfspinor * sendBuffer, * recvBuffer; +halfspinor * sendBuffer_, * recvBuffer_; + +/* The single precision versions */ +halfspinor32 ** NBPointer32_; +halfspinor32 * HalfSpinor32_; +halfspinor32 * HalfSpinor32 ALIGN; +halfspinor32 *** NBPointer32; +halfspinor32 * sendBuffer32, * recvBuffer32; +halfspinor32 * sendBuffer32_, * recvBuffer32_; + + +int init_dirac_halfspinor() { + int j=0, k; + int x, y, z, t; + + NBPointer = (halfspinor***) calloc(4,sizeof(halfspinor**)); + NBPointer_ = (halfspinor**) calloc(16,(VOLUME+RAND)*sizeof(halfspinor*)); + NBPointer[0] = NBPointer_; + NBPointer[1] = NBPointer_ + (8*(VOLUME+RAND)/2); + NBPointer[2] = NBPointer_ + (16*(VOLUME+RAND)/2); + NBPointer[3] = NBPointer_ + (24*(VOLUME+RAND)/2); + + if((void*)(HalfSpinor_ = (halfspinor*)calloc(4*(VOLUME)+1, sizeof(halfspinor))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(1); + } + + HalfSpinor = (halfspinor*)(((unsigned long int)(HalfSpinor_)+ALIGN_BASE+1)&~ALIGN_BASE); + +#ifdef MPI + if((void*)(sendBuffer_ = (halfspinor*)calloc(RAND/2+8, sizeof(halfspinor))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(1); + } + sendBuffer = (halfspinor*)(((unsigned long int)(sendBuffer_)+SPI_ALIGN_BASE+1)&~SPI_ALIGN_BASE); + if((void*)(recvBuffer_ = (halfspinor*)calloc(RAND/2+8, sizeof(halfspinor))) == NULL) { + printf ("malloc errno 
: %d\n",errno); + errno = 0; + return(1); + } + recvBuffer = (halfspinor*)(((unsigned long int)(recvBuffer_)+SPI_ALIGN_BASE+1)&~SPI_ALIGN_BASE); +#endif + + for(int ieo = 0; ieo < 2; ieo++) { + for(int i = 0; i < VOLUME/2; i++) { + j = g_eo2lexic[i + ((ieo+1)%2)*(VOLUME+RAND)/2]; + /* get (t,x,y,z) from j */ + t = j/(LX*LY*LZ); + x = (j-t*(LX*LY*LZ))/(LY*LZ); + y = (j-t*(LX*LY*LZ)-x*(LY*LZ))/(LZ); + z = (j-t*(LX*LY*LZ)-x*(LY*LZ) - y*LZ); + for(int mu = 0; mu < 4; mu++) { + NBPointer[ieo][8*i + 2*mu + 0] = &HalfSpinor[ 8*g_lexic2eosub[ g_idn[j][mu] ] + 2*mu + 0]; + NBPointer[ieo][8*i + 2*mu + 1] = &HalfSpinor[ 8*g_lexic2eosub[ g_iup[j][mu] ] + 2*mu + 1]; + } +#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(t == 0) { + k = (g_lexic2eosub[g_idn[j][0]] - VOLUME/2); + NBPointer[ieo][8*i] = &sendBuffer[ k ]; + } + if(t == T-1) { + k = (g_lexic2eosub[g_iup[j][0]] - VOLUME/2); + NBPointer[ieo][8*i + 1] = &sendBuffer[ k ]; + } +#endif +#if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(x == 0) { + k = (g_lexic2eosub[g_idn[j][1]] - VOLUME/2); + NBPointer[ieo][8*i + 2] = &sendBuffer[ k ]; + } + if(x == LX-1) { + k = (g_lexic2eosub[g_iup[j][1]] - VOLUME/2); + NBPointer[ieo][8*i + 3] = &sendBuffer[ k ]; + } +#endif +#if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(y == 0) { + k = (g_lexic2eosub[g_idn[j][2]] - VOLUME/2); + NBPointer[ieo][8*i + 4] = &sendBuffer[ k ]; + } + if(y == LY-1) { + k = (g_lexic2eosub[g_iup[j][2]] - VOLUME/2); + NBPointer[ieo][8*i + 5] = &sendBuffer[ k ]; + } +#endif +#if ((defined PARALLELXYZ) || (defined PARALLELXYZT)) + if(z == 0) { + k = (g_lexic2eosub[g_idn[j][3]] - VOLUME/2); + NBPointer[ieo][8*i + 6] = &sendBuffer[ k ]; + } + if(z == LZ-1) { + k = (g_lexic2eosub[g_iup[j][3]] - VOLUME/2); + NBPointer[ieo][8*i + 7] = &sendBuffer[ k 
]; + } +#endif + } + for(int i = VOLUME/2; i < (VOLUME+RAND)/2; i++) { + for(int mu = 0; mu < 8; mu++) { + NBPointer[ieo][8*i + mu] = NBPointer[ieo][0]; + } + } + } + for(int ieo = 2; ieo < 4; ieo++) { + for(int i = 0; i < VOLUME/2; i++) { + j = g_eo2lexic[i + ((ieo+0)%2)*(VOLUME+RAND)/2]; + /* get (t,x,y,z) from j */ + t = j/(LX*LY*LZ); + x = (j-t*(LX*LY*LZ))/(LY*LZ); + y = (j-t*(LX*LY*LZ)-x*(LY*LZ))/(LZ); + z = (j-t*(LX*LY*LZ)-x*(LY*LZ) - y*LZ); + for(int mu = 0; mu < 8; mu++) { + NBPointer[ieo][8*i + mu] = &HalfSpinor[8*i + mu]; + } +#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(t == T-1) { + NBPointer[ieo][8*i] = &recvBuffer[ (g_lexic2eosub[ g_iup[j][0] ] - VOLUME/2)]; + } + if(t == 0) { + NBPointer[ieo][8*i + 1] = &recvBuffer[ (g_lexic2eosub[ g_idn[j][0] ] - VOLUME/2)]; + } +#endif +#if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(x == LX-1) { + NBPointer[ieo][8*i + 2] = &recvBuffer[ (g_lexic2eosub[ g_iup[j][1] ] - VOLUME/2)]; + } + if(x == 0) { + NBPointer[ieo][8*i + 3] = &recvBuffer[ (g_lexic2eosub[ g_idn[j][1] ] - VOLUME/2)]; + } +#endif +#if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(y == LY-1) { + NBPointer[ieo][8*i + 4] = &recvBuffer[ (g_lexic2eosub[ g_iup[j][2] ] - VOLUME/2)]; + } + if(y == 0) { + NBPointer[ieo][8*i + 5] = &recvBuffer[ (g_lexic2eosub[ g_idn[j][2] ] - VOLUME/2)]; + } +#endif +#if ((defined PARALLELXYZ) || (defined PARALLELXYZT)) + if(z == LZ-1) { + NBPointer[ieo][8*i + 6] = &recvBuffer[ (g_lexic2eosub[ g_iup[j][3] ] - VOLUME/2)]; + } + if(z == 0) { + NBPointer[ieo][8*i + 7] = &recvBuffer[ (g_lexic2eosub[ g_idn[j][3] ] - VOLUME/2)]; + } +#endif + } + for(int i = VOLUME/2; i < (VOLUME+RAND)/2; i++) { + for(int mu = 0; mu < 8; mu++) { + NBPointer[ieo][8*i + mu] = NBPointer[ieo][0]; + } + } + } +#if (defined SPI && 
defined MPI) + // here comes the SPI initialisation + uint64_t messageSizes[NUM_DIRS]; + uint64_t roffsets[NUM_DIRS], soffsets[NUM_DIRS]; + +#if (defined PARALLELT) + spi_num_dirs = 2; +#endif +#if (defined PARALLELXT) + spi_num_dirs = 4; +#endif +#if (defined PARALLELXYT) + spi_num_dirs = 6; +#endif +#if (defined PARALLELXYZT) + spi_num_dirs = 8; +#endif + + totalMessageSize = 0; + for(unsigned int i = 0; i < spi_num_dirs; i++) { + // message sizes in Bytes + if(i == 0 || i == 1) messageSizes[i] = LX*LY*LZ*6*sizeof(double); + else if(i == 2 || i == 3) messageSizes[i] = T*LY*LZ*6*sizeof(double); + else if(i == 4 || i == 5) messageSizes[i] = T*LX*LZ*6*sizeof(double); + else if(i == 6 || i == 7) messageSizes[i] = T*LX*LY*6*sizeof(double); + + soffsets[i] = totalMessageSize; + totalMessageSize += messageSizes[i]; + } + for(unsigned int i = 0; i < spi_num_dirs; i++) { + // forward here is backward on the right neighbour + // and the other way around... + if(i%2 == 0) { + roffsets[i] = soffsets[i] + messageSizes[i]; + } + else { + roffsets[i] = soffsets[i] - messageSizes[i-1]; + } + } + + Personality_t pers; + int rc = 0; + // get the CNK personality + Kernel_GetPersonality(&pers, sizeof(pers)); + int mypers[6]; + mypers[0] = pers.Network_Config.Acoord; + mypers[1] = pers.Network_Config.Bcoord; + mypers[2] = pers.Network_Config.Ccoord; + mypers[3] = pers.Network_Config.Dcoord; + mypers[4] = pers.Network_Config.Ecoord; + + get_destinations(mypers); + + // adjust the SPI pointers to the send and receive buffers + SPIrecvBuffers = (char *)(recvBuffer); + SPIsendBuffers = (char *)(sendBuffer); + + // Setup the FIFO handles + rc = msg_InjFifoInit ( &injFifoHandle, + 0, /* startingSubgroupId */ + 0, /* startingFifoId */ + spi_num_dirs, /* numFifos */ + INJ_MEMORY_FIFO_SIZE+1, /* fifoSize */ + NULL /* Use default attributes */ + ); + if(rc != 0) { + fprintf(stderr, "msg_InjFifoInit failed with rc=%d\n",rc); + exit(1); + } + + // Set up base address table for reception counter 
and buffer + setup_mregions_bats_counters(totalMessageSize); + + // Create descriptors + // Injection Direct Put Descriptor, one for each neighbour + SPIDescriptors = + ( MUHWI_Descriptor_t *)(((uint64_t)SPIDescriptorsMemory+64)&~(64-1)); + create_descriptors(SPIDescriptors, messageSizes, soffsets, roffsets, spi_num_dirs); + + // test communication + for(unsigned int i = 0; i < RAND/2; i++) { + sendBuffer[i].s0.c0 = (double)g_cart_id; + sendBuffer[i].s0.c1 = (double)g_cart_id; + sendBuffer[i].s0.c2 = (double)g_cart_id; + sendBuffer[i].s1.c0 = (double)g_cart_id; + sendBuffer[i].s1.c1 = (double)g_cart_id; + sendBuffer[i].s1.c2 = (double)g_cart_id; + } + + // Initialize the barrier, resetting the hardware. + rc = MUSPI_GIBarrierInit ( &GIBarrier, 0 /*comm world class route */); + if(rc) { + printf("MUSPI_GIBarrierInit returned rc = %d\n", rc); + exit(__LINE__); + } + // reset the recv counter + recvCounter = totalMessageSize; + global_barrier(); // make sure everybody is set recv counter + + //#pragma omp for nowait + for (unsigned int j = 0; j < spi_num_dirs; j++) { + descCount[ j ] = + msg_InjFifoInject ( injFifoHandle, + j, + &SPIDescriptors[j]); + } + // wait for receive completion + while ( recvCounter > 0 ); + + _bgq_msync(); + + j = 0; + for(unsigned int i = 0; i < spi_num_dirs; i++) { + if(i == 0) k = g_nb_t_up; + if(i == 1) k = g_nb_t_dn; + if(i == 2) k = g_nb_x_up; + if(i == 3) k = g_nb_x_dn; + if(i == 4) k = g_nb_y_up; + if(i == 5) k = g_nb_y_dn; + if(i == 6) k = g_nb_z_up; + if(i == 7) k = g_nb_z_dn; + for(int mu = 0; mu < messageSizes[i]/sizeof(halfspinor); mu++) { + if(k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s0.c0) || + k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s0.c1) || + k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s0.c2) || + k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s1.c0) || + k != (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s1.c1) || + k 
!= (int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s1.c2)) { + if(g_cart_id == 0) { + printf("SPI exchange doesn't work for dir %d: %d != %d at point %d\n", + i, k ,(int)creal(recvBuffer[ soffsets[i]/sizeof(halfspinor) + mu ].s0.c0), mu); + } + j++; + } + } + } + if(j > 0) { + printf("hmm, SPI exchange failed on proc %d...\n!", g_cart_id); + } + else { + if(g_cart_id == 0) printf("# SPI exchange successfully tested\n"); + } + +#endif // SPI + return(0); +} + + +int init_dirac_halfspinor32() { + int j=0, k; + + int x, y, z, t, mu; + + NBPointer32 = (halfspinor32***) calloc(4,sizeof(halfspinor32**)); + NBPointer32_ = (halfspinor32**) calloc(16,(VOLUME+RAND)*sizeof(halfspinor32*)); + NBPointer32[0] = NBPointer32_; + NBPointer32[1] = NBPointer32_ + (8*(VOLUME+RAND)/2); + NBPointer32[2] = NBPointer32_ + (16*(VOLUME+RAND)/2); + NBPointer32[3] = NBPointer32_ + (24*(VOLUME+RAND)/2); + + if((void*)(HalfSpinor32_ = (halfspinor32*)calloc(4*(VOLUME)+1, sizeof(halfspinor32))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(-1); + } + + HalfSpinor32 = (halfspinor32*)(((unsigned long int)(HalfSpinor32_)+ALIGN_BASE)&~ALIGN_BASE); + +#ifdef MPI + //re-use memory from 64Bit version + sendBuffer32 = (halfspinor32*)sendBuffer; + recvBuffer32 = (halfspinor32*)recvBuffer; +#endif + + for(int ieo = 0; ieo < 2; ieo++) { + for(int i = 0; i < VOLUME/2; i++) { + j = g_eo2lexic[i + ((ieo+1)%2)*(VOLUME+RAND)/2]; + /* get (t,x,y,z) from j */ + t = j/(LX*LY*LZ); + x = (j-t*(LX*LY*LZ))/(LY*LZ); + y = (j-t*(LX*LY*LZ)-x*(LY*LZ))/(LZ); + z = (j-t*(LX*LY*LZ)-x*(LY*LZ) - y*LZ); + for(mu = 0; mu < 4; mu++) { + NBPointer32[ieo][8*i + 2*mu + 0] = &HalfSpinor32[ 8*g_lexic2eosub[ g_idn[j][mu] ] + 2*mu + 0]; + NBPointer32[ieo][8*i + 2*mu + 1] = &HalfSpinor32[ 8*g_lexic2eosub[ g_iup[j][mu] ] + 2*mu + 1]; + } +#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(t == 0) { + k = (g_lexic2eosub[g_idn[j][0]] - VOLUME/2); + 
NBPointer32[ieo][8*i] = &sendBuffer32[ k ]; + } + if(t == T-1) { + k = (g_lexic2eosub[g_iup[j][0]] - VOLUME/2); + NBPointer32[ieo][8*i + 1] = &sendBuffer32[ k ]; + } +#endif +#if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(x == 0) { + k = (g_lexic2eosub[g_idn[j][1]] - VOLUME/2); + NBPointer32[ieo][8*i + 2] = &sendBuffer32[ k ]; + } + if(x == LX-1) { + k = (g_lexic2eosub[g_iup[j][1]] - VOLUME/2); + NBPointer32[ieo][8*i + 3] = &sendBuffer32[ k ]; + } +#endif +#if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(y == 0) { + k = (g_lexic2eosub[g_idn[j][2]] - VOLUME/2); + NBPointer32[ieo][8*i + 4] = &sendBuffer32[ k ]; + } + if(y == LY-1) { + k = (g_lexic2eosub[g_iup[j][2]] - VOLUME/2); + NBPointer32[ieo][8*i + 5] = &sendBuffer32[ k ]; + } +#endif +#if ((defined PARALLELXYZ) || (defined PARALLELXYZT)) + if(z == 0) { + k = (g_lexic2eosub[g_idn[j][3]] - VOLUME/2); + NBPointer32[ieo][8*i + 6] = &sendBuffer32[ k ]; + } + if(z == LZ-1) { + k = (g_lexic2eosub[g_iup[j][3]] - VOLUME/2); + NBPointer32[ieo][8*i + 7] = &sendBuffer32[ k ]; + } +#endif + } + for(int i = VOLUME/2; i < (VOLUME+RAND)/2; i++) { + for(int mu = 0; mu < 8; mu++) { + NBPointer32[ieo][8*i + mu] = NBPointer32[ieo][0]; + } + } + } + for(int ieo = 2; ieo < 4; ieo++) { + for(int i = 0; i < VOLUME/2; i++) { + j = g_eo2lexic[i + ((ieo+0)%2)*(VOLUME+RAND)/2]; + /* get (t,x,y,z) from j */ + t = j/(LX*LY*LZ); + x = (j-t*(LX*LY*LZ))/(LY*LZ); + y = (j-t*(LX*LY*LZ)-x*(LY*LZ))/(LZ); + z = (j-t*(LX*LY*LZ)-x*(LY*LZ) - y*LZ); + for(mu = 0; mu < 8; mu++) { + NBPointer32[ieo][8*i + mu] = &HalfSpinor32[8*i + mu]; + } +#if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(t == T-1) { + NBPointer32[ieo][8*i] = &recvBuffer32[ (g_lexic2eosub[ g_iup[j][0] ] - VOLUME/2)]; + } + if(t == 0) { + NBPointer32[ieo][8*i + 1] = 
&recvBuffer32[ (g_lexic2eosub[ g_idn[j][0] ] - VOLUME/2)]; + } +#endif +#if ((defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(x == LX-1) { + NBPointer32[ieo][8*i + 2] = &recvBuffer32[ (g_lexic2eosub[ g_iup[j][1] ] - VOLUME/2)]; + } + if(x == 0) { + NBPointer32[ieo][8*i + 3] = &recvBuffer32[ (g_lexic2eosub[ g_idn[j][1] ] - VOLUME/2)]; + } +#endif +#if ((defined PARALLELXY) || (defined PARALLELXYZ) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(y == LY-1) { + NBPointer32[ieo][8*i + 4] = &recvBuffer32[ (g_lexic2eosub[ g_iup[j][2] ] - VOLUME/2)]; + } + if(y == 0) { + NBPointer32[ieo][8*i + 5] = &recvBuffer32[ (g_lexic2eosub[ g_idn[j][2] ] - VOLUME/2)]; + } +#endif +#if ((defined PARALLELXYZ) || (defined PARALLELXYZT)) + if(z == LZ-1) { + NBPointer32[ieo][8*i + 6] = &recvBuffer32[ (g_lexic2eosub[ g_iup[j][3] ] - VOLUME/2)]; + } + if(z == 0) { + NBPointer32[ieo][8*i + 7] = &recvBuffer32[ (g_lexic2eosub[ g_idn[j][3] ] - VOLUME/2)]; + } +#endif + } + for(int i = VOLUME/2; i < (VOLUME+RAND)/2; i++) { + for(int mu = 0; mu < 8; mu++) { + NBPointer32[ieo][8*i + mu] = NBPointer32[ieo][0]; + } + } + } +#if (defined SPI && defined MPI) + // here comes the SPI initialisation + uint64_t messageSizes[NUM_DIRS]; + uint64_t roffsets[NUM_DIRS], soffsets[NUM_DIRS]; + + uint64_t tMS = 0; + for(unsigned int i = 0; i < spi_num_dirs; i++) { + // message sizes in Bytes + if(i == 0 || i == 1) messageSizes[i] = LX*LY*LZ*6*sizeof(float); + else if(i == 2 || i == 3) messageSizes[i] = T*LY*LZ*6*sizeof(float); + else if(i == 4 || i == 5) messageSizes[i] = T*LX*LZ*6*sizeof(float); + else if(i == 6 || i == 7) messageSizes[i] = T*LX*LY*6*sizeof(float); + + soffsets[i] = tMS; + tMS += messageSizes[i]; + } + for(unsigned int i = 0; i < spi_num_dirs; i++) { + // forward here is backward on the right neighbour + // and the other way around... 
+ if(i%2 == 0) { + roffsets[i] = soffsets[i] + messageSizes[i]; + } + else { + roffsets[i] = soffsets[i] - messageSizes[i-1]; + } + } + + // Create descriptors + // Injection Direct Put Descriptor, one for each neighbour + SPIDescriptors32 = + ( MUHWI_Descriptor_t *)(((uint64_t)SPIDescriptorsMemory32+64)&~(64-1)); + create_descriptors(SPIDescriptors32, messageSizes, soffsets, roffsets, spi_num_dirs); + + // test communication + for(unsigned int i = 0; i < RAND/2; i++) { + sendBuffer32[i].s0.c0 = (float)g_cart_id; + sendBuffer32[i].s0.c1 = (float)g_cart_id; + sendBuffer32[i].s0.c2 = (float)g_cart_id; + sendBuffer32[i].s1.c0 = (float)g_cart_id; + sendBuffer32[i].s1.c1 = (float)g_cart_id; + sendBuffer32[i].s1.c2 = (float)g_cart_id; + } + + // Initialize the barrier, resetting the hardware. + int rc = MUSPI_GIBarrierInit ( &GIBarrier, 0 /*comm world class route */); + if(rc) { + printf("MUSPI_GIBarrierInit returned rc = %d\n", rc); + exit(__LINE__); + } + // reset the recv counter, note the division by 2, totalMessageSize has been set in init_dirac_halfspinor + // which must be called first! 
+ recvCounter = totalMessageSize/2; + global_barrier(); // make sure everybody is set recv counter + + // could do communication with multiple threads + //#pragma omp for nowait + for (unsigned int j = 0; j < spi_num_dirs; j++) { + descCount[ j ] = + msg_InjFifoInject ( injFifoHandle, + j, + &SPIDescriptors32[j]); + } + // wait for receive completion + while ( recvCounter > 0 ); + + _bgq_msync(); + + j = 0; + for(unsigned int i = 0; i < spi_num_dirs; i++) { + if(i == 0) k = g_nb_t_up; + if(i == 1) k = g_nb_t_dn; + if(i == 2) k = g_nb_x_up; + if(i == 3) k = g_nb_x_dn; + if(i == 4) k = g_nb_y_up; + if(i == 5) k = g_nb_y_dn; + if(i == 6) k = g_nb_z_up; + if(i == 7) k = g_nb_z_dn; + for(int mu = 0; mu < messageSizes[i]/sizeof(halfspinor32); mu++) { + if(k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s0.c0) || + k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s0.c1) || + k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s0.c2) || + k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s1.c0) || + k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s1.c1) || + k != (int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s1.c2)) { + if(g_cart_id == 0) { + printf("32 Bit SPI exchange doesn't work for dir %d: %d != %d at point %d\n", + i, k ,(int)creal(recvBuffer32[ soffsets[i]/sizeof(halfspinor32) + mu ].s0.c0), mu); + } + j++; + } + } + } + if(j > 0) { + printf("hmm, SPI exchange failed for 32 Bit halfspinor on proc %d...\n!", g_cart_id); + } + else { + if(g_cart_id == 0) printf("# 32 Bit SPI exchange successfully tested\n"); + } + +#endif + return(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_dirac_halfspinor.h b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_dirac_halfspinor.h new file mode 100644 index 0000000000000000000000000000000000000000..fa4406f61cd4a86e8955c6dcf68ab279b70983c3 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_dirac_halfspinor.h @@ -0,0 +1,33 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _INIT_DIRAC_HALFSPINOR_H +#define _INIT_DIRAC_HALFSPINOR_H +/* Half-spinor work arrays, neighbour-pointer tables and communication buffers defined in init_dirac_halfspinor.c; the names ending in 32 are the single precision variants. */ +extern halfspinor * HalfSpinor ALIGN; +extern halfspinor *** NBPointer; +extern halfspinor32 * HalfSpinor32 ALIGN; +extern halfspinor32 *** NBPointer32; +extern halfspinor * ALIGN sendBuffer, * ALIGN recvBuffer; +extern halfspinor32 * ALIGN sendBuffer32, * ALIGN recvBuffer32; +/* Both functions return 0 on success; on allocation failure init_dirac_halfspinor() returns 1 and init_dirac_halfspinor32() returns -1. With MPI defined the 32 bit version reuses sendBuffer/recvBuffer, so init_dirac_halfspinor() has to be called first. */ +int init_dirac_halfspinor(); +int init_dirac_halfspinor32(); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_gauge_field.c b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_gauge_field.c new file mode 100644 index 0000000000000000000000000000000000000000..a35d050c5cdc12a9b8c876039bc71434c3297ae4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_gauge_field.c @@ -0,0 +1,295 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include "global.h"
+#include "su3.h"
+#include "sse.h"
+#include "init_gauge_field.h"
+
+/* Raw allocations. The g_* tables (presumably declared in global.h) point
+ * into these buffers, possibly shifted for alignment, so these are the
+ * pointers that must be handed back to free(). */
+su3 * gauge_field = NULL;
+su3_32 * gauge_field_32 = NULL;
+#ifdef _USE_TSPLITPAR
+su3 * gauge_field_copyt = NULL;
+su3 * gauge_field_copys = NULL;
+#else
+su3 * gauge_field_copy = NULL;
+#endif
+/* Defined unconditionally: init_gauge_field_32() and free_gauge_field_32()
+ * reference it in every configuration, including _USE_TSPLITPAR. */
+su3_32 * gauge_field_copy_32 = NULL;
+
+/*
+ * Allocate the double precision gauge field for V sites (4 su3 entries per
+ * site) and, when back == 1, the additional gauge copy whose layout depends
+ * on _USE_HALFSPINOR / _USE_TSPLITPAR.
+ *
+ * Returns 0 on success. A non-zero value identifies the failed allocation:
+ * 1 = site table, 2 = field storage, 3 = copy table(s), 4 = copy storage.
+ */
+int init_gauge_field(const int V, const int back) {
+  int i=0;
+
+#ifdef _USE_TSPLITPAR
+  g_gauge_field_copyt = NULL;
+  g_gauge_field_copys = NULL;
+#else
+  g_gauge_field_copy = NULL;
+#endif
+
+  if((void*)(g_gauge_field = (su3**)calloc(V, sizeof(su3*))) == NULL) {
+    printf ("malloc errno : %d\n",errno);
+    errno = 0;
+    return(1);
+  }
+  if((void*)(gauge_field = (su3*)calloc(4*V+1, sizeof(su3))) == NULL) {
+    printf ("malloc errno : %d\n",errno);
+    errno = 0;
+    return(2);
+  }
+#if (defined SSE || defined SSE2 || defined SSE3)
+  /* round the start address up to the next ALIGN_BASE+1 boundary */
+  g_gauge_field[0] = (su3*)(((unsigned long int)(gauge_field)+ALIGN_BASE)&~ALIGN_BASE);
+#else
+  g_gauge_field[0] = gauge_field;
+#endif
+  for(i = 1; i < V; i++){
+    g_gauge_field[i] = g_gauge_field[i-1]+4;
+  }
+
+# if defined _USE_HALFSPINOR
+  if(back == 1) {
+    /*
+      g_gauge_field_copy[ieo][PM][sites/2][mu]
+    */
+    if((void*)(g_gauge_field_copy = (su3***)calloc(2, sizeof(su3**))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(3);
+    }
+    if((void*)(g_gauge_field_copy[0] = (su3**)calloc(VOLUME, sizeof(su3*))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(3);
+    }
+    /* second half of the same table, no separate allocation */
+    g_gauge_field_copy[1] = g_gauge_field_copy[0] + (VOLUME)/2;
+    if((void*)(gauge_field_copy = (su3*)calloc(4*(VOLUME)+1, sizeof(su3))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(4);
+    }
+#  if (defined SSE || defined SSE2 || defined SSE3)
+    g_gauge_field_copy[0][0] = (su3*)(((unsigned long int)(gauge_field_copy)+ALIGN_BASE)&~ALIGN_BASE);
+#  else
+    g_gauge_field_copy[0][0] = gauge_field_copy;
+#  endif
+    for(i = 1; i < (VOLUME)/2; i++) {
+      g_gauge_field_copy[0][i] = g_gauge_field_copy[0][i-1]+4;
+    }
+    g_gauge_field_copy[1][0] = g_gauge_field_copy[0][0] + 2*VOLUME;
+    for(i = 1; i < (VOLUME)/2; i++) {
+      g_gauge_field_copy[1][i] = g_gauge_field_copy[1][i-1]+4;
+    }
+  }
+# elif defined _USE_TSPLITPAR
+  if(back == 1) {
+    if((void*)(g_gauge_field_copyt = (su3**)calloc((VOLUME+RAND), sizeof(su3*))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(3);
+    }
+    if((void*)(g_gauge_field_copys = (su3**)calloc((VOLUME+RAND), sizeof(su3*))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(3);
+    }
+    if((void*)(gauge_field_copyt = (su3*)calloc(2*(VOLUME+RAND)+1, sizeof(su3))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(4);
+    }
+    if((void*)(gauge_field_copys = (su3*)calloc(6*(VOLUME+RAND)+1, sizeof(su3))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(4);
+    }
+#  if (defined SSE || defined SSE2 || defined SSE3)
+    g_gauge_field_copyt[0] = (su3*)(((unsigned long int)(gauge_field_copyt)+ALIGN_BASE)&~ALIGN_BASE);
+    g_gauge_field_copys[0] = (su3*)(((unsigned long int)(gauge_field_copys)+ALIGN_BASE)&~ALIGN_BASE);
+#  else
+    g_gauge_field_copyt[0] = gauge_field_copyt;
+    g_gauge_field_copys[0] = gauge_field_copys;
+#  endif
+    for(i = 1; i < (VOLUME+RAND); i++) {
+      g_gauge_field_copyt[i] = g_gauge_field_copyt[i-1]+2;
+      g_gauge_field_copys[i] = g_gauge_field_copys[i-1]+6;
+    }
+  }
+# else /* than _USE_HALFSPINOR or _USE_TSPLITPAR */
+  if(back == 1) {
+    if((void*)(g_gauge_field_copy = (su3**)calloc((VOLUME+RAND), sizeof(su3*))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(3);
+    }
+    if((void*)(gauge_field_copy = (su3*)calloc(8*(VOLUME+RAND)+1, sizeof(su3))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(4);
+    }
+#  if (defined SSE || defined SSE2 || defined SSE3)
+    g_gauge_field_copy[0] = (su3*)(((unsigned long int)(gauge_field_copy)+ALIGN_BASE)&~ALIGN_BASE);
+#  else
+    g_gauge_field_copy[0] = gauge_field_copy;
+#  endif
+    for(i = 1; i < (VOLUME+RAND); i++) {
+      g_gauge_field_copy[i] = g_gauge_field_copy[i-1]+8;
+    }
+  }
+# endif
+  g_update_gauge_copy = 1;
+  return(0);
+}
+
+/*
+ * Release everything init_gauge_field() allocated: the raw buffers and the
+ * pointer tables that index them. All pointers are reset to NULL, so calling
+ * this twice, or without a prior init, is harmless.
+ */
+void free_gauge_field() {
+  free(gauge_field);
+  gauge_field = NULL;
+  free(g_gauge_field);
+  g_gauge_field = NULL;
+# if defined _USE_TSPLITPAR
+  free(gauge_field_copys);
+  gauge_field_copys = NULL;
+  free(gauge_field_copyt);
+  gauge_field_copyt = NULL;
+  free(g_gauge_field_copys);
+  g_gauge_field_copys = NULL;
+  free(g_gauge_field_copyt);
+  g_gauge_field_copyt = NULL;
+# else
+  free(gauge_field_copy);
+  gauge_field_copy = NULL;
+#  if defined _USE_HALFSPINOR
+  /* [1] aliases the second half of [0], so only [0] is a real allocation */
+  if(g_gauge_field_copy != NULL) free(g_gauge_field_copy[0]);
+#  endif
+  free(g_gauge_field_copy);
+  g_gauge_field_copy = NULL;
+# endif
+}
+
+
+
+/*
+ * Single precision counterpart of init_gauge_field(). The start addresses
+ * are always rounded up using ALIGN_BASE32, independent of the SSE macros.
+ *
+ * Returns 0 on success, otherwise 1..4 with the same meaning as in
+ * init_gauge_field().
+ */
+int init_gauge_field_32(const int V, const int back) {
+  int i=0;
+
+  g_gauge_field_copy_32 = NULL;
+
+
+  if((void*)(g_gauge_field_32 = (su3_32**)calloc(V, sizeof(su3_32*))) == NULL) {
+    printf ("malloc errno : %d\n",errno);
+    errno = 0;
+    return(1);
+  }
+  if((void*)(gauge_field_32 = (su3_32*)calloc(4*V+1, sizeof(su3_32))) == NULL) {
+    printf ("malloc errno : %d\n",errno);
+    errno = 0;
+    return(2);
+  }
+
+  /*doing alignment no matter what*/
+  g_gauge_field_32[0] = (su3_32*)(((unsigned long int)(gauge_field_32)+ALIGN_BASE32)&~ALIGN_BASE32);
+
+  for(i = 1; i < V; i++){
+    g_gauge_field_32[i] = g_gauge_field_32[i-1]+4;
+  }
+
+# if defined _USE_HALFSPINOR
+  if(back == 1) {
+    /*
+      g_gauge_field_copy[ieo][PM][sites/2][mu]
+    */
+    if((void*)(g_gauge_field_copy_32 = (su3_32***)calloc(2, sizeof(su3_32**))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(3);
+    }
+    if((void*)(g_gauge_field_copy_32[0] = (su3_32**)calloc(VOLUME, sizeof(su3_32*))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(3);
+    }
+    g_gauge_field_copy_32[1] = g_gauge_field_copy_32[0] + (VOLUME)/2;
+    if((void*)(gauge_field_copy_32 = (su3_32*)calloc(4*(VOLUME)+1, sizeof(su3_32))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(4);
+    }
+    /* doing alignment no matter what */
+    g_gauge_field_copy_32[0][0] = (su3_32*)(((unsigned long int)(gauge_field_copy_32)+ALIGN_BASE32)&~ALIGN_BASE32);
+
+    for(i = 1; i < (VOLUME)/2; i++) {
+      g_gauge_field_copy_32[0][i] = g_gauge_field_copy_32[0][i-1]+4;
+    }
+    g_gauge_field_copy_32[1][0] = g_gauge_field_copy_32[0][0] + 2*VOLUME;
+    for(i = 1; i < (VOLUME)/2; i++) {
+      g_gauge_field_copy_32[1][i] = g_gauge_field_copy_32[1][i-1]+4;
+    }
+  }
+# else /* than _USE_HALFSPINOR */
+  if(back == 1) {
+    if((void*)(g_gauge_field_copy_32 = (su3_32**)calloc((VOLUME+RAND), sizeof(su3_32*))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(3);
+    }
+    if((void*)(gauge_field_copy_32 = (su3_32*)calloc(8*(VOLUME+RAND)+1, sizeof(su3_32))) == NULL) {
+      printf ("malloc errno : %d\n",errno);
+      errno = 0;
+      return(4);
+    }
+
+    /* doing alignment no matter what */
+    g_gauge_field_copy_32[0] = (su3_32*)(((unsigned long int)(gauge_field_copy_32)+ALIGN_BASE32)&~ALIGN_BASE32);
+
+    for(i = 1; i < (VOLUME+RAND); i++) {
+      g_gauge_field_copy_32[i] = g_gauge_field_copy_32[i-1]+8;
+    }
+  }
+# endif
+  g_update_gauge_copy_32 = 1;
+  return(0);
+}
+
+/*
+ * Release everything init_gauge_field_32() allocated, including the pointer
+ * tables, and reset the pointers to NULL.
+ */
+void free_gauge_field_32() {
+  free(gauge_field_32);
+  gauge_field_32 = NULL;
+  free(g_gauge_field_32);
+  g_gauge_field_32 = NULL;
+  free(gauge_field_copy_32);
+  gauge_field_copy_32 = NULL;
+# if defined _USE_HALFSPINOR
+  /* [1] aliases the second half of [0], so only [0] is a real allocation */
+  if(g_gauge_field_copy_32 != NULL) free(g_gauge_field_copy_32[0]);
+# endif
+  free(g_gauge_field_copy_32);
+  g_gauge_field_copy_32 = NULL;
+}
+
+
+/*
+ * Copy the double precision field gf into the single precision field gf32,
+ * narrowing each of the nine matrix entries for all 4 entries of V sites.
+ */
+void convert_32_gauge_field( su3_32** gf32, su3** gf, int V){
+  int i,mu;
+  for(i = 0; i < V; i++) {
+    for(mu =0; mu<4; mu++){
+      gf32[i][mu].c00 = (_Complex float) gf[i][mu].c00;
+      gf32[i][mu].c01 = (_Complex float) gf[i][mu].c01;
+      gf32[i][mu].c02 = (_Complex float) gf[i][mu].c02;
+
+      gf32[i][mu].c10 = (_Complex float) gf[i][mu].c10;
+      gf32[i][mu].c11 = (_Complex float) gf[i][mu].c11;
+      gf32[i][mu].c12 = (_Complex float) gf[i][mu].c12;
+
+      gf32[i][mu].c20 = (_Complex float) gf[i][mu].c20;
+      gf32[i][mu].c21 = (_Complex float) gf[i][mu].c21;
+      gf32[i][mu].c22 = (_Complex float) gf[i][mu].c22;
+    }
+  }
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_gauge_field.h b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_gauge_field.h
new file mode 100644
index 0000000000000000000000000000000000000000..8245dfca25e80ddb43a2b59c089c9665df5e842e
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_gauge_field.h
@@ -0,0 +1,29 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/ +#ifndef _INIT_GAUGE_FIELD_H +#define _INIT_GAUGE_FIELD_H + +int init_gauge_field(const int V, const int back); +void free_gauge_field(); +int init_gauge_field_32(const int V, const int back); +void free_gauge_field_32(); + +void convert_32_gauge_field( su3_32** gf32, su3** gf, int V); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_gauge_tmp.c b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_gauge_tmp.c new file mode 100644 index 0000000000000000000000000000000000000000..73a935ea74c1e5bd9a151fc8990dbdc6a48d1d50 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_gauge_tmp.c @@ -0,0 +1,61 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include "global.h"
+#include "su3.h"
+#include "sse.h"
+#include "init_gauge_tmp.h"
+
+/* Raw buffer as returned by calloc(); this is what free_gauge_tmp() releases,
+ * because gauge_tmp[0] may have been shifted for alignment. */
+su3 * gauge_tmp_ = NULL;
+/* Per-site table: gauge_tmp[i] addresses the 4 su3 entries of site i. */
+su3 ** gauge_tmp = NULL;
+
+/*
+ * Allocate the temporary gauge field for V sites, 4 su3 entries per site.
+ *
+ * Returns 0 on success and 1 if either allocation fails. On failure nothing
+ * stays allocated and both globals are NULL.
+ */
+int init_gauge_tmp(const int V) {
+  int i=0;
+
+  if((void*)(gauge_tmp = (su3**)calloc(V, sizeof(su3*))) == NULL) {
+    fprintf(stderr, "malloc errno : %d\n", errno);
+    errno = 0;
+    return(1);
+  }
+  /* element count computed in size_t so 4*V cannot overflow int; the extra
+   * element is presumably slack for the alignment round-up below */
+  if((void*)(gauge_tmp_ = (su3*)calloc((size_t)4*V+1, sizeof(su3))) == NULL) {
+    fprintf(stderr, "malloc errno : %d\n", errno);
+    /* do not leak the table allocated above or leave it dangling */
+    free(gauge_tmp);
+    gauge_tmp = NULL;
+    errno = 0;
+    return(1);
+  }
+#if (defined SSE || defined SSE2 || defined SSE3)
+  /* round the start address up to the next ALIGN_BASE+1 boundary */
+  gauge_tmp[0] = (su3*)(((unsigned long int)(gauge_tmp_)+ALIGN_BASE)&~ALIGN_BASE);
+#else
+  gauge_tmp[0] = gauge_tmp_;
+#endif
+  for(i = 1; i < V; i++){
+    gauge_tmp[i] = gauge_tmp[i-1]+4;
+  }
+  return(0);
+}
+
+/*
+ * Release the buffers allocated by init_gauge_tmp() and reset the globals,
+ * so a repeated call is harmless.
+ */
+void free_gauge_tmp() {
+  free(gauge_tmp_);
+  gauge_tmp_ = NULL;
+  free(gauge_tmp);
+  gauge_tmp = NULL;
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_gauge_tmp.h b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_gauge_tmp.h
new file mode 100644
index 0000000000000000000000000000000000000000..846d486682a6d6fc7ee6fbb7665f600db2ae405f
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_gauge_tmp.h
@@ -0,0 +1,27 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _INIT_GAUGE_TMP_H +#define _INIT_GAUGE_TMP_H + +extern su3 ** gauge_tmp; + +int init_gauge_tmp(const int V); +void free_gauge_tmp(); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_geometry_indices.c b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_geometry_indices.c new file mode 100644 index 0000000000000000000000000000000000000000..ec6e6fdf9621f52c30450999eefd35f0d888add9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_geometry_indices.c @@ -0,0 +1,185 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "init_geometry_indices.h" + +int *iup_KD = NULL, *idn_KD = NULL, *ipt_KD = NULL, **ipt_ = NULL, ***ipt__ = NULL; + +int init_geometry_indices(const int V) { + int i = 0; + + g_idn= (int**)calloc(V, sizeof(int*)); + if((void*)g_idn == NULL) return(1); + g_iup = (int**)calloc(V, sizeof(int*)); + if((void*)g_iup == NULL) return(2); + + idn_KD = (int*)calloc(4*V, sizeof(int)); + if((void*)idn_KD == NULL ) return(6); + iup_KD = (int*)calloc(4*V, sizeof(int)); + if((void*)iup_KD == NULL) return(7); + + g_ipt = (int****)calloc(T+4,sizeof(int*)); + if((void*)g_ipt == NULL) return(5); + ipt__ = (int***)calloc ((T+4)*(LX+4), sizeof(int*)); + if((void*)ipt__ == NULL) return(4); + ipt_ = (int**)calloc((T+4)*(LX+4)*(LY+4), sizeof(int*)); + if((void*)ipt_ == NULL) return(3); + ipt_KD = (int*)calloc((T+4)*(LX+4)*(LY+4)*(LZ+4), sizeof(int)); + if((void*)ipt_KD == NULL) return(8); + + g_lexic2eo = (int*)calloc(V, sizeof(int)); + if((void*)g_lexic2eo == NULL) return(9); + /* this +2 is for sanity reasons */ + g_lexic2eosub = (int*)calloc(V+2, sizeof(int)); + if((void*)g_lexic2eosub == NULL) return(10); + g_eo2lexic = (int*)calloc(V, sizeof(int)); + if((void*)g_eo2lexic == NULL) return(11); + +#if ( defined PARALLELXYZT || defined PARALLELXYZ ) + g_field_z_ipt_even = (int*)calloc(T*LX*LY, sizeof(int)); + if((void*)g_field_z_ipt_even == NULL) return(12); + g_field_z_ipt_odd = (int*)calloc(T*LX*LY, sizeof(int)); + if((void*)g_field_z_ipt_odd == NULL) return(13); + + g_field_z_disp_even_dn = (int*)calloc(T*LX*LY/2, sizeof(int)); + if((void*)g_field_z_disp_even_dn == NULL) return(14); + g_field_z_disp_even_up = (int*)calloc(T*LX*LY/2, sizeof(int)); + if((void*)g_field_z_disp_even_up == NULL) return(15); + g_field_z_disp_odd_dn = (int*)calloc(T*LX*LY/2, sizeof(int)); + if((void*)g_field_z_disp_odd_dn == NULL) 
return(16); + g_field_z_disp_odd_up = (int*)calloc(T*LX*LY/2, sizeof(int)); + if((void*)g_field_z_disp_odd_up == NULL) return(17); +#endif + +#ifdef _USE_TSPLITPAR + g_1st_eot= (int**)calloc(T, sizeof(int*)); + if((void*)g_1st_eot == NULL) return(18); + for(i=0;i. + ***********************************************************************/ +#ifndef _INIT_GEOMETRY_INDICES_H +#define _INIT_GEOMETRY_INDICES_H + +int init_geometry_indices(const int N); +void free_geometry_indices(); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_jacobi_field.c b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_jacobi_field.c new file mode 100644 index 0000000000000000000000000000000000000000..52b4f1010f8a28e74cf23a217507529f9433bb62 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_jacobi_field.c @@ -0,0 +1,103 @@ +/*********************************************************************** + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +/* + * routine for the initialization of the jocobi field (for use in LapH_ev) + * Authors Luigi Scorzato, Marco Cristoforetti + * + * + *******************************************************************************/ +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "start.h" +#include "xchange/xchange.h" +#include "init_jacobi_field.h" + +#ifdef WITHLAPH + +su3_vector *jacobi_field = NULL; + +int init_jacobi_field(const int V, const int nr) +{ +int i=0; + + if((void*)(jacobi_field = (su3_vector*)calloc(nr*V+1, sizeof(su3_vector))) == NULL) + { + printf("malloc errno : %d\n",errno); + errno = 0; + return(1); + } + if((void*)(g_jacobi_field = (su3_vector**)malloc(nr*sizeof(su3_vector*))) == NULL) + { + printf("malloc errno : %d\n",errno); + errno = 0; + return(2); + } + + g_jacobi_field[0] = jacobi_field; + for(i=1; ic0 = v[0] + v[1] * I; + s->c1 = v[2] + v[3] * I; + s->c2 = v[4] + v[5] * I; + } +#ifdef MPI + xchange_jacobi(k); +#endif +} + +void random_jacobi_field(su3_vector * const k, const int V) +{ +int ix; +su3_vector *s; +double v[6]; + + for (ix=0; ix. 
+ ***********************************************************************/ +/* + * routine for the initialization of the jocobi field (for use in LapH_ev) + * Authors Luigi Scorzato, Marco Cristoforetti + * + * + *******************************************************************************/ +#ifndef _INIT_JACOBI_FIELD_H +#define _INIT_JACOBI_FIELD_H + +# ifdef WITHLAPH +int init_jacobi_field(const int V, const int nr); +void free_jacobi_field(); +void random_gauss_jacobi_field(su3_vector * const k, const int V); +void random_jacobi_field(su3_vector * const k, const int V); +# endif +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_moment_field.c b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_moment_field.c new file mode 100644 index 0000000000000000000000000000000000000000..876db4f9fc536cf3497f7ce144dd56fd4c18250c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_moment_field.c @@ -0,0 +1,105 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include "global.h"
+#include "su3.h"
+#include "su3adj.h"
+#include "sse.h"
+
+/* Raw buffers behind the moment, df0 and ddummy tables; these are the
+ * pointers calloc() returned and therefore the ones passed to free(). */
+su3adj * mo=NULL, *df=NULL, *du=NULL;
+
+/*
+ * Allocate three su3adj fields with 4 entries per site:
+ *   moment  - V sites
+ *   df0     - VR sites
+ *   ddummy  - VR sites
+ *
+ * Returns 0 on success. A non-zero value identifies the failed allocation
+ * (odd = field storage, even = pointer table; 1/2 moment, 3/4 df0,
+ * 5/6 ddummy).
+ */
+int init_moment_field(const int V, const int VR) {
+  int i = 0;
+
+/* posix_memalign(void **memptr, size_t alignment, size_t size) */
+  if((void*)(mo = (su3adj*)calloc(4*V+1, sizeof(su3adj))) == NULL){
+    printf ("malloc errno : %d\n",errno);
+    errno = 0;
+    return(1);
+  }
+  if((void*)(moment = (su3adj**)calloc(V,sizeof(su3adj*))) == NULL) {
+    printf ("malloc errno : %d\n",errno);
+    errno = 0;
+    return(2);
+  }
+#if ( defined SSE || defined SSE2 || defined SSE3)
+  /* round the start address up to the next ALIGN_BASE+1 boundary */
+  moment[0] = (su3adj*)(((unsigned long int)(mo)+ALIGN_BASE)&~ALIGN_BASE);
+#else
+  moment[0] = mo;
+#endif
+
+  for(i = 1; i < V; i++){
+    moment[i] = moment[i-1]+4;
+  }
+
+  if((void*)(df = (su3adj*)calloc(4*VR+1, sizeof(su3adj))) == NULL) {
+    printf ("malloc errno : %d\n",errno);
+    errno = 0;
+    return(3);
+  }
+  if((void*)(df0 = (su3adj**)calloc(VR,sizeof(su3adj*))) == NULL) {
+    printf ("malloc errno : %d\n",errno);
+    errno = 0;
+    return(4);
+  }
+#if ( defined SSE || defined SSE2 || defined SSE3)
+  df0[0] = (su3adj*)(((unsigned long int)(df)+ALIGN_BASE)&~ALIGN_BASE);
+#else
+  df0[0] = df;
+#endif
+
+  for(i = 1; i < VR; i++) {
+    df0[i] = df0[i-1]+4;
+  }
+
+  if((void*)(du = (su3adj*)calloc(4*VR+1, sizeof(su3adj))) == NULL) {
+    printf ("malloc errno : %d\n",errno);
+    errno = 0;
+    return(5);
+  }
+  if((void*)(ddummy = (su3adj**)calloc(VR,sizeof(su3adj*))) == NULL) {
+    printf ("malloc errno : %d\n",errno);
+    errno = 0;
+    return(6);
+  }
+#if ( defined SSE || defined SSE2 || defined SSE3)
+  ddummy[0] = (su3adj*)(((unsigned long int)(du)+ALIGN_BASE)&~ALIGN_BASE);
+#else
+  ddummy[0] = du;
+#endif
+
+  for(i = 1; i < VR; i++){
+    ddummy[i] = ddummy[i-1]+4;
+  }
+
+  return(0);
+}
+
+/*
+ * Release the field storage and the pointer tables allocated by
+ * init_moment_field(), then reset all six pointers so that a repeated call
+ * (or a call after a partially failed init) is harmless.
+ */
+void free_moment_field() {
+
+  free(mo);
+  mo = NULL;
+  free(df);
+  df = NULL;
+  free(du);
+  du = NULL;
+  /* the tables were calloc'ed as well and used to be leaked */
+  free(moment);
+  moment = NULL;
+  free(df0);
+  df0 = NULL;
+  free(ddummy);
+  ddummy = NULL;
+}
diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_moment_field.h b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_moment_field.h new file mode 100644 index 0000000000000000000000000000000000000000..81ec999db1bf91e8112bdbaa77ee5c4da071e825 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_moment_field.h @@ -0,0 +1,25 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _INIT_MOMENT_FIELD_H +#define _INIT_MOMENT_FIELD_H + +int init_moment_field(const int V, const int VR); +void free_moment_field(); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_omp_accumulators.c b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_omp_accumulators.c new file mode 100644 index 0000000000000000000000000000000000000000..47d733a46dcbc5f65806a269fa2047ecce2f4411 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_omp_accumulators.c @@ -0,0 +1,53 @@ +/*********************************************************************** + * Copyright (C) 2012 Bartosz Kostrzewa + * + * This file is part of tmLQCD. 
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+#ifdef OMP
+#include <omp.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include "global.h"
+#include "init_omp_accumulators.h"
+
+/*
+ * Allocate the two accumulator arrays g_omp_acc_cp (_Complex double) and
+ * g_omp_acc_re (double) with num elements each.
+ *
+ * Returns 0 on success, 1 if the complex array and 2 if the real array could
+ * not be allocated. On failure nothing stays allocated and both globals are
+ * NULL.
+ */
+int init_omp_accumulators(const int num) {
+  g_omp_acc_cp=NULL;
+  g_omp_acc_re=NULL;
+
+  if((void*)(g_omp_acc_cp = (_Complex double*)malloc(num*sizeof(_Complex double))) == NULL) {
+    printf ("init_omp_accumulators malloc errno : %d\n",errno);
+    errno = 0;
+    return(1);
+  }
+  if((void*)(g_omp_acc_re = (double*)malloc(num*sizeof(double))) == NULL) {
+    printf ("init_omp_accumulators malloc errno : %d\n",errno);
+    /* do not leak the complex array allocated above or leave it dangling */
+    free(g_omp_acc_cp);
+    g_omp_acc_cp = NULL;
+    errno = 0;
+    return(2);
+  }
+
+  return(0);
+}
+
+/*
+ * Release both accumulator arrays and reset the globals, so a repeated call
+ * is harmless.
+ */
+void free_omp_accumulators() {
+  free(g_omp_acc_cp);
+  g_omp_acc_cp = NULL;
+  free(g_omp_acc_re);
+  g_omp_acc_re = NULL;
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_omp_accumulators.h b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_omp_accumulators.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a6d6ca7502b3f601dc4c437d9abcfc2bf998280
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_omp_accumulators.h
@@ -0,0 +1,37 @@
+/***********************************************************************
+ * Copyright (C) 2012 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +/* The two arrays + * + * g_omp_acc_re + * g_omp_acc_cp + * + * have as many elements as there are threads (set by ompnumthreads input parameter, + * stored in omp_num_threads configuration variable). They are initialiazed + * upon program launch and serve to hold thread-local values over the boundaries + * of parallel sections, such as for Kahan summations. _re is of type + * "double" while _cp is of type "_Complex double". They are declared in global.h */ + +#ifndef _INIT_OMP_ACCUMULATORS_H +#define _INIT_OMP_ACCUMULATORS_H + +int init_omp_accumulators(int num); +void free_omp_accumulators(); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_openmp.c b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_openmp.c new file mode 100644 index 0000000000000000000000000000000000000000..2712010e77fc70c908371b3d8322bb8f2b3cf3e3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_openmp.c @@ -0,0 +1,53 @@ +/*********************************************************************** + * Copyright (C) 2013 Bartosz Kostrzewa + * + * This file is part of tmLQCD. 
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+#ifdef OMP
+#include <omp.h>
+#include "init_omp_accumulators.h"
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <errno.h>
+#include "global.h"
+/* own prototype, so the compiler checks the definition against it */
+#include "init_openmp.h"
+
+/*
+ * Configure the OpenMP runtime from omp_num_threads and allocate the
+ * accumulator arrays. A non-positive omp_num_threads selects one thread.
+ * Without OMP this is a no-op.
+ *
+ * Returns 0 on success, otherwise the non-zero code reported by
+ * init_omp_accumulators(). The return type is int to match the prototype
+ * in init_openmp.h; the definition used to be void.
+ */
+int init_openmp(void) {
+  int status = 0;
+#ifdef OMP
+  if(omp_num_threads > 0)
+  {
+    omp_set_num_threads(omp_num_threads);
+    if( g_debug_level > 0 && g_proc_id == 0 ) {
+      printf("# Instructing OpenMP to use %d threads.\n",omp_num_threads);
+    }
+  }
+  else {
+    if( g_proc_id == 0 )
+      printf("# No value provided for OmpNumThreads, running in single-threaded mode!\n");
+
+    omp_num_threads = 1;
+    omp_set_num_threads(omp_num_threads);
+  }
+
+  /* hand allocation failures to the caller instead of dropping them */
+  status = init_omp_accumulators(omp_num_threads);
+#endif
+  return(status);
+}
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_openmp.h b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_openmp.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9c9e3d97bd717e34ba69c7c381dd15a5b489033
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_openmp.h
@@ -0,0 +1,25 @@
+/***********************************************************************
+ * Copyright (C) 2013 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _INIT_OPENMP_H +#define _INIT_OPENMP_H + +int init_openmp(void); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_spinor_field.c b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_spinor_field.c new file mode 100644 index 0000000000000000000000000000000000000000..885cba6912b4ff2f0746ec2ad454977b2412c27e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_spinor_field.c @@ -0,0 +1,266 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef _USE_SHMEM +# include +#endif +#include "global.h" +#include "su3.h" +#include "sse.h" + +#ifndef BENCHMARK +#include "monomial/monomial.h" +#endif + +spinor * sp = NULL; +spinor * sp_csg = NULL; +spinor * sp_tbuff = NULL; + +int init_spinor_field(const int V, const int nr) { + int i = 0; + +#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR)) + if((void*)(sp = (spinor*)shmalloc((nr*V+1)*sizeof(spinor))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(1); + } +#else + if((void*)(sp = (spinor*)calloc(nr*V+1, sizeof(spinor))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(1); + } +#endif + if((void*)(g_spinor_field = (spinor**)malloc(nr*sizeof(spinor*))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(2); + } +#if ( defined SSE || defined SSE2 || defined SSE3) + g_spinor_field[0] = (spinor*)(((unsigned long int)(sp)+ALIGN_BASE)&~ALIGN_BASE); +#else + g_spinor_field[0] = sp; +#endif + + for(i = 1; i < nr; i++){ + g_spinor_field[i] = g_spinor_field[i-1]+V; + } + + return(0); +} + +void free_spinor_field() { +#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR)) + shfree(sp); + shfree(sp_csg); +#else + free(sp); + free(sp_csg); +#endif +} + + +spinor32 * sp32 = NULL; +int init_spinor_field_32(const int V, const int nr) { + int i = 0; + +#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR)) + if((void*)(sp32 = (spinor32*)shmalloc((nr*V+1)*sizeof(spinor32))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(1); + } +#else + if((void*)(sp32 = (spinor32*)calloc(nr*V+1, sizeof(spinor32))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(1); + } +#endif + if((void*)(g_spinor_field32 = (spinor32**)malloc(nr*sizeof(spinor32*))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + 
return(2); + } +#if ( defined SSE || defined SSE2 || defined SSE3) + g_spinor_field32[0] = (spinor32*)(((unsigned long int)(sp32)+ALIGN_BASE32)&~ALIGN_BASE32); +#else + g_spinor_field32[0] = (spinor32*)(((unsigned long int)(sp32)+ALIGN_BASE32)&~ALIGN_BASE32); +#endif + + for(i = 1; i < nr; i++){ + g_spinor_field32[i] = g_spinor_field32[i-1]+V; + } + + return(0); +} + +void free_spinor_field_32() { +#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR)) + shfree(sp32); +#else + free(sp32); +#endif +} + + + + + + + + + + +/** + * costumized spinor allocation routines + */ +int allocate_spinor_field_array(spinor ***spinors,spinor **sp,const int V, const int nr) { + int i = 0; + +#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR)) + if((void*)((*sp) = (spinor*)shmalloc((nr*V+1)*sizeof(spinor))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(1); + } +#else + if((void*)((*sp) = (spinor*)calloc(nr*V+1, sizeof(spinor))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(1); + } +#endif + if((void*)((*spinors) = (spinor**)malloc(nr*sizeof(spinor*))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(2); + } +#if ( defined SSE || defined SSE2 || defined SSE3) + (*spinors)[0] = (spinor*)(((unsigned long int)(*sp)+ALIGN_BASE)&~ALIGN_BASE); +#else + (*spinors)[0] = *sp; +#endif + + for(i = 1; i < nr; i++){ + (*spinors)[i] = (*spinors)[i-1]+V; + } + + return(0); +} + +void free_spinor_field_array(spinor** sp) { +#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR)) + shfree(*sp); +#else + free(*sp); +#endif +} + + + +#ifndef _BENCH_ONLY +int init_csg_field(const int V) { + int i = 0, j = 0, sum = 0; + spinor * s; + for(i = 0; i < no_monomials; i++) { + sum += monomial_list[i].csg_N; + sum += monomial_list[i].csg_N2; + } + + /* if all histories are zero, we do not need initialisation */ + if(sum != 0) { +#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR)) + sp_csg = 
(spinor*)shmalloc((sum*V+1)*sizeof(spinor)); +#else + sp_csg = (spinor*)calloc(sum*V+1, sizeof(spinor)); +#endif + if(errno == ENOMEM) { + return(1); + } + for(i = 0; i < no_monomials; i++) { + monomial_list[i].csg_field = malloc((monomial_list[i].csg_N+1)*sizeof(spinor*)); + if(errno == ENOMEM) { + return(2); + } + monomial_list[i].csg_field2 = malloc(monomial_list[i].csg_N2*sizeof(spinor*)); + if(errno == ENOMEM) { + return(2); + } + } +#if ( defined SSE || defined SSE2 || defined SSE3) + s = (spinor*)(((unsigned long int)(sp_csg)+ALIGN_BASE)&~ALIGN_BASE); +#else + s = sp_csg; +#endif + for(j = 0; j < no_monomials; j++) { + if(monomial_list[j].csg_N != 0) { + for(i = 0; i < monomial_list[j].csg_N; i++) { + monomial_list[j].csg_field[i] = s; + s = s + V; + } + } + } + for(j = 0; j < no_monomials; j++) { + if(monomial_list[j].csg_N2 != 0) { + for(i = 0; i < monomial_list[j].csg_N2; i++) { + monomial_list[j].csg_field2[i] = s; + s = s + V; + } + } + } + + monomial_list[0].csg_index_array = (int*) malloc(sum*sizeof(int)); + for(i = 1; i < no_monomials; i++) { + monomial_list[i].csg_index_array = monomial_list[i-1].csg_index_array + monomial_list[i-1].csg_N; + } + monomial_list[0].csg_index_array2 = monomial_list[no_monomials-1].csg_index_array + + monomial_list[no_monomials-1].csg_N; + for(i = 1; i < no_monomials; i++) { + monomial_list[i].csg_index_array2 = monomial_list[i-1].csg_index_array2 + monomial_list[i-1].csg_N2; + } + } + return(0); +} + +#endif + +int init_timslice_buffer_field(const int t_slice) { + + if((void*)(sp_tbuff = (spinor*)calloc(t_slice+1, sizeof(spinor))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(3); + } + +#if (( defined SSE || defined SSE2 || defined SSE3) && defined _USE_TSPLITPAR ) + g_tbuff = (spinor*)(((unsigned long int)(sp_tbuff)+ALIGN_BASE)&~ALIGN_BASE); +#else + g_tbuff = sp_tbuff; +#endif + + return(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_spinor_field.h 
b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_spinor_field.h new file mode 100644 index 0000000000000000000000000000000000000000..5aad71c1a43533b29f6b69f4971c3f5a5ba8bc04 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_spinor_field.h @@ -0,0 +1,33 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/ +#ifndef _INIT_SPINOR_FIELD_H +#define _INIT_SPINOR_FIELD_H + +int init_spinor_field(const int V, const int nr); +int init_spinor_field_32(const int V, const int nr); +int init_csg_field(const int V); + +int allocate_spinor_field_array(spinor ***spinors,spinor **sp,const int V, const int nr); +void free_spinor_field_array(spinor** sp); + +void free_spinor_field(); +void free_spinor_field_32(); +int init_timslice_buffer_field(const int t_slice); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_stout_smear_vars.c b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_stout_smear_vars.c new file mode 100644 index 0000000000000000000000000000000000000000..3bbb986a9e8712b2fa2f0f991165ad7d7352d819 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_stout_smear_vars.c @@ -0,0 +1,566 @@ +/*********************************************************************** + * + * Copyright (C) 2007, 2008 Jan Volkholz, Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "sse.h" +#include "init_stout_smear_vars.h" + +su3 * gauge_field_saved; +su3 ** g_gauge_field_saved; +su3 * gauge_field_smeared; +su3 ** g_gauge_field_smeared; +su3 * C_smearing; +su3 ** g_C_smearing; +su3 * Q_smearing; +su3 ** g_Q_smearing; +su3 * Q_squared_smearing; +su3 ** g_Q_squared_smearing; +su3 * B1_smearing; +su3 ** g_B1_smearing; +su3 * B2_smearing; +su3 ** g_B2_smearing; +su3 * Gamma_smearing; +su3 ** g_Gamma_smearing; +su3 * Lambda_smearing; +su3 ** g_Lambda_smearing; + +double * g_c0_smearing; +double * g_c1_smearing; + +complex * g_f0_smearing; +complex * g_f1_smearing; +complex * g_f2_smearing; + +complex * g_b10_smearing; +complex * g_b11_smearing; +complex * g_b12_smearing; + +complex * g_b20_smearing; +complex * g_b21_smearing; +complex * g_b22_smearing; + +complex * g_r10_smearing; +complex * g_r11_smearing; +complex * g_r12_smearing; + +complex * g_r20_smearing; +complex * g_r21_smearing; +complex * g_r22_smearing; + +su3 * stout_force_field; +su3 ** g_stout_force_field; +su3 * previous_stout_force_field; +su3 ** g_previous_stout_force_field; + +/*----------------------------------------------------------------------------*/ + +int init_stout_smear_vars(const int V, const int stout_no_iter) +{ + + printf("Running init_stout_smear_vars\n"); + const int dim = 4 ; + + int i, k, x, mu; + + i = 0; + k = 0; + mu = 0; + + /* + * this is the field where we store the smeared force matrices \Sigma^{(k)}_\mu(x) + * eqtn (44) hep-lat/0311018 + */ + gauge_field_smeared = calloc(dim*V+1, sizeof(su3)); + if(errno == ENOMEM) + { + return(1); + } + + g_gauge_field_smeared = calloc(V, sizeof(su3**)); + if(errno == ENOMEM) + { + return(1); + } + +#if (defined SSE || defined SSE2 || defined SSE3) + g_gauge_field_smeared[0] = (su3*)(((unsigned long 
int)(gauge_field_smeared)+ALIGN_BASE)&~ALIGN_BASE); +#else + g_gauge_field_smeared[0] = gauge_field_smeared; +#endif + + for(x = 1; x < V; x++) + { + g_gauge_field_smeared[x] = g_gauge_field_smeared[x-1] + 4; + } + + /* + * this is the field where we store the smeared gauge_field + */ + gauge_field_saved = calloc(dim*V+1, sizeof(su3)); + if(errno == ENOMEM) + { + return(1); + } + + g_gauge_field_saved = calloc(V, sizeof(su3**)); + if(errno == ENOMEM) + { + return(1); + } + +#if (defined SSE || defined SSE2 || defined SSE3) + g_gauge_field_saved[0] = (su3*)(((unsigned long int)(gauge_field_saved)+ALIGN_BASE)&~ALIGN_BASE); +#else + g_gauge_field_saved[0] = gauge_field_saved; +#endif + + for(x = 1; x < V; x++) + { + g_gauge_field_saved[x] = g_gauge_field_saved[x-1] + 4; + } + + /* + * here we save the C matrix field from eqtn(1) in hep-lat/0311018 + */ + C_smearing = calloc(dim*V+1, sizeof(su3)); + if(errno == ENOMEM) + { + return(1); + } + + g_C_smearing = calloc(V, sizeof(su3**)); + if(errno == ENOMEM) + { + return(1); + } + +#if (defined SSE || defined SSE2 || defined SSE3) + g_C_smearing[0] = (su3*)(((unsigned long int)(C_smearing)+ALIGN_BASE)&~ALIGN_BASE); +#else + g_C_smearing[0] = C_smearing; +#endif + + for(x = 1; x < V; x++) + { + g_C_smearing[x] = g_C_smearing[x-1] + 4; + } + + /* + * here we save the Q matrix field from eqtn(2) in hep-lat/0311018 + */ + Q_smearing = calloc(dim*V+1, sizeof(su3)); + if(errno == ENOMEM) + { + return(1); + } + + g_Q_smearing = calloc(V, sizeof(su3**)); + if(errno == ENOMEM) + { + return(1); + } + +#if (defined SSE || defined SSE2 || defined SSE3) + g_Q_smearing[0] = (su3*)(((unsigned long int)(Q_smearing)+ALIGN_BASE)&~ALIGN_BASE); +#else + g_Q_smearing[0] = Q_smearing; +#endif + + for(x = 1; x < V; x++) + { + g_Q_smearing[x] = g_Q_smearing[x-1] + 4; + } + + /* + * this will hold the squared of the qbove + */ + Q_squared_smearing = calloc(dim*V+1, sizeof(su3)); + if(errno == ENOMEM) + { + return(1); + } + + g_Q_squared_smearing 
= calloc(V, sizeof(su3**)); + if(errno == ENOMEM) + { + return(1); + } + +#if (defined SSE || defined SSE2 || defined SSE3) + g_Q_squared_smearing[0] = (su3*)(((unsigned long int)(Q_squared_smearing)+ALIGN_BASE)&~ALIGN_BASE); +#else + g_Q_squared_smearing[0] = Q_squared_smearing; +#endif + + for(x = 1; x < V; x++) + { + g_Q_squared_smearing[x] = g_Q_squared_smearing[x-1] + 4; + } + + /* + * here we save the B1 and the B2 matrix field from eqtn(69) in hep-lat/0311018 + */ + B1_smearing = calloc(dim*V+1, sizeof(su3)); + if(errno == ENOMEM) + { + return(1); + } + B2_smearing = calloc(dim*V+1, sizeof(su3)); + if(errno == ENOMEM) + { + return(1); + } + + g_B1_smearing = calloc(V, sizeof(su3**)); + if(errno == ENOMEM) + { + return(1); + } + g_B2_smearing = calloc(V, sizeof(su3**)); + if(errno == ENOMEM) + { + return(1); + } + +#if (defined SSE || defined SSE2 || defined SSE3) + g_B1_smearing[0] = (su3*)(((unsigned long int)(B1_smearing)+ALIGN_BASE)&~ALIGN_BASE); + g_B2_smearing[0] = (su3*)(((unsigned long int)(B2_smearing)+ALIGN_BASE)&~ALIGN_BASE); +#else + g_B1_smearing[0] = B1_smearing; + g_B2_smearing[0] = B2_smearing; +#endif + for(x = 1; x < V; x++) + { + g_B1_smearing[x] = g_B1_smearing[x-1] + 4; + g_B2_smearing[x] = g_B2_smearing[x-1] + 4; + } + + /* + * here we hold the Gamma matrix field from eqtn(74) in hep-lat/0311018 + */ + Gamma_smearing = calloc(dim*V+1, sizeof(su3)); + if(errno == ENOMEM) + { + return(1); + } + + g_Gamma_smearing = calloc(V, sizeof(su3**)); + if(errno == ENOMEM) + { + return(1); + } + +#if (defined SSE || defined SSE2 || defined SSE3) + g_Gamma_smearing[0] = (su3*)(((unsigned long int)(Gamma_smearing)+ALIGN_BASE)&~ALIGN_BASE); +#else + g_Gamma_smearing[0] = Gamma_smearing; +#endif + + for(x = 1; x < V; x++) + { + g_Gamma_smearing[x] = g_Gamma_smearing[x-1] + 4; + } + + /* + * here we save the Lambda matrix field from eqtn(73) in hep-lat/0311018 + */ + Lambda_smearing = calloc(dim*V+1, sizeof(su3)); + if(errno == ENOMEM) + { + return(1); + 
} + + g_Lambda_smearing = calloc(V, sizeof(su3**)); + if(errno == ENOMEM) + { + return(1); + } + +#if (defined SSE || defined SSE2 || defined SSE3) + g_Lambda_smearing[0] = (su3*)(((unsigned long int)(Lambda_smearing)+ALIGN_BASE)&~ALIGN_BASE); +#else + g_Lambda_smearing[0] = Lambda_smearing; +#endif + + for(x = 1; x < V; x++) + { + g_Lambda_smearing[x] = g_Lambda_smearing[x-1] + 4; + } + + + /* + * these are the c_0 and c_1 fields from eqtns (14) and (15) in hep-lat/0311018 + */ + g_c0_smearing = calloc(V, sizeof(double)); + if(errno == ENOMEM) + { + return(1); + } + g_c1_smearing = calloc(V, sizeof(double)); + if(errno == ENOMEM) + { + return(1); + } + + /* + * these are the f0, f1 and f2 fields from eqtn(29) in hep-lat/0311018 + */ + g_f0_smearing = calloc(V, sizeof(complex)); + if(errno == ENOMEM) + { + return(1); + } + + g_f1_smearing = calloc(V, sizeof(complex)); + if(errno == ENOMEM) + { + return(1); + } + + g_f2_smearing = calloc(V, sizeof(complex)); + if(errno == ENOMEM) + { + return(1); + } + + /* + * these are the b10, b11f, b12, b20, b21 and b22 fields + * from eqtns (57) and (58) in hep-lat/0311018 + */ + g_b10_smearing = calloc(V, sizeof(complex)); + if(errno == ENOMEM) + { + return(1); + } + + g_b11_smearing = calloc(V, sizeof(complex)); + if(errno == ENOMEM) + { + return(1); + } + + g_b12_smearing = calloc(V, sizeof(complex)); + if(errno == ENOMEM) + { + return(1); + } + g_b20_smearing = calloc(V, sizeof(complex)); + if(errno == ENOMEM) + { + return(1); + } + + g_b21_smearing = calloc(V, sizeof(complex)); + if(errno == ENOMEM) + { + return(1); + } + + g_b22_smearing = calloc(V, sizeof(complex)); + if(errno == ENOMEM) + { + return(1); + } + + /* + * these are the r10, r11f, r12, r20, r21 and r22 fields + * from eqtns (57) and (58) in hep-lat/0311018 + */ + g_r10_smearing = calloc(V, sizeof(complex)); + if(errno == ENOMEM) + { + return(1); + } + + g_r11_smearing = calloc(V, sizeof(complex)); + if(errno == ENOMEM) + { + return(1); + } + + g_r12_smearing 
= calloc(V, sizeof(complex)); + if(errno == ENOMEM) + { + return(1); + } + g_r20_smearing = calloc(V, sizeof(complex)); + if(errno == ENOMEM) + { + return(1); + } + + g_r21_smearing = calloc(V, sizeof(complex)); + if(errno == ENOMEM) + { + return(1); + } + + g_r22_smearing = calloc(V, sizeof(complex)); + if(errno == ENOMEM) + { + return(1); + } + + /* + * this is the field where we store the smeared force matrices \Sigma^{(k)}_\mu(x) + * eqtn (44) hep-lat/0311018 + */ + stout_force_field = calloc(dim*V+1, sizeof(su3)); + if(errno == ENOMEM) + { + return(1); + } + + g_stout_force_field = calloc(V, sizeof(su3**)); + if(errno == ENOMEM) + { + return(1); + } + +#if (defined SSE || defined SSE2 || defined SSE3) + g_stout_force_field[0] = (su3*)(((unsigned long int)(stout_force_field)+ALIGN_BASE)&~ALIGN_BASE); +#else + g_stout_force_field[0] = stout_force_field; +#endif + + for(x = 1; x < V; x++) + { + g_stout_force_field[x] = g_stout_force_field[x-1] + 4; + } + + + /* + * we need a second force field to store \Sigma'_\mu(x) + * eqtn (44) hep-lat/0311018 + */ + previous_stout_force_field = calloc(dim*V+1, sizeof(su3)); + if(errno == ENOMEM) + { + return(1); + } + + g_previous_stout_force_field = calloc(V, sizeof(su3**)); + if(errno == ENOMEM) + { + return(1); + } + +#if (defined SSE || defined SSE2 || defined SSE3) + g_previous_stout_force_field[0] = (su3*)(((unsigned long int)(previous_stout_force_field)+ALIGN_BASE)&~ALIGN_BASE); +#else + g_previous_stout_force_field[0] = previous_stout_force_field; +#endif + + for(x = 1; x < V; x++) + { + g_previous_stout_force_field[x] = g_previous_stout_force_field[x-1] + 4; + } + + /*printf("Leaving init_stout_smear_vars\n");*/ + return(0); + + /* + * here we save the Q matrix field from eqtn(2) in hep-lat/0311018 + */ + /*Q_smear_iterations = calloc(stout_no_iter*dim*V, sizeof(su3)); + if(errno == ENOMEM) + { + return(1); + } + + g_Q_smear_iterations = calloc(stout_no_iter, sizeof(su3**)); + if(errno == ENOMEM) + { + return(1); + } 
+ + tmp_su3_pointer = Q_smear_iterations; + for(i = 0; i < stout_no_iter; i++) + { + g_Q_smear_iterations[i] = calloc(V, sizeof(su3*)); + if(errno == ENOMEM) + { + return(1); + } + + for(x = 0; x < V; x++) + { + g_Q_smear_iterations[i][x] = tmp_su3_pointer; + tmp_su3_pointer += dim; + if(errno == ENOMEM) + { + return(1); + } + } + }*/ +} + +/*----------------------------------------------------------------------------*/ + +void free_stout_smear_vars() +{ + free(gauge_field_saved); + free(g_gauge_field_saved); + free(gauge_field_smeared); + free(g_gauge_field_smeared); + free(C_smearing); + free(g_C_smearing); + free(Q_smearing); + free(g_Q_smearing); + free(Q_squared_smearing); + free(g_Q_squared_smearing); + free(B1_smearing); + free(g_B1_smearing); + free(B2_smearing); + free(g_B2_smearing); + free(Gamma_smearing); + free(g_Gamma_smearing); + free(Lambda_smearing); + free(g_Lambda_smearing); + free(g_c0_smearing); + free(g_c1_smearing); + free(g_f0_smearing); + free(g_f1_smearing); + free(g_f2_smearing); + free(g_b10_smearing); + free(g_b11_smearing); + free(g_b12_smearing); + free(g_b20_smearing); + free(g_b21_smearing); + free(g_b22_smearing); + free(g_r10_smearing); + free(g_r11_smearing); + free(g_r12_smearing); + free(g_r20_smearing); + free(g_r21_smearing); + free(g_r22_smearing); + free(stout_force_field); + free(g_stout_force_field); + free(previous_stout_force_field); + free(g_previous_stout_force_field); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_stout_smear_vars.h b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_stout_smear_vars.h new file mode 100644 index 0000000000000000000000000000000000000000..9da4058e3fd953afbc5694f0690e7fc8c9bb5d51 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/init/init_stout_smear_vars.h @@ -0,0 +1,27 @@ +/*********************************************************************** + * + * Copyright (C) 2007,2008 Jan Volkholz, Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _INIT_STOUT_SMEAR_VARS_H +#define _INIT_STOUT_SMEAR_VARS_H + +int init_stout_smear_vars(const int V, const int stout_no_iter); +void free_stout_smear_vars(); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/init/libinit.a b/qcd/part_cpu/applications/QCD/src/kernel_D/init/libinit.a new file mode 100644 index 0000000000000000000000000000000000000000..0e4b3f12dd61e7805d8abbf3be299d56cb2405e6 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_D/init/libinit.a differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/install-sh b/qcd/part_cpu/applications/QCD/src/kernel_D/install-sh new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/integrator.c b/qcd/part_cpu/applications/QCD/src/kernel_D/integrator.c new file mode 100644 index 0000000000000000000000000000000000000000..b2a709a91ee44d6bbf23e66d52b6703b122829f2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/integrator.c @@ -0,0 +1,348 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * 2012 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "monomial/monomial.h" +#include "update_momenta.h" +#include "update_gauge.h" +#include "hamiltonian_field.h" +#include "integrator.h" + +integrator Integrator; + +static const double omf4_rho = 0.2539785108410595; +static const double omf4_theta = -0.03230286765269967; +static const double omf4_vartheta = 0.08398315262876693; +static const double omf4_lamb = 0.6822365335719091; + +/* second order minimal norm integration scheme */ +void integrate_2mn(const double tau, const int S, const int halfstep); +/* second order minimal norm integration scheme in velocity version */ +void integrate_2mnp(const double tau, const int S, const int halfstep); +/* Leap Frog integration scheme */ +void integrate_leap_frog(const double tau, const int S, const int halfstep); +/* fourth order OMF scheme */ +void integrate_omf4(const double tau, const int S, const int halfstep); +/* half step function */ +void dohalfstep(const double tau, const int S); + +/* function to initialise the integrator, to be called once at the beginning */ + +int init_integrator() { + int i, ts; + Integrator.hf.gaugefield = (su3 **) NULL; + Integrator.hf.momenta = (su3adj **) NULL; + Integrator.hf.derivative = (su3adj **) 
NULL; + for(i = 0; i < 10; i++) { + Integrator.no_mnls_per_ts[i] = 0; + } + if(Integrator.type[Integrator.no_timescales-1] == MN2p) { + for(i = 0; i < Integrator.no_timescales; i++) { + Integrator.type[i] = MN2p; + Integrator.integrate[i] = &integrate_2mnp; + } + } + else { + for(i = 0; i < Integrator.no_timescales; i++) { + if(Integrator.type[i] == MN2 || Integrator.type[i] == MN2p) { + Integrator.integrate[i] = &integrate_2mn; + } + else if(Integrator.type[i] == LEAPFROG) { + Integrator.integrate[i] = &integrate_leap_frog; + } + else if(Integrator.type[i] == OMF4) { + Integrator.integrate[i] = &integrate_omf4; + } + } + } + + for(i = 0; i < no_monomials; i++) { + ts = monomial_list[i].timescale; + if(ts < Integrator.no_timescales && ts > -1) { + Integrator.mnls_per_ts[ ts ][ Integrator.no_mnls_per_ts[ts] ] = monomial_list[i].id; + Integrator.no_mnls_per_ts[ ts ]++; + } + else { + if(g_proc_id == 0) { + fprintf(stderr, "Warning: monomial %d is not on a valid timescale and will not be integrated\n", i); + } + } + } + for(i = 0; i < Integrator.no_timescales; i++) { + if(Integrator.no_mnls_per_ts[ i ] < 1) { + fprintf(stderr, "Error, no monomial on timescale %d!\nAborting...\n", i); + exit(-1); + } + } + return(0); +} + +/* function to set the gauge and momenta fields for the integration */ + +void integrator_set_fields(hamiltonian_field_t * hf) { + Integrator.hf.gaugefield = hf->gaugefield; + Integrator.hf.momenta = hf->momenta; + Integrator.hf.derivative = hf->derivative; + Integrator.hf.update_gauge_copy = hf->update_gauge_copy; + return; +} + +/* and unsets again (to NULL pointer ) */ + +void integrator_unset_fields() { + Integrator.hf.gaugefield = (su3 **) NULL; + Integrator.hf.momenta = (su3adj **) NULL; + Integrator.hf.derivative = (su3adj **) NULL; + return; +} + +void integrate_omf4(const double tau, const int S, const int halfstep) { + int i,j=0; + integrator * itgr = &Integrator; + double eps; + + if(S == itgr->no_timescales-1) { + dohalfstep(tau, S); + } 
+ eps = tau/((double)itgr->n_int[S]); + + if(S == 0) { + + for(j = 1; j < itgr->n_int[0]; j++) { + update_gauge(omf4_rho*eps, &itgr->hf); + update_momenta(itgr->mnls_per_ts[0], omf4_lamb*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + update_gauge(omf4_theta*eps, &itgr->hf); + update_momenta(itgr->mnls_per_ts[0], 0.5*(1-2.*(omf4_lamb+omf4_vartheta))*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + update_gauge((1-2.*(omf4_theta+omf4_rho))*eps, &itgr->hf); + update_momenta(itgr->mnls_per_ts[0], 0.5*(1-2.*(omf4_lamb+omf4_vartheta))*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + update_gauge(omf4_theta*eps, &itgr->hf); + update_momenta(itgr->mnls_per_ts[0], omf4_lamb*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + update_gauge(omf4_rho*eps, &itgr->hf); + update_momenta(itgr->mnls_per_ts[0], 2*omf4_vartheta*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + } + update_gauge(omf4_rho*eps, &itgr->hf); + update_momenta(itgr->mnls_per_ts[0], omf4_lamb*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + update_gauge(omf4_theta*eps, &itgr->hf); + update_momenta(itgr->mnls_per_ts[0], 0.5*(1-2.*(omf4_lamb+omf4_vartheta))*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + update_gauge((1-2.*(omf4_theta+omf4_rho))*eps, &itgr->hf); + update_momenta(itgr->mnls_per_ts[0], 0.5*(1-2.*(omf4_lamb+omf4_vartheta))*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + update_gauge(omf4_theta*eps, &itgr->hf); + update_momenta(itgr->mnls_per_ts[0], omf4_lamb*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + update_gauge(omf4_rho*eps, &itgr->hf); + if(halfstep != 1) { + update_momenta(itgr->mnls_per_ts[0], 2*omf4_vartheta*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + } + } + else { + for(i = 1; i < itgr->n_int[S]; i++){ + itgr->integrate[S-1](omf4_rho*eps, S-1, 0); + update_momenta(itgr->mnls_per_ts[S], omf4_lamb*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + itgr->integrate[S-1](omf4_theta*eps, S-1, 0); + update_momenta(itgr->mnls_per_ts[S], 0.5*(1-2.*(omf4_lamb+omf4_vartheta))*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + 
itgr->integrate[S-1]((1-2.*(omf4_theta+omf4_rho))*eps, S-1, 0); + update_momenta(itgr->mnls_per_ts[S], 0.5*(1-2.*(omf4_lamb+omf4_vartheta))*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + itgr->integrate[S-1](omf4_theta*eps, S-1, 0); + update_momenta(itgr->mnls_per_ts[S], omf4_lamb*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + itgr->integrate[S-1](omf4_rho*eps, S-1, 0); + update_momenta(itgr->mnls_per_ts[S], 2*omf4_vartheta*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + } + itgr->integrate[S-1](omf4_rho*eps, S-1, 0); + update_momenta(itgr->mnls_per_ts[S], omf4_lamb*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + itgr->integrate[S-1](omf4_theta*eps, S-1, 0); + update_momenta(itgr->mnls_per_ts[S], 0.5*(1-2.*(omf4_lamb+omf4_vartheta))*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + itgr->integrate[S-1]((1-2.*(omf4_theta+omf4_rho))*eps, S-1, 0); + update_momenta(itgr->mnls_per_ts[S], 0.5*(1-2.*(omf4_lamb+omf4_vartheta))*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + itgr->integrate[S-1](omf4_theta*eps, S-1, 0); + update_momenta(itgr->mnls_per_ts[S], omf4_lamb*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + if(S == itgr->no_timescales-1) { + itgr->integrate[S-1](omf4_rho*eps, S-1, 1); + } + else itgr->integrate[S-1](omf4_rho*eps, S-1, halfstep); + if(halfstep != 1 && S != itgr->no_timescales-1) { + update_momenta(itgr->mnls_per_ts[S], 2*omf4_vartheta*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + } + } + + if(S == itgr->no_timescales-1) { + dohalfstep(tau, S); + } + return; +} + +/* the following are only needed locally */ + +void integrate_2mn(const double tau, const int S, const int halfstep) { + int i,j=0; + integrator * itgr = &Integrator; + double eps, + oneminus2lambda = (1.-2.*itgr->lambda[S]); + + if(S == itgr->no_timescales-1) { + dohalfstep(tau, S); + } + + eps = tau/((double)itgr->n_int[S]); + if(S == 0) { + + for(j = 1; j < itgr->n_int[0]; j++) { + update_gauge(0.5*eps, &itgr->hf); + update_momenta(itgr->mnls_per_ts[0], oneminus2lambda*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + 
update_gauge(0.5*eps, &itgr->hf); + update_momenta(itgr->mnls_per_ts[0], 2.*itgr->lambda[0]*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + } + update_gauge(0.5*eps, &itgr->hf); + update_momenta(itgr->mnls_per_ts[0], oneminus2lambda*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + update_gauge(0.5*eps, &itgr->hf); + if(halfstep != 1) { + update_momenta(itgr->mnls_per_ts[0], 2*itgr->lambda[0]*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + } + } + else { + for(i = 1; i < itgr->n_int[S]; i++){ + itgr->integrate[S-1](eps/2., S-1, 0); + update_momenta(itgr->mnls_per_ts[S], oneminus2lambda*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + itgr->integrate[S-1](eps/2., S-1, 0); + update_momenta(itgr->mnls_per_ts[S], 2*itgr->lambda[S]*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + } + itgr->integrate[S-1](eps/2., S-1, 0); + update_momenta(itgr->mnls_per_ts[S], oneminus2lambda*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + if(S == itgr->no_timescales-1) { + itgr->integrate[S-1](eps/2., S-1, 1); + } + else itgr->integrate[S-1](eps/2., S-1, halfstep); + if(halfstep != 1 && S != itgr->no_timescales-1) { + update_momenta(itgr->mnls_per_ts[S], 2*itgr->lambda[S]*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + } + } + + if(S == itgr->no_timescales-1) { + dohalfstep(tau, S); + } +} + +void integrate_2mnp(const double tau, const int S, const int halfstep) { + int i; + integrator * itgr = &Integrator; + double eps = tau/((double)itgr->n_int[S]); + double oneminus2lambda = (1.-2.*itgr->lambda[S]); + + if(S == 0) { + update_gauge(itgr->lambda[0]*eps, &itgr->hf); + for(i = 1; i < itgr->n_int[0]; i++) { + update_momenta(itgr->mnls_per_ts[0], 0.5*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + update_gauge(oneminus2lambda*eps, &itgr->hf); + update_momenta(itgr->mnls_per_ts[0], 0.5*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + update_gauge(2*itgr->lambda[0]*eps, &itgr->hf); + } + update_momenta(itgr->mnls_per_ts[0], 0.5*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + update_gauge(oneminus2lambda*eps, &itgr->hf); + 
update_momenta(itgr->mnls_per_ts[0], 0.5*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + update_gauge(itgr->lambda[0]*eps, &itgr->hf); + } + else { + for(i = 0; i < itgr->n_int[S]; i++) { + integrate_2mnp(itgr->lambda[S]*eps, S-1, halfstep); + update_momenta(itgr->mnls_per_ts[S], 0.5*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + + integrate_2mnp(oneminus2lambda*eps, S-1, halfstep); + update_momenta(itgr->mnls_per_ts[S], 0.5*eps, itgr->no_mnls_per_ts[S], &itgr->hf); + + integrate_2mnp(itgr->lambda[S]*eps, S-1, halfstep); + } + } +} + + +void integrate_leap_frog(const double tau, const int S, const int halfstep) { + int i; + integrator * itgr = &Integrator; + double eps, eps0; + + if(S == itgr->no_timescales-1) { + dohalfstep(tau, S); + } + + eps = tau/((double)itgr->n_int[S]); + if(S == 0) { + eps0 = tau/((double)itgr->n_int[0]); + for(i = 1; i < itgr->n_int[0]; i++) { + update_gauge(eps0, &itgr->hf); + update_momenta(itgr->mnls_per_ts[0], eps0, itgr->no_mnls_per_ts[0], &itgr->hf); + } + update_gauge(eps0, &itgr->hf); + if(halfstep != 1) { + update_momenta(itgr->mnls_per_ts[0], eps0, itgr->no_mnls_per_ts[0], &itgr->hf); + } + } + else { + for(i = 1; i < itgr->n_int[S]; i++){ + itgr->integrate[S-1](eps, S-1, 0); + update_momenta(itgr->mnls_per_ts[S], eps, itgr->no_mnls_per_ts[S], &itgr->hf); + } + if(S == itgr->no_timescales-1) { + itgr->integrate[S-1](eps, S-1, 1); + } + else itgr->integrate[S-1](eps, S-1, halfstep); + if(halfstep != 1 && S != itgr->no_timescales-1) { + update_momenta(itgr->mnls_per_ts[S], eps, itgr->no_mnls_per_ts[S], &itgr->hf); + } + } + + if(S == itgr->no_timescales-1) { + dohalfstep(tau, S); + } +} + + +void dohalfstep(const double tau, const int S) { + integrator * itgr = &Integrator; + double eps = tau/((double)itgr->n_int[S]); + for(int i = S; i > 0; i--) { + if(itgr->type[i] == LEAPFROG) { + update_momenta(itgr->mnls_per_ts[i], 0.5*eps, itgr->no_mnls_per_ts[i], &itgr->hf); + eps /= ((double)itgr->n_int[i-1]); + } + else if(itgr->type[i] == MN2){ 
+ update_momenta(itgr->mnls_per_ts[i], itgr->lambda[i]*eps, itgr->no_mnls_per_ts[i], &itgr->hf); + eps /= ((double)itgr->n_int[i-1])*2; + } + else if(itgr->type[i] == OMF4) { + update_momenta(itgr->mnls_per_ts[i], omf4_vartheta*eps, itgr->no_mnls_per_ts[i], &itgr->hf); + eps /= ((double)itgr->n_int[i-1])/omf4_rho; + } + } + if(itgr->type[0] == LEAPFROG) { + update_momenta(itgr->mnls_per_ts[0], 0.5*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + } + else if(itgr->type[0] == MN2) { + update_momenta(itgr->mnls_per_ts[0], itgr->lambda[0]*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + } + else if(itgr->type[0] == OMF4) { + update_momenta(itgr->mnls_per_ts[0], omf4_vartheta*eps, itgr->no_mnls_per_ts[0], &itgr->hf); + } + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/integrator.h b/qcd/part_cpu/applications/QCD/src/kernel_D/integrator.h new file mode 100644 index 0000000000000000000000000000000000000000..f8f48c90863a8e4584c1d90ef58fcfe2f773d299 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/integrator.h @@ -0,0 +1,70 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _INTEGRATOR_H +#define _INTEGRATOR_H + +#include +#include +#include + +#define LEAPFROG 1 +#define SEXTON 2 +#define EXTLEAPFROG 3 +#define EXTSEXTON 4 +#define IMPRLEAPFROG 5 +#define MN2 6 +#define MN2p 7 +#define OMF4 8 + +typedef void (*integratefk)(const double, const int, const int); + +typedef struct { + /* gauge, momenta and derivative fields to be used during integration */ + hamiltonian_field_t hf; + /* list of types of integrators */ + int type[10]; + /* number of timescales */ + int no_timescales; + /* monitor forces */ + int monitor_forces; + /* steps per timescale */ + int n_int[10]; + /* trajectory length */ + double tau; + /* lambda parameter for 2MN integration scheme */ + double lambda[10]; + /* monomials per timescale */ + int mnls_per_ts[10][10]; + /* number of monomials per timescale */ + int no_mnls_per_ts[10]; + /* function pointers to integration scheme functions */ + integratefk integrate[10]; +} integrator; + +extern integrator Integrator; + +/* all following functions are currently defined in integrator.c */ +/* function to initialise the integrator, to be called once at the beginning */ +int init_integrator(); +/* function to set the gauge and momenta fields for the integration */ +void integrator_set_fields(hamiltonian_field_t * hf); +/* and unsets again (to NULL pointer ) */ +void integrator_unset_fields(); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/invert.c b/qcd/part_cpu/applications/QCD/src/kernel_D/invert.c new file mode 100644 index 0000000000000000000000000000000000000000..123903b588340fb7fd3b6c017947c666b6d39c21 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/invert.c @@ -0,0 +1,596 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * invert for twisted mass QCD + * + * Author: Carsten Urbach + * urbach@physik.fu-berlin.de + * + *******************************************************************************/ +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef BENCHMARK +#include <./c-lime/include/lime.h> +#else +#include +#endif + +#include +#include +#include +#include +#include +#include +#ifdef MPI +#include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "git_hash.h" +#include "getopt.h" +#include "linalg_eo.h" +#include "geometry_eo.h" +#include "start.h" +/*#include "eigenvalues.h"*/ +#include "measure_gauge_action.h" +#ifdef MPI +#include "xchange/xchange.h" +#endif +#include +#include "source_generation.h" +#include "read_input.h" +#include "mpi_init.h" +#include "sighandler.h" +#include "boundary.h" +#include "solver/solver.h" +#include "init/init.h" +#include "smearing/stout.h" +#include "invert_eo.h" +#include "monomial/monomial.h" +#include "ranlxd.h" +#include "phmc.h" +#include "operator/D_psi.h" +#include "little_D.h" +#include "reweighting_factor.h" +#include "linalg/convert_eo_to_lexic.h" +#include "block.h" +#include "operator.h" +#include "sighandler.h" +#include "solver/dfl_projector.h" +#include "solver/generate_dfl_subspace.h" +#include "prepare_source.h" +#include +#include +#include +#include +#include "solver/dirac_operator_eigenvectors.h" 
+#include "P_M_eta.h" +#include "operator/tm_operators.h" +#include "operator/Dov_psi.h" +#include "solver/spectral_proj.h" +#ifdef QUDA +# include "quda_interface.h" +#endif +#include "meas/measurements.h" + +extern int nstore; +int check_geometry(); + +static void usage(); +static void process_args(int argc, char *argv[], char ** input_filename, char ** filename); +static void set_default_filenames(char ** input_filename, char ** filename); + +int main(int argc, char *argv[]) +{ + FILE *parameterfile = NULL; + int j, i, ix = 0, isample = 0, op_id = 0; + char datafilename[206]; + char parameterfilename[206]; + char conf_filename[50]; + char * input_filename = NULL; + char * filename = NULL; + double plaquette_energy; + struct stout_parameters params_smear; + spinor **s, *s_; + +#ifdef _KOJAK_INST +#pragma pomp inst init +#pragma pomp inst begin(main) +#endif + +#if (defined SSE || defined SSE2 || SSE3) + signal(SIGILL, &catch_ill_inst); +#endif + + DUM_DERI = 8; + DUM_MATRIX = DUM_DERI + 5; + NO_OF_SPINORFIELDS = DUM_MATRIX + 3; + + //4 extra fields (corresponding to DUM_MATRIX+0..5) for deg. and ND matrix mult. + NO_OF_SPINORFIELDS_32 = 6; + + verbose = 0; + g_use_clover_flag = 0; + +#ifdef MPI + +# ifdef OMP + int mpi_thread_provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided); +# else + MPI_Init(&argc, &argv); +# endif + + MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id); +#else + g_proc_id = 0; +#endif + + process_args(argc,argv,&input_filename,&filename); + set_default_filenames(&input_filename, &filename); + + /* Read the input file */ + if( (j = read_input(input_filename)) != 0) { + fprintf(stderr, "Could not find input file: %s\nAborting...\n", input_filename); + exit(-1); + } + +#ifdef OMP + init_openmp(); +#endif + + /* this DBW2 stuff is not needed for the inversion ! 
*/ + if (g_dflgcr_flag == 1) { + even_odd_flag = 0; + } + g_rgi_C1 = 0; + if (Nsave == 0) { + Nsave = 1; + } + + if (g_running_phmc) { + NO_OF_SPINORFIELDS = DUM_MATRIX + 8; + } + + tmlqcd_mpi_init(argc, argv); + + g_dbw2rand = 0; + + /* starts the single and double precision random number */ + /* generator */ + start_ranlux_KD(rlxd_level, random_seed); + + /* we need to make sure that we don't have even_odd_flag = 1 */ + /* if any of the operators doesn't use it */ + /* in this way even/odd can still be used by other operators */ + for(j = 0; j < no_operators; j++) if(!operator_list[j].even_odd_flag) even_odd_flag = 0; + +#ifndef MPI + g_dbw2rand = 0; +#endif + +#ifdef _GAUGE_COPY + j = init_gauge_field(VOLUMEPLUSRAND, 1); + j += init_gauge_field_32(VOLUMEPLUSRAND, 1); +#else + j = init_gauge_field(VOLUMEPLUSRAND, 0); + j += init_gauge_field_32(VOLUMEPLUSRAND, 0); +#endif + + if (j != 0) { + fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n"); + exit(-1); + } + j = init_geometry_indices(VOLUMEPLUSRAND); + if (j != 0) { + fprintf(stderr, "Not enough memory for geometry indices! Aborting...\n"); + exit(-1); + } + if (no_monomials > 0) { + if (even_odd_flag) { + j = init_monomials(VOLUMEPLUSRAND / 2, even_odd_flag); + } + else { + j = init_monomials(VOLUMEPLUSRAND, even_odd_flag); + } + if (j != 0) { + fprintf(stderr, "Not enough memory for monomial pseudo fermion fields! Aborting...\n"); + exit(-1); + } + } + if (even_odd_flag) { + j = init_spinor_field(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS); + j += init_spinor_field_32(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS_32); + } + else { + j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS); + j += init_spinor_field_32(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS_32); + } + if (j != 0) { + fprintf(stderr, "Not enough memory for spinor fields! 
Aborting...\n"); + exit(-1); + } + + if (g_running_phmc) { + j = init_chi_spinor_field(VOLUMEPLUSRAND / 2, 20); + if (j != 0) { + fprintf(stderr, "Not enough memory for PHMC Chi fields! Aborting...\n"); + exit(-1); + } + } + + g_mu = g_mu1; + + if (g_cart_id == 0) { + /*construct the filenames for the observables and the parameters*/ + strncpy(datafilename, filename, 200); + strcat(datafilename, ".data"); + strncpy(parameterfilename, filename, 200); + strcat(parameterfilename, ".para"); + + parameterfile = fopen(parameterfilename, "w"); + write_first_messages(parameterfile, "invert", git_hash); + fclose(parameterfile); + } + + /* define the geometry */ + geometry(); + + /* define the boundary conditions for the fermion fields */ + boundary(g_kappa); + + phmc_invmaxev = 1.; + + init_operators(); + + /* list and initialize measurements*/ + if(g_proc_id == 0) { + printf("\n"); + for(int j = 0; j < no_measurements; j++) { + printf("# measurement id %d, type = %d\n", j, measurement_list[j].type); + } + } + init_measurements(); + + /* this could be maybe moved to init_operators */ +#ifdef _USE_HALFSPINOR + j = init_dirac_halfspinor(); + if (j != 0) { + fprintf(stderr, "Not enough memory for halffield! Aborting...\n"); + exit(-1); + } + /* for mixed precision solvers, the 32 bit halfspinor field must always be there */ + j = init_dirac_halfspinor32(); + if (j != 0) + { + fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n"); + exit(-1); + } +# if (defined _PERSISTENT) + if (even_odd_flag) + init_xchange_halffield(); +# endif +#endif + + for (j = 0; j < Nmeas; j++) { + sprintf(conf_filename, "%s.%.4d", gauge_input_filename, nstore); + if (g_cart_id == 0) { + printf("#\n# Trying to read gauge field from file %s in %s precision.\n", + conf_filename, (gauge_precision_read_flag == 32 ? 
"single" : "double")); + fflush(stdout); + } + if( (i = read_gauge_field(conf_filename,g_gauge_field)) !=0) { + fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", i, conf_filename); + exit(-2); + } + + + if (g_cart_id == 0) { + printf("# Finished reading gauge field.\n"); + fflush(stdout); + } +#ifdef MPI + xchange_gauge(g_gauge_field); +#endif + /*Convert to a 32 bit gauge field, after xchange*/ + convert_32_gauge_field(g_gauge_field_32, g_gauge_field, VOLUMEPLUSRAND); + /*compute the energy of the gauge field*/ + plaquette_energy = measure_plaquette( (const su3**) g_gauge_field); + + if (g_cart_id == 0) { + printf("# The computed plaquette value is %e.\n", plaquette_energy / (6.*VOLUME*g_nproc)); + fflush(stdout); + } + + if (use_stout_flag == 1){ + params_smear.rho = stout_rho; + params_smear.iterations = stout_no_iter; +/* if (stout_smear((su3_tuple*)(g_gauge_field[0]), ¶ms_smear, (su3_tuple*)(g_gauge_field[0])) != 0) */ +/* exit(1) ; */ + g_update_gauge_copy = 1; + plaquette_energy = measure_plaquette( (const su3**) g_gauge_field); + + if (g_cart_id == 0) { + printf("# The plaquette value after stouting is %e\n", plaquette_energy / (6.*VOLUME*g_nproc)); + fflush(stdout); + } + } + + /* if any measurements are defined in the input file, do them here */ + measurement * meas; + for(int imeas = 0; imeas < no_measurements; imeas++){ + meas = &measurement_list[imeas]; + if (g_proc_id == 0) { + fprintf(stdout, "#\n# Beginning online measurement.\n"); + } + meas->measurefunc(nstore, imeas, even_odd_flag); + } + + if (reweighting_flag == 1) { + reweighting_factor(reweighting_samples, nstore); + } + + /* Compute minimal eigenvalues, if wanted */ + if (compute_evs != 0) { + eigenvalues(&no_eigenvalues, 5000, eigenvalue_precision, + 0, compute_evs, nstore, even_odd_flag); + } + if (phmc_compute_evs != 0) { +#ifdef MPI + MPI_Finalize(); +#endif + return(0); + } + + /* Compute the mode number or topological susceptibility using spectral 
projectors, if wanted*/ + + if(compute_modenumber != 0 || compute_topsus !=0){ + + s_ = calloc(no_sources_z2*VOLUMEPLUSRAND+1, sizeof(spinor)); + s = calloc(no_sources_z2, sizeof(spinor*)); + if(s_ == NULL) { + printf("Not enough memory in %s: %d",__FILE__,__LINE__); exit(42); + } + if(s == NULL) { + printf("Not enough memory in %s: %d",__FILE__,__LINE__); exit(42); + } + + + for(i = 0; i < no_sources_z2; i++) { +#if (defined SSE3 || defined SSE2 || defined SSE) + s[i] = (spinor*)(((unsigned long int)(s_)+ALIGN_BASE)&~ALIGN_BASE)+i*VOLUMEPLUSRAND; +#else + s[i] = s_+i*VOLUMEPLUSRAND; +#endif + + random_spinor_field_lexic(s[i], reproduce_randomnumber_flag,RN_Z2); + +/* what is this here needed for?? */ +/* spinor *aux_,*aux; */ +/* #if ( defined SSE || defined SSE2 || defined SSE3 ) */ +/* aux_=calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); */ +/* aux = (spinor *)(((unsigned long int)(aux_)+ALIGN_BASE)&~ALIGN_BASE); */ +/* #else */ +/* aux_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); */ +/* aux = aux_; */ +/* #endif */ + + if(g_proc_id == 0) { + printf("source %d \n", i); + } + + if(compute_modenumber != 0){ + mode_number(s[i], mstarsq); + } + + if(compute_topsus !=0) { + top_sus(s[i], mstarsq); + } + } + free(s); + free(s_); + } + + + /* move to operators as well */ + if (g_dflgcr_flag == 1) { + /* set up deflation blocks */ + init_blocks(nblocks_t, nblocks_x, nblocks_y, nblocks_z); + + /* the can stay here for now, but later we probably need */ + /* something like init_dfl_solver called somewhere else */ + /* create set of approximate lowest eigenvectors ("global deflation subspace") */ + + /* g_mu = 0.; */ + /* boundary(0.125); */ + generate_dfl_subspace(g_N_s, VOLUME, reproduce_randomnumber_flag); + /* boundary(g_kappa); */ + /* g_mu = g_mu1; */ + + /* Compute little Dirac operators */ + /* alt_block_compute_little_D(); */ + if (g_debug_level > 0) { + check_projectors(reproduce_randomnumber_flag); + check_local_D(reproduce_randomnumber_flag); + } + if (g_debug_level > 1) 
{ + check_little_D_inversion(reproduce_randomnumber_flag); + } + + } + if(SourceInfo.type == 1) { + index_start = 0; + index_end = 1; + } + + g_precWS=NULL; + if(use_preconditioning == 1){ + /* todo load fftw wisdom */ +#if (defined HAVE_FFTW ) && !( defined MPI) + loadFFTWWisdom(g_spinor_field[0],g_spinor_field[1],T,LX); +#else + use_preconditioning=0; +#endif + } + + if (g_cart_id == 0) { + fprintf(stdout, "#\n"); /*Indicate starting of the operator part*/ + } + for(op_id = 0; op_id < no_operators; op_id++) { + boundary(operator_list[op_id].kappa); + g_kappa = operator_list[op_id].kappa; + g_mu = 0.; + + if(use_preconditioning==1 && PRECWSOPERATORSELECT[operator_list[op_id].solver]!=PRECWS_NO ){ + printf("# Using preconditioning with treelevel preconditioning operator: %s \n", + precWSOpToString(PRECWSOPERATORSELECT[operator_list[op_id].solver])); + /* initial preconditioning workspace */ + operator_list[op_id].precWS=(spinorPrecWS*)malloc(sizeof(spinorPrecWS)); + spinorPrecWS_Init(operator_list[op_id].precWS, + operator_list[op_id].kappa, + operator_list[op_id].mu/2./operator_list[op_id].kappa, + -(0.5/operator_list[op_id].kappa-4.), + PRECWSOPERATORSELECT[operator_list[op_id].solver]); + g_precWS = operator_list[op_id].precWS; + + if(PRECWSOPERATORSELECT[operator_list[op_id].solver] == PRECWS_D_DAGGER_D) { + fitPrecParams(op_id); + } + } + + for(isample = 0; isample < no_samples; isample++) { + for (ix = index_start; ix < index_end; ix++) { + if (g_cart_id == 0) { + fprintf(stdout, "#\n"); /*Indicate starting of new index*/ + } + /* we use g_spinor_field[0-7] for sources and props for the moment */ + /* 0-3 in case of 1 flavour */ + /* 0-7 in case of 2 flavours */ + prepare_source(nstore, isample, ix, op_id, read_source_flag, source_location); + //randmize initial guess for eigcg if needed-----experimental + if( (operator_list[op_id].solver == INCREIGCG) && (operator_list[op_id].solver_params.eigcg_rand_guess_opt) ){ //randomize the initial guess + 
gaussian_volume_source( operator_list[op_id].prop0, operator_list[op_id].prop1,isample,ix,0); //need to check this + } + operator_list[op_id].inverter(op_id, index_start, 1); + } + } + + + if(use_preconditioning==1 && operator_list[op_id].precWS!=NULL ){ + /* free preconditioning workspace */ + spinorPrecWS_Free(operator_list[op_id].precWS); + free(operator_list[op_id].precWS); + } + + if(operator_list[op_id].type == OVERLAP){ + free_Dov_WS(); + } + + } + nstore += Nsave; + } + +#ifdef OMP + free_omp_accumulators(); +#endif + free_blocks(); + free_dfl_subspace(); + free_gauge_field(); + free_gauge_field_32(); + free_geometry_indices(); + free_spinor_field(); + free_spinor_field_32(); + free_moment_field(); + free_chi_spinor_field(); + free(filename); + free(input_filename); +#ifdef QUDA + _endQuda(); +#endif +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); + MPI_Finalize(); +#endif + return(0); +#ifdef _KOJAK_INST +#pragma pomp inst end(main) +#endif +} + +static void usage() +{ + fprintf(stdout, "Inversion for EO preconditioned Wilson twisted mass QCD\n"); + fprintf(stdout, "Version %s \n\n", PACKAGE_VERSION); + fprintf(stdout, "Please send bug reports to %s\n", PACKAGE_BUGREPORT); + fprintf(stdout, "Usage: invert [options]\n"); + fprintf(stdout, "Options: [-f input-filename]\n"); + fprintf(stdout, " [-o output-filename]\n"); + fprintf(stdout, " [-v] more verbosity\n"); + fprintf(stdout, " [-h|-? 
this help]\n"); + fprintf(stdout, " [-V] print version information and exit\n"); + exit(0); +} + +static void process_args(int argc, char *argv[], char ** input_filename, char ** filename) { + int c; + while ((c = getopt(argc, argv, "h?vVf:o:")) != -1) { + switch (c) { + case 'f': + *input_filename = calloc(200, sizeof(char)); + strncpy(*input_filename, optarg, 200); + break; + case 'o': + *filename = calloc(200, sizeof(char)); + strncpy(*filename, optarg, 200); + break; + case 'v': + verbose = 1; + break; + case 'V': + if(g_proc_id == 0) { + fprintf(stdout,"%s %s\n",PACKAGE_STRING,git_hash); + } + exit(0); + break; + case 'h': + case '?': + default: + if( g_proc_id == 0 ) { + usage(); + } + break; + } + } +} + +static void set_default_filenames(char ** input_filename, char ** filename) { + if( *input_filename == NULL ) { + *input_filename = calloc(13, sizeof(char)); + strcpy(*input_filename,"invert.input"); + } + + if( *filename == NULL ) { + *filename = calloc(7, sizeof(char)); + strcpy(*filename,"output"); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/invert_clover_eo.c b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_clover_eo.c new file mode 100644 index 0000000000000000000000000000000000000000..efb6feb712451a0b145d164111aa15adf0ca3285 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_clover_eo.c @@ -0,0 +1,130 @@ +/*********************************************************************** + * Copyright (C) 2012 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * invert_clover_eo makes an inversion with EO preconditioned + * clover tm Operator + * + * Even and Odd are the numbers of spinor_field that contain + * the even and the odd sites of the source. The result is stored + * int Even_new and Odd_new. + * + * invert_clover_eo returns the number of iterations needed or -1 if the + * solver did not converge. + * + * Author: Carsten Urbach + * urbach@physik.fu-berlin.de + * + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include"global.h" +#include"su3.h" +#include"linalg_eo.h" +#include"operator/tm_operators.h" +#include"operator/Hopping_Matrix.h" +#include"operator/clovertm_operators.h" +#include"operator/clovertm_operators_32.h" +#include"operator/D_psi.h" +#include"gamma.h" +#include"solver/solver.h" +#include"solver/solver_params.h" +#include"invert_clover_eo.h" +#include "solver/dirac_operator_eigenvectors.h" +#ifdef QUDA +# include "quda_interface.h" +#endif + + +int invert_clover_eo(spinor * const Even_new, spinor * const Odd_new, + spinor * const Even, spinor * const Odd, + const double precision, const int max_iter, + const int solver_flag, const int rel_prec,solver_params_t solver_params, + su3 *** gf, matrix_mult Qsq, matrix_mult Qm, + const ExternalInverter inverter, const SloppyPrecision sloppy, const CompressionType compression) { + int iter; + +#ifdef QUDA + if( inverter==QUDA_INVERTER ) { + return invert_eo_quda(Even_new, Odd_new, Even, Odd, + precision, max_iter, + solver_flag, rel_prec, + 1, solver_params, + sloppy, compression); + } +#endif + + if(g_proc_id == 0 && g_debug_level > 0) { + printf("# Using even/odd preconditioning!\n"); fflush(stdout); + } + + assign_mul_one_sw_pm_imu_inv(EE, Even_new, Even, +g_mu); + + Hopping_Matrix(OE, 
g_spinor_field[DUM_DERI], Even_new); + /* The sign is plus, since in Hopping_Matrix */ + /* the minus is missing */ + assign_mul_add_r(g_spinor_field[DUM_DERI], +1., Odd, VOLUME/2); + /* Do the inversion with the preconditioned */ + /* matrix to get the odd sites */ + + /* Here we invert the hermitean operator squared */ + gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2); + if(g_proc_id == 0) { + //printf("# Using CG!\n"); + printf("# mu = %f, kappa = %f, csw = %f\n", + g_mu/2./g_kappa, g_kappa, g_c_sw); + fflush(stdout); + } + + if(solver_flag == CG){ + if(g_proc_id == 0) {printf("# Using CG!\n"); fflush(stdout);} + iter = cg_her(Odd_new, g_spinor_field[DUM_DERI], max_iter, + precision, rel_prec, + VOLUME/2, Qsq); + Qm(Odd_new, Odd_new); + }else if(solver_flag == INCREIGCG){ + if(g_proc_id == 0) {printf("# Using Incremental Eig-CG!\n"); fflush(stdout);} + iter = incr_eigcg(VOLUME/2,solver_params.eigcg_nrhs, solver_params.eigcg_nrhs1, Odd_new, g_spinor_field[DUM_DERI], solver_params.eigcg_ldh, Qsq, + solver_params.eigcg_tolsq1, solver_params.eigcg_tolsq, solver_params.eigcg_restolsq , solver_params.eigcg_rand_guess_opt, + rel_prec, max_iter, solver_params.eigcg_nev, solver_params.eigcg_vmax); + Qm(Odd_new, Odd_new); + }else if(solver_flag == MIXEDCG){ + iter = mixed_cg_her(Odd_new, g_spinor_field[DUM_DERI], solver_params, max_iter, precision, rel_prec, + VOLUME/2, &Qsw_pm_psi, &Qsw_pm_psi_32); + Qm(Odd_new, Odd_new); + }else if(solver_flag == RGMIXEDCG){ + iter = rg_mixed_cg_her(Odd_new, g_spinor_field[DUM_DERI], solver_params, max_iter, precision, rel_prec, + VOLUME/2, &Qsw_pm_psi, &Qsw_pm_psi_32); + Qm(Odd_new, Odd_new); + }else{ + if(g_proc_id == 0) {printf("# This solver is not available for this operator. 
Exisiting!\n"); fflush(stdout);} + return 0; + } + + + /* Reconstruct the even sites */ + Hopping_Matrix(EO, g_spinor_field[DUM_DERI], Odd_new); + clover_inv(g_spinor_field[DUM_DERI], +1, g_mu); + /* The sign is plus, since in Hopping_Matrix */ + /* the minus is missing */ + assign_add_mul_r(Even_new, g_spinor_field[DUM_DERI], +1., VOLUME/2); + + return(iter); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/invert_clover_eo.h b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_clover_eo.h new file mode 100644 index 0000000000000000000000000000000000000000..4b3e25262b12ac70e7edfab99b43d516fa7e469f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_clover_eo.h @@ -0,0 +1,16 @@ +#ifndef _INVERT_CLOVER_EO_H +#define _INVERT_CLOVER_EO_H + +#include "global.h" +#include "su3.h" +#include "solver/matrix_mult_typedef.h" +#include "solver/solver_params.h" + +int invert_clover_eo(spinor * const Even_new, spinor * const Odd_new, + spinor * const Even, spinor * const Odd, + const double precision, const int max_iter, + const int solver_flag, const int rel_prec,solver_params_t solver_params, + su3 *** gf, matrix_mult Qsq, matrix_mult Qm, + const ExternalInverter inverter, const SloppyPrecision sloppy, const CompressionType compression); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/invert_doublet_eo.c b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_doublet_eo.c new file mode 100644 index 0000000000000000000000000000000000000000..1d66bd9d7f436f0cadfe3968a00724d637e63c9c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_doublet_eo.c @@ -0,0 +1,254 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * invert_doublet_eo makes an inversion with EO precoditioned + * tm Operator with a nondegenerate doublet + * + * Even and Odd are the numbers of spinor_field that contain + * the even and the odd sites of the source. The result is stored + * int Even_new and Odd_new. + * + * invert_doublet_eo returns the number of iterations neede or -1 if the + * solver did not converge. + * + * Author: Carsten Urbach + * urbach@physik.fu-berlin.de + * + ****************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include"global.h" +#include"linalg_eo.h" +#include"operator/tm_operators.h" +#include"operator/Hopping_Matrix.h" +#include"operator/D_psi.h" +#include"gamma.h" +#include"solver/solver.h" +#include"read_input.h" +#include"xchange/xchange.h" +#include"operator/tm_operators_nd.h" +#include"operator/tm_operators_nd_32.h" +#include"invert_doublet_eo.h" +#ifdef QUDA +# include "quda_interface.h" +#endif + + +#ifdef HAVE_GPU +# include"GPU/cudadefs.h" +# include"temporalgauge.h" +# include"measure_gauge_action.h" +int mixedsolve_eo_nd (spinor *, spinor *, spinor *, spinor *, int, double, int); +int mixedsolve_eo_nd_mpi(spinor *, spinor *, spinor *, spinor *, int, double, int); +# ifdef TEMPORALGAUGE +extern su3* g_trafo; +# endif +#endif + + +int invert_doublet_eo(spinor * const Even_new_s, spinor * const 
Odd_new_s, + spinor * const Even_new_c, spinor * const Odd_new_c, + spinor * const Even_s, spinor * const Odd_s, + spinor * const Even_c, spinor * const Odd_c, + const double precision, const int max_iter, + const int solver_flag, const int rel_prec, solver_params_t solver_params, + const ExternalInverter inverter, const SloppyPrecision sloppy, const CompressionType compression) { + + int iter = 0; + +#ifdef QUDA + if( inverter==QUDA_INVERTER ) { + return invert_doublet_eo_quda( Even_new_s, Odd_new_s, Even_new_c, Odd_new_c, + Even_s, Odd_s, Even_c, Odd_c, + precision, max_iter, + solver_flag, rel_prec, 1, + sloppy, compression ); + } +#endif + +#ifdef HAVE_GPU +# ifdef TEMPORALGAUGE + if (usegpu_flag) { + gtrafo_eo_nd(Even_s, Odd_s, Even_c, Odd_c, + (spinor*const)NULL, (spinor*const)NULL, (spinor*const)NULL, (spinor*const)NULL, + GTRAFO_APPLY); + } +# endif +#endif /* HAVE_GPU*/ + + + /* here comes the inversion using even/odd preconditioning */ + if(g_proc_id == 0) {printf("# Using even/odd preconditioning!\n"); fflush(stdout);} + M_ee_inv_ndpsi(Even_new_s, Even_new_c, + Even_s, Even_c, + g_mubar, g_epsbar); + Hopping_Matrix(OE, g_spinor_field[DUM_DERI], Even_new_s); + Hopping_Matrix(OE, g_spinor_field[DUM_DERI+1], Even_new_c); + + /* The sign is plus, since in Hopping_Matrix */ + /* the minus is missing */ + assign_mul_add_r(g_spinor_field[DUM_DERI], +1., Odd_s, VOLUME/2); + assign_mul_add_r(g_spinor_field[DUM_DERI+1], +1., Odd_c, VOLUME/2); + + /* Do the inversion with the preconditioned */ + /* matrix to get the odd sites */ + + /* Here we invert the hermitean operator squared */ + + if(g_proc_id == 0) { + printf("# Using CG for TMWILSON flavour doublet!\n"); + fflush(stdout); + } + gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2); + gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+1], VOLUME/2); + + +#ifdef HAVE_GPU + if (usegpu_flag) { // GPU, mixed precision solver +# if defined(MPI) && defined(PARALLELT) + iter = 
mixedsolve_eo_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], + max_iter, precision, rel_prec); +# elif !defined(MPI) && !defined(PARALLELT) + iter = mixedsolve_eo_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], + max_iter, precision, rel_prec); +# else + printf("MPI and/or PARALLELT are not appropriately set for the GPU implementation. Aborting...\n"); + exit(-1); +# endif + } + else { // CPU, conjugate gradient + iter = cg_her_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], + max_iter, precision, rel_prec, + VOLUME/2, &Qtm_pm_ndpsi); + } +#else // CPU, conjugate gradient + if(solver_flag == RGMIXEDCG){ + iter = rg_mixed_cg_her_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], + solver_params, max_iter, precision, rel_prec, VOLUME/2, + &Qtm_pm_ndpsi, &Qtm_pm_ndpsi_32); + } else { + iter = cg_her_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], + max_iter, precision, rel_prec, VOLUME/2, &Qtm_pm_ndpsi); + } +#endif + + + Qtm_dagger_ndpsi(Odd_new_s, Odd_new_c, + Odd_new_s, Odd_new_c); + + /* Reconstruct the even sites */ + Hopping_Matrix(EO, g_spinor_field[DUM_DERI], Odd_new_s); + Hopping_Matrix(EO, g_spinor_field[DUM_DERI+1], Odd_new_c); + M_ee_inv_ndpsi(g_spinor_field[DUM_DERI+2], g_spinor_field[DUM_DERI+3], + g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], + g_mubar, g_epsbar); + + /* The sign is plus, since in Hopping_Matrix */ + /* the minus is missing */ + assign_add_mul_r(Even_new_s, g_spinor_field[DUM_DERI+2], +1., VOLUME/2); + assign_add_mul_r(Even_new_c, g_spinor_field[DUM_DERI+3], +1., VOLUME/2); + + +#ifdef HAVE_GPU + /* return from temporal gauge again */ +# ifdef TEMPORALGAUGE + if (usegpu_flag) { + gtrafo_eo_nd(Even_s, Odd_s, Even_c, Odd_c, Even_new_s, Odd_new_s, Even_new_c, Odd_new_c, + GTRAFO_REVERT); + } +# endif +#endif + return(iter); +} + + +int invert_cloverdoublet_eo(spinor * const 
Even_new_s, spinor * const Odd_new_s, + spinor * const Even_new_c, spinor * const Odd_new_c, + spinor * const Even_s, spinor * const Odd_s, + spinor * const Even_c, spinor * const Odd_c, + const double precision, const int max_iter, + const int solver_flag, const int rel_prec, solver_params_t solver_params, + const ExternalInverter inverter, const SloppyPrecision sloppy, const CompressionType compression) { + + int iter = 0; + +#ifdef QUDA + if( inverter==QUDA_INVERTER ) { + return invert_doublet_eo_quda( Even_new_s, Odd_new_s, Even_new_c, Odd_new_c, + Even_s, Odd_s, Even_c, Odd_c, + precision, max_iter, + solver_flag, rel_prec, 1, + sloppy, compression ); + } +#endif + + /* here comes the inversion using even/odd preconditioning */ + if(g_proc_id == 0) {printf("# Using even/odd preconditioning!\n"); fflush(stdout);} + Msw_ee_inv_ndpsi(Even_new_s, Even_new_c, + Even_s, Even_c); + Hopping_Matrix(OE, g_spinor_field[DUM_DERI], Even_new_s); + Hopping_Matrix(OE, g_spinor_field[DUM_DERI+1], Even_new_c); + + /* The sign is plus, since in Hopping_Matrix */ + /* the minus is missing */ + assign_mul_add_r(g_spinor_field[DUM_DERI], +1., Odd_s, VOLUME/2); + assign_mul_add_r(g_spinor_field[DUM_DERI+1], +1., Odd_c, VOLUME/2); + + /* Do the inversion with the preconditioned */ + /* matrix to get the odd sites */ + + /* Here we invert the hermitean operator squared */ + + if(g_proc_id == 0) { + printf("# Using CG for TMWILSON flavour doublet!\n"); + fflush(stdout); + } + gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2); + gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+1], VOLUME/2); + + if(solver_flag == RGMIXEDCG){ + iter = rg_mixed_cg_her_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], + solver_params, max_iter, precision, rel_prec, VOLUME/2, + &Qsw_pm_ndpsi, &Qsw_pm_ndpsi_32); + } else { + iter = cg_her_nd(Odd_new_s, Odd_new_c, g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], + max_iter, precision, rel_prec, + 
VOLUME/2, &Qsw_pm_ndpsi); + } + + + Qsw_dagger_ndpsi(Odd_new_s, Odd_new_c, + Odd_new_s, Odd_new_c); + + /* Reconstruct the even sites */ + Hopping_Matrix(EO, g_spinor_field[DUM_DERI], Odd_new_s); + Hopping_Matrix(EO, g_spinor_field[DUM_DERI+1], Odd_new_c); + Msw_ee_inv_ndpsi(g_spinor_field[DUM_DERI+2], g_spinor_field[DUM_DERI+3], + g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1]); + + /* The sign is plus, since in Hopping_Matrix */ + /* the minus is missing */ + assign_add_mul_r(Even_new_s, g_spinor_field[DUM_DERI+2], +1., VOLUME/2); + assign_add_mul_r(Even_new_c, g_spinor_field[DUM_DERI+3], +1., VOLUME/2); + + return(iter); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/invert_doublet_eo.h b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_doublet_eo.h new file mode 100644 index 0000000000000000000000000000000000000000..ee8041bb44f28c621d356eeb3e61a84a5830980a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_doublet_eo.h @@ -0,0 +1,60 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +/**************************************************************** + * + * invert_doublet_eo makes an inversion with EO preconditioned + * tm Operator with a nondegenerate doublet + * + * Author: Carsten Urbach + * urbach@physik.fu-berlin.de + * + ****************************************************************/ + +#ifndef _INVERT_DOUBLET_EO_H +#define _INVERT_DOUBLET_EO_H + +#include "global.h" +#include "solver/solver_params.h" + +int invert_doublet_eo(spinor * const Even_new_s, spinor * const Odd_new_s, + spinor * const Even_new_c, spinor * const Odd_new_c, + spinor * const Even_s, spinor * const Odd_s, + spinor * const Even_c, spinor * const Odd_c, + const double precision, const int max_iter, + const int solver_flag, const int rel_prec, solver_params_t solver_params, + const ExternalInverter inverter, const SloppyPrecision sloppy, const CompressionType compression); + + +/* This is the full matrix multiplication */ +/* void M_full(spinor * const Even_new, spinor * const Odd_new, */ +/* spinor * const Even, spinor * const Odd); */ +/* void Q_full(spinor * const Even_new, spinor * const Odd_new, */ +/* spinor * const Even, spinor * const Odd); */ +/* void M_minus_1_timesC(spinor * const Even_new, spinor * const Odd_new, */ +/* spinor * const Even, spinor * const Odd); */ + +int invert_cloverdoublet_eo(spinor * const Even_new_s, spinor * const Odd_new_s, + spinor * const Even_new_c, spinor * const Odd_new_c, + spinor * const Even_s, spinor * const Odd_s, + spinor * const Even_c, spinor * const Odd_c, + const double precision, const int max_iter, + const int solver_flag, const int rel_prec, solver_params_t solver_params, + const ExternalInverter inverter, const SloppyPrecision sloppy, const CompressionType compression); +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/invert_eo.c b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_eo.c new file mode 100644 index
0000000000000000000000000000000000000000..794e862fb727c2881e0c4d9cd41cea8e805772fa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_eo.c @@ -0,0 +1,560 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * invert_eo makes an inversion with EO preconditioned + * tm Operator + * + * Even and Odd are the numbers of spinor_field that contain + * the even and the odd sites of the source. The result is stored + * in Even_new and Odd_new. + * + * invert_eo returns the number of iterations needed or -1 if the + * solver did not converge.
+ * + * Author: Carsten Urbach + * urbach@physik.fu-berlin.de + * + ****************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include"global.h" +#include"linalg_eo.h" +#include"operator/tm_operators.h" +#include"operator/Hopping_Matrix.h" +#include"operator/D_psi.h" +#include"operator/tm_operators_32.h" +#include"gamma.h" +#include"solver/solver.h" +#include"read_input.h" +#include"xchange/xchange.h" +#include"solver/poly_precon.h" +#include"solver/dfl_projector.h" +#include"invert_eo.h" +#include "solver/dirac_operator_eigenvectors.h" +/* FIXME temporary includes and declarations until IO and interface for invert and CGMMS are generelized */ +#include "init/init_spinor_field.h" +#include +#include +#ifdef QUDA +# include "quda_interface.h" +#endif + +static double cgmms_reached_prec = 0.0; +static void cgmms_write_props(spinor ** const P, double const * const extra_masses, const int no_extra_masses, const int id, const int iteration); + +#ifdef HAVE_GPU +#include"GPU/cudadefs.h" +#include"temporalgauge.h" +#include"measure_gauge_action.h" + +extern int mixed_solve (spinor * const P, spinor * const Q, const int max_iter, + double eps, const int rel_prec,const int N); +extern int mixed_solve_eo (spinor * const P, spinor * const Q, const int max_iter, + double eps, const int rel_prec, const int N); +#ifdef TEMPORALGAUGE +extern su3* g_trafo; +#endif +#endif + +int invert_eo(spinor * const Even_new, spinor * const Odd_new, + spinor * const Even, spinor * const Odd, + const double precision, const int max_iter, + const int solver_flag, const int rel_prec, + const int sub_evs_flag, const int even_odd_flag, + const int no_extra_masses, double * const extra_masses, solver_params_t solver_params, const int id, + const ExternalInverter inverter, const SloppyPrecision sloppy, const CompressionType compression ) { + + int iter = 0; + +#ifdef QUDA + if( inverter==QUDA_INVERTER ) { + return invert_eo_quda(Even_new, 
Odd_new, Even, Odd, + precision, max_iter, + solver_flag, rel_prec, + even_odd_flag, solver_params, + sloppy, compression); + } +#endif + + /* here comes the inversion using even/odd preconditioning */ + if(even_odd_flag) { + if(g_proc_id == 0) {printf("# Using even/odd preconditioning!\n"); fflush(stdout);} + +#ifdef HAVE_GPU +#ifdef TEMPORALGAUGE + /* initialize temporal gauge here */ + int retval; + double dret; + double plaquette = 0.0; + + if(usegpu_flag){ + + /* need VOLUME here (not N=VOLUME/2)*/ + if((retval=init_temporalgauge_trafo(VOLUME, g_gauge_field)) !=0){ + if(g_proc_id == 0) printf("Error while gauge fixing to temporal gauge. Aborting...\n"); + exit(200); + } + plaquette = measure_plaquette(g_gauge_field); + if(g_proc_id == 0) printf("Plaquette before gauge fixing: %.16e\n", plaquette/6./VOLUME); + /* do trafo */ + apply_gtrafo(g_gauge_field, g_trafo); + plaquette = measure_plaquette(g_gauge_field); + if(g_proc_id == 0) printf("Plaquette after gauge fixing: %.16e\n", plaquette/6./VOLUME); + + /* do trafo to odd part of source */ + dret = square_norm(Odd, VOLUME/2 , 1); + if(g_proc_id == 0) printf("square norm before gauge fixing: %.16e\n", dret); + apply_gtrafo_spinor_odd(Odd, g_trafo); + dret = square_norm(Odd, VOLUME/2, 1); + if(g_proc_id == 0) printf("square norm after gauge fixing: %.16e\n", dret); + + /* do trafo to even part of source */ + dret = square_norm(Even, VOLUME/2 , 1); + if(g_proc_id == 0) printf("square norm before gauge fixing: %.16e\n", dret); + apply_gtrafo_spinor_even(Even, g_trafo); + dret = square_norm(Even, VOLUME/2, 1); + if(g_proc_id == 0) printf("square norm after gauge fixing: %.16e\n", dret); + } +#endif +#endif /* HAVE_GPU*/ + + + assign_mul_one_pm_imu_inv(Even_new, Even, +1., VOLUME/2); + + Hopping_Matrix(OE, g_spinor_field[DUM_DERI], Even_new); + /* The sign is plus, since in Hopping_Matrix */ + /* the minus is missing */ + assign_mul_add_r(g_spinor_field[DUM_DERI], +1., Odd, VOLUME/2); + /* Do the inversion with the 
preconditioned */ + /* matrix to get the odd sites */ + + if(solver_flag == BICGSTAB) { + if(g_proc_id == 0) {printf("# Using BiCGstab!\n"); fflush(stdout);} + mul_one_pm_imu_inv(g_spinor_field[DUM_DERI], +1., VOLUME/2); + iter = bicgstab_complex(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME/2, &Mtm_plus_sym_psi); + } + else if(solver_flag == GMRES) { + if(g_proc_id == 0) {printf("# Using GMRES! m = %d\n", gmres_m_parameter); fflush(stdout);} + mul_one_pm_imu_inv(g_spinor_field[DUM_DERI], +1., VOLUME/2); + iter = gmres(Odd_new, g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME/2, 1, &Mtm_plus_sym_psi); + } + else if(solver_flag == GCR) { + if(g_proc_id == 0) {printf("# Using GCR! m = %d\n", gmres_m_parameter); fflush(stdout);} + mul_one_pm_imu_inv(g_spinor_field[DUM_DERI], +1., VOLUME/2); + iter = gcr(Odd_new, g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME/2, 0, &Mtm_plus_sym_psi); + } + else if(solver_flag == GMRESDR) { + if(g_proc_id == 0) {printf("# Using GMRES-DR! 
m = %d, NrEv = %d\n", + gmres_m_parameter, gmresdr_nr_ev); fflush(stdout);} + mul_one_pm_imu_inv(g_spinor_field[DUM_DERI], +1., VOLUME/2); + iter = gmres_dr(Odd_new, g_spinor_field[DUM_DERI], gmres_m_parameter, gmresdr_nr_ev, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME/2, &Mtm_plus_sym_psi); + } + else if(solver_flag == FGMRES) { + if(g_proc_id == 0) {printf("# Using FGMRES!\n"); fflush(stdout);} + iter = fgmres(Odd_new, g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME/2, 0, &Qtm_pm_psi); + gamma5(Odd_new, Odd_new, VOLUME/2); + Qtm_minus_psi(Odd_new, Odd_new); + } + else if(solver_flag == BICGSTABELL) { + if(g_proc_id == 0) {printf("# Using BiCGstab2!\n"); fflush(stdout);} + mul_one_pm_imu_inv(g_spinor_field[DUM_DERI], +1., VOLUME/2); + iter = bicgstabell(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, 3, VOLUME/2, &Mtm_plus_sym_psi); + } + else if(solver_flag == PCG) { + /* Here we invert the hermitean operator squared */ + gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2); + if(g_proc_id == 0) {printf("# Using PCG!\n"); fflush(stdout);} + iter = pcg_her(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME/2, &Qtm_pm_psi); + Qtm_minus_psi(Odd_new, Odd_new); + } + else if(solver_flag == INCREIGCG) { + /* Here we invert the hermitean operator squared */ + gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2); + if(g_proc_id == 0) {printf("# Using Incremental Eig-CG!\n"); fflush(stdout);} + iter = incr_eigcg(VOLUME/2,solver_params.eigcg_nrhs,solver_params.eigcg_nrhs1, Odd_new, g_spinor_field[DUM_DERI], solver_params.eigcg_ldh, &Qtm_pm_psi, + solver_params.eigcg_tolsq1, solver_params.eigcg_tolsq, solver_params.eigcg_restolsq , solver_params.eigcg_rand_guess_opt, + rel_prec, max_iter, solver_params.eigcg_nev, solver_params.eigcg_vmax); + Qtm_minus_psi(Odd_new, Odd_new); + } + else if(solver_flag == MIXEDCG) { + /* Here we invert the 
hermitean operator squared */ + gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2); + if(g_proc_id == 0) {printf("# Using Mixed Precision CG!\n"); fflush(stdout);} + iter = mixed_cg_her(Odd_new, g_spinor_field[DUM_DERI], solver_params, max_iter, precision, rel_prec, + VOLUME/2, &Qtm_pm_psi, &Qtm_pm_psi_32); + Qtm_minus_psi(Odd_new, Odd_new); + } + else if(solver_flag == RGMIXEDCG) { + /* Here we invert the hermitean operator squared */ + gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2); + if(g_proc_id == 0) {printf("# Using Mixed Precision CG!\n"); fflush(stdout);} + iter = rg_mixed_cg_her(Odd_new, g_spinor_field[DUM_DERI], solver_params, max_iter, precision, rel_prec, + VOLUME/2, &Qtm_pm_psi, &Qtm_pm_psi_32); + Qtm_minus_psi(Odd_new, Odd_new); + } + else if(solver_flag == CG) { + /* Here we invert the hermitean operator squared */ + gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2); + if(g_proc_id == 0) { + printf("# Using CG!\n"); + printf("# mu = %f, kappa = %f\n", g_mu/2./g_kappa, g_kappa); + fflush(stdout); + } +#ifdef HAVE_GPU + if(usegpu_flag){ + if(g_proc_id == 0) printf("Using GPU for inversion\n"); + iter = mixed_solve_eo(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME/2); + }else{ + iter = cg_her(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME/2, &Qtm_pm_psi); + Qtm_minus_psi(Odd_new, Odd_new); + } +#else + iter = cg_her(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, + VOLUME/2, &Qtm_pm_psi); + Qtm_minus_psi(Odd_new, Odd_new); +#endif /*HAVE_GPU*/ + } + else if(solver_flag == MR) { + if(g_proc_id == 0) {printf("# Using MR!\n"); fflush(stdout);} + iter = mr(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME/2, 1, &Mtm_plus_psi); + } + else if(solver_flag == CGS) { + if(g_proc_id == 0) {printf("# Using CGS!\n"); fflush(stdout);} + mul_one_pm_imu_inv(g_spinor_field[DUM_DERI], +1., VOLUME/2); + iter = 
cgs_real(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME/2, &Mtm_plus_sym_psi); + } + else { + if(g_proc_id == 0) {printf("# Using CG as default solver!\n"); fflush(stdout);} + gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2); +#ifdef HAVE_GPU + if(g_proc_id == 0) {printf("Using GPU for inversion\n"); + fflush(stdout);} + iter = mixed_solve_eo(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME/2); +#else + iter = cg_her(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME/2, &Qtm_pm_psi); + Qtm_minus_psi(Odd_new, Odd_new); +#endif + } + + /* In case of failure, redo with CG */ + if(iter == -1 && solver_flag !=CG) { + /* Here we invert the hermitean operator squared */ + mul_one_pm_imu(g_spinor_field[DUM_DERI], +1.); + gamma5(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], VOLUME/2); + if(g_proc_id == 0) {printf("# Redoing it with CG!\n"); fflush(stdout);} + iter = cg_her(Odd_new, g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME/2, &Qtm_pm_psi); + Qtm_minus_psi(Odd_new, Odd_new); + } + + /* Reconstruct the even sites */ + Hopping_Matrix(EO, g_spinor_field[DUM_DERI], Odd_new); + mul_one_pm_imu_inv(g_spinor_field[DUM_DERI], +1., VOLUME/2); + /* The sign is plus, since in Hopping_Matrix */ + /* the minus is missing */ + assign_add_mul_r(Even_new, g_spinor_field[DUM_DERI], +1., VOLUME/2); + +#ifdef HAVE_GPU + /* return from temporal gauge again */ +#ifdef TEMPORALGAUGE + if(usegpu_flag){ + plaquette = measure_plaquette(g_gauge_field); + if(g_proc_id == 0) printf("Plaquette before inverse gauge fixing: %.16e\n", plaquette/6./VOLUME); + + /* undo trafo */ + + /*apply_inv_gtrafo(g_gauge_field, g_trafo);*/ + /* copy back the saved original field located in g_tempgauge_field -> update necessary*/ + copy_gauge_field(g_gauge_field, g_tempgauge_field); + g_update_gauge_copy = 1; + + + plaquette = measure_plaquette(g_gauge_field); + if(g_proc_id == 0) printf("Plaquette after 
inverse gauge fixing: %.16e\n", plaquette/6./VOLUME); + + /* undo trafo to source (Even, Odd) */ + dret = square_norm(Even, VOLUME/2 , 1); + if(g_proc_id == 0) printf("square norm before gauge fixing: %.16e\n", dret); + apply_inv_gtrafo_spinor_even(Even, g_trafo); + dret = square_norm(Even, VOLUME/2, 1); + if(g_proc_id == 0) printf("square norm after gauge fixing: %.16e\n", dret); + dret = square_norm(Odd, VOLUME/2 , 1); + if(g_proc_id == 0) printf("square norm before gauge fixing: %.16e\n", dret); + apply_inv_gtrafo_spinor_odd(Odd, g_trafo); + dret = square_norm(Odd, VOLUME/2, 1); + if(g_proc_id == 0) printf("square norm after gauge fixing: %.16e\n", dret); + + + dret = square_norm(Even_new, VOLUME/2 , 1); + if(g_proc_id == 0) printf("square norm before gauge fixing: %.16e\n", dret); + apply_inv_gtrafo_spinor_even(Even_new, g_trafo); + dret = square_norm(Even_new, VOLUME/2, 1); + if(g_proc_id == 0) printf("square norm after gauge fixing: %.16e\n", dret); + + dret = square_norm(Odd_new, VOLUME/2 , 1); + if(g_proc_id == 0) printf("square norm before gauge fixing: %.16e\n", dret); + apply_inv_gtrafo_spinor_odd(Odd_new, g_trafo); + dret = square_norm(Odd_new, VOLUME/2, 1); + if(g_proc_id == 0) printf("square norm after gauge fixing: %.16e\n", dret); + + + finalize_temporalgauge(); + } +#endif +#endif + + + } + + else { + /* here comes the inversion not using even/odd preconditioning */ + if(g_proc_id == 0) {printf("# Not using even/odd preconditioning!\n"); fflush(stdout);} + convert_eo_to_lexic(g_spinor_field[DUM_DERI], Even, Odd); + convert_eo_to_lexic(g_spinor_field[DUM_DERI+1], Even_new, Odd_new); + + if(solver_flag == BICGSTAB) { + if(g_proc_id == 0) {printf("# Using BiCGstab!\n"); fflush(stdout);} + if(use_preconditioning==1 && g_precWS!=NULL){ + if(g_proc_id == 0) {printf("# Using preconditioning (which one?)!\n");} + iter = bicgstab_complex(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME, &D_psi_prec); + } else { + 
if(g_proc_id == 0) {printf("# Not using preconditioning (which one?)!\n");} + iter = bicgstab_complex(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME, &D_psi); + } + } + else if(solver_flag == CGS) { + if(g_proc_id == 0) {printf("# Using CGS!\n"); fflush(stdout);} + + if(use_preconditioning==1 && g_precWS!=NULL){ + if(g_proc_id == 0) {printf("# Using preconditioning (which one?)!\n");} + iter = cgs_real(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME, &D_psi_prec); + } else { + if(g_proc_id == 0) {printf("# Not using preconditioning (which one?)!\n");} + iter = cgs_real(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME, &D_psi); + } + + + } + else if(solver_flag == GMRES) { + if(g_proc_id == 0) {printf("# Using GMRES! m = %d\n", gmres_m_parameter); fflush(stdout);} + + if(use_preconditioning==1 && g_precWS!=NULL){ + if(g_proc_id == 0) {printf("# Using preconditioning (which one?)!\n");} + iter = gmres(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 1, &D_psi_prec); + } else { + if(g_proc_id == 0) {printf("# not using preconditioning (which one?)!\n");} + iter = gmres(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 1, &D_psi); + } + } + else if(solver_flag == MIXEDCG) { + if(g_proc_id == 0) {printf("# Using MIXEDCG!\n"); fflush(stdout);} + gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME); + iter = mixed_cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], solver_params, max_iter, + precision, rel_prec, VOLUME, &Q_pm_psi, &Q_pm_psi_32); + Q_minus_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]); + } else if(solver_flag == RGMIXEDCG) { + if(g_proc_id == 0) {printf("# Using MIXEDCG!\n"); fflush(stdout);} + gamma5(g_spinor_field[DUM_DERI+1], 
g_spinor_field[DUM_DERI], VOLUME); + iter = rg_mixed_cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], solver_params, max_iter, + precision, rel_prec, VOLUME, &Q_pm_psi, &Q_pm_psi_32); + Q_minus_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]); + } else if(solver_flag == FGMRES) { + if(g_proc_id == 0) {printf("# Using FGMRES! m = %d\n", gmres_m_parameter); fflush(stdout);} + iter = fgmres(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 1, &D_psi); + /* gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME); */ + /* iter = fgmres(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, &Q_pm_psi); */ + /* Q_minus_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]); */ + } + else if(solver_flag == GCR) { + if(g_proc_id == 0) {printf("# Using GCR! m = %d\n", gmres_m_parameter); fflush(stdout);} + iter = gcr(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 1, &D_psi); + /* gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME); */ + /* iter = gcr(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], gmres_m_parameter, max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, &Q_pm_psi); */ + /* Q_minus_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]); */ + } + else if(solver_flag == DFLGCR || solver_flag == DFLFGMRES) { + if(g_proc_id == 0) {printf("# Using deflated solver! 
m = %d\n", gmres_m_parameter); fflush(stdout);} + /* apply P_L to source */ + project_left(g_spinor_field[DUM_DERI+2], g_spinor_field[DUM_DERI]); + if(g_proc_id == 0) printf("# Applied P_L to source\n"); + /* invert P_L D on source -> chi */ + if(solver_flag == DFLGCR) { + iter = gcr(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2], gmres_m_parameter, + max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 1, &project_left_D); + } + else { + iter = fgmres(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2], gmres_m_parameter, + max_iter/gmres_m_parameter, precision, rel_prec, VOLUME, 1, &project_left_D); + } + /* apply P_R to chi */ + project_right(g_spinor_field[DUM_DERI+2], g_spinor_field[DUM_DERI+1]); + if(g_proc_id == 0) printf("# Applied P_R to solution\n"); + /* reconstruct solution */ + project(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]); + add(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2], VOLUME); + } + else if (solver_flag == CGMMS) { + /* FIXME temporary workaround for the multiple masses interface */ + double * shifts = (double*)calloc(no_extra_masses+1,sizeof(double)); + shifts[0]=g_mu; + for(int i = 0; i < no_extra_masses; ++i) + shifts[i+1] = extra_masses[i]; + g_mu = 0; + solver_pm_t solver_params; + solver_params.shifts = shifts; + solver_params.no_shifts = no_extra_masses+1; + solver_params.rel_prec = rel_prec; + solver_params.max_iter = max_iter; + solver_params.squared_solver_prec = precision; + solver_params.sdim = VOLUME; + solver_params.M_psi = &Q_pm_psi; + solver_params.type = solver_flag; + + /* FIXME temporary workaround for the multiple shift solver interface and integration of IO */ + spinor * P_memory; + spinor ** P; + allocate_spinor_field_array(&P,&P_memory,VOLUME,no_extra_masses+1); + + if(g_proc_id == 0) {printf("# Using multi mass CG!\n"); fflush(stdout);} + + gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME); + iter = cg_mms_tm(P, 
g_spinor_field[DUM_DERI+1],&solver_params,&cgmms_reached_prec); + g_mu = shifts[0]; + Q_minus_psi(g_spinor_field[DUM_DERI+1], P[0]); + + cgmms_write_props(P,shifts,no_extra_masses+1,id,iter); + + free_spinor_field_array(&P_memory); + free(P); + free(shifts); + } + else { + if(g_proc_id == 0) {printf("# Using CG!\n"); fflush(stdout);} +#ifdef HAVE_GPU + if(usegpu_flag){ + if(g_proc_id == 0) printf("# Using GPU for inversion\n"); + iter = mixed_solve(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], max_iter, precision, rel_prec, VOLUME); + } + else{ + gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME); + iter = cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], max_iter, precision, + rel_prec, VOLUME, &Q_pm_psi); + Q_minus_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]); + } +#else + gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME); + + if(use_preconditioning==1 && g_precWS!=NULL){ + spinorPrecWS *ws=(spinorPrecWS*)g_precWS; + static _Complex double alpha = 0.0; + if(g_proc_id==0) {printf("# Using preconditioning (which one?)!\n");} + + if(g_prec_sequence_d_dagger_d[2] != 0.0){ + alpha = g_prec_sequence_d_dagger_d[2]; + spinorPrecondition(g_spinor_field[DUM_DERI+1],g_spinor_field[DUM_DERI+1],ws,T,L,alpha,0,1); + } + + iter = cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], max_iter, precision, + rel_prec, VOLUME, &Q_pm_psi_prec); + + if(g_prec_sequence_d_dagger_d[0] != 0.0){ + alpha = g_prec_sequence_d_dagger_d[0]; + spinorPrecondition(g_spinor_field[DUM_DERI],g_spinor_field[DUM_DERI],ws,T,L,alpha,0,1); + } + + } else { + if(g_proc_id==0) {printf("# Not using preconditioning!\n");} + iter = cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], max_iter, precision, + rel_prec, VOLUME, &Q_pm_psi); + } + + + Q_minus_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]); + + if(use_preconditioning==1 && g_precWS!=NULL){ + spinorPrecWS *ws=(spinorPrecWS*)g_precWS; + static _Complex 
double alpha = 0.0; + if(g_prec_sequence_d_dagger_d[1] != 0.0){ + alpha = g_prec_sequence_d_dagger_d[1]; + spinorPrecondition(g_spinor_field[DUM_DERI+1],g_spinor_field[DUM_DERI+1],ws,T,L,alpha,0,1); + } + } +#endif + } + convert_lexic_to_eo(Even_new, Odd_new, g_spinor_field[DUM_DERI+1]); + } + return(iter); +} + +/* FIXME temporary solution for the writing of CGMMS propagators until the input/output interface for + invert_eo has been generalized + NOTE that no_shifts = no_extra_masses+1 */ +static void cgmms_write_props(spinor ** const P, double const * const shifts, const int no_shifts, const int id, const int iteration) { + int append = 0; + char filename[300]; + WRITER * writer = NULL; + paramsInverterInfo *inverterInfo = NULL; + paramsPropagatorFormat *propagatorFormat = NULL; + + spinor * temp_eo_spinors_memory; + spinor ** temp_eo_spinors; + + allocate_spinor_field_array(&temp_eo_spinors, &temp_eo_spinors_memory, VOLUME/2, 2); + + /* save all the results of (Q^dagger Q)^(-1) \gamma_5 \phi */ + for(int im = 0; im < no_shifts; im++) { + if(SourceInfo.type != 1) { + if (PropInfo.splitted) { + if(T_global > 99) sprintf(filename, "%s.%.2d.%.4d.%.3d.%.2d.cgmms.%.2d.inverted", SourceInfo.basename, id, SourceInfo.nstore, SourceInfo.t, SourceInfo.ix, im); + else sprintf(filename, "%s.%.2d.%.4d.%.2d.%.2d.cgmms.%.2d.inverted", SourceInfo.basename, id, SourceInfo.nstore, SourceInfo.t, SourceInfo.ix, im); + } else { + sprintf(filename, "%s.%.2d.%.4d.%.2d.cgmms.%.2d.inverted", SourceInfo.basename, id, SourceInfo.nstore, SourceInfo.t, im); + } + } else { + sprintf(filename, "%s.%.2d.%.4d.%.5d.cgmms.%.2d.0", SourceInfo.basename, id, SourceInfo.nstore, SourceInfo.sample, im); + } + + if(g_kappa != 0) { + mul_r(P[im], (2*g_kappa)*(2*g_kappa), P[im], VOLUME); + } + + append = !PropInfo.splitted; + + construct_writer(&writer, filename, append); + + if (PropInfo.splitted || SourceInfo.ix == index_start) { + //Create the inverter info NOTE: always set to TWILSON=12 and 1 flavour 
(to be adjusted) + inverterInfo = construct_paramsInverterInfo(cgmms_reached_prec, iteration+1, 12, 1); + inverterInfo->cgmms_mass = shifts[im]/(2 * inverterInfo->kappa); + write_spinor_info(writer, PropInfo.format, inverterInfo, append); + //Create the propagatorFormat NOTE: always set to 1 flavour (to be adjusted) + propagatorFormat = construct_paramsPropagatorFormat(PropInfo.precision, 1); + write_propagator_format(writer, propagatorFormat); + free(inverterInfo); + free(propagatorFormat); + } + convert_lexic_to_eo(temp_eo_spinors[1], temp_eo_spinors[0], P[im]); + write_spinor(writer, &temp_eo_spinors[1], &temp_eo_spinors[0], 1, PropInfo.precision); + destruct_writer(writer); + } + free_spinor_field_array(&temp_eo_spinors_memory); + free(temp_eo_spinors); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/invert_eo.h b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_eo.h new file mode 100644 index 0000000000000000000000000000000000000000..d64bb3c0edf4b176fdb345aafe382573f27f1afe --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_eo.h @@ -0,0 +1,40 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + * invert_eo makes an inversion with EO precoditioned + * tm Operator + * + * Author: Carsten Urbach + * urbach@physik.fu-berlin.de + * + ***********************************************************************/ + +#ifndef _INVERT_EO_H +#define _INVERT_EO_H +#include "global.h" +#include "solver/solver_params.h" + +int invert_eo(spinor * const Even_new, spinor * const Odd_new, + spinor * const Even, spinor * const Odd, + const double precision, const int iter_max, + const int solver_flag, const int rel_prec, + const int sub_evs_flag, const int even_odd_flag, + const int no_extra_masses, double * const extra_masses, solver_params_t solver_params, const int id, + const ExternalInverter inverter, const SloppyPrecision sloppy, const CompressionType compression ); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/invert_overlap.c b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_overlap.c new file mode 100644 index 0000000000000000000000000000000000000000..c5e308f8b35bc979c600f94d8a981d36469b5325 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_overlap.c @@ -0,0 +1,104 @@ +/*********************************************************************** + * + * Copyright (C) 2009 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include "global.h" +#include "solver/sumr.h" +#include "solver/cgs_real.h" +#include "operator.h" +#include "invert_overlap.h" +#include "operator/Dov_psi.h" +#include "linalg_eo.h" +#include "read_input.h" +#include "operator/tm_operators.h" +#include "gamma.h" +#include "solver/cg_her.h" + + +void invert_overlap(const int op_id, const int index_start) { + operator * optr; + void (*op)(spinor*,spinor*); + static _Complex double alpha = 0.; + spinorPrecWS *ws; + optr = &operator_list[op_id]; + op=&Dov_psi; + + /* here we need to (re)compute the kernel eigenvectors */ + /* for new gauge fields */ + + if(g_proc_id == 0) {printf("# Not using even/odd preconditioning!\n"); fflush(stdout);} + convert_eo_to_lexic(g_spinor_field[DUM_DERI], optr->sr0, optr->sr1); + convert_eo_to_lexic(g_spinor_field[DUM_DERI+1], optr->prop0, optr->prop1); + + if(optr->solver == 13 ){ + optr->iterations = sumr(g_spinor_field[DUM_DERI+1],g_spinor_field[DUM_DERI] , optr->maxiter, optr->eps_sq); + } + else if(optr->solver == 1 /* CG */) { + + gamma5(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI], VOLUME); + + if(use_preconditioning==1 && g_precWS!=NULL){ + ws=(spinorPrecWS*)g_precWS; + printf("# Using preconditioning (which one?)!\n"); + + alpha = ws->precExpo[2]; + spinorPrecondition(g_spinor_field[DUM_DERI+1],g_spinor_field[DUM_DERI+1],ws,T,L,alpha,0,1); + + /* iter = cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], max_iter, precision, */ + /* rel_prec, VOLUME, &Q_pm_psi_prec); */ + optr->iterations = cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], optr->maxiter, optr->eps_sq, + optr->rel_prec, VOLUME, &Qov_sq_psi_prec); + + alpha = ws->precExpo[0]; + spinorPrecondition(g_spinor_field[DUM_DERI],g_spinor_field[DUM_DERI],ws,T,L,alpha,0,1); + + } + else { + printf("# Not using preconditioning (which one?)!\n"); + /* iter = 
cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], max_iter, precision, */ + /* rel_prec, VOLUME, &Q_pm_psi); */ + optr->iterations = cg_her(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], optr->maxiter, optr->eps_sq, + optr->rel_prec, VOLUME, &Qov_sq_psi); + } + + + Qov_psi(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI]); + + if(use_preconditioning == 1 && g_precWS!=NULL){ + ws=(spinorPrecWS*)g_precWS; + alpha = ws->precExpo[1]; + spinorPrecondition(g_spinor_field[DUM_DERI+1],g_spinor_field[DUM_DERI+1],ws,T,L,alpha,0,1); + } + + } + + op(g_spinor_field[4],g_spinor_field[DUM_DERI+1]); + + convert_eo_to_lexic(g_spinor_field[DUM_DERI], optr->sr0, optr->sr1); + + optr->reached_prec=diff_and_square_norm(g_spinor_field[4],g_spinor_field[DUM_DERI],VOLUME); + + convert_lexic_to_eo(optr->prop0, optr->prop1 , g_spinor_field[DUM_DERI+1]); + + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/invert_overlap.h b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_overlap.h new file mode 100644 index 0000000000000000000000000000000000000000..25d657a49cfb0572ac8d5bc582bae9cff3c08b17 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/invert_overlap.h @@ -0,0 +1,25 @@ +/*********************************************************************** + * Copyright (C) 2009 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _INVERT_OVERLAP_H +#define _INVERT_OVERLAP_H + +void invert_overlap(const int op_id, const int index_start); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/DML_crc32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/DML_crc32.c new file mode 100644 index 0000000000000000000000000000000000000000..b576e4cf14e13bc72ae375da390feef09aa5eb61 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/DML_crc32.c @@ -0,0 +1,206 @@ +/* DML_crc32.c */ +/* Taken from QIO library by mcneile */ + +/* Taken from the GNU CVS distribution and + modified for SciDAC use C. DeTar 10/11/2003 */ + +/* crc32.c -- compute the CRC-32 of a data stream + * Copyright (C) 1995-1996 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + */ + +/* Copyright notice reproduced from zlib.h -- (C. DeTar) + + version 1.0.4, Jul 24th, 1996. + + Copyright (C) 1995-1996 Jean-loup Gailly and Mark Adler + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. 
/* zlib-style type aliases used by the CRC code below */
typedef uint32_t uLong; /* At least 32 bits */
typedef unsigned char Byte;
typedef Byte Bytef;
typedef uLong uLongf;
#define Z_NULL 0 /* for initializing zalloc, zfree, opaque */

#define local static

#ifdef DYNAMIC_CRC_TABLE

local int crc_table_empty = 1;
local uLongf crc_table[256];
/* plain prototype: the zlib OF() wrapper macro is not defined in this file */
local void make_crc_table(void);

/*
  Generate a table for a byte-wise 32-bit CRC calculation on the polynomial:
  x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.

  Polynomials over GF(2) are represented in binary, one bit per coefficient,
  with the lowest powers in the most significant bit.  Then adding polynomials
  is just exclusive-or, and multiplying a polynomial by x is a right shift by
  one.  If we call the above polynomial p, and represent a byte as the
  polynomial q, also with the lowest power in the most significant bit (so the
  byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p,
  where a mod b means the remainder after dividing a by b.

  This calculation is done using the shift-register method of multiplying and
  taking the remainder.  The register is initialized to zero, and for each
  incoming bit, x^32 is added mod p to the register if the bit is a one (where
  x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by
  x (which is shifting right by one and adding x^32 mod p if the bit shifted
  out is a one).  We start with the highest power (least significant bit) of
  q and repeat for all eight bits of q.

  The table is simply the CRC of all possible eight bit values.  This is all
  the information needed to generate CRC's on data a byte at a time for all
  combinations of CRC register values and incoming bytes.
*/
local void make_crc_table(void)
{
  uLong c;
  int n, k;
  size_t i;
  uLong poly; /* polynomial exclusive-or pattern */
  /* terms of polynomial defining this crc (except x^32): */
  static const Byte p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};

  /* make exclusive-or pattern from polynomial (0xedb88320L) */
  poly = 0;
  for (i = 0; i < sizeof(p)/sizeof(p[0]); i++)
    poly |= (uLong)1 << (31 - p[i]); /* unsigned shift: bit 31 is set for p[i] == 0 */

  for (n = 0; n < 256; n++)
  {
    c = (uLong)n;
    for (k = 0; k < 8; k++)
      c = (c & 1) ? poly ^ (c >> 1) : c >> 1;
    crc_table[n] = c;
  }
  crc_table_empty = 0;
}
#else
/* ========================================================================
 * Table of CRC-32's of all single-byte values (made by make_crc_table)
 */
local const uLongf crc_table[256] = {
  0x00000000L, 0x77073096L, 0xee0e612cL, 0x990951baL, 0x076dc419L,
  0x706af48fL, 0xe963a535L, 0x9e6495a3L, 0x0edb8832L, 0x79dcb8a4L,
  0xe0d5e91eL, 0x97d2d988L, 0x09b64c2bL, 0x7eb17cbdL, 0xe7b82d07L,
  0x90bf1d91L, 0x1db71064L, 0x6ab020f2L, 0xf3b97148L, 0x84be41deL,
  0x1adad47dL, 0x6ddde4ebL, 0xf4d4b551L, 0x83d385c7L, 0x136c9856L,
  0x646ba8c0L, 0xfd62f97aL, 0x8a65c9ecL, 0x14015c4fL, 0x63066cd9L,
  0xfa0f3d63L, 0x8d080df5L, 0x3b6e20c8L, 0x4c69105eL, 0xd56041e4L,
  0xa2677172L, 0x3c03e4d1L, 0x4b04d447L, 0xd20d85fdL, 0xa50ab56bL,
  0x35b5a8faL, 0x42b2986cL, 0xdbbbc9d6L, 0xacbcf940L, 0x32d86ce3L,
  0x45df5c75L, 0xdcd60dcfL, 0xabd13d59L, 0x26d930acL, 0x51de003aL,
  0xc8d75180L, 0xbfd06116L, 0x21b4f4b5L, 0x56b3c423L, 0xcfba9599L,
  0xb8bda50fL, 0x2802b89eL, 0x5f058808L, 0xc60cd9b2L, 0xb10be924L,
  0x2f6f7c87L, 0x58684c11L, 0xc1611dabL, 0xb6662d3dL, 0x76dc4190L,
  0x01db7106L, 0x98d220bcL, 0xefd5102aL, 0x71b18589L, 0x06b6b51fL,
  0x9fbfe4a5L, 0xe8b8d433L, 0x7807c9a2L, 0x0f00f934L, 0x9609a88eL,
  0xe10e9818L, 0x7f6a0dbbL, 0x086d3d2dL, 0x91646c97L, 0xe6635c01L,
  0x6b6b51f4L, 0x1c6c6162L, 0x856530d8L, 0xf262004eL, 0x6c0695edL,
  0x1b01a57bL, 0x8208f4c1L, 0xf50fc457L, 0x65b0d9c6L, 0x12b7e950L,
  0x8bbeb8eaL, 0xfcb9887cL, 0x62dd1ddfL, 0x15da2d49L, 0x8cd37cf3L,
  0xfbd44c65L, 0x4db26158L, 0x3ab551ceL, 0xa3bc0074L, 0xd4bb30e2L,
  0x4adfa541L, 0x3dd895d7L, 0xa4d1c46dL, 0xd3d6f4fbL, 0x4369e96aL,
  0x346ed9fcL, 0xad678846L, 0xda60b8d0L, 0x44042d73L, 0x33031de5L,
  0xaa0a4c5fL, 0xdd0d7cc9L, 0x5005713cL, 0x270241aaL, 0xbe0b1010L,
  0xc90c2086L, 0x5768b525L, 0x206f85b3L, 0xb966d409L, 0xce61e49fL,
  0x5edef90eL, 0x29d9c998L, 0xb0d09822L, 0xc7d7a8b4L, 0x59b33d17L,
  0x2eb40d81L, 0xb7bd5c3bL, 0xc0ba6cadL, 0xedb88320L, 0x9abfb3b6L,
  0x03b6e20cL, 0x74b1d29aL, 0xead54739L, 0x9dd277afL, 0x04db2615L,
  0x73dc1683L, 0xe3630b12L, 0x94643b84L, 0x0d6d6a3eL, 0x7a6a5aa8L,
  0xe40ecf0bL, 0x9309ff9dL, 0x0a00ae27L, 0x7d079eb1L, 0xf00f9344L,
  0x8708a3d2L, 0x1e01f268L, 0x6906c2feL, 0xf762575dL, 0x806567cbL,
  0x196c3671L, 0x6e6b06e7L, 0xfed41b76L, 0x89d32be0L, 0x10da7a5aL,
  0x67dd4accL, 0xf9b9df6fL, 0x8ebeeff9L, 0x17b7be43L, 0x60b08ed5L,
  0xd6d6a3e8L, 0xa1d1937eL, 0x38d8c2c4L, 0x4fdff252L, 0xd1bb67f1L,
  0xa6bc5767L, 0x3fb506ddL, 0x48b2364bL, 0xd80d2bdaL, 0xaf0a1b4cL,
  0x36034af6L, 0x41047a60L, 0xdf60efc3L, 0xa867df55L, 0x316e8eefL,
  0x4669be79L, 0xcb61b38cL, 0xbc66831aL, 0x256fd2a0L, 0x5268e236L,
  0xcc0c7795L, 0xbb0b4703L, 0x220216b9L, 0x5505262fL, 0xc5ba3bbeL,
  0xb2bd0b28L, 0x2bb45a92L, 0x5cb36a04L, 0xc2d7ffa7L, 0xb5d0cf31L,
  0x2cd99e8bL, 0x5bdeae1dL, 0x9b64c2b0L, 0xec63f226L, 0x756aa39cL,
  0x026d930aL, 0x9c0906a9L, 0xeb0e363fL, 0x72076785L, 0x05005713L,
  0x95bf4a82L, 0xe2b87a14L, 0x7bb12baeL, 0x0cb61b38L, 0x92d28e9bL,
  0xe5d5be0dL, 0x7cdcefb7L, 0x0bdbdf21L, 0x86d3d2d4L, 0xf1d4e242L,
  0x68ddb3f8L, 0x1fda836eL, 0x81be16cdL, 0xf6b9265bL, 0x6fb077e1L,
  0x18b74777L, 0x88085ae6L, 0xff0f6a70L, 0x66063bcaL, 0x11010b5cL,
  0x8f659effL, 0xf862ae69L, 0x616bffd3L, 0x166ccf45L, 0xa00ae278L,
  0xd70dd2eeL, 0x4e048354L, 0x3903b3c2L, 0xa7672661L, 0xd06016f7L,
  0x4969474dL, 0x3e6e77dbL, 0xaed16a4aL, 0xd9d65adcL, 0x40df0b66L,
  0x37d83bf0L, 0xa9bcae53L, 0xdebb9ec5L, 0x47b2cf7fL, 0x30b5ffe9L,
  0xbdbdf21cL, 0xcabac28aL, 0x53b39330L, 0x24b4a3a6L, 0xbad03605L,
  0xcdd70693L, 0x54de5729L, 0x23d967bfL, 0xb3667a2eL, 0xc4614ab8L,
  0x5d681b02L, 0x2a6f2b94L, 0xb40bbe37L, 0xc30c8ea1L, 0x5a05df1bL,
  0x2d02ef8dL
};
#endif

/* =========================================================================
 * This function can be used by asm versions of crc32()
 */
#if 0 /* we don't need it -- avoid unused code warnings */
static const uLongf *get_crc_table(void)
{
#ifdef DYNAMIC_CRC_TABLE
  if (crc_table_empty) make_crc_table();
#endif
  return crc_table;
}
#endif

/* ========================================================================= */
/* One table step per input byte; the index is formed in unsigned arithmetic. */
#define DO1(buf) crc = crc_table[(crc ^ (*buf++)) & 0xffu] ^ (crc >> 8);
#define DO2(buf) DO1(buf); DO1(buf);
#define DO4(buf) DO2(buf); DO2(buf);
#define DO8(buf) DO4(buf); DO4(buf);

/* ========================================================================= */
/*
 * Update a running CRC-32 with len bytes from buf.
 *
 * crc : CRC of the data seen so far (0 to start a new checksum)
 * buf : input bytes; a null pointer makes the function return 0
 * len : number of bytes in buf
 *
 * Returns the updated CRC.  The value can be fed back in as crc to
 * checksum data that arrives in several pieces.
 */
uint32_t DML_crc32(uint32_t crc, const unsigned char *buf, size_t len)
{
  if (buf == Z_NULL) return 0L;
#ifdef DYNAMIC_CRC_TABLE
  if (crc_table_empty)
    make_crc_table();
#endif
  crc = crc ^ 0xffffffffL;
  /* eight bytes per pass, then the remaining tail one byte at a time */
  while (len >= 8)
  {
    DO8(buf);
    len -= 8;
  }
  if (len) do {
    DO1(buf);
  } while (--len);
  return crc ^ 0xffffffffL;
}
+ +CFLAGS = -std=c99 -fopenmp -pedantic -Wall +DEPFLAGS = -MM +LDFLAGS = -L${HOME}/lib -L${top_builddir}/lib +DEFS = -DHAVE_CONFIG_H +OPTARGS = -O + +AR = ar +RANLIB = ranlib +CC = mpicc +CCDEP = gcc +CCLD = $(CC) +LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) ${OPTARGS} -o $@ +LEX = flex +AUTOCONF = autoconf +DEFS = -DHAVE_CONFIG_H + +LEMON_AVAILABLE = 0 + +INCLUDES = -I$(HOME)/include/ -I. -I${abs_top_builddir}/ -I${abs_top_srcdir}/ -I/include/ -I/include/ +LDADD = +COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} ${OPTARGS} + +LIBRARIES = libio + +libio_TARGETS = utils_engineering \ + utils_parse_checksum_xml \ + utils_write_message \ + utils_read_message \ + gauge_write_binary \ + gauge_read_binary \ + gauge_read \ + gauge_write \ + utils_write_xlf \ + utils_write_xlf_xml \ + utils_write_ildg_format \ + utils_write_header \ + utils_write_checksum \ + utils_write_inverter_info \ + utils_kill_with_error \ + utils_construct_reader \ + utils_destruct_reader \ + utils_construct_writer \ + utils_destruct_writer \ + utils_close_writer_record \ + utils_close_reader_record \ + utils_write_first_message \ + utils_parse_propagator_type \ + utils_parse_ildgformat_xml \ + params_construct_ildgFormat \ + params_construct_propagatorFormat \ + params_construct_sourceFormat \ + params_construct_xlfInfo \ + params_construct_InverterInfo \ + spinor_write \ + spinor_read \ + spinor_write_binary \ + spinor_read_binary \ + spinor_write_info \ + spinor_write_source_format \ + spinor_write_propagator_format \ + spinor_write_propagator_type \ + utils DML_crc32 dml \ + eospinor_write \ + eospinor_read \ + io_cm \ + deri_write_stdout spinor_write_stdout sw_write_stdout + +libio_OBJECTS = $(addsuffix .o, ${libio_TARGETS}) + +# default rule + +all: Makefile dep libio.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) -g +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) +profile all-profile: all + + 
+#include dep rules + + +-include $(addsuffix .d,${libio_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) -c $< + + +# rule to make libio +libio.a: ${libio_OBJECTS} Makefile + @rm -f libio.a + @${AR} cru libio.a $(libio_OBJECTS) + @$(RANLIB) libio.a + @cp libio.a ${top_builddir}/lib/libio.a + +# rule to generate .d files + +$(addsuffix .d,$(libio_TARGETS)): %.d: ${srcdir}/%.c Makefile + @$(CCDEP) ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libio_TARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libio.a + +distclean: clean + rm -f Makefile + + +.PHONY: all dep clean compile-clean distclean debug all-debug profile all-profile diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_D/io/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..1f9672a5ff8755420d068441ef8e7e554edcf889 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/Makefile.in @@ -0,0 +1,134 @@ + +srcdir = @srcdir@ +top_builddir = @top_builddir@ +abs_top_builddir = @abs_top_builddir@ +top_srcdir = @top_srcdir@ +abs_top_srcdir = @abs_top_srcdir@ +subdir = io +builddir = @builddir@ + +CFLAGS = @CFLAGS@ +DEPFLAGS = @DEPFLAGS@ +LDFLAGS = @LDFLAGS@ +DEFS = @DEFS@ +OPTARGS = @OPTARGS@ + +AR = @AR@ +RANLIB = @RANLIB@ +CC = @CC@ +CCDEP = @CCDEP@ +CCLD = $(CC) +LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) ${OPTARGS} -o $@ +LEX = @LEX@ +AUTOCONF = @AUTOCONF@ +DEFS = @DEFS@ + +LEMON_AVAILABLE = @LEMON_AVAILABLE@ + +INCLUDES = @INCLUDES@ +LDADD = +COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} ${OPTARGS} + +LIBRARIES = libio + +libio_TARGETS = utils_engineering \ + utils_parse_checksum_xml \ + utils_write_message \ + utils_read_message 
\ + gauge_write_binary \ + gauge_read_binary \ + gauge_read \ + gauge_write \ + utils_write_xlf \ + utils_write_xlf_xml \ + utils_write_ildg_format \ + utils_write_header \ + utils_write_checksum \ + utils_write_inverter_info \ + utils_kill_with_error \ + utils_construct_reader \ + utils_destruct_reader \ + utils_construct_writer \ + utils_destruct_writer \ + utils_close_writer_record \ + utils_close_reader_record \ + utils_write_first_message \ + utils_parse_propagator_type \ + utils_parse_ildgformat_xml \ + params_construct_ildgFormat \ + params_construct_propagatorFormat \ + params_construct_sourceFormat \ + params_construct_xlfInfo \ + params_construct_InverterInfo \ + spinor_write \ + spinor_read \ + spinor_write_binary \ + spinor_read_binary \ + spinor_write_info \ + spinor_write_source_format \ + spinor_write_propagator_format \ + spinor_write_propagator_type \ + utils DML_crc32 dml \ + eospinor_write \ + eospinor_read \ + io_cm \ + deri_write_stdout spinor_write_stdout sw_write_stdout + +libio_OBJECTS = $(addsuffix .o, ${libio_TARGETS}) + +# default rule + +all: Makefile dep libio.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@ +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@ +profile all-profile: all + + +#include dep rules + + +-include $(addsuffix .d,${libio_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) -c $< + + +# rule to make libio +libio.a: ${libio_OBJECTS} Makefile + @rm -f libio.a + @${AR} cru libio.a $(libio_OBJECTS) + @$(RANLIB) libio.a + @cp libio.a ${top_builddir}/lib/libio.a + +# rule to generate .d files + +$(addsuffix .d,$(libio_TARGETS)): %.d: ${srcdir}/%.c Makefile + @$(CCDEP) ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libio_TARGETS}} + +# rules to 
clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libio.a + +distclean: clean + rm -f Makefile + + +.PHONY: all dep clean compile-clean distclean debug all-debug profile all-profile diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/deri_write_stdout.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/deri_write_stdout.c new file mode 100644 index 0000000000000000000000000000000000000000..89fa970d3dde7d8476249a90c083b286e9ece396 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/deri_write_stdout.c @@ -0,0 +1,77 @@ +/*********************************************************************** +* Copyright (C) 2012 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include "global.h" +#ifdef MPI +# include +#endif +#include "su3adj.h" +#include "io/deri_write_stdout.h" + +void deri_write_stdout(su3adj** const df) { + int X, Y, Z, t0, id = 0, ix, iy; + int coords[4]; + + for(int t = 0; t < g_nproc_t*T; t++) { + t0 = t - g_proc_coords[0]*T; + coords[0] = t / T; + for(int x = 0; x < g_nproc_x*LX; x++) { + X = x - g_proc_coords[1]*LX; + coords[1] = x / LX; + for(int y = 0; y < g_nproc_y*LY; y++) { + Y = y - g_proc_coords[2]*LY; + coords[2] = y / LY; + for(int z = 0; z < g_nproc_z*LZ; z++) { + Z = z - g_proc_coords[3]*LZ; + coords[3] = z / LZ; +#ifdef MPI + MPI_Cart_rank(g_cart_grid, coords, &id); +#endif + if(g_cart_id == id) { + ix = g_ipt[t0][X][Y][Z]; + iy = t*(g_nproc_x*LX*g_nproc_y*LY*g_nproc_z*LZ) + + x*(g_nproc_y*LY*g_nproc_z*LZ) + + y*(g_nproc_z*LZ) + z; + for(int mu = 0; mu < 4; mu++) { +/* printf(" %d %d %d %d %d, %d %d %d %d: %d %e %e %e %e %e %e %e %e\n", */ +/* iy, t, x, y, z, t0, X, Y, Z, */ +/* mu, df[ix][mu].d1, df[ix][mu].d2, */ +/* df[ix][mu].d3, df[ix][mu].d4, df[ix][mu].d5, df[ix][mu].d6, */ +/* df[ix][mu].d7, df[ix][mu].d8); */ + printf(" %d %d %d %d %d, %d %d %d %d: %d %e %e de\n", + iy, t, x, y, z, t0, X, Y, Z, + mu, df[ix][mu].d1, df[ix][mu].d2); + + fflush(stdout); + } + } +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + } + } + } + } + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/deri_write_stdout.h b/qcd/part_cpu/applications/QCD/src/kernel_D/io/deri_write_stdout.h new file mode 100644 index 0000000000000000000000000000000000000000..eb83f5b63f43941c6a73975b12874d0971e62198 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/deri_write_stdout.h @@ -0,0 +1,27 @@ +/*********************************************************************** +* Copyright (C) 2012 Carsten Urbach +* +* This file is part of tmLQCD. 
+* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#ifndef _DERI_WRITE_STDOUT_H +#define _DERI_WRITE_STDOUT_H + +#include "su3adj.h" + +void deri_write_stdout(su3adj** const df); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/dml.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/dml.c new file mode 100644 index 0000000000000000000000000000000000000000..34c2a51827dae750386b62b7b90094b54758f0a9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/dml.c @@ -0,0 +1,74 @@ +/* + A subset of the dml library for checksums + taken from QIO and adapted for tmLQCD by Carsten Urbach +*/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef MPI +# include +#endif +#include "global.h" +#include"dml.h" + + +/*------------------------------------------------------------------*/ +/* Checksum "class" */ +/* We do a crc32 sum on the site data -- then do two lexicographic- + rank-based bit rotations and XORs on the resulting crc32 + checksum */ + +/* Initialize checksums */ +void DML_checksum_init(DML_Checksum *checksum){ + checksum->suma = 0; + checksum->sumb = 0; +} + + +#ifdef MPI +int DML_global_xor(uint32_t *x) { + unsigned long work = (unsigned long)*x; + unsigned long dest; + int status; + + status = MPI_Allreduce((void *)&work, (void *)&dest, 1, + MPI_UNSIGNED_LONG, MPI_BXOR, MPI_COMM_WORLD); + + if (status == MPI_SUCCESS) { + *x = 
(uint32_t)dest; + } + return(status); +} +#else +int DML_global_xor(uint32_t *x){return(0);} +#endif + + +/* Accumulate checksums */ +void DML_checksum_accum(DML_Checksum *checksum, DML_SiteRank rank, + char *buf, size_t size){ + + DML_SiteRank rank29 = rank; + DML_SiteRank rank31 = rank; + uint32_t work = DML_crc32(0, (unsigned char*)buf, size); + + rank29 %= 29; rank31 %= 31; + + checksum->suma ^= work<>(32-rank29); + checksum->sumb ^= work<>(32-rank31); +} + + +/* Combine checksums over all nodes */ +void DML_checksum_combine(DML_Checksum *checksum){ + DML_global_xor(&checksum->suma); + DML_global_xor(&checksum->sumb); +} + +/* Add single checksum set to the total */ +void DML_checksum_peq(DML_Checksum *total, DML_Checksum *checksum){ + total->suma ^= checksum->suma; + total->sumb ^= checksum->sumb; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/dml.h b/qcd/part_cpu/applications/QCD/src/kernel_D/io/dml.h new file mode 100644 index 0000000000000000000000000000000000000000..56d374a4cf5647579285d1823a7f16324f4097d4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/dml.h @@ -0,0 +1,63 @@ +/*********************************************************************** + * + * Copyright (C) 2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/
+
+#ifndef _DML_H
+#define _DML_H
+
+/*
+ Header file for the check sum from the QIO package.
+
+*/
+
+/* NOTE(review): the header name of the #include below is missing in
+   this copy of the patch; presumably a standard header that provides
+   size_t, which the prototypes below use -- confirm against upstream. */
+#include
+
+/* NOTE(review): local definition of uint32_t; assumes unsigned int is
+   32 bits wide on the target -- confirm. */
+typedef unsigned int uint32_t;
+
+/* from qio-2.2.0/include/dml.h **/
+/* Pair of 32-bit sums that are updated together by the DML_checksum_*
+   functions declared below. */
+typedef struct {
+  uint32_t suma;
+  uint32_t sumb;
+} DML_Checksum;
+
+
+/* Rank of a lattice site, passed to DML_checksum_accum. */
+typedef uint32_t DML_SiteRank;
+
+
+/**
+ Function prototypes
+**/
+
+void DML_checksum_init(DML_Checksum *checksum) ;
+int DML_global_xor(uint32_t *x) ;
+
+void DML_checksum_accum(DML_Checksum *checksum, DML_SiteRank rank,
+                        char *buf, size_t size) ;
+
+void DML_checksum_combine(DML_Checksum *checksum) ;
+
+
+void DML_checksum_peq(DML_Checksum *total, DML_Checksum *checksum) ;
+
+uint32_t DML_crc32(uint32_t crc, const unsigned char *buf, size_t len);
+
+#endif
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/eospinor.h b/qcd/part_cpu/applications/QCD/src/kernel_D/io/eospinor.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2ce15eca9565ca2570bfeed396dfc438d6830ec
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/eospinor.h
@@ -0,0 +1,23 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+
+/* Read the "eospinor-binary-data" LIME record of filename into s.
+   Returns 0 on success and -1 when the file or the record cannot be
+   used. */
+int read_eospinor(spinor * const s, char * filename);
+/* Write s to filename as an "xlf-info" record (carrying evalue, prec
+   and nstore) followed by an "eospinor-binary-data" record.
+   Returns 0 on success and -1 when the file cannot be opened. */
+int write_eospinor(spinor * const s, char * filename,
+                   const double evalue, const double prec, const int nstore);
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/eospinor.ih b/qcd/part_cpu/applications/QCD/src/kernel_D/io/eospinor.ih
new file mode 100644
index 0000000000000000000000000000000000000000..30f9a98329aa462b187d193da758325892fa3dcf
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/eospinor.ih
@@ -0,0 +1,47 @@
+/***********************************************************************
+* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+*
+* This file is part of tmLQCD.
+*
+* tmLQCD is free software: you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation, either version 3 of the License, or
+* (at your option) any later version.
+*
+* tmLQCD is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef BENCHMARK +#include <../c-lime/include/lime.h> +#else +#include +#endif +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include +#include +#include + +#include +#include +#include diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/eospinor_read.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/eospinor_read.c new file mode 100644 index 0000000000000000000000000000000000000000..ab82fb9c25e961e9e986d35b424e7f952f46419a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/eospinor_read.c @@ -0,0 +1,107 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/
+
+#include "eospinor.ih"
+
+/*
+ * Read an even/odd spinor field from the LIME file filename.
+ *
+ * The file is scanned for the first record of type
+ * "eospinor-binary-data".  Its length must equal half the global
+ * lattice volume times sizeof(spinor).  For every local site whose
+ * global coordinate sum is even, one spinor is read, byte-swapped
+ * with be_to_cpu_assign and stored at s[g_lexic2eosub[g_ipt[t][x][y][z]]].
+ *
+ * Returns 0 on success and -1 if the file cannot be opened, no
+ * matching record is found or the record has the wrong length.  The
+ * reader and the file are released on every return path.  A LIME read
+ * error inside the data loop aborts the program (exit code 500).
+ */
+int read_eospinor(spinor * const s, char * filename) {
+  FILE * ifs;
+  int t, x, y , z, i = 0, status=0;
+  n_uint64_t bytes;
+  char * header_type;
+  LimeReader * limereader;
+  spinor tmp[1];
+
+  if((ifs = fopen(filename, "r")) == (FILE*)NULL) {
+    if(g_proc_id == 0) {
+      fprintf(stderr, "Error opening file %s\n", filename);
+    }
+    return(-1);
+  }
+
+  limereader = limeCreateReader( ifs );
+  if( limereader == (LimeReader *)NULL ) {
+    if(g_proc_id == 0) {
+      fprintf(stderr, "Unable to open LimeReader\n");
+    }
+    /* do not leak the stream opened above */
+    fclose(ifs);
+    return(-1);
+  }
+  /* advance to the first eospinor-binary-data record */
+  while( (status = limeReaderNextRecord(limereader)) != LIME_EOF ) {
+    if(status != LIME_SUCCESS ) {
+      fprintf(stderr, "limeReaderNextRecord returned error with status = %d!\n", status);
+      status = LIME_EOF;
+      break;
+    }
+    header_type = limeReaderType(limereader);
+    if(!strcmp("eospinor-binary-data",header_type)) break;
+  }
+  if(status == LIME_EOF) {
+    if(g_proc_id == 0) {
+      fprintf(stderr, "no eospinor-binary-data record found in file %s\n",filename);
+    }
+    limeDestroyReader(limereader);
+    fclose(ifs);
+    return(-1);
+  }
+  bytes = limeReaderBytes(limereader);
+  /* compare in 64 bit: neither the record length nor the expected size
+     is guaranteed to fit into an int for large lattices */
+  if(bytes != (n_uint64_t)LX*g_nproc_x*LY*g_nproc_y*LZ*g_nproc_z*T*g_nproc_t*sizeof(spinor)/2) {
+    if(g_proc_id == 0) {
+      fprintf(stderr, "wrong length in eospinor: %d. Aborting read!\n", (int)bytes);
+    }
+    limeDestroyReader(limereader);
+    fclose(ifs);
+    return(-1);
+  }
+
+  bytes = sizeof(spinor);
+  for(x = 0; x < LX; x++) {
+    for(y = 0; y < LY; y++) {
+      for(z = 0; z < LZ; z++) {
+#if (defined MPI)
+        /* position the reader at the start of the t-line that belongs
+           to this process for the current (x,y,z) */
+        limeReaderSeek(limereader, (n_uint64_t)
+                       (g_proc_coords[0]*T+
+                        (((g_proc_coords[1]*LX+x)*g_nproc_y*LY+g_proc_coords[2]*LY+y)*g_nproc_z*LZ
+                         + g_proc_coords[3]*LZ+z)*T*g_nproc_t)*sizeof(spinor)/2,
+                       SEEK_SET);
+#endif
+        for(t = 0; t < T; t++){
+          i = g_lexic2eosub[ g_ipt[t][x][y][z] ];
+          /* only sites with an even global coordinate sum are stored */
+          if((t+x+y+z+
+              g_proc_coords[3]*LZ+g_proc_coords[2]*LY
+              +g_proc_coords[0]*T+g_proc_coords[1]*LX)%2==0) {
+
+            status = limeReaderReadData(tmp, &bytes, limereader);
+            /* check the read before touching the data */
+            if(status < 0 && status != LIME_EOR) {
+              fprintf(stderr, "LIME read error occured with status = %d while reading file %s!\n Aborting...\n", status, filename);
+#ifdef MPI
+              MPI_Abort(MPI_COMM_WORLD, 1);
+              MPI_Finalize();
+#endif
+              exit(500);
+            }
+            be_to_cpu_assign(s + i, tmp, sizeof(spinor)/8);
+          }
+        }
+      }
+    }
+  }
+  limeDestroyReader(limereader);
+  fclose(ifs);
+  return(0);
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/eospinor_write.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/eospinor_write.c
new file mode 100644
index 0000000000000000000000000000000000000000..dcbe21eddc09640d9597fa03783bc3cfadaff19c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/eospinor_write.c
@@ -0,0 +1,166 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#include "eospinor.ih"
+
+/*************************************************
+ *
+ * This routine writes an even or odd spinor-field
+ * so really of size VOLUME/2
+ *
+ * used for instance for storing eigenvectors
+ * of the preconditioned matrix
+ *
+ * The process with g_cart_id == 0 owns the file.  It
+ * writes an "xlf-info" text record followed by an
+ * "eospinor-binary-data" record.  The loop runs over
+ * the global lattice; every site with an even global
+ * coordinate sum is either written directly (site is
+ * local to process 0) or sent by its owner to
+ * process 0 with MPI_Send/MPI_Recv.
+ *
+ * Returns 0, or -1 on process 0 if the file cannot be
+ * opened.  LIME errors abort the program (exit 500).
+ *
+ * NOTE(review): on the -1 path only process 0
+ * returns; the other processes still enter the
+ * communication loop -- confirm how callers cope.
+ *
+ *************************************************/
+
+int write_eospinor(spinor * const s, char * filename,
+                   const double evalue, const double prec, const int nstore) {
+  FILE * ofs = NULL;
+  LimeWriter * limewriter = NULL;
+  LimeRecordHeader * limeheader = NULL;
+  int x, X, y, Y, z, Z, t, t0, tag=0, id=0, i=0;
+  int ME_flag=0, MB_flag=0, status=0;
+  spinor tmp[1];
+  int coords[4];
+  char message[500];
+  n_uint64_t bytes;
+#ifdef MPI
+  MPI_Status mpistatus;
+#endif
+
+  if(g_cart_id == 0){
+    /* snprintf keeps the text inside message[] whatever the length
+       of PACKAGE_VERSION or of the formatted numbers */
+    if(g_kappa > 0. || g_kappa < 0.) {
+      snprintf(message, sizeof(message), "\n eigenvalue = %e\n prec = %e\n conf nr = %d\n beta = %f, kappa = %f, mu = %f, c2_rec = %f\n hmcversion = %s",
+               evalue, prec, nstore, g_beta, g_kappa, g_mu/2./g_kappa, g_rgi_C1, PACKAGE_VERSION);
+    }
+    else {
+      snprintf(message, sizeof(message), "\n eigenvalue = %e\n prec = %e\n conf nr = %d\n beta = %f, kappa = %f, 2*kappa*mu = %f, c2_rec = %f\n hmcversion = %s",
+               evalue, prec, nstore, g_beta, g_kappa, g_mu, g_rgi_C1, PACKAGE_VERSION);
+    }
+    bytes = strlen( message );
+
+    if((ofs = fopen(filename, "w")) == (FILE*)NULL) {
+      fprintf(stderr, "Error writing eigenvector to file %s!\n", filename);
+      return(-1);
+    }
+    limewriter = limeCreateWriter( ofs );
+    if(limewriter == (LimeWriter*)NULL) {
+      fprintf(stderr, "LIME error in file %s for writing!\n Aboring...\n", filename);
+#ifdef MPI
+      MPI_Abort(MPI_COMM_WORLD, 1);
+      MPI_Finalize();
+#endif
+      exit(500);
+    }
+
+    limeheader = limeCreateHeader(MB_flag, ME_flag, "xlf-info", bytes);
+    status = limeWriteRecordHeader( limeheader, limewriter);
+    if(status < 0 ) {
+      fprintf(stderr, "LIME write header (xlf-info) error %d\n", status);
+#ifdef MPI
+      MPI_Abort(MPI_COMM_WORLD, 1);
+      MPI_Finalize();
+#endif
+      exit(500);
+    }
+    limeDestroyHeader( limeheader );
+    limeWriteRecordData(message, &bytes, limewriter);
+
+    /* size of the binary record: half the global volume; computed in
+       64 bit so that large lattices do not overflow int arithmetic */
+    bytes = (n_uint64_t)LX*g_nproc_x*LY*g_nproc_y*LZ*g_nproc_z*T*g_nproc_t*sizeof(spinor)/2;
+    MB_flag=0; ME_flag=1;
+    limeheader = limeCreateHeader(MB_flag, ME_flag, "eospinor-binary-data", bytes);
+    status = limeWriteRecordHeader( limeheader, limewriter);
+    if(status < 0 ) {
+      fprintf(stderr, "LIME write header (eospinor-binary-data) error %d\n", status);
+#ifdef MPI
+      MPI_Abort(MPI_COMM_WORLD, 1);
+      MPI_Finalize();
+#endif
+      exit(500);
+    }
+    limeDestroyHeader( limeheader );
+  }
+
+  bytes = sizeof(spinor);
+  /* x, y, z, t0 are global coordinates; X, Y, Z, t are the same
+     coordinates relative to this process and are only valid array
+     indices when the site is local (g_cart_id == id) */
+  for(x = 0; x < LX*g_nproc_x; x++){
+    X = x - g_proc_coords[1]*LX;
+    coords[1] = x / LX;
+    for(y = 0; y < LY*g_nproc_y; y++){
+      Y = y - g_proc_coords[2]*LY;
+      coords[2] = y / LY;
+      for(z = 0; z < LZ*g_nproc_z; z++){
+        Z = z - g_proc_coords[3]*LZ;
+        coords[3] = z / LZ;
+        for(t0 = 0; t0 < T*g_nproc_t; t0++){
+          t = t0 - T*g_proc_coords[0];
+          coords[0] = t0 / T;
+#ifdef MPI
+          MPI_Cart_rank(g_cart_grid, coords, &id);
+#endif
+          if((t+X+Y+Z+g_proc_coords[3]*LZ+g_proc_coords[2]*LY
+              + g_proc_coords[0]*T+g_proc_coords[1]*LX)%2 == 0) {
+            if(g_cart_id == 0) {
+              if(g_cart_id == id) {
+                /* look up the local index only for local sites; for
+                   remote sites t, X, Y, Z lie outside g_ipt */
+                i = g_lexic2eosub[ g_ipt[t][X][Y][Z] ];
+                be_to_cpu_assign(tmp, s + i , sizeof(spinor)/8);
+                status = limeWriteRecordData((void*)tmp, &bytes, limewriter);
+              }
+#ifdef MPI
+              else {
+                MPI_Recv(tmp, sizeof(spinor)/8, MPI_DOUBLE, id, tag, g_cart_grid, &mpistatus);
+                status = limeWriteRecordData((void*)tmp, &bytes, limewriter);
+              }
+#endif
+              if(status < 0 ) {
+                fprintf(stderr, "LIME write error %d\n", status);
+#ifdef MPI
+                MPI_Abort(MPI_COMM_WORLD, 1);
+                MPI_Finalize();
+#endif
+                exit(500);
+              }
+            }
+#ifdef MPI
+            else {
+              if(g_cart_id == id) {
+                i = g_lexic2eosub[ g_ipt[t][X][Y][Z] ];
+                be_to_cpu_assign(tmp, s + i, sizeof(spinor)/8);
+                MPI_Send((void*) tmp, sizeof(spinor)/8, MPI_DOUBLE, 0, tag, g_cart_grid);
+              }
+            }
+#endif
+            tag++;
+          }
+        }
+#ifdef MPI
+        MPI_Barrier(g_cart_grid);
+#endif
+        tag=0;
+      }
+    }
+  }
+  if(g_cart_id == 0) {
+    if(ferror(ofs)) {
+      fprintf(stderr, "Warning! Error while writing to file %s \n", filename);
+    }
+    limeDestroyWriter( limewriter );
+    fflush(ofs);
+    fclose(ofs);
+  }
+  return(0);
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge.h b/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge.h
new file mode 100644
index 0000000000000000000000000000000000000000..89206cbe8d0f40d9f2ff005d908b592047065238
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge.h
@@ -0,0 +1,40 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifndef _GAUGE_H
+#define _GAUGE_H
+
+/* NOTE(review): the header names of the #include lines below are
+   missing in this copy of the patch; restore them from upstream. */
+#if HAVE_CONFIG_H
+#include
+#endif
+
+#include
+#include
+#include
+
+
+/* Read the gauge configuration stored in filename into gf.
+   Returns 0 on success, -1 on any failure. */
+int read_gauge_field(char *filename, su3 ** const gf);
+/* Read the payload of the current ildg-binary-data record into gf and
+   compute its checksum.  Returns 0 on success and a negative value
+   (-1, -2 or -3) on failure. */
+int read_binary_gauge_data(READER *reader, DML_Checksum *checksum, paramsIldgFormat * ildgformat, su3 ** const gf);
+
+/* Write the gauge field to filename with prec bits of precision;
+   returns the status of write_binary_gauge_data. */
+int write_gauge_field(char * filename, int prec, paramsXlfInfo const *xlfInfo);
+int write_binary_gauge_data(WRITER * writer, const int prec, DML_Checksum * checksum);
+
+void write_ildg_format(WRITER *writer, paramsIldgFormat const *format);
+
+#endif
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge.ih b/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge.ih
new file mode 100644
index 0000000000000000000000000000000000000000..d3f78c2b1bb5b7a56f307b695b93945001639119
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge.ih
@@ -0,0 +1,48 @@
+/***********************************************************************
+* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+*
+* This file is part of tmLQCD.
+*
+* tmLQCD is free software: you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation, either version 3 of the License, or
+* (at your option) any later version.
+* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef BENCHMARK +#include <../c-lime/include/lime.h> +#else +#include +#endif +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" + +#include +#include + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge_read.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge_read.c new file mode 100644 index 0000000000000000000000000000000000000000..c8d344fcfba29e4518558498c7b36e1ae6f29f9f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge_read.c @@ -0,0 +1,186 @@ +/*********************************************************************** + * + * Copyright (C) 2009-2011 Albert Deuzeman, Siebren Reker, Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/
+
+#include "gauge.ih"
+
+extern int gauge_precision_read_flag;
+paramsGaugeInfo GaugeInfo = { 0., 0, {0,0}, NULL, NULL};
+
+/*
+ * Read a gauge configuration from the LIME/ILDG file filename into gf.
+ *
+ * All records of the file are visited.  The records "ildg-binary-data",
+ * "scidac-checksum", "xlf-info", "ildg-data-lfn" and "ildg-format" are
+ * processed, all others are skipped.  Unless g_disable_IO_checks is
+ * set, the function insists on exactly one binary, one checksum and
+ * one format record and on matching calculated/stored checksums.
+ *
+ * Side effects: fills the global GaugeInfo and sets
+ * g_update_gauge_copy to 1 on success.
+ *
+ * Returns 0 on success and -1 on any failure; the reader and the
+ * input format description are released on every return path.
+ */
+int read_gauge_field(char * filename, su3 ** const gf) {
+  int status = 0;
+  char *header_type = NULL;
+  READER *reader = NULL;
+
+  paramsIldgFormat ildgformat_read;
+  paramsIldgFormat *ildgformat_input;
+  DML_Checksum checksum_read;
+  DML_Checksum checksum_calc;
+  int DML_read_flag = 0;
+  int gauge_read_flag = 0;
+  int gauge_binary_status = 0;
+  int ildgformat_read_flag = 0;
+  /* explicit "record already seen" markers: the strings themselves are
+     freed right after parsing, so their pointer values must not be
+     inspected afterwards */
+  int checksum_record_seen = 0;
+  int ildgformat_record_seen = 0;
+  char *checksum_string = NULL;
+  char *ildgformat_string = NULL;
+
+  construct_reader(&reader, filename);
+  GaugeInfo.gaugeRead = 0;
+  ildgformat_input = construct_paramsIldgFormat(gauge_precision_read_flag);
+
+  if(g_cart_id == 0 && g_disable_IO_checks) {
+    fprintf(stdout, "# WARNING: IO CHECKS HAVE BEEN DISABLED\n");
+  }
+
+  while ((status = ReaderNextRecord(reader)) != LIME_EOF) {
+    if (status != LIME_SUCCESS) {
+      fprintf(stderr, "ReaderNextRecord returned status %d.\n", status);
+      break;
+    }
+    header_type = ReaderType(reader);
+
+    if(g_cart_id == 0 && g_debug_level > 1) {
+      fprintf(stdout, "found header %s, will now read the message\n", header_type);
+    }
+
+    if (strcmp("ildg-binary-data", header_type) == 0) {
+      if (gauge_read_flag && !g_disable_IO_checks) { /* a previous ildg-binary-data record has already been read from this file */
+        fprintf(stderr, "In gauge file %s, multiple LIME records with name: \"ildg-binary-data\" found.\n", filename);
+        fprintf(stderr, "Unable to verify integrity of the gauge field data.\n");
+        destruct_reader(reader);
+        free(ildgformat_input);
+        return(-1);
+      }
+      gauge_binary_status = read_binary_gauge_data(reader, &checksum_calc, ildgformat_input, gf);
+      if (gauge_binary_status) {
+        fprintf(stderr, "Gauge file reading failed at binary part, unable to proceed.\n");
+        destruct_reader(reader);
+        free(ildgformat_input);
+        return(-1);
+      }
+      gauge_read_flag = 1;
+      GaugeInfo.gaugeRead = 1;
+      GaugeInfo.checksum = checksum_calc;
+    }
+    else if (strcmp("scidac-checksum", header_type) == 0) {
+      if(!checksum_record_seen) {
+        read_message(reader, &checksum_string);
+        DML_read_flag = parse_checksum_xml(checksum_string, &checksum_read);
+        /* as before, a record that yields no message does not count */
+        checksum_record_seen = (checksum_string != (char*)NULL);
+        free(checksum_string);
+        checksum_string = NULL;
+      }
+      else { /* a scidac-checksum record was already found */
+        if (!g_disable_IO_checks) {
+          fprintf(stderr, "In gauge file %s, multiple LIME records with name: \"scidac-checksum\" found.\n", filename);
+          fprintf(stderr, "Unable to verify integrity of the gauge field data.\n");
+          destruct_reader(reader);
+          free(ildgformat_input);
+          return(-1);
+        }
+      }
+    }
+    else if (strcmp("xlf-info", header_type) == 0) {
+      read_message(reader, &GaugeInfo.xlfInfo);
+    }
+    else if (strcmp("ildg-data-lfn", header_type) == 0) {
+      read_message(reader, &GaugeInfo.ildg_data_lfn);
+    }
+    else if (strcmp("ildg-format", header_type) == 0) {
+      if(!ildgformat_record_seen) {
+        read_message(reader, &ildgformat_string);
+        ildgformat_read_flag = parse_ildgformat_xml(ildgformat_string, &ildgformat_read);
+        ildgformat_record_seen = (ildgformat_string != (char*)NULL);
+        free(ildgformat_string);
+        ildgformat_string = NULL;
+      }
+      else { /* a ildg-format record was already found */
+        if (!g_disable_IO_checks) {
+          fprintf(stderr, "In gauge file %s, multiple LIME records with name: \"ildg-format\" found.\n", filename);
+          fprintf(stderr, "Unable to verify integrity of the gauge field data.\n");
+          destruct_reader(reader);
+          free(ildgformat_input);
+          return(-1);
+        }
+      }
+    }
+
+    close_reader_record(reader);
+  }
+  if (!g_disable_IO_checks) {
+
+    if (!ildgformat_read_flag) {
+      fprintf(stderr, "LIME record with name: \"ildg-format\", in gauge file %s either missing or malformed.\n", filename);
+      fprintf(stderr, "Unable to verify gauge field size or precision.\n");
+      destruct_reader(reader);
+      free(ildgformat_input);
+      return(-1);
+    }
+
+    if (!gauge_read_flag) {
+      fprintf(stderr, "LIME record with name: \"ildg-binary-data\", in gauge file %s either missing or malformed.\n", filename);
+      fprintf(stderr, "No gauge field was read, unable to proceed.\n");
+      destruct_reader(reader);
+      free(ildgformat_input);
+      return(-1);
+    }
+
+    if (!DML_read_flag) {
+      fprintf(stderr, "LIME record with name: \"scidac-checksum\", in gauge file %s either missing or malformed.\n", filename);
+      fprintf(stderr, "Unable to verify integrity of gauge field data.\n");
+      destruct_reader(reader);
+      free(ildgformat_input);
+      return(-1);
+    }
+
+    if (g_cart_id == 0 && g_debug_level > 0)
+    {
+      /* Verify the integrity of the checksum */
+      printf("# Scidac checksums for gaugefield %s:\n", filename);
+      printf("# Calculated : A = %#010x B = %#010x.\n", checksum_calc.suma, checksum_calc.sumb);
+      printf("# Read from LIME headers: A = %#010x B = %#010x.\n", checksum_read.suma, checksum_read.sumb);
+      fflush(stdout);
+    }
+    if (checksum_calc.suma != checksum_read.suma) {
+      fprintf(stderr, "For gauge file %s, calculated and stored values for SciDAC checksum A do not match.\n", filename);
+      destruct_reader(reader);
+      free(ildgformat_input);
+      return(-1);
+    }
+    if (checksum_calc.sumb != checksum_read.sumb) {
+      fprintf(stderr, "For gauge file %s, calculated and stored values for SciDAC checksum B do not match.\n", filename);
+      destruct_reader(reader);
+      free(ildgformat_input);
+      return(-1);
+    }
+
+    if (g_cart_id == 0 && g_debug_level > 0)
+    {
+      /* Verify the datafile vs the hmc.input parameters */
+      fprintf(stdout, "# Reading ildg-format record:\n");
+      fprintf(stdout, "# Precision = %d bits (%s).\n",ildgformat_read.prec, (ildgformat_read.prec == 64 ? "double" : "single"));
+      fprintf(stdout, "# Lattice size: LX = %d, LY = %d, LZ = %d, LT = %d.\n", ildgformat_read.lx, ildgformat_read.ly, ildgformat_read.lz, ildgformat_read.lt);
+      fprintf(stdout, "# Input parameters:\n");
+      fprintf(stdout, "# Precision = %d bits (%s).\n",ildgformat_input->prec, (ildgformat_input->prec == 64 ? "double" : "single"));
+      fprintf(stdout, "# Lattice size: LX = %d, LY = %d, LZ = %d, LT = %d.\n", ildgformat_input->lx, ildgformat_input->ly, ildgformat_input->lz, ildgformat_input->lt);
+    }
+  }
+
+  free(ildgformat_input);
+  destruct_reader(reader);
+
+  g_update_gauge_copy = 1;
+
+  return(0);
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge_read_binary.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge_read_binary.c
new file mode 100644
index 0000000000000000000000000000000000000000..ebb8713b5965e545efc758db3a74faefd60a31b5
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge_read_binary.c
@@ -0,0 +1,225 @@
+/***********************************************************************
+* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+*
+* This file is part of tmLQCD.
+*
+* tmLQCD is free software: you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation, either version 3 of the License, or
+* (at your option) any later version.
+*
+* tmLQCD is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/
+
+#include "gauge.ih"
+
+/* FIXME I will first fix this function by using referral.
+   Probably should be done better in the future. AD.
*/
+
+#ifdef HAVE_LIBLEMON
+/*
+ * Lemon (parallel I/O) variant.
+ *
+ * Reads the payload of the current ildg-binary-data record into gf and
+ * accumulates the SciDAC checksum in *checksum (combined over all
+ * processes before returning).  In the file each site holds four links;
+ * file positions 0, 1, 2, 3 are stored in gf[..][1], [2], [3], [0].
+ * Single precision input (input->prec == 32) is widened to double.
+ *
+ * Returns  0 on success,
+ *         -1 if the read buffer cannot be allocated,
+ *         -2 on a Lemon read error,
+ *         -3 if the record length does not match the input parameters.
+ */
+int read_binary_gauge_data(LemonReader * lemonreader, DML_Checksum * checksum, paramsIldgFormat * input, su3 ** const gf)
+{
+  int t, x, y, z, status = 0;
+  int latticeSize[] = {input->lt, input->lx, input->ly, input->lz};
+  int scidacMapping[] = {0, 3, 2, 1};
+  DML_SiteRank rank;
+  MPI_Offset bytes;
+  uint64_t fbsu3;
+  char * filebuffer = NULL, * current = NULL;
+  double tick = 0, tock = 0;
+  char measure[64];
+  /* record length implied by the input parameters */
+  const n_uint64_t expected = (n_uint64_t)g_nproc * (n_uint64_t)VOLUME * 4 * (n_uint64_t)sizeof(su3) / (input->prec==64 ? 1 : 2);
+
+  bytes = lemonReaderBytes(lemonreader); /* datalength of ildg-binary-data record in bytes */
+
+  if (bytes != expected) {
+    fprintf(stderr, "Lattice size and precision found in data file do not match those requested at input.\n");
+    fprintf(stderr, "Expected LX = %d, LY = %d, LZ = %d, LT = %d, and %s precision.\n", input->lx, input->ly, input->lz, input->lt, (input->prec==64 ? "double" : "single"));
+    /* cast the complete value: %lu must receive an unsigned long */
+    fprintf(stderr, "Expected %lu bytes, found %lu bytes in gauge file.\n", (unsigned long)expected, (unsigned long)bytes);
+    fprintf(stderr, "Check input parameters T, L (LX, LY, LZ) and GaugeConfigReadPrecision.\n");
+    return(-3);
+  }
+
+  DML_checksum_init(checksum);
+
+  /* bytes per su3 matrix in the file, then bytes per site (4 links) */
+  fbsu3 = sizeof(su3);
+  if (input->prec == 32) {
+    fbsu3 /= 2;
+  }
+  bytes = 4 * fbsu3;
+
+
+  if((void*)(filebuffer = malloc(VOLUME * bytes)) == NULL) {
+    fprintf (stderr, "malloc errno %d in read_binary_gauge_data, returning without reading gauge file.\n", errno);
+    errno = 0;
+    return(-1);
+  }
+
+  if (g_debug_level > 0) {
+    MPI_Barrier(g_cart_grid);
+    tick = MPI_Wtime();
+  }
+
+  status = lemonReadLatticeParallelMapped(lemonreader, filebuffer, bytes, latticeSize, scidacMapping);
+
+  if (g_debug_level > 0) {
+    MPI_Barrier(g_cart_grid);
+    tock = MPI_Wtime();
+  }
+
+  if (status != LEMON_SUCCESS) {
+    free(filebuffer);
+    fprintf(stderr, "Lemon read error occurred with status = %d, while reading in gauge_read_binary.c!\n", status);
+    return(-2);
+  }
+
+  if (g_debug_level > 0 && g_cart_id == 0) {
+    engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes, "b");
+    fprintf(stdout, "# Time spent reading %s ", measure);
+    engineering(measure, tock - tick, "s");
+    fprintf(stdout, "was %s.\n", measure);
+    engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (tock - tick), "b/s");
+    fprintf(stdout, "# Reading speed: %s", measure);
+    engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (g_nproc * (tock - tick)), "b/s");
+    fprintf(stdout, " (%s per MPI process).\n", measure);
+    fflush(stdout);
+  }
+
+
+  for (t = 0; t < T; t++) {
+    for (z = 0; z < LZ; z++) {
+      for (y = 0; y < LY; y++) {
+        for (x = 0; x < LX; x++) {
+          /* global lexicographic rank of the site (x fastest), used
+             to select the checksum rotations */
+          rank = (DML_SiteRank)(g_proc_coords[1] * LX +
+                 (((g_proc_coords[0] * T + t) * g_nproc_z * LZ + g_proc_coords[3] * LZ + z) * g_nproc_y * LY +
+                 g_proc_coords[2] * LY + y) * ((DML_SiteRank)LX * g_nproc_x) + x);
+          current = filebuffer + bytes * (x + (y + (t * LZ + z) * LY) * LX);
+          DML_checksum_accum(checksum, rank, current, bytes);
+          if (input->prec == 32) {
+            be_to_cpu_assign_single2double(&gf[ g_ipt[t][x][y][z] ][1], current , sizeof(su3) / 8);
+            be_to_cpu_assign_single2double(&gf[ g_ipt[t][x][y][z] ][2], current + fbsu3, sizeof(su3) / 8);
+            be_to_cpu_assign_single2double(&gf[ g_ipt[t][x][y][z] ][3], current + 2 * fbsu3, sizeof(su3) / 8);
+            be_to_cpu_assign_single2double(&gf[ g_ipt[t][x][y][z] ][0], current + 3 * fbsu3, sizeof(su3) / 8);
+          }
+          else {
+            be_to_cpu_assign(&gf[ g_ipt[t][x][y][z] ][1], current , sizeof(su3) / 8);
+            be_to_cpu_assign(&gf[ g_ipt[t][x][y][z] ][2], current + fbsu3, sizeof(su3) / 8);
+            be_to_cpu_assign(&gf[ g_ipt[t][x][y][z] ][3], current + 2 * fbsu3, sizeof(su3) / 8);
+            be_to_cpu_assign(&gf[ g_ipt[t][x][y][z] ][0], current + 3 * fbsu3, sizeof(su3) / 8);
+          }
+        }
+      }
+    }
+  }
+  DML_global_xor(&checksum->suma);
+  DML_global_xor(&checksum->sumb);
+  free(filebuffer);
+  return(0);
+}
+#else /* HAVE_LIBLEMON */
+/*
+ * LIME (serial reader) variant.
+ *
+ * Reads the payload of the current ildg-binary-data record site by
+ * site into gf and accumulates the SciDAC checksum in *checksum.
+ * Under MPI every process seeks to its own x-lines and the checksum is
+ * combined over all processes at the end.  File positions 0, 1, 2, 3 of
+ * a site are stored in gf[..][1], [2], [3], [0]; single precision
+ * input (input->prec == 32) is widened to double.
+ *
+ * Returns  0 on success,
+ *         -2 on a LIME read error (under MPI the job is aborted first),
+ *         -3 if the record length does not match the input parameters.
+ */
+int read_binary_gauge_data(LimeReader * limereader, DML_Checksum * checksum, paramsIldgFormat * input, su3 ** const gf) {
+
+  int t, x, y , z, status=0;
+  int latticeSize[] = {input->lt, input->lx, input->ly, input->lz};
+  n_uint64_t bytes;
+  su3 tmp[4];
+  float tmp2[72];
+#ifdef MPI
+  double tick = 0, tock = 0;
+#endif
+  char measure[64];
+  DML_SiteRank rank;
+  /* record length implied by the input parameters */
+  const n_uint64_t expected = (n_uint64_t)g_nproc * (n_uint64_t)VOLUME * 4 * (n_uint64_t)sizeof(su3) / (input->prec==64 ? 1 : 2);
+
+  DML_checksum_init(checksum);
+
+#ifdef MPI
+  if (g_debug_level > 0) {
+    MPI_Barrier(g_cart_grid);
+    tick = MPI_Wtime();
+  }
+#endif
+
+  bytes = limeReaderBytes(limereader); /* datalength of ildg-binary-data record in bytes */
+  if (bytes != expected) {
+    fprintf(stderr, "Lattice size and precision found in data file do not match those requested at input.\n");
+    fprintf(stderr, "Expected LX = %d, LY = %d, LZ = %d, LT = %d, and %s precision.\n", input->lx, input->ly, input->lz, input->lt, (input->prec==64 ? "double" : "single"));
+    /* cast the complete value: %lu must receive an unsigned long */
+    fprintf(stderr, "Expected %lu bytes, found %lu bytes.\n", (unsigned long)expected, (unsigned long)bytes);
+    fprintf(stderr, "Check input parameters T, L (LX, LY, LZ) and GaugeConfigReadPrecision.\n");
+    return(-3);
+  }
+
+  /* from here on bytes is the size of one site (4 links) in the file */
+  if(input->prec == 32) bytes = (n_uint64_t)2*sizeof(su3);
+  else bytes = (n_uint64_t)4*sizeof(su3);
+  for(t = 0; t < T; t++) {
+    for(z = 0; z < LZ; z++) {
+      for(y = 0; y < LY; y++) {
+#ifdef MPI
+        limeReaderSeek(limereader,(n_uint64_t)
+                       (((n_uint64_t) g_proc_coords[1]*LX) +
+                        ((n_uint64_t) (((g_proc_coords[0]*T+t)*g_nproc_z*LZ+g_proc_coords[3]*LZ+z)*g_nproc_y*LY
+                                       + g_proc_coords[2]*LY+y)*LX*g_nproc_x))*bytes,
+                       SEEK_SET);
+#endif
+        for(x = 0; x < LX; x++) {
+          rank = (DML_SiteRank) (g_proc_coords[1]*LX +
+                                 (((g_proc_coords[0]*T+t)*g_nproc_z*LZ+g_proc_coords[3]*LZ+z)*g_nproc_y*LY
+                                  + g_proc_coords[2]*LY+y)*((DML_SiteRank)LX*g_nproc_x) + x);
+          if(input->prec == 32) {
+            status = limeReaderReadData(tmp2, &bytes, limereader);
+            DML_checksum_accum(checksum, rank, (char *) tmp2, bytes);
+          }
+          else {
+            status = limeReaderReadData(tmp, &bytes, limereader);
+            DML_checksum_accum(checksum, rank, (char *) tmp, bytes);
+          }
+          if(status < 0 && status != LIME_EOR) {
+            fprintf(stderr, "LIME read error occurred with status = %d while reading in gauge_read_binary.c!\n", status);
+#ifdef MPI
+            MPI_Abort(MPI_COMM_WORLD, 1);
+            MPI_Finalize();
+#endif
+            return(-2);
+          }
+          if(input->prec == 32) {
+            be_to_cpu_assign_single2double(&gf[ g_ipt[t][x][y][z] ][0], &tmp2[3*18], sizeof(su3)/8);
+            be_to_cpu_assign_single2double(&gf[ g_ipt[t][x][y][z] ][1], &tmp2[0*18], sizeof(su3)/8);
+            be_to_cpu_assign_single2double(&gf[ g_ipt[t][x][y][z] ][2], &tmp2[1*18], sizeof(su3)/8);
+            be_to_cpu_assign_single2double(&gf[ g_ipt[t][x][y][z] ][3], &tmp2[2*18], sizeof(su3)/8);
+          }
+          else {
+            be_to_cpu_assign(&gf[ g_ipt[t][x][y][z] ][0], &tmp[3], sizeof(su3)/8);
+            be_to_cpu_assign(&gf[ g_ipt[t][x][y][z] ][1], &tmp[0], sizeof(su3)/8);
+            be_to_cpu_assign(&gf[ g_ipt[t][x][y][z] ][2], &tmp[1], sizeof(su3)/8);
+            be_to_cpu_assign(&gf[ g_ipt[t][x][y][z] ][3], &tmp[2], sizeof(su3)/8);
+          }
+        }
+      }
+    }
+  }
+
+#ifdef MPI
+  if (g_debug_level > 0) {
+    MPI_Barrier(g_cart_grid);
+    tock = MPI_Wtime();
+
+    if (g_cart_id == 0) {
+      engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes, "b");
+      fprintf(stdout, "# Time spent reading %s ", measure);
+      engineering(measure, tock-tick, "s");
+      fprintf(stdout, "was %s.\n", measure);
+      engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (tock-tick), "b/s");
+      fprintf(stdout, "# Reading speed: %s", measure);
+      engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (g_nproc * (tock-tick)), "b/s");
+      fprintf(stdout, " (%s per MPI process).\n", measure);
+    }
+  }
+
+  DML_checksum_combine(checksum);
+#endif
+  return(0);
+}
+#endif /* HAVE_LIBLEMON */
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge_write.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge_write.c
new file mode 100644
index 0000000000000000000000000000000000000000..f911770bf7b1bc98183855f60ca854619b703a3f
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge_write.c
@@ -0,0 +1,58 @@
+/***********************************************************************
+* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+*
+* This file is part of tmLQCD.
+*
+* tmLQCD is free software: you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation, either version 3 of the License, or
+* (at your option) any later version.
+* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#include "gauge.ih" + +int write_gauge_field(char * filename, const int prec, paramsXlfInfo const *xlfInfo) +{ + WRITER * writer = NULL; + uint64_t bytes; + int status = 0; + DML_Checksum checksum; + paramsIldgFormat *ildg; + + bytes = (uint64_t)L * L * L * T_global * sizeof(su3) * prec / 16; + + /* all these functions, except for write_binary_gauge_data do their own error handling */ + construct_writer(&writer, filename, 0);/* the 0 is for not appending */ + + write_xlf_info(writer, xlfInfo); + + ildg = construct_paramsIldgFormat(prec); + write_ildg_format(writer, ildg); + free(ildg); + + /* Both begin and end bit are 0, the message is begun with the format, and will end with the checksum */ + write_header(writer, 0, 0, "ildg-binary-data", bytes); + status = write_binary_gauge_data(writer, prec, &checksum); + write_checksum(writer, &checksum, NULL); + + if (g_cart_id == 0 && g_debug_level > 0) + { + fprintf(stdout, "# Scidac checksums for gaugefield %s:\n", filename); + fprintf(stdout, "# Calculated : A = %#010x B = %#010x.\n", checksum.suma, checksum.sumb); + fflush(stdout); + } +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif /* MPI */ + + destruct_writer(writer); + return status; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge_write_binary.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge_write_binary.c new file mode 100644 index 0000000000000000000000000000000000000000..3a794131377959fa0f9e3673b7efc7985ba72f8e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/gauge_write_binary.c @@ -0,0 +1,258 @@ 
+/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#include "gauge.ih" + +/* FIXME I will first fix this function by using referral. + Probably should be done better in the future. AD. */ + +#ifdef HAVE_LIBLEMON +int write_binary_gauge_data(LemonWriter * lemonwriter, const int prec, DML_Checksum * checksum) +{ + int x, xG, y, yG, z, zG, t, tG, status = 0; + su3 tmp3[4]; + int latticeSize[] = {T_global, g_nproc_x*LX, g_nproc_y*LY, g_nproc_z*LZ}; + int scidacMapping[] = {0, 3, 2, 1}; + unsigned long bufoffset; + char * filebuffer = NULL; + uint64_t bytes; + double tick = 0, tock = 0; + char measure[64]; + DML_SiteRank rank; + DML_checksum_init(checksum); + + bytes = (uint64_t)sizeof(su3) * (prec == 32 ? 
2 : 4); + bufoffset = 0; + if((void*)(filebuffer = (char*)malloc(bytes * VOLUME)) == NULL) { + fprintf (stderr, "malloc errno in write_binary_gauge_data_parallel: %d\n",errno); + fflush(stderr); + errno = 0; + return 1; + } + + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + tick = MPI_Wtime(); + } + + tG = g_proc_coords[0]*T; + zG = g_proc_coords[3]*LZ; + yG = g_proc_coords[2]*LY; + xG = g_proc_coords[1]*LX; + for(t = 0; t < T; t++) { + for(z = 0; z < LZ; z++) { + for(y = 0; y < LY; y++) { + for(x = 0; x < LX; x++) { + rank = (DML_SiteRank) ((((tG + t)*L + zG + z)*L + yG + y)*L + xG + x); + memcpy(&tmp3[0], &g_gauge_field[ g_ipt[t][x][y][z] ][1], sizeof(su3)); + memcpy(&tmp3[1], &g_gauge_field[ g_ipt[t][x][y][z] ][2], sizeof(su3)); + memcpy(&tmp3[2], &g_gauge_field[ g_ipt[t][x][y][z] ][3], sizeof(su3)); + memcpy(&tmp3[3], &g_gauge_field[ g_ipt[t][x][y][z] ][0], sizeof(su3)); + if(prec == 32) + be_to_cpu_assign_double2single(filebuffer + bufoffset, tmp3, 4*sizeof(su3)/8); + else + be_to_cpu_assign(filebuffer + bufoffset, tmp3, 4*sizeof(su3)/8); + DML_checksum_accum(checksum, rank, (char*) filebuffer + bufoffset, bytes); + bufoffset += bytes; + } + } + } + } + + status = lemonWriteLatticeParallelMapped(lemonwriter, filebuffer, bytes, latticeSize, scidacMapping); + + if (status != LEMON_SUCCESS) + { + free(filebuffer); + fprintf(stderr, "LEMON write error occurred with status = %d, while writing in gauge_write_binary.c!\n", status); + return(-2); + } + + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + tock = MPI_Wtime(); + + if (g_cart_id == 0) { + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes, "b"); + fprintf(stdout, "# Time spent writing %s ", measure); + engineering(measure, tock - tick, "s"); + fprintf(stdout, "was %s.\n", measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (tock - tick), "b/s"); + fprintf(stdout, "# Writing speed: %s", 
measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (g_nproc * (tock - tick)), "b/s"); + fprintf(stdout, " (%s per MPI process).\n", measure); + fflush(stdout); + } + } + + lemonWriterCloseRecord(lemonwriter); + + free(filebuffer); + + status = DML_global_xor(&checksum->suma); + if (status != MPI_SUCCESS) { + fprintf(stderr, "DML Checksum accumulation error occurred with status = %d, while writing in gauge_write_binary.c!\n", status); + return(-2); + } + status = DML_global_xor(&checksum->sumb); + if (status != MPI_SUCCESS) { + fprintf(stderr, "DML Checksum accumulation error occurred with status = %d, while writing in gauge_write_binary.c!\n", status); + return(-2); + } + + return 0; +} + +#else /* HAVE_LIBLEMON */ + +int write_binary_gauge_data(LimeWriter * limewriter, const int prec, DML_Checksum * checksum) +{ + int x, X, y, Y, z, Z, tt, t0, tag=0, id=0, status=0; + int latticeSize[] = {T_global, g_nproc_x*LX, g_nproc_y*LY, g_nproc_z*LZ}; + su3 tmp[4]; + su3 tmp3[4]; + float tmp2[72]; + int coords[4]; + n_uint64_t bytes; + DML_SiteRank rank; +#ifdef MPI + double tick = 0, tock = 0; + char measure[64]; + MPI_Status mpi_status; +#endif + + DML_checksum_init(checksum); + +#ifdef MPI + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + tick = MPI_Wtime(); + } +#endif + if(prec == 32) bytes = (n_uint64_t)2*sizeof(su3); + else bytes = (n_uint64_t)4*sizeof(su3); + for(t0 = 0; t0 < T*g_nproc_t; t0++) { + tt = t0 - g_proc_coords[0]*T; + coords[0] = t0 / T; + for(z = 0; z < LZ*g_nproc_z; z++) { + Z = z - g_proc_coords[3]*LZ; + coords[3] = z / LZ; + for(y = 0; y < LY*g_nproc_y; y++) { + tag = 0; + Y = y - g_proc_coords[2]*LY; + coords[2] = y / LY; + for(x = 0; x < LX*g_nproc_x; x++) { + X = x - g_proc_coords[1]*LX; + coords[1] = x / LX; +#ifdef MPI + MPI_Cart_rank(g_cart_grid, coords, &id); +#endif + if(g_cart_id == 0) { + /* Rank should be computed by proc 0 only */ + rank = (DML_SiteRank) (((t0*LZ*g_nproc_z + 
z)*LY*g_nproc_y + y)*LX*g_nproc_x + x); + if(g_cart_id == id) { + memcpy(&tmp3[0], &g_gauge_field[ g_ipt[tt][X][Y][Z] ][1], sizeof(su3)); + memcpy(&tmp3[1], &g_gauge_field[ g_ipt[tt][X][Y][Z] ][2], sizeof(su3)); + memcpy(&tmp3[2], &g_gauge_field[ g_ipt[tt][X][Y][Z] ][3], sizeof(su3)); + memcpy(&tmp3[3], &g_gauge_field[ g_ipt[tt][X][Y][Z] ][0], sizeof(su3)); + + if(prec == 32) { + be_to_cpu_assign_double2single(tmp2, tmp3, 4*sizeof(su3)/8); + DML_checksum_accum(checksum, rank, (char*) tmp2, 4*sizeof(su3)/2); + status = limeWriteRecordData((void*)&tmp2, &bytes, limewriter); + } + else { + be_to_cpu_assign(tmp, tmp3, 4*sizeof(su3)/8); + DML_checksum_accum(checksum, rank, (char*) tmp, 4*sizeof(su3)); + status = limeWriteRecordData((void*)&tmp, &bytes, limewriter); + } + } +#ifdef MPI + else { + if(prec == 32) { + MPI_Recv(tmp2, 4*sizeof(su3)/8, MPI_FLOAT, id, tag, g_cart_grid, &mpi_status); + DML_checksum_accum(checksum, rank, (char*) tmp2, 4*sizeof(su3)/2); + status = limeWriteRecordData((void*)&tmp2, &bytes, limewriter); + } + else { + MPI_Recv(tmp, 4*sizeof(su3)/8, MPI_DOUBLE, id, tag, g_cart_grid, &mpi_status); + DML_checksum_accum(checksum, rank, (char*) tmp, 4*sizeof(su3)); + status = limeWriteRecordData((void*)&tmp, &bytes, limewriter); + } + } +#endif + if(status < 0 ) { + fprintf(stderr, "LIME write error occurred with status = %d, while writing in gauge_write_binary.c!\n", status); + fprintf(stderr, "x %d, y %d, z %d, t %d (%d,%d,%d,%d)\n",x,y,z,tt,X,Y,Z,tt); + fprintf(stderr, "id = %d, bytes = %lu, size = %d\n", g_cart_id, bytes, (int)(4*sizeof(su3)/8)); +#ifdef MPI + MPI_Abort(MPI_COMM_WORLD, 1); + MPI_Finalize(); +#endif + exit(500); + } + } +#ifdef MPI + else { + if(g_cart_id == id){ + memcpy(&tmp3[0], &g_gauge_field[ g_ipt[tt][X][Y][Z] ][1], sizeof(su3)); + memcpy(&tmp3[1], &g_gauge_field[ g_ipt[tt][X][Y][Z] ][2], sizeof(su3)); + memcpy(&tmp3[2], &g_gauge_field[ g_ipt[tt][X][Y][Z] ][3], sizeof(su3)); + memcpy(&tmp3[3], &g_gauge_field[ g_ipt[tt][X][Y][Z] 
][0], sizeof(su3)); + if(prec == 32) { + be_to_cpu_assign_double2single(tmp2, tmp3, 4*sizeof(su3)/8); + MPI_Send((void*) tmp2, 4*sizeof(su3)/8, MPI_FLOAT, 0, tag, g_cart_grid); + } + else { + be_to_cpu_assign(tmp, tmp3, 4*sizeof(su3)/8); + MPI_Send((void*) tmp, 4*sizeof(su3)/8, MPI_DOUBLE, 0, tag, g_cart_grid); + } + } + } +#endif + tag++; + } +#ifdef MPI + MPI_Barrier(g_cart_grid); +#endif + } + } + } + +#ifdef MPI + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + tock = MPI_Wtime(); + + if (g_cart_id == 0) { + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes, "b"); + fprintf(stdout, "# Time spent writing %s ", measure); + engineering(measure, tock-tick, "s"); + fprintf(stdout, "was %s.\n", measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (tock-tick), "b/s"); + fprintf(stdout, "# Writing speed: %s", measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (g_nproc * (tock-tick)), "b/s"); + fprintf(stdout, " (%s per MPI process).\n", measure); + } + } +#endif + + return(0); +} +#endif /* HAVE_LIBLEMON */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/io_cm.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/io_cm.c new file mode 100644 index 0000000000000000000000000000000000000000..277845d5df9310999a4943cca38d1e08a77fcc05 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/io_cm.c @@ -0,0 +1,230 @@ +#include "io_cm.h" + +int read_spinorfield_cm_single(spinor * const s, spinor * const r, char * filename, + const int ts, const int vol) { + /* + * ts is the number of the timeslice to be used + * if ts < 0 read a volume source + * + * if ts >= 0 and vol > 0 the file is a volume file + * but only one timeslice should be read + */ + + FILE * ifs; + int t, x, y , z, i = 0; + spinor * p = NULL; + float tmp[24]; + + ifs = fopen(filename, "r"); + if(ifs == (FILE *)NULL) { + return(-1); + 
} + + for(x = 0; x < LX; x++) { + for(y = 0; y < LY; y++) { + for(z = 0; z < LZ; z++) { +#if (defined MPI) + fseek(ifs, + (g_proc_coords[0]*T+ + (((g_proc_coords[1]*LX+x)*g_nproc_y*LY+g_proc_coords[2]*LY+y)*g_nproc_z*LZ + + g_proc_coords[3]*LZ+z)*T*g_nproc_t)*sizeof(spinor)/2, + SEEK_SET); +#endif + for(t = 0; t < T; t++) { + + i = g_lexic2eosub[ g_ipt[t][x][y][z] ]; + if((t+x+y+z+ + g_proc_coords[0]*T+g_proc_coords[1]*LX+ + g_proc_coords[2]*LY+g_proc_coords[3]*LZ)%2==0) { + p = s; + } + else { + p = r; + } + + if(ts == t || ts < 0 || ts >= T){ + /* Read the data */ + fread(tmp, sizeof(spinor)/2, 1, ifs); + + /* Test if we read the data with the correct endian order */ + if(isnan(tmp[0]) || isnan(tmp[1]) || isnan(tmp[2]) || isnan(tmp[3]) || isnan(tmp[4]) || isnan(tmp[5]) || + isnan(tmp[6]) || isnan(tmp[7]) || isnan(tmp[8]) || isnan(tmp[9]) || isnan(tmp[10]) || isnan(tmp[11]) || + isnan(tmp[12]) || isnan(tmp[13]) || isnan(tmp[14]) || isnan(tmp[15]) || isnan(tmp[16]) || isnan(tmp[17]) || + isnan(tmp[18]) || isnan(tmp[19]) || isnan(tmp[20]) || isnan(tmp[21]) || isnan(tmp[22]) || isnan(tmp[23])) + { + if(g_proc_id == 0) + { + if(big_endian()) + printf("\nBig endian order gives some NaN. Trying little endian order instead...\n\n"); + else + printf("\nLittle endian order gives some NaN. 
Trying big endian order instead...\n\n"); + } + + fclose(ifs); + return read_spinorfield_cm_swap_single(s,r,filename,ts,vol); + } + single2double_cm(p+i, tmp); + } + else { + if(vol > 0) { + fread(tmp, sizeof(spinor)/2, 1, ifs); + } + /* Padding with zeros */ + zero_spinor(p+i); + } + } + } + } + } + fclose(ifs); + return(0); +} +/* Read a single-precision cmi spinor file through be_to_cpu_assign_single2double; read_spinorfield_cm_single falls back to this routine when the plain read yields NaNs. Each site is stored into s or r depending on the parity of its global coordinates. Returns 0 on success and -1 if a site could not be read completely; aborts if the file cannot be opened. */ +int read_spinorfield_cm_swap_single(spinor * const s, spinor * const r, char * filename, + const int ts, const int vol) { + /* + * ts is the number of the timeslice to be used + * if ts < 0 read a volume source + * + * if ts >= 0 and vol > 0 the file is a volume file + * but only one timeslice should be read + */ + + FILE * ifs; + int t, x, y , z, i = 0; + spinor * p = NULL; + float tmp[24]; + + ifs = fopen(filename, "r"); + if(ifs == (FILE *)NULL) { + fprintf(stderr, "Could not open file %s\n Aborting...\n", filename); +#ifdef MPI + MPI_Abort(MPI_COMM_WORLD, 1); + MPI_Finalize(); +#endif + exit(500); + } + + for(x = 0; x < LX; x++) { + for(y = 0; y < LY; y++) { + for(z = 0; z < LZ; z++) { +#if (defined MPI) + fseek(ifs, + (g_proc_coords[0]*T+ + (((g_proc_coords[1]*LX+x)*g_nproc_y*LY+g_proc_coords[2]*LY+y)*g_nproc_z*LZ + + g_proc_coords[3]*LZ+z)*T*g_nproc_t)*sizeof(spinor)/2, + SEEK_SET); +#endif + for(t = 0; t < T; t++) { + + i = g_lexic2eosub[ g_ipt[t][x][y][z] ]; + if((t+x+y+z+ + g_proc_coords[0]*T+g_proc_coords[1]*LX+ + g_proc_coords[2]*LY+g_proc_coords[3]*LZ)%2==0) { + p = s; + } + else { + p = r; + } + + if(ts == t || ts < 0 || ts >= T){ + /* Read the data; a short read would leave tmp (partly) uninitialised, so do not convert it */ + if(fread(tmp, sizeof(spinor)/2, 1, ifs) != 1) { + fprintf(stderr, "Could not read spinor data from file %s\n", filename); + fclose(ifs); + return(-1); + } + + /* Swap and convert from single to double precision */ + be_to_cpu_assign_single2double(p+i, tmp, sizeof(spinor)/8); + } + else { + if(vol > 0) { + fread(tmp, sizeof(spinor)/2, 1, ifs); + } + /* Padding with zeros */ + zero_spinor(p+i); + } + } + } + } + } + fclose(ifs); + return(0); +} + + +int write_spinorfield_cm_single(spinor * const s, spinor * const r, char * filename) { + + FILE * ofs = NULL; + int t, x, y , z, i = 0; + int 
t0, X, Y, Z, id = 0; + spinor * p = NULL; + float tmp[24]; + int coords[4]; +#ifdef MPI + int tag = 0; + MPI_Status status; +#endif + + if(g_cart_id == 0) { + ofs = fopen(filename, "w"); + printf("# Writing in cmi format (32 Bit) to file %s\n", filename); + } + + for(x = 0; x < LX*g_nproc_x; x++) { + X = x - LX*g_proc_coords[1]; + coords[1] = x / LX; + for(y = 0; y < LY*g_nproc_y; y++) { + Y = y - LY*g_proc_coords[2]; + coords[2] = y / LY; + for(z = 0; z < LZ*g_nproc_z; z++) { + Z = z - LZ*g_proc_coords[3]; + coords[3] = z / LZ; + for(t0 = 0; t0 < T*g_nproc_t; t0++) { + t = t0 - T*g_proc_coords[0]; + coords[0] = t0 / T; +#ifdef MPI + MPI_Cart_rank(g_cart_grid, coords, &id); +#endif + if(g_cart_id == id) { + i = g_lexic2eosub[ g_ipt[t][X][Y][Z] ]; + if((t+X+Y+Z+g_proc_coords[3]*LZ+g_proc_coords[2]*LY + + g_proc_coords[0]*T+g_proc_coords[1]*LX)%2 == 0) { + p = s; + } + else { + p = r; + } + } + if(g_cart_id == 0){ + if(g_cart_id == id) { + double2single_cm(tmp, p + i); + } +#ifdef MPI + else { + MPI_Recv(tmp, sizeof(spinor)/8, MPI_FLOAT, id, tag, g_cart_grid, &status); + } +#endif + fwrite(tmp, sizeof(float), 24, ofs); + // printf("%e,%e\n",tmp[0],tmp[5]);fflush(stdout); + } +#ifdef MPI + else { + if(g_cart_id == id) { + double2single_cm(tmp, p + i); + MPI_Send((void*) tmp, sizeof(spinor)/8, MPI_FLOAT, 0, tag, g_cart_grid); + } + } + tag++; +#endif + } +#ifdef MPI + MPI_Barrier(g_cart_grid); + tag=0; +#endif + } + } + } + if(g_cart_id == 0) { + fclose(ofs); + } + return(0); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/io_cm.h b/qcd/part_cpu/applications/QCD/src/kernel_D/io/io_cm.h new file mode 100644 index 0000000000000000000000000000000000000000..8cd9cbc4dbbd216c01c8d78d58358b0ac5424eaf --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/io_cm.h @@ -0,0 +1,34 @@ +#ifndef _IO_CM_H +#define _IO_CM_H + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif 
+#include +#include +#include + +#include + +#include +#include +#include + +#include + + +int read_spinorfield_cm_single(spinor * const s, spinor * const r, char * filename, const int ts, const int vol); +int read_spinorfield_cm_swap_single(spinor * const s, spinor * const r, char * filename, const int ts, const int vol); +int write_spinorfield_cm_single(spinor * const s, spinor * const r, char * filename); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/params.h b/qcd/part_cpu/applications/QCD/src/kernel_D/io/params.h new file mode 100644 index 0000000000000000000000000000000000000000..c910f11d02f0a1b663c87c2a8b75118a46b4ae63 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/params.h @@ -0,0 +1,149 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _PARAMS_H +#define _PARAMS_H + +#include + +typedef struct +{ + char date[64]; + char package_version[32]; + char inverter[32]; + + double epssq; + double epsbar; + double kappa; + double mu; + double mubar; + double mu_inverted; + double mu_lowest; + + double cgmms_mass; + + int mms; + int iter; + int heavy; + int noflavours; +} +paramsInverterInfo; + +typedef struct +{ + int flavours; + int prec; + int lx; + int ly; + int lz; + int lt; +} +paramsPropagatorFormat; + +typedef struct +{ + int colours; + int flavours; + int prec; + int lx; + int ly; + int lz; + int lt; + int spins; +} +paramsSourceFormat; + +typedef struct +{ + char date[64]; + char package_version[32]; + + double beta; + double c2_rec; + double epsilonbar; + double kappa; + double mu; + double mubar; + double plaq; + + int counter; + + long int time; +} +paramsXlfInfo; + +typedef struct +{ + int lx; + int ly; + int lz; + int lt; + int prec; +} +paramsIldgFormat; + +typedef struct { + double plaquetteEnergy; + int gaugeRead; + DML_Checksum checksum; + char * xlfInfo; + char * ildg_data_lfn; +} paramsGaugeInfo; + +typedef struct { + int splitted; + int format; + int precision; + char * basename; +} paramsPropInfo; + +typedef struct { + /* later usage for the type of source */ + int type; + /* splitted or not (really needed?) */ + int splitted; + /* the IO format (needed?) 
*/ + int format; + /* the IO precision */ + int precision; + /* the source location, where it applies*/ + int t, x, y, z; + /* automatic TS detection */ + int automaticTS; + /* sample, gauge no and index of source */ + int sample, nstore, ix; + /* is this a 2 flavour source */ + int no_flavours; + /* the base filename */ + char * basename; +} paramsSourceInfo; + +/* defined in gauge_read.c */ +extern paramsGaugeInfo GaugeInfo; +/* defined in spinor_read.c */ +extern paramsPropInfo PropInfo; +extern paramsSourceInfo SourceInfo; + +paramsIldgFormat * construct_paramsIldgFormat(int const prec); +paramsPropagatorFormat * construct_paramsPropagatorFormat(int const prec, int const flavours); +paramsSourceFormat * construct_paramsSourceFormat(int const prec, int const flavours, int const spins, int const sources); +paramsXlfInfo * construct_paramsXlfInfo(double const plaq, int const counter); +paramsInverterInfo * construct_paramsInverterInfo(double const epssq, const int iter, + const int solver, const int noflavours); +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/params.ih b/qcd/part_cpu/applications/QCD/src/kernel_D/io/params.ih new file mode 100644 index 0000000000000000000000000000000000000000..2c7364fe105779f80df8e9cc5d97c57b497fbad4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/params.ih @@ -0,0 +1,21 @@ +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef BENCHMARK +#include <../c-lime/include/lime.h> +#else +#include +#endif +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include + +#include + +#include +#include + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/params_construct_InverterInfo.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/params_construct_InverterInfo.c new file mode 100644 index 0000000000000000000000000000000000000000..713e25cc93c01dbac76944452e53c51e3219d6ef --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/params_construct_InverterInfo.c @@ -0,0 +1,79 @@ 
+/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#include "params.ih" +#include "solver/solver.h" + +/* This needs fixing */ + +paramsInverterInfo *construct_paramsInverterInfo(double const epssq, const int iter, + const int solver, const int noflavours) { + int i; + struct timeval t1; + paramsInverterInfo *info = malloc(sizeof(paramsInverterInfo)); + + if (info == (paramsInverterInfo*)NULL) + kill_with_error(NULL, g_cart_id, "Could not allocate paramsInverterInfo."); + + gettimeofday(&t1, NULL); + + info->iter = iter; + info->epssq = epssq; + info->noflavours = noflavours; + + info->kappa = g_kappa; + info->mu = g_mu / 2. / g_kappa; + + strcpy(info->package_version, PACKAGE_VERSION); + + if(noflavours == 2) { + info->mubar = g_mubar / 2. / g_kappa; + info->epsbar = g_epsbar / 2. 
/ g_kappa; + } + else { + info->mubar = 0.; + info->epsbar = 0.; + } + strcpy(info->date, ctime(&t1.tv_sec)); + info->mms = 0; + info->heavy = 0; + info->cgmms_mass = 0; + switch (solver) { + case CG: + strcpy(info->inverter, "CG"); + break; + case BICGSTAB: + strcpy(info->inverter, "BiCGstab"); + break; + case GMRES: + strcpy(info->inverter, "GMRES"); + break; + case CGMMS: + strcpy(info->inverter, "CGMMS"); + info->mms = 1; + break; + case CGS: + strcpy(info->inverter, "CGS"); + break; + default: + strcpy(info->inverter, "other"); + break; + } + return(info); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/params_construct_ildgFormat.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/params_construct_ildgFormat.c new file mode 100644 index 0000000000000000000000000000000000000000..7ea68c9ea3c94dbd14305d72749bae9c8d890518 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/params_construct_ildgFormat.c @@ -0,0 +1,17 @@ +#include "params.ih" +/* Allocate (malloc) and fill the ildg-format description for a gauge field stored with `prec` bits. The returned pointer is heap memory obtained with malloc. An allocation failure is reported through kill_with_error. */ +paramsIldgFormat *construct_paramsIldgFormat(int const prec) +{ + paramsIldgFormat *format = malloc(sizeof(paramsIldgFormat)); + + if (format == (paramsIldgFormat*)NULL) + kill_with_error(NULL, g_cart_id, "Could not allocate paramsIldgFormat."); + + format->prec = prec; + /* Global spatial extents, computed per direction as in the propagator and source format constructors, so the header is also right when the lattice is not cubic (identical to L for cubic lattices). */ + format->lx = LX * g_nproc_x; + format->ly = LY * g_nproc_y; + format->lz = LZ * g_nproc_z; + format->lt = T_global; + + return format; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/params_construct_propagatorFormat.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/params_construct_propagatorFormat.c new file mode 100644 index 0000000000000000000000000000000000000000..f6b5b498941cf3feca39884a46486841c04f8b17 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/params_construct_propagatorFormat.c @@ -0,0 +1,19 @@ +#include "params.ih" + +paramsPropagatorFormat *construct_paramsPropagatorFormat(int const prec, int const flavours) +{ + paramsPropagatorFormat *format = malloc(sizeof(paramsPropagatorFormat)); + + if (format == 
(paramsPropagatorFormat*)NULL) + kill_with_error(NULL, g_cart_id, "Could not allocate paramsPropagatorFormat."); + + format->flavours = flavours; + format->prec = prec; + + format->lx = LX * g_nproc_x; + format->ly = LY * g_nproc_y; + format->lz = LZ * g_nproc_z; + format->lt = T * g_nproc_t; + + return format; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/params_construct_sourceFormat.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/params_construct_sourceFormat.c new file mode 100644 index 0000000000000000000000000000000000000000..d52ee6a62ed2528c16e53264815e412689d99c31 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/params_construct_sourceFormat.c @@ -0,0 +1,22 @@ +#include "params.ih" + /* Allocate (malloc) a paramsSourceFormat and fill it: prec, flavours, spins and colours are copied from the arguments, the lattice extents are computed below. Returns the malloc'ed struct. */ +paramsSourceFormat *construct_paramsSourceFormat(int const prec, int const flavours, int const spins, int const colours) +{ + paramsSourceFormat *format = malloc(sizeof(paramsSourceFormat)); + + if (format == (paramsSourceFormat*)NULL) /* allocation failure is handed to kill_with_error; presumably it does not return -- TODO confirm */ + kill_with_error(NULL, g_cart_id, "Could not allocate paramsSourceFormat."); + + format->prec = prec; + format->flavours = flavours; + /* extents: per-direction local extent multiplied by the process count in that direction */ + format->lx = LX * g_nproc_x; + format->ly = LY * g_nproc_y; + format->lz = LZ * g_nproc_z; + format->lt = T * g_nproc_t; + + format->spins = spins; + format->colours = colours; + + return format; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/params_construct_xlfInfo.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/params_construct_xlfInfo.c new file mode 100644 index 0000000000000000000000000000000000000000..15c9a256a5d09f353fc6cd1a0d8243cbf370a4fd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/params_construct_xlfInfo.c @@ -0,0 +1,29 @@ +#include "params.ih" + +paramsXlfInfo *construct_paramsXlfInfo(double const plaq, int const counter) +{ + struct timeval t1; + paramsXlfInfo *info = malloc(sizeof(paramsXlfInfo)); + + if (info == (paramsXlfInfo*)NULL) + kill_with_error(NULL, g_cart_id, "Could not allocate paramsXlfInfo."); + + 
gettimeofday(&t1, NULL); + + info->plaq = plaq; + info->counter = counter; + + info->beta = g_beta; + info->kappa = g_kappa; + info->mu = g_mu / 2. / g_kappa; + info->c2_rec = g_rgi_C1; + info->time = t1.tv_sec; + + strcpy(info->package_version, PACKAGE_VERSION); + + info->mubar = g_mubar / 2. / g_kappa; + info->epsilonbar = g_epsbar / 2. / g_kappa; + + strcpy(info->date, ctime(&t1.tv_sec)); + return(info); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/selector.h b/qcd/part_cpu/applications/QCD/src/kernel_D/io/selector.h new file mode 100644 index 0000000000000000000000000000000000000000..15ce76d0401bed68a558a35b8bdd09fe52f55cf7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/selector.h @@ -0,0 +1,70 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _IO_SELECTOR_H +#define _IO_SELECTOR_H + + + +#ifdef BENCHMARK +#include +#else +#include +#ifdef HAVE_LIBLEMON +# include +#endif /* HAVE_LIBLEMON */ + +#ifdef HAVE_LIBLEMON +# define LIME_FILE MPI_File +# define WRITER LemonWriter +# define READER LemonReader +# define RECORD_HEADER LemonRecordHeader +# define CreateReader lemonCreateReader +# define CreateHeader lemonCreateHeader +# define ReaderBytes lemonReaderBytes +# define ReaderNextRecord lemonReaderNextRecord +# define ReaderType lemonReaderType +# define ReaderCloseRecord lemonReaderCloseRecord +# define ReaderReadData lemonReaderReadData +# define WriteRecordHeader lemonWriteRecordHeader +# define WriteRecordData lemonWriteRecordData +# define WriterCloseRecord lemonWriterCloseRecord +# define DestroyReader lemonDestroyReader +# define DestroyHeader lemonDestroyHeader +#else /* HAVE_LIBLEMON */ +# define LIME_FILE FILE +# define WRITER LimeWriter +# define READER LimeReader +# define RECORD_HEADER LimeRecordHeader +# define CreateReader limeCreateReader +# define CreateHeader limeCreateHeader +# define ReaderBytes limeReaderBytes +# define ReaderNextRecord limeReaderNextRecord +# define ReaderType limeReaderType +# define ReaderCloseRecord limeReaderCloseRecord +# define ReaderReadData limeReaderReadData +# define WriteRecordData limeWriteRecordData +# define WriteRecordHeader limeWriteRecordHeader +# define WriterCloseRecord limeWriterCloseRecord +# define DestroyReader limeDestroyReader +# define DestroyHeader limeDestroyHeader +#endif + +#endif +#endif \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor.h b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor.h new file mode 100644 index 0000000000000000000000000000000000000000..494464661d289b4bd161b00bdbe2a0f165e76950 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor.h @@ -0,0 +1,41 @@ 
+/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _SPINOR_H +#define _SPINOR_H + +#include + +#include +#include + +int read_spinor(spinor * const s, spinor * const r, char * filename, const int position); +int read_binary_spinor_data(spinor * const s, spinor * const r, READER * reader, DML_Checksum * checksum); +int read_binary_spinor_data_l(spinor * const s, READER * reader, DML_Checksum * checksum); + +int write_spinor(WRITER * writer, spinor ** const s, spinor ** const r, const int flavours, const int prec); +int write_binary_spinor_data(spinor * const s, spinor * const r, WRITER * writer, DML_Checksum *checksum, int const prec); +int write_binary_spinor_data_l(spinor * const s, WRITER * writer, DML_Checksum * checksum, const int prec); + +void write_spinor_info(WRITER * writer, const int write_prop_format_flag, paramsInverterInfo * InverterInfo, int append); +void write_source_format(WRITER *writer, paramsSourceFormat const *format); +void write_propagator_format(WRITER *writer, paramsPropagatorFormat const *format); +void write_propagator_type(WRITER *writer, const int type); + +#endif diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor.ih b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor.ih new file mode 100644 index 0000000000000000000000000000000000000000..90979dde68e459d3e3be720b729637dad10c8fe1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor.ih @@ -0,0 +1,49 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef BENCHMARK +#include <../c-lime/include/lime.h> +#else +#include +#endif +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include +#include +#include + +#include + +#include +#include +#include diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_read.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_read.c new file mode 100644 index 0000000000000000000000000000000000000000..e9d030bd556a3b0d9a6cb6f158baccb64aeb95d5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_read.c @@ -0,0 +1,118 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . 
+***********************************************************************/ + +#include "spinor.ih" +#include "default_input_values.h" + +paramsPropInfo PropInfo = {_default_propagator_splitted, _default_source_format_flag, _default_prop_precision_flag, NULL}; +paramsSourceInfo SourceInfo = {0, _default_propagator_splitted, _default_source_format_flag, _default_prop_precision_flag, 0, 0, 0, 0, 0, 0, 0, 1, NULL}; + +int read_spinor(spinor * const s, spinor * const r, char * filename, const int position_) { + int status = 0, getpos = 0, bytes = 0, prec = 0, prop_type, position = position_, rstat=0; + char *header_type = NULL; + READER *reader = NULL; + DML_Checksum checksum; + construct_reader(&reader, filename); + /* determine the propagator type */ + prop_type = parse_propagator_type(reader); + + switch (prop_type) { + case 1: + /* strictly speaking the following depends on whether we read a source or a propagator */ + position = 2 * position_ +1; + break; + case 2: + case 3: + return(-2); + case 11: + case 12: + case 13: + return(-3); + case -1: + case 4: + prop_type = 0; + } + + /* seek back to beginning of file*/ + destruct_reader(reader); + construct_reader(&reader, filename); + + /* Find the desired propagator (could be more than one in a file) */ + while ((status = ReaderNextRecord(reader)) != LIME_EOF) { + if (status != LIME_SUCCESS) { + fprintf(stderr, "ReaderNextRecord returned status %d.\n", status); + break; + } + header_type = ReaderType(reader); + if (strcmp("scidac-binary-data", header_type) == 0) { + if (getpos == position) { + break; + } + ++getpos; + } + } + + if (status == LIME_EOF) { + fprintf(stderr, "Unable to find requested LIME record scidac-binary-data in file %s.\nEnd of file reached before record was found.\n", filename); + return(-5); + } + + bytes = ReaderBytes(reader); + + if ((int)bytes == LX * g_nproc_x * LY * g_nproc_y * LZ * g_nproc_z * T * g_nproc_t * sizeof(spinor)) { + prec = 64; + } + else { + if ((int)bytes == LX * g_nproc_x * LY 
* g_nproc_y * LZ * g_nproc_z * T * g_nproc_t * sizeof(spinor) / 2) { + prec = 32; + } + else { + fprintf(stderr, "Length of scidac-binary-data record in %s does not match input parameters.\n", filename); + fprintf(stderr, "Found %d bytes.\n", bytes); + return(-6); + } + } + + if (g_cart_id == 0 && g_debug_level >= 0) { + printf("# %s precision read (%d bits).\n", (prec == 64 ? "Double" : "Single") ,prec); + } + + if(r == NULL) { + if( (rstat = read_binary_spinor_data_l(s, reader, &checksum)) != 0) { + fprintf(stderr, "read_binary_spinor_data_l failed with return value %d", rstat); + return(-7); + } + } + else { + if( (rstat = read_binary_spinor_data(s, r, reader, &checksum)) != 0) { + fprintf(stderr, "read_binary_spinor_data failed with return value %d", rstat); + return(-7); + } + } + + if (g_cart_id == 0 && g_debug_level >= 0) { + printf("# Scidac checksums for DiracFermion field %s position %d:\n", filename, position); + printf("# Calculated : A = %#x B = %#x.\n", checksum.suma, checksum.sumb); + printf("# No Scidac checksum was read from headers, unable to check integrity of file.\n"); + } + + destruct_reader(reader); + + return(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_read_binary.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_read_binary.c new file mode 100644 index 0000000000000000000000000000000000000000..a16cfc3525f37b508f110e2dd342204c37ad85f2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_read_binary.c @@ -0,0 +1,360 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. 
+* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#include "spinor.ih" + +#ifdef HAVE_LIBLEMON +int read_binary_spinor_data(spinor * const s, spinor * const r, LemonReader * lemonreader, DML_Checksum *checksum) { + + int t, x, y , z, i = 0, status = 0; + int latticeSize[] = {T_global, g_nproc_x*LX, g_nproc_y*LY, g_nproc_z*LZ}; + int scidacMapping[] = {0, 3, 2, 1}; + int prec = 0; + n_uint64_t bytes; + spinor *p = NULL; + char *filebuffer = NULL, *current = NULL; + double tick = 0, tock = 0; + DML_SiteRank rank; + char measure[64]; + + bytes = lemonReaderBytes(lemonreader); + + if (bytes == (n_uint64_t)g_nproc * (n_uint64_t)VOLUME * (n_uint64_t)sizeof(spinor)) { + prec = 64; + bytes = sizeof(spinor); + } + else { + if (bytes == (n_uint64_t)g_nproc * (n_uint64_t)VOLUME * (n_uint64_t)sizeof(spinor) / 2) { + prec = 32; + bytes = sizeof(spinor)/2; + } + else { + return(-3); + } + } + + DML_checksum_init(checksum); + + if((void*)(filebuffer = malloc(VOLUME * bytes)) == NULL) { + return(-1); + } + + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + tick = MPI_Wtime(); + } + status = lemonReadLatticeParallelMapped(lemonreader, filebuffer, bytes, latticeSize, scidacMapping); + + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + tock = MPI_Wtime(); + + if (g_cart_id == 0) { + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes, "b"); + fprintf(stdout, "# Time spent reading %s ", measure); + engineering(measure, tock - tick, "s"); + fprintf(stdout, "was %s.\n", measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * 
latticeSize[3] * bytes / (tock - tick), "b/s"); + fprintf(stdout, "# Reading speed: %s", measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (g_nproc * (tock - tick)), "b/s"); + fprintf(stdout, " (%s per MPI process).\n", measure); + fflush(stdout); + } + } + + if (status < 0 && status != LEMON_EOR) { + fprintf(stderr, "lemonReadLatticeParallelMapped returned error %d in spinor_read_binary.c", status); + free(filebuffer); + return(-2); + } + + for (t = 0; t < T; t++) { + for (z = 0; z < LZ; z++) { + for (y = 0; y < LY; y++) { + for (x = 0; x < LX; x++) { + rank = (DML_SiteRank)(g_proc_coords[1] * LX + + (((g_proc_coords[0] * T + t) * g_nproc_z * LZ + + g_proc_coords[3] * LZ + z) * g_nproc_y * LY + + g_proc_coords[2] * LY + y) * + ((DML_SiteRank) LX * g_nproc_x) + x); + current = filebuffer + bytes * (x + (y + (t * LZ + z) * LY) * LX); + DML_checksum_accum(checksum, rank, current, bytes); + + i = g_lexic2eosub[ g_ipt[t][x][y][z] ]; + p = ((t + x + y + z + + g_proc_coords[3] * LZ + g_proc_coords[2] * LY + + g_proc_coords[1] * LX + g_proc_coords[0] * T) % 2) ? 
r : s; + if (prec == 32) + be_to_cpu_assign_single2double(p + i, current, sizeof(spinor) / 8); + else + be_to_cpu_assign(p + i, current, sizeof(spinor) / 8); + } + } + } + } + + DML_global_xor(&checksum->suma); + DML_global_xor(&checksum->sumb); + + free(filebuffer); + return 0; +} +#else /* HAVE_LIBLEMON */ +int read_binary_spinor_data(spinor * const s, spinor * const r, LimeReader * limereader, DML_Checksum * checksum) { + int t, x, y , z, i = 0, status=0; + n_uint64_t bytes; + spinor * p = NULL; + spinor tmp[1]; + float tmp2[24]; + DML_SiteRank rank; + int prec; + + DML_checksum_init(checksum); + + bytes = limeReaderBytes(limereader); + if (bytes == (n_uint64_t)g_nproc * (n_uint64_t)VOLUME * (n_uint64_t)sizeof(spinor)) { + prec = 64; + bytes = sizeof(spinor); + } + else { + if (bytes == (n_uint64_t)g_nproc * (n_uint64_t)VOLUME * (n_uint64_t)sizeof(spinor) / 2) { + prec = 32; + bytes = sizeof(spinor)/2; + } + else { + return(-3); + } + } + + for(t = 0; t < T; t++) { + for(z = 0; z < LZ; z++) { + for(y = 0; y < LY; y++) { +#if (defined MPI) + limeReaderSeek(limereader,(n_uint64_t) + (g_proc_coords[1]*LX + + (((g_proc_coords[0]*T+t)*g_nproc_z*LZ+g_proc_coords[3]*LZ+z)*g_nproc_y*LY + + g_proc_coords[2]*LY+y)*LX*g_nproc_x)*bytes, + SEEK_SET); +#endif + for(x = 0; x < LX; x++){ + i = g_lexic2eosub[ g_ipt[t][x][y][z] ]; + if((t+x+y+z+ + g_proc_coords[3]*LZ+g_proc_coords[2]*LY + +g_proc_coords[0]*T+g_proc_coords[1]*LX)%2==0) { + p = s; + } + else { + p = r; + } + rank = (DML_SiteRank) (g_proc_coords[1]*LX + + (((g_proc_coords[0]*T+t)*g_nproc_z*LZ+g_proc_coords[3]*LZ+z)*g_nproc_y*LY + + g_proc_coords[2]*LY+y)*((DML_SiteRank)LX*g_nproc_x) + x); + if(prec == 32) { + status = limeReaderReadData(tmp2, &bytes, limereader); + DML_checksum_accum(checksum,rank,(char *) tmp2, bytes); + be_to_cpu_assign_single2double(p+i, (float*)tmp2, sizeof(spinor)/8); + } + else { + status = limeReaderReadData(tmp, &bytes, limereader); + DML_checksum_accum(checksum,rank,(char *) tmp, bytes); + 
be_to_cpu_assign(p + i, tmp, sizeof(spinor)/8); + } + if(status < 0 && status != LIME_EOR) { + fprintf(stderr, "LIME read error occurred with status = %d while reading in spinor_read_binary.c!\n", status); +#ifdef MPI + MPI_Abort(MPI_COMM_WORLD, 1); + MPI_Finalize(); +#endif + return(-2); + } + } + } + } + } +#ifdef MPI + DML_checksum_combine(checksum); +#endif + return(0); +} +#endif /* HAVE_LIBLEMON */ + + + +#ifdef HAVE_LIBLEMON +int read_binary_spinor_data_l(spinor * const s, LemonReader * lemonreader, DML_Checksum *checksum) { + + int t, x, y , z, i = 0, status = 0; + int latticeSize[] = {T_global, g_nproc_x*LX, g_nproc_y*LY, g_nproc_z*LZ}; + int scidacMapping[] = {0, 3, 2, 1}; + int prec = 0; + n_uint64_t bytes; + char *filebuffer = NULL, *current = NULL; + double tick = 0, tock = 0; + DML_SiteRank rank; + char measure[64]; + + bytes = lemonReaderBytes(lemonreader); + + if (bytes == (n_uint64_t)g_nproc * (n_uint64_t)VOLUME * (n_uint64_t)sizeof(spinor)) { + prec = 64; + bytes = sizeof(spinor); + } + else { + if (bytes == (n_uint64_t)g_nproc * (n_uint64_t)VOLUME * (n_uint64_t)sizeof(spinor) / 2) { + prec = 32; + bytes = sizeof(spinor)/2; + } + else { + return(-3); + } + } + + DML_checksum_init(checksum); + + if((void*)(filebuffer = malloc(VOLUME * bytes)) == NULL) { + return(-1); + } + + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + tick = MPI_Wtime(); + } + status = lemonReadLatticeParallelMapped(lemonreader, filebuffer, bytes, latticeSize, scidacMapping); + + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + tock = MPI_Wtime(); + + if (g_cart_id == 0) { + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes, "b"); + fprintf(stdout, "# Time spent reading %s ", measure); + engineering(measure, tock - tick, "s"); + fprintf(stdout, "was %s.\n", measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (tock - tick), "b/s"); + fprintf(stdout, "# Reading 
speed: %s", measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (g_nproc * (tock - tick)), "b/s"); + fprintf(stdout, " (%s per MPI process).\n", measure); + fflush(stdout); + } + } + + if (status < 0 && status != LEMON_EOR) { + fprintf(stderr, "lemonReadLatticeParallelMapped returned error %d in spinor_read_binary.c", status); + free(filebuffer); + return(-2); + } + + for (t = 0; t < T; t++) { + for (z = 0; z < LZ; z++) { + for (y = 0; y < LY; y++) { + for (x = 0; x < LX; x++) { + rank = (DML_SiteRank)(g_proc_coords[1] * LX + + (((g_proc_coords[0] * T + t) * g_nproc_z * LZ + + g_proc_coords[3] * LZ + z) * g_nproc_y * LY + + g_proc_coords[2] * LY + y) * + ((DML_SiteRank) LX * g_nproc_x) + x); + current = filebuffer + bytes * (x + (y + (t * LZ + z) * LY) * LX); + DML_checksum_accum(checksum, rank, current, bytes); + + i = g_ipt[t][x][y][z]; + if (prec == 32) + be_to_cpu_assign_single2double(s + i, current, sizeof(spinor) / 8); + else + be_to_cpu_assign(s + i, current, sizeof(spinor) / 8); + } + } + } + } + + DML_global_xor(&checksum->suma); + DML_global_xor(&checksum->sumb); + + free(filebuffer); + return 0; +} +#else /* HAVE_LIBLEMON */ +int read_binary_spinor_data_l(spinor * const s, LimeReader * limereader, DML_Checksum * checksum) { + int t, x, y , z, i = 0, status=0; + n_uint64_t bytes; + spinor tmp[1]; + float tmp2[24]; + DML_SiteRank rank; + int prec; + + + DML_checksum_init(checksum); + bytes = limeReaderBytes(limereader); + + if (bytes == (n_uint64_t)g_nproc * (n_uint64_t)VOLUME * (n_uint64_t)sizeof(spinor)) { + prec = 64; + bytes = sizeof(spinor); + } + else { + if (bytes == (n_uint64_t)g_nproc * (n_uint64_t)VOLUME * (n_uint64_t)sizeof(spinor) / 2) { + prec = 32; + bytes = sizeof(spinor)/2; + } + else { + return(-3); + } + } + + for(t = 0; t < T; t++) { + for(z = 0; z < LZ; z++) { + for(y = 0; y < LY; y++) { +#if (defined MPI) + limeReaderSeek(limereader,(n_uint64_t) + (g_proc_coords[1]*LX + + 
(((g_proc_coords[0]*T+t)*g_nproc_z*LZ+g_proc_coords[3]*LZ+z)*g_nproc_y*LY + + g_proc_coords[2]*LY+y)*LX*g_nproc_x)*bytes, + SEEK_SET); +#endif + for(x = 0; x < LX; x++){ + i = g_ipt[t][x][y][z]; + rank = (DML_SiteRank) (g_proc_coords[1]*LX + + (((g_proc_coords[0]*T+t)*g_nproc_z*LZ+g_proc_coords[3]*LZ+z)*g_nproc_y*LY + + g_proc_coords[2]*LY+y)*((DML_SiteRank)LX*g_nproc_x) + x); + if(prec == 32) { + status = limeReaderReadData(tmp2, &bytes, limereader); + DML_checksum_accum(checksum,rank,(char *) tmp2, bytes); + be_to_cpu_assign_single2double(s + i, (float*)tmp2, sizeof(spinor)/8); + } + else { + status = limeReaderReadData(tmp, &bytes, limereader); + DML_checksum_accum(checksum,rank,(char *) tmp, bytes); + be_to_cpu_assign(s + i, tmp, sizeof(spinor)/8); + } + if(status < 0 && status != LIME_EOR) { + fprintf(stderr, "LIME read error occurred with status = %d while reading in spinor_read_binary.c!\n", status); +#ifdef MPI + MPI_Abort(MPI_COMM_WORLD, 1); + MPI_Finalize(); +#endif + return(-2); + } + } + } + } + } +#ifdef MPI + DML_checksum_combine(checksum); +#endif + return(0); +} +#endif /* HAVE_LIBLEMON */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write.c new file mode 100644 index 0000000000000000000000000000000000000000..5bb46be09fbc2211a9fb1203059fdd9dbfe9836b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write.c @@ -0,0 +1,47 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. 
+* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + + +#include "spinor.ih" + +int write_spinor(WRITER * writer, spinor ** const s, spinor ** const r, const int flavours, const int prec) +{ + DML_Checksum checksum; + uint64_t bytes; + int i = 0, status = 0; + + bytes = (n_uint64_t)LX * g_nproc_x * LY * g_nproc_y * LZ * g_nproc_z * T * g_nproc_t * (n_uint64_t)(sizeof(spinor) * prec / 64); + + if(r == NULL) { + for (i = 0; i < flavours; ++i) { + //DEBUG following line + write_header(writer, 1, 0, "scidac-binary-data", bytes); + status = write_binary_spinor_data_l(s[i], writer, &checksum, prec); + write_checksum(writer, &checksum, NULL); + } + } + else { + for (i = 0; i < flavours; ++i) { + write_header(writer, 1, 0, "scidac-binary-data", bytes); + status = write_binary_spinor_data(s[i], r[i], writer, &checksum, prec); + write_checksum(writer, &checksum, NULL); + } + } + return status; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_binary.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_binary.c new file mode 100644 index 0000000000000000000000000000000000000000..640422297b065c41bc6a01b5c7f0f7b31ab61268 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_binary.c @@ -0,0 +1,473 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. 
+* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#include "spinor.ih" + +#ifdef HAVE_LIBLEMON +int write_binary_spinor_data(spinor * const s, spinor * const r, + LemonWriter * lemonwriter, DML_Checksum *checksum, int const prec) +{ + int x, y, z, t, i = 0, xG, yG, zG, tG, status = 0; + int latticeSize[] = {T_global, g_nproc_x*LX, g_nproc_y*LY, g_nproc_z*LZ}; + int scidacMapping[] = {0, 3, 2, 1}; + unsigned long bufoffset = 0; + char *filebuffer = NULL; + uint64_t bytes; + DML_SiteRank rank; + double tick = 0, tock = 0; + char measure[64]; + spinor *p = NULL; + + DML_checksum_init(checksum); + bytes = (uint64_t)sizeof(spinor); + if (prec == 32) { + bytes /= 2; + } + if((void*)(filebuffer = malloc(VOLUME * bytes)) == NULL) { + fprintf (stderr, "malloc errno in write_binary_spinor_data_parallel: %d\n", errno); + fflush(stderr); + errno = 0; + /* do we need to abort here? 
*/ + return 1; + } + + tG = g_proc_coords[0]*T; + zG = g_proc_coords[3]*LZ; + yG = g_proc_coords[2]*LY; + xG = g_proc_coords[1]*LX; + for(t = 0; t < T; t++) { + for(z = 0; z < LZ; z++) { + for(y = 0; y < LY; y++) { + for(x = 0; x < LX; x++) { + rank = (DML_SiteRank) ((((tG + t)*L + zG + z)*L + yG + y)*L + xG + x); + i = g_lexic2eosub[g_ipt[t][x][y][z]]; + if ((z + zG + y + yG + + x + xG + t + tG) % 2 == 0) + p = s; + else + p = r; + + if (prec == 32) + be_to_cpu_assign_double2single((float*)(filebuffer + bufoffset), (double*)(p + i), sizeof(spinor) / 8); + else + be_to_cpu_assign((double*)(filebuffer + bufoffset), (double*)(p + i), sizeof(spinor) / 8); + DML_checksum_accum(checksum, rank, (char*) filebuffer + bufoffset, bytes); + bufoffset += bytes; + } + } + } + } + + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + tick = MPI_Wtime(); + } + + status = lemonWriteLatticeParallelMapped(lemonwriter, filebuffer, bytes, latticeSize, scidacMapping); + + if (status != LEMON_SUCCESS) + { + free(filebuffer); + fprintf(stderr, "LEMON write error occurred with status = %d, while in write_binary_spinor_data (spinor_write_binary.c)!\n", status); + return(-2); + } + + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + tock = MPI_Wtime(); + + if (g_cart_id == 0) { + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes, "b"); + fprintf(stdout, "# Time spent writing %s ", measure); + engineering(measure, tock - tick, "s"); + fprintf(stdout, "was %s.\n", measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (tock - tick), "b/s"); + fprintf(stdout, "# Writing speed: %s", measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (g_nproc * (tock - tick)), "b/s"); + fprintf(stdout, " (%s per MPI process).\n", measure); + fflush(stdout); + } + } + + lemonWriterCloseRecord(lemonwriter); + + DML_global_xor(&checksum->suma); + 
DML_global_xor(&checksum->sumb); + + free(filebuffer); + return 0; + +} + +#else /* HAVE_LIBLEMON */ +int write_binary_spinor_data(spinor * const s, spinor * const r, LimeWriter * limewriter, DML_Checksum * checksum, const int prec) +{ + int x, X, y, Y, z, Z, t, t0, tag=0, id=0, i=0, status=0; + int latticeSize[] = {T_global, g_nproc_x*LX, g_nproc_y*LY, g_nproc_z*LZ}; + spinor * p = NULL; + spinor tmp[1]; + float tmp2[24]; + int coords[4]; + n_uint64_t bytes; + DML_SiteRank rank; +#ifdef MPI + double tick = 0, tock = 0; + char measure[64]; + MPI_Status mstatus; +#endif + DML_checksum_init(checksum); + +#ifdef MPI + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + tick = MPI_Wtime(); + } +#endif + + if(prec == 32) bytes = (n_uint64_t)sizeof(spinor)/2; + else bytes = (n_uint64_t)sizeof(spinor); + for(t0 = 0; t0 < T*g_nproc_t; t0++) { + t = t0 - T*g_proc_coords[0]; + coords[0] = t0 / T; + for(z = 0; z < LZ*g_nproc_z; z++) { + Z = z - g_proc_coords[3]*LZ; + coords[3] = z / LZ; + for(y = 0; y < LY*g_nproc_y; y++) { + Y = y - g_proc_coords[2]*LY; + coords[2] = y / LY; + for(x = 0; x < LX*g_nproc_x; x++) { + X = x - g_proc_coords[1]*LX; + coords[1] = x / LX; +#ifdef MPI + MPI_Cart_rank(g_cart_grid, coords, &id); +#endif + if(g_cart_id == id) { + i = g_lexic2eosub[ g_ipt[t][X][Y][Z] ]; + if((t+X+Y+Z+g_proc_coords[3]*LZ+g_proc_coords[2]*LY + + g_proc_coords[0]*T+g_proc_coords[1]*LX)%2 == 0) { + p = s; + } + else { + p = r; + } + } + if(g_cart_id == 0) { + /* Rank should be computed by proc 0 only */ + rank = (DML_SiteRank) (((t0*LZ*g_nproc_z + z)*LY*g_nproc_y + y)*LX*g_nproc_x + x); + + if(g_cart_id == id) { + if(prec == 32) { + be_to_cpu_assign_double2single((float*)tmp2, p + i, sizeof(spinor)/8); + DML_checksum_accum(checksum,rank,(char *) tmp2,sizeof(spinor)/2); + status = limeWriteRecordData((void*)tmp2, &bytes, limewriter); + } + else { + be_to_cpu_assign(tmp, p + i , sizeof(spinor)/8); + DML_checksum_accum(checksum,rank,(char *) tmp,sizeof(spinor)); + status = 
limeWriteRecordData((void*)tmp, &bytes, limewriter); + } + } +#ifdef MPI + else{ + if(prec == 32) { + MPI_Recv((void*)tmp2, sizeof(spinor)/8, MPI_FLOAT, id, tag, g_cart_grid, &mstatus); + DML_checksum_accum(checksum,rank,(char *) tmp2, sizeof(spinor)/2); + status = limeWriteRecordData((void*)tmp2, &bytes, limewriter); + } + else { + MPI_Recv((void*)tmp, sizeof(spinor)/8, MPI_DOUBLE, id, tag, g_cart_grid, &mstatus); + DML_checksum_accum(checksum,rank,(char *) tmp, sizeof(spinor)); + status = limeWriteRecordData((void*)tmp, &bytes, limewriter); + } + } +#endif + if(status < 0 ) { + fprintf(stderr, "LIME write error occurred with status = %d, while in write_binary_spinor_data (spinor_write_binary.c)!\n", status); +#ifdef MPI + MPI_Abort(MPI_COMM_WORLD, 1); + MPI_Finalize(); +#endif + exit(500); + } + } +#ifdef MPI + else{ + if(g_cart_id == id){ + if(prec == 32) { + be_to_cpu_assign_double2single((float*)tmp2, p + i, sizeof(spinor)/8); + MPI_Send((void*) tmp2, sizeof(spinor)/8, MPI_FLOAT, 0, tag, g_cart_grid); + } + else { + be_to_cpu_assign(tmp, p + i, sizeof(spinor)/8); + MPI_Send((void*) tmp, sizeof(spinor)/8, MPI_DOUBLE, 0, tag, g_cart_grid); + } + } + } +#endif + tag++; + } +#ifdef MPI + MPI_Barrier(g_cart_grid); +#endif + tag=0; + } + } + } +#ifdef MPI + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + tock = MPI_Wtime(); + + if (g_cart_id == 0) { + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes, "b"); + fprintf(stdout, "# Time spent writing %s ", measure); + engineering(measure, tock - tick, "s"); + fprintf(stdout, "was %s.\n", measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (tock - tick), "b/s"); + fprintf(stdout, "# Writing speed: %s", measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (g_nproc * (tock - tick)), "b/s"); + fprintf(stdout, " (%s per MPI process).\n", measure); + 
fflush(stdout); + } + } +#endif + return(0); +} +#endif /* HAVE_LIBLEMON */ + + + +#ifdef HAVE_LIBLEMON +int write_binary_spinor_data_l(spinor * const s, + LemonWriter * lemonwriter, DML_Checksum *checksum, int const prec) +{ + int x, y, z, t, i = 0, xG, yG, zG, tG, status = 0; + int latticeSize[] = {T_global, g_nproc_x*LX, g_nproc_y*LY, g_nproc_z*LZ}; + int scidacMapping[] = {0, 3, 2, 1}; + unsigned long bufoffset = 0; + char *filebuffer = NULL; + uint64_t bytes; + DML_SiteRank rank; + double tick = 0, tock = 0; + char measure[64]; + + DML_checksum_init(checksum); + bytes = (uint64_t)sizeof(spinor); + if (prec == 32) { + bytes /= 2; + } + if((void*)(filebuffer = malloc(VOLUME * bytes)) == NULL) { + fprintf (stderr, "malloc errno in write_binary_spinor_data_parallel: %d\n", errno); + fflush(stderr); + errno = 0; + /* do we need to abort here? */ + return 1; + } + + tG = g_proc_coords[0]*T; + zG = g_proc_coords[3]*LZ; + yG = g_proc_coords[2]*LY; + xG = g_proc_coords[1]*LX; + for(t = 0; t < T; t++) { + for(z = 0; z < LZ; z++) { + for(y = 0; y < LY; y++) { + for(x = 0; x < LX; x++) { + rank = (DML_SiteRank) ((((tG + t)*L + zG + z)*L + yG + y)*L + xG + x); + i = g_ipt[t][x][y][z]; + + if (prec == 32) + be_to_cpu_assign_double2single((float*)(filebuffer + bufoffset), (double*)(s + i), sizeof(spinor) / 8); + else + be_to_cpu_assign((double*)(filebuffer + bufoffset), (double*)(s + i), sizeof(spinor) / 8); + DML_checksum_accum(checksum, rank, (char*) filebuffer + bufoffset, bytes); + bufoffset += bytes; + } + } + } + } + + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + tick = MPI_Wtime(); + } + + status = lemonWriteLatticeParallelMapped(lemonwriter, filebuffer, bytes, latticeSize, scidacMapping); + + if (status != LEMON_SUCCESS) + { + free(filebuffer); + fprintf(stderr, "LEMON write error occurred with status = %d, while in write_binary_spinor_data_l (spinor_write_binary.c)!\n", status); + return(-2); + } + + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + 
tock = MPI_Wtime(); + + if (g_cart_id == 0) { + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes, "b"); + fprintf(stdout, "# Time spent writing %s ", measure); + engineering(measure, tock - tick, "s"); + fprintf(stdout, "was %s.\n", measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (tock - tick), "b/s"); + fprintf(stdout, "# Writing speed: %s", measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (g_nproc * (tock - tick)), "b/s"); + fprintf(stdout, " (%s per MPI process).\n", measure); + fflush(stdout); + } + } + + lemonWriterCloseRecord(lemonwriter); + + DML_global_xor(&checksum->suma); + DML_global_xor(&checksum->sumb); + + free(filebuffer); + return 0; + +} + +#else /* HAVE_LIBLEMON */ +int write_binary_spinor_data_l(spinor * const s, LimeWriter * limewriter, DML_Checksum * checksum, const int prec) +{ + int x, X, y, Y, z, Z, t, t0, tag=0, id=0, i=0, status=0; + int latticeSize[] = {T_global, g_nproc_x*LX, g_nproc_y*LY, g_nproc_z*LZ}; + spinor tmp[1]; + float tmp2[24]; + int coords[4]; + n_uint64_t bytes; + DML_SiteRank rank; +#ifdef MPI + double tick = 0, tock = 0; + char measure[64]; + MPI_Status mstatus; +#endif + DML_checksum_init(checksum); + +#ifdef MPI + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + tick = MPI_Wtime(); + } +#endif + + if(prec == 32) bytes = (n_uint64_t)sizeof(spinor)/2; + else bytes = (n_uint64_t)sizeof(spinor); + for(t0 = 0; t0 < T*g_nproc_t; t0++) { + t = t0 - T*g_proc_coords[0]; + coords[0] = t0 / T; + for(z = 0; z < LZ*g_nproc_z; z++) { + Z = z - g_proc_coords[3]*LZ; + coords[3] = z / LZ; + for(y = 0; y < LY*g_nproc_y; y++) { + Y = y - g_proc_coords[2]*LY; + coords[2] = y / LY; + for(x = 0; x < LX*g_nproc_x; x++) { + X = x - g_proc_coords[1]*LX; + coords[1] = x / LX; +#ifdef MPI + MPI_Cart_rank(g_cart_grid, coords, &id); +#endif + if(g_cart_id == id) { + i = 
g_ipt[t][X][Y][Z]; + } + if(g_cart_id == 0) { + /* Rank should be computed by proc 0 only */ + rank = (DML_SiteRank) (((t0*LZ*g_nproc_z + z)*LY*g_nproc_y + y)*LX*g_nproc_x + x); + + if(g_cart_id == id) { + if(prec == 32) { + be_to_cpu_assign_double2single((float*)tmp2, s + i, sizeof(spinor)/8); + DML_checksum_accum(checksum,rank,(char *) tmp2,sizeof(spinor)/2); + status = limeWriteRecordData((void*)tmp2, &bytes, limewriter); + } + else { + be_to_cpu_assign(tmp, s + i , sizeof(spinor)/8); + DML_checksum_accum(checksum,rank,(char *) tmp,sizeof(spinor)); + status = limeWriteRecordData((void*)tmp, &bytes, limewriter); + } + } +#ifdef MPI + else{ + if(prec == 32) { + MPI_Recv((void*)tmp2, sizeof(spinor)/8, MPI_FLOAT, id, tag, g_cart_grid, &mstatus); + DML_checksum_accum(checksum,rank,(char *) tmp2, sizeof(spinor)/2); + status = limeWriteRecordData((void*)tmp2, &bytes, limewriter); + } + else { + MPI_Recv((void*)tmp, sizeof(spinor)/8, MPI_DOUBLE, id, tag, g_cart_grid, &mstatus); + DML_checksum_accum(checksum,rank,(char *) tmp, sizeof(spinor)); + status = limeWriteRecordData((void*)tmp, &bytes, limewriter); + } + } +#endif + if(status < 0 ) { + fprintf(stderr, "LIME write error occurred with status = %d, while in write_binary_spinor_data_l (spinor_write_binary.c)!\n", status); +#ifdef MPI + MPI_Abort(MPI_COMM_WORLD, 1); + MPI_Finalize(); +#endif + exit(500); + } + + } +#ifdef MPI + else{ + if(g_cart_id == id){ + if(prec == 32) { + be_to_cpu_assign_double2single((float*)tmp2, s + i, sizeof(spinor)/8); + MPI_Send((void*) tmp2, sizeof(spinor)/8, MPI_FLOAT, 0, tag, g_cart_grid); + } + else { + be_to_cpu_assign(tmp, s + i, sizeof(spinor)/8); + MPI_Send((void*) tmp, sizeof(spinor)/8, MPI_DOUBLE, 0, tag, g_cart_grid); + } + } + } +#endif + tag++; + } +#ifdef MPI + MPI_Barrier(g_cart_grid); +#endif + tag=0; + } + } + } +#ifdef MPI + if (g_debug_level > 0) { + MPI_Barrier(g_cart_grid); + tock = MPI_Wtime(); + + if (g_cart_id == 0) { + engineering(measure, latticeSize[0] * 
latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes, "b"); + fprintf(stdout, "# Time spent writing %s ", measure); + engineering(measure, tock - tick, "s"); + fprintf(stdout, "was %s.\n", measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (tock - tick), "b/s"); + fprintf(stdout, "# Writing speed: %s", measure); + engineering(measure, latticeSize[0] * latticeSize[1] * latticeSize[2] * latticeSize[3] * bytes / (g_nproc * (tock - tick)), "b/s"); + fprintf(stdout, " (%s per MPI process).\n", measure); + fflush(stdout); + } + } +#endif + return(0); +} +#endif /* HAVE_LIBLEMON */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_info.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_info.c new file mode 100644 index 0000000000000000000000000000000000000000..786b831c4fca86d3a69ffc1fb3746e6245949467 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_info.c @@ -0,0 +1,42 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . 
+***********************************************************************/ + +#include "spinor.ih" + +void write_spinor_info(WRITER * writer, const int write_prop_format_flag, + paramsInverterInfo * InverterInfo, int append) +{ + if (!append) { + if(GaugeInfo.xlfInfo != NULL) { + /* This message starts the gauge info, so it should be MB=1 ME=0 */ + write_header(writer, 1, 0, "xlf-info", strlen(GaugeInfo.xlfInfo)); + write_message(writer, GaugeInfo.xlfInfo, strlen(GaugeInfo.xlfInfo)); + close_writer_record(writer); + } + write_checksum(writer, &GaugeInfo.checksum, "gauge-scidac-checksum-copy"); + if(GaugeInfo.ildg_data_lfn != NULL) + { + /* This message always stands on its own: MB=1 ME=1 */ + write_header(writer, 1, 1, "gauge-ildg-data-lfn-copy", strlen(GaugeInfo.ildg_data_lfn)); + write_message(writer, GaugeInfo.ildg_data_lfn, strlen(GaugeInfo.ildg_data_lfn)); + close_writer_record(writer); + } + } + write_inverter_info(writer, InverterInfo); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_propagator_format.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_propagator_format.c new file mode 100644 index 0000000000000000000000000000000000000000..b43ece8f8d8e9cf3bc3b8c0b51fa289041fe6731 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_propagator_format.c @@ -0,0 +1,49 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#include "spinor.ih" + +void write_propagator_format(WRITER *writer, paramsPropagatorFormat const *format) +{ + uint64_t bytes; + char *message; + message = (char*)malloc(512); + sprintf(message, "\n" + "\n" + " diracFermion\n" + " %d\n" + " %d\n" + " %d\n" + " %d\n" + " %d\n" + " %d\n" + "", + format->prec, format->flavours, + format->lx, format->ly, format->lx, format->lt); + + bytes = strlen(message); + /* The propagator format is the last part of metadata, therefore MB=0, ME=1 */ + write_header(writer, 0, 1, "etmc-propagator-format", bytes); + write_message(writer, message, bytes); + close_writer_record(writer); + free(message); + return; +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_propagator_type.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_propagator_type.c new file mode 100644 index 0000000000000000000000000000000000000000..ca49626dc98319fb65bd78519ce7852319604885 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_propagator_type.c @@ -0,0 +1,41 @@ +#include "spinor.ih" + +void write_propagator_type(WRITER *writer, const int type) +{ + uint64_t bytes; + char *message; + +#ifndef HAVE_LIBLEMON + if(g_cart_id == 0) { +#endif /* ! 
HAVE_LIBLEMON */ + + message = (char*)malloc(128); + + switch (type) { + case 0: + sprintf(message, "DiracFermion_Sink"); + break; + case 1: + sprintf(message, "DiracFermion_Source_Sink_Pairs"); + break; + case 2: + sprintf(message, "DiracFermion_ScalarSource_TwelveSink"); + break; + case 3: + sprintf(message, "DiracFermion_ScalarSource_FourSink"); + break; + case 4: + sprintf(message, "DiracFermion_Deflation_Field"); + break; + } + bytes = strlen(message); + + write_header(writer, 1, 1, "propagator-type", bytes); + write_message(writer, message, bytes); + + close_writer_record(writer); + free(message); +#ifndef HAVE_LIBLEMON + } +#endif /* ! HAVE_LIBLEMON */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_source_format.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_source_format.c new file mode 100644 index 0000000000000000000000000000000000000000..1e200dd7bbd316b91746b91036fb1a41c4643c70 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_source_format.c @@ -0,0 +1,56 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . 
+***********************************************************************/ + +#include "spinor.ih" + +void write_source_format(WRITER *writer, paramsSourceFormat const *format) +{ + uint64_t bytes; + char *buf = NULL; +#ifndef HAVE_LIBLEMON + if(g_cart_id == 0) { +#endif /* ! HAVE_LIBLEMON */ + buf = (char*)malloc(512); + sprintf(buf, "\n" + "\n" + " diracFermion\n" + " %d\n" + " %d\n" + " %d\n" + " %d\n" + " %d\n" + " %d\n" + " %d\n" + " %d\n" + "", + format->prec, format->flavours, + format->lx, format->ly, format->lz, format->lt, + format->spins, format->colours); + bytes = strlen(buf); + /* This message should be preceded by inverter info + * and followed by propagator format, so MB=ME=0 */ + write_header(writer, 0, 0, "etmc-source-format", bytes); + write_message(writer, buf, bytes); + close_writer_record(writer); + + free(buf); +#ifndef HAVE_LIBLEMON + } +#endif /* ! HAVE_LIBLEMON */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_stdout.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_stdout.c new file mode 100644 index 0000000000000000000000000000000000000000..5d4a2e25330ecf0999f649d3e09287cadf7edfcc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_stdout.c @@ -0,0 +1,70 @@ +/*********************************************************************** +* Copyright (C) 2012 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. 
+* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include "global.h" +#ifdef MPI +# include +#endif +#include "su3.h" +#include "io/spinor_write_stdout.h" + + +void spinor_write_stdout(spinor * const s) { + int X, Y, Z, t0, id = 0, ix, iy; + int coords[4]; + + for(int t = 0; t < g_nproc_t*T; t++) { + t0 = t - g_proc_coords[0]*T; + coords[0] = t / T; + for(int x = 0; x < g_nproc_x*LX; x++) { + X = x - g_proc_coords[1]*LX; + coords[1] = x / LX; + for(int y = 0; y < g_nproc_y*LY; y++) { + Y = y - g_proc_coords[2]*LY; + coords[2] = y / LY; + for(int z = 0; z < g_nproc_z*LZ; z++) { + Z = z - g_proc_coords[3]*LZ; + coords[3] = z / LZ; +#ifdef MPI + MPI_Cart_rank(g_cart_grid, coords, &id); +#endif + if((t+x+y+z)%2 == 0 && g_cart_id == id) { + ix = g_lexic2eosub[ g_ipt[t0][X][Y][Z] ]; + iy = t*(g_nproc_x*LX*g_nproc_y*LY*g_nproc_z*LZ) + + x*(g_nproc_y*LY*g_nproc_z*LZ) + + y*(g_nproc_z*LZ) + z; + printf(" %d %d %d %d %d, %d %d %d %d: %e %e sp\n", + iy, t, x, y, z, t0, X, Y, Z, + creal(s[ix].s0.c0), cimag(s[ix].s0.c0)); + fflush(stdout); + } +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + } + } + } + } + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_stdout.h b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_stdout.h new file mode 100644 index 0000000000000000000000000000000000000000..620a24650fcec029b0fe3bc6de8d3957a63b3bb2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/spinor_write_stdout.h @@ -0,0 +1,27 @@ +/*********************************************************************** +* Copyright (C) 2012 Carsten Urbach +* +* This file is part of tmLQCD. 
+* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#ifndef _SPINOR_WRITE_STDOUT_H +#define _SPINOR_WRITE_STDOUT_H + +#include "su3.h" + +void spinor_write_stdout(spinor * const s); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/sw_write_stdout.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/sw_write_stdout.c new file mode 100644 index 0000000000000000000000000000000000000000..2d28b4ccc1e919f1528b9e2f3b293ca83e0a5f3e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/sw_write_stdout.c @@ -0,0 +1,77 @@ +/*********************************************************************** +* Copyright (C) 2012 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . 
+***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include "global.h" +#ifdef MPI +# include +#endif +#include "su3.h" +#include "io/sw_write_stdout.h" + +void sw_write_stdout(su3 ** u) { + int X, Y, Z, t0, id = 0, ix, iy; + int coords[4]; + + for(int t = 0; t < g_nproc_t*T; t++) { + t0 = t - g_proc_coords[0]*T; + coords[0] = t / T; + for(int x = 0; x < g_nproc_x*LX; x++) { + X = x - g_proc_coords[1]*LX; + coords[1] = x / LX; + for(int y = 0; y < g_nproc_y*LY; y++) { + Y = y - g_proc_coords[2]*LY; + coords[2] = y / LY; + for(int z = 0; z < g_nproc_z*LZ; z++) { + Z = z - g_proc_coords[3]*LZ; + coords[3] = z / LZ; +#ifdef MPI + MPI_Cart_rank(g_cart_grid, coords, &id); +#endif + if(g_cart_id == id) { + ix = g_ipt[t0][X][Y][Z]; + iy = t*(g_nproc_x*LX*g_nproc_y*LY*g_nproc_z*LZ) + + x*(g_nproc_y*LY*g_nproc_z*LZ) + + y*(g_nproc_z*LZ) + z; + for(int mu = 0; mu < 4; mu++) { +/* printf(" %d %d %d %d %d, %d %d %d %d: %d %e %e %e %e %e %e %e %e\n", */ +/* iy, t, x, y, z, t0, X, Y, Z, */ +/* mu, df[ix][mu].d1, df[ix][mu].d2, */ +/* df[ix][mu].d3, df[ix][mu].d4, df[ix][mu].d5, df[ix][mu].d6, */ +/* df[ix][mu].d7, df[ix][mu].d8); */ + printf(" %d %d %d %d %d, %d %d %d %d: %d %e %e sw\n", + iy, t, x, y, z, t0, X, Y, Z, + mu, creal(u[ix][mu].c00), cimag(u[ix][mu].c02)); + + fflush(stdout); + } + } +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + } + } + } + } + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/sw_write_stdout.h b/qcd/part_cpu/applications/QCD/src/kernel_D/io/sw_write_stdout.h new file mode 100644 index 0000000000000000000000000000000000000000..9c7b81007fd8164ff434ab0c197534ba636efa0b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/sw_write_stdout.h @@ -0,0 +1,27 @@ +/*********************************************************************** +* Copyright (C) 2012 Carsten Urbach +* +* This file is part of tmLQCD. 
+* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#ifndef _SW_WRITE_STDOUT_H +#define _SW_WRITE_STDOUT_H + +#include "su3.h" + +void sw_write_stdout(su3 ** u); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils.c new file mode 100644 index 0000000000000000000000000000000000000000..d335c3ec7222810d31aa9965ac1682ba9c2e49bb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils.c @@ -0,0 +1,103 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#include"utils.ih" + +int isnan_f (float x) { return x != x; } +int isnan_d (double x) { return x != x; } +int isnan_ld (long double x) { return x != x; } + + +int big_endian(){ + union{ + int l; + char c[sizeof(int)]; + } u; + + u.l=1; + return(u.c[sizeof(int) - 1] == 1); +} + +void write_su3(su3 * up, FILE * f) { + fprintf(f,"%f %f %f %f %f %f \n%f %f %f %f %f %f \n%f %f %f %f %f %f %d\n\n", + creal(up->c00), cimag(up->c00), creal(up->c01), cimag(up->c01), + creal(up->c02), cimag(up->c02), creal(up->c10), cimag(up->c10), + creal(up->c11), cimag(up->c11), creal(up->c12), cimag(up->c12), + creal(up->c20), cimag(up->c20), creal(up->c21), cimag(up->c21), + creal(up->c22), cimag(up->c22), g_cart_id); +} + + + +void single2double_cm(spinor * const R, float * const S) { + R->s0.c0 = S[ 0] + S[ 1] * I; + R->s0.c1 = S[ 2] + S[ 3] * I; + R->s0.c2 = S[ 4] + S[ 5] * I; + R->s1.c0 = S[ 6] + S[ 7] * I; + R->s1.c1 = S[ 8] + S[ 9] * I; + R->s1.c2 = S[10] + S[11] * I; + R->s2.c0 = S[12] + S[13] * I; + R->s2.c1 = S[14] + S[15] * I; + R->s2.c2 = S[16] + S[17] * I; + R->s3.c0 = S[18] + S[19] * I; + R->s3.c1 = S[20] + S[21] * I; + R->s3.c2 = S[22] + S[23] * I; +} + +void double2single_cm(float * const S, spinor * const R) { + S[ 0] = creal(R->s0.c0); + S[ 1] = cimag(R->s0.c0); + S[ 2] = creal(R->s0.c1); + S[ 3] = cimag(R->s0.c1); + S[ 4] = creal(R->s0.c2); + S[ 5] = cimag(R->s0.c2); + S[ 6] = creal(R->s1.c0); + S[ 7] = cimag(R->s1.c0); + S[ 8] = creal(R->s1.c1); + S[ 9] = cimag(R->s1.c1); + S[10] = creal(R->s1.c2); + S[11] = cimag(R->s1.c2); + S[12] = creal(R->s2.c0); + S[13] = cimag(R->s2.c0); + S[14] = creal(R->s2.c1); + S[15] = cimag(R->s2.c1); + S[16] = creal(R->s2.c2); + S[17] = cimag(R->s2.c2); + S[18] = creal(R->s3.c0); + S[19] = cimag(R->s3.c0); + S[20] = creal(R->s3.c1); + S[21] = cimag(R->s3.c1); + S[22] = creal(R->s3.c2); + S[23] = cimag(R->s3.c2); +} + +void zero_spinor(spinor * const R) { + 
R->s0.c0 = 0.; + R->s0.c1 = 0.; + R->s0.c2 = 0.; + R->s1.c0 = 0.; + R->s1.c1 = 0.; + R->s1.c2 = 0.; + R->s2.c0 = 0.; + R->s2.c1 = 0.; + R->s2.c2 = 0.; + R->s3.c0 = 0.; + R->s3.c1 = 0.; + R->s3.c2 = 0.; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils.h b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..dbaf0ac7b18013230370d16079d2b97ab1788389 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils.h @@ -0,0 +1,300 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _UTILS_H +#define _UTILS_H + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include "su3.h" +#include +#include +#include + + +#ifndef isnan +# define isnan(x) \ + (sizeof (x) == sizeof (long double) ? isnan_ld (x) \ + : sizeof (x) == sizeof (double) ? isnan_d (x) \ + : isnan_f (x)) + +#endif + +/* These are factory functions, since the constructors for c-lime and lemon are different + and they need different ways of opening files. Moving this to utility functions unclutters + the main code, since we don't need additional #ifdefs anymore. 
/* Reverse, in place, the 8 bytes of each of the nmemb double-sized elements
 * starting at ptr. */
inline static void byte_swap(void * ptr, int nmemb){
  double * elem = (double *) ptr;
  int j, lo, hi;

  for(j = 0; j < nmemb; j++, elem++){
    char * b = (char *) elem;

    for(lo = 0, hi = 7; lo < hi; lo++, hi--){
      const char keep = b[lo];
      b[lo] = b[hi];
      b[hi] = keep;
    }
  }
}

/* Reverse, in place, the 4 bytes of each of the nmemb int-sized elements
 * starting at ptr. */
inline static void byte_swap32(void * ptr, int nmemb){
  int * elem = (int *) ptr;
  int j, lo, hi;

  for(j = 0; j < nmemb; j++, elem++){
    char * b = (char *) elem;

    for(lo = 0, hi = 3; lo < hi; lo++, hi--){
      const char keep = b[lo];
      b[lo] = b[hi];
      b[hi] = keep;
    }
  }
}

/* Copy nmemb doubles from in_ptr to out_ptr with the byte order of every
 * element reversed. */
inline static void byte_swap_assign(void * out_ptr, void * in_ptr, int nmemb){
  double * src = (double *) in_ptr;
  double * dst = (double *) out_ptr;
  int j, k;

  for(j = 0; j < nmemb; j++, src++, dst++){
    const char * from = (const char *) src;
    char * to = (char *) dst;

    for(k = 0; k < 8; k++){
      to[7 - k] = from[k];
    }
  }
  return;
}

/* Copy nmemb floats from in_ptr to out_ptr with the byte order of every
 * element reversed. */
inline static void byte_swap_assign32(void * out_ptr, void * in_ptr, int nmemb){
  float * src = (float *) in_ptr;
  float * dst = (float *) out_ptr;
  int j, k;

  for(j = 0; j < nmemb; j++, src++, dst++){
    const char * from = (const char *) src;
    char * to = (char *) dst;

    for(k = 0; k < 4; k++){
      to[3 - k] = from[k];
    }
  }
  return;
}


/* Copy nmemb doubles between big-endian and host byte order: a byte swap on
 * little-endian hosts, a plain copy otherwise. */
#if BYTE_ORDER == LITTLE_ENDIAN

inline static void be_to_cpu_assign(void * out_ptr, void * in_ptr, int nmemb){
  byte_swap_assign(out_ptr, in_ptr, nmemb);
  return;
}

#else

inline static void be_to_cpu_assign(void * out_ptr, void * in_ptr, int nmemb){
  memcpy(out_ptr, in_ptr, 8*nmemb);
  return;
}

#endif

/* Widen nmemb floats at in_ptr into doubles at out_ptr. */
inline static void single2double(void * out_ptr, void * in_ptr, int nmemb) {
  const float * src = (const float *) in_ptr;
  double * dst = (double *) out_ptr;
  int i;

  for(i = 0; i < nmemb; i++) {
    dst[i] = (double) src[i];
  }

}

/* Narrow nmemb doubles at in_ptr into floats at out_ptr. */
inline static void double2single(void * out_ptr, void * in_ptr, int nmemb) {
  const double * src = (const double *) in_ptr;
  float * dst = (float *) out_ptr;
  int i;

  for(i = 0; i < nmemb; i++) {
    dst[i] = (float) src[i];
  }

}

/* Read nmemb big-endian floats from in_ptr and store them as host-order
 * doubles at out_ptr. */
#if BYTE_ORDER == LITTLE_ENDIAN

inline static void be_to_cpu_assign_single2double(void * out_ptr, void * in_ptr, int nmemb){
  float * src = (float *) in_ptr;
  double * dst = (double *) out_ptr;
  float host;
  int j;

  for(j = 0; j < nmemb; j++){
    /* Bring one element into host byte order, then widen it. */
    byte_swap_assign32(&host, src + j, 1);
    dst[j] = (double) host;
  }
  return;
}

#else

inline static void be_to_cpu_assign_single2double(void * out_ptr, void * in_ptr, int nmemb){
  single2double(out_ptr, in_ptr, nmemb);
  return;
}

#endif

/* Narrow nmemb host-order doubles from in_ptr to floats and store them in
 * big-endian byte order at out_ptr. */
#if BYTE_ORDER == LITTLE_ENDIAN

inline static void be_to_cpu_assign_double2single(void * out_ptr, void * in_ptr, int nmemb){
  double * src = (double *) in_ptr;
  float * dst = (float *) out_ptr;
  float host;
  int j;

  for(j = 0; j < nmemb; j++){
    /* Narrow first, then emit the bytes in reversed order. */
    host = (float) src[j];
    byte_swap_assign32(dst + j, &host, 1);
  }
  return;
}

#else

inline static void be_to_cpu_assign_double2single(void * out_ptr, void * in_ptr, int nmemb){
  double2single(out_ptr, in_ptr, nmemb);
  return;
}
#endif
+***********************************************************************/ + + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef BENCHMARK +#include <../c-lime/include/lime.h> +#else +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include +#include + +#include +#include +#include + +#include diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_close_reader_record.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_close_reader_record.c new file mode 100644 index 0000000000000000000000000000000000000000..ca07d780f7fddfb980d2bedc68e5944fafba3489 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_close_reader_record.c @@ -0,0 +1,29 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . 
+***********************************************************************/ + +#include "utils.ih" + +void close_reader_record(READER *reader) +{ + if (reader != NULL) + ReaderCloseRecord(reader); + #ifdef MPI + MPI_Barrier(g_cart_grid); + #endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_close_writer_record.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_close_writer_record.c new file mode 100644 index 0000000000000000000000000000000000000000..23faac7d2158e6edcfcf9663080f17247ce7678b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_close_writer_record.c @@ -0,0 +1,26 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . 
+***********************************************************************/
+
+#include "utils.ih"
+
+/* Close the record currently open on the writer; a NULL writer is ignored. */
+void close_writer_record(WRITER *writer)
+{
+  if (writer != NULL)
+    WriterCloseRecord(writer);
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_construct_reader.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_construct_reader.c
new file mode 100644
index 0000000000000000000000000000000000000000..79567e13367512061fb758ae4ef38232854bcef9
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_construct_reader.c
@@ -0,0 +1,40 @@
+#include "utils.ih"
+
+/* Open `filename` read-only and create a reader on it, stored in *reader.
+ *
+ * With HAVE_LIBLEMON the file is opened through MPI_File_open on g_cart_grid
+ * and the handle is heap-allocated here (destruct_reader frees it); otherwise
+ * plain fopen() is used.  Any failure ends the program via kill_with_error.
+ */
+void construct_reader(READER ** reader, char * filename)
+{
+  LIME_FILE *fh = NULL;
+  int status = 0;
+
+  if(g_debug_level > 0 && g_cart_id == 0) {
+#ifdef HAVE_LIBLEMON
+    printf("# Constructing LEMON reader for file %s ...\n", filename);
+#else
+    printf("# Constructing LIME reader for file %s ...\n", filename);
+#endif
+  }
+
+
+#ifdef HAVE_LIBLEMON
+  fh = (MPI_File*)malloc(sizeof(MPI_File));
+  status = MPI_File_open(g_cart_grid, filename, MPI_MODE_RDONLY, MPI_INFO_NULL, fh);
+  status = (status == MPI_SUCCESS) ? 0 : 1;
+#else /* HAVE_LIBLEMON */
+  fh = fopen(filename, "r");
+  status = (fh == NULL) ? 1 : 0;
+  fflush(stderr);
+#endif /* HAVE_LIBLEMON */
+
+  if (status) {
+    kill_with_error(fh, g_cart_id, "\nUnable to open file for reading.\nPlease verify file existence and access rights.\nUnable to continue.\n");
+  }
+
+#ifdef HAVE_LIBLEMON
+  *reader = lemonCreateReader(fh, g_cart_grid);
+#else /* HAVE_LIBLEMON */
+  *reader = limeCreateReader(fh);
+#endif /* HAVE_LIBLEMON */
+
+  if (*reader == (READER *)NULL) {
+    kill_with_error(fh, g_cart_id, "\nCould not create reader, unable to continue.\n");
+  }
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_construct_writer.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_construct_writer.c
new file mode 100644
index 0000000000000000000000000000000000000000..3e8182060bd66b3e57cc63895e23297ebe1eb66c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_construct_writer.c
@@ -0,0 +1,44 @@
+#include "utils.ih"
+
+/* Open `filename` for writing and create a writer on it, stored in *writer.
+ *
+ * append != 0 opens in append mode; otherwise the file is created/truncated.
+ * With HAVE_LIBLEMON every rank opens the file through MPI-IO; without it
+ * only the rank with g_cart_id == 0 opens the file and creates a writer, and
+ * *writer is left untouched on all other ranks.
+ * Any failure ends the program via kill_with_error.
+ */
+void construct_writer(WRITER ** writer, char * filename, const int append)
+{
+  LIME_FILE *fh = NULL;
+  int status = 0;
+  if(g_debug_level > 0 && g_cart_id == 0) {
+#ifdef HAVE_LIBLEMON
+    printf("# Constructing LEMON writer for file %s for append = %d\n", filename, append);
+#else
+    printf("# Constructing LIME writer for file %s for append = %d\n", filename, append);
+#endif
+  }
+
+#ifdef HAVE_LIBLEMON
+  fh = (MPI_File*)malloc(sizeof(MPI_File));
+  if(append) {
+    status = MPI_File_open(g_cart_grid, filename, MPI_MODE_WRONLY | MPI_MODE_CREATE | MPI_MODE_APPEND, MPI_INFO_NULL, fh);
+  }
+  else {
+    status = MPI_File_open(g_cart_grid, filename, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, fh);
+    if(status == MPI_SUCCESS) status = MPI_File_set_size(*fh, 0);
+  }
+  status = (status == MPI_SUCCESS) ? 0 : 1;
+  if (!status) {
+    /* Only build a writer on a file that really opened, and test the
+     * created object (*writer), not the address of the out-parameter. */
+    *writer = lemonCreateWriter(fh, g_cart_grid);
+    status = (*writer == NULL);
+  }
+#else /* HAVE_LIBLEMON */
+  if (g_cart_id == 0)
+  {
+    if(append) {
+      fh = fopen(filename, "a");
+    }
+    else {
+      fh = fopen(filename, "w");
+    }
+    status = (fh == NULL);
+    if (!status) {
+      /* Same reasoning as in the LEMON branch above. */
+      *writer = limeCreateWriter(fh);
+      status = (*writer == NULL);
+    }
+  }
+#endif /* HAVE_LIBLEMON */
+
+  if (status)
+    kill_with_error(fh, g_cart_id, "Failed to create writer. Aborting...\n");
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_destruct_reader.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_destruct_reader.c
new file mode 100644
index 0000000000000000000000000000000000000000..ceb6826fa65856208902cbc9b715a7b2601ecee8
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_destruct_reader.c
@@ -0,0 +1,15 @@
+#include "utils.ih"
+
+/* Destroy the reader and close the file it was reading from.
+ * The reader must not be NULL: its fp member is read unconditionally.
+ */
+void destruct_reader(READER * reader)
+{
+  LIME_FILE *fh = NULL;
+
+  /* Save the handle first: it is needed after the reader is destroyed. */
+  fh = reader->fp;
+  DestroyReader(reader);
+#ifdef HAVE_LIBLEMON
+  MPI_File_close(fh);
+  free(fh); /* NB This assumes construct_reader was used to malloc memory! */
+#else /* HAVE_LIBLEMON */
+  fclose(fh);
+#endif /* HAVE_LIBLEMON */
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_destruct_writer.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_destruct_writer.c
new file mode 100644
index 0000000000000000000000000000000000000000..743892f52ae0e7fc1e1c9c06d033f5132807e778
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_destruct_writer.c
@@ -0,0 +1,20 @@
+#include "utils.ih"
+
+/* Destroy the writer and close the file it was writing to.
+ * Without HAVE_LIBLEMON this is a no-op on every rank except g_cart_id == 0,
+ * mirroring construct_writer, which only creates a writer on that rank.
+ */
+void destruct_writer(WRITER * writer)
+{
+  LIME_FILE *fh = NULL;
+
+#ifdef HAVE_LIBLEMON
+  fh = writer->fp;
+  lemonDestroyWriter(writer);
+  MPI_File_close(fh);
+  free(fh); /* NB This assumes construct_writer was used to malloc memory! */
+#else /* HAVE_LIBLEMON */
+  if (g_cart_id == 0)
+  {
+    fh = writer->fp;
+    limeDestroyWriter(writer);
+    fclose(fh);
+  }
+#endif /* HAVE_LIBLEMON */
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_engineering.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_engineering.c
new file mode 100644
index 0000000000000000000000000000000000000000..762b2ee8b94cf8539916a7b137cf5a39d37c88b3
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_engineering.c
@@ -0,0 +1,31 @@
+#include "utils.ih"
+
+/* SI prefix for 10^(3*(i-6)): index 0 is atto (1e-18), index 6 is the empty
+ * prefix for 1e0, index 14 is yotta (1e24).  'n' (1e-9) must be present or
+ * every prefix below 1 is shifted by one step. */
+static char prefix[] = {'a', 'f', 'p', 'n', 'u', 'm', ' ', 'k', 'M', 'G', 'T', 'P', 'E', 'Z', 'Y'};
+
+/* Format `value` in engineering notation followed by `units`.
+ *
+ * result - destination buffer; written with sprintf, so the caller has to
+ *          provide enough room (no length is checked here).
+ * value  - number to format.  Values in [1e-15, 1e18) are scaled to a
+ *          multiple-of-three exponent and get an SI prefix; everything else,
+ *          including zero, negative and non-finite values, is printed in
+ *          "%4.2e" form.
+ * units  - unit string appended after the prefix.
+ */
+void engineering(char *result, double value, char const *units)
+{
+  double scaled_log = log10(value) / 3;
+  int logscale;
+  int digits = 2;
+
+  /* log10 yields NaN for negative input and an infinity for 0 or inf.  Both
+   * fail this range test, so the double -> int conversion below is only
+   * performed on a value that fits. */
+  if (scaled_log >= -5.0 && scaled_log < 6.0)
+  {
+    logscale = (int)floor(scaled_log);
+    value /= pow(1E3, (double)logscale);
+    if (value > 100)
+      digits = 0;
+    else
+      if (value > 10)
+        digits = 1;
+
+    if (logscale)
+      sprintf(result, "%.*f %c%s", digits, value, prefix[logscale + 6], units);
+    else
+      sprintf(result, "%.*f %s", digits, value, units);
+  }
+  else
+  {
+    sprintf(result, "%4.2e %s", value, units);
+  }
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_kill_with_error.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_kill_with_error.c
new file mode 100644
index 0000000000000000000000000000000000000000..f6c39418157db128fca9d882e3e840df58915099
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_kill_with_error.c
@@ -0,0 +1,23 @@
+#include "utils.ih"
+
+/* Report a fatal error and terminate the program; never returns.
+ *
+ * fh    - file to close before exiting, or NULL.
+ * rank  - rank number printed in the message.
+ * error - message printed to stderr, or NULL for no message.
+ *
+ * NOTE(review): exit(500) does not fit the 8-bit status most shells report;
+ * confirm whether any calling script depends on the resulting value.
+ */
+void kill_with_error(LIME_FILE *fh, int const rank, char const *error)
+{
+  if (error != NULL)
+  {
+    fprintf(stderr, "KILL_WITH_ERROR on node %d: %s", rank, error);
+    fflush(stderr);
+  }
+
+  /* The statement selected by the preprocessor is the body of this if. */
+  if (fh != NULL)
+#ifdef HAVE_LIBLEMON
+    MPI_File_close(fh);
+#else
+    fclose(fh);
+#endif /* HAVE_LIBLEMON */
+
+#ifdef MPI
+  MPI_Abort(MPI_COMM_WORLD, 1);
+  MPI_Finalize();
+#endif
+  exit(500);
+}
diff --git
a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_parse_checksum_xml.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_parse_checksum_xml.c new file mode 100644 index 0000000000000000000000000000000000000000..c2bcde79238e5caf4ebd53b039584f1765811f6e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_parse_checksum_xml.c @@ -0,0 +1,46 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+***********************************************************************/ + +#include "utils.ih" + +int parse_checksum_xml(char *message, DML_Checksum *checksum) +{ + int read_suma = 0, read_sumb = 0; + char *pos = strtok(message, "<> \n\t"); + + if (checksum == (DML_Checksum*)NULL) { + return 0; + } + + while (pos) + { + if (!strncmp(pos, "suma", 4)) { + pos = strtok(0, "<> \n\t"); + sscanf(pos, "%x", &checksum->suma); + read_suma = 1; + } + if (!strncmp(pos, "sumb", 4)) { + pos = strtok(0, "<> \n\t"); + sscanf(pos, "%x", &checksum->sumb); + read_sumb = 1; + } + pos = strtok(0, "<> \n\t"); + } + return (read_suma && read_sumb); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_parse_ildgformat_xml.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_parse_ildgformat_xml.c new file mode 100644 index 0000000000000000000000000000000000000000..54df875093b66498fb6992d3aec75c574372d8e1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_parse_ildgformat_xml.c @@ -0,0 +1,61 @@ +/*********************************************************************** +* Copyright (C) 2011 Siebren Reker +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . 
+***********************************************************************/ + +#include "utils.ih" + +int parse_ildgformat_xml(char *message, paramsIldgFormat *ildgformat) +{ + int read_prec = 0, read_lx = 0, read_ly = 0, read_lz = 0, read_lt = 0; + char *pos = strtok(message, "<> \n\t"); + + if (ildgformat == (paramsIldgFormat*)NULL) { + return 0; + } + + while (pos) + { + if (!strncmp(pos, "precision", 9)) { + pos = strtok(0, "<> \n\t"); + sscanf(pos, "%d", &ildgformat->prec); + read_prec = 1; + } + if (!strncmp(pos, "lx", 2)) { + pos = strtok(0, "<> \n\t"); + sscanf(pos, "%d", &ildgformat->lx); + read_lx = 1; + } + if (!strncmp(pos, "ly", 2)) { + pos = strtok(0, "<> \n\t"); + sscanf(pos, "%d", &ildgformat->ly); + read_ly = 1; + } + if (!strncmp(pos, "lz", 2)) { + pos = strtok(0, "<> \n\t"); + sscanf(pos, "%d", &ildgformat->lz); + read_lz = 1; + } + if (!strncmp(pos, "lt", 2)) { + pos = strtok(0, "<> \n\t"); + sscanf(pos, "%d", &ildgformat->lt); + read_lt = 1; + } + pos = strtok(0, "<> \n\t"); + } + return (read_prec && read_lx && read_ly && read_lz && read_lt); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_parse_propagator_type.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_parse_propagator_type.c new file mode 100644 index 0000000000000000000000000000000000000000..9b91c65508819f85e452e9ff4a8a1d222ca2e82b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_parse_propagator_type.c @@ -0,0 +1,91 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. 
+* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#include "utils.ih" + +int parse_propagator_type(READER * reader) { + char *prop_type_string = NULL; + char *header_type = NULL; + int prop_type = -1; + int status = 0; + int proptypefound = 0, sourcetypefound = 0; + + while ((status = ReaderNextRecord(reader)) != LIME_EOF) { + if (status != LIME_SUCCESS) { + fprintf(stderr, "ReaderNextRecord returned status %d.\n", status); + break; + } + header_type = ReaderType(reader); + if(g_cart_id == 0 && g_debug_level > 1) { + fprintf(stdout, "found header %s, will now read the message\n", header_type); + fflush(stdout); + } + if (strcmp("propagator-type", header_type) == 0) { + read_message(reader, &prop_type_string); + if(strcmp("DiracFermion_Sink", prop_type_string) == 0) + prop_type = 0; + else if(strcmp("DiracFermion_Source_Sink_Pairs", prop_type_string) == 0) + prop_type = 1; + else if(strcmp("DiracFermion_ScalarSource_TwelveSink", prop_type_string) == 0) + prop_type = 2; + else if(strcmp("DiracFermion_ScalarSource_FourSink", prop_type_string) == 0) + prop_type = 3; + else if(strcmp("DiracFermion_Deflation_Field", prop_type_string) == 0) + prop_type = 4; + else { + fprintf(stderr,"Unrecognized propagator-type, found type: %s.\n", prop_type_string); + break; + } + proptypefound = 1; + if(g_cart_id == 0 && g_debug_level > 0) { + printf("# file is of type %s for proc %d\n", prop_type_string, g_cart_id); + } + free(prop_type_string); + close_reader_record(reader); + break; + } + if (strcmp("source-type", header_type) == 0) { + read_message(reader, &prop_type_string); + if(strcmp("DiracFermion_Source", 
prop_type_string) == 0) + prop_type = 10; + else if(strcmp("DiracFermion_ScalarSource", prop_type_string) == 0) + prop_type = 11; + else if(strcmp("DiracFermion_FourScalarSource", prop_type_string) == 0) + prop_type = 12; + else if(strcmp("DiracFermion_TwelveScalarSource", prop_type_string) == 0) + prop_type = 13; + else { + fprintf(stderr,"Unrecognized source-type, found type: %s\n", prop_type_string); + break; + } + sourcetypefound = 1; + if(g_cart_id == 0 && g_debug_level > 0) { + printf("# file is of type %s", prop_type_string); + } + free(prop_type_string); + close_reader_record(reader); + break; + } + if ((sourcetypefound || proptypefound) == 0) { + fprintf(stderr, "Unable to find either source-type or propagator-type record.\nWARNING: Continuing in blind faith.\n"); + } + close_reader_record(reader); + } + return(prop_type); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_read_message.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_read_message.c new file mode 100644 index 0000000000000000000000000000000000000000..2b0b4c6705600140cd157d58db49c25948b2f410 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_read_message.c @@ -0,0 +1,57 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . 
+***********************************************************************/ + +#include "utils.ih" + +int read_message(READER * reader, char **buffer) { + + int status; + n_uint64_t bytes, bytesRead; + + if (buffer == (char**)NULL) + return(-1); + + if ((*buffer) != (char*)NULL) + free(*buffer); + + bytes = ReaderBytes(reader); + bytesRead = bytes; + + /* this termination force gives sometimes random results and hanging code ... */ + /* with calloc instead of malloc it seems to be fine */ + *buffer = (char*)calloc(bytes + 1, sizeof(char)); + /* *buffer = (char*)calloc(bytes, sizeof(char)); */ + + if (*buffer == (char*)NULL) { + fprintf(stderr, "Couldn't malloc data buffer in read_message.\n"); + return(-1); + } + + status = ReaderReadData(*buffer, &bytesRead, reader); +#if MPI + MPI_Barrier(g_cart_grid); +#endif + + if (status != LIME_SUCCESS || bytes != bytesRead) + kill_with_error(reader->fp, g_cart_id, "Error in reading message.\n"); + + (*buffer)[bytes] = '\0'; /* Force termination for safety */ + + return(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_checksum.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_checksum.c new file mode 100644 index 0000000000000000000000000000000000000000..061324d12787826189e5ac11e94be0824a757514 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_checksum.c @@ -0,0 +1,49 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. 
+* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#include "utils.ih" + +void write_checksum(WRITER * writer, DML_Checksum const *checksum, char const *name) +{ + char *message; + uint64_t bytes; + message = (char*)malloc(512); + if (message == (char*)NULL) { + kill_with_error(writer->fp, g_cart_id, "Memory allocation error in write_checksum. Aborting\n"); + } + sprintf(message, "\n" + "\n" + " 1.0\n" + " %08x\n" + " %08x\n" + "", checksum->suma, checksum->sumb); + bytes = strlen(message); + /* The message begin bit is 0, because this is written as part of a data message + * the end bit is 1, since this should be the last record of a message */ + if (name == NULL) + write_header(writer, 0, 1, "scidac-checksum", bytes); + else + write_header(writer, 0, 1, name, bytes); + + write_message(writer, message, bytes); + + close_writer_record(writer); + free(message); + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_first_message.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_first_message.c new file mode 100644 index 0000000000000000000000000000000000000000..11cea0520f96243dd05df30a4c7d4c378425e3d4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_first_message.c @@ -0,0 +1,163 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#include + +#include "utils.ih" +#include + +int write_first_messages(FILE * parameterfile, char const * const executable, char const * const git_hash) { + char message[1024]; + snprintf(message, 1024, "This is the %s code for twisted mass Wilson QCD\n\nVersion %s, commit %s\n",executable,PACKAGE_VERSION,git_hash); + printf("%s",message); + fprintf(parameterfile,"%s",message); + +#ifdef SSE + printf("# The code is compiled with SSE instructions\n"); + fprintf(parameterfile, + "# The code is compiled with SSE instructions\n"); +#endif +#ifdef SSE2 + printf("# The code is compiled with SSE2 instructions\n"); + fprintf(parameterfile, + "# The code is compiled with SSE2 instructions\n"); +#endif +#ifdef SSE3 + printf("# The code is compiled with SSE3 instructions\n"); + fprintf(parameterfile, + "# The code is compiled with SSE3 instructions\n"); +#endif +#ifdef P4 + printf("# The code is compiled for Pentium4\n"); + fprintf(parameterfile, + "# The code is compiled for Pentium4\n"); +#endif +#if (defined BGL && !defined BGP) + printf("# The code is compiled for Blue Gene/L\n"); + fprintf(parameterfile, + "# The code is compiled for Blue Gene/L\n"); +#endif +#ifdef BGP + printf("# The code is compiled for Blue Gene/P\n"); + fprintf(parameterfile, + "# The code is compiled for Blue Gene/P\n"); 
+#endif +#if (defined BGQ && defined XLC) + printf("# The code is compiled with QPX intrinsics for Blue Gene/Q\n"); + fprintf(parameterfile, + "# The code is compiled with QPX intrinsics for Blue Gene/Q\n"); +#endif +#ifdef SPI + printf("# Compiled with BG/Q SPI communication\n"); + fprintf(parameterfile, + "# Compiled with IBM Blue Gene/Q SPI communication\n"); +#endif +#ifdef OPTERON + printf("# The code is compiled for AMD Opteron\n"); + fprintf(parameterfile, + "# The code is compiled for AMD Opteron\n"); +#endif +#ifdef _GAUGE_COPY + printf("# The code is compiled with -D_GAUGE_COPY\n"); + fprintf(parameterfile, + "# The code is compiled with -D_GAUGE_COPY\n"); +#endif +#ifdef _USE_HALFSPINOR + printf("# The code is compiled with -D_USE_HALFSPINOR\n"); + fprintf(parameterfile, + "# The code is compiled with -D_USE_HALFSPINOR\n"); +#endif +#ifdef _USE_SHMEM + printf("# the code is compiled with -D_USE_SHMEM\n"); + fprintf(parameterfile, + "# the code is compiled with -D_USE_SHMEM\n"); +# ifdef _PERSISTENT + printf("# the code is compiled for persistent MPI calls (halfspinor only)\n"); + fprintf(parameterfile, + "# the code is compiled for persistent MPI calls (halfspinor only)\n"); +# endif +#endif +#ifdef MPI +# ifdef _NON_BLOCKING + printf("# the code is compiled for non-blocking MPI calls (spinor and gauge)\n"); + fprintf(parameterfile, + "# the code is compiled for non-blocking MPI calls (spinor and gauge)\n"); +# endif +# ifdef HAVE_LIBLEMON + printf("# the code is compiled with MPI IO / Lemon\n"); + fprintf(parameterfile, + "# the code is compiled with MPI IO / Lemon\n"); +# endif +#endif +#ifdef OMP + printf("# the code is compiled with openMP support\n"); + fprintf(parameterfile, + "# the code is compiled with openMP support\n"); +#endif + if( bc_flag == 0 ) { + printf("# Periodic boundary conditions are used\n"); + fprintf(parameterfile, "# Periodic boundary conditions are used\n"); + } + if( bc_flag == 1 ) { + printf("# Schroedinger Functional 
boundary conditions are used\n"); + fprintf(parameterfile, "# Schroedinger Functional boundary conditions are used\n"); + } + printf("# The lattice size is %d x %d x %d x %d\n", + (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(LZ*g_nproc_z)); + printf("# The local lattice size is %d x %d x %d x %d\n", + (int)(T), (int)(LX), (int)(LY),(int) LZ); + if(even_odd_flag) { + printf("# Even/odd preconditioning is used\n"); + fprintf(parameterfile, "# Even/odd preconditioning is used\n"); + } + else { + printf("# Even/odd preconditioning is not used\n"); + fprintf(parameterfile, "# Even/odd preconditioning is not used\n"); + } + printf("# beta = %f , kappa= %f\n", g_beta, g_kappa); + printf("# boundary conditions for fermion fields (t,x,y,z) * pi: %f %f %f %f \n",X0,X1,X2,X3); + if( strcmp(executable,"hmc") == 0 ) { + printf("# mu = %f\n", g_mu/2./g_kappa); + printf("# g_rgi_C0 = %f, g_rgi_C1 = %f\n", g_rgi_C0, g_rgi_C1); + printf("# Using %s precision for the inversions!\n", + g_relative_precision_flag ? "relative" : "absolute"); + } + fprintf(parameterfile, "# The lattice size is %d x %d x %d x %d\n", (int)(g_nproc_t*T), (int)(g_nproc_x*LX), + (int)(g_nproc_y*LY), (int)(g_nproc_z*LZ)); + fprintf(parameterfile, "# The local lattice size is %d x %d x %d x %d\n", (int)(T), (int)(LX), (int)(LY), (int)(LZ)); + fprintf(parameterfile, "# g_beta = %f , g_kappa= %f, g_kappa*csw/8= %f \n",g_beta,g_kappa,g_ka_csw_8); + fprintf(parameterfile, "# boundary conditions for fermion fields (t,x,y,z) * pi: %f %f %f %f \n",X0,X1,X2,X3); + if( strcmp(executable,"hmc") == 0 ) { + fprintf(parameterfile, "# Nmeas=%d, Nsave=%d \n", + Nmeas,Nsave); + fprintf(parameterfile, "# mu = %f\n", g_mu/2./g_kappa); + fprintf(parameterfile, "# g_rgi_C0 = %f, g_rgi_C1 = %f\n", g_rgi_C0, g_rgi_C1); + fprintf(parameterfile, "# Using %s precision for the inversions!\n", + g_relative_precision_flag ? 
"relative" : "absolute"); + } + if( strcmp(executable,"invert") == 0 ) { + printf("# beta = %f, mu = %f, kappa = %f\n", g_beta, g_mu/2./g_kappa, g_kappa); + fprintf(parameterfile, + "# beta = %f, mu = %f, kappa = %f\n", g_beta, g_mu/2./g_kappa, g_kappa); + } + fflush(stdout); fflush(parameterfile); + return(0); +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_header.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_header.c new file mode 100644 index 0000000000000000000000000000000000000000..ee0870f749e4571a37178ac3f6b61f0be183c926 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_header.c @@ -0,0 +1,42 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#include "utils.ih" + +void write_header(WRITER * writer, int MB, int ME, char const *type, uint64_t bytes) +{ + int status; + RECORD_HEADER *header; + +#ifndef HAVE_LIBLEMON + if(g_cart_id == 0) { +#endif /* ! HAVE_LIBLEMON */ + /* Nasty (but probably harmless) hack to get rid of const qualifier - the original c-lime was sloppy here. 
*/ + header = CreateHeader(MB, ME, (char*)type, bytes); + status = WriteRecordHeader(header, writer); + DestroyHeader(header); + + if (status != LIME_SUCCESS) { + kill_with_error(writer->fp, g_cart_id, "Header writing error. Aborting\n"); + } +#ifndef HAVE_LIBLEMON + } +#endif /* ! HAVE_LIBLEMON */ + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_ildg_format.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_ildg_format.c new file mode 100644 index 0000000000000000000000000000000000000000..6b44f006ed9c403de366c4bbe18d8b0353ac9eb7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_ildg_format.c @@ -0,0 +1,50 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#include "gauge.ih" + +void write_ildg_format(WRITER *writer, paramsIldgFormat const *format) +{ + uint64_t bytes; + char *buf; + + buf = (char*)malloc(512); + if (buf == (char*)NULL) + kill_with_error(writer->fp, g_cart_id, "Memory allocation error in write_ildg_format. 
Aborting\n"); + + sprintf(buf, "\n" + "\n" + " 1.0\n" + " su3gauge\n" + " %d\n" + " %d\n" + " %d\n" + " %d\n" + " %d\n" + "", + format->prec, format->lx, format->ly, format->lz, format->lt); + + bytes = strlen(buf); + write_header(writer, 1, 0, "ildg-format", bytes); /* ME is 0 because a ildg-binary-data record MUST follow */ + write_message(writer, buf, bytes); + close_writer_record(writer); + free(buf); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_inverter_info.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_inverter_info.c new file mode 100644 index 0000000000000000000000000000000000000000..82589789d92329a12fc34a7708ee29c975ad768e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_inverter_info.c @@ -0,0 +1,76 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . 
+***********************************************************************/ + +#include "spinor.ih" + +void write_inverter_info(WRITER * writer, paramsInverterInfo const *info) +{ + char *message; + n_uint64_t bytes; + message = (char*)malloc(1024); + + if (info->mms) { + sprintf(message, "solver = %s\n" + "result is for Q^dagger Q!\n" + "multiple mass solver\n" + "epssq = %e\n" + "noiter = %d\n" + "kappa = %f, inverted mu = %f, lowest mu = %f\n" + "inverter version = %s\n" + "date = %s", + info->inverter, + info->epssq, info->iter, info->kappa, + info->cgmms_mass, + info->mu, info->package_version, + info->date); + } + else { + if (!info->heavy) { + sprintf(message, "solver = %s\n" + "epssq = %e\n" + "noiter = %d\n" + "kappa = %f, mu = %f\n" + "inverter version = %s\n" + "date = %s", + info->inverter, + info->epssq, info->iter, info->kappa, info->mu, + info->package_version, info->date); + } + else { + sprintf(message, "solver = %s\n" + "epssq = %e\n" + "noiter = %d\n" + "kappa = %f, mubar = %f, epsbar=%f\n" + "inverter version = %s\n" + "date = %s", + info->inverter, + info->epssq, info->iter, info->kappa, info->mubar, + info->epsbar, + info->package_version, info->date); + } + } + bytes = strlen(message); + write_header(writer, 1, 0, "inverter-info", bytes); + write_message(writer, message, bytes); + close_writer_record(writer); + + free(message); + return; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_message.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_message.c new file mode 100644 index 0000000000000000000000000000000000000000..705a6b617930686b71146e751e8a65eb9b8d3fa5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_message.c @@ -0,0 +1,40 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. 
+* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#include "utils.ih" + +int write_message(WRITER * writer, char const *buffer, uint64_t bytes) +{ + int status; + n_uint64_t bytesWritten = bytes; + +#ifndef HAVE_LIBLEMON + if(g_cart_id == 0){ +#endif /* ! HAVE_LIBLEMON */ + if (buffer == (char*)NULL) + return(0); + + status = WriteRecordData((void*)buffer, &bytes, writer); + if (status != LIME_SUCCESS || bytes != bytesWritten) + kill_with_error(writer->fp, g_cart_id, "I/O error on writing message. Aborting...\n"); +#ifndef HAVE_LIBLEMON + } +#endif /* ! HAVE_LIBLEMON */ + return(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_xlf.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_xlf.c new file mode 100644 index 0000000000000000000000000000000000000000..2d5546bd1fd14c92a6d4e3ee0b90d1543742446d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_xlf.c @@ -0,0 +1,64 @@ +/*********************************************************************** +* Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. 
+* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#include "utils.ih" + +void write_xlf_info(WRITER * writer, paramsXlfInfo const *info) +{ + char *message; + uint64_t bytes; + + message = (char*)malloc(512); + if (message == (char*)NULL) + kill_with_error(writer->fp, g_cart_id, "Memory allocation error in write_xlf_info. Aborting\n"); + + if (info->kappa != 0.0) { + sprintf(message, "plaquette = %14.12f\n" + " trajectory nr = %d\n" + " beta = %f, kappa = %f, mu = %f, c2_rec = %f\n" + " time = %ld\n" + " hmcversion = %s\n" + " mubar = %f\n" + " epsilonbar = %f\n" + " date = %s", + info->plaq, info->counter, info->beta, info->kappa, + info->mu, info->c2_rec, info->time, info->package_version, + info->mubar, info->epsilonbar, info->date); + } + else { + sprintf(message, "plaquette = %e\n" + " trajectory nr = %d\n" + " beta = %f\n" + " kappa = %f\n" + " 2*kappa*mu = %f\n" + " c2_rec = %f\n" + " date = %s", + info->plaq, info->counter, info->beta, info->kappa, + info->mu, info->c2_rec, info->date); + } + bytes = strlen(message); + + write_header(writer, 1, 1, "xlf-info", bytes); + write_message(writer, message, bytes); + + close_writer_record(writer); + + free(message); + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_xlf_xml.c b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_xlf_xml.c new file mode 100644 index 0000000000000000000000000000000000000000..d536d646b6d0d6253297cdbb5bbf4f4da15c93ea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/io/utils_write_xlf_xml.c @@ -0,0 +1,72 @@ 
+/*********************************************************************** +* Copyright (C) 2011 Siebren Reker +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#include "utils.ih" + +void write_xlf_info_xml(WRITER * writer, paramsXlfInfo const *info) +{ + char *message; + uint64_t bytes; + + message = (char*)malloc(512); + if (message == (char*)NULL) + kill_with_error(writer->fp, g_cart_id, "Memory allocation error in write_xlf_info_xml. 
Aborting\n"); + + if (info->kappa != 0.0) { + sprintf(message, "\n" + "\n" + " %14.12f\n" + " %d\n" + " %f\n" + " %f\n" + " %f\n" + " %f\n" + " \n" + " %s\n" + " %f\n" + " %f\n" + " %s\n" + "", info->plaq, info->counter, info->beta, info->kappa, + info->mu, info->c2_rec, info->time, info->package_version, + info->mubar, info->epsilonbar, info->date); + bytes = strlen(message); + } + else { + sprintf(message, "\n" + "\n" + " %e\n" + " %d\n" + " %f\n" + " %f\n" + " <2kappamu>%f\n" + " %f\n" + " %s\n" + "", info->plaq, info->counter, info->beta, info->kappa, + info->mu, info->c2_rec, info->date); + } + bytes = strlen(message); + + write_header(writer, 1, 1, "xlf-info", bytes); + write_message(writer, message, bytes); + + close_writer_record(writer); + + free(message); + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/jacobi.c b/qcd/part_cpu/applications/QCD/src/kernel_D/jacobi.c new file mode 100644 index 0000000000000000000000000000000000000000..b43b5eb2323fa27319659b4a650a193e143f78bb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/jacobi.c @@ -0,0 +1,74 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +/* + * Routine for the computation of the Jacobi operator (for use into LapH_ev) + * Authors Luigi Scorzato, Marco Cristoforetti + * + * + *******************************************************************************/ +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include "su3.h" +#include "xchange/xchange.h" + +#ifdef WITHLAPH + +void Jacobi(su3_vector * const l, su3_vector * const k,int t) +{ + int ix,mu,tcoord,coord; + su3_vector lt; + +#ifdef MPI + xchange_jacobi(k); +#endif + + tcoord=t*SPACEVOLUME; + for(ix=0;ix. + ***********************************************************************/ +/* + * Routine for the computation of the Jacobi operator (for use into LapH_ev) + * Authors Luigi Scorzato, Marco Cristoforetti + * + * + *******************************************************************************/ +#ifndef _JACOBI_H +#define _JACOBI_H + +#include "su3.h" + +void Jacobi(su3_vector * const l, su3_vector * const k,int t); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/lib/libinit.a b/qcd/part_cpu/applications/QCD/src/kernel_D/lib/libinit.a new file mode 100644 index 0000000000000000000000000000000000000000..0e4b3f12dd61e7805d8abbf3be299d56cb2405e6 Binary files /dev/null and b/qcd/part_cpu/applications/QCD/src/kernel_D/lib/libinit.a differ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..3afdadd13d2174e215e976a5bef2c5c32414d0c5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/Makefile @@ -0,0 +1,111 @@ + +srcdir = . +top_builddir = .. +abs_top_builddir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +top_srcdir = .. 
+abs_top_srcdir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +subdir = linalg +builddir = . + +CFLAGS = -std=c99 -fopenmp -pedantic -Wall +DEPFLAGS = -MM +LDFLAGS = -L${HOME}/lib -L${top_builddir}/lib +DEFS = -DHAVE_CONFIG_H +OPTARGS = -O +SOPTARGS = -O + +AR = ar +RANLIB = ranlib +CC = mpicc +CCDEP = gcc +CCLD = ${CC} +LINK = ${CCLD} ${CFLAGS} ${LDFLAGS} ${OPTARGS} -o $@ +LEX = flex +AUTOCONF = autoconf +DEFS = -DHAVE_CONFIG_H + +INCLUDES = -I$(HOME)/include/ -I. -I${abs_top_builddir}/ -I${abs_top_srcdir}/ -I/include/ -I/include/ +LDADD = +#COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} +COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS} + +LIBRARIES = liblinalg +liblinalg_TARGETS = assign_add_mul_r_add_mul \ + assign_mul_bra_add_mul_ket_add_r \ + scalar_prod_r scalar_prod_i \ + square_and_prod_r assign_mul_bra_add_mul_r mul_r mul_r_32 \ + diff_and_square_norm assign \ + scalar_prod mul_diff_r mul_diff_mul assign_add_mul assign_mul_add add \ + assign_diff_mul mul_add_mul mul assign_add_mul_add_mul \ + assign_mul_bra_add_mul_ket_add assign_mul_add_mul_add_mul_add_mul_r \ + mul_diff_mul_r assign_add_mul_add_mul_r \ + comp_decomp \ + convert_eo_to_lexic assign_mul_add_mul_r assign_mul_add_mul_r_32 \ + mul_add_mul_r assign_mul_add_mul_add_mul_r mattimesvec \ + scalar_prod_su3spinor \ + assign_mul_add_r_and_square \ + addto_32 scalar_prod_r_32 assign_mul_add_r_32 assign_add_mul_r_32 \ + square_norm_32 assign_to_32 diff_32 + +liblinalg_STARGETS = diff assign_add_mul_r assign_mul_add_r square_norm + +liblinalg_OBJECTS = $(addsuffix .o, ${liblinalg_TARGETS}) +liblinalg_SOBJECTS = $(addsuffix .o, ${liblinalg_STARGETS}) + +# default rule + +all: Makefile dep liblinalg.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) -g +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) +profile all-profile: all + + +#include dep rules + +-include $(addsuffix .d,${liblinalg_TARGETS}) + 
+include ${top_srcdir}/Makefile.global + +# rule to compile objects + +${liblinalg_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${OPTARGS} -c $< + +${liblinalg_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${SOPTARGS} -c $< + +# rule to make liblinalg + +liblinalg.a: ${liblinalg_OBJECTS} ${liblinalg_SOBJECTS} Makefile + @rm -f liblinalg.a + @${AR} cru liblinalg.a ${liblinalg_OBJECTS} ${liblinalg_SOBJECTS} + @$(RANLIB) liblinalg.a + @cp liblinalg.a ../lib/liblinalg.a + +# rule to generate .d files + +$(addsuffix .d, $(liblinalg_TARGETS) ${liblinalg_STARGETS}): %.d: ${srcdir}/%.c Makefile + @${CCDEP} ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${liblinalg_TARGETS} ${liblinalg_STARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} ${$(addsuffix _SOBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/liblinalg.a + +distclean: clean + rm -f Makefile + +.PHONY: all dep clean compile-clean distclean profile all-profile debug all-debug diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..39a4f8983fe4b22a0255db240d67f2082261d146 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/Makefile.in @@ -0,0 +1,111 @@ + +srcdir = @srcdir@ +top_builddir = @top_builddir@ +abs_top_builddir = @abs_top_builddir@ +top_srcdir = @top_srcdir@ +abs_top_srcdir = @abs_top_srcdir@ +subdir = linalg +builddir = @builddir@ + +CFLAGS = @CFLAGS@ +DEPFLAGS = @DEPFLAGS@ +LDFLAGS = @LDFLAGS@ +DEFS = @DEFS@ +OPTARGS = @OPTARGS@ +SOPTARGS = @SOPTARGS@ + +AR = @AR@ +RANLIB = @RANLIB@ +CC = @CC@ +CCDEP = @CCDEP@ +CCLD = ${CC} +LINK = ${CCLD} ${CFLAGS} ${LDFLAGS} ${OPTARGS} -o $@ +LEX = @LEX@ +AUTOCONF = 
@AUTOCONF@ +DEFS = @DEFS@ + +INCLUDES = @INCLUDES@ +LDADD = +#COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} +COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS} + +LIBRARIES = liblinalg +liblinalg_TARGETS = assign_add_mul_r_add_mul \ + assign_mul_bra_add_mul_ket_add_r \ + scalar_prod_r scalar_prod_i \ + square_and_prod_r assign_mul_bra_add_mul_r mul_r mul_r_32 \ + diff_and_square_norm assign \ + scalar_prod mul_diff_r mul_diff_mul assign_add_mul assign_mul_add add \ + assign_diff_mul mul_add_mul mul assign_add_mul_add_mul \ + assign_mul_bra_add_mul_ket_add assign_mul_add_mul_add_mul_add_mul_r \ + mul_diff_mul_r assign_add_mul_add_mul_r \ + comp_decomp \ + convert_eo_to_lexic assign_mul_add_mul_r assign_mul_add_mul_r_32 \ + mul_add_mul_r assign_mul_add_mul_add_mul_r mattimesvec \ + scalar_prod_su3spinor \ + assign_mul_add_r_and_square \ + addto_32 scalar_prod_r_32 assign_mul_add_r_32 assign_add_mul_r_32 \ + square_norm_32 assign_to_32 diff_32 + +liblinalg_STARGETS = diff assign_add_mul_r assign_mul_add_r square_norm + +liblinalg_OBJECTS = $(addsuffix .o, ${liblinalg_TARGETS}) +liblinalg_SOBJECTS = $(addsuffix .o, ${liblinalg_STARGETS}) + +# default rule + +all: Makefile dep liblinalg.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@ +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@ +profile all-profile: all + + +# include the generated dep rules for the regular objects and for the +# STARGETS objects (compiled with SOPTARGS), so that a header change rebuilds both sets + +-include $(addsuffix .d,${liblinalg_TARGETS} ${liblinalg_STARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +${liblinalg_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${OPTARGS} -c $< + +${liblinalg_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${SOPTARGS} -c $< + +# rule to make liblinalg + +liblinalg.a: ${liblinalg_OBJECTS} ${liblinalg_SOBJECTS} Makefile + @rm -f liblinalg.a + @${AR} cru liblinalg.a ${liblinalg_OBJECTS} 
${liblinalg_SOBJECTS} + @$(RANLIB) liblinalg.a + @cp liblinalg.a ../lib/liblinalg.a + +# rule to generate .d files + +$(addsuffix .d, $(liblinalg_TARGETS) ${liblinalg_STARGETS}): %.d: ${srcdir}/%.c Makefile + @${CCDEP} ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${liblinalg_TARGETS} ${liblinalg_STARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} ${$(addsuffix _SOBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/liblinalg.a + +distclean: clean + rm -f Makefile + +.PHONY: all dep clean compile-clean distclean profile all-profile debug all-debug diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/add.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/add.c new file mode 100644 index 0000000000000000000000000000000000000000..0866d4187e2064024f1276968cb4d6a617d93706 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/add.c @@ -0,0 +1,146 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +/******************************************************************************* + * + * void add(spinor * const Q,const spinor * const R,const spinor * const S, const int N) + * Makes the sum (*Q) = (*R) + (*S) for N spinors + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "add.h" + +#if (defined BGQ && defined XLC) + +/* BGQ/XLC variant. Q output, R input, S input, N number of spinors. + * Each spinor is addressed as an array of doubles: six vector4double loads from R and from S (offsets 0,4,...,20), six vec_add, six vec_st into Q. + * With OMP defined the loop iterations are shared among the threads of a parallel region. */ +void add(spinor * const Q,const spinor * const R,const spinor * const S, const int N) { +#ifdef OMP +#pragma omp parallel + { +#endif + vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5; + vector4double z0, z1, z2, z3, z4, z5; + double *q; + double *r,*s; + + /* NOTE(review): s, r and q are passed to __alignx here before they are assigned (their first assignment is inside the loop) -- confirm this is intended. */ + __alignx(32, s); + __alignx(32, r); + __alignx(32, q); + __alignx(32, S); + __alignx(32, R); + + __prefetch_by_load(S); + __prefetch_by_load(R); + __prefetch_by_stream(1, Q); + +#ifndef OMP +#pragma unroll(2) +#else +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) { + s=(double*)((spinor *) S + ix); + r=(double*)((spinor *) R + ix); + q=(double*)((spinor *) Q + ix); + /* prefetch the operands of the next iteration */ + __prefetch_by_load(S + ix + 1); + __prefetch_by_load(R + ix + 1); + __prefetch_by_stream(1, Q + ix + 1); + x0 = vec_ld(0, r); + x1 = vec_ld(0, r+4); + x2 = vec_ld(0, r+8); + x3 = vec_ld(0, r+12); + x4 = vec_ld(0, r+16); + x5 = vec_ld(0, r+20); + y0 = vec_ld(0, s); + y1 = vec_ld(0, s+4); + y2 = vec_ld(0, s+8); + y3 = vec_ld(0, s+12); + y4 = vec_ld(0, s+16); + y5 = vec_ld(0, s+20); + z0 = vec_add(x0, y0); + z1 = vec_add(x1, y1); + z2 = vec_add(x2, y2); + z3 = vec_add(x3, y3); + z4 = vec_add(x4, y4); + z5 = vec_add(x5, y5); + vec_st(z0, 0, q); + vec_st(z1, 0, q+4); + vec_st(z2, 0, q+8); + vec_st(z3, 0, q+12); + vec_st(z4, 0, q+16); + vec_st(z5, 0, q+20); + } + +#ifdef OMP + } /*OpenMP parallel closing brace */ +#endif + return; +} + +#else + +/* Generic variant. Q output, R input, S input, N number of spinors: every s0..s3 / c0..c2 component of Q is set to the sum of the matching components of R and S. With OMP defined the loop iterations are shared among the threads of a parallel region. */ +void add(spinor * const Q,const spinor 
* const R,const spinor * const S, const int N){ +#ifdef OMP +#pragma omp parallel + { +#endif + + int ix; + spinor *q,*r,*s; + +#ifdef OMP +#pragma omp for +#endif + for (ix = 0; ix < N; ix++){ + q=(spinor *) Q + ix; + r=(spinor *) R + ix; + s=(spinor *) S + ix; + + q->s0.c0 = r->s0.c0 + s->s0.c0; + q->s0.c1 = r->s0.c1 + s->s0.c1; + q->s0.c2 = r->s0.c2 + s->s0.c2; + + q->s1.c0 = r->s1.c0 + s->s1.c0; + q->s1.c1 = r->s1.c1 + s->s1.c1; + q->s1.c2 = r->s1.c2 + s->s1.c2; + + q->s2.c0 = r->s2.c0 + s->s2.c0; + q->s2.c1 = r->s2.c1 + s->s2.c1; + q->s2.c2 = r->s2.c2 + s->s2.c2; + + q->s3.c0 = r->s3.c0 + s->s3.c0; + q->s3.c1 = r->s3.c1 + s->s3.c1; + q->s3.c2 = r->s3.c2 + s->s3.c2; + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + +} + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/add.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/add.h new file mode 100644 index 0000000000000000000000000000000000000000..7ff0bbd3a0279629af87851e906522db3d54d2a0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/add.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _ADD_H +#define _ADD_H + +#include "su3.h" + +/* Makes the sum (*Q) = (*R) + (*S) */ +void add(spinor * const Q, const spinor * const R, const spinor * const S, const int N); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/addto_32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/addto_32.c new file mode 100644 index 0000000000000000000000000000000000000000..c15f994d4cb3319fd809df8da6547a42bb87b783 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/addto_32.c @@ -0,0 +1,55 @@ +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "addto_32.h" + + + +/* Q in/out, R input, N number of elements: (*Q) = (*Q) + (*R). + * Every s0..s3 / c0..c2 component of the spinor32 field R is added onto the matching component of the spinor field Q. + * With OMP defined the loop iterations are shared among the threads of a parallel region. */ +void addto_32(spinor * const Q, const spinor32 * const R, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + + int ix; + spinor *q; + spinor32 * r; +#ifdef OMP +#pragma omp for +#endif + for (ix = 0; ix < N; ix++){ + q=(spinor *) Q + ix; + r=(spinor32 *) R + ix; + + + q->s0.c0 += r->s0.c0; + q->s0.c1 += r->s0.c1; + q->s0.c2 += r->s0.c2; + + q->s1.c0 += r->s1.c0; + q->s1.c1 += r->s1.c1; + q->s1.c2 += r->s1.c2; + + q->s2.c0 += r->s2.c0; + q->s2.c1 += r->s2.c1; + q->s2.c2 += r->s2.c2; + + q->s3.c0 += r->s3.c0; + q->s3.c1 += r->s3.c1; + q->s3.c2 += r->s3.c2; + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + +} \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/addto_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/addto_32.h new file mode 100644 index 0000000000000000000000000000000000000000..9afbc350ce81f7914cfd0fa66739ace02472fb6b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/addto_32.h @@ -0,0 +1,10 @@ +#ifndef _ADDTO_32_H +#define _ADDTO_32_H + +#include "su3.h" + +/* Makes the sum (*Q) = (*Q) + (*R) for N elements, R being a spinor32 field */ +void addto_32(spinor * const Q, const spinor32 * const R, const int N); + + +#endif 
\ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign.c new file mode 100644 index 0000000000000000000000000000000000000000..7f5a4cdef280e1db9fee588f02ac934792fe1fd6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign.c @@ -0,0 +1,69 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +/******************************************************************************* + * + * File assign.c + * + * void assign(spinor * const R, spinor * const S) + * Assign (*R) = (*S) + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include "su3.h" +#include "assign.h" + +
+
+/*
+ * assign: copy N spinors from S into R, i.e. (*R) = (*S).
+ * R is the output and S the input; the copy is done with memcpy, so the
+ * two fields must not overlap.
+ */
+void assign(spinor * const R, spinor * const S, const int N)
+{
+  const size_t nbytes = N*sizeof(spinor);
+
+  memcpy(R, S, nbytes);
+}
+
+/* assign_32: same contract as assign, for fields of N spinor32 elements. */
+void assign_32(spinor32 * const R, spinor32 * const S, const int N)
+{
+  const size_t nbytes = N*sizeof(spinor32);
+
+  memcpy(R, S, nbytes);
+}
+
+#ifdef WITHLAPH
+/* assign_su3vect: copy the c0, c1 and c2 members of N su3_vector elements
+ * from S into R. */
+void assign_su3vect(su3_vector * const R, su3_vector * const S, const int N)
+{
+  int ix = 0;
+
+  while (ix < N)
+  {
+    su3_vector * const dst = R + ix;
+    su3_vector * const src = S + ix;
+
+    dst->c0 = src->c0;
+    dst->c1 = src->c1;
+    dst->c2 = src->c2;
+    ++ix;
+  }
+}
+#endif
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign.h new file mode 100644 index 0000000000000000000000000000000000000000..302829aa63e7ec5a98256490910f4b8132c1a7a4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign.h @@ -0,0 +1,30 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _ASSIGN_H +#define _ASSIGN_H + +#include "su3.h" + +/* Assign (*R) = (*S) */ +void assign(spinor * const R, spinor * const S, const int N); +void assign_32(spinor32 * const R, spinor32 * const S, const int N); +void assign_su3vect(su3_vector * const R, su3_vector * const S, const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul.c new file mode 100644 index 0000000000000000000000000000000000000000..127092fa0d98e236c14d5b0c88e6f6a3d8df8a4f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul.c @@ -0,0 +1,80 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +/******************************************************************************* + * + * File assign_add_mul.c + * + * void assign_add_mul(spinor * const P, spinor * const Q, const complex c) + * (*P) = (*P) + c(*Q) c is a complex constant + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "assign_add_mul.h" + +
+
+/*
+ * (*R) = (*R) + c*(*S) for N spinors, c being a complex constant.
+ *
+ * R: in/out field, updated in place.
+ * S: input field.
+ * c: complex factor applied to every s0..s3 / c0..c2 component of S.
+ * N: number of spinors to process.
+ *
+ * With OMP defined the loop iterations are shared among the threads of a
+ * parallel region; r and s are declared inside that region, so every thread
+ * works on its own pair of pointers.
+ */
+void assign_add_mul(spinor * const R, spinor * const S, const _Complex double c, const int N)
+{
+#ifdef OMP
+#pragma omp parallel
+  {
+#endif
+  spinor *r,*s;
+
+#ifdef OMP
+#pragma omp for
+#endif
+  for (int ix = 0; ix < N; ++ix)
+  {
+    /* element ix of the output and of the input field */
+    r=(spinor *) R + ix;
+    s=(spinor *) S + ix;
+
+    r->s0.c0 += c * s->s0.c0;
+    r->s0.c1 += c * s->s0.c1;
+    r->s0.c2 += c * s->s0.c2;
+
+    r->s1.c0 += c * s->s1.c0;
+    r->s1.c1 += c * s->s1.c1;
+    r->s1.c2 += c * s->s1.c2;
+
+    r->s2.c0 += c * s->s2.c0;
+    r->s2.c1 += c * s->s2.c1;
+    r->s2.c2 += c * s->s2.c2;
+
+    r->s3.c0 += c * s->s3.c0;
+    r->s3.c1 += c * s->s3.c1;
+    r->s3.c2 += c * s->s3.c2;
+  }
+
+#ifdef OMP
+  } /* OpenMP closing brace */
+#endif
+
+}
+ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul.h new file mode 100644 index 0000000000000000000000000000000000000000..89d67ee32db8f8fc41e3c1a52d8e95d4245a6dc1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _ASSIGN_ADD_MUL_H +#define _ASSIGN_ADD_MUL_H + +#include "su3.h" + +/* (*P) = (*P) + c(*Q) c is a complex constant */ +void assign_add_mul(spinor * const P, spinor * const Q, const _Complex double c, const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_add_mul.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_add_mul.c new file mode 100644 index 0000000000000000000000000000000000000000..34368eaf7ba22e289e5a88bd38a60e2e3e928054 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_add_mul.c @@ -0,0 +1,81 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +/******************************************************************************* + * + * File assign_add_mul_add_mul.c + * + * void assign_add_mul_add_mul(spinor * const R,spinor * const S,spinor * const U,const _Complex double c1,const _Complex double c2,const int N) + * (*R) = (*R) + c1*(*S) + c2*(*U) with c1 and c2 _Complex double variables, applied to N spinors + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "assign_add_mul_add_mul.h" + + +/* S,U input, R in/out, c1,c2 input, N number of spinors. + * Every s0..s3 / c0..c2 component of R is incremented by c1 times the matching component of S plus c2 times the matching component of U. + * With OMP defined the loop iterations are shared among the threads of a parallel region; r, s and u are declared inside that region. */ +void assign_add_mul_add_mul(spinor * const R, spinor * const S, spinor * const U, const _Complex double c1, const _Complex double c2, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + + spinor *r, *s, *u; + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) + { + r=(spinor *) R + ix; + s=(spinor *) S + ix; + u=(spinor *) U + ix; + + r->s0.c0 += c1 * s->s0.c0 + c2 * u->s0.c0; + r->s0.c1 += c1 * s->s0.c1 + c2 * u->s0.c1; + r->s0.c2 += c1 * s->s0.c2 + c2 * u->s0.c2; + + r->s1.c0 += c1 * s->s1.c0 + c2 * u->s1.c0; + r->s1.c1 += c1 * s->s1.c1 + c2 * u->s1.c1; + r->s1.c2 += c1 * s->s1.c2 + c2 * u->s1.c2; + + r->s2.c0 += c1 * s->s2.c0 + c2 * u->s2.c0; + r->s2.c1 += c1 * s->s2.c1 + c2 * u->s2.c1; + r->s2.c2 += c1 * s->s2.c2 + c2 * u->s2.c2; + + r->s3.c0 += c1 * s->s3.c0 + c2 * u->s3.c0; + r->s3.c1 += c1 * s->s3.c1 + c2 * u->s3.c1; + r->s3.c2 += c1 * s->s3.c2 + c2 * u->s3.c2; + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_add_mul.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_add_mul.h new file mode 100644 index 0000000000000000000000000000000000000000..20ea8faa226c58b61ef0d0ceaae0f943d8b85541 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_add_mul.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _ASSIGN_ADD_MUL_ADD_MUL_H +#define _ASSIGN_ADD_MUL_ADD_MUL_H + +#include "su3.h" + +/* (*R) = (*R) + c1*(*S) + c2*(*U) */ +void assign_add_mul_add_mul(spinor * const R, spinor * const S, spinor * const U, const _Complex double c1, const _Complex double c2, const int N); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_add_mul_r.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_add_mul_r.c new file mode 100644 index 0000000000000000000000000000000000000000..7d372cd3779c1b8d233d8b43300d44a7fe99fa0f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_add_mul_r.c @@ -0,0 +1,63 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/******************************************************************************* + * + * File assign_add_mul_add_mul_r.c + * + * void assign_add_mul_add_mul_r(spinor * const R,spinor * const S,spinor * const U,const double c1,const double c2) + * (*R) = (*R) + c1*(*S) + c2*(*U) with c1 and c2 real variables + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "su3.h" +#include "assign_add_mul_add_mul_r.h" + + +/* S,U input, R inoutput, c1,c2 input */ +void assign_add_mul_add_mul_r(spinor * const R, spinor * const S, spinor * const U, + const double c1, const double c2, const int N){ + int ix; + spinor *r,*s,*u; + + for (ix = 0; ix < N; ++ix) + { + r=(spinor *) R + ix; + s=(spinor *) S + ix; + u=(spinor *) U + ix; + + r->s0.c0 += c1 * s->s0.c0 + c2 * u->s0.c0; + r->s0.c1 += c1 * s->s0.c1 + c2 * u->s0.c1; + r->s0.c2 += c1 * s->s0.c2 + c2 * u->s0.c2; + r->s1.c0 += c1 * s->s1.c0 + c2 * u->s1.c0; + r->s1.c1 += c1 * s->s1.c1 + c2 * u->s1.c1; + r->s1.c2 += c1 * s->s1.c2 + c2 * u->s1.c2; + r->s2.c0 += c1 * s->s2.c0 + c2 * u->s2.c0; + r->s2.c1 += c1 * s->s2.c1 + c2 * u->s2.c1; + r->s2.c2 += c1 * s->s2.c2 + c2 * u->s2.c2; + r->s3.c0 += c1 * s->s3.c0 + c2 * u->s3.c0; + r->s3.c1 += c1 * 
s->s3.c1 + c2 * u->s3.c1; + r->s3.c2 += c1 * s->s3.c2 + c2 * u->s3.c2; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_add_mul_r.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_add_mul_r.h new file mode 100644 index 0000000000000000000000000000000000000000..3b12c743d7c894e569abff28ba5a6321c25d9939 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_add_mul_r.h @@ -0,0 +1,30 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _ASSIGN_ADD_MUL_ADD_MUL_R_H +#define _ASSIGN_ADD_MUL_ADD_MUL_R_H + +#include "su3.h" + +/* (*R) = (*R) + c1*(*S) + c2*(*U) */ +void assign_add_mul_add_mul_r(spinor * const R, spinor * const S, spinor * const U, + const double c1, const double c2, const int N); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r.c new file mode 100644 index 0000000000000000000000000000000000000000..d81ca94ea7548d221bfdef360e8f2bf75b9ffec3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r.c @@ -0,0 +1,411 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +#endif +#ifdef HAVE_CONFIG_H +# include +#endif +#include "su3.h" +#include "assign_add_mul_r.h" + + +#if ( defined SSE2 || defined SSE3 ) +#include "sse.h" + +/* (*P) = (*P) + c(*Q) c is a real constant */ + +void assign_add_mul_r(spinor * const P, spinor * const Q, const double c, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + int ix; + su3_vector *s,*r; + __asm__ __volatile__ ("movsd %0, %%xmm7 \n\t" + "unpcklpd %%xmm7, %%xmm7" + : + : + "m" (c)); +#ifndef OMP + s=&P[0].s0; + r=&Q[0].s0; +#endif + +#ifdef OMP +#pragma omp for +#endif + for (ix = 0;ix < 4*N; ix++) { +#ifdef OMP + s=&P[0].s0+ix; + r=&Q[0].s0+ix; +#endif + _sse_load_up(*r); + __asm__ __volatile__ ("mulpd %%xmm7, %%xmm3 \n\t" + "mulpd %%xmm7, %%xmm4 \n\t" + "mulpd %%xmm7, %%xmm5" + : + :); + _sse_load(*s); + __asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" + "addpd %%xmm4, %%xmm1 \n\t" + "addpd %%xmm5, %%xmm2" + : + :); + _sse_store(*s); +#ifndef OMP + s++; r++; +#endif + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} + +#elif (defined BGQ && defined XLC) + +void assign_add_mul_r(spinor * const R, spinor * const S, const double c, const int N) { +#ifdef OMP +#pragma omp parallel + { +#endif + vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5; + vector4double z0, z1, z2, z3, z4, z5, k; + double *s, *r; + double ALIGN _c; + _c = c; + __prefetch_by_load(S); + __prefetch_by_load(R); + + k = vec_splats(_c); + __alignx(32, s); + __alignx(32, r); + __alignx(32, S); + __alignx(32, R); + +#ifdef OMP +#pragma omp for +#else +#pragma unroll(2) +#endif + for(int i = 0; i < N; i++) { + s=(double*)((spinor *) S + i); + r=(double*)((spinor *) R + i); + __prefetch_by_load(S + i + 1); + __prefetch_by_stream(1, R + i + 1); + x0 = vec_ld(0, r); + x1 = vec_ld(0, r+4); + x2 = vec_ld(0, r+8); + x3 = vec_ld(0, 
r+12); + x4 = vec_ld(0, r+16); + x5 = vec_ld(0, r+20); + y0 = vec_ld(0, s); + y1 = vec_ld(0, s+4); + y2 = vec_ld(0, s+8); + y3 = vec_ld(0, s+12); + y4 = vec_ld(0, s+16); + y5 = vec_ld(0, s+20); + z0 = vec_madd(k, y0, x0); + z1 = vec_madd(k, y1, x1); + z2 = vec_madd(k, y2, x2); + z3 = vec_madd(k, y3, x3); + z4 = vec_madd(k, y4, x4); + z5 = vec_madd(k, y5, x5); + vec_st(z0, 0, r); + vec_st(z1, 0, r+4); + vec_st(z2, 0, r+8); + vec_st(z3, 0, r+12); + vec_st(z4, 0, r+16); + vec_st(z5, 0, r+20); + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} + +#elif ((defined BGL) && (defined XLC)) + +# include"bgl.h" + +void assign_add_mul_r(spinor * const R, spinor * const S, const double c, const int N) { + int ix = 1; + double *s ALIGN; + double *sp ALIGN; + double *r ALIGN; + double *rp ALIGN; + double _Complex x00, x01, x02, x03, x04, x05, x06, x07, + x08, x09, x10, x11; + double _Complex y00, y01, y02, y03, y04, y05, y06, y07, + y08, y09, y10, y11; + double _Complex a; + +#pragma disjoint(*S, *R) + a = __cmplx(c, c); + __alignx(16, S); + __alignx(16, R); + s = (double*) S; + r = (double*) R; + rp = r + 24; + sp = s + 24; + _prefetch_spinor(rp); + _prefetch_spinor(sp); + x00 = __lfpd(s); + x01 = __lfpd(s+2); + x02 = __lfpd(s+4); + x03 = __lfpd(s+6); + x04 = __lfpd(s+8); + x05 = __lfpd(s+10); + x06 = __lfpd(s+12); + x07 = __lfpd(s+14); + x08 = __lfpd(s+16); + x09 = __lfpd(s+18); + x10 = __lfpd(s+20); + x11 = __lfpd(s+22); + y00 = __lfpd(r); + y01 = __lfpd(r+2); + y02 = __lfpd(r+4); + y03 = __lfpd(r+6); + y04 = __lfpd(r+8); + y05 = __lfpd(r+10); + y06 = __lfpd(r+12); + y07 = __lfpd(r+14); + y08 = __lfpd(r+16); + y09 = __lfpd(r+18); + y10 = __lfpd(r+20); + y11 = __lfpd(r+22); + + y00 = __fpmadd(y00, x00, a); + y01 = __fpmadd(y01, x01, a); + y02 = __fpmadd(y02, x02, a); + y03 = __fpmadd(y03, x03, a); + y04 = __fpmadd(y04, x04, a); + y05 = __fpmadd(y05, x05, a); + y06 = __fpmadd(y06, x06, a); + y07 = __fpmadd(y07, x07, a); + y08 = __fpmadd(y08, x08, a); + y09 = 
__fpmadd(y09, x09, a); + y10 = __fpmadd(y10, x10, a); + y11 = __fpmadd(y11, x11, a); + __stfpd(r, y00); + __stfpd(r+2, y01); + __stfpd(r+4, y02); + __stfpd(r+6, y03); + __stfpd(r+8, y04); + __stfpd(r+10, y05); + __stfpd(r+12, y06); + __stfpd(r+14, y07); + __stfpd(r+16, y08); + __stfpd(r+18, y09); + __stfpd(r+20, y10); + __stfpd(r+22, y11); + s = sp; + r = rp; + +#pragma unroll(12) + for(ix = 1; ix < N-1; ix++) { + rp += 24; + sp += 24; + _prefetch_spinor(rp); + _prefetch_spinor(sp); + x00 = __lfpd(s); + x01 = __lfpd(s+2); + x02 = __lfpd(s+4); + x03 = __lfpd(s+6); + x04 = __lfpd(s+8); + x05 = __lfpd(s+10); + x06 = __lfpd(s+12); + x07 = __lfpd(s+14); + x08 = __lfpd(s+16); + x09 = __lfpd(s+18); + x10 = __lfpd(s+20); + x11 = __lfpd(s+22); + y00 = __lfpd(r); + y01 = __lfpd(r+2); + y02 = __lfpd(r+4); + y03 = __lfpd(r+6); + y04 = __lfpd(r+8); + y05 = __lfpd(r+10); + y06 = __lfpd(r+12); + y07 = __lfpd(r+14); + y08 = __lfpd(r+16); + y09 = __lfpd(r+18); + y10 = __lfpd(r+20); + y11 = __lfpd(r+22); + + y00 = __fpmadd(y00, x00, a); + y01 = __fpmadd(y01, x01, a); + y02 = __fpmadd(y02, x02, a); + y03 = __fpmadd(y03, x03, a); + y04 = __fpmadd(y04, x04, a); + y05 = __fpmadd(y05, x05, a); + y06 = __fpmadd(y06, x06, a); + y07 = __fpmadd(y07, x07, a); + y08 = __fpmadd(y08, x08, a); + y09 = __fpmadd(y09, x09, a); + y10 = __fpmadd(y10, x10, a); + y11 = __fpmadd(y11, x11, a); + __stfpd(r, y00); + __stfpd(r+2, y01); + __stfpd(r+4, y02); + __stfpd(r+6, y03); + __stfpd(r+8, y04); + __stfpd(r+10, y05); + __stfpd(r+12, y06); + __stfpd(r+14, y07); + __stfpd(r+16, y08); + __stfpd(r+18, y09); + __stfpd(r+20, y10); + __stfpd(r+22, y11); + s = sp; + r = rp; + + } + x00 = __lfpd(s); + x01 = __lfpd(s+2); + x02 = __lfpd(s+4); + x03 = __lfpd(s+6); + x04 = __lfpd(s+8); + x05 = __lfpd(s+10); + x06 = __lfpd(s+12); + x07 = __lfpd(s+14); + x08 = __lfpd(s+16); + x09 = __lfpd(s+18); + x10 = __lfpd(s+20); + x11 = __lfpd(s+22); + y00 = __lfpd(r); + y01 = __lfpd(r+2); + y02 = __lfpd(r+4); + y03 = __lfpd(r+6); + 
y04 = __lfpd(r+8); + y05 = __lfpd(r+10); + y06 = __lfpd(r+12); + y07 = __lfpd(r+14); + y08 = __lfpd(r+16); + y09 = __lfpd(r+18); + y10 = __lfpd(r+20); + y11 = __lfpd(r+22); + + y00 = __fpmadd(y00, x00, a); + y01 = __fpmadd(y01, x01, a); + y02 = __fpmadd(y02, x02, a); + y03 = __fpmadd(y03, x03, a); + y04 = __fpmadd(y04, x04, a); + y05 = __fpmadd(y05, x05, a); + y06 = __fpmadd(y06, x06, a); + y07 = __fpmadd(y07, x07, a); + y08 = __fpmadd(y08, x08, a); + y09 = __fpmadd(y09, x09, a); + y10 = __fpmadd(y10, x10, a); + y11 = __fpmadd(y11, x11, a); + __stfpd(r, y00); + __stfpd(r+2, y01); + __stfpd(r+4, y02); + __stfpd(r+6, y03); + __stfpd(r+8, y04); + __stfpd(r+10, y05); + __stfpd(r+12, y06); + __stfpd(r+14, y07); + __stfpd(r+16, y08); + __stfpd(r+18, y09); + __stfpd(r+20, y10); + __stfpd(r+22, y11); + + return; +} + +#else + +/* (*P) = (*P) + c(*Q) c is a real constant */ + +void assign_add_mul_r(spinor * const P, spinor * const Q, const double c, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + register spinor *p; + register spinor *q; + + /* Change due to even-odd preconditioning : VOLUME to VOLUME/2 */ +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) + { + p = P + ix; + q = Q + ix; + p->s0.c0 += c * q->s0.c0; + p->s0.c1 += c * q->s0.c1; + p->s0.c2 += c * q->s0.c2; + + p->s1.c0 += c * q->s1.c0; + p->s1.c1 += c * q->s1.c1; + p->s1.c2 += c * q->s1.c2; + + p->s2.c0 += c * q->s2.c0; + p->s2.c1 += c * q->s2.c1; + p->s2.c2 += c * q->s2.c2; + + p->s3.c0 += c * q->s3.c0; + p->s3.c1 += c * q->s3.c1; + p->s3.c2 += c * q->s3.c2; + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} +#endif + +#ifdef WITHLAPH +void assign_add_mul_r_su3vect(su3_vector * const P, su3_vector * const Q, const double c, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + + su3_vector *p,*q; + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) + { + p = P + ix; + q = Q + ix; + + p->c0 += c * q->c0; + p->c1 += c * q->c1; + p->c2 += c * 
q->c2; + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r.h new file mode 100644 index 0000000000000000000000000000000000000000..46897542895ffd4c271259fb3fc8c8c90bcfb6eb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _ASSIGN_ADD_MUL_R_H +#define _ASSIGN_ADD_MUL_R_H + +#include "su3.h" + +void assign_add_mul_r(spinor * const P, spinor * const Q, const double c, const int N); +void assign_add_mul_r_su3vect(su3_vector * const P, su3_vector * const Q, const double c, const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r_32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r_32.c new file mode 100644 index 0000000000000000000000000000000000000000..fe513a774605abc95dbd0cae6069b45478ff2bb2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r_32.c @@ -0,0 +1,143 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +/******************************************************************************* + * + * File assign_add_mul.c + * + * void assign_add_mul(spinor * const P, spinor * const Q, const complex c) + * (*P) = (*P) + c(*Q) c is a complex constant + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "assign_add_mul_r_32.h" + + +#if (defined BGQ && defined XLC) +void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c, const int N) { +#ifdef OMP +#pragma omp parallel + { +#endif + vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5; + vector4double z0, z1, z2, z3, z4, z5, k; + float *s, *r; + float ALIGN32 _c; + _c = c; + __prefetch_by_load(S); + __prefetch_by_load(R); + + k = vec_splats((double)_c); + __alignx(16, s); + __alignx(16, r); + __alignx(16, S); + __alignx(16, R); + +#ifdef OMP +#pragma omp for +#else +#pragma unroll(2) +#endif + for(int i = 0; i < N; i++) { + s=(float*)((spinor32 *) S + i); + r=(float*)((spinor32 *) R + i); + __prefetch_by_load(S + i + 1); + __prefetch_by_stream(1, R + i + 1); + x0 = vec_ld(0, r); + x1 = vec_ld(0, r+4); + x2 = vec_ld(0, r+8); + x3 = vec_ld(0, r+12); + x4 = vec_ld(0, r+16); + x5 = vec_ld(0, r+20); + y0 = vec_ld(0, s); + y1 = vec_ld(0, s+4); + y2 = vec_ld(0, s+8); + y3 = vec_ld(0, s+12); + y4 = vec_ld(0, s+16); + y5 = vec_ld(0, s+20); + z0 = vec_madd(k, y0, x0); + z1 = vec_madd(k, y1, x1); + z2 = vec_madd(k, y2, x2); + z3 = vec_madd(k, y3, x3); + z4 = vec_madd(k, y4, x4); + z5 = vec_madd(k, y5, x5); + vec_st(z0, 0, r); + vec_st(z1, 0, r+4); + vec_st(z2, 0, r+8); + vec_st(z3, 0, r+12); + vec_st(z4, 0, r+16); + vec_st(z5, 0, r+20); + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} + +#else + +void assign_add_mul_r_32(spinor32 * const R, spinor32 * 
const S, const float c, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + spinor32 *r,*s; + +#ifdef OMP +#pragma omp for +#endif + for (int ix=0; ixs0.c0 += c * s->s0.c0; + r->s0.c1 += c * s->s0.c1; + r->s0.c2 += c * s->s0.c2; + + r->s1.c0 += c * s->s1.c0; + r->s1.c1 += c * s->s1.c1; + r->s1.c2 += c * s->s1.c2; + + r->s2.c0 += c * s->s2.c0; + r->s2.c1 += c * s->s2.c1; + r->s2.c2 += c * s->s2.c2; + + r->s3.c0 += c * s->s3.c0; + r->s3.c1 += c * s->s3.c1; + r->s3.c2 += c * s->s3.c2; + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + +} + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r_32.h new file mode 100644 index 0000000000000000000000000000000000000000..c3bec9ecf6d3ecea13d84ce70a3eb71043cd6341 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r_32.h @@ -0,0 +1,9 @@ +#ifndef _ASSIGN_ADD_MUL_32_H +#define _ASSIGN_ADD_MUL_32_H + +#include "su3.h" + +/* (*P) = (*P) + c(*Q) c is a complex constant */ +void assign_add_mul_r_32(spinor32 * const R, spinor32 * const S, const float c, const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r_add_mul.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r_add_mul.c new file mode 100644 index 0000000000000000000000000000000000000000..bff23edddc573dd2e527bc19feae400d12aad24d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r_add_mul.c @@ -0,0 +1,127 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef MPI +#include +#endif +#ifdef OMP +# include +#endif +#include "su3.h" +#include "su3adj.h" +#include "sse.h" +#include "assign_add_mul_r_add_mul.h" + +#if ( defined SSE2 || defined SSE3 ) +void assign_add_mul_r_add_mul(spinor * const R, spinor * const S, spinor * const U, + const double c1,const double c2, const int N) { + + int ix; + su3_vector *s,*r,*t; + r=&R[0].s0; + s=&S[0].s0; + t=&U[0].s0; + __asm__ __volatile__ ("movsd %0, %%xmm6 \n\t" + "unpcklpd %%xmm6, %%xmm6" + : + : + "m" (c1)); + __asm__ __volatile__ ("movsd %0, %%xmm7 \n\t" + "unpcklpd %%xmm7, %%xmm7" + : + : + "m" (c2)); + + for (ix = 0; ix < 4*N; ix++) { + _sse_load_up(*s); + __asm__ __volatile__ ("mulpd %%xmm6, %%xmm3 \n\t" + "mulpd %%xmm6, %%xmm4 \n\t" + "mulpd %%xmm6, %%xmm5" + : + :); + _sse_load(*r); + __asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" + "addpd %%xmm4, %%xmm1 \n\t" + "addpd %%xmm5, %%xmm2" + : + :); + _sse_load_up(*t); + __asm__ __volatile__ ("mulpd %%xmm7, %%xmm3 \n\t" + "mulpd %%xmm7, %%xmm4 \n\t" + "mulpd %%xmm7, %%xmm5" + : + :); + __asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" + "addpd %%xmm4, %%xmm1 \n\t" + "addpd %%xmm5, %%xmm2" + : + :); + _sse_store(*r); + r++; s++; t++; + } +} +#else +/* j, k input, l output */ 
+void assign_add_mul_r_add_mul(spinor * const R, spinor * const S, spinor * const U, + const double c1,const double c2, const int N) { +#ifdef OMP +#pragma omp parallel + { +#endif + + spinor *r,*s,*t; + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) + { + r=R+ix; + s=S+ix; + t=U+ix; + + r->s0.c0 += c1 * s->s0.c0 + c2 * t->s0.c0; + r->s0.c1 += c1 * s->s0.c1 + c2 * t->s0.c1; + r->s0.c2 += c1 * s->s0.c2 + c2 * t->s0.c2; + + r->s1.c0 += c1 * s->s1.c0 + c2 * t->s1.c0; + r->s1.c1 += c1 * s->s1.c1 + c2 * t->s1.c1; + r->s1.c2 += c1 * s->s1.c2 + c2 * t->s1.c2; + + r->s2.c0 += c1 * s->s2.c0 + c2 * t->s2.c0; + r->s2.c1 += c1 * s->s2.c1 + c2 * t->s2.c1; + r->s2.c2 += c1 * s->s2.c2 + c2 * t->s2.c2; + + r->s3.c0 += c1 * s->s3.c0 + c2 * t->s3.c0; + r->s3.c1 += c1 * s->s3.c1 + c2 * t->s3.c1; + r->s3.c2 += c1 * s->s3.c2 + c2 * t->s3.c2; + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + +} +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r_add_mul.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r_add_mul.h new file mode 100644 index 0000000000000000000000000000000000000000..4fbf3c3f2f50a2dc4d799b93731cef6584759718 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_add_mul_r_add_mul.h @@ -0,0 +1,30 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _ASSIGN_ADD_MUL_R_ADD_MUL_H +#define _ASSIGN_ADD_MUL_R_ADD_MUL_H + +#include "su3.h" + +/* (*R) = (*R) + c1*(*S) + c2*(*U) */ +void assign_add_mul_r_add_mul(spinor * const R, spinor * const S, spinor * const U, + const double c1,const double c2, const int N); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_diff_mul.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_diff_mul.c new file mode 100644 index 0000000000000000000000000000000000000000..48b77167e48f106f82e23683875eb7764c11c127 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_diff_mul.c @@ -0,0 +1,66 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include "su3.h" +#include "assign_diff_mul.h" + +/* R=R-c*S */ +void assign_diff_mul(spinor * const R, spinor * const S, const _Complex double c, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + spinor *r, *s; + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) + { + r=(spinor *) R + ix; + s=(spinor *) S + ix; + + r->s0.c0 -= c * s->s0.c0; + r->s0.c1 -= c * s->s0.c1; + r->s0.c2 -= c * s->s0.c2; + + r->s1.c0 -= c * s->s1.c0; + r->s1.c1 -= c * s->s1.c1; + r->s1.c2 -= c * s->s1.c2; + + r->s2.c0 -= c * s->s2.c0; + r->s2.c1 -= c * s->s2.c1; + r->s2.c2 -= c * s->s2.c2; + + r->s3.c0 -= c * s->s3.c0; + r->s3.c1 -= c * s->s3.c1; + r->s3.c2 -= c * s->s3.c2; + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_diff_mul.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_diff_mul.h new file mode 100644 index 0000000000000000000000000000000000000000..8e6a68921b8851b54f9f5165f262e8723b0821f8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_diff_mul.h @@ -0,0 +1,27 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _ASSIGN_DIFF_MUL_H +#define _ASSIGN_DIFF_MUL_H + +#include "su3.h" + +void assign_diff_mul(spinor * const S,spinor * const R, const _Complex double c, const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add.c new file mode 100644 index 0000000000000000000000000000000000000000..7865ec9894a29797246d7f55858358e958a58cff --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add.c @@ -0,0 +1,71 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "assign_mul_add.h" + +void assign_mul_add(spinor * const R, const _Complex double c, spinor * const S, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + spinor *r,*s; + +#ifdef OMP +#pragma omp for +#endif + for (int ix=0; ixs0.c0 = c * r->s0.c0 + s->s0.c0; + r->s0.c1 = c * r->s0.c1 + s->s0.c1; + r->s0.c2 = c * r->s0.c2 + s->s0.c2; + + r->s1.c0 = c * r->s1.c0 + s->s1.c0; + r->s1.c1 = c * r->s1.c1 + s->s1.c1; + r->s1.c2 = c * r->s1.c2 + s->s1.c2; + + r->s2.c0 = c * r->s2.c0 + s->s2.c0; + r->s2.c1 = c * r->s2.c1 + s->s2.c1; + r->s2.c2 = c * r->s2.c2 + s->s2.c2; + + r->s3.c0 = c * r->s3.c0 + s->s3.c0; + r->s3.c1 = c * r->s3.c1 + s->s3.c1; + r->s3.c2 = c * r->s3.c2 + s->s3.c2; + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add.h new file mode 100644 index 0000000000000000000000000000000000000000..4571c80e4a1372d300f4d2e3a76171a18d1ea53c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _ASSIGN_MUL_ADD_H +#define _ASSIGN_MUL_ADD_H + +#include "su3.h" + +/* (*P) = c(*P) + (*Q) c is a complex constant */ +void assign_mul_add(spinor * const P, const _Complex double c, spinor * const Q, const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_add_mul_add_mul_r.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_add_mul_add_mul_r.c new file mode 100644 index 0000000000000000000000000000000000000000..72e12359797740a76c75f3c670bd259b49cfdb33 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_add_mul_add_mul_r.c @@ -0,0 +1,83 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/
+/*******************************************************************************
+ *
+ * File assign_mul_add_mul_add_mul_add_mul_r.c
+ *
+ * void assign_mul_add_mul_add_mul_add_mul_r
+ * (spinor * const R,spinor * const S,spinor * const U,spinor * const V,const double c1,const double c2,const double c3,const double c4)
+ * Makes (*R) = c1*(*R) + c2*(*S) + c3*(*U) + c4*(*V)with c1, c2, c3, c4 real variables
+ *
+ *******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+#ifdef OMP
+# include
+#endif
+#include
+#include
+#include
+#include "su3.h"
+#include "su3adj.h"
+#include "assign_mul_add_mul_add_mul_add_mul_r.h"
+
+
+/*
+ * R[ix] = c1*R[ix] + c2*S[ix] + c3*U[ix] + c4*V[ix]   for ix = 0 .. N-1
+ *
+ * Four-term linear combination with real weights, written back into R.
+ * The same update is applied to each of the twelve spinor components
+ * (s0..s3 x c0..c2).
+ *
+ *   R            in/out  first operand, overwritten with the result
+ *   S, U, V      in      remaining operands; only read here
+ *   c1,c2,c3,c4  in      real weights of R, S, U and V respectively
+ *   N            in      number of spinors to process
+ *
+ * With OMP defined the site loop is shared among OpenMP threads.
+ */
+void assign_mul_add_mul_add_mul_add_mul_r(spinor * const R, spinor * const S, spinor * const U, spinor * const V,
+					  const double c1, const double c2, const double c3, const double c4, const int N)
+{
+  /* Update of a single component; operands are summed in the order R, S, U, V. */
+#define LINCOMB4(comp) dst->comp = c1 * dst->comp + c2 * src_s->comp + c3 * src_u->comp + c4 * src_v->comp
+
+#ifdef OMP
+#pragma omp parallel for
+#endif
+  for (int site = 0; site < N; ++site)
+  {
+    spinor * const dst   = R + site;
+    spinor * const src_s = S + site;
+    spinor * const src_u = U + site;
+    spinor * const src_v = V + site;
+
+    LINCOMB4(s0.c0);
+    LINCOMB4(s0.c1);
+    LINCOMB4(s0.c2);
+
+    LINCOMB4(s1.c0);
+    LINCOMB4(s1.c1);
+    LINCOMB4(s1.c2);
+
+    LINCOMB4(s2.c0);
+    LINCOMB4(s2.c1);
+    LINCOMB4(s2.c2);
+
+    LINCOMB4(s3.c0);
+    LINCOMB4(s3.c1);
+    LINCOMB4(s3.c2);
+  }
+
+#undef LINCOMB4
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_add_mul_add_mul_r.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_add_mul_add_mul_r.h
new file mode 100644
index 0000000000000000000000000000000000000000..0f343c8f63afb895a59fc74e0510166e15f89166
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_add_mul_add_mul_r.h
@@ -0,0 +1,29 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see .
+ ***********************************************************************/ + +#ifndef _ASSIGN_MUL_ADD_MUL_ADD_MUL_ADD_MUL_R_H +#define _ASSIGN_MUL_ADD_MUL_ADD_MUL_ADD_MUL_R_H + +#include "su3.h" + +/* Makes (*R) = c1*(*R) + c2*(*S) + c3*(*U) + c4*(*V)*/ +void assign_mul_add_mul_add_mul_add_mul_r(spinor * const R,spinor * const S,spinor * const U,spinor * const V,const double c1,const double c2,const double c3,const double c4, const int N); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_add_mul_r.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_add_mul_r.c new file mode 100644 index 0000000000000000000000000000000000000000..6ef2d60064709ede7dce78d08fb7eca9ec60d5b1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_add_mul_r.c @@ -0,0 +1,82 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/
+/*******************************************************************************
+ *
+ * File assign_mul_add_mul_add_mul_r.c
+ *
+ * void assign_mul_add_mul_add_mul_r
+ * (spinor * const R,spinor * const S,spinor * const U,const double c1,const double c2,const double c3)
+ * Makes (*R) = c1*(*R) + c2*(*S) + c3*(*U) with c1, c2 and c3 real variables
+ *
+ *******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+#ifdef OMP
+# include
+#endif
+#include
+#include
+#include
+#include "su3.h"
+#include "assign_mul_add_mul_add_mul_r.h"
+
+/*
+ * R[ix] = c1*R[ix] + c2*S[ix] + c3*U[ix]   for ix = 0 .. N-1
+ *
+ * Three-term linear combination with real weights, written back into R.
+ * The same update is applied to each of the twelve spinor components
+ * (s0..s3 x c0..c2).
+ *
+ *   R         in/out  first operand, overwritten with the result
+ *   S, U      in      remaining operands; only read here
+ *   c1,c2,c3  in      real weights of R, S and U respectively
+ *   N         in      number of spinors to process
+ *
+ * With OMP defined the site loop is shared among OpenMP threads.
+ */
+void assign_mul_add_mul_add_mul_r(spinor * const R, spinor * const S, spinor * const U,
+				  const double c1,const double c2,const double c3,
+				  const int N)
+{
+  /* Update of a single component; operands are summed in the order R, S, U. */
+#define LINCOMB3(comp) dst->comp = c1 * dst->comp + c2 * src_s->comp + c3 * src_u->comp
+
+#ifdef OMP
+#pragma omp parallel for
+#endif
+  for (int site = 0; site < N; ++site)
+  {
+    spinor * const dst   = R + site;
+    spinor * const src_s = S + site;
+    spinor * const src_u = U + site;
+
+    LINCOMB3(s0.c0);
+    LINCOMB3(s0.c1);
+    LINCOMB3(s0.c2);
+
+    LINCOMB3(s1.c0);
+    LINCOMB3(s1.c1);
+    LINCOMB3(s1.c2);
+
+    LINCOMB3(s2.c0);
+    LINCOMB3(s2.c1);
+    LINCOMB3(s2.c2);
+
+    LINCOMB3(s3.c0);
+    LINCOMB3(s3.c1);
+    LINCOMB3(s3.c2);
+  }
+
+#undef LINCOMB3
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_add_mul_r.h
b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_add_mul_r.h new file mode 100644 index 0000000000000000000000000000000000000000..785997844a5399f57f4dc751a0db8c04a2ba8532 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_add_mul_r.h @@ -0,0 +1,31 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _ASSIGN_MUL_ADD_MUL_ADD_MUL_R_H +#define _ASSIGN_MUL_ADD_MUL_ADD_MUL_R_H + +#include "su3.h" + +/* Makes (*R) = c1*(*R) + c2*(*S) + c3*(*U) */ +void assign_mul_add_mul_add_mul_r(spinor * const R, spinor * const S, spinor * const U, + const double c1,const double c2,const double c3, + const int N); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_r.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_r.c new file mode 100644 index 0000000000000000000000000000000000000000..d72e38a6865b3ed1e5379c1d831b5bf95c92dc73 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_r.c @@ -0,0 +1,78 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/
+/*******************************************************************************
+ * Makes (*R)=c1*(*R)+c2*(*S) , c1 and c2 are real constants
+ *******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+#ifdef OMP
+# include
+#endif
+#include
+#include
+#include
+#include "su3.h"
+#include "assign_mul_add_mul_r.h"
+
+
+/*
+ * R[ix] = c1*R[ix] + c2*S[ix]   for ix = 0 .. N-1
+ *
+ * Two-term linear combination with real weights, written back into R.
+ * The same update is applied to each of the twelve spinor components
+ * (s0..s3 x c0..c2).
+ *
+ *   R       in/out  first operand, overwritten with the result
+ *   S       in      second operand; only read here
+ *   c1, c2  in      real weights of R and S respectively
+ *   N       in      number of spinors to process
+ *
+ * With OMP defined the site loop is shared among OpenMP threads.
+ */
+void assign_mul_add_mul_r(spinor * const R,spinor * const S,
+			  const double c1, const double c2,
+			  const int N) {
+  /* Update of a single component: scaled R first, then scaled S. */
+#define LINCOMB2(comp) dst->comp = c1 * dst->comp + c2 * src->comp
+
+#ifdef OMP
+#pragma omp parallel for
+#endif
+  for (int site = 0; site < N; ++site) {
+    spinor * const dst = R + site;
+    spinor * const src = S + site;
+
+    LINCOMB2(s0.c0);
+    LINCOMB2(s0.c1);
+    LINCOMB2(s0.c2);
+
+    LINCOMB2(s1.c0);
+    LINCOMB2(s1.c1);
+    LINCOMB2(s1.c2);
+
+    LINCOMB2(s2.c0);
+    LINCOMB2(s2.c1);
+    LINCOMB2(s2.c2);
+
+    LINCOMB2(s3.c0);
+    LINCOMB2(s3.c1);
+    LINCOMB2(s3.c2);
+  }
+
+#undef LINCOMB2
+}
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_r.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_r.h
new file mode 100644
index 0000000000000000000000000000000000000000..c23c9defb753c2e360ffda7e33e47a6b1cadb126
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_r.h
@@ -0,0 +1,30 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _ASSIGN_MUL_ADD_MUL_R_H +#define _ASSIGN_MUL_ADD_MUL_R_H + +#include "su3.h" + +/* Makes (*R)=c1*(*R)+c2*(*S) , c1 and c2 are real constants */ +void assign_mul_add_mul_r(spinor * const R,spinor * const S, + const double c1, const double c2, + const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_r_32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_r_32.c new file mode 100644 index 0000000000000000000000000000000000000000..0db5f160715d42c153cb68fffd8310f353e09a64 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_r_32.c @@ -0,0 +1,78 @@ +/*********************************************************************** + * Copyright (C) 2015 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see .
+ ***********************************************************************/
+/*******************************************************************************
+ * Makes (*R)=c1*(*R)+c2*(*S) , c1 and c2 are real constants
+ *******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+#ifdef OMP
+# include
+#endif
+#include
+#include
+#include
+#include "su3.h"
+#include "assign_mul_add_mul_r_32.h"
+
+
+/*
+ * R[ix] = c1*R[ix] + c2*S[ix]   for ix = 0 .. N-1   (spinor32 fields)
+ *
+ * Two-term linear combination with real float weights, written back into R.
+ * The same update is applied to each of the twelve spinor32 components
+ * (s0..s3 x c0..c2).
+ *
+ *   R       in/out  first operand, overwritten with the result
+ *   S       in      second operand; only read here
+ *   c1, c2  in      real weights of R and S respectively
+ *   N       in      number of spinors to process
+ *
+ * With OMP defined the site loop is shared among OpenMP threads.
+ */
+void assign_mul_add_mul_r_32(spinor32 * const R, spinor32 * const S,
+			     const float c1, const float c2,
+			     const int N) {
+  /* Update of a single component: scaled R first, then scaled S. */
+#define LINCOMB2_32(comp) dst->comp = c1 * dst->comp + c2 * src->comp
+
+#ifdef OMP
+#pragma omp parallel for
+#endif
+  for (int site = 0; site < N; ++site) {
+    spinor32 * const dst = R + site;
+    spinor32 * const src = S + site;
+
+    LINCOMB2_32(s0.c0);
+    LINCOMB2_32(s0.c1);
+    LINCOMB2_32(s0.c2);
+
+    LINCOMB2_32(s1.c0);
+    LINCOMB2_32(s1.c1);
+    LINCOMB2_32(s1.c2);
+
+    LINCOMB2_32(s2.c0);
+    LINCOMB2_32(s2.c1);
+    LINCOMB2_32(s2.c2);
+
+    LINCOMB2_32(s3.c0);
+    LINCOMB2_32(s3.c1);
+    LINCOMB2_32(s3.c2);
+  }
+
+#undef LINCOMB2_32
+}
+
+
+
+
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_r_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_r_32.h
new file mode 100644
index 0000000000000000000000000000000000000000..7c252793730932cdacaf40352d116de8f6636aac
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_mul_r_32.h
@@ -0,0 +1,30 @@
+/*********************************************************************** + * Copyright (C) 2015 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _ASSIGN_MUL_ADD_MUL_R_32_H +#define _ASSIGN_MUL_ADD_MUL_R_32_H + +#include "su3.h" + +/* Makes (*R)=c1*(*R)+c2*(*S) , c1 and c2 are real constants */ +void assign_mul_add_mul_r_32(spinor32 * const R,spinor32 * const S, + const float c1, const float c2, + const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r.c new file mode 100644 index 0000000000000000000000000000000000000000..dd1c528f5e1787880044a0fcad399607fa51cf3c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r.c @@ -0,0 +1,405 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see .
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+#include
+#include
+#ifdef OMP
+# include
+#endif
+#include "su3.h"
+#include "assign_mul_add_r.h"
+
+
+#if ( defined SSE2 || defined SSE3 )
+#include "sse.h"
+
+/* SSE2/SSE3 variant of (*R) = c*(*R) + (*S), c real; R is updated in place
+ * and S is only read.
+ * The loop visits 4*N su3_vector blocks starting at R[0].s0 and S[0].s0,
+ * i.e. it assumes each spinor is four contiguous su3_vector members.
+ * NOTE(review): the asm blocks presumably depend on the _sse_load,
+ * _sse_load_up and _sse_store macros (sse.h) using xmm0-2 and xmm3-5, and on
+ * xmm7 still holding c between statements -- confirm against sse.h. */
+void assign_mul_add_r(spinor * const R, const double c, const spinor * const S, const int N) {
+#ifdef OMP
+#pragma omp parallel
+  {
+#endif
+  int ix;
+  su3_vector *s,*r;
+  __asm__ __volatile__ ("movsd %0, %%xmm7 \n\t"
+			"unpcklpd %%xmm7, %%xmm7"
+			:
+			:
+			"m" (c)); /* xmm7 = (c, c) */
+#ifndef OMP
+  s=&S[0].s0;
+  r=&R[0].s0;
+#else
+#pragma omp for
+#endif
+  for (ix=0;ix<4*N;ix++) {
+#ifdef OMP
+    s=&S[0].s0+ix; /* with OMP every iteration derives its pointers from ix */
+    r=&R[0].s0+ix;
+#endif
+    _sse_load(*r);
+    __asm__ __volatile__ ("mulpd %%xmm7, %%xmm0 \n\t"
+			  "mulpd %%xmm7, %%xmm1 \n\t"
+			  "mulpd %%xmm7, %%xmm2"
+			  :
+			  :); /* scale xmm0..xmm2 by c */
+    _sse_load_up(*s);
+    __asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t"
+			  "addpd %%xmm4, %%xmm1 \n\t"
+			  "addpd %%xmm5, %%xmm2"
+			  :
+			  :); /* add xmm3..xmm5 into xmm0..xmm2 */
+    _sse_store(*r);
+#ifndef OMP
+    s++; r++; /* without OMP the pointers are advanced by hand */
+#endif
+  }
+
+#ifdef OMP
+  } /* OpenMP closing brace */
+#endif
+}
+
+#elif (defined BGQ && defined XLC)
+/* Blue Gene/Q (XLC) variant of (*R) = c*(*R) + (*S), c real; R is updated
+ * in place. Every spinor is addressed as 24 doubles and moved through six
+ * vector4double values (offsets 0, 4, ..., 20); k is built from c with
+ * vec_splats and combined via vec_madd(k, x, y), presumably k*x + y.
+ * NOTE(review): __alignx is applied to s and r before either is assigned --
+ * confirm this is intended. */
+void assign_mul_add_r(spinor * const R, const double c, const spinor * const S, const int N) {
+#ifdef OMP
+#pragma omp parallel
+  {
+#endif
+
+  vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5;
+  vector4double z0, z1, z2, z3, z4, z5, k;
+  double *s, *r;
+  double ALIGN _c;
+  _c = c;
+  __prefetch_by_load(S);
+  __prefetch_by_load(R);
+
+  k = vec_splats(_c);
+  __alignx(32, s);
+  __alignx(32, r);
+  __alignx(32, S);
+  __alignx(32, R);
+
+#ifdef OMP
+#pragma omp for
+#else
+#pragma unroll(4)
+#endif
+  for(int i = 0; i < N; i++) {
+    s=(double*)((spinor *) S + i);
+    r=(double*)((spinor *) R + i);
+    __prefetch_by_load(S + i + 1); /* touches the spinor after the current one */
+    __prefetch_by_stream(1, R + i + 1);
+    x0 = vec_ld(0, r);
+    x1 = vec_ld(0, r+4);
+    x2 = vec_ld(0, r+8);
+    x3 = vec_ld(0, r+12);
+    x4 = vec_ld(0, r+16);
+    x5 = vec_ld(0, r+20);
+    y0 = vec_ld(0, s);
+    y1 = vec_ld(0, s+4);
+    y2 = vec_ld(0, s+8);
+    y3 = vec_ld(0, s+12);
+    y4 = vec_ld(0, s+16);
+    y5 = vec_ld(0, s+20);
+    z0 = vec_madd(k, x0, y0);
+    z1 = vec_madd(k, x1, y1);
+    z2 = vec_madd(k, x2, y2);
+    z3 = vec_madd(k, x3, y3);
+    z4 = vec_madd(k, x4, y4);
+    z5 = vec_madd(k, x5, y5);
+    vec_st(z0, 0, r);
+    vec_st(z1, 0, r+4);
+    vec_st(z2, 0, r+8);
+    vec_st(z3, 0, r+12);
+    vec_st(z4, 0, r+16);
+    vec_st(z5, 0, r+20);
+  }
+#ifdef OMP
+  } /* OpenMP closing brace */
+#endif
+  return;
+}
+
+#elif ((defined BGL) && (defined XLC))
+
+# include"bgl.h"
+/* Blue Gene/L (XLC) variant of (*R) = c*(*R) + (*S), c real; R is updated
+ * in place. Spinors are addressed as 24 doubles through twelve
+ * __lfpd/__stfpd pairs, with a = __cmplx(c, c) as the multiplier.
+ * The first spinor is handled before the loop, the loop prefetches the next
+ * spinor while updating the current one, and the last spinor is handled
+ * after the loop. No OpenMP pragmas appear in this variant.
+ * NOTE(review): since one spinor is processed before the loop and one more
+ * after it, this path appears to require N >= 2 -- confirm callers never
+ * pass a smaller N. */
+void assign_mul_add_r(spinor * const R, const double c, const spinor * const S, const int N) {
+  int ix = 1;
+  const double *s ALIGN;
+  const double *sp ALIGN;
+  double *r ALIGN;
+  double *rp ALIGN;
+  double _Complex x00, x01, x02, x03, x04, x05, x06, x07,
+    x08, x09, x10, x11;
+  double _Complex y00, y01, y02, y03, y04, y05, y06, y07,
+    y08, y09, y10, y11;
+  double _Complex a;
+
+#pragma disjoint(*S, *R)
+  a = __cmplx(c, c);
+  __alignx(16, S);
+  __alignx(16, R);
+  s = (double*) S;
+  r = (double*) R;
+  rp = r + 24; /* rp/sp run one spinor (24 doubles) ahead of r/s */
+  sp = s + 24;
+  _prefetch_spinor(rp);
+  _prefetch_spinor(sp);
+  x00 = __lfpd(r);
+  x01 = __lfpd(r+2);
+  x02 = __lfpd(r+4);
+  x03 = __lfpd(r+6);
+  x04 = __lfpd(r+8);
+  x05 = __lfpd(r+10);
+  x06 = __lfpd(r+12);
+  x07 = __lfpd(r+14);
+  x08 = __lfpd(r+16);
+  x09 = __lfpd(r+18);
+  x10 = __lfpd(r+20);
+  x11 = __lfpd(r+22);
+  y00 = __lfpd(s);
+  y01 = __lfpd(s+2);
+  y02 = __lfpd(s+4);
+  y03 = __lfpd(s+6);
+  y04 = __lfpd(s+8);
+  y05 = __lfpd(s+10);
+  y06 = __lfpd(s+12);
+  y07 = __lfpd(s+14);
+  y08 = __lfpd(s+16);
+  y09 = __lfpd(s+18);
+  y10 = __lfpd(s+20);
+  y11 = __lfpd(s+22);
+
+  y00 = __fpmadd(y00, x00, a);
+  y01 = __fpmadd(y01, x01, a);
+  y02 = __fpmadd(y02, x02, a);
+  y03 = __fpmadd(y03, x03, a);
+  y04 = __fpmadd(y04, x04, a);
+  y05 = __fpmadd(y05, x05, a);
+  y06 = __fpmadd(y06, x06, a);
+  y07 = __fpmadd(y07, x07, a);
+  y08 = __fpmadd(y08, x08, a);
+  y09 = __fpmadd(y09, x09, a);
+  y10 = __fpmadd(y10, x10, a);
+  y11 = __fpmadd(y11, x11, a);
+  __stfpd(r, y00);
+  __stfpd(r+2, y01);
+  __stfpd(r+4, y02);
+  __stfpd(r+6, y03);
+  __stfpd(r+8, y04);
+  __stfpd(r+10, y05);
+  __stfpd(r+12, y06);
+  __stfpd(r+14, y07);
+  __stfpd(r+16, y08);
+  __stfpd(r+18, y09);
+  __stfpd(r+20, y10);
+  __stfpd(r+22, y11);
+  s = sp;
+  r = rp;
+
+#pragma unroll(12)
+  for(ix = 1; ix < N-1; ix++) {
+    rp += 24;
+    sp += 24;
+    _prefetch_spinor(rp);
+    _prefetch_spinor(sp);
+    x00 = __lfpd(r);
+    x01 = __lfpd(r+2);
+    x02 = __lfpd(r+4);
+    x03 = __lfpd(r+6);
+    x04 = __lfpd(r+8);
+    x05 = __lfpd(r+10);
+    x06 = __lfpd(r+12);
+    x07 = __lfpd(r+14);
+    x08 = __lfpd(r+16);
+    x09 = __lfpd(r+18);
+    x10 = __lfpd(r+20);
+    x11 = __lfpd(r+22);
+    y00 = __lfpd(s);
+    y01 = __lfpd(s+2);
+    y02 = __lfpd(s+4);
+    y03 = __lfpd(s+6);
+    y04 = __lfpd(s+8);
+    y05 = __lfpd(s+10);
+    y06 = __lfpd(s+12);
+    y07 = __lfpd(s+14);
+    y08 = __lfpd(s+16);
+    y09 = __lfpd(s+18);
+    y10 = __lfpd(s+20);
+    y11 = __lfpd(s+22);
+
+    y00 = __fpmadd(y00, x00, a);
+    y01 = __fpmadd(y01, x01, a);
+    y02 = __fpmadd(y02, x02, a);
+    y03 = __fpmadd(y03, x03, a);
+    y04 = __fpmadd(y04, x04, a);
+    y05 = __fpmadd(y05, x05, a);
+    y06 = __fpmadd(y06, x06, a);
+    y07 = __fpmadd(y07, x07, a);
+    y08 = __fpmadd(y08, x08, a);
+    y09 = __fpmadd(y09, x09, a);
+    y10 = __fpmadd(y10, x10, a);
+    y11 = __fpmadd(y11, x11, a);
+    __stfpd(r, y00);
+    __stfpd(r+2, y01);
+    __stfpd(r+4, y02);
+    __stfpd(r+6, y03);
+    __stfpd(r+8, y04);
+    __stfpd(r+10, y05);
+    __stfpd(r+12, y06);
+    __stfpd(r+14, y07);
+    __stfpd(r+16, y08);
+    __stfpd(r+18, y09);
+    __stfpd(r+20, y10);
+    __stfpd(r+22, y11);
+    s = sp;
+    r = rp;
+
+  }
+  x00 = __lfpd(r); /* final spinor: same update, no further prefetch */
+  x01 = __lfpd(r+2);
+  x02 = __lfpd(r+4);
+  x03 = __lfpd(r+6);
+  x04 = __lfpd(r+8);
+  x05 = __lfpd(r+10);
+  x06 = __lfpd(r+12);
+  x07 = __lfpd(r+14);
+  x08 = __lfpd(r+16);
+  x09 = __lfpd(r+18);
+  x10 = __lfpd(r+20);
+  x11 = __lfpd(r+22);
+  y00 = __lfpd(s);
+  y01 = __lfpd(s+2);
+  y02 = __lfpd(s+4);
+  y03 = __lfpd(s+6);
+  y04 = __lfpd(s+8);
+  y05 = __lfpd(s+10);
+  y06 = __lfpd(s+12);
+  y07 = __lfpd(s+14);
+  y08 = __lfpd(s+16);
+  y09 = __lfpd(s+18);
+  y10 = __lfpd(s+20);
+  y11 = __lfpd(s+22);
+
+  y00 = __fpmadd(y00, x00, a);
+  y01 = __fpmadd(y01, x01, a);
+  y02 = __fpmadd(y02, x02, a);
+  y03 = __fpmadd(y03, x03, a);
+  y04 = __fpmadd(y04, x04, a);
+  y05 = __fpmadd(y05, x05, a);
+  y06 = __fpmadd(y06, x06, a);
+  y07 = __fpmadd(y07, x07, a);
+  y08 = __fpmadd(y08, x08, a);
+  y09 = __fpmadd(y09, x09, a);
+  y10 = __fpmadd(y10, x10, a);
+  y11 = __fpmadd(y11, x11, a);
+  __stfpd(r, y00);
+  __stfpd(r+2, y01);
+  __stfpd(r+4, y02);
+  __stfpd(r+6, y03);
+  __stfpd(r+8, y04);
+  __stfpd(r+10, y05);
+  __stfpd(r+12, y06);
+  __stfpd(r+14, y07);
+  __stfpd(r+16, y08);
+  __stfpd(r+18, y09);
+  __stfpd(r+20, y10);
+  __stfpd(r+22, y11);
+
+  return;
+}
+
+#else
+
+/* Portable variant, used when none of the SSE/BGQ/BGL branches is selected.
+ * R is input and output; c and S are inputs. */
+/* (*R) = c*(*R) + (*S), c is a real constant; applied to the twelve
+ * components (s0..s3 x c0..c2) of each of the N spinors. */
+
+void assign_mul_add_r(spinor * const R, const double c, const spinor * const S, const int N)
+{
+#ifdef OMP
+#pragma omp parallel
+  {
+#endif
+  spinor *r;
+  const spinor *s;
+
+  /* Change due to even-odd preconditioning : VOLUME to VOLUME/2 */
+#ifdef OMP
+#pragma omp for
+#endif
+  for (int ix = 0; ix < N; ++ix)
+  {
+    r = R + ix;
+    s = S + ix;
+
+    r->s0.c0 = c * r->s0.c0 + s->s0.c0;
+    r->s0.c1 = c * r->s0.c1 + s->s0.c1;
+    r->s0.c2 = c * r->s0.c2 + s->s0.c2;
+
+    r->s1.c0 = c * r->s1.c0 + s->s1.c0;
+    r->s1.c1 = c * r->s1.c1 + s->s1.c1;
+    r->s1.c2 = c * r->s1.c2 + s->s1.c2;
+
+    r->s2.c0 = c * r->s2.c0 + s->s2.c0;
+    r->s2.c1 = c * r->s2.c1 + s->s2.c1;
+    r->s2.c2 = c * r->s2.c2 + s->s2.c2;
+
+    r->s3.c0 = c * r->s3.c0 + s->s3.c0;
+    r->s3.c1 = c * r->s3.c1 + s->s3.c1;
+    r->s3.c2 = c * r->s3.c2 + s->s3.c2;
+  }
+#ifdef OMP
+  } /* OpenMP closing brace */
+#endif
+}
+
+#endif
+
+#ifdef WITHLAPH
+void assign_mul_add_r_su3vect(su3_vector * const R, const double c, su3_vector * const S, const int N)
+{ /* R[ix] = c*R[ix] + S[ix] on the components c0..c2 of N su3_vectors; only compiled with WITHLAPH */
+#ifdef OMP
+#pragma omp parallel
+  {
+#endif
+  su3_vector *r,*s;
+
+#ifdef OMP
+#pragma omp for
+#endif
+  for (int ix = 0; ix < N; ++ix)
+  {
+    r = R + ix;
+    s = S + ix;
+    r->c0 = c * r->c0 + s->c0;
+    r->c1 = c * r->c1 + s->c1;
+    r->c2 = c * r->c2 + s->c2;
+  }
+#ifdef OMP
+  } /* OpenMP closing brace */
+#endif
+}
+#endif
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r.h
new file mode 100644
index 0000000000000000000000000000000000000000..ed4d576c50bcc7be98d1f5731dde41317b31cb6c
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r.h
@@ -0,0 +1,28 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see .
+ ***********************************************************************/
+
+#ifndef ASSIGN_MUL_ADD_R_H
+#define ASSIGN_MUL_ADD_R_H
+
+#include "su3.h"
+
+void assign_mul_add_r(spinor * const R, const double c, const spinor * const S, const int N); /* (*R) = c*(*R) + (*S), c real */
+void assign_mul_add_r_su3vect(su3_vector * const R, const double c, su3_vector * const S, const int N); /* same update on su3_vector fields */
+
+#endif
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r_32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r_32.c
new file mode 100644
index 0000000000000000000000000000000000000000..e48d08a9a7d22c2c7e3084ec7f1b6b42ea44d942
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r_32.c
@@ -0,0 +1,121 @@
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+#include
+#include
+#ifdef OMP
+# include
+#endif
+#include "su3.h"
+#include "assign_mul_add_r_32.h"
+
+
+/* R is input and output; c and S are inputs. */
+/* (*R) = c*(*R) + (*S), c is a real constant; spinor32 version of the
+ * update, provided below in a Blue Gene/Q (XLC) and a portable variant. */
+
+#if (defined BGQ && defined XLC)
+/* Blue Gene/Q (XLC) variant. Every spinor32 is addressed as 24 floats and
+ * moved through six vector4double values (offsets 0, 4, ..., 20); k is
+ * built from c (widened to double) with vec_splats and combined via
+ * vec_madd(k, x, y), presumably k*x + y.
+ * NOTE(review): __alignx is applied to s and r before either is assigned --
+ * confirm this is intended. */
+void assign_mul_add_r_32(spinor32 * const R, const float c, const spinor32 * const S, const int N) {
+#ifdef OMP
+#pragma omp parallel
+  {
+#endif
+
+  vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5;
+  vector4double z0, z1, z2, z3, z4, z5, k;
+  float *s, *r;
+  float ALIGN32 _c;
+  _c = c;
+  __prefetch_by_load(S);
+  __prefetch_by_load(R);
+
+  k = vec_splats((double)_c);
+  __alignx(16, s);
+  __alignx(16, r);
+  __alignx(16, S);
+  __alignx(16, R);
+
+#ifdef OMP
+#pragma omp for
+#else
+#pragma unroll(4)
+#endif
+  for(int i = 0; i < N; i++) {
+    s=(float*)((spinor32 *) S + i);
+    r=(float*)((spinor32 *) R + i);
+    __prefetch_by_load(S + i + 1); /* touches the spinor after the current one */
+    __prefetch_by_stream(1, R + i + 1);
+    x0 = vec_ld(0, r);
+    x1 = vec_ld(0, r+4);
+    x2 = vec_ld(0, r+8);
+    x3 = vec_ld(0, r+12);
+    x4 = vec_ld(0, r+16);
+    x5 = vec_ld(0, r+20);
+    y0 = vec_ld(0, s);
+    y1 = vec_ld(0, s+4);
+    y2 = vec_ld(0, s+8);
+    y3 = vec_ld(0, s+12);
+    y4 = vec_ld(0, s+16);
+    y5 = vec_ld(0, s+20);
+    z0 = vec_madd(k, x0, y0);
+    z1 = vec_madd(k, x1, y1);
+    z2 = vec_madd(k, x2, y2);
+    z3 = vec_madd(k, x3, y3);
+    z4 = vec_madd(k, x4, y4);
+    z5 = vec_madd(k, x5, y5);
+    vec_st(z0, 0, r);
+    vec_st(z1, 0, r+4);
+    vec_st(z2, 0, r+8);
+    vec_st(z3, 0, r+12);
+    vec_st(z4, 0, r+16);
+    vec_st(z5, 0, r+20);
+  }
+#ifdef OMP
+  } /* OpenMP closing brace */
+#endif
+  return;
+}
+
+#else
+/* Portable variant: R[ix] = c*R[ix] + S[ix] on the twelve components
+ * (s0..s3 x c0..c2) of each of the N spinor32 entries; R is updated in
+ * place, S is only read. With OMP defined the site loop is shared among
+ * the threads of a parallel region opened here. */
+void assign_mul_add_r_32(spinor32 * const R, const float c, const spinor32 * const S, const int N)
+{
+#ifdef OMP
+#pragma omp parallel
+  {
+#endif
+  spinor32 *r;
+  const spinor32 *s;
+
+  /* Change due to even-odd preconditioning : VOLUME to VOLUME/2 */
+#ifdef OMP
+#pragma omp for
+#endif
+  for (int ix = 0; ix < N; ++ix)
+  {
+    r = R + ix;
+    s = S + ix;
+
+    r->s0.c0 = c * r->s0.c0 + s->s0.c0;
+    r->s0.c1 = c * r->s0.c1 + s->s0.c1;
+    r->s0.c2 = c * r->s0.c2 + s->s0.c2;
+
+    r->s1.c0 = c * r->s1.c0 + s->s1.c0;
+    r->s1.c1 = c * r->s1.c1 + s->s1.c1;
+    r->s1.c2 = c * r->s1.c2 + s->s1.c2;
+
+    r->s2.c0 = c * r->s2.c0 + s->s2.c0;
+    r->s2.c1 = c * r->s2.c1 + s->s2.c1;
+    r->s2.c2 = c * r->s2.c2 + s->s2.c2;
+
+    r->s3.c0 = c * r->s3.c0 + s->s3.c0;
+    r->s3.c1 = c * r->s3.c1 + s->s3.c1;
+    r->s3.c2 = c * r->s3.c2 + s->s3.c2;
+  }
+#ifdef OMP
+  } /* OpenMP closing brace */
+#endif
+}
+
+
+#endif
\ No newline at end of file
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r_32.h
new file mode 100644
index 0000000000000000000000000000000000000000..6a7038cfbc72c131a8bdabdf449d46a0d54c5eff
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r_32.h
@@ -0,0 +1,8 @@
+#ifndef ASSIGN_MUL_ADD_R_32_H
+#define ASSIGN_MUL_ADD_R_32_H
+
+#include "su3.h"
+
+void assign_mul_add_r_32(spinor32 * const R, const float c, const spinor32 * const S, const int N); /* (*R) = c*(*R) + (*S), c real */
+
+#endif
\ No newline at end of file
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r_and_square.c
b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r_and_square.c new file mode 100644 index 0000000000000000000000000000000000000000..a8921af05b26408ba9546754a6abd83e4c76c02a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r_and_square.c @@ -0,0 +1,219 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef MPI +# include +#endif +#include +#include +#ifdef OMP +# include +# include +#endif +#include "su3.h" +#include "assign_mul_add_r_and_square.h" + + +#if (defined BGQ && defined XLC) +/* BGQ/XLC vector variant. Updates (*R) = c*(*R) + (*S) for N spinors and returns the sum of the squares of all updated components of R. S is only read and is declared const so that this definition matches the prototype in assign_mul_add_r_and_square.h. With OMP each thread stores its partial sum in g_omp_acc_re[thread_num]; in MPI builds a non-zero `parallel` sums the result over MPI_COMM_WORLD. */ +double assign_mul_add_r_and_square(spinor * const R, const double c, const spinor * const S, + const int N, const int parallel) { + double ALIGN res = 0.0; +#ifdef MPI + double ALIGN mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); +#endif + vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5; + vector4double z0, z1, z2, z3, z4, z5, k; + vector4double r0, r1, r2, r3, r4, r5; + double *s, *r; + double ALIGN _c = c; + double ALIGN ds = 0.0; +#ifndef OMP + __prefetch_by_load(S); + __prefetch_by_load(R); +#endif + + k = vec_splats(_c); + __alignx(32, s); + __alignx(32, r); + __alignx(32, S); + __alignx(32, R); + r0 = vec_splats(0.); + r1 = vec_splats(0.); + r2 = vec_splats(0.); + r3 = vec_splats(0.); + r4 = vec_splats(0.); + r5 = vec_splats(0.); + + +#ifdef OMP +#pragma omp for +#endif + for(int i = 0; i < N; i++) { + s=(double*)((spinor *) S + i); /* explicit cast drops const; s is only loaded from */ + r=(double*)((spinor *) R + i); + __prefetch_by_load(S + i + 1); + __prefetch_by_stream(1, R + i + 1); + x0 = vec_ld(0, r); + x1 = vec_ld(0, r+4); + x2 = vec_ld(0, r+8); + x3 = vec_ld(0, r+12); + x4 = vec_ld(0, r+16); + x5 = vec_ld(0, r+20); + y0 = vec_ld(0, s); + y1 = vec_ld(0, s+4); + y2 = vec_ld(0, s+8); + y3 = vec_ld(0, s+12); + y4 = vec_ld(0, s+16); + y5 = vec_ld(0, s+20); + z0 = vec_madd(k, x0, y0); + z1 = vec_madd(k, x1, y1); + z2 = vec_madd(k, x2, y2); + z3 = vec_madd(k, x3, y3); + z4 = vec_madd(k, x4, y4); + z5 = vec_madd(k, x5, y5); + vec_st(z0, 0, r); + vec_st(z1, 0, r+4); + vec_st(z2, 0, r+8); + vec_st(z3, 0, r+12); + vec_st(z4, 0, r+16); + vec_st(z5, 0, r+20); + r0 = vec_madd(z0, z0, r0); + r1 = vec_madd(z1, z1, r1); + r2 = vec_madd(z2, z2, r2); + r3 = 
vec_madd(z3, z3, r3); + r4 = vec_madd(z4, z4, r4); + r5 = vec_madd(z5, z5, r5); + } + x0 = vec_add(r0, r1); /* fold the six vector accumulators into one */ + x1 = vec_add(r2, r3); + x2 = vec_add(r4, r5); + y0 = vec_add(x0, x1); + y1 = vec_add(x2, y0); + ds = y1[0] + y1[1] + y1[2] + y1[3]; /* add the four vector elements: this thread's partial sum */ + +#ifdef OMP + g_omp_acc_re[thread_num] = ds; + } /* OpenMP closing brace */ + /* after the parallel region: add up the per-thread partial sums */ + for(int i = 0; i < omp_num_threads; ++i) { + res += g_omp_acc_re[i]; + } +#else + res = ds; +#endif + +# ifdef MPI + if(parallel) { /* global sum over all MPI ranks */ + MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + return(mres); + } +#endif + return(res); +} + +#else + +/* R input and output; c and S inputs; N is the number of spinors; in MPI builds a non-zero `parallel` requests an MPI_Allreduce of the result */ +/* (*R) = c*(*R) + (*S) with c a real constant; returns the sum of re^2 + im^2 over all updated components of R */ + +double assign_mul_add_r_and_square(spinor * const R, const double c, const spinor * const S, + const int N, const int parallel) { + double ALIGN res = 0.0; +#ifdef MPI + double ALIGN mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); +#endif + spinor *r; + const spinor *s; + double ALIGN ds = 0.0; /* partial sum (one per thread when OMP is defined) */ + + /* Change due to even-odd preconditioning : VOLUME to VOLUME/2 */ +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) { + r = R + ix; + s = S + ix; + /* each component is updated first, then its squared modulus is accumulated */ + r->s0.c0 = c * r->s0.c0 + s->s0.c0; + ds += creal(r->s0.c0)*creal(r->s0.c0) + cimag(r->s0.c0)*cimag(r->s0.c0); + r->s0.c1 = c * r->s0.c1 + s->s0.c1; + ds += creal(r->s0.c1)*creal(r->s0.c1) + cimag(r->s0.c1)*cimag(r->s0.c1); + r->s0.c2 = c * r->s0.c2 + s->s0.c2; + ds += creal(r->s0.c2)*creal(r->s0.c2) + cimag(r->s0.c2)*cimag(r->s0.c2); + + r->s1.c0 = c * r->s1.c0 + s->s1.c0; + ds += creal(r->s1.c0)*creal(r->s1.c0) + cimag(r->s1.c0)*cimag(r->s1.c0); + r->s1.c1 = c * r->s1.c1 + s->s1.c1; + ds += creal(r->s1.c1)*creal(r->s1.c1) + cimag(r->s1.c1)*cimag(r->s1.c1); + r->s1.c2 = c * r->s1.c2 + s->s1.c2; + ds += creal(r->s1.c2)*creal(r->s1.c2) + cimag(r->s1.c2)*cimag(r->s1.c2); + + r->s2.c0 = c * r->s2.c0 + s->s2.c0; + ds += creal(r->s2.c0)*creal(r->s2.c0) + cimag(r->s2.c0)*cimag(r->s2.c0); + r->s2.c1 = c * r->s2.c1 
+ s->s2.c1; + ds += creal(r->s2.c1)*creal(r->s2.c1) + cimag(r->s2.c1)*cimag(r->s2.c1); + r->s2.c2 = c * r->s2.c2 + s->s2.c2; + ds += creal(r->s2.c2)*creal(r->s2.c2) + cimag(r->s2.c2)*cimag(r->s2.c2); + + r->s3.c0 = c * r->s3.c0 + s->s3.c0; + ds += creal(r->s3.c0)*creal(r->s3.c0) + cimag(r->s3.c0)*cimag(r->s3.c0); + r->s3.c1 = c * r->s3.c1 + s->s3.c1; + ds += creal(r->s3.c1)*creal(r->s3.c1) + cimag(r->s3.c1)*cimag(r->s3.c1); + r->s3.c2 = c * r->s3.c2 + s->s3.c2; + ds += creal(r->s3.c2)*creal(r->s3.c2) + cimag(r->s3.c2)*cimag(r->s3.c2); + } + +#ifdef OMP + g_omp_acc_re[thread_num] = ds; + } /* OpenMP closing brace */ + /* after the parallel region: add up the per-thread partial sums */ + for(int i = 0; i < omp_num_threads; ++i) { + res += g_omp_acc_re[i]; + } +#else + res = ds; +#endif + +# ifdef MPI + if(parallel) { /* global sum over all MPI ranks */ + MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + return(mres); + } +#endif + return(res); +} + +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r_and_square.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r_and_square.h new file mode 100644 index 0000000000000000000000000000000000000000..962fd224186859f0f919a7ad660f2d4ce973c7fc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_add_r_and_square.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef ASSIGN_MUL_ADD_R_AND_SQUARE_H +#define ASSIGN_MUL_ADD_R_AND_SQUARE_H + +#include "su3.h" + +double assign_mul_add_r_and_square(spinor * const R, const double c, const spinor * const S, + const int N, const int parallel); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_ket_add.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_ket_add.c new file mode 100644 index 0000000000000000000000000000000000000000..ad1a2e6caf809a0a72d01efdf8af46608316b44d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_ket_add.c @@ -0,0 +1,82 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +/******************************************************************************* + * + * File assign_mul_bra_add_mul_ket_add.c + * + * void assign_mul_bra_add_mul_ket_add + * (spinor * const R, spinor * const S, spinor * const U, const _Complex double c1, const _Complex double c2, const int N) + * (*R) = c2*(*R + c1*(*S)) + (*U) with c1 and c2 complex variables, applied to N spinors + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "sse.h" +#include "assign_mul_bra_add_mul_ket_add.h" + +/* (*R) = c2*(*R + c1*(*S)) + (*U), evaluated for every ix in [0, N) and every one of the 12 components */ +/* R input and output, S input, U input, c1 input, c2 input, N number of spinors; with OMP defined the loop is shared among threads */ +void assign_mul_bra_add_mul_ket_add(spinor * const R, spinor * const S,spinor * const U, + const _Complex double c1, const _Complex double c2, const int N) { +#ifdef OMP +#pragma omp parallel + { +#endif + + spinor *r, *s, *u; + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) + { + r=(spinor *) R + ix; + s=(spinor *) S + ix; + u=(spinor *) U + ix; + + /* *R = *U + c2*(*R + c1*(*S)); the right-hand side reads the old *R before it is overwritten */ + r->s0.c0 = u->s0.c0 + c2 * (r->s0.c0 + c1 * s->s0.c0); + r->s0.c1 = u->s0.c1 + c2 * (r->s0.c1 + c1 * s->s0.c1); + r->s0.c2 = u->s0.c2 + c2 * (r->s0.c2 + c1 * s->s0.c2); + + r->s1.c0 = u->s1.c0 + c2 * (r->s1.c0 + c1 * s->s1.c0); + r->s1.c1 = u->s1.c1 + c2 * (r->s1.c1 + c1 * s->s1.c1); + r->s1.c2 = u->s1.c2 + c2 * (r->s1.c2 + c1 * s->s1.c2); + + r->s2.c0 = u->s2.c0 + c2 * (r->s2.c0 + c1 * s->s2.c0); + r->s2.c1 = u->s2.c1 + c2 * (r->s2.c1 + c1 * s->s2.c1); + r->s2.c2 = u->s2.c2 + c2 * (r->s2.c2 + c1 * s->s2.c2); + + r->s3.c0 = u->s3.c0 + c2 * (r->s3.c0 + c1 * s->s3.c0); + r->s3.c1 = u->s3.c1 + c2 * (r->s3.c1 + c1 * s->s3.c1); + r->s3.c2 = u->s3.c2 + c2 * (r->s3.c2 + c1 * s->s3.c2); + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} diff --git
a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_ket_add.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_ket_add.h new file mode 100644 index 0000000000000000000000000000000000000000..75a31ce4b7144ff11b29eb05c7e4d75880e66fe6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_ket_add.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/ + +#ifndef _ASSIGN_MUL_BRA_ADD_MUL_KET_ADD_H +#define _ASSIGN_MUL_BRA_ADD_MUL_KET_ADD_H + +#include "su3.h" + +/* (*R) = c2*(*R + c1*(*S)) + (*U) */ +void assign_mul_bra_add_mul_ket_add(spinor * const R, spinor * const S, spinor * const U, const _Complex double c1, const _Complex double c2, const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_ket_add_r.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_ket_add_r.c new file mode 100644 index 0000000000000000000000000000000000000000..0b5fe6712e06e2fc22cb0025a4d82e03e07ef366 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_ket_add_r.c @@ -0,0 +1,81 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +/******************************************************************************* + * + * File assign_mul_bra_add_mul_ket_add_r.c + * + * void assign_mul_bra_add_mul_ket_add_r + * (spinor * const R, spinor * const S, spinor * const U, const double c1, const double c2, const int N) + * (*R) = c2*(*R + c1*(*S)) + (*U) with c1 and c2 real variables, applied to N spinors + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "assign_mul_bra_add_mul_ket_add_r.h" +/* (*R) = c2*(*R + c1*(*S)) + (*U) for every ix in [0, N) and each of the 12 components. R is input and output; S, U, c1, c2 are inputs. With OMP defined the loop is shared among threads. */ +void assign_mul_bra_add_mul_ket_add_r(spinor * const R, spinor * const S, spinor * const U, + const double c1, const double c2, const int N) { +#ifdef OMP +#pragma omp parallel + { +#endif + + int ix; /* declared inside the parallel block, so each thread has its own copy */ + spinor *r,*s,*u; + +#ifdef OMP +#pragma omp for +#endif + for (ix = 0; ix < N; ix++) + { + r=(spinor *) R + ix; + s=(spinor *) S + ix; + u=(spinor *) U + ix; + /* the right-hand side reads the old *R before it is overwritten */ + r->s0.c0 = c2 * (r->s0.c0 + c1 * s->s0.c0) + u->s0.c0; + r->s0.c1 = c2 * (r->s0.c1 + c1 * s->s0.c1) + u->s0.c1; + r->s0.c2 = c2 * (r->s0.c2 + c1 * s->s0.c2) + u->s0.c2; + + r->s1.c0 = c2 * (r->s1.c0 + c1 * s->s1.c0) + u->s1.c0; + r->s1.c1 = c2 * (r->s1.c1 + c1 * s->s1.c1) + u->s1.c1; + r->s1.c2 = c2 * (r->s1.c2 + c1 * s->s1.c2) + u->s1.c2; + + r->s2.c0 = c2 * (r->s2.c0 + c1 * s->s2.c0) + u->s2.c0; + r->s2.c1 = c2 * (r->s2.c1 + c1 * s->s2.c1) + u->s2.c1; + r->s2.c2 = c2 * (r->s2.c2 + c1 * s->s2.c2) + u->s2.c2; + + r->s3.c0 = c2 * (r->s3.c0 + c1 * s->s3.c0) + u->s3.c0; + r->s3.c1 = c2 * (r->s3.c1 + c1 * s->s3.c1) + u->s3.c1; + r->s3.c2 = c2 * (r->s3.c2 + c1 * s->s3.c2) + u->s3.c2; + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_ket_add_r.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_ket_add_r.h new file mode 100644 index
0000000000000000000000000000000000000000..84a03d84c84e2f39add63a596c743ab13260a290 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_ket_add_r.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _ASSIGN_MUL_BRA_ADD_MUL_KET_ADD_R_H +#define _ASSIGN_MUL_BRA_ADD_MUL_KET_ADD_R_H + +#include "su3.h" + +/* (*R) = c2*(*R + c1*(*S)) + (*U) */ +void assign_mul_bra_add_mul_ket_add_r(spinor * const R,spinor * const S,spinor * const U,const double c1,const double c2, const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_r.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_r.c new file mode 100644 index 0000000000000000000000000000000000000000..bf37101ca76cbf33c4fdaa3b42e7b95a87cedb4b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_r.c @@ -0,0 +1,82 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/******************************************************************************* + * + * File assign_mul_bra_add_mul_r.c + * + * void assign_mul_bra_add_mul_r(spinor * const R, const double c0, const double c, spinor * const S, const int N) + * (*R) = c0*(*R + c*(*S)), applied to N spinors + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "assign_mul_bra_add_mul_r.h" + +/* R input and output (read, then overwritten), S input, c0 input, c input, N number of spinors */ +void assign_mul_bra_add_mul_r(spinor * const R,const double c0, const double c,spinor * const S, const int N){ +#ifdef OMP +#pragma omp parallel + { +#endif + + int ix; + double ALIGN fact0,fact; /* per-thread copies of c0 and c */ + spinor *r,*s; + + fact=c; + fact0=c0; + + +#ifdef OMP +#pragma omp for +#endif + for (ix = 0;ix < N; ++ix) + { + r=(spinor *) R + ix; + s=(spinor *) S + ix; + /* r = fact0 * (r + fact * s) for each of the 12 components */ + r->s0.c0 = fact0 * (r->s0.c0 + fact * s->s0.c0); + r->s0.c1 = fact0 * (r->s0.c1 + fact * s->s0.c1); + r->s0.c2 = fact0 * (r->s0.c2 + fact * s->s0.c2); + + r->s1.c0 = fact0 * (r->s1.c0 + fact * s->s1.c0); + r->s1.c1 = fact0 * (r->s1.c1 + fact * s->s1.c1); + r->s1.c2 = fact0 * (r->s1.c2 + fact * s->s1.c2); + + r->s2.c0 = fact0 * (r->s2.c0 + fact * s->s2.c0); + r->s2.c1 = fact0 * (r->s2.c1 + fact * 
s->s2.c1); + r->s2.c2 = fact0 * (r->s2.c2 + fact * s->s2.c2); + + r->s3.c0 = fact0 * (r->s3.c0 + fact * s->s3.c0); + r->s3.c1 = fact0 * (r->s3.c1 + fact * s->s3.c1); + r->s3.c2 = fact0 * (r->s3.c2 + fact * s->s3.c2); + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_r.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_r.h new file mode 100644 index 0000000000000000000000000000000000000000..f931057f0e87c0f2810822af4e910206b1eefb3a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_mul_bra_add_mul_r.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see .
+ ***********************************************************************/ + +#ifndef _ASSIGN_MUL_BRA_ADD_MUL_R_H +#define _ASSIGN_MUL_BRA_ADD_MUL_R_H + +#include "su3.h" + +/* (*R) = c0*(*R + c*(*S)) */ +void assign_mul_bra_add_mul_r(spinor * const R,const double c0, const double c,spinor * const S, const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_to_32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_to_32.c new file mode 100644 index 0000000000000000000000000000000000000000..23fae1d8c4058bb2d84309edb47b22e1bd5a3b7a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_to_32.c @@ -0,0 +1,132 @@ +/*********************************************************************** + * Copyright (C) 2014 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include +#include +#include "su3.h" +#include "assign_to_32.h" + + +/* S input, R output: for every ix in [0, N) each of the 12 components of S[ix] is assigned to the matching component of R[ix], converting from the spinor to the spinor32 component type (both declared in su3.h) */ +/* S and R must not overlap */ +void assign_to_32(spinor32 * const R, spinor * const S, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + spinor32 *r; + spinor *s; + +#ifdef OMP +#pragma omp for +#endif + for (int ix=0; ix<N; ix++) + { + r = R + ix; + s = S + ix; + + r->s0.c0 = s->s0.c0; + r->s0.c1 = s->s0.c1; + r->s0.c2 = s->s0.c2; + + r->s1.c0 = s->s1.c0; + r->s1.c1 = s->s1.c1; + r->s1.c2 = s->s1.c2; + + r->s2.c0 = s->s2.c0; + r->s2.c1 = s->s2.c1; + r->s2.c2 = s->s2.c2; + + r->s3.c0 = s->s3.c0; + r->s3.c1 = s->s3.c1; + r->s3.c2 = s->s3.c2; + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + + return; +} + + + + + +/* S input, R output: the reverse conversion, for every ix in [0, N) each of the 12 components of the spinor32 S[ix] is assigned to the matching component of the spinor R[ix] */ +/* S and R must not overlap */ +void assign_to_64(spinor * const R, spinor32 * const S, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + spinor *r; + spinor32 *s; + +#ifdef OMP +#pragma omp for +#endif + for (int ix=0; ix<N; ix++) + { + r = R + ix; + s = S + ix; + + r->s0.c0 = s->s0.c0; + r->s0.c1 = s->s0.c1; + r->s0.c2 = s->s0.c2; + + r->s1.c0 = s->s1.c0; + r->s1.c1 = s->s1.c1; + r->s1.c2 = s->s1.c2; + + r->s2.c0 = s->s2.c0; + r->s2.c1 = s->s2.c1; + r->s2.c2 = s->s2.c2; + + r->s3.c0 = s->s3.c0; + r->s3.c1 = s->s3.c1; + r->s3.c2 = s->s3.c2; + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + + return; +} + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_to_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_to_32.h new file mode 100644 index 0000000000000000000000000000000000000000..87bcdafb020399baf211679374a014de838812a4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/assign_to_32.h @@ -0,0 +1,27 @@ +/*********************************************************************** + * Copyright (C) 2015 Florian Burger + * + * This file is part of
tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _ASSIGN_TO_32_H +#define _ASSIGN_TO_32_H + +#include "su3.h" +void assign_to_32(spinor32 * const R, spinor * const S, const int N); +void assign_to_64(spinor * const R, spinor32 * const S, const int N); +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/blas.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/blas.h new file mode 100644 index 0000000000000000000000000000000000000000..7d108385451dc6292dcce977861bb31d262e1bc1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/blas.h @@ -0,0 +1,93 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _BLAS_H +#define _BLAS_H + +#include +#include "linalg/fortran.h" + +#if defined CRAY || defined HITACHI +/* On the CRAY is all different, of course... */ +#include"fortran.h" +#define zgemm ZGEMM +#define zgemv ZGEMV +#define ddot DDOT +#define zdotc ZDOTC +#define daxpy DAXPY +#define dnrm2 DNRM2 +#define znrm2 ZNRM2 +#define zaxpy ZAXPY +#define dcopy DCOPY +#define dscal DSCAL +#define dgemv DGEMV +#define dgemm DGEMM +extern double _FT(dasum); +extern double _FT(ddot)(); +extern void _FT(zdotc)(); +extern double _FT(dnrm2)(); +extern double _FT(znrm2)(); +extern int _FT(idamax)(); +extern void _FT(daxpy)(); +extern void _FT(zaxpy)(); +extern void _FT(dcopy)(); +extern void _FT(dscal)(); +extern void _FT(dgemv)(); +extern void _FT(zgemv)(); +extern void _FT(dgemm)(); +extern void _FT(zgemm)(); +#else + +/* BLAS-1 functions */ +extern double _FT(dasum)(int* n, double x[], int* incx); +extern double _FT(ddot)(int* n, double x[], int* incx, double y[], + int* incy); +extern void _FT(zdotc)(int* n, _Complex double x[], int* incx, _Complex double y[], + int* incy); +extern double _FT(dnrm2)(int* n, double x[], int* incx); +extern double _FT(znrm2)(int* n, _Complex double x[], int* incx); +extern int _FT(idamax)(int* n, double x[], int* incx); + +/* BLAS-1 subroutines */ +extern void _FT(daxpy)(int* n, double* a, double x[], int* incx, + double y[], int* incy); +extern void _FT(zaxpy)(int* n, _Complex double* a, _Complex double x[], int* incx, + _Complex double y[], int* incy); +extern void _FT(dcopy)(int* n, double x[], int* incx, double y[], + int* incy); +extern void _FT(dscal)(int* n, double* a, double x[], int* incx); + +/* BLAS-2 subroutines */ +extern void _FT(dgemv)(char* trans, int* m, int* n, double* alpha, + double a[], int* lda, double x[], int* incx, double* 
beta, + double y[], int* incy, int len_trans); +extern void _FT(zgemv)(char* trans, int* m, int* n, _Complex double* alpha, + _Complex double a[], int* lda, _Complex double x[], int* incx, _Complex double* beta, + _Complex double y[], int* incy, int len_trans); + +/* BLAS-3 subroutines */ +extern void _FT(dgemm)(char* transa, char* transb, int* m, int* n, int* k, + double* alpha, double a[], int* lda, double b[], int* ldb, + double* beta, double c[], int* ldc, int len_transa, int len_transb); +extern void _FT(zgemm)(char* transa, char* transb, int* m, int* n, int* k, + _Complex double* alpha, _Complex double a[], int* lda, _Complex double b[], int* ldb, + _Complex double* beta, _Complex double c[], int* ldc, int len_transa, int len_transb); +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/comp_decomp.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/comp_decomp.c new file mode 100644 index 0000000000000000000000000000000000000000..89fd21f0c8347c7a4f7b8cec79e0af31d83df80d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/comp_decomp.c @@ -0,0 +1,203 @@ +/*********************************************************************** + * + * Copyright (C) 2006 Thomas Chiarappa + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + * File comp_decomp.c + * + * void compact(spinor * const R, spinor * const S, spinor * const T); + * + * Builds the Bi-spinor R out of the spinors S and T + * S in first half (top) T in second half (bottom) + * + * + * void decompact(spinor * const S, spinor * const T, spinor * const R); + * + * Splits the Bi-spinor R in the spinors S and T + * S in first half (top) T in second half (bottom) + * + *****************************************************************************/ + + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "comp_decomp.h" + + +/* S and P inputs, R output: for every ix in [0, VOLUME/2), R[ix].sp_up receives S[ix] and R[ix].sp_dn receives P[ix]. The length is the global VOLUME/2, not a parameter. */ +void compact(bispinor * const R, spinor * const S, spinor * const P) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + spinor *r,*s; + spinor *u,*t; + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < VOLUME/2; ix++){ + r=(spinor *) &R[ix].sp_up; + s=(spinor *) S + ix; + + /* (*r) = (*s), i.e. upper half of the bispinor, written out per component */ + + r->s0.c0 = s->s0.c0; + r->s0.c1 = s->s0.c1; + r->s0.c2 = s->s0.c2; + + r->s1.c0 = s->s1.c0; + r->s1.c1 = s->s1.c1; + r->s1.c2 = s->s1.c2; + + r->s2.c0 = s->s2.c0; + r->s2.c1 = s->s2.c1; + r->s2.c2 = s->s2.c2; + + r->s3.c0 = s->s3.c0; + r->s3.c1 = s->s3.c1; + r->s3.c2 = s->s3.c2; + + u=(spinor *) &R[ix].sp_dn; + t=(spinor *) P + ix; + /* (*u) = (*t), i.e. lower half of the bispinor */ + u->s0.c0 = t->s0.c0; + u->s0.c1 = t->s0.c1; + u->s0.c2 = t->s0.c2; + + u->s1.c0 = t->s1.c0; + u->s1.c1 = t->s1.c1; + u->s1.c2 = t->s1.c2; + + u->s2.c0 = t->s2.c0; + u->s2.c1 = t->s2.c1; + u->s2.c2 = t->s2.c2; + + u->s3.c0 = t->s3.c0; + u->s3.c1 = t->s3.c1; + u->s3.c2 = t->s3.c2; + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + + /* + The following IS NOT enough, since it copies the values + starting from the address given by the pointer &R->sp_up, but + following the colour - spin - FLAVOUR - volume structure.
+ + In other words: starting with the FIRST site on the lattice ([0]) + the routine copies the first 3 (colour) * 4 (spin) components of + the spinor S onto the corresponding addresses of the spinor + R->sp_up . Then it continues by copying the component + S[1].s0.c0 onto the address R[0].sp_dn (.s0.c0), + !!! AND NOT JUMPING TO R[1].sp_up (.s0.c0) !!! + because of the structure and mem. allocation of the bispinor + */ + + /* + assign(&R->sp_up, &S[0], VOLUME/2); + */ + /* + assign(&R->sp_dn, &P[0], VOLUME/2); + */ + +} + + +/* R input , S and P outputs: for every ix in [0, VOLUME/2), S[ix] receives R[ix].sp_up and P[ix] receives R[ix].sp_dn. The length is the global VOLUME/2, not a parameter. */ +void decompact(spinor * const S, spinor * const P, bispinor * const R){ +#ifdef OMP +#pragma omp parallel + { +#endif + + spinor *r,*s; + spinor *u,*t; + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < VOLUME/2; ix++) + { + s=(spinor *) &R[ix].sp_up; + r=(spinor *) S + ix; + /* here r is the destination (S[ix]) and s the source (upper half of R[ix]) */ + r->s0.c0 = s->s0.c0; + r->s0.c1 = s->s0.c1; + r->s0.c2 = s->s0.c2; + + r->s1.c0 = s->s1.c0; + r->s1.c1 = s->s1.c1; + r->s1.c2 = s->s1.c2; + + r->s2.c0 = s->s2.c0; + r->s2.c1 = s->s2.c1; + r->s2.c2 = s->s2.c2; + + r->s3.c0 = s->s3.c0; + r->s3.c1 = s->s3.c1; + r->s3.c2 = s->s3.c2; + + + t=(spinor *) &R[ix].sp_dn; + u=(spinor *) P + ix; + /* u is the destination (P[ix]) and t the source (lower half of R[ix]) */ + u->s0.c0 = t->s0.c0; + u->s0.c1 = t->s0.c1; + u->s0.c2 = t->s0.c2; + + u->s1.c0 = t->s1.c0; + u->s1.c1 = t->s1.c1; + u->s1.c2 = t->s1.c2; + + u->s2.c0 = t->s2.c0; + u->s2.c1 = t->s2.c1; + u->s2.c2 = t->s2.c2; + + u->s3.c0 = t->s3.c0; + u->s3.c1 = t->s3.c1; + u->s3.c2 = t->s3.c2; + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + + /* NOTE(review): an older remark here said the assign() calls below "should be enough"; the next comment states that they are not.
*/ + + /* + The following IS NOT enough, see the explanation in compact() above + */ + /* + assign(&S[0], &R->sp_up, VOLUME/2); + + assign(&P[0], &R->sp_dn, VOLUME/2); + */ + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/comp_decomp.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/comp_decomp.h new file mode 100644 index 0000000000000000000000000000000000000000..2fd6e689cca08ffe87249978291e28f71ad07b80 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/comp_decomp.h @@ -0,0 +1,32 @@ +/*********************************************************************** + * + * Copyright (C) 2006 Thomas Chiarappa + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see .
+ ***********************************************************************/ + +#ifndef _COMP_DECOMP_H +#define _COMP_DECOMP_H + +#include "su3.h" + +/* Build bispinor out of spinors : (*R) = ((*S), (*T)) */ +void compact(bispinor * const R, spinor * const S, spinor * const P); + +/* Splits bispinor into spinors : (*S) = top (*R) ; (*T) = bottom (*R) */ +void decompact(spinor * const S, spinor * const P, bispinor * const R); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/convert_eo_to_lexic.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/convert_eo_to_lexic.c new file mode 100644 index 0000000000000000000000000000000000000000..47b53f03d422667d7413896ecfcef30bf110c0a6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/convert_eo_to_lexic.c @@ -0,0 +1,115 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "su3.h" +#include "convert_eo_to_lexic.h" + +void convert_eo_to_lexic(spinor * const P, spinor * const s, spinor * const r) { /* copies every site of the half-fields s (parity sum even) and r (parity sum odd) into the full-volume field P */ +#ifdef OMP +#pragma omp parallel + { +#endif + + int x, y, z, t, i, ix; /* declared inside the parallel block, so each thread has its own copies when OMP is defined */ + spinor * p = NULL; + +#ifdef OMP +#pragma omp for +#endif + for(x = 0; x < LX; x++) { + for(y = 0; y < LY; y++) { + for(z = 0; z < LZ; z++) { + for(t = 0; t < T; t++) { + ix = g_ipt[t][x][y][z]; /* index of this site in the full field P */ + i = g_lexic2eosub[ ix ]; /* index of the same site inside the selected half-field */ + if((t+x+y+z+g_proc_coords[3]*LZ+g_proc_coords[2]*LY + + g_proc_coords[0]*T+g_proc_coords[1]*LX)%2 == 0) { /* presumably the parity of the global coordinate sum - TODO confirm against g_proc_coords */ + p = s; + } + else { + p = r; + } + memcpy((P+ix), (p+i), sizeof(spinor)); /* half-field -> full field */ + } + } + } + } + +#ifdef OMP + } /*OpenMP closing brace */ +#endif + + return; +} + +/* + * P: spinor with full volume (input) + * s: new spinor even (output) + * r: new spinor odd (output) + */ +void convert_lexic_to_eo(spinor * const s, spinor * const r, spinor * const P) { +#ifdef OMP +#pragma omp parallel + { +#endif + + int x, y, z, t, i, ix; /* declared inside the parallel block, so each thread has its own copies when OMP is defined */ + spinor * p = NULL; + +#ifdef OMP +#pragma omp for +#endif + for(x = 0; x < LX; x++) { + for(y = 0; y < LY; y++) { + for(z = 0; z < LZ; z++) { + for(t = 0; t < T; t++) { + ix = g_ipt[t][x][y][z]; /* index of this site in the full field P */ + i = g_lexic2eosub[ ix ]; /* index of the same site inside the selected half-field */ + if((t+x+y+z+g_proc_coords[3]*LZ+g_proc_coords[2]*LY + + g_proc_coords[0]*T+g_proc_coords[1]*LX)%2 == 0) { /* same parity test as in convert_eo_to_lexic: even sum selects s, odd sum selects r */ + p = s; + } + else { + p = r; + } + memcpy((p+i), (P+ix), sizeof(spinor)); /* full field -> half-field */ + } + } + } + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/convert_eo_to_lexic.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/convert_eo_to_lexic.h new file mode 100644 index 0000000000000000000000000000000000000000..2944bb1df84c44ae1dbb1fb4d693c10cff7a66c8 --- /dev/null +++
b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/convert_eo_to_lexic.h @@ -0,0 +1,26 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _CONVERT_EO_TO_LEXIC_H +#define _CONVERT_EO_TO_LEXIC_H + +void convert_eo_to_lexic(spinor * const P, spinor * const s, spinor * const r); +void convert_lexic_to_eo(spinor * const s, spinor * const r, spinor * const P); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff.c new file mode 100644 index 0000000000000000000000000000000000000000..f133c9a8e3ae59e5f2ee2db2271c5366b78f3f41 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff.c @@ -0,0 +1,340 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/******************************************************************************* + * + * void diff(spinor * const Q,spinor * const R,spinor * const S) + * Makes the difference (*Q) = (*R) - (*S) + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include +#include "su3.h" +#include "diff.h" + +#if ((defined BGL) && (defined XLC)) + +/*************************************** + * + * diff with intrinsics + * + * Carsten.Urbach@liverpool.ac.uk + * + ***************************************/ + +# include"bgl.h" + +void diff(spinor * const Q,spinor * const R,spinor * const S, const int N) +{ + int ix = 1; + double *s ALIGN; + double *sp ALIGN; + double *r ALIGN; + double *rp ALIGN; + double *q ALIGN; + double _Complex x00, x01, x02, x03, x04, x05, x06, x07, + x08, x09, x10, x11; + double _Complex y00, y01, y02, y03, y04, y05, y06, y07, + y08, y09, y10, y11; +#pragma disjoint(*R, *S) + + __alignx(16, Q); + __alignx(16, R); + __alignx(16, S); + r = (double*) R; + s = (double*) S; + q = (double*) Q; + rp = r + 24; + sp = s + 24; + _prefetch_spinor(rp); + _prefetch_spinor(sp); + x00 = __lfpd(r); + x01 = __lfpd(r+2); + x02 = __lfpd(r+4); + x03 = __lfpd(r+6); + x04 = __lfpd(r+8); + x05 = __lfpd(r+10); + x06 = __lfpd(r+12); + x07 = __lfpd(r+14); + x08 = __lfpd(r+16); + x09 = __lfpd(r+18); + x10 = __lfpd(r+20); + x11 = __lfpd(r+22); + y00 = __lfpd(s); + y01 = __lfpd(s+2); + y02 = __lfpd(s+4); + y03 = __lfpd(s+6); + y04 
= __lfpd(s+8); + y05 = __lfpd(s+10); + y06 = __lfpd(s+12); + y07 = __lfpd(s+14); + y08 = __lfpd(s+16); + y09 = __lfpd(s+18); + y10 = __lfpd(s+20); + y11 = __lfpd(s+22); + + __stfpd(q, __fpsub(x00, y00)); + __stfpd(q+2, __fpsub(x01, y01)); + __stfpd(q+4, __fpsub(x02, y02)); + __stfpd(q+6, __fpsub(x03, y03)); + __stfpd(q+8, __fpsub(x04, y04)); + __stfpd(q+10, __fpsub(x05, y05)); + __stfpd(q+12, __fpsub(x06, y06)); + __stfpd(q+14, __fpsub(x07, y07)); + __stfpd(q+16, __fpsub(x08, y08)); + __stfpd(q+18, __fpsub(x09, y09)); + __stfpd(q+20, __fpsub(x10, y10)); + __stfpd(q+22, __fpsub(x11, y11)); + s = sp; + r = rp; + q+=24; +#pragma unroll(12) + for(ix = 1; ix < N-1; ix++) { + rp+=24; + sp+=24; + _prefetch_spinor(rp); + _prefetch_spinor(sp); + x00 = __lfpd(r); + x01 = __lfpd(r+2); + x02 = __lfpd(r+4); + x03 = __lfpd(r+6); + x04 = __lfpd(r+8); + x05 = __lfpd(r+10); + x06 = __lfpd(r+12); + x07 = __lfpd(r+14); + x08 = __lfpd(r+16); + x09 = __lfpd(r+18); + x10 = __lfpd(r+20); + x11 = __lfpd(r+22); + y00 = __lfpd(s); + y01 = __lfpd(s+2); + y02 = __lfpd(s+4); + y03 = __lfpd(s+6); + y04 = __lfpd(s+8); + y05 = __lfpd(s+10); + y06 = __lfpd(s+12); + y07 = __lfpd(s+14); + y08 = __lfpd(s+16); + y09 = __lfpd(s+18); + y10 = __lfpd(s+20); + y11 = __lfpd(s+22); + + __stfpd(q, __fpsub(x00, y00)); + __stfpd(q+2, __fpsub(x01, y01)); + __stfpd(q+4, __fpsub(x02, y02)); + __stfpd(q+6, __fpsub(x03, y03)); + __stfpd(q+8, __fpsub(x04, y04)); + __stfpd(q+10, __fpsub(x05, y05)); + __stfpd(q+12, __fpsub(x06, y06)); + __stfpd(q+14, __fpsub(x07, y07)); + __stfpd(q+16, __fpsub(x08, y08)); + __stfpd(q+18, __fpsub(x09, y09)); + __stfpd(q+20, __fpsub(x10, y10)); + __stfpd(q+22, __fpsub(x11, y11)); + s = sp; + r = rp; + q+=24; + } + x00 = __lfpd(r); + x01 = __lfpd(r+2); + x02 = __lfpd(r+4); + x03 = __lfpd(r+6); + x04 = __lfpd(r+8); + x05 = __lfpd(r+10); + x06 = __lfpd(r+12); + x07 = __lfpd(r+14); + x08 = __lfpd(r+16); + x09 = __lfpd(r+18); + x10 = __lfpd(r+20); + x11 = __lfpd(r+22); + y00 = __lfpd(s); + 
y01 = __lfpd(s+2); + y02 = __lfpd(s+4); + y03 = __lfpd(s+6); + y04 = __lfpd(s+8); + y05 = __lfpd(s+10); + y06 = __lfpd(s+12); + y07 = __lfpd(s+14); + y08 = __lfpd(s+16); + y09 = __lfpd(s+18); + y10 = __lfpd(s+20); + y11 = __lfpd(s+22); + + __stfpd(q, __fpsub(x00, y00)); + __stfpd(q+2, __fpsub(x01, y01)); + __stfpd(q+4, __fpsub(x02, y02)); + __stfpd(q+6, __fpsub(x03, y03)); + __stfpd(q+8, __fpsub(x04, y04)); + __stfpd(q+10, __fpsub(x05, y05)); + __stfpd(q+12, __fpsub(x06, y06)); + __stfpd(q+14, __fpsub(x07, y07)); + __stfpd(q+16, __fpsub(x08, y08)); + __stfpd(q+18, __fpsub(x09, y09)); + __stfpd(q+20, __fpsub(x10, y10)); + __stfpd(q+22, __fpsub(x11, y11)); + + return; +} + +#elif (defined BGQ && defined XLC) + +void diff(spinor * const Q,const spinor * const R,const spinor * const S, const int N) { +#ifdef OMP +#pragma omp parallel + { +#endif + vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5; + vector4double z0, z1, z2, z3, z4, z5; + double *s, *r, *q; + + __alignx(32, s); + __alignx(32, r); + __alignx(32, q); + __alignx(32, S); + __alignx(32, R); + + __prefetch_by_load(S); + __prefetch_by_load(R); + __prefetch_by_load(Q); + +#ifndef OMP +#pragma unroll(2) +#else +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) { + s=(double*)((spinor *) S + ix); + r=(double*)((spinor *) R + ix); + q=(double*)((spinor *) Q + ix); + __prefetch_by_load(S + ix + 1); + __prefetch_by_load(R + ix + 1); + __prefetch_by_stream(1, Q + ix + 1); + x0 = vec_ld(0, r); + x1 = vec_ld(0, r+4); + x2 = vec_ld(0, r+8); + x3 = vec_ld(0, r+12); + x4 = vec_ld(0, r+16); + x5 = vec_ld(0, r+20); + y0 = vec_ld(0, s); + y1 = vec_ld(0, s+4); + y2 = vec_ld(0, s+8); + y3 = vec_ld(0, s+12); + y4 = vec_ld(0, s+16); + y5 = vec_ld(0, s+20); + z0 = vec_sub(x0, y0); + z1 = vec_sub(x1, y1); + z2 = vec_sub(x2, y2); + z3 = vec_sub(x3, y3); + z4 = vec_sub(x4, y4); + z5 = vec_sub(x5, y5); + vec_st(z0, 0, q); + vec_st(z1, 0, q+4); + vec_st(z2, 0, q+8); + vec_st(z3, 0, q+12); + vec_st(z4, 0, q+16); + 
vec_st(z5, 0, q+20); + } + +#ifdef OMP + } /* OpenMP parallel closing brace */ +#endif + + return; +} + +#else + +void diff(spinor * const Q, const spinor * const R, const spinor * const S, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + + spinor *q; + const spinor *r,*s; + +/* Change due to even-odd preconditioning : VOLUME to VOLUME/2 */ +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ix++) + { + q=(spinor *) Q + ix; + r=(spinor *) R + ix; + s=(spinor *) S + ix; + + q->s0.c0 = r->s0.c0 - s->s0.c0; + q->s0.c1 = r->s0.c1 - s->s0.c1; + q->s0.c2 = r->s0.c2 - s->s0.c2; + + q->s1.c0 = r->s1.c0 - s->s1.c0; + q->s1.c1 = r->s1.c1 - s->s1.c1; + q->s1.c2 = r->s1.c2 - s->s1.c2; + + q->s2.c0 = r->s2.c0 - s->s2.c0; + q->s2.c1 = r->s2.c1 - s->s2.c1; + q->s2.c2 = r->s2.c2 - s->s2.c2; + + q->s3.c0 = r->s3.c0 - s->s3.c0; + q->s3.c1 = r->s3.c1 - s->s3.c1; + q->s3.c2 = r->s3.c2 - s->s3.c2; + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} + +#endif + +#ifdef WITHLAPH +void diff_su3vect(su3_vector * const Q,su3_vector * const R,su3_vector * const S, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + + su3_vector *q,*r,*s; + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) + { + q=(su3_vector *) Q + ix; + r=(su3_vector *) R + ix; + s=(su3_vector *) S + ix; + + q->c0 = r->c0 - s->c0; + q->c1 = r->c1 - s->c1; + q->c2 = r->c2 - s->c2; + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff.h new file mode 100644 index 0000000000000000000000000000000000000000..c9f604c83b6ba79b48bb3694a8b605a02bdfe23d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff.h @@ -0,0 +1,30 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _DIFF_H +#define _DIFF_H + +#include "su3.h" + +/* Makes the difference (*Q) = (*R) - (*S) */ +void diff(spinor * const Q, const spinor * const R, const spinor * const S, const int N); +void diff_su3vect(su3_vector * const Q, su3_vector * const R, su3_vector * const S, const int N); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff_32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff_32.c new file mode 100644 index 0000000000000000000000000000000000000000..cbc4ffba15a656cc0d874cfc2dd55204a689cb68 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff_32.c @@ -0,0 +1,78 @@ +/*********************************************************************** + * Copyright (C) 2015 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/******************************************************************************* + * + * void diff_32(spinor32 * const Q, const spinor32 * const R, const spinor32 * const S, const int N) + * Makes the difference (*Q) = (*R) - (*S) for the first N spinors of each field + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include +#include "su3.h" +#include "diff_32.h" + + +void diff_32(spinor32 * const Q, const spinor32 * const R, const spinor32 * const S, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + + spinor32 *q; + const spinor32 *r,*s; + +/* The loop bound is the caller-supplied N (historical note: VOLUME became VOLUME/2 with even-odd preconditioning) */ +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ix++) + { + q=(spinor32 *) Q + ix; + r=(spinor32 *) R + ix; + s=(spinor32 *) S + ix; + + q->s0.c0 = r->s0.c0 - s->s0.c0; + q->s0.c1 = r->s0.c1 - s->s0.c1; + q->s0.c2 = r->s0.c2 - s->s0.c2; + + q->s1.c0 = r->s1.c0 - s->s1.c0; + q->s1.c1 = r->s1.c1 - s->s1.c1; + q->s1.c2 = r->s1.c2 - s->s1.c2; + + q->s2.c0 = r->s2.c0 - s->s2.c0; + q->s2.c1 = r->s2.c1 - s->s2.c1; + q->s2.c2 = r->s2.c2 - s->s2.c2; + + q->s3.c0 = r->s3.c0 - s->s3.c0; + q->s3.c1 = r->s3.c1 - s->s3.c1; + q->s3.c2 = r->s3.c2 - s->s3.c2; + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff_32.h new file mode 100644 index 0000000000000000000000000000000000000000..c5ab50daac85d2b5b9483793ef7a1e2911d57907 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff_32.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * Copyright (C) 2015 Florian Burger + * + * This file is part of
tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _DIFF_32_H +#define _DIFF_32_H + +#include "su3.h" + +/* Makes the difference (*Q) = (*R) - (*S) */ +void diff_32(spinor32 * const Q, const spinor32 * const R, const spinor32 * const S, const int N); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff_and_square_norm.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff_and_square_norm.c new file mode 100644 index 0000000000000000000000000000000000000000..9b8ef181875b5a6420f8fc560d43d0008c9a0e1c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff_and_square_norm.c @@ -0,0 +1,82 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#ifdef MPI +# include +#endif +#include "su3.h" +#include "diff_and_square_norm.h" + +double diff_and_square_norm(spinor * const Q, spinor * const R, const int N) { /* in place Q <- R - Q over the first N spinors; returns the squared norm of the new Q, summed over all ranks when MPI is defined */ + int ix; + double ks,kc,ds,tr,ts,tt; /* automatic storage: the former 'static' made the routine non-reentrant, and every value is assigned before it is read */ + spinor *q,*r; + + ks=0.0; + kc=0.0; + + /* N is supplied by the caller; the sum is accumulated with compensated (Kahan-style) summation: ks is the running sum, kc the correction term */ + for (ix = 0; ix < N; ix++) + { + q=Q+ix; + r=R+ix; + + q->s0.c0 = r->s0.c0 - q->s0.c0; + q->s0.c1 = r->s0.c1 - q->s0.c1; + q->s0.c2 = r->s0.c2 - q->s0.c2; + + ds = q->s0.c0 * conj(q->s0.c0) + q->s0.c1 * conj(q->s0.c1) + q->s0.c2 * conj(q->s0.c2); + + q->s1.c0 = r->s1.c0 - q->s1.c0; + q->s1.c1 = r->s1.c1 - q->s1.c1; + q->s1.c2 = r->s1.c2 - q->s1.c2; + + ds += q->s1.c0 * conj(q->s1.c0) + q->s1.c1 * conj(q->s1.c1) + q->s1.c2 * conj(q->s1.c2); + + q->s2.c0 = r->s2.c0 - q->s2.c0; + q->s2.c1 = r->s2.c1 - q->s2.c1; + q->s2.c2 = r->s2.c2 - q->s2.c2; + + ds += q->s2.c0 * conj(q->s2.c0) + q->s2.c1 * conj(q->s2.c1) + q->s2.c2 * conj(q->s2.c2); + + q->s3.c0 = r->s3.c0 - q->s3.c0; + q->s3.c1 = r->s3.c1 - q->s3.c1; + q->s3.c2 = r->s3.c2 - q->s3.c2; + + ds += q->s3.c0 * conj(q->s3.c0) + q->s3.c1 * conj(q->s3.c1) + q->s3.c2 * conj(q->s3.c2); + + tr = ds+kc; + ts = tr+ks; + tt = ts-ks; + ks = ts; + kc = tr-tt; + } + kc = ks+kc; /* fold the remaining correction into the local result */ +#ifdef MPI + MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + return ks; +#else + return kc; +#endif + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff_and_square_norm.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff_and_square_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..1dbfe99adf5c25a2b5710b065f96fb722857f980 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/diff_and_square_norm.h @@ -0,0 +1,27 @@
+/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef DIFF_AND_SQUARE_NORM_H +#define DIFF_AND_SQUARE_NORM_H + +#include "su3.h" + +double diff_and_square_norm(spinor * const Q, spinor * const R, const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/fortran.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/fortran.h new file mode 100644 index 0000000000000000000000000000000000000000..95b8ccaf87f470aac44fcaf21a649b633d0139c6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/fortran.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _FORTRAN_MY_H +#define _FORTRAN_MY_H + +#if (defined NOF77UNDERSCORE || defined NOF77_) +#define _FT(s) s +#else +#define _FT(s) s ## _ +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/lapack.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/lapack.h new file mode 100644 index 0000000000000000000000000000000000000000..2ea6bade610d1dc6abecd9fb62d5299824084f93 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/lapack.h @@ -0,0 +1,141 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _LAPACK_H +#define _LAPACK_H + +#include +#include "linalg/fortran.h" + +#if defined CRAY || defined HITACHI +#define zgels CGELS +#define zgesv CGESV +#define zgeevx CGEEVX +#define dsyev CSYEV +#define zheev CHEEV +#define dgetrs DGETRS +#define dgetrf DGETRF +#define dlarnv DLARNV +#define zlarnv CLARNV +#define dsyevx DSYEVX +#define zlacpy CLACPY +#define dlacpy DLACPY +#define dlaset DLASET +#define zlaset CLASET +#define dlamch DLAMCH +#define ilaenv ILAENV +#define zlapcy CLAPCY +#define zgetrf CGETRF +#define zgetrs CGETRS +#define zgeqrf ZGEQRF +#define zunmqr ZUNMQR + +extern void _FT(zgels)(); +extern void _FT(zgesv)(); +extern void _FT(zgeevx)(); +extern void _FT(dsyev)(); +extern void _FT(zheev)(); +extern void _FT(dgetrs)(); +extern void _FT(dgetrf)(); +extern void _FT(dlarnv)(); +extern void _FT(zlarnv)(); +extern void _FT(dsyevx)(); +extern void _FT(zlacpy)(); +extern void _FT(dlaset)(); +extern double _FT(dlamch)(); +extern int _FT(ilaenv)(); +extern void _FT(zgetrf)(); +extern void _FT(zgetrs)(); +extern void _FT(zgeqrf)(); +extern void _FT(zunmqr)(); + +#else + +void _FT(zgels)(char* transa, int* M, int* N, int* NRHS, _Complex double a[], int* lda, + _Complex double b[], int* ldb, _Complex double work[], int* lwork, int* info, int len_transa); + +void _FT(zgesv)(int* n, int* nrhs, _Complex double a[], int* lda, + int ipivot[], _Complex double b[], int* ldb, int *info); + +extern void _FT(zgeevx)(char* balanc, char* jobvl, char* jobvr, char* sense, + int* N, _Complex double A[], int* lda, _Complex double W[], _Complex double vl[], + int* ldvl, _Complex double vr[], int* ldvr, int* ilo, int* ihi, + double scale[], double* abnrm, double rcone[], double rconv[], + _Complex double work[], int* lwork, double work2[], int* info, + int len_balanc, int len_jobvl, int len_jobvr, int len_sense); + +extern void _FT(dsyev)(char* jobz, char* uplo, int* n, double a[], + int* 
lda, double w[], double work[], int* lwork, int* info, + int len_jobz, int len_uplo); +extern void _FT(zheev)(char* jobz, char* uplo, int* n, _Complex double a[], + int* lda, double w[], _Complex double work[], int* lwork, double* rwork, int* info,int len_jobz, int len_uplo); + +extern void _FT(dgetrs)(char* trans, int* n, int* nrhs, double a[], + int* lda, int ipiv[], double b[], int* ldb, int* info, + int len_trans); +extern void _FT(dgetrf)(int* m, int* n, double a[], int* lda, int ipiv[], + int* info); + +extern void _FT(zgetrs)(char* trans, int* n, int* nrhs, _Complex double a[], + int* lda, int ipiv[], _Complex double b[], int* ldb, int* info, + int len_trans); +extern void _FT(zgetrf)(int* m, int* n, _Complex double a[], int* lda, int ipiv[], + int* info); + +extern void _FT(zhetrs)(char* uplo, int* n, int* nrhs, _Complex double a[], + int* lda, int ipiv[], _Complex double b[], int* ldb, int* info, + int len_uplo); +extern void _FT(zhetrf)(char* uplo, int* n, _Complex double a[], int* lda, int ipiv[], + _Complex double work[], int * lwork, int* info, int len_uplo); + +extern void _FT(dlarnv)(int *IDIST, int *ISEED, int *N, double *X); +extern void _FT(zlarnv)(int *IDIST, int *ISEED, int *N, _Complex double *X); + +extern void _FT(dsyevx)(char* jobz, char* range, char* uplo, int* n, + double a[], int* lda, double* vl, double* vu, int* il, int* iu, + double* abstol, int* m, double w[], double z[], int* ldz, + double work[], int* lwork, int iwork[], int ifail[], int* info, + int len_jobz, int len_range, int len_uplo); + +extern void _FT(zlacpy)(char *UPLO, int *M, int *N, _Complex double *A, int *LDA, + _Complex double *B, int *LDB, int len_uplo); +extern void _FT(dlacpy)(char *UPLO, int *M, int *N, double *A, int *LDA, + double *B, int *LDB, int len_uplo); + +extern void _FT(dlaset)(char *UPLO, int *M, int *N, double *ALPHA, + double *BETA, double *A, int *LDA, int len_uplo ); +extern void _FT(zlaset)(char *UPLO, int *M, int *N, _Complex double *ALPHA, + 
_Complex double *BETA, _Complex double *A, + int *LDA, int len_uplo ); + +extern double _FT(dlamch)(char* name, int len_name); + +extern int _FT(ilaenv)(int *ISPEC, char *NAME, char *OPTS, int *N1, + int *N2, int *N3, int *N4, int len_name, int len_opts); + +extern void _FT(zgeqrf)(int *M, int *N, _Complex double *A, int *LDA, _Complex double *TAU, + _Complex double *WORK, int *LWORK, int *INFO); + + +extern void _FT(zunmqr)(char *SIDE, char *TRANS, int *M, int *N, int *K, + _Complex double *A, int *LDA, _Complex double *TAU, _Complex double *C, + int *LDC, _Complex double *WORK, int *LWORK, int *INFO); +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/map_to_blas.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/map_to_blas.h new file mode 100644 index 0000000000000000000000000000000000000000..a29b061e12ff5e1d602575dcb3d0447f487518ba --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/map_to_blas.h @@ -0,0 +1,51 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +/* + * Mapping from our linalg routines + * to blas routines + * + * Carsten Urbach May 2003 + * urbach@ifh.de + */ + +#ifndef MAP_TO_BLAS_H +#define MAP_TO_BLAS_H + +#ifdef _USE_BLAS + /* When _USE_BLAS is defined, assign_add_mul / scalar_prod / assign become macros that expand to zaxpy / zdotc / zcopy calls over 12*VOLUME complex entries. The XLC branch declares by-value prototypes locally and casts the field pointers to _Complex double*; the other branch passes the arguments through uncast. */ +#ifdef XLC +/*#include */ /* NOTE(review): the quoted header name on the next directive contains a space and looks like the result of a mechanical "complex" -> "_Complex double" rename; confirm the intended header before building with XLC. */ +#include "su3/_Complex double.h" +_Complex double zdotc(int, _Complex double*, int, _Complex double*, int); +void zaxpy(int, _Complex double ,_Complex double* ,int ,_Complex double* ,int); +void zcopy(int, _Complex double*, int, _Complex double*, int); +#define assign_add_mul(A,B,C) zaxpy(12*VOLUME,C,(_Complex double*)B,1,(_Complex double*)A,1) +#define scalar_prod(A,B) zdotc(12*VOLUME,(_Complex double*)A,1,(_Complex double*)B,1) +#define assign(A,B) zcopy(12*VOLUME,(_Complex double*)B,1,(_Complex double*)A,1) + +#else + /* NOTE(review): this branch declares no prototypes for zaxpy/zdotc/zcopy; presumably they come from a header included by the user of these macros - confirm. */ +#define assign_add_mul(A,B,C) zaxpy(12*VOLUME,C,B,1,A,1) +#define scalar_prod(A,B) zdotc(12*VOLUME,A,1,B,1) +#define assign(A,B) zcopy(12*VOLUME,B,1,A,1) +#endif + +#endif +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mattimesvec.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mattimesvec.c new file mode 100644 index 0000000000000000000000000000000000000000..1dd86664055ff916819e815ee026dab8b6f21fb0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mattimesvec.c @@ -0,0 +1,47 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
/*
 * mattimesvec: dense complex matrix-vector product, v = M * w.
 *
 *   v    output vector, N entries are written
 *   M    matrix stored row by row; row i starts at M[i*ldM]
 *   w    input vector, N entries are read
 *   N    number of rows and columns that take part in the product
 *   ldM  distance (in elements) between the starts of two consecutive rows
 *
 * Each v[i] is seeded with the j == 0 product and the remaining products
 * are accumulated into v[i] directly, so the order of the floating-point
 * operations is fixed.  Nothing is written when N <= 0.
 *
 * Architecture-specific (SSE2, BG/P) variants could be provided later.
 */
void mattimesvec(_Complex double * const v, _Complex double * const M, _Complex double * const w,
		 const int N, const int ldM)
{
  int row, col;

  for (row = 0; row < N; row++) {
    /* First element of the current matrix row. */
    const _Complex double * const m_row = M + row * ldM;

    v[row] = m_row[0] * w[0];
    for (col = 1; col < N; col++) {
      v[row] += m_row[col] * w[col];
    }
  }
}
+ ***********************************************************************/ + +#ifndef _MATTIMESVEC_H +#define _MATTIMESVEC_H + +#include + +void mattimesvec(_Complex double * const v, _Complex double * const M, _Complex double * const w, + const int N, const int ldM); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul.c new file mode 100644 index 0000000000000000000000000000000000000000..1d31fbd5cdfeb01cbf1004db20031ed6fc71def1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul.c @@ -0,0 +1,75 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +/******************************************************************************* + * + * File mul.c + * + * void mul(spinor * const R, const _Complex double c, spinor * const S){ + * Makes (*R) = c*(*S) + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "mul.h" + +void mul(spinor * const R, const _Complex double c, spinor * const S, const int N){ +#ifdef OMP +#pragma omp parallel + { +#endif + + spinor *r,*s; + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) + { + r=(spinor *) R + ix; + s=(spinor *) S + ix; + + r->s0.c0 = c * s->s0.c0; + r->s0.c1 = c * s->s0.c1; + r->s0.c2 = c * s->s0.c2; + + r->s1.c0 = c * s->s1.c0; + r->s1.c1 = c * s->s1.c1; + r->s1.c2 = c * s->s1.c2; + + r->s2.c0 = c * s->s2.c0; + r->s2.c1 = c * s->s2.c1; + r->s2.c2 = c * s->s2.c2; + + r->s3.c0 = c * s->s3.c0; + r->s3.c1 = c * s->s3.c1; + r->s3.c2 = c * s->s3.c2; + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul.h new file mode 100644 index 0000000000000000000000000000000000000000..47ef29ed840b7e36c3b4591c0d35bb6c5f1efa0c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _MUL_H +#define _MUL_H + +#include "su3.h" + +/* Makes (*R) = c*(*S) */ +void mul(spinor * const R, const _Complex double c, spinor * const S, const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_add_mul.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_add_mul.c new file mode 100644 index 0000000000000000000000000000000000000000..28d3d620f67bb5559d5f84b30bc713f5fd136167 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_add_mul.c @@ -0,0 +1,71 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "mul_add_mul.h" + + +/* Makes (*R)=c1*(*S)+c2*(*U) , c1 and c2 are complex constants */ +void mul_add_mul(spinor * const R,spinor * const S,spinor * const U,const _Complex double c1,const _Complex double c2, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + + spinor *r, *s, *u; + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) + { + r=(spinor *) R + ix; + s=(spinor *) S + ix; + u=(spinor *) U + ix; + + r->s0.c0 = c1 * s->s0.c0 + c2 * u->s0.c0; + r->s0.c1 = c1 * s->s0.c1 + c2 * u->s0.c1; + r->s0.c2 = c1 * s->s0.c2 + c2 * u->s0.c2; + + r->s1.c0 = c1 * s->s1.c0 + c2 * u->s1.c0; + r->s1.c1 = c1 * s->s1.c1 + c2 * u->s1.c1; + r->s1.c2 = c1 * s->s1.c2 + c2 * u->s1.c2; + + r->s2.c0 = c1 * s->s2.c0 + c2 * u->s2.c0; + r->s2.c1 = c1 * s->s2.c1 + c2 * u->s2.c1; + r->s2.c2 = c1 * s->s2.c2 + c2 * u->s2.c2; + + r->s3.c0 = c1 * s->s3.c0 + c2 * u->s3.c0; + r->s3.c1 = c1 * s->s3.c1 + c2 * u->s3.c1; + r->s3.c2 = c1 * s->s3.c2 + c2 * u->s3.c2; + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_add_mul.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_add_mul.h new file mode 100644 index 0000000000000000000000000000000000000000..6a501a4fb022599d59ee162e19d0668971290aef --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_add_mul.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _MUL_ADD_MUL_H +#define _MUL_ADD_MUL_H + +#include "su3.h" + +/* Makes (*R)=c1*(*S)+c2*(*U) , c1 and c2 are complex constants */ +void mul_add_mul(spinor * const R,spinor * const S,spinor * const U,const _Complex double c1,const _Complex double c2, const int N); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_add_mul_r.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_add_mul_r.c new file mode 100644 index 0000000000000000000000000000000000000000..f2b93fc2e2e1a279895ac77ddb58869c7c7966c5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_add_mul_r.c @@ -0,0 +1,79 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/******************************************************************************* + * Makes (*R)=c1*(*S)+c2*(*U) , c1 and c2 are real constants + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "mul_add_mul_r.h" + + +/* S,U input, R inoutput, c1,c2 input */ +void mul_add_mul_r(spinor * const R, spinor * const S, spinor * const U, + const double c1,const double c2, const int N) { +#ifdef OMP +#pragma omp parallel + { +#endif + + int ix; + spinor *r,*s,*u; + +#ifdef OMP +#pragma omp for +#endif + for (ix=0; ix < N; ix++){ + r=(spinor *) R + ix; + s=(spinor *) S + ix; + u=(spinor *) U + ix; + + r->s0.c0 = c1 * s->s0.c0 + c2 * u->s0.c0; + r->s0.c1 = c1 * s->s0.c1 + c2 * u->s0.c1; + r->s0.c2 = c1 * s->s0.c2 + c2 * u->s0.c2; + + r->s1.c0 = c1 * s->s1.c0 + c2 * u->s1.c0; + r->s1.c1 = c1 * s->s1.c1 + c2 * u->s1.c1; + r->s1.c2 = c1 * s->s1.c2 + c2 * u->s1.c2; + + r->s2.c0 = c1 * s->s2.c0 + c2 * u->s2.c0; + r->s2.c1 = c1 * s->s2.c1 + c2 * u->s2.c1; + r->s2.c2 = c1 * s->s2.c2 + c2 * u->s2.c2; + + r->s3.c0 = c1 * s->s3.c0 + c2 * u->s3.c0; + r->s3.c1 = c1 * s->s3.c1 + c2 * u->s3.c1; + r->s3.c2 = c1 * s->s3.c2 + c2 * u->s3.c2; + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_add_mul_r.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_add_mul_r.h new file mode 100644 index 0000000000000000000000000000000000000000..90d470a914f11a7531df4ace7a5a52b7dca380ba --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_add_mul_r.h @@ -0,0 +1,30 @@ 
+/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _MUL_ADD_MUL_R_H +#define _MUL_ADD_MUL_R_H + +#include "su3.h" + +/* Makes (*R)=c1*(*S)+c2*(*U) , c1 and c2 are real constants */ +void mul_add_mul_r(spinor * const R, spinor * const S, spinor * const U, + const double c1,const double c2, const int N); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_mul.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_mul.c new file mode 100644 index 0000000000000000000000000000000000000000..444ceb0ec8edbf87a0d0d68c1781c11d08bf90c0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_mul.c @@ -0,0 +1,74 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "mul_diff_mul.h" + + +/* Makes (*R)=c1*(*S)-c2*(*U) , c1 and c2 are complex constants */ +void mul_diff_mul(spinor * const R,spinor * const S,spinor * const U,const _Complex double c1,const _Complex double c2, const int N){ +#ifdef OMP +#pragma omp parallel + { +#endif + + int ix; + spinor *r,*s,*u; + +#ifdef OMP +#pragma omp for +#endif + for (ix=0;ixs0.c0 = c1 * s->s0.c0 - c2 * u->s0.c0; + r->s0.c1 = c1 * s->s0.c1 - c2 * u->s0.c1; + r->s0.c2 = c1 * s->s0.c2 - c2 * u->s0.c2; + + r->s1.c0 = c1 * s->s1.c0 - c2 * u->s1.c0; + r->s1.c1 = c1 * s->s1.c1 - c2 * u->s1.c1; + r->s1.c2 = c1 * s->s1.c2 - c2 * u->s1.c2; + + r->s2.c0 = c1 * s->s2.c0 - c2 * u->s2.c0; + r->s2.c1 = c1 * s->s2.c1 - c2 * u->s2.c1; + r->s2.c2 = c1 * s->s2.c2 - c2 * u->s2.c2; + + r->s3.c0 = c1 * s->s3.c0 - c2 * u->s3.c0; + r->s3.c1 = c1 * s->s3.c1 - c2 * u->s3.c1; + r->s3.c2 = c1 * s->s3.c2 - c2 * u->s3.c2; + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_mul.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_mul.h new file mode 100644 index 0000000000000000000000000000000000000000..eb938a45db67fdd0b2e130fd8d92cba9168c075d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_mul.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 
Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _MUL_DIFF_MUL_H +#define _MUL_DIFF_MUL_H + +#include "su3.h" + +/* Makes (*R)=c1*(*S)-c2*(*U) , c1 and c2 are complex constants */ +void mul_diff_mul(spinor * const R,spinor * const S,spinor * const U,const _Complex double c1,const _Complex double c2, const int N); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_mul_r.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_mul_r.c new file mode 100644 index 0000000000000000000000000000000000000000..78876ec1aa63bb844988c831f744f08e6b655eea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_mul_r.c @@ -0,0 +1,76 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "mul_diff_mul_r.h" + + +/* Makes (*R)=c1*(*S)-c2*(*U) , c1 and c2 are complex constants */ +void mul_diff_mul_r(spinor * const R, spinor * const S,spinor * const U, + const double c1, const double c2, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + + spinor *r,*s,*u; + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) + { + r=(spinor *) R + ix; + s=(spinor *) S + ix; + u=(spinor *) U + ix; + + r->s0.c0 = c1 * s->s0.c0 - c2 * u->s0.c0; + r->s0.c1 = c1 * s->s0.c1 - c2 * u->s0.c1; + r->s0.c2 = c1 * s->s0.c2 - c2 * u->s0.c2; + + r->s1.c0 = c1 * s->s1.c0 - c2 * u->s1.c0; + r->s1.c1 = c1 * s->s1.c1 - c2 * u->s1.c1; + r->s1.c2 = c1 * s->s1.c2 - c2 * u->s1.c2; + + r->s2.c0 = c1 * s->s2.c0 - c2 * u->s2.c0; + r->s2.c1 = c1 * s->s2.c1 - c2 * u->s2.c1; + r->s2.c2 = c1 * s->s2.c2 - c2 * u->s2.c2; + + r->s3.c0 = c1 * s->s3.c0 - c2 * u->s3.c0; + r->s3.c1 = c1 * s->s3.c1 - c2 * u->s3.c1; + r->s3.c2 = c1 * s->s3.c2 - c2 * u->s3.c2; + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_mul_r.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_mul_r.h new file mode 100644 index 0000000000000000000000000000000000000000..1e2399a76b65bdad8d6b17ace16426dd788a094d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_mul_r.h @@ -0,0 +1,30 @@ 
+/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _MUL_DIFF_MUL_R_H +#define _MUL_DIFF_MUL_R_H + +#include "su3.h" + +/* Makes (*R)=c1*(*S)-c2*(*U) , c1 and c2 are real constants */ +void mul_diff_mul_r(spinor * const R, spinor * const S, spinor * const U, + const double c1, const double c2, const int N); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_r.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_r.c new file mode 100644 index 0000000000000000000000000000000000000000..99f6de7fa7782a95d8475451b39b2e458e21625f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_r.c @@ -0,0 +1,80 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version.
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/******************************************************************************* + * Makes (*R)=c1*(*S)-(*U) , c1 is a real constant + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "mul_diff_r.h" + + +/* S,U input, R inoutput, c1 input */ +void mul_diff_r(spinor * const R,spinor * const S,spinor * const U, const double c1, const int N) +{ +#ifdef OMP +#pragma omp parallel + { +#endif + + spinor *r,*s,*u; + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) + { + r=(spinor *) R + ix; + s=(spinor *) S + ix; + u=(spinor *) U + ix; + + r->s0.c0 = c1 * s->s0.c0 - u->s0.c0; + r->s0.c1 = c1 * s->s0.c1 - u->s0.c1; + r->s0.c2 = c1 * s->s0.c2 - u->s0.c2; + + r->s1.c0 = c1 * s->s1.c0 - u->s1.c0; + r->s1.c1 = c1 * s->s1.c1 - u->s1.c1; + r->s1.c2 = c1 * s->s1.c2 - u->s1.c2; + + r->s2.c0 = c1 * s->s2.c0 - u->s2.c0; + r->s2.c1 = c1 * s->s2.c1 - u->s2.c1; + r->s2.c2 = c1 * s->s2.c2 - u->s2.c2; + + r->s3.c0 = c1 * s->s3.c0 - u->s3.c0; + r->s3.c1 = c1 * s->s3.c1 - u->s3.c1; + r->s3.c2 = c1 * s->s3.c2 - u->s3.c2; + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_r.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_r.h new file mode 100644 index 0000000000000000000000000000000000000000..0e5cca0f3482407f70357516ec4143bd9180cf39 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_diff_r.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _MUL_DIFF_R_H +#define _MUL_DIFF_R_H + +#include "su3.h" + +/* Makes (*R)=c1*(*S) - (*U) , c1 is a real constant */ +void mul_diff_r(spinor * const R,spinor * const S,spinor * const U,const double c1, const int N); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_r.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_r.c new file mode 100644 index 0000000000000000000000000000000000000000..09f267ead81ccb6165007b49cf2ce7035e30a2ca --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_r.c @@ -0,0 +1,76 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/******************************************************************************* + * + * File mul_r.c + * + * void mul_r(spinor * const R, const double c, spinor * const S){ + * Makes (*R) = c*(*S) c is a real constant + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "mul_r.h" + +void mul_r(spinor * const R, const double c, spinor * const S, const int N){ +#ifdef OMP +#pragma omp parallel + { +#endif + + int ix; + spinor *r,*s; + +#ifdef OMP +#pragma omp for +#endif + for (ix = 0; ix < N; ix++){ + r=(spinor *) R + ix; + s=(spinor *) S + ix; + + r->s0.c0 = c * s->s0.c0; + r->s0.c1 = c * s->s0.c1; + r->s0.c2 = c * s->s0.c2; + + r->s1.c0 = c * s->s1.c0; + r->s1.c1 = c * s->s1.c1; + r->s1.c2 = c * s->s1.c2; + + r->s2.c0 = c * s->s2.c0; + r->s2.c1 = c * s->s2.c1; + r->s2.c2 = c * s->s2.c2; + + r->s3.c0 = c * s->s3.c0; + r->s3.c1 = c * s->s3.c1; + r->s3.c2 = c * s->s3.c2; + } +#ifdef OMP + } /*OpenMP closing brace */ +#endif + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_r.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_r.h new file mode 100644 index 0000000000000000000000000000000000000000..f3386db5b5b945ad86a1fd233c50ace9ef243005 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_r.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 
2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _MUL_R_H +#define _MUL_R_H + +#include "su3.h" + +/* Makes (*R) = c*(*S) c is a real constant*/ +void mul_r(spinor * const R, const double c, spinor * const S, const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_r_32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_r_32.c new file mode 100644 index 0000000000000000000000000000000000000000..3c0d4399c3b339871bc54f23ea52efb847dc2c21 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_r_32.c @@ -0,0 +1,78 @@ +/*********************************************************************** + * Copyright (C) 2015 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/******************************************************************************* + * + * File mul_r_32.c + * + * void mul_r_32(spinor32 * const R, const float c, spinor32 * const S, const int N){ + * Makes R[ix] = c*S[ix] for ix = 0..N-1; c is a real constant. mul_r_32_orphaned holds only the omp for loop, mul_r_32 wraps it in an omp parallel region + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#include +#include +#include +#include "su3.h" +#include "mul_r_32.h" + +void mul_r_32_orphaned(spinor32 * const R, const float c, spinor32 * const S, const int N){ + int ix; + spinor32 *r,*s; + +#ifdef OMP +#pragma omp for +#endif + for (ix = 0; ix < N; ix++){ + r=(spinor32 *) R + ix; + s=(spinor32 *) S + ix; + + r->s0.c0 = c * s->s0.c0; + r->s0.c1 = c * s->s0.c1; + r->s0.c2 = c * s->s0.c2; + + r->s1.c0 = c * s->s1.c0; + r->s1.c1 = c * s->s1.c1; + r->s1.c2 = c * s->s1.c2; + + r->s2.c0 = c * s->s2.c0; + r->s2.c1 = c * s->s2.c1; + r->s2.c2 = c * s->s2.c2; + + r->s3.c0 = c * s->s3.c0; + r->s3.c1 = c * s->s3.c1; + r->s3.c2 = c * s->s3.c2; + } +} + +void mul_r_32(spinor32 * const R, const float c, spinor32 * const S, const int N){ +#ifdef OMP +#pragma omp parallel + { +#endif + mul_r_32_orphaned(R,c,S,N); +#ifdef OMP + } /* closes the omp parallel region that encloses the orphaned loop */ +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_r_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_r_32.h new file mode 100644 index 0000000000000000000000000000000000000000..3e95761d7b3ae38b580f0fd1913661dab8007c29 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/mul_r_32.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * Copyright (C) 2015 Florian Burger + * + * This file is part of tmLQCD.
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _MUL_R_32_H +#define _MUL_R_32_H + +#include "su3.h" + +/* Makes (*R) = c*(*S) c is a real constant*/ +void mul_r_32(spinor32 * const R, const float c, spinor32 * const S, const int N); +void mul_r_32_orphaned(spinor32 * const R, const float c, spinor32 * const S, const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod.c new file mode 100644 index 0000000000000000000000000000000000000000..47c5537b89d09801261f7b7f118dd0760697a594 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod.c @@ -0,0 +1,142 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#ifdef MPI +#include +#endif +#ifdef OMP +# include +# include +#endif +#include "su3.h" +#include "scalar_prod.h" + +/* Returns S^* times R: the sum over ix = 0..N-1 and all 12 components of R[ix]*conj(S[ix]); when built with MPI and parallel == 1 the local sum is reduced over MPI_COMM_WORLD */ +_Complex double scalar_prod(const spinor * const S, const spinor * const R, const int N, const int parallel) { + _Complex double ALIGN res = 0.0; +#ifdef MPI + _Complex double ALIGN mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); +#endif + + _Complex double ALIGN ds,tr,ts,tt,ks,kc; + const spinor *s,*r; + + ks = 0.0; + kc = 0.0; + +#if (defined BGL && defined XLC) + __alignx(16, S); + __alignx(16, R); +#endif + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ix++) + { + s= S + ix; + r= R + ix; + + ds = r->s0.c0 * conj(s->s0.c0) + r->s0.c1 * conj(s->s0.c1) + r->s0.c2 * conj(s->s0.c2) + + r->s1.c0 * conj(s->s1.c0) + r->s1.c1 * conj(s->s1.c1) + r->s1.c2 * conj(s->s1.c2) + + r->s2.c0 * conj(s->s2.c0) + r->s2.c1 * conj(s->s2.c1) + r->s2.c2 * conj(s->s2.c2) + + r->s3.c0 * conj(s->s3.c0) + r->s3.c1 * conj(s->s3.c1) + r->s3.c2 * conj(s->s3.c2); + + /* Kahan Summation: ks is the running sum, kc the compensation term */ + tr=ds+kc; + ts=tr+ks; + tt=ts-ks; + ks=ts; + kc=tr-tt; + } + kc=ks+kc; + +#ifdef OMP + g_omp_acc_cp[thread_num] = kc; + + } /* OpenMP closing brace */ + + /* having left the parallel section, we can now sum up the Kahan + corrected sums from each thread into res */ + for(int i = 0; i < omp_num_threads; ++i) + res += g_omp_acc_cp[i]; +#else + res=kc; +#endif + +#ifdef MPI + if(parallel == 1) + { + MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD); + return(mres); + } +#endif + return(res); +} + +#ifdef WITHLAPH +_Complex double scalar_prod_su3vect(su3_vector * const S, su3_vector * const R,
const int N, const int parallel) +{ + _Complex double ALIGN ks, ds, tr, ts, tt; /* complex accumulators: plain double here silently dropped the imaginary part of every term */ + su3_vector *s, *r; + _Complex double c; +#ifdef MPI + _Complex double d; +#endif + + /* Returns S^* times R summed over ix = 0..N-1 (real and imaginary part); ks is the running sum, c the Kahan compensation term */ + + ks = 0.0; + c = 0.0; + for (int ix = 0; ix < N; ++ix) + { + s = (su3_vector *) S + ix; + r = (su3_vector *) R + ix; + + ds = r->c0 * conj(s->c0) + r->c1 * conj(s->c1) + r->c2 * conj(s->c2); + + /* Kahan Summation */ + tr = ds + c; + ts = tr + ks; + tt = ts - ks; + ks = ts; + c = tr - tt; + } + c = ks + c; + +#ifdef MPI + if(parallel == 1) + { + d = c; + MPI_Allreduce(&d, &c, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD); + } +#endif + return(c); +} +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod.h new file mode 100644 index 0000000000000000000000000000000000000000..fa220332dc7e1abfab6f973da8681867a23a72ea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see .
+ ***********************************************************************/ + +#ifndef _SCALAR_PROD_H +#define _SCALAR_PROD_H + +#include "su3.h" +/* =S^* times R (the complex conjugate is taken on S); summed over N sites, reduced over MPI when parallel == 1 */ +_Complex double scalar_prod(const spinor * const S, const spinor * const R, const int N, const int parallel); +_Complex double scalar_prod_su3vect(su3_vector * const S,su3_vector * const R, const int N, const int parallel); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_i.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_i.c new file mode 100644 index 0000000000000000000000000000000000000000..fde1ea2c24403174ad3083ba480aaec29dd05164 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_i.c @@ -0,0 +1,81 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see .
+ ***********************************************************************/ +/******************************************************************************* + * + * File scalar_prod_i.c + * + * double scalar_prod_i(spinor * const S,spinor * const R, const int N, const int parallel) + * Returns the imaginary part of the scalar product (*R,*S) + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef MPI +# include +#endif +#include +#include "su3.h" +#include "scalar_prod_i.h" + +/* R input, S input; N spinors are summed; with MPI and parallel == 1 the sum is reduced over MPI_COMM_WORLD */ + +double scalar_prod_i(spinor * const S,spinor * const R, const int N, const int parallel) +{ + double ks,kc,ds,tr,ts,tt; /* automatic storage: every variable is (re)assigned before use, static only made the routine non-reentrant */ + spinor *s,*r; + ks=0.0; + kc=0.0; + +#if (defined BGL && defined XLC) + __alignx(16, S); + __alignx(16, R); +#endif + + for (int ix = 0; ix < N; ++ix) + { + s=(spinor *) S + ix; + r=(spinor *) R + ix; + + ds=cimag(r->s0.c0 * conj(s->s0.c0) + r->s0.c1 * conj(s->s0.c1) + r->s0.c2 * conj(s->s0.c2) + + r->s1.c0 * conj(s->s1.c0) + r->s1.c1 * conj(s->s1.c1) + r->s1.c2 * conj(s->s1.c2) + + r->s2.c0 * conj(s->s2.c0) + r->s2.c1 * conj(s->s2.c1) + r->s2.c2 * conj(s->s2.c2) + + r->s3.c0 * conj(s->s3.c0) + r->s3.c1 * conj(s->s3.c1) + r->s3.c2 * conj(s->s3.c2) ); + + tr=ds+kc; /* Kahan summation: ks is the running sum, kc the compensation term */ + ts=tr+ks; + tt=ts-ks; + ks=ts; + kc=tr-tt; + } + kc=ks+kc; + +#if defined MPI + if(parallel == 1) { + MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + kc = ks; + } +#endif + + return kc; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_i.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_i.h new file mode 100644 index 0000000000000000000000000000000000000000..b1ee5d9e6f3a65fbf94ab7928050342c142fc601 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_i.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten
Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _SCALAR_PROD_I_H +#define _SCALAR_PROD_I_H + +#include "su3.h" + +/* Returns the imaginary part of the scalar product (*R,*S) */ +double scalar_prod_i(spinor * const S,spinor * const R, const int N, const int parallel); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_r.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_r.c new file mode 100644 index 0000000000000000000000000000000000000000..6a47493e1a572297835c22b915b4f2870d4fa24a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_r.c @@ -0,0 +1,237 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * File scalar_prod_r.c + * + * double scalar_prod_r(spinor * const S,spinor * const R, const int N) + * Returns the real part of the scalar product (*R,*S) + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef MPI +# include +#endif +#ifdef OMP +# include +# include +#endif +#include "su3.h" +#include "scalar_prod_r.h" + +/* R input, S input */ + +#include + +#if (defined BGQ && defined XLC) + +double scalar_prod_r(const spinor * const S, const spinor * const R, const int N, const int parallel) { + double ALIGN res = 0.0; +#ifdef MPI + double ALIGN mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); +#endif + vector4double ks, kc, ds, tr, ts, tt; + vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5; + vector4double z0, z1, z2, z3, z4, z5; + double *s, *r; + vector4double buffer; + __alignx(32, s); + __alignx(32, r); + __alignx(32, S); + __alignx(32, R); + + __prefetch_by_load(S); + __prefetch_by_load(R); + + ks = vec_splats(0.0); + kc = vec_splats(0.0); + +#ifndef OMP +#pragma unroll(2) +#else +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) { + s=(double*)((spinor *) S + ix); + r=(double*)((spinor *) R + ix); + __prefetch_by_load(S + ix + 1); + __prefetch_by_load(R + ix + 1); + x0 = vec_ld(0, s); + x1 = vec_ld(0, s+4); + x2 = vec_ld(0, s+8); + x3 = vec_ld(0, s+12); + x4 = vec_ld(0, s+16); + x5 = vec_ld(0, s+20); + y0 = vec_ld(0, r); + y1 = vec_ld(0, r+4); + y2 = vec_ld(0, r+8); + y3 = vec_ld(0, r+12); + y4 = vec_ld(0, r+16); + y5 = vec_ld(0, r+20); + z0 = vec_mul(x0, y0); + z1 = vec_mul(x1, y1); + z2 = vec_mul(x2, y2); + z3 = vec_mul(x3, y3); + z4 = vec_mul(x4, y4); + z5 = vec_mul(x5, y5); + x0 = vec_add(z0, z1); + x1 = vec_add(z2, z3); + x2 = 
vec_add(z4, z5); + x3 = vec_add(x0, x1); + ds = vec_add(x2, x3); + + tr = vec_add(ds, kc); + ts = vec_add(tr, ks); + tt = vec_sub(ts, ks); + ks = ts; + kc = vec_sub(tr, tt); + } + buffer = vec_add(kc, ks); + +#ifdef OMP + g_omp_acc_re[thread_num] = buffer[0] + buffer[1] + buffer[2] + buffer[3]; + } /* OpenMP parallel closing brace */ + for( int i = 0; i < omp_num_threads; ++i) + res += g_omp_acc_re[i]; +#else + res = buffer[0] + buffer[1] + buffer[2] + buffer[3]; +#endif + +#if defined MPI + if(parallel) { + MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + return(mres); + } +#endif + + return (res); +} + +#else + +double scalar_prod_r(const spinor * const S, const spinor * const R, const int N, const int parallel) +{ + double ALIGN res = 0.0; +#ifdef MPI + double ALIGN mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); +#endif + double ALIGN kc,ks,ds,tr,ts,tt; + const spinor *s,*r; + + ks = 0.0; + kc = 0.0; + +#if (defined BGL && defined XLC) + __alignx(16, S); + __alignx(16, R); +#endif + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) { + s = S + ix; + r = R + ix; + + ds = creal(r->s0.c0 * conj(s->s0.c0)) + creal(r->s0.c1 * conj(s->s0.c1)) + creal(r->s0.c2 * conj(s->s0.c2)) + + creal(r->s1.c0 * conj(s->s1.c0)) + creal(r->s1.c1 * conj(s->s1.c1)) + creal(r->s1.c2 * conj(s->s1.c2)) + + creal(r->s2.c0 * conj(s->s2.c0)) + creal(r->s2.c1 * conj(s->s2.c1)) + creal(r->s2.c2 * conj(s->s2.c2)) + + creal(r->s3.c0 * conj(s->s3.c0)) + creal(r->s3.c1 * conj(s->s3.c1)) + creal(r->s3.c2 * conj(s->s3.c2)); + + tr=ds+kc; + ts=tr+ks; + tt=ts-ks; + ks=ts; + kc=tr-tt; + } + kc=ks+kc; + +#ifdef OMP + g_omp_acc_re[thread_num] = kc; + + } /* OpenMP closing brace */ + + for(int i = 0; i < omp_num_threads; ++i) + res += g_omp_acc_re[i]; +#else + res = kc; +#endif + +#if defined MPI + if(parallel) + { + MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + return mres; + } +#endif + 
return res; +} + + +#endif + +#ifdef WITHLAPH +double scalar_prod_r_su3vect(su3_vector * const S,su3_vector * const R, const int N, const int parallel) +{ /* Returns the real part of S^* times R summed over ix = 0..N-1; with MPI and parallel != 0 the sum is reduced over MPI_COMM_WORLD */ + int ix; + double ALIGN ks,kc,ds,tr,ts,tt; + su3_vector *s,*r; + + ks=0.0; + kc=0.0; + for (ix = 0; ix < N; ++ix) /* use the function-scope ix; the former loop-local declaration shadowed it and left it unused */ + { + s = (su3_vector *) S + ix; + r = (su3_vector *) R + ix; + + ds = creal(r->c0) * creal(s->c0) + cimag(r->c0) * cimag(s->c0) + + creal(r->c1) * creal(s->c1) + cimag(r->c1) * cimag(s->c1) + + creal(r->c2) * creal(s->c2) + cimag(r->c2) * cimag(s->c2); + + tr = ds + kc; /* Kahan summation: ks is the running sum, kc the compensation term */ + ts = tr + ks; + tt = ts-ks; + ks = ts; + kc = tr-tt; + } + kc = ks + kc; +#if defined MPI + if(parallel) + { + MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + return ks; + } +#endif + return kc; +} + +#endif // WITHLAPH diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_r.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_r.h new file mode 100644 index 0000000000000000000000000000000000000000..dac253ce588c6f0de569fcbb30694f651ffec85f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_r.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see .
+ ***********************************************************************/ + +#ifndef _SCALAR_PROD_R_H +#define _SCALAR_PROD_R_H + +#include "su3.h" + +/* Returns the real part of the scalar product (*R,*S) */ +double scalar_prod_r(const spinor * const S, const spinor * const R, const int N, const int parallel); +double scalar_prod_r_su3vect(su3_vector * const S,su3_vector * const R, const int N, const int parallel); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_r_32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_r_32.c new file mode 100644 index 0000000000000000000000000000000000000000..81f1e1dcada9b707b2a6b352755566ed6e7e7dbe --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_r_32.c @@ -0,0 +1,168 @@ +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef MPI +# include +#endif +#ifdef OMP +# include +# include +#endif +#include "su3.h" +#include "scalar_prod_r_32.h" + +/* R input, S input */ + +#include +#if (defined BGQ && defined XLC) + +float scalar_prod_r_32(const spinor32 * const S, const spinor32 * const R, const int N, const int parallel) { + float ALIGN32 res = 0.0; +#ifdef MPI + float ALIGN32 mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); +#endif + vector4double ks, kc, ds, tr, ts, tt; + vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5; + vector4double z0, z1, z2, z3, z4, z5; + float *s, *r; + vector4double buffer; + __alignx(16, s); + __alignx(16, r); + __alignx(16, S); + __alignx(16, R); + + __prefetch_by_load(S); + __prefetch_by_load(R); + + ks = vec_splats(0.0); + kc = vec_splats(0.0); + +#ifndef OMP +#pragma unroll(2) +#else +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) { + s=(float*)((spinor32 *) S + ix); + r=(float*)((spinor32 *) R + ix); + __prefetch_by_load(S + ix + 1); + __prefetch_by_load(R + ix + 1); + x0 = vec_ld(0, s); + x1 = vec_ld(0, s+4); + x2 = vec_ld(0, s+8); + x3 = vec_ld(0, s+12); 
+ x4 = vec_ld(0, s+16); + x5 = vec_ld(0, s+20); + y0 = vec_ld(0, r); + y1 = vec_ld(0, r+4); + y2 = vec_ld(0, r+8); + y3 = vec_ld(0, r+12); + y4 = vec_ld(0, r+16); + y5 = vec_ld(0, r+20); + z0 = vec_mul(x0, y0); + z1 = vec_mul(x1, y1); + z2 = vec_mul(x2, y2); + z3 = vec_mul(x3, y3); + z4 = vec_mul(x4, y4); + z5 = vec_mul(x5, y5); + x0 = vec_add(z0, z1); + x1 = vec_add(z2, z3); + x2 = vec_add(z4, z5); + x3 = vec_add(x0, x1); + ds = vec_add(x2, x3); + + tr = vec_add(ds, kc); + ts = vec_add(tr, ks); + tt = vec_sub(ts, ks); + ks = ts; + kc = vec_sub(tr, tt); + } + buffer = vec_add(kc, ks); + +#ifdef OMP + g_omp_acc_re[thread_num] = buffer[0] + buffer[1] + buffer[2] + buffer[3]; + } /* OpenMP parallel closing brace */ + for( int i = 0; i < omp_num_threads; ++i) + res += g_omp_acc_re[i]; +#else + res = buffer[0] + buffer[1] + buffer[2] + buffer[3]; +#endif + +#if defined MPI + if(parallel) { + MPI_Allreduce(&res, &mres, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); + return(mres); + } +#endif + + return (res); +} + +#else + +float scalar_prod_r_32(const spinor32 * const S, const spinor32 * const R, const int N, const int parallel) +{ + float ALIGN32 res = 0.0; +#ifdef MPI + float ALIGN32 mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); +#endif + float ALIGN32 kc,ks,ds,tr,ts,tt; + const spinor32 *s,*r; + + ks = 0.0; + kc = 0.0; + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ++ix) { + s = S + ix; + r = R + ix; + + ds = creal(r->s0.c0 * conj(s->s0.c0)) + creal(r->s0.c1 * conj(s->s0.c1)) + creal(r->s0.c2 * conj(s->s0.c2)) + + creal(r->s1.c0 * conj(s->s1.c0)) + creal(r->s1.c1 * conj(s->s1.c1)) + creal(r->s1.c2 * conj(s->s1.c2)) + + creal(r->s2.c0 * conj(s->s2.c0)) + creal(r->s2.c1 * conj(s->s2.c1)) + creal(r->s2.c2 * conj(s->s2.c2)) + + creal(r->s3.c0 * conj(s->s3.c0)) + creal(r->s3.c1 * conj(s->s3.c1)) + creal(r->s3.c2 * conj(s->s3.c2)); + + tr=ds+kc; + ts=tr+ks; + tt=ts-ks; + ks=ts; + kc=tr-tt; + } + kc=ks+kc; + 
+#ifdef OMP + g_omp_acc_re[thread_num] = kc; + + } /* OpenMP closing brace */ + + for(int i = 0; i < omp_num_threads; ++i) + res += g_omp_acc_re[i]; +#else + res = kc; +#endif + +#if defined MPI + if(parallel) + { + MPI_Allreduce(&res, &mres, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); + return mres; + } +#endif + return res; +} + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_r_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_r_32.h new file mode 100644 index 0000000000000000000000000000000000000000..f27c22b737934672a0fe26e204d79b4ec9d67720 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_r_32.h @@ -0,0 +1,9 @@ +#ifndef _SCALAR_PROD_R_32_H +#define _SCALAR_PROD_R_32_H + +#include "su3.h" + +/* Returns the real part of the scalar product (*R,*S) */ +float scalar_prod_r_32(const spinor32 * const S, const spinor32 * const R, const int N, const int parallel); + +#endif \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_su3spinor.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_su3spinor.c new file mode 100644 index 0000000000000000000000000000000000000000..8ca2a61c46d3a2026fb5342ea21ade60f5d37fad --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_su3spinor.c @@ -0,0 +1,129 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#ifdef MPI +#include +#endif +#include "su3.h" +#include "scalar_prod_su3spinor.h" + +#ifdef WITHLAPH +complex_spinor scalar_prod_su3spinor(su3_vector * const S, spinor * const R, const int N, const int parallel){ + int ix; + _Complex double ks, kc, ds, tr, ts, tt; /* automatic storage: all are (re)assigned before use, static only made the routine non-reentrant */ + su3_vector *s, *r; + complex_spinor c; +#ifdef MPI + complex_spinor d; +#endif + + /* sc0: S^* times the s0 colour vector of R, summed over ix = 0..N-1 */ + ks = 0.0; + kc = 0.0; + for (ix = 0; ix < N; ix++) + { + s = (su3_vector *) S + ix; + r = &(R[ix].s0); + + ds = r->c0 * conj(s->c0) + r->c1 * conj(s->c1) + r->c2 * conj(s->c2); + + /* Kahan Summation */ + tr = ds + kc; + ts = tr + ks; + tt = ts - ks; + ks = ts; + kc = tr - tt; + } + kc = ks + kc; + c.sc0 = kc; + + /* sc1: same contraction with the s1 colour vector of R */ + ks = 0.0; + kc = 0.0; + for (ix = 0; ix < N; ix++) + { + s = (su3_vector *) S + ix; + r = &(R[ix].s1); + + ds = r->c0 * conj(s->c0) + r->c1 * conj(s->c1) + r->c2 * conj(s->c2); + + /* Kahan Summation */ + tr = ds + kc; + ts = tr + ks; + tt = ts - ks; + ks = ts; + kc = tr - tt; + } + kc = ks + kc; + c.sc1 = kc; + + /* sc2: same contraction with the s2 colour vector of R */ + ks = 0.0; + kc = 0.0; + for (ix = 0; ix < N; ix++) + { + s = (su3_vector *) S + ix; + r = &(R[ix].s2); + + ds = r->c0 * conj(s->c0) + r->c1 * conj(s->c1) + r->c2 * conj(s->c2); + + /* Kahan Summation */ + tr = ds + kc; + ts = tr + ks; + tt = ts - ks; + ks = ts; + kc = tr - tt; + } + kc = ks + kc; + c.sc2 = kc; + + /* sc3: same contraction with the s3 colour vector of R */ + ks = 0.0; + kc = 0.0; + for (ix = 0; ix < N; ix++) + { + s = (su3_vector *) S + ix; + r = &(R[ix].s3); + + ds = r->c0 * conj(s->c0) + r->c1 * conj(s->c1) + r->c2 * conj(s->c2); + + /* Kahan Summation */ + tr = ds + kc; + ts = tr + ks; + tt = ts - ks; + ks = ts; + kc = tr - tt; + } + kc = ks + kc; + c.sc3 = kc; + +#ifdef MPI + if(parallel == 1) {
+ d = c; + MPI_Allreduce(&d, &c, 4, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD); //??? + } +#endif + + return(c); +} +#endif // WITHLAPH diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_su3spinor.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_su3spinor.h new file mode 100644 index 0000000000000000000000000000000000000000..849669f0f5a8f3567644dc05de030c6938b770c7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/scalar_prod_su3spinor.h @@ -0,0 +1,27 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _SCALAR_PRODSU3S_H +#define _SCALAR_PRODSU3S_H + +#include "su3.h" +/* T_alpha=S_a x R_alpha,a^* */ +complex_spinor scalar_prod_su3spinor(su3_vector * const S,spinor * const R, const int N, const int parallel); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_and_prod_r.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_and_prod_r.c new file mode 100644 index 0000000000000000000000000000000000000000..b6482c802ca7585d381bfd6ba07091a20655e53a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_and_prod_r.c @@ -0,0 +1,103 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +/******************************************************************************* + * + * File square_and_prod_r.c + * + * void square_and_prod_r(double * const x1, double * const x2, spinor * const S, spinor * const R, const int N) + * Stores the square norm of *S in *x1 and the real part of (*R,*S) in *x2, both summed over N spinors + * It's faster than using "scalar_prod_r" and "square_norm" + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef MPI +# include +#endif +#include "su3.h" +#include "square_and_prod_r.h" + +void square_and_prod_r(double * const x1, double * const x2, spinor * const S, spinor * const R, const int N) +{ + int ix; + double ALIGN ks,kc,ds,tr,ts,tt; + double ALIGN xks,xkc,xds,xtr,xts,xtt; + spinor *s,*r; + + ks=0.0; + kc=0.0; + + xks=0.0; + xkc=0.0; + +#if (defined BGL && defined XLC) + __alignx(16, S); + __alignx(16, R); +#endif + + for (ix = 0; ix < N; ix++) + { + s=(spinor *) S + ix; + r=(spinor *) R + ix; + + ds= r->s0.c0 * conj(s->s0.c0) + r->s0.c1 * conj(s->s0.c1) + r->s0.c2 * conj(s->s0.c2) + + r->s1.c0 * conj(s->s1.c0) + r->s1.c1 * conj(s->s1.c1) + r->s1.c2 * conj(s->s1.c2) + + r->s2.c0 * conj(s->s2.c0) + r->s2.c1 * conj(s->s2.c1) + r->s2.c2 * conj(s->s2.c2) + + r->s3.c0 * conj(s->s3.c0) + r->s3.c1 * conj(s->s3.c1) + r->s3.c2 * conj(s->s3.c2); + + xds=s->s0.c0 * conj(s->s0.c0) + s->s0.c1 * conj(s->s0.c1) + s->s0.c2 * conj(s->s0.c2) + + s->s1.c0 * conj(s->s1.c0) + s->s1.c1 * conj(s->s1.c1) + s->s1.c2 * conj(s->s1.c2) + + s->s2.c0 * conj(s->s2.c0) + s->s2.c1 * conj(s->s2.c1) + s->s2.c2 * conj(s->s2.c2) + + s->s3.c0 * conj(s->s3.c0) + s->s3.c1 * conj(s->s3.c1) + s->s3.c2 * conj(s->s3.c2); + + tr=ds + kc; /* Kahan summation for the R,S product: ks running sum, kc compensation */ + ts=tr + ks; + tt=ts-ks; + ks=ts; + kc=tr-tt; + + xtr=xds + xkc; /* Kahan summation for the square norm of S: xks running sum, xkc compensation */ + xts=xtr + xks; + xtt=xts-xks; + xks=xts; + xkc=xtr-xtt; + } + xkc=xks + xkc; + *x1=xkc; + +#if defined MPI + + MPI_Allreduce(&xkc,
x1, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + +#endif + kc=ks + kc; + *x2=kc; + +#if defined MPI + + MPI_Allreduce(&kc, x2, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + +#endif +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_and_prod_r.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_and_prod_r.h new file mode 100644 index 0000000000000000000000000000000000000000..4f46fa27161b6e593fe5c42d91f119ff61586c18 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_and_prod_r.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _SQUARE_AND_PROD_R_H +#define _SQUARE_AND_PROD_R_H + +#include "su3.h" + +/* Returns the real part of (*R,*S) and the square norm of *S + * It's faster than using "scalar_prod_r" and "square_norm" */ +void square_and_prod_r(double * const x1, double * const x2, spinor * const S, spinor * const R, const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_norm.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_norm.c new file mode 100644 index 0000000000000000000000000000000000000000..02f50e74a472b70d206fe496e70bf525d2fd773d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_norm.c @@ -0,0 +1,357 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + * File square_norm.c + * + * double square_norm(spinor * const P ) + * Returns the square norm of *P + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +# include "global.h" +#endif +#include +#include "su3.h" +#if (defined SSE || defined SSE2 || defined SSE3) +# include "sse.h" +#endif +#include "square_norm.h" + +#if ((defined BGL) && (defined XLC)) + +/*************************************** + * + * square norm with intrinsics + * + * Carsten.Urbach@liverpool.ac.uk + * + ***************************************/ + +# include"bgl.h" +double square_norm(spinor * const P, const int N, const int parallel) { + int ix=0; + double res, res2; + double *s ALIGN; + double *sp ALIGN; + double _Complex x00, x01, x02, x03, x04, x05, x06, x07, + x08, x09, x10, x11; + double _Complex y00, y01, y02, y03, y04, y05, y06, y07, + y08, y09, y10, y11; + + __alignx(16, P); + s = (double*)P; + sp = s+24; + _prefetch_spinor(sp); + x00 = __lfpd(s); + x01 = __lfpd(s+2); + x02 = __lfpd(s+4); + x03 = __lfpd(s+6); + x04 = __lfpd(s+8); + x05 = __lfpd(s+10); + x06 = __lfpd(s+12); + x07 = __lfpd(s+14); + x08 = __lfpd(s+16); + x09 = __lfpd(s+18); + x10 = __lfpd(s+20); + x11 = __lfpd(s+22); + + y00 = __fpmul(x00, x00); + y01 = __fpmul(x01, x01); + y02 = __fpmul(x02, x02); + y03 = __fpmul(x03, x03); + y04 = __fpmul(x04, x04); + y05 = __fpmul(x05, x05); + y06 = __fpmul(x06, x06); + y07 = __fpmul(x07, x07); + y08 = __fpmul(x08, x08); + y09 = __fpmul(x09, x09); + y10 = __fpmul(x10, x10); + y11 = __fpmul(x11, x11); + s = sp; + + +#pragma unroll(12) + for(ix = 1; ix < N-1; ix++) { + sp+=24;; + _prefetch_spinor(sp); + x00 = __lfpd(s); + x01 = __lfpd(s+2); + x02 = __lfpd(s+4); + x03 = __lfpd(s+6); + x04 = __lfpd(s+8); + x05 = __lfpd(s+10); + x06 = __lfpd(s+12); + x07 = __lfpd(s+14); + x08 = __lfpd(s+16); + x09 = __lfpd(s+18); + x10 = 
__lfpd(s+20); + x11 = __lfpd(s+22); + y00 = __fpmadd(y00, x00, x00); + y01 = __fpmadd(y01, x01, x01); + y02 = __fpmadd(y02, x02, x02); + y03 = __fpmadd(y03, x03, x03); + y04 = __fpmadd(y04, x04, x04); + y05 = __fpmadd(y05, x05, x05); + y06 = __fpmadd(y06, x06, x06); + y07 = __fpmadd(y07, x07, x07); + y08 = __fpmadd(y08, x08, x08); + y09 = __fpmadd(y09, x09, x09); + y10 = __fpmadd(y10, x10, x10); + y11 = __fpmadd(y11, x11, x11); + s=sp; + } + x00 = __lfpd(s); + x01 = __lfpd(s+2); + x02 = __lfpd(s+4); + x03 = __lfpd(s+6); + x04 = __lfpd(s+8); + x05 = __lfpd(s+10); + x06 = __lfpd(s+12); + x07 = __lfpd(s+14); + x08 = __lfpd(s+16); + x09 = __lfpd(s+18); + x10 = __lfpd(s+20); + x11 = __lfpd(s+22); + y00 = __fpmadd(y00, x00, x00); + y01 = __fpmadd(y01, x01, x01); + y02 = __fpmadd(y02, x02, x02); + y03 = __fpmadd(y03, x03, x03); + y04 = __fpmadd(y04, x04, x04); + y05 = __fpmadd(y05, x05, x05); + y06 = __fpmadd(y06, x06, x06); + y07 = __fpmadd(y07, x07, x07); + y08 = __fpmadd(y08, x08, x08); + y09 = __fpmadd(y09, x09, x09); + y10 = __fpmadd(y10, x10, x10); + y11 = __fpmadd(y11, x11, x11); + + y00 = __fpadd(y00, y01); + y02 = __fpadd(y02, y03); + y04 = __fpadd(y04, y05); + y06 = __fpadd(y06, y07); + y08 = __fpadd(y08, y09); + y10 = __fpadd(y10, y11); + y00 = __fpadd(y00, y02); + y04 = __fpadd(y04, y06); + y08 = __fpadd(y08, y10); + y00 = __fpadd(y00, y04); + y00 = __fpadd(y00, y08); + res = __creal(y00)+__cimag(y00); +# ifdef MPI + if(parallel) { + MPI_Allreduce(&res, &res2, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + return res2; + } +# endif + return res; +} + +#elif (defined BGQ && defined XLC) + +double square_norm(spinor * const P, const int N, const int parallel) { + double ALIGN res = 0.0; +#ifdef MPI + double ALIGN mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); +#endif + vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5; + vector4double ds,tt,tr,ts,kc,ks,buffer; + double *s ALIGN; + + ks = vec_splats(0.); + kc 
= vec_splats(0.); + +#ifndef OMP +#pragma unroll(4) +#else +#pragma omp for +#endif + for(int i = 0; i < N; i++) { + s = (double*)((spinor*) P+i); + __prefetch_by_load(P+i+1); + x0 = vec_ld(0, s); + x1 = vec_ld(0, s+4); + x2 = vec_ld(0, s+8); + x3 = vec_ld(0, s+12); + x4 = vec_ld(0, s+16); + x5 = vec_ld(0, s+20); + y0 = vec_mul(x0, x0); + y1 = vec_mul(x1, x1); + y2 = vec_mul(x2, x2); + y3 = vec_mul(x3, x3); + y4 = vec_mul(x4, x4); + y5 = vec_mul(x5, x5); + + x0 = vec_add(y0, y1); + x1 = vec_add(y2, y3); + x2 = vec_add(y4, y5); + x3 = vec_add(x0, x1); + ds = vec_add(x2, x3); + + tr = vec_add(ds, kc); + ts = vec_add(tr, ks); + tt = vec_sub(ts, ks); + ks = ts; + kc = vec_sub(tr, tt); + } + buffer = vec_add(kc,ks); + +#ifdef OMP + g_omp_acc_re[thread_num] = buffer[0] + buffer[1] + buffer[2] + buffer[3]; + } /* OpenMP closing brace */ + + for(int i = 0; i < omp_num_threads; ++i) + res += g_omp_acc_re[i]; +#else + res = buffer[0] + buffer[1] + buffer[2] + buffer[3]; +#endif + +# ifdef MPI + if(parallel) { + MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + return mres; + } +# endif + + return res; +} + + +#else + +double square_norm(const spinor * const P, const int N, const int parallel) +{ + double ALIGN res = 0.0; +#ifdef MPI + double ALIGN mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); + g_omp_acc_re[thread_num] = 0.0; +#endif + double ALIGN ks,kc,ds,tr,ts,tt; + const spinor *s; + + ks = 0.0; + kc = 0.0; + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ix++) { + s = P + ix; + + ds = conj(s->s0.c0) * s->s0.c0 + + conj(s->s0.c1) * s->s0.c1 + + conj(s->s0.c2) * s->s0.c2 + + conj(s->s1.c0) * s->s1.c0 + + conj(s->s1.c1) * s->s1.c1 + + conj(s->s1.c2) * s->s1.c2 + + conj(s->s2.c0) * s->s2.c0 + + conj(s->s2.c1) * s->s2.c1 + + conj(s->s2.c2) * s->s2.c2 + + conj(s->s3.c0) * s->s3.c0 + + conj(s->s3.c1) * s->s3.c1 + + conj(s->s3.c2) * s->s3.c2; + + tr = ds + kc; + ts = tr + ks; + tt = ts-ks; + 
ks = ts; + kc = tr-tt; + } + kc=ks+kc; + +#ifdef OMP + g_omp_acc_re[thread_num] = kc; + + } /* OpenMP closing brace */ + + /* having left the parallel section, we can now sum up the Kahan + corrected sums from each thread into res */ + for(int i = 0; i < omp_num_threads; ++i) + res += g_omp_acc_re[i]; +#else + res = kc; +#endif + +# ifdef MPI + if(parallel) { + MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + return mres; + } +#endif + + return res; +} + +#endif + +#ifdef WITHLAPH +double square_norm_su3vect(su3_vector * const P, const int N, const int parallel) /* returns the sum over the N su3 vectors at P of |c0|^2 + |c1|^2 + |c2|^2; in MPI builds with parallel != 0 the sum over MPI_COMM_WORLD */ +{ + int ix; + double ALIGN ks,kc,ds,tr,ts,tt; + su3_vector *s; + + ks = 0.0; + kc = 0.0; + + for (ix = 0; ix < N; ix++) + { + s = P + ix; + + ds = creal(s->c0) * creal(s->c0) + cimag(s->c0) * cimag(s->c0) + + creal(s->c1) * creal(s->c1) + cimag(s->c1) * cimag(s->c1) + + creal(s->c2) * creal(s->c2) + cimag(s->c2) * cimag(s->c2); + + tr = ds + kc; /* compensated (Kahan-style) summation: ks is the running sum, kc the carried rounding correction */ + ts = tr + ks; + tt = ts-ks; + ks = ts; + kc = tr-tt; + } + kc = ks + kc; /* fold the final correction into the local result */ +# ifdef MPI + if(parallel) { + MPI_Allreduce(&kc, &ks, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); /* ks is reused as the receive buffer */ + return ks; + } +#endif + return kc; +} +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_norm.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..29aa88ba844ebf5224c9859d15d2f001aacd51c6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_norm.h @@ -0,0 +1,35 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _SQUARE_NORM_H +#define _SQUARE_NORM_H + +#include "su3.h" + +/* double square_norm(const spinor * const P, const int N, const int parallel) + * Returns the square norm of the N spinors starting at P; in MPI builds with parallel != 0 the result is summed over all ranks */ + +double square_norm(const spinor * const P, const int N, const int parallel); +double square_norm_su3vect(su3_vector * const P, const int N, const int parallel); + + +#endif + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_norm_32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_norm_32.c new file mode 100644 index 0000000000000000000000000000000000000000..e9b4f784066f3aef6c5986cb25355b3afa8ca73a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_norm_32.c @@ -0,0 +1,164 @@ +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +# include "global.h" +#endif +#include +#include "su3.h" +#include "square_norm_32.h" + +#if (defined BGQ && defined XLC) + +float square_norm_32(spinor32 * const P, const int N, const int parallel) { + float ALIGN32 res = 0.0; +#ifdef MPI + float ALIGN32 mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); +#endif + vector4double x0, x1, x2, x3, x4, x5, y0, y1, y2, y3, y4, y5; + vector4double ds,tt,tr,ts,kc,ks,buffer; + float *s ALIGN32; + + ks = vec_splats(0.); + kc = vec_splats(0.); + +#ifndef OMP +#pragma unroll(4) +#else +#pragma omp for +#endif + for(int i = 0; i < N; i++) { + s = (float*)((spinor32*) P+i); + __prefetch_by_load(P+i+1); + x0 = vec_ld(0, s); + x1 = vec_ld(0, s+4); + x2 = 
vec_ld(0, s+8); + x3 = vec_ld(0, s+12); + x4 = vec_ld(0, s+16); + x5 = vec_ld(0, s+20); + y0 = vec_mul(x0, x0); + y1 = vec_mul(x1, x1); + y2 = vec_mul(x2, x2); + y3 = vec_mul(x3, x3); + y4 = vec_mul(x4, x4); + y5 = vec_mul(x5, x5); + + x0 = vec_add(y0, y1); + x1 = vec_add(y2, y3); + x2 = vec_add(y4, y5); + x3 = vec_add(x0, x1); + ds = vec_add(x2, x3); + + tr = vec_add(ds, kc); + ts = vec_add(tr, ks); + tt = vec_sub(ts, ks); + ks = ts; + kc = vec_sub(tr, tt); + } + buffer = vec_add(kc,ks); + +#ifdef OMP + g_omp_acc_re[thread_num] = buffer[0] + buffer[1] + buffer[2] + buffer[3]; + } /* OpenMP closing brace */ + + for(int i = 0; i < omp_num_threads; ++i) + res += g_omp_acc_re[i]; +#else + res = buffer[0] + buffer[1] + buffer[2] + buffer[3]; +#endif + +# ifdef MPI + if(parallel) { + MPI_Allreduce(&res, &mres, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); + return mres; + } +# endif + + return res; +} + + +#else +float square_norm_32(const spinor32 * const P, const int N, const int parallel) +{ + float ALIGN32 res = 0.0; +#ifdef MPI + float ALIGN32 mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); + g_omp_acc_re[thread_num] = 0.0; +#endif + float ALIGN32 ks,kc,ds,tr,ts,tt; + const spinor32 *s; + + ks = 0.0; + kc = 0.0; + +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < N; ix++) { + s = P + ix; + + ds = conj(s->s0.c0) * s->s0.c0 + + conj(s->s0.c1) * s->s0.c1 + + conj(s->s0.c2) * s->s0.c2 + + conj(s->s1.c0) * s->s1.c0 + + conj(s->s1.c1) * s->s1.c1 + + conj(s->s1.c2) * s->s1.c2 + + conj(s->s2.c0) * s->s2.c0 + + conj(s->s2.c1) * s->s2.c1 + + conj(s->s2.c2) * s->s2.c2 + + conj(s->s3.c0) * s->s3.c0 + + conj(s->s3.c1) * s->s3.c1 + + conj(s->s3.c2) * s->s3.c2; + + tr = ds + kc; + ts = tr + ks; + tt = ts-ks; + ks = ts; + kc = tr-tt; + } + kc=ks+kc; + +#ifdef OMP + g_omp_acc_re[thread_num] = kc; + + } /* OpenMP closing brace */ + + /* having left the parallel section, we can now sum up the Kahan + corrected sums from each thread 
into kc */ + for(int i = 0; i < omp_num_threads; ++i) + res += g_omp_acc_re[i]; +#else + res = kc; +#endif + +# ifdef MPI + if(parallel) { + MPI_Allreduce(&res, &mres, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); + return mres; + } +#endif + + return res; +} + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_norm_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_norm_32.h new file mode 100644 index 0000000000000000000000000000000000000000..1a2eb92f4f8bd6419ded73d0e69b35ec8590c08c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg/square_norm_32.h @@ -0,0 +1,10 @@ +#ifndef _SQUARE_NORM_32_H +#define _SQUARE_NORM_32_H + +#include "su3.h" + +/* float square_norm_32(const spinor32 * const P, const int N, const int parallel) + * Returns the square norm of the N spinor32 elements starting at P; in MPI builds with parallel != 0 the result is summed over all ranks */ + +float square_norm_32(const spinor32 * const P, const int N, const int parallel); +#endif \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/linalg_eo.h b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg_eo.h new file mode 100644 index 0000000000000000000000000000000000000000..6084e47b18eabbc46ca720299a229b54c6d636fe --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/linalg_eo.h @@ -0,0 +1,69 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _LINALG_EO_H +#define _LINALG_EO_H + +#include "linalg/diff.h" +#include "linalg/diff_32.h" +#include "linalg/mul_r.h" +#include "linalg/mul_r_32.h" +#include "linalg/square_norm.h" +#include "linalg/square_norm_32.h" +#include "linalg/scalar_prod_r.h" +#include "linalg/scalar_prod_r_32.h" +#include "linalg/scalar_prod_i.h" +#include "linalg/square_and_prod_r.h" +#include "linalg/assign_add_mul_r.h" +#include "linalg/assign_add_mul_r_32.h" +#include "linalg/assign_mul_bra_add_mul_r.h" +#include "linalg/assign_add_mul_r_add_mul.h" +#include "linalg/assign_mul_bra_add_mul_ket_add_r.h" +#include "linalg/assign_mul_add_mul_add_mul_add_mul_r.h" +#include "linalg/diff_and_square_norm.h" +#include "linalg/assign.h" +#include "linalg/assign_to_32.h" +/* #include "linalg/deri_linalg.h" */ +#include "linalg/assign_mul_add_r.h" +#include "linalg/assign_mul_add_r_32.h" +#include "linalg/assign_mul_add_r_and_square.h" +#include "linalg/scalar_prod.h" +#include "linalg/mul_diff_mul.h" +#include "linalg/assign_add_mul.h" +#include "linalg/assign_mul_add.h" +#include "linalg/assign_diff_mul.h" +#include "linalg/mul_add_mul.h" +#include "linalg/mul.h" +#include "linalg/assign_add_mul_add_mul.h" +#include "linalg/assign_mul_bra_add_mul_ket_add.h" +#include "linalg/add.h" +#include "linalg/addto_32.h" +#include "linalg/assign_to_32.h" +#include "linalg/assign_mul_add_mul_r.h" +#include "linalg/assign_mul_add_mul_r_32.h" +#include "linalg/assign_mul_add_mul_add_mul_r.h" +#include "linalg/mul_add_mul_r.h" + +#include "linalg/comp_decomp.h" + +#include "linalg/mattimesvec.h" + +#include "linalg/convert_eo_to_lexic.h" + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/little_D.c b/qcd/part_cpu/applications/QCD/src/kernel_D/little_D.c new file mode 100644 index 
0000000000000000000000000000000000000000..30775e9c537ab7cbc2b491ae09c012616d50bf0b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/little_D.c @@ -0,0 +1,830 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Albert Deuzeman, Siebren Reker, Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include +#include "block.h" +#include "linalg/blas.h" +#include "solver/gcr4complex.h" +#include "solver/generate_dfl_subspace.h" +#include "block.h" +#include "linalg_eo.h" +#include "little_D.h" + + +/* assume we have a little field w */ +/* which has length 9*nb_blocks*N_s */ +/* with usual order in space */ +/* nb_blocks = 2 currently fixed */ +/* and blocks devide z-direction by 2 */ +/* */ +/* block[0], block[1], block[0], block[1], block[0] ... */ +/* local , +t , -t ... */ +/* */ +/* block[0], block[1], block[0], block[1] */ +/* +z , -z */ +/* wasting some memory here... 
*/ + +int dfl_subspace_updated = 1; + +/* some lapack related stuff */ +static int ONE = 1; +static _Complex double CONE, CZERO, CMONE; + +enum{ + NONE = 0, + T_UP = 1, + T_DN = 2, + X_UP = 3, + X_DN = 4, + Y_UP = 5, + Y_DN = 6, + Z_UP = 7, + Z_DN = 8 +} Direction; + +void init_little_field_exchange(_Complex double * w); +void wait_little_field_exchange(const int mu); + +void unit_little_D(_Complex double *v, _Complex double *w) { + memcpy(v, w, nb_blocks*g_N_s*sizeof(_Complex double)); + + return; +} + +/** ANOTHER TESTING FUNCTION */ +void invert_little_D_spinor(spinor *r, spinor *s){ + int i, j; + spinor **psi; + _Complex double *v, *w; + psi = calloc(nb_blocks, sizeof(spinor)); + v = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + w = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + psi[0] = calloc(VOLUME+nb_blocks, sizeof(spinor)); + for(i = 1; i < nb_blocks; i++) { + psi[i] = psi[i-1] + (VOLUME / nb_blocks) +1; + } + split_global_field_GEN(psi, s, nb_blocks); // ADAPT THIS + + for (j = 0; j < g_N_s; ++j) {/*loop over block.basis */ + for(i=0;i 0) { + printf("lgcr: %d iterations in invert_little_D_spinor\n", i); + } + + for(i = 0; i < nb_blocks; i++) { + mul(psi[i], w[i*g_N_s], block_list[i].basis[0], VOLUME/nb_blocks); + } + for(j = 1; j < g_N_s; ++j) { + for(i = 0; i < nb_blocks; i++) { + assign_add_mul(psi[i], block_list[i].basis[j], w[j+i*g_N_s], VOLUME/nb_blocks); + } + } + reconstruct_global_field_GEN(r, psi, nb_blocks); // ADAPT THIS + + free(v); + free(w); + free(psi[0]); + free(psi); +} + + +/** ANOTHER TESTING FUNCTION */ +void invert_little_D_eo_spinor(spinor *r, spinor *s){ + int i, j, iter,i_o, i_e; + spinor **psi; + _Complex double *v, *w, *v_o, *v_e, * v_eo, * w_eo, * ctmp2; + psi = calloc(nb_blocks, sizeof(spinor)); + v = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + w = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + v_e = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + v_o = calloc(nb_blocks 
* 9 * g_N_s, sizeof(_Complex double)); + v_eo = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + w_eo = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + ctmp2 = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + psi[0] = calloc(VOLUME+nb_blocks, sizeof(spinor)); + for(i = 1; i < nb_blocks; i++) { + psi[i] = psi[i-1] + (VOLUME / nb_blocks) +1; + } + split_global_field_GEN(psi, s, nb_blocks); // ADAPT THIS + + for (j = 0; j < g_N_s; ++j) {/*loop over block.basis */ + i_e=0; + i_o=0; + for(i=0;i 0) { + printf("lgcr: %d iterations in invert_little_D_eo_spinor\n", iter); + } + + for(i = 0; i < nb_blocks; i++) { + mul(psi[i], w[i*g_N_s], block_list[i].basis[0], VOLUME/nb_blocks); + } + for(j = 1; j < g_N_s; ++j) { + for(i = 0; i < nb_blocks; i++) { + assign_add_mul(psi[i], block_list[i].basis[j], w[j+i*g_N_s], VOLUME/nb_blocks); + } + } + reconstruct_global_field_GEN(r, psi, nb_blocks); // ADAPT THIS + + free(v); + free(w); + free(w_eo); + free(v_eo); + free(v_o); + free(v_e); + free(ctmp2); + free(psi[0]); + free(psi); +} + + +void project2(spinor * const out, spinor * const in); + +/** ANOTHER TESTING FUNCTION */ +void apply_little_D_spinor(spinor *r, spinor *s){ + int i,j, k; + spinor **psi; + _Complex double *v, *w; + + psi = (spinor **)calloc(nb_blocks, sizeof(spinor *)); + v = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + w = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + psi[0] = calloc(VOLUME + nb_blocks, sizeof(spinor)); + for(i = 1; i < nb_blocks; i++) { + psi[i] = psi[i-1] + (VOLUME / nb_blocks) + 1; + } + split_global_field_GEN(psi, s, nb_blocks); + + for (j = 0; j < g_N_s; ++j) { + for(i = 0; i < nb_blocks; i++) v[j + i*g_N_s] = scalar_prod(block_list[i].basis[j], psi[i], VOLUME/nb_blocks, 0); + } + + if (g_debug_level > 2){ + if (!g_cart_id) { + for (j = 0; j < nb_blocks* g_N_s; ++j) { + printf("LITTLE_D for 0: v[%u] = %1.5e + %1.5e i\n", j, creal(v[j]), cimag(v[j])); + } + } +#ifdef MPI + 
MPI_Barrier(MPI_COMM_WORLD); +#endif + } + + if (g_debug_level > 4) { + for (k = 1; k < 16; ++k) { + if (g_cart_id == k) { + for (j = 0; j < nb_blocks* g_N_s; ++j) { + printf("LITTLE_D for %u: v[%u] = %1.5e + %1.5e i\n", k, j, creal(v[j]), cimag(v[j])); + } + } +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + } + } + + little_D(w, v); + + if (g_debug_level > 2){ + if (!g_cart_id){ + for (j = 0; j < nb_blocks * g_N_s; ++j) { + printf("LITTLE_D for 0: w[%u] = %1.5e + %1.5e i\n", j, creal(w[j]), cimag(w[j])); + } + } +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + } + + if (g_debug_level > 4) { + for (k = 1; k < 16; ++k) { + if (g_cart_id == k) { + for (j = 0; j < nb_blocks* g_N_s; ++j) { + printf("LITTLE_D for %u: w[%u] = %1.5e + %1.5e i\n", k, j, creal(w[j]), cimag(w[j])); + } + } +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + } + } + for(i = 0; i < nb_blocks; i++) { + mul(psi[i], w[i*g_N_s], block_list[i].basis[0], VOLUME/nb_blocks); + } + for(j = 1; j < g_N_s; ++j) { + for(i = 0; i < nb_blocks; i++){ + assign_add_mul(psi[i], block_list[i].basis[j], w[i*g_N_s + j], VOLUME/nb_blocks); + } + } + reconstruct_global_field_GEN(r, psi, nb_blocks); + + free(v); + free(w); + free(psi[0]); + free(psi); +} + + +void alt_little_field_gather(_Complex double * w) { /* fills the neighbour segments of w (g_N_s entries each) from the adjacent ranks using buffered sends; does nothing in non-MPI builds */ +#ifdef MPI + MPI_Status status; + int size = 25 * g_N_s * sizeof(_Complex double); + _Complex double *buf = malloc(size); + MPI_Buffer_attach((void*)buf, size); /* user buffer backing the MPI_Bsend calls below */ + + /* LOWER BLOCK */ + + /* Send t up */ + MPI_Bsend(w, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_UP, g_cart_grid); + MPI_Recv(w + 4 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_UP, g_cart_grid, &status); + + /* Send t down */ + MPI_Bsend(w, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_DN, g_cart_grid); + MPI_Recv(w + 2 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_DN, g_cart_grid, &status); + + /* Send x up */ + MPI_Bsend(w, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_UP, g_cart_grid); + MPI_Recv(w + 8 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, 
g_nb_x_dn, X_UP, g_cart_grid, &status); + + /* Send x down */ + MPI_Bsend(w, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_dn, X_DN, g_cart_grid); + MPI_Recv(w + 6 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_DN, g_cart_grid, &status); + + /* Send y up */ + MPI_Bsend(w, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_UP, g_cart_grid); + MPI_Recv(w + 12 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_UP, g_cart_grid, &status); + + /* Send y down */ + MPI_Bsend(w, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_DN, g_cart_grid); + MPI_Recv(w + 10 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_DN, g_cart_grid, &status); + + /* Send z up */ + memcpy(w + 17 * g_N_s, w, g_N_s * sizeof(_Complex double)); + + /* Send z down */ + MPI_Bsend(w, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_dn, Z_DN, g_cart_grid); + MPI_Recv(w + 15 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_up, Z_DN, g_cart_grid, &status); + + /* END LOWER BLOCK */ + + MPI_Barrier(MPI_COMM_WORLD); + + /* UPPER BLOCK */ + + /* Send t up */ + MPI_Bsend(w + g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_UP, g_cart_grid); + MPI_Recv(w + 5 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_UP, g_cart_grid, &status); + + /* Send t down */ + MPI_Bsend(w + g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_DN, g_cart_grid); + MPI_Recv(w + 3 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_DN, g_cart_grid, &status); + + /* Send x up */ + MPI_Bsend(w + g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_UP, g_cart_grid); + MPI_Recv(w + 9 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_dn, X_UP, g_cart_grid, &status); + + /* Send x down */ + MPI_Bsend(w + g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_dn, X_DN, g_cart_grid); + MPI_Recv(w + 7 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_DN, g_cart_grid, &status); + + /* Send y up */ + MPI_Bsend(w + g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_UP, g_cart_grid); + MPI_Recv(w + 13 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_UP, g_cart_grid, &status); + + /* Send y down */ + MPI_Bsend(w + g_N_s, g_N_s, 
MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_DN, g_cart_grid); + MPI_Recv(w + 11 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_DN, g_cart_grid, &status); + + /* Send z up */ + MPI_Bsend(w + g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_up, Z_UP, g_cart_grid); + MPI_Recv(w + 16 * g_N_s, g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_dn, Z_UP, g_cart_grid, &status); + + /* Send z down */ + memcpy(w + 14 * g_N_s, w + g_N_s, g_N_s * sizeof(_Complex double)); + + MPI_Barrier(MPI_COMM_WORLD); + MPI_Buffer_detach(&buf, &size); /* detach expects the address of the buffer pointer; passing buf itself made MPI store the returned address inside the heap block */ + + free(buf); +#endif + return; +} + +#ifdef MPI +MPI_Request lrequests[16]; +MPI_Status lstatus[16]; +int waitcount = 0; +#endif + + +void little_field_gather(_Complex double * w) { +#ifdef MPI + int err, bt, bx, by, bz, pm, ib; + _Complex double *wt, *wx, *wy, *wz; + _Complex double *wt_buf, *wx_buf, *wy_buf, *wz_buf, *w_buf, *w_source, *w_dest; + /************************************************************************/ + /* This routine has been extended for multi_dimensional blocking */ + /* by Claude Tadonki (claude.tadonki@u-psud.fr) from PetaQCD project */ + /* June 2010 */ + /************************************************************************/ + + w_buf = calloc(8 * nb_blocks * g_N_s, sizeof(_Complex double)); // +-t +-x +-y +-z + + wt = w + ( 0*(2*nb_blocks) + nb_blocks ) * g_N_s; // Where data in the direction t starts + wx = w + ( 1*(2*nb_blocks) + nb_blocks ) * g_N_s; // Where data in the direction x starts + wy = w + ( 2*(2*nb_blocks) + nb_blocks ) * g_N_s; // Where data in the direction y starts + wz = w + ( 3*(2*nb_blocks) + nb_blocks ) * g_N_s; // Where data in the direction z starts + + wt_buf = w_buf + ( 0*(2*nb_blocks)) * g_N_s; // Where data in the direction t starts + wx_buf = w_buf + ( 1*(2*nb_blocks)) * g_N_s; // Where data in the direction x starts + wy_buf = w_buf + ( 2*(2*nb_blocks)) * g_N_s; // Where data in the direction y starts + wz_buf = w_buf + ( 3*(2*nb_blocks)) * g_N_s; // Where data in the direction z starts + + /* We first exchange the 
fields regardless of block considerations */ + /* The data need to be received in an intermediate buffer because of later shuffling */ + + /* Send t up */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_UP, g_cart_grid, &lrequests[0]); + MPI_Irecv(wt_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_UP, g_cart_grid, &lrequests[1]); + + /* Send t down */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_DN, g_cart_grid, &lrequests[2]); + MPI_Irecv(wt_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_DN, g_cart_grid, &lrequests[3]); + + /* Send x up */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_UP, g_cart_grid, &lrequests[4]); + MPI_Irecv(wx_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_dn, X_UP, g_cart_grid, &lrequests[5]); + + /* Send x down */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_dn, X_DN, g_cart_grid, &lrequests[6]); + MPI_Irecv(wx_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_DN, g_cart_grid, &lrequests[7]); + + /* Send y up */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_UP, g_cart_grid, &lrequests[8]); + MPI_Irecv(wy_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_UP, g_cart_grid, &lrequests[9]); + + /* Send y down */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_DN, g_cart_grid, &lrequests[10]); + MPI_Irecv(wy_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_DN, g_cart_grid, &lrequests[11]); + + /* Send z up */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_up, Z_UP, g_cart_grid, &lrequests[12]); + MPI_Irecv(wz_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_dn, Z_UP, g_cart_grid, &lrequests[13]); + + /* Send z down */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_dn, Z_DN, g_cart_grid, &lrequests[14]); + MPI_Irecv(wz_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_up, Z_DN, 
g_cart_grid, &lrequests[15]); + + err = MPI_Waitall(16, lrequests, lstatus); + + /* We now correct the field according to block partitionning */ + /* We could have avoid the previous corresponding MPI communication */ + /* We proceed like this for code simplicity, maybe will be optimized later */ + + for(pm = 0; pm < 8; pm++) { + for(bt = 0; bt < nblks_t; bt++) { + for(bx = 0; bx < nblks_x; bx++) { + for(by = 0; by < nblks_y; by++) { + for(bz = 0; bz < nblks_z; bz++) { + ib = block_index(bt, bx, by, bz) * g_N_s; + switch(pm){ + case 0: /* Direction +t */ + w_dest = wt + ib; + if( bt == nblks_t - 1 ) {ib = block_index(0, bx, by, bz) * g_N_s; w_source = wt_buf + ib;} // got it from the MPI exchange + else {ib = block_index(bt + 1, bx, by, bz) * g_N_s; w_source = w + ib;} // got it from the diagonal block + break; + case 1: /* Direction -t */ + w_dest = wt + ib + nb_blocks * g_N_s; + if( bt == 0 ) {ib = block_index(nblks_t - 1, bx, by, bz) * g_N_s; w_source = wt_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange + else {ib = block_index(bt - 1, bx, by, bz) * g_N_s;w_source = w + ib;} // got it from the diagonal block + break; + case 2: /* Direction +x */ + w_dest = wx + ib; + if( bx == nblks_x - 1 ) {ib = block_index(bt, 0, by, bz) * g_N_s; w_source = wx_buf + ib;} // got it from the MPI exchange + else {ib = block_index(bt, bx + 1, by, bz) * g_N_s; w_source = w + ib;} // got it from the diagonal block + break; + case 3: /* Direction -x */ + w_dest = wx + ib + nb_blocks * g_N_s; + if( bx == 0 ) {ib = block_index(bt, nblks_x - 1, by, bz) * g_N_s; w_source = wx_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange + else {ib = block_index(bt, bx - 1, by, bz) * g_N_s;w_source = w + ib;} // got it from the diagonal block + break; + case 4: /* Direction +y */ + w_dest = wy + ib; + if( by == nblks_y - 1 ) {ib = block_index(bt, bx, 0, bz) * g_N_s; w_source = wy_buf + ib;} // got it from the MPI exchange + else {ib = block_index(bt, bx, by + 1, bz) * 
g_N_s; w_source = w + ib;} // got it from the diagonal block + break; + case 5: /* Direction -y */ + w_dest = wy + ib + nb_blocks * g_N_s; + if( by == 0 ) {ib = block_index(bt, bx, nblks_y - 1, bz) * g_N_s; w_source = wy_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange + else {ib = block_index(bt, bx, by - 1, bz) * g_N_s;w_source = w + ib;} // got it from the diagonal block + break; + case 6: /* Direction +z */ + w_dest = wz + ib; + if( bz == nblks_z - 1 ) {ib = block_index(bt, bx, by, 0) * g_N_s; w_source = wz_buf + ib; } // got it from the MPI exchange + else {ib = block_index(bt, bx, by, bz + 1) * g_N_s; w_source = w + ib; } // got it from the diagonal block + break; + case 7: /* Direction -z */ + w_dest = wz + ib + nb_blocks * g_N_s; + if( bz == 0 ) {ib = block_index(bt, bx, by, nblks_z - 1) * g_N_s; w_source = wz_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange + else {ib = block_index(bt, bx, by, bz - 1) * g_N_s; w_source = w + ib; } // got it from the diagonal block + break; + + default: + w_dest = NULL; + w_source = NULL; + } + memcpy(w_dest, w_source, g_N_s * sizeof(_Complex double)); + } + } + } + } + } + + free(w_buf); + +#endif + return; +} + +void little_field_gather_eo(int eo, _Complex double * w) { +#ifdef MPI + int err, bt, bx, by, bz, pm, ib,ib2; + _Complex double *wt, *wx, *wy, *wz; + _Complex double *wt_buf, *wx_buf, *wy_buf, *wz_buf, *w_buf, *w_source, *w_dest; + + w_buf = calloc(8 * nb_blocks * g_N_s, sizeof(_Complex double)); // +-t +-x +-y +-z + + wt = w + ( 0*(2*nb_blocks) + nb_blocks ) * g_N_s; // Were data in the direction t starts + wx = w + ( 1*(2*nb_blocks) + nb_blocks ) * g_N_s; // Were data in the direction x starts + wy = w + ( 2*(2*nb_blocks) + nb_blocks ) * g_N_s; // Were data in the direction y starts + wz = w + ( 3*(2*nb_blocks) + nb_blocks ) * g_N_s; // Were data in the direction z starts + + wt_buf = w_buf + ( 0*(2*nb_blocks)) * g_N_s; // Were data in the direction t starts + wx_buf = w_buf + ( 
1*(2*nb_blocks)) * g_N_s; // Were data in the direction x starts + wy_buf = w_buf + ( 2*(2*nb_blocks)) * g_N_s; // Were data in the direction y starts + wz_buf = w_buf + ( 3*(2*nb_blocks)) * g_N_s; // Were data in the direction z starts + + /* We first exchange the fields regardless of block considerations */ + /* The data need to be received in an intermediate buffer because of later shuffling */ + + /* Send t up */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_UP, g_cart_grid, &lrequests[0]); + MPI_Irecv(wt_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_UP, g_cart_grid, &lrequests[1]); + + /* Send t down */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_dn, T_DN, g_cart_grid, &lrequests[2]); + MPI_Irecv(wt_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_t_up, T_DN, g_cart_grid, &lrequests[3]); + + /* Send x up */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_UP, g_cart_grid, &lrequests[4]); + MPI_Irecv(wx_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_dn, X_UP, g_cart_grid, &lrequests[5]); + + /* Send x down */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_dn, X_DN, g_cart_grid, &lrequests[6]); + MPI_Irecv(wx_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_x_up, X_DN, g_cart_grid, &lrequests[7]); + + /* Send y up */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_UP, g_cart_grid, &lrequests[8]); + MPI_Irecv(wy_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_UP, g_cart_grid, &lrequests[9]); + + /* Send y down */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_dn, Y_DN, g_cart_grid, &lrequests[10]); + MPI_Irecv(wy_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_y_up, Y_DN, g_cart_grid, &lrequests[11]); + + /* Send z up */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_up, Z_UP, g_cart_grid, &lrequests[12]); + MPI_Irecv(wz_buf + nb_blocks * g_N_s, nb_blocks * g_N_s, 
MPI_DOUBLE_COMPLEX, g_nb_z_dn, Z_UP, g_cart_grid, &lrequests[13]); + + /* Send z down */ + MPI_Isend(w, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_dn, Z_DN, g_cart_grid, &lrequests[14]); + MPI_Irecv(wz_buf, nb_blocks * g_N_s, MPI_DOUBLE_COMPLEX, g_nb_z_up, Z_DN, g_cart_grid, &lrequests[15]); + + err = MPI_Waitall(16, lrequests, lstatus); + + /* We now correct the field according to block partitionning */ + /* We could have avoid the previous corresponding MPI communication */ + /* We proceed like this for code simplicity, maybe will be optimized later */ + + for(pm = 0; pm < 8; pm++) { + ib2=0; + for(bt = 0; bt < nblks_t; bt++) { + for(bx = 0; bx < nblks_x; bx++) { + for(by = 0; by < nblks_y; by++) { + for(bz = 0; bz < nblks_z; bz++) { + if ((bt+bx+by+bz)%2==eo) { + ib2 = index_block_eo[block_index(bt, bx, by, bz)] * g_N_s; + + switch(pm){ + case 0: /* Direction +t */ + w_dest = wt + ib2; + if( bt == nblks_t - 1 ) {ib = index_block_eo[block_index(0,bx, by,bz)] * g_N_s; w_source = wt_buf + ib; } // got it from the MPI exchange + else {ib = index_block_eo[block_index(bt+1, bx, by, bz)] * g_N_s; w_source = w + ib; } // got it from the diagonal block + break; + case 1: /* Direction -t */ + w_dest = wt + ib2 + nb_blocks * g_N_s; + if( bt == 0) {ib = index_block_eo[block_index(nblks_t-1, bx,by,bz)] * g_N_s; w_source = wt_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange + else {ib = index_block_eo[block_index(bt-1,bx, by, bz)] * g_N_s; w_source = w + ib; } // got it from the diagonal block + break; + case 2: /* Direction +x */ + w_dest = wx + ib2; + if( bx == nblks_x - 1 ) {ib = index_block_eo[block_index(bt, 0, by,bz)] * g_N_s; w_source = wx_buf + ib; } // got it from the MPI exchange + else {ib = index_block_eo[block_index(bt, bx+1, by, bz)] * g_N_s; w_source = w + ib; } // got it from the diagonal block + break; + case 3: /* Direction -x */ + w_dest = wx + ib2 + nb_blocks * g_N_s; + if( bx == 0) {ib = index_block_eo[block_index(bt, nblks_x-1, by,bz)] * 
g_N_s; w_source = wx_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange + else {ib = index_block_eo[block_index(bt, bx-1, by, bz)] * g_N_s; w_source = w + ib; } // got it from the diagonal block + break; + case 4: /* Direction +y */ + w_dest = wy + ib2; + if( by == nblks_y - 1 ) {ib = index_block_eo[block_index(bt, bx, 0,bz)] * g_N_s; w_source = wy_buf + ib; } // got it from the MPI exchange + else {ib = index_block_eo[block_index(bt, bx, by+1, bz)] * g_N_s; w_source = w + ib; } // got it from the diagonal block + break; + case 5: /* Direction -y */ + w_dest = wy + ib2 + nb_blocks * g_N_s; + if( by == 0) {ib = index_block_eo[block_index(bt, bx, nblks_y-1, bz)] * g_N_s; w_source = wy_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange + else {ib = index_block_eo[block_index(bt, bx, by-1, bz)] * g_N_s; w_source = w + ib; } // got it from the diagonal block + break; + case 6: /* Direction +z */ + w_dest = wz + ib2; + if( bz == nblks_z - 1 ) {ib = index_block_eo[block_index(bt, bx, by, 0)] * g_N_s; w_source = wz_buf + ib; } // got it from the MPI exchange + else {ib = index_block_eo[block_index(bt, bx, by, bz + 1)] * g_N_s; w_source = w + ib; } // got it from the diagonal block + break; + case 7: /* Direction -z */ + w_dest = wz + ib2 + nb_blocks * g_N_s; + if( bz == 0) {ib = index_block_eo[block_index(bt, bx, by, nblks_z - 1)] * g_N_s; w_source = wz_buf + ib + nb_blocks * g_N_s;} // got it from the MPI exchange + else {ib = index_block_eo[block_index(bt, bx, by, bz - 1)] * g_N_s; w_source = w + ib; } // got it from the diagonal block + break; + default: + w_dest = NULL; + w_source = NULL; + } + memcpy(w_dest, w_source, g_N_s * sizeof(_Complex double)); + } + } + } + } + } + } + free(w_buf); +#endif + return; +} + + + +void little_D(_Complex double * v, _Complex double *w) { + int i, j, sq = g_N_s*g_N_s; + CONE = 1.0; + CMONE = -1.0; + CZERO = 0.0; + + if(dfl_subspace_updated) { + compute_little_D(); + dfl_subspace_updated = 0; + } + +#ifdef MPI 
+ /*init_little_field_exchange(w);*/ + little_field_gather(w); +#endif + + /* all the mpilocal stuff first */ + for(i = 0; i < nb_blocks; i++) { + /* diagonal term */ + _FT(zgemv)("N", &g_N_s, &g_N_s, &CONE, block_list[i].little_dirac_operator, + &g_N_s, w + i * g_N_s, &ONE, &CZERO, v + i * g_N_s, &ONE, 1); + + /* offdiagonal terms */ + for(j = 1; j < 9; j++) { + _FT(zgemv)("N", &g_N_s, &g_N_s, &CONE, block_list[i].little_dirac_operator + j * sq, + &g_N_s, w + (nb_blocks * j + i) * g_N_s, &ONE, &CONE, v + i * g_N_s, &ONE, 1); + } + } + return; +} + + +void little_D_sym(_Complex double * v, _Complex double *w) { + + _Complex double* tmpc1, * tmpc2, * tmpc3; + tmpc1 = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + tmpc2 = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + tmpc3 = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + + if(dfl_subspace_updated) { + compute_little_D(); + dfl_subspace_updated = 0; + } + + little_D_hop(0,tmpc1, w); + little_D_ee_inv(tmpc2,tmpc1); + little_D_hop(1,tmpc3, tmpc2); + little_Dhat_lhs(v, w,tmpc3); + + free(tmpc1); + free(tmpc2); + free(tmpc3); + return; +} + + +void little_D_ee_inv(_Complex double * v, _Complex double *w) { + int i; + CONE = 1.0; + CMONE = -1.0; + CZERO = 0.0; + + for(i = 0; i < nb_blocks/2; i++) { + _FT(zgemv)("N", &g_N_s, &g_N_s, &CONE, block_list[i].little_dirac_operator_eo, + &g_N_s, w + i * g_N_s, &ONE, &CZERO, v + i * g_N_s, &ONE, 1); + } + return; +} + + +void little_D_hop(int eo,_Complex double * v, _Complex double *w) { + int i, j, i_eo,sq = g_N_s*g_N_s; + CONE = 1.0; + CMONE = -1.0; + CZERO = 0.0; + + i_eo=(eo+1)%2; + +#ifdef MPI + /*init_little_field_exchange(w);*/ + little_field_gather_eo(eo,w+i_eo*nb_blocks*g_N_s/2); +#endif + + for(i = 0; i < nb_blocks/2; i++) { + for(j = 1; j < 9; j++) { + _FT(zgemv)("N", &g_N_s, &g_N_s, &CONE, block_list[eo*(nb_blocks/2)+i].little_dirac_operator_eo + j * sq, + &g_N_s, w + (nb_blocks * j + (nb_blocks/2)*i_eo+i) * g_N_s, &ONE, &CONE, v + 
(eo*nb_blocks/2+i) * g_N_s, &ONE, 1); + } + } + return; +} + +void little_Dhat_lhs(_Complex double * v, _Complex double *w, _Complex double *u) { + int i,j; + CONE = 1.0; + CMONE = -1.0; + CZERO = 0.0; + + + for(i = nb_blocks/2; i < nb_blocks; i++) { + _FT(zgemv)("N", &g_N_s, &g_N_s, &CONE, block_list[i].little_dirac_operator_eo, + &g_N_s, w + i * g_N_s, &ONE, &CZERO, v + i * g_N_s, &ONE, 1); + } + + for (i=nb_blocks/2; i < nb_blocks; i++) { + for (j=0;j. + ***********************************************************************/ + +#ifndef _LITTLE_D_H +#define _LITTLE_D_H + +#include + +extern int dfl_subspace_updated; +void little_D(_Complex double * v, _Complex double *w); +void little_D_sym(_Complex double * v, _Complex double *w); +void little_D_ee_inv(_Complex double * v, _Complex double *w); +void little_D_hop(int eo,_Complex double * v, _Complex double *w); +void little_Dhat_lhs(_Complex double * v, _Complex double *w, _Complex double *u); +void little_Dhat_rhs(int eo, _Complex double * v, double r, _Complex double *w); +void unit_little_D(_Complex double *v, _Complex double *w); +void invert_little_D_spinor(spinor *r, spinor *s); +void invert_little_D_eo_spinor(spinor *r, spinor *s); +void apply_little_D_spinor(spinor *r, spinor *s); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/matrix_utils.c b/qcd/part_cpu/applications/QCD/src/kernel_D/matrix_utils.c new file mode 100644 index 0000000000000000000000000000000000000000..fc2637a08c6765f209d0082981a0d774945578ef --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/matrix_utils.c @@ -0,0 +1,138 @@ +/*********************************************************************** + * + * Copyright (C) 2013 Albert Deuzeman + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + ************************************************************************/ + +#if HAVE_CONFIG_H +#include +#endif +#include +#include + +#if (defined SSE || defined SSE2 || defined SSE3) +# include "sse.h" +#endif +#include "su3.h" + +#ifndef OMP +static +#endif +void exponent_from_coefficients(su3 *out, _Complex double f0, _Complex double f1, _Complex double f2, su3 const *in) +{ + su3 ALIGN tmp; + _complex_times_su3(tmp, f2, *in); + _su3_add_equals_complex_identity(tmp, f1); + _su3_times_su3(*out, tmp, *in); + _su3_add_equals_complex_identity(*out, f0); +} + +void cayley_hamilton_exponent(su3* expA, su3 const *A) +{ + static double const fac_1_3 = 1 / 3.0; + + _Complex double f0,f1,f2; + + /* c0 = det[A] */ + double c0 = I * (A->c00 * (A->c11 * A->c22 - A->c12 * A->c21) + + A->c01 * (A->c12 * A->c20 - A->c10 * A->c22) + + A->c02 * (A->c10 * A->c21 - A->c11 * A->c20) ); + + /* c1 = 0.5 * Tr[AA] */ + double c1 = -0.5 * (A->c00 * A->c00 + A->c01 * A->c10 + A->c02 * A->c20 + + A->c10 * A->c01 + A->c11 * A->c11 + A->c12 * A->c21 + + A->c20 * A->c02 + A->c21 * A->c12 + A->c22 * A->c22 ); + + /* There is a special, but common (cold start) case where the given matrix is actually 0! + * We need to account for it. 
*/ + if (c0 == 0 && c1 == 0) + { + _su3_one(*expA); + f1 = I; + f2 = -0.5; + return; + } + + /* P&M give symmetry relations that can be used when c0 < 0, to avoid the numerically problematic c0 -> -c0_max limit. + We note the sign here for future reference, then continue with c0 as if it were positive. */ + int c0_negative = (c0 < 0); + c0 = fabs(c0); + + /* The call to fmin below is needed, because for small deviations alpha from zero -- O(10e-12) -- rounding errors can cause c0 > c0max by epsilon. + In that case, acos(c0/c0max) will produce NaNs, whereas the mathematically correct conclusion would be that theta is zero to machine precision! + Note that this approach will *not* produce identity and zero for all output, but rather the correct answer of order (I + alpha) for exp(iQ). */ + + double c0max = 2.0 * pow(fac_1_3 * c1, 1.5); + double theta_3 = fac_1_3 * acos(fmin(c0 / c0max, 1.0)); + + double u = sqrt(fac_1_3 * c1) * cos(theta_3); + double w = sqrt(c1) * sin(theta_3); + + /* Calculate and cache some repeating factors. * + * We can fold in the sign immediately -- c.f. f_j(-c0, c1) = -1^j * conj(f_j(c0, c1)) + * This should just amount to potentially adding a minus to all imaginary components and an overall phase for f1. */ + _Complex double ma = cexp(2 * I * u); + _Complex double mb = cexp(-I * u); + double cw = cos(w); + double u2 = u * u; + double w2 = w * w; + + /* Modification w.r.t. Peardon & Morningstar: w is always positive, so |w| = w */ + double xi0 = (w > 0.05) ? 
(sin(w) / w) + : 1 - 0.16666666666666667 * w2 * (1 - 0.05 * w2 * (1 - 0.023809523809523808 * w2)); + double divisor = 1.0 / (9.0 * u2 - w2); + + f0 = divisor * (ma * (u * u - w * w) + mb * (8 * u * u * cw + 2 * I * u * (3 * u * u + w * w) * xi0)); + f1 = divisor * (-2 * I * u * ma + mb * (2 * I * u * cw + (3 * u * u - w * w) * xi0)); + f2 = divisor * (mb * (cw + 3 * I * u * xi0) - ma); + + /* The first point where we use the symmetry relations to calculate the negative c0 possibility */ + if (c0_negative) + { + f0 = conj(f0); + f1 = conj(f1); + f2 = conj(f2); + } + + exponent_from_coefficients(expA, f0, f1, f2, A); + + return; + } + +void project_traceless_antiherm(su3 *in) +{ + static const double fac_3 = 1.00 / 3.00; + double tr_in = fac_3 * (cimag(in->c00) + cimag(in->c11) + cimag(in->c22)); + + in->c00 = (cimag(in->c00) - tr_in) * I; + in->c11 = (cimag(in->c11) - tr_in) * I; + in->c22 = (cimag(in->c22) - tr_in) * I; + + in->c01 -= conj(in->c10); + in->c01 *= 0.50; + in->c10 = -conj(in->c01); + + in->c02 -= conj(in->c20); + in->c02 *= 0.50; + in->c20 = -conj(in->c02); + + in->c12 -= conj(in->c21); + in->c12 *= 0.50; + in->c21 = -conj(in->c12); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/matrix_utils.h b/qcd/part_cpu/applications/QCD/src/kernel_D/matrix_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..c4dec2f8c4afee9b9a16b51981779d7a62aa06ea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/matrix_utils.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * + * Copyright (C) 2013 Albert Deuzeman + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + ************************************************************************/ + +#ifndef _MATRIX_UTILS_H +#define _MATRIX_UTILS_H + +void cayley_hamilton_exponent(su3* expA, su3 const *A); +void project_traceless_antiherm(su3* M); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/meas/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..5c3019fe98a5769af23686e4ebcbd304fc7ff4ba --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/Makefile @@ -0,0 +1,94 @@ + +srcdir = . +top_builddir = .. +abs_top_builddir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +top_srcdir = .. +abs_top_srcdir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +subdir = meas +builddir = . + +CFLAGS = -std=c99 -fopenmp -pedantic -Wall +DEPFLAGS = -MM +LDFLAGS = -L${HOME}/lib -L${top_builddir}/lib +DEFS = -DHAVE_CONFIG_H +OPTARGS = -O + +AR = ar +RANLIB = ranlib +CC = mpicc +CCDEP = gcc +CCLD = $(CC) +LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) ${OPTARGS} -o $@ +LEX = flex +AUTOCONF = autoconf +DEFS = -DHAVE_CONFIG_H + +LEMON_AVAILABLE = 0 + +INCLUDES = -I$(HOME)/include/ -I. 
-I${abs_top_builddir}/ -I${abs_top_srcdir}/ -I/include/ -I/include/ +LDADD = +COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} ${OPTARGS} + +LIBRARIES = libmeas + +libmeas_TARGETS = measurements \ + oriented_plaquettes \ + correlators \ + pion_norm \ + polyakov_loop \ + energy_density gradient_flow + +libmeas_OBJECTS = $(addsuffix .o, ${libmeas_TARGETS}) + +# default rule + +all: Makefile dep libmeas.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) -g +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) +profile all-profile: all + + +#include dep rules +-include $(addsuffix .d,${libmeas_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) -c $< + +libmeas.a: ${libmeas_OBJECTS} Makefile + @rm -f libmeas.a + @${AR} cru libmeas.a $(libmeas_OBJECTS) + @$(RANLIB) libmeas.a + @cp libmeas.a ${top_builddir}/lib/libmeas.a + +# rule to generate .d files + +$(addsuffix .d,$(libmeas_TARGETS)): %.d: ${srcdir}/%.c Makefile + @$(CCDEP) ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libmeas_TARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libmeas.a + +distclean: clean + rm -f Makefile + + +.PHONY: all dep clean compile-clean distclean debug all-debug profile all-profile diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/meas/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..f672142956045e444ce8bdeadd97332b9d162bcd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/Makefile.in @@ -0,0 +1,94 @@ + +srcdir = @srcdir@ +top_builddir = @top_builddir@ +abs_top_builddir = @abs_top_builddir@ 
+top_srcdir = @top_srcdir@ +abs_top_srcdir = @abs_top_srcdir@ +subdir = meas +builddir = @builddir@ + +CFLAGS = @CFLAGS@ @MEASDIR@ +DEPFLAGS = @DEPFLAGS@ +LDFLAGS = @LDFLAGS@ +DEFS = @DEFS@ +OPTARGS = @OPTARGS@ + +AR = @AR@ +RANLIB = @RANLIB@ +CC = @CC@ +CCDEP = @CCDEP@ +CCLD = $(CC) +LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) ${OPTARGS} -o $@ +LEX = @LEX@ +AUTOCONF = @AUTOCONF@ +DEFS = @DEFS@ + +LEMON_AVAILABLE = @LEMON_AVAILABLE@ + +INCLUDES = @INCLUDES@ +LDADD = +COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} ${OPTARGS} + +LIBRARIES = libmeas + +libmeas_TARGETS = measurements \ + oriented_plaquettes \ + correlators \ + pion_norm \ + polyakov_loop \ + energy_density gradient_flow + +libmeas_OBJECTS = $(addsuffix .o, ${libmeas_TARGETS}) + +# default rule + +all: Makefile dep libmeas.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@ +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@ +profile all-profile: all + + +#include dep rules +-include $(addsuffix .d,${libmeas_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) -c $< + +libmeas.a: ${libmeas_OBJECTS} Makefile + @rm -f libmeas.a + @${AR} cru libmeas.a $(libmeas_OBJECTS) + @$(RANLIB) libmeas.a + @cp libmeas.a ${top_builddir}/lib/libmeas.a + +# rule to generate .d files + +$(addsuffix .d,$(libmeas_TARGETS)): %.d: ${srcdir}/%.c Makefile + @$(CCDEP) ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libmeas_TARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libmeas.a + +distclean: clean + rm -f Makefile + + +.PHONY: all dep clean compile-clean distclean debug all-debug profile all-profile diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_D/meas/correlators.c b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/correlators.c new file mode 100644 index 0000000000000000000000000000000000000000..c5091ac05281d1040a17fa00b2be362c07836dec --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/correlators.c @@ -0,0 +1,227 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "start.h" +#include "ranlxs.h" +#include "su3spinor.h" +#include "source_generation.h" +#include "operator.h" +#include "invert_eo.h" +#include "solver/solver.h" +#include "geometry_eo.h" +#include "linalg/convert_eo_to_lexic.h" +#include "measurements.h" +#include "correlators.h" +#include "gettime.h" + + +/****************************************************** + * + * This routine computes the correlators + * , and () + * using a stochastic time slice source + * and only one inversion (actually A_0) + * + * for we would need another inversion + * + * + * + ******************************************************/ + +void correlators_measurement(const int traj, const int id, const int ieo) { + int i, j, t, tt, t0; + double *Cpp = NULL, *Cpa = NULL, *Cp4 = NULL; + double res = 0., respa = 0., resp4 = 0.; + double atime, etime; + float tmp; + operator * optr; +#ifdef MPI + double mpi_res = 0., mpi_respa = 0., mpi_resp4 = 0.; + // send buffer for MPI_Gather + double *sCpp = NULL, *sCpa = NULL, *sCp4 = NULL; +#endif + FILE *ofs; + char *filename; + char buf[100]; + spinor phi; + filename=buf; + sprintf(filename,"%s%.6d", "onlinemeas." ,traj); + + init_operators(); + if(no_operators < 1) { + if(g_proc_id == 0) { + fprintf(stderr, "Warning! no operators defined in input file, cannot perform online correlator mesurements!\n"); + } + return; + } + if(no_operators > 1 && g_proc_id == 0) { + fprintf(stderr, "Warning! number of operators defined larger than 1, using only the first!\n"); + } + optr = &operator_list[0]; + // we don't want to do inversion twice for this purpose here + optr->DownProp = 0; + if(optr->type != TMWILSON && optr->type != WILSON && optr->type != CLOVER) { + if(g_proc_id == 0) { + fprintf(stderr, "Warning! 
correlator online measurement currently only implemented for TMWILSON, WILSON and CLOVER\n"); + fprintf(stderr, "Cannot perform correlator online measurement!\n"); + } + return; + } + + /* generate random timeslice */ + if(ranlxs_init == 0) { + rlxs_init(1, 123456); + } + ranlxs(&tmp, 1); + t0 = (int)(measurement_list[id].max_source_slice*tmp); +#ifdef MPI + MPI_Bcast(&t0, 1, MPI_INT, 0, MPI_COMM_WORLD); +#endif + if(g_debug_level > 1 && g_proc_id == 0) { + printf("# timeslice set to %d (T=%d) for online measurement\n", t0, g_nproc_t*T); + printf("# online measurements parameters: kappa = %g, mu = %g\n", optr->kappa, optr->mu/2./optr->kappa); + } + atime = gettime(); + +#ifdef MPI + sCpp = (double*) calloc(T, sizeof(double)); + sCpa = (double*) calloc(T, sizeof(double)); + sCp4 = (double*) calloc(T, sizeof(double)); + if(g_mpi_time_rank == 0) { + Cpp = (double*) calloc(g_nproc_t*T, sizeof(double)); + Cpa = (double*) calloc(g_nproc_t*T, sizeof(double)); + Cp4 = (double*) calloc(g_nproc_t*T, sizeof(double)); + } +#else + Cpp = (double*) calloc(T, sizeof(double)); + Cpa = (double*) calloc(T, sizeof(double)); + Cp4 = (double*) calloc(T, sizeof(double)); +#endif + source_generation_pion_only(g_spinor_field[0], g_spinor_field[1], + t0, 0, traj); + optr->sr0 = g_spinor_field[0]; + optr->sr1 = g_spinor_field[1]; + optr->prop0 = g_spinor_field[2]; + optr->prop1 = g_spinor_field[3]; + + // op_id = 0, index_start = 0, write_prop = 0 + optr->inverter(0, 0, 0); + + /* now we bring it to normal format */ + /* here we use implicitly DUM_MATRIX and DUM_MATRIX+1 */ + convert_eo_to_lexic(g_spinor_field[DUM_MATRIX], g_spinor_field[2], g_spinor_field[3]); + + /* now we sum only over local space for every t */ + for(t = 0; t < T; t++) { + j = g_ipt[t][0][0][0]; + res = 0.; + respa = 0.; + resp4 = 0.; + for(i = j; i < j+LX*LY*LZ; i++) { + res += _spinor_prod_re(g_spinor_field[DUM_MATRIX][i], g_spinor_field[DUM_MATRIX][i]); + _gamma0(phi, g_spinor_field[DUM_MATRIX][i]); + respa += 
_spinor_prod_re(g_spinor_field[DUM_MATRIX][i], phi); + _gamma5(phi, phi); + resp4 += _spinor_prod_im(g_spinor_field[DUM_MATRIX][i], phi); + } + +#if defined MPI + MPI_Reduce(&res, &mpi_res, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_time_slices); + res = mpi_res; + MPI_Reduce(&respa, &mpi_respa, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_time_slices); + respa = mpi_respa; + MPI_Reduce(&resp4, &mpi_resp4, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_time_slices); + resp4 = mpi_resp4; + sCpp[t] = +res/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa; + sCpa[t] = -respa/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa; + sCp4[t] = +resp4/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa; +#else + Cpp[t] = +res/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa; + Cpa[t] = -respa/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa; + Cp4[t] = +resp4/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_z*LZ)/2./optr->kappa/optr->kappa; +#endif + } + +#ifdef MPI + /* some gymnastics needed in case of parallelisation */ + if(g_mpi_time_rank == 0) { + MPI_Gather(sCpp, T, MPI_DOUBLE, Cpp, T, MPI_DOUBLE, 0, g_mpi_SV_slices); + MPI_Gather(sCpa, T, MPI_DOUBLE, Cpa, T, MPI_DOUBLE, 0, g_mpi_SV_slices); + MPI_Gather(sCp4, T, MPI_DOUBLE, Cp4, T, MPI_DOUBLE, 0, g_mpi_SV_slices); + } +#endif + + /* and write everything into a file */ + if(g_mpi_time_rank == 0 && g_proc_coords[0] == 0) { + ofs = fopen(filename, "w"); + fprintf( ofs, "1 1 0 %e %e\n", Cpp[t0], 0.); + for(t = 1; t < g_nproc_t*T/2; t++) { + tt = (t0+t)%(g_nproc_t*T); + fprintf( ofs, "1 1 %d %e ", t, Cpp[tt]); + tt = (t0+g_nproc_t*T-t)%(g_nproc_t*T); + fprintf( ofs, "%e\n", Cpp[tt]); + } + tt = (t0+g_nproc_t*T/2)%(g_nproc_t*T); + fprintf( ofs, "1 1 %d %e %e\n", t, Cpp[tt], 0.); + + fprintf( ofs, "2 1 0 %e %e\n", Cpa[t0], 0.); + for(t = 1; t < g_nproc_t*T/2; t++) { + tt = (t0+t)%(g_nproc_t*T); + fprintf( ofs, "2 1 %d %e ", t, Cpa[tt]); + tt = 
(t0+g_nproc_t*T-t)%(g_nproc_t*T); + fprintf( ofs, "%e\n", Cpa[tt]); + } + tt = (t0+g_nproc_t*T/2)%(g_nproc_t*T); + fprintf( ofs, "2 1 %d %e %e\n", t, Cpa[tt], 0.); + + fprintf( ofs, "6 1 0 %e %e\n", Cp4[t0], 0.); + for(t = 1; t < g_nproc_t*T/2; t++) { + tt = (t0+t)%(g_nproc_t*T); + fprintf( ofs, "6 1 %d %e ", t, Cp4[tt]); + tt = (t0+g_nproc_t*T-t)%(g_nproc_t*T); + fprintf( ofs, "%e\n", Cp4[tt]); + } + tt = (t0+g_nproc_t*T/2)%(g_nproc_t*T); + fprintf( ofs, "6 1 %d %e %e\n", t, Cp4[tt], 0.); + fclose(ofs); + } +#ifdef MPI + if(g_mpi_time_rank == 0) { + free(Cpp); free(Cpa); free(Cp4); + } + free(sCpp); free(sCpa); free(sCp4); +#else + free(Cpp); free(Cpa); free(Cp4); +#endif + etime = gettime(); + + if(g_proc_id == 0 && g_debug_level > 0) { + printf("ONLINE: measurement done int t/s = %1.4e\n", etime - atime); + } + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/meas/correlators.h b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/correlators.h new file mode 100644 index 0000000000000000000000000000000000000000..c9a1c4ac0c532029d04ce02fc568c8a8e29d5dec --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/correlators.h @@ -0,0 +1,26 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _ONLINE_MEASUREMENT_H +#define _ONLINE_MEASUREMENT_H + +void correlators_measurement(const int traj, const int t0, const int ieo); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/meas/energy_density.c b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/energy_density.c new file mode 100644 index 0000000000000000000000000000000000000000..0853f627da22c3ab85f912ee3678af7dcce69bed --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/energy_density.c @@ -0,0 +1,152 @@ +/*********************************************************************** +* +* Copyright (C) 1995 Ulli Wolff, Stefan Sint +* 2001,2005 Martin Hasenbusch +* 2011,2012 Carsten Urbach +* 2013 Albert Deuzeman +* 2015 Bartosz Kostrzewa +* +* This file is part of tmLQCD. +* +* tmLQCD is free software: you can redistribute it and/or modify +* it under the terms of the GNU General Public License as published by +* the Free Software Foundation, either version 3 of the License, or +* (at your option) any later version. +* +* tmLQCD is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +* GNU General Public License for more details. +* +* You should have received a copy of the GNU General Public License +* along with tmLQCD. If not, see . +***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "su3.h" +#include "sse.h" +#include "su3adj.h" +#include "matrix_utils.h" + /* Writes to *ret the clover-based energy density of the gauge field gf: at every local site and for every plane (k,l) the four clover leaves are summed, projected with project_traceless_antiherm(), and the trace of the square is accumulated with a per-thread Kahan sum; the result is reduced over OpenMP threads and MPI ranks and multiplied by normalization. */ +void measure_energy_density(const su3 ** const gf, double *ret) +{ + // we have iG_\mu\nu = 1/4 P_T.A. 
[clover] where P is the projection to the + // traceless anti-hermitian part + // the minus sign compensates for the i^2 in the lattice definition of G_\mu\nu + // our traceless anti-hermitian projection includes a factor of 0.5, so instead of + // the usual (1/8)^2 we get (1/4)^2 of the clover + // 1/4 from the definition of the energy density = 1/4 (G_\mu\nu)^2 + // The factor of 4 makes the result agree (at large t and keeping in mind discretization errors) + // with the plaquette definition and with papers... I don't understand where it comes from... + double normalization = - 4 / ( 4 * 16.0 * VOLUME * g_nproc); + double res = 0; +#ifdef MPI + double ALIGN mres=0; +#endif + +#ifdef OMP +#pragma omp parallel + { +#endif + su3 ALIGN v1, v2, plaq; + double ALIGN ac,tr,ts,tt,kc=0,ks=0; + su3 ALIGN trace; + + /* compute the clover leaves */ + /* l __ __ + | | | | + |__| |__| + __ __ + | | | | + |__| |__| k */ + +#ifdef OMP +#pragma omp for +#endif + for(int x = 0; x < VOLUME; x++) + { + for(int k = 0; k < 4; k++) + { + for(int l = k+1; l < 4; l++) + { + int xpk = g_iup[x][k]; + int xpl = g_iup[x][l]; + int xmk = g_idn[x][k]; + int xml = g_idn[x][l]; + int xpkml = g_idn[xpk][l]; + int xplmk = g_idn[xpl][k]; + int xmkml = g_idn[xml][k]; + const su3 *w1 = &gf[x][k]; + const su3 *w2 = &gf[xpk][l]; + const su3 *w3 = &gf[xpl][k]; + const su3 *w4 = &gf[x][l]; + _su3_times_su3(v1, *w1, *w2); + _su3_times_su3(v2, *w4, *w3); + _su3_times_su3d(plaq, v1, v2); + w1 = &gf[x][l]; + w2 = &gf[xplmk][k]; + w3 = &gf[xmk][l]; + w4 = &gf[xmk][k]; + _su3_times_su3d(v1, *w1, *w2); + _su3d_times_su3(v2, *w3, *w4); + _su3_times_su3_acc(plaq, v1, v2); + w1 = &gf[xmk][k]; + w2 = &gf[xmkml][l]; + w3 = &gf[xmkml][k]; + w4 = &gf[xml][l]; + _su3_times_su3(v1, *w2, *w1); + _su3_times_su3(v2, *w3, *w4); + _su3d_times_su3_acc(plaq, v1, v2); + w1 = &gf[xml][l]; + w2 = &gf[xml][k]; + w3 = &gf[xpkml][l]; + w4 = &gf[x][k]; + _su3d_times_su3(v1, *w1, *w2); + _su3_times_su3d(v2, *w3, *w4); + 
_su3_times_su3_acc(plaq, v1, v2); + project_traceless_antiherm(&plaq); + _trace_su3_times_su3(ac, plaq, plaq); // This should actually be the energy density already... + + // Kahan summation for each thread + tr=ac+kc; + ts=tr+ks; + tt=ts-ks; + ks=ts; + kc=tr-tt; + } + } + } + kc=kc+ks; +#ifdef OMP + int thread_num = omp_get_thread_num(); + g_omp_acc_re[thread_num] = kc; + } /* OpenMP parallel closing brace */ + + for(int i=0; i < omp_num_threads; ++i) { + res += g_omp_acc_re[i]; + } +#else + res = kc; +#endif +#ifdef MPI + MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + res = mres; +#endif + *ret = normalization * res; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/meas/energy_density.h b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/energy_density.h new file mode 100644 index 0000000000000000000000000000000000000000..c58a6a06d8c0dbb46be6aba05b9bd6ad59833aef --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/energy_density.h @@ -0,0 +1,26 @@ +/*********************************************************************** + * + * Copyright (C) 2015 Bartosz Kostrzewa + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _ENERGY_DENSITY_H +#define _ENERGY_DENSITY_H + +void measure_energy_density(const su3 ** const gf, double *ret); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/meas/gradient_flow.c b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/gradient_flow.c new file mode 100644 index 0000000000000000000000000000000000000000..39a3e601cc0e75e92c193893df3744a3edeb2396 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/gradient_flow.c @@ -0,0 +1,214 @@ +/*********************************************************************** + * + * Copyright (C) 2013 Albert Deuzeman + * 2015 Bartosz Kostrzewa + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. 
+ * + ************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif +#ifdef MPI +# include +#endif + +#include +#include +#include +#include "global.h" +#include "fatal_error.h" +#include "aligned_malloc.h" +#include "energy_density.h" +#include "expo.h" +#include "get_staples.h" +#include "get_rectangle_staples.h" +#include "gettime.h" +#include "measure_gauge_action.h" +#include "matrix_utils.h" +#include "xchange/xchange_gauge.h" +#include "gradient_flow.h" + /* Advances the gauge field x0 by one flow step of size eps, in place: the three sub-steps f=0,1,2 read fields[f] and write fields[f+1], i.e. x1, x2 and finally x0 again; z holds the running combination of the projected staple terms and x1/x2 are scratch fields. NOTE(review): the type argument is not referenced in the body. */ +void step_gradient_flow(su3 ** x0, su3 ** x1, su3 ** x2, su3 ** z, const unsigned int type, const double eps ) { + double zfac[5] = { 1, (8.0)/(9.0), (-17.0)/(36.0), (3.0)/(4.0), -1 }; + double zepsfac[3] = { 0.25, 1, 1 }; + su3** fields[4]; + + fields[0] = x0; + fields[1] = x1; + fields[2] = x2; + fields[3] = x0; + +#ifdef OMP +#pragma omp parallel +#endif + { + + su3 ALIGN w,w1,w2; + su3 ALIGN z_tmp,z_tmp1; + +#ifdef MPI +#ifdef OMP +#pragma omp single +#endif + { + xchange_gauge(x0); + } +#endif + + // implementation of third-order Runge-Kutta integrator following Luescher's hep-lat/1006.4518 + // this can probably be improved... 
+ + for( int f = 0; f < 3; ++f ){ +#ifdef OMP +#pragma omp for +#endif + for( int x = 0; x < VOLUME; ++x ){ + for( int mu = 0; mu < 4; ++mu ){ + get_staples(&w1, x, mu, fields[f]); + // usually we dagger the staples, but the sign convention seems to require this + _su3_times_su3d(z_tmp,w1,fields[f][x][mu]); + project_traceless_antiherm(&z_tmp); + + // implementing the Iwasaki, Symanzik or DBW2 flow from here should be a trivial extension + // but it will require adding some (more) parameters and making sure that g_dbw2rand exists + // also in the inverter if the measurement is to be carried out there + //get_rectangle_staples_general(&w2,x,mu,fields[f]); + //_su3_times_su3d(w1,w2,fields[f][x][mu]); + + if(f==0){ + _real_times_su3(z[x][mu],eps,z_tmp); + }else{ + _real_times_su3(z_tmp,eps*zfac[2*f-1],z_tmp); + _su3_refac_acc(z_tmp,zfac[2*f],z[x][mu]); + z[x][mu] = z_tmp; + } + _real_times_su3(z_tmp,zepsfac[f],z[x][mu]); + project_traceless_antiherm(&z_tmp); + cayley_hamilton_exponent(&w,&z_tmp); + _su3_times_su3(fields[f+1][x][mu],w,fields[f][x][mu]); + } + } +#ifdef MPI +#ifdef OMP +#pragma omp single +#endif + { + xchange_gauge(fields[f+1]); + } +#endif + } + } +} + /* Online measurement: copies g_gauge_field into a work field, then applies step_gradient_flow() with eps=0.01 twice per loop pass until t[1] reaches 9.99; the process with g_proc_id 0 writes one row per pass (columns: traj t P Eplaq Esym tsqEplaq tsqEsym Wsym) to the file gradflow.%06d. NOTE(review): ieo is not referenced, and id is used only in the fopen error message. 
*/ +void gradient_flow_measurement(const int traj, const int id, const int ieo) { + + double E[3],t[3], P[3]; + double W=0, eps=0.01, tsqE=0; + double t1, t2; + + if( g_proc_id == 0 ) { + printf("# Doing gradient flow measurement.\n"); + } + + FILE *outfile; + if( g_proc_id == 0 ) { + char filename[100]; + snprintf(filename,100,"gradflow.%06d",traj); + outfile = fopen(filename,"w"); + + if( outfile == NULL ) { + char error_message[200]; + snprintf(error_message,200,"Couldn't open %s for writing during measurement %d!",filename, id); + fatal_error(error_message,"gradient_flow_measurement"); + } + + fprintf(outfile, "traj t P Eplaq Esym tsqEplaq tsqEsym Wsym\n"); + } + + aligned_su3_field_t vt = aligned_su3_field_alloc(VOLUMEPLUSRAND+g_dbw2rand); + aligned_su3_field_t x1 = 
aligned_su3_field_alloc(VOLUMEPLUSRAND+g_dbw2rand); + aligned_su3_field_t x2 = aligned_su3_field_alloc(VOLUMEPLUSRAND+g_dbw2rand); + aligned_su3_field_t z = aligned_su3_field_alloc(VOLUME); + +#ifdef MPI + xchange_gauge(g_gauge_field); +#endif + memcpy(vt.field[0],g_gauge_field[0],sizeof(su3)*4*(VOLUMEPLUSRAND+g_dbw2rand)); + + t[0] = E[0] = P[0] = 0.0; + t[1] = E[1] = P[1] = 0.0; + t[2] = E[2] = P[2] = 0.0; + + t1 = gettime(); + measure_energy_density(vt.field,&E[2]); + P[2] = measure_plaquette(vt.field)/(6.0*VOLUME*g_nproc); + t2 = gettime(); + if(g_proc_id==0 && g_debug_level > 2) { + printf("time for energy density measurement: %lf\n",t2-t1); + } + + while( t[1] < 9.99 ) { + t[0] = t[2]; + E[0] = E[2]; + P[0] = P[2]; + for(int step = 1; step < 3; ++step) { + t[step] = t[step-1]+eps; + step_gradient_flow(vt.field,x1.field,x2.field,z.field,0,eps); + measure_energy_density(vt.field,&E[step]); + P[step] = measure_plaquette(vt.field)/(6.0*VOLUME*g_nproc); + } + W = t[1]*t[1]*( 2*E[1] + t[1]*((E[2]-E[0])/(2*eps)) ) ; + tsqE = t[1]*t[1]*E[1]; + + if(g_proc_id==0 && g_debug_level > 3){ + printf("sym(plaq) t=%lf 1-P(t)=%1.8lf E(t)=%2.8lf(%2.8lf) t^2E=%2.8lf(%2.8lf) W(t)=%2.8lf \n",t[1],1-P[1], + E[1],36*(1-P[1]), + tsqE,t[1]*t[1]*36*(1-P[1]), + W); + } + if(g_proc_id==0){ + fprintf(outfile,"%06d %f %2.12lf %2.12lf %2.12lf %2.12lf %2.12lf %2.12lf \n", + traj,t[1],P[1], + 36*(1-P[1]),E[1], + t[1]*t[1]*36*(1-P[1]),tsqE, + W); + fflush(outfile); + } + + } + + aligned_su3_field_free(&vt); + aligned_su3_field_free(&x1); + aligned_su3_field_free(&x2); + aligned_su3_field_free(&z); + + t2 = gettime(); + + if( g_proc_id == 0 ) { + if(g_debug_level>2){ + printf("Gradient flow measurement done in %f seconds!\n",t2-t1); + } + fclose(outfile); + } + + return; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/meas/gradient_flow.h b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/gradient_flow.h new file mode 100644 index 
0000000000000000000000000000000000000000..43a24ee3ed3b345ab3c1d94002c1fca7d87673dc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/gradient_flow.h @@ -0,0 +1,30 @@ +/*********************************************************************** + * + * Copyright (C) 2013 Albert Deuzeman + * 2015 Bartosz Kostrzewa + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ + +#ifndef _GRADIENT_FLOW_H +#define _GRADIENT_FLOW_H + +#include "su3.h" + +void step_gradient_flow(su3 ** vt, su3 ** x1, su3 ** x2, su3 ** z, const unsigned int type, const double eps); +void gradient_flow_measurement(const int traj, const int id, const int ieo); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/meas/measurements.c b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/measurements.c new file mode 100644 index 0000000000000000000000000000000000000000..06456729d342d110a12180ecff426563e24d0cdb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/measurements.c @@ -0,0 +1,105 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Carsten Urbach + * 2009 Florian Burger + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "default_input_values.h" +#include "read_input.h" + +#include "pion_norm.h" +#include "correlators.h" +#include "polyakov_loop.h" +#include "oriented_plaquettes.h" +#include "gradient_flow.h" +#include "measurements.h" + +measurement measurement_list[max_no_measurements]; +int no_measurements = 0; + +int add_measurement(const enum MEAS_TYPE meas_type) { + /* Appends an entry of type meas_type, bound to dummy_meas until init_measurements() rebinds it, and returns the new entry count; calls exit(-1) when the list is already full. NOTE(review): the error text below contains the typo "measurementss". 
*/ + if(no_measurements == max_no_measurements) { + fprintf(stderr, "maximal number of measurementss %d exceeded!\n", max_no_measurements); + exit(-1); + } + measurement_list[no_measurements].measurefunc = &dummy_meas; + measurement_list[no_measurements].type = meas_type; + measurement_list[no_measurements].initialised = 1; + no_measurements++; + return(no_measurements); +} + /* Binds every registered entry to its measurement routine according to its type, sets max_source_slice for ONLINE and PIONNORM entries, and stores the list index in id; always returns 0. */ +int init_measurements(){ + int i; + for(i = 0; i < no_measurements; i++) { + + if(measurement_list[i].type == ONLINE) { + measurement_list[i].measurefunc = &correlators_measurement; + measurement_list[i].max_source_slice = g_nproc_t*T; + } + + if(measurement_list[i].type == PIONNORM) { + measurement_list[i].measurefunc = &pion_norm_measurement; + measurement_list[i].max_source_slice = g_nproc_z*LZ; + } + + if(measurement_list[i].type == POLYAKOV) { + 
measurement_list[i].measurefunc = &polyakov_loop_measurement; + } + + if(measurement_list[i].type == ORIENTED_PLAQUETTES) { + measurement_list[i].measurefunc = &oriented_plaquettes_measurement; + } + + if(measurement_list[i].type == GRADIENT_FLOW) { + measurement_list[i].measurefunc = &gradient_flow_measurement; + } + + measurement_list[i].id = i; + } +return(0); +} + + + +void free_measurements(){ + /* currently a no-op: measurement_list is a fixed-size array defined in this file */ + return; +} + + + /* Placeholder measurement installed by add_measurement(); the process with g_proc_id 0 prints a warning and nothing is measured. */ +void dummy_meas(const int traj, const int id, const int ieo) { + if(g_proc_id == 0) { + fprintf(stderr, "dummy_meas was called for measurement with id=%d. Was that really intended?\n", id); + } + return; +} + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/meas/measurements.h b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/measurements.h new file mode 100644 index 0000000000000000000000000000000000000000..39921903ffbef98922b581ad0d44fc39c09a193e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/measurements.h @@ -0,0 +1,75 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Carsten Urbach + * + * Adapted from monomial.h by Florian Burger 2009/12/16 + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _MEASUREMENTS_H +#define _MEASUREMENTS_H + +#define max_no_measurements 20 + +/* Give the measurement types an unambiguous ID */ +enum MEAS_TYPE { + ONLINE, + PIONNORM, + POLYAKOV, + ORIENTED_PLAQUETTES, + GRADIENT_FLOW + }; + +typedef struct { + enum MEAS_TYPE type; + int initialised; /* set to 1 by add_measurement */ + int id; /* index in measurement_list, set by init_measurements */ + + /* frequency of the measurement */ + int freq; + /* for maximal iterations in inversions for correlators */ + int max_iter; + /* for polyakov loop */ + int direction; + + /* how it's usually called */ + char name[100]; + + /* number of slices on which the source can be placed; + if the correlator is measured in T(Z)-direction this is set to + g_nproc_t*T (g_nproc_z*LZ) by init_measurements + */ + int max_source_slice; + + /* function that carries out the measurement */ + void (*measurefunc) (const int traj, const int id, const int ieo); +} measurement; + + +/* list of all measurements */ +extern measurement measurement_list[max_no_measurements]; +extern int no_measurements; + +/* add a new measurement to the list of measurements */ +int add_measurement(const enum MEAS_TYPE); +/* initialise all measurements in the list */ +int init_measurements(); +/* free space again */ +void free_measurements(); + /* placeholder measurement function assigned to new entries by add_measurement */ +void dummy_meas(const int traj, const int id, const int ieo); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/meas/oriented_plaquettes.c b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/oriented_plaquettes.c new file mode 100644 index 0000000000000000000000000000000000000000..444dfcb19eb7b98bb607b79ea0c3742fc8e60647 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/oriented_plaquettes.c @@ -0,0 +1,113 @@ +/*********************************************************************** 
+ * + * Copyright (C) 2001 Martin Hasenbusch, 2012 Bartosz Kostrzewa + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + ************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef OMP +# include +#endif + +#include +#include + +#include "global.h" +#include "su3.h" +#include "geometry_eo.h" +#include "oriented_plaquettes.h" +#include "fatal_error.h" +#include "measurements.h" + +void measure_oriented_plaquettes(const su3 ** const gf, double *plaq) { +#ifdef MPI + double ALIGN mplaq[6]; +#endif + + int ix,ix1,ix2,mu1,mu2,plane; + su3 ALIGN pr1,pr2; + const su3 *v,*w; + double ALIGN pl; + double ALIGN ks[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; + double ALIGN kc[6] = {0.0,0.0,0.0,0.0,0.0,0.0}; + double ALIGN tr[6],ts[6],tt[6]; + + for (ix=0;ix. 
+ ***********************************************************************/ + +#ifndef _MEASURE_ORIENTED_PLAQUETTES_H +#define _MEASURE_ORIENTED_PLAQUETTES_H + +#include "su3.h" + +/* measures the lattice average of plaquettes oriented in the 6 + hyperplanes TX, TY, TZ, XY, XZ, YZ and stores them in this + order in the plaq array (of 6 elements) + + the caller must provide the memory for plaq */ + +void measure_oriented_plaquettes(const su3 ** const gf, double *plaq); + +/* implements the online measurement function for the oriented + plaquettes, writes (in append mode) into "oriented_plaquettes.data" */ + +void oriented_plaquettes_measurement(const int traj, const int id, const int ieo); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/meas/pion_norm.c b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/pion_norm.c new file mode 100644 index 0000000000000000000000000000000000000000..b70012815093796da372a48e160120805ac5b3ce --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/pion_norm.c @@ -0,0 +1,160 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Carsten Urbach + * 2009 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "start.h" +#include "ranlxd.h" +#include "ranlxs.h" +#include "su3spinor.h" +#include "source_generation.h" +#include "invert_eo.h" +#include "solver/solver.h" +#include "solver/solver_params.h" +#include "geometry_eo.h" +#include "linalg/convert_eo_to_lexic.h" +#include "measurements.h" +#include "pion_norm.h" +#include "gettime.h" + +void pion_norm_measurement(const int traj, const int id, const int ieo) { + int i, j, z, zz, z0; + double *Cpp; + double res = 0.; + double pionnorm; + double atime, etime; + float tmp; + solver_params_t tmp_solver_params; +#ifdef MPI + double mpi_res = 0.; +#endif + FILE *ofs, *ofs2; + char *filename, *filename2, *sourcefilename; + char buf[100]; + char buf2[100]; + char buf3[100]; + filename=buf; + filename2=buf2; + sourcefilename=buf3; + sprintf(filename,"pionnormcorrelator_finiteT.%.6d",traj); + sprintf(filename2,"%s", "pion_norm.data"); + + /* generate random source point */ + if(ranlxs_init == 0) { + rlxs_init(1, 123456); + } + ranlxs(&tmp, 1); + z0 = (int)(measurement_list[id].max_source_slice*tmp); +#ifdef MPI + MPI_Bcast(&z0, 1, MPI_INT, 0, MPI_COMM_WORLD); +#endif + + atime = gettime(); + + Cpp = (double*) calloc(g_nproc_z*LZ, sizeof(double)); + + printf("Doing finite Temperature online measurement\n"); + + /* stochastic source in z-slice */ + source_generation_pion_zdir(g_spinor_field[0], g_spinor_field[1], + z0, 0, traj); + + + /* invert on the stochastic source */ + invert_eo(g_spinor_field[2], g_spinor_field[3], + g_spinor_field[0], g_spinor_field[1], + 1.e-14, measurement_list[id].max_iter, CG, 1, 0, ieo, 0, NULL,tmp_solver_params, -1, + NO_EXT_INV, SLOPPY_DOUBLE, NO_COMPRESSION); + + /* now we bring it to normal format */ + /* here we use implicitly DUM_MATRIX and DUM_MATRIX+1 */ + convert_eo_to_lexic(g_spinor_field[DUM_MATRIX], 
g_spinor_field[2], g_spinor_field[3]); + + /* now we sums only over local space for every z */ + for(z = 0; z < LZ; z++) { + res = 0.; + /* sum here over all points in one z-slice + we have to look up g_ipt*/ + + j = g_ipt[0][0][0][z]; + for(i = 0; i < T*LX*LY ; i++) { + res += _spinor_prod_re(g_spinor_field[DUM_MATRIX][j], g_spinor_field[DUM_MATRIX][j]); + j += LZ; /* jump LZ sites in array, z ist fastest index */ + } + + + +#if defined MPI + MPI_Reduce(&res, &mpi_res, 1, MPI_DOUBLE, MPI_SUM, 0, g_mpi_z_slices); + res = mpi_res; +#endif + Cpp[z+g_proc_coords[3]*LZ] = +res/(g_nproc_x*LX)/(g_nproc_y*LY)/(g_nproc_t*T)*2.; + } + +#ifdef MPI + /* some gymnastics needed in case of parallelisation */ + if(g_mpi_z_rank == 0) { + MPI_Gather(&Cpp[g_proc_coords[3]*LZ], LZ, MPI_DOUBLE, Cpp, LZ, MPI_DOUBLE, 0, g_mpi_ST_slices); + } +#endif + + + /* and write everything into a file */ + if(g_mpi_z_rank == 0 && g_proc_coords[3] == 0) { + ofs = fopen(filename, "w"); + fprintf( ofs, "1 1 0 %e %e\n", Cpp[z0], 0.); + for(z = 1; z < g_nproc_z*LZ/2; z++) { + zz = (z0+z)%(g_nproc_z*LZ); + fprintf( ofs, "1 1 %d %e ", z, Cpp[zz]); + zz = (z0+g_nproc_z*LZ-z)%(g_nproc_z*LZ); + fprintf( ofs, "%e\n", Cpp[zz]); + } + zz = (z0+g_nproc_z*LZ/2)%(g_nproc_z*LZ); + fprintf( ofs, "1 1 %d %e %e\n", z, Cpp[zz], 0.); + fclose(ofs); + + /* sum over all Cpp to get pionnorm*/ + ofs2 = fopen(filename2, "a"); + pionnorm = 0.; + for(z=0; z 0) { + printf("PIONNORM : measurement done int t/s = %1.4e\n", etime - atime); + } + return; +} +/*end Florian Burger 4.11.2009 */ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/meas/pion_norm.h b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/pion_norm.h new file mode 100644 index 0000000000000000000000000000000000000000..49ca916d49d08cfee9ab8e2495e9486897eb27cf --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/pion_norm.h @@ -0,0 +1,30 @@ +/*********************************************************************** + * + * Copyright (C) 2009 
Florian Burger + * + * Adapted from online_measurement.h by Florian Burger 2009/12/16 + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _PION_NORM_H +#define _PION_NORM_H + +void pion_norm_measurement(const int traj, const int id, const int ieo); + +#endif + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/meas/polyakov_loop.c b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/polyakov_loop.c new file mode 100644 index 0000000000000000000000000000000000000000..faa8851550806eb1a2358cf0cdee022972972a77 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/polyakov_loop.c @@ -0,0 +1,558 @@ +/*********************************************************************** + * + * Copyright (C) 2005 Urs Wenger + * 2008,2009 Marcus Petschlies + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * Routine to calculate the Polyakov loop. + * + * Author: Urs Wenger + * Date: January 2005 + * + * Polyakov loop in time direction added by Marcus Petschlies + * 2008 + * + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include +#include "sse.h" +#include "su3.h" +#include "read_input.h" +#include "start.h" +#include "mpi_init.h" +#include "polyakov_loop.h" +#include "gettime.h" + /* Stores in *pl_ the normalised sum, over all sites transverse to mu, of the colour trace (divided by 3) of the product of the L3 links in direction mu. Only mu = 2 or 3 is accepted; any other value prints an error and terminates the program. */ +void polyakov_loop(_Complex double * pl_, const int mu) { + + static int i0, i1, i2, i3, L0, L1, L2, L3, ixyzt, ixyzt_up; + static double vol; + static su3 tmp, tmp2; + su3 *v = NULL , *w = NULL; + static _Complex double pl; + /* For the Kahan summation:*/ +#ifdef MPI + static _Complex double pls; +#endif + static _Complex double ks = 0.0, kc = 0.0, tr, ts, tt; /* NOTE(review): ks and kc are static and are not reset anywhere in this function, so a second call appears to start from the sums left by the previous call - confirm this is intended. */ + + /* For the moment only the Polyakov loop in y- and z-direction + are implemented, since they are not affected by parallelisation: */ + if(mu == 0 || mu == 1 || mu > 3) { + fprintf(stderr, "Wrong parameter for Polyakov loop calculation in polyakov_loop.c:\n"); + fprintf(stderr, "Only direction %d and %d are allowed.\n",2,3); + fprintf(stderr, "Actual value is %d! 
Aborting...\n",mu); +#ifdef MPI + MPI_Abort(MPI_COMM_WORLD, 10); + MPI_Finalize(); +#endif + exit(0); /* NOTE(review): this error path exits with status 0 */ + } + + + L0=T; + L1=LX; + if(mu==2) { + L2=LZ; + L3=LY; + } + else { + L2=LY; + L3=LZ; + } + /* loop over the sites transverse to direction mu: */ + for (i0=0; i0 < L0; i0++) { + for (i1=0; i1 < L1; i1++) { + for (i2=0; i2 < L2; i2++) { + /* at each of these sites multiply the links in + direction mu: */ + i3 = 0; + /* get the site index: */ + if(mu==2) { + ixyzt = g_ipt[i0][i1][i3][i2]; + } + else { + ixyzt = g_ipt[i0][i1][i2][i3]; + } + /* and its neighbour in direction mu: */ + ixyzt_up = g_iup[ixyzt][mu]; + + /* Get the links and multiply them: ixyzt --> ixyzt_up --> */ + v = &g_gauge_field[ixyzt][mu]; + w = &g_gauge_field[ixyzt_up][mu]; + _su3_times_su3(tmp, *v, *w); + + /* now start the loop over indices in mu-direction: */ + for (i3=1; i3 < L3-2; i3++) { + /* store the current result in tmp2: */ + _su3_assign(tmp2,tmp); + /* get the next site index: */ + ixyzt_up = g_iup[ixyzt_up][mu]; + /* and the corresponding link matrix: */ + w = &g_gauge_field[ixyzt_up][mu]; + /* and multiply them: */ + _su3_times_su3(tmp, tmp2, *w); + } + + /* for the last link we directly take the _Complex double trace: */ + ixyzt_up = g_iup[ixyzt_up][mu]; + w = &g_gauge_field[ixyzt_up][mu]; + _trace_su3_times_su3(pl,tmp,*w); + + /* printf("i0=%d, i1=%d, i2=%d, pl=(%e,%e)\n",i0,i1,i2,creal(pl),cimag(pl));*/ + + /* Kahan summation for real and imaginary part: */ + tr = pl + kc; + ts = tr + ks; + tt = ts - ks; + ks = ts; + kc = tr - tt; + } + } + } + /* Finish Kahan summation: */ + /* (Division by 3 is for normalising the colour trace.) 
*/ + pl = (kc + ks) / 3.0; + /* printf("Polyakov loop before normalisation, pl.re=%e, pl.im=%e\n",creal(pl),cimag(pl));*/ + + + /* Collect the results and return:*/ +#ifdef MPI + MPI_Allreduce(&pl, &pls, 2, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + pl=pls; +#endif + + /* Normalise, i.e. 
divide by the number of loops: */ + vol = (double) L0*L1*L2*g_nproc_t*g_nproc_x; /* NOTE(review): L2 is LZ for mu==2 and LY for mu==3, but only g_nproc_t and g_nproc_x enter here - presumably y and z are assumed not to be split across processes; confirm. */ + /* printf("L0*L1*L2=%d, vol=%e\n",L0*L1*L2,vol); */ + pl /= vol; + /* printf("Polyakov loop after normalisation, pl.re=%e, pl.im=%e\n",creal(pl),cimag(pl)) */; + /* return pl; */ + *pl_ = pl; +} + + +/* here comes the one in time direction */ + +int polyakov_loop_0(const int nstore, _Complex double *pl) { + + int i0, i1, i2, i3, ixyz, ixyzt, ixyzt_up, VOL3, VOLUME3; + int L0, L1, L2, L3; + double retime, ratime; + _Complex double pl_tmp, tr, ts, tt, kc, ks; + su3 *tmp_loc = NULL, tmp, tmp2; + su3 *v = NULL, *w = NULL; + + FILE *ofs = NULL; + +#ifdef MPI + int iproc; + MPI_Status status; + su3 *tmp_nnb = NULL; +#endif + + L0 = LX; /* enable transparent comparison with existing Polyakov routines */ + L1 = LY; /* in spatial directions */ + L2 = LZ; + L3 = T; + + /************** + * local part * + **************/ + ratime = gettime(); + + VOL3 = L0*L1*L2; + tmp_loc = (su3 *)calloc(VOL3, sizeof(su3)); + + for(i0 = 0; i0 < LX; i0++) { + for(i1 = 0; i1 < LY; i1++) { + for(i2 = 0; i2 < LZ; i2++) { + ixyz = (i2 * L1 + i1) * L0 + i0; + i3 = 0; + ixyzt = g_ipt[i3][i0][i1][i2]; + ixyzt_up = g_iup[ixyzt][0]; + v = &g_gauge_field[ixyzt][0]; + w = &g_gauge_field[ixyzt_up][0]; + _su3_times_su3(tmp, *v, *w); + + for(i3 = 1; i3 < L3-1; i3++) { + _su3_assign(tmp2,tmp); + ixyzt_up = g_iup[ixyzt_up][0]; + w = &g_gauge_field[ixyzt_up][0]; + _su3_times_su3(tmp, tmp2, *w); + } + _su3_assign(tmp_loc[ixyz],tmp); + } + } + } + retime = gettime(); + if(g_debug_level>0) { + fprintf(stdout, "[polyakov_loop_0 | %3d] time for calculating local part = %e seconds\n", g_cart_id, retime-ratime); + } + + /********************************************************************************/ + +#ifdef MPI + /*************** + * global part * + 
***************/ + + ratime = MPI_Wtime(); + + /* (1) collect contributions from different time slices to nodes with t-coord. 
0 */ + tmp_nnb = (su3*)calloc(VOL3, sizeof(su3)); /* contains the next-neighbour-part*/ + + /* note: in the following loop t is taken as the time coordinate of nodes */ + for(iproc = g_nproc_t-1; iproc > 0; iproc--) { + if(g_proc_coords[0] == iproc) /* node is in the {t=iproc}-hyperplane */ { + MPI_Send(tmp_loc, VOL3, mpi_su3, g_nb_t_dn, 100+g_cart_id, g_cart_grid); + /* send tmp_loc from {t=iproc}-hyperplane to {t=iproc-1}-hyperplane */ + } + if(g_proc_coords[0] == iproc-1) { + /* so the node is right below the sending one in time(= 0)-direction */ + MPI_Recv(tmp_nnb, VOL3, mpi_su3, g_nb_t_up, 100+g_nb_t_up, g_cart_grid, &status); + /* receive tmp_loc from the tmp_loc from the + {t=my_own_t_index+1}-hyperplane */ + for(ixyz=0; ixyz0) { + fprintf(stdout, "[polyakov_loop_0 | %3d] time for calculating global part = %e seconds\n", g_cart_id, retime-ratime); + } + + /* (2) nodes with time coordinate 0 sum traces over local spatial points */ +#endif + pl_tmp = 0.0; + if(g_proc_coords[0] == 0) { + + kc = 0.0; ks = 0.0; + for(ixyz = 0; ixyz < VOL3; ixyz++) /* Kahan-summation of traces */ + { + pl_tmp = tmp_loc[ixyz].c00 + tmp_loc[ixyz].c11 + tmp_loc[ixyz].c22; + tr = pl_tmp + kc; + ts = tr + ks; + tt = ts - ks; + ks = ts; + kc = tr - tt; + } + pl_tmp = ks + kc; + } + +#ifdef MPI + /* (3) sum over all contributions from all nodes (also nodes with pl_tmp=0; + apparently the easiest way) */ + MPI_Reduce(&pl_tmp, pl, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, 0, g_cart_grid); + /* MPI_Reduce(&(creal(pl_tmp)), &(pl->re), 1, MPI_DOUBLE, MPI_SUM, 0, g_cart_grid); */ + /* MPI_Reduce(&(cimag(pl_tmp)), &(pl->im), 1, MPI_DOUBLE, MPI_SUM, 0, g_cart_grid); */ +#else + *pl = pl_tmp; +#endif + + /* normalization */ + VOLUME3 = VOL3; + + if(g_proc_id == 0) + { + VOLUME3 = VOLUME3 * g_nproc_x*g_nproc_y*g_nproc_z; + *pl /= 3 * VOLUME3; + } + + /* write result to file */ + if (g_proc_id == 0) { + if (nstore == 0) { + ofs = fopen("polyakov_loop_0.dat","w"); + } + else { + ofs = 
fopen("polyakov_loop_0.dat","a");
+    }
+    if(ofs == NULL) fprintf(stderr, "Could not open file polyakov_loop_0.dat for writing\n"); /* fopen can fail: never write through a NULL stream */
+    else { fprintf(ofs, "%25.16e\t%25.16e\n", creal(*pl), cimag(*pl)); fclose(ofs); }
+  }
+#ifdef MPI
+  free(tmp_nnb);
+#endif
+  free(tmp_loc);
+  return(0);
+}
+
+
+/*********************************************************************************/
+
+/* here comes the version using reduction operations for time- (dir==0) or
+   z- (dir==3) direction
+   the reduction operation is defined in mpi_init.h
+*/
+void polyakov_loop_measurement(const int nstore, const int id, const int ieo) {
+  polyakov_loop_dir(nstore, measurement_list[id].direction);
+}
+
+
+int polyakov_loop_dir(
+		      const int nstore /* in */,
+		      const int dir /* in */) {
+
+  int ixyz, ixyzt, ixyzt_up, VOL3, VOLUME3, ix, iy, iz, it;
+  _Complex double pl_tmp, tr, ts, tt, kc, ks, pl;
+  su3 *tmp_loc, tmp, tmp2;
+  su3 *u, *v, *w;
+  double ratime, retime;
+  char filename[50];
+
+  FILE *ofs;
+
+#ifdef MPI
+  int rank_slice, rank_ray;
+  MPI_Comm slice, ray;
+  su3 *tmp_ray;
+#endif
+
+  if(dir!=0 && dir!=3) { /* every process must return here; previously only process 0 did and the others carried on with an invalid dir */
+    if(g_proc_id==0) fprintf(stderr, "Wrong direction; must be 0 (t) or 3 (z)\n"); /* report once */
+    return(-1);
+  }
+
+  pl = 0.0;
+
+  /********************************************************************************/
+
+  /**************
+   * local part *
+   **************/
+  ratime = gettime();
+
+  if(dir==0) {
+    VOL3 = LX*LY*LZ;
+    tmp_loc = (su3 *)calloc(VOL3, sizeof(su3));
+    if((void*)tmp_loc == NULL) {
+      fprintf(stderr, "[%2d] Could not allocate memory for tmp_loc\n", g_proc_id);
+      return(-1);
+    }
+
+    for(ix=0; ix dir==3 */
+    VOL3 = T*LX*LY;
+    tmp_loc = (su3 *)calloc(VOL3, sizeof(su3));
+    if((void*)tmp_loc == NULL) {
+      fprintf(stderr, "[%2d] Could not allocate memory for tmp_loc\n", g_proc_id); return(-1); /* same handling as the dir==0 branch instead of falling through with NULL */
+    }
+
+    for(it=0; it 0 && g_proc_id == 0) {
+    fprintf(stdout, "# [pl02 dir%1d proc%.2d] time for calculating local part"\
+	    " = %e seconds\n", dir, g_cart_id, retime-ratime);
+  }
+
+  /********************************************************************************/
+
+#ifdef MPI
+  /***************
+   * global part *
+   ***************/
+  /* choose the slice and ray
communicators according to direction */ + if(dir==0) { + slice = g_mpi_time_slices; + ray = g_mpi_SV_slices; + rank_slice = g_mpi_time_rank; + rank_ray = g_mpi_SV_rank; + } + else { + slice = g_mpi_z_slices; + ray = g_mpi_ST_slices; + rank_slice = g_mpi_z_rank; + rank_ray = g_mpi_ST_rank; + } + + ratime = MPI_Wtime(); + + /* (1) collect contributions from different time/z slices to nodes with rank=0 + in spatial volume/space-time slices */ +# ifndef PARALLELXYZT + if(dir==0) { +# endif + tmp_ray = (su3*)calloc(VOL3, sizeof(su3)); /* */ + if((void*)tmp_ray== NULL) { + fprintf(stderr, "[%2d] Could not allocate memory for tmp_ray\n", g_proc_id); + return(-1); + } + + MPI_Reduce(tmp_loc, tmp_ray, VOL3, mpi_su3, mpi_reduce_su3_ray, 0, ray); +# ifndef PARALLELXYZT + } +# endif + + + retime = MPI_Wtime(); + if(g_proc_id==0 && g_debug_level>0) { + fprintf(stdout, "# [pl02 dir%1d proc%.2d] time for calculating global part"\ + " = %e seconds\n", dir, g_cart_id, retime-ratime); + } + + if(rank_ray == 0) { + +#endif + pl_tmp = 0.0; + kc = 0.0; + ks = 0.0; + +#ifdef MPI +# ifdef PARALLELXYZT + u = tmp_ray; +# else + if(dir==0) { u = tmp_ray; } + else { u = tmp_loc; } +# endif +#else + u = tmp_loc; +#endif + + for(ixyz=0; ixyz pl / ( 3 * 3-dim. volume)*/ + VOLUME3 = VOL3; + +#ifdef MPI + if(rank_slice==0 && rank_ray==0) { /* this process has the sum + of the Polyakov loop values */ + if(dir==0) { + VOLUME3 = VOLUME3 * g_nproc_x*g_nproc_y*g_nproc_z; + } + else { + VOLUME3 = VOLUME3 * g_nproc_t*g_nproc_x*g_nproc_y; + } +#endif + pl /= 3. 
* VOLUME3; + + /* write result to file */ + sprintf(filename, "polyakovloop_dir%1d", dir); + if (nstore == 0) { + ofs = fopen(filename,"w"); + } + else { + ofs = fopen(filename,"a"); + } + if((void*)ofs == NULL) { + fprintf(stderr, "Could not open file %s for writing\n", filename); + return(-1); + } + fprintf(ofs, "%4d\t%2d\t%25.16e\t%25.16e\n", nstore, dir, creal(pl), cimag(pl)); + fclose(ofs); +#if defined MPI + } +#endif + free(tmp_loc); + return(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/meas/polyakov_loop.h b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/polyakov_loop.h new file mode 100644 index 0000000000000000000000000000000000000000..bc3d14d6eb8cff5035e907fa04e139dc86d1674e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/meas/polyakov_loop.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _POLYAKOV_LOOP_H +#define _POLYAKOV_LOOP_H + +#include "measurements.h" + +void polyakov_loop(_Complex double * pl_, const int mu); +int polyakov_loop_0(const int nstore, _Complex double* pl); +int polyakov_loop_dir(const int nstore, const int dir); +void polyakov_loop_measurement(const int nstore, const int id, const int ieo); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/measure_gauge_action.c b/qcd/part_cpu/applications/QCD/src/kernel_D/measure_gauge_action.c new file mode 100644 index 0000000000000000000000000000000000000000..04bb5f358198966f86912bf577c9643ece052318 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/measure_gauge_action.c @@ -0,0 +1,191 @@ +/*********************************************************************** + * + * Copyright (C) 2001 Martin Hasenbusch + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ * + * File observables.c + * + * + * The externally accessible functions are + * + * double measure_gauge_action(void) + * Returns the value of the action + ************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef OMP +# include +#endif +#include "su3.h" +#include "su3adj.h" +#include "sse.h" +#include "geometry_eo.h" +#include "global.h" +#include +#include "measure_gauge_action.h" + +double measure_plaquette(const su3 ** const gf) { + static double res; +#ifdef MPI + double ALIGN mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); +#endif + + int ix1,ix2; + su3 ALIGN pr1,pr2; + const su3 * restrict v,* restrict w; + double ALIGN ac, ks, kc, tr, ts, tt; + + kc=0.0; ks=0.0; +#ifdef OMP +#pragma omp for +#endif + for (int ix = 0; ix < VOLUME; ix++){ + for (int mu1 = 0; mu1 < 3; mu1++){ + ix1 = g_iup[ix][mu1]; + for (int mu2 = mu1+1; mu2 < 4; mu2++){ + ix2 = g_iup[ix][mu2]; + v=&gf[ix][mu1]; + w=&gf[ix1][mu2]; + _su3_times_su3(pr1, *v, *w); + v=&gf[ix][mu2]; + w=&gf[ix2][mu1]; + _su3_times_su3(pr2, *v, *w); + _trace_su3_times_su3d(ac, pr1, pr2); + tr=ac+kc; + ts=tr+ks; + tt=ts-ks; + ks=ts; + kc=tr-tt; + } + } + } + kc=(kc+ks)/3.0; +#ifdef OMP + g_omp_acc_re[thread_num] = kc; +#else + res = kc; +#endif + +#ifdef OMP + } /* OpenMP parallel closing brace */ + + res = 0.0; + for(int i=0; i < omp_num_threads; ++i) + res += g_omp_acc_re[i]; +#endif +#ifdef MPI + MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + res = mres; +#endif + return res; +} + +double measure_gauge_action(const su3 ** const gf, const double lambda) { + static double res; +#ifdef MPI + double ALIGN mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); +#endif + + int ix1,ix2; + su3 ALIGN pr1,pr2; + const su3 * restrict v,* restrict w; + double ALIGN ac, ks, kc, tr, ts, tt; + + kc=0.0; ks=0.0; +#ifdef 
OMP +#pragma omp for +#endif + for (int ix = 0; ix < VOLUME; ix++){ + ix1 = g_iup[ix][0]; + // electric part + for (int mu2 = 1; mu2 < 4; mu2++){ + ix2 = g_iup[ix][mu2]; + v=&gf[ix][0]; + w=&gf[ix1][mu2]; + _su3_times_su3(pr1, *v, *w); + v=&gf[ix][mu2]; + w=&gf[ix2][0]; + _su3_times_su3(pr2, *v, *w); + _trace_su3_times_su3d(ac, pr1, pr2); + ac *= (1+lambda); + tr=ac+kc; + ts=tr+ks; + tt=ts-ks; + ks=ts; + kc=tr-tt; + } + // magnetic part + for (int mu1 = 1; mu1 < 3; mu1++){ + ix1 = g_iup[ix][mu1]; + for (int mu2 = mu1+1; mu2 < 4; mu2++){ + ix2 = g_iup[ix][mu2]; + v=&gf[ix][mu1]; + w=&gf[ix1][mu2]; + _su3_times_su3(pr1, *v, *w); + v=&gf[ix][mu2]; + w=&gf[ix2][mu1]; + _su3_times_su3(pr2, *v, *w); + _trace_su3_times_su3d(ac, pr1, pr2); + ac *= (1-lambda); + tr=ac+kc; + ts=tr+ks; + tt=ts-ks; + ks=ts; + kc=tr-tt; + } + } + } + kc=(kc+ks)/3.0; +#ifdef OMP + g_omp_acc_re[thread_num] = kc; +#else + res = kc; +#endif + +#ifdef OMP + } /* OpenMP parallel closing brace */ + + res = 0.0; + for(int i=0; i < omp_num_threads; ++i) + res += g_omp_acc_re[i]; +#endif +#ifdef MPI + MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + res = mres; +#endif + GaugeInfo.plaquetteEnergy = res; + return res; +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/measure_gauge_action.h b/qcd/part_cpu/applications/QCD/src/kernel_D/measure_gauge_action.h new file mode 100644 index 0000000000000000000000000000000000000000..780a04d3d621025345c7c3f311df7e3da14d2cc5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/measure_gauge_action.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ +#ifndef _MEASURE_GAUGE_ACTION_H +#define _MEASURE_GAUGE_ACTION_H + +#include "su3.h" + +double measure_plaquette(const su3 ** const gf); +double measure_gauge_action(const su3 ** const gf, const double lambda); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/measure_rectangles.c b/qcd/part_cpu/applications/QCD/src/kernel_D/measure_rectangles.c new file mode 100644 index 0000000000000000000000000000000000000000..82ad165bdfcb2e3e2930ca6aed8076da51fbb38a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/measure_rectangles.c @@ -0,0 +1,140 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +/******************************************************************* + * + * Here the 1x2 rectangles are implemented + * for renormalization group improved gauge + * actions like the DBW2 or the Iwasaki + * gauge action. + * + * 1/3 \sum_{\mu\leq\nu;\mu,nu=1}^4 Tr U^{1x2} + * + * author: Carsten Urbach + * + * + *******************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef OMP +# include +#endif +#include "global.h" +#include "sse.h" +#include "su3.h" +#include "su3adj.h" +#include "geometry_eo.h" +#include "measure_rectangles.h" + + +double measure_rectangles(const su3 ** const gf) { + static double res; +#ifdef MPI + double ALIGN mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); +#endif + + int i, j, k, mu, nu; + su3 ALIGN pr1, pr2, tmp; + const su3 *v = NULL , *w = NULL; + double ALIGN ac, ks, kc, tr, ts, tt; + + kc = 0.0; + ks = 0.0; +#ifdef OMP +#pragma omp for +#endif + for (i = 0; i < VOLUME; i++) { + for (mu = 0; mu < 4; mu++) { + for (nu = 0; nu < 4; nu++) { + if(nu != mu) { + /* + ^ + | + ^ + | + -> + */ + j = g_iup[i][mu]; + k = g_iup[j][nu]; + v = &gf[i][mu]; + w = &gf[j][nu]; + _su3_times_su3(tmp, *v, *w); + v = &gf[k][nu]; + _su3_times_su3(pr1, tmp, *v); + /* + -> + ^ + | + ^ + | + */ + j = g_iup[i][nu]; + k = g_iup[j][nu]; + v = &gf[i][nu]; + w = &gf[j][nu]; + _su3_times_su3(tmp, *v, *w); + v = &gf[k][mu]; + _su3_times_su3(pr2, tmp, *v); + + /* Trace it */ + _trace_su3_times_su3d(ac,pr1,pr2); + /* printf("i mu nu: %d %d %d, ac = %e\n", i, mu, nu, ac); */ + /* Kahan summation */ + tr=ac+kc; + ts=tr+ks; + tt=ts-ks; + ks=ts; + kc=tr-tt; + } + } + } + } + kc=(kc+ks)/3.0; +#ifdef OMP + g_omp_acc_re[thread_num] = kc; +#else + res = kc; 
+#endif + +#ifdef OMP + } /* OpenMP parallel closing brace */ + + res = 0.0; + for(int i = 0; i < omp_num_threads; ++i) + res += g_omp_acc_re[i]; +#else +#endif +#ifdef MPI + MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + res = mres; +#endif + + return res; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/measure_rectangles.h b/qcd/part_cpu/applications/QCD/src/kernel_D/measure_rectangles.h new file mode 100644 index 0000000000000000000000000000000000000000..8959a1ce1e63238c1741e5f3928f362e788f327c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/measure_rectangles.h @@ -0,0 +1,26 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _MEASURE_RECTANGLES_H +#define _MEASURE_RECTANGLES_H + +#include "su3.h" + +double measure_rectangles(const su3 ** const gf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..d550f816c9c1ed2e6a49940be7da5b48e00c8aed --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/Makefile @@ -0,0 +1,100 @@ + +srcdir = . +top_builddir = .. +abs_top_builddir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +top_srcdir = .. +abs_top_srcdir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +subdir = monomial +builddir = . + +CFLAGS = -std=c99 -fopenmp -pedantic -Wall +DEPFLAGS = -MM +LDFLAGS = -L${HOME}/lib -L${top_builddir}/lib +DEFS = -DHAVE_CONFIG_H +OPTARGS = -O +SOPTARGS = -O + +AR = ar +RANLIB = ranlib +CC = mpicc +CCDEP = gcc +CCLD = ${CC} +LINK = ${CCLD} ${CFLAGS} ${LDFLAGS} ${OPTARGS} -o $@ +LEX = flex +AUTOCONF = autoconf +DEFS = -DHAVE_CONFIG_H + +INCLUDES = -I$(HOME)/include/ -I. 
-I${abs_top_builddir}/ -I${abs_top_srcdir}/ -I/include/ -I/include/ +LDADD = +#COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} +COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS} + +LIBRARIES = libmonomial +libmonomial_TARGETS = nddetratio_monomial monomial det_monomial detratio_monomial \ + gauge_monomial ndpoly_monomial clover_trlog_monomial cloverdet_monomial cloverdetratio_monomial \ + clovernd_trlog_monomial poly_monomial cloverndpoly_monomial moment_energy \ + ndrat_monomial ndratcor_monomial rat_monomial ratcor_monomial monitor_forces + + +libmonomial_STARGETS = + +libmonomial_OBJECTS = $(addsuffix .o, ${libmonomial_TARGETS}) +libmonomial_SOBJECTS = $(addsuffix .o, ${libmonomial_STARGETS}) + +# default rule + +all: Makefile dep libmonomial.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) -g +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) +profile all-profile: all + + +#include dep rules + +-include $(addsuffix .d,${libmonomial_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +${libmonomial_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${OPTARGS} -c $< + +${libmonomial_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${SOPTARGS} -c $< + +# rule to make libmonomial + +libmonomial.a: ${libmonomial_OBJECTS} ${libmonomial_SOBJECTS} Makefile + @rm -f libmonomial.a + @${AR} cru libmonomial.a ${libmonomial_OBJECTS} ${libmonomial_SOBJECTS} + @$(RANLIB) libmonomial.a + @cp libmonomial.a ../lib/libmonomial.a + +# rule to generate .d files + +$(addsuffix .d, $(libmonomial_TARGETS) ${libmonomial_STARGETS}): %.d: ${srcdir}/%.c Makefile + @${CCDEP} ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libmonomial_TARGETS} ${libmonomial_STARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, 
${LIBRARIES})} ${$(addsuffix _SOBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libmonomial.a + +distclean: clean + rm -f Makefile + +.PHONY: all dep clean compile-clean distclean profile all-profile debug all-debug diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..1e3a3e4cc32d02302b4b518f106bc23b9ace6d59 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/Makefile.in @@ -0,0 +1,100 @@ + +srcdir = @srcdir@ +top_builddir = @top_builddir@ +abs_top_builddir = @abs_top_builddir@ +top_srcdir = @top_srcdir@ +abs_top_srcdir = @abs_top_srcdir@ +subdir = monomial +builddir = @builddir@ + +CFLAGS = @CFLAGS@ +DEPFLAGS = @DEPFLAGS@ +LDFLAGS = @LDFLAGS@ +DEFS = @DEFS@ +OPTARGS = @OPTARGS@ +SOPTARGS = @SOPTARGS@ + +AR = @AR@ +RANLIB = @RANLIB@ +CC = @CC@ +CCDEP = @CCDEP@ +CCLD = ${CC} +LINK = ${CCLD} ${CFLAGS} ${LDFLAGS} ${OPTARGS} -o $@ +LEX = @LEX@ +AUTOCONF = @AUTOCONF@ +DEFS = @DEFS@ + +INCLUDES = @INCLUDES@ +LDADD = +#COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} +COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS} + +LIBRARIES = libmonomial +libmonomial_TARGETS = nddetratio_monomial monomial det_monomial detratio_monomial \ + gauge_monomial ndpoly_monomial clover_trlog_monomial cloverdet_monomial cloverdetratio_monomial \ + clovernd_trlog_monomial poly_monomial cloverndpoly_monomial moment_energy \ + ndrat_monomial ndratcor_monomial rat_monomial ratcor_monomial monitor_forces + + +libmonomial_STARGETS = + +libmonomial_OBJECTS = $(addsuffix .o, ${libmonomial_TARGETS}) +libmonomial_SOBJECTS = $(addsuffix .o, ${libmonomial_STARGETS}) + +# default rule + +all: Makefile dep libmonomial.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@ +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS 
:= $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@ +profile all-profile: all + + +#include dep rules + +-include $(addsuffix .d,${libmonomial_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +${libmonomial_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${OPTARGS} -c $< + +${libmonomial_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${SOPTARGS} -c $< + +# rule to make libmonomial + +libmonomial.a: ${libmonomial_OBJECTS} ${libmonomial_SOBJECTS} Makefile + @rm -f libmonomial.a + @${AR} cru libmonomial.a ${libmonomial_OBJECTS} ${libmonomial_SOBJECTS} + @$(RANLIB) libmonomial.a + @cp libmonomial.a ../lib/libmonomial.a + +# rule to generate .d files + +$(addsuffix .d, $(libmonomial_TARGETS) ${libmonomial_STARGETS}): %.d: ${srcdir}/%.c Makefile + @${CCDEP} ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libmonomial_TARGETS} ${libmonomial_STARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} ${$(addsuffix _SOBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libmonomial.a + +distclean: clean + rm -f Makefile + +.PHONY: all dep clean compile-clean distclean profile all-profile debug all-debug diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/clover_trlog_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/clover_trlog_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..c7abdd53c9fc600265b0c0b30841ab6b2c64adca --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/clover_trlog_monomial.c @@ -0,0 +1,91 @@ +/*********************************************************************** + * + * Copyright (C) 2012 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "su3adj.h" +#include "su3spinor.h" +#include "operator/clovertm_operators.h" +#include "operator/clover_leaf.h" +#include "monomial/monomial.h" +#include "operator/Hopping_Matrix.h" +#include "gettime.h" +#include "clover_trlog_monomial.h" + +void clover_trlog_derivative(const int id, hamiltonian_field_t * const hf) { + //monomial * mnl = &monomial_list[id]; + /* this term has no derivative */ + /* so a dummy function */ + if(g_proc_id == 0 && g_debug_level > 4) { + printf("called clover_trlog_derivative for id %d, which is a dummy function\n", id); + } + return; +} + + +void clover_trlog_heatbath(const int id, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[id]; + double atime, etime; + atime = gettime(); + mnl->energy0 = 0.; + + init_sw_fields(); + sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); + /*compute the contribution from the clover trlog term */ + mnl->energy0 = -sw_trace(EO, mnl->mu); + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { + printf("called clover_trlog_heatbath for id %d E = %e\n", 
id, mnl->energy0);
+    }
+  }
+  return;
+}
+
+double clover_trlog_acc(const int id, hamiltonian_field_t * const hf) { /* returns dH = energy1 - energy0 of monomial_list[id] */
+  monomial * mnl = &monomial_list[id];
+  double atime, etime;
+  atime = gettime();
+  mnl->energy1 = 0.;
+  sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw);
+  /*compute the contribution from the clover trlog term */
+  mnl->energy1 = -sw_trace(EO, mnl->mu);
+  etime = gettime();
+  if(g_proc_id == 0) { /* only process 0 prints; each message below carries its own debug-level gate, as in clover_trlog_heatbath */
+    if(g_debug_level > 1) {
+      printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime);
+    }
+    if(g_debug_level > 3) {
+      printf("called clover_trlog_acc for id %d dH = %1.10e\n",
+	     id, mnl->energy1 - mnl->energy0);
+    }
+  }
+  return(mnl->energy1 - mnl->energy0);
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/clover_trlog_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/clover_trlog_monomial.h
new file mode 100644
index 0000000000000000000000000000000000000000..cbc04bf53ac7ac6f7603cfc7ec70b53398e33ab9
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/clover_trlog_monomial.h
@@ -0,0 +1,30 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2012 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/ + +#ifndef _CLOVER_TRLOG_MONOMIAL_H +#define _CLOVER_TRLOG_MONOMIAL_H + +#include "hamiltonian_field.h" + +void clover_trlog_derivative(const int id, hamiltonian_field_t * const hf); +void clover_trlog_heatbath(const int id, hamiltonian_field_t * const hf); +double clover_trlog_acc(const int id, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverdet_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverdet_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..704f8e796c2d7078e86b50db1b50856a0a8583f4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverdet_monomial.c @@ -0,0 +1,224 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+/* NOTE(review): the include directives below that carry no header name   */
+/* look truncated (angle-bracket names presumably lost); restore them     */
+/* before compiling.                                                      */
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+#include
+#include
+#include
+#include "global.h"
+#include "su3.h"
+#include "su3adj.h"
+#include "su3spinor.h"
+#include "ranlxd.h"
+#include "sse.h"
+#include "start.h"
+#include "gettime.h"
+#include "linalg_eo.h"
+#include "deriv_Sb.h"
+#include "gamma.h"
+#include "operator/tm_operators.h"
+#include "operator/Hopping_Matrix.h"
+#include "solver/chrono_guess.h"
+#include "solver/solver.h"
+#include "solver/monomial_solve.h"
+#include "operator/clover_leaf.h"
+#include "read_input.h"
+#include "hamiltonian_field.h"
+#include "boundary.h"
+#include "monomial/monomial.h"
+#include "operator/clovertm_operators.h"
+#include "operator/clovertm_operators_32.h"
+#include "cloverdet_monomial.h"
+
+/* think about chronological solver ! */
+
+/*
+ * Force contribution of the clover determinant monomial with index id.
+ *
+ * Clears the insertion matrices swm/swp, switches g_mu, g_mu3 and the
+ * boundary to this monomial's mu, rho and kappa, rebuilds and inverts the
+ * clover term from hf->gaugefield, solves with mnl->Qsq for X_o
+ * (w_fields[1], iteration count added to mnl->iter1) and feeds the
+ * resulting fields through deriv_Sb, sw_spinor, sw_deriv and sw_all.
+ * Before returning, g_mu, g_mu3 and the boundary are set back to g_mu1,
+ * 0 and g_kappa.
+ */
+void cloverdet_derivative(const int id, hamiltonian_field_t * const hf) {
+  monomial * mnl = &monomial_list[id];
+  double atime, etime;
+  atime = gettime();
+  /* start from empty insertion matrices on every site and direction */
+  for(int i = 0; i < VOLUME; i++) {
+    for(int mu = 0; mu < 4; mu++) {
+      _su3_zero(swm[i][mu]);
+      _su3_zero(swp[i][mu]);
+    }
+  }
+
+  mnl->forcefactor = 1.;
+  /*********************************************************************
+   *
+   * even/odd version
+   *
+   * This term is det(\hat Q^2(\mu))
+   *
+   *********************************************************************/
+
+  g_mu = mnl->mu;
+  g_mu3 = mnl->rho;
+  boundary(mnl->kappa);
+
+  // we compute the clover term (1 + T_ee(oo)) for all sites x
+  sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw);
+  // we invert it for the even sites only
+  sw_invert(EE, mnl->mu);
+
+  if(mnl->solver == BICGSTAB && g_proc_id == 0) {
+    fprintf(stderr, "Bicgstab currently not implemented, using CG instead! (cloverdet_monomial.c)\n");
+  }
+
+  // Invert Q_{+} Q_{-}
+  // X_o -> w_fields[1]
+  chrono_guess(mnl->w_fields[1], mnl->pf, mnl->csg_field, mnl->csg_index_array,
+               mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq);
+  mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->pf, mnl->solver_params, mnl->maxiter, mnl->forceprec,
+                                 g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver);
+  chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
+                      mnl->csg_N, &mnl->csg_n, VOLUME/2);
+
+  // Y_o -> w_fields[0]
+  mnl->Qm(mnl->w_fields[0], mnl->w_fields[1]);
+
+  // apply Hopping Matrix M_{eo}
+  // to get the even sites of X_e
+  H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EO, -1, mnl->mu);
+  // \delta Q sandwiched by Y_o^\dagger and X_e
+  deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor);
+
+  // to get the even sites of Y_e
+  H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1, mnl->mu);
+  // \delta Q sandwiched by Y_e^\dagger and X_o
+  // uses the gauge field in hf and changes the derivative fields in hf
+  deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor);
+
+  // here comes the clover term...
+  // computes the insertion matrices for S_eff
+  // result is written to swp and swm
+  // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e
+  sw_spinor(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor);
+
+  // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o
+  sw_spinor(OO, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor);
+
+  // compute the contribution for the det-part
+  // we again compute only the insertion matrices for S_det
+  // the result is added to swp and swm
+  // even sites only!
+  sw_deriv(EE, mnl->mu);
+
+  // finally, using the insertion matrices stored in swm and swp,
+  // we compute the terms F^{det} and F^{sw} at once
+  // uses the gaugefields in hf and changes the derivative field in hf
+  sw_all(hf, mnl->kappa, mnl->c_sw);
+
+  // switch the global parameters back to their defaults
+  g_mu = g_mu1;
+  g_mu3 = 0.;
+  boundary(g_kappa);
+  etime = gettime();
+  if(g_debug_level > 1 && g_proc_id == 0) {
+    printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime);
+  }
+  return;
+}
+
+
+/*
+ * Heatbath step of the clover determinant monomial with index id.
+ *
+ * Resets the chronological-guess counters (csg_n, csg_n2) and the
+ * iteration counters (iter0, iter1), rebuilds and inverts the clover term,
+ * draws a Gaussian random spinor into w_fields[0], stores its square norm
+ * in mnl->energy0 and writes Qp applied to it into mnl->pf, which is also
+ * handed to chrono_add_solution. g_mu, g_mu3 and the boundary are set back
+ * to g_mu1, 0 and g_kappa before returning.
+ */
+void cloverdet_heatbath(const int id, hamiltonian_field_t * const hf) {
+
+  monomial * mnl = &monomial_list[id];
+  double atime, etime;
+  atime = gettime();
+
+  g_mu = mnl->mu;
+  g_mu3 = mnl->rho;
+  g_c_sw = mnl->c_sw;
+  boundary(mnl->kappa);
+  mnl->csg_n = 0;
+  mnl->csg_n2 = 0;
+  mnl->iter0 = 0;
+  mnl->iter1 = 0;
+
+  init_sw_fields();
+  sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw);
+  sw_invert(EE, mnl->mu);
+
+  random_spinor_field_eo(mnl->w_fields[0], mnl->rngrepro, RN_GAUSS);
+  mnl->energy0 = square_norm(mnl->w_fields[0], VOLUME/2, 1);
+
+  mnl->Qp(mnl->pf, mnl->w_fields[0]);
+  chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array,
+                      mnl->csg_N, &mnl->csg_n, VOLUME/2);
+
+  g_mu = g_mu1;
+  g_mu3 = 0.;
+  boundary(g_kappa);
+  etime = gettime();
+  if(g_proc_id == 0) {
+    if(g_debug_level > 1) {
+      printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime);
+    }
+    if(g_debug_level > 3) {
+      printf("called cloverdet_heatbath for id %d energy %f\n", id, mnl->energy0);
+    }
+  }
+  return;
+}
+
+
+/*
+ * Acceptance step of the clover determinant monomial with index id.
+ *
+ * Solves with mnl->Qsq for right-hand side mnl->pf at precision
+ * mnl->accprec while g_sloppy_precision_flag is forced to 0 (the previous
+ * value is restored afterwards), applies Qm to the solution in place and
+ * stores the square norm in mnl->energy1. The iteration count is added to
+ * mnl->iter0.
+ *
+ * Returns mnl->energy1 - mnl->energy0.
+ */
+double cloverdet_acc(const int id, hamiltonian_field_t * const hf) {
+  monomial * mnl = &monomial_list[id];
+  int save_sloppy = g_sloppy_precision_flag;
+  double atime, etime;
+  atime = gettime();
+
+  g_mu = mnl->mu;
+  g_mu3 = mnl->rho;
+  g_c_sw = mnl->c_sw;
+  boundary(mnl->kappa);
+
+  sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw);
+  sw_invert(EE, mnl->mu);
+
+  chrono_guess(mnl->w_fields[0], mnl->pf, mnl->csg_field, mnl->csg_index_array,
+               mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq);
+  g_sloppy_precision_flag = 0;
+  mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->pf, mnl->solver_params, mnl->maxiter, mnl->accprec,
+                                 g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver);
+  mnl->Qm(mnl->w_fields[0], mnl->w_fields[0]);
+
+  g_sloppy_precision_flag = save_sloppy;
+  /* Compute the energy contribution from the first field */
+  mnl->energy1 = square_norm(mnl->w_fields[0], VOLUME/2, 1);
+
+  g_mu = g_mu1;
+  g_mu3 = 0.;
+  boundary(g_kappa);
+  etime = gettime();
+  if(g_proc_id == 0) {
+    if(g_debug_level > 1) {
+      printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime);
+    }
+    if(g_debug_level > 3) {
+      printf("called cloverdet_acc for id %d dH = %1.10e\n",
+             id, mnl->energy1 - mnl->energy0);
+    }
+  }
+  return(mnl->energy1 - mnl->energy0);
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverdet_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverdet_monomial.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd8f308bd9a6926fcd943c0ca344ad1f352700b2
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverdet_monomial.h
@@ -0,0 +1,30 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/ + +#ifndef _CLOVERDET_MONOMIAL_H +#define _CLOVERDET_MONOMIAL_H + +#include "hamiltonian_field.h" + +void cloverdet_derivative(const int no, hamiltonian_field_t * const hf); +void cloverdet_heatbath(const int no, hamiltonian_field_t * const hf); +double cloverdet_acc(const int no, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverdetratio_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverdetratio_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..016fd483cea791f0e225a276adb17ecce7f25021 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverdetratio_monomial.c @@ -0,0 +1,325 @@ +/*********************************************************************** + * + * Copyright (C) 2012 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ * + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "start.h" +#include "gettime.h" +#include "linalg_eo.h" +#include "deriv_Sb.h" +#include "gamma.h" +#include "operator/tm_operators.h" +#include "operator/Hopping_Matrix.h" +#include "solver/chrono_guess.h" +#include "solver/solver.h" +#include "solver/monomial_solve.h" +#include "read_input.h" +#include "operator/clovertm_operators.h" +#include "operator/clovertm_operators_32.h" +#include "operator/clover_leaf.h" +#include "monomial/monomial.h" +#include "boundary.h" +#include "cloverdetratio_monomial.h" + +/* think about chronological solver ! */ + +void cloverdetratio_derivative_orig(const int no, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[no]; + double atime, etime; + atime = gettime(); + /* This factor 2* a missing factor 2 in trace_lambda */ + mnl->forcefactor = 1.; + + /********************************************************************* + * + * this is being run in case there is even/odd preconditioning + * + * This term is det((Q^2 + \mu_1^2)/(Q^2 + \mu_2^2)) + * mu1 and mu2 are set according to the monomial + * + *********************************************************************/ + /* First term coming from the second field */ + /* Multiply with W_+ */ + g_mu = mnl->mu; + g_mu3 = mnl->rho2; //rho2 + boundary(mnl->kappa); + + // we compute the clover term (1 + T_ee(oo)) for all sites x + sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); + // we invert it for the even sites only including mu + sw_invert(EE, mnl->mu); + + if(mnl->solver == BICGSTAB && g_proc_id==0) { + fprintf(stderr, "Bicgstab currently not implemented, using CG instead! 
(detratio_monomial.c)\n"); + } + + mnl->Qp(mnl->w_fields[2], mnl->pf); + g_mu3 = mnl->rho; // rho1 + + /* Invert Q_{+} Q_{-} */ + /* X_W -> w_fields[1] */ + chrono_guess(mnl->w_fields[1], mnl->w_fields[2], mnl->csg_field, + mnl->csg_index_array, mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq); + mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->w_fields[2], mnl->solver_params, mnl->maxiter, + mnl->forceprec, g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver); + chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, &mnl->csg_n, VOLUME/2); + /* Y_W -> w_fields[0] */ + mnl->Qm(mnl->w_fields[0], mnl->w_fields[1]); + + /* apply Hopping Matrix M_{eo} */ + /* to get the even sites of X */ + H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EO, -1, mnl->mu); + /* \delta Q sandwitched by Y_o^\dagger and X_e */ + deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); + + /* to get the even sites of Y */ + H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1, mnl->mu); + /* \delta Q sandwitched by Y_e^\dagger and X_o */ + deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor); + + // here comes the clover term... + // computes the insertion matrices for S_eff + // result is written to swp and swm + // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e + sw_spinor(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor); + + // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o + sw_spinor(OO, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor); + + g_mu3 = mnl->rho2; // rho2 + + /* Second term coming from the second field */ + /* The sign is opposite!! 
*/ + mul_r(mnl->w_fields[0], -1., mnl->pf, VOLUME/2); + + /* apply Hopping Matrix M_{eo} */ + /* to get the even sites of X */ + H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EO, -1, mnl->mu); + /* \delta Q sandwitched by Y_o^\dagger and X_e */ + deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); + + /* to get the even sites of Y */ + H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1, mnl->mu); + /* \delta Q sandwitched by Y_e^\dagger and X_o */ + deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor); + + // here comes the clover term... + // computes the insertion matrices for S_eff + // result is written to swp and swm + // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e + sw_spinor(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor); + + // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o + sw_spinor(OO, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor); + + sw_all(hf, mnl->kappa, mnl->c_sw); + + g_mu = g_mu1; + g_mu3 = 0.; + boundary(g_kappa); + etime = gettime(); + if(g_debug_level > 1 && g_proc_id == 0) { + printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime); + } + return; +} + + +void cloverdetratio_derivative(const int no, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[no]; + double atime, etime; + atime = gettime(); + for(int i = 0; i < VOLUME; i++) { + for(int mu = 0; mu < 4; mu++) { + _su3_zero(swm[i][mu]); + _su3_zero(swp[i][mu]); + } + } + mnl->forcefactor = 1.; + + /********************************************************************* + * + * this is being run in case there is even/odd preconditioning + * + * This term is det((Q^2 + \mu_1^2)/(Q^2 + \mu_2^2)) + * mu1 and mu2 are set according to the monomial + * + *********************************************************************/ + /* First term coming from the second field */ + /* Multiply with W_+ */ + g_mu = mnl->mu; + boundary(mnl->kappa); + + // we compute the clover 
term (1 + T_ee(oo)) for all sites x + sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); + // we invert it for the even sites only including mu + sw_invert(EE, mnl->mu); + + if(mnl->solver == BICGSTAB && g_proc_id == 0) { + fprintf(stderr, "Bicgstab currently not implemented, using CG instead! (cloverdetratio_monomial.c)\n"); + } + + // apply W_{+} to phi + g_mu3 = mnl->rho2; //rho2 + mnl->Qp(mnl->w_fields[2], mnl->pf); + g_mu3 = mnl->rho; // rho1 + + // Invert Q_{+} Q_{-} + // X_W -> w_fields[1] + chrono_guess(mnl->w_fields[1], mnl->w_fields[2], mnl->csg_field, + mnl->csg_index_array, mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq); + mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->w_fields[2], mnl->solver_params, mnl->maxiter, + mnl->forceprec, g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver); + chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, &mnl->csg_n, VOLUME/2); + // Apply Q_{-} to get Y_W -> w_fields[0] + mnl->Qm(mnl->w_fields[0], mnl->w_fields[1]); + // Compute phi - Y_W -> w_fields[0] + diff(mnl->w_fields[0], mnl->w_fields[0], mnl->pf, VOLUME/2); + + /* apply Hopping Matrix M_{eo} */ + /* to get the even sites of X */ + H_eo_sw_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EE, -1, mnl->mu); + /* \delta Q sandwitched by Y_o^\dagger and X_e */ + deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); + + /* to get the even sites of Y */ + H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EE, +1, mnl->mu); + /* \delta Q sandwitched by Y_e^\dagger and X_o */ + deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor); + + // here comes the clover term... 
+ // computes the insertion matrices for S_eff + // result is written to swp and swm + // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e + sw_spinor(EO, mnl->w_fields[2], mnl->w_fields[3], mnl->forcefactor); + + // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o + sw_spinor(OE, mnl->w_fields[0], mnl->w_fields[1], mnl->forcefactor); + + sw_all(hf, mnl->kappa, mnl->c_sw); + + g_mu = g_mu1; + g_mu3 = 0.; + boundary(g_kappa); + etime = gettime(); + if(g_debug_level > 1 && g_proc_id == 0) { + printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime); + } + return; +} + + +void cloverdetratio_heatbath(const int id, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[id]; + double atime, etime; + atime = gettime(); + g_mu = mnl->mu; + g_c_sw = mnl->c_sw; + boundary(mnl->kappa); + mnl->csg_n = 0; + mnl->csg_n2 = 0; + mnl->iter0 = 0; + mnl->iter1 = 0; + + init_sw_fields(); + sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); + sw_invert(EE, mnl->mu); + + random_spinor_field_eo(mnl->w_fields[0], mnl->rngrepro, RN_GAUSS); + mnl->energy0 = square_norm(mnl->w_fields[0], VOLUME/2, 1); + + g_mu3 = mnl->rho; + mnl->Qp(mnl->w_fields[1], mnl->w_fields[0]); + g_mu3 = mnl->rho2; + zero_spinor_field(mnl->pf,VOLUME/2); + + mnl->iter0 = solve_degenerate(mnl->pf, mnl->w_fields[1], mnl->solver_params, mnl->maxiter, mnl->accprec, + g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver); + + chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, &mnl->csg_n, VOLUME/2); + mnl->Qm(mnl->pf, mnl->pf); + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { + printf("called cloverdetratio_heatbath for id %d energy %f\n", id, mnl->energy0); + } + } + g_mu3 = 0.; + g_mu = g_mu1; + boundary(g_kappa); + return; +} + +double cloverdetratio_acc(const int id, hamiltonian_field_t * const hf) 
{ + monomial * mnl = &monomial_list[id]; + int save_sloppy = g_sloppy_precision_flag; + double atime, etime; + atime = gettime(); + g_mu = mnl->mu; + boundary(mnl->kappa); + + sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); + sw_invert(EE, mnl->mu); + + g_mu3 = mnl->rho2; + mnl->Qp(mnl->w_fields[1], mnl->pf); + g_mu3 = mnl->rho; + + chrono_guess(mnl->w_fields[0], mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, mnl->csg_n, VOLUME/2, &Qtm_plus_psi); + g_sloppy_precision_flag = 0; + mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params, mnl->maxiter, mnl->accprec, + g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver); + mnl->Qm(mnl->w_fields[0], mnl->w_fields[0]); + + g_sloppy_precision_flag = save_sloppy; + + /* Compute the energy contr. from second field */ + mnl->energy1 = square_norm(mnl->w_fields[0], VOLUME/2, 1); + + g_mu = g_mu1; + g_mu3 = 0.; + boundary(g_kappa); + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { + printf("called cloverdetratio_acc for id %d dH = %1.10e\n", + id, mnl->energy1 - mnl->energy0); + } + } + return(mnl->energy1 - mnl->energy0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverdetratio_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverdetratio_monomial.h new file mode 100644 index 0000000000000000000000000000000000000000..5c7ce4fcfc6c8a3ef5aa5b6819ef5001714f8748 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverdetratio_monomial.h @@ -0,0 +1,30 @@ +/*********************************************************************** + * + * Copyright (C) 2012 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + * + ***********************************************************************/ +#ifndef _CLOVERDETRATIO_MONOMIAL_H +#define _CLOVERDETRATIO_MONOMIAL_H + +#include "hamiltonian_field.h" + +void cloverdetratio_derivative(const int no, hamiltonian_field_t * const hf); +double cloverdetratio_acc(const int no, hamiltonian_field_t * const hf); +void cloverdetratio_heatbath(const int no, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/clovernd_trlog_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/clovernd_trlog_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..fecc5400d125ce81304cc7f9d5da959c9ecd743b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/clovernd_trlog_monomial.c @@ -0,0 +1,91 @@ +/*********************************************************************** + * + * Copyright (C) 2012 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version.
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "su3adj.h" +#include "su3spinor.h" +#include "operator/clovertm_operators.h" +#include "operator/clover_leaf.h" +#include "monomial/monomial.h" +#include "operator/Hopping_Matrix.h" +#include "gettime.h" +#include "clovernd_trlog_monomial.h" + +void clovernd_trlog_derivative(const int id, hamiltonian_field_t * const hf) { + //monomial * mnl = &monomial_list[id]; + /* this term has no derivative */ + /* so a dummy function */ + if(g_proc_id == 0 && g_debug_level > 4) { + printf("called clovernd_trlog_derivative for id %d, which is a dummy function\n", id); + } + return; +} + + +void clovernd_trlog_heatbath(const int id, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[id]; + double atime, etime; + atime = gettime(); + mnl->energy0 = 0.; + + init_sw_fields(); + sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); + /*compute the contribution from the clover trlog term */ + mnl->energy0 = -sw_trace_nd(EE, mnl->mubar, mnl->epsbar); + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { + printf("called clovernd_trlog_heatbath for id %d E = %e\n", id, mnl->energy0); + } + } + return; +} + +double clovernd_trlog_acc(const int id, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[id]; + double atime, etime; + atime = gettime(); + mnl->energy1 = 0.; + 
sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); + /*compute the contribution from the clover trlog term */ + mnl->energy1 = -sw_trace_nd(EE, mnl->mubar, mnl->epsbar); + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { + printf("called clovernd_trlog_acc for id %d dH = %1.10e\n", + id, mnl->energy1 - mnl->energy0); + } + } + return(mnl->energy1 - mnl->energy0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/clovernd_trlog_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/clovernd_trlog_monomial.h new file mode 100644 index 0000000000000000000000000000000000000000..69c50b0c622f3c26f2ac1978a5fb7b8c00ed363b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/clovernd_trlog_monomial.h @@ -0,0 +1,30 @@ +/*********************************************************************** + * + * Copyright (C) 2012 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _CLOVERND_TRLOG_MONOMIAL_H +#define _CLOVERND_TRLOG_MONOMIAL_H + +#include "hamiltonian_field.h" + +void clovernd_trlog_derivative(const int id, hamiltonian_field_t * const hf); +void clovernd_trlog_heatbath(const int id, hamiltonian_field_t * const hf); +double clovernd_trlog_acc(const int id, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverndpoly_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverndpoly_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..2fcdeab1d695e9f3abd41076556ec50f64400366 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverndpoly_monomial.c @@ -0,0 +1,241 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Thomas Chiarappa, Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+/* NOTE(review): the include directives below that carry no header name   */
+/* look truncated (angle-bracket names presumably lost); restore them     */
+/* before compiling.                                                      */
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+#include
+#include
+#include
+#include
+#include "global.h"
+#include "su3.h"
+#include "linalg_eo.h"
+#include "start.h"
+#include "gettime.h"
+#include "solver/solver.h"
+#include "deriv_Sb.h"
+#include "operator/tm_operators.h"
+#include "operator/tm_operators_nd.h"
+#include "operator/Hopping_Matrix.h"
+#include "phmc.h"
+#include "Ptilde_nd.h"
+#include "monomial/monomial.h"
+#include "hamiltonian_field.h"
+#include "boundary.h"
+#include "operator/clovertm_operators.h"
+#include "operator/clover_leaf.h"
+#include "cloverndpoly_monomial.h"
+
+/********************************************
+ *
+ * Here \delta S_b is computed
+ *
+ * Force contribution of the non-degenerate clover polynomial monomial
+ * with index id. Clears swm/swp, rebuilds and inverts the clover term,
+ * sets mnl->forcefactor to -phmc_Cpol*mnl->EVMaxInv, builds the chi
+ * fields from mnl->pf / mnl->pf2 by repeated Qsw_tau1_sub_const_ndpsi
+ * with the roots in mnl->MDPolyRoots, and for j = MDPolyDegree-1 down
+ * to 1 accumulates deriv_Sb and sw_spinor contributions. sw_deriv_nd
+ * and sw_all are called once at the end.
+ *
+ ********************************************/
+
+void cloverndpoly_derivative(const int id, hamiltonian_field_t * const hf) {
+  int j, k;
+  monomial * mnl = &monomial_list[id];
+  double atime, etime;
+  atime = gettime();
+  /* start from empty insertion matrices on every site and direction */
+  for(int i = 0; i < VOLUME; i++) {
+    for(int mu = 0; mu < 4; mu++) {
+      _su3_zero(swm[i][mu]);
+      _su3_zero(swp[i][mu]);
+    }
+  }
+  ndpoly_set_global_parameter(mnl, 0);
+
+  // we compute the clover term (1 + T_ee(oo)) for all sites x
+  sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw);
+  // we invert it for the even sites only
+  sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
+
+  mnl->forcefactor = -phmc_Cpol*mnl->EVMaxInv;
+
+  /* Recall: The GAMMA_5 left of delta M_eo is done in deriv_Sb !!! */
+
+  /* Here come the definitions of the chi_j fields                   */
+  /* from j=0 (chi_0 = phi) ..... to j = n-1                         */
+  /* in g_chi_up_spinor_field[0] (g_chi_dn_spinor_field[0]) we expect */
+  /* to find the phi field, the pseudo fermion field,                */
+  /* i.e. it must be equal to mnl->pf (mnl->pf2)                     */
+
+  assign(g_chi_up_spinor_field[0], mnl->pf, VOLUME/2);
+  assign(g_chi_dn_spinor_field[0], mnl->pf2, VOLUME/2);
+
+  for(k = 1; k < (mnl->MDPolyDegree-1); k++) {
+    Qsw_tau1_sub_const_ndpsi(g_chi_up_spinor_field[k], g_chi_dn_spinor_field[k],
+                             g_chi_up_spinor_field[k-1], g_chi_dn_spinor_field[k-1],
+                             mnl->MDPolyRoots[k-1], phmc_Cpol, phmc_invmaxev);
+  }
+
+  /* Here come the remaining fields chi_k ; k=n,...,2n-1 */
+  /* They are evaluated step-by-step, overwriting the same field (index mnl->MDPolyDegree) */
+
+  assign(g_chi_up_spinor_field[mnl->MDPolyDegree], g_chi_up_spinor_field[mnl->MDPolyDegree-2], VOLUME/2);
+  assign(g_chi_dn_spinor_field[mnl->MDPolyDegree], g_chi_dn_spinor_field[mnl->MDPolyDegree-2], VOLUME/2);
+
+  for(j = (mnl->MDPolyDegree-1); j > 0; j--) {
+    /* keep the previous field in slot MDPolyDegree-1 before overwriting slot MDPolyDegree */
+    assign(g_chi_up_spinor_field[mnl->MDPolyDegree-1], g_chi_up_spinor_field[mnl->MDPolyDegree], VOLUME/2);
+    assign(g_chi_dn_spinor_field[mnl->MDPolyDegree-1], g_chi_dn_spinor_field[mnl->MDPolyDegree], VOLUME/2);
+
+    Qsw_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->MDPolyDegree], g_chi_dn_spinor_field[mnl->MDPolyDegree],
+                             g_chi_up_spinor_field[mnl->MDPolyDegree-1], g_chi_dn_spinor_field[mnl->MDPolyDegree-1],
+                             mnl->MDPolyRoots[2*mnl->MDPolyDegree-3-j], phmc_Cpol, phmc_invmaxev);
+
+    /* Get the even parts of the (j-1)th chi_spinors */
+    H_eo_sw_ndpsi(mnl->w_fields[0], mnl->w_fields[1],
+                  g_chi_up_spinor_field[j-1], g_chi_dn_spinor_field[j-1]);
+
+    /* \delta M_eo sandwiched by chi[j-1]_e^\dagger and chi[2N-j]_o */
+    deriv_Sb(EO, mnl->w_fields[0], g_chi_up_spinor_field[mnl->MDPolyDegree], hf, mnl->forcefactor);/* UP */
+    deriv_Sb(EO, mnl->w_fields[1], g_chi_dn_spinor_field[mnl->MDPolyDegree], hf, mnl->forcefactor);/* DN */
+
+    /* Get the even parts of the (2N-j)-th chi_spinors */
+    H_eo_sw_ndpsi(mnl->w_fields[2], mnl->w_fields[3],
+                  g_chi_up_spinor_field[mnl->MDPolyDegree], g_chi_dn_spinor_field[mnl->MDPolyDegree]);
+
+    /* \delta M_oe sandwiched by chi[j-1]_o^\dagger and chi[2N-j]_e */
+    deriv_Sb(OE, g_chi_up_spinor_field[j-1], mnl->w_fields[2], hf, mnl->forcefactor);
+    deriv_Sb(OE, g_chi_dn_spinor_field[j-1], mnl->w_fields[3], hf, mnl->forcefactor);
+
+    // even/even sites sandwiched by tau_1 gamma_5 Y_e and gamma_5 X_e
+    sw_spinor(EE, mnl->w_fields[3], mnl->w_fields[0], mnl->forcefactor);
+    // odd/odd sites sandwiched by tau_1 gamma_5 Y_o and gamma_5 X_o
+    sw_spinor(OO, g_chi_up_spinor_field[j-1], g_chi_dn_spinor_field[mnl->MDPolyDegree], mnl->forcefactor);
+
+    // even/even sites sandwiched by tau_1 gamma_5 Y_e and gamma_5 X_e
+    sw_spinor(EE, mnl->w_fields[2], mnl->w_fields[1], mnl->forcefactor);
+    // odd/odd sites sandwiched by tau_1 gamma_5 Y_o and gamma_5 X_o
+    sw_spinor(OO, g_chi_dn_spinor_field[j-1], g_chi_up_spinor_field[mnl->MDPolyDegree], mnl->forcefactor);
+  }
+  // trlog part does not depend on the normalisation of the polynomial
+  sw_deriv_nd(EE);
+  sw_all(hf, mnl->kappa, mnl->c_sw);
+  etime = gettime();
+  if(g_debug_level > 1 && g_proc_id == 0) {
+    printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime);
+  }
+  return;
+}
+
+
+/*
+ * Heatbath step of the non-degenerate clover polynomial monomial id.
+ *
+ * Rebuilds and inverts the clover term; if mnl->rec_ev is non-zero and
+ * hf->traj_counter is a multiple of it, phmc_compute_ev is called first.
+ * Gaussian random fields are drawn into g_chi_up/dn_spinor_field[0] and
+ * the sum of their square norms is stored in mnl->energy0. Qsw_ndpsi and
+ * then MDPolyDegree-1 applications of Qsw_tau1_sub_const_ndpsi (swapping
+ * the up0/up1 and dn0/dn1 pointers after each) are followed by
+ * Ptilde_ndpsi; the result is copied into mnl->pf and mnl->pf2.
+ */
+void cloverndpoly_heatbath(const int id, hamiltonian_field_t * const hf) {
+  int j;
+  monomial * mnl = &monomial_list[id];
+  spinor *up0, *dn0, *up1, *dn1, *dummy;
+  double atime, etime;
+  atime = gettime();
+  ndpoly_set_global_parameter(mnl, 0);
+  g_mu3 = 0.;
+  init_sw_fields();
+  sw_term((const su3**)hf->gaugefield, mnl->kappa, mnl->c_sw);
+  sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
+
+  // we measure before trajectory!
+  if((mnl->rec_ev != 0) && (hf->traj_counter%mnl->rec_ev == 0)) {
+    phmc_compute_ev(hf->traj_counter-1, id, &Qsw_pm_ndbipsi);
+  }
+
+  mnl->energy0 = 0.;
+  random_spinor_field_eo(g_chi_up_spinor_field[0], mnl->rngrepro, RN_GAUSS);
+  mnl->energy0 = square_norm(g_chi_up_spinor_field[0], VOLUME/2, 1);
+
+  random_spinor_field_eo(g_chi_dn_spinor_field[0], mnl->rngrepro, RN_GAUSS);
+  mnl->energy0 += square_norm(g_chi_dn_spinor_field[0], VOLUME/2, 1);
+
+  Qsw_ndpsi(g_chi_up_spinor_field[1], g_chi_dn_spinor_field[1],
+            g_chi_up_spinor_field[0], g_chi_dn_spinor_field[0]);
+
+  up0 = g_chi_up_spinor_field[0];
+  up1 = g_chi_up_spinor_field[1];
+  dn0 = g_chi_dn_spinor_field[0];
+  dn1 = g_chi_dn_spinor_field[1];
+
+  for(j = 1; j < (mnl->MDPolyDegree); j++){
+    Qsw_tau1_sub_const_ndpsi(up0, dn0,
+                             up1, dn1,
+                             mnl->MDPolyRoots[mnl->MDPolyDegree-2+j], phmc_Cpol, phmc_invmaxev);
+    /* swap so that the field just written becomes the next input (up1/dn1) */
+    dummy = up1; up1 = up0; up0 = dummy;
+    dummy = dn1; dn1 = dn0; dn0 = dummy;
+  }
+  Ptilde_ndpsi(up0, dn0, mnl->PtildeCoefs,
+               mnl->PtildeDegree, up1, dn1, &Qsw_pm_ndpsi);
+
+  assign(mnl->pf, up0, VOLUME/2);
+  assign(mnl->pf2, dn0, VOLUME/2);
+  etime = gettime();
+  if(g_proc_id == 0) {
+    if(g_debug_level > 1) {
+      printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime);
+    }
+    if(g_debug_level > 3) {
+      printf("called cloverndpoly_heatbath for id %d energy %f\n", id, mnl->energy0);
+    }
+  }
+  return;
+}
+
+
+/*
+ * Acceptance step of the non-degenerate clover polynomial monomial id.
+ *
+ * Copies mnl->pf / mnl->pf2 into g_chi_up/dn_spinor_field[0], applies
+ * Qsw_tau1_sub_const_ndpsi MDPolyDegree-1 times with the roots
+ * mnl->MDPolyRoots[0 .. MDPolyDegree-2] (swapping pointers so the result
+ * is always in up0/dn0) and stores the sum of the two square norms in
+ * mnl->energy1.
+ *
+ * Returns mnl->energy1 - mnl->energy0.
+ */
+double cloverndpoly_acc(const int id, hamiltonian_field_t * const hf) {
+  int j;
+  monomial * mnl = &monomial_list[id];
+  spinor *up0, *dn0, *up1, *dn1, *dummy;
+  double atime, etime;
+  atime = gettime();
+  ndpoly_set_global_parameter(mnl, 0);
+  g_mu3 = 0.;
+  sw_term((const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw);
+  sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
+
+  mnl->energy1 = 0.;
+
+  up0 = g_chi_up_spinor_field[0];
+  up1 = g_chi_up_spinor_field[1];
+  dn0 = g_chi_dn_spinor_field[0];
+  dn1 = g_chi_dn_spinor_field[1];
+  /* This is needed if we consider only "1" in eq. 9 */
+  assign(up0, mnl->pf , VOLUME/2);
+  assign(dn0, mnl->pf2, VOLUME/2);
+
+  for(j = 1; j <= (mnl->MDPolyDegree-1); j++) {
+    Qsw_tau1_sub_const_ndpsi(up1, dn1, up0, dn0, mnl->MDPolyRoots[j-1], phmc_Cpol, phmc_invmaxev);
+
+    dummy = up1; up1 = up0; up0 = dummy;
+    dummy = dn1; dn1 = dn0; dn0 = dummy;
+    /* result always in up0 and dn0 */
+  }
+
+  mnl->energy1 = square_norm(up0, VOLUME/2, 1);
+  mnl->energy1 += square_norm(dn0, VOLUME/2, 1);
+  etime = gettime();
+  if(g_proc_id == 0) {
+    if(g_debug_level > 1) {
+      printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime);
+    }
+    if(g_debug_level > 3) {
+      printf("called cloverndpoly_acc for id %d dH = %1.10e\n", id, mnl->energy1 - mnl->energy0);
+    }
+  }
+  return(mnl->energy1 - mnl->energy0);
+}
+
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverndpoly_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverndpoly_monomial.h
new file mode 100644
index 0000000000000000000000000000000000000000..47095f0b769a7d1b0baf1aca8f66df903927fd6b
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/cloverndpoly_monomial.h
@@ -0,0 +1,29 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/ +#ifndef _CLOVERNDPOLY_MONOMIAL_H +#define _CLOVERNDPOLY_MONOMIAL_H + +#include "hamiltonian_field.h" + +void cloverndpoly_derivative(const int id, hamiltonian_field_t * const hf); +double cloverndpoly_acc(const int id, hamiltonian_field_t * const hf); +void cloverndpoly_heatbath(const int id, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/det_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/det_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..7985251a195f78d986b8959a798ba868952e01fd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/det_monomial.c @@ -0,0 +1,263 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "start.h" +#include "gettime.h" +#include "linalg_eo.h" +#include "deriv_Sb.h" +#include "deriv_Sb_D_psi.h" +#include "operator/tm_operators.h" +#include "operator/Hopping_Matrix.h" +#include "solver/chrono_guess.h" +#include "solver/solver.h" +#include "solver/monomial_solve.h" +#include "read_input.h" +#include "hamiltonian_field.h" +#include "boundary.h" +#include "monomial/monomial.h" +#include "det_monomial.h" + +/* think about chronological solver ! */ + +void det_derivative(const int id, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[id]; + double atime, etime; + atime = gettime(); + mnl->forcefactor = 1.; + + if(mnl->even_odd_flag) { + /********************************************************************* + * + * even/odd version + * + * This a term is det(\hat Q^2(\mu)) + * + *********************************************************************/ + + g_mu = mnl->mu; + boundary(mnl->kappa); + + /* Invert Q_{+} Q_{-} */ + /* X_o -> w_fields[1] */ + chrono_guess(mnl->w_fields[1], mnl->pf, mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq); + + if(mnl->solver==BICGSTAB) + { + fprintf(stderr, "Bicgstab currently not implemented, using CG instead! 
(det_monomial.c)\n"); + mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->pf, mnl->solver_params, mnl->maxiter, mnl->forceprec, + g_relative_precision_flag, VOLUME/2, mnl->Qsq, CG); + } + else{ + mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->pf, mnl->solver_params, mnl->maxiter, mnl->forceprec, + g_relative_precision_flag, VOLUME/2, mnl->Qsq, mnl->solver); + } + + + chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, &mnl->csg_n, VOLUME/2); + + /* Y_o -> w_fields[0] */ + mnl->Qm(mnl->w_fields[0], mnl->w_fields[1]); + + /* apply Hopping Matrix M_{eo} */ + /* to get the even sites of X_e */ + H_eo_tm_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EO, -1.); + /* \delta Q sandwitched by Y_o^\dagger and X_e */ + deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); + + /* to get the even sites of Y_e */ + H_eo_tm_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1); + /* \delta Q sandwitched by Y_e^\dagger and X_o */ + deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor); + } + else { + /********************************************************************* + * non even/odd version + * + * This term is det(Q^2 + \mu_1^2) + * + *********************************************************************/ + g_mu = mnl->mu; + boundary(mnl->kappa); + if((mnl->solver == CG) || (mnl->solver == MIXEDCG) || (mnl->solver == RGMIXEDCG)) { + /* Invert Q_{+} Q_{-} */ + /* X -> w_fields[1] */ + chrono_guess(mnl->w_fields[1], mnl->pf, mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, mnl->csg_n, VOLUME/2, &Q_pm_psi); + mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->pf, mnl->solver_params, + mnl->maxiter, mnl->forceprec, g_relative_precision_flag, + VOLUME, &Q_pm_psi, mnl->solver); + chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, &mnl->csg_n, VOLUME/2); + + /* Y -> w_fields[0] */ + Q_minus_psi(mnl->w_fields[0], mnl->w_fields[1]); + + } + else { + /* Invert 
first Q_+ */ + /* Y -> w_fields[0] */ + chrono_guess(mnl->w_fields[0], mnl->pf, mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, mnl->csg_n, VOLUME/2, &Q_plus_psi); + mnl->iter1 += solve_degenerate(mnl->w_fields[0], mnl->pf, mnl->solver_params, + mnl->maxiter, mnl->forceprec, g_relative_precision_flag, + VOLUME, &Q_plus_psi, mnl->solver); + chrono_add_solution(mnl->w_fields[0], mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, &mnl->csg_n, VOLUME/2); + + /* Now Q_- */ + /* X -> w_fields[1] */ + + chrono_guess(mnl->w_fields[1], mnl->w_fields[0], mnl->csg_field2, + mnl->csg_index_array2, mnl->csg_N2, mnl->csg_n2, VOLUME/2, &Q_minus_psi); + mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->w_fields[0], mnl->solver_params, + mnl->maxiter, mnl->forceprec, g_relative_precision_flag, + VOLUME, &Q_minus_psi, mnl->solver); + chrono_add_solution(mnl->w_fields[1], mnl->csg_field2, mnl->csg_index_array2, + mnl->csg_N2, &mnl->csg_n2, VOLUME/2); + + } + + /* \delta Q sandwitched by Y^\dagger and X */ + deriv_Sb_D_psi(mnl->w_fields[0], mnl->w_fields[1], hf, mnl->forcefactor); + } + g_mu = g_mu1; + boundary(g_kappa); + etime = gettime(); + if(g_debug_level > 1 && g_proc_id == 0) { + printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime); + } + return; +} + + +void det_heatbath(const int id, hamiltonian_field_t * const hf) { + + monomial * mnl = &monomial_list[id]; + double atime, etime; + atime = gettime(); + g_mu = mnl->mu; + boundary(mnl->kappa); + mnl->csg_n = 0; + mnl->csg_n2 = 0; + mnl->iter0 = 0; + mnl->iter1 = 0; + + if(mnl->even_odd_flag) { + random_spinor_field_eo(mnl->w_fields[0], mnl->rngrepro, RN_GAUSS); + mnl->energy0 = square_norm(mnl->w_fields[0], VOLUME/2, 1); + + mnl->Qp(mnl->pf, mnl->w_fields[0]); + chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, &mnl->csg_n, VOLUME/2); + if(mnl->solver != CG) { + chrono_add_solution(mnl->pf, mnl->csg_field2, mnl->csg_index_array2, + mnl->csg_N2, &mnl->csg_n2, 
VOLUME/2); + } + } + else { + random_spinor_field_lexic(mnl->w_fields[0], mnl->rngrepro,RN_GAUSS); + mnl->energy0 = square_norm(mnl->w_fields[0], VOLUME, 1); + + Q_plus_psi(mnl->pf, mnl->w_fields[0]); + chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, &mnl->csg_n, VOLUME/2); + if(mnl->solver != CG) { + chrono_add_solution(mnl->pf, mnl->csg_field2, mnl->csg_index_array2, + mnl->csg_N2, &mnl->csg_n2, VOLUME/2); + } + } + g_mu = g_mu1; + boundary(g_kappa); + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { + printf("called det_heatbath for id %d energey %f\n", id, mnl->energy0); + } + } + return; +} + +
+/*
+ * Acceptance-step contribution of determinant monomial `id`.
+ *
+ * Sets g_mu and the boundary from the monomial, solves against the
+ * pseudofermion field mnl->pf to the acceptance precision mnl->accprec,
+ * and stores the squared norm of the result in mnl->energy1.  g_mu and
+ * the boundary are reset to g_mu1 / g_kappa before returning.  hf is not
+ * used here.
+ *
+ * Returns mnl->energy1 - mnl->energy0.
+ */
+double det_acc(const int id, hamiltonian_field_t * const hf) {
+  monomial * mnl = &monomial_list[id];
+  int save_sloppy = g_sloppy_precision_flag;
+  double atime, etime;
+  atime = gettime();
+  g_mu = mnl->mu;
+  boundary(mnl->kappa);
+  if(mnl->even_odd_flag) {
+
+    chrono_guess(mnl->w_fields[0], mnl->pf, mnl->csg_field, mnl->csg_index_array,
+                 mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq);
+    /* the solve and the Qm application run with sloppy precision off */
+    g_sloppy_precision_flag = 0;
+    mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->pf, mnl->solver_params, mnl->maxiter,
+                                   mnl->accprec, g_relative_precision_flag,VOLUME/2, mnl->Qsq, mnl->solver);
+    mnl->Qm(mnl->w_fields[1], mnl->w_fields[0]);
+    g_sloppy_precision_flag = save_sloppy;
+    /* Compute the energy contr. from first field */
+    mnl->energy1 = square_norm(mnl->w_fields[1], VOLUME/2, 1);
+  }
+  else {
+    /* NOTE(review): det_derivative also treats RGMIXEDCG as a CG-type
+     * solver in its non even/odd branch; this test does not -- confirm
+     * whether that difference is intended. */
+    if((mnl->solver == CG) || (mnl->solver == MIXEDCG)) {
+      chrono_guess(mnl->w_fields[1], mnl->pf, mnl->csg_field, mnl->csg_index_array,
+                   mnl->csg_N, mnl->csg_n, VOLUME/2, &Q_pm_psi);
+      mnl->iter0 += solve_degenerate(mnl->w_fields[1], mnl->pf, mnl->solver_params, mnl->maxiter,
+                                     mnl->accprec, g_relative_precision_flag,
+                                     VOLUME, &Q_pm_psi, mnl->solver);
+      Q_minus_psi(mnl->w_fields[0], mnl->w_fields[1]);
+      /* Compute the energy contr. from first field */
+      mnl->energy1 = square_norm(mnl->w_fields[0], VOLUME, 1);
+    }
+    else {
+      chrono_guess(mnl->w_fields[0], mnl->pf, mnl->csg_field, mnl->csg_index_array,
+                   mnl->csg_N, mnl->csg_n, VOLUME/2, &Q_plus_psi);
+      /* acceptance solve: use accprec like the two branches above
+       * (forceprec is what det_derivative uses for the force) */
+      mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->pf, mnl->solver_params,
+                                     mnl->maxiter, mnl->accprec, g_relative_precision_flag,
+                                     VOLUME, &Q_plus_psi, mnl->solver);
+      mnl->energy1 = square_norm(mnl->w_fields[0], VOLUME, 1);
+    }
+  }
+  g_mu = g_mu1;
+  boundary(g_kappa);
+  etime = gettime();
+  if(g_proc_id == 0) {
+    if(g_debug_level > 1) {
+      printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime);
+    }
+    if(g_debug_level > 3) {
+      printf("called det_acc for id %d dH = %1.10e\n",
+             id, mnl->energy1 - mnl->energy0);
+    }
+  }
+  return(mnl->energy1 - mnl->energy0);
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/det_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/det_monomial.h new file mode 100644 index 0000000000000000000000000000000000000000..fb66a7b31742daa8ec03ac5f4fbd056714bd39dd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/det_monomial.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ +#ifndef _DET_MONOMIAL_H +#define _DET_MONOMIAL_H + +#include "hamiltonian_field.h" + +void det_derivative(const int no, hamiltonian_field_t * const hf); +void det_heatbath(const int no, hamiltonian_field_t * const hf); +double det_acc(const int no, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/detratio_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/detratio_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..3614a05eac28e35675c9db92f1f7763dfa8f806c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/detratio_monomial.c @@ -0,0 +1,328 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "start.h" +#include "gettime.h" +#include "linalg_eo.h" +#include "deriv_Sb.h" +#include "deriv_Sb_D_psi.h" +#include "operator/tm_operators.h" +#include "operator/Hopping_Matrix.h" +#include "solver/chrono_guess.h" +#include "solver/solver.h" +#include "solver/monomial_solve.h" +#include "read_input.h" +#include "gamma.h" +#include "monomial/monomial.h" +#include "boundary.h" +#include "detratio_monomial.h" + +/* think about chronological solver ! */ + +void detratio_derivative(const int no, hamiltonian_field_t * const hf) { + double atime, etime; + monomial * mnl = &monomial_list[no]; + + atime = gettime(); + mnl->forcefactor = 1.; + + if(mnl->even_odd_flag) { + /* + * this is being run in case there is even/odd preconditioning + */ + /********************************************************************* + * + * This term is det((Q^2 + \mu_1^2)/(Q^2 + \mu_2^2)) + * mu1 and mu2 are set according to the monomial + * + *********************************************************************/ + /* First term coming from the second field */ + /* Multiply with W_+ */ + g_mu = mnl->mu2; + boundary(mnl->kappa2); + + if(mnl->solver != CG) { + fprintf(stderr, "Bicgstab currently not implemented, using CG instead! 
(detratio_monomial.c)\n"); + } + + Qtm_plus_psi(mnl->w_fields[2], mnl->pf); + g_mu = mnl->mu; + boundary(mnl->kappa); + /* Invert Q_{+} Q_{-} */ + /* X_W -> w_fields[1] */ + chrono_guess(mnl->w_fields[1], mnl->w_fields[2], mnl->csg_field, + mnl->csg_index_array, mnl->csg_N, mnl->csg_n, VOLUME/2, &Qtm_pm_psi); + + if(mnl->solver == BICGSTAB) { + fprintf(stderr, "Bicgstab currently not implemented, using CG instead! (detratio_monomial.c)\n"); + mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->w_fields[2], mnl->solver_params, mnl->maxiter, + mnl->forceprec, g_relative_precision_flag, VOLUME/2, &Qtm_pm_psi, CG); + } + else{ + mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->w_fields[2], mnl->solver_params, mnl->maxiter, + mnl->forceprec, g_relative_precision_flag, VOLUME/2, &Qtm_pm_psi, mnl->solver); + } + chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, &mnl->csg_n, VOLUME/2); + /* Y_W -> w_fields[0] */ + Qtm_minus_psi(mnl->w_fields[0], mnl->w_fields[1]); + + /* apply Hopping Matrix M_{eo} */ + /* to get the even sites of X */ + H_eo_tm_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EO, -1.); + /* \delta Q sandwitched by Y_o^\dagger and X_e */ + deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); + + /* to get the even sites of Y */ + H_eo_tm_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1); + /* \delta Q sandwitched by Y_e^\dagger and X_o */ + deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor); + + g_mu = mnl->mu2; + boundary(mnl->kappa2); + + /* Second term coming from the second field */ + /* The sign is opposite!! 
*/ + mul_r(mnl->w_fields[0], -1., mnl->pf, VOLUME/2); + + /* apply Hopping Matrix M_{eo} */ + /* to get the even sites of X */ + H_eo_tm_inv_psi(mnl->w_fields[2], mnl->w_fields[1], EO, -1.); + /* \delta Q sandwitched by Y_o^\dagger and X_e */ + deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, mnl->forcefactor); + + /* to get the even sites of Y */ + H_eo_tm_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1); + /* \delta Q sandwitched by Y_e^\dagger and X_o */ + deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], hf, mnl->forcefactor); + } + else { /* no even/odd preconditioning */ + /********************************************************************* + * + * This term is det((Q^2 + \mu_1^2)/(Q^2 + \mu_2^2)) + * mu1 and mu2 are set according to the monomial + * + *********************************************************************/ + /* First term coming from the second field */ + /* Multiply with W_+ */ + g_mu = mnl->mu2; + boundary(mnl->kappa2); + Q_plus_psi(mnl->w_fields[2], mnl->pf); + g_mu = mnl->mu; + boundary(mnl->kappa); + if((mnl->solver == CG) || (mnl->solver == MIXEDCG) || (mnl->solver == RGMIXEDCG)) { + /* If CG is used anyhow */ + /* gamma5(mnl->w_fields[1], mnl->w_fields[2], VOLUME/2); */ + /* Invert Q_{+} Q_{-} */ + /* X_W -> w_fields[1] */ + chrono_guess(mnl->w_fields[1], mnl->w_fields[2], mnl->csg_field, + mnl->csg_index_array, mnl->csg_N, mnl->csg_n, VOLUME/2, &Q_pm_psi); + mnl->iter1 += solve_degenerate(mnl->w_fields[1], mnl->w_fields[2], mnl->solver_params, + mnl->maxiter, mnl->forceprec, g_relative_precision_flag, + VOLUME, &Q_pm_psi, mnl->solver); + chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, &mnl->csg_n, VOLUME/2); + + /* Y_W -> w_fields[0] */ + Q_minus_psi(mnl->w_fields[0], mnl->w_fields[1]); + } + else { + /* Invert first Q_+ */ + /* Y_o -> w_fields[0] */ + + chrono_guess(mnl->w_fields[0], mnl->w_fields[2], mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, mnl->csg_n, VOLUME/2, 
&Q_plus_psi); + gamma5(mnl->w_fields[0], mnl->w_fields[0], VOLUME); + mnl->iter1 += solve_degenerate(mnl->w_fields[0], mnl->w_fields[2], mnl->solver_params, + mnl->maxiter, mnl->forceprec, g_relative_precision_flag, + VOLUME, Q_plus_psi, mnl->solver); + chrono_add_solution(mnl->w_fields[0], mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, &mnl->csg_n, VOLUME/2); + + /* Now Q_- */ + /* X_o -> w_fields[1] */ + + chrono_guess(mnl->w_fields[1], mnl->w_fields[0], mnl->csg_field2, + mnl->csg_index_array2, mnl->csg_N2, mnl->csg_n2, VOLUME/2, &Q_minus_psi); + gamma5(mnl->w_fields[1], mnl->w_fields[1], VOLUME); + mnl->iter1 += solve_degenerate(mnl->w_fields[1],mnl->w_fields[0], mnl->solver_params, + mnl->maxiter, mnl->forceprec, g_relative_precision_flag, + VOLUME, Q_minus_psi, mnl->solver); + chrono_add_solution(mnl->w_fields[1], mnl->csg_field2, mnl->csg_index_array2, + mnl->csg_N2, &mnl->csg_n2, VOLUME/2); + + } + + /* \delta Q sandwitched by Y^\dagger and X */ + deriv_Sb_D_psi(mnl->w_fields[0], mnl->w_fields[1], hf, mnl->forcefactor); + + g_mu = mnl->mu2; + boundary(mnl->kappa2); + + /* Second term coming from the second field */ + /* The sign is opposite!! 
*/ + mul_r(mnl->w_fields[0], -1., mnl->pf, VOLUME); + + /* \delta Q sandwitched by Y^\dagger and X */ + deriv_Sb_D_psi(mnl->w_fields[0], mnl->w_fields[1], hf, mnl->forcefactor); + } + g_mu = g_mu1; + boundary(g_kappa); + etime = gettime(); + if(g_debug_level > 1 && g_proc_id == 0) { + printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime); + } + return; +} + + +void detratio_heatbath(const int id, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[id]; + double atime, etime; + atime = gettime(); + g_mu = mnl->mu; + boundary(mnl->kappa); + mnl->csg_n = 0; + mnl->csg_n2 = 0; + mnl->iter0 = 0; + mnl->iter1 = 0; + if(mnl->even_odd_flag) { + random_spinor_field_eo(mnl->w_fields[0], mnl->rngrepro, RN_GAUSS); + mnl->energy0 = square_norm(mnl->w_fields[0], VOLUME/2, 1); + + mnl->Qp(mnl->w_fields[1], mnl->w_fields[0]); + g_mu = mnl->mu2; + boundary(mnl->kappa2); + zero_spinor_field(mnl->w_fields[0], VOLUME/2); + mnl->iter0 = solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params, + mnl->maxiter, mnl->accprec, g_relative_precision_flag, + VOLUME/2, mnl->Qsq, mnl->solver); + mnl->Qm(mnl->pf, mnl->w_fields[0]); + chrono_add_solution(mnl->w_fields[0], mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, &mnl->csg_n, VOLUME/2); + } + else { + random_spinor_field_lexic(mnl->w_fields[0], mnl->rngrepro,RN_GAUSS); + mnl->energy0 = square_norm(mnl->w_fields[0], VOLUME, 1); + + Q_plus_psi(mnl->w_fields[1], mnl->w_fields[0]); + g_mu = mnl->mu2; + boundary(mnl->kappa2); + zero_spinor_field(mnl->pf,VOLUME); + if((mnl->solver == CG) || (mnl->solver == MIXEDCG)){ + mnl->iter0 = solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params, + mnl->maxiter, mnl->accprec, + g_relative_precision_flag, VOLUME, Q_pm_psi, mnl->solver); + Q_minus_psi(mnl->pf, mnl->w_fields[0]); + chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, &mnl->csg_n, VOLUME/2); + }else{ + mnl->iter0 += solve_degenerate(mnl->pf, 
mnl->w_fields[1], mnl->solver_params, mnl->maxiter, mnl->accprec, + g_relative_precision_flag, VOLUME, Q_plus_psi, mnl->solver); + chrono_add_solution(mnl->pf, mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, &mnl->csg_n, VOLUME/2); + chrono_add_solution(mnl->pf, mnl->csg_field2, mnl->csg_index_array2, + mnl->csg_N2, &mnl->csg_n2, VOLUME/2); + } + } + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { + printf("called detratio_heatbath for id %d energy %f\n", id, mnl->energy0); + } + } + g_mu = g_mu1; + boundary(g_kappa); + return; +} +
+/*
+ * Acceptance-step contribution of determinant-ratio monomial `id`.
+ *
+ * Applies the Q_+ operator at (mu2, kappa2) to the pseudofermion field
+ * mnl->pf, switches to (mu, kappa), solves to mnl->accprec and stores the
+ * squared norm of the result in mnl->energy1.  g_mu and the boundary are
+ * reset to g_mu1 / g_kappa and g_sloppy_precision_flag is put back to its
+ * value on entry before returning.  hf is not used here.
+ *
+ * Returns mnl->energy1 - mnl->energy0.
+ */
+double detratio_acc(const int id, hamiltonian_field_t * const hf) {
+  monomial * mnl = &monomial_list[id];
+  int save_sloppy = g_sloppy_precision_flag;
+  double etime, atime;
+  atime = gettime();
+  g_mu = mnl->mu2;
+  boundary(mnl->kappa2);
+  /* branch on the monomial's own flag: detratio_heatbath filled mnl->pf
+   * according to mnl->even_odd_flag, not the global even_odd_flag */
+  if(mnl->even_odd_flag) {
+    mnl->Qp(mnl->w_fields[1], mnl->pf);
+    g_mu = mnl->mu;
+    boundary(mnl->kappa);
+    chrono_guess(mnl->w_fields[0], mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
+                 mnl->csg_N, mnl->csg_n, VOLUME/2, mnl->Qsq);
+    /* the solve and the Qm application run with sloppy precision off */
+    g_sloppy_precision_flag = 0;
+    mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params, mnl->maxiter, mnl->accprec, g_relative_precision_flag,
+                                   VOLUME/2, mnl->Qsq, mnl->solver);
+    mnl->Qm(mnl->w_fields[1], mnl->w_fields[0]);
+    g_sloppy_precision_flag = save_sloppy;
+    /* Compute the energy contr. from second field */
+    mnl->energy1 = square_norm(mnl->w_fields[1], VOLUME/2, 1);
+  }
+  else {
+    Q_plus_psi(mnl->w_fields[1], mnl->pf);
+    g_mu = mnl->mu;
+    boundary(mnl->kappa);
+    chrono_guess(mnl->w_fields[0], mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array,
+                 mnl->csg_N, mnl->csg_n, VOLUME/2, &Q_plus_psi);
+    g_sloppy_precision_flag = 0;
+    if((mnl->solver == CG) || (mnl->solver == MIXEDCG)){
+
+      mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params, mnl->maxiter, mnl->accprec, g_relative_precision_flag,
+                                     VOLUME, &Q_pm_psi, mnl->solver);
+      Q_minus_psi(mnl->w_fields[1], mnl->w_fields[0]);
+      g_sloppy_precision_flag = save_sloppy;
+      /* Compute the energy contr. from second field */
+      mnl->energy1 = square_norm(mnl->w_fields[1], VOLUME, 1);
+    }
+    else{
+      mnl->iter0 += solve_degenerate(mnl->w_fields[0], mnl->w_fields[1], mnl->solver_params,
+                                     mnl->maxiter, mnl->accprec, g_relative_precision_flag,
+                                     VOLUME, Q_plus_psi, mnl->solver);
+      /* restore the flag here too; it was cleared above and would
+       * otherwise stay 0 for every caller after this function */
+      g_sloppy_precision_flag = save_sloppy;
+
+      /* Compute the energy contr. from second field */
+      mnl->energy1 = square_norm(mnl->w_fields[0], VOLUME, 1);
+    }
+
+  }
+  g_mu = g_mu1;
+  boundary(g_kappa);
+  etime = gettime();
+  if(g_proc_id == 0) {
+    if(g_debug_level > 1) {
+      printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime);
+    }
+    if(g_debug_level > 3) {
+      printf("called detratio_acc for id %d dH = %1.10e\n",
+             id, mnl->energy1 - mnl->energy0);
+    }
+  }
+  return(mnl->energy1 - mnl->energy0);
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/detratio_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/detratio_monomial.h new file mode 100644 index 0000000000000000000000000000000000000000..a25b546582ffd2b1718316a667721c491cb3d233 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/detratio_monomial.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _DETRATIO_MONOMIAL_H +#define _DETRATIO_MONOMIAL_H + +#include "hamiltonian_field.h" + +void detratio_derivative(const int no, hamiltonian_field_t * const hf); +double detratio_acc(const int no, hamiltonian_field_t * const hf); +void detratio_heatbath(const int no, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/gauge_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/gauge_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..a7d5ae689e805fcb41342b70c85770085a89bd13 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/gauge_monomial.c @@ -0,0 +1,205 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef OMP +# include +#endif +#include "global.h" +#include "su3.h" +#include "su3adj.h" +#include "ranlxd.h" +#include "sse.h" +#include "start.h" +#include "gettime.h" +#include "get_rectangle_staples.h" +#include "gamma.h" +#include "get_staples.h" +#include "read_input.h" +#include "measure_gauge_action.h" +#include "measure_rectangles.h" +#include "monomial/monomial.h" +#include "hamiltonian_field.h" +#include "gauge_monomial.h" + +/* this function calculates the derivative of the momenta: equation 13 of Gottlieb */ +void gauge_derivative(const int id, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[id]; + double factor = -1. * g_beta/3.0; + if(mnl->use_rectangles) { + mnl->forcefactor = 1.; + factor = -mnl->c0 * g_beta/3.0; + } + + double atime, etime; + atime = gettime(); +#ifdef OMP +#pragma omp parallel + { +#endif + + su3 ALIGN v, w; + int i, mu; + su3 *z; + su3adj *xm; + +#ifdef OMP +#pragma omp for +#endif + for(i = 0; i < VOLUME; i++) { + for(mu=0;mu<4;mu++) { + z=&hf->gaugefield[i][mu]; + xm=&hf->derivative[i][mu]; + get_staples(&v,i,mu, (const su3**) hf->gaugefield); + _su3_times_su3d(w,*z,v); + _trace_lambda_mul_add_assign((*xm), factor, w); + + if(mnl->use_rectangles) { + get_rectangle_staples(&v, i, mu); + _su3_times_su3d(w, *z, v); + _trace_lambda_mul_add_assign((*xm), factor*mnl->c1/mnl->c0, w); + } + } + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + etime = gettime(); + if(g_debug_level > 1 && g_proc_id == 0) { + printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime); + } + return; +} + +/* this function calculates the derivative of the momenta: equation 13 of Gottlieb */ +void 
gauge_EMderivative(const int id, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[id]; + double factor = -1. * g_beta/3.0; + if(mnl->use_rectangles) { + mnl->forcefactor = 1.; + factor = -mnl->c0 * g_beta/3.0; + } + + double atime, etime; + atime = gettime(); +#ifdef OMP +#pragma omp parallel + { +#endif + + su3 ALIGN v, w; + int i, mu; + su3 *z; + su3adj *xm; + +#ifdef OMP +#pragma omp for +#endif + for(i = 0; i < VOLUME; i++) { + // electric part + z=&hf->gaugefield[i][0]; + xm=&hf->derivative[i][0]; + get_staples(&v, i, 0, (const su3**) hf->gaugefield); + _su3_times_su3d(w,*z,v); + _trace_lambda_mul_add_assign((*xm), (1.+mnl->glambda)*factor, w); + // lambda only acts on the plaquette, effectively changing c0 in the spatial and temporal parts, c1 remains untouched + if(mnl->use_rectangles) { + get_rectangle_staples(&v, i, 0); + _su3_times_su3d(w, *z, v); + _trace_lambda_mul_add_assign((*xm), factor*mnl->c1/mnl->c0, w); + } + // magnetic part + for(mu=1;mu<4;mu++) { + z=&hf->gaugefield[i][mu]; + xm=&hf->derivative[i][mu]; + + get_spacelike_staples(&v, i, mu, (const su3**) hf->gaugefield); + _su3_times_su3d(w, *z, v); + _trace_lambda_mul_add_assign((*xm), (1.-mnl->glambda)*factor, w); + + get_timelike_staples(&v, i, mu, (const su3**) hf->gaugefield); + _su3_times_su3d(w, *z, v); + _trace_lambda_mul_add_assign((*xm), (1.+mnl->glambda)*factor, w); + if(mnl->use_rectangles) { + get_rectangle_staples(&v, i, mu); + _su3_times_su3d(w, *z, v); + _trace_lambda_mul_add_assign((*xm), factor*mnl->c1/mnl->c0, w); + } + } + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + etime = gettime(); + if(g_debug_level > 1 && g_proc_id == 0) { + printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime); + } + return; +} + +void gauge_heatbath(const int id, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[id]; + double atime, etime; + atime = gettime(); + if(mnl->use_rectangles) mnl->c0 = 1. 
- 8.*mnl->c1; + + mnl->energy0 = g_beta*(mnl->c0 * measure_gauge_action( (const su3**) hf->gaugefield, mnl->glambda)); + if(mnl->use_rectangles) { + mnl->energy0 += g_beta*(mnl->c1 * measure_rectangles( (const su3**) hf->gaugefield)); + } + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { + printf("called gauge_heatbath for id %d energy %f\n", id, mnl->energy0); + } + } + return; +} + +double gauge_acc(const int id, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[id]; + double atime, etime; + atime = gettime(); + mnl->energy1 = g_beta*(mnl->c0 * measure_gauge_action( (const su3**) hf->gaugefield, mnl->glambda)); + if(mnl->use_rectangles) { + mnl->energy1 += g_beta*(mnl->c1 * measure_rectangles( (const su3**) hf->gaugefield)); + } + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { + printf("called gauge_acc for id %d dH = %1.10e\n", + id, mnl->energy0 - mnl->energy1); + } + } + return(mnl->energy0 - mnl->energy1); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/gauge_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/gauge_monomial.h new file mode 100644 index 0000000000000000000000000000000000000000..31d267215af8d28d397b17ca9da79125f2fcd09a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/gauge_monomial.h @@ -0,0 +1,30 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ +#ifndef _GAUGE_MONOMIAL_H +#define _GAUGE_MONOMIAL_H + +#include "hamiltonian_field.h" + +void gauge_derivative(const int id, hamiltonian_field_t * const hf); +void gauge_EMderivative(const int id, hamiltonian_field_t * const hf); +void gauge_heatbath(const int id, hamiltonian_field_t * const hf); +double gauge_acc(const int id, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/moment_energy.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/moment_energy.c new file mode 100644 index 0000000000000000000000000000000000000000..3366b49de06d64e68ac6420cff70490988d92855 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/moment_energy.c @@ -0,0 +1,90 @@ +/*********************************************************************** + * + * Copyright (C) 2001 Martin Hasebusch + * + * some changes by C. Urbach 2002-2008 + * + * Modified by Jenifer Gonzalez Lopez for the Schroedinger Functional + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version.
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3adj.h" +#include "su3spinor.h" +#include "gettime.h" +#include "moment_energy.h" + +/*----------------------------------------------------------------------------*/ + +/******************************************* + * + * This computes the contribution to + * the Hamiltonian coming from the momenta + * + *******************************************/ +double moment_energy(su3adj ** const momenta) { + double atime, etime; + atime = gettime(); + su3adj *xm; + int i,mu; + static double tt,tr,ts,kc,ks,sum; + kc=0.; ks=0.; + + for(i=0;i 1) { + printf("# Time for moment_energy: %e s\n", etime-atime); + } + if(g_debug_level > 3) { + printf("called moment_energy: energy %f\n", kc); + } + } + return kc; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/moment_energy.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/moment_energy.h new file mode 100644 index 0000000000000000000000000000000000000000..0fbc3d69ddb8de6789904c2e7e678b74d89ed8f5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/moment_energy.h @@ -0,0 +1,24 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ +#ifndef _MOMENT_ENERGY_H +#define _MOMENT_ENERGY_H + +double moment_energy(su3adj ** const momenta); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/monitor_forces.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/monitor_forces.c new file mode 100644 index 0000000000000000000000000000000000000000..2395b7e34090075a2b3ba2ec3a03f7e5c02b4533 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/monitor_forces.c @@ -0,0 +1,108 @@ +/*********************************************************************** + * + * Copyright (C) 2001 Martin Hasebusch + * 2002,2003,2004,2005,2006,2007,2008,2012 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "su3.h" +#include "su3adj.h" +#include "su3spinor.h" +#include "monomial/monomial.h" +#include "xchange/xchange.h" +#include "hamiltonian_field.h" +#include "monitor_forces.h" +#include "gettime.h" + +void monitor_forces(hamiltonian_field_t * const hf) { + + for(int id = 0; id < no_monomials; id++) { + if(monomial_list[ id ].derivativefunction != NULL) { +#ifdef OMP +#pragma omp parallel for +#endif + for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) { + for(int mu=0;mu<4;mu++) { + _zero_su3adj(hf->derivative[i][mu]); + } + } + + monomial_list[ id ].derivativefunction(id, hf); + +#ifdef MPI + xchange_deri(hf->derivative); +#endif + + double sum = 0., max = 0., sum2; +#ifdef OMP +#pragma omp parallel private(sum2) + { + int thread_num = omp_get_thread_num(); + g_omp_acc_re[thread_num] = 0.; +#pragma omp for reduction(+ : sum) nowait +#endif + for(int i = 0; i < VOLUME; i++) { + for(int mu = 0; mu < 4; mu++) { + sum2 = _su3adj_square_norm(hf->derivative[i][mu]); + sum += sum2; +#ifdef OMP + if(sum2 > g_omp_acc_re[thread_num]) g_omp_acc_re[thread_num] = sum2; +#else + if(sum2 > max) max = sum2; +#endif + } + } +#ifdef OMP + } /* OMP closing brace */ + max = g_omp_acc_re[0]; + for( int i = 1; i < omp_num_threads; i++) { + if(g_omp_acc_re[i] > max) max = g_omp_acc_re[i]; + } +#endif + + // output for force monitoring +#ifdef MPI + MPI_Reduce(&sum, &sum2, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + sum = sum2; + MPI_Reduce(&max, &sum2, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD); + max = sum2; +#endif + if(g_proc_id == 0) { + printf("# squared force for monomial %s on timescale %d: aver: %1.2e max: %1.2e\n", + monomial_list[ id ].name, + monomial_list[ id ].timescale, + sum/((double)(VOLUME*g_nproc))/4., max); + 
fflush(stdout); + } + } + } + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/monitor_forces.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/monitor_forces.h new file mode 100644 index 0000000000000000000000000000000000000000..eddf288afacaecdb784b237d40daee49963da29d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/monitor_forces.h @@ -0,0 +1,26 @@ +/*********************************************************************** + * Copyright (C) 2013 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _MONITOR_FORCES_H +#define _MONITOR_FORCES_H + +#include "hamiltonian_field.h" + +void monitor_forces(hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..2add011df5f7fed5f36311e476e9f7c7dbc5206a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/monomial.c @@ -0,0 +1,646 @@ +/*********************************************************************** + * + * Copyright (C) 2008,2011,2012 Carsten Urbach + * 2009 Jenifer Gonzalez Lopez + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "su3adj.h" +#include "su3spinor.h" +#include "operator/tm_operators.h" +#include "operator/tm_operators_32.h" +#include "operator/clovertm_operators.h" +#include "operator/clovertm_operators_32.h" +#include "operator/clover_leaf.h" +#include "ranlxd.h" +#include "sse.h" +#include "linalg_eo.h" +#include "default_input_values.h" +#include "read_input.h" +#include "monomial/monomial.h" + + + +monomial monomial_list[max_no_monomials]; +int no_monomials = 0; +int no_gauge_monomials = 0; +int clover_monomials[max_no_monomials]; +int clovernd_monomials[max_no_monomials]; +int no_clover_monomials = 0; +int no_clovernd_monomials = 0; +static spinor * _pf; +spinor ** w_fields; +const int no_wfields = 6; + +int add_monomial(const int type) { + + if(no_monomials == max_no_monomials) { + fprintf(stderr, "maximal number of monomials %d exceeded!\n", max_no_monomials); + exit(-1); + } + monomial_list[no_monomials].hbfunction = &dummy_heatbath; + monomial_list[no_monomials].accfunction = &dummy_acc; + monomial_list[no_monomials].derivativefunction = &dummy_derivative; + + monomial_list[no_monomials].pf = NULL; + monomial_list[no_monomials].pf2 = NULL; + 
monomial_list[no_monomials].w_fields = NULL; + monomial_list[no_monomials].csg_field = NULL; + monomial_list[no_monomials].csg_field2 = NULL; + monomial_list[no_monomials].csg_index_array = NULL; + monomial_list[no_monomials].csg_index_array2 = NULL; + monomial_list[no_monomials].no_wfields = no_wfields; + monomial_list[no_monomials].csg_N = 0; + monomial_list[no_monomials].csg_N2 = 0; + monomial_list[no_monomials].csg_n = 1; + monomial_list[no_monomials].csg_n2 = 1; + monomial_list[no_monomials].kappa = _default_g_kappa; + monomial_list[no_monomials].kappa2 = _default_g_kappa; + monomial_list[no_monomials].mu = _default_g_mu; + monomial_list[no_monomials].mu2 = _default_g_mu; + monomial_list[no_monomials].c_sw = _default_c_sw; + monomial_list[no_monomials].rho = _default_rho; + monomial_list[no_monomials].rho2 = _default_rho2; + monomial_list[no_monomials].mubar = _default_g_mubar; + monomial_list[no_monomials].mubar2 = _default_g_mubar; + monomial_list[no_monomials].epsbar = _default_g_epsbar; + monomial_list[no_monomials].epsbar2 = _default_g_epsbar; + monomial_list[no_monomials].epsilon = _default_g_epsbar; + monomial_list[no_monomials].timescale = _default_timescale; + monomial_list[no_monomials].accprec = _default_g_eps_sq_acc; + monomial_list[no_monomials].forceprec = _default_g_eps_sq_force; + monomial_list[no_monomials].maxiter = _default_max_solver_iterations; + if((monomial_list[no_monomials].type == NDRAT) || + (monomial_list[no_monomials].type == NDRATCOR) || + (monomial_list[no_monomials].type == NDCLOVERRAT) || + (monomial_list[no_monomials].type == NDCLOVERRATCOR) + ) { + monomial_list[no_monomials].solver = _default_nd_solver_flag; + } + else{ + monomial_list[no_monomials].solver = _default_solver_flag; + } + monomial_list[no_monomials].solver_params.mcg_delta = _default_mixcg_innereps; + monomial_list[no_monomials].even_odd_flag = _default_even_odd_flag; + monomial_list[no_monomials].forcefactor = 1.; + monomial_list[no_monomials].use_rectangles = 
0; + monomial_list[no_monomials].c1 = _default_g_rgi_C1; + monomial_list[no_monomials].c0 = 1.; + monomial_list[no_monomials].beta = _default_g_beta; + monomial_list[no_monomials].glambda = 0.; + monomial_list[no_monomials].rngrepro = _default_reproduce_randomnumber_flag; + monomial_list[no_monomials].trlog = 0; + /* poly monomial */ + monomial_list[no_monomials].rec_ev = _default_g_rec_ev; + monomial_list[no_monomials].MDPolyDegree = _default_MDPolyDegree; + monomial_list[no_monomials].MDPolyLmin = _default_MDPolyLmin; + monomial_list[no_monomials].MDPolyLmax = _default_MDPolyLmax; + strcpy(monomial_list[no_monomials].MDPolyRootsFile,_default_MDPolyRootsFile); + monomial_list[no_monomials].MDPolyRoots = NULL; + monomial_list[no_monomials].MDPoly_chi_spinor_fields = (spinor**)NULL; + monomial_list[no_monomials].MDPolyLocNormConst = _default_MDPolyLocNormConst; + monomial_list[no_monomials].MDPolyDetRatio = _default_MDPolyDetRatio; + monomial_list[no_monomials].MaxPtildeDegree = NTILDE_CHEBYMAX; + monomial_list[no_monomials].StildeMin = _default_stilde_min; + monomial_list[no_monomials].StildeMax = _default_stilde_max; + monomial_list[no_monomials].PrecisionHfinal = _default_g_acc_Hfin; + monomial_list[no_monomials].PrecisionPtilde = _default_g_acc_Ptilde; + + monomial_list[no_monomials].rat.order = 12; + monomial_list[no_monomials].rat.range[0] = _default_stilde_min; + monomial_list[no_monomials].rat.range[1] = _default_stilde_max; + monomial_list[no_monomials].rat.crange[0] = 0; + monomial_list[no_monomials].rat.crange[1] = 11; + + monomial_list[no_monomials].initialised = 1; + if(monomial_list[no_monomials].type == NDDETRATIO) { + monomial_list[no_monomials].timescale = -5; + } + + no_monomials++; + return(no_monomials); +} + + +int init_monomials(const int V, const int even_odd_flag) { + int no=0; + int retval; + spinor * __pf = NULL; + double sw_mu=0., sw_k=0., sw_c=0.; + double swn_mubar=0., swn_epsbar = 0., swn_k=0., swn_c=0.; + for(int i = 0; i < 
/*
 * Allocate the pseudo-fermion and work fields for all registered monomials
 * and bind each monomial's heatbath / acceptance / derivative functions
 * according to its type.
 *
 * V             number of spinors per field (used as the stride between
 *               consecutive fields in the common allocation)
 * even_odd_flag written to every monomial's even_odd_flag at the end of the
 *               per-monomial loop; if non-zero, additional CLOVERTRLOG /
 *               CLOVERNDTRLOG monomials are appended for the registered
 *               clover / clovernd monomials
 *
 * Returns 0 on success, 1 if an allocation fails, or the non-zero return
 * value of init_poly_monomial. Calls exit(-1) if a second gauge-type
 * monomial is encountered.
 */
int init_monomials(const int V, const int even_odd_flag) {
  int no=0;
  int retval;
  spinor * __pf = NULL;
  /* NOTE(review): the sw_* / swn_* variables are not used anywhere in this function */
  double sw_mu=0., sw_k=0., sw_c=0.;
  double swn_mubar=0., swn_epsbar = 0., swn_k=0., swn_c=0.;
  /* first pass: count the pseudo-fermion fields that are needed */
  for(int i = 0; i < no_monomials; i++) {
    if((monomial_list[i].type != GAUGE) && (monomial_list[i].type != SFGAUGE)) no++;
    /* non-degenerate monomials need two pseudo fermion fields */
    if((monomial_list[i].type == NDPOLY) || (monomial_list[i].type == NDDETRATIO) ||
       (monomial_list[i].type == NDCLOVER) || (monomial_list[i].type == NDRAT)||
       (monomial_list[i].type == NDRATCOR) || (monomial_list[i].type == NDCLOVERRATCOR) ||
       (monomial_list[i].type == NDCLOVERRAT)) no++;
  }
  if(no_monomials > 0) {
    /* one allocation holds the `no` pseudo-fermion fields followed by the
       no_wfields work fields; the extra element leaves room for alignment */
    if((void*)(_pf = (spinor*)calloc((no+no_wfields)*V+1, sizeof(spinor))) == NULL) {
      printf ("malloc errno in monomial pf fields: %d\n",errno);
      errno = 0;
      return(1);
    }
    else {
      /* aligned start inside the raw buffer (assumes ALIGN_BASE is a
         low-bit mask — TODO confirm); _pf keeps the pointer for free() */
      __pf = (spinor*)(((unsigned long int)(_pf)+ALIGN_BASE)&~ALIGN_BASE);
    }
    if((void*)(w_fields = (spinor**)calloc(no_wfields, sizeof(spinor*))) == NULL) {
      printf ("malloc errno in monomial w_fields: %d\n",errno);
      errno = 0;
      return(1);
    }
    /* work fields live behind the pseudo-fermion fields */
    for(int i = 0; i < no_wfields; i++) {
      w_fields[i] = __pf+(no+i)*V;
    }
  }

  /* second pass: hand out the fields and bind the type-specific functions;
     `no` now counts the pseudo-fermion fields handed out so far */
  no = 0;
  for(int i = 0; i < no_monomials; i++) {
    monomial_list[i].rngrepro = reproduce_randomnumber_flag;
    if((monomial_list[i].type != GAUGE) && (monomial_list[i].type != SFGAUGE)) {
      /* fermionic monomial: shared work fields plus its own pseudo-fermion field */
      monomial_list[i].w_fields = w_fields;
      monomial_list[i].pf = __pf+no*V;
      no++;

      if(monomial_list[i].type == DET) {
        monomial_list[i].hbfunction = &det_heatbath;
        monomial_list[i].accfunction = &det_acc;
        monomial_list[i].derivativefunction = &det_derivative;
        monomial_list[i].Qsq = &Qtm_pm_psi;
        monomial_list[i].Qsq32 = &Qtm_pm_psi_32;
        monomial_list[i].Qp = &Qtm_plus_psi;
        monomial_list[i].Qm = &Qtm_minus_psi;
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type DET, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == CLOVERDET) {
        monomial_list[i].hbfunction = &cloverdet_heatbath;
        monomial_list[i].accfunction = &cloverdet_acc;
        monomial_list[i].derivativefunction = &cloverdet_derivative;
        monomial_list[i].even_odd_flag = 1;
        monomial_list[i].Qsq = &Qsw_pm_psi;
        monomial_list[i].Qp = &Qsw_plus_psi;
        monomial_list[i].Qm = &Qsw_minus_psi;
        init_swpm(VOLUME);
        /* remember this monomial so a CLOVERTRLOG monomial can be appended below */
        clover_monomials[no_clover_monomials] = i;
        no_clover_monomials++;
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type CLOVERDET, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == CLOVERDETRATIO) {
        monomial_list[i].hbfunction = &cloverdetratio_heatbath;
        monomial_list[i].accfunction = &cloverdetratio_acc;
        monomial_list[i].derivativefunction = &cloverdetratio_derivative;
        monomial_list[i].even_odd_flag = 1;
        monomial_list[i].Qsq = &Qsw_pm_psi;
        monomial_list[i].Qp = &Qsw_plus_psi;
        monomial_list[i].Qm = &Qsw_minus_psi;
        init_swpm(VOLUME);
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type CLOVERDETRATIO, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == DETRATIO) {
        monomial_list[i].hbfunction = &detratio_heatbath;
        monomial_list[i].accfunction = &detratio_acc;
        monomial_list[i].derivativefunction = &detratio_derivative;
        monomial_list[i].Qsq = &Qtm_pm_psi;
        monomial_list[i].Qsq32 = &Qtm_pm_psi_32;
        monomial_list[i].Qp = &Qtm_plus_psi;
        monomial_list[i].Qm = &Qtm_minus_psi;
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type DETRATIO, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == POLY) {
        monomial_list[i].hbfunction = &poly_heatbath;
        monomial_list[i].accfunction = &poly_acc;
        monomial_list[i].derivativefunction = &poly_derivative;
        /* a failing polynomial setup aborts the whole initialisation */
        retval=init_poly_monomial(V,i);
        if(retval != 0) {
          return retval;
        }
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type POLY, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == POLYDETRATIO) {
        monomial_list[i].hbfunction = &poly_heatbath;
        monomial_list[i].accfunction = &poly_acc;
        monomial_list[i].derivativefunction = &poly_derivative;
        monomial_list[i].MDPolyDetRatio = 1;
        retval=init_poly_monomial(V,i);
        if(retval!=0) return retval;
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type POLYDETRATIO, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == NDPOLY) {
        monomial_list[i].hbfunction = &ndpoly_heatbath;
        monomial_list[i].accfunction = &ndpoly_acc;
        monomial_list[i].derivativefunction = &ndpoly_derivative;
        monomial_list[i].even_odd_flag = 1;
        /* second pseudo-fermion field for the non-degenerate case */
        monomial_list[i].pf2 = __pf+no*V;
        no++;
        /* NOTE(review): retval of init_ndpoly_monomial / init_ndrat_monomial
           is stored but never checked, here and in the branches below */
        retval = init_ndpoly_monomial(i);
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type NDPOLY, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == NDCLOVER) {
        init_swpm(VOLUME);
        monomial_list[i].hbfunction = &cloverndpoly_heatbath;
        monomial_list[i].accfunction = &cloverndpoly_acc;
        monomial_list[i].derivativefunction = &cloverndpoly_derivative;
        monomial_list[i].pf2 = __pf+no*V;
        monomial_list[i].even_odd_flag = 1;
        /* remember this monomial so a CLOVERNDTRLOG monomial can be appended below */
        clovernd_monomials[no_clovernd_monomials] = i;
        no_clovernd_monomials++;
        //monomial_list[i].Qsq = &Qsw_pm_ndpsi;
        //monomial_list[i].Qp = &Qsw_ndpsi;
        //monomial_list[i].Qm = &Qsw_dagger_ndpsi;
        no++;
        retval = init_ndpoly_monomial(i);
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type NDCLOVER, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == NDRAT) {
        monomial_list[i].hbfunction = &ndrat_heatbath;
        monomial_list[i].accfunction = &ndrat_acc;
        monomial_list[i].derivativefunction = &ndrat_derivative;
        monomial_list[i].even_odd_flag = 1;
        monomial_list[i].pf2 = __pf+no*V;
        no++;
        retval = init_ndrat_monomial(i);
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type NDRAT, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == RAT) {
        monomial_list[i].hbfunction = &rat_heatbath;
        monomial_list[i].accfunction = &rat_acc;
        monomial_list[i].derivativefunction = &rat_derivative;
        monomial_list[i].even_odd_flag = 1;
        monomial_list[i].mu = 0.;
        monomial_list[i].Qsq = &Qtm_pm_psi;
        monomial_list[i].Qp = &Qtm_plus_psi;
        monomial_list[i].Qm = &Qtm_minus_psi;
        retval = init_ndrat_monomial(i);
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type RAT, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == CLOVERRAT) {
        monomial_list[i].hbfunction = &rat_heatbath;
        monomial_list[i].accfunction = &rat_acc;
        monomial_list[i].derivativefunction = &rat_derivative;
        monomial_list[i].even_odd_flag = 1;
        monomial_list[i].mu = 0.;
        monomial_list[i].Qsq = &Qsw_pm_psi;
        monomial_list[i].Qp = &Qsw_plus_psi;
        monomial_list[i].Qm = &Qsw_minus_psi;
        init_swpm(VOLUME);
        /* only registered for a trlog monomial when the trlog flag is set */
        if(monomial_list[i].trlog) {
          clover_monomials[no_clover_monomials] = i;
          no_clover_monomials++;
        }
        retval = init_ndrat_monomial(i);
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type CLOVERRAT, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == NDCLOVERRAT) {
        init_swpm(VOLUME);
        monomial_list[i].hbfunction = &ndrat_heatbath;
        monomial_list[i].accfunction = &ndrat_acc;
        monomial_list[i].derivativefunction = &ndrat_derivative;
        monomial_list[i].even_odd_flag = 1;
        monomial_list[i].pf2 = __pf+no*V;
        no++;
        if(monomial_list[i].trlog) {
          clovernd_monomials[no_clovernd_monomials] = i;
          no_clovernd_monomials++;
        }
        retval = init_ndrat_monomial(i);
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type NDCLOVERRAT, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == NDRATCOR) {
        monomial_list[i].hbfunction = &ndratcor_heatbath;
        monomial_list[i].accfunction = &ndratcor_acc;
        /* no derivative function: monitor_forces-style NULL checks skip this monomial */
        monomial_list[i].derivativefunction = NULL;
        monomial_list[i].even_odd_flag = 1;
        monomial_list[i].pf2 = __pf+no*V;
        /* coefficient range spans the full order of the rational approximation */
        monomial_list[i].rat.crange[0] = 0;
        monomial_list[i].rat.crange[1] = monomial_list[i].rat.order-1;

        no++;
        retval = init_ndrat_monomial(i);
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type NDRATCOR, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == NDCLOVERRATCOR) {
        init_swpm(VOLUME);
        monomial_list[i].hbfunction = &ndratcor_heatbath;
        monomial_list[i].accfunction = &ndratcor_acc;
        monomial_list[i].derivativefunction = NULL;
        monomial_list[i].even_odd_flag = 1;
        monomial_list[i].pf2 = __pf+no*V;
        monomial_list[i].rat.crange[0] = 0;
        monomial_list[i].rat.crange[1] = monomial_list[i].rat.order-1;

        no++;
        retval = init_ndrat_monomial(i);
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type NDCLOVERRATCOR, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == RATCOR) {
        monomial_list[i].hbfunction = &ratcor_heatbath;
        monomial_list[i].accfunction = &ratcor_acc;
        monomial_list[i].derivativefunction = NULL;
        monomial_list[i].even_odd_flag = 1;
        monomial_list[i].Qsq = &Qtm_pm_psi;
        monomial_list[i].Qp = &Qtm_plus_psi;
        monomial_list[i].Qm = &Qtm_minus_psi;
        monomial_list[i].rat.crange[0] = 0;
        monomial_list[i].rat.crange[1] = monomial_list[i].rat.order-1;
        monomial_list[i].mu = 0.;
        retval = init_ndrat_monomial(i);
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type RATCOR, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == CLOVERRATCOR) {
        init_swpm(VOLUME);
        monomial_list[i].hbfunction = &ratcor_heatbath;
        monomial_list[i].accfunction = &ratcor_acc;
        monomial_list[i].derivativefunction = NULL;
        monomial_list[i].even_odd_flag = 1;
        monomial_list[i].Qsq = &Qsw_pm_psi;
        monomial_list[i].Qp = &Qsw_plus_psi;
        monomial_list[i].Qm = &Qsw_minus_psi;
        monomial_list[i].mu = 0.;
        monomial_list[i].rat.crange[0] = 0;
        monomial_list[i].rat.crange[1] = monomial_list[i].rat.order-1;
        retval = init_ndrat_monomial(i);
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type CLOVERRATCOR, no_monomials= %d\n", no_monomials);
        }
      }
      else if(monomial_list[i].type == NDDETRATIO) {
        monomial_list[i].hbfunction = &dummy_heatbath;
        monomial_list[i].accfunction = &nddetratio_acc;
        monomial_list[i].derivativefunction = NULL;
        monomial_list[i].pf2 = __pf+no*V;
        monomial_list[i].timescale = -5;
        no++;
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type NDDETRATIO, no_monomials= %d\n", no_monomials);
        }
      }
    }
    else {
      /* GAUGE or SFGAUGE: no pseudo-fermion field */
      monomial_list[i].pf = NULL;
      /* at most one gauge monomial may be initialised; the check runs
         before the type test, so it also triggers for SFGAUGE entries */
      if(no_gauge_monomials > 0) {
        fprintf(stderr, "maximal number of gauge monomials exceeded!\n");
        exit(-1);
      }
      else if(monomial_list[i].type == GAUGE) {
        monomial_list[i].hbfunction = &gauge_heatbath;
        monomial_list[i].accfunction = &gauge_acc;
        monomial_list[i].derivativefunction = &gauge_derivative;
        no_gauge_monomials++;
        /* a non-zero glambda selects the EM variant of the derivative */
        if(fabs( monomial_list[i].glambda) > 0) {
          monomial_list[i].derivativefunction = &gauge_EMderivative;
        }
        if(!monomial_list[i].use_rectangles) {
          monomial_list[i].c1 = 0.;
          monomial_list[i].c0 = 1.;
        }
        /* publish the coefficients in the globals; c0 is derived from c1 */
        g_rgi_C1 = monomial_list[i].c1;
        monomial_list[i].c0 = 1. - 8.*monomial_list[i].c1;
        g_rgi_C0 = monomial_list[i].c0;
        if(g_proc_id == 0 && g_debug_level > 1) {
          printf("# Initialised monomial of type GAUGE, no_monomials= %d\n", no_monomials);
        }
      }
      /* NOTE(review): no function pointers are assigned for SFGAUGE here */
    }
    monomial_list[i].id = i;
    /* NOTE(review): this overwrites the even_odd_flag = 1 set in the
       type-specific branches above with the caller's value */
    monomial_list[i].even_odd_flag = even_odd_flag;
  }
  /* initialize clovertrlog and cloverndtrlog monomials for all clover and clovernd monomials*/
  if( even_odd_flag ) {
    for( int j = 0; j < no_clover_monomials; j++ ) {
      /* type and name are written into the next free slot before
         add_monomial runs, because add_monomial reads the slot's type.
         NOTE(review): this write happens before add_monomial's bounds check */
      monomial_list[no_monomials].type = CLOVERTRLOG;
      strcpy( monomial_list[no_monomials].name, "CLOVERTRLOG");
      add_monomial(CLOVERTRLOG);
      monomial_list[no_monomials-1].pf = NULL;
      monomial_list[no_monomials-1].id = no_monomials-1;
      monomial_list[no_monomials-1].rngrepro = reproduce_randomnumber_flag;
      // set the parameters according to cloverdet monomial
      // this need alltogether a more general approach
      monomial_list[no_monomials-1].c_sw = monomial_list[clover_monomials[j]].c_sw;
      monomial_list[no_monomials-1].mu = monomial_list[clover_monomials[j]].mu;
      monomial_list[no_monomials-1].kappa = monomial_list[clover_monomials[j]].kappa;
      monomial_list[no_monomials-1].hbfunction = &clover_trlog_heatbath;
      monomial_list[no_monomials-1].accfunction = &clover_trlog_acc;
      monomial_list[no_monomials-1].derivativefunction = NULL;
      monomial_list[no_monomials-1].timescale = 0;
      monomial_list[no_monomials-1].even_odd_flag = even_odd_flag;
      if(g_proc_id == 0 && g_debug_level > 1) {
        printf("# Initialised clover_trlog_monomial, no_monomials= %d\n", no_monomials);
      }
    }
    for( int j = 0; j < no_clovernd_monomials; j++ ) {
      monomial_list[no_monomials].type = CLOVERNDTRLOG;
      strcpy( monomial_list[no_monomials].name, "CLOVERNDTRLOG");
      add_monomial(CLOVERNDTRLOG);
      monomial_list[no_monomials-1].pf = NULL;
      monomial_list[no_monomials-1].id = no_monomials-1;
      monomial_list[no_monomials-1].rngrepro = reproduce_randomnumber_flag;
      // set the parameters according to cloverdet monomial
      // this need alltogether a more general approach
      monomial_list[no_monomials-1].c_sw = monomial_list[clovernd_monomials[j]].c_sw;
      monomial_list[no_monomials-1].mubar = monomial_list[clovernd_monomials[j]].mubar;
      monomial_list[no_monomials-1].epsbar = monomial_list[clovernd_monomials[j]].epsbar;
      monomial_list[no_monomials-1].kappa = monomial_list[clovernd_monomials[j]].kappa;
      monomial_list[no_monomials-1].hbfunction = &clovernd_trlog_heatbath;
      monomial_list[no_monomials-1].accfunction = &clovernd_trlog_acc;
      monomial_list[no_monomials-1].derivativefunction = NULL;
      monomial_list[no_monomials-1].timescale = 0;
      monomial_list[no_monomials-1].even_odd_flag = 1;
      if(g_proc_id == 0 && g_debug_level > 1) {
        printf("# Initialised clovernd_trlog_monomial, no_monomials= %d\n", no_monomials);
      }
    }
  }
  return(0);
}
general approach + monomial_list[no_monomials-1].c_sw = monomial_list[clovernd_monomials[j]].c_sw; + monomial_list[no_monomials-1].mubar = monomial_list[clovernd_monomials[j]].mubar; + monomial_list[no_monomials-1].epsbar = monomial_list[clovernd_monomials[j]].epsbar; + monomial_list[no_monomials-1].kappa = monomial_list[clovernd_monomials[j]].kappa; + monomial_list[no_monomials-1].hbfunction = &clovernd_trlog_heatbath; + monomial_list[no_monomials-1].accfunction = &clovernd_trlog_acc; + monomial_list[no_monomials-1].derivativefunction = NULL; + monomial_list[no_monomials-1].timescale = 0; + monomial_list[no_monomials-1].even_odd_flag = 1; + if(g_proc_id == 0 && g_debug_level > 1) { + printf("# Initialised clovernd_trlog_monomial, no_monomials= %d\n", no_monomials); + } + } + } + return(0); +} + +void free_monomials() { + + free(_pf); + return; +} + +
+/*
+ * Prepare monomial `id` of type POLY: allocate the chi spinor work fields,
+ * obtain the local normalisation constant and load the polynomial roots.
+ *
+ *   V   number of spinors in each chi field
+ *   id  index of the monomial in monomial_list
+ *
+ * Returns 0 on success and 1, 2 or 3 when the spinor storage, the table of
+ * field pointers or the roots array cannot be allocated.  A missing or
+ * unreadable normalisation/roots file is fatal: MPI is finalised (when
+ * compiled with MPI) and the process exits with status 6.
+ */
+int init_poly_monomial(const int V, const int id){
+
+  monomial * mnl = &monomial_list[id];
+  int i,j,k;
+  FILE* rootsFile=NULL;
+  char title[101];
+  char filename[257];
+  FILE* constFile;
+  int errcode = 0;
+  double eps;
+  const int no_fields = mnl->MDPolyDegree/2+2;
+
+  /* NOTE(review): this local hides the _pf that free_monomials() releases,
+     so the storage allocated here is never freed after a successful init --
+     confirm the intended ownership. */
+  spinor *_pf=(spinor*)NULL;
+
+  if((void*)(_pf = (spinor*)calloc(no_fields*V+1, sizeof(spinor))) == NULL) {
+    printf ("malloc errno in init_poly_monomial pf fields: %d\n",errno);
+    errno = 0;
+    return(1);
+  }
+
+  if((void*)(mnl->MDPoly_chi_spinor_fields=(spinor**)calloc(no_fields,sizeof(spinor*))) ==NULL ){
+    printf ("malloc errno in init_poly_monomial pf fields: %d\n",errno);
+    /* the spinor storage is unreachable after this return, release it */
+    free(_pf);
+    errno = 0;
+    return(2);
+  }
+
+  /* field 0 starts at the storage address rounded with ALIGN_BASE
+     (presumably an alignment bit mask -- the one extra spinor allocated
+     above leaves room for the shift); the others follow at stride V */
+  (mnl->MDPoly_chi_spinor_fields)[0] = (spinor*)(((unsigned long int)(_pf)+ALIGN_BASE)&~ALIGN_BASE);
+
+  for(i = 1; i < no_fields; i++){
+    mnl->MDPoly_chi_spinor_fields[i] = mnl->MDPoly_chi_spinor_fields[i-1]+V;
+  }
+
+  if(strlen(mnl->MDPolyRootsFile)==0){
+    snprintf(mnl->MDPolyRootsFile, sizeof(mnl->MDPolyRootsFile),
+             "%s_deg_%d_eps_%1.16e.roots",
+             "1overX_poly",
+             mnl->MDPolyDegree,
+             mnl->MDPolyLmin/mnl->MDPolyLmax
+             );
+    /* report the name that was just generated; `filename` is not set yet */
+    fprintf(stderr,"Warning you didnt specify a rootsfilename -> guessing:\n%s\n",mnl->MDPolyRootsFile);
+  }
+  if(mnl->MDPolyLocNormConst==-1.0){
+    eps=mnl->MDPolyLmin/mnl->MDPolyLmax;
+    snprintf(filename, sizeof(filename),
+             "%s_deg_%d_eps_%1.16e.const",
+             "1overX_poly",
+             mnl->MDPolyDegree,
+             eps
+             );
+    fprintf(stderr,"Warning you didnt specify a local normalization: trying to read it from\n%s\n",filename);
+    /* errcode stays != 1 when the file cannot be opened or holds no number */
+    errcode = 0;
+    if((constFile=fopen(filename,"r"))!=NULL) {
+      errcode = fscanf(constFile,"%lf\n",&(mnl->MDPolyLocNormConst));
+      fclose(constFile);
+    }
+    if(errcode == 1) {
+      fprintf(stderr, "normierung local succesfully read -> lnc = %e \n", mnl->MDPolyLocNormConst);
+    }
+    else {
+      fprintf(stderr,"Reading local normalization from file FAILED\n Borting Ab\n");
+#ifdef MPI
+      MPI_Finalize();
+#endif
+      exit(6);
+    }
+  }
+
+  /* read in the roots from the given file */
+
+  if((void*)(mnl->MDPolyRoots=(_Complex double*)calloc(mnl->MDPolyDegree,sizeof(_Complex double))) ==NULL ){
+    printf ("malloc errno in init_poly_monomial roots array: %d\n",errno);
+    /* undo the allocations made above before reporting the failure */
+    free(mnl->MDPoly_chi_spinor_fields);
+    mnl->MDPoly_chi_spinor_fields = NULL;
+    free(_pf);
+    errno = 0;
+    return(3);
+  }
+
+  printf("reading roots...!\n");
+  if((rootsFile=fopen(mnl->MDPolyRootsFile,"r")) != (FILE*)NULL) {
+    /* the first line of the file is a title and is skipped */
+    if (fgets(title, 100, rootsFile) == NULL) {
+      fprintf(stderr, "Cant read Roots file: %s Aborting...\n", mnl->MDPolyRootsFile);
+#ifdef MPI
+      MPI_Finalize();
+#endif
+      exit(6);
+    }
+
+    /* Here we read in the 2n roots needed for the polinomial in sqrt(s) */
+    /* each record is "<index> <real part> <imaginary part>" */
+    for(j = 0; j < (mnl->MDPolyDegree); j++) {
+      errcode = fscanf(rootsFile," %d %lf %lf \n", &k, (double*)&(mnl->MDPolyRoots[j]), (double*)&(mnl->MDPolyRoots[j]) + 1);
+      if(errcode != 3) {
+        /* a short or malformed file would silently leave roots at zero */
+        fprintf(stderr, "Cant read Roots file: %s Aborting...\n", mnl->MDPolyRootsFile);
+#ifdef MPI
+        MPI_Finalize();
+#endif
+        exit(6);
+      }
+    }
+    fclose(rootsFile);
+  }
+  else {
+    fprintf(stderr, "Roots File %s is missing! Aborting...\n", mnl->MDPolyRootsFile );
+#ifdef MPI
+    MPI_Finalize();
+#endif
+    exit(6);
+  }
+
+  if(g_proc_id == 0 && g_debug_level > 2) {
+    printf("# the root are:\n");
+    for(j=0; j<(mnl->MDPolyDegree); j++){
+      printf("# %lf %lf\n", creal(mnl->MDPolyRoots[j]), cimag(mnl->MDPolyRoots[j]));
+    }
+  }
+
+  return 0;
+
+}
+
+void dummy_derivative(const int id, hamiltonian_field_t * const hf) { + if(g_proc_id == 0) { + fprintf(stderr, "dummy_derivative was called. Was that really intended?\n"); + fprintf(stderr, "callers monomial ID was %d\n", id); + } + return; +} + +void dummy_heatbath(const int id, hamiltonian_field_t * const hf) { + if(g_proc_id == 0) { + fprintf(stderr, "dummy_heatbath was called. Was that really intended?\n"); + fprintf(stderr, "callers monomial ID was %d\n", id); + } + return; +} + +double dummy_acc(const int id, hamiltonian_field_t * const hf) { + if(g_proc_id == 0) { + fprintf(stderr, "dummy_acc was called. Was that really intended?\n"); + fprintf(stderr, "callers monomial ID was %d\n", id); + } + return(0.); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/monomial.h new file mode 100644 index 0000000000000000000000000000000000000000..60acccf124b0e23d8d482b858a40734982a3eb90 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/monomial.h @@ -0,0 +1,170 @@ +/*********************************************************************** + * + * Copyright (C) 2008,2011,2012 Carsten Urbach + * 2009 Jenifer Gonzalez Lopez + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version.
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _MONOMIAL_H +#define _MONOMIAL_H + +#include "su3.h" +#include "su3spinor.h" +#include "hamiltonian_field.h" +#include "rational/rational.h" +#include "solver/solver_params.h" + +#define DET 0 +#define DETRATIO 1 +#define GAUGE 2 +#define POLY 3 +#define NDPOLY 4 +#define SFGAUGE 5 +#define NDDETRATIO 6 +#define POLYDETRATIO 7 +#define CLOVERTRLOG 8 +#define CLOVERDET 9 +#define CLOVERDETRATIO 10 +#define NDCLOVER 11 +#define CLOVERNDTRLOG 12 +#define NDRAT 13 +#define NDCLOVERRAT 14 +#define NDRATCOR 15 +#define NDCLOVERRATCOR 16 +#define RAT 17 +#define RATCOR 18 +#define CLOVERRAT 19 +#define CLOVERRATCOR 20 + +#define max_no_monomials 30 + +typedef struct { + int type; + int gtype; + int initialised; + int timescale; + int maxiter; + int id; + int even_odd_flag; + int rngrepro; + int solver; + int iter0, iter1, iter2; + int csg_N, csg_N2; + int csg_n, csg_n2; + int use_rectangles; + /* trlog */ + int trlog; + int * csg_index_array, *csg_index_array2; + /* det or detratio related */ + double mu, mu2, kappa, kappa2; + /* clover coefficient */ + double c_sw, rho, rho2; + /* polynomial related, not yet in use */ + double mubar, epsbar, mubar2, epsbar2; + /* energies at beginning and end of trajectory */ + double energy0; + double energy1; + /* gauge related */ + double c0, c1, beta, glambda; + /* solver related*/ + double epsilon; + double forceprec; + double accprec; + solver_params_t solver_params; + /* force normalisation */ + double forcefactor; + /* some book-keeping */ + char name[100]; + /* pseudo fermion field */ + /* 
second one needed for ND monomials */ + spinor * pf, * pf2; + /* parameters for the POLY Monomial*/ + int rec_ev; + int MDPolyDegree, MaxPtildeDegree, PtildeDegree; + double MDPolyLmin, MDPolyLmax; + char MDPolyRootsFile[256]; + _Complex double *MDPolyRoots; + spinor **MDPoly_chi_spinor_fields; + double MDPolyLocNormConst; + int MDPolyDetRatio; + int no_wfields; + double PrecisionPtilde; + double PrecisionHfinal; + double StildeMin, StildeMax; + double EVMin, EVMax, EVMaxInv; + double * MDPolyCoefs, * PtildeCoefs; + /* rational approximation */ + rational_t rat; + /* chronological solver fields */ + spinor ** csg_field; + spinor ** csg_field2; + spinor ** w_fields; + /* functions for the HMC update */ + void (*hbfunction) (const int no, hamiltonian_field_t * const hf); + double (*accfunction) (const int no, hamiltonian_field_t * const hf); + void (*derivativefunction) (const int no, hamiltonian_field_t * const hf); + /* the operator definitions */ + void (*Qsq) (spinor * const, spinor * const); + void (*Qsq32) (spinor32 * const, spinor32 * const); + void (*Qp) (spinor * const, spinor * const); + void (*Qm) (spinor * const, spinor * const); +} monomial; + +#include "monomial/det_monomial.h" +#include "monomial/detratio_monomial.h" +#include "monomial/poly_monomial.h" +#include "monomial/ndpoly_monomial.h" +#include "monomial/nddetratio_monomial.h" +#include "monomial/gauge_monomial.h" +#include "monomial/sf_gauge_monomial.h" +#include "monomial/clover_trlog_monomial.h" +#include "monomial/clovernd_trlog_monomial.h" +#include "monomial/cloverdet_monomial.h" +#include "monomial/cloverdetratio_monomial.h" +#include "monomial/cloverndpoly_monomial.h" +#include "monomial/ndrat_monomial.h" +#include "monomial/rat_monomial.h" +#include "monomial/ndratcor_monomial.h" +#include "monomial/ratcor_monomial.h" +#include "monomial/moment_energy.h" +#include "monomial/monitor_forces.h" + +/* list of all monomials */ +extern monomial monomial_list[max_no_monomials]; +/* number of 
initialised monomials */ +extern int no_monomials; +/* number of gauge monomials, currently 0 or 1 */ +extern int no_gauge_monomials; +/* number of ndpoly monomials, currently 0 or 1 */ +extern int no_ndpoly_monomials; + +/* add a new monomial to the list of monomials */ +int add_monomial(const int type); +/* initialise all monomials in the list */ +int init_monomials(const int V, const int even_odd_flag); +/* free space again */ +void free_monomials(); + +/* initialisation function for a poly monomial */ +int init_poly_monomial(const int V,const int id); + + +/* some dummy functions */ +void dummy_derivative(const int id, hamiltonian_field_t * const hf); +void dummy_heatbath(const int id, hamiltonian_field_t * const hf); +double dummy_acc(const int id, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/nddetratio_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/nddetratio_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..81f96cfc15d27436c13b1b4a0c40d3e9b32379ee --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/nddetratio_monomial.c @@ -0,0 +1,89 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Thomas Chiarappa, Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "su3adj.h" +#include "linalg_eo.h" +#include "start.h" +#include "solver/solver.h" +#include "deriv_Sb.h" +#include "operator/tm_operators.h" +#include "chebyshev_polynomial.h" +#include "operator/tm_operators_nd.h" +#include "operator/Hopping_Matrix.h" +#include "phmc.h" +#include "boundary.h" +#include "gamma.h" +#include "operator/tm_operators_nd.h" +#include "chebyshev_polynomial_nd.h" +#include "Ptilde_nd.h" +#include "gettime.h" +#include "reweighting_factor_nd.h" +#include "monomial/monomial.h" +#include "hamiltonian_field.h" +#include "nddetratio_monomial.h" + +
+/*
+ * Acceptance step of the NDDETRATIO monomial with list index `id`.
+ *
+ * With the first parameter set (mubar, epsbar, kappa) cg_her_nd is run with
+ * Qtm_pm_ndpsi as operator and pf/pf2 as input, writing to w_fields[0..1];
+ * Qtm_dagger_ndpsi then maps that result to w_fields[2..3].  With the second
+ * set (mubar2, epsbar2, kappa2) Qtm_ndpsi maps w_fields[2..3] back to
+ * w_fields[0..1].  energy1 is the sum of scalar_prod_r(pf, w_fields[0]) and
+ * scalar_prod_r(pf2, w_fields[1]).
+ *
+ * hf is not used.  Returns energy1 - energy0.
+ */
+double nddetratio_acc(const int id, hamiltonian_field_t * const hf) {
+  int iter;
+  monomial * mnl = &monomial_list[id];
+  double atime, etime;
+  double dH;
+  atime = gettime();
+
+  /* first parameter set */
+  g_mubar = mnl->mubar;
+  g_epsbar = mnl->epsbar;
+  boundary(mnl->kappa);
+
+  iter = cg_her_nd(mnl->w_fields[0], mnl->w_fields[1], mnl->pf, mnl->pf2,
+                   mnl->maxiter, mnl->accprec, g_relative_precision_flag,
+                   VOLUME/2, &Qtm_pm_ndpsi);
+  Qtm_dagger_ndpsi(mnl->w_fields[2], mnl->w_fields[3],
+                   mnl->w_fields[0], mnl->w_fields[1]);
+
+  /* second parameter set */
+  g_mubar = mnl->mubar2;
+  g_epsbar = mnl->epsbar2;
+  boundary(mnl->kappa2);
+
+  Qtm_ndpsi(mnl->w_fields[0], mnl->w_fields[1],
+            mnl->w_fields[2], mnl->w_fields[3]);
+
+  mnl->energy1 = scalar_prod_r(mnl->pf , mnl->w_fields[0], VOLUME/2, 1);
+  mnl->energy1 += scalar_prod_r(mnl->pf2, mnl->w_fields[1], VOLUME/2, 1);
+  /* computed once so that the debug line and the return value agree */
+  dH = mnl->energy1 - mnl->energy0;
+  etime = gettime();
+  if(g_proc_id == 0) {
+    if(g_debug_level > 1) {
+      printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime);
+    }
+    if(g_debug_level > 3) {
+      printf("called nddetratio_acc for id %d dH = %1.10e\n",
+             id, dH);
+    }
+  }
+  return(dH);
+}
diff --git
a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/nddetratio_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/nddetratio_monomial.h new file mode 100644 index 0000000000000000000000000000000000000000..d775647ed792449b43dc500d68dec3a8c5c9fe16 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/nddetratio_monomial.h @@ -0,0 +1,27 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _NDDETRATIO_MONOMIAL_H +#define _NDDETRATIO_MONOMIAL_H + +#include "hamiltonian_field.h" + /* Acceptance step of the NDDETRATIO monomial with index id in monomial_list: stores the monomial's energy1 and returns energy1 - energy0. The hf argument is not used by the implementation. */ +double nddetratio_acc(const int id, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndpoly_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndpoly_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..fcdff0bda1950cfc61cc53b23c6850469ea8f72f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndpoly_monomial.c @@ -0,0 +1,570 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Thomas Chiarappa, Carsten Urbach + * + * This file is part of tmLQCD.
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "gettime.h" +#include "solver/solver.h" +#include "deriv_Sb.h" +#include "operator/tm_operators.h" +#include "chebyshev_polynomial.h" +#include "operator/tm_operators_nd.h" +#include "operator/Hopping_Matrix.h" +#include "phmc.h" +#include "operator/tm_operators_nd.h" +#include "chebyshev_polynomial_nd.h" +#include "Ptilde_nd.h" +#include "reweighting_factor_nd.h" +#include "monomial/monomial.h" +#include "hamiltonian_field.h" +#include "boundary.h" +#include "phmc.h" +#include "init/init_chi_spinor_field.h" +#include "solver/matrix_mult_typedef_nd.h" +#include "operator/clover_leaf.h" +#include "operator/clovertm_operators.h" +#include "ndpoly_monomial.h" + +extern int phmc_exact_poly; + +/******************************************** + * + * Here \delta S_b is computed + * + ********************************************/ + +void ndpoly_derivative(const int id, hamiltonian_field_t * const hf) { + double atime, etime; + int j, k; + monomial * mnl = &monomial_list[id]; + atime = gettime(); + /* This factor 2 a missing factor 2 in trace_lambda */ + ndpoly_set_global_parameter(mnl, phmc_exact_poly); + mnl->forcefactor = 
-phmc_Cpol*mnl->EVMaxInv; + /* Recall: The GAMMA_5 left of delta M_eo is done in deriv_Sb !!! */ + + if (g_epsbar!=0.0 || phmc_exact_poly==0){ + /* Here comes the definitions for the chi_j fields */ + /* from j=0 (chi_0 = phi) ..... to j = n-1 */ + /* in g_chi_up_spinor_field[0] (g_chi_dn_spinor_field[0] we expect */ + /* to find the phi field, the pseudo fermion field */ + /* i.e. must be equal to mnl->pf (mnl->pf2) */ + + assign(g_chi_up_spinor_field[0], mnl->pf, VOLUME/2); + assign(g_chi_dn_spinor_field[0], mnl->pf2, VOLUME/2); + + for(k = 1; k < (mnl->MDPolyDegree-1); k++) { + Q_tau1_sub_const_ndpsi(g_chi_up_spinor_field[k], g_chi_dn_spinor_field[k], + g_chi_up_spinor_field[k-1], g_chi_dn_spinor_field[k-1], + mnl->MDPolyRoots[k-1], phmc_Cpol, phmc_invmaxev); + } + + /* Here comes the remaining fields chi_k ; k=n,...,2n-1 */ + /*They are evaluated step-by-step overwriting the same field (mnl->MDPolyDegree)*/ + + assign(g_chi_up_spinor_field[mnl->MDPolyDegree], g_chi_up_spinor_field[mnl->MDPolyDegree-2], VOLUME/2); + assign(g_chi_dn_spinor_field[mnl->MDPolyDegree], g_chi_dn_spinor_field[mnl->MDPolyDegree-2], VOLUME/2); + + for(j=(mnl->MDPolyDegree-1); j>=1; j--) { + assign(g_chi_up_spinor_field[mnl->MDPolyDegree-1], g_chi_up_spinor_field[mnl->MDPolyDegree], VOLUME/2); + assign(g_chi_dn_spinor_field[mnl->MDPolyDegree-1], g_chi_dn_spinor_field[mnl->MDPolyDegree], VOLUME/2); + + Q_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->MDPolyDegree], g_chi_dn_spinor_field[mnl->MDPolyDegree], + g_chi_up_spinor_field[mnl->MDPolyDegree-1], g_chi_dn_spinor_field[mnl->MDPolyDegree-1], + mnl->MDPolyRoots[2*mnl->MDPolyDegree-3-j], phmc_Cpol, phmc_invmaxev); + + /* Get the even parts of the (j-1)th chi_spinors */ + H_eo_tm_ndpsi(mnl->w_fields[0], mnl->w_fields[1], + g_chi_up_spinor_field[j-1], g_chi_dn_spinor_field[j-1], EO); + + /* \delta M_eo sandwitched by chi[j-1]_e^\dagger and chi[2N-j]_o */ + deriv_Sb(EO, mnl->w_fields[0], g_chi_up_spinor_field[mnl->MDPolyDegree], hf, 
mnl->forcefactor);/* UP */ + deriv_Sb(EO, mnl->w_fields[1], g_chi_dn_spinor_field[mnl->MDPolyDegree], hf, mnl->forcefactor);/* DN */ + + /* Get the even parts of the (2N-j)-th chi_spinors */ + H_eo_tm_ndpsi(mnl->w_fields[0], mnl->w_fields[1], + g_chi_up_spinor_field[mnl->MDPolyDegree], g_chi_dn_spinor_field[mnl->MDPolyDegree], EO); + + /* \delta M_oe sandwitched by chi[j-1]_o^\dagger and chi[2N-j]_e */ + deriv_Sb(OE, g_chi_up_spinor_field[j-1], mnl->w_fields[0], hf, mnl->forcefactor); + deriv_Sb(OE, g_chi_dn_spinor_field[j-1], mnl->w_fields[1], hf, mnl->forcefactor); + } + } + else if(g_epsbar == 0.0) { + /* Here comes the definitions for the chi_j fields */ + /* from j=0 (chi_0 = phi) ..... to j = n-1 */ + assign(g_chi_up_spinor_field[0], mnl->pf, VOLUME/2); + for(k = 1; k < (mnl->MDPolyDegree-1); k++) { + Qtm_pm_sub_const_nrm_psi(g_chi_up_spinor_field[k], + g_chi_up_spinor_field[k-1], + mnl->MDPolyRoots[k-1]); + } + assign(g_chi_up_spinor_field[mnl->MDPolyDegree], + g_chi_up_spinor_field[mnl->MDPolyDegree-2], VOLUME/2); + + for(j = (mnl->MDPolyDegree-1); j >= 1; j--) { + assign(g_chi_up_spinor_field[mnl->MDPolyDegree-1], + g_chi_up_spinor_field[mnl->MDPolyDegree], VOLUME/2); + + Qtm_pm_sub_const_nrm_psi(g_chi_up_spinor_field[mnl->MDPolyDegree], + g_chi_up_spinor_field[mnl->MDPolyDegree-1], + mnl->MDPolyRoots[2*mnl->MDPolyDegree-3-j]); + + Qtm_minus_psi(mnl->w_fields[3],g_chi_up_spinor_field[j-1]); + + H_eo_tm_inv_psi(mnl->w_fields[2], g_chi_up_spinor_field[phmc_dop_n_cheby], EO, -1.); + deriv_Sb(OE, mnl->w_fields[3], mnl->w_fields[2], hf, mnl->forcefactor); + + H_eo_tm_inv_psi(mnl->w_fields[2], mnl->w_fields[3], EO, 1.); + deriv_Sb(EO, mnl->w_fields[2], g_chi_up_spinor_field[phmc_dop_n_cheby], hf, mnl->forcefactor); + + Qtm_minus_psi(mnl->w_fields[3],g_chi_up_spinor_field[mnl->MDPolyDegree]); + + H_eo_tm_inv_psi(mnl->w_fields[2],mnl->w_fields[3], EO, +1.); + deriv_Sb(OE, g_chi_up_spinor_field[j-1] , mnl->w_fields[2], hf, mnl->forcefactor); + + 
H_eo_tm_inv_psi(mnl->w_fields[2], g_chi_up_spinor_field[j-1], EO, -1.); + deriv_Sb(EO, mnl->w_fields[2], mnl->w_fields[3], hf, mnl->forcefactor); + } + } + /* + Normalisation by the largest EW is done in update_momenta + using mnl->forcefactor + */ + etime = gettime(); + if(g_debug_level > 1 && g_proc_id == 0) { + printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime); + } + return; +} + + +void ndpoly_heatbath(const int id, hamiltonian_field_t * const hf) { + int j; + monomial * mnl = &monomial_list[id]; + + ndpoly_set_global_parameter(mnl, phmc_exact_poly); + + // we measure before trajectory! + if((mnl->rec_ev != 0) && (hf->traj_counter%mnl->rec_ev == 0)) { + phmc_compute_ev(hf->traj_counter-1, id, &Qtm_pm_ndbipsi); + } + + mnl->energy0 = 0.; + random_spinor_field_eo(g_chi_up_spinor_field[0], mnl->rngrepro, RN_GAUSS); + mnl->energy0 = square_norm(g_chi_up_spinor_field[0], VOLUME/2, 1); + + if(g_epsbar!=0.0 || phmc_exact_poly == 0) { + random_spinor_field_eo(g_chi_dn_spinor_field[0], mnl->rngrepro, RN_GAUSS); + mnl->energy0 += square_norm(g_chi_dn_spinor_field[0], VOLUME/2, 1); + } + else { + zero_spinor_field(g_chi_dn_spinor_field[0], VOLUME/2); + } + + if((g_proc_id == g_stdio_proc) && (g_debug_level > 5)) { + printf("# NDPOLY: OLD Energy DN + UP %e \n\n", mnl->energy0); + } + + if(phmc_exact_poly==0){ + Qtm_ndpsi(g_chi_up_spinor_field[1], g_chi_dn_spinor_field[1], + g_chi_up_spinor_field[0], g_chi_dn_spinor_field[0]); + + for(j = 1; j < (mnl->MDPolyDegree); j++){ + assign(g_chi_up_spinor_field[0], g_chi_up_spinor_field[1], VOLUME/2); + assign(g_chi_dn_spinor_field[0], g_chi_dn_spinor_field[1], VOLUME/2); + + Q_tau1_sub_const_ndpsi(g_chi_up_spinor_field[1], g_chi_dn_spinor_field[1], + g_chi_up_spinor_field[0], g_chi_dn_spinor_field[0], + mnl->MDPolyRoots[mnl->MDPolyDegree-2+j], phmc_Cpol, phmc_invmaxev); + } + Ptilde_ndpsi(g_chi_up_spinor_field[0], g_chi_dn_spinor_field[0], mnl->PtildeCoefs, + mnl->PtildeDegree, g_chi_up_spinor_field[1], 
g_chi_dn_spinor_field[1], &Qtm_pm_ndpsi); + } + else if( phmc_exact_poly==1 && g_epsbar!=0.0) { + /* Attention this is Q * tau1, up/dn are exchanged in the input spinor */ + /* this is used as an preconditioner */ + Qtm_ndpsi(g_chi_up_spinor_field[1],g_chi_dn_spinor_field[1], + g_chi_dn_spinor_field[0],g_chi_up_spinor_field[0]); + + assign(g_chi_up_spinor_field[0], g_chi_up_spinor_field[1], VOLUME/2); + assign(g_chi_dn_spinor_field[0], g_chi_dn_spinor_field[1], VOLUME/2); + + /* solve Q*tau1*P(Q^2) *x=y */ + cg_her_nd(g_chi_up_spinor_field[1],g_chi_dn_spinor_field[1], + g_chi_up_spinor_field[0],g_chi_dn_spinor_field[0], + 1000,1.e-16,0,VOLUME/2, Qtau1_P_ndpsi); + + /* phi= Bdagger phi */ + for(j = 1; j < (mnl->MDPolyDegree); j++){ + assign(g_chi_up_spinor_field[0], g_chi_up_spinor_field[1], VOLUME/2); + assign(g_chi_dn_spinor_field[0], g_chi_dn_spinor_field[1], VOLUME/2); + Q_tau1_sub_const_ndpsi(g_chi_up_spinor_field[1], g_chi_dn_spinor_field[1], + g_chi_up_spinor_field[0], g_chi_dn_spinor_field[0], + mnl->MDPolyRoots[mnl->MDPolyDegree-2+j], phmc_Cpol, phmc_invmaxev); + } + + assign(g_chi_up_spinor_field[0], g_chi_up_spinor_field[1], VOLUME/2); + assign(g_chi_dn_spinor_field[0], g_chi_dn_spinor_field[1], VOLUME/2); + } + else if(phmc_exact_poly==1 && g_epsbar==0.0) { + Qtm_pm_psi(g_chi_up_spinor_field[1], g_chi_up_spinor_field[0]); + + assign(g_chi_up_spinor_field[0], g_chi_up_spinor_field[1], VOLUME/2); + + /* solve (Q+)*(Q-)*P((Q+)*(Q-)) *x=y */ + cg_her(g_chi_up_spinor_field[1], g_chi_up_spinor_field[0], + 1000,1.e-16,0,VOLUME/2, Qtm_pm_Ptm_pm_psi); + + /* phi= Bdagger phi */ + for(j = 1; j < (mnl->MDPolyDegree); j++){ + assign(g_chi_up_spinor_field[0], g_chi_up_spinor_field[1], VOLUME/2); + Qtm_pm_sub_const_nrm_psi(g_chi_up_spinor_field[1], + g_chi_up_spinor_field[0], + mnl->MDPolyRoots[mnl->MDPolyDegree-2+j]); + } + assign(g_chi_up_spinor_field[0], g_chi_up_spinor_field[1], VOLUME/2); + } + + assign(mnl->pf, g_chi_up_spinor_field[0], VOLUME/2); + 
assign(mnl->pf2, g_chi_dn_spinor_field[0], VOLUME/2); + + if(g_proc_id == 0 && g_debug_level > 3) { + printf("called ndpoly_heatbath for id %d \n", id); + } + return; +} + + +double ndpoly_acc(const int id, hamiltonian_field_t * const hf) { + int j, ij=0; + double temp, sgn, fact, Diff; + double Ener[8]; + double factor[8]; + monomial * mnl = &monomial_list[id]; + spinor *up0, *dn0, *up1, *dn1, *dummy; + + ndpoly_set_global_parameter(mnl, phmc_exact_poly); + mnl->energy1 = 0.; + Ener[0] = 0; + factor[0] = 1.0; + for(j = 1; j < 8; j++){ + factor[j] = j*factor[j-1]; + Ener[j] = 0; + } + /* IF PHMC */ + up0 = g_chi_up_spinor_field[0]; + up1 = g_chi_up_spinor_field[1]; + dn0 = g_chi_dn_spinor_field[0]; + dn1 = g_chi_dn_spinor_field[1]; + /* This is needed if we consider only "1" in eq. 9 */ + assign(up0, mnl->pf , VOLUME/2); + assign(dn0, mnl->pf2, VOLUME/2); + + if(phmc_exact_poly==0) { + for(j = 1; j <= (mnl->MDPolyDegree-1); j++) { + /* Change this name !!*/ + Q_tau1_sub_const_ndpsi(up1, dn1, up0, dn0, mnl->MDPolyRoots[j-1], phmc_Cpol, phmc_invmaxev); + + dummy = up1; up1 = up0; up0 = dummy; + dummy = dn1; dn1 = dn0; dn0 = dummy; + /* result always in up0 and dn0 */ + } + + ij=0; + if(up0 != g_chi_up_spinor_field[ij]) { + assign(g_chi_up_spinor_field[ij], up0, VOLUME/2); + assign(g_chi_dn_spinor_field[ij], dn0, VOLUME/2); + } + + temp = square_norm(g_chi_up_spinor_field[ij], VOLUME/2, 1); + Ener[ij] = temp; + + temp = square_norm(g_chi_dn_spinor_field[ij], VOLUME/2, 1); + Ener[ij] += temp; + + if((g_proc_id == g_stdio_proc) && (g_debug_level > 4)) { + printf("# NDPOLY: At j=%d H before H-correction %e \n", ij, Ener[ij]); + } + + /* Here comes the loop for the evaluation of A, A^2, ... 
*/ + for(j = 1; j < 8; j++){ /* To omit corrections just set j<1 */ + + if(j % 2){ /* Chi[j] = ( Qdag P Ptilde ) Chi[j-1] */ + Ptilde_ndpsi(g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], + mnl->PtildeCoefs, mnl->PtildeDegree, + g_chi_up_spinor_field[j-1], g_chi_dn_spinor_field[j-1], &Qtm_pm_ndpsi); + Ptilde_ndpsi(g_chi_up_spinor_field[j-1], g_chi_dn_spinor_field[j-1], + mnl->MDPolyCoefs, mnl->MDPolyDegree, + g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], &Qtm_pm_ndpsi); + + Qtm_dagger_ndpsi(g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], + g_chi_up_spinor_field[j-1], g_chi_dn_spinor_field[j-1]); + } + else { /* Chi[j] = ( Ptilde P Q ) Chi[j-1] */ + Qtm_ndpsi(g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], + g_chi_up_spinor_field[j-1], g_chi_dn_spinor_field[j-1]); + Ptilde_ndpsi(g_chi_up_spinor_field[j-1], g_chi_dn_spinor_field[j-1], + mnl->MDPolyCoefs, mnl->MDPolyDegree, g_chi_up_spinor_field[j], + g_chi_dn_spinor_field[j], &Qtm_pm_ndpsi); + Ptilde_ndpsi(g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], + mnl->PtildeCoefs, mnl->PtildeDegree, + g_chi_up_spinor_field[j-1], g_chi_dn_spinor_field[j-1], &Qtm_pm_ndpsi); + } + + Ener[j] = Ener[j-1] + Ener[0]; + sgn = -1.0; + for(ij = 1; ij < j; ij++){ + fact = factor[j] / (factor[ij] * factor[j-ij]); + if((g_proc_id == g_stdio_proc) && (g_debug_level > 4)) { + printf("# NDPOLY: Here j=%d and ij=%d sign=%f fact=%f \n", j ,ij, sgn, fact); + } + Ener[j] += sgn*fact*Ener[ij]; + sgn = -sgn; + } + temp = square_norm(g_chi_up_spinor_field[j], VOLUME/2, 1); + temp += square_norm(g_chi_dn_spinor_field[j], VOLUME/2, 1); + if((g_proc_id == g_stdio_proc) && (g_debug_level > 4)) { + printf("# NDPOLY: Here j=%d sign=%f temp=%e \n", j, sgn, temp); + } + + Ener[j] += sgn*temp; + + Diff = fabs(Ener[j] - Ener[j-1]); + if((g_proc_id == g_stdio_proc) && (g_debug_level > 0)) { + printf("# NDPOLY: H-Correction after %d steps: %e \n", j, Diff); + } + + if(Diff < mnl->PrecisionHfinal) { + break; + } + } + mnl->energy1 += 
Ener[ij]; /* this is quite sticky */ + } + else if(phmc_exact_poly==1 && g_epsbar!=0.0) { + /* B(Q*tau1) */ + for(j = 1; j <= (mnl->MDPolyDegree-1); j++){ + Q_tau1_sub_const_ndpsi(up1, dn1, up0, dn0, mnl->MDPolyRoots[j-1], phmc_Cpol, phmc_invmaxev); + + dummy = up1; up1 = up0; up0 = dummy; + dummy = dn1; dn1 = dn0; dn0 = dummy; + /* result always in up0 and dn0 */ + } + if(up0 != g_chi_up_spinor_field[0]) { + assign(g_chi_up_spinor_field[0], up0, VOLUME/2); + assign(g_chi_dn_spinor_field[0], dn0, VOLUME/2); + } + + temp = square_norm(g_chi_up_spinor_field[0], VOLUME/2, 1); + Ener[0] = temp; + + temp = square_norm(g_chi_dn_spinor_field[0], VOLUME/2, 1); + Ener[0] += temp; + + if((g_proc_id == g_stdio_proc) && (g_debug_level > 4)) { + ij=0; + printf("# NDPOLY: At j=%d PHMC Only Final Energy %e \n", ij, Ener[0]); + } + + mnl->energy1 += Ener[0]; + } + else if(phmc_exact_poly == 1 && g_epsbar == 0.0) { + for(j = 1; j < (mnl->MDPolyDegree); j++) { + assign(g_chi_up_spinor_field[0], g_chi_up_spinor_field[1], VOLUME/2); + Qtm_pm_sub_const_nrm_psi(g_chi_up_spinor_field[1], + g_chi_up_spinor_field[0], + mnl->MDPolyRoots[j-1]); + } + assign(g_chi_up_spinor_field[0], g_chi_up_spinor_field[1], VOLUME/2); + + temp = square_norm(g_chi_up_spinor_field[0], VOLUME/2, 1); + Ener[0] = temp; + + if((g_proc_id == g_stdio_proc) && (g_debug_level > 4)) { + printf("# NDPOLY: At j=%d PHMC Only Final Energy %e \n", ij, Ener[0]); + } + + mnl->energy1 += Ener[0]; + } + + if(g_proc_id == 0 && g_debug_level > 3) { + printf("called ndpoly_acc for id %d %d dH = %1.10e\n", id, g_running_phmc, mnl->energy1 - mnl->energy0); + } + /* END IF PHMC */ + return(mnl->energy1 - mnl->energy0); +} + +
+/*
+ * Initialise the NDPOLY / NDCLOVER monomial with list index `id`.
+ *
+ * Copies the monomial's parameters into the g_* and phmc_* globals, derives
+ * EVMin/EVMax/EVMaxInv from StildeMin/StildeMax, determines the MD polynomial
+ * and the more precise polynomial Ptilde, allocates MDPolyDegree+1 chi spinor
+ * fields and reads the 2*MDPolyDegree-2 roots from MDPolyRootsFile.
+ *
+ * Returns 0.  Every failure (chi field or roots allocation, negative
+ * MDPolyLocNormConst, missing or unreadable roots file) is fatal: MPI is
+ * finalised (when compiled with MPI) and the process exits with status 6.
+ */
+int init_ndpoly_monomial(const int id) {
+  monomial * mnl = &monomial_list[id];
+  int j, k, errcode;
+  int no_roots;
+  FILE * ifs;
+  double *phmc_darray;
+  char title[100];
+  matrix_mult_nd Qsq = &Qtm_pm_ndpsi;
+  double atime, etime;
+
+  atime = gettime();
+  if(mnl->type == NDCLOVER) {
+    /* the clover variant uses its own squared operator and needs the
+       clover term set up before the polynomials are determined */
+    Qsq = &Qsw_pm_ndpsi;
+    init_sw_fields();
+    sw_term((const su3 **)g_gauge_field, mnl->kappa, mnl->c_sw);
+    sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar);
+  }
+
+  phmc_invmaxev = 1.0;
+  g_mubar = mnl->mubar;
+  g_epsbar = mnl->epsbar;
+  g_kappa = mnl->kappa;
+  g_c_sw = mnl->c_sw;
+  boundary(g_kappa);
+  if (g_epsbar!=0.0 || phmc_exact_poly==0){
+    phmc_Cpol = sqrt(mnl->MDPolyLocNormConst);
+  }
+  else {
+    phmc_Cpol = mnl->MDPolyLocNormConst;
+  }
+
+  /* This is the epsilon parameter */
+  mnl->EVMin = mnl->StildeMin / mnl->StildeMax;
+  mnl->EVMax = 1.;
+  /* In the following there is the "sqrt" since the value refers to
+     the hermitian Dirac operator (used in EV-computation), namely
+     S = Q Q^dag
+     When "S" is applied, we call phmc_invmaxev twice !!! */
+  if(g_epsbar!=0.0 || phmc_exact_poly==0) mnl->EVMaxInv = 1./(sqrt(mnl->StildeMax));
+  else if(g_epsbar==0.0 && phmc_exact_poly==1) mnl->EVMaxInv = 1./mnl->StildeMax;
+  phmc_cheb_evmin = mnl->EVMin;
+  phmc_invmaxev = mnl->EVMaxInv;
+  phmc_cheb_evmax = 1.0;
+
+  /* Here we prepare the less precise MD polynomial first */
+  degree_of_polynomial_nd(&mnl->MDPolyDegree, &mnl->MDPolyCoefs,
+                          mnl->EVMin, mnl->EVMax,
+                          Qsq, mnl->rngrepro);
+  phmc_dop_n_cheby = mnl->MDPolyDegree;
+  phmc_dop_cheby_coef = mnl->MDPolyCoefs;
+  if((g_proc_id == 0) && (g_debug_level > 1)) {
+    printf("# monomial %s approximation interval [stilde_min, stilde_max] = [%e, %e]\n",
+           mnl->name, mnl->StildeMin, mnl->StildeMax);
+    /* terminated with a newline so the next message starts on its own line */
+    printf("# monomial %s degree for P = %d, epsilont = %e, normalisation = %e\n",
+           mnl->name, mnl->MDPolyDegree-1, mnl->EVMin, mnl->EVMaxInv);
+  }
+
+  /* Chi`s-spinors memory allocation */
+  j = init_chi_spinor_field(VOLUMEPLUSRAND/2, (mnl->MDPolyDegree+1));
+  if ( j!= 0) {
+    fprintf(stderr, "Not enough memory for PHMC Chi fields! Aborting...\n");
+    /* leave with a failure status, like the other fatal paths below */
+#ifdef MPI
+    MPI_Finalize();
+#endif
+    exit(6);
+  }
+
+  /* End memory allocation */
+  /* Here we prepare the precise polynomial Ptilde */
+  degree_of_Ptilde(&mnl->PtildeDegree, &mnl->PtildeCoefs,
+                   mnl->EVMin, mnl->EVMax, mnl->MDPolyDegree,
+                   mnl->PrecisionPtilde, Qsq, mnl->rngrepro);
+  phmc_ptilde_cheby_coef = mnl->PtildeCoefs;
+  phmc_ptilde_n_cheby = mnl->PtildeDegree;
+
+  /* THIS IS THE OVERALL CONSTANT */
+  /* write phmc_Cpol as the result of the simple-program files (BigC^(1/2))^1/2
+     since BigC^(1/2) is the constant appearing in each factor of the
+     multiplication defining the monomial basis representation of the
+     polinomial in s, while its square phmc_root (BigC^(1/2))^1/2 is the
+     constant appearing in the multiplication representing the
+     polinomial in sqrt(s) .
+  */
+  if(mnl->MDPolyLocNormConst < 0.0){
+    fprintf(stderr, "Error, please specify LocNormConst in the input file! Aborting...\n");
+#ifdef MPI
+    MPI_Finalize();
+#endif
+    exit(6);
+  }
+
+  no_roots = 2*mnl->MDPolyDegree-2;
+  mnl->MDPolyRoots = calloc(no_roots,sizeof(_Complex double));
+  /* calloc may legitimately return NULL for a zero-sized request */
+  if(mnl->MDPolyRoots == NULL && no_roots > 0) {
+    fprintf(stderr, "Not enough memory for the roots of monomial %s! Aborting...\n", mnl->name);
+#ifdef MPI
+    MPI_Finalize();
+#endif
+    exit(6);
+  }
+
+  if((ifs = fopen(mnl->MDPolyRootsFile, "r")) != (FILE*)NULL) {
+    /* the first line of the file is a title and is skipped */
+    if (fgets(title, 100, ifs) == NULL) {
+      fprintf(stderr, "Error in reading %s! Aborting...\n", mnl->MDPolyRootsFile);
+#ifdef MPI
+      MPI_Finalize();
+#endif
+      exit(6);
+    }
+
+    /* Here we read in the 2n roots needed for the polinomial in sqrt(s) */
+    /* each record is "<index> <real part> <imaginary part>"; the complex
+       array is filled through a double view, real part first */
+    phmc_darray = (double*)mnl->MDPolyRoots;
+    for(j = 0; j < no_roots; ++j) {
+      errcode = fscanf(ifs, " %d %lf %lf \n", &k, &phmc_darray[2 * j], &phmc_darray[2 * j + 1]);
+      if(errcode != 3) {
+        /* a short or malformed file would silently leave roots at zero */
+        fprintf(stderr, "Error in reading %s! Aborting...\n", mnl->MDPolyRootsFile);
+#ifdef MPI
+        MPI_Finalize();
+#endif
+        exit(6);
+      }
+    }
+    fclose(ifs);
+  }
+  else {
+    fprintf(stderr, "File %s is missing! Aborting...\n", mnl->MDPolyRootsFile);
+#ifdef MPI
+    MPI_Finalize();
+#endif
+    exit(6);
+  }
+  etime = gettime();
+  if(g_debug_level > 0 && g_proc_id == 0) {
+    printf("# Time for init %s monomial: %e s\n", mnl->name, etime-atime);
+  }
+  return(0);
+}
+
+void ndpoly_set_global_parameter(monomial * const mnl, const int exact) { + + g_mubar = mnl->mubar; + g_epsbar = mnl->epsbar; + g_kappa = mnl->kappa; + g_c_sw = mnl->c_sw; + boundary(g_kappa); + + if (g_epsbar!=0.0 || exact == 0){ + phmc_Cpol = sqrt(mnl->MDPolyLocNormConst); + } + else { + phmc_Cpol = mnl->MDPolyLocNormConst; + } + + phmc_root = mnl->MDPolyRoots; + phmc_cheb_evmin = mnl->EVMin; + phmc_invmaxev = mnl->EVMaxInv; + phmc_cheb_evmax = 1.0; + + phmc_dop_n_cheby = mnl->MDPolyDegree; + phmc_dop_cheby_coef = mnl->MDPolyCoefs; + + phmc_ptilde_cheby_coef = mnl->PtildeCoefs; + phmc_ptilde_n_cheby = mnl->PtildeDegree; + + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndpoly_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndpoly_monomial.h new file mode 100644 index 0000000000000000000000000000000000000000..d7787d4629534e84f6f578a109ef886af499eca0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndpoly_monomial.h @@ -0,0 +1,31 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _NDPOLY_MONOMIAL_H +#define _NDPOLY_MONOMIAL_H + +#include "hamiltonian_field.h" + +void ndpoly_derivative(const int id, hamiltonian_field_t * const hf); +double ndpoly_acc(const int id, hamiltonian_field_t * const hf); +void ndpoly_heatbath(const int id, hamiltonian_field_t * const hf); +int init_ndpoly_monomial(const int id); +void ndpoly_set_global_parameter(monomial * const mnl, const int exact); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndrat_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndrat_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..7c260794813714caf828e4e45e3b187613b3df9d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndrat_monomial.c @@ -0,0 +1,356 @@ +/*********************************************************************** + * + * Copyright (C) 2013 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "gettime.h" +#include "solver/solver.h" +#include "solver/monomial_solve.h" +#include "deriv_Sb.h" +#include "init/init_chi_spinor_field.h" +#include "operator/tm_operators.h" +#include "operator/tm_operators_32.h" +#include "operator/tm_operators_nd.h" +#include "operator/tm_operators_nd_32.h" +#include "operator/Hopping_Matrix.h" +#include "monomial/monomial.h" +#include "hamiltonian_field.h" +#include "boundary.h" +#include "operator/clovertm_operators.h" +#include "operator/clover_leaf.h" +#include "rational/rational.h" +#include "phmc.h" +#include "ndrat_monomial.h" + +void nd_set_global_parameter(monomial * const mnl) { + + g_mubar = mnl->mubar; + g_epsbar = mnl->epsbar; + g_kappa = mnl->kappa; + g_c_sw = mnl->c_sw; + boundary(g_kappa); + phmc_cheb_evmin = mnl->EVMin; + phmc_invmaxev = mnl->EVMaxInv; + phmc_cheb_evmax = 1.; + phmc_Cpol = 1.; + // used for preconditioning in cloverdetrat + g_mu3 = 0.; + + return; +} + + +/******************************************** + * + * Here \delta S_b is computed + * + ********************************************/ + +void ndrat_derivative(const int id, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[id]; + solver_pm_t solver_pm; + double atime, etime; + atime = gettime(); + nd_set_global_parameter(mnl); + if(mnl->type == NDCLOVERRAT) { + for(int i = 0; i < VOLUME; i++) { + for(int mu = 0; mu < 4; mu++) { + _su3_zero(swm[i][mu]); + _su3_zero(swp[i][mu]); + } + } + + // we compute the clover term (1 + T_ee(oo)) for all sites x + sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); + // we invert it for the even sites only + sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar); + copy_32_sw_fields(); + } + mnl->forcefactor = mnl->EVMaxInv; 
+ + solver_pm.max_iter = mnl->maxiter; + solver_pm.squared_solver_prec = mnl->forceprec; + solver_pm.no_shifts = mnl->rat.np; + solver_pm.shifts = mnl->rat.mu; + solver_pm.rel_prec = g_relative_precision_flag; + solver_pm.type = mnl->solver; + + solver_pm.M_ndpsi = &Qtm_pm_ndpsi; + solver_pm.M_ndpsi32 = &Qtm_pm_ndpsi_32; + if(mnl->type == NDCLOVERRAT) { + solver_pm.M_ndpsi = &Qsw_pm_ndpsi; + solver_pm.M_ndpsi32 = &Qsw_pm_ndpsi_32; + } + solver_pm.sdim = VOLUME/2; + // this generates all X_j,o (odd sites only) -> g_chi_up|dn_spinor_field + mnl->iter1 += solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field, + mnl->pf, mnl->pf2,&solver_pm); + + for(int j = (mnl->rat.np-1); j > -1; j--) { + if(mnl->type == NDCLOVERRAT) { + // multiply with Q_h * tau^1 + i mu_j to get Y_j,o (odd sites) + // needs phmc_Cpol = 1 to work for ndrat! + Qsw_tau1_sub_const_ndpsi(mnl->w_fields[0], mnl->w_fields[1], + g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], + -I*mnl->rat.mu[j], 1., mnl->EVMaxInv); + + /* Get the even parts X_j,e */ + /* H_eo_... includes tau_1 */ + H_eo_sw_ndpsi(mnl->w_fields[2], mnl->w_fields[3], + g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j]); + + } else { + // multiply with Q_h * tau^1 + i mu_j to get Y_j,o (odd sites) + // needs phmc_Cpol = 1 to work for ndrat! + Q_tau1_sub_const_ndpsi(mnl->w_fields[0], mnl->w_fields[1], + g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], + -I*mnl->rat.mu[j], 1., mnl->EVMaxInv); + + /* Get the even parts X_j,e */ + /* H_eo_... 
includes tau_1 */ + H_eo_tm_ndpsi(mnl->w_fields[2], mnl->w_fields[3], + g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], EO); + } + /* X_j,e^dagger \delta M_eo Y_j,o */ + deriv_Sb(EO, mnl->w_fields[2], mnl->w_fields[0], + hf, mnl->rat.rmu[j]*mnl->forcefactor); + deriv_Sb(EO, mnl->w_fields[3], mnl->w_fields[1], + hf, mnl->rat.rmu[j]*mnl->forcefactor); + + if(mnl->type == NDCLOVERRAT) { + /* Get the even parts Y_j,e */ + H_eo_sw_ndpsi(mnl->w_fields[4], mnl->w_fields[5], + mnl->w_fields[0], mnl->w_fields[1]); + } + else { + /* Get the even parts Y_j,e */ + H_eo_tm_ndpsi(mnl->w_fields[4], mnl->w_fields[5], + mnl->w_fields[0], mnl->w_fields[1], EO); + + } + /* X_j,o \delta M_oe Y_j,e */ + deriv_Sb(OE, g_chi_up_spinor_field[j], mnl->w_fields[4], + hf, mnl->rat.rmu[j]*mnl->forcefactor); + deriv_Sb(OE, g_chi_dn_spinor_field[j], mnl->w_fields[5], + hf, mnl->rat.rmu[j]*mnl->forcefactor); + + if(mnl->type == NDCLOVERRAT) { + // even/even sites sandwiched by tau_1 gamma_5 Y_e and gamma_5 X_e + sw_spinor(EE, mnl->w_fields[5], mnl->w_fields[2], + mnl->rat.rmu[j]*mnl->forcefactor); + // odd/odd sites sandwiched by tau_1 gamma_5 Y_o and gamma_5 X_o + sw_spinor(OO, g_chi_up_spinor_field[j], mnl->w_fields[1], + mnl->rat.rmu[j]*mnl->forcefactor); + + // even/even sites sandwiched by tau_1 gamma_5 Y_e and gamma_5 X_e + sw_spinor(EE, mnl->w_fields[4], mnl->w_fields[3], + mnl->rat.rmu[j]*mnl->forcefactor); + // odd/odd sites sandwiched by tau_1 gamma_5 Y_o and gamma_5 X_o + sw_spinor(OO, g_chi_dn_spinor_field[j], mnl->w_fields[0], + mnl->rat.rmu[j]*mnl->forcefactor); + } + } + // trlog part does not depend on the normalisation + if(mnl->type == NDCLOVERRAT && mnl->trlog) { + sw_deriv_nd(EE); + } + if(mnl->type == NDCLOVERRAT) { + sw_all(hf, mnl->kappa, mnl->c_sw); + } + etime = gettime(); + if(g_debug_level > 1 && g_proc_id == 0) { + printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime); + } + return; +} + + +void ndrat_heatbath(const int id, hamiltonian_field_t 
* const hf) { + monomial * mnl = &monomial_list[id]; + solver_pm_t solver_pm; + double atime, etime; + atime = gettime(); + nd_set_global_parameter(mnl); + mnl->iter1 = 0; + if(mnl->type == NDCLOVERRAT) { + init_sw_fields(); + sw_term((const su3**)hf->gaugefield, mnl->kappa, mnl->c_sw); + sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar); + copy_32_sw_fields(); + } + // we measure before the trajectory! + if((mnl->rec_ev != 0) && (hf->traj_counter%mnl->rec_ev == 0)) { + if(mnl->type != NDCLOVERRAT) phmc_compute_ev(hf->traj_counter-1, id, &Qtm_pm_ndbipsi); + else phmc_compute_ev(hf->traj_counter-1, id, &Qsw_pm_ndbipsi); + } + + // the Gaussian distributed random fields + mnl->energy0 = 0.; + random_spinor_field_eo(mnl->pf, mnl->rngrepro, RN_GAUSS); + mnl->energy0 = square_norm(mnl->pf, VOLUME/2, 1); + + random_spinor_field_eo(mnl->pf2, mnl->rngrepro, RN_GAUSS); + mnl->energy0 += square_norm(mnl->pf2, VOLUME/2, 1); + // set solver parameters + solver_pm.max_iter = mnl->maxiter; + solver_pm.squared_solver_prec = mnl->accprec; + solver_pm.no_shifts = mnl->rat.np; + solver_pm.shifts = mnl->rat.nu; + solver_pm.type = mnl->solver; + solver_pm.M_ndpsi = &Qtm_pm_ndpsi; + solver_pm.M_ndpsi32 = &Qtm_pm_ndpsi_32; + if(mnl->type == NDCLOVERRAT) { + solver_pm.M_ndpsi = &Qsw_pm_ndpsi; + solver_pm.M_ndpsi32 = &Qsw_pm_ndpsi_32; + } + solver_pm.sdim = VOLUME/2; + solver_pm.rel_prec = g_relative_precision_flag; + mnl->iter0 = solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field, + mnl->pf, mnl->pf2, &solver_pm); + + assign(mnl->w_fields[2], mnl->pf, VOLUME/2); + assign(mnl->w_fields[3], mnl->pf2, VOLUME/2); + + // apply C to the random field to generate pseudo-fermion fields + for(int j = (mnl->rat.np-1); j > -1; j--) { + // Q_h * tau^1 - i nu_j + // this needs phmc_Cpol = 1 to work! 
+ if(mnl->type == NDCLOVERRAT) { + Qsw_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np], + g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], + I*mnl->rat.nu[j], 1., mnl->EVMaxInv); + } + else { + Q_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np], + g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], + I*mnl->rat.nu[j], 1., mnl->EVMaxInv); + } + assign_add_mul(mnl->pf, g_chi_up_spinor_field[mnl->rat.np], I*mnl->rat.rnu[j], VOLUME/2); + assign_add_mul(mnl->pf2, g_chi_dn_spinor_field[mnl->rat.np], I*mnl->rat.rnu[j], VOLUME/2); + } + + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { + printf("called ndrat_heatbath for id %d energy %f\n", id, mnl->energy0); + } + } + return; +} + + +double ndrat_acc(const int id, hamiltonian_field_t * const hf) { + solver_pm_t solver_pm; + monomial * mnl = &monomial_list[id]; + double atime, etime; + atime = gettime(); + nd_set_global_parameter(mnl); + if(mnl->type == NDCLOVERRAT) { + sw_term((const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); + sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar); + copy_32_sw_fields(); + } + mnl->energy1 = 0.; + + solver_pm.max_iter = mnl->maxiter; + solver_pm.squared_solver_prec = mnl->accprec; + solver_pm.no_shifts = mnl->rat.np; + solver_pm.shifts = mnl->rat.mu; + solver_pm.type = mnl->solver; + + solver_pm.M_ndpsi = &Qtm_pm_ndpsi; + solver_pm.M_ndpsi32 = &Qtm_pm_ndpsi_32; + if(mnl->type == NDCLOVERRAT) { + solver_pm.M_ndpsi = &Qsw_pm_ndpsi; + solver_pm.M_ndpsi32 = &Qsw_pm_ndpsi_32; + } + solver_pm.sdim = VOLUME/2; + solver_pm.rel_prec = g_relative_precision_flag; + mnl->iter0 += solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field, + mnl->pf, mnl->pf2,&solver_pm); + + // apply R to the pseudo-fermion fields + assign(mnl->w_fields[0], mnl->pf, VOLUME/2); + 
assign(mnl->w_fields[1], mnl->pf2, VOLUME/2); + for(int j = (mnl->rat.np-1); j > -1; j--) { + assign_add_mul_r(mnl->w_fields[0], g_chi_up_spinor_field[j], + mnl->rat.rmu[j], VOLUME/2); + assign_add_mul_r(mnl->w_fields[1], g_chi_dn_spinor_field[j], + mnl->rat.rmu[j], VOLUME/2); + } + + mnl->energy1 = scalar_prod_r(mnl->pf, mnl->w_fields[0], VOLUME/2, 1); + mnl->energy1 += scalar_prod_r(mnl->pf2, mnl->w_fields[1], VOLUME/2, 1); + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 0) { // shoud be 3 + printf("called ndrat_acc for id %d dH = %1.10e\n", id, mnl->energy1 - mnl->energy0); + } + } + return(mnl->energy1 - mnl->energy0); +} + + +int init_ndrat_monomial(const int id) { + monomial * mnl = &monomial_list[id]; + + mnl->EVMin = mnl->StildeMin / mnl->StildeMax; + mnl->EVMax = 1.; + mnl->EVMaxInv = 1./(sqrt(mnl->StildeMax)); + + if(mnl->type == RAT || mnl->type == CLOVERRAT || + mnl->type == RATCOR || mnl->type == CLOVERRATCOR) { + init_rational(&mnl->rat, 1); + + if(init_chi_spinor_field(VOLUMEPLUSRAND/2, (mnl->rat.np+2)/2) != 0) { + fprintf(stderr, "Not enough memory for Chi fields! Aborting...\n"); + exit(0); + } + } + else { + init_rational(&mnl->rat, 0); + mnl->EVMin = mnl->StildeMin / mnl->StildeMax; + mnl->EVMax = 1.; + mnl->EVMaxInv = 1./(sqrt(mnl->StildeMax)); + + if(init_chi_spinor_field(VOLUMEPLUSRAND/2, (mnl->rat.np+1)) != 0) { + fprintf(stderr, "Not enough memory for Chi fields! 
Aborting...\n"); + exit(0); + } + } + + return(0); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndrat_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndrat_monomial.h new file mode 100644 index 0000000000000000000000000000000000000000..8638cdbbffb2d02416134885c5aba94cfdf59fc9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndrat_monomial.h @@ -0,0 +1,31 @@ +/*********************************************************************** + * + * Copyright (C) 2013 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _NDRAT_MONOMIAL_H +#define _NDRAT_MONOMIAL_H + +#include "hamiltonian_field.h" + +void ndrat_derivative(const int id, hamiltonian_field_t * const hf); +double ndrat_acc(const int id, hamiltonian_field_t * const hf); +void ndrat_heatbath(const int id, hamiltonian_field_t * const hf); +void nd_set_global_parameter(monomial * const mnl); +int init_ndrat_monomial(const int id); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndratcor_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndratcor_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..3f5b9e60b02c3f94aebeebd9fe94a773bba9e769 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndratcor_monomial.c @@ -0,0 +1,316 @@ +/*********************************************************************** + * + * Copyright (C) 2013 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "gettime.h" +#include "solver/solver.h" +#include "solver/monomial_solve.h" +#include "deriv_Sb.h" +#include "init/init_chi_spinor_field.h" +#include "operator/tm_operators.h" +#include "operator/tm_operators_32.h" +#include "operator/tm_operators_nd.h" +#include "operator/tm_operators_nd_32.h" +#include "operator/Hopping_Matrix.h" +#include "monomial/monomial.h" +#include "hamiltonian_field.h" +#include "boundary.h" +#include "operator/clovertm_operators.h" +#include "operator/clover_leaf.h" +#include "rational/rational.h" +#include "phmc.h" +#include "ndrat_monomial.h" +#include "ndratcor_monomial.h" + +// computes ||(1 - C^dagger R C) phi|| +void check_C_ndpsi(spinor * const k_up, spinor * const k_dn, + spinor * const l_up, spinor * const l_dn, + const int id, hamiltonian_field_t * const hf, + solver_pm_t * solver_pm); + +// applies (Q^2 R^2 -1) phi +double apply_Z_ndpsi(spinor * const k_up, spinor * const k_dn, + spinor * const l_up, spinor * const l_dn, + const int id, hamiltonian_field_t * const hf, + solver_pm_t * solver_pm); + + + +void ndratcor_heatbath(const int id, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[id]; + solver_pm_t solver_pm; + double atime, etime, delta; + spinor * up0, * dn0, * up1, * dn1, * tup, * tdn; + double coefs[6] = {1./4., -3./32., 7./122., -77./2048., 231./8192., -1463./65536.}; + atime = gettime(); + nd_set_global_parameter(mnl); + g_mu3 = 0.; + mnl->iter0 = 0; + if(mnl->type == NDCLOVERRATCOR) { + init_sw_fields(); + sw_term((const su3**)hf->gaugefield, mnl->kappa, mnl->c_sw); + sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar); + copy_32_sw_fields(); + } + // we measure before the trajectory! 
+ if((mnl->rec_ev != 0) && (hf->traj_counter%mnl->rec_ev == 0)) { + if(mnl->type != NDCLOVERRAT) phmc_compute_ev(hf->traj_counter-1, id, &Qtm_pm_ndbipsi); + else phmc_compute_ev(hf->traj_counter-1, id, &Qsw_pm_ndbipsi); + } + + // the Gaussian distributed random fields + mnl->energy0 = 0.; + random_spinor_field_eo(mnl->pf, mnl->rngrepro, RN_GAUSS); + mnl->energy0 = square_norm(mnl->pf, VOLUME/2, 1); + + random_spinor_field_eo(mnl->pf2, mnl->rngrepro, RN_GAUSS); + mnl->energy0 += square_norm(mnl->pf2, VOLUME/2, 1); + + solver_pm.max_iter = mnl->maxiter; + solver_pm.squared_solver_prec = mnl->accprec; + solver_pm.no_shifts = mnl->rat.np; + solver_pm.shifts = mnl->rat.mu; + solver_pm.type = mnl->solver; + solver_pm.M_ndpsi = &Qtm_pm_ndpsi; + solver_pm.M_ndpsi32 = &Qtm_pm_ndpsi_32; + if(mnl->type == NDCLOVERRATCOR) { + solver_pm.M_ndpsi = &Qsw_pm_ndpsi; + solver_pm.M_ndpsi32 = &Qsw_pm_ndpsi_32; + } + solver_pm.sdim = VOLUME/2; + solver_pm.rel_prec = g_relative_precision_flag; + + // apply B to the random field to generate pseudo-fermion fields + assign(mnl->w_fields[0], mnl->pf, VOLUME/2); + assign(mnl->w_fields[1], mnl->pf2, VOLUME/2); + up0 = mnl->w_fields[0]; dn0 = mnl->w_fields[1]; + up1 = mnl->w_fields[2]; dn1 = mnl->w_fields[3]; + + for(int i = 1; i < 8; i++) { + delta = apply_Z_ndpsi(up1, dn1, up0, dn0, id, hf, &solver_pm); + assign_add_mul_r(mnl->pf, up1, coefs[i-1], VOLUME/2); + assign_add_mul_r(mnl->pf2, dn1, coefs[i-1], VOLUME/2); + if(delta < mnl->accprec) break; + tup = up0; tdn = dn0; + up0 = up1; dn0 = dn1; + up1 = tup; dn1 = tdn; + } + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { + printf("called ndratcor_heatbath for id %d energy %f\n", id, mnl->energy0); + } + } + return; +} + + +double ndratcor_acc(const int id, hamiltonian_field_t * const hf) { + solver_pm_t solver_pm; + monomial * mnl = &monomial_list[id]; + double 
atime, etime, delta; + spinor * up0, * dn0, * up1, * dn1, * tup, * tdn; + double coefs[6] = {-1./2., 3./8., -5./16., 35./128., -63./256., 231./1024.}; + atime = gettime(); + nd_set_global_parameter(mnl); + g_mu3 = 0.; + if(mnl->type == NDCLOVERRATCOR) { + sw_term((const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); + sw_invert_nd(mnl->mubar*mnl->mubar - mnl->epsbar*mnl->epsbar); + copy_32_sw_fields(); + } + mnl->energy1 = 0.; + + solver_pm.max_iter = mnl->maxiter; + solver_pm.squared_solver_prec = mnl->accprec; + solver_pm.no_shifts = mnl->rat.np; + solver_pm.shifts = mnl->rat.mu; + solver_pm.type = mnl->solver; + solver_pm.M_ndpsi = &Qtm_pm_ndpsi; + solver_pm.M_ndpsi32 = &Qtm_pm_ndpsi_32; + if(mnl->type == NDCLOVERRATCOR) { + solver_pm.M_ndpsi = &Qsw_pm_ndpsi; + solver_pm.M_ndpsi32 = &Qsw_pm_ndpsi_32; + } + solver_pm.sdim = VOLUME/2; + solver_pm.rel_prec = g_relative_precision_flag; + + // apply (Q R)^(-1) to pseudo-fermion fields + assign(mnl->w_fields[4], mnl->pf, VOLUME/2); + assign(mnl->w_fields[5], mnl->pf2, VOLUME/2); + up0 = mnl->w_fields[0]; dn0 = mnl->w_fields[1]; + up1 = mnl->w_fields[2]; dn1 = mnl->w_fields[3]; + + delta = apply_Z_ndpsi(up0, dn0, mnl->pf, mnl->pf2, id, hf, &solver_pm); + assign_add_mul_r(mnl->w_fields[4], up0, coefs[0], VOLUME/2); + assign_add_mul_r(mnl->w_fields[5], dn0, coefs[0], VOLUME/2); + + for(int i = 2; i < 8; i++) { + if(delta < mnl->accprec) break; + delta = apply_Z_ndpsi(up1, dn1, up0, dn0, id, hf, &solver_pm); + assign_add_mul_r(mnl->w_fields[4], up1, coefs[i-1], VOLUME/2); + assign_add_mul_r(mnl->w_fields[5], dn1, coefs[i-1], VOLUME/2); + tup = up0; tdn = dn0; + up0 = up1; dn0 = dn1; + up1 = tup; dn1 = tdn; + } + + mnl->energy1 = scalar_prod_r(mnl->pf, mnl->w_fields[4], VOLUME/2, 1); + mnl->energy1 += scalar_prod_r(mnl->pf2, mnl->w_fields[5], VOLUME/2, 1); + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level 
> 3) { // shoud be 3 + printf("called ndratcor_acc for id %d dH = %1.10e\n", id, mnl->energy1 - mnl->energy0); + } + } + return(mnl->energy1 - mnl->energy0); +} + +// applies ((Q_h\tau_1 * R)^2 - 1) + +double apply_Z_ndpsi(spinor * const k_up, spinor * const k_dn, + spinor * const l_up, spinor * const l_dn, + const int id, hamiltonian_field_t * const hf, + solver_pm_t * solver_pm) { + monomial * mnl = &monomial_list[id]; + + mnl->iter0 += solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field, + l_up, l_dn, solver_pm); + + // apply R to the pseudo-fermion fields + assign(k_up, l_up, VOLUME/2); + assign(k_dn, l_dn, VOLUME/2); + for(int j = (mnl->rat.np-1); j > -1; j--) { + assign_add_mul_r(k_up, g_chi_up_spinor_field[j], + mnl->rat.rmu[j], VOLUME/2); + assign_add_mul_r(k_dn, g_chi_dn_spinor_field[j], + mnl->rat.rmu[j], VOLUME/2); + } + + // apply R a second time + solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field, + k_up, k_dn, + solver_pm); + for(int j = (mnl->rat.np-1); j > -1; j--) { + assign_add_mul_r(k_up, g_chi_up_spinor_field[j], + mnl->rat.rmu[j], VOLUME/2); + assign_add_mul_r(k_dn, g_chi_dn_spinor_field[j], + mnl->rat.rmu[j], VOLUME/2); + } + mul_r(g_chi_up_spinor_field[mnl->rat.np], mnl->rat.A*mnl->rat.A, + k_up, VOLUME/2); + mul_r(g_chi_dn_spinor_field[mnl->rat.np], mnl->rat.A*mnl->rat.A, + k_dn, VOLUME/2); + // apply Q^2 and compute the residue + solver_pm->M_ndpsi(k_up, k_dn, + g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np]); + diff(k_up, k_up, l_up, VOLUME/2); + diff(k_dn, k_dn, l_dn, VOLUME/2); + double resi = square_norm(k_up, VOLUME/2, 1) + square_norm(k_dn, VOLUME/2, 1); + if(g_debug_level > 2 && g_proc_id == 0) { + printf("# NDRATCOR: ||Z * phi|| = %e\n", resi); + } + return(resi); +} + +// computes ||(1 - C^dagger R C) phi|| + +void check_C_ndpsi(spinor * const k_up, spinor * const k_dn, + spinor * const l_up, spinor * const l_dn, + const int id, hamiltonian_field_t * const hf, + solver_pm_t * solver_pm) { + 
monomial * mnl = &monomial_list[id]; + mnl->iter0 = solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field, + l_up, l_dn, solver_pm); + + assign(k_up, l_up, VOLUME/2); + assign(k_dn, l_dn, VOLUME/2); + + // apply C to the random field to generate pseudo-fermion fields + for(int j = (mnl->rat.np-1); j > -1; j--) { + // Q_h * tau^1 - i nu_j + // this needs phmc_Cpol = 1 to work! + if(mnl->type == NDCLOVERRATCOR || mnl->type == NDCLOVERRAT) { + Qsw_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np], + g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], + I*mnl->rat.nu[j], 1., mnl->EVMaxInv); + } + else { + Q_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np], + g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], + I*mnl->rat.nu[j], 1., mnl->EVMaxInv); + } + assign_add_mul(k_up, g_chi_up_spinor_field[mnl->rat.np], I*mnl->rat.rnu[j], VOLUME/2); + assign_add_mul(k_dn, g_chi_dn_spinor_field[mnl->rat.np], I*mnl->rat.rnu[j], VOLUME/2); + } + //apply R + solver_pm->shifts = mnl->rat.mu; + solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field, + k_up, k_dn, + solver_pm); + for(int j = (mnl->rat.np-1); j > -1; j--) { + assign_add_mul_r(k_up, g_chi_up_spinor_field[j], + mnl->rat.rmu[j], VOLUME/2); + assign_add_mul_r(k_dn, g_chi_dn_spinor_field[j], + mnl->rat.rmu[j], VOLUME/2); + } + // apply C^dagger + solver_pm->shifts = mnl->rat.nu; + solve_mms_nd(g_chi_up_spinor_field, g_chi_dn_spinor_field, + k_up, k_dn, solver_pm); + for(int j = (mnl->rat.np-1); j > -1; j--) { + // Q_h * tau^1 + i nu_j + if(mnl->type == NDCLOVERRATCOR || mnl->type == NDCLOVERRAT) { + Qsw_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np], + g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], + -I*mnl->rat.nu[j], 1., mnl->EVMaxInv); + } + else { + Q_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np], + g_chi_up_spinor_field[j], 
g_chi_dn_spinor_field[j], + -I*mnl->rat.nu[j], 1., mnl->EVMaxInv); + } + assign_add_mul(k_up, g_chi_up_spinor_field[mnl->rat.np], -I*mnl->rat.rnu[j], VOLUME/2); + assign_add_mul(k_dn, g_chi_dn_spinor_field[mnl->rat.np], -I*mnl->rat.rnu[j], VOLUME/2); + } + diff(k_up, k_up, l_up, VOLUME/2); + diff(k_dn, k_dn, l_dn, VOLUME/2); + double resi = square_norm(k_up, VOLUME/2, 1); + resi += square_norm(k_dn, VOLUME/2, 1); + if(g_proc_id == 0) printf("|| (1-C^dagger R C)*phi|| = %e\n", resi); + + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndratcor_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndratcor_monomial.h new file mode 100644 index 0000000000000000000000000000000000000000..82ba0c53f66fee8bb8557def3c7496684d09bfe7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ndratcor_monomial.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * + * Copyright (C) 2013 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _NDRATCOR_MONOMIAL_H +#define _NDRATCOR_MONOMIAL_H + +#include "hamiltonian_field.h" + +double ndratcor_acc(const int id, hamiltonian_field_t * const hf); +void ndratcor_heatbath(const int id, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/poly_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/poly_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..07fc20455e9b1039f2841ca34208ab2790786914 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/poly_monomial.c @@ -0,0 +1,349 @@ +/*********************************************************************** + * + * Copyright (C) 2010 Andreas Nube + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "start.h" +#include "gettime.h" +#include "read_input.h" +#include "monomial/monomial.h" +#include "poly_monomial.h" +#include "boundary.h" +#include "linalg/square_norm.h" +#include "linalg/assign.h" +#include "linalg/mul_r.h" +#include "linalg/diff.h" +#include "linalg_eo.h" +#include "deriv_Sb.h" +#include "operator/tm_operators.h" +#include "solver/solver.h" +#include "solver/chrono_guess.h" +#include "solver/eigenvalues.h" +#include "operator/tm_operators_nd.h" +#include "operator/Hopping_Matrix.h" +#include "hamiltonian_field.h" +#include "phmc.h" + + + +inline void setPhmcVars(monomial *mnl){ + phmc_invmaxev=1.0/mnl->MDPolyLmax; + phmc_dop_n_cheby=(mnl->MDPolyDegree/2)+1; + phmc_Cpol=mnl->MDPolyLocNormConst; + phmc_root=mnl->MDPolyRoots; +} + +void poly_derivative(const int id, hamiltonian_field_t * const hf){ + double atime, etime; + monomial * mnl = &monomial_list[id]; + int k,j; + int degreehalf=mnl->MDPolyDegree/2; + + spinor** chi_spinor_field=mnl->MDPoly_chi_spinor_fields; + + atime = gettime(); + (*mnl).forcefactor = -mnl->MDPolyLocNormConst/mnl->MDPolyLmax; + + + /* push and set phmc vars */ + pushPhmcVars(); + setPhmcVars(mnl); + + + if(mnl->even_odd_flag){ + + + + if(mnl->MDPolyDetRatio==1){ + g_mu=mnl->mu2; + boundary(mnl->kappa2); + Qtm_plus_psi(chi_spinor_field[0],mnl->pf); + } else { + assign(chi_spinor_field[0],mnl->pf,VOLUME/2); + } + + + g_mu=mnl->mu; + boundary(mnl->kappa); + + + /* Here comes the definitions for the chi_j fields */ + /* from j=0 (chi_0 = phi) ..... 
to j = n-1 */ + for(k = 0; k < degreehalf-1 ; k++) { + Qtm_pm_sub_const_nrm_psi(chi_spinor_field[k+1], + chi_spinor_field[k], + mnl->MDPolyRoots[k]); + } + + + + assign(chi_spinor_field[degreehalf+1], + chi_spinor_field[degreehalf-1], VOLUME/2); + + /* loop over monoms */ + for(j=degreehalf; j>=1; j--) { + + assign(chi_spinor_field[degreehalf], + chi_spinor_field[degreehalf+1], VOLUME/2); + + Qtm_pm_sub_const_nrm_psi(chi_spinor_field[degreehalf+1], + chi_spinor_field[degreehalf], + mnl->MDPolyRoots[mnl->MDPolyDegree-(j+1)]); + + + Qtm_minus_psi(mnl->w_fields[1],chi_spinor_field[j-1]); + + H_eo_tm_inv_psi(mnl->w_fields[0], chi_spinor_field[degreehalf+1], EO, -1.); + deriv_Sb(OE, mnl->w_fields[1], mnl->w_fields[0], hf, mnl->forcefactor); + + H_eo_tm_inv_psi(mnl->w_fields[0], mnl->w_fields[1], EO, 1.); + deriv_Sb(EO, mnl->w_fields[0], chi_spinor_field[degreehalf+1], hf, mnl->forcefactor); + + Qtm_minus_psi(mnl->w_fields[1],chi_spinor_field[degreehalf+1]); + + H_eo_tm_inv_psi(mnl->w_fields[0],mnl->w_fields[1], EO, +1.); + deriv_Sb(OE, chi_spinor_field[j-1] , mnl->w_fields[0], hf, mnl->forcefactor); + + H_eo_tm_inv_psi(mnl->w_fields[0], chi_spinor_field[j-1], EO, -1.); + deriv_Sb(EO, mnl->w_fields[0], mnl->w_fields[1], hf, mnl->forcefactor); + } + + + if(mnl->MDPolyDetRatio==1){ + /****************************************** + * multiply with the last missing monomial * + * such that we get an evaluation of P * + ******************************************/ + Qtm_pm_sub_const_nrm_psi(chi_spinor_field[degreehalf], + chi_spinor_field[degreehalf+1], + mnl->MDPolyRoots[mnl->MDPolyDegree-1]); + + /* devide by this factor cause its multiplied again in update_fermion_momenta see comment below */ + mul_r(chi_spinor_field[degreehalf], + 1./mnl->MDPolyLocNormConst*mnl->MDPolyLmax, + chi_spinor_field[degreehalf], + VOLUME/2); + + + g_mu=mnl->mu2; + boundary(mnl->kappa2); + + H_eo_tm_inv_psi(mnl->w_fields[0],chi_spinor_field[degreehalf], EO, -1.); + deriv_Sb(OE, mnl->pf , 
mnl->w_fields[0], hf, mnl->forcefactor); + + H_eo_tm_inv_psi(mnl->w_fields[0], mnl->pf, EO, +1.); + deriv_Sb(EO, mnl->w_fields[0], chi_spinor_field[degreehalf], hf, mnl->forcefactor); + } + } + else { + if(g_proc_id == 0) { + fprintf(stderr,"Error: PHMC for light quarks not implementeted for non even/odd preconditioning\n"); + } + + g_mu = g_mu1; + boundary(g_kappa); + popPhmcVars(); + + return; + } + + /* restore all changed global vars */ + g_mu = g_mu1; + boundary(g_kappa); + popPhmcVars(); + etime = gettime(); + if(g_debug_level > 1 && g_proc_id == 0) { + printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime); + } + return; +} + +double poly_acc(const int id, hamiltonian_field_t * const hf){ + + monomial * mnl = &monomial_list[id]; + int j; + double diff; + int no_eigenvalues=-1; + double atime, etime; + atime = gettime(); + if(mnl->even_odd_flag) { + if(mnl->MDPolyDetRatio==1) { + g_mu = mnl->mu2; + boundary(mnl->kappa2); + Qtm_plus_psi(mnl->w_fields[1],mnl->pf); + } + else { + assign(mnl->w_fields[1],mnl->pf,VOLUME/2); + } + + g_mu = mnl->mu; + boundary(mnl->kappa); + + /* push and set phmc vars */ + pushPhmcVars(); + setPhmcVars(mnl); + + /* apply B */ + for(j = 0; j < mnl->MDPolyDegree/2; j++){ + assign(mnl->w_fields[0], mnl->w_fields[1], VOLUME/2); + Qtm_pm_sub_const_nrm_psi(mnl->w_fields[1], + mnl->w_fields[0], + mnl->MDPolyRoots[j]); + } + + mnl->energy1 = square_norm(mnl->w_fields[1], VOLUME/2,1); + + /* calculate evs */ + if (compute_evs != 0) { + no_eigenvalues=10; + eigenvalues(&no_eigenvalues, mnl->maxiter, eigenvalue_precision, + 0/* compute minimal evs*/, 0/*dont write evecs*/, nstore, mnl->even_odd_flag); + + no_eigenvalues=1; + eigenvalues(&no_eigenvalues, mnl->maxiter, eigenvalue_precision, + 1/* compute maximal evs*/, 0/*dont write evecs*/, nstore, mnl->even_odd_flag); + } + + + /* restore global phmc vars */ + popPhmcVars(); + + + /* return the energy differnce */ + g_mu = g_mu1; + boundary(g_kappa); + + + + if(g_proc_id 
== 0 && g_debug_level > 3) { + fprintf(stderr," Poly energy1 = %e \n" , mnl->energy1); + fprintf(stderr," Poly energy0 = %e \n" , mnl->energy0); + diff = mnl->energy1 - mnl->energy0; + fprintf(stderr," Poly energy diff = %e \n" , diff); + } + } + else { + if(g_proc_id == 0) { + fprintf(stderr,"Error: PHMC for light quarks not implementeted for non even/odd preconditioning\n"); + } + + g_mu = g_mu1; + boundary(g_kappa); + + return NAN; + } + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { + printf("called poly_acc for id %d dH = %1.10e\n", + id, mnl->energy1 - mnl->energy0); + } + } + return (mnl->energy1 - mnl->energy0); +} + +void poly_heatbath(const int id, hamiltonian_field_t * const hf){ + monomial * mnl = &monomial_list[id]; + int j; + double atime, etime; + atime = gettime(); + mnl->csg_n = 0; + mnl->csg_n2 = 0; + mnl->iter0 = 0; + mnl->iter1 = 0; + + g_mu = mnl->mu; + boundary(mnl->kappa); + + /* push and set phmc vars */ + pushPhmcVars(); + setPhmcVars(mnl); + + if(mnl->even_odd_flag) { + + + random_spinor_field_eo(mnl->w_fields[0], mnl->rngrepro, RN_GAUSS); + mnl->energy0 = square_norm(mnl->w_fields[0], VOLUME/2, 1); + + if(g_proc_id == 0 && g_debug_level > 3) { + fprintf(stderr," Poly energy0 = %e \n" , mnl->energy0); + } + + /* calculate the phmc hamiltonian */ + Qtm_pm_psi(mnl->w_fields[1], mnl->w_fields[0]); + + /* solve (Q+)*(Q-)*P((Q+)*(Q-)) *x=y */ + cg_her(mnl->w_fields[0], mnl->w_fields[1], + 1000,mnl->accprec,g_relative_precision_flag,VOLUME/2, Qtm_pm_Ptm_pm_psi); + + /* phi= Bdagger phi */ + for(j = 0; j < (mnl->MDPolyDegree/2); j++){ + assign(mnl->w_fields[1], mnl->w_fields[0], VOLUME/2); + Qtm_pm_sub_const_nrm_psi(mnl->w_fields[0], + mnl->w_fields[1], + mnl->MDPolyRoots[mnl->MDPolyDegree/2+j]); + } + + + if(mnl->MDPolyDetRatio==1){ + g_mu = mnl->mu2; + boundary(mnl->kappa2); + zero_spinor_field(mnl->pf,VOLUME/2); + 
mnl->iter0 = cg_her(mnl->w_fields[1], mnl->w_fields[0], mnl->maxiter, mnl->accprec, g_relative_precision_flag, + VOLUME/2, &Qtm_pm_psi); + Qtm_minus_psi(mnl->pf, mnl->w_fields[1]); + + chrono_add_solution(mnl->w_fields[1], mnl->csg_field, mnl->csg_index_array, + mnl->csg_N, &mnl->csg_n, VOLUME/2); + + } else { + /* store constructed phi field */ + assign(mnl->pf, mnl->w_fields[0], VOLUME/2); + } + + } + else { + /* not implemented */ + fprintf(stderr,"Error: non even odd preconditioned \"light\" phmc not implemented \n"); + } + + g_mu = g_mu1; + boundary(g_kappa); + popPhmcVars(); + + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { + printf("called poly_heatbath for id %d energy %f\n", id, mnl->energy0); + } + } + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/poly_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/poly_monomial.h new file mode 100644 index 0000000000000000000000000000000000000000..ce01e493f838495dbc917b5a5c7973ef13a531f2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/poly_monomial.h @@ -0,0 +1,30 @@ +/*********************************************************************** + * + * Copyright (C) 2010 Andreas Nube + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _POLY_MONOMIAL_H +#define _POLY_MONOMIAL_H + +#include "hamiltonian_field.h" + +void poly_derivative(const int id, hamiltonian_field_t * const hf); +double poly_acc(const int id, hamiltonian_field_t * const hf); +void poly_heatbath(const int id, hamiltonian_field_t * const hf); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/rat_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/rat_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..3e3aeef7cc8d11a1b39ee6e73d8ec0d4659fa14d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/rat_monomial.c @@ -0,0 +1,257 @@ +/*********************************************************************** + * + * Copyright (C) 2013 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "gettime.h" +#include "solver/solver.h" +#include "deriv_Sb.h" +#include "init/init_chi_spinor_field.h" +#include "operator/tm_operators.h" +#include "operator/tm_operators_nd.h" +#include "operator/Hopping_Matrix.h" +#include "monomial/monomial.h" +#include "hamiltonian_field.h" +#include "boundary.h" +#include "operator/clovertm_operators.h" +#include "operator/clover_leaf.h" +#include "rational/rational.h" +#include "phmc.h" +#include "rat_monomial.h" + + +/******************************************** + * + * Here \delta S_b is computed + * + ********************************************/ + +void rat_derivative(const int id, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[id]; + solver_pm_t solver_pm; + double atime, etime, dummy; + atime = gettime(); + g_mu = 0; + g_mu3 = 0.; + boundary(mnl->kappa); + + if(mnl->type == CLOVERRAT) { + g_c_sw = mnl->c_sw; + for(int i = 0; i < VOLUME; i++) { + for(int mu = 0; mu < 4; mu++) { + _su3_zero(swm[i][mu]); + _su3_zero(swp[i][mu]); + } + } + + // we compute the clover term (1 + T_ee(oo)) for all sites x + sw_term( (const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); + // we invert it for the even sites only + sw_invert(EE, 0.); + } + //mnl->forcefactor = mnl->EVMaxInv*mnl->EVMaxInv; + mnl->forcefactor = 1.; + + solver_pm.max_iter = mnl->maxiter; + solver_pm.squared_solver_prec = mnl->forceprec; + solver_pm.no_shifts = mnl->rat.np; + solver_pm.shifts = mnl->rat.mu; + solver_pm.rel_prec = g_relative_precision_flag; + solver_pm.type = CGMMS; + solver_pm.M_psi = mnl->Qsq; + solver_pm.sdim = VOLUME/2; + // this generates all X_j,o (odd sites only) -> g_chi_up_spinor_field + mnl->iter1 += cg_mms_tm(g_chi_up_spinor_field, mnl->pf, + &solver_pm, &dummy); + + 
for(int j = (mnl->rat.np-1); j > -1; j--) { + mnl->Qp(mnl->w_fields[0], g_chi_up_spinor_field[j]); + if(mnl->type == CLOVERRAT) { + // apply Hopping Matrix M_{eo} + // to get the even sites of X_e + H_eo_sw_inv_psi(mnl->w_fields[2], g_chi_up_spinor_field[j], EO, -1, mnl->mu); + // \delta Q sandwitched by Y_o^\dagger and X_e + deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, + mnl->rat.rmu[j]*mnl->forcefactor); + + // to get the even sites of Y_e + H_eo_sw_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1, mnl->mu); + // \delta Q sandwitched by Y_e^\dagger and X_o + // uses the gauge field in hf and changes the derivative fields in hf + deriv_Sb(EO, mnl->w_fields[3], g_chi_up_spinor_field[j], hf, + mnl->rat.rmu[j]*mnl->forcefactor); + + // even/even sites sandwiched by gamma_5 Y_e and gamma_5 X_e + sw_spinor(EE, mnl->w_fields[2], mnl->w_fields[3], mnl->rat.rmu[j]*mnl->forcefactor); + + // odd/odd sites sandwiched by gamma_5 Y_o and gamma_5 X_o + sw_spinor(OO, mnl->w_fields[0], g_chi_up_spinor_field[j], mnl->rat.rmu[j]*mnl->forcefactor); + + } + else { + /* apply Hopping Matrix M_{eo} */ + /* to get the even sites of X_e */ + H_eo_tm_inv_psi(mnl->w_fields[2], g_chi_up_spinor_field[j], EO, -1.); + /* \delta Q sandwitched by Y_o^\dagger and X_e */ + deriv_Sb(OE, mnl->w_fields[0], mnl->w_fields[2], hf, + mnl->rat.rmu[j]*mnl->forcefactor); + + /* to get the even sites of Y_e */ + H_eo_tm_inv_psi(mnl->w_fields[3], mnl->w_fields[0], EO, +1); + /* \delta Q sandwitched by Y_e^\dagger and X_o */ + deriv_Sb(EO, mnl->w_fields[3], g_chi_up_spinor_field[j], hf, + mnl->rat.rmu[j]*mnl->forcefactor); + } + } + if(mnl->type == CLOVERRAT && mnl->trlog) { + sw_deriv(EE, 0.); + } + if(mnl->type == CLOVERRAT) { + sw_all(hf, mnl->kappa, mnl->c_sw); + } + etime = gettime(); + if(g_debug_level > 1 && g_proc_id == 0) { + printf("# Time for %s monomial derivative: %e s\n", mnl->name, etime-atime); + } + return; +} + + +void rat_heatbath(const int id, hamiltonian_field_t * const hf) { + 
monomial * mnl = &monomial_list[id]; + solver_pm_t solver_pm; + double atime, etime, dummy; + atime = gettime(); + // only for non-twisted operators + g_mu = 0.; + g_mu3 = 0.; + boundary(mnl->kappa); + + mnl->iter1 = 0; + g_mu3 = 0.; + if(mnl->type == CLOVERRAT) { + g_c_sw = mnl->c_sw; + init_sw_fields(); + sw_term((const su3**)hf->gaugefield, mnl->kappa, mnl->c_sw); + sw_invert(EE, 0.); + } + // we measure before the trajectory! + if((mnl->rec_ev != 0) && (hf->traj_counter%mnl->rec_ev == 0)) { + //if(mnl->type != CLOVERRAT) phmc_compute_ev(hf->traj_counter-1, id, &Qtm_pm_ndbipsi); + //else phmc_compute_ev(hf->traj_counter-1, id, &Qsw_pm_ndbipsi); + } + + // the Gaussian distributed random fields + mnl->energy0 = 0.; + random_spinor_field_eo(mnl->pf, mnl->rngrepro, RN_GAUSS); + mnl->energy0 = square_norm(mnl->pf, VOLUME/2, 1); + + // set solver parameters + solver_pm.max_iter = mnl->maxiter; + solver_pm.squared_solver_prec = mnl->accprec; + solver_pm.no_shifts = mnl->rat.np; + solver_pm.shifts = mnl->rat.nu; + solver_pm.type = CGMMS; + solver_pm.M_psi = mnl->Qsq; + solver_pm.sdim = VOLUME/2; + solver_pm.rel_prec = g_relative_precision_flag; + mnl->iter0 = cg_mms_tm(g_chi_up_spinor_field, mnl->pf, + &solver_pm, &dummy); + + assign(mnl->w_fields[2], mnl->pf, VOLUME/2); + + // apply C to the random field to generate pseudo-fermion fields + for(int j = (mnl->rat.np-1); j > -1; j--) { + // Q - i nu_j (not twisted mass term, so Qp=Qm=Q + mnl->Qp(g_chi_up_spinor_field[mnl->rat.np], g_chi_up_spinor_field[j]); + assign_add_mul(g_chi_up_spinor_field[mnl->rat.np], g_chi_up_spinor_field[j], -I*mnl->rat.nu[j], VOLUME/2); + assign_add_mul(mnl->pf, g_chi_up_spinor_field[mnl->rat.np], I*mnl->rat.rnu[j], VOLUME/2); + } + + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { + printf("called rat_heatbath for id %d energy %f\n", id, mnl->energy0); + } + } + 
return; +} + + +double rat_acc(const int id, hamiltonian_field_t * const hf) { + solver_pm_t solver_pm; + monomial * mnl = &monomial_list[id]; + double atime, etime, dummy; + atime = gettime(); + // only for non-twisted operators + g_mu = 0.; + g_mu3 = 0.; + boundary(mnl->kappa); + if(mnl->type == CLOVERRAT) { + g_c_sw = mnl->c_sw; + sw_term((const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); + sw_invert(EE, 0.); + } + mnl->energy1 = 0.; + + solver_pm.max_iter = mnl->maxiter; + solver_pm.squared_solver_prec = mnl->accprec; + solver_pm.no_shifts = mnl->rat.np; + solver_pm.shifts = mnl->rat.mu; + solver_pm.type = CGMMS; + solver_pm.M_psi = mnl->Qsq; + solver_pm.sdim = VOLUME/2; + solver_pm.rel_prec = g_relative_precision_flag; + mnl->iter0 += cg_mms_tm(g_chi_up_spinor_field, mnl->pf, + &solver_pm, &dummy); + + // apply R to the pseudo-fermion fields + assign(mnl->w_fields[0], mnl->pf, VOLUME/2); + for(int j = (mnl->rat.np-1); j > -1; j--) { + assign_add_mul_r(mnl->w_fields[0], g_chi_up_spinor_field[j], + mnl->rat.rmu[j], VOLUME/2); + } + + mnl->energy1 = scalar_prod_r(mnl->pf, mnl->w_fields[0], VOLUME/2, 1); + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 0) { // shoud be 3 + printf("called rat_acc for id %d dH = %1.10e\n", id, mnl->energy1 - mnl->energy0); + } + } + return(mnl->energy1 - mnl->energy0); +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/rat_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/rat_monomial.h new file mode 100644 index 0000000000000000000000000000000000000000..7da7b333d123fb1cfbc5fb988216f406b31ee070 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/rat_monomial.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * + * Copyright (C) 2013 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ +#ifndef _RAT_MONOMIAL_H +#define _RAT_MONOMIAL_H + +#include "hamiltonian_field.h" + +void rat_derivative(const int id, hamiltonian_field_t * const hf); +double rat_acc(const int id, hamiltonian_field_t * const hf); +void rat_heatbath(const int id, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ratcor_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ratcor_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..a7354c3605c90a9d514208e98c47046e9e44100a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ratcor_monomial.c @@ -0,0 +1,275 @@ +/*********************************************************************** + * + * Copyright (C) 2013 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "gettime.h" +#include "solver/solver.h" +#include "deriv_Sb.h" +#include "init/init_chi_spinor_field.h" +#include "operator/tm_operators.h" +#include "operator/tm_operators_nd.h" +#include "operator/Hopping_Matrix.h" +#include "monomial/monomial.h" +#include "hamiltonian_field.h" +#include "boundary.h" +#include "operator/clovertm_operators.h" +#include "operator/clover_leaf.h" +#include "rational/rational.h" +#include "phmc.h" +#include "ratcor_monomial.h" + +// computes ||(1 - C^dagger R C) phi|| +void check_C_psi(spinor * const k_up, spinor * const l_up, + const int id, hamiltonian_field_t * const hf, + solver_pm_t * solver_pm); + +// applies (Q^2 R^2 -1) phi +double apply_Z_psi(spinor * const k_up, spinor * const l_up, + const int id, hamiltonian_field_t * const hf, + solver_pm_t * solver_pm); + + + +void ratcor_heatbath(const int id, hamiltonian_field_t * const hf) { + monomial * mnl = &monomial_list[id]; + solver_pm_t solver_pm; + double atime, etime, delta; + spinor * up0, * up1, * tup; + double coefs[6] = {1./4., -3./32., 7./122., -77./2048., 231./8192., -1463./65536.}; + atime = gettime(); + nd_set_global_parameter(mnl); + g_mu = 0.; + g_mu3 = 0.; + g_kappa = mnl->kappa; + mnl->iter0 = 0; + boundary(mnl->kappa); + if(mnl->type == CLOVERRATCOR) { + g_c_sw = mnl->c_sw; + init_sw_fields(); + sw_term((const su3**)hf->gaugefield, mnl->kappa, mnl->c_sw); + sw_invert(EE, mnl->mu); + } + // we measure before the trajectory! 
+ if((mnl->rec_ev != 0) && (hf->traj_counter%mnl->rec_ev == 0)) { + //if(mnl->type != NDCLOVERRAT) phmc_compute_ev(hf->traj_counter-1, id, &Qtm_pm_ndbipsi); + //else phmc_compute_ev(hf->traj_counter-1, id, &Qsw_pm_ndbipsi); + } + + // the Gaussian distributed random fields + mnl->energy0 = 0.; + random_spinor_field_eo(mnl->pf, mnl->rngrepro, RN_GAUSS); + mnl->energy0 = square_norm(mnl->pf, VOLUME/2, 1); + + solver_pm.max_iter = mnl->maxiter; + solver_pm.squared_solver_prec = mnl->accprec; + solver_pm.no_shifts = mnl->rat.np; + solver_pm.shifts = mnl->rat.mu; + solver_pm.type = CGMMS; + solver_pm.M_psi = mnl->Qsq; + solver_pm.sdim = VOLUME/2; + solver_pm.rel_prec = g_relative_precision_flag; + + // apply B to the random field to generate pseudo-fermion fields + assign(mnl->w_fields[0], mnl->pf, VOLUME/2); + up0 = mnl->w_fields[0]; + up1 = mnl->w_fields[2]; + + for(int i = 1; i < 8; i++) { + delta = apply_Z_psi(up1, up0, id, hf, &solver_pm); + assign_add_mul_r(mnl->pf, up1, coefs[i-1], VOLUME/2); + if(delta < mnl->accprec) break; + tup = up0; + up0 = up1; + up1 = tup; + } + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial heatbath: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { + printf("called ratcor_heatbath for id %d energy %f\n", id, mnl->energy0); + } + } + return; +} + + +double ratcor_acc(const int id, hamiltonian_field_t * const hf) { + solver_pm_t solver_pm; + monomial * mnl = &monomial_list[id]; + double atime, etime, delta; + spinor * up0, * up1, * tup; + double coefs[6] = {-1./2., 3./8., -5./16., 35./128., -63./256., 231./1024.}; + atime = gettime(); + nd_set_global_parameter(mnl); + g_mu = 0.; + g_mu3 = 0.; + g_kappa = mnl->kappa; + boundary(mnl->kappa); + if(mnl->type == CLOVERRATCOR) { + g_c_sw = mnl->c_sw; + sw_term((const su3**) hf->gaugefield, mnl->kappa, mnl->c_sw); + sw_invert(EE, mnl->mu); + } + mnl->energy1 = 0.; + + solver_pm.max_iter = mnl->maxiter; + 
solver_pm.squared_solver_prec = mnl->accprec; + solver_pm.no_shifts = mnl->rat.np; + solver_pm.shifts = mnl->rat.mu; + solver_pm.type = CGMMS; + solver_pm.M_psi = mnl->Qsq; + solver_pm.sdim = VOLUME/2; + solver_pm.rel_prec = g_relative_precision_flag; + + // apply (Q R)^(-1) to pseudo-fermion fields + assign(mnl->w_fields[4], mnl->pf, VOLUME/2); + up0 = mnl->w_fields[0]; + up1 = mnl->w_fields[2]; + + delta = apply_Z_psi(up0, mnl->pf, id, hf, &solver_pm); + assign_add_mul_r(mnl->w_fields[4], up0, coefs[0], VOLUME/2); + + for(int i = 2; i < 8; i++) { + if(delta < mnl->accprec) break; + delta = apply_Z_psi(up1, up0, id, hf, &solver_pm); + assign_add_mul_r(mnl->w_fields[4], up1, coefs[i-1], VOLUME/2); + tup = up0; + up0 = up1; + up1 = tup; + } + + mnl->energy1 = scalar_prod_r(mnl->pf, mnl->w_fields[4], VOLUME/2, 1); + etime = gettime(); + if(g_proc_id == 0) { + if(g_debug_level > 1) { + printf("# Time for %s monomial acc step: %e s\n", mnl->name, etime-atime); + } + if(g_debug_level > 3) { // shoud be 3 + printf("called ratcor_acc for id %d dH = %1.10e\n", id, mnl->energy1 - mnl->energy0); + } + } + return(mnl->energy1 - mnl->energy0); +} + +// applies ((Q_h\tau_1 * R)^2 - 1) + +double apply_Z_psi(spinor * const k_up, spinor * const l_up, + const int id, hamiltonian_field_t * const hf, + solver_pm_t * solver_pm) { + monomial * mnl = &monomial_list[id]; + double dummy; + + mnl->iter0 += cg_mms_tm(g_chi_up_spinor_field, l_up, + solver_pm, &dummy); + + // apply R to the pseudo-fermion fields + assign(k_up, l_up, VOLUME/2); + for(int j = (mnl->rat.np-1); j > -1; j--) { + assign_add_mul_r(k_up, g_chi_up_spinor_field[j], + mnl->rat.rmu[j], VOLUME/2); + } + + // apply R a second time + cg_mms_tm(g_chi_up_spinor_field, k_up, + solver_pm, &dummy); + for(int j = (mnl->rat.np-1); j > -1; j--) { + assign_add_mul_r(k_up, g_chi_up_spinor_field[j], + mnl->rat.rmu[j], VOLUME/2); + } + mul_r(g_chi_up_spinor_field[mnl->rat.np], mnl->rat.A*mnl->rat.A, + k_up, VOLUME/2); + + // apply Q^2 
and compute the residue + solver_pm->M_psi(k_up, g_chi_up_spinor_field[mnl->rat.np]); + diff(k_up, k_up, l_up, VOLUME/2); + double resi = square_norm(k_up, VOLUME/2, 1); + if(g_debug_level > 2 && g_proc_id == 0) { + printf("# RATCOR: ||Z * phi|| = %e\n", resi); + } + return(resi); +} + +// computes ||(1 - C^dagger R C) phi|| + +void check_C_psi(spinor * const k_up, spinor * const l_up, + const int id, hamiltonian_field_t * const hf, + solver_pm_t * solver_pm) { + monomial * mnl = &monomial_list[id]; + double dummy; + mnl->iter0 = cg_mms_tm(g_chi_up_spinor_field, l_up, solver_pm, &dummy); + + assign(k_up, l_up, VOLUME/2); + + // apply C to the random field to generate pseudo-fermion fields + for(int j = (mnl->rat.np-1); j > -1; j--) { + if(mnl->type == CLOVERRATCOR || mnl->type == CLOVERRAT) { + //Qsw_plus_psi(g_chi_up_spinor_field[mnl->rat.np], g_chi_up_spinor_field[j], + // I*mnl->rat.nu[j], 1., mnl->EVMaxInv); + } + else { + //Q_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np], + // g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], + // I*mnl->rat.nu[j], 1., mnl->EVMaxInv); + } + assign_add_mul(k_up, g_chi_up_spinor_field[mnl->rat.np], I*mnl->rat.rnu[j], VOLUME/2); + } + //apply R + solver_pm->shifts = mnl->rat.mu; + cg_mms_tm(g_chi_up_spinor_field, k_up, + solver_pm, &dummy); + for(int j = (mnl->rat.np-1); j > -1; j--) { + assign_add_mul_r(k_up, g_chi_up_spinor_field[j], + mnl->rat.rmu[j], VOLUME/2); + } + // apply C^dagger + solver_pm->shifts = mnl->rat.nu; + cg_mms_tm(g_chi_up_spinor_field, k_up, + solver_pm, &dummy); + for(int j = (mnl->rat.np-1); j > -1; j--) { + if(mnl->type == NDCLOVERRATCOR || mnl->type == NDCLOVERRAT) { + //Qsw_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], g_chi_dn_spinor_field[mnl->rat.np], + // g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], + // -I*mnl->rat.nu[j], 1., mnl->EVMaxInv); + } + else { + //Q_tau1_sub_const_ndpsi(g_chi_up_spinor_field[mnl->rat.np], 
g_chi_dn_spinor_field[mnl->rat.np], + // g_chi_up_spinor_field[j], g_chi_dn_spinor_field[j], + // -I*mnl->rat.nu[j], 1., mnl->EVMaxInv); + } + assign_add_mul(k_up, g_chi_up_spinor_field[mnl->rat.np], -I*mnl->rat.rnu[j], VOLUME/2); + } + diff(k_up, k_up, l_up, VOLUME/2); + double resi = square_norm(k_up, VOLUME/2, 1); + if(g_proc_id == 0) printf("|| (1-C^dagger R C)*phi|| = %e\n", resi); + + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ratcor_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ratcor_monomial.h new file mode 100644 index 0000000000000000000000000000000000000000..b3cd778b2e32f1879000ef95592013c604e08bdb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/ratcor_monomial.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * + * Copyright (C) 2013 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _RATCOR_MONOMIAL_H +#define _RATCOR_MONOMIAL_H + +#include "hamiltonian_field.h" + +double ratcor_acc(const int id, hamiltonian_field_t * const hf); +void ratcor_heatbath(const int id, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/sf_gauge_monomial.c b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/sf_gauge_monomial.c new file mode 100644 index 0000000000000000000000000000000000000000..075f62c49c9afb93f6679bc35dfbb477094f311c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/sf_gauge_monomial.c @@ -0,0 +1,162 @@ +/*********************************************************************** + * + * Jenifer Gonzalez Lopez + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "su3adj.h" +#include "ranlxd.h" +#include "sse.h" +#include "start.h" +#include "get_rectangle_staples.h" +#include "gamma.h" +#include "get_staples.h" +#include "read_input.h" +#include "measure_gauge_action.h" +#include "measure_rectangles.h" +#include "monomial/monomial.h" +#include "sf_gauge_monomial.h" +#include "hamiltonian_field.h" +#include "sf_utils.h" + +void sf_gauge_derivative(const int id, hamiltonian_field_t * const hf) { + + int i, mu; + static su3 v, w; + su3 *z; + su3adj *xm; + monomial * mnl = &monomial_list[id]; + double factor = -1. * g_beta/3.0; + + + if(mnl->use_rectangles) { + factor = -mnl->c0 * g_beta/3.0; + } + + for(i = 0; i < VOLUME; i++) { + for(mu=0;mu<4;mu++) { + z=&hf->gaugefield[i][mu]; + xm=&hf->derivative[i][mu]; + get_staples(&v,i,mu, (const su3**) hf->gaugefield); + _su3_times_su3d(w,*z,v); + _trace_lambda_mul_add_assign((*xm), factor, w); + + if(mnl->use_rectangles) { + get_rectangle_staples(&v, i, mu); + _su3_times_su3d(w, *z, v); + _trace_lambda_mul_add_assign((*xm), factor*mnl->c1/mnl->c0, w); + } + } + } + return; +} + +void sf_gauge_heatbath( const int id, hamiltonian_field_t * const hf) +{ + monomial* mnl = &(monomial_list[id]); + + if( mnl->use_rectangles ){ mnl->c0 = 1. 
- 8.*mnl->c1; } + + mnl->energy0 = g_beta * ( mnl->c0 * measure_gauge_action(hf->gaugefield) ); + + if(mnl->use_rectangles) { + mnl->energy0 += g_beta*(mnl->c1 * measure_rectangles(hf->gaugefield)); + } + if(g_proc_id == 0 && g_debug_level > 3) { + printf("called gauge_heatbath for id %d %d\n", id, mnl->even_odd_flag); + } +} + +double sf_gauge_acc( const int id, hamiltonian_field_t * const hf) +{ + monomial* mnl = &(monomial_list[id]); + double sq_plaq = 0; + double sq_bulk_plaq = 0; + double sq_boundary_space_space_plaq = 0; + double sq_boundary_space_time_plaq = 0; + double sq_wrapped_plaq = 0; + + double rect_plaq = 0; + + sq_plaq = calc_sq_plaq(); + sq_bulk_plaq = calc_bulk_sq_plaq(); + sq_boundary_space_space_plaq = calc_boundary_space_space_sq_plaq(); + sq_boundary_space_time_plaq = calc_boundary_space_time_sq_plaq(); + sq_wrapped_plaq = calc_wrapped_sq_plaq(); + + rect_plaq = calc_rect_plaq(); + + #if 1 + { + fprintf( stderr, "sq_plaq = %e\n", sq_plaq ); + fprintf( stderr, "beta * c0 * sq_plaq = %e\n", g_beta * mnl->c0 * sq_plaq ); + + fprintf( stderr, "sq_bulk_plaq = %e\n", sq_bulk_plaq ); + fprintf( stderr, "beta * c0 * sq_bulk_plaq = %e\n", g_beta * mnl->c0 * sq_bulk_plaq ); + + fprintf( stderr, "sq_wrapped_plaq = %e\n", sq_wrapped_plaq ); + fprintf( stderr, "beta * c0 * sq_wrapped_plaq = %e\n", g_beta * mnl->c0 * sq_wrapped_plaq ); + + fprintf( stderr, "rect_plaq = %e\n", rect_plaq ); + fprintf( stderr, "beta * c1 * rect_plaq = %e\n", g_beta * mnl->c1 * rect_plaq ); + + fprintf( stderr, "bulk + bound(ss) + bound(st) + wrapped = %e + %e + %e + %e = %e =?= %e = total\n", + sq_bulk_plaq, sq_boundary_space_space_plaq, sq_boundary_space_time_plaq, sq_wrapped_plaq, + sq_bulk_plaq + sq_boundary_space_space_plaq + sq_boundary_space_time_plaq + sq_wrapped_plaq, sq_plaq ); + + fprintf( stderr, "my energy = %e\n", g_beta * ( mnl->c0 * sq_plaq + mnl->c1 * rect_plaq ) ); + } + #endif + + /*mnl->energy1 = g_beta*( mnl->c0 * measure_gauge_action() );*/ + + /* The bulk 
contribution is the same. */ + mnl->energy1 = g_beta * mnl->c0 * sq_bulk_plaq; + + /* The space-time boundary contribution must be weighted differently. */ + fprintf( stderr, "mnl->ct = %e\n", mnl->ct ); + mnl->energy1 += g_beta * mnl->c0 * mnl->ct * sq_boundary_space_time_plaq; + + /* The space-space boundary contribution must be weighted differently. */ + fprintf( stderr, "mnl->cs = %e\n", mnl->cs ); + mnl->energy1 += g_beta * mnl->c0 * mnl->cs * sq_boundary_space_space_plaq; + + /* Include the missing plaquettes if requested. */ + if( g_sf_inc_wrap_sq == 1 ){ mnl->energy1 += g_beta * mnl->c0 * sq_wrapped_plaq; } + + if( mnl->use_rectangles ) + { + mnl->energy1 += g_beta*( mnl->c1 * measure_rectangles(hf->gaugefield) ); + } + fprintf( stderr, "mnl->energy1 = %e\n", mnl->energy1 ); + + if( ( g_proc_id == 0 ) & ( g_debug_level > 3 ) ) + { + printf( "called sf_gauge_acc for id %d %d dH = %1.10e\n", + id, mnl->even_odd_flag, mnl->energy0 - mnl->energy1 ); + } + + return ( mnl->energy0 - mnl->energy1 ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/sf_gauge_monomial.h b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/sf_gauge_monomial.h new file mode 100644 index 0000000000000000000000000000000000000000..dd139c1306e7bade4200ef89717f927c7e972513 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/monomial/sf_gauge_monomial.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * + * Jenifer Gonzalez Lopez + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ +#ifndef _SFGAUGE_MONOMIAL_H +#define _SFGAUGE_MONOMIAL_H + +#include "hamiltonian_field.h" +/* Interface of the sf_gauge monomial. Every entry point takes the monomial index `id` and the Hamiltonian field `hf`. sf_gauge_acc() returns mnl->energy0 - mnl->energy1 for monomial_list[id]; the other two return nothing. */ +void sf_gauge_derivative(const int id, hamiltonian_field_t * const hf); +void sf_gauge_heatbath(const int id, hamiltonian_field_t * const hf); +double sf_gauge_acc(const int id, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/mpi_init.c b/qcd/part_cpu/applications/QCD/src/kernel_D/mpi_init.c new file mode 100644 index 0000000000000000000000000000000000000000..f691ef89a5f7697e41a68aab48fc84617c2e92cd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/mpi_init.c @@ -0,0 +1,802 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/ +/* NOTE(review): the bare `#include` lines below have lost their <...> header names (apparently stripped during extraction) -- restore them from the upstream file before compiling. */ +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#ifdef MPI +# include +#endif +#ifdef _USE_SHMEM +# include +#endif +#include "global.h" +#include "read_input.h" +#include "mpi_init.h" + +#ifdef MPI +/* Datatypes for the data exchange. These are the definitions; the types are built in tmlqcd_mpi_init() below. Naming as used there: *_cont types are generally contiguous blocks (the layout of the external, received boundary), *_gath types are generally strided MPI_Type_vector gathers of the internal boundary, and the *32 variants are built on MPI_FLOAT instead of MPI_DOUBLE. Not every type defined here is built in tmlqcd_mpi_init(). */ +MPI_Datatype mpi_su3; +MPI_Datatype gauge_point; +MPI_Datatype gauge_time_slice_cont; +MPI_Datatype gauge_time_slice_split; +MPI_Datatype deri_point; +MPI_Datatype deri_time_slice_cont; +MPI_Datatype deri_time_slice_split; + +MPI_Datatype field_point; +MPI_Datatype field_point32; +MPI_Datatype field_time_slice_cont; +MPI_Datatype lfield_time_slice_cont; +MPI_Datatype lfield_time_slice_cont32; +MPI_Datatype gauge_x_slice_cont; +MPI_Datatype gauge_x_subslice; +MPI_Datatype gauge_x_slice_gath; +MPI_Datatype field_x_slice_cont; +MPI_Datatype field_x_subslice; +MPI_Datatype field_x_slice_gath; +MPI_Datatype lfield_x_slice_cont; +MPI_Datatype lfield_x_slice_cont32; +MPI_Datatype lfield_x_subslice; +MPI_Datatype lfield_x_subslice32; +MPI_Datatype lfield_x_slice_gath; +MPI_Datatype lfield_x_slice_gath32; +MPI_Datatype deri_x_slice_cont; +MPI_Datatype deri_x_subslice; +MPI_Datatype deri_x_slice_gath; +MPI_Datatype gauge_xt_edge_cont; +MPI_Datatype gauge_xt_edge_gath; +MPI_Datatype deri_xt_edge_cont; + +MPI_Datatype gauge_y_slice_gath; +MPI_Datatype gauge_y_slice_cont; +MPI_Datatype gauge_y_subslice; + +MPI_Datatype field_y_slice_gath; +MPI_Datatype field_y_slice_cont; +MPI_Datatype field_y_subslice; +MPI_Datatype lfield_y_slice_gath; +MPI_Datatype lfield_y_slice_gath32; +MPI_Datatype lfield_y_slice_cont; +MPI_Datatype lfield_y_slice_cont32; +MPI_Datatype lfield_y_subslice; +MPI_Datatype lfield_y_subslice32; + +MPI_Datatype field_z_slice_gath; +MPI_Datatype field_z_subslice; +MPI_Datatype field_z_slice_cont; +MPI_Datatype lfield_z_slice_gath; +MPI_Datatype lfield_z_slice_gath32; +MPI_Datatype lfield_z_slice_cont; +MPI_Datatype lfield_z_slice_cont32;
+MPI_Datatype field_z_slice_half; + +MPI_Datatype deri_y_slice_cont; +MPI_Datatype deri_y_subslice; +MPI_Datatype deri_y_slice_gath; + +MPI_Datatype gauge_yx_edge_cont; +MPI_Datatype gauge_yx_edge_gath; +MPI_Datatype deri_yx_edge_cont; + +MPI_Datatype gauge_ty_edge_cont; +MPI_Datatype gauge_ty_edge_gath; +MPI_Datatype deri_ty_edge_cont; + +MPI_Datatype gauge_z_slice_gath; +MPI_Datatype gauge_z_slice_cont; +MPI_Datatype gauge_z_subslice; + +MPI_Datatype deri_z_slice_cont; +MPI_Datatype deri_z_subslice; +MPI_Datatype deri_z_slice_gath; + +MPI_Datatype gauge_zx_edge_cont; +MPI_Datatype gauge_zx_edge_gath; +MPI_Datatype deri_zx_edge_cont; + +MPI_Datatype gauge_tz_edge_cont; +MPI_Datatype gauge_tz_edge_gath; +MPI_Datatype deri_tz_edge_cont; + +MPI_Datatype gauge_zy_edge_cont; +MPI_Datatype gauge_zy_edge_gath; +MPI_Datatype deri_zy_edge_cont; + +MPI_Datatype halffield_point; +MPI_Datatype halffield_time_slice_cont; + +MPI_Datatype halffield_x_slice_cont; +MPI_Datatype halffield_x_subslice; +MPI_Datatype halffield_x_slice_gath; + +MPI_Datatype halffield_y_slice_cont; +MPI_Datatype halffield_y_subslice; +MPI_Datatype halffield_y_slice_gath; + +MPI_Datatype halffield_z_slice_cont; + + +#ifdef _USE_TSPLITPAR +MPI_Datatype field_xt_slice_int; +MPI_Datatype field_xt_slice_ext; +MPI_Datatype field_yt_slice_int; +MPI_Datatype field_yt_slice_ext; +# ifdef PARALLELXYZ +MPI_Datatype field_zt_slice_ext_L; +MPI_Datatype field_zt_slice_ext_S; +MPI_Datatype field_zt_slice_even_dn_et; +MPI_Datatype field_zt_slice_even_up_et; +MPI_Datatype field_zt_slice_odd_dn_et; +MPI_Datatype field_zt_slice_odd_up_et; +MPI_Datatype field_zt_slice_even_dn_ot; +MPI_Datatype field_zt_slice_even_up_ot; +MPI_Datatype field_zt_slice_odd_dn_ot; +MPI_Datatype field_zt_slice_odd_up_ot; +# endif +#endif +#ifdef WITHLAPH +MPI_Datatype su3vect_point; +MPI_Datatype jfield_x_slice_cont; +MPI_Datatype jfield_y_slice_cont; +MPI_Datatype jfield_z_slice_cont; +MPI_Datatype jfield_x_slice_gath; +MPI_Datatype
jfield_y_slice_gath; +MPI_Datatype jfield_z_slice_gath; +MPI_Datatype jfield_y_subslice; +#endif + +#if ( defined PARALLELXYZT || defined PARALLELXYZ ) +MPI_Datatype field_z_slice_even_dn; +MPI_Datatype field_z_slice_even_up; +MPI_Datatype field_z_slice_odd_dn; +MPI_Datatype field_z_slice_odd_up; + +# if (!defined _INDEX_INDEP_GEOM) +spinor * field_buffer_z ALIGN; +spinor * field_buffer_z2 ALIGN; +spinor * field_buffer_z3 ALIGN; +spinor * field_buffer_z4 ALIGN; +halfspinor * halffield_buffer_z ALIGN; +halfspinor * halffield_buffer_z2 ALIGN; +# endif +#endif + +MPI_Op mpi_reduce_su3_ray; +/* User function behind mpi_reduce_su3_ray (registered with MPI_Op_create, commute = 0, in tmlqcd_mpi_init). For each of the *len elements it evaluates _su3_times_su3(tmp, u_i[n], u_io[n]) and stores tmp back into u_io[n]. Calls MPI_Abort if *dt is not mpi_su3. */ +void reduce_su3_ray( + void *u_i /* in */, + void *u_io /* in/out */, + int *len /* in */, + MPI_Datatype *dt /* in */) { + + int n; + su3 *u, *v, tmp; + u = (su3 *)u_i; + v = (su3 *)u_io; + + if(*dt != mpi_su3) { + fprintf(stderr, "\nInvalid datatype for reduce_su3_ray(); abort.\n"); + MPI_Abort(MPI_COMM_WORLD, 1); + } + for(n=0; n<*len; n++) { + _su3_times_su3(tmp,*(u+n),*(v+n)) + _su3_assign(*(v+n),tmp) + } +} + +#endif + +/* tmlqcd_mpi_init: set up the process geometry. With MPI it (1) fixes the process-grid constraints from the PARALLEL* macros and lets MPI_Dims_create choose the rest, (2) aborts if the lattice does not divide evenly over the grid, (3) derives the local extents, VOLUME, RAND and EDGES unless FIXEDVOLUME is defined, (4) allocates the z-direction exchange buffers, (5) creates the periodic Cartesian communicator g_cart_grid and the neighbour ranks in g_nb_list, (6) builds the MPI derived datatypes for gauge, spinor, derivative and halfspinor boundaries, (7) splits g_cart_grid into the time / z / SV / ST slice communicators and (8) registers mpi_reduce_su3_ray. Without MPI it sets the single-process defaults. Finally it exits if LZ (and, for z-parallel builds, T*LX*LY) is odd while even_odd_flag == 1. */ +void tmlqcd_mpi_init(void) { + int i; +#ifdef MPI + int periods[] = {1,1,1,1}; + int dims[] = {0,0,0,0}; + int ndims = 0; + int nalldims = 4; + int reorder = 1, namelen; + char processor_name[MPI_MAX_PROCESSOR_NAME]; +#endif + g_proc_coords[0] = 0; + g_proc_coords[1] = 0; + g_proc_coords[2] = 0; + g_proc_coords[3] = 0; + for(i = 0; i < 8; i++) { + g_nb_list[i] = 0; + } + + +#ifdef MPI +# ifdef _USE_SHMEM + /* we need that the PE number in MPI_COMM_WORLD */ + /* exactly corresponds to the one in g_cart_grid */ + reorder = 0; +# endif + +# ifndef FIXEDVOLUME + N_PROC_T=0; /* the other N_PROC_?
are read from input, if not constrained below */ + /* N_PROC_T will be set by MPI_Dims_create, if not constrained below */ +# endif + +# if defined PARALLELT + ndims = 1; +# ifndef FIXEDVOLUME + N_PROC_X = 1; + N_PROC_Y = 1; + N_PROC_Z = 1; +# endif +# endif +# if defined PARALLELX + ndims = 1; +# ifndef FIXEDVOLUME + N_PROC_T = 1; + N_PROC_Y = 1; + N_PROC_Z = 1; +# endif +# endif +# if defined PARALLELXT + ndims = 2; +# ifndef FIXEDVOLUME + N_PROC_Y = 1; + N_PROC_Z = 1; +# endif +# endif +# if defined PARALLELXY + ndims = 2; +# ifndef FIXEDVOLUME + N_PROC_T = 1; + N_PROC_Z = 1; +# endif +# endif +# if defined PARALLELXYT + ndims = 3; +# ifndef FIXEDVOLUME + N_PROC_Z = 1; +# endif +# endif +# if defined PARALLELXYZ + ndims = 3; +# ifndef FIXEDVOLUME + N_PROC_T = 1; +# endif +# endif +# if defined PARALLELXYZT + ndims = 4; +# endif + dims[0] = N_PROC_T; + dims[1] = N_PROC_X; + dims[2] = N_PROC_Y; + dims[3] = N_PROC_Z; + + /* Entries of dims that are still 0 are chosen by MPI_Dims_create; nonzero entries are kept as constraints. */ + MPI_Comm_size(MPI_COMM_WORLD, &g_nproc); + MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id); + MPI_Get_processor_name(processor_name, &namelen); + MPI_Dims_create(g_nproc, nalldims, dims); + if(g_proc_id == 0){ + printf("# Creating the following cartesian grid for a %d dimensional parallelisation:\n# %d x %d x %d x %d\n" + , ndims, dims[0], dims[1], dims[2], dims[3]); + } + + g_nproc_t = dims[0]; + g_nproc_x = dims[1]; + g_nproc_y = dims[2]; + g_nproc_z = dims[3]; + + if( (g_nproc_t < 1 || g_nproc_x < 1 || g_nproc_y < 1 || g_nproc_z < 1) || + (LX%g_nproc_x != 0 || LY%g_nproc_y != 0 || LZ%g_nproc_z != 0 || T_global%g_nproc_t != 0) ) { + if(g_proc_id == 0) { + fprintf(stderr, "The lattice cannot be properly mapped on the processor grid\n"); + fprintf(stderr, "Please check your number of processors and the Nr?Procs input variables\n"); + fprintf(stderr, "Aborting...!\n"); + } + MPI_Abort(MPI_COMM_WORLD, 1); + MPI_Finalize(); + exit(-1); + } + +# ifndef FIXEDVOLUME + N_PROC_T = g_nproc_t; + N_PROC_X = g_nproc_x; + N_PROC_Y = g_nproc_y; + N_PROC_Z = g_nproc_z;
+ T = T_global/g_nproc_t; + LX = LX/g_nproc_x; + LY = LY/g_nproc_y; + LZ = LZ/g_nproc_z; + VOLUME = (T*LX*LY*LZ); + SPACEVOLUME = VOLUME/T; +# ifdef _USE_TSPLITPAR + TEOSLICE = (LX*LY*LZ)/2; +# endif +# ifdef PARALLELT + RAND = (2*LX*LY*LZ); + EDGES = 0; +# elif defined PARALLELX + RAND = (2*T*LY*LZ); + EDGES = 0; +# elif defined PARALLELXT + RAND = 2*LZ*(LY*LX + T*LY); + EDGES = 4*LZ*LY; +# elif defined PARALLELXY + RAND = 2*LZ*T*(LX + LY); + EDGES = 4*LZ*T; +# elif defined PARALLELXYT + RAND = 2*LZ*(LY*LX + T*LY + T*LX); + EDGES = 4*LZ*(LY + T + LX); +# elif defined PARALLELXYZ + RAND = 2*T*(LY*LZ + LX*LZ + LX*LY); + EDGES = 4*T*(LX + LY + LZ); +# elif defined PARALLELXYZT + RAND = 2*LZ*LY*LX + 2*LZ*T*LY + 2*LZ*T*LX + 2*T*LX*LY; + EDGES = 4*LZ*LY + 4*LZ*T + 4*LZ*LX + 4*LY*T + 4*LY*LX + 4*T*LX; +# else /* ifdef PARALLELT */ + RAND = 0; + EDGES = 0; +# endif /* ifdef PARALLELT */ + /* Note that VOLUMEPLUSRAND is not always equal to VOLUME+RAND */ + /* VOLUMEPLUSRAND rather includes the edges */ + VOLUMEPLUSRAND = VOLUME + RAND + EDGES; + SPACERAND=RAND/T; +# endif /* ifndef FIXEDVOLUME */ + g_dbw2rand = (RAND + 2*EDGES); + +# if (!defined _INDEX_INDEP_GEOM) +# if ( defined PARALLELXYZT || defined PARALLELXYZ ) + field_buffer_z = (spinor*)malloc(T*LX*LY/2*sizeof(spinor)); + field_buffer_z2 = (spinor*)malloc(T*LX*LY/2*sizeof(spinor)); +# ifdef _NON_BLOCKING + field_buffer_z3 = (spinor*)malloc(T*LX*LY/2*sizeof(spinor)); + field_buffer_z4 = (spinor*)malloc(T*LX*LY/2*sizeof(spinor)); + /* A zero-size request may legitimately yield NULL, so only a failed non-empty allocation is fatal. */ + if( (T*LX*LY/2 > 0) && (field_buffer_z3 == NULL || field_buffer_z4 == NULL) ) { + fprintf(stderr, "tmlqcd_mpi_init: could not allocate the non-blocking z-direction spinor buffers\n"); + MPI_Abort(MPI_COMM_WORLD, 1); + } +# endif + halffield_buffer_z = (halfspinor*)malloc(T*LX*LY/2*sizeof(halfspinor)); + halffield_buffer_z2 = (halfspinor*)malloc(T*LX*LY/2*sizeof(halfspinor)); + /* Stop here rather than dereference a NULL exchange buffer later on. */ + if( (T*LX*LY/2 > 0) && (field_buffer_z == NULL || field_buffer_z2 == NULL || halffield_buffer_z == NULL || halffield_buffer_z2 == NULL) ) { + fprintf(stderr, "tmlqcd_mpi_init: could not allocate the z-direction exchange buffers\n"); + MPI_Abort(MPI_COMM_WORLD, 1); + } +# endif +# endif + + MPI_Cart_create(MPI_COMM_WORLD, nalldims, dims, periods, reorder, &g_cart_grid); + MPI_Comm_rank(g_cart_grid, &g_cart_id); + MPI_Cart_coords(g_cart_grid, g_cart_id, nalldims, g_proc_coords); + if (g_debug_level > 1) { + fprintf(stdout,"# Process %d of %d on %s: cart_id %d, coordinates (%d %d %d %d)\n", +
g_proc_id, g_nproc, processor_name, g_cart_id, + g_proc_coords[0], g_proc_coords[1], g_proc_coords[2], g_proc_coords[3]); + fflush(stdout); + } + if(g_stdio_proc == -1){ + g_stdio_proc = g_proc_id; + } + for(i = 0; i < 8; i++) { + g_nb_list[i] = g_cart_id; + } +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + MPI_Cart_shift(g_cart_grid, 0, 1, &g_nb_t_dn, &g_nb_t_up); + g_nb_list[0] = g_nb_t_up; + g_nb_list[1] = g_nb_t_dn; +# endif +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + MPI_Cart_shift(g_cart_grid, 1, 1, &g_nb_x_dn, &g_nb_x_up); + g_nb_list[2] = g_nb_x_up; + g_nb_list[3] = g_nb_x_dn; +# endif +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + MPI_Cart_shift(g_cart_grid, 2, 1, &g_nb_y_dn, &g_nb_y_up); + g_nb_list[4] = g_nb_y_up; + g_nb_list[5] = g_nb_y_dn; +# endif +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + MPI_Cart_shift(g_cart_grid, 3, 1, &g_nb_z_dn, &g_nb_z_up); + g_nb_list[6] = g_nb_z_up; + g_nb_list[7] = g_nb_z_dn; +# endif + + +# if ((defined _INDEX_INDEP_GEOM) && (defined _USE_HALFSPINOR)) +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + g_HS_shift_t = 0; + g_HS_shift_x = LX*LY*LZ; + g_HS_shift_y = LX*LY*LZ + T*LY*LZ; + g_HS_shift_z = LX*LY*LZ + T*LY*LZ + T*LX*LZ; +# endif +# if (defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + g_HS_shift_t = 0; + g_HS_shift_x = 0; + g_HS_shift_y = T*LY*LZ; + g_HS_shift_z = T*LY*LZ + T*LX*LZ; +# endif +# endif + + /* With internal boundary we mean the fields that are sent */ + /* to another processor. It is located within the local */ + /* volume, whereas the external boundary is the boundary */ + /* received from another processor lying on the RAND.
*/ + /* In general the external boundaries are continuous in */ + /* memory, while this is not always true for the internal */ + /* one. */ + + /* first the gauge fields */ + MPI_Type_contiguous(18, MPI_DOUBLE, &mpi_su3); + MPI_Type_commit(&mpi_su3); + /* This is a gauge field on one space-time point */ + MPI_Type_contiguous(4, mpi_su3, &gauge_point); + /* This is a type for one gauge time slice continuous */ + MPI_Type_contiguous(LX*LY*LZ, gauge_point, &gauge_time_slice_cont); + /* This is a type for one gauge time slice dis-continuous -> NEW_GEOMETRY */ + /* These are 2 continuous ensembles of gauge_points of length LX*LY*LZ/2 */ + /* separated in memory by (VOLUME)/2 gauge_points */ + MPI_Type_vector(2, LX*LY*LZ/2, (VOLUME)/2, gauge_point, &gauge_time_slice_split); + /* Commit the new types */ + MPI_Type_commit(&gauge_time_slice_split); + MPI_Type_commit(&gauge_time_slice_cont); + + /* Continuous x-slice as it is found in the external memory.*/ + MPI_Type_contiguous(T*LY*LZ, gauge_point, &gauge_x_slice_cont); + /* this is a continuous gauge xt-slice */ + MPI_Type_contiguous(LY*LZ, gauge_point, &gauge_x_subslice); + /* Put T of the latter together, each of which has length 1 (in units */ + /* of gauge_x_subslice). They are separated by LX of those.
*/ + /* This is as the gauge fields are located in the internal memory */ + MPI_Type_vector(T, 1, LX, gauge_x_subslice, &gauge_x_slice_gath); + MPI_Type_commit(&gauge_x_slice_gath); + MPI_Type_commit(&gauge_x_slice_cont); + + /* Continuous y-slice as it is found in the external memory.*/ + MPI_Type_contiguous(T*LX*LZ, gauge_point, &gauge_y_slice_cont); + /* this is a continuous gauge xyt-slice */ + MPI_Type_contiguous(LZ, gauge_point, &gauge_y_subslice); + /* Put T*LX together, separated by LY of those */ + MPI_Type_vector(T*LX, 1, LY, gauge_y_subslice, &gauge_y_slice_gath); + MPI_Type_commit(&gauge_y_slice_cont); + MPI_Type_commit(&gauge_y_slice_gath); + + /* Continuous z-slice as it is found in the external memory.*/ + MPI_Type_contiguous(T*LX*LY, gauge_point, &gauge_z_slice_cont); + /* Put T*LX*LY gauge-points together, separated by LZ of those */ + MPI_Type_vector(T*LX*LY, 1, LZ, gauge_point, &gauge_z_slice_gath); + MPI_Type_commit(&gauge_z_slice_cont); + MPI_Type_commit(&gauge_z_slice_gath); + + + /* external edges: on x-Rand send in t-direction*/ + MPI_Type_contiguous(2*LY*LZ ,gauge_point, &gauge_xt_edge_cont); + MPI_Type_commit(&gauge_xt_edge_cont); + /* internal edges, lying in memory nevertheless in the boundary */ + MPI_Type_vector(2, 1, T, gauge_x_subslice, &gauge_xt_edge_gath); + MPI_Type_commit(&gauge_xt_edge_gath); + + /* external edges: y-Rand send in x-direction */ + MPI_Type_contiguous(2*T*LZ ,gauge_point, &gauge_yx_edge_cont); + MPI_Type_commit(&gauge_yx_edge_cont); + /* internal edges */ + MPI_Type_vector(2*T, LZ, LX*LZ, gauge_point, &gauge_yx_edge_gath); + MPI_Type_commit(&gauge_yx_edge_gath); + + /* external edges: t-Rand send in y-direction */ + MPI_Type_contiguous(2*LX*LZ ,gauge_point, &gauge_ty_edge_cont); + MPI_Type_commit(&gauge_ty_edge_cont); + /* internal edges */ + MPI_Type_vector(2*LX, LZ, LY*LZ, gauge_point, &gauge_ty_edge_gath); + MPI_Type_commit(&gauge_ty_edge_gath); + + /* external edges: z-Rand send in x-direction */ + /* zx-edge
*/ + MPI_Type_contiguous(2*T*LY ,gauge_point, &gauge_zx_edge_cont); + MPI_Type_commit(&gauge_zx_edge_cont); + /* internal edges */ + MPI_Type_vector(2*T, LY, LY*LX, gauge_point, &gauge_zx_edge_gath); + MPI_Type_commit(&gauge_zx_edge_gath); + + /* external edges: t-Rand send in z-direction */ + /* tz-edge */ + MPI_Type_contiguous(2*LX*LY ,gauge_point, &gauge_tz_edge_cont); + MPI_Type_commit(&gauge_tz_edge_cont); + /* internal edges */ + MPI_Type_vector(2*LX*LY, 1, LZ, gauge_point, &gauge_tz_edge_gath); + MPI_Type_commit(&gauge_tz_edge_gath); + + /* external edges: z-Rand send in y-direction */ + /* zy-edge */ + MPI_Type_contiguous(2*T*LX ,gauge_point, &gauge_zy_edge_cont); + MPI_Type_commit(&gauge_zy_edge_cont); + /* internal edges */ + MPI_Type_vector(2*T*LX, 1, LY, gauge_point, &gauge_zy_edge_gath); + MPI_Type_commit(&gauge_zy_edge_gath); + + /* The spinor fields */ + /* this is a single spinor field on one space-time point */ + MPI_Type_contiguous(24, MPI_DOUBLE, &field_point); + MPI_Type_contiguous(24, MPI_FLOAT, &field_point32); + /* This is an even or odd spinor field time slice, continuous */ +/* MPI_Type_contiguous(LX*LY*LZ/2, field_point, &field_time_slice_cont); */ + MPI_Type_contiguous(LX*LY*LZ*12, MPI_DOUBLE, &field_time_slice_cont); + /* Commit the new types */ + MPI_Type_commit(&field_time_slice_cont); + + /* this is the not even/odd field */ + MPI_Type_contiguous(LX*LY*LZ, field_point, &lfield_time_slice_cont); + MPI_Type_commit(&lfield_time_slice_cont); + MPI_Type_contiguous(LX*LY*LZ, field_point32, &lfield_time_slice_cont32); + MPI_Type_commit(&lfield_time_slice_cont32); + + /* This is an even or odd continuous spinor field x-slice */ + MPI_Type_contiguous(T*LY*LZ/2, field_point, &field_x_slice_cont); +/* MPI_Type_contiguous(12*T*LY*LZ, MPI_DOUBLE, &field_x_slice_cont); */ + /* this is an even or odd continuous spinor field xt-slice */ + MPI_Type_contiguous(LY*LZ/2, field_point, &field_x_subslice); + /* this type puts T xt-slices together being the
internal x-boundary in */ + /* even/odd ordered spinor fields */ + MPI_Type_vector(T, 1, LX, field_x_subslice, &field_x_slice_gath); +/* MPI_Type_vector(T, 12*LY*LZ, 12*LX*LY*LZ, MPI_DOUBLE, &field_x_slice_gath); */ + MPI_Type_commit(&field_x_slice_gath); + MPI_Type_commit(&field_x_slice_cont); + + /* this is the not even/odd field */ + MPI_Type_contiguous(T*LY*LZ, field_point, &lfield_x_slice_cont); + MPI_Type_contiguous(LY*LZ, field_point, &lfield_x_subslice); + MPI_Type_vector(T, 1, LX, lfield_x_subslice, &lfield_x_slice_gath); + MPI_Type_commit(&lfield_x_slice_gath); + MPI_Type_commit(&lfield_x_slice_cont); + + MPI_Type_contiguous(T*LY*LZ, field_point32, &lfield_x_slice_cont32); + MPI_Type_contiguous(LY*LZ, field_point32, &lfield_x_subslice32); + MPI_Type_vector(T, 1, LX, lfield_x_subslice32, &lfield_x_slice_gath32); + MPI_Type_commit(&lfield_x_slice_gath32); + MPI_Type_commit(&lfield_x_slice_cont32); + + /* This is an even or odd continuous spinor field y-slice */ + MPI_Type_contiguous(T*LX*LZ/2, field_point, &field_y_slice_cont); +/* MPI_Type_contiguous(12*T*LX*LZ, MPI_DOUBLE, &field_y_slice_cont); */ + /* this is an even or odd continuous spinor field txy-slice */ + MPI_Type_contiguous(LZ/2, field_point, &field_y_subslice); + /* this type puts T*LX xt-slices together being the internal y-boundary in */ + /* even/odd ordered spinor fields */ + MPI_Type_vector(T*LX, 1, LY, field_y_subslice, &field_y_slice_gath); +/* MPI_Type_vector(T*LX, 12*LZ, 12*LY*LZ, MPI_DOUBLE, &field_y_slice_gath); */ + MPI_Type_commit(&field_y_slice_gath); + MPI_Type_commit(&field_y_slice_cont); + + /* this is the not even/odd field */ + MPI_Type_contiguous(T*LX*LZ, field_point, &lfield_y_slice_cont); + MPI_Type_contiguous(LZ, field_point, &lfield_y_subslice); + MPI_Type_vector(T*LX, 1, LY, lfield_y_subslice, &lfield_y_slice_gath); + MPI_Type_commit(&lfield_y_slice_cont); + MPI_Type_commit(&lfield_y_slice_gath); + + MPI_Type_contiguous(T*LX*LZ, field_point32, &lfield_y_slice_cont32); +
MPI_Type_contiguous(LZ, field_point32, &lfield_y_subslice32); + MPI_Type_vector(T*LX, 1, LY, lfield_y_subslice32, &lfield_y_slice_gath32); + MPI_Type_commit(&lfield_y_slice_cont32); + MPI_Type_commit(&lfield_y_slice_gath32); + + /* If z-dir is parallelized, I have assumed that both LZ and T*LX*LY are even */ + /* This is an even or odd continuous spinor field z-slice */ + MPI_Type_contiguous(T*LX*LY/2, field_point, &field_z_slice_cont); + + /* this type puts T*LX*LY field_point together being the internal z-boundary in */ + /* even/odd ordered spinor fields */ + MPI_Type_vector(T*LX*LY/2, 12, 24, MPI_DOUBLE, &field_z_slice_half); /* this is ?!? (Not used) */ + MPI_Type_commit(&field_z_slice_half); + MPI_Type_commit(&field_z_slice_cont); + + /* this is the not even/odd field */ + MPI_Type_contiguous(T*LX*LY, field_point, &lfield_z_slice_cont); + MPI_Type_vector(T*LX*LY, 1, LZ, field_point, &lfield_z_slice_gath); + MPI_Type_commit(&lfield_z_slice_cont); + MPI_Type_commit(&lfield_z_slice_gath); + + MPI_Type_contiguous(T*LX*LY, field_point32, &lfield_z_slice_cont32); + MPI_Type_vector(T*LX*LY, 1, LZ, field_point32, &lfield_z_slice_gath32); + MPI_Type_commit(&lfield_z_slice_cont32); + MPI_Type_commit(&lfield_z_slice_gath32); + +#ifdef _USE_TSPLITPAR + /* here I construct the xt yt zt edges for use in _USE_TSPLITPAR */ + MPI_Type_contiguous(LY*LZ/2, field_point, &field_xt_slice_int); /* OK */ + MPI_Type_vector(LX, LZ/2, LY*LZ/2, field_point, &field_yt_slice_int); /* OK */ + MPI_Type_contiguous(LY*LZ/2, field_point, &field_xt_slice_ext); /* OK */ + MPI_Type_contiguous(LX*LZ/2, field_point, &field_yt_slice_ext); /* OK */ + MPI_Type_commit(&field_xt_slice_int); + MPI_Type_commit(&field_xt_slice_ext); + MPI_Type_commit(&field_yt_slice_int); + MPI_Type_commit(&field_yt_slice_ext); +# ifdef PARALLELXYZ + MPI_Type_contiguous((LX*LY+1)/2, field_point, &field_zt_slice_ext_L); /* OK */ + MPI_Type_contiguous(LX*LY/2, field_point, &field_zt_slice_ext_S); /* OK */ +
MPI_Type_commit(&field_zt_slice_ext_L); + MPI_Type_commit(&field_zt_slice_ext_S); +# endif +#endif + +#ifdef WITHLAPH + MPI_Type_contiguous(6, MPI_DOUBLE, &su3vect_point); + + MPI_Type_contiguous(LY*LZ, su3vect_point, &jfield_x_slice_cont); + MPI_Type_contiguous(LX*LZ, su3vect_point, &jfield_y_slice_cont); + MPI_Type_contiguous(LX*LY, su3vect_point, &jfield_z_slice_cont); + MPI_Type_contiguous(LY*LZ, su3vect_point, &jfield_x_slice_gath); + MPI_Type_contiguous(LZ, su3vect_point, &jfield_y_subslice); + MPI_Type_vector(LX, 1, LY, jfield_y_subslice, &jfield_y_slice_gath); + MPI_Type_vector(LX*LY, 1, LZ, su3vect_point, &jfield_z_slice_gath); + MPI_Type_commit(&jfield_x_slice_gath); + MPI_Type_commit(&jfield_x_slice_cont); + MPI_Type_commit(&jfield_y_slice_cont); + MPI_Type_commit(&jfield_y_slice_gath); + MPI_Type_commit(&jfield_z_slice_cont); + MPI_Type_commit(&jfield_z_slice_gath); +#endif + + /* The internal z_ and zt_ slices are constructed in geometry() with MPI_Type_indexed() */ + + /* Now the derivative fields */ + /* this is a derivative field on one space-time point */ + MPI_Type_contiguous(32, MPI_DOUBLE, &deri_point); + /* This is a type for one derivative time slice continuous */ + MPI_Type_contiguous(LX*LY*LZ, deri_point, &deri_time_slice_cont); + /* This is a type for one derivative time slice dis-continuous -> NEW_GEOMETRY */ + MPI_Type_vector(2, LX*LY*LZ/2, VOLUME/2, deri_point, &deri_time_slice_split); + /* Commit the new types */ + MPI_Type_commit(&deri_time_slice_split); + MPI_Type_commit(&deri_time_slice_cont); + + MPI_Type_contiguous(T*LY*LZ, deri_point, &deri_x_slice_cont); + MPI_Type_contiguous(LY*LZ, deri_point, &deri_x_subslice); + MPI_Type_vector(T, 1, LX, deri_x_subslice, &deri_x_slice_gath); + MPI_Type_commit(&deri_x_slice_gath); + MPI_Type_commit(&deri_x_slice_cont); + + /* external edges: on x-boundary send in t-direction first */ + MPI_Type_contiguous(2*LY*LZ ,deri_point, &deri_xt_edge_cont); + MPI_Type_commit(&deri_xt_edge_cont); + /*
external edges: y-boundary send in x-direction */ + MPI_Type_contiguous(2*T*LZ ,deri_point, &deri_yx_edge_cont); + MPI_Type_commit(&deri_yx_edge_cont); + /* external edges: t-boundary send in y-direction */ + MPI_Type_contiguous(2*LX*LZ ,deri_point, &deri_ty_edge_cont); + MPI_Type_commit(&deri_ty_edge_cont); + /* external edges: z-boundary send in x-direction */ + MPI_Type_contiguous(2*T*LY ,deri_point, &deri_zx_edge_cont); + MPI_Type_commit(&deri_zx_edge_cont); + /* external edges: t-boundary send in z-direction */ + MPI_Type_contiguous(2*LX*LY ,deri_point, &deri_tz_edge_cont); + MPI_Type_commit(&deri_tz_edge_cont); + /* external edges: z-boundary send in y-direction */ + MPI_Type_contiguous(2*T*LX ,deri_point, &deri_zy_edge_cont); + MPI_Type_commit(&deri_zy_edge_cont); + + MPI_Type_contiguous(T*LX*LZ, deri_point, &deri_y_slice_cont); + MPI_Type_contiguous(LZ, deri_point, &deri_y_subslice); + MPI_Type_vector(T*LX, 1, LY, deri_y_subslice, &deri_y_slice_gath); + MPI_Type_commit(&deri_y_slice_gath); + MPI_Type_commit(&deri_y_slice_cont); + + MPI_Type_contiguous(T*LX*LY, deri_point, &deri_z_slice_cont); + MPI_Type_vector(T*LX*LY, 1, LZ, deri_point, &deri_z_slice_gath); + MPI_Type_commit(&deri_z_slice_gath); + MPI_Type_commit(&deri_z_slice_cont); + + /* this is a single halfspinor field on one space-time point */ + MPI_Type_contiguous(12, MPI_DOUBLE, &halffield_point); + MPI_Type_vector(LX*LY*LZ/2, 1, 8, halffield_point, &halffield_time_slice_cont); + + /* Commit the new types */ + MPI_Type_commit(&halffield_time_slice_cont); + + MPI_Type_vector(LY*LZ/2, 1, 8, halffield_point, &halffield_x_subslice); + MPI_Type_vector(T, 1, LX, halffield_x_subslice, &halffield_x_slice_gath); + MPI_Type_commit(&halffield_x_slice_gath); + + MPI_Type_vector(LZ/2, 1, 8, halffield_point, &halffield_y_subslice); + MPI_Type_vector(T*LX, 1, LY, halffield_y_subslice, &halffield_y_slice_gath); + MPI_Type_commit(&halffield_y_slice_gath); + + /* For observables we need communicators for Cartesian
time slices */ + MPI_Comm_split(g_cart_grid, g_proc_coords[0], g_cart_id, &g_mpi_time_slices); + MPI_Comm_rank(g_mpi_time_slices, &g_mpi_time_rank); + if(g_debug_level > 4) { + fprintf(stdout, "# My mpi_time_rank = %d, g_proc_coords = (%d,%d,%d,%d), g_cart_id = %d\n", + g_mpi_time_rank, g_proc_coords[0], g_proc_coords[1], g_proc_coords[2], g_proc_coords[3], + g_cart_id); + } + + /* and communicators for Cartesian z-slices */ + MPI_Comm_split(g_cart_grid, g_proc_coords[3], g_cart_id, &g_mpi_z_slices); + MPI_Comm_rank(g_mpi_z_slices, &g_mpi_z_rank); + if(g_debug_level > 4) { + fprintf(stdout, "# My mpi_z_rank = %d, g_proc_coords = (%d,%d,%d,%d), g_cart_id = %d\n", + g_mpi_z_rank, g_proc_coords[0], g_proc_coords[1], g_proc_coords[2], g_proc_coords[3], + g_cart_id); + } + + /* and spatial volume slices */ + MPI_Comm_split(g_cart_grid, g_mpi_time_rank, g_proc_coords[0], &g_mpi_SV_slices); + MPI_Comm_rank(g_mpi_SV_slices, &g_mpi_SV_rank); + if(g_debug_level > 4) { + fprintf(stdout, "# My mpi_SV_rank = %d, g_proc_coords = (%d,%d,%d,%d), g_cart_id = %d\n", + g_mpi_SV_rank, g_proc_coords[0], g_proc_coords[1], g_proc_coords[2], g_proc_coords[3], + g_cart_id); + } + + /* and time-volume slices orthogonal to the z-direction */ + MPI_Comm_split(g_cart_grid, g_mpi_z_rank, g_proc_coords[3], &g_mpi_ST_slices); + MPI_Comm_rank(g_mpi_ST_slices, &g_mpi_ST_rank); + if(g_debug_level > 4) { + fprintf(stdout, "# My mpi_ST_rank = %d, g_proc_coords = (%d,%d,%d,%d), g_cart_id = %d\n", + g_mpi_ST_rank, g_proc_coords[0], g_proc_coords[1], g_proc_coords[2], g_proc_coords[3], + g_cart_id); + } + + MPI_Op_create(reduce_su3_ray, 0, &mpi_reduce_su3_ray); + +#else /*ifdef MPI */ + g_nproc = 1; + g_proc_id = 0; + g_nproc_x = 1; + g_nproc_y = 1; + g_nproc_z = 1; + g_nproc_t = 1; + g_cart_id = 0; + g_mpi_time_rank = 0; + g_mpi_z_rank = 0; + g_mpi_SV_rank = 0; + g_mpi_ST_rank = 0; + g_stdio_proc = 0; + +# ifndef FIXEDVOLUME + T = T_global; + VOLUME = (T*LX*LY*LZ); + SPACEVOLUME = VOLUME/T; +# ifdef
_USE_TSPLITPAR + TEOSLICE = (LX*LY*LZ)/2; +# endif + RAND = 0; + EDGES = 0; + VOLUMEPLUSRAND = VOLUME; + SPACERAND=0; + N_PROC_T = 1; + N_PROC_X = 1; + N_PROC_Y = 1; + N_PROC_Z = 1; +# endif + g_dbw2rand = 0; +#endif /*ifdef MPI */ + + /* Here we perform some checks in order not to */ + /* run into trouble later */ +#if (defined PARALLELXYZT || defined PARALLELXYZ ) + if((T*LX*LY)%2 != 0 && even_odd_flag == 1) { + fprintf(stderr, "T*LX*LY must be even!\nAborting prgram...\n"); +# ifdef MPI + MPI_Finalize(); +# endif + exit(-1); + } +#endif + + if(LZ%2 != 0 && even_odd_flag == 1) { + fprintf(stderr, "LZ must be even!\nAborting prgram...\n"); +#ifdef MPI + MPI_Finalize(); +#endif + exit(-1); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/mpi_init.h b/qcd/part_cpu/applications/QCD/src/kernel_D/mpi_init.h new file mode 100644 index 0000000000000000000000000000000000000000..6bd3e1b04cbcde1bde788f922503e18954029c58 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/mpi_init.h @@ -0,0 +1,170 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/ +#ifndef _MPI_INIT_H +#define _MPI_INIT_H + +#ifdef MPI +#include +/* NOTE(review): the header name of the #include above is missing (presumably <mpi.h>, stripped during extraction) -- confirm against upstream. */ + +/* Datatypes for the data exchange: extern declarations of the MPI_Datatype objects defined in mpi_init.c. */ +extern MPI_Datatype mpi_su3; +extern MPI_Datatype field_point; +extern MPI_Datatype field_point32; +extern MPI_Datatype gauge_time_slice_cont; +extern MPI_Datatype gauge_time_slice_split; +extern MPI_Datatype deri_time_slice_cont; +extern MPI_Datatype deri_time_slice_split; +extern MPI_Datatype field_time_slice_cont; +extern MPI_Datatype lfield_time_slice_cont; +extern MPI_Datatype lfield_time_slice_cont32; +extern MPI_Datatype gauge_x_slice_cont; +extern MPI_Datatype gauge_x_slice_gath; +extern MPI_Datatype field_x_slice_cont; +extern MPI_Datatype field_x_slice_gath; +extern MPI_Datatype lfield_x_slice_cont; +extern MPI_Datatype lfield_x_slice_cont32; +extern MPI_Datatype lfield_x_slice_gath; +extern MPI_Datatype lfield_x_slice_gath32; +extern MPI_Datatype deri_x_slice_cont; +extern MPI_Datatype deri_x_slice_gath; +extern MPI_Datatype gauge_xt_edge_cont; +extern MPI_Datatype gauge_xt_edge_gath; +extern MPI_Datatype deri_xt_edge_cont; +extern MPI_Datatype deri_xt_edge_gath; /* NOTE(review): no matching definition among the MPI_Datatype definitions in mpi_init.c */ + +extern MPI_Datatype gauge_yx_edge_cont; +extern MPI_Datatype gauge_yx_edge_gath; +extern MPI_Datatype deri_yx_edge_cont; + +extern MPI_Datatype gauge_ty_edge_cont; +extern MPI_Datatype gauge_ty_edge_gath; +extern MPI_Datatype deri_ty_edge_cont; + +extern MPI_Datatype gauge_zx_edge_cont; +extern MPI_Datatype gauge_zx_edge_gath; +extern MPI_Datatype deri_zx_edge_cont; + +extern MPI_Datatype gauge_tz_edge_cont; +extern MPI_Datatype gauge_tz_edge_gath; +extern MPI_Datatype deri_tz_edge_cont; + +extern MPI_Datatype gauge_zy_edge_cont; +extern MPI_Datatype gauge_zy_edge_gath; +extern MPI_Datatype deri_zy_edge_cont; + +extern MPI_Datatype gauge_y_slice_cont; +extern MPI_Datatype gauge_y_slice_gath; +extern MPI_Datatype field_y_slice_cont; +extern MPI_Datatype field_y_slice_gath; +extern MPI_Datatype
lfield_y_slice_cont; +extern MPI_Datatype lfield_y_slice_cont32; +extern MPI_Datatype lfield_y_slice_gath; +extern MPI_Datatype lfield_y_slice_gath32; +extern MPI_Datatype deri_y_slice_cont; +extern MPI_Datatype deri_y_slice_gath; + +extern MPI_Datatype deri_z_slice_cont; +extern MPI_Datatype deri_z_slice_gath; + +extern MPI_Datatype gauge_z_slice_gath; +extern MPI_Datatype gauge_z_slice_cont; + +extern MPI_Datatype field_z_slice_cont; +extern MPI_Datatype field_z_slice_gath; +extern MPI_Datatype lfield_z_slice_cont; +extern MPI_Datatype lfield_z_slice_cont32; +extern MPI_Datatype lfield_z_slice_gath; +extern MPI_Datatype lfield_z_slice_gath32; +extern MPI_Datatype field_z_slice_half; + +extern MPI_Datatype halffield_point; +extern MPI_Datatype halffield_time_slice_cont; +extern MPI_Datatype halffield_x_slice_cont; +extern MPI_Datatype halffield_x_slice_gath; +extern MPI_Datatype halffield_y_slice_cont; +extern MPI_Datatype halffield_y_slice_gath; +extern MPI_Datatype halffield_z_slice_cont; + +#ifdef _USE_TSPLITPAR +extern MPI_Datatype field_xt_slice_int; +extern MPI_Datatype field_yt_slice_int; +extern MPI_Datatype field_xt_slice_ext; +extern MPI_Datatype field_yt_slice_ext; +# ifdef PARALLELXYZ +extern MPI_Datatype field_zt_slice_ext_L; +extern MPI_Datatype field_zt_slice_ext_S; +extern MPI_Datatype field_zt_slice_even_dn_et; +extern MPI_Datatype field_zt_slice_even_up_et; +extern MPI_Datatype field_zt_slice_odd_dn_et; +extern MPI_Datatype field_zt_slice_odd_up_et; +extern MPI_Datatype field_zt_slice_even_dn_ot; +extern MPI_Datatype field_zt_slice_even_up_ot; +extern MPI_Datatype field_zt_slice_odd_dn_ot; +extern MPI_Datatype field_zt_slice_odd_up_ot; +# endif +#endif +#ifdef WITHLAPH +extern MPI_Datatype su3vect_point; +extern MPI_Datatype jfield_x_slice_cont; +extern MPI_Datatype jfield_y_slice_cont; +extern MPI_Datatype jfield_z_slice_cont; +extern MPI_Datatype jfield_x_slice_gath; +extern MPI_Datatype jfield_y_slice_gath; +extern MPI_Datatype
jfield_z_slice_gath; +extern MPI_Datatype jfield_y_subslice; +#endif +/* NOTE(review): mpi_init.c defines the objects below only under (PARALLELXYZT || PARALLELXYZ); the guard here is wider, so the extra configurations get declarations without definitions from that file. */ +#if ( defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXYZ ) +extern MPI_Datatype field_z_slice_even_dn; +extern MPI_Datatype field_z_slice_even_up; +extern MPI_Datatype field_z_slice_odd_dn; +extern MPI_Datatype field_z_slice_odd_up; + +# if (!defined _INDEX_INDEP_GEOM) +extern spinor * field_buffer_z ALIGN; +extern spinor * field_buffer_z2 ALIGN; +extern spinor * field_buffer_z3 ALIGN; +extern spinor * field_buffer_z4 ALIGN; +extern spinor * field_buffer_y ALIGN; +extern spinor * field_buffer_y2 ALIGN; +extern spinor * field_buffer_y3 ALIGN; +extern spinor * field_buffer_y4 ALIGN; +extern spinor * field_buffer_x ALIGN; +extern spinor * field_buffer_x2 ALIGN; +extern spinor * field_buffer_x3 ALIGN; +extern spinor * field_buffer_x4 ALIGN; +extern spinor * field_buffer_t ALIGN; +extern spinor * field_buffer_t2 ALIGN; +extern spinor * field_buffer_t3 ALIGN; +extern spinor * field_buffer_t4 ALIGN; +/* NOTE(review): mpi_init.c defines only field_buffer_z..z4 and halffield_buffer_z/z2; the x/y/t buffers above have no definition in that file. */ +extern halfspinor * halffield_buffer_z ALIGN; +extern halfspinor * halffield_buffer_z2 ALIGN; +# endif +#endif + +extern MPI_Op mpi_reduce_su3_ray; +void reduce_su3_ray(void *u_i, void *u_io, int *len, MPI_Datatype *dt); + +#endif +/* Initialises the process geometry; declared outside the MPI guard because mpi_init.c provides both an MPI and a single-process branch. */ +void tmlqcd_mpi_init(void); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/offline_measurement.c b/qcd/part_cpu/applications/QCD/src/kernel_D/offline_measurement.c new file mode 100644 index 0000000000000000000000000000000000000000..bf2cdb836ae21c86856a39a61b5c0faaa4a1f7ec --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/offline_measurement.c @@ -0,0 +1,411 @@ +/*********************************************************************** + * + * Copyright (C) 2012 Carsten Urbach, Albert Deuzeman, Bartosz Kostrzewa + * + * This file is part of tmLQCD.
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * naive pion correlator for twisted mass QCD + * + *******************************************************************************/ + +#define MAIN_PROGRAM +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef BENCHMARK +#include <./c-lime/include/lime.h> +#else +#include +#endif +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include +#ifdef MPI +#include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "git_hash.h" +#include "getopt.h" +#include "linalg_eo.h" +#include "geometry_eo.h" +#include "start.h" +#include "measure_gauge_action.h" +#ifdef MPI +#include "xchange/xchange.h" +#endif +#include +#include "read_input.h" +#include "mpi_init.h" +#include "sighandler.h" +#include "boundary.h" +#include "solver/solver.h" +#include "init/init.h" +#include "invert_eo.h" +#include "monomial/monomial.h" +#include "ranlxd.h" +#include "phmc.h" +#include "operator/D_psi.h" +#include "little_D.h" +#include "reweighting_factor.h" +#include "linalg/convert_eo_to_lexic.h" +#include "block.h" +#include "operator.h" +#include "sighandler.h" +#include "solver/dfl_projector.h" +#include "solver/generate_dfl_subspace.h" +#include "prepare_source.h" +#include +#include +#include +#include +#include "solver/dirac_operator_eigenvectors.h" +#include "P_M_eta.h" +#include "operator/tm_operators.h" +#include 
"operator/Dov_psi.h" +#include "gettime.h" +#include "meas/measurements.h" + +extern int nstore; +int check_geometry(); + +static void usage(); +static void process_args(int argc, char *argv[], char ** input_filename, char ** filename); +static void set_default_filenames(char ** input_filename, char ** filename); + +/* Driver for the offline measurements: parses the command line, reads the input file, + * allocates the gauge, geometry and spinor fields and then, for each of the Nmeas + * configurations, reads the gauge field, prints the plaquette value and calls the + * measurefunc of every entry in measurement_list. + * Returns 0 on success; the checked failures below terminate the program via exit(). */ +int main(int argc, char *argv[]) +{ + FILE *parameterfile = NULL; + int j, i, ix = 0, isample = 0, op_id = 0; + char datafilename[206]; + char parameterfilename[206]; + char conf_filename[50]; + char * input_filename = NULL; + char * filename = NULL; + double plaquette_energy; + +#ifdef _KOJAK_INST +#pragma pomp inst init +#pragma pomp inst begin(main) +#endif + + /* every macro needs its own "defined": a bare SSE3 is evaluated as an expression */ +#if (defined SSE || defined SSE2 || defined SSE3) + signal(SIGILL, &catch_ill_inst); +#endif + + DUM_DERI = 8; + DUM_MATRIX = DUM_DERI + 5; + /* NOTE(review): both preprocessor branches currently assign the same value */ +#if ((defined BGL && defined XLC) || defined _USE_TSPLITPAR) + NO_OF_SPINORFIELDS = DUM_MATRIX + 3; +#else + NO_OF_SPINORFIELDS = DUM_MATRIX + 3; +#endif + + verbose = 0; + g_use_clover_flag = 0; + +#ifdef MPI + +# ifdef OMP + int mpi_thread_provided; + MPI_Init_thread(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_thread_provided); +# else + MPI_Init(&argc, &argv); +# endif + + MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id); +#else + g_proc_id = 0; +#endif + + /* command line first, then fall back to the built-in default file names */ + process_args(argc,argv,&input_filename,&filename); + set_default_filenames(&input_filename, &filename); + + /* Read the input file */ + if( (j = read_input(input_filename)) != 0) { + fprintf(stderr, "Could not find input file: %s\nAborting...\n", input_filename); + exit(-1); + } + +#ifdef OMP + init_openmp(); +#endif + + /* this DBW2 stuff is not needed for the inversion ! 
*/ + if (g_dflgcr_flag == 1) { + even_odd_flag = 0; + } + if (Nsave == 0) { + Nsave = 1; + } + + if (g_running_phmc) { + NO_OF_SPINORFIELDS = DUM_MATRIX + 8; + } + + /* mpi_init.h declares tmlqcd_mpi_init(void), so it is called without arguments */ + tmlqcd_mpi_init(); + + /* starts the single and double precision random number */ + /* generator */ + start_ranlux_KD(rlxd_level, random_seed); + + /* we need to make sure that we don't have even_odd_flag = 1 */ + /* if any of the operators doesn't use it */ + /* in this way even/odd can still be used by other operators */ + for(j = 0; j < no_operators; j++) if(!operator_list[j].even_odd_flag) even_odd_flag = 0; + +#ifndef MPI + g_dbw2rand = 0; +#endif + + /* allocate the fields; every init_* call reports failure through a non-zero return value */ +#ifdef _GAUGE_COPY + j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1); +#else + j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0); +#endif + if (j != 0) { + fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n"); + exit(-1); + } + j = init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand); + if (j != 0) { + fprintf(stderr, "Not enough memory for geometry indices! Aborting...\n"); + exit(-1); + } + if (no_monomials > 0) { + if (even_odd_flag) { + j = init_monomials(VOLUMEPLUSRAND / 2, even_odd_flag); + } + else { + j = init_monomials(VOLUMEPLUSRAND, even_odd_flag); + } + if (j != 0) { + fprintf(stderr, "Not enough memory for monomial pseudo fermion fields! Aborting...\n"); + exit(-1); + } + } + if (even_odd_flag) { + j = init_spinor_field(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS); + } + else { + j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS); + } + if (j != 0) { + fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n"); + exit(-1); + } + + if (g_running_phmc) { + j = init_chi_spinor_field(VOLUMEPLUSRAND / 2, 20); + if (j != 0) { + fprintf(stderr, "Not enough memory for PHMC Chi fields! 
Aborting...\n"); + exit(-1); + } + } + + g_mu = g_mu1; + + if (g_cart_id == 0) { + /*construct the filenames for the observables and the parameters*/ + /* at most 200 characters of filename plus the 5-character suffix and the terminator fit in 206 bytes; */ + /* snprintf always terminates the result, unlike strncpy followed by strcat */ + snprintf(datafilename, sizeof(datafilename), "%.200s.data", filename); + snprintf(parameterfilename, sizeof(parameterfilename), "%.200s.para", filename); + + parameterfile = fopen(parameterfilename, "w"); + if (parameterfile != NULL) { + write_first_messages(parameterfile, "invert", git_hash); + fclose(parameterfile); + } + else { + fprintf(stderr, "Could not open %s for writing, parameters are not recorded.\n", parameterfilename); + } + } + + /* define the geometry */ + geometry(); + int status = check_geometry(); + + if (status != 0) { + fprintf(stderr, "Checking of geometry failed. Unable to proceed.\nAborting....\n"); + exit(1); + } + + /* define the boundary conditions for the fermion fields */ + boundary(g_kappa); + + phmc_invmaxev = 1.; + + init_operators(); + + /* list and initialize measurements*/ + if(g_proc_id == 0) { + printf("\n"); + for(int j = 0; j < no_measurements; j++) { + printf("# measurement id %d, type = %d\n", j, measurement_list[j].type); + } + } + init_measurements(); + + /* this could be maybe moved to init_operators */ +#ifdef _USE_HALFSPINOR + j = init_dirac_halfspinor(); + if (j != 0) { + fprintf(stderr, "Not enough memory for halffield! Aborting...\n"); + exit(-1); + } + if (g_sloppy_precision_flag == 1) { + j = init_dirac_halfspinor32(); + if (j != 0) + { + fprintf(stderr, "Not enough memory for 32-bit halffield! Aborting...\n"); + exit(-1); + } + } +# if (defined _PERSISTENT) + if (even_odd_flag) + init_xchange_halffield(); +# endif +#endif + + /* one pass per gauge configuration; nstore is advanced by Nsave at the end of each pass */ + for (j = 0; j < Nmeas; j++) { + /* bounded write: conf_filename holds only 50 bytes */ + snprintf(conf_filename, sizeof(conf_filename), "%s.%.4d", gauge_input_filename, nstore); + if (g_cart_id == 0) { + printf("#\n# Trying to read gauge field from file %s in %s precision.\n", + conf_filename, (gauge_precision_read_flag == 32 ? 
"single" : "double")); + fflush(stdout); + } + if( (i = read_gauge_field(conf_filename,g_gauge_field)) !=0) { + fprintf(stderr, "Error %d while reading gauge field from %s\n Aborting...\n", i, conf_filename); + exit(-2); + } + + if (g_cart_id == 0) { + printf("# Finished reading gauge field.\n"); + fflush(stdout); + } + +#ifdef MPI + xchange_gauge(g_gauge_field); +#endif + + /*compute the energy of the gauge field*/ + plaquette_energy = measure_plaquette( (const su3** const) g_gauge_field); + + if (g_cart_id == 0) { + printf("# The computed plaquette value is %e.\n", plaquette_energy / (6.*VOLUME*g_nproc)); + fflush(stdout); + } + + if (g_cart_id == 0) { + fprintf(stdout, "#\n"); /*Indicate starting of the operator part*/ + } + + + /* offline measurements */ + measurement * meas; + for(int imeas = 0; imeas < no_measurements; imeas++){ + meas = &measurement_list[imeas]; + if (g_proc_id == 0) { + fprintf(stdout, "#\n# Beginning offline measurement.\n"); + } + meas->measurefunc(nstore, imeas, even_odd_flag); + } + nstore += Nsave; + } + +#ifdef OMP + free_omp_accumulators(); +#endif + + free_blocks(); + free_dfl_subspace(); + free_geometry_indices(); + free_spinor_field(); + + free_chi_spinor_field(); + + /* both names were allocated by process_args or set_default_filenames */ + free(filename); + free(input_filename); + +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); + MPI_Finalize(); +#endif + return(0); + + +#ifdef _KOJAK_INST +#pragma pomp inst end(main) +#endif +} + +/* Prints the help text to stdout and terminates the program with status 0. */ +static void usage() +{ + fprintf(stdout, "Offline version of the online measurements for twisted mass QCD\n"); + fprintf(stdout, "Version %s \n\n", PACKAGE_VERSION); + fprintf(stdout, "Please send bug reports to %s\n", PACKAGE_BUGREPORT); + fprintf(stdout, "Usage: invert [options]\n"); + fprintf(stdout, "Options: [-f input-filename]\n"); + fprintf(stdout, " [-v] more verbosity\n"); + fprintf(stdout, " [-h|-? 
this help]\n"); + fprintf(stdout, " [-V] print version information and exit\n"); + exit(0); +} + +/* Parses the command line with getopt. + * -f <name> stores a copy of the input file name in *input_filename, + * -o <name> stores a copy of the output base name in *filename, + * -v raises the verbosity, -V prints the version and exits, + * -h, -? and unknown options print the usage text on process 0. + * Both output pointers must be NULL or heap-allocated on entry; they are left + * untouched when the corresponding option is absent. */ +static void process_args(int argc, char *argv[], char ** input_filename, char ** filename) { + int c; + while ((c = getopt(argc, argv, "h?vVf:o:")) != -1) { + switch (c) { + case 'f': + /* drop the buffer of an earlier -f so a repeated option does not leak */ + free(*input_filename); + *input_filename = calloc(200, sizeof(char)); + /* copy at most 199 characters: the zeroed last byte keeps the string terminated */ + if (*input_filename != NULL) strncpy(*input_filename, optarg, 199); + break; + case 'o': + /* "o:" is part of the option string, so the argument has to be consumed here */ + free(*filename); + *filename = calloc(200, sizeof(char)); + if (*filename != NULL) strncpy(*filename, optarg, 199); + break; + case 'v': + verbose = 1; + break; + case 'V': + if(g_proc_id == 0) { + fprintf(stdout,"%s %s\n",PACKAGE_STRING,git_hash); + } + exit(0); + break; + case 'h': + case '?': + default: + if( g_proc_id == 0 ) { + usage(); + } + break; + } + } +} + +/* Supplies "offline_measurement.input" and "output" for names that are still NULL. */ +static void set_default_filenames(char ** input_filename, char ** filename) { + if( *input_filename == NULL ) { + *input_filename = calloc(28, sizeof(char)); + strcpy(*input_filename,"offline_measurement.input"); + } + + if( *filename == NULL ) { + *filename = calloc(7, sizeof(char)); + strcpy(*filename,"output"); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator.c new file mode 100644 index 0000000000000000000000000000000000000000..d4c226b6d7dab321297ae6ab976fd5ba2d0f9189 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator.c @@ -0,0 +1,523 @@ +/*********************************************************************** + * + * Copyright (C) 2009 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include "default_input_values.h" +#include "read_input.h" +#include "su3.h" +#include "operator/tm_operators.h" +#include "linalg_eo.h" +#include "operator/D_psi.h" +#include "operator/Dov_psi.h" +#include "operator/tm_operators_nd.h" +#include "operator/Hopping_Matrix.h" +#include "invert_eo.h" +#include "invert_doublet_eo.h" +#include "invert_overlap.h" +#include "invert_clover_eo.h" +#include "boundary.h" +#include "start.h" +#include "solver/eigenvalues.h" +#include "solver/solver.h" +#include +#include +#include +#include +#include "test/overlaptests.h" +#include "solver/index_jd.h" +#include "operator/clovertm_operators.h" +#include "operator/clovertm_operators_32.h" +#include "operator/clover_leaf.h" +#include "operator.h" +#include "gettime.h" +#ifdef QUDA +# include "quda_interface.h" +#endif + +void dummy_D(spinor * const, spinor * const); +void dummy_DbD(spinor * const s, spinor * const r, spinor * const p, spinor * const q); +void op_invert(const int op_id, const int index_start, const int write_prop); +void op_write_prop(const int op_id, const int index_start, const int append_); + +/* storage for all registered operators; no_operators counts the slots in use */ +operator operator_list[max_no_operators]; + +int no_operators = 0; + +/* Registers a new operator of the given type in the next free slot of operator_list, + * fills it with the _default_* values and the dummy apply functions, and returns the + * new value of no_operators. Terminates with exit(-1) when all max_no_operators slots + * are already taken. */ +int add_operator(const int type) { + + operator * optr = &operator_list[no_operators]; + if(no_operators == max_no_operators) { + fprintf(stderr, "maximal number of operators %d exceeded!\n", max_no_operators); + exit(-1); + } + optr->type = type; + optr->kappa = _default_g_kappa; + optr->mu = _default_g_mu; + optr->c_sw = _default_c_sw; + optr->sloppy_precision = _default_operator_sloppy_precision_flag; + optr->compression_type = _default_compression_type; + 
optr->coefs = NULL; + optr->rel_prec = _default_g_relative_precision_flag; + optr->eps_sq = _default_solver_precision; + optr->maxiter = _default_max_solver_iterations; + optr->even_odd_flag = _default_even_odd_flag; + optr->solver = _default_solver_flag; + optr->mubar = _default_g_mubar; + optr->epsbar = _default_g_epsbar; + /* source and propagator fields are attached later, they start out empty */ + optr->sr0 = NULL; + optr->sr1 = NULL; + optr->sr2 = NULL; + optr->sr3 = NULL; + optr->prop0 = NULL; + optr->prop1 = NULL; + optr->prop2 = NULL; + optr->prop3 = NULL; + optr->error_code = 0; + optr->prop_precision = _default_prop_precision_flag; + optr->no_flavours = 1; + optr->DownProp = 0; + optr->conf_input = _default_gauge_input_filename; + optr->no_extra_masses = 0; + + (optr->solver_params).mcg_delta = _default_mixcg_innereps; + + /* placeholders that only print a warning; init_operators installs the real functions */ + optr->applyM = &dummy_D; + optr->applyQ = &dummy_D; + optr->applyQp = &dummy_D; + optr->applyQm = &dummy_D; + optr->applyMp = &dummy_D; + optr->applyMm = &dummy_D; + optr->applyQsq = &dummy_D; + optr->applyDbQsq = &dummy_DbD; + + optr->inverter = &op_invert; + optr->write_prop = &op_write_prop; + + /* Overlap needs special treatment */ + if(optr->type == OVERLAP) { + optr->even_odd_flag = 0; + optr->solver = 13; + optr->no_ev = 10; + optr->no_ev_index = 8; + optr->ev_prec = 1.e-15; + optr->ev_readwrite = 0; + optr->deg_poly = 50; + optr->s = 0.6; + optr->m = 0.; + optr->inverter = &op_invert; + } + /* the two-flavour operators also switch on the global g_running_phmc flag */ + if(optr->type == DBTMWILSON || optr->type == DBCLOVER) { + optr->no_flavours = 2; + g_running_phmc = 1; + } + + optr->precWS=NULL; + + optr->initialised = 1; + + no_operators++; + return(no_operators); +} + +/* Selects, once per program run (guarded by the static oinit flag), the apply* function + * pointers of every registered operator according to its type and even/odd setting. + * Always returns 0. */ +int init_operators() { + static int oinit = 0; + operator * optr; + if(!oinit) { + oinit = 1; + for(int i = 0; i < no_operators; i++) { + optr = operator_list + i; + /* This is a hack, it should be set on an operator basis. 
*/ + optr->rel_prec = g_relative_precision_flag; + if(optr->type == TMWILSON || optr->type == WILSON) { + if(optr->c_sw > 0) { + init_sw_fields(); + } + /* pick the even/odd preconditioned or the full-lattice variants */ + if(optr->even_odd_flag) { + optr->applyQp = &Qtm_plus_psi; + optr->applyQm = &Qtm_minus_psi; + optr->applyQsq = &Qtm_pm_psi; + optr->applyMp = &Mtm_plus_psi; + optr->applyMm = &Mtm_minus_psi; + } + else { + optr->applyQp = &Q_plus_psi; + optr->applyQm = &Q_minus_psi; + optr->applyQsq = &Q_pm_psi; + optr->applyMp = &D_psi; + optr->applyMm = &D_psi; + } + /* NOTE(review): the apply* pointers above were already chosen from even_odd_flag; */ + /* the solver-specific overrides below change the flag but not the pointers - confirm intended */ + if(optr->solver == CGMMS) { + if (g_cart_id == 0 && optr->even_odd_flag == 1) + fprintf(stderr, "CG Multiple mass solver works only without even/odd! Forcing!\n"); + optr->even_odd_flag = 0; + if (g_cart_id == 0 && optr->DownProp) + fprintf(stderr, "CGMMS doesn't need AddDownPropagator! Switching Off!\n"); + optr->DownProp = 0; + } + + if(optr->solver == INCREIGCG){ + if (g_cart_id == 0 && optr->DownProp){ + fprintf(stderr,"Warning: When even-odd preconditioning is used, the eigenvalues for +mu and -mu will be little different\n"); + fprintf(stderr,"Incremental EigCG solver will still work however.\n"); + } + if (g_cart_id == 0 && optr->even_odd_flag == 0) + fprintf(stderr,"Incremental EigCG solver is added only with Even-Odd preconditioning!. Forcing\n"); + optr->even_odd_flag = 1; + } + }else if(optr->type == OVERLAP) { + optr->even_odd_flag = 0; + optr->applyM = &Dov_psi; + optr->applyQ = &Qov_psi; + }else if(optr->type == DBTMWILSON) { + optr->even_odd_flag = 1; + optr->applyDbQsq = &Qtm_pm_ndpsi; + /* TODO: this should be here! */ + /* Chi`s-spinors memory allocation */ + /* if(init_chi_spinor_field(VOLUMEPLUSRAND/2, 20) != 0) { */ + /* fprintf(stderr, "Not enough memory for 20 NDPHMC Chi fields! 
Aborting...\n"); */ + /* exit(0); */ + /* } */ + }else if(optr->type == DBCLOVER) { + optr->even_odd_flag = 1; + optr->applyDbQsq = &Qtm_pm_ndpsi; + } + } /* loop over operators */ + + /* Look at every operator instead of optr: optr is unset when no_operators == 0 */ + /* and would otherwise only refer to the last entry of the list. */ + int needs_quda = 0; + for(int i = 0; i < no_operators; i++) { + if(operator_list[i].external_inverter == QUDA_INVERTER) needs_quda = 1; + } + if(needs_quda) { +#ifdef QUDA + _initQuda(); +#else + if(g_proc_id == 0) { + fprintf(stderr, "Error: You're trying to use QUDA but this build was not configured for QUDA usage.\n"); + exit(-2); + } +#endif + } + } + return(0); +} + +/* Placeholder for the single-flavour apply* slots; only prints a warning on process 0. */ +void dummy_D(spinor * const s, spinor * const r) { + if(g_proc_id == 0) { + fprintf(stderr, "dummy_D was called. Was that really intended?\n"); + } + return; +} + +/* Placeholder for the applyDbQsq slot; only prints a warning on process 0. */ +void dummy_DbD(spinor * const s, spinor * const r, spinor * const p, spinor * const q) { + if(g_proc_id == 0) { + fprintf(stderr, "dummy_DbD was called. Was that really intended?\n"); + } + return; +} + +/* Inverts operator op_id on the sources held in its sr* fields and leaves the result in + * its prop* fields; iterations and reached_prec of the operator are updated. + * index_start is only forwarded to write_prop and invert_overlap. + * write_prop != 0 requests that the propagator is written via optr->write_prop. */ +void op_invert(const int op_id, const int index_start, const int write_prop) { + operator * optr = &operator_list[op_id]; + double atime = 0., etime = 0., nrm1 = 0., nrm2 = 0.; + int i; + optr->iterations = 0; + optr->reached_prec = -1.; + g_kappa = optr->kappa; + boundary(g_kappa); + + atime = gettime(); + if(optr->type == TMWILSON || optr->type == WILSON || optr->type == CLOVER) { + g_mu = optr->mu; + g_c_sw = optr->c_sw; + if(optr->type == CLOVER) { + if (g_cart_id == 0 && g_debug_level > 1) { + printf("#\n# csw = %e, computing clover leafs\n", g_c_sw); + } + init_sw_fields(VOLUME); + + sw_term( (const su3**) g_gauge_field, optr->kappa, optr->c_sw); + /* this must be EE here! 
*/ + /* to match clover_inv in Qsw_psi */ + sw_invert(EE, optr->mu); + /* now copy double sw and sw_inv fields to 32bit versions */ + copy_32_sw_fields(); + } + + /* at most two passes: the second one only happens when DownProp is set (see the break at the end of the loop body) */ + for(i = 0; i < 2; i++) { + // we need this here again for the sign switch at i == 1 + g_mu = optr->mu; + if (g_cart_id == 0) { + printf("#\n# 2 kappa mu = %e, kappa = %e, c_sw = %e\n", g_mu, g_kappa, g_c_sw); + } + if(optr->type != CLOVER) { + if(use_preconditioning){ + g_precWS=(void*)optr->precWS; + } else { + g_precWS=NULL; + } + optr->iterations = invert_eo( optr->prop0, optr->prop1, optr->sr0, optr->sr1, + optr->eps_sq, optr->maxiter, + optr->solver, optr->rel_prec, + 0, optr->even_odd_flag,optr->no_extra_masses, + optr->extra_masses, optr->solver_params, optr->id, + optr->external_inverter, optr->sloppy_precision, optr->compression_type); + + /* check result */ + M_full(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], optr->prop0, optr->prop1); + } else { + /* this must be EE here! */ + /* to match clover_inv in Qsw_psi */ + sw_invert(EE, optr->mu); + /* now copy double sw and sw_inv fields to 32bit versions */ + copy_32_sw_fields(); + + optr->iterations = invert_clover_eo(optr->prop0, optr->prop1, optr->sr0, optr->sr1, + optr->eps_sq, optr->maxiter, + optr->solver, optr->rel_prec,optr->solver_params, + &g_gauge_field, &Qsw_pm_psi, &Qsw_minus_psi, + optr->external_inverter, optr->sloppy_precision, optr->compression_type); + + /* check result */ + Msw_full(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+1], optr->prop0, optr->prop1); + } + + diff(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI], optr->sr0, VOLUME / 2); + diff(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+1], optr->sr1, VOLUME / 2); + + nrm1 = square_norm(g_spinor_field[DUM_DERI], VOLUME / 2, 1); + nrm2 = square_norm(g_spinor_field[DUM_DERI+1], VOLUME / 2, 1); + optr->reached_prec = nrm1 + nrm2; + + /* convert to standard normalisation */ + /* we have to mult. by 2*kappa */ + if (optr->kappa != 0.) 
{ + mul_r(optr->prop0, (2*optr->kappa), optr->prop0, VOLUME / 2); + mul_r(optr->prop1, (2*optr->kappa), optr->prop1, VOLUME / 2); + } + if (optr->solver != CGMMS && write_prop) /* CGMMS handles its own I/O */ + optr->write_prop(op_id, index_start, i); + /* with DownProp the sign of mu is flipped for the next pass (and flipped back after the second pass) */ + if(optr->DownProp) { + optr->mu = -optr->mu; + } else + break; + } + } else if(optr->type == DBTMWILSON || optr->type == DBCLOVER) { + /* two-flavour operators: publish mubar/epsbar through the globals */ + g_mubar = optr->mubar; + g_epsbar = optr->epsbar; + g_c_sw = 0.; + if(optr->type == DBCLOVER) { + g_c_sw = optr->c_sw; + if (g_cart_id == 0 && g_debug_level > 1) { + printf("#\n# csw = %e, computing clover leafs\n", g_c_sw); + } + init_sw_fields(VOLUME); + sw_term( (const su3**) g_gauge_field, optr->kappa, optr->c_sw); + sw_invert_nd(optr->mubar*optr->mubar-optr->epsbar*optr->epsbar); + /* now copy double sw and sw_inv fields to 32bit versions */ + copy_32_sw_fields(); + } + + /* one inversion per source flavour; volume sources skip the second one (see the end of the loop body) */ + for(i = 0; i < SourceInfo.no_flavours; i++) { + if(optr->type != DBCLOVER) { + optr->iterations = invert_doublet_eo( optr->prop0, optr->prop1, optr->prop2, optr->prop3, + optr->sr0, optr->sr1, optr->sr2, optr->sr3, + optr->eps_sq, optr->maxiter, + optr->solver, optr->rel_prec, optr->solver_params, + optr->external_inverter, optr->sloppy_precision, optr->compression_type); + } else { + optr->iterations = invert_cloverdoublet_eo( optr->prop0, optr->prop1, optr->prop2, optr->prop3, + optr->sr0, optr->sr1, optr->sr2, optr->sr3, + optr->eps_sq, optr->maxiter, + optr->solver, optr->rel_prec, optr->solver_params, + optr->external_inverter, optr->sloppy_precision, optr->compression_type); + } + /* check result: first flavour with +mubar, results go to DUM_DERI+1 and DUM_DERI+2 */ + g_mu = optr->mubar; + if(optr->type != DBCLOVER) { + M_full(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2], optr->prop0, optr->prop1); + } else { + Msw_full(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+2], optr->prop0, optr->prop1); + } + assign_add_mul_r(g_spinor_field[DUM_DERI+1], optr->prop2, -optr->epsbar, VOLUME/2); + assign_add_mul_r(g_spinor_field[DUM_DERI+2], optr->prop3, -optr->epsbar, VOLUME/2); + + g_mu 
= -g_mu; + /* second flavour with the opposite sign of mu, results go to DUM_DERI+3 and DUM_DERI+4 */ + if(optr->type != DBCLOVER) { + M_full(g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+4], optr->prop2, optr->prop3); + } else { + Msw_full(g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+4], optr->prop2, optr->prop3); + } + assign_add_mul_r(g_spinor_field[DUM_DERI+3], optr->prop0, -optr->epsbar, VOLUME/2); + assign_add_mul_r(g_spinor_field[DUM_DERI+4], optr->prop1, -optr->epsbar, VOLUME/2); + + /* compare with the four source fields; reached_prec is the sum of the four squared norms */ + diff(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+1], optr->sr0, VOLUME/2); + diff(g_spinor_field[DUM_DERI+2], g_spinor_field[DUM_DERI+2], optr->sr1, VOLUME/2); + diff(g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+3], optr->sr2, VOLUME/2); + diff(g_spinor_field[DUM_DERI+4], g_spinor_field[DUM_DERI+4], optr->sr3, VOLUME/2); + + nrm1 = square_norm(g_spinor_field[DUM_DERI+1], VOLUME/2, 1); + nrm1 += square_norm(g_spinor_field[DUM_DERI+2], VOLUME/2, 1); + nrm1 += square_norm(g_spinor_field[DUM_DERI+3], VOLUME/2, 1); + nrm1 += square_norm(g_spinor_field[DUM_DERI+4], VOLUME/2, 1); + optr->reached_prec = nrm1; + g_mu = g_mu1; + /* For standard normalisation */ + /* we have to mult. 
by 2*kappa */ + mul_r(g_spinor_field[DUM_DERI], (2*optr->kappa), optr->prop0, VOLUME/2); + mul_r(g_spinor_field[DUM_DERI+1], (2*optr->kappa), optr->prop1, VOLUME/2); + mul_r(g_spinor_field[DUM_DERI+2], (2*optr->kappa), optr->prop2, VOLUME/2); + mul_r(g_spinor_field[DUM_DERI+3], (2*optr->kappa), optr->prop3, VOLUME/2); + /* the final result should be stored in the convention used in */ + /* hep-lat/0606011 */ + /* this requires multiplication of source with */ + /* (1+itau_2)/sqrt(2) and the result with (1-itau_2)/sqrt(2) */ + + mul_one_pm_itau2(optr->prop0, optr->prop2, g_spinor_field[DUM_DERI], + g_spinor_field[DUM_DERI+2], -1., VOLUME/2); + mul_one_pm_itau2(optr->prop1, optr->prop3, g_spinor_field[DUM_DERI+1], + g_spinor_field[DUM_DERI+3], -1., VOLUME/2); + /* write propagator */ + if(write_prop) optr->write_prop(op_id, index_start, i); + + /* presumably restores the prop* fields from the scaled copies kept in DUM_DERI..DUM_DERI+3 - TODO confirm mul_r semantics */ + /* NOTE(review): no guard against kappa == 0 here, unlike the single-flavour branch */ + mul_r(optr->prop0, 1./(2*optr->kappa), g_spinor_field[DUM_DERI], VOLUME/2); + mul_r(optr->prop1, 1./(2*optr->kappa), g_spinor_field[DUM_DERI+1], VOLUME/2); + mul_r(optr->prop2, 1./(2*optr->kappa), g_spinor_field[DUM_DERI+2], VOLUME/2); + mul_r(optr->prop3, 1./(2*optr->kappa), g_spinor_field[DUM_DERI+3], VOLUME/2); + + /* mirror source, but not for volume sources */ + if(i == 0 && SourceInfo.no_flavours == 2 && SourceInfo.type != 1) { + if (g_cart_id == 0) { + fprintf(stdout, "# Inversion done in %d iterations, squared residue = %e!\n", + optr->iterations, optr->reached_prec); + } + mul_one_pm_itau2(g_spinor_field[DUM_DERI], g_spinor_field[DUM_DERI+2], optr->sr0, optr->sr2, -1., VOLUME/2); + mul_one_pm_itau2(g_spinor_field[DUM_DERI+1], g_spinor_field[DUM_DERI+3], optr->sr1, optr->sr3, -1., VOLUME/2); + + mul_one_pm_itau2(optr->sr0, optr->sr2, g_spinor_field[DUM_DERI+2], g_spinor_field[DUM_DERI], +1., VOLUME/2); + mul_one_pm_itau2(optr->sr1, optr->sr3, g_spinor_field[DUM_DERI+3], g_spinor_field[DUM_DERI+1], +1., VOLUME/2); + } + /* volume sources need only one inversion */ + else if(SourceInfo.type == 1) i++; + } + } else 
if(optr->type == OVERLAP) { + g_mu = 0.; + m_ov=optr->m; + eigenvalues(&optr->no_ev, 5000, optr->ev_prec, 0, optr->ev_readwrite, nstore, optr->even_odd_flag); +/* ov_check_locality(); */ +/* index_jd(&optr->no_ev_index, 5000, 1.e-12, optr->conf_input, nstore, 4); */ + ov_n_cheby=optr->deg_poly; + + if(use_preconditioning==1) + g_precWS=(void*)optr->precWS; + else + g_precWS=NULL; + + + if(g_debug_level > 3) ov_check_ginsparg_wilson_relation_strong(); + + invert_overlap(op_id, index_start); + + if(write_prop) optr->write_prop(op_id, index_start, 0); + } + etime = gettime(); + if (g_cart_id == 0 && g_debug_level > 0) { + fprintf(stdout, "# Inversion done in %d iterations, squared residue = %e!\n", + optr->iterations, optr->reached_prec); + fprintf(stdout, "# Inversion done in %1.2e sec. \n", etime - atime); + } + return; +} + + +/* Writes the propagator fields of operator op_id (and, when PropInfo.format == 1, its + * source fields) to a file whose name is built from SourceInfo and a type-dependent + * ending. The file is opened in append mode when PropInfo.splitted is 0 or append_ is + * non-zero. index_start decides, for non-split files, whether the inverter info record + * is written (only when SourceInfo.ix == index_start). */ +void op_write_prop(const int op_id, const int index_start, const int append_) { + operator * optr = &operator_list[op_id]; + char filename[100]; + char ending[15]; + WRITER *writer = NULL; + int append = 0; + int status = 0; + + paramsSourceFormat *sourceFormat = NULL; + paramsPropagatorFormat *propagatorFormat = NULL; + paramsInverterInfo *inverterInfo = NULL; + if(optr->type == DBTMWILSON || optr->type == DBCLOVER) { + strcpy(ending, "hinverted"); + } + else if(optr->type == OVERLAP) { + strcpy(ending, "ovinverted"); + } + else { + strcpy(ending, "inverted"); + } + + /* snprintf bounds every write to the 100 bytes of filename; SourceInfo.basename has no length limit visible here */ + if(SourceInfo.type != 1) { + if (PropInfo.splitted) { + if(T_global > 99) snprintf(filename, sizeof(filename), "%s.%.4d.%.3d.%.2d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.t, SourceInfo.ix, ending); + else snprintf(filename, sizeof(filename), "%s.%.4d.%.2d.%.2d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.t, SourceInfo.ix, ending); + } + else { + if(T_global > 99) snprintf(filename, sizeof(filename), "%s.%.4d.%.3d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.t, ending); + else snprintf(filename, sizeof(filename), "%s.%.4d.%.2d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.t, ending); + } + } + else { + 
snprintf(filename, sizeof(filename), "%s.%.4d.%.5d.%s", SourceInfo.basename, SourceInfo.nstore, SourceInfo.sample, ending); + } + + if(!PropInfo.splitted || append_) + append = 1; + /* the 1 is for appending */ + construct_writer(&writer, filename, append); + if (PropInfo.splitted || SourceInfo.ix == index_start) { + inverterInfo = construct_paramsInverterInfo(optr->reached_prec, optr->iterations, + optr->solver, optr->no_flavours); + write_spinor_info(writer, PropInfo.format, inverterInfo, append); + free(inverterInfo); + } + /* write the source depending on format */ + /* to be fixed for 2 fl tmwilson */ + /* NOTE(review): the status returned by write_spinor is stored but never inspected */ + if (PropInfo.format == 1) { + sourceFormat = construct_paramsSourceFormat(SourceInfo.precision, optr->no_flavours, 4, 3); + write_source_format(writer, sourceFormat); + status = write_spinor(writer, &operator_list[op_id].sr0, &operator_list[op_id].sr1, + 1, SourceInfo.precision); + if(optr->no_flavours == 2) { + status = write_spinor(writer, &operator_list[op_id].sr2, &operator_list[op_id].sr3, + 1, SourceInfo.precision); + } + free(sourceFormat); + } + propagatorFormat = construct_paramsPropagatorFormat(optr->prop_precision, optr->no_flavours); + write_propagator_format(writer, propagatorFormat); + free(propagatorFormat); + + /* for two flavours the prop2/prop3 pair is written before prop0/prop1 */ + if(optr->no_flavours == 2) { + status = write_spinor(writer, &operator_list[op_id].prop2, &operator_list[op_id].prop3, 1, optr->prop_precision); + } + status = write_spinor(writer, &operator_list[op_id].prop0, &operator_list[op_id].prop1, 1, optr->prop_precision); + destruct_writer(writer); + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator.h new file mode 100644 index 0000000000000000000000000000000000000000..46430a8675d06889e989531f57f600036e6ac954 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator.h @@ -0,0 +1,126 @@ + +/*********************************************************************** + * + * Copyright (C) 2009 Carsten Urbach + * + * This file 
is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _OPERATOR_H +#define _OPERATOR_H + +#include +#include "solver/dirac_operator_eigenvectors.h" +#include "su3.h" +#include "solver/solver_params.h" + + +#define TMWILSON 0 +#define OVERLAP 1 +#define WILSON 2 +#define DBTMWILSON 3 +#define CLOVER 4 +#define DBCLOVER 5 + +#define max_no_operators 10 + +typedef struct { + /* ID of the operator */ + int type; + int id; + /* for overlap */ + int n_cheby; + int deg_poly; + int no_ev; + + SloppyPrecision sloppy_precision; + int even_odd_flag; + int solver; + int N_s; + int initialised; + int rel_prec; + int maxiter; + int iterations; + int prop_precision; + int no_flavours; + int DownProp; + int no_ev_index; + ExternalInverter external_inverter; + CompressionType compression_type; + + int error_code; + + double kappa; + /* for twisted */ + double mu; + double mubar; + /* for 2 flavour twisted */ + double epsbar; + /* solver residue */ + double eps_sq; + /* clover coefficient */ + double c_sw; + /* precision reached during inversion */ + double reached_prec; + /* for the overlap */ + double m; + double s; + double ev_qnorm; + double ev_minev; + double ev_prec; + int ev_readwrite; + /* generic place for sources */ + spinor *sr0, *sr1, *sr2, *sr3; + /* generic place for propagators */ + spinor *prop0, *prop1, *prop2, 
*prop3; + + /*solver parameters struct*/ + solver_params_t solver_params; + + /* multiple masses for CGMMS */ + double extra_masses[MAX_EXTRA_MASSES]; + int no_extra_masses; + + /* chebyshef coefficients for the overlap */ + double * coefs; + /* various versions of the Dirac operator */ + void (*applyM) (spinor * const, spinor * const); + void (*applyQ) (spinor * const, spinor * const); + /* with even/odd */ + void (*applyQp) (spinor * const, spinor * const); + void (*applyQm) (spinor * const, spinor * const); + void (*applyQsq) (spinor * const, spinor * const); + void (*applyMp) (spinor * const, spinor * const); + void (*applyMm) (spinor * const, spinor * const); + void (*applyDbQsq) (spinor * const, spinor * const, spinor * const, spinor * const); + /* the generic invert function */ + void (*inverter) (const int op_id, const int index_start, const int write_prop); + /* write the propagator */ + void (*write_prop) (const int op_id, const int index_start, const int append_); + char * conf_input; + + spinorPrecWS *precWS; + +} operator; + +/* operator list defined in operator.c */ +extern operator operator_list[max_no_operators]; +extern int no_operators; + +int add_operator(const int type); +int init_operators(); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/D_psi.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/D_psi.c new file mode 100644 index 0000000000000000000000000000000000000000..988e7f985b2f78740b1c553d992e2277396f7b22 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/D_psi.c @@ -0,0 +1,1601 @@ +/*********************************************************************** + * + * Copyright (C) 2001 Martin Luescher + * original code + * changed and extended for twisted mass 2002 Andrea Shindler + * 2007,2008 Carsten Urbach + * + * Blue Gene version Copyright (C) 2007 Carsten Urbach + * Block Dirac operator Copyright (C) 2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * Action of a Dirac operator D (Wilson or twisted) on a given spinor field + * + * various versions including a block version. + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "sse.h" +#include "boundary.h" +#ifdef MPI +# include "xchange/xchange.h" +#endif +#include "update_backward_gauge.h" +#include "block.h" +#include "operator/D_psi.h" + +#ifndef BENCHMARK +#include "solver/dirac_operator_eigenvectors.h" +//#else +//#include "benchmark_deps.h" +#endif + +#if (defined SSE23 || defined SSE33) + +#elif (defined BGL && defined XLC) + + +/* We have 32 registers available */ +static double _Complex reg00, reg01, reg02, reg03, reg04, reg05; +static double _Complex reg10, reg11, reg12, reg13, reg14, reg15; +/* For the gauge field, reuse the first three!*/ +static double _Complex u00, u01, u02, u10, u11, u12; +static double _Complex reg20, reg21; +/* The following contains the result spinor (12 regs) */ +static double _Complex rs00, rs01, rs02, rs10, rs11, rs12, rs20, rs21, rs22, + rs30, rs31, rs32; + + +/* this is the hopping part only */ +void local_H(spinor * const rr, spinor * const s, su3 * u, int * _idx) { + + int * idx = _idx; + su3 * restrict up ALIGN; + su3 * restrict um ALIGN; + spinor * 
restrict sp ALIGN; + spinor * restrict sm ALIGN; + +#pragma disjoint(*s, *sp, *sm, *rr, *up, *um) + + __alignx(16,rr); + __alignx(16,s); + + /*********************** direction +0 ************************/ + up = u; + sp = (spinor *) s + (*idx); + idx++; + + um = up+1; + _prefetch_su3(um); + sm = (spinor *) s + (*idx); + _prefetch_spinor(sm); + idx++; + + _bgl_load_reg0(sp->s0); + _bgl_load_reg1(sp->s1); + _bgl_load_reg0_up(sp->s2); + _bgl_load_reg1_up(sp->s3); + _bgl_vector_add_reg0(); + _bgl_vector_add_reg1(); + /* result is now in regx0, regx1, regx2 x = 0,1 */ + + _bgl_su3_multiply_double((*up)); + _bgl_vector_cmplx_mul_double(phase_0); + _bgl_add_to_rs0_reg0(); + _bgl_add_to_rs2_reg0(); + _bgl_add_to_rs1_reg1(); + _bgl_add_to_rs3_reg1(); + + /*********************** direction -0 ************************/ + up = um+1; + _prefetch_su3(up); + sp = (spinor*) s + (*idx); + _prefetch_spinor(sp); + idx++; + + _bgl_load_reg0(sm->s0); + _bgl_load_reg1(sm->s1); + _bgl_load_reg0_up(sm->s2); + _bgl_load_reg1_up(sm->s3); + _bgl_vector_sub_reg0(); + _bgl_vector_sub_reg1(); + + _bgl_su3_inverse_multiply_double((*um)); + _bgl_vector_cmplxcg_mul_double(phase_0); + + _bgl_add_to_rs0_reg0(); + _bgl_sub_from_rs2_reg0(); + _bgl_add_to_rs1_reg1(); + _bgl_sub_from_rs3_reg1(); + + /*********************** direction +1 ************************/ + + um = up+1; + _prefetch_su3(um); + sm = (spinor*) s + (*idx); + _prefetch_spinor(sm); + idx++; + + _bgl_load_reg0(sp->s0); + _bgl_load_reg1(sp->s1); + _bgl_load_reg0_up(sp->s3); + _bgl_load_reg1_up(sp->s2); + _bgl_vector_i_mul_add_reg0(); + _bgl_vector_i_mul_add_reg1(); + + _bgl_su3_multiply_double((*up)); + _bgl_vector_cmplx_mul_double(phase_1); + + _bgl_add_to_rs0_reg0(); + _bgl_i_mul_sub_from_rs3_reg0(); + _bgl_add_to_rs1_reg1(); + _bgl_i_mul_sub_from_rs2_reg1(); + + /*********************** direction -1 ************************/ + + up = um+1; + _prefetch_su3(up); + sp = (spinor*) s + (*idx); + _prefetch_spinor(sp); + idx++; + + 
_bgl_load_reg0(sm->s0); + _bgl_load_reg1(sm->s1); + _bgl_load_reg0_up(sm->s3); + _bgl_load_reg1_up(sm->s2); + _bgl_vector_i_mul_sub_reg0(); + _bgl_vector_i_mul_sub_reg1(); + + _bgl_su3_inverse_multiply_double((*um)); + _bgl_vector_cmplxcg_mul_double(phase_1); + + _bgl_add_to_rs0_reg0(); + _bgl_add_to_rs1_reg1(); + _bgl_i_mul_add_to_rs3_reg0(); + _bgl_i_mul_add_to_rs2_reg1(); + + /*********************** direction +2 ************************/ + + um = up+1; + _prefetch_su3(um); + sm = (spinor*) s + (*idx); + _prefetch_spinor(sm); + idx++; + + _bgl_load_reg0(sp->s0); + _bgl_load_reg1(sp->s1); + _bgl_load_reg1_up(sp->s2); + _bgl_load_reg0_up(sp->s3); + _bgl_vector_add_reg0(); + _bgl_vector_sub_reg1(); + + _bgl_su3_multiply_double((*up)); + _bgl_vector_cmplx_mul_double(phase_2); + + _bgl_add_to_rs0_reg0(); + _bgl_add_to_rs1_reg1(); + _bgl_sub_from_rs2_reg1(); + _bgl_add_to_rs3_reg0(); + + + /*********************** direction -2 ************************/ + up = um+1; + _prefetch_su3(up); + sp = (spinor*) s + (*idx); + _prefetch_spinor(sp); + idx++; + + _bgl_load_reg0(sm->s0); + _bgl_load_reg1(sm->s1); + _bgl_load_reg1_up(sm->s2); + _bgl_load_reg0_up(sm->s3); + _bgl_vector_sub_reg0(); + _bgl_vector_add_reg1(); + + _bgl_su3_inverse_multiply_double((*um)); + _bgl_vector_cmplxcg_mul_double(phase_2); + + _bgl_add_to_rs0_reg0(); + _bgl_add_to_rs1_reg1(); + _bgl_add_to_rs2_reg1(); + _bgl_sub_from_rs3_reg0(); + + /*********************** direction +3 ************************/ + um = up+1; + _prefetch_su3(um); + sm = (spinor*) s + (*idx); + _prefetch_spinor(sm); + + _bgl_load_reg0(sp->s0); + _bgl_load_reg1(sp->s1); + _bgl_load_reg0_up(sp->s2); + _bgl_load_reg1_up(sp->s3); + _bgl_vector_i_mul_add_reg0(); + _bgl_vector_i_mul_sub_reg1(); + + _bgl_su3_multiply_double((*up)); + _bgl_vector_cmplx_mul_double(phase_3); + + _bgl_add_to_rs0_reg0(); + _bgl_add_to_rs1_reg1(); + _bgl_i_mul_sub_from_rs2_reg0(); + _bgl_i_mul_add_to_rs3_reg1(); + + /*********************** direction -3 
************************/ + + _bgl_load_reg0(sm->s0); + _bgl_load_reg1(sm->s1); + _bgl_load_reg0_up(sm->s2); + _bgl_load_reg1_up(sm->s3); + _bgl_vector_i_mul_sub_reg0(); + _bgl_vector_i_mul_add_reg1(); + + _bgl_su3_inverse_multiply_double((*um)); + _bgl_vector_cmplxcg_mul_double(phase_3); + + _bgl_add_to_rs0_reg0(); + _bgl_store_rs0(rr->s0); + _bgl_i_mul_add_to_rs2_reg0(); + _bgl_store_rs2(rr->s2); + + _bgl_add_to_rs1_reg1(); + _bgl_store_rs1(rr->s1); + _bgl_i_mul_sub_from_rs3_reg1(); + _bgl_store_rs3(rr->s3); + +} + + +#else + + +static inline void p0add(spinor * restrict const tmpr , spinor const * restrict const s, + su3 const * restrict const u, const _Complex double phase) { + +#ifdef OMP +#define static +#endif + static su3_vector chi, psi; +#ifdef OMP +#undef static +#endif + + _vector_add(psi,s->s0, s->s2); + _su3_multiply(chi, (*u), psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s0, psi); + _vector_add_assign(tmpr->s2, psi); + + _vector_add(psi, s->s1, s->s3); + _su3_multiply(chi, (*u), psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s1, psi); + _vector_add_assign(tmpr->s3, psi); + + return; +} + + +static inline void m0add(spinor * restrict const tmpr, spinor const * restrict const s, + su3 const * restrict const u, const _Complex double phase) { +#ifdef OMP +#define static +#endif + static su3_vector chi, psi; +#ifdef OMP +#undef static +#endif + + _vector_sub(psi, s->s0, s->s2); + _su3_inverse_multiply(chi, (*u), psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s0, psi); + _vector_sub_assign(tmpr->s2, psi); + + _vector_sub(psi, s->s1, s->s3); + _su3_inverse_multiply(chi, (*u), psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s1, psi); + _vector_sub_assign(tmpr->s3, psi); + + return; +} + +static inline void p1add(spinor * restrict const tmpr, spinor const * restrict const s, + su3 const * restrict const u, const _Complex double phase) 
{ +#ifdef OMP +#define static +#endif + static su3_vector chi, psi; +#ifdef OMP +#undef static +#endif + + _vector_i_add(psi,s->s0,s->s3); + _su3_multiply(chi,(*u),psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s0, psi); + _vector_i_sub_assign(tmpr->s3, psi); + + _vector_i_add(psi, s->s1, s->s2); + _su3_multiply(chi, (*u), psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s1, psi); + _vector_i_sub_assign(tmpr->s2, psi); + + return; +} + +static inline void m1add(spinor * restrict const tmpr, spinor const * restrict const s, + su3 const * restrict const u, const _Complex double phase) { +#ifdef OMP +#define static +#endif + static su3_vector chi, psi; +#ifdef OMP +#undef static +#endif + + _vector_i_sub(psi,s->s0, s->s3); + _su3_inverse_multiply(chi,(*u), psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s0, psi); + _vector_i_add_assign(tmpr->s3, psi); + + _vector_i_sub(psi, s->s1, s->s2); + _su3_inverse_multiply(chi, (*u), psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s1, psi); + _vector_i_add_assign(tmpr->s2, psi); + + return; +} + +static inline void p2add(spinor * restrict const tmpr, spinor const * restrict const s, + su3 const * restrict const u, const _Complex double phase) { +#ifdef OMP +#define static +#endif + static su3_vector chi, psi; +#ifdef OMP +#undef static +#endif + + _vector_add(psi,s->s0,s->s3); + _su3_multiply(chi, (*u), psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s0, psi); + _vector_add_assign(tmpr->s3, psi); + + _vector_sub(psi,s->s1,s->s2); + _su3_multiply(chi, (*u), psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s1, psi); + _vector_sub_assign(tmpr->s2, psi); + + + return; +} + +static inline void m2add(spinor * restrict const tmpr, spinor const * restrict const s, + su3 const * restrict const u, const _Complex double phase) { +#ifdef OMP +#define static +#endif 
+ static su3_vector chi, psi; +#ifdef OMP +#undef static +#endif + + _vector_sub(psi, s->s0, s->s3); + _su3_inverse_multiply(chi, (*u), psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s0, psi); + _vector_sub_assign(tmpr->s3, psi); + + _vector_add(psi, s->s1, s->s2); + _su3_inverse_multiply(chi, (*u),psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s1, psi); + _vector_add_assign(tmpr->s2, psi); + + return; +} + +static inline void p3add(spinor * restrict const tmpr, spinor const * restrict const s, + su3 const * restrict const u, const _Complex double phase) { +#ifdef OMP +#define static +#endif + static su3_vector chi, psi; +#ifdef OMP +#undef static +#endif + + _vector_i_add(psi, s->s0, s->s2); + _su3_multiply(chi, (*u), psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s0, psi); + _vector_i_sub_assign(tmpr->s2, psi); + + _vector_i_sub(psi,s->s1, s->s3); + _su3_multiply(chi, (*u), psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s1, psi); + _vector_i_add_assign(tmpr->s3, psi); + + return; +} + +static inline void m3addandstore(spinor * restrict const r, spinor const * restrict const s, + su3 const * restrict const u, const _Complex double phase, + spinor const * restrict const tmpr) { +#ifdef OMP +#define static +#endif + static su3_vector chi, psi; +#ifdef OMP +#undef static +#endif + + _vector_i_sub(psi,s->s0, s->s2); + _su3_inverse_multiply(chi, (*u), psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add(r->s0, tmpr->s0, psi); + _vector_i_add(r->s2, tmpr->s2, psi); + + _vector_i_add(psi, s->s1, s->s3); + _su3_inverse_multiply(chi, (*u), psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add(r->s1, tmpr->s1, psi); + _vector_i_sub(r->s3, tmpr->s3, psi); + + return; +} + +/* this is the hopping part only */ +static inline void local_H(spinor * const rr, spinor const * const s, su3 const * restrict u, int * _idx, spinor * 
const restrict tmpr) { + + int * idx = _idx; + + /****** direction +0 ******/ + p0add(tmpr, s + (*idx), u, phase_0); + u++; + idx++; + /****** direction -0 ******/ + m0add(tmpr, s + (*idx), u, phase_0); + u++; + idx++; + /****** direction +1 ******/ + p1add(tmpr, s + (*idx), u, phase_1); + u++; + idx++; + /****** direction -1 ******/ + m1add(tmpr, s + (*idx), u, phase_1); + u++; + idx++; + /****** direction +2 ******/ + p2add(tmpr, s + (*idx), u, phase_2); + u++; + idx++; + /****** direction -2 ******/ + m2add(tmpr, s + (*idx), u, phase_2); + u++; + idx++; + /****** direction +3 ******/ + p3add(tmpr, s + (*idx), u, phase_3); + u++; + idx++; + /****** direction -3 ******/ + m3addandstore(rr, s + (*idx), u, phase_3, tmpr); + + return; +} + + +#endif + +#if (defined SSE2 || defined SSE3) + +/* Serially Checked ! */ +void D_psi(spinor * const P, spinor * const Q){ + + if(P==Q){ + printf("Error in D_psi (operator.c):\n"); + printf("Arguments must be differen spinor fields\n"); + printf("Program aborted\n"); + exit(1); + } + +#ifdef _GAUGE_COPY2 + if(g_update_gauge_copy) { + update_backward_gauge(g_gauge_field); + } +#endif + +# if defined MPI + xchange_lexicfield(Q); +# endif + +#ifdef OMP +#pragma omp parallel + { +#endif + int ix,iy,iz; + su3 *up,*um; + spinor *s,*sp,*sm,*rn; + _Complex double fact1, fact2; + spinor rs __attribute__ ((aligned (16))); + + fact1 = 1. 
+ g_mu * I; + fact2 = conj(fact1); + +#ifndef OMP + iy=g_iup[0][0]; + sp=(spinor *) Q + iy; + up=&g_gauge_field[0][0]; +#endif + + /************************ loop over all lattice sites *************************/ +#ifdef OMP +#pragma omp for +#endif + for (ix=0;ixs0); + _sse_load_up(sp->s2); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(phase_0); + _sse_store_up(rs.s2); + + _sse_load_up(s->s0); + _sse_vector_cmplx_mul(fact1); +/* _sse_vector_mul(fact1); */ + _sse_load(rs.s2); + _sse_vector_add(); + _sse_store(rs.s0); + + _sse_load_up(s->s2); + _sse_vector_cmplx_mul(fact2); +/* _sse_vector_mul(fact1); */ + _sse_load(rs.s2); + _sse_vector_add(); + _sse_store(rs.s2); + + um=&g_gauge_field[iy][0]; + _prefetch_su3(um); + + _sse_load(sp->s1); + _sse_load_up(sp->s3); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(phase_0); + _sse_store_up(rs.s3); + + _sse_load_up(s->s1); + _sse_vector_cmplx_mul(fact1); +/* _sse_vector_mul(fact1); */ + _sse_load(rs.s3); + _sse_vector_add(); + _sse_store(rs.s1); + + _sse_load_up(s->s3); + _sse_vector_cmplx_mul(fact2); +/* _sse_vector_mul(fact1); */ + _sse_load(rs.s3); + _sse_vector_add(); + _sse_store(rs.s3); + + /******************************* direction -0 *********************************/ + + iy=g_iup[ix][1]; + + sp = (spinor *) Q + iy; + _prefetch_spinor(sp); + + _sse_load(sm->s0); + _sse_load_up(sm->s2); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(phase_0); + _sse_load(rs.s0); + _sse_vector_add(); + _sse_store(rs.s0); + + _sse_load(rs.s2); + _sse_vector_sub(); + _sse_store(rs.s2); + + up+=1; + _prefetch_su3(up); + + _sse_load(sm->s1); + _sse_load_up(sm->s3); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(phase_0); + _sse_load(rs.s1); + _sse_vector_add(); + _sse_store(rs.s1); + + _sse_load(rs.s3); + _sse_vector_sub(); + _sse_store(rs.s3); + + /******************************* direction +1 
*********************************/ + + iy=g_idn[ix][1]; + + sm = (spinor *) Q + iy; + _prefetch_spinor(sm); + + _sse_load(sp->s0); + _sse_load_up(sp->s3); + _sse_vector_i_mul(); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(phase_1); + _sse_load(rs.s0); + _sse_vector_add(); + _sse_store(rs.s0); + + _sse_load(rs.s3); + _sse_vector_i_mul(); + _sse_vector_sub(); + _sse_store(rs.s3); + + um=&g_gauge_field[iy][1]; + _prefetch_su3(um); + + _sse_load(sp->s1); + _sse_load_up(sp->s2); + _sse_vector_i_mul(); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(phase_1); + _sse_load(rs.s1); + _sse_vector_add(); + _sse_store(rs.s1); + + _sse_load(rs.s2); + _sse_vector_i_mul(); + _sse_vector_sub(); + _sse_store(rs.s2); + + /******************************* direction -1 *********************************/ + + iy=g_iup[ix][2]; + + sp = (spinor *) Q + iy; + _prefetch_spinor(sp); + + _sse_load(sm->s0); + _sse_load_up(sm->s3); + _sse_vector_i_mul(); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(phase_1); + _sse_load(rs.s0); + _sse_vector_add(); + _sse_store(rs.s0); + + _sse_load(rs.s3); + _sse_vector_i_mul(); + _sse_vector_add(); + _sse_store(rs.s3); + + up+=1; + _prefetch_su3(up); + + _sse_load(sm->s1); + _sse_load_up(sm->s2); + _sse_vector_i_mul(); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(phase_1); + _sse_load(rs.s1); + _sse_vector_add(); + _sse_store(rs.s1); + + _sse_load(rs.s2); + _sse_vector_i_mul(); + _sse_vector_add(); + _sse_store(rs.s2); + + /******************************* direction +2 *********************************/ + + iy=g_idn[ix][2]; + + sm = (spinor *) Q + iy; + _prefetch_spinor(sm); + + _sse_load(sp->s0); + _sse_load_up(sp->s3); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(phase_2); + _sse_load(rs.s0); + _sse_vector_add(); + _sse_store(rs.s0); + + _sse_load(rs.s3); + _sse_vector_add(); + 
_sse_store(rs.s3); + + um=&g_gauge_field[iy][2]; + _prefetch_su3(um); + + _sse_load(sp->s1); + _sse_load_up(sp->s2); + _sse_vector_sub(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(phase_2); + _sse_load(rs.s1); + _sse_vector_add(); + _sse_store(rs.s1); + + _sse_load(rs.s2); + _sse_vector_sub(); + _sse_store(rs.s2); + + /******************************* direction -2 *********************************/ + + iy=g_iup[ix][3]; + + sp = (spinor *) Q + iy; + _prefetch_spinor(sp); + + _sse_load(sm->s0); + _sse_load_up(sm->s3); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(phase_2); + _sse_load(rs.s0); + _sse_vector_add(); + _sse_store(rs.s0); + + _sse_load(rs.s3); + _sse_vector_sub(); + _sse_store(rs.s3); + + up+=1; + _prefetch_su3(up); + + _sse_load(sm->s1); + _sse_load_up(sm->s2); + _sse_vector_add(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(phase_2); + _sse_load(rs.s1); + _sse_vector_add(); + _sse_store(rs.s1); + + _sse_load(rs.s2); + _sse_vector_add(); + _sse_store(rs.s2); + + /******************************* direction +3 *********************************/ + + iy=g_idn[ix][3]; + + sm = (spinor *) Q + iy; + _prefetch_spinor(sm); + + _sse_load(sp->s0); + _sse_load_up(sp->s2); + _sse_vector_i_mul(); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(phase_3); + _sse_load(rs.s0); + _sse_vector_add(); + _sse_store(rs.s0); + + _sse_load(rs.s2); + _sse_vector_i_mul(); + _sse_vector_sub(); + _sse_store(rs.s2); + + um=&g_gauge_field[iy][3]; + _prefetch_su3(um); + + _sse_load(sp->s1); + _sse_load_up(sp->s3); + _sse_vector_i_mul(); + _sse_vector_sub(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(phase_3); + _sse_load(rs.s1); + _sse_vector_add(); + _sse_store(rs.s1); + + _sse_load(rs.s3); + _sse_vector_i_mul(); + _sse_vector_add(); + _sse_store(rs.s3); + + /******************************* direction -3 *********************************/ + + iz=(ix+1+VOLUME)%VOLUME; + + 
iy=g_iup[iz][0]; + + sp = (spinor *) Q + iy; + _prefetch_spinor(sp); + + _sse_load(sm->s0); + _sse_load_up(sm->s2); + _sse_vector_i_mul(); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(phase_3); + rn = (spinor *) P + ix; + + _sse_load(rs.s0); + _sse_vector_add(); +/* _sse_vector_mul(fact2); */ + _sse_store_nt(rn->s0); + + _sse_load(rs.s2); + _sse_vector_i_mul(); + _sse_vector_add(); +/* _sse_vector_mul(fact2); */ + _sse_store_nt(rn->s2); + + up=&g_gauge_field[iz][0]; + _prefetch_su3(up); + + _sse_load(sm->s1); + _sse_load_up(sm->s3); + _sse_vector_i_mul(); + _sse_vector_add(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(phase_3); + _sse_load(rs.s1); + _sse_vector_add(); +/* _sse_vector_mul(fact2); */ + _sse_store_nt(rn->s1); + + _sse_load(rs.s3); + _sse_vector_i_mul(); + _sse_vector_sub(); +/* _sse_vector_mul(fact2); */ + _sse_store_nt(rn->s3); + + /******************************** end of loop *********************************/ + + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} + +#elif ((defined BGL) && (defined XLC)) + + +/********************************** + * + * Blue Gene/L Version + * + * Author: Carsten Urbach + * + **********************************/ +/* Checked! 
*/ +void D_psi(spinor * const P, spinor * const Q){ + int ix,iy,iz; + static _Complex double fact1; + su3 * restrict up ALIGN; + su3 * restrict um ALIGN; + spinor * restrict s ALIGN; + spinor * restrict sp ALIGN; + spinor * restrict sm ALIGN; + spinor * restrict rn ALIGN; + +#pragma disjoint(*s, *sp, *sm, *rn, *up, *um, *P, *Q) + + __alignx(16,P); + __alignx(16,Q); + +#ifdef _GAUGE_COPY + if(g_update_gauge_copy) { + update_backward_gauge(g_gauge_field); + } +#endif + +# if (defined MPI && !(defined _NO_COMM)) + xchange_lexicfield(Q); +# endif + + fact1 = 1.0 + g_mu * I; + + iy=g_iup[0][0]; + sp=(spinor *) Q + iy; + up=&g_gauge_field[0][0]; + + /**************** loop over all lattice sites ******************/ + for(ix = 0; ix < VOLUME; ix++){ + s=(spinor *) Q + ix; + rn = (spinor *) P + ix; + /*********************** direction +0 ************************/ + + iy=g_idn[ix][0]; + + um=&g_gauge_field[iy][0]; + + _prefetch_su3(um); + sm = (spinor*) Q + iy; + _prefetch_spinor(sm); + + _bgl_load_reg0(sp->s0); + _bgl_load_reg1(sp->s1); + _bgl_load_reg0_up(sp->s2); + _bgl_load_reg1_up(sp->s3); + _bgl_vector_add_reg0(); + _bgl_vector_add_reg1(); + /* result is now in regx0, regx1, regx2 x = 0,1 */ + + _bgl_su3_multiply_double((*up)); + _bgl_vector_cmplx_mul_double(phase_0); + _bgl_load_rs0(s->s0); + _bgl_load_rs1(s->s1); + _bgl_load_rs2(s->s2); + _bgl_load_rs3(s->s3); + _bgl_vector_cmplx_mul_rs(fact1); + _bgl_add_to_rs0_reg0(); + _bgl_add_to_rs2_reg0(); + _bgl_add_to_rs1_reg1(); + _bgl_add_to_rs3_reg1(); + + /*********************** direction -0 ************************/ + + iy=g_iup[ix][1]; + + up+=1; + _prefetch_su3(up); + sp = (spinor *) Q + iy; + _prefetch_spinor(sp); + + _bgl_load_reg0(sm->s0); + _bgl_load_reg1(sm->s1); + _bgl_load_reg0_up(sm->s2); + _bgl_load_reg1_up(sm->s3); + _bgl_vector_sub_reg0(); + _bgl_vector_sub_reg1(); + + _bgl_su3_inverse_multiply_double((*um)); + _bgl_vector_cmplxcg_mul_double(phase_0); + + _bgl_add_to_rs0_reg0(); + _bgl_sub_from_rs2_reg0(); 
+ _bgl_add_to_rs1_reg1(); + _bgl_sub_from_rs3_reg1(); + + /*********************** direction +1 ************************/ + + iy=g_idn[ix][1]; + + um=&g_gauge_field[iy][1]; + + _prefetch_su3(um); + sm = (spinor *) Q + iy; + _prefetch_spinor(sm); + + _bgl_load_reg0(sp->s0); + _bgl_load_reg1(sp->s1); + _bgl_load_reg0_up(sp->s3); + _bgl_load_reg1_up(sp->s2); + _bgl_vector_i_mul_add_reg0(); + _bgl_vector_i_mul_add_reg1(); + + _bgl_su3_multiply_double((*up)); + _bgl_vector_cmplx_mul_double(phase_1); + + _bgl_add_to_rs0_reg0(); + _bgl_i_mul_sub_from_rs3_reg0(); + _bgl_add_to_rs1_reg1(); + _bgl_i_mul_sub_from_rs2_reg1(); + + /*********************** direction -1 ************************/ + + iy=g_iup[ix][2]; + + up+=1; + _prefetch_su3(up); + sp = (spinor *) Q + iy; + _prefetch_spinor(sp); + + _bgl_load_reg0(sm->s0); + _bgl_load_reg1(sm->s1); + _bgl_load_reg0_up(sm->s3); + _bgl_load_reg1_up(sm->s2); + _bgl_vector_i_mul_sub_reg0(); + _bgl_vector_i_mul_sub_reg1(); + + _bgl_su3_inverse_multiply_double((*um)); + _bgl_vector_cmplxcg_mul_double(phase_1); + + _bgl_add_to_rs0_reg0(); + _bgl_add_to_rs1_reg1(); + _bgl_i_mul_add_to_rs3_reg0(); + _bgl_i_mul_add_to_rs2_reg1(); + + /*********************** direction +2 ************************/ + + iy=g_idn[ix][2]; + + um=&g_gauge_field[iy][2]; + _prefetch_su3(um); + sm = (spinor *) Q + iy; + _prefetch_spinor(sm); + + _bgl_load_reg0(sp->s0); + _bgl_load_reg1(sp->s1); + _bgl_load_reg1_up(sp->s2); + _bgl_load_reg0_up(sp->s3); + _bgl_vector_add_reg0(); + _bgl_vector_sub_reg1(); + + _bgl_su3_multiply_double((*up)); + _bgl_vector_cmplx_mul_double(phase_2); + + _bgl_add_to_rs0_reg0(); + _bgl_add_to_rs1_reg1(); + _bgl_sub_from_rs2_reg1(); + _bgl_add_to_rs3_reg0(); + + + /*********************** direction -2 ************************/ + + iy=g_iup[ix][3]; + + up+=1; + _prefetch_su3(up); + sp = (spinor *) Q + iy; + _prefetch_spinor(sp); + + _bgl_load_reg0(sm->s0); + _bgl_load_reg1(sm->s1); + _bgl_load_reg1_up(sm->s2); + _bgl_load_reg0_up(sm->s3); 
+ _bgl_vector_sub_reg0(); + _bgl_vector_add_reg1(); + + _bgl_su3_inverse_multiply_double((*um)); + _bgl_vector_cmplxcg_mul_double(phase_2); + + _bgl_add_to_rs0_reg0(); + _bgl_add_to_rs1_reg1(); + _bgl_add_to_rs2_reg1(); + _bgl_sub_from_rs3_reg0(); + + /*********************** direction +3 ************************/ + + iy=g_idn[ix][3]; + + um=&g_gauge_field[iy][3]; + _prefetch_su3(um); + sm = (spinor *) Q + iy; + _prefetch_spinor(sm); + + _bgl_load_reg0(sp->s0); + _bgl_load_reg1(sp->s1); + _bgl_load_reg0_up(sp->s2); + _bgl_load_reg1_up(sp->s3); + _bgl_vector_i_mul_add_reg0(); + _bgl_vector_i_mul_sub_reg1(); + + _bgl_su3_multiply_double((*up)); + _bgl_vector_cmplx_mul_double(phase_3); + + _bgl_add_to_rs0_reg0(); + _bgl_add_to_rs1_reg1(); + _bgl_i_mul_sub_from_rs2_reg0(); + _bgl_i_mul_add_to_rs3_reg1(); + + /*********************** direction -3 ************************/ + + iz=(ix+1+VOLUME)%VOLUME; + + iy=g_iup[iz][0]; + + up=&g_gauge_field[iz][0]; + _prefetch_su3(up); + sp = (spinor *) Q + iy; + _prefetch_spinor(sp); + + _bgl_load_reg0(sm->s0); + _bgl_load_reg1(sm->s1); + _bgl_load_reg0_up(sm->s2); + _bgl_load_reg1_up(sm->s3); + _bgl_vector_i_mul_sub_reg0(); + _bgl_vector_i_mul_add_reg1(); + + _bgl_su3_inverse_multiply_double((*um)); + _bgl_vector_cmplxcg_mul_double(phase_3); + + _bgl_add_to_rs0_reg0(); + _bgl_store_rs0(rn->s0); + _bgl_i_mul_add_to_rs2_reg0(); + _bgl_store_rs2(rn->s2); + + _bgl_add_to_rs1_reg1(); + _bgl_store_rs1(rn->s1); + _bgl_i_mul_sub_from_rs3_reg1(); + _bgl_store_rs3(rn->s3); + + /************************ end of loop ************************/ + } +} + + +#else + +/* Serially Checked ! 
*/ + +void D_psi(spinor * const P, spinor * const Q){ + if(P==Q){ + printf("Error in D_psi (operator.c):\n"); + printf("Arguments must be different spinor fields\n"); + printf("Program aborted\n"); + exit(1); + } +#ifdef _GAUGE_COPY + if(g_update_gauge_copy) { + update_backward_gauge(g_gauge_field); + } +#endif +# if defined MPI + xchange_lexicfield(Q); +# endif + +#ifdef OMP +#pragma omp parallel + { +#endif + + int ix,iy; + su3 * restrict up,* restrict um; + spinor * restrict rr; + spinor const * restrict s; + spinor const * restrict sp; + spinor const * restrict sm; + _Complex double rho1, rho2; + spinor tmpr; + + rho1 = 1. + g_mu * I; + rho2 = conj(rho1); + + /************************ loop over all lattice sites *************************/ + +#ifdef OMP +#pragma omp for +#endif + for (ix=0;ixs0); + _complex_times_vector(tmpr.s1, rho1, s->s1); + _complex_times_vector(tmpr.s2, rho2, s->s2); + _complex_times_vector(tmpr.s3, rho2, s->s3); + + /******************************* direction +0 *********************************/ + iy=g_iup[ix][0]; + sp = (spinor *) Q +iy; + up=&g_gauge_field[ix][0]; + p0add(&tmpr, sp, up, phase_0); + + /******************************* direction -0 *********************************/ + iy=g_idn[ix][0]; + sm = (spinor *) Q +iy; + um=&g_gauge_field[iy][0]; + m0add(&tmpr, sm, um, phase_0); + + /******************************* direction +1 *********************************/ + iy=g_iup[ix][1]; + sp = (spinor *) Q +iy; + up=&g_gauge_field[ix][1]; + p1add(&tmpr, sp, up, phase_1); + + /******************************* direction -1 *********************************/ + iy=g_idn[ix][1]; + sm = (spinor *) Q +iy; + um=&g_gauge_field[iy][1]; + m1add(&tmpr, sm, um, phase_1); + + /******************************* direction +2 *********************************/ + iy=g_iup[ix][2]; + sp = (spinor *) Q +iy; + up=&g_gauge_field[ix][2]; + p2add(&tmpr, sp, up, phase_2); + + /******************************* direction -2 *********************************/ + 
iy=g_idn[ix][2]; + sm = (spinor *) Q +iy; + um=&g_gauge_field[iy][2]; + m2add(&tmpr, sm, um, phase_2); + + /******************************* direction +3 *********************************/ + iy=g_iup[ix][3]; + sp = (spinor *) Q +iy; + up=&g_gauge_field[ix][3]; + p3add(&tmpr, sp, up, phase_3); + + /******************************* direction -3 *********************************/ + iy=g_idn[ix][3]; + sm = (spinor *) Q +iy; + um=&g_gauge_field[iy][3]; + m3addandstore(rr, sm, um, phase_3, &tmpr); + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} + +#endif + +#ifndef BENCHMARK +void D_psi_prec(spinor * const P, spinor * const Q){ + + /* todo: do preconditioning */ + spinorPrecWS *ws=(spinorPrecWS*)g_precWS; + static _Complex double alpha = -1.0; + + alpha = -0.5; + spinorPrecondition(P,Q,ws,T,L,alpha,0,1); + D_psi(g_spinor_field[DUM_MATRIX],P); + alpha = -0.5; + spinorPrecondition(P,g_spinor_field[DUM_MATRIX],ws,T,L,alpha,0,1); +} + +/* apply the Dirac operator to the block local spinor field s */ +/* and store the result in block local spinor field rr */ +/* for block blk */ +/* the block local gauge field is assumed to be in the order */ +/* that is needed int local_D, which means also that it is a */ +/* double copy */ + +void Block_D_psi(block * blk, spinor * const rr, spinor * const s) { + int i; + spinor *r = rr; + spinor *t = s; + su3 * u = blk->u; + int * idx = blk->idx; + static _Complex double rhoa, rhob; + spinor tmpr; +#if (defined BGL && defined XLC) + __alignx(16,s); +#endif + if(blk_gauge_eo) { + init_blocks_gaugefield(); + } + rhoa = 1.0 + g_mu * I; + rhob = conj(rhoa); + + /* set the boundary term to zero */ + _spinor_null(rr[blk->volume]); + _spinor_null(s[blk->volume]); + + for(i = 0; i < blk->volume; i++) { +#if (defined BGL && defined XLC) + _bgl_load_rs0(t->s0); + _bgl_load_rs1(t->s1); + _bgl_load_rs2(t->s2); + _bgl_load_rs3(t->s3); + _bgl_vector_cmplx_mul_rs(rhoa); +#else + _complex_times_vector(tmpr.s0, rhoa, t->s0); + 
_complex_times_vector(tmpr.s1, rhoa, t->s1); + _complex_times_vector(tmpr.s2, rhob, t->s2); + _complex_times_vector(tmpr.s3, rhob, t->s3); +#endif + + local_H(r, s, u, idx, &tmpr); + + r++; + t++; + idx += 8; + u += 8; + } + return; +} + +/* Apply Hopping Matrix to a even(odd) spinor */ +void Block_H_psi(block * blk, spinor * const rr, spinor * const s, const int eo) { + int i; + spinor *r = rr; + su3 * u = blk->u; + int * eoidx = blk->evenidx; + spinor tmpr; + + if(!blk_gauge_eo) { + init_blocks_eo_gaugefield(); + } + + /* for OE */ + if(eo == 1) { + u = blk->u + blk->volume*8/2; + eoidx = blk->oddidx; + } + + /* set the boundary term to zero */ + _spinor_null(rr[blk->volume/2]); + _spinor_null(s[blk->volume/2]); + + for(i = 0; i < blk->volume/2; i++) { +#if (defined BGL && defined XLC) + _spinor_null(tmpr); + _bgl_load_rs0(tmpr.s0); + _bgl_load_rs1(tmpr.s1); + _bgl_load_rs2(tmpr.s2); + _bgl_load_rs3(tmpr.s3); +#else + _spinor_null(tmpr); +#endif + + local_H(r, s, u, eoidx, &tmpr); + + r++; + eoidx += 8; + u += 8; + } + return; +} + +#endif +/* direction +t */ +void boundary_D_0(spinor * const r, spinor * const s, su3 * const u) { + + static su3_vector chi, psi; + + _vector_add(psi,s->s0,s->s2); + + _su3_multiply(chi,(*u),psi); + + _complex_times_vector(r->s0, phase_0, chi); + _vector_assign(r->s2,r->s0); + + _vector_add(psi,s->s1,s->s3); + + _su3_multiply(chi,(*u),psi); + + _complex_times_vector(r->s1, phase_0, chi); + _vector_assign(r->s3, r->s1); + + return; +} + +/* direction -t */ +void boundary_D_1(spinor * const r, spinor * const s, su3 * restrict u) { + + static su3_vector chi, psi; + + _vector_sub(psi, s->s0, s->s2); + + _su3_inverse_multiply(chi, (*u), psi); + + _complexcjg_times_vector(r->s0, phase_0, chi); + _vector_minus_assign(r->s2, r->s0); + + _vector_sub(psi,s->s1,s->s3); + + _su3_inverse_multiply(chi,(*u),psi); + + _complexcjg_times_vector(r->s1,phase_0,chi); + _vector_minus_assign(r->s3, r->s1); + + return; +} + +/* direction +x */ +void 
boundary_D_2(spinor * const r, spinor * const s, su3 * restrict u) { + + static su3_vector chi, psi; + + _vector_i_add(psi,s->s0,s->s3); + + _su3_multiply(chi,(*u),psi); + + _complex_times_vector(r->s0, phase_1, chi); + _vector_null(r->s3); + _vector_i_sub_assign(r->s3, r->s0); + + _vector_i_add(psi,s->s1,s->s2); + + _su3_multiply(chi,(*u),psi); + + _complex_times_vector(r->s1, phase_1, chi); + _vector_null(r->s2); + _vector_i_sub_assign(r->s2, r->s1); + + return; +} + +/* direction -x */ +void boundary_D_3(spinor * const r, spinor * const s, su3 * restrict u) { + + static su3_vector chi, psi; + + _vector_i_sub(psi,s->s0,s->s3); + + _su3_inverse_multiply(chi,(*u),psi); + + _complexcjg_times_vector(r->s0, phase_1, chi); + _vector_null(r->s3); + _vector_i_add_assign(r->s3, r->s0); + + _vector_i_sub(psi,s->s1,s->s2); + + _su3_inverse_multiply(chi,(*u),psi); + + _complexcjg_times_vector(r->s1, phase_1, chi); + _vector_null(r->s2); + _vector_i_add_assign(r->s2, r->s1); + + return; +} + +/* direction +y */ +void boundary_D_4(spinor * const r, spinor * const s, su3 * restrict u) { + + static su3_vector chi, psi; + + _vector_add(psi,s->s0,s->s3); + + _su3_multiply(chi,(*u),psi); + + _complex_times_vector(r->s0, phase_2, chi); + _vector_assign(r->s3, r->s0); + + _vector_sub(psi,s->s1,s->s2); + + _su3_multiply(chi,(*u),psi); + + _complex_times_vector(r->s1, phase_2, chi); + _vector_minus_assign(r->s2, r->s1); + + return; +} + +/* direction -y */ +void boundary_D_5(spinor * const r, spinor * const s, su3 * restrict u) { + + static su3_vector chi, psi; + + _vector_sub(psi,s->s0,s->s3); + + _su3_inverse_multiply(chi,(*u),psi); + + _complexcjg_times_vector(r->s0, phase_2, chi); + _vector_minus_assign(r->s3, r->s0); + + _vector_add(psi,s->s1,s->s2); + + _su3_inverse_multiply(chi,(*u),psi); + + _complexcjg_times_vector(r->s1, phase_2, chi); + _vector_assign(r->s2, r->s1); + + + return; +} + +/* direction +z */ +void boundary_D_6(spinor * const r, spinor * const s, su3 * restrict 
u) { + + static su3_vector chi, psi; + + _vector_i_add(psi,s->s0,s->s2); + + _su3_multiply(chi,(*u),psi); + + _complex_times_vector(r->s0, phase_3, chi); + _vector_null(r->s2); + _vector_i_sub_assign(r->s2, r->s0); + + _vector_i_sub(psi,s->s1,s->s3); + + _su3_multiply(chi,(*u),psi); + + _complex_times_vector(r->s1, phase_3, chi); + _vector_null(r->s3); + _vector_i_add_assign(r->s3, r->s1); + + return; +} + +/* direction -z */ +void boundary_D_7(spinor * const r, spinor * const s, su3 * restrict u) { + + static su3_vector chi, psi; + + _vector_i_sub(psi,s->s0,s->s2); + + _su3_inverse_multiply(chi,(*u),psi); + + _complexcjg_times_vector(r->s0, phase_3, chi); + _vector_null(r->s2); + _vector_i_add_assign(r->s2, r->s0); + + _vector_i_add(psi,s->s1,s->s3); + + _su3_inverse_multiply(chi,(*u),psi); + + _complexcjg_times_vector(r->s1, phase_3, chi); + _vector_null(r->s3); + _vector_i_sub_assign(r->s3, r->s1); + + return; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/D_psi.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/D_psi.h new file mode 100644 index 0000000000000000000000000000000000000000..a5e802c076c3ae9e39ef477f1ff4e38d4e152ac5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/D_psi.h @@ -0,0 +1,40 @@ +/*********************************************************************** + * + * Copyright (C) 2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _D_PSI_H +#define _D_PSI_H + +#include "block.h" + +void D_psi(spinor * const P, spinor * const Q); +void D_psi_prec(spinor * const P, spinor * const Q); +void Block_D_psi(block * blk, spinor * const rr, spinor * const s); +void Block_H_psi(block * blk, spinor * const rr, spinor * const s, const int eo); + +void boundary_D_0(spinor * const r, spinor * const s, su3 *u); +void boundary_D_1(spinor * const r, spinor * const s, su3 *u); +void boundary_D_2(spinor * const r, spinor * const s, su3 *u); +void boundary_D_3(spinor * const r, spinor * const s, su3 *u); +void boundary_D_4(spinor * const r, spinor * const s, su3 *u); +void boundary_D_5(spinor * const r, spinor * const s, su3 *u); +void boundary_D_6(spinor * const r, spinor * const s, su3 *u); +void boundary_D_7(spinor * const r, spinor * const s, su3 *u); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/D_psi_32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/D_psi_32.c new file mode 100644 index 0000000000000000000000000000000000000000..e8efcdc6ade475083fcac214cb70154d77f9bb3e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/D_psi_32.c @@ -0,0 +1,407 @@ +/*********************************************************************** + * + * Copyright (C) 2001 Martin Luescher + * original code + * changed and extended for twisted mass 2002 Andrea Shindler + * 2007,2008 Carsten Urbach + * + * 32 bit version 2015 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * Action of a Dirac operator D (Wilson or twisted) on a given spinor field + * + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif + +// work-around for missing single precision implementation of inline SSE +#ifdef SSE +#define REDEFSSE +#undef SSE +#endif + +#ifdef SSE2 +#define REDEFSSE2 +#undef SSE2 +#endif + +#ifdef SSE3 +#define REDEFSSE3 +#undef SSE3 +#endif + +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "boundary.h" +#ifdef MPI +# include "xchange/xchange.h" +#endif +#include "update_backward_gauge.h" +#include "operator/D_psi_32.h" + + + +static inline void p0add32(spinor32 * restrict const tmpr , spinor32 const * restrict const s, + su3_32 const * restrict const u, const _Complex float phase) { + +#ifdef OMP +#define static +#endif + static su3_vector32 chi, psi; +#ifdef OMP +#undef static +#endif + + _vector_add(psi,s->s0, s->s2); + _su3_multiply(chi, (*u), psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s0, psi); + _vector_add_assign(tmpr->s2, psi); + + _vector_add(psi, s->s1, s->s3); + _su3_multiply(chi, (*u), psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s1, psi); + _vector_add_assign(tmpr->s3, psi); + + return; +} + + +static inline void m0add32(spinor32 * restrict const tmpr, spinor32 const * restrict const s, + su3_32 const * restrict const u, const _Complex float phase) { +#ifdef OMP +#define static +#endif + static su3_vector32 chi, psi; +#ifdef OMP +#undef static +#endif + + _vector_sub(psi, s->s0, s->s2); + 
_su3_inverse_multiply(chi, (*u), psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s0, psi); + _vector_sub_assign(tmpr->s2, psi); + + _vector_sub(psi, s->s1, s->s3); + _su3_inverse_multiply(chi, (*u), psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s1, psi); + _vector_sub_assign(tmpr->s3, psi); + + return; +} + +static inline void p1add32(spinor32 * restrict const tmpr, spinor32 const * restrict const s, + su3_32 const * restrict const u, const _Complex float phase) { +#ifdef OMP +#define static +#endif + static su3_vector32 chi, psi; +#ifdef OMP +#undef static +#endif + + _vector_i_add(psi,s->s0,s->s3); + _su3_multiply(chi,(*u),psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s0, psi); + _vector_i_sub_assign(tmpr->s3, psi); + + _vector_i_add(psi, s->s1, s->s2); + _su3_multiply(chi, (*u), psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s1, psi); + _vector_i_sub_assign(tmpr->s2, psi); + + return; +} + +static inline void m1add32(spinor32 * restrict const tmpr, spinor32 const * restrict const s, + su3_32 const * restrict const u, const _Complex float phase) { +#ifdef OMP +#define static +#endif + static su3_vector32 chi, psi; +#ifdef OMP +#undef static +#endif + + _vector_i_sub(psi,s->s0, s->s3); + _su3_inverse_multiply(chi,(*u), psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s0, psi); + _vector_i_add_assign(tmpr->s3, psi); + + _vector_i_sub(psi, s->s1, s->s2); + _su3_inverse_multiply(chi, (*u), psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s1, psi); + _vector_i_add_assign(tmpr->s2, psi); + + return; +} + +static inline void p2add32(spinor32 * restrict const tmpr, spinor32 const * restrict const s, + su3_32 const * restrict const u, const _Complex float phase) { +#ifdef OMP +#define static +#endif + static su3_vector32 chi, psi; +#ifdef OMP +#undef static +#endif + + 
_vector_add(psi,s->s0,s->s3); + _su3_multiply(chi, (*u), psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s0, psi); + _vector_add_assign(tmpr->s3, psi); + + _vector_sub(psi,s->s1,s->s2); + _su3_multiply(chi, (*u), psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s1, psi); + _vector_sub_assign(tmpr->s2, psi); + + + return; +} + +static inline void m2add32(spinor32 * restrict const tmpr, spinor32 const * restrict const s, + su3_32 const * restrict const u, const _Complex float phase) { +#ifdef OMP +#define static +#endif + static su3_vector32 chi, psi; +#ifdef OMP +#undef static +#endif + + _vector_sub(psi, s->s0, s->s3); + _su3_inverse_multiply(chi, (*u), psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s0, psi); + _vector_sub_assign(tmpr->s3, psi); + + _vector_add(psi, s->s1, s->s2); + _su3_inverse_multiply(chi, (*u),psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s1, psi); + _vector_add_assign(tmpr->s2, psi); + + return; +} + +static inline void p3add32(spinor32 * restrict const tmpr, spinor32 const * restrict const s, + su3_32 const * restrict const u, const _Complex float phase) { +#ifdef OMP +#define static +#endif + static su3_vector32 chi, psi; +#ifdef OMP +#undef static +#endif + + _vector_i_add(psi, s->s0, s->s2); + _su3_multiply(chi, (*u), psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s0, psi); + _vector_i_sub_assign(tmpr->s2, psi); + + _vector_i_sub(psi,s->s1, s->s3); + _su3_multiply(chi, (*u), psi); + + _complex_times_vector(psi, phase, chi); + _vector_add_assign(tmpr->s1, psi); + _vector_i_add_assign(tmpr->s3, psi); + + return; +} + +static inline void m3addandstore32(spinor32 * restrict const r, spinor32 const * restrict const s, + su3_32 const * restrict const u, const _Complex float phase, + spinor32 const * restrict const tmpr) { +#ifdef OMP +#define static +#endif + static su3_vector32 chi, psi; 
+#ifdef OMP +#undef static +#endif + + _vector_i_sub(psi,s->s0, s->s2); + _su3_inverse_multiply(chi, (*u), psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add(r->s0, tmpr->s0, psi); + _vector_i_add(r->s2, tmpr->s2, psi); + + _vector_i_add(psi, s->s1, s->s3); + _su3_inverse_multiply(chi, (*u), psi); + + _complexcjg_times_vector(psi, phase, chi); + _vector_add(r->s1, tmpr->s1, psi); + _vector_i_sub(r->s3, tmpr->s3, psi); + + return; +} + + + + +void D_psi_32(spinor32 * const P, spinor32 * const Q){ + if(P==Q){ + printf("Error in D_psi (operator.c):\n"); + printf("Arguments must be different spinor fields\n"); + printf("Program aborted\n"); + exit(1); + } +//convert phases to float locally +_Complex float ALIGN32 phase_0_32 = (_Complex float) phase_0; +_Complex float ALIGN32 phase_1_32 = (_Complex float) phase_1; +_Complex float ALIGN32 phase_2_32 = (_Complex float) phase_2; +_Complex float ALIGN32 phase_3_32 = (_Complex float) phase_3; + +#ifdef _GAUGE_COPY + if(g_update_gauge_copy_32) { + update_backward_gauge_32(g_gauge_field_32); + } +#endif +# if defined MPI + xchange_lexicfield32(Q); +# endif + +#ifdef OMP +#pragma omp parallel + { +#endif + + int ix,iy; + su3_32 * restrict up,* restrict um; + spinor32 * restrict rr; + spinor32 const * restrict s; + spinor32 const * restrict sp; + spinor32 const * restrict sm; + _Complex float rho1, rho2; + spinor32 tmpr; + + rho1 = 1.f + (float) g_mu * I; + rho2 = conj(rho1); + + /************************ loop over all lattice sites *************************/ + +#ifdef OMP +#pragma omp for +#endif + for (ix=0;ixs0); + _complex_times_vector(tmpr.s1, rho1, s->s1); + _complex_times_vector(tmpr.s2, rho2, s->s2); + _complex_times_vector(tmpr.s3, rho2, s->s3); + + /******************************* direction +0 *********************************/ + iy=g_iup[ix][0]; + sp = (spinor32 *) Q +iy; + up=&g_gauge_field_32[ix][0]; + p0add32(&tmpr, sp, up, phase_0_32); + + /******************************* direction -0 
*********************************/ + iy=g_idn[ix][0]; + sm = (spinor32 *) Q +iy; + um=&g_gauge_field_32[iy][0]; + m0add32(&tmpr, sm, um, phase_0_32); + + /******************************* direction +1 *********************************/ + iy=g_iup[ix][1]; + sp = (spinor32 *) Q +iy; + up=&g_gauge_field_32[ix][1]; + p1add32(&tmpr, sp, up, phase_1_32); + + /******************************* direction -1 *********************************/ + iy=g_idn[ix][1]; + sm = (spinor32 *) Q +iy; + um=&g_gauge_field_32[iy][1]; + m1add32(&tmpr, sm, um, phase_1_32); + + /******************************* direction +2 *********************************/ + iy=g_iup[ix][2]; + sp = (spinor32 *) Q +iy; + up=&g_gauge_field_32[ix][2]; + p2add32(&tmpr, sp, up, phase_2_32); + + /******************************* direction -2 *********************************/ + iy=g_idn[ix][2]; + sm = (spinor32 *) Q +iy; + um=&g_gauge_field_32[iy][2]; + m2add32(&tmpr, sm, um, phase_2_32); + + /******************************* direction +3 *********************************/ + iy=g_iup[ix][3]; + sp = (spinor32 *) Q +iy; + up=&g_gauge_field_32[ix][3]; + p3add32(&tmpr, sp, up, phase_3_32); + + /******************************* direction -3 *********************************/ + iy=g_idn[ix][3]; + sm = (spinor32 *) Q +iy; + um=&g_gauge_field_32[iy][3]; + m3addandstore32(rr, sm, um, phase_3_32, &tmpr); + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} + +#ifdef REDEFSSE +#undef REDEFSSE +#define SSE +#endif + +#ifdef REDEFSSE2 +#undef REDEFSSE2 +#define SSE2 +#endif + +#ifdef REDEFSSE3 +#undef REDEFSSE3 +#define SSE3 +#endif \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/D_psi_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/D_psi_32.h new file mode 100644 index 0000000000000000000000000000000000000000..bb72096d2d0c0e16ba79b08c1621b25c059d5863 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/D_psi_32.h @@ -0,0 +1,27 @@ 
+/*********************************************************************** + * + * Copyright (C) 2015 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _D_PSI32_H +#define _D_PSI32_H + + +void D_psi_32(spinor32 * const P, spinor32 * const Q); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Dov_proj.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Dov_proj.c new file mode 100644 index 0000000000000000000000000000000000000000..f13a2e193beeb4542e9b38f86ce1dcdc1a6eba35 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Dov_proj.c @@ -0,0 +1,66 @@ + +/********************************************************** + * + * Dov_proj_plus and Dov_proj_minus + * are the projections of Dov onto the + * positive and negative chiral sector, respectively + * + * Both need one work_field! 
+ * + * Author: Carsten Urbach + * Die Sep 21 15:21:33 CEST 2004 + * + **********************************************************/ + +#include +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "Dov_proj.h" +#include "gamma.h" +#include "Dov_psi.h" + + +void Dov_proj_plus(spinor * const R, spinor * const S) +{ + spinor *aux_ = NULL, *aux; + int N = VOLUMEPLUSRAND; + +#if ( defined SSE || defined SSE2 || defined SSE3) + aux_=calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + aux = (spinor *)(((unsigned long int)(aux_)+ALIGN_BASE)&~ALIGN_BASE); +#else + aux_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + aux = aux_; +#endif + + Proj(aux, S, N, _PLUS); + Dov_psi(R, aux); + Proj(R, R, N, _PLUS); + + free(aux_); +} + + +void Dov_proj_minus(spinor * const R, spinor * const S) +{ + spinor *aux_ = NULL, *aux; + int N = VOLUMEPLUSRAND; + +#if ( defined SSE || defined SSE2 || defined SSE3) + aux_=calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + aux = (spinor *)(((unsigned long int)(aux_)+ALIGN_BASE)&~ALIGN_BASE); +#else + aux_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + aux = aux_; +#endif + + Proj(aux, S, N, _MINUS); + Dov_psi(R, aux); + Proj(R, R, N, _MINUS); + + free(aux_); +} + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Dov_proj.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Dov_proj.h new file mode 100644 index 0000000000000000000000000000000000000000..b5d84fa15d5deb58c1f98fd6cb178939bcf91145 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Dov_proj.h @@ -0,0 +1,13 @@ + +#ifndef _DOV_PROJ_H +#define _DOV_PROJ_H + +#include "su3.h" + +#define _PLUS 0 +#define _MINUS 1 + +void Dov_proj_plus(spinor * const R, spinor * const S); +void Dov_proj_minus(spinor * const R, spinor * const S); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Dov_psi.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Dov_psi.c new file mode 100644 index 
0000000000000000000000000000000000000000..68231dbce1a39ba4669932ba13c3bcefd8ee0bb1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Dov_psi.c @@ -0,0 +1,493 @@ +/*********************************************************************** + * + * Copyright (C) 2003 Ines Wetzorke + * 2006 Urs Wenger + * 2004, 2009 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * Action of the overlap Dirac operator D on a given spinor field + * + * This software is distributed under the terms of the GNU General Public + * License (GPL) + * + * The externally accessible function is + * + * void Dov_psi(spinor * const P, spinor * const S) + * Action of the overlap operator Dov on a given spinor field + * Dov = (1+s-m0/2){1+gamma5 Q/sqrt(Q^2)} + m0 + * with Q = gamma5*(-(1+s)+D_W) + * + * void Qov_psi(spinor * const P, spinor * const S) + * Action of the hermitian overlap operator Dov on a given spinor field + * i.e. 
Qov = gamma_5 * Dov + * + *************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "D_psi.h" +#include "gamma.h" +#include "chebyshev_polynomial_nd.h" +#include "solver/eigenvalues.h" +#include "solver/sub_low_ev.h" +#include "Dov_psi.h" +#include "init/init.h" +#include "solver/dirac_operator_eigenvectors.h" + +void addproj_q_invsqrt(spinor * const Q, spinor * const P, const int n, const int N); +/* |R>=rnorm^2 Q^2 |S> */ +void norm_Q_sqr_psi(spinor * const R, spinor * const S, + const double rnorm); +/* void norm_Q_n_psi(spinor *R, spinor *S, double m, int n, double rnorm) */ +/* norm_Q_n_psi makes multiplication of any power of */ +/* Q== gamma5*D_W, initial vector S, final R, finally the */ +/* vector R is multiplied by a factor rnorm^n */ +/* |R>=rnorm^n Q^n |S> where m is a mass */ +void norm_Q_n_psi(spinor * const R, spinor * const S, + const int n, const double rnorm); +/* this is Q/sqrt(Q^2) */ +void Q_over_sqrt_Q_sqr(spinor * const R, double * const c, + const int n, spinor * const S, + const double rnorm, const double minev); + +double ov_s = 0.6; +double m_ov = 0.; +int ov_n_cheby=100; +double * ov_cheby_coef = NULL; +Dov_WS *dov_ws=NULL; + + +void Dov_psi_prec(spinor * const P, spinor * const S) { + /* todo: do preconditioning */ + spinorPrecWS *ws=(spinorPrecWS*)g_precWS; + static _Complex double alpha; + Dov_psi(P,S); + alpha = -1.0; + spinorPrecondition(P,P,ws,T,L,alpha,0,1); + +} + +void calculateOverlapPolynomial(){ + if(ov_cheby_coef != NULL) free(ov_cheby_coef); + ov_cheby_coef = (double*)malloc(ov_n_cheby*sizeof(double)); + chebyshev_coefs(ev_minev, 1., ov_cheby_coef, ov_n_cheby, -0.5); + printf("last chebycheff coefficients\n"); + for(int i = ov_n_cheby-3;in_spinors=7; + if(g_proc_id==0) printf("Initilizing Dov spinor workspace with %d 
spinors!!!\n",dov_ws->n_spinors); + allocate_spinor_field_array(&(dov_ws->dum_spinors),&(dov_ws->dum_spinors_membuf),VOLUMEPLUSRAND,dov_ws->n_spinors); + dov_ws->lock_map=malloc(sizeof(int)*dov_ws->n_spinors); + for(i = 0 ; i< dov_ws->n_spinors;i++) + dov_ws->lock_map[i]=0; +} + +void free_Dov_WS(){ + if(dov_ws!=NULL){ + free_spinor_field_array(&(dov_ws->dum_spinors_membuf)); + free(dov_ws->lock_map); + free(dov_ws); + dov_ws=NULL; + } +} + + +spinor * lock_Dov_WS_spinor(int num){ + + if(numn_spinors){ + if(dov_ws->lock_map[num]==0){ + dov_ws->lock_map[num]=1; + return dov_ws->dum_spinors[num]; + } else { + if(g_proc_id == 0) fprintf(stderr,"spinor %d locked already\n" , num+1); + return NULL; + } + } else { + if(g_proc_id == 0) fprintf(stderr,"Error number of spinor fields exceeded: adjust it to %d in Dov_psi.c !!!!\n" , num+1); + return NULL; + } + +} + +void unlock_Dov_WS_spinor(int num){ + if(numn_spinors){ + if(dov_ws->lock_map[num]==1){ + dov_ws->lock_map[num]=0; + } else { + if(g_proc_id == 0) fprintf(stderr,"spinor %d was not locked already (double unlock ?? )\n" , num+1); + } + } else { + if(g_proc_id == 0) fprintf(stderr,"Error number of spinor fields exceeded (in unlock ?? check your unlock indices against lock indices !!! ): adjust it to %d in Dov_psi.c !!!!\n" , num+1); + } + +} + +void Dov_psi(spinor * const P, spinor * const S) { + + double c0,c1; + spinor *s; + static int n_cheby = 0; + static int rec_coefs = 1; + + ov_s = 0.5*(1./g_kappa - 8.) 
- 1.; +/* printf("Degree of Polynomial set to %d\n", ov_n_cheby); */ + if(n_cheby != ov_n_cheby || rec_coefs) { + calculateOverlapPolynomial(); + n_cheby = ov_n_cheby; + rec_coefs = 0; + } + + if(dov_ws==NULL){ + init_Dov_WS(); + } + +/* s_ = calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); */ + +/* #if (defined SSE3 || defined SSE2 || defined SSE) */ +/* s = (spinor*)(((unsigned long int)(s_)+ALIGN_BASE)&~ALIGN_BASE); */ +/* #else */ +/* s = s_; */ +/* #endif */ + + s=lock_Dov_WS_spinor(0); + + /* here we do with M = 1 + s */ + /* M + m_ov/2 + (M - m_ov/2)\gamma_5 sign(Q(-M)) */ + c0 = -(1.0 + ov_s - 0.5*m_ov); + c1 = -(1.0 + ov_s + 0.5*m_ov); + + Q_over_sqrt_Q_sqr(s, ov_cheby_coef, ov_n_cheby, S, ev_qnorm, ev_minev); + gamma5(s, s, VOLUME); + assign_mul_add_mul_r(s, S, c0, c1, VOLUME); + assign(P, s, VOLUME); + +/* free(s_); */ + unlock_Dov_WS_spinor(0); + return; +} + +void Qov_psi(spinor * const P, spinor * const S) { + Dov_psi(P, S); + gamma5(P, P, VOLUME); + return; +} + +void Qov_sq_psi(spinor * const P, spinor * const S) { + Dov_psi(g_spinor_field[DUM_MATRIX], S); + gamma5(g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX], VOLUME); + Dov_psi(P,g_spinor_field[DUM_MATRIX]); + gamma5(P,P, VOLUME); + + return; +} + +void Qov_sq_psi_prec(spinor * const P, spinor * const S) { + + + spinorPrecWS *ws=(spinorPrecWS*)g_precWS; + static _Complex double alpha = 0.0; + + alpha = ws->precExpo[0]; + spinorPrecondition(P,S,ws,T,L,alpha,0,1); + + + Dov_psi(g_spinor_field[DUM_MATRIX], P); + gamma5(P, g_spinor_field[DUM_MATRIX], VOLUME); + + alpha = ws->precExpo[1]; + spinorPrecondition(P,P,ws,T,L,alpha,0,1); + + Dov_psi(g_spinor_field[DUM_MATRIX], P); + gamma5(P, g_spinor_field[DUM_MATRIX], VOLUME); + + alpha = ws->precExpo[2]; + spinorPrecondition(P,P,ws,T,L,alpha,0,1); + + return; +} + + +void addproj_q_invsqrt(spinor * const Q, spinor * const P, const int n, const int N) { + + int j; + spinor *aux; + _Complex double cnorm, lambda; + static double save_ev[2]={-1.,-1.}; + 
static int * ev_sign = NULL; + static int save_n = -1; /* length of ev_sign as last allocated */ + + if(n <= 0) return; /* no eigenvectors to project on, nothing to add */ + + /* Recompute the cached signs when there is no cache yet, when n changed (ev_sign would have the wrong length), or when either of the two tracked eigenvalues changed. */ + if(ev_sign == NULL || n != save_n || eigenvls[0] != save_ev[0] || eigenvls[1] != save_ev[1] ) { + if(g_proc_id == 0 && g_debug_level > 1) { + printf("# Recomputing eigenvalue signs!\n"); + fflush(stdout); + } + for(j = 0; j < 2; j++) { + save_ev[j] = eigenvls[j]; + } + free(ev_sign); + ev_sign = (int*) malloc(n * sizeof(int)); + save_n = n; + + aux=lock_Dov_WS_spinor(1); + + for(j=0; j < n; j++) { + D_psi(aux, &(eigenvectors[j*evlength])); + gamma5(aux, aux, N); + + lambda = scalar_prod(&(eigenvectors[j*evlength]), aux, N, 1); + if (creal(lambda) < 0) { + ev_sign[j] = -1; + } + else { + ev_sign[j] = 1; + } + } + + unlock_Dov_WS_spinor(1); +/* free(aux_); */ + } + + /* Q += sign_j * (v_j, P) * v_j for every stored eigenvector v_j */ + for(j = 0; j < n; j++) { + cnorm = scalar_prod(&(eigenvectors[j*evlength]), P, N, 1); + + cnorm *= ev_sign[j]; + + assign_add_mul(Q, &(eigenvectors[j*evlength]), cnorm, N); + } + return; +} + + +/* |R>=rnorm^2 Q^2 |S> */ +void norm_Q_sqr_psi(spinor * const R, spinor * const S, + const double rnorm) { + + spinor *aux; + aux=lock_Dov_WS_spinor(1); + + /* Term -1-s is done in D_psi! does this comment make sense for HMC? */ + /* no, it doesn't, we do have to work on this */ + /* here we need to set kappa = 1./(2 (-1-s) + 8) */ + D_psi(R, S); + gamma5(aux, R, VOLUME); + D_psi(R, aux); + gamma5(R, R, VOLUME); + mul_r(R, rnorm*rnorm, R, VOLUME); + + unlock_Dov_WS_spinor(1); + return; +} + +/* void norm_Q_n_psi(spinor *R, spinor *S, double m, int n, double rnorm) */ +/* norm_Q_n_psi makes multiplication of any power of */ +/* Q== gamma5*D_W, initial vector S, final R, finally the */ +/* vector R is multiplied by a factor rnorm^n */ +/* |R>=rnorm^n Q^n |S> */ +void norm_Q_n_psi(spinor * const R, spinor * const S, + const int n, const double rnorm) { + + int i; + double npar = 1.; + spinor *aux; + + aux=lock_Dov_WS_spinor(1); + + assign(aux, S, VOLUME); + + + for(i=0; i < n; i++){ + D_psi(R, aux); + /* Term -1-s is done in D_psi! does this comment make sense for HMC? 
*/ + gamma5(aux, R, VOLUME); + npar *= rnorm; + } + mul_r(R, npar, aux, VOLUME); + unlock_Dov_WS_spinor(1); + return; +} + +void Q_over_sqrt_Q_sqr(spinor * const R, double * const c, + const int n, spinor * const S, + const double rnorm, const double minev) { + + int j; + double fact1, fact2, temp1, temp2, temp3, temp4, maxev, tnorm; + spinor *sv, *d, *dd, *aux, *aux3; + double ap_eps_sq = 0.; + + sv=lock_Dov_WS_spinor(2); + d=lock_Dov_WS_spinor(3); + dd=lock_Dov_WS_spinor(4); + aux=lock_Dov_WS_spinor(5); + aux3=lock_Dov_WS_spinor(6); + + + eigenvalues_for_cg_computed = no_eigenvalues - 1; + if(eigenvalues_for_cg_computed < 0) eigenvalues_for_cg_computed = 0; + maxev=1.0; + + fact1=4/(maxev-minev); + fact2=-2*(maxev+minev)/(maxev-minev); + + zero_spinor_field(d, VOLUME); + zero_spinor_field(dd, VOLUME); + + if(1) assign_sub_lowest_eigenvalues(aux3, S, no_eigenvalues-1, VOLUME); + else assign(aux3, S, VOLUME); + + /* Check whether switch for adaptive precision is on */ + /* this might be implemented again in the future */ + /* Use the 'old' version using Clenshaw's recursion for the + Chebysheff polynomial + */ + if(1) { + for (j = n-1; j >= 1; j--) { + assign(sv, d, VOLUME); + + if ((j%10) == 0 ) { + assign_sub_lowest_eigenvalues(aux, d, no_eigenvalues-1, VOLUME); + } + else { + assign(aux, d, VOLUME); + } + + norm_Q_sqr_psi(R, aux, rnorm); + temp1=-1.0; + temp2=c[j]; + assign_mul_add_mul_add_mul_add_mul_r(d, R, dd, aux3, fact2, fact1, temp1, temp2, VOLUME); + assign(dd, sv, VOLUME); + } + + if(1) assign_sub_lowest_eigenvalues(R, d, no_eigenvalues-1, VOLUME); + else assign(R, d, VOLUME); + + norm_Q_sqr_psi(aux, R, rnorm); + temp1=-1.0; + temp2=c[0]/2.; + temp3=fact1/2.; + temp4=fact2/2.; + assign_mul_add_mul_add_mul_add_mul_r(aux, d, dd, aux3, temp3, temp4, temp1, temp2, VOLUME); + norm_Q_n_psi(R, aux, 1, rnorm); + } + else { + /* Use the adaptive precision version using the forward recursion + for the Chebysheff polynomial + */ + + /* d = T_0(Q^2) */ + assign(d, 
aux3, VOLUME); + /* dd = T_1(Q^2) */ + norm_Q_sqr_psi(dd, d, rnorm); + temp3 = fact1/2.; + temp4 = fact2/2.; + assign_mul_add_mul_r(dd, d, temp3, temp4, VOLUME); + /* r = c_1 T_1(Q^2) + 1./2 c_0 */ + temp1 = c[1]; + temp2 = c[0]/2.; + mul_add_mul_r(R, dd, d, temp1, temp2, VOLUME); + + temp1=-1.0; + for (j = 2; j <= n-1; j++) { + /* aux = T_j(Q^2) = 2 Q^2 T_{j-1}(Q^2) - T_{j-2}(Q^2) */ + norm_Q_sqr_psi(aux, dd, rnorm); + assign_mul_add_mul_add_mul_r(aux, dd, d, fact1, fact2, temp1, VOLUME); + /* r = r + c_j T_j(Q^2) */ + temp2 = c[j]; + assign_add_mul_r(R, aux, temp2, VOLUME); + /* The stoppping criterio tnorm = |T_j(Q^2)| */ + tnorm = square_norm(aux, VOLUME, 1) * temp2 * temp2; + + /* + auxnorm=square_norm(R); + if(g_proc_id == g_stdio_proc){printf("j= %d\t|c T|^2= %g\t c_j= %g\t|r|^2= %g\n",j,tnorm,temp2,auxnorm); fflush( stdout);}; + */ + + if(tnorm < ap_eps_sq) break; + /* d = T_{j-1}(Q^2) */ + assign(d, dd, VOLUME); + /* dd = T_{j}(Q^2) */ + assign(dd, aux, VOLUME); + } + if(g_proc_id == g_stdio_proc && g_debug_level > 0) { + printf("Order of Chebysheff approximation = %d\n",j); + fflush( stdout); + } + + /* r = Q r */ + assign(aux, R, VOLUME); + norm_Q_n_psi(R, aux, 1, rnorm); + + } + /* add in piece from projected subspace */ + addproj_q_invsqrt(R, S, no_eigenvalues-1, VOLUME); + + unlock_Dov_WS_spinor(2); + unlock_Dov_WS_spinor(3); + unlock_Dov_WS_spinor(4); + unlock_Dov_WS_spinor(5); + unlock_Dov_WS_spinor(6); + return; +} + +void CheckApproximation(spinor * const P, spinor * const S) { + + spinor *s, *s_; + static int n_cheby = 0; + static int rec_coefs = 1; + + ov_s = 0.5*(1./g_kappa - 8.) 
- 1.; + + /* (re)build the Chebyshev coefficients on first use or when the requested degree changed */ + if(n_cheby != ov_n_cheby || rec_coefs) { + calculateOverlapPolynomial(); + n_cheby = ov_n_cheby; + rec_coefs = 0; + } + + s_ = calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + +#if (defined SSE3 || defined SSE2 || defined SSE) + s = (spinor*)(((unsigned long int)(s_)+ALIGN_BASE)&~ALIGN_BASE); +#else + s = s_; +#endif + + + + Q_over_sqrt_Q_sqr(s, ov_cheby_coef, ov_n_cheby, S, ev_qnorm, ev_minev); + Q_over_sqrt_Q_sqr(P, ov_cheby_coef, ov_n_cheby, s, ev_qnorm, ev_minev); + + + free(s_); /* free the pointer calloc returned; s may be an aligned offset into that buffer */ + return; +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Dov_psi.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Dov_psi.h new file mode 100644 index 0000000000000000000000000000000000000000..d3b3a819bd634ec211c65474b161c014db3e8e4d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Dov_psi.h @@ -0,0 +1,76 @@ +/*********************************************************************** + * + * Copyright (C) 2003 Ines Wetzorke + * 2009 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. 
+ * + * Action of the overlap Dirac operator D on a given spinor field + * + * This software is distributed under the terms of the GNU General Public + * License (GPL) + * + ************************************************************************/ + +#ifndef _DOV_PSI_H +#define _DOV_PSI_H + +#include "su3.h" + + +/** + * this is for bookkeeping of auxiliary spinors among the routines + * index + * Dov_psi: (1 auxiliary spinor ) 0 + * + * addproj_q_invsqrt: (1 auxiliary spinor ) 1 > these functions share their contingent + * norm_Q_sqr_psi : (1 auxiliary spinor ) 1 > as they are not called at the same time + * norm_Q_n_psi : (1 auxiliary spinor ) 1 > (if this is not the case anymore it will be detected by the code, see below) + * + * Q_over_sqrt_Q_sqr: (5 auxiliary spinors) 2-6 + * -------------------------- + * 7 auxiliary spinors 0-6 + * + * for additional safety the Dov_WS struct has a lock_map member keeping track of + * locks on specific spinors; if a function requests a spinor before it has been + * unlocked by a previously called function, lock_Dov_WS_spinor reports it and returns NULL + */ +typedef struct Dov_WS_{ + int n_spinors; /* number of workspace spinors (length of dum_spinors and lock_map) */ + spinor **dum_spinors; /* the workspace spinors, indexed as in the table above */ + spinor *dum_spinors_membuf; /* buffer handed to free_spinor_field_array on cleanup */ + int *lock_map; /* per spinor: 0 = free, 1 = locked */ +} Dov_WS; + +extern double m_ov; +extern int ov_n_cheby; +extern double * ov_cheby_coef; +extern Dov_WS *dov_ws; + +void Dov_psi(spinor * const, spinor * const); +void Dov_psi_prec(spinor * const, spinor * const); +void Qov_psi(spinor * const, spinor * const); +void Qov_sq_psi(spinor * const P, spinor * const S); +void Qov_sq_psi_prec(spinor * const P, spinor * const S); + +void Q_over_sqrt_Q_sqr(spinor * const R, double * const c, + const int n, spinor * const S, + const double rnorm, const double minev); + +void calculateOverlapPolynomial(); + +void free_Dov_WS(); +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix.c new file mode 100644 index 
0000000000000000000000000000000000000000..d75d2e78115cd38b45399724e98592036a339d49 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix.c @@ -0,0 +1,160 @@ +/********************************************************************** + * + * Copyright (C) 2001 Martin Luescher + * 2002 Martin Hasenbusch + * 2003, 2004, 2005, 2006, 2007, 2008 Carsten Urbach + * + * BG and halfspinor versions (C) 2007, 2008 Carsten Urbach + * + * This file is based on an implementation of the Dirac operator + * written by Martin Luescher, modified by Martin Hasenbusch in 2002 + * and modified and extended by Carsten Urbach from 2003-2008 + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. 
+ * + * + * Hopping_Matrix is the conventional Wilson + * hopping matrix + * + * \kappa\sum_{\pm\mu}(r+\gamma_\mu)U_{x,\mu} + * + * for ieo = 0 this is M_{eo}, for ieo = 1 + * it is M_{oe} + * + * l is the output, k the input field + * + * Structure of top level precompiler directives + * + * - defining _USE_HALFSPINOR implies that we also use + * a "gauge copy" + * + * - such that we are checking for the _USE_GAUGECOPY feature separately in the + * ELSE branch of the "if defined _USE_HALFSPINOR" statement + * + ****************************************************************/ + +#ifdef HAVE_CONFIG_H +# include <config.h> +#endif +#include <stdlib.h> +#include <stdio.h> +#ifdef OMP +#include <omp.h> +#endif +#include "global.h" +#include "su3.h" +#ifdef MPI +# include "xchange/xchange.h" +#endif +#include "boundary.h" +#include "init/init_dirac_halfspinor.h" +#include "update_backward_gauge.h" +#ifdef BGQ +# include"DirectPut.h" +#endif +#include "operator/Hopping_Matrix.h" + +#if defined _USE_HALFSPINOR +# include "operator/halfspinor_hopping.h" + +# if ((defined SSE2)||(defined SSE3)) +# include "sse.h" + +# elif (defined BGL && defined XLC) +# include "bgl.h" + +# elif (defined BGQ && defined XLC) +# include "bgq.h" +# include "bgq2.h" +# include "xlc_prefetch.h" + +# endif + +/* Halfspinor variant: the loop body is textually included from operator/halfspinor_body.c and, with OMP, runs inside one parallel region. */ +void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k) { + +#ifdef _GAUGE_COPY + if(g_update_gauge_copy) { + update_backward_gauge(g_gauge_field); + } +#endif + +#ifdef OMP +#pragma omp parallel + { + su3 * restrict u0 ALIGN; +#endif + +# include "operator/halfspinor_body.c" + +# ifdef OMP + } /* OpenMP closing brace */ +# endif + return; +} + +#else /* that's _USE_HALFSPINOR */ + +# if (((defined SSE2)||(defined SSE3)) && defined _USE_TSPLITPAR) +# include "sse.h" +# include "operator/hopping_sse_dbl.c" + +# else +# include "operator/hopping.h" +# if ((defined SSE2)||(defined SSE3)) +# include "sse.h" + +# elif (defined BGL && defined XLC) +# include "bgl.h" + +# elif (defined BGQ && defined XLC) +# include 
"bgq.h" +# include "bgq2.h" +# include "xlc_prefetch.h" + +# elif defined XLC +# include"xlc_prefetch.h" + +# endif +void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k) { +# ifdef XLC +# pragma disjoint(*l, *k) +# endif +# ifdef _GAUGE_COPY + if(g_update_gauge_copy) { + update_backward_gauge(g_gauge_field); + } +# endif + +# if (defined MPI && !(defined _NO_COMM)) + xchange_field(k, ieo); +# endif + +# ifdef OMP +# pragma omp parallel + { +# endif + +# include "operator/hopping_body_dbl.c" + +# ifdef OMP + } /* OpenMP closing brace */ +# endif + return; +} +# endif + +#endif /* thats _USE_HALFSPINOR */ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix.h new file mode 100644 index 0000000000000000000000000000000000000000..b2f72346adf9f366854f28aa879c36c0ac2224c2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix.h @@ -0,0 +1,31 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _HOPPING_MATRIX_H +# define _HOPPING_MATRIX_H + +# define EO 0 +# define OE 1 +# define OO 1 +# define EE 0 + +# include "su3.h" + +void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k); +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix_32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix_32.c new file mode 100644 index 0000000000000000000000000000000000000000..3e5a0d45ecd7dc0f0be12433d882a5f26735577a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix_32.c @@ -0,0 +1,145 @@ +/********************************************************************** + * Copyright (C) 2013 Florian Burger + * derived from Hopping_Matrix.c + * Copyright (C) 2001 Martin Luescher + * 2002 Martin Hasenbusch + * 2003, 2004, 2005, 2006, 2007, 2008 Carsten Urbach + * + * BG and halfspinor versions (C) 2007, 2008 Carsten Urbach + * + * This file is based on an implementation of the Dirac operator + * written by Martin Luescher, modified by Martin Hasenbusch in 2002 + * and modified and extended by Carsten Urbach from 2003-2008 + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + * + * Hopping_Matrix is the conventional Wilson + * hopping matrix + * + * \kappa\sum_{\pm\mu}(r+\gamma_\mu)U_{x,\mu} + * + * for ieo = 0 this is M_{eo}, for ieo = 1 + * it is M_{oe} + * + * l is the output, k the input field + * + * Structure of top level precompiler directives + * + * - defining _USE_HALFSPINOR implies that we also use + * a "gauge copy" + * + * - such that we are checking for the _USE_GAUGECOPY feature seperatly in the + * ELSE branch of the "if defined _USE_HALFSPINOR" statement + * + ****************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif + +// work-around for missing single precision implementation of inline SSE +#ifdef SSE +#define REDEFSSE +#undef SSE +#endif + +#ifdef SSE2 +#define REDEFSSE2 +#undef SSE2 +#endif + +#ifdef SSE3 +#define REDEFSSE3 +#undef SSE3 +#endif + +#include +#include +#ifdef OMP +#include +#endif +#include "global.h" +#include "su3.h" +#ifdef USE_MPI +# include "xchange/xchange.h" +#endif +#include "boundary.h" +#include "init/init_dirac_halfspinor.h" +#include "update_backward_gauge.h" +#ifdef SPI +# include"DirectPut.h" +#endif +#include "operator/Hopping_Matrix_32.h" + +#if defined _USE_HALFSPINOR +# include "operator/halfspinor_hopping_32.h" +#endif + + +#if (defined BGQ && defined XLC) +# include "bgq.h" +# include "bgq2.h" +# include "xlc_prefetch.h" +#endif + +void Hopping_Matrix_32_orphaned(const int ieo, spinor32 * const l, spinor32 * const k) { +#if defined _USE_HALFSPINOR + #ifdef _GAUGE_COPY + if(g_update_gauge_copy_32) { + update_backward_gauge_32_orphaned(g_gauge_field_32); + } + #endif + + #ifdef OMP + su3_32 * restrict u0 ALIGN32; + #endif + + # include "operator/halfspinor_body_32.c" +#else + printf("Error: Single precision Matrix only implemented with HALFSPINOR\n"); + exit(200); +#endif +} + + +void Hopping_Matrix_32(const int ieo, spinor32 * const l, spinor32 * const k) { +#ifdef OMP +#pragma omp parallel + { +#endif + 
Hopping_Matrix_32_orphaned(ieo,l,k); +#ifdef OMP + } +#endif + return; +} + +#ifdef REDEFSSE +#undef REDEFSSE +#define SSE +#endif + +#ifdef REDEFSSE2 +#undef REDEFSSE2 +#define SSE2 +#endif + +#ifdef REDEFSSE3 +#undef REDEFSSE3 +#define SSE3 +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix_32.h new file mode 100644 index 0000000000000000000000000000000000000000..610ac67a124409fb82b6a68be6124c0f83d00553 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix_32.h @@ -0,0 +1,33 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _HOPPING_MATRIX32_H +# define _HOPPING_MATRIX32_H + +# define EO 0 +# define OE 1 +# define OO 1 +# define EE 0 + +# include "su3.h" + +void Hopping_Matrix_32_orphaned(const int ieo, spinor32 * const l, spinor32 * const k); +void Hopping_Matrix_32(const int ieo, spinor32 * const l, spinor32 * const k); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix_32_nocom.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix_32_nocom.c new file mode 100644 index 0000000000000000000000000000000000000000..ae55dcedccc2ac8fd0e45d1379241c4e8fb43837 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix_32_nocom.c @@ -0,0 +1,54 @@ +/*********************************************************************** + * Copyright (C) 2013 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + + +#ifdef HAVE_CONFIG_H +# include +#endif + +// work-around for missing single precision implementation of inline SSE +#ifdef SSE +#define REDEFSSE +#undef SSE +#endif + +#ifdef SSE2 +#define REDEFSSE2 +#undef SSE2 +#endif + +#ifdef SSE3 +#define REDEFSSE3 +#undef SSE3 +#endif + +#include +#include +#include "global.h" +#include "xchange/xchange.h" +#include "su3.h" +#include "sse.h" +#include "boundary.h" +#include "operator/Hopping_Matrix_32.h" + +#define Hopping_Matrix_32 Hopping_Matrix_32_nocom +#define Hopping_Matrix_32_orphaned Hopping_Matrix_32_orphaned_nocom +#define _NO_COMM 1 + +#include "Hopping_Matrix_32.c" diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix_nocom.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix_nocom.c new file mode 100644 index 0000000000000000000000000000000000000000..028a266309c5abc09aeb7acb4ff82044bee8afac --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix_nocom.c @@ -0,0 +1,56 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +/****************************************** + * Hopping_Matrix is the conventional Wilson + * hopping matrix + * + * + * But the communication is left out by a + * dirty trick... + * + * \kappa\sum_{\pm\mu}(r+\gamma_\mu)U_{x,\mu} + * + * for ieo = 0 this is M_{eo}, for ieo = 1 + * it is M_{oe} + * + * l is the number of the output field + * k is the number of the input field + * + ******************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include "global.h" +#include "xchange/xchange.h" +#include "su3.h" +#include "sse.h" +#include "boundary.h" +#include "operator/Hopping_Matrix.h" + +#define Hopping_Matrix Hopping_Matrix_nocom +#define _NO_COMM 1 +#ifdef _KOJAK_INST +#undef _KOJAK_INST +#endif + +#include "Hopping_Matrix.c" diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix_nocom.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix_nocom.h new file mode 100644 index 0000000000000000000000000000000000000000..6599a8d1716c46b1ae1050e83aa484253833fb3f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Hopping_Matrix_nocom.h @@ -0,0 +1,27 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _HOPPING_MATRIX_NOCOM_H +#define _HOPPING_MATRIX_NOCOM_H + +#include "su3.h" + +void Hopping_Matrix_nocom(const int ieo, spinor * const l, spinor * const k); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..aaf64095fceff7a248b50a3a1eca630906d5d622 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Makefile @@ -0,0 +1,93 @@ + +srcdir = . +top_builddir = .. +abs_top_builddir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +top_srcdir = .. +abs_top_srcdir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +subdir = operator +builddir = . + +CFLAGS = -std=c99 -fopenmp -pedantic -Wall +DEPFLAGS = -MM +LDFLAGS = -L${HOME}/lib -L${top_builddir}/lib +DEFS = -DHAVE_CONFIG_H +OPTARGS = -O +SOPTARGS = -O + +AR = ar +RANLIB = ranlib +CC = mpicc +CCDEP = gcc +CCLD = ${CC} +LINK = ${CCLD} ${CFLAGS} ${LDFLAGS} ${OPTARGS} -o $@ +LEX = flex +AUTOCONF = autoconf +DEFS = -DHAVE_CONFIG_H + +INCLUDES = -I$(HOME)/include/ -I. 
-I${abs_top_builddir}/ -I${abs_top_srcdir}/ -I/include/ -I/include/ +LDADD = +#COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} +COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS} + +LIBRARIES = liboperator +liboperator_TARGETS = clover_accumulate_deriv clover_deriv clovertm_operators clover_leaf \ + tm_operators_nd tm_operators_nd_32 clover_term clover_invert clover_det \ + clovertm_operators_32 + +liboperator_STARGETS = Hopping_Matrix_nocom tm_times_Hopping_Matrix Hopping_Matrix Hopping_Matrix_32 Hopping_Matrix_32_nocom \ + tm_operators tm_operators_32 tm_sub_Hopping_Matrix D_psi D_psi_32 Dov_psi Dov_proj + +liboperator_OBJECTS = $(addsuffix .o, ${liboperator_TARGETS}) +liboperator_SOBJECTS = $(addsuffix .o, ${liboperator_STARGETS}) + +# default rule + +all: Makefile dep liboperator.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) -g +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) +profile all-profile: all + +#include dep rules +-include $(addsuffix .d,${liboperator_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects +${liboperator_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${OPTARGS} -c $< + +${liboperator_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${SOPTARGS} -c $< + +# rule to make liboperator +liboperator.a: ${liboperator_OBJECTS} ${liboperator_SOBJECTS} Makefile + @rm -f liboperator.a + @${AR} cru liboperator.a ${liboperator_OBJECTS} ${liboperator_SOBJECTS} + @$(RANLIB) liboperator.a + @cp liboperator.a ../lib/liboperator.a + +# rule to generate .d files +$(addsuffix .d, $(liboperator_TARGETS) ${liboperator_STARGETS}): %.d: ${srcdir}/%.c Makefile + @${CCDEP} ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies +dep: ${addsuffix .d, ${liboperator_TARGETS} ${liboperator_STARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f 
${$(addsuffix _OBJECTS, ${LIBRARIES})} ${$(addsuffix _SOBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/liboperator.a + +distclean: clean + rm -f Makefile + +.PHONY: all dep clean compile-clean distclean profile all-profile debug all-debug diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..f1da51804bc35a9edd955d662e8cafd38d9e0c2b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/Makefile.in @@ -0,0 +1,93 @@ + +srcdir = @srcdir@ +top_builddir = @top_builddir@ +abs_top_builddir = @abs_top_builddir@ +top_srcdir = @top_srcdir@ +abs_top_srcdir = @abs_top_srcdir@ +subdir = operator +builddir = @builddir@ + +CFLAGS = @CFLAGS@ +DEPFLAGS = @DEPFLAGS@ +LDFLAGS = @LDFLAGS@ +DEFS = @DEFS@ +OPTARGS = @OPTARGS@ +SOPTARGS = @SOPTARGS@ + +AR = @AR@ +RANLIB = @RANLIB@ +CC = @CC@ +CCDEP = @CCDEP@ +CCLD = ${CC} +LINK = ${CCLD} ${CFLAGS} ${LDFLAGS} ${OPTARGS} -o $@ +LEX = @LEX@ +AUTOCONF = @AUTOCONF@ +DEFS = @DEFS@ + +INCLUDES = @INCLUDES@ +LDADD = +#COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} +COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS} + +LIBRARIES = liboperator +liboperator_TARGETS = clover_accumulate_deriv clover_deriv clovertm_operators clover_leaf \ + tm_operators_nd tm_operators_nd_32 clover_term clover_invert clover_det \ + clovertm_operators_32 + +liboperator_STARGETS = Hopping_Matrix_nocom tm_times_Hopping_Matrix Hopping_Matrix Hopping_Matrix_32 Hopping_Matrix_32_nocom \ + tm_operators tm_operators_32 tm_sub_Hopping_Matrix D_psi D_psi_32 Dov_psi Dov_proj + +liboperator_OBJECTS = $(addsuffix .o, ${liboperator_TARGETS}) +liboperator_SOBJECTS = $(addsuffix .o, ${liboperator_STARGETS}) + +# default rule + +all: Makefile dep liboperator.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@ +debug all-debug: all + +# 
rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@ +profile all-profile: all + +#include dep rules for every object, so header changes also rebuild the STARGETS objects +-include $(addsuffix .d,${liboperator_TARGETS} ${liboperator_STARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects +${liboperator_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${OPTARGS} -c $< + +${liboperator_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${SOPTARGS} -c $< + +# rule to make liboperator +liboperator.a: ${liboperator_OBJECTS} ${liboperator_SOBJECTS} Makefile + @rm -f liboperator.a + @${AR} cru liboperator.a ${liboperator_OBJECTS} ${liboperator_SOBJECTS} + @$(RANLIB) liboperator.a + @cp liboperator.a ../lib/liboperator.a + +# rule to generate .d files +$(addsuffix .d, $(liboperator_TARGETS) ${liboperator_STARGETS}): %.d: ${srcdir}/%.c Makefile + @${CCDEP} ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies +dep: ${addsuffix .d, ${liboperator_TARGETS} ${liboperator_STARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} ${$(addsuffix _SOBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/liboperator.a + +distclean: clean + rm -f Makefile + +.PHONY: all dep clean compile-clean distclean profile all-profile debug all-debug diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_accumulate_deriv.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_accumulate_deriv.c new file mode 100644 index 0000000000000000000000000000000000000000..05eba8f753ca3e344d71bb1cfaeda1d46b4da1ad --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_accumulate_deriv.c @@ -0,0 +1,207 @@ +/*********************************************************************** + * + * Copyright (C) 1995 Ulli Wolff, Stefan Sint + * 2001,2005 Martin Hasenbusch + * 2011,2012 Carsten 
Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef SSE +# undef SSE +#endif +#ifdef SSE2 +# undef SSE2 +#endif +#ifdef SSE3 +# undef SSE3 +#endif + +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "su3.h" +#include "sse.h" +#include "su3adj.h" +#include "operator/clovertm_operators.h" +#include "operator/clover_leaf.h" + +// now we sum up all term from the clover term +// after sw_spinor and sw_deriv have been called + +void sw_all(hamiltonian_field_t * const hf, const double kappa, + const double c_sw) { +#ifdef OMP +#pragma omp parallel + { +#endif + + int k,l; + int x,xpk,xpl,xmk,xml,xpkml,xplmk,xmkml; + const su3 *w1,*w2,*w3,*w4; + double ka_csw_8 = kappa*c_sw/8.; + su3 ALIGN v1,v2,vv1,vv2,plaq; + su3 ALIGN vis[4][4]; + +#ifdef OMP +#pragma omp for +#endif + for(x = 0; x < VOLUME; x++) { + _minus_itimes_su3_plus_su3(vis[0][1],swm[x][1],swm[x][3]); + _su3_minus_su3(vis[0][2],swm[x][1],swm[x][3]); + _itimes_su3_minus_su3(vis[0][3],swm[x][2],swm[x][0]); + + _minus_itimes_su3_plus_su3(vis[2][3],swp[x][1],swp[x][3]); + _su3_minus_su3(vis[1][3],swp[x][3],swp[x][1]); + _itimes_su3_minus_su3(vis[1][2],swp[x][2],swp[x][0]); + + // project to the 
traceless anti-hermitian part + _su3_dagger(v1,vis[0][1]); + _su3_minus_su3(vis[0][1],vis[0][1],v1); + _su3_dagger(v1,vis[0][2]); + _su3_minus_su3(vis[0][2],vis[0][2],v1); + _su3_dagger(v1,vis[0][3]); + _su3_minus_su3(vis[0][3],vis[0][3],v1); + _su3_dagger(v1,vis[2][3]); + _su3_minus_su3(vis[2][3],vis[2][3],v1); + _su3_dagger(v1,vis[1][3]); + _su3_minus_su3(vis[1][3],vis[1][3],v1); + _su3_dagger(v1,vis[1][2]); + _su3_minus_su3(vis[1][2],vis[1][2],v1); + + for(k = 0; k < 4; k++) { + for(l = k+1; l < 4; l++) { + xpk=g_iup[x][k]; + xpl=g_iup[x][l]; + xmk=g_idn[x][k]; + xml=g_idn[x][l]; + xpkml=g_idn[xpk][l]; + xplmk=g_idn[xpl][k]; + xmkml=g_idn[xml][k]; + w1=&hf->gaugefield[x][k]; + w2=&hf->gaugefield[xpk][l]; + w3=&hf->gaugefield[xpl][k]; /*dag*/ + w4=&hf->gaugefield[x][l]; /*dag*/ + + _su3_times_su3(v1,*w1,*w2); + _su3_times_su3(v2,*w4,*w3); + _su3_times_su3d(plaq,v1,v2); + + _su3_times_su3(vv1,plaq,vis[k][l]); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[x][k], -2.*ka_csw_8, vv1); + + _su3d_times_su3(vv2,*w1,vv1); + _su3_times_su3(vv1,vv2,*w1); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[xpk][l], -2.*ka_csw_8, vv1); + + _su3_times_su3(vv2,vis[k][l],plaq); + _su3_dagger(vv1,vv2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[x][l], -2.*ka_csw_8, vv1); + + _su3d_times_su3(vv2,*w4,vv1); + _su3_times_su3(vv1,vv2,*w4); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[xpl][k], -2.*ka_csw_8, vv1); + + w1=&hf->gaugefield[x][l]; + w2=&hf->gaugefield[xplmk][k]; /*dag*/ + w3=&hf->gaugefield[xmk][l]; /*dag*/ + w4=&hf->gaugefield[xmk][k]; + _su3_times_su3d(v1,*w1,*w2); + _su3d_times_su3(v2,*w3,*w4); + _su3_times_su3(plaq,v1,v2); + + _su3_times_su3(vv1,plaq,vis[k][l]); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[x][l], -2.*ka_csw_8, vv1); + + _su3_dagger(vv1,v1); + _su3_times_su3d(vv2,vv1,vis[k][l]); + _su3_times_su3d(vv1,vv2,v2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[xplmk][k], -2.*ka_csw_8, vv1); + + 
_su3_times_su3(vv2,*w3,vv1); + _su3_times_su3d(vv1,vv2,*w3); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[xmk][l], -2.*ka_csw_8, vv1); + + _su3_dagger(vv2,vv1); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[xmk][k], -2.*ka_csw_8, vv2); + + w1=&hf->gaugefield[xmk][k]; /*dag*/ + w2=&hf->gaugefield[xmkml][l]; /*dag*/ + w3=&hf->gaugefield[xmkml][k]; + w4=&hf->gaugefield[xml][l]; + _su3_times_su3(v1,*w2,*w1); + _su3_times_su3(v2,*w3,*w4); + + _su3_times_su3d(vv1,*w1,vis[k][l]); + _su3_times_su3d(vv2,vv1,v2); + _su3_times_su3(vv1,vv2,*w2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[xmk][k], -2.*ka_csw_8, vv1); + + _su3_times_su3(vv2,*w2,vv1); + _su3_times_su3d(vv1,vv2,*w2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[xmkml][l], -2.*ka_csw_8, vv1); + + _su3_dagger(vv2,vv1); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[xmkml][k], -2.*ka_csw_8, vv2); + + _su3d_times_su3(vv1,*w3,vv2); + _su3_times_su3(vv2,vv1,*w3); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[xml][l], -2.*ka_csw_8, vv2); + + w1=&hf->gaugefield[xml][l]; /*dag*/ + w2=&hf->gaugefield[xml][k]; + w3=&hf->gaugefield[xpkml][l]; + w4=&hf->gaugefield[x][k]; /*dag*/ + _su3d_times_su3(v1,*w1,*w2); + _su3_times_su3d(v2,*w3,*w4); + + _su3_times_su3d(vv1,*w1,vis[k][l]); + _su3_times_su3d(vv2,vv1,v2); + _su3_times_su3d(vv1,vv2,*w2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[xml][l], -2.*ka_csw_8, vv1); + + _su3_dagger(vv2,vv1); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[xml][k], -2.*ka_csw_8, vv2); + + _su3d_times_su3(vv1,*w2,vv2); + _su3_times_su3(vv2,vv1,*w2); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[xpkml][l], -2.*ka_csw_8, vv2); + + _su3_dagger(vv2,v2); + _su3_times_su3d(vv1,vv2,v1); + _su3_times_su3d(vv2,vv1,vis[k][l]); + _trace_lambda_mul_add_assign_nonlocal(hf->derivative[x][k], -2.*ka_csw_8, vv2); + } + } + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_deriv.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_deriv.c new file mode 100644 index 0000000000000000000000000000000000000000..47f9f77de4621d8c366a831b17b1832b92875176 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_deriv.c @@ -0,0 +1,320 @@ +/*********************************************************************** + * + * Copyright (C) 1995 Ulli Wolff, Stefan Sint + * 2001,2005 Martin Hasenbusch + * 2011,2012 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef SSE +# undef SSE +#endif +#ifdef SSE2 +# undef SSE2 +#endif +#ifdef SSE3 +# undef SSE3 +#endif + +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "su3.h" +#include "sse.h" +#include "su3adj.h" +#include "operator/clovertm_operators.h" +#include "operator/clover_leaf.h" +#include "operator/clover_inline.h" + +// this is (-tr(1+T_ee(+mu)) -tr(1+T_ee(-mu))) +// (or T_oo of course) +// +// see equation (24) of hep-lat/9603008 +// +// or in more detail the insertion matrix at even sites +// is computed +// and stored in swm and swp, which are 4 su3 matrices +// each per site +// referring to upwards or downwards winding paths +// +// swm and swp represent 6x6 complex matrices +// (colour matrices) +// +// this function depends on mu + +void sw_deriv(const int ieo, const double mu) { +#ifdef OMP +#pragma omp parallel + { +#endif + int icy; + int ioff; + int x; + double fac = 1.0000; + su3 ALIGN lswp[4], lswm[4]; + + /* convention: Tr clover-leaf times insertion */ + if(ieo == 0) { + ioff=0; + } + else { + ioff = (VOLUME+RAND)/2; + } + if(fabs(mu) > 0.) 
fac = 0.5; + +#ifndef OMP + icy = 0; +#endif + +#ifdef OMP +#pragma omp for +#endif + for(int icx = ioff; icx < (VOLUME/2+ioff); icx++) { +#ifdef OMP + icy = icx - ioff; +#endif + x = g_eo2lexic[icx]; + /* compute the insertion matrix */ + _su3_plus_su3(lswp[0], sw_inv[icy][0][1], sw_inv[icy][0][0]); + _su3_plus_su3(lswp[1], sw_inv[icy][1][1], sw_inv[icy][1][0]); + _su3_plus_su3(lswp[2], sw_inv[icy][2][1], sw_inv[icy][2][0]); + _su3_plus_su3(lswp[3], sw_inv[icy][3][1], sw_inv[icy][3][0]); + + _su3_minus_su3(lswm[0], sw_inv[icy][0][1], sw_inv[icy][0][0]); + _su3_minus_su3(lswm[1], sw_inv[icy][1][1], sw_inv[icy][1][0]); + _su3_minus_su3(lswm[2], sw_inv[icy][2][1], sw_inv[icy][2][0]); + _su3_minus_su3(lswm[3], sw_inv[icy][3][1], sw_inv[icy][3][0]); + + /* add up to swm[] and swp[] */ + _su3_refac_acc(swm[x][0], fac, lswm[0]); + _su3_refac_acc(swm[x][1], fac, lswm[1]); + _su3_refac_acc(swm[x][2], fac, lswm[2]); + _su3_refac_acc(swm[x][3], fac, lswm[3]); + _su3_refac_acc(swp[x][0], fac, lswp[0]); + _su3_refac_acc(swp[x][1], fac, lswp[1]); + _su3_refac_acc(swp[x][2], fac, lswp[2]); + _su3_refac_acc(swp[x][3], fac, lswp[3]); + if(fabs(mu) > 0.) 
{ + /* compute the insertion matrix */ + _su3_plus_su3(lswp[0], sw_inv[icy+VOLUME/2][0][1], sw_inv[icy+VOLUME/2][0][0]); + _su3_plus_su3(lswp[1], sw_inv[icy+VOLUME/2][1][1], sw_inv[icy+VOLUME/2][1][0]); + _su3_plus_su3(lswp[2], sw_inv[icy+VOLUME/2][2][1], sw_inv[icy+VOLUME/2][2][0]); + _su3_plus_su3(lswp[3], sw_inv[icy+VOLUME/2][3][1], sw_inv[icy+VOLUME/2][3][0]); + + _su3_minus_su3(lswm[0], sw_inv[icy+VOLUME/2][0][1], sw_inv[icy+VOLUME/2][0][0]); + _su3_minus_su3(lswm[1], sw_inv[icy+VOLUME/2][1][1], sw_inv[icy+VOLUME/2][1][0]); + _su3_minus_su3(lswm[2], sw_inv[icy+VOLUME/2][2][1], sw_inv[icy+VOLUME/2][2][0]); + _su3_minus_su3(lswm[3], sw_inv[icy+VOLUME/2][3][1], sw_inv[icy+VOLUME/2][3][0]); + + /* add up to swm[] and swp[] */ + _su3_refac_acc(swm[x][0], fac, lswm[0]); + _su3_refac_acc(swm[x][1], fac, lswm[1]); + _su3_refac_acc(swm[x][2], fac, lswm[2]); + _su3_refac_acc(swm[x][3], fac, lswm[3]); + _su3_refac_acc(swp[x][0], fac, lswp[0]); + _su3_refac_acc(swp[x][1], fac, lswp[1]); + _su3_refac_acc(swp[x][2], fac, lswp[2]); + _su3_refac_acc(swp[x][3], fac, lswp[3]); + } +#ifndef OMP + ++icy; +#endif + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} + +void sw_deriv_nd(const int ieo) { +#ifdef OMP +#pragma omp parallel + { +#endif + int icy; + int ioff; + int x; + double fac = 1.0000; + su3 ALIGN lswp[4], lswm[4], v; + _Complex double ALIGN a0[6][6], a1[6][6], b[6][6], c[6][6]; + + /* convention: Tr clover-leaf times insertion */ + if(ieo == 0) { + ioff=0; + } + else { + ioff = (VOLUME+RAND)/2; + } + +#ifndef OMP + icy = 0; +#endif + +#ifdef OMP +#pragma omp for +#endif + for(int icx = ioff; icx < (VOLUME/2+ioff); icx++) { +#ifdef OMP + icy = icx - ioff; +#endif + x = g_eo2lexic[icx]; + /* compute the insertion matrix */ + populate_6x6_matrix(b, &sw[x][0][0], 0, 0); + populate_6x6_matrix(b, &sw[x][1][0], 0, 3); + _su3_dagger(v, sw[x][1][0]); + populate_6x6_matrix(b, &v, 3, 0); + populate_6x6_matrix(b, &sw[x][2][0], 3, 3); + + populate_6x6_matrix(c, 
&sw_inv[icy][0][0], 0, 0); + populate_6x6_matrix(c, &sw_inv[icy][1][0], 0, 3); + populate_6x6_matrix(c, &sw_inv[icy][2][0], 3, 3); + populate_6x6_matrix(c, &sw_inv[icy][3][0], 3, 0); + + mult_6x6(a0, b, c); + + populate_6x6_matrix(b, &sw[x][0][1], 0, 0); + populate_6x6_matrix(b, &sw[x][1][1], 0, 3); + _su3_dagger(v, sw[x][1][1]); + populate_6x6_matrix(b, &v, 3, 0); + populate_6x6_matrix(b, &sw[x][2][1], 3, 3); + + populate_6x6_matrix(c, &sw_inv[icy][0][1], 0, 0); + populate_6x6_matrix(c, &sw_inv[icy][1][1], 0, 3); + populate_6x6_matrix(c, &sw_inv[icy][2][1], 3, 3); + populate_6x6_matrix(c, &sw_inv[icy][3][1], 3, 0); + + mult_6x6(a1, b, c); + add_6x6(b, a1, a0); + get_3x3_block_matrix(&lswp[0], b, 0, 0); + get_3x3_block_matrix(&lswp[1], b, 0, 3); + get_3x3_block_matrix(&lswp[2], b, 3, 3); + get_3x3_block_matrix(&lswp[3], b, 3, 0); + + sub_6x6(b, a1, a0); + get_3x3_block_matrix(&lswm[0], b, 0, 0); + get_3x3_block_matrix(&lswm[1], b, 0, 3); + get_3x3_block_matrix(&lswm[2], b, 3, 3); + get_3x3_block_matrix(&lswm[3], b, 3, 0); + + /* add up to swm[] and swp[] */ + _su3_refac_acc(swm[x][0], fac, lswm[0]); + _su3_refac_acc(swm[x][1], fac, lswm[1]); + _su3_refac_acc(swm[x][2], fac, lswm[2]); + _su3_refac_acc(swm[x][3], fac, lswm[3]); + _su3_refac_acc(swp[x][0], fac, lswp[0]); + _su3_refac_acc(swp[x][1], fac, lswp[1]); + _su3_refac_acc(swp[x][2], fac, lswp[2]); + _su3_refac_acc(swp[x][3], fac, lswp[3]); +#ifndef OMP + ++icy; +#endif + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} + + +// direct product of Y_e(o) and X_e(o) in colour space +// with insertion matrix at site x +// see equation (22) of hep-lat/9603008 +// result is again stored in swm and swp +// includes a gamma5 multiplication for kk + +void sw_spinor(const int ieo, const spinor * const kk, const spinor * const ll, + const double fac) { +#ifdef OMP +#pragma omp parallel + { +#endif + + int ioff; + int icx; + int x; + const spinor *r,*s; + su3 ALIGN v0,v1,v2,v3; + su3 ALIGN u0,u1,u2,u3; + 
su3 ALIGN lswp[4],lswm[4]; + + if(ieo == 0) { + ioff=0; + } + else { + ioff=(VOLUME+RAND)/2; + } + /************************ loop over half of the lattice sites ***********/ + +#ifdef OMP +#pragma omp for +#endif + for(icx = ioff; icx < (VOLUME/2+ioff); icx++) { + x = g_eo2lexic[icx]; + r = kk + icx - ioff; + s = ll + icx - ioff; + + _vector_tensor_vector(v0,(*r).s0,(*s).s0); + _vector_tensor_vector(v1,(*r).s0,(*s).s1); + _vector_tensor_vector(v2,(*r).s1,(*s).s1); + _vector_tensor_vector(v3,(*r).s1,(*s).s0); + // mvector takes g5 into account + _mvector_tensor_vector(u0,(*r).s2,(*s).s2); + _mvector_tensor_vector(u1,(*r).s2,(*s).s3); + _mvector_tensor_vector(u2,(*r).s3,(*s).s3); + _mvector_tensor_vector(u3,(*r).s3,(*s).s2); + + /* compute the insertion matrix */ + _su3_plus_su3(lswp[0],u0,v0); + _su3_plus_su3(lswp[1],u1,v1); + _su3_plus_su3(lswp[2],u2,v2); + _su3_plus_su3(lswp[3],u3,v3); + + _su3_minus_su3(lswm[0],u0,v0); + _su3_minus_su3(lswm[1],u1,v1); + _su3_minus_su3(lswm[2],u2,v2); + _su3_minus_su3(lswm[3],u3,v3); + + /* add up to swm[0] and swp[0] */ + _su3_refac_acc(swm[x][0], fac, lswm[0]); + _su3_refac_acc(swm[x][1], fac, lswm[1]); + _su3_refac_acc(swm[x][2], fac, lswm[2]); + _su3_refac_acc(swm[x][3], fac, lswm[3]); + _su3_refac_acc(swp[x][0], fac, lswp[0]); + _su3_refac_acc(swp[x][1], fac, lswp[1]); + _su3_refac_acc(swp[x][2], fac, lswp[2]); + _su3_refac_acc(swp[x][3], fac, lswp[3]); + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_det.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_det.c new file mode 100644 index 0000000000000000000000000000000000000000..f66915f7fe91bbf92ae29080159cf0d4356cf371 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_det.c @@ -0,0 +1,279 @@ +/*********************************************************************** + * + * Copyright (C) 1995 Ulli Wolff, Stefan Sint + * 2001,2005 Martin 
Hasenbusch + * 2011,2012 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef SSE +# undef SSE +#endif +#ifdef SSE2 +# undef SSE2 +#endif +#ifdef SSE3 +# undef SSE3 +#endif + +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "su3.h" +#include "sse.h" +#include "su3adj.h" +#include "operator/clovertm_operators.h" +#include "operator/clover_leaf.h" +#include "operator/clover_inline.h" + +#define nm1 5 +void six_det(_Complex double* const rval, _Complex double a[6][6]) +{ + /* required for thread safety */ + _Complex double ALIGN sigma,z; + _Complex double ALIGN det; + double ALIGN p[nm1+1]; + double ALIGN s,q; + int i,j,k; + int ifail; + ifail=0; + /* compute the determinant:*/ + det = 1.0; + + for(k = 0; k < nm1; k++) { + s=0.0; + for(j = k+1; j <= nm1; ++j) { + s += conj(a[j][k]) * a[j][k]; + } + s = sqrt(1. 
+ s / (conj(a[k][k]) * a[k][k])); + sigma = s * a[k][k]; + + /* determinant */ + det *= sigma; + q = sigma * conj(sigma); + if (q < tiny_t) + ifail++; + + a[k][k] += sigma; + p[k] = sigma * conj(a[k][k]); + + /* reflect all columns to the right */ + for(j = k+1; j <= nm1; j++) { + z = 0.; + for(i = k; i <= nm1; i++) { + z += conj(a[i][k]) * a[i][j]; + } + z /= p[k]; + for(i = k; i <= nm1; i++) { + a[i][j] -= z * a[i][k]; + } + } + } + sigma = a[nm1][nm1]; + + /* determinant */ + det *= sigma; + q = conj(sigma) * sigma; + + if(q < tiny_t) { + ifail++; + } + if(g_proc_id == 0 && ifail > 0) { + fprintf(stderr, "Warning: ifail = %d > 0 in six_det\n", ifail); + } + *rval = det; +} + + +double sw_trace(const int ieo, const double mu) { + double ALIGN res = 0.0; +#ifdef MPI + double ALIGN mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); +#endif + + int i,x,ioff; + su3 ALIGN v; + _Complex double ALIGN a[6][6]; + double ALIGN tra; + double ALIGN ks,kc,tr,ts,tt; + _Complex double ALIGN det; + + ks = 0.0; + kc = 0.0; + + if(ieo==0) { + ioff=0; + } + else { + ioff=(VOLUME+RAND)/2; + } + +#ifdef OMP +#pragma omp for +#endif + for(int icx = ioff; icx < (VOLUME/2+ioff); icx++) { + x = g_eo2lexic[icx]; + for(i=0;i<2;i++) { + populate_6x6_matrix(a, &sw[x][0][i], 0, 0); + populate_6x6_matrix(a, &sw[x][1][i], 0, 3); + _su3_dagger(v, sw[x][1][i]); + populate_6x6_matrix(a, &v, 3, 0); + populate_6x6_matrix(a, &sw[x][2][i], 3, 3); + // we add the twisted mass term (the sign is of no importance + // because we compute the modulus squared and they are complex + // conjugates of each other) + add_tm(a, mu); + // and compute the tr log (or log det) + six_det(&det,a); + tra = log(conj(det)*det); + + tr=tra+kc; + ts=tr+ks; + tt=ts-ks; + ks=ts; + kc=tr-tt; + } + } + kc=ks+kc; + +#ifdef OMP + g_omp_acc_re[thread_num] = kc; + } /* OpenMP parallel closing brace */ + + for(int i = 0; i < omp_num_threads; ++i) { + res += g_omp_acc_re[i]; + } +#else + 
res=kc; +#endif + +#ifdef MPI + MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + return(mres); +#else + return(res); +#endif + +} + + +// This function computes the trace-log part of the clover term +// in case of even/odd preconditioning in the nd case +// +// it is expected that sw_term is called beforehand such that +// the array sw is populated properly +// +// it is tested to deliver bit-identical results to sw_trace +// if eps is set to zero + +double sw_trace_nd(const int ieo, const double mu, const double eps) { + double ALIGN res = 0.0; +#ifdef MPI + double ALIGN mres; +#endif + +#ifdef OMP +#pragma omp parallel + { + int thread_num = omp_get_thread_num(); +#endif + + int x,ioff; + su3 ALIGN v; + _Complex double ALIGN a[6][6], t[6][6]; + double ALIGN tra; + double ALIGN ks,kc,tr,ts,tt; + _Complex double ALIGN det[2]; + double mu_sq_m_eps_sq = mu*mu - eps*eps; + ks=0.0; + kc=0.0; + + if(ieo==0) { + ioff=0; + } + else { + ioff=(VOLUME+RAND)/2; + } + +#ifdef OMP +#pragma omp for +#endif + for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) { + x = g_eo2lexic[icx]; + for(unsigned int i = 0; i < 2; i++) { + populate_6x6_matrix(a, &sw[x][0][i], 0, 0); + populate_6x6_matrix(a, &sw[x][1][i], 0, 3); + _su3_dagger(v, sw[x][1][i]); + populate_6x6_matrix(a, &v, 3, 0); + populate_6x6_matrix(a, &sw[x][2][i], 3, 3); + + // square the matrix + six_mul_six(t,a,a); + // add the diagonal elements mu^2-eps^2 + add_shift_6x6(t,mu_sq_m_eps_sq); + + six_det(&det[i], t); + } + // and compute the tr log (or log det) + // taking into account that either determinant must be real + tra = log(creal(det[0])*creal(det[1])); + + tr=tra+kc; + ts=tr+ks; + tt=ts-ks; + ks=ts; + kc=tr-tt; + } + kc=ks+kc; + +#ifdef OMP + g_omp_acc_re[thread_num] = kc; + } /* OpenMP parallel closing brace */ + + for(int i = 0; i < omp_num_threads; ++i) { + res += g_omp_acc_re[i]; + } +#else + res=kc; +#endif + +#ifdef MPI + MPI_Allreduce(&res, &mres, 1, MPI_DOUBLE, MPI_SUM, 
MPI_COMM_WORLD); + return(mres); +#else + return(res); +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_inline.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_inline.h new file mode 100644 index 0000000000000000000000000000000000000000..d997be3568a3ff881523977df14246e9135a68fa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_inline.h @@ -0,0 +1,74 @@ +/*********************************************************************** + * + * Copyright (C) 2005 Martin Hasenbusch + * 2011 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +/* inline 6x6 <-> 3x3 block helpers used by sw_trace, sw_trace_nd, sw_invert, sw_invert_nd and sw_deriv_nd */ +static inline void populate_6x6_matrix(_Complex double a[6][6], const su3 * const C, const int row, const int col) { + a[0+row][0+col] = C->c00; + a[0+row][1+col] = C->c01; + a[0+row][2+col] = C->c02; + a[1+row][0+col] = C->c10; + a[1+row][1+col] = C->c11; + a[1+row][2+col] = C->c12; + a[2+row][0+col] = C->c20; + a[2+row][1+col] = C->c21; + a[2+row][2+col] = C->c22; + return; +} + +static inline void get_3x3_block_matrix(su3 * const C, _Complex double a[6][6], const int row, const int col) { + C->c00 = a[0+row][0+col]; + C->c01 = a[0+row][1+col]; + C->c02 = a[0+row][2+col]; + C->c10 = a[1+row][0+col]; + C->c11 = a[1+row][1+col]; + C->c12 = a[1+row][2+col]; + C->c20 = a[2+row][0+col]; + C->c21 = a[2+row][1+col]; + C->c22 = a[2+row][2+col]; + return; +} + +static inline void six_mul_six(_Complex double c[6][6], _Complex double a[6][6], _Complex double b[6][6]) { + for(unsigned int i = 0; i < 6; ++i) { + for(unsigned int j = 0; j < 6; ++j) { + c[i][j] = 0; + for(unsigned int k = 0; k < 6; ++k) { + c[i][j] += a[i][k] * b[k][j]; + } + } + } + return; +} + +static inline void add_tm(_Complex double a[6][6], const double mu) { + for(int i = 0; i < 6; i++) { + a[i][i] += I*mu; + } + return; +} + +static inline void add_shift_6x6(_Complex double a[6][6], const double mshift) { + for(int i = 0; i < 6; i++) { + a[i][i] += mshift; + } + return; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_invert.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_invert.c new file mode 100644 index 0000000000000000000000000000000000000000..b9a241bb6ad39bdabab6e0da4c02749a4091e7bd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_invert.c @@ -0,0 +1,319 @@ +/*********************************************************************** + * + * Copyright (C) 1995 Ulli 
Wolff, Stefan Sint + * 2001,2005 Martin Hasenbusch + * 2011,2012 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef SSE +# undef SSE +#endif +#ifdef SSE2 +# undef SSE2 +#endif +#ifdef SSE3 +# undef SSE3 +#endif + +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "su3.h" +#include "sse.h" +#include "su3adj.h" +#include "operator/clovertm_operators.h" +#include "operator/clover_leaf.h" +#include "operator/clover_inline.h" + +/* + !--------------------------------------------------------------! + ! The subroutine sw_invert is needed for the ! + ! even_odd preconditioned Dirac operator with SW improvement. ! + ! Details can be found in the notes sw.ps on tsun.desy.de ! + ! by P. Weisz and U. Wolff. ! + !--------------------------------------------------------------! + ! inversion in place of complex matrix a without pivoting ! + ! triangularization by householder reflections ! + ! inversion of triangular matrix ! + ! inverse reflections ! + !--------------------------------------------------------------! + ! a square matrix, dimensioned 0:n-1 ! + ! itrouble is counted up, when a dangerously small diagonal ! + ! 
element is encountered in the triangular matrix ! + ! has to be initialized outside ! + ! ! + ! Author: U. Wolff, adapted to fortran90 by S. Sint, 29/10/95 ! + !--------------------------------------------------------------! + ! ported to C by M.Hasenbusch Wed Oct 24 15:46:46 MEST 2001 ! + !______________________________________________________________! +*/ + + +/* six_invert and six_det are called from multiple threads, they are thus + * made thread-safe by removing the static keywords but they are NOT + * parallelised for OpenMP */ + +#define nm1 5 +void six_invert(int* ifail ,_Complex double a[6][6]) +{ + /* required for thread safety */ + _Complex double ALIGN d[nm1+1],u[nm1+1]; + _Complex double ALIGN sigma,z; + double ALIGN p[nm1+1]; + double ALIGN s,q; + int i,j,k; + *ifail=0; + for(k = 0; k < nm1; ++k) + { + s=0.0; + for(j = k+1; j <= nm1; ++j) + s += conj(a[j][k]) * a[j][k]; + s = sqrt(1. + s / (conj(a[k][k]) * a[k][k])); + sigma = s * a[k][k]; + + a[k][k] += sigma; + p[k] = conj(sigma) * a[k][k]; + q = conj(sigma) * sigma; + if (q < tiny_t) + (*ifail)++; + d[k] = -conj(sigma) / q; + + /* reflect all columns to the right */ + for(j = k+1; j <= nm1; ++j) + { + z = 0.0; + for(i = k; i <= nm1; ++i) + z += conj(a[i][k]) * a[i][j]; + z /= p[k]; + for(i = k; i <= nm1; ++i) + a[i][j] -= z * a[i][k]; + } + } + sigma = a[nm1][nm1]; + q = conj(sigma) * sigma; + if (q < tiny_t) + (*ifail)++; + d[nm1] = conj(sigma) / q; + + /* inversion of upper triangular matrix in place + (diagonal elements done already): */ + + for(k = nm1; k >= 0; k--) { + for(i = k-1; i >= 0;i--) { + z = 0.0; + for(j = i+1; j < k; j++) + z += a[i][j] * a[j][k]; + z += a[i][k] * d[k]; + a[i][k] = -z * d[i]; + } + } + /* execute reflections in reverse order from the right: */ + + a[nm1][nm1] = d[nm1]; + for(k = nm1-1; k >= 0; k--) + { + for(j=k;j<=nm1;j++) + u[j] = a[j][k]; + a[k][k] = d[k]; + for(j = k+1; j <= nm1; j++) + a[j][k] = 0.0; + for(i = 0; i <= nm1; i++) + { + z = 0.0; + for(j = k; j <= 
nm1; j++) + z += a[i][j] * u[j]; + z /= p[k]; /* normalization */ + + for(j = k; j <= nm1; j++) + a[i][j] -= conj(u[j]) * z; /* reflection */ + } + } +} + +// This function computes the inverse of +// (1 + T_ee \pm I\mu\gamma_5) +// +// + is stored in sw_inv[0-(VOLUME/2-1)] +// - is stored in sw_inv[VOLUME/2-(VOLUME-1)] + +void sw_invert(const int ieo, const double mu) { +#ifdef OMP +#pragma omp parallel + { +#endif + int icy; + int ioff, err=0; + int i, x; + su3 ALIGN v; + _Complex double ALIGN a[6][6]; + + if(ieo==0) { + ioff=0; + } + else { + ioff=(VOLUME+RAND)/2; + } + +#ifndef OMP + icy=0; +#endif + +#ifdef OMP +#pragma omp for +#endif + for(int icx = ioff; icx < (VOLUME/2+ioff); icx++) { +#ifdef OMP + icy = icx - ioff; +#endif + x = g_eo2lexic[icx]; + + for(i = 0; i < 2; i++) { + populate_6x6_matrix(a, &sw[x][0][i], 0, 0); + populate_6x6_matrix(a, &sw[x][1][i], 0, 3); + _su3_dagger(v, sw[x][1][i]); + populate_6x6_matrix(a, &v, 3, 0); + populate_6x6_matrix(a, &sw[x][2][i], 3, 3); + // we add the twisted mass term + if(i == 0) add_tm(a, +mu); + else add_tm(a, -mu); + // and invert the resulting matrix + + six_invert(&err,a); + // here we need to catch the error! + if(err > 0 && g_proc_id == 0) { + printf("# inversion failed in six_invert code %d\n", err); + err = 0; + } + + /* copy "a" back to sw_inv */ + get_3x3_block_matrix(&sw_inv[icy][0][i], a, 0, 0); + get_3x3_block_matrix(&sw_inv[icy][1][i], a, 0, 3); + get_3x3_block_matrix(&sw_inv[icy][2][i], a, 3, 3); + get_3x3_block_matrix(&sw_inv[icy][3][i], a, 3, 0); + } + + if(fabs(mu) > 0.) { + for(i = 0; i < 2; i++) { + populate_6x6_matrix(a, &sw[x][0][i], 0, 0); + populate_6x6_matrix(a, &sw[x][1][i], 0, 3); + _su3_dagger(v, sw[x][1][i]); + populate_6x6_matrix(a, &v, 3, 0); + populate_6x6_matrix(a, &sw[x][2][i], 3, 3); + + // we add the twisted mass term + if(i == 0) add_tm(a, -mu); + else add_tm(a, +mu); + // and invert the resulting matrix + six_invert(&err,a); + // here we need to catch the error! 
+ if(err > 0 && g_proc_id == 0) { + printf("# %d\n", err); + err = 0; + } + + /* copy "a" back to sw_inv */ + get_3x3_block_matrix(&sw_inv[icy+VOLUME/2][0][i], a, 0, 0); + get_3x3_block_matrix(&sw_inv[icy+VOLUME/2][1][i], a, 0, 3); + get_3x3_block_matrix(&sw_inv[icy+VOLUME/2][2][i], a, 3, 3); + get_3x3_block_matrix(&sw_inv[icy+VOLUME/2][3][i], a, 3, 0); + } + } +#ifndef OMP + ++icy; +#endif + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} + +// This function computes +// +// ((1+T)^2 + barmu^2 - bareps^2)^{-1} +// +// for all even x, +// which is stored in sw_inv[0-(VOLUME/2-1)] +// +// it is the complement of sw_invert for the +// non-degenerate case +// multiplication with +// (1+T - i\bar\mu\gamma_5\tau^3 + \bar\epsilon\tau^1) +// must be done elsewhere because of flavour structure + +void sw_invert_nd(const double mshift) { +#ifdef OMP +#pragma omp parallel + { +#endif + int err=0; + int i, x; + su3 ALIGN v; + _Complex double ALIGN a[6][6], b[6][6]; + +#ifdef OMP +#pragma omp for +#endif + for(int icx = 0; icx < (VOLUME/2); icx++) { + x = g_eo2lexic[icx]; + + for(i = 0; i < 2; i++) { + populate_6x6_matrix(a, &sw[x][0][i], 0, 0); + populate_6x6_matrix(a, &sw[x][1][i], 0, 3); + _su3_dagger(v, sw[x][1][i]); + populate_6x6_matrix(a, &v, 3, 0); + populate_6x6_matrix(a, &sw[x][2][i], 3, 3); + + // compute (1+T)^2 and store in b + mult_6x6(b, a, a); + // we add the mass shift term, which is a real number + add_shift_6x6(b, mshift); + // so b = (1+T)^2 + shift + // now invert this matrix + six_invert(&err, b); + // here we need to catch the error! 
+ if(err > 0 && g_proc_id == 0) { + printf("# inversion failed in six_invert_nd code %d\n", err); + err = 0; + } + + /* copy "a" back to sw_inv */ + get_3x3_block_matrix(&sw_inv[icx][0][i], b, 0, 0); + get_3x3_block_matrix(&sw_inv[icx][1][i], b, 0, 3); + get_3x3_block_matrix(&sw_inv[icx][2][i], b, 3, 3); + get_3x3_block_matrix(&sw_inv[icx][3][i], b, 3, 0); + } + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_leaf.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_leaf.c new file mode 100644 index 0000000000000000000000000000000000000000..41df662cbdea107274a97e8755c11523edb8aa3b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_leaf.c @@ -0,0 +1,143 @@ +/*********************************************************************** + * + * Copyright (C) 1995 Ulli Wolff, Stefan Sint + * 2001,2005 Martin Hasenbusch + * 2011,2012 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef SSE +# undef SSE +#endif +#ifdef SSE2 +# undef SSE2 +#endif +#ifdef SSE3 +# undef SSE3 +#endif + +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "su3.h" +#include "sse.h" +#include "su3adj.h" +#include "operator/clovertm_operators.h" +#include "operator/clover_leaf.h" + +const double tiny_t = 1.0e-20; + +su3 ** swm, ** swp; + +void mult_6x6(_Complex double a[6][6], _Complex double b[6][6], _Complex double d[6][6]) { + + for(int i = 0; i < 6; i++) { + for(int j = 0; j < 6; j++) { + a[i][j] = 0.; + for(int k = 0; k < 6; k++) { + a[i][j] += b[i][k] * d[k][j]; + } + } + } + return; +} + +void add_6x6(_Complex double a[6][6], _Complex double b[6][6], _Complex double d[6][6]) { + + for(int i = 0; i < 6; i++) { + for(int j = 0; j < 6; j++) { + a[i][j] = b[i][j] + d[i][j]; + } + } + return; +} + +void sub_6x6(_Complex double a[6][6], _Complex double b[6][6], _Complex double d[6][6]) { + + for(int i = 0; i < 6; i++) { + for(int j = 0; j < 6; j++) { + a[i][j] = b[i][j] - d[i][j]; + } + } + return; +} + +void copy_6x6(_Complex double a[6][6], const _Complex double b[6][6]) { + for(int i = 0; i < 6; i++) { + for(int j = 0; j < 6; j++) { + a[i][j] = b[i][j]; + } + } + return; +} + + + + + + + + +su3 * _swp; + +int init_swpm(const int V) { + int i=0; + static int swpm_init=0; + + if(!swpm_init) { + if((void*)(swp = (su3**)calloc(V, sizeof(su3*))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(1); + } + if((void*)(swm = (su3**)calloc(V, sizeof(su3*))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(1); + } + if((void*)(_swp = (su3*)calloc(2*4*V+1, sizeof(su3))) == NULL) { + printf ("malloc errno : %d\n",errno); + errno = 0; + return(2); + } +#if (defined SSE || defined SSE2 || defined SSE3) + swp[0] = 
(su3*)(((unsigned long int)(_swp)+ALIGN_BASE)&~ALIGN_BASE); +#else + swp[0] = _swp; +#endif + swm[0] = swp[0] + 4*V; + for(i = 1; i < V; i++){ + swp[i] = swp[i-1]+4; + swm[i] = swm[i-1]+4; + } + swpm_init = 1; + } + return(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_leaf.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_leaf.h new file mode 100644 index 0000000000000000000000000000000000000000..44db299cf93abd38ce796b3147d33495161f2a3a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_leaf.h @@ -0,0 +1,46 @@ +/*********************************************************************** + * + * Copyright (C) 2005 Martin Hasenbusch + * 2011 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _CLOVER_LEAF_H +#define _CLOVER_LEAF_H +#include "su3.h" +#include "hamiltonian_field.h" + +extern su3 ** swm, ** swp; +extern const double tiny_t; + +void sw_term(const su3 ** const gf, const double kappa, const double c_sw); +double sw_trace(const int ieo, const double mu); +double sw_trace_nd(const int ieo, const double mu, const double eps); +void sw_invert(const int ieo, const double mu); +void sw_invert_nd(const double mshift); +void sw_deriv(const int ieo, const double mu); +void sw_deriv_nd(const int ieo); +void sw_spinor(const int ieo, const spinor * const kk, const spinor * const ll, const double fac); +void sw_all(hamiltonian_field_t * const hf, const double kappa, const double c_sw); +int init_swpm(const int V); + +void mult_6x6(_Complex double a[6][6], _Complex double b[6][6], _Complex double d[6][6]); +void add_6x6(_Complex double a[6][6], _Complex double b[6][6], _Complex double d[6][6]); +void sub_6x6(_Complex double a[6][6], _Complex double b[6][6], _Complex double d[6][6]); +void copy_6x6(_Complex double a[6][6], const _Complex double b[6][6]); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_term.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_term.c new file mode 100644 index 0000000000000000000000000000000000000000..02273048dd06585ace63f4671d1ac5dfbb5afe03 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clover_term.c @@ -0,0 +1,204 @@ +/*********************************************************************** + * + * Copyright (C) 1995 Ulli Wolff, Stefan Sint + * 2001,2005 Martin Hasenbusch + * 2011,2012 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef SSE +# undef SSE +#endif +#ifdef SSE2 +# undef SSE2 +#endif +#ifdef SSE3 +# undef SSE3 +#endif + +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "su3.h" +#include "sse.h" +#include "su3adj.h" +#include "operator/clovertm_operators.h" +#include "operator/clover_leaf.h" + +// the clover term is written as +// +// 1 + T_{xa\alpha,yb\beta} +// = 1 + i csw kappa/2 sigma_munu^alphabeta F_munu^ab(x)delta_xy +// +// see hep-lat/9603008 for all glory details +// +// per site we have to store two six-by-six complex matrices. 
// As the off-diagonal 3x3 matrices are related to each other
// (NOTE(review): the original text says "inverse"; where the 6x6 matrix
// is assembled for inversion the lower block is filled via _su3_dagger,
// so presumably the hermitian conjugate is meant -- confirm),
// we get away with two times three 3x3 complex matrices
//
// these are stored in the array sw[VOLUME][3][2] of type su3,
// indexed as sw[x][a][b]
// where x is the space time index
// a runs from 0 to 2
// b runs from 0 to 1
// sw[x][0][0] is the upper diagonal 3x3 matrix
// sw[x][1][0] the upper off-diagonal 3x3 matrix
// sw[x][2][0] the lower diagonal 3x3 matrix
// the lower off-diagonal 3x3 matrix is derived from sw[x][1][0]
// (see the note above)
//
// identical convention for the second six-by-six matrix
// just with second index set to 1
//
// so the application of the clover term
// plus twisted mass term to a spinor would just be
//
// r_0 = sw[0][0] s_0 + sw[1][0] s_1 + i mu s_0
// r_1 = sw[1][0]^-1 s_0 + sw[2][0] s_1 + i mu s_1
// r_2 = sw[0][1] s_2 + sw[1][1] s_3 - i mu s_2
// r_3 = sw[1][1]^-1 s_2 + sw[2][1] s_3 - i mu s_3
//
// suppressing space-time indices
//
// sw_term fills sw[x][0..2][0..1] for every x in [0, VOLUME) from the
// gauge field gf; kappa and c_sw enter only through kappa*c_sw/8.

void sw_term(const su3 ** const gf, const double kappa, const double c_sw) {
#ifdef OMP
#pragma omp parallel
  {
#endif

    int k,l;
    int x,xpk,xpl,xmk,xml,xpkml,xplmk,xmkml;
    const su3 *w1,*w2,*w3,*w4;
    double ka_csw_8 = kappa*c_sw/8.;
    su3 ALIGN v1,v2,plaq;
    su3 ALIGN fkl[4][4];
    su3 ALIGN magnetic[4],electric[4];
    su3 ALIGN aux;


    /*  compute the clover-leaf */
    /*  l  __   __
          |  | |  |
          |__| |__|
           __   __
          |  | |  |
          |__| |__| k  */

#ifdef OMP
#pragma omp for
#endif
    for(x = 0; x < VOLUME; x++) {
      // only the pairs k < l are computed; fkl[k][l] = plaq - plaq^dagger
      for(k = 0; k < 4; k++) {
        for(l = k+1; l < 4; l++) {
          // neighbours of x in the (k,l) plane
          xpk=g_iup[x][k];
          xpl=g_iup[x][l];
          xmk=g_idn[x][k];
          xml=g_idn[x][l];
          xpkml=g_idn[xpk][l];
          xplmk=g_idn[xpl][k];
          xmkml=g_idn[xml][k];
          // first leaf: initialises plaq
          w1=&gf[x][k];
          w2=&gf[xpk][l];
          w3=&gf[xpl][k];
          w4=&gf[x][l];
          _su3_times_su3(v1,*w1,*w2);
          _su3_times_su3(v2,*w4,*w3);
          _su3_times_su3d(plaq,v1,v2);
          // second leaf: accumulated into plaq
          w1=&gf[x][l];
          w2=&gf[xplmk][k];
          w3=&gf[xmk][l];
          w4=&gf[xmk][k];
          _su3_times_su3d(v1,*w1,*w2);
          _su3d_times_su3(v2,*w3,*w4);
          _su3_times_su3_acc(plaq,v1,v2);
          // third leaf
          w1=&gf[xmk][k];
          w2=&gf[xmkml][l];
          w3=&gf[xmkml][k];
          w4=&gf[xml][l];
          _su3_times_su3(v1,*w2,*w1);
          _su3_times_su3(v2,*w3,*w4);
          _su3d_times_su3_acc(plaq,v1,v2);
          // fourth leaf
          w1=&gf[xml][l];
          w2=&gf[xml][k];
          w3=&gf[xpkml][l];
          w4=&gf[x][k];
          _su3d_times_su3(v1,*w1,*w2);
          _su3_times_su3d(v2,*w3,*w4);
          _su3_times_su3_acc(plaq,v1,v2);
          _su3_dagger(v2,plaq);
          _su3_minus_su3(fkl[k][l],plaq,v2);
        }
      }

      // this is the one in flavour and colour space
      // twisted mass term is treated in clover, sw_inv and
      // clover_gamma5 and the corresponding nd versions
      _su3_one(sw[x][0][0]);
      _su3_one(sw[x][2][0]);
      _su3_one(sw[x][0][1]);
      _su3_one(sw[x][2][1]);

      // electric[k] = fkl[0][k]; magnetic[] is built from the
      // remaining index pairs (index 0 of both arrays is unused)
      for(k = 1; k < 4; k++)
      {
        _su3_assign(electric[k], fkl[0][k]);
      }
      _su3_assign(magnetic[1], fkl[2][3]);
      _su3_minus_assign(magnetic[2], fkl[1][3]);
      _su3_assign(magnetic[3], fkl[1][2]);

      /*  upper left block 6x6 matrix  */

      _itimes_su3_minus_su3(aux,electric[3],magnetic[3]);
      _su3_refac_acc(sw[x][0][0],ka_csw_8,aux);

      _itimes_su3_minus_su3(aux,electric[1],magnetic[1]);
      _su3_minus_su3(v2,electric[2],magnetic[2]);
      _su3_acc(aux,v2);
      _real_times_su3(sw[x][1][0],ka_csw_8,aux);

      _itimes_su3_minus_su3(aux,magnetic[3],electric[3]);
      _su3_refac_acc(sw[x][2][0],ka_csw_8,aux);

      /*  lower right block 6x6 matrix */

      _itimes_su3_plus_su3(aux,electric[3],magnetic[3]);
      _su3_refac_acc(sw[x][0][1],(-ka_csw_8),aux);

      _itimes_su3_plus_su3(aux,electric[1],magnetic[1]);
      _su3_plus_su3(v2,electric[2],magnetic[2]);
      _su3_acc(aux,v2);
      _real_times_su3(sw[x][1][1],(-ka_csw_8),aux);

      _itimes_su3_plus_su3(aux,magnetic[3],electric[3]);
      _su3_refac_acc(sw[x][2][1],ka_csw_8,aux);
    }
#ifdef OMP
  } /* OpenMP closing brace */
#endif
  return;
}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clovertm_operators.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clovertm_operators.c new file mode 100644 index 
0000000000000000000000000000000000000000..55a412c537f9aa76d8e10022ac41028cd8f685ed --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clovertm_operators.c @@ -0,0 +1,1171 @@ +/*********************************************************************** + * + * Copyright (C) 2005 Martin Hasenbusch + * 2011 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include "su3.h" +#include "sse.h" +#include "linalg_eo.h" +#include "operator/Hopping_Matrix.h" +#include "operator/Hopping_Matrix_32.h" + +#include "tm_operators.h" +#include "tm_operators_32.h" + +#include "operator/clovertm_operators.h" + + +su3 *** sw; +su3 *** sw_inv; + +su3_32 *** sw_32; +su3_32 *** sw_inv_32; + +void clover_gamma5(const int ieo, + spinor * const l, const spinor * const k, const spinor * const j, + const double mu); +void clover(const int ieo, + spinor * const l, const spinor * const k, const spinor * const j, + const double mu); + +void Msw_full(spinor * const Even_new, spinor * const Odd_new, + spinor * const Even, spinor * const Odd) { + /* Even sites */ + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], Odd); + assign_mul_one_sw_pm_imu(EE, 
Even_new, Even, +g_mu);
  assign_add_mul_r(Even_new, g_spinor_field[DUM_MATRIX], -1., VOLUME/2);

  /* Odd sites */
  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], Even);
  assign_mul_one_sw_pm_imu(OO, Odd_new, Odd, +g_mu);
  assign_add_mul_r(Odd_new, g_spinor_field[DUM_MATRIX], -1., VOLUME/2);
}


/*******************************************************************
 *
 *
 * \hat Q_{+} =
 * \gamma_5(M_{oo}^+ - M_{oe}(M_{ee}^+ )^{-1}M_{eo})
 *
 * with clover term!
 * see documentation for details
 * k is the input field
 * l is the output field
 *
 * it acts only on the odd part or only
 * on a half spinor
 *
 * All operators below follow the same four steps:
 *   Hopping_Matrix(EO), clover_inv on the even sites,
 *   Hopping_Matrix(OE), then clover / clover_gamma5 on the odd sites.
 * They use g_spinor_field[DUM_MATRIX] and g_spinor_field[DUM_MATRIX+1]
 * as scratch space.
 *******************************************************************/


// this is the clover Qhat with mu = 0
// (g_mu is still handed to clover_inv, which reads the first half of
// sw_inv for tau3sign = +1)
void Qsw_psi(spinor * const l, spinor * const k) {
  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k);
  clover_inv(g_spinor_field[DUM_MATRIX+1], +1, g_mu);
  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]);
  clover_gamma5(OO, l, k, g_spinor_field[DUM_MATRIX], 0.);
}

// this is the twisted clover Qhat with -mu
void Qsw_minus_psi(spinor * const l, spinor * const k) {
  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k);
  clover_inv(g_spinor_field[DUM_MATRIX+1], -1, g_mu);
  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]);
  clover_gamma5(OO, l, k, g_spinor_field[DUM_MATRIX], -(g_mu + g_mu3));
}

// this is the twisted clover Qhat with +mu
void Qsw_plus_psi(spinor * const l, spinor * const k) {
  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k);
  clover_inv(g_spinor_field[DUM_MATRIX+1], +1, g_mu);
  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]);
  clover_gamma5(OO, l, k, g_spinor_field[DUM_MATRIX], +(g_mu + g_mu3));
}


// applies the mu = 0 clover Qhat twice: l = Qhat (Qhat k)
void Qsw_sq_psi(spinor * const l, spinor * const k) {
  /* first application of \hat Q (mu = 0); result in DUM_MATRIX */
  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k);
  clover_inv(g_spinor_field[DUM_MATRIX+1], +1, g_mu);
  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]);
  clover_gamma5(OO, g_spinor_field[DUM_MATRIX], k, g_spinor_field[DUM_MATRIX], 0.);
  /* second application of \hat Q (mu = 0); l doubles as scratch field */
  Hopping_Matrix(EO, l, g_spinor_field[DUM_MATRIX]);
  clover_inv(l, +1, g_mu);
  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+1], l);
  clover_gamma5(OO, l, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], 0.);
}

// l = \hat Q_{+} \hat Q_{-} k with twisted mass (g_mu + g_mu3)
void Qsw_pm_psi(spinor * const l, spinor * const k) {
  /* \hat Q_{-} */
  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k);
  clover_inv(g_spinor_field[DUM_MATRIX+1], -1, g_mu);
  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]);
  clover_gamma5(OO, g_spinor_field[DUM_MATRIX], k, g_spinor_field[DUM_MATRIX], -(g_mu + g_mu3));
  /* \hat Q_{+} */
  Hopping_Matrix(EO, l, g_spinor_field[DUM_MATRIX]);
  clover_inv(l, +1, g_mu);
  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+1], l);
  clover_gamma5(OO, l, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], +(g_mu + g_mu3));
}

// this is the clover Mhat with mu = 0
// (same as Qsw_psi but calling clover instead of clover_gamma5)
void Msw_psi(spinor * const l, spinor * const k) {
  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k);
  clover_inv(g_spinor_field[DUM_MATRIX+1], +1, g_mu);
  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]);
  clover(OO, l, k, g_spinor_field[DUM_MATRIX], 0.);
}

// this is the twisted clover Mhat with +mu
void Msw_plus_psi(spinor * const l, spinor * const k) {
  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k);
  clover_inv(g_spinor_field[DUM_MATRIX+1], +1, g_mu);
  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]);
  clover(OO, l, k, g_spinor_field[DUM_MATRIX], +(g_mu + g_mu3));
}

// this is the twisted clover Mhat with -mu
void Msw_minus_psi(spinor * const l, spinor * const k) {
  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k);
  clover_inv(g_spinor_field[DUM_MATRIX+1], -1, g_mu);
  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]);
  clover(OO, l, k, g_spinor_field[DUM_MATRIX], -(g_mu + g_mu3));
}


void 
H_eo_sw_inv_psi(spinor * const l, spinor * const k, const int ieo, const int tau3sign, const double mu) {
  /* hopping term first, then the inverse clover term in place on l */
  Hopping_Matrix(ieo, l, k);
  clover_inv(l, tau3sign, mu);
  return;
}


/**********************************************************
 *
 * clover_inv multiplies the spinor field l, in place, with
 * the inverted clover matrices held in sw_inv (they must
 * have been computed beforehand).
 *
 * The second half of sw_inv (offset VOLUME/2) is used when
 * tau3sign is negative and mu is non-zero, the first half
 * otherwise.
 *
 * this is needed for even/odd preconditioning
 *
 **********************************************************/

void clover_inv(spinor * const l, const int tau3sign, const double mu) {
  const int ioff = (tau3sign < 0 && fabs(mu) > 0) ? VOLUME/2 : 0;

  /************************ loop over all lattice sites *************************/
#ifdef OMP
#pragma omp parallel for
#endif
  for(int icx = 0; icx < (VOLUME/2); icx++) {
    su3_vector ALIGN diag_part, offd_part, old_s0, old_s2;
    spinor * const site = l + icx;
    /* blk[2*a + b] addresses sw_inv[ioff + icx][a][b] */
    const su3 * const blk = &sw_inv[ioff + icx][0][0];

    /* s0 and s2 are overwritten before their last use, keep copies */
    _vector_assign(old_s0, site->s0);
    _vector_assign(old_s2, site->s2);

    /* upper two spin components: blocks with second index 0 */
    _su3_multiply(diag_part, blk[0], old_s0);
    _su3_multiply(offd_part, blk[2], site->s1);
    _vector_add(site->s0, diag_part, offd_part);
    _su3_multiply(diag_part, blk[6], old_s0);
    _su3_multiply(offd_part, blk[4], site->s1);
    _vector_add(site->s1, diag_part, offd_part);

    /* lower two spin components: blocks with second index 1 */
    _su3_multiply(diag_part, blk[1], old_s2);
    _su3_multiply(offd_part, blk[3], site->s3);
    _vector_add(site->s2, diag_part, offd_part);
    _su3_multiply(diag_part, blk[7], old_s2);
    _su3_multiply(offd_part, blk[5], site->s3);
    _vector_add(site->s3, diag_part, offd_part);
  }
  /******************************** end of loop *********************************/
  return;
}

/* Same as clover_inv, but applied in place to the two flavour
 * components l_s and l_c with one and the same set of sw_inv blocks.
 * ieo == 1 selects the second half of sw_inv (offset VOLUME/2). */
void clover_inv_nd(const int ieo, spinor * const l_c, spinor * const l_s) {
  const int ioff = (ieo == 1) ? VOLUME/2 : 0;

#ifdef OMP
#pragma omp parallel for
#endif
  for(unsigned int icx = 0; icx < (VOLUME/2); icx++) {
    su3_vector ALIGN diag_part, offd_part, saved;
    const int icy = ioff + (int)icx;
    spinor * const site_s = l_s + icx;
    spinor * const site_c = l_c + icx;
    /* blk[2*a + b] addresses sw_inv[icy][a][b] */
    const su3 * const blk = &sw_inv[icy][0][0];

    /* upper two spin components of l_s */
    _vector_assign(saved, site_s->s0);
    _su3_multiply(diag_part, blk[0], saved);
    _su3_multiply(offd_part, blk[2], site_s->s1);
    _vector_add(site_s->s0, diag_part, offd_part);
    _su3_multiply(diag_part, blk[6], saved);
    _su3_multiply(offd_part, blk[4], site_s->s1);
    _vector_add(site_s->s1, diag_part, offd_part);

    /* upper two spin components of l_c */
    _vector_assign(saved, site_c->s0);
    _su3_multiply(diag_part, blk[0], saved);
    _su3_multiply(offd_part, blk[2], site_c->s1);
    _vector_add(site_c->s0, diag_part, offd_part);
    _su3_multiply(diag_part, blk[6], saved);
    _su3_multiply(offd_part, blk[4], site_c->s1);
    _vector_add(site_c->s1, diag_part, offd_part);

    /* lower two spin components of l_s */
    _vector_assign(saved, site_s->s2);
    _su3_multiply(diag_part, blk[1], saved);
    _su3_multiply(offd_part, blk[3], site_s->s3);
    _vector_add(site_s->s2, diag_part, offd_part);
    _su3_multiply(diag_part, blk[7], saved);
    _su3_multiply(offd_part, blk[5], site_s->s3);
    _vector_add(site_s->s3, diag_part, offd_part);

    /* lower two spin components of l_c */
    _vector_assign(saved, site_c->s2);
    _su3_multiply(diag_part, blk[1], saved);
    _su3_multiply(offd_part, blk[3], site_c->s3);
    _vector_add(site_c->s2, diag_part, offd_part);
    _su3_multiply(diag_part, blk[7], saved);
    _su3_multiply(offd_part, blk[5], site_c->s3);
    _vector_add(site_c->s3, diag_part, offd_part);
  }
  /******************************** end of loop *********************************/
  return;
}


/**************************************************************
 *
 * clover_gamma5 applies (1 + T + imug5) to spinor k,
 * subtracts j from the result and stores it in l
 * multiplied by gamma_5, i.e.
 *   upper two spin components:  l = (1 + T + imu) k - j
 *   lower two spin components:  l = j - (1 + T - imu) k
 *
 * ieo selects the checkerboard (0: offset 0, otherwise
 * offset (VOLUME+RAND)/2 into g_eo2lexic)
 *
 * it is assumed that the clover leaf is computed and stored
 * in sw[VOLUME][3][2]
 * the corresponding routine can be found in clover_leaf.c
 *
 **************************************************************/

void clover_gamma5(const int ieo,
                   spinor * const l, const spinor * const k, const spinor * const j,
                   const double mu) {
#ifdef OMP
#pragma omp parallel
  {
#endif
    su3_vector ALIGN chi, psi1, psi2;
    int ix;
    int ioff,icx;
    const su3 *w1,*w2,*w3;
    spinor *r;
    const spinor *s,*t;

    if(ieo == 0) {
      ioff = 0;
    }
    else {
      ioff = (VOLUME+RAND)/2;
    }

/************************ loop over all lattice sites *************************/
#ifdef OMP
#pragma omp for
#endif
    for(icx = ioff; icx < (VOLUME/2+ioff); icx++) {
      ix = g_eo2lexic[icx];

      // the spinor fields are indexed from 0 on either checkerboard
      r = l + icx-ioff;
      s = k + icx-ioff;
      t = j + icx-ioff;

      // upper two spin components first
      w1=&sw[ix][0][0];
      w2=w1+2; /*&sw[ix][1][0];*/
      w3=w1+4; /*&sw[ix][2][0];*/
      _su3_multiply(psi1,*w1,(*s).s0);
      _su3_multiply(chi,*w2,(*s).s1);
      _vector_add_assign(psi1,chi);
      // lower off-diagonal block is obtained from *w2 by _su3_inverse_multiply
      _su3_inverse_multiply(psi2,*w2,(*s).s0);
      _su3_multiply(chi,*w3,(*s).s1);
      _vector_add_assign(psi2,chi);
      // add in the twisted mass term (plus in the upper components)
      _vector_add_i_mul(psi1, mu, (*s).s0);
      _vector_add_i_mul(psi2, mu, (*s).s1);

      _vector_sub((*r).s0,psi1,(*t).s0);
      _vector_sub((*r).s1,psi2,(*t).s1);

      // now the lower two spin components
      w1++; /*=&sw[ix][0][1];*/
      w2++; /*=&sw[ix][1][1];*/
      w3++; /*=&sw[ix][2][1];*/
      _su3_multiply(psi1,*w1,(*s).s2); _su3_multiply(chi,*w2,(*s).s3);
      _vector_add_assign(psi1,chi);
      _su3_inverse_multiply(psi2,*w2,(*s).s2); _su3_multiply(chi,*w3,(*s).s3);
      _vector_add_assign(psi2,chi);
      // add in the twisted mass term (minus from g5 in the lower components)
      _vector_add_i_mul(psi1, -mu, (*s).s2);
      _vector_add_i_mul(psi2, -mu, (*s).s3);

      /**************** multiply with gamma5 included ******************************/
      // operands swapped relative to the upper components: t - psi
      _vector_sub((*r).s2,(*t).s2,psi1);
      _vector_sub((*r).s3,(*t).s3,psi2);
      /******************************** end of loop *********************************/
    }
#ifdef OMP
  } /* OMP closing brace */
#endif
  return;
}


/**************************************************************
 *
 * clover applies (1 + T + imug5) to spinor k,
 * subtracts j from the result and stores it in l:
 *   l = (1 + T + imug5) k - j
 *
 * it is assumed that the clover leaf is computed and stored
 * in sw[VOLUME][3][2]
 * the corresponding routine can be found in clover_leaf.c
 *
 **************************************************************/


void clover(const int ieo,
            spinor * const l, const spinor * const k, const spinor * const j,
            const double mu) {
#ifdef OMP
#pragma omp parallel
  {
#endif
    su3_vector ALIGN chi, psi1, psi2;
    int ix;
    int ioff;
    const su3 *w1,*w2,*w3;
    spinor *r;
    const spinor *s,*t;

    if(ieo == 0) {
      ioff = 0;
    }
    else {
      ioff = (VOLUME+RAND)/2;
    }
#ifdef OMP
#pragma omp for
#endif
    for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
      ix = g_eo2lexic[icx];

      r = l + icx-ioff;
      s = k + icx-ioff;
      t = j + icx-ioff;

      // upper two spin components first
      w1=&sw[ix][0][0];
      w2=w1+2; /*&sw[ix][1][0];*/
      w3=w1+4; /*&sw[ix][2][0];*/
      _su3_multiply(psi1,*w1,(*s).s0);
      _su3_multiply(chi,*w2,(*s).s1);
      _vector_add_assign(psi1,chi);
      _su3_inverse_multiply(psi2,*w2,(*s).s0);
      _su3_multiply(chi,*w3,(*s).s1);
      _vector_add_assign(psi2,chi);

      // add in the twisted mass term (plus in the upper components)
      _vector_add_i_mul(psi1, mu, (*s).s0);
      _vector_add_i_mul(psi2, mu, (*s).s1);

      _vector_sub((*r).s0,psi1,(*t).s0);
      _vector_sub((*r).s1,psi2,(*t).s1);

      // now the lower two spin components
      w1++; /*=&sw[ix][0][1];*/
      w2++; /*=&sw[ix][1][1];*/
      w3++; /*=&sw[ix][2][1];*/
      _su3_multiply(psi1,*w1,(*s).s2);
      _su3_multiply(chi,*w2,(*s).s3);
      _vector_add_assign(psi1,chi);
      _su3_inverse_multiply(psi2,*w2,(*s).s2);
      _su3_multiply(chi,*w3,(*s).s3);
      _vector_add_assign(psi2,chi);

      // add in the twisted mass term (minus from g5 in the lower components)
      _vector_add_i_mul(psi1, -mu, (*s).s2);
      _vector_add_i_mul(psi2, -mu, (*s).s3);

      _vector_sub((*r).s2,psi1,(*t).s2);
      _vector_sub((*r).s3,psi2,(*t).s3);
    }
#ifdef OMP
  } /* OpenMP closing brace */
#endif
  return;
}

/**************************************************************
 *
 * clover_nd applies the clover (1 + T + imug5tau3 + epstau1)
 * term to the flavour doublet (k_s, k_c), subtracts (j_s, j_c)
 * from the result and stores it in (l_s, l_c)
 *
 * the s component gets +mubar, the c component -mubar;
 * epsbar mixes the two components
 *
 * it is assumed that the clover leaf is computed and stored
 * in sw[VOLUME][3][2]
 * the corresponding routine can be found in clover_leaf.c
 *
 **************************************************************/

void clover_nd(const int ieo,
               spinor * const l_c, spinor * const l_s,
               const spinor * const k_c, const spinor * const k_s,
               const spinor * const j_c, const spinor * const j_s,
               const double mubar, const double epsbar) {
#ifdef OMP
#pragma omp parallel
  {
#endif
    su3_vector ALIGN chi, psi1, psi2;
    int ix;
    int ioff;
    const su3 *w1,*w2,*w3;
    spinor *r_s, *r_c;
    const spinor *s_s, *s_c, *t_s, *t_c;

    if(ieo == 0) {
      ioff = 0;
    }
    else {
      ioff = (VOLUME+RAND)/2;
    }
    /************************ loop over all lattice sites *************************/
#ifdef OMP
#pragma omp for
#endif
    for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
      ix = g_eo2lexic[icx];

      r_s = l_s + icx-ioff;
      r_c = l_c + icx-ioff;
      s_s = k_s + icx-ioff;
      s_c = k_c + icx-ioff;
      t_s = j_s + icx-ioff;
      t_c = j_c + icx-ioff;

      // upper two spin components first
      w1=&sw[ix][0][0];
      w2=w1+2; /*&sw[ix][1][0];*/
      w3=w1+4; /*&sw[ix][2][0];*/
      _su3_multiply(psi1, *w1, (*s_s).s0);
      _su3_multiply(chi, *w2, (*s_s).s1);
      _vector_add_assign(psi1, chi);
      _su3_inverse_multiply(psi2, *w2, (*s_s).s0);
      _su3_multiply(chi, *w3, (*s_s).s1);
      _vector_add_assign(psi2, chi);

      // add in the twisted mass term (plus in the upper components)
      _vector_add_i_mul(psi1, mubar, (*s_s).s0);
      _vector_add_i_mul(psi2, mubar, (*s_s).s1);

      // flavour mixing: eps times the c component
      _vector_add_mul(psi1, epsbar, (*s_c).s0);
      _vector_add_mul(psi2, epsbar, (*s_c).s1);

      _vector_sub((*r_s).s0, psi1, (*t_s).s0);
      _vector_sub((*r_s).s1, psi2, (*t_s).s1);

      _su3_multiply(psi1, *w1, (*s_c).s0);
      _su3_multiply(chi, *w2, (*s_c).s1);
      _vector_add_assign(psi1, chi);
      _su3_inverse_multiply(psi2, *w2, (*s_c).s0);
      _su3_multiply(chi, *w3, (*s_c).s1);
      _vector_add_assign(psi2, chi);

      // add in the twisted mass term
      // (c component: minus in the upper components)
      _vector_add_i_mul(psi1, -mubar, (*s_c).s0);
      _vector_add_i_mul(psi2, -mubar, (*s_c).s1);

      // flavour mixing: eps times the s component
      _vector_add_mul(psi1, epsbar, (*s_s).s0);
      _vector_add_mul(psi2, epsbar, (*s_s).s1);

      _vector_sub((*r_c).s0, psi1, (*t_c).s0);
      _vector_sub((*r_c).s1, psi2, (*t_c).s1);


      // now the lower two spin components
      w1++; /*=&sw[ix][0][1];*/
      w2++; /*=&sw[ix][1][1];*/
      w3++; /*=&sw[ix][2][1];*/
      _su3_multiply(psi1, *w1, (*s_s).s2);
      _su3_multiply(chi, *w2, (*s_s).s3);
      _vector_add_assign(psi1, chi);
      _su3_inverse_multiply(psi2, *w2, (*s_s).s2);
      _su3_multiply(chi, *w3, (*s_s).s3);
      _vector_add_assign(psi2, chi);

      // add in the twisted mass term (minus from g5 in the lower components)
      _vector_add_i_mul(psi1, -mubar, (*s_s).s2);
      _vector_add_i_mul(psi2, -mubar, (*s_s).s3);

      _vector_add_mul(psi1, epsbar, (*s_c).s2);
      _vector_add_mul(psi2, epsbar, (*s_c).s3);

      _vector_sub((*r_s).s2,psi1,(*t_s).s2);
      _vector_sub((*r_s).s3,psi2,(*t_s).s3);

      _su3_multiply(psi1, *w1, (*s_c).s2);
      _su3_multiply(chi, *w2, (*s_c).s3);
      _vector_add_assign(psi1, chi);
      _su3_inverse_multiply(psi2, *w2, (*s_c).s2);
      _su3_multiply(chi, *w3, (*s_c).s3);
      _vector_add_assign(psi2, chi);

      // add in the twisted mass term
      // (c component: plus in the lower components)
      _vector_add_i_mul(psi1, mubar, (*s_c).s2);
      _vector_add_i_mul(psi2, mubar, (*s_c).s3);

      _vector_add_mul(psi1, epsbar, (*s_s).s2);
      _vector_add_mul(psi2, epsbar, (*s_s).s3);

      _vector_sub((*r_c).s2, psi1, (*t_c).s2);
      _vector_sub((*r_c).s3, psi2, (*t_c).s3);
    }
#ifdef OMP
  } /* OpenMP closing brace */
#endif
  return;
}

/**************************************************************
 *
 * clover_gamma5_nd is clover_nd with the result multiplied by
 * gamma_5: the upper two spin components are identical to
 * clover_nd, in the lower two the subtraction is reversed
 * (j minus the clover-applied k)
 *
 **************************************************************/

void clover_gamma5_nd(const int ieo,
                      spinor * const l_c, spinor * const l_s,
                      const spinor * const k_c, const spinor * const k_s,
                      const spinor * const j_c, const spinor * const j_s,
                      const double mubar, const double epsbar) {
#ifdef OMP
#pragma omp parallel
  {
#endif
    su3_vector ALIGN chi, psi1, psi2;
    int ix;
    int ioff;
    const su3 *w1,*w2,*w3;
    spinor *r_s, *r_c;
    const spinor *s_s, *s_c, *t_s, *t_c;

    if(ieo == 0) {
      ioff = 0;
    }
    else {
      ioff = (VOLUME+RAND)/2;
    }
    /************************ loop over all lattice sites *************************/
#ifdef OMP
#pragma omp for
#endif
    for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) {
      ix = g_eo2lexic[icx];

      r_s = l_s + icx-ioff;
      r_c = l_c + icx-ioff;
      s_s = k_s + icx-ioff;
      s_c = k_c + icx-ioff;
      t_s = j_s + icx-ioff;
      t_c = j_c + icx-ioff;

      // upper two spin components first
      w1=&sw[ix][0][0];
      w2=w1+2; /*&sw[ix][1][0];*/
      w3=w1+4; /*&sw[ix][2][0];*/
      _su3_multiply(psi1, *w1, (*s_s).s0);
      _su3_multiply(chi, *w2, (*s_s).s1);
      _vector_add_assign(psi1, chi);
      _su3_inverse_multiply(psi2, *w2, (*s_s).s0);
      _su3_multiply(chi, *w3, (*s_s).s1);
      _vector_add_assign(psi2, chi);

      // add in the twisted mass term (plus in the upper components)
      _vector_add_i_mul(psi1, mubar, (*s_s).s0);
      _vector_add_i_mul(psi2, mubar, (*s_s).s1);

      _vector_add_mul(psi1, epsbar, (*s_c).s0);
      _vector_add_mul(psi2, epsbar, (*s_c).s1);

      _vector_sub((*r_s).s0, psi1, (*t_s).s0);
      _vector_sub((*r_s).s1, psi2, (*t_s).s1);

      _su3_multiply(psi1, *w1, (*s_c).s0);
      _su3_multiply(chi, *w2, (*s_c).s1);
      _vector_add_assign(psi1, chi);
      _su3_inverse_multiply(psi2, *w2, (*s_c).s0);
      _su3_multiply(chi, *w3, (*s_c).s1);
      _vector_add_assign(psi2, chi);

      // add in the twisted mass term
      // (c component: minus in the upper components)
      _vector_add_i_mul(psi1, -mubar, (*s_c).s0);
      _vector_add_i_mul(psi2, -mubar, (*s_c).s1);

      _vector_add_mul(psi1, epsbar, (*s_s).s0);
      _vector_add_mul(psi2, epsbar, (*s_s).s1);

      _vector_sub((*r_c).s0, psi1, (*t_c).s0);
      _vector_sub((*r_c).s1, psi2, (*t_c).s1);


      // now the lower two spin components
      w1++; /*=&sw[ix][0][1];*/
      w2++; /*=&sw[ix][1][1];*/
      w3++; /*=&sw[ix][2][1];*/
      _su3_multiply(psi1, *w1, (*s_s).s2);
      _su3_multiply(chi, *w2, (*s_s).s3);
      _vector_add_assign(psi1, chi);
      _su3_inverse_multiply(psi2, *w2, (*s_s).s2);
      _su3_multiply(chi, *w3, (*s_s).s3);
      _vector_add_assign(psi2, chi);

      // add in the twisted mass term (minus from g5 in the lower components)
      _vector_add_i_mul(psi1, -mubar, (*s_s).s2);
      _vector_add_i_mul(psi2, -mubar, (*s_s).s3);

      _vector_add_mul(psi1, epsbar, (*s_c).s2);
      _vector_add_mul(psi2, epsbar, (*s_c).s3);

      // gamma5: operands swapped, t - psi
      _vector_sub((*r_s).s2, (*t_s).s2, psi1);
      _vector_sub((*r_s).s3, (*t_s).s3, psi2);

      _su3_multiply(psi1, *w1, (*s_c).s2);
      _su3_multiply(chi, *w2, (*s_c).s3);
      _vector_add_assign(psi1, chi);
      _su3_inverse_multiply(psi2, *w2, (*s_c).s2);
      _su3_multiply(chi, *w3, (*s_c).s3);
      _vector_add_assign(psi2, chi);

      // add in the twisted mass term
      // (c component: plus in the lower components)
      _vector_add_i_mul(psi1, mubar, (*s_c).s2);
      _vector_add_i_mul(psi2, mubar, (*s_c).s3);

      _vector_add_mul(psi1, epsbar, (*s_s).s2);
      _vector_add_mul(psi2, epsbar, (*s_s).s3);

      // gamma5: operands swapped, t - psi
      _vector_sub((*r_c).s2, (*t_c).s2, psi1);
      _vector_sub((*r_c).s3, (*t_c).s3, psi2);
    }
#ifdef OMP
  } /* OpenMP closing brace */
#endif
  return;
}


/**************************************************************
 *
 * assign_mul_one_sw_pm_imu applies (1 + T + imug5) to spinor l
 * and stores it in k
 *
 * it is assumed that the clover leaf is computed and stored
 * in sw[VOLUME][3][2]
 * the corresponding routine can be found in clover_leaf.c
 *
 **************************************************************/


void assign_mul_one_sw_pm_imu(const int ieo,
                              spinor * const k, const spinor * const l,
                              const double mu) {
#ifdef OMP
#pragma omp parallel
  {
#endif
    su3_vector ALIGN chi, psi1, psi2;
    int ix;
    int ioff;
    const su3 *w1, *w2, *w3;
    spinor *r;
    const spinor *s;

    if(ieo == 0) {
      ioff = 0;
    }
    else {
      ioff = (VOLUME+RAND)/2;
    }
    /************************ loop over all lattice sites *************************/
#ifdef OMP
#pragma omp for
#endif
    for(unsigned icx = ioff; icx < (VOLUME/2+ioff); icx++) {
      ix = g_eo2lexic[icx];

      // r: output site in k, s: input site in l
      r = k + icx-ioff;
      s = l + icx-ioff;

      // upper two spin components first
      w1=&sw[ix][0][0];
      w2=w1+2; /*&sw[ix][1][0];*/
      w3=w1+4; /*&sw[ix][2][0];*/
      _su3_multiply(psi1,*w1,(*s).s0);
      _su3_multiply(chi,*w2,(*s).s1);
      _vector_add_assign(psi1,chi);
      _su3_inverse_multiply(psi2,*w2,(*s).s0);
      _su3_multiply(chi,*w3,(*s).s1);
      _vector_add_assign(psi2,chi);

      // add in the twisted mass term (plus in the upper components)
      _vector_add_i_mul(psi1, mu, (*s).s0);
      _vector_add_i_mul(psi2, mu, (*s).s1);

      _vector_assign((*r).s0, psi1);
      _vector_assign((*r).s1, psi2);

      // now the lower two spin components
      w1++; /*=&sw[ix][0][1];*/
      w2++; /*=&sw[ix][1][1];*/
      w3++; /*=&sw[ix][2][1];*/
      _su3_multiply(psi1,*w1,(*s).s2);
      _su3_multiply(chi,*w2,(*s).s3);
      _vector_add_assign(psi1,chi);
      _su3_inverse_multiply(psi2,*w2,(*s).s2);
      _su3_multiply(chi,*w3,(*s).s3);
      _vector_add_assign(psi2,chi);

      // add in the twisted mass term (minus from g5 in the lower components)
      _vector_add_i_mul(psi1, -mu, (*s).s2);
      _vector_add_i_mul(psi2, -mu, (*s).s3);

      _vector_assign((*r).s2, psi1);
      _vector_assign((*r).s3, psi2);
    }
#ifdef OMP
  } /* OpenMP closing brace */
#endif
  return;
}

/**************************************************************
 *
 * assign_mul_one_sw_pm_imu_eps 
applies + * (1 + T + imug5tau3 + epstau1) to spinor l + * and stores it in k + * + * it is assumed that the clover leaf is computed and stored + * in sw[VOLUME][3][2] + * the corresponding routine can be found in clover_leaf.c + * + **************************************************************/ + + +void assign_mul_one_sw_pm_imu_eps(const int ieo, + spinor * const k_s, spinor * const k_c, + const spinor * const l_s, const spinor * const l_c, + const double mu, const double eps) { +#ifdef OMP +#pragma omp parallel + { +#endif + su3_vector ALIGN chi, psi1, psi2; + int ix; + int ioff; + const su3 *w1, *w2, *w3; + spinor *r_s, *r_c; + const spinor *s_s, *s_c; + + if(ieo == 0) { + ioff = 0; + } + else { + ioff = (VOLUME+RAND)/2; + } + /************************ loop over all lattice sites *************************/ +#ifdef OMP +#pragma omp for +#endif + for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) { + ix = g_eo2lexic[icx]; + + r_s = k_s + icx-ioff; + r_c = k_c + icx-ioff; + s_s = l_s + icx-ioff; + s_c = l_c + icx-ioff; + + // upper two spin components first + w1=&sw[ix][0][0]; + w2=w1+2; /*&sw[ix][1][0];*/ + w3=w1+4; /*&sw[ix][2][0];*/ + _su3_multiply(psi1, *w1, (*s_s).s0); + _su3_multiply(chi, *w2, (*s_s).s1); + _vector_add_assign(psi1, chi); + _su3_inverse_multiply(psi2, *w2, (*s_s).s0); + _su3_multiply(chi, *w3, (*s_s).s1); + _vector_add_assign(psi2, chi); + + // add in the twisted mass term (plus in the upper components) + _vector_add_i_mul(psi1, mu, (*s_s).s0); + _vector_add_i_mul(psi2, mu, (*s_s).s1); + + _vector_add_mul(psi1, eps, (*s_c).s0); + _vector_add_mul(psi2, eps, (*s_c).s1); + + _vector_assign((*r_s).s0, psi1); + _vector_assign((*r_s).s1, psi2); + + _su3_multiply(psi1, *w1, (*s_c).s0); + _su3_multiply(chi, *w2, (*s_c).s1); + _vector_add_assign(psi1, chi); + _su3_inverse_multiply(psi2, *w2, (*s_c).s0); + _su3_multiply(chi, *w3, (*s_c).s1); + _vector_add_assign(psi2, chi); + + // add in the twisted mass term (plus in the upper components) + 
_vector_add_i_mul(psi1, -mu, (*s_c).s0); + _vector_add_i_mul(psi2, -mu, (*s_c).s1); + + _vector_add_mul(psi1, eps, (*s_s).s0); + _vector_add_mul(psi2, eps, (*s_s).s1); + + _vector_assign((*r_c).s0, psi1); + _vector_assign((*r_c).s1, psi2); + + // now lower two spin components + w1++; /*=&sw[ix][0][1];*/ + w2++; /*=&sw[ix][1][1];*/ + w3++; /*=&sw[ix][2][1];*/ + _su3_multiply(psi1, *w1, (*s_s).s2); + _su3_multiply(chi, *w2, (*s_s).s3); + _vector_add_assign(psi1, chi); + _su3_inverse_multiply(psi2, *w2, (*s_s).s2); + _su3_multiply(chi, *w3, (*s_s).s3); + _vector_add_assign(psi2, chi); + + // add in the twisted mass term (minus from g5 in the lower components) + _vector_add_i_mul(psi1, -mu, (*s_s).s2); + _vector_add_i_mul(psi2, -mu, (*s_s).s3); + + _vector_add_mul(psi1, eps, (*s_c).s2); + _vector_add_mul(psi2, eps, (*s_c).s3); + + _vector_assign((*r_s).s2, psi1); + _vector_assign((*r_s).s3, psi2); + + _su3_multiply(psi1, *w1, (*s_c).s2); + _su3_multiply(chi, *w2, (*s_c).s3); + _vector_add_assign(psi1, chi); + _su3_inverse_multiply(psi2, *w2, (*s_c).s2); + _su3_multiply(chi, *w3, (*s_c).s3); + _vector_add_assign(psi2, chi); + + // add in the twisted mass term (minus from g5 in the lower components) + _vector_add_i_mul(psi1, mu, (*s_c).s2); + _vector_add_i_mul(psi2, mu, (*s_c).s3); + + _vector_add_mul(psi1, eps, (*s_s).s2); + _vector_add_mul(psi2, eps, (*s_s).s3); + + _vector_assign((*r_c).s2, psi1); + _vector_assign((*r_c).s3, psi2); + + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} + + + +void assign_mul_one_sw_pm_imu_inv(const int ieo, + spinor * const k, const spinor * const l, + const double mu) { +#ifdef OMP +#pragma omp parallel + { +#endif + su3_vector ALIGN psi, chi, phi1, phi3; + const su3 *w1, *w2, *w3, *w4; + const spinor *rn; + spinor *s; + + /************************ loop over all lattice sites *************************/ +#ifdef OMP +#pragma omp for +#endif + for(int icx = 0; icx < (VOLUME/2); icx++) { + + rn = l + icx; + s = k + icx; + 
_vector_assign(phi1,(*rn).s0); + _vector_assign(phi3,(*rn).s2); + + w1=&sw_inv[icx][0][0]; + w2=w1+2; /* &sw_inv[icx][1][0]; */ + w3=w1+4; /* &sw_inv[icx][2][0]; */ + w4=w1+6; /* &sw_inv[icx][3][0]; */ + _su3_multiply(psi,*w1,phi1); + _su3_multiply(chi,*w2,(*rn).s1); + _vector_add((*s).s0,psi,chi); + _su3_multiply(psi,*w4,phi1); + _su3_multiply(chi,*w3,(*rn).s1); + _vector_add((*s).s1,psi,chi); + + w1++; /* &sw_inv[icx][0][1]; */ + w2++; /* &sw_inv[icx][1][1]; */ + w3++; /* &sw_inv[icx][2][1]; */ + w4++; /* &sw_inv[icx][3][1]; */ + _su3_multiply(psi,*w1,phi3); + _su3_multiply(chi,*w2,(*rn).s3); + _vector_add((*s).s2,psi,chi); + _su3_multiply(psi,*w4,phi3); + _su3_multiply(chi,*w3,(*rn).s3); + _vector_add((*s).s3,psi,chi); + + /******************************** end of loop *********************************/ + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} + + +/******** + * + * temporary initialisation function + * + ********/ + +su3 ** sw1, ** sw_inv1; +su3 * _sw, *_sw_inv; + +su3_32 ** sw1_32, ** sw_inv1_32; +su3_32 * _sw_32, *_sw_inv_32; + +void init_sw_fields() { + int V = VOLUME; + su3 * tmp; + su3_32 * tmp_32; + static int sw_init = 0; + + if(!sw_init) { + if((void*)(sw = (su3***)calloc(V, sizeof(su3**))) == NULL) { + fprintf (stderr, "sw malloc err\n"); + } + if((void*)(sw_inv = (su3***)calloc(V, sizeof(su3**))) == NULL) { + fprintf (stderr, "sw_inv malloc err\n"); + } + if((void*)(sw1 = (su3**)calloc(3*V, sizeof(su3*))) == NULL) { + fprintf (stderr, "sw1 malloc err\n"); + } + if((void*)(sw_inv1 = (su3**)calloc(4*V, sizeof(su3*))) == NULL) { + fprintf (stderr, "sw_inv1 malloc err\n"); + } + if((void*)(_sw = (su3*)calloc(3*2*V+1, sizeof(su3))) == NULL) { + fprintf (stderr, "_sw malloc err\n"); + } + if((void*)(_sw_inv = (su3*)calloc(4*2*V+1, sizeof(su3))) == NULL) { + fprintf (stderr, "_sw_inv malloc err\n"); + } + sw[0] = sw1; + sw_inv[0] = sw_inv1; + for(int i = 1; i < V; i++) { + sw[i] = sw[i-1]+3; + sw_inv[i] = sw_inv[i-1]+4; + } + 
sw[0][0] = (su3*)(((unsigned long int)(_sw)+ALIGN_BASE)&~ALIGN_BASE); + sw_inv[0][0] = (su3*)(((unsigned long int)(_sw_inv)+ALIGN_BASE)&~ALIGN_BASE); + tmp = sw[0][0]; + for(int i = 0; i < V; i++) { + for(int j = 0; j < 3; j++) { + sw[i][j] = tmp; + tmp = tmp+2; + } + } + + tmp = sw_inv[0][0]; + for(int i = 0; i < V; i++) { + for(int j = 0; j < 4; j++) { + sw_inv[i][j] = tmp; + tmp = tmp+2; + } + } + + /* 32 bit fields */ + if((void*)(sw_32 = (su3_32***)calloc(V, sizeof(su3_32**))) == NULL) { + fprintf (stderr, "sw (32 bit) malloc err\n"); + } + if((void*)(sw_inv_32 = (su3_32***)calloc(V, sizeof(su3_32**))) == NULL) { + fprintf (stderr, "sw_inv (32 bit) malloc err\n"); + } + if((void*)(sw1_32 = (su3_32**)calloc(3*V, sizeof(su3_32*))) == NULL) { + fprintf (stderr, "sw1 (32 bit) malloc err\n"); + } + if((void*)(sw_inv1_32 = (su3_32**)calloc(4*V, sizeof(su3_32*))) == NULL) { + fprintf (stderr, "sw_inv1 (32 bit) malloc err\n"); + } + if((void*)(_sw_32 = (su3_32*)calloc(3*2*V+1, sizeof(su3_32))) == NULL) { + fprintf (stderr, "_sw (32 bit) malloc err\n"); + } + if((void*)(_sw_inv_32 = (su3_32*)calloc(4*2*V+1, sizeof(su3_32))) == NULL) { + fprintf (stderr, "_sw_inv (32 bit) malloc err\n"); + } + + sw_32[0] = sw1_32; + sw_inv_32[0] = sw_inv1_32; + for(int i = 1; i < V; i++) { + sw_32[i] = sw_32[i-1]+3; + sw_inv_32[i] = sw_inv_32[i-1]+4; + } + sw_32[0][0] = (su3_32*)(((unsigned long int)(_sw_32)+ALIGN_BASE32)&~ALIGN_BASE32); + sw_inv_32[0][0] = (su3_32*)(((unsigned long int)(_sw_inv_32)+ALIGN_BASE32)&~ALIGN_BASE32); + tmp_32 = sw_32[0][0]; + for(int i = 0; i < V; i++) { + for(int j = 0; j < 3; j++) { + sw_32[i][j] = tmp_32; + tmp_32 = tmp_32+2; + } + } + + tmp_32 = sw_inv_32[0][0]; + for(int i = 0; i < V; i++) { + for(int j = 0; j < 4; j++) { + sw_inv_32[i][j] = tmp_32; + tmp_32 = tmp_32+2; + } + } + + + + sw_init = 1; + } + return; +} + + +void copy_32_sw_fields(){ + + int V = VOLUME; + + for(int i = 0; i < V; i++) { + for(int j = 0; j < 3; j++) { + for(int k = 0; k < 2; 
k++) { + sw_32[i][j][k].c00 = (_Complex float) sw[i][j][k].c00; + sw_32[i][j][k].c01 = (_Complex float) sw[i][j][k].c01; + sw_32[i][j][k].c02 = (_Complex float) sw[i][j][k].c02; + + sw_32[i][j][k].c10 = (_Complex float) sw[i][j][k].c10; + sw_32[i][j][k].c11 = (_Complex float) sw[i][j][k].c11; + sw_32[i][j][k].c12 = (_Complex float) sw[i][j][k].c12; + + sw_32[i][j][k].c20 = (_Complex float) sw[i][j][k].c20; + sw_32[i][j][k].c21 = (_Complex float) sw[i][j][k].c21; + sw_32[i][j][k].c22 = (_Complex float) sw[i][j][k].c22; + } + } + } + + for(int i = 0; i < V; i++) { + for(int j = 0; j < 4; j++) { + for(int k = 0; k < 2; k++) { + sw_inv_32[i][j][k].c00 = (_Complex float) sw_inv[i][j][k].c00; + sw_inv_32[i][j][k].c01 = (_Complex float) sw_inv[i][j][k].c01; + sw_inv_32[i][j][k].c02 = (_Complex float) sw_inv[i][j][k].c02; + + sw_inv_32[i][j][k].c10 = (_Complex float) sw_inv[i][j][k].c10; + sw_inv_32[i][j][k].c11 = (_Complex float) sw_inv[i][j][k].c11; + sw_inv_32[i][j][k].c12 = (_Complex float) sw_inv[i][j][k].c12; + + sw_inv_32[i][j][k].c20 = (_Complex float) sw_inv[i][j][k].c20; + sw_inv_32[i][j][k].c21 = (_Complex float) sw_inv[i][j][k].c21; + sw_inv_32[i][j][k].c22 = (_Complex float) sw_inv[i][j][k].c22; + } + } + } +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clovertm_operators.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clovertm_operators.h new file mode 100644 index 0000000000000000000000000000000000000000..a2764c76499f2c14de4647e75d63110de8c2700d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clovertm_operators.h @@ -0,0 +1,67 @@ +/*********************************************************************** + * + * Copyright (C) 2005 Martin Hasenbusch + * 2009 Carsten Urbach + * 2012 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ + +#ifndef _CLOVERTM_OPERATORS_H +#define _CLOVERTM_OPERATORS_H + +#include "su3.h" + +extern su3 *** sw; +extern su3 *** sw_inv; +extern su3_32 *** sw_32; +extern su3_32 *** sw_inv_32; +extern su3 ** swm, ** swp; + +void assign_mul_one_sw_pm_imu(const int ieo, spinor * const k, const spinor * const l, const double mu); +void assign_mul_one_sw_pm_imu_inv(const int ieo, spinor * const k, const spinor * const l, const double mu); +void Msw_full(spinor * const Even_new, spinor * const Odd_new, + spinor * const Even, spinor * const Odd); +void clover_inv(spinor * const l, const int tau3sign, const double mu); +void Qsw_psi(spinor * const l, spinor * const k); +void Qsw_plus_psi(spinor * const l, spinor * const k); +void Qsw_minus_psi(spinor * const l, spinor * const k); +void Qsw_sq_psi(spinor * const l, spinor * const k); +void Qsw_pm_psi(spinor * const l, spinor * const k); +void Msw_psi(spinor * const l, spinor * const k); +void Msw_plus_psi(spinor * const l, spinor * const k); +void Msw_minus_psi(spinor * const l, spinor * const k); +void H_eo_sw_inv_psi(spinor * const l, spinor * const k, const int ieo, const int tau3sign, const double mu); +void init_sw_fields(); +void copy_32_sw_fields(); + +void clover_nd(const int ieo, + spinor * const l_s, spinor * const l_c, + const spinor * const 
k_s, const spinor * const k_c, + const spinor * const j_s, const spinor * const j_c, + const double mubar, const double epsbar); +void clover_gamma5_nd(const int ieo, + spinor * const l_s, spinor * const l_c, + const spinor * const k_s, const spinor * const k_c, + const spinor * const j_s, const spinor * const j_c, + const double mubar, const double epsbar); +void clover_inv_nd(const int ieo, spinor * const l_s, spinor * const l_c); + +void assign_mul_one_sw_pm_imu_eps(const int ieo, + spinor * const k_s, spinor * const k_c, + const spinor * const l_s, const spinor * const l_c, + const double mu, const double eps); +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clovertm_operators_32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clovertm_operators_32.c new file mode 100644 index 0000000000000000000000000000000000000000..21cd9b3d3bc5d6b963d11d469362d42a60a95de3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clovertm_operators_32.c @@ -0,0 +1,595 @@ +/*********************************************************************** + * + * Copyright (C) 2005 Martin Hasenbusch + * 2011 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif + +// work-around for missing single precision implementation of inline SSE +#ifdef SSE +#define REDEFSSE +#undef SSE +#endif + +#ifdef SSE2 +#define REDEFSSE2 +#undef SSE2 +#endif + +#ifdef SSE3 +#define REDEFSSE3 +#undef SSE3 +#endif + +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include "su3.h" +#include "sse.h" +#include "linalg_eo.h" +#include "operator/Hopping_Matrix.h" +#include "operator/Hopping_Matrix_32.h" + +#include "tm_operators.h" +#include "tm_operators_32.h" + +#include "operator/clovertm_operators.h" +#include "operator/clovertm_operators_32.h" + + +void Qsw_pm_psi_32(spinor32 * const l, spinor32 * const k) { +#ifdef OMP +#pragma omp parallel + { +#endif + /* \hat Q_{-} */ + Hopping_Matrix_32_orphaned(EO, g_spinor_field32[1], k); + clover_inv_32_orphaned(g_spinor_field32[1], -1, g_mu); + Hopping_Matrix_32_orphaned(OE, g_spinor_field32[0], g_spinor_field32[1]); + clover_gamma5_32_orphaned(OO, g_spinor_field32[0], k, g_spinor_field32[0], -(g_mu + g_mu3)); + /* \hat Q_{+} */ + Hopping_Matrix_32_orphaned(EO, l, g_spinor_field32[0]); + clover_inv_32_orphaned(l, +1, g_mu); + Hopping_Matrix_32_orphaned(OE, g_spinor_field32[1], l); + clover_gamma5_32_orphaned(OO, l, g_spinor_field32[0], g_spinor_field32[1], +(g_mu + g_mu3)); +#ifdef OMP + } /* OpenMP parallel closing brace */ +#endif +} + +void clover_inv_32_orphaned(spinor32 * const l, const int tau3sign, const double mu) { + int icy; + su3_vector32 ALIGN32 psi, chi, phi1, phi3; + int ioff = 0; + const su3_32 *w1, *w2, *w3, *w4; + spinor32 *rn; + + if(tau3sign < 0 && fabs(mu) > 0) { + ioff = VOLUME/2; + } + +#ifndef OMP + icy = ioff; +#endif + /************************ loop over all lattice sites *************************/ +#ifdef OMP +#pragma omp for +#endif + for(int icx = 0; icx < (VOLUME/2); icx++) { +#ifdef OMP + icy 
= ioff + icx; +#endif + + rn = l + icx; + _vector_assign(phi1,(*rn).s0); + _vector_assign(phi3,(*rn).s2); + + w1=&sw_inv_32[icy][0][0]; + w2=w1+2; /* &sw_inv_32[icy][1][0]; */ + w3=w1+4; /* &sw_inv_32[icy][2][0]; */ + w4=w1+6; /* &sw_inv_32[icy][3][0]; */ + _su3_multiply(psi,*w1,phi1); + _su3_multiply(chi,*w2,(*rn).s1); + _vector_add((*rn).s0,psi,chi); + _su3_multiply(psi,*w4,phi1); + _su3_multiply(chi,*w3,(*rn).s1); + _vector_add((*rn).s1,psi,chi); + + w1++; /* &sw_inv_32[icy][0][1]; */ + w2++; /* &sw_inv_32[icy][1][1]; */ + w3++; /* &sw_inv_32[icy][2][1]; */ + w4++; /* &sw_inv_32[icy][3][1]; */ + _su3_multiply(psi,*w1,phi3); + _su3_multiply(chi,*w2,(*rn).s3); + _vector_add((*rn).s2,psi,chi); + _su3_multiply(psi,*w4,phi3); + _su3_multiply(chi,*w3,(*rn).s3); + _vector_add((*rn).s3,psi,chi); + +#ifndef OMP + ++icy; +#endif + + /******************************** end of loop *********************************/ + } +} + +void clover_inv_32(spinor32 * const l, const int tau3sign, const double mu) { +#ifdef OMP +#pragma omp parallel + { +#endif + clover_inv_32_orphaned(l,tau3sign,mu); +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} + +void clover_inv_nd_32_orphaned(const int ieo, spinor32 * const l_c, spinor32 * const l_s) { + int icy; + su3_vector32 ALIGN psi, chi, phi1, phi3; + int ioff = 0; + const su3_32 *w1, *w2, *w3, *w4; + spinor32 *rn_s, *rn_c; + + + if(ieo == 1) ioff = VOLUME/2; + +#ifndef OMP + icy = ioff; +#endif + +#ifdef OMP +#pragma omp for +#endif + for(unsigned int icx = 0; icx < (VOLUME/2); icx++) { +#ifdef OMP + icy = ioff + icx; +#endif + + rn_s = l_s + icx; + rn_c = l_c + icx; + _vector_assign(phi1,(*rn_s).s0); + + w1=&sw_inv_32[icy][0][0]; + w2=w1+2; /* &sw_inv_32[icy][1][0]; */ + w3=w1+4; /* &sw_inv_32[icy][2][0]; */ + w4=w1+6; /* &sw_inv_32[icy][3][0]; */ + _su3_multiply(psi, *w1, phi1); + _su3_multiply(chi, *w2, (*rn_s).s1); + _vector_add((*rn_s).s0, psi,chi); + _su3_multiply(psi, *w4, phi1); + _su3_multiply(chi, *w3, (*rn_s).s1); + 
_vector_add((*rn_s).s1, psi, chi); + + _vector_assign(phi1,(*rn_c).s0); + + _su3_multiply(psi, *w1, phi1); + _su3_multiply(chi, *w2, (*rn_c).s1); + _vector_add((*rn_c).s0, psi,chi); + _su3_multiply(psi, *w4, phi1); + _su3_multiply(chi, *w3, (*rn_c).s1); + _vector_add((*rn_c).s1, psi, chi); + + _vector_assign(phi3,(*rn_s).s2); + + w1++; /* &sw_inv_32[icy][0][1]; */ + w2++; /* &sw_inv_32[icy][1][1]; */ + w3++; /* &sw_inv_32[icy][2][1]; */ + w4++; /* &sw_inv_32[icy][3][1]; */ + _su3_multiply(psi, *w1, phi3); + _su3_multiply(chi, *w2, (*rn_s).s3); + _vector_add((*rn_s).s2, psi, chi); + _su3_multiply(psi, *w4, phi3); + _su3_multiply(chi, *w3, (*rn_s).s3); + _vector_add((*rn_s).s3, psi, chi); + + _vector_assign(phi3,(*rn_c).s2); + + _su3_multiply(psi, *w1, phi3); + _su3_multiply(chi, *w2, (*rn_c).s3); + _vector_add((*rn_c).s2, psi, chi); + _su3_multiply(psi, *w4, phi3); + _su3_multiply(chi, *w3, (*rn_c).s3); + _vector_add((*rn_c).s3, psi, chi); + +#ifndef OMP + ++icy; +#endif + + /******************************** end of loop *********************************/ + } + return; +} + +void clover_inv_nd_32(const int ieo, spinor32 * const l_c, spinor32 * const l_s) { +#ifdef OMP +#pragma omp parallel + { +#endif + clover_inv_nd_32_orphaned(ieo,l_c,l_s); +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} + +void clover_gamma5_32_orphaned(const int ieo, + spinor32 * const l, const spinor32 * const k, const spinor32 * const j, + const double mu) { + + su3_vector32 ALIGN32 chi, psi1, psi2; + int ix; + int ioff,icx; + const su3_32 *w1,*w2,*w3; + spinor32 *r; + const spinor32 *s,*t; + + if(ieo == 0) { + ioff = 0; + } + else { + ioff = (VOLUME+RAND)/2; + } + +/************************ loop over all lattice sites *************************/ +#ifdef OMP +#pragma omp for +#endif + for(icx = ioff; icx < (VOLUME/2+ioff); icx++) { + ix = g_eo2lexic[icx]; + + r = l + icx-ioff; + s = k + icx-ioff; + t = j + icx-ioff; + + w1=&sw_32[ix][0][0]; + w2=w1+2; /*&sw[ix][1][0];*/ + 
w3=w1+4; /*&sw[ix][2][0];*/ + _su3_multiply(psi1,*w1,(*s).s0); + _su3_multiply(chi,*w2,(*s).s1); + _vector_add_assign(psi1,chi); + _su3_inverse_multiply(psi2,*w2,(*s).s0); + _su3_multiply(chi,*w3,(*s).s1); + _vector_add_assign(psi2,chi); + // add in the twisted mass term (plus in the upper components) + _vector_add_i_mul(psi1, (float)mu, (*s).s0); + _vector_add_i_mul(psi2, (float)mu, (*s).s1); + + _vector_sub((*r).s0,psi1,(*t).s0); + _vector_sub((*r).s1,psi2,(*t).s1); + + w1++; /*=&sw[ix][0][1];*/ + w2++; /*=&sw[ix][1][1];*/ + w3++; /*=&sw[ix][2][1];*/ + _su3_multiply(psi1,*w1,(*s).s2); _su3_multiply(chi,*w2,(*s).s3); + _vector_add_assign(psi1,chi); + _su3_inverse_multiply(psi2,*w2,(*s).s2); _su3_multiply(chi,*w3,(*s).s3); + _vector_add_assign(psi2,chi); + // add in the twisted mass term (minus from g5 in the lower components) + _vector_add_i_mul(psi1, -mu, (*s).s2); + _vector_add_i_mul(psi2, -mu, (*s).s3); + + /**************** multiply with gamma5 included ******************************/ + _vector_sub((*r).s2,(*t).s2,psi1); + _vector_sub((*r).s3,(*t).s3,psi2); + /******************************** end of loop *********************************/ + } +} + +void clover_gamma5_32(const int ieo, + spinor32 * const l, const spinor32 * const k, const spinor32 * const j, + const double mu) { +#ifdef OMP +#pragma omp parallel + { +#endif + clover_gamma5_32_orphaned(ieo,l,k,j,mu); +#ifdef OMP + } /* OMP closing brace */ +#endif + return; +} + +void clover_gamma5_nd_32_orphaned(const int ieo, + spinor32 * const l_c, spinor32 * const l_s, + const spinor32 * const k_c, const spinor32 * const k_s, + const spinor32 * const j_c, const spinor32 * const j_s, + const float mubar, const float epsbar) { + su3_vector32 ALIGN chi, psi1, psi2; + int ix; + int ioff; + const su3_32 *w1,*w2,*w3; + spinor32 *r_s, *r_c; + const spinor32 *s_s, *s_c, *t_s, *t_c; + + if(ieo == 0) { + ioff = 0; + } + else { + ioff = (VOLUME+RAND)/2; + } + /************************ loop over all lattice sites 
*************************/ +#ifdef OMP +#pragma omp for +#endif + for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) { + ix = g_eo2lexic[icx]; + + r_s = l_s + icx-ioff; + r_c = l_c + icx-ioff; + s_s = k_s + icx-ioff; + s_c = k_c + icx-ioff; + t_s = j_s + icx-ioff; + t_c = j_c + icx-ioff; + + // upper two spin components first + w1=&sw_32[ix][0][0]; + w2=w1+2; /*&sw_32[ix][1][0];*/ + w3=w1+4; /*&sw_32[ix][2][0];*/ + _su3_multiply(psi1, *w1, (*s_s).s0); + _su3_multiply(chi, *w2, (*s_s).s1); + _vector_add_assign(psi1, chi); + _su3_inverse_multiply(psi2, *w2, (*s_s).s0); + _su3_multiply(chi, *w3, (*s_s).s1); + _vector_add_assign(psi2, chi); + + // add in the twisted mass term (plus in the upper components) + _vector_add_i_mul(psi1, mubar, (*s_s).s0); + _vector_add_i_mul(psi2, mubar, (*s_s).s1); + + _vector_add_mul(psi1, epsbar, (*s_c).s0); + _vector_add_mul(psi2, epsbar, (*s_c).s1); + + _vector_sub((*r_s).s0, psi1, (*t_s).s0); + _vector_sub((*r_s).s1, psi2, (*t_s).s1); + + _su3_multiply(psi1, *w1, (*s_c).s0); + _su3_multiply(chi, *w2, (*s_c).s1); + _vector_add_assign(psi1, chi); + _su3_inverse_multiply(psi2, *w2, (*s_c).s0); + _su3_multiply(chi, *w3, (*s_c).s1); + _vector_add_assign(psi2, chi); + + // add in the twisted mass term (plus in the upper components) + _vector_add_i_mul(psi1, -mubar, (*s_c).s0); + _vector_add_i_mul(psi2, -mubar, (*s_c).s1); + + _vector_add_mul(psi1, epsbar, (*s_s).s0); + _vector_add_mul(psi2, epsbar, (*s_s).s1); + + _vector_sub((*r_c).s0, psi1, (*t_c).s0); + _vector_sub((*r_c).s1, psi2, (*t_c).s1); + + + // now lower to spin components + w1++; /*=&sw_32[ix][0][1];*/ + w2++; /*=&sw_32[ix][1][1];*/ + w3++; /*=&sw_32[ix][2][1];*/ + _su3_multiply(psi1, *w1, (*s_s).s2); + _su3_multiply(chi, *w2, (*s_s).s3); + _vector_add_assign(psi1, chi); + _su3_inverse_multiply(psi2, *w2, (*s_s).s2); + _su3_multiply(chi, *w3, (*s_s).s3); + _vector_add_assign(psi2, chi); + + // add in the twisted mass term (minus from g5 in the lower components) + 
_vector_add_i_mul(psi1, -mubar, (*s_s).s2); + _vector_add_i_mul(psi2, -mubar, (*s_s).s3); + + _vector_add_mul(psi1, epsbar, (*s_c).s2); + _vector_add_mul(psi2, epsbar, (*s_c).s3); + + _vector_sub((*r_s).s2, (*t_s).s2, psi1); + _vector_sub((*r_s).s3, (*t_s).s3, psi2); + + _su3_multiply(psi1, *w1, (*s_c).s2); + _su3_multiply(chi, *w2, (*s_c).s3); + _vector_add_assign(psi1, chi); + _su3_inverse_multiply(psi2, *w2, (*s_c).s2); + _su3_multiply(chi, *w3, (*s_c).s3); + _vector_add_assign(psi2, chi); + + // add in the twisted mass term (minus from g5 in the lower components) + _vector_add_i_mul(psi1, mubar, (*s_c).s2); + _vector_add_i_mul(psi2, mubar, (*s_c).s3); + + _vector_add_mul(psi1, epsbar, (*s_s).s2); + _vector_add_mul(psi2, epsbar, (*s_s).s3); + + _vector_sub((*r_c).s2, (*t_c).s2, psi1); + _vector_sub((*r_c).s3, (*t_c).s3, psi2); + } +} + +void clover_gamma5_nd_32(const int ieo, + spinor32 * const l_c, spinor32 * const l_s, + const spinor32 * const k_c, const spinor32 * const k_s, + const spinor32 * const j_c, const spinor32 * const j_s, + const float mubar, const float epsbar) { +#ifdef OMP +#pragma omp parallel + { +#endif + clover_gamma5_nd_32_orphaned(ieo,l_c,l_s,k_c,k_s,j_c,j_s,mubar,epsbar); +#ifdef OMP + } /* OpenMP parallel closing brace */ +#endif +} + + + +/************************************************************** + * + * assign_mul_one_sw_pm_imu_eps applies + * (1 + T + imug5tau3 + epstau1) to spinor l + * and stores it in k + * + * it is assumed that the clover leaf is computed and stored + * in sw[VOLUME][3][2] + * the corresponding routine can be found in clover_leaf.c + * + **************************************************************/ + +void assign_mul_one_sw_pm_imu_eps_32_orphaned(const int ieo, + spinor32 * const k_s, spinor32 * const k_c, + const spinor32 * const l_s, const spinor32 * const l_c, + const float mu, const float eps) { + su3_vector32 ALIGN chi, psi1, psi2; + int ix; + int ioff; + const su3_32 *w1, *w2, *w3; + spinor32 *r_s, 
*r_c; + const spinor32 *s_s, *s_c; + + if(ieo == 0) { + ioff = 0; + } + else { + ioff = (VOLUME+RAND)/2; + } + /************************ loop over all lattice sites *************************/ +#ifdef OMP +#pragma omp for +#endif + for(unsigned int icx = ioff; icx < (VOLUME/2+ioff); icx++) { + ix = g_eo2lexic[icx]; + + r_s = k_s + icx-ioff; + r_c = k_c + icx-ioff; + s_s = l_s + icx-ioff; + s_c = l_c + icx-ioff; + + // upper two spin components first + w1=&sw_32[ix][0][0]; + w2=w1+2; /*&sw_32[ix][1][0];*/ + w3=w1+4; /*&sw_32[ix][2][0];*/ + _su3_multiply(psi1, *w1, (*s_s).s0); + _su3_multiply(chi, *w2, (*s_s).s1); + _vector_add_assign(psi1, chi); + _su3_inverse_multiply(psi2, *w2, (*s_s).s0); + _su3_multiply(chi, *w3, (*s_s).s1); + _vector_add_assign(psi2, chi); + + // add in the twisted mass term (plus in the upper components) + _vector_add_i_mul(psi1, mu, (*s_s).s0); + _vector_add_i_mul(psi2, mu, (*s_s).s1); + + _vector_add_mul(psi1, eps, (*s_c).s0); + _vector_add_mul(psi2, eps, (*s_c).s1); + + _vector_assign((*r_s).s0, psi1); + _vector_assign((*r_s).s1, psi2); + + _su3_multiply(psi1, *w1, (*s_c).s0); + _su3_multiply(chi, *w2, (*s_c).s1); + _vector_add_assign(psi1, chi); + _su3_inverse_multiply(psi2, *w2, (*s_c).s0); + _su3_multiply(chi, *w3, (*s_c).s1); + _vector_add_assign(psi2, chi); + + // add in the twisted mass term (plus in the upper components) + _vector_add_i_mul(psi1, -mu, (*s_c).s0); + _vector_add_i_mul(psi2, -mu, (*s_c).s1); + + _vector_add_mul(psi1, eps, (*s_s).s0); + _vector_add_mul(psi2, eps, (*s_s).s1); + + _vector_assign((*r_c).s0, psi1); + _vector_assign((*r_c).s1, psi2); + + // now lower two spin components + w1++; /*=&sw_32[ix][0][1];*/ + w2++; /*=&sw_32[ix][1][1];*/ + w3++; /*=&sw_32[ix][2][1];*/ + _su3_multiply(psi1, *w1, (*s_s).s2); + _su3_multiply(chi, *w2, (*s_s).s3); + _vector_add_assign(psi1, chi); + _su3_inverse_multiply(psi2, *w2, (*s_s).s2); + _su3_multiply(chi, *w3, (*s_s).s3); + _vector_add_assign(psi2, chi); + + // add in the twisted 
mass term (minus from g5 in the lower components) + _vector_add_i_mul(psi1, -mu, (*s_s).s2); + _vector_add_i_mul(psi2, -mu, (*s_s).s3); + + _vector_add_mul(psi1, eps, (*s_c).s2); + _vector_add_mul(psi2, eps, (*s_c).s3); + + _vector_assign((*r_s).s2, psi1); + _vector_assign((*r_s).s3, psi2); + + _su3_multiply(psi1, *w1, (*s_c).s2); + _su3_multiply(chi, *w2, (*s_c).s3); + _vector_add_assign(psi1, chi); + _su3_inverse_multiply(psi2, *w2, (*s_c).s2); + _su3_multiply(chi, *w3, (*s_c).s3); + _vector_add_assign(psi2, chi); + + // add in the twisted mass term (minus from g5 in the lower components) + _vector_add_i_mul(psi1, mu, (*s_c).s2); + _vector_add_i_mul(psi2, mu, (*s_c).s3); + + _vector_add_mul(psi1, eps, (*s_s).s2); + _vector_add_mul(psi2, eps, (*s_s).s3); + + _vector_assign((*r_c).s2, psi1); + _vector_assign((*r_c).s3, psi2); + + } +} + +void assign_mul_one_sw_pm_imu_eps_32(const int ieo, + spinor32 * const k_s, spinor32 * const k_c, + const spinor32 * const l_s, const spinor32 * const l_c, + const float mu, const float eps) { + #ifdef OMP + #pragma omp parallel + { + #endif + assign_mul_one_sw_pm_imu_eps_32_orphaned(ieo,k_s,k_c,l_s,l_c,mu,eps); + #ifdef OMP + } /* OpenMP parallel closing brace */ + #endif +} + +#ifdef REDEFSSE +#undef REDEFSSE +#define SSE +#endif + +#ifdef REDEFSSE2 +#undef REDEFSSE2 +#define SSE2 +#endif + +#ifdef REDEFSSE3 +#undef REDEFSSE3 +#define SSE3 +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clovertm_operators_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clovertm_operators_32.h new file mode 100644 index 0000000000000000000000000000000000000000..fcf2b17eb92c05e7aea1f68e1d680582c2e3081c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/clovertm_operators_32.h @@ -0,0 +1,68 @@ +/*********************************************************************** + * + * Copyright (C) 2005 Martin Hasenbusch + * 2009 Carsten Urbach + * 2012 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ + +#ifndef _CLOVERTM_OPERATORS_32_H +#define _CLOVERTM_OPERATORS_32_H + +#include "su3.h" + +extern su3 *** sw; +extern su3 *** sw_inv; +extern su3_32 *** sw_32; +extern su3_32 *** sw_inv_32; +extern su3 ** swm, ** swp; + +void clover_inv_32_orphaned(spinor32 * const l, const int tau3sign, const double mu); +void clover_inv_32(spinor32 * const l, const int tau3sign, const double mu); +void Qsw_pm_psi_32(spinor32 * const l, spinor32 * const k); +void clover_gamma5_32_orphaned(const int ieo, + spinor32 * const l, const spinor32 * const k, const spinor32 * const j, + const double mu); +void clover_gamma5_32(const int ieo, + spinor32 * const l, const spinor32 * const k, const spinor32 * const j, + const double mu); + +void assign_mul_one_sw_pm_imu_eps_32(const int ieo, + spinor32 * const k_s, spinor32 * const k_c, + const spinor32 * const l_s, const spinor32 * const l_c, + const float mu, const float eps); +void assign_mul_one_sw_pm_imu_eps_32_orphaned(const int ieo, + spinor32 * const k_s, spinor32 * const k_c, + const spinor32 * const l_s, const spinor32 * const l_c, + const float mu, const float eps); + +void clover_gamma5_nd_32(const int ieo, + spinor32 * const l_c, spinor32 * const l_s, + const spinor32 * const k_c, const spinor32 * const k_s, + const spinor32 * const j_c, const spinor32 * 
const j_s, + const float mubar, const float epsbar); +void clover_gamma5_nd_32_orphaned(const int ieo, + spinor32 * const l_c, spinor32 * const l_s, + const spinor32 * const k_c, const spinor32 * const k_s, + const spinor32 * const j_c, const spinor32 * const j_s, + const float mubar, const float epsbar); + +void clover_inv_nd_32(const int ieo, spinor32 * const l_c, spinor32 * const l_s); +void clover_inv_nd_32_orphaned(const int ieo, spinor32 * const l_c, spinor32 * const l_s); + +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_bg_dbl.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_bg_dbl.c new file mode 100644 index 0000000000000000000000000000000000000000..c052eeeee31bba15b7472a5658fdada2886366a8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_bg_dbl.c @@ -0,0 +1,290 @@ +/********************************************************************** + * + * + * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Carsten Urbach + * + * BG and halfspinor versions (C) 2007, 2008 Carsten Urbach + * + * This file is based on an implementation of the Dirac operator + * written by Martin Luescher, modified by Martin Hasenbusch in 2002 + * and modified and extended by Carsten Urbach from 2003-2008 + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + **********************************************************************/ + +void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){ /* Two-pass half-spinor kernel. Pass 1 steps s through the source field k and runs the _hop_ pre macros; pass 2 steps s through the solution field l and runs the _hop_ post macros. With ieo == 0 pass 1 starts U at g_gauge_field_copy[0][0] and pass 2 at g_gauge_field_copy[1][0]; any other ieo swaps the two. The _hop_ macros are not defined in this file; presumably they project the spinor, apply the gauge link and move data through phi or phi32 - TODO confirm. */ + int i, ix; + su3 * restrict U ALIGN; + spinor * restrict s ALIGN; + halfspinor * restrict * phi ALIGN; + halfspinor32 * restrict * phi32 ALIGN; + _declare_hregs(); +#ifdef _KOJAK_INST +#pragma pomp inst begin(hoppingmatrix) +#endif +#pragma disjoint(*s, *U) + +#ifdef _GAUGE_COPY + if(g_update_gauge_copy) { + update_backward_gauge(g_gauge_field); + } +#endif + + __alignx(16, l); + __alignx(16, k); + if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) { /* both flags set: 32-bit buffer path (phi32, NBPointer32, the pre32 and post32 macro families) */ + /* Take the 64 Bit precision part and replace */ + /* _bgl_load_reg0|1 with _bgl_load_reg0|1_32 */ + /* _bgl_load_rs0|1|2|3 with _bgl_load_rs0|1|2|3_32*/ + /* phi with phi32*/ + /* _bgl_store_reg0|1 with _bgl_store_reg0|1_32 */ + /* _bgl_store_reg0|1_up with _bgl_store_reg0|1_up_32 */ + /* HalfSpinor with Halfspinor32 */ + /* _bgl_load_rs0|1 with _bgl_load_rs0|1_32*/ + /* xchange_halffield with xchange_halffield_32 */ + __alignx(16, HalfSpinor32); + /* We will run through the source vector now */ + /* instead of the solution vector */ + s = k; + _prefetch_spinor(s); + + /* s contains the source vector */ + + if(ieo == 0) { + U = g_gauge_field_copy[0][0]; + } + else { + U = g_gauge_field_copy[1][0]; + } + phi32 = NBPointer32[ieo]; + + _prefetch_su3(U); + /**************** loop over all lattice sites ******************/ + ix=0; + for(i = 0; i < (VOLUME)/2; i++){ /* each site advances ix eight times, U four times and s once */ + + /*********************** direction +0 ************************/ + _hop_t_p_pre32(); + s++; + U++; + ix++; + + /*********************** direction -0 ************************/ + _hop_t_m_pre32(); + ix++; + + /*********************** direction +1 ************************/ + _hop_x_p_pre32(); + ix++; + U++; + + /*********************** direction -1 ************************/ + _hop_x_m_pre32(); + ix++; + + + /*********************** direction +2 ************************/ + _hop_y_p_pre32(); + ix++; + 
U++; + + /*********************** direction -2 ************************/ + _hop_y_m_pre32(); + ix++; + + /*********************** direction +3 ************************/ + _hop_z_p_pre32(); + _prefetch_su3(U+1); + ix++; + U++; + + /*********************** direction -3 ************************/ + _hop_z_m_pre32(); + ix++; + + /************************ end of loop ************************/ + } + +# if (defined MPI && !defined _NO_COMM) + xchange_halffield32(); /* compiled only when MPI is defined and _NO_COMM is not */ +# endif + s = l; /* pass 2: s now walks the solution field */ + phi32 = NBPointer32[2 + ieo]; + if(ieo == 0) { + U = g_gauge_field_copy[1][0]; + } + else { + U = g_gauge_field_copy[0][0]; + } + _prefetch_halfspinor(phi32[0]); + _prefetch_su3(U); + + /* Now we sum up and expand to a full spinor */ + ix = 0; + /* _prefetch_spinor_for_store(s); */ + for(i = 0; i < (VOLUME)/2; i++){ + /* This causes a lot of trouble, do we understand this? */ + /* _prefetch_spinor_for_store(s); */ + _prefetch_halfspinor(phi32[ix+1]); + /*********************** direction +0 ************************/ + _hop_t_p_post32(); + ix++; + /*********************** direction -0 ************************/ + _hop_t_m_post32(); + U++; + ix++; + /*********************** direction +1 ************************/ + _hop_x_p_post32(); + ix++; + /*********************** direction -1 ************************/ + _hop_x_m_post32(); + U++; + ix++; + /*********************** direction +2 ************************/ + _hop_y_p_post32(); + ix++; + /*********************** direction -2 ************************/ + _hop_y_m_post32(); + U++; + ix++; + /*********************** direction +3 ************************/ + _hop_z_p_post32(); + ix++; + /*********************** direction -3 ************************/ + _hop_z_m_post32(); + U++; + ix++; + s++; + } + } + else { /* otherwise: 64-bit buffer path (phi, NBPointer, the pre and post macro families without the 32 suffix) */ + __alignx(16, HalfSpinor); + /* We will run through the source vector now */ + /* instead of the solution vector */ + s = k; + _prefetch_spinor(s); + + /* s contains the source vector */ + + if(ieo == 0) { + U = g_gauge_field_copy[0][0]; + } + else 
{ + U = g_gauge_field_copy[1][0]; + } + phi = NBPointer[ieo]; + + _prefetch_su3(U); + /**************** loop over all lattice sites ******************/ + ix=0; + for(i = 0; i < (VOLUME)/2; i++){ + /*********************** direction +0 ************************/ + _hop_t_p_pre(); + s++; + U++; + ix++; + + /*********************** direction -0 ************************/ + _hop_t_m_pre(); + ix++; + + /*********************** direction +1 ************************/ + _hop_x_p_pre(); + ix++; + U++; + + /*********************** direction -1 ************************/ + _hop_x_m_pre(); + ix++; + + + /*********************** direction +2 ************************/ + _hop_y_p_pre(); + ix++; + U++; + + /*********************** direction -2 ************************/ + _hop_y_m_pre(); + ix++; + + /*********************** direction +3 ************************/ + _hop_z_p_pre(); + ix++; + U++; + + /*********************** direction -3 ************************/ + _hop_z_m_pre(); + ix++; + + /************************ end of loop ************************/ + } + +# if (defined MPI && !defined _NO_COMM) + xchange_halffield(); /* compiled only when MPI is defined and _NO_COMM is not */ +# endif + s = l; /* pass 2: s now walks the solution field */ + phi = NBPointer[2 + ieo]; + _prefetch_halfspinor(phi[0]); + if(ieo == 0) { + U = g_gauge_field_copy[1][0]; + } + else { + U = g_gauge_field_copy[0][0]; + } + _prefetch_su3(U); + + /* Now we sum up and expand to a full spinor */ + ix = 0; + /* _prefetch_spinor_for_store(s); */ + for(i = 0; i < (VOLUME)/2; i++){ + /* This causes a lot of trouble, do we understand this? 
*/ /* NOTE(review): in the 32-bit branch the remark above is followed by a disabled _prefetch_spinor_for_store(s) and a live _prefetch_halfspinor call; in this branch nothing follows it. */ + /*********************** direction +0 ************************/ + _hop_t_p_post(); + ix++; + /*********************** direction -0 ************************/ + _hop_t_m_post(); + U++; + ix++; + /*********************** direction +1 ************************/ + _hop_x_p_post(); + ix++; + /*********************** direction -1 ************************/ + _hop_x_m_post(); + U++; + ix++; + /*********************** direction +2 ************************/ + _hop_y_p_post(); + ix++; + /*********************** direction -2 ************************/ + _hop_y_m_post(); + U++; + ix++; + /*********************** direction +3 ************************/ + _hop_z_p_post(); + ix++; + /*********************** direction -3 ************************/ + _hop_z_m_post(); + U++; + ix++; + s++; + } + } +#ifdef _KOJAK_INST +#pragma pomp inst end(hoppingmatrix) +#endif +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_bgq_dbl.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_bgq_dbl.c new file mode 100644 index 0000000000000000000000000000000000000000..0eb5516ef6102e3e6eded7a32edb1eb0561ac598 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_bgq_dbl.c @@ -0,0 +1,284 @@ +/********************************************************************** + * + * + * Copyright (C) 2012 Carsten Urbach + * + * BG and halfspinor versions (C) 2007, 2008 Carsten Urbach + * + * This file is based on an implementation of the Dirac operator + * written by Martin Luescher, modified by Martin Hasenbusch in 2002 + * and modified and extended by Carsten Urbach from 2003-2008 + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version.
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + **********************************************************************/ + + +void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){ /* Two-pass half-spinor kernel. Pass 1 steps s through the source field k and runs the _hop_ pre macros; pass 2 steps s through the solution field l and runs the _hop_ post macros. With ieo == 0 pass 1 starts U at g_gauge_field_copy[0][0] and pass 2 at g_gauge_field_copy[1][0]; any other ieo swaps the two. Every _prefetch_halfspinor call in this variant is commented out. The _hop_ macros are not defined in this file; presumably they project the spinor, apply the gauge link and move data through phi or phi32 - TODO confirm. */ + int ix; + su3 * restrict ALIGN U; + spinor * restrict ALIGN s; + halfspinor * restrict * phi ALIGN; + halfspinor32 * restrict * phi32 ALIGN; + /* We have 32 registers available */ + _declare_hregs(); + +#ifdef _KOJAK_INST +#pragma pomp inst begin(hoppingmatrix) +#endif +#pragma disjoint(*s, *U) + +#ifdef _GAUGE_COPY + if(g_update_gauge_copy) { + update_backward_gauge(g_gauge_field); + } +#endif + + __alignx(16, l); + __alignx(16, k); + if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) { /* both flags set: 32-bit buffer path (phi32, NBPointer32, the pre32 and post32 macro families) */ + __alignx(16, HalfSpinor32); + /* We will run through the source vector now */ + /* instead of the solution vector */ + s = k; + _prefetch_spinor(s); + + /* s contains the source vector */ + + if(ieo == 0) { + U = g_gauge_field_copy[0][0]; + } + else { + U = g_gauge_field_copy[1][0]; + } + phi32 = NBPointer32[ieo]; + + _prefetch_su3(U); + /**************** loop over all lattice sites ******************/ + ix=0; + for(int i = 0; i < (VOLUME)/2; i++){ /* each site advances ix eight times, U four times and s once */ + /*********************** direction +0 ************************/ + _hop_t_p_pre32(); + s++; + U++; + ix++; + + /*********************** direction -0 ************************/ + _hop_t_m_pre32(); + ix++; + + /*********************** direction +1 ************************/ + _hop_x_p_pre32(); + ix++; + U++; + + /*********************** direction -1 ************************/ + _hop_x_m_pre32(); + ix++; + + /*********************** direction +2 ************************/ + _hop_y_p_pre32(); + + ix++; + U++; + + 
/*********************** direction -2 ************************/ + _hop_y_m_pre32(); + ix++; + + /*********************** direction +3 ************************/ + _hop_z_p_pre32(); + ix++; + U++; + + /*********************** direction -3 ************************/ + _hop_z_m_pre32(); + ix++; + + /************************ end of loop ************************/ + } + +# if (defined MPI && !defined _NO_COMM) + xchange_halffield32(); /* compiled only when MPI is defined and _NO_COMM is not */ +# endif + s = l; /* pass 2: s now walks the solution field */ + phi32 = NBPointer32[2 + ieo]; + if(ieo == 0) { + U = g_gauge_field_copy[1][0]; + } + else { + U = g_gauge_field_copy[0][0]; + } + //_prefetch_halfspinor(phi32[0]); + _prefetch_su3(U); + + /* Now we sum up and expand to a full spinor */ + ix = 0; + /* _prefetch_spinor_for_store(s); */ + for(int i = 0; i < (VOLUME)/2; i++){ + /* This causes a lot of trouble, do we understand this? */ + /* _prefetch_spinor_for_store(s); */ + //_prefetch_halfspinor(phi32[ix+1]); + /*********************** direction +0 ************************/ + _hop_t_p_post32(); + ix++; + /*********************** direction -0 ************************/ + _hop_t_m_post32(); + U++; + ix++; + /*********************** direction +1 ************************/ + _hop_x_p_post32(); + ix++; + /*********************** direction -1 ************************/ + _hop_x_m_post32(); + U++; + ix++; + /*********************** direction +2 ************************/ + _hop_y_p_post32(); + ix++; + /*********************** direction -2 ************************/ + _hop_y_m_post32(); + U++; + ix++; + /*********************** direction +3 ************************/ + _hop_z_p_post32(); + ix++; + /*********************** direction -3 ************************/ + _hop_z_m_post32(); + U++; + ix++; + s++; + } + } + else { /* otherwise: 64-bit buffer path (phi, NBPointer, the pre and post macro families without the 32 suffix) */ + __alignx(16, HalfSpinor); + /* We will run through the source vector now */ + /* instead of the solution vector */ + s = k; + _prefetch_spinor(s); + + /* s contains the source vector */ + + if(ieo == 0) { + U = g_gauge_field_copy[0][0]; + } + else { + U = 
g_gauge_field_copy[1][0]; + } + phi = NBPointer[ieo]; + + _prefetch_su3(U); + /**************** loop over all lattice sites ******************/ + ix=0; + for(int i = 0; i < (VOLUME)/2; i++){ + /*********************** direction +0 ************************/ + _hop_t_p_pre(); + s++; + U++; + ix++; + + /*********************** direction -0 ************************/ + _hop_t_m_pre(); + ix++; + + /*********************** direction +1 ************************/ + _hop_x_p_pre(); + ix++; + U++; + + /*********************** direction -1 ************************/ + _hop_x_m_pre(); + ix++; + + + /*********************** direction +2 ************************/ + _hop_y_p_pre(); + ix++; + U++; + + /*********************** direction -2 ************************/ + _hop_y_m_pre(); + ix++; + + /*********************** direction +3 ************************/ + _hop_z_p_pre(); + ix++; + U++; + + /*********************** direction -3 ************************/ + _hop_z_m_pre(); + ix++; + + /************************ end of loop ************************/ + + } + +# if (defined MPI && !defined _NO_COMM) + xchange_halffield(); /* compiled only when MPI is defined and _NO_COMM is not */ +# endif + s = l; /* pass 2: s now walks the solution field */ + phi = NBPointer[2 + ieo]; + //_prefetch_halfspinor(phi[0]); + if(ieo == 0) { + U = g_gauge_field_copy[1][0]; + } + else { + U = g_gauge_field_copy[0][0]; + } + _prefetch_su3(U); + + /* Now we sum up and expand to a full spinor */ + ix = 0; + /* _prefetch_spinor_for_store(s); */ + for(int i = 0; i < (VOLUME)/2; i++){ + /* This causes a lot of trouble, do we understand this? 
*/ + /* _prefetch_spinor_for_store(s); */ + //_prefetch_halfspinor(phi[ix+1]); + /*********************** direction +0 ************************/ + _hop_t_p_post(); + ix++; + /*********************** direction -0 ************************/ + _hop_t_m_post(); + U++; + ix++; + /*********************** direction +1 ************************/ + _hop_x_p_post(); + ix++; + /*********************** direction -1 ************************/ + _hop_x_m_post(); + U++; + ix++; + /*********************** direction +2 ************************/ + _hop_y_p_post(); + ix++; + /*********************** direction -2 ************************/ + _hop_y_m_post(); + U++; + ix++; + /*********************** direction +3 ************************/ + _hop_z_p_post(); + ix++; + /*********************** direction -3 ************************/ + _hop_z_m_post(); + U++; + ix++; + s++; + } + } +#ifdef _KOJAK_INST +#pragma pomp inst end(hoppingmatrix) +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_body.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_body.c new file mode 100644 index 0000000000000000000000000000000000000000..69e8d55f535a1d5c99ac29908da0c6409ac0b190 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_body.c @@ -0,0 +1,402 @@ +/********************************************************************** + * + * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2012 Carsten Urbach + * + * This file is based on an implementation of the Dirac operator + * written by Martin Luescher, modified by Martin Hasenbusch in 2002 + * this is a new version based on the aforementioned implementations + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version.
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + **********************************************************************/ + + +int ix; /* Fragment without a function header: ieo, k and l, and depending on the build flags u0, p and pn, are used below but not declared here, so they must be in scope wherever this text is compiled. */ +su3 * restrict U ALIGN; +spinor * restrict s ALIGN; +halfspinor * restrict * phi ALIGN; +halfspinor32 * restrict * phi32 ALIGN; +_declare_hregs(); + +#ifdef XLC +# pragma disjoint(*l, *k) +# pragma disjoint(*k, *U) +# pragma disjoint(*l, *U) +# pragma disjoint(*U, *s) +# pragma disjoint(*k, *s) +# pragma disjoint(*l, *s) +__alignx(32, l); +__alignx(32, k); +__alignx(32, U); +__alignx(32, s); +#endif + + +#ifdef _KOJAK_INST +#pragma pomp inst begin(hoppingmatrix) +#endif + +#ifndef OMP +s = k; +_prefetch_spinor(s); +if(ieo == 0) { + U = g_gauge_field_copy[0][0]; + } + else { + U = g_gauge_field_copy[1][0]; + } +_prefetch_su3(U); +#else +if(ieo == 0) { + u0 = g_gauge_field_copy[0][0]; + } + else { + u0 = g_gauge_field_copy[1][0]; + } +#endif +#if (defined SSE2 || defined SSE3) +g_sloppy_precision = 0; /* SSE2/SSE3 builds clear the global flag here, so the test on the next code line fails and the phi32 branch is skipped */ +#endif +if(g_sloppy_precision == 1 && g_sloppy_precision_flag == 1) { + phi32 = NBPointer32[ieo]; + +#ifdef OMP +#pragma omp for +#else + ix=0; +#endif + for(unsigned int i = 0; i < (VOLUME)/2; i++){ /* with OMP the cursors U, s and ix are recomputed from i at the top of every iteration; without OMP they carry over and s and ix get their last increment at the bottom */ +#ifdef OMP + U=u0+i*4; + s=k+i; + ix=i*8; +#endif + _hop_t_p_pre32(); + U++; + ix++; + + _hop_t_m_pre32(); + ix++; + + _hop_x_p_pre32(); + U++; + ix++; + + _hop_x_m_pre32(); + ix++; + + _hop_y_p_pre32(); + U++; + ix++; + + _hop_y_m_pre32(); + ix++; + + _hop_z_p_pre32(); + U++; + ix++; + + _hop_z_m_pre32(); + +#ifndef OMP + s++; + ix++; +#endif + } + +#ifdef OMP +#pragma omp single + { /* OMP builds: one thread executes the exchange block that follows */ +#endif + +# if (defined MPI && !defined _NO_COMM) +# ifdef SPI + + // Initialize the barrier, resetting the hardware. 
+ int rc = MUSPI_GIBarrierInit ( &GIBarrier, 0 /*comm world class route */); + if(rc) { + printf("MUSPI_GIBarrierInit returned rc = %d\n", rc); + exit(__LINE__); + } + // reset the recv counter + recvCounter = totalMessageSize/2; /* half of the value assigned in the double-precision branch */ + global_barrier(); // make sure everybody is set recv counter + + //#pragma omp for nowait + for (unsigned int j = 0; j < spi_num_dirs; j++) { + descCount[ j ] = + msg_InjFifoInject ( injFifoHandle, + j, + &SPIDescriptors32[j]); + } + // wait for receive completion + while ( recvCounter > 0 ); /* spins until recvCounter is no longer positive; NOTE(review): nothing in this fragment decrements it, so it must change asynchronously - confirm it is declared volatile */ + _bgq_msync(); +# else + xchange_halffield32(); +# endif +# endif + +#ifdef OMP + } +#endif + +#ifndef OMP + s = l; + if(ieo == 0) { + U = g_gauge_field_copy[1][0]; + } + else { + U = g_gauge_field_copy[0][0]; + } +#else + if(ieo == 0) { + u0 = g_gauge_field_copy[1][0]; + } + else { + u0 = g_gauge_field_copy[0][0]; + } +#endif + + phi32 = NBPointer32[2 + ieo]; + +#ifdef OMP +#pragma omp for +#else + ix = 0; +#endif + for(unsigned int i = 0; i < (VOLUME)/2; i++){ +#ifdef OMP + ix=i*8; + s=l+i; + U=u0+i*4; +#endif +#ifdef _TM_SUB_HOP + pn=p+i; +#endif + _hop_t_p_post32(); + ix++; + + _hop_t_m_post32(); + ix++; + U++; + + _hop_x_p_post32(); + ix++; + + _hop_x_m_post32(); + U++; + ix++; + + _hop_y_p_post32(); + ix++; + + _hop_y_m_post32(); + U++; + ix++; + + _hop_z_p_post32(); + ix++; + + _hop_z_m_post32(); + +#ifdef _MUL_G5_CMPLX /* exactly one of the three store macros is compiled in; NOTE(review): this phi32 branch calls the same store macros as the double branch further down, while halfspinor_body_32.c calls variants whose names end in 32 - confirm that is intended */ + _hop_mul_g5_cmplx_and_store(s); +#elif defined _TM_SUB_HOP + _g5_cmplx_sub_hop_and_g5store(s); +#else + _hop_store_post(s); +#endif + +#ifndef OMP + U++; + ix++; + s++; +#endif + } + } + else { /* double-precision branch: same structure with phi, NBPointer and the macros without the 32 suffix */ + phi = NBPointer[ieo]; + +#ifdef OMP +#pragma omp for +#else + ix=0; +#endif + for(unsigned int i = 0; i < (VOLUME)/2; i++){ +#ifdef OMP + s=k+i; + _prefetch_spinor(s); + ix=i*8; + U=u0+i*4; + _prefetch_su3(U); +#endif + + _hop_t_p_pre(); + U++; + ix++; + + _hop_t_m_pre(); + ix++; + + _hop_x_p_pre(); + U++; + ix++; + + _hop_x_m_pre(); + ix++; + + _hop_y_p_pre(); + U++; + ix++; + + _hop_y_m_pre(); + ix++; + + _hop_z_p_pre(); + U++; + 
ix++; + + _hop_z_m_pre(); + +#ifndef OMP + s++; + ix++; +#endif + } + +#ifdef OMP +#pragma omp single + { /* OMP builds: one thread executes the exchange block that follows */ +#endif + +# if (defined MPI && !defined _NO_COMM) +# ifdef SPI + + // Initialize the barrier, resetting the hardware. + int rc = MUSPI_GIBarrierInit ( &GIBarrier, 0 /*comm world class route */); + if(rc) { + printf("MUSPI_GIBarrierInit returned rc = %d\n", rc); + exit(__LINE__); + } + // reset the recv counter + recvCounter = totalMessageSize; /* full size here; the phi32 branch assigns half of it */ + global_barrier(); // make sure everybody is set recv counter + + //#pragma omp for nowait + for (unsigned int j = 0; j < spi_num_dirs; j++) { + descCount[ j ] = + msg_InjFifoInject ( injFifoHandle, + j, + &SPIDescriptors[j]); + } + // wait for receive completion + while ( recvCounter > 0 ); /* spins until recvCounter is no longer positive; NOTE(review): nothing in this fragment decrements it, so it must change asynchronously - confirm it is declared volatile */ + _bgq_msync(); + +# else // SPI + xchange_halffield(); +# endif // SPI +# endif + +#ifdef OMP + } +#endif + +#ifndef OMP + s = l; + if(ieo == 0) { + U = g_gauge_field_copy[1][0]; + } + else { + U = g_gauge_field_copy[0][0]; + } + _prefetch_su3(U); +#else + if(ieo == 0) { + u0 = g_gauge_field_copy[1][0]; + } + else { + u0 = g_gauge_field_copy[0][0]; + } +#endif + + phi = NBPointer[2 + ieo]; + +#ifdef OMP +#pragma omp for +#else + ix = 0; +#endif + /* #pragma ivdep */ + for(unsigned int i = 0; i < (VOLUME)/2; i++){ +#ifdef OMP + ix=i*8; + U=u0+i*4; + _prefetch_su3(U); + s=l+i; + _prefetch_spinor(s); +#endif +#ifdef _TM_SUB_HOP + pn=p+i; +#endif + _hop_t_p_post(); + ix++; + + _hop_t_m_post(); + ix++; + U++; + + _hop_x_p_post(); + ix++; + + _hop_x_m_post(); + U++; + ix++; + + _hop_y_p_post(); + ix++; + + _hop_y_m_post(); + U++; + ix++; + + _hop_z_p_post(); + ix++; + + _hop_z_m_post(); + +#ifdef _MUL_G5_CMPLX /* exactly one of the three store macros is compiled in, chosen by _MUL_G5_CMPLX and _TM_SUB_HOP */ + _hop_mul_g5_cmplx_and_store(s); +#elif defined _TM_SUB_HOP + _g5_cmplx_sub_hop_and_g5store(s); +#else + _hop_store_post(s); +#endif + +#ifndef OMP + U++; + ix++; + s++; +#endif + } + } +#ifdef _KOJAK_INST +#pragma pomp inst end(hoppingmatrix) +#endif + + + diff --git
a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_body_32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_body_32.c new file mode 100644 index 0000000000000000000000000000000000000000..c9e8b305822329a6592b60f5447b61dc8d21cbdc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_body_32.c @@ -0,0 +1,228 @@ +/********************************************************************** + * single precision version Copyright (C) 2013 Florian Burger + * based on halfspinor_body.c by Carsten Urbach + * + * This file is based on an implementation of the Dirac operator + * written by Martin Luescher, modified by Martin Hasenbusch in 2002 + * this is a new version based on the aforementioned implementations + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see http://www.gnu.org/licenses/.
+ * + **********************************************************************/ + + +int ix; /* Fragment without a function header: ieo, k, l and ka0 to ka3, and depending on the build flags u0, p and pn, are used below but not declared here, so they must be in scope wherever this text is compiled. */ +su3_32 * restrict U ALIGN32; +spinor32 * restrict s ALIGN32; +halfspinor32 * restrict * phi2 ALIGN32; +_declare_hregs(); + +#ifdef XLC +# pragma disjoint(*l, *k) +# pragma disjoint(*k, *U) +# pragma disjoint(*l, *U) +# pragma disjoint(*U, *s) +# pragma disjoint(*k, *s) +# pragma disjoint(*l, *s) +__alignx(16, l); +__alignx(16, k); +__alignx(16, U); +__alignx(16, s); +#endif + +//convert kappas to float locally +_Complex float ALIGN32 ka0_32 = (_Complex float) ka0; +_Complex float ALIGN32 ka1_32 = (_Complex float) ka1; +_Complex float ALIGN32 ka2_32 = (_Complex float) ka2; +_Complex float ALIGN32 ka3_32 = (_Complex float) ka3; + +#ifndef OMP +s = k; +_prefetch_spinor_32(s); +if(ieo == 0) { + U = g_gauge_field_copy_32[0][0]; + } + else { + U = g_gauge_field_copy_32[1][0]; + } +_prefetch_su3_32(U); +#else +if(ieo == 0) { + u0 = g_gauge_field_copy_32[0][0]; + } + else { + u0 = g_gauge_field_copy_32[1][0]; + } +#endif + + phi2 = NBPointer32[ieo]; /* NOTE(review): the buffer pointer is called phi2 here, whereas halfspinor_body.c names its NBPointer32 pointer phi32; confirm the pre32 and post32 macros used with this fragment expect phi2. */ + +#ifdef OMP +#pragma omp for +#else + ix=0; +#endif + for(unsigned int i = 0; i < (VOLUME)/2; i++){ /* with OMP the cursors U, s and ix are recomputed from i at the top of every iteration; without OMP they carry over and s and ix get their last increment at the bottom */ +#ifdef OMP + U=u0+i*4; + s=k+i; + ix=i*8; +#endif + _hop_t_p_pre32(); + U++; + ix++; + + _hop_t_m_pre32(); + ix++; + + _hop_x_p_pre32(); + U++; + ix++; + + _hop_x_m_pre32(); + ix++; + + _hop_y_p_pre32(); + U++; + ix++; + + _hop_y_m_pre32(); + ix++; + + _hop_z_p_pre32(); + U++; + ix++; + + _hop_z_m_pre32(); + +#ifndef OMP + s++; + ix++; +#endif + } + +#ifdef OMP +#pragma omp single + { /* OMP builds: one thread executes the exchange block that follows */ +#endif + +# if (defined MPI && !defined _NO_COMM) +# ifdef SPI + + // Initialize the barrier, resetting the hardware. 
+ int rc = MUSPI_GIBarrierInit ( &GIBarrier, 0 /*comm world class route*/ ); + if(rc) { + printf("MUSPI_GIBarrierInit returned rc = %d\n", rc); + exit(__LINE__); + } + // reset the recv counter + recvCounter = totalMessageSize/2; + global_barrier(); // make sure everybody is set recv counter + + //#pragma omp for nowait + for (unsigned int j = 0; j < spi_num_dirs; j++) { + descCount[ j ] = + msg_InjFifoInject ( injFifoHandle, + j, + &SPIDescriptors32[j]); + } + // wait for receive completion + while ( recvCounter > 0 ); /* spins until recvCounter is no longer positive; NOTE(review): nothing in this fragment decrements it, so it must change asynchronously - confirm it is declared volatile */ + _bgq_msync(); +# else + xchange_halffield32(); +# endif +# endif + +#ifdef OMP + } +#endif + +#ifndef OMP + s = l; + if(ieo == 0) { + U = g_gauge_field_copy_32[1][0]; + } + else { + U = g_gauge_field_copy_32[0][0]; + } +#else + if(ieo == 0) { + u0 = g_gauge_field_copy_32[1][0]; + } + else { + u0 = g_gauge_field_copy_32[0][0]; + } +#endif + + phi2 = NBPointer32[2 + ieo]; + +#ifdef OMP +#pragma omp for +#else + ix = 0; +#endif + for(unsigned int i = 0; i < (VOLUME)/2; i++){ +#ifdef OMP + ix=i*8; + s=l+i; + U=u0+i*4; +#endif +#ifdef _TM_SUB_HOP + pn=p+i; +#endif + _hop_t_p_post32(); + ix++; + + _hop_t_m_post32(); + ix++; + U++; + + _hop_x_p_post32(); + ix++; + + _hop_x_m_post32(); + U++; + ix++; + + _hop_y_p_post32(); + ix++; + + _hop_y_m_post32(); + U++; + ix++; + + _hop_z_p_post32(); + ix++; + + _hop_z_m_post32(); + +#ifdef _MUL_G5_CMPLX /* exactly one of the three store macros is compiled in, chosen by _MUL_G5_CMPLX and _TM_SUB_HOP */ + _hop_mul_g5_cmplx_and_store32(s); +#elif defined _TM_SUB_HOP + _g5_cmplx_sub_hop_and_g5store32(s); +#else + _hop_store_post32(s); +#endif + +#ifndef OMP + U++; + ix++; + s++; +#endif + } + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_hopping.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_hopping.h new file mode 100644 index 0000000000000000000000000000000000000000..bc7f29222c810ab68fb6315426a3579a4e9e5006 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_hopping.h @@ -0,0 +1,1425 @@
+/********************************************************************** + * + * Copyright (C) 2012 Carsten Urbach + * + * BG and halfspinor versions (C) 2007, 2008 Carsten Urbach + * + * This file is based on an implementation of the Dirac operator + * written by Martin Luescher, modified by Martin Hasenbusch in 2002 + * and modified and extended by Carsten Urbach from 2003-2008 + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + **********************************************************************/ + +#ifndef _HALFSPINOR_HOPPING_H +#define _HALFSPINOR_HOPPING_H + +#if (defined SSE2 || defined SSE3) + +#define _hop_t_p_pre32() +#define _hop_t_m_pre32() +#define _hop_x_p_pre32() +#define _hop_x_m_pre32() +#define _hop_y_p_pre32() +#define _hop_y_m_pre32() +#define _hop_z_p_pre32() +#define _hop_z_m_pre32() +#define _hop_t_p_post32() +#define _hop_t_m_post32() +#define _hop_x_p_post32() +#define _hop_x_m_post32() +#define _hop_y_p_post32() +#define _hop_y_m_post32() +#define _hop_z_p_post32() +#define _hop_z_m_post32() + +#define _hop_t_p_pre() \ + _prefetch_su3(U+predist); \ + _sse_load(s->s0); \ + _sse_load_up(s->s2); \ + _sse_vector_add(); \ + _sse_su3_multiply((*U)); \ + _sse_vector_cmplx_mul(ka0); \ + _sse_store_nt_up(phi[ix]->s0); \ + _sse_load(s->s1); \ + _sse_load_up(s->s3); \ + _sse_vector_add(); \ + _sse_su3_multiply((*U)); \ + _sse_vector_cmplx_mul(ka0); \ + _sse_store_nt_up(phi[ix]->s1); + +#define _hop_t_m_pre() \ + _sse_load(s->s0); \ + _sse_load_up(s->s2); \ + _sse_vector_sub(); \ + _sse_store_nt(phi[ix]->s0); \ + _sse_load(s->s1); \ + _sse_load_up(s->s3); \ + _sse_vector_sub(); \ + _sse_store_nt(phi[ix]->s1); + +#define _hop_x_p_pre() \ + _prefetch_su3(U+predist); \ + _sse_load(s->s0); \ + _sse_load_up(s->s3); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_su3_multiply((*U)); \ + _sse_vector_cmplx_mul(ka1); \ + _sse_store_nt_up(phi[ix]->s0); \ + _sse_load(s->s1); \ + _sse_load_up(s->s2); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_su3_multiply((*U)); \ + _sse_vector_cmplx_mul(ka1); \ + _sse_store_nt_up(phi[ix]->s1); + +#define _hop_x_m_pre() \ + _sse_load(s->s0); \ + _sse_load_up(s->s3); \ + _sse_vector_i_mul(); \ + _sse_vector_sub(); \ + _sse_store_nt(phi[ix]->s0); \ + _sse_load(s->s1); \ + _sse_load_up(s->s2); \ + _sse_vector_i_mul(); \ + _sse_vector_sub(); \ + _sse_store_nt(phi[ix]->s1); + +#define _hop_y_p_pre() \ + _prefetch_su3(U+predist); \ 
+ _sse_load(s->s0); \ + _sse_load_up(s->s3); \ + _sse_vector_add(); \ + _sse_su3_multiply((*U)); \ + _sse_vector_cmplx_mul(ka2); \ + _sse_store_nt_up(phi[ix]->s0); \ + _sse_load(s->s1); \ + _sse_load_up(s->s2); \ + _sse_vector_sub(); \ + _sse_su3_multiply((*U)); \ + _sse_vector_cmplx_mul(ka2); \ + _sse_store_nt_up(phi[ix]->s1); + +#define _hop_y_m_pre() \ + _sse_load(s->s0); \ + _sse_load_up(s->s3); \ + _sse_vector_sub(); \ + _sse_store_nt(phi[ix]->s0); \ + _sse_load(s->s1); \ + _sse_load_up(s->s2); \ + _sse_vector_add(); \ + _sse_store_nt(phi[ix]->s1); + +#define _hop_z_p_pre() \ + _prefetch_su3(U+predist); \ + _prefetch_spinor(s+1); \ + _sse_load(s->s0); \ + _sse_load_up(s->s2); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_su3_multiply((*U)); \ + _sse_vector_cmplx_mul(ka3); \ + _sse_store_nt_up(phi[ix]->s0); \ + _sse_load(s->s1); \ + _sse_load_up(s->s3); \ + _sse_vector_i_mul(); \ + _sse_vector_sub(); \ + _sse_su3_multiply((*U)); \ + _sse_vector_cmplx_mul(ka3); \ + _sse_store_nt_up(phi[ix]->s1); \ + +#define _hop_z_m_pre() \ + _sse_load(s->s0); \ + _sse_load_up(s->s2); \ + _sse_vector_i_mul(); \ + _sse_vector_sub(); \ + _sse_store_nt(phi[ix]->s0); \ + _sse_load(s->s1); \ + _sse_load_up(s->s3); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_store_nt(phi[ix]->s1); + +#define _hop_t_p_post() \ + _vector_assign(rs.s0, phi[ix]->s0); \ + _vector_assign(rs.s2, phi[ix]->s0); \ + _vector_assign(rs.s1, phi[ix]->s1); \ + _vector_assign(rs.s3, phi[ix]->s1); + +#define _hop_t_m_post() \ + _prefetch_su3(U+predist); \ + _sse_load(phi[ix]->s0); \ + _sse_su3_inverse_multiply((*U)); \ + _sse_vector_cmplxcg_mul(ka0); \ + _sse_load(rs.s0); \ + _sse_vector_add(); \ + _sse_store(rs.s0); \ + _sse_load(rs.s2); \ + _sse_vector_sub(); \ + _sse_store(rs.s2); \ + _sse_load(phi[ix]->s1); \ + _sse_su3_inverse_multiply((*U)); \ + _sse_vector_cmplxcg_mul(ka0); \ + _sse_load(rs.s1); \ + _sse_vector_add(); \ + _sse_store(rs.s1); \ + _sse_load(rs.s3); \ + _sse_vector_sub(); \ 
+ _sse_store(rs.s3); + +#define _hop_x_p_post() \ + _sse_load_up(phi[ix]->s0); \ + _sse_load(rs.s0); \ + _sse_vector_add(); \ + _sse_store(rs.s0); \ + _sse_load(rs.s3); \ + _sse_vector_i_mul(); \ + _sse_vector_sub(); \ + _sse_store(rs.s3); \ + _sse_load_up(phi[ix]->s1); \ + _sse_load(rs.s1); \ + _sse_vector_add(); \ + _sse_store(rs.s1); \ + _sse_load(rs.s2); \ + _sse_vector_i_mul(); \ + _sse_vector_sub(); \ + _sse_store(rs.s2); + +#define _hop_x_m_post() \ + _prefetch_su3(U+predist); \ + _sse_load(phi[ix]->s0); \ + _sse_su3_inverse_multiply((*U)); \ + _sse_vector_cmplxcg_mul(ka1); \ + _sse_load(rs.s0); \ + _sse_vector_add(); \ + _sse_store(rs.s0); \ + _sse_load(rs.s3); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_store(rs.s3); \ + _sse_load(phi[ix]->s1); \ + _sse_su3_inverse_multiply((*U)); \ + _sse_vector_cmplxcg_mul(ka1); \ + _sse_load(rs.s1); \ + _sse_vector_add(); \ + _sse_store(rs.s1); \ + _sse_load(rs.s2); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_store(rs.s2); + +#define _hop_y_p_post() \ + _sse_load_up(phi[ix]->s0); \ + _sse_load(rs.s0); \ + _sse_vector_add(); \ + _sse_store(rs.s0); \ + _sse_load(rs.s3); \ + _sse_vector_add(); \ + _sse_store(rs.s3); \ + _sse_load_up(phi[ix]->s1); \ + _sse_load(rs.s1); \ + _sse_vector_add(); \ + _sse_store(rs.s1); \ + _sse_load(rs.s2); \ + _sse_vector_sub(); \ + _sse_store(rs.s2); + +#define _hop_y_m_post() \ + _prefetch_su3(U+predist); \ + _sse_load(phi[ix]->s0); \ + _sse_su3_inverse_multiply((*U)); \ + _sse_vector_cmplxcg_mul(ka2); \ + _sse_load(rs.s0); \ + _sse_vector_add(); \ + _sse_store(rs.s0); \ + _sse_load(rs.s3); \ + _sse_vector_sub(); \ + _sse_store(rs.s3); \ + _sse_load(phi[ix]->s1); \ + _sse_su3_inverse_multiply((*U)); \ + _sse_vector_cmplxcg_mul(ka2); \ + _sse_load(rs.s1); \ + _sse_vector_add(); \ + _sse_store(rs.s1); \ + _sse_load(rs.s2); \ + _sse_vector_add(); \ + _sse_store(rs.s2); + +#define _hop_z_p_post() \ + _sse_load_up(phi[ix]->s0); \ + _sse_load(rs.s0); \ + _sse_vector_add(); 
\ + _sse_store(rs.s0); \ + _sse_load(rs.s2); \ + _sse_vector_i_mul(); \ + _sse_vector_sub(); \ + _sse_store(rs.s2); \ + _sse_load_up(phi[ix]->s1); \ + _sse_load(rs.s1); \ + _sse_vector_add(); \ + _sse_store(rs.s1); \ + _sse_load(rs.s3); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_store(rs.s3); + +#define _hop_z_m_post() \ + _prefetch_su3(U+predist); \ + _prefetch_spinor(s+1); \ + _sse_load(phi[ix]->s0); \ + _sse_su3_inverse_multiply((*U)); \ + _sse_vector_cmplxcg_mul(ka3); \ + _sse_load(rs.s0); \ + _sse_vector_add(); \ + _sse_store_nt(s->s0); \ + _sse_load(rs.s2); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_store_nt(s->s2); \ + _sse_load(phi[ix]->s1); \ + _sse_su3_inverse_multiply((*U)); \ + _sse_vector_cmplxcg_mul(ka3); \ + _sse_load(rs.s1); \ + _sse_vector_add(); \ + _sse_store_nt(s->s1); \ + _sse_load(rs.s3); \ + _sse_vector_i_mul(); \ + _sse_vector_sub(); \ + _sse_store_nt(s->s3); + +#define _hop_mul_g5_cmplx_and_store(res) \ + _sse_load_up((res)->s0); \ + _sse_vector_cmplx_mul(cf); \ + _sse_store_nt_up((res)->s0); \ + _sse_load_up((res)->s1); \ + _sse_vector_cmplx_mul(cf); \ + _sse_store_nt_up((res)->s1); \ + _sse_load_up((res)->s2); \ + _sse_vector_cmplxcg_mul(cf); \ + _sse_store_nt_up((res)->s2); \ + _sse_load_up((res)->s3); \ + _sse_vector_cmplxcg_mul(cf); \ + _sse_store_nt_up((res)->s3); + +#define _g5_cmplx_sub_hop_and_g5store(res) \ + _sse_load_up(pn->s0); \ + _sse_vector_cmplx_mul(cf); \ + _sse_load((res)->s0); \ + _sse_vector_sub_up(); \ + _sse_store_nt_up((res)->s0); \ + _sse_load_up(pn->s1); \ + _sse_vector_cmplx_mul(cf); \ + _sse_load((res)->s1); \ + _sse_vector_sub_up(); \ + _sse_store_nt_up((res)->s1); \ + _sse_load_up(pn->s2); \ + _sse_vector_cmplxcg_mul(cf); \ + _sse_load((res)->s2); \ + _sse_vector_sub(); \ + _sse_store_nt((res)->s2); \ + _sse_load_up(pn->s3); \ + _sse_vector_cmplxcg_mul(cf); \ + _sse_load((res)->s3); \ + _sse_vector_sub(); \ + _sse_store_nt((res)->s3); + +#define _hop_store_post(res) + +#if defined 
OPTERON +# define _declare_hregs() \ + spinor rs ALIGN; \ + const int predist=2; +#else +# define _declare_hregs() \ + spinor rs ALIGN; \ + const int predist=1; +#endif + +#elif (defined BGL && defined XLC) + +#define _declare_hregs() \ + double _Complex reg00, reg01, reg02, reg03, reg04, reg05; \ + double _Complex reg10, reg11, reg12, reg13, reg14, reg15; \ + double _Complex u00, u01, u02, u10, u11, u12; \ + double _Complex reg20, reg21; \ + double _Complex rs00, rs01, rs02, rs10, rs11, rs12, rs20, rs21, rs22, \ + rs30, rs31, rs32; + +#define _hop_t_p_pre32() \ + _bgl_load_rs0(s->s0); \ + _bgl_load_rs1(s->s1); \ + _bgl_load_rs2(s->s2); \ + _bgl_load_rs3(s->s3); \ + _prefetch_spinor(s+1); \ + _prefetch_su3(U+1); \ + _bgl_vector_add_rs2_to_rs0_reg0(); \ + _bgl_vector_add_rs3_to_rs1_reg1(); \ + _bgl_su3_multiply_double((*U)); \ + _bgl_vector_cmplx_mul_double(ka0); \ + _bgl_store_reg0_up_32(phi32[ix]->s0); \ + _bgl_store_reg1_up_32(phi32[ix]->s1); + +#define _hop_t_m_pre32() \ + _bgl_vector_sub_rs2_from_rs0_reg0(); \ + _bgl_vector_sub_rs3_from_rs1_reg1(); \ + _bgl_store_reg0_32(phi32[ix]->s0); \ + _bgl_store_reg1_32(phi32[ix]->s1); + +#define _hop_x_p_pre32() \ + _prefetch_su3(U+1); \ + _bgl_vector_i_mul_add_rs3_to_rs0_reg0(); \ + _bgl_vector_i_mul_add_rs2_to_rs1_reg1(); \ + _bgl_su3_multiply_double((*U)); \ + _bgl_vector_cmplx_mul_double(ka1); \ + _bgl_store_reg0_up_32(phi32[ix]->s0); \ + _bgl_store_reg1_up_32(phi32[ix]->s1); + +#define _hop_x_m_pre32() \ + _bgl_vector_i_mul_sub_rs3_from_rs0_reg0(); \ + _bgl_vector_i_mul_sub_rs2_from_rs1_reg1(); \ + _bgl_store_reg0_32(phi32[ix]->s0); \ + _bgl_store_reg1_32(phi32[ix]->s1); + +#define _hop_y_p_pre32() \ + _prefetch_su3(U+1); \ + _bgl_vector_add_rs3_to_rs0_reg0(); \ + _bgl_vector_sub_rs2_from_rs1_reg1(); \ + _bgl_su3_multiply_double((*U)); \ + _bgl_vector_cmplx_mul_double(ka2); \ + _bgl_store_reg0_up_32(phi32[ix]->s0); \ + _bgl_store_reg1_up_32(phi32[ix]->s1); + +#define _hop_y_m_pre32() \ + 
_bgl_vector_sub_rs3_from_rs0_reg0(); \ + _bgl_vector_add_rs2_to_rs1_reg1(); \ + _bgl_store_reg0_32(phi32[ix]->s0); \ + _bgl_store_reg1_32(phi32[ix]->s1); + +#define _hop_z_p_pre32() \ + _bgl_vector_i_mul_add_rs2_to_rs0_reg0(); \ + _bgl_vector_i_mul_sub_rs3_from_rs1_reg1(); \ + _bgl_su3_multiply_double((*U)); \ + _bgl_vector_cmplx_mul_double(ka3); \ + _bgl_store_reg0_up_32(phi32[ix]->s0); \ + _bgl_store_reg1_up_32(phi32[ix]->s1); + +#define _hop_z_m_pre32() \ + _bgl_vector_i_mul_sub_rs2_from_rs0_reg0(); \ + _bgl_vector_i_mul_add_rs3_to_rs1_reg1(); \ + _bgl_store_reg0_32(phi32[ix]->s0); \ + _bgl_store_reg1_32(phi32[ix]->s1); + +#define _hop_t_p_post32() \ + _bgl_load_rs0_32(phi32[ix]->s0); \ + rs20 = rs00; \ + rs21 = rs01; \ + rs22 = rs02; \ + _bgl_load_rs1_32(phi32[ix]->s1); \ + rs30 = rs10; \ + rs31 = rs11; \ + rs32 = rs12; + +#define _hop_t_m_post32() \ + _prefetch_su3(U+1); \ + _bgl_load_reg0_32(phi32[ix]->s0); \ + _bgl_load_reg1_32(phi32[ix]->s1); \ + _bgl_su3_inverse_multiply_double((*U)); \ + _bgl_vector_cmplxcg_mul_double(ka0); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_sub_from_rs2_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_sub_from_rs3_reg1(); + +#define _hop_x_p_post32() \ + _bgl_load_reg0_up_32(phi32[ix]->s0); \ + _bgl_load_reg1_up_32(phi32[ix]->s1); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_i_mul_sub_from_rs3_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_i_mul_sub_from_rs2_reg1(); + +#define _hop_x_m_post32() \ + _prefetch_su3(U+1); \ + _bgl_load_reg0_32(phi32[ix]->s0); \ + _bgl_load_reg1_32(phi32[ix]->s1); \ + _bgl_su3_inverse_multiply_double((*U)); \ + _bgl_vector_cmplxcg_mul_double(ka1); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_i_mul_add_to_rs3_reg0(); \ + _bgl_i_mul_add_to_rs2_reg1(); + +#define _hop_y_p_post32() \ + _bgl_load_reg0_up_32(phi32[ix]->s0); \ + _bgl_load_reg1_up_32(phi32[ix]->s1); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_sub_from_rs2_reg1(); \ + _bgl_add_to_rs3_reg0(); + +#define _hop_y_m_post32() \ 
+ _prefetch_su3(U+1); \ + _bgl_load_reg0_32(phi32[ix]->s0); \ + _bgl_load_reg1_32(phi32[ix]->s1); \ + _bgl_su3_inverse_multiply_double((*U)); \ + _bgl_vector_cmplxcg_mul_double(ka2); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_add_to_rs2_reg1(); \ + _bgl_sub_from_rs3_reg0(); + +#define _hop_z_p_post32() \ + _bgl_load_reg0_up_32(phi32[ix]->s0); \ + _bgl_load_reg1_up_32(phi32[ix]->s1); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_i_mul_sub_from_rs2_reg0(); \ + _bgl_i_mul_add_to_rs3_reg1(); + +#define _hop_z_m_post32() \ + _prefetch_su3(U+1); \ + _bgl_load_reg0_32(phi32[ix]->s0); \ + _bgl_load_reg1_32(phi32[ix]->s1); \ + _bgl_su3_inverse_multiply_double((*U)); \ + _bgl_vector_cmplxcg_mul_double(ka3); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_i_mul_add_to_rs2_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_i_mul_sub_from_rs3_reg1(); + +#define _hop_t_p_pre() \ + _prefetch_halfspinor(phi[ix+4]); \ + _bgl_load_rs0(s->s0); \ + _bgl_load_rs1(s->s1); \ + _bgl_load_rs2(s->s2); \ + _bgl_load_rs3(s->s3); \ + _prefetch_spinor(s+1); \ + _prefetch_su3(U+1); \ + _bgl_vector_add_rs2_to_rs0_reg0(); \ + _bgl_vector_add_rs3_to_rs1_reg1(); \ + _bgl_su3_multiply_double((*U)); \ + _bgl_vector_cmplx_mul_double(ka0); \ + _bgl_store_reg0_up(phi[ix]->s0); \ + _bgl_store_reg1_up(phi[ix]->s1); + +#define _hop_t_m_pre() \ + _prefetch_halfspinor(phi[ix+4]); \ + _bgl_vector_sub_rs2_from_rs0_reg0(); \ + _bgl_vector_sub_rs3_from_rs1_reg1(); \ + _bgl_store_reg0(phi[ix]->s0); \ + _bgl_store_reg1(phi[ix]->s1); + +#define _hop_x_p_pre() \ + _prefetch_halfspinor(phi[ix+4]); \ + _prefetch_su3(U+1); \ + _bgl_vector_i_mul_add_rs3_to_rs0_reg0(); \ + _bgl_vector_i_mul_add_rs2_to_rs1_reg1(); \ + _bgl_su3_multiply_double((*U)); \ + _bgl_vector_cmplx_mul_double(ka1); \ + _bgl_store_reg0_up(phi[ix]->s0); \ + _bgl_store_reg1_up(phi[ix]->s1); + +#define _hop_x_m_pre() \ + _prefetch_halfspinor(phi[ix+4]); \ + _bgl_vector_i_mul_sub_rs3_from_rs0_reg0(); \ + 
_bgl_vector_i_mul_sub_rs2_from_rs1_reg1(); \ + _bgl_store_reg0(phi[ix]->s0); \ + _bgl_store_reg1(phi[ix]->s1); + +#define _hop_y_p_pre() \ + _prefetch_halfspinor(phi[ix+4]); \ + _prefetch_su3(U+1); \ + _bgl_vector_add_rs3_to_rs0_reg0(); \ + _bgl_vector_sub_rs2_from_rs1_reg1(); \ + _bgl_su3_multiply_double((*U)); \ + _bgl_vector_cmplx_mul_double(ka2); \ + _bgl_store_reg0_up(phi[ix]->s0); \ + _bgl_store_reg1_up(phi[ix]->s1); + +#define _hop_y_m_pre() \ + _prefetch_halfspinor(phi[ix+4]); \ + _bgl_vector_sub_rs3_from_rs0_reg0(); \ + _bgl_vector_add_rs2_to_rs1_reg1(); \ + _bgl_store_reg0(phi[ix]->s0); \ + _bgl_store_reg1(phi[ix]->s1); + +#define _hop_z_p_pre() \ + _prefetch_halfspinor(phi[ix+4]); \ + _prefetch_su3(U+1); \ + _bgl_vector_i_mul_add_rs2_to_rs0_reg0(); \ + _bgl_vector_i_mul_sub_rs3_from_rs1_reg1(); \ + _bgl_su3_multiply_double((*U)); \ + _bgl_vector_cmplx_mul_double(ka3); \ + _bgl_store_reg0_up(phi[ix]->s0); \ + _bgl_store_reg1_up(phi[ix]->s1); + +#define _hop_z_m_pre() \ + _prefetch_halfspinor(phi[ix+4]); \ + _bgl_vector_i_mul_sub_rs2_from_rs0_reg0(); \ + _bgl_vector_i_mul_add_rs3_to_rs1_reg1(); \ + _bgl_store_reg0(phi[ix]->s0); \ + _bgl_store_reg1(phi[ix]->s1); + +/* _hop_*_post(): no ';' between the parameter list and the body; it would become the first token of every expansion (an extra empty statement). Callers supply the terminating ';' themselves. */ +#define _hop_t_p_post() \ + _prefetch_halfspinor(phi[ix+3]); \ + _bgl_load_rs0(phi[ix]->s0); \ + rs20 = rs00; \ + rs21 = rs01; \ + rs22 = rs02; \ + _bgl_load_rs1(phi[ix]->s1); \ + rs30 = rs10; \ + rs31 = rs11; \ + rs32 = rs12; + +#define _hop_t_m_post() \ + _prefetch_halfspinor(phi[ix+3]); \ + _prefetch_su3(U+1); \ + _bgl_load_reg0(phi[ix]->s0); \ + _bgl_load_reg1(phi[ix]->s1); \ + _bgl_su3_inverse_multiply_double((*U)); \ + _bgl_vector_cmplxcg_mul_double(ka0); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_sub_from_rs2_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_sub_from_rs3_reg1(); + +#define _hop_x_p_post() \ + _prefetch_halfspinor(phi[ix+3]); \ + _bgl_load_reg0_up(phi[ix]->s0); \ + _bgl_load_reg1_up(phi[ix]->s1); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_i_mul_sub_from_rs3_reg0(); \ +
_bgl_add_to_rs1_reg1(); \ + _bgl_i_mul_sub_from_rs2_reg1(); + +#define _hop_x_m_post() \ + _prefetch_halfspinor(phi[ix+3]); \ + _prefetch_su3(U+1); \ + _bgl_load_reg0(phi[ix]->s0); \ + _bgl_load_reg1(phi[ix]->s1); \ + _bgl_su3_inverse_multiply_double((*U)); \ + _bgl_vector_cmplxcg_mul_double(ka1); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_i_mul_add_to_rs3_reg0(); \ + _bgl_i_mul_add_to_rs2_reg1(); + +#define _hop_y_p_post() \ + _prefetch_halfspinor(phi[ix+3]); \ + _bgl_load_reg0_up(phi[ix]->s0); \ + _bgl_load_reg1_up(phi[ix]->s1); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_sub_from_rs2_reg1(); \ + _bgl_add_to_rs3_reg0(); + +#define _hop_y_m_post() \ + _prefetch_halfspinor(phi[ix+3]); \ + _prefetch_su3(U+1); \ + _bgl_load_reg0(phi[ix]->s0); \ + _bgl_load_reg1(phi[ix]->s1); \ + _bgl_su3_inverse_multiply_double((*U)); \ + _bgl_vector_cmplxcg_mul_double(ka2); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_add_to_rs2_reg1(); \ + _bgl_sub_from_rs3_reg0(); + +#define _hop_z_p_post() \ + _prefetch_halfspinor(phi[ix+3]); \ + _bgl_load_reg0_up(phi[ix]->s0); \ + _bgl_load_reg1_up(phi[ix]->s1); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_i_mul_sub_from_rs2_reg0(); \ + _bgl_i_mul_add_to_rs3_reg1(); + +#define _hop_z_m_post() \ + _prefetch_spinor(s); \ + _prefetch_halfspinor(phi[ix+3]); \ + _prefetch_su3(U+1); \ + _bgl_load_reg0(phi[ix]->s0); \ + _bgl_load_reg1(phi[ix]->s1); \ + _bgl_su3_inverse_multiply_double((*U)); \ + _bgl_vector_cmplxcg_mul_double(ka3); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_i_mul_add_to_rs2_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_i_mul_sub_from_rs3_reg1(); + + + +#define _hop_store_post(res) \ + _bgl_store_rs0((res)->s0); \ + _bgl_store_rs1((res)->s1); \ + _bgl_store_rs2((res)->s2); \ + _bgl_store_rs3((res)->s3); + + +#elif (defined BGQ && defined XLC) + +#define _hop_t_p_pre32() \ + _vec_load2(rs0, rs1, rs2, s->s0); \ + _vec_load2(rs3, rs4, rs5, s->s1); \ +
_vec_load2(rs6, rs7, rs8, s->s2); \ + _vec_load2(rs9, rs10, rs11, s->s3); \ + _prefetch_spinor(s+1); \ + _prefetch_su3(U+1); \ + _vec_add_to2(r0, r1, r2, rs0, rs1, rs2, rs6, rs7, rs8); \ + _vec_add_to2(r3, r4, r5, rs3, rs4, rs5, rs9, rs10, rs11); \ + rtmp = vec_ld2(0, (double*) &ka0); \ + _vec_su3_multiply_double2(U); \ + _vec_cmplx_mul_double2(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_store2_32(phi32[ix]->s0, r0, r1, r2); \ + _vec_store2_32(phi32[ix]->s1, r3, r4, r5); + +#define _hop_t_m_pre32() \ + _vec_sub_to2(r0, r1, r2, rs0, rs1, rs2, rs6, rs7, rs8); \ + _vec_sub_to2(r3, r4, r5, rs3, rs4, rs5, rs9, rs10, rs11); \ + _vec_store2_32(phi32[ix]->s0, r0, r1, r2); \ + _vec_store2_32(phi32[ix]->s1, r3, r4, r5); + +#define _hop_x_p_pre32() \ + _prefetch_su3(U+1); \ + _vec_i_mul_add_to2(r0, r1, r2, rs0, rs1, rs2, rs9, rs10, rs11, U0); \ + _vec_i_mul_add_to2(r3, r4, r5, rs3, rs4, rs5, rs6, rs7, rs8, U0); \ + rtmp = vec_ld2(0, (double*) &ka1); \ + _vec_su3_multiply_double2(U); \ + _vec_cmplx_mul_double2(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_store2_32(phi32[ix]->s0, r0, r1, r2); \ + _vec_store2_32(phi32[ix]->s1, r3, r4, r5); + +#define _hop_x_m_pre32() \ + _vec_i_mul_sub_to2(r0, r1, r2, rs0, rs1, rs2, rs9, rs10, rs11, U0); \ + _vec_i_mul_sub_to2(r3, r4, r5, rs3, rs4, rs5, rs6, rs7, rs8, U0); \ + _vec_store2_32(phi32[ix]->s0, r0, r1, r2); \ + _vec_store2_32(phi32[ix]->s1, r3, r4, r5); + +#define _hop_y_p_pre32() \ + _prefetch_su3(U+1); \ + _vec_add_to2(r0, r1, r2, rs0, rs1, rs2, rs9, rs10, rs11); \ + _vec_sub_to2(r3, r4, r5, rs3, rs4, rs5, rs6, rs7, rs8); \ + rtmp = vec_ld2(0, (double*) &ka2); \ + _vec_su3_multiply_double2(U); \ + _vec_cmplx_mul_double2(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_store2_32(phi32[ix]->s0, r0, r1, r2); \ + _vec_store2_32(phi32[ix]->s1, r3, r4, r5); + +#define _hop_y_m_pre32() \ + _vec_sub_to2(r0, r1, r2, rs0, rs1, rs2, rs9, rs10, rs11); \ + _vec_add_to2(r3, r4, r5, rs3, 
rs4, rs5, rs6, rs7, rs8); \ + _vec_store2_32(phi32[ix]->s0, r0, r1, r2); \ + _vec_store2_32(phi32[ix]->s1, r3, r4, r5); + +#define _hop_z_p_pre32() \ + _prefetch_su3(U+1); \ + _vec_i_mul_add_to2(r0, r1, r2, rs0, rs1, rs2, rs6, rs7, rs8, U0); \ + _vec_i_mul_sub_to2(r3, r4, r5, rs3, rs4, rs5, rs9, rs10, rs11, U0); \ + rtmp = vec_ld2(0, (double*) &ka3); \ + _vec_su3_multiply_double2(U); \ + _vec_cmplx_mul_double2(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_store2_32(phi32[ix]->s0, r0, r1, r2); \ + _vec_store2_32(phi32[ix]->s1, r3, r4, r5); + +#define _hop_z_m_pre32() \ + _vec_i_mul_sub_to2(r0, r1, r2, rs0, rs1, rs2, rs6, rs7, rs8, U0); \ + _vec_i_mul_add_to2(r3, r4, r5, rs3, rs4, rs5, rs9, rs10, rs11, U0); \ + _vec_store2_32(phi32[ix]->s0, r0, r1, r2); \ + _vec_store2_32(phi32[ix]->s1, r3, r4, r5); + +#define _hop_t_p_post32() \ + _vec_load2_32(rs0, rs1, rs2, phi32[ix]->s0); \ + rs6 = rs0; \ + rs7 = rs1; \ + rs8 = rs2; \ + _vec_load2_32(rs3, rs4, rs5, phi32[ix]->s1); \ + rs9 = rs3; \ + rs10= rs4; \ + rs11= rs5; + +#define _hop_t_m_post32() \ + _prefetch_su3(U+1); \ + _vec_load2_32(r0, r1, r2, phi32[ix]->s0); \ + _vec_load2_32(r3, r4, r5, phi32[ix]->s1); \ + rtmp = vec_ld2(0, (double*) &ka0); \ + _vec_su3_inverse_multiply_double2(U); \ + _vec_cmplxcg_mul_double2(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_add2(rs0, rs1, rs2, r0, r1, r2); \ + _vec_sub2(rs6, rs7, rs8, r0, r1, r2); \ + _vec_add2(rs3, rs4, rs5, r3, r4, r5); \ + _vec_sub2(rs9, rs10, rs11, r3, r4, r5); + +#define _hop_x_p_post32() \ + _vec_load2_32(r0, r1, r2, phi32[ix]->s0); \ + _vec_load2_32(r3, r4, r5, phi32[ix]->s1); \ + _vec_add2(rs0, rs1, rs2, r0, r1, r2); \ + _vec_i_mul_sub2(rs9, rs10, rs11, r0, r1, r2, U0); \ + _vec_add2(rs3, rs4, rs5, r3, r4, r5); \ + _vec_i_mul_sub2(rs6, rs7, rs8, r3, r4, r5, U0); + +#define _hop_x_m_post32() \ + _prefetch_su3(U+1); \ + _vec_load2_32(r0, r1, r2, phi32[ix]->s0); \ + _vec_load2_32(r3, r4, r5, phi32[ix]->s1); \ + rtmp = 
vec_ld2(0, (double*) &ka1); \ + _vec_su3_inverse_multiply_double2(U); \ + _vec_cmplxcg_mul_double2(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_add_double2(rs9, rs10, rs11, rs6, rs7, rs8, r0, r1, r2, r3, r4, r5, U0); + +#define _hop_y_p_post32() \ + _vec_load2_32(r0, r1, r2, phi32[ix]->s0); \ + _vec_load2_32(r3, r4, r5, phi32[ix]->s1); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_sub2(rs6, rs7, rs8, r3, r4, r5); \ + _vec_add2(rs9, rs10, rs11, r0, r1, r2); + +#define _hop_y_m_post32() \ + _prefetch_su3(U+1); \ + _vec_load2_32(r0, r1, r2, phi32[ix]->s0); \ + _vec_load2_32(r3, r4, r5, phi32[ix]->s1); \ + rtmp = vec_ld2(0, (double*) &ka2); \ + _vec_su3_inverse_multiply_double2(U); \ + _vec_cmplxcg_mul_double2(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_add2(rs6, rs7, rs8, r3, r4, r5); \ + _vec_sub2(rs9, rs10, rs11, r0, r1, r2); + +#define _hop_z_p_post32() \ + _vec_load2_32(r0, r1, r2, phi32[ix]->s0); \ + _vec_load2_32(r3, r4, r5, phi32[ix]->s1); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_sub2(rs6, rs7, rs8, r0, r1, r2, U0); \ + _vec_i_mul_add2(rs9, rs10, rs11, r3, r4, r5, U0); + +#define _hop_z_m_post32() \ + _prefetch_su3(U+1); \ + _vec_load2_32(r0, r1, r2, phi32[ix]->s0); \ + _vec_load2_32(r3, r4, r5, phi32[ix]->s1); \ + rtmp = vec_ld2(0, (double*) &ka3); \ + _vec_su3_inverse_multiply_double2(U); \ + _vec_cmplxcg_mul_double2(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_add2(rs0, rs1, rs2, r0, r1, r2); \ + _vec_i_mul_add2(rs6, rs7, rs8, r0, r1, r2, U0); \ + _vec_add2(rs3, rs4, rs5, r3, r4, r5); \ + _vec_i_mul_sub2(rs9, rs10, rs11, r3, r4, r5, U0); + +#define _hop_t_p_pre2() \ + _vec_load2(rs0, rs1, rs2, s->s0); \ + _vec_load2(rs3, rs4, rs5, s->s1); \ + 
_vec_load2(rs6, rs7, rs8, s->s2); \ + _vec_load2(rs9, rs10, rs11, s->s3); \ + _prefetch_spinor(s+1); \ + _prefetch_su3(U+1); \ + _vec_add_to2(r0, r1, r2, rs0, rs1, rs2, rs6, rs7, rs8); \ + _vec_add_to2(r3, r4, r5, rs3, rs4, rs5, rs9, rs10, rs11); \ + rtmp = vec_ld2(0, (double*) &ka0); \ + _vec_su3_multiply_double2(U); \ + _vec_cmplx_mul_double2(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_store2(phi[ix]->s0, r0, r1, r2); \ + _vec_store2(phi[ix]->s1, r3, r4, r5); + +// note that this _hop_t_p_pre stores the quadwords in phi[ix] +// in a different order than expected, but this is taken care of in +// the corresponding _hop_t_p_post version +// +// it might be good to check whether unfusing is better done here +// instead of in the corresponding post version!? +#define _hop_t_p_pre() \ + _vec_load(rs0, rs1, s->s0); \ + _vec_load16(rs2, rs3, s->s1, rtmp); \ + _vec_load(rs4, rs5, s->s2); \ + _vec_load16(rs6, rs7, s->s3, rtmp); \ + _prefetch_spinor(s+1); \ + _prefetch_su3(U+1); \ + _vec_add(r0, r1, rs0, rs1, rs4, rs5); \ + _vec_add(r2, r3, rs2, rs3, rs6, rs7); \ + _vec_su3_multiply_double2c(U); \ + rtmp = vec_ld2(0, (double*) &ka0); \ + _vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_store_halfspinor(phi[ix]->s0, r0, r1, r2); + +#define _hop_t_m_pre2() \ + _vec_sub_to2(r0, r1, r2, rs0, rs1, rs2, rs6, rs7, rs8); \ + _vec_sub_to2(r3, r4, r5, rs3, rs4, rs5, rs9, rs10, rs11); \ + _vec_store2(phi[ix]->s0, r0, r1, r2); \ + _vec_store2(phi[ix]->s1, r3, r4, r5); + +#define _hop_t_m_pre() \ + _vec_sub(r0, r1, rs0, rs1, rs4, rs5); \ + _vec_sub(r2, r3, rs2, rs3, rs6, rs7); \ + _vec_store(phi[ix]->s0, r0, r1); \ + _vec_store16(phi[ix]->s1, r2, r3, U0); + +#define _hop_x_p_pre2() \ + _prefetch_su3(U+1); \ + _vec_i_mul_add_to2(r0, r1, r2, rs0, rs1, rs2, rs9, rs10, rs11, U0); \ + _vec_i_mul_add_to2(r3, r4, r5, rs3, rs4, rs5, rs6, rs7, rs8, U0); \ + rtmp = vec_ld2(0, (double*) &ka1); \ + _vec_su3_multiply_double2(U); \ + _vec_cmplx_mul_double2(r0, r1,
r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_store2(phi[ix]->s0, r0, r1, r2); \ + _vec_store2(phi[ix]->s1, r3, r4, r5); + +#define _hop_x_p_pre() \ + _prefetch_su3(U+1); \ + _vec_i_mul_add(r0, r1, rs0, rs1, rs6, rs7, U0); \ + _vec_i_mul_add(r2, r3, rs2, rs3, rs4, rs5, U0); \ + rtmp = vec_ld2(0, (double*) &ka1); \ + _vec_su3_multiply_double2c(U); \ + _vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_store_halfspinor(phi[ix]->s0, r0, r1, r2); + +#define _hop_x_m_pre2() \ + _vec_i_mul_sub_to2(r0, r1, r2, rs0, rs1, rs2, rs9, rs10, rs11, U0); \ + _vec_i_mul_sub_to2(r3, r4, r5, rs3, rs4, rs5, rs6, rs7, rs8, U0); \ + _vec_store2(phi[ix]->s0, r0, r1, r2); \ + _vec_store2(phi[ix]->s1, r3, r4, r5); + +#define _hop_x_m_pre() \ + _vec_i_mul_sub(r0, r1, rs0, rs1, rs6, rs7, U0); \ + _vec_i_mul_sub(r2, r3, rs2, rs3, rs4, rs5, U0); \ + _vec_store(phi[ix]->s0, r0, r1); \ + _vec_store16(phi[ix]->s1, r2, r3, U0); + +#define _hop_y_p_pre2() \ + _prefetch_su3(U+1); \ + _vec_add_to2(r0, r1, r2, rs0, rs1, rs2, rs9, rs10, rs11); \ + _vec_sub_to2(r3, r4, r5, rs3, rs4, rs5, rs6, rs7, rs8); \ + rtmp = vec_ld2(0, (double*) &ka2); \ + _vec_su3_multiply_double2(U); \ + _vec_cmplx_mul_double2(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_store2(phi[ix]->s0, r0, r1, r2); \ + _vec_store2(phi[ix]->s1, r3, r4, r5); + +#define _hop_y_p_pre() \ + _prefetch_su3(U+1); \ + _vec_add(r0, r1, rs0, rs1, rs6, rs7); \ + _vec_sub(r2, r3, rs2, rs3, rs4, rs5); \ + rtmp = vec_ld2(0, (double*) &ka2); \ + _vec_su3_multiply_double2c(U); \ + _vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_store_halfspinor(phi[ix]->s0, r0, r1, r2); + +#define _hop_y_m_pre2() \ + _vec_sub_to2(r0, r1, r2, rs0, rs1, rs2, rs9, rs10, rs11); \ + _vec_add_to2(r3, r4, r5, rs3, rs4, rs5, rs6, rs7, rs8); \ + _vec_store2(phi[ix]->s0, r0, r1, r2); \ + _vec_store2(phi[ix]->s1, r3, r4, r5); + +#define _hop_y_m_pre() \ + _vec_sub(r0, r1, rs0, rs1, rs6, rs7); \ + _vec_add(r2, r3, rs2, rs3, 
rs4, rs5); \ + _vec_store(phi[ix]->s0, r0, r1); \ + _vec_store16(phi[ix]->s1, r2, r3, U0); + +#define _hop_z_p_pre2() \ + _prefetch_su3(U+1); \ + _vec_i_mul_add_to2(r0, r1, r2, rs0, rs1, rs2, rs6, rs7, rs8, U0); \ + _vec_i_mul_sub_to2(r3, r4, r5, rs3, rs4, rs5, rs9, rs10, rs11, U0); \ + rtmp = vec_ld2(0, (double*) &ka3); \ + _vec_su3_multiply_double2(U); \ + _vec_cmplx_mul_double2(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_store2(phi[ix]->s0, r0, r1, r2); \ + _vec_store2(phi[ix]->s1, r3, r4, r5); + +#define _hop_z_p_pre() \ + _prefetch_su3(U+1); \ + _vec_i_mul_add(r0, r1, rs0, rs1, rs4, rs5, U0); \ + _vec_i_mul_sub(r2, r3, rs2, rs3, rs6, rs7, U0); \ + rtmp = vec_ld2(0, (double*) &ka3); \ + _vec_su3_multiply_double2c(U); \ + _vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_store_halfspinor(phi[ix]->s0, r0, r1, r2); + +#define _hop_z_m_pre2() \ + _vec_i_mul_sub_to2(r0, r1, r2, rs0, rs1, rs2, rs6, rs7, rs8, U0); \ + _vec_i_mul_add_to2(r3, r4, r5, rs3, rs4, rs5, rs9, rs10, rs11, U0); \ + _vec_store2(phi[ix]->s0, r0, r1, r2); \ + _vec_store2(phi[ix]->s1, r3, r4, r5); + +#define _hop_z_m_pre() \ + _vec_i_mul_sub(r0, r1, rs0, rs1, rs4, rs5, U0); \ + _vec_i_mul_add(r2, r3, rs2, rs3, rs6, rs7, U0); \ + _vec_store(phi[ix]->s0, r0, r1); \ + _vec_store16(phi[ix]->s1, r2, r3, U0); + +#define _hop_t_p_post2() \ + _vec_load2(rs0, rs1, rs2, phi[ix]->s0); \ + rs6 = rs0; \ + rs7 = rs1; \ + rs8 = rs2; \ + _vec_load2(rs3, rs4, rs5, phi[ix]->s1); \ + rs9 = rs3; \ + rs10= rs4; \ + rs11= rs5; + +#define _hop_t_p_post() \ + _vec_load_halfspinor(rs0, rs1, rs2, phi[ix]->s0); \ + _vec_unfuse(rs0, rs1, rs2, rs3, rs4, rs5); \ + rs6 = rs0; rs7 = rs1; rs8 = rs2; \ + rs9 = rs3; rs10= rs4; rs11= rs5; + +#define _hop_t_m_post2() \ + _prefetch_su3(U+1); \ + _vec_load2(r0, r1, r2, phi[ix]->s0); \ + _vec_load2(r3, r4, r5, phi[ix]->s1); \ + rtmp = vec_ld2(0, (double*) &ka0); \ + _vec_su3_inverse_multiply_double2(U); \ + _vec_cmplxcg_mul_double2(r0, r1, r2, r3, r4, 
r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_add2(rs0, rs1, rs2, r0, r1, r2); \ + _vec_sub2(rs6, rs7, rs8, r0, r1, r2); \ + _vec_add2(rs3, rs4, rs5, r3, r4, r5); \ + _vec_sub2(rs9, rs10, rs11, r3, r4, r5); + + +#define _hop_t_m_post() \ + _prefetch_su3(U+1); \ + _vec_load(r0, r1, phi[ix]->s0); \ + _vec_load16(r2, r3, phi[ix]->s1, rtmp); \ + rtmp = vec_ld2(0, (double*) &ka0); \ + _vec_su3_inverse_multiply_double2c(U); \ + _vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_sub_double2(rs6, rs7, rs8, rs9, rs10, rs11, r0, r1, r2, r3, r4, r5); + +#define _hop_x_p_post2() \ + _vec_load2(r0, r1, r2, phi[ix]->s0); \ + _vec_load2(r3, r4, r5, phi[ix]->s1); \ + _vec_add2(rs0, rs1, rs2, r0, r1, r2); \ + _vec_i_mul_sub2(rs9, rs10, rs11, r0, r1, r2, U0); \ + _vec_add2(rs3, rs4, rs5, r3, r4, r5); \ + _vec_i_mul_sub2(rs6, rs7, rs8, r3, r4, r5, U0); + +#define _hop_x_p_post() \ + _vec_load_halfspinor(r0, r1, r2, phi[ix]->s0); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_sub2(rs6, rs7, rs8, r3, r4, r5, U0); \ + _vec_i_mul_sub2(rs9, rs10, rs11, r0, r1, r2, U1); + +#define _hop_x_m_post2() \ + _prefetch_su3(U+1); \ + _vec_load2(r0, r1, r2, phi[ix]->s0); \ + _vec_load2(r3, r4, r5, phi[ix]->s1); \ + rtmp = vec_ld2(0, (double*) &ka1); \ + _vec_su3_inverse_multiply_double2(U); \ + _vec_cmplxcg_mul_double2(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_add_double2(rs9, rs10, rs11, rs6, rs7, rs8, r0, r1, r2, r3, r4, r5, U0); + +#define _hop_x_m_post() \ + _prefetch_su3(U+1); \ + _vec_load(r0, r1, phi[ix]->s0); \ + _vec_load16(r2, r3, phi[ix]->s1, rtmp); \ + rtmp = vec_ld2(0, (double*) &ka1); \ + _vec_su3_inverse_multiply_double2c(U); \ + _vec_cmplxcg_mul_double2c(r0, r1, r2, 
r4, r5, r6, rtmp); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_add_double2(rs9, rs10, rs11, rs6, rs7, rs8, r0, r1, r2, r3, r4, r5, U0); + +#define _hop_y_p_post2() \ + _vec_load2(r0, r1, r2, phi[ix]->s0); \ + _vec_load2(r3, r4, r5, phi[ix]->s1); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_sub2(rs6, rs7, rs8, r3, r4, r5); \ + _vec_add2(rs9, rs10, rs11, r0, r1, r2); + +#define _hop_y_p_post() \ + _vec_load_halfspinor(r0, r1, r2, phi[ix]->s0); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_sub2(rs6, rs7, rs8, r3, r4, r5); \ + _vec_add2(rs9, rs10, rs11, r0, r1, r2); + +#define _hop_y_m_post2() \ + _prefetch_su3(U+1); \ + _vec_load2(r0, r1, r2, phi[ix]->s0); \ + _vec_load2(r3, r4, r5, phi[ix]->s1); \ + rtmp = vec_ld2(0, (double*) &ka2); \ + _vec_su3_inverse_multiply_double2(U); \ + _vec_cmplxcg_mul_double2(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_add2(rs6, rs7, rs8, r3, r4, r5); \ + _vec_sub2(rs9, rs10, rs11, r0, r1, r2); + +#define _hop_y_m_post() \ + _prefetch_su3(U+1); \ + _vec_load(r0, r1, phi[ix]->s0); \ + _vec_load16(r2, r3, phi[ix]->s1, rtmp); \ + rtmp = vec_ld2(0, (double*) &ka2); \ + _vec_su3_inverse_multiply_double2c(U); \ + _vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_add2(rs6, rs7, rs8, r3, r4, r5); \ + _vec_sub2(rs9, rs10, rs11, r0, r1, r2); + +#define _hop_z_p_post2() \ + _vec_load2(r0, r1, r2, phi[ix]->s0); \ + _vec_load2(r3, r4, r5, phi[ix]->s1); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_sub2(rs6, rs7, rs8, r0, r1, r2, U0); \ + _vec_i_mul_add2(rs9, rs10, rs11, r3, r4, 
r5, U0); + +#define _hop_z_p_post() \ + _vec_load_halfspinor(r0, r1, r2, phi[ix]->s0); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_sub2(rs6, rs7, rs8, r0, r1, r2, U0); \ + _vec_i_mul_add2(rs9, rs10, rs11, r3, r4, r5, U1); + +#define _hop_z_m_post2() \ + _prefetch_su3(U+1); \ + _vec_load2(r0, r1, r2, phi[ix]->s0); \ + _vec_load2(r3, r4, r5, phi[ix]->s1); \ + rtmp = vec_ld2(0, (double*) &ka3); \ + _vec_su3_inverse_multiply_double2(U); \ + _vec_cmplxcg_mul_double2(r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, rtmp); \ + _vec_add2(rs0, rs1, rs2, r0, r1, r2); \ + _vec_add2(rs3, rs4, rs5, r3, r4, r5); \ + _vec_i_mul_add2(rs6, rs7, rs8, r0, r1, r2, U0); \ + _vec_i_mul_sub2(rs9, rs10, rs11, r3, r4, r5, U0); + +#define _hop_z_m_post() \ + _prefetch_su3(U+1); \ + _vec_load(r0, r1, phi[ix]->s0); \ + _vec_load16(r2, r3, phi[ix]->s1, rtmp); \ + rtmp = vec_ld2(0, (double*) &ka3); \ + _vec_su3_inverse_multiply_double2c(U); \ + _vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_add2(rs6, rs7, rs8, r0, r1, r2, U0); \ + _vec_i_mul_sub2(rs9, rs10, rs11, r3, r4, r5, U1); + +#define _hop_mul_g5_cmplx_and_store(res) \ + _vec_cmplx_mul_double2(r0, r1, r2, r3, r4, r5, rs0, rs1, rs2, rs3, rs4, rs5, cf); \ + _vec_cmplxcg_mul_double2(r6, r7, r8, r9, r10, r11, rs6, rs7, rs8, rs9, rs10, rs11, cf); \ + _vec_store2((res)->s0, r0, r1, r2); \ + _vec_store2((res)->s1, r3, r4, r5); \ + _vec_store2((res)->s2, r6, r7, r8); \ + _vec_store2((res)->s3, r9, r10, r11); + +#define _g5_cmplx_sub_hop_and_g5store(res) \ + _vec_load_halfspinor(r3, r4, r5, pn->s0); \ + _vec_cmplx_mul_double2c(r0, r1, r2, r3, r4, r5, cf); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_sub_double2(r0, r3, r1, r4, r2, r5, rs0, rs1, rs2, rs3, rs4, rs5); \ + _vec_store2((res)->s0, r0, r3, r1); \ + 
_vec_store2((res)->s1, r4, r2, r5); \ + _vec_load_halfspinor(r3, r4, r5, pn->s2); \ + _vec_cmplxcg_mul_double2c(r0, r1, r2, r3, r4, r5, cf); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_sub_double2(rs6, rs7, rs8, rs9, rs10, rs11, r0, r3, r1, r4, r2, r5); \ + _vec_store2((res)->s2, rs6, rs7, rs8); \ + _vec_store2((res)->s3, rs9, rs10, rs11); + +#define _hop_store_post(res) \ + _vec_store2((res)->s0, rs0, rs1, rs2); \ + _vec_store2((res)->s1, rs3, rs4, rs5); \ + _vec_store2((res)->s2, rs6, rs7, rs8); \ + _vec_store2((res)->s3, rs9, rs10, rs11); + + /* Declares the vector4double work variables named by the BGQ _hop_* macros above: r0-r11, rs0-rs11, U0-U4, U6, U7 and rtmp. NOTE(review): U5 is absent from the U list - confirm that no _vec_* helper expands to a reference to U5. */ +#define _declare_hregs() \ + vector4double ALIGN r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; \ + vector4double ALIGN rs0, rs1, rs2, rs3, rs4, rs5, rs6, rs7, rs8, rs9, rs10, rs11; \ + vector4double ALIGN U0, U1, U2, U3, U4, U6, U7; \ + vector4double ALIGN rtmp; + +#else + +#define _prefetch_spinor(s) +#define _prefetch_halfspinor(hs) +#define _prefetch_su3(U) + +#define _hop_t_p_pre32() \ + _vector_assign(rs.s0, s->s0); \ + _vector_assign(rs.s1, s->s1); \ + _vector_assign(rs.s2, s->s2); \ + _vector_assign(rs.s3, s->s3); \ + _vector_add(psi, rs.s0, rs.s2); \ + _su3_multiply(chi,(*U),psi); \ + _complex_times_vector(phi32[ix]->s0, ka0, chi); \ + _vector_add(psi, rs.s1, rs.s3); \ + _su3_multiply(chi,(*U),psi); \ + _complex_times_vector(phi32[ix]->s1, ka0, chi); + +#define _hop_t_m_pre32() \ + _vector_sub(phi32[ix]->s0, rs.s0, rs.s2); \ + _vector_sub(phi32[ix]->s1, rs.s1, rs.s3); + +#define _hop_x_p_pre32() \ + _vector_i_add(psi, rs.s0, rs.s3); \ + _su3_multiply(chi, (*U), psi); \ + _complex_times_vector(phi32[ix]->s0, ka1, chi); \ + _vector_i_add(psi, rs.s1, rs.s2); \ + _su3_multiply(chi, (*U), psi); \ + _complex_times_vector(phi32[ix]->s1, ka1, chi); + +#define _hop_x_m_pre32() \ + _vector_i_sub(phi32[ix]->s0, rs.s0, rs.s3); \ + _vector_i_sub(phi32[ix]->s1, rs.s1, rs.s2); + +#define _hop_y_p_pre32() \ + _vector_add(psi, rs.s0, rs.s3); \ + _su3_multiply(chi,(*U),psi); \ + _complex_times_vector(phi32[ix]->s0, ka2,
chi); \ + _vector_sub(psi, rs.s1, rs.s2); \ + _su3_multiply(chi,(*U),psi); \ + _complex_times_vector(phi32[ix]->s1, ka2, chi); + +#define _hop_y_m_pre32() \ + _vector_sub(phi32[ix]->s0, rs.s0, rs.s3); \ + _vector_add(phi32[ix]->s1, rs.s1, rs.s2); + +#define _hop_z_p_pre32() \ + _vector_i_add(psi, rs.s0, rs.s2); \ + _su3_multiply(chi, (*U), psi); \ + _complex_times_vector(phi32[ix]->s0, ka3, chi); \ + _vector_i_sub(psi, rs.s1, rs.s3); \ + _su3_multiply(chi,(*U),psi); \ + _complex_times_vector(phi32[ix]->s1, ka3, chi); + +#define _hop_z_m_pre32() \ + _vector_i_sub(phi32[ix]->s0, rs.s0, rs.s2); \ + _vector_i_add(phi32[ix]->s1, rs.s1, rs.s3); + +#define _hop_t_p_post32(); \ + _vector_assign(rs.s0, phi32[ix]->s0); \ + _vector_assign(rs.s2, phi32[ix]->s0); \ + _vector_assign(rs.s1, phi32[ix]->s1); \ + _vector_assign(rs.s3, phi32[ix]->s1); \ + +#define _hop_t_m_post32(); \ + _vector_assign(psi, phi32[ix]->s0); \ + _su3_inverse_multiply(chi,(*U), psi); \ + _complexcjg_times_vector(psi,ka0,chi); \ + _vector_add_assign(rs.s0, psi); \ + _vector_sub_assign(rs.s2, psi); \ + _vector_assign(psi, phi32[ix]->s1); \ + _su3_inverse_multiply(chi,(*U), psi); \ + _complexcjg_times_vector(psi,ka0,chi); \ + _vector_add_assign(rs.s1, psi); \ + _vector_sub_assign(rs.s3, psi); + +#define _hop_x_p_post32(); \ + _vector_add_assign(rs.s0, phi32[ix]->s0); \ + _vector_i_sub_assign(rs.s3, phi32[ix]->s0); \ + _vector_add_assign(rs.s1, phi32[ix]->s1); \ + _vector_i_sub_assign(rs.s2, phi32[ix]->s1); + +#define _hop_x_m_post32(); \ + _vector_assign(psi, phi32[ix]->s0); \ + _su3_inverse_multiply(chi,(*U), psi); \ + _complexcjg_times_vector(psi,ka1,chi); \ + _vector_add_assign(rs.s0, psi); \ + _vector_i_add_assign(rs.s3, psi); \ + _vector_assign(psi, phi32[ix]->s1); \ + _su3_inverse_multiply(chi,(*U), psi); \ + _complexcjg_times_vector(psi,ka1,chi); \ + _vector_add_assign(rs.s1, psi); \ + _vector_i_add_assign(rs.s2, psi); + +#define _hop_y_p_post32(); \ + _vector_add_assign(rs.s0, phi32[ix]->s0); \ + 
_vector_add_assign(rs.s3, phi32[ix]->s0); \ + _vector_add_assign(rs.s1, phi32[ix]->s1); \ + _vector_sub_assign(rs.s2, phi32[ix]->s1); + +#define _hop_y_m_post32(); \ + _vector_assign(psi, phi32[ix]->s0); \ + _su3_inverse_multiply(chi,(*U), psi); \ + _complexcjg_times_vector(psi,ka2,chi); \ + _vector_add_assign(rs.s0, psi); \ + _vector_sub_assign(rs.s3, psi); \ + _vector_assign(psi, phi32[ix]->s1); \ + _su3_inverse_multiply(chi, (*U), psi); \ + _complexcjg_times_vector(psi,ka2,chi); \ + _vector_add_assign(rs.s1, psi); \ + _vector_add_assign(rs.s2, psi); + +#define _hop_z_p_post32(); \ + _vector_add_assign(rs.s0, phi32[ix]->s0); \ + _vector_i_sub_assign(rs.s2, phi32[ix]->s0); \ + _vector_add_assign(rs.s1, phi32[ix]->s1); \ + _vector_i_add_assign(rs.s3, phi32[ix]->s1); + +#define _hop_z_m_post32(); \ + _vector_assign(psi, phi32[ix]->s0); \ + _su3_inverse_multiply(chi,(*U), psi); \ + _complexcjg_times_vector(psi,ka3,chi); \ + _vector_add_assign(rs.s0, psi); \ + _vector_i_add_assign(rs.s2, psi); \ + _vector_assign(psi, phi32[ix]->s1); \ + _su3_inverse_multiply(chi,(*U), psi); \ + _complexcjg_times_vector(psi,ka3,chi); \ + _vector_add_assign(rs.s1, psi); \ + _vector_i_sub_assign(rs.s3, psi); + +#define _hop_t_p_pre() \ + _vector_assign(rs.s0, s->s0); \ + _vector_assign(rs.s1, s->s1); \ + _vector_assign(rs.s2, s->s2); \ + _vector_assign(rs.s3, s->s3); \ + _vector_add(psi, rs.s0, rs.s2); \ + _vector_add(psi2, rs.s1, rs.s3); \ + _su3_multiply(chi,(*U),psi); \ + _su3_multiply(chi2,(*U),psi2); \ + _complex_times_vector(phi[ix]->s0, ka0, chi); \ + _complex_times_vector(phi[ix]->s1, ka0, chi2); + +#define _hop_t_m_pre() \ + _vector_sub(phi[ix]->s0, rs.s0, rs.s2); \ + _vector_sub(phi[ix]->s1, rs.s1, rs.s3); + +#define _hop_x_p_pre() \ + _vector_i_add(psi, rs.s0, rs.s3); \ + _vector_i_add(psi2, rs.s1, rs.s2); \ + _su3_multiply(chi, (*U), psi); \ + _su3_multiply(chi2, (*U), psi2); \ + _complex_times_vector(phi[ix]->s0, ka1, chi); \ + _complex_times_vector(phi[ix]->s1, ka1, chi2); 
+ +#define _hop_x_m_pre() \ + _vector_i_sub(phi[ix]->s0, rs.s0, rs.s3); \ + _vector_i_sub(phi[ix]->s1, rs.s1, rs.s2); + +#define _hop_y_p_pre() \ + _vector_add(psi, rs.s0, rs.s3); \ + _vector_sub(psi2, rs.s1, rs.s2); \ + _su3_multiply(chi,(*U),psi); \ + _su3_multiply(chi2,(*U),psi2); \ + _complex_times_vector(phi[ix]->s0, ka2, chi); \ + _complex_times_vector(phi[ix]->s1, ka2, chi2); + +#define _hop_y_m_pre() \ + _vector_sub(phi[ix]->s0, rs.s0, rs.s3); \ + _vector_add(phi[ix]->s1, rs.s1, rs.s2); + +#define _hop_z_p_pre() \ + _vector_i_add(psi, rs.s0, rs.s2); \ + _vector_i_sub(psi2, rs.s1, rs.s3); \ + _su3_multiply(chi, (*U), psi); \ + _su3_multiply(chi2,(*U),psi2); \ + _complex_times_vector(phi[ix]->s0, ka3, chi); \ + _complex_times_vector(phi[ix]->s1, ka3, chi2); + +#define _hop_z_m_pre() \ + _vector_i_sub(phi[ix]->s0, rs.s0, rs.s2); \ + _vector_i_add(phi[ix]->s1, rs.s1, rs.s3); + +#define _hop_t_p_post() \ + _vector_assign(rs.s0, phi[ix]->s0); \ + _vector_assign(rs.s2, phi[ix]->s0); \ + _vector_assign(rs.s1, phi[ix]->s1); \ + _vector_assign(rs.s3, phi[ix]->s1); + +#define _hop_t_m_post() \ + _su3_inverse_multiply(chi,(*U),phi[ix]->s0); \ + _su3_inverse_multiply(chi2,(*U),phi[ix]->s1); \ + _complexcjg_times_vector(psi,ka0,chi); \ + _complexcjg_times_vector(psi2,ka0,chi2); \ + _vector_add_assign(rs.s0, psi); \ + _vector_sub_assign(rs.s2, psi); \ + _vector_add_assign(rs.s1, psi2); \ + _vector_sub_assign(rs.s3, psi2); + +#define _hop_x_p_post() \ + _vector_add_assign(rs.s0, phi[ix]->s0); \ + _vector_i_sub_assign(rs.s3, phi[ix]->s0); \ + _vector_add_assign(rs.s1, phi[ix]->s1); \ + _vector_i_sub_assign(rs.s2, phi[ix]->s1); + +#define _hop_x_m_post() \ + _su3_inverse_multiply(chi,(*U), phi[ix]->s0); \ + _su3_inverse_multiply(chi2, (*U), phi[ix]->s1); \ + _complexcjg_times_vector(psi,ka1,chi); \ + _complexcjg_times_vector(psi2,ka1,chi2); \ + _vector_add_assign(rs.s0, psi); \ + _vector_i_add_assign(rs.s3, psi); \ + _vector_add_assign(rs.s1, psi2); \ + 
_vector_i_add_assign(rs.s2, psi2); + +#define _hop_y_p_post() \ + _vector_add_assign(rs.s0, phi[ix]->s0); \ + _vector_add_assign(rs.s3, phi[ix]->s0); \ + _vector_add_assign(rs.s1, phi[ix]->s1); \ + _vector_sub_assign(rs.s2, phi[ix]->s1); + +#define _hop_y_m_post() \ + _su3_inverse_multiply(chi,(*U), phi[ix]->s0); \ + _su3_inverse_multiply(chi2, (*U), phi[ix]->s1); \ + _complexcjg_times_vector(psi,ka2,chi); \ + _complexcjg_times_vector(psi2,ka2,chi2); \ + _vector_add_assign(rs.s0, psi); \ + _vector_sub_assign(rs.s3, psi); \ + _vector_add_assign(rs.s1, psi2); \ + _vector_add_assign(rs.s2, psi2); + +#define _hop_z_p_post() \ + _vector_add_assign(rs.s0, phi[ix]->s0); \ + _vector_i_sub_assign(rs.s2, phi[ix]->s0); \ + _vector_add_assign(rs.s1, phi[ix]->s1); \ + _vector_i_add_assign(rs.s3, phi[ix]->s1); + +#define _hop_z_m_post() \ + _su3_inverse_multiply(chi,(*U), phi[ix]->s0); \ + _su3_inverse_multiply(chi2, (*U), phi[ix]->s1); \ + _complexcjg_times_vector(psi,ka3,chi); \ + _complexcjg_times_vector(psi2,ka3,chi2); \ + _vector_add_assign(rs.s0, psi); \ + _vector_add_assign(rs.s1, psi2); \ + _vector_i_add_assign(rs.s2, psi); \ + _vector_i_sub_assign(rs.s3, psi2); + +#define _hop_mul_g5_cmplx_and_store(res) \ + _complex_times_vector((res)->s0, cfactor, rs.s0); \ + _complex_times_vector((res)->s1, cfactor, rs.s1); \ + _complexcjg_times_vector((res)->s2, cfactor, rs.s2); \ + _complexcjg_times_vector((res)->s3, cfactor, rs.s3); + +#define _g5_cmplx_sub_hop_and_g5store(res) \ + _complex_times_vector(psi, cfactor, pn->s0); \ + _vector_sub((res)->s0, psi, rs.s0); \ + _complex_times_vector(psi2, cfactor, pn->s1); \ + _vector_sub((res)->s1, psi2, rs.s1); \ + _complexcjg_times_vector(psi, cfactor, pn->s2); \ + _vector_sub((res)->s2, rs.s2, psi); \ + _complexcjg_times_vector(psi2, cfactor, pn->s3); \ + _vector_sub((res)->s3, rs.s3, psi2); + + +#define _hop_store_post(res) \ + _vector_assign(res->s0, rs.s0); \ + _vector_assign(res->s1, rs.s1); \ + _vector_assign(res->s2, rs.s2); \ + 
_vector_assign(res->s3, rs.s3); + + +#define _declare_hregs() \ + spinor ALIGN rs; \ + su3_vector ALIGN psi, chi, psi2, chi2; + +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_hopping_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_hopping_32.h new file mode 100644 index 0000000000000000000000000000000000000000..697fa14963d85195f4461abcbf30ba69c988f383 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_hopping_32.h @@ -0,0 +1,408 @@ +/********************************************************************** + * + * Copyright (C) 2013 Florian Burger + * + * A 32-bit version of the Half-spinor implementation by Carsten Urbach + * + * This file is based on an implementation of the Dirac operator + * written by Martin Luescher, modified by Martin Hasenbusch in 2002 + * and modified and extended by Carsten Urbach from 2003-2008 + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + **********************************************************************/ + +#ifndef _HALFSPINOR_HOPPING32_H +#define _HALFSPINOR_HOPPING32_H + +#if (defined BGQ && defined XLC) + +#define _hop_t_p_pre32() \ + _vec_load_32(rs0, rs1, s->s0); \ + _vec_load16_32(rs2, rs3, s->s1, rtmp); \ + _vec_load_32(rs4, rs5, s->s2); \ + _vec_load16_32(rs6, rs7, s->s3, rtmp); \ + _prefetch_spinor_32(s+1); \ + _prefetch_su3_32(U+1); \ + _vec_add(r0, r1, rs0, rs1, rs4, rs5); \ + _vec_add(r2, r3, rs2, rs3, rs6, rs7); \ + _vec_su3_multiply_double2c_32(U); \ + rtmp = vec_ld2(0, (float*) &ka0_32); \ + _vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_store_halfspinor_32(phi2[ix]->s0, r0, r1, r2); + + + +#define _hop_t_m_pre32() \ + _vec_sub(r0, r1, rs0, rs1, rs4, rs5); \ + _vec_sub(r2, r3, rs2, rs3, rs6, rs7); \ + _vec_store_32(phi2[ix]->s0, r0, r1); \ + _vec_store16_32(phi2[ix]->s1, r2, r3, U0); + + +#define _hop_x_p_pre32() \ + _prefetch_su3_32(U+1); \ + _vec_i_mul_add(r0, r1, rs0, rs1, rs6, rs7, U0); \ + _vec_i_mul_add(r2, r3, rs2, rs3, rs4, rs5, U0); \ + rtmp = vec_ld2(0, (float*) &ka1_32); \ + _vec_su3_multiply_double2c_32(U); \ + _vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_store_halfspinor_32(phi2[ix]->s0, r0, r1, r2); + + +#define _hop_x_m_pre32() \ + _vec_i_mul_sub(r0, r1, rs0, rs1, rs6, rs7, U0); \ + _vec_i_mul_sub(r2, r3, rs2, rs3, rs4, rs5, U0); \ + _vec_store_32(phi2[ix]->s0, r0, r1); \ + _vec_store16_32(phi2[ix]->s1, r2, r3, U0); + + +#define _hop_y_p_pre32() \ + _prefetch_su3_32(U+1); \ + _vec_add(r0, r1, rs0, rs1, rs6, rs7); \ + _vec_sub(r2, r3, rs2, rs3, rs4, rs5); \ + rtmp = vec_ld2(0, (float*) &ka2_32); \ + _vec_su3_multiply_double2c_32(U); \ + _vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_store_halfspinor_32(phi2[ix]->s0, r0, r1, r2); + + + +#define _hop_y_m_pre32() \ + _vec_sub(r0, r1, rs0, rs1, rs6, rs7); \ + _vec_add(r2, r3, rs2, rs3, rs4, rs5); \ + _vec_store_32(phi2[ix]->s0, r0, r1); \ + 
_vec_store16_32(phi2[ix]->s1, r2, r3, U0); + + +#define _hop_z_p_pre32() \ + _prefetch_su3_32(U+1); \ + _vec_i_mul_add(r0, r1, rs0, rs1, rs4, rs5, U0); \ + _vec_i_mul_sub(r2, r3, rs2, rs3, rs6, rs7, U0); \ + rtmp = vec_ld2(0, (float*) &ka3_32); \ + _vec_su3_multiply_double2c_32(U); \ + _vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_store_halfspinor_32(phi2[ix]->s0, r0, r1, r2); + + +#define _hop_z_m_pre32() \ + _vec_i_mul_sub(r0, r1, rs0, rs1, rs4, rs5, U0); \ + _vec_i_mul_add(r2, r3, rs2, rs3, rs6, rs7, U0); \ + _vec_store_32(phi2[ix]->s0, r0, r1); \ + _vec_store16_32(phi2[ix]->s1, r2, r3, U0); + + +#define _hop_t_p_post32() \ + _vec_load_halfspinor_32(rs0, rs1, rs2, phi2[ix]->s0); \ + _vec_unfuse(rs0, rs1, rs2, rs3, rs4, rs5); \ + rs6 = rs0; rs7 = rs1; rs8 = rs2; \ + rs9 = rs3; rs10= rs4; rs11= rs5; + + +#define _hop_t_m_post32() \ + _prefetch_su3_32(U+1); \ + _vec_load_32(r0, r1, phi2[ix]->s0); \ + _vec_load16_32(r2, r3, phi2[ix]->s1, rtmp); \ + rtmp = vec_ld2(0, (float*) &ka0_32); \ + _vec_su3_inverse_multiply_double2c_32(U); \ + _vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_sub_double2(rs6, rs7, rs8, rs9, rs10, rs11, r0, r1, r2, r3, r4, r5); + + +#define _hop_x_p_post32() \ + _vec_load_halfspinor_32(r0, r1, r2, phi2[ix]->s0); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_sub2(rs6, rs7, rs8, r3, r4, r5, U0); \ + _vec_i_mul_sub2(rs9, rs10, rs11, r0, r1, r2, U1); + + +#define _hop_x_m_post32() \ + _prefetch_su3_32(U+1); \ + _vec_load_32(r0, r1, phi2[ix]->s0); \ + _vec_load16_32(r2, r3, phi2[ix]->s1, rtmp); \ + rtmp = vec_ld2(0, (float*) &ka1_32); \ + _vec_su3_inverse_multiply_double2c_32(U); \ + _vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, 
rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_add_double2(rs9, rs10, rs11, rs6, rs7, rs8, r0, r1, r2, r3, r4, r5, U0); + + + +#define _hop_y_p_post32() \ + _vec_load_halfspinor_32(r0, r1, r2, phi2[ix]->s0); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_sub2(rs6, rs7, rs8, r3, r4, r5); \ + _vec_add2(rs9, rs10, rs11, r0, r1, r2); + + +#define _hop_y_m_post32() \ + _prefetch_su3_32(U+1); \ + _vec_load_32(r0, r1, phi2[ix]->s0); \ + _vec_load16_32(r2, r3, phi2[ix]->s1, rtmp); \ + rtmp = vec_ld2(0, (float*) &ka2_32); \ + _vec_su3_inverse_multiply_double2c_32(U); \ + _vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_add2(rs6, rs7, rs8, r3, r4, r5); \ + _vec_sub2(rs9, rs10, rs11, r0, r1, r2); + + +#define _hop_z_p_post32() \ + _vec_load_halfspinor_32(r0, r1, r2, phi2[ix]->s0); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_sub2(rs6, rs7, rs8, r0, r1, r2, U0); \ + _vec_i_mul_add2(rs9, rs10, rs11, r3, r4, r5, U1); + + +#define _hop_z_m_post32() \ + _prefetch_su3_32(U+1); \ + _vec_load_32(r0, r1, phi2[ix]->s0); \ + _vec_load16_32(r2, r3, phi2[ix]->s1, rtmp); \ + rtmp = vec_ld2(0, (float*) &ka3_32); \ + _vec_su3_inverse_multiply_double2c_32(U); \ + _vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_add2(rs6, rs7, rs8, r0, r1, r2, U0); \ + _vec_i_mul_sub2(rs9, rs10, rs11, r3, r4, r5, U1); + + + + +//end new versions + + + + + +#define _hop_mul_g5_cmplx_and_store32(res) \ + _vec_cmplx_mul_double2(r0, r1, r2, r3, r4, r5, rs0, rs1, rs2, rs3, rs4, rs5, cf); \ + _vec_cmplxcg_mul_double2(r6, r7, r8, r9, r10, r11, rs6, rs7, rs8, rs9, rs10, rs11, 
cf); \ + _vec_store2_32((res)->s0, r0, r1, r2); \ + _vec_store2_32((res)->s1, r3, r4, r5); \ + _vec_store2_32((res)->s2, r6, r7, r8); \ + _vec_store2_32((res)->s3, r9, r10, r11); + +#define _g5_cmplx_sub_hop_and_g5store32(res) \ + _vec_load_halfspinor_32(r3, r4, r5, pn->s0); \ + _vec_cmplx_mul_double2c_32(r0, r1, r2, r3, r4, r5, cf); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_sub_double2(r0, r3, r1, r4, r2, r5, rs0, rs1, rs2, rs3, rs4, rs5); \ + _vec_store2_32((res)->s0, r0, r3, r1); \ + _vec_store2_32((res)->s1, r4, r2, r5); \ + _vec_load_halfspinor_32(r3, r4, r5, pn->s2); \ + _vec_cmplxcg_mul_double2c(r0, r1, r2, r3, r4, r5, cf); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_sub_double2(rs6, rs7, rs8, rs9, rs10, rs11, r0, r3, r1, r4, r2, r5); \ + _vec_store2_32((res)->s2, rs6, rs7, rs8); \ + _vec_store2_32((res)->s3, rs9, rs10, rs11); + +#define _hop_store_post32(res) \ + _vec_store2_32((res)->s0, rs0, rs1, rs2); \ + _vec_store2_32((res)->s1, rs3, rs4, rs5); \ + _vec_store2_32((res)->s2, rs6, rs7, rs8); \ + _vec_store2_32((res)->s3, rs9, rs10, rs11); + + +#define _declare_hregs() \ + vector4double ALIGN r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; \ + vector4double ALIGN rs0, rs1, rs2, rs3, rs4, rs5, rs6, rs7, rs8, rs9, rs10, rs11; \ + vector4double ALIGN U0, U1, U2, U3, U4, U6, U7; \ + vector4double ALIGN rtmp; + +#else + +#ifdef _prefetch_spinor +# undef _prefetch_spinor +#endif +#define _prefetch_spinor(s) +#ifdef _prefetch_halfspinor +# undef _prefetch_halfspinor +#endif +#define _prefetch_halfspinor(hs) +#ifdef _prefetch_spinor_32 +# undef _prefetch_spinor_32 +#endif +#define _prefetch_spinor_32(s) +#ifdef _prefetch_su3_32 +# undef _prefetch_su3_32 +#endif +#define _prefetch_su3_32(U) + + +#define _hop_t_p_pre32() \ + _vector_assign(rs.s0, s->s0); \ + _vector_assign(rs.s1, s->s1); \ + _vector_assign(rs.s2, s->s2); \ + _vector_assign(rs.s3, s->s3); \ + _vector_add(psi, rs.s0, rs.s2); \ + _su3_multiply(chi,(*U),psi); \ + 
_complex_times_vector(phi2[ix]->s0, ka0_32, chi); \ + _vector_add(psi, rs.s1, rs.s3); \ + _su3_multiply(chi,(*U),psi); \ + _complex_times_vector(phi2[ix]->s1, ka0_32, chi); + +#define _hop_t_m_pre32() \ + _vector_sub(phi2[ix]->s0, rs.s0, rs.s2); \ + _vector_sub(phi2[ix]->s1, rs.s1, rs.s3); + +#define _hop_x_p_pre32() \ + _vector_i_add(psi, rs.s0, rs.s3); \ + _su3_multiply(chi, (*U), psi); \ + _complex_times_vector(phi2[ix]->s0, ka1_32, chi); \ + _vector_i_add(psi, rs.s1, rs.s2); \ + _su3_multiply(chi, (*U), psi); \ + _complex_times_vector(phi2[ix]->s1, ka1_32, chi); + +#define _hop_x_m_pre32() \ + _vector_i_sub(phi2[ix]->s0, rs.s0, rs.s3); \ + _vector_i_sub(phi2[ix]->s1, rs.s1, rs.s2); + +#define _hop_y_p_pre32() \ + _vector_add(psi, rs.s0, rs.s3); \ + _su3_multiply(chi,(*U),psi); \ + _complex_times_vector(phi2[ix]->s0, ka2_32, chi); \ + _vector_sub(psi, rs.s1, rs.s2); \ + _su3_multiply(chi,(*U),psi); \ + _complex_times_vector(phi2[ix]->s1, ka2_32, chi); + +#define _hop_y_m_pre32() \ + _vector_sub(phi2[ix]->s0, rs.s0, rs.s3); \ + _vector_add(phi2[ix]->s1, rs.s1, rs.s2); + +#define _hop_z_p_pre32() \ + _vector_i_add(psi, rs.s0, rs.s2); \ + _su3_multiply(chi, (*U), psi); \ + _complex_times_vector(phi2[ix]->s0, ka3_32, chi); \ + _vector_i_sub(psi, rs.s1, rs.s3); \ + _su3_multiply(chi,(*U),psi); \ + _complex_times_vector(phi2[ix]->s1, ka3_32, chi); + +#define _hop_z_m_pre32() \ + _vector_i_sub(phi2[ix]->s0, rs.s0, rs.s2); \ + _vector_i_add(phi2[ix]->s1, rs.s1, rs.s3); + +#define _hop_t_p_post32(); \ + _vector_assign(rs.s0, phi2[ix]->s0); \ + _vector_assign(rs.s2, phi2[ix]->s0); \ + _vector_assign(rs.s1, phi2[ix]->s1); \ + _vector_assign(rs.s3, phi2[ix]->s1); \ + +#define _hop_t_m_post32(); \ + _vector_assign(psi, phi2[ix]->s0); \ + _su3_inverse_multiply(chi,(*U), psi); \ + _complexcjg_times_vector(psi,ka0_32,chi); \ + _vector_add_assign(rs.s0, psi); \ + _vector_sub_assign(rs.s2, psi); \ + _vector_assign(psi, phi2[ix]->s1); \ + _su3_inverse_multiply(chi,(*U), psi); \ + 
_complexcjg_times_vector(psi,ka0_32,chi); \ + _vector_add_assign(rs.s1, psi); \ + _vector_sub_assign(rs.s3, psi); + +#define _hop_x_p_post32(); \ + _vector_add_assign(rs.s0, phi2[ix]->s0); \ + _vector_i_sub_assign(rs.s3, phi2[ix]->s0); \ + _vector_add_assign(rs.s1, phi2[ix]->s1); \ + _vector_i_sub_assign(rs.s2, phi2[ix]->s1); + +#define _hop_x_m_post32(); \ + _vector_assign(psi, phi2[ix]->s0); \ + _su3_inverse_multiply(chi,(*U), psi); \ + _complexcjg_times_vector(psi,ka1_32,chi); \ + _vector_add_assign(rs.s0, psi); \ + _vector_i_add_assign(rs.s3, psi); \ + _vector_assign(psi, phi2[ix]->s1); \ + _su3_inverse_multiply(chi,(*U), psi); \ + _complexcjg_times_vector(psi,ka1_32,chi); \ + _vector_add_assign(rs.s1, psi); \ + _vector_i_add_assign(rs.s2, psi); + +#define _hop_y_p_post32(); \ + _vector_add_assign(rs.s0, phi2[ix]->s0); \ + _vector_add_assign(rs.s3, phi2[ix]->s0); \ + _vector_add_assign(rs.s1, phi2[ix]->s1); \ + _vector_sub_assign(rs.s2, phi2[ix]->s1); + +#define _hop_y_m_post32(); \ + _vector_assign(psi, phi2[ix]->s0); \ + _su3_inverse_multiply(chi,(*U), psi); \ + _complexcjg_times_vector(psi,ka2_32,chi); \ + _vector_add_assign(rs.s0, psi); \ + _vector_sub_assign(rs.s3, psi); \ + _vector_assign(psi, phi2[ix]->s1); \ + _su3_inverse_multiply(chi, (*U), psi); \ + _complexcjg_times_vector(psi,ka2_32,chi); \ + _vector_add_assign(rs.s1, psi); \ + _vector_add_assign(rs.s2, psi); + +#define _hop_z_p_post32(); \ + _vector_add_assign(rs.s0, phi2[ix]->s0); \ + _vector_i_sub_assign(rs.s2, phi2[ix]->s0); \ + _vector_add_assign(rs.s1, phi2[ix]->s1); \ + _vector_i_add_assign(rs.s3, phi2[ix]->s1); + +#define _hop_z_m_post32(); \ + _vector_assign(psi, phi2[ix]->s0); \ + _su3_inverse_multiply(chi,(*U), psi); \ + _complexcjg_times_vector(psi,ka3_32,chi); \ + _vector_add_assign(rs.s0, psi); \ + _vector_i_add_assign(rs.s2, psi); \ + _vector_assign(psi, phi2[ix]->s1); \ + _su3_inverse_multiply(chi,(*U), psi); \ + _complexcjg_times_vector(psi,ka3_32,chi); \ + 
_vector_add_assign(rs.s1, psi); \ + _vector_i_sub_assign(rs.s3, psi); + +#define _hop_mul_g5_cmplx_and_store32(res) \ + _complex_times_vector((res)->s0, cfactor, rs.s0); \ + _complex_times_vector((res)->s1, cfactor, rs.s1); \ + _complexcjg_times_vector((res)->s2, cfactor, rs.s2); \ + _complexcjg_times_vector((res)->s3, cfactor, rs.s3); + +#define _g5_cmplx_sub_hop_and_g5store32(res) \ + _complex_times_vector(psi, cfactor, pn->s0); \ + _vector_sub((res)->s0, psi, rs.s0); \ + _complex_times_vector(psi2, cfactor, pn->s1); \ + _vector_sub((res)->s1, psi2, rs.s1); \ + _complexcjg_times_vector(psi, cfactor, pn->s2); \ + _vector_sub((res)->s2, rs.s2, psi); \ + _complexcjg_times_vector(psi2, cfactor, pn->s3); \ + _vector_sub((res)->s3, rs.s3, psi2); + + +#define _hop_store_post32(res) \ + _vector_assign(res->s0, rs.s0); \ + _vector_assign(res->s1, rs.s1); \ + _vector_assign(res->s2, rs.s2); \ + _vector_assign(res->s3, rs.s3); + + +#define _declare_hregs() \ + spinor32 ALIGN32 rs; \ + su3_vector32 ALIGN32 psi, chi, psi2, chi2; + +#endif + +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_sse_dbl.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_sse_dbl.c new file mode 100644 index 0000000000000000000000000000000000000000..354ec1a02cb085eed87f86a0a0929c00e4652c16 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/halfspinor_sse_dbl.c @@ -0,0 +1,219 @@ +/********************************************************************** + * + * + * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Carsten Urbach + * + * BG and halfspinor versions (C) 2007, 2008 Carsten Urbach + * + * This file is based on an implementation of the Dirac operator + * written by Martin Luescher, modified by Martin Hasenbusch in 2002 + * and modified and extended by Carsten Urbach from 2003-2008 + * + * This file is part of tmLQCD. 
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ *
+ **********************************************************************/
+
+/*
+ * Hopping_Matrix: half-spinor hopping term; input on k, output on l.
+ *
+ *   ieo  selects the data set.  The first loop reads
+ *        g_gauge_field_copy[0][0] when ieo == 0 and g_gauge_field_copy[1][0]
+ *        otherwise, and uses NBPointer[ieo]; the second loop uses the other
+ *        gauge copy and NBPointer[2 + ieo].
+ *   l    result field; the second loop walks it with s (VOLUME/2 entries).
+ *   k    source field; the first loop walks it with s (VOLUME/2 entries).
+ *
+ * Each iteration steps ix through 8 half spinors and U through 4 links, in
+ * the direction order +0, -0, +1, -1, +2, -2, +3, -3.  The first loop advances
+ * U after each "+" direction, the second after each "-" direction.
+ *
+ * The links are always taken from g_gauge_field_copy; _GAUGE_COPY only
+ * controls whether update_backward_gauge() is called here.
+ *
+ * NOTE(review): no store macro (such as _hop_store_post) is expanded after
+ * _hop_z_m_post(), so this function only writes l if the _hop_*_post macros
+ * it is compiled with store through s themselves -- confirm against the
+ * macro set used for this file.
+ */
+void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
+#ifdef _GAUGE_COPY
+  if(g_update_gauge_copy) {
+    update_backward_gauge(g_gauge_field);
+  }
+#endif
+
+#ifdef OMP
+#pragma omp parallel
+{
+  /* U0 is the per-thread base pointer from which U is derived per site */
+  su3 * restrict U0 ALIGN;
+#endif
+
+  /* with OMP these are declared inside the parallel region, i.e. per thread */
+  int ix;
+  su3 * restrict U ALIGN;
+  spinor * restrict s ALIGN;
+  halfspinor ** phi ALIGN;
+  _declare_hregs();
+
+#ifdef _KOJAK_INST
+#pragma pomp inst begin(hoppingmatrix)
+#endif
+
+#ifndef OMP
+  /* We will run through the source vector now */
+  /* instead of the solution vector */
+  s = k;
+  _prefetch_spinor(s);
+
+  if(ieo == 0) {
+    U = g_gauge_field_copy[0][0];
+  }
+  else {
+    U = g_gauge_field_copy[1][0];
+  }
+  _prefetch_su3(U);
+#else
+  if(ieo == 0) {
+    U0 = g_gauge_field_copy[0][0];
+  }
+  else {
+    U0 = g_gauge_field_copy[1][0];
+  }
+#endif
+  phi = NBPointer[ieo];
+
+  /**************** loop over all lattice sites ******************/
+#ifdef OMP
+#pragma omp for
+#else
+  ix = 0;
+#endif
+  for(int i = 0; i < (VOLUME)/2; i++){
+#ifdef OMP
+    /* recompute the cursors from i: the serial build instead carries
+       s, U and ix over from one iteration to the next */
+    s = k+i;
+    _prefetch_spinor(s);
+    U = U0+i*4;
+    _prefetch_su3(U);
+    ix = i*8;
+#endif
+    /*********************** direction +0 ************************/
+    _hop_t_p_pre();
+    U++;
+    ix++;
+    /*********************** direction -0 ************************/
+    _hop_t_m_pre();
+    ix++;
+
+    /*********************** direction +1 ************************/
+    _hop_x_p_pre();
+    ix++;
+    U++;
+
+    /*********************** direction -1 ************************/
+    _hop_x_m_pre();
+    ix++;
+
+    /*********************** direction +2 ************************/
+    _hop_y_p_pre();
+    ix++;
+    U++;
+    /*********************** direction -2 ************************/
+    _hop_y_m_pre();
+    ix++;
+
+    /*********************** direction +3 ************************/
+    _hop_z_p_pre();
+    ix++;
+    U++;
+
+    /*********************** direction -3 ************************/
+    _hop_z_m_pre();
+#ifndef OMP
+    ix++;
+    s++;
+#endif
+  }
+
+  /* exchange the half-spinor field between the two loops; with OMP exactly
+     one thread makes the call */
+#ifdef OMP
+#pragma omp single
+{
+#endif
+# if (defined MPI && !defined _NO_COMM)
+  xchange_halffield();
+# endif
+#ifdef OMP
+}
+#endif
+
+  /* second pass: s now walks the output field l and the other gauge copy
+     is selected */
+#ifndef OMP
+  s = l;
+  if(ieo == 0) {
+    U = g_gauge_field_copy[1][0];
+  }
+  else {
+    U = g_gauge_field_copy[0][0];
+  }
+  _prefetch_su3(U);
+#else
+  if(ieo == 0) {
+    U0 = g_gauge_field_copy[1][0];
+  }
+  else {
+    U0 = g_gauge_field_copy[0][0];
+  }
+#endif
+  phi = NBPointer[2 + ieo];
+
+
+  /* Now we sum up and expand to a full spinor */
+#ifdef OMP
+#pragma omp for
+#else
+  ix = 0;
+#endif
+  for(int i = 0; i < (VOLUME)/2; i++){
+#ifdef OMP
+    U = U0 + i*4;
+    _prefetch_su3(U);
+    ix = i*8;
+    s = l + i;
+#endif
+    /*********************** direction +0 ************************/
+    _hop_t_p_post();
+    ix++;
+
+    /*********************** direction -0 ************************/
+    _hop_t_m_post();
+
+    ix++;
+    U++;
+    /*********************** direction +1 ************************/
+    _hop_x_p_post();
+    ix++;
+
+    /*********************** direction -1 ************************/
+    _hop_x_m_post();
+    ix++;
+    U++;
+
+    /*********************** direction +2 ************************/
+    _hop_y_p_post();
+    ix++;
+
+    /*********************** direction -2 ************************/
+    _hop_y_m_post();
+    ix++;
+    U++;
+    /*********************** direction +3 ************************/
+    _hop_z_p_post();
+
+    ix++;
+    /*********************** direction -3 ************************/
+    _hop_z_m_post();
+#ifndef OMP
+    ix++;
+    U++;
+    s++;
+#endif
+  }
+#ifdef _KOJAK_INST
+#pragma pomp inst end(hoppingmatrix)
+#endif
+
+#ifdef OMP
+  } /* omp parallel closing bracket */
+#endif
+}
+
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/hopping.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/hopping.h
new file mode 100644
index 0000000000000000000000000000000000000000..6b13ff11d6c7a409e23a7fec8deedef78a892c57
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/hopping.h
@@ -0,0 +1,698 @@
+/**********************************************************************
+ *
+ * Copyright (C) 2012 Carsten Urbach
+ *
+ * BG and halfspinor versions (C) 2007, 2008 Carsten Urbach
+ *
+ * This file is based on an implementation of the Dirac operator
+ * written by Martin Luescher, modified by Martin Hasenbusch in 2002
+ * and modified and extended by Carsten Urbach from 2003-2008
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ * + **********************************************************************/ + +#ifndef _HOPPING_H +#define _HOPPING_H + +# if (defined BGQ && defined XLC) + +/* We have 32 registers available */ +#define _declare_regs() \ + vector4double ALIGN r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11; \ + vector4double ALIGN rs0, rs1, rs2, rs3, rs4, rs5, rs6, rs7, rs8, rs9, rs10, rs11; \ + vector4double ALIGN U0, U1, U2, U3, U4, U6, U7; \ + vector4double ALIGN rtmp; \ + __alignx(16,l); \ + __alignx(16,k); + +#define _hop_t_p() \ + _vec_load_spinor(r4, r5, r6, r7, r8, r9, sp->s0); \ + _vec_add_ul_spinor(r0, r1, r2, r4, r5, r6, r7, r8, r9); \ + _vec_su3_multiply_double2ct(up); \ + rtmp = vec_ld2(0, (double*) &ka0); \ + _vec_cmplx_mul_double2c(rs0, rs1, rs2, r4, r5, r6, rtmp); \ + _vec_unfuse(rs0, rs1, rs2, rs3, rs4, rs5); \ + rs6 = rs0; rs7 = rs1; rs8 = rs2; \ + rs9 = rs3; rs10= rs4; rs11= rs5; + +#define _hop_t_m() \ + _vec_load_spinor(r4, r5, r6, r7, r8, r9, sm->s0); \ + _vec_sub_ul_spinor(r0, r1, r2, r4, r5, r6, r7, r8, r9); \ + _vec_su3_inverse_multiply_double2ct(um); \ + _vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_sub_double2(rs6, rs7, rs8, rs9, rs10, rs11, r0, r1, r2, r3, r4, r5); + +#define _hop_x_p() \ + _vec_load(r4, r5, sp->s0); \ + _vec_load16(r6, r7, sp->s1, U0); \ + _vec_load(r10, r11, sp->s2); \ + _vec_load16(r8, r9, sp->s3, U0); \ + _vec_i_mul_add(r0, r1, r4, r5, r8, r9, U0); \ + _vec_i_mul_add(r2, r3, r6, r7, r10, r11, U0); \ + _vec_su3_multiply_double2c(up); \ + rtmp = vec_ld2(0, (double*) &ka1); \ + _vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_sub2(rs6, rs7, rs8, r3, r4, r5, U0); \ + _vec_i_mul_sub2(rs9, rs10, rs11, r0, r1, r2, U1); + +#define _hop_x_m() \ + _vec_load(r4, r5, 
sm->s0); \ + _vec_load16(r6, r7, sm->s1, U0); \ + _vec_load(r10, r11, sm->s2); \ + _vec_load16(r8, r9, sm->s3, U0); \ + _vec_i_mul_sub(r0, r1, r4, r5, r8, r9, U0); \ + _vec_i_mul_sub(r2, r3, r6, r7, r10, r11, U0); \ + _vec_su3_inverse_multiply_double2c(um); \ + _vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_add2(rs6, rs7, rs8, r3, r4, r5, U0); \ + _vec_i_mul_add2(rs9, rs10, rs11, r0, r1, r2, U1); + +#define _hop_y_p() \ + _vec_load(r4, r5, sp->s0); \ + _vec_load16(r6, r7, sp->s1, U0); \ + _vec_load(r10, r11, sp->s2); \ + _vec_load16(r8, r9, sp->s3, U0); \ + _vec_add(r0, r1, r4, r5, r8, r9); \ + _vec_sub(r2, r3, r6, r7, r10, r11); \ + _vec_su3_multiply_double2c(up); \ + rtmp = vec_ld2(0, (double*) &ka2); \ + _vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_sub2(rs6, rs7, rs8, r3, r4, r5); \ + _vec_add2(rs9, rs10, rs11, r0, r1, r2); + +#define _hop_y_m() \ + _vec_load(r4, r5, sm->s0); \ + _vec_load16(r6, r7, sm->s1, U0); \ + _vec_load(r10, r11, sm->s2); \ + _vec_load16(r8, r9, sm->s3, U0); \ + _vec_sub(r0, r1, r4, r5, r8, r9); \ + _vec_add(r2, r3, r6, r7, r10, r11); \ + _vec_su3_inverse_multiply_double2c(um); \ + _vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_add2(rs6, rs7, rs8, r3, r4, r5); \ + _vec_sub2(rs9, rs10, rs11, r0, r1, r2); + +#define _hop_z_p() \ + _vec_load(r4, r5, sp->s0); \ + _vec_load16(r6, r7, sp->s1, U0); \ + _vec_load(r8, r9, sp->s2); \ + _vec_load16(r10, r11, sp->s3, U0); \ + _vec_i_mul_add(r0, r1, r4, r5, r8, r9, U0); \ + _vec_i_mul_sub(r2, r3, r6, r7, r10, r11, U1); \ + _vec_su3_multiply_double2c(up); \ + rtmp = vec_ld2(0, (double*) &ka3); \ + 
_vec_cmplx_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_sub2(rs6, rs7, rs8, r0, r1, r2, U0); \ + _vec_i_mul_add2(rs9, rs10, rs11, r3, r4, r5, U1); + +#define _hop_z_m() \ + _vec_load(r4, r5, sm->s0); \ + _vec_load16(r6, r7, sm->s1, U0); \ + _vec_load(r8, r9, sm->s2); \ + _vec_load16(r10, r11, sm->s3, U0); \ + _vec_i_mul_sub(r0, r1, r4, r5, r8, r9, U0); \ + _vec_i_mul_add(r2, r3, r6, r7, r10, r11, U1); \ + _vec_su3_inverse_multiply_double2c(um); \ + _vec_cmplxcg_mul_double2c(r0, r1, r2, r4, r5, r6, rtmp); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_add_double2(rs0, rs1, rs2, rs3, rs4, rs5, r0, r1, r2, r3, r4, r5); \ + _vec_i_mul_add2(rs6, rs7, rs8, r0, r1, r2, U0); \ + _vec_i_mul_sub2(rs9, rs10, rs11, r3, r4, r5, U1); + +#define _hop_mul_g5_cmplx_and_store() \ + _vec_cmplx_mul_double2(r0, r1, r2, r3, r4, r5, rs0, rs1, rs2, rs3, rs4, rs5, cf); \ + _vec_cmplxcg_mul_double2(r6, r7, r8, r9, r10, r11, rs6, rs7, rs8, rs9, rs10, rs11, cf); \ + _vec_store2(rn->s0, r0, r1, r2); \ + _vec_store2(rn->s1, r3, r4, r5); \ + _vec_store2(rn->s2, r6, r7, r8); \ + _vec_store2(rn->s3, r9, r10, r11); + +#define _g5_cmplx_sub_hop_and_g5store() \ + _vec_load_halfspinor(r3, r4, r5, pn->s0); \ + _vec_cmplx_mul_double2c(r0, r1, r2, r3, r4, r5, cf); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_sub_double2(r0, r3, r1, r4, r2, r5, rs0, rs1, rs2, rs3, rs4, rs5); \ + _vec_store2(rn->s0, r0, r3, r1); \ + _vec_store2(rn->s1, r4, r2, r5); \ + _vec_load_halfspinor(r3, r4, r5, pn->s2); \ + _vec_cmplxcg_mul_double2c(r0, r1, r2, r3, r4, r5, cf); \ + _vec_unfuse(r0, r1, r2, r3, r4, r5); \ + _vec_sub_double2(rs6, rs7, rs8, rs9, rs10, rs11, r0, r3, r1, r4, r2, r5); \ + _vec_store2(rn->s2, rs6, rs7, rs8); \ + _vec_store2(rn->s3, rs9, rs10, rs11); + + +#define _store_res() \ + _vec_store2(rn->s0, rs0, rs1, rs2); \ + _vec_store2(rn->s1, rs3, rs4, rs5); \ + 
_vec_store2(rn->s2, rs6, rs7, rs8); \ + _vec_store2(rn->s3, rs9, rs10, rs11); + +# elif (defined BGL && defined XLC) + +#define _declare_regs() \ + double _Complex reg00, reg01, reg02, reg03, reg04, reg05; \ + double _Complex reg10, reg11, reg12, reg13, reg14, reg15; \ + double _Complex u00, u01, u02, u10, u11, u12; \ + double _Complex reg20, reg21; \ + double _Complex rs00, rs01, rs02, rs10, rs11, rs12, rs20, rs21, rs22, \ + rs30, rs31, rs32; + +#define _hop_t_p() \ + _prefetch_su3(um); \ + _prefetch_spinor(sm); \ + _bgl_load_reg0(sp->s0); \ + _bgl_load_reg1(sp->s1); \ + _bgl_load_reg0_up(sp->s2); \ + _bgl_load_reg1_up(sp->s3); \ + _bgl_vector_add_reg0(); \ + _bgl_vector_add_reg1(); \ + _bgl_su3_multiply_double((*up)); \ + _bgl_vector_cmplx_mul_double(ka0); \ + _bgl_store_reg0_up_rs0(); \ + _bgl_store_reg0_up_rs2(); \ + _bgl_store_reg1_up_rs1(); \ + _bgl_store_reg1_up_rs3(); + +#define _hop_t_m() \ + _prefetch_su3(up); \ + _prefetch_spinor(sp); \ + _bgl_load_reg0(sm->s0); \ + _bgl_load_reg1(sm->s1); \ + _bgl_load_reg0_up(sm->s2); \ + _bgl_load_reg1_up(sm->s3); \ + _bgl_vector_sub_reg0(); \ + _bgl_vector_sub_reg1(); \ + _bgl_su3_inverse_multiply_double((*um)); \ + _bgl_vector_cmplxcg_mul_double(ka0); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_sub_from_rs2_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_sub_from_rs3_reg1(); + +#define _hop_x_p() \ + _prefetch_su3(um); \ + _prefetch_spinor(sm); \ + _bgl_load_reg0(sp->s0); \ + _bgl_load_reg1(sp->s1); \ + _bgl_load_reg0_up(sp->s3); \ + _bgl_load_reg1_up(sp->s2); \ + _bgl_vector_i_mul_add_reg0(); \ + _bgl_vector_i_mul_add_reg1(); \ + _bgl_su3_multiply_double((*up)); \ + _bgl_vector_cmplx_mul_double(ka1); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_i_mul_sub_from_rs3_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_i_mul_sub_from_rs2_reg1(); + +#define _hop_x_m() \ + _prefetch_su3(up); \ + _prefetch_spinor(sp); \ + _bgl_load_reg0(sm->s0); \ + _bgl_load_reg1(sm->s1); \ + _bgl_load_reg0_up(sm->s3); \ + _bgl_load_reg1_up(sm->s2); \ + 
_bgl_vector_i_mul_sub_reg0(); \ + _bgl_vector_i_mul_sub_reg1(); \ + _bgl_su3_inverse_multiply_double((*um)); \ + _bgl_vector_cmplxcg_mul_double(ka1); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_i_mul_add_to_rs3_reg0(); \ + _bgl_i_mul_add_to_rs2_reg1(); + +#define _hop_y_p() \ + _prefetch_su3(um); \ + _prefetch_spinor(sm); \ + _bgl_load_reg0(sp->s0); \ + _bgl_load_reg1(sp->s1); \ + _bgl_load_reg1_up(sp->s2); \ + _bgl_load_reg0_up(sp->s3); \ + _bgl_vector_add_reg0(); \ + _bgl_vector_sub_reg1(); \ + _bgl_su3_multiply_double((*up)); \ + _bgl_vector_cmplx_mul_double(ka2); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_sub_from_rs2_reg1(); \ + _bgl_add_to_rs3_reg0(); + +#define _hop_y_m() \ + _prefetch_su3(up); \ + _prefetch_spinor(sp); \ + _bgl_load_reg0(sm->s0); \ + _bgl_load_reg1(sm->s1); \ + _bgl_load_reg1_up(sm->s2); \ + _bgl_load_reg0_up(sm->s3); \ + _bgl_vector_sub_reg0(); \ + _bgl_vector_add_reg1(); \ + _bgl_su3_inverse_multiply_double((*um)); \ + _bgl_vector_cmplxcg_mul_double(ka2); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_add_to_rs2_reg1(); \ + _bgl_sub_from_rs3_reg0(); + +#define _hop_z_p() \ + _prefetch_su3(um); \ + _prefetch_spinor(sm); \ + _bgl_load_reg0(sp->s0); \ + _bgl_load_reg1(sp->s1); \ + _bgl_load_reg0_up(sp->s2); \ + _bgl_load_reg1_up(sp->s3); \ + _bgl_vector_i_mul_add_reg0(); \ + _bgl_vector_i_mul_sub_reg1(); \ + _bgl_su3_multiply_double((*up)); \ + _bgl_vector_cmplx_mul_double(ka3); \ + _bgl_add_to_rs0_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_i_mul_sub_from_rs2_reg0(); \ + _bgl_i_mul_add_to_rs3_reg1(); + +#define _hop_z_m() \ + _prefetch_su3(up); \ + _prefetch_spinor(sp); \ + _bgl_load_reg0(sm->s0); \ + _bgl_load_reg1(sm->s1); \ + _bgl_load_reg0_up(sm->s2); \ + _bgl_load_reg1_up(sm->s3); \ + _bgl_vector_i_mul_sub_reg0(); \ + _bgl_vector_i_mul_add_reg1(); \ + _bgl_su3_inverse_multiply_double((*um)); \ + _bgl_vector_cmplxcg_mul_double(ka3); \ + _bgl_add_to_rs0_reg0(); \ + 
_bgl_i_mul_add_to_rs2_reg0(); \ + _bgl_add_to_rs1_reg1(); \ + _bgl_i_mul_sub_from_rs3_reg1(); + +#define _store_res() \ + _bgl_store_rs0(rn->s0); \ + _bgl_store_rs1(rn->s1); \ + _bgl_store_rs2(rn->s2); \ + _bgl_store_rs3(rn->s3); + +# elif (defined SSE2 || defined SSE3) + +#define _declare_regs() \ + spinor ALIGN rs; + +#define _hop_t_p() \ + _prefetch_su3(um); \ + _sse_load(sp->s0); \ + _sse_load_up(sp->s2); \ + _sse_vector_add(); \ + _sse_su3_multiply((*up)); \ + _sse_vector_cmplx_mul(ka0); \ + _sse_store_up(rs.s0); \ + _sse_store_up(rs.s2); \ + _sse_load(sp->s1); \ + _sse_load_up(sp->s3); \ + _sse_vector_add(); \ + _sse_su3_multiply((*up)); \ + _sse_vector_cmplx_mul(ka0); \ + _sse_store_up(rs.s1); \ + _sse_store_up(rs.s3); + +#define _hop_t_m() \ + _prefetch_su3(up); \ + _sse_load(sm->s0); \ + _sse_load_up(sm->s2); \ + _sse_vector_sub(); \ + _sse_su3_inverse_multiply((*um)); \ + _sse_vector_cmplxcg_mul(ka0); \ + _sse_load(rs.s0); \ + _sse_vector_add(); \ + _sse_store(rs.s0); \ + _sse_load(rs.s2); \ + _sse_vector_sub(); \ + _sse_store(rs.s2); \ + _sse_load(sm->s1); \ + _sse_load_up(sm->s3); \ + _sse_vector_sub(); \ + _sse_su3_inverse_multiply((*um)); \ + _sse_vector_cmplxcg_mul(ka0); \ + _sse_load(rs.s1); \ + _sse_vector_add(); \ + _sse_store(rs.s1); \ + _sse_load(rs.s3); \ + _sse_vector_sub(); \ + _sse_store(rs.s3); + +#define _hop_x_p() \ + _prefetch_su3(um); \ + _sse_load(sp->s0); \ + _sse_load_up(sp->s3); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_su3_multiply((*up)); \ + _sse_vector_cmplx_mul(ka1); \ + _sse_load(rs.s0); \ + _sse_vector_add(); \ + _sse_store(rs.s0); \ + _sse_load(rs.s3); \ + _sse_vector_i_mul(); \ + _sse_vector_sub(); \ + _sse_store(rs.s3); \ + _sse_load(sp->s1); \ + _sse_load_up(sp->s2); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_su3_multiply((*up)); \ + _sse_vector_cmplx_mul(ka1); \ + _sse_load(rs.s1); \ + _sse_vector_add(); \ + _sse_store(rs.s1); \ + _sse_load(rs.s2); \ + _sse_vector_i_mul(); \ + 
_sse_vector_sub(); \ + _sse_store(rs.s2); + +#define _hop_x_m() \ + _prefetch_su3(up); \ + _sse_load(sm->s0); \ + _sse_load_up(sm->s3); \ + _sse_vector_i_mul(); \ + _sse_vector_sub(); \ + _sse_su3_inverse_multiply((*um)); \ + _sse_vector_cmplxcg_mul(ka1); \ + _sse_load(rs.s0); \ + _sse_vector_add(); \ + _sse_store(rs.s0); \ + _sse_load(rs.s3); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_store(rs.s3); \ + _sse_load(sm->s1); \ + _sse_load_up(sm->s2); \ + _sse_vector_i_mul(); \ + _sse_vector_sub(); \ + _sse_su3_inverse_multiply((*um)); \ + _sse_vector_cmplxcg_mul(ka1); \ + _sse_load(rs.s1); \ + _sse_vector_add(); \ + _sse_store(rs.s1); \ + _sse_load(rs.s2); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_store(rs.s2); + +#define _hop_y_p() \ + _prefetch_su3(um); \ + _sse_load(sp->s0); \ + _sse_load_up(sp->s3); \ + _sse_vector_add(); \ + _sse_su3_multiply((*up)); \ + _sse_vector_cmplx_mul(ka2); \ + _sse_load(rs.s0); \ + _sse_vector_add(); \ + _sse_store(rs.s0); \ + _sse_load(rs.s3); \ + _sse_vector_add(); \ + _sse_store(rs.s3); \ + _sse_load(sp->s1); \ + _sse_load_up(sp->s2); \ + _sse_vector_sub(); \ + _sse_su3_multiply((*up)); \ + _sse_vector_cmplx_mul(ka2); \ + _sse_load(rs.s1); \ + _sse_vector_add(); \ + _sse_store(rs.s1); \ + _sse_load(rs.s2); \ + _sse_vector_sub(); \ + _sse_store(rs.s2); + +#define _hop_y_m() \ + _prefetch_su3(up); \ + _sse_load(sm->s0); \ + _sse_load_up(sm->s3); \ + _sse_vector_sub(); \ + _sse_su3_inverse_multiply((*um)); \ + _sse_vector_cmplxcg_mul(ka2); \ + _sse_load(rs.s0); \ + _sse_vector_add(); \ + _sse_store(rs.s0); \ + _sse_load(rs.s3); \ + _sse_vector_sub(); \ + _sse_store(rs.s3); \ + _sse_load(sm->s1); \ + _sse_load_up(sm->s2); \ + _sse_vector_add(); \ + _sse_su3_inverse_multiply((*um)); \ + _sse_vector_cmplxcg_mul(ka2); \ + _sse_load(rs.s1); \ + _sse_vector_add(); \ + _sse_store(rs.s1); \ + _sse_load(rs.s2); \ + _sse_vector_add(); \ + _sse_store(rs.s2); + +#define _hop_z_p() \ + _prefetch_su3(um); \ + 
_sse_load(sp->s0); \ + _sse_load_up(sp->s2); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_su3_multiply((*up)); \ + _sse_vector_cmplx_mul(ka3); \ + _sse_load(rs.s0); \ + _sse_vector_add(); \ + _sse_store(rs.s0); \ + _sse_load(rs.s2); \ + _sse_vector_i_mul(); \ + _sse_vector_sub(); \ + _sse_store(rs.s2); \ + _sse_load(sp->s1); \ + _sse_load_up(sp->s3); \ + _sse_vector_i_mul(); \ + _sse_vector_sub(); \ + _sse_su3_multiply((*up)); \ + _sse_vector_cmplx_mul(ka3); \ + _sse_load(rs.s1); \ + _sse_vector_add(); \ + _sse_store(rs.s1); \ + _sse_load(rs.s3); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_store(rs.s3); + +#define _hop_z_m() \ + _prefetch_su3(up); \ + _sse_load(sm->s0); \ + _sse_load_up(sm->s2); \ + _sse_vector_i_mul(); \ + _sse_vector_sub(); \ + _sse_su3_inverse_multiply((*um)); \ + _sse_vector_cmplxcg_mul(ka3); \ + _sse_load(rs.s0); \ + _sse_vector_add(); \ + _sse_store_nt(rn->s0); \ + _sse_load(rs.s2); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_store_nt(rn->s2); \ + _sse_load(sm->s1); \ + _sse_load_up(sm->s3); \ + _sse_vector_i_mul(); \ + _sse_vector_add(); \ + _sse_su3_inverse_multiply((*um)); \ + _sse_vector_cmplxcg_mul(ka3); \ + _sse_load(rs.s1); \ + _sse_vector_add(); \ + _sse_store_nt(rn->s1); \ + _sse_load(rs.s3); \ + _sse_vector_i_mul(); \ + _sse_vector_sub(); \ + _sse_store_nt(rn->s3); + +#define _hop_mul_g5_cmplx_and_store() \ + _sse_load_up(rn->s0); \ + _sse_vector_cmplx_mul(cf); \ + _sse_store_nt_up(rn->s0); \ + _sse_load_up(rn->s1); \ + _sse_vector_cmplx_mul(cf); \ + _sse_store_nt_up(rn->s1); \ + _sse_load_up(rn->s2); \ + _sse_vector_cmplxcg_mul(cf); \ + _sse_store_nt_up(rn->s2); \ + _sse_load_up(rn->s3); \ + _sse_vector_cmplxcg_mul(cf); \ + _sse_store_nt_up(rn->s3); + +#define _g5_cmplx_sub_hop_and_g5store() \ + _sse_load_up(pn->s0); \ + _sse_vector_cmplx_mul(cf); \ + _sse_load(rn->s0); \ + _sse_vector_sub_up(); \ + _sse_store_nt_up(rn->s0); \ + _sse_load_up(pn->s1); \ + _sse_vector_cmplx_mul(cf); \ + 
_sse_load(rn->s1); \ + _sse_vector_sub_up(); \ + _sse_store_nt_up(rn->s1); \ + _sse_load_up(pn->s2); \ + _sse_vector_cmplxcg_mul(cf); \ + _sse_load(rn->s2); \ + _sse_vector_sub(); \ + _sse_store_nt(rn->s2); \ + _sse_load_up(pn->s3); \ + _sse_vector_cmplxcg_mul(cf); \ + _sse_load(rn->s3); \ + _sse_vector_sub(); \ + _sse_store_nt(rn->s3); + +#define _store_res() + +# else + +#define _declare_regs() \ + su3_vector ALIGN psi, chi; \ + spinor ALIGN temp; + +#define _hop_t_p() \ + _vector_add(psi,sp->s0,sp->s2); \ + _su3_multiply(chi,(*up),psi); \ + _complex_times_vector(psi,ka0,chi); \ + _vector_assign(temp.s0,psi); \ + _vector_assign(temp.s2,psi); \ + _vector_add(psi,sp->s1,sp->s3); \ + _su3_multiply(chi,(*up),psi); \ + _complex_times_vector(psi,ka0,chi); \ + _vector_assign(temp.s1,psi); \ + _vector_assign(temp.s3,psi); + +#define _hop_t_m() \ + _vector_sub(psi,sm->s0,sm->s2); \ + _su3_inverse_multiply(chi,(*um),psi); \ + _complexcjg_times_vector(psi,ka0,chi); \ + _vector_add_assign(temp.s0,psi); \ + _vector_sub_assign(temp.s2,psi); \ + _vector_sub(psi,sm->s1,sm->s3); \ + _su3_inverse_multiply(chi,(*um),psi); \ + _complexcjg_times_vector(psi,ka0,chi); \ + _vector_add_assign(temp.s1,psi); \ + _vector_sub_assign(temp.s3,psi); + +#define _hop_x_p() \ + _vector_i_add(psi,sp->s0,sp->s3); \ + _su3_multiply(chi,(*up),psi); \ + _complex_times_vector(psi,ka1,chi); \ + _vector_add_assign(temp.s0,psi); \ + _vector_i_sub_assign(temp.s3,psi); \ + _vector_i_add(psi,sp->s1,sp->s2); \ + _su3_multiply(chi,(*up),psi); \ + _complex_times_vector(psi,ka1,chi); \ + _vector_add_assign(temp.s1,psi); \ + _vector_i_sub_assign(temp.s2,psi); + +#define _hop_x_m() \ + _vector_i_sub(psi,sm->s0,sm->s3); \ + _su3_inverse_multiply(chi,(*um),psi); \ + _complexcjg_times_vector(psi,ka1,chi); \ + _vector_add_assign(temp.s0,psi); \ + _vector_i_add_assign(temp.s3,psi); \ + _vector_i_sub(psi,sm->s1,sm->s2); \ + _su3_inverse_multiply(chi,(*um),psi); \ + _complexcjg_times_vector(psi,ka1,chi); \ + 
_vector_add_assign(temp.s1,psi); \ + _vector_i_add_assign(temp.s2,psi); + +#define _hop_y_p() \ + _vector_add(psi,sp->s0,sp->s3); \ + _su3_multiply(chi,(*up),psi); \ + _complex_times_vector(psi,ka2,chi); \ + _vector_add_assign(temp.s0,psi); \ + _vector_add_assign(temp.s3,psi); \ + _vector_sub(psi,sp->s1,sp->s2); \ + _su3_multiply(chi,(*up),psi); \ + _complex_times_vector(psi,ka2,chi); \ + _vector_add_assign(temp.s1,psi); \ + _vector_sub_assign(temp.s2,psi); + +#define _hop_y_m() \ + _vector_sub(psi,sm->s0,sm->s3); \ + _su3_inverse_multiply(chi,(*um),psi); \ + _complexcjg_times_vector(psi,ka2,chi); \ + _vector_add_assign(temp.s0,psi); \ + _vector_sub_assign(temp.s3,psi); \ + _vector_add(psi,sm->s1,sm->s2); \ + _su3_inverse_multiply(chi,(*um),psi); \ + _complexcjg_times_vector(psi,ka2,chi); \ + _vector_add_assign(temp.s1,psi); \ + _vector_add_assign(temp.s2,psi); + +#define _hop_z_p() \ + _vector_i_add(psi,sp->s0,sp->s2); \ + _su3_multiply(chi,(*up),psi); \ + _complex_times_vector(psi,ka3,chi); \ + _vector_add_assign(temp.s0,psi); \ + _vector_i_sub_assign(temp.s2,psi); \ + _vector_i_sub(psi,sp->s1,sp->s3); \ + _su3_multiply(chi,(*up),psi); \ + _complex_times_vector(psi,ka3,chi); \ + _vector_add_assign(temp.s1,psi); \ + _vector_i_add_assign(temp.s3,psi); + +#define _hop_z_m() \ + _vector_i_sub(psi,sm->s0,sm->s2); \ + _su3_inverse_multiply(chi,(*um),psi); \ + _complexcjg_times_vector(psi,ka3,chi); \ + _vector_add_assign(temp.s0, psi); \ + _vector_i_add_assign(temp.s2, psi); \ + _vector_i_add(psi,sm->s1,sm->s3); \ + _su3_inverse_multiply(chi,(*um),psi); \ + _complexcjg_times_vector(psi,ka3,chi); \ + _vector_add_assign(temp.s1, psi); \ + _vector_i_sub_assign(temp.s3, psi); + +#define _hop_mul_g5_cmplx_and_store() \ + _complex_times_vector(rn->s0, cfactor, temp.s0); \ + _complex_times_vector(rn->s1, cfactor, temp.s1); \ + _complexcjg_times_vector(rn->s2, cfactor, temp.s2); \ + _complexcjg_times_vector(rn->s3, cfactor, temp.s3); + +#define _g5_cmplx_sub_hop_and_g5store() \ 
+  _complex_times_vector(psi, cfactor, pn->s0); \
+  _vector_sub(rn->s0, psi, temp.s0); \
+  _complex_times_vector(chi, cfactor, pn->s1); \
+  _vector_sub(rn->s1, chi, temp.s1); \
+  _complexcjg_times_vector(psi, cfactor, pn->s2); \
+  _vector_sub(rn->s2, temp.s2, psi); \
+  _complexcjg_times_vector(chi, cfactor, pn->s3); \
+  _vector_sub(rn->s3, temp.s3, chi);
+/* Generic variant of _store_res(): copies the four components accumulated in temp into the output spinor *rn. */
+#define _store_res() \
+  _vector_assign(rn->s0, temp.s0); \
+  _vector_assign(rn->s1, temp.s1); \
+  _vector_assign(rn->s2, temp.s2); \
+  _vector_assign(rn->s3, temp.s3);
+
+# endif
+
+#endif
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/hopping_bg_dbl.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/hopping_bg_dbl.c
new file mode 100644
index 0000000000000000000000000000000000000000..498feb91a5e6516884791b6d62a94e856be30e56
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/hopping_bg_dbl.c
@@ -0,0 +1,193 @@
+/**********************************************************************
+ *
+ *
+ * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Carsten Urbach
+ *
+ * BG and halfspinor versions (C) 2007, 2008 Carsten Urbach
+ *
+ * This file is based on an implementation of the Dirac operator
+ * written by Martin Luescher, modified by Martin Hasenbusch in 2002
+ * and modified and extended by Carsten Urbach from 2003-2008
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ *
+ **********************************************************************/
+/* Hopping_Matrix: for each of the VOLUME/2 sites of the output half-lattice, runs the eight _hop_* macros on neighbours read from k and stores the result in l through _store_res(). */
+/* ieo == 0 selects site offset 0, any other value selects (VOLUME+RAND)/2; l is indexed from 0, k is indexed through g_lexic2eosub[]. */
+void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){
+  int icx,icy,icz,ioff,ioff2; /* NOTE(review): ioff2 is assigned below but never read in this function */
+  int ix,iy,iz;
+  su3 * restrict up ALIGN; /* up/um/sp/sm/rn are referenced by name inside the _hop_* and _store_res macros */
+  su3 * restrict um ALIGN;
+  spinor * restrict sp ALIGN;
+  spinor * restrict sm ALIGN;
+  spinor * restrict rn ALIGN;
+  _declare_regs(); /* variant-specific scratch declarations consumed by the _hop_* macros */
+/* Aliasing hint for the compiler; hopping_body_dbl.c guards the same pragma with #ifdef XLC, here it is unconditional. */
+#pragma disjoint(*sp, *sm, *rn, *up, *um, *l, *k)
+
+  __alignx(16,l);
+  __alignx(16,k);
+
+#ifdef _GAUGE_COPY
+  if(g_update_gauge_copy) {
+    update_backward_gauge(g_gauge_field);
+  }
+#endif
+
+# if (defined MPI && !(defined _NO_COMM))
+  xchange_field(k, ieo); /* halo exchange of the input field before any neighbour is read */
+# endif
+
+  if(ieo == 0){
+    ioff = 0;
+  }
+  else{
+    ioff = (VOLUME+RAND)/2;
+  }
+  ioff2 = (VOLUME+RAND)/2-ioff;
+/* Prime the pipeline: sp/up for the first site's _hop_t_p() are set up before the loop is entered. */
+  ix=g_eo2lexic[ioff];
+  iy=g_iup[ix][0];
+  icy=g_lexic2eosub[iy];
+
+  sp=k+icy;
+
+# if ((defined _GAUGE_COPY))
+  up=&g_gauge_field_copy[ioff][0];
+# else
+  up=&g_gauge_field[ix][0];
+# endif
+  /**************** loop over all lattice sites ******************/
+  for(icx = ioff; icx < (VOLUME/2+ioff); icx++){
+    rn=l+(icx-ioff);
+    ix=g_eo2lexic[icx];
+    /*********************** direction +0 ************************/
+    iy=g_idn[ix][0];
+    icy=g_lexic2eosub[iy];
+# if (!defined _GAUGE_COPY)
+    um=&g_gauge_field[iy][0];
+# else
+    um=up+1;
+# endif
+    sm=k+icy; /* um/sm are prepared for the following _hop_t_m(); _hop_t_p() consumes the sp/up set earlier */
+
+    _hop_t_p();
+
+    /*********************** direction -0 ************************/
+
+    iy=g_iup[ix][1];
+    icy=g_lexic2eosub[iy];
+
+# if ((defined _GAUGE_COPY))
+    up=um+1;
+# else
+    up+=1;
+# endif
+    sp=k+icy; /* sp/up now point at the data used by _hop_x_p() further down */
+
+
+    _hop_t_m();
+
+    /*********************** direction +1 ************************/
+
+    iy=g_idn[ix][1];
+    icy=g_lexic2eosub[iy];
+
+# ifndef _GAUGE_COPY
+    um=&g_gauge_field[iy][1];
+# else
+    um = up+1;
+# endif
+    sm=k+icy;
+    _hop_x_p();
+
+    /*********************** direction -1 ************************/
+
+    iy=g_iup[ix][2];
+    icy=g_lexic2eosub[iy];
+
+# if ((defined _GAUGE_COPY))
+    up=um+1;
+# else
+    up+=1;
+# endif
+    sp=k+icy;
+
+    _hop_x_m();
+
+    /*********************** direction +2 ************************/
+
+    iy=g_idn[ix][2];
+    icy=g_lexic2eosub[iy];
+
+# ifndef _GAUGE_COPY
+    um=&g_gauge_field[iy][2];
+# else
+    um= up+1;
+# endif
+    sm=k+icy;
+
+    _hop_y_p();
+
+
+    /*********************** direction -2 ************************/
+
+    iy=g_iup[ix][3];
+    icy=g_lexic2eosub[iy];
+
+# if ((defined _GAUGE_COPY))
+    up=um+1;
+# else
+    up+=1;
+# endif
+    sp=k+icy;
+    _hop_y_m();
+
+    /*********************** direction +3 ************************/
+
+    iy=g_idn[ix][3];
+    icy=g_lexic2eosub[iy];
+
+# ifndef _GAUGE_COPY
+    um=&g_gauge_field[iy][3];
+# else
+    um=up+1;
+# endif
+    sm=k+icy;
+    _hop_z_p();
+
+    /*********************** direction -3 ************************/
+
+    icz=icx+1; /* look ahead: sp/up set below belong to the next site's _hop_t_p() */
+    if(icz==((VOLUME+RAND)/2+ioff)) icz=ioff; /* NOTE(review): the loop stops at VOLUME/2+ioff, so this wrap only fires when (VOLUME+RAND)/2 == VOLUME/2 */
+    iz=g_eo2lexic[icz];
+    iy=g_iup[iz][0]; icy=g_lexic2eosub[iy];
+
+
+
+# if ((defined _GAUGE_COPY))
+    up=um+1;
+# else
+    up=&g_gauge_field[iz][0];
+# endif
+    sp=k+icy;
+    _hop_z_m();
+    _store_res(); /* writes the accumulated result for this site to *rn */
+
+    /************************ end of loop ************************/
+  }
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/hopping_body_dbl.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/hopping_body_dbl.c
new file mode 100644
index 0000000000000000000000000000000000000000..ba7a434734e59adfd1789f3c1eb7b9579a499df4
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/hopping_body_dbl.c
@@ -0,0 +1,181 @@
+/**********************************************************************
+ *
+ *
+ * Copyright (C) 2012 Carsten Urbach, Bartosz Kostrzewa
+ *
+ * This file is based on an implementation of the Dirac operator
+ * written by Martin Luescher, modified by Martin Hasenbusch in 2002
+ * and modified and extended by Carsten Urbach from 2003-2008
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see .
+ *
+ **********************************************************************/
+/* Function-body fragment: ieo, l, k (and p, pn under _TM_SUB_HOP) are used but not declared here, so they must come from the enclosing context -- presumably the file that #includes this one. */
+  int ioff;
+  int * hi; /* cursor into g_hi[]; the code below addresses it with a stride of 16 ints per site */
+  su3 * restrict ALIGN up; /* up/um/sp/sm/rn are referenced by name inside the _hop_* and store macros */
+  su3 * restrict ALIGN um;
+  spinor * restrict ALIGN sp;
+  spinor * restrict ALIGN sm;
+  spinor * restrict ALIGN rn;
+
+#ifdef XLC
+# pragma disjoint(*sp, *sm, *rn, *up, *um, *l)
+#endif
+  _declare_regs(); /* variant-specific scratch declarations consumed by the _hop_* macros */
+
+  if(ieo == 0){
+    ioff = 0;
+  }
+  else{
+    ioff = (VOLUME+RAND)/2;
+  }
+/* Without OMP the first site's up/sp are set once here and each iteration prepares them for the next one (see "direction -3"). */
+#ifndef OMP
+  hi = &g_hi[16*ioff];
+
+# if ((defined _GAUGE_COPY))
+  up=&g_gauge_field_copy[ioff][0];
+# else
+  up=&g_gauge_field[(*hi)][0];
+# endif
+  hi++;
+  sp=k+(*hi);
+  hi++;
+#endif
+
+  /**************** loop over all lattice sites ******************/
+#ifdef OMP
+# pragma omp for
+#endif
+  for(int icx = ioff; icx < (VOLUME/2+ioff); icx++){
+#ifdef OMP
+    hi = &g_hi[16*icx]; /* with OMP every iteration re-derives hi/up/sp from icx, so nothing is carried between iterations */
+# if ((defined _GAUGE_COPY))
+    up=&g_gauge_field_copy[icx][0];
+# else
+    up=&g_gauge_field[(*hi)][0];
+# endif
+    hi++;
+    sp=k+(*hi);
+    hi++;
+#endif
+    rn=l+(icx-ioff); /* output spinor for this site, indexed from 0 */
+#ifdef _TM_SUB_HOP
+    pn=p+(icx-ioff);
+#endif
+    /*********************** direction +t ************************/
+# if (!defined _GAUGE_COPY)
+    um=&g_gauge_field[(*hi)][0];
+# else
+    um=up+1;
+# endif
+    hi++;
+    sm=k+(*hi);
+    hi+=2; /* steps over one g_hi entry that is never read here -- presumably the link index for the next up; TODO confirm against the g_hi layout */
+
+    _hop_t_p();
+
+    /*********************** direction -t ************************/
+# if ((defined _GAUGE_COPY))
+    up=um+1;
+# else
+    up+=1;
+# endif
+    sp=k+(*hi);
+    hi++;
+
+    _hop_t_m();
+
+    /*********************** direction +1 ************************/
+# ifndef _GAUGE_COPY
+    um=&g_gauge_field[(*hi)][1];
+# else
+    um = up+1;
+# endif
+    hi++;
+    sm=k+(*hi);
+    hi+=2;
+
+    _hop_x_p();
+
+    /*********************** direction -1 ************************/
+# if ((defined _GAUGE_COPY))
+    up=um+1;
+# else
+    up+=1;
+# endif
+    sp=k+(*hi);
+    hi++;
+
+    _hop_x_m();
+
+    /*********************** direction +2 ************************/
+# ifndef _GAUGE_COPY
+    um=&g_gauge_field[(*hi)][2];
+# else
+    um= up+1;
+# endif
+    hi++;
+    sm=k+(*hi);
+    hi+=2;
+
+    _hop_y_p();
+
+    /*********************** direction -2 ************************/
+# if ((defined _GAUGE_COPY))
+    up=um+1;
+# else
+    up+=1;
+# endif
+    sp=k+(*hi);
+    hi++;
+
+    _hop_y_m();
+
+    /*********************** direction +3 ************************/
+# ifndef _GAUGE_COPY
+    um=&g_gauge_field[(*hi)][3];
+# else
+    um=up+1;
+# endif
+    hi++;
+    sm=k+(*hi);
+    hi++;
+
+    _hop_z_p();
+
+    /*********************** direction -3 ************************/
+#ifndef OMP
+# if ((defined _GAUGE_COPY))
+    up=um+1;
+# else
+    up=&g_gauge_field[(*hi)][0];
+# endif
+    hi++;
+    sp=k+(*hi);
+    hi++;
+#endif
+    _hop_z_m();
+/* Exactly one of the three finishing macros runs, selected at compile time by _MUL_G5_CMPLX / _TM_SUB_HOP. */
+#ifdef _MUL_G5_CMPLX
+    _hop_mul_g5_cmplx_and_store();
+#elif defined _TM_SUB_HOP
+    _g5_cmplx_sub_hop_and_g5store();
+#else
+    _store_res();
+#endif
+  }
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/hopping_sgl.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/hopping_sgl.c
new file mode 100644
index 0000000000000000000000000000000000000000..843fd6a1bdb6cf7a9e1e8c62af1bb3d86e199f6d
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/hopping_sgl.c
@@ -0,0 +1,288 @@
+/**********************************************************************
+ *
+ *
+ * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Carsten Urbach
+ *
+ * BG and halfspinor versions (C) 2007, 2008 Carsten Urbach
+ *
+ * This file is based on an implementation of the Dirac operator
+ * written by Martin Luescher, modified by
Martin Hasenbusch in 2002 + * and modified and extended by Carsten Urbach from 2003-2008 + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + **********************************************************************/ + +static su3_vector32 psi1, psi2, psi, chi, phi1, phi3; + +/* l output , k input*/ +/* for ieo=0, k resides on odd sites and l on even sites */ +void Hopping_Matrix(int ieo, spinor32 * const l, spinor32 * const k){ + int ix,iy; + int ioff,ioff2,icx,icy; + su332 * restrict up, * restrict um; + spinor32 * restrict r, * restrict sp, * restrict sm; + spinor32 temp; + +#ifdef _GAUGE_COPY + if(g_update_gauge_copy) { + update_backward_gauge(); + } +#endif + + /* for parallelization */ +# if (defined MPI && !(defined _NO_COMM)) + xchange_field(k, ieo); +# endif + + if(k == l){ + printf("Error in H_psi (simple.c):\n"); + printf("Arguments k and l must be different\n"); + printf("Program aborted\n"); + exit(1); + } + if(ieo == 0){ + ioff = 0; + } + else{ + ioff = (VOLUME+RAND)/2; + } + ioff2 = (VOLUME+RAND)/2-ioff; + /**************** loop over all lattice sites ****************/ + + for (icx = ioff; icx < (VOLUME/2 + ioff); icx++){ + ix=g_eo2lexic[icx]; + + r=l+(icx-ioff); + + /*********************** direction +0 ************************/ + iy=g_iup[ix][0]; icy=g_lexic2eosub[iy]; + + + sp=k+icy; +# if ((defined _GAUGE_COPY)) + up=&g_gauge_field_copy[icx][0]; +# else + 
up=&g_gauge_field[ix][0]; +# endif + + _vector_add(psi,(*sp).s0,(*sp).s2); + + _su3_multiply(chi,(*up),psi); + _complex_times_vector(psi,ka0,chi); + + _vector_assign(temp.s0,psi); + _vector_assign(temp.s2,psi); + + _vector_add(psi,(*sp).s1,(*sp).s3); + + _su3_multiply(chi,(*up),psi); + _complex_times_vector(psi,ka0,chi); + + _vector_assign(temp.s1,psi); + _vector_assign(temp.s3,psi); + + /*********************** direction -0 ************************/ + + iy=g_idn[ix][0]; icy=g_lexic2eosub[iy]; + + sm=k+icy; +# if ((defined _GAUGE_COPY)) + um = up+1; +# else + um=&g_gauge_field[iy][0]; +# endif + + _vector_sub(psi,(*sm).s0,(*sm).s2); + + _su3_inverse_multiply(chi,(*um),psi); + _complexcjg_times_vector(psi,ka0,chi); + + _vector_add_assign(temp.s0,psi); + _vector_sub_assign(temp.s2,psi); + + _vector_sub(psi,(*sm).s1,(*sm).s3); + + _su3_inverse_multiply(chi,(*um),psi); + _complexcjg_times_vector(psi,ka0,chi); + + _vector_add_assign(temp.s1,psi); + _vector_sub_assign(temp.s3,psi); + + /*********************** direction +1 ************************/ + + iy=g_iup[ix][1]; icy=g_lexic2eosub[iy]; + + sp=k+icy; + +# if ((defined _GAUGE_COPY)) + up=um+1; +# else + up+=1; +# endif + + _vector_i_add(psi,(*sp).s0,(*sp).s3); + + _su3_multiply(chi,(*up),psi); + _complex_times_vector(psi,ka1,chi); + + _vector_add_assign(temp.s0,psi); + _vector_i_sub_assign(temp.s3,psi); + + _vector_i_add(psi,(*sp).s1,(*sp).s2); + + _su3_multiply(chi,(*up),psi); + _complex_times_vector(psi,ka1,chi); + + _vector_add_assign(temp.s1,psi); + _vector_i_sub_assign(temp.s2,psi); + + /*********************** direction -1 ************************/ + + iy=g_idn[ix][1]; icy=g_lexic2eosub[iy]; + + sm=k+icy; +# ifndef _GAUGE_COPY + um=&g_gauge_field[iy][1]; +# else + um=up+1; +# endif + + _vector_i_sub(psi,(*sm).s0,(*sm).s3); + + _su3_inverse_multiply(chi,(*um),psi); + _complexcjg_times_vector(psi,ka1,chi); + + _vector_add_assign(temp.s0,psi); + _vector_i_add_assign(temp.s3,psi); + + 
_vector_i_sub(psi,(*sm).s1,(*sm).s2); + + _su3_inverse_multiply(chi,(*um),psi); + _complexcjg_times_vector(psi,ka1,chi); + + _vector_add_assign(temp.s1,psi); + _vector_i_add_assign(temp.s2,psi); + + /*********************** direction +2 ************************/ + + iy=g_iup[ix][2]; icy=g_lexic2eosub[iy]; + + sp=k+icy; +# if ((defined _GAUGE_COPY)) + up=um+1; +# else + up+=1; +# endif + _vector_add(psi,(*sp).s0,(*sp).s3); + + _su3_multiply(chi,(*up),psi); + _complex_times_vector(psi,ka2,chi); + + _vector_add_assign(temp.s0,psi); + _vector_add_assign(temp.s3,psi); + + _vector_sub(psi,(*sp).s1,(*sp).s2); + + _su3_multiply(chi,(*up),psi); + _complex_times_vector(psi,ka2,chi); + + _vector_add_assign(temp.s1,psi); + _vector_sub_assign(temp.s2,psi); + + + /*********************** direction -2 ************************/ + + iy=g_idn[ix][2]; icy=g_lexic2eosub[iy]; + + sm=k+icy; +# ifndef _GAUGE_COPY + um = &g_gauge_field[iy][2]; +# else + um = up +1; +# endif + + _vector_sub(psi,(*sm).s0,(*sm).s3); + + _su3_inverse_multiply(chi,(*um),psi); + _complexcjg_times_vector(psi,ka2,chi); + + _vector_add_assign(temp.s0,psi); + _vector_sub_assign(temp.s3,psi); + + _vector_add(psi,(*sm).s1,(*sm).s2); + + _su3_inverse_multiply(chi,(*um),psi); + _complexcjg_times_vector(psi,ka2,chi); + + _vector_add_assign(temp.s1,psi); + _vector_add_assign(temp.s2,psi); + + /*********************** direction +3 ************************/ + + iy=g_iup[ix][3]; icy=g_lexic2eosub[iy]; + + sp=k+icy; +# if ((defined _GAUGE_COPY)) + up=um+1; +# else + up+=1; +# endif + _vector_i_add(psi,(*sp).s0,(*sp).s2); + + _su3_multiply(chi,(*up),psi); + _complex_times_vector(psi,ka3,chi); + + _vector_add_assign(temp.s0,psi); + _vector_i_sub_assign(temp.s2,psi); + + _vector_i_sub(psi,(*sp).s1,(*sp).s3); + + _su3_multiply(chi,(*up),psi); + _complex_times_vector(psi,ka3,chi); + + _vector_add_assign(temp.s1,psi); + _vector_i_add_assign(temp.s3,psi); + + /*********************** direction -3 ************************/ + + 
iy=g_idn[ix][3]; icy=g_lexic2eosub[iy]; + + sm=k+icy; +# ifndef _GAUGE_COPY + um = &g_gauge_field[iy][3]; +# else + um = up+1; +# endif + + _vector_i_sub(psi,(*sm).s0,(*sm).s2); + + _su3_inverse_multiply(chi,(*um),psi); + _complexcjg_times_vector(psi,ka3,chi); + + _vector_add((*r).s0, temp.s0, psi); + _vector_i_add((*r).s2, temp.s2, psi); + + _vector_i_add(psi,(*sm).s1,(*sm).s3); + + _su3_inverse_multiply(chi,(*um),psi); + _complexcjg_times_vector(psi,ka3,chi); + + _vector_add((*r).s1, temp.s1, psi); + _vector_i_sub((*r).s3, temp.s3, psi); + /************************ end of loop ************************/ + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/hopping_sse_dbl.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/hopping_sse_dbl.c new file mode 100644 index 0000000000000000000000000000000000000000..bb9eb70c44692a8f0e07843c81cf6749f5a38b12 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/hopping_sse_dbl.c @@ -0,0 +1,393 @@ +/********************************************************************** + * + * + * Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 Carsten Urbach + * + * BG and halfspinor versions (C) 2007, 2008 Carsten Urbach + * + * This file is based on an implementation of the Dirac operator + * written by Martin Luescher, modified by Martin Hasenbusch in 2002 + * and modified and extended by Carsten Urbach from 2003-2008 + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + **********************************************************************/ + + +# if (defined _USE_TSPLITPAR) /* needs also SSE */ + +/*********************************** + * + * Aurora version + * Author: Luigi Scorzato (scorzato@ect.it) + * (last modified 20.4.2009) + * The strategy of the code is explained in the file Strategy.txt + * + ************************************/ + +/* 4. */ +/* input on k; output on l */ +void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){ + int icx,icz,ioff; + int ix,iz; + int x0,icx0,jj; + su3 *restrict up; + su3 * restrict um; + spinor * restrict sp; + spinor * restrict sm; + spinor * restrict rn; + +# if (defined MPI) +# ifdef PARALLELX +# define REQC 4 +# elif defined PARALLELXY +# define REQC 8 +# elif defined PARALLELXYZ +# define REQC 12 +# endif + MPI_Request requests[REQC]; + MPI_Status status[REQC]; +# endif + +#ifdef _GAUGE_COPY + if(g_update_gauge_copy) { + update_backward_gauge(g_gauge_field); + } +#endif + + if(ieo == 0){ /* even out - odd in */ + ioff = 0; + } + else{ /* odd out - even in */ + ioff = (VOLUME+RAND)/2; + } + + /* Loop over time direction. 
This is the outmost loop */ + for(x0=0;x0s0); + _sse_load_up(sp->s2); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka0); + _sse_store_up(rn->s0); + _sse_store_up(rn->s2); + + _sse_load(sp->s1); + _sse_load_up(sp->s3); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka0); + _sse_store_up(rn->s1); + _sse_store_up(rn->s3); + + /*********************** direction -0 ************************/ + + sm=k+g_idn_eo[icx][0]; + um=up+1; + + _sse_load(sm->s0); + _sse_load_up(sm->s2); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka0); + + _sse_load(rn->s0); + _sse_vector_add(); + _sse_store(rn->s0); + + _sse_load(rn->s2); + _sse_vector_sub(); + _sse_store(rn->s2); + + _sse_load(sm->s1); + _sse_load_up(sm->s3); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka0); + + _sse_load(rn->s1); + _sse_vector_add(); + _sse_store(rn->s1); + + _sse_load(rn->s3); + _sse_vector_sub(); + _sse_store(rn->s3); + jj++; + } /* end of loop over timeslice (At)*/ + + + /* complete the communication of the timslice borders (and wait) */ +#if (defined MPI && !defined _NO_COMM) + xchange_field_close(requests, status, REQC); /* MPI_Waitall */ +#endif + + /* loop over timeslice. 
Bt: contribution of spacelike links */ + um=&g_gauge_field_copys[icx0][0]-1; + for(icx = icx0; icx < icx0+TEOSLICE; icx++){ + ix=g_eo2lexic[icx]; + rn=l+(icx-ioff); + /*********************** direction +1 ************************/ + + sp=k+g_iup_eo[icx][1]; + up=um+1; + + _sse_load(sp->s0); + _sse_load_up(sp->s3); + _sse_vector_i_mul(); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka1); + + _sse_load(rn->s0); + _sse_vector_add(); + _sse_store(rn->s0); + + _sse_load(rn->s3); + _sse_vector_i_mul(); + _sse_vector_sub(); + _sse_store(rn->s3); + + _sse_load(sp->s1); + _sse_load_up(sp->s2); + _sse_vector_i_mul(); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka1); + + _sse_load(rn->s1); + _sse_vector_add(); + _sse_store(rn->s1); + + _sse_load(rn->s2); + _sse_vector_i_mul(); + _sse_vector_sub(); + _sse_store(rn->s2); + + /*********************** direction -1 ************************/ + + sm=k+g_idn_eo[icx][1]; + um=up+1; + + _sse_load(sm->s0); + _sse_load_up(sm->s3); + _sse_vector_i_mul(); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka1); + + _sse_load(rn->s0); + _sse_vector_add(); + _sse_store(rn->s0); + + _sse_load(rn->s3); + _sse_vector_i_mul(); + _sse_vector_add(); + _sse_store(rn->s3); + + _sse_load(sm->s1); + _sse_load_up(sm->s2); + _sse_vector_i_mul(); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka1); + + _sse_load(rn->s1); + _sse_vector_add(); + _sse_store(rn->s1); + + _sse_load(rn->s2); + _sse_vector_i_mul(); + _sse_vector_add(); + _sse_store(rn->s2); + + /*********************** direction +2 ************************/ + + sp=k+g_iup_eo[icx][2]; + up=um+1; + + _sse_load(sp->s0); + _sse_load_up(sp->s3); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka2); + + _sse_load(rn->s0); + _sse_vector_add(); + _sse_store(rn->s0); + + _sse_load(rn->s3); + _sse_vector_add(); + _sse_store(rn->s3); + + 
_sse_load(sp->s1); + _sse_load_up(sp->s2); + _sse_vector_sub(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka2); + + _sse_load(rn->s1); + _sse_vector_add(); + _sse_store(rn->s1); + + _sse_load(rn->s2); + _sse_vector_sub(); + _sse_store(rn->s2); + + /*********************** direction -2 ************************/ + + sm=k+g_idn_eo[icx][2]; + um=up+1; + + _sse_load(sm->s0); + _sse_load_up(sm->s3); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka2); + + _sse_load(rn->s0); + _sse_vector_add(); + _sse_store(rn->s0); + + _sse_load(rn->s3); + _sse_vector_sub(); + _sse_store(rn->s3); + + _sse_load(sm->s1); + _sse_load_up(sm->s2); + _sse_vector_add(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka2); + + _sse_load(rn->s1); + _sse_vector_add(); + _sse_store(rn->s1); + + _sse_load(rn->s2); + _sse_vector_add(); + _sse_store(rn->s2); + + /*********************** direction +3 ************************/ + + sp=k+g_iup_eo[icx][3]; + up=um+1; + + _sse_load(sp->s0); + _sse_load_up(sp->s2); + _sse_vector_i_mul(); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka3); + + _sse_load(rn->s0); + _sse_vector_add(); + _sse_store(rn->s0); + + _sse_load(rn->s2); + _sse_vector_i_mul(); + _sse_vector_sub(); + _sse_store(rn->s2); + + _sse_load(sp->s1); + _sse_load_up(sp->s3); + _sse_vector_i_mul(); + _sse_vector_sub(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka3); + + _sse_load(rn->s1); + _sse_vector_add(); + _sse_store(rn->s1); + + _sse_load(rn->s3); + _sse_vector_i_mul(); + _sse_vector_add(); + _sse_store(rn->s3); + + /*********************** direction -3 ************************/ + + sm=k+g_idn_eo[icx][3]; + um=up+1; + + _sse_load(sm->s0); + _sse_load_up(sm->s2); + _sse_vector_i_mul(); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka3); + + _sse_load(rn->s0); + _sse_vector_add(); + _sse_store(rn->s0); + + _sse_load(rn->s2); + _sse_vector_i_mul(); 
+ _sse_vector_add(); + _sse_store(rn->s2); + + _sse_load(sm->s1); + _sse_load_up(sm->s3); + _sse_vector_i_mul(); + _sse_vector_add(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka3); + + _sse_load(rn->s1); + _sse_vector_add(); + _sse_store(rn->s1); + + _sse_load(rn->s3); + _sse_vector_i_mul(); + _sse_vector_sub(); + _sse_store(rn->s3); + } /* end of loop over timeslice (Bt)*/ + } /* x0=0; x0. + * + **********************************************************************/ + + +/* input on k; output on l */ +void Hopping_Matrix(const int ieo, spinor * const l, spinor * const k){ + int icx,icy,icz,ioff,ioff2; + int ix,iy,iz; + su3 *restrict up; + su3 * restrict um; + spinor * restrict sp; + spinor * restrict sm; + spinor * restrict rn; + static spinor rs; + + /* for parallelization */ +#ifdef _GAUGE_COPY + if(g_update_gauge_copy) { + update_backward_gauge(); + } +#endif + +# if (defined MPI && !defined _NO_COMM) + xchange_field(k, ieo); +# endif + + if(k == l){ + printf("Error in subroutine D_psi: improper arguments\n"); + printf("Program aborted\n"); + exit(1); + } + if(ieo == 0){ + ioff = 0; + } + else{ + ioff = (VOLUME+RAND)/2; + } + ioff2 = (VOLUME+RAND)/2-ioff; + + ix=g_eo2lexic[ioff]; + iy=g_iup[ix][0]; + icy=g_lexic2eosub[iy]; + + sp=k+icy; +# if ((defined _GAUGE_COPY)) + up=&g_gauge_field_copy[ioff][0]; +# else + up=&g_gauge_field[ix][0]; +# endif + + /**************** loop over all lattice sites ******************/ + for(icx = ioff; icx < (VOLUME/2+ioff); icx++){ + ix=g_eo2lexic[icx]; + /*********************** direction +0 ************************/ + + iy=g_idn[ix][0]; icy=g_lexic2eosub[iy]; + + sm=k+icy; + _prefetch_spinor(sm); + +# if ((defined _GAUGE_COPY)) + um=up+1; +# else + um=&g_gauge_field[iy][0]; +# endif + _prefetch_su3(um); + + _sse_load((*sp).s0); + _sse_load_up((*sp).s2); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka0); + _sse_store_up(rs.s0); + _sse_store_up(rs.s2); + + _sse_load((*sp).s1); + 
_sse_load_up((*sp).s3); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka0); + _sse_store_up(rs.s1); + _sse_store_up(rs.s3); + + /*********************** direction -0 ************************/ + + iy=g_iup[ix][1]; icy=g_lexic2eosub[iy]; + + sp=k+icy; + _prefetch_spinor(sp); + +# if ((defined _GAUGE_COPY)) + up = um + 1; +# else + up+=1; +# endif + _prefetch_su3(up); + + _sse_load((*sm).s0); + _sse_load_up((*sm).s2); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka0); + + _sse_load(rs.s0); + _sse_vector_add(); + _sse_store(rs.s0); + + _sse_load(rs.s2); + _sse_vector_sub(); + _sse_store(rs.s2); + + _sse_load((*sm).s1); + _sse_load_up((*sm).s3); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka0); + + _sse_load(rs.s1); + _sse_vector_add(); + _sse_store(rs.s1); + + _sse_load(rs.s3); + _sse_vector_sub(); + _sse_store(rs.s3); + + /*********************** direction +1 ************************/ + + iy=g_idn[ix][1]; icy=g_lexic2eosub[iy]; + + sm=k+icy; + _prefetch_spinor(sm); + +# ifndef _GAUGE_COPY + um=&g_gauge_field[iy][1]; +# else + um=up+1; +# endif + _prefetch_su3(um); + + _sse_load((*sp).s0); + _sse_load_up((*sp).s3); + _sse_vector_i_mul(); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka1); + + _sse_load(rs.s0); + _sse_vector_add(); + _sse_store(rs.s0); + + _sse_load(rs.s3); + _sse_vector_i_mul(); + _sse_vector_sub(); + _sse_store(rs.s3); + + _sse_load((*sp).s1); + _sse_load_up((*sp).s2); + _sse_vector_i_mul(); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka1); + + _sse_load(rs.s1); + _sse_vector_add(); + _sse_store(rs.s1); + + _sse_load(rs.s2); + _sse_vector_i_mul(); + _sse_vector_sub(); + _sse_store(rs.s2); + + /*********************** direction -1 ************************/ + + iy=g_iup[ix][2]; icy=g_lexic2eosub[iy]; + + sp=k+icy; + _prefetch_spinor(sp); + +# if ((defined _GAUGE_COPY)) + up = um + 1; +# 
else + up+=1; +# endif + _prefetch_su3(up); + + _sse_load((*sm).s0); + _sse_load_up((*sm).s3); + _sse_vector_i_mul(); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka1); + + _sse_load(rs.s0); + _sse_vector_add(); + _sse_store(rs.s0); + + _sse_load(rs.s3); + _sse_vector_i_mul(); + _sse_vector_add(); + _sse_store(rs.s3); + + _sse_load((*sm).s1); + _sse_load_up((*sm).s2); + _sse_vector_i_mul(); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka1); + + _sse_load(rs.s1); + _sse_vector_add(); + _sse_store(rs.s1); + + _sse_load(rs.s2); + _sse_vector_i_mul(); + _sse_vector_add(); + _sse_store(rs.s2); + + /*********************** direction +2 ************************/ + + iy=g_idn[ix][2]; icy=g_lexic2eosub[iy]; + + sm=k+icy; + _prefetch_spinor(sm); + +# ifndef _GAUGE_COPY + um=&g_gauge_field[iy][2]; +# else + um=up+1; +# endif + _prefetch_su3(um); + + _sse_load((*sp).s0); + _sse_load_up((*sp).s3); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka2); + + _sse_load(rs.s0); + _sse_vector_add(); + _sse_store(rs.s0); + + _sse_load(rs.s3); + _sse_vector_add(); + _sse_store(rs.s3); + + _sse_load((*sp).s1); + _sse_load_up((*sp).s2); + _sse_vector_sub(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka2); + + _sse_load(rs.s1); + _sse_vector_add(); + _sse_store(rs.s1); + + _sse_load(rs.s2); + _sse_vector_sub(); + _sse_store(rs.s2); + + /*********************** direction -2 ************************/ + + iy=g_iup[ix][3]; icy=g_lexic2eosub[iy]; + + sp=k+icy; + _prefetch_spinor(sp); + +# if ((defined _GAUGE_COPY)) + up = um + 1; +# else + up+=1; +# endif + _prefetch_su3(up); + + _sse_load((*sm).s0); + _sse_load_up((*sm).s3); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka2); + + _sse_load(rs.s0); + _sse_vector_add(); + _sse_store(rs.s0); + + _sse_load(rs.s3); + _sse_vector_sub(); + _sse_store(rs.s3); + + _sse_load((*sm).s1); + 
_sse_load_up((*sm).s2); + _sse_vector_add(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka2); + + _sse_load(rs.s1); + _sse_vector_add(); + _sse_store(rs.s1); + + _sse_load(rs.s2); + _sse_vector_add(); + _sse_store(rs.s2); + + /*********************** direction +3 ************************/ + + iy=g_idn[ix][3]; icy=g_lexic2eosub[iy]; + + sm=k+icy; + _prefetch_spinor(sm); + +# ifndef _GAUGE_COPY + um=&g_gauge_field[iy][3]; +# else + um=up+1; +# endif + _prefetch_su3(um); + + _sse_load((*sp).s0); + _sse_load_up((*sp).s2); + _sse_vector_i_mul(); + _sse_vector_add(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka3); + + _sse_load(rs.s0); + _sse_vector_add(); + _sse_store(rs.s0); + + _sse_load(rs.s2); + _sse_vector_i_mul(); + _sse_vector_sub(); + _sse_store(rs.s2); + + _sse_load((*sp).s1); + _sse_load_up((*sp).s3); + _sse_vector_i_mul(); + _sse_vector_sub(); + + _sse_su3_multiply((*up)); + _sse_vector_cmplx_mul(ka3); + + _sse_load(rs.s1); + _sse_vector_add(); + _sse_store(rs.s1); + + _sse_load(rs.s3); + _sse_vector_i_mul(); + _sse_vector_add(); + _sse_store(rs.s3); + + /*********************** direction -3 ************************/ + + icz=icx+1; + if(icz==((VOLUME+RAND)/2+ioff)) icz=ioff; + iz=g_eo2lexic[icz]; + iy=g_iup[iz][0]; icy=g_lexic2eosub[iy]; + + sp=k+icy; + _prefetch_spinor(sp); + +# if ((defined _GAUGE_COPY)) + up=&g_gauge_field_copy[icz][0]; +# else + up=&g_gauge_field[iz][0]; +# endif + _prefetch_su3(up); + + _sse_load((*sm).s0); + _sse_load_up((*sm).s2); + _sse_vector_i_mul(); + _sse_vector_sub(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka3); + + rn=l+(icx-ioff); + + _sse_load(rs.s0); + _sse_vector_add(); + _sse_store_nt((*rn).s0); + + _sse_load(rs.s2); + _sse_vector_i_mul(); + _sse_vector_add(); + _sse_store_nt((*rn).s2); + + _sse_load((*sm).s1); + _sse_load_up((*sm).s3); + _sse_vector_i_mul(); + _sse_vector_add(); + + _sse_su3_inverse_multiply((*um)); + _sse_vector_cmplxcg_mul(ka3); + + 
_sse_load(rs.s1); + _sse_vector_add(); + _sse_store_nt((*rn).s1); + + _sse_load(rs.s3); + _sse_vector_i_mul(); + _sse_vector_sub(); + _sse_store_nt((*rn).s3); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators.c new file mode 100644 index 0000000000000000000000000000000000000000..a0361f00218437a496939267594a83025fa9872c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators.c @@ -0,0 +1,863 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * * + * This file contains operators for twisted mass Wilson QCD * + * prepared for even odd preconditioning * + * * + * see documentation for details * + * Author: Carsten Urbach * + * urbach@physik.fu-berlin.de * + **************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include "global.h" +#include "su3.h" +#include "operator/Hopping_Matrix.h" +#include "operator/Hopping_Matrix_nocom.h" +#include "operator/tm_times_Hopping_Matrix.h" +#include "operator/tm_sub_Hopping_Matrix.h" +#include "sse.h" +#include "linalg_eo.h" +#include "gamma.h" +#include "operator/D_psi.h" +#ifdef BGL +# include "bgl.h" +#endif +#ifdef BGQ +# include "bgq.h" +#endif + +#ifndef BENCHMARK +#include "solver/dirac_operator_eigenvectors.h" +#else +#include "benchmark_deps.h +#endif + +#include "tm_operators.h" + +#if (defined SSE2 || defined SSE3 || defined BGL) +const int predist=2; +#endif +/* internal */ + +/****************************************** + * mul_one_pm_imu_inv computes + * l = (1\pm i\mu\gamma_5)^{-1} * l + * + * sign is the sign used in + * 1\pm i\mu\gamma_5 + * l is number of input and output field + * + ******************************************/ +void mul_one_pm_imu_inv(spinor * const l, const double _sign, const int N); +void mul_one_pm_imu(spinor * const l, const double _sign); +/****************************************** + * mul_one_pm_imu_sub_mul_gamma5 computes + * l = gamma_5*((1\pm i\mu\gamma_5)*k - j) + * + * l is the number of the output field + * k and j the numbers of the input fields + * + * sign indicates which sign should be used + * in 1\pm i\mu\gamma_5 + ******************************************/ +void mul_one_pm_imu_sub_mul_gamma5(spinor * const l, spinor * const k, + spinor * const j, const double _sign); +void mul_one_sub_mul_gamma5(spinor * const l, spinor * const k, + spinor * const j); + +/****************************************** + * mul_one_pm_imu_sub_mul computes + * l = 
((1\pm i\mu\gamma_5)*k - j)
+ *
+ * l is the output field
+ * k and j are the input fields
+ *
+ * sign indicates which sign should be used
+ * in 1\pm i\mu\gamma_5
+ * N is presumably the number of spinors to process
+ *   (the definition is not in view here; TODO confirm)
+ ******************************************/
+void mul_one_pm_imu_sub_mul(spinor * const l, spinor * const k,
+                            spinor * const j, const double _sign, const int N);
+void tm_sub_H_eo_gamma5(spinor* const l, spinor * const p, spinor * const k,
+                        const int ieo, const double _sign);
+/* tm_sub_H_eo_gamma5 is defined further down in this file.  Depending on the
+ * build it either runs Hopping_Matrix(ieo, ., k) into a scratch field and
+ * then calls mul_one_pm_imu_sub_mul_gamma5(l, p, scratch, _sign), or it hands
+ * the whole job to tm_sub_Hopping_Matrix with z = 1 + i*sign*g_mu. */
+/* external functions */
+/* M_full: applies the operator to a full field given as even and odd halves.
+ * Each output half is built in two steps on VOLUME/2 spinors:
+ *   1. assign_mul_one_pm_imu(new, old, +1): new = (1 + i mu gamma_5) * old
+ *   2. assign_add_mul_r(new, hop, -1.): combines new with -1 times the
+ *      hopping term of the opposite parity (presumably new += -1 * hop; the
+ *      helper is not defined in this file -- TODO confirm).
+ * g_spinor_field[DUM_MATRIX] is scratch: it is overwritten by both halves. */
+void M_full(spinor * const Even_new, spinor * const Odd_new,
+            spinor * const Even, spinor * const Odd) {
+  /* Even sites: hopping term is computed from the odd input */
+  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], Odd);
+  assign_mul_one_pm_imu(Even_new, Even, 1., VOLUME/2);
+  assign_add_mul_r(Even_new, g_spinor_field[DUM_MATRIX], -1., VOLUME/2);
+
+  /* Odd sites: hopping term is computed from the even input */
+  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], Even);
+  assign_mul_one_pm_imu(Odd_new, Odd, 1., VOLUME/2);
+  assign_add_mul_r(Odd_new, g_spinor_field[DUM_MATRIX], -1., VOLUME/2);
+}
+/* Q_full: same call sequence as M_full, followed by an in-place gamma5() on
+ * each output half.  Note that Even_new is finished (including gamma5)
+ * before the odd half reads Even. */
+void Q_full(spinor * const Even_new, spinor * const Odd_new,
+            spinor * const Even, spinor * const Odd) {
+  /* Even sites */
+  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], Odd);
+  assign_mul_one_pm_imu(Even_new, Even, 1., VOLUME/2);
+  assign_add_mul_r(Even_new, g_spinor_field[DUM_MATRIX], -1., VOLUME/2);
+  gamma5(Even_new, Even_new, VOLUME/2);
+
+  /* Odd sites */
+  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], Even);
+  assign_mul_one_pm_imu(Odd_new, Odd, 1., VOLUME/2);
+  assign_add_mul_r(Odd_new, g_spinor_field[DUM_MATRIX], -1., VOLUME/2);
+  gamma5(Odd_new, Odd_new, VOLUME/2);
+}
+/* M_minus_1_timesC: writes the hopping term of the opposite-parity input
+ * straight into each output half and then multiplies it in place with
+ * (1 + i mu gamma_5)^{-1} via mul_one_pm_imu_inv(., +1, VOLUME/2).
+ * No scratch field is used; Even_new depends only on Odd and vice versa. */
+void M_minus_1_timesC(spinor * const Even_new, spinor * const Odd_new,
+                      spinor * const Even, spinor * const Odd) {
+  /* Even sites */
+  Hopping_Matrix(EO, Even_new, Odd);
+  mul_one_pm_imu_inv(Even_new, 1., VOLUME/2);
+
+  /* Odd sites */
+  Hopping_Matrix(OE, Odd_new, Even);
+  mul_one_pm_imu_inv(Odd_new, 1., VOLUME/2);
+}
+
+
+
+/******************************************
+ *
+ * This is
the implementation of
+ *
+ * \hat Q_{+} =
+ * \gamma_5(M_{oo}^+ - M_{oe}(M_{ee}^+ )^{-1}M_{eo})
+ *
+ * see documentation for details
+ * k is the input field
+ * l is the output field
+ *
+ * it acts only on the odd part or only
+ * on a half spinor
+ *
+ * g_spinor_field[DUM_MATRIX] and g_spinor_field[DUM_MATRIX+1]
+ * are used as scratch and are overwritten
+ ******************************************/
+void Qtm_plus_psi(spinor * const l, spinor * const k){
+  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k);               /* M_eo k */
+  mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], +1., VOLUME/2);   /* apply (1 + i mu gamma_5)^{-1} in place */
+  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]);
+  mul_one_pm_imu_sub_mul_gamma5(l, k, g_spinor_field[DUM_MATRIX], +1.); /* l = gamma_5*((1 + i mu gamma_5)*k - scratch) */
+}
+/* Same call sequence as Qtm_plus_psi, but with Hopping_Matrix_nocom in place
+ * of Hopping_Matrix (presumably the variant without boundary exchange, going
+ * by the name only -- TODO confirm). */
+void Qtm_plus_psi_nocom(spinor * const l, spinor * const k){
+  Hopping_Matrix_nocom(EO, g_spinor_field[DUM_MATRIX+1], k);
+  mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], +1., VOLUME/2);
+  Hopping_Matrix_nocom(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]);
+  mul_one_pm_imu_sub_mul_gamma5(l, k, g_spinor_field[DUM_MATRIX], +1.);
+}
+/* "sym" variant: the second hopping result is also multiplied with
+ * (1 + i mu gamma_5)^{-1}, and the final step is mul_one_sub_mul_gamma5,
+ * which forms k - scratch with the sign of spin components s2 and s3
+ * flipped (that is how gamma_5 is folded in). */
+void Qtm_plus_sym_psi(spinor * const l, spinor * const k){
+  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k);
+  mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], +1., VOLUME/2);
+  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]);
+  mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX], +1., VOLUME/2);
+  mul_one_sub_mul_gamma5(l, k, g_spinor_field[DUM_MATRIX]);
+}
+/* Same call sequence as Qtm_plus_sym_psi with Hopping_Matrix_nocom. */
+void Qtm_plus_sym_psi_nocom(spinor * const l, spinor * const k){
+  Hopping_Matrix_nocom(EO, g_spinor_field[DUM_MATRIX+1], k);
+  mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], +1., VOLUME/2);
+  Hopping_Matrix_nocom(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]);
+  mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX], +1., VOLUME/2);
+  mul_one_sub_mul_gamma5(l, k, g_spinor_field[DUM_MATRIX]);
+}
+
+/******************************************
+ *
+ * This is the implementation of
+ *
+ * \hat Q_{-} =
+ * \gamma_5(M_{oo}^- - M_{oe}(M_{ee}^- )^{-1}M_{eo})
+ *
+ * see documentation
for details + * k is the number of the input field + * l is the number of the output field + * + * it acts only on the odd part or only + * on a half spinor + ******************************************/ +void Qtm_minus_psi(spinor * const l, spinor * const k) { + H_eo_tm_inv_psi(g_spinor_field[DUM_MATRIX+1], k, EO, -1); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+1]); + mul_one_pm_imu_sub_mul_gamma5(l, k, g_spinor_field[DUM_MATRIX+2], -1); + //tm_sub_H_eo_gamma5(l, k, g_spinor_field[DUM_MATRIX+1], OE, -1.); +} + +void Qtm_minus_sym_psi(spinor * const l, spinor * const k){ + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k); + mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], -1., VOLUME/2); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]); + mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX], -1., VOLUME/2); + mul_one_sub_mul_gamma5(l, k, g_spinor_field[DUM_MATRIX]); +} + +/****************************************** + * + * This is the implementation of + * + * \gamma_5 \hat Q_{+} = + * (M_{oo}^+ - M_{oe}(M_{ee}^+ )^{-1}M_{eo}) + * + * see documentation for details + * k is the number of the input field + * l is the number of the output field + * + * it acts only on the odd part or only + * on a half spinor + ******************************************/ +void Mtm_plus_psi(spinor * const l, spinor * const k){ + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k); + mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], +1., VOLUME/2); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]); + mul_one_pm_imu_sub_mul(l, k, g_spinor_field[DUM_MATRIX], +1., VOLUME/2); +} + +void Mtm_plus_psi_nocom(spinor * const l, spinor * const k){ + Hopping_Matrix_nocom(EO, g_spinor_field[DUM_MATRIX+1], k); + mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], +1., VOLUME/2); + Hopping_Matrix_nocom(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]); + mul_one_pm_imu_sub_mul(l, k, 
g_spinor_field[DUM_MATRIX], +1., VOLUME/2); +} + +void Mtm_plus_sym_psi(spinor * const l, spinor * const k){ + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k); + mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], +1., VOLUME/2); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]); + mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX], +1., VOLUME/2); + diff(l, k, g_spinor_field[DUM_MATRIX], VOLUME/2); +} + +void Mtm_plus_sym_psi_nocom(spinor * const l, spinor * const k){ + Hopping_Matrix_nocom(EO, g_spinor_field[DUM_MATRIX+1], k); + mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], +1., VOLUME/2); + Hopping_Matrix_nocom(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]); + mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX], +1., VOLUME/2); + diff(l, k, g_spinor_field[DUM_MATRIX], VOLUME/2); +} + +/****************************************** + * + * This is the implementation of + * + * \gamma_5 \hat Q_{-} = + * (M_{oo}^- - M_{oe}(M_{ee}^- )^{-1}M_{eo}) + * + * see documentation for details + * k is the number of the input field + * l is the number of the output field + * + * it acts only on the odd part or only + * on a half spinor + ******************************************/ +void Mtm_minus_psi(spinor * const l, spinor * const k) { + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k); + mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], -1., VOLUME/2); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]); + mul_one_pm_imu_sub_mul(l, k, g_spinor_field[DUM_MATRIX], -1., VOLUME/2); +} + +void Mtm_minus_sym_psi(spinor * const l, spinor * const k) { + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k); + mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], -1., VOLUME/2); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]); + mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX], -1., VOLUME/2); + diff(l, k, g_spinor_field[DUM_MATRIX], VOLUME/2); +} + +void Mtm_minus_sym_psi_nocom(spinor * const l, 
spinor * const k) {
+  Hopping_Matrix_nocom(EO, g_spinor_field[DUM_MATRIX+1], k);
+  mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], -1., VOLUME/2);
+  Hopping_Matrix_nocom(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]);
+  mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX], -1., VOLUME/2);
+  diff(l, k, g_spinor_field[DUM_MATRIX], VOLUME/2);
+}
+
+/******************************************
+ *
+ * This is the implementation of
+ *
+ * \hat Q_{+} \hat Q_{-}
+ *
+ * see documentation for details
+ * k is the input field
+ * l is the output field
+ *
+ * it acts only on the odd part or only
+ * on a half spinor
+ *
+ * g_spinor_field[DUM_MATRIX] holds the
+ * intermediate result \hat Q_{-} k and
+ * g_spinor_field[DUM_MATRIX+1] is scratch;
+ * both are overwritten
+ ******************************************/
+void Qtm_pm_psi(spinor * const l, spinor * const k){
+  /* Q_{-} */
+  H_eo_tm_inv_psi(g_spinor_field[DUM_MATRIX+1], k, EO, -1);
+  tm_sub_H_eo_gamma5(g_spinor_field[DUM_MATRIX], k, g_spinor_field[DUM_MATRIX+1], OE, -1);
+  /* Q_{+} */
+  H_eo_tm_inv_psi(g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX], EO, +1);
+  tm_sub_H_eo_gamma5(l, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], OE, +1);
+}
+
+/* Qtm_pm_sym_psi: "sym" version of \hat Q_{+} \hat Q_{-}, i.e. the two steps
+ * that Qtm_minus_sym_psi and Qtm_plus_sym_psi perform, applied one after the
+ * other: each step is gamma_5*(x - (1 pm i mu gamma_5)^{-1} M_oe
+ * (1 pm i mu gamma_5)^{-1} M_eo x).
+ *
+ * k is the input field, l the output field (VOLUME/2 spinors).
+ * g_spinor_field[DUM_MATRIX] and g_spinor_field[DUM_MATRIX+1] are scratch.
+ *
+ * Fix: the Q_{+} half used to read the hopping term left in DUM_MATRIX
+ * instead of the Q_{-} result, applied the inverse twisted-mass factor to
+ * DUM_MATRIX instead of the field just written (DUM_MATRIX+1), and
+ * subtracted from k instead of from the Q_{-} result.  The Q_{-} result is
+ * now stored in DUM_MATRIX and the Q_{+} half operates on it, following the
+ * data flow of Qtm_pm_psi_nocom.
+ * diff(a, b, c, N) is assumed to compute a = b - c, as its other uses in
+ * this file imply -- TODO confirm against linalg. */
+void Qtm_pm_sym_psi(spinor * const l, spinor * const k){
+  /* Q_{-} : result ends up in g_spinor_field[DUM_MATRIX] */
+  Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k);
+  mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], -1., VOLUME/2);
+  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]);
+  mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX], -1., VOLUME/2);
+  diff(l, k, g_spinor_field[DUM_MATRIX], VOLUME/2);
+  /* l is only a temporary here; gamma5 moves the finished Q_{-} k into the
+   * scratch field so that l is free to receive the even-site hopping term */
+  gamma5(g_spinor_field[DUM_MATRIX], l, VOLUME/2);
+
+  /* Q_{+} : applied to g_spinor_field[DUM_MATRIX] */
+  Hopping_Matrix(EO, l, g_spinor_field[DUM_MATRIX]);
+  mul_one_pm_imu_inv(l, +1., VOLUME/2);
+  Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+1], l);
+  mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], +1., VOLUME/2);
+  diff(l, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], VOLUME/2);
+  gamma5(l, l, VOLUME/2);
+}
+
+void Qtm_pm_psi_nocom(spinor * const l, spinor * const k){
+  /* Q_{-} */
+  Hopping_Matrix_nocom(EO, g_spinor_field[DUM_MATRIX+1], k);
+
mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], -1., VOLUME/2); + Hopping_Matrix_nocom(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1]); + mul_one_pm_imu_sub_mul_gamma5(g_spinor_field[DUM_MATRIX], k, g_spinor_field[DUM_MATRIX], -1.); + /* Q_{+} */ + Hopping_Matrix_nocom(EO, l, g_spinor_field[DUM_MATRIX]); + mul_one_pm_imu_inv(l, +1., VOLUME/2); + Hopping_Matrix_nocom(OE, g_spinor_field[DUM_MATRIX+1], l); + mul_one_pm_imu_sub_mul_gamma5(l, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], +1.); +} + +/* the "full" operators */ +void Q_pm_psi(spinor * const l, spinor * const k) +{ + g_mu = -g_mu; + D_psi(l, k); + gamma5(g_spinor_field[DUM_MATRIX], l, VOLUME); + g_mu = -g_mu; + D_psi(l, g_spinor_field[DUM_MATRIX]); + gamma5(l, l, VOLUME); +} + + +/* the "full" operators */ +void Q_pm_psi_prec(spinor * const l, spinor * const k) +{ + spinorPrecWS *ws=(spinorPrecWS*)g_precWS; + + _Complex double ALIGN alpha= -1.0; + + if(g_prec_sequence_d_dagger_d[0]!=0.0) + { + alpha = g_prec_sequence_d_dagger_d[0]; + spinorPrecondition(l,k,ws,T,L,alpha,0,1); + } + else + assign(l,k,VOLUME); + + g_mu = -g_mu; + D_psi(g_spinor_field[DUM_MATRIX], l); + gamma5(l, g_spinor_field[DUM_MATRIX], VOLUME); + g_mu = -g_mu; + + if(g_prec_sequence_d_dagger_d[1]!=0.0) + { + alpha = g_prec_sequence_d_dagger_d[1]; + spinorPrecondition(l,l,ws,T,L,alpha,0,1); + } + + D_psi(g_spinor_field[DUM_MATRIX], l); + gamma5(l, g_spinor_field[DUM_MATRIX], VOLUME); + + if(g_prec_sequence_d_dagger_d[2]!=0.0) + { + alpha = g_prec_sequence_d_dagger_d[2]; + spinorPrecondition(l,l,ws,T,L,alpha,0,1); + } + +} + + + +/* This is the version for the gpu with interchanged order of gamma5 and D_psi (Florian Burger)*/ +void Q_pm_psi_gpu(spinor * const l, spinor * const k) +{ + gamma5(k, k, VOLUME); + g_mu = -g_mu; + D_psi(l, k); + gamma5(g_spinor_field[DUM_MATRIX], l, VOLUME); + g_mu = -g_mu; + D_psi(l, g_spinor_field[DUM_MATRIX]); + +} + +/* the "full" operators */ +void Q_pm_psi2(spinor * const l, spinor 
* const k) +{ + g_mu = -10.*g_mu; + D_psi(l, k); + gamma5(g_spinor_field[DUM_MATRIX], l, VOLUME); + g_mu = -g_mu/10.; + D_psi(l, g_spinor_field[DUM_MATRIX]); + gamma5(l, l, VOLUME); +} + +void Q_minus_psi(spinor * const l, spinor * const k) +{ + g_mu = -g_mu; + D_psi(l, k); + g_mu = -g_mu; + gamma5(l, l, VOLUME); +} + +/* This is the version for the gpu (Florian Burger)*/ +void Q_minus_psi_gpu(spinor * const l, spinor * const k) +{ + gamma5(k,k,VOLUME); + g_mu = -g_mu; + D_psi(l, k); + g_mu = -g_mu; + gamma5(l, l, VOLUME); +} + +void Q_plus_psi(spinor * const l, spinor * const k) +{ + D_psi(l, k); + gamma5(l, l, VOLUME); +} + +/****************************************** + * + * This is the implementation of + * + * (M_{ee}^\pm)^{-1}M_{eo} + * + * see documentation for details + * k is the number of the input field + * l is the number of the output field + * + * it acts only on the odd part or only + * on a half spinor + ******************************************/ + +void H_eo_tm_inv_psi(spinor * const l, spinor * const k, + const int ieo, const double _sign) { +#if ((defined BGL && defined XLC) || defined _USE_TSPLITPAR) + Hopping_Matrix(ieo, l, k); + mul_one_pm_imu_inv(l, _sign, VOLUME/2); +#else + double ALIGN nrm = 1./(1.+g_mu*g_mu); + double sign=-1.; + complex double ALIGN z; + if(_sign < 0.){ + sign = 1.; + } + + z = nrm + (sign * nrm * g_mu) * I; + tm_times_Hopping_Matrix(ieo, l, k, z); + return; +#endif + +} + +void tm_sub_H_eo_gamma5(spinor* const l, spinor * const p, spinor * const k, + const int ieo, const double _sign) { +#if ((defined BGL && defined XLC) || defined _USE_TSPLITPAR) + Hopping_Matrix(ieo, g_spinor_field[DUM_MATRIX+2], k); + mul_one_pm_imu_sub_mul_gamma5(l, p, g_spinor_field[DUM_MATRIX+2], _sign); +#else + _Complex double ALIGN z; + double sign=1.; + + if(_sign < 0.){ + sign = -1.; + } + + z = 1. 
+ (sign * g_mu) * I; + tm_sub_Hopping_Matrix(ieo, l, p, k, z); +#endif + + return; +} + + +/********************************************** + * + * All the results are only stored in the first + * half of the spinor fields, they have only + * length VOLUME/2 + * + * That's why mul_... do not need an input + * parameter ieo. + * + * the next functions are internal and you + * can find comments above at the declaration + * + **********************************************/ + +void mul_one_pm_imu_inv(spinor * const l, const double _sign, const int N){ /* In place on the first N spinors of l: s0,s1 are scaled by z and s2,s3 by conj(z), with z = (1 + i*sign*g_mu) / (1 + g_mu^2). sign is -1 unless _sign < 0, in which case it is +1. Reads the global g_mu. This describes the non-SSE branch; the SSE2/SSE3 branch is presumably the vectorised equivalent (its macros are not visible here). */ +#ifdef OMP +#pragma omp parallel + { +#endif + _Complex double ALIGN z,w; + int ix; + double sign=-1.; + spinor *r; + + su3_vector ALIGN phi1; + + double ALIGN nrm = 1./(1.+g_mu*g_mu); + + if(_sign < 0.){ + sign = 1.; + } + + z = nrm + (sign * nrm * g_mu) * I; + w = conj(z); + /************ loop over all lattice sites ************/ +#ifdef OMP +#pragma omp for +#endif + for(ix = 0; ix < N; ix++){ + r=l + ix; + /* Multiply the spinorfield with the inverse of 1+imu\gamma_5 */ +#if ( defined SSE2 || defined SSE3 ) + _prefetch_spinor((r+predist)); + _sse_load_up(r->s0); + _sse_vector_cmplx_mul(z); + _sse_store_nt_up(r->s0); + _sse_load_up(r->s1); + _sse_vector_cmplx_mul_two(); + _sse_store_nt_up(r->s1); + _sse_load_up(r->s2); + _sse_vector_cmplx_mul(w); + _sse_store_nt_up(r->s2); + _sse_load_up(r->s3); + _sse_vector_cmplx_mul_two(); + _sse_store_nt_up(r->s3); +#else + _complex_times_vector(phi1, z, r->s0); + _vector_assign(r->s0, phi1); + _complex_times_vector(phi1, z, r->s1); + _vector_assign(r->s1, phi1); + _complex_times_vector(phi1, w, r->s2); + _vector_assign(r->s2, phi1); + _complex_times_vector(phi1, w, r->s3); + _vector_assign(r->s3, phi1); +#endif + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + +} + +void assign_mul_one_pm_imu_inv(spinor * const l, spinor * const k, const double _sign, const int N){ /* Out-of-place variant of mul_one_pm_imu_inv: for ix in [0, N) the spinor l[ix] receives k[ix] with s0,s1 scaled by z and s2,s3 by conj(z), z = (1 + i*sign*g_mu) / (1 + g_mu^2), sign = -1 unless _sign < 0. k is only read. */ +#ifdef OMP +#pragma omp parallel + { +#endif + _Complex double z,w; + int ix; + double sign=-1.; + spinor *r, 
*s; + double nrm = 1./(1.+g_mu*g_mu); + + if(_sign < 0.){ + sign = 1.; + } + + z = nrm + (sign * nrm * g_mu) * I; + w = conj(z); + + /************ loop over all lattice sites ************/ +#ifdef OMP +#pragma omp for +#endif + for(ix = 0; ix < N; ix++){ + r=k+ix; + s=l+ix; + /* Multiply the spinorfield with the inverse of 1+imu\gamma_5 */ + _complex_times_vector(s->s0, z, r->s0); + _complex_times_vector(s->s1, z, r->s1); + _complex_times_vector(s->s2, w, r->s2); + _complex_times_vector(s->s3, w, r->s3); + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} + +void mul_one_pm_imu(spinor * const l, const double _sign){ /* In place on the first VOLUME/2 spinors of l: s0,s1 are scaled by z = 1 + i*sign*g_mu and s2,s3 by conj(z). sign is +1 unless _sign < 0, in which case it is -1. Unlike the other routines here the site count is fixed, not a parameter. */ +#ifdef OMP +#pragma omp parallel + { +#endif + _Complex double z,w; + int ix; + double sign = 1.; + spinor *r; + + su3_vector ALIGN phi1; + + if(_sign < 0.){ + sign = -1.; + } + + z = 1. + (sign * g_mu) * I; + w = conj(z); + + /************ loop over all lattice sites ************/ +#ifdef OMP +#pragma omp for +#endif + for(ix = 0; ix < (VOLUME/2); ix++){ + r=l+ix; + /* Multiply the spinorfield with 1+imu\gamma_5 */ + _complex_times_vector(phi1, z, r->s0); + _vector_assign(r->s0, phi1); + _complex_times_vector(phi1, z, r->s1); + _vector_assign(r->s1, phi1); + _complex_times_vector(phi1, w, r->s2); + _vector_assign(r->s2, phi1); + _complex_times_vector(phi1, w, r->s3); + _vector_assign(r->s3, phi1); + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + +} + +void assign_mul_one_pm_imu(spinor * const l, spinor * const k, const double _sign, const int N){ /* Out of place: for ix in [0, N) the spinor l[ix] receives k[ix] with s0,s1 scaled by z = 1 + i*sign*g_mu and s2,s3 by conj(z); sign is +1 unless _sign < 0. k is only read. The SSE2/SSE3 branch is presumably equivalent (macros not visible here). */ +#ifdef OMP +#pragma omp parallel + { +#endif + _Complex double z,w; + int ix; + double sign = 1.; + spinor *r, *s; + + if(_sign < 0.){ + sign = -1.; + } + + z = 1. 
+ (sign * g_mu) * I; + w = conj(z); + + /************ loop over all lattice sites ************/ +#ifdef OMP +#pragma omp for +#endif + for(ix = 0; ix < N; ix++){ + s=l+ix; + r=k+ix; + + /* Multiply the spinorfield with 1+imu\gamma_5 */ +#if ( defined SSE2 || defined SSE3 ) + _prefetch_spinor((r+predist)); + _prefetch_spinor((s+predist)); + _sse_load_up(r->s0); + _sse_vector_cmplx_mul(z); + _sse_store_nt_up(s->s0); + _sse_load_up(r->s1); + _sse_vector_cmplx_mul_two(); + _sse_store_nt_up(s->s1); + _sse_load_up(r->s2); + _sse_vector_cmplx_mul(w); + _sse_store_nt_up(s->s2); + _sse_load_up(r->s3); + _sse_vector_cmplx_mul_two(); + _sse_store_nt_up(s->s3); +#else + _complex_times_vector(s->s0, z, r->s0); + _complex_times_vector(s->s1, z, r->s1); + _complex_times_vector(s->s2, w, r->s2); + _complex_times_vector(s->s3, w, r->s3); +#endif + } +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} + +void mul_one_sub_mul_gamma5(spinor * const l, spinor * const k, + spinor * const j){ /* For the first VOLUME/2 sites: l = k - j on s0,s1 and l = j - k on s2,s3. The swapped operand order on the lower two components is what the in-loop comments call the gamma5 multiplication. Assumes _vector_sub(a, b, c) stores b - c in a - TODO confirm against su3.h. */ +#ifdef OMP +#pragma omp parallel + { +#endif + spinor *r, *s, *t; + + /************ loop over all lattice sites ************/ +#ifdef OMP +#pragma omp for +#endif + for(int ix = 0; ix < (VOLUME/2); ++ix) + { + r = k+ix; + s = j+ix; + t = l+ix; + /* Subtract s and store the result in t */ + /* multiply with gamma5 included by */ + /* reversed order of s and r (2&3) */ + _vector_sub(t->s0, r->s0, s->s0); + _vector_sub(t->s1, r->s1, s->s1); + _vector_sub(t->s2, s->s2, r->s2); + _vector_sub(t->s3, s->s3, r->s3); + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} + + +void mul_one_pm_imu_sub_mul_gamma5(spinor * const l, spinor * const k, + spinor * const j, const double _sign){ /* For the first VOLUME/2 sites: phi is k with s0,s1 scaled by z = 1 + i*sign*g_mu and s2,s3 by conj(z); then l = phi - j on s0,s1 and l = j - phi on s2,s3 (the swapped order carries the gamma5 factor, per the in-loop comments). sign is +1 unless _sign < 0. */ +#ifdef OMP +#pragma omp parallel + { +#endif + _Complex double z,w; + int ix; + double sign=1.; + spinor *r, *s, *t; + + su3_vector ALIGN phi1, phi2, phi3, phi4; + + if(_sign < 0.){ + sign = -1.; + } + + z = 1. 
+ (sign * g_mu) * I; + w = conj(z); + + /************ loop over all lattice sites ************/ +#ifdef OMP +#pragma omp for +#endif + for(ix = 0; ix < (VOLUME/2); ix++){ + r = k+ix; + s = j+ix; + t = l+ix; + /* Multiply the spinorfield with 1+imu\gamma_5 */ + _complex_times_vector(phi1, z, r->s0); + _complex_times_vector(phi2, z, r->s1); + _complex_times_vector(phi3, w, r->s2); + _complex_times_vector(phi4, w, r->s3); + /* Subtract s and store the result in t */ + /* multiply with gamma5 included by */ + /* reversed order of s and phi3|4 */ + _vector_sub(t->s0, phi1, s->s0); + _vector_sub(t->s1, phi2, s->s1); + _vector_sub(t->s2, s->s2, phi3); + _vector_sub(t->s3, s->s3, phi4); + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} + +void mul_one_pm_imu_sub_mul(spinor * const l, spinor * const k, + spinor * const j, const double _sign, const int N){ /* For ix in [0, N): l = phi - j on all four components, where phi is k with s0,s1 scaled by z = 1 + i*sign*g_mu and s2,s3 by conj(z). No operand swap here, i.e. no gamma5 factor. sign is +1 unless _sign < 0. The phi temporaries exist only in the non-SSE build. */ +#ifdef OMP +#pragma omp parallel + { +#endif + _Complex double z,w; + int ix; + double sign=1.; + spinor *r, *s, *t; + +#if (!defined SSE2 && !defined SSE3) + + su3_vector ALIGN phi1, phi2, phi3, phi4; + +#endif + + if(_sign < 0.){ + sign = -1.; + } + + z = 1. 
+ (sign * g_mu) * I; + w = conj(z); + /************ loop over all lattice sites ************/ +#ifdef OMP +#pragma omp for +#endif + for(ix = 0; ix < N; ix++){ + r = k+ix; + s = j+ix; + t = l+ix; + /* Multiply the spinorfield with 1+imu\gamma_5 */ +#if (defined SSE2 || defined SSE3) + _prefetch_spinor((r+predist)); + _prefetch_spinor((s+predist)); + _sse_load_up(r->s0); + _sse_vector_cmplx_mul(z); + _sse_load(s->s0); + _sse_vector_sub_up(); + _sse_store_nt_up(t->s0); + _sse_load_up(r->s1); + _sse_vector_cmplx_mul_two(); + _sse_load(s->s1); + _sse_vector_sub_up(); + _sse_store_nt_up(t->s1); + _sse_load_up(r->s2); + _sse_vector_cmplx_mul(w); + _sse_load(s->s2); + _sse_vector_sub_up(); + _sse_store_nt_up(t->s2); + _sse_load_up(r->s3); + _sse_vector_cmplx_mul_two(); + _sse_load(s->s3); + _sse_vector_sub_up(); + _sse_store_nt_up(t->s3); +#else + _complex_times_vector(phi1, z, r->s0); + _complex_times_vector(phi2, z, r->s1); + _complex_times_vector(phi3, w, r->s2); + _complex_times_vector(phi4, w, r->s3); + /* Subtract s and store the result in t */ + _vector_sub(t->s0, phi1, s->s0); + _vector_sub(t->s1, phi2, s->s1); + _vector_sub(t->s2, phi3, s->s2); + _vector_sub(t->s3, phi4, s->s3); +#endif + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators.h new file mode 100644 index 0000000000000000000000000000000000000000..e1ceff78ee35cc4d3b6e8c522e750ee8ed50cc8f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators.h @@ -0,0 +1,66 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ /* NOTE(review): the link after the words If not, see in the notice above appears to have been lost in extraction; the standard GPL notice reads <http://www.gnu.org/licenses/> there - restore when this patch is regenerated. */ + +#ifndef _TM_OPERATORS_H +#define _TM_OPERATORS_H + +#include "su3.h" + +/* This is the full matrix multiplication */ +void M_full(spinor * const Even_new, spinor * const Odd_new, + spinor * const Even, spinor * const Odd); +void Q_full(spinor * const Even_new, spinor * const Odd_new, + spinor * const Even, spinor * const Odd); +void M_minus_1_timesC(spinor * const Even_new, spinor * const Odd_new, + spinor * const Even, spinor * const Odd); + +void Qtm_plus_psi(spinor * const l, spinor * const k); +void Qtm_plus_psi_nocom(spinor * const l, spinor * const k); +void Qtm_minus_psi(spinor * const l, spinor * const k); +void Mtm_plus_psi(spinor * const l, spinor * const k); +void Mtm_plus_psi_nocom(spinor * const l, spinor * const k); +void Mtm_minus_psi(spinor * const l, spinor * const k); +void Qtm_pm_psi(spinor * const l, spinor * const k); +void Qtm_pm_psi_nocom(spinor * const l, spinor * const k); +void H_eo_tm_inv_psi(spinor * const l, spinor * const k, const int ieo, const double sign); +void mul_one_pm_imu_inv(spinor * const l, const double _sign, const int N); /* in place on N spinors of l: multiply by the inverse of 1+i*g_mu*gamma_5; a negative _sign flips the sign of the g_mu term */ +void assign_mul_one_pm_imu_inv(spinor * const l, spinor * const k, const double _sign, const int N); /* same operation as mul_one_pm_imu_inv, but reads k and writes l */ +void assign_mul_one_pm_imu(spinor * const l, spinor * const k, const double _sign, const int N); /* l = (1+i*g_mu*gamma_5) k on N spinors; a negative _sign flips the sign of the g_mu term */ +void mul_one_pm_imu(spinor 
* const l, const double _sign); /* in-place form of assign_mul_one_pm_imu; always runs over VOLUME/2 spinors */ +void mul_one_pm_imu_sub_mul(spinor * const l, spinor * const k, + spinor * const j, const double _sign, const int N); /* l = (1+i*g_mu*gamma_5) k - j on N spinors; a negative _sign flips the sign of the g_mu term */ + +void Qtm_plus_sym_psi(spinor * const l, spinor * const k); +void Qtm_plus_sym_psi_nocom(spinor * const l, spinor * const k); +void Qtm_minus_sym_psi(spinor * const l, spinor * const k); +void Mtm_plus_sym_psi(spinor * const l, spinor * const k); +void Mtm_minus_sym_psi(spinor * const l, spinor * const k); +void Mtm_plus_sym_psi_nocom(spinor * const l, spinor * const k); +void Mtm_minus_sym_psi_nocom(spinor * const l, spinor * const k); +void Qtm_pm_sym_psi(spinor * const l, spinor * const k); + +void Q_pm_psi(spinor * const l, spinor * const k); +void Q_pm_psi_prec(spinor * const l, spinor * const k); +void Q_pm_psi_gpu(spinor * const l, spinor * const k); +void Q_pm_psi2(spinor * const l, spinor * const k); +void Q_minus_psi(spinor * const l, spinor * const k); +void Q_minus_psi_gpu(spinor * const l, spinor * const k); +void Q_plus_psi(spinor * const l, spinor * const k); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_32.c new file mode 100644 index 0000000000000000000000000000000000000000..ec0bfa58d40f5ccf67c905db95103768d63846e9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_32.c @@ -0,0 +1,150 @@ +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include "global.h" +#include "su3.h" +#include "operator/Hopping_Matrix.h" +#include "operator/Hopping_Matrix_32.h" +#include "linalg_eo.h" +#include "gamma.h" +#include "operator/D_psi_32.h" +#include "tm_operators_32.h" /* NOTE(review): the three include directives above that carry no header name look like angle-bracket includes whose names were stripped by extraction (presumably config.h under HAVE_CONFIG_H plus two system headers) - confirm against upstream tmLQCD before applying this patch. */ + + +/* note that most 32 bit functions make use of orphaned directives! 
+ in order to take advantage of threads, they must be called from within + a parallel section and care must be taken that within those parallel + sections, no nested parallelism is generated through further parallel section */ + +void mul_one_pm_imu_inv_32_orphaned(spinor32 * const l, const float _sign, const int N){ /* Single-precision, in place on the first N spinors of l: s0,s1 are scaled by z and s2,s3 by conj(z), with z = (1 + i*sign*g_mu) / (1 + g_mu^2); sign is -1 unless _sign < 0. nrm is computed in double and narrowed when stored into z. Contains only a bare omp for: it opens no parallel region itself. */ + _Complex float ALIGN z,w; + int ix; + float sign=-1.; + spinor32 *r; + + su3_vector32 ALIGN phi1; + + double ALIGN nrm = 1./(1.+g_mu*g_mu); + + if(_sign < 0.){ + sign = 1.; + } + + z = nrm + (sign * nrm * g_mu) * I; + w = conj(z); + /************ loop over all lattice sites ************/ +#ifdef OMP +#pragma omp for +#endif + for(ix = 0; ix < N; ix++){ + r=l + ix; + /* Multiply the spinorfield with the inverse of 1+imu\gamma_5 */ + _complex_times_vector(phi1, z, r->s0); + _vector_assign(r->s0, phi1); + _complex_times_vector(phi1, z, r->s1); + _vector_assign(r->s1, phi1); + _complex_times_vector(phi1, w, r->s2); + _vector_assign(r->s2, phi1); + _complex_times_vector(phi1, w, r->s3); + _vector_assign(r->s3, phi1); + } +} + +void mul_one_pm_imu_sub_mul_gamma5_32_orphaned(spinor32 * const l, spinor32 * const k, + spinor32 * const j, const float _sign){ /* Single-precision, first VOLUME/2 sites: phi is k with s0,s1 scaled by z = 1 + i*sign*g_mu and s2,s3 by conj(z); then l = phi - j on s0,s1 and l = j - phi on s2,s3. sign is +1 unless _sign < 0. Each site is read before it is written, and Qtm_pm_psi_32 calls this with l and j being the same field. Bare omp for only. */ + _Complex float z,w; + int ix; + float sign=1.; + spinor32 *r, *s, *t; + + su3_vector32 ALIGN phi1, phi2, phi3, phi4; + + if(_sign < 0.){ + sign = -1.; + } + + z = 1. 
+ (sign * g_mu) * I; + w = conj(z); + + /************ loop over all lattice sites ************/ +#ifdef OMP +#pragma omp for +#endif + for(ix = 0; ix < (VOLUME/2); ix++){ + r = k+ix; + s = j+ix; + t = l+ix; + /* Multiply the spinorfield with 1+imu\gamma_5 */ + _complex_times_vector(phi1, z, r->s0); + _complex_times_vector(phi2, z, r->s1); + _complex_times_vector(phi3, w, r->s2); + _complex_times_vector(phi4, w, r->s3); + /* Subtract s and store the result in t */ + /* multiply with gamma5 included by */ + /* reversed order of s and phi3|4 */ + _vector_sub(t->s0, phi1, s->s0); + _vector_sub(t->s1, phi2, s->s1); + _vector_sub(t->s2, s->s2, phi3); + _vector_sub(t->s3, s->s3, phi4); + } +} + +void Qtm_pm_psi_32(spinor32 * const l, spinor32 * const k){ /* Applies the Q_{-} call sequence to k, leaving its result in g_spinor_field32[0], then the Q_{+} sequence to that result, leaving the output in l. One OpenMP parallel region wraps all eight orphaned calls. g_spinor_field32[0] and g_spinor_field32[1] are used as scratch and overwritten. */ + /* Q_{-} */ +#ifdef OMP +#pragma omp parallel + { +#endif + Hopping_Matrix_32_orphaned(EO, g_spinor_field32[1], k); + mul_one_pm_imu_inv_32_orphaned(g_spinor_field32[1], -1., VOLUME/2); + Hopping_Matrix_32_orphaned(OE, g_spinor_field32[0], g_spinor_field32[1]); + mul_one_pm_imu_sub_mul_gamma5_32_orphaned(g_spinor_field32[0], k, g_spinor_field32[0], -1.); + /* Q_{+} */ + Hopping_Matrix_32_orphaned(EO, l, g_spinor_field32[0]); + mul_one_pm_imu_inv_32_orphaned(l, +1., VOLUME/2); + Hopping_Matrix_32_orphaned(OE, g_spinor_field32[1], l); + mul_one_pm_imu_sub_mul_gamma5_32_orphaned(l, g_spinor_field32[0], g_spinor_field32[1], +1.); +#ifdef OMP + } /* OpenMP closing brace */ +#endif +} + +void gamma5_32_orphaned(spinor32 * const l, spinor32 * const k, const int V){ /* For ix in [0, V): copies s0,s1 from k to l and writes s2,s3 through _vector_minus_assign (presumably the negated copy - macro not visible here). Bare omp for only; gamma5_32 below supplies the parallel region. */ + int ix; + spinor32 *r,*s; +#ifdef OMP +#pragma omp for +#endif + for (ix = 0; ix < V; ix++){ + r=l+ix; + s=k+ix; + _vector_assign((*r).s0,(*s).s0); + _vector_assign((*r).s1,(*s).s1); + _vector_minus_assign((*r).s2,(*s).s2); + _vector_minus_assign((*r).s3,(*s).s3); + } +} + +void gamma5_32(spinor32 * const l, spinor32 * const k, const int V){ /* Wrapper that runs gamma5_32_orphaned inside its own OpenMP parallel region. */ +#ifdef OMP +#pragma omp parallel + { +#endif + gamma5_32_orphaned(l,k,V); +#ifdef OMP + } /*OpenMP closing brace */ +#endif 
+} + +void Q_pm_psi_32(spinor32 * const l, spinor32 * const k) +{ /* Negates the global g_mu, applies D_psi_32 to k and gamma5_32 into g_spinor_field32[0], restores g_mu, then applies D_psi_32 and gamma5_32 again so the result ends up in l. g_spinor_field32[0] is overwritten and l is used as an intermediate. NOTE(review): g_mu is modified temporarily, so this looks unsafe if anything else reads g_mu concurrently - confirm against callers. */ + g_mu = -g_mu; + D_psi_32(l, k); + gamma5_32(g_spinor_field32[0], l, VOLUME); + g_mu = -g_mu; + D_psi_32(l, g_spinor_field32[0]); + gamma5_32(l, l, VOLUME); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_32.h new file mode 100644 index 0000000000000000000000000000000000000000..84ba678000a8fbdc9cc2b7d2436fa1e05ca1e6f6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_32.h @@ -0,0 +1,12 @@ + +#ifndef _TM_OPERATORS_32_H +#define _TM_OPERATORS_32_H + +void mul_one_pm_imu_inv_32_orphaned(spinor32 * const l, const float _sign, const int N); /* orphaned: call from inside a parallel region */ +void mul_one_pm_imu_sub_mul_gamma5_32_orphaned(spinor32 * const l, spinor32 * const k, spinor32 * const j, const float _sign); /* orphaned: call from inside a parallel region */ +void Qtm_pm_psi_32(spinor32 * const l, spinor32 * const k); /* opens its own parallel region */ +void Q_pm_psi_32(spinor32 * const l, spinor32 * const k); +void gamma5_32_orphaned(spinor32 * const l, spinor32 * const k, const int V); /* orphaned: call from inside a parallel region */ +void gamma5_32(spinor32 * const l, spinor32 * const k, const int V); /* opens its own parallel region */ + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_nd.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_nd.c new file mode 100644 index 0000000000000000000000000000000000000000..dc931b3a650d00077b6c80fc3238929cab2d7b5b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_nd.c @@ -0,0 +1,960 @@ +/*********************************************************************** + * + * Copyright (C) 2006,2007,2008 Karl Jansen, Thomas Chiarappa, + * Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * This file contains operators for twisted mass Wilson QCD + * to construct a multiplication with a non-degenerate + * flavour matrix + * + * + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "operator/Hopping_Matrix.h" +#include "phmc.h" +#include "gamma.h" +#include "linalg_eo.h" +#include "operator/tm_operators.h" +#include "operator/clovertm_operators.h" +#include "operator/tm_operators_nd.h" /* NOTE(review): the four include directives above that carry no header name look like angle-bracket includes whose names were stripped by extraction (presumably config.h under HAVE_CONFIG_H plus three system headers; sqrt is used further down in this file, so math.h is likely one of them) - confirm against upstream tmLQCD. */ + + +/* Forward declarations of helpers used below; mul_one_pm_iconst is defined further down in this file. */ +void mul_one_pm_iconst(spinor * const l, spinor * const k, + const double mu_, const int sign_); + +void M_oo_sub_g5_ndpsi(spinor * const l_s, spinor * const l_c, + spinor * const k_s, spinor * const k_c, + spinor * const j_s, spinor * const j_c, + const double mu, const double eps); + +/* external functions */ + +/****************************************** + * + * This is the implementation of + * + * Qhat(2x2) = gamma_5 * [ M_oo - M_oe M_ee^-1 M_eo ] + * + * see documentation for details + * k_charm and k_strange are the input fields + * l_* the output fields + * + * it acts only on the odd part or only + * on a half spinor + * + * Scratch: g_spinor_field[DUM_MATRIX] up to + * g_spinor_field[DUM_MATRIX+3] are overwritten. + * The outputs are scaled by phmc_invmaxev over + * VOLUME/2 sites. + ******************************************/ +void Qtm_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm){ + + /* Here the M_oe Mee^-1 M_eo implementation */ + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], k_strange); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k_charm); + + M_ee_inv_ndpsi(g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+2], + g_spinor_field[DUM_MATRIX], 
g_spinor_field[DUM_MATRIX+1], + g_mubar, g_epsbar); + + /* l_strange and l_charm serve as intermediate storage here; the final values are written by the mul_r calls below */ + Hopping_Matrix(OE, l_strange, g_spinor_field[DUM_MATRIX+3]); + Hopping_Matrix(OE, l_charm, g_spinor_field[DUM_MATRIX+2]); + + /* Here the M_oo implementation */ + M_oo_sub_g5_ndpsi(g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], k_strange, k_charm, + l_strange, l_charm, + -g_mubar, -g_epsbar); + /* At the end, the normalisation by the max. eigenvalue */ + mul_r(l_strange, phmc_invmaxev, g_spinor_field[DUM_MATRIX], VOLUME/2); + mul_r(l_charm, phmc_invmaxev, g_spinor_field[DUM_MATRIX+1], VOLUME/2); +} + +void Qsw_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm) { /* Counterpart of Qtm_ndpsi built from the clover helpers (assign_mul_one_sw_pm_imu_eps, clover_inv_nd, clover_gamma5_nd). Unlike Qtm_ndpsi, the charm field is passed ahead of the strange field in every call. Scratch: g_spinor_field[DUM_MATRIX] up to g_spinor_field[DUM_MATRIX+3] are overwritten. The outputs are scaled by phmc_invmaxev over VOLUME/2 sites. */ + + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], k_charm); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k_strange); + + assign_mul_one_sw_pm_imu_eps(EE, g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+3], + g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], g_mubar, g_epsbar); + clover_inv_nd(EE, g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+3]); + + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+2]); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+3]); + + clover_gamma5_nd(OO, g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+3], + k_charm, k_strange, + g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], + g_mubar, -g_epsbar); + mul_r(l_charm, phmc_invmaxev, g_spinor_field[DUM_MATRIX+2], VOLUME/2); + mul_r(l_strange, phmc_invmaxev, g_spinor_field[DUM_MATRIX+3], VOLUME/2); + return; +} + +/****************************************** + * + * This is the implementation of + * + * Qhat(2x2)^dagger = tau_1 Qhat(2x2) tau_1 = + * + * = Qhat(2x2) with g_mubar -> - g_mubar + * + * With respect to Qtm_ndpsi the role of charme and strange fields + * are interchenged, since Qdagger=tau_1 Q tau_1 + * see documentation for details + * k_charm and k_strange are the input fields + * l_* the output 
fields + * + * it acts only on the odd part or only + * on a half spinor + ******************************************/ +void Qtm_dagger_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm) { + + /* Here the M_oe Mee^-1 M_eo implementation */ + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], k_charm); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k_strange); + + M_ee_inv_ndpsi(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+3], + g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], + g_mubar, g_epsbar); + + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+2]); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+3]); + + /* Here the M_oo implementation */ + M_oo_sub_g5_ndpsi(l_strange, l_charm, k_strange, k_charm, + g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX], + g_mubar, -g_epsbar); + /* At the end, the normalisation by the max. eigenvalue */ + mul_r(l_charm, phmc_invmaxev, l_charm, VOLUME/2); + mul_r(l_strange, phmc_invmaxev, l_strange, VOLUME/2); + +} + +void Qsw_dagger_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm) { + + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], k_charm); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k_strange); + + assign_mul_one_sw_pm_imu_eps(EE, g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+3], + g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], -g_mubar, g_epsbar); + clover_inv_nd(EE, g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+3]); + + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+2]); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+3]); + + clover_gamma5_nd(OO, g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+3], + k_charm, k_strange, + g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], + -g_mubar, -g_epsbar); + mul_r(l_charm, phmc_invmaxev, 
g_spinor_field[DUM_MATRIX+2], VOLUME/2); + mul_r(l_strange, phmc_invmaxev, g_spinor_field[DUM_MATRIX+3], VOLUME/2); + return; +} + + +/****************************************** + * + * This is the implementation of + * + * Qhat(2x2) Qhat(2x2)^dagger + * + * + * For details, see documentation and comments of the + * above mentioned routines + * + * k_charm and k_strange are the input fields + * l_* the output fields + * + * l_ and k_ can be identical + * + * it acts only on the odd part or only + * on a half spinor + ******************************************/ +void Qtm_pm_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm){ + + /* first the Qhat(2x2)^dagger PART*/ + /* Here the M_oe Mee^-1 M_eo implementation */ + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], k_charm); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k_strange); + + M_ee_inv_ndpsi(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+3], + g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], + g_mubar, g_epsbar); + + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+2]); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+3]); + + /* Here the M_oo implementation */ + M_oo_sub_g5_ndpsi(g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+3], k_charm, k_strange, + g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], + -g_mubar, -g_epsbar); + /* We have to reassigin as follows to avoid overwriting */ + /* Recall in fact that Q^hat = tau_1 Q tau_1 , hence */ + /* and then the Qhat(2x2) PART */ + + /* Here the M_oe Mee^-1 M_eo implementation */ + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+3]); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+2]); + + M_ee_inv_ndpsi(g_spinor_field[DUM_MATRIX+5], g_spinor_field[DUM_MATRIX+4], + g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX], + -g_mubar, g_epsbar); + + 
Hopping_Matrix(OE, l_strange, g_spinor_field[DUM_MATRIX+4]); + Hopping_Matrix(OE, l_charm, g_spinor_field[DUM_MATRIX+5]); + + /* Here the M_oo implementation */ + M_oo_sub_g5_ndpsi(l_strange, l_charm, g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+2], + l_strange, l_charm, + -g_mubar, -g_epsbar); + /* At the end, the normalisation by the max. eigenvalue */ + /* Twice phmc_invmaxev since we consider here D Ddag !!! */ + mul_r(l_charm, phmc_invmaxev*phmc_invmaxev, l_charm, VOLUME/2); + mul_r(l_strange, phmc_invmaxev*phmc_invmaxev, l_strange, VOLUME/2); + return; +} + +void Qsw_pm_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm) { + + /* FIRST THE Qhat(2x2)^dagger PART*/ + /* Here the M_oe Mee^-1 M_eo implementation */ + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], k_charm); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k_strange); + + assign_mul_one_sw_pm_imu_eps(EE, g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+3], + g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], -g_mubar, g_epsbar); + clover_inv_nd(EE, g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+3]); + + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+2]); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+3]); + + // Here the M_oo implementation + clover_gamma5_nd(OO, g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+3], + k_charm, k_strange, + g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], + -g_mubar, -g_epsbar); + + // and then the Qhat(2x2) PART + // Recall in fact that Q^hat = tau_1 Q tau_1 + // Here the M_oe Mee^-1 M_eo implementation + // the re-ordering in s and c components is due to tau_1 + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+3]); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+2]); + + assign_mul_one_sw_pm_imu_eps(EE, g_spinor_field[DUM_MATRIX+7], 
g_spinor_field[DUM_MATRIX+6], + g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX], g_mubar, g_epsbar); + clover_inv_nd(EE, g_spinor_field[DUM_MATRIX+6], g_spinor_field[DUM_MATRIX+7]); + + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+6]); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX+7]); + + clover_gamma5_nd(OO, l_charm, l_strange, + g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+3], + g_spinor_field[DUM_MATRIX+1], g_spinor_field[DUM_MATRIX], + g_mubar, -g_epsbar); + + /* At the end, the normalisation by the max. eigenvalue */ + /* Twice phmc_invmaxev since we consider here D Ddag !!! */ + mul_r(l_charm, phmc_invmaxev*phmc_invmaxev, l_charm, VOLUME/2); + mul_r(l_strange, phmc_invmaxev*phmc_invmaxev, l_strange, VOLUME/2); + return; +} + + + +/****************************************** + * + * This is the implementation of + * + * Q_tau1_sub_const_ndpsi = Cpol*( M - z_k ) + * + * with M = Qhat(2x2) tau_1 and z_k \in Complex + * + * + * needed in the evaluation of the forces when + * the Polynomial approximation is used + * + * + * For details, see documentation and comments of the + * above mentioned routines + * + * k_charm and k_strange are the input fields + * l_* the output fields + * + * it acts only on the odd part or only + * on a half spinor + ******************************************/ +void Q_tau1_sub_const_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm, + const _Complex double z, const double Cpol, const double invev) { + + spinor *r, *s; + su3_vector ALIGN phi1; + + /* tau_1 inverts the k_charm <-> k_strange spinors */ + /* Apply first Qhat(2x2) and finally substract the constant */ + + /* Here the M_oe Mee^-1 M_eo implementation */ + + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], k_charm); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k_strange); + + M_ee_inv_ndpsi(g_spinor_field[DUM_MATRIX+3], 
g_spinor_field[DUM_MATRIX+2], + g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], + g_mubar, g_epsbar); + + Hopping_Matrix(OE, l_strange, g_spinor_field[DUM_MATRIX+3]); + Hopping_Matrix(OE, l_charm, g_spinor_field[DUM_MATRIX+2]); + + /* Here the M_oo implementation */ + M_oo_sub_g5_ndpsi(g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], k_charm, k_strange, + l_strange, l_charm, + -g_mubar, -g_epsbar); + + /* At the end, the normalisation by the max. eigenvalue */ + mul_r(l_strange, Cpol*invev, g_spinor_field[DUM_MATRIX], VOLUME/2); + mul_r(l_charm, Cpol*invev, g_spinor_field[DUM_MATRIX+1], VOLUME/2); + + /* Finally, we add k to l and multiply all */ + /* by the constant phmc_Cpol */ + /* which renders the polynomial in monomials */ + /* identical to the polynomial a la clenshaw */; +#ifdef OMP +#pragma omp parallel for private(r) private(s) private(phi1) +#endif + for(int ix = 0; ix < (VOLUME/2); ix++){ + + r=l_strange + ix; + s=k_strange + ix; + + _complex_times_vector(phi1, Cpol*z, s->s0); + _vector_sub_assign(r->s0, phi1); + _complex_times_vector(phi1, Cpol*z, s->s1); + _vector_sub_assign(r->s1, phi1); + _complex_times_vector(phi1, Cpol*z, s->s2); + _vector_sub_assign(r->s2, phi1); + _complex_times_vector(phi1, Cpol*z, s->s3); + _vector_sub_assign(r->s3, phi1); + + r=l_charm + ix; + s=k_charm + ix; + + _complex_times_vector(phi1, Cpol*z, s->s0); + _vector_sub_assign(r->s0, phi1); + _complex_times_vector(phi1, Cpol*z, s->s1); + _vector_sub_assign(r->s1, phi1); + _complex_times_vector(phi1, Cpol*z, s->s2); + _vector_sub_assign(r->s2, phi1); + _complex_times_vector(phi1, Cpol*z, s->s3); + _vector_sub_assign(r->s3, phi1); + } + return; +} + +void Qsw_tau1_sub_const_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm, + const _Complex double z, const double Cpol, const double invev) { + + spinor *r, *s; + su3_vector ALIGN phi1; + + /* tau_1 inverts the k_charm <-> k_strange spinors */ + /* Apply 
first Qhat(2x2) and finally substract the constant */ + + /* Here the M_oe Mee^-1 M_eo implementation */ + + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], k_charm); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k_strange); + + assign_mul_one_sw_pm_imu_eps(EE, g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+2], + g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], -g_mubar, g_epsbar); + clover_inv_nd(EE, g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX+3]); + + Hopping_Matrix(OE, l_strange, g_spinor_field[DUM_MATRIX+3]); + Hopping_Matrix(OE, l_charm, g_spinor_field[DUM_MATRIX+2]); + + /* Here the M_oo implementation */ + clover_gamma5_nd(OO, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], + k_charm, k_strange, + l_strange, l_charm, + -g_mubar, -g_epsbar); + + /* At the end, the normalisation by the max. eigenvalue */ + mul_r(l_strange, Cpol*invev, g_spinor_field[DUM_MATRIX], VOLUME/2); + mul_r(l_charm, Cpol*invev, g_spinor_field[DUM_MATRIX+1], VOLUME/2); + + /* Finally, we add k to l and multiply all */ + /* by the constant phmc_Cpol */ + /* which renders the polynomial in monomials */ + /* identical to the polynomial a la clenshaw */; +#ifdef OMP +#pragma omp parallel for private(r) private(s) private(phi1) +#endif + for(int ix = 0; ix < (VOLUME/2); ix++){ + + r=l_strange + ix; + s=k_strange + ix; + + _complex_times_vector(phi1, Cpol*z, s->s0); + _vector_sub_assign(r->s0, phi1); + _complex_times_vector(phi1, Cpol*z, s->s1); + _vector_sub_assign(r->s1, phi1); + _complex_times_vector(phi1, Cpol*z, s->s2); + _vector_sub_assign(r->s2, phi1); + _complex_times_vector(phi1, Cpol*z, s->s3); + _vector_sub_assign(r->s3, phi1); + + r=l_charm + ix; + s=k_charm + ix; + + _complex_times_vector(phi1, Cpol*z, s->s0); + _vector_sub_assign(r->s0, phi1); + _complex_times_vector(phi1, Cpol*z, s->s1); + _vector_sub_assign(r->s1, phi1); + _complex_times_vector(phi1, Cpol*z, s->s2); + _vector_sub_assign(r->s2, phi1); + _complex_times_vector(phi1, 
Cpol*z, s->s3); + _vector_sub_assign(r->s3, phi1); + } + return; +} + + + + +/****************************************** + * + * This is the same implementation as above of + * + * Qhat(2x2) Qhat(2x2)^dagger + * + * + * but now input and output are bispinors !!!! + * + * For details, see documentation and comments of the + * above mentioned routines + * + * k_charm and k_strange are the input fields + * l_* the output fields + * + * it acts only on the odd part or only + * on a half spinor + ******************************************/ +void Qtm_pm_ndbipsi(bispinor * const bisp_l, bispinor * const bisp_k) { + + /* create 2 spinors out of 1 (input) bispinor */ + decompact(g_spinor_field[DUM_MATRIX+6], g_spinor_field[DUM_MATRIX+7], bisp_k); + + Qtm_pm_ndpsi(g_spinor_field[DUM_MATRIX+6], g_spinor_field[DUM_MATRIX+7], + g_spinor_field[DUM_MATRIX+6], g_spinor_field[DUM_MATRIX+7]); + + /* create 1 (output) bispinor out of 2 spinors */ + compact(bisp_l, g_spinor_field[DUM_MATRIX+6], g_spinor_field[DUM_MATRIX+7]); + return; +} + +void Qsw_pm_ndbipsi(bispinor * const bisp_l, bispinor * const bisp_k) { + + /* create 2 spinors out of 1 (input) bispinor */ + decompact(g_spinor_field[DUM_MATRIX+6], g_spinor_field[DUM_MATRIX+7], bisp_k); + + Qsw_pm_ndpsi(g_spinor_field[DUM_MATRIX+6], g_spinor_field[DUM_MATRIX+7], + g_spinor_field[DUM_MATRIX+6], g_spinor_field[DUM_MATRIX+7]); + + /* create 1 (output) bispinor out of 2 spinors */ + compact(bisp_l, g_spinor_field[DUM_MATRIX+6], g_spinor_field[DUM_MATRIX+7]); + return; +} + + +/****************************************** + * + * This is the implementation of + * + * (M_{ee}^\pm)^{-1}M_{eo} tau^1 + * + * see documentation for details + * k is the number of the input field + * l is the number of the output field + * + * it acts only on the odd part or only + * on a half spinor + ******************************************/ + +void H_eo_tm_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const 
k_charm, + const int ieo) { + /* recall: strange <-> up while charm <-> dn */ + Hopping_Matrix(ieo, g_spinor_field[DUM_MATRIX], k_strange); + Hopping_Matrix(ieo, g_spinor_field[DUM_MATRIX+1], k_charm); + + M_ee_inv_ndpsi(l_charm, l_strange, + g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], + -g_mubar, g_epsbar); + return; +} + +void H_eo_sw_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm) { + + /* recall: strange <-> up while charm <-> dn */ + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], k_strange); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k_charm); + + assign_mul_one_sw_pm_imu_eps(EE, l_charm, l_strange, + g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], + g_mubar, g_epsbar); + // here the order doesn't matter + clover_inv_nd(EE, l_strange, l_charm); + + return; +} + +// for this routine we need to have sw_invert_nd and sw_term called before hand +// and the clover term must be initialised +void Msw_ee_inv_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm) { + + + /* recall: strange <-> up while charm <-> dn */ + + assign_mul_one_sw_pm_imu_eps(EE, l_strange, l_charm, k_strange, k_charm, -g_mubar, g_epsbar); + + clover_inv_nd(EE, l_strange, l_charm); + return; +} + + + +void Q_test_epsilon(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm){ + + double nrm = 1./(1.+g_mubar*g_mubar-g_epsbar*g_epsbar); + + /* Here the M_oe Mee^-1 M_eo implementation */ + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], k_strange); + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+1], k_charm); + + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+2], g_spinor_field[DUM_MATRIX]); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+3], g_spinor_field[DUM_MATRIX+1]); + + assign_add_mul_r(k_strange, g_spinor_field[DUM_MATRIX+2], nrm, VOLUME/2); + assign_add_mul_r(k_charm, g_spinor_field[DUM_MATRIX+3], nrm, 
VOLUME/2); + + mul_r(l_strange, -2, k_strange, VOLUME/2); + mul_r(l_charm, -2, k_charm, VOLUME/2); + + /* and finally the gamma_5 multiplication */ + gamma5(l_strange, l_strange, VOLUME/2); + gamma5(l_charm, l_charm, VOLUME/2); + + /* At the end, the normalisation by the max. eigenvalue */ + mul_r(l_charm, phmc_invmaxev, l_charm, VOLUME/2); + mul_r(l_strange, phmc_invmaxev, l_strange, VOLUME/2); + return; +} + + +void mul_one_pm_itau2(spinor * const p, spinor * const q, + spinor * const r, spinor * const s, + const double sign, const int N) { + double fac = 1./sqrt(2.); + + if(sign > 0) { + add(p, r, s, N); + diff(q, s, r, N); + } + else { + diff(p, r, s, N); + add(q, r, s, N); + } + mul_r(p, fac, p, N); + mul_r(q, fac, q, N); +} + +void mul_one_pm_iconst(spinor * const l, spinor * const k, + const double mu_, const int sign_) { +#ifdef OMP +#pragma omp parallel + { +#endif + + spinor *r, *s; + su3_vector ALIGN phi1; + double mu = mu_; + if(sign_ < 0) { + mu = -mu_; + } + + /************ loop over all lattice sites ************/ +#ifdef OMP +#pragma omp for +#endif + for(unsigned int ix = 0; ix < (VOLUME/2); ++ix){ + r=l + ix; + s=k + ix; + /* Multiply the spinorfield with 1+imu\gamma_5 */ + _complex_times_vector(phi1, (1. + mu * I), s->s0); + _vector_assign(r->s0, phi1); + _complex_times_vector(phi1, (1. + mu * I), s->s1); + _vector_assign(r->s1, phi1); + _complex_times_vector(phi1, (1. - mu * I), s->s2); + _vector_assign(r->s2, phi1); + _complex_times_vector(phi1, (1. 
- mu * I), s->s3); + _vector_assign(r->s3, phi1); + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + + return; +} + +// l_ and k_ are allowed to be the same spinors +void M_ee_inv_ndpsi(spinor * const l_s, spinor * const l_c, + spinor * const k_s, spinor * const k_c, + const double mu, const double eps) { +#ifdef OMP +#pragma omp parallel + { +#endif + double nrm = 1./(1.+ mu*mu - eps*eps); + spinor *r_s, *r_c, *s_s, *s_c; + su3_vector ALIGN phi1, phi2; + +#ifdef OMP +#pragma omp for +#endif + for(unsigned int ix = 0; ix < (VOLUME/2); ++ix){ + r_s = l_s + ix; + r_c = l_c + ix; + s_s = k_s + ix; + s_c = k_c + ix; + + _complex_times_vector(phi1, (1. - mu * I), s_s->s0); + _vector_add_mul(phi1, eps, s_c->s0); + _complex_times_vector(phi2, (1. + mu * I), s_c->s0); + _vector_add_mul(phi2, eps, s_s->s0); + _vector_mul(r_s->s0, nrm, phi1); + _vector_mul(r_c->s0, nrm, phi2); + + _complex_times_vector(phi1, (1. - mu * I), s_s->s1); + _vector_add_mul(phi1, eps, s_c->s1); + _complex_times_vector(phi2, (1. + mu * I), s_c->s1); + _vector_add_mul(phi2, eps, s_s->s1); + _vector_mul(r_s->s1, nrm, phi1); + _vector_mul(r_c->s1, nrm, phi2); + + _complex_times_vector(phi1, (1. + mu * I), s_s->s2); + _vector_add_mul(phi1, eps, s_c->s2); + _complex_times_vector(phi2, (1. - mu * I), s_c->s2); + _vector_add_mul(phi2, eps, s_s->s2); + _vector_mul(r_s->s2, nrm, phi1); + _vector_mul(r_c->s2, nrm, phi2); + + _complex_times_vector(phi1, (1. + mu * I), s_s->s3); + _vector_add_mul(phi1, eps, s_c->s3); + _complex_times_vector(phi2, (1. 
- mu * I), s_c->s3); + _vector_add_mul(phi2, eps, s_s->s3); + _vector_mul(r_s->s3, nrm, phi1); + _vector_mul(r_c->s3, nrm, phi2); + + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + + return; +} + + +// l_ and k_ are allowed to be the same spinors +void M_oo_sub_g5_ndpsi(spinor * const l_s, spinor * const l_c, + spinor * const k_s, spinor * const k_c, + spinor * const j_s, spinor * const j_c, + const double mu, const double eps) { +#ifdef OMP +#pragma omp parallel + { +#endif + spinor *r_s, *r_c, *s_s, *s_c, *t_s, *t_c; + su3_vector ALIGN phi1, phi2; + +#ifdef OMP +#pragma omp for +#endif + for(unsigned int ix = 0; ix < (VOLUME/2); ++ix){ + r_s = l_s + ix; + r_c = l_c + ix; + s_s = k_s + ix; + s_c = k_c + ix; + t_s = j_s + ix; + t_c = j_c + ix; + + _complex_times_vector(phi1, (1. - mu * I), s_s->s0); + _vector_add_mul(phi1, eps, s_c->s0); + _complex_times_vector(phi2, (1. + mu * I), s_c->s0); + _vector_add_mul(phi2, eps, s_s->s0); + _vector_sub(r_s->s0, phi1, t_s->s0); + _vector_sub(r_c->s0, phi2, t_c->s0); + + _complex_times_vector(phi1, (1. - mu * I), s_s->s1); + _vector_add_mul(phi1, eps, s_c->s1); + _complex_times_vector(phi2, (1. + mu * I), s_c->s1); + _vector_add_mul(phi2, eps, s_s->s1); + _vector_sub(r_s->s1, phi1, t_s->s1); + _vector_sub(r_c->s1, phi2, t_c->s1); + + _complex_times_vector(phi1, (1. + mu * I), s_s->s2); + _vector_add_mul(phi1, eps, s_c->s2); + _complex_times_vector(phi2, (1. - mu * I), s_c->s2); + _vector_add_mul(phi2, eps, s_s->s2); + _vector_sub(r_s->s2, t_s->s2, phi1); + _vector_sub(r_c->s2, t_c->s2, phi2); + + _complex_times_vector(phi1, (1. + mu * I), s_s->s3); + _vector_add_mul(phi1, eps, s_c->s3); + _complex_times_vector(phi2, (1. 
- mu * I), s_c->s3); + _vector_add_mul(phi2, eps, s_s->s3); + _vector_sub(r_s->s3, t_s->s3, phi1); + _vector_sub(r_c->s3, t_c->s3, phi2); + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + + return; +} + + +/* calculates P(Q Q^dagger) for the nondegenerate case */ + +void P_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm){ + + + + int j; + spinor *dum_up,*dum_dn; + dum_up=g_chi_up_spinor_field[DUM_MATRIX]; + dum_dn=g_chi_dn_spinor_field[DUM_MATRIX]; + + assign(dum_up,k_strange,VOLUME/2); + assign(dum_dn,k_charm,VOLUME/2); + + for(j = 0; j < (2*phmc_dop_n_cheby -2); j++) { + if(j>0) { + assign(dum_up,l_strange,VOLUME/2); + assign(dum_dn,l_charm,VOLUME/2); + } + + Q_tau1_sub_const_ndpsi(l_strange, l_charm, + dum_up, dum_dn, + phmc_root[j], phmc_Cpol, phmc_invmaxev); + } + return; +} + + +/* calculates Q * \tau^1 for the nondegenerate case */ +void Qtau1_P_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm){ + + + spinor * dum_up,* dum_dn; + dum_up = g_chi_up_spinor_field[DUM_MATRIX+1]; + dum_dn = g_chi_dn_spinor_field[DUM_MATRIX+1]; + + P_ndpsi(l_strange, l_charm, k_strange, k_charm); + + assign(dum_up, l_strange, VOLUME/2); + assign(dum_dn, l_charm, VOLUME/2); + + Qtm_ndpsi(l_strange, l_charm, dum_dn, dum_up); + return; +} + + + +/* this is necessary for the calculation of the polynomial */ + +void Qtm_pm_sub_const_nrm_psi(spinor * const l, spinor * const k, + const _Complex double z){ + su3_vector ALIGN phi1; + spinor *r,*s; + int ix; + + Qtm_pm_psi(l, k); + mul_r(l, phmc_invmaxev, l, VOLUME/2); + + /* AND FINALLY WE SUBTRACT THE C-CONSTANT */ + + + /************ loop over all lattice sites ************/ +#ifdef OMP +#pragma omp parallel for private(ix) private(r) private(s) private(phi1) +#endif + for(ix = 0; ix < (VOLUME/2); ix++){ + + r=l + ix; + s=k + ix; + + _complex_times_vector(phi1, z, s->s0); + _vector_sub_assign(r->s0, phi1); +
_complex_times_vector(phi1, z, s->s1); + _vector_sub_assign(r->s1, phi1); + _complex_times_vector(phi1, z, s->s2); + _vector_sub_assign(r->s2, phi1); + _complex_times_vector(phi1, z, s->s3); + _vector_sub_assign(r->s3, phi1); + } + + mul_r(l, phmc_Cpol, l, VOLUME/2); + return; +} + +/* calculate a polynomial in (Q+)*(Q-) */ + + +void Ptm_pm_psi(spinor * const l, spinor * const k){ + + int j; + spinor *spinDum; + spinDum=g_spinor_field[DUM_MATRIX+2]; + + assign(spinDum,k,VOLUME/2); + + + for(j=0; j<(2*phmc_dop_n_cheby -2); j++){ + if(j>0) { + assign(spinDum,l,VOLUME/2); + } + + Qtm_pm_sub_const_nrm_psi(l,spinDum,phmc_root[j]); + } + return; +} + +/* ********************************************** + * Qpm * P(Qpm) + * this operator is necessary for the inverter + ************************************************/ + +void Qtm_pm_Ptm_pm_psi(spinor * const l, spinor * const k){ + spinor * spinDum; + + spinDum=g_spinor_field[DUM_MATRIX+3]; + Ptm_pm_psi(l,k); + assign(spinDum,l,VOLUME/2); + Qtm_pm_psi(l,spinDum); + return; +} + + +/* ************************************************ + * for noise reduction + * this implements + * a = B^dagger H b + * + * with Hopping matrix H and + * + * B = (1-i\g5\tau^1\musigma-\tau^3\mudelta)/c + * where + * c = 1+\musigma^2-\mudelta^2 + * + * so it is in the convention of hep-lat/0606011 + * not in the internal one, see documentation + * + **************************************************/ + +void red_noise_nd(spinor * const lse, spinor * const lso, + spinor * const lce, spinor * const lco) +{ + + double nrm0 = (1.-g_epsbar)/(1+g_mubar*g_mubar-g_epsbar*g_epsbar); + double nrm1 = (1.+g_epsbar)/(1+g_mubar*g_mubar-g_epsbar*g_epsbar); + _Complex double z; + int ix, i; + su3_vector ALIGN phi; + spinor * r, * s; + + /* need B^\dagger, so change sign of g_mubar */ + z = (g_mubar / (1 + g_mubar * g_mubar - g_epsbar * g_epsbar)) * I; + + /* first multiply with Hopping matrix */ + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX], lso); +
Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+1], lse); + + Hopping_Matrix(EO, g_spinor_field[DUM_MATRIX+2], lco); + Hopping_Matrix(OE, g_spinor_field[DUM_MATRIX+3], lce); + + /* now with A^{-1}*/ + mul_r(lse, nrm0, g_spinor_field[DUM_MATRIX], VOLUME/2); + mul_r(lso, nrm0, g_spinor_field[DUM_MATRIX+1], VOLUME/2); + + mul_r(lce, nrm1, g_spinor_field[DUM_MATRIX+2], VOLUME/2); + mul_r(lco, nrm1, g_spinor_field[DUM_MATRIX+3], VOLUME/2); + + /************ loop over all lattice sites ************/ + for(i = 0; i < 4; i++) { + if(i == 0) { + r = lse, s = g_spinor_field[DUM_MATRIX]; + } + else if(i == 1) { + r = lso, s = g_spinor_field[DUM_MATRIX+1]; + } + else if(i == 2) { + r = lce, s = g_spinor_field[DUM_MATRIX+2]; + } + else { + r = lco, s = g_spinor_field[DUM_MATRIX+3]; + } + for(ix = 0; ix < (VOLUME/2); ix++){ + /* Multiply the spinorfield with (i epsbar \gamma_5)/c */ + /* and add it to */ + _complex_times_vector(phi, z, s->s0); + _vector_add_assign(r->s0, phi); + _complex_times_vector(phi, z, s->s1); + _vector_add_assign(r->s1, phi); + _complex_times_vector(phi, -z, s->s2); + _vector_add_assign(r->s2, phi); + _complex_times_vector(phi, -z, s->s3); + _vector_add_assign(r->s3, phi); + r++; s++; + } + } + return; +} + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_nd.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_nd.h new file mode 100644 index 0000000000000000000000000000000000000000..347f326a545c8d41171a4cc1bd4eb230364d5468 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_nd.h @@ -0,0 +1,94 @@ +/*********************************************************************** + * + * Copyright (C) 2006,2007,2008 Karl Jansen, Thomas Chiarappa, + * Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ + +#ifndef _TM_OPERATTORS_ND_H +#define _TM_OPERATTORS_ND_H + +void mul_one_pm_itau2(spinor * const p, spinor * const q, + spinor * const r, spinor * const s, + const double sign, const int N); + +void Qtm_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm); +void Qsw_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm); + +void Qtm_dagger_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm); +void Qsw_dagger_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm); + +void Qtm_pm_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm); +void Qsw_pm_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm); + +void Qtm_pm_ndbipsi(bispinor * const bisp_l, bispinor * const bisp_k); +void Qsw_pm_ndbipsi(bispinor * const bisp_l, bispinor * const bisp_k); + +void Q_tau1_sub_const_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm, + const _Complex double z, const double Cpol, const double invev); +void
Qsw_tau1_sub_const_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm, + const _Complex double z, const double Cpol, const double invev); + +void H_eo_tm_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm, + const int ieo); +void H_eo_sw_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm); + + +void M_ee_inv_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm, + const double mu, const double eps); + +void Msw_ee_inv_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm); + +void Q_test_epsilon(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm); + +void Qtau1_P_ndpsi(spinor * const l_strange, spinor * const l_charm, + spinor * const k_strange, spinor * const k_charm); + +void Qtm_pm_Ptm_pm_psi(spinor * const l, spinor * const k); + +void Qtm_pm_sub_const_nrm_psi(spinor * const l, spinor * const k,const _Complex double z); + +/* ************************************************ + * for noise reduction + * this implements + * a = B^dagger H b + * + * with Hopping matrix H and + * + * B = (1-i\g5\tau^1\musigma-\tau^3\mudelta)/c + * where + * c = 1+\musigma^2-\mudelta^2 + * + **************************************************/ + +void red_noise_nd(spinor * const lse, spinor * const lso, spinor * const lce, spinor * const lco); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_nd_32.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_nd_32.c new file mode 100644 index 0000000000000000000000000000000000000000..72d6e9f5c3faf784967663fd3fe6d2e9f1e61847 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_nd_32.c @@ -0,0 +1,318 @@ 
+/*********************************************************************** + * + * Copyright (C) 2015 Florian Burger + * based on the corresponding 64 bit operators in tm_operators_nd.c + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * This file contains operators for twisted mass Wilson QCD + * to construct a multiplication with a non-degenerate + * flavour matrix + * + * + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "operator/Hopping_Matrix_32.h" +#include "phmc.h" +#include "gamma.h" +#include "linalg_eo.h" +#include "operator/tm_operators_32.h" +#include "operator/tm_operators_nd.h" +#include "operator/clovertm_operators_32.h" +#include "operator/D_psi_32.h" +#include "tm_operators_nd_32.h" + + + +void sub_epsbar_tau1_32(spinor32 * const l_strange, spinor32 * const l_charm , spinor32 * const k_strange, spinor32 * const k_charm){ + mul_r_32(g_spinor_field32[2], (float) g_epsbar, k_strange , VOLUME); + mul_r_32(g_spinor_field32[3], (float) g_epsbar, k_charm, VOLUME); + diff_32(l_strange, l_strange, g_spinor_field32[3], VOLUME); + diff_32(l_charm, l_charm, g_spinor_field32[2], VOLUME); +} + + +void Q_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm, spinor32 * const k_strange, spinor32 * const k_charm) +{ 
+ + //D_h^{dagger} + //tau^1 by s<->c + + D_psi_32(l_strange, k_charm); + g_mu = -g_mu; + D_psi_32(l_charm, k_strange); + g_mu = -g_mu; + + sub_epsbar_tau1_32(l_strange, l_charm, k_charm, k_strange); + + gamma5_32(g_spinor_field32[0], l_strange, VOLUME); + gamma5_32(g_spinor_field32[1], l_charm, VOLUME); + + //D_h + //tau^1 by s<->c + D_psi_32(l_strange, g_spinor_field32[1]); + g_mu = -g_mu; + D_psi_32(l_charm, g_spinor_field32[0]); + g_mu = -g_mu; + sub_epsbar_tau1_32(l_strange, l_charm, g_spinor_field32[1], g_spinor_field32[0]); + + gamma5_32(l_strange, l_strange, VOLUME); + gamma5_32(l_charm, l_charm, VOLUME); + /* At the end, the normalisation by the max. eigenvalue */ + /* Twice phmc_invmaxev since we consider here D Ddag !!! */ + mul_r_32(l_charm, (float) phmc_invmaxev*phmc_invmaxev, l_charm, VOLUME); + mul_r_32(l_strange, (float) phmc_invmaxev*phmc_invmaxev, l_strange, VOLUME); + +} + +// l_ and k_ are allowed to be the same spinors +void M_ee_inv_ndpsi_32_orphaned(spinor32 * const l_s, spinor32 * const l_c, + spinor32 * const k_s, spinor32 * const k_c, + const float mu, const float eps) { + float nrm = 1./(1.+ mu*mu - eps*eps); + spinor32 *r_s, *r_c, *s_s, *s_c; + su3_vector32 ALIGN32 phi1, phi2; + +#ifdef OMP +#pragma omp for +#endif + for(unsigned int ix = 0; ix < (VOLUME/2); ++ix){ + r_s = l_s + ix; + r_c = l_c + ix; + s_s = k_s + ix; + s_c = k_c + ix; + + _complex_times_vector(phi1, (1. - mu * I), s_s->s0); + _vector_add_mul(phi1, eps, s_c->s0); + _complex_times_vector(phi2, (1. + mu * I), s_c->s0); + _vector_add_mul(phi2, eps, s_s->s0); + _vector_mul(r_s->s0, nrm, phi1); + _vector_mul(r_c->s0, nrm, phi2); + + _complex_times_vector(phi1, (1. - mu * I), s_s->s1); + _vector_add_mul(phi1, eps, s_c->s1); + _complex_times_vector(phi2, (1. + mu * I), s_c->s1); + _vector_add_mul(phi2, eps, s_s->s1); + _vector_mul(r_s->s1, nrm, phi1); + _vector_mul(r_c->s1, nrm, phi2); + + _complex_times_vector(phi1, (1. 
+ mu * I), s_s->s2); + _vector_add_mul(phi1, eps, s_c->s2); + _complex_times_vector(phi2, (1. - mu * I), s_c->s2); + _vector_add_mul(phi2, eps, s_s->s2); + _vector_mul(r_s->s2, nrm, phi1); + _vector_mul(r_c->s2, nrm, phi2); + + _complex_times_vector(phi1, (1. + mu * I), s_s->s3); + _vector_add_mul(phi1, eps, s_c->s3); + _complex_times_vector(phi2, (1. - mu * I), s_c->s3); + _vector_add_mul(phi2, eps, s_s->s3); + _vector_mul(r_s->s3, nrm, phi1); + _vector_mul(r_c->s3, nrm, phi2); + + } +} + +void M_ee_inv_ndpsi_32(spinor32 * const l_s, spinor32 * const l_c, + spinor32 * const k_s, spinor32 * const k_c, + const float mu, const float eps) { +#ifdef OMP +#pragma omp parallel + { +#endif + M_ee_inv_ndpsi_32_orphaned(l_s, l_c, k_s, k_c, mu, eps); +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} + +// l_ and k_ are allowed to be the same spinors +void M_oo_sub_g5_ndpsi_32_orphaned(spinor32 * const l_s, spinor32 * const l_c, + spinor32 * const k_s, spinor32 * const k_c, + spinor32 * const j_s, spinor32 * const j_c, + const float mu, const float eps) { + spinor32 *r_s, *r_c, *s_s, *s_c, *t_s, *t_c; + su3_vector32 ALIGN32 phi1, phi2; + +#ifdef OMP +#pragma omp for +#endif + for(unsigned int ix = 0; ix < (VOLUME/2); ++ix){ + r_s = l_s + ix; + r_c = l_c + ix; + s_s = k_s + ix; + s_c = k_c + ix; + t_s = j_s + ix; + t_c = j_c + ix; + + _complex_times_vector(phi1, (1. - mu * I), s_s->s0); + _vector_add_mul(phi1, eps, s_c->s0); + _complex_times_vector(phi2, (1. + mu * I), s_c->s0); + _vector_add_mul(phi2, eps, s_s->s0); + _vector_sub(r_s->s0, phi1, t_s->s0); + _vector_sub(r_c->s0, phi2, t_c->s0); + + _complex_times_vector(phi1, (1. - mu * I), s_s->s1); + _vector_add_mul(phi1, eps, s_c->s1); + _complex_times_vector(phi2, (1. + mu * I), s_c->s1); + _vector_add_mul(phi2, eps, s_s->s1); + _vector_sub(r_s->s1, phi1, t_s->s1); + _vector_sub(r_c->s1, phi2, t_c->s1); + + _complex_times_vector(phi1, (1. 
+ mu * I), s_s->s2); + _vector_add_mul(phi1, eps, s_c->s2); + _complex_times_vector(phi2, (1. - mu * I), s_c->s2); + _vector_add_mul(phi2, eps, s_s->s2); + _vector_sub(r_s->s2, t_s->s2, phi1); + _vector_sub(r_c->s2, t_c->s2, phi2); + + _complex_times_vector(phi1, (1. + mu * I), s_s->s3); + _vector_add_mul(phi1, eps, s_c->s3); + _complex_times_vector(phi2, (1. - mu * I), s_c->s3); + _vector_add_mul(phi2, eps, s_s->s3); + _vector_sub(r_s->s3, t_s->s3, phi1); + _vector_sub(r_c->s3, t_c->s3, phi2); + } +} + +void M_oo_sub_g5_ndpsi_32(spinor32 * const l_s, spinor32 * const l_c, + spinor32 * const k_s, spinor32 * const k_c, + spinor32 * const j_s, spinor32 * const j_c, + const float mu, const float eps) { +#ifdef OMP +#pragma omp parallel + { +#endif + M_oo_sub_g5_ndpsi_32_orphaned(l_s,l_c,k_s,k_c,j_s,j_c,mu,eps); +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} + +void Qtm_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm, + spinor32 * const k_strange, spinor32 * const k_charm){ +#ifdef OMP +#pragma omp parallel + { +#endif + /* first the Qhat(2x2)^dagger PART*/ + /* Here the M_oe Mee^-1 M_eo implementation */ + Hopping_Matrix_32_orphaned(EO, g_spinor_field32[0], k_charm); + Hopping_Matrix_32_orphaned(EO, g_spinor_field32[1], k_strange); + + M_ee_inv_ndpsi_32_orphaned(g_spinor_field32[2], g_spinor_field32[3], + g_spinor_field32[0], g_spinor_field32[1], + (float) g_mubar, (float) g_epsbar); + + Hopping_Matrix_32_orphaned(OE, g_spinor_field32[0], g_spinor_field32[2]); + Hopping_Matrix_32_orphaned(OE, g_spinor_field32[1], g_spinor_field32[3]); + + /* Here the M_oo implementation */ + M_oo_sub_g5_ndpsi_32_orphaned(g_spinor_field32[2], g_spinor_field32[3], k_charm, k_strange, + g_spinor_field32[0], g_spinor_field32[1], + (float)(-g_mubar), (float)(-g_epsbar)); + /* We have to reassigin as follows to avoid overwriting */ + /* Recall in fact that Q^hat = tau_1 Q tau_1 , hence */ + /* and then the Qhat(2x2) PART */ + + /* Here the M_oe Mee^-1 M_eo 
implementation */ + Hopping_Matrix_32_orphaned(EO, g_spinor_field32[0], g_spinor_field32[3]); + Hopping_Matrix_32_orphaned(EO, g_spinor_field32[1], g_spinor_field32[2]); + + M_ee_inv_ndpsi_32_orphaned(g_spinor_field32[5], g_spinor_field32[4], + g_spinor_field32[1], g_spinor_field32[0], + (float)(-g_mubar), (float)g_epsbar); + + Hopping_Matrix_32_orphaned(OE, l_strange, g_spinor_field32[4]); + Hopping_Matrix_32_orphaned(OE, l_charm, g_spinor_field32[5]); + + /* Here the M_oo implementation */ + M_oo_sub_g5_ndpsi_32_orphaned(l_strange, l_charm, g_spinor_field32[3], g_spinor_field32[2], + l_strange, l_charm, (float)(-g_mubar), (float)(-g_epsbar)); + /* At the end, the normalisation by the max. eigenvalue */ + /* Twice phmc_invmaxev since we consider here D Ddag !!! */ + mul_r_32_orphaned(l_charm, (float) phmc_invmaxev*phmc_invmaxev, l_charm, VOLUME/2); + mul_r_32_orphaned(l_strange, (float) phmc_invmaxev*phmc_invmaxev, l_strange, VOLUME/2); +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} + +void Qsw_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm, + spinor32 * const k_strange, spinor32 * const k_charm) { +#ifdef OMP +#pragma omp parallel + { +#endif + /* FIRST THE Qhat(2x2)^dagger PART*/ + /* Here the M_oe Mee^-1 M_eo implementation */ + Hopping_Matrix_32_orphaned(EO, g_spinor_field32[0], k_charm); + Hopping_Matrix_32_orphaned(EO, g_spinor_field32[1], k_strange); + + assign_mul_one_sw_pm_imu_eps_32_orphaned(EE, g_spinor_field32[2], g_spinor_field32[3], + g_spinor_field32[0], g_spinor_field32[1], -g_mubar, g_epsbar); + clover_inv_nd_32_orphaned(EE, g_spinor_field32[2], g_spinor_field32[3]); + + Hopping_Matrix_32_orphaned(OE, g_spinor_field32[0], g_spinor_field32[2]); + Hopping_Matrix_32_orphaned(OE, g_spinor_field32[1], g_spinor_field32[3]); + + // Here the M_oo implementation + clover_gamma5_nd_32_orphaned(OO, g_spinor_field32[2], g_spinor_field32[3], + k_charm, k_strange, + g_spinor_field32[0], g_spinor_field32[1], + -g_mubar, 
-g_epsbar); + + // and then the Qhat(2x2) PART + // Recall in fact that Q^hat = tau_1 Q tau_1 + // Here the M_oe Mee^-1 M_eo implementation + // the re-ordering in s and c components is due to tau_1 + Hopping_Matrix_32_orphaned(EO, g_spinor_field32[0], g_spinor_field32[3]); + Hopping_Matrix_32_orphaned(EO, g_spinor_field32[1], g_spinor_field32[2]); + + assign_mul_one_sw_pm_imu_eps_32_orphaned(EE, g_spinor_field32[4], g_spinor_field32[5], + g_spinor_field32[1], g_spinor_field32[0], g_mubar, g_epsbar); + clover_inv_nd_32_orphaned(EE, g_spinor_field32[4], g_spinor_field32[5]); + + Hopping_Matrix_32_orphaned(OE, g_spinor_field32[0], g_spinor_field32[5]); + Hopping_Matrix_32_orphaned(OE, g_spinor_field32[1], g_spinor_field32[4]); + + clover_gamma5_nd_32_orphaned(OO, l_charm, l_strange, + g_spinor_field32[2], g_spinor_field32[3], + g_spinor_field32[1], g_spinor_field32[0], + g_mubar, -g_epsbar); + + /* At the end, the normalisation by the max. eigenvalue */ + /* Twice phmc_invmaxev since we consider here D Ddag !!! */ + mul_r_32_orphaned(l_charm, phmc_invmaxev*phmc_invmaxev, l_charm, VOLUME/2); + mul_r_32_orphaned(l_strange, phmc_invmaxev*phmc_invmaxev, l_strange, VOLUME/2); + +#ifdef OMP /* OpenMP parallel closing brace */ + } +#endif + + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_nd_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_nd_32.h new file mode 100644 index 0000000000000000000000000000000000000000..fedc818f702e8ecc9d4ffe43d3899ec3d0803cd3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_operators_nd_32.h @@ -0,0 +1,30 @@ +/*********************************************************************** + * + * Copyright (C) 2015 Florian Burger + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ + +#ifndef _TM_OPERATORS_ND_32_H +#define _TM_OPERATORS_ND_32_H + +void Q_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm, spinor32 * const k_strange, spinor32 * const k_charm); + +void Qtm_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm, + spinor32 * const k_strange, spinor32 * const k_charm); +void Qsw_pm_ndpsi_32(spinor32 * const l_strange, spinor32 * const l_charm, + spinor32 * const k_strange, spinor32 * const k_charm); +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_sub_Hopping_Matrix.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_sub_Hopping_Matrix.c new file mode 100644 index 0000000000000000000000000000000000000000..dd96ca221a87d83272e8e5ef8232986d1cf345bf --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_sub_Hopping_Matrix.c @@ -0,0 +1,157 @@ +/********************************************************************** + * + * Copyright (C) 2012 Carsten Urbach + * + * This file is based on an implementation of the Dirac operator + * written by Martin Luescher, modified by Martin Hasenbusch in 2002 + * and modified and extended by Carsten Urbach from 2003-2008 + * + * This file is part of tmLQCD.
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * + * Hopping_Matrix is the conventional Wilson + * hopping matrix + * + ****************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#ifdef OMP +#include +#endif +#include +#include "global.h" +#include "su3.h" +#ifdef BGQ +# include"DirectPut.h" +#endif +#ifdef MPI +# include "xchange/xchange.h" +#endif +#include "boundary.h" +#include "init/init_dirac_halfspinor.h" +#include "update_backward_gauge.h" +#include "tm_sub_Hopping_Matrix.h" + +// now comes the definition of tm_sub_Hopping_Matrix +// which does (a + g5 i b - Hopping_Matrix) +// where cfactor = a + i b +// +// Both variants first call update_backward_gauge(g_gauge_field) when _GAUGE_COPY is defined +// and g_update_gauge_copy is set; the arithmetic itself comes from the included body file, +// compiled with _TM_SUB_HOP defined. The non-halfspinor variant also calls +// xchange_field(k, ieo) when MPI is defined. +// NOTE(review): the halfspinor variant is selected by _USE_HALFSPINOR alone, the other one +// requires _NO_COMM to be undefined; with _NO_COMM defined and _USE_HALFSPINOR undefined +// this file emits no definition. +// NOTE(review): the roles of l, p and k are fixed by the included body files - confirm there. +// + +#if (defined _USE_HALFSPINOR) +# include "operator/halfspinor_hopping.h" + +# if ((defined SSE2)||(defined SSE3)) +# include "sse.h" + +# elif (defined BGL && defined XLC) +# include "bgl.h" + +# elif (defined BGQ && defined XLC) +# include "bgq.h" +# include "bgq2.h" +# include "xlc_prefetch.h" + +# endif + +void tm_sub_Hopping_Matrix(const int ieo, spinor * const l, spinor * const p, spinor * const k, + complex double const cfactor) { + +# ifdef _GAUGE_COPY + if(g_update_gauge_copy) { + update_backward_gauge(g_gauge_field); + } +# endif + +# ifdef OMP +# pragma omp parallel + { + su3 * restrict u0 ALIGN; +# endif + +# define _TM_SUB_HOP + spinor * pn; +# if (defined BGQ && defined XLC) + complex double ALIGN bla = 
cfactor; + vector4double ALIGN cf = vec_ld2(0, (double*) &bla); +# elif (defined SSE2 || defined SSE3) + _Complex double ALIGN cf = cfactor; + su3_vector ALIGN psi, psi2; +# endif +# include "operator/halfspinor_body.c" +# undef _TM_SUB_HOP +# ifdef OMP + } /* OpenMP closing brace */ +# endif + return; +} + +#elif (!defined _NO_COMM && !defined _USE_HALFSPINOR) +# include "operator/hopping.h" +# if ((defined SSE2)||(defined SSE3)) +# include "sse.h" + +# elif (defined BGL && defined XLC) +# include "bgl.h" + +# elif (defined BGQ && defined XLC) +# include "bgq.h" +# include "bgq2.h" +# include "xlc_prefetch.h" + +# elif defined XLC +# include"xlc_prefetch.h" + +# endif +void tm_sub_Hopping_Matrix(const int ieo, spinor * const l, spinor * p, spinor * const k, + complex double const cfactor) { +# ifdef XLC +# pragma disjoint(*l, *k) +# endif +# ifdef _GAUGE_COPY + if(g_update_gauge_copy) { + update_backward_gauge(g_gauge_field); + } +# endif + +# if (defined MPI) + xchange_field(k, ieo); +# endif + +# ifdef OMP +# pragma omp parallel + { +# endif +# define _TM_SUB_HOP + spinor * pn; +# if (defined BGQ && defined XLC) + complex double ALIGN bla = cfactor; + vector4double ALIGN cf = vec_ld2(0, (double*) &bla); +# elif (defined SSE2 || defined SSE3) + _Complex double ALIGN cf = cfactor; + su3_vector ALIGN psi, psi2; +# endif +# include "operator/hopping_body_dbl.c" +# undef _TM_SUB_HOP +# ifdef OMP + } /* OpenMP closing brace */ +# endif + return; +} +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_sub_Hopping_Matrix.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_sub_Hopping_Matrix.h new file mode 100644 index 0000000000000000000000000000000000000000..cd9f0840c086ef842bfc38090ff898e6021b3bff --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_sub_Hopping_Matrix.h @@ -0,0 +1,27 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 
Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _TM_SUB_HOPPING_MATRIX_H +# define _TM_SUB_HOPPING_MATRIX_H + +# include "su3.h" + +void tm_sub_Hopping_Matrix(const int ieo, spinor * const l, spinor * p, spinor * const k, + complex double const cfactor); +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_times_Hopping_Matrix.c b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_times_Hopping_Matrix.c new file mode 100644 index 0000000000000000000000000000000000000000..183bf737bca07c63cfed3d6e4ea76e9c8df74306 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_times_Hopping_Matrix.c @@ -0,0 +1,153 @@ +/********************************************************************** + * + * Copyright (C) 2012 Carsten Urbach + * + * This file is based on an implementation of the Dirac operator + * written by Martin Luescher, modified by Martin Hasenbusch in 2002 + * and modified and extended by Carsten Urbach from 2003-2008 + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * + * Hopping_Matrix is the conventional Wilson + * hopping matrix + * + ****************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#ifdef OMP +#include +#endif +#include +#include "global.h" +#include "su3.h" +#ifdef BGQ +# include"DirectPut.h" +#endif +#ifdef MPI +# include "xchange/xchange.h" +#endif +#include "boundary.h" +#include "init/init_dirac_halfspinor.h" +#include "update_backward_gauge.h" +#include "tm_times_Hopping_Matrix.h" + +// now comes the definition of tm_times_Hopping_Matrix +// which does (a + g5 i b) * Hopping_Matrix +// where cfactor = a + i b +// +// Both variants first call update_backward_gauge(g_gauge_field) when _GAUGE_COPY is defined +// and g_update_gauge_copy is set; the arithmetic itself comes from the included body file, +// compiled with _MUL_G5_CMPLX defined. The non-halfspinor variant also calls +// xchange_field(k, ieo) when MPI is defined. +// NOTE(review): when _NO_COMM is defined neither branch below is compiled, +// so this file then emits no definition of tm_times_Hopping_Matrix. +// NOTE(review): l is presumably the output field and k the input field - confirm in the body files. +// + +#if (defined _USE_HALFSPINOR && !defined _NO_COMM) +# include "operator/halfspinor_hopping.h" + +# if ((defined SSE2)||(defined SSE3)) +# include "sse.h" + +# elif (defined BGL && defined XLC) +# include "bgl.h" + +# elif (defined BGQ && defined XLC) +# include "bgq.h" +# include "bgq2.h" +# include "xlc_prefetch.h" + +# endif + +void tm_times_Hopping_Matrix(const int ieo, spinor * const l, spinor * const k, complex double const cfactor) { + +# ifdef _GAUGE_COPY + if(g_update_gauge_copy) { + update_backward_gauge(g_gauge_field); + } +# endif + +# ifdef OMP +# pragma omp parallel + { + su3 * restrict u0 ALIGN; +# endif + +# define _MUL_G5_CMPLX +# if (defined BGQ && defined XLC) + complex double ALIGN bla = cfactor; + vector4double ALIGN cf = vec_ld2(0, (double*) &bla); +# elif (defined SSE2 || defined SSE3) + _Complex double ALIGN cf = cfactor; +# endif +# include "operator/halfspinor_body.c" +# undef _MUL_G5_CMPLX +# ifdef OMP + } /* OpenMP closing brace */ +# endif + 
return; +} + +#elif (!defined _NO_COMM && !defined _USE_HALFSPINOR) +# include "operator/hopping.h" +# if ((defined SSE2)||(defined SSE3)) +# include "sse.h" + +# elif (defined BGL && defined XLC) +# include "bgl.h" + +# elif (defined BGQ && defined XLC) +# include "bgq.h" +# include "bgq2.h" +# include "xlc_prefetch.h" + +# elif defined XLC +# include"xlc_prefetch.h" + +# endif +void tm_times_Hopping_Matrix(const int ieo, spinor * const l, spinor * const k, double complex const cfactor) { +# ifdef XLC +# pragma disjoint(*l, *k) +# endif +# ifdef _GAUGE_COPY + if(g_update_gauge_copy) { + update_backward_gauge(g_gauge_field); + } +# endif + +# if (defined MPI) + xchange_field(k, ieo); +# endif + +# ifdef OMP +# pragma omp parallel + { +# endif +# define _MUL_G5_CMPLX +# if (defined BGQ && defined XLC) + complex double ALIGN bla = cfactor; + vector4double ALIGN cf = vec_ld2(0, (double*) &bla); +# elif (defined SSE2 || defined SSE3) + _Complex double ALIGN cf = cfactor; +# endif +# include "operator/hopping_body_dbl.c" +# undef _MUL_G5_CMPLX +# ifdef OMP + } /* OpenMP closing brace */ +# endif + return; +} +#endif + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_times_Hopping_Matrix.h b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_times_Hopping_Matrix.h new file mode 100644 index 0000000000000000000000000000000000000000..e53face77e8846fa2e244d4e0b23aa8333656b74 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/operator/tm_times_Hopping_Matrix.h @@ -0,0 +1,27 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _TM_TIMES_HOPPING_MATRIX_H +# define _TM_TIMES_HOPPING_MATRIX_H + +# include "su3.h" + +void tm_times_Hopping_Matrix(const int ieo, spinor * const l, spinor * const k, complex double const cfactor); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/overrelaxation.c b/qcd/part_cpu/applications/QCD/src/kernel_D/overrelaxation.c new file mode 100644 index 0000000000000000000000000000000000000000..7813c27afe7dbc66aec053eee9316da2442dd287 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/overrelaxation.c @@ -0,0 +1,232 @@ +/*********************************************************************** + * + * Copyright (C) 2001 Martin Hasenbusch + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/*---------------------------------------------------------------------- +! +! Author +! M. Hasenbusch 2001 +! Martin.Hasenbusch@desy.de +! 
This file provides the functions heatbath_sweep and overrel_sweep +! to update the gauge field. +! In the main program one merely says +! +! . +! . +! heatbath_sweep(); +! overrel_sweep(); +! +! without any arguments. The random number generator +! is assumed to be initialized, e.g. by a call to rcinit from +! the main program. Similarly, the geometry has to be defined +! by ``call geometry". A sweep through the lattice proceeds +! in sequential order in a given time slice. +! The updating procedure uses three Cabibbo-Marinari +! subgroups for both the overrelaxation and the heatbath. +! For the latter we employ the procedure by Fabricius and Haan. +! Details and references can be found in the notes by Peter Weisz. +! +! The code is based on the F code provided by +! Stefan Sint 15/8/95 and Stefano Capitani - Jan/Feb 1997 +! New: +! functions heatbath_sweep_adj and overrel_sweep_adj +! to update the gauge field with a mixed fundamental and adjoint +! action. +! +! S_G = -\beta_f \sum_P 1/N Re Tr_f U_P +! -\beta_a \sum_P 1/N^2 ( Tr_f U_P^* )( Tr_f U_P ) +! +! for simplicity we have fixed betap = 6.0 here. +! +! new Wed Oct 1 11:01:48 MEST 2003: 1-dim parallelisation for the +! gauge-update (pure Wilson only) by M.Hasenbusch +! in the present version, it is not expected that the auxiliary fields +! for the boundaries are set consistently before the update is called. +! However, after the update, the auxiliary fields are not at their proper +! values, and xchange_gaugefield(); has to be called before e.g. D_psi(); +! can be used. +* +* +* Checking that 1-dim. 
parallelisation (x-direction) works, +* (Check and correction in +* bin/pure_gauge.c, bin/local_update.c, bin/geometry.c, +* message-passing/xchange_gaugefield.c, observable/plaquette.c) +* done by Kei-ichi Nagai +* +*/ +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "su3adj.h" +#include "monomial/moment_energy.h" +#include "ranlxd.h" +#include "sse.h" +#include "get_staples.h" +#include "overrelaxation.h" + +/****************************************************************/ +/* + flip_subgroup + input: int ix, int mu, su3 vv, int i + + Updates the link g_gauge_field[ix][mu] in place: w = link * vv^dagger is formed + (assuming _su3_times_su3d multiplies by the adjoint of its last argument - TODO confirm), + four real numbers vv0..vv3 are read from the 2x2 block of w selected by i + (i==1: indices 0,1; i==2: indices 0,2; any other value: indices 1,2), + and the link is replaced by a * link, where a is the unit matrix with that block overwritten. + All work variables are static, so the routine is not reentrant. + NOTE(review): norm_vv_sq is used as a divisor without a zero check. +*/ +/****************************************************************/ + +void flip_subgroup(int ix, int mu, su3 vv, int i){ + static double vv0,vv1,vv2,vv3,aa0,aa1,aa2,aa3; + static double aux,norm_vv_sq; + + static su3 a,w,v; + su3 *z; + _su3_assign(v,vv); + _su3_one(a); + z=&g_gauge_field[ix][mu]; + _su3_times_su3d(w,*z,v); + + /* + According to Peter's notes ``A Cabibbo-Marinari SU(3)....", eqs. (A.14-A.17) + we have */ + if(i==1) + { + vv0 = creal(w.c00) + creal(w.c11); + vv3 = -cimag(w.c00) + cimag(w.c11); + vv1 = -cimag(w.c01) - cimag(w.c10); + vv2 = -creal(w.c01) + creal(w.c10); + } + else if(i==2) + { + vv0 = creal(w.c00) + creal(w.c22); + vv3 = -cimag(w.c00) + cimag(w.c22); + vv1 = -cimag(w.c02) - cimag(w.c20); + vv2 = -creal(w.c02) + creal(w.c20); + } + else + { + vv0 = creal(w.c11) + creal(w.c22); + vv3 = -cimag(w.c11) + cimag(w.c22); + vv1 = -cimag(w.c12) - cimag(w.c21); + vv2 = -creal(w.c12) + creal(w.c21); + } + + norm_vv_sq= vv0 * vv0 + vv1 * vv1 + vv2 * vv2 + vv3 * vv3; + + aux= 2.0 * vv0 / norm_vv_sq; + aa0 = aux * vv0-1.0; + aa1 = aux * vv1; + aa2 = aux * vv2; + aa3 = aux * vv3; + + /* aa is embedded in the SU(3) matrix (a) which can be multiplied on + the link variable using the su3_type operator * . 
*/ + + if(i==1) + { + a.c00 = aa0 + aa3 * I; + a.c11 = conj(a.c00); + a.c01 = aa2 + aa1 * I; + a.c10 = -conj(a.c01); + } + else if(i==2) + { + a.c00 = aa0 + aa3 * I; + a.c22 = conj(a.c00); + a.c02 = aa2 + aa1 * I; + a.c20 = -conj(a.c02); + } + else + { + a.c11 = aa0 + aa3 * I; + a.c22 = conj(a.c11); + a.c12 = aa2 + aa1 * I; + a.c21 = -conj(a.c12); + } + + _su3_times_su3(w,a,*z); + *z=w; +} + +#if defined PARALLEL1 +void overrel_sweep(){ + int x0,x1,x2,x3; + int mu,ix; + static su3 v; + if(LX<2) {printf("LX is smaller than 2 \n"); exit(0);} +/* xchange the gauge-field */ + xchange_gaugefield(g_gauge_field); +/* update the left half of the sublattice */ + for(x1=0;x1. + ***********************************************************************/ +#ifndef _OVERRELAXATION_H +#define _OVERRELAXATION_H + +extern void overrel_sweep(); +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/parallel_io.h b/qcd/part_cpu/applications/QCD/src/kernel_D/parallel_io.h new file mode 100644 index 0000000000000000000000000000000000000000..8cc26777f5a696a6abc2491d8bb46407ec42bad2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/parallel_io.h @@ -0,0 +1,40 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _PARALLEL_IO_H +#define _PARALLEL_IO_H + +#include +#include"dml.h" + +int read_lemon_gauge_field_parallel(char *filename); +int read_lemon_gauge_field_singleprec_parallel(char const * filename); + +int read_binary_gauge_data_parallel(LemonReader * lemonreader, DML_Checksum * checksum); +int read_checksum_parallel(LemonReader * lemonreader, DML_Checksum * checksum); + +int write_lemon_gauge_field_parallel(char * filename, const double plaq, const int counter, const int prec); + +int write_binary_gauge_data_parallel(LemonWriter * lemonwriter, const int prec, DML_Checksum * ans); +int write_checksum_parallel(LemonWriter * lemonwriter, DML_Checksum * checksum); + +int write_xlf_info_parallel(LemonWriter * lemonwriter, const double plaq, const int counter); +int write_ildg_format_parallel(LemonWriter *writer, const int prec); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/phmc.c b/qcd/part_cpu/applications/QCD/src/kernel_D/phmc.c new file mode 100644 index 0000000000000000000000000000000000000000..a27f869c0bbc95be79b1e5119227645a0aa17bf5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/phmc.c @@ -0,0 +1,317 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include "global.h" + +#include "read_input.h" +#include "solver/eigenvalues_bi.h" +#include "solver/solver.h" +#include "init/init.h" +#include "chebyshev_polynomial_nd.h" +#include "Ptilde_nd.h" +#include "operator/tm_operators_nd.h" +#include "phmc.h" +#include "monomial/monomial.h" +#include "solver/matrix_mult_typedef_bi.h" +#include "gettime.h" + +// --> in monomial +double phmc_Cpol; // --> MDPolyLocNormConst +double phmc_cheb_evmin, phmc_cheb_evmax; // --> EVMin, EVMax +double phmc_invmaxev; // --> EVMaxInv +_Complex double * phmc_root; // --> MDPolyRoots +int phmc_dop_n_cheby; // --> MDPolyDegree +double * phmc_dop_cheby_coef; // --> MDPolyCoefs +int phmc_ptilde_n_cheby; // --> PtildeDegree +double * phmc_ptilde_cheby_coef; // --> PtildeCoefs +int errcode; +phmc_vars *phmc_var_stack=NULL; +int phmc_max_ptilde_degree = NTILDE_CHEBYMAX; + +void init_phmc() { + int max_iter_ev, j, k; + FILE *roots; + char *filename_phmc_root = "Square_root_BR_roots.dat"; + char *filename_phmc_root_oox = "Square_root_BR_roots.dat.oox"; + char title[100]; + + FILE *Const; + char *filename_const = "normierungLocal.dat"; + char *filename_const_oox = "normierungLocal.dat.oox"; + + /* contains info about the mnl poly_monomial*/ + monomial *mnl=NULL; + + for(j=0;j 1)) { + printf("PHMC: interval of approximation [stilde_min, stilde_max] = [%e, %e]\n", stilde_min, stilde_max); + printf("PHMC: degree for P = %d, epsilont = %e, normalisation = %e", + phmc_dop_n_cheby-1, phmc_cheb_evmin, phmc_invmaxev); + } + + /* Chi`s-spinors memory allocation */ + j = init_chi_spinor_field(VOLUMEPLUSRAND/2, (phmc_dop_n_cheby+1)); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for PHMC Chi fields! 
Aborting...\n"); + exit(0); + } + + /* End memory allocation */ + /* Here we prepare the precise polynomial */ + //degree_of_Ptilde(); + + /* THIS IS THE OVERALL CONSTANT */ + /* write phmc_Cpol as the result of the simple-program files (BigC^(1/2))^1/2 + since BigC^(1/2) is the constant appearing in each factor of the + multiplication defining the monomial basis representation of the + polinomial in s, while its square phmc_root (BigC^(1/2))^1/2 is the + constant appearing in the multiplication representing the + polinomial in sqrt(s) . + */ + + if(mnl->MDPolyLocNormConst == -1.0){ + if(!(g_epsbar!=0.0 || phmc_exact_poly==0)) + filename_const=filename_const_oox; + if((Const=fopen(filename_const,"r")) != (FILE*)NULL) { + errcode = fscanf(Const, " %lf \n", &phmc_Cpol); + fclose(Const); + } else { + fprintf(stderr, "File %s is missing! Aborting...\n", filename_const); +#ifdef MPI + MPI_Finalize(); +#endif + exit(6); + } + } else { + phmc_Cpol=mnl->MDPolyLocNormConst; + fprintf(stderr,"phmc_Cpol set to %e " , phmc_Cpol); + } + + if(g_epsbar!=0.0 || phmc_exact_poly==0) phmc_Cpol = sqrt(phmc_Cpol); + + phmc_root = calloc((2*phmc_dop_n_cheby-2),sizeof(_Complex double)); + + + if(g_epsbar==0.0 && phmc_exact_poly == 1) + filename_phmc_root=filename_phmc_root_oox; + + if(strlen(mnl->MDPolyRootsFile)!=0) + filename_phmc_root=mnl->MDPolyRootsFile; + + if((roots=fopen(filename_phmc_root,"r")) != (FILE*)NULL) { + if (fgets(title, 100, roots) == NULL) + { + fprintf(stderr, "Error in reading %s! Aborting...\n", filename_phmc_root); + #ifdef MPI + MPI_Finalize(); + #endif + exit(6); + } + + /* Here we read in the 2n roots needed for the polinomial in sqrt(s) */ + double *phmc_darray = (double*)phmc_root; + for(j = 0; j< 2 * phmc_dop_n_cheby - 2; ++j) + errcode = fscanf(roots," %d %lf %lf \n", &k, &phmc_darray[2 * j], &phmc_darray[2 * j + 1]); + fclose(roots); + } + else { + fprintf(stderr, "File %s is missing! 
Aborting...\n", filename_phmc_root); +#ifdef MPI + MPI_Finalize(); +#endif + exit(6); + } + + /* END IF PHMC */ + return; +} + + +/** + * Computes one eigenvalue of Qsq with eigenvalues_bi(..., 0, Qsq) and one with + * eigenvalues_bi(..., 1, Qsq); the log messages below call them the lowest and + * the maximal eigenvalue. On process 0 a warning is printed when the maximal + * one exceeds 1. or the lowest one is below mnl->EVMin, and one line + * (trajectory counter, both eigenvalues, EVMin, 1.) is appended to the file + * monomial-NN.data, NN being the monomial id. + * If that file cannot be opened a warning is printed and the run continues. + */ +void phmc_compute_ev(const int trajectory_counter, + const int id, + matrix_mult_bi Qsq) { + double atime, etime, temp=0., temp2=0.; + int max_iter_ev, no_eigenvalues; + char buf[100]; + char * phmcfilename = buf; + FILE * countfile; + monomial * mnl = &monomial_list[id]; + + sprintf(phmcfilename,"monomial-%.2d.data", id); + atime = gettime(); + + max_iter_ev = 1000; + + if((g_proc_id == 0) && (g_debug_level > 0)) { + printf("# Computing eigenvalues for heavy doublet\n"); + } + + no_eigenvalues = 1; + + temp = eigenvalues_bi(&no_eigenvalues, max_iter_ev, eigenvalue_precision, 0, Qsq); + + no_eigenvalues = 1; + temp2 = eigenvalues_bi(&no_eigenvalues, max_iter_ev, eigenvalue_precision, 1, Qsq); + + if((g_proc_id == 0) && (g_debug_level > 1)) { + printf("# %s: lowest eigenvalue end of trajectory %d = %e\n", + mnl->name, trajectory_counter, temp); + printf("# %s: maximal eigenvalue end of trajectory %d = %e\n", + mnl->name, trajectory_counter, temp2); + } + if(g_proc_id == 0) { + if(temp2 > 1.) 
{ + fprintf(stderr, "\nWarning: largest eigenvalue for monomial %s larger than upper bound!\n\n", mnl->name); + } + if(temp < mnl->EVMin) { + fprintf(stderr, "\nWarning: smallest eigenvalue for monomial %s smaller than lower bound!\n\n", mnl->name); + } + countfile = fopen(phmcfilename, "a"); + /* fopen returns NULL on failure; writing through it would crash */ + if(countfile != NULL) { + fprintf(countfile, "%.8d %1.5e %1.5e %1.5e %1.5e\n", + trajectory_counter, temp, temp2, mnl->EVMin, 1.); + fclose(countfile); + } + else { + fprintf(stderr, "\nWarning: could not open %s for appending, eigenvalues not recorded!\n\n", phmcfilename); + } + } + etime = gettime(); + if((g_proc_id == 0) && g_debug_level > 1) { + printf("# %s: time/s for eigenvalue computation %e\n", mnl->name, etime-atime); + } +} + + +/** + * creates a new stack element and stores a set of phmc + * variables needed in the operators + */ +void pushPhmcVars(){ + if(phmc_var_stack==NULL){ + phmc_var_stack=(phmc_vars*)malloc(sizeof(phmc_vars)); + phmc_var_stack->previous=NULL; + phmc_var_stack->stacksize=1; + } else { + phmc_var_stack->next=malloc(sizeof(phmc_vars)); + ((phmc_vars*)phmc_var_stack->next)->previous=(void*)phmc_var_stack; + phmc_var_stack=(phmc_vars*)phmc_var_stack->next; + phmc_var_stack->stacksize=((phmc_vars*)phmc_var_stack->previous)->stacksize+1; + } + + phmc_var_stack->next=NULL; + + /* save global phmc variables */ + phmc_var_stack->invmaxev=phmc_invmaxev; + phmc_var_stack->Cpol=phmc_Cpol; + phmc_var_stack->root=phmc_root; + phmc_var_stack->dop_n_cheby=phmc_dop_n_cheby; + + if(g_proc_id==0) + fprintf(stderr,"phmc variable stack size is now %d \n",phmc_var_stack->stacksize); + +} + +/** + * restores the variables to the values stored in the + * top stack element and removes it + */ +void popPhmcVars(){ + + if(phmc_var_stack!=NULL){ + phmc_vars *prev; + + /* restore global phmc variables */ + phmc_invmaxev=phmc_var_stack->invmaxev; + phmc_Cpol=phmc_var_stack->Cpol; + phmc_root=phmc_var_stack->root; + phmc_dop_n_cheby=phmc_var_stack->dop_n_cheby; + + + + prev=(phmc_vars*)phmc_var_stack->previous; + + free(phmc_var_stack); + + phmc_var_stack=prev; + + if(phmc_var_stack!=NULL) + phmc_var_stack->next=NULL; + 
+ } else { + if(g_proc_id==0) + fprintf(stderr,"Error: there is no element on the stack\n"); + } + + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/phmc.h b/qcd/part_cpu/applications/QCD/src/kernel_D/phmc.h new file mode 100644 index 0000000000000000000000000000000000000000..f5dfa73aa9c1c250282724f721d175b738f52c2d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/phmc.h @@ -0,0 +1,62 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _PHMC_H +#define _PHMC_H + +#include "solver/matrix_mult_typedef_bi.h" + +/* the normalisation constant appearing in the product representation of */ +/* the polynomial */ +extern double phmc_Cpol; +/* maximal and minimal eigenvalue of the ND operator */ +extern double phmc_cheb_evmin, phmc_cheb_evmax; +/* inverse maximal EV, needed for normalisation */ +extern double phmc_invmaxev; +/* These are the roots */ +extern _Complex double * phmc_root; +/* degree and coefs of P */ +extern int phmc_dop_n_cheby; +extern double * phmc_dop_cheby_coef; +/* degree of coefs \tilde P */ +extern int phmc_ptilde_n_cheby; +extern double * phmc_ptilde_cheby_coef; +extern int phmc_max_ptilde_degree; + +/* structure for holding a set of phmc specific variables*/ +typedef struct phmc_vars_ { + void *previous,*next; + double invmaxev; + double Cpol; + int dop_n_cheby; + _Complex double *root; + int stacksize; +} phmc_vars; + +/* stack for saving and restoring phmc variables*/ +extern phmc_vars *phmc_var_stack; + +/* functions for pushing and poping phmc vars */ +void pushPhmcVars(); +void popPhmcVars(); + +void phmc_compute_ev(const int trajectory_counter, const int id, + matrix_mult_bi Qsq); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/prepare_source.c b/qcd/part_cpu/applications/QCD/src/kernel_D/prepare_source.c new file mode 100644 index 0000000000000000000000000000000000000000..35f402d6e2caaedae2dd83b769479b21721d2e4d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/prepare_source.c @@ -0,0 +1,266 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include +#include +#include +#include +#include "solver/solver.h" +#include "start.h" +#include "ranlxd.h" +#include "su3.h" +#include "operator.h" +#include "linalg_eo.h" +#include "operator/tm_operators_nd.h" +#include "source_generation.h" +#include "prepare_source.h" + +/* + * prepare_source fills the global spinor fields with the inversion source for + * operator_list[op_id] and points the operator's sr and prop members at them. + * + * nstore, isample, ix are recorded in SourceInfo and used in the file names; + * ix is split into is = ix / 3 and ic = ix % 3. + * read_source_flag 0 or 2: the source is generated, any other value: it is read from file. + * source_location 0 selects source_spinor_field, otherwise + * source_spinor_field_point_from_file is used. + * + * Operators of type DBTMWILSON or DBCLOVER take the two flavour branch, all others + * the one flavour branch; there, for solvers other than CG, PCG, MIXEDCG and RGMIXEDCG, + * an existing ".inverted" file is read as initial guess, otherwise the guess is zeroed. + * + * All file names are built with snprintf bounded by sizeof(source_filename), so an + * over-long basename is truncated instead of overflowing the buffer. + */ +void prepare_source(const int nstore, const int isample, const int ix, const int op_id, + const int read_source_flag, + const int source_location) { + + FILE * ifs = NULL; + int is = ix / 3, ic = ix %3, err = 0, rstat=0, t = 0; + operator * optr = &operator_list[op_id]; + char source_filename[100]; + int source_type = SourceInfo.type; + static int nstore_ = -1; + static int isample_ = -1; + static int ix_ = -1; + static int op_id_ = -1; + + SourceInfo.nstore = nstore; + SourceInfo.sample = isample; + SourceInfo.ix = ix; + + if(optr->type != DBTMWILSON && optr->type != DBCLOVER) { + SourceInfo.no_flavours = 1; + /* no volume sources */ + if(source_type != 1) { + /* either "Don't read inversion source from file" or */ + /* "Don't read inversion source from file, but save the one generated" */ + if (read_source_flag == 0 || read_source_flag == 2) { + if (source_location == 0) { + 
source_spinor_field(g_spinor_field[0], g_spinor_field[1], is, ic); + } + else { + source_spinor_field_point_from_file(g_spinor_field[0], g_spinor_field[1], is, ic, source_location); + } + } + /* "Read inversion source from file" */ + else { + if (SourceInfo.splitted) { + /* timeslice needs to be put into filename */ + if(SourceInfo.automaticTS) { + /* automatic timeslice detection */ + /* NOTE(review): if no file is found t ends up as g_nproc_t*T */ + if(g_proc_id == 0) { + for(t = 0; t < g_nproc_t*T; t++) { + if(T_global > 99) snprintf(source_filename, sizeof(source_filename), "%s.%.4d.%.3d.%.2d", SourceInfo.basename, nstore, t, ix); + else snprintf(source_filename, sizeof(source_filename), "%s.%.4d.%.2d.%.2d", SourceInfo.basename, nstore, t, ix); + if( (ifs = fopen(source_filename, "r")) != NULL) { + fclose(ifs); + break; + } + } + } +#ifdef MPI + MPI_Bcast(&t, 1, MPI_INT, 0, MPI_COMM_WORLD); +#endif + SourceInfo.t = t; + } + if(T_global > 99) snprintf(source_filename, sizeof(source_filename), "%s.%.4d.%.3d.%.2d", SourceInfo.basename, nstore, SourceInfo.t, ix); + else snprintf(source_filename, sizeof(source_filename), "%s.%.4d.%.2d.%.2d", SourceInfo.basename, nstore, SourceInfo.t, ix); + if (g_cart_id == 0) { + printf("# Trying to read source from %s\n", source_filename); + } + rstat = read_spinor(g_spinor_field[0], g_spinor_field[1], source_filename, 0); + } + else { + snprintf(source_filename, sizeof(source_filename), "%s", SourceInfo.basename); + if (g_cart_id == 0) { + printf("# Trying to read source no %d from %s\n", ix, source_filename); + } + rstat = read_spinor(g_spinor_field[0], g_spinor_field[1], source_filename, ix); + } + if(rstat) { + fprintf(stderr, "Error reading file %s in prepare_source.c\nUnable to proceed, aborting....\n", source_filename); + exit(-1); + } + } + if (PropInfo.splitted) { + if(T_global > 99) snprintf(source_filename, sizeof(source_filename), "%s.%.4d.%.3d.%.2d.inverted", PropInfo.basename, nstore, SourceInfo.t, ix); + else snprintf(source_filename, sizeof(source_filename), "%s.%.4d.%.2d.%.2d.inverted", PropInfo.basename, nstore, SourceInfo.t, ix); + } + else { + if(T_global > 99) snprintf(source_filename, sizeof(source_filename), "%s.%.4d.%.3d.inverted", PropInfo.basename, nstore, SourceInfo.t); + 
else snprintf(source_filename, sizeof(source_filename), "%s.%.4d.%.2d.inverted", PropInfo.basename, nstore, SourceInfo.t); + } + } + else if(source_type == 1) { + /* Volume sources */ + if(read_source_flag == 0 || read_source_flag == 2) { + if(g_proc_id == 0 && g_debug_level > 0) { + printf("# Preparing 1 flavour volume source\n"); + } + gaussian_volume_source(g_spinor_field[0], g_spinor_field[1], isample, nstore, 0); + } + else { + snprintf(source_filename, sizeof(source_filename), "%s.%.4d.%.5d", SourceInfo.basename, nstore, isample); + if (g_cart_id == 0) { + printf("# Trying to read source from %s\n", source_filename); + } + rstat = read_spinor(g_spinor_field[0], g_spinor_field[1], source_filename, 0); + if(rstat) { + fprintf(stderr, "Error reading file %s in prepare_source.c.\nUnable to proceed, aborting....\n", source_filename); + exit(-1); + } + } + snprintf(source_filename, sizeof(source_filename), "%s.%.4d.%.5d.inverted", PropInfo.basename, nstore, isample); + } + optr->sr0 = g_spinor_field[0]; + optr->sr1 = g_spinor_field[1]; + optr->prop0 = g_spinor_field[2]; + optr->prop1 = g_spinor_field[3]; + + + /* If the solver is _not_ CG we might read in */ + /* here some better guess */ + /* This also works for re-iteration */ + if (optr->solver != CG && optr->solver != PCG && optr->solver != MIXEDCG && optr->solver != RGMIXEDCG) { + ifs = fopen(source_filename, "r"); + if (ifs != NULL) { + if (g_cart_id == 0) { + printf("# Trying to read guess from file %s\n", source_filename); + fflush(stdout); + } + fclose(ifs); + err = 0; + /* iter = get_propagator_type(source_filename); */ + rstat = read_spinor(optr->prop0, optr->prop1, source_filename, (PropInfo.splitted ? 0 : ix)); + if(rstat) { + fprintf(stderr, "Error reading file %s in prepare_source.c, rstat = %d\n", source_filename, rstat); + exit(-1); + } + /* NOTE(review): the guard tests g_kappa but the scaling uses optr->kappa - confirm this is intended */ + if (g_kappa != 0.) { + mul_r(optr->prop1, 1. / (2*optr->kappa), optr->prop1, VOLUME / 2); + mul_r(optr->prop0, 1. 
/ (2*optr->kappa), optr->prop0, VOLUME / 2); + } + + if (err != 0) { + zero_spinor_field(optr->prop0, VOLUME / 2); + zero_spinor_field(optr->prop1, VOLUME / 2); + } + } + else { + zero_spinor_field(optr->prop0, VOLUME / 2); + zero_spinor_field(optr->prop1, VOLUME / 2); + } + } + else { + zero_spinor_field(optr->prop0, VOLUME / 2); + zero_spinor_field(optr->prop1, VOLUME / 2); + } + /* if(optr->even_odd_flag) { */ + /* assign(optr->sr0, g_spinor_field[0], VOLUME/2); */ + /* assign(optr->sr1, g_spinor_field[1], VOLUME/2); */ + /* } */ + /* else { */ + /* convert_eo_to_lexic(optr->sr0, g_spinor_field[0], g_spinor_field[1]); */ + /* } */ + } + else { /* for the ND 2 flavour twisted operator */ + SourceInfo.no_flavours = 2; + zero_spinor_field(g_spinor_field[0], VOLUME/2); + zero_spinor_field(g_spinor_field[1], VOLUME/2); + if(source_type != 1) { + if(read_source_flag == 0 || read_source_flag == 2) { + if(source_location == 0) { + source_spinor_field(g_spinor_field[2], g_spinor_field[3], is, ic); + } + else { + source_spinor_field_point_from_file(g_spinor_field[2], g_spinor_field[3], + is, ic, source_location); + } + } + else { + if(SourceInfo.splitted) { + if(T_global > 99) snprintf(source_filename, sizeof(source_filename), "%s.%.4d.%.3d.%.2d", SourceInfo.basename, nstore, SourceInfo.t, ix); + else snprintf(source_filename, sizeof(source_filename), "%s.%.4d.%.2d.%.2d", SourceInfo.basename, nstore, SourceInfo.t, ix); + } + else { + snprintf(source_filename, sizeof(source_filename), "%s", SourceInfo.basename); + } + if(g_proc_id == 0) { + printf("# Trying to read source from %s\n", source_filename); + } + if(read_spinor(g_spinor_field[2], g_spinor_field[3], source_filename, 0) != 0) { + fprintf(stderr, "Error reading source! 
Aborting...\n"); +#ifdef MPI + MPI_Abort(MPI_COMM_WORLD, 1); + MPI_Finalize(); +#endif + exit(-1); + } + } + } + else if(source_type == 1) { + /* Volume sources */ + if(g_proc_id == 0 && g_debug_level > 0) { + printf("# Preparing 2 flavour volume source\n"); + } + gaussian_volume_source(g_spinor_field[0], g_spinor_field[1], + isample, nstore, 1); + gaussian_volume_source(g_spinor_field[2], g_spinor_field[3], + isample, nstore, 2); + } + mul_one_pm_itau2(g_spinor_field[4], g_spinor_field[6], g_spinor_field[0], g_spinor_field[2], +1., VOLUME/2); + mul_one_pm_itau2(g_spinor_field[5], g_spinor_field[7], g_spinor_field[1], g_spinor_field[3], +1., VOLUME/2); + assign(g_spinor_field[0], g_spinor_field[4], VOLUME/2); + assign(g_spinor_field[1], g_spinor_field[5], VOLUME/2); + assign(g_spinor_field[2], g_spinor_field[6], VOLUME/2); + assign(g_spinor_field[3], g_spinor_field[7], VOLUME/2); + + optr->sr0 = g_spinor_field[0]; + optr->sr1 = g_spinor_field[1]; + optr->sr2 = g_spinor_field[2]; + optr->sr3 = g_spinor_field[3]; + optr->prop0 = g_spinor_field[4]; + optr->prop1 = g_spinor_field[5]; + optr->prop2 = g_spinor_field[6]; + optr->prop3 = g_spinor_field[7]; + } + nstore_ = nstore; + isample_ = isample; + ix_ = ix; + op_id_ = op_id; + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/prepare_source.h b/qcd/part_cpu/applications/QCD/src/kernel_D/prepare_source.h new file mode 100644 index 0000000000000000000000000000000000000000..3b903ab016dd9295a690ac36d5eac8b093f5d303 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/prepare_source.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _PREPARE_SOURCE_H +#define _PREPARE_SOURCE_H + +void prepare_source(const int nstore, const int isample, const int ix, const int op_id, + const int read_source_flag, + const int source_location); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/quda_interface.c b/qcd/part_cpu/applications/QCD/src/kernel_D/quda_interface.c new file mode 100644 index 0000000000000000000000000000000000000000..f48de6b820a713dc4aafef3ea8ac5117b225def7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/quda_interface.c @@ -0,0 +1,831 @@ +/*********************************************************************** + * + * Copyright (C) 2015 Mario Schroeck + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. 
If not, see <http://www.gnu.org/licenses/>. + * + * + ***********************************************************************/ +/*********************************************************************** +* +* File quda_interface.c +* +* Author: Mario Schroeck +* +* Last changes: 06/2015 +* +* +* Interface to QUDA for multi-GPU inverters +* +* The externally accessible functions are +* +* void _initQuda() +* Initializes the QUDA library. Carries over the lattice size and the +* MPI process grid and thus must be called after initializing MPI. +* Currently it is called in init_operators() if optr->use_qudainverter +* flag is set. +* Memory for the QUDA gaugefield on the host is allocated but not filled +* yet (the latter is done in _loadGaugeQuda(), see below). +* Performance critical settings are done here and can be changed. +* +* void _endQuda() +* Finalizes the QUDA library. Call before MPI_Finalize(). +* +* void _loadGaugeQuda() +* Copies and reorders the gaugefield on the host and copies it to the GPU. +* Must be called between last changes on the gaugefield (smearing etc.) +* and first call of the inverter. In particular, 'boundary(const double kappa)' +* must be called before if nontrivial boundary conditions are to be used since +* those will be applied directly to the gaugefield. Currently it is called just +* before the inversion is done (might result in wasted loads...). +* +* The functions +* +* int invert_eo_quda(...); +* int invert_doublet_eo_quda(...); +* void M_full_quda(...); +* void D_psi_quda(...); +* +* mimic their tmLQCD counterparts in functionality as well as input and +* output parameters. The invert functions will check the parameters +* g_mu, g_c_sw to decide which QUDA operator to create. +* +* To activate those, set "UseQudaInverter = yes" in the operator +* declaration of the input file. For details see the documentation. 
+* +* +* Notes: +* +* Minimum QUDA version is 0.7.0 (see https://github.com/lattice/quda/issues/151 +* and https://github.com/lattice/quda/issues/157). +* +* +**************************************************************************/ + +#include "quda_interface.h" +#include +#include +#include +#include +#include "boundary.h" +#include "linalg/convert_eo_to_lexic.h" +#include "solver/solver.h" +#include "solver/solver_field.h" +#include "gettime.h" +#include "boundary.h" +#include "quda.h" + +double X0, X1, X2, X3; + +// define order of the spatial indices +// default is LX-LY-LZ-T, see below def. of local lattice size, this is related to +// the gamma basis transformation from tmLQCD -> UKQCD +// for details see https://github.com/lattice/quda/issues/157 +#define USE_LZ_LY_LX_T 0 + +// TRIVIAL_BC are trivial (anti-)periodic boundary conditions, +// i.e. 1 or -1 on last timeslice +// tmLQCD uses twisted BC, i.e. phases on all timeslices. +// if using TRIVIAL_BC: can't compare inversion result to tmLQCD +// if not using TRIVIAL_BC: BC will be applied to gauge field, +// can't use 12 parameter reconstruction +#define TRIVIAL_BC 0 + +#define MAX(a,b) ((a)>(b)?(a):(b)) + +// gauge and invert paramameter structs; init. in _initQuda() +QudaGaugeParam gauge_param; +QudaInvertParam inv_param; + +// pointer to the QUDA gaugefield +double *gauge_quda[4]; + +// pointer to a temp. spinor, used for reordering etc. 
+double *tempSpinor;
+
+// function that maps coordinates in the communication grid to MPI ranks
+// It is handed to initCommsGridQuda() as the rank callback in _initQuda().
+//   coords  4 grid coordinates supplied by the caller; they are permuted
+//           into n[] below, the permutation depends on USE_LZ_LY_LX_T
+//   fdata   not used by this function
+// Returns the rank MPI_Cart_rank() reports for n[] on g_cart_grid, or 0
+// when the code is built without MPI.
+int commsMap(const int *coords, void *fdata) {
+#if USE_LZ_LY_LX_T
+  int n[4] = {coords[3], coords[2], coords[1], coords[0]};
+#else
+  int n[4] = {coords[3], coords[0], coords[1], coords[2]};
+#endif
+
+  int rank = 0;
+#ifdef MPI
+  MPI_Cart_rank( g_cart_grid, n, &rank );
+#endif
+
+  return rank;
+}
+
+// variable to check if quda has been initialized
+static int quda_initialized = 0;
+
+// Fills the module-level gauge_param / inv_param structs with defaults,
+// registers the process grid with QUDA (initCommsGridQuda), allocates the
+// host buffers gauge_quda[0..3] and tempSpinor and finally calls initQuda().
+// Returns immediately when quda_initialized is already set. Calls exit(-2)
+// when the QUDA version is older than 0.7 or when a malloc fails.
+void _initQuda() {
+  if( quda_initialized )
+    return;
+
+  if( g_debug_level > 0 )
+    if(g_proc_id == 0)
+      printf("\n# QUDA: Detected QUDA version %d.%d.%d\n\n", QUDA_VERSION_MAJOR, QUDA_VERSION_MINOR, QUDA_VERSION_SUBMINOR);
+  if( QUDA_VERSION_MAJOR == 0 && QUDA_VERSION_MINOR < 7) {
+    fprintf(stderr, "Error: minimum QUDA version required is 0.7.0 (for support of chiral basis and removal of bug in mass normalization with preconditioning).\n");
+    exit(-2);
+  }
+
+  gauge_param = newQudaGaugeParam();
+  inv_param = newQudaInvertParam();
+
+  // *** QUDA parameters begin here (sloppy prec. will be adjusted in invert)
+  QudaPrecision cpu_prec = QUDA_DOUBLE_PRECISION;
+  QudaPrecision cuda_prec = QUDA_DOUBLE_PRECISION;
+  QudaPrecision cuda_prec_sloppy = QUDA_SINGLE_PRECISION;
+  QudaPrecision cuda_prec_precondition = QUDA_HALF_PRECISION;
+
+  QudaTune tune = QUDA_TUNE_YES;
+
+
+  // *** the remainder should not be changed for this application
+  // local lattice size
+#if USE_LZ_LY_LX_T
+  gauge_param.X[0] = LZ;
+  gauge_param.X[1] = LY;
+  gauge_param.X[2] = LX;
+  gauge_param.X[3] = T;
+#else
+  gauge_param.X[0] = LX;
+  gauge_param.X[1] = LY;
+  gauge_param.X[2] = LZ;
+  gauge_param.X[3] = T;
+#endif
+
+  inv_param.Ls = 1;
+
+  gauge_param.anisotropy = 1.0;
+  gauge_param.type = QUDA_WILSON_LINKS;
+  gauge_param.gauge_order = QUDA_QDP_GAUGE_ORDER;
+
+  // 18 = no link compression; these reconstruct values are only the
+  // defaults, the boundary-condition code further down in this file
+  // assigns them again before each inversion
+  gauge_param.cpu_prec = cpu_prec;
+  gauge_param.cuda_prec = cuda_prec;
+  gauge_param.reconstruct = 18;
+  gauge_param.cuda_prec_sloppy = cuda_prec_sloppy;
+  gauge_param.reconstruct_sloppy = 18;
+  gauge_param.cuda_prec_precondition = cuda_prec_precondition;
+  gauge_param.reconstruct_precondition = 18;
+  gauge_param.gauge_fix = QUDA_GAUGE_FIXED_NO;
+
+  inv_param.dagger = QUDA_DAG_NO;
+  inv_param.mass_normalization = QUDA_KAPPA_NORMALIZATION;
+  inv_param.solver_normalization = QUDA_DEFAULT_NORMALIZATION;
+
+  inv_param.pipeline = 0;
+  inv_param.gcrNkrylov = 10;
+
+  // require both L2 relative and heavy quark residual to determine convergence
+//  inv_param.residual_type = (QudaResidualType)(QUDA_L2_RELATIVE_RESIDUAL | QUDA_HEAVY_QUARK_RESIDUAL);
+  inv_param.tol_hq = 1.0;//1e-3; // specify a tolerance for the residual for heavy quark residual
+  inv_param.reliable_delta = 1e-2; // ignored by multi-shift solver
+
+  // domain decomposition preconditioner parameters
+  inv_param.inv_type_precondition = QUDA_CG_INVERTER;
+  inv_param.schwarz_type = QUDA_ADDITIVE_SCHWARZ;
+  inv_param.precondition_cycle = 1;
+  inv_param.tol_precondition = 1e-1;
+  inv_param.maxiter_precondition = 10;
+  inv_param.verbosity_precondition = QUDA_SILENT;
+  inv_param.cuda_prec_precondition = cuda_prec_precondition;
+  inv_param.omega = 1.0;
+
+  inv_param.cpu_prec = cpu_prec;
+  inv_param.cuda_prec = cuda_prec;
+  inv_param.cuda_prec_sloppy = cuda_prec_sloppy;
+
+  inv_param.clover_cpu_prec = cpu_prec;
+  inv_param.clover_cuda_prec = cuda_prec;
+  inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy;
+  inv_param.clover_cuda_prec_precondition = cuda_prec_precondition;
+
+  inv_param.preserve_source = QUDA_PRESERVE_SOURCE_YES;
+  inv_param.gamma_basis = QUDA_CHIRAL_GAMMA_BASIS;
+  inv_param.dirac_order = QUDA_DIRAC_ORDER;
+
+  inv_param.input_location = QUDA_CPU_FIELD_LOCATION;
+  inv_param.output_location = QUDA_CPU_FIELD_LOCATION;
+
+  inv_param.tune = tune ? QUDA_TUNE_YES : QUDA_TUNE_NO;
+
+  gauge_param.ga_pad = 0; // 24*24*24/2; (overwritten with pad_size below)
+  inv_param.sp_pad = 0; // 24*24*24/2;
+  inv_param.cl_pad = 0; // 24*24*24/2;
+
+  // For multi-GPU, ga_pad must be large enough to store a time-slice
+  // pad_size ends up as the largest of the four half-face sizes of the
+  // local lattice (product of the three other extents, divided by 2)
+  int x_face_size = gauge_param.X[1]*gauge_param.X[2]*gauge_param.X[3]/2;
+  int y_face_size = gauge_param.X[0]*gauge_param.X[2]*gauge_param.X[3]/2;
+  int z_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[3]/2;
+  int t_face_size = gauge_param.X[0]*gauge_param.X[1]*gauge_param.X[2]/2;
+  int pad_size =MAX(x_face_size, y_face_size);
+  pad_size = MAX(pad_size, z_face_size);
+  pad_size = MAX(pad_size, t_face_size);
+  gauge_param.ga_pad = pad_size;
+
+  // solver verbosity
+  // g_debug_level 0 -> silent, 1 -> summary, anything else -> verbose
+  if( g_debug_level == 0 )
+    inv_param.verbosity = QUDA_SILENT;
+  else if( g_debug_level == 1 )
+    inv_param.verbosity = QUDA_SUMMARIZE;
+  else
+    inv_param.verbosity = QUDA_VERBOSE;
+
+  // general verbosity
+  setVerbosityQuda( QUDA_SUMMARIZE, "# QUDA: ", stdout);
+
+  // declare the grid mapping used for communications in a multi-GPU grid
+#if USE_LZ_LY_LX_T
+  int grid[4] = {g_nproc_z, g_nproc_y, g_nproc_x, g_nproc_t};
+#else
+  int grid[4] = {g_nproc_x, g_nproc_y, g_nproc_z, g_nproc_t};
+#endif
+
+  initCommsGridQuda(4, grid, commsMap, NULL);
+
+  // alloc gauge_quda
+  size_t gSize = 
(gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); + + for (int dir = 0; dir < 4; dir++) { + gauge_quda[dir] = (double*) malloc(VOLUME*18*gSize); + if(gauge_quda[dir] == NULL) { + fprintf(stderr, "_initQuda: malloc for gauge_quda[dir] failed"); + exit(-2); + } + } + + // alloc space for a temp. spinor, used throughout this module + tempSpinor = (double*)malloc( 2*VOLUME*24*sizeof(double) ); /* factor 2 for doublet */ + if(tempSpinor == NULL) { + fprintf(stderr, "_initQuda: malloc for tempSpinor failed"); + exit(-2); + } + + // initialize the QUDA library +#ifdef MPI + initQuda(-1); //sets device numbers automatically +#else + initQuda(0); //scalar build: use device 0 +#endif + quda_initialized = 1; +} + +// finalize the QUDA library +void _endQuda() { + if( quda_initialized ) { + freeGaugeQuda(); + free((void*)tempSpinor); + endQuda(); + } +} + + +void _loadGaugeQuda( const int compression ) { + if( inv_param.verbosity > QUDA_SILENT ) + if(g_proc_id == 0) + printf("# QUDA: Called _loadGaugeQuda\n"); + + _Complex double tmpcplx; + + size_t gSize = (gauge_param.cpu_prec == QUDA_DOUBLE_PRECISION) ? sizeof(double) : sizeof(float); + + // now copy and reorder + for( int x0=0; x00.0 || fabs(X2)>0.0 || fabs(X3)>0.0 || (fabs(X0)!=0.0 && fabs(X0)!=1.0) ) { + if( *compression!=NO_COMPRESSION ) { + if(g_proc_id == 0) { + printf("\n# QUDA: WARNING you can't use compression %d with boundary conditions for fermion fields (t,x,y,z)*pi: (%f,%f,%f,%f) \n", *compression,X0,X1,X2,X3); + printf("# QUDA: disabling compression.\n\n"); + *compression=NO_COMPRESSION; + } + } + } + + QudaReconstructType link_recon; + QudaReconstructType link_recon_sloppy; + + if( *compression==NO_COMPRESSION ) { // theta BC + gauge_param.t_boundary = QUDA_PERIODIC_T; // BC will be applied to gaugefield + link_recon = 18; + link_recon_sloppy = 18; + } + else { // trivial BC + gauge_param.t_boundary = ( fabs(X0)>0.0 ? 
QUDA_ANTI_PERIODIC_T : QUDA_PERIODIC_T ); + link_recon = 12; + link_recon_sloppy = *compression; + if( g_debug_level > 0 ) + if(g_proc_id == 0) + printf("\n# QUDA: WARNING using %d compression with trivial (A)PBC instead of theta-BC ((t,x,y,z)*pi: (%f,%f,%f,%f))! This works fine but the residual check on the host (CPU) will fail.\n",*compression,X0,X1,X2,X3); + } + + gauge_param.reconstruct = link_recon; + gauge_param.reconstruct_sloppy = link_recon_sloppy; + gauge_param.reconstruct_precondition = link_recon_sloppy; +} + +void set_sloppy_prec( const SloppyPrecision sloppy_precision ) { + + // choose sloppy prec. + QudaPrecision cuda_prec_sloppy; + if( sloppy_precision==SLOPPY_DOUBLE ) { + cuda_prec_sloppy = QUDA_DOUBLE_PRECISION; + if(g_proc_id == 0) printf("# QUDA: Using double prec. as sloppy!\n"); + } + else if( sloppy_precision==SLOPPY_HALF ) { + cuda_prec_sloppy = QUDA_HALF_PRECISION; + if(g_proc_id == 0) printf("# QUDA: Using half prec. as sloppy!\n"); + } + else { + cuda_prec_sloppy = QUDA_SINGLE_PRECISION; + if(g_proc_id == 0) printf("# QUDA: Using single prec. 
as sloppy!\n");
+  }
+  gauge_param.cuda_prec_sloppy = cuda_prec_sloppy;
+  inv_param.cuda_prec_sloppy = cuda_prec_sloppy;
+  inv_param.clover_cuda_prec_sloppy = cuda_prec_sloppy;
+}
+
+// One-flavour inversion through QUDA.
+//   Even_new, Odd_new  receive the solution (even and odd halves)
+//   Even, Odd          source; copied into a lexicographic work field and
+//                      written back from that field after the solve
+//   precision          inv_param.tol is set to sqrt(precision)*0.25
+//   max_iter           iteration limit handed to QUDA
+//   solver_flag        BICGSTAB selects QUDA_BICGSTAB_INVERTER, every other
+//                      value selects QUDA_CG_INVERTER
+//   rel_prec           non-zero: relative L2 residual, zero: absolute
+//   even_odd_flag      non-zero selects the *_PC_SOLVE solve types
+//   solver_params      not used in this function
+//   sloppy_precision   forwarded to set_sloppy_prec()
+//   compression        gauge compression; set_boundary_conditions() may
+//                      change the local copy before the gauge field is loaded
+// Returns the iteration count reported by QUDA, or -1 when that count is
+// >= max_iter. The operator is chosen from the globals g_mu and g_c_sw.
+int invert_eo_quda(spinor * const Even_new, spinor * const Odd_new,
+                   spinor * const Even, spinor * const Odd,
+                   const double precision, const int max_iter,
+                   const int solver_flag, const int rel_prec,
+                   const int even_odd_flag, solver_params_t solver_params,
+                   SloppyPrecision sloppy_precision,
+                   CompressionType compression) {
+
+  spinor ** solver_field = NULL;
+  const int nr_sf = 2;
+  init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
+
+  convert_eo_to_lexic(solver_field[0], Even, Odd);
+//  convert_eo_to_lexic(solver_field[1], Even_new, Odd_new);
+
+  void *spinorIn = (void*)solver_field[0]; // source
+  void *spinorOut = (void*)solver_field[1]; // solution
+
+  if ( rel_prec )
+    inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
+  else
+    inv_param.residual_type = QUDA_L2_ABSOLUTE_RESIDUAL;
+
+  inv_param.kappa = g_kappa;
+
+  // figure out which BC to use (theta, trivial...)
+  set_boundary_conditions(&compression);
+
+  // set the sloppy precision of the mixed prec solver
+  set_sloppy_prec(sloppy_precision);
+
+  // load gauge after setting precision
+  _loadGaugeQuda(compression);
+
+  // choose dslash type
+  // g_mu != 0 and g_c_sw > 0 : twisted clover
+  // g_mu != 0 only           : twisted mass
+  // g_c_sw > 0 only          : clover Wilson
+  // neither                  : plain Wilson
+  if( g_mu != 0.0 && g_c_sw > 0.0 ) {
+    inv_param.dslash_type = QUDA_TWISTED_CLOVER_DSLASH;
+    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
+    inv_param.solution_type = QUDA_MAT_SOLUTION;
+    inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
+    inv_param.mu = fabs(g_mu/2./g_kappa);
+    inv_param.clover_coeff = g_c_sw*g_kappa;
+
+  }
+  else if( g_mu != 0.0 ) {
+    inv_param.dslash_type = QUDA_TWISTED_MASS_DSLASH;
+    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN_ASYMMETRIC;
+    inv_param.solution_type = QUDA_MAT_SOLUTION;
+    inv_param.mu = fabs(g_mu/2./g_kappa);
+  }
+  else if( g_c_sw > 0.0 ) {
+    inv_param.dslash_type = QUDA_CLOVER_WILSON_DSLASH;
+    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
+    inv_param.solution_type = QUDA_MAT_SOLUTION;
+    inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER;
+    inv_param.clover_coeff = g_c_sw*g_kappa;
+  }
+  else {
+    inv_param.dslash_type = QUDA_WILSON_DSLASH;
+    inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN;
+    inv_param.solution_type = QUDA_MAT_SOLUTION;
+  }
+
+  // choose solver
+  if(solver_flag == BICGSTAB) {
+    if(g_proc_id == 0) {printf("# QUDA: Using BiCGstab!\n"); fflush(stdout);}
+    inv_param.inv_type = QUDA_BICGSTAB_INVERTER;
+  }
+  else {
+    /* Here we invert the hermitean operator squared */
+    inv_param.inv_type = QUDA_CG_INVERTER;
+    if(g_proc_id == 0) {
+      printf("# QUDA: Using mixed precision CG!\n");
+      printf("# QUDA: mu = %f, kappa = %f\n", g_mu/2./g_kappa, g_kappa);
+      fflush(stdout);
+    }
+  }
+
+  // direct or norm-op. solve
+  // CG gets the normal-operator solve types, BiCGstab the direct ones
+  if( inv_param.inv_type == QUDA_CG_INVERTER ) {
+    if( even_odd_flag ) {
+      inv_param.solve_type = QUDA_NORMOP_PC_SOLVE;
+      if(g_proc_id == 0) printf("# QUDA: Using preconditioning!\n");
+    }
+    else {
+      inv_param.solve_type = QUDA_NORMOP_SOLVE;
+      if(g_proc_id == 0) printf("# QUDA: Not using preconditioning!\n");
+    }
+  }
+  else {
+    if( even_odd_flag ) {
+      inv_param.solve_type = QUDA_DIRECT_PC_SOLVE;
+      if(g_proc_id == 0) printf("# QUDA: Using preconditioning!\n");
+    }
+    else {
+      inv_param.solve_type = QUDA_DIRECT_SOLVE;
+      if(g_proc_id == 0) printf("# QUDA: Not using preconditioning!\n");
+    }
+  }
+
+
+  inv_param.tol = sqrt(precision)*0.25;
+  inv_param.maxiter = max_iter;
+
+  // IMPORTANT: use opposite TM flavor since gamma5 -> -gamma5 (until LXLYLZT prob. resolved)
+  inv_param.twist_flavor = (g_mu < 0.0 ? QUDA_TWIST_PLUS : QUDA_TWIST_MINUS);
+  inv_param.Ls = 1;
+
+  // NULL pointers to host fields to force
+  // construction instead of download of the clover field:
+  if( g_c_sw > 0.0 )
+    loadCloverQuda(NULL, NULL, &inv_param);
+
+  // reorder spinor
+  reorder_spinor_toQuda( (double*)spinorIn, inv_param.cpu_prec, 0, NULL );
+
+  // perform the inversion
+  invertQuda(spinorOut, spinorIn, &inv_param);
+
+  if( inv_param.verbosity == QUDA_VERBOSE )
+    if(g_proc_id == 0)
+      printf("# QUDA: Device memory used: Spinor: %f GiB, Gauge: %f GiB, Clover: %f GiB\n",
+             inv_param.spinorGiB, gauge_param.gaugeGiB, inv_param.cloverGiB);
+  if( inv_param.verbosity > QUDA_SILENT )
+    if(g_proc_id == 0)
+      printf("# QUDA: Done: %i iter / %g secs = %g Gflops\n",
+             inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs);
+
+  // number of CG iterations
+  int iteration = inv_param.iter;
+
+  // reorder spinor
+  // both work fields are reordered back and copied into the caller's
+  // even/odd fields: the source into Even/Odd, the solution into *_new
+  reorder_spinor_fromQuda( (double*)spinorIn, inv_param.cpu_prec, 0, NULL );
+  reorder_spinor_fromQuda( (double*)spinorOut, inv_param.cpu_prec, 0, NULL );
+  convert_lexic_to_eo(Even, Odd, solver_field[0]);
+  convert_lexic_to_eo(Even_new, Odd_new, solver_field[1]);
+
+  finalize_solver(solver_field, nr_sf);
+  freeGaugeQuda();
+
+  // reaching the iteration limit is reported as failure
+  if(iteration >= max_iter)
+    return(-1);
+
+  return(iteration);
+}
+
+// Two-flavour (doublet) inversion through QUDA.
+//   Even_new_s, Odd_new_s, Even_new_c, Odd_new_c  receive the two solutions
+//   Even_s, Odd_s, Even_c, Odd_c                  the two sources ("_c" is
+//                                                 labelled "charme" below)
+//   precision, max_iter, solver_flag, rel_prec, even_odd_flag,
+//   sloppy_precision, compression                 same meaning as in
+//                                                 invert_eo_quda()
+// Returns the iteration count reported by QUDA, or -1 when that count is
+// >= max_iter. mu and epsilon are taken from the globals g_mubar, g_epsbar.
+int invert_doublet_eo_quda(spinor * const Even_new_s, spinor * const Odd_new_s,
+                           spinor * const Even_new_c, spinor * const Odd_new_c,
+                           spinor * const Even_s, spinor * const Odd_s,
+                           spinor * const Even_c, spinor * const Odd_c,
+                           const double precision, const int max_iter,
+                           const int solver_flag, const int rel_prec, const int even_odd_flag,
+                           const SloppyPrecision sloppy_precision,
+                           CompressionType compression) {
+
+  spinor ** solver_field = NULL;
+  const int nr_sf = 4;
+  init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
+
+  convert_eo_to_lexic(solver_field[0], Even_s, Odd_s);
+  convert_eo_to_lexic(solver_field[1], Even_c, Odd_c);
+//  convert_eo_to_lexic(g_spinor_field[DUM_DERI+1], Even_new, Odd_new);
+
+  void *spinorIn = (void*)solver_field[0]; // source
+  void *spinorIn_c = (void*)solver_field[1]; // charme source
+  void *spinorOut = (void*)solver_field[2]; // solution
+  void *spinorOut_c = (void*)solver_field[3]; // charme solution
+
+  if ( rel_prec )
+    inv_param.residual_type = QUDA_L2_RELATIVE_RESIDUAL;
+  else
+    inv_param.residual_type = QUDA_L2_ABSOLUTE_RESIDUAL;
+
+  inv_param.kappa = g_kappa;
+
+  // IMPORTANT: use opposite TM mu-flavor since gamma5 -> -gamma5
+  inv_param.mu = -g_mubar /2./g_kappa;
+  inv_param.epsilon = g_epsbar/2./g_kappa;
+
+
+  // figure out which BC to use (theta, trivial...) 
+ set_boundary_conditions(&compression); + + // set the sloppy precision of the mixed prec solver + set_sloppy_prec(sloppy_precision); + + // load gauge after setting precision + _loadGaugeQuda(compression); + + // choose dslash type + if( g_c_sw > 0.0 ) { + inv_param.dslash_type = QUDA_TWISTED_CLOVER_DSLASH; + inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN; + inv_param.solution_type = QUDA_MAT_SOLUTION; + inv_param.clover_order = QUDA_PACKED_CLOVER_ORDER; + inv_param.clover_coeff = g_c_sw*g_kappa; + } + else { + inv_param.dslash_type = QUDA_TWISTED_MASS_DSLASH; + inv_param.matpc_type = QUDA_MATPC_EVEN_EVEN_ASYMMETRIC; + inv_param.solution_type = QUDA_MAT_SOLUTION; + } + + // choose solver + if(solver_flag == BICGSTAB) { + if(g_proc_id == 0) {printf("# QUDA: Using BiCGstab!\n"); fflush(stdout);} + inv_param.inv_type = QUDA_BICGSTAB_INVERTER; + } + else { + /* Here we invert the hermitean operator squared */ + inv_param.inv_type = QUDA_CG_INVERTER; + if(g_proc_id == 0) { + printf("# QUDA: Using mixed precision CG!\n"); + printf("# QUDA: mu = %f, kappa = %f\n", g_mu/2./g_kappa, g_kappa); + fflush(stdout); + } + } + + if( even_odd_flag ) { + inv_param.solve_type = QUDA_NORMOP_PC_SOLVE; + if(g_proc_id == 0) printf("# QUDA: Using preconditioning!\n"); + } + else { + inv_param.solve_type = QUDA_NORMOP_SOLVE; + if(g_proc_id == 0) printf("# QUDA: Not using preconditioning!\n"); + } + + inv_param.tol = sqrt(precision)*0.25; + inv_param.maxiter = max_iter; + + inv_param.twist_flavor = QUDA_TWIST_NONDEG_DOUBLET; + inv_param.Ls = 2; + + // NULL pointers to host fields to force + // construction instead of download of the clover field: + if( g_c_sw > 0.0 ) + loadCloverQuda(NULL, NULL, &inv_param); + + // reorder spinor + reorder_spinor_toQuda( (double*)spinorIn, inv_param.cpu_prec, 1, (double*)spinorIn_c ); + + // perform the inversion + invertQuda(spinorOut, spinorIn, &inv_param); + + if( inv_param.verbosity == QUDA_VERBOSE ) + if(g_proc_id == 0) + printf("# QUDA: Device memory 
used: Spinor: %f GiB, Gauge: %f GiB, Clover: %f GiB\n", + inv_param.spinorGiB, gauge_param.gaugeGiB, inv_param.cloverGiB); + if( inv_param.verbosity > QUDA_SILENT ) + if(g_proc_id == 0) + printf("# QUDA: Done: %i iter / %g secs = %g Gflops\n", + inv_param.iter, inv_param.secs, inv_param.gflops/inv_param.secs); + + // number of CG iterations + int iteration = inv_param.iter; + + // reorder spinor + reorder_spinor_fromQuda( (double*)spinorIn, inv_param.cpu_prec, 1, (double*)spinorIn_c ); + reorder_spinor_fromQuda( (double*)spinorOut, inv_param.cpu_prec, 1, (double*)spinorOut_c ); + convert_lexic_to_eo(Even_s, Odd_s, solver_field[0]); + convert_lexic_to_eo(Even_c, Odd_c, solver_field[1]); + convert_lexic_to_eo(Even_new_s, Odd_new_s, solver_field[2]); + convert_lexic_to_eo(Even_new_c, Odd_new_c, solver_field[3]); + + finalize_solver(solver_field, nr_sf); + freeGaugeQuda(); + + if(iteration >= max_iter) + return(-1); + + return(iteration); +} + +// if even_odd_flag set +void M_full_quda(spinor * const Even_new, spinor * const Odd_new, spinor * const Even, spinor * const Odd) { + inv_param.kappa = g_kappa; + inv_param.mu = fabs(g_mu); + inv_param.epsilon = 0.0; + + // IMPORTANT: use opposite TM flavor since gamma5 -> -gamma5 (until LXLYLZT prob. resolved) + inv_param.twist_flavor = (g_mu < 0.0 ? QUDA_TWIST_PLUS : QUDA_TWIST_MINUS); + inv_param.Ls = (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET || + inv_param.twist_flavor == QUDA_TWIST_DEG_DOUBLET ) ? 
2 : 1; + + void *spinorIn = (void*)g_spinor_field[DUM_DERI]; // source + void *spinorOut = (void*)g_spinor_field[DUM_DERI+1]; // solution + + // reorder spinor + convert_eo_to_lexic( spinorIn, Even, Odd ); + reorder_spinor_toQuda( (double*)spinorIn, inv_param.cpu_prec, 0, NULL ); + + // multiply + inv_param.solution_type = QUDA_MAT_SOLUTION; + MatQuda( spinorOut, spinorIn, &inv_param); + + // reorder spinor + reorder_spinor_fromQuda( (double*)spinorOut, inv_param.cpu_prec, 0, NULL ); + convert_lexic_to_eo( Even_new, Odd_new, spinorOut ); +} + +// no even-odd +void D_psi_quda(spinor * const P, spinor * const Q) { + inv_param.kappa = g_kappa; + inv_param.mu = fabs(g_mu); + inv_param.epsilon = 0.0; + + // IMPORTANT: use opposite TM flavor since gamma5 -> -gamma5 (until LXLYLZT prob. resolved) + inv_param.twist_flavor = (g_mu < 0.0 ? QUDA_TWIST_PLUS : QUDA_TWIST_MINUS); + inv_param.Ls = (inv_param.twist_flavor == QUDA_TWIST_NONDEG_DOUBLET || + inv_param.twist_flavor == QUDA_TWIST_DEG_DOUBLET ) ? 2 : 1; + + void *spinorIn = (void*)Q; + void *spinorOut = (void*)P; + + // reorder spinor + reorder_spinor_toQuda( (double*)spinorIn, inv_param.cpu_prec, 0, NULL ); + + // multiply + inv_param.solution_type = QUDA_MAT_SOLUTION; + MatQuda( spinorOut, spinorIn, &inv_param); + + // reorder spinor + reorder_spinor_fromQuda( (double*)spinorIn, inv_param.cpu_prec, 0, NULL ); + reorder_spinor_fromQuda( (double*)spinorOut, inv_param.cpu_prec, 0, NULL ); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/quda_interface.h b/qcd/part_cpu/applications/QCD/src/kernel_D/quda_interface.h new file mode 100644 index 0000000000000000000000000000000000000000..7d97d848a490e9e91c4fcc4afed6074b2fba35bb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/quda_interface.h @@ -0,0 +1,111 @@ +/*********************************************************************** + * + * Copyright (C) 2015 Mario Schroeck + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + ***********************************************************************/ +/*********************************************************************** +* +* File quda_interface.h +* +* Author: Mario Schroeck +* +* Last changes: 06/2015 +* +* +* Interface to QUDA for multi-GPU inverters +* +* The externally accessible functions are +* +* void _initQuda() +* Initializes the QUDA library. Carries over the lattice size and the +* MPI process grid and thus must be called after initializing MPI. +* Currently it is called in init_operators() if optr->use_qudainverter +* flag is set. +* Memory for the QUDA gaugefield on the host is allocated but not filled +* yet (the latter is done in _loadGaugeQuda(), see below). +* Performance critical settings are done here and can be changed. +* +* void _endQuda() +* Finalizes the QUDA library. Call before MPI_Finalize(). +* +* void _loadGaugeQuda() +* Copies and reorders the gaugefield on the host and copies it to the GPU. +* Must be called between last changes on the gaugefield (smearing etc.) +* and first call of the inverter. In particular, 'boundary(const double kappa)' +* must be called before if nontrivial boundary conditions are to be used since +* those will be applied directly to the gaugefield. Currently it is called just +* before the inversion is done (might result in wasted loads...). 
+* +* The functions +* +* int invert_eo_quda(...); +* int invert_doublet_eo_quda(...); +* void M_full_quda(...); +* void D_psi_quda(...); +* +* mimic their tmLQCD counterparts in functionality as well as input and +* output parameters. The invert functions will check the parameters +* g_mu, g_c_sw to decide which QUDA operator to create. +* +* To activate those, set "UseQudaInverter = yes" in the operator +* declaration of the input file. For details see the documentation. +* +* +* Notes: +* +* Minimum QUDA version is 0.7.0 (see https://github.com/lattice/quda/issues/151 +* and https://github.com/lattice/quda/issues/157). +* +* +**************************************************************************/ + +#ifndef QUDA_INTERFACE_H_ +#define QUDA_INTERFACE_H_ +#include "global.h" +#include "su3.h" +#include "solver/solver_params.h" + + +// wrapper functions +void _initQuda(); +void _endQuda(); +void _loadGaugeQuda(); + +// to be called instead of tmLQCD functions to use the QUDA inverter +int invert_eo_quda(spinor * const Even_new, spinor * const Odd_new, + spinor * const Even, spinor * const Odd, + const double precision, const int max_iter, + const int solver_flag, const int rel_prec, + const int even_odd_flag, solver_params_t solver_params, + const SloppyPrecision sloppy_precision, + CompressionType compression); + +int invert_doublet_eo_quda(spinor * const Even_new_s, spinor * const Odd_new_s, + spinor * const Even_new_c, spinor * const Odd_new_c, + spinor * const Even_s, spinor * const Odd_s, + spinor * const Even_c, spinor * const Odd_c, + const double precision, const int max_iter, + const int solver_flag, const int rel_prec, const int even_odd_flag, + const SloppyPrecision sloppy_precision, + CompressionType compression); + +// apply the TM operator using QUDA +void M_full_quda(spinor * const Even_new, spinor * const Odd_new, spinor * const Even, spinor * const Odd); +void D_psi_quda(spinor * const P, spinor * const Q); + +#endif /* QUDA_INTERFACE_H_ */ diff 
--git a/qcd/part_cpu/applications/QCD/src/kernel_D/ranlxd.c b/qcd/part_cpu/applications/QCD/src/kernel_D/ranlxd.c new file mode 100644 index 0000000000000000000000000000000000000000..99615ca564e486af8af2021869480cb3dac7ff16 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/ranlxd.c @@ -0,0 +1,673 @@ +/******************************************************************************* + * + * File ranlxd.c + * + * Copyright (C) 2005 Martin Luescher + * + * This software is distributed under the terms of the GNU General Public + * License (GPL) + * + * Random number generator "ranlxd". See the notes + * + * "User's guide for ranlxs and ranlxd v3.2" (December 2005) + * + * "Algorithms used in ranlux v3.0" (May 2001) + * + * for a detailed description + * + * The externally accessible functions are + * + * void ranlxd(double r[],int n) + * Computes the next n double-precision random numbers and + * assigns them to the elements r[0],...,r[n-1] of the array r[] + * + * void rlxd_init(int level,int seed) + * Initialization of the generator + * + * int rlxd_size(void) + * Returns the number of integers required to save the state of + * the generator + * + * void rlxd_get(int state[]) + * Extracts the current state of the generator and stores the + * information in the array state[N] where N>=rlxd_size() + * + * void rlxd_reset(int state[]) + * Resets the generator to the state defined by the array state[N] + * + * modified by C. 
Urbach to work with the tmLQCD package + * + *******************************************************************************/ + +#define RANLXD_C + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include "ranlxd.h" + +int ranlxd_init = 0; + +#if ((defined SSE)||(defined SSE2)||(defined SSE3)) + +typedef struct +{ + float c1,c2,c3,c4; +} vec_t __attribute__ ((aligned (16))); + +typedef struct +{ + vec_t c1,c2; +} dble_vec_t __attribute__ ((aligned (16))); + +static int init=0,pr,prm,ir,jr,is,is_old,next[96]; +static vec_t one,one_bit,carry; + +static union +{ + dble_vec_t vec[12]; + float num[96]; +} x __attribute__ ((aligned (16))); + +#define STEP(pi,pj) \ + __asm__ __volatile__ ("movaps %4, %%xmm4 \n\t" \ + "movaps %%xmm2, %%xmm3 \n\t" \ + "subps %2, %%xmm4 \n\t" \ + "movaps %%xmm1, %%xmm5 \n\t" \ + "cmpps $0x6, %%xmm4, %%xmm2 \n\t" \ + "andps %%xmm2, %%xmm5 \n\t" \ + "subps %%xmm3, %%xmm4 \n\t" \ + "andps %%xmm0, %%xmm2 \n\t" \ + "addps %%xmm4, %%xmm5 \n\t" \ + "movaps %%xmm5, %0 \n\t" \ + "movaps %5, %%xmm6 \n\t" \ + "movaps %%xmm2, %%xmm3 \n\t" \ + "subps %3, %%xmm6 \n\t" \ + "movaps %%xmm1, %%xmm7 \n\t" \ + "cmpps $0x6, %%xmm6, %%xmm2 \n\t" \ + "andps %%xmm2, %%xmm7 \n\t" \ + "subps %%xmm3, %%xmm6 \n\t" \ + "andps %%xmm0, %%xmm2 \n\t" \ + "addps %%xmm6, %%xmm7 \n\t" \ + "movaps %%xmm7, %1" \ + : \ + "=m" ((*pi).c1), \ + "=m" ((*pi).c2) \ + : \ + "m" ((*pi).c1), \ + "m" ((*pi).c2), \ + "m" ((*pj).c1), \ + "m" ((*pj).c2)) + + +static void error(int no) +{ + switch(no) + { + case 1: + fprintf(stderr, "Error in subroutine rlxd_init\n"); + fprintf(stderr, "Bad choice of luxury level (should be 1 or 2)\n"); + break; + case 2: + fprintf(stderr, "Error in subroutine rlxd_init\n"); + fprintf(stderr, "Bad choice of seed (should be between 1 and 2^31-1)\n"); + break; + case 3: + fprintf(stderr, "Error in rlxd_get\n"); + fprintf(stderr, "Undefined state (ranlxd is not initialized\n"); + break; + case 5: + fprintf(stderr, 
"Error in rlxd_reset\n"); + fprintf(stderr, "Unexpected input data\n"); + break; + } + fprintf(stderr, "Program aborted\n"); + exit(0); +} + + +static void update(void) +{ + int k,kmax; + dble_vec_t *pmin,*pmax,*pi,*pj; + + kmax=pr; + pmin=&x.vec[0]; + pmax=pmin+12; + pi=&x.vec[ir]; + pj=&x.vec[jr]; + + __asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" + "movaps %1, %%xmm1 \n\t" + "movaps %2, %%xmm2" + : + : + "m" (one_bit), + "m" (one), + "m" (carry)); + + + for (k=0;k=12) + ir-=12; + if (jr>=12) + jr-=12; + is=8*ir; + is_old=is; +} + + +static void define_constants(void) +{ + int k; + float b; + + one.c1=1.0f; + one.c2=1.0f; + one.c3=1.0f; + one.c4=1.0f; + + b=(float)(ldexp(1.0,-24)); + one_bit.c1=b; + one_bit.c2=b; + one_bit.c3=b; + one_bit.c4=b; + + for (k=0;k<96;k++) + { + next[k]=(k+1)%96; + if ((k%4)==3) + next[k]=(k+5)%96; + } +} + + +void rlxd_init(int level,int seed) +{ + int i,k,l; + int ibit,jbit,xbit[31]; + int ix,iy; + + define_constants(); + + if (level==1) + pr=202; + else if (level==2) + pr=397; + else + error(1); + + i=seed; + + for (k=0;k<31;k++) + { + xbit[k]=i%2; + i/=2; + } + + if ((seed<=0)||(i!=0)) + error(2); + + ibit=0; + jbit=18; + + for (i=0;i<4;i++) + { + for (k=0;k<24;k++) + { + ix=0; + + for (l=0;l<24;l++) + { + iy=xbit[ibit]; + ix=2*ix+iy; + + xbit[ibit]=(xbit[ibit]+xbit[jbit])%2; + ibit=(ibit+1)%31; + jbit=(jbit+1)%31; + } + + if ((k%4)!=i) + ix=16777215-ix; + + x.num[4*k+i]=(float)(ldexp((double)(ix),-24)); + } + } + + carry.c1=0.0f; + carry.c2=0.0f; + carry.c3=0.0f; + carry.c4=0.0f; + + ir=0; + jr=7; + is=91; + is_old=0; + prm=pr%12; + init=1; + ranlxd_init = 1; +} + + +void ranlxd(double r[],int n) +{ + int k; + + if (init==0) + rlxd_init(1,1); + + for (k=0;k=167777216)) + error(5); + + x.num[k]=(float)(ldexp((double)(state[k+1]),-24)); + } + + if (((state[97]!=0)&&(state[97]!=1))|| + ((state[98]!=0)&&(state[98]!=1))|| + ((state[99]!=0)&&(state[99]!=1))|| + ((state[100]!=0)&&(state[100]!=1))) + error(5); + + 
carry.c1=(float)(ldexp((double)(state[97]),-24)); + carry.c2=(float)(ldexp((double)(state[98]),-24)); + carry.c3=(float)(ldexp((double)(state[99]),-24)); + carry.c4=(float)(ldexp((double)(state[100]),-24)); + + pr=state[101]; + ir=state[102]; + jr=state[103]; + is=state[104]; + is_old=8*ir; + prm=pr%12; + init=1; + ranlxd_init = 1; + + if (((pr!=202)&&(pr!=397))|| + (ir<0)||(ir>11)||(jr<0)||(jr>11)||(jr!=((ir+7)%12))|| + (is<0)||(is>91)) + error(5); +} + +#else + +#define BASE 0x1000000 +#define MASK 0xffffff + +typedef struct +{ + int c1,c2,c3,c4; +} vec_t; + +typedef struct +{ + vec_t c1,c2; +} dble_vec_t; + +static int init=0,pr,prm,ir,jr,is,is_old,next[96]; +static double one_bit; +static vec_t carry; + +static union +{ + dble_vec_t vec[12]; + int num[96]; +} x; + +#define STEP(pi,pj) \ + d=(*pj).c1.c1-(*pi).c1.c1-carry.c1; \ + (*pi).c2.c1+=(d<0); \ + d+=BASE; \ + (*pi).c1.c1=d&MASK; \ + d=(*pj).c1.c2-(*pi).c1.c2-carry.c2; \ + (*pi).c2.c2+=(d<0); \ + d+=BASE; \ + (*pi).c1.c2=d&MASK; \ + d=(*pj).c1.c3-(*pi).c1.c3-carry.c3; \ + (*pi).c2.c3+=(d<0); \ + d+=BASE; \ + (*pi).c1.c3=d&MASK; \ + d=(*pj).c1.c4-(*pi).c1.c4-carry.c4; \ + (*pi).c2.c4+=(d<0); \ + d+=BASE; \ + (*pi).c1.c4=d&MASK; \ + d=(*pj).c2.c1-(*pi).c2.c1; \ + carry.c1=(d<0); \ + d+=BASE; \ + (*pi).c2.c1=d&MASK; \ + d=(*pj).c2.c2-(*pi).c2.c2; \ + carry.c2=(d<0); \ + d+=BASE; \ + (*pi).c2.c2=d&MASK; \ + d=(*pj).c2.c3-(*pi).c2.c3; \ + carry.c3=(d<0); \ + d+=BASE; \ + (*pi).c2.c3=d&MASK; \ + d=(*pj).c2.c4-(*pi).c2.c4; \ + carry.c4=(d<0); \ + d+=BASE; \ + (*pi).c2.c4=d&MASK + + +static void error(int no) +{ + switch(no) + { + case 0: + fprintf(stderr, "Error in rlxd_init\n"); + fprintf(stderr, "Arithmetic on this machine is not suitable for ranlxd\n"); + break; + case 1: + fprintf(stderr, "Error in subroutine rlxd_init\n"); + fprintf(stderr, "Bad choice of luxury level (should be 1 or 2)\n"); + break; + case 2: + fprintf(stderr, "Error in subroutine rlxd_init\n"); + fprintf(stderr, "Bad choice of seed (should 
be between 1 and 2^31-1)\n"); + break; + case 3: + fprintf(stderr, "Error in rlxd_get\n"); + fprintf(stderr, "Undefined state (ranlxd is not initialized)\n"); + break; + case 4: + fprintf(stderr, "Error in rlxd_reset\n"); + fprintf(stderr, "Arithmetic on this machine is not suitable for ranlxd\n"); + break; + case 5: + fprintf(stderr, "Error in rlxd_reset\n"); + fprintf(stderr, "Unexpected input data\n"); + break; + } + fprintf(stderr, "Program aborted\n"); + exit(0); +} + + +static void update(void) +{ + int k,kmax,d; + dble_vec_t *pmin,*pmax,*pi,*pj; + + kmax=pr; + pmin=&x.vec[0]; + pmax=pmin+12; + pi=&x.vec[ir]; + pj=&x.vec[jr]; + + for (k=0;k=12) + ir-=12; + if (jr>=12) + jr-=12; + is=8*ir; + is_old=is; +} + + +static void define_constants(void) +{ + int k; + + one_bit=ldexp(1.0,-24); + + for (k=0;k<96;k++) + { + next[k]=(k+1)%96; + if ((k%4)==3) + next[k]=(k+5)%96; + } +} + + +void rlxd_init(int level,int seed) +{ + int i,k,l; + int ibit,jbit,xbit[31]; + int ix,iy; + + if ((INT_MAX<2147483647)||(FLT_RADIX!=2)||(FLT_MANT_DIG<24)|| + (DBL_MANT_DIG<48)) + error(0); + + define_constants(); + + if (level==1) + pr=202; + else if (level==2) + pr=397; + else + error(1); + + i=seed; + + for (k=0;k<31;k++) + { + xbit[k]=i%2; + i/=2; + } + + if ((seed<=0)||(i!=0)) + error(2); + + ibit=0; + jbit=18; + + for (i=0;i<4;i++) + { + for (k=0;k<24;k++) + { + ix=0; + + for (l=0;l<24;l++) + { + iy=xbit[ibit]; + ix=2*ix+iy; + + xbit[ibit]=(xbit[ibit]+xbit[jbit])%2; + ibit=(ibit+1)%31; + jbit=(jbit+1)%31; + } + + if ((k%4)!=i) + ix=16777215-ix; + + x.num[4*k+i]=ix; + } + } + + carry.c1=0; + carry.c2=0; + carry.c3=0; + carry.c4=0; + + ir=0; + jr=7; + is=91; + is_old=0; + prm=pr%12; + init=1; + ranlxd_init = 1; +} + + +void ranlxd(double r[],int n) +{ + int k; + + if (init==0) + rlxd_init(1,1); + + for (k=0;k=167777216)) + error(5); + + x.num[k]=state[k+1]; + } + + if (((state[97]!=0)&&(state[97]!=1))|| + ((state[98]!=0)&&(state[98]!=1))|| + ((state[99]!=0)&&(state[99]!=1))|| + 
((state[100]!=0)&&(state[100]!=1))) + error(5); + + carry.c1=state[97]; + carry.c2=state[98]; + carry.c3=state[99]; + carry.c4=state[100]; + + pr=state[101]; + ir=state[102]; + jr=state[103]; + is=state[104]; + is_old=8*ir; + prm=pr%12; + init=1; + ranlxd_init = 1; + + if (((pr!=202)&&(pr!=397))|| + (ir<0)||(ir>11)||(jr<0)||(jr>11)||(jr!=((ir+7)%12))|| + (is<0)||(is>91)) + error(5); +} + +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/ranlxd.h b/qcd/part_cpu/applications/QCD/src/kernel_D/ranlxd.h new file mode 100644 index 0000000000000000000000000000000000000000..be8f66ba80fbb8136d79b7b4ae23bcdbd78bcf90 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/ranlxd.h @@ -0,0 +1,36 @@ +/******************************************************************************* + * + * file ranlxd.h + * + * Copyright (C) 2005 Martin Luescher + * + * This software is distributed under the terms of the GNU General Public + * License (GPL) + * + * + * modified by C. Urbach to work in the tmLQCD package + * + ***********************************************************************/ + + +#ifndef _RANLXD_H +#define _RANLXD_H + +#ifdef __cplusplus +extern "C" +{ +#endif /* __cplusplus */ + + extern int ranlxd_init; + + void ranlxd(double * const r, const int n); + void rlxd_init(const int level, const int seed); + void rlxd_get(int * const state); + void rlxd_reset(int state[]); + int rlxd_size(void); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/ranlxs.c b/qcd/part_cpu/applications/QCD/src/kernel_D/ranlxs.c new file mode 100644 index 0000000000000000000000000000000000000000..05f1310c4d8f84d88a9c05279ccb8fbcdbf9064c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/ranlxs.c @@ -0,0 +1,661 @@ +/******************************************************************************* + * + * File ranlxs.c + * + * Copyright (C) 2005 Martin Luescher + * + * This software is distributed under the terms of the 
GNU General Public + * License (GPL) + * + * Random number generator "ranlxs". See the notes + * + * "User's guide for ranlxs and ranlxd v3.2" (December 2005) + * + * "Algorithms used in ranlux v3.0" (May 2001) + * + * for a detailed description + * + * The externally accessible functions are + * + * void ranlxs(float r[],int n) + * Computes the next n single-precision random numbers and + * assigns them to the elements r[0],...,r[n-1] of the array r[] + * + * void rlxs_init(int level,int seed) + * Initialization of the generator + * + * int rlxs_size(void) + * Returns the number of integers required to save the state of + * the generator + * + * void rlxs_get(int state[]) + * Extracts the current state of the generator and stores the + * information in the array state[N] where N>=rlxs_size() + * + * void rlxs_reset(int state[]) + * Resets the generator to the state defined by the array state[N] + * + *******************************************************************************/ + +#define RANLXS_C + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include + +int ranlxs_init = 0; + +#if ((defined SSE)||(defined SSE2)||(defined SSE3)) + +typedef struct +{ + float c1,c2,c3,c4; +} vec_t __attribute__ ((aligned (16))); + +typedef struct +{ + vec_t c1,c2; +} dble_vec_t __attribute__ ((aligned (16))); + +static int init=0,pr,prm,ir,jr,is,is_old,next[96]; +static vec_t one,one_bit,carry; + +static union +{ + dble_vec_t vec[12]; + float num[96]; +} x __attribute__ ((aligned (16))); + +#define STEP(pi,pj) \ + __asm__ __volatile__ ("movaps %4, %%xmm4 \n\t" \ + "movaps %%xmm2, %%xmm3 \n\t" \ + "subps %2, %%xmm4 \n\t" \ + "movaps %%xmm1, %%xmm5 \n\t" \ + "cmpps $0x6, %%xmm4, %%xmm2 \n\t" \ + "andps %%xmm2, %%xmm5 \n\t" \ + "subps %%xmm3, %%xmm4 \n\t" \ + "andps %%xmm0, %%xmm2 \n\t" \ + "addps %%xmm4, %%xmm5 \n\t" \ + "movaps %%xmm5, %0 \n\t" \ + "movaps %5, %%xmm6 \n\t" \ + "movaps %%xmm2, %%xmm3 \n\t" \ + "subps %3, %%xmm6 \n\t" \ + "movaps 
%%xmm1, %%xmm7 \n\t" \ + "cmpps $0x6, %%xmm6, %%xmm2 \n\t" \ + "andps %%xmm2, %%xmm7 \n\t" \ + "subps %%xmm3, %%xmm6 \n\t" \ + "andps %%xmm0, %%xmm2 \n\t" \ + "addps %%xmm6, %%xmm7 \n\t" \ + "movaps %%xmm7, %1" \ + : \ + "=m" ((*pi).c1), \ + "=m" ((*pi).c2) \ + : \ + "m" ((*pi).c1), \ + "m" ((*pi).c2), \ + "m" ((*pj).c1), \ + "m" ((*pj).c2)) + + +static void error(int no) +{ + switch(no) + { + case 1: + fprintf(stderr, "Error in subroutine rlxs_init\n"); + fprintf(stderr, "Bad choice of luxury level (should be 0,1 or 2)\n"); + break; + case 2: + fprintf(stderr, "Error in subroutine rlxs_init\n"); + fprintf(stderr, "Bad choice of seed (should be between 1 and 2^31-1)\n"); + break; + case 3: + fprintf(stderr, "Error in rlxs_get\n"); + fprintf(stderr, "Undefined state (ranlxs is not initialized\n"); + break; + case 5: + fprintf(stderr, "Error in rlxs_reset\n"); + fprintf(stderr, "Unexpected input data\n"); + break; + } + fprintf(stderr, "Program aborted\n"); + exit(0); +} + + +static void update(void) +{ + int k,kmax; + dble_vec_t *pmin,*pmax,*pi,*pj; + + kmax=pr; + pmin=&x.vec[0]; + pmax=pmin+12; + pi=&x.vec[ir]; + pj=&x.vec[jr]; + + __asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" + "movaps %1, %%xmm1 \n\t" + "movaps %2, %%xmm2" + : + : + "m" (one_bit), + "m" (one), + "m" (carry)); + + for (k=0;k=12) + ir-=12; + if (jr>=12) + jr-=12; + is=8*ir; + is_old=is; +} + + +static void define_constants(void) +{ + int k; + float b; + + one.c1=1.0f; + one.c2=1.0f; + one.c3=1.0f; + one.c4=1.0f; + + b=(float)(ldexp(1.0,-24)); + one_bit.c1=b; + one_bit.c2=b; + one_bit.c3=b; + one_bit.c4=b; + + for (k=0;k<96;k++) + next[k]=(k+1)%96; +} + + +void rlxs_init(int level,int seed) +{ + int i,k,l; + int ibit,jbit,xbit[31]; + int ix,iy; + + define_constants(); + + if (level==0) + pr=109; + else if (level==1) + pr=202; + else if (level==2) + pr=397; + else + error(1); + + i=seed; + + for (k=0;k<31;k++) + { + xbit[k]=i%2; + i/=2; + } + + if ((seed<=0)||(i!=0)) + error(2); + + ibit=0; + 
jbit=18; + + for (i=0;i<4;i++) + { + for (k=0;k<24;k++) + { + ix=0; + + for (l=0;l<24;l++) + { + iy=xbit[ibit]; + ix=2*ix+iy; + + xbit[ibit]=(xbit[ibit]+xbit[jbit])%2; + ibit=(ibit+1)%31; + jbit=(jbit+1)%31; + } + + if ((k%4)==i) + ix=16777215-ix; + + x.num[4*k+i]=(float)(ldexp((double)(ix),-24)); + } + } + + carry.c1=0.0f; + carry.c2=0.0f; + carry.c3=0.0f; + carry.c4=0.0f; + + ir=0; + jr=7; + is=95; + is_old=0; + prm=pr%12; + init=1; + ranlxs_init = 1; +} + + +void ranlxs(float r[],int n) +{ + int k; + + if (init==0) + rlxs_init(0,1); + + for (k=0;k=167777216)) + error(5); + + x.num[k]=(float)(ldexp((double)(state[k+1]),-24)); + } + + if (((state[97]!=0)&&(state[97]!=1))|| + ((state[98]!=0)&&(state[98]!=1))|| + ((state[99]!=0)&&(state[99]!=1))|| + ((state[100]!=0)&&(state[100]!=1))) + error(5); + + carry.c1=(float)(ldexp((double)(state[97]),-24)); + carry.c2=(float)(ldexp((double)(state[98]),-24)); + carry.c3=(float)(ldexp((double)(state[99]),-24)); + carry.c4=(float)(ldexp((double)(state[100]),-24)); + + pr=state[101]; + ir=state[102]; + jr=state[103]; + is=state[104]; + is_old=8*ir; + prm=pr%12; + init=1; + + if (((pr!=109)&&(pr!=202)&&(pr!=397))|| + (ir<0)||(ir>11)||(jr<0)||(jr>11)||(jr!=((ir+7)%12))|| + (is<0)||(is>95)) + error(5); +} + +#else + +#define BASE 0x1000000 +#define MASK 0xffffff + +typedef struct +{ + int c1,c2,c3,c4; +} vec_t; + +typedef struct +{ + vec_t c1,c2; +} dble_vec_t; + +static int init=0,pr,prm,ir,jr,is,is_old,next[96]; +static float one_bit; +static vec_t carry; + +static union +{ + dble_vec_t vec[12]; + int num[96]; +} x; + +#define STEP(pi,pj) \ + d=(*pj).c1.c1-(*pi).c1.c1-carry.c1; \ + (*pi).c2.c1+=(d<0); \ + d+=BASE; \ + (*pi).c1.c1=d&MASK; \ + d=(*pj).c1.c2-(*pi).c1.c2-carry.c2; \ + (*pi).c2.c2+=(d<0); \ + d+=BASE; \ + (*pi).c1.c2=d&MASK; \ + d=(*pj).c1.c3-(*pi).c1.c3-carry.c3; \ + (*pi).c2.c3+=(d<0); \ + d+=BASE; \ + (*pi).c1.c3=d&MASK; \ + d=(*pj).c1.c4-(*pi).c1.c4-carry.c4; \ + (*pi).c2.c4+=(d<0); \ + d+=BASE; \ + 
(*pi).c1.c4=d&MASK; \ + d=(*pj).c2.c1-(*pi).c2.c1; \ + carry.c1=(d<0); \ + d+=BASE; \ + (*pi).c2.c1=d&MASK; \ + d=(*pj).c2.c2-(*pi).c2.c2; \ + carry.c2=(d<0); \ + d+=BASE; \ + (*pi).c2.c2=d&MASK; \ + d=(*pj).c2.c3-(*pi).c2.c3; \ + carry.c3=(d<0); \ + d+=BASE; \ + (*pi).c2.c3=d&MASK; \ + d=(*pj).c2.c4-(*pi).c2.c4; \ + carry.c4=(d<0); \ + d+=BASE; \ + (*pi).c2.c4=d&MASK + + +static void error(int no) +{ + switch(no) + { + case 0: + fprintf(stderr, "Error in rlxs_init\n"); + fprintf(stderr, "Arithmetic on this machine is not suitable for ranlxs\n"); + break; + case 1: + fprintf(stderr, "Error in subroutine rlxs_init\n"); + fprintf(stderr, "Bad choice of luxury level (should be 0,1 or 2)\n"); + break; + case 2: + fprintf(stderr, "Error in subroutine rlxs_init\n"); + fprintf(stderr, "Bad choice of seed (should be between 1 and 2^31-1)\n"); + break; + case 3: + fprintf(stderr, "Error in rlxs_get\n"); + fprintf(stderr, "Undefined state (ranlxs is not initialized)\n"); + break; + case 4: + fprintf(stderr, "Error in rlxs_reset\n"); + fprintf(stderr, "Arithmetic on this machine is not suitable for ranlxs\n"); + break; + case 5: + fprintf(stderr, "Error in rlxs_reset\n"); + fprintf(stderr, "Unexpected input data\n"); + break; + } + fprintf(stderr, "Program aborted\n"); + exit(0); +} + + +static void update(void) +{ + int k,kmax,d; + dble_vec_t *pmin,*pmax,*pi,*pj; + + kmax=pr; + pmin=&x.vec[0]; + pmax=pmin+12; + pi=&x.vec[ir]; + pj=&x.vec[jr]; + + for (k=0;k=12) + ir-=12; + if (jr>=12) + jr-=12; + is=8*ir; + is_old=is; +} + + +static void define_constants(void) +{ + int k; + + one_bit=(float)(ldexp(1.0,-24)); + + for (k=0;k<96;k++) + next[k]=(k+1)%96; +} + + +void rlxs_init(int level,int seed) +{ + int i,k,l; + int ibit,jbit,xbit[31]; + int ix,iy; + + if ((INT_MAX<2147483647)||(FLT_RADIX!=2)||(FLT_MANT_DIG<24)) + error(0); + + define_constants(); + + if (level==0) + pr=109; + else if (level==1) + pr=202; + else if (level==2) + pr=397; + else + error(1); + + i=seed; + + for 
(k=0;k<31;k++) + { + xbit[k]=i%2; + i/=2; + } + + if ((seed<=0)||(i!=0)) + error(2); + + ibit=0; + jbit=18; + + for (i=0;i<4;i++) + { + for (k=0;k<24;k++) + { + ix=0; + + for (l=0;l<24;l++) + { + iy=xbit[ibit]; + ix=2*ix+iy; + + xbit[ibit]=(xbit[ibit]+xbit[jbit])%2; + ibit=(ibit+1)%31; + jbit=(jbit+1)%31; + } + + if ((k%4)==i) + ix=16777215-ix; + + x.num[4*k+i]=ix; + } + } + + carry.c1=0; + carry.c2=0; + carry.c3=0; + carry.c4=0; + + ir=0; + jr=7; + is=95; + is_old=0; + prm=pr%12; + init=1; + ranlxs_init = 1; +} + + +void ranlxs(float r[],int n) +{ + int k; + + if (init==0) + rlxs_init(0,1); + + for (k=0;k=167777216)) + error(5); + + x.num[k]=state[k+1]; + } + + if (((state[97]!=0)&&(state[97]!=1))|| + ((state[98]!=0)&&(state[98]!=1))|| + ((state[99]!=0)&&(state[99]!=1))|| + ((state[100]!=0)&&(state[100]!=1))) + error(5); + + carry.c1=state[97]; + carry.c2=state[98]; + carry.c3=state[99]; + carry.c4=state[100]; + + pr=state[101]; + ir=state[102]; + jr=state[103]; + is=state[104]; + is_old=8*ir; + prm=pr%12; + init=1; + + if (((pr!=109)&&(pr!=202)&&(pr!=397))|| + (ir<0)||(ir>11)||(jr<0)||(jr>11)||(jr!=((ir+7)%12))|| + (is<0)||(is>95)) + error(5); +} + +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/ranlxs.h b/qcd/part_cpu/applications/QCD/src/kernel_D/ranlxs.h new file mode 100644 index 0000000000000000000000000000000000000000..ed227c6f746323b4352ac6cea1a7b7a5ac6138a3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/ranlxs.h @@ -0,0 +1,35 @@ +/******************************************************************************* + * + * file ranlxs.h + * + * Copyright (C) 2005 Martin Luescher + * + * This software is distributed under the terms of the GNU General Public + * License (GPL) + * + * + * modified by C. 
Urbach to work in the tmLQCD package + * + ***********************************************************************/ + +#ifndef _RANLXS_H +#define _RANLXS_H + +#ifdef __cplusplus +extern "C" +{ +#endif /* __cplusplus */ + + extern int ranlxs_init; + + void ranlxs(float r[],int n); + void rlxs_init(int level,int seed); + void rlxs_get(int state[]); + void rlxs_reset(int state[]); + void fabhaan_vect(); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/rational/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_D/rational/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..7cc007e3361ccd745f9325b144205c3312b041ba --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/rational/Makefile @@ -0,0 +1,96 @@ + +srcdir = . +top_builddir = .. +abs_top_builddir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +top_srcdir = .. +abs_top_srcdir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +subdir = rational +builddir = . + +CFLAGS = -std=c99 -fopenmp -pedantic -Wall +DEPFLAGS = -MM +LDFLAGS = -L${HOME}/lib -L${top_builddir}/lib +DEFS = -DHAVE_CONFIG_H +OPTARGS = -O +SOPTARGS = -O + +AR = ar +RANLIB = ranlib +CC = mpicc +CCDEP = gcc +CCLD = ${CC} +LINK = ${CCLD} ${CFLAGS} ${LDFLAGS} ${OPTARGS} -o $@ +LEX = flex +AUTOCONF = autoconf +DEFS = -DHAVE_CONFIG_H + +INCLUDES = -I$(HOME)/include/ -I. 
-I${abs_top_builddir}/ -I${abs_top_srcdir}/ -I/include/ -I/include/ +LDADD = +#COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} +COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS} + +LIBRARIES = librational +librational_TARGETS = zolotarev elliptic rational + +librational_STARGETS = + +librational_OBJECTS = $(addsuffix .o, ${librational_TARGETS}) +librational_SOBJECTS = $(addsuffix .o, ${librational_STARGETS}) + +# default rule + +all: Makefile dep librational.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) -g +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) +profile all-profile: all + + +#include dep rules + +-include $(addsuffix .d,${librational_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +${librational_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${OPTARGS} -c $< + +${librational_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${SOPTARGS} -c $< + +# rule to make librational + +librational.a: ${librational_OBJECTS} ${librational_SOBJECTS} Makefile + @rm -f librational.a + @${AR} cru librational.a ${librational_OBJECTS} ${librational_SOBJECTS} + @$(RANLIB) librational.a + @cp librational.a ../lib/librational.a + +# rule to generate .d files + +$(addsuffix .d, $(librational_TARGETS) ${librational_STARGETS}): %.d: ${srcdir}/%.c Makefile + @${CCDEP} ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${librational_TARGETS} ${librational_STARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} ${$(addsuffix _SOBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/librational.a + +distclean: clean + rm -f Makefile + +.PHONY: all dep clean compile-clean distclean profile all-profile debug all-debug diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_D/rational/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_D/rational/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..225a9e5b7446c14f0c1efd8b1a2add21dff86ff8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/rational/Makefile.in @@ -0,0 +1,96 @@ + +srcdir = @srcdir@ +top_builddir = @top_builddir@ +abs_top_builddir = @abs_top_builddir@ +top_srcdir = @top_srcdir@ +abs_top_srcdir = @abs_top_srcdir@ +subdir = rational +builddir = @builddir@ + +CFLAGS = @CFLAGS@ +DEPFLAGS = @DEPFLAGS@ +LDFLAGS = @LDFLAGS@ +DEFS = @DEFS@ +OPTARGS = @OPTARGS@ +SOPTARGS = @SOPTARGS@ + +AR = @AR@ +RANLIB = @RANLIB@ +CC = @CC@ +CCDEP = @CCDEP@ +CCLD = ${CC} +LINK = ${CCLD} ${CFLAGS} ${LDFLAGS} ${OPTARGS} -o $@ +LEX = @LEX@ +AUTOCONF = @AUTOCONF@ +DEFS = @DEFS@ + +INCLUDES = @INCLUDES@ +LDADD = +#COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} +COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS} + +LIBRARIES = librational +librational_TARGETS = zolotarev elliptic rational + +librational_STARGETS = + +librational_OBJECTS = $(addsuffix .o, ${librational_TARGETS}) +librational_SOBJECTS = $(addsuffix .o, ${librational_STARGETS}) + +# default rule + +all: Makefile dep librational.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@ +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@ +profile all-profile: all + + +#include dep rules + +-include $(addsuffix .d,${librational_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +${librational_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${OPTARGS} -c $< + +${librational_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${SOPTARGS} -c $< + +# rule to make librational + +librational.a: ${librational_OBJECTS} ${librational_SOBJECTS} Makefile + 
@rm -f librational.a + @${AR} cru librational.a ${librational_OBJECTS} ${librational_SOBJECTS} + @$(RANLIB) librational.a + @cp librational.a ../lib/librational.a + +# rule to generate .d files + +$(addsuffix .d, $(librational_TARGETS) ${librational_STARGETS}): %.d: ${srcdir}/%.c Makefile + @${CCDEP} ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${librational_TARGETS} ${librational_STARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} ${$(addsuffix _SOBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/librational.a + +distclean: clean + rm -f Makefile + +.PHONY: all dep clean compile-clean distclean profile all-profile debug all-debug diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/rational/elliptic.c b/qcd/part_cpu/applications/QCD/src/kernel_D/rational/elliptic.c new file mode 100644 index 0000000000000000000000000000000000000000..7d7c581cdce802983c8c5a7ef47e9b734c84191e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/rational/elliptic.c @@ -0,0 +1,264 @@ + +/******************************************************************************* +* +* File elliptic.c +* +* Copyright (C) 2008, 2012 Martin Luescher +* +* This software is distributed under the terms of the GNU General Public +* License (GPL) +* +* Computation of the Jacobi elliptic functions sn, cn and dn +* +* The externally accessible functions are +* +* double ellipticK(double rk) +* Returns the complete elliptic integral K(k) for 0<=k<1. The value +* of k is to be passed through the argument rk=k/k' (see the notes). +* +* void sncndn(double u,double rk,double *sn,double *cn,double *dn) +* Computes the Jacobi elliptic functions sn(u,k), cn(u,k), dn(u,k) +* for specified real u and 0<=k<1. The value of k is to be passed +* through the argument rk=k/k' (see the notes). 
+* +* Notes: +* +* The complete elliptic integral and the Jacobi elliptic functions in the +* range -K/2<=u<=K/2 are obtained practically to machine precision. In +* particular, sn(u,k)=u+O(u^3) and cn(u,k)=1-u^2/2+O(u^4) exactly. +* +* Other values of u are first mapped to the interval 0<=u<=K/2 using the +* symmetry properties of the elliptic functions and the numerically computed +* value of K. In general this implies a loss of significance of the argument +* which propagates to the computed functions. +* +* The complete elliptic integral is obtained via the arithmetic-geometric +* mean. For small u, the Jacobi elliptic functions are calculated using +* the Taylor expansion. Elsewhere the descending Landen transformation is +* used. See +* +* M. Abramowitz, I. A. Stegun: "Handbook of mathematical functions", +* (Dover Publications, New York, 1972) +* +* for example. +* +* These methods eventually require both k and k'=sqrt(1-k*k) as input. While +* k' can be computed for given k, there can be important significance losses +* at this point if k is close to 1. On the other hand, if rk=k/k' is given, +* k and k' can be computed with negligible significance losses through +* +* k=rk/sqrt(1+rk^2), k'=1/sqrt(1+rk^2). +* +* This is why rk is chosen as input parameter in the programs in this file. 
+* +*******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "elliptic.h" + + +static double agm(double x,double y) +{ + double px,py; + + for (;;) + { + px=x; + py=y; + + x=0.5*(px+py); + y=sqrt(px*py); + + if ((x<=y)||(x>=px)||(y<=py)) + return x; + } +} + + +double ellipticK(const double rk) +{ + double x,y; + + if (rk<0.0) + { + fprintf(stderr, "Argument rk in ellipticK out of range\n"); + return 1.0; + } + + x=1.0+rk/sqrt(1.0+rk*rk); + y=1.0/(x*(1.0+rk*rk)); + + return (2.0*atan(1.0))/agm(x,y); +} + + +static double sn_small(double u,double rk) +{ + double m,u2,sn; + double s0,s2,s4,s6; + + m=(rk*rk)/(1.0+rk*rk); + + s0=1.0; + s2=-(1.0+m)/6.0; + s4=(1.0+14.0*m+m*m)/120.0; + s6=-(1.0+135.0*m*(1.0+m)+m*m*m)/5040.0; + + u2=u*u; + sn=s4+s6*u2; + sn=s2+sn*u2; + sn=s0+sn*u2; + + return sn*u; +} + + +static void sncn_limit(double u,double rk,double *sn,double *cn) +{ + double k,m,s,c,r; + + k=rk/sqrt(1.0+rk*rk); + m=k*k; + + s=sin(u); + c=cos(u); + r=0.25*m*(u-s*c); + + (*sn)=s-r*c; + (*cn)=c+r*s; +} + + +static void landen(double u,double rk,double *sn,double *cn) +{ + int n; + double k,kp,kt,ktp; + double delta,fact; + + delta=sqrt(DBL_EPSILON); + kp=1.0/sqrt(1.0+rk*rk); + k=rk*kp; + + for (n=0;k>delta;n++) + { + kt=(k*k)/((1.0+kp)*(1.0+kp)); + ktp=(2.0*sqrt(kp))/(1.0+kp); + u*=(0.5+0.5*kp); + + k=kt; + kp=ktp; + } + + sncn_limit(u,k/kp,sn,cn); + + kt=k; + ktp=kp; + + for (;n>0;n--) + { + k=(2.0*sqrt(kt))/(1.0+kt); + kp=(ktp*ktp)/((1.0+kt)*(1.0+kt)); + + fact=1.0/(1.0+kt*(*sn)*(*sn)); + (*sn)=(1.0+kt)*(*sn)*fact; + (*cn)=(*cn)*sqrt(ktp*ktp+kt*kt*(*cn)*(*cn))*fact; + + kt=k; + ktp=kp; + } +} + + +void sncndn(const double _u, const double rk,double *sn,double *cn,double *dn) +{ + int n,flip; + double k,kp,K,delta,cd,sd,nd; + double sgn_sn,sgn_cn; + double u = _u; + + if (rk<0.0) + { + fprintf(stderr, "Argument rk in sncndn is out of range\n"); + + 
(*sn)=0.0; + (*cn)=1.0; + (*dn)=0.0; + + return; + } + + sgn_sn=1.0; + sgn_cn=1.0; + + if (u<0.0) + { + u=-u; + sgn_sn*=-1.0; + } + + K=ellipticK(rk); + n=(int)(u/K); + u-=(double)(n)*K; + n=n%4; + + if (n==1) + { + u=K-u; + sgn_cn*=-1.0; + } + else if (n==2) + { + sgn_sn*=-1.0; + sgn_cn*=-1.0; + } + else if (n==3) + { + u=K-u; + sgn_sn*=-1.0; + } + + if ((2.0*u)<=K) + flip=0; + else + { + u=K-u; + flip=1; + } + + kp=1.0/sqrt(1.0+rk*rk); + k=rk*kp; + + delta=pow(DBL_EPSILON,0.125); + if (delta>1.0e-3) + delta=1.0e-3; + + if (fabs(u). + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "zolotarev.h" +#include "rational.h" + +// init a rational approximation in the range [eps:1] +// rat->range[0,1] should be the spectral range of the squared operator +// eps is computed to be range[0]/range[1] +// order is the order n of the rational approximation [n,n] +// ca and cb specify the range of monomials to use (0 to order-1) + +int init_rational(rational_t * rat, const unsigned int scale) { + int order = rat->order; + double * ars = malloc(2*order*sizeof(double)); + double * ar; + double pmu, pnu; + double a = rat->range[0], b = rat->range[1]; + double sb = 1.; + int ca = rat->crange[0], cb = rat->crange[1]; + + // sanity check of input parameters + if(ca > order-1 || cb > order-1 || ca < 0 || cb < 0 || ca > cb || order < 1|| + b < a || b < 0 || a < 0) { + fprintf(stderr, "parameters to init_rational out of range\n"); + fprintf(stderr, "ca = %d, cb = %d, order = %d, a = %e, b = %e\n", ca, cb, order, a, b); + return(-1); + } + int np = cb - ca + 1; + if(scale) { + sb = sqrt(b); + } + rat->np = np; + if(((rat->mu = (double*)malloc(np*sizeof(double))) == NULL) || + ((rat->rmu = (double*)malloc(np*sizeof(double))) == NULL) || + ((rat->nu = (double*)malloc(np*sizeof(double))) == NULL) || + ((rat->rnu = (double*)malloc(np*sizeof(double))) == NULL)) { + 
fprintf(stderr, "Could not allocate memory for coefficients in init_rational\n"); + return(-2); + } + rat->eps = a/b; + + // compute optimal zolotarev approximation + zolotarev(order, rat->eps, &rat->A, ars, &rat->delta); + rat->A /= sb; + if(g_proc_id == 0 && g_debug_level > 0) { + printf("# rational approximation of order %d generated with max deviation delta = %e\n", rat->order, rat->delta); + } + // restrict to relevant coefficients [2*ca:2*cb] + ar = ars + 2*ca; + // compute mu[] and nu[] = sqrt(ar), mu: r even, nu: r odd + for (int i = 0; i < np; i++) { + rat->mu[np-i-1] = sb*sqrt(ar[2*i + 1]); + rat->nu[np-i-1] = sb*sqrt(ar[2*i]); + } + // compute the partial fraction coefficients rmu and rnu + for (int i = 0; i < np; i++) { + pmu = 1.0; + pnu = 1.0; + + for (int j = 0; j < np; j++) { + if (j != i) { + pmu *= ((ar[2*j]-ar[2*i+1]) / (ar[2*j+1]-ar[2*i+1])); + pnu *= ((rat->mu[j]-rat->nu[i]) / (rat->nu[j]-rat->nu[i])); + } + } + + rat->rmu[np-i-1] = sb*sb*(ar[2*i]-ar[2*i+1])*pmu; + rat->rnu[i] = (rat->mu[i]-rat->nu[i])*pnu; + } + + free(ars); + return(0); +} + + +int free_rational(rational_t * rat) { + free(rat->mu); + free(rat->nu); + free(rat->rmu); + free(rat->rnu); + rat->mu = NULL; + rat->nu = NULL; + rat->rmu = NULL; + rat->rnu = NULL; + return(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/rational/rational.h b/qcd/part_cpu/applications/QCD/src/kernel_D/rational/rational.h new file mode 100644 index 0000000000000000000000000000000000000000..8195756281b99cd8937c1d960aa2565072a887d8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/rational/rational.h @@ -0,0 +1,37 @@ +/*********************************************************************** + * + * Copyright (C) 2013 Carsten Urbach + * + * This file is part of tmLQCD. 
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD.  If not, see .
+ ***********************************************************************/
+
+#ifndef _RATIONAL_H
+#define _RATIONAL_H
+
+typedef struct { /* parameters (inputs) and coefficients (outputs) of one rational approximation, see init_rational */
+  int order, np; /* order: order n of the [n,n] approximation (input); np: number of retained terms, crange[1]-crange[0]+1 (set by init_rational) */
+  int crange[2]; /* input: first and last coefficient index to keep, 0 <= crange[0] <= crange[1] <= order-1 */
+  double range[2]; /* input: approximation interval [a,b]; per rational.c the spectral range of the squared operator */
+  double eps; /* range[0]/range[1], set by init_rational */
+  double A, delta; /* amplitude (divided by sqrt(range[1]) when scale != 0) and error delta returned by zolotarev() */
+  double *mu,*rmu; /* np doubles each, allocated by init_rational: mu from the even-r Zolotarev coefficients, rmu the matching partial fraction coefficients */
+  double *nu,*rnu; /* np doubles each, allocated by init_rational: nu from the odd-r Zolotarev coefficients, rnu the matching partial fraction coefficients */
+} rational_t;
+
+int init_rational(rational_t * rat, const unsigned int scale); /* fills rat; returns 0 on success, -1 if the parameters are out of range, -2 if memory allocation fails */
+int free_rational(rational_t * rat); /* frees mu, nu, rmu, rnu and resets them to NULL; returns 0 */
+
+#endif
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/rational/zolotarev.c b/qcd/part_cpu/applications/QCD/src/kernel_D/rational/zolotarev.c
new file mode 100644
index 0000000000000000000000000000000000000000..d86fb5065abac34df88aa0be36c956faa30cd891
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/rational/zolotarev.c
@@ -0,0 +1,114 @@
+
+/*******************************************************************************
+*
+* File zolotarev.c
+*
+* Copyright (C) 2008, 2012 Martin Luescher
+*
+* This software is distributed under the terms of the GNU General Public
+* License (GPL)
+*
+* Computation of the Zolotarev rational approximation to 1/sqrt(y)
+*
+* The externally accessible function is
+*
+*   void zolotarev(int n,double eps,double *A,double *ar,double *delta)
+*     Computes the amplitude A, the coefficients ar[r-1]=a_r, r=1,..,2n,
+*     and the error delta of the Zolotarev optimal rational approximation
+*     of degree [n,n] to the function f(y)=1/sqrt(y).
+* +* Notes: +* +* The optimal rational approximation R(y) of degree [n,n] to 1/sqrt(y) +* in the range eps<=y<=1 is given by +* +* R(y)=A*P(y)/Q(y), +* +* P(y)=(y+a_1)*(y+a_3)*..*(y+a_{2n-1}), +* +* Q(y)=(y+a_2)*(y+a_4)*..*(y+a_{2n}), +* +* a_r={cn(r*v,k)/sn(r*v,k)}^2, v=K/(2n+1), k=sqrt(1-eps), +* +* where sn(u,k), cn(u,k) and K=K(k) denote the Jacobi elliptic functions +* and the complete elliptic integral respectively. The formulae for the +* the amplitude A and the relative error delta, +* +* A={2/[1+sqrt(1-d^2)]}*[c_1*c_3*..*c_{2n-1}]/[c_2*c_4*..*c_{2n}], +* +* delta=d^2/[1+sqrt(1-d^2)]^2, +* +* involve the coefficients +* +* c_r={sn(r*v,k)}^2, r=1,..,2n, +* +* d=k^{2n+1}*{c_1*c_3*..*c_{2n-1}}^2. +* +* See N.I. Achiezer: "Theory of Approximation" (Dover Publications, New York, +* 1992) for the proof of these formulae. +* +*******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "elliptic.h" +#include "zolotarev.h" + +void zolotarev(const int n, const double eps, + double * A, double *ar, double *delta) +{ + int r; + double v,k,rk,d,s; + double sn,cn,dn,snx,cnx,dnx; + + if ((n<1)||(eps<=0.0)||(eps>=1.0)) + { + fprintf(stderr, "Arguments in zolotarev are out of range\n"); + + (*A)=1.0; + (*delta)=1.0; + + return; + } + + k=sqrt(1.0-eps); + rk=k/sqrt(eps); + v=ellipticK(rk)/(double)(2*n+1); + + (*A)=1.0; + d=k; + + for (r=1;r<=(2*n);r++) + { + if (r<=n) + { + sncndn((double)(r)*v,rk,&sn,&cn,&dn); + ar[r-1]=(cn*cn)/(sn*sn); + } + else + { + sncndn((double)(2*n+1-r)*v,rk,&snx,&cnx,&dnx); + ar[r-1]=eps*((snx*snx)/(cnx*cnx)); + sn=cnx/dnx; + } + + s=sn*sn; + + if ((r%2)==0) + (*A)/=s; + else + { + (*A)*=s; + s*=k; + d*=(s*s); + } + } + + s=1.0+sqrt(1.0-d*d); + (*A)*=(2.0/s); + (*delta)=(d*d)/(s*s); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/rational/zolotarev.h b/qcd/part_cpu/applications/QCD/src/kernel_D/rational/zolotarev.h new file mode 
100644 index 0000000000000000000000000000000000000000..25626536caa2566adaf0130a3933a03239353eef --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/rational/zolotarev.h @@ -0,0 +1,7 @@ +#ifndef _ZOLOTAREV_H +#define _ZOLOTAREV_H + +void zolotarev(const int n, const double eps, + double * A, double *ar, double *delta); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/read_input.c b/qcd/part_cpu/applications/QCD/src/kernel_D/read_input.c new file mode 100644 index 0000000000000000000000000000000000000000..4a4b1a14c6cd8d411d41a02109534ecef573bd4b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/read_input.c @@ -0,0 +1,7391 @@ + +#line 3 "" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 5 +#define YY_FLEX_SUBMINOR_VERSION 33 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include +#include +#include +#include + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have . Non-C99 systems may or may not. */ + +#if __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; +#endif /* ! C99 */ + +/* Limits of integral types. 
*/ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! FLEXINT_H */ + +#ifdef __cplusplus + +/* The "const" storage-class-modifier is valid. */ +#define YY_USE_CONST + +#else /* ! __cplusplus */ + +#if __STDC__ + +#define YY_USE_CONST + +#endif /* __STDC__ */ +#endif /* ! __cplusplus */ + +#ifdef YY_USE_CONST +#define yyconst const +#else +#define yyconst +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an unsigned + * integer for use as an array index. If the signed char is negative, + * we want to instead treat it as an 8-bit unsigned char, hence the + * double cast. + */ +#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c) + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. + */ +#define BEGIN (yy_start) = 1 + 2 * + +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START (((yy_start) - 1) / 2) +#define YYSTATE YY_START + +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) + +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE yyrestart(yyin ) + +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. 
*/ +#ifndef YY_BUF_SIZE +#define YY_BUF_SIZE 16384 +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer. + */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +extern int yyleng; + +extern FILE *yyin, *yyout; + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + #define YY_LESS_LINENO(n) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = (yy_hold_char); \ + YY_RESTORE_YY_MORE_OFFSET \ + (yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up yytext again */ \ + } \ + while ( 0 ) + +#define unput(c) yyunput( c, (yytext_ptr) ) + +/* The following is because we cannot portably get our hands on size_t + * (without autoconf's help, which isn't available because we want + * flex-generated scanners to compile on their own). + */ + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef unsigned int yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. 
+ */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via yyrestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* Stack of input buffers. */ +static size_t yy_buffer_stack_top = 0; /**< index of top of stack. */ +static size_t yy_buffer_stack_max = 0; /**< capacity of stack. */ +static YY_BUFFER_STATE * yy_buffer_stack = 0; /**< Stack as an array. */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( (yy_buffer_stack) \ + ? (yy_buffer_stack)[(yy_buffer_stack_top)] \ + : NULL) + +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. 
+ */ +#define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)] + +/* yy_hold_char holds the character lost when yytext is formed. */ +static char yy_hold_char; +static int yy_n_chars; /* number of characters read into yy_ch_buf */ +int yyleng; + +/* Points to current character in buffer. */ +static char *yy_c_buf_p = (char *) 0; +static int yy_init = 0; /* whether we need to initialize */ +static int yy_start = 0; /* start state number */ + +/* Flag which is used to allow yywrap()'s to do buffer switches + * instead of setting up a fresh yyin. A bit of a hack ... + */ +static int yy_did_buffer_switch_on_eof; + +void yyrestart (FILE *input_file ); +void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ); +YY_BUFFER_STATE yy_create_buffer (FILE *file,int size ); +void yy_delete_buffer (YY_BUFFER_STATE b ); +void yy_flush_buffer (YY_BUFFER_STATE b ); +void yypush_buffer_state (YY_BUFFER_STATE new_buffer ); +void yypop_buffer_state (void ); + +static void yyensure_buffer_stack (void ); +static void yy_load_buffer_state (void ); +static void yy_init_buffer (YY_BUFFER_STATE b,FILE *file ); + +#define YY_FLUSH_BUFFER yy_flush_buffer(YY_CURRENT_BUFFER ) + +YY_BUFFER_STATE yy_scan_buffer (char *base,yy_size_t size ); +YY_BUFFER_STATE yy_scan_string (yyconst char *yy_str ); +YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,int len ); + +void *yyalloc (yy_size_t ); +void *yyrealloc (void *,yy_size_t ); +void yyfree (void * ); + +#define yy_new_buffer yy_create_buffer + +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer(yyin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } + +#define yy_set_bol(at_bol) \ + { \ + if ( ! 
YY_CURRENT_BUFFER ){\ + yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer(yyin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } + +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +typedef unsigned char YY_CHAR; + +FILE *yyin = (FILE *) 0, *yyout = (FILE *) 0; + +typedef int yy_state_type; + +extern int yylineno; + +int yylineno = 1; + +extern char *yytext; +#define yytext_ptr yytext + +static yy_state_type yy_get_previous_state (void ); +static yy_state_type yy_try_NUL_trans (yy_state_type current_state ); +static int yy_get_next_buffer (void ); +static void yy_fatal_error (yyconst char msg[] ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up yytext. + */ +#define YY_DO_BEFORE_ACTION \ + (yytext_ptr) = yy_bp; \ + yyleng = (size_t) (yy_cp - yy_bp); \ + (yy_hold_char) = *yy_cp; \ + *yy_cp = '\0'; \ + (yy_c_buf_p) = yy_cp; + +#define YY_NUM_RULES 371 +#define YY_END_OF_BUFFER 372 +/* This struct is not used in this scanner, + but its presence is necessary. 
*/ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static yyconst flex_int16_t yy_accept[2644] = + {} ; + +static yyconst flex_int32_t yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 4, 1, 5, 6, 1, 1, 1, 1, 1, + 1, 1, 7, 1, 8, 9, 5, 10, 11, 12, + 13, 14, 15, 16, 15, 15, 15, 1, 1, 1, + 17, 1, 1, 1, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 5, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 18, 18, 18, 18, 19, 18, 20, 21, 22, 23, + + 24, 25, 26, 27, 28, 5, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + } ; + +static yyconst flex_int32_t yy_meta[45] = + { 0, + 1, 2, 3, 1, 4, 1, 5, 5, 6, 7, + 7, 7, 7, 7, 7, 7, 1, 4, 4, 4, + 4, 4, 4, 8, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4 + } ; + +static yyconst flex_int16_t yy_base[2999] = + {} ; + +static yyconst flex_int16_t yy_def[2999] = + { 0, + 2644, 2643, 2645, 2645, 2646, 2646, 2647, 2647, 2648, 2648, + 2649, 2649, 2650, 2650, 2651, 2651, 2652, 2652, 2653, 2653, + 2654, 2654, 2655, 2655, 2656, 2656, 2657, 2657, 2658, 2658, + 2659, 2659, 2659, 2659, 2659, 35, 35, 37, 2659, 39, + 39, 41, 2659, 2659, 39, 39, 2660, 2660, 2661, 2661, + 2662, 2662, 2663, 2663, 2664, 2664, 2665, 2665, 2666, 2666, + 2667, 2667, 2668, 2668, 2669, 2669, 2670, 2670, 2671, 2671, + 2672, 2672, 2673, 2673, 2674, 2674, 2675, 2675, 2676, 2676, + 2677, 2677, 2678, 2678, 2679, 2679, 2680, 2680, 2681, 2681, + 2682, 2682, 2683, 2683, 2684, 2684, 39, 39, 39, 39, + + 41, 101, 
2685, 2685, 2686, 2686, 2687, 2687, 2688, 2688, + 39, 111, 111, 111, 2689, 2689, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 2690, 2690, + 2690, 131, 131, 133, 133, 135, 135, 137, 2691, 2691, + 2692, 2692, 142, 143, 143, 145, 145, 147, 2693, 2693, + 150, 151, 151, 153, 153, 155, 111, 111, 39, 39, + 39, 39, 2694, 2694, 164, 164, 164, 164, 2695, 2695, + 170, 170, 111, 111, 2695, 2695, 2696, 2696, 2697, 2697, + 2698, 2698, 2699, 2699, 2690, 2690, 2700, 2700, 2701, 2701, + 2702, 2702, 192, 193, 182, 182, 2703, 2703, 2704, 2704, + + 2705, 2705, 2695, 2695, 170, 170, 170, 207, 2706, 2706, + 2707, 2707, 2708, 2708, 2709, 2709, 2690, 2690, 2710, 2710, + 2711, 2711, 2712, 2712, 2713, 2713, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2714, 2715, 2714, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2716, 2643, 2643, 2643, 2717, + 2643, 2643, 2643, 2718, 2719, 2718, 2643, 2643, 2643, 2720, + 2721, 2720, 2643, 2643, 2643, 2722, 2723, 2722, 2643, 2643, + 2643, 2724, 2643, 2643, 2643, 2725, 2726, 2725, 2643, 2643, + + 2643, 2727, 2728, 2727, 2643, 2643, 2643, 2729, 2730, 2729, + 2643, 2643, 2643, 2731, 2732, 2731, 2643, 2643, 2643, 2733, + 2734, 2733, 2643, 2643, 2643, 2735, 2643, 2643, 2643, 2643, + 2736, 2737, 2736, 2738, 2739, 2738, 2740, 2643, 2741, 2742, + 2741, 2643, 2643, 2643, 2643, 2643, 2643, 2743, 2643, 2643, + 2744, 2643, 2643, 2745, 2643, 2643, 2746, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2747, 2643, 2643, + 2643, 2748, 2643, 2643, 2643, 2643, 2749, 2643, 2643, 2643, + 2750, 2643, 2643, 2643, 2751, 2643, 2643, 2643, 2752, 2643, + 2643, 2643, 2753, 2643, 2643, 
2643, 2754, 2643, 2643, 2643, + 2755, 2643, 2643, 2643, 2756, 2643, 2643, 2643, 2757, 2643, + 2643, 2643, 2643, 2643, 2758, 2759, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2760, 2761, 2762, 2763, 2762, 2643, 2643, + 2764, 2643, 2643, 2643, 2765, 2766, 2765, 2643, 2643, 2643, + 2767, 2643, 2643, 2643, 2768, 2769, 2768, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2770, 2643, 2771, 2643, 2772, + + 2643, 2773, 2643, 2774, 2775, 2776, 2777, 2778, 2777, 2643, + 2779, 2780, 2779, 2781, 2782, 2781, 2783, 2784, 2783, 2785, + 2786, 2785, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2787, + 2788, 2787, 2643, 2789, 2790, 2789, 2791, 2792, 2791, 2793, + 2794, 2793, 2643, 2643, 2795, 2796, 2795, 2643, 2643, 2797, + 2798, 2797, 2643, 2799, 2800, 2799, 2801, 2802, 2801, 2643, + 2643, 2803, 2804, 2643, 2643, 2805, 2643, 2806, 2807, 2643, + 2643, 2808, 2643, 2809, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2810, 2811, 2810, + 2643, 2643, 2643, 2812, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2813, 2814, 2813, 2643, + 2815, 2816, 2815, 2643, 2817, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2818, 2643, + 2643, 2643, 2643, 2643, 2643, 2819, 2643, 2820, 2821, 2820, + 2643, 2643, 2822, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2823, 2643, 2824, + 2824, 2825, 2825, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2826, 2827, + 2826, 2828, 2643, 2643, 2643, 2829, 2830, 2831, 2832, 2831, + 2833, 2834, 2835, 2834, 2836, 2837, 
2838, 2837, 2839, 2840, + 2841, 2842, 2841, 2843, 2844, 2845, 2844, 2846, 2847, 2848, + 2847, 2849, 2850, 2851, 2850, 2852, 2853, 2854, 2853, 2855, + 2856, 2857, 2858, 2857, 2859, 2860, 2861, 2860, 2862, 2863, + 2864, 2865, 2864, 2866, 2643, 2643, 2643, 2643, 2643, 2867, + + 2868, 2869, 2870, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2871, 2872, 2643, + 2873, 2874, 2875, 2876, 2877, 2878, 2879, 2880, 2881, 2643, + 2643, 2882, 2883, 2643, 2643, 2643, 2884, 2885, 2886, 2887, + 2886, 2888, 2889, 2890, 2891, 2890, 2892, 2893, 2894, 2895, + 2894, 2896, 2643, 2643, 2643, 2643, 2643, 2897, 2898, 2899, + 2900, 2901, 2902, 2903, 2904, 2905, 2904, 2906, 2907, 2908, + 2907, 2909, 2910, 2911, 2910, 2912, 2913, 2914, 2913, 2915, + 2916, 2917, 2916, 2918, 2643, 2643, 2919, 2920, 2919, 2921, + + 2922, 2923, 2922, 2924, 2925, 2926, 2925, 2927, 2928, 2929, + 2928, 2930, 2931, 2932, 2931, 2933, 2934, 2935, 2934, 2936, + 2937, 2938, 2937, 2939, 2940, 2941, 2940, 2942, 2643, 2643, + 2943, 2944, 2945, 2946, 2947, 2948, 2949, 2643, 2643, 2643, + 2643, 2643, 2643, 2950, 2951, 2950, 2952, 2953, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2954, 2955, 2954, + 2956, 2957, 2958, 2957, 2959, 2960, 2643, 2643, 2643, 2643, + 2961, 2643, 2643, 2962, 2963, 2964, 2963, 2965, 2966, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2967, 2968, + 2969, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2970, 2970, + 2643, 2643, 2643, 2643, 2971, 2971, 2972, 
2972, 2973, 2973, + 2974, 2974, 2975, 2975, 2976, 2976, 2977, 2977, 2978, 2978, + 2979, 2979, 2980, 2980, 2981, 2981, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2982, + 2982, 2983, 2983, 2984, 2984, 2643, 2643, 2643, 2643, 2985, + 2985, 2986, 2986, 2987, 2987, 2988, 2988, 2989, 2989, 2643, + 2990, 2990, 2991, 2991, 2992, 2992, 2993, 2993, 2994, 2994, + 2995, 2995, 2996, 2996, 2997, 2997, 2643, 2643, 2643, 2643, + 2998, 2998, 2643, 2643, 2643, 2643, 2643, 2643, 2956, 1169, + 2959, 1171, 2643, 2643, 2643, 2965, 1176, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 
2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 
2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 
2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 
2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 0, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 
2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643 + + } ; + +static yyconst flex_int16_t 
yy_nxt[6138] = + { 0, + 228, 228, 229, 228, 228, 230, 228, 228, 228, 228, + 228, 231, 228, 228, 228, 228, 228, 228, 228, 232, + 233, 234, 235, 236, 237, 238, 228, 239, 240, 241, + 242, 243, 244, 245, 246, 247, 248, 249, 250, 228, + 251, 228, 228, 228, 229, 353, 229, 253, 254, 254, + 255, 256, 256, 256, 256, 256, 256, 256, 229, 721, + 722, 257, 254, 254, 255, 256, 256, 256, 256, 256, + 256, 256, 229, 229, 229, 259, 263, 329, 229, 229, + 229, 330, 329, 330, 229, 229, 229, 378, 359, 510, + 229, 260, 260, 362, 1264, 360, 261, 261, 361, 229, + + 360, 1265, 364, 361, 379, 262, 262, 229, 229, 365, + 265, 459, 366, 491, 266, 266, 266, 266, 266, 266, + 266, 229, 693, 229, 267, 229, 681, 460, 266, 266, + 266, 266, 266, 266, 266, 229, 694, 461, 269, 560, + 700, 462, 270, 270, 270, 270, 270, 270, 270, 229, + 561, 701, 271, 686, 612, 687, 270, 270, 270, 270, + 270, 270, 270, 229, 688, 613, 273, 274, 274, 275, + 276, 276, 276, 276, 276, 276, 276, 229, 1001, 1355, + 277, 274, 274, 275, 276, 276, 276, 276, 276, 276, + 276, 229, 1356, 1002, 279, 280, 280, 281, 282, 282, + + 282, 282, 282, 282, 282, 229, 1069, 1069, 283, 280, + 280, 281, 282, 282, 282, 282, 282, 282, 282, 229, + 1075, 1075, 285, 286, 286, 287, 288, 288, 288, 288, + 288, 288, 288, 229, 1077, 1077, 289, 286, 286, 287, + 288, 288, 288, 288, 288, 288, 288, 229, 353, 229, + 291, 716, 683, 717, 292, 292, 292, 292, 292, 292, + 292, 229, 698, 1059, 293, 1357, 1060, 699, 292, 292, + 292, 292, 292, 292, 292, 229, 1061, 1358, 295, 296, + 296, 297, 298, 298, 298, 298, 298, 298, 298, 229, + 1079, 1079, 299, 296, 296, 297, 298, 298, 298, 298, + + 298, 298, 298, 229, 1081, 1081, 301, 302, 302, 303, + 304, 304, 304, 304, 304, 304, 304, 229, 1083, 1083, + 305, 302, 302, 303, 304, 304, 304, 304, 304, 304, + 304, 229, 1085, 1085, 307, 308, 308, 309, 310, 310, + 310, 310, 310, 310, 310, 229, 1087, 1087, 311, 308, + 308, 309, 310, 310, 310, 310, 310, 310, 310, 229, + 1089, 1089, 313, 314, 314, 315, 316, 316, 316, 316, + 316, 316, 316, 
229, 1091, 1091, 317, 314, 314, 315, + 316, 316, 316, 316, 316, 316, 316, 229, 1093, 1093, + 319, 320, 320, 321, 322, 322, 322, 322, 322, 322, + + 322, 229, 1095, 1095, 323, 320, 320, 321, 322, 322, + 322, 322, 322, 322, 322, 229, 709, 1071, 325, 1072, + 710, 811, 326, 326, 326, 326, 326, 326, 326, 229, + 812, 1361, 327, 1120, 1120, 711, 326, 326, 326, 326, + 326, 326, 326, 229, 1122, 1122, 329, 331, 331, 332, + 333, 333, 333, 333, 333, 333, 333, 334, 334, 335, + 336, 336, 336, 336, 336, 336, 336, 229, 1023, 229, + 329, 1362, 382, 1024, 337, 337, 337, 337, 337, 337, + 337, 338, 339, 339, 340, 341, 341, 341, 341, 341, + 341, 341, 229, 229, 229, 329, 367, 380, 383, 229, + + 1363, 343, 385, 365, 229, 229, 366, 387, 390, 384, + 1124, 1124, 718, 344, 379, 719, 495, 345, 229, 346, + 229, 392, 229, 395, 720, 463, 347, 229, 383, 1364, + 330, 1130, 1130, 388, 388, 229, 343, 575, 397, 384, + 994, 460, 560, 1365, 389, 389, 995, 393, 344, 393, + 1056, 461, 345, 561, 346, 462, 1132, 1132, 394, 1057, + 394, 347, 338, 576, 398, 1366, 348, 348, 348, 348, + 348, 348, 348, 342, 577, 399, 1367, 348, 348, 348, + 348, 348, 348, 348, 229, 712, 1368, 350, 1134, 1134, + 578, 351, 351, 351, 351, 351, 351, 351, 229, 713, + + 1369, 352, 714, 715, 1370, 351, 351, 351, 351, 351, + 351, 351, 353, 353, 229, 353, 576, 355, 353, 353, + 1016, 1016, 1016, 229, 1136, 1136, 400, 577, 353, 353, + 353, 353, 229, 353, 815, 356, 353, 353, 1184, 816, + 1371, 229, 817, 1185, 402, 1372, 353, 353, 353, 353, + 229, 353, 398, 355, 353, 353, 229, 1138, 1138, 405, + 491, 1141, 1141, 399, 353, 353, 353, 353, 229, 353, + 403, 356, 353, 353, 495, 1373, 492, 1374, 493, 1375, + 494, 404, 353, 353, 229, 403, 328, 369, 229, 1340, + 492, 524, 493, 723, 494, 724, 404, 328, 1341, 725, + + 328, 229, 370, 371, 527, 229, 372, 373, 597, 1143, + 1143, 328, 374, 598, 599, 375, 229, 525, 1282, 376, + 1033, 1034, 1035, 229, 229, 1283, 571, 573, 526, 1376, + 525, 1145, 1145, 229, 370, 371, 667, 229, 372, 373, + 582, 526, 1377, 668, 
374, 1378, 669, 375, 229, 1147, + 1147, 407, 579, 579, 1379, 408, 408, 408, 408, 408, + 408, 408, 229, 580, 580, 409, 583, 1149, 1149, 408, + 408, 408, 408, 408, 408, 408, 229, 584, 1380, 411, + 1063, 1063, 1063, 412, 412, 412, 412, 412, 412, 412, + 1381, 229, 1012, 229, 600, 229, 670, 1013, 585, 598, + + 599, 229, 229, 668, 601, 607, 669, 1014, 229, 413, + 229, 610, 1382, 414, 1151, 1151, 1383, 412, 412, 412, + 412, 412, 412, 412, 583, 1384, 229, 1153, 1153, 661, + 602, 608, 1359, 229, 603, 584, 614, 608, 1155, 1155, + 1360, 604, 609, 413, 229, 662, 663, 416, 609, 664, + 1385, 417, 417, 417, 417, 417, 417, 417, 229, 1161, + 1161, 418, 612, 1169, 1169, 417, 417, 417, 417, 417, + 417, 417, 229, 613, 1386, 420, 704, 704, 704, 421, + 421, 421, 421, 421, 421, 421, 229, 1171, 1171, 422, + 1388, 705, 1389, 421, 421, 421, 421, 421, 421, 421, + + 229, 1176, 1176, 424, 1017, 1017, 1017, 425, 425, 425, + 425, 425, 425, 425, 229, 2643, 2643, 426, 1390, 1018, + 1391, 425, 425, 425, 425, 425, 425, 425, 229, 2643, + 2643, 428, 1019, 1019, 1019, 429, 429, 429, 429, 429, + 429, 429, 229, 2643, 2643, 430, 1392, 1020, 1393, 429, + 429, 429, 429, 429, 429, 429, 229, 1395, 1396, 432, + 1021, 1021, 1021, 433, 433, 433, 433, 433, 433, 433, + 229, 1397, 1398, 434, 1399, 1022, 1404, 433, 433, 433, + 433, 433, 433, 433, 229, 1405, 1406, 436, 1044, 1044, + 1044, 437, 437, 437, 437, 437, 437, 437, 229, 1400, + + 1409, 438, 1401, 1045, 1410, 437, 437, 437, 437, 437, + 437, 437, 229, 1411, 1414, 440, 1046, 1046, 1046, 441, + 441, 441, 441, 441, 441, 441, 229, 1415, 1416, 442, + 1417, 1047, 1420, 441, 441, 441, 441, 441, 441, 441, + 229, 1421, 1422, 444, 732, 732, 732, 445, 445, 445, + 445, 445, 445, 445, 229, 229, 1423, 446, 605, 733, + 1424, 445, 445, 445, 445, 445, 445, 445, 229, 1425, + 1426, 448, 1194, 1194, 1194, 449, 449, 449, 449, 449, + 449, 449, 1429, 229, 602, 450, 628, 1195, 603, 229, + 229, 1402, 631, 633, 1036, 604, 1037, 451, 229, 1403, + + 1430, 452, 1048, 1038, 1049, 449, 449, 
449, 449, 449, + 449, 449, 629, 1427, 1431, 450, 1050, 229, 629, 634, + 636, 1428, 1051, 630, 1016, 1016, 1016, 451, 229, 630, + 635, 454, 1432, 455, 1433, 456, 456, 456, 456, 456, + 456, 456, 229, 1434, 1435, 457, 634, 455, 1438, 456, + 456, 456, 456, 456, 456, 456, 338, 635, 1439, 1442, + 464, 464, 464, 464, 464, 464, 464, 342, 1209, 1209, + 1209, 464, 464, 464, 464, 464, 464, 464, 338, 1210, + 1210, 1210, 465, 465, 465, 465, 465, 465, 465, 342, + 1211, 1211, 1211, 465, 465, 465, 465, 465, 465, 465, + + 466, 466, 467, 468, 468, 468, 468, 468, 468, 468, + 229, 1443, 1444, 470, 1017, 1017, 1017, 471, 471, 471, + 471, 471, 471, 471, 229, 1445, 1446, 472, 1448, 1018, + 1449, 471, 471, 471, 471, 471, 471, 471, 229, 1450, + 1451, 474, 475, 475, 476, 477, 477, 477, 477, 477, + 477, 477, 229, 1452, 1455, 478, 475, 475, 476, 477, + 477, 477, 477, 477, 477, 477, 229, 1456, 1457, 480, + 1019, 1019, 1019, 481, 481, 481, 481, 481, 481, 481, + 229, 1458, 1453, 482, 1459, 1020, 1454, 481, 481, 481, + 481, 481, 481, 481, 229, 1460, 1461, 484, 485, 485, + + 486, 487, 487, 487, 487, 487, 487, 487, 229, 1464, + 1465, 488, 485, 485, 486, 487, 487, 487, 487, 487, + 487, 487, 338, 1219, 1219, 1219, 328, 328, 328, 328, + 328, 328, 328, 229, 1466, 229, 665, 695, 641, 689, + 690, 1467, 229, 696, 1468, 644, 691, 1469, 489, 1470, + 692, 697, 662, 663, 1471, 1472, 664, 1473, 1474, 490, + 328, 328, 229, 328, 642, 338, 328, 328, 1475, 229, + 229, 642, 656, 659, 1476, 643, 328, 328, 328, 328, + 229, 328, 643, 342, 328, 328, 1235, 1235, 1235, 1477, + 229, 1478, 1479, 672, 328, 328, 497, 1480, 657, 657, + + 498, 498, 498, 498, 498, 498, 498, 499, 1256, 658, + 658, 498, 498, 498, 498, 498, 498, 498, 338, 673, + 1257, 1258, 500, 500, 500, 500, 500, 500, 500, 342, + 674, 1481, 1482, 500, 500, 500, 500, 500, 500, 500, + 501, 1236, 1236, 1236, 502, 502, 502, 502, 502, 502, + 502, 503, 1063, 1063, 1063, 502, 502, 502, 502, 502, + 502, 502, 501, 1294, 1294, 1294, 504, 504, 504, 504, + 504, 504, 504, 
503, 1209, 1209, 1209, 504, 504, 504, + 504, 504, 504, 504, 501, 1210, 1210, 1210, 505, 505, + 505, 505, 505, 505, 505, 503, 1211, 1211, 1211, 505, + + 505, 505, 505, 505, 505, 505, 501, 1312, 1312, 1312, + 506, 506, 506, 506, 506, 506, 506, 503, 1313, 1313, + 1313, 506, 506, 506, 506, 506, 506, 506, 229, 1021, + 1021, 1021, 507, 507, 508, 509, 509, 509, 509, 509, + 509, 509, 229, 1483, 1022, 510, 507, 507, 508, 509, + 509, 509, 509, 509, 509, 509, 229, 1219, 1219, 1219, + 511, 511, 512, 513, 513, 513, 513, 513, 513, 513, + 514, 514, 515, 516, 516, 516, 516, 516, 516, 516, + 517, 517, 518, 519, 519, 519, 519, 519, 519, 519, + 520, 520, 521, 522, 522, 522, 522, 522, 522, 522, + + 229, 1484, 1485, 529, 530, 530, 531, 532, 532, 532, + 532, 532, 532, 532, 229, 1486, 1487, 533, 530, 530, + 531, 532, 532, 532, 532, 532, 532, 532, 529, 534, + 534, 535, 536, 536, 536, 536, 536, 536, 536, 537, + 537, 538, 539, 539, 539, 539, 539, 539, 539, 540, + 540, 541, 542, 542, 542, 542, 542, 542, 542, 229, + 1488, 1489, 544, 545, 545, 546, 547, 547, 547, 547, + 547, 547, 547, 229, 1490, 1491, 548, 545, 545, 546, + 547, 547, 547, 547, 547, 547, 547, 549, 550, 550, + 551, 552, 552, 552, 552, 552, 552, 552, 554, 554, + + 555, 556, 556, 556, 556, 556, 556, 556, 557, 557, + 558, 559, 559, 559, 559, 559, 559, 559, 491, 1315, + 1315, 1315, 562, 562, 562, 562, 562, 562, 562, 495, + 1235, 1235, 1235, 562, 562, 562, 562, 562, 562, 562, + 491, 1236, 1236, 1236, 563, 563, 563, 563, 563, 563, + 563, 495, 1026, 1026, 1026, 563, 563, 563, 563, 563, + 563, 563, 229, 1492, 1493, 565, 1462, 1029, 1494, 566, + 566, 566, 566, 566, 566, 566, 229, 1497, 1463, 567, + 1354, 1354, 1354, 566, 566, 566, 566, 566, 566, 566, + 565, 1387, 1387, 1387, 568, 568, 568, 568, 568, 568, + + 568, 568, 568, 568, 568, 568, 568, 568, 565, 1294, + 1294, 1294, 569, 569, 569, 569, 569, 569, 569, 569, + 569, 569, 569, 569, 569, 569, 229, 1498, 1499, 571, + 1215, 1215, 1215, 572, 572, 572, 572, 572, 572, 572, + 229, 1500, 1501, 
573, 1502, 1216, 1503, 572, 572, 572, + 572, 572, 572, 572, 571, 1394, 1394, 1394, 574, 574, + 574, 574, 574, 574, 574, 574, 574, 574, 574, 574, + 574, 574, 229, 1504, 1505, 587, 588, 588, 589, 590, + 590, 590, 590, 590, 590, 590, 229, 1506, 1508, 591, + 588, 588, 589, 590, 590, 590, 590, 590, 590, 590, + + 229, 1509, 1510, 593, 1217, 1217, 1217, 594, 594, 594, + 594, 594, 594, 594, 229, 1512, 1513, 595, 1514, 1218, + 1515, 594, 594, 594, 594, 594, 594, 594, 229, 1521, + 1522, 616, 617, 617, 618, 619, 619, 619, 619, 619, + 619, 619, 229, 1523, 1524, 620, 617, 617, 618, 619, + 619, 619, 619, 619, 619, 619, 616, 621, 621, 622, + 623, 623, 623, 623, 623, 623, 623, 624, 1312, 1312, + 1312, 625, 625, 625, 625, 625, 625, 625, 626, 1221, + 1221, 1221, 625, 625, 625, 625, 625, 625, 625, 229, + 229, 1525, 638, 675, 1222, 1526, 639, 639, 639, 639, + + 639, 639, 639, 229, 1527, 1528, 640, 1313, 1313, 1313, + 639, 639, 639, 639, 639, 639, 639, 645, 1529, 673, + 1530, 646, 646, 646, 646, 646, 646, 646, 647, 1531, + 674, 1532, 646, 646, 646, 646, 646, 646, 646, 645, + 648, 648, 649, 650, 650, 650, 650, 650, 650, 650, + 229, 1533, 1534, 652, 1044, 1044, 1044, 653, 653, 653, + 653, 653, 653, 653, 229, 1536, 1537, 654, 1539, 1045, + 1540, 653, 653, 653, 653, 653, 653, 653, 229, 1541, + 1542, 677, 1046, 1046, 1046, 678, 678, 678, 678, 678, + 678, 678, 229, 1543, 1550, 679, 1551, 1047, 1552, 678, + + 678, 678, 678, 678, 678, 678, 704, 704, 704, 726, + 1252, 1252, 1252, 727, 1287, 1287, 1287, 1194, 1194, 1194, + 1553, 705, 728, 729, 1554, 1253, 1555, 730, 731, 1288, + 1556, 1557, 1195, 1315, 1315, 1315, 1558, 1296, 1296, 1296, + 1215, 1215, 1215, 1217, 1217, 1217, 706, 707, 708, 732, + 732, 732, 1297, 1342, 1544, 1216, 1559, 1343, 1218, 1447, + 1447, 1447, 1560, 1545, 733, 1344, 1561, 734, 1221, 1221, + 1221, 1354, 1354, 1354, 735, 736, 1026, 1026, 1026, 1345, + 1345, 1345, 1562, 1222, 1563, 1564, 1027, 1028, 1252, 1252, + 1252, 1029, 1565, 1566, 1346, 1030, 1326, 1327, 1328, 1287, + 
+ 1287, 1287, 1567, 1253, 1568, 1329, 1330, 1569, 1331, 1570, + 1332, 1333, 1571, 1572, 1288, 1296, 1296, 1296, 1407, 1407, + 1407, 1412, 1412, 1412, 1418, 1418, 1418, 1436, 1436, 1436, + 1297, 1573, 1574, 1408, 1575, 1576, 1413, 1577, 1578, 1419, + 1579, 1580, 1437, 1440, 1440, 1440, 1345, 1345, 1345, 1387, + 1387, 1387, 1394, 1394, 1394, 1495, 1495, 1495, 1441, 1581, + 1582, 1346, 1407, 1407, 1407, 1507, 1507, 1507, 1583, 1584, + 1496, 1412, 1412, 1412, 1511, 1511, 1511, 1408, 1418, 1418, + 1418, 1516, 1516, 1516, 1585, 1586, 1413, 1517, 1517, 1517, + 1519, 1519, 1519, 1419, 1436, 1436, 1436, 1535, 1535, 1535, + + 1591, 1589, 1518, 1590, 1592, 1520, 1440, 1440, 1440, 1437, + 1538, 1538, 1538, 1447, 1447, 1447, 1587, 1546, 1594, 1595, + 1547, 1441, 1548, 1588, 1495, 1495, 1495, 1593, 1593, 1593, + 1549, 1596, 1597, 1598, 1599, 1600, 1601, 1601, 1601, 1496, + 1603, 1604, 1507, 1507, 1507, 1605, 1606, 1607, 1511, 1511, + 1511, 1602, 1608, 1609, 1610, 1611, 1516, 1516, 1516, 1517, + 1517, 1517, 1612, 1612, 1612, 1519, 1519, 1519, 1613, 1613, + 1613, 1614, 1615, 1616, 1518, 1617, 1618, 1619, 1620, 1621, + 1520, 1622, 1623, 1624, 1625, 1626, 1627, 1535, 1535, 1535, + 1628, 1629, 1538, 1538, 1538, 1630, 1631, 1632, 1633, 1634, + + 1635, 1636, 1637, 1638, 1639, 1640, 1641, 1642, 1643, 1644, + 1645, 1646, 1647, 1648, 1649, 1650, 1651, 1652, 1653, 1654, + 1655, 1656, 1657, 1658, 1659, 1660, 1661, 1662, 1663, 1664, + 1665, 1666, 1666, 1666, 1668, 1669, 1670, 1670, 1670, 1672, + 1673, 1674, 1675, 1676, 1677, 1678, 1667, 1679, 1680, 1681, + 1682, 1671, 1593, 1593, 1593, 1683, 1684, 1685, 1686, 1687, + 1688, 1601, 1601, 1601, 1689, 1689, 1689, 1690, 1691, 1693, + 1694, 1695, 1696, 1692, 1703, 1704, 1602, 1697, 1697, 1697, + 1699, 1699, 1699, 1701, 1701, 1701, 1612, 1612, 1612, 1613, + 1613, 1613, 1698, 1705, 1706, 1700, 1707, 1708, 1702, 1709, + + 1710, 1711, 1712, 1713, 1714, 1715, 1716, 1717, 1718, 1719, + 1720, 1721, 1722, 1723, 1724, 1725, 1726, 1727, 1728, 1729, + 1730, 
1732, 1733, 1731, 1734, 1735, 1735, 1735, 1737, 1738, + 1739, 1740, 1741, 1742, 1743, 1744, 1745, 1746, 1747, 1748, + 1736, 1749, 1750, 1751, 1752, 1753, 1666, 1666, 1666, 1756, + 1754, 1755, 1755, 1755, 1757, 1670, 1670, 1670, 1758, 1758, + 1758, 1667, 1759, 1760, 1761, 1761, 1761, 1763, 1764, 1765, + 1671, 1766, 1766, 1766, 1770, 1771, 1772, 1773, 1774, 1762, + 1775, 1767, 1768, 1776, 1777, 1778, 1769, 1779, 1689, 1689, + 1689, 1780, 1781, 1782, 1783, 1784, 1785, 1786, 1697, 1697, + + 1697, 1787, 1787, 1787, 1699, 1699, 1699, 1788, 1788, 1788, + 1701, 1701, 1701, 1698, 1789, 1789, 1789, 1790, 1791, 1700, + 1792, 1793, 1794, 1795, 1796, 1702, 1797, 1797, 1797, 1799, + 1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, + 1810, 1798, 1811, 1812, 1813, 1814, 1815, 1816, 1817, 1818, + 1819, 1820, 1821, 1822, 1735, 1735, 1735, 1823, 1823, 1823, + 1824, 1825, 1826, 1827, 1828, 1829, 1830, 1831, 1832, 1736, + 1833, 1834, 1835, 1836, 1837, 1838, 1839, 1840, 1841, 1755, + 1755, 1755, 1842, 1842, 1842, 1844, 1758, 1758, 1758, 1845, + 1846, 1761, 1761, 1761, 1847, 1847, 1847, 1843, 1848, 1849, + + 1849, 1849, 1851, 1766, 1766, 1766, 1762, 1852, 1852, 1852, + 1854, 1854, 1854, 1857, 1850, 1856, 1856, 1856, 1769, 1858, + 1858, 1858, 1853, 1860, 1861, 1855, 1862, 1863, 1864, 1865, + 1866, 1867, 1868, 1869, 1859, 1870, 1870, 1870, 1872, 1872, + 1872, 1874, 1875, 1876, 1787, 1787, 1787, 1788, 1788, 1788, + 1871, 1877, 1878, 1873, 1789, 1789, 1789, 1879, 1880, 1881, + 1882, 1882, 1882, 1884, 1797, 1797, 1797, 1885, 1885, 1885, + 1886, 1887, 1888, 1889, 1890, 1883, 1894, 1895, 1891, 1798, + 1892, 1892, 1892, 1896, 1897, 1898, 1899, 1900, 1901, 1901, + 1901, 1903, 1904, 1905, 1906, 1893, 1907, 1908, 1909, 1910, + + 1911, 1912, 1913, 1902, 1823, 1823, 1823, 1914, 1915, 1916, + 1916, 1916, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, + 1926, 1927, 1928, 1929, 1917, 1930, 1931, 1931, 1931, 1842, + 1842, 1842, 1936, 1936, 1936, 1937, 1932, 1933, 1934, 1938, + 1939, 1935, 
1940, 1942, 1843, 1847, 1847, 1847, 1849, 1849, + 1849, 1941, 1941, 1941, 1852, 1852, 1852, 1943, 1943, 1943, + 1854, 1854, 1854, 1850, 1944, 1944, 1944, 1945, 1947, 1853, + 1856, 1856, 1856, 1950, 1951, 1855, 1858, 1858, 1858, 1946, + 1946, 1946, 1948, 1955, 1952, 1958, 1959, 1949, 1956, 1956, + 1956, 1859, 1953, 1960, 1870, 1870, 1870, 1954, 1961, 1961, + + 1961, 1963, 1964, 1957, 1872, 1872, 1872, 1965, 1966, 1871, + 1962, 1962, 1962, 1967, 1968, 1969, 1970, 1972, 1973, 1873, + 1882, 1882, 1882, 1971, 1971, 1971, 1885, 1885, 1885, 1974, + 1975, 1976, 1978, 1979, 1981, 1883, 1892, 1892, 1892, 1977, + 1980, 1980, 1980, 1982, 1983, 1984, 1985, 1986, 1987, 1989, + 1990, 1893, 1901, 1901, 1901, 1988, 1988, 1988, 1991, 1992, + 1993, 1994, 1995, 1996, 1997, 1998, 1999, 1902, 2000, 2001, + 1916, 1916, 1916, 2002, 2002, 2002, 2003, 2004, 2005, 2006, + 2007, 2008, 2009, 2010, 2011, 1917, 2012, 1931, 1931, 1931, + 2013, 2013, 2013, 2015, 2015, 2015, 2017, 2017, 2017, 2019, + + 2019, 2019, 1935, 2020, 2021, 2014, 2024, 2025, 2016, 2030, + 2031, 2018, 1936, 1936, 1936, 2022, 2022, 2022, 1941, 1941, + 1941, 1943, 1943, 1943, 1944, 1944, 1944, 2026, 2026, 2026, + 2023, 1946, 1946, 1946, 2032, 2033, 2034, 2027, 2028, 2035, + 2036, 2037, 2029, 2038, 1956, 1956, 1956, 2039, 2039, 2039, + 2040, 2041, 2042, 1961, 1961, 1961, 1962, 1962, 1962, 1957, + 2043, 2044, 2044, 2044, 2046, 2047, 2048, 2048, 2048, 2050, + 2051, 2052, 1971, 1971, 1971, 2053, 2045, 2054, 2055, 2056, + 2057, 2049, 2058, 2059, 2060, 1980, 1980, 1980, 2061, 2062, + 2063, 2064, 2065, 2066, 2067, 1988, 1988, 1988, 2068, 2069, + + 2069, 2069, 2071, 2072, 2073, 2074, 2075, 2076, 2077, 2078, + 2079, 2080, 2081, 2082, 2070, 2002, 2002, 2002, 2083, 2084, + 2085, 2086, 2087, 2088, 2089, 2090, 2013, 2013, 2013, 2091, + 2091, 2091, 2015, 2015, 2015, 2092, 2092, 2092, 2017, 2017, + 2017, 2014, 2093, 2093, 2093, 2094, 2095, 2016, 2019, 2019, + 2019, 2097, 2098, 2018, 2022, 2022, 2022, 2096, 2096, 2096, + 2026, 2026, 2026, 
2099, 2099, 2099, 2101, 2101, 2101, 2023, + 2103, 2103, 2103, 2104, 2105, 2029, 2110, 2111, 2100, 2112, + 2113, 2102, 2106, 2107, 2108, 2109, 2114, 2115, 2039, 2039, + 2039, 2116, 2117, 2118, 2119, 2044, 2044, 2044, 2120, 2120, + + 2120, 2121, 2121, 2121, 2123, 2048, 2048, 2048, 2125, 2126, + 2045, 2124, 2124, 2124, 2129, 2130, 2122, 2127, 2127, 2127, + 2049, 2131, 2133, 2134, 2134, 2134, 2138, 2139, 2132, 2136, + 2136, 2136, 2128, 2140, 2141, 2142, 2142, 2142, 2135, 2144, + 2145, 2146, 2147, 2148, 2137, 2069, 2069, 2069, 2150, 2151, + 2143, 2149, 2149, 2149, 2152, 2153, 2154, 2155, 2156, 2157, + 2070, 2158, 2159, 2160, 2161, 2162, 2163, 2164, 2165, 2166, + 2167, 2168, 2169, 2169, 2169, 2091, 2091, 2091, 2092, 2092, + 2092, 2093, 2093, 2093, 2171, 2174, 2175, 2170, 2172, 2172, + 2172, 2096, 2096, 2096, 2099, 2099, 2099, 2176, 2176, 2176, + + 2101, 2101, 2101, 2173, 2177, 2177, 2177, 2178, 2179, 2100, + 2103, 2103, 2103, 2188, 2192, 2102, 2180, 2180, 2180, 2182, + 2182, 2182, 2184, 2184, 2184, 2186, 2186, 2186, 2189, 2189, + 2189, 2181, 2193, 2194, 2183, 2195, 2196, 2185, 2197, 2198, + 2187, 2199, 2201, 2190, 2120, 2120, 2120, 2121, 2121, 2121, + 2200, 2200, 2200, 2124, 2124, 2124, 2202, 2191, 2203, 2127, + 2127, 2127, 2122, 2204, 2204, 2204, 2205, 2206, 2206, 2206, + 2208, 2209, 2210, 2213, 2128, 2134, 2134, 2134, 2211, 2211, + 2211, 2216, 2207, 2136, 2136, 2136, 2212, 2212, 2212, 2217, + 2135, 2214, 2214, 2214, 2142, 2142, 2142, 2219, 2137, 2218, + + 2218, 2218, 2220, 2221, 2222, 2223, 2215, 2224, 2227, 2143, + 2149, 2149, 2149, 2225, 2225, 2225, 2228, 2229, 2229, 2229, + 2231, 2231, 2231, 2233, 2233, 2233, 2235, 2236, 2226, 2237, + 2238, 2239, 2230, 2240, 2241, 2232, 2242, 2243, 2234, 2244, + 2245, 2246, 2169, 2169, 2169, 2250, 2252, 2247, 2249, 2249, + 2249, 2253, 2248, 2172, 2172, 2172, 2254, 2170, 2251, 2251, + 2251, 2176, 2176, 2176, 2177, 2177, 2177, 2255, 2173, 2180, + 2180, 2180, 2256, 2256, 2256, 2182, 2182, 2182, 2257, 2257, + 2257, 2184, 2184, 2184, 
2181, 2258, 2258, 2258, 2263, 2264, + 2183, 2186, 2186, 2186, 2265, 2266, 2185, 2259, 2259, 2259, + + 2260, 2260, 2260, 2189, 2189, 2189, 2187, 2262, 2262, 2262, + 2267, 2268, 2268, 2268, 2270, 2261, 2271, 2272, 2190, 2200, + 2200, 2200, 2273, 2274, 2275, 2276, 2269, 2204, 2204, 2204, + 2206, 2206, 2206, 2277, 2277, 2277, 2278, 2279, 2280, 2211, + 2211, 2211, 2212, 2212, 2212, 2207, 2281, 2214, 2214, 2214, + 2282, 2282, 2282, 2283, 2284, 2218, 2218, 2218, 2285, 2285, + 2285, 2287, 2215, 2288, 2289, 2289, 2289, 2291, 2291, 2291, + 2293, 2297, 2301, 2286, 2225, 2225, 2225, 2304, 2305, 2290, + 2306, 2307, 2292, 2294, 2294, 2294, 2295, 2295, 2295, 2226, + 2229, 2229, 2229, 2298, 2298, 2298, 2231, 2231, 2231, 2308, + + 2309, 2296, 2299, 2299, 2299, 2230, 2233, 2233, 2233, 2310, + 2311, 2232, 2300, 2300, 2300, 2302, 2302, 2302, 2312, 2313, + 2314, 2234, 2315, 2249, 2249, 2249, 2316, 2251, 2251, 2251, + 2303, 2317, 2318, 2319, 2320, 2320, 2320, 2256, 2256, 2256, + 2257, 2257, 2257, 2258, 2258, 2258, 2259, 2259, 2259, 2321, + 2260, 2260, 2260, 2322, 2322, 2322, 2262, 2262, 2262, 2323, + 2323, 2323, 2327, 2328, 2329, 2261, 2330, 2330, 2330, 2324, + 2325, 2268, 2268, 2268, 2326, 2332, 2332, 2332, 2333, 2334, + 2336, 2331, 2337, 2338, 2335, 2339, 2269, 2340, 2277, 2277, + 2277, 2341, 2342, 2343, 2344, 2282, 2282, 2282, 2345, 2346, + + 2285, 2285, 2285, 2347, 2347, 2347, 2348, 2349, 2289, 2289, + 2289, 2350, 2350, 2350, 2352, 2286, 2291, 2291, 2291, 2351, + 2351, 2351, 2354, 2290, 2294, 2294, 2294, 2295, 2295, 2295, + 2355, 2292, 2353, 2353, 2353, 2298, 2298, 2298, 2299, 2299, + 2299, 2357, 2296, 2300, 2300, 2300, 2302, 2302, 2302, 2356, + 2356, 2356, 2358, 2359, 2360, 2361, 2361, 2361, 2363, 2363, + 2363, 2303, 2365, 2366, 2367, 2368, 2369, 2370, 2371, 2372, + 2362, 2373, 2380, 2364, 2320, 2320, 2320, 2374, 2374, 2374, + 2322, 2322, 2322, 2323, 2323, 2323, 2375, 2375, 2375, 2321, + 2377, 2377, 2377, 2379, 2379, 2379, 2381, 2382, 2326, 2384, + + 2385, 2376, 2330, 2330, 2330, 
2378, 2383, 2383, 2383, 2332, + 2332, 2332, 2386, 2387, 2388, 2389, 2390, 2331, 2391, 2392, + 2392, 2392, 2394, 2395, 2395, 2395, 2397, 2398, 2399, 2347, + 2347, 2347, 2400, 2401, 2393, 2350, 2350, 2350, 2396, 2351, + 2351, 2351, 2402, 2353, 2353, 2353, 2403, 2404, 2404, 2404, + 2356, 2356, 2356, 2406, 2407, 2408, 2409, 2361, 2361, 2361, + 2412, 2413, 2405, 2410, 2410, 2410, 2363, 2363, 2363, 2411, + 2411, 2411, 2362, 2414, 2415, 2416, 2417, 2417, 2417, 2419, + 2420, 2364, 2374, 2374, 2374, 2375, 2375, 2375, 2421, 2421, + 2421, 2418, 2377, 2377, 2377, 2422, 2422, 2422, 2423, 2424, + + 2376, 2379, 2379, 2379, 2425, 2426, 2427, 2378, 2383, 2383, + 2383, 2428, 2428, 2428, 2433, 2434, 2435, 2436, 2436, 2436, + 2438, 2429, 2430, 2431, 2440, 2442, 2432, 2392, 2392, 2392, + 2443, 2449, 2437, 2439, 2439, 2439, 2395, 2395, 2395, 2441, + 2441, 2441, 2393, 2444, 2444, 2444, 2447, 2447, 2447, 2450, + 2454, 2396, 2451, 2451, 2451, 2404, 2404, 2404, 2445, 2455, + 2456, 2448, 2453, 2453, 2453, 2459, 2460, 2452, 2461, 2446, + 2405, 2457, 2457, 2457, 2410, 2410, 2410, 2411, 2411, 2411, + 2462, 2463, 2463, 2463, 2467, 2470, 2458, 2417, 2417, 2417, + 2466, 2466, 2466, 2468, 2468, 2468, 2464, 2421, 2421, 2421, + + 2471, 2472, 2418, 2422, 2422, 2422, 2475, 2485, 2469, 2486, + 2465, 2473, 2473, 2473, 2428, 2428, 2428, 2476, 2476, 2476, + 2478, 2478, 2478, 2480, 2480, 2480, 2474, 2494, 2496, 2432, + 2498, 2502, 2477, 2503, 2504, 2479, 2506, 2507, 2481, 2482, + 2482, 2482, 2483, 2483, 2483, 2436, 2436, 2436, 2487, 2487, + 2487, 2488, 2488, 2488, 2439, 2439, 2439, 2484, 2508, 2510, + 2437, 2490, 2490, 2490, 2511, 2515, 2489, 2441, 2441, 2441, + 2492, 2492, 2492, 2444, 2444, 2444, 2491, 2495, 2495, 2495, + 2447, 2447, 2447, 2516, 2518, 2493, 2523, 2524, 2445, 2497, + 2497, 2497, 2499, 2499, 2499, 2448, 2451, 2451, 2451, 2501, + + 2501, 2501, 2453, 2453, 2453, 2528, 2529, 2500, 2457, 2457, + 2457, 2452, 2505, 2505, 2505, 2463, 2463, 2463, 2509, 2509, + 2509, 2530, 2534, 2458, 2466, 2466, 
2466, 2468, 2468, 2468, + 2464, 2512, 2512, 2512, 2513, 2513, 2513, 2473, 2473, 2473, + 2537, 2538, 2469, 2517, 2517, 2517, 2476, 2476, 2476, 2514, + 2539, 2547, 2474, 2519, 2519, 2519, 2478, 2478, 2478, 2548, + 2551, 2477, 2520, 2520, 2520, 2480, 2480, 2480, 2521, 2521, + 2521, 2479, 2482, 2482, 2482, 2483, 2483, 2483, 2552, 2553, + 2481, 2522, 2522, 2522, 2487, 2487, 2487, 2488, 2488, 2488, + 2484, 2525, 2525, 2525, 2490, 2490, 2490, 2526, 2526, 2526, + + 2554, 2555, 2489, 2492, 2492, 2492, 2527, 2527, 2527, 2491, + 2495, 2495, 2495, 2497, 2497, 2497, 2562, 2563, 2493, 2499, + 2499, 2499, 2531, 2531, 2531, 2501, 2501, 2501, 2532, 2532, + 2532, 2535, 2535, 2535, 2500, 2505, 2505, 2505, 2509, 2509, + 2509, 2570, 2571, 2533, 2575, 2578, 2536, 2540, 2540, 2540, + 2544, 2544, 2544, 2512, 2512, 2512, 2579, 2541, 2542, 2513, + 2513, 2513, 2543, 2582, 2583, 2545, 2546, 2546, 2546, 2517, + 2517, 2517, 2586, 2587, 2514, 2549, 2549, 2549, 2519, 2519, + 2519, 2520, 2520, 2520, 2521, 2521, 2521, 2522, 2522, 2522, + 2550, 2525, 2525, 2525, 2526, 2526, 2526, 2527, 2527, 2527, + + 2531, 2531, 2531, 2532, 2532, 2532, 2556, 2556, 2556, 2557, + 2557, 2557, 2535, 2535, 2535, 2559, 2559, 2559, 2533, 2560, + 2560, 2560, 2589, 2591, 2558, 2592, 2593, 2536, 2540, 2540, + 2540, 2564, 2564, 2564, 2561, 2566, 2566, 2566, 2568, 2568, + 2568, 2594, 2597, 2543, 2598, 2599, 2565, 2544, 2544, 2544, + 2567, 2569, 2569, 2569, 2546, 2546, 2546, 2549, 2549, 2549, + 2600, 2601, 2545, 2572, 2572, 2572, 2573, 2573, 2573, 2576, + 2576, 2576, 2550, 2556, 2556, 2556, 2557, 2557, 2557, 2602, + 2608, 2574, 2609, 2610, 2577, 2580, 2580, 2580, 2559, 2559, + 2559, 2558, 2560, 2560, 2560, 2581, 2581, 2581, 2564, 2564, + + 2564, 2584, 2584, 2584, 2566, 2566, 2566, 2561, 2585, 2585, + 2585, 2611, 2614, 2565, 2568, 2568, 2568, 2619, 2623, 2567, + 2569, 2569, 2569, 2572, 2572, 2572, 2573, 2573, 2573, 2588, + 2588, 2588, 2576, 2576, 2576, 2590, 2590, 2590, 2580, 2580, + 2580, 2574, 2581, 2581, 2581, 2624, 2625, 
2577, 2584, 2584, + 2584, 2585, 2585, 2585, 2595, 2595, 2595, 2588, 2588, 2588, + 2590, 2590, 2590, 2595, 2595, 2595, 2603, 2603, 2603, 2596, + 2604, 2604, 2604, 2606, 2606, 2606, 2627, 2628, 2596, 2603, + 2603, 2603, 2604, 2604, 2604, 2605, 2629, 2630, 2607, 2612, + 2612, 2612, 2606, 2606, 2606, 2617, 2631, 2605, 2613, 2613, + + 2613, 2615, 2615, 2615, 2612, 2612, 2612, 2607, 2618, 2613, + 2613, 2613, 2620, 2620, 2620, 2632, 2616, 2615, 2615, 2615, + 2622, 2622, 2622, 2620, 2620, 2620, 2633, 2621, 2626, 2626, + 2626, 2634, 2616, 2622, 2622, 2622, 2635, 2636, 2621, 2626, + 2626, 2626, 2637, 2638, 2639, 2640, 2641, 2642, 228, 228, + 228, 228, 228, 228, 228, 228, 252, 252, 252, 252, + 252, 252, 252, 252, 258, 258, 258, 258, 258, 258, + 258, 258, 264, 264, 264, 264, 264, 264, 264, 264, + 268, 268, 268, 268, 268, 268, 268, 268, 272, 272, + 272, 272, 272, 272, 272, 272, 278, 278, 278, 278, + + 278, 278, 278, 278, 284, 284, 284, 284, 284, 284, + 284, 284, 290, 290, 290, 290, 290, 290, 290, 290, + 294, 294, 294, 294, 294, 294, 294, 294, 300, 300, + 300, 300, 300, 300, 300, 300, 306, 306, 306, 306, + 306, 306, 306, 306, 312, 312, 312, 312, 312, 312, + 312, 312, 318, 318, 318, 318, 318, 318, 318, 318, + 324, 324, 324, 324, 324, 324, 324, 324, 328, 328, + 328, 328, 328, 328, 328, 328, 349, 349, 349, 349, + 349, 349, 349, 349, 354, 354, 354, 354, 354, 354, + 354, 354, 357, 357, 357, 357, 357, 357, 357, 357, + + 358, 358, 358, 358, 358, 358, 358, 358, 363, 363, + 363, 363, 363, 363, 363, 363, 368, 368, 368, 368, + 368, 368, 368, 368, 377, 377, 377, 377, 377, 377, + 377, 377, 381, 381, 381, 381, 381, 381, 381, 381, + 386, 386, 386, 386, 386, 386, 386, 386, 391, 391, + 391, 391, 391, 391, 391, 391, 396, 396, 396, 396, + 396, 396, 396, 396, 401, 401, 401, 401, 401, 401, + 401, 401, 406, 406, 406, 406, 406, 406, 406, 406, + 410, 410, 410, 410, 410, 410, 410, 410, 415, 415, + 415, 415, 415, 415, 415, 415, 419, 419, 419, 419, + + 419, 419, 419, 419, 423, 423, 423, 423, 423, 423, + 
423, 423, 427, 427, 427, 427, 427, 427, 427, 427, + 431, 431, 431, 431, 431, 431, 431, 431, 435, 435, + 435, 435, 435, 435, 435, 435, 439, 439, 439, 439, + 439, 439, 439, 439, 443, 443, 443, 443, 443, 443, + 443, 443, 447, 447, 447, 447, 447, 447, 447, 447, + 453, 453, 453, 453, 453, 453, 453, 453, 458, 458, + 458, 458, 458, 458, 458, 458, 469, 469, 469, 469, + 469, 469, 469, 469, 473, 473, 473, 473, 473, 473, + 473, 473, 479, 479, 479, 479, 479, 479, 479, 479, + + 483, 483, 483, 483, 483, 483, 483, 483, 496, 496, + 496, 496, 496, 496, 496, 496, 353, 353, 353, 353, + 353, 353, 353, 353, 523, 523, 523, 523, 523, 523, + 523, 523, 528, 528, 528, 528, 528, 528, 528, 528, + 543, 543, 543, 543, 543, 543, 543, 543, 564, 564, + 564, 564, 564, 564, 564, 564, 570, 570, 570, 570, + 570, 570, 570, 570, 581, 581, 581, 581, 581, 581, + 581, 581, 586, 586, 586, 586, 586, 586, 586, 586, + 592, 592, 592, 592, 592, 592, 592, 592, 596, 596, + 596, 596, 596, 596, 596, 596, 606, 606, 606, 606, + + 606, 606, 606, 606, 611, 611, 611, 611, 611, 611, + 611, 611, 615, 615, 615, 615, 615, 615, 615, 615, + 627, 627, 627, 627, 627, 627, 627, 627, 632, 632, + 632, 632, 632, 632, 632, 632, 637, 637, 637, 637, + 637, 637, 637, 637, 651, 651, 651, 651, 651, 651, + 651, 651, 655, 655, 655, 655, 655, 655, 655, 655, + 660, 660, 660, 660, 660, 660, 660, 660, 666, 666, + 666, 666, 666, 666, 666, 666, 671, 671, 671, 671, + 671, 671, 671, 671, 676, 676, 676, 676, 676, 676, + 676, 676, 680, 680, 680, 680, 680, 680, 680, 680, + + 682, 682, 682, 682, 682, 682, 682, 682, 740, 740, + 741, 741, 746, 747, 749, 749, 750, 750, 753, 753, + 754, 754, 757, 757, 758, 758, 760, 762, 762, 763, + 763, 766, 766, 767, 767, 770, 770, 771, 771, 774, + 774, 775, 775, 778, 778, 779, 779, 781, 783, 783, + 784, 784, 787, 787, 788, 788, 790, 792, 792, 793, + 793, 800, 801, 802, 828, 802, 802, 802, 803, 829, + 803, 803, 803, 831, 832, 833, 834, 835, 836, 837, + 838, 839, 842, 843, 847, 848, 850, 850, 851, 851, + 853, 855, 855, 
856, 856, 858, 860, 860, 861, 861, + + 868, 869, 868, 868, 868, 870, 871, 872, 873, 874, + 876, 876, 877, 877, 880, 880, 881, 881, 884, 884, + 885, 885, 888, 888, 889, 889, 892, 892, 893, 893, + 898, 898, 899, 899, 902, 902, 903, 903, 906, 906, + 907, 907, 910, 910, 911, 911, 914, 914, 915, 915, + 918, 918, 919, 919, 922, 922, 923, 923, 926, 926, + 927, 927, 931, 932, 933, 934, 935, 936, 937, 945, + 945, 946, 946, 948, 959, 959, 960, 960, 963, 963, + 964, 964, 966, 971, 974, 976, 976, 977, 977, 979, + 989, 990, 990, 746, 990, 990, 990, 990, 990, 991, + + 747, 760, 991, 991, 991, 991, 991, 741, 741, 740, + 740, 1070, 781, 1070, 750, 750, 749, 749, 1076, 790, + 1076, 754, 754, 753, 753, 1078, 800, 1078, 758, 758, + 757, 757, 1080, 801, 1080, 763, 763, 762, 762, 1082, + 828, 1082, 767, 767, 766, 766, 1084, 829, 1084, 771, + 771, 770, 770, 1086, 831, 1086, 775, 775, 774, 774, + 1088, 832, 1088, 779, 779, 778, 778, 1090, 833, 1090, + 784, 784, 783, 783, 1092, 834, 1092, 788, 788, 787, + 787, 1094, 835, 1094, 793, 793, 792, 792, 1096, 836, + 1096, 802, 837, 802, 802, 802, 803, 838, 803, 803, + + 803, 839, 842, 843, 847, 848, 851, 851, 850, 850, + 1121, 853, 1121, 856, 856, 855, 855, 1123, 858, 1123, + 861, 861, 860, 860, 1125, 869, 1125, 868, 870, 868, + 868, 868, 871, 872, 873, 874, 877, 877, 876, 876, + 1131, 931, 1131, 881, 881, 880, 880, 1133, 932, 1133, + 885, 885, 884, 884, 1135, 933, 1135, 889, 889, 888, + 888, 1137, 934, 1137, 893, 893, 892, 892, 1139, 935, + 1139, 899, 899, 898, 898, 1142, 936, 1142, 903, 903, + 902, 902, 1144, 937, 1144, 907, 907, 906, 906, 1146, + 948, 1146, 911, 911, 910, 910, 1148, 966, 1148, 915, + + 915, 914, 914, 1150, 971, 1150, 919, 919, 918, 918, + 1152, 974, 1152, 923, 923, 922, 922, 1154, 979, 1154, + 927, 927, 926, 926, 1156, 989, 1156, 946, 946, 945, + 945, 1162, 1070, 1162, 960, 960, 959, 959, 1170, 1076, + 1170, 964, 964, 963, 963, 1172, 1078, 1172, 977, 977, + 976, 976, 1177, 1080, 1177, 990, 990, 1082, 990, 990, + 990, 990, 
990, 991, 1084, 1086, 991, 991, 991, 991, + 991, 1088, 1090, 1092, 1094, 1096, 1121, 1123, 1125, 1131, + 1133, 1135, 1137, 1139, 1142, 1144, 1146, 1148, 1150, 1152, + 1154, 1156, 1162, 1353, 1352, 1351, 1350, 1349, 1348, 1347, + + 1339, 1338, 1337, 1336, 1335, 1334, 1325, 1324, 1323, 1322, + 1321, 1320, 1319, 1318, 1317, 1316, 1314, 1311, 1310, 1309, + 1308, 1307, 1306, 1305, 1304, 1303, 1302, 1301, 1300, 1299, + 1298, 1295, 1293, 1292, 1291, 1290, 1289, 1286, 1285, 1284, + 1281, 1280, 1279, 1278, 1277, 1276, 1275, 1274, 1273, 1272, + 1271, 1270, 1269, 1268, 1267, 1266, 1263, 1262, 1261, 1260, + 1259, 1255, 1254, 1251, 1250, 1249, 1248, 1247, 1246, 1245, + 1244, 1243, 1242, 1241, 1240, 1239, 1238, 1237, 1234, 1233, + 1232, 1231, 1230, 1229, 1228, 1227, 1226, 1225, 1224, 1223, + 1220, 1214, 1213, 1212, 1208, 1207, 1206, 1205, 1204, 1203, + + 1202, 1201, 1200, 1199, 1198, 1197, 1196, 1193, 1192, 1191, + 1190, 1189, 1188, 1187, 1186, 1183, 1182, 1181, 1180, 1179, + 1178, 978, 975, 978, 1175, 1174, 1173, 965, 962, 965, + 961, 958, 961, 1168, 1167, 1166, 1165, 1164, 1163, 947, + 944, 947, 1160, 1159, 1158, 1157, 928, 925, 928, 924, + 921, 924, 920, 917, 920, 916, 913, 916, 912, 909, + 912, 908, 905, 908, 904, 901, 904, 900, 897, 900, + 1140, 894, 891, 894, 890, 887, 890, 886, 883, 886, + 882, 879, 882, 878, 875, 878, 1129, 1128, 1127, 1126, + 862, 859, 862, 857, 854, 857, 852, 849, 852, 1119, + + 1118, 1117, 1116, 1115, 1114, 1113, 1112, 1111, 1110, 1109, + 1108, 1107, 1106, 1105, 1104, 1103, 1102, 1101, 1100, 1099, + 1098, 1097, 794, 791, 794, 789, 786, 789, 785, 782, + 785, 780, 777, 780, 776, 773, 776, 772, 769, 772, + 768, 765, 768, 764, 761, 764, 759, 756, 759, 755, + 752, 755, 751, 748, 751, 1074, 1073, 742, 739, 742, + 1068, 1067, 1066, 1065, 1064, 1062, 1058, 1055, 1054, 1053, + 1052, 1043, 1042, 1041, 1040, 1039, 1032, 1031, 1025, 1015, + 1011, 1010, 1009, 1008, 1007, 1006, 1005, 1004, 1003, 1000, + 999, 998, 997, 996, 993, 992, 988, 987, 986, 985, + + 984, 
983, 982, 981, 980, 975, 978, 975, 973, 972, + 970, 969, 968, 967, 962, 965, 962, 958, 961, 958, + 957, 956, 955, 954, 953, 952, 951, 950, 949, 944, + 947, 944, 943, 942, 941, 940, 939, 938, 930, 929, + 925, 928, 925, 921, 924, 921, 917, 920, 917, 913, + 916, 913, 909, 912, 909, 905, 908, 905, 901, 904, + 901, 897, 900, 897, 896, 895, 891, 894, 891, 887, + 890, 887, 883, 886, 883, 879, 882, 879, 875, 878, + 875, 867, 866, 865, 864, 863, 859, 862, 859, 854, + 857, 854, 849, 852, 849, 846, 845, 844, 841, 840, + + 830, 827, 826, 825, 824, 823, 822, 821, 820, 819, + 818, 814, 813, 810, 809, 808, 807, 806, 805, 804, + 799, 798, 797, 796, 795, 791, 794, 791, 786, 789, + 786, 782, 785, 782, 777, 780, 777, 773, 776, 773, + 769, 772, 769, 765, 768, 765, 761, 764, 761, 756, + 759, 756, 752, 755, 752, 748, 751, 748, 745, 744, + 743, 739, 742, 739, 738, 737, 703, 702, 685, 684, + 2643, 229, 229, 647, 620, 553, 553, 553, 533, 533, + 533, 510, 510, 510, 510, 342, 342, 342, 330, 330, + 330, 229, 227, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643 + } ; + +static yyconst flex_int16_t yy_chk[6138] = + { 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 3, 225, 225, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 246, + 246, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 5, 6, 31, 5, 6, 31, 32, 33, + 34, 32, 33, 34, 59, 53, 218, 59, 53, 218, + 54, 5, 6, 54, 1098, 53, 5, 6, 53, 55, + + 54, 1098, 55, 54, 59, 5, 6, 7, 95, 55, + 7, 95, 55, 157, 7, 7, 7, 7, 7, 7, + 7, 8, 235, 224, 8, 189, 224, 95, 8, 8, + 8, 8, 8, 8, 8, 9, 235, 95, 9, 157, + 238, 95, 9, 9, 9, 9, 9, 9, 9, 10, + 157, 238, 10, 233, 189, 233, 10, 10, 10, 10, + 10, 10, 10, 11, 233, 189, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 
12, 692, 1254, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 13, 1255, 692, 13, 13, 13, 13, 13, 13, + + 13, 13, 13, 13, 13, 14, 742, 742, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, + 751, 751, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 16, 755, 755, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 17, 226, 226, + 17, 244, 226, 244, 17, 17, 17, 17, 17, 17, + 17, 18, 237, 730, 18, 1256, 730, 237, 18, 18, + 18, 18, 18, 18, 18, 19, 730, 1257, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, + 759, 759, 20, 20, 20, 20, 20, 20, 20, 20, + + 20, 20, 20, 21, 764, 764, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 22, 768, 768, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 23, 772, 772, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 24, 776, 776, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, + 780, 780, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 26, 785, 785, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 27, 789, 789, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + + 27, 28, 794, 794, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 29, 242, 743, 29, 743, + 242, 373, 29, 29, 29, 29, 29, 29, 29, 30, + 373, 1259, 30, 852, 852, 242, 30, 30, 30, 30, + 30, 30, 30, 35, 857, 857, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 39, 709, 61, + 39, 1261, 61, 709, 39, 39, 39, 39, 39, 39, + 39, 41, 41, 41, 41, 41, 41, 41, 41, 41, + 41, 41, 43, 56, 60, 43, 56, 60, 61, 62, + + 1262, 43, 62, 56, 63, 64, 56, 63, 64, 61, + 862, 862, 245, 43, 60, 245, 158, 43, 65, 43, + 66, 65, 96, 66, 245, 96, 43, 44, 62, 1263, + 44, 878, 878, 63, 64, 67, 44, 173, 67, 62, + 686, 96, 158, 1264, 63, 64, 686, 65, 44, 66, + 728, 96, 44, 158, 44, 96, 882, 882, 65, 728, + 66, 44, 45, 173, 67, 1265, 45, 45, 45, 45, + 45, 45, 45, 46, 173, 67, 1266, 46, 46, 46, + 46, 46, 46, 46, 47, 243, 1267, 47, 886, 886, + 174, 47, 47, 47, 47, 47, 47, 47, 48, 243, + + 1268, 48, 243, 243, 1269, 48, 48, 48, 48, 48, + 48, 48, 49, 49, 49, 49, 174, 49, 49, 49, + 705, 705, 705, 68, 890, 
890, 68, 174, 49, 49, + 50, 50, 50, 50, 379, 50, 50, 50, 993, 379, + 1270, 69, 379, 993, 69, 1271, 50, 50, 51, 51, + 51, 51, 68, 51, 51, 51, 70, 894, 894, 70, + 113, 900, 900, 68, 51, 51, 52, 52, 52, 52, + 69, 52, 52, 52, 114, 1272, 113, 1273, 113, 1274, + 113, 69, 52, 52, 57, 70, 113, 57, 139, 1242, + 114, 139, 114, 247, 114, 247, 70, 113, 1242, 247, + + 114, 140, 57, 57, 140, 183, 57, 57, 183, 904, + 904, 114, 57, 183, 183, 57, 58, 139, 1184, 58, + 714, 714, 714, 175, 176, 1184, 175, 176, 139, 1275, + 140, 908, 908, 215, 58, 58, 215, 177, 58, 58, + 177, 140, 1276, 215, 58, 1278, 215, 58, 71, 912, + 912, 71, 175, 176, 1279, 71, 71, 71, 71, 71, + 71, 71, 72, 175, 176, 72, 177, 916, 916, 72, + 72, 72, 72, 72, 72, 72, 73, 177, 1280, 73, + 733, 733, 733, 73, 73, 73, 73, 73, 73, 73, + 1281, 184, 702, 216, 184, 178, 216, 702, 178, 184, + + 184, 185, 187, 216, 185, 187, 216, 702, 188, 73, + 74, 188, 1282, 74, 920, 920, 1283, 74, 74, 74, + 74, 74, 74, 74, 178, 1284, 213, 924, 924, 213, + 185, 187, 1258, 190, 185, 178, 190, 188, 928, 928, + 1258, 185, 187, 74, 75, 213, 213, 75, 188, 213, + 1285, 75, 75, 75, 75, 75, 75, 75, 76, 947, + 947, 76, 190, 961, 961, 76, 76, 76, 76, 76, + 76, 76, 77, 190, 1286, 77, 704, 704, 704, 77, + 77, 77, 77, 77, 77, 77, 78, 965, 965, 78, + 1289, 704, 1290, 78, 78, 78, 78, 78, 78, 78, + + 79, 978, 978, 79, 706, 706, 706, 79, 79, 79, + 79, 79, 79, 79, 80, 1169, 1169, 80, 1291, 706, + 1292, 80, 80, 80, 80, 80, 80, 80, 81, 1171, + 1171, 81, 707, 707, 707, 81, 81, 81, 81, 81, + 81, 81, 82, 1176, 1176, 82, 1293, 707, 1295, 82, + 82, 82, 82, 82, 82, 82, 83, 1298, 1299, 83, + 708, 708, 708, 83, 83, 83, 83, 83, 83, 83, + 84, 1300, 1301, 84, 1302, 708, 1305, 84, 84, 84, + 84, 84, 84, 84, 85, 1306, 1307, 85, 721, 721, + 721, 85, 85, 85, 85, 85, 85, 85, 86, 1303, + + 1309, 86, 1303, 721, 1310, 86, 86, 86, 86, 86, + 86, 86, 87, 1311, 1316, 87, 722, 722, 722, 87, + 87, 87, 87, 87, 87, 87, 88, 1317, 1318, 88, + 1319, 722, 1321, 88, 88, 88, 88, 88, 88, 88, + 
89, 1322, 1323, 89, 732, 732, 732, 89, 89, 89, + 89, 89, 89, 89, 90, 186, 1324, 90, 186, 732, + 1325, 90, 90, 90, 90, 90, 90, 90, 91, 1326, + 1327, 91, 1002, 1002, 1002, 91, 91, 91, 91, 91, + 91, 91, 1329, 197, 186, 91, 197, 1002, 186, 198, + 199, 1304, 198, 199, 715, 186, 715, 91, 92, 1304, + + 1330, 92, 723, 715, 723, 92, 92, 92, 92, 92, + 92, 92, 197, 1328, 1331, 92, 723, 200, 198, 199, + 200, 1328, 723, 197, 1016, 1016, 1016, 92, 93, 198, + 199, 93, 1332, 93, 1333, 93, 93, 93, 93, 93, + 93, 93, 94, 1334, 1335, 94, 200, 94, 1337, 94, + 94, 94, 94, 94, 94, 94, 97, 200, 1338, 1340, + 97, 97, 97, 97, 97, 97, 97, 98, 1018, 1018, + 1018, 98, 98, 98, 98, 98, 98, 98, 99, 1020, + 1020, 1020, 99, 99, 99, 99, 99, 99, 99, 100, + 1022, 1022, 1022, 100, 100, 100, 100, 100, 100, 100, + + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 103, 1341, 1342, 103, 1017, 1017, 1017, 103, 103, 103, + 103, 103, 103, 103, 104, 1343, 1344, 104, 1347, 1017, + 1348, 104, 104, 104, 104, 104, 104, 104, 105, 1349, + 1350, 105, 105, 105, 105, 105, 105, 105, 105, 105, + 105, 105, 106, 1351, 1353, 106, 106, 106, 106, 106, + 106, 106, 106, 106, 106, 106, 107, 1355, 1356, 107, + 1019, 1019, 1019, 107, 107, 107, 107, 107, 107, 107, + 108, 1357, 1352, 108, 1358, 1019, 1352, 108, 108, 108, + 108, 108, 108, 108, 109, 1359, 1360, 109, 109, 109, + + 109, 109, 109, 109, 109, 109, 109, 109, 110, 1362, + 1363, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 111, 1029, 1029, 1029, 111, 111, 111, 111, + 111, 111, 111, 214, 1364, 203, 214, 236, 203, 234, + 234, 1365, 204, 236, 1366, 204, 234, 1367, 111, 1368, + 234, 236, 214, 214, 1369, 1370, 214, 1371, 1372, 111, + 115, 115, 115, 115, 203, 115, 115, 115, 1373, 211, + 212, 204, 211, 212, 1374, 203, 115, 115, 116, 116, + 116, 116, 204, 116, 116, 116, 1045, 1045, 1045, 1375, + 219, 1376, 1377, 219, 116, 116, 117, 1379, 211, 212, + + 117, 117, 117, 117, 117, 117, 117, 118, 1067, 211, + 212, 118, 118, 118, 118, 118, 118, 118, 119, 219, + 1067, 1067, 
119, 119, 119, 119, 119, 119, 119, 120, + 219, 1380, 1381, 120, 120, 120, 120, 120, 120, 120, + 121, 1047, 1047, 1047, 121, 121, 121, 121, 121, 121, + 121, 122, 1063, 1063, 1063, 122, 122, 122, 122, 122, + 122, 122, 123, 1195, 1195, 1195, 123, 123, 123, 123, + 123, 123, 123, 124, 1209, 1209, 1209, 124, 124, 124, + 124, 124, 124, 124, 125, 1210, 1210, 1210, 125, 125, + 125, 125, 125, 125, 125, 126, 1211, 1211, 1211, 126, + + 126, 126, 126, 126, 126, 126, 127, 1216, 1216, 1216, + 127, 127, 127, 127, 127, 127, 127, 128, 1218, 1218, + 1218, 128, 128, 128, 128, 128, 128, 128, 129, 1021, + 1021, 1021, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 130, 1382, 1021, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 131, 1219, 1219, 1219, + 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, + 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, + 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, + 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, + + 141, 1383, 1384, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 142, 1385, 1386, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 143, 143, + 143, 143, 143, 143, 143, 143, 143, 143, 143, 145, + 145, 145, 145, 145, 145, 145, 145, 145, 145, 147, + 147, 147, 147, 147, 147, 147, 147, 147, 147, 149, + 1388, 1389, 149, 149, 149, 149, 149, 149, 149, 149, + 149, 149, 149, 150, 1390, 1391, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, 153, 153, + + 153, 153, 153, 153, 153, 153, 153, 153, 155, 155, + 155, 155, 155, 155, 155, 155, 155, 155, 159, 1222, + 1222, 1222, 159, 159, 159, 159, 159, 159, 159, 160, + 1235, 1235, 1235, 160, 160, 160, 160, 160, 160, 160, + 161, 1236, 1236, 1236, 161, 161, 161, 161, 161, 161, + 161, 162, 1026, 1026, 1026, 162, 162, 162, 162, 162, + 162, 162, 163, 1392, 1393, 163, 1361, 1026, 1395, 163, + 163, 163, 163, 163, 163, 163, 164, 1397, 1361, 164, + 1253, 1253, 1253, 164, 164, 164, 164, 164, 164, 164, + 165, 1288, 
1288, 1288, 165, 165, 165, 165, 165, 165, + + 165, 166, 166, 166, 166, 166, 166, 166, 167, 1294, + 1294, 1294, 167, 167, 167, 167, 167, 167, 167, 168, + 168, 168, 168, 168, 168, 168, 169, 1398, 1399, 169, + 1027, 1027, 1027, 169, 169, 169, 169, 169, 169, 169, + 170, 1400, 1401, 170, 1402, 1027, 1403, 170, 170, 170, + 170, 170, 170, 170, 171, 1297, 1297, 1297, 171, 171, + 171, 171, 171, 171, 171, 172, 172, 172, 172, 172, + 172, 172, 179, 1404, 1405, 179, 179, 179, 179, 179, + 179, 179, 179, 179, 179, 179, 180, 1406, 1409, 180, + 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, + + 181, 1410, 1411, 181, 1028, 1028, 1028, 181, 181, 181, + 181, 181, 181, 181, 182, 1414, 1415, 182, 1416, 1028, + 1417, 182, 182, 182, 182, 182, 182, 182, 191, 1422, + 1423, 191, 191, 191, 191, 191, 191, 191, 191, 191, + 191, 191, 192, 1424, 1425, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 193, 193, 193, 193, + 193, 193, 193, 193, 193, 193, 193, 195, 1312, 1312, + 1312, 195, 195, 195, 195, 195, 195, 195, 196, 1031, + 1031, 1031, 196, 196, 196, 196, 196, 196, 196, 201, + 220, 1426, 201, 220, 1031, 1427, 201, 201, 201, 201, + + 201, 201, 201, 202, 1428, 1429, 202, 1313, 1313, 1313, + 202, 202, 202, 202, 202, 202, 202, 205, 1430, 220, + 1431, 205, 205, 205, 205, 205, 205, 205, 206, 1432, + 220, 1433, 206, 206, 206, 206, 206, 206, 206, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 209, 1434, 1435, 209, 1044, 1044, 1044, 209, 209, 209, + 209, 209, 209, 209, 210, 1438, 1439, 210, 1442, 1044, + 1443, 210, 210, 210, 210, 210, 210, 210, 221, 1444, + 1445, 221, 1046, 1046, 1046, 221, 221, 221, 221, 221, + 221, 221, 222, 1446, 1450, 222, 1451, 1046, 1452, 222, + + 222, 222, 222, 222, 222, 222, 241, 241, 241, 248, + 1064, 1064, 1064, 248, 1188, 1188, 1188, 1194, 1194, 1194, + 1453, 241, 248, 248, 1454, 1064, 1455, 248, 248, 1188, + 1456, 1457, 1194, 1315, 1315, 1315, 1458, 1197, 1197, 1197, + 1215, 1215, 1215, 1217, 1217, 1217, 241, 241, 241, 249, + 249, 249, 1197, 1243, 
1448, 1215, 1459, 1243, 1217, 1346, + 1346, 1346, 1460, 1448, 249, 1243, 1461, 249, 1221, 1221, + 1221, 1354, 1354, 1354, 249, 249, 711, 711, 711, 1244, + 1244, 1244, 1462, 1221, 1463, 1464, 711, 711, 1252, 1252, + 1252, 711, 1465, 1466, 1244, 711, 1233, 1233, 1233, 1287, + + 1287, 1287, 1467, 1252, 1468, 1233, 1233, 1469, 1233, 1470, + 1233, 1233, 1471, 1472, 1287, 1296, 1296, 1296, 1308, 1308, + 1308, 1314, 1314, 1314, 1320, 1320, 1320, 1336, 1336, 1336, + 1296, 1474, 1475, 1308, 1476, 1478, 1314, 1481, 1482, 1320, + 1483, 1484, 1336, 1339, 1339, 1339, 1345, 1345, 1345, 1387, + 1387, 1387, 1394, 1394, 1394, 1396, 1396, 1396, 1339, 1485, + 1486, 1345, 1407, 1407, 1407, 1408, 1408, 1408, 1487, 1488, + 1396, 1412, 1412, 1412, 1413, 1413, 1413, 1407, 1418, 1418, + 1418, 1419, 1419, 1419, 1489, 1490, 1412, 1420, 1420, 1420, + 1421, 1421, 1421, 1418, 1436, 1436, 1436, 1437, 1437, 1437, + + 1493, 1492, 1420, 1492, 1494, 1421, 1440, 1440, 1440, 1436, + 1441, 1441, 1441, 1447, 1447, 1447, 1491, 1449, 1497, 1498, + 1449, 1440, 1449, 1491, 1495, 1495, 1495, 1496, 1496, 1496, + 1449, 1499, 1500, 1501, 1502, 1503, 1504, 1504, 1504, 1495, + 1505, 1506, 1507, 1507, 1507, 1508, 1509, 1510, 1511, 1511, + 1511, 1504, 1512, 1513, 1514, 1515, 1516, 1516, 1516, 1517, + 1517, 1517, 1518, 1518, 1518, 1519, 1519, 1519, 1520, 1520, + 1520, 1521, 1522, 1523, 1517, 1524, 1525, 1526, 1527, 1528, + 1519, 1529, 1530, 1531, 1532, 1533, 1534, 1535, 1535, 1535, + 1536, 1537, 1538, 1538, 1538, 1539, 1540, 1541, 1542, 1543, + + 1544, 1545, 1546, 1547, 1548, 1549, 1550, 1551, 1552, 1553, + 1554, 1555, 1556, 1557, 1558, 1559, 1560, 1561, 1562, 1563, + 1564, 1566, 1567, 1568, 1569, 1570, 1571, 1572, 1574, 1577, + 1578, 1579, 1579, 1579, 1580, 1581, 1582, 1582, 1582, 1583, + 1584, 1585, 1586, 1587, 1588, 1589, 1579, 1590, 1591, 1592, + 1594, 1582, 1593, 1593, 1593, 1595, 1596, 1597, 1598, 1599, + 1600, 1601, 1601, 1601, 1602, 1602, 1602, 1603, 1604, 1605, + 1606, 1607, 1608, 1604, 1614, 1615, 1601, 
1609, 1609, 1609, + 1610, 1610, 1610, 1611, 1611, 1611, 1612, 1612, 1612, 1613, + 1613, 1613, 1609, 1616, 1617, 1610, 1618, 1619, 1611, 1620, + + 1621, 1622, 1623, 1624, 1625, 1626, 1627, 1628, 1629, 1630, + 1631, 1632, 1633, 1634, 1635, 1636, 1637, 1638, 1639, 1640, + 1641, 1642, 1643, 1641, 1644, 1645, 1645, 1645, 1646, 1647, + 1648, 1649, 1650, 1651, 1652, 1653, 1654, 1656, 1657, 1658, + 1645, 1659, 1661, 1662, 1664, 1665, 1666, 1666, 1666, 1668, + 1665, 1667, 1667, 1667, 1669, 1670, 1670, 1670, 1671, 1671, + 1671, 1666, 1672, 1673, 1674, 1674, 1674, 1675, 1676, 1677, + 1670, 1678, 1678, 1678, 1679, 1680, 1681, 1682, 1683, 1674, + 1684, 1678, 1678, 1685, 1686, 1687, 1678, 1688, 1689, 1689, + 1689, 1690, 1691, 1692, 1693, 1694, 1695, 1696, 1697, 1697, + + 1697, 1698, 1698, 1698, 1699, 1699, 1699, 1700, 1700, 1700, + 1701, 1701, 1701, 1697, 1702, 1702, 1702, 1703, 1704, 1699, + 1705, 1706, 1707, 1708, 1709, 1701, 1710, 1710, 1710, 1711, + 1712, 1713, 1714, 1715, 1716, 1717, 1718, 1719, 1720, 1721, + 1722, 1710, 1723, 1724, 1725, 1726, 1727, 1728, 1729, 1730, + 1731, 1732, 1733, 1734, 1735, 1735, 1735, 1736, 1736, 1736, + 1737, 1738, 1739, 1740, 1741, 1742, 1743, 1744, 1745, 1735, + 1746, 1747, 1748, 1749, 1750, 1751, 1752, 1753, 1754, 1755, + 1755, 1755, 1756, 1756, 1756, 1757, 1758, 1758, 1758, 1759, + 1760, 1761, 1761, 1761, 1762, 1762, 1762, 1756, 1763, 1764, + + 1764, 1764, 1765, 1766, 1766, 1766, 1761, 1767, 1767, 1767, + 1768, 1768, 1768, 1770, 1764, 1769, 1769, 1769, 1766, 1771, + 1771, 1771, 1767, 1772, 1773, 1768, 1774, 1775, 1776, 1777, + 1778, 1779, 1780, 1781, 1771, 1782, 1782, 1782, 1783, 1783, + 1783, 1784, 1785, 1786, 1787, 1787, 1787, 1788, 1788, 1788, + 1782, 1790, 1791, 1783, 1789, 1789, 1789, 1792, 1793, 1794, + 1795, 1795, 1795, 1796, 1797, 1797, 1797, 1798, 1798, 1798, + 1799, 1800, 1801, 1802, 1803, 1795, 1805, 1806, 1803, 1797, + 1804, 1804, 1804, 1807, 1808, 1809, 1810, 1811, 1812, 1812, + 1812, 1813, 1814, 1815, 1816, 1804, 1817, 1818, 
1819, 1820, + + 1821, 1822, 1824, 1812, 1823, 1823, 1823, 1825, 1826, 1827, + 1827, 1827, 1828, 1829, 1830, 1831, 1832, 1833, 1834, 1835, + 1836, 1837, 1838, 1839, 1827, 1840, 1841, 1841, 1841, 1842, + 1842, 1842, 1843, 1843, 1843, 1844, 1841, 1841, 1841, 1845, + 1846, 1841, 1848, 1851, 1842, 1847, 1847, 1847, 1849, 1849, + 1849, 1850, 1850, 1850, 1852, 1852, 1852, 1853, 1853, 1853, + 1854, 1854, 1854, 1849, 1855, 1855, 1855, 1857, 1860, 1852, + 1856, 1856, 1856, 1862, 1863, 1854, 1858, 1858, 1858, 1859, + 1859, 1859, 1861, 1865, 1864, 1867, 1868, 1861, 1866, 1866, + 1866, 1858, 1864, 1869, 1870, 1870, 1870, 1864, 1871, 1871, + + 1871, 1874, 1875, 1866, 1872, 1872, 1872, 1876, 1877, 1870, + 1873, 1873, 1873, 1878, 1879, 1880, 1881, 1884, 1886, 1872, + 1882, 1882, 1882, 1883, 1883, 1883, 1885, 1885, 1885, 1887, + 1888, 1889, 1890, 1891, 1894, 1882, 1892, 1892, 1892, 1889, + 1893, 1893, 1893, 1895, 1896, 1897, 1898, 1899, 1900, 1903, + 1904, 1892, 1901, 1901, 1901, 1902, 1902, 1902, 1905, 1906, + 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1901, 1914, 1915, + 1916, 1916, 1916, 1917, 1917, 1917, 1918, 1919, 1920, 1921, + 1922, 1925, 1926, 1927, 1929, 1916, 1930, 1931, 1931, 1931, + 1932, 1932, 1932, 1933, 1933, 1933, 1934, 1934, 1934, 1935, + + 1935, 1935, 1931, 1937, 1938, 1932, 1940, 1942, 1933, 1947, + 1948, 1934, 1936, 1936, 1936, 1939, 1939, 1939, 1941, 1941, + 1941, 1943, 1943, 1943, 1944, 1944, 1944, 1945, 1945, 1945, + 1939, 1946, 1946, 1946, 1949, 1950, 1951, 1945, 1945, 1952, + 1953, 1954, 1945, 1955, 1956, 1956, 1956, 1957, 1957, 1957, + 1958, 1959, 1960, 1961, 1961, 1961, 1962, 1962, 1962, 1956, + 1963, 1964, 1964, 1964, 1965, 1966, 1967, 1967, 1967, 1968, + 1969, 1970, 1971, 1971, 1971, 1972, 1964, 1973, 1974, 1975, + 1976, 1967, 1977, 1978, 1979, 1980, 1980, 1980, 1981, 1982, + 1983, 1984, 1985, 1986, 1987, 1988, 1988, 1988, 1989, 1990, + + 1990, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, + 1999, 2000, 2001, 2003, 1990, 2002, 2002, 2002, 2004, 
2005, + 2006, 2007, 2008, 2010, 2011, 2012, 2013, 2013, 2013, 2014, + 2014, 2014, 2015, 2015, 2015, 2016, 2016, 2016, 2017, 2017, + 2017, 2013, 2018, 2018, 2018, 2020, 2021, 2015, 2019, 2019, + 2019, 2024, 2025, 2017, 2022, 2022, 2022, 2023, 2023, 2023, + 2026, 2026, 2026, 2027, 2027, 2027, 2028, 2028, 2028, 2022, + 2029, 2029, 2029, 2030, 2031, 2026, 2033, 2034, 2027, 2035, + 2036, 2028, 2032, 2032, 2032, 2032, 2037, 2038, 2039, 2039, + 2039, 2040, 2041, 2042, 2043, 2044, 2044, 2044, 2045, 2045, + + 2045, 2046, 2046, 2046, 2047, 2048, 2048, 2048, 2050, 2051, + 2044, 2049, 2049, 2049, 2053, 2054, 2046, 2052, 2052, 2052, + 2048, 2055, 2056, 2057, 2057, 2057, 2059, 2060, 2055, 2058, + 2058, 2058, 2052, 2061, 2062, 2063, 2063, 2063, 2057, 2064, + 2065, 2066, 2067, 2068, 2058, 2069, 2069, 2069, 2071, 2072, + 2063, 2070, 2070, 2070, 2073, 2074, 2075, 2076, 2077, 2078, + 2069, 2079, 2080, 2081, 2082, 2083, 2084, 2085, 2086, 2087, + 2088, 2089, 2090, 2090, 2090, 2091, 2091, 2091, 2092, 2092, + 2092, 2093, 2093, 2093, 2094, 2097, 2098, 2090, 2095, 2095, + 2095, 2096, 2096, 2096, 2099, 2099, 2099, 2100, 2100, 2100, + + 2101, 2101, 2101, 2095, 2102, 2102, 2102, 2104, 2105, 2099, + 2103, 2103, 2103, 2110, 2112, 2101, 2106, 2106, 2106, 2107, + 2107, 2107, 2108, 2108, 2108, 2109, 2109, 2109, 2111, 2111, + 2111, 2106, 2113, 2114, 2107, 2115, 2116, 2108, 2117, 2118, + 2109, 2119, 2123, 2111, 2120, 2120, 2120, 2121, 2121, 2121, + 2122, 2122, 2122, 2124, 2124, 2124, 2125, 2111, 2126, 2127, + 2127, 2127, 2121, 2128, 2128, 2128, 2129, 2130, 2130, 2130, + 2131, 2132, 2133, 2138, 2127, 2134, 2134, 2134, 2135, 2135, + 2135, 2140, 2130, 2136, 2136, 2136, 2137, 2137, 2137, 2141, + 2134, 2139, 2139, 2139, 2142, 2142, 2142, 2144, 2136, 2143, + + 2143, 2143, 2145, 2146, 2147, 2148, 2139, 2150, 2152, 2142, + 2149, 2149, 2149, 2151, 2151, 2151, 2153, 2154, 2154, 2154, + 2155, 2155, 2155, 2156, 2156, 2156, 2157, 2158, 2151, 2159, + 2160, 2161, 2154, 2162, 2163, 2155, 2164, 2165, 2156, 2166, + 
2167, 2168, 2169, 2169, 2169, 2171, 2174, 2168, 2170, 2170, + 2170, 2175, 2168, 2172, 2172, 2172, 2178, 2169, 2173, 2173, + 2173, 2176, 2176, 2176, 2177, 2177, 2177, 2179, 2172, 2180, + 2180, 2180, 2181, 2181, 2181, 2182, 2182, 2182, 2183, 2183, + 2183, 2184, 2184, 2184, 2180, 2185, 2185, 2185, 2191, 2192, + 2182, 2186, 2186, 2186, 2193, 2194, 2184, 2187, 2187, 2187, + + 2188, 2188, 2188, 2189, 2189, 2189, 2186, 2190, 2190, 2190, + 2195, 2196, 2196, 2196, 2197, 2188, 2198, 2199, 2189, 2200, + 2200, 2200, 2201, 2202, 2203, 2205, 2196, 2204, 2204, 2204, + 2206, 2206, 2206, 2207, 2207, 2207, 2208, 2209, 2210, 2211, + 2211, 2211, 2212, 2212, 2212, 2206, 2213, 2214, 2214, 2214, + 2215, 2215, 2215, 2216, 2217, 2218, 2218, 2218, 2219, 2219, + 2219, 2220, 2214, 2221, 2222, 2222, 2222, 2223, 2223, 2223, + 2224, 2228, 2235, 2219, 2225, 2225, 2225, 2237, 2238, 2222, + 2239, 2240, 2223, 2226, 2226, 2226, 2227, 2227, 2227, 2225, + 2229, 2229, 2229, 2230, 2230, 2230, 2231, 2231, 2231, 2241, + + 2242, 2227, 2232, 2232, 2232, 2229, 2233, 2233, 2233, 2243, + 2244, 2231, 2234, 2234, 2234, 2236, 2236, 2236, 2245, 2246, + 2247, 2233, 2248, 2249, 2249, 2249, 2250, 2251, 2251, 2251, + 2236, 2252, 2253, 2254, 2255, 2255, 2255, 2256, 2256, 2256, + 2257, 2257, 2257, 2258, 2258, 2258, 2259, 2259, 2259, 2255, + 2260, 2260, 2260, 2261, 2261, 2261, 2262, 2262, 2262, 2263, + 2263, 2263, 2264, 2265, 2266, 2260, 2267, 2267, 2267, 2263, + 2263, 2268, 2268, 2268, 2263, 2269, 2269, 2269, 2270, 2271, + 2272, 2267, 2273, 2274, 2271, 2275, 2268, 2276, 2277, 2277, + 2277, 2278, 2279, 2280, 2281, 2282, 2282, 2282, 2283, 2284, + + 2285, 2285, 2285, 2286, 2286, 2286, 2287, 2288, 2289, 2289, + 2289, 2290, 2290, 2290, 2293, 2285, 2291, 2291, 2291, 2292, + 2292, 2292, 2297, 2289, 2294, 2294, 2294, 2295, 2295, 2295, + 2301, 2291, 2296, 2296, 2296, 2298, 2298, 2298, 2299, 2299, + 2299, 2304, 2295, 2300, 2300, 2300, 2302, 2302, 2302, 2303, + 2303, 2303, 2305, 2306, 2307, 2308, 2308, 2308, 2309, 2309, + 2309, 
2302, 2310, 2311, 2313, 2314, 2315, 2316, 2317, 2318, + 2308, 2319, 2327, 2309, 2320, 2320, 2320, 2321, 2321, 2321, + 2322, 2322, 2322, 2323, 2323, 2323, 2324, 2324, 2324, 2320, + 2325, 2325, 2325, 2326, 2326, 2326, 2328, 2329, 2323, 2333, + + 2334, 2324, 2330, 2330, 2330, 2325, 2331, 2331, 2331, 2332, + 2332, 2332, 2335, 2336, 2337, 2338, 2339, 2330, 2340, 2341, + 2341, 2341, 2342, 2343, 2343, 2343, 2344, 2345, 2346, 2347, + 2347, 2347, 2348, 2349, 2341, 2350, 2350, 2350, 2343, 2351, + 2351, 2351, 2352, 2353, 2353, 2353, 2354, 2355, 2355, 2355, + 2356, 2356, 2356, 2357, 2358, 2359, 2360, 2361, 2361, 2361, + 2365, 2366, 2355, 2362, 2362, 2362, 2363, 2363, 2363, 2364, + 2364, 2364, 2361, 2367, 2369, 2370, 2371, 2371, 2371, 2372, + 2373, 2363, 2374, 2374, 2374, 2375, 2375, 2375, 2376, 2376, + 2376, 2371, 2377, 2377, 2377, 2378, 2378, 2378, 2380, 2381, + + 2375, 2379, 2379, 2379, 2382, 2384, 2385, 2377, 2383, 2383, + 2383, 2386, 2386, 2386, 2387, 2388, 2389, 2390, 2390, 2390, + 2391, 2386, 2386, 2386, 2394, 2397, 2386, 2392, 2392, 2392, + 2398, 2401, 2390, 2393, 2393, 2393, 2395, 2395, 2395, 2396, + 2396, 2396, 2392, 2399, 2399, 2399, 2400, 2400, 2400, 2402, + 2406, 2395, 2403, 2403, 2403, 2404, 2404, 2404, 2399, 2407, + 2408, 2400, 2405, 2405, 2405, 2412, 2413, 2403, 2414, 2399, + 2404, 2409, 2409, 2409, 2410, 2410, 2410, 2411, 2411, 2411, + 2415, 2416, 2416, 2416, 2419, 2423, 2409, 2417, 2417, 2417, + 2418, 2418, 2418, 2420, 2420, 2420, 2416, 2421, 2421, 2421, + + 2424, 2425, 2417, 2422, 2422, 2422, 2427, 2434, 2420, 2435, + 2416, 2426, 2426, 2426, 2428, 2428, 2428, 2429, 2429, 2429, + 2430, 2430, 2430, 2431, 2431, 2431, 2426, 2443, 2446, 2428, + 2449, 2454, 2429, 2455, 2456, 2430, 2459, 2461, 2431, 2432, + 2432, 2432, 2433, 2433, 2433, 2436, 2436, 2436, 2437, 2437, + 2437, 2438, 2438, 2438, 2439, 2439, 2439, 2433, 2462, 2465, + 2436, 2440, 2440, 2440, 2467, 2471, 2438, 2441, 2441, 2441, + 2442, 2442, 2442, 2444, 2444, 2444, 2440, 2445, 2445, 2445, + 2447, 2447, 
2447, 2472, 2475, 2442, 2485, 2486, 2444, 2448, + 2448, 2448, 2450, 2450, 2450, 2447, 2451, 2451, 2451, 2452, + + 2452, 2452, 2453, 2453, 2453, 2494, 2496, 2450, 2457, 2457, + 2457, 2451, 2458, 2458, 2458, 2463, 2463, 2463, 2464, 2464, + 2464, 2498, 2503, 2457, 2466, 2466, 2466, 2468, 2468, 2468, + 2463, 2469, 2469, 2469, 2470, 2470, 2470, 2473, 2473, 2473, + 2506, 2507, 2468, 2474, 2474, 2474, 2476, 2476, 2476, 2470, + 2508, 2515, 2473, 2477, 2477, 2477, 2478, 2478, 2478, 2516, + 2523, 2476, 2479, 2479, 2479, 2480, 2480, 2480, 2481, 2481, + 2481, 2478, 2482, 2482, 2482, 2483, 2483, 2483, 2524, 2528, + 2480, 2484, 2484, 2484, 2487, 2487, 2487, 2488, 2488, 2488, + 2483, 2489, 2489, 2489, 2490, 2490, 2490, 2491, 2491, 2491, + + 2529, 2530, 2488, 2492, 2492, 2492, 2493, 2493, 2493, 2490, + 2495, 2495, 2495, 2497, 2497, 2497, 2538, 2539, 2492, 2499, + 2499, 2499, 2500, 2500, 2500, 2501, 2501, 2501, 2502, 2502, + 2502, 2504, 2504, 2504, 2499, 2505, 2505, 2505, 2509, 2509, + 2509, 2547, 2548, 2502, 2552, 2554, 2504, 2510, 2510, 2510, + 2511, 2511, 2511, 2512, 2512, 2512, 2555, 2510, 2510, 2513, + 2513, 2513, 2510, 2562, 2563, 2511, 2514, 2514, 2514, 2517, + 2517, 2517, 2570, 2571, 2513, 2518, 2518, 2518, 2519, 2519, + 2519, 2520, 2520, 2520, 2521, 2521, 2521, 2522, 2522, 2522, + 2518, 2525, 2525, 2525, 2526, 2526, 2526, 2527, 2527, 2527, + + 2531, 2531, 2531, 2532, 2532, 2532, 2533, 2533, 2533, 2534, + 2534, 2534, 2535, 2535, 2535, 2536, 2536, 2536, 2532, 2537, + 2537, 2537, 2575, 2578, 2534, 2579, 2582, 2535, 2540, 2540, + 2540, 2541, 2541, 2541, 2537, 2542, 2542, 2542, 2543, 2543, + 2543, 2583, 2587, 2540, 2589, 2591, 2541, 2544, 2544, 2544, + 2542, 2545, 2545, 2545, 2546, 2546, 2546, 2549, 2549, 2549, + 2592, 2593, 2544, 2550, 2550, 2550, 2551, 2551, 2551, 2553, + 2553, 2553, 2549, 2556, 2556, 2556, 2557, 2557, 2557, 2594, + 2599, 2551, 2600, 2601, 2553, 2558, 2558, 2558, 2559, 2559, + 2559, 2557, 2560, 2560, 2560, 2561, 2561, 2561, 2564, 2564, + + 2564, 2565, 2565, 
2565, 2566, 2566, 2566, 2560, 2567, 2567, + 2567, 2602, 2608, 2564, 2568, 2568, 2568, 2611, 2617, 2566, + 2569, 2569, 2569, 2572, 2572, 2572, 2573, 2573, 2573, 2574, + 2574, 2574, 2576, 2576, 2576, 2577, 2577, 2577, 2580, 2580, + 2580, 2573, 2581, 2581, 2581, 2618, 2619, 2576, 2584, 2584, + 2584, 2585, 2585, 2585, 2586, 2586, 2586, 2588, 2588, 2588, + 2590, 2590, 2590, 2595, 2595, 2595, 2596, 2596, 2596, 2586, + 2597, 2597, 2597, 2598, 2598, 2598, 2623, 2624, 2595, 2603, + 2603, 2603, 2604, 2604, 2604, 2597, 2625, 2627, 2598, 2605, + 2605, 2605, 2606, 2606, 2606, 2610, 2628, 2604, 2607, 2607, + + 2607, 2609, 2609, 2609, 2612, 2612, 2612, 2606, 2610, 2613, + 2613, 2613, 2614, 2614, 2614, 2629, 2609, 2615, 2615, 2615, + 2616, 2616, 2616, 2620, 2620, 2620, 2630, 2614, 2621, 2621, + 2621, 2631, 2615, 2622, 2622, 2622, 2633, 2634, 2620, 2626, + 2626, 2626, 2635, 2636, 2637, 2638, 2640, 2641, 2644, 2644, + 2644, 2644, 2644, 2644, 2644, 2644, 2645, 2645, 2645, 2645, + 2645, 2645, 2645, 2645, 2646, 2646, 2646, 2646, 2646, 2646, + 2646, 2646, 2647, 2647, 2647, 2647, 2647, 2647, 2647, 2647, + 2648, 2648, 2648, 2648, 2648, 2648, 2648, 2648, 2649, 2649, + 2649, 2649, 2649, 2649, 2649, 2649, 2650, 2650, 2650, 2650, + + 2650, 2650, 2650, 2650, 2651, 2651, 2651, 2651, 2651, 2651, + 2651, 2651, 2652, 2652, 2652, 2652, 2652, 2652, 2652, 2652, + 2653, 2653, 2653, 2653, 2653, 2653, 2653, 2653, 2654, 2654, + 2654, 2654, 2654, 2654, 2654, 2654, 2655, 2655, 2655, 2655, + 2655, 2655, 2655, 2655, 2656, 2656, 2656, 2656, 2656, 2656, + 2656, 2656, 2657, 2657, 2657, 2657, 2657, 2657, 2657, 2657, + 2658, 2658, 2658, 2658, 2658, 2658, 2658, 2658, 2659, 2659, + 2659, 2659, 2659, 2659, 2659, 2659, 2660, 2660, 2660, 2660, + 2660, 2660, 2660, 2660, 2661, 2661, 2661, 2661, 2661, 2661, + 2661, 2661, 2662, 2662, 2662, 2662, 2662, 2662, 2662, 2662, + + 2663, 2663, 2663, 2663, 2663, 2663, 2663, 2663, 2664, 2664, + 2664, 2664, 2664, 2664, 2664, 2664, 2665, 2665, 2665, 2665, + 2665, 2665, 2665, 2665, 
2666, 2666, 2666, 2666, 2666, 2666, + 2666, 2666, 2667, 2667, 2667, 2667, 2667, 2667, 2667, 2667, + 2668, 2668, 2668, 2668, 2668, 2668, 2668, 2668, 2669, 2669, + 2669, 2669, 2669, 2669, 2669, 2669, 2670, 2670, 2670, 2670, + 2670, 2670, 2670, 2670, 2671, 2671, 2671, 2671, 2671, 2671, + 2671, 2671, 2672, 2672, 2672, 2672, 2672, 2672, 2672, 2672, + 2673, 2673, 2673, 2673, 2673, 2673, 2673, 2673, 2674, 2674, + 2674, 2674, 2674, 2674, 2674, 2674, 2675, 2675, 2675, 2675, + + 2675, 2675, 2675, 2675, 2676, 2676, 2676, 2676, 2676, 2676, + 2676, 2676, 2677, 2677, 2677, 2677, 2677, 2677, 2677, 2677, + 2678, 2678, 2678, 2678, 2678, 2678, 2678, 2678, 2679, 2679, + 2679, 2679, 2679, 2679, 2679, 2679, 2680, 2680, 2680, 2680, + 2680, 2680, 2680, 2680, 2681, 2681, 2681, 2681, 2681, 2681, + 2681, 2681, 2682, 2682, 2682, 2682, 2682, 2682, 2682, 2682, + 2683, 2683, 2683, 2683, 2683, 2683, 2683, 2683, 2684, 2684, + 2684, 2684, 2684, 2684, 2684, 2684, 2685, 2685, 2685, 2685, + 2685, 2685, 2685, 2685, 2686, 2686, 2686, 2686, 2686, 2686, + 2686, 2686, 2687, 2687, 2687, 2687, 2687, 2687, 2687, 2687, + + 2688, 2688, 2688, 2688, 2688, 2688, 2688, 2688, 2689, 2689, + 2689, 2689, 2689, 2689, 2689, 2689, 2690, 2690, 2690, 2690, + 2690, 2690, 2690, 2690, 2691, 2691, 2691, 2691, 2691, 2691, + 2691, 2691, 2692, 2692, 2692, 2692, 2692, 2692, 2692, 2692, + 2693, 2693, 2693, 2693, 2693, 2693, 2693, 2693, 2694, 2694, + 2694, 2694, 2694, 2694, 2694, 2694, 2695, 2695, 2695, 2695, + 2695, 2695, 2695, 2695, 2696, 2696, 2696, 2696, 2696, 2696, + 2696, 2696, 2697, 2697, 2697, 2697, 2697, 2697, 2697, 2697, + 2698, 2698, 2698, 2698, 2698, 2698, 2698, 2698, 2699, 2699, + 2699, 2699, 2699, 2699, 2699, 2699, 2700, 2700, 2700, 2700, + + 2700, 2700, 2700, 2700, 2701, 2701, 2701, 2701, 2701, 2701, + 2701, 2701, 2702, 2702, 2702, 2702, 2702, 2702, 2702, 2702, + 2703, 2703, 2703, 2703, 2703, 2703, 2703, 2703, 2704, 2704, + 2704, 2704, 2704, 2704, 2704, 2704, 2705, 2705, 2705, 2705, + 2705, 2705, 2705, 2705, 2706, 
2706, 2706, 2706, 2706, 2706, + 2706, 2706, 2707, 2707, 2707, 2707, 2707, 2707, 2707, 2707, + 2708, 2708, 2708, 2708, 2708, 2708, 2708, 2708, 2709, 2709, + 2709, 2709, 2709, 2709, 2709, 2709, 2710, 2710, 2710, 2710, + 2710, 2710, 2710, 2710, 2711, 2711, 2711, 2711, 2711, 2711, + 2711, 2711, 2712, 2712, 2712, 2712, 2712, 2712, 2712, 2712, + + 2713, 2713, 2713, 2713, 2713, 2713, 2713, 2713, 2714, 2714, + 2715, 2715, 2716, 2717, 2718, 2718, 2719, 2719, 2720, 2720, + 2721, 2721, 2722, 2722, 2723, 2723, 2724, 2725, 2725, 2726, + 2726, 2727, 2727, 2728, 2728, 2729, 2729, 2730, 2730, 2731, + 2731, 2732, 2732, 2733, 2733, 2734, 2734, 2735, 2736, 2736, + 2737, 2737, 2738, 2738, 2739, 2739, 2740, 2741, 2741, 2742, + 2742, 2743, 2744, 2745, 2747, 2745, 2745, 2745, 2746, 2748, + 2746, 2746, 2746, 2749, 2750, 2751, 2752, 2753, 2754, 2755, + 2756, 2757, 2758, 2759, 2760, 2761, 2762, 2762, 2763, 2763, + 2764, 2765, 2765, 2766, 2766, 2767, 2768, 2768, 2769, 2769, + + 2770, 2771, 2770, 2770, 2770, 2772, 2773, 2774, 2775, 2776, + 2777, 2777, 2778, 2778, 2779, 2779, 2780, 2780, 2781, 2781, + 2782, 2782, 2783, 2783, 2784, 2784, 2785, 2785, 2786, 2786, + 2787, 2787, 2788, 2788, 2789, 2789, 2790, 2790, 2791, 2791, + 2792, 2792, 2793, 2793, 2794, 2794, 2795, 2795, 2796, 2796, + 2797, 2797, 2798, 2798, 2799, 2799, 2800, 2800, 2801, 2801, + 2802, 2802, 2803, 2804, 2805, 2806, 2807, 2808, 2809, 2810, + 2810, 2811, 2811, 2812, 2813, 2813, 2814, 2814, 2815, 2815, + 2816, 2816, 2817, 2818, 2819, 2820, 2820, 2821, 2821, 2822, + 2823, 2824, 2824, 2829, 2824, 2824, 2824, 2824, 2824, 2825, + + 2830, 2840, 2825, 2825, 2825, 2825, 2825, 2826, 2826, 2827, + 2827, 2828, 2856, 2828, 2831, 2831, 2832, 2832, 2833, 2863, + 2833, 2834, 2834, 2835, 2835, 2836, 2867, 2836, 2837, 2837, + 2838, 2838, 2839, 2868, 2839, 2841, 2841, 2842, 2842, 2843, + 2871, 2843, 2844, 2844, 2845, 2845, 2846, 2872, 2846, 2847, + 2847, 2848, 2848, 2849, 2873, 2849, 2850, 2850, 2851, 2851, + 2852, 2874, 2852, 2853, 2853, 2854, 
2854, 2855, 2875, 2855, + 2857, 2857, 2858, 2858, 2859, 2876, 2859, 2860, 2860, 2861, + 2861, 2862, 2877, 2862, 2864, 2864, 2865, 2865, 2866, 2878, + 2866, 2869, 2879, 2869, 2869, 2869, 2870, 2880, 2870, 2870, + + 2870, 2881, 2882, 2883, 2884, 2885, 2886, 2886, 2887, 2887, + 2888, 2889, 2888, 2890, 2890, 2891, 2891, 2892, 2893, 2892, + 2894, 2894, 2895, 2895, 2896, 2898, 2896, 2897, 2899, 2897, + 2897, 2897, 2900, 2901, 2902, 2903, 2904, 2904, 2905, 2905, + 2906, 2943, 2906, 2907, 2907, 2908, 2908, 2909, 2944, 2909, + 2910, 2910, 2911, 2911, 2912, 2945, 2912, 2913, 2913, 2914, + 2914, 2915, 2946, 2915, 2916, 2916, 2917, 2917, 2918, 2947, + 2918, 2919, 2919, 2920, 2920, 2921, 2948, 2921, 2922, 2922, + 2923, 2923, 2924, 2949, 2924, 2925, 2925, 2926, 2926, 2927, + 2953, 2927, 2928, 2928, 2929, 2929, 2930, 2960, 2930, 2931, + + 2931, 2932, 2932, 2933, 2961, 2933, 2934, 2934, 2935, 2935, + 2936, 2962, 2936, 2937, 2937, 2938, 2938, 2939, 2966, 2939, + 2940, 2940, 2941, 2941, 2942, 2967, 2942, 2950, 2950, 2951, + 2951, 2952, 2970, 2952, 2954, 2954, 2955, 2955, 2956, 2971, + 2956, 2957, 2957, 2958, 2958, 2959, 2972, 2959, 2963, 2963, + 2964, 2964, 2965, 2973, 2965, 2968, 2968, 2974, 2968, 2968, + 2968, 2968, 2968, 2969, 2975, 2976, 2969, 2969, 2969, 2969, + 2969, 2977, 2978, 2979, 2980, 2981, 2982, 2983, 2984, 2985, + 2986, 2987, 2988, 2989, 2990, 2991, 2992, 2993, 2994, 2995, + 2996, 2997, 2998, 1251, 1250, 1249, 1248, 1247, 1246, 1245, + + 1241, 1240, 1239, 1238, 1237, 1234, 1232, 1231, 1230, 1229, + 1228, 1227, 1226, 1225, 1224, 1223, 1220, 1214, 1213, 1212, + 1208, 1207, 1206, 1205, 1204, 1203, 1202, 1201, 1200, 1199, + 1198, 1196, 1193, 1192, 1191, 1190, 1189, 1187, 1186, 1185, + 1183, 1180, 1165, 1164, 1128, 1119, 1118, 1116, 1110, 1108, + 1106, 1104, 1102, 1101, 1100, 1099, 1097, 1074, 1072, 1071, + 1068, 1066, 1065, 1062, 1061, 1060, 1059, 1058, 1057, 1056, + 1055, 1054, 1053, 1052, 1051, 1050, 1049, 1048, 1043, 1042, + 1041, 1040, 1039, 1038, 1037, 1036, 1035, 
1034, 1033, 1032, + 1030, 1025, 1024, 1023, 1015, 1014, 1013, 1012, 1011, 1010, + + 1009, 1008, 1007, 1006, 1005, 1004, 1003, 1001, 1000, 999, + 998, 997, 996, 995, 994, 992, 988, 984, 983, 982, + 981, 977, 976, 975, 973, 970, 968, 964, 963, 962, + 960, 959, 958, 957, 955, 953, 952, 950, 949, 946, + 945, 944, 943, 941, 939, 930, 927, 926, 925, 923, + 922, 921, 919, 918, 917, 915, 914, 913, 911, 910, + 909, 907, 906, 905, 903, 902, 901, 899, 898, 897, + 896, 893, 892, 891, 889, 888, 887, 885, 884, 883, + 881, 880, 879, 877, 876, 875, 867, 866, 865, 864, + 861, 860, 859, 856, 855, 854, 851, 850, 849, 846, + + 844, 840, 830, 827, 825, 823, 821, 819, 817, 816, + 815, 814, 812, 811, 810, 809, 808, 799, 798, 797, + 796, 795, 793, 792, 791, 788, 787, 786, 784, 783, + 782, 779, 778, 777, 775, 774, 773, 771, 770, 769, + 767, 766, 765, 763, 762, 761, 758, 757, 756, 754, + 753, 752, 750, 749, 748, 745, 744, 741, 740, 739, + 738, 737, 736, 735, 734, 731, 729, 727, 726, 725, + 724, 720, 719, 718, 717, 716, 713, 712, 710, 703, + 701, 700, 699, 698, 697, 696, 695, 694, 693, 691, + 690, 689, 688, 687, 685, 684, 674, 673, 669, 668, + + 664, 663, 662, 658, 657, 650, 649, 648, 643, 642, + 635, 634, 630, 629, 623, 622, 621, 619, 618, 617, + 613, 612, 609, 608, 604, 603, 602, 599, 598, 590, + 589, 588, 584, 583, 580, 579, 577, 576, 561, 560, + 559, 558, 557, 556, 555, 554, 552, 551, 550, 547, + 546, 545, 542, 541, 540, 539, 538, 537, 536, 535, + 534, 532, 531, 530, 526, 525, 522, 521, 520, 519, + 518, 517, 516, 515, 514, 513, 512, 511, 509, 508, + 507, 494, 493, 492, 490, 489, 487, 486, 485, 477, + 476, 475, 468, 467, 466, 462, 461, 460, 451, 450, + + 413, 404, 403, 399, 398, 394, 393, 389, 388, 384, + 383, 375, 374, 372, 371, 370, 366, 365, 361, 360, + 347, 346, 345, 344, 343, 341, 340, 339, 336, 335, + 334, 333, 332, 331, 322, 321, 320, 316, 315, 314, + 310, 309, 308, 304, 303, 302, 298, 297, 296, 288, + 287, 286, 282, 281, 280, 276, 275, 274, 262, 261, + 260, 256, 255, 254, 251, 
250, 240, 239, 232, 231, + 227, 223, 217, 208, 194, 156, 154, 152, 148, 146, + 144, 138, 136, 134, 132, 112, 102, 42, 40, 38, + 36, 1, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643 + } ; + +static yy_state_type yy_last_accepting_state; +static char *yy_last_accepting_cpos; + +extern int yy_flex_debug; +int yy_flex_debug = 0; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. + */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +char *yytext; +#line 1 "read_input.l" +/* + * $Id: read_input.c,v 1.69 2008/08/01 14:04:29 urbach Exp $ + * + * This is the parser. (Dec 2002) + * The .c-file is generated from .l using flex. + * Please edit read_input.l instead of read_input.c! + * flex should be said to be case insensitive! 
+ * + * After modifiing read_input.l please call once + * make flex_read_input + * to update read_input.c + * + * Autor: Carsten Urbach + * urbach@physik.fu-berlin.de + */ +#line 28 "read_input.l" +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include"global.h" +#include"read_input.h" +#include"default_input_values.h" + + /* Name of the parsing routine */ +#define YY_DECL int parse_config() +#define YY_NO_UNPUT + + /* declaration of input parameters */ + int line_of_file=1; + int verbose=0; + int startoption; + int Ntherm; + int Nmeas; + int Nskip; + int integtyp; + int int_n[4]; + double lambda[4]; + int nsmall; + int solver_flag; + int gmres_m_parameter, gmresdr_nr_ev; + int operator_flag; + int matrix_element_flag; + int save_config_flag; + int save_prop_flag; + int save_prop_g2_flag; + int write_cp_flag; + int cp_interval; + int nstore; + int index_start, index_end; + int random_seed; + double dtau, tau; + int Nsteps; + char rlxd_input_filename[100]; + char gauge_input_filename[100]; + int first_prop_flag; + int max_solver_iterations; + double solver_precision; + int mass_number; + int read_source_flag; + char source_input_filename[100]; + int return_check_flag, return_check_interval; + int source_format_flag; + int source_time_slice; + int gauge_precision_read_flag; + int gauge_precision_write_flag; + int prop_precision_flag; + int gmres_m_parameter, gmresdr_nr_ev; + int reproduce_randomnumber_flag; + double stout_rho; + int stout_no_iter; + int use_stout_flag; + int phmc_no_flavours; + int phmc_heavy_timescale; + int phmc_exact_poly; + int compute_evs; + int phmc_compute_evs; + double stilde_max; + double stilde_min; + int degree_of_p; + int propagator_splitted; + int source_splitted; + int source_location; + int no_eigenvalues; + double eigenvalue_precision; + int sub_evs_cg_flag; + int even_odd_flag; + int write_prop_format_flag; + int online_measurement_flag; + int online_measurement_freq; + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +#line 2957 "" + +#define INITIAL 0 +#define BETA 1 +#define STARTCOND 2 +#define THERMSWEEPS 3 +#define NMEAS 4 +#define KAPPA 5 +#define ACCPTILDE 6 +#define ACCHFIN 7 +#define RECEV 8 +#define MUBAR 9 +#define EPSBAR 10 +#define MU 11 +#define MU2 12 +#define MU3 13 +#define SEED 14 +#define Q1 15 +#define Q2 16 +#define DTAU 17 +#define TAU 18 +#define NSTEPS 19 +#define CSW 20 +#define INTTYP 21 +#define NSMALL 22 +#define NSKIP 23 +#define RLXDINPUTFILE 24 +#define GAUGEINPUTFILE 25 +#define GAUGERPREC 26 +#define GAUGEWPREC 27 +#define SOLVFLAG 28 +#define OPFLAG 29 +#define MEFLAG 30 +#define SAVECONF 31 +#define SAVEPROP 32 +#define SAVEPRG2 33 +#define WRITECP 34 +#define CPINT 35 +#define NSTORE 36 +#define TT 37 +#define LL 38 +#define LLX 39 +#define LLY 40 +#define LLZ 41 +#define NPROCX 42 +#define NPROCY 43 +#define NPROCZ 44 +#define IOPROC 45 +#define IDX 46 +#define FPROP 47 +#define CGMAX 48 +#define BCGMAX 49 +#define BOUND 50 +#define SITER 51 +#define SPREC 52 +#define MNR 53 +#define RGIC 54 +#define READSOURCE 55 +#define SOURCEFORMAT 56 +#define SOURCEFILE 57 +#define SOURCETS 58 +#define INT0 59 +#define INT1 60 +#define INT2 61 +#define INT3 62 +#define INT4 63 +#define LAMBDA0 64 +#define LAMBDA1 65 +#define LAMBDA2 66 +#define LAMBDA3 67 +#define LAMBDA4 68 +#define RELPREC 69 +#define FORCEPREC 70 +#define FORCEPREC1 71 +#define FORCEPREC2 72 +#define FORCEPREC3 73 +#define ACCPREC 74 +#define ACCPREC1 75 +#define ACCPREC2 76 +#define ACCPREC3 77 +#define REVCHECK 78 +#define REVINT 79 +#define DEBUG 80 +#define CSGN1 81 +#define CSGN2 82 +#define CSGN3 83 +#define GMRESM 84 +#define GMRESDRNEV 85 +#define REPRORND 86 +#define SLOPPYPREC 87 +#define USESTOUT 88 +#define STOUTRHO 89 +#define STOUTITER 90 +#define PHMCFLAV 91 +#define COMPUTEEVS 92 +#define 
PCOMPUTEEVS 93 +#define PPP 94 +#define SMAX 95 +#define SMIN 96 +#define DEGP 97 +#define SPLITPROP 98 +#define SPLITSOURCE 99 +#define SRCLOC 100 +#define SUBEVCG 101 +#define NOEV 102 +#define PRECEV 103 +#define HEAVYTS 104 +#define EO 105 +#define WRPROPFLAG 106 +#define PROPPREC 107 +#define PROPTYPE 108 +#define ONMEAS 109 +#define ONFREQ 110 +#define COMMENT 111 +#define ERROR 112 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +static int yy_init_globals (void ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. + */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int yywrap (void ); +#else +extern int yywrap (void ); +#endif +#endif + + static void yyunput (int c,char *buf_ptr ); + +#ifndef yytext_ptr +static void yy_flex_strncpy (char *,yyconst char *,int ); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * ); +#endif + +#ifndef YY_NO_INPUT + +#ifdef __cplusplus +static int yyinput (void ); +#else +static int input (void ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#define YY_READ_BUF_SIZE 8192 +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). + */ +#define ECHO (void) fwrite( yytext, yyleng, 1, yyout ) +#endif + +/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, + * is returned in "result". 
+ */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ + { \ + int c = '*'; \ + size_t n; \ + for ( n = 0; n < max_size && \ + (c = getc( yyin )) != EOF && c != '\n'; ++n ) \ + buf[n] = (char) c; \ + if ( c == '\n' ) \ + buf[n++] = (char) c; \ + if ( c == EOF && ferror( yyin ) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + result = n; \ + } \ + else \ + { \ + errno=0; \ + while ( (result = fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(yyin); \ + } \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. + */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg ) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int yylex (void); + +#define YY_DECL int yylex (void) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after yytext and yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. */ +#ifndef YY_BREAK +#define YY_BREAK break; +#endif + +#define YY_RULE_SETUP \ + if ( yyleng > 0 ) \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = \ + (yytext[yyleng - 1] == '\n'); \ + YY_USER_ACTION + +/** The main scanner function which does all the work. 
+ */ +YY_DECL +{ + register yy_state_type yy_current_state; + register char *yy_cp, *yy_bp; + register int yy_act; + +#line 222 "read_input.l" + +#line 3227 "" + + if ( !(yy_init) ) + { + (yy_init) = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! (yy_start) ) + (yy_start) = 1; /* first start state */ + + if ( ! yyin ) + yyin = stdin; + + if ( ! yyout ) + yyout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer(yyin,YY_BUF_SIZE ); + } + + yy_load_buffer_state( ); + } + + while ( 1 ) /* loops until end-of-file is reached */ + { + yy_cp = (yy_c_buf_p); + + /* Support of yytext. */ + *yy_cp = (yy_hold_char); + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. + */ + yy_bp = yy_cp; + + yy_current_state = (yy_start); + yy_current_state += YY_AT_BOL(); +yy_match: + do + { + register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)]; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 2644 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + ++yy_cp; + } + while ( yy_current_state != 2643 ); + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. 
*/ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = (yy_hold_char); + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + goto yy_find_action; + +case 1: +/* rule 1 can match eol */ +YY_RULE_SETUP +#line 223 "read_input.l" +BEGIN(TT); + YY_BREAK +case 2: +/* rule 2 can match eol */ +YY_RULE_SETUP +#line 224 "read_input.l" +BEGIN(LL); + YY_BREAK +case 3: +/* rule 3 can match eol */ +YY_RULE_SETUP +#line 225 "read_input.l" +BEGIN(LLX); + YY_BREAK +case 4: +/* rule 4 can match eol */ +YY_RULE_SETUP +#line 226 "read_input.l" +BEGIN(LLY); + YY_BREAK +case 5: +/* rule 5 can match eol */ +YY_RULE_SETUP +#line 227 "read_input.l" +BEGIN(LLZ); + YY_BREAK +case 6: +/* rule 6 can match eol */ +YY_RULE_SETUP +#line 228 "read_input.l" +BEGIN(NPROCX); + YY_BREAK +case 7: +/* rule 7 can match eol */ +YY_RULE_SETUP +#line 229 "read_input.l" +BEGIN(NPROCY); + YY_BREAK +case 8: +/* rule 8 can match eol */ +YY_RULE_SETUP +#line 230 "read_input.l" +BEGIN(NPROCZ); + YY_BREAK +case 9: +/* rule 9 can match eol */ +YY_RULE_SETUP +#line 231 "read_input.l" +BEGIN(KAPPA); + YY_BREAK +case 10: +/* rule 10 can match eol */ +YY_RULE_SETUP +#line 232 "read_input.l" +BEGIN(MU); + YY_BREAK +case 11: +/* rule 11 can match eol */ +YY_RULE_SETUP +#line 233 "read_input.l" +BEGIN(MU2); + YY_BREAK +case 12: +/* rule 12 can match eol */ +YY_RULE_SETUP +#line 234 "read_input.l" +BEGIN(MU3); + YY_BREAK +case 13: +/* rule 13 can match eol */ +YY_RULE_SETUP +#line 235 "read_input.l" +BEGIN(MUBAR); + YY_BREAK +case 14: +/* rule 14 can match eol */ +YY_RULE_SETUP +#line 236 "read_input.l" +BEGIN(MUBAR); + YY_BREAK +case 15: +/* rule 15 can match eol */ +YY_RULE_SETUP +#line 237 "read_input.l" +BEGIN(PPP); + YY_BREAK +case 16: +/* rule 16 can match eol */ +YY_RULE_SETUP +#line 238 "read_input.l" +BEGIN(EPSBAR); + YY_BREAK +case 17: +/* rule 17 can match eol */ +YY_RULE_SETUP +#line 
239 "read_input.l" +BEGIN(EPSBAR); + YY_BREAK +case 18: +/* rule 18 can match eol */ +YY_RULE_SETUP +#line 240 "read_input.l" +BEGIN(BETA); + YY_BREAK +case 19: +/* rule 19 can match eol */ +YY_RULE_SETUP +#line 241 "read_input.l" +BEGIN(ACCPTILDE); + YY_BREAK +case 20: +/* rule 20 can match eol */ +YY_RULE_SETUP +#line 242 "read_input.l" +BEGIN(ACCPTILDE); + YY_BREAK +case 21: +/* rule 21 can match eol */ +YY_RULE_SETUP +#line 243 "read_input.l" +BEGIN(ACCHFIN); + YY_BREAK +case 22: +/* rule 22 can match eol */ +YY_RULE_SETUP +#line 244 "read_input.l" +BEGIN(ACCHFIN); + YY_BREAK +case 23: +/* rule 23 can match eol */ +YY_RULE_SETUP +#line 245 "read_input.l" +BEGIN(RECEV); + YY_BREAK +case 24: +/* rule 24 can match eol */ +YY_RULE_SETUP +#line 246 "read_input.l" +BEGIN(RECEV); + YY_BREAK +case 25: +/* rule 25 can match eol */ +YY_RULE_SETUP +#line 247 "read_input.l" +BEGIN(NOEV); + YY_BREAK +case 26: +/* rule 26 can match eol */ +YY_RULE_SETUP +#line 248 "read_input.l" +BEGIN(PRECEV); + YY_BREAK +case 27: +/* rule 27 can match eol */ +YY_RULE_SETUP +#line 249 "read_input.l" +BEGIN(SEED); + YY_BREAK +case 28: +/* rule 28 can match eol */ +YY_RULE_SETUP +#line 250 "read_input.l" +BEGIN(STARTCOND); + YY_BREAK +case 29: +/* rule 29 can match eol */ +YY_RULE_SETUP +#line 251 "read_input.l" +BEGIN(THERMSWEEPS); + YY_BREAK +case 30: +/* rule 30 can match eol */ +YY_RULE_SETUP +#line 252 "read_input.l" +BEGIN(NMEAS); + YY_BREAK +case 31: +/* rule 31 can match eol */ +YY_RULE_SETUP +#line 253 "read_input.l" +BEGIN(NSKIP); + YY_BREAK +case 32: +/* rule 32 can match eol */ +YY_RULE_SETUP +#line 254 "read_input.l" +BEGIN(GAUGEINPUTFILE); + YY_BREAK +case 33: +/* rule 33 can match eol */ +YY_RULE_SETUP +#line 255 "read_input.l" +BEGIN(RLXDINPUTFILE); + YY_BREAK +case 34: +/* rule 34 can match eol */ +YY_RULE_SETUP +#line 256 "read_input.l" +BEGIN(SOLVFLAG); + YY_BREAK +case 35: +/* rule 35 can match eol */ +YY_RULE_SETUP +#line 257 "read_input.l" +BEGIN(SUBEVCG); + YY_BREAK 
+case 36: +/* rule 36 can match eol */ +YY_RULE_SETUP +#line 258 "read_input.l" +BEGIN(OPFLAG); + YY_BREAK +case 37: +/* rule 37 can match eol */ +YY_RULE_SETUP +#line 259 "read_input.l" +BEGIN(MEFLAG); + YY_BREAK +case 38: +/* rule 38 can match eol */ +YY_RULE_SETUP +#line 260 "read_input.l" +BEGIN(SAVECONF); + YY_BREAK +case 39: +/* rule 39 can match eol */ +YY_RULE_SETUP +#line 261 "read_input.l" +BEGIN(SAVEPROP); + YY_BREAK +case 40: +/* rule 40 can match eol */ +YY_RULE_SETUP +#line 262 "read_input.l" +BEGIN(SAVEPRG2); + YY_BREAK +case 41: +/* rule 41 can match eol */ +YY_RULE_SETUP +#line 263 "read_input.l" +BEGIN(WRITECP); + YY_BREAK +case 42: +/* rule 42 can match eol */ +YY_RULE_SETUP +#line 264 "read_input.l" +BEGIN(CPINT); + YY_BREAK +case 43: +/* rule 43 can match eol */ +YY_RULE_SETUP +#line 265 "read_input.l" +BEGIN(GAUGEINPUTFILE); + YY_BREAK +case 44: +/* rule 44 can match eol */ +YY_RULE_SETUP +#line 266 "read_input.l" +BEGIN(RLXDINPUTFILE); + YY_BREAK +case 45: +/* rule 45 can match eol */ +YY_RULE_SETUP +#line 267 "read_input.l" +BEGIN(NSTORE); + YY_BREAK +case 46: +/* rule 46 can match eol */ +YY_RULE_SETUP +#line 268 "read_input.l" +BEGIN(IOPROC); + YY_BREAK +case 47: +/* rule 47 can match eol */ +YY_RULE_SETUP +#line 269 "read_input.l" +BEGIN(IDX); + YY_BREAK +case 48: +/* rule 48 can match eol */ +YY_RULE_SETUP +#line 270 "read_input.l" +BEGIN(FPROP); + YY_BREAK +case 49: +/* rule 49 can match eol */ +YY_RULE_SETUP +#line 271 "read_input.l" +BEGIN(CSW); + YY_BREAK +case 50: +/* rule 50 can match eol */ +YY_RULE_SETUP +#line 272 "read_input.l" +BEGIN(Q1); + YY_BREAK +case 51: +/* rule 51 can match eol */ +YY_RULE_SETUP +#line 273 "read_input.l" +BEGIN(Q2); + YY_BREAK +case 52: +/* rule 52 can match eol */ +YY_RULE_SETUP +#line 274 "read_input.l" +BEGIN(INTTYP); + YY_BREAK +case 53: +/* rule 53 can match eol */ +YY_RULE_SETUP +#line 275 "read_input.l" +BEGIN(NSMALL); + YY_BREAK +case 54: +/* rule 54 can match eol */ +YY_RULE_SETUP +#line 276 
"read_input.l" +BEGIN(DTAU); + YY_BREAK +case 55: +/* rule 55 can match eol */ +YY_RULE_SETUP +#line 277 "read_input.l" +BEGIN(TAU); + YY_BREAK +case 56: +/* rule 56 can match eol */ +YY_RULE_SETUP +#line 278 "read_input.l" +BEGIN(NSTEPS); + YY_BREAK +case 57: +/* rule 57 can match eol */ +YY_RULE_SETUP +#line 279 "read_input.l" +BEGIN(BCGMAX); + YY_BREAK +case 58: +/* rule 58 can match eol */ +YY_RULE_SETUP +#line 280 "read_input.l" +BEGIN(CGMAX); + YY_BREAK +case 59: +/* rule 59 can match eol */ +YY_RULE_SETUP +#line 281 "read_input.l" +BEGIN(BOUND); + YY_BREAK +case 60: +/* rule 60 can match eol */ +YY_RULE_SETUP +#line 282 "read_input.l" +BEGIN(BOUND); + YY_BREAK +case 61: +/* rule 61 can match eol */ +YY_RULE_SETUP +#line 283 "read_input.l" +BEGIN(SITER); + YY_BREAK +case 62: +/* rule 62 can match eol */ +YY_RULE_SETUP +#line 284 "read_input.l" +BEGIN(SPREC); + YY_BREAK +case 63: +/* rule 63 can match eol */ +YY_RULE_SETUP +#line 285 "read_input.l" +BEGIN(MNR); + YY_BREAK +case 64: +/* rule 64 can match eol */ +YY_RULE_SETUP +#line 286 "read_input.l" +BEGIN(RGIC); + YY_BREAK +case 65: +/* rule 65 can match eol */ +YY_RULE_SETUP +#line 287 "read_input.l" +BEGIN(READSOURCE); + YY_BREAK +case 66: +/* rule 66 can match eol */ +YY_RULE_SETUP +#line 288 "read_input.l" +BEGIN(SOURCEFILE); + YY_BREAK +case 67: +/* rule 67 can match eol */ +YY_RULE_SETUP +#line 289 "read_input.l" +BEGIN(SOURCEFORMAT); + YY_BREAK +case 68: +/* rule 68 can match eol */ +YY_RULE_SETUP +#line 290 "read_input.l" +BEGIN(SOURCETS); + YY_BREAK +case 69: +/* rule 69 can match eol */ +YY_RULE_SETUP +#line 291 "read_input.l" +BEGIN(INT0); + YY_BREAK +case 70: +/* rule 70 can match eol */ +YY_RULE_SETUP +#line 292 "read_input.l" +BEGIN(INT0); + YY_BREAK +case 71: +/* rule 71 can match eol */ +YY_RULE_SETUP +#line 293 "read_input.l" +BEGIN(INT0); + YY_BREAK +case 72: +/* rule 72 can match eol */ +YY_RULE_SETUP +#line 294 "read_input.l" +BEGIN(INT1); + YY_BREAK +case 73: +/* rule 73 can match eol */ 
+YY_RULE_SETUP +#line 295 "read_input.l" +BEGIN(INT1); + YY_BREAK +case 74: +/* rule 74 can match eol */ +YY_RULE_SETUP +#line 296 "read_input.l" +BEGIN(INT2); + YY_BREAK +case 75: +/* rule 75 can match eol */ +YY_RULE_SETUP +#line 297 "read_input.l" +BEGIN(INT2); + YY_BREAK +case 76: +/* rule 76 can match eol */ +YY_RULE_SETUP +#line 298 "read_input.l" +BEGIN(INT3); + YY_BREAK +case 77: +/* rule 77 can match eol */ +YY_RULE_SETUP +#line 299 "read_input.l" +BEGIN(INT3); + YY_BREAK +case 78: +/* rule 78 can match eol */ +YY_RULE_SETUP +#line 300 "read_input.l" +BEGIN(INT4); + YY_BREAK +case 79: +/* rule 79 can match eol */ +YY_RULE_SETUP +#line 301 "read_input.l" +BEGIN(INT4); + YY_BREAK +case 80: +/* rule 80 can match eol */ +YY_RULE_SETUP +#line 302 "read_input.l" +BEGIN(LAMBDA0); + YY_BREAK +case 81: +/* rule 81 can match eol */ +YY_RULE_SETUP +#line 303 "read_input.l" +BEGIN(LAMBDA1); + YY_BREAK +case 82: +/* rule 82 can match eol */ +YY_RULE_SETUP +#line 304 "read_input.l" +BEGIN(LAMBDA2); + YY_BREAK +case 83: +/* rule 83 can match eol */ +YY_RULE_SETUP +#line 305 "read_input.l" +BEGIN(LAMBDA3); + YY_BREAK +case 84: +/* rule 84 can match eol */ +YY_RULE_SETUP +#line 306 "read_input.l" +BEGIN(LAMBDA4); + YY_BREAK +case 85: +/* rule 85 can match eol */ +YY_RULE_SETUP +#line 307 "read_input.l" +BEGIN(RELPREC); + YY_BREAK +case 86: +/* rule 86 can match eol */ +YY_RULE_SETUP +#line 308 "read_input.l" +BEGIN(FORCEPREC); + YY_BREAK +case 87: +/* rule 87 can match eol */ +YY_RULE_SETUP +#line 309 "read_input.l" +BEGIN(FORCEPREC1); + YY_BREAK +case 88: +/* rule 88 can match eol */ +YY_RULE_SETUP +#line 310 "read_input.l" +BEGIN(FORCEPREC2); + YY_BREAK +case 89: +/* rule 89 can match eol */ +YY_RULE_SETUP +#line 311 "read_input.l" +BEGIN(FORCEPREC3); + YY_BREAK +case 90: +/* rule 90 can match eol */ +YY_RULE_SETUP +#line 312 "read_input.l" +BEGIN(ACCPREC); + YY_BREAK +case 91: +/* rule 91 can match eol */ +YY_RULE_SETUP +#line 313 "read_input.l" +BEGIN(ACCPREC1); + 
YY_BREAK +case 92: +/* rule 92 can match eol */ +YY_RULE_SETUP +#line 314 "read_input.l" +BEGIN(ACCPREC2); + YY_BREAK +case 93: +/* rule 93 can match eol */ +YY_RULE_SETUP +#line 315 "read_input.l" +BEGIN(ACCPREC3); + YY_BREAK +case 94: +/* rule 94 can match eol */ +YY_RULE_SETUP +#line 316 "read_input.l" +BEGIN(REVCHECK); + YY_BREAK +case 95: +/* rule 95 can match eol */ +YY_RULE_SETUP +#line 317 "read_input.l" +BEGIN(REVINT); + YY_BREAK +case 96: +/* rule 96 can match eol */ +YY_RULE_SETUP +#line 318 "read_input.l" +BEGIN(DEBUG); + YY_BREAK +case 97: +/* rule 97 can match eol */ +YY_RULE_SETUP +#line 319 "read_input.l" +BEGIN(CSGN1); + YY_BREAK +case 98: +/* rule 98 can match eol */ +YY_RULE_SETUP +#line 320 "read_input.l" +BEGIN(CSGN1); + YY_BREAK +case 99: +/* rule 99 can match eol */ +YY_RULE_SETUP +#line 321 "read_input.l" +BEGIN(CSGN2); + YY_BREAK +case 100: +/* rule 100 can match eol */ +YY_RULE_SETUP +#line 322 "read_input.l" +BEGIN(CSGN2); + YY_BREAK +case 101: +/* rule 101 can match eol */ +YY_RULE_SETUP +#line 323 "read_input.l" +BEGIN(CSGN3); + YY_BREAK +case 102: +/* rule 102 can match eol */ +YY_RULE_SETUP +#line 324 "read_input.l" +BEGIN(CSGN3); + YY_BREAK +case 103: +/* rule 103 can match eol */ +YY_RULE_SETUP +#line 325 "read_input.l" +BEGIN(GMRESM); + YY_BREAK +case 104: +/* rule 104 can match eol */ +YY_RULE_SETUP +#line 326 "read_input.l" +BEGIN(GMRESDRNEV); + YY_BREAK +case 105: +/* rule 105 can match eol */ +YY_RULE_SETUP +#line 327 "read_input.l" +BEGIN(GAUGERPREC); + YY_BREAK +case 106: +/* rule 106 can match eol */ +YY_RULE_SETUP +#line 328 "read_input.l" +BEGIN(GAUGEWPREC); + YY_BREAK +case 107: +/* rule 107 can match eol */ +YY_RULE_SETUP +#line 329 "read_input.l" +BEGIN(PROPPREC); + YY_BREAK +case 108: +/* rule 108 can match eol */ +YY_RULE_SETUP +#line 330 "read_input.l" +BEGIN(REPRORND); + YY_BREAK +case 109: +/* rule 109 can match eol */ +YY_RULE_SETUP +#line 331 "read_input.l" +BEGIN(SLOPPYPREC); + YY_BREAK +case 110: +/* rule 110 
can match eol */ +YY_RULE_SETUP +#line 332 "read_input.l" +BEGIN(USESTOUT); + YY_BREAK +case 111: +/* rule 111 can match eol */ +YY_RULE_SETUP +#line 333 "read_input.l" +BEGIN(STOUTRHO); + YY_BREAK +case 112: +/* rule 112 can match eol */ +YY_RULE_SETUP +#line 334 "read_input.l" +BEGIN(STOUTITER); + YY_BREAK +case 113: +/* rule 113 can match eol */ +YY_RULE_SETUP +#line 335 "read_input.l" +BEGIN(PHMCFLAV); + YY_BREAK +case 114: +/* rule 114 can match eol */ +YY_RULE_SETUP +#line 336 "read_input.l" +BEGIN(PCOMPUTEEVS); + YY_BREAK +case 115: +/* rule 115 can match eol */ +YY_RULE_SETUP +#line 337 "read_input.l" +BEGIN(COMPUTEEVS); + YY_BREAK +case 116: +/* rule 116 can match eol */ +YY_RULE_SETUP +#line 338 "read_input.l" +BEGIN(SMAX); + YY_BREAK +case 117: +/* rule 117 can match eol */ +YY_RULE_SETUP +#line 339 "read_input.l" +BEGIN(SMIN); + YY_BREAK +case 118: +/* rule 118 can match eol */ +YY_RULE_SETUP +#line 340 "read_input.l" +BEGIN(DEGP); + YY_BREAK +case 119: +/* rule 119 can match eol */ +YY_RULE_SETUP +#line 341 "read_input.l" +BEGIN(SPLITPROP); + YY_BREAK +case 120: +/* rule 120 can match eol */ +YY_RULE_SETUP +#line 342 "read_input.l" +BEGIN(SPLITSOURCE); + YY_BREAK +case 121: +/* rule 121 can match eol */ +YY_RULE_SETUP +#line 343 "read_input.l" +BEGIN(SRCLOC); + YY_BREAK +case 122: +/* rule 122 can match eol */ +YY_RULE_SETUP +#line 344 "read_input.l" +BEGIN(HEAVYTS); + YY_BREAK +case 123: +/* rule 123 can match eol */ +YY_RULE_SETUP +#line 345 "read_input.l" +BEGIN(EO); + YY_BREAK +case 124: +/* rule 124 can match eol */ +YY_RULE_SETUP +#line 346 "read_input.l" +BEGIN(WRPROPFLAG); + YY_BREAK +case 125: +/* rule 125 can match eol */ +YY_RULE_SETUP +#line 347 "read_input.l" +BEGIN(WRPROPFLAG); + YY_BREAK +case 126: +/* rule 126 can match eol */ +YY_RULE_SETUP +#line 348 "read_input.l" +BEGIN(ONMEAS); + YY_BREAK +case 127: +/* rule 127 can match eol */ +YY_RULE_SETUP +#line 349 "read_input.l" +BEGIN(ONFREQ); + YY_BREAK +case 128: +YY_RULE_SETUP +#line 351 
"read_input.l" +{ +#ifndef FIXEDVOLUME + T_global = atoi(yytext); +#endif + if(verbose!=0) printf("T =%s\n", yytext); +} + YY_BREAK +case 129: +YY_RULE_SETUP +#line 357 "read_input.l" +{ +#ifndef FIXEDVOLUME + L = atoi(yytext); +#endif + if(verbose!=0) printf("L =%s\n", yytext); +} + YY_BREAK +case 130: +YY_RULE_SETUP +#line 363 "read_input.l" +{ +#ifndef FIXEDVOLUME + LX = atoi(yytext); +#endif + if(verbose!=0) printf("LX =%s\n", yytext); +} + YY_BREAK +case 131: +YY_RULE_SETUP +#line 369 "read_input.l" +{ +#ifndef FIXEDVOLUME + LY = atoi(yytext); +#endif + if(verbose!=0) printf("LY =%s\n", yytext); +} + YY_BREAK +case 132: +YY_RULE_SETUP +#line 375 "read_input.l" +{ +#ifndef FIXEDVOLUME + LZ = atoi(yytext); +#endif + if(verbose!=0) printf("LZ =%s\n", yytext); +} + YY_BREAK +case 133: +YY_RULE_SETUP +#line 381 "read_input.l" +{ +#ifndef FIXEDVOLUME + N_PROC_X = atoi(yytext); +#endif + if(verbose!=0) printf("Nr of processors in x direction = %s\n", yytext); +} + YY_BREAK +case 134: +YY_RULE_SETUP +#line 387 "read_input.l" +{ +#ifndef FIXEDVOLUME + N_PROC_Y = atoi(yytext); +#endif + if(verbose!=0) printf("Nr of processors in y direction = %s\n", yytext); +} + YY_BREAK +case 135: +YY_RULE_SETUP +#line 393 "read_input.l" +{ +#ifndef FIXEDVOLUME + N_PROC_Z = atoi(yytext); +#endif + if(verbose!=0) printf("Nr of processors in z direction = %s\n", yytext); +} + YY_BREAK +case 136: +YY_RULE_SETUP +#line 399 "read_input.l" +{ + random_seed=atoi(yytext); + if(verbose!=0) printf("seed=%s \n", yytext); +} + YY_BREAK +case 137: +YY_RULE_SETUP +#line 403 "read_input.l" +{ + g_kappa=atof(yytext); + if(verbose!=0) printf("kappa=%s \n", yytext); +} + YY_BREAK +case 138: +YY_RULE_SETUP +#line 407 "read_input.l" +{ + g_acc_Ptilde=atof(yytext); + if(verbose!=0) printf("Acc_Ptilde=%s \n", yytext); +} + YY_BREAK +case 139: +YY_RULE_SETUP +#line 411 "read_input.l" +{ + g_acc_Hfin=atof(yytext); + if(verbose!=0) printf("Acc_Hfin=%s \n", yytext); +} + YY_BREAK +case 140: +YY_RULE_SETUP 
+#line 415 "read_input.l" +{ + g_rec_ev = atoi(yytext); + if(verbose!=0) printf("Rec_EV=%s \n", yytext); +} + YY_BREAK +case 141: +YY_RULE_SETUP +#line 419 "read_input.l" +{ + g_mubar=atof(yytext); + if(verbose!=0) printf("mubar=%s \n", yytext); +} + YY_BREAK +case 142: +YY_RULE_SETUP +#line 423 "read_input.l" +{ + g_epsbar=atof(yytext); + if(verbose!=0) printf("epsbar=%s \n", yytext); +} + YY_BREAK +case 143: +YY_RULE_SETUP +#line 427 "read_input.l" +{ + g_mu1=atof(yytext); + if(verbose!=0) printf("mu=%s \n", yytext); +} + YY_BREAK +case 144: +YY_RULE_SETUP +#line 431 "read_input.l" +{ + g_mu2=atof(yytext); + if(verbose!=0) printf("mu2=%s \n", yytext); +} + YY_BREAK +case 145: +YY_RULE_SETUP +#line 435 "read_input.l" +{ + g_mu3=atof(yytext); + if(verbose!=0) printf("mu3=%s \n", yytext); +} + YY_BREAK +case 146: +YY_RULE_SETUP +#line 439 "read_input.l" +{ + g_beta=atof(yytext); + if(verbose!=0) printf("beta=%s \n",yytext); +} + YY_BREAK +case 147: +YY_RULE_SETUP +#line 443 "read_input.l" +{ + startoption=0; + if(verbose!=0) printf("Start Condition is %s \n",yytext); +} + YY_BREAK +case 148: +YY_RULE_SETUP +#line 447 "read_input.l" +{ + startoption=1; + if(verbose!=0) printf("Start Condition is %s \n",yytext); +} + YY_BREAK +case 149: +YY_RULE_SETUP +#line 451 "read_input.l" +{ + startoption=2; + if(verbose!=0) printf("Start Condition is %s \n",yytext); +} + YY_BREAK +case 150: +YY_RULE_SETUP +#line 455 "read_input.l" +{ + startoption=3; + if(verbose!=0) printf("Start Condition is %s \n",yytext); +} + YY_BREAK +case 151: +YY_RULE_SETUP +#line 459 "read_input.l" +{ + Ntherm=atoi(yytext); + if(verbose!=0) printf("Nterm= %s \n",yytext); +} + YY_BREAK +case 152: +YY_RULE_SETUP +#line 463 "read_input.l" +{ + Nmeas=atoi(yytext); + if(verbose!=0) printf("Nmeas= %s \n",yytext); +} + YY_BREAK +case 153: +YY_RULE_SETUP +#line 467 "read_input.l" +{ + Nskip=atoi(yytext); + if(verbose!=0) printf("Nskip= %s \n",yytext); +} + YY_BREAK +case 154: +YY_RULE_SETUP +#line 471 
"read_input.l" +{ + solver_flag=0; + if(verbose!=0) printf("Use BiCGStab Solver"); +} + YY_BREAK +case 155: +YY_RULE_SETUP +#line 475 "read_input.l" +{ + solver_flag=1; + if(verbose!=0) printf("Use CG Solver\n"); +} + YY_BREAK +case 156: +YY_RULE_SETUP +#line 479 "read_input.l" +{ + solver_flag=9; + if(verbose!=0) printf("Use PCG Solver (eigenvectors needed) \n"); +} + YY_BREAK +case 157: +YY_RULE_SETUP +#line 483 "read_input.l" +{ + solver_flag=2; + if(verbose!=0) printf("Use GMRES Solver\n"); +} + YY_BREAK +case 158: +YY_RULE_SETUP +#line 487 "read_input.l" +{ + solver_flag=7; + if(verbose!=0) printf("Use GCR Solver\n"); +} + YY_BREAK +case 159: +YY_RULE_SETUP +#line 491 "read_input.l" +{ + solver_flag=8; + if(verbose!=0) printf("Use GMRES-DR Solver\n"); +} + YY_BREAK +case 160: +YY_RULE_SETUP +#line 495 "read_input.l" +{ + solver_flag=3; + if(verbose!=0) printf("Use CGS Solver\n"); +} + YY_BREAK +case 161: +YY_RULE_SETUP +#line 499 "read_input.l" +{ + solver_flag=4; + if(verbose!=0) printf("Use MR Solver \n"); +} + YY_BREAK +case 162: +YY_RULE_SETUP +#line 503 "read_input.l" +{ + solver_flag=5; + if(verbose!=0) printf("Use BiCGstab(2) Solver \n"); +} + YY_BREAK +case 163: +YY_RULE_SETUP +#line 507 "read_input.l" +{ + solver_flag=6; + if(verbose!=0) printf("Use FGMRES solver (eigenvectors needed) \n"); +} + YY_BREAK +case 164: +YY_RULE_SETUP +#line 511 "read_input.l" +{ + gmres_m_parameter = atoi(yytext); + if(verbose!=0) printf("Use Krylov Space of size %d in GMRES \n", gmres_m_parameter); +} + YY_BREAK +case 165: +YY_RULE_SETUP +#line 515 "read_input.l" +{ + gmresdr_nr_ev = atoi(yytext); + if(verbose!=0) printf("Deflate %d eigenvectors in GMRES-DR \n", gmresdr_nr_ev); +} + YY_BREAK +case 166: +YY_RULE_SETUP +#line 519 "read_input.l" +{ + max_solver_iterations = atoi(yytext); + if(verbose!=0) printf("Use %d iterations in the solvers!\n", max_solver_iterations); +} + YY_BREAK +case 167: +YY_RULE_SETUP +#line 523 "read_input.l" +{ + solver_precision = 
atof(yytext); + if(verbose!=0) printf("Use %e as convergence precision for the solvers!\n", solver_precision); +} + YY_BREAK +case 168: +YY_RULE_SETUP +#line 527 "read_input.l" +{ + operator_flag=2; + if(verbose!=0) printf("Operator Flag is set to %s\n",yytext); +} + YY_BREAK +case 169: +YY_RULE_SETUP +#line 531 "read_input.l" +{ + operator_flag=1; + if(verbose!=0) printf("Operator Flag is set to %s\n",yytext); +} + YY_BREAK +case 170: +YY_RULE_SETUP +#line 535 "read_input.l" +{ + operator_flag=0; + if(verbose!=0) printf("Operator Flag is set to %s\n",yytext); +} + YY_BREAK +case 171: +YY_RULE_SETUP +#line 539 "read_input.l" +{ + matrix_element_flag=1; + if(verbose!=0) printf("Compute Matrix Elements: %s\n", yytext); +} + YY_BREAK +case 172: +YY_RULE_SETUP +#line 543 "read_input.l" +{ + matrix_element_flag=0; + if(verbose!=0) printf("Compute Matrix Elements: %s\n", yytext); +} + YY_BREAK +case 173: +YY_RULE_SETUP +#line 547 "read_input.l" +{ + save_config_flag=1; + if(verbose!=0) printf("Save configurations\n"); +} + YY_BREAK +case 174: +YY_RULE_SETUP +#line 551 "read_input.l" +{ + save_config_flag=0; + if(verbose!=0) printf("Don't save configurations\n"); +} + YY_BREAK +case 175: +YY_RULE_SETUP +#line 555 "read_input.l" +{ + save_prop_flag=1; + if(verbose!=0) printf("Save propagators\n"); +} + YY_BREAK +case 176: +YY_RULE_SETUP +#line 559 "read_input.l" +{ + save_prop_flag=0; + if(verbose!=0) printf("Don't save propagators\n"); +} + YY_BREAK +case 177: +YY_RULE_SETUP +#line 563 "read_input.l" +{ + save_prop_g2_flag=1; + if(verbose!=0) printf("Save generalized propagators\n"); +} + YY_BREAK +case 178: +YY_RULE_SETUP +#line 567 "read_input.l" +{ + save_prop_g2_flag=0; + if(verbose!=0) printf("Don't save generalized propagators\n"); +} + YY_BREAK +case 179: +YY_RULE_SETUP +#line 571 "read_input.l" +{ + write_cp_flag=1; + if(verbose!=0) printf("Write Checkpoints\n"); +} + YY_BREAK +case 180: +YY_RULE_SETUP +#line 575 "read_input.l" +{ + write_cp_flag=0; + 
if(verbose!=0) printf("Don't write Checkpoints\n"); +} + YY_BREAK +case 181: +YY_RULE_SETUP +#line 579 "read_input.l" +{ + cp_interval=atoi(yytext); + if(verbose!=0) printf("Write Checkpoint all %s measurements\n",yytext); +} + YY_BREAK +case 182: +YY_RULE_SETUP +#line 583 "read_input.l" +{ + strcpy(rlxd_input_filename,yytext); + if(verbose!=0) printf("Ranluxd input filename set to %s\n",yytext); +} + YY_BREAK +case 183: +YY_RULE_SETUP +#line 587 "read_input.l" +{ + strcpy(gauge_input_filename,yytext); + if(verbose!=0) printf("Gauge Configuration input filename set to %s\n",yytext); +} + YY_BREAK +case 184: +YY_RULE_SETUP +#line 591 "read_input.l" +{ + nstore=atoi(yytext); + if(verbose!=0) printf("Initial store counter set to %s\n",yytext); +} + YY_BREAK +case 185: +YY_RULE_SETUP +#line 595 "read_input.l" +{ + nstore=-1; + if(verbose!=0) printf("Trying to read InitialStoreCounter from file .nstore_counter\n"); +} + YY_BREAK +case 186: +YY_RULE_SETUP +#line 599 "read_input.l" +{ + g_stdio_proc = -1; + if(verbose!=0) printf("All processors will give output to stdout\n"); +} + YY_BREAK +case 187: +YY_RULE_SETUP +#line 603 "read_input.l" +{ + g_stdio_proc = -2; + if(verbose!=0) printf("No processor will give output to stdout\n"); +} + YY_BREAK +case 188: +YY_RULE_SETUP +#line 607 "read_input.l" +{ + g_stdio_proc = atoi(yytext); + if(verbose!=0) printf("processor %s will give output to stdout\n", yytext); +} + YY_BREAK +case 189: +YY_RULE_SETUP +#line 611 "read_input.l" +{ + index_start = atoi(yytext); + index_end = index_start+1; + if((index_start < 0)||(index_start >11)){ + printf("Error in line %d! index_start must be in [0,11]! Exiting...!\n", line_of_file); + exit(1); + } + if(verbose!=0) printf("inverting for index %s\n", yytext); +} + YY_BREAK +case 190: +YY_RULE_SETUP +#line 620 "read_input.l" +{ + sscanf(yytext, "-%d", &index_end); + if((index_end < 0)||(index_end >11)){ + printf("Error in line %d! index_end must be in [0,11]! 
Exiting...!\n", line_of_file); + exit(1); + } + if((index_end < 0)||(index_end >11)){ + printf("Warnig! index_end bigger than index_start. Will compute no propagator!\n"); + } + if(verbose!=0) printf("inverting up to color index %d\n", index_end); + index_end+=1; +} + YY_BREAK +case 191: +YY_RULE_SETUP +#line 632 "read_input.l" +{ + first_prop_flag = -1; + if(verbose!=0) printf("Do not compute the first propagator (default)\n"); +} + YY_BREAK +case 192: +YY_RULE_SETUP +#line 636 "read_input.l" +{ + first_prop_flag = 0; + if(verbose!=0) printf("Computing the first propagator (default)\n"); +} + YY_BREAK +case 193: +YY_RULE_SETUP +#line 640 "read_input.l" +{ + first_prop_flag = 1; + if(verbose!=0) printf("Reading in the first propagator\n"); +} + YY_BREAK +case 194: +YY_RULE_SETUP +#line 644 "read_input.l" +{ + integtyp = 1; + if(verbose!=0) printf("Using Leap Frog integrator!\n"); +} + YY_BREAK +case 195: +YY_RULE_SETUP +#line 648 "read_input.l" +{ + integtyp = 2; + if(verbose!=0) printf("Using SW integrator!\n"); +} + YY_BREAK +case 196: +YY_RULE_SETUP +#line 652 "read_input.l" +{ + integtyp = 3; + if(verbose!=0) printf("Using multiple time scale Leapfrog integrator!\n"); +} + YY_BREAK +case 197: +YY_RULE_SETUP +#line 656 "read_input.l" +{ + integtyp = 4; + if(verbose!=0) printf("Using multiple time scale Sexton-Weingarten integrator!\n"); +} + YY_BREAK +case 198: +YY_RULE_SETUP +#line 660 "read_input.l" +{ + integtyp = 5; + if(verbose!=0) printf("Using higher order Leapfrog integrator!\n"); +} + YY_BREAK +case 199: +YY_RULE_SETUP +#line 664 "read_input.l" +{ + integtyp = 6; + if(verbose!=0) printf("Using Second order Minimal norm integrator!\n"); +} + YY_BREAK +case 200: +YY_RULE_SETUP +#line 668 "read_input.l" +{ + integtyp = 7; + if(verbose!=0) printf("Using Second order Minimal norm integrator (position version)!\n"); +} + YY_BREAK +case 201: +YY_RULE_SETUP +#line 672 "read_input.l" +{ + nsmall = atoi(yytext); + if(verbose!=0) printf("nsmall set to %d\n", 
nsmall); +} + YY_BREAK +case 202: +YY_RULE_SETUP +#line 676 "read_input.l" +{ + g_c_sw = atof(yytext); + if(verbose!=0) printf("c_sw set to %e\n", g_c_sw); +} + YY_BREAK +case 203: +YY_RULE_SETUP +#line 680 "read_input.l" +{ + dtau = atof(yytext); + if(verbose!=0) printf("dtau set to %e\n", dtau); +} + YY_BREAK +case 204: +YY_RULE_SETUP +#line 684 "read_input.l" +{ + tau = atof(yytext); + if(verbose!=0) printf("tau set to %e\n", tau); +} + YY_BREAK +case 205: +YY_RULE_SETUP +#line 688 "read_input.l" +{ + Nsteps = atoi(yytext); + if(verbose!=0) printf("NSteps set to %d\n", Nsteps); +} + YY_BREAK +case 206: +YY_RULE_SETUP +#line 692 "read_input.l" +{ + ITER_MAX_BCG = atoi(yytext); + if(verbose != 0) printf("Maximal number of iterations for BCGstab set ro %d\n", ITER_MAX_BCG); +} + YY_BREAK +case 207: +YY_RULE_SETUP +#line 696 "read_input.l" +{ + ITER_MAX_CG = atoi(yytext); + if(verbose != 0) printf("Maximal number of iterations for CG set ro %d\n", ITER_MAX_CG); +} + YY_BREAK +case 208: +YY_RULE_SETUP +#line 700 "read_input.l" +{ + X0 = atof(yytext); + if(verbose != 0) printf("X0 for boundary cond. 
in time set to %e\n", X0); +} + YY_BREAK +case 209: +YY_RULE_SETUP +#line 704 "read_input.l" +{ + mass_number = atoi(yytext); + if(verbose != 0) printf("Setting mass number to %s\n", yytext); +} + YY_BREAK +case 210: +YY_RULE_SETUP +#line 708 "read_input.l" +{ + g_rgi_C1=atof(yytext); + if(verbose!=0) printf("g_rgi_C1=%s \n", yytext); +} + YY_BREAK +case 211: +YY_RULE_SETUP +#line 712 "read_input.l" +{ + read_source_flag=1; + if(verbose!=0) printf("Read inversion source from file\n"); +} + YY_BREAK +case 212: +YY_RULE_SETUP +#line 716 "read_input.l" +{ + read_source_flag=0; + if(verbose!=0) printf("Don't read inversion source from file\n"); +} + YY_BREAK +case 213: +YY_RULE_SETUP +#line 720 "read_input.l" +{ + strcpy(source_input_filename,yytext); + if(verbose!=0) printf("source input filename set to %s\n",yytext); +} + YY_BREAK +case 214: +YY_RULE_SETUP +#line 724 "read_input.l" +{ + source_format_flag = 0; + if(verbose!=0) printf("Using standard ETMC binary format for source input file\n"); +} + YY_BREAK +case 215: +YY_RULE_SETUP +#line 728 "read_input.l" +{ + source_format_flag = 1; + if(verbose!=0) printf("Using CM format for source input file\n"); +} + YY_BREAK +case 216: +YY_RULE_SETUP +#line 732 "read_input.l" +{ + source_format_flag = 2; + if(verbose!=0) printf("Using GWC format for source input file\n"); +} + YY_BREAK +case 217: +YY_RULE_SETUP +#line 736 "read_input.l" +{ + source_time_slice = atoi(yytext); + if(verbose!=0) printf("Using only timeslice %s of the source, padding the rest with zeros\n", yytext); +} + YY_BREAK +case 218: +YY_RULE_SETUP +#line 740 "read_input.l" +{ + int_n[0] = atoi(yytext); + if(verbose!=0) printf("Number of steps in ExtLeapFrog integrator for gauge set to %d!\n", int_n[0]); +} + YY_BREAK +case 219: +YY_RULE_SETUP +#line 744 "read_input.l" +{ + int_n[1] = atoi(yytext); + if(verbose!=0) printf("Number of steps in ExtLeapFrog integrator for psf 1 (mu) set to %d!\n", int_n[1]); +} + YY_BREAK +case 220: +YY_RULE_SETUP +#line 748 
"read_input.l" +{ + int_n[2] = atoi(yytext); + if(verbose!=0) printf("Number of steps in ExtLeapFrog integrator for psf 2 (mu2) set to %d!\n", int_n[2]); +} + YY_BREAK +case 221: +YY_RULE_SETUP +#line 752 "read_input.l" +{ + int_n[3] = atoi(yytext); + if(verbose!=0) printf("Number of steps in ExtLeapFrog integrator for psf 3 (mu3) set to %d!\n", int_n[3]); +} + YY_BREAK +case 222: +YY_RULE_SETUP +#line 756 "read_input.l" +{ + if(verbose!=0) printf("Number of steps in ExtLeapFrog integrator for psf 4 (mu4) set to %d!\n", int_n[1]); +} + YY_BREAK +case 223: +YY_RULE_SETUP +#line 759 "read_input.l" +{ + lambda[0] = atof(yytext); + if(verbose!=0) printf("Set lambda parameter for gauge fields (in the 2MN integrator) to %f!\n", lambda[0]); +} + YY_BREAK +case 224: +YY_RULE_SETUP +#line 763 "read_input.l" +{ + lambda[1] = atof(yytext); + if(verbose!=0) printf("Set lambda parameter for psf 1 (in the 2MN integrator) to %f!\n", lambda[0]); +} + YY_BREAK +case 225: +YY_RULE_SETUP +#line 767 "read_input.l" +{ + lambda[2] = atof(yytext); + if(verbose!=0) printf("Set lambda parameter for psf 2 (in the 2MN integrator) to %f!\n", lambda[0]); +} + YY_BREAK +case 226: +YY_RULE_SETUP +#line 771 "read_input.l" +{ + lambda[3] = atof(yytext); + if(verbose!=0) printf("Set lambda parameter for psf 3 (in the 2MN integrator) to %f!\n", lambda[0]); +} + YY_BREAK +case 227: +YY_RULE_SETUP +#line 775 "read_input.l" +{ + if(verbose!=0) printf("Set lambda parameter for psf 4 (in the 2MN integrator) to %f! 
(not yet implemented)\n", lambda[0]); +} + YY_BREAK +case 228: +YY_RULE_SETUP +#line 778 "read_input.l" +{ + g_eps_sq_force=atof(yytext); + if(verbose!=0) printf("g_eps_sq_force=%s Residual for inversions in the force computation\n", yytext); +} + YY_BREAK +case 229: +YY_RULE_SETUP +#line 782 "read_input.l" +{ + g_eps_sq_force1=atof(yytext); + if(verbose!=0) printf("g_eps_sq_force(mu)=%s Residual for inversions in the force computation\n", yytext); +} + YY_BREAK +case 230: +YY_RULE_SETUP +#line 786 "read_input.l" +{ + g_eps_sq_force2=atof(yytext); + if(verbose!=0) printf("g_eps_sq_force(mu2)=%s Residual for inversions in the force computation\n", yytext); +} + YY_BREAK +case 231: +YY_RULE_SETUP +#line 790 "read_input.l" +{ + g_eps_sq_force3=atof(yytext); + if(verbose!=0) printf("g_eps_sq_force(mu3)=%s Residual for inversions in the force computation\n", yytext); +} + YY_BREAK +case 232: +YY_RULE_SETUP +#line 794 "read_input.l" +{ + g_eps_sq_acc=atof(yytext); + if(verbose!=0) printf("g_eps_sq_acc=%s Residual for inversions in the acceptance step\n", yytext); +} + YY_BREAK +case 233: +YY_RULE_SETUP +#line 798 "read_input.l" +{ + g_eps_sq_acc1=atof(yytext); + if(verbose!=0) printf("g_eps_sq_acc(mu)=%s Residual for inversions in the acceptance step\n", yytext); +} + YY_BREAK +case 234: +YY_RULE_SETUP +#line 802 "read_input.l" +{ + g_eps_sq_acc2=atof(yytext); + if(verbose!=0) printf("g_eps_sq_acc(mu2)=%s Residual for inversions in the acceptance step\n", yytext); +} + YY_BREAK +case 235: +YY_RULE_SETUP +#line 806 "read_input.l" +{ + g_eps_sq_acc3=atof(yytext); + if(verbose!=0) printf("g_eps_sq_acc(mu3)=%s Residual for inversions in the acceptance step\n", yytext); +} + YY_BREAK +case 236: +YY_RULE_SETUP +#line 810 "read_input.l" +{ + g_relative_precision_flag = 1; + if(verbose!=0) printf("Using relative precision\n"); +} + YY_BREAK +case 237: +YY_RULE_SETUP +#line 814 "read_input.l" +{ + g_relative_precision_flag = 0; + if(verbose!=0) printf("Using absolute 
precision\n"); +} + YY_BREAK +case 238: +YY_RULE_SETUP +#line 818 "read_input.l" +{ + return_check_flag = 1; + if(verbose!=0) printf("Perform checks of Reversibility\n"); +} + YY_BREAK +case 239: +YY_RULE_SETUP +#line 822 "read_input.l" +{ + return_check_flag = 0; + if(verbose!=0) printf("Don't perform checks of Reversibility\n"); +} + YY_BREAK +case 240: +YY_RULE_SETUP +#line 826 "read_input.l" +{ + return_check_interval = atoi(yytext); + if(verbose!=0) printf("Check reversibility all %d trajectories\n", return_check_interval); +} + YY_BREAK +case 241: +YY_RULE_SETUP +#line 830 "read_input.l" +{ + g_debug_level = atoi(yytext); + if(verbose!=0) printf("Debug level = %d\n", g_debug_level); +} + YY_BREAK +case 242: +YY_RULE_SETUP +#line 834 "read_input.l" +{ + g_csg_N[0] = atoi(yytext); + if(verbose!=0) printf("Chronological Invertier history length for mu set to %d\n", g_csg_N[0]); +} + YY_BREAK +case 243: +YY_RULE_SETUP +#line 838 "read_input.l" +{ + g_csg_N[2] = atoi(yytext); + if(verbose!=0) printf("Chronological Invertier history length for mu set to %d\n", g_csg_N[2]); +} + YY_BREAK +case 244: +YY_RULE_SETUP +#line 842 "read_input.l" +{ + g_csg_N[4] = atoi(yytext); + if(verbose!=0) printf("Chronological Invertier history length for mu set to %d\n", g_csg_N[4]); +} + YY_BREAK +case 245: +YY_RULE_SETUP +#line 846 "read_input.l" +{ + gauge_precision_read_flag = 32; + if(verbose!=0) printf("Read gauges in 32 Bit precision!\n"); +} + YY_BREAK +case 246: +YY_RULE_SETUP +#line 850 "read_input.l" +{ + gauge_precision_read_flag = 64; + if(verbose!=0) printf("Read gauges in 64 Bit precision!\n"); +} + YY_BREAK +case 247: +YY_RULE_SETUP +#line 854 "read_input.l" +{ + gauge_precision_write_flag = 32; + if(verbose!=0) printf("Save gauges in 32 Bit precision!\n"); +} + YY_BREAK +case 248: +YY_RULE_SETUP +#line 858 "read_input.l" +{ + gauge_precision_write_flag = 64; + if(verbose!=0) printf("Save gauges in 64 Bit precision!\n"); +} + YY_BREAK +case 249: +YY_RULE_SETUP +#line 
862 "read_input.l" +{ + prop_precision_flag = 32; + if(verbose!=0) printf("Save propagators in 32 Bit precision!\n"); +} + YY_BREAK +case 250: +YY_RULE_SETUP +#line 866 "read_input.l" +{ + prop_precision_flag = 64; + if(verbose!=0) printf("Save propagators in 64 Bit precision!\n"); +} + YY_BREAK +case 251: +YY_RULE_SETUP +#line 870 "read_input.l" +{ + reproduce_randomnumber_flag = 1; + if(verbose!=0) printf("Use reproducable randomnumbers!\n"); +} + YY_BREAK +case 252: +YY_RULE_SETUP +#line 874 "read_input.l" +{ + reproduce_randomnumber_flag = 0; + if(verbose!=0) printf("Use a different seed for each process in ranlxd!\n"); +} + YY_BREAK +case 253: +YY_RULE_SETUP +#line 878 "read_input.l" +{ + g_sloppy_precision_flag = 1; + if(verbose!=0) printf("Use sloppy precision if available!\n"); +} + YY_BREAK +case 254: +YY_RULE_SETUP +#line 882 "read_input.l" +{ + g_sloppy_precision_flag = 0; + if(verbose!=0) printf("Don't use sloppy precision!\n"); +} + YY_BREAK +case 255: +YY_RULE_SETUP +#line 886 "read_input.l" +{ + use_stout_flag = 1; + if(verbose!=0) printf("Use stout smearing for invert!\n"); +} + YY_BREAK +case 256: +YY_RULE_SETUP +#line 890 "read_input.l" +{ + use_stout_flag = 0; + if(verbose!=0) printf("Don't use stout smearing for invert!\n"); +} + YY_BREAK +case 257: +YY_RULE_SETUP +#line 894 "read_input.l" +{ + stout_rho=atof(yytext); + if(verbose!=0) printf("use stout rho=%e!\n", stout_rho); +} + YY_BREAK +case 258: +YY_RULE_SETUP +#line 898 "read_input.l" +{ + stout_no_iter=atoi(yytext); + if(verbose!=0) printf("make %d stout iterations!\n", stout_no_iter); +} + YY_BREAK +case 259: +YY_RULE_SETUP +#line 902 "read_input.l" +{ + phmc_no_flavours=4; + if(verbose!=0) printf("Simulate 2+1+1 flavours (1+1 PHMC).\n"); +} + YY_BREAK +case 260: +YY_RULE_SETUP +#line 906 "read_input.l" +{ + phmc_no_flavours=2; + if(verbose!=0) printf("Simulate 1+1 flavours only (1+1 PHMC).\n"); +} + YY_BREAK +case 261: +YY_RULE_SETUP +#line 910 "read_input.l" +{ + phmc_compute_evs=1; + 
if(verbose!=0) printf("Compute Eigenvalues and exit."); +} + YY_BREAK +case 262: +YY_RULE_SETUP +#line 914 "read_input.l" +{ + phmc_compute_evs=0; +} + YY_BREAK +case 263: +YY_RULE_SETUP +#line 917 "read_input.l" +{ + compute_evs=1; + if(verbose!=0) printf("Compute Eigenvalues in invert."); +} + YY_BREAK +case 264: +YY_RULE_SETUP +#line 921 "read_input.l" +{ + compute_evs=0; + if(verbose!=0) printf("Do not compute Eigenvalues in invert."); +} + YY_BREAK +case 265: +YY_RULE_SETUP +#line 925 "read_input.l" +{ + compute_evs=2; + if(verbose!=0) printf("Try to only read in eigenvalues and vectors in invert."); +} + YY_BREAK +case 266: +YY_RULE_SETUP +#line 929 "read_input.l" +{ + phmc_exact_poly = 0; + if(verbose!=0) printf("Run the PHMC as usual."); +} + YY_BREAK +case 267: +YY_RULE_SETUP +#line 933 "read_input.l" +{ + phmc_exact_poly = 1; + if(verbose!=0) printf("Run the PHMC only with usage of the less accurate polynomial."); +} + YY_BREAK +case 268: +YY_RULE_SETUP +#line 938 "read_input.l" +{ + stilde_max = atof(yytext); + if(verbose!=0) printf("Stilde max for PHMC set to %e.\n", stilde_max); +} + YY_BREAK +case 269: +YY_RULE_SETUP +#line 942 "read_input.l" +{ + stilde_min = atof(yytext); + if(verbose!=0) printf("Stilde min for PHMC set to %e.\n", stilde_min); +} + YY_BREAK +case 270: +YY_RULE_SETUP +#line 946 "read_input.l" +{ + degree_of_p = atoi(yytext); + if(verbose!=0) printf("Degree for less precise polynomial P set to %d \n", degree_of_p); +} + YY_BREAK +case 271: +YY_RULE_SETUP +#line 950 "read_input.l" +{ + propagator_splitted=1; + if(verbose!=0) printf("Split the propagator in several files! 
(invert)\n"); +} + YY_BREAK +case 272: +YY_RULE_SETUP +#line 954 "read_input.l" +{ + propagator_splitted=0; + if(verbose!=0) printf("Do not split the propagator in several files (default) (invert)!\n"); +} + YY_BREAK +case 273: +YY_RULE_SETUP +#line 958 "read_input.l" +{ + source_splitted=1; + if(verbose!=0) printf("Expect source to be split in several files (invert)!\n"); +} + YY_BREAK +case 274: +YY_RULE_SETUP +#line 962 "read_input.l" +{ + source_splitted=0; + if(verbose!=0) printf("Do not expect source to be split in several files (default) (invert)!\n"); +} + YY_BREAK +case 275: +YY_RULE_SETUP +#line 966 "read_input.l" +{ + source_location=atoi(yytext); + if(verbose!=0) printf("source_location = %s\n",yytext); +} + YY_BREAK +case 276: +YY_RULE_SETUP +#line 970 "read_input.l" +{ + eigenvalue_precision = atof(yytext); + if(verbose!=0) printf("precision for eigenvalues = %e\n", eigenvalue_precision); +} + YY_BREAK +case 277: +YY_RULE_SETUP +#line 974 "read_input.l" +{ + no_eigenvalues = atoi(yytext); + if(verbose!=0) printf("no of eigenvalues = %d\n", no_eigenvalues); +} + YY_BREAK +case 278: +YY_RULE_SETUP +#line 978 "read_input.l" +{ + sub_evs_cg_flag = 1; + if(verbose!=0) printf("project out eigenvector subspace\n"); +} + YY_BREAK +case 279: +YY_RULE_SETUP +#line 982 "read_input.l" +{ + sub_evs_cg_flag = 0; + if(verbose!=0) printf("Do no project out eigenvector subspace\n"); +} + YY_BREAK +case 280: +YY_RULE_SETUP +#line 986 "read_input.l" +{ + phmc_heavy_timescale = atoi(yytext); + if(verbose!=0) printf("Integrate heavy doublet on timescale %d\n", phmc_heavy_timescale); +} + YY_BREAK +case 281: +YY_RULE_SETUP +#line 990 "read_input.l" +{ + even_odd_flag = 1; + if(verbose) printf("Use even/odd preconditioning\n"); +} + YY_BREAK +case 282: +YY_RULE_SETUP +#line 994 "read_input.l" +{ + even_odd_flag = 0; + if(verbose) printf("Do not use even/odd preconditioning\n"); +} + YY_BREAK +case 283: +YY_RULE_SETUP +#line 998 "read_input.l" +{ + write_prop_format_flag = 
10; + if(verbose!=0) fprintf(stderr, "GWC format no longer supported for writing propagators\n"); +} + YY_BREAK +case 284: +YY_RULE_SETUP +#line 1002 "read_input.l" +{ + write_prop_format_flag = 11; + if(verbose!=0) fprintf(stderr, "CM format no longer supported for writing propagators\n"); +} + YY_BREAK +case 285: +YY_RULE_SETUP +#line 1006 "read_input.l" +{ + write_prop_format_flag = 0; + if(verbose!=0) printf("Propagator type: DiracFermion_Sinks\n"); +} + YY_BREAK +case 286: +YY_RULE_SETUP +#line 1010 "read_input.l" +{ + write_prop_format_flag = 1; + if(verbose!=0) printf("Propagator type: DiracFermion_Source_Sink_Pairs\n"); +} + YY_BREAK +case 287: +YY_RULE_SETUP +#line 1014 "read_input.l" +{ + write_prop_format_flag = 1; + fprintf(stderr, "Propagator type: DiracFermion_ScalarSource_TwelveSink, not yet supported\n"); +} + YY_BREAK +case 288: +YY_RULE_SETUP +#line 1018 "read_input.l" +{ + write_prop_format_flag = 1; + fprintf(stderr, "Propagator type: DiracFermion_ScalarSource_FourSink, not yet supported\n"); +} + YY_BREAK +case 289: +YY_RULE_SETUP +#line 1022 "read_input.l" +{ + online_measurement_flag = 1; + if(verbose!=0) fprintf(stderr, "Switched on online measurements\n"); +} + YY_BREAK +case 290: +YY_RULE_SETUP +#line 1026 "read_input.l" +{ + online_measurement_flag = 0; + if(verbose!=0) fprintf(stderr, "Online measurements not switched on\n"); +} + YY_BREAK +case 291: +YY_RULE_SETUP +#line 1030 "read_input.l" +{ + online_measurement_freq = atoi(yytext); + if(verbose!=0) fprintf(stderr, "Frequency for online measurements set to %s\n", yytext); +} + YY_BREAK +case 292: +YY_RULE_SETUP +#line 1035 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 293: +YY_RULE_SETUP +#line 1036 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 294: +YY_RULE_SETUP +#line 1037 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 295: +YY_RULE_SETUP +#line 1038 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 296: +YY_RULE_SETUP +#line 1039 "read_input.l" +BEGIN(COMMENT); + YY_BREAK 
+case 297: +YY_RULE_SETUP +#line 1040 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 298: +YY_RULE_SETUP +#line 1041 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 299: +YY_RULE_SETUP +#line 1042 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 300: +YY_RULE_SETUP +#line 1043 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 301: +YY_RULE_SETUP +#line 1044 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 302: +YY_RULE_SETUP +#line 1045 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 303: +YY_RULE_SETUP +#line 1046 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 304: +YY_RULE_SETUP +#line 1047 "read_input.l" +{ + ; +} + YY_BREAK +case 305: +/* rule 305 can match eol */ +YY_RULE_SETUP +#line 1052 "read_input.l" +{ + line_of_file++; + BEGIN(0); +} + YY_BREAK +case 306: +YY_RULE_SETUP +#line 1057 "read_input.l" +{ + printf("Unknown seed in line %d.\n Must be an integer. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 307: +YY_RULE_SETUP +#line 1061 "read_input.l" +{ + printf("Unknown kappa in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 308: +YY_RULE_SETUP +#line 1065 "read_input.l" +{ + printf("Unknown PhmcPrecisionPtilde in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 309: +YY_RULE_SETUP +#line 1069 "read_input.l" +{ + printf("Unknown PhmcPrecisionHfin in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 310: +YY_RULE_SETUP +#line 1073 "read_input.l" +{ + printf("Unknown Rec_EV in line %d.\n Must be an integer number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 311: +YY_RULE_SETUP +#line 1077 "read_input.l" +{ + printf("Unknown PhmcMuBar in line %d.\n Must be a floating point number. 
Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 312: +YY_RULE_SETUP +#line 1081 "read_input.l" +{ + printf("Unknown PhmcEpsBar in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 313: +YY_RULE_SETUP +#line 1085 "read_input.l" +{ + printf("Unknown mu in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 314: +YY_RULE_SETUP +#line 1089 "read_input.l" +{ + printf("Unknown mu in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 315: +YY_RULE_SETUP +#line 1093 "read_input.l" +{ + printf("Unknown mu in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 316: +YY_RULE_SETUP +#line 1097 "read_input.l" +{ + printf("Unknown beta in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 317: +YY_RULE_SETUP +#line 1101 "read_input.l" +{ + printf("Unknown Startcondition in line %d! \n Must be hot, cold, continue or restart. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 318: +YY_RULE_SETUP +#line 1105 "read_input.l" +{ + printf("Unknown number of TermSteps in line %d! \n Must be an integer. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 319: +YY_RULE_SETUP +#line 1109 "read_input.l" +{ + printf("Unknown number of MeasSteps in line %d! \n Must be an integer. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 320: +YY_RULE_SETUP +#line 1113 "read_input.l" +{ + printf("Unknown number of Sweeps to skip in line %d! \n Must be an integer. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 321: +YY_RULE_SETUP +#line 1117 "read_input.l" +{ + printf("Unknown value for solver_flag in line %d! \n Must be bicgstab, cg, cgs, mr or gmres. 
Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 322: +YY_RULE_SETUP +#line 1121 "read_input.l" +{ + printf("Unknown value for operator_flag in line %d! \n Must be an integer. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 323: +YY_RULE_SETUP +#line 1125 "read_input.l" +{ + printf("Unknown value for matrix_element_flag in line %d! \n Must be yes or no. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 324: +YY_RULE_SETUP +#line 1129 "read_input.l" +{ + printf("Unknown value for save_config_flag in line %d! \n Must be yes or no! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 325: +YY_RULE_SETUP +#line 1133 "read_input.l" +{ + printf("Unknown value for save_prop_flag in line %d! \n Must be yes or no! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 326: +YY_RULE_SETUP +#line 1137 "read_input.l" +{ + printf("Unknown value for save_prop_g2_flag in line %d! \n Must be yes or no! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 327: +YY_RULE_SETUP +#line 1141 "read_input.l" +{ + printf("Unknown value for write_checkpoint_flag in line %d! \n Must be yes or no! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 328: +YY_RULE_SETUP +#line 1145 "read_input.l" +{ + printf("Unknown value for checkpoint interval in line %d! \n Must be an integer! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 329: +YY_RULE_SETUP +#line 1149 "read_input.l" +{ + printf("Unknown value for Initial store counter in line %d! \n Must be an integer! Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 330: +YY_RULE_SETUP +#line 1153 "read_input.l" +{ + printf("Unknown value for T in line %d!\n Must be an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 331: +YY_RULE_SETUP +#line 1157 "read_input.l" +{ + printf("Unknown value for L in line %d!\n Must be an integer value! 
Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 332: +YY_RULE_SETUP +#line 1161 "read_input.l" +{ + printf("Unknown value for LX in line %d!\n Must be an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 333: +YY_RULE_SETUP +#line 1165 "read_input.l" +{ + printf("Unknown value for LY in line %d!\n Must be an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 334: +YY_RULE_SETUP +#line 1169 "read_input.l" +{ + printf("Unknown value for LZ in line %d!\n Must be an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 335: +YY_RULE_SETUP +#line 1173 "read_input.l" +{ + printf("Unknown value for NRXProcs in line %d!\n Must be an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 336: +YY_RULE_SETUP +#line 1177 "read_input.l" +{ + printf("Unknown value for NRYProcs in line %d!\n Must be an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 337: +YY_RULE_SETUP +#line 1181 "read_input.l" +{ + printf("Unknown value for NRYProcs in line %d!\n Must be an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 338: +YY_RULE_SETUP +#line 1185 "read_input.l" +{ + printf("Unknown value for StdIOProcessor in line %d!\n Must be all, no or an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 339: +YY_RULE_SETUP +#line 1189 "read_input.l" +{ + printf("Error in line %d! Must be 32 or 64 Bit precision!\n", line_of_file); + exit(1); +} + YY_BREAK +case 340: +YY_RULE_SETUP +#line 1193 "read_input.l" +{ + printf("Error in line %d! Must be 32 or 64 Bit precision!\n", line_of_file); + exit(1); +} + YY_BREAK +case 341: +YY_RULE_SETUP +#line 1197 "read_input.l" +{ + printf("Error in line %d! Must be 32 or 64 Bit precision!\n", line_of_file); + exit(1); +} + YY_BREAK +case 342: +YY_RULE_SETUP +#line 1201 "read_input.l" +{ + printf("Error in line %d! 
Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 343: +YY_RULE_SETUP +#line 1205 "read_input.l" +{ + printf("Error in line %d! Must be compute or readin! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 344: +YY_RULE_SETUP +#line 1209 "read_input.l" +{ + printf("Error in line %d!\n", line_of_file); + exit(1); +} + YY_BREAK +case 345: +YY_RULE_SETUP +#line 1213 "read_input.l" +{ + printf("Error in line %d!\n", line_of_file); + exit(1); +} + YY_BREAK +case 346: +YY_RULE_SETUP +#line 1217 "read_input.l" +{ + printf("Unknown value for MaxSolverIterations in line %d! Must be an integer. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 347: +YY_RULE_SETUP +#line 1221 "read_input.l" +{ + printf("Unknown value for SolverPrecision in line %d! Must be a floating point number. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 348: +YY_RULE_SETUP +#line 1225 "read_input.l" +{ + printf("Unknown value for MassNumber in line %d! Must be an integer. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 349: +YY_RULE_SETUP +#line 1229 "read_input.l" +{ + printf("Unknown value for RGIC1 in line %d! Must be a floating point number. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 350: +YY_RULE_SETUP +#line 1233 "read_input.l" +{ + printf("Should be yes or no for relative precision in line %d! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 351: +YY_RULE_SETUP +#line 1237 "read_input.l" +{ + printf("Unknown value for ForcePrecision in line %d! Must be a floating point number. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 352: +YY_RULE_SETUP +#line 1241 "read_input.l" +{ + printf("Unknown value for AcceptancePrecision in line %d! Must be a floating point number. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 353: +YY_RULE_SETUP +#line 1245 "read_input.l" +{ + printf("Unknown value for CSGHistMu in line %d! Must be an integer number. 
Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 354: +YY_RULE_SETUP +#line 1249 "read_input.l" +{ + printf("Unknown value in line %d! Must be yes or no. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 355: +YY_RULE_SETUP +#line 1253 "read_input.l" +{ + printf("Unknown value in line %d! Must be a floating point number. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 356: +YY_RULE_SETUP +#line 1257 "read_input.l" +{ + printf("Unknown value in line %d! Must be an integer. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 357: +YY_RULE_SETUP +#line 1261 "read_input.l" +{ + printf("Unknown value in line %d! Must be 2+1+1 or 1+1. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 358: +YY_RULE_SETUP +#line 1265 "read_input.l" +{ + printf("Unknown value in line %d! Must be an yes or no. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 359: +YY_RULE_SETUP +#line 1269 "read_input.l" +{ + printf("Unknown value in line %d! Must be an yes or no. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 360: +YY_RULE_SETUP +#line 1273 "read_input.l" +{ + printf("Unknown value in line %d! Must be a floating point number. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 361: +YY_RULE_SETUP +#line 1277 "read_input.l" +{ + printf("Unknown value for SplittedPropagator in line %d! Must be yes or no. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 362: +YY_RULE_SETUP +#line 1281 "read_input.l" +{ + printf("Unknown value for SplittedSource in line %d! Must be yes or no. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 363: +YY_RULE_SETUP +#line 1285 "read_input.l" +{ + printf("Unknown source_location in line %d.\n Must be an integer. 
Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 364: +YY_RULE_SETUP +#line 1289 "read_input.l" +{ + printf("Unknown value for TimeScaleHeavyDoublet in line %d\n", line_of_file); + exit(1); +} + YY_BREAK +case 365: +YY_RULE_SETUP +#line 1293 "read_input.l" +{ + printf("Unknown value for UseEvenOdd in line %d\n Must be yes or no. Aborting!\n", line_of_file); +} + YY_BREAK +case 366: +YY_RULE_SETUP +#line 1296 "read_input.l" +{ + printf("Unknown value for WritePropagatorFormat in line %d\n Must be gwc or cmi. Aborting!\n", line_of_file); +} + YY_BREAK +case 367: +YY_RULE_SETUP +#line 1299 "read_input.l" +{ + printf("Unknown value for PerformOnlineMeasurements in line %d\n Must be yes or no. Aborting!\n", line_of_file); +} + YY_BREAK +case 368: +YY_RULE_SETUP +#line 1302 "read_input.l" +{ + printf("Unknown value for OnlineMeasurementsFreq in line %d\n Must be an integer. Aborting!\n", line_of_file); +} + YY_BREAK +case 369: +YY_RULE_SETUP +#line 1307 "read_input.l" +BEGIN(ERROR); + YY_BREAK +case 370: +YY_RULE_SETUP +#line 1308 "read_input.l" +{ + printf("Error in line %d: %s \n",line_of_file,yytext); + exit(1); +} + YY_BREAK +case 371: +YY_RULE_SETUP +#line 1314 "read_input.l" +ECHO; + YY_BREAK +#line 6001 "" +case YY_STATE_EOF(INITIAL): +case YY_STATE_EOF(BETA): +case YY_STATE_EOF(STARTCOND): +case YY_STATE_EOF(THERMSWEEPS): +case YY_STATE_EOF(NMEAS): +case YY_STATE_EOF(KAPPA): +case YY_STATE_EOF(ACCPTILDE): +case YY_STATE_EOF(ACCHFIN): +case YY_STATE_EOF(RECEV): +case YY_STATE_EOF(MUBAR): +case YY_STATE_EOF(EPSBAR): +case YY_STATE_EOF(MU): +case YY_STATE_EOF(MU2): +case YY_STATE_EOF(MU3): +case YY_STATE_EOF(SEED): +case YY_STATE_EOF(Q1): +case YY_STATE_EOF(Q2): +case YY_STATE_EOF(DTAU): +case YY_STATE_EOF(TAU): +case YY_STATE_EOF(NSTEPS): +case YY_STATE_EOF(CSW): +case YY_STATE_EOF(INTTYP): +case YY_STATE_EOF(NSMALL): +case YY_STATE_EOF(NSKIP): +case YY_STATE_EOF(RLXDINPUTFILE): +case YY_STATE_EOF(GAUGEINPUTFILE): +case YY_STATE_EOF(GAUGERPREC): +case 
YY_STATE_EOF(GAUGEWPREC): +case YY_STATE_EOF(SOLVFLAG): +case YY_STATE_EOF(OPFLAG): +case YY_STATE_EOF(MEFLAG): +case YY_STATE_EOF(SAVECONF): +case YY_STATE_EOF(SAVEPROP): +case YY_STATE_EOF(SAVEPRG2): +case YY_STATE_EOF(WRITECP): +case YY_STATE_EOF(CPINT): +case YY_STATE_EOF(NSTORE): +case YY_STATE_EOF(TT): +case YY_STATE_EOF(LL): +case YY_STATE_EOF(LLX): +case YY_STATE_EOF(LLY): +case YY_STATE_EOF(LLZ): +case YY_STATE_EOF(NPROCX): +case YY_STATE_EOF(NPROCY): +case YY_STATE_EOF(NPROCZ): +case YY_STATE_EOF(IOPROC): +case YY_STATE_EOF(IDX): +case YY_STATE_EOF(FPROP): +case YY_STATE_EOF(CGMAX): +case YY_STATE_EOF(BCGMAX): +case YY_STATE_EOF(BOUND): +case YY_STATE_EOF(SITER): +case YY_STATE_EOF(SPREC): +case YY_STATE_EOF(MNR): +case YY_STATE_EOF(RGIC): +case YY_STATE_EOF(READSOURCE): +case YY_STATE_EOF(SOURCEFORMAT): +case YY_STATE_EOF(SOURCEFILE): +case YY_STATE_EOF(SOURCETS): +case YY_STATE_EOF(INT0): +case YY_STATE_EOF(INT1): +case YY_STATE_EOF(INT2): +case YY_STATE_EOF(INT3): +case YY_STATE_EOF(INT4): +case YY_STATE_EOF(LAMBDA0): +case YY_STATE_EOF(LAMBDA1): +case YY_STATE_EOF(LAMBDA2): +case YY_STATE_EOF(LAMBDA3): +case YY_STATE_EOF(LAMBDA4): +case YY_STATE_EOF(RELPREC): +case YY_STATE_EOF(FORCEPREC): +case YY_STATE_EOF(FORCEPREC1): +case YY_STATE_EOF(FORCEPREC2): +case YY_STATE_EOF(FORCEPREC3): +case YY_STATE_EOF(ACCPREC): +case YY_STATE_EOF(ACCPREC1): +case YY_STATE_EOF(ACCPREC2): +case YY_STATE_EOF(ACCPREC3): +case YY_STATE_EOF(REVCHECK): +case YY_STATE_EOF(REVINT): +case YY_STATE_EOF(DEBUG): +case YY_STATE_EOF(CSGN1): +case YY_STATE_EOF(CSGN2): +case YY_STATE_EOF(CSGN3): +case YY_STATE_EOF(GMRESM): +case YY_STATE_EOF(GMRESDRNEV): +case YY_STATE_EOF(REPRORND): +case YY_STATE_EOF(SLOPPYPREC): +case YY_STATE_EOF(USESTOUT): +case YY_STATE_EOF(STOUTRHO): +case YY_STATE_EOF(STOUTITER): +case YY_STATE_EOF(PHMCFLAV): +case YY_STATE_EOF(COMPUTEEVS): +case YY_STATE_EOF(PCOMPUTEEVS): +case YY_STATE_EOF(PPP): +case YY_STATE_EOF(SMAX): +case YY_STATE_EOF(SMIN): +case 
YY_STATE_EOF(DEGP): +case YY_STATE_EOF(SPLITPROP): +case YY_STATE_EOF(SPLITSOURCE): +case YY_STATE_EOF(SRCLOC): +case YY_STATE_EOF(SUBEVCG): +case YY_STATE_EOF(NOEV): +case YY_STATE_EOF(PRECEV): +case YY_STATE_EOF(HEAVYTS): +case YY_STATE_EOF(EO): +case YY_STATE_EOF(WRPROPFLAG): +case YY_STATE_EOF(PROPPREC): +case YY_STATE_EOF(PROPTYPE): +case YY_STATE_EOF(ONMEAS): +case YY_STATE_EOF(ONFREQ): +case YY_STATE_EOF(COMMENT): +case YY_STATE_EOF(ERROR): + yyterminate(); + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = (yy_hold_char); + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed yyin at a new source and called + * yylex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. + */ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + { /* This was really a NUL. 
*/ + yy_state_type yy_next_state; + + (yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state ); + + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. */ + yy_cp = ++(yy_c_buf_p); + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_END_OF_FILE: + { + (yy_did_buffer_switch_on_eof) = 0; + + if ( yywrap( ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + (yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! 
(yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = + (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + (yy_c_buf_p) = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)]; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ +} /* end of yylex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (void) +{ + register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + register char *source = (yytext_ptr); + register int number_to_move, i; + int ret_val; + + if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. 
*/ + number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr)) - 1; + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0; + + else + { + int num_to_read = + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER; + + int yy_c_buf_p_offset = + (int) ((yy_c_buf_p) - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + int new_size = b->yy_buf_size * 2; + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + yyrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = 0; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + (yy_c_buf_p) = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - + number_to_move - 1; + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. 
*/ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + (yy_n_chars), (size_t) num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + if ( (yy_n_chars) == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + yyrestart(yyin ); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + (yy_n_chars) += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR; + + (yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (void) +{ + register yy_state_type yy_current_state; + register char *yy_cp; + + yy_current_state = (yy_start); + yy_current_state += YY_AT_BOL(); + + for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp ) + { + register YY_CHAR yy_c = (*yy_cp ? 
yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 2644 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state ) +{ + register int yy_is_jam; + register char *yy_cp = (yy_c_buf_p); + + register YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 2644 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + yy_is_jam = (yy_current_state == 2643); + + return yy_is_jam ? 0 : yy_current_state; +} + + static void yyunput (int c, register char * yy_bp ) +{ + register char *yy_cp; + + yy_cp = (yy_c_buf_p); + + /* undo effects of setting up yytext */ + *yy_cp = (yy_hold_char); + + if ( yy_cp < YY_CURRENT_BUFFER_LVALUE->yy_ch_buf + 2 ) + { /* need to shift things up to make room */ + /* +2 for EOB chars. 
*/ + register int number_to_move = (yy_n_chars) + 2; + register char *dest = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[ + YY_CURRENT_BUFFER_LVALUE->yy_buf_size + 2]; + register char *source = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]; + + while ( source > YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + *--dest = *--source; + + yy_cp += (int) (dest - source); + yy_bp += (int) (dest - source); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_buf_size; + + if ( yy_cp < YY_CURRENT_BUFFER_LVALUE->yy_ch_buf + 2 ) + YY_FATAL_ERROR( "flex scanner push-back overflow" ); + } + + *--yy_cp = (char) c; + + (yytext_ptr) = yy_bp; + (yy_hold_char) = *yy_cp; + (yy_c_buf_p) = yy_cp; +} + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (void) +#else + static int input (void) +#endif + +{ + int c; + + *(yy_c_buf_p) = (yy_hold_char); + + if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. + * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + /* This was really a NUL. */ + *(yy_c_buf_p) = '\0'; + + else + { /* need more input */ + int offset = (yy_c_buf_p) - (yytext_ptr); + ++(yy_c_buf_p); + + switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + yyrestart(yyin ); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( yywrap( ) ) + return EOF; + + if ( ! 
(yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(); +#else + return input(); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = (yytext_ptr) + offset; + break; + } + } + } + + c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */ + *(yy_c_buf_p) = '\0'; /* preserve yytext */ + (yy_hold_char) = *++(yy_c_buf_p); + + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = (c == '\n'); + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * + * @note This function does not reset the start condition to @c INITIAL . + */ + void yyrestart (FILE * input_file ) +{ + + if ( ! YY_CURRENT_BUFFER ){ + yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer(yyin,YY_BUF_SIZE ); + } + + yy_init_buffer(YY_CURRENT_BUFFER,input_file ); + yy_load_buffer_state( ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * + */ + void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ) +{ + + /* TODO. We should be able to replace this entire function body + * with + * yypop_buffer_state(); + * yypush_buffer_state(new_buffer); + */ + yyensure_buffer_stack (); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + yy_load_buffer_state( ); + + /* We don't actually know whether we did this switch during + * EOF (yywrap()) processing, but the only time this flag + * is looked at is after yywrap() is called, so it's safe + * to go ahead and always set it. 
+ */ + (yy_did_buffer_switch_on_eof) = 1; +} + +static void yy_load_buffer_state (void) +{ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + (yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + (yy_hold_char) = *(yy_c_buf_p); +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. + * + * @return the allocated buffer state. + */ + YY_BUFFER_STATE yy_create_buffer (FILE * file, int size ) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) yyalloc(b->yy_buf_size + 2 ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + yy_init_buffer(b,file ); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with yy_create_buffer() + * + */ + void yy_delete_buffer (YY_BUFFER_STATE b ) +{ + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + yyfree((void *) b->yy_ch_buf ); + + yyfree((void *) b ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a yyrestart() or at EOF. + */ + static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file ) + +{ + int oerrno = errno; + + yy_flush_buffer(b ); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then yy_init_buffer was _probably_ + * called from yyrestart() or through yy_get_next_buffer. 
+ * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * + */ + void yy_flush_buffer (YY_BUFFER_STATE b ) +{ + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + yy_load_buffer_state( ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * + */ +void yypush_buffer_state (YY_BUFFER_STATE new_buffer ) +{ + if (new_buffer == NULL) + return; + + yyensure_buffer_stack(); + + /* This block is copied from yy_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + (yy_buffer_stack_top)++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from yy_switch_to_buffer. */ + yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. 
+ * + */ +void yypop_buffer_state (void) +{ + if (!YY_CURRENT_BUFFER) + return; + + yy_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + if ((yy_buffer_stack_top) > 0) + --(yy_buffer_stack_top); + + if (YY_CURRENT_BUFFER) { + yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void yyensure_buffer_stack (void) +{ + int num_to_alloc; + + if (!(yy_buffer_stack)) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; + (yy_buffer_stack) = (struct yy_buffer_state**)yyalloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + ); + + memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + (yy_buffer_stack_max) = num_to_alloc; + (yy_buffer_stack_top) = 0; + return; + } + + if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){ + + /* Increase the buffer to prepare for a possible push. */ + int grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = (yy_buffer_stack_max) + grow_size; + (yy_buffer_stack) = (struct yy_buffer_state**)yyrealloc + ((yy_buffer_stack), + num_to_alloc * sizeof(struct yy_buffer_state*) + ); + + /* zero only the new slots.*/ + memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*)); + (yy_buffer_stack_max) = num_to_alloc; + } +} + +/** Setup the input buffer state to scan directly from a user-specified character buffer. + * @param base the character buffer + * @param size the size in bytes of the character buffer + * + * @return the newly allocated buffer state object. 
+ */ +YY_BUFFER_STATE yy_scan_buffer (char * base, yy_size_t size ) +{ + YY_BUFFER_STATE b; + + if ( size < 2 || + base[size-2] != YY_END_OF_BUFFER_CHAR || + base[size-1] != YY_END_OF_BUFFER_CHAR ) + /* They forgot to leave room for the EOB's. */ + return 0; + + b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_buffer()" ); + + b->yy_buf_size = size - 2; /* "- 2" to take care of EOB's */ + b->yy_buf_pos = b->yy_ch_buf = base; + b->yy_is_our_buffer = 0; + b->yy_input_file = 0; + b->yy_n_chars = b->yy_buf_size; + b->yy_is_interactive = 0; + b->yy_at_bol = 1; + b->yy_fill_buffer = 0; + b->yy_buffer_status = YY_BUFFER_NEW; + + yy_switch_to_buffer(b ); + + return b; +} + +/** Setup the input buffer state to scan a string. The next call to yylex() will + * scan from a @e copy of @a str. + * @param yystr a NUL-terminated string to scan + * + * @return the newly allocated buffer state object. + * @note If you want to scan bytes that may contain NUL values, then use + * yy_scan_bytes() instead. + */ +YY_BUFFER_STATE yy_scan_string (yyconst char * yystr ) +{ + + return yy_scan_bytes(yystr,strlen(yystr) ); +} + +/** Setup the input buffer state to scan the given bytes. The next call to yylex() will + * scan from a @e copy of @a bytes. + * @param bytes the byte buffer to scan + * @param len the number of bytes in the buffer pointed to by @a bytes. + * + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE yy_scan_bytes (yyconst char * yybytes, int _yybytes_len ) +{ + YY_BUFFER_STATE b; + char *buf; + yy_size_t n; + int i; + + /* Get memory for full buffer, including space for trailing EOB's. */ + n = _yybytes_len + 2; + buf = (char *) yyalloc(n ); + if ( ! 
buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_scan_bytes()" ); + + for ( i = 0; i < _yybytes_len; ++i ) + buf[i] = yybytes[i]; + + buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR; + + b = yy_scan_buffer(buf,n ); + if ( ! b ) + YY_FATAL_ERROR( "bad buffer in yy_scan_bytes()" ); + + /* It's okay to grow etc. this buffer, and we should throw it + * away when we're done. + */ + b->yy_is_our_buffer = 1; + + return b; +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yy_fatal_error (yyconst char* msg ) +{ + (void) fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + yytext[yyleng] = (yy_hold_char); \ + (yy_c_buf_p) = yytext + yyless_macro_arg; \ + (yy_hold_char) = *(yy_c_buf_p); \ + *(yy_c_buf_p) = '\0'; \ + yyleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the current line number. + * + */ +int yyget_lineno (void) +{ + + return yylineno; +} + +/** Get the input stream. + * + */ +FILE *yyget_in (void) +{ + return yyin; +} + +/** Get the output stream. + * + */ +FILE *yyget_out (void) +{ + return yyout; +} + +/** Get the length of the current token. + * + */ +int yyget_leng (void) +{ + return yyleng; +} + +/** Get the current token. + * + */ + +char *yyget_text (void) +{ + return yytext; +} + +/** Set the current line number. + * @param line_number + * + */ +void yyset_lineno (int line_number ) +{ + + yylineno = line_number; +} + +/** Set the input stream. This does not discard the current + * input buffer. + * @param in_str A readable stream. 
+ * + * @see yy_switch_to_buffer + */ +void yyset_in (FILE * in_str ) +{ + yyin = in_str ; +} + +void yyset_out (FILE * out_str ) +{ + yyout = out_str ; +} + +int yyget_debug (void) +{ + return yy_flex_debug; +} + +void yyset_debug (int bdebug ) +{ + yy_flex_debug = bdebug ; +} + +static int yy_init_globals (void) +{ + /* Initialization is the same as for the non-reentrant scanner. + * This function is called from yylex_destroy(), so don't allocate here. + */ + + (yy_buffer_stack) = 0; + (yy_buffer_stack_top) = 0; + (yy_buffer_stack_max) = 0; + (yy_c_buf_p) = (char *) 0; + (yy_init) = 0; + (yy_start) = 0; + +/* Defined in main.c */ +#ifdef YY_STDINIT + yyin = stdin; + yyout = stdout; +#else + yyin = (FILE *) 0; + yyout = (FILE *) 0; +#endif + + /* For future reference: Set errno on error, since we are called by + * yylex_init() + */ + return 0; +} + +/* yylex_destroy is for both reentrant and non-reentrant scanners. */ +int yylex_destroy (void) +{ + + /* Pop the buffer stack, destroying each element. */ + while(YY_CURRENT_BUFFER){ + yy_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + yypop_buffer_state(); + } + + /* Destroy the stack itself. */ + yyfree((yy_buffer_stack) ); + (yy_buffer_stack) = NULL; + + /* Reset the globals. This is important in a non-reentrant scanner so the next time + * yylex() is called, initialization will occur. */ + yy_init_globals( ); + + return 0; +} + +/* + * Internal utility routines. 
+ */ + +#ifndef yytext_ptr +static void yy_flex_strncpy (char* s1, yyconst char * s2, int n ) +{ + register int i; + for ( i = 0; i < n; ++i ) + s1[i] = s2[i]; +} +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * s ) +{ + register int n; + for ( n = 0; s[n]; ++n ) + ; + + return n; +} +#endif + +void *yyalloc (yy_size_t size ) +{ + return (void *) malloc( size ); +} + +void *yyrealloc (void * ptr, yy_size_t size ) +{ + /* The cast to (char *) in the following accommodates both + * implementations that use char* generic pointers, and those + * that use void* generic pointers. It works with the latter + * because both ANSI C and C++ allow castless assignment from + * any pointer type to void*, and deal with argument conversions + * as though doing an assignment. + */ + return (void *) realloc( (char *) ptr, size ); +} + +void yyfree (void * ptr ) +{ + free( (char *) ptr ); /* see yyrealloc() for (char *) cast */ +} + +#define YYTABLES_NAME "yytables" + +#line 1314 "read_input.l" + + + +/* + * Dummy (but not dumb) routine - well, function + */ + +int yywrap() +{ + return(1); +} + +/* + * This is the function to parse the input file. + * default values for all paramters will be set + * correspondig to settings in + * default_input_values.h + * + * read_input expects the filename of the input file + * as an input parameter. + * + * read_input returns 2 if the input file did not exist + */ + +int read_input(char * conf_file){ + + /******************************************** + * Setting default values! 
+ ********************************************/ + +#ifndef FIXEDVOLUME + T_global = _default_T_global; + L = _default_L; + LX = _default_LX; + LY = _default_LY; + LZ = _default_LZ; + N_PROC_X = _default_N_PROC_X; + N_PROC_Y = _default_N_PROC_Y; + N_PROC_Z = _default_N_PROC_Z; +#endif + g_kappa = _default_g_kappa; + g_acc_Ptilde = _default_g_acc_Ptilde; + g_acc_Hfin = _default_g_acc_Hfin; + g_rec_ev = _default_g_rec_ev; + g_mubar = _default_g_mubar; + g_epsbar = _default_g_epsbar; + g_mu = _default_g_mu; + g_mu1 = _default_g_mu1; + g_mu2 = _default_g_mu2; + g_mu3 = _default_g_mu3; + g_beta = _default_g_beta; + g_c_sw = _default_g_c_sw; + dtau = _default_dtau; + tau = _default_tau; + Nsteps = _default_Nsteps; + nsmall = _default_nsmall; + integtyp = _default_integtyp; + random_seed = _default_random_seed; + matrix_element_flag = _default_matrix_element_flag; + solver_flag = _default_solver_flag; + operator_flag = _default_operator_flag; + startoption = _default_startoption; + Ntherm = _default_Ntherm; + Nmeas = _default_Nmeas; + Nskip = _default_Nskip; + save_config_flag = _default_save_config_flag; + save_prop_flag = _default_save_prop_flag; + save_prop_g2_flag = _default_save_prop_g2_flag; + write_cp_flag = _default_write_cp_flag; + cp_interval = _default_cp_interval; + nstore = _default_nstore; + strcpy(rlxd_input_filename, _default_rlxd_input_filename); + strcpy(gauge_input_filename, _default_gauge_input_filename); + g_stdio_proc = _default_g_stdio_proc; + index_start = _default_index_start; + index_end = _default_index_end; + first_prop_flag = _default_first_prop_flag; + ITER_MAX_CG = _default_ITER_MAX_CG; + ITER_MAX_BCG = _default_ITER_MAX_BCG; + X0 = _default_X0; + max_solver_iterations = _default_max_solver_iterations; + solver_precision = _default_solver_precision; + mass_number = _default_mass_number; + g_rgi_C1 = _default_g_rgi_C1; + read_source_flag= _default_read_source_flag; + strcpy(source_input_filename, _default_source_filename); + g_eps_sq_force = 
_default_g_eps_sq_force; + g_eps_sq_acc = _default_g_eps_sq_acc; + g_eps_sq_force1 = _default_g_eps_sq_force1; + g_eps_sq_acc1 = _default_g_eps_sq_acc1; + g_eps_sq_force2 = _default_g_eps_sq_force2; + g_eps_sq_acc2 = _default_g_eps_sq_acc2; + g_eps_sq_force3 = _default_g_eps_sq_force3; + g_eps_sq_acc3 = _default_g_eps_sq_acc3; + g_relative_precision_flag = _default_g_relative_precision_flag; + return_check_flag = _default_return_check_flag; + return_check_interval = _default_return_check_interval; + g_debug_level = _default_g_debug_level; + g_csg_N[0] = _default_g_csg_N; + g_csg_N[2] = _default_g_csg_N; + g_csg_N[4] = _default_g_csg_N; + g_csg_N[6] = _default_g_csg_N; + lambda[0] = _default_2mn_lambda; + lambda[1] = _default_2mn_lambda; + lambda[2] = _default_2mn_lambda; + lambda[3] = _default_2mn_lambda; + source_format_flag = _default_source_format_flag; + source_time_slice = _default_source_time_slice; + gmres_m_parameter = _default_gmres_m_parameter; + gmresdr_nr_ev = _default_gmresdr_nr_ev; + gauge_precision_read_flag = _default_gauge_precision_read_flag; + gauge_precision_write_flag = _default_gauge_precision_write_flag; + prop_precision_flag = _default_prop_precision_flag; + reproduce_randomnumber_flag = _default_reproduce_randomnumber_flag; + g_sloppy_precision_flag = _default_g_sloppy_precision_flag; + use_stout_flag = _default_use_stout_flag; + stout_rho = _default_stout_rho; + stout_no_iter = _default_stout_no_iter; + /* check for reread ! 
*/ + phmc_no_flavours = _default_phmc_no_flavours; + phmc_compute_evs = _default_phmc_compute_evs; + compute_evs = _default_compute_evs; + stilde_min = _default_stilde_min; + stilde_max = _default_stilde_max; + degree_of_p = _default_degree_of_p; + propagator_splitted = _default_propagator_splitted; + source_splitted = _default_source_splitted; + source_location = _default_source_location; + eigenvalue_precision = _default_eigenvalue_precision; + no_eigenvalues = _default_no_eigenvalues; + sub_evs_cg_flag = _default_sub_evs_cg_flag; + phmc_heavy_timescale = _default_phmc_heavy_timescale; + phmc_exact_poly = _default_phmc_exact_poly; + even_odd_flag = _default_even_odd_flag; + online_measurement_flag = _default_online_measurement_flag; + online_measurement_freq = _default_online_measurement_freq; + + /* Put -1 in write_prop_format_flag to see if parse_config() will + change the value. If not then set it to source_format_flag */ + write_prop_format_flag = -1; + /********************************************/ + + if ((yyin = fopen(conf_file, "rt")) == NULL){ + return(2); + } + yyout = fopen("/dev/null", "w"); + + parse_config(); +#ifndef FIXEDVOLUME + if(LX == 0) { + LX = L; + } + if(LY == 0) { + LY = L; + } + if(LZ == 0) { + LZ = L; + } +#endif + + if(g_eps_sq_force1 < 0) g_eps_sq_force1 = g_eps_sq_force; + if(g_eps_sq_force2 < 0) g_eps_sq_force2 = g_eps_sq_force; + if(g_eps_sq_force3 < 0) g_eps_sq_force3 = g_eps_sq_force; + if(g_eps_sq_acc1 < 0) g_eps_sq_acc1 = g_eps_sq_acc; + if(g_eps_sq_acc2 < 0) g_eps_sq_acc2 = g_eps_sq_acc; + if(g_eps_sq_acc3 < 0) g_eps_sq_acc3 = g_eps_sq_acc; + + if(write_prop_format_flag == -1) write_prop_format_flag = source_format_flag; + g_rgi_C0 = 1. - 8.*g_rgi_C1; + g_ka_csw_8 = g_kappa*g_c_sw/8.; + + fclose(yyout); + fclose(yyin); + return(0); +} + + +/* + * This is the function to parse the input file + * again. Only parameters are changed, that + * are specified in the input file. + * default values for paramters will not be set. 
+ * + * reread_input expects the filename of the input file + * as an input parameter. + * + * reread_input returns 2 if the input file did not exist + */ + +int reread_input(char * conf_file){ +#ifndef FIXEDVOLUME + int tt=T, ll=L, lx = LX, ly = LY, lz = LZ, + np=N_PROC_X, npy = N_PROC_Y; +#endif + int nst=nstore, j=0; + double m2 = g_mu2, m3 = g_mu3; + int n1 = g_csg_N[0], n2 = g_csg_N[2], n3 = g_csg_N[4], n4 = g_csg_N[6]; + double x; + + /******************************************** + * Setting default values! + ********************************************/ + + /********************************************/ + + if ((yyin = fopen(conf_file, "rt")) == NULL){ + return(2); + } + yyout = fopen("/dev/null", "w"); + + parse_config(); + +#ifndef FIXEDVOLUME + T = tt; + L = ll; + LX = lx; + LY = ly; + LZ = lz; + N_PROC_X = np; + N_PROC_Y = npy; +#endif + g_csg_N[0] = n1; + g_csg_N[2] = n2; + g_csg_N[4] = n3; + g_csg_N[6] = n4; + + + if(g_dbw2rand == 0) { + g_rgi_C1 = 0.; + } + nstore = nst; + + g_rgi_C0 = 1. - 8.*g_rgi_C1; + g_ka_csw_8 = g_kappa*g_c_sw/8.; + + if(g_mu3 > 0. && g_mu3 != m3) { + g_mu = g_mu1; + g_mu1 = g_mu3; + g_mu3 = g_mu; + + j = int_n[1]; + int_n[1] = int_n[3]; + int_n[3] = j; + + x = lambda[1]; + lambda[1] = lambda[3]; + lambda[3] = x; + + g_nr_of_psf = 3; + } + else if(g_mu2 > 0. 
&& g_mu2 != m2) { + g_mu = g_mu1; + g_mu1 = g_mu2; + g_mu2 = g_mu; + + int_n[3] = int_n[1]; + int_n[1] = int_n[2]; + int_n[2] = int_n[3]; + + lambda[3] = lambda[1]; + lambda[1] = lambda[2]; + lambda[2] = lambda[3]; + + g_nr_of_psf = 2; + } + for(j = 0; j < g_nr_of_psf+1; j++) { + if(int_n[j] == 0) int_n[j] = 1; + } + if(g_nr_of_psf == 3) { + g_eps_sq_force = g_eps_sq_force1; + g_eps_sq_force1 = g_eps_sq_force3; + g_eps_sq_force3 = g_eps_sq_force; + g_eps_sq_acc = g_eps_sq_acc1; + g_eps_sq_acc1 = g_eps_sq_acc3; + g_eps_sq_acc3 = g_eps_sq_acc; + } + if(g_nr_of_psf == 2) { + g_eps_sq_force = g_eps_sq_force1; + g_eps_sq_force1 = g_eps_sq_force2; + g_eps_sq_force2 = g_eps_sq_force; + g_eps_sq_acc = g_eps_sq_acc1; + g_eps_sq_acc1 = g_eps_sq_acc2; + g_eps_sq_acc2 = g_eps_sq_acc; + } + g_mu = g_mu1; + g_eps_sq_acc = g_eps_sq_acc1; + g_eps_sq_force = g_eps_sq_force1; + + fclose(yyout); + fclose(yyin); + return(0); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/read_input.h b/qcd/part_cpu/applications/QCD/src/kernel_D/read_input.h new file mode 100644 index 0000000000000000000000000000000000000000..f74b2519fe935ac3889af5da2de16fa99b3e630a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/read_input.h @@ -0,0 +1,104 @@ +/* $Id: read_input.h,v 1.31 2008/07/31 22:07:49 urbach Exp $ */ + +/* + * This is the function to parse the input file. + * No default values for any paramter will be set + * + * read_inputg expects the filename of the input file + * as an input parameter. 
+ * + * read_input returns 2 if the input file did not exist + */ + +#ifndef _PARSER_H +#define _PARSER_H + +#define COLD 0 +#define HOT 1 +#define RESTART 2 +#define CONTINUE 3 + +#ifdef __cplusplus +extern "C" +{ +#endif /* __cplusplus */ + + /* input parameters defined in */ + /* read_input.h */ + extern int verbose; + extern int startoption; + extern int Ntherm; + extern int Nmeas; + extern int Nskip; + extern int solver_flag; + extern int gmres_m_parameter, gmresdr_nr_ev; + extern int operator_flag; + extern int matrix_element_flag; + extern int save_config_flag; + extern int save_prop_flag; + extern int save_prop_g2_flag; + extern int write_cp_flag; + extern int cp_interval; + extern int nstore; + extern int int_n[4]; + extern double lambda[4]; + extern int crylov_space_dim; + extern char rlxd_input_filename[100]; + extern char gauge_input_filename[100]; + extern int subforwilson_flag; + extern int eigenvalue_method_flag; + extern int eigenvalue_max_iterations; + extern double eigenvalue_precision; + extern int index_start; + extern int index_end; + extern int first_prop_flag; + extern double dtau, tau; + extern int Nsteps; + extern int random_seed; + extern int integtyp,nsmall; + extern int ITER_MAX_BCG; + extern int ITER_MAX_CG; + extern double X0; + extern int max_solver_iterations; + extern double solver_precision; + extern int mass_number; + extern int read_source_flag; + extern char source_input_filename[100]; + extern int return_check_flag; + extern int return_check_interval; + extern int source_format_flag; + extern int source_time_slice; + extern int gauge_precision_read_flag; + extern int gauge_precision_write_flag; + extern int prop_precision_flag; + extern int reproduce_randomnumber_flag; + extern double stout_rho; + extern int stout_no_iter; + extern int use_stout_flag; + extern int phmc_no_flavours; + extern int phmc_heavy_timescale; + extern int phmc_compute_evs; + extern int phmc_exact_poly; + extern int compute_evs; + extern int 
no_eigenvalues; + extern double eigenvalue_precision; + extern double stilde_max; + extern double stilde_min; + extern int degree_of_p; + extern int propagator_splitted; + extern int source_splitted; + extern int source_location; + extern int sub_evs_cg_flag; + extern int even_odd_flag; + extern int write_prop_format_flag; + extern int online_measurement_flag; + extern int online_measurement_freq; + + int read_input(char *); + int reread_input(char *); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/reweighting_factor.c b/qcd/part_cpu/applications/QCD/src/kernel_D/reweighting_factor.c new file mode 100644 index 0000000000000000000000000000000000000000..bfc3685a61d58e1d83c9fdbd929cca66d85f7da3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/reweighting_factor.c @@ -0,0 +1,95 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "linalg_eo.h" +#include "start.h" +#include "monomial/monomial.h" +#include "hamiltonian_field.h" +#include "reweighting_factor.h" + +void reweighting_factor(const int N, const int nstore) { + int i, j, n = VOLUME; + double sq_norm, x, y; + double * sum, * sum_sq; + monomial * mnl; + FILE * ofs; + hamiltonian_field_t hf; + + hf.gaugefield = g_gauge_field; + hf.momenta = NULL; + hf.derivative = NULL; + hf.update_gauge_copy = g_update_gauge_copy; + + sum = (double*)calloc(no_monomials, sizeof(double)); + sum_sq = (double*)calloc(no_monomials, sizeof(double)); + + for(i = 0; i < N; i++) { + sq_norm = 0.; + for(j = 0; j < no_monomials; j++) { + mnl = &monomial_list[j]; + if(mnl->type != GAUGE) { + if(mnl->even_odd_flag) { + random_spinor_field_eo(mnl->pf, mnl->rngrepro, RN_GAUSS); + } + else random_spinor_field_lexic(mnl->pf, mnl->rngrepro, RN_GAUSS); + mnl->energy0 = square_norm(mnl->pf, n, 1); + if(mnl->type == NDDETRATIO) { + if(mnl->even_odd_flag) { + random_spinor_field_eo(mnl->pf2, mnl->rngrepro, RN_GAUSS); + } + else random_spinor_field_lexic(mnl->pf, mnl->rngrepro, RN_GAUSS); + mnl->energy0 += square_norm(mnl->pf2, n, 1); + } + } + } + + for(j = 0; j < no_monomials; j++) { + mnl = &monomial_list[j]; + if(mnl->type != GAUGE) { + y = mnl->accfunction(j, &hf); + sq_norm -= y; + x = exp(sq_norm); + sum[j] += x; + sum_sq[j] += x*x; + if(g_proc_id == 0 && g_debug_level > 0) { + printf("monomial[%d] %s, w_%d=%e W=%e\n", j, mnl->name, j, y, x); + } + } + } + } + + if(g_proc_id == 0) { + ofs = fopen("reweighting_factor.data", "a"); + fprintf(ofs, "%d ", nstore); + for(j = 0; j < no_monomials; j++) { + fprintf(ofs, "%e %e ", sum[j]/N, sqrt((-sum[j]*sum[j]/N/N + sum_sq[j]/N)/(N-1)/N)); + } + fprintf(ofs, "\n"); + fclose(ofs); + } +} + diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_D/reweighting_factor.h b/qcd/part_cpu/applications/QCD/src/kernel_D/reweighting_factor.h new file mode 100644 index 0000000000000000000000000000000000000000..76f872a87078b34045313cf050342e98f5a43df5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/reweighting_factor.h @@ -0,0 +1,27 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + + +#ifndef _REWEIGHTING_FACTOR_H +#define _REWEIGHTING_FACTOR_H + +void reweighting_factor(const int N, const int nstore); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/reweighting_factor_nd.c b/qcd/part_cpu/applications/QCD/src/kernel_D/reweighting_factor_nd.c new file mode 100644 index 0000000000000000000000000000000000000000..87f8a927406c7c98dfce00cb7ff027e7f1ca24f7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/reweighting_factor_nd.c @@ -0,0 +1,90 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "linalg_eo.h" +#include "start.h" +#include "operator/tm_operators.h" +#include "operator/tm_operators_nd.h" +#include "Ptilde_nd.h" +#include "phmc.h" +#include "reweighting_factor_nd.h" + +double reweighting_factor_nd(const int N, const int repro) +{ + int i, n_iter; + double sq_norm, corr, sum=0., sq_sum = 0., temp1; + double mu1, mu2; + + _Complex double temp2; + + mu1 = g_mu1; + mu2 = g_mu1; + + /* Use spinor_field 2,3,5 */ + /* in order not to conflict with anything else... 
*/ + + for(i = 0; i < N; ++i) + { + random_spinor_field_eo(g_chi_up_spinor_field[2], repro, RN_GAUSS); + random_spinor_field_eo(g_chi_dn_spinor_field[2], repro, RN_GAUSS); + zero_spinor_field(g_chi_up_spinor_field[3], VOLUME/2); + zero_spinor_field(g_chi_dn_spinor_field[3], VOLUME/2); + + temp1 = phmc_ptilde_cheby_coef[0]; + phmc_ptilde_cheby_coef[0] = temp1 - 1; + + Ptilde_ndpsi(g_chi_up_spinor_field[3], g_chi_dn_spinor_field[3], phmc_ptilde_cheby_coef, phmc_ptilde_n_cheby, g_chi_up_spinor_field[2], g_chi_dn_spinor_field[2], &Qtm_pm_ndpsi); + + phmc_ptilde_cheby_coef[0] = temp1; + + temp2 = scalar_prod(g_chi_up_spinor_field[2], g_chi_up_spinor_field[3], VOLUME / 2, 1); + if(cimag(temp2) > 1.0e-8) + { + printf("!!! WARNING Immaginary part of CORR-UP LARGER than 10^-8 !!! \n"); + printf(" CORR-UP: Re=%12.10e Im=%12.10e \n", creal(temp2), cimag(temp2)); + } + corr = temp2; + printf(" CORR-UP: Re=%12.10e \n", corr); + temp2 = scalar_prod(g_chi_dn_spinor_field[2], g_chi_dn_spinor_field[3], VOLUME / 2, 1); + if(cimag(temp2) > 1.0e-8) + { + printf("!!! WARNING Immaginary part of CORR_DN LARGER than 10^-8 !!! 
\n"); + printf(" CORR-DN: Re=%12.10e Im=%12.10e \n", creal(temp2), cimag(temp2)); + } + corr += temp2; + printf(" CORR-DN: Re=%12.10e \n", cimag(temp2)); + + temp1 = -corr; + sum += temp1; + sq_sum += temp1 * temp1; + printf("rew: n_iter = %d, sq_norm = %e, corr = %e\n", n_iter, sq_norm, corr); + } + sum /= N; + sq_sum /= N; + printf("rew: factor = %e, err = %e\n", sum, sqrt(sum * sum - sq_sum) / (N - 1)); + return(sum); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/reweighting_factor_nd.h b/qcd/part_cpu/applications/QCD/src/kernel_D/reweighting_factor_nd.h new file mode 100644 index 0000000000000000000000000000000000000000..c2b84f4879a42e32a7001ed41e08868ab522b66f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/reweighting_factor_nd.h @@ -0,0 +1,25 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _REWEIGHTING_FACTOR_ND_H +#define _REWEIGHTING_FACTOR_ND_H + +double reweighting_factor_nd(const int N, const int repro); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/rnd_gauge_trafo.c b/qcd/part_cpu/applications/QCD/src/kernel_D/rnd_gauge_trafo.c new file mode 100644 index 0000000000000000000000000000000000000000..3f5fe71299588a6658dacbca2c26b1a68bc82338 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/rnd_gauge_trafo.c @@ -0,0 +1,72 @@ +/*********************************************************************** + * + * Copyright (C) 2003 Mauro Papinutto + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * perform a random gauge transformation + * + * + *******************************************************************************/ + +#if HAVE_CONFIG_H +#include +#endif +#include +#include +#include "global.h" +#include "su3.h" +#include "start.h" +#include "rnd_gauge_trafo.h" + +void rnd_gauge_trafo(const int repro, su3 ** const gf){ + int ix,iy,mu; + static su3 u,v,w,x,y; + su3 * _gauge_trafo = NULL; + su3 * gauge_trafo = NULL; + + if((_gauge_trafo = calloc(VOLUMEPLUSRAND+1, sizeof(su3))) == NULL) { + fprintf(stderr, "Could not allocate memory in rnd_gauge_trafo. 
Exiting!\n"); + exit(0); + } + gauge_trafo = (su3*)(((unsigned long int)(gauge_trafo)+ALIGN_BASE)&~ALIGN_BASE); + + random_gauge_field(repro, gauge_trafo); + +#ifdef MPI + xchange_gauge(gauge_trafo); +#endif + + for (ix=0;ix 1) + +// Precision of cl_F corresponding to MAXORD. +// +// A good guess is: DIGIT = 70+2.8*MAXORD +// +// but one has to check this by two runs with increasing precision. + +// 700 +#define DIGIT 700 // Precision of cl_F + + +// Define constants to the desired precision + +cl_F ONE = "1.0e+0_700"; // Precise 1 +cl_F TWO = "2.0e+0_700"; // Precise 2 +cl_F ZERO = "0.0e+0_700"; // Precise 0 +cl_F HALF = "0.5e+0_700"; // Precise 0.5 +cl_F HUND = "100.e+0_700"; + +// Define basic parameters to the desired precision + +int MAXPOW = 48; + +/* cl_F ALPHA = "-0.500e+0_700", */ +/* EPSILON = "0.1e+0_700", */ +/* LAMBDA = "1.00e+0_700"; */ + +cl_F ALPHA = "0.500e+0_700", + EPSILON = "0.0043e+0_700", + LAMBDA = "1.e+0_700"; + +// Define output format and files + +char Format[] = "C"; +char Filename[] = "recur_A25_8_002.cff"; +char Filenamr[] = "roots_A25_8_002.cff"; + +/******************************************************************************/ + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/clover_roots.dat b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/clover_roots.dat new file mode 100644 index 0000000000000000000000000000000000000000..2999e60842ecdf01694d3a3027039c73815e6b6a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/clover_roots.dat @@ -0,0 +1,97 @@ +Nr. 
Re Im +0 -8.9098604765327671e-01 4.5074023389346268e-02 +1 -7.0521755425263163e-02 6.0986181422472217e-02 +2 8.5940433190103771e-01 5.0319968469774365e-02 +3 1.0043512422763268e+00 3.1636826180314933e-03 +4 -5.4886289787580045e-01 7.7329052278625937e-02 +5 4.9397115089376387e-01 7.9385556139429547e-02 +6 -9.7968537134226641e-01 2.1906208061652683e-02 +7 -3.1854163755834597e-01 8.1675780066108220e-02 +8 6.9938103374781302e-01 6.8159933702328160e-02 +9 9.6335510933106017e-01 2.7960365396897370e-02 +10 -7.4412011327798677e-01 6.4234828280787096e-02 +11 2.5742534856994170e-01 8.0679880062335896e-02 +12 -9.4306964853078168e-01 3.3861735312366539e-02 +13 -1.9555476022692905e-01 7.8209886572080278e-02 +14 7.8581782322697336e-01 5.9931944976320019e-02 +15 9.9199268902477511e-01 1.5732777929054854e-02 +16 -6.5178876086130810e-01 7.1674463081838988e-02 +17 3.7854117416401190e-01 8.1660806016135512e-02 +18 -1.0002260243442995e+00 9.4739108135040214e-03 +19 -4.3711111468617142e-01 8.0856014824308473e-02 +20 6.0154461274290694e-01 7.4743652461674512e-02 +21 9.1891318465105021e-01 3.9577188890777028e-02 +22 -8.2429947903305645e-01 5.5283143607108863e-02 +23 1.3332587028590551e-01 7.3063488817650540e-02 +24 -9.1891318465105021e-01 3.9577188890777028e-02 +25 -1.3332587028590551e-01 7.3063488817650540e-02 +26 8.2429947903305645e-01 5.5283143607108863e-02 +27 1.0002260243442995e+00 9.4739108135040214e-03 +28 -6.0154461274290694e-01 7.4743652461674512e-02 +29 4.3711111468617142e-01 8.0856014824308473e-02 +30 -9.9199268902477511e-01 1.5732777929054854e-02 +31 -3.7854117416401190e-01 8.1660806016135512e-02 +32 6.5178876086130810e-01 7.1674463081838988e-02 +33 9.4306964853078168e-01 3.3861735312366539e-02 +34 -7.8581782322697336e-01 5.9931944976320019e-02 +35 1.9555476022692905e-01 7.8209886572080278e-02 +36 -9.6335510933106017e-01 2.7960365396897370e-02 +37 -2.5742534856994170e-01 8.0679880062335896e-02 +38 7.4412011327798677e-01 6.4234828280787096e-02 +39 9.7968537134226641e-01 
2.1906208061652683e-02 +40 -6.9938103374781302e-01 6.8159933702328160e-02 +41 3.1854163755834597e-01 8.1675780066108220e-02 +42 -1.0043512422763268e+00 3.1636826180314933e-03 +43 -4.9397115089376387e-01 7.9385556139429547e-02 +44 5.4886289787580045e-01 7.7329052278625937e-02 +45 8.9098604765327671e-01 4.5074023389346268e-02 +46 -8.5940433190103771e-01 5.0319968469774365e-02 +47 7.0521755425263163e-02 6.0986181422472217e-02 +48 7.0521755425263163e-02 -6.0986181422472217e-02 +49 -8.5940433190103771e-01 -5.0319968469774365e-02 +50 8.9098604765327671e-01 -4.5074023389346268e-02 +51 5.4886289787580045e-01 -7.7329052278625937e-02 +52 -4.9397115089376387e-01 -7.9385556139429547e-02 +53 -1.0043512422763268e+00 -3.1636826180314933e-03 +54 3.1854163755834597e-01 -8.1675780066108220e-02 +55 -6.9938103374781302e-01 -6.8159933702328160e-02 +56 9.7968537134226641e-01 -2.1906208061652683e-02 +57 7.4412011327798677e-01 -6.4234828280787096e-02 +58 -2.5742534856994170e-01 -8.0679880062335896e-02 +59 -9.6335510933106017e-01 -2.7960365396897370e-02 +60 1.9555476022692905e-01 -7.8209886572080278e-02 +61 -7.8581782322697336e-01 -5.9931944976320019e-02 +62 9.4306964853078168e-01 -3.3861735312366539e-02 +63 6.5178876086130810e-01 -7.1674463081838988e-02 +64 -3.7854117416401190e-01 -8.1660806016135512e-02 +65 -9.9199268902477511e-01 -1.5732777929054854e-02 +66 4.3711111468617142e-01 -8.0856014824308473e-02 +67 -6.0154461274290694e-01 -7.4743652461674512e-02 +68 1.0002260243442995e+00 -9.4739108135040214e-03 +69 8.2429947903305645e-01 -5.5283143607108863e-02 +70 -1.3332587028590551e-01 -7.3063488817650540e-02 +71 -9.1891318465105021e-01 -3.9577188890777028e-02 +72 1.3332587028590551e-01 -7.3063488817650540e-02 +73 -8.2429947903305645e-01 -5.5283143607108863e-02 +74 9.1891318465105021e-01 -3.9577188890777028e-02 +75 6.0154461274290694e-01 -7.4743652461674512e-02 +76 -4.3711111468617142e-01 -8.0856014824308473e-02 +77 -1.0002260243442995e+00 -9.4739108135040214e-03 +78 3.7854117416401190e-01 
-8.1660806016135512e-02 +79 -6.5178876086130810e-01 -7.1674463081838988e-02 +80 9.9199268902477511e-01 -1.5732777929054854e-02 +81 7.8581782322697336e-01 -5.9931944976320019e-02 +82 -1.9555476022692905e-01 -7.8209886572080278e-02 +83 -9.4306964853078168e-01 -3.3861735312366539e-02 +84 2.5742534856994170e-01 -8.0679880062335896e-02 +85 -7.4412011327798677e-01 -6.4234828280787096e-02 +86 9.6335510933106017e-01 -2.7960365396897370e-02 +87 6.9938103374781302e-01 -6.8159933702328160e-02 +88 -3.1854163755834597e-01 -8.1675780066108220e-02 +89 -9.7968537134226641e-01 -2.1906208061652683e-02 +90 4.9397115089376387e-01 -7.9385556139429547e-02 +91 -5.4886289787580045e-01 -7.7329052278625937e-02 +92 1.0043512422763268e+00 -3.1636826180314933e-03 +93 8.5940433190103771e-01 -5.0319968469774365e-02 +94 -7.0521755425263163e-02 -6.0986181422472217e-02 +95 -8.9098604765327671e-01 -4.5074023389346268e-02 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/extra_masses.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/extra_masses.input new file mode 100644 index 0000000000000000000000000000000000000000..b7fc265b8d6b733c3db10e0a9386962bde0066f5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/extra_masses.input @@ -0,0 +1,5 @@ +0.15 +0.22 +0.34 +0.55 + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/normierungLocal.dat b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/normierungLocal.dat new file mode 100644 index 0000000000000000000000000000000000000000..8ed697eb68dd55fde095b5750451af02ef2328cc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/normierungLocal.dat @@ -0,0 +1 @@ +3.3394134092406311254 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/offline_measurement.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/offline_measurement.input new file mode 100644 index 
0000000000000000000000000000000000000000..60517b9c6e31a256aeb0589902080bcf0b349cd7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/offline_measurement.input @@ -0,0 +1,51 @@ +# example input file for offline measurements using "offline_measurement" +# requires 2 8^4 gauge configuration conf.0000 and conf.0002 + +L=8 +T=8 + +DebugLevel = 5 +ompnumthreads=4 + +InitialStoreCounter = 0 +Measurements = 2 +# measurements will be carried out in nsave steps +# e.g. for conf.0000 and conf.0002 in this case +nsave=2 +2kappamu = 0.05 +kappa = 0.177 +BCAngleT = 1 +GaugeConfigInputFile = conf +UseEvenOdd = yes + +# the correlators measurement requires ONE operator to be defined +# if multiple operators are defined, only the first one is used! +BeginMeasurement CORRELATORS + Frequency = 1 +EndMeasurement + +BeginMeasurement POLYAKOVLOOP + Frequency = 1 +EndMeasurement + +BeginMeasurement ORIENTEDPLAQUETTES + Frequency = 1 +EndMeasurement + +# requirements are the same as for the correlators measurement +BeginMeasurement PIONNORM + Frequency = 1 +EndMeasurement + +# note: setting the solver to CGMMS will result in the CGMMS inversion taking place +# because the solver is not properly decoupled form the rest of the code +BeginOperator TMWILSON + 2kappaMu = 0.05 + kappa = 0.177 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 + AddDownPropagator = no +EndOperator + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-bicgstab.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-bicgstab.input new file mode 100644 index 0000000000000000000000000000000000000000..77b3a05f5bf1af8a0f100d8d51cf3906fddce7e8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-bicgstab.input @@ -0,0 +1,30 @@ +# example input file for invert +# for bicgstab solver +# requires a 4^4 gauge configuration conf.0000 + +L=4 +T=4 + +DebugLevel = 5 +InitialStoreCounter = 0 
+Measurements = 1 +2kappamu = 0.05 +kappa = 0.177 +BCAngleT = 1 +GaugeConfigInputFile = conf +UseEvenOdd = yes + +SourceType = Volume +ReadSource = no +NoSamples = 12 + +BeginOperator TMWILSON + 2kappaMu = 0.05 + kappa = 0.177 + UseEvenOdd = yes + Solver = bicgstab + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 + AddDownPropagator = yes +EndOperator + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-cg-loop.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-cg-loop.input new file mode 100644 index 0000000000000000000000000000000000000000..9b7da16c5f350b7c9b461e7afae2a0ea6169a142 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-cg-loop.input @@ -0,0 +1,33 @@ +# example input file for invert +# for CG solver doing two gauges in one go +# requires two 4^4 gauge configuration conf.0000 and conf.0010 + +L=4 +T=4 + +DebugLevel = 5 +InitialStoreCounter = 0 +# loop over two gauges +Measurements = 2 +# use InitialStoreCounter, InitialStoreCounter + 10, ... +Nsave = 10 +2kappamu = 0.05 +kappa = 0.177 +BCAngleT = 1 +GaugeConfigInputFile = conf +UseEvenOdd = yes + +SourceType = Volume +ReadSource = no +NoSamples = 12 + +BeginOperator TMWILSON + 2kappaMu = 0.05 + kappa = 0.177 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 + AddDownPropagator = yes +EndOperator + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-cg-tmclover.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-cg-tmclover.input new file mode 100644 index 0000000000000000000000000000000000000000..993c19c2b234ed35650e485783469ec297e05d20 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-cg-tmclover.input @@ -0,0 +1,34 @@ +# sample inverter file for a twisted clover operator + +L=4 +T=4 + +Measurements = 1 +InitialStoreCounter = 0 +2KappaMu = 0.1 +csw = 1.00 +kappa = 0.160 +ThetaT = 1. 
+UseEvenOdd = yes +DebugLevel = 2 +NoSamples = 1 +SourceType = Point +SplittedPropagator = yes +Indices = 0 +ReadSource = no +UseRelativePrecision = yes +UseSloppyPrecision = yes + +BeginOperator CLOVER + 2KappaMu = 0.1 + kappa = 0.160 + csw = 1.00 +# the following are not available for this operator (yet) +# Solver = CG +# UseEvenOdd = yes + SolverPrecision = 1.e-16 + MaxSolverIterations = 100 + PropagatorPrecision = 64 + AddDownPropagator = yes +EndOperator + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-cg.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-cg.input new file mode 100644 index 0000000000000000000000000000000000000000..3229eab2c70d4e2e7d03e42afc3e2375f1c9498a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-cg.input @@ -0,0 +1,38 @@ +# example input file for invert +# for CG solver +# requires a 4^4 gauge configuration conf.0000 + +L=4 +T=4 + +DebugLevel = 5 +InitialStoreCounter = 0 +Measurements = 1 +2kappamu = 0.05 +kappa = 0.177 +BCAngleT = 1 +GaugeConfigInputFile = conf +UseEvenOdd = yes + +SourceType = Volume +ReadSource = no +NoSamples = 12 + +BeginOperator TMWILSON + 2kappaMu = 0.05 + kappa = 0.177 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 + AddDownPropagator = yes +EndOperator + +BeginOperator DBTMWILSON + 2KappaMubar = 0.139 + 2KappaEpsbar = 0.15 + kappa = 0.177 + Solver = CG + SolverPrecision = 1e-12 +EndOperator + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-cgmms.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-cgmms.input new file mode 100644 index 0000000000000000000000000000000000000000..b882b8b08964abcb57d323d17a177c697c2333ac --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-cgmms.input @@ -0,0 +1,53 @@ +# example input file for CGMMS solver +# requires a 4^4 gauge configuration conf.0000 +# +# Extra masses can be 
specified in this file directly as +# one line of floating point numbers separated by commas +# or by naming an extra file which contains the extra masses +# one on each line + +# NOTE: Masses must be specified as 2*Kappa*Mu + +# the maximum number of extra masses is configured +# by setting MAX_EXTRA_MASSES in global.h + +L=4 +T=4 + +DebugLevel = 5 +InitialStoreCounter = 0 +Measurements = 1 +2kappamu = 0.05 +kappa = 0.177 +BCAngleT = 1 +GaugeConfigInputFile = conf +UseEvenOdd = no + +SourceType = Volume +ReadSource = no +NoSamples = 12 + +BeginOperator TMWILSON + 2kappaMu = 0.05 + kappa = 0.177 +# this is mandatory + UseEvenOdd = no + Solver = CGMMS + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +# this is automatic + AddDownPropagator = no + ExtraMasses = 0.06,0.07,0.08,0.10,0.12,0.14,0.16,0.19 +EndOperator + +BeginOperator TMWILSON + 2KappaMu = 0.09 + Kappa = 0.164 + UseEvenOdd = no + Solver = CGMMS + SolverPrecision = 1e-15 + MaxSolverIterations = 1000 + AddDownPropagator = no + ExtraMasses = extra_masses.input +EndOperator + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-cgs.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-cgs.input new file mode 100644 index 0000000000000000000000000000000000000000..4bb0abc3f22f10d936a3a18a4bc5b2f328c1b4f4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-cgs.input @@ -0,0 +1,31 @@ +# example input file for invert +# for CGS solver +# requires a 4^4 gauge configuration conf.0000 + +L=4 +T=4 + +DebugLevel = 5 +InitialStoreCounter = 0 +Measurements = 1 +2kappamu = 0.05 +kappa = 0.177 +BCAngleT = 1 +GaugeConfigInputFile = conf +UseEvenOdd = yes + +SourceType = Volume +ReadSource = no +NoSamples = 12 + +BeginOperator TMWILSON + 2kappaMu = 0.05 + kappa = 0.177 + UseEvenOdd = yes + Solver = CGS + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 + AddDownPropagator = yes +EndOperator + + diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-eigcg-tmclover-invert.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-eigcg-tmclover-invert.input new file mode 100644 index 0000000000000000000000000000000000000000..5ce70b09b692fc9d1e9163b91c411ffffe27c46c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-eigcg-tmclover-invert.input @@ -0,0 +1,45 @@ +# sample inverter file for a twisted clover operator + +L=16 +T=32 +NrXProcs = 1 +NrYProcs = 1 +NrZProcs = 1 +OmpNumThreads = 1 + +Measurements = 1 +InitialStoreCounter = 0 +2KappaMu = 0.0024135 +csw = 1.00 +kappa = 0.160900 +ThetaT = 1. +UseEvenOdd = yes +DebugLevel = 2 +NoSamples = 1 +SourceType = Point +SplittedPropagator = yes +Indices = 0 +ReadSource = no +UseRelativePrecision = yes +UseSloppyPrecision = no +DisableIOChecks = yes +GaugeConfigInputFile = conf + +BeginOperator CLOVER + 2KappaMu = 0.0024135 + kappa = 0.160900 + csw = 1.00 + Solver = INCREIGCG + #UseEvenOdd = yes + SolverPrecision = 1.e-16 + MaxSolverIterations = 2000 + PropagatorPrecision = 64 + AddDownPropagator = no + EigCGnrhs = 12 + EigCGnev = 10 + EigCGvmax = 40 + EigCGldh = 100 + EigCGrestolsq = 1e-8 +EndOperator + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-gmres.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-gmres.input new file mode 100644 index 0000000000000000000000000000000000000000..0558b8e3eaf4a89a5bfcd07a39c54aa9cda3462d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-gmres.input @@ -0,0 +1,35 @@ +# example input file for invert +# for GMRES(m) solver +# same for GCR, FGMRES, etc +# requires a 4^4 gauge configuration conf.0000 + +L=4 +T=4 + +DebugLevel = 5 +InitialStoreCounter = 0 +Measurements = 1 +2kappamu = 0.05 +kappa = 0.177 +BCAngleT = 1 +GaugeConfigInputFile = conf +UseEvenOdd = yes + +SourceType = Volume +ReadSource = no +NoSamples = 12 + +# GMRES parameter M (no of 
iterations until restart) +# this parameter is also used for GCR, FGMRES, etc... +GMRESMParameter = 10 + +BeginOperator TMWILSON + 2kappaMu = 0.05 + kappa = 0.177 + UseEvenOdd = yes + Solver = GMRES + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 + AddDownPropagator = yes +EndOperator + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-cloverdet.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-cloverdet.input new file mode 100644 index 0000000000000000000000000000000000000000..69f08eedc3b7045c362e9611d02302e5a67b8b22 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-cloverdet.input @@ -0,0 +1,59 @@ +# this is a sample input file for a single cloverdet +# monomial +# +# the plaquette value should be +# 0.5905(3) +# + +L=4 +T=4 +Measurements = 10000 +Startcondition = hot +2KappaMu = 0.0 +CSW = 1.76 +kappa = 0.138 +NSave = 500000 +ThetaT = 1.0 +UseEvenOdd = yes +ReversibilityCheck = yes +ReversibilityCheckIntervall = 100 +InitialStoreCounter = 0 +DebugLevel = 0 + +BeginMeasurement CORRELATORS + Frequency = 2 +EndMeasurement + +BeginMonomial GAUGE + Type = Wilson + beta = 5.60 + Timescale = 0 +EndMonomial + +BeginMonomial CLOVERDET + Timescale = 1 + CSW = 1.76 + kappa = 0.138 + AcceptancePrecision = 1.e-20 + ForcePrecision = 1.e-12 + Name = cloverdet + solver = CG +EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + IntegrationSteps0 = 2 + IntegrationSteps1 = 12 + tau = 1.00 + Lambda0 = 0.19 + Lambda1 = 0.20 + NumberOfTimescales = 2 +EndIntegrator + +BeginOperator CLOVER + CSW = 1.76 + kappa = 0.208333 + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +EndOperator diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-ndcloverrat.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-ndcloverrat.input new file mode 100644 index 0000000000000000000000000000000000000000..e4403b7c6700d960095ec6418850a892e0762296 --- 
/dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-ndcloverrat.input @@ -0,0 +1,96 @@ +# this is identical to sample-hmc2.input, just +# using NDRAT instead of NDPOLY +# and a different gauge action +# +# plaquette value should be 0.64875(5) + +L=4 +T=4 +Measurements = 20 +StartCondition = hot +2KappaMu = 0.01 +kappa = 0.170 +NSave = 500000 +ThetaT = 1 +UseEvenOdd = yes +ReversibilityCheck = yes +ReversibilityCheckIntervall = 100 +InitialStoreCounter = 0 +DebugLevel = 1 +OmpNumThreads = 1 +ReproduceRandomNumbers = yes + +BeginMeasurement CORRELATORS + Frequency = 2 +EndMeasurement + +BeginMonomial GAUGE + Type = tlsym + beta = 3.30 + Timescale = 0 +EndMonomial + +BeginMonomial DET + Timescale = 1 + 2KappaMu = 0.01 + kappa = 0.170 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + Name = det + Solver = CG +EndMonomial + +BeginMonomial NDCLOVERRAT + Timescale = 1 + kappa = 0.170 + CSW = 1. + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + StildeMin = 0.01225 + StildeMax = 3.5 + Name = ndrat + DegreeOfRational = 12 + Cmin = 0 + Cmax = 11 + ComputeEVFreq = 1 + 2KappaEpsBar = 0.0935 + 2Kappamubar = 0.1105 + AddTrLog = yes +EndMonomial + +# correction monomial for approximation error +BeginMonomial NDCLOVERRATCOR + Timescale = 1 + kappa = 0.170 + CSW = 1. 
+ AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + StildeMin = 0.01225 + StildeMax = 3.5 + Name = ndratcor + DegreeOfRational = 12 + ComputeEVFreq = 0 + 2KappaEpsBar = 0.0935 + 2Kappamubar = 0.1105 +EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + IntegrationSteps0 = 2 + IntegrationSteps1 = 10 + Tau = 1 + Lambda0 = 0.19 + Lambda1 = 0.20 + NumberOfTimescales = 2 +EndIntegrator + +BeginOperator TMWILSON + 2kappaMu = 0.01 + kappa = 0.170 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +EndOperator + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-ndrat-split.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-ndrat-split.input new file mode 100644 index 0000000000000000000000000000000000000000..25ae6b2e2dd296d3f57155e0cd8d410a86dac056 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-ndrat-split.input @@ -0,0 +1,117 @@ +# this is identical to sample-hmc2.input, just +# using two NDRAT instead of NDPOLY +# +# the expected plaquette value is 0.53347(17) +# the expected rect. plaq. 
value is 0.30393(22) +# +# PP correlator +# 1.963(2) +# 0.2846(2) +# 0.1078(2) +# +# smallest EV: 0.01890(3) +# largest EV: 0.82744(4) + +L=4 +T=4 +Measurements = 20 +StartCondition = hot +2KappaMu = 0.01 +kappa = 0.170 +NSave = 500000 +ThetaT = 1 +UseEvenOdd = yes +ReversibilityCheck = yes +ReversibilityCheckIntervall = 100 +InitialStoreCounter = 0 +DebugLevel = 1 +OmpNumThreads = 1 +ReproduceRandomNumbers = yes + +BeginMeasurement CORRELATORS + Frequency = 2 +EndMeasurement + +BeginMonomial GAUGE + Type = tlsym + beta = 3.30 + Timescale = 0 +EndMonomial + +BeginMonomial DET + Timescale = 1 + 2KappaMu = 0.01 + kappa = 0.170 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + Name = det + Solver = CG +EndMonomial + +BeginMonomial NDRAT + Timescale = 1 + kappa = 0.170 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + StildeMin = 0.013577 + StildeMax = 3.096935 + Name = ndrat + DegreeOfRational = 12 + Cmin = 0 + Cmax = 5 + ComputeEVFreq = 1 + 2KappaEpsBar = 0.0935 + 2Kappamubar = 0.1105 +EndMonomial + +BeginMonomial NDRAT + Timescale = 1 + kappa = 0.170 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + StildeMin = 0.013577 + StildeMax = 3.096935 + Name = ndrat + DegreeOfRational = 12 + Cmin = 6 + Cmax = 11 + ComputeEVFreq = 1 + 2KappaEpsBar = 0.0935 + 2Kappamubar = 0.1105 +EndMonomial + +# correction monomial for approximation error +BeginMonomial NDRATCOR + Timescale = 1 + kappa = 0.170 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + StildeMin = 0.013577 + StildeMax = 3.096935 + Name = ndratcor + DegreeOfRational = 12 + ComputeEVFreq = 0 + 2KappaEpsBar = 0.0935 + 2Kappamubar = 0.1105 +EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + IntegrationSteps0 = 2 + IntegrationSteps1 = 6 + Tau = 1 + Lambda0 = 0.19 + Lambda1 = 0.20 + NumberOfTimescales = 2 +EndIntegrator + +BeginOperator TMWILSON + 2kappaMu = 0.01 + kappa = 0.170 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +EndOperator + diff 
--git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-ndrat.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-ndrat.input new file mode 100644 index 0000000000000000000000000000000000000000..81c8226cd84a4b8a9a5e010beab42563519b8dbd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-ndrat.input @@ -0,0 +1,101 @@ +# this is identical to sample-hmc2.input, just +# using NDRAT instead of NDPOLY +# +# the expected plaquette value is 0.53347(17) +# the expected rect. plaq. value is 0.30393(22) +# +# PP correlator +# 1.963(2) +# 0.2846(2) +# 0.1078(2) +# +# smallest EV: 0.01890(3) +# largest EV: 0.82744(4) + +L=4 +T=4 +Measurements = 20 +StartCondition = hot +2KappaMu = 0.01 +kappa = 0.170 +NSave = 500000 +ThetaT = 1 +UseEvenOdd = yes +ReversibilityCheck = yes +ReversibilityCheckIntervall = 100 +InitialStoreCounter = 0 +DebugLevel = 1 +OmpNumThreads = 1 +ReproduceRandomNumbers = yes + +BeginMeasurement CORRELATORS + Frequency = 2 +EndMeasurement + +BeginMonomial GAUGE + Type = tlsym + beta = 3.30 + Timescale = 0 +EndMonomial + +BeginMonomial DET + Timescale = 1 + 2KappaMu = 0.01 + kappa = 0.170 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + Name = det + Solver = CG +EndMonomial + +BeginMonomial NDRAT + Timescale = 1 + kappa = 0.170 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + StildeMin = 0.013577 + StildeMax = 3.096935 + Name = ndrat + DegreeOfRational = 12 + Cmin = 0 + Cmax = 11 + ComputeEVFreq = 1 + 2KappaEpsBar = 0.0935 + 2Kappamubar = 0.1105 +EndMonomial + +# correction monomial for approximation error +BeginMonomial NDRATCOR + Timescale = 1 + kappa = 0.170 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + StildeMin = 0.013577 + StildeMax = 3.096935 + Name = ndratcor + DegreeOfRational = 12 + ComputeEVFreq = 0 + 2KappaEpsBar = 0.0935 + 2Kappamubar = 0.1105 +EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + IntegrationSteps0 = 2 + IntegrationSteps1 
= 6 + Tau = 1 + Lambda0 = 0.19 + Lambda1 = 0.20 + NumberOfTimescales = 2 +EndIntegrator + +BeginOperator TMWILSON + 2kappaMu = 0.01 + kappa = 0.170 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +EndOperator + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-poly.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-poly.input new file mode 100644 index 0000000000000000000000000000000000000000..0dae0937f8a3e1b32aea41eedf8204c6816f7c7d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-poly.input @@ -0,0 +1,67 @@ +# the expected plaquette value is 0.62450(5) +# +# The PP correlator is (2KappaMu = 0.177, kappa = 0.177) +# still to be measured +# t C(t) +# 0 ? +# 1 ? +# 2 ? + +L=4 +T=4 + +Measurements = 10000 +2KappaMu = 0.177 +kappa = 0.177 +NSave = 10000 +ThetaT = 1 +UseEvenOdd = yes +ReversibilityCheck = yes +ReversibilityCheckIntervall = 100 +DebugLevel = 1 +StartCondition = hot +ComputeEVs = no + +BeginMeasurement CORRELATORS + Frequency = 5 +EndMeasurement + +BeginMonomial GAUGE + Type = Wilson + Beta = 6.00 + Timescale = 0 +EndMonomial + + +BeginMonomial POLY + Timescale = 1 + Degree = 90 + Lmin = 0.1 + Lmax = 4.0 + LocNormConst = 3.0187720224543191 + 2KappaMu = 0.177 + Kappa = 0.177 + RootsFile = "Square_root_BR_roots.dat.oox.90.2.5000000000000001e-02" + AcceptancePrecision = 1.e-20 + ForcePrecision = 1.e-12 +EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + IntegrationSteps0 = 3 + IntegrationSteps1 = 20 + Tau = 2. 
+ Lambda0 = 0.19 + Lambda1 = 0.20 + NumberOfTimescales = 2 +EndIntegrator + +BeginOperator TMWILSON + 2KappaMu = 0.177 + kappa = 0.177 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +EndOperator diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-rat.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-rat.input new file mode 100644 index 0000000000000000000000000000000000000000..02a95cc9041b635ef1add091ced0ed15f8cc4111 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-rat.input @@ -0,0 +1,86 @@ + +L=4 +T=4 +Measurements = 10 +StartCondition = hot +2KappaMu = 0.01 +kappa = 0.170 +NSave = 500000 +ThetaT = 1 +UseEvenOdd = yes +ReversibilityCheck = yes +ReversibilityCheckIntervall = 100 +InitialStoreCounter = 0 +DebugLevel = 4 +OmpNumThreads = 1 +ReproduceRandomNumbers = yes + +BeginMeasurement CORRELATORS + Frequency = 2 +EndMeasurement + +BeginMonomial GAUGE + Type = tlsym + beta = 3.30 + Timescale = 0 +EndMonomial + +BeginMonomial DET + Timescale = 1 + 2KappaMu = 0.01 + kappa = 0.170 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + Name = det + Solver = CG +EndMonomial + +# single Wilson or Wilson clover fermion +# not for twisted fermions +BeginMonomial RAT + Timescale = 1 + kappa = 0.170 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + StildeMin = 0.013577 + StildeMax = 3.096935 + Name = rat + DegreeOfRational = 12 + Cmin = 0 + Cmax = 11 + ComputeEVFreq = 1 +EndMonomial + +# correction monomial for approximation error +BeginMonomial RATCOR + Timescale = 1 + kappa = 0.170 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + StildeMin = 0.013577 + StildeMax = 3.096935 + Name = ratcor + DegreeOfRational = 12 + ComputeEVFreq = 0 +EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + IntegrationSteps0 = 2 + IntegrationSteps1 = 8 + Tau = 1 + Lambda0 = 0.19 + Lambda1 = 0.20 + NumberOfTimescales = 2 +EndIntegrator + 
+BeginOperator TMWILSON + 2kappaMu = 0.0 + kappa = 0.170 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +EndOperator + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-tmcloverdet.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-tmcloverdet.input new file mode 100644 index 0000000000000000000000000000000000000000..a5644902a0ee514e4047122c53f58f16a6859ad4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-tmcloverdet.input @@ -0,0 +1,62 @@ +# this is a sample input file for a single cloverdet +# monomial with a twisted mass +# +# the plaquette value should be +# 0.57375(7) +# + +L=4 +T=4 +Measurements = 10000 +Startcondition = hot +2KappaMu = 0.0 +CSW = 1.76 +kappa = 0.138 +NSave = 500000 +ThetaT = 1.0 +UseEvenOdd = yes +ReversibilityCheck = yes +ReversibilityCheckIntervall = 100 +InitialStoreCounter = 0 +DebugLevel = 0 + +BeginMeasurement CORRELATORS + Frequency = 2 +EndMeasurement + +BeginMonomial GAUGE + Type = Wilson + beta = 5.60 + Timescale = 0 +EndMonomial + +BeginMonomial CLOVERDET + Timescale = 1 + 2KappaMu = 0.01 + rho = 0.0 + CSW = 1.00 + kappa = 0.138 + AcceptancePrecision = 1.e-20 + ForcePrecision = 1.e-12 + Name = cloverdet + solver = CG +EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + IntegrationSteps0 = 2 + IntegrationSteps1 = 8 + tau = 1.00 + Lambda0 = 0.19 + Lambda1 = 0.20 + NumberOfTimescales = 2 +EndIntegrator + +BeginOperator CLOVER + 2KappaMu = 0.01 + CSW = 1.00 + kappa = 0.138 + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +EndOperator diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-tmcloverdetratio.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-tmcloverdetratio.input new file mode 100644 index 0000000000000000000000000000000000000000..cf672d6e78555c390487318aadb3df9fa1ccab58 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc-tmcloverdetratio.input @@ -0,0 +1,78 @@ +# this is a sample input file for a cloverdet + cloverdetratio +# monomial +# +# the plaquette value should be +# 0.57375(7) +# + +L=4 +T=4 +Measurements = 10000 +Startcondition = hot +2KappaMu = 0.01 +CSW = 1.00 +kappa = 0.138 +NSave = 500000 +ThetaT = 1.0 +UseEvenOdd = yes +ReversibilityCheck = yes +ReversibilityCheckIntervall = 100 +InitialStoreCounter = 0 +DebugLevel = 0 + +BeginMeasurement CORRELATORS + Frequency = 2 +EndMeasurement + +BeginMonomial GAUGE + Type = Wilson + beta = 5.60 + Timescale = 0 +EndMonomial + +BeginMonomial CLOVERDET + Timescale = 1 + 2KappaMu = 0.01 + CSW = 1.00 + # nominator shift + rho = 0.1 + kappa = 0.138 + AcceptancePrecision = 1.e-20 + ForcePrecision = 1.e-12 + Name = cloverdet + solver = CG +EndMonomial + +BeginMonomial CLOVERDETRATIO + Timescale = 1 + 2KappaMu = 0.01 + # nominator shift + rho = 0.0 + # denominator shift, should match CLOVERDET shift + rho2 = 0.1 + CSW = 1.00 + kappa = 0.138 + AcceptancePrecision = 1.e-20 + ForcePrecision = 1.e-12 + Name = cloverdetratio + solver = CG +EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + IntegrationSteps0 = 2 + IntegrationSteps1 = 6 + tau = 1.00 + Lambda0 = 0.19 + Lambda1 = 0.20 + NumberOfTimescales = 2 +EndIntegrator + +BeginOperator CLOVER + 2KappaMu = 0.01 + CSW = 1.00 + kappa = 0.138 + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +EndOperator diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc0.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc0.input new file mode 100644 index 0000000000000000000000000000000000000000..88d9ee14a69fc046aa87499c4ae7903dd7c2e6ca --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc0.input @@ -0,0 +1,62 @@ +# this sample corresponds to the first case in README +# the expected plaquette value is 0.62450(5) +# +# The PP correlator is 
(2KappaMu = 0.177, kappa = 0.177) +# t C(t) +# 0 1.638(5) +# 1 0.2020(6) +# 2 0.0424(2) + +L=4 +T=4 +Measurements = 1000 +StartCondition = hot +2KappaMu = 0.177 +kappa = 0.177 +NSave = 500000 +ThetaT = 1 +UseEvenOdd = yes +ReversibilityCheck = yes +ReversibilityCheckIntervall = 100 +InitialStoreCounter = 0 +DebugLevel = 1 + +BeginMeasurement CORRELATORS + Frequency = 2 +EndMeasurement + +BeginMonomial GAUGE + Type = Wilson + beta = 6.00 + Timescale = 0 +EndMonomial + +BeginMonomial DET + Timescale = 1 + 2KappaMu = 0.177 + kappa = 0.177 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + Name = det + Solver = CG +EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + IntegrationSteps0 = 2 + IntegrationSteps1 = 6 + Tau = 1 + Lambda0 = 0.19 + Lambda1 = 0.20 + NumberOfTimescales = 2 +EndIntegrator + +BeginOperator TMWILSON + 2kappaMu = 0.177 + kappa = 0.177 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +EndOperator diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc1.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc1.input new file mode 100644 index 0000000000000000000000000000000000000000..156115fc816d23218e347b491ae65f5a0cfafd4e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc1.input @@ -0,0 +1,76 @@ +# this sample corresponds to the first case in README +# like sample-hmc0.input, but with preconditioning +# the expected plaquette value is 0.62450(5) +# +# The PP correlator is (2KappaMu = 0.177, kappa = 0.177) +# t C(t) +# 0 1.638(5) +# 1 0.2020(6) +# 2 0.0424(2) + +L=4 +T=4 +Measurements = 100000 +#startCondition = continue +StartCondition = hot +2KappaMu = 0.177 +kappa = 0.177 +NSave = 500000 +ThetaT = 1 +UseEvenOdd = yes +ReversibilityCheck = yes +ReversibilityCheckIntervall = 100 +DebugLevel = 1 + +BeginMeasurement CORRELATORS + Frequency = 100 +EndMeasurement + +BeginMonomial GAUGE + Type = Wilson + beta = 6.00 + 
Timescale = 0 +EndMonomial + +BeginMonomial DET + Timescale = 1 + 2KappaMu = 0.5 + kappa = 0.177 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + Name = det + Solver = CG +EndMonomial + +BeginMonomial DETRATIO + Timescale = 2 + 2KappaMu = 0.177 + 2KappaMu2 = 0.5 + kappa = 0.177 + kappa2 = 0.177 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + Name = detrat + Solver = CG +EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + Type2 = 2MN + IntegrationSteps0 = 1 + IntegrationSteps1 = 2 + IntegrationSteps2 = 4 + Tau = 1 + Lambda0 = 0.19 + NumberOfTimescales = 3 +EndIntegrator + +BeginOperator TMWILSON + 2kappaMu = 0.177 + kappa = 0.177 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +EndOperator diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc2.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc2.input new file mode 100644 index 0000000000000000000000000000000000000000..d78d90b66a160da7808333ab87f9c1792a996f75 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc2.input @@ -0,0 +1,84 @@ +# this sample corresponds to a 2+1+1 test case +# roots and the normalisation are in Square_root_BR_roots.dat +# and normierungLocal.dat in this directory +# they were generated using the chebyRoot.H file, which can also +# be found in this directory +# the expected plaquette value is 0.53347(17) +# the expected rect. plaq. 
value is 0.30393(22) +# +# PP correlator +# 1.963(2) +# 0.2846(2) +# 0.1078(2) +# +# smallest EV: 0.01890(3) +# largest EV: 0.82744(4) + +L=4 +T=4 +Measurements = 35000 +StartCondition = hot +2KappaMu = 0.01 +2Kappamubar = 0.1105 +2Kappaepsbar = 0.0935 +kappa = 0.170 +NSave = 500000 +ThetaT = 1 +GaugeConfigInputFile = conf.save +UseEvenOdd = yes +ReversibilityCheck = yes +ReversibilityCheckIntervall = 100 +DebugLevel = 1 + +BeginMeasurement CORRELATORS + Frequency = 100 +EndMeasurement + +BeginMonomial GAUGE + Type = tlsym + beta = 3.30 + Timescale = 0 +EndMonomial + +BeginMonomial DET + Timescale = 1 + 2KappaMu = 0.01 + kappa = 0.170 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + Name = det + Solver = CG +EndMonomial + +BeginMonomial NDPOLY + Timescale = 1 + StildeMin = 0.013577 + StildeMax = 3.096935 + LocNormConst = 3.3394134092406311254 + PrecisionPtilde = 1e-05 + DegreeOfMDPolynomial = 48 + PrecisionHfinal = 1e-10 + 2Kappamubar = 0.1105 + 2Kappaepsbar = 0.0935 + kappa = 0.170 + ComputeEVFreq = 2 +EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + IntegrationSteps0 = 2 + IntegrationSteps1 = 6 + Tau = 1 + Lambda0 = 0.19 + NumberOfTimescales = 2 +EndIntegrator + +BeginOperator TMWILSON + 2kappaMu = 0.01 + kappa = 0.170 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +EndOperator diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc3.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc3.input new file mode 100644 index 0000000000000000000000000000000000000000..b162119517e46fd7c89e0403c8a7bab7852a85f0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc3.input @@ -0,0 +1,100 @@ +# this sample corresponds to a nf=4 run +# the expected plaquette value is 0.59515(8) +# the expected rectangular value is 0.3637(1) +# +# PP correlator +# 1.88(2) +# 0.242(2) +# 0.084(1) + +L=4 +T=4 +Measurements = 4000 +StartCondition = hot 
+2KappaMu = 0.002740961 +kappa = 0.163260 +NSave = 100 +ThetaT = 1 +GaugeConfigInputFile = conf.save +UseEvenOdd = yes +ReversibilityCheck = yes +ReversibilityCheckIntervall = 100 +DebugLevel = 1 +InitialStoreCounter = 0 + +BeginMeasurement CORRELATORS + Frequency = 100 +EndMeasurement + +BeginMonomial GAUGE + Type = Iwasaki + beta = 1.95 + Timescale = 0 +EndMonomial + +BeginMonomial DET + Timescale = 1 + 2KappaMu = 0.01 + kappa = 0.163260 + AcceptancePrecision = 1e-22 + ForcePrecision = 1e-14 + Name = det1 + Solver = CG +EndMonomial + +BeginMonomial DETRATIO + Timescale = 2 + 2KappaMu = 0.002740961 + 2KappaMu2 = 0.01 + kappa = 0.163260 + kappa2 = 0.163260 + AcceptancePrecision = 1e-22 + ForcePrecision = 1e-14 + Name = detratio1 + Solver = CG +EndMonomial + +BeginMonomial DET + Timescale = 1 + 2KappaMu = 0.01 + kappa = 0.163260 + AcceptancePrecision = 1e-22 + ForcePrecision = 1e-14 + Name = det2 + Solver = CG +EndMonomial + +BeginMonomial DETRATIO + Timescale = 2 + 2KappaMu = 0.002740961 + 2KappaMu2 = 0.01 + kappa = 0.163260 + kappa2 = 0.163260 + AcceptancePrecision = 1e-22 + ForcePrecision = 1e-14 + Name = detratio2 + Solver = CG +EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + Type2 = 2MN + IntegrationSteps0 = 1 + IntegrationSteps1 = 4 + IntegrationSteps2 = 2 + Tau = 1 + Lambda0 = 0.19 + Lambda1 = 0.21 + Lambda2 = 0.2 + NumberOfTimescales = 3 +EndIntegrator + +BeginOperator TMWILSON + kappa = 0.163260 + 2KappaMu = 0.002740961 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +EndOperator diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc4.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc4.input new file mode 100644 index 0000000000000000000000000000000000000000..2d4ecff1180287c66b84d55227741b54681704e8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-hmc4.input @@ -0,0 +1,66 @@ +# the expected plaquette value is 0.62597(8) +# 
the expected rectangle value is 0.3935(1) +# in this example the topology should be fixed. +# expect huge autocorrelation times +# PP correlator +# 1.632(8) +# 0.333(2) +# 0.0775(5) +# + +L=4 +T=4 +Measurements = 100000 +StartCondition = cold +ThermalisationSweeps=100 +2KappaMu = 0.083333 +kappa = 0.208333 +NSave = 10000 +ThetaT = 1 +UseEvenOdd = yes +ReversibilityCheck = no +ReversibilityCheckIntervall = 1 +DebugLevel = 1 + +BeginMeasurement CORRELATORS + Frequency = 20 +EndMeasurement + +BeginMonomial GAUGE + Timescale = 0 + Beta = 2.3 + Type = Iwasaki +EndMonomial + +BeginMonomial DETRATIO + Timescale = 1 + 2KappaMu = 0.0 + 2KappaMu2 = 0.083333 + kappa = 0.208333 + kappa2 = 0.208333 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + Name = detrat + Solver = CG + CSGHistory = 0 +EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + IntegrationSteps0 = 2 + IntegrationSteps1 = 4 + Tau = 0.5 + Lambda0 = 0.19 + Lambda1 = 0.21 + NumberOfTimescales = 2 +EndIntegrator + +BeginOperator TMWILSON + 2KappaMu = 0.083333 + kappa = 0.208333 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +EndOperator diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-incr-eigcg-invert.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-incr-eigcg-invert.input new file mode 100644 index 0000000000000000000000000000000000000000..75f2ea9e4e64fccb1bfc6c9a768784ee312297d9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-incr-eigcg-invert.input @@ -0,0 +1,49 @@ +L=16 +T=32 + +NrXProcs = 1 +NrYProcs = 1 +NrZProcs = 1 + +OmpNumThreads = 1 + + +DebugLevel = 4 +InitialStoreCounter = 200 +Measurements = 1 +2kappamu = 0.0024135 +kappa = 0.160900 +BCAngleT = 1. 
+GaugeConfigInputFile = conf +UseEvenOdd = yes +UseRelativePrecision = yes +SourceType = Point +#SourceType = Volume +Indices = 0-9 +ReadSource = no +NoSamples = 1 +DisableIOChecks = yes + + + + +BeginOperator TMWILSON + + 2kappaMu = 0.0024135 + kappa = 0.160900 + UseEvenOdd = yes + Solver = INCREIGCG + SolverPrecision = 1.e-8 #tolerance for systems nrhs1+1,..,nrhs + MaxSolverIterations = 2000 #maximum number of iterations when solving a linear system + EigCGnrhs = 10 #total number of systems + EigCGnrhs1 = 5 #first nrhs1 systems that will be solved to tolerance tolsq1 + EigCGnev = 10 #number of eigenvectors to be computed for every system in the build-up phase + EigCGvmax = 40 #size of the search subspace used by eigcg to compute nev eigenvectors + EigCGldh = 20 #total number of approximate eigenvectors to be computed + EigCGtolsq1 = 1.e-16 #tolerance for the systems 1,..,nrhs1 + EigCGrestolsq = 1e-4 #tolerance for restarting eigcg after the eigenvectors have been computed + EigCGRandGuessOpt = 1 #0 means use a zero initial guess, 1 means use a random initial guess as a volume gaussian spinor + AddDownPropagator = no + +EndOperator + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-invert0_gpu.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-invert0_gpu.input new file mode 100644 index 0000000000000000000000000000000000000000..0e5db0aba47338ba74b06c8a61828baa9edf666a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-invert0_gpu.input @@ -0,0 +1,43 @@ +# example input file for invert +# requires a 4^4 gauge configuration conf.0000 + +L=4 +T=4 + +DebugLevel = 5 +InitialStoreCounter = 0 +Measurements = 1 +2kappamu = 0.05 +kappa = 0.177 +BCAngleT = 1 +GaugeConfigInputFile = conf +UseEvenOdd = yes + +SourceType = Volume +ReadSource = no +NoSamples = 12 + +BeginOperator TMWILSON + 2kappaMu = 0.05 + kappa = 0.177 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations 
= 1000 + AddDownPropagator = yes +EndOperator + +BeginOperator DBTMWILSON + 2KappaMubar = 0.139 + 2KappaEpsbar = 0.15 + kappa = 0.177 + Solver = CG + SolverPrecision = 1e-12 +EndOperator + +BeginGPUInit + MaxInnerSolverIteration = 1000 + InnerSolverPrecision = 1.0e-4 +# DeviceNum=2 +EndGPUInit + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-invert0_gpu_eo_nd.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-invert0_gpu_eo_nd.input new file mode 100644 index 0000000000000000000000000000000000000000..d37f96c3708103290f8124997ed5614bd4d9c9e4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-invert0_gpu_eo_nd.input @@ -0,0 +1,56 @@ +# example input file for invert +# requires a 4^4 gauge configuration conf.0000 + +L=4 +T=4 + +DebugLevel = 5 +InitialStoreCounter = 0 +Measurements = 1 +2kappamu = 0.05 +kappa = 0.177 +BCAngleT = 1 +GaugeConfigInputFile = conf +UseEvenOdd = yes + +SourceType = Volume +ReadSource = no +NoSamples = 12 + +#BeginOperator TMWILSON +# 2kappaMu = 0.05 +# kappa = 0.177 +# UseEvenOdd = yes +# Solver = CG +# SolverPrecision = 1e-14 +# MaxSolverIterations = 1000 +# AddDownPropagator = yes +#EndOperator + +BeginOperator DBTMWILSON + 2KappaMubar = 0.052 + 2KappaEpsbar = 0.012 + kappa = 0.2 + Solver = CG + SolverPrecision = 1e-12 + MaxSolverIterations = 2000 +EndOperator + +# originally: +#BeginOperator DBTMWILSON +# 2KappaMubar = 0.139 +# 2KappaEpsbar = 0.15 +# kappa = 0.177 +# Solver = CG +# SolverPrecision = 1e-12 +#EndOperator + +BeginGPUInit + MaxInnerSolverIteration = 70 +# InnerSolverPrecision = 1e-4 + InnersolverPrecisionCheckAbs = 1 + InnersolverPrecisionCheckRel = 1 + InnersolverPrecisionAbs = 1.0e-13 + InnersolverPrecisionRel = 1.0e-7 +# DeviceNum=2 +EndGPUInit diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-mixedcg.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-mixedcg.input new file mode 100644 index 
0000000000000000000000000000000000000000..87589920d23aee48a384d468fd7342a39f5d4dee --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-mixedcg.input @@ -0,0 +1,46 @@ +#example input file for invert +# for MIXEDCG solver +# requires a 4^4 gauge configuration conf.0000 + +L=4 +T=4 + +DebugLevel = 5 +InitialStoreCounter = 0 +Measurements = 1 +2kappamu = 0.05 +kappa = 0.177 +BCAngleT = 1 +GaugeConfigInputFile = conf +UseEvenOdd = yes + +SourceType = Volume +ReadSource = no +NoSamples = 12 + +# residual reduction factor for inner solver in mixed cg (1.e-6 seems to work quite well in general) +MixCGInnerEps = 1.e-6 +# maximum number of inner solver iterations for MIXEDCG per restart +MixCGMaxIter = 10000 + +BeginOperator TMWILSON + 2kappaMu = 0.05 + kappa = 0.177 + UseEvenOdd = yes + Solver = MIXEDCG + SolverPrecision = 1e-14 + # MIXEDCG internally calculates the number of outer iterations from MaxSolverIterations and MixCGMaxIter, but does + # at least 10 outer iterations + MaxSolverIterations = 30000 + AddDownPropagator = yes +EndOperator + +BeginOperator CLOVER + 2kappaMu = 0.05 + kappa = 0.177 + csw = 1.74 + Solver = MIXEDCG + SolverPrecision = 1e-14 + MaxSolverIterations = 30000 + AddDownPropagator = yes +EndOperator diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-ndclover.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-ndclover.input new file mode 100644 index 0000000000000000000000000000000000000000..04c77925c82213490a835c2e37e56d6b85c7939e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-ndclover.input @@ -0,0 +1,75 @@ +# this sample corresponds to a 2+1+1 test case +# roots and the normalisation are in Square_root_BR_roots.dat +# and normierungLocal.dat in this directory +# they were generated using the chebyRoot.H file, which can also +# be found in this directory +L=4 +T=4 +Measurements = 1 +StartCondition = hot +2KappaMu = 0.01 +2Kappamubar = 
0.1105 +2Kappaepsbar = 0.0935 +kappa = 0.170 +NSave = 500000 +ThetaT = 1 +GaugeConfigInputFile = conf.save +UseEvenOdd = yes +ReversibilityCheck = yes +ReversibilityCheckIntervall = 100 +DebugLevel = 1 + +BeginMeasurement CORRELATORS + Frequency = 1 +EndMeasurement + +BeginMonomial GAUGE + Type = tlsym + beta = 3.30 + Timescale = 0 +EndMonomial + +BeginMonomial DET + Timescale = 1 + 2KappaMu = 0.01 + kappa = 0.170 + AcceptancePrecision = 1e-20 + ForcePrecision = 1e-12 + Name = det + Solver = CG +EndMonomial + +BeginMonomial NDCLOVER + Timescale = 1 + StildeMin = 0.01225 + StildeMax = 3.5 + LocNormConst = 3.3775885577830275786 + PrecisionPtilde = 1e-05 + DegreeOfMDPolynomial = 48 + PrecisionHfinal = 1e-10 + ComputeEVFreq = 1 + 2KappaEpsBar = 0.0935 + 2Kappamubar = 0.1105 + kappa = 0.170 + CSW = 1.0 + RootsFile = "clover_roots.dat" +EndMonomial + +BeginIntegrator + Type0 = 2MN + Type1 = 2MN + IntegrationSteps0 = 2 + IntegrationSteps1 = 12 + Tau = 1 + Lambda0 = 0.19 + NumberOfTimescales = 2 +EndIntegrator + +BeginOperator TMWILSON + 2KappaMu = 0.01 + kappa = 0.170 + UseEvenOdd = yes + Solver = CG + SolverPrecision = 1e-14 + MaxSolverIterations = 1000 +EndOperator diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-sf-quenched0.input b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-sf-quenched0.input new file mode 100644 index 0000000000000000000000000000000000000000..53bde326dd2a5722ac2c590e7a98b012090620f3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sample-input/sample-sf-quenched0.input @@ -0,0 +1,62 @@ +# this sample corresponds to the first case in README +# like sample-hmc0.input, but with preconditioning +# the expected plaquette value is 0.62450(5) +# +# The PP correlator is (2KappaMu = 0.177, kappa = 0.177) +# t C(t) +# 0 1.638(5) +# 1 0.2020(6) +# 2 0.0424(2) + +####################### +# General parameters: # +####################### +# Bc=yes => SFbc / no => PBC +Bc = yes +L=4 +T=4 +Measurements = 5 
+NSave = 500000 +ThetaT = 0 +UseEvenOdd = yes +DebugLevel = 1 +###################### +# hmc_tm parameters: # +###################### +StartCondition = hot +ReversibilityCheck = no +ReversibilityCheckIntervall = 100 +PerformOnlineMeasurements = no +OnlineMeasurementsFreq = 10 +# g_Tbsf = at which time slice to put the SF boundary +g_Tbsf = 3 +######################################### +# invert and invert_doublet parameters: # +######################################### + +############## +# MONOMIALS: # +############## + +BeginMonomial SFGAUGE + Type = sf_user + Timescale = 0 + UseRectangleStaples = no + RectangleCoefficient = 0 + RectangleCoefficientSS = 0 + RectangleCoefficientTSS = 0 + RectangleCoefficientTTS = 0 + PlaquetteCoefficientT = 1.0 + PlaquetteCoefficientS = 0.5 + eta = 0.0 + beta = 6.00 +EndMonomial + + +BeginIntegrator + Type0 = LEAPFROG + Tau = 1 + NumberOfTimescales = 1 + IntegrationSteps0 = 50 + Lambda0 = 0.19 +EndIntegrator diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_calc_action.c b/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_calc_action.c new file mode 100644 index 0000000000000000000000000000000000000000..7767b7901177efdee0a4a6c1921cbf3dea76de19 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_calc_action.c @@ -0,0 +1,2038 @@ +/******************************************* +* +* FILE: sf_calc_action.c +* +* Author: Jenifer Gonzalez Lopez +* +********************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "sse.h" +#include "su3.h" +#include "su3adj.h" +#include "global.h" +#include "geometry_eo.h" +#include "sf_calc_action.h" + +int Index(const int x0, const int x1, const int x2, const int x3); + +/**************************************************************************************************/ + +/* the next function imposes Dirichlet b.c. +by just setting all the gauge links in the time direction (from t on) to zero. 
+Note that the rest of the links at the boundaries (spatial links) are not yet touched here */ +void dirichlet_boundary_conditions(int t) { + + int ix; + + for (ix=0;ix 2 links on a (time) boundary */ + + ac *= c1_tss; + + } + + else if (g_t[ix] == 0 && mu2 == 0) {/* 2 movement in t <=> 1 links on a (time) boundary */ + + ac *= c1_tts; + + } + + else if (g_t[ix] == (t-1) && mu1 == 0) {/* 1 movement in t <=> 2 links on a (time) boundary */ + + ac *= c1_tss; + + } + else if (g_t[ix] == (t-1) && mu2 == 0) {/* 2 movement in t <=> 1 links on a (time) boundary */ + + ac = 0.; + + } + + else if (g_t[ix] == (t-2) && mu2 == 0) {/* 2 movement in t <=> 1 links on a (time) boundary */ + + ac *= c1_tts; + + } + else if (g_t[ix] == t && (mu1 == 0 || mu2 == 0)) {/* out of the lattice on the right side */ + + ac = 0.; + + } + + else { + + ac *= c1; + + } + + sum += ac; + } + } + } + } + + ga = sum*2.0; + + return ga; + +} + + + +/***** ACTIONS *****/ + +/*----------------------------------------------------------*/ + +/*** for PBC ***/ + +/* standard Wilson */ +double measure_wilson_action(double beta) { + + double plaquette; + double wilson; + + plaquette = measure_plaquette(); + + wilson = - (beta/(2.*3.)) * plaquette; + /* wilson = beta * (6.*VOLUME*g_nproc - plaquette); */ + + return wilson; +} + +/* Iwasaki */ +double measure_iwasaki_action(double beta, double c0, double c1) { + + double plaquette; + double rectangle; + double iwasaki; + + plaquette = measure_plaquette(); + + rectangle = measure_rectangle(); + + iwasaki = - (beta/(2.*3.)) * ( c0*plaquette + c1*rectangle ); + + return iwasaki; +} + +/*----------------------------------------------------------*/ + + +/*** SF boundary conditions ***/ + +/* standard Wilson action for SF b.c. 
without improvement coefficients "hard-coded" */ +double measure_wilson_action_sf(int t, double beta) { + + double plaquette; + double wilson; + + plaquette = measure_plaquette_sf_weights(t); + + wilson = - (beta/(2.*3.)) * plaquette; + + return wilson; +} + +/* standard Wilson action for SF b.c. WITH improvement coefficients "hard-coded" */ +double measure_wilson_action_sf_weights_improvement(int t, double beta, double cs, double ct) { + + double plaquette; + double wilson; + + plaquette = measure_plaquette_sf_weights_improvement(t, cs, ct); + + wilson = - (beta/(2.*3.)) * plaquette; + + return wilson; +} + + +/* it is the same as "measure_wilson_action_sf": +standard Wilson action for SF b.c. without improvement coefficients +but "not hard-coded" */ +double measure_wilson_action_sf_separate_boundary(int t, double beta) { + + double plaquette; + double wilson; + + plaquette = measure_plaquette_sf_weights_bulk(t) + + measure_plaquette_sf_weights_boundary_0() + measure_plaquette_sf_weights_boundary_t(t); + + wilson = - (beta/(2.*3.)) * plaquette; + + return wilson; +} + +/* it is the same as "measure_wilson_action_sf_weights_improvement": +standard Wilson action for SF b.c. WITH (alpha collab.) improvement coefficients +but "not hard-coded" */ +double measure_wilson_action_sf_weights_improvement_separate_boundary(int t, double beta, double cs, double ct) { + + double plaquette; + double wilson; + + plaquette = measure_plaquette_sf_weights_improved_bulk(t) + + measure_plaquette_sf_weights_improved_boundary_0(cs, ct) + + measure_plaquette_sf_weights_improved_boundary_t(t, cs) + + measure_plaquette_sf_weights_improved_boundary_t_minus_1(t, ct); + + wilson = - (beta/(2.*3.)) * plaquette; + + return wilson; +} + +/* Iwasaki action with SF b.c. 
"hard-coded" */ +double measure_iwasaki_action_sf(int t, double beta, double cs, double ct, double c0, + double c1, double c1_ss, double c1_tss, double c1_tts) { + + double plaquette; + double rectangle; + double iwasaki; + + plaquette = measure_plaquette_sf_iwasaki(t, cs, ct, c0); + + rectangle = measure_rectangle_sf_iwasaki(t, c1, c1_ss, c1_tss, c1_tts); + + + iwasaki = - (beta/(2.*3.)) * ( plaquette + rectangle ); + + return iwasaki; +} + +/****************************************************************************************/ +/****************************************************************************************/ +/****************************************************************************************/ + + +/*** FUNCTIONS NEEDED FOR THE BACKGROUND FIELD ACTION and DERIVATIVE WITH RESPECT TO ETA ***/ + + +/* it calculates an su3 matrix "u" which is gonna be the partial with respect to eta of the + (lattice) spatially constant abelian field "C_k"*/ +#define _su3_partial_eta_spatially_constant_abelian_field_phi(u) \ + (u).c00 = cexp(1./LX); \ + (u).c01 = 0.0; \ + (u).c02 = 0.0; \ + (u).c10 = 0.0; \ + (u).c11 = cexp((-1./2.)/LX); \ + (u).c12 = 0.0; \ + (u).c20 = 0.0; \ + (u).c21 = 0.0; \ + (u).c22 = cexp((-1./2.)/LX); \ + + +/* it calculates an su3 matrix "u" which is gonna be the partial with respect to eta of the + (lattice) spatially constant abelian field "(C_k)^prime"*/ +#define _su3_partial_eta_spatially_constant_abelian_field_phi_prime(u) \ + (u).c00 = cexp(-1./LX); \ + (u).c01 = 0.0; \ + (u).c02 = 0.0; \ + (u).c10 = 0.0; \ + (u).c11 = cexp((1./2.)/LX); \ + (u).c12 = 0.0; \ + (u).c20 = 0.0; \ + (u).c21 = 0.0; \ + (u).c22 = cexp((1./2.)/LX); \ + + +/*--------------------------------------------------------------------------------------------------*/ + +/** PLAQUETTE (only) **/ + +/* this function defines the (continuum) constant abelian induced background field "B_{mu}(x)" */ +/* that is, the minimal action configuration when SF b.c. 
are considered + (note that in an infinite extent, p.b.c., the minimal action configuration is A_{mu}(x)=0) */ +void induced_continuum_background(su3 **b, int t, double eta) { + + int ix; + double pi; + double phi1_0, phi2_0, phi3_0; + double phi1_T, phi2_T, phi3_T; + double p1, p2, p3; + + pi = acos(-1.); + + phi1_0 = eta - pi/3.0; + phi2_0 = - 0.5 * eta; + phi3_0 = - 0.5 * eta + pi/3.0; + + phi1_T = - phi1_0 - (4.0*pi)/3.0; + phi2_T = - phi3_0 + (2.0*pi)/3.0; + phi3_T = - phi2_0 + (2.0*pi)/3.0; + + phi1_0 /= (double)LX; + phi2_0 /= (double)LX; + phi3_0 /= (double)LX; + + phi1_T /= (double)LX; + phi2_T /= (double)LX; + phi3_T /= (double)LX; + + phi1_0 /= (double)t; + phi2_0 /= (double)t; + phi3_0 /= (double)t; + + phi1_T /= (double)t; + phi2_T /= (double)t; + phi3_T /= (double)t; + + + for (ix=0;ix. + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#undef SSE +#undef SSE2 +#undef SSE3 +#include "global.h" +#include "su3.h" +#include "sf_get_rectangle_staples.h" + +/* this function is valid ONLY IF Nt >= 6 */ +void sf_get_rectangle_staples(su3 * const v, const int x, const int mu) { + static su3 tmp1, tmp2; + int y, z, nu; + su3 * a, * b, * c, * d, * e; +#ifdef _KOJAK_INST +#pragma pomp inst begin(rectstaples) +#endif +#ifdef XLC +#pragma disjoint(*v, tmp1, tmp2, *a, *b, *c, *d, *e) +#endif + _su3_zero((*v)); + for(nu = 0; nu < 4; nu++) { + if(mu != nu) { + if (g_t[x] > 2 && g_t[x] < (g_Tbsf-2)) { + /* first contr. 
starting from x + * a b c e^+ d^+ + * c + * _ + * b| |e + * a| |d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][nu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[idn[x][nu]][nu] + * e^+ d^+ a b c + * + *e| |c + *d|_|b + * a + */ + y = g_idn[x][nu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_iup[z][mu]; + b = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* second contr. starting from x + * a b c e^+ d^+ + * + * bc + * __ + * a| _|e + * d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][mu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][mu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[x][nu] + * d^+ a b c e^+ + * + * e + * _ + * d|__|c + * ab + */ + y = g_idn[x][nu]; + d = &g_gauge_field[y][nu]; + a = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp1, *d, *a); + z = g_iup[y][mu]; + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[z][mu]; + c = &g_gauge_field[y][nu]; + z = g_iup[x][mu]; + e = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[idn[x][mu]][nu] + * e^+ d^+ a b c + * + * e + * _ + *d|__|c + * ab + */ + y = g_idn[x][mu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_idn[x][nu]; + b = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][mu] + * d^+ a b c e^+ + * + * bc + * __ + *a|_ |e + * d + */ + y = g_idn[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][nu]; + a = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp1, *d, *a); + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[x][mu]; + e = &g_gauge_field[y][nu]; + z = g_iup[x][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + } + else if (g_t[x] == 2) { + if (mu == 0 || (mu != 0 && nu != 0)){ + + /* first contr. 
starting from x + * a b c e^+ d^+ + * c + * _ + * b| |e + * a| |d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][nu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[idn[x][nu]][nu] + * e^+ d^+ a b c + * + *e| |c + *d|_|b + * a + */ + y = g_idn[x][nu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_iup[z][mu]; + b = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + #if 1 /* PROBLEMS HERE for T=4 */ + + /* second contr. starting from x + * a b c e^+ d^+ + * + * bc + * __ + * a| _|e + * d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][mu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][mu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + #endif + + #if 1 /*PROBLEMS HERE for T=4 */ + + /* 1 contr. 
starting idn[x][nu] + * d^+ a b c e^+ + * + * e + * _ + * d|__|c + * ab + */ + y = g_idn[x][nu]; + d = &g_gauge_field[y][nu]; + a = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp1, *d, *a); + z = g_iup[y][mu]; + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[z][mu]; + c = &g_gauge_field[y][nu]; + z = g_iup[x][mu]; + e = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + #endif + + + /* 1 contr. starting idn[idn[x][mu]][nu] + * e^+ d^+ a b c + * + * e + * _ + *d|__|c + * ab + */ + y = g_idn[x][mu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_idn[x][nu]; + b = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][mu] + * d^+ a b c e^+ + * + * bc + * __ + *a|_ |e + * d + */ + y = g_idn[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][nu]; + a = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp1, *d, *a); + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[x][mu]; + e = &g_gauge_field[y][nu]; + z = g_iup[x][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + } + else if (mu != 0 && nu == 0){ + + #if 1 /* PROBLEMS HERE for T=4 */ + + /* first contr. 
starting from x + * a b c e^+ d^+ + * c + * _ + * b| |e + * a| |d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][nu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + #endif + + /* 1 contr. starting idn[idn[x][nu]][nu] + * e^+ d^+ a b c + * + *e| |c + *d|_|b + * a + */ + y = g_idn[x][nu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_iup[z][mu]; + b = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1tts * tmp1 */ + _real_times_su3(tmp1,g_C1tts,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + + /* second contr. starting from x + * a b c e^+ d^+ + * + * bc + * __ + * a| _|e + * d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][mu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][mu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[x][nu] + * d^+ a b c e^+ + * + * e + * _ + * d|__|c + * ab + */ + y = g_idn[x][nu]; + d = &g_gauge_field[y][nu]; + a = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp1, *d, *a); + z = g_iup[y][mu]; + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[z][mu]; + c = &g_gauge_field[y][nu]; + z = g_iup[x][mu]; + e = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[idn[x][mu]][nu] + * e^+ d^+ a b c + * + * e + * _ + *d|__|c + * ab + */ + y = g_idn[x][mu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_idn[x][nu]; + b = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][mu] + * d^+ a b c e^+ + * + * bc + * __ + *a|_ |e + * d + */ + y = g_idn[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][nu]; + a = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp1, *d, *a); + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[x][mu]; + e = &g_gauge_field[y][nu]; + z = g_iup[x][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + } + } + else if (g_t[x] == (g_Tbsf-2)) { + if (mu == 0){ + /* first contr. 
starting from x + * a b c e^+ d^+ + * c + * _ + * b| |e + * a| |d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][nu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[idn[x][nu]][nu] + * e^+ d^+ a b c + * + *e| |c + *d|_|b + * a + */ + y = g_idn[x][nu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_iup[z][mu]; + b = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* second contr. starting from x + * a b c e^+ d^+ + * + * bc + * __ + * a| _|e + * d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][mu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][mu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1tts * tmp1 */ + _real_times_su3(tmp1,g_C1tts,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[x][nu] + * d^+ a b c e^+ + * + * e + * _ + * d|__|c + * ab + */ + y = g_idn[x][nu]; + d = &g_gauge_field[y][nu]; + a = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp1, *d, *a); + z = g_iup[y][mu]; + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[z][mu]; + c = &g_gauge_field[y][nu]; + z = g_iup[x][mu]; + e = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1tts * tmp1 */ + _real_times_su3(tmp1,g_C1tts,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[idn[x][mu]][nu] + * e^+ d^+ a b c + * + * e + * _ + *d|__|c + * ab + */ + y = g_idn[x][mu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_idn[x][nu]; + b = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][mu] + * d^+ a b c e^+ + * + * bc + * __ + *a|_ |e + * d + */ + y = g_idn[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][nu]; + a = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp1, *d, *a); + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[x][mu]; + e = &g_gauge_field[y][nu]; + z = g_iup[x][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + } + + else if (mu != 0 && nu == 0){ + /* first contr. 
starting from x + * a b c e^+ d^+ + * c + * _ + * b| |e + * a| |d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][nu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1tts * tmp1 */ + _real_times_su3(tmp1,g_C1tts,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + #if 1 /* PROBLEMS HERE for T=4 */ + + /* 1 contr. starting idn[idn[x][nu]][nu] + * e^+ d^+ a b c + * + *e| |c + *d|_|b + * a + */ + y = g_idn[x][nu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_iup[z][mu]; + b = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + #endif + + + /* second contr. starting from x + * a b c e^+ d^+ + * + * bc + * __ + * a| _|e + * d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][mu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][mu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[x][nu] + * d^+ a b c e^+ + * + * e + * _ + * d|__|c + * ab + */ + y = g_idn[x][nu]; + d = &g_gauge_field[y][nu]; + a = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp1, *d, *a); + z = g_iup[y][mu]; + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[z][mu]; + c = &g_gauge_field[y][nu]; + z = g_iup[x][mu]; + e = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[idn[x][mu]][nu] + * e^+ d^+ a b c + * + * e + * _ + *d|__|c + * ab + */ + y = g_idn[x][mu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_idn[x][nu]; + b = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][mu] + * d^+ a b c e^+ + * + * bc + * __ + *a|_ |e + * d + */ + y = g_idn[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][nu]; + a = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp1, *d, *a); + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[x][mu]; + e = &g_gauge_field[y][nu]; + z = g_iup[x][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + } + else if (mu != 0 && nu != 0){ + /* first contr. 
starting from x + * a b c e^+ d^+ + * c + * _ + * b| |e + * a| |d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][nu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[idn[x][nu]][nu] + * e^+ d^+ a b c + * + *e| |c + *d|_|b + * a + */ + y = g_idn[x][nu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_iup[z][mu]; + b = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* second contr. starting from x + * a b c e^+ d^+ + * + * bc + * __ + * a| _|e + * d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][mu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][mu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[x][nu] + * d^+ a b c e^+ + * + * e + * _ + * d|__|c + * ab + */ + y = g_idn[x][nu]; + d = &g_gauge_field[y][nu]; + a = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp1, *d, *a); + z = g_iup[y][mu]; + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[z][mu]; + c = &g_gauge_field[y][nu]; + z = g_iup[x][mu]; + e = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[idn[x][mu]][nu] + * e^+ d^+ a b c + * + * e + * _ + *d|__|c + * ab + */ + y = g_idn[x][mu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_idn[x][nu]; + b = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][mu] + * d^+ a b c e^+ + * + * bc + * __ + *a|_ |e + * d + */ + y = g_idn[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][nu]; + a = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp1, *d, *a); + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[x][mu]; + e = &g_gauge_field[y][nu]; + z = g_iup[x][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + } + } + else if (g_t[x] == 1) { + if (mu == 0) { + /* first contr. 
starting from x + * a b c e^+ d^+ + * c + * _ + * b| |e + * a| |d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][nu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[idn[x][nu]][nu] + * e^+ d^+ a b c + * + *e| |c + *d|_|b + * a + */ + y = g_idn[x][nu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_iup[z][mu]; + b = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* second contr. starting from x + * a b c e^+ d^+ + * + * bc + * __ + * a| _|e + * d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][mu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][mu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[x][nu] + * d^+ a b c e^+ + * + * e + * _ + * d|__|c + * ab + */ + y = g_idn[x][nu]; + d = &g_gauge_field[y][nu]; + a = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp1, *d, *a); + z = g_iup[y][mu]; + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[z][mu]; + c = &g_gauge_field[y][nu]; + z = g_iup[x][mu]; + e = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[idn[x][mu]][nu] + * e^+ d^+ a b c + * + * e + * _ + *d|__|c + * ab + */ + y = g_idn[x][mu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_idn[x][nu]; + b = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1tts * tmp1 */ + _real_times_su3(tmp1,g_C1tts,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][mu] + * d^+ a b c e^+ + * + * bc + * __ + *a|_ |e + * d + */ + y = g_idn[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][nu]; + a = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp1, *d, *a); + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[x][mu]; + e = &g_gauge_field[y][nu]; + z = g_iup[x][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1tts * tmp1 */ + _real_times_su3(tmp1,g_C1tts,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + } + else if (mu != 0 && nu == 0) { + + /* first contr. 
starting from x + * a b c e^+ d^+ + * c + * _ + * b| |e + * a| |d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][nu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* second contr. starting from x + * a b c e^+ d^+ + * + * bc + * __ + * a| _|e + * d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][mu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][mu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][nu] + * d^+ a b c e^+ + * + * e + * _ + * d|__|c + * ab + */ + y = g_idn[x][nu]; + d = &g_gauge_field[y][nu]; + a = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp1, *d, *a); + z = g_iup[y][mu]; + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[z][mu]; + c = &g_gauge_field[y][nu]; + z = g_iup[x][mu]; + e = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1tss * tmp1 */ + _real_times_su3(tmp1,g_C1tss,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[idn[x][mu]][nu] + * e^+ d^+ a b c + * + * e + * _ + *d|__|c + * ab + */ + y = g_idn[x][mu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_idn[x][nu]; + b = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1tss * tmp1 */ + _real_times_su3(tmp1,g_C1tss,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][mu] + * d^+ a b c e^+ + * + * bc + * __ + *a|_ |e + * d + */ + y = g_idn[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][nu]; + a = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp1, *d, *a); + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[x][mu]; + e = &g_gauge_field[y][nu]; + z = g_iup[x][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + } + else if (mu != 0 && nu != 0) { + /* first contr. starting from x + * a b c e^+ d^+ + * c + * _ + * b| |e + * a| |d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][nu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[idn[x][nu]][nu] + * e^+ d^+ a b c + * + *e| |c + *d|_|b + * a + */ + y = g_idn[x][nu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_iup[z][mu]; + b = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* second contr. starting from x + * a b c e^+ d^+ + * + * bc + * __ + * a| _|e + * d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][mu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][mu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][nu] + * d^+ a b c e^+ + * + * e + * _ + * d|__|c + * ab + */ + y = g_idn[x][nu]; + d = &g_gauge_field[y][nu]; + a = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp1, *d, *a); + z = g_iup[y][mu]; + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[z][mu]; + c = &g_gauge_field[y][nu]; + z = g_iup[x][mu]; + e = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[idn[x][mu]][nu] + * e^+ d^+ a b c + * + * e + * _ + *d|__|c + * ab + */ + y = g_idn[x][mu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_idn[x][nu]; + b = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][mu] + * d^+ a b c e^+ + * + * bc + * __ + *a|_ |e + * d + */ + y = g_idn[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][nu]; + a = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp1, *d, *a); + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[x][mu]; + e = &g_gauge_field[y][nu]; + z = g_iup[x][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + } + } + else if (g_t[x] == (g_Tbsf-1)) { + if (mu == 0) { + /* first contr. starting from x + * a b c e^+ d^+ + * c + * _ + * b| |e + * a| |d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][nu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1tss * tmp1 */ + _real_times_su3(tmp1,g_C1tss,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[idn[x][nu]][nu] + * e^+ d^+ a b c + * + *e| |c + *d|_|b + * a + */ + y = g_idn[x][nu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_iup[z][mu]; + b = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1tss * tmp1 */ + _real_times_su3(tmp1,g_C1tss,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[idn[x][mu]][nu] + * e^+ d^+ a b c + * + * e + * _ + *d|__|c + * ab + */ + y = g_idn[x][mu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_idn[x][nu]; + b = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1tts * tmp1 */ + _real_times_su3(tmp1,g_C1tts,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][mu] + * d^+ a b c e^+ + * + * bc + * __ + *a|_ |e + * d + */ + y = g_idn[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][nu]; + a = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp1, *d, *a); + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[x][mu]; + e = &g_gauge_field[y][nu]; + z = g_iup[x][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1tts * tmp1 */ + _real_times_su3(tmp1,g_C1tts,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + } + else if (mu != 0 && nu == 0) { + + /* 1 contr. 
starting idn[idn[x][nu]][nu] + * e^+ d^+ a b c + * + *e| |c + *d|_|b + * a + */ + y = g_idn[x][nu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_iup[z][mu]; + b = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* second contr. starting from x + * a b c e^+ d^+ + * + * bc + * __ + * a| _|e + * d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][mu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][mu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1tss * tmp1 */ + _real_times_su3(tmp1,g_C1tss,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][nu] + * d^+ a b c e^+ + * + * e + * _ + * d|__|c + * ab + */ + y = g_idn[x][nu]; + d = &g_gauge_field[y][nu]; + a = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp1, *d, *a); + z = g_iup[y][mu]; + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[z][mu]; + c = &g_gauge_field[y][nu]; + z = g_iup[x][mu]; + e = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[idn[x][mu]][nu] + * e^+ d^+ a b c + * + * e + * _ + *d|__|c + * ab + */ + y = g_idn[x][mu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_idn[x][nu]; + b = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][mu] + * d^+ a b c e^+ + * + * bc + * __ + *a|_ |e + * d + */ + y = g_idn[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][nu]; + a = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp1, *d, *a); + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[x][mu]; + e = &g_gauge_field[y][nu]; + z = g_iup[x][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1tss * tmp1 */ + _real_times_su3(tmp1,g_C1tss,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + } + else if (mu != 0 && nu != 0) { + /* first contr. starting from x + * a b c e^+ d^+ + * c + * _ + * b| |e + * a| |d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][nu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[idn[x][nu]][nu] + * e^+ d^+ a b c + * + *e| |c + *d|_|b + * a + */ + y = g_idn[x][nu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_iup[z][mu]; + b = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* second contr. starting from x + * a b c e^+ d^+ + * + * bc + * __ + * a| _|e + * d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][mu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][mu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][nu] + * d^+ a b c e^+ + * + * e + * _ + * d|__|c + * ab + */ + y = g_idn[x][nu]; + d = &g_gauge_field[y][nu]; + a = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp1, *d, *a); + z = g_iup[y][mu]; + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[z][mu]; + c = &g_gauge_field[y][nu]; + z = g_iup[x][mu]; + e = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[idn[x][mu]][nu] + * e^+ d^+ a b c + * + * e + * _ + *d|__|c + * ab + */ + y = g_idn[x][mu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_idn[x][nu]; + b = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* 1 contr. starting idn[x][mu] + * d^+ a b c e^+ + * + * bc + * __ + *a|_ |e + * d + */ + y = g_idn[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][nu]; + a = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp1, *d, *a); + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[x][mu]; + e = &g_gauge_field[y][nu]; + z = g_iup[x][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1 * tmp1 */ + _real_times_su3(tmp1,g_rgi_C1,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + } + } + else if (g_t[x] == 0) { + /* first contr. starting from x + * a b c e^+ d^+ + * c + * _ + * b| |e + * a| |d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][nu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][nu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1tss * tmp1 */ + _real_times_su3(tmp1,g_C1tss,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[idn[x][nu]][nu] + * e^+ d^+ a b c + * + *e| |c + *d|_|b + * a + */ + y = g_idn[x][nu]; + z = g_idn[y][nu]; + d = &g_gauge_field[z][nu]; + a = &g_gauge_field[z][mu]; + _su3d_times_su3(tmp1, *d, *a); + e = &g_gauge_field[y][nu]; + _su3d_times_su3(tmp2, *e, tmp1); + + y = g_iup[z][mu]; + b = &g_gauge_field[y][nu]; + z = g_iup[y][nu]; + c = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *b, *c); + /* tmp1 = c1tss * tmp1 */ + _real_times_su3(tmp1,g_C1tss,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + /* second contr. starting from x + * a b c e^+ d^+ + * + * bc + * __ + * a| _|e + * d + */ + a = &g_gauge_field[x][nu]; + y = g_iup[x][nu]; + b = &g_gauge_field[y][mu]; + _su3_times_su3(tmp1, *a, *b); + z = g_iup[y][mu]; + c = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *c); + + y = g_iup[x][mu]; + d = &g_gauge_field[y][mu]; + z = g_iup[y][mu]; + e = &g_gauge_field[z][nu]; + _su3_times_su3(tmp1, *d, *e); + /* tmp1 = c1tts * tmp1 */ + _real_times_su3(tmp1,g_C1tts,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3d_acc((*v), tmp2, tmp1); + + /* 1 contr. 
starting idn[x][nu] + * d^+ a b c e^+ + * + * e + * _ + * d|__|c + * ab + */ + y = g_idn[x][nu]; + d = &g_gauge_field[y][nu]; + a = &g_gauge_field[y][mu]; + _su3d_times_su3(tmp1, *d, *a); + z = g_iup[y][mu]; + b = &g_gauge_field[z][mu]; + _su3_times_su3(tmp2, tmp1, *b); + + y = g_iup[z][mu]; + c = &g_gauge_field[y][nu]; + z = g_iup[x][mu]; + e = &g_gauge_field[z][mu]; + _su3_times_su3d(tmp1, *c, *e); + /* tmp1 = c1tts * tmp1 */ + _real_times_su3(tmp1,g_C1tts,tmp1); /* that is the new thing specific of SF */ + _su3_times_su3_acc((*v), tmp2, tmp1); + + } + } + } +#ifdef _KOJAK_INST +#pragma pomp inst end(rectstaples) +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_get_rectangle_staples.h b/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_get_rectangle_staples.h new file mode 100644 index 0000000000000000000000000000000000000000..f8c42e3796bb40c396b65b781d3c0493cca6cc52 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_get_rectangle_staples.h @@ -0,0 +1,24 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _SF_GET_RECTANGLE_STAPLES_H +#define _SF_GET_RECTANGLE_STAPLES_H + +void sf_get_rectangle_staples(su3 * const v, const int x, const int mu); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_get_staples.c b/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_get_staples.c new file mode 100644 index 0000000000000000000000000000000000000000..fc27981cf3b94737602b66e050d6d8e8ceeb023e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_get_staples.c @@ -0,0 +1,102 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * Jenifer Gonzalez Lopez + * (SF piece of the code) + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+#include
+#include
+#include
+#include
+#include "global.h"
+#include "su3.h"
+#include "su3adj.h"
+#include "start.h"
+#include "sf_get_staples.h"
+
+/*
+ * Sum of the plaquette staples attached to the link (x, mu) of
+ * in_gauge_field, each staple multiplied by a Schroedinger-functional (SF)
+ * weight.
+ *
+ * For every direction k != mu (k = 0..3) two staples are accumulated:
+ *   forward  : U_k(x)    * c1 * [ U_mu(x+k) * U_k(x+mu)^+ ]
+ *   backward : U_k(x-k)^+ * c2 * [ U_mu(x-k) * U_k(x-k+mu) ]
+ * with x+k = g_iup[x][k] and x-k = g_idn[x][k].  (This reads the su3.h
+ * macros in the usual way: a "d" marks the hermitian-conjugated factor and
+ * "_acc" adds into the first argument -- confirm against su3.h.)
+ *
+ * c1 is selected by flag1 and c2 by flag2: flag == 0 picks g_rgi_C0,
+ * any other value picks g_Ct.  The flags are derived from g_t[x], g_Tbsf
+ * and from whether mu or k equals 0 (presumably g_t[x] is the time slice
+ * of x and direction 0 is time -- confirm against the geometry setup).
+ *
+ * Parameters:
+ *   x              - site index (indexes g_t, g_iup, g_idn and the field)
+ *   mu             - direction of the link whose staples are summed
+ *   in_gauge_field - gauge links addressed as in_gauge_field[site][direction]
+ *
+ * Returns the accumulated staple sum by value.
+ *
+ * If none of the cases below matches (e.g. g_t[x] == 0 with mu != 0, or
+ * g_t[x] > g_Tbsf - 1) the program is terminated with a non-zero exit
+ * status.
+ *
+ * NOTE: v, st and cst have static storage, so concurrent calls would share
+ * the same scratch matrices.
+ */
+su3 sf_get_staples(int x, int mu, su3 ** in_gauge_field) {
+
+  int k, iy, flag1, flag2;
+  static su3 v, st, cst;
+  su3 *w1, *w2, *w3;
+#ifdef _KOJAK_INST
+#pragma pomp inst begin(staples)
+#endif
+  /* -1 means "no SF case has matched"; checked before the flags are used */
+  flag1 = -1;
+  flag2 = -1;
+  _su3_zero(v);
+  for (k = 0; k < 4; k++) {
+    if (k != mu) {
+      if (g_t[x] > 1 && g_t[x] < (g_Tbsf - 1)) {
+        flag1 = 0;
+        flag2 = 0;
+      } else if (g_t[x] == 0 && mu == 0) {
+        flag1 = 1;
+        flag2 = 1;
+      } else if (g_t[x] == 1 && mu == 0) {
+        flag1 = 0;
+        flag2 = 0;
+      } else if (g_t[x] == 1 && mu != 0) {
+        if (k != 0) {
+          flag1 = 0;
+          flag2 = 0;
+        } else if (k == 0) {
+          /* only the backward staple (through g_idn[x][0]) gets g_Ct */
+          flag1 = 0;
+          flag2 = 1;
+        }
+      } else if (g_t[x] == (g_Tbsf - 1) && mu == 0) {
+        flag1 = 1;
+        flag2 = 1;
+      } else if (g_t[x] == (g_Tbsf - 1) && mu != 0) {
+        if (k != 0) {
+          flag1 = 0;
+          flag2 = 0;
+        } else if (k == 0) {
+          /* only the forward staple (through g_iup[x][0]) gets g_Ct */
+          flag1 = 1;
+          flag2 = 0;
+        }
+      }
+      if (flag1 < 0 || flag2 < 0) {
+        /* No weight is defined for this (g_t[x], mu) combination.  This is
+         * an error, so do not exit with status 0 (which signals success). */
+        exit(1);
+      }
+      /* forward staple, weighted according to flag1 */
+      w1 = &in_gauge_field[x][k];
+      w2 = &in_gauge_field[g_iup[x][k]][mu];
+      w3 = &in_gauge_field[g_iup[x][mu]][k];
+      _su3_times_su3d(st, *w2, *w3);
+      _real_times_su3(cst, (flag1 == 0 ? g_rgi_C0 : g_Ct) ,st); /* specific to SF */
+      _su3_times_su3_acc(v, *w1, cst);
+      /* backward staple, weighted according to flag2 */
+      iy = g_idn[x][k];
+      w1 = &in_gauge_field[iy][k];
+      w2 = &in_gauge_field[iy][mu];
+      w3 = &in_gauge_field[g_iup[iy][mu]][k];
+      _su3_times_su3(st, *w2, *w3);
+      _real_times_su3(cst, (flag2 == 0 ? g_rgi_C0 : g_Ct) ,st); /* specific to SF */
+      _su3d_times_su3_acc(v, *w1, cst);
+    }
+  }
+
+  /* Close the instrumented region before returning; placed after the
+   * return statement the end marker would never be reached. */
+#ifdef _KOJAK_INST
+#pragma pomp inst end(staples)
+#endif
+  return v;
+}
+
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_get_staples.h b/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_get_staples.h
new file mode 100644
index 0000000000000000000000000000000000000000..20a3d2988c3d87aa7dab6171f96622cf889e736e
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_get_staples.h
@@ -0,0 +1,27 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see .
+ ***********************************************************************/ + +#ifndef _SF_GET_STAPLES_H +#define _SF_GET_STAPLES_H + +#include"su3.h" + +su3 sf_get_staples(int x, int mu, su3 ** in_gauge_field); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_observables.c b/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_observables.c new file mode 100644 index 0000000000000000000000000000000000000000..0bb3d17dd102963b15f6cba509125695bbc58175 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_observables.c @@ -0,0 +1,251 @@ +/******************************************* +* +* FILE: sf_observables.c +* +* Author: Jenifer Gonzalez Lopez +* +********************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "sse.h" +#include "su3.h" +#include "su3adj.h" +#include "global.h" +#include "geometry_eo.h" +#include "sf_calc_action.h" +#include "sf_observables.h" + +/* + * Print SF observables: plaquette and action values for three weight-factor + * variants, then the quantities that enter the coupling definition, + * partial(S)/partial(eta) and its normalisation. The Iwasaki expressions + * are used when g_rgi_C1 != 0, the plaquette (Wilson) ones otherwise. + * All values are computed on every process; only the process with + * g_proc_id == 0 writes them to stdout. + */ +void sf_observables() { + + double plaquette_energy; + double rectangle_energy; + double wilson_action; + double wilson_action_sepbound; + double iwasaki_action; + double factor; + double gamma_lo; /* Gamma[V] */ + double gamma_deriv; /* Gamma'[V] */ + double k_norm; /* K (Iwasaki) or K_plaquette */ + double action_deriv; /* S'[V,U] = partial(S)/partial(eta) */ + + /* sf b.c. abelian field and standard sf weight factors included (only plaquette here) */ + plaquette_energy = measure_plaquette_sf_weights(g_Tbsf); + wilson_action = measure_wilson_action_sf(g_Tbsf, g_beta); + wilson_action_sepbound = measure_wilson_action_sf_separate_boundary(g_Tbsf, g_beta); + if(g_proc_id==0){ + printf("\n"); fflush(stdout); + printf("SF b.c. abelian and standard sf weight factors included (only plaquette): \n"); fflush(stdout); + printf("The plaquette value is %e\n", plaquette_energy/(3.*6.*VOLUME*g_nproc)); fflush(stdout); + printf("The Wilson action value is %e\n", wilson_action); fflush(stdout); + printf("The Wilson action value sep bound is %e\n", wilson_action_sepbound); fflush(stdout); + } + /* sf b.c.
abelian field and weight factors for O(a)-improvement included (only plaquette here) */ + plaquette_energy = measure_plaquette_sf_weights_improvement(g_Tbsf, g_Cs, g_Ct) ; + wilson_action = measure_wilson_action_sf_weights_improvement(g_Tbsf, g_beta, g_Cs, g_Ct); + wilson_action_sepbound = measure_wilson_action_sf_weights_improvement_separate_boundary(g_Tbsf, g_beta, g_Cs, g_Ct); + if(g_proc_id==0){ + printf("\n"); fflush(stdout); + printf("SF b.c. abelian and weight factors for O(a)-improvement included (only plaquette): \n"); fflush(stdout); + printf("The plaquette value is %e\n", plaquette_energy/(3.*6.*VOLUME*g_nproc)); fflush(stdout); + printf("The Wilson action value is %e\n", wilson_action); fflush(stdout); + printf("The Wilson action value sep bound is %e\n", wilson_action_sepbound); fflush(stdout); + } + /* sf b.c. abelian field and weight factors for O(a)-improvement included (plaquette and rectangle) */ + plaquette_energy = measure_plaquette_sf_iwasaki(g_Tbsf, g_Cs, g_Ct, g_rgi_C0) ; + rectangle_energy = measure_rectangle_sf_iwasaki(g_Tbsf, g_rgi_C1, g_C1ss, g_C1tss, g_C1tts); + iwasaki_action = measure_iwasaki_action_sf(g_Tbsf, g_beta, g_Cs, g_Ct, g_rgi_C0, g_rgi_C1, g_C1ss, g_C1tss, g_C1tts); + if(g_proc_id==0){ + printf("\n"); fflush(stdout); + printf("SF b.c. abelian and weight factors for O(a)-improvement included (Iwasaki = plaquette and rectangle): \n"); + fflush(stdout); + printf("The plaquette value is %e\n", plaquette_energy/(3.*6.*VOLUME*g_nproc)); fflush(stdout); + printf("The rectangle value is %e\n", rectangle_energy/(2.*3.*6.*VOLUME*g_nproc)); fflush(stdout); + printf("The Iwasaki action value is %e\n", iwasaki_action); fflush(stdout); + } + + /* COUPLING CALCULATION */ + + if(g_rgi_C1 > 0. || g_rgi_C1 < 0.)
{ + + /* Evaluate each quantity exactly once and on every process, like the measurements above. */ + /* Calling these routines only inside the g_proc_id==0 blocks would hang if they reduce */ + /* over processes (NOTE(review): presumably they do, as measure_*_sf is run by all -- confirm). */ + k_norm = partial_lattice_lo_effective_iwasaki_action_sf_k(g_Tbsf, g_beta, g_rgi_C0, g_rgi_C1, g_eta); + action_deriv = partial_iwasaki_action_sf_respect_to_eta(g_Tbsf, g_beta, g_Cs, g_Ct, g_rgi_C0, g_rgi_C1, g_C1ss, g_C1tss, g_C1tts); + + /* print the value of the leading order effective action \Gamma[V] and its derivative \Gamma'[V] (plaquette case) */ + /* note that the derivative is precisely the constant factor in the definition of the coupling constant */ + if(g_proc_id==0){ + printf("\n"); fflush(stdout); + printf("Constant factor K: \n"); + printf("K = %e\n", k_norm); fflush(stdout); + } + + /* print the value of the "\partial(S)/\partial(eta)" which will have to be averaged later on to obtain the coupling constant */ + if(g_proc_id==0){ + printf("\n"); fflush(stdout); + printf("'Definition' of the coupling constant, partial(S)/partial(eta)\n"); fflush(stdout); + printf("S'[V,U] = %e\n", action_deriv); fflush(stdout); + printf("S'[V,U]/K = %e\n",action_deriv/k_norm); fflush(stdout); + printf("\n"); fflush(stdout); + } + + } + + else { + + factor = 1./(1. - (1.
- g_Ct)*(2./((double)g_Tbsf))); + /* Single evaluation on every process, for the same reason as in the Iwasaki branch */ + gamma_lo = lattice_lo_effective_plaquette_action_sf(g_Tbsf, g_beta, g_Ct, g_eta); + gamma_deriv = partial_lattice_lo_effective_plaquette_action_sf(g_Tbsf, g_beta, g_Ct, g_eta); + k_norm = partial_lattice_lo_effective_plaquette_action_sf_k(g_Tbsf, g_beta, g_Ct, g_eta); + action_deriv = partial_wilson_action_sf_respect_to_eta(g_Tbsf, g_beta, g_Cs, g_Ct); + + /* print the value of the leading order effective action \Gamma[V] and its derivative \Gamma'[V] (plaquette case) */ + /* note that the derivative is precisely the constant factor in the definition of the coupling constant */ + if(g_proc_id==0){ + printf("\n"); fflush(stdout); + printf("Effective action and its derivative with respect to eta, at leading order: \n"); + printf("Gamma[V] = %e\n", gamma_lo); fflush(stdout); + printf("Gamma'[V] = %e\n", gamma_deriv); fflush(stdout); + printf("factor*Gamma'[V] = %e\n", factor*gamma_deriv); fflush(stdout); + printf("K_plaquette = %e\n", k_norm); fflush(stdout); + } + + /* print the value of the "\partial(S)/\partial(eta)" which will have to be averaged later on to obtain the coupling constant */ + if(g_proc_id==0){ + printf("\n"); fflush(stdout); + printf("'Definition' of the coupling constant, partial(S)/partial(eta)\n"); fflush(stdout); + printf("S'[V,U] = %e\n",action_deriv); fflush(stdout); + printf("S'[V,U]/Gamma'[V] = %e\n",action_deriv/gamma_deriv); fflush(stdout); + printf("S'[V,U]/(factor*Gamma'[V]) = %e\n",action_deriv/(factor*gamma_deriv)); fflush(stdout); + printf("S'[V,U]/K_plaquette = %e\n",action_deriv/k_norm); fflush(stdout); + printf("\n"); fflush(stdout); + } + } + +
/*****************************************************************************************************************************/ + /*****************************************************************************************************************************/ + /*****************************************************************************************************************************/ + +#if 0 + /* (1): identifying the gauge fields "g_gauge_fields = V" and then calculating the plaquette as usually */ + induced_lattice_background(g_gauge_field, g_Tbsf, g_eta); + + wilson_action = measure_wilson_action_sf_weights_improvement(g_Tbsf, g_beta, g_Cs, g_Ct); + wilson_action_sepbound = measure_wilson_action_sf_weights_improvement_separate_boundary(g_Tbsf, g_beta, g_Cs, g_Ct); + iwasaki_action = measure_iwasaki_action_sf(g_Tbsf, g_beta, g_Cs, g_Ct, g_rgi_C0, g_rgi_C1, g_C1ss, g_C1tss, g_C1tts); + partial_iwasaki_action = partial_iwasaki_action_sf_respect_to_eta(g_Tbsf, g_beta, g_Cs, g_Ct, g_rgi_C0, g_rgi_C1, g_C1ss, g_C1tss, g_C1tts); + + printf(" Assigning U=V with the functions defined for that and then calculating S[V] from the same functions to calculate the actions as in previous cases \n"); fflush(stdout); + printf("\n"); fflush(stdout); + printf("S_sf_wilson_sepbound[U,W',W] = %e \n", wilson_action_sepbound); fflush(stdout); + printf("S_sf_wilson_notsepbd[U,W',W] = %e \n", wilson_action ); fflush(stdout); + printf("S_sf_iwasaki_notsepb[U,W',W] = %e \n", iwasaki_action); fflush(stdout); + printf("G[V] = %e \n", (6./g_beta)*iwasaki_action); fflush(stdout); + printf("S'[V] = %e \n", partial_iwasaki_action); fflush(stdout); + printf("G'[V] = %e \n", (6./g_beta)*partial_iwasaki_action);fflush(stdout); + printf("\n"); fflush(stdout); + printf("measure_plaquette_sf_weights_improved_bulk = %e \n", measure_plaquette_sf_weights_improved_bulk(g_Tbsf)); fflush(stdout); + printf("measure_plaquette_sf_weights_improved_boundary_0(cs,ct) = %e \n",
measure_plaquette_sf_weights_improved_boundary_0(g_Cs, g_Ct)); fflush(stdout); + printf("measure_plaquette_sf_weights_improved_boundary_t(cs) = %e \n", measure_plaquette_sf_weights_improved_boundary_t(g_Tbsf, g_Cs)); fflush(stdout); + printf("measure_plaquette_sf_weights_improved_boundary_t_minus_1(ct) = %e \n", measure_plaquette_sf_weights_improved_boundary_t_minus_1(g_Tbsf, g_Ct)); fflush(stdout); + printf("\n"); fflush(stdout); + + + /* obtain normalization factor by calculating the Wilson action for U=1 in all the lattice + and subtract it from the previous result for the action. + Therefore, it should agree with the result obtained from the analytical expression implemented below */ + set_all_links_to_one_with_dirichlet(g_Tbsf); + + iwasaki_action -= measure_iwasaki_action_sf(g_Tbsf, g_beta, g_Cs, g_Ct, g_rgi_C0, g_rgi_C1, g_C1ss, g_C1tss, g_C1tts); + partial_iwasaki_action -= partial_iwasaki_action_sf_respect_to_eta(g_Tbsf, g_beta, g_Cs, g_Ct, g_rgi_C0, g_rgi_C1, g_C1ss, g_C1tss, g_C1tts); + + printf("\n"); fflush(stdout); + printf(" Previous case but substracting the normalization factor to the action: \n"); fflush(stdout); + printf("\n"); fflush(stdout); + printf("Norm - S_sf_iwasaki_notsepb[U,W',W] = %e \n", iwasaki_action); fflush(stdout); + printf("Norm - G[V] = %e \n", (6./g_beta)*iwasaki_action); fflush(stdout); + printf("Norm' - S'[V] = %e \n", partial_iwasaki_action); fflush(stdout); + printf("Norm' - G'[V] = %e \n", (6./g_beta)*partial_iwasaki_action);fflush(stdout); + printf("\n"); fflush(stdout); + + + /* (2): directly from the analytical expression which has been implemented in: */ + printf("\n"); fflush(stdout); + printf(" Assigning U=V: but directly using the analytical expression of the action S[V] \n"); fflush(stdout); + printf("\n"); fflush(stdout); + printf("S[V]_analy = %e \n", lattice_background_plaquette_action_sf(g_Tbsf, g_beta, g_Ct, g_eta)); fflush(stdout); + printf("G[V]_analy = %e \n", lattice_lo_effective_plaquette_action_sf(g_Tbsf,
g_beta, g_Ct, g_eta)); fflush(stdout); + printf("S'[V]_analy = %e \n", partial_lattice_background_plaquette_action_sf(g_Tbsf, g_beta, g_Ct, g_eta)); fflush(stdout); + printf("G'[V]_analy = %e \n", partial_lattice_lo_effective_plaquette_action_sf(g_Tbsf, g_beta, g_Ct, g_eta)); fflush(stdout); + printf("\n"); fflush(stdout); + + + /* obtain normalization factor by calculating the Wilson action for U=1 in all the lattice */ + set_all_links_to_one_with_dirichlet(g_Tbsf); + + printf("\n"); fflush(stdout); + printf(" Setting U=Id and Dirichlet at x0= 0, t \n"); fflush(stdout); + printf("\n"); fflush(stdout); + /* The next three prints give me the same result, from 3 different functions. + The first two functions were cross-checked with Dru ==> they should be right. + However, the result here obtained still differs to what we obtain by doing the + difference between our result (for U=V) and the analytical expression */ + printf("S_sf_wilson_sepbound[U,W',W] = %e \n", measure_wilson_action_sf_weights_improvement_separate_boundary(g_Tbsf, g_beta, g_Cs, g_Ct)); fflush(stdout); + printf("S_sf_wilson_notsepbd[U,W',W] = %e \n", measure_wilson_action_sf_weights_improvement(g_Tbsf, g_beta, g_Cs, g_Ct) ); fflush(stdout); + printf("S_sf_iwasaki_notsepb[U,W',W] = %e \n", measure_iwasaki_action_sf(g_Tbsf, g_beta, g_Cs, g_Ct, g_rgi_C0, g_rgi_C1, g_C1ss, g_C1tss, g_C1tts)); fflush(stdout); + printf("G[U,W',W] = %e \n", (6./g_beta)*measure_iwasaki_action_sf(g_Tbsf, g_beta, g_Cs, g_Ct, g_rgi_C0, g_rgi_C1, g_C1ss, g_C1tss, g_C1tts)); fflush(stdout); + printf("S'[U,W',W] = %e \n", partial_iwasaki_action_sf_respect_to_eta(g_Tbsf, g_beta, g_Cs, g_Ct, g_rgi_C0, g_rgi_C1, g_C1ss, g_C1tss, g_C1tts)); fflush(stdout); + printf("G'[U,W',W] = %e \n", (6./g_beta)*partial_iwasaki_action_sf_respect_to_eta(g_Tbsf, g_beta, g_Cs, g_Ct, g_rgi_C0, g_rgi_C1, g_C1ss, g_C1tss, g_C1tts));fflush(stdout); + printf("\n"); fflush(stdout); + printf("measure_plaquette_sf_weights_improved_bulk = %e \n",
measure_plaquette_sf_weights_improved_bulk(g_Tbsf)); fflush(stdout); + printf("measure_plaquette_sf_weights_improved_boundary_0(cs,ct) = %e \n", measure_plaquette_sf_weights_improved_boundary_0(g_Cs, g_Ct)); fflush(stdout); + printf("measure_plaquette_sf_weights_improved_boundary_t(cs) = %e \n", measure_plaquette_sf_weights_improved_boundary_t(g_Tbsf, g_Cs)); fflush(stdout); + printf("measure_plaquette_sf_weights_improved_boundary_t_minus_1(ct) = %e \n", measure_plaquette_sf_weights_improved_boundary_t_minus_1(g_Tbsf, g_Ct)); fflush(stdout); + printf("\n"); fflush(stdout); + + + /* obtain normalization factor by calculating the Wilson action for U=1 in all the lattice */ + set_all_links_to_one(); + + printf("\n"); fflush(stdout); + printf(" Setting U=Id \n"); fflush(stdout); + printf("\n"); fflush(stdout); + /* For the first case below, pbc, I've gotten the number I expected: "(Nc*12*L^4)/g02". + Thus, since the function "measure_iwasaki_action(g_beta, g_rgi_C0, g_rgi_C1))" was crosschecked bf with Dru it should be right. + It somehow tells me that also the function which assigns the gauge fields to one "set_all_links_to_one()" should be right.*/ + printf("S_pbc[U,W',W] = %e \n", measure_iwasaki_action(g_beta, g_rgi_C0, g_rgi_C1)); fflush(stdout); + /* The next three prints give me the same result, from 3 different functions. + The first two functions were cross-checked with Dru ==> they should be right.
+ However, the result here obtained still differs to what we obtain by doing the + difference between our result (for U=V) and the analytical expression */ + printf("S_sf_wilson_sepbound[U,W',W] = %e \n", measure_wilson_action_sf_weights_improvement_separate_boundary(g_Tbsf, g_beta, g_Cs, g_Ct)); fflush(stdout); + printf("S_sf_wilson_notsepbd[U,W',W] = %e \n", measure_wilson_action_sf_weights_improvement(g_Tbsf, g_beta, g_Cs, g_Ct) ); fflush(stdout); + printf("S_sf_iwasaki_notsepb[U,W',W] = %e \n", measure_iwasaki_action_sf(g_Tbsf, g_beta, g_Cs, g_Ct, g_rgi_C0, g_rgi_C1, g_C1ss, g_C1tss, g_C1tts)); fflush(stdout); + printf("G[U,W',W] = %e \n", (6./g_beta)*measure_iwasaki_action_sf(g_Tbsf, g_beta, g_Cs, g_Ct, g_rgi_C0, g_rgi_C1, g_C1ss, g_C1tss, g_C1tts)); fflush(stdout); + printf("S'[U,W',W] = %e \n", partial_iwasaki_action_sf_respect_to_eta(g_Tbsf, g_beta, g_Cs, g_Ct, g_rgi_C0, g_rgi_C1, g_C1ss, g_C1tss, g_C1tts)); fflush(stdout); + printf("G'[U,W',W] = %e \n", (6./g_beta)*partial_iwasaki_action_sf_respect_to_eta(g_Tbsf, g_beta, g_Cs, g_Ct, g_rgi_C0, g_rgi_C1, g_C1ss, g_C1tss, g_C1tts));fflush(stdout); + printf("\n"); fflush(stdout); + printf("measure_plaquette_sf_weights_improved_bulk = %e \n", measure_plaquette_sf_weights_improved_bulk(g_Tbsf)); fflush(stdout); + printf("measure_plaquette_sf_weights_improved_boundary_0(cs,ct) = %e \n", measure_plaquette_sf_weights_improved_boundary_0(g_Cs, g_Ct)); fflush(stdout); + printf("measure_plaquette_sf_weights_improved_boundary_t(cs) = %e \n", measure_plaquette_sf_weights_improved_boundary_t(g_Tbsf, g_Cs)); fflush(stdout); + printf("measure_plaquette_sf_weights_improved_boundary_t_minus_1(ct) = %e \n", measure_plaquette_sf_weights_improved_boundary_t_minus_1(g_Tbsf, g_Ct)); fflush(stdout); + printf("\n"); fflush(stdout); + +#endif + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_observables.h b/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_observables.h new file mode 100644 index
0000000000000000000000000000000000000000..1bf7b1a1f8b3ad0c7c2a5c4de5e60c567fe7dd46 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_observables.h @@ -0,0 +1,14 @@ +/******************************************* +* +* FILE: sf_observables.h +* +* Author: Jenifer Gonzalez Lopez +* +********************************************/ +#ifndef _SF_OBSERVABLES_H +#define _SF_OBSERVABLES_H + +void sf_observables(); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_utils.c b/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_utils.c new file mode 100644 index 0000000000000000000000000000000000000000..ecf0c0c75008fd3f214b66393c0db1b264231fb3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sf/sf_utils.c @@ -0,0 +1,281 @@ +/******************************************************************************* + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" + +double calc_sq_plaq( void ) +{ + int x; + int x_p_mu; + int x_p_nu; + int mu; + int nu; + su3* u_mu_x; + su3* u_nu_x_p_mu; + su3* u_mu_x_p_nu; + su3* u_nu_x; + su3 tmp1; + su3 tmp2; + double tr; + double sum = 0; + + for( x=0; x. + ***********************************************************************/ + +/************************************************************ + * + * Routines to handle system signals + * + * void catch_ill_inst(int s) + * + * catches illegal instructions signal + * and writes an error indication to + * stdout. 
+ * + * input: + * int s: signal number (not needed) + * + ************************************************************/ + +#ifdef HAVE_CONFIG_H +# include<config.h> +#endif +#include <stdlib.h> +#include <stdio.h> +#ifdef MPI +# include <mpi.h> +#endif + + +/* Report an illegal-instruction signal on stderr, naming the SSE level(s) the */ +/* code was compiled for, then terminate. The argument s is not used. */ +void catch_ill_inst(int s){ + fprintf(stderr, "An illegal instruction occured!\n"); +#ifdef SSE + fprintf(stderr, "Your code was compiled to use SSE1 instructions.\n"); +#endif +#ifdef SSE2 + fprintf(stderr, "Your code was compiled to use SSE2 instructions.\n"); +#endif +#ifdef SSE3 + fprintf(stderr, "Your code was compiled to use SSE3 instructions.\n"); +#endif + fprintf(stderr, "Probably this caused the exception.\n"); + fprintf(stderr, "Please check whether your processor supports SSE1/2/3) instructions!\n"); + fprintf(stderr, "Aborting...\n"); + fflush(stderr); /* the messages above were written to stderr, not stdout */ +#ifdef MPI + MPI_Abort(MPI_COMM_WORLD, 1); + MPI_Finalize(); +#endif + exit(1); /* non-zero status, same code as passed to MPI_Abort: the run failed */ +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sighandler.h b/qcd/part_cpu/applications/QCD/src/kernel_D/sighandler.h new file mode 100644 index 0000000000000000000000000000000000000000..aa2e9ea74a3bd05934925e50207392d070336d92 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sighandler.h @@ -0,0 +1,69 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ + +/************************************************************ + * + * Routines to handle system signals + * + * void catch_ill_inst(int s) + * + * catches illegal instructions signal + * and writes an error indication to + * stderr. + * + * input: + * int s: signal number (not used) + * + * + * void catch_del_sig(int s) + * + * catches some user defined signals + * and saves configuration and + * random number status to disk + * + * input: + * int s: signal number (not used) + ************************************************************/ + +#ifndef _SIGHANDLER_H +#define _SIGHANDLER_H +/* During critical regions one does not want */ +/* the configuration to be dumped */ +/* in this case set dontdump to 1 while */ +/* the program is in the region */ +/* don't forget to reset this value... */ +extern int dontdump; + +/* If a signal is caught while dontdump==1 */ +/* forcedump is set to 1 */ +/* This can be used to dump data to disk and */ +/* exit safely after the critical region has finished */ +extern int forcedump; + +/* Catch an illegal instruction in order */ +/* to give the user a hint what was wrong */ +void catch_ill_inst(int); + +/* catch some signals as SIGUSR1|2 and SIGTERM */ +/* to save the current configuration and */ +/* random number state */ +/* This might help to save computing time */ +void catch_del_sig(int); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..408565722dfd65a4de48e9ebd81af5e24267ffc5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/Makefile.in @@ -0,0 +1,97 @@ + +srcdir = @srcdir@ +top_builddir = @top_builddir@ +abs_top_builddir = @abs_top_builddir@
+top_srcdir = @top_srcdir@ +abs_top_srcdir = @abs_top_srcdir@ +subdir = smearing +builddir = @builddir@ + +CFLAGS = @CFLAGS@ +DEPFLAGS = @DEPFLAGS@ +LDFLAGS = @LDFLAGS@ +DEFS = @DEFS@ +OPTARGS = @OPTARGS@ + +AR = @AR@ +RANLIB = @RANLIB@ +CC = @CC@ +CCDEP = @CCDEP@ +CCLD = $(CC) +LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) ${OPTARGS} -o $@ +LEX = @LEX@ +AUTOCONF = @AUTOCONF@ +DEFS = @DEFS@ + +LEMON_AVAILABLE = @LEMON_AVAILABLE@ + +INCLUDES = @INCLUDES@ +LDADD = +COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} ${OPTARGS} + +LIBRARIES = libsmear + +libsmear_TARGETS = hex_hex_smear hex_stout_exclude_none hex_stout_exclude_one hex_stout_exclude_two \ + hyp_APE_project_exclude_one hyp_APE_project_exclude_two hyp_APE_project_exclude_none \ + hyp_hyp_staples_exclude_none hyp_hyp_staples_exclude_one hyp_hyp_staples_exclude_two \ + hyp_hyp_smear stout_stout_smear ape_ape_smear utils_reunitarize utils_generic_staples \ + utils_project_antiherm utils_print_su3 utils_print_config_to_screen + +libsmear_OBJECTS = $(addsuffix .o, ${libsmear_TARGETS}) + +# default rule + +all: Makefile dep libsmear.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@ +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@ +profile all-profile: all + + +#include dep rules + + +-include $(addsuffix .d,${libsmear_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) -c $< + + +# rule to make libsmear +libsmear.a: ${libsmear_OBJECTS} Makefile + @rm -f libsmear.a + @${AR} cru libsmear.a $(libsmear_OBJECTS) + @$(RANLIB) libsmear.a + @cp libsmear.a ${top_builddir}/lib/libsmear.a + +# rule to generate .d files + +$(addsuffix .d,$(libsmear_TARGETS)): %.d: ${srcdir}/%.c Makefile + @$(CCDEP) ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, 
${libsmear_TARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libsmear.a + +distclean: clean + rm -f Makefile + + +.PHONY: all dep clean compile-clean distclean debug all-debug profile all-profile diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/ape.h b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/ape.h new file mode 100644 index 0000000000000000000000000000000000000000..ddfa3e5a219d59c60dd0fbbe6c38c56e6a945339 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/ape.h @@ -0,0 +1,11 @@ +#pragma once + +#include + +struct ape_parameters +{ + double rho; + int iterations; +}; + +int ape_smear(su3_tuple *m_field_out, struct ape_parameters const *params, su3_tuple *m_field_in); \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/ape.ih b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/ape.ih new file mode 100644 index 0000000000000000000000000000000000000000..c1ec7fc101626b323522627b205ee75f7ba795b7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/ape.ih @@ -0,0 +1,21 @@ +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/ape_ape_smear.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/ape_ape_smear.c new file mode 100644 index 0000000000000000000000000000000000000000..1f12b273f841c7e36f444b14684a9dd754e385b2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/ape_ape_smear.c @@ -0,0 +1,61 @@ +#include "ape.ih" + +/* + * APE smearing. In each of params->iterations sweeps every link U is + * combined with the result S of generic_staples() through + * _real_times_su3_plus_real_times_su3(..., 1 - rho, U, rho / 6, S) and then + * passed to reunitarize(). + * + * m_field_out receives the smeared links; it may alias m_field_in because a + * sweep is assembled in a scratch buffer before it is copied out. + * Returns 0 on success, -1 if the scratch buffer cannot be allocated. + */ +int ape_smear(su3_tuple *m_field_out, struct ape_parameters const *params, su3_tuple *m_field_in) +{ + static int initialized = 0; + static su3_tuple *buffer; +
static su3 tmp; + double const rho_p = 1 - params->rho; + double const rho_s = params->rho / 6.0; + + if (!initialized) + { + /* Allocate the scratch buffer on first use. A whole spare su3_tuple, not a */ + /* single spare byte, is requested so that the alignment shift below stays */ + /* inside the allocation (assumes ALIGN_BASE < sizeof(su3_tuple)) */ + buffer = (su3_tuple*)malloc(sizeof(su3_tuple) * (VOLUMEPLUSRAND + 1)); + if (buffer == (su3_tuple*)NULL) + return -1; + +#if (defined SSE || defined SSE2 || defined SSE3) + buffer = (su3_tuple*)(((unsigned long int)(buffer) + ALIGN_BASE) & ~ALIGN_BASE); +#endif + initialized = 1; + } + + /* start of the APE smearing */ + for(int iter = 0; iter < params->iterations; ++iter) + { + for (int x = 0; x < VOLUME; ++x) + for (int mu = 0; mu < 4; ++mu) + { + generic_staples(&tmp, x, mu, m_field_in); + _real_times_su3_plus_real_times_su3(buffer[x][mu], rho_p, m_field_in[x][mu], rho_s, tmp) + reunitarize(&buffer[x][mu]); + } + + for(int x = 0; x < VOLUME; ++x) + for(int mu = 0 ; mu < 4; ++mu) + { + _su3_assign(m_field_out[x][mu], buffer[x][mu]); + } + + generic_exchange(m_field_out, sizeof(su3_tuple)); + m_field_in = m_field_out; /* Prepare for next iteration */ + } + + return(0); +} + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex.h b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex.h new file mode 100644 index 0000000000000000000000000000000000000000..30ef4931a05114347fe0e35d78f030a08b485c03 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex.h @@ -0,0 +1,14 @@ +#pragma once + +#include +#include + +/* Just to have a consistent look to the interface */ +typedef struct hyp_parameters hex_parameters; + +/* All defined in terms of arrays of tuples -- needed to allow for g_gauge_field as input */ +void stout_exclude_none(gauge_field_t buff_out, double const coeff, gauge_field_array_t staples, gauge_field_t buff_in); +void stout_exclude_one (gauge_field_array_t buff_out, double const coeff, gauge_field_array_t staples, gauge_field_t buff_in); +void stout_exclude_two (gauge_field_array_t buff_out, double const
coeff, gauge_field_array_t staples, gauge_field_t buff_in); + +int hex_smear(gauge_field_t m_field_out, hex_parameters const *params, gauge_field_t m_field_in); /* 4 components in, 4 components out */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex.ih b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex.ih new file mode 100644 index 0000000000000000000000000000000000000000..33971fbabdf2f48fbfadbb9db126e6ef7d7846b2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex.ih @@ -0,0 +1,21 @@ +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex_hex_smear.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex_hex_smear.c new file mode 100644 index 0000000000000000000000000000000000000000..d0f5538a8437b1d803693b44321545a096db9087 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex_hex_smear.c @@ -0,0 +1,60 @@ +#include "hex.ih" + +/* + * HEX smearing. Each of params->iterations passes runs three levels of + * contractions: hyp_staples_exclude_two/one/none fill gamma_buffer and + * stout_exclude_two/one/none turn it into the next field using + * params->alpha[2], alpha[1] and alpha[0]; each result is handed to + * generic_exchange(). The last level writes m_field_out. + * Returns 0 on success, -1 if a scratch buffer cannot be allocated. + */ +int hex_smear(su3_tuple *m_field_out, hex_parameters const *params, su3_tuple *m_field_in) +{ + static int initialized = 0; + static su3_tuple *gamma_buffer[3]; + static su3_tuple *v_buffer[3]; + + if (!initialized) + { + /* Allocate consecutive memory for both of the buffers upon first instantiation */ + /* Three times 4 buffers needed for compatibility purposes (similar signature to gauge_field...)
*/ + /* A whole spare su3_tuple (not one byte) covers the alignment shift below; assumes ALIGN_BASE < sizeof(su3_tuple) */ + for (int idx = 0; idx < 3; ++idx) + { + gamma_buffer[idx] = (su3_tuple*)malloc(sizeof(su3_tuple) * (VOLUMEPLUSRAND + 1)); + v_buffer[idx] = (su3_tuple*)malloc(sizeof(su3_tuple) * (VOLUMEPLUSRAND + 1)); + if ((gamma_buffer[idx] == (su3_tuple*)NULL) || (v_buffer[idx] == (su3_tuple*)NULL)) + return -1; +#if (defined SSE || defined SSE2 || defined SSE3) + gamma_buffer[idx] = (su3_tuple*)(((unsigned long int)(gamma_buffer[idx]) + ALIGN_BASE) & ~ALIGN_BASE); + v_buffer[idx] = (su3_tuple*)(((unsigned long int)(v_buffer[idx]) + ALIGN_BASE) & ~ALIGN_BASE); +#endif + } + initialized = 1; + } + + for (int iter = 0; iter < params->iterations; ++iter) + { + /* First level of contractions */ + hyp_staples_exclude_two(gamma_buffer, m_field_in); + stout_exclude_two(v_buffer, params->alpha[2], gamma_buffer, m_field_in); + for (int idx = 0; idx < 3; ++idx) + generic_exchange(v_buffer[idx], sizeof(su3_tuple)); + + /* Second level of contractions */ + hyp_staples_exclude_one(gamma_buffer, v_buffer); + stout_exclude_one(v_buffer, params->alpha[1], gamma_buffer, m_field_in); + for (int idx = 0; idx < 3; ++idx) + generic_exchange(v_buffer[idx], sizeof(su3_tuple)); + + /* Final level of contractions */ + hyp_staples_exclude_none(gamma_buffer, v_buffer); + stout_exclude_none(m_field_out, params->alpha[0], gamma_buffer, m_field_in); + generic_exchange(m_field_out, sizeof(su3_tuple)); + + m_field_in = m_field_out; /* Prepare for next iteration */ + } + + return 0; +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex_stout_exclude_none.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex_stout_exclude_none.c new file mode 100644 index 0000000000000000000000000000000000000000..0d696ac70bcff854e9a2c4f9030f6e839f02b0dd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex_stout_exclude_none.c @@ -0,0 +1,24 @@ +#include "hex.ih" + +void stout_exclude_none(su3_tuple *buff_out, double const coeff, su3_tuple **staples, su3_tuple *buff_in) +{ +
static su3 tmp; + +#define _MULTIPLY_AND_EXPONENTIATE(x, principal) \ + { \ + _su3_times_su3d(tmp, (*staples)[x][principal], buff_in[x][principal]); \ + project_antiherm(&tmp); \ + _real_times_su3(buff_out[x][principal], coeff, tmp); \ + exposu3_in_place(&buff_out[x][principal]); \ + } + + for (int x = 0; x < VOLUME; ++x) + { + _MULTIPLY_AND_EXPONENTIATE(x, I0_0); + _MULTIPLY_AND_EXPONENTIATE(x, I0_1); + _MULTIPLY_AND_EXPONENTIATE(x, I0_2); + _MULTIPLY_AND_EXPONENTIATE(x, I0_3); + } + +#undef _MULTIPLY_AND_EXPONENTIATE +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex_stout_exclude_one.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex_stout_exclude_one.c new file mode 100644 index 0000000000000000000000000000000000000000..4071e29a82d3a7130f41683121ba902e9cb95e3d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex_stout_exclude_one.c @@ -0,0 +1,35 @@ +#include "hex.ih" + +void stout_exclude_one(su3_tuple **buff_out, double const coeff, su3_tuple **staples, su3_tuple *buff_in) +{ + static su3 tmp; + +#define _MULTIPLY_AND_EXPONENTIATE(x, principal, component) \ + { \ + _su3_times_su3d(tmp, staples[component / 4][x][component % 4], buff_in[x][principal]); \ + project_antiherm(&tmp); \ + _real_times_su3(buff_out[component / 4][x][component % 4], coeff, tmp); \ + exposu3_in_place(&buff_out[component / 4][x][component % 4]); \ + } + + for (int x = 0; x < VOLUME; ++x) + { + _MULTIPLY_AND_EXPONENTIATE(x, I0_0, I1_0_1); + _MULTIPLY_AND_EXPONENTIATE(x, I0_0, I1_0_2); + _MULTIPLY_AND_EXPONENTIATE(x, I0_0, I1_0_3); + + _MULTIPLY_AND_EXPONENTIATE(x, I0_1, I1_1_0); + _MULTIPLY_AND_EXPONENTIATE(x, I0_1, I1_1_2); + _MULTIPLY_AND_EXPONENTIATE(x, I0_1, I1_1_3); + + _MULTIPLY_AND_EXPONENTIATE(x, I0_2, I1_2_0); + _MULTIPLY_AND_EXPONENTIATE(x, I0_2, I1_2_1); + _MULTIPLY_AND_EXPONENTIATE(x, I0_2, I1_2_3); + + _MULTIPLY_AND_EXPONENTIATE(x, I0_3, I1_3_0); + _MULTIPLY_AND_EXPONENTIATE(x, I0_3, I1_3_1); + _MULTIPLY_AND_EXPONENTIATE(x, 
I0_3, I1_3_2); + } + +#undef _MULTIPLY_AND_EXPONENTIATE +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex_stout_exclude_two.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex_stout_exclude_two.c new file mode 100644 index 0000000000000000000000000000000000000000..921457bbe0566295cf144cd2a4d90dab0a04c3ed --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hex_stout_exclude_two.c @@ -0,0 +1,35 @@ +#include "hex.ih" + +void stout_exclude_two(su3_tuple **buff_out, double const coeff, su3_tuple **staples, su3_tuple *buff_in) +{ + static su3 tmp; + +#define _MULTIPLY_AND_EXPONENTIATE(x, principal, component) \ + { \ + _su3_times_su3d(tmp, staples[component / 4][x][component % 4], buff_in[x][principal]); \ + project_antiherm(&tmp); \ + _real_times_su3(buff_out[component / 4][x][component % 4], coeff, tmp); \ + exposu3_in_place(&buff_out[component / 4][x][component % 4]); \ + } + + for (int x = 0; x < VOLUME; ++x) + { + _MULTIPLY_AND_EXPONENTIATE(x, I0_0, I2_0_12); + _MULTIPLY_AND_EXPONENTIATE(x, I0_0, I2_0_23); + _MULTIPLY_AND_EXPONENTIATE(x, I0_0, I2_0_13); + + _MULTIPLY_AND_EXPONENTIATE(x, I0_1, I2_1_02); + _MULTIPLY_AND_EXPONENTIATE(x, I0_1, I2_1_03); + _MULTIPLY_AND_EXPONENTIATE(x, I0_1, I2_1_23); + + _MULTIPLY_AND_EXPONENTIATE(x, I0_2, I2_2_01); + _MULTIPLY_AND_EXPONENTIATE(x, I0_2, I2_2_03); + _MULTIPLY_AND_EXPONENTIATE(x, I0_2, I2_2_13); + + _MULTIPLY_AND_EXPONENTIATE(x, I0_3, I2_3_01); + _MULTIPLY_AND_EXPONENTIATE(x, I0_3, I2_3_02); + _MULTIPLY_AND_EXPONENTIATE(x, I0_3, I2_3_12); + } + +#undef _MULTIPLY_AND_EXPONENTIATE +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp.h b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp.h new file mode 100644 index 0000000000000000000000000000000000000000..52df0588bf0c1dc0a44e7a2a86efbb799e6086a3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +struct hyp_parameters +{ + double 
alpha[3]; + int iterations; +}; + +/* All defined in terms of arrays of tuples -- needed to allow for g_gauge_field as input */ + +void hyp_staples_exclude_none(su3_tuple **buff_out, su3_tuple **buff_in); /* 12 components in, 12 components out */ +void hyp_staples_exclude_one (su3_tuple **buff_out, su3_tuple **buff_in); /* 12 components in, 12 components out */ +void hyp_staples_exclude_two (su3_tuple **buff_out, su3_tuple *buff_in); /* 4 components in, 12 components out */ + +void APE_project_exclude_none(su3_tuple *buff_out, double const coeff, su3_tuple **staples, su3_tuple *buff_in); +void APE_project_exclude_one (su3_tuple **buff_out, double const coeff, su3_tuple **staples, su3_tuple *buff_in); +void APE_project_exclude_two (su3_tuple **buff_out, double const coeff, su3_tuple **staples, su3_tuple *buff_in); + +int hyp_smear(su3_tuple *m_field_out, struct hyp_parameters const *params, su3_tuple *m_field_in); /* 4 components in, 4 components out */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp.ih b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp.ih new file mode 100644 index 0000000000000000000000000000000000000000..b71c28c43143b4e7bdd80c146db415fd1c707866 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp.ih @@ -0,0 +1,21 @@ +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_APE_project_exclude_none.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_APE_project_exclude_none.c new file mode 100644 index 0000000000000000000000000000000000000000..c484515b9667e9fe94131be8b3c2a1fc566cc593 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_APE_project_exclude_none.c @@ -0,0 +1,23 @@ +#include "utils.ih" + +void 
APE_project_exclude_none(su3_tuple *buff_out, double const coeff, su3_tuple **staples, su3_tuple *buff_in) +{ + double const coeff_principal = 1.0 - coeff; + double const coeff_staples = coeff / 6.0; + +#define _ADD_AND_REUNITARIZE(x, component) \ + { \ + _real_times_su3_plus_real_times_su3(buff_out[x][component], coeff_principal, buff_in[x][component], coeff_staples, (*staples)[x][component]) \ + reunitarize(buff_out[x] + component); \ + } + + for (int x = 0; x < VOLUME; ++x) + { + _ADD_AND_REUNITARIZE(x, I0_0); + _ADD_AND_REUNITARIZE(x, I0_1); + _ADD_AND_REUNITARIZE(x, I0_2); + _ADD_AND_REUNITARIZE(x, I0_3); + } + +#undef _ADD_AND_REUNITARIZE +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_APE_project_exclude_one.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_APE_project_exclude_one.c new file mode 100644 index 0000000000000000000000000000000000000000..869e23226ba6a642937341d91d3155a7970efde7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_APE_project_exclude_one.c @@ -0,0 +1,34 @@ +#include "hyp.ih" + +void APE_project_exclude_one(su3_tuple **buff_out, double const coeff, su3_tuple **staples, su3_tuple *buff_in) +{ + double const coeff_principal = 1.0 - coeff; + double const coeff_staples = coeff / 4.0; + +#define _ADD_AND_REUNITARIZE(x, principal, component) \ + { \ + _real_times_su3_plus_real_times_su3(buff_out[component / 4][x][component % 4], coeff_principal, buff_in[x][principal], coeff_staples, staples[component / 4][x][component % 4]) \ + reunitarize(buff_out[component / 4][x] + (component % 4)); \ + } + + for (int x = 0; x < VOLUME; ++x) + { + _ADD_AND_REUNITARIZE(x, I0_0, I1_0_1); + _ADD_AND_REUNITARIZE(x, I0_0, I1_0_2); + _ADD_AND_REUNITARIZE(x, I0_0, I1_0_3); + + _ADD_AND_REUNITARIZE(x, I0_1, I1_1_0); + _ADD_AND_REUNITARIZE(x, I0_1, I1_1_2); + _ADD_AND_REUNITARIZE(x, I0_1, I1_1_3); + + _ADD_AND_REUNITARIZE(x, I0_2, I1_2_0); + _ADD_AND_REUNITARIZE(x, I0_2, I1_2_1); + 
_ADD_AND_REUNITARIZE(x, I0_2, I1_2_3); + + _ADD_AND_REUNITARIZE(x, I0_3, I1_3_0); + _ADD_AND_REUNITARIZE(x, I0_3, I1_3_1); + _ADD_AND_REUNITARIZE(x, I0_3, I1_3_2); + } + +#undef _ADD_AND_REUNITARIZE +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_APE_project_exclude_two.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_APE_project_exclude_two.c new file mode 100644 index 0000000000000000000000000000000000000000..ee47fd5036b9391fe158467d83a871bbc753b542 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_APE_project_exclude_two.c @@ -0,0 +1,34 @@ +#include "hyp.ih" + +void APE_project_exclude_two(su3_tuple **buff_out, double const coeff, su3_tuple **staples, su3_tuple *buff_in) +{ + double const coeff_principal = 1.0 - coeff; + double const coeff_staples = coeff / 2.0; + +#define _ADD_AND_REUNITARIZE(x, principal, component) \ + { \ + _real_times_su3_plus_real_times_su3(buff_out[component / 4][x][component % 4], coeff_principal, buff_in[x][principal], coeff_staples, staples[component / 4][x][component % 4]) \ + reunitarize(buff_out[component / 4][x] + (component % 4)); \ + } + + for (int x = 0; x < VOLUME; ++x) + { + _ADD_AND_REUNITARIZE(x, I0_0, I2_0_12); + _ADD_AND_REUNITARIZE(x, I0_0, I2_0_23); + _ADD_AND_REUNITARIZE(x, I0_0, I2_0_13); + + _ADD_AND_REUNITARIZE(x, I0_1, I2_1_02); + _ADD_AND_REUNITARIZE(x, I0_1, I2_1_03); + _ADD_AND_REUNITARIZE(x, I0_1, I2_1_23); + + _ADD_AND_REUNITARIZE(x, I0_2, I2_2_01); + _ADD_AND_REUNITARIZE(x, I0_2, I2_2_03); + _ADD_AND_REUNITARIZE(x, I0_2, I2_2_13); + + _ADD_AND_REUNITARIZE(x, I0_3, I2_3_01); + _ADD_AND_REUNITARIZE(x, I0_3, I2_3_02); + _ADD_AND_REUNITARIZE(x, I0_3, I2_3_12); + } + +#undef _ADD_AND_REUNITARIZE +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_hyp_smear.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_hyp_smear.c new file mode 100644 index 0000000000000000000000000000000000000000..59356adf0eed9cab05086005a8eea079482764ac 
--- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_hyp_smear.c @@ -0,0 +1,51 @@ +#include "hyp.ih" + +int hyp_smear(su3_tuple *m_field_out, struct hyp_parameters const *params, su3_tuple *m_field_in) +{ + static int initialized = 0; + static su3_tuple *gamma_buffer[3]; + static su3_tuple *v_buffer[3]; + + if (!initialized) + { + /* Allocate consecutive memory for both of the buffers upon first instantiation */ + /* Three times 4 buffers needed for compatibility purposes (similar signature to gauge_field...) */ + for (int idx = 0; idx < 3; ++idx) + { + gamma_buffer[idx] = (su3_tuple*)malloc(sizeof(su3_tuple) * (VOLUMEPLUSRAND + 1)); /* one spare element (not one spare byte): slack for the alignment round-up below */ + v_buffer[idx] = (su3_tuple*)malloc(sizeof(su3_tuple) * (VOLUMEPLUSRAND + 1)); /* same slack as gamma_buffer */ + if ((gamma_buffer[idx] == (su3_tuple*)NULL) || (v_buffer[idx] == (su3_tuple*)NULL)) + return -1; +#if (defined SSE || defined SSE2 || defined SSE3) + gamma_buffer[idx] = (su3_tuple*)(((unsigned long int)(gamma_buffer[idx]) + ALIGN_BASE) & ~ALIGN_BASE); + v_buffer[idx] = (su3_tuple*)(((unsigned long int)(v_buffer[idx]) + ALIGN_BASE) & ~ALIGN_BASE); +#endif + } + initialized = 1; + } + + for (int iter = 0; iter < params->iterations; ++iter) + { + /* First level of contractions */ + hyp_staples_exclude_two(gamma_buffer, m_field_in); + APE_project_exclude_two(v_buffer, params->alpha[2], gamma_buffer, m_field_in); + for (int idx = 0; idx < 3; ++idx) + generic_exchange(v_buffer[idx], sizeof(su3_tuple)); + + /* Second level of contractions */ + hyp_staples_exclude_one(gamma_buffer, v_buffer); + APE_project_exclude_one(v_buffer, params->alpha[1], gamma_buffer, m_field_in); + for (int idx = 0; idx < 3; ++idx) + generic_exchange(v_buffer[idx], sizeof(su3_tuple)); + + /* Final level of contractions */ + hyp_staples_exclude_none(gamma_buffer, v_buffer); + APE_project_exclude_none(m_field_out, params->alpha[0], gamma_buffer, m_field_in); + generic_exchange(m_field_out, sizeof(su3_tuple)); + + m_field_in = m_field_out; /* Prepare for next iteration 
*/ + } + + return 0; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_hyp_staples_exclude_none.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_hyp_staples_exclude_none.c new file mode 100644 index 0000000000000000000000000000000000000000..45862352a869d40cc634ae4644e69222869acb12 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_hyp_staples_exclude_none.c @@ -0,0 +1,37 @@ +#include "hyp.ih" + +void hyp_staples_exclude_none(su3_tuple **buff_out, su3_tuple **buff_in) +{ + static su3 tmp; + for (int idx = 0; idx < 3; ++idx) + memset(buff_out[idx], 0, sizeof(su3_tuple) * VOLUMEPLUSRAND); /* Brutal but fast zeroing of buffer... */ + +#define _ADD_STAPLES_TO_COMPONENT(component, to, via, x) \ + { \ + _su3_times_su3d(tmp, buff_in[I1_ ## to ## _ ## via / 4][g_iup[x][via]][I1_ ## to ## _ ## via % 4], buff_in[I1_ ## via ## _ ## to / 4][g_iup[x][to]][I1_ ## via ## _ ## to % 4]); \ + _su3_times_su3_acc(buff_out[component / 4][x][component % 4], buff_in[I1_ ## via ## _ ## to / 4][x][I1_ ## via ## _ ## to % 4], tmp); \ + _su3_times_su3(tmp, buff_in[I1_ ## to ## _ ## via / 4][g_idn[x][via]][I1_ ## to ## _ ## via % 4], buff_in[I1_ ## via ## _ ## to / 4][g_iup[g_idn[x][via]][to]][I1_ ## via ## _ ## to % 4]); \ + _su3d_times_su3_acc(buff_out[component / 4][x][component % 4], buff_in[I1_ ## via ## _ ## to / 4][g_idn[x][via]][I1_ ## via ## _ ## to % 4], tmp); \ + } + + for (int x = 0; x < VOLUME; ++x) + { + _ADD_STAPLES_TO_COMPONENT(I0_0, 0, 1, x); + _ADD_STAPLES_TO_COMPONENT(I0_0, 0, 2, x); + _ADD_STAPLES_TO_COMPONENT(I0_0, 0, 3, x); + + _ADD_STAPLES_TO_COMPONENT(I0_1, 1, 0, x); + _ADD_STAPLES_TO_COMPONENT(I0_1, 1, 2, x); + _ADD_STAPLES_TO_COMPONENT(I0_1, 1, 3, x); + + _ADD_STAPLES_TO_COMPONENT(I0_2, 2, 0, x); + _ADD_STAPLES_TO_COMPONENT(I0_2, 2, 1, x); + _ADD_STAPLES_TO_COMPONENT(I0_2, 2, 3, x); + + _ADD_STAPLES_TO_COMPONENT(I0_3, 3, 0, x); + _ADD_STAPLES_TO_COMPONENT(I0_3, 3, 1, x); + _ADD_STAPLES_TO_COMPONENT(I0_3, 3, 
2, x); + } + +#undef _ADD_STAPLES_TO_COMPONENT +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_hyp_staples_exclude_one.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_hyp_staples_exclude_one.c new file mode 100644 index 0000000000000000000000000000000000000000..5d538fc576f9e39c42f3fe6a00dc29e8bb17e5bc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_hyp_staples_exclude_one.c @@ -0,0 +1,49 @@ +#include "hyp.ih" + +void hyp_staples_exclude_one(su3_tuple **buff_out, su3_tuple **buff_in) +{ + static su3 tmp; + for (int idx = 0; idx < 3; ++idx) + memset(buff_out[idx], 0, sizeof(su3_tuple) * VOLUMEPLUSRAND); /* Brutal but fast zeroing of buffer... */ + +#define _ADD_STAPLES_TO_COMPONENT(component, to, excl, via, x) \ + { \ + _su3_times_su3d(tmp, buff_in[I2_ ## to ## _ ## excl ## via / 4][g_iup[x][via]][I2_ ## to ## _ ## excl ## via % 4], buff_in[I2_ ## via ## _ ## to ## excl / 4][g_iup[x][to]][I2_ ## via ## _ ## to ## excl % 4]); \ + _su3_times_su3_acc(buff_out[component / 4][x][component % 4], buff_in[I2_ ## via ## _ ## to ## excl / 4][x][I2_ ## via ## _ ## to ## excl % 4], tmp); \ + _su3_times_su3(tmp, buff_in[I2_ ## to ## _ ## excl ## via / 4][g_idn[x][via]][I2_ ## to ## _ ## excl ## via % 4], buff_in[I2_ ## via ## _ ## to ## excl / 4][g_iup[g_idn[x][via]][to]][I2_ ## via ## _ ## to ## excl % 4]); \ + _su3d_times_su3_acc(buff_out[component / 4][x][component % 4], buff_in[I2_ ## via ## _ ## to ## excl / 4][g_idn[x][via]][I2_ ## via ## _ ## to ## excl % 4], tmp); \ + } + + for (int x = 0; x < VOLUME; ++x) + { + _ADD_STAPLES_TO_COMPONENT(I1_0_1, 0, 1, 2, x); + _ADD_STAPLES_TO_COMPONENT(I1_0_1, 0, 1, 3, x); + _ADD_STAPLES_TO_COMPONENT(I1_0_2, 0, 2, 1, x); + _ADD_STAPLES_TO_COMPONENT(I1_0_2, 0, 2, 3, x); + _ADD_STAPLES_TO_COMPONENT(I1_0_3, 0, 3, 1, x); + _ADD_STAPLES_TO_COMPONENT(I1_0_3, 0, 3, 2, x); + + _ADD_STAPLES_TO_COMPONENT(I1_1_0, 1, 0, 2, x); + _ADD_STAPLES_TO_COMPONENT(I1_1_0, 1, 0, 3, x); + 
_ADD_STAPLES_TO_COMPONENT(I1_1_2, 1, 2, 0, x); + _ADD_STAPLES_TO_COMPONENT(I1_1_2, 1, 2, 3, x); + _ADD_STAPLES_TO_COMPONENT(I1_1_3, 1, 3, 0, x); + _ADD_STAPLES_TO_COMPONENT(I1_1_3, 1, 3, 2, x); + + _ADD_STAPLES_TO_COMPONENT(I1_2_0, 2, 0, 1, x); + _ADD_STAPLES_TO_COMPONENT(I1_2_0, 2, 0, 3, x); + _ADD_STAPLES_TO_COMPONENT(I1_2_1, 2, 1, 0, x); + _ADD_STAPLES_TO_COMPONENT(I1_2_1, 2, 1, 3, x); + _ADD_STAPLES_TO_COMPONENT(I1_2_3, 2, 3, 0, x); + _ADD_STAPLES_TO_COMPONENT(I1_2_3, 2, 3, 1, x); + + _ADD_STAPLES_TO_COMPONENT(I1_3_0, 3, 0, 1, x); + _ADD_STAPLES_TO_COMPONENT(I1_3_0, 3, 0, 2, x); + _ADD_STAPLES_TO_COMPONENT(I1_3_1, 3, 1, 0, x); + _ADD_STAPLES_TO_COMPONENT(I1_3_1, 3, 1, 2, x); + _ADD_STAPLES_TO_COMPONENT(I1_3_2, 3, 2, 0, x); + _ADD_STAPLES_TO_COMPONENT(I1_3_2, 3, 2, 1, x); + } + +#undef _ADD_STAPLES_TO_COMPONENT +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_hyp_staples_exclude_two.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_hyp_staples_exclude_two.c new file mode 100644 index 0000000000000000000000000000000000000000..ab845ddb152fc349d912defe0c0556794ed30e6b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/hyp_hyp_staples_exclude_two.c @@ -0,0 +1,37 @@ +#include "hyp.ih" + +void hyp_staples_exclude_two(su3_tuple **buff_out, su3_tuple *buff_in) +{ + static su3 tmp; + for (int idx = 0; idx < 3; ++idx) + memset(buff_out[idx], 0, sizeof(su3_tuple) * VOLUMEPLUSRAND); /* Brutal but fast zeroing of buffer... 
*/ + + #define _ADD_STAPELS_TO_COMPONENT(component, to, via, x) \ + { \ + _su3_times_su3d(tmp, buff_in[g_iup[x][via]][to], buff_in[g_iup[x][to]][via]); \ + _su3_times_su3_acc(buff_out[component / 4][x][component % 4], buff_in[x][via], tmp); \ + _su3_times_su3(tmp, buff_in[g_idn[x][via]][to], buff_in[g_idn[g_iup[x][to]][via]][via]); \ + _su3d_times_su3_acc(buff_out[component / 4][x][component % 4], buff_in[g_idn[x][via]][via], tmp); \ + } + + for (int x = 0; x < VOLUME; ++x) + { + _ADD_STAPELS_TO_COMPONENT(I2_0_12, 0, 3, x); + _ADD_STAPELS_TO_COMPONENT(I2_0_13, 0, 2, x); + _ADD_STAPELS_TO_COMPONENT(I2_0_23, 0, 1, x); + + _ADD_STAPELS_TO_COMPONENT(I2_1_02, 1, 3, x); + _ADD_STAPELS_TO_COMPONENT(I2_1_03, 1, 2, x); + _ADD_STAPELS_TO_COMPONENT(I2_1_23, 1, 0, x); + + _ADD_STAPELS_TO_COMPONENT(I2_2_01, 2, 3, x); + _ADD_STAPELS_TO_COMPONENT(I2_2_03, 2, 1, x); + _ADD_STAPELS_TO_COMPONENT(I2_2_13, 2, 0, x); + + _ADD_STAPELS_TO_COMPONENT(I2_3_01, 3, 2, x); + _ADD_STAPELS_TO_COMPONENT(I2_3_02, 3, 1, x); + _ADD_STAPELS_TO_COMPONENT(I2_3_12, 3, 0, x); + } + +#undef _ADD_STAPELS_TO_COMPONENT +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/stout.h b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/stout.h new file mode 100644 index 0000000000000000000000000000000000000000..0390476735991ca5e8aebbfea8e57a4e7fb931e0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/stout.h @@ -0,0 +1,11 @@ +#pragma once + +#include + +struct stout_parameters +{ + double rho; + int iterations; +}; + +int stout_smear(su3_tuple *m_field_out, struct stout_parameters const *params, su3_tuple *m_field_in); \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/stout.ih b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/stout.ih new file mode 100644 index 0000000000000000000000000000000000000000..8f36ee701bbddd74b95b1fc2cff900e19aa03e78 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/stout.ih @@ -0,0 
+1,31 @@ +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef SSE +# undef SSE +#endif +#ifdef SSE2 +# undef SSE2 +#endif +#ifdef SSE3 +# undef SSE3 +#endif + + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/stout_stout_smear.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/stout_stout_smear.c new file mode 100644 index 0000000000000000000000000000000000000000..e0a752913240738f60dc4c4a064d66552ddddfea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/stout_stout_smear.c @@ -0,0 +1,48 @@ +#include "stout.ih" + +int stout_smear(su3_tuple *m_field_out, struct stout_parameters const *params, su3_tuple *m_field_in) +{ + static int initialized = 0; + static su3_tuple *buffer; + static su3 tmp; + + if (!initialized) + { + /* Allocate consecutive memory for both of the buffers upon first instantiation */ + buffer = (su3_tuple*)malloc(sizeof(su3_tuple) * VOLUMEPLUSRAND + 1); +#if (defined SSE || defined SSE2 || defined SSE3) + buffer = (su3_tuple*)(((unsigned long int)(buffer) + ALIGN_BASE) & ~ALIGN_BASE); +#endif + + if (buffer == (su3_tuple*)NULL) + return -1; + initialized = 1; + } + + /* start of the the stout smearing **/ + for(int iter = 0; iter < params->iterations; ++iter) + { + for (int x = 0; x < VOLUME; ++x) + for (int mu = 0; mu < 4; ++mu) + { + generic_staples(&tmp, x, mu, m_field_in); + _real_times_su3(tmp, params->rho, tmp); + _su3_times_su3d(buffer[x][mu], tmp, m_field_in[x][mu]); + project_antiherm(&buffer[x][mu]); + exposu3_in_place(&buffer[x][mu]); + } + + for(int x = 0; x < VOLUME; ++x) + for(int mu = 0 ; mu < 4; ++mu) + { + /* Input and output are allowed to be aliases -- use tmp */ + _su3_times_su3(tmp, buffer[x][mu], m_field_in[x][mu]); + _su3_assign(m_field_out[x][mu], tmp); + } + +// generic_exchange(m_field_out, sizeof(su3_tuple)); + m_field_in = 
m_field_out; /* Prepare for next iteration */ + } + + return(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/uils_print_config_to_screen.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/uils_print_config_to_screen.c new file mode 100644 index 0000000000000000000000000000000000000000..ae645604b2e46d5d0bae01e78f5c6924d07f0f7c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/uils_print_config_to_screen.c @@ -0,0 +1,13 @@ +#include "utils.ih" + +void print_config_to_screen(su3 **in) +{ + int x, mu; + for(x = 0; x < VOLUME; x++) + for(mu = 0; mu < 4; mu++) + { + printf("x = %d mu = %d\n", x, mu); + /*print_su3_full_hex_precision(&(in[x][mu]));*/ + print_su3(&(in[x][mu])); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils.h b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..71a624f773acae92b6ceab2c89b972e35d3fca50 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils.h @@ -0,0 +1,40 @@ +#pragma once + +#include +#include + +/* We need a number of indices to do the bookkeeping. + This will always amount to at most 12 fields, but + we define some aliases so that we don't have to do + the mental mapping all the time. 
*/ + +enum I0 +{ + I0_0 = 0, I0_1 = 1, I0_2 = 2, I0_3 = 3 +}; + +enum I1 +{ + I1_0_1 = 0, I1_0_2 = 1, I1_0_3 = 2, I1_1_0 = 3, + I1_1_2 = 4, I1_1_3 = 5, I1_2_0 = 6, I1_2_1 = 7, + I1_2_3 = 8, I1_3_0 = 9, I1_3_1 = 10, I1_3_2 = 11 +}; + +enum I2 +{ + I2_0_12 = 0, I2_0_21 = 0, I2_0_13 = 1, I2_0_31 = 1, + I2_0_23 = 2, I2_0_32 = 2, I2_1_02 = 3, I2_1_20 = 3, + I2_1_03 = 4, I2_1_30 = 4, I2_1_23 = 5, I2_1_32 = 5, + I2_2_01 = 6, I2_2_10 = 6, I2_2_03 = 7, I2_2_30 = 7, + I2_2_13 = 8, I2_2_31 = 8, I2_3_01 = 9, I2_3_10 = 9, + I2_3_02 = 10, I2_3_20 = 10, I2_3_12 = 11, I2_3_21 = 11 +}; + +void generic_staples(gauge_field_t buff_out, int x, int mu, gauge_field_t buff_in); +void generic_exchange(void *field_in, int bytes_per_site); +void project_antiherm(su3 *omega); +void project_herm(su3 *omega); +void reunitarize(su3 *omega); + +void print_su3(su3 *in); +void print_config_to_screen(su3 **in); diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils.ih b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils.ih new file mode 100644 index 0000000000000000000000000000000000000000..28914576b11a30d92a5685fb62831c1515c36bc1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils.ih @@ -0,0 +1,21 @@ +#ifdef HAVE_CONFIG_H +# include +#endif + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils.h" diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_generic_staples.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_generic_staples.c new file mode 100644 index 0000000000000000000000000000000000000000..424782ea5e2328e27df3f2758b667bd6b5a00279 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_generic_staples.c @@ -0,0 +1,45 @@ +#include "utils.ih" + +void generic_staples(su3 *buff_out, int x, int mu, su3_tuple *buff_in) +{ + static su3 tmp; + +#define _ADD_STAPLES_TO_COMPONENT(to, via) \ + { 
\ + _su3_times_su3d(tmp, buff_in[g_iup[x][via]][to], buff_in[g_iup[x][to]][via]); \ + _su3_times_su3_acc(*buff_out, buff_in[x][via], tmp); \ + _su3_times_su3(tmp, buff_in[g_idn[x][via]][to], buff_in[g_iup[g_idn[x][via]][to]][via]); \ + _su3d_times_su3_acc(*buff_out, buff_in[g_idn[x][via]][via], tmp); \ + } + + _su3_zero(*buff_out); + + switch (mu) + { + case 0: + _ADD_STAPLES_TO_COMPONENT(0, 1); + _ADD_STAPLES_TO_COMPONENT(0, 2); + _ADD_STAPLES_TO_COMPONENT(0, 3); + break; + + case 1: + _ADD_STAPLES_TO_COMPONENT(1, 0); + _ADD_STAPLES_TO_COMPONENT(1, 2); + _ADD_STAPLES_TO_COMPONENT(1, 3); + break; + + case 2: + _ADD_STAPLES_TO_COMPONENT(2, 0); + _ADD_STAPLES_TO_COMPONENT(2, 1); + _ADD_STAPLES_TO_COMPONENT(2, 3); + break; + + case 3: + _ADD_STAPLES_TO_COMPONENT(3, 0); + _ADD_STAPLES_TO_COMPONENT(3, 1); + _ADD_STAPLES_TO_COMPONENT(3, 2); + break; + } + +#undef _ADD_STAPLES_TO_COMPONENT +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_print_config_to_screen.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_print_config_to_screen.c new file mode 100644 index 0000000000000000000000000000000000000000..18156261d00229288f11f0553f3e6387e928432f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_print_config_to_screen.c @@ -0,0 +1,11 @@ +#include "utils.ih" + +void print_config_to_screen(su3 **in) +{ + for(int x = 0; x < VOLUME; ++x) + for(int mu = 0; mu < 4; ++mu) + { + printf("x = %d mu = %d\n", x, mu); + print_su3(&(in[x][mu])); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_print_su3.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_print_su3.c new file mode 100644 index 0000000000000000000000000000000000000000..8b72008a107baaf7a80f85d095723d8a4c6265e1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_print_su3.c @@ -0,0 +1,17 @@ +#include "utils.ih" + +void print_su3(su3 *in) +{ + printf("[ %12.14f + %12.14f * i, %12.14f + %12.14f * i, 
%12.14f + %12.14f * i; \n", + creal(in->c00), cimag(in->c00), + creal(in->c01), cimag(in->c01), + creal(in->c02), cimag(in->c02)) ; + printf(" %12.14f + %12.14f * i, %12.14f + %12.14f * i, %12.14f + %12.14f * i; \n", + creal(in->c10), cimag(in->c10), + creal(in->c11), cimag(in->c11), + creal(in->c12), cimag(in->c12)) ; + printf(" %12.14f + %12.14f * i, %12.14f + %12.14f * i, %12.14f + %12.14f * i ] \n", + creal(in->c20), cimag(in->c20), + creal(in->c21), cimag(in->c21), + creal(in->c22), cimag(in->c22)) ; +} \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_project_antiherm.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_project_antiherm.c new file mode 100644 index 0000000000000000000000000000000000000000..e0deca1bce6cb445343b2c9aaf83db09c7aa7dd2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_project_antiherm.c @@ -0,0 +1,25 @@ +#include "utils.ih" + +void project_antiherm(su3 *omega) +{ + static const double fac_3 = 1.00 / 3.00; + double tr_omega = creal(-I * fac_3 * (omega->c00 + omega->c11 + omega->c22)); /* one third of the imaginary part of the trace */ + + + omega->c00 = (cimag(omega->c00) - tr_omega) * I; + omega->c11 = (cimag(omega->c11) - tr_omega) * I; + omega->c22 = (cimag(omega->c22) - tr_omega) * I; + + omega->c01 -= conj(omega->c10); + omega->c01 *= 0.50; + omega->c10 = -conj(omega->c01); + + + omega->c02 -= conj(omega->c20); + omega->c02 *= 0.50; + omega->c20 = -conj(omega->c02); + + omega->c12 -= conj(omega->c21); + omega->c12 *= 0.50; + omega->c21 = -conj(omega->c12); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_project_herm.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_project_herm.c new file mode 100644 index 0000000000000000000000000000000000000000..f42e8b63fb260afe5f274d165eda9dc7599091ff --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_project_herm.c @@ -0,0 +1,22 @@ +#include "utils.ih" + +/* This implements the approach of 
taking (I/2) * (Omega' - Omega) - (I/6) * Tr(Omega' - Omega) */ + +void project_herm(su3 *omega) +{ + static const double fac_3 = 1.00 / 3.00; + double tr_omega = fac_3 * (cimag(omega->c00) + cimag(omega->c11) + cimag(omega->c22)); + + omega->c00 = cimag(omega->c00) - tr_omega; + omega->c11 = cimag(omega->c11) - tr_omega; + omega->c22 = cimag(omega->c22) - tr_omega; + + omega->c01 = 0.5 * (omega->c10 - conj(omega->c01)); + omega->c10 = conj(omega->c01); + + omega->c02 = 0.5 * (omega->c20 - conj(omega->c02)); + omega->c20 = conj(omega->c02); + + omega->c21 = 0.5 * (omega->c12 - conj(omega->c21)); + omega->c12 = conj(omega->c21); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_reunitarize.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_reunitarize.c new file mode 100644 index 0000000000000000000000000000000000000000..d106d2e7d958b6b49d84330f6a41a91408739399 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_reunitarize.c @@ -0,0 +1,80 @@ +#include "utils.ih" + +/* Method based on Givens' rotations, as used by Urs Wenger */ +void reunitarize(su3 *omega) +{ + static su3 w, rot, tmp; + static double trace_old, trace_new; + static _Complex double s0, s1; + static double scale; + + _su3_one(w); + trace_old = omega->c00 + omega->c11 + omega->c22; + + for (int iter = 0; iter < 200; ++iter) + { + /* Givens' rotation 01 */ + s0 = omega->c00 + conj(omega->c11); + s1 = omega->c01 - conj(omega->c10); + scale = 1.0 / sqrt(conj(s0) * s0 + conj(s1) * s1); + s0 *= scale; + s1 *= scale; + + /* Projecting */ + _su3_one(rot); + rot.c00 = s0; + rot.c11 = conj(s0); + rot.c01 = s1; + rot.c10 = -conj(s1); + + _su3_times_su3(tmp, rot, w); + _su3_assign(w, tmp); + _su3_times_su3d(tmp, *omega, rot); + _su3_assign(*omega, tmp); + + /* Givens' rotation 12 */ + s0 = omega->c11 + conj(omega->c22); + s1 = omega->c12 - conj(omega->c21); + scale = 1.0 / sqrt(conj(s0) * s0 + conj(s1) * s1); + s0 *= scale; + s1 *= scale; + + /* 
Projecting */ + _su3_one(rot); + rot.c11 = s0; + rot.c22 = conj(s0); + rot.c12 = s1; + rot.c21 = -conj(s1); + + _su3_times_su3(tmp, rot, w); + _su3_assign(w, tmp); + _su3_times_su3d(tmp, *omega, rot); + _su3_assign(*omega, tmp); + + /* Givens' rotation 20 */ + s0 = omega->c22 + conj(omega->c00); + s1 = omega->c20 - conj(omega->c02); + scale = 1.0 / sqrt(conj(s0) * s0 + conj(s1) * s1); + s0 *= scale; + s1 *= scale; + + /* Projecting */ + _su3_one(rot); + rot.c22 = s0; + rot.c00 = conj(s0); + rot.c20 = s1; + rot.c02 = -conj(s1); + + _su3_times_su3(tmp, rot, w); + _su3_assign(w, tmp); + _su3_times_su3d(tmp, *omega, rot); + _su3_assign(*omega, tmp); + + trace_new = omega->c00 + omega->c11 + omega->c22; + + if (trace_new - trace_old < 1e-15) + break; + trace_old = trace_new; + } + _su3_assign(*omega, w); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_reunitarize_MILC.c b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_reunitarize_MILC.c new file mode 100644 index 0000000000000000000000000000000000000000..1bf7ed2db827744d37f179155005315477f78dd1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/smearing/utils_reunitarize_MILC.c @@ -0,0 +1,43 @@ +#include "utils.ih" + +/* No reunitarization code seems to be available, so I've adapted (stolen) this routine from the MILC code (who stole it elsewhere, I think ;]) -- AD. 
Gram-Schmidt orthonormalises rows 0 and 1 of *omega in place and then rebuilds row 2 as the complex conjugate of their cross product; rows 0 and 1 are assumed to be nonzero and linearly independent (no check is made). */ +void reunitarize(su3 *omega) +{ + _Complex double a, bj0, bj1, bj2; + + /* first normalize row 0 */ + a = 1.0 / sqrt(conj(omega->c00) * omega->c00 + conj(omega->c01) * omega->c01 +conj(omega->c02) * omega->c02); + + omega->c00 *= a; + omega->c01 *= a; + omega->c02 *= a; + + /* now make row 1 orthogonal to row 0: a = <row 0, row 1> */ + a = conj(omega->c00) * omega->c10 + conj(omega->c01) * omega->c11 + conj(omega->c02) * omega->c12; + + /* row 1 -= a * row 0 */ + omega->c10 -= a * omega->c00; + omega->c11 -= a * omega->c01; + omega->c12 -= a * omega->c02; + + /* now normalize row 1 */ + a = 1.0 / sqrt(conj(omega->c10) * omega->c10 + conj(omega->c11) * omega->c11 +conj(omega->c12) * omega->c12); + + omega->c10 *= a; + omega->c11 *= a; + omega->c12 *= a; + + /* reconstruct row 2 = conj(row 0 x row 1); the conjugate is what makes the result unitary with det = 1 */ + bj0 = omega->c00; + bj1 = omega->c01; + bj2 = omega->c02; + + omega->c20 = bj1 * omega->c12; + omega->c20 = conj(omega->c20 - bj2 * omega->c11); + + omega->c21 = bj2 * omega->c10; + omega->c21 = conj(omega->c21 - bj0 * omega->c12); + + omega->c22 = bj0 * omega->c11; + omega->c22 = conj(omega->c22 - bj1 * omega->c10); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..8d99dab2fed2e97886be000d79c6d90b359f4f2b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/Makefile @@ -0,0 +1,104 @@ + +srcdir = . +top_builddir = .. +abs_top_builddir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +top_srcdir = .. +abs_top_srcdir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +subdir = solver +builddir = . + +CFLAGS = -std=c99 -fopenmp -pedantic -Wall +DEPFLAGS = -MM +LDFLAGS = -L${HOME}/lib -L${top_builddir}/lib +DEFS = -DHAVE_CONFIG_H +OPTARGS = -O + +AR = ar +RANLIB = ranlib +CC = mpicc +CCDEP = gcc +CCLD = $(CC) +LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) ${OPTARGS} -o $@ +LEX = flex +AUTOCONF = autoconf +DEFS = -DHAVE_CONFIG_H + +INCLUDES = -I$(HOME)/include/ -I. 
-I${abs_top_builddir}/ -I${abs_top_srcdir}/ -I/include/ -I/include/ +LDADD = +#COMPILE = ${CC} ${DEFS} $(INCLUDES) ${CFLAGS} +COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} ${OPTARGS} + +LIBRARIES = libsolver +libsolver_TARGETS = bicgstab_complex gmres incr_eigcg eigcg restart_X ortho \ + cgs_real cg_her mr chrono_guess \ + bicgstabell bicgstab2 eigenvalues fgmres \ + gcr gcr4complex diagonalise_general_matrix \ + quicksort gmres_dr lu_solve jdher Msap \ + jdher_bi gram-schmidt eigenvalues_bi \ + bicgstab_complex_bi cg_her_bi pcg_her \ + sub_low_ev cg_her_nd poly_precon \ + generate_dfl_subspace dfl_projector \ + cg_mms_tm cg_mms_tm_nd mixed_cg_mms_tm_nd \ + solver_field sumr mixed_cg_her index_jd \ + rg_mixed_cg_her rg_mixed_cg_her_nd \ + dirac_operator_eigenvectors spectral_proj \ + jdher_su3vect cg_her_su3vect eigenvalues_Jacobi monomial_solve + +libsolver_OBJECTS = $(addsuffix .o, ${libsolver_TARGETS}) + +# default rule + +all: Makefile dep libsolver.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) -g +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) +profile all-profile: all + + +#include dep rules + +-include $(addsuffix .d,${libsolver_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) -c $< + + +# rule to make liblinalg + +libsolver.a: ${libsolver_OBJECTS} Makefile + @rm -f libsolver.a + @${AR} cru libsolver.a $(libsolver_OBJECTS) + @$(RANLIB) libsolver.a + @cp libsolver.a ${top_builddir}/lib/libsolver.a + +# rule to generate .d files + +$(addsuffix .d,$(libsolver_TARGETS)): %.d: ${srcdir}/%.c Makefile + @$(CCDEP) ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libsolver_TARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm 
-f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libsolver.a + +distclean: clean + rm -f Makefile + + +.PHONY: all dep clean compile-clean distclean debug all-debug profile all-profile diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..d4c5ee2dd66453ea0a4c2a07a593cc575b528bd3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/Makefile.in @@ -0,0 +1,104 @@ + +srcdir = @srcdir@ +top_builddir = @top_builddir@ +abs_top_builddir = @abs_top_builddir@ +top_srcdir = @top_srcdir@ +abs_top_srcdir = @abs_top_srcdir@ +subdir = solver +builddir = @builddir@ + +CFLAGS = @CFLAGS@ @SOLVEROUT@ +DEPFLAGS = @DEPFLAGS@ +LDFLAGS = @LDFLAGS@ +DEFS = @DEFS@ +OPTARGS = @OPTARGS@ + +AR = @AR@ +RANLIB = @RANLIB@ +CC = @CC@ +CCDEP = @CCDEP@ +CCLD = $(CC) +LINK = $(CCLD) $(CFLAGS) $(LDFLAGS) ${OPTARGS} -o $@ +LEX = @LEX@ +AUTOCONF = @AUTOCONF@ +DEFS = @DEFS@ + +INCLUDES = @INCLUDES@ +LDADD = +#COMPILE = ${CC} ${DEFS} $(INCLUDES) ${CFLAGS} +COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} ${OPTARGS} + +LIBRARIES = libsolver +libsolver_TARGETS = bicgstab_complex gmres incr_eigcg eigcg restart_X ortho \ + cgs_real cg_her mr chrono_guess \ + bicgstabell bicgstab2 eigenvalues fgmres \ + gcr gcr4complex diagonalise_general_matrix \ + quicksort gmres_dr lu_solve jdher Msap \ + jdher_bi gram-schmidt eigenvalues_bi \ + bicgstab_complex_bi cg_her_bi pcg_her \ + sub_low_ev cg_her_nd poly_precon \ + generate_dfl_subspace dfl_projector \ + cg_mms_tm cg_mms_tm_nd mixed_cg_mms_tm_nd \ + solver_field sumr mixed_cg_her index_jd \ + rg_mixed_cg_her rg_mixed_cg_her_nd \ + dirac_operator_eigenvectors spectral_proj \ + jdher_su3vect cg_her_su3vect eigenvalues_Jacobi monomial_solve + +libsolver_OBJECTS = $(addsuffix .o, ${libsolver_TARGETS}) + +# default rule + +all: Makefile dep libsolver.a + +# rules for debugging +debug all-debug: CFLAGS := 
$(CFLAGS) @DEBUG_FLAG@ +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@ +profile all-profile: all + + +#include dep rules + +-include $(addsuffix .d,${libsolver_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +%.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) -c $< + + +# rule to make liblinalg + +libsolver.a: ${libsolver_OBJECTS} Makefile + @rm -f libsolver.a + @${AR} cru libsolver.a $(libsolver_OBJECTS) + @$(RANLIB) libsolver.a + @cp libsolver.a ${top_builddir}/lib/libsolver.a + +# rule to generate .d files + +$(addsuffix .d,$(libsolver_TARGETS)): %.d: ${srcdir}/%.c Makefile + @$(CCDEP) ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libsolver_TARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libsolver.a + +distclean: clean + rm -f Makefile + + +.PHONY: all dep clean compile-clean distclean debug all-debug profile all-profile diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/Msap.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/Msap.c new file mode 100644 index 0000000000000000000000000000000000000000..77a58427bad36f8cff62f91403dd897fd3307557 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/Msap.c @@ -0,0 +1,190 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "start.h" +#include "linalg_eo.h" +#include "operator/tm_operators.h" +#include "boundary.h" +#include "gmres.h" +#include "solver.h" +#include "block.h" +#include "operator/Hopping_Matrix.h" +#include "solver_field.h" +#include "operator/D_psi.h" + +void dummy_Di(spinor * const P, spinor * const Q, const int i) { + Block_D_psi(&block_list[i], P, Q); + return; +} + + +void Mtm_plus_block_psi(spinor * const l, spinor * const k, const int i) { + block * blk = &block_list[i]; + int vol = (*blk).volume/2; + Block_H_psi(blk, g_spinor_field[DUM_MATRIX+1], k, EO); + mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], +1., vol); + Block_H_psi(blk, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], OE); + mul_one_pm_imu_sub_mul(l, k, g_spinor_field[DUM_MATRIX], +1., vol); + return; +} + +void Mtm_plus_sym_block_psi(spinor * const l, spinor * const k, const int i) { + block * blk = &block_list[i]; + int vol = (*blk).volume/2; + Block_H_psi(blk, g_spinor_field[DUM_MATRIX+1], k, EO); + mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX+1], +1., vol); + Block_H_psi(blk, g_spinor_field[DUM_MATRIX], g_spinor_field[DUM_MATRIX+1], OE); + mul_one_pm_imu_inv(g_spinor_field[DUM_MATRIX], +1., vol); + diff(l, k, g_spinor_field[DUM_MATRIX], vol); + return; +} + + +void dummy_D0(spinor * const P, spinor * const Q) { + Block_D_psi(&block_list[0], P, Q); + return; +} + +void dummy_D1(spinor * const P, spinor * const Q) { + 
Block_D_psi(&block_list[1], P, Q); + return; +} + +void Msap(spinor * const P, spinor * const Q, const int Ncy) { + int blk, ncy = 0, eo, vol; + spinor * r, * a, * b; + double nrm; + spinor ** solver_field = NULL; + const int nr_sf = 3; + + /* + * here it would be probably better to get the working fields as a parameter + * from the calling function + */ + init_solver_field(&solver_field, VOLUME, nr_sf); + r = solver_field[0]; + a = solver_field[1]; + b = solver_field[2]; + + for(ncy = 0; ncy < Ncy; ncy++) { + /* compute the global residue */ + /* this can be done more efficiently */ + /* here only a naive implementation */ + for(eo = 0; eo < 2; eo++) { + D_psi(r, P); + diff(r, Q, r, VOLUME); + nrm = square_norm(r, VOLUME, 1); + if(g_proc_id == 0 && g_debug_level > 1 && eo == 1) { + printf("Msap: %d %1.3e\n", ncy, nrm); + } + /* choose the even (odd) block */ + + /*blk = eolist[eo];*/ + + for (blk = 0; blk < nb_blocks; blk++) { + if(block_list[blk].evenodd == eo) { + vol = block_list[blk].volume; + + /* get part of r corresponding to block blk into b */ + copy_global_to_block(b, r, blk); + + mrblk(a, b, 16, 1.e-31, 1, vol, &dummy_Di, blk); + + /* add a up to full spinor P */ + add_block_to_global(P, a, blk); + } + } + } + } + finalize_solver(solver_field, nr_sf); + return; +} + + +void Msap_eo(spinor * const P, spinor * const Q, const int Ncy) { + int blk, ncy = 0, eo, vol; + spinor * r, * a, * b; + double nrm; + spinor * b_even, * b_odd, * a_even, * a_odd; + spinor ** solver_field = NULL; + const int nr_sf = 3; + + /* + * here it would be probably better to get the working fields as a parameter + * from the calling function + */ + init_solver_field(&solver_field, VOLUME, nr_sf); + r = solver_field[0]; + a = solver_field[1]; + b = solver_field[2]; + + vol = block_list[0].volume/2; + b_even = b; + b_odd = b + vol + 1; + a_even = a; + a_odd = a + vol + 1; + + for(ncy = 0; ncy < Ncy; ncy++) { + /* compute the global residue */ + /* this can be done more efficiently */ 
+ /* here only a naive implementation */ + for(eo = 0; eo < 2; eo++) { + D_psi(r, P); + diff(r, Q, r, VOLUME); + nrm = square_norm(r, VOLUME, 1); + if(g_proc_id == 0 && g_debug_level > 1 && eo == 1) { + printf("Msap: %d %1.3e\n", ncy, nrm); + } + /* choose the even (odd) block */ + + for (blk = 0; blk < nb_blocks; blk++) { + if(block_list[blk].evenodd == eo) { + /* get part of r corresponding to block blk into b_even and b_odd */ + copy_global_to_block_eo(b_even, b_odd, r, blk); + + assign_mul_one_pm_imu_inv(a_even, b_even, +1., vol); + Block_H_psi(&block_list[blk], a_odd, a_even, OE); + /* a_odd = a_odd - b_odd */ + assign_mul_add_r(a_odd, -1., b_odd, vol); + + mrblk(b_odd, a_odd, 3, 1.e-31, 1, vol, &Mtm_plus_block_psi, blk); + + Block_H_psi(&block_list[blk], b_even, b_odd, EO); + mul_one_pm_imu_inv(b_even, +1., vol); + /* a_even = a_even - b_even */ + assign_add_mul_r(a_even, b_even, -1., vol); + + /* add even and odd part up to full spinor P */ + add_eo_block_to_global(P, a_even, b_odd, blk); + } + } + } + } + finalize_solver(solver_field, nr_sf); + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/Msap.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/Msap.h new file mode 100644 index 0000000000000000000000000000000000000000..7a1808098c8b2c95b74c3458a460b6bae10a1df7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/Msap.h @@ -0,0 +1,27 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _MSAP_H +#define _MSAP_H + +void Msap(spinor * const P, spinor * const Q, const int Ncy); +void Msap_eo(spinor * const P, spinor * const Q, const int Ncy); +void Mtm_plus_block_psi(spinor * const l, spinor * const k, const int i); +void Mtm_plus_sym_block_psi(spinor * const l, spinor * const k, const int i); +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab2.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab2.c new file mode 100644 index 0000000000000000000000000000000000000000..4c7d58d65bc0926fd4e13dd752ff24a7b2bc10fe --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab2.c @@ -0,0 +1,209 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * This is an implementation of bicgstab(l) + * corresponding to the paper of G. L.G. 
Sleijpen and + * D.R. Fokkema + * Transactions on Numerical Analysis + * Volume1, pp. 11-32, 1993 + * + * Author: Carsten Urbach + * urbach@physik.fu-berlin.de + * + *************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "complex.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "solver/matrix_mult_typedef.h" +#include "solver_field.h" +#include "bicgstab2.h" + +int bicgstab2(spinor * const x0, spinor * const b, const int max_iter, + double eps_sq, const int rel_prec, const int N, matrix_mult f) { + + const int l = 2; + double err; + int i, j, k; + int update_app = 0, update_res = 0; + double rho0, rho1, beta, alpha, omega, gamma_hat, + sigma, kappa0, kappal, rho, zeta0; + double squarenorm, Mx=0., Mr=0.; + spinor * r[5], * u[5], * r0_tilde, * u0, * x, * xp, * bp; + double Z[3][3], y0[3], yl[3], yp[3], ypp[3]; + spinor ** solver_field = NULL; + const int nr_sf = 10; + + k = -l; + if(N == VOLUME) { + init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); + } + else { + init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); + } + r0_tilde = solver_field[0]; + u0 = solver_field[1]; + r[0] = solver_field[2]; + u[0] = solver_field[3]; + r[1] = solver_field[4]; + u[1] = solver_field[5]; + r[2] = solver_field[6]; + u[2] = solver_field[7]; + bp = solver_field[8]; + xp = x0; + x = solver_field[9]; + + zero_spinor_field(x, N); + assign(u[0], b, N); + f(r0_tilde, xp); + diff(r[0], u[0], r0_tilde, N); + zero_spinor_field(u0, N); + assign(r0_tilde, r[0], N); +/* random_spinor_field(r0_tilde, N); */ + assign(bp, r[0], N); + squarenorm = square_norm(b, N, 1); + + rho0 = 1.; + alpha = rho0; + omega = rho0; + err = square_norm(r[0], N, 1); + Mr = err; + Mx = err; + zeta0 = err; + while( k < max_iter && (((err > eps_sq) && (rel_prec == 0)) + || ((err > eps_sq*squarenorm) && (rel_prec == 1)) + )) { + k+=l; + + /* The BiCG part */ + rho0 *= 
-omega; + for(j = 0; j < l; j++) { + rho1 = scalar_prod_r(r[j], r0_tilde, N, 1); + beta = alpha*(rho1/rho0); + rho0 = rho1; +/* if(g_proc_id == 0) {printf("beta = %e, alpha = %e, rho0 = %e\n", beta, alpha, rho0);fflush(stdout);} */ + for(i = 0; i <= j; i++) { + /* u_i = r_i - \beta u_i */ + assign_mul_add_r(u[i], -beta, r[i], N); + } + f(u[j+1], u[j]); + sigma = scalar_prod_r(u[j+1], r0_tilde, N, 1); + alpha = rho1/sigma; +/* if(g_proc_id == 0) {printf("sigma = %e, alpha = %e\n", sigma, alpha);fflush(stdout);} */ + /* x = x + \alpha u_0 */ + assign_add_mul_r(x, u[0], alpha, N); + /* r_i = r_i - \alpha u_{i+1} */ + for(i = 0; i <= j; i++) { + assign_add_mul_r(r[i], u[i+1], -alpha, N); + } + f(r[j+1], r[j]); + err = square_norm(r[j+1], N, 1); + if(g_proc_id == 0 && g_debug_level > 1) {printf("%d %d err = %e\n", k, j, err);fflush(stdout);} + if(err > Mr) Mr = err; + if(err > Mx) Mx = err; + } + + /* The polynomial part */ + + /* Z = R* R */ + for(i = 0; i <= l; i++){ + for(j = 0; j <= i; j++){ + Z[i][j] = scalar_prod_r(r[j], r[i], N, 1); + Z[j][i] = Z[i][j]; + } + } + + /* r0tilde and rl_tilde */ + y0[0] = -1; + y0[2] = 0.; + y0[1] = Z[1][0]/Z[1][1]; + + yl[0] = 0.; + yl[2] = -1.; + yl[1] = Z[1][2]/Z[1][1]; + + /* Convex combination */ + for(i = 0; i < l+1; i++){ + yp[i] = 0.; + ypp[i] = 0.; + for(j = 0; j < l+1; j++) { + yp[i] +=Z[i][j]*y0[j]; + ypp[i] +=Z[i][j]*yl[j]; + } + } + kappa0 = sqrt( y0[0]*yp[0] + y0[1]*yp[1] + y0[2]*yp[2] ); + kappal = sqrt( yl[0]*ypp[0] + yl[1]*ypp[1] + yl[2]*ypp[2] ); + rho = (yl[0]*yp[0] + yl[1]*yp[1] + yl[2]*yp[2])/kappa0/kappal; + if(fabs(rho) > 0.7) { + gamma_hat = rho; + } + else { + gamma_hat = rho*0.7/fabs(rho); + } + for(i = 0; i <= l; i++) { + y0[i] -= gamma_hat*kappa0*yl[i]/kappal; + } + + /* Update */ + omega = y0[l]; + for(i = 1; i < l+1; i++) { + assign_add_mul_r(u[0], u[i], -y0[i], N); + assign_add_mul_r(x, r[i-1], y0[i], N); + assign_add_mul_r(r[0], r[i], -y0[i], N); + } + err = kappa0*kappa0; + /* Reliable update part */ 
+ if(err > Mr) Mr = err; + if(err > Mx) Mx = err; + update_app = (err < 1.e-4*zeta0 && zeta0 <= Mx); + update_res = ((err < 1.e-4*Mr && zeta0 <= Mr) || update_app); + if(update_res) { + if(g_proc_id == 0 && g_debug_level > 1) printf("Update res\n"); + f(r[0], x); + diff(r[0], bp, r[0], N); + Mr = err; + if(update_app) { + if(g_proc_id == 0 && g_debug_level > 1) printf("Update app\n"); + Mx = err; + assign_add_mul_r(xp, x, 1., N); + zero_spinor_field(x, N); + assign(bp, r[0], N); + } + } + update_app = 0; + update_res = 0; + if(g_proc_id == 0 && g_debug_level > 0){ + printf(" BiCGstab(2)convex iterated %d %d, %e rho0 = %e, alpha = %e, gamma_hat= %e\n", + l, k, err, rho0, alpha, gamma_hat); + fflush( stdout ); + } + } + assign_add_mul_r(x, xp, 1., N); + assign(x0, x, N); finalize_solver(solver_field, nr_sf); /* x0 now holds the result, so the work fields from init_solver_field can be released */ + if(k >= max_iter) return(-1); /* k advances in steps of l, so it can step past an odd max_iter; == would miss that */ + return(k); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab2.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab2.h new file mode 100644 index 0000000000000000000000000000000000000000..d7788e5a70ef68353ca28b8f9822c3c14b97f9ca --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab2.h @@ -0,0 +1,26 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _BICGSTAB2_H +#define _BICGSTAB2_H + +int bicgstab2(spinor * const x0, spinor * const b, const int max_iter, + double eps_sq, const int rel_prec, const int N, matrix_mult f); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab_complex.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab_complex.c new file mode 100644 index 0000000000000000000000000000000000000000..accabcf10581996b4bfb3873719b5b5572e02ef7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab_complex.c @@ -0,0 +1,116 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + * The externally accessible functions are + * + * int bicgstab(spinor * const, spinor * const, const int, double, matrix_mult) + * BiCGstab solver + * + * + * + * + * Author: Carsten Urbach + * + * + **************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "solver_field.h" +#include "bicgstab_complex.h" + +/* P inout (guess for the solving spinor) + Q input +*/ +int bicgstab_complex(spinor * const P,spinor * const Q, const int max_iter, + double eps_sq, const int rel_prec, + const int N, matrix_mult f){ + double err, squarenorm; + _Complex double rho0, rho1, omega, alpha, beta, nom, denom; + int i; + spinor * r, * p, * v, *hatr, * s, * t; + spinor ** solver_field = NULL; + const int nr_sf = 6; + + if(N == VOLUME) { + init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); + } + else { + init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); + } + hatr = solver_field[0]; + r = solver_field[1]; + v = solver_field[2]; + p = solver_field[3]; + s = solver_field[4]; + t = solver_field[5]; + + f(r, P); + diff(p, Q, r, N); + assign(r, p, N); + assign(hatr, p, N); + rho0 = scalar_prod(hatr, r, N, 1); + squarenorm = square_norm(Q, N, 1); + + for(i = 0; i < max_iter; i++){ + err = square_norm(r, N, 1); + if(g_proc_id == g_stdio_proc && g_debug_level > 2) { + printf("%d %e\n", i, err); + fflush(stdout); + } + + if((((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))) && i>0) { + finalize_solver(solver_field, nr_sf); + return(i); + } + f(v, p); + denom = scalar_prod(hatr, v, N, 1); + alpha = rho0 / denom; + assign(s, r, N); + assign_diff_mul(s, v, alpha, N); + f(t, s); + omega = scalar_prod(t,s, N, 1); + omega /= square_norm(t, N, 1); + assign_add_mul_add_mul(P, p, s, alpha, omega, N); + assign(r, s, N); + assign_diff_mul(r, t, omega, N); + rho1 = 
scalar_prod(hatr, r, N, 1); + if(fabs(creal(rho1)) < 1.e-25 && fabs(cimag(rho1)) < 1.e-25) + { + finalize_solver(solver_field, nr_sf); + return(-1); + } + nom = alpha * rho1; + denom = omega * rho0; + beta = nom / denom; + omega = -omega; + assign_mul_bra_add_mul_ket_add(p, v, r, omega, beta, N); + rho0 = rho1; + } + finalize_solver(solver_field, nr_sf); + return -1; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab_complex.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab_complex.h new file mode 100644 index 0000000000000000000000000000000000000000..3a204cffd42cbc2d92069b866c4b1e05884d9f41 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab_complex.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _BICGSTAB_COMPLEX_H +#define _BICGSTAB_COMPLEX_H + +#include"solver/matrix_mult_typedef.h" +#include"su3.h" + +int bicgstab_complex(spinor * const, spinor * const, const int max_iter, double eps_sq, + const int rel_prec, const int N, matrix_mult f); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab_complex_bi.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab_complex_bi.c new file mode 100644 index 0000000000000000000000000000000000000000..bdacd25a5abfb339087f7880c878c5a5a8bdc810 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab_complex_bi.c @@ -0,0 +1,111 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + * The externally accessible functions are + * + * int bicgstab(bispinor * const, bispinor * const, const int, double, matrix_mult_bi) + * BiCGstab solver + * + * + * + * + * Author: Thomas Chiarappa + * Thomas.Chiarappa@mib.infn.it + * + **************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "su3.h" +#include "global.h" +#include "linalg_eo.h" +#include "start.h" +#include "solver_field.h" +#include "bicgstab_complex_bi.h" + +/* P inout (guess for the solving bispinor) + Q input +*/ +int bicgstab_complex_bi(bispinor * const P, bispinor * const Q, const int max_iter, double eps_sq, const int rel_prec, const int N, matrix_mult_bi f){ + + double err, squarenorm; + _Complex double rho0, rho1, omega, alpha, beta, nom, denom; + int i; + bispinor * r, * p, * v, *hatr, * s, * t; + bispinor ** bisolver_field = NULL; + const int nr_sf = 6; + + if(N == VOLUME) { + init_bisolver_field(&bisolver_field, VOLUMEPLUSRAND, nr_sf); + } + else { + init_bisolver_field(&bisolver_field, VOLUMEPLUSRAND/2, nr_sf); + } + + hatr = bisolver_field[0]; + r = bisolver_field[1]; + v = bisolver_field[2]; + p = bisolver_field[3]; + s = bisolver_field[4]; + t = bisolver_field[5]; + + f(r, P); + diff((spinor*)p, (spinor*)Q, (spinor*)r, 2*N); + assign((spinor*)r, (spinor*)p, 2*N); + assign((spinor*)hatr, (spinor*)p, 2*N); + rho0 = scalar_prod((spinor*)hatr, (spinor*)r, 2*N, 1); + squarenorm = square_norm((spinor*)Q, 2*N, 1); + + for(i = 0; i < max_iter; i++){ + err = square_norm((spinor*)r, 2*N, 1); + if(g_proc_id == g_stdio_proc && g_debug_level > 2) { + printf("%d %e\n", i, err); + fflush(stdout); + } + + if((((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))) && i>0) { + finalize_bisolver(bisolver_field, nr_sf); + return(i); + } + f(v, p); + denom = scalar_prod((spinor*)hatr, (spinor*)v, 2*N, 1); + alpha = rho0 / denom; + assign((spinor*)s, (spinor*)r, 
2*N); + assign_diff_mul((spinor*)s, (spinor*)v, alpha, 2*N); + f(t, s); + omega = scalar_prod((spinor*)t, (spinor*)s, 2*N, 1); + omega /= square_norm((spinor*)t, 2*N, 1); + assign_add_mul_add_mul((spinor*)P, (spinor*)p, (spinor*)s, alpha, omega, 2*N); + assign((spinor*)r, (spinor*)s, 2*N); + assign_diff_mul((spinor*)r, (spinor*)t, omega, 2*N); + rho1 = scalar_prod((spinor*)hatr, (spinor*)r, 2*N, 1); + nom = alpha * rho1; + denom = omega * rho0; + beta = nom / denom; + omega = -omega; + assign_mul_bra_add_mul_ket_add((spinor*)p, (spinor*)v, (spinor*)r, omega, beta, 2*N); + rho0 = rho1; + } + finalize_bisolver(bisolver_field, nr_sf); + return -1; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab_complex_bi.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab_complex_bi.h new file mode 100644 index 0000000000000000000000000000000000000000..aaf50252b8f427c8c03cb4e01846bb82d8acf451 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstab_complex_bi.h @@ -0,0 +1,30 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _BICGSTAB_COMPLEX_BI_H +#define _BICGSTAB_COMPLEX_BI_H + +#include"solver/matrix_mult_typedef.h" +#include"su3.h" + +#include"solver/matrix_mult_typedef_bi.h" + +int bicgstab_complex_bi(bispinor * const, bispinor * const, const int max_iter, double eps_sq, const int rel_prec, const int N, matrix_mult_bi f); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstabell.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstabell.c new file mode 100644 index 0000000000000000000000000000000000000000..1550a2df0d0613ab20019adb21eb55c8378feb4a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstabell.c @@ -0,0 +1,157 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * This is an implementation of bicgstab(l) + * corresponding to the paper of G. L.G. Sleijpen and + * D.R. Fokkema + * Transactions on Numerical Analysis + * Volume1, pp. 
11-32, 1993 + * + * Author: Carsten Urbach + * urbach@physik.fu-berlin.de + * + *************************************************************/ + +#ifdef HAVE_CONFIG_H +# include<config.h> +#endif +#include <stdlib.h> +#include <stdio.h> +#include <math.h> +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "solver/matrix_mult_typedef.h" +#include "solver_field.h" +#include "bicgstabell.h" + +int bicgstabell(spinor * const x0, spinor * const b, const int max_iter, + double eps_sq, const int rel_prec, const int _l, const int N, matrix_mult f) { /* x0: guess in / solution out, b: source; returns the operator-pass counter k (>= 0) on convergence, -1 on failure or unsupported _l */ + + double err; + int i, j, k, l; + double rho0, rho1, beta, alpha, omega, gamma0 = 0., squarenorm; + spinor * r[5], * u[5], * r0_tilde, * x; /* indexed 0.._l, so the fixed size 5 supports _l <= 4 */ + double tau[5][5], gamma[25], gammap[25], gammapp[25], sigma[25]; + spinor ** solver_field = NULL; + const int nr_sf = 2*(_l+1)+2; + + l = _l; if(l < 1 || l > 4) { if(g_proc_id == 0) fprintf(stderr, "bicgstabell: l = %d is outside the supported range 1..4\n", l); return(-1); } /* reject before any allocation: larger l overruns r, u and tau, l < 1 never advances k */ + k = -l; + + if(N == VOLUME) { + init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); + } + else { + init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); + } + r0_tilde = solver_field[0]; + for(i = 0; i <= l; i++){ + r[i] = solver_field[2+2*i]; + u[i] = solver_field[3+2*i]; + } + + x = x0; + assign(u[0], b, N); + f(r0_tilde, x); + diff(r[0], u[0], r0_tilde, N); + zero_spinor_field(solver_field[1], N); + assign(r0_tilde, r[0], N); + squarenorm = square_norm(b, N, 1); /* reference value for the relative tolerance */ + + rho0 = 1.; + alpha = 0.; + omega = 1.; + err = square_norm(r0_tilde, N, 1); + while( k < max_iter && (((err > eps_sq) && (rel_prec == 0)) + || ((err > eps_sq*squarenorm) && (rel_prec == 1)) + )) { + k+=l; /* k advances in steps of l and may therefore step past max_iter */ + + /* The BiCG part */ + + rho0 *= -omega; + for(j = 0; j < l; j++) { + rho1 = scalar_prod_r(r[j], r0_tilde, N, 1); + beta = (rho1/rho0); + beta *= alpha; + rho0 = rho1; + for(i = 0; i <= j; i++) { + /* u_i = r_i - \beta u_i */ + assign_mul_add_r(u[i], -beta, r[i], N); + } + f(u[j+1], u[j]); + gamma0 = scalar_prod_r(u[j+1], r0_tilde, N, 1); + alpha = rho0/gamma0; + /* r_i = r_i - \alpha u_{i+1} */ + for(i = 0; i <= j; i++) { + assign_add_mul_r(r[i], u[i+1], -alpha, 
N); + } + f(r[j+1], r[j]); + /* x = x + \alpha u_0 */ + assign_add_mul_r(x, u[0], alpha, N); + err = square_norm(r[j+1], N, 1); + if(g_proc_id == 0 && g_debug_level > 2) {printf("%d %d err = %e\n", k, j, err);fflush(stdout);} + } + + /* The MR part */ + + for(j = 1; j <= l; j++){ + for(i = 1; i < j; i++){ + tau[i][j] = scalar_prod_r(r[j], r[i], N, 1)/sigma[i]; + assign_add_mul_r(r[j], r[i], -tau[i][j], N); + } + sigma[j] = scalar_prod_r(r[j], r[j], N, 1); + gammap[j] = scalar_prod_r(r[0], r[j], N, 1)/sigma[j]; + } + gamma[l] = gammap[l]; + omega = gamma[l]; + for(j = l-1; j > 0; j--) { + gamma[j] = gammap[j]; + for(i = j+1; i <= l; i++) { + gamma[j] -= (tau[j][i]*gamma[i]); + } + } + for(j = 1; j < l; j++) { + gammapp[j] = gamma[j+1]; + for(i = j+1; i < l; i++){ + gammapp[j] += (tau[j][i]*gamma[i+1]); + } + } + assign_add_mul_r(x, r[0], gamma[1], N); + assign_add_mul_r(r[0], r[l], -gammap[l], N); + for(j = 1; j < l; j++){ + assign_add_mul_r(x, r[j], gammapp[j], N); + assign_add_mul_r(r[0], r[j], -gammap[j], N); + } + assign_add_mul_r(u[0], u[l], -gamma[l], N); + for(j = 1; j < l; j++){ + assign_add_mul_r(u[0], u[j], -gamma[j], N); + } + err = square_norm(r[0], N, 1); /* this value drives the loop condition and the final verdict */ + if(g_proc_id == 0 && g_debug_level > 0){ + printf(" BiCGstabell iterated %d %d, %e rho0 = %e, alpha = %e, gamma0= %e\n", l, k, err, rho0, alpha, gamma0); + fflush( stdout ); + } + } + finalize_solver(solver_field, nr_sf); + if(((err > eps_sq) && (rel_prec == 0)) || ((err > eps_sq*squarenorm) && (rel_prec == 1))) return(-1); /* tolerance still unmet: the loop can only have stopped on k >= max_iter */ + return(k < 0 ? 0 : k); /* k is still -l when the initial residual already met the tolerance; report 0 instead of a negative count */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstabell.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstabell.h new file mode 100644 index 0000000000000000000000000000000000000000..9fd32bc73d128f1243296543e011a8e120d79850 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/bicgstabell.h @@ -0,0 +1,26 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _BICGSTABELL_H +#define _BICGSTABELL_H + +int bicgstabell(spinor * const x0, spinor * const b, const int max_iter, + double eps_sq, const int rel_prec, const int _l, const int N, matrix_mult f); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her.c new file mode 100644 index 0000000000000000000000000000000000000000..6bf0d3b1a077d5e03743cc5883f1b56c29817a22 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her.c @@ -0,0 +1,149 @@ +/*********************************************************************** + * + * Copyright (C) 2001 Martin Hasenbusch + * 2003 Thomas Chiarappa + * 2002,2003,2004,2005,2010 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + * + * + * File: cg_her.c + * + * CG solver for hermitian f only! + * + * The externally accessible functions are + * + * + * int cg_her(spinor * const P, spinor * const Q, const int max_iter, double eps_sq, const int rel_prec, const int N, matrix_mult f) + * CG solver; returns the number of iterations used, or -1 if max_iter is exceeded + * + * input: + * Q: source + * inout: + * P: initial guess and result + * + * + **************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include<config.h> +#endif +#include <stdlib.h> +#include <stdio.h> +#include <math.h> +#include <errno.h> +#ifdef MPI +# include <mpi.h> +#endif +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "gettime.h" +#include "solver/matrix_mult_typedef.h" +#include "sub_low_ev.h" +#include "poly_precon.h" +#include "solver_field.h" +#include "cg_her.h" + +int cg_her(spinor * const P, spinor * const Q, const int max_iter, + double eps_sq, const int rel_prec, const int N, matrix_mult f) { + + double normsq,pro,err,alpha_cg,beta_cg,squarenorm; /* plain automatics: every value is assigned before it is read, so no state needs to survive between calls */ + int iteration; + int save_sloppy = g_sloppy_precision; /* restored on exit, the loop may switch sloppy precision on */ + double atime, etime, flops; + spinor ** solver_field = NULL; + spinor * stmp; + const int nr_sf = 3; + + if(N == VOLUME) { + init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); + } + else { + init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); + } + /* initialize residue r and search vector p */ + atime = gettime(); + squarenorm = square_norm(Q, N, 1); /* reference value for the relative tolerance */ + + f(solver_field[0], P); + + diff(solver_field[1], Q, solver_field[0], N); + assign(solver_field[2], solver_field[1], N); + normsq=square_norm(solver_field[1], N, 1); + + /* main loop */ + for(iteration = 1; iteration <= max_iter; iteration++) { + f(solver_field[0], solver_field[2]); + pro = scalar_prod_r(solver_field[2], solver_field[0], N, 1); + alpha_cg = normsq / pro; + assign_add_mul_r(P, solver_field[2], alpha_cg, N); + +#if (defined SSE2 || defined SSE3) + assign_mul_add_r(solver_field[0], -alpha_cg, 
solver_field[1], N); + err = square_norm(solver_field[0], N, 1); +#else + err = assign_mul_add_r_and_square(solver_field[0], -alpha_cg, solver_field[1], N, 1); +#endif + + if(g_proc_id == g_stdio_proc && g_debug_level > 2) { + printf("CG: iterations: %d res^2 %e\n", iteration, err); + fflush(stdout); + } + + if (((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))) { + break; + } +#ifdef _USE_HALFSPINOR + if(((err*err <= eps_sq) && (rel_prec == 0)) || ((err*err <= eps_sq*squarenorm) && (rel_prec == 1))) { + g_sloppy_precision = 1; + if(g_debug_level > 2 && g_proc_id == g_stdio_proc && g_sloppy_precision_flag == 1) { + printf("sloppy precision on\n"); fflush( stdout); + } + } +#endif + + beta_cg = err / normsq; + assign_mul_add_r(solver_field[2], beta_cg, solver_field[0], N); + stmp = solver_field[0]; /* swap fields 0 and 1 so the new residual sits in field 1 without a copy */ + solver_field[0] = solver_field[1]; + solver_field[1] = stmp; + normsq = err; + } + etime = gettime(); + g_sloppy_precision = save_sloppy; + /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */ + /* 2*1608.0 because the linalg is over VOLUME/2 */ + flops = (2*(2*1608.0+2*3*4) + 2*3*4 + iteration*(2.*(2*1608.0+2*3*4) + 10*3*4))*N/1.0e6f; + if(g_debug_level > 0 && g_proc_id == 0 && N != VOLUME) { + printf("# CG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", iteration, eps_sq, etime-atime); + printf("# CG: flopcount (for e/o tmWilson only): t/s: %1.4e mflops_local: %.1f mflops: %.1f\n", + etime-atime, flops/(etime-atime), g_nproc*flops/(etime-atime)); + } + finalize_solver(solver_field, nr_sf); + if(iteration > max_iter) return(-1); /* the loop ran to completion without hitting the break */ + return(iteration); +} + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her.h new file mode 100644 index 0000000000000000000000000000000000000000..bbb4f702fdd7249da4cbf2cfc1cec2aca2ed8270 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her.h @@ -0,0 +1,28 @@ 
+/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _CG_HER_H +#define _CG_HER_H + +#include"solver/matrix_mult_typedef.h" +#include"su3.h" + +int cg_her(spinor * const, spinor * const, const int max_iter, double eps_sq, const int rel_prec, + const int N, matrix_mult f); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_bi.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_bi.c new file mode 100644 index 0000000000000000000000000000000000000000..20d52fa6782cd6ec10234c211983412592d23685 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_bi.c @@ -0,0 +1,139 @@ +/*********************************************************************** + * + * Copyright (C) 2005 Thomas Chiarappa + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + * + * + * File: cg_her_bi.c + * + * CG solver for hermitian f only! + * + * The externally accessible functions are + * + * + * int cg_her_bi(bispinor * const P, bispinor * const Q, const int max_iter, double eps_sq, const int rel_prec, const int N, matrix_mult_bi f) + * CG solver for bispinor structure + * + * + * + * !!!!! SO FAR NOT IMPLEMENTED FOR EW-SUBTRACTION !!!!!! + * + * + * + * input: + * max_iter: maximal number of iterations + * eps_sq: tolerance for the squared residual norm, compared directly if + * rel_prec == 0 and scaled by the squared norm of Q if rel_prec == 1 + * Q: source + * inout: + * P: initial guess and result + * + * + **************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include<config.h> +#endif +#include <stdlib.h> +#include <stdio.h> +#include <math.h> +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "solver/matrix_mult_typedef.h" +#include "sub_low_ev.h" +#include "cg_her_bi.h" +#include "solver_field.h" +#include"solver/matrix_mult_typedef_bi.h" + + +/* P output = solution , Q input = source; returns the number of iterations used, -1 if max_iter is exhausted */ +int cg_her_bi(bispinor * const P, bispinor * const Q, const int max_iter, + double eps_sq, const int rel_prec, const int N, matrix_mult_bi f) { + + double normsp, normsq, pro, err, alpha_cg, beta_cg, squarenorm; + int iteration; + bispinor ** bisolver_field = NULL; + const int nr_sf = 6; + + if(N == VOLUME) { + init_bisolver_field(&bisolver_field, VOLUMEPLUSRAND, nr_sf); + } + else { + init_bisolver_field(&bisolver_field, VOLUMEPLUSRAND/2, nr_sf); + } + squarenorm = square_norm((spinor*)Q, 2*N, 1); /* bispinor fields go to the spinor routines as 2*N spinors */ + /* !!!! INITIALIZATION !!!! 
*/ + assign((spinor*)bisolver_field[0], (spinor*)P, 2*N); + /* squared norm of the initial guess, used only to detect a zero start vector */ + normsp=square_norm((spinor*)P, 2*N, 1); + assign((spinor*)bisolver_field[5], (spinor*)Q, 2*N); + + /* initialize residue r and search vector p */ + if(normsp == 0) { + /* if a starting solution vector equal to zero is chosen */ + assign((spinor*)bisolver_field[1], (spinor*)bisolver_field[5], 2*N); + assign((spinor*)bisolver_field[2], (spinor*)bisolver_field[5], 2*N); + normsq=square_norm((spinor*)Q, 2*N, 1); + } + else { + /* if a starting solution vector different from zero is chosen */ + f(bisolver_field[3], bisolver_field[0]); + diff((spinor*)bisolver_field[1], (spinor*)bisolver_field[5], + (spinor*)bisolver_field[3], 2*N); + assign((spinor*)bisolver_field[2], (spinor*)bisolver_field[1], 2*N); + normsq=square_norm((spinor*)bisolver_field[2], 2*N, 1); + } + + /* main loop */ + for(iteration = 0; iteration < max_iter; iteration++) { + f(bisolver_field[4], bisolver_field[2]); + pro=scalar_prod_r((spinor*)bisolver_field[2], (spinor*)bisolver_field[4], 2*N, 1); + + /* Compute alpha_cg(i+1) */ + alpha_cg=normsq/pro; + + /* Compute x_(i+1) = x_i + alpha_cg(i+1) p_i */ + assign_add_mul_r((spinor*)bisolver_field[0], (spinor*)bisolver_field[2], alpha_cg, 2*N); + /* Compute r_(i+1) = r_i - alpha_cg(i+1) Qp_i */ + assign_add_mul_r((spinor*)bisolver_field[1], (spinor*)bisolver_field[4], -alpha_cg, 2*N); + + /* Check whether the precision is reached ... 
*/ + err=square_norm((spinor*)bisolver_field[1], 2*N, 1); + + if((g_proc_id == g_stdio_proc) && (g_debug_level > 2)) { + printf("%d\t%g\n",iteration,err); fflush( stdout); + } + + if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))) { + assign((spinor*)P, (spinor*)bisolver_field[0], 2*N); + finalize_bisolver(bisolver_field, nr_sf); + return(iteration+1); + } + + /* Compute beta_cg(i+1) + Compute p_(i+1) = r_i+1 + beta_(i+1) p_i */ + beta_cg=err/normsq; + assign_mul_add_r((spinor*)bisolver_field[2], beta_cg, (spinor*)bisolver_field[1], 2*N); + normsq=err; + } + + assign((spinor*)P, (spinor*)bisolver_field[0], 2*N); /* hand back the last iterate even though the tolerance was not met */ + finalize_bisolver(bisolver_field, nr_sf); + return(-1); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_bi.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_bi.h new file mode 100644 index 0000000000000000000000000000000000000000..2d41e64281eeb8d0b3ab5770748783c0978a783d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_bi.h @@ -0,0 +1,30 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. 
+ ***********************************************************************/ +#ifndef _CG_HER_BI_H +#define _CG_HER_BI_H + +#include"solver/matrix_mult_typedef.h" +#include"su3.h" + +#include"solver/matrix_mult_typedef_bi.h" + +int cg_her_bi(bispinor * const, bispinor * const, const int max_iter, + double eps_sq, const int rel_prec, const int N, matrix_mult_bi f); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_nd.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_nd.c new file mode 100644 index 0000000000000000000000000000000000000000..e0fe53411045ff79c705c5b30a0d2a7861340999 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_nd.c @@ -0,0 +1,172 @@ +/*********************************************************************** + * + * Copyright (C) 2005 Thomas Chiarappa + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * File: cg_her.c + * + * CG solver for hermitian f only! + * + * The externally accessible functions are + * + * + * int cg(spinor * const P, spinor * const Q, double m, const int subtract_ev) + * CG solver + * + * input: + * m: Mass to be use in D_psi + * subtrac_ev: if set to 1, the lowest eigenvectors of Q^2 will + * be projected out. 
+ * Q: source + * inout: + * P: initial guess and result + * + * + **************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include<config.h> +#endif +#include <stdlib.h> +#include <stdio.h> +#include <math.h> +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "solver/matrix_mult_typedef_nd.h" +#include "sub_low_ev.h" +#include "solver_field.h" +#include "cg_her_nd.h" + +/* P output = solution , Q input = source; returns the number of iterations used, -1 if max_iter is exhausted */ +int cg_her_nd(spinor * const P_up,spinor * P_dn, spinor * const Q_up, spinor * const Q_dn, + const int max_iter, double eps_sq, const int rel_prec, + const int N, matrix_mult_nd f) { + double normsp, normsq, pro, err, alpha_cg, beta_cg, squarenorm; + int iteration; + double err1, err2; + spinor ** up_field = NULL; + spinor ** dn_field = NULL; + const int nr_sf = 5; + /* do we really need so many fields??? */ + init_solver_field(&up_field, VOLUMEPLUSRAND, nr_sf); + init_solver_field(&dn_field, VOLUMEPLUSRAND, nr_sf); + + squarenorm = square_norm(Q_up, N, 1); /* norms are accumulated over the up and dn components */ + squarenorm+= square_norm(Q_dn, N, 1); + /* !!!! INITIALIZATION !!!! 
*/ + assign(up_field[0], P_up, N); + assign(dn_field[0], P_dn, N); + + /* squared norm of the initial guess, used only to detect a zero start vector */ + normsp =square_norm(P_up, N, 1); + normsp+=square_norm(P_dn, N, 1); + +/* assign(up_field[5], Q_up, N); */ +/* assign(dn_field[5], Q_dn, N); */ + + /* initialize residue r and search vector p */ + if(normsp==0){ + /* if a starting solution vector equal to zero is chosen */ + assign(up_field[1], Q_up, N); + assign(dn_field[1], Q_dn, N); + assign(up_field[2], Q_up, N); + assign(dn_field[2], Q_dn, N); + normsq =square_norm(Q_up, N, 1); + normsq+=square_norm(Q_dn, N, 1); + } + else { + /* if a starting solution vector different from zero is chosen */ + f(up_field[3],dn_field[3], + up_field[0],dn_field[0]); + + diff(up_field[1], Q_up, up_field[3], N); + diff(dn_field[1], Q_dn, dn_field[3], N); + assign(up_field[2], up_field[1], N); + assign(dn_field[2], dn_field[1], N); + normsq =square_norm(up_field[2], N, 1); + normsq+=square_norm(dn_field[2], N, 1); + } + + /* main loop */ + for(iteration=0;iteration<max_iter;iteration++){ /* NOTE(review): loop head through err = err1 + err2 was reconstructed after the text between '<' and '>' was lost -- confirm against the pristine file */ + f(up_field[4],dn_field[4], + up_field[2],dn_field[2]); + pro =scalar_prod_r(up_field[2], up_field[4], N, 1); + pro+=scalar_prod_r(dn_field[2], dn_field[4], N, 1); + + /* Compute alpha_cg(i+1) */ + alpha_cg=normsq/pro; + + /* Compute x_(i+1) = x_i + alpha_cg(i+1) p_i */ + assign_add_mul_r(up_field[0], up_field[2], alpha_cg, N); + assign_add_mul_r(dn_field[0], dn_field[2], alpha_cg, N); + /* Compute r_(i+1) = r_i - alpha_cg(i+1) Qp_i */ + assign_add_mul_r(up_field[1], up_field[4], -alpha_cg, N); + assign_add_mul_r(dn_field[1], dn_field[4], -alpha_cg, N); + + /* Check whether the precision is reached ... */ + err1 =square_norm(up_field[1], N, 1); + err2 =square_norm(dn_field[1], N, 1); + err = err1 + err2; + + if(g_debug_level > 2 && g_proc_id == g_stdio_proc) { + printf("cg_her_nd : i = %d esqr %e = %e + %e \n",iteration,err, err1, err2); fflush( stdout); + } + + if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))) { + assign(P_up, up_field[0], N); + assign(P_dn, dn_field[0], N); + g_sloppy_precision = 0; + finalize_solver(up_field, nr_sf); + finalize_solver(dn_field, nr_sf); + return(iteration+1); + } +#ifdef _USE_HALFSPINOR + if(((err*err <= eps_sq) && (rel_prec == 0)) || ((err*err <= eps_sq*squarenorm) && (rel_prec == 1))) { + g_sloppy_precision = 1; + if(g_debug_level > 2 && g_proc_id == g_stdio_proc) { + printf("sloppy precision on\n"); fflush( stdout); + } + } +#endif + /* Compute beta_cg(i+1) + Compute p_(i+1) = r_i+1 + beta_(i+1) p_i */ + beta_cg=err/normsq; + assign_mul_add_r(up_field[2], beta_cg, up_field[1], N); + assign_mul_add_r(dn_field[2], beta_cg, dn_field[1], N); + normsq=err; + } + + assign(P_up, up_field[0], N); /* hand back the last iterate even though the tolerance was not met */ + assign(P_dn, dn_field[0], 
N); + g_sloppy_precision = 0; + + finalize_solver(up_field, nr_sf); + finalize_solver(dn_field, nr_sf); + return(-1); +} + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_nd.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_nd.h new file mode 100644 index 0000000000000000000000000000000000000000..20f2f5ddc0a6cf71a0fd3f84ded70e80d918b2b0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_nd.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. 
+ ***********************************************************************/ +#ifndef _CG_HER_ND_H +#define _CG_HER_ND_H + +#include"solver/matrix_mult_typedef_nd.h" +#include"su3.h" + +int cg_her_nd(spinor * const, spinor * const,spinor * const, spinor * const, + const int max_iter, double eps_sq, const int rel_prec, + const int N, matrix_mult_nd f); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_su3vect.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_su3vect.c new file mode 100755 index 0000000000000000000000000000000000000000..514aea4bf03ae578d6a4ab0c53de5bb890ba27b0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_su3vect.c @@ -0,0 +1,98 @@ +/*********************************************************************** + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + **************************************************************************/ + +/* ************************************************************************ + * Conjugate Gradient for su3 vectors + * Authors: Luigi Scorzato, Marco Cristoforetti + * based on cg_her.c + * + **************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include<config.h> +#endif +#include <stdlib.h> +#include <stdio.h> +#include <math.h> +#include <errno.h> +#ifdef MPI +# include <mpi.h> +#endif +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "gettime.h" +#include "solver/matrix_mult_typedef.h" +#include "cg_her_su3vect.h" + +#ifdef WITHLAPH + +int cg_her_su3vect(su3_vector * const P, su3_vector * const Q, const int max_iter, + double eps_sq, const int rel_prec, const int N,const int tslice, matrix_mult_su3vect f) { /* P: guess in / solution out, Q: source; returns the number of iterations used, -1 if max_iter is exceeded */ + + double normsq,pro,err,alpha_cg,beta_cg,squarenorm; /* plain automatics: every value is assigned before it is read, so no state needs to survive between calls */ + int iteration; + int save_sloppy = g_sloppy_precision; + double atime, etime; + + + atime = gettime(); + squarenorm = square_norm_su3vect(Q, N, 1); /* reference value for the relative tolerance */ + + f(g_jacobi_field[0],P,tslice); /* work vectors live in the global g_jacobi_field[0..2] */ + + diff_su3vect(g_jacobi_field[1], Q, g_jacobi_field[0], N); + assign_su3vect(g_jacobi_field[2], g_jacobi_field[1], N); + normsq=square_norm_su3vect(g_jacobi_field[1], N, 1); + + /* main loop */ + for(iteration = 1; iteration <= max_iter; iteration++) { + f(g_jacobi_field[0], g_jacobi_field[2],tslice); + pro = scalar_prod_r_su3vect(g_jacobi_field[2], g_jacobi_field[0], N, 1); + alpha_cg = normsq / pro; + assign_add_mul_r_su3vect(P, g_jacobi_field[2], alpha_cg, N); + + assign_mul_add_r_su3vect(g_jacobi_field[0], -alpha_cg, g_jacobi_field[1], N); + err=square_norm_su3vect(g_jacobi_field[0], N, 1); + + if(g_proc_id == g_stdio_proc && g_debug_level > 2) { + printf("CG: iterations: %d res^2 %e\n", iteration, err); + fflush(stdout); + } + + if (((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))) { + break; + } + beta_cg = err / normsq; + 
assign_mul_add_r_su3vect(g_jacobi_field[2], beta_cg, g_jacobi_field[0], N); + assign_su3vect(g_jacobi_field[1], g_jacobi_field[0], N); /* copies the new residual into field 1 */ + normsq = err; + } + etime = gettime(); + g_sloppy_precision = save_sloppy; + /* FLOPS= 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */ + if(g_debug_level > 0 && g_proc_id == 0) { + printf("CG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", iteration, eps_sq, etime-atime); + } + if(iteration > max_iter) return(-1); /* the loop ran to completion without hitting the break */ + return(iteration); +} + +#endif // WITHLAPH diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_su3vect.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_su3vect.h new file mode 100755 index 0000000000000000000000000000000000000000..85e8e20b41877bab4176602654b77364afb3276e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_her_su3vect.h @@ -0,0 +1,27 @@ +/*********************************************************************** + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. 
+ ***********************************************************************/ +#ifndef _CG_HERSU3V_H +#define _CG_HERSU3V_H + +#include"solver/matrix_mult_typedef.h" +#include"su3.h" + +int cg_her_su3vect(su3_vector * const P, su3_vector * const Q, const int max_iter, double eps_sq, const int rel_prec, + const int N, const int tslice, matrix_mult_su3vect f); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_mms_tm.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_mms_tm.c new file mode 100644 index 0000000000000000000000000000000000000000..9e88ef743d68171f6b73d2d02f04df4990084a03 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_mms_tm.c @@ -0,0 +1,241 @@ +/*********************************************************************** + * + * Copyright (C) 2004 Andrea Shindler + * 2013 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * Author: Andrea Shindler Jan 2004 + * + * This is a Multi-Shift CG solver + * it expects that the shifts fulfil + * + * shift[0] < shift[1] < shift{2] < ... < shift[no_shifts-1] + * + * in modulus. 
The code will use shift[i]^2, which are all >0 + * + * parameters: + * shifts are given to the solver in solver_pm->shifts + * number of shifts is in solver_pm->no_shifts + * the operator to invert in solver_pm->M_ndpsi + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "gamma.h" +#include "linalg_eo.h" +#include "start.h" +#include "gettime.h" +#include "solver/solver.h" +#include "solver_field.h" +#include "cg_mms_tm.h" +#include + +static spinor * ps_qmms; +static spinor ** ps_mms_solver; +static double * sigma; +static double * zitam1, * zita; +static double * alphas, * betas; + +extern int index_start; + +static void init_mms_tm(const unsigned int nr, const unsigned int N); +static void free_mms_tm(); + +/* P output = solution , Q input = source */ +int cg_mms_tm(spinor ** const P, spinor * const Q, + solver_pm_t * solver_pm, double * cgmms_reached_prec) { + + static double normsq, pro, err, squarenorm; + int iteration, N = solver_pm->sdim, no_shifts = solver_pm->no_shifts; + static double gamma, alpham1; + spinor ** solver_field = NULL; + double atime, etime; + const int nr_sf = 3; + + atime = gettime(); + if(solver_pm->sdim == VOLUME) { + init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); + init_mms_tm(no_shifts, VOLUMEPLUSRAND); + } + else { + init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); + init_mms_tm(no_shifts, VOLUMEPLUSRAND/2); + } + + zero_spinor_field(P[0], N); + alphas[0] = 1.0; + betas[0] = 0.0; + sigma[0] = solver_pm->shifts[0]*solver_pm->shifts[0]; + if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMS: shift %d is %e\n", 0, sigma[0]); + + for(int im = 1; im < no_shifts; im++) { + sigma[im] = solver_pm->shifts[im]*solver_pm->shifts[im] - sigma[0]; + if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMS: shift %d is %e\n", im, sigma[im]); + // these will be the result spinor fields 
+ zero_spinor_field(P[im], N); + // these are intermediate fields + assign(ps_mms_solver[im-1], Q, N); + zitam1[im] = 1.0; + zita[im] = 1.0; + alphas[im] = 1.0; + betas[im] = 0.0; + } + + /* currently only implemented for P=0 */ + squarenorm = square_norm(Q, N, 1); + /* if a starting solution vector equal to zero is chosen */ + assign(solver_field[0], Q, N); + assign(solver_field[1], Q, N); + normsq = squarenorm; + + /* main loop */ + for(iteration = 0; iteration < solver_pm->max_iter; iteration++) { + + /* Q^2*p and then (p,Q^2*p) */ + solver_pm->M_psi(solver_field[2], solver_field[1]); + // add the zero's shift + assign_add_mul_r(solver_field[2], solver_field[1], sigma[0], N); + pro = scalar_prod_r(solver_field[1], solver_field[2], N, 1); + + /* For the update of the coeff. of the shifted pol. we need alphas[0](i-1) and alpha_cg(i). + This is the reason why we need this double definition of alpha */ + alpham1 = alphas[0]; + + /* Compute alphas[0](i+1) */ + alphas[0] = normsq/pro; + for(int im = 1; im < no_shifts; im++) { + + /* Now gamma is a temp variable that corresponds to zita(i+1) */ + gamma = zita[im]*alpham1/(alphas[0]*betas[0]*(1.-zita[im]/zitam1[im]) + + alpham1*(1.+sigma[im]*alphas[0])); + + // Now zita(i-1) is put equal to the old zita(i) + zitam1[im] = zita[im]; + // Now zita(i+1) is updated + zita[im] = gamma; + // Update of alphas(i) = alphas[0](i)*zita(i+1)/zita(i) + alphas[im] = alphas[0]*zita[im]/zitam1[im]; + + // Compute xs(i+1) = xs(i) + alphas(i)*ps(i) + assign_add_mul_r(P[im], ps_mms_solver[im-1], alphas[im], N); + // in the CG the corrections are decreasing with the iteration number increasing + // therefore, we can remove shifts when the norm of the correction vector + // falls below a threshold + // this is useful for computing time and needed, because otherwise + // zita might get smaller than DOUBLE_EPS and, hence, zero + if(iteration > 0 && (iteration % 20 == 0) && (im == no_shifts-1)) { + double sn = square_norm(ps_mms_solver[im-1], 
N, 1); + if(alphas[no_shifts-1]*alphas[no_shifts-1]*sn <= solver_pm->squared_solver_prec) { + no_shifts--; + if(g_debug_level > 2 && g_proc_id == 0) { + printf("# CGMMS: at iteration %d removed one shift, %d remaining\n", iteration, no_shifts); + } + } + } + } + + /* Compute x_(i+1) = x_i + alphas[0](i+1) p_i */ + assign_add_mul_r(P[0], solver_field[1], alphas[0], N); + /* Compute r_(i+1) = r_i - alphas[0](i+1) Qp_i */ + assign_add_mul_r(solver_field[0], solver_field[2], -alphas[0], N); + + /* Check whether the precision eps_sq is reached */ + + err = square_norm(solver_field[0], N, 1); + + if(g_debug_level > 2 && g_proc_id == g_stdio_proc) { + printf("# CGMMS iteration: %d residue: %g\n", iteration, err); fflush( stdout ); + } + + if( ((err <= solver_pm->squared_solver_prec) && (solver_pm->rel_prec == 0)) || + ((err <= solver_pm->squared_solver_prec*squarenorm) && (solver_pm->rel_prec > 0)) || + (iteration == solver_pm->max_iter -1) ) { + /* FIXME temporary output of precision until a better solution can be found */ + *cgmms_reached_prec = err; + break; + } + + /* Compute betas[0](i+1) = (r(i+1),r(i+1))/(r(i),r(i)) + Compute p(i+1) = r(i+1) + beta(i+1)*p(i) */ + betas[0] = err/normsq; + assign_mul_add_r(solver_field[1], betas[0], solver_field[0], N); + normsq = err; + + /* Compute betas(i+1) = betas[0](i+1)*(zita(i+1)*alphas(i))/(zita(i)*alphas[0](i)) + Compute ps(i+1) = zita(i+1)*r(i+1) + betas(i+1)*ps(i) */ + for(int im = 1; im < no_shifts; im++) { + betas[im] = betas[0]*zita[im]*alphas[im]/(zitam1[im]*alphas[0]); + assign_mul_add_mul_r(ps_mms_solver[im-1], solver_field[0], betas[im], zita[im], N); + } + } + etime = gettime(); + g_sloppy_precision = 0; + if(iteration == solver_pm->max_iter -1) iteration = -1; + else iteration++; + if(g_debug_level > 0 && g_proc_id == 0) { + printf("# CGMMS (%d shifts): iter: %d eps_sq: %1.4e %1.4e t/s\n", solver_pm->no_shifts, iteration, solver_pm->squared_solver_prec, etime - atime); + } + + finalize_solver(solver_field, 
nr_sf); + return(iteration); +} + + +static unsigned int ini_mms = 0; +static unsigned int mms_nr_allocated = 0; + +static void init_mms_tm(const unsigned int _nr, const unsigned int N) { + if(ini_mms == 0 || _nr > mms_nr_allocated) { + if(mms_nr_allocated != 0) { + free_mms_tm(); + } + + sigma = (double*)calloc((_nr), sizeof(double)); + zitam1 = (double*)calloc((_nr), sizeof(double)); + zita = (double*)calloc((_nr), sizeof(double)); + alphas = (double*)calloc((_nr), sizeof(double)); + betas = (double*)calloc((_nr), sizeof(double)); + + ps_qmms = (spinor*)calloc(N*_nr,sizeof(spinor)); + ps_mms_solver = (spinor**)calloc(_nr,sizeof(spinor*)); + + for(int i = 0; i < _nr; i++) { + ps_mms_solver[i]=(spinor*)(((unsigned long int)(ps_qmms)+ALIGN_BASE)&~ALIGN_BASE) + i*N; + } + mms_nr_allocated = _nr; + ini_mms = 1; + } +} + +static void free_mms_tm() { + free(sigma); + free(zitam1); + free(zita); + free(alphas); + free(betas); + free(ps_qmms); + free(ps_mms_solver); + mms_nr_allocated = 0; + ini_mms = 0; + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_mms_tm.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_mms_tm.h new file mode 100644 index 0000000000000000000000000000000000000000..1b70facbebf1797d8fcd0d3199737ae8d0477792 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_mms_tm.h @@ -0,0 +1,33 @@ +/*********************************************************************** + * + * + * Copyright (C) 2004 Andrea Shindler + * 2009 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + ***********************************************************************/ + +#ifndef _CG_MMS_TM_H +#define _CG_MMS_TM_H + +#include "solver.h" +#include "matrix_mult_typedef.h" +#include "su3.h" + +int cg_mms_tm(spinor ** const P,spinor * const Q, solver_pm_t * const params, double * reached_prec); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_mms_tm_nd.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_mms_tm_nd.c new file mode 100644 index 0000000000000000000000000000000000000000..9da378692f650ea442c82a73efb47d1a95ec9f7c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_mms_tm_nd.c @@ -0,0 +1,256 @@ +/*********************************************************************** + * + * Copyright (C) 2004 Andrea Shindler + * 2009 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * Author: Andrea Shindler Jan 2004 + * + * This is a Multi-Shift CG solver + * it expects that the shifts fulfil + * + * shift[0] < shift[1] < shift[2] < ... < shift[no_shifts-1] + * + * in modulus. 
The code will use shift[i]^2, which are all >0 + * + * parameters: + * shifts are given to the solver in solver_pm->shifts + * number of shifts is in solver_pm->no_shifts + * the operator to invert in solver_pm->M_ndpsi + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "gamma.h" +#include "linalg_eo.h" +#include "start.h" +#include "gettime.h" +#include "solver/solver.h" +#include "solver_field.h" +#include "cg_mms_tm_nd.h" + +static spinor * ps_qmms; +static spinor ** ps_mms_solver; +static double * sigma; +static double * zitam1, * zita; +static double * alphas, * betas; + +extern int index_start; + +static void init_mms_tm_nd(const unsigned int nr, const unsigned int N); +static void free_mms_tm_nd(); + +/* P output = solution , Q input = source */ +int cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn, + spinor * const Qup, spinor * const Qdn, + solver_pm_t * solver_pm) { + + static double normsq, pro, err, squarenorm; + int iteration, N = solver_pm->sdim, shifts = solver_pm->no_shifts; + static double gamma, alpham1; + spinor ** solver_field = NULL; + double atime, etime; + const int nr_sf = 4; + + atime = gettime(); + if(solver_pm->sdim == VOLUME) { + init_solver_field(&solver_field, VOLUMEPLUSRAND, 2*nr_sf); + } + else { + init_solver_field(&solver_field, VOLUMEPLUSRAND/2, 2*nr_sf); + } + + // don't need boundaries, because we never apply f to them + // so N is enough + //init_mms_tm_nd(shifts, solver_pm->N); + init_mms_tm_nd(shifts, VOLUMEPLUSRAND/2); + zero_spinor_field(Pup[0], N); + zero_spinor_field(Pdn[0], N); + assign(ps_mms_solver[0], Qup, N); + assign(ps_mms_solver[1], Qdn, N); + alphas[0] = 1.0; + betas[0] = 0.0; + sigma[0] = solver_pm->shifts[0]*solver_pm->shifts[0]; + if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMSND: shift %d is %e\n", 0, sigma[0]); + + /* currently only implemented for P=0 
*/ + for(int im = 1; im < shifts; im++) { + sigma[im] = solver_pm->shifts[im]*solver_pm->shifts[im] - sigma[0]; + if(g_proc_id == 0 && g_debug_level > 2) printf("# CGMMSND: shift %d is %e\n", im, sigma[im]); + // these will be the result spinor fields + zero_spinor_field(Pup[im], N); + zero_spinor_field(Pdn[im], N); + // these are intermediate fields + assign(ps_mms_solver[2*im], Qup, N); + assign(ps_mms_solver[2*im+1], Qdn, N); + zitam1[im] = 1.0; + zita[im] = 1.0; + alphas[im] = 1.0; + betas[im] = 0.0; + } + + squarenorm = square_norm(Qup, N, 1) + square_norm(Qdn, N, 1); + /* if a starting solution vector equal to zero is chosen */ + assign(solver_field[0], Qup, N); + assign(solver_field[1], Qdn, N); + assign(solver_field[2], Qup, N); + assign(solver_field[3], Qdn, N); + normsq = squarenorm; + + /* main loop */ + for(iteration = 0; iteration < solver_pm->max_iter; iteration++) { + + /* Q^2*p and then (p,Q^2*p) */ + solver_pm->M_ndpsi(solver_field[6], solver_field[7], solver_field[2], solver_field[3]); + // add the zero's shift + assign_add_mul_r(solver_field[6], solver_field[2], sigma[0], N); + assign_add_mul_r(solver_field[7], solver_field[3], sigma[0], N); + pro = scalar_prod_r(solver_field[2], solver_field[6], N, 1); + pro += scalar_prod_r(solver_field[3], solver_field[7], N, 1); + + /* For the update of the coeff. of the shifted pol. we need alphas[0](i-1) and alpha_cg(i). 
+ This is the reason why we need this double definition of alpha */ + alpham1 = alphas[0]; + + /* Compute alphas[0](i+1) */ + alphas[0] = normsq/pro; + for(int im = 1; im < shifts; im++) { + + /* Now gamma is a temp variable that corresponds to zita(i+1) */ + gamma = zita[im]*alpham1/(alphas[0]*betas[0]*(1.-zita[im]/zitam1[im]) + + alpham1*(1.+sigma[im]*alphas[0])); + + // Now zita(i-1) is put equal to the old zita(i) + zitam1[im] = zita[im]; + // Now zita(i+1) is updated + zita[im] = gamma; + // Update of alphas(i) = alphas[0](i)*zita(i+1)/zita(i) + alphas[im] = alphas[0]*zita[im]/zitam1[im]; + + // Compute xs(i+1) = xs(i) + alphas(i)*ps(i) + assign_add_mul_r(Pup[im], ps_mms_solver[2*im], alphas[im], N); + assign_add_mul_r(Pdn[im], ps_mms_solver[2*im+1], alphas[im], N); + // in the CG the corrections are decreasing with the iteration number increasing + // therefore, we can remove shifts when the norm of the correction vector + // falls below a threshold + // this is useful for computing time and needed, because otherwise + // zita might get smaller than DOUBLE_EPS and, hence, zero + if(iteration > 0 && (iteration % 20 == 0) && (im == shifts-1)) { + double sn = square_norm(ps_mms_solver[2*im], N, 1); + sn += square_norm(ps_mms_solver[2*im+1], N, 1); + if(alphas[shifts-1]*alphas[shifts-1]*sn <= solver_pm->squared_solver_prec) { + shifts--; + if(g_debug_level > 2 && g_proc_id == 0) { + printf("# CGMMSND: at iteration %d removed one shift, %d remaining\n", iteration, shifts); + } + } + } + } + + /* Compute x_(i+1) = x_i + alphas[0](i+1) p_i */ + assign_add_mul_r(Pup[0], solver_field[2], alphas[0], N); + assign_add_mul_r(Pdn[0], solver_field[3], alphas[0], N); + /* Compute r_(i+1) = r_i - alphas[0](i+1) Qp_i */ + assign_add_mul_r(solver_field[0], solver_field[6], -alphas[0], N); + assign_add_mul_r(solver_field[1], solver_field[7], -alphas[0], N); + + /* Check whether the precision eps_sq is reached */ + + err = square_norm(solver_field[0], N, 1) + 
square_norm(solver_field[1], N, 1); + + if(g_debug_level > 2 && g_proc_id == g_stdio_proc) { + printf("# CGMMSND iteration: %d residue: %g\n", iteration, err); fflush( stdout ); + } + + if( ((err <= solver_pm->squared_solver_prec) && (solver_pm->rel_prec == 0)) || + ((err <= solver_pm->squared_solver_prec*squarenorm) && (solver_pm->rel_prec > 0)) || + (iteration == solver_pm->max_iter -1) ) { + break; + } + + /* Compute betas[0](i+1) = (r(i+1),r(i+1))/(r(i),r(i)) + Compute p(i+1) = r(i+1) + beta(i+1)*p(i) */ + betas[0] = err/normsq; + assign_mul_add_r(solver_field[2], betas[0], solver_field[0], N); + assign_mul_add_r(solver_field[3], betas[0], solver_field[1], N); + normsq = err; + + /* Compute betas(i+1) = betas[0](i)*(zita(i+1)*alphas(i))/(zita(i)*alphas[0](i)) + Compute ps(i+1) = zita(i+1)*r(i+1) + betas(i+1)*ps(i) */ + for(int im = 1; im < shifts; im++) { + betas[im] = betas[0]*zita[im]*alphas[im]/(zitam1[im]*alphas[0]); + assign_mul_add_mul_r(ps_mms_solver[2*im], solver_field[0], betas[im], zita[im], N); + assign_mul_add_mul_r(ps_mms_solver[2*im+1], solver_field[1], betas[im], zita[im], N); + } + } + etime = gettime(); + g_sloppy_precision = 0; + if(iteration == solver_pm->max_iter -1) iteration = -1; + else iteration++; + if(g_debug_level > 0 && g_proc_id == 0) { + printf("# CGMMS (%d shifts): iter: %d eps_sq: %1.4e %1.4e t/s\n", solver_pm->no_shifts, iteration, solver_pm->squared_solver_prec, etime - atime); + } + + finalize_solver(solver_field, 2*nr_sf); + return(iteration); +} + + +static unsigned int ini_mms_nd = 0; +static unsigned int nr_nd = 0; + +static void init_mms_tm_nd(const unsigned int _nr, const unsigned int N) { + if(ini_mms_nd == 0 || _nr > nr_nd) { + if(nr_nd != 0) { + free_mms_tm_nd(); + } + nr_nd = _nr; + + sigma = (double*)calloc((nr_nd), sizeof(double)); + zitam1 = (double*)calloc((nr_nd), sizeof(double)); + zita = (double*)calloc((nr_nd), sizeof(double)); + alphas = (double*)calloc((nr_nd), sizeof(double)); + betas = 
(double*)calloc((nr_nd), sizeof(double)); + + ps_qmms = (spinor*)calloc(N*(2*nr_nd)+1,sizeof(spinor)); + ps_mms_solver = (spinor**)calloc((2*nr_nd)+1,sizeof(spinor*)); + + for(int i = 0; i < 2*nr_nd; i++) { + ps_mms_solver[i]=(spinor*)(((unsigned long int)(ps_qmms)+ALIGN_BASE)&~ALIGN_BASE) + i*N; + } + ini_mms_nd = 1; + } +} + +static void free_mms_tm_nd() { + free(sigma); + free(zitam1); + free(zita); + free(alphas); + free(betas); + free(ps_qmms); + free(ps_mms_solver); + nr_nd = 0; + ini_mms_nd = 0; + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_mms_tm_nd.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_mms_tm_nd.h new file mode 100644 index 0000000000000000000000000000000000000000..1c767d903c94691a4d9f92bdc73fc780f1cfd4cd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cg_mms_tm_nd.h @@ -0,0 +1,34 @@ +/*********************************************************************** + * + * + * Copyright (C) 2004 Andrea Shindler + * 2009 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + ***********************************************************************/ + +#ifndef _CG_MMS_TM_ND_H +#define _CG_MMS_TM_ND_H + +#include"su3.h" +#include"solver.h" + +int cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn, + spinor * const Qup, spinor * const Qdn, + solver_pm_t * solver_pm); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cgs_real.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cgs_real.c new file mode 100644 index 0000000000000000000000000000000000000000..d55c385a02809505db5f0221ab18c425d978f497 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/cgs_real.c @@ -0,0 +1,114 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "su3.h" +#include "global.h" +#include "linalg_eo.h" +#include "start.h" +#include "linalg/mul_diff_mul_r.h" +#include "linalg/assign_add_mul_add_mul_r.h" +#include "solver/matrix_mult_typedef.h" +#include "solver_field.h" +#include "cgs_real.h" + + +/* P inout (guess for the solving spinor) + Q input +*/ + +int cgs_real(spinor * const P, spinor * const Q, const int max_iter, + double eps_sq, const int rel_prec, const int N, matrix_mult f) { + static double alpha, beta,rjr0,nom,denom,one; + static double res_sq, squarenorm; + int i; + spinor ** solver_field = NULL; + const int nr_sf = 6; + + if(N == VOLUME) { + init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); + } + else { + init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); + } + one=1.; + /* Initialisierung der sf-Felder */ + f(solver_field[0],P); + diff(solver_field[0],Q,solver_field[0], N); /* residual in sf0 */ + assign(solver_field[1],solver_field[0], N); + assign(solver_field[2],solver_field[0], N); + assign(solver_field[5],solver_field[0], N); /* ri=pi=ui=r0 */ + squarenorm = square_norm(Q, N, 1); + + /* loop! */ + for(i=0;i<=max_iter;i++) { + res_sq=square_norm(solver_field[0], N, 1); + if(g_proc_id == g_stdio_proc && g_debug_level > 2) { + printf("%d\t%g\n",i,res_sq); + fflush( stdout ); + } + rjr0 = scalar_prod_r(solver_field[0], solver_field[5], N, 1); + /* square_and_prod(&res_sq,&rjr0,solver_field[0],solver_field[5]); */ + if(((res_sq. 
+ ***********************************************************************/ + + +#ifndef _CGS_REAL_H +#define _CGS_REAL_H + +#include"solver/matrix_mult_typedef.h" +#include"su3.h" + +int cgs_real(spinor * const, spinor * const, const int max_iter, double eps_sq, + const int rel_prec, const int N, matrix_mult f); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/chrono_guess.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/chrono_guess.c new file mode 100644 index 0000000000000000000000000000000000000000..a2ecfd7f95c03c5ba34cd9a63cb7b8d0a98645c9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/chrono_guess.c @@ -0,0 +1,172 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "start.h" +#include "linalg_eo.h" +#include "solver/matrix_mult_typedef.h" +#include "solver/lu_solve.h" +#include "solver/chrono_guess.h" + + +/* N is the number of vectors to be stored maximally */ +/* _n is the last added vector */ +/* index_array holds the indices of all the vectors */ +/* to avoid copying things around */ +/* V is the volume */ +/* trial is the vector to be added */ +/* v is the array of vectors */ + +void chrono_add_solution(spinor * const trial, spinor ** const v, int index_array[], + const int N, int * _n, const int V) { + + double norm = 0.; + int i; + + if(N > 0) { + if(g_proc_id == 0 && g_debug_level > 1) { + printf("# CSG: adding vector %d to the list of length %d\n", (*_n)+1, N); + fflush(stdout); + } + if((*_n) < N) { + index_array[(*_n)] = *_n; + (*_n)= (*_n)+1; + /* normalise vector */ + norm = sqrt(square_norm(trial, V, 1)); + mul_r(v[index_array[(*_n)-1]], 1/norm, trial, V); + } + else { + /* Reorder the index_array */ + /* Keep most recent first */ + for(i = 1; i < N; i++) { + index_array[i-1] = index_array[i]; + } + index_array[N-1] = (index_array[N-2]+1)%N; + /* and normalise */ + norm = sqrt(square_norm(trial, V, 1)); + mul_r(v[index_array[N-1]], 1/norm, trial, V); + } + } + + return; +} + +/* index_array, _N, _n, V as explained above */ +/* trial is the guess vector to be returned */ +/* phi is the right hand side of A*x = b to be */ +/* solved */ + +int chrono_guess(spinor * const trial, spinor * const phi, spinor ** const v, int index_array[], + const int _N, const int _n, const int V, matrix_mult f) { + int info = 0; + int i, j, N=_N, n=_n; + _Complex double s; + static int init_csg = 0; + static _Complex double *bn = NULL; + static _Complex double *G = NULL; + int max_N = 20; + + if(N > 0) { + if(g_proc_id == 0 && g_debug_level 
> 1) { + printf("# CSG: preparing trial vector \n"); + fflush(stdout); + } + if(init_csg == 0) { + init_csg = 1; + bn = (_Complex double*) malloc(max_N*sizeof(_Complex double)); + G = (_Complex double*) malloc(max_N*max_N*sizeof(_Complex double)); + } + + /* Construct an orthogonal basis */ + for(j = n-1; j > n-2; j--) { + for(i = j-1; i > -1; i--) { + s = scalar_prod(v[index_array[j]], v[index_array[i]], V, 1); + assign_diff_mul(v[index_array[i]], v[index_array[j]], s, V); + if(g_debug_level > 2) { + s = scalar_prod(v[index_array[i]], v[index_array[j]], V, 1); + if(g_proc_id == 0) { + printf("# CSG: <%d,%d> = %e +i %e \n", i, j, creal(s), cimag(s));fflush(stdout); + } + } + } + } + + /* Generate "interaction matrix" V^\dagger f V */ + /* We assume that f is hermitian */ + /* Generate also the right hand side */ + + for (j = 0; j < n; j++){ + f(trial, v[index_array[j]]); + + /* Only the upper triangular part is stored */ + for(i = 0; i < j+1; i++){ + G[i*N + j] = scalar_prod(v[index_array[i]], trial, V, 1); + if(j != i) { + (G[j*N + i]) = conj(G[i*N + j]); + } + if(g_proc_id == 0 && g_debug_level > 2) { + printf("# CSG: G[%d*N + %d]= %e + i %e \n", i, j, creal(G[i*N + j]), cimag(G[i*N + j])); + fflush(stdout); + } + } + /* The right hand side */ + bn[j] = scalar_prod(v[index_array[j]], phi, V, 1); + } + + /* Solver G y = bn for y and store it in bn */ + LUSolve(n, G, N, bn); + + /* Construct the new guess vector */ + if(info == 0) { + mul(trial, bn[n-1], v[index_array[n-1]], V); + if(g_proc_id == 0 && g_debug_level > 2) { + printf("# CSG: bn[%d] = %f %f\n", index_array[n-1], creal(bn[index_array[n-1]]), cimag(bn[index_array[n-1]])); + } + for(i = n-2; i > -1; i--) { + assign_add_mul(trial, v[index_array[i]], bn[i], V); + if(g_proc_id == 0 && g_debug_level > 2) { + printf("# CSG: bn[%d] = %f %f\n", index_array[i], creal(bn[index_array[i]]), cimag(bn[index_array[i]])); + } + } + } + else { + assign(trial, phi, V); + } + + if(g_proc_id == 0 && g_debug_level > 1) { + 
printf("# CSG: done! n= %d N=%d \n", n, N);fflush(stdout); + } + } + else { + if(g_proc_id == 0 && g_debug_level > 2) { + printf("# CSG: using zero trial vector \n"); + fflush(stdout); + } + zero_spinor_field(trial, V); + } + + return(info); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/chrono_guess.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/chrono_guess.h new file mode 100644 index 0000000000000000000000000000000000000000..e561e7ec79b5f0f6a2547aca0ae0ea811c8a235e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/chrono_guess.h @@ -0,0 +1,31 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _CHRONO_GUESS_H +#define _CHRONO_GUESS_H + +#include "solver/matrix_mult_typedef.h" + +void chrono_add_solution(spinor * const trial, spinor ** const v, int index_array[], + const int _N, int * _n, const int V); + +int chrono_guess(spinor * const trial, spinor * const phi, spinor ** const v, int index_array[], + const int N, const int n, const int V, matrix_mult f); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/dfl_projector.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/dfl_projector.c new file mode 100644 index 0000000000000000000000000000000000000000..eb79f2467746d9cdf2250252931a33a3ed277658 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/dfl_projector.c @@ -0,0 +1,1036 @@ +/*********************************************************************** + * + * Copyright (C) 2008 Alber Deuzeman, Siebren Reker, Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include "start.h" +#include +#include "block.h" +#include "linalg/blas.h" +#include "operator/D_psi.h" +#include "operator/Hopping_Matrix.h" +#include "little_D.h" +#include "block.h" +#include "linalg_eo.h" +#include "gcr4complex.h" +#include "generate_dfl_subspace.h" +#include "operator/tm_operators.h" +#include "boundary.h" +#include "Msap.h" +#include "mr.h" +#include "solver_field.h" +#include "dfl_projector.h" + +double dfl_little_D_prec = 1.e-24; +int dfl_sloppy_prec = 0; +int init_dfl_projector = 0; +spinor **psi; +_Complex double *inprod; +_Complex double *inprod_eo; +_Complex double *inprod_o; +_Complex double *inprod_e; +_Complex double *invvec; +_Complex double *invvec_eo; +_Complex double *ctmp; +_Complex double *work_block; +const int dfl_work_size = 16; +_Complex double *work[16]; + +static void alloc_dfl_projector(); + +/* Break up full volume spinor to blocks + * loop over block.basis + * compute inner product and store as _Complex double vector + * compute A^-1 * _Complex double vector + * loop over block.basis + * compute sum of basis vectors times _Complex double element + * create global vector */ + +/* this is phi_k A^{-1}_{kl} (phi_k, in) */ +void project(spinor * const out, spinor * const in) { + int i,j, i_e, i_o, iter; + int evenodd = 0; + int usePL = 0; + int vol = block_list[0].volume; + _Complex double * v, * w; + double prec; + + if(init_dfl_projector == 0) { + alloc_dfl_projector(); + } + v = work[0]; + w = work[1]; + /*initialize the local (block) parts of the spinor*/ + split_global_field_GEN(psi, in, nb_blocks); + + for (j = 0; j < g_N_s*nb_blocks*9; j++) { + (inprod[j]) = 0.0; + (inprod_o[j]) = 0.0; + (inprod_eo[j]) = 0.0; + (inprod_e[j]) = 0.0; + (invvec[j]) = 0.0; + (invvec_eo[j]) = 0.0; + (ctmp[j]) = 0.0; + (w[j]) = 
0.0; + (v[j]) = 0.0; + } + + for (j = 0; j < g_N_s; j++) {/*loop over block.basis */ + i_o=0; + i_e=0; + for(i = 0; i < nb_blocks; i++) { + inprod[j + i*g_N_s] = scalar_prod(block_list[i].basis[j], psi[i], vol, 0); + if(evenodd) { + if (block_list[i].evenodd==0) { + inprod_eo[j + i_e*g_N_s] = inprod[j + i*g_N_s]; + i_e++; + } + if (block_list[i].evenodd==1) { + inprod_eo[j + nb_blocks*g_N_s/2+i_o*g_N_s] = inprod[j + i*g_N_s]; + i_o++; + } + } + } + } + + if(evenodd) { + little_D_ee_inv(inprod_e,inprod_eo); + little_D_hop(1,inprod_o, inprod_e); + little_Dhat_rhs(1,inprod_o,-1,inprod_eo); + } + + + /* if(dfl_sloppy_prec) prec = dfl_little_D_prec;*/ + if(dfl_sloppy_prec) prec = 1.e-12; + else prec = 1.e-24; + + + + if(!usePL) { + if(evenodd) { + iter = gcr4complex(invvec_eo,inprod_o,10,1000,prec,1,nb_blocks*g_N_s,1,nb_blocks*9*g_N_s,&little_D_sym); + + little_D_hop(0,ctmp, invvec_eo); + little_D_ee_inv(invvec_eo,ctmp); + little_Dhat_rhs(0,invvec_eo, -1., inprod_e); + + for (j = 0; j < g_N_s; j++) { + i_o=0; + i_e=0; + for(i = 0; i < nb_blocks; i++) { + if (block_list[i].evenodd==0) { + invvec[j + i*g_N_s] = invvec_eo[j + i_e*g_N_s]; + i_e++; + } + if (block_list[i].evenodd==1) { + invvec[j + i*g_N_s] = invvec_eo[j + nb_blocks*g_N_s/2+i_o*g_N_s]; + i_o++; + } + } + } + if(g_proc_id == 0 && g_debug_level > 0) {/*CT: was "g_debug_level > -1" */ + printf("lgcr evenodd number of iterations %d (no P_L)\n", iter); + } + } + else { + iter = gcr4complex(invvec, inprod, 10, 1000, prec, 1, nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, &little_D); + if(g_proc_id == 0 && g_debug_level > 0) {/*CT: was "g_debug_level > -1" */ + printf("lgcr number of iterations %d (no P_L)\n", iter); + } + } + } + else { + if(evenodd) { + little_P_L_sym(v, inprod_o); + iter = gcr4complex(w, v, 10, 1000, prec, 1, nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, &little_P_L_D_sym); + little_P_R_sym(v, w); +/* little_project(w, inprod_o, g_N_s);*/ + little_project_eo(w,inprod_o,g_N_s); + for(i = 0; i < 
nb_blocks*g_N_s; ++i) + invvec_eo[i] = w[i] + v[i]; + little_D_hop(0,ctmp, invvec_eo); + little_D_ee_inv(invvec_eo,ctmp); + little_Dhat_rhs(0,invvec_eo, -1., inprod_e); + for (j = 0; j < g_N_s; j++) { + i_o=0; + i_e=0; + for(i = 0; i < nb_blocks; i++){ + if (block_list[i].evenodd==0) { + invvec[j + i*g_N_s] = invvec_eo[j + i_e*g_N_s]; + i_e++; + } + if (block_list[i].evenodd==1) { + invvec[j + i*g_N_s] = invvec_eo[j + nb_blocks*g_N_s/2+i_o*g_N_s]; + i_o++; + } + } + } + if(g_proc_id == 0 && g_debug_level > 0) {/*CT: was "g_debug_level > -1" */ + printf("lgcr even/odd number of iterations %d (using P_L)\n", iter); + } + } + else { + little_P_L(v, inprod); + iter = gcr4complex(w, v, 10, 1000, prec, 1, nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, &little_P_L_D); + little_P_R(v, w); + little_project(w, inprod, g_N_s); + for(i = 0; i < nb_blocks*g_N_s; ++i) + invvec[i] = w[i] + v[i]; + if(g_proc_id == 0 && g_debug_level > 0) {/*CT: was "g_debug_level > -1" */ + printf("lgcr number of iterations %d (using P_L)\n", iter); + } + } + } + /* sum up */ + for(i = 0 ; i < nb_blocks ; i++) { + mul(psi[i], invvec[i*g_N_s], block_list[i].basis[0], vol); + } + for(j = 1; j < g_N_s; j++) { + for(i = 0 ; i < nb_blocks ; i++) { + assign_add_mul(psi[i], block_list[i].basis[j], invvec[i*g_N_s + j], vol); + } + } + + /* reconstruct global field */ + reconstruct_global_field_GEN(out, psi, nb_blocks); + free_dfl_projector(); + return; +} + +static void alloc_dfl_projector() { + int i; + + psi = calloc(2*nb_blocks, sizeof(spinor*)); /*block local version of global spinor */ + inprod = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */ + inprod_eo = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */ + inprod_o = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */ + inprod_e = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors 
with bases */ + ctmp = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */ + invvec = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */ + invvec_eo = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */ + work_block = calloc(dfl_work_size * nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + for(i = 0; i < dfl_work_size; ++i) { + work[i] = work_block + i * nb_blocks * 9 * g_N_s; + } + + /* no loop below because further down we also don't take this cleanly into account */ + psi[0] = calloc(nb_blocks*(block_list[0].volume + block_list[0].spinpad), sizeof(spinor)); + for(i = 1 ;i < nb_blocks ;i++) { + psi[i] = psi[i-1] + (block_list[0].volume + block_list[0].spinpad); + } + init_dfl_projector = 1; + return; +} + + +void free_dfl_projector() { + free(*psi); + free(psi); + free(invvec); + free(invvec_eo); + free(inprod); + free(inprod_eo); + free(inprod_e); + free(inprod_o); + free(ctmp); + free(work_block); + init_dfl_projector = 0; + return; +} + +/* this is phi_k (phi_k, in) */ +void project2(spinor * const out, spinor * const in) { + int i, j; + int vol = block_list[0].volume; + + if(init_dfl_projector == 0) { + alloc_dfl_projector(); + } + /*initialize the local (block) parts of the spinor*/ + split_global_field_GEN(psi, in, nb_blocks); + + /* compute inner product */ + for (j = 0; j < g_N_s; j++) { + /*loop over block.basis */ + for(i = 0 ; i < nb_blocks ; i++) inprod[j + i*g_N_s] = scalar_prod(block_list[i].basis[j], psi[i], vol, 0); + } + + /* sum up */ + for(i = 0 ; i < nb_blocks ; i++) mul(psi[i], inprod[i*g_N_s], block_list[i].basis[0], vol); + for(j = 1; j < g_N_s; j++) { + for(i = 0 ; i < nb_blocks ; i++) assign_add_mul(psi[i], block_list[i].basis[j], inprod[i*g_N_s + j], vol); + } + + /* reconstruct global field */ + reconstruct_global_field_GEN(out, psi, nb_blocks); + return; +} + +void project_left(spinor * const out, 
spinor * const in) { + /* out = P_L in = in - D proj in */ + + project(out, in); + D_psi(g_spinor_field[DUM_MATRIX], out); + diff(out, in, g_spinor_field[DUM_MATRIX], VOLUME); + return; +} + +void project_right(spinor * const out, spinor * const in) { + /* out = P_R in = in - proj D in */ + + D_psi(out, in); + project(g_spinor_field[DUM_MATRIX], out); + diff(out, in, g_spinor_field[DUM_MATRIX], VOLUME); + return; +} + +void project_left_D(spinor * const out, spinor * const in) { + /* out = P_L D in = D in - D proj D in*/ + D_psi(g_spinor_field[DUM_MATRIX+1], in); + project_left(out, g_spinor_field[DUM_MATRIX+1]); + return; +} + +void D_project_right(spinor * const out, spinor * const in) { + project_right(g_spinor_field[DUM_MATRIX+1], in); + D_psi(out, g_spinor_field[DUM_MATRIX+1]); + return; +} + + +/* out = |phi_k> A^{-1}_kl */ +void little_project(_Complex double * const out, _Complex double * const in, const int N) { + int i, j; + static _Complex double *phi; + static _Complex double *psi; + + if(init_dfl_projector == 0) { + alloc_dfl_projector(); + } + + phi = work[2]; + psi = work[3]; + + /* NOTE IS THIS REALLY NECESSARY/CORRECT? 
*/ + for(i = 0; i < N; i++) { + phi[i] = lscalar_prod(little_dfl_fields[i], in, nb_blocks*N, 0); + } + +#ifdef MPI + MPI_Allreduce(phi, psi, N, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD); +#else + memcpy(psi, phi, N*sizeof(_Complex double)); +#endif + + /* apply inverse of little_A */ + for(i = 0; i < N; i++) { + (phi[i]) = 0.0; + for(j = 0; j < N; j++) { + (phi[i]) += (little_A[j*N + i]) * (psi[j]); + } + } + + lmul(out, phi[0], little_dfl_fields[0], nb_blocks*N); + for(i = 1; i < N; i++) { + lassign_add_mul(out, little_dfl_fields[i], phi[i], nb_blocks*N); + } + return; +} + +void little_project_eo(_Complex double * const out, _Complex double * const in, const int N) { + int i, j; + static _Complex double *phi; + static _Complex double *psi; + + if(init_dfl_projector == 0) { + alloc_dfl_projector(); + } + + phi = work[2]; + psi = work[3]; + + /* NOTE IS THIS REALLY NECESSARY/CORRECT? */ + for(i = 0; i < N; i++) { + phi[i] = lscalar_prod(little_dfl_fields_eo[i], in, nb_blocks*N, 0); + } + +#ifdef MPI + MPI_Allreduce(phi, psi, N, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD); +#else + memcpy(psi, phi, N*sizeof(_Complex double)); +#endif + + /* apply inverse of little_A_eo */ + for(i = 0; i < N; i++) { + (phi[i]) = 0.0; + for(j = 0; j < N; j++) { + (phi[i]) += (little_A_eo[j*N + i]) * (psi[j]); + } + } + + lmul(out, phi[0], little_dfl_fields_eo[0], nb_blocks*N); + for(i = 1; i < N; i++) { + lassign_add_mul(out, little_dfl_fields_eo[i], phi[i], nb_blocks*N); + } + return; +} + + +void little_project2(_Complex double * const out, _Complex double * const in, const int N) { + int i; + static _Complex double *phi; + static _Complex double *psi; + + if(init_dfl_projector == 0) {alloc_dfl_projector();} + phi = work[4]; + psi = work[5]; + + for(i = 0; i < N; i++) { + phi[i] = lscalar_prod(little_dfl_fields[i], in, nb_blocks*N, 0); + } +#ifdef MPI + MPI_Allreduce(phi, psi, g_N_s, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD); +#else + memcpy(psi, phi, g_N_s*sizeof(_Complex 
double)); +#endif + + lmul(out, psi[0], little_dfl_fields[0], nb_blocks*g_N_s); + for(i = 1; i < N; i++) { + lassign_add_mul(out, little_dfl_fields[i], psi[i], nb_blocks*g_N_s); + } + + return; +} + + +void little_P_L(_Complex double * const out, _Complex double * const in) { + if(init_dfl_projector == 0) {alloc_dfl_projector();} + little_project(out, in, g_N_s); + little_D(work[6], out); + ldiff(out, in, work[6], nb_blocks*g_N_s); + return; +} + +void little_P_R(_Complex double * const out, _Complex double * const in) { + if(init_dfl_projector == 0) {alloc_dfl_projector();} + little_D(out, in); + little_project(work[7], out, g_N_s); + ldiff(out, in, work[7], nb_blocks*g_N_s); + return; +} + +void little_P_L_sym(_Complex double * const out, _Complex double * const in) { + if(init_dfl_projector == 0) {alloc_dfl_projector();} +/* little_project(out, in, g_N_s);*/ + little_project_eo(out,in,g_N_s); + little_D_sym(work[13], out); + ldiff(out, in, work[13], nb_blocks*g_N_s); + return; +} + +void little_P_R_sym(_Complex double * const out, _Complex double * const in) { + if(init_dfl_projector == 0) {alloc_dfl_projector();} + little_D_sym(out, in); +/* little_project(work[14], out, g_N_s);*/ + little_project_eo(work[14],out,g_N_s); + ldiff(out, in, work[14], nb_blocks*g_N_s); + return; +} + +void little_P_L_D(_Complex double * const out, _Complex double * const in) { + if(init_dfl_projector == 0) {alloc_dfl_projector();} + little_D(work[8], in); + little_P_L(out, work[8]); + return; +} + +void little_P_L_D_sym(_Complex double * const out, _Complex double * const in) { + if(init_dfl_projector == 0) {alloc_dfl_projector();} + little_D_sym(work[15], in); + little_P_L_sym(out, work[15]); + return; +} + +void little_D_P_R(_Complex double * const out, _Complex double * const in) { + if(init_dfl_projector == 0) {alloc_dfl_projector();} + little_P_R(work[9], in); + little_D(out, work[9]); + return; +} + + +int check_projectors(const int repro) { + double nrm = 0.; + int i,j; + 
spinor **phi; + spinor **wphi; + _Complex double *v; + spinor ** work_fields = NULL; + const int nr_wf = 4; + + init_solver_field(&work_fields, VOLUMEPLUSRAND, nr_wf); + phi = malloc(nb_blocks*sizeof(spinor *)); + wphi = malloc(nb_blocks*sizeof(spinor *)); + + random_spinor_field_lexic(work_fields[0], repro, RN_GAUSS); + nrm = square_norm(work_fields[0], VOLUME, 1); + if(g_cart_id == 0) { + printf("\nNow we check the DFL projection routines!\n\n"); + printf("||psi|| = %1.5e\n", sqrt(nrm)); + } + + + + /* Check generalized split/reconstruct */ + phi[0] = calloc(VOLUME + nb_blocks, sizeof(spinor)); + for(j = 1; j < nb_blocks; j++) { + phi[j] = phi[j-1] + (VOLUME/nb_blocks + 1); + } + split_global_field_GEN(phi, work_fields[0],nb_blocks); + reconstruct_global_field_GEN(work_fields[1],phi,nb_blocks); + diff(work_fields[2], work_fields[0], work_fields[1], VOLUME); + nrm = square_norm(work_fields[2], VOLUME, 1); + if(g_cart_id == 0) { + printf("||psi_orig - psi_recon|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + /* Check even/odd split reconstruct */ + assign(work_fields[3], work_fields[0], VOLUME); + copy_global_to_block_eo(work_fields[1], work_fields[2], work_fields[0], 0); + copy_block_eo_to_global(work_fields[3], work_fields[1], work_fields[2], 0); + diff(work_fields[2], work_fields[0], work_fields[3], VOLUME); + nrm = square_norm(work_fields[2], VOLUME, 1); + if(g_cart_id == 0) { + printf("even/odd split: ||psi_orig - psi_recon|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + project2(work_fields[1], work_fields[0]); + project2(work_fields[2], work_fields[1]); + diff(work_fields[3], work_fields[1], work_fields[2], VOLUME); + nrm = square_norm(work_fields[3], VOLUME, 1); + if(g_cart_id == 0) { + printf("||P2 psi - P2 P2 psi|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + project_left_D(work_fields[1], work_fields[0]); + D_project_right(work_fields[2], work_fields[0]); + diff(work_fields[3], work_fields[2], work_fields[1], VOLUME); + nrm = 
square_norm(work_fields[3], VOLUME, 1); + if(g_cart_id == 0) { + printf("||P_L D psi - D P_R psi|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + project_left(work_fields[1], work_fields[0]); + project_left(work_fields[2], work_fields[1]); + diff(work_fields[3], work_fields[2], work_fields[1], VOLUME); + nrm = square_norm(work_fields[3], VOLUME, 1); + if(g_cart_id == 0) { + printf("||P_L^2 psi - P_L psi|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + project_right(work_fields[1], work_fields[0]); + project_right(work_fields[2], work_fields[1]); + diff(work_fields[3], work_fields[2], work_fields[1], VOLUME); + nrm = square_norm(work_fields[3], VOLUME, 1); + if(g_cart_id == 0) { + printf("||P_R^2 psi - P_R psi|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + project_left(work_fields[1], work_fields[0]); + project2(work_fields[2], work_fields[1]); + nrm = square_norm(work_fields[2], VOLUME, 1); + if(g_cart_id == 0) { + printf("||P P_L psi|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + project2(work_fields[1], work_fields[0]); + project_right(work_fields[2], work_fields[1]); + nrm = square_norm(work_fields[2], VOLUME, 1); + if(g_cart_id == 0) { + printf("||P_R P psi|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + project2(work_fields[1], work_fields[0]); + project(work_fields[2], work_fields[1]); + D_psi(work_fields[3], work_fields[2]); + project2(work_fields[2], work_fields[3]); + diff(work_fields[3], work_fields[2], work_fields[1], VOLUME); + nrm = square_norm(work_fields[3], VOLUME, 1); + if(g_cart_id == 0) { + printf("||P D A^-1 P psi - P psi|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + project2(work_fields[1], work_fields[0]); + D_psi(work_fields[2], work_fields[1]); + project(work_fields[3], work_fields[2]); + project2(work_fields[2], work_fields[3]); + diff(work_fields[3], work_fields[2], work_fields[1], VOLUME); + nrm = square_norm(work_fields[3], VOLUME, 1); + if(g_cart_id == 0) { + printf("||P A^-1 D P psi - P psi|| = %1.5e\n", 
sqrt(nrm)); + fflush(stdout); + } + + invert_little_D_spinor(work_fields[1], work_fields[0]); + project2(work_fields[2], work_fields[1]); + D_psi(work_fields[3], work_fields[2]); + project2(work_fields[2], work_fields[3]); + project2(work_fields[1], work_fields[0]); + diff(work_fields[3], work_fields[2], work_fields[1], VOLUME); + nrm = square_norm(work_fields[3], VOLUME, 1); + if(g_cart_id == 0) { + printf("||P D P (P D P)^-1 psi - P psi|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + + invert_little_D_spinor(work_fields[1], work_fields[0]); + invert_little_D_eo_spinor(work_fields[2], work_fields[0]); + diff(work_fields[3], work_fields[1], work_fields[2], VOLUME); + nrm = square_norm(work_fields[3], VOLUME, 1); + if(g_cart_id == 0) { + printf("||A^-1 psi - A^-1_eo psi|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + + invert_little_D_spinor(work_fields[1], work_fields[0]); + apply_little_D_spinor(work_fields[2], work_fields[1]); + project2(work_fields[3], work_fields[0]); + diff(work_fields[1], work_fields[3], work_fields[2], VOLUME); + nrm = square_norm(work_fields[1], VOLUME, 1); + if(g_cart_id == 0) { + printf("||A A^-1 psi - P psi|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + invert_little_D_spinor(work_fields[1], work_fields[0]); + apply_little_D_spinor(work_fields[2], work_fields[1]); + project2(work_fields[3], work_fields[0]); + project2(work_fields[1], work_fields[2]); + diff(work_fields[2], work_fields[3], work_fields[1], VOLUME); + nrm = square_norm(work_fields[2], VOLUME, 1); + if(g_cart_id == 0) { + printf("||P A A^-1 psi - P psi|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + + /* Different flavours for kappa != 0. 
First project to only a single block */ + for (j = 0; j < (VOLUME * sizeof(spinor) / sizeof(_Complex double)); ++j){ + ((_Complex double*)work_fields[1])[j] = 0.; + ((_Complex double*)work_fields[2])[j] = 0.; + } + + if (!g_cart_id){ + wphi[0] = block_list[0].basis[0]; + for(i = 1; i< nb_blocks; i++) { + wphi[i] = work_fields[2]; + } + reconstruct_global_field_GEN(work_fields[1], wphi, nb_blocks); + } + apply_little_D_spinor(work_fields[3], work_fields[1]); + D_psi(work_fields[2], work_fields[1]); + + if (g_cart_id == 0 && g_debug_level > 4){ + v = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + split_global_field_GEN(phi, work_fields[2], nb_blocks); + + for (j = 0; j < g_N_s; ++j) { + for(i = 0; i < nb_blocks; i++) { + v[j + i*g_N_s] = scalar_prod(block_list[i].basis[j], phi[i], VOLUME/nb_blocks, 0); + } + } + + for (j = 0; j < nb_blocks* g_N_s; ++j) { + printf("AFTER D: w[%u] = %1.5e + %1.5e i\n", j, creal(v[j]), cimag(v[j])); + } + free(v); + } + + project2(work_fields[1], work_fields[2]); + + + diff(work_fields[2], work_fields[3], work_fields[1], VOLUME); + nrm = square_norm(work_fields[2], VOLUME, 1); + if(g_proc_id == 0) { + printf("||(P D - A) phi_i || = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + reconstruct_global_field_GEN_ID(work_fields[1], block_list, 0, nb_blocks); + apply_little_D_spinor(work_fields[3], work_fields[1]); + D_psi(work_fields[2], work_fields[1]); + if (!g_proc_id && g_debug_level > 4){ + v = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); + split_global_field_GEN(phi, work_fields[2],nb_blocks); + for (j = 0; j < g_N_s; ++j) + for(i = 0; i < nb_blocks; i++) + v[j + i*g_N_s] = scalar_prod(block_list[i].basis[j], phi[i], VOLUME/nb_blocks, 0); + for (j = 0; j < nb_blocks* g_N_s; ++j) { + printf("AFTER D: w[%u] = %1.5e + %1.5e i\n", j, creal(v[j]), cimag(v[j])); + } + free(v); + } + project2(work_fields[1], work_fields[2]); + diff(work_fields[2], work_fields[3], work_fields[1], VOLUME); + nrm = 
square_norm(work_fields[2], VOLUME, 1); + if(g_proc_id == 0) { + printf("||(P D - A) phi || = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + apply_little_D_spinor(work_fields[3], work_fields[0]); + project2(work_fields[1], work_fields[0]); + D_psi(work_fields[2], work_fields[1]); + project2(work_fields[1], work_fields[2]); + diff(work_fields[2], work_fields[3], work_fields[1], VOLUME); + nrm = square_norm(work_fields[2], VOLUME, 1); + if(g_proc_id == 0 && g_debug_level > 4) { + printf("||P D P psi - A psi|| = %1.5e\n", sqrt(nrm)); + printf("\n*** Comparison of the leading spinor components ***\n"); + printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s0.c0), creal(work_fields[3]->s0.c0)); + printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s0.c0), cimag(work_fields[3]->s0.c0)); + printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s0.c1), creal(work_fields[3]->s0.c1)); + printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s0.c1), cimag(work_fields[3]->s0.c1)); + printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s0.c2), creal(work_fields[3]->s0.c2)); + printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s0.c2), cimag(work_fields[3]->s0.c2)); + printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s1.c0), creal(work_fields[3]->s1.c0)); + printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s1.c0), cimag(work_fields[3]->s1.c0)); + printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s1.c1), creal(work_fields[3]->s1.c1)); + printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s1.c1), cimag(work_fields[3]->s1.c1)); + printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s1.c2), creal(work_fields[3]->s1.c2)); + printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s1.c2), cimag(work_fields[3]->s1.c2)); + printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s2.c0), creal(work_fields[3]->s2.c0)); + printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s2.c0), cimag(work_fields[3]->s2.c0)); + printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s2.c1), creal(work_fields[3]->s2.c1)); + printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s2.c1), 
cimag(work_fields[3]->s2.c1)); + printf("%1.5e\t%1.5e\n", creal(work_fields[1]->s2.c2), creal(work_fields[3]->s2.c2)); + printf("%1.5e\t%1.5e\n", cimag(work_fields[1]->s2.c2), cimag(work_fields[3]->s2.c2)); + printf("*** End of dump ***\n\n"); + fflush(stdout); + } + + /* check little projectors now */ + if(g_cart_id == 0) { + printf("\nNow the little little projection routines\n\n"); + } + if(init_dfl_projector == 0) { + alloc_dfl_projector(); + } + + memcpy(work[10], work_fields[0], nb_blocks*g_N_s*sizeof(_Complex double)); + little_project2(work[11], work[10], g_N_s); + little_project2(work[12], work[11], g_N_s); + ldiff(work[12], work[12], work[11], nb_blocks*g_N_s); + nrm = lsquare_norm(work[12], nb_blocks*g_N_s, 1); + if(g_cart_id == 0) { + printf("||lP2 v - lP2 lP2 v|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + little_P_L_D(work[11], work[10]); + little_P_L_D(work[12], work[10]); + ldiff(work[12], work[12], work[11], nb_blocks*g_N_s); + nrm = lsquare_norm(work[12], nb_blocks*g_N_s, 1); + if(g_cart_id == 0) { + printf("||lP_L lD v - lP_L lD v|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + little_P_L_D(work[11], work[10]); + little_D_P_R(work[12], work[10]); + ldiff(work[12], work[12], work[11], nb_blocks*g_N_s); + nrm = lsquare_norm(work[12], nb_blocks*g_N_s, 1); + if(g_cart_id == 0) { + printf("||lP_L lD v - lD lP_R v|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + little_P_R(work[11], work[10]); + little_P_R(work[12], work[11]); + ldiff(work[12], work[12], work[11], nb_blocks*g_N_s); + nrm = lsquare_norm(work[12], nb_blocks*g_N_s, 1); + if(g_cart_id == 0) { + printf("||lP_R^2 v - lP_R v|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + little_P_L(work[11], work[10]); + little_P_L(work[12], work[11]); + ldiff(work[12], work[12], work[11], nb_blocks*g_N_s); + nrm = lsquare_norm(work[12], nb_blocks*g_N_s, 1); + if(g_cart_id == 0) { + printf("||lP_L^2 v - lP_L v|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + free(phi[0]); + free(phi); + 
free(wphi); + finalize_solver(work_fields, nr_wf); + return(0); +} + +void check_little_D_inversion(const int repro) { + int i,j,ctr_t; + int contig_block = LZ / nb_blocks; + int vol = block_list[0].volume; + _Complex double *result, *v, *w; + double dif; + spinor ** work_fields = NULL; + const int nr_wf = 1; + + init_solver_field(&work_fields, VOLUMEPLUSRAND, nr_wf); + random_spinor_field_lexic(work_fields[0], repro, RN_GAUSS); + if(init_dfl_projector == 0) { + alloc_dfl_projector(); + } + v = work[11]; + w = work[12]; + + result = calloc(nb_blocks * 9 * g_N_s, sizeof(_Complex double)); /*inner product of spinors with bases */ + + /* no loop below because further down we also don't take this cleanly into account */ + + /*initialize the local (block) parts of the spinor*/ + for (ctr_t = 0; ctr_t < (VOLUME / LZ); ++ctr_t) { + for(i=0; i< nb_blocks; i++) { + memcpy(psi[i] + ctr_t * contig_block, work_fields[0] + (nb_blocks * ctr_t + i) * contig_block, contig_block * sizeof(spinor)); + } + } + for (i = 0; i < nb_blocks; ++i) {/* loop over blocks */ + /* compute inner product */ + for (j = 0; j < g_N_s; ++j) {/*loop over block.basis */ + /* inprod[j + i*g_N_s] = block_scalar_prod(block_list[i].basis[j], psi[i], vol); */ + inprod[j + i*g_N_s] = scalar_prod(psi[i], block_list[i].basis[j], vol, 0); + } + } + + if(1) { + gcr4complex(invvec, inprod, 10, 1000, 1.e-24, 0, nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, &little_D); + } + else { + little_P_L(v, inprod); + gcr4complex(w, v, 10, 1000, 1.e-24, 1, nb_blocks * g_N_s, 1, nb_blocks * 9 * g_N_s, &little_P_L_D); + little_P_R(v, w); + little_project(w, inprod, g_N_s); + for(i = 0; i < nb_blocks*g_N_s; ++i) + invvec[i] = w[i] + v[i]; + } + little_D(result, invvec); /* This should be a proper inverse now */ + + dif = 0.0; + for(ctr_t = 0; ctr_t < nb_blocks * g_N_s; ++ctr_t){ + dif += (creal(inprod[ctr_t]) - creal(result[ctr_t])) * (creal(inprod[ctr_t]) - creal(result[ctr_t])); + dif += (cimag(inprod[ctr_t]) - 
cimag(result[ctr_t])) * (cimag(inprod[ctr_t]) - cimag(result[ctr_t])); + } + dif = sqrt(dif); + + if (dif > 1e-8 * VOLUME){ + printf("[WARNING] check_little_D_inversion: deviation found of size %1.5e!\n", dif); + } +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + + if ((g_debug_level > 2) && !g_proc_id){ + printf("Inversion check on little_D\nStart:\n"); + for(ctr_t = 0; ctr_t < nb_blocks * g_N_s; ++ctr_t){ + printf("%1.9e + %1.9e I ", creal(inprod[ctr_t]), cimag(inprod[ctr_t])); + if (ctr_t == g_N_s - 1) + printf("\n"); + } + printf("\nInverted:\n"); + for(ctr_t = 0; ctr_t < nb_blocks * g_N_s; ++ctr_t){ + printf("%1.9e + %19e I ", creal(invvec[ctr_t]), cimag(invvec[ctr_t])); + if (ctr_t == g_N_s - 1 ) + printf("\n"); + } + printf("\nResult:\n"); + for(ctr_t = 0; ctr_t < nb_blocks * g_N_s; ++ctr_t){ + printf("%1.9e + %1.9e I ", creal(result[ctr_t]), cimag(result[ctr_t])); + if (ctr_t == g_N_s - 1) + printf("\n"); + } + printf("\n"); + } + + finalize_solver(work_fields, nr_wf); + free(result); + return; +} + +void check_local_D(const int repro) +{ + spinor * r[8]; + int j, vol = block_list[0].volume/2, i; + double nrm; + spinor ** work_fields = NULL; + const int nr_wf = 7; + + init_solver_field(&work_fields, VOLUMEPLUSRAND, nr_wf); + block_convert_lexic_to_eo(work_fields[0], work_fields[1], block_list[0].basis[0]); + block_convert_eo_to_lexic(work_fields[2], work_fields[0], work_fields[1]); + diff(work_fields[0], work_fields[2], block_list[0].basis[0], block_list[0].volume); + nrm = square_norm(work_fields[0], block_list[0].volume, 0); + if(g_proc_id == 0) { + printf("\nblock even/odd: ||psi - psi_recon|| = %1.5e\n", sqrt(nrm)); + fflush(stdout); + } + + for(j = 0; j < nb_blocks; j++) { + zero_spinor_field(work_fields[0], VOLUME); + Block_D_psi(&block_list[j], work_fields[6], block_list[j].basis[0]); + + /* Now test the block hopping matrix */ + /* split into even/odd sites */ + block_convert_lexic_to_eo(work_fields[0], work_fields[1], block_list[j].basis[0]); + 
+ /* Even sites */ + Block_H_psi(&block_list[j], g_spinor_field[DUM_DERI], work_fields[1], EO); + assign_mul_one_pm_imu(work_fields[2], work_fields[0], 1., vol); + assign_add_mul_r(work_fields[2], g_spinor_field[DUM_DERI], 1., vol); + + /* Odd sites */ + Block_H_psi(&block_list[j], g_spinor_field[DUM_DERI], work_fields[0], OE); + assign_mul_one_pm_imu(work_fields[3], work_fields[1], 1., vol); + assign_add_mul_r(work_fields[3], g_spinor_field[DUM_DERI], 1., vol); + + /* convert back to block spinor */ + block_convert_eo_to_lexic(work_fields[5], work_fields[2], work_fields[3]); + + if(g_proc_id == 0 && g_debug_level > 5) { + for(i = 0; i < block_list[0].volume; i++) { + if(fabs(creal(work_fields[6][i].s0.c0)) > 1.e-15 || fabs(creal(work_fields[5][i].s0.c0)) > 1.e-15) { + printf("%d %e %d\n", i, creal(work_fields[6][i].s0.c0), block_list[0].volume); + printf("%d %e\n", i, creal(work_fields[5][i].s0.c0)); + } + } + } + + diff(work_fields[4], work_fields[5], work_fields[6], block_list[0].volume); + nrm = square_norm(work_fields[4], block_list[0].volume, 0); + if(sqrt(nrm) > 1.e-12) { + printf("Check failed for local D against Hopping Matrix: ||delta|| = %1.5e block %d process %d\n", sqrt(nrm), j, g_proc_id); + } + } + /* check Msap and Msap_eo on a radom vector */ + random_spinor_field_lexic(work_fields[0], repro, RN_GAUSS); + zero_spinor_field(work_fields[1], VOLUME); + Msap(work_fields[1], work_fields[0], 2); + D_psi(work_fields[2], work_fields[1]); + diff(work_fields[3], work_fields[2], work_fields[0], VOLUME); + nrm = square_norm(work_fields[3], VOLUME, 1); + if(g_proc_id == 0) { + printf("Msap relaxed the residue to ||r||^2 = %1.5e\n", nrm); + } + + zero_spinor_field(work_fields[1], VOLUME); + Msap_eo(work_fields[1], work_fields[0], 2); + D_psi(work_fields[2], work_fields[1]); + diff(work_fields[3], work_fields[2], work_fields[0], VOLUME); + nrm = square_norm(work_fields[3], VOLUME, 1); + if(g_proc_id == 0) { + printf("Msap_eo relaxed the residue to ||r||^2 = 
%1.5e\n", nrm); + } + + for(j = 0; j < 6; j++) { + r[j] = work_fields[j]; + } + for(j = 0; j < nb_blocks; j++) { + + block_convert_lexic_to_eo(r[0], r[1], block_list[j].basis[0]); + /* check even/odd inversion for Block_D_psi*/ + /* varphi_e in r[2] */ + assign_mul_one_pm_imu_inv(r[2], r[0], +1., vol); + Block_H_psi(&block_list[j], r[3], r[2], OE); + /* a_odd = a_odd + b_odd */ + /* varphi_o in r[3] */ + assign_mul_add_r(r[3], -1., r[1], vol); + /* psi_o in r[1] */ + mrblk(r[1], r[3], 3, 1.e-31, 1, vol, &Mtm_plus_block_psi, j); + + Block_H_psi(&block_list[j], r[0], r[1], EO); + mul_one_pm_imu_inv(r[0], +1., vol); + /* a_even = a_even + b_even */ + /* check this sign +1 seems to be right in Msap_eo */ + assign_add_mul_r(r[2], r[0], -1., vol); + + block_convert_eo_to_lexic(r[4], r[2], r[1]); + + Block_D_psi(&block_list[j], r[5], r[4]); + diff(r[0], block_list[j].basis[0], r[5], block_list[j].volume); + nrm = square_norm(r[0], block_list[j].volume, 0); + if(g_proc_id == 0) { + printf("mr_eo, block=%d: ||r||^2 = %1.5e\n", j, nrm); + } + } + for(j = 0; j < nb_blocks; j++) { + block_convert_lexic_to_eo(r[0], r[1], block_list[j].basis[0]); + /* check even/odd inversion for Block_D_psi*/ + /* varphi_e in r[2] */ + assign_mul_one_pm_imu_inv(r[2], r[0], +1., vol); + Block_H_psi(&block_list[j], r[3], r[2], OE); + /* a_odd = a_odd + b_odd */ + /* varphi_o in r[3] */ + assign_mul_add_r(r[3], -1., r[1], vol); + /* psi_o in r[1] */ + mul_one_pm_imu_inv(r[3], +1., vol); + mrblk(r[1], r[3], 3, 1.e-31, 1, vol, &Mtm_plus_sym_block_psi, j); + + Block_H_psi(&block_list[j], r[0], r[1], EO); + mul_one_pm_imu_inv(r[0], +1., vol); + /* a_even = a_even + b_even */ + /* check this sign +1 seems to be right in Msap_eo */ + assign_add_mul_r(r[2], r[0], -1., vol); + + block_convert_eo_to_lexic(r[4], r[2], r[1]); + + Block_D_psi(&block_list[j], r[5], r[4]); + diff(r[0], block_list[j].basis[0], r[5], block_list[j].volume); + nrm = square_norm(r[0], block_list[j].volume, 0); + if(g_proc_id == 0) { 
+ printf("mr_eo (symmetric eo), block=%d: ||r||^2 = %1.5e\n", j, nrm); + } + } + finalize_solver(work_fields, nr_wf); + return; +} + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/dfl_projector.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/dfl_projector.h new file mode 100644 index 0000000000000000000000000000000000000000..ae4839a6fb42b771ecacab413b5677470b4d03d5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/dfl_projector.h @@ -0,0 +1,48 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _DFL_PROJECTOR_H +#define _DFL_PROJECTOR_H + +#include "su3spinor.h" + +void project(spinor * const out, spinor * const in); +void project_left(spinor * const out, spinor * const in); +void project_right(spinor * const out, spinor * const in); +void project_left_D(spinor * const out, spinor * const in); +void D_project_right(spinor * const out, spinor * const in); +int check_projectors(const int repro); +void check_little_D_inversion(const int repro); +void check_local_D(const int repro); +void free_dfl_projector(); + +void little_project(_Complex double * const out, _Complex double * const in, const int N); +void little_project_eo(_Complex double * const out, _Complex double * const in, const int N); +void little_P_L_D(_Complex double * const out, _Complex double * const in); +void little_P_L_D_sym(_Complex double * const out, _Complex double * const in); +void little_D_P_R(_Complex double * const out, _Complex double * const in); +void little_P_R(_Complex double * const out, _Complex double * const in); +void little_P_L(_Complex double * const out, _Complex double * const in); +void little_P_R_sym(_Complex double * const out, _Complex double * const in); +void little_P_L_sym(_Complex double * const out, _Complex double * const in); + +extern double dfl_little_D_prec; +extern int dfl_sloppy_prec; + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/diagonalise_general_matrix.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/diagonalise_general_matrix.c new file mode 100644 index 0000000000000000000000000000000000000000..782ef971a25aa1b6f40d3076c2c1706d0e2ca1b9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/diagonalise_general_matrix.c @@ -0,0 +1,109 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ + +/****************************************************** + * + * subroutine to diagonalise a _Complex double n times n + * matrix. Input is a _Complex double matrix in _C_ like + * order. Output is again _C_ like. + * + * The lapack routine zgeevx is used instead of + * zgeev, because zgeev is not standard e.g. on + * IBM systems with (p)essl library. + * + * The left and right eigenvectors are computed + * as well as the eigenvalues. + * + * The right eigenvectors are returned in A, + * the left ones in vl. The eigenvalues are stored + * in evalues.
+ * + * Author: Urs Wenger + * Carsten Urbach + * + ******************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "linalg/lapack.h" +#include "diagonalise_general_matrix.h" + +void diagonalise_general_matrix(int n, _Complex double * A, int lda, _Complex double * vl, + _Complex double * evalues ) { + + _Complex double *vr = NULL, *temp = NULL, *work = NULL, dummy; + double * rwork = NULL, * scale = NULL, abnrm, * rcone = NULL, * rconv = NULL; + int lwork, info, i, j, ilo, ihi; + + rwork = malloc(2*n*sizeof(double)); + vr = malloc(n*n*sizeof(_Complex double)); +/* temp = malloc(n*n*sizeof(_Complex double)); */ + scale = malloc(n*sizeof(double)); + rcone = malloc(n*sizeof(double)); + rconv = malloc(n*sizeof(double)); + + /* don't transpose A: */ + for(i=0;i<0;i++) { + for(j=0;j. + ***********************************************************************/ + +#ifndef _DIAGONALISE_GENERAL_MATRIX_H +#define _DIAGONALISE_GENERAL_MATRIX_H + +void diagonalise_general_matrix(int n, _Complex double * A, int lda, _Complex double * vl, + _Complex double * evalues); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/dirac_operator_eigenvectors.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/dirac_operator_eigenvectors.c new file mode 100644 index 0000000000000000000000000000000000000000..bdef1ec24d9b953594279cb81d462d752c2fbb07 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/dirac_operator_eigenvectors.c @@ -0,0 +1,2321 @@ +/*********************************************************************** + * + * Copyright (C) 2014 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ************************************************************************/ + +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include +#include +#ifdef FFTW + #include +#endif +#ifdef _USE_SHMEM +# include +#endif +#include +#include + +#include "global.h" +#include "config.h" +#include "su3.h" +#include "sse.h" +#include "monomial/monomial.h" +#include +#include "dirac_operator_eigenvectors.h" +#include "geometry_eo.h" +#include "linalg_eo.h" +#include "linalg/lapack.h" +#include "linalg/blas.h" +#include "operator.h" +#include "operator/tm_operators.h" +#include "operator/D_psi.h" +#include "ranlxd.h" +#include "operator/Dov_psi.h" + +/* typedef enum tm_operator_ {PRECWS_DTM,PRECWS_QTM,PRECWS_D_DAGGER_D} tm_operator; */ + +tm_operator PRECWSOPERATORSELECT[14]={PRECWS_DTM, /* BICGSTAB 0 */ + PRECWS_D_DAGGER_D, /* CG 1 */ + PRECWS_DTM, /* GMRES 2 */ + PRECWS_DTM, /* CGS 3 */ + PRECWS_NO, /* MR 4 */ + PRECWS_NO, /* BICGSTABELL 5 */ + PRECWS_NO, /* FGMRES 6 */ + PRECWS_NO, /* GCR 7 */ + PRECWS_NO, /* GMRESDR 8 */ + PRECWS_NO, /* PCG 9 */ + PRECWS_NO, /* DFLGCR 10 */ + PRECWS_NO, /* DFLFGMRES 11 */ + PRECWS_NO, /* CGMMS 12 */ + PRECWS_DOV_DAGGER_DOV /* MIXEDCG 13 */ +}; + +const char opstrings[][32]={"NO","Dtm","QTM","D^\\dagger D","D_Overlap","D_Overlap^\\dagger D_overlap"}; + + +extern int nstore; + +double g_prec_sequence_d_dagger_d[3]={-0.25,-0.5,-0.25}; + +const char* precWSOpToString(tm_operator op){ + switch(op){ + case PRECWS_NO: return opstrings[0]; break; + case PRECWS_DTM: return opstrings[1]; break; + case PRECWS_QTM: return opstrings[2]; break; + case PRECWS_D_DAGGER_D: return opstrings[3]; 
break; + case PRECWS_DOV: return opstrings[4]; break; + case PRECWS_DOV_DAGGER_DOV: return opstrings[5]; break; + default: return (const char*)NULL; break; + } +} + +/** + * some helper functions + */ + +/** + * computes the SU2 representation of a quaternion given by p: + * the result is p_0*id - i p_i \sigam_i + * @param p components of the quaternion + * @param gamma_conv denotes one of two possible gamma0 conventions: + * if set to true gamma0 is assumed to be + * ( 0 -1 ) + * ( -1 0 ) + * and + * ( 0 1 ) + * ( 1 0 ) otherwise + */ +void makeQuaternionAsSu2(double *q,const double *p,unsigned int dagger,unsigned int gamma0_conv){ + if(gamma0_conv) + q[0]=q[6]=-p[0]; + else + q[0]=q[6]=p[0]; + + if(!dagger){ + q[3]=q[5]=-p[1]; + q[2]=-p[2];q[4]=p[2]; + q[1]=-p[3];q[7]=p[3]; + } else { + q[3]=q[5]=p[1]; + q[2]=p[2];q[4]=-p[2]; + q[1]=p[3];q[7]=-p[3]; + } +} + + +void M_ti_M_2d(double *result,const double *A,const double *B){ + result[0]=A[0]*B[0]-A[1]*B[1]+A[2]*B[4]-A[3]*B[5]; + result[1]=A[0]*B[1]+A[1]*B[0]+A[2]*B[5]+A[3]*B[4]; + + result[2]=A[0]*B[2]-A[1]*B[3]+A[2]*B[6]-A[3]*B[7]; + result[3]=A[0]*B[3]+A[1]*B[2]+A[2]*B[7]+A[3]*B[6]; + + + result[4]=A[4]*B[0]-A[5]*B[1]+A[6]*B[4]-A[7]*B[5]; + result[5]=A[4]*B[1]+A[5]*B[0]+A[6]*B[5]+A[7]*B[4]; + + result[6]=A[4]*B[2]-A[5]*B[3]+A[6]*B[6]-A[7]*B[7]; + result[7]=A[4]*B[3]+A[5]*B[2]+A[6]*B[7]+A[7]*B[6]; +} + +int cyclicDiff(int a,int b, int period){ + if ( b > a){ + return min( b-a , a+ period -b); + } else { + return min( a-b , b+ period -a); + } +} + +void calcPmuLattice(const int *praw,double *p_mu,int tt,int ll){ + p_mu[0]=M_PI/(double)tt*(2.*(double)praw[0]+1.); + p_mu[1]=p_mu[2]=p_mu[3]=2*M_PI/(double)ll; + p_mu[1]*=(double)praw[1]; + p_mu[2]*=(double)praw[2]; + p_mu[3]*=(double)praw[3]; + + p_mu[0]=sin(p_mu[0]); + p_mu[1]=sin(p_mu[1]); + p_mu[2]=sin(p_mu[2]); + p_mu[3]=sin(p_mu[3]); +} + +double calcPmuLatticeSq(const int *praw,int tt,int ll){ + return sin(M_PI/(double)tt*(2.*(double)praw[0]+1.))* + 
sin(M_PI/(double)tt*(2.*(double)praw[0]+1.))+ + + sin(2*M_PI*(double)praw[1]/(double)LX)* + sin(2*M_PI*(double)praw[1]/(double)LX)+ + + sin(2*M_PI*(double)praw[2]/(double)LY)* + sin(2*M_PI*(double)praw[2]/(double)LY)+ + + sin(2*M_PI*(double)praw[3]/(double)LZ)* + sin(2*M_PI*(double)praw[3]/(double)LZ); +} + + + +void calcPmuLatticeTilde(const int *praw,double *p_mu_t,int tt,int ll/* ,unsigned int aperiodic */){ + int i; +/* if(aperiodic) */ + p_mu_t[0]=M_PI/(double)(2*tt)*(2.*(double)praw[0]+1.); +/* else */ +/* p_mu_t[0]=M_PI/(double)tt*(double)praw[0]; */ + + + p_mu_t[1]=p_mu_t[2]=p_mu_t[3]=M_PI/(double)ll; + p_mu_t[1]*=(double)praw[1]; + p_mu_t[2]*=(double)praw[2]; + p_mu_t[3]*=(double)praw[3]; + + for(i=0;i<4;i++){ + p_mu_t[i]=2*sin(p_mu_t[i]); + } +} + +double calcPmuLatticeTildeSq(const int *praw,int tt,int ll){ + return 4*( + sin(M_PI/(double)(2*tt)*(2.*(double)praw[0]+1.))* + sin(M_PI/(double)(2*tt)*(2.*(double)praw[0]+1.))+ + + sin(M_PI*(double)praw[1]/(double)LX)* + sin(M_PI*(double)praw[1]/(double)LX)+ + + sin(M_PI*(double)praw[2]/(double)LY)* + sin(M_PI*(double)praw[2]/(double)LY)+ + + sin(M_PI*(double)praw[3]/(double)LZ)* + sin(M_PI*(double)praw[3]/(double)LZ)); +} + + +_Complex double calcDtmEvalue(const int *praw,double kappa,double mu,int tt,int ll,double sign){ + + static double p_mu[4]; + static double p_mu_t[4]; + double psq,psq_tilde; + _Complex double lambda; + + + calcPmuLattice(praw,p_mu,tt,ll); + psq=p_mu[0]*p_mu[0]+p_mu[1]*p_mu[1]+p_mu[2]*p_mu[2]+p_mu[3]*p_mu[3]; + + calcPmuLatticeTilde(praw,p_mu_t,tt,ll); + psq_tilde=p_mu_t[0]*p_mu_t[0]+p_mu_t[1]*p_mu_t[1]+p_mu_t[2]*p_mu_t[2]+p_mu_t[3]*p_mu_t[3]; + + lambda = (0.5 / kappa - 4 + 0.5 * psq_tilde) + (sign * sqrt(mu * mu + psq)) * I; + return lambda; + +} + +_Complex double calcDovEvalue(const int *praw,double kappa,double rho,int tt,int ll,double sign){ + + static double p_mu[4]; + static double p_mu_t[4]; + double psq,psq_tilde; + _Complex double lambda; + double denominator; + + 
calcPmuLattice(praw,p_mu,tt,ll); + psq=p_mu[0]*p_mu[0]+p_mu[1]*p_mu[1]+p_mu[2]*p_mu[2]+p_mu[3]*p_mu[3]; + + calcPmuLatticeTilde(praw,p_mu_t,tt,ll); + psq_tilde=p_mu_t[0]*p_mu_t[0]+p_mu_t[1]*p_mu_t[1]+p_mu_t[2]*p_mu_t[2]+p_mu_t[3]*p_mu_t[3]; + + + + lambda = (0.5 * psq_tilde - rho) + (sign * sqrt(psq)) * I; + + denominator=cabs(lambda); + lambda *= rho/denominator; + lambda += rho; + + return lambda; + +} + + +_Complex double calcQtmEvalue(const int *praw,double kappa,double mu,int tt,int ll,double sign/* =1.0 */){ + static double p_mu[4]; + static double p_mu_t[4]; + double psq,psq_tilde,M_wilson; + _Complex double lambda; + + calcPmuLattice(praw,p_mu,tt,ll); + psq=p_mu[0]*p_mu[0]+p_mu[1]*p_mu[1]+p_mu[2]*p_mu[2]+p_mu[3]*p_mu[3]; + + calcPmuLatticeTilde(praw,p_mu_t,tt,ll); + psq_tilde=p_mu_t[0]*p_mu_t[0]+p_mu_t[1]*p_mu_t[1]+p_mu_t[2]*p_mu_t[2]+p_mu_t[3]*p_mu_t[3]; + + M_wilson=((0.5/kappa-4.)+0.5*psq_tilde); + + lambda = sign * sqrt(M_wilson * M_wilson + psq) + mu * I; + return lambda; + +} + +_Complex double calcDDaggerDtmEvalue(const int *praw,double kappa,double mu,int tt,int ll) +{ + static double p_mu[4]; + static double p_mu_t[4]; + double M_wilson; + _Complex double lambda; + double psq_tilde,psq; + + calcPmuLattice(praw,p_mu,tt,ll); + psq=p_mu[0]*p_mu[0]+p_mu[1]*p_mu[1]+p_mu[2]*p_mu[2]+p_mu[3]*p_mu[3]; + + calcPmuLatticeTilde(praw,p_mu_t,tt,ll); + psq_tilde=p_mu_t[0]*p_mu_t[0]+p_mu_t[1]*p_mu_t[1]+p_mu_t[2]*p_mu_t[2]+p_mu_t[3]*p_mu_t[3]; + + M_wilson=((0.5/kappa-4.)+0.5*psq_tilde); + + lambda = (psq + M_wilson * M_wilson + mu * mu); + + return lambda; +} + + +_Complex double calcDDaggerDovEvalue(const int *praw,double kappa,double rho,int tt,int ll){ + static double p_mu[4]; + static double p_mu_t[4]; + _Complex double lambda; + double abslam,diff; + double u,v; + + calcPmuLattice(praw,p_mu,tt,ll); + v=p_mu[0]*p_mu[0]+p_mu[1]*p_mu[1]+p_mu[2]*p_mu[2]+p_mu[3]*p_mu[3]; + + calcPmuLatticeTilde(praw,p_mu_t,tt,ll); + 
u=p_mu_t[0]*p_mu_t[0]+p_mu_t[1]*p_mu_t[1]+p_mu_t[2]*p_mu_t[2]+p_mu_t[3]*p_mu_t[3]; + u=u*0.5-rho; + + lambda = calcDovEvalue(praw, kappa, rho, tt, ll, 1.); + abslam = lambda * conj(lambda); + + lambda = (2. * (u / sqrt(u * u + v) + 1.) * rho * rho); + + diff=abslam - cabs(lambda); + if(diff>1.e-12) + printf("Error in Eigenvalue computation for Dov ^ dagger Dov: at praw = (%d,%d,%d,%d)(difference = %lf)!!! \n",praw[0],praw[1],praw[2],praw[3],diff); + + return lambda; + + +} + + +void spinor_fft(spinor * spinor_in,spinor *spinor_out,int tt,int ll,unsigned int forward){ +#ifdef HAVE_FFTW + fftw_plan plan=spinor_fftw_plan(spinor_in,spinor_out,tt,ll,forward,FFTW_WISDOM_ONLY); + fftw_execute(plan); +#else + fprintf(stderr,"Error fftw not available. Thus cant perform spinor_fft !!!\n"); + fflush(stderr); + exit(-1); +#endif +} + + + +/** + * here comes the con- and destructor for the precWs + * struct from the former C++ implementation + */ + + +/** + * for using a phenomenologically fitted function for the eigenvalues instead of + * the analytic treelevel ones (potential improvement by a factor of 2 has been observed + * on a 32x16^3 lattice not tested very much so far + */ +double spinorPrecWS_evalCorrectionFunction(spinorPrecWS *ws,double pmuSq,double pmuTildeSq){ + return (ws->ai[0]*pmuTildeSq*pmuTildeSq+ws->ai[1]*pmuTildeSq+ws->ai[2]+ws->ai[3]*pmuSq); +} + +double spinorPrecWS_evalCorrectionFunctionDk(double pmuSq,double pmuTildeSq,int k){ + switch(k){ + case 0: + return pmuTildeSq*pmuTildeSq; + break; + case 1: + return pmuTildeSq; + break; + case 2: + return 1; + break; + case 3: + return pmuSq; + break; + default : return -1; break; + } +} + +void spinorPrecWS_RecalcDDaggerDEvs(spinorPrecWS *ws,double kappa,double mu){ + int index,rawp[4]; + double pmuSq,pmuTildeSq; + _Complex double lambda; + double twokappa=2.*kappa; + + + if(ws->m_op==PRECWS_D_DAGGER_D){ + + FORXYZT(rawp[0],rawp[1],rawp[2],rawp[3],T,L); + + index= Index(rawp[0],rawp[1],rawp[2],rawp[3]); + + 
if(ws->useCorrectionFunc==1){ + pmuSq=calcPmuLatticeSq(rawp,T,LX); + pmuTildeSq=calcPmuLatticeTildeSq(rawp,T,LX); + lambda = (spinorPrecWS_evalCorrectionFunction(ws,pmuSq,pmuTildeSq)); + } else { + lambda=calcDDaggerDtmEvalue(rawp,kappa,mu,T,L); + } + + lambda *= twokappa; + lambda *= twokappa; + + + + memcpy(ws->evs+index,&lambda,sizeof(_Complex double)); + + + + ENDFORXYZT; + + } + +} + + +void spinorPrecWS_Init(spinorPrecWS *ws, double kappa,double mu,double rho,tm_operator op){ + + /* spinor fv_dum; */ + int index,rawp[4]; + _Complex double lambda,averageLambda; + spinor *up_plus; + + + ws->spinorMemBuff=NULL; + static int epsilon[12]={1,1,1,1,1,1,-1,-1,-1,-1,-1,-1}; + static int k[12] ={0,0,0,1,1,1,0,0,0,1,1,1}; + /* static int color[12] ={0,1,2,0,1,2,0,1,2,0,1,2}; */ + + double twokappa=2.*kappa; + double pmuSq,pmuTildeSq; + double absLamMax=0.0,absLamMin=1.0,absLam; + + FILE *precSeqFileDD=NULL; + const char *precSeqFileNameDD="prec_seq_dd.in"; + char strBuffer[256]; + + precSeqFileDD=fopen(precSeqFileNameDD,"r"); + + if(precSeqFileDD != NULL){ + fgets(strBuffer,255,precSeqFileDD); + sscanf(strBuffer,"%lf %lf %lf",g_prec_sequence_d_dagger_d+0,g_prec_sequence_d_dagger_d+1,g_prec_sequence_d_dagger_d+2); + printf("read preconditioning sequence: %lf %lf %lf \n", + g_prec_sequence_d_dagger_d[0], + g_prec_sequence_d_dagger_d[1], + g_prec_sequence_d_dagger_d[2]); + fclose(precSeqFileDD); + } + + ws->m_op=op; + if(ws->m_op!=PRECWS_D_DAGGER_D && ws->m_op!=PRECWS_DOV_DAGGER_DOV){ + + allocate_spinor_field_array(&(ws->spinor_up),&(ws->spinorMemBuff),VOLUMEPLUSRAND,1); + + } + + + if(ws->m_op==PRECWS_D_DAGGER_D){ + ws->useCorrectionFunc=1; + ws->ai[0]=0.25; + ws->ai[1]=0.5/kappa-4.; + ws->ai[2]=ws->ai[1]*ws->ai[1]+mu*mu; + ws->ai[3]=1.; + } else { + ws->useCorrectionFunc=0; + } + + ws->evs=(_Complex double*)malloc(sizeof(_Complex double)*T*LX*LY*LZ); + ws->c_table=(double*)malloc(sizeof(_Complex double)*T); + ws->s_table=(double*)malloc(sizeof(_Complex double)*T); + + 
if(ws->m_op==PRECWS_D_DAGGER_D){ + ws->precExpo[0]=-0.25; + ws->precExpo[1]=-0.5; + ws->precExpo[2]=-0.25; + } else if(ws->m_op==PRECWS_DOV_DAGGER_DOV){ + ws->precExpo[0]=-.25; + ws->precExpo[1]=-.5; + ws->precExpo[2]=-.25; + } + + + averageLambda = 0.0; + + + FORXYZT(rawp[0],rawp[1],rawp[2],rawp[3],T,L); + + index= Index(rawp[0],rawp[1],rawp[2],rawp[3]); + + + + if(op==PRECWS_DTM) + lambda=calcDtmEvalue(rawp,kappa,mu,T,L,epsilon[0]); + else if(op==PRECWS_DOV) + lambda=calcDovEvalue(rawp,g_kappa,rho,T,L,epsilon[0]); + else if(op==PRECWS_QTM) + lambda=calcQtmEvalue(rawp,kappa,mu,T,L,epsilon[0]); + else if(ws->m_op==PRECWS_D_DAGGER_D){ + ws->precExpo[0]=-0.25; + ws->precExpo[1]=-0.5; + ws->precExpo[2]=-0.25; + if(ws->useCorrectionFunc==1){ + pmuSq=calcPmuLatticeSq(rawp,T,LX); + pmuTildeSq=calcPmuLatticeTildeSq(rawp,T,LX); + lambda = (spinorPrecWS_evalCorrectionFunction(ws,pmuSq,pmuTildeSq)); + } else { + lambda=calcDDaggerDtmEvalue(rawp,kappa,mu,T,L); + } + + /* in this case an extra factor of 2kappa is needed as we apply the dirac operator two times */ + lambda *= twokappa; + + } else if(ws->m_op==PRECWS_DOV_DAGGER_DOV){ + lambda=calcDDaggerDovEvalue(rawp,kappa,rho,T,L); + } + + if(op!=PRECWS_DOV && op!=PRECWS_DOV_DAGGER_DOV){ /* overlap operator eigevalue routine does it itself */ + lambda *= twokappa; + } + + + /* if(rawp[0]==1 && rawp[1]==1 && rawp[2]==1 && rawp[3]==1 ) */ + /* cerr << lambda << endl; */ + + memcpy(ws->evs+index,&lambda,sizeof(_Complex double)); + + /* calculate maximal and minimal modulus of all eigenvalues + */ + + absLam=cabs(lambda); + + if(rawp[0]==0 && rawp[1]==0 && rawp[2]==0 && rawp[3]==0) + { + absLamMax=absLamMin=absLam; + } + else { + if(absLam>absLamMax) absLamMax=absLam; + /* (else) */ + if(absLamm_op!=PRECWS_D_DAGGER_D && ws->m_op!=PRECWS_DOV_DAGGER_DOV){ + up_plus=(ws->spinor_up)[0]+index; + + + if(op==PRECWS_DTM || op==PRECWS_DOV){ + spinorStructEigenvecDtmSu3Vector(up_plus,mu,epsilon[0],k[0],0,rawp,T,LX); + 
spinorStructEigenvecDtmSu3Vector(up_plus,mu,epsilon[3],k[3],1,rawp,T,LX); + } else if(op==PRECWS_QTM) { + spinorStructEigenvecQtmSu3Vector(up_plus,kappa,mu,epsilon[0],k[0],0,rawp,T,LX); + spinorStructEigenvecQtmSu3Vector(up_plus,kappa,mu,epsilon[3],k[3],1,rawp,T,LX); + } + + } + + + ENDFORXYZT; + + if(g_proc_id==0) printf("theoretical condition number improvement: %lf\n" ,absLamMax/absLamMin); + + averageLambda = (averageLambda) * (1./(double)(VOLUME)); + + /* create a sinus/cosinus lookup table */ + for( rawp[0] = 0;rawp[0]c_table)[rawp[0]]=cos(M_PI*(double)rawp[0]/(double)T); + (ws->s_table)[rawp[0]]=sin(M_PI*(double)rawp[0]/(double)T); + } + + +} + +void spinorPrecWS_Free(spinorPrecWS *ws){ + free(ws->c_table); + free(ws->s_table); + free_spinor_field_array(&(ws->spinorMemBuff)); +} + + + +/** + * End of precWS functions + */ + + + + + + + + + + +void eigenvector_Dtm(spinor *spin,double mu,int epsilon,int k,int color,int rawp[4]){ + + +#ifdef HAVE_FFTW + fftw_plan p1bw; +#endif + int i=0; + int u_index; + spinor *up_plus; + spinor *phi; + + spinorPrecWS *ws=(spinorPrecWS*)g_precWS; + + + for(i=0;ispinor_up[0]+u_index; + phi=spin+u_index; + + switch(color){ + case 0: + if(k==0){ + phi->s0.c0=up_plus->s0.c0; + phi->s1.c0=up_plus->s1.c0; + phi->s2.c0=up_plus->s2.c0; + phi->s3.c0=up_plus->s3.c0; + } else { + phi->s0.c0=up_plus->s0.c1; + phi->s1.c0=up_plus->s1.c1; + phi->s2.c0=up_plus->s2.c1; + phi->s3.c0=up_plus->s3.c1; + } + phi->s2.c0 = phi->s2.c0 * (double)epsilon; + phi->s3.c0 = phi->s3.c0 * (double)epsilon; + + break; + case 1: + if(k==0){ + phi->s0.c1=up_plus->s0.c0; + phi->s1.c1=up_plus->s1.c0; + phi->s2.c1=up_plus->s2.c0; + phi->s3.c1=up_plus->s3.c0; + } else { + phi->s0.c1=up_plus->s0.c1; + phi->s1.c1=up_plus->s1.c1; + phi->s2.c1=up_plus->s2.c1; + phi->s3.c1=up_plus->s3.c1; + } + phi->s2.c1 = phi->s2.c1 * (double)epsilon; + phi->s3.c1 = phi->s3.c1 * (double)epsilon; + break; + case 2: + if(k==0){ + phi->s0.c2=up_plus->s0.c0; + phi->s1.c2=up_plus->s1.c0; + 
phi->s2.c2=up_plus->s2.c0; + phi->s3.c2=up_plus->s3.c0; + } else { + phi->s0.c2=up_plus->s0.c1; + phi->s1.c2=up_plus->s1.c1; + phi->s2.c2=up_plus->s2.c1; + phi->s3.c2=up_plus->s3.c1; + } + phi->s2.c2 = phi->s2.c2 * (double)epsilon; + phi->s3.c2 = phi->s3.c2 * (double)epsilon; + break; + default:break; + } + +/* spinorStructEigenvecDtm(spinor+u_index,mu,epsilon,k,color,rawp,T,L); */ + + + + + _spinor_muleq_real(*phi,1.0/sqrt((double)(VOLUME))); + + + +#ifdef HAVE_FFTW + p1bw=spinor_fftw_plan(spin,spin,T,L,0,FFTW_WISDOM_ONLY); + fftw_execute(p1bw); +#endif + + /* spinor mulp half phase */ + +} + + + + + +#ifdef HAVE_FFTW +fftw_plan spinor_fftw_plan(spinor *spinor_in,spinor *spinor_out,int T,int ll,unsigned int forward,int fftw_flags){ + +/* int index_s = gsi(get_index(it, ix, iy, iz, tt, ll)); */ +/* double *xi_ = xi + index_s; */ + + int Dim1[4]; +/* cerr << "Trying to create a plan for T=" << T << " L=" << L ; */ +/* cerr.flush(); */ + + int rank=4; + + int stride=12; + int dist=1; + int howmany=12; + fftw_plan plan; + + + Dim1[0]=tt; + Dim1[1]=LX;Dim1[2]=LY;Dim1[3]=LZ; + + + if(fftw_flags==-1){fftw_flags=FFTW_ESTIMATE;} + if(forward){ + plan=fftw_plan_many_dft(rank, Dim1, howmany, (fftw_complex*)spinor_in, NULL, stride, dist, + (fftw_complex*)spinor_out,NULL,stride,dist, + FFTW_FORWARD,fftw_flags); + } else { + plan=fftw_plan_many_dft(rank, Dim1, howmany, (fftw_complex*)spinor_in, NULL, stride, dist, + (fftw_complex*)spinor_out,NULL,stride,dist, + FFTW_BACKWARD,fftw_flags); + } +/* if(plan!=NULL) cerr << " [OK]"<< endl; */ +/* else cerr << " [FAIL]"<< endl; */ +/* cerr.flush(); */ + + return plan; + +} +#endif + +void planeWave(spinor *spinor,int k,int rawp[4],int tt,int ll,unsigned int momspace){ + int i; + int u_index; + + for(i=0;ic_table, ws->s_table,1,1.); */ + plan_fw=spinor_fftw_plan(spinor_in,spinor_out,tt,ll,1 /* = true */,FFTW_WISDOM_ONLY); + fftw_execute(plan_fw); +#endif + } else if(spinor_in!=spinor_out) { + projectionInplace=0; + } + + /* 
projectionInplace=5; /\* do no projection at all*\/ */ + + if(projectionInplace==1){ + /* printf("projection is inplace \n"); */ + + + FORXYZT(rawp[0],rawp[1],rawp[2],rawp[3],tt,LX); + + index=Index(rawp[0],rawp[1],rawp[2],rawp[3]); + + /* obtain eigenvalues and eigenvectors */ + lambda_plus=ws->evs[index]; + + if(ws->m_op == PRECWS_DTM || ws->m_op == PRECWS_DOV){ + lambda_minus = conj(lambda_plus); + } + else if( ws->m_op == PRECWS_QTM){ + lambda_minus = -conj(lambda_plus); + } + + /* conjugate eigenvalue if conjugation of operator was requested */ + if(dagger){ + lambda_plus=conj(lambda_plus); + lambda_minus=conj(lambda_minus); + } + + _pow_complex(lambda_plus,lambda_plus,alpha,dummy); + _pow_complex(lambda_minus,lambda_minus,alpha,dummy); + + phi_o=spinor_out+index; + + + /* calculate projections */ + + if(ws->m_op != PRECWS_D_DAGGER_D && ws->m_op != PRECWS_DOV_DAGGER_DOV){ + + _spinor_null(phi_plus); + + up_plus=(ws->spinor_up[0])+index; + + + PROJECTSPLIT(p_plus,up_plus,c0,phi_o,phi_plus,c0); + PROJECTSPLIT(p_plus,up_plus,c0,phi_o,phi_plus,c1); + PROJECTSPLIT(p_plus,up_plus,c0,phi_o,phi_plus,c2); + + PROJECTSPLIT(p_plus,up_plus,c1,phi_o,phi_plus,c0); + PROJECTSPLIT(p_plus,up_plus,c1,phi_o,phi_plus,c1); + PROJECTSPLIT(p_plus,up_plus,c1,phi_o,phi_plus,c2); + + + _spinor_muleq_complex(*phi_o,lambda_minus,muleqdum); + _spinor_muleq_complex(phi_plus,lambda_plus,muleqdum); + + _vector_sub(phi_o->s0,phi_o->s0,phi_plus.s0); + _vector_sub(phi_o->s1,phi_o->s1,phi_plus.s1); + _vector_sub(phi_o->s2,phi_o->s2,phi_plus.s2); + _vector_sub(phi_o->s3,phi_o->s3,phi_plus.s3); + + + + } else /* is the case if we want to precondition D^dagger x D */ { + _spinor_muleq_real(*phi_o,creal(lambda_plus)); + } + ENDFORXYZT; + } else if(projectionInplace==0) { + printf("projection is out of place \n"); + fflush(stdout); + FORXYZT(rawp[0],rawp[1],rawp[2],rawp[3],tt,LX); + + + index=Index(rawp[0],rawp[1],rawp[2],rawp[3]); + + + /* obtain eigenvalues and eigenvectors */ + 
lambda_plus=ws->evs[index]; + + if(ws->m_op == PRECWS_DTM || ws->m_op == PRECWS_DOV){ + lambda_minus = conj(lambda_plus); + } + else if( ws->m_op == PRECWS_QTM){ + lambda_minus = -conj(lambda_plus); + } + + /* conjugate eigenvalue if conjugation of operator was requested */ + if(dagger) + { + lambda_plus = conj(lambda_plus); + lambda_minus = conj(lambda_minus); + } + + _pow_complex(lambda_plus,lambda_plus,alpha,dummy); + _pow_complex(lambda_minus,lambda_minus,alpha,dummy); + + + + + + phi_i=spinor_in+index; + psi=spinor_out+index; + + + + if(ws->m_op != PRECWS_D_DAGGER_D && ws->m_op != PRECWS_DOV_DAGGER_DOV){ + + memcpy(psi,phi_i,sizeof(spinor)); + + /* obtain eigenvectors */ + up_plus=(ws->spinor_up[0])+index; + + + /* todo: adapt for out of place macro */ + PROJECTSPLIT(p_plus,up_plus,c0,psi,phi_plus,c0); + PROJECTSPLIT(p_plus,up_plus,c0,psi,phi_plus,c1); + PROJECTSPLIT(p_plus,up_plus,c0,psi,phi_plus,c2); + + PROJECTSPLIT(p_plus,up_plus,c1,psi,phi_plus,c0); + PROJECTSPLIT(p_plus,up_plus,c1,psi,phi_plus,c1); + PROJECTSPLIT(p_plus,up_plus,c1,psi,phi_plus,c2); + + + _spinor_muleq_complex(*psi,lambda_minus,muleqdum); + _spinor_muleq_complex(phi_plus,lambda_plus,muleqdum); + + _vector_sub(psi->s0,psi->s0,phi_plus.s0); + _vector_sub(psi->s1,psi->s1,phi_plus.s1); + _vector_sub(psi->s2,psi->s2,phi_plus.s2); + _vector_sub(psi->s3,psi->s3,phi_plus.s3); + + + + + + + } else /* is the case if we want to precondition D^dagger x D */ { + _spinor_mul_complex(*psi,lambda_plus,*phi_i); + } + ENDFORXYZT; + } + + if(autofft == 1){ +#ifdef HAVE_FFTW + plan_bw=spinor_fftw_plan(spinor_out,spinor_out,tt,LX,0,FFTW_WISDOM_ONLY); + fftw_execute(plan_bw); +#endif + mul_r(spinor_out,OOVOL,spinor_out,VOLUME); + /* spinor_mulp_half_phase(spinor_out,spinor_out,ws->c_table, ws->s_table,0,OOVOL); */ + } +} + +void spinorStructEigenvecDtm(spinor *fv,double mu,int epsilon,int k,int color,int rawp[4],int tt,int ll){ + double q[8]; + double p_mu[4]; + double prefactor; + double psq; + double 
beta,norm_factor; + int index; + double *fv_=(double*)fv; + + calcPmuLattice(rawp,p_mu,tt,LX); + + psq=p_mu[0]*p_mu[0]+ + p_mu[1]*p_mu[1]+ + p_mu[2]*p_mu[2]+ + p_mu[3]*p_mu[3]; + +/* p_mu[3]*=-1.; */ + makeQuaternionAsSu2(q,p_mu,1/* dagger ? */,1); + + /* this comes just from the calculation of q itself */ + prefactor=sqrt(mu*mu+psq); + prefactor*=(double)epsilon; + prefactor=(prefactor-mu)/psq; + + /* this comes from the overall normalization of the spinor */ + beta=mu/sqrt(psq); + norm_factor=1./sqrt(2.*(1+beta*(beta-epsilon*sqrt(beta*beta+1)))); + prefactor*=norm_factor; + + + q[0]*=prefactor; q[1]*=prefactor; q[2]*=prefactor; q[3]*=prefactor; + q[4]*=prefactor; q[5]*=prefactor; q[6]*=prefactor; q[7]*=prefactor; + + +/* for(i=0;i<24;i+=2){ */ +/* fv[i]=0; */ +/* fv[i+1]=0; */ +/* } */ + _spinor_null(*fv); + + index=color*2; + + if(k==0){ + /* set unit vector */ + fv_[index]=1.0*norm_factor; + + /* jump two entries further (in spinor space) */ + index+=12; + fv_[index]=q[0]; + fv_[index+1]=q[1]; + /* jump two entries further (in spinor space) */ + index+=6; + fv_[index]=q[4]; + fv_[index+1]=q[5]; + } else /* if(k==1) */ { + /* set second unit vector */ + index+=6; + fv_[index]=1.0*norm_factor; + + /* jump two entries further (in spinor space) */ + index+=6; + fv_[index]=q[2]; + fv_[index+1]=q[3]; + /* jump two entries further (in spinor space) */ + index+=6; + fv_[index]=q[6]; + fv_[index+1]=q[7]; + } + +} + + +void spinorStructEigenvecDtmSu3Vector(spinor *fv, double mu, int epsilon, int k, int store_color, int rawp[4], int tt, int ll) +{ + double q[8]; + double p_mu[4]; + double prefactor; + double psq; + double beta,norm_factor; + + calcPmuLattice(rawp,p_mu,tt,LX); + + psq=p_mu[0] * p_mu[0]+ p_mu[1] * p_mu[1] + p_mu[2] * p_mu[2] + p_mu[3] * p_mu[3]; + +/* p_mu[3]*=-1.; */ + makeQuaternionAsSu2(q,p_mu,1/* dagger ? 
*/,1); + + /* this comes just from the calculation of q itself */ + prefactor = (epsilon * sqrt(mu * mu + psq) - mu) / psq; + + /* this comes from the overall normalization of the spinor */ + beta = mu/sqrt(psq); + norm_factor = 1./sqrt(2. * (1 + beta * (beta - epsilon * sqrt(beta * beta + 1)))); + prefactor *= norm_factor; + + q[0]*=prefactor; q[1]*=prefactor; q[2]*=prefactor; q[3]*=prefactor; + q[4]*=prefactor; q[5]*=prefactor; q[6]*=prefactor; q[7]*=prefactor; + +/* _vector_null(*fv); */ + + switch(store_color) + { + case 0: + if(k==0) + { + fv->s0.c0 = norm_factor; + fv->s1.c0 = 0.0; + fv->s2.c0 = q[0] + q[1] * I; + fv->s3.c0 = q[4] + q[5] * I; + } + else /* if(k==1) */ + { + fv->s0.c0 = 0.0; + fv->s1.c0 = norm_factor; + fv->s2.c0 = q[2] + q[3] * I; + fv->s3.c0 = q[6] + q[7] * I; + } + break; + case 1: + if(k==0) + { + fv->s0.c1 = norm_factor; + fv->s1.c1 = 0.0; + fv->s2.c1 = q[0] + q[1] * I; + fv->s3.c1 = q[4] + q[5] * I; + } + else /* if(k==1) */ + { + fv->s0.c1 = 0.0; + fv->s1.c1 = norm_factor; + fv->s2.c1 = q[2] + q[3] * I; + fv->s3.c1 = q[6] + q[7] * I; + } + break; + case 2: + if(k==0) + { + fv->s0.c2 = norm_factor; + fv->s1.c2 = 0.0; + fv->s2.c2 = q[0] + q[1] * I; + fv->s3.c2 = q[4] + q[5] * I; + } + else /* if(k==1) */ + { + fv->s0.c2 = 0.0; + fv->s1.c2 = norm_factor; + fv->s2.c2 = q[2] + q[3] * I; + fv->s3.c2 = q[6] + q[7] * I; + } + break; + } + +} + +void spinorStructEigenvecQtm(spinor *fv,double kappa,double mu,int epsilon,int k,int color,int rawp[4],int tt,int ll){ + double q[8]; + double p_mu[4]; + double p_mu_t[4]; + double psq,psq_tilde,M_wilson,prefactor,beta,norm_factor,swap_dummy; + double *fv_=(double*)fv; + int index; + + calcPmuLattice(rawp,p_mu,tt,ll); + psq=p_mu[0]*p_mu[0]+ + p_mu[1]*p_mu[1]+ + p_mu[2]*p_mu[2]+ + p_mu[3]*p_mu[3]; + + calcPmuLatticeTilde(rawp,p_mu_t,tt,ll); + psq_tilde=p_mu_t[0]*p_mu_t[0]+p_mu_t[1]*p_mu_t[1]+p_mu_t[2]*p_mu_t[2]+p_mu_t[3]*p_mu_t[3]; + + makeQuaternionAsSu2(q,p_mu,1/* dagger ? 
*/, 1 /* gamma_0 convention */); + + /* this comes just from the calculation of q itself */ + M_wilson=((0.5/kappa-4.)+0.5*psq_tilde); + prefactor=(M_wilson-epsilon*sqrt(psq+M_wilson*M_wilson))/psq; + + /* this comes from the overall normalization of the spinor */ + beta=M_wilson/sqrt(psq); + norm_factor=1./sqrt(2.*(1.+beta*(beta-epsilon*sqrt(beta*beta+1.)))); +/* cerr << "Norm factor is " << norm_factor << endl; */ +/* norm_factor=1.0; */ + prefactor*=norm_factor; + + /* multiply with i ... */ + /* .. so first swap re <-> im .. */ + SWAP(q[0],q[1],swap_dummy); + SWAP(q[2],q[3],swap_dummy); + SWAP(q[4],q[5],swap_dummy); + SWAP(q[6],q[7],swap_dummy); + + /* and multiply new real part (former imag part) with -1 */ + q[0]*=-prefactor; q[1]*=prefactor; q[2]*=-prefactor; q[3]*=prefactor; + q[4]*=-prefactor; q[5]*=prefactor; q[6]*=-prefactor; q[7]*=prefactor; + + + _spinor_null(*fv); + + + index=color*2; + + if(k==0) + { + /* set unit vector */ + fv_[index]=1.0*norm_factor; + + /* jump two entries further (in spinor space) */ + index+=12; + fv_[index]=q[0]; + fv_[index+1]=q[1]; + /* jump two entries further (in spinor space) */ + index+=6; + fv_[index]=q[4]; + fv_[index+1]=q[5]; + } + else /* if(k==1) */ + { + /* set second unit vector */ + index+=6; + fv_[index]=1.0*norm_factor; + + /* jump two entries further (in spinor space) */ + index+=6; + fv_[index]=q[2]; + fv_[index+1]=q[3]; + /* jump two entries further (in spinor space) */ + index+=6; + fv_[index]=q[6]; + fv_[index+1]=q[7]; + } +} + + +void spinorStructEigenvecQtmSu3Vector(spinor *fv,double kappa,double mu,int epsilon,int k,int store_color,int rawp[4],int tt,int ll){ + double q[8]; + double p_mu[4]; + double p_mu_t[4]; + double psq,psq_tilde,M_wilson,prefactor,beta,norm_factor,swap_dummy; + + calcPmuLattice(rawp,p_mu,tt,ll); + psq=p_mu[0]*p_mu[0]+ + p_mu[1]*p_mu[1]+ + p_mu[2]*p_mu[2]+ + p_mu[3]*p_mu[3]; + + calcPmuLatticeTilde(rawp,p_mu_t,tt,ll); + 
psq_tilde=p_mu_t[0]*p_mu_t[0]+p_mu_t[1]*p_mu_t[1]+p_mu_t[2]*p_mu_t[2]+p_mu_t[3]*p_mu_t[3]; + + makeQuaternionAsSu2(q,p_mu,1/* dagger ? */, 1 /* gamma_0 convention */); + + /* this comes just from the calculation of q itself */ + M_wilson=((0.5/kappa-4.)+0.5*psq_tilde); + prefactor=(M_wilson-epsilon*sqrt(psq+M_wilson*M_wilson))/psq; + + /* this comes from the overall normalization of the spinor */ + beta=M_wilson/sqrt(psq); + norm_factor=1./sqrt(2.*(1.+beta*(beta-epsilon*sqrt(beta*beta+1.)))); +/* cerr << "Norm factor is " << norm_factor << endl; */ +/* norm_factor=1.0; */ + prefactor*=norm_factor; + + /* multiply with i ... */ + /* .. so first swap re <-> im .. */ + SWAP(q[0],q[1],swap_dummy); + SWAP(q[2],q[3],swap_dummy); + SWAP(q[4],q[5],swap_dummy); + SWAP(q[6],q[7],swap_dummy); + + /* and multiply new real part (former imag part) with -1 */ + q[0]*=-prefactor; q[1]*=prefactor; q[2]*=-prefactor; q[3]*=prefactor; + q[4]*=-prefactor; q[5]*=prefactor; q[6]*=-prefactor; q[7]*=prefactor; + + switch(store_color) + { + case 0: + if(k==0) + { + fv->s0.c0 = norm_factor; + fv->s1.c0 = 0.0; + fv->s2.c0 = q[0] + q[1] * I; + fv->s3.c0 = q[4] + q[5] * I; + } + else /* if(k==1) */ + { + fv->s0.c0 = 0.0; + fv->s1.c0 = norm_factor; + fv->s2.c0 = q[2] + q[3] * I; + fv->s3.c0 = q[6] + q[7] * I; + } + break; + case 1: + if(k==0) + { + fv->s0.c1 = norm_factor; + fv->s1.c1 = 0.0; + fv->s2.c1 = q[0] + q[1] * I; + fv->s3.c1 = q[4] + q[5] * I; + } + else /* if(k==1) */ + { + fv->s0.c1 = 0.0; + fv->s1.c1 = norm_factor; + fv->s2.c1 = q[2] + q[3] * I; + fv->s3.c1 = q[6] + q[7] * I; + } + break; + case 2: + if(k==0) + { + fv->s0.c2 = norm_factor; + fv->s1.c2 = 0.0; + fv->s2.c2 = q[0] + q[1] * I; + fv->s3.c2 = q[4] + q[5] * I; + } + else /* if(k==1) */ + { + fv->s0.c2 = 0.0; + fv->s1.c2 = norm_factor; + fv->s2.c2 = q[2] + q[3] * I; + fv->s3.c2 = q[6] + q[7] * I; + } + break; + } +} + + +void spinor_mulp_half_phase(spinor *spinor_out,const spinor *spinor_in, + double *c_table,double *s_table, 
+ unsigned forward,double mulp){ + int t,x,z,y; + int myindex; + unsigned int useDummy=0; + unsigned int deleteArrays=0; + _Complex double phase; + int i; + + if(spinor_in==spinor_out) useDummy=1; + + if(s_table==NULL || c_table==NULL){ + s_table=(double*)malloc(sizeof(double)*T); + c_table=(double*)malloc(sizeof(double)*T); + deleteArrays=1; + for(i=0;i>>>> sorry could not load fftw wisdom <<<<<\n"); + else + fprintf(stderr, " >>>>> Successfully loaded FFTW WISDOM for Lattice size %02d x %02d <<<<<<<<<<<<<<<\n" , tt , ll ); + fclose(wisdomFile); + } + + /* out of place plan */ + fftw_plan plan=spinor_fftw_plan(spinor_in,spinor_out,tt,ll,1,FFTW_WISDOM_ONLY); + if(plan==NULL){ + fftw_forget_wisdom(); + /* fftw_plan spinor_fftw_plan(spinor *spinor_in,spinor *spinor_out,int tt,int ll,unsigned int forward,int fftw_flags){ */ + + /* forward plan */ + fftw_plan plan=spinor_fftw_plan(spinor_in,spinor_out,tt,ll,1,FFTW_MEASURE | FFTW_EXHAUSTIVE | FFTW_PATIENT); + /* backward plan */ + plan=spinor_fftw_plan(spinor_in,spinor_out,tt,ll,0,FFTW_MEASURE | FFTW_EXHAUSTIVE | FFTW_PATIENT); +/* plan=spinor_fftw_plan(spinor_in,spinor_out,tt,ll,0,FFTW_WISDOM_ONLY); */ + writeWisdom=1; + } + + /* inplace plan */ + plan=spinor_fftw_plan(spinor_in,spinor_in,tt,ll,1,FFTW_WISDOM_ONLY); + if(plan==NULL){ + /* forward plan */ + fftw_plan plan=spinor_fftw_plan(spinor_in,spinor_in,tt,ll,1,FFTW_MEASURE | FFTW_EXHAUSTIVE | FFTW_PATIENT); + /* backward plan */ + plan=spinor_fftw_plan(spinor_in,spinor_in,tt,ll,0,FFTW_MEASURE | FFTW_EXHAUSTIVE | FFTW_PATIENT); + writeWisdom=1; + } + if(writeWisdom==1){ + writeFFTWWisdom(tt,ll); + } +} + +void writeFFTWWisdom(int tt,int ll){ +/* ostringstream filename_fftw_wisdom; */ +/* filename_fftw_wisdom << "fftw_wisdom_" << setw(2) << setfill('0') << T << "x"<< setw(2) << setfill('0') << L; */ + char filename_fftw_wisdom[513]; + sprintf(filename_fftw_wisdom,"fftw_wisdom_%02dx%02d",tt,ll); + + + FILE *wisdomFile; + wisdomFile=fopen(filename_fftw_wisdom,"w+"); + 
if(wisdomFile!=NULL){ + fftw_export_wisdom_to_file(wisdomFile); + fclose(wisdomFile); + } + +} +#endif + +_Complex double calcMatrixElement(spinor *field1,spinor *field2,_Complex double mat[144],int praw1[4],int praw2[4], void (*op)(spinor*,spinor*),int diag,int jTo){ + + int j,i; + _Complex double sprod; + _Complex double avg=0.0; + int avgcount=0; + + + for(j=0;j 1e-2) */ +/* printf(" (%5.2f,%5.2f)",creal(sprod),cimag(sprod)); */ +/* else */ +/* printf(" "); */ +/* if(i==11) printf("\n"); */ +/* fflush(stdout); */ + } + else{ + for(i = 0 ;i 1.e-3) + printf(" (%5.2f,%5.2f)",creal(sprod),cimag(sprod)); + else + printf(" "); + fflush(stdout); + + } + printf("\n"); + } + + + } + avg /= (double)avgcount; + return avg; + +} + +
+/**
+ * Computes the 12 eigenvalues of the 12x12 complex matrix mat with the
+ * LAPACK routine zgeev and prints them to stdout, one per line.
+ * No eigenvectors are requested (JOBVL = JOBVR = "N").
+ *
+ * @param mat the 144 matrix entries; zgeev works in place on this array,
+ *            so its contents are overwritten by the call
+ *
+ * Allocation failures and a non-zero zgeev INFO are reported on stderr;
+ * in both cases no eigenvalues are printed.
+ */
+void diagMatrixElement(_Complex double mat[144]){
+
+  const int N=12;
+
+  char JOBVL[]="N";
+  char JOBVR[]="N";
+
+  _Complex double *EVS;
+  _Complex double *WORK;
+  double *RWORK;
+
+  /* length of WORK as announced to zgeev; the buffer allocated below must really be this large */
+  int LWORK=396;
+  /* placeholder for the eigenvector arrays, which are not computed for JOBVL = JOBVR = "N" */
+  _Complex double DUMMY[1];
+
+  int ONE=1;
+  int INFO=0;
+
+  int i;
+
+  EVS=(_Complex double*)malloc(N*sizeof(_Complex double));
+  /* WORK used to hold only 2*N elements although LWORK (396) is handed to zgeev */
+  WORK=(_Complex double*)malloc(LWORK*sizeof(_Complex double));
+  RWORK=(double*)malloc(2*N*sizeof(double));
+
+  if(EVS==NULL || WORK==NULL || RWORK==NULL){
+    fprintf(stderr,"diagMatrixElement: could not allocate workspace\n");
+  }
+  else{
+    _FT(zgeev)(JOBVL, JOBVR,&N,mat,&N,EVS,DUMMY,&ONE,DUMMY,&ONE,WORK,&LWORK,RWORK,&INFO);
+
+    if(INFO!=0){
+      /* EVS is not trustworthy if zgeev reports an error */
+      fprintf(stderr,"diagMatrixElement: zgeev returned info = %d\n",INFO);
+    }
+    else{
+      for( i = 0;i<N;i++)
+        printf(" ev i : %9.2e + %9.2e i \n", creal(EVS[i]), cimag(EVS[i]));
+    }
+  }
+
+  /* free(NULL) is a no-op, so this is safe after a partial allocation failure */
+  free(EVS);
+  free(RWORK);
+  free(WORK);
+
+}
+ + +/** + * creates a list of lattice momenta + * leading to an more equal distribution + * in p~_mu^2 - p~~_mu^2 space + */ + +int * makeEqualPmuMap(int n){ + + /* loop var*/ + int i=0; + /* random numbers*/ + double r[4]; + /* raw lattice 
momentum*/ + int rawp[4]; + + /* we discretise the plane in which the above mentioned distribution is defined */ + /** + * <- # divPmu -> + * *----*----*- ... -* ^ + * | | | | | + * | | | | + * *----*----*- ... -* # + * | | | | d + * | | | | i + * *----*----*- ... -* v + * . . . . P + * . . . . m + * *----*----*- ... -* u + * | | | | T + * | | | | + * *----*----*- ... -* V + */ + + /* # of divisions */ + int divPmu=30; + int divPmuT=30; + + /* size of one division */ + double divSPmu=4./(double)divPmu; + double divSPmuT=16./(double)divPmuT; + + /* ok it works like this */ + /* - first we make a loop over all!!! raw lattice momenta and + * store the number of points that lie in each division + * because it can happen that some divisions are not + * "reachable" by any of the raw lattice momenta + * - then we throw rice seeds randomly into all of the divisions (that are reachable) + * and decide how many samples have to be generated in one division + * - then random lattice momenta can "fall" into the divisions until they "fill" + * one division which is "closed" then for beeing fallen into + */ + + /** + * this contains the number of lattice momenta in each division + * for a full sweep through the lattice + */ + int *possMap; + + /** + * this will contain the desired distribution + */ + int *counts; + + /* calculated map indices from continous values (for adressing)*/ + int iPmu,iPmuT; + /* squared lattice momenta */ + double pmuSq,pmuTSq; + + /* this will contain the final map */ + int *pmuMap; + /*for checking */ + int sum; + /* number of free divisions*/ + int numFreeFields=divPmu*divPmuT; + /* filling degree of pmuMap */ + int numRawPs=0; + /* pointer buffer for counts+i */ + int *pc; + + /* allocate space for the pmuMap */ + pmuMap=(int*)malloc(sizeof(int)*n*4); + + /* allocate space for counts */ + counts=(int*)malloc(divPmu*divPmuT*sizeof(int)); + + /* allocate space for the possibility map */ + possMap=(int*)malloc(divPmu*divPmuT*sizeof(int)); + + + /* 
initilize to 0*/ + for(i=0;i=12 ) fprintf(stderr, "Errorr!!!!!!!!!!!! Pmu Index out of bounds : to large\n"); */ +/* if(iPmuT>=12) fprintf(stderr, "Errorr!!!!!!!!!!!! pmu~ Index out of bounds : to large\n"); */ +/* if(iPmu<0 ) fprintf(stderr, "Errorr!!!!!!!!!!!! Pmu Index out of bounds : to small: %e \n",pmuSq); */ +/* if(iPmuT<0) fprintf(stderr, "Errorr!!!!!!!!!!!! pmu~ Index out of bounds : to small: %e \n",pmuTSq); */ + fflush(stderr); + ++(possMap[iPmu+divPmu*iPmuT]); + }}}} + + printf("Here comes the \"Fish\" (possibility map: \"-\" = possible \"0\" = impossible)\n"); + + for(i=0;i0){ + ++counts[iPmu+divPmu*iPmuT]; + ++i; + } + } + + /* verbosity / check */ + sum=0; + for(i=0;i0){ + + /*create random raw lattice momentum*/ + ranlxd(r,4); + rawp[0]=(int)(r[0]*(double)T); + rawp[1]=(int)(r[1]*(double)LX); + rawp[2]=(int)(r[2]*(double)LY); + rawp[3]=(int)(r[3]*(double)LZ); + + /* calculate squared lattice momenta */ + pmuSq=calcPmuLatticeSq(rawp,T,L); + pmuTSq=calcPmuLatticeTildeSq(rawp,T,L); + +/* printf("pmuSq %f pmuTSq %f \n" , pmuSq, pmuTSq); */ + + /* calculate indices */ + iPmu=(int)floor(pmuSq/divSPmu); + iPmuT=(int)floor(pmuTSq/divSPmuT); + + /* buffer pointer */ + pc=counts+iPmu+divPmu*iPmuT; + /* check bounds */ + /* if( iPmu+divPmu*iPmuT >= divPmu* divPmuT ) fprintf(stderr,"Error index out of bounds\n"); */ + + /* if this field still "needs" a rice seed accept the raw momentum and store it in the array */ + if(*pc>0){ + pmuMap[numRawPs*4+0]=rawp[0]; + pmuMap[numRawPs*4+1]=rawp[1]; + pmuMap[numRawPs*4+2]=rawp[2]; + pmuMap[numRawPs*4+3]=rawp[3]; + ++numRawPs; + /* reduce the desired number of "rice seeds" in this division by one */ + --(*pc); + + /* if the count is 0 now we have one division less to fill */ + if((*pc) == 0 ){ + --numFreeFields; +/* printf(" numFreeFields = %d \n", numFreeFields); */ +/* if(numFreeFields==10||1){ */ +/* for(i=0;i0) + printf(" x"); + else + printf(" "); + if((i +1)% divPmu == 0) + printf("\n"); + } + free(possMap); + +} + 
+/** + * make a completly random map of lattice momenta + */ +int * makeRandomPmuMap(int n){ + + int i=0; + double r[4]; + int rawp[4]; + /* # of divitions */ + int numRawPs=0; + int *pmuMap; + + /* allocate space for the pmuMap */ + pmuMap=(int*)malloc(sizeof(int)*n*4); + + while(numRawPsmu; + g_kappa=optr->kappa; + g_precWS=optr->precWS; + + switch(PRECWSOPERATORSELECT[optr->solver]){ + case PRECWS_D_DAGGER_D: + op_noprec=&Q_pm_psi; + break; + case PRECWS_DOV_DAGGER_DOV: + op_noprec=&Qov_sq_psi; + break; + case PRECWS_DTM: + op_noprec=&D_psi; + break; + case PRECWS_DOV: + op_noprec=&Dov_psi; + break; + default: + op_noprec=NULL; + break; + } + + pmumap=makeEqualPmuMap(numMatrixElements); + /* pmumap=makeRandomPmuMap(50); */ + printRawPMap(pmumap,numMatrixElements); + + for(i =0;i< (int)fmin(numReplaceTheFirst,numMatrixElements);i++){ + pmumap[4*i+0]=replaceTheFirst[4*i+0]; + pmumap[4*i+1]=replaceTheFirst[4*i+1]; + pmumap[4*i+2]=replaceTheFirst[4*i+2]; + pmumap[4*i+3]=replaceTheFirst[4*i+3]; + } + + + fitData=malloc(sizeof(double)*numMatrixElements*3); + + for(i=0;iprecWS->ai[i]=corrRHS[i]/(4.*g_kappa*g_kappa); + optr->precWS->useCorrectionFunc=0; + spinorPrecWS_RecalcDDaggerDEvs(optr->precWS,g_kappa,g_mu/2./g_kappa); + + } + printf("\n"); + + + } + + + free(pmumap); + g_mu=g_mu_save; + g_kappa=g_kappa_save; + g_precWS=g_precWS_save; +} + + +void computeEigenvectorMatrixElementDtm(int rawp[4],void (*op)(spinor*,spinor*),int eps,int k,int color){ + + _Complex double ev; + double sqnorm; + + _Complex double ev_calc; + +/* void eigenvector_Dtm(spinor *spinor,double mu,int epsilon,int k,int color,int rawp[4]){ */ + + eigenvector_Dtm(g_spinor_field[0],g_mu/2./g_kappa,eps,k,color,rawp); + + op(g_spinor_field[1],g_spinor_field[0]); + + ev=scalar_prod(g_spinor_field[0],g_spinor_field[1],VOLUME,0); + + mul(g_spinor_field[2],ev,g_spinor_field[0],VOLUME); + sqnorm=diff_and_square_norm(g_spinor_field[2],g_spinor_field[1],VOLUME); + + +/* _Complex double calcDovEvalue(const 
int *praw,double kappa,double rho,int T,int L,double sign){ */ +/* ev_calc=calcDovEvalue(rawp,g_kappa,1.,T,L,eps); */ + + memcpy(&ev_calc,((spinorPrecWS*)g_precWS)->evs+Index(rawp[0],rawp[1],rawp[2],rawp[3]),sizeof(_Complex double)); + + printf("eigenvalue is %e + %e i (theoretical ev %e + %e i) |(Ax - lambda y)|= %e \n" , creal(ev),cimag(ev),creal(ev_calc),cimag(ev_calc),sqnorm); + + +} + +void computeEigenvectorMatrixElementDDaggerD(int rawp[4],void (*op)(spinor*,spinor*),int k){ + + _Complex double ev; + double sqnorm; + + _Complex double ev_calc; + + planeWave(g_spinor_field[0],k,rawp,T,LX,0); + + op(g_spinor_field[1],g_spinor_field[0]); + + ev=scalar_prod(g_spinor_field[0],g_spinor_field[1],VOLUME,0); + + mul(g_spinor_field[2],ev,g_spinor_field[0],VOLUME); + sqnorm=diff_and_square_norm(g_spinor_field[2],g_spinor_field[1],VOLUME); + + +/* _Complex double calcDovEvalue(const int *praw,double kappa,double rho,int T,int L,double sign){ */ +/* ev_calc=calcDovEvalue(rawp,g_kappa,1.,T,L,eps); */ + + if(g_precWS!=NULL) + memcpy(&ev_calc,((spinorPrecWS*)g_precWS)->evs+Index(rawp[0],rawp[1],rawp[2],rawp[3]),sizeof(_Complex double)); + + printf("eigenvalue is %e + %e i (theoretical ev %e + %e i) |(Ax - lambda y)|= %e (in DdaggerD)\n" , creal(ev),cimag(ev),creal(ev_calc),cimag(ev_calc),sqnorm); + + +} + + + +/** + * make a completly random map of lattice momenta + */ +int * makeDiagFalloffPmuMap(int n,int maxdmanhat){ + + int i=0; + double r[4]; + int rawp[4],drawp[4]; + /* # of divitions */ + int numRawPs=0; + int *pmuMap; + int dmanhat; +/* const int maxdmanhat=10; */ + + FILE *drawpStatFile; + + drawpStatFile=fopen("drawp_stat.csv","w"); + + /* allocate space for the pmuMap */ + pmuMap=(int*)malloc(sizeof(int)*n*8); + + while(numRawPs \n" , pmuMap[8*i+0], pmuMap[8*i+1], pmuMap[8*i+2], pmuMap[8*i+3], pmuMap[8*i+4], pmuMap[8*i+5], pmuMap[8*i+6], pmuMap[8*i+7]); + } + + return pmuMap; + +} + + + +/* void op_invert(const int op_id, const int index_start) { */ +/* operator 
* optr = &operator_list[op_id]; */ +void calculateDiagFalloffElements(const int op_id){ + + double g_mu_save=g_mu; + operator * optr = &operator_list[op_id]; + + spinorPrecWS *g_precWS_save=g_precWS; + int rawp[4],rawp2[4]; + int *pmumap; + int i,j; + void (*op)(spinor*,spinor*); + void (*op_noprec)(spinor*,spinor*); + double frbnorm,diag; + _Complex double matrix[144]; +/* static int epsilon[12]={1,1,1,1,1,1,-1,-1,-1,-1,-1,-1}; */ +/* static int k[12] ={0,0,0,1,1,1,0,0,0,1,1,1}; */ +/* static int color[12] ={0,1,2,0,1,2,0,1,2,0,1,2}; */ + static int numMatrixElements=500; + + +/* int replaceTheFirst[]={ */ +/* 0,0,0,0, */ +/* 0,1,0,0, */ +/* 0,0,1,0, */ +/* 0,1,0,1, */ +/* 1,0,0,1 */ +/* }; */ +/* int numReplaceTheFirst=sizeof(replaceTheFirst)/(4*sizeof(int)); */ + + FILE *num_matrix_elements_file=NULL; + const char *num_matrix_elements_file_name="num_matrix_elements.csv"; + const int readbuflen=512; + char readbuf[readbuflen+1]; + + + FILE *elementsFile=NULL; + char elementsFileName[512]; + + FILE *elementsNormFile=NULL; + char elementsNormFileName[512]; + + sprintf(elementsFileName,"%04d_matrix_elements.csv",nstore); + elementsFile=fopen(elementsFileName, "w"); + + sprintf(elementsNormFileName,"%04d_matrix_elements_norm.csv",nstore); + elementsNormFile=fopen(elementsNormFileName, "w"); + + if(g_precWS==NULL){ + /* we are going to need fft*/ + +#ifdef HAVE_FFTW + loadFFTWWisdom(g_spinor_field[0],g_spinor_field[1],T,LX); +#endif + } + + + printf("trying to open \"%s\" ...",num_matrix_elements_file_name); + num_matrix_elements_file=fopen(num_matrix_elements_file_name,"r"); + printf("[DONE] \n"); + if(num_matrix_elements_file !=(FILE*) NULL ) { + fgets(readbuf,readbuflen,num_matrix_elements_file); + printf("read %s from %s \n",readbuf,num_matrix_elements_file_name); + fflush(stdout); + numMatrixElements=atoi(readbuf); + /* restrict values to reasonable range */ + numMatrixElements=max(numMatrixElements,0); + numMatrixElements=min(numMatrixElements,500); + 
fclose(num_matrix_elements_file); + } + + + g_mu = optr->mu; + + + + g_precWS=optr->precWS; + + + switch(PRECWSOPERATORSELECT[optr->solver]){ + case PRECWS_D_DAGGER_D: + op=&Q_pm_psi_prec; + op_noprec=&Q_pm_psi; +/* fprintf(stdout,"Operator for diag falloff is Q^2\n"); */ + break; + case PRECWS_DOV_DAGGER_DOV: + op=&Qov_sq_psi_prec; + op_noprec=&Qov_sq_psi; + break; + case PRECWS_DTM: + op=&D_psi_prec; + op_noprec=&D_psi; + break; + case PRECWS_DOV: + op=&Dov_psi_prec; + op_noprec=&Dov_psi; + break; + default: + op=NULL; + op_noprec=NULL; + break; + } + + printf("num_matrix_elements = %d\n",numMatrixElements); + + pmumap=makeDiagFalloffPmuMap(numMatrixElements,10); +/* printRawPMap(pmumap,numMatrixElements); */ + +/* for(i =0;i< (int)fmin(numReplaceTheFirst,numMatrixElements);i++){ */ +/* pmumap[4*i+0]=replaceTheFirst[4*i+0]; */ +/* pmumap[4*i+1]=replaceTheFirst[4*i+1]; */ +/* pmumap[4*i+2]=replaceTheFirst[4*i+2]; */ +/* pmumap[4*i+3]=replaceTheFirst[4*i+3]; */ +/* } */ + + + + for(i=0;i - \\delta_p1_p2 = %e \n", frbnorm); + + fprintf(elementsNormFile,"%d %d %d %d %d %d %d %d %d %e\n" , + rawp[0], rawp[1], rawp[2], rawp[3], + rawp2[0], rawp2[1], rawp2[2], rawp2[3], + cyclicDiff(rawp[0],rawp2[0],T) + cyclicDiff(rawp[1],rawp2[1],LX) +cyclicDiff(rawp[2],rawp2[2],LY) +cyclicDiff(rawp[3],rawp2[3],LZ) ,frbnorm); + + fflush(elementsNormFile); + + for(j=0;j<144;j++){ + + fprintf(elementsFile,"(%e;%e) ", creal(matrix[j]),cimag(matrix[j])); + if((j+1)%12==0) fprintf(elementsFile,"\n"); + + } + + fflush(elementsFile); + + + } + + + fclose(elementsFile); + + fclose(elementsNormFile); + + free(pmumap); + g_mu=g_mu_save; + g_precWS=g_precWS_save; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/dirac_operator_eigenvectors.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/dirac_operator_eigenvectors.h new file mode 100644 index 0000000000000000000000000000000000000000..da8f10187b1282762e68d2fa92e339bc7fed2ecb --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/dirac_operator_eigenvectors.h @@ -0,0 +1,290 @@ +/*********************************************************************** + * + * Copyright (C) 2014 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ************************************************************************/ + +#ifndef _DIRAC_EIGENVALUES_H +#define _DIRAC_EIGENVALUES_H + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#ifdef HAVE_FFTW + #include +#endif + +#include +#include "linalg/lapack.h" + +/* some macros for 4d loops */ +#define FORXYZT(t,x,y,z,tt,ll) for(t=0;ty)?x:y) + + +/* precondition types */ +typedef enum tm_operator_ {PRECWS_NO=-1, + PRECWS_DTM, + PRECWS_QTM, + PRECWS_D_DAGGER_D, + PRECWS_DOV, + PRECWS_DOV_DAGGER_DOV +} tm_operator; +/* this is a map telling which preconditioner to use for which solver */ +extern tm_operator PRECWSOPERATORSELECT[14]; + + +/* */ +extern double g_prec_sequence_d_dagger_d[3]; + + +#ifdef HAVE_FFTW + fftw_plan spinor_fftw_plan(spinor *spinor_in,spinor *spinor_out,int tt,int ll,unsigned int forward,int fftw_flags); +#endif + +/* translates a tm_operator value to a human readable string */ +const char* precWSOpToString(tm_operator op); + + +extern void _FT(zgeev)( char* jobvl, char* jobvr, int const * n, _Complex double* a, + int const * lda, _Complex double* w, _Complex double* vl, int* ldvl, _Complex 
double* vr, int* ldvr, + _Complex double* work, int* lwork, double* rwork, int* info ); + +extern void _FT(dposv)( char* jobvl, int const * n,int const * nrhs,double* mat, int const * lda,double *rhs,int const *ldrhs,int const * lapackINfo); + + +/* struct conaining all neccessary information to perform the preconditioning */ +typedef struct spinorPrecWS_{ + /* spinor containing projectors belonging to all eigenvalues with positive imaginary part */ + /* spinor containing projectors belonging to all eigenvalues with positive imaginary part */ + spinor **spinor_up; + + spinor* spinorMemBuff; + + + /* array containing eigenvalues */ + _Complex double *evs; + + /* sinus and cosinus lookup table */ + double *c_table; + double *s_table; + + tm_operator m_op; + + _Complex double averageLambda; + + /* correction function parameters */ + unsigned int useCorrectionFunc; + double ai[4]; + + double precExpo[3]; + +} spinorPrecWS; + + +/* fills the struct above, allocates fields, calculate eigenvalues */ +void spinorPrecWS_Init(spinorPrecWS *ws, double kappa,double mu,double rho,tm_operator op); +/* clean up everything */ +void spinorPrecWS_Free(spinorPrecWS *ws); + + + +/** + *@func computes the spinor structure of the eigenvector with impuls p + *@param fv four vector where to store the result + *@param mu twisted mass parameter + *@param epsilon solution parameter, can be +1 or -1 + *@param k further free solution parameter, can be 0 or 1 + *@param color the color index, can be 0 1 2 + *@param rawp raw lattice momentum (how it goes to the fft), will be converted to the correct lattice momentum internally + *@param tt,ll time and spacial extend + */ +void spinorStructEigenvecDtm(spinor *fv,double mu,int epsilon,int k,int color,int rawp[4],int tt,int ll); +void spinorStructEigenvecQtm(spinor *fv,double kappa,double mu,int epsilon,int k,int color,int rawp[4],int tt,int ll); + + +/** + * the su3 variant pack the different eigenvectors into the color components of the given 
spinor + */ +void spinorStructEigenvecDtmSu3Vector(spinor *fv,double mu,int epsilon,int k,int store_color,int rawp[4],int tt,int ll); +void spinorStructEigenvecQtmSu3Vector(spinor *fv,double kappa,double mu,int epsilon,int k,int store_color,int rawp[4],int tt,int ll); + + +/* calculate a complete treelevel eigenvector for the Wilson-Twisted-Mass Operator */ +void eigenvector_Dtm(spinor *two_spinor,double mu,int epsilon,int k,int color,int rawp[4]); + +/** + * the fanction performing the actual precondition + * this function applies the desired treelevel Dirac operator with an arbitrary (_Complex double) exponent to the given spinor + */ +void spinorPrecondition(spinor *spinor_out,const spinor* spinor_in,spinorPrecWS* ws,int tt,int ll,const _Complex double alpha,unsigned int dagger,unsigned int autofft); + +/** + * creates a plane wave representation in momentum or space time domain depending on + * the parameter momspace + */ +void planeWave(spinor *spinor,int k,int rawp[4],int tt,int ll,unsigned int momspace/* =false */); + +/** + * applies a (half) phase factor to the spinor + * this is neccessary if one wants to calculate fourier transforms with + * half frequencies efficiently + */ +void spinor_mulp_half_phase(spinor *spinor_out,const spinor *spinor_in, + double *c_table,double *s_table, + unsigned forward,double mulp); + +/** + * read and write fftw wisdoms + * this is supposed to speed up things + */ +#ifdef HAVE_FFTW +void writeFFTWWisdom(int tt,int ll); +void loadFFTWWisdom(spinor *spinor_in,spinor *spinor_out,int tt,int ll); +#endif + +/** + * calculate matrix elements of the pre- und unpreconditioned operator + */ +_Complex double calcMatrixElement(spinor* field1,spinor *field2,_Complex double mat[144],int praw1[4],int praw2[4], void (*op)(spinor*,spinor*),int diag,int jTo); +/** + * diagonalizes matrix elements with lapack + */ +void diagMatrixElement(_Complex double mat[144]); + +/** + * calculates the matrix element of the (intended) eigenvector given 
by the parameters + * this is a check if the inteded eigenvalue is realy an eigenvalue + */ +void computeEigenvectorMatrixElementDtm(int rawp[4],void (*op)(spinor*,spinor*),int eps,int k,int color); + +/** + * these functions are for creating raw lattice momenta beeing either equaly distributed in the + * (\hat{p}^2 , \tilde{p}^2 ) plane or in the p^lattice_raw_mu space + */ +int * makeEqualPmuMap(int n); +int * makeRandomPmuMap(int n); +void printRawPMap(int *rawps,int n); + +/** + * calculates random matrix elements and performs a fit + * for the optimal eigenvalue formula of D^dagger D + */ +void fitPrecParams(int op_id); + +void calculateDiagFalloffElements(const int op_id); + +int cyclicDiff(int a,int b, int period); + + + +/** + * some algebraic macros + */ + +#define _exp_complex(/*_Complex double*/ x,/*_Complex double*/ z,/*double*/ dum)\ + x = cexp(z); + +/* res = z^x = exp ( x * ln(z)) */ +#define _pow_complex(/*_Complex double*/ res,/*_Complex double*/ z,/*_Complex double*/ x,/*_Complex double*/ dum)\ + res = cpow(z, x); + +#define _spinor_muleq_real(s,r)\ + (s).s0.c0*=r; \ + (s).s0.c1*=r; \ + (s).s0.c2*=r; \ + (s).s1.c0*=r; \ + (s).s1.c1*=r; \ + (s).s1.c2*=r; \ + (s).s2.c0*=r; \ + (s).s2.c1*=r; \ + (s).s2.c2*=r; \ + (s).s3.c0*=r; \ + (s).s3.c1*=r; \ + (s).s3.c2*=r; \ + +#define _complex_muleq_complex(z1,z2,dum)\ + (z1) *= (z2); + +#define _spinor_muleq_complex(s,c,dum)\ + _complex_muleq_complex((s).s0.c0,c,dum);\ + _complex_muleq_complex((s).s0.c1,c,dum);\ + _complex_muleq_complex((s).s0.c2,c,dum);\ + _complex_muleq_complex((s).s1.c0,c,dum);\ + _complex_muleq_complex((s).s1.c1,c,dum);\ + _complex_muleq_complex((s).s1.c2,c,dum);\ + _complex_muleq_complex((s).s2.c0,c,dum);\ + _complex_muleq_complex((s).s2.c1,c,dum);\ + _complex_muleq_complex((s).s2.c2,c,dum);\ + _complex_muleq_complex((s).s3.c0,c,dum);\ + _complex_muleq_complex((s).s3.c1,c,dum);\ + _complex_muleq_complex((s).s3.c2,c,dum); + + +/* #define _spinor_scalar_prod(proj,a,b)\ */ +/* 
proj.re=_spinor_prod_re(a,b); \ */ +/* proj.im=_spinor_prod_im(a,b); */ + + +#define _spinor_scalar_prod(proj,r,s)\ + (proj) = conj((r).s0.c0) * (s).s0.c0 + \ + conj((r).s0.c1) * (s).s0.c1 + \ + conj((r).s0.c2) * (s).s0.c2 + \ + conj((r).s1.c0) * (s).s1.c0 + \ + conj((r).s1.c1) * (s).s1.c1 + \ + conj((r).s1.c2) * (s).s1.c2 + \ + conj((r).s2.c0) * (s).s2.c0 + \ + conj((r).s2.c1) * (s).s2.c1 + \ + conj((r).s2.c2) * (s).s2.c2 + \ + conj((r).s3.c0) * (s).s3.c0 + \ + conj((r).s3.c1) * (s).s3.c1 + \ + conj((r).s3.c2) * (s).s3.c2; + + +#define PROJECTSPLIT(p_plus,up_plus,col_proj,phi_o,phi_plus,col_phi)\ + p_plus = 0; \ + p_plus += conj(up_plus->s0.col_proj) * (phi_o->s0.col_phi); \ + p_plus += conj(up_plus->s1.col_proj) * (phi_o->s1.col_phi); \ + p_plus += conj(up_plus->s2.col_proj) * (phi_o->s2.col_phi);\ + p_plus += conj(up_plus->s3.col_proj) * (phi_o->s3.col_phi);\ + /* project out from input vector "positive" modes */\ + phi_o->s0.col_phi -= (p_plus) * (up_plus->s0.col_proj); \ + phi_o->s1.col_phi -= (p_plus) * (up_plus->s1.col_proj);\ + phi_o->s2.col_phi -= (p_plus) * (up_plus->s2.col_proj);\ + phi_o->s3.col_phi -= (p_plus) * (up_plus->s3.col_proj);\ + /* buil up vector with "positive projectors" */ \ + phi_plus.s0.col_phi -= (p_plus) * (up_plus->s0.col_proj); \ + phi_plus.s1.col_phi -= (p_plus) * (up_plus->s1.col_proj); \ + phi_plus.s2.col_phi -= (p_plus) * (up_plus->s2.col_proj);\ + phi_plus.s3.col_phi -= (p_plus) * (up_plus->s3.col_proj); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigcg.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigcg.c new file mode 100644 index 0000000000000000000000000000000000000000..2516a0af77657120c2fbbe25044e304974d9d692 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigcg.c @@ -0,0 +1,627 @@ +/*********************************************************************** + * Copyright (C) 2008,2009,2010,2011,2012 + * Andreas Stathopoulos, Kostas Orginos, Abdou M. 
Abdel-Rehim + * + * This program is based on interfacing the eigCG solver to the tmLQCD code. + * It was written by Abdou M. Abdel-Rehim. The original code was written + * by Andreas Stathopoulos and Kostas Orginos and integrated in Chroma. + * In this interface we use functions from tmLQCD. + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + + +/*------------------------------------------------------------------------- + + EIGCG Solve Ax=b by the conjugate gradient method. + At the same time compute smallest abs eigenvalues/vectors of A. + Refs: Golub and Van Loan + Stathopoulos and Orginos + + Matrix A is Hermitian positive definite. It is accessed by + a matrix-vector multiplication function. + + Parameters: + ---------------------------------- + n active problem size as a number of spinor components, The active part of A is an n-by-n spinors + lde physical problem size (as spinors). A and vectors are stored in lde size spinors + note that each spinor is 12 complex components (color and spin) + ---------------------------------- + CG-related parameters + ---------------------------------- + x (IN) the initial guess + (OUT) the computed approximate solution + b (IN) the right hand side of the system + normb (IN/OUT) ||b|| is computed. 
On input, if flag==3, normb=||b|| + eps_sq (IN) error tolerance ||r|| < sqrt(eps_sq)*||b|| (if using relative precision) + OR ||r|| < sqrt(eps_sq) (if using absolute precision) where r is the residual + restart_eps_sq (IN) restart CG when ||r|| < sqrt(restart_eps_sq)*||b-Ax0|| if using relative + precision or ||r|| < sqrt(restart_eps_sq) if using absolute precision. + rel_prec (IN) 0 means use absolute precision, 1 means use relative precision + maxit (IN) maximum number of iterations + reshist (OUT) achieved residual squared value + iter (IN/OUT) number of CG iterations performed in previous restarts (IN) + and previous+current iterations total (OUT) + flag (OUT) exit status (see below) + work (IN/OUT) work array. Must be of size 4*lde >= 4*n + f function that performs matrix-vector multiplication with matrix A + ---------------------------------- + eigen-related parameters + ---------------------------------- + nev (IN) number of eigenvalues to find + v_max (IN) maximum number of basis vectors + V (IN) the basis vectors (lde \times v_max) + (OUT) the first (lde \times nev) contain the Ritz vectors, vector by + vector. Users may then copy them to the desired data structure + esize (IN) size of ework, the eigenwork space: the more the better + N+2*nev <= esize <= (2*nev+1)*N + ework temp work space of size esize + ---------------------------------- + + On exit, if flag is + + 0 then CG converged to the desired tolerance within maxit iterations + + 1 then CG iterated maxit times but did not converge. + + 2 then one of the scalar quantities computed during CG was zero + + 3 then CG stopped because the restarting tolerance (related to initCG) + is satisfied. 
+ + ---------------------------------- + g_debug_level + > 0 prints CG linear system info on exit (num its/ flag) + > 2 prints linear system residuals at every iteration + > 3 information about computed eigenvectors after each rhs in the first phase is printed + ----------------------------------*/ +/***********************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include "linalg_eo.h" +#include "start.h" +#include "solver_field.h" +#include "linalg/blas.h" +#include "linalg/lapack.h" +#include "solver/restart_X.h" +#include "solver/eigcg.h" + +
+/*
+ * Writes a short human readable summary of an eigCG run to stdout.
+ *
+ *   tol      tolerance quoted when the initial guess was already good enough
+ *   maxit    accepted for symmetry with the solver arguments, not used here
+ *   flag     exit status: 0, 1 and 2 get their own text, any other non-zero
+ *            value only gets the generic "stopped" and "iterate" lines
+ *   iter     iteration count that is reported
+ *   resnorm  residual norm that is reported
+ */
+static void displayInfo(float tol,
+                        int maxit,
+                        int flag,
+                        int iter,
+                        float resnorm) {
+
+  if (flag == 0) {
+    /* success: either the start vector sufficed or CG converged */
+    if (iter == 0) {
+      fprintf(stdout,"The initial guess has relative residual %0.2g which is within\nthe desired tolerance %0.2g\n", resnorm, tol);
+    }
+    else {
+      fprintf(stdout,"eigCG converged at iteration %d to a solution with residual norm %0.2g", iter, resnorm);
+    }
+  }
+  else {
+    /* every failure report starts with the same header ... */
+    fprintf(stdout,"eigCG stopped at iteration %d with flag %d. ", iter, flag);
+
+    /* ... followed by the reason, where one is known ... */
+    if (flag == 1) {
+      fputs("\nbecause the maximum number of iterations was reached.", stdout);
+    }
+    else if (flag == 2) {
+      fputs("\nbecause a scalar quantity became too small.", stdout);
+    }
+
+    /* ... and ends with the state of the returned iterate */
+    fprintf(stdout,"\nThe iterate returned at iteration %d has residual norm %0.2g",iter,resnorm);
+  }
+
+  fputs("\n", stdout);
+  fflush(stdout);
+
+}
+ + +void eigcg(int n, int lde, spinor * const x, spinor * const b, double *normb, + const double eps_sq, double restart_eps_sq, const int rel_prec, int maxit, int *iter, + double *reshist, int *flag, spinor **work, matrix_mult f, + int nev, int v_max, spinor *V, int esize, _Complex double *ework) +{ + double tolb; + double alpha, beta; /* CG scalars */ + double rho, rhoprev; + double pAp; + int it; /* 
current iteration number */ + int i, j; /* loop variables */ + int zs,ds,tmpsize; + spinor *r, *p, *Ap; /* ptrs in work for CG vectors */ + _Complex double tempz; /* double precision complex temp var */ + double tempd; /* double temp var */ + int tempi; /* int temp var */ + int ONE = 1; /* var for passing 1 into BLAS routines */ + /*---------------------------------------------------------------------- + Eigen variables and setup + ----------------------------------------------------------------------*/ + /* Some constants */ + char cR = 'R'; char cL = 'L'; char cN ='N'; + char cV = 'V'; char cU = 'U'; char cC ='C'; + double betaprev, alphaprev; /* remember the previous iterations scalars */ + int v_size; /* tracks the size of V */ + int lwork = 3*v_max; /* the size of zwork */ + spinor *Ap_prev; + void *_h; + _Complex double *H; /* the V'AV projection matrix */ + void *_hevecs; + _Complex double *Hevecs; /* the eigenvectors of H */ + void *_hevecsold; + _Complex double *Hevecsold; /* the eigenvectors of H(v_max-1,v_max-1) */ + void *_hevals; + double *Hevals; /* the eigenvalues of H */ + void *_hevalsold; + double *Hevalsold; /* the eigenvalues of H(m-1,m-1) */ + void *_tau; + _Complex double *TAU; + void *_zwork; + _Complex double *zwork; /* double complex work array needed by zheev */ + void *_rwork; + double *rwork; /* double work array needed by zheev */ + + int parallel; + + double tmpd; + _Complex double tmpz; + + zs = sizeof(_Complex double); + ds = sizeof(double); + + int info, allelems = v_max*v_max; + +#ifdef MPI + parallel=1; +#else + parallel=0; +#endif + + if(nev > 0) /*allocate memory only if eigenvalues will be used */ + { + #if (defined SSE || defined SSE2 || defined SSE3) + if ((_h = calloc(v_max*v_max+ALIGN_BASE,zs)) == NULL) + { + if( g_proc_id == g_stdio_proc) + {fprintf(stderr,"ERROR Could not allocate H\n"); exit(1);} + } + else + H = (_Complex double *)(((unsigned long int)(_h)+ALIGN_BASE)&~ALIGN_BASE); + + + if ((_hevecs = 
calloc(v_max*v_max+ALIGN_BASE,zs)) == NULL) + { + if( g_proc_id == g_stdio_proc ) + {fprintf(stderr, "ERROR Could not allocate Hevecs\n"); exit(1);} + }else + Hevecs = (_Complex double *)(((unsigned long int)(_hevecs)+ALIGN_BASE)&~ALIGN_BASE); + + if ((_hevecsold = calloc(v_max*v_max+ALIGN_BASE,zs)) == NULL) + { + if( g_proc_id == g_stdio_proc ) + {fprintf(stderr, "ERROR Could not allocate Hevecsold\n"); exit(1);} + }else + Hevecsold = (_Complex double *)(((unsigned long int)(_hevecsold)+ALIGN_BASE)&~ALIGN_BASE); + + if ((_hevals = calloc(v_max+ALIGN_BASE,ds)) == NULL) + { + if( g_proc_id == g_stdio_proc) + {fprintf(stderr, "ERROR Could not allocate Hevals\n"); exit(1);} + + }else + Hevals = (double *)(((unsigned long int)(_hevals)+ALIGN_BASE)&~ALIGN_BASE); + + if ((_hevalsold = calloc(v_max+ALIGN_BASE,ds)) == NULL) + { + if( g_proc_id == g_stdio_proc) + {fprintf(stderr, "ERROR Could not allocate Hevalsold\n"); exit(1); } + + }else + Hevalsold = (double *)(((unsigned long int)(_hevalsold)+ALIGN_BASE)&~ALIGN_BASE); + + if ((_tau = calloc(2*nev+ALIGN_BASE,zs)) == NULL) + { + if( g_proc_id == g_stdio_proc ) + {fprintf(stderr, "ERROR Could not allocate TAU\n"); exit(1); } + + }else + TAU = (_Complex double *)(((unsigned long int)(_tau)+ALIGN_BASE)&~ALIGN_BASE); + + if ((_zwork = calloc(lwork+ALIGN_BASE,zs)) == NULL) + { + if( g_proc_id == g_stdio_proc) + {fprintf(stderr, "ERROR Could not allocate zwork\n"); exit(1);} + + }else + zwork = (_Complex double *)(((unsigned long int)(_zwork)+ALIGN_BASE)&~ALIGN_BASE); + + if ((_rwork = calloc(3*v_max+ALIGN_BASE,ds)) == NULL) + { + if( g_proc_id == g_stdio_proc) + {fprintf(stderr, "ERROR Could not allocate rwork\n"); exit(1);} + + }else + rwork = (double *)(((unsigned long int)(_rwork)+ALIGN_BASE)&~ALIGN_BASE); + + #else + + if ((H = (_Complex double *) calloc(v_max*v_max, zs)) == NULL) + { + if( g_proc_id == g_stdio_proc) + {fprintf(stderr, "ERROR Could not allocate H\n"); exit(1);} + } + + if ((Hevecs = (_Complex double *) 
calloc(v_max*v_max, zs)) == NULL) + { + if( g_proc_id == g_stdio_proc ) + {fprintf(stderr, "ERROR Could not allocate Hevecs\n"); exit(1);} + } + + if ((Hevecsold = (_Complex double *) calloc(v_max*v_max, zs)) == NULL) + { + if( g_proc_id == g_stdio_proc ) + {fprintf(stderr, "ERROR Could not allocate Hevecsold\n"); exit(1);} + } + + if ((Hevals = (double *) calloc(v_max, ds)) == NULL) + { + if( g_proc_id == g_stdio_proc) + {fprintf(stderr, "ERROR Could not allocate Hevals\n"); exit(1);} + } + + + if ((Hevalsold = (double *) calloc(v_max, ds)) == NULL) + { + if( g_proc_id == g_stdio_proc) + {fprintf(stderr, "ERROR Could not allocate Hevalsold\n"); exit(1); } + } + + + if ((TAU = (_Complex double *) calloc(2*nev, zs)) == NULL) + { + if( g_proc_id == g_stdio_proc ) + {fprintf(stderr, "ERROR Could not allocate TAU\n"); exit(1); } + + } + + + if ((zwork = (_Complex double *) calloc(lwork, zs)) == NULL) + { + if( g_proc_id == g_stdio_proc) + {fprintf(stderr, "ERROR Could not allocate zwork\n"); exit(1);} + + } + + if ((rwork = (double *) calloc(3*v_max, ds)) == NULL) + { + if( g_proc_id == g_stdio_proc) + {fprintf(stderr, "ERROR Could not allocate rwork\n"); exit(1);} + + } + + #endif + } /* end if (nev > 0) */ + + /*----------------------------------------------------------------------*/ + + /* setup pointers into work */ + r = work[0]; + p = work[1]; + Ap = work[2]; + Ap_prev = work[3]; + + + + /*-------------------------------------------------------------------- + Initialization phase + --------------------------------------------------------------------*/ + + if (*flag != 3) + { + + /* If flag == 3, the eigCG is called after restart with the same b + * whose norm is already known in normb, so no need for these */ + + tempd = square_norm(b,n,parallel); /* Norm of rhs, b */ + *normb = sqrt(tempd); + + /* If right hand side is zero return zero solution. 
ITER stays the same */ + if (*normb == 0.0) + { + for (i=0; i 0 && g_proc_id == g_stdio_proc) + displayInfo(eps_sq,maxit,*flag,*iter,*reshist); + return; + } + + } + + /* Set up for the method */ + *flag = 1; + tolb = eps_sq * (*normb)*(*normb); /* Relative to b tolerance */ + + /* Zero-th residual: r = b - A*x */ + f(r,x); + diff(r,b,r,n); + + rho = 0.0; + alpha = 1.0; + beta = 0.0; + v_size = 0; + + double reshist_init=square_norm(r,n,parallel); + + //if( g_proc_id == g_stdio_proc ) + //fprintf(stdout, "reshist init %f\n", reshist_init); + + /*-------------------------------------------------------------------- + main CG loop + --------------------------------------------------------------------*/ + for (it = 0; it < maxit; it++) { + + rhoprev = rho; + rho=square_norm(r,n,parallel); + *reshist = rho; + if ( (g_debug_level > 2) && (g_proc_id == g_stdio_proc) ) + { fprintf(stdout, " Linsys res( %d ): %g\n",*iter+it,*reshist); fflush(stdout); } + + /* Convergence test */ + if ( ( (*reshist < eps_sq) && (rel_prec==0) ) || ( (*reshist < eps_sq*(*normb)*(*normb)) && (rel_prec ==1 ) ) ) + { + *flag = 0; + break; /* break do not return */ + } + + /* Restart test */ + if(nev==0) + { + if (*reshist < (restart_eps_sq*reshist_init) ) + { + *flag = 3; + break; /* break do not return */ + } + } + + if (it == 0) + assign(p,r,n); + else { + betaprev = beta; + beta = rho / rhoprev; + if (beta == 0.0) { + *flag = 2; + break; + } + assign_mul_add_r(p,beta,r,n); /* p = beta*p + r */ + } + + /*----- eigCG specific code -------------------------------------------*/ + /* Remember Ap from previous iteration to be used at restart */ + if (nev > 0 && v_size == v_max) + assign(Ap_prev,Ap,n); + /*---------------------------------------------------------------------*/ + + f(Ap,p); + + /*----- eigCG specific code -------------------------------------------*/ + if (nev > 0) { + /* record the diagonal vAv for the previous vector */ + if (it > 0) { + H[(v_size-1)*v_max+v_size-1]= 1.0/alpha + 
betaprev/alphaprev; + //H[(v_size-1)*v_max+v_size-1].im = 0.0; + } + + /* Restarting V */ + if (v_size == v_max) { + /* Solve (v_max) and (v_max-1) eigenproblems */ + tempi = v_max; + allelems=v_max*v_max; + _FT(zcopy)(&allelems, H, &ONE, Hevecs, &ONE); + _FT(zheev)(&cV,&cU,&tempi,Hevecs,&v_max,Hevals,zwork,&lwork,rwork,&info,1,1); + if( (info != 0 ) && (g_proc_id==g_stdio_proc)) + {fprintf(stderr, "Error: ZHEEV in eigcg at v_max step, info %d\n",info); exit(1);} + + tempi = v_max-1; + _FT(zcopy)(&allelems, H, &ONE, Hevecsold, &ONE); + _FT(zheev)(&cV,&cU,&tempi,Hevecsold,&v_max,Hevalsold,zwork,&lwork,rwork,&info,1,1); + + if( (info != 0 ) && (g_proc_id==g_stdio_proc)) + {fprintf(stderr, "Error: ZHEEV in eigcg at (v_max-1) step, info %d\n",info); exit(1);} + + + /* fill 0s in vmax-th elem of oldevecs to match Hevecs */ + for(i=1; i <= v_max ; i++) + {Hevecsold[i*v_max-1] = 0.0 ;} + + /* Attach the first nev oldevecs at the end of the nev latest ones */ + tempi = nev*v_max; + _FT(zcopy)(&tempi,Hevecsold,&ONE,&Hevecs[tempi],&ONE); + + /* Orthogonalize the 2*nev (new+old) vectors Hevecs=QR */ + v_size = 2*nev; + _FT(zgeqrf)(&v_max,&v_size,Hevecs,&v_max,TAU,zwork,&lwork,&info) ; + + if( (info != 0 ) && (g_proc_id==g_stdio_proc)) + {fprintf(stderr, "Error: ZGEQRF in eigcg info %d\n",info); exit(1);} + + /* use as a temp space Hevecsold = Q^THQ */ + _FT(zcopy)(&allelems,H,&ONE,Hevecsold,&ONE); + _FT(zunmqr)(&cR,&cN,&v_max,&v_max,&v_size,Hevecs,&v_max, + TAU,Hevecsold,&v_max,zwork,&lwork,&info); + + if( (info != 0 ) && (g_proc_id==g_stdio_proc)) + {fprintf(stderr, "Error: ZGEQRF call 1 in eigcg info %d\n",info); exit(1);} + + _FT(zunmqr)(&cL,&cC,&v_max,&v_size,&v_size,Hevecs,&v_max, + TAU,Hevecsold,&v_max,zwork,&lwork,&info); + + if( (info != 0 ) && (g_proc_id==g_stdio_proc)) + {fprintf(stderr, "Error: ZGEQRF call 2 in eigcg info %d\n",info); exit(1);} + + /* solve the small Hevecsold v_size x v_size eigenproblem */ + _FT(zheev)(&cV,&cU,&v_size,Hevecsold,&v_max,Hevals, 
zwork,&lwork,rwork,&info,1,1); + if( (info != 0 ) && (g_proc_id==g_stdio_proc)) + {fprintf(stderr, "Error: ZHEEV in eigcg info %d\n",info); exit(1);} + + + + /* zero out unused part of eigenectors in Hevecsold */ + tempi = 0; + for(i = 0; i < v_size; i++ ) + { + for(j = v_size; j < v_max; j++) + {Hevecsold[tempi + j]=0.0;} + tempi += v_max; + + } + + + /* Compute the Hevecsold = Hevecs*Hevecsold */ + _FT(zunmqr)(&cL,&cN,&v_max,&v_size,&v_size,Hevecs,&v_max, + TAU,Hevecsold,&v_max,zwork,&lwork,&info); + + + if( (info != 0 ) && (g_proc_id==g_stdio_proc)) + {fprintf(stderr, "Error: ZUNMQR, info %d\n",info); exit(1);} + + + /* Restart V = V(n,v_max)*Hevecsold(v_max,v_size) */ + Zrestart_X((_Complex double *) V, 12*lde, Hevecsold, 12*n, v_max, v_size, ework, esize); + + /* Restart H = diag(Hevals) plus a column and a row */ + for (i = 0; i < allelems; i++ ) {H[i] = 0.0; } + for (i = 0; i < v_size; i++) H[i*(v_max+1)]= Hevals[i]; + + + + /* The next residual to be added (v = r/sqrt(rho)) + * needs the (nev+1)-th column and row, through V(:,1:vs)'*A*v. 
+ * Instead of a matvec, we use the Ap and Ap_prev to obtain this: + * V(:,1:vs)'*A*V(:,vs+1) = V(:,1:vs)'*A*r/sqrt(rho) = + * V'(A(p-beta*p_prev))/sqrt(rho) = V'(Ap - beta*Ap_prev)/sqrt(rho)*/ + + tmpd=-beta; + assign_mul_add_r(Ap_prev,tmpd,Ap,n); /* Ap_prev=Ap-beta*Ap_prev */ + + tempi=v_size*v_max; + for (i=0; i 0) + { + H[(v_size-1)*v_max + v_size]= -sqrt(beta)/alpha; + H[v_size*v_max + v_size-1] = creal(H[(v_size-1)*v_max + v_size]); + } + + } /* of else */ + /* Augment V with the current CG residual r normalized by sqrt(rho) */ + + tmpd=1.0/sqrt(rho); + mul_r(&V[v_size*lde],tmpd,r,n); + v_size++; + } /* end of if nev >0 , ie., the eigCG specific code */ + /*---------------------------------------------------------------------*/ + + /* pAp = p' * Ap */ + tempz=scalar_prod(p,Ap,n,parallel); + pAp = creal(tempz); + if (pAp == 0.0) { + *flag = 2; + break; + } + + alphaprev = alpha; + alpha = rho / pAp; + + assign_add_mul_r(x,p,alpha,n); /*update x*/ + tmpd=-alpha; + assign_add_mul_r(r,Ap,tmpd,n); /*update r*/ + + //next line useful for debugging + //printf("%d beta, alpha, rho, pAp %le %le %le %le\n",it,beta,alpha,rho,pAp); + } /* for it = 0 : maxit-1 */ + + *iter = *iter + it+1; /* record the number of CG iterations plus any older */ + if( g_proc_id == g_stdio_proc && g_debug_level > 0) + displayInfo(eps_sq,maxit,*flag,*iter-1,*reshist); + + + if(nev > 0 ) + { + #if (defined SSE || defined SSE2 || defined SSE3) + H= NULL; + free(_h); + Hevecs=NULL; + free(_hevecs); + Hevecsold=NULL; + free(_hevecsold); + Hevals=NULL; + free(_hevals); + Hevalsold=NULL; + free(_hevalsold); + TAU=NULL; + free(_tau); + zwork=NULL; + free(_zwork); + rwork=NULL; + free(_rwork); + #else + free(H); + free(Hevecs); + free(Hevecsold); + free(Hevals); + free(Hevalsold); + free(TAU); + free(zwork); + free(rwork); + #endif + } + + return; +} +/* end of EIGPCG ************************************************************/ + + diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigcg.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigcg.h new file mode 100644 index 0000000000000000000000000000000000000000..33761c1d2cf4c4e0e0863ab80b5152765c32d369 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigcg.h @@ -0,0 +1,39 @@ +/**************************************************************************** + * Copyright (C) 2008,2009,2010,2011,2012 + * Andreas Stathopoulos, Kostas Orginos, Abdou M. Abdel-Rehim + * + * This program is based on interfacing the eigCG solver to the tmLQCD code. + * It was written by Abdou M. Abdel-Rehim. The original code was written + * by Andreas Stathopoulos and Kostas Orginos and integrated in Chroma. + * In this interface we use functions from tmLQCD. + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ******************************************************************************/ + +#ifndef _EIGCG_H +#define _EIGCG_H + +#include "su3.h" +#include "solver/matrix_mult_typedef.h" + + +/* eigCG solver entry point; the implementation is in solver/eigcg.c. + * n : vector length handed to the linear algebra routines + * lde : stride between consecutive vectors stored in V + * x : solution iterate, updated in place + * b : right hand side + * normb : norm of b; recomputed unless *flag == 3 on entry + * eps_sq : bound on the squared residual (scaled by normb^2 when rel_prec == 1) + * restart_eps_sq : with nev == 0, stop with *flag = 3 once the squared residual + * drops below restart_eps_sq times its initial value + * maxit : maximum number of iterations of this call + * iter : running iteration counter, increased by this call + * reshist : last squared residual + * flag : on exit 0 converged, 1 maxit reached, 2 a scalar became zero, 3 restart + * work : CG work vectors; work[0] to work[3] are used + * f : operator applied as f(out, in) + * nev, v_max : number of eigenvectors and maximal basis size; nev == 0 skips the eigen part + * V, esize, ework : basis storage and workspace handed on to Zrestart_X + */ +void eigcg(int n, int lde, spinor * const x, spinor * const b, double *normb, const double eps_sq, + double restart_eps_sq, const int rel_prec, int maxit, int *iter, double *reshist, int *flag, + spinor **work, matrix_mult f, int nev, int v_max, spinor *V, int esize, _Complex double *ework); + + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues.c new file mode 100644 index 0000000000000000000000000000000000000000..1f81444b2109f6db9543f4cdf5c5f3014f1fc1fd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues.c @@ -0,0 +1,332 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * Here we compute the nr_of_eigenvalues lowest eigenvalues + * of (gamma5*D)^2. Therefore we use the arnoldi routines. 
+ * + * The computed eigenvalues are stored in eigenvls + * and the computed eigenvectors in eigenvectors + * + * inout: + * nr_of_eigenvalues: input: Number of eigenvalues to compute + * output: Number of computed eigenvalues + * + * Author: Carsten Urbach + * + **************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include +#include +#include +#include +#include +#include "operator/tm_operators.h" +#include "solver/solver.h" +#include "solver/jdher.h" +#include "solver/matrix_mult_typedef.h" +#include "linalg_eo.h" +#include "operator/Dov_psi.h" +#include "eigenvalues.h" +#include "gettime.h" + +spinor *eigenvectors = NULL; +double * eigenvls = NULL; +double max_eigenvalue; +double * inv_eigenvls = NULL; +int eigenvalues_for_cg_computed = 0; +int no_eigenvalues, evlength; + +/* the following two are needed for the overlap */ +double ev_minev=-1., ev_qnorm=-1.; + +/* Compute (or read back) eigenvalues and eigenvectors with the Jacobi-Davidson routine jdher. + * nr_of_eigenvalues : in: number requested, out: number obtained + * max_iterations : iteration limit handed to jdher + * precision : requested precision, values below 1.e-14 are replaced by 1.e-14 + * maxmin : non-zero selects JD_MAXIMAL, zero JD_MINIMAL + * readwrite : non-zero: try to read eigenvectors from file first; + * 2: read the eigenvalues from file instead of computing them; + * 1: write the eigenvectors afterwards + * nstore : number used in the file names + * even_odd_flag : non-zero: Qtm_pm_psi on half the volume, zero: Q_pm_psi on the full volume + * Returns eigenvls[0], or 0 when compiled without HAVE_LAPACK. */ +double eigenvalues(int * nr_of_eigenvalues, const int max_iterations, + const double precision, const int maxmin, + const int readwrite, const int nstore, + const int even_odd_flag) { + double returnvalue = 0.; + _Complex double norm2; +#ifdef HAVE_LAPACK + static spinor * eigenvectors_ = NULL; + static int allocated = 0; + char filename[200]; + FILE * ofs; + double atime, etime; + + /********************** + * For Jacobi-Davidson + **********************/ + int verbosity = g_debug_level, converged = 0, blocksize = 1, blockwise = 0; + int solver_it_max = 50, j_max, j_min, ii, jj; + /*int it_max = 10000;*/ + /* _Complex double *eigv_ = NULL, *eigv; */ + double decay_min = 1.7, decay_max = 1.5, prec, + threshold_min = 1.e-3, threshold_max = 5.e-2; + + /* static int v0dim = 0; */ + int v0dim = 0; + matrix_mult f; + int N = (VOLUME)/2, N2 = (VOLUMEPLUSRAND)/2; + spinor * max_eigenvector_ = NULL, * max_eigenvector; + + /********************** + * General variables + 
**********************/ + int returncode=0; + int returncode2=0; + + char eigenvector_prefix[512]; + char eigenvalue_prefix[512]; + + + no_eigenvalues = *nr_of_eigenvalues; + + sprintf(eigenvector_prefix,"eigenvector.%%s.%%.2d.%%.4d"); + sprintf(eigenvalue_prefix,"eigenvalues.%%s.%%.4d"); + + if(!even_odd_flag) { + N = VOLUME; + N2 = VOLUMEPLUSRAND; + f = &Q_pm_psi; + } + else { + f = &Qtm_pm_psi; + } + evlength = N2; + if(g_proc_id == g_stdio_proc && g_debug_level >0) { + printf("Number of %s eigenvalues to compute = %d\n", + maxmin ? "maximal" : "minimal",(*nr_of_eigenvalues)); + printf("Using Jacobi-Davidson method! \n"); + } + + if((*nr_of_eigenvalues) < 8){ + j_max = 15; + j_min = 8; + } + else{ + j_max = 2*(*nr_of_eigenvalues); + j_min = *nr_of_eigenvalues; + } + if(precision < 1.e-14){ + prec = 1.e-14; + } + else{ + prec = precision; + } +#if (defined SSE || defined SSE2 || defined SSE3) + max_eigenvector_ = calloc(N2+1, sizeof(spinor)); + max_eigenvector = (spinor *)(((unsigned long int)(max_eigenvector_)+ALIGN_BASE)&~ALIGN_BASE); +#else + max_eigenvector_= calloc(N2, sizeof(spinor)); + max_eigenvector = max_eigenvector_; +#endif + if(max_eigenvector_ == NULL) { + fprintf(stderr, "ERROR Could not allocate max_eigenvector in eigenvalues\n"); + exit(1); + } + + if(allocated == 0) { + allocated = 1; +#if (defined SSE || defined SSE2 || defined SSE3) + eigenvectors_ = calloc(N2*(*nr_of_eigenvalues)+1, sizeof(spinor)); + eigenvectors = (spinor *)(((unsigned long int)(eigenvectors_)+ALIGN_BASE)&~ALIGN_BASE); +#else + eigenvectors_= calloc(N2*(*nr_of_eigenvalues), sizeof(spinor)); + eigenvectors = eigenvectors_; +#endif + eigenvls = (double*)malloc((*nr_of_eigenvalues)*sizeof(double)); + inv_eigenvls = (double*)malloc((*nr_of_eigenvalues)*sizeof(double)); + if(eigenvectors_ == NULL || eigenvls == NULL || inv_eigenvls == NULL) { + fprintf(stderr, "ERROR Could not allocate eigenvector storage in eigenvalues\n"); + exit(1); + } + } + + solver_it_max = 50; + /* compute the maximal one first */ + jdher(N*sizeof(spinor)/sizeof(_Complex double), N2*sizeof(spinor)/sizeof(_Complex double), + 50., 1.e-12, + 1, 15, 8, max_iterations, 1, 0, 0, NULL, + CG, solver_it_max, + threshold_max, decay_max, verbosity, + &converged, (_Complex double*) max_eigenvector, (double*) 
&max_eigenvalue, + &returncode2, JD_MAXIMAL, 1, + f); + + if(readwrite) { + if(even_odd_flag){ + for(v0dim = 0; v0dim < (*nr_of_eigenvalues); v0dim++) { + sprintf(filename, eigenvector_prefix , maxmin ? "max" : "min", v0dim, nstore); + if((read_eospinor(&eigenvectors[v0dim*N2], filename)) != 0) { + break; + } + } + } else { + FILE *testfile; + spinor *s; + double sqnorm; + for(v0dim = 0; v0dim < (*nr_of_eigenvalues); v0dim++) { + sprintf(filename, eigenvector_prefix, maxmin ? "max" : "min", v0dim, nstore); + + printf("reading eigenvectors ... "); + testfile=fopen(filename,"r"); + if( testfile != NULL){ + fclose(testfile); + s=(spinor*)&eigenvectors[v0dim*N2]; + read_spinor(s,NULL, filename,0); + sqnorm=square_norm(s,VOLUME,1); + printf(" has | |^2 = %e \n",sqnorm); + + } else { + printf(" no more eigenvectors \n"); + break; + } + } + } + } + + if(readwrite != 2) { + atime = gettime(); + + /* (re-) compute minimal eigenvalues */ + converged = 0; + solver_it_max = 200; + + if(maxmin) + jdher(N*sizeof(spinor)/sizeof(_Complex double), N2*sizeof(spinor)/sizeof(_Complex double), + 50., prec, + (*nr_of_eigenvalues), j_max, j_min, + max_iterations, blocksize, blockwise, v0dim, (_Complex double*) eigenvectors, + CG, solver_it_max, + threshold_max, decay_max, verbosity, + &converged, (_Complex double*) eigenvectors, eigenvls, + &returncode, JD_MAXIMAL, 1, + f); + else + jdher(N*sizeof(spinor)/sizeof(_Complex double), N2*sizeof(spinor)/sizeof(_Complex double), + 0., prec, + (*nr_of_eigenvalues), j_max, j_min, + max_iterations, blocksize, blockwise, v0dim, (_Complex double*) eigenvectors, + CG, solver_it_max, + threshold_min, decay_min, verbosity, + &converged, (_Complex double*) eigenvectors, eigenvls, + &returncode, JD_MINIMAL, 1, + f); + + etime = gettime(); + if(g_proc_id == 0) { + printf("Eigenvalues computed in %e sec. (gettime)\n", etime-atime); + } + } + else { + sprintf(filename, eigenvalue_prefix, maxmin ? 
"max" : "min", nstore); + /* converged counts the entries that were really read from the file */ + converged = 0; + if((ofs = fopen(filename, "r")) != (FILE*) NULL) { + for(v0dim = 0; v0dim < (*nr_of_eigenvalues); v0dim++) { + /* ii only receives the index column; the loop counter stays untouched */ + if(fscanf(ofs, "%d %lf\n", &ii, &eigenvls[v0dim]) != 2) break; + converged = v0dim + 1; + } + fclose(ofs); + } + } + + (*nr_of_eigenvalues) = converged; + no_eigenvalues = converged; + /* with nothing converged there is no last eigenvalue to take */ + if(converged > 0) ev_minev = eigenvls[converged-1]; + eigenvalues_for_cg_computed = converged; + + for (ii = 0; ii < (*nr_of_eigenvalues); ii++){ + for (jj = 0; jj <= ii; jj++){ + norm2 = scalar_prod(&(eigenvectors[ii*N2]),&(eigenvectors[jj*N2]), VOLUME, 1); + if(ii==jj){ + if((fabs(1.-creal(norm2))>1e-12) || (fabs(cimag(norm2))>1e-12) || 1) { + if(g_proc_id == g_stdio_proc){ + printf("< %d | %d> =\t %e +i * %e \n", ii+1, jj+1, creal(norm2), cimag(norm2)); + fflush(stdout); + } + } + } + else{ + if((fabs(creal(norm2))>1e-12) || (fabs(cimag(norm2))>1e-12) || 1) { + if(g_proc_id == g_stdio_proc){ + printf("< %d | %d> =\t %e +i * %e \n", ii+1, jj+1, creal(norm2), cimag(norm2)); + fflush(stdout); + } + } + } + } + } + + + if(readwrite == 1 ) { + if(even_odd_flag) + for(v0dim = 0; v0dim < (*nr_of_eigenvalues); v0dim++) { + sprintf(filename, eigenvector_prefix, maxmin ? "max" : "min", v0dim, nstore); + if((write_eospinor(&eigenvectors[v0dim*N2], filename, eigenvls[v0dim], prec, nstore)) != 0) { + break; + } + } + else{ + WRITER *writer=NULL; + spinor *s; + double sqnorm; + paramsPropagatorFormat *propagatorFormat = NULL; + + for(v0dim = 0; v0dim < (*nr_of_eigenvalues); v0dim++) { + sprintf(filename, eigenvector_prefix, maxmin ? 
"max" : "min", v0dim, nstore); + + construct_writer(&writer, filename, 0); + /* todo write propagator format */ + propagatorFormat = construct_paramsPropagatorFormat(64, 1); + write_propagator_format(writer, propagatorFormat); + free(propagatorFormat); + + + s=(spinor*)&eigenvectors[v0dim*N2]; + write_spinor(writer, &s,NULL, 1, 64); + destruct_writer(writer); + writer=NULL; + sqnorm=square_norm(s,VOLUME,1); + printf(" wrote eigenvector | |^2 = %e \n",sqnorm); + + + } + } + } + if(g_proc_id == 0 && readwrite != 2) { + sprintf(filename, eigenvalue_prefix , maxmin ? "max" : "min", nstore); + ofs = fopen(filename, "w"); + if(ofs != NULL) { + for(v0dim = 0; v0dim < (*nr_of_eigenvalues); v0dim++) { + fprintf(ofs, "%d %e\n", v0dim, eigenvls[v0dim]); + } + fclose(ofs); + } + else { + fprintf(stderr, "Could not open %s for writing the eigenvalues\n", filename); + } + } + for(v0dim = 0; v0dim < converged; v0dim++) { + inv_eigenvls[v0dim] = 1./eigenvls[v0dim]; + } + + ev_qnorm=1.0/(sqrt(max_eigenvalue)+0.1); + ev_minev*=ev_qnorm*ev_qnorm; + /* ov_n_cheby is initialized in Dov_psi.c */ + returnvalue=eigenvls[0]; + free(max_eigenvector_); +#else + fprintf(stderr, "lapack not available, so JD method for EV computation not available \n"); +#endif + return(returnvalue); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues.h new file mode 100644 index 0000000000000000000000000000000000000000..ee6165057277745dfccdfa9ee5ec4e277499ef25 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues.h @@ -0,0 +1,37 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _EIGENVALUES_H +#define _EIGENVALUES_H + +/* Globals defined in eigenvalues.c and filled by eigenvalues(). + * NOTE(review): spinor is used here but no header declaring it is included + * in this file; presumably su3.h has to be included first - confirm. */ +extern spinor * eigenvectors; /* eigenvector storage, evlength spinors per vector */ +extern double * eigenvls; /* eigenvalues */ +extern double * inv_eigenvls; /* 1/eigenvls[i] for the converged ones */ +extern int eigenvalues_for_cg_computed; /* set to the number of converged eigenvalues */ +extern int no_eigenvalues; /* number of eigenvalues requested, then converged */ +extern int evlength; /* stride between eigenvectors, in spinors */ + +/* the following two are needed for the overlap */ +extern double ev_qnorm, ev_minev; + +/* Computes or reads eigenpairs; *nr_of_eigenvalues is updated to the number + * obtained and eigenvls[0] is returned. */ +double eigenvalues(int * nr_of_eigenvalues, const int max_iterations, + const double precision, const int maxmin, + const int readwrite, const int nstore, + const int even_odd_flag); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues_Jacobi.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues_Jacobi.c new file mode 100644 index 0000000000000000000000000000000000000000..a61f3deecce7e7558c8f65382f3f8badf4b15a5c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues_Jacobi.c @@ -0,0 +1,225 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +/* ************************************************************************ + * Main routine for the LapH_ev program: computes eigensystem of the Laplacian operator. + * Authors: Luigi Scorzato, Marco Cristoforetti + * + **************************************************************************/ +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include +#include +#include +#include +#include +#include "jacobi.h" +#include "solver/solver.h" +#include "solver/jdher_su3vect.h" +#include "solver/matrix_mult_typedef.h" +#include "linalg_eo.h" +#include "eigenvalues_Jacobi.h" +#include "gettime.h" + +#ifdef WITHLAPH + +su3_vector *eigenvectors_su3v = NULL; +double *eigenvls_su3v = NULL; +double max_eigenvalue_su3v; +double * inv_eigenvls_su3v = NULL; + +int eigenvalues_for_cg_computed_su3v = 0; +int evlength_su3v; + +/* Compute eigenpairs of the operator Jacobi on time slice tslice with jdher_su3vect + * and write them to files. + * nr_of_eigenvalues : number of eigenvalues requested (not modified here) + * max_iterations : iteration limit handed to jdher_su3vect + * precision : requested precision, values below 1.e-14 are replaced by 1.e-14 + * maxmin : non-zero selects JD_MAXIMAL, zero JD_MINIMAL + * tslice, nstore : time slice and number used in the output file names + * Returns eigenvls_su3v[0], or 0 when compiled without HAVE_LAPACK. */ +double eigenvalues_Jacobi(int * nr_of_eigenvalues, const int max_iterations, + const double precision, const int maxmin,int tslice, + const int nstore) { + double returnvalue = 0.; + static int allocated = 0; + +#ifdef HAVE_LAPACK + + + int verbosity = 1, converged = 0, blocksize = 1 , blockwise=0; + int solver_it_max = 50, j_max, j_min; + double decay_min = 1.7, decay_max = 1.5, prec, threshold_min = 1.e-3, threshold_max = 5.e-2; + int v0dim = 0; + matrix_mult_su3vect f; + int N=SPACEVOLUME, N2=(SPACEVOLUME + SPACERAND); + su3_vector * max_eigenvector_ = NULL, *max_eigenvector; + + int returncode=0; + int returncode2=0; + su3_vector *s; + double sqnorm; + + char filename[200]; + char eigvl_filename[200]; + // int dims[]={T*g_nproc_t, LX*g_nproc_x, LY*g_nproc_y, LZ*g_nproc_z}; + int dims[]={1, LX*g_nproc_x, LY*g_nproc_y, LZ*g_nproc_z}; + FILE *efp; + /* the timers are used in MPI and non-MPI builds alike */ + double atime, etime; + +#ifdef 
MPI + MPI_File fp; + MPI_Offset siteSize=3*2*sizeof(double); + LemonRecordHeader *header; + LemonWriter *writer; +#else + FILE *fp; + int siteSize=3*2*sizeof(double); +#endif + + f = &Jacobi; + evlength_su3v = N2; + + if(g_proc_id == g_stdio_proc && g_debug_level >0) + { + printf("Number of %s eigenvalues to compute = %d\n", + maxmin ? "maximal" : "minimal",(*nr_of_eigenvalues)); + printf("Using Jacobi-Davidson method! \n"); + } + if((*nr_of_eigenvalues) < 8){ + j_max = 15; + j_min = 8; + } + else{ + j_max = 2*(*nr_of_eigenvalues); + j_min = (*nr_of_eigenvalues); + } + if(precision < 1.e-14){ + prec = 1.e-14; + } + else{ + prec = precision; + } + max_eigenvector_= calloc(N2, sizeof(su3_vector)); + max_eigenvector = max_eigenvector_; + + if(allocated == 0) + { + allocated = 1; + eigenvectors_su3v = calloc(N2*(*nr_of_eigenvalues), sizeof(su3_vector)); + eigenvls_su3v = (double*)malloc((*nr_of_eigenvalues)*sizeof(double)); + inv_eigenvls_su3v = (double*)malloc((*nr_of_eigenvalues)*sizeof(double)); + } + + solver_it_max = 64; + /* compute the maximal one first */ + /* DEBUG + jdher_su3vect(N*sizeof(su3_vector)/sizeof(_Complex double), N2*sizeof(su3_vector)/sizeof(_Complex double), + 50., 1.e-12, + 1, 15, 8, max_iterations, 1, 0, 0, NULL, + CG, solver_it_max, + threshold_max, decay_max, verbosity, + &converged, (_Complex double*) max_eigenvector, (double*) &max_eigenvalue_su3v, + &returncode2, JD_MAXIMAL, 1,tslice,f); + */ + + atime = gettime(); + + /* (re-) compute minimal eigenvalues */ + converged = 0; + solver_it_max = 256; + + if(maxmin) + jdher_su3vect(N*sizeof(su3_vector)/sizeof(_Complex double), N2*sizeof(su3_vector)/sizeof(_Complex double), + 50., prec, + (*nr_of_eigenvalues), j_max, j_min, + max_iterations, blocksize, blockwise, v0dim, (_Complex double*) eigenvectors_su3v, + CG, solver_it_max, + threshold_max, decay_max, verbosity, + &converged, (_Complex double*) eigenvectors_su3v, eigenvls_su3v, + &returncode, JD_MAXIMAL, 1,tslice, + 
f); + else + jdher_su3vect(N*sizeof(su3_vector)/sizeof(_Complex double), N2*sizeof(su3_vector)/sizeof(_Complex double), + 0., prec, + (*nr_of_eigenvalues), j_max, j_min, + max_iterations, blocksize, blockwise, v0dim, (_Complex double*) eigenvectors_su3v, + CG, solver_it_max, + threshold_min, decay_min, verbosity, + &converged, (_Complex double*) eigenvectors_su3v, eigenvls_su3v, + &returncode, JD_MINIMAL, 1,tslice, + f); + + etime = gettime(); + if(g_proc_id == 0) { + printf("Eigenvalues computed in %e sec. (gettime)\n", etime-atime); + } + + + /* Printout eigenvalues. */ + if(g_proc_id == 0) { + sprintf(eigvl_filename,"eigenvalues.%.3d.%.4d", tslice, nstore); + efp=fopen(eigvl_filename,"w"); + if(efp != NULL) { + for(v0dim = 0; v0dim < (*nr_of_eigenvalues); v0dim++) { + fprintf(efp,"%e\n",eigenvls_su3v[v0dim]); + } + fclose(efp); + } + else { + fprintf(stderr, "Could not open %s for writing the eigenvalues\n", eigvl_filename); + } + } + + /* Printout eigenvectors. */ + for(v0dim = 0; v0dim < (*nr_of_eigenvalues); v0dim++) { + sprintf(filename, "eigenvector.%.3d.%.3d.%.4d", v0dim, tslice, nstore); + s=(su3_vector*)&eigenvectors_su3v[v0dim*N2]; +#ifdef MPI +# ifdef HAVE_LIBLEMON + // MARK: should print 8*2*3*SPACEVOLUME data per file, but prints 8*2*4n*SPACEVOLUME (n=4-1 for ev 0-3) + + MPI_File_open(g_cart_grid, filename, MPI_MODE_WRONLY | MPI_MODE_CREATE, MPI_INFO_NULL, &fp); + writer = lemonCreateWriter(&fp, g_cart_grid); + header = lemonCreateHeader(1 /* MB */, 1 /* ME */, "lattice-su3_vector-data",SPACEVOLUME*3*sizeof(_Complex double)); + lemonWriteRecordHeader(header, writer); + lemonDestroyHeader(header); + lemonWriteLatticeParallel(writer, s, siteSize, dims); + lemonWriterCloseRecord(writer); + lemonDestroyWriter(writer); + MPI_File_close(&fp); +# else + if(g_proc_id == 0) { + printf("Cannot write eigenvectors: you need LEMON for writing eigenvectors with MPI\n"); + } +# endif +#else + fp=fopen(filename,"wb"); + if(fp != NULL) { + fwrite(s,siteSize,SPACEVOLUME,fp); + fclose(fp); + } + else { + fprintf(stderr, "Could not open %s for writing the eigenvector\n", filename); + } +#endif // MPI + sqnorm=square_norm_su3vect(s,SPACEVOLUME,1); + if(g_proc_id == 0) { + printf("wrote eigenvector 
| |^2 = %e \n",sqnorm); + } + } + + returnvalue=eigenvls_su3v[0]; + free(max_eigenvector_); +#else + fprintf(stderr, "lapack not available, so JD method for EV computation not available \n"); +#endif // LAPACK + return(returnvalue); +} + +#endif // WITHLAPH diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues_Jacobi.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues_Jacobi.h new file mode 100755 index 0000000000000000000000000000000000000000..b7ac8d9c192b9999e1872be9281ada24fa6d3a97 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues_Jacobi.h @@ -0,0 +1,33 @@ +/*********************************************************************** + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _EIGENVALUESJ_H +#define _EIGENVALUESJ_H + +#include "su3.h" + +extern su3_vector *eigenvectors_su3v; +extern double *eigenvls_su3v; +extern double * inv_eigenvls_su3v; +extern int eigenvalues_for_cg_computed_su3v; +extern int no_eigenvalues_su3v; +extern int evlength_su3v; + +double eigenvalues_Jacobi(int * nr_of_eigenvalues, const int max_iterations, + const double precision, const int maxmin, int tslice, const int nstore); + +#endif // _EIGENVALUESJ_H diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues_bi.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues_bi.c new file mode 100644 index 0000000000000000000000000000000000000000..68f0c15afca71bb1a6a7a650e173ba87659506a5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues_bi.c @@ -0,0 +1,152 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * Here we compute the nr_of_eigenvalues lowest eigenvalues + * of (gamma5*D)^2. Therefore we use the arnoldi routines. 
+ * + * The computed eigenvalues are stored in g_eigenvalues + * and the computed eigenvectors in g_ev + * + * inout: + * nr_of_eigenvalues: input: Number of eigenvalues to compute + * output: Number of computed eigenvalues + * input: + * crylov_space_dimension: Dimension of crylov space dimension + * to be used in the arnoldi routines + * + * Autor: Thomas Chiarappa + * Thomas.Chiarappa@mib.infn.it + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "operator/tm_operators.h" +#include "solver/solver.h" +#include "solver/jdher_bi.h" +#include "solver/matrix_mult_typedef_bi.h" +#include "eigenvalues_bi.h" +#include "operator/tm_operators_nd.h" + + +double eigenvalues_bi(int * nr_of_eigenvalues, + const int max_iterations, const double precision, + const int maxmin, matrix_mult_bi Qsq) { + + + static bispinor * eigenvectors_bi_ = NULL; + static int allocated = 0; + static bispinor *eigenvectors_bi = NULL; + static double * eigenvls_bi = NULL; + + /********************** + * For Jacobi-Davidson + **********************/ + int verbosity = g_debug_level, converged = 0, blocksize = 1, blockwise = 0; + int solver_it_max = 200, j_max, j_min; + double decay_min = 1.7, decay_max = 1.5, prec, + threshold_min = 1.e-3, threshold_max = 5.e-2, + startvalue, threshold, decay, returnvalue; + int v0dim = 0; + + /********************** + * General variables + **********************/ + int returncode=0; + + if(maxmin == JD_MINIMAL) { + startvalue = 0.; + threshold = threshold_min; + decay = decay_min; + solver_it_max = 200; + } + else { + startvalue = 50.; + threshold = threshold_max; + decay = decay_max; + solver_it_max = 50; + } + + if(g_proc_id == g_stdio_proc) { + printf("Number of %s eigenvalues to compute = %d\n", + maxmin ? 
"maximal" : "minimal",(*nr_of_eigenvalues)); + printf("Using Jacobi-Davidson method! \n"); + } + + if((*nr_of_eigenvalues) < 8){ + j_max = 15; + j_min = 8; + } + else{ + j_max = 2*(*nr_of_eigenvalues); + j_min = *nr_of_eigenvalues; + } + if(precision < 1.e-14){ + prec = 1.e-14; + } + else{ + prec = precision; + } + + if(allocated == 0) { + allocated = 1; +#if (defined SSE || defined SSE2 || defined SSE3) + eigenvectors_bi_ = calloc((VOLUME)/2*(*nr_of_eigenvalues)+1, sizeof(bispinor)); + eigenvectors_bi = (bispinor *)(((unsigned long int)(eigenvectors_bi_)+ALIGN_BASE)&~ALIGN_BASE); +#else + eigenvectors_bi_= calloc((VOLUME)/2*(*nr_of_eigenvalues), sizeof(bispinor)); + eigenvectors_bi = eigenvectors_bi_; +#endif + eigenvls_bi = (double*)malloc((*nr_of_eigenvalues)*sizeof(double)); + } + + /* compute eigenvalues */ + + if((g_proc_id==0) && (g_debug_level > 4)) { + printf(" Values of mu = %e mubar = %e eps = %e precision = %e \n \n", g_mu, g_mubar, g_epsbar, precision); + } + + /* here n and lda are equal, because Q_Qdagger_ND_BI does an internal */ + /* conversion to non _bi fields which are subject to xchange_fields */ + /* so _bi fields do not need boundary */ + jdher_bi((VOLUME)/2*sizeof(bispinor)/sizeof(_Complex double), (VOLUME)/2*sizeof(bispinor)/sizeof(_Complex double), + startvalue, prec, + (*nr_of_eigenvalues), j_max, j_min, + max_iterations, blocksize, blockwise, v0dim, (_Complex double*) eigenvectors_bi, + BICGSTAB, solver_it_max, + threshold, decay, verbosity, + &converged, (_Complex double*) eigenvectors_bi, eigenvls_bi, + &returncode, maxmin, 1, + Qsq); + + *nr_of_eigenvalues = converged; + + returnvalue = eigenvls_bi[0]; + return(returnvalue); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues_bi.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues_bi.h new file mode 100644 index 0000000000000000000000000000000000000000..1245063a500b15ccacfd1ad99249ed3c473dd54e --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/eigenvalues_bi.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _EIGENVALUES_BI_H +#define _EIGENVALUES_BI_H + +#include "matrix_mult_typedef_bi.h" + +double eigenvalues_bi(int * nev, const int max_iterations, + const double prec, const int maxmin, + matrix_mult_bi Qsq); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/fgmres.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/fgmres.c new file mode 100644 index 0000000000000000000000000000000000000000..283ff0f806534d9eef057bc69b90233b4c974f88 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/fgmres.c @@ -0,0 +1,273 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * Generalized minimal residual (FGMRES) with a maximal number of restarts. + * Solves Q=AP for _Complex double regular matrices A. Flexibel version of GMRES + * with the ability for variable right preconditioning. + * + * Inout: + * spinor * P : guess for the solving spinor + * Input: + * spinor * Q : source spinor + * int m : Maximal dimension of Krylov subspace + * int max_restarts : maximal number of restarts + * double eps : stopping criterium + * matrix_mult f : pointer to a function containing the matrix mult + * for type matrix_mult see matrix_mult_typedef.h + * + * Autor: Carsten Urbach + ********************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include"global.h" +#include"su3.h" +#include"linalg_eo.h" +#include"gmres_precon.h" +#include"operator/tm_operators.h" +#include"sub_low_ev.h" +#include"poly_precon.h" +#include "Msap.h" +#include"gamma.h" +#include "start.h" +#include "solver_field.h" +#include"fgmres.h" + +static void init_gmres(const int _M, const int _V); + +static _Complex double ** H; +static _Complex double * alpha; +static _Complex double * c; +static double * s; +static spinor ** V; +static spinor * _v; +static spinor ** Z; +static spinor * _z; +static _Complex double * _h; +static _Complex double * alpha; +static _Complex double * c; +static double * s; +extern int dfl_poly_iter; + +int fgmres(spinor * const P,spinor * const Q, + const int m, const int max_restarts, + const double eps_sq, const int rel_prec, + const int N, const int precon, matrix_mult f){ + + int restart, i, j, k; + 
double beta, eps, norm; + _Complex double tmp1, tmp2; + spinor * r0; + spinor ** solver_field = NULL; + const int nr_sf = 3; + + if(N == VOLUME) { + init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);/* #ifdef HAVE_LAPACK */ +/* _FT(zhetrf)("U", &n, G, &N, ipiv, work, &lwork, &info, 1); */ +/* #endif */ +/* if(info != 0) { */ +/* printf("Error in zhetrf info = %d\n", info); */ +/* } */ +/* else { */ +/* #ifdef HAVE_LAPACK */ +/* _FT(zhetrs)("U", &n, &ONE, G, &N, ipiv, bn, &N, &info, 1); */ +/* #endif */ +/* if(info != 0) { */ +/* printf("Error in zhetrs info = %d\n", info); */ +/* } */ +/* } */ + /* solution again stored in bn */ + + } + else { + init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); + } + eps=sqrt(eps_sq); + init_gmres(m, VOLUMEPLUSRAND); + r0 = solver_field[0]; + + norm = sqrt(square_norm(Q, N, 1)); + + assign(solver_field[2], P, N); + for(restart = 0; restart < max_restarts; restart++){ + /* r_0=Q-AP (b=Q, x+0=P) */ + f(r0, solver_field[2]); + diff(r0, Q, r0, N); + + /* v_0=r_0/||r_0|| */ + alpha[0] = sqrt(square_norm(r0, N, 1)); + + if(g_proc_id == g_stdio_proc && g_debug_level > 0){ + printf("FGMRES %d\t%g true residue\n", restart*m, creal(alpha[0])*creal(alpha[0])); + fflush(stdout); + } + + if(creal(alpha[0])==0.){ + assign(P, solver_field[2], N); + finalize_solver(solver_field, nr_sf); + return(restart*m); + } + + mul_r(V[0], 1./creal(alpha[0]), r0, N); + + for(j = 0; j < m; j++){ + /* solver_field[0]=A*M^-1*v_j */ + + if(precon == 0) { + assign(Z[j], V[j], N); + } + else { + zero_spinor_field(Z[j], N); + /* poly_nonherm_precon(Z[j], V[j], 0.3, 1.1, 80, N); */ + Msap(Z[j], V[j], 8); + } + f(r0, Z[j]); + /* Set h_ij and omega_j */ + /* solver_field[1] <- omega_j */ + assign(solver_field[1], solver_field[0], N); + for(i = 0; i <= j; i++){ + H[i][j] = scalar_prod(V[i], solver_field[1], N, 1); + assign_diff_mul(solver_field[1], V[i], H[i][j], N); + } + + H[j+1][j] = sqrt(square_norm(solver_field[1], N, 1)); + for(i = 0; i < j; i++){ + 
tmp1 = H[i][j]; + tmp2 = H[i+1][j]; + (H[i][j]) = (tmp2) * (s[i]); + (H[i][j]) += conj(c[i]) * (tmp1); + (H[i+1][j]) = (tmp1) * (s[i]); + (H[i+1][j]) -= (c[i]) * (tmp2); + } + + /* Set beta, s, c, alpha[j],[j+1] */ + beta = sqrt(creal(H[j][j] * conj(H[j][j])) + creal(H[j+1][j] * conj(H[j+1][j]))); + s[j] = creal(H[j+1][j]) / beta; + (c[j]) = (H[j][j]) / beta; + (H[j][j]) = beta; + (alpha[j+1]) = (alpha[j]) * (s[j]); + tmp1 = alpha[j]; + (alpha[j]) = conj(c[j]) * (tmp1); + + /* precision reached? */ + if(g_proc_id == g_stdio_proc && g_debug_level > 0){ + printf("FGMRES\t%d\t%g iterated residue\n", restart*m+j, creal(alpha[j+1])*creal(alpha[j+1])); + fflush(stdout); + } + if(((creal(alpha[j+1]) <= eps) && (rel_prec == 0)) || ((creal(alpha[j+1]) <= eps*norm) && (rel_prec == 1))){ + (alpha[j]) = (alpha[j]) * (1./creal(H[j][j])); + assign_add_mul(solver_field[2], Z[j], alpha[j], N); + for(i = j-1; i >= 0; i--){ + for(k = i+1; k <= j; k++){ + (tmp1) = (H[i][k]) * (alpha[k]); + (alpha[i]) -= tmp1; + } + (alpha[i]) = (alpha[i]) * (1./creal(H[i][i])); + assign_add_mul(solver_field[2], Z[i], alpha[i], N); + } + for(i = 0; i < m; i++){ + alpha[i] = creal(alpha[i]); + } + assign(P, solver_field[2], N); + finalize_solver(solver_field, nr_sf); + return(restart*m+j); + } + /* if not */ + else{ + if(j != m-1){ + mul_r(V[(j+1)], 1./creal(H[j+1][j]), solver_field[1], N); + } + } + + } + j=m-1; + /* prepare for restart */ + (alpha[j]) = (alpha[j]) * (1./creal(H[j][j])); + assign_add_mul(solver_field[2], Z[j], alpha[j], N); + for(i = j-1; i >= 0; i--){ + for(k = i+1; k <= j; k++){ + (tmp1) = (H[i][k]) * (alpha[k]); + (alpha[i]) -= tmp1; + } + (alpha[i]) = (alpha[i]) * (1./creal(H[i][i])); + assign_add_mul(solver_field[2], Z[i], alpha[i], N); + } + for(i = 0; i < m; i++){ + alpha[i] = creal(alpha[i]); + } + } + + /* If maximal number of restarts is reached */ + assign(P, solver_field[2], N); + finalize_solver(solver_field, nr_sf); + return(-1); +} + +static void init_gmres(const int 
_M, const int _V){ + static int Vo = -1; + static int M = -1; + static int init = 0; + int i; + if((M != _M)||(init == 0)||(Vo != _V)){ + if(init == 1){ + free(H); + free(V); + free(_h); + free(_v); + free(alpha); + free(c); + free(s); + } + Vo = _V; + M = _M; + H = calloc(M+1, sizeof(_Complex double *)); + V = calloc(M, sizeof(spinor *)); + Z = calloc(M, sizeof(spinor *)); +#if (defined SSE || defined SSE2) + _h = calloc((M+2)*M, sizeof(_Complex double)); + H[0] = (_Complex double *)(((unsigned long int)(_h)+ALIGN_BASE)&~ALIGN_BASE); + _v = calloc(M*Vo+1, sizeof(spinor)); + V[0] = (spinor *)(((unsigned long int)(_v)+ALIGN_BASE)&~ALIGN_BASE); + _z = calloc(M*Vo+1, sizeof(spinor)); + Z[0] = (spinor *)(((unsigned long int)(_z)+ALIGN_BASE)&~ALIGN_BASE); +#else + _h = calloc((M+1)*M, sizeof(_Complex double)); + H[0] = _h; + _v = calloc(M*Vo, sizeof(spinor)); + V[0] = _v; + _z = calloc(M*Vo, sizeof(spinor)); + Z[0] = _z; +#endif + s = calloc(M, sizeof(double)); + c = calloc(M, sizeof(_Complex double)); + alpha = calloc(M+1, sizeof(_Complex double)); + for(i = 1; i < M; i++){ + V[i] = V[i-1] + Vo; + H[i] = H[i-1] + M; + Z[i] = Z[i-1] + Vo; + } + H[M] = H[M-1] + M; + init = 1; + } + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/fgmres.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/fgmres.h new file mode 100644 index 0000000000000000000000000000000000000000..ec41e42466ac901455b3ee2d2c5ce3737b2113c9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/fgmres.h @@ -0,0 +1,61 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +/******************************************************************************* + * Generalized minimal residual (GMRES) with a maximal number of restarts. + * Solves Q=AP for _Complex double regular matrices A. + * For details see: Andreas Meister, Numerik linearer Gleichungssysteme + * or the original citation: + * Y. Saad, M.H.Schultz in GMRES: A generalized minimal residual algorithm + * for solving nonsymmetric linear systems. + * SIAM J. Sci. Stat. Comput., 7: 856-869, 1986 + * + * int gmres(spinor * const P,spinor * const Q, + * const int m, const int max_restarts, + * const double eps_sq, matrix_mult f) + * + * Returns the number of iterations needed or -1 if maximal number of restarts + * has been reached. 
+ * + * Inout: + * spinor * P : guess for the solving spinor + * Input: + * spinor * Q : source spinor + * int m : Maximal dimension of Krylov subspace + * int max_restarts : maximal number of restarts + * double eps : stopping criterium + * matrix_mult f : pointer to a function containing the matrix mult + * for type matrix_mult see matrix_mult_typedef.h + * + * Autor: Carsten Urbach + ********************************************************************************/ + +#ifndef _FGMRES_H +#define _FGMRES_H + +#include"solver/matrix_mult_typedef.h" +#include"su3.h" + +int fgmres(spinor * const P,spinor * const Q, + const int m, const int max_restarts, + const double eps, const int rel_prec, + const int N, const int precon, matrix_mult f); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gcr.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gcr.c new file mode 100644 index 0000000000000000000000000000000000000000..21156bfeda26a5cda42e74eec7f182dab949a4c2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gcr.c @@ -0,0 +1,198 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include"global.h" +#include"su3.h" +#include"linalg_eo.h" +#include"solver/gmres_precon.h" +#include"start.h" +#include"operator/tm_operators.h" +#include"solver/poly_precon.h" +#include"solver/cg_her.h" +#include"operator/D_psi.h" +#include"Msap.h" +#include"dfl_projector.h" +#include "solver_field.h" +#include"gcr.h" + +static void init_gcr(const int _M, const int _V); + +static _Complex double ** a; +static _Complex double * _a; +static double * b; +static _Complex double * c; +static spinor ** chi; +static spinor * _chi; +static spinor ** xi; +static spinor * _xi; +static _Complex double * alpha; +extern int dfl_poly_iter; + +int gcr(spinor * const P, spinor * const Q, + const int m, const int max_restarts, + const double eps_sq, const int rel_prec, + const int N, const int precon, matrix_mult f) { + + int k, l, restart, i, iter = 0; + double norm_sq, err; + spinor * rho, * tmp; + _Complex double ctmp; + spinor ** solver_field = NULL; + const int nr_sf = 2; + + if(N == VOLUME) { + init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); + } + else { + init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); + } + + rho = solver_field[0]; + tmp = solver_field[1]; + + init_gcr(m, N+RAND); + + norm_sq = square_norm(Q, N, 1); + if(norm_sq < 1.e-32) { + norm_sq = 1.; + } + + for(restart = 0; restart < max_restarts; restart++) { + dfl_sloppy_prec = 0; + f(tmp, P); + diff(rho, Q, tmp, N); + err = square_norm(rho, N, 1); + if(g_proc_id == g_stdio_proc && g_debug_level > 1){ + printf("GCR: iteration number: %d, true residue: %g\n", iter, err); + fflush(stdout); + } + if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1))) { + finalize_solver(solver_field, nr_sf); + return(iter); + } + for(k = 0; k < m; k++) { + + if(precon == 0) { + assign(xi[k], rho, N); + } + else { + 
zero_spinor_field(xi[k], N); + Msap_eo(xi[k], rho, 6); + /* Msap(xi[k], rho, 8); */ + } + + dfl_sloppy_prec = 1; + dfl_little_D_prec = 1.e-12; + f(tmp, xi[k]); + + /* tmp will become chi[k] */ + for(l = 0; l < k; l++) { + a[l][k] = scalar_prod(chi[l], tmp, N, 1); + assign_diff_mul(tmp, chi[l], a[l][k], N); + } + b[k] = sqrt(square_norm(tmp, N, 1)); + mul_r(chi[k], 1./b[k], tmp, N); + c[k] = scalar_prod(chi[k], rho, N, 1); + assign_diff_mul(rho, chi[k], c[k], N); + err = square_norm(rho, N, 1); + iter ++; + if(g_proc_id == g_stdio_proc && g_debug_level > 2){ + if(rel_prec == 1) printf("# GCR: %d\t%g >= %g iterated residue\n", iter, err, eps_sq*norm_sq); + else printf("# GCR: %d\t%g >= %giterated residue\n", iter, err, eps_sq); + fflush(stdout); + } + /* Precision reached? */ + if((k == m-1) || ((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1))) { + break; + } + } + + /* prepare for restart */ + c[k] /= b[k]; + assign_add_mul(P, xi[k], c[k], N); + for(l = k - 1; l >= 0; --l) + { + for(i = l+1; i <= k; ++i) + { + ctmp = a[l][i] * c[i]; + c[l] -= ctmp; + } + c[l] /= b[l]; + assign_add_mul(P, xi[l], c[l], N); + } + } + finalize_solver(solver_field, nr_sf); + return(-1); +} + +static void init_gcr(const int _M, const int _V){ + static int Vo = -1; + static int M = -1; + static int init = 0; + int i; + if((M != _M)||(init == 0)||(Vo != _V)){ + if(init == 1){ + free(a); + free(chi); + free(_a); + free(_chi); + free(alpha); + free(c); + free(_xi); + free(xi); + } + Vo = _V; + M = _M; + a = calloc(M+1, sizeof(_Complex double *)); + chi = calloc(M, sizeof(spinor *)); + xi = calloc(M, sizeof(spinor *)); +#if (defined SSE || defined SSE2 || defined SSE3) + _a = calloc((M+2)*M, sizeof(_Complex double)); + a[0] = (_Complex double *)(((unsigned long int)(_a)+ALIGN_BASE)&~ALIGN_BASE); + _chi = calloc(M*Vo+1, sizeof(spinor)); + chi[0] = (spinor *)(((unsigned long int)(_chi)+ALIGN_BASE)&~ALIGN_BASE); + _xi = calloc(M*Vo+1, sizeof(spinor)); + xi[0] = 
(spinor *)(((unsigned long int)(_xi)+ALIGN_BASE)&~ALIGN_BASE); +#else + _a = calloc((M+1)*M, sizeof(_Complex double)); + a[0] = _a; + _chi = calloc(M*Vo, sizeof(spinor)); + chi[0] = _chi; + _xi = calloc(M*Vo, sizeof(spinor)); + xi[0] = _xi; +#endif + if(_xi == NULL) {printf("Unable to allocated space for GCR iterations\n");exit(0); } + b = calloc(M, sizeof(double)); + c = calloc(M, sizeof(_Complex double)); + alpha = calloc(M+1, sizeof(_Complex double)); + for(i = 1; i < M; i++){ + chi[i] = chi[i-1] + Vo; + xi[i] = xi[i-1] + Vo; + a[i] = a[i-1] + M; + } + a[M] = a[M-1] + M; + init = 1; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gcr.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gcr.h new file mode 100644 index 0000000000000000000000000000000000000000..e4efb1aaec28d542ad71e81aa9ce77fdd0cf64a1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gcr.h @@ -0,0 +1,31 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _GCR_H +#define _GCR_H + +#include"solver/matrix_mult_typedef.h" +#include"su3.h" + +int gcr(spinor * const P, spinor * const Q, + const int m, const int max_restarts, + const double eps_sq, const int rel_prec, + const int N, const int precon, matrix_mult f); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gcr4complex.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gcr4complex.c new file mode 100644 index 0000000000000000000000000000000000000000..f887090444a4ec85fd8a1c48003777e7fbe30031 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gcr4complex.c @@ -0,0 +1,255 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * 2010 claude Tadonki + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include"global.h" +#include"su3.h" +#include"linalg_eo.h" +#include"gcr4complex.h" + +static void init_lgcr(const int _M, const int _V); +static void free_lgcr(); +static _Complex double ** a = NULL; +static _Complex double * _a = NULL; +static double * b = NULL; +static _Complex double * c = NULL; +static _Complex double ** chi = NULL; +static _Complex double * _chi = NULL; +static _Complex double ** xi = NULL; +static _Complex double * _xi = NULL; +static _Complex double * alpha = NULL; +static _Complex double * tmp = NULL; +static _Complex double * rho = NULL; +static int lgcr_init = 0; + +int gcr4complex(_Complex double * const P, _Complex double * const Q, + const int m, const int max_restarts, + const double eps_sq, const int rel_prec, + const int N, const int parallel, + const int lda, c_matrix_mult f) { + + int k, l, restart, i, p=0; + double norm_sq, err; + _Complex double ctmp; + + init_lgcr(m, lda); + + norm_sq = lsquare_norm(Q, N, parallel); + if(norm_sq < 1.e-20) { + norm_sq = 1.; + } + for(restart = 0; restart < max_restarts; restart++) { + f(tmp, P); + ldiff(rho, Q, tmp, N); + err = lsquare_norm(rho, N, parallel); + if(g_proc_id == g_stdio_proc && g_debug_level > 1){/*CT: was "g_debug_level > 0" */ + printf("lGCR: %d\t%g true residue %1.3e\n", restart * m, err, norm_sq); + fflush(stdout); + } + if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq * norm_sq) && (rel_prec == 1))) { + if(g_proc_id == 0 && g_debug_level > 1) printf("lgcr: %d %e %e %e %e\n", p, err, norm_sq, err/norm_sq, eps_sq); + return (p); + } + for(k = 0; ; k++) { + memcpy(xi[k], rho, N*sizeof(_Complex double)); + /* here we could put in a preconditioner */ + f(tmp, xi[k]); + /* tmp will become chi[k] */ + for(l = 0; l < k; l++) { + a[l][k] = lscalar_prod(chi[l], tmp, N, parallel); + lassign_diff_mul(tmp, chi[l], 
a[l][k], N); + } + b[k] = sqrt(lsquare_norm(tmp, N, parallel)); + lmul_r(chi[k], 1./b[k], tmp, N); + c[k] = lscalar_prod(chi[k], rho, N, parallel); + lassign_diff_mul(rho, chi[k], c[k], N); + err = lsquare_norm(rho, N, parallel); + if(g_proc_id == g_stdio_proc && g_debug_level > 1){ + printf("lGCR: %d\t%g iterated residue\n", restart*m+k, err); + fflush(stdout); + } + p++; + /* Precision reached? */ + if((k == m-1) || ((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*norm_sq) && (rel_prec == 1))) { + break; + } + } + /* prepare for restart */ + c[k] /= b[k]; + lassign_add_mul(P, xi[k], c[k], N); + for(l = k-1; l >= 0; --l) + { + for(i = l+1; i <= k; ++i) + { + ctmp = a[l][i] * c[i]; + c[l] -= ctmp; + } + c[l] /= b[l]; + lassign_add_mul(P, xi[l], c[l], N); + } + } + if(g_proc_id == 0 && g_debug_level > 1) printf("lgcr: for -1 %d %e %e %e %e\n", p, err, norm_sq, err/norm_sq, eps_sq); + return(-1); +} + +static void init_lgcr(const int _M, const int _V){ + static int Vo = -1; + static int M = -1; + + int i; + if((M != _M)||(lgcr_init == 0)||(Vo != _V)){ + if(lgcr_init == 1) free_lgcr(); + Vo = _V; + M = _M; + a = calloc(M+1, sizeof(_Complex double *)); + chi = calloc(M, sizeof(_Complex double *)); + xi = calloc(M, sizeof(_Complex double *)); + tmp = calloc(Vo, sizeof(_Complex double)); + rho = calloc(Vo, sizeof(_Complex double)); + _a = calloc((M+1)*M, sizeof(_Complex double)); + a[0] = _a; + _chi = calloc(M*Vo, sizeof(_Complex double)); + chi[0] = _chi; + _xi = calloc(M*Vo, sizeof(_Complex double)); + xi[0] = _xi; + + b = calloc(M, sizeof(double)); + c = calloc(M, sizeof(_Complex double)); + alpha = calloc(M+1, sizeof(_Complex double)); + for(i = 1; i < M; i++) { + chi[i] = chi[i-1] + Vo; + xi[i] = xi[i-1] + Vo; + a[i] = a[i-1] + M; + } + a[M] = a[M-1] + M; + lgcr_init = 1; + } +} + +static void free_lgcr() +{ + lgcr_init = 0; + free(a); + free(chi); + free(_a); + free(_chi); + free(alpha); + free(c); + free(_xi); + free(xi); + free(rho); + free(tmp); + return; 
+} + + +void ldiff(_Complex double * const Q, _Complex double * const R, _Complex double * const S, const int N) +{ + for(int i = 0; i < N; ++i) + Q[i] = R[i] - S[i]; + return; +} + +void ldiff_assign(_Complex double * const Q, _Complex double * const S, const int N) +{ + for(int i = 0; i < N; ++i) + Q[i] -= S[i]; + return; +} + +void ladd(_Complex double * const Q, _Complex double * const R, _Complex double * const S, const int N) +{ + for(int i = 0; i < N; ++i) + Q[i] = R[i] + S[i]; + return; +} + +void ladd_assign(_Complex double * const Q, _Complex double * const S, const int N) +{ + for(int i = 0; i < N; ++i) + Q[i] += S[i]; + return; +} + +double lsquare_norm(_Complex double * const Q, const int N, const int parallel) +{ + double nrm = 0.0; + + for(int i = 0; i < N; ++i) + + nrm += conj(Q[i]) * Q[i]; +#ifdef MPI + if(parallel) + { + double nrm2 = nrm; + MPI_Allreduce(&nrm2, &nrm, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + } +#endif + + return(nrm); +} + +_Complex double lscalar_prod(_Complex double * const R, _Complex double * const S, const int N, const int parallel) +{ + _Complex double res = 0.0; + + for(int i = 0; i < N; ++i) + res += conj(R[i]) * S[i]; + +#ifdef MPI + if(parallel) + { + _Complex double res2 = res; + MPI_Allreduce(&res2, &res, 1, MPI_DOUBLE_COMPLEX, MPI_SUM, MPI_COMM_WORLD); + } +#endif + + return(res); +} + +void lmul_r(_Complex double * const R, const double c, _Complex double * const S, const int N) +{ + for(int i = 0; i < N; ++i) + R[i] = c * S[i]; +} + +void lmul(_Complex double * const R, const _Complex double c, _Complex double * const S, const int N) +{ + for(int i = 0; i < N; ++i) + R[i] = c * S[i]; +} + +void lassign_add_mul(_Complex double * const R, _Complex double * const S, const _Complex double c, const int N) +{ + for(int i = 0; i < N; ++i) + R[i] += c * S[i]; +} + +void lassign_diff_mul(_Complex double * const R, _Complex double * const S, const _Complex double c, const int N) +{ + for(int i = 0; i < N; i++) + R[i] -= c * 
S[i]; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gcr4complex.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gcr4complex.h new file mode 100644 index 0000000000000000000000000000000000000000..4fa1823b7d7a48c3de4e299f4b064413f2a72a5f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gcr4complex.h @@ -0,0 +1,47 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
#ifndef _GCR4COMPLEX_H
#define _GCR4COMPLEX_H

#include"solver/matrix_mult_typedef.h"
#include"su3.h"

/*
 * Element-wise helpers on arrays of N _Complex double values.
 * In every routine the first pointer argument is the destination.
 */

/* Q[i] = R[i] - S[i] */
void ldiff(_Complex double * Q, _Complex double * const R, _Complex double * const S, const int N);
/* Q[i] = R[i] + S[i] */
void ladd(_Complex double * Q, _Complex double * const R, _Complex double * const S, const int N);
/* returns sum_i |Q[i]|^2; summed over MPI_COMM_WORLD when built with MPI and parallel != 0 */
double lsquare_norm(_Complex double * const Q, const int N, const int parallel);
/* returns sum_i conj(R[i]) * S[i]; summed over MPI_COMM_WORLD when built with MPI and parallel != 0 */
_Complex double lscalar_prod(_Complex double * const R, _Complex double * const S, const int N, const int parallel);
/* R[i] = c * S[i], real factor */
void lmul_r(_Complex double * const R, const double c, _Complex double * const S, const int N);
/* R[i] = c * S[i], complex factor */
void lmul(_Complex double * const R, const _Complex double c, _Complex double * const S, const int N);
/* R[i] -= c * S[i] */
void lassign_diff_mul(_Complex double * const R, _Complex double * const S, const _Complex double c, const int N);
/* R[i] += c * S[i] */
void lassign_add_mul(_Complex double * const R, _Complex double * const S, const _Complex double c, const int N);
/* Q[i] -= S[i] */
void ldiff_assign(_Complex double * const Q, _Complex double * const S, 
		  const int N);
/* Q[i] += S[i] */
void ladd_assign(_Complex double * const Q, _Complex double * const S, 
		  const int N);


/*
 * NOTE(review): the definition lives in gcr4complex.c and is not described
 * here in detail.  Judging by the name and the parameter list (m,
 * max_restarts, eps_sq, rel_prec, f) this is presumably a restarted GCR
 * solver on complex vectors mirroring the gmres() interface -- confirm
 * against the implementation before relying on this.
 */
int gcr4complex(_Complex double * const P, _Complex double * const Q, 
		const int m, const int max_restarts,
		const double eps_sq, const int rel_prec,
		const int N, const int parallel, 
		const int lda, c_matrix_mult f);


#endif
file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + + This file was modified according to a flexible number of blocks + by Claude Tadonki - PetaQCD - April 2010 ( claude.tadonki@lal.in2p3.fr ) + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +#include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include +#include "start.h" +#include "ranlxs.h" +#include "operator/D_psi.h" +#include "poly_precon.h" +#include "Msap.h" +#include "gmres_precon.h" +#include "linalg_eo.h" +#include "gram-schmidt.h" +#include "lu_solve.h" +#include "block.h" +#include "little_D.h" +#include "gcr4complex.h" +#include "boundary.h" +#include +#include +#include +#include +#include "solver_field.h" +#include "generate_dfl_subspace.h" + +int init_little_dfl_subspace(const int N_s); + +spinor ** dfl_fields = NULL; +static spinor * _dfl_fields = NULL; +_Complex double ** little_dfl_fields = NULL; +static _Complex double *_little_dfl_fields = NULL; +_Complex double ** little_dfl_fields_eo = NULL; +static _Complex double *_little_dfl_fields_eo = NULL; +static int init_subspace = 0; +static int init_little_subspace = 0; + +static void random_fields(const int Ns) { + + int i, j, ix; + float r,s[24]; + double *t; + + r=(float)(1.0/sqrt(24.0*(double)(VOLUME))); + + for (i = 0; i < Ns; i++) { + t=(double*)(dfl_fields[i]); + for (ix = 
0; ix < VOLUME; ix++){ + ranlxs(s,24); + for (j = 0; j < 24; j++) { + (*t) = (double)(r*(s[j]-0.5f)); + (*t) = 1.; + t += 1; + } + } + } + return; +} + +int generate_dfl_subspace(const int Ns, const int N, const int repro) { + int ix, i_o,i, j, k, p, blk, vpr = VOLUMEPLUSRAND*sizeof(spinor)/sizeof(_Complex double), + vol = VOLUME*sizeof(spinor)/sizeof(_Complex double); + spinor **psi; + double nrm, e = 0.3, d = 1.1, atime, etime; + _Complex double s; + _Complex double * work; + WRITER *writer = NULL; + FILE *fp_dfl_fields; + char file_name[500]; // CT + double musave = g_mu; + spinor ** work_fields = NULL; + const int nr_wf = 2; + +#ifdef MPI + atime = MPI_Wtime(); +#else + atime = (double)clock()/(double)(CLOCKS_PER_SEC); +#endif + init_solver_field(&work_fields, VOLUMEPLUSRAND, nr_wf); + work = (_Complex double*)malloc(nb_blocks*9*Ns*sizeof(_Complex double)); + psi = (spinor **)calloc(nb_blocks, sizeof(spinor *)); + psi[0] = calloc(VOLUME + nb_blocks, sizeof(spinor)); + for(i = 1; i < nb_blocks; i++) psi[i] = psi[i-1] + (VOLUME / nb_blocks) + 1; + + if(init_subspace == 0) i = init_dfl_subspace(Ns); + + if(init_little_subspace == 0) i = init_little_dfl_subspace(Ns); + + random_fields(Ns); + if(g_debug_level > 4) { + for(e = 0.; e < 1.; e=e+0.05) { + random_spinor_field_lexic(dfl_fields[0], repro, RN_GAUSS); + nrm = sqrt(square_norm(dfl_fields[0], N, 1)); + mul_r(dfl_fields[0], 1./nrm, dfl_fields[0], N); + d = 1.1; + /* gmres_precon(work_fields[0], dfl_fields[0], 20, 1, 1.e-20, 0, N, &D_psi); */ + poly_nonherm_precon(work_fields[0], dfl_fields[0], e, d, 30, N); + D_psi(work_fields[1], work_fields[0]); + diff(work_fields[0], work_fields[1], dfl_fields[0], N); + nrm = square_norm(work_fields[0], N, 1); + if(g_proc_id == 0) { + printf(" e= %f d= %f nrm = %1.5e\n", e, d, nrm); + } + } + d = 1.1; + e=0.3; + } + + boundary(g_kappa); + g_mu = 0.; + /* + CT: We try to read dfl_fields[i] from file if it exists, + otherwise we recalculate it + */ + /* CU: reading and writing 
should be done with lemon! */ + for(p = 0; p < Ns; p++) { + sprintf(file_name,"dfl_fields.%.2d", p); + if((fp_dfl_fields = fopen(file_name, "r")) == NULL) { + break; + } + else { + fclose(fp_dfl_fields); + if((i = read_spinor(dfl_fields[p], NULL, file_name, 0)) != 0) { + if(g_proc_id == 0) { + fprintf(stderr, "Could not read from file %s err = %d\n", file_name, i); + } + break; + } + } + } + + if((g_proc_id == 0) && (p < Ns) && (g_debug_level > 0)) printf("Compute remaining fields from scratch\n"); + /*CT: We do Ns x 80 x 20 evaluation of Dpsi */ + /* ModifiedGS((_Complex double*)dfl_fields[i], vol, i, (_Complex double*)dfl_fields[0], vpr); */ + /* nrm = sqrt(square_norm(dfl_fields[i], N, 1)); */ + /* mul_r(dfl_fields[i], 1./nrm, dfl_fields[i], N); */ + if(p < Ns) { + if(1) { + for(i = 0; i < Ns; i++) { + /* ModifiedGS((_Complex double*)dfl_fields[i], vol, i, (_Complex double*)dfl_fields[0], vpr); + nrm = sqrt(square_norm(dfl_fields[i], N, 1)); + mul_r(dfl_fields[i], 1./nrm, dfl_fields[i], N); + */ + for(j = 0; j < 20; j++) { + zero_spinor_field(g_spinor_field[0],VOLUME); + g_sloppy_precision = 1; + Msap_eo(g_spinor_field[0], dfl_fields[i], j+1); + /* poly_nonherm_precon(g_spinor_field[0], dfl_fields[i], e, d, 2, N);*/ + /* gmres_precon(work_fields[0], dfl_fields[i], 20, 1, 1.e-20, 0, N, &D_psi); */ + + for (ix=0;ix -1) { + D_psi(work_fields[0], dfl_fields[i]); + nrm = sqrt(square_norm(work_fields[0], N, 1)); + if(g_proc_id == 0) { + printf(" ||D psi_%d||/||psi_%d|| = %1.5e\n", i, i, nrm*nrm); + } + } + } + } + + if(0) { + for(j = 0; j < 4; j++) {/*dfl_field_iter = 80 by default */ + for(i = p; i < Ns; i++) { + ModifiedGS((_Complex double*)dfl_fields[i], vol, i, (_Complex double*)dfl_fields[0], vpr); + nrm = sqrt(square_norm(dfl_fields[i], N, 1)); + mul_r(dfl_fields[i], 1./nrm, dfl_fields[i], N); + for(k = 0; k < 3; k++) { + g_sloppy_precision = 1; + /* dfl_poly_iter = 20 by default */ + zero_spinor_field(g_spinor_field[0],VOLUME); + Msap_eo(g_spinor_field[0], 
dfl_fields[i], 4); + /* poly_nonherm_precon(g_spinor_field[0], dfl_fields[i], e, d, 4, N); */ + g_sloppy_precision = 0; + ModifiedGS((_Complex double*)g_spinor_field[0], vol, i, (_Complex double*)dfl_fields[0], vpr); + nrm = sqrt(square_norm(g_spinor_field[0], N, 1)); + mul_r(dfl_fields[i], 1./nrm, g_spinor_field[0], N); + } + + /* test quality */ + if(g_debug_level > -1) { + D_psi(work_fields[0], dfl_fields[i]); + nrm = sqrt(square_norm(work_fields[0], N, 1)); + if(g_proc_id == 0) { + printf(" ||D psi_%d||/||psi_%d|| = %1.5e\n", i, i, nrm); + } + } + } + } + } + for(i = 0; i < Ns; i++) { + /* + CT: We save dfl_fields[i] in a binary file, + using a generic nomenclature proc_i__dfl_fields for later reads + */ + sprintf(file_name,"dfl_fields.%.2d", i); + construct_writer(&writer, file_name, 0); + write_propagator_type(writer, 4); + write_spinor(writer, &dfl_fields[i], NULL, 1, 64); + destruct_writer(writer); + } + } + g_mu = musave; + g_sloppy_precision = 0; + boundary(g_kappa); + if(g_debug_level > 2) { + for(i = 0; i < Ns; i++) { + for(j = 0; j < Ns; j++) { + s = scalar_prod(dfl_fields[i], dfl_fields[j], N, 1); + if(g_proc_id == 0) { + printf("<%d, %d> = %1.3e +i %1.3e\n", i, j, creal(s), cimag(s)); + } + } + } + } + for (i = 0; i < Ns; i++) { + /* add it to the basis */ + /* split_global_field(block_list[0].basis[i], block_list[1].basis[i], dfl_fields[i]); */ + split_global_field_GEN_ID(block_list, i, dfl_fields[i], nb_blocks); + } + + /* perform local orthonormalization */ + for(i = 0; i < nb_blocks; i++) block_orthonormalize(block_list+i); + /* block_orthonormalize(block_list+1); */ + + dfl_subspace_updated = 1; + + for(j = 0; j < Ns; j++) { + for(i = 0; i < nb_blocks*9*Ns; i++) { + (little_dfl_fields[j][i]) = 0.0; + (work[i]) = 0.0; + } + } + + /* compute the little little basis */ + /* r = work_fields[0]; */ + /* q = g_spinor_field[DUM__SOLVER+1]; */ + + for(i = 0; i < Ns; i++) { + /* split_global_field(r, q, dfl_fields[i]); */ + split_global_field_GEN(psi, 
dfl_fields[i], nb_blocks); + /* now take the local scalar products */ + for(j = 0; j < Ns; j++) { + //p = r; + for(blk = 0; blk < nb_blocks; blk++) { + //if(blk == 0) p = r; else p = q; + little_dfl_fields[i][j + blk*Ns] = scalar_prod(block_list[blk].basis[j], psi[blk], block_list[0].volume, 0); + } + } + } + + /* orthonormalise */ + for(i = 0; i < Ns; i++) { + for (j = 0; j < i; j++) { + s = lscalar_prod(little_dfl_fields[j], little_dfl_fields[i], nb_blocks*Ns, 1); + lassign_diff_mul(little_dfl_fields[i], little_dfl_fields[j], s, nb_blocks*Ns); + } + s = lsquare_norm(little_dfl_fields[i], nb_blocks*Ns, 1); + lmul_r(little_dfl_fields[i], 1./sqrt(creal(s)), little_dfl_fields[i], nb_blocks*Ns); + } + if(g_debug_level > 0) { + for(i = 0; i < Ns; i++) { + for(j = 0; j < Ns; j++) { + s = lscalar_prod(little_dfl_fields[i], little_dfl_fields[j], nb_blocks*Ns, 1); + if(g_proc_id == 0) { + printf("<%d, %d> = %1.3e +i %1.3e\n", i, j, creal(s), cimag(s)); + } + } + } + } + + for(i = 0; i < Ns; i++) { + little_D(work, little_dfl_fields[i]); + for(j = 0; j < Ns; j++) { + little_A[i * Ns + j] = lscalar_prod(little_dfl_fields[j], work, nb_blocks*Ns, 1); + if(g_proc_id == 0 && g_debug_level > 4) { + printf("%1.3e %1.3ei, ", creal(little_A[i * Ns + j]), cimag(little_A[i * Ns + j])); + } + } + if(g_proc_id == 0 && g_debug_level > 4) printf("\n"); + } + if(g_proc_id == 0 && g_debug_level > 4) printf("\n"); + /* the precision in the inversion is not yet satisfactory! 
*/ + LUInvert(Ns, little_A, Ns); + /* inverse of little little D now in little_A */ + + + for(j = 0; j < Ns; j++) { + for(i = 0; i < nb_blocks*9*Ns; i++) { + (little_dfl_fields_eo[j][i]) = 0.0; + (work[i]) = 0.0; + } + } + + /* compute the eo little little basis */ + /* r = work_fields[0]; */ + /* q = g_spinor_field[DUM__SOLVER+1]; */ + + for(i = 0; i < Ns; i++) { + /* split_global_field(r, q, dfl_fields[i]); */ + split_global_field_GEN(psi, dfl_fields[i], nb_blocks); + /* now take the local scalar products */ + for(j = 0; j < Ns; j++) { + i_o=0; + for(blk = 0; blk < nb_blocks; blk++) { + if (block_list[blk].evenodd==1) { + little_dfl_fields_eo[i][j + (nb_blocks/2+i_o)*Ns] = scalar_prod(block_list[blk].basis[j], psi[blk], block_list[0].volume, 0); + i_o++; + } + } + } + } + + /* orthonormalise */ + for(i = 0; i < Ns; i++) { + for (j = 0; j < i; j++) { + s = lscalar_prod(little_dfl_fields_eo[j], little_dfl_fields_eo[i], nb_blocks*Ns, 1); + lassign_diff_mul(little_dfl_fields_eo[i], little_dfl_fields_eo[j], s, nb_blocks*Ns); + } + s = lsquare_norm(little_dfl_fields_eo[i], nb_blocks*Ns, 1); + lmul_r(little_dfl_fields_eo[i], 1./sqrt(creal(s)), little_dfl_fields_eo[i], nb_blocks*Ns); + } + if(g_debug_level > 0) { + for(i = 0; i < Ns; i++) { + for(j = 0; j < Ns; j++) { + s = lscalar_prod(little_dfl_fields_eo[i], little_dfl_fields_eo[j], nb_blocks*Ns, 1); + if(g_proc_id == 0) { + printf("<%d, %d> = %1.3e +i %1.3e\n", i, j, creal(s), cimag(s)); + } + } + } + } + + for(i = 0; i < Ns; i++) { + little_D_sym(work, little_dfl_fields_eo[i]); + for(j = 0; j < Ns; j++) { + little_A_eo[i * Ns + j] = lscalar_prod(little_dfl_fields_eo[j], work, nb_blocks*Ns, 1); + if(g_proc_id == 0 && g_debug_level > 4) { + printf("%1.3e %1.3ei, ", creal(little_A_eo[i * Ns + j]), cimag(little_A_eo[i * Ns + j])); + } + } + if(g_proc_id == 0 && g_debug_level > 4) printf("\n"); + } + if(g_proc_id == 0 && g_debug_level > 4) printf("\n"); + /* the precision in the inversion is not yet satisfactory! 
*/ + LUInvert(Ns, little_A_eo, Ns); + /* inverse of eo little little D now in little_A_eo */ + + + +#ifdef MPI + etime = MPI_Wtime(); +#else + etime = (double)clock()/(double)(CLOCKS_PER_SEC); +#endif + if(g_proc_id == 0) { + printf("time for subspace generation %1.3e s\n", etime-atime); + fflush(stdout); + } + + finalize_solver(work_fields, nr_wf); + free_dfl_subspace(); + free(work); + free(psi[0]); + free(psi); + return(0); +} + +int generate_dfl_subspace_free(const int Ns, const int N) { + int i,j, vpr = VOLUMEPLUSRAND*sizeof(spinor)/sizeof(_Complex double), + vol = VOLUME*sizeof(spinor)/sizeof(_Complex double); + double nrm; + _Complex double s; + spinor ** work_fields = NULL; + const int nr_wf = 1; + init_solver_field(&work_fields, VOLUMEPLUSRAND, nr_wf); + + if(init_subspace == 0) init_dfl_subspace(Ns); + + for(i = 0; i < 12; i++) { + constant_spinor_field(dfl_fields[i], i, N); + ModifiedGS((_Complex double*)dfl_fields[i], vol, i, (_Complex double*)dfl_fields[0], vpr); + nrm = sqrt(square_norm(dfl_fields[i], N, 1)); + mul_r(dfl_fields[i], 1./nrm, dfl_fields[i], N); + + /* test quality */ + if(g_debug_level > -1) { + D_psi(work_fields[0], dfl_fields[i]); + nrm = sqrt(square_norm(work_fields[0], N, 1)); + if(g_proc_id == 0) { + printf(" ||D psi_%d||/||psi_%d|| = %1.5e\n", i, i, nrm); + } + } + } + + if(g_debug_level > 4) { + for(i = 0; i < 12; i++) { + for(j = 0; j < 12; j++) { + s = scalar_prod(dfl_fields[i], dfl_fields[j], N, 1); + if(g_proc_id == 0) { + printf("<%d, %d> = %1.3e +i %1.3e\n", i, j, creal(s), cimag(s)); + } + } + } + } + finalize_solver(work_fields, nr_wf); + return(0); +} + +int init_little_dfl_subspace(const int N_s) { + int i; + if(init_little_subspace == 0) { + if((void*)(_little_dfl_fields = (_Complex double*)calloc((N_s)*nb_blocks*9*N_s+4, sizeof(_Complex double))) == NULL) { + return(1); + } + if((void*)(little_dfl_fields = (_Complex double**)calloc(N_s, sizeof(_Complex double*))) == NULL) { + return(1); + } + 
if((void*)(_little_dfl_fields_eo = (_Complex double*)calloc((N_s)*nb_blocks*9*N_s+4, sizeof(_Complex double))) == NULL) { + return(1); + } + if((void*)(little_dfl_fields_eo = (_Complex double**)calloc(N_s, sizeof(_Complex double*))) == NULL) { + return(1); + } +#if ( defined SSE || defined SSE2 || defined SSE3) + little_dfl_fields[0] = (_Complex double*)(((unsigned long int)(_little_dfl_fields)+ALIGN_BASE)&~ALIGN_BASE); + little_dfl_fields_eo[0] = (_Complex double*)(((unsigned long int)(_little_dfl_fields_eo)+ALIGN_BASE)&~ALIGN_BASE); +#else + little_dfl_fields[0] = _little_dfl_fields; + little_dfl_fields_eo[0] = _little_dfl_fields_eo; +#endif + for (i = 1; i < N_s; i++) { + little_dfl_fields[i] = little_dfl_fields[i-1] + nb_blocks*9*N_s; + little_dfl_fields_eo[i] = little_dfl_fields_eo[i-1] + nb_blocks*9*N_s; + } + if((void*)(little_A = (_Complex double*)calloc(N_s*N_s, sizeof(_Complex double))) == NULL) { + return(1); + } + if((void*)(little_A_eo = (_Complex double*)calloc(N_s*N_s, sizeof(_Complex double))) == NULL) { + return(1); + } + init_little_subspace = 1; + } + return(0); +} + +int init_dfl_subspace(const int N_s) { + int i; + init_subspace = 1; + if((void*)(_dfl_fields = calloc((N_s)*VOLUMEPLUSRAND+1, sizeof(spinor))) == NULL) { + return(1); + } + if ((void*)(dfl_fields = calloc((N_s), sizeof(spinor *))) == NULL) { + return(1); + } +#if ( defined SSE || defined SSE2 || defined SSE3) + dfl_fields[0] = (spinor*)(((unsigned long int)(_dfl_fields)+ALIGN_BASE)&~ALIGN_BASE); +#else + dfl_fields[0] = _dfl_fields; +#endif + for (i = 1; i < N_s; ++i) { + dfl_fields[i] = dfl_fields[i-1] + VOLUMEPLUSRAND; + } + return 0; +} + +int free_dfl_subspace() { + if(init_subspace == 1) { + free(dfl_fields); + free(_dfl_fields); + init_subspace = 0; + } + return 0; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/generate_dfl_subspace.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/generate_dfl_subspace.h new file mode 100644 index 
/* Allocate the N_s deflation fields behind dfl_fields; returns 0 on success, 1 if an allocation fails. */
int init_dfl_subspace(const int);
/* Free the deflation fields again (no-op when none are allocated); always returns 0. */
int free_dfl_subspace();
/*
 * Generate the Ns deflation fields and the derived little basis.
 * Fields are read from files named "dfl_fields.NN" when those exist and
 * are otherwise computed and written to such files.  The deflation fields
 * are released via free_dfl_subspace() before returning.  Returns 0.
 * repro is only forwarded to random_spinor_field_lexic() in the
 * g_debug_level > 4 diagnostic path.
 */
int generate_dfl_subspace(const int Ns, const int N, const int repro);
/* Build deflation fields from constant spinor fields instead of iterating; returns 0. */
int generate_dfl_subspace_free(const int Ns, const int N);

/* dfl_fields[i] points to deflation field i (VOLUMEPLUSRAND spinors each) */
extern spinor ** dfl_fields;
/* little_dfl_fields[i] / little_dfl_fields_eo[i]: vector i of the little basis, nb_blocks*9*Ns complex numbers each */
extern _Complex double ** little_dfl_fields;
extern _Complex double ** little_dfl_fields_eo;
tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + * + * Generalized minimal residual (GMRES) with a maximal number of restarts. + * Solves Q=AP for _Complex double regular matrices A. + * For details see: Andreas Meister, Numerik linearer Gleichungssysteme + * or the original citation: + * Y. Saad, M.H.Schultz in GMRES: A generalized minimal residual algorithm + * for solving nonsymmetric linear systems. + * SIAM J. Sci. Stat. Comput., 7: 856-869, 1986 + * + * int gmres(spinor * const P,spinor * const Q, + * const int m, const int max_restarts, + * const double eps_sq, const int rel_prec, + * const int N, const int parallel, matrix_mult f) + * + * Returns the number of iterations needed or -1 if the maximal number of restarts + * has been reached.
static void init_gmres(const int _M, const int _V);

/*
 * Work space shared by all gmres() calls; (re)allocated by init_gmres().
 *   H     : (M+1) x M Hessenberg matrix, H[i] points to row i inside _h
 *   V     : M basis vectors of Vo spinors each, V[i] points into _v
 *   alpha : M+1 entries, right-hand side of the small least-squares problem
 *   c, s  : M entries each, coefficients of the plane rotations applied to H
 * NOTE(review): alpha, c and s are declared twice below.  The repeated
 * tentative definitions are legal C but redundant.
 */
static _Complex double ** H;
static _Complex double * alpha;
static _Complex double * c;
static double * s;
static spinor ** V;
static spinor * _v;
static _Complex double * _h;
static _Complex double * alpha;
static _Complex double * c;
static double * s;

/*
 * Restarted GMRES(m) for f(x) = Q.
 *
 * P            : in: start guess, out: the final iterate (written on every
 *                return path)
 * Q            : right-hand side
 * m            : number of Krylov vectors built per restart cycle
 * max_restarts : maximal number of restart cycles
 * eps_sq       : SQUARED tolerance; the code compares against sqrt(eps_sq)
 * rel_prec     : 0 -> stop when residual <= eps,
 *                1 -> stop when residual <= eps * ||Q||
 * N            : number of sites handed to the linear algebra routines;
 *                N == VOLUME selects full-size work fields, anything else
 *                half-size ones
 * parallel     : forwarded to square_norm()/scalar_prod()
 * f            : operator application, called as f(out, in)
 *
 * Returns restart*m + j when the stopping criterion is met in inner step j,
 * restart*m when the true residual at the start of a cycle is exactly zero,
 * and -1 when max_restarts cycles were done without meeting the criterion.
 */
int gmres(spinor * const P,spinor * const Q, 
	  const int m, const int max_restarts,
	  const double eps_sq, const int rel_prec,
	  const int N, const int parallel, matrix_mult f){

  int restart, i, j, k;
  double beta, eps, norm;
  _Complex double tmp1, tmp2;
  spinor ** solver_field = NULL;
  const int nr_sf = 3;

  /* solver_field[0]: residual / A*v_j, [1]: vector being orthogonalised, [2]: current iterate */
  if(N == VOLUME) {
    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
  }
  else {
    init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
  }

  eps=sqrt(eps_sq);
  /* NOTE(review): the basis is always sized for VOLUMEPLUSRAND, also when N != VOLUME */
  init_gmres(m, VOLUMEPLUSRAND);

  norm = sqrt(square_norm(Q, N, parallel));

  assign(solver_field[2], P, N);
  for(restart = 0; restart < max_restarts; restart++){
    /* r_0=Q-AP  (b=Q, x+0=P) */
    f(solver_field[0], solver_field[2]);
    diff(solver_field[0], Q, solver_field[0], N);

    /* v_0=r_0/||r_0|| */
    alpha[0] = sqrt(square_norm(solver_field[0], N, parallel));

    if(g_proc_id == g_stdio_proc && g_debug_level > 1){
      printf("%d\t%g true residue\n", restart*m, creal(alpha[0])*creal(alpha[0])); 
      fflush(stdout);
    }

    /* exact solution reached: avoid the division by zero below */
    if(creal(alpha[0])==0.){
      assign(P, solver_field[2], N);
      finalize_solver(solver_field, nr_sf);
      return(restart*m);
    }

    mul_r(V[0], 1./creal(alpha[0]), solver_field[0], N);

    for(j = 0; j < m; j++){
      /* solver_field[0]=A*v_j */

      f(solver_field[0], V[j]);

      /* Set h_ij and omega_j */
      /* solver_field[1] <- omega_j */
      assign(solver_field[1], solver_field[0], N);
      /* orthogonalise against v_0..v_j, updating the vector after every projection */
      for(i = 0; i <= j; i++){
	H[i][j] = scalar_prod(V[i], solver_field[1], N, parallel);
	assign_diff_mul(solver_field[1], V[i], H[i][j], N);
      }

      H[j+1][j] = sqrt(square_norm(solver_field[1], N, parallel));
      /* apply the rotations of the previous steps to the new column j */
      for(i = 0; i < j; i++){
	tmp1 = H[i][j];
	tmp2 = H[i+1][j];
	(H[i][j]) = (tmp2) * (s[i]);
	(H[i][j]) += conj(c[i]) * (tmp1);
	(H[i+1][j]) = (tmp1) * (s[i]);
	(H[i+1][j]) -= (c[i]) * (tmp2);
      }

      /* Set beta, s, c, alpha[j],[j+1] */
      /* new rotation that eliminates H[j+1][j]; beta becomes the diagonal entry */
      /* NOTE(review): beta == 0 (H[j][j] and H[j+1][j] both zero) is not guarded */
      beta = sqrt(creal(H[j][j] * conj(H[j][j])) + creal(H[j+1][j] * conj(H[j+1][j])));
      s[j] = creal(H[j+1][j]) / beta;
      (c[j]) = (H[j][j]) / beta;
      (H[j][j]) = beta;
      (alpha[j+1]) = (alpha[j]) * (s[j]);
      tmp1 = alpha[j];
      (alpha[j]) = conj(c[j]) * (tmp1);

      /* precision reached? */
      /* alpha[j+1] is used as the residual estimate (real part only) */
      if(g_proc_id == g_stdio_proc && g_debug_level > 1){
	printf("%d\t%g residue\n", restart*m+j, creal(alpha[j+1])*creal(alpha[j+1])); 
	fflush(stdout);
      }
      if(((creal(alpha[j+1]) <= eps) && (rel_prec == 0)) || ((creal(alpha[j+1]) <= eps*norm) && (rel_prec == 1))){
	/* back substitution with the triangular part of H, adding each term to the iterate */
	(alpha[j]) = (alpha[j]) * (1./creal(H[j][j]));
	assign_add_mul(solver_field[2], V[j], alpha[j], N);
	for(i = j-1; i >= 0; i--){
	  for(k = i+1; k <= j; k++){
	    (tmp1) = (H[i][k]) * (alpha[k]); 
	    (alpha[i]) -= tmp1;
	  }
	  (alpha[i]) = (alpha[i]) * (1./creal(H[i][i]));
	  assign_add_mul(solver_field[2], V[i], alpha[i], N);
	}
	for(i = 0; i < m; i++){
	  alpha[i] = creal(alpha[i]);
	}
	assign(P, solver_field[2], N);
	finalize_solver(solver_field, nr_sf);
	return(restart*m+j);
      }
      /* if not */
      else{
	/* V only has m entries, so no v_m is stored after the last step */
	if(j != m-1){
	  mul_r(V[(j+1)], 1./creal(H[j+1][j]), solver_field[1], N);
	}
      }

    }
    j=m-1;
    /* prepare for restart */
    /* same back substitution as above, over the full cycle of m vectors */
    (alpha[j]) = (alpha[j]) * (1./creal(H[j][j]));
    assign_add_mul(solver_field[2], V[j], alpha[j], N);
    for(i = j-1; i >= 0; i--){
      for(k = i+1; k <= j; k++){
	(tmp1) = (H[i][k]) * (alpha[k]);
	(alpha[i]) -= tmp1;
      }
      (alpha[i]) = (alpha[i]) * (1./creal(H[i][i]));
      assign_add_mul(solver_field[2], V[i], alpha[i], N);
    }
    for(i = 0; i < m; i++){
      alpha[i] = creal(alpha[i]);
    }
  }

  /* If maximal number of restarts is reached */
  assign(P, solver_field[2], N);
  finalize_solver(solver_field, nr_sf);
  return(-1);
}

/*
 * (Re)allocate the static GMRES work space for _M Krylov vectors of _V
 * spinors each.  The buffers are kept between calls and only reallocated
 * when _M or _V differ from the previous call.
 *
 * NOTE(review): none of the calloc() results is checked.
 * NOTE(review): the aligned variant is selected for SSE and SSE2 only,
 * whereas generate_dfl_subspace.c also tests SSE3 -- confirm whether that
 * difference is intended.
 */
static void init_gmres(const int _M, const int _V){
  static int Vo = -1;
  static int M = -1;
  static int init = 0;
  int i;
  if((M != _M)||(init == 0)||(Vo != _V)){
    if(init == 1){
      free(H);
      free(V);
      free(_h);
      free(_v);
      free(alpha);
      free(c);
      free(s);
    }
    Vo = _V;
    M = _M;
    H = calloc(M+1, sizeof(_Complex double *));
    V = calloc(M, sizeof(spinor *));
#if (defined SSE || defined SSE2)
    /* over-allocate so that the first element can be shifted to an aligned address */
    _h = calloc((M+2)*M+8, sizeof(_Complex double));
    H[0] = (_Complex double *)(((unsigned long int)(_h)+ALIGN_BASE)&~ALIGN_BASE); 
    _v = calloc(M*Vo+1, sizeof(spinor));
    V[0] = (spinor *)(((unsigned long int)(_v)+ALIGN_BASE)&~ALIGN_BASE);
#else
    _h = calloc((M+1)*M, sizeof(_Complex double));
    H[0] = _h;
    _v = calloc(M*Vo, sizeof(spinor));
    V[0] = _v;
#endif
    s = calloc(M, sizeof(double));
    c = calloc(M, sizeof(_Complex double));
    alpha = calloc(M+1, sizeof(_Complex double));
    /* rows of H are M elements apart, basis vectors Vo spinors apart */
    for(i = 1; i < M; i++){
      V[i] = V[i-1] + Vo;
      H[i] = H[i-1] + M;
    }
    /* H has one more row than V has vectors */
    H[M] = H[M-1] + M;
    init = 1;
  }
  return;
}
+ * For details see: Andreas Meister, Numerik linearer Gleichungssysteme + * or the original citation: + * Y. Saad, M.H.Schultz in GMRES: A generalized minimal residual algorithm + * for solving nonsymmetric linear systems. + * SIAM J. Sci. Stat. Comput., 7: 856-869, 1986 + * + * int gmres(spinor * const P,spinor * const Q, + * const int m, const int max_restarts, + * const double eps_sq, matrix_mult f) + * + * Returns the number of iterations needed or -1 if maximal number of restarts + * has been reached. + * + * Inout: + * spinor * P : guess for the solving spinor + * Input: + * spinor * Q : source spinor + * int m : Maximal dimension of Krylov subspace + * int max_restarts : maximal number of restarts + * double eps : stopping criterium + * matrix_mult f : pointer to a function containing the matrix mult + * for type matrix_mult see matrix_mult_typedef.h + * + * Autor: Carsten Urbach + ********************************************************************************/ + +#ifndef _GMRES_H +#define _GMRES_H + +#include"solver/matrix_mult_typedef.h" +#include"su3.h" + +int gmres(spinor * const P,spinor * const Q, + const int m, const int max_restarts, + const double eps, const int rel_prec, + const int N, const int parallel, matrix_mult f); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gmres_dr.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gmres_dr.c new file mode 100644 index 0000000000000000000000000000000000000000..3a55a137c2eb656de0eabc22bc8a84123c0b78ab --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gmres_dr.c @@ -0,0 +1,556 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Generalized minimal residual (GMRES) with deflated restarting (Morgan)
+ *
+ * This requires LAPACK to run...
+ *
+ * Inout:
+ *  spinor * P       : guess for the solving spinor
+ * Input:
+ *  spinor * Q       : source spinor
+ *  int m            : Maximal dimension of Krylov subspace
+ *  int nr_ev        : number of eigenvectors to be deflated
+ *  int max_restarts : maximal number of restarts
+ *  double eps_sq    : stopping criterion (squared residual norm)
+ *  int rel_prec     : 1 to test the residual relative to ||Q||, 0 for absolute
+ *  int N            : number of spinors (lattice sites) in P and Q
+ *  matrix_mult f    : pointer to a function containing the matrix mult
+ *                     for type matrix_mult see matrix_mult_typedef.h
+ *
+ * Author: Carsten Urbach
+ ********************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+#include<stdlib.h>
+#include<stdio.h>
+#include<math.h>
+#include"global.h"
+#include<complex.h>
+#include"su3.h"
+#include"linalg_eo.h"
+#include"diagonalise_general_matrix.h"
+#include"quicksort.h"
+#include"linalg/lapack.h"
+#include"linalg/blas.h"
+#include"solver/gram-schmidt.h"
+#include"solver/gmres.h"
+#include "solver/solver_field.h"
+#include"gmres_dr.h"
+
+#ifndef HAVE_LAPACK
+/* In case there is no lapack use normal gmres */
+int gmres_dr(spinor * const P,spinor * const Q,
+             const int m, const int nr_ev, const int max_restarts,
+             const double eps_sq, const int rel_prec,
+             const int N, matrix_mult f){
+  return(gmres(P, Q, m, max_restarts, eps_sq, rel_prec, N, 1, f));
+}
+
+#else
+
+static void init_gmres_dr(const int _M, const int _V);
+_Complex double short_scalar_prod(_Complex double * const x, _Complex double * const y, const int N);
+void short_ModifiedGS(_Complex double v[], int n, int m, _Complex double A[], int lda);
+
+/* Work space shared by all calls; (re)allocated by init_gmres_dr(). */
+static _Complex double ** work;
+static _Complex double * _work;
+static _Complex double ** work2;
+static _Complex double * _work2;
+static _Complex double ** H;
+static _Complex double ** G;
+static _Complex double * alpha;
+static _Complex double * c;
+static double * s;
+static spinor ** V;
+static spinor * _v;
+static spinor ** Z;
+static spinor * _z;
+static _Complex double * _h;
+static _Complex double * _g;
+static _Complex double * evalues;
+static double * sortarray;
+static int * idx;
+static int one = 1;
+static _Complex double cmone;
+static _Complex double cpone;
+static _Complex double czero;
+
+/*
+ * GMRES with deflated restarting.
+ *
+ * Runs one ordinary GMRES(m) cycle and then up to max_restarts-1 deflated
+ * cycles in which nr_ev harmonic Ritz vectors are kept in the Krylov basis.
+ * The current iterate is copied to P before every return.
+ *
+ * Returns restart*m+j when the first cycle converges in step j, restart*m
+ * when convergence is detected at the start of a restart cycle (or the
+ * initial residual is exactly zero), and -1 when max_restarts is exhausted.
+ */
+int gmres_dr(spinor * const P,spinor * const Q,
+             const int m, const int nr_ev, const int max_restarts,
+             const double eps_sq, const int rel_prec,
+             const int N, matrix_mult f){
+
+  int restart=0, i, j, k, l;
+  double beta, eps, norm, beta2=0.;
+  /* zgels work space; allocated lazily in the restart loop and released
+   * on every exit path that can be reached after the allocation. */
+  _Complex double *lswork = NULL;
+  int lwork;
+  _Complex double tmp1, tmp2;
+  int info=0;
+  /* V2 is the leading dimension (in complex numbers) of the V storage handed
+   * to ModifiedGS. init_gmres_dr(m, VOLUMEPLUSRAND) below spaces consecutive
+   * V[i] by VOLUMEPLUSRAND spinors and a spinor holds 12 complex numbers
+   * (cf. _N = 12*N), so the stride is 12*VOLUMEPLUSRAND. The former value
+   * 12*VOLUMEPLUSRAND/2 addressed the middle of V[(i-1)/2] for odd i. */
+  int _m = m, mp1 = m+1, np1 = nr_ev+1, ne = nr_ev, V2 = 12*(VOLUMEPLUSRAND), _N = 12*N;
+  spinor ** solver_field = NULL;
+  const int nr_sf = 3;
+
+  if(N == VOLUME) {
+    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
+  }
+  else {
+    init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
+  }
+  double err=0.;
+  spinor * r0, * x0;
+
+  cmone = -1.;
+  cpone = 1.;
+  czero = 0.;
+
+  r0 = solver_field[0];
+  x0 = solver_field[2];
+  eps=sqrt(eps_sq);
+  init_gmres_dr(m, (VOLUMEPLUSRAND));
+  norm = sqrt(square_norm(Q, N, 1));
+
+  assign(x0, P, N);
+
+  /* first normal GMRES cycle */
+  /* r_0=Q-AP (b=Q, x+0=P) */
+  f(r0, x0);
+  diff(r0, Q, r0, N);
+
+  /* v_0=r_0/||r_0|| */
+  alpha[0] = sqrt(square_norm(r0, N, 1));
+  err = alpha[0];
+
+  if(g_proc_id == g_stdio_proc && g_debug_level > 0){
+    printf("%d\t%e true residue\n", restart * m, creal(alpha[0])*creal(alpha[0]));
+    fflush(stdout);
+  }
+
+  if(creal(alpha[0])==0.){
+    assign(P, x0, N);
+    finalize_solver(solver_field, nr_sf);
+    return(restart*m);
+  }
+
+  mul_r(V[0], 1./creal(alpha[0]), r0, N);
+
+  for(j = 0; j < m; j++){
+    /* solver_field[0]=A*v_j */
+
+    /* Set h_ij and omega_j */
+    /* solver_field[1] <- omega_j */
+    f(solver_field[1], V[j]);
+/*     assign(solver_field[1], solver_field[0], N); */
+    for(i = 0; i <= j; i++){
+      H[i][j] = scalar_prod(V[i], solver_field[1], N, 1);
+      /* G, work and work2 are in Fortran storage: columns first */
+      G[j][i] = H[i][j];
+      work2[j][i] = H[i][j];
+      work[i][j] = conj(H[i][j]);
+      assign_diff_mul(solver_field[1], V[i], H[i][j], N);
+    }
+
+    H[j+1][j] = sqrt(square_norm(solver_field[1], N, 1));
+    G[j][j+1] = H[j+1][j];
+    work2[j][j+1] = H[j+1][j];
+    work[j+1][j] = conj(H[j+1][j]);
+    beta2 = creal(H[j+1][j])*creal(H[j+1][j]);
+    /* apply the previous Givens rotations to the new column of H */
+    for(i = 0; i < j; i++){
+      tmp1 = H[i][j];
+      tmp2 = H[i+1][j];
+      H[i][j] = (tmp2) * (s[i]);
+      H[i][j] += conj(c[i]) * (tmp1);
+      H[i+1][j] = (tmp1) * (s[i]);
+      H[i+1][j] -= (c[i]) * (tmp2);
+    }
+
+    /* Set beta, s, c, alpha[j],[j+1] */
+    beta = sqrt(creal(H[j][j] * conj(H[j][j])) + creal(H[j+1][j] * conj(H[j+1][j])));
+    s[j] = creal(H[j+1][j]) / beta;
+    c[j] = (H[j][j]) / beta;
+    H[j][j] = beta;
+    alpha[j+1] = (alpha[j]) * (s[j]);
+    tmp1 = alpha[j];
+    alpha[j] = conj(c[j]) * (tmp1);
+
+    /* precision reached? */
+    if(g_proc_id == g_stdio_proc && g_debug_level > 0){
+      printf("%d\t%e residue\n", restart*m+j, creal(alpha[j+1])*creal(alpha[j+1]));
+      fflush(stdout);
+    }
+    if(((creal(alpha[j+1]) <= eps) && (rel_prec == 0)) || ((creal(alpha[j+1]) <= eps*norm) && (rel_prec == 1))){
+      /* back substitution, accumulating the solution in x0 */
+      alpha[j] = (alpha[j]) * (1./creal(H[j][j]));
+      assign_add_mul(x0, V[j], alpha[j], N);
+      for(i = j-1; i >= 0; i--){
+        for(k = i+1; k <= j; k++){
+          tmp1 = (H[i][k]) * (alpha[k]);
+          /* alpha[i] -= tmp1 */
+          alpha[i] -= tmp1;
+        }
+        alpha[i] = (alpha[i]) * (1./creal(H[i][i]));
+        assign_add_mul(x0, V[i], alpha[i], N);
+      }
+      for(i = 0; i < m; i++){
+        alpha[i] = creal(alpha[i]);
+      }
+      assign(P, x0, N);
+      finalize_solver(solver_field, nr_sf);
+      return(restart*m+j);
+    }
+    /* if not */
+    else {
+      mul_r(V[(j+1)], 1./creal(H[j+1][j]), solver_field[1], N);
+    }
+
+  }
+  j=m-1;
+  /* prepare for restart */
+  alpha[j] = (alpha[j]) * (1./creal(H[j][j]));
+  assign_add_mul(x0, V[j], alpha[j], N);
+  if(g_proc_id == 0 && g_debug_level > 3) {
+    printf("alpha: %e %e\n", creal(alpha[j]), cimag(alpha[j]));
+  }
+  for(i = j-1; i >= 0; i--){
+    for(k = i+1; k <= j; k++){
+      tmp1 = (H[i][k]) * (alpha[k]);
+      alpha[i] -= tmp1;
+    }
+    alpha[i] = (alpha[i]) * (1./creal(H[i][i]));
+    if(g_proc_id == 0 && g_debug_level > 3) {
+      printf("alpha: %e %e\n", creal(alpha[i]), cimag(alpha[i]));
+    }
+    assign_add_mul(x0, V[i], alpha[i], N);
+  }
+
+  /* This produces c=V_m+1*r0 */
+  for(i = 0; i < mp1; i++) {
+    c[i] = scalar_prod(V[i], r0, N, 1);
+    if(g_proc_id == 0 && g_debug_level > 3) {
+      printf("c: %e %e err = %e\n", creal(c[i]), cimag(c[i]), err);
+    }
+  }
+
+  for(restart = 1; restart < max_restarts; restart++) {
+
+    /* compute c-\bar H \alpha */
+    _FT(zgemv)("N", &mp1, &_m, &cmone, G[0], &mp1, alpha, &one, &cpone, c, &one, 1);
+    err = creal(sqrt(short_scalar_prod(c, c, mp1)));
+    if(g_proc_id == 0 && g_debug_level > 0) {
+      printf("%d\t %e short residue\n", m*restart, err*err);
+    }
+
+    /* Compute new residual r0 */
+    /* r_0=Q-AP (b=Q, x+0=P) */
+    if(g_debug_level > 0) {
+      f(r0, x0);
+      diff(r0, Q, r0, N);
+      tmp1 = sqrt(square_norm(r0, N, 1)) * I;
+      if(g_proc_id == g_stdio_proc)
+      {
+        printf("%d\t%e true residue\n", m*restart, cimag(tmp1)*cimag(tmp1));
+        fflush(stdout);
+      }
+    }
+    mul(r0, c[0], V[0], N);
+    for(i = 1; i < mp1; i++) {
+      assign_add_mul(r0, V[i], c[i], N);
+    }
+    if(g_debug_level > 3) {
+      tmp1 = sqrt(square_norm(r0, N, 1)) * I;
+      if(g_proc_id == g_stdio_proc){
+        printf("%d\t%e residue\n", m*restart, cimag(tmp1)*cimag(tmp1));
+        fflush(stdout);
+      }
+    }
+    /* Stop if satisfied */
+    /* NOTE(review): unlike the test in the first cycle this one ignores
+     * rel_prec and always compares against the absolute eps - confirm
+     * whether that is intended. */
+    if(err < eps){
+      assign(P, x0, N);
+      finalize_solver(solver_field, nr_sf);
+      free(lswork);
+      return(restart*m);
+    }
+
+    /* Prepare to compute harmonic Ritz pairs */
+    for(i = 0; i < m-1; i++){
+      alpha[i] = 0.;
+    }
+    alpha[m-1] = 1.;
+    _FT(zgesv)(&_m, &one, work[0], &mp1, idx, alpha, &_m, &info);
+    for(i = 0; i < m; i++) {
+      G[m-1][i] += beta2*alpha[idx[i]-1];
+    }
+    if(g_proc_id == 0 && g_debug_level > 3){
+      printf("zgesv returned info = %d, c[m-1]= %e, %e , idx[m-1]=%d\n",
+             info, creal(alpha[idx[m-1]-1]), cimag(alpha[idx[m-1]-1]), idx[m-1]);
+    }
+    /* c - \bar H * d -> c */
+    /* G contains H + \beta^2 H^-He_n e_n^H */
+
+    /* Compute harmonic Ritz pairs */
+    diagonalise_general_matrix(m, G[0], mp1, alpha, evalues);
+    for(i = 0; i < m; i++) {
+      sortarray[i] = creal(evalues[i] * conj(evalues[i]));
+      idx[i] = i;
+    }
+    quicksort(m, sortarray, idx);
+    if(g_proc_id == g_stdio_proc && g_debug_level > 1) {
+      for(i = 0; i < m; i++)
+        printf("# Evalues %d %e %e \n", i, creal(evalues[idx[i]]), cimag(evalues[idx[i]]));
+      fflush(stdout);
+    }
+
+    /* Copy the first nr_ev eigenvectors to work */
+    for(i = 0; i < ne; i++) {
+      for(l = 0; l < m; l++) {
+        work[i][l] = G[idx[i]][l];
+      }
+    }
+    /* Orthonormalize them */
+    for(i = 0; i < ne; i++) {
+      work[i][m] = 0.;
+      short_ModifiedGS(work[i], m, i, work[0], mp1);
+    }
+    /* Orthonormalize c - \bar H d to work */
+    short_ModifiedGS(c, m+1, ne, work[0], mp1);
+    for(i = 0; i < mp1; i++) {
+      work[nr_ev][i] = c[i];
+    }
+    /* Now compute \bar H = P^T_k+1 \bar H_m P_k */
+    for(i = 0; i < mp1; i++) {
+      for(l = 0; l < mp1; l++) {
+        H[i][l] = 0.;
+      }
+    }
+
+    _FT(zgemm)("N", "N", &mp1, &ne, &_m, &cpone, work2[0], &mp1, work[0], &mp1, &czero, G[0], &mp1, 1, 1);
+    _FT(zgemm)("C", "N", &np1, &ne , &mp1, &cpone, work[0], &mp1, G[0], &mp1, &czero, H[0], &mp1, 1, 1);
+
+    if(g_debug_level > 3) {
+      for(i = 0; i < ne+1; i++) {
+        for(l = 0; l < ne+1; l++) {
+          if(g_proc_id == 0) {
+            /* print real and imaginary part (the second value used to be
+             * the real part a second time) */
+            printf("(g[%d], g[%d]) = %e, %e\n", i, l, creal(short_scalar_prod(work[i], work[l], m+1)),
+                   cimag(short_scalar_prod(work[i], work[l], m+1)));
+            printf("(g[%d], g[%d]) = %e, %e\n", l, i, creal(short_scalar_prod(work[l], work[i], m+1)),
+                   cimag(short_scalar_prod(work[l], work[i], m+1)));
+          }
+        }
+      }
+    }
+    /* V_k+1 = V_m+1 P_k+1 */
+/*     _FT(zgemm)("N", "N", &_N, &np1, &mp1, &cpone, (_Complex double*)V[0], &V2, work[0], &mp1, &czero, (_Complex double*)Z[0], &V2, 1, 1); */
+    for(l = 0; l < np1; l++) {
+      mul(Z[l], work[l][0], V[0], N);
+      for(i = 1; i < mp1; i++) {
+        assign_add_mul(Z[l], V[i], work[l][i], N);
+      }
+    }
+    /* copy back to V */
+    for(i = 0; i < np1; i++) {
+      assign(V[i], Z[i], N);
+    }
+    /* Reorthogonalise v_nr_ev */
+    ModifiedGS((_Complex double*)V[nr_ev], _N, nr_ev, (_Complex double*)V[0], V2);
+    if(g_debug_level > 3) {
+      for(i = 0; i < np1; i++) {
+        for(l = 0; l < np1; l++) {
+          tmp1 = scalar_prod(V[l], V[i], N, 1);
+          if(g_proc_id == 0) {
+            printf("(V[%d], V[%d]) = %e %e %d %d %d %d %d %d %e %e\n", l, i, creal(tmp1), cimag(tmp1), np1, mp1, ne, _m, _N, V2, creal(H[l][i]), cimag(H[l][i]));
+          }
+        }
+      }
+    }
+    /* Copy the content of H to work, work2 and G */
+    for(i=0; i < mp1; i++) {
+      for(l = 0; l < mp1; l++) {
+        G[i][l] = H[i][l];
+        work2[i][l] = H[i][l];
+        work[l][i] = conj(H[i][l]);
+      }
+    }
+
+    for(j = ne; j < m; j++) {
+      /* solver_field[0]=A*v_j */
+      f(solver_field[1], V[j]);
+
+      /* Set h_ij and omega_j */
+      /* solver_field[1] <- omega_j */
+/*       assign(solver_field[1], solver_field[0], N); */
+      for(i = 0; i <= j; i++){
+        H[j][i] = scalar_prod(V[i], solver_field[1], N, 1);
+        /* H, G, work and work2 are now all in Fortran storage: columns first */
+        G[j][i] = H[j][i];
+        work2[j][i] = H[j][i];
+        work[i][j] = conj(H[j][i]);
+        assign_diff_mul(solver_field[1], V[i], H[j][i], N);
+      }
+      beta2 = square_norm(solver_field[1], N, 1);
+      H[j][j+1] = sqrt(beta2);
+      G[j][j+1] = H[j][j+1];
+      work2[j][j+1] = H[j][j+1];
+      work[j+1][j] = conj(H[j][j+1]);
+      mul_r(V[(j+1)], 1./creal(H[j][j+1]), solver_field[1], N);
+    }
+
+    /* Solve the least square problem for alpha*/
+    /* This produces c=V_m+1*r0 */
+    for(i = 0; i < mp1; i++) {
+      c[i] = scalar_prod(V[i], r0, N, 1);
+      alpha[i] = c[i];
+      if(g_proc_id == 0 && g_debug_level > 3) {
+        printf("c: %e %e err = %e\n", creal(c[i]), cimag(c[i]), err);
+      }
+    }
+    if(lswork == NULL) {
+      /* work space query: lwork = -1 makes zgels return the optimal size in tmp1 */
+      lwork = -1;
+      _FT(zgels)("N", &mp1, &_m, &one, H[0], &mp1, alpha, &mp1, &tmp1, &lwork, &info, 1);
+      lwork = (int)creal(tmp1);
+      lswork = (_Complex double*)malloc(lwork*sizeof(_Complex double));
+    }
+    _FT(zgels)("N", &mp1, &_m, &one, H[0], &mp1, alpha, &mp1, lswork, &lwork, &info, 1);
+    if(g_proc_id == 0 && g_debug_level > 3) {
+      printf("zgels returned info = %d\n", info);
+      fflush(stdout);
+    }
+    /* Compute the new solution vector */
+    for(i = 0; i < m; i++){
+      if(g_proc_id == 0 && g_debug_level > 3) {
+        printf("alpha: %e %e\n", creal(alpha[i]), cimag(alpha[i]));
+      }
+      assign_add_mul(x0, V[i], alpha[i], N);
+    }
+  }
+
+
+  /* If maximal number of restart is reached */
+  assign(P, x0, N);
+  finalize_solver(solver_field, nr_sf);
+  free(lswork);
+  return(-1);
+}
+
+/* Returns sum_i conj(y[i]) * x[i] over N complex numbers, i.e. the first
+ * argument is the conjugated one. */
+_Complex double short_scalar_prod(_Complex double * const y, _Complex double * const x, const int N)
+{
+  _Complex double res = 0.0;
+
+  for (int ix = 0; ix < N; ++ix)
+    res += conj(y[ix]) * x[ix];
+  return(res);
+
+}
+
+/* Modified Gram-Schmidt on short vectors: removes from v (length n) its
+ * components along the first m columns of A (leading dimension lda) and
+ * then normalises v. */
+void short_ModifiedGS(_Complex double v[], int n, int m, _Complex double A[], int lda)
+{
+  double r;
+  for (int i = 0; i < m; ++i)
+  {
+    _Complex double s = -short_scalar_prod(A + i * lda, v, n);
+    _FT(zaxpy)(&n, &s,
A+i*lda, &one, v, &one); + } + + r = creal(sqrt(short_scalar_prod(v, v, n))); + for(int i = 0; i < n; ++i) + v[i] /= r; +} + +static void init_gmres_dr(const int _M, const int _V){ + static int Vo = -1; + static int M = -1; + static int init = 0; + int i; + + if((M != _M)||(init == 0)||(Vo != _V)){ + if(init == 1){ + free(Z); + free(_z); + free(H); + free(G); + free(V); + free(_h); + free(_g); + free(_v); + free(alpha); + free(c); + free(s); + free(evalues); + free(work); + free(_work); + free(work2); + free(_work2); + } + Vo = _V; + M = _M; + H = calloc(M+1, sizeof(_Complex double *)); + Z = calloc(M+1, sizeof(spinor *)); + G = calloc(M+1, sizeof(_Complex double *)); + V = calloc(M+1, sizeof(spinor *)); + work = calloc(M+1, sizeof(_Complex double *)); + work2 = calloc(M+1, sizeof(_Complex double *)); +#if (defined SSE || defined SSE2) + _h = calloc((M+2)*(M+1), sizeof(_Complex double)); + H[0] = (_Complex double *)(((unsigned long int)(_h)+ALIGN_BASE)&~ALIGN_BASE); + _work = calloc((M+2)*(M+1), sizeof(_Complex double)); + work[0] = (_Complex double *)(((unsigned long int)(_work)+ALIGN_BASE)&~ALIGN_BASE); + _work2 = calloc((M+2)*(M+1), sizeof(_Complex double)); + work2[0] = (_Complex double *)(((unsigned long int)(_work2)+ALIGN_BASE)&~ALIGN_BASE); + _g = calloc((M+2)*(M+1), sizeof(_Complex double)); + G[0] = (_Complex double *)(((unsigned long int)(_g)+ALIGN_BASE)&~ALIGN_BASE); + _v = calloc((M+1)*Vo+1, sizeof(spinor)); + V[0] = (spinor *)(((unsigned long int)(_v)+ALIGN_BASE)&~ALIGN_BASE); + _z = calloc((M+1)*Vo+1, sizeof(spinor)); + Z[0] = (spinor *)(((unsigned long int)(_z)+ALIGN_BASE)&~ALIGN_BASE); +#else + _h = calloc((M+1)*(M+1), sizeof(_Complex double)); + H[0] = _h; + _work = calloc((M+1)*(M+1), sizeof(_Complex double)); + work[0] = _work; + _work2 = calloc((M+1)*(M+1), sizeof(_Complex double)); + work2[0] = _work2; + _g = calloc((M+1)*(M+1), sizeof(_Complex double)); + G[0] = _g; + _v = calloc((M+1)*Vo, sizeof(spinor)); + V[0] = _v; + _z = calloc((M+1)*Vo, 
sizeof(spinor)); + Z[0] = _z; +#endif + s = calloc(M, sizeof(double)); + c = calloc(M+1, sizeof(_Complex double)); + alpha = calloc(M+1, sizeof(_Complex double)); + evalues = calloc(M+1, sizeof(_Complex double)); + sortarray = calloc(M+1, sizeof(double)); + idx = calloc(M+1, sizeof(int)); + for(i = 1; i < M; i++){ + V[i] = V[i-1] + Vo; + H[i] = H[i-1] + M+1; + Z[i] = Z[i-1] + Vo; + G[i] = G[i-1] + M+1; + work[i] = work[i-1] + M+1; + work2[i] = work2[i-1] + M+1; + } + work[M] = work[M-1] + M+1; + work2[M] = work2[M-1] + M+1; + H[M] = H[M-1] + M+1; + G[M] = G[M-1] + M+1; + V[M] = V[M-1] + Vo; + init = 1; + } +} +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gmres_dr.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gmres_dr.h new file mode 100644 index 0000000000000000000000000000000000000000..893638d35330e49c799f7591a635de76fea13111 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gmres_dr.h @@ -0,0 +1,60 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +/******************************************************************************* + * Generalized minimal residual (GMRES) with a maximal number of restarts. 
+ * Solves Q=AP for _Complex double regular matrices A. + * For details see: Andreas Meister, Numerik linearer Gleichungssysteme + * or the original citation: + * Y. Saad, M.H.Schultz in GMRES: A generalized minimal residual algorithm + * for solving nonsymmetric linear systems. + * SIAM J. Sci. Stat. Comput., 7: 856-869, 1986 + * + * int gmres(spinor * const P,spinor * const Q, + * const int m, const int max_restarts, + * const double eps_sq, matrix_mult f) + * + * Returns the number of iterations needed or -1 if maximal number of restarts + * has been reached. + * + * Inout: + * spinor * P : guess for the solving spinor + * Input: + * spinor * Q : source spinor + * int m : Maximal dimension of Krylov subspace + * int max_restarts : maximal number of restarts + * double eps : stopping criterium + * matrix_mult f : pointer to a function containing the matrix mult + * for type matrix_mult see matrix_mult_typedef.h + * + * Autor: Carsten Urbach + ********************************************************************************/ + +#ifndef _GMRES_DR_H +#define _GMRES_DR_H + +#include"solver/matrix_mult_typedef.h" +#include"su3.h" + +int gmres_dr(spinor * const P,spinor * const Q, + const int m, const int nr_ev, const int max_restarts, + const double eps, const int rel_prec, + const int N, matrix_mult f); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gmres_precon.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gmres_precon.c new file mode 100644 index 0000000000000000000000000000000000000000..bc3d6184a666c7d1b592093a546dc2d0d4d40b98 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gmres_precon.c @@ -0,0 +1,354 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * Generalized minimal residual (GMRES) with a maximal number of restarts. + * Solves Q=AP for complex regular matrices A. This is a special version for + * precondition. + * + * int gmres(spinor * const P,spinor * const Q, + * const int m, const int max_restarts, + * const double eps_sq, matrix_mult f) + * + * Returns the number of iterations needed or -1 if maximal number of restarts + * has been reached. 
+ * + * Inout: + * spinor * P : guess for the solving spinor + * Input: + * spinor * Q : source spinor + * int m : Maximal dimension of Krylov subspace + * int max_restarts : maximal number of restarts + * double eps : stopping criterium + * matrix_mult f : pointer to a function containing the matrix mult + * for type matrix_mult see matrix_mult_typedef.h + * + * Autor: Carsten Urbach + ********************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include"global.h" +#include"su3.h" +#include"linalg_eo.h" +#include"start.h" +#include "solver_field.h" +#include"gmres_precon.h" + +#ifdef _SOLVER_OUTPUT +#define _SO(x) x +#else +#define _SO(x) +#endif + +static void init_pgmres(const int _M, const int _V); + + +static complex ** H; +static complex * alpha; +static complex * c; +static double * s; +static spinor ** V; +static spinor * _v; +static complex * _h; +static complex * alpha; +static complex * c; +static double * s; + +int gmres_precon(spinor * const P,spinor * const Q, + const int m, const int max_restarts, + const double eps_sq, const int rel_prec, + const int N, matrix_mult f){ + + int restart, i, j, k; + double beta, eps, norm; + complex tmp1, tmp2; + spinor ** solver_field = NULL; + const int nr_sf = 2; + + if(N == VOLUME) { + init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); + } + else { + init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); + } + eps=sqrt(eps_sq); + init_pgmres(m, VOLUMEPLUSRAND); + + norm = sqrt(square_norm(Q, N, 1)); + +/* assign(solver_field[1], P, N); */ + zero_spinor_field(solver_field[1], N); + for(restart = 0; restart < max_restarts; restart++){ + /* r_0=Q-AP (b=Q, x+0=P) */ + f(solver_field[1], solver_field[1]); + diff(solver_field[1], Q, solver_field[1], N); + /* v_0=r_0/||r_0|| */ + alpha[0].re=sqrt(square_norm(solver_field[1], N, 1)); + +/* if(alpha[0].re == 0.){ */ +/* assign(P, solver_field[1], N); */ +/* 
return(restart*m); */ +/* } */ + + if(alpha[0].re != 0.) { + mul_r(V[0], 1./alpha[0].re, solver_field[1], N); + + for(j = 0; j < m; j++){ + /* solver_field[1]=A*v_j */ + + f(solver_field[1], V[j]); + + /* Set h_ij and omega_j */ + /* solver_field[0] <- omega_j */ + assign(solver_field[0], solver_field[1], N); + for(i = 0; i <= j; i++){ + H[i][j] = scalar_prod(V[i], solver_field[0], N, 1); + assign_diff_mul(solver_field[0], V[i], H[i][j], N); + } + + _complex_set(H[j+1][j], sqrt(square_norm(solver_field[0], N, 1)), 0.); + for(i = 0; i < j; i++){ + tmp1 = H[i][j]; + tmp2 = H[i+1][j]; + _mult_real(H[i][j], tmp2, s[i]); + _add_assign_complex_conj(H[i][j], c[i], tmp1); + _mult_real(H[i+1][j], tmp1, s[i]); + _diff_assign_complex(H[i+1][j], c[i], tmp2); + } + + /* Set beta, s, c, alpha[j],[j+1] */ + beta = sqrt(_complex_square_norm(H[j][j]) + _complex_square_norm(H[j+1][j])); + s[j] = H[j+1][j].re / beta; + _mult_real(c[j], H[j][j], 1./beta); + _complex_set(H[j][j], beta, 0.); + _mult_real(alpha[j+1], alpha[j], s[j]); + tmp1 = alpha[j]; + _mult_assign_complex_conj(alpha[j], c[j], tmp1); + + /* precision reached? 
*/ + if(g_proc_id == g_stdio_proc && g_debug_level > 0){ + printf("gmres precon\t%d\t%g residue\n", restart*m+j, alpha[j+1].re*alpha[j+1].re); + fflush(stdout); + } + if(((alpha[j+1].re <= eps) && (rel_prec == 0)) || ((alpha[j+1].re <= eps*norm) && (rel_prec == 1))){ + _mult_real(alpha[j], alpha[j], 1./H[j][j].re); + assign_add_mul(solver_field[1], V[j], alpha[j], N); + for(i = j-1; i >= 0; i--){ + for(k = i+1; k <= j; k++){ + _mult_assign_complex(tmp1, H[i][k], alpha[k]); + _diff_complex(alpha[i], tmp1); + } + _mult_real(alpha[i], alpha[i], 1./H[i][i].re); + assign_add_mul(solver_field[1], V[i], alpha[i], N); + } + for(i = 0; i < m; i++){ + alpha[i].im = 0.; + } + assign(P, solver_field[1], N); + finalize_solver(solver_field, nr_sf); + return(restart*m+j); + } + /* if not */ + else{ + if(j != m-1){ + mul_r(V[(j+1)], 1./H[j+1][j].re, solver_field[0], N); + } + } + } + + j=m-1; + /* prepare for restart */ + _mult_real(alpha[j], alpha[j], 1./H[j][j].re); + assign_add_mul(solver_field[1], V[j], alpha[j], N); + for(i = j-1; i >= 0; i--){ + for(k = i+1; k <= j; k++){ + _mult_assign_complex(tmp1, H[i][k], alpha[k]); + _diff_complex(alpha[i], tmp1); + } + _mult_real(alpha[i], alpha[i], 1./H[i][i].re); + assign_add_mul(solver_field[1], V[i], alpha[i], N); + } + for(i = 0; i < m; i++){ + alpha[i].im = 0.; + } + } + } + + /* If maximal number of restarts is reached */ + assign(P, solver_field[1], N); + + finalize_solver(solver_field, nr_sf); + return(-1); +} + +static void init_pgmres(const int _M, const int _V){ + static int Vo = -1; + static int M = -1; + static int pinit = 0; + int i; + if((M != _M)||(pinit == 0)||(Vo != _V)){ + if(pinit == 1){ + free(H); + free(V); + free(_h); + free(_v); + free(alpha); + free(c); + free(s); + } + Vo = _V; + M = _M; + H = calloc(M+1, sizeof(complex *)); + V = calloc(M, sizeof(spinor *)); +#if (defined SSE || defined SSE2) + _h = calloc((M+2)*M, sizeof(complex)); + H[0] = (complex *)(((unsigned long int)(_h)+ALIGN_BASE)&~ALIGN_BASE); + _v 
= calloc(M*Vo+1, sizeof(spinor)); + V[0] = (spinor *)(((unsigned long int)(_v)+ALIGN_BASE)&~ALIGN_BASE); +#else + _h = calloc((M+1)*M, sizeof(complex)); + H[0] = _h; + _v = calloc(M*Vo, sizeof(spinor)); + V[0] = _v; +#endif + s = calloc(M, sizeof(double)); + c = calloc(M, sizeof(complex)); + alpha = calloc(M+1, sizeof(complex)); + for(i = 1; i < M; i++){ + V[i] = V[i-1] + Vo; + H[i] = H[i-1] + M; + } + H[M] = H[M-1] + M; + pinit = 1; + } +} + + + +complex scalar_prod_nocom(spinor * const S,spinor * const R, const int N){ + int ix; + static double ks,kc,ds,tr,ts,tt; + spinor *s,*r; + complex c; + + /* Real Part */ + + ks=0.0; + kc=0.0; + + for (ix = 0; ix < N; ix++){ + s=(spinor *) S + ix; + r=(spinor *) R + ix; + + ds=(*r).s0.c0.re*(*s).s0.c0.re+(*r).s0.c0.im*(*s).s0.c0.im+ + (*r).s0.c1.re*(*s).s0.c1.re+(*r).s0.c1.im*(*s).s0.c1.im+ + (*r).s0.c2.re*(*s).s0.c2.re+(*r).s0.c2.im*(*s).s0.c2.im+ + (*r).s1.c0.re*(*s).s1.c0.re+(*r).s1.c0.im*(*s).s1.c0.im+ + (*r).s1.c1.re*(*s).s1.c1.re+(*r).s1.c1.im*(*s).s1.c1.im+ + (*r).s1.c2.re*(*s).s1.c2.re+(*r).s1.c2.im*(*s).s1.c2.im+ + (*r).s2.c0.re*(*s).s2.c0.re+(*r).s2.c0.im*(*s).s2.c0.im+ + (*r).s2.c1.re*(*s).s2.c1.re+(*r).s2.c1.im*(*s).s2.c1.im+ + (*r).s2.c2.re*(*s).s2.c2.re+(*r).s2.c2.im*(*s).s2.c2.im+ + (*r).s3.c0.re*(*s).s3.c0.re+(*r).s3.c0.im*(*s).s3.c0.im+ + (*r).s3.c1.re*(*s).s3.c1.re+(*r).s3.c1.im*(*s).s3.c1.im+ + (*r).s3.c2.re*(*s).s3.c2.re+(*r).s3.c2.im*(*s).s3.c2.im; + + /* Kahan Summation */ + tr=ds+kc; + ts=tr+ks; + tt=ts-ks; + ks=ts; + kc=tr-tt; + } + kc=ks+kc; + + c.re = kc; + + /* Imaginary Part */ + + ks=0.0; + kc=0.0; + + for (ix=0;ix. + ***********************************************************************/ + +/******************************************************************************* + * Generalized minimal residual (GMRES) with a maximal number of restarts. + * Solves Q=AP for _Complex double regular matrices A. 
+ * For details see: Andreas Meister, Numerik linearer Gleichungssysteme + * or the original citation: + * Y. Saad, M.H.Schultz in GMRES: A generalized minimal residual algorithm + * for solving nonsymmetric linear systems. + * SIAM J. Sci. Stat. Comput., 7: 856-869, 1986 + * + * int gmres(spinor * const P,spinor * const Q, + * const int m, const int max_restarts, + * const double eps_sq, matrix_mult f) + * + * Returns the number of iterations needed or -1 if maximal number of restarts + * has been reached. + * + * Inout: + * spinor * P : guess for the solving spinor + * Input: + * spinor * Q : source spinor + * int m : Maximal dimension of Krylov subspace + * int max_restarts : maximal number of restarts + * double eps : stopping criterium + * matrix_mult f : pointer to a function containing the matrix mult + * for type matrix_mult see matrix_mult_typedef.h + * + * Autor: Carsten Urbach + ********************************************************************************/ + +#ifndef _GMRES_PRECON_H +#define _GMRES_PRECON_H + +#include"solver/matrix_mult_typedef.h" +#include"su3.h" + +int gmres_precon(spinor * const P,spinor * const Q, + const int m, const int max_restarts, + const double eps, const int rel_prec, + const int N, matrix_mult f); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gram-schmidt.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gram-schmidt.c new file mode 100644 index 0000000000000000000000000000000000000000..e16eee224d80e3da2f849b80e42cb8f6a39b893f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gram-schmidt.c @@ -0,0 +1,152 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include "su3spinor.h" +#include +#include "linalg_eo.h" +#include "linalg/blas.h" +#ifdef CRAY +#include +#endif +#include "gram-schmidt.h" + +const int max_cgs_it=5; +static int ONE = 1; + +/* + * + * Iterated Classical Gram-Schmidt Orthogonalization + * + * Orthogonalizes v with respect to A. 
+ *
+ */
+
+/*
+ * v       : vector of n complex numbers, orthogonalized in place
+ * vnrm    : on return the norm of v after the last pass
+ * n       : length of v and of every column of A (in complex numbers)
+ * m       : number of columns of A to project out
+ * A       : column-major matrix with leading dimension lda
+ * work1   : scratch space for m complex numbers
+ *
+ * Each pass computes work1 = A^H v and v <- v - A work1. Passes are repeated
+ * (at most max_cgs_it times) until the norm of v drops by less than a
+ * factor alpha = 0.5 in one pass.
+ *
+ * Without LAPACK/BLAS the update is done by a plain loop; formerly it was
+ * skipped in that configuration and v came back unchanged.
+ */
+void IteratedClassicalGS(_Complex double v[], double *vnrm, int n, int m, _Complex double A[],
+                         _Complex double work1[], int lda) {
+  const double alpha = 0.5;
+
+  double vnrm_old;
+  int i, isorth = 0;
+  int j;
+  _Complex double CMONE, CONE;
+  char *fupl_n = "N";
+
+  CMONE = -1.;
+  CONE = 1.;
+
+  vnrm_old = sqrt(square_norm((spinor*) v, n*sizeof(_Complex double)/sizeof(spinor), 1));
+
+  for(i = 0; !isorth && i < max_cgs_it; i ++) {
+
+    for(j = 0; j < m; j++){
+      work1[j] = scalar_prod((spinor*)(A+j*lda), (spinor*) v, n*sizeof(_Complex double)/sizeof(spinor), 1);
+    }
+#ifdef HAVE_LAPACK
+    _FT(zgemv)(fupl_n, &n, &m, &CMONE, A, &lda, work1, &ONE, &CONE, v, &ONE, 1);
+#else
+    /* same operation as the zgemv call: v <- v - A * work1 */
+    (void)fupl_n; (void)CMONE; (void)CONE;
+    for(j = 0; j < m; j++) {
+      for(int k = 0; k < n; k++) {
+        v[k] -= A[j*lda + k] * work1[j];
+      }
+    }
+#endif
+    (*vnrm) = sqrt(square_norm((spinor*) v, n*sizeof(_Complex double)/sizeof(spinor), 1));
+
+    isorth=((*vnrm) > alpha*vnrm_old);
+    vnrm_old = *vnrm;
+  }
+  if (i >= max_cgs_it) {
+/*     errorhandler(400,""); */
+  }
+}
+
+#ifdef WITHLAPH
+
+/* Same algorithm as IteratedClassicalGS, with norms and scalar products
+ * taken over su3_vector fields instead of spinor fields. */
+void IteratedClassicalGS_su3vect(_Complex double v[], double *vnrm, int n, int m, _Complex double A[],
+                                 _Complex double work1[], int lda) {
+  const double alpha = 0.5;
+
+  double vnrm_old;
+  int i, isorth = 0;
+  int j;
+  _Complex double CMONE, CONE;
+
+  char *fupl_n = "N";
+
+  CMONE = -1.;
+  CONE = 1.;
+
+  vnrm_old = sqrt(square_norm_su3vect((su3_vector*) v, n*sizeof(_Complex double)/sizeof(su3_vector),1));
+
+  for(i = 0; !isorth && i < max_cgs_it; i ++) {
+
+    for(j = 0; j < m; j++){
+      work1[j] = scalar_prod_su3vect((su3_vector*)(A+j*lda), (su3_vector*) v, n*sizeof(_Complex double)/sizeof(su3_vector),1);
+    }
+#ifdef HAVE_LAPACK
+    _FT(zgemv)(fupl_n, &n, &m, &CMONE, A, &lda, work1, &ONE, &CONE, v, &ONE, 1);
+#else
+    /* same operation as the zgemv call: v <- v - A * work1 */
+    (void)fupl_n; (void)CMONE; (void)CONE;
+    for(j = 0; j < m; j++) {
+      for(int k = 0; k < n; k++) {
+        v[k] -= A[j*lda + k] * work1[j];
+      }
+    }
+#endif
+    (*vnrm) = sqrt(square_norm_su3vect((su3_vector*) v, n*sizeof(_Complex double)/sizeof(su3_vector),1));
+
+    isorth=((*vnrm) > alpha*vnrm_old);
+    vnrm_old = *vnrm;
+  }
+  if (i >= max_cgs_it) {
+    /* errorhandler(400,""); */
+  }
+}
+
+#endif // WITHLAPH
+
+/*
+ * ModifiedGramSchmidt
+ *
+ * Orthogonalizes v with respect to span{A[:,1:m]}
+ */
+
+void ModifiedGS(_Complex double v[], int n, int m, _Complex double A[], int lda) { + + int i; + _Complex double s; + + for (i = 0; i < m; i ++) { + s = scalar_prod((spinor*)(A+i*lda), (spinor*) v, n*sizeof(_Complex double)/sizeof(spinor), 1); + s = -s; +#ifdef HAVE_LAPACK + _FT(zaxpy)(&n, &s, A+i*lda, &ONE, v, &ONE); +#endif + } +} + +#ifdef WITHLAPH + +void ModifiedGS_su3vect(_Complex double v[], int n, int m, _Complex double A[], int lda) { + + int i; + _Complex double s; + + for (i = 0; i < m; i ++) { + s = scalar_prod_su3vect((su3_vector*)(A+i*lda), (su3_vector*) v, n*sizeof(_Complex double)/sizeof(su3_vector),1); + s = -s; +#ifdef HAVE_LAPACK + _FT(zaxpy)(&n, &s, A+i*lda, &ONE, v, &ONE); +#endif + } +} + +#endif // WITHLAPH diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gram-schmidt.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gram-schmidt.h new file mode 100644 index 0000000000000000000000000000000000000000..0b26a9e62f8a02b9a8159e0d2ab88b95db8318e5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/gram-schmidt.h @@ -0,0 +1,31 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _GRAM_SCHMIDT_H +#define _GRAM_SCHMIDT_H +#include + +void IteratedClassicalGS(_Complex double v[], double *vnrm, int n, int m, _Complex double A[], + _Complex double work1[], int lda) ; +void IteratedClassicalGS_su3vect(_Complex double v[], double *vnrm, int n, int m, _Complex double A[], + _Complex double work1[], int lda); + +void ModifiedGS(_Complex double v[], int n, int m, _Complex double A[], int lda); +void ModifiedGS_su3vect(_Complex double v[], int n, int m, _Complex double A[], int lda); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/incr_eigcg.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/incr_eigcg.c new file mode 100644 index 0000000000000000000000000000000000000000..8b83feec6f4e663b1b91ed4a527307a7a32a2d0a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/incr_eigcg.c @@ -0,0 +1,588 @@ +/***************************************************************************** + * Copyright (C) 2008,2009,2010,2011,2012 + * Andreas Stathopoulos, Kostas Orginos, Abdou M. Abdel-Rehim + * + * This program is based on interfacing the eigCG solver to the tmLQCD code. + * It was written by Abdou M. Abdel-Rehim. The original code was written + * by Andreas Stathopoulos and Kostas Orginos and integrated in Chroma. + * In this interface we use functions from tmLQCD. + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + * + * + * Incremental eigCG for solving linear systems with multiple right-hand sides + ****************************************************************************/ + + +/**************************************************************************** + * Notes: + * ====== + * This is a modified version of the code that was written by Andreas + * Stathopoulos and Kostas Orginos. The modifications are not in the method + * itself or the major structure of the code, rather are modifications for + * simplifications and to be consistent with the tmLQCD package. In principle, + * one could simply take the whole eigcg package and just write some interface + * for the way it is called. However, I decided to simplify things a little and + * also to use the notations and conventions of the tmLQCD package. Below I + * list some notes for this interface procedure implemented here. + * + * 1. Long vectors are stored in tmLQCD as a set of spinors at each site while + * the eigcg code uses vectors as an array of complex numbers. To convert + * from the spinor representation to a purely complex array we need two + * things. First note that each spinor has 12 complex numbers. Second, + * given an array of spinors, i.e. type spinor *S, one can use the complex + * representation by casting S as complex *C=(complex *) S. This part is + * needed mainly for using with BLAS routines and mainly for the eigenvalue + * part of the code. One can avoid these by simply using functions in the + * tmLQCD code. This will make coding simpler and also more clear. We should + * keep this in mind. + * + * 2. The way incremental eigcg will be used is that right-hand sides will be + * solved one after the other and they will be passed one by one. 
This + * requires using static arrays to store the deflation subspace and other + * related variables that will be needed for subsequent right-hand sides. + * The original code assumes that all right-hand sides are passed at once + * and the solutions are obtained after a single call to incremental eigcg. + * However, the way it is used in chroma is by calling the code for solving + * the systems one by one. This will be the way it is called here also and we + * just need to tell the code how many right-hand sides are to be solved assuming + * they will be passed one at a time. + * + * 3. In this version, I will assume no preconditioning. This could be added in + * the future if needed. + * + * 4. Eigenvectors won't be stored after the last right-hand side is solved. + * Eigenvalues will be computed upon request and will be printed out. So, + * no output for eigenvalues or the projection matrix. In the future, we + * might decide to store the eigenvectors in the same way we store spinors. + * + * 5. Calls to LAPACK and BLAS are adjusted to be the same as in tmLQCD. + * + * + * 6. To use SSE,SSE2,etc. type of instructions, we need to align the memory + * for certain variables, especially the long vectors, on a given boundary. + * Also, to be able to use LAPACK and BLAS routines which are written in + * FORTRAN, we have to allocate 2 dimensional matrices column-wise. This is + * done for a matrix of spinors for example as is used in allocating a + * solver_field. It is also recommended to use the same alignment for small + * matrices. Note that the latest version of LAPACK and BLAS has a C + * interface and the interface can accept C type matrices where the elements + * are stored row-wise. + * + * 7. This version is double precision. For single precision, one has to + * perform the sums in the dot products in double. + * + * 8. When assigning a memory with spinor field, note that it is given a dimension + * VOLUMEPLUSRAND if N=VOLUME and VOLUMEPLUSRAND/2 if N=VOLUME/2. 
This is important
+ *    when using these vectors inside a LAPACK or BLAS routine. The active
+ *    dimension is N, while the leading dimension is VOLUMEPLUSRAND or
+ *    VOLUMEPLUSRAND/2. So, we need to define a parameter LDN (leading N).
+ *    When casting as complex, these have to be multiplied by 12.
+ *
+ * 9. In the original code, there is a work array called ework which size was
+ *    determined by the user and required to satisfy certain bounds. However,
+ *    this is fixed here by choosing esize to be 2 times the length of a long
+ *    vector plus a block of size (2nev)^2. The ework array is of type complex *
+ *    and for proper counting we have to multiply N by 12 because of the 12
+ *    components of each spinor (color and spin).
+ *
+ * 10. The notation for matrix-vector multiplication is f(xout,xin) where
+ *     xout= A*xin.
+ ****************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+
+#include
+#include
+#include
+#include
+
+#ifdef MPI
+# include
+#endif
+
+#include "global.h"
+#include "gettime.h"
+#include "linalg_eo.h"
+#include "start.h"
+#include "linalg/blas.h"
+#include "linalg/lapack.h"
+#include "solver_field.h"
+#include "solver/eigcg.h"
+#include "solver/ortho.h"
+
+#include "solver/incr_eigcg.h"
+
+/*
+ * Solve one right-hand side A*x = b with eigCG; the function is meant to be
+ * called once per right-hand side. State shared between calls (call counter
+ * ncurRHS, number of stored eigenvectors ncurEvals, the deflation basis evecs
+ * and all work arrays) is kept in static variables.
+ *
+ *  - The first nrhs1 calls use the tolerance eps_sq1, later calls use eps_sq.
+ *  - On the first call the solver fields are allocated; if nev > 0 the
+ *    eigenvector workspace is allocated as well.
+ *  - While ncurEvals > 0 an init-CG deflation step is applied before eigcg()
+ *    is called; eigcg() is repeated while it reports flag == 3.
+ *  - When the call counter reaches nrhs the counters are reset and all
+ *    memory allocated by this routine is released.
+ *
+ * Returns the iteration count numIts. If eigcg() ended with flag != 0 and
+ * this process is the stdio process with g_debug_level > 0, flag is
+ * returned instead.
+ *
+ * NOTE(review): the #include targets above and three loop bodies below were
+ * lost when this text was extracted ("for(i=0; i ..." fragments). They are
+ * kept exactly as found; restore them from the upstream tmLQCD file before
+ * building.
+ */
+int incr_eigcg(const int N, const int nrhs, const int nrhs1, spinor * const x, spinor * const b,
+               const int ldh, matrix_mult f, const double eps_sq1, const double eps_sq, double restart_eps_sq,
+               const int rand_guess_opt, const int rel_prec, const int maxit, int nev, const int v_max)
+{
+  /*Static variables and arrays.*/
+  static spinor **solver_field; /*4 spinor fields*/
+
+  static int ncurEvals=0;       /* current number of stored eigenvectors */
+  static int ncurRHS=0;         /* current number of the system being solved */
+
+  static int eig_allocated=0;   /* non-zero while the eigenvector workspace below is allocated */
+
+  static spinor **evecs;        /* accumulated eigenvectors for deflation. */
+
+  static void *_evals;
+  static double *evals;         /* Ritz values */
+
+  static void *_v;
+  static spinor *V;             /* work array for eigenvector search basis in eigCG */
+
+  static void *_h;
+  static _Complex double *H;    /* The ncurEvals^2 matrix: H=evecs'*A*evecs */
+
+  static void *_hu;
+  static _Complex double *HU;   /* used for diagonalization of H if eigenvalues requested
+                                   also used as a copy of H if needed*/
+  static void *_initwork;
+  static _Complex double *initwork; /* vector of size ldh using with init-CG */
+
+  static void *_ework;
+  static _Complex double *ework;
+  /* end of the thinking part */
+
+  static void *_work;
+  static _Complex double *work;
+
+  static void *_rwork;
+  static double *rwork;
+
+  static void *_IPIV;
+  static int *IPIV;  /*integer array to store permutations when solving the small linear system*/
+
+  /* some constants */
+  char cU='U'; char cN='N'; char cV='V';
+  _Complex double tpone= 1.0e+00;
+  _Complex double tzero= 0.0e+00;
+  //tpone.re=+1.0e+00; tpone.im=0.0e+00;
+  //tzero.re=+0.0e+00; tzero.im=0.0e+00;
+
+  /* Timing vars */
+  double wt1,wt2,wE,wI;
+
+  double eps_sq_used;
+
+
+  /* Variables */
+  double machEps = 1e-15;
+  double normb, normsq, tmpd,tmpd2;
+  _Complex double tempz;
+  int i,j, ONE = 1;
+  int tmpsize,tmpi,info=0;
+  int numIts, flag, nAdded, nev_used;
+  int maxit_remain;
+  int esize,nrsf;
+
+  int parallel; /* for parallel processing of the scalar products */
+
+  /* leading dimension for spinor vectors */
+  int LDN;
+  if(N==VOLUME)
+    LDN = VOLUMEPLUSRAND;
+  else
+    LDN = VOLUMEPLUSRAND/2;
+
+
+  #ifdef MPI
+    parallel=1;
+  #else
+    parallel=0;
+  #endif
+
+  /*think more about this */
+  esize=2*12*N+4*nev*nev;  /* fixed size for ework used for restarting in eigcg*/
+
+  nrsf=4;  /*number of solver fields */
+
+  int lwork=3*ldh;
+
+  double cur_res; //current residual squared (initial value will be computed in eigcg)
+
+  /*increment the RHS counter*/
+  ncurRHS = ncurRHS +1;
+
+  //set the tolerance to be used for this right-hand side
+  if(ncurRHS > nrhs1){
+    eps_sq_used = eps_sq;
+  }
+  else{
+    eps_sq_used = eps_sq1;
+  }
+
+  if(ncurRHS==1)/* If this is the first system, allocate needed memory for the solver*/
+  {
+    init_solver_field(&solver_field, LDN, nrsf);
+  }
+
+  if(nev==0){ /*incremental eigcg is used as a cg solver. No need to restart forcing no-restart*/
+    if(g_proc_id == g_stdio_proc && g_debug_level > 0) {
+      fprintf(stdout, "CG won't be restarted in this mode since no deflation will take place (nev=0)\n");
+      fflush(stdout);
+    }
+
+    restart_eps_sq=0.0;
+  }
+
+
+
+
+  if((ncurRHS==1) && (nev >0) )/* If this is the first right-hand side and eigenvectors are needed, allocate needed memory*/
+  {
+    init_solver_field(&evecs, LDN, ldh);
+
+    #if (defined SSE || defined SSE2 || defined SSE3)
+
+    /*Extra elements are needed for alignment */
+    //_v = malloc(LDN*v_max*sizeof(spinor)+ALIGN_BASE);
+    _v = calloc(LDN*v_max+ALIGN_BASE,sizeof(spinor));
+    V = (spinor *)(((unsigned long int)(_v)+ALIGN_BASE)&~ALIGN_BASE);
+
+    //_h=malloc(ldh*ldh*sizeof(_Complex double )+ALIGN_BASE);
+    _h=calloc(ldh*ldh+ALIGN_BASE,sizeof(_Complex double ));
+    H = (_Complex double *)(((unsigned long int)(_h)+ALIGN_BASE)&~ALIGN_BASE);
+
+    //_hu=malloc(ldh*ldh*sizeof(_Complex double )+ALIGN_BASE);
+    _hu=calloc(ldh*ldh+ALIGN_BASE,sizeof(_Complex double ));
+    HU = (_Complex double *)(((unsigned long int)(_hu)+ALIGN_BASE)&~ALIGN_BASE);
+
+    //_ework = malloc(esize*sizeof(_Complex double )+ALIGN_BASE);
+    _ework = calloc(esize+ALIGN_BASE,sizeof(_Complex double ));
+    ework=(_Complex double *)(((unsigned long int)(_ework)+ALIGN_BASE)&~ALIGN_BASE);
+
+    //_initwork = malloc(ldh*sizeof(_Complex double )+ALIGN_BASE);
+    _initwork = calloc(ldh+ALIGN_BASE,sizeof(_Complex double ));
+    initwork = (_Complex double *)(((unsigned long int)(_initwork)+ALIGN_BASE)&~ALIGN_BASE);
+
+    //_work = malloc(lwork*sizeof(_Complex double )+ALIGN_BASE);
+    _work = calloc(lwork+ALIGN_BASE,sizeof(_Complex double ));
+    work = (_Complex double *)(((unsigned long int)(_work)+ALIGN_BASE)&~ALIGN_BASE);
+
+    //_rwork = malloc(3*ldh*sizeof(double)+ALIGN_BASE);
+    _rwork = calloc(3*ldh+ALIGN_BASE,sizeof(double));
+    rwork = (double *)(((unsigned long int)(_rwork)+ALIGN_BASE)&~ALIGN_BASE);
+
+
+    //_IPIV = malloc(ldh*sizeof(int)+ALIGN_BASE);
+    _IPIV = calloc(ldh+ALIGN_BASE,sizeof(int));
+    IPIV = (int *)(((unsigned long int)(_IPIV)+ALIGN_BASE)&~ALIGN_BASE);
+
+    //_evals = malloc(ldh*sizeof(double)+ALIGN_BASE);
+    _evals = calloc(ldh+ALIGN_BASE,sizeof(double));
+    evals = (double *)(((unsigned long int)(_evals)+ALIGN_BASE)&~ALIGN_BASE);
+
+
+    #else
+
+    V = (spinor *) calloc(LDN*v_max,sizeof(spinor));
+    H = calloc(ldh*ldh, sizeof(_Complex double ));
+    HU= calloc(ldh*ldh, sizeof(_Complex double ));
+    initwork = calloc(ldh, sizeof(_Complex double ));
+    ework = calloc(esize, sizeof(_Complex double ));
+    work = calloc(lwork,sizeof(_Complex double ));
+    rwork= calloc(3*ldh,sizeof(double));
+    IPIV = calloc(ldh, sizeof(int));
+    evals = (double *) calloc(ldh, sizeof(double));
+
+    #endif
+
+    /* remember the allocation so that it can be released after the last system */
+    eig_allocated = 1;
+
+  } /*if(ncurRHS==1)*/
+
+
+  if(g_proc_id == g_stdio_proc && g_debug_level > 0) {
+    fprintf(stdout, "System %d, eps_sq %e\n",ncurRHS,eps_sq_used);
+    fflush(stdout);
+  }
+
+  /*---------------------------------------------------------------*/
+  /* Call eigCG until this right-hand side converges               */
+  /*---------------------------------------------------------------*/
+  wE = 0.0; wI = 0.0;     /* Start accumulator timers */
+  flag = -1;              /* First time through. Run eigCG regularly */
+  maxit_remain = maxit;   /* Initialize Max and current # of iters */
+  numIts = 0;
+
+  while( flag == -1 || flag == 3)
+  {
+    //if(g_proc_id==g_stdio_proc)
+    //printf("flag= %d, ncurEvals= %d\n",flag,ncurEvals);
+
+    if(ncurEvals > 0)
+    {
+      /* --------------------------------------------------------- */
+      /* Perform init-CG with evecs vectors                        */
+      /* xinit = xinit + evecs*Hinv*evec'*(b-Ax0)                  */
+      /* --------------------------------------------------------- */
+
+      wt1 = gettime();
+
+      /*r0=b-Ax0*/
+      normsq = square_norm(x,N,parallel);
+      if(normsq>0.0)
+      {
+        f(solver_field[0],x); /* solver_field[0]= A*x */
+        diff(solver_field[1],b,solver_field[0],N); /* solver_field[1]=b-A*x */
+      }
+      else
+        assign(solver_field[1],b,N); /* solver_field[1]=b */
+
+      /* apply the deflation using init-CG */
+      /* evecs'*(b-Ax) */
+      /* NOTE(review): the remainder of this branch is missing from this copy of the text */
+      for(i=0; i 0) */
+
+
+    /* ------------------------------------------------------------ */
+    /* Adjust nev for eigcg according to available ldh/restart      */
+    /* ------------------------------------------------------------ */
+    if (flag == 3) { /* restart with the same rhs, set nev_used = 0 */
+      nev_used = 0;
+    }
+    else
+    {
+      /* First time through this rhs. Find nev evecs */
+      /* limited by the ldh evecs we can store in total */
+      if (ldh-ncurEvals < nev)
+        nev = ldh - ncurEvals;
+      nev_used = nev;
+    }
+
+    /* ------------------------------------------------------------ */
+    /* Solve Ax = b with x initial guess                            */
+    /* ------------------------------------------------------------ */
+
+    wt1 = gettime();
+
+    eigcg( N, LDN, x, b, &normb, eps_sq_used, restart_eps_sq, rel_prec, maxit_remain,
+           &numIts, &cur_res, &flag, solver_field, f,
+           nev_used, v_max, V, esize, ework);
+
+    //if(g_proc_id == g_stdio_proc)
+    //printf("eigcg flag= %d \n",flag);
+
+    wt2 = gettime();
+
+    wE = wE + wt2-wt1;
+
+    /* if flag == 3 update the remain max number of iterations */
+    maxit_remain = maxit - numIts;
+
+  }
+  /* end while (flag ==-1 || flag == 3) */
+  /* ------------------------------------------------ */
+
+  /* ---------- */
+  /* Reporting */
+  /* ---------- */
+  /* compute the exact residual */
+  f(solver_field[0],x); /* solver_field[0]= A*x */
+  diff(solver_field[1],b,solver_field[0],N); /* solver_field[1]=b-A*x */
+  normsq=square_norm(solver_field[1],N,parallel);
+  if(g_debug_level > 0 && g_proc_id == g_stdio_proc)
+  {
+    fprintf(stdout, "For this rhs:\n");
+    fprintf(stdout, "Total initCG Wallclock : %-f\n", wI);
+    fprintf(stdout, "Total eigpcg Wallclock : %-f\n", wE);
+    fprintf(stdout, "Iterations: %-d\n", numIts);
+    fprintf(stdout, "Residual: %e, Actual Resid of LinSys : %e\n", cur_res,normsq);
+    if (flag != 0) {
+      fprintf(stderr, "Error: eigcg returned with nonzero exit status\n");
+      /* flush before leaving; the flush used to sit after the return and never ran */
+      fflush(stderr);
+      /* NOTE(review): this early exit is taken only by the stdio process when
+         debug output is on, and it skips the clean-up at the end of the function */
+      return flag;
+    }
+    fflush(stdout);
+  }
+  /* ------------------------------------------------------------------- */
+  /* ------------------------------------------------------------------- */
+  /* Update the evecs and the factorization of evecs'*A*evecs            */
+  /* ------------------------------------------------------------------- */
+  if (nev > 0)
+  {
+
+    wt1 = gettime();
+
+    /* Append new Ritz vectors to the basis and orthogonalize them to evecs */
+    /* NOTE(review): the update loop and the condition of the following report are missing from this copy of the text */
+    for(i=0; i 0)
+    {
+      fprintf(stdout,"ncurRHS %d\n",ncurRHS);
+      fprintf(stdout,"ncurEvals %d \n",ncurEvals);
+      fprintf(stdout,"Update\n");
+      fprintf(stdout,"Added %d vecs\n",nAdded);
+      fprintf(stdout,"U Wallclock : %-f\n", wt2-wt1);
+      fprintf(stdout,"Note: Update Wall time doesn't include time for computing eigenvalues and their residuals.\n");
+      fflush(stdout);
+    }
+
+    if(g_debug_level > 3)  /*compute eigenvalues and their residuals if requested*/
+    {
+      /* copy H into HU */
+      tmpsize=ldh*ncurEvals;
+      _FT(zcopy) (&tmpsize,H,&ONE,HU,&ONE);
+
+      /* compute eigenvalues and eigenvectors of HU (using V and spinor fields as tmp work spaces)*/
+      _FT(zheev)(&cV, &cU, &ncurEvals, HU, &ldh, evals, work, &lwork, rwork, &info,1,1);
+
+      if(info != 0)
+      {
+        if(g_proc_id == g_stdio_proc)
+        {
+          fprintf(stderr,"Error in ZHEEV:, info = %d\n",info);
+          fflush(stderr);
+        }
+        exit(1);
+      }
+
+      /* compute residuals and print out results */
+      /* NOTE(review): the residual loop is missing from this copy of the text */
+      for(i=0; i= 2)*/
+  } /* if(nev>0) */
+
+  /*--------------------------------------*/
+  /*free memory that is no longer needed  */
+  /* and reset ncurRHS and ncurEvals      */
+  /*--------------------------------------*/
+
+  /* Evaluate "last system" once, before the counter is reset. Testing
+     ncurRHS == nrhs again after the reset could never succeed, so the
+     eigenvector workspace was never released. */
+  const int is_last_rhs = (ncurRHS == nrhs);
+
+  if(is_last_rhs) /*this was the last system to be solved */
+  {
+    ncurRHS=0;
+    ncurEvals=0;
+    finalize_solver(solver_field,nrsf);
+  }
+
+  /* nev may have been clamped to 0 above once ldh vectors are stored, so the
+     release is tied to the allocation flag instead of nev */
+  if( is_last_rhs && eig_allocated )/*this was the last system to be solved and there were allocated memory for eigenvector computation*/
+  {
+    finalize_solver(evecs,ldh);
+    #if (defined SSE || defined SSE2 || defined SSE3)
+    free(_v);
+    free(_h);
+    free(_hu);
+    free(_ework);
+    free(_initwork);
+    free(_IPIV);
+    free(_evals);
+    free(_rwork);
+    free(_work);
+    #else
+    free(V);
+    free(H);
+    free(HU);
+    free(ework);
+    free(initwork);
+    free(IPIV);
+    free(evals);
+    free(rwork);
+    free(work);
+    #endif
+    eig_allocated = 0;
+  }
+
+  return numIts;
+}
+
+/*------------------------------End of Incremental eigCG-------------------------------------------------------------*/
+
diff --git
a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/incr_eigcg.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/incr_eigcg.h new file mode 100644 index 0000000000000000000000000000000000000000..fdcbbdced6643f5ba2a03221a50ef65bf8276c0d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/incr_eigcg.h @@ -0,0 +1,56 @@ +/***************************************************************************** + * Copyright (C) 2008,2009,2010,2011,2012 + * Andreas Stathopoulos, Kostas Orginos, Abdou M. Abdel-Rehim + * + * This program is based on interfacing the eigCG solver to the tmLQCD code. + * It was written by Abdou M. Abdel-Rehim. The original code was written + * by Andreas Stathopoulos and Kostas Orginos and integrated in Chroma. + * In this interface we use functions from tmLQCD. + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ *
+ *
+ * Incremental eigCG for solving linear systems with multiple right-hand sides
+ ****************************************************************************/
+/* A sample input is given in the sample-input folder */
+
+#ifndef _INCR_EIGCG_H
+#define _INCR_EIGCG_H
+
+#include "su3.h"
+#include "solver/matrix_mult_typedef.h"
+
+
+/*
+ * Solves one right-hand side per call; see the per-parameter comments below.
+ * Parameter names match the definition in incr_eigcg.c.
+ */
+int incr_eigcg(
+     const int N,              /*(IN) Number of lattice sites for this process*/
+     const int nrhs,           /*(IN) Number of right-hand sides to be solved*/
+     const int nrhs1,          /*(IN) First number of right-hand sides to be solved using tolerance eps_sq1*/
+     spinor * const x,         /*(IN/OUT) initial guess on input, solution on output for this RHS*/
+     spinor * const b,         /*(IN) right-hand side*/
+     const int ldh,            /*(IN) maximum number of eigenvectors to be computed*/
+     matrix_mult f,            /*(IN) f(s,r) computes s=A*r, i.e. matrix-vector multiply*/
+     const double eps_sq1,     /*(IN) squared tolerance of convergence of the linear system for systems 1 till nrhs1*/
+     const double eps_sq,      /*(IN) squared tolerance of convergence of the linear system for systems nrhs1+1 till nrhs*/
+     double restart_eps_sq,    /*(IN) squared tolerance for restarting CG*/
+     const int rand_guess_opt, /*(IN) set to non-zero if you want to use random initial guess (volume Gaussian with mean 0)*/
+     const int rel_prec,       /*(IN)0 for using absolute error for convergence
+                                     1 for using relative error for convergence*/
+     const int maxit,          /*(IN) Maximum allowed number of iterations to solution*/
+     int nev,                  /*(IN)number of eigenvectors to be computed while solving
+                                     every right-hand side until the maximum number ldh is reached*/
+     const int v_max);         /*(IN) subspace size used to compute nev vectors*/
+
+#endif
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/index_jd.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/index_jd.c
new file mode 100644
index 0000000000000000000000000000000000000000..a0a53691bb2a10d3189f664794f093ba631f95f2
--- /dev/null
+++
b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/index_jd.c
@@ -0,0 +1,445 @@
+/*******************************************************************************
+
+ *
+ * This routine computes the index with the Jacobi-Davidson method
+ *
+ * Author: Carsten Urbach, urbach@physik.fu-berlin.de
+ *******************************************************************************/
+
+#include
+#include
+#include
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+
+#include "global.h"
+#include "start.h"
+#include "sse.h"
+#include "su3.h"
+#include "linalg_eo.h"
+#include "eigenvalues.h"
+#include
+#include "solver/solver.h"
+#include "solver/jdher.h"
+#include "solver/eigenvalues.h"
+#include "operator/Dov_proj.h"
+#include "gamma.h"
+#include "index_jd.h"
+
+#include
+#include
+#include
+
+
+double shift;
+
+#define min(a,b)((a)<(b) ? (a) : (b))
+#define max(a,b)((a)<(b) ? (b) : (a))
+
+/*
+ * Determine the index from the lowest modes of Dov_proj_plus and
+ * Dov_proj_minus.
+ *
+ * Step 1: jdher() computes first_blocksize (= 1) lowest mode in each
+ *         chirality sector; a sector that does not converge within 70
+ *         iterations is repeated once with 140. The resulting vectors are
+ *         written to "eigenvector_of_D{plus,minus}.NN.<conf>.<nstore>".
+ * Step 2: if the two lowest modes agree within 10% the index is reported
+ *         as 0. Otherwise further modes are computed, in blocks of 3, in
+ *         the sector with the smaller lowest mode until one of them matches
+ *         the lowest mode of the other sector or *nr_of_eigenvalues_ov
+ *         modes have converged.
+ * Results are written to "eigenvalues_of_overlap_proj.<conf>.<nstore>"
+ * (text) and "eigenvalues_of_D{plus,minus}.<conf>.<nstore>" (binary).
+ *
+ * nr_of_eigenvalues_ov  maximal number of modes to compute (only read)
+ * precision_ov          precision handed to jdher() in step 1
+ * conf_filename, nstore used to build the output file names
+ * max_iterations        only referenced by the commented-out pjdher() call
+ * method                not used in this routine
+ *
+ * NOTE: with MPI defined the eigensolver call of step 2 sits inside a
+ * comment, so no additional modes are computed in that configuration.
+ * NOTE(review): the #include targets above and part of one printf string
+ * below were lost in this copy of the text and are kept as found.
+ */
+void index_jd(int * nr_of_eigenvalues_ov,
+	      const int max_iterations, const double precision_ov, char *conf_filename,
+	      const int nstore, const int method){
+
+  _Complex double *eval;
+  spinor *eigenvectors_ov, *eigenvectors_ov_;
+  spinor *lowvectors, *lowvectors_;
+  int i=0 , k=0, returncode=0, index = 0, determined = 0, signed_index = 0;
+  char filename[120];
+  FILE * ifs = NULL;
+  matrix_mult Operator[2];
+  double absdifference;
+  const int N2 = VOLUMEPLUSRAND;
+
+#ifdef MPI
+  double atime, etime;
+#endif
+  double lowestmodes[20];
+  int intsign, max_iter, first_blocksize = 1;
+  int * idx = NULL;
+
+  /**********************
+   * For Jacobi-Davidson
+   **********************/
+  int verbosity = 3, converged = 0, blocksize = 1, blockwise = 0;
+  int solver_it_max = 50, j_max, j_min, v0dim = 0;
+  double * eigenvalues_ov = NULL;
+  double decay_min = 1.7, threshold_min = 1.e-3, prec;
+
+  WRITER *writer=NULL;
+  spinor *s;
+  double sqnorm;
+  paramsPropagatorFormat *propagatorFormat = NULL;
+
+  double ap_eps_sq;
+  int switch_on_adaptive_precision = 0;
+  double ov_s = 0;
+
+  /**********************
+   * General variables
+   **********************/
+
+  eval= calloc((*nr_of_eigenvalues_ov),sizeof(_Complex double));
+  shift = 0.0;
+
+  //  ov_s = 0.5*(1./g_kappa - 8.) - 1.;
+  ap_eps_sq = precision_ov*precision_ov;
+
+#if (defined SSE || defined SSE2 )
+  eigenvectors_ov_= calloc(VOLUMEPLUSRAND*(*nr_of_eigenvalues_ov)+1, sizeof(spinor));
+  eigenvectors_ov = (spinor *)(((unsigned long int)(eigenvectors_ov_)+ALIGN_BASE)&~ALIGN_BASE);
+  lowvectors_ = calloc(2*first_blocksize*VOLUMEPLUSRAND+1, sizeof(spinor));
+  lowvectors = (spinor *)(((unsigned long int)(lowvectors_)+ALIGN_BASE)&~ALIGN_BASE);
+#else
+  //  eigenvectors_ov_ = calloc(VOLUMEPLUSRAND*(*nr_of_eigenvalues_ov), sizeof(spinor));
+  eigenvectors_ov_ = calloc(VOLUMEPLUSRAND*(*nr_of_eigenvalues_ov), sizeof(spinor));
+  lowvectors_ = calloc(2*first_blocksize*VOLUMEPLUSRAND, sizeof(spinor));
+  eigenvectors_ov = eigenvectors_ov_;
+  lowvectors = lowvectors_;
+#endif
+
+  //  idx = malloc((*nr_of_eigenvalues_ov)*sizeof(int));
+  idx = malloc((*nr_of_eigenvalues_ov)*sizeof(int));
+  Operator[0]=&Dov_proj_plus;
+  Operator[1]=&Dov_proj_minus;
+
+  if(g_proc_id == g_stdio_proc){
+    printf("Computing first the two lowest modes in the positive and negative chirality sector, respectively\n");
+    if(switch_on_adaptive_precision == 1) {
+      printf("We have switched on adaptive precision with ap_eps_sq = %e!\n", ap_eps_sq);
+    }
+    printf("We have set the mass to zero within this computation!\n");
+    fflush(stdout);
+  }
+
+  prec = precision_ov;
+  j_min = 8; j_max = 16;
+  max_iter = 70;
+
+#ifdef MPI
+  atime = MPI_Wtime();
+#endif
+
+  v0dim = first_blocksize;
+  blocksize = v0dim;
+  for(intsign = 0; intsign < 2; intsign++){
+    converged = 0;
+    if(g_proc_id == g_stdio_proc){
+      printf("%s chirality sector: \n", intsign ? "negative" : "positive");
+      fflush(stdout);
+    }
+    if(max_iter == 70){
+      /********************************************************************
+       *
+       * We need random start spinor fields, but they must be half zero,
+       * that's why we apply the Projektor once
+       *
+       ********************************************************************/
+      for(i = 0; i < first_blocksize; i++) {
+	random_spinor_field(&lowvectors[(first_blocksize*intsign+i)*VOLUMEPLUSRAND],N2,0);
+	Proj(&lowvectors[(first_blocksize*intsign+i)*VOLUMEPLUSRAND],
+	     &lowvectors[(first_blocksize*intsign+i)*VOLUMEPLUSRAND],N2, intsign);
+      }
+    }
+
+    jdher(VOLUME*sizeof(spinor)/sizeof(_Complex double),
+	  VOLUMEPLUSRAND*sizeof(spinor)/sizeof(_Complex double),
+	  shift, prec, blocksize, j_max, j_min,
+	  max_iter, blocksize, blockwise, v0dim, (_Complex double*) &lowvectors[first_blocksize*intsign*VOLUMEPLUSRAND],
+	  CG, solver_it_max,
+	  threshold_min, decay_min, verbosity,
+	  &converged, (_Complex double*) &lowvectors[first_blocksize*intsign*VOLUMEPLUSRAND],
+	  &lowestmodes[first_blocksize*intsign],
+	  &returncode, JD_MINIMAL, 1,
+	  Operator[intsign]);
+
+    if(converged != blocksize && max_iter == 70){
+      if(g_proc_id == g_stdio_proc){
+	printf("Restarting %s chirality sector with more iterations!\n", intsign ? "negative" : "positive");
+	fflush(stdout);
+      }
+      max_iter = 140;
+      /* step back so that the loop increment repeats the same sector */
+      intsign-=1;
+    }
+    else {
+      max_iter = 70;
+      /* Save the already computed eigenvectors_ov */
+      for(i = 0; i< first_blocksize; i++) {
+	/* bounded formatting: conf_filename comes from the caller, filename holds 120 bytes */
+	snprintf(filename, sizeof(filename), "eigenvector_of_D%s.%.2d.%s.%.4d",((intsign==0)?"plus":"minus"),i , conf_filename, nstore);
+
+	construct_writer(&writer, filename, 0);
+	/* todo write propagator format */
+	propagatorFormat = construct_paramsPropagatorFormat(64, 1);
+	write_propagator_format(writer, propagatorFormat);
+	free(propagatorFormat);
+
+
+	/* vector number i of this sector (the offset used to ignore i) */
+	s=(spinor*)&lowvectors[(first_blocksize*intsign+i)*VOLUMEPLUSRAND];
+	write_spinor(writer, &s,NULL, 1, 64);
+	destruct_writer(writer);
+	writer=NULL;
+	sqnorm=square_norm(s,VOLUME,1);
+	printf(" wrote eigenvector of overlap operator !!! | |^2 = %e \n",sqnorm);
+
+
+      }
+    }
+  }
+
+#ifdef MPI
+  etime = MPI_Wtime();
+  if(g_proc_id == g_stdio_proc){
+    printf("It took %f sec to determine the sector with zero modes, if any!\n", etime-atime);
+  }
+#endif
+
+  /*Compare the two lowest modes */
+  absdifference = fabs(lowestmodes[0]-lowestmodes[first_blocksize]);
+  if(absdifference < 0.1*max(lowestmodes[0],lowestmodes[first_blocksize])){
+    /* They are equal within the errors */
+    if(g_proc_id == g_stdio_proc){
+      printf("Index is 0!\n");
+      fflush(stdout);
+      snprintf(filename, sizeof(filename), "eigenvalues_of_overlap_proj.%s.%.4d", conf_filename, nstore);
+      ifs = fopen(filename, "w");
+      printf("\nThe following lowest modes have been computed:\n");
+      fprintf(ifs, "Index is 0\n\n");
+      fprintf(ifs, "Sector with positive chirality:\n");
+      for(i = 0; i < first_blocksize; i++) {
+	lowestmodes[i] = 2.*(1.+ov_s)*lowestmodes[i];
+	fprintf(ifs, "%d %e positive\n", i, lowestmodes[i]);
+	printf("%d %e positive\n", i, lowestmodes[i]);
+      }
+      fprintf(ifs, "Sector with negative chirality:\n");
+      for(i = 0; i < first_blocksize; i++) {
+	lowestmodes[i+first_blocksize] = 2.*(1.+ov_s)*lowestmodes[i+first_blocksize];
+	fprintf(ifs, "%d %e negative\n", i, lowestmodes[i+first_blocksize]);
+	printf("%d %e negative\n", i, lowestmodes[i+first_blocksize]);
+      }
+      fclose(ifs);
+      for(k = 0; k < 2; k++) {
+	snprintf(filename, sizeof(filename), "eigenvalues_of_D%s.%s.%.4d",
+		 k ? "minus" : "plus", conf_filename, nstore);
+	ifs = fopen(filename, "w");
+	fwrite(&first_blocksize, sizeof(int), 1, ifs);
+	index = 0;
+	fwrite(&index, sizeof(int), 1, ifs);
+	for(i = 0; i < first_blocksize; i++) {
+	  /* k selects the sector of the file being written. intsign is 2
+	     after the loop above, so indexing with it put the negative
+	     sector modes into both files. */
+	  fwrite(&lowestmodes[k*first_blocksize+i], sizeof(double), 1, ifs);
+	}
+	fclose(ifs);
+      }
+    }
+  }
+  else{
+    /* they are not equal */
+    /* determine the sector with not trivial topology */
+    if(lowestmodes[0] < lowestmodes[first_blocksize]){
+      intsign = 0;
+    }
+    else{
+      intsign = 1;
+    }
+
+    if(g_proc_id == g_stdio_proc){
+      printf("Computing now up to %d modes in the sector with %s chirality\n",
+	     (*nr_of_eigenvalues_ov), intsign ? "negative" : "positive");
+      fflush(stdout);
+    }
+
+    /* Here we set the (absolute) precision to be   */
+    /* such that we can compare to the lowest mode  */
+    /* in the other sector                          */
+
+    prec = (lowestmodes[first_blocksize*((intsign+1)%2)])*1.e-1;
+
+    eigenvalues_ov = (double*)malloc((*nr_of_eigenvalues_ov)*sizeof(double));
+
+    /* Copy the already computed eigenvectors_ov */
+    for(i = 0; i < first_blocksize; i++) {
+      /* vectors are stored with stride VOLUMEPLUSRAND, as everywhere else in this routine */
+      assign(&eigenvectors_ov[i*VOLUMEPLUSRAND], &lowvectors[(first_blocksize*intsign+i)*VOLUMEPLUSRAND],N2);
+      eigenvalues_ov[i] = lowestmodes[first_blocksize*intsign+i];
+    }
+
+#ifdef MPI
+    atime = MPI_Wtime();
+#endif
+
+    blocksize = 3;
+    j_min = 8; j_max = 16;
+    converged = first_blocksize;
+    for(i = first_blocksize; i < (*nr_of_eigenvalues_ov); i+=3) {
+
+      if((i + blocksize) > (*nr_of_eigenvalues_ov)) {
+	blocksize = (*nr_of_eigenvalues_ov) - i;
+      }
+
+      /* Fill up the rest with random spinor fields  */
+      /* and project it to the corresponding sector  */
+      for(v0dim = i; v0dim < i+blocksize; v0dim++){
+	random_spinor_field(&eigenvectors_ov[v0dim*VOLUMEPLUSRAND],N2,0);
+	Proj(&eigenvectors_ov[v0dim*VOLUMEPLUSRAND], &eigenvectors_ov[v0dim*VOLUMEPLUSRAND],N2, intsign);
+      }
+      v0dim = blocksize;
+      returncode = 0;
+
+      /* compute minimal eigenvalues */
+#ifdef MPI
+      /*      pjdher(VOLUME*sizeof(spinor)/sizeof(_Complex double), VOLUMEPLUSRAND*sizeof(spinor)/sizeof(_Complex double),
+	      shift, prec, omega, n_omega, ev_tr,
+	      i+blocksize, j_max, j_min,
+	      max_iterations, blocksize, blockwise, v0dim, (_Complex double*)(&eigenvectors_ov[i*VOLUMEPLUSRAND]),
+	      CG, solver_it_max,
+	      threshold_min, decay_min, verbosity,
+	      &converged, (_Complex double*) eigenvectors_ov, eigenvalues_ov,
+	      &returncode, JD_MINIMAL, 1, use_AV,
+	      Operator[intsign]);*/
+#else
+      jdher(VOLUME*sizeof(spinor)/sizeof(_Complex double),
+	    VOLUMEPLUSRAND*sizeof(spinor)/sizeof(_Complex double),
+	    shift, prec, blocksize, j_max, j_min,
+	    max_iter, blocksize, blockwise, v0dim, (_Complex double*) &eigenvectors_ov[i*VOLUMEPLUSRAND],
+	    CG, solver_it_max,
+	    threshold_min, decay_min, verbosity,
+	    &converged, (_Complex double*) eigenvectors_ov,
+	    eigenvalues_ov,
+	    &returncode, JD_MINIMAL, 1,
+	    Operator[intsign]);
+#endif
+      /* Save eigenvectors_ov temporary   */
+      /* in order to be able to restart   */
+      for (k=i; k < converged; k++){
+	if(intsign == 0){
+	  snprintf(filename, sizeof(filename), "eigenvector_of_Dplus.%.2d.%s.%.4d", k, conf_filename, nstore);
+	}
+	else{
+	  snprintf(filename, sizeof(filename), "eigenvector_of_Dminus.%.2d.%s.%.4d", k, conf_filename, nstore);
+	}
+	/*	write_spinorfield(&eigenvectors_ov[k*VOLUMEPLUSRAND], filename);*/
+      }
+
+      /* order the eigenvalues_ov and vectors */
+      for(k = 0; k < converged; k++) {
+	idx[k] = k;
+      }
+      /*      quicksort(converged, eigenvalues_ov, idx);*/
+
+      /* Check whether the index is determined */
+      index = 0;
+      for(k = 0; k < converged; k++) {
+	absdifference = fabs(lowestmodes[first_blocksize*((intsign+1)%2)] - eigenvalues_ov[k]);
+	if(absdifference < 0.1*lowestmodes[first_blocksize*((intsign+1)%2)]) {
+	  /* We have found the first non zero */
+	  if(k < converged-1) {
+	    determined = 1;
+	    break;
+	  }
+	  else {
+	    blocksize = 1;
+	    shift = eigenvalues_ov[converged-1];
+	  }
+	}
+	else {
+	  index++;
+	}
+      }
+      /* If we have determined the index or */
+      /* hit the maximal number of ev       */
+      if(determined == 1 || converged == (*nr_of_eigenvalues_ov)) {
+	break;
+      }
+      else if(g_proc_id == g_stdio_proc) {
+	if(blocksize != 1) {
+	  printf("Index %s (or equal) than %s%d, continuing!\n\n",
+		 intsign ? "lower" : "bigger",
+		 intsign ? "-" : "+", index);
+	  fflush( stdout );
+	}
+	else {
+	  printf("Index is %s%d, one non zero is missing, continuing!\n\n",
+		 intsign ? "-" : "+", index);
+	  fflush( stdout );
+	}
+      }
+    }
+
+#ifdef MPI
+    etime = MPI_Wtime();
+#endif
+
+    /* Save the eigenvectors_ov */
+    for(i = 0; i < converged; i++){
+      eval[i] = 2.*(1.+ov_s)*eigenvalues_ov[i];
+      if(intsign == 0){
+	snprintf(filename, sizeof(filename), "eigenvector_of_Dplus.%.2d.%s.%.4d", i, conf_filename, nstore);
+      }
+      else{
+	snprintf(filename, sizeof(filename), "eigenvector_of_Dminus.%.2d.%s.%.4d", i, conf_filename, nstore);
+      }
+      /*      write_spinorfield(&eigenvectors_ov[idx[i]*VOLUMEPLUSRAND], filename);*/
+    }
+
+    /* Some Output */
+    if(g_proc_id == g_stdio_proc) {
+      printf("Index is %s%d!\n", intsign ? "-" : "+", index);
+#ifdef MPI
+      printf("Zero modes determined in %f sec!\n", etime-atime);
+#endif
+    }
+    if(g_proc_id == 0) {
+      snprintf(filename, sizeof(filename), "eigenvalues_of_overlap_proj.%s.%.4d", conf_filename, nstore);
+      ifs = fopen(filename, "w");
+      printf("\nThe following lowest modes have been computed:\n");
+      fprintf(ifs, "Index is %s%d!\n\n", intsign ? "-" : "+", index);
+      for(k = 0; k < 2; k++) {
+	if(k == intsign) {
+	  for (i=0; i < converged; i++) {
+	    fprintf(ifs, "%d %e %s\n", i, creal(eval[i]), intsign ? "negative" : "positive");
+	    printf("%d %e %s\n", i, creal(eval[i]), intsign ? "negative" : "positive");
+	  }
+	}
+	else {
+	  for(i = 0; i < first_blocksize; i++) {
+	    lowestmodes[((intsign+1)%2)*first_blocksize+i] = 2.*(1.+ov_s)*lowestmodes[((intsign+1)%2)*first_blocksize+i];
+	    fprintf(ifs, "%d %e %s\n", i, lowestmodes[((intsign+1)%2)*first_blocksize+i], intsign ? "positive" : "negative");
+	    printf("%d %e %s\n", i, lowestmodes[((intsign+1)%2)*first_blocksize+i], intsign ? "positive" : "negative");
+	  }
+	}
+      }
+      fclose(ifs);
+      if(intsign != 0) signed_index = -index;
+      else signed_index = index;
+      for(k = 0; k < 2; k++) {
+	snprintf(filename, sizeof(filename), "eigenvalues_of_D%s.%s.%.4d",
+		 k ? "minus" : "plus", conf_filename, nstore);
+	ifs = fopen(filename, "w");
+	if(k == intsign) {
+	  fwrite(&converged, sizeof(int), 1, ifs);
+	  fwrite(&signed_index, sizeof(int), 1, ifs);
+	  for (i=index; i < converged; ++i)
+	  {
+	    double eval_re = creal(eval[i]);
+	    fwrite(&eval_re, sizeof(double), 1, ifs);
+	  }
+	}
+	else {
+	  fwrite(&first_blocksize, sizeof(int), 1, ifs);
+	  fwrite(&signed_index, sizeof(int), 1, ifs);
+	  for(i = 0; i < first_blocksize; i++) {
+	    fwrite(&lowestmodes[((intsign+1)%2)*first_blocksize+i], sizeof(double), 1, ifs);
+	  }
+	}
+	fclose(ifs);
+      }
+    }
+  }
+
+  switch_on_adaptive_precision = 0;
+  /* Free memory */
+  free(eigenvectors_ov_);
+  free(lowvectors_);
+  free(eval);
+  free(eigenvalues_ov);
+  free(idx);
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/index_jd.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/index_jd.h
new file mode 100644
index 0000000000000000000000000000000000000000..a15395a1a490f60550097e4908fd888659dca5c3
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/index_jd.h
@@ -0,0 +1,16 @@
+#ifndef _INDEX_JD_H
+#define _INDEX_JD_H
+
+#ifndef RESTART_JACOBI_DAVIDSON
+#define RESTART_JACOBI_DAVIDSON 5
+#endif
+#ifndef RESTART_RITZ_JACOBI
+#define RESTART_RITZ_JACOBI 6
+#endif
+
+void index_jd(int * nr_of_eigenvalues,
+	      const int max_iterations, const double precision,
+	      char * conf_filename, const int nstore,
+	      const int method);
+
+#endif
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher.c
new file mode 100644
index 0000000000000000000000000000000000000000..a7f5d05631ccc1fef5f485a3eca33c3b32e2346a
---
/dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher.c @@ -0,0 +1,875 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ + +/************************************************************************** + * + * This is an implementation of the Jacobi-Davidson method + * for hermitian matrices. + * + * It is an adaption of the implementation of + * + * R. Geus and O. Chinellato + * + * for symmetric real matrices. + * + * See http://www.inf.ethz.ch/personal/geus/software.html + * + * It is so far implemented without preconditioning and for + * the eigenvalue problem: + * + * A*x = lambda*x + * + * Author of this adaption: + * Carsten Urbach + * + **************************************************************************/ + + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include "global.h" +#include "sse.h" +#include "linalg/fortran.h" +#include "linalg/blas.h" +#include "linalg/lapack.h" +#include "linalg_eo.h" +#include +#include "solver/solver.h" +#include "solver/gram-schmidt.h" +#include "solver/quicksort.h" +#include "jdher.h" + +#define min(a,b)((a)<(b) ? 
(a) : (b)) +#define max(a,b)((a)<(b) ? (b) : (a)) + + + +/**************************************************************************** + * * + * Prototypes of static functions * + * * + ****************************************************************************/ +static void print_status(int clvl, int it, int k, int j, int kmax, + int blksize, int actblksize, + double *s, double *resnrm, int *actcorrits); +static void sorteig(int j, double S[], _Complex double U[], int ldu, double tau, + double dtemp[], int idx1[], int idx2[], int strategy); + +/* Projection routines */ +void Proj_A_psi(spinor * const y, spinor * const x); + +void jderrorhandler(const int i, char * message) { + fprintf(stderr, "jdher %s \n", message); +#ifdef MPI + MPI_Finalize(); +#endif + exit(i); +} + +/**************************************************************************** + * * + * Static variables * + * * + ****************************************************************************/ +/* static double DMONE = -1.0, DZER = 0.0, DONE = 1.0; */ +static int MONE = -1, ONE = 1; +static _Complex double CONE, CZERO, CMONE; + +/* Projector variables */ + +static int p_n, p_n2, p_k, p_lda; +static double p_theta; +_Complex double * p_Q; +_Complex double * p_work; +matrix_mult p_A_psi; + +static char * fupl_u = "U", *fupl_n = "N", * fupl_a = "A", *fupl_v = "V", *filaenv = "zhetrd", *fvu = "VU"; + +/**************************************************************************** + * * + * Main eigensolver routine * + * * + ****************************************************************************/ + +void jdher(int n, int lda, double tau, double tol, + int kmax, int jmax, int jmin, int itmax, + int blksize, int blkwise, + int V0dim, _Complex double *V0, + int solver_flag, + int linitmax, double eps_tr, double toldecay, + int verbosity, + int *k_conv, _Complex double *Q, double *lambda, int *it, + int maxmin, int shift_mode, + matrix_mult A_psi) { + + 
/**************************************************************************** + * * + * Local variables * + * * + ****************************************************************************/ + + /* constants */ + + /* allocatables: + * initialize with NULL, so we can free even unallocated ptrs */ + double *s = NULL, *resnrm = NULL, *resnrm_old = NULL, *dtemp = NULL, *rwork = NULL; + + _Complex double *V_ = NULL, *V, *Vtmp = NULL, *U = NULL, *M = NULL, *Z = NULL, + *Res_ = NULL, *Res, + *eigwork = NULL, *temp1_ = NULL, *temp1; + + int *idx1 = NULL, *idx2 = NULL, + *convind = NULL, *keepind = NULL, *solvestep = NULL, + *actcorrits = NULL; + + /* non-allocated ptrs */ + _Complex double *q, *v, *u, *r = NULL; +/* _Complex double *matdummy, *vecdummy; */ + + /* scalar vars */ + double theta, alpha, it_tol; + + int i, k, j, actblksize, eigworklen, found, conv, keep, n2; + int act, cnt, idummy, info, CntCorrIts=0, endflag=0; + int N = n*sizeof(_Complex double)/sizeof(spinor); + + /* variables for random number generator */ + int IDIST = 1; + int ISEED[4] = {2, 3, 5, 7}; + ISEED[0] = g_proc_id+2; + + /**************************************************************************** + * * + * Execution starts here... * + * * + ****************************************************************************/ + + + /* print info header */ + if ((verbosity > 2) && (g_proc_id == 0)){ + printf("Jacobi-Davidson method for hermitian Matrices\n"); + printf("Solving A*x = lambda*x \n\n"); + printf(" N= %10d ITMAX=%4d\n", n, itmax); + printf(" KMAX=%3d JMIN=%3d JMAX=%3d V0DIM=%3d\n", + kmax, jmin, jmax, V0dim); + printf(" BLKSIZE= %2d BLKWISE= %5s\n", + blksize, blkwise ? "TRUE" : "FALSE"); + printf(" TOL= %11.4e TAU= %11.4e\n", + tol, tau); + printf(" LINITMAX= %5d EPS_TR= %10.3e TOLDECAY=%9.2e\n", + linitmax, eps_tr, toldecay); + printf("\n Computing %s eigenvalues\n", + maxmin ? 
"maximal" : "minimal"); + printf("\n"); + fflush( stdout ); + } + + /* validate input parameters */ + if(tol <= 0) jderrorhandler(401,""); + if(kmax <= 0 || kmax > n) jderrorhandler(402,""); + if(jmax <= 0 || jmax > n) jderrorhandler(403,""); + if(jmin <= 0 || jmin > jmax) jderrorhandler(404,""); + if(itmax < 0) jderrorhandler(405,""); + if(blksize > jmin || blksize > (jmax - jmin)) jderrorhandler(406,""); + if(blksize <= 0 || blksize > kmax) jderrorhandler(406,""); + if(blkwise < 0 || blkwise > 1) jderrorhandler(407,""); + if(V0dim < 0 || V0dim >= jmax) jderrorhandler(408,""); + if(linitmax < 0) jderrorhandler(409,""); + if(eps_tr < 0.) jderrorhandler(500,""); + if(toldecay <= 1.0) jderrorhandler(501,""); + + CONE = 1.; + CZERO = 0.; + CMONE = -1.; + + /* Get hardware-dependent values: + * Opt size of workspace for ZHEEV is (NB+1)*j, where NB is the opt. + * block size... */ + eigworklen = (2 + _FT(ilaenv)(&ONE, filaenv, fvu, &jmax, &MONE, &MONE, &MONE, 6, 2)) * jmax; + + /* Allocating memory for matrices & vectors */ + + if((void*)(V_ = (_Complex double *)malloc((lda * jmax + 4) * sizeof(_Complex double))) == NULL) { + errno = 0; + jderrorhandler(300,"V in jdher"); + } +#if (defined SSE || defined SSE2 || defined SSE3) + V = (_Complex double*)(((unsigned long int)(V_)+ALIGN_BASE)&~ALIGN_BASE); +#else + V = V_; +#endif + if((void*)(U = (_Complex double *)malloc(jmax * jmax * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"U in jdher"); + } + if((void*)(s = (double *)malloc(jmax * sizeof(double))) == NULL) { + jderrorhandler(300,"s in jdher"); + } + if((void*)(Res_ = (_Complex double *)malloc((lda * blksize+4) * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"Res in jdher"); + } +#if (defined SSE || defined SSE2 || defined SSE3) + Res = (_Complex double*)(((unsigned long int)(Res_)+ALIGN_BASE)&~ALIGN_BASE); +#else + Res = Res_; +#endif + if((void*)(resnrm = (double *)malloc(blksize * sizeof(double))) == NULL) { + jderrorhandler(300,"resnrm in 
jdher"); + } + if((void*)(resnrm_old = (double *)calloc(blksize,sizeof(double))) == NULL) { + jderrorhandler(300,"resnrm_old in jdher"); + } + if((void*)(M = (_Complex double *)malloc(jmax * jmax * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"M in jdher"); + } + if((void*)(Vtmp = (_Complex double *)malloc(jmax * jmax * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"Vtmp in jdher"); + } + if((void*)(p_work = (_Complex double *)malloc(lda * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"p_work in jdher"); + } + + /* ... */ + if((void*)(idx1 = (int *)malloc(jmax * sizeof(int))) == NULL) { + jderrorhandler(300,"idx1 in jdher"); + } + if((void*)(idx2 = (int *)malloc(jmax * sizeof(int))) == NULL) { + jderrorhandler(300,"idx2 in jdher"); + } + + /* Indices for (non-)converged approximations */ + if((void*)(convind = (int *)malloc(blksize * sizeof(int))) == NULL) { + jderrorhandler(300,"convind in jdher"); + } + if((void*)(keepind = (int *)malloc(blksize * sizeof(int))) == NULL) { + jderrorhandler(300,"keepind in jdher"); + } + if((void*)(solvestep = (int *)malloc(blksize * sizeof(int))) == NULL) { + jderrorhandler(300,"solvestep in jdher"); + } + if((void*)(actcorrits = (int *)malloc(blksize * sizeof(int))) == NULL) { + jderrorhandler(300,"actcorrits in jdher"); + } + + if((void*)(eigwork = (_Complex double *)malloc(eigworklen * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"eigwork in jdher"); + } + if((void*)(rwork = (double *)malloc(3*jmax * sizeof(double))) == NULL) { + jderrorhandler(300,"rwork in jdher"); + } + if((void*)(temp1_ = (_Complex double *)malloc((lda+4) * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"temp1 in jdher"); + } +#if (defined SSE || defined SSE2 || defined SSE3) + temp1 = (_Complex double*)(((unsigned long int)(temp1_)+ALIGN_BASE)&~ALIGN_BASE); +#else + temp1 = temp1_; +#endif + if((void*)(dtemp = (double *)malloc(lda * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"dtemp 
in jdher"); + } + + /* Set variables for Projection routines */ + n2 = 2*n; + p_n = n; + p_n2 = n2; + p_Q = Q; + p_A_psi = A_psi; + p_lda = lda; + + /************************************************************************** + * * + * Generate initial search subspace V. Vectors are taken from V0 and if * + * necessary randomly generated. * + * * + **************************************************************************/ + + /* copy V0 to V */ + _FT(zlacpy)(fupl_a, &n, &V0dim, V0, &lda, V, &lda, 1); + j = V0dim; + /* if V0dim < blksize: generate additional random vectors */ + if (V0dim < blksize) { + idummy = (blksize - V0dim)*n; /* nof random numbers */ + _FT(zlarnv)(&IDIST, ISEED, &idummy, V + V0dim*lda); + j = blksize; + } + for (cnt = 0; cnt < j; cnt ++) { + ModifiedGS(V + cnt*lda, n, cnt, V, lda); + alpha = sqrt(square_norm((spinor*)(V+cnt*lda), N, 1)); + alpha = 1.0 / alpha; + _FT(dscal)(&n2, &alpha, (double *)(V + cnt*lda), &ONE); + } + + /* Generate interaction matrix M = V^dagger*A*V. Only the upper triangle + is computed. 
*/ + for (cnt = 0; cnt < j; cnt++){ + A_psi((spinor*) temp1, (spinor*)(V+cnt*lda)); + idummy = cnt+1; + for(i = 0; i < idummy; i++){ + M[cnt*jmax+i] = scalar_prod((spinor*)(V+i*lda), (spinor*) temp1, N, 1); + } + } + + /* Other initializations */ + k = 0; (*it) = 0; + if((*k_conv) > 0) { + k = *k_conv; + } + + actblksize = blksize; + for(act = 0; act < blksize; act ++){ + solvestep[act] = 1; + } + + + /**************************************************************************** + * * + * Main JD-iteration loop * + * * + ****************************************************************************/ + + while((*it) < itmax) { + + /**************************************************************************** + * * + * Solving the projected eigenproblem * + * * + * M*u = V^dagger*A*V*u = s*u * + * M is hermitian, only the upper triangle is stored * + * * + ****************************************************************************/ + _FT(zlacpy)(fupl_u, &j, &j, M, &jmax, U, &jmax, 1); + _FT(zheev)(fupl_v, fupl_u, &j, U, &jmax, s, eigwork, &eigworklen, rwork, &info, 1, 1); + + if (info != 0) { + printf("error solving the projected eigenproblem."); + printf(" zheev: info = %d\n", info); + } + if(info != 0) jderrorhandler(502,"proble in zheev"); + + + /* Reverse order of eigenvalues if maximal value is needed */ + if(maxmin == 1){ + sorteig(j, s, U, jmax, s[j-1], dtemp, idx1, idx2, 0); + } + else{ + sorteig(j, s, U, jmax, 0., dtemp, idx1, idx2, 0); + } + /**************************************************************************** + * * + * Convergence/Restart Check * + * * + * In case of convergence, strip off a whole block or just the converged * + * ones and put 'em into Q. 
Update the matrices Q, V, U, s * + * * + * In case of a restart update the V, U and M matrices and recompute the * + * Eigenvectors * + * * + ****************************************************************************/ + + found = 1; + while(found) { + + /* conv/keep = Number of converged/non-converged Approximations */ + conv = 0; keep = 0; + + for(act=0; act < actblksize; act++){ + + /* Setting pointers for single vectors */ + q = Q + (act+k)*lda; + u = U + act*jmax; + r = Res + act*lda; + + /* Compute Ritz-Vector Q[:,k+cnt1]=V*U[:,cnt1] */ + theta = s[act]; + _FT(zgemv)(fupl_n, &n, &j, &CONE, V, &lda, u, &ONE, &CZERO, q, &ONE, 1); + + /* Compute the residual */ + A_psi((spinor*) r, (spinor*) q); + theta = -theta; + _FT(daxpy)(&n2, &theta, (double*) q, &ONE, (double*) r, &ONE); + + /* Compute norm of the residual and update arrays convind/keepind*/ + resnrm_old[act] = resnrm[act]; + resnrm[act] = sqrt(square_norm((spinor*) r, N, 1)); + if (resnrm[act] < tol){ + convind[conv] = act; + conv = conv + 1; + } + else{ + keepind[keep] = act; + keep = keep + 1; + } + + } /* for(act = 0; act < actblksize; act ++) */ + + /* Check whether the blkwise-mode is chosen and ALL the + approximations converged, or whether the strip-off mode is + active and SOME of the approximations converged */ + + found = ((blkwise==1 && conv==actblksize) || (blkwise==0 && conv!=0)) + && (j > actblksize || k == kmax - actblksize); + + /*************************************************************************** + * * + * Convergence Case * + * * + * In case of convergence, strip off a whole block or just the converged * + * ones and put 'em into Q. 
Update the matrices Q, V, U, s * + * * + **************************************************************************/ + + if (found) { + + /* Store Eigenvalues */ + for(act = 0; act < conv; act++) + lambda[k+act] = s[convind[act]]; + + /* Re-use non approximated Ritz-Values */ + for(act = 0; act < keep; act++) + s[act] = s[keepind[act]]; + + /* Shift the others in the right position */ + for(act = 0; act < (j-actblksize); act ++) + s[act+keep] = s[act+actblksize]; + + /* Update V. Re-use the V-Vectors not looked at yet. */ + idummy = j - actblksize; + for (act = 0; act < n; act = act + jmax) { + cnt = act + jmax > n ? n-act : jmax; + _FT(zlacpy)(fupl_a, &cnt, &j, V+act, &lda, Vtmp, &jmax, 1); + _FT(zgemm)(fupl_n, fupl_n, &cnt, &idummy, &j, &CONE, Vtmp, + &jmax, U+actblksize*jmax, &jmax, &CZERO, V+act+keep*lda, &lda, 1, 1); + } + + /* Insert the not converged approximations as first columns in V */ + for(act = 0; act < keep; act++){ + _FT(zlacpy)(fupl_a,&n,&ONE,Q+(k+keepind[act])*lda,&lda,V+act*lda,&lda,1); + } + + /* Store Eigenvectors */ + for(act = 0; act < conv; act++){ + _FT(zlacpy)(fupl_a,&n,&ONE,Q+(k+convind[act])*lda,&lda,Q+(k+act)*lda,&lda,1); + } + + /* Update SearchSpaceSize j */ + j = j - conv; + + /* Let M become a diagonalmatrix with the Ritzvalues as entries ... */ + _FT(zlaset)(fupl_u, &j, &j, &CZERO, &CZERO, M, &jmax, 1); + for (act = 0; act < j; act++) + M[act*jmax + act] = s[act]; + + /* ... 
and U the Identity(jnew,jnew) */ + _FT(zlaset)(fupl_a, &j, &j, &CZERO, &CONE, U, &jmax, 1); + + if(shift_mode == 1){ + if(maxmin == 0){ + for(act = 0; act < conv; act ++){ + if (lambda[k+act] > tau){ + tau = lambda[k+act]; + } + } + } + else{ + for(act = 0; act < conv; act ++){ + if (lambda[k+act] < tau){ + tau = lambda[k+act]; + } + } + } + } + + /* Update Converged-Eigenpair-counter and Pro_k */ + k = k + conv; + + /* Update the new blocksize */ + actblksize=min(blksize, kmax-k); + + /* Exit main iteration loop when kmax eigenpairs have been + approximated */ + if (k == kmax){ + endflag = 1; + break; + } + /* Counter for the linear-solver-accuracy */ + for(act = 0; act < keep; act++) + solvestep[act] = solvestep[keepind[act]]; + + /* Now we expect to have the next eigenvalues */ + /* allready with some accuracy */ + /* So we do not need to start from scratch... */ + for(act = keep; act < blksize; act++) + solvestep[act] = 1; + + } /* if(found) */ + if(endflag == 1){ + break; + } + /************************************************************************** + * * + * Restart * + * * + * The Eigenvector-Aproximations corresponding to the first jmin * + * Petrov-Vectors are kept. if (j+actblksize > jmax) { * + * * + **************************************************************************/ + if (j+actblksize > jmax) { + + idummy = j; j = jmin; + + for (act = 0; act < n; act = act + jmax) { /* V = V * U(:,1:j) */ + cnt = act+jmax > n ? 
n-act : jmax; + _FT(zlacpy)(fupl_a, &cnt, &idummy, V+act, &lda, Vtmp, &jmax, 1); + _FT(zgemm)(fupl_n, fupl_n, &cnt, &j, &idummy, &CONE, Vtmp, + &jmax, U, &jmax, &CZERO, V+act, &lda, 1, 1); + } + + _FT(zlaset)(fupl_a, &j, &j, &CZERO, &CONE, U, &jmax, 1); + _FT(zlaset)(fupl_u, &j, &j, &CZERO, &CZERO, M, &jmax, 1); + for (act = 0; act < j; act++) + M[act*jmax + act] = s[act]; + } + + } /* while(found) */ + + if(endflag == 1){ + break; + } + + /**************************************************************************** + * * + * Solving the correction equations * + * * + * * + ****************************************************************************/ + + /* Solve actblksize times the correction equation ... */ + for (act = 0; act < actblksize; act ++) { + + /* Setting start-value for vector v as zeros(n,1). Guarantees + orthogonality */ + v = V + j*lda; + for (cnt = 0; cnt < n; cnt ++){ + v[cnt] = 0.; + } + + /* Adaptive accuracy and shift for the lin.solver. In case the + residual is big, we don't need a too precise solution for the + correction equation, since even in exact arithmetic the + solution wouldn't be too usefull for the Eigenproblem. */ + r = Res + act*lda; + + if (resnrm[act] < eps_tr && resnrm[act] < s[act] && resnrm_old[act] > resnrm[act]){ + p_theta = s[act]; + } + else{ + p_theta = tau; + } + p_k = k + actblksize; + + /* if we are in blockwise mode, we do not want to */ + /* iterate solutions much more, if they have */ + /* allready the desired precision */ + if(blkwise == 1 && resnrm[act] < tol) { + it_tol = pow(toldecay, (double)(-5)); + } + else { + it_tol = pow(toldecay, (double)(-solvestep[act])); + } + solvestep[act] = solvestep[act] + 1; + + + /* equation and project if necessary */ + ModifiedGS(r, n, k + actblksize, Q, lda); + + /* Solve the correction equation ... 
*/ + g_sloppy_precision = 1; + if(solver_flag == GMRES){ +/* info = gmres((spinor*) v, (spinor*) r, 10, linitmax/10, it_tol*it_tol, &Proj_A_psi, &Proj_A_psi); */ + info = gmres((spinor*) v, (spinor*) r, 10, linitmax/10, it_tol*it_tol, 0, + n*sizeof(_Complex double)/sizeof(spinor), 1, &Proj_A_psi); + } + else if(solver_flag == CGS){ /* chained with else: exactly one solver runs per correction equation, so a GMRES request no longer falls through to the default gmres call below */ + info = cgs_real((spinor*) v, (spinor*) r, linitmax, it_tol*it_tol, 0, + n*sizeof(_Complex double)/sizeof(spinor), &Proj_A_psi); + } + else if (solver_flag == BICGSTAB){ + info = bicgstab_complex((spinor*) v, (spinor*) r, linitmax, it_tol*it_tol, 0, + n*sizeof(_Complex double)/sizeof(spinor), &Proj_A_psi); + } + else if (solver_flag == CG){ + info = cg_her((spinor*) v, (spinor*) r, linitmax, it_tol*it_tol, 0, + n*sizeof(_Complex double)/sizeof(spinor), &Proj_A_psi); + } + else{ /* unknown solver_flag: default to GMRES */ + info = gmres((spinor*) v, (spinor*) r, 10, linitmax, it_tol*it_tol, 0, + n*sizeof(_Complex double)/sizeof(spinor), 1, &Proj_A_psi); + } + g_sloppy_precision = 0; + + /* Actualizing profiling data */ + if (info == -1){ /* -1 is counted as a full linitmax iterations */ + CntCorrIts += linitmax; + } + else{ + CntCorrIts += info; + } + actcorrits[act] = info; + + /* orthonormalize v to Q, cause the implicit + orthogonalization in the solvers may be too inaccurate. 
Then + apply "IteratedCGS" to prevent numerical breakdown + in order to orthogonalize v to V */ + + ModifiedGS(v, n, k+actblksize, Q, lda); + IteratedClassicalGS(v, &alpha, n, j, V, temp1, lda); + + alpha = 1.0 / alpha; + _FT(dscal)(&n2, &alpha, (double*) v, &ONE); + + /* update interaction matrix M */ + A_psi((spinor*) temp1, (spinor*) v); + idummy = j+1; + for(i = 0; i < idummy; i++) { + M[j*jmax+i] = scalar_prod((spinor*)(V+i*lda), (spinor*) temp1, N, 1); + } + + /* Increasing SearchSpaceSize j */ + j ++; + } /* for (act = 0;act < actblksize; act ++) */ + + /* Print information line */ + if(g_proc_id == 0) { + print_status(verbosity, *it, k, j - blksize, kmax, blksize, actblksize, + s, resnrm, actcorrits); + } + + /* Increase iteration-counter for outer loop */ + (*it) = (*it) + 1; + + } /* Main iteration loop */ + + /****************************************************************** + * * + * Eigensolutions converged or iteration limit reached * + * * + * Print statistics. Free memory. Return. 
* + * * + ******************************************************************/ + + (*k_conv) = k; + if (g_proc_id == 0 && verbosity > 0) { + printf("\nJDHER execution statistics\n\n"); + printf("IT_OUTER=%d IT_INNER_TOT=%d IT_INNER_AVG=%8.2f\n", + (*it), CntCorrIts, (double)CntCorrIts/(*it)); + printf("\nConverged eigensolutions in order of convergence:\n"); + printf("\n I LAMBDA(I) RES(I)\n"); + printf("---------------------------------------\n"); + } + for (act = 0; act < *k_conv; act ++) { + /* Compute the residual for solution act */ + q = Q + act*lda; + theta = -lambda[act]; + A_psi((spinor*) r, (spinor*) q); + _FT(daxpy)(&n2, &theta, (double*) q, &ONE, (double*) r, &ONE); + alpha = sqrt(square_norm((spinor*) r, N, 1)); + if(g_proc_id == 0 && verbosity > 0) { + printf("%3d %22.15e %12.5e\n", act+1, lambda[act], + alpha); + } + } + if(g_proc_id == 0 && verbosity > 0) { + printf("\n"); + fflush( stdout ); + } + + free(V_); free(Vtmp); free(U); + free(s); free(Res_); + free(resnrm); free(resnrm_old); + free(M); free(Z); + free(eigwork); free(temp1_); + free(dtemp); free(rwork); + free(p_work); + free(idx1); free(idx2); + free(convind); free(keepind); free(solvestep); free(actcorrits); + +} /* jdher(.....) */ + + +/**************************************************************************** + * * + * Supporting functions * + * * + ****************************************************************************/ + +/* PRINT_STATUS - print status line (called for each outer iteration) + */ +static void print_status(int verbosity, int it, int k, int j, int kmax, + int blksize, int actblksize, + double *s, double *resnrm, int *actcorrits) { + const int max_vals = 5; + + int i, idummy; + + if (verbosity > 2) { + if (blksize == 1) { + if (it == 0) { + printf(" IT K J RES LINIT RITZ-VALUES(1:5)\n"); + idummy = 28 + ( 13 > max_vals*10 ? 
13 : max_vals*10); + for (i = 0; i < idummy; i ++) + putchar('-'); + printf("\n"); + } + printf("%4d %3d %3d %9.2e %5d", it + 1, k, j, resnrm[0], actcorrits[0]); + for (i = 0; i < (j < max_vals ? j : max_vals); i ++){ + printf(" %9.2e", s[i]); + } + printf("\n"); + fflush( stdout ); + } + else { /* blksize > 1 */ + if (it == 0) { + printf(" IT K J RITZVALS "); + for (i = 1; i < actblksize; i ++) + printf(" "); + printf(" RES "); + for (i = 1; i < actblksize; i ++) + printf(" "); + printf(" LINIT\n"); + idummy = 12 + 4 + blksize*(10 + 10 + 5); + for (i = 0; i < idummy; i ++) + putchar('-'); + printf("\n"); + } + printf("%4d %3d %3d", it + 1, k, j); + for (i = 0; i < blksize; i ++) + if (i < actblksize) + printf(" %9.2e", s[i]); + else + printf(" "); + printf(" "); + for (i = 0; i < blksize; i ++) + if (i < actblksize) + printf(" %9.2e", resnrm[i]); + else + printf(" "); + printf(" "); + for (i = 0; i < blksize; i ++) + if (i < actblksize) + printf(" %5d", actcorrits[i]); + else + printf(" "); + printf("\n"); + fflush( stdout ); + } + } +} + +/* + * SORTEIG + * + * Default behaviour (strategy == 0): + * + * Sort eigenpairs (S(i),U(:,i)), such that + * + * |S(i) - tau| <= |S(i+1) -tau| for i=1..j-1. + * + * j : dimension of S + * ldu: leading dimension of U + * dtemp: double array of length j + * idx: int array of length j + * + * Alternate behaviour (strategy == 1): + * + * Same as above but put all S(i) < tau to the end. This is used to + * avoid computation of zero eigenvalues. 
+ */ + +static void sorteig(int j, double S[], _Complex double U[], int ldu, double tau, + double dtemp[], int idx1[], int idx2[], int strategy){ /* reorders S[0..j-1] and the leading j entries of the first j columns of U in place; dtemp, idx1 and idx2 are caller-provided scratch */ + int i; + + /* setup vector to be sorted and index vector */ + switch (strategy) { + case 0: + for (i = 0; i < j; i ++) + dtemp[i] = fabs(S[i] - tau); + break; + case 1: + for (i = 0; i < j; i ++) + if (S[i] < tau) + dtemp[i] = DBL_MAX; + else + dtemp[i] = fabs(S[i] - tau); + break; + default: + jderrorhandler(503,"");; + } + for (i = 0; i < j; i ++) + idx1[i] = i; + + /* sort dtemp in ascending order carrying idx1 along */ + quicksort(j, dtemp, idx1); + + /* compute 'inverse' index vector */ + for (i = 0; i < j; i ++) + idx2[idx1[i]] = i; + + /* sort eigenvalues */ + memcpy(dtemp, S, j * sizeof(double)); + for (i = 0; i < j; i ++) + S[i] = dtemp[idx1[i]]; + + /* sort eigenvectors (in place) */ + for (i = 0; i < j; i ++) { + if (i != idx1[i]) { + memcpy(dtemp, U+i*ldu, j*sizeof(_Complex double)); /* dtemp is reused as the swap buffer, so it must have room for j complex values */ + memcpy(U+i*ldu, U+idx1[i]*ldu, j*sizeof(_Complex double)); + memcpy(U+idx1[i]*ldu, dtemp, j*sizeof(_Complex double)); + idx1[idx2[i]] = idx1[i]; /* the slot that wanted old column i now finds it where it was just moved */ + idx2[idx1[i]] = idx2[i]; /* keep idx2 the inverse of idx1 */ + } + } +} + + + + +void Proj_A_psi(spinor * const y, spinor * const x){ /* y = (1 - Q Q^dagger)(A - theta) x, built from the file-level p_* variables that jdher sets */ + double mtheta = -p_theta; + int i; + /* y = A*x */ + p_A_psi(y, x); + /* y = -theta*x+y*/ + _FT(daxpy)(&p_n2, &mtheta, (double*) x, &ONE, (double*) y, &ONE); + /* p_work = Q^dagger*y */ + for(i = 0; i < p_k; i++) { + p_work[i] = scalar_prod((spinor*)(p_Q+i*p_lda), (spinor*) y, p_n*sizeof(_Complex double)/sizeof(spinor), 1); + } + /* y = y - Q*p_work */ + _FT(zgemv)(fupl_n, &p_n, &p_k, &CMONE, p_Q, &p_lda, (_Complex double*) p_work, &ONE, &CONE, (_Complex double*) y, &ONE, 1); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher.h new file mode 100644 index 0000000000000000000000000000000000000000..9b18ffefd098ea2477251a5471c6174a66b422a7 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher.h @@ -0,0 +1,48 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _JDHER_H +#define _JDHER_H + +#ifndef JD_MAXIMAL +#define JD_MAXIMAL 1 +#endif +#ifndef JD_MINIMAL +#define JD_MINIMAL 0 +#endif + +#include +#include +#include "su3.h" +#include "solver/solver.h" + +void jderrorhandler(const int i, char * message); + +extern void jdher(int n, int lda, double tau, double tol, + int kmax, int jmax, int jmin, int itmax, + int blksize, int blkwise, + int V0dim, _Complex double *V0, + int solver_flag, + int linitmax, double eps_tr, double toldecay, + int verbosity, + int *k_conv, _Complex double *Q, double *lambda, int *it, + int maxmin, int shift_mode, + matrix_mult A_psi); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher_bi.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher_bi.c new file mode 100644 index 0000000000000000000000000000000000000000..2334c73f701b3c62fc118cc6c96972677bcd9835 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher_bi.c @@ -0,0 +1,862 @@ +/*********************************************************************** + * 
Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + * + * This is an implementation of the Jacobi-Davidson method + * for hermitian matrices. + * + * It is an adaption of the implementation of + * + * R. Geus and O. Chinellato + * + * for symmetric real matrices. + * + * See http://www.inf.ethz.ch/personal/geus/software.html + * + * It is so far implemented without preconditioning and for + * the eigenvalue problem: + * + * A*x = lambda*x + * + * The implementation uses LAPACK and BLAS routines and is fully + * parallelized. + * + * Author of this adaption: + * Carsten Urbach + * + **************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include "sse.h" +#include "linalg/fortran.h" +#include "linalg/blas.h" +#include "linalg/lapack.h" +#include "linalg_eo.h" +#include +#include "solver/solver.h" +#include "solver/gram-schmidt.h" +#include "solver/quicksort.h" +#include "jdher.h" +#include "jdher_bi.h" + +#define min(a,b)((a)<(b) ? (a) : (b)) +#define max(a,b)((a)<(b) ? 
(b) : (a)) + + + +/**************************************************************************** + * * + * Prototypes of static functions * + * * + ****************************************************************************/ +static void print_status(int clvl, int it, int k, int j, int kmax, + int blksize, int actblksize, + double *s, double *resnrm, int *actcorrits); +static void sorteig(int j, double S[], _Complex double U[], int ldu, double tau, + double dtemp[], int idx1[], int idx2[], int strategy); + +/* Projection routines */ +void Proj_A_psi_bi(bispinor * const y, bispinor * const x); + +/**************************************************************************** + * * + * Static variables * + * * + ****************************************************************************/ +/* static double DMONE = -1.0, DZER = 0.0, DONE = 1.0; */ +static int MONE = -1, ONE = 1; +static _Complex double CONE, CZERO, CMONE; + +/* Projector variables */ + +static int p_n, p_n2, p_k, p_lda; +static double p_theta; +_Complex double * p_Q_bi; +_Complex double * p_work_bi; +matrix_mult_bi p_A_psi_bi; + +static char * fupl_u = "U", *fupl_n = "N", * fupl_a = "A", *fupl_v = "V", *filaenv = "zhetrd", *fvu = "VU"; + +/**************************************************************************** + * * + * Main eigensolver routine * + * * + ****************************************************************************/ + +void jdher_bi(int n, int lda, double tau, double tol, + int kmax, int jmax, int jmin, int itmax, + int blksize, int blkwise, + int V0dim, _Complex double *V0, + int solver_flag, + int linitmax, double eps_tr, double toldecay, + int verbosity, + int *k_conv, _Complex double *Q, double *lambda, int *it, + int maxmin, const int shift_mode, + matrix_mult_bi A_psi){ + + /**************************************************************************** + * * + * Local variables * + * * + ****************************************************************************/ + + /* 
constants */ + + /* allocatables: + * initialize with NULL, so we can free even unallocated ptrs */ + double *s = NULL, *resnrm = NULL, *resnrm_old = NULL, *dtemp = NULL, *rwork = NULL; + + /* V_, Res_ and temp1_ hold the raw malloc results; V, Res and temp1 are the + * (possibly SSE-aligned) pointers actually used for computation. */ + _Complex double *V_ = NULL, *V, *Vtmp = NULL, *U = NULL, *M = NULL, *Z = NULL, + *Res_ = NULL, *Res, + *eigwork = NULL, *temp1_ = NULL, *temp1; + + int *idx1 = NULL, *idx2 = NULL, + *convind = NULL, *keepind = NULL, *solvestep = NULL, + *actcorrits = NULL; + + /* non-allocated ptrs */ + _Complex double *q, *v, *u, *r = NULL; + /* _Complex double *matdummy, *vecdummy; */ + + /* scalar vars */ + double theta, alpha, it_tol; + + /* N: the n complex numbers of one vector expressed as a count of bispinors */ + int i, k, j, actblksize, eigworklen, found, conv, keep, n2, N = n*sizeof(_Complex double)/sizeof(bispinor); + int act, cnt, idummy, info, CntCorrIts=0, endflag=0; + + + /* variables for random number generator */ + int IDIST = 1; + int ISEED[4] = {2, 3, 5, 7}; + /* Per-process seed for zlarnv. LAPACK requires every ISEED entry to lie in + * [0, 4095] (and ISEED[3] to be odd), so ranks >= 4096 are folded into the + * second entry instead of storing g_proc_id unchanged. Ranks below 4096 + * still get the seed {g_proc_id, 3, 5, 7} used before. */ + ISEED[0] = g_proc_id % 4096; + ISEED[1] = (3 + g_proc_id / 4096) % 4096; + + /**************************************************************************** + * * + * Execution starts here... * + * * + ****************************************************************************/ + /* NEW PART FOR GAUGE_COPY */ + /* END NEW PART */ + + /* print info header */ + if (verbosity > 2 && g_proc_id == 0) { + printf("Jacobi-Davidson method for hermitian Matrices\n"); + printf("Solving A*x = lambda*x \n\n"); + printf(" N= %10d ITMAX=%4d\n", n, itmax); + printf(" KMAX=%3d JMIN=%3d JMAX=%3d V0DIM=%3d\n", + kmax, jmin, jmax, V0dim); + printf(" BLKSIZE= %2d BLKWISE= %5s\n", + blksize, blkwise ? "TRUE" : "FALSE"); + printf(" TOL= %11.4e TAU= %11.4e\n", + tol, tau); + printf(" LINITMAX= %5d EPS_TR= %10.3e TOLDECAY=%9.2e\n", + linitmax, eps_tr, toldecay); + printf("\n Computing %s eigenvalues\n", + maxmin ? 
"maximal" : "minimal"); + printf("\n"); + fflush( stdout ); + } + + /* validate input parameters */ + if(tol <= 0) jderrorhandler(401,""); + if(kmax <= 0 || kmax > n) jderrorhandler(402,""); + if(jmax <= 0 || jmax > n) jderrorhandler(403,""); + if(jmin <= 0 || jmin > jmax) jderrorhandler(404,""); + if(itmax < 0) jderrorhandler(405,""); + if(blksize > jmin || blksize > (jmax - jmin)) jderrorhandler(406,""); + if(blksize <= 0 || blksize > kmax) jderrorhandler(406,""); + if(blkwise < 0 || blkwise > 1) jderrorhandler(407,""); + if(V0dim < 0 || V0dim >= jmax) jderrorhandler(408,""); + if(linitmax < 0) jderrorhandler(409,""); + if(eps_tr < 0.) jderrorhandler(500,""); + if(toldecay <= 1.0) jderrorhandler(501,""); + + CONE = 1.; + CZERO = 0.; + CMONE = -1.; + + /* Get hardware-dependent values: + * Opt size of workspace for ZHEEV is (NB+1)*j, where NB is the opt. + * block size... */ + eigworklen = (2 + _FT(ilaenv)(&ONE, filaenv, fvu, &jmax, &MONE, &MONE, &MONE, 6, 2)) * jmax; + + /* Allocating memory for matrices & vectors */ + if((void*)(V_ = (_Complex double *)malloc((lda * jmax + 4) * sizeof(_Complex double))) == NULL) { + errno = 0; + jderrorhandler(300,"V in jdher_bi"); + } +#if (defined SSE || defined SSE2 || defined SSE3) + V = (_Complex double*)(((unsigned long int)(V_)+ALIGN_BASE)&~ALIGN_BASE); +#else + V = V_; +#endif + if((void*)(U = (_Complex double *)malloc(jmax * jmax * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"U in jdher_bi"); + } + if((void*)(s = (double *)malloc(jmax * sizeof(double))) == NULL) { + jderrorhandler(300,"s in jdher_bi"); + } + if((void*)(Res_ = (_Complex double *)malloc((lda * blksize+4) * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"Res in jdher_bi"); + } +#if (defined SSE || defined SSE2 || defined SSE3) + Res = (_Complex double*)(((unsigned long int)(Res_)+ALIGN_BASE)&~ALIGN_BASE); +#else + Res = Res_; +#endif + if((void*)(resnrm = (double *)malloc(blksize * sizeof(double))) == NULL) { + 
jderrorhandler(300,"resnrm in jdher_bi"); + } + if((void*)(resnrm_old = (double *)calloc(blksize,sizeof(double))) == NULL) { + jderrorhandler(300,"resnrm_old in jdher_bi"); + } + if((void*)(M = (_Complex double *)malloc(jmax * jmax * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"M in jdher_bi"); + } + if((void*)(Vtmp = (_Complex double *)malloc(jmax * jmax * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"Vtmp in jdher_bi"); + } + if((void*)(p_work_bi = (_Complex double *)malloc(lda * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"p_work_bi in jdher_bi"); + } + + /* ... */ + if((void*)(idx1 = (int *)malloc(jmax * sizeof(int))) == NULL) { + jderrorhandler(300,"idx1 in jdher_bi"); + } + if((void*)(idx2 = (int *)malloc(jmax * sizeof(int))) == NULL) { + jderrorhandler(300,"idx2 in jdher_bi"); + } + + /* Indices for (non-)converged approximations */ + if((void*)(convind = (int *)malloc(blksize * sizeof(int))) == NULL) { + jderrorhandler(300,"convind in jdher_bi"); + } + if((void*)(keepind = (int *)malloc(blksize * sizeof(int))) == NULL) { + jderrorhandler(300,"keepind in jdher_bi"); + } + if((void*)(solvestep = (int *)malloc(blksize * sizeof(int))) == NULL) { + jderrorhandler(300,"solvestep in jdher_bi"); + } + if((void*)(actcorrits = (int *)malloc(blksize * sizeof(int))) == NULL) { + jderrorhandler(300,"actcorrits in jdher_bi"); + } + + if((void*)(eigwork = (_Complex double *)malloc(eigworklen * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"eigwork in jdher_bi"); + } + if((void*)(rwork = (double *)malloc(3*jmax * sizeof(double))) == NULL) { + jderrorhandler(300,"rwork in jdher_bi"); + } + if((void*)(temp1_ = (_Complex double *)malloc((lda+4) * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"temp1 in jdher_bi"); + } +#if (defined SSE || defined SSE2 || defined SSE3) + temp1 = (_Complex double*)(((unsigned long int)(temp1_)+ALIGN_BASE)&~ALIGN_BASE); +#else + temp1 = temp1_; +#endif + if((void*)(dtemp = (double 
*)malloc(lda * sizeof(_Complex double))) == NULL) { + jderrorhandler(300,"dtemp in jdher_bi"); + } + + /* Set variables for Projection routines */ + n2 = 2*n; + p_n = n; + p_n2 = n2; + p_Q_bi = Q; + p_A_psi_bi = A_psi; + p_lda = lda; + + /************************************************************************** + * * + * Generate initial search subspace V. Vectors are taken from V0 and if * + * necessary randomly generated. * + * * + **************************************************************************/ + + /* copy V0 to V */ + _FT(zlacpy)(fupl_a, &n, &V0dim, V0, &lda, V, &lda, 1); + j = V0dim; + /* if V0dim < blksize: generate additional random vectors */ + if (V0dim < blksize) { + idummy = (blksize - V0dim)*n; /* nof random numbers */ + _FT(zlarnv)(&IDIST, ISEED, &idummy, V + V0dim*lda); + j = blksize; + } + for (cnt = 0; cnt < j; cnt ++) { + ModifiedGS(V + cnt*lda, n, cnt, V, lda); + alpha = sqrt(square_norm((spinor*)(V+cnt*lda), 2*N, 1)); + alpha = 1.0 / alpha; + _FT(dscal)(&n2, &alpha, (double *)(V + cnt*lda), &ONE); + } + /* Generate interaction matrix M = V^dagger*A*V. Only the upper triangle + is computed. 
*/ + for (cnt = 0; cnt < j; cnt++){ + A_psi((bispinor*) temp1, (bispinor*)(V+cnt*lda)); + idummy = cnt+1; + for(i = 0; i < idummy; i++) { + M[cnt*jmax+i] = scalar_prod((spinor*)(V+i*lda), (spinor*) temp1, 2*N, 1); + } + } + /* Other initializations */ + k = 0; (*it) = 0; + if((*k_conv) > 0) { + k = *k_conv; + } + + actblksize = blksize; + for(act = 0; act < blksize; act ++){ + solvestep[act] = 1; + } + + + /**************************************************************************** + * * + * Main JD-iteration loop * + * * + ****************************************************************************/ + while((*it) < itmax) { + /**************************************************************************** + * * + * Solving the projected eigenproblem * + * * + * M*u = V^dagger*A*V*u = s*u * + * M is hermitian, only the upper triangle is stored * + * * + ****************************************************************************/ + _FT(zlacpy)(fupl_u, &j, &j, M, &jmax, U, &jmax, 1); + _FT(zheev)(fupl_v, fupl_u, &j, U, &jmax, s, eigwork, &eigworklen, rwork, &info, 1, 1); + + if (info != 0) { + printf("error solving the projected eigenproblem."); + printf(" zheev: info = %d\n", info); + } + if(info != 0) jderrorhandler(502,"problem in zheev for jdher_bi"); + + + /* Reverse order of eigenvalues if maximal value is needed */ + if(maxmin == 1){ + sorteig(j, s, U, jmax, s[j-1], dtemp, idx1, idx2, 0); + } + else{ + sorteig(j, s, U, jmax, 0., dtemp, idx1, idx2, 0); + } + /**************************************************************************** + * * + * Convergence/Restart Check * + * * + * In case of convergence, strip off a whole block or just the converged * + * ones and put 'em into Q. 
Update the matrices Q, V, U, s * + * * + * In case of a restart update the V, U and M matrices and recompute the * + * Eigenvectors * + * * + ****************************************************************************/ + + found = 1; + while(found) { + + /* conv/keep = Number of converged/non-converged Approximations */ + conv = 0; keep = 0; + + for(act=0; act < actblksize; act++){ + + /* Setting pointers for single vectors */ + q = Q + (act+k)*lda; + u = U + act*jmax; + r = Res + act*lda; + + /* Compute Ritz-Vector Q[:,k+cnt1]=V*U[:,cnt1] */ + theta = s[act]; + _FT(zgemv)(fupl_n, &n, &j, &CONE, V, &lda, u, &ONE, &CZERO, q, &ONE, 1); + + /* Compute the residual */ + A_psi((bispinor*) r, (bispinor*) q); + theta = -theta; + _FT(daxpy)(&n2, &theta, (double*) q, &ONE, (double*) r, &ONE); + + /* Compute norm of the residual and update arrays convind/keepind*/ + resnrm_old[act] = resnrm[act]; + resnrm[act] = sqrt(square_norm((spinor*) r, 2*N, 1)); + if (resnrm[act] < tol){ + convind[conv] = act; + conv = conv + 1; + } + else{ + keepind[keep] = act; + keep = keep + 1; + } + + } /* for(act = 0; act < actblksize; act ++) */ + + /* Check whether the blkwise-mode is chosen and ALL the + approximations converged, or whether the strip-off mode is + active and SOME of the approximations converged */ + + found = ((blkwise==1 && conv==actblksize) || (blkwise==0 && conv!=0)) + && (j > actblksize || k == kmax - actblksize); + + /*************************************************************************** + * * + * Convergence Case * + * * + * In case of convergence, strip off a whole block or just the converged * + * ones and put 'em into Q. 
Update the matrices Q, V, U, s * + * * + **************************************************************************/ + + if (found) { + + /* Store Eigenvalues */ + for(act = 0; act < conv; act++) + lambda[k+act] = s[convind[act]]; + + /* Re-use non approximated Ritz-Values */ + for(act = 0; act < keep; act++) + s[act] = s[keepind[act]]; + + /* Shift the others in the right position */ + for(act = 0; act < (j-actblksize); act ++) + s[act+keep] = s[act+actblksize]; + + /* Update V. Re-use the V-Vectors not looked at yet. */ + idummy = j - actblksize; + for (act = 0; act < n; act = act + jmax) { + cnt = act + jmax > n ? n-act : jmax; + _FT(zlacpy)(fupl_a, &cnt, &j, V+act, &lda, Vtmp, &jmax, 1); + _FT(zgemm)(fupl_n, fupl_n, &cnt, &idummy, &j, &CONE, Vtmp, + &jmax, U+actblksize*jmax, &jmax, &CZERO, V+act+keep*lda, &lda, 1, 1); + } + + /* Insert the not converged approximations as first columns in V */ + for(act = 0; act < keep; act++){ + _FT(zlacpy)(fupl_a,&n,&ONE,Q+(k+keepind[act])*lda,&lda,V+act*lda,&lda,1); + } + + /* Store Eigenvectors */ + for(act = 0; act < conv; act++){ + _FT(zlacpy)(fupl_a,&n,&ONE,Q+(k+convind[act])*lda,&lda,Q+(k+act)*lda,&lda,1); + } + + /* Update SearchSpaceSize j */ + j = j - conv; + + /* Let M become a diagonalmatrix with the Ritzvalues as entries ... */ + _FT(zlaset)(fupl_u, &j, &j, &CZERO, &CZERO, M, &jmax, 1); + for (act = 0; act < j; act++) + M[act*jmax + act] = s[act]; + + /* ... 
and U the Identity(jnew,jnew) */ + _FT(zlaset)(fupl_a, &j, &j, &CZERO, &CONE, U, &jmax, 1); + + if(shift_mode == 1){ + if(maxmin == 0){ + for(act = 0; act < conv; act ++){ + if (lambda[k+act] > tau){ + tau = lambda[k+act]; + } + } + } + else{ + for(act = 0; act < conv; act ++){ + if (lambda[k+act] < tau){ + tau = lambda[k+act]; + } + } + } + } + + /* Update Converged-Eigenpair-counter and Pro_k */ + k = k + conv; + + /* Update the new blocksize */ + actblksize=min(blksize, kmax-k); + + /* Exit main iteration loop when kmax eigenpairs have been + approximated */ + if (k == kmax){ + endflag = 1; + break; + } + /* Counter for the linear-solver-accuracy */ + for(act = 0; act < keep; act++) + solvestep[act] = solvestep[keepind[act]]; + + /* Now we expect to have the next eigenvalues */ + /* allready with some accuracy */ + /* So we do not need to start from scratch... */ + for(act = keep; act < blksize; act++) + solvestep[act] = 1; + + } /* if(found) */ + if(endflag == 1){ + break; + } + /************************************************************************** + * * + * Restart * + * * + * The Eigenvector-Aproximations corresponding to the first jmin * + * Petrov-Vectors are kept. if (j+actblksize > jmax) { * + * * + **************************************************************************/ + if (j+actblksize > jmax) { + + idummy = j; j = jmin; + + for (act = 0; act < n; act = act + jmax) { /* V = V * U(:,1:j) */ + cnt = act+jmax > n ? 
n-act : jmax; + _FT(zlacpy)(fupl_a, &cnt, &idummy, V+act, &lda, Vtmp, &jmax, 1); + _FT(zgemm)(fupl_n, fupl_n, &cnt, &j, &idummy, &CONE, Vtmp, + &jmax, U, &jmax, &CZERO, V+act, &lda, 1, 1); + } + + _FT(zlaset)(fupl_a, &j, &j, &CZERO, &CONE, U, &jmax, 1); + _FT(zlaset)(fupl_u, &j, &j, &CZERO, &CZERO, M, &jmax, 1); + for (act = 0; act < j; act++) + M[act*jmax + act] = s[act]; + } + + } /* while(found) */ + + if(endflag == 1){ + break; + } + + /**************************************************************************** + * * + * Solving the correction equations * + * * + * * + ****************************************************************************/ + + /* Solve actblksize times the correction equation ... */ + for (act = 0; act < actblksize; act ++) { + + /* Setting start-value for vector v as zeros(n,1). Guarantees + orthogonality */ + v = V + j*lda; + for (cnt = 0; cnt < n; cnt ++){ + v[cnt] = 0.; + } + + /* Adaptive accuracy and shift for the lin.solver. In case the + residual is big, we don't need a too precise solution for the + correction equation, since even in exact arithmetic the + solution wouldn't be too usefull for the Eigenproblem. */ + r = Res + act*lda; + + if (resnrm[act] < eps_tr && resnrm[act] < s[act] && resnrm_old[act] > resnrm[act]){ + p_theta = s[act]; + } + else{ + p_theta = tau; + } + p_k = k + actblksize; + + /* if we are in blockwise mode, we do not want to */ + /* iterate solutions much more, if they have */ + /* allready the desired precision */ + if(blkwise == 1 && resnrm[act] < tol) { + it_tol = pow(toldecay, (double)(-5)); + } + else { + it_tol = pow(toldecay, (double)(-solvestep[act])); + } + solvestep[act] = solvestep[act] + 1; + + /* equation and project if necessary */ + ModifiedGS(r, n, k + actblksize, Q, lda); + + g_sloppy_precision = 1; + /* Solve the correction equation ... 
*/ + if (solver_flag == BICGSTAB){ + info = bicgstab_complex_bi((bispinor*) v, (bispinor*) r, linitmax, + it_tol*it_tol, g_relative_precision_flag, VOLUME/2, &Proj_A_psi_bi); + } + else if(solver_flag == CG){ + info = cg_her_bi((bispinor*) v, (bispinor*) r, linitmax, + it_tol*it_tol, g_relative_precision_flag, VOLUME/2, &Proj_A_psi_bi); + } + else{ + info = bicgstab_complex_bi((bispinor*) v, (bispinor*) r, linitmax, + it_tol*it_tol, g_relative_precision_flag, VOLUME/2, &Proj_A_psi_bi); + } + + g_sloppy_precision = 0; + /* Actualizing profiling data */ + if (info == -1){ + CntCorrIts += linitmax; + } + else{ + CntCorrIts += info; + } + actcorrits[act] = info; + + /* orthonormalize v to Q, cause the implicit + orthogonalization in the solvers may be too inaccurate. Then + apply "IteratedCGS" to prevent numerical breakdown + in order to orthogonalize v to V */ + + ModifiedGS(v, n, k+actblksize, Q, lda); + IteratedClassicalGS(v, &alpha, n, j, V, temp1, lda); + + alpha = 1.0 / alpha; + _FT(dscal)(&n2, &alpha, (double*) v, &ONE); + + /* update interaction matrix M */ + A_psi((bispinor*) temp1, (bispinor*) v); + idummy = j+1; + for(i = 0; i < idummy; i++){ + M[j*jmax+i] = scalar_prod((spinor*)(V+i*lda), (spinor*) temp1, 2*N, 1); + } + /* Increasing SearchSpaceSize j */ + j ++; + } /* for (act = 0;act < actblksize; act ++) */ + + /* Print information line */ + if(g_proc_id == 0) { + print_status(verbosity, *it, k, j - blksize, kmax, blksize, actblksize, + s, resnrm, actcorrits); + } + + /* Increase iteration-counter for outer loop */ + (*it) = (*it) + 1; + + } /* Main iteration loop */ + + /****************************************************************** + * * + * Eigensolutions converged or iteration limit reached * + * * + * Print statistics. Free memory. Return. 
* + * * + ******************************************************************/ + + *k_conv = k; + if (verbosity >= 1) { + if(g_proc_id == 0) { + printf("\nJDHER execution statistics\n\n"); + printf("IT_OUTER=%d IT_INNER_TOT=%d IT_INNER_AVG=%8.2f\n", + (*it), CntCorrIts, (double)CntCorrIts/(*it)); + printf("\nConverged eigensolutions in order of convergence:\n"); + printf("\n I LAMBDA(I) RES(I)\n"); + printf("---------------------------------------\n"); + } + + for (act = 0; act < *k_conv; act ++) { + /* Compute the residual for solution act */ + q = Q + act*lda; + theta = -lambda[act]; + A_psi((bispinor*) r, (bispinor*) q); + _FT(daxpy)(&n2, &theta, (double*) q, &ONE, (double*) r, &ONE); + alpha = sqrt(square_norm((spinor*) r, 2*N, 1)); + if(g_proc_id == 0) { + printf("%3d %22.15e %12.5e\n", act+1, lambda[act], + alpha); + } + } + if(g_proc_id == 0) { + printf("\n"); + fflush( stdout ); + } + } + + free(V_); free(Vtmp); free(U); + free(s); free(Res_); + free(resnrm); free(resnrm_old); + free(M); free(Z); + free(eigwork); free(temp1_); + free(dtemp); free(rwork); + free(p_work_bi); + free(idx1); free(idx2); + free(convind); free(keepind); free(solvestep); free(actcorrits); + +} /* jdher(.....) */ + + +/**************************************************************************** + * * + * Supporting functions * + * * + ****************************************************************************/ + +/* PRINT_STATUS - print status line (called for each outer iteration) + */ +static void print_status(int verbosity, int it, int k, int j, int kmax, + int blksize, int actblksize, + double *s, double *resnrm, int *actcorrits) { + const int max_vals = 5; + + int i, idummy; + + if (verbosity > 2) { + if (blksize == 1) { + if (it == 0) { + printf(" IT K J RES LINIT RITZ-VALUES(1:5)\n"); + idummy = 28 + ( 13 > max_vals*10 ? 
13 : max_vals*10); + for (i = 0; i < idummy; i ++) + putchar('-'); + printf("\n"); + } + /* blksize == 1: iteration number (1-based), k, j, then the first residual norm and the first inner-iteration count */ printf("%4d %3d %3d %9.2e %5d", it + 1, k, j, resnrm[0], actcorrits[0]); + /* followed by at most max_vals entries of s */ for (i = 0; i < (j < max_vals ? j : max_vals); i ++){ + printf(" %9.2e", s[i]); + } + printf("\n"); + fflush( stdout ); + } + else { /* blksize > 1 */ + if (it == 0) { /* column header, printed on the first call only; padding is repeated actblksize-1 times per group */ + printf(" IT K J RITZVALS "); + for (i = 1; i < actblksize; i ++) + printf(" "); + printf(" RES "); + for (i = 1; i < actblksize; i ++) + printf(" "); + printf(" LINIT\n"); + idummy = 12 + 4 + blksize*(10 + 10 + 5); + for (i = 0; i < idummy; i ++) + putchar('-'); + printf("\n"); + } + printf("%4d %3d %3d", it + 1, k, j); + /* three groups of blksize columns (s, resnrm, actcorrits); slots at or beyond actblksize are printed as blanks */ for (i = 0; i < blksize; i ++) + if (i < actblksize) + printf(" %9.2e", s[i]); + else + printf(" "); + printf(" "); + for (i = 0; i < blksize; i ++) + if (i < actblksize) + printf(" %9.2e", resnrm[i]); + else + printf(" "); + printf(" "); + for (i = 0; i < blksize; i ++) + if (i < actblksize) + printf(" %5d", actcorrits[i]); + else + printf(" "); + printf("\n"); + fflush( stdout ); + } + } +} + +/* + * SORTEIG + * + * Default behaviour (strategy == 0): + * + * Sort eigenpairs (S(i),U(:,i)), such that + * + * |S(i) - tau| <= |S(i+1) -tau| for i=1..j-1. + * + * j : dimension of S + * ldu: leading dimension of U + * dtemp: work array; it first holds the j sort keys and is then reused to + * buffer one column of U (j _Complex double values), so it must + * provide room for at least 2*j doubles + * idx1, idx2: int arrays of length j (permutation and its inverse) + * + * Alternate behaviour (strategy == 1): + * + * Same as above but put all S(i) < tau to the end. This is used to + * avoid computation of zero eigenvalues. 
+ */ + +static void sorteig(int j, double S[], _Complex double U[], int ldu, double tau, + double dtemp[], int idx1[], int idx2[], int strategy){ + int i; + + /* setup vector to be sorted and index vector */ + switch (strategy) { + case 0: /* key = distance of S[i] from tau */ + for (i = 0; i < j; i ++) + dtemp[i] = fabs(S[i] - tau); + break; + case 1: /* as case 0, but values below tau get the largest possible key */ + for (i = 0; i < j; i ++) + if (S[i] < tau) + dtemp[i] = DBL_MAX; + else + dtemp[i] = fabs(S[i] - tau); + break; + default: /* unknown strategy; NOTE(review): dtemp is left unset here if jderrorhandler returns - confirm it does not */ + jderrorhandler(503,"");; + } + for (i = 0; i < j; i ++) + idx1[i] = i; + + /* sort dtemp in ascending order carrying idx1 along */ + quicksort(j, dtemp, idx1); + + /* compute 'inverse' index vector */ + for (i = 0; i < j; i ++) + idx2[idx1[i]] = i; + + /* sort eigenvalues (dtemp is reused as a copy of the unsorted S) */ + memcpy(dtemp, S, j * sizeof(double)); + for (i = 0; i < j; i ++) + S[i] = dtemp[idx1[i]]; + + /* sort eigenvectors (in place): swap column i with column idx1[i], using + dtemp as a buffer for one column of j complex entries, then update + idx1/idx2 because the column formerly at position i now sits at + position idx1[i] */ + for (i = 0; i < j; i ++) { + if (i != idx1[i]) { + memcpy(dtemp, U+i*ldu, j*sizeof(_Complex double)); + memcpy(U+i*ldu, U+idx1[i]*ldu, j*sizeof(_Complex double)); + memcpy(U+idx1[i]*ldu, dtemp, j*sizeof(_Complex double)); + idx1[idx2[i]] = idx1[i]; + idx2[idx1[i]] = idx2[i]; + } + } +} + + + + +void Proj_A_psi_bi(bispinor * const y, bispinor * const x){ /* Operator handed to the inner linear solvers by jdher_bi: y = A*x - p_theta*x, then the components of y along the first p_k columns of p_Q_bi are subtracted. Reads the file-level p_* variables set by jdher_bi. */ + double mtheta = -p_theta; + int i; + /* y = A*x */ + + p_A_psi_bi(y, x); + + /* y = -theta*x+y*/ + _FT(daxpy)(&p_n2, &mtheta, (double*) x, &ONE, (double*) y, &ONE); + /* p_work_bi = Q^dagger*y */ + for(i = 0; i < p_k; i++) { + p_work_bi[i] = scalar_prod((spinor*)(p_Q_bi+i*p_lda), (spinor*) y, + p_n*sizeof(_Complex double)/sizeof(spinor), 1); /* length given as a count of spinors; presumably the unit scalar_prod expects - confirm */ + } + /* y = y - Q*p_work_bi */ + _FT(zgemv)(fupl_n, &p_n, &p_k, &CMONE, p_Q_bi, &p_lda, (_Complex double*) p_work_bi, + &ONE, &CONE, (_Complex double*) y, &ONE, 1); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher_bi.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher_bi.h new file mode 100644 index 0000000000000000000000000000000000000000..b5e39efb996df61dd5da118b6d2470235c1ca336 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher_bi.h @@ -0,0 +1,47 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _JDHER_BI_H +#define _JDHER_BI_H + +#ifndef JD_MAXIMAL +#define JD_MAXIMAL 1 /* presumably the maxmin value selecting the maximal eigenvalues (jdher_bi treats maxmin == 1 that way) - confirm */ +#endif +#ifndef JD_MINIMAL +#define JD_MINIMAL 0 /* presumably the maxmin value selecting the minimal eigenvalues - confirm */ +#endif + +#include +#include +#include "su3.h" +#include "solver/solver.h" + + +extern void jdher_bi(int n, int lda, double tau, double jdtol, + int kmax, int jmax, int jmin, int itmax, + int blksize, int blkwise, + int V0dim, _Complex double *V0, + int linsolver, + int linitmax, double eps_tr, double toldecay, + int clvl, + int *k_conv, _Complex double *Q, double *lambda, int *it, + int maxmin, const int shift_mode, + matrix_mult_bi domatveca); /* Jacobi-Davidson eigensolver defined in jdher_bi.c. The definition names some parameters differently: jdtol is tol, linsolver is solver_flag, clvl is verbosity, domatveca is A_psi. */ + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher_su3vect.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher_su3vect.c new file mode 100644 index 0000000000000000000000000000000000000000..551576f0706a40c64d725549568358221a9f0f3b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher_su3vect.c @@ -0,0 +1,832 @@ +/*********************************************************************** + * Copyright (C) 
2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/* + * Routines for the computation of eigensystems of the Laplacian operator, with Jacobi-Davidson algo. + * Authors Luigi Scorzato, Marco Cristoforetti + * + * + *******************************************************************************/ +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "linalg/fortran.h" +#include "linalg/blas.h" +#include "linalg/lapack.h" +#include "linalg_eo.h" +#include "solver/solver.h" +#include "solver/gram-schmidt.h" +#include "solver/quicksort.h" +#include "cg_her_su3vect.h" +#include "jdher_su3vect.h" +#ifdef CRAY +#include +#endif + +#ifdef WITHLAPH + + +#define min(a,b) ((a)<(b) ? (a) : (b)) +#define max(a,b) ((a)<(b) ? 
(b) : (a)) + +/**************************************************************************** + * * + * Prototypes of static functions * + * * + ****************************************************************************/ +static void print_status_su3vect(int clvl, int it, int k, int j, int kmax, + int blksize, int actblksize, + double *s, double *resnrm, int *actcorrits); +static void sorteig_su3vect(int j, double S[], _Complex double U[], int ldu, double tau, + double dtemp[], int idx1[], int idx2[], int strategy); + +/* Projection routines */ +void Proj_A_psi_su3vect(su3_vector * const y, su3_vector * const x, int tslice); + +void jderrorhandler_su3vect(const int i, char * message) +{ + fprintf(stderr, "jdher %s \n", message); +#ifdef MPI + MPI_Finalize(); +#endif + exit(i); +} + +/**************************************************************************** + * * + * Static variables * + * * + ****************************************************************************/ +/* static double DMONE = -1.0, DZER = 0.0, DONE = 1.0; */ +static int MONE = -1, ONE = 1; +static _Complex double CONE, CZERO, CMONE; + +/* Projector variables */ + +static int p_n, p_n2, p_k, p_lda; +static double p_theta; +_Complex double * p_Q; +_Complex double * p_work; +matrix_mult_su3vect p_A_psi_s3; + +static char * fupl_u = "U", * fupl_c = "C", *fupl_n = "N", * fupl_a = "A", *fupl_v = "V", *filaenv = "zhetrd", *fvu = "VU"; + +void jdher_su3vect(int n, int lda, double tau, double tol, + int kmax, int jmax, int jmin, int itmax, + int blksize, int blkwise, + int V0dim, _Complex double *V0, + int solver_flag, + int linitmax, double eps_tr, double toldecay, + int verbosity, + int *k_conv, _Complex double *Q, double *lambda, int *it, + int maxmin, int shift_mode, int tslice, + matrix_mult_su3vect A_psi) +{ +/******************* + * Local variables * + *******************/ + +/* constants */ +/* allocatables: * + * initialize with NULL, so we can free even unallocated ptrs */ +double *s = NULL, 
*resnrm = NULL, *resnrm_old = NULL, *dtemp = NULL, *rwork = NULL; +_Complex double *V_ = NULL; +_Complex double *V; +_Complex double *Vtmp = NULL, *U = NULL, *M = NULL, *Z = NULL, *Res_ = NULL, *Res, *eigwork = NULL, + *temp1_ = NULL, *temp1; +int *idx1 = NULL, *idx2 = NULL, *convind = NULL, *keepind = NULL, *solvestep = NULL, *actcorrits = NULL; + +/* non-allocated ptrs */ +_Complex double *q, *v, *u, *r = NULL; +/* scalar vars */ +double theta, alpha, it_tol; +int i, k, j, actblksize, eigworklen, found, conv, keep, n2; +int act, cnt, idummy, info, CntCorrIts=0, endflag=0; +int N=n*sizeof(_Complex double)/sizeof(su3_vector); +int IDIST = 1; +int ISEED[4] = {2, 3, 5, 7}; + ISEED[0] = 2; + + /* print info header */ + if ((verbosity > 2) && (g_proc_id == 0)){ + printf("Jacobi-Davidson method for hermitian Matrices\n"); + printf("Solving A*x = lambda*x \n\n"); + printf(" N= %10d ITMAX=%4d\n", n, itmax); + printf(" KMAX=%3d JMIN=%3d JMAX=%3d V0DIM=%3d\n", + kmax, jmin, jmax, V0dim); + printf(" BLKSIZE= %2d BLKWISE= %5s\n", + blksize, blkwise ? "TRUE" : "FALSE"); + printf(" TOL= %11.4e TAU= %11.4e\n", + tol, tau); + printf(" LINITMAX= %5d EPS_TR= %10.3e TOLDECAY=%9.2e\n", + linitmax, eps_tr, toldecay); + printf("\n Computing %s eigenvalues\n", + maxmin ? "maximal" : "minimal"); + printf("\n"); + fflush( stdout ); + } + /* validate input parameters */ + if(tol <= 0) jderrorhandler(401,""); + if(kmax <= 0 || kmax > n) jderrorhandler(402,""); + if(jmax <= 0 || jmax > n) jderrorhandler(403,""); + if(jmin <= 0 || jmin > jmax) jderrorhandler(404,""); + if(itmax < 0) jderrorhandler(405,""); + if(blksize > jmin || blksize > (jmax - jmin)) jderrorhandler(406,""); + if(blksize <= 0 || blksize > kmax) jderrorhandler(406,""); + if(blkwise < 0 || blkwise > 1) jderrorhandler(407,""); + if(V0dim < 0 || V0dim >= jmax) jderrorhandler(408,""); + if(linitmax < 0) jderrorhandler(409,""); + if(eps_tr < 0.) 
jderrorhandler(500,""); + if(toldecay <= 1.0) jderrorhandler(501,""); + +/* CONE.re=1.; CONE.im=0.; + CZERO.re=0.; CZERO.im=0.; + CMONE.re=-1.; CMONE.im=0.; */ + /* Complex constants passed by address to BLAS/LAPACK. CMONE is "minus one" + * (see the struct-based initialisation kept above); it was set to the + * imaginary unit here, which does not match that intent. */ + CONE=(_Complex double)1.0; + CZERO=(_Complex double)0.0; + CMONE=(_Complex double)-1.0; + + /* Get hardware-dependent values: + * Opt size of workspace for ZHEEV is (NB+1)*j, where NB is the opt. + * block size... */ + eigworklen = (2 + _FT(ilaenv)(&ONE, filaenv, fvu, &jmax, &MONE, &MONE, &MONE, 6, 2)) * jmax; + + /* Allocate the workspaces; every failure is reported through + * jderrorhandler with code 300 and the name of the array. */ + if((void*)(V_ = (_Complex double *)malloc((lda * jmax + 4) * sizeof(_Complex double))) == NULL) + { + errno = 0; + jderrorhandler(300,"V in jdher"); + } + V = V_; + if((void*)(U = (_Complex double *)malloc(jmax * jmax * sizeof(_Complex double))) == NULL) + { + jderrorhandler(300,"U in jdher"); + } + if((void*)(s = (double *)malloc(jmax * sizeof(double))) == NULL) + { + jderrorhandler(300,"s in jdher"); + } + if((void*)(Res_ = (_Complex double *)malloc((lda * blksize+4) * sizeof(_Complex double))) == NULL) + { + jderrorhandler(300,"Res in jdher"); + } + Res = Res_; + + if((void*)(resnrm = (double *)malloc(blksize * sizeof(double))) == NULL) + { + jderrorhandler(300,"resnrm in jdher"); + } + if((void*)(resnrm_old = (double *)calloc(blksize,sizeof(double))) == NULL) + { + jderrorhandler(300,"resnrm_old in jdher"); + } + if((void*)(M = (_Complex double *)malloc(jmax * jmax * sizeof(_Complex double))) == NULL) + { + jderrorhandler(300,"M in jdher"); + } + if((void*)(Vtmp = (_Complex double *)malloc(jmax * jmax * sizeof(_Complex double))) == NULL) + { + jderrorhandler(300,"Vtmp in jdher"); + } + if((void*)(p_work = (_Complex double *)malloc(lda * sizeof(_Complex double))) == NULL) + { + jderrorhandler(300,"p_work in jdher"); + } + + /* ... 
*/ + if((void*)(idx1 = (int *)malloc(jmax * sizeof(int))) == NULL) + { + jderrorhandler(300,"idx1 in jdher"); + } + if((void*)(idx2 = (int *)malloc(jmax * sizeof(int))) == NULL) + { + jderrorhandler(300,"idx2 in jdher"); + } + + /* Indices for (non-)converged approximations */ + if((void*)(convind = (int *)malloc(blksize * sizeof(int))) == NULL) + { + jderrorhandler(300,"convind in jdher"); + } + if((void*)(keepind = (int *)malloc(blksize * sizeof(int))) == NULL) + { + jderrorhandler(300,"keepind in jdher"); + } + if((void*)(solvestep = (int *)malloc(blksize * sizeof(int))) == NULL) + { + jderrorhandler(300,"solvestep in jdher"); + } + if((void*)(actcorrits = (int *)malloc(blksize * sizeof(int))) == NULL) + { + jderrorhandler(300,"actcorrits in jdher"); + } + + if((void*)(eigwork = (_Complex double *)malloc(eigworklen * sizeof(_Complex double))) == NULL) + { + jderrorhandler(300,"eigwork in jdher"); + } + if((void*)(rwork = (double *)malloc(3*jmax * sizeof(double))) == NULL) + { + jderrorhandler(300,"rwork in jdher"); + } + if((void*)(temp1_ = (_Complex double *)malloc((lda+4) * sizeof(_Complex double))) == NULL) + { + jderrorhandler(300,"temp1 in jdher"); + } + temp1 = temp1_; + if((void*)(dtemp = (double *)malloc(lda * sizeof(_Complex double))) == NULL) + { + jderrorhandler(300,"dtemp in jdher"); + } + + /* Set variables for Projection routines */ + n2 = 2*n; + p_n = n; + p_n2 = n2; + p_Q = Q; + p_A_psi_s3 = A_psi; + p_lda = lda; + + /************************************************************************** + * * + * Generate initial search subspace V. Vectors are taken from V0 and if * + * necessary randomly generated. 
* + * * + **************************************************************************/ + + /* copy V0 to V */ + _FT(zlacpy)(fupl_a, &n, &V0dim, V0, &lda, V, &lda, 1); + j = V0dim; + /* if V0dim < blksize: generate additional random vectors */ + if (V0dim < blksize) + { + idummy = (blksize - V0dim)*n; /* nof random numbers */ + _FT(zlarnv)(&IDIST, ISEED, &idummy, V + V0dim*lda); + j = blksize; + } + for (cnt = 0; cnt < j; cnt ++) + { + ModifiedGS_su3vect(V + cnt*lda, n, cnt, V, lda); + alpha = sqrt(square_norm_su3vect((su3_vector*)(V+cnt*lda), N, 1)); + alpha = 1.0 / alpha; + _FT(dscal)(&n2, &alpha, (double *)(V + cnt*lda), &ONE); + } + /* Generate interaction matrix M = V^dagger*A*V. Only the upper triangle + is computed. */ + for (cnt = 0; cnt < j; cnt++) { + /* WARNING: this assumes that A_psi updates the boundaries of the input vector */ + A_psi((su3_vector*) temp1, (su3_vector*) (V+cnt*lda), tslice); + idummy = cnt+1; + for(i = 0; i < idummy; i++) { + M[cnt*jmax+i] = scalar_prod_su3vect((su3_vector*)(V+i*lda), (su3_vector*) temp1, N, 1); + } + } + + /* Other initializations */ + k = 0; (*it) = 0; + if((*k_conv) > 0) + { + k = (*k_conv); + } + + actblksize = blksize; + for(act = 0; act < blksize; act ++) + { + solvestep[act] = 1; + } + + + /**************************************************************************** + * * + * Main JD-iteration loop * + * * + ****************************************************************************/ + + while((*it) < itmax) + { + /**************************************************************************** + * * + * Solving the projected eigenproblem * + * * + * M*u = V^dagger*A*V*u = s*u * + * M is hermitian, only the upper triangle is stored * + * * + ****************************************************************************/ + _FT(zlacpy)(fupl_u, &j, &j, M, &jmax, U, &jmax, 1); + _FT(zheev)(fupl_v, fupl_u, &j, U, &jmax, s, eigwork, &eigworklen, rwork, &info, 1, 1); + + if (info != 0) + { + printf("error solving the 
projected eigenproblem."); + printf(" zheev: info = %d\n", info); + } + if(info != 0) jderrorhandler(502,"proble in zheev"); + + + /* Reverse order of eigenvalues if maximal value is needed */ + if(maxmin == 1) + { + sorteig_su3vect(j, s, U, jmax, s[j-1], dtemp, idx1, idx2, 0); + } + else + { + sorteig_su3vect(j, s, U, jmax, 0., dtemp, idx1, idx2, 0); + } + /**************************************************************************** + * * + * Convergence/Restart Check * + * * + * In case of convergence, strip off a whole block or just the converged * + * ones and put 'em into Q. Update the matrices Q, V, U, s * + * * + * In case of a restart update the V, U and M matrices and recompute the * + * Eigenvectors * + * * + ****************************************************************************/ + + found = 1; + while(found) + { + /* conv/keep = Number of converged/non-converged Approximations */ + conv = 0; keep = 0; + for(act=0; act < actblksize; act++) + { + /* Setting pointers for single vectors */ + q = Q + (act+k)*lda; + u = U + act*jmax; + r = Res + act*lda; + /* Compute Ritz-Vector Q[:,k+cnt1]=V*U[:,cnt1] */ + theta = s[act]; + _FT(zgemv)(fupl_n, &n, &j, &CONE, V, &lda, u, &ONE, &CZERO, q, &ONE, 1); + /* Compute the residual */ + A_psi((su3_vector*) r, (su3_vector*) q,tslice); + theta = -theta; + _FT(daxpy)(&n2, &theta, (double*) q, &ONE, (double*) r, &ONE); + + /* Compute norm of the residual and update arrays convind/keepind*/ + resnrm_old[act] = resnrm[act]; + resnrm[act] = sqrt(square_norm_su3vect((su3_vector*) r, N, 1)); + if (resnrm[act] < tol) + { + convind[conv] = act; + conv = conv + 1; + } + else + { + keepind[keep] = act; + keep = keep + 1; + } + } /* for(act = 0; act < actblksize; act ++) */ + /* Check whether the blkwise-mode is chosen and ALL the + approximations converged, or whether the strip-off mode is + active and SOME of the approximations converged */ + found = ((blkwise==1 && conv==actblksize) || (blkwise==0 && conv!=0)) + && (j > 
actblksize || k == kmax - actblksize); + /*************************************************************************** + * * + * Convergence Case * + * * + * In case of convergence, strip off a whole block or just the converged * + * ones and put 'em into Q. Update the matrices Q, V, U, s * + * * + **************************************************************************/ + if (found) + { + /* Store Eigenvalues */ + for(act = 0; act < conv; act++) + lambda[k+act] = s[convind[act]]; + /* Re-use non approximated Ritz-Values */ + for(act = 0; act < keep; act++) + s[act] = s[keepind[act]]; + /* Shift the others in the right position */ + for(act = 0; act < (j-actblksize); act ++) + s[act+keep] = s[act+actblksize]; + /* Update V. Re-use the V-Vectors not looked at yet. */ + idummy = j - actblksize; + for (act = 0; act < n; act = act + jmax) + { + cnt = act + jmax > n ? n-act : jmax; + _FT(zlacpy)(fupl_a, &cnt, &j, V+act, &lda, Vtmp, &jmax, 1); + _FT(zgemm)(fupl_n, fupl_n, &cnt, &idummy, &j, &CONE, Vtmp, + &jmax, U+actblksize*jmax, &jmax, &CZERO, V+act+keep*lda, &lda, 1, 1); + } + /* Insert the not converged approximations as first columns in V */ + for(act = 0; act < keep; act++) + { + _FT(zlacpy)(fupl_a,&n,&ONE,Q+(k+keepind[act])*lda,&lda,V+act*lda,&lda,1); + } + /* Store Eigenvectors */ + for(act = 0; act < conv; act++) + { + _FT(zlacpy)(fupl_a,&n,&ONE,Q+(k+convind[act])*lda,&lda,Q+(k+act)*lda,&lda,1); + } + /* Update SearchSpaceSize j */ + j = j - conv; + /* Let M become a diagonalmatrix with the Ritzvalues as entries ... */ + _FT(zlaset)(fupl_u, &j, &j, &CZERO, &CZERO, M, &jmax, 1); + for (act = 0; act < j; act++) + { + M[act*jmax + act] = s[act]; + } + /* ... 
and U the Identity(jnew,jnew) */ + _FT(zlaset)(fupl_a, &j, &j, &CZERO, &CONE, U, &jmax, 1); + if(shift_mode == 1) + { + if(maxmin == 0) + { + for(act = 0; act < conv; act ++) + { + if (lambda[k+act] > tau) + { + tau = lambda[k+act]; + } + } + } + else + { + for(act = 0; act < conv; act ++) + { + if (lambda[k+act] < tau) + { + tau = lambda[k+act]; + } + } + } + } + /* Update Converged-Eigenpair-counter and Pro_k */ + k = k + conv; + /* Update the new blocksize */ + actblksize=min(blksize, kmax-k); + /* Exit main iteration loop when kmax eigenpairs have been approximated */ + if (k == kmax) + { + endflag = 1; + break; + } + /* Counter for the linear-solver-accuracy */ + for(act = 0; act < keep; act++) + solvestep[act] = solvestep[keepind[act]]; + /* Now we expect to have the next eigenvalues */ + /* allready with some accuracy */ + /* So we do not need to start from scratch... */ + for(act = keep; act < blksize; act++) + solvestep[act] = 1; + } /* if(found) */ + if(endflag == 1) + { + break; + } + /************************************************************************** + * * + * Restart * + * * + * The Eigenvector-Aproximations corresponding to the first jmin * + * Petrov-Vectors are kept. if (j+actblksize > jmax) * + * * + **************************************************************************/ + if (j+actblksize > jmax) + { + idummy = j; j = jmin; + + for (act = 0; act < n; act = act + jmax) + { /* V = V * U(:,1:j) */ + cnt = act+jmax > n ? 
n-act : jmax; + _FT(zlacpy)(fupl_a, &cnt, &idummy, V+act, &lda, Vtmp, &jmax, 1); + _FT(zgemm)(fupl_n, fupl_n, &cnt, &j, &idummy, &CONE, Vtmp, + &jmax, U, &jmax, &CZERO, V+act, &lda, 1, 1); + } + _FT(zlaset)(fupl_a, &j, &j, &CZERO, &CONE, U, &jmax, 1); + _FT(zlaset)(fupl_u, &j, &j, &CZERO, &CZERO, M, &jmax, 1); + for (act = 0; act < j; act++) + M[act*jmax + act] = s[act]; + } + } /* while(found) */ + + if(endflag == 1) + { + break; + } + + /**************************************************************************** + * * + * Solving the correction equations * + * * + * * + ****************************************************************************/ + + /* Solve actblksize times the correction equation ... */ + for (act = 0; act < actblksize; act ++) + { + /* Setting start-value for vector v as zeros(n,1). Guarantees orthogonality */ + v = V + j*lda; + for (cnt = 0; cnt < n; cnt ++) + { + v[cnt] = (_Complex double)0.; + } + /* Adaptive accuracy and shift for the lin.solver. In case the + residual is big, we don't need a too precise solution for the + correction equation, since even in exact arithmetic the + solution wouldn't be too usefull for the Eigenproblem. */ + r = Res + act*lda; + if (resnrm[act] < eps_tr && resnrm[act] < s[act] && resnrm_old[act] > resnrm[act]) + { + p_theta = s[act]; + } + else + { + p_theta = tau; + } + p_k = k + actblksize; + + /* if we are in blockwise mode, we do not want to */ + /* iterate solutions much more, if they have */ + /* allready the desired precision */ + if(blkwise == 1 && resnrm[act] < tol) + { + it_tol = pow(toldecay, (double)(-5)); + } + else + { + it_tol = pow(toldecay, (double)(-solvestep[act])); + } + solvestep[act] = solvestep[act] + 1; + + /* equation and project if necessary */ + ModifiedGS_su3vect(r, n, k + actblksize, Q, lda); + + /* Solve the correction equation ... 
*/ + g_sloppy_precision = 1; + if(solver_flag == CG) + { + info = cg_her_su3vect((su3_vector*) v, (su3_vector*) r, linitmax, it_tol*it_tol, 0, + n*sizeof(_Complex double)/sizeof(su3_vector),tslice, &Proj_A_psi_su3vect); + } + g_sloppy_precision = 0; + + /* Actualizing profiling data */ + if (info == -1) + { + CntCorrIts += linitmax; + } + else + { + CntCorrIts += info; + } + actcorrits[act] = info; + + /* orthonormalize v to Q, cause the implicit + orthogonalization in the solvers may be too inaccurate. Then + apply "IteratedCGS" to prevent numerical breakdown + in order to orthogonalize v to V */ + + ModifiedGS_su3vect(v, n, k+actblksize, Q, lda); + IteratedClassicalGS_su3vect(v, &alpha, n, j, V, temp1, lda); + + alpha = 1.0 / alpha; + _FT(dscal)(&n2, &alpha, (double*) v, &ONE); + + /* update interaction matrix M */ + A_psi((su3_vector*) temp1, (su3_vector*) v, tslice); + idummy = j+1; + for(i = 0; i < idummy; i++) { + M[j*jmax+i] = scalar_prod_su3vect((su3_vector*) (V+i*lda), (su3_vector*) temp1, N, 1); + } + + /* Increasing SearchSpaceSize j */ + j ++; + } /* for (act = 0;act < actblksize; act ++) */ + + /* Print information line */ + if(g_proc_id == 0) { + print_status_su3vect(verbosity, *it, k, j - blksize, kmax, blksize, actblksize, + s, resnrm, actcorrits); + } + /* Increase iteration-counter for outer loop */ + (*it) = (*it) + 1; + } /* Main iteration loop */ + + /****************************************************************** + * * + * Eigensolutions converged or iteration limit reached * + * * + * Print statistics. Free memory. Return. 
* + * * + ******************************************************************/ + + (*k_conv) = k; + if (g_proc_id == 0 && verbosity > 0) { + printf("\nJDHER execution statistics\n\n"); + printf("IT_OUTER=%d IT_INNER_TOT=%d IT_INNER_AVG=%8.2f\n", + (*it), CntCorrIts, (double)CntCorrIts/(*it)); + printf("\nConverged eigensolutions in order of convergence:\n"); + printf("# I LAMBDA(I) RES(I)\n"); + printf("#---------------------------------------\n"); + } + for (act = 0; act < *k_conv; act ++) + { + /* Compute the residual for solution act */ + q = Q + act*lda; + theta = -lambda[act]; + A_psi((su3_vector*) r, (su3_vector*) q,tslice); + _FT(daxpy)(&n2, &theta, (double*) q, &ONE, (double*) r, &ONE); + alpha = sqrt(square_norm_su3vect((su3_vector*) r, N, 1)); + if(g_proc_id == 0 && verbosity > 1) { + printf("%3d %22.15e %12.5e\n", act+1, lambda[act], alpha); + } + } + if(g_proc_id == 0 && verbosity > 0) + { + printf("\n"); + fflush( stdout ); + } + free(V_); free(Vtmp); free(U); + free(s); free(Res_); + free(resnrm); free(resnrm_old); + free(M); free(Z); + free(eigwork); free(temp1_); + free(dtemp); free(rwork); + free(p_work); + free(idx1); free(idx2); + free(convind); free(keepind); free(solvestep); free(actcorrits); + +} /* jdher(.....) */ + +/**************************************************************************** + * * + * Supporting functions * + * * + ****************************************************************************/ + +/* PRINT_STATUS - print status line (called for each outer iteration) + */ +static void print_status_su3vect(int verbosity, int it, int k, int j, int kmax, + int blksize, int actblksize, + double *s, double *resnrm, int *actcorrits) { + const int max_vals = 5; + + int i, idummy; + + if (verbosity > 2) { + if (blksize == 1) { + if (it == 0) { + printf(" IT K J RES LINIT RITZ-VALUES(1:5)\n"); + idummy = 28 + ( 13 > max_vals*10 ? 
13 : max_vals*10); + for (i = 0; i < idummy; i ++) + putchar('-'); + printf("\n"); + } + printf("%4d %3d %3d %9.2e %5d", it + 1, k, j, resnrm[0], actcorrits[0]); + for (i = 0; i < (j < max_vals ? j : max_vals); i ++){ + printf(" %9.2e", s[i]); + } + printf("\n"); + fflush( stdout ); + } + else { /* blksize > 1 */ + if (it == 0) { + printf(" IT K J RITZVALS "); + for (i = 1; i < actblksize; i ++) + printf(" "); + printf(" RES "); + for (i = 1; i < actblksize; i ++) + printf(" "); + printf(" LINIT\n"); + idummy = 12 + 4 + blksize*(10 + 10 + 5); + for (i = 0; i < idummy; i ++) + putchar('-'); + printf("\n"); + } + printf("%4d %3d %3d", it + 1, k, j); + for (i = 0; i < blksize; i ++) + if (i < actblksize) + printf(" %9.2e", s[i]); + else + printf(" "); + printf(" "); + for (i = 0; i < blksize; i ++) + if (i < actblksize) + printf(" %9.2e", resnrm[i]); + else + printf(" "); + printf(" "); + for (i = 0; i < blksize; i ++) + if (i < actblksize) + printf(" %5d", actcorrits[i]); + else + printf(" "); + printf("\n"); + fflush( stdout ); + } + } +} + +/* + * SORTEIG + * + * Default behaviour (strategy == 0): + * + * Sort eigenpairs (S(i),U(:,i)), such that + * + * |S(i) - tau| <= |S(i+1) -tau| for i=1..j-1. + * + * j : dimension of S + * ldu: leading dimension of U + * dtemp: double array of length j + * idx: int array of length j + * + * Alternate behaviour (strategy == 1): + * + * Same as above but put all S(i) < tau to the end. This is used to + * avoid computation of zero eigenvalues. 
+ */ + +static void sorteig_su3vect(int j, double S[], _Complex double U[], int ldu, double tau, + double dtemp[], int idx1[], int idx2[], int strategy){ + int i; + + /* setup vector to be sorted and index vector */ + switch (strategy) { + case 0: + for (i = 0; i < j; i ++) + dtemp[i] = fabs(S[i] - tau); + break; + case 1: + for (i = 0; i < j; i ++) + if (S[i] < tau) + dtemp[i] = DBL_MAX; + else + dtemp[i] = fabs(S[i] - tau); + break; + default: + jderrorhandler(503,"");; + } + for (i = 0; i < j; i ++) + idx1[i] = i; + + /* sort dtemp in ascending order carrying itemp along */ + quicksort(j, dtemp, idx1); + + /* compute 'inverse' index vector */ + for (i = 0; i < j; i ++) + idx2[idx1[i]] = i; + + /* sort eigenvalues */ + memcpy(dtemp, S, j * sizeof(double)); + for (i = 0; i < j; i ++) + S[i] = dtemp[idx1[i]]; + + /* sort eigenvectors (in place) */ + for (i = 0; i < j; i ++) { + if (i != idx1[i]) { + memcpy(dtemp, U+i*ldu, j*sizeof(_Complex double)); + memcpy(U+i*ldu, U+idx1[i]*ldu, j*sizeof(_Complex double)); + memcpy(U+idx1[i]*ldu, dtemp, j*sizeof(_Complex double)); + idx1[idx2[i]] = idx1[i]; + idx2[idx1[i]] = idx2[i]; + } + } +} + + + + +void Proj_A_psi_su3vect(su3_vector * const y, su3_vector * const x, int tslice){ + double mtheta = -p_theta; + int i; + /* y = A*x */ + p_A_psi_s3(y, x, tslice); + /* y = -theta*x+y*/ + _FT(daxpy)(&p_n2, &mtheta, (double*) x, &ONE, (double*) y, &ONE); + /* p_work = Q^dagger*y */ + for(i = 0; i < p_k; i++) { + p_work[i] = scalar_prod_su3vect((su3_vector*) (p_Q+i*p_lda), (su3_vector*) y, p_n*sizeof(_Complex double)/sizeof(su3_vector), 1); + } + /* y = y - Q*p_work */ + _FT(zgemv)(fupl_n, &p_n, &p_k, &CMONE, p_Q, &p_lda, (_Complex double*) p_work, &ONE, &CONE, (_Complex double*) y, &ONE, 1); +} + +#endif // WITHLAPH diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher_su3vect.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher_su3vect.h new file mode 100755 index 
0000000000000000000000000000000000000000..c58c574241501531ab3a3398e6afbab88d53a07e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/jdher_su3vect.h @@ -0,0 +1,48 @@ +/*********************************************************************** + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _JDHERSU3VJACOBI_H +#define _JDHERSU3VJACOBI_H + +#ifndef JD_MAXIMAL +#define JD_MAXIMAL 1 +#endif +#ifndef JD_MINIMAL +#define JD_MINIMAL 0 +#endif + +#include +#include +#include "su3.h" +#include "solver/solver.h" + +void jderrorhandler(const int i, char * message); + +extern void jdher_su3vect(int n, int lda, double tau, double tol, + int kmax, int jmax, int jmin, int itmax, + int blksize, int blkwise, + int V0dim, _Complex double *V0, + int solver_flag, + int linitmax, double eps_tr, double toldecay, + int verbosity, + int *k_conv, _Complex double *Q, double *lambda, int *it, + int maxmin, int shift_mode,int tslice, + matrix_mult_su3vect A_psi); + +#endif + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/lu_solve.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/lu_solve.c new file mode 100644 index 0000000000000000000000000000000000000000..8207ffd95d1fdeba46628b3150d122ca25cf3a47 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/lu_solve.c @@ 
-0,0 +1,351 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include "global.h" +#include +#include"solver/lu_solve.h" + +/* Solve M a = b by LU decomposition with partial pivoting */ + +double norm2(_Complex double* b, const int Nvec) { + int i; + double res=0.; + for(i = 0; i < Nvec; i++) { + res+= creal(b[i])*creal(b[i]) + cimag(b[i])*cimag(b[i]); + } + return(res); +} + +void LUSolve( const int Nvec, _Complex double * M, const int ldM, _Complex double * b) { + int i, j, k, maxrow, row; + _Complex double * b_local, * y; + double maxnorm; + _Complex double tmp, sum_LU; + + b_local = (_Complex double*)malloc(Nvec*sizeof(_Complex double)); + y = (_Complex double*)malloc(Nvec*sizeof(_Complex double)); + for(i = 0; i < Nvec; i++) { + b_local[i] = b[i]; + } + + /* ----------------------------------------------------------------- + * LU Decompose M_local, in place (Crone's algorithm?) 
+ * It's in Numerical Recipes but also a more understandable + * description can be found at: + * http://csep10.phys.utk.edu/guidry/ + * phys594/lectures/linear_algebra/lanotes/node3.html + * + * OR look in your favourite Matrix Analysis text + * ----------------------------------------------------------------- + * + * ------------------------------------------------------------- + * Start LU Decomp. Definition. 0-th row of U is 0-th row of M + * and L_{i,i} = 1 for all i + * + * So we start with the 1-th (2nd) row + * ------------------------------------------------------------ */ + + for(i = 1; i < Nvec; i++) { + + /* ------------------------------------------------------------ + * Parital Pivot: Find the row with the largest element in the + * ith-column and make that the i-th row. This swaps rows. + * so I don't need to reorder the unknowns, but I do need + * to reorder the b_local + * ------------------------------------------------------------*/ + maxnorm = norm2(&M[i*ldM+i], 1); + maxrow = i; + + /* Compare norms with other elements in column j for row i+1.N */ + for(row = i+1; row < Nvec; row++) { + if ( norm2(&M[row*ldM + i],1) > maxnorm ) { + /* Norm of M(j,i) is bigger, store it as the maximum */ + /* and store its index */ + maxnorm = norm2(&M[row*ldM + i],1); + maxrow = row; + } + } + + /* If the element with maximum norm is not in row i, swap */ + /* its row with row i */ + if( maxrow != i ) { + + /* Swap rows i and maxindex */ + for(j = 0; j < Nvec; j++ ) { + tmp = M[i*ldM + j]; + M[i*ldM + j] = M[maxrow*ldM + j]; + M[maxrow*ldM + j] = tmp; + } + + /* Swap elems of b */ + tmp = b_local[i]; + b_local[i] = b_local[maxrow]; + b_local[maxrow] = tmp; + } + + /* -------------------------------------------------------- + * End of pivoting code + * -------------------------------------------------------- + + + * -------------------------------------------------------- + * Work out elements of L & U in place in M for row i + * 
-------------------------------------------------------- */ + for(j = 0; j < i; j++) { + + (sum_LU) = 0.; + for(k = 0; k < j; k++) { + /* sum_LU += M(i,k)*M(k,j); */ + (tmp) = (M[i*ldM + k]) * (M[k*ldM + j]); + (sum_LU) += tmp; + + } + /* M(i,j) -= sum_LU; */ + (M[i*ldM + j]) -= sum_LU; + /* M(i,j) /= M(j,j); */ + (tmp) = (M[i*ldM + j]) / (M[j*ldM + j]); + M[i*ldM + j] = tmp; + } + + for(j=i; j < Nvec; j++) { + (sum_LU) = 0.; + for(k = 0; k < i; k++) { + (tmp) = (M[i*ldM + k]) * (M[k*ldM + j]); + (sum_LU) += tmp; + } + /* M(i,j) -= sum_LU; */ + (M[i*ldM+j]) -= sum_LU; + } + } + + /* ---------------------------------------------------- + * LU Decomp finished. M now holds the + * U matrix in its diagonal and superdiagonal elements + * and the subdiagonal elements of the L matrix in its + * subdiagonal. Recall that the Diagonal elements of L + * are chosen to be 1 + * ----------------------------------------------------- + + * Solve L y = b by forward substitution */ + y[0] = b[0]; + for(i = 1; i < Nvec; i++) { + y[i] = b_local[i]; + for(j = 0; j < i; j++) { + (y[i]) -= (M[i*ldM+j]) * (y[j]); + } + } + + /* Solve U a = y by back substitution */ + /* a[Nvec-1] = y[Nvec-1] / M(Nvec-1, Nvec-1); */ + (tmp) = (y[Nvec-1]) / (M[(Nvec-1)*ldM + (Nvec-1)]); + b[Nvec-1] = tmp; + + for(i = Nvec-2; i >= 0; i--) { + tmp = y[i]; + for(j = i+1; j < Nvec; j++) { + /* tmp -= M(i,j)*b[j]; */ + (tmp) -= (M[i*ldM + j]) * (b[j]); + } + (b[i]) = (tmp) / (M[i*ldM+i]); + } + free(b_local); + free(y); +} + + +void LUInvert( const int Nvec, _Complex double * const M, const int ldM) { + int i, j, k, maxrow, row, col; + _Complex double * y; + double maxnorm; + _Complex double tmp, sum_LU, cone; + int * pivot; + _Complex double *A = NULL; + cone = 1.; + + pivot = (int*)malloc(Nvec*sizeof(int)); + y = (_Complex double*)malloc(Nvec*sizeof(_Complex double)); + if(g_debug_level > 4) { + A = (_Complex double*)malloc(Nvec*Nvec*sizeof(_Complex double)); + for(i = 0; i < Nvec; i++) { + for(j = 0; j < 
Nvec; j++) { + A[i*Nvec + j] = M[i*ldM + j]; + } + } + } + /* ----------------------------------------------------------------- + * LU Decompose M_local, in place (Crone's algorithm?) + * It's in Numerical Recipes but also a more understandable + * description can be found at: + * http://csep10.phys.utk.edu/guidry/ + * phys594/lectures/linear_algebra/lanotes/node3.html + * + * OR look in your favourite Matrix Analysis text + * ----------------------------------------------------------------- + * + * ------------------------------------------------------------- + * Start LU Decomp. Definition. 0-th row of U is 0-th row of M + * and L_{i,i} = 1 for all i + * + * So we start with the 1-th (2nd) row + * ------------------------------------------------------------ */ + + for(i = 1; i < Nvec; i++) { + + /* ------------------------------------------------------------ + * Parital Pivot: Find the row with the largest element in the + * ith-column and make that the i-th row. This swaps rows. + * so I don't need to reorder the unknowns, but I do need + * to reorder the b_local + * ------------------------------------------------------------*/ + maxnorm = norm2(&M[i*ldM+i], 1); + maxrow = i; + + /* Compare norms with other elements in column j for row i+1.N */ + for(row = i+1; row < Nvec; row++) { + if ( norm2(&M[row*ldM + i],1) > maxnorm ) { + /* Norm of M(j,i) is bigger, store it as the maximum */ + /* and store its index */ + maxnorm = norm2(&M[row*ldM + i],1); + maxrow = row; + } + } + pivot[i] = maxrow; + + /* If the element with maximum norm is not in row i, swap */ + /* its row with row i */ + if( maxrow != i ) { + + /* Swap rows i and maxindex */ + for(j = 0; j < Nvec; j++ ) { + tmp = M[i*ldM + j]; + M[i*ldM + j] = M[maxrow*ldM + j]; + M[maxrow*ldM + j] = tmp; + } + } + + /* -------------------------------------------------------- + * End of pivoting code + * -------------------------------------------------------- + + + * 
-------------------------------------------------------- + * Work out elements of L & U in place in M for row i + * -------------------------------------------------------- */ + for(j = 0; j < i; j++) { + + (sum_LU) = 0.; + for(k = 0; k < j; k++) { + /* sum_LU += M(i,k)*M(k,j); */ + (tmp) = (M[i*ldM + k]) * (M[k*ldM + j]); + (sum_LU) += tmp; + + } + /* M(i,j) -= sum_LU; */ + (M[i*ldM + j]) -= sum_LU; + /* M(i,j) /= M(j,j); */ + (tmp) = (M[i*ldM + j]) / (M[j*ldM + j]); + M[i*ldM + j] = tmp; + } + + for(j=i; j < Nvec; j++) { + (sum_LU) = 0.; + for(k = 0; k < i; k++) { + (tmp) = (M[i*ldM + k]) * (M[k*ldM + j]); + (sum_LU) += tmp; + } + /* M(i,j) -= sum_LU; */ + (M[i*ldM+j]) -= sum_LU; + } + } + + /* ---------------------------------------------------- + * LU Decomp finished. M now holds the + * U matrix in its diagonal and superdiagonal elements + * and the subdiagonal elements of the L matrix in its + * subdiagonal. Recall that the Diagonal elements of L + * are chosen to be 1 + * -----------------------------------------------------*/ + + /* now compute inv(U) */ + + for(row = 0; row < Nvec; row++) { + (tmp) = (cone) / (M[row*ldM + row]); + M[row*ldM + row] = tmp; + for(col = row+1; col < Nvec; col++) { + tmp = 0.; + for(j = row; j < col; j ++) { + (tmp) -= (M[row*ldM + j]) * (M[j*ldM + col]); + } + (M[row*ldM + col]) = (tmp) / (M[col*ldM + col]); + } + } + + /* last col of inv(A) already in place */ + for(col = Nvec-2; col > -1; col--) { + for(row = 0; row < Nvec; row++) { + if(row > col) { + y[row] = 0.; + } + else y[row] = M[row*ldM + col]; + for(j = col+1; j < Nvec; j++) { + (y[row]) -= (M[row*ldM + j]) * (M[j*ldM + col]); + } + } + for(row = 0; row < Nvec; row++) { + M[row*ldM+col] = y[row]; + } + } + + /* Swap cols of inv(A) according to pivot */ + for(j = Nvec-1; j > 0; j-- ) { + if(pivot[j] != j) { + for(i = 0; i < Nvec; i++) { + tmp = M[i*ldM + j]; + M[i*ldM + j] = M[i*ldM + pivot[j]]; + M[i*ldM + pivot[j]] = tmp; + } + } + } + + if(g_debug_level > 4 && 
g_proc_id == 0) { + printf("check little_A inversion \n"); + for(i = 0; i < Nvec; i++) { + for(j = 0; j < Nvec; j++) { + (tmp) = 0.0; + for(k = 0; k < Nvec; k++) { + (tmp) += (A[i*ldM + k]) * (M[k*Nvec + j]); + } + printf("%1.3e %1.3ei, ", creal(tmp), cimag(tmp)); + } + printf("\n"); + } + printf("\n"); + } + + free(pivot); + free(y); + if(g_debug_level > 4) free(A); + return; +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/lu_solve.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/lu_solve.h new file mode 100644 index 0000000000000000000000000000000000000000..ba45dc35399336e1128ad187f88bc7e1a91052f2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/lu_solve.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef LU_SOLVE_H +#define LU_SOLVE_H + +/* Solve M a = b by LU decomposition with partial pivoting */ +void LUSolve( const int Nvec, _Complex double * M, const int ldM, _Complex double * b); + +void LUInvert( const int Nvec, _Complex double * const M, const int ldM); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/matrix_mult_typedef.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/matrix_mult_typedef.h new file mode 100644 index 0000000000000000000000000000000000000000..66d619b79745bb5c5d02670879dc34a3b7637309 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/matrix_mult_typedef.h @@ -0,0 +1,37 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +/************************************************ + * + * Typedefinition of the pointer to the function + * which contains the matrix multiplication. 
+ * + ************************************************/ + +#ifndef _MATRIX_MULT_TYPEDEF_H +#define _MATRIX_MULT_TYPEDEF_H + +typedef void (*matrix_mult)(spinor * const, spinor * const); +typedef void (*matrix_mult32)(spinor32 * const, spinor32 * const); +typedef void (*matrix_mult_blk)(spinor * const, spinor * const, const int); +typedef void (*matrix_mult_clover)(spinor * const, spinor * const, const double); +typedef void (*c_matrix_mult)(_Complex double * const, _Complex double * const); +typedef void (*matrix_mult_su3vect)(su3_vector * const, su3_vector * const, const int); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/matrix_mult_typedef_bi.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/matrix_mult_typedef_bi.h new file mode 100644 index 0000000000000000000000000000000000000000..6ccad8cf2104815dbe9571c5af3d225577b45406 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/matrix_mult_typedef_bi.h @@ -0,0 +1,38 @@ +/*********************************************************************** + * + * Copyright (C) 2006 Thomas Chiarappa + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * Typedefinition of the pointer to the function + * which contains the matrix multiplication. 
+ * + * Author: Thomas Chiarappa + * Thomas.Chiarappa@mib.infn.it + * + ************************************************/ + +#ifndef _MATRIX_MULT_TYPEDEF_BI_H +#define _MATRIX_MULT_TYPEDEF_BI_H + +typedef void (*matrix_mult_bi)(bispinor * const, bispinor * const); + +/* SO FAR, THE CLOVER TERM IS NOT REALLY NEEDED FOR THE ND-CASE */ +/* +typedef void (*matrix_mult_clover)(spinor * const, spinor * const, const double); +*/ + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/matrix_mult_typedef_nd.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/matrix_mult_typedef_nd.h new file mode 100644 index 0000000000000000000000000000000000000000..b9d8b814bfb616c3483606dcc6d137613aea8331 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/matrix_mult_typedef_nd.h @@ -0,0 +1,34 @@ +/*********************************************************************** + * + * Copyright (C) 2007 Andreas Nube + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * Typedefinition of the pointer to the function + * which contains the matrix multiplication. 
+ * + * Author: Andreas Nube + * annube@ifh.de + * + ************************************************************************/ + +#ifndef _MATRIX_MULT_TYPEDEF_ND_H +#define _MATRIX_MULT_TYPEDEF_ND_H + +typedef void (*matrix_mult_nd)(spinor * const, spinor * const,spinor * const, spinor * const); +typedef void (*matrix_mult_nd32)(spinor32 * const, spinor32 * const, spinor32 * const, spinor32 * const); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mixed_cg_her.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mixed_cg_her.c new file mode 100644 index 0000000000000000000000000000000000000000..a936a6a7da8dd6c17f12f2d9f5cd7392947713e4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mixed_cg_her.c @@ -0,0 +1,202 @@ +/*********************************************************************** + * Copyright (C) 2013 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * + * + * File: mixed_cg_her.c + * + * CG solver for hermitian f only! + * + * The externally accessible functions are + * + * + * int cg(spinor * const P, spinor * const Q, double m, const int subtract_ev) + * CG solver + * + * input: + * m: Mass to be use in D_psi + * subtrac_ev: if set to 1, the lowest eigenvectors of Q^2 will + * be projected out. 
+ * Q: source + * inout: + * P: initial guess and result + * + * + **************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "operator/tm_operators_32.h" +#include "solver/matrix_mult_typedef.h" +#include "solver/solver_params.h" +#include "read_input.h" + +#include "solver_field.h" +#include "solver/mixed_cg_her.h" +#include "gettime.h" + + + + +/* P output = solution , Q input = source */ +int mixed_cg_her(spinor * const P, spinor * const Q, solver_params_t solver_params, + const int max_iter, double eps_sq, const int rel_prec, const int N, + matrix_mult f, matrix_mult32 f32) { + + int i = 0, iter = 0, j = 0; + float sqnrm = 0., sqnrm2, squarenorm; + float pro, err, alpha_cg, beta_cg; + double sourcesquarenorm, sqnrm_d, squarenorm_d; + spinor *delta, *y, *xhigh; + spinor32 *x, *stmp; + spinor ** solver_field = NULL; + spinor32 ** solver_field32 = NULL; + const int nr_sf = 3; + const int nr_sf32 = 4; + + int max_inner_it = mixcg_maxinnersolverit; + int N_outer = max_iter/max_inner_it; + //to be on the save side we allow at least 10 outer iterations + if(N_outer < 10) N_outer = 10; + + int save_sloppy = g_sloppy_precision_flag; + double atime, etime, flops; + + if(N == VOLUME) { + init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); + init_solver_field_32(&solver_field32, VOLUMEPLUSRAND, nr_sf32); + } + else { + init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); + init_solver_field_32(&solver_field32, VOLUMEPLUSRAND/2, nr_sf32); + } + + squarenorm_d = square_norm(Q, N, 1); + sourcesquarenorm = squarenorm_d; + sqnrm_d = squarenorm_d; + + delta = solver_field[0]; + y = solver_field[1]; + xhigh = solver_field[2]; + x = solver_field32[3]; + assign(delta, Q, N); + + //set solution to zero + zero_spinor_field(P, N); + + atime = gettime(); + for(i = 0; i < N_outer; i++) { + + /* 
main CG loop in lower precision */ + zero_spinor_field_32(x, N); + zero_spinor_field_32(solver_field32[0], N); + assign_to_32(solver_field32[1], delta, N); + assign_to_32(solver_field32[2], delta, N); + + sqnrm = (float) sqnrm_d; + sqnrm2 = sqnrm; + + /*inner CG loop */ + for(j = 0; j <= max_inner_it; j++) { + + f32(solver_field32[0], solver_field32[2]); + pro = scalar_prod_r_32(solver_field32[2], solver_field32[0], N, 1); + alpha_cg = sqnrm2 / pro; + + assign_add_mul_r_32(x, solver_field32[2], alpha_cg, N); + + assign_mul_add_r_32(solver_field32[0], -alpha_cg, solver_field32[1], N); + + err = square_norm_32(solver_field32[0], N, 1); + + if(g_proc_id == g_stdio_proc && g_debug_level > 2) { + printf("inner CG: %d res^2 %g\n", iter+j, err); + fflush(stdout); + } + + //if (((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))){ + if((err <= mixcg_innereps*sqnrm)|| (j==max_inner_it) || ((1.3*err <= eps_sq) && (rel_prec == 0)) || ((1.3*err <= eps_sq*sourcesquarenorm) && (rel_prec == 1))) { + break; + } + beta_cg = err / sqnrm2; + assign_mul_add_r_32(solver_field32[2], beta_cg, solver_field32[0], N); + stmp = solver_field32[0]; + solver_field32[0] = solver_field32[1]; + solver_field32[1] = stmp; + sqnrm2 = err; + } + /* end inner CG loop */ + iter += j; + + /* we want to apply a true double matrix with f(y,P) -> set sloppy off here*/ + g_sloppy_precision_flag = 0; + + /* calculate defect in double precision */ + assign_to_64(xhigh, x, N); + add(P, P, xhigh, N); + f(y, P); + diff(delta, Q, y, N); + sqnrm_d = square_norm(delta, N, 1); + if(g_debug_level > 2 && g_proc_id == 0) { + printf("mixed CG: last inner residue: %g\t\n", err); + printf("mixed CG: true residue %d %g\t\n",iter, sqnrm_d); fflush(stdout); + } + + /* here we can reset it to its initial value*/ + g_sloppy_precision_flag = save_sloppy; + + if(((sqnrm_d <= eps_sq) && (rel_prec == 0)) || ((sqnrm_d <= eps_sq*sourcesquarenorm) && (rel_prec == 1))) { + etime = gettime(); + + 
if(g_debug_level > 0 && g_proc_id == 0) { + if(N != VOLUME){ + /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */ + /* 2*1608.0 because the linalg is over VOLUME/2 */ + flops = (2*(2*1608.0+2*3*4) + 2*3*4 + iter*(2.*(2*1608.0+2*3*4) + 10*3*4))*N/1.0e6f; + printf("# mixed CG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", iter, eps_sq, etime-atime); + printf("# mixed CG: flopcount (for e/o tmWilson only): t/s: %1.4e mflops_local: %.1f mflops: %.1f\n", + etime-atime, flops/(etime-atime), g_nproc*flops/(etime-atime)); + } + else{ + /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */ + flops = (2*(1608.0+2*3*4) + 2*3*4 + iter*(2.*(1608.0+2*3*4) + 10*3*4))*N/1.0e6f; + printf("# mixed CG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", iter, eps_sq, etime-atime); + printf("# mixed CG: flopcount (for non-e/o tmWilson only): t/s: %1.4e mflops_local: %.1f mflops: %.1f\n", + etime-atime, flops/(etime-atime), g_nproc*flops/(etime-atime)); + } + } + + finalize_solver(solver_field, nr_sf); + finalize_solver_32(solver_field32, nr_sf32); + return(iter+i); + } + iter++; + } + finalize_solver(solver_field, nr_sf); + finalize_solver_32(solver_field32, nr_sf32); + return(-1); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mixed_cg_her.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mixed_cg_her.h new file mode 100644 index 0000000000000000000000000000000000000000..58e069031e9dd07091a8e1c536951e2ab88478f6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mixed_cg_her.h @@ -0,0 +1,31 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _MIXED_CG_HER_H +#define _MIXED_CG_HER_H + +#include"operator/tm_operators_32.h" +#include"solver/matrix_mult_typedef.h" +#include"solver/solver_params.h" +#include"su3.h" + +int mixed_cg_her(spinor * const P, spinor * const Q, solver_params_t solver_params, + const int max_iter, double eps_sq, const int rel_prec, const int N, + matrix_mult f, matrix_mult32 f32); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mixed_cg_mms_tm_nd.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mixed_cg_mms_tm_nd.c new file mode 100644 index 0000000000000000000000000000000000000000..34fad844cb9e5e0624f820c02615400eba96628f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mixed_cg_mms_tm_nd.c @@ -0,0 +1,564 @@ +/*********************************************************************** + * + * Copyright (C) 2015 Florian Burger + * partially based on cg_mms_tm_nd.c by Andrea Shindler and Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * Author: 2015 Florian Burger + * + * This is a Multi-Shift reliable update single/double mixed CG solver + * it expects that the shifts fulfil + * + * shift[0] < shift[1] < shift{2] < ... < shift[no_shifts-1] + * + * in modulus. The code will use shift[i]^2, which are all >0 + * + * parameters: + * shifts are given to the solver in solver_pm->shifts + * number of shifts is in solver_pm->no_shifts + * the operator to invert in solver_pm->M_ndpsi + * the 32 bit operator to invert in solver_pm->M_ndpsi32 + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "gamma.h" +#include "linalg_eo.h" +#include "start.h" +#include "gettime.h" +#include "solver/solver.h" +#include "solver_field.h" +#include "cg_mms_tm_nd.h" +#include "mixed_cg_mms_tm_nd.h" + + +static spinor32 * x_up_qmms; +static spinor32 ** mms_x_up; +static spinor32 * x_dn_qmms; +static spinor32 ** mms_x_dn; + +static spinor32 * d_up_qmms; +static spinor32 ** mms_d_up; +static spinor32 * d_dn_qmms; +static spinor32 ** mms_d_dn; + + +static void init_mms_tm_nd_32(const unsigned int nr, const unsigned int N); +static void free_mms_tm_nd_32(); + +int mixed_cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn, + spinor * const Qup, spinor * const Qdn, + solver_pm_t * solver_pm) { + + double eps_sq = solver_pm->squared_solver_prec; + int noshifts = solver_pm->no_shifts; + int rel_prec = solver_pm->rel_prec; + int max_iter = solver_pm->max_iter; + int check_abs, check_rel; + double * shifts = solver_pm->shifts; + int Nshift = noshifts; + + // algorithm + double rr_up, rr_dn, rr, rr_old, r0r0, dAd_up, dAd_dn, dAd; + + if(rel_prec){ + check_rel = 1; + check_abs = 0; + } + else{ + check_rel = 0; + check_abs = 1; + } + + int use_eo=1, eofactor=2; + //not 
even-odd? + if(solver_pm->sdim == VOLUME) { + eofactor = 1; + use_eo = 0; + } + + int N = VOLUME/eofactor; + int Vol = VOLUMEPLUSRAND/eofactor; + + + // norm of source + rr_up = square_norm(Qup, N, 1); + rr_dn = square_norm(Qdn, N, 1); + rr = rr_up + rr_dn; + + if( (g_cart_id == 0 && g_debug_level > 2)) printf("# CGMMSND_mixed: Initial mms residue: %.6e\n", rr); + if(rr < 1.0e-4){ + if( (g_cart_id == 0 && g_debug_level > 2)) printf("# CGMMSND_mixed: norm of source too low: falling back to double mms solver %.6e\n", rr); + return(cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_pm)); + } + + r0r0 = rr; // for relative precision + rr_old = rr; // for the first iteration + + + + //allocate an auxiliary solver fields + spinor ** sf = NULL; + const int nr_sf = 6; + init_solver_field(&sf, Vol, nr_sf); + + spinor32 ** sf32 = NULL; + const int nr_sf32 = 8; + init_solver_field_32(&sf32, Vol, nr_sf32); + + + //spinor fields + //we need one less than shifts, since one field is cared of by the usual cg fields + init_mms_tm_nd_32(noshifts-1, Vol); + + // Pup/dn can be used as auxiliary field to work on, as it is not later used (could be used as initial guess at the very start) + // Q_up/dn can be used as feedback, or if not, also as auxiliary field + + + + //allocate cg constants + double * sigma; + double * zitam1, * zita; + double * alphas, * betas; + double gamma; + double alpham1; + sigma = (double*)calloc((noshifts), sizeof(double)); + zitam1 = (double*)calloc((noshifts), sizeof(double)); + zita = (double*)calloc((noshifts), sizeof(double)); + alphas = (double*)calloc((noshifts), sizeof(double)); + betas = (double*)calloc((noshifts), sizeof(double)); + + + + spinor32 * r_up, * r_dn, * Ad_up, * Ad_dn, * x_up, * x_dn, * d_up, * d_dn; + spinor * r_up_d, * r_dn_d, * x_up_d, * x_dn_d, * Ax_up_d, * Ax_dn_d; + + // iteration counter + int j; + + //reliable update flag + int rel_update = 0; + //no of reliable updates done + int no_rel_update = 0; + //use reliable update flag + int 
use_reliable = 1; + + double rel_delta = 1.0e-10; + int trigger_shift = -1; + double * res; + double * res0; + double * maxres; + res = (double*)calloc((noshifts), sizeof(double)); + res0 = (double*)calloc((noshifts), sizeof(double)); + maxres = (double*)calloc((noshifts), sizeof(double)); + + ///////////////// + // ASSIGNMENTS // + ///////////////// + + x_up = sf32[0]; + x_dn = sf32[1]; + r_up = sf32[2]; + r_dn = sf32[3]; + d_up = sf32[4]; + d_dn = sf32[5]; + Ad_up = sf32[6]; + Ad_dn = sf32[7]; + + + x_up_d = sf[0]; + x_dn_d = sf[1]; + r_up_d = sf[2]; + r_dn_d = sf[3]; + Ax_up_d = sf[4]; + Ax_dn_d = sf[5]; + + /* + //matrix test + spinor32 * help_low_up = sf32[0]; + spinor32 * help_low_dn = sf32[1]; + spinor * help_high_up = sf[0]; + spinor * help_high_dn = sf[1]; + assign_to_32(help_low_up, Qup, N); + assign_to_32(help_low_dn, Qdn, N); + assign(help_high_up, Qup, N); + assign(help_high_dn, Qdn, N); + double sqn_high = square_norm(help_high_up,N,1) + + square_norm(help_high_dn,N,1); + printf("square_norm(Q_high) = %e\n", sqn_high); + float sqn_low = square_norm_32(help_low_up,N,1) + + square_norm_32(help_low_dn,N,1); + printf("square_norm(Q_low) = %e\n", sqn_low); + + solver_pm->M_ndpsi32(sf32[2], sf32[3], help_low_up, help_low_dn); + solver_pm->M_ndpsi(sf[2], sf[3], help_high_up, help_high_dn); + + assign_to_64(sf[4], sf32[2], N); + assign_to_64(sf[5], sf32[3], N); + diff(sf[0], sf[4], sf[2], N); + diff(sf[1], sf[5], sf[3], N); + double sqnrm = square_norm(sf[0], N, 1) + + square_norm(sf[1], N, 1); + printf("Operator 32 test: (square_norm) / (spinor component) = %.8e\n", sqnrm/24.0/N); + exit(1); + */ + + // r(0) = b + assign_to_32(r_up, Qup, N); + assign_to_32(r_dn, Qdn, N); + + // d(0) = b + assign_to_32(d_up, Qup, N); + assign_to_32(d_dn, Qdn, N); + + + + maxres[0] = rr; + res[0] = rr; + res0[0] = rr; + alphas[0] = 1.0; + betas[0] = 0.0; + sigma[0] = shifts[0]*shifts[0]; + if(g_cart_id == 0 && g_debug_level > 2) printf("# CGMMSND_mixed: shift %d is %e\n", 0, 
sigma[0]); + + // currently only implemented for P=0 + for(int im = 1; im < noshifts; im++) { + maxres[im] = rr; + res[im] = rr; + res0[im] = rr; + sigma[im] = shifts[im]*shifts[im] - sigma[0]; + if(g_cart_id == 0 && g_debug_level > 2) printf("# CGMMSND_mixed: shift %d is %e\n", im, sigma[im]); + // these will be the result spinor fields + zero_spinor_field_32(mms_x_up[im-1], N); + zero_spinor_field_32(mms_x_dn[im-1], N); + + assign_to_32(mms_d_up[im-1], Qup, N); + assign_to_32(mms_d_dn[im-1], Qdn, N); + zitam1[im] = 1.0; + zita[im] = 1.0; + alphas[im] = 1.0; + betas[im] = 0.0; + } + + //zero fields for solution Pup, Pdn + for(int im = 0; im < noshifts; im++){ + zero_spinor_field(Pup[im], N); + zero_spinor_field(Pdn[im], N); + } + + + ////////// + // LOOP // + ////////// + + for (j = 0; j < max_iter; j++) { + // A*d(k) + solver_pm->M_ndpsi32(Ad_up, Ad_dn, d_up, d_dn); + //add zero'th shift + assign_add_mul_r_32(Ad_up, d_up, (float) sigma[0], N); + assign_add_mul_r_32(Ad_dn, d_dn, (float) sigma[0], N); + + + // alpha = r(k)*r(k) / d(k)*A*d(k) + dAd_up = scalar_prod_r_32(d_up, Ad_up, N, 1); + dAd_dn = scalar_prod_r_32(d_dn, Ad_dn, N, 1); + + dAd = dAd_up + dAd_dn; + alpham1 = alphas[0]; + alphas[0] = rr_old / dAd; // rr_old is taken from the last iteration respectively + + + // r(k+1) + assign_add_mul_r_32(r_up, Ad_up, (float) -alphas[0],N); + assign_add_mul_r_32(r_dn, Ad_dn, (float) -alphas[0],N); + + // r(k+1)*r(k+1) + rr_up = square_norm_32(r_up, N, 1); + rr_dn = square_norm_32(r_dn, N, 1); + rr = rr_up + rr_dn; + + + + if((g_cart_id == 0) && (g_debug_level > 2)) printf("# CGMMSND_mixed: mms iteration j = %i: rr = %.6e\n", j, rr); + + + + // aborting ?? // check wether precision is reached ... 
+ if ( ((check_abs)&&(rr <= eps_sq)) || ((check_rel)&&(rr <= eps_sq*r0r0)) ) + { + if ((check_rel)&&(rr <= eps_sq*r0r0)) { + if((g_cart_id == 0) && (g_debug_level > 3)) printf("# CGMMSND_mixed: Reached relative solver precision of eps_rel = %.2e\n", eps_sq); + } + break; + } + + // update alphas and zitas + // used later + for(int im = 1; im < noshifts; im++) { + gamma = zita[im]*alpham1/(alphas[0]*betas[0]*(1.-zita[im]/zitam1[im]) + + alpham1*(1.+sigma[im]*alphas[0])); + zitam1[im] = zita[im]; + zita[im] = gamma; + alphas[im] = alphas[0]*zita[im]/zitam1[im]; + } + + //check for reliable update + res[0] = rr; + for(int im=1; im= 0; im--) { + if( res[im] > maxres[im] ) maxres[im] = res[im]; + if( (res[im] < rel_delta*res0[im]) && (res0[im]<=maxres[im]) && (use_reliable) ) rel_update=1; + if( rel_update && ( trigger_shift == -1) ) trigger_shift = im; + } + + if(!rel_update) + { + // x_j(k+1) = x_j(k) + alpha_j*d_j(k) + // alphas are set above + assign_add_mul_r_32(x_up, d_up, (float) alphas[0], N); + assign_add_mul_r_32(x_dn, d_dn, (float) alphas[0], N); + + + for(int im = 1; im < noshifts; im++) { + assign_add_mul_r_32(mms_x_up[im-1], mms_d_up[im-1], (float) alphas[im], N); + assign_add_mul_r_32(mms_x_dn[im-1], mms_d_dn[im-1], (float) alphas[im], N); + } + + // beta = r(k+1)*r(k+1) / r(k)*r(k) + betas[0] = rr / rr_old; + rr_old = rr; // for next iteration + + // d_0(k+1) = r(k+1) + beta*d_0(k) + assign_mul_add_r_32(d_up, (float) betas[0], r_up, N); + assign_mul_add_r_32(d_dn, (float) betas[0], r_dn, N); + + // d_j(k+1) = zita*r(k+1) + beta*d_j(k) + for(int im = 1; im < noshifts; im++) { + betas[im] = betas[0]*zita[im]*alphas[im]/(zitam1[im]*alphas[0]); + assign_mul_add_mul_r_32(mms_d_up[im-1], r_up, (float) betas[im], (float) zita[im], N); + assign_mul_add_mul_r_32(mms_d_dn[im-1], r_dn, (float) betas[im], (float) zita[im], N); + } + } + else{ + //reliable update + if( (g_cart_id == 0) && (g_debug_level > 3) ){ + printf("# CGMMSND_mixed: Shift %d with offset squared 
%e triggered a reliable update\n", trigger_shift, sigma[trigger_shift]); + } + //add low prec solutions + assign_add_mul_r_32(x_up, d_up, (float) alphas[0], N); + assign_add_mul_r_32(x_dn, d_dn, (float) alphas[0], N); + + addto_32(Pup[0], x_up, N); + addto_32(Pdn[0], x_dn, N); + for(int im = 1; im < noshifts; im++) { + assign_add_mul_r_32(mms_x_up[im-1], mms_d_up[im-1], alphas[im], N); + assign_add_mul_r_32(mms_x_dn[im-1], mms_d_dn[im-1], alphas[im], N); + addto_32(Pup[im], mms_x_up[im-1], N); + addto_32(Pdn[im], mms_x_dn[im-1], N); + } + + //add low precision for shift 0 only + addto_32(x_up_d, x_up, N); + addto_32(x_dn_d, x_dn, N); + + + solver_pm->M_ndpsi(Ax_up_d, Ax_dn_d, x_up_d, x_dn_d); + //add zero'th shift + assign_add_mul_r(Ax_up_d, x_up_d, sigma[0], N); + assign_add_mul_r(Ax_dn_d, x_dn_d, sigma[0], N); + + diff(r_up_d, Qup, Ax_up_d, N); + diff(r_dn_d, Qdn, Ax_dn_d, N); + + rr_up = square_norm(r_up_d, N, 1); + rr_dn = square_norm(r_dn_d, N, 1); + rr = rr_up + rr_dn; + if ((g_cart_id == 0) && (g_debug_level > 3) ) printf("# CGMMSND_mixed: New residue after reliable update: %.6e\n", rr); + + //update res[im] + res[0] = rr; + + + if(res[trigger_shift] > res0[trigger_shift]){ + if(g_cart_id == 0) printf("# CGMMSND_mixed: Warning: residue of shift no %d got larger after rel. 
update\n", trigger_shift); + //if this is the zero'th shift not getting better -> no further convergence, break + if(trigger_shift == 0) break; + } + + //zero float fields + zero_spinor_field_32(x_up, N); + zero_spinor_field_32(x_dn, N); + for(int im = 1; im < noshifts; im++) { + zero_spinor_field_32(mms_x_up[im-1], N); + zero_spinor_field_32(mms_x_dn[im-1], N); + } + + //update the source + assign_to_32(r_up, r_up_d, N); + assign_to_32(r_dn, r_dn_d, N); + + + + betas[0] = res[0]/rr_old; + rr_old = rr; + // d_0(k+1) = r(k+1) + beta*d_0(k) + assign_mul_add_r_32(d_up, betas[0], r_up, N); + assign_mul_add_r_32(d_dn, betas[0], r_dn, N); + // d_j(k+1) = r(k+1) + beta*d_j(k) + for(int im = 1; im < noshifts; im++) { + betas[im] = betas[0]*zita[im]*alphas[im]/(zitam1[im]*alphas[0]); + assign_mul_add_mul_r_32(mms_d_up[im-1], r_up, (float) betas[im], (float) zita[im], N); + assign_mul_add_mul_r_32(mms_d_dn[im-1], r_dn, (float) betas[im], (float) zita[im], N); + } + + //new maxres for the shift that initiated the reliable update + res[trigger_shift] = res[0]*zita[trigger_shift]*zita[trigger_shift]; + res0[trigger_shift] = res[trigger_shift]; + maxres[trigger_shift] = res[trigger_shift]; + trigger_shift = -1; + no_rel_update ++; + } //reliable update + + //check if some shift is converged + for(int im = 1; im < noshifts; im++) { + if(j > 0 && (j % 10 == 0) && (im == noshifts-1)) { + double sn = square_norm_32(mms_d_up[im-1], N, 1); + sn += square_norm_32(mms_d_dn[im-1], N, 1); + if(alphas[noshifts-1]*alphas[noshifts-1]*sn <= eps_sq) { + noshifts--; + if( (g_debug_level > 1) && (g_cart_id == 0) ) { + printf("# CGMMSND_mixed: at iteration %d removed one shift, %d remaining\n", j, noshifts); + } + //if removed we add the latest solution vector for this shift + addto_32(Pup[im], mms_x_up[im-1], N); + addto_32(Pdn[im], mms_x_dn[im-1], N); + } + } + } + + }//LOOP + + if( (g_cart_id == 0) && (g_debug_level > 1) ) printf("Final mms residue: %.6e\n", rr); + + //add the latest solutions 
+ for(int im = 0; im < noshifts; im++) { + if(im == 0){ + addto_32(Pup[0], x_up, N); + addto_32(Pdn[0], x_dn, N); + } + else{ + addto_32(Pup[im], mms_x_up[im-1], N); + addto_32(Pdn[im], mms_x_dn[im-1], N); + } + } + + if(g_debug_level > 4){ + if(g_cart_id == 0) printf("# CGMMSND_mixed: Checking mms result:\n"); + //loop over all shifts (-> Nshift) + for(int im = 0; im < Nshift; im++){ + solver_pm->M_ndpsi(sf[0], sf[1], Pup[im], Pdn[im]); + assign_add_mul_r(sf[0], Pup[im] , shifts[im]*shifts[im], N); + assign_add_mul_r(sf[1], Pdn[im] , shifts[im]*shifts[im], N); + diff(sf[2], sf[0], Qup, N); + diff(sf[3], sf[1], Qdn, N); + rr_up = square_norm(sf[2], N, 1); + rr_dn = square_norm(sf[3], N, 1); + rr = rr_up + rr_dn; + if(g_cart_id == 0) printf("# CGMMSND_mixed: Shift[%d] squared residue: %e\n", im, rr); + } + } + + + finalize_solver(sf, nr_sf); + finalize_solver_32(sf32, nr_sf32); + + //free cg constants + free(sigma); free(zitam1); free(zita); free(alphas); free(betas); + + //free reliable update stuff + free(res); free(res0); free(maxres); + + + //if not converged -> return(-1) + if(j nr_nd) { + if(nr_nd != 0) { + free_mms_tm_nd_32(); + } + nr_nd = _nr; + + x_up_qmms = (spinor32*)calloc(N*(nr_nd)+1,sizeof(spinor32)); + x_dn_qmms = (spinor32*)calloc(N*(nr_nd)+1,sizeof(spinor32)); + d_up_qmms = (spinor32*)calloc(N*(nr_nd)+1,sizeof(spinor32)); + d_dn_qmms = (spinor32*)calloc(N*(nr_nd)+1,sizeof(spinor32)); + mms_x_up = (spinor32**)calloc((nr_nd)+1,sizeof(spinor32*)); + mms_x_dn = (spinor32**)calloc((nr_nd)+1,sizeof(spinor32*)); + mms_d_up = (spinor32**)calloc((nr_nd)+1,sizeof(spinor32*)); + mms_d_dn = (spinor32**)calloc((nr_nd)+1,sizeof(spinor32*)); + + for(int i = 0; i < nr_nd; i++) { + mms_x_up[i]=(spinor32*)(((unsigned long int)(x_up_qmms)+ALIGN_BASE32)&~ALIGN_BASE32) + i*N; + mms_x_dn[i]=(spinor32*)(((unsigned long int)(x_dn_qmms)+ALIGN_BASE32)&~ALIGN_BASE32) + i*N; + mms_d_up[i]=(spinor32*)(((unsigned long int)(d_up_qmms)+ALIGN_BASE32)&~ALIGN_BASE32) + i*N; + 
mms_d_dn[i]=(spinor32*)(((unsigned long int)(d_dn_qmms)+ALIGN_BASE32)&~ALIGN_BASE32) + i*N; + } + ini_mms_nd = 1; + } +} + +/* Releases the four spinor32 buffers (x_up_qmms, x_dn_qmms, d_up_qmms, d_dn_qmms) and the four pointer tables (mms_x_up, mms_x_dn, mms_d_up, mms_d_dn), then sets nr_nd and ini_mms_nd back to 0. The pointers themselves are not reset to NULL here. */ +static void free_mms_tm_nd_32() { + free(x_up_qmms); free(x_dn_qmms); + free(d_up_qmms); free(d_dn_qmms); + free(mms_x_up); free(mms_x_dn); + free(mms_d_up); free(mms_d_dn); + + nr_nd = 0; + ini_mms_nd = 0; + return; +} + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mixed_cg_mms_tm_nd.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mixed_cg_mms_tm_nd.h new file mode 100644 index 0000000000000000000000000000000000000000..67a1f0fbcc8f44ab5c2f791bbab891217169b198 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mixed_cg_mms_tm_nd.h @@ -0,0 +1,33 @@ +/*********************************************************************** + * + * + * Copyright (C) 2015 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + ***********************************************************************/ + +#ifndef _MIXED_CG_MMS_TM_ND_H +#define _MIXED_CG_MMS_TM_ND_H + +#include"su3.h" +#include"solver.h" + +int mixed_cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn, + spinor * const Qup, spinor * const Qdn, + solver_pm_t * solver_pm); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mode_number.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mode_number.c new file mode 100644 index 0000000000000000000000000000000000000000..a446731f4f1dbbab17903580c4ce994a6bf437de --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mode_number.c @@ -0,0 +1,342 @@ +#include +#include +#include +#include "global.h" +#include "start.h" +#include "su3.h" +#include "linalg_eo.h" +#include "chebyshev_polynomial_nd.h" +#include +#include "solver/solver.h" +#include "solver/jdher.h" +#include "solver/eigenvalues.h" +#include "X_psi.h" +#include "gamma.h" + +double rnorm=-1; + +/* |R>=rnorm^2 Q^2 |S> */ +void norm_X_sqr_psi(spinor * const R, spinor * const S, + double const mstar); + +/* |R>=rnorm Q|S> */ +void norm_X_n_psi(spinor * const R, spinor * const S, + const int n, double const mstar); + +/* Construct the sign function of the operator X */ +/* X/sqrt(X^2) ,, X = 1-(2M^2/(DdaggeraD+M^2))*/ +void X_over_sqrt_X_sqr(spinor * const R, double * const c, + const int n, spinor * const S, + const double minev, double const mstar); + + + +double * x_cheby_coef = NULL; +double epsilon=0.01; +int x_n_cheby ; +double prec = 1.e-3; + +void mode_number(spinor * const S, double const mstar){ + int i; + double mode_n; + spinor **s, *s_; + static int n_cheby = 0; + static int rec_coefs = 1; + + // x_n_cheby = (int)(-log(1.e-12)/(2*sqrt(epsilon))); + x_n_cheby = (int)(-log(prec)/(2*sqrt(epsilon))); + + /* if(g_proc_id == 0){ + printf("using precision %1.1e we get a degree n = %d \n",prec,x_n_cheby); + }*/ + + + /* Compute Chebyshev coefficients */ + /* c[j] ,, j=0..n, 
n=degree of the polynomial*/ + + if(g_proc_id == 0) { + printf("Degree of Polynomial set to %d\n", x_n_cheby); + } + + if(n_cheby != x_n_cheby || rec_coefs) { + if(x_cheby_coef != NULL) free(x_cheby_coef); + x_cheby_coef = (double*)malloc(x_n_cheby*sizeof(double)); + chebyshev_coefs(epsilon, 1., x_cheby_coef, x_n_cheby, -0.5); + rec_coefs = 0; + n_cheby = x_n_cheby; + } + s_ = calloc(2*VOLUMEPLUSRAND+1, sizeof(spinor)); + s = calloc(2, sizeof(spinor*)); + + for(i = 0; i < 2; i++) { +#if (defined SSE3 || defined SSE2 || defined SSE) + s[i] = (spinor*)(((unsigned long int)(s_)+ALIGN_BASE)&~ALIGN_BASE)+i*VOLUMEPLUSRAND; +#else + s[i] = s_+i*VOLUMEPLUSRAND; +#endif + } + + if(g_proc_id == 0) { + printf("mstar= %f \n",mstar); + } + + /*Evaluate X_over_sqrt_X_sqr*/ + X_over_sqrt_X_sqr(s[0], x_cheby_coef, x_n_cheby, S, epsilon, mstar); + + /* Construct h(x)=1/2-1/2 X/sqrt(X^2) */ + /* this routine makes (*R)=c1*(*R)+c2*(*S) , c1 and c2 are real constants */ + assign_mul_add_mul_r(s[0],S, 0.5, 0.5, VOLUME); + + /*we need h(X)^2|nu>*/ + X_over_sqrt_X_sqr(s[1], x_cheby_coef, x_n_cheby, s[0], epsilon, mstar); + assign_mul_add_mul_r(s[1],s[0],0.5, 0.5, VOLUME); + + /* Calculate the square norm ||h(X)^2 nu||^2 */ + mode_n = square_norm(s[1], VOLUME, 1); + + if(g_proc_id == 0) { + printf("The Value of the Mode Number is %f \n", mode_n); + } + + + free(s); + free(s_); + + return; +} + + +void norm_X_sqr_psi(spinor * const R, spinor * const S, double const mstar) { + + spinor *aux_,*aux; +#if ( defined SSE || defined SSE2 || defined SSE3 ) + aux_=calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + aux = (spinor *)(((unsigned long int)(aux_)+ALIGN_BASE)&~ALIGN_BASE); +#else + aux_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + aux = aux_; +#endif + + /* Here is where we have to include our operator which in this case is + X = 1 - (2M^2)/(D_m^dagger*D_m + mu^2 + M^2) */ + + X_psi(aux, S, mstar); + X_psi(R, aux, mstar); + mul_r(R, rnorm*rnorm, R, VOLUME); + + free(aux_); + return; +} + + +void 
norm_X_n_psi(spinor * const R, spinor * const S, + const int n, double const mstar) { + + int i; + double npar = 1.; + spinor *aux_,*aux; +#if (defined SSE || defined SSE2 || defined SSE3) + aux_=calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + aux = (spinor *)(((unsigned long int)(aux_)+ALIGN_BASE)&~ALIGN_BASE); +#else + aux_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + aux = aux_; +#endif + assign(aux, S, VOLUME); + + for(i=0; i < n; i++){ + /* Here is where we have to include our operator which in this case is + X = 1 - (2M^2)/(D_m^dagger*D_m + M^2) */ + X_psi(R, aux, mstar); + npar *= rnorm; + } + mul_r(R, npar, R, VOLUME); + + free(aux_); + return; +} + +void X_over_sqrt_X_sqr(spinor * const R, double * const c, + const int n, spinor * const S, const double minev, double const mstar) { + + int j; + double fact1, fact2, temp1, temp2, temp3, temp4, maxev; + spinor *sv_, *sv, *d_, *d, *dd_, *dd, *aux_, *aux, *aux3_, *aux3; + // double ap_eps_sq = 0.; + +#if ( defined SSE || defined SSE2 || defined SSE3) + sv_ = calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + sv = (spinor *)(((unsigned long int)(sv_)+ALIGN_BASE)&~ALIGN_BASE); + d_ = calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + d = (spinor *)(((unsigned long int)(d_)+ALIGN_BASE)&~ALIGN_BASE); + dd_ = calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + dd = (spinor *)(((unsigned long int)(dd_)+ALIGN_BASE)&~ALIGN_BASE); + aux_ = calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + aux = (spinor *)(((unsigned long int)(aux_)+ALIGN_BASE)&~ALIGN_BASE); + aux3_= calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + aux3 = (spinor *)(((unsigned long int)(aux3_)+ALIGN_BASE)&~ALIGN_BASE); +#else + sv_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + sv = sv_; + d_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + d = d_; + dd_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + dd = dd_; + aux_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + aux = aux_; + aux3_=calloc(VOLUMEPLUSRAND, sizeof(spinor)); + aux3 = aux3_; +#endif + + /*EVALUATE THE APPROXIMATION USING THE CLENSHAW'S RECURRENCE 
FORMULA*/ + + maxev=1.0; + + /*interval = [minev,maxev] = [epsilon,1]*/ + fact1=4/(maxev-minev); + fact2=-2*(maxev+minev)/(maxev-minev); + /* d=0 , dd=0 */ + zero_spinor_field(d, VOLUME); + zero_spinor_field(dd, VOLUME); + + + /*input S = aux3*/ + if(0) assign_sub_lowest_eigenvalues(aux3, S, no_eigenvalues-1, VOLUME); + else assign(aux3, S, VOLUME); + + + /*starting the loop*/ + if(1) { + for (j = n-1; j >= 1; j--) { + + /*sv=d*/ + assign(sv, d, VOLUME); + + /*aux= our random field S*/ + assign(aux, d, VOLUME); + + if(j == n-1){ + assign(R, aux, VOLUME); + } + else{ + /*|R>=rnorm^2 X^2|aux> -> since aux=d -> |R>=rnorm^2 Q^2|d>*/ + norm_X_sqr_psi(R, aux, mstar); + } + temp1=-1.0; + temp2=c[j]; /*Chebyshev coefficients*/ + + /* d = d*fact2 + R*fact1 + dd*temp1 + aux*temp2 + d = -2*(maxev+minev)/(maxev-minev)*d + 4/(maxev-minev)*R + -1*dd + c[j]*aux3 */ + /* y = (2*x-a-b)/(b-a) , y2=2*y + d = y2*d - dd + c[j] = -2*(a+b)*d/(b-a) + 4*x*d/(b-a) -dd + c[j] */ + assign_mul_add_mul_add_mul_add_mul_r(d, R, dd, aux3, fact2, fact1, temp1, temp2, VOLUME); + /* dd = sv */ + assign(dd, sv, VOLUME); + } + + /* R = d */ + if(0) assign_sub_lowest_eigenvalues(R, d, no_eigenvalues-1, VOLUME); + else assign(R, d, VOLUME); + + /*|aux>=rnorm^2 Q^2|R> */ + norm_X_sqr_psi(aux, R, mstar); + temp1=-1.0; + temp2=c[0]/2.; + temp3=fact1/2.; + temp4=fact2/2.; + + + /* aux = aux*temp3 + d*temp4 + dd*temp1 + aux3*temp2 + aux = 2/(maxev-minev)*aux + -(maxev+minev)/(maxev-minev)d + -1*dd + 0.5*c[j]*aux3 */ + /* P(X^2)|_x = y*d -dd + 0.5*c[0] */ + assign_mul_add_mul_add_mul_add_mul_r(aux, d, dd, aux3, temp3, temp4, temp1, temp2, VOLUME); + /* ONCE WE HAVE THE EVALUATION OF P(X^2) = 1/SQRT(X^2) + WE CONSTRUCT -X/SQRT(X^2) --> -X*P(X^2) */ + norm_X_n_psi(R, aux, 1, mstar); + } + + free(sv_); + free(d_); + free(dd_); + free(aux_); + free(aux3_); + return; +} + + +void Check_Approximation(double const mstar, const int repro) { + + if(g_proc_id == 0) { + printf("Checking the approximation of X/sqrt(X^2) 
in the mode number: \n"); + } + + + int i; + double res = 0; + spinor **s, *s_; + spinor *Sin = NULL; + //, *Sin_ = NULL; + static int n_cheby = 0; + static int rec_coefs = 1; + + // x_n_cheby = (int)(-log(1.e-12)/(2*sqrt(epsilon))); + x_n_cheby = (int)(-log(prec)/(2*sqrt(epsilon))); + + if(g_proc_id == 0) { + printf("epsilon= %f \n", epsilon); + printf("M*^2= %f \n", mstar); + printf("x_n_cheby= %d \n", x_n_cheby); + } + + if(n_cheby != x_n_cheby || rec_coefs) { + if(x_cheby_coef != NULL) free(x_cheby_coef); + x_cheby_coef = (double*)malloc(x_n_cheby*sizeof(double)); + chebyshev_coefs(epsilon, 1., x_cheby_coef, x_n_cheby, -0.5); + rec_coefs = 0; + n_cheby = x_n_cheby; + } + +#if (defined SSE3 || defined SSE2 || defined SSE) + Sin_ = calloc(VOLUMEPLUSRAND+1, sizeof(spinor)); + Sin = (spinor *)(((unsigned long int)(Sin_)+ALIGN_BASE)&~ALIGN_BASE); +#else + Sin =calloc(VOLUMEPLUSRAND, sizeof(spinor)); +#endif + + random_spinor_field_lexic(Sin, repro, RN_GAUSS); + + s_ = calloc(4*VOLUMEPLUSRAND+1, sizeof(spinor)); + s = calloc(4, sizeof(spinor*)); + + for(i = 0; i < 4; i++) { +#if (defined SSE3 || defined SSE2 || defined SSE) + s[i] = (spinor*)(((unsigned long int)(s_)+ALIGN_BASE)&~ALIGN_BASE)+i*VOLUMEPLUSRAND; +#else + s[i] = s_+i*VOLUMEPLUSRAND; +#endif + } + + X_over_sqrt_X_sqr(s[0], x_cheby_coef, x_n_cheby, Sin, epsilon, mstar); + + diff(s[2], Sin, s[0], VOLUME); + diff(s[2], Sin, s[0], VOLUME); + + X_over_sqrt_X_sqr(s[1], x_cheby_coef, x_n_cheby, s[0], epsilon, mstar); + + diff(s[3], s[1], Sin, VOLUME); + res = square_norm(s[3],VOLUME,0); + + if(g_proc_id == 0) { + printf("\n"); + printf("Deviation from the real value : \n"); + printf("||X^2/sqrt(X^2)|psi> - |nu>||^2 = %1.4e \n",res); + printf("\n"); + } + + free(s); + free(s_); + return; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mode_number.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mode_number.h new file mode 100644 index 
0000000000000000000000000000000000000000..05534fbf05b1ed9e934c02b16eac6889436a442e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mode_number.h @@ -0,0 +1,11 @@ + +#include "su3.h" + +extern int x_n_cheby; +extern double * x_cheby_coef; + +void mode_number(spinor * const, double const mstar); + +void Check_Approximation(double const mstar, const int repro); + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/monomial_solve.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/monomial_solve.c new file mode 100644 index 0000000000000000000000000000000000000000..0d1823a00819847da178e13dc5f1a6e6b6fa7ac2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/monomial_solve.c @@ -0,0 +1,155 @@ +/*********************************************************************** + * + * Copyright (C) 2014 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + * + * File: monomial_solve.c + * + * solver wrapper for monomials + * + * The externally accessible functions are + * + * + * int solve_degenerate(spinor * const P, spinor * const Q, const int max_iter, + double eps_sq, const int rel_prec, const int N, matrix_mult f) + * int solve_mms_nd(spinor ** const Pup, spinor ** const Pdn, + * spinor * const Qup, spinor * const Qdn, + * solver_pm_t * solver_pm) + * + **************************************************************************/ + + +#ifdef HAVE_CONFIG_H +# include +#endif +#include "global.h" +#include "read_input.h" +#include "solver/solver.h" +#include "solver/matrix_mult_typedef.h" +#include "solver/solver_types.h" +#include "solver/solver_params.h" +#include "operator/tm_operators.h" +#include "operator/tm_operators_32.h" +#include "operator/tm_operators_nd.h" +#include "operator/tm_operators_nd_32.h" +#include "operator/clovertm_operators.h" +#include "operator/clovertm_operators_32.h" +#include "monomial_solve.h" + +#ifdef HAVE_GPU +#include"../GPU/cudadefs.h" +extern int linsolve_eo_gpu (spinor * const P, spinor * const Q, const int max_iter, + double eps, const int rel_prec, const int N, matrix_mult f); +extern int dev_cg_mms_tm_nd(spinor ** const Pup, spinor ** const Pdn, + spinor * const Qup, spinor * const Qdn, + solver_pm_t * solver_pm); + #ifdef TEMPORALGAUGE + #include "../temporalgauge.h" + #endif +#include "read_input.h" +#endif + +int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_params, + const int max_iter, double eps_sq, const int rel_prec, + const int N, matrix_mult f, int solver_type){ + int iteration_count = 0; + int use_solver = solver_type; + + if(use_solver == MIXEDCG || use_solver == RGMIXEDCG){ + // the default mixed solver is rg_mixed_cg_her + int (*msolver_fp)(spinor * const, spinor * const, solver_params_t, + const int, double, const int, const int, matrix_mult, matrix_mult32) = rg_mixed_cg_her; + + // but it might be necessary at some point to 
use the old version + if(use_solver == MIXEDCG){ + msolver_fp = mixed_cg_her; + } + + if(usegpu_flag){ + #ifdef HAVE_GPU + #ifdef TEMPORALGAUGE + to_temporalgauge(g_gauge_field, Q , P); + #endif + iteration_count = linsolve_eo_gpu(P, Q, max_iter, eps_sq, rel_prec, N, f); + #ifdef TEMPORALGAUGE + from_temporalgauge(Q, P); + #endif + #endif + return(iteration_count); + } + else{ + if(f==Qtm_pm_psi){ + iteration_count = msolver_fp(P, Q, solver_params, max_iter, eps_sq, rel_prec, N, f, &Qtm_pm_psi_32); + return(iteration_count); + } + else if(f==Q_pm_psi){ + iteration_count = msolver_fp(P, Q, solver_params, max_iter, eps_sq, rel_prec, N, f, &Q_pm_psi_32); + return(iteration_count); + } else if(f==Qsw_pm_psi){ + copy_32_sw_fields(); + iteration_count = msolver_fp(P, Q, solver_params, max_iter, eps_sq, rel_prec, N, f, &Qsw_pm_psi_32); + return(iteration_count); + } else { + if(g_proc_id==0) printf("Warning: 32 bit matrix not available. Falling back to CG in 64 bit\n"); + use_solver = CG; + } + } + } + if(use_solver == CG){ + iteration_count = cg_her(P, Q, max_iter, eps_sq, rel_prec, N, f); + } + else if(use_solver == BICGSTAB){ + iteration_count = bicgstab_complex(P, Q, max_iter, eps_sq, rel_prec, N, f); + } + else{ + if(g_proc_id==0) printf("Error: solver not allowed for degenerate solve. 
Aborting...\n"); + exit(2); + } + return(iteration_count); +} + + +int solve_mms_nd(spinor ** const Pup, spinor ** const Pdn, + spinor * const Qup, spinor * const Qdn, + solver_pm_t * solver_pm){ + int iteration_count = 0; + if(solver_pm->type==MIXEDCGMMSND){ + if(usegpu_flag){ + #ifdef HAVE_GPU + #ifdef TEMPORALGAUGE + to_temporalgauge_mms(g_gauge_field , Qup, Qdn, Pup, Pdn, solver_pm->no_shifts); + #endif + iteration_count = dev_cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_pm); + #ifdef TEMPORALGAUGE + from_temporalgauge_mms(Qup, Qdn, Pup, Pdn, solver_pm->no_shifts); + #endif + #endif + } + else{ + iteration_count = mixed_cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_pm); + } + } + else if (solver_pm->type==CGMMSND){ + iteration_count = cg_mms_tm_nd(Pup, Pdn, Qup, Qdn, solver_pm); + } + else{ + if(g_proc_id==0) printf("Error: solver not allowed for ND mms solve. Aborting...\n"); + exit(2); + } + return(iteration_count); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/monomial_solve.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/monomial_solve.h new file mode 100644 index 0000000000000000000000000000000000000000..0cbe5439197f5518fcfde8ac51687ede997c4aa4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/monomial_solve.h @@ -0,0 +1,32 @@ +/*********************************************************************** + * Copyright (C) 2014 Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _MONOMIAL_SOLVE_H +#define _MONOMIAL_SOLVE_H + + +#include"solver/matrix_mult_typedef.h" +#include"solver/solver_params.h" +#include"su3.h" + int solve_degenerate(spinor * const P, spinor * const Q, solver_params_t solver_params, const int max_iter, + double eps_sq, const int rel_prec, const int N, matrix_mult f, int solver_type); + int solve_mms_nd(spinor ** const Pup, spinor ** const Pdn, + spinor * const Qup, spinor * const Qdn, + solver_pm_t * solver_pm); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mr.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mr.c new file mode 100644 index 0000000000000000000000000000000000000000..cad212216f579be301776c46ec5765b0d723e5ad --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mr.c @@ -0,0 +1,182 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + * Minimal residual solver + * int mr(spinor * const P, spinor * const Q, + * const int max_iter, const double eps_sq, + * matrix_mult f){ * + * + * returns the number of iterations needed to reach + * the desired precision. return -1 if the maximal + * number of iterations was reached. + * + * Inout: + * spinor * P : guess for the solving spinor + * Input: + * spinor * Q : source spinor + * int max_iter : maximal number of iterations + * double eps_sqr : stopping criterium + * matrix_mult f : pointer to a function containing + * the matrix mult for type + * matrix_mult see + * matrix_mult_typedef.h + * + * Autor: Carsten Urbach + * + ****************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "start.h" +#include "su3.h" +#include "linalg_eo.h" +#include "solver/solver.h" +#include "solver_field.h" +#include "mr.h" + +int mr(spinor * const P, spinor * const Q, + const int max_iter, const double eps_sq, + const int rel_prec, const int N, const int parallel, + matrix_mult f){ + int i=0; + double norm_r,beta; + _Complex double alpha; + spinor * r; + spinor ** solver_field = NULL; + const int nr_sf = 3; + + if(N == VOLUME) { + init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); + } + else { + init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); + } + r = solver_field[0]; + + zero_spinor_field(P, N); + f(solver_field[2], P); + diff(r, Q, solver_field[2], N); + norm_r=square_norm(solver_field[0], N, parallel); + if(g_proc_id == g_stdio_proc && g_debug_level > 2) { + printf("MR iteration number: %d, |res|^2 = %e\n", i, norm_r); + fflush( stdout ); + } + while((norm_r > eps_sq) && (i < max_iter)){ + i++; + f(solver_field[1], r); + alpha=scalar_prod(solver_field[1], r, N, parallel); + beta=square_norm(solver_field[1], N, parallel); + alpha /= beta; + assign_add_mul(P, r, alpha, N); + if(i%50 == 0){ + f(solver_field[2], P); + } + else{ + 
assign_add_mul(solver_field[2], solver_field[1], alpha, N); + } + + diff(r, Q, solver_field[2], N); + norm_r=square_norm(solver_field[0], N, parallel); + if(g_proc_id == g_stdio_proc && g_debug_level > 2) { + printf("# MR iteration= %d |res|^2= %g\n", i, norm_r); + fflush(stdout); + } + } + finalize_solver(solver_field, nr_sf); + if(norm_r > eps_sq){ + return(-1); + } + return(i); +} + + +int mrblk(spinor * const P, spinor * const Q, + const int max_iter, const double eps_sq, + const int rel_prec, const int N, + matrix_mult_blk f, const int blk) { + static int mr_init=0; + int i = 0; + double norm_r,beta; + _Complex double alpha; + spinor * r; + const int parallel = 0; + spinor * s[3]; + static spinor *s_=NULL; + static int N_; + + if(mr_init == 0 || N != N_) { + if(N!= N_ && mr_init != 0) { + free(s_); + } + N_ = N; + s_ = calloc(3*(N+1)+1, sizeof(spinor)); + mr_init = 1; + } +#if (defined SSE || defined SSE2 || defined SSE3) + s[0] = (spinor *)(((unsigned long int)(s_)+ALIGN_BASE)&~ALIGN_BASE); +#else + s[0] = s_; +#endif + s[1] = s[0] + N + 1; + s[2] = s[1] + N + 1; + + r = s[0]; + norm_r = square_norm(Q, N, parallel); + + zero_spinor_field(P, N); + f(s[2], P, blk); + diff(r, Q, s[2], N); + norm_r = square_norm(r, N, parallel); + if(g_proc_id == g_stdio_proc && g_debug_level > 2 && blk == 0) { + printf("MRblk iteration= %d |res|^2= %e\n", i, norm_r); + fflush( stdout ); + } + + while((norm_r > eps_sq) && (i < max_iter)){ + i++; + f(s[1], r, blk); + alpha = scalar_prod(s[1], r, N, parallel); + beta = square_norm(s[1], N, parallel); + alpha /= beta; + assign_add_mul(P, r, alpha, N); + if(i%50 == 0) { + f(s[2], P,blk); + } + else{ + assign_add_mul(s[2], s[1], alpha, N); + } + + diff(r, Q, s[2], N); + norm_r = square_norm(r, N, parallel); + if(g_proc_id == g_stdio_proc && g_debug_level > 2 && blk == 0) { + printf("MRblk iteration= %d |res|^2= %g\n", i, norm_r); + fflush(stdout); + } + } + /* free(s_); */ + if(norm_r > eps_sq){ + return(-1); + } + return(i); +} diff 
--git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mr.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mr.h new file mode 100644 index 0000000000000000000000000000000000000000..df82f3c84b6a0a4dff9909955951aa76cfce2ec4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/mr.h @@ -0,0 +1,58 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +/**************************************************** + * Minimal residual solver + * int mr(spinor * const P, spinor * const Q, + * const int max_iter, const double eps_sq, + * matrix_mult f){ * + * + * returns the number of iterations needed to reach + * the desired precision. return -1 if the maximal + * number of iterations was reached. 
+ * + * Inout: + * spinor * P : guess for the solving spinor + * Input: + * spinor * Q : source spinor + * int max_iter : maximal number of iterations + * double eps_sqr : stopping criterium + * matrix_mult f : pointer to a function containing + * the matrix mult for type + * matrix_mult see + * matrix_mult_typedef.h + * + * Autor: Carsten Urbach + * + ****************************************************/ + +#ifndef _MR_H +#define _MR_H + +int mr(spinor * const P, spinor * const Q, + const int max_iter, const double eps_sq, + const int rel_prec, const int N, + const int parallel, matrix_mult f); + +int mrblk(spinor * const P, spinor * const Q, + const int max_iter, const double eps_sq, + const int rel_prec, const int N, + matrix_mult_blk f, const int blk); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/ortho.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/ortho.c new file mode 100644 index 0000000000000000000000000000000000000000..ddbb18307cfedc6cad70bcf36a80a62296d6ea05 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/ortho.c @@ -0,0 +1,87 @@ +/***************************************************************************** + * Copyright (C) 2008,2009,2010,2011,2012 + * Andreas Stathopoulos, Kostas Orginos, Abdou M. Abdel-Rehim + * + * This program is based on interfacing the eigCG solver to the tmLQCD code. + * It was written by Abdou M. Abdel-Rehim based on the original code written + * by Andreas Stathopoulos and Kostas Orginos and uses functions written in + * tmLQCD by Carsten Urbach + * + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * + * Gram-Shmidt orthogonalization + ****************************************************************************/ +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include "linalg_eo.h" +#include "start.h" +#include "linalg/blas.h" +#include "linalg/lapack.h" + +#include "ortho.h" + +/****************************************************************************************************/ +int ortho_new_vectors(spinor **Vecs, int N, int nv_old, int nv_new, double orthtol) +{ + + //modified Gram-Schmidt orthogonalization + int i,j,k; + int parallel; + _Complex double alpha; + int nadded=0; + double tmpd; + + #ifdef MPI + parallel=1; + #else + parallel=0; + #endif + + for(i=nv_old; i< (nv_old+nv_new); i++) + { + for(j=0; j orthtol*orthtol) + { + /* normalize Vecs[i]*/ + tmpd=1.0e+00/sqrt(creal(alpha)); + mul_r(Vecs[i],tmpd,Vecs[i],N); + nadded= nadded+1; + } + + } + + return nadded; + +} +/****************************************************************************************************/ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/ortho.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/ortho.h new file mode 100644 index 0000000000000000000000000000000000000000..fbd19e3fde26d67cdce7f2c3c659f7519bffa7bb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/ortho.h @@ -0,0 +1,52 @@ +/***************************************************************************** + * Copyright (C) 2008,2009,2010,2011,2012 + * Andreas Stathopoulos, Kostas Orginos, Abdou M. 
Abdel-Rehim + * + * This program is based on interfacing the eigCG solver to the tmLQCD code. + * It was written by Abdou M. Abdel-Rehim based on the original code written + * by Andreas Stathopoulos and Kostas Orginos and uses functions written in + * tmLQCD by Carsten Urbach + * + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * + * Gram-Shmidt orthogonalization + ****************************************************************************/ + + +#ifndef _ORTHO_NEW_H +#define _ORTHO_NEW_H + +#include "su3.h" + +/* Given a set of orthonormal vectors Vecs such that the first nv_old vectors + * are othonormal, it applies gram-schmit orthogonalization for the new vectors + * nv_new such that the whole set of nv_old+nv_new are orthonormal. It returns how + * many vectors were actually added. That could be less than nv_new because of possible + * linear dependence. 
If the new orthognalized vector has norm less than orthtol, it is not + * added.*/ + +int ortho_new_vectors( + spinor **Vecs, /*the set of vectors*/ + int N, /* Length of the vectors */ + int nv_old, /* number of orthonormal vectors */ + int nv_new, /* number of new vectors*/ + double orthtol /* smallest value of norm of a vector that could be added to Vecs */ +); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/pcg_her.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/pcg_her.c new file mode 100644 index 0000000000000000000000000000000000000000..a4d4235a57869639f68bb4d83397aaa3cbfdbaee --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/pcg_her.c @@ -0,0 +1,134 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "solver/matrix_mult_typedef.h" +#include "sub_low_ev.h" +#include "solver_field.h" +#include "pcg_her.h" + +/* P output = solution , Q input = source */ +int pcg_her(spinor * const P, spinor * const Q, const int max_iter, + double eps_sq, const int rel_prec, const int N, matrix_mult f) { + double normsp, pro, pro2, err, alpha_cg, beta_cg, squarenorm; + int iteration; + spinor ** solver_field = NULL; + const int nr_sf = 5; + + if(N == VOLUME) { + init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); + } + else { + init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); + } + squarenorm = square_norm(Q, N, 1); + /* !!!! INITIALIZATION !!!! */ + assign(solver_field[0], P, N); + /* (r_0,r_0) = normsq */ + normsp = square_norm(P, N, 1); + + assign(solver_field[3], Q, N); + /* initialize residue r and search vector p */ + if(normsp==0){ + /* if a starting solution vector equal to zero is chosen */ + /* r0 */ + assign(solver_field[1], solver_field[3], N); + /* p0 */ + } + else{ + /* if a starting solution vector different from zero is chosen */ + /* r0 = b - A x0 */ + f(solver_field[2], solver_field[0]); + diff(solver_field[1], solver_field[3], solver_field[2], N); + } + /* z0 = M^-1 r0 */ + invert_eigenvalue_part(solver_field[3], solver_field[1], 10, N); + /* p0 = z0 */ + assign(solver_field[2], solver_field[3], N); + + /* Is this really real? 
*/ + pro2 = scalar_prod_r(solver_field[1], solver_field[3], N, 1); + /* main loop */ + for(iteration = 0; iteration < max_iter; iteration++) { + /* A p */ + f(solver_field[4], solver_field[2]); + + pro = scalar_prod_r(solver_field[2], solver_field[4], N, 1); + /* Compute alpha_cg(i+1) */ + alpha_cg=pro2/pro; + + /* Compute x_(i+1) = x_i + alpha_cg(i+1) p_i */ + assign_add_mul_r(solver_field[0], solver_field[2], alpha_cg, N); + /* Compute r_(i+1) = r_i - alpha_cg(i+1) Qp_i */ + assign_add_mul_r(solver_field[1], solver_field[4], -alpha_cg, N); + + /* Check whether the precision is reached ... */ + err=square_norm(solver_field[1], N, 1); + if(g_debug_level > 1 && g_proc_id == g_stdio_proc) { + printf("%d\t%g\n",iteration,err); fflush( stdout); + } + + if(((err <= eps_sq) && (rel_prec == 0)) || ((err <= eps_sq*squarenorm) && (rel_prec == 1))) { + assign(P, solver_field[0], N); + g_sloppy_precision = 0; + finalize_solver(solver_field, nr_sf); + return(iteration+1); + } +#ifdef _USE_HALFSPINOR + if(((err*err <= eps_sq) && (rel_prec == 0)) || ((err*err <= eps_sq*squarenorm) && (rel_prec == 1)) || iteration > 1400) { + g_sloppy_precision = 1; + if(g_debug_level > 2 && g_proc_id == g_stdio_proc) { + printf("sloppy precision on\n"); fflush( stdout); + } + } +#endif + /* z_j */ + beta_cg = 1/pro2; +/* invert_eigenvalue_part(solver_field[3], solver_field[1], 10, N); */ + /* Compute beta_cg(i+1) + Compute p_(i+1) = r_i+1 + beta_(i+1) p_i */ + pro2 = scalar_prod_r(solver_field[1], solver_field[3], N, 1); + beta_cg *= pro2; + assign_mul_add_r(solver_field[2], beta_cg, solver_field[3], N); + } + assign(P, solver_field[0], N); + g_sloppy_precision = 0; +/* return(-1); */ + finalize_solver(solver_field, nr_sf); + return(1); +} + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/pcg_her.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/pcg_her.h new file mode 100644 index 
0000000000000000000000000000000000000000..c5e318a5e5fa48c118e4755a5be0c9359386d59a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/pcg_her.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _PCG_HER_H +#define _PCG_HER_H + +#include"solver/matrix_mult_typedef.h" +#include"su3.h" + +int pcg_her(spinor * const, spinor * const, const int max_iter, double eps_sq, const int rel_prec, + const int N, matrix_mult f); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/poly_precon.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/poly_precon.c new file mode 100644 index 0000000000000000000000000000000000000000..249278217de148e6c8274443a6733447980d9d0f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/poly_precon.c @@ -0,0 +1,265 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "start.h" +#include "linalg_eo.h" +#include "operator/tm_operators.h" +#include "boundary.h" +#include "operator/D_psi.h" +#include "poly_precon.h" + + +#define PI 3.141592653589793 + +double f_pre(double u){ +/* return(1./(1.+(g_mu*g_mu*(1-1.5*1.5))/(u))); */ +/* return pow(u,exponent); */ + return(1./u); +} + +void get_c(double aa, double bb, double c[], int n){ + int k,j; + double fac,bpa,bma,*f; + double inv_n; + + inv_n=1./(double)n; + f=calloc(n,sizeof(double));/*vector(0,n-1);*/ + fflush(stdout); + bma=0.5*(bb-aa); + bpa=0.5*(bb+aa); + for (k=0;k4) { + D_psi(tmp0, chi); + diff(tmp0, tmp0, S, N); + dtmp = square_norm(tmp0, N, 1); + if(g_proc_id == 0) printf("poly %d %1.3e\n", j, dtmp); + } +/* boundary(-g_kappa); */ +/* g_mu = -g_mu; */ + a1 = a2; + } + assign(R, chi, N); + boundary(g_kappa); + g_mu = dtmp; + + + return; +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/poly_precon.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/poly_precon.h new file mode 100644 index 0000000000000000000000000000000000000000..6f8039f0850296208a4169af1f79de6b403c5694 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/poly_precon.h @@ -0,0 
+1,26 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _POLY_PRECON_H +#define _POLY_PRECON_H + +void poly_precon(spinor * const, spinor * const, const double prec, const int n); +void poly_nonherm_precon(spinor * const R, spinor * const S, + const double e, const double d, const int n, const int N); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/quicksort.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/quicksort.c new file mode 100644 index 0000000000000000000000000000000000000000..c9aa471b343b26000a5986cf3c2ffb5b345a41e0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/quicksort.c @@ -0,0 +1,65 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+/*
+ * QUICKSORT
+ *
+ * Sorts a double array using a non-recursive quicksort algorithm in
+ * ascending order carrying along an int array.
+ *
+ * n    number of elements; n <= 1 leaves both arrays untouched
+ * arr  keys, sorted in place
+ * idx  permuted exactly like arr, so idx[k] still belongs to arr[k];
+ *      equal keys may end up in a different relative order
+ *
+ * Pending sub-ranges live on an explicit stack.  The larger partition is
+ * always the one that is deferred, so the active range at least halves with
+ * every push and 64 slots (32 ranges) are enough for any 32-bit int n.
+ * Should the stack nevertheless be about to overflow, an error is printed
+ * and the program terminates with exit(31).
+ */
+
+void quicksort(int n, double arr[], int idx[]){
+  double v, td;
+  int i, j, l, r, ti, tos, stack[64];
+
+  /* [l,r] is the range being partitioned; tos is the top of the range stack */
+  l = 0; r = n-1; tos = -1;
+  for (;;){
+    while (r > l){
+      /* partition around the last element of the range */
+      v = arr[r]; i = l; j = r-1;
+      for (;;){
+        while (arr[i] < v) i ++;
+        /* j > l prevents underflow */
+        while (arr[j] >= v && j > l) j --;
+        if (i >= j) break;
+        td = arr[i]; arr[i] = arr[j]; arr[j] = td;
+        ti = idx[i]; idx[i] = idx[j]; idx[j] = ti;
+      }
+      /* move the pivot to its final position i */
+      td = arr[i]; arr[i] = arr[r]; arr[r] = td;
+      ti = idx[i]; idx[i] = idx[r]; idx[r] = ti;
+      /* both new entries must fit BEFORE they are written; the check used to
+       * come after the push and let two ints be stored past the array */
+      if(tos + 2 >= (int)(sizeof(stack)/sizeof(stack[0]))) {
+        fprintf(stderr,"Error in quicksort! Aborting...!");fflush(stderr);
+        exit(31);
+      }
+      /* defer the larger part, continue with the smaller one */
+      if (i-l > r-i){
+        stack[++tos] = l; stack[++tos] = i-1; l = i+1;
+      }
+      else{
+        stack[++tos] = i+1; stack[++tos] = r; r = i-1;
+      }
+    }
+    if (tos == -1) break;
+    r = stack[tos--]; l = stack[tos--];
+  }
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/quicksort.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/quicksort.h
new file mode 100644
index 0000000000000000000000000000000000000000..304c08d1e6145be257f035bcbaf35363501ced5b
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/quicksort.h
@@ -0,0 +1,24 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _QUICKSORT_H +#define _QUICKSORT_H + +void quicksort(int n, double arr[], int idx[]); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/restart_X.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/restart_X.c new file mode 100644 index 0000000000000000000000000000000000000000..821ab2bf05286c79b180bd1f896ff410fa76c687 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/restart_X.c @@ -0,0 +1,98 @@ +/*********************************************************************** + * Copyright (C) 2008,2009,2010,2011,2012 + * Andreas Stathopoulos, Kostas Orginos, Abdou M. Abdel-Rehim + * + * This program is based on interfacing the eigCG solver to the tmLQCD code. + * It was written by Abdou M. Abdel-Rehim based on the original code written + * by Andreas Stathopoulos and Kostas Orginos and uses functions written in + * tmLQCD by Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/******************************************************************************* + * Subroutine restart_X - This subroutine computes X*hVecs and places + * the result in X. + ******************************************************************************/ +/********************************************************** + <-------basisSize------> <---restartSize--> + | | | | + | | | | + | | | | + | | | hVecs | + | | | | + | | | | + | | | | + | | ----------------- + | X | + |nLocal | + | | + |ldx | + <----------------------> +***************************************************/ + + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include "linalg_eo.h" +#include "su3.h" +#include "start.h" +#include "linalg/blas.h" +#include "linalg/lapack.h" + +#include "restart_X.h" +#define min(a, b) (a < b ? 
a : b)
+
+/*
+ * Zrestart_X: replace the first restartSize columns of X by X*hVecs.
+ *
+ * X      nLocal rows, leading dimension ldx; the first basisSize columns
+ *        are read, the first restartSize columns are overwritten
+ * hVecs  basisSize x restartSize coefficients, leading dimension basisSize
+ * rwork  scratch space for rworkSize complex numbers
+ *
+ * The product is formed in row blocks of AvailRows = rworkSize/restartSize
+ * rows (capped at nLocal), so that every block fits into rwork before it is
+ * copied back over X.
+ *
+ * If restartSize <= 0, or rwork cannot hold a single row
+ * (rworkSize < restartSize), the routine returns without touching X.
+ * Without these guards the first case divided by zero and the second one
+ * never left the loop, because a block of zero rows does not advance i.
+ * Callers must therefore pass rworkSize >= restartSize.
+ */
+void Zrestart_X(_Complex double *X, int ldx, _Complex double *hVecs, int nLocal,
+                int basisSize, int restartSize, _Complex double *rwork, int rworkSize)
+{
+  char cN = 'N';
+  int ONE = 1;
+  int i, k; /* Loop variables */
+  int AvailRows; /* rows of X handled per pass */
+  _Complex double tpone,tzero;
+  tpone= +1.0e+00; tzero=+0.0e+00;
+
+  /* no output columns requested; also protects the division below */
+  if (restartSize <= 0) return;
+
+  AvailRows = min(rworkSize/restartSize, nLocal);
+  i = 0;
+
+  /* AvailRows > 0: see the header comment */
+  while (i < nLocal && AvailRows > 0) {
+    /* Block matrix multiply */
+    /* NOTE(review): the trailing 1,1 look like hidden Fortran string lengths
+     * for the two character arguments -- confirm against the _FT macro */
+    _FT(zgemm)(&cN, &cN, &AvailRows, &restartSize, &basisSize, &tpone,
+               &X[i], &ldx, hVecs, &basisSize, &tzero, rwork, &AvailRows ,1,1);
+
+    /* Copy the result in the desired location of X */
+    for (k=0; k < restartSize; k++) {
+      _FT(zcopy)(&AvailRows, &rwork[AvailRows*k],&ONE, &X[i+ldx*k],&ONE);
+    }
+
+    i = i+AvailRows;
+    /* the last block may be shorter */
+    AvailRows = min(AvailRows, nLocal-i);
+  }
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/restart_X.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/restart_X.h
new file mode 100644
index 0000000000000000000000000000000000000000..96f4e3f74a77f26fcc329f8d85c67dbd4d13780f
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/restart_X.h
@@ -0,0 +1,41 @@
+/***********************************************************************
+ * Copyright (C) 2008,2009,2010,2011,2012
+ * Andreas Stathopoulos, Kostas Orginos, Abdou M. Abdel-Rehim
+ *
+ * This program is based on interfacing the eigCG solver to the tmLQCD code.
+ * It was written by Abdou M. Abdel-Rehim based on the original code written
+ * by Andreas Stathopoulos and Kostas Orginos and uses functions written in
+ * tmLQCD by Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/*********************************************************************** + * Subroutine restart_X - This subroutine computes X*hVecs and places + * the result in X. + ***********************************************************************/ + + + +#ifndef _RESTART_X_H +#define _RESTART_X_H + + +/*double precision version */ +void Zrestart_X(_Complex double *X, int ldx, _Complex double *hVecs, int nLocal, + int basisSize, int restartSize, _Complex double *rwork, int rworkSize); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/rg_mixed_cg_her.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/rg_mixed_cg_her.c new file mode 100644 index 0000000000000000000000000000000000000000..b6ff07a668a7ad96fbd02dc02c20a264d4821852 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/rg_mixed_cg_her.c @@ -0,0 +1,353 @@ +/*********************************************************************** + * Copyright (C) 2015 Bartosz Kostrzewa, Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + ******************* + * rg_mixed_cg_her * + ******************* + * + * Mixed precision solver which uses true reliable updates and has a double + * precision fail-safe mechanism. The Polak-Ribiere computation of beta is + * implemented but currently not used because the extra scalar product is + * more expensive than the gain from the self-stabilisation as far as has + * been tested. + * + * in: + * Q: source + * inout: + * P: result (initial guess currently not supported) + * + * POSSIBLE IMPROVEMENTS + * There are still quite a few things that can be tried to make it better, + * the most significant of which would be to guide the search direction + * using the previous one upon restart. However, it seems that for the number + * non-zero entries in the Dirac operator and usual lattice sizes, the + * requisite projection + * + * p' = r - / p + * + * cannot be computed with sufficient precision in 64 bit arithmetic. It should + * be noted that for L < 24 in general, this does work and produces + * a mixed solver which converges at the same rate as a double solver, but it's + * not generally useable... For point sources, it also works for larger lattice + * volumes. Might be introduced as an optional mode in the future with some + * fail-safe mechanism which detects if the search direction begins to diverge. 
+ *
+ **************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "global.h"
+#include "su3.h"
+#include "linalg_eo.h"
+#include "start.h"
+#include "operator/tm_operators_32.h"
+#include "operator/clovertm_operators_32.h"
+#include "solver/matrix_mult_typedef.h"
+#include "solver/solver_params.h"
+#include "read_input.h"
+
+#include "solver_field.h"
+#include "solver/rg_mixed_cg_her.h"
+#include "gettime.h"
+
+static void output_flops(const double seconds, const unsigned int N, const unsigned int iter_out, const unsigned int iter_in_sp, const unsigned int iter_in_dp, const double eps_sq);
+
+/* Double precision CG inner loop.
+ *
+ * x     iterate, updated in place
+ * p     search direction, updated in place
+ * q     work field, receives f(p)
+ * r     iterated residual, updated in place
+ * rho1  squared norm of r on entry; holds the last iterated value on exit
+ * iter  iterations already spent by the caller; the loop stops as soon as
+ *       j+iter exceeds max_iter
+ *
+ * Returns the number of iterations performed in this call.
+ * The temporaries are all assigned before they are read, so they are plain
+ * automatic variables; nothing is carried over between calls. */
+static inline unsigned int inner_loop_high(spinor * const x, spinor * const p, spinor * const q, spinor * const r, double * const rho1, const double delta,
+                                           matrix_mult f, const double eps_sq, const unsigned int N, const unsigned int iter, const unsigned int max_iter ){
+
+  double alpha, beta, rho, rhomax;
+  unsigned int j = 0;
+
+  rho = *rho1;
+  rhomax = *rho1;
+
+  /* break out of inner loop if iterated residual goes below some fraction of the maximum observed
+   * iterated residual since the last update or if the target precision has been reached
+   * enforce convergence more strictly by a factor of 1.3 to avoid unnecessary restarts
+   * if the real residual is still a bit too large */
+  while( rho > delta*rhomax && j+iter <= max_iter ){
+    ++j;
+    f(q,p);
+    alpha = rho/scalar_prod_r(p,q,N,1);
+    assign_add_mul_r(x, p, alpha, N);
+    assign_add_mul_r(r, q, -alpha, N);
+    rho = square_norm(r,N,1);
+    beta = rho / *rho1;
+    *rho1 = rho;
+    assign_mul_add_r(p, beta, r, N);
+
+    if( 1.3*rho < eps_sq ) break;
+    if( rho > rhomax ) rhomax = rho;
+
+    if(g_debug_level > 2 && g_proc_id == 0) {
+      /* j+iter is unsigned */
+      printf("DP_inner CG: %u res^2 %g\t\n", j+iter, rho);
+    }
+  }
+
+  return j;
+}
+
+/* Single precision CG inner loop; arguments and stopping rules as in
+ * inner_loop_high.
+ *
+ * pipelined  MCG_NO_PIPELINED selects the usual update order, any other
+ *            value the pipelined ordering
+ * pr         MCG_PR selects the Polak-Ribiere expression for beta
+ * alpha,beta start values; only the pipelined branch reads them before
+ *            assigning them
+ *
+ * Returns the number of iterations performed in this call. */
+static inline unsigned int inner_loop(spinor32 * const x, spinor32 * const p, spinor32 * const q, spinor32 * const r, float * const rho1, const float delta,
+                                      matrix_mult32 f32, const float eps_sq, const unsigned int N, const unsigned int iter, const unsigned max_iter,
+                                      float alpha, float beta, MCG_PIPELINED_TYPE pipelined, MCG_PR_TYPE pr ){
+
+  float rho, rhomax, pro;
+  unsigned int j = 0;
+
+  rho = *rho1;
+  rhomax = *rho1;
+
+  if(pipelined==MCG_NO_PIPELINED){
+    /* break out of inner loop if iterated residual goes below some fraction of the maximum observed
+     * iterated residual since the last update */
+    while( rho > delta*rhomax && j+iter <= max_iter ){
+      ++j;
+      f32(q,p);
+      pro = scalar_prod_r_32(p,q,N,1);
+      alpha = rho/pro;
+      assign_add_mul_r_32(x, p, alpha, N);
+      assign_add_mul_r_32(r, q, -alpha, N);
+      rho = square_norm_32(r,N,1);
+      // Polak-Ribiere computation of beta, claimed to be self-stabilising, positive effect so far not observed or required
+      if(pr==MCG_PR){
+        beta = alpha*(alpha*square_norm_32(q,N,1)-pro) / *rho1;
+      }else{
+        beta = rho / *rho1;
+      }
+      *rho1 = rho;
+      assign_mul_add_r_32(p, beta, r, N);
+      if(g_debug_level > 2 && g_proc_id == 0) {
+        printf("SP_inner CG: %u res^2 %g\t\n", j+iter, rho);
+      }
+      /* enforce convergence more strictly by a factor of 1.3 to avoid unnecessary restarts
+       * if the real residual is still a bit too large */
+      if( 1.3*rho < eps_sq ) break;
+      if( rho > rhomax ) rhomax = rho;
+    }
+  }else{
+    // pipelined cg requires one more scalar product but may allow optimisations to be made
+    // e.g.: one could do the collective communication for sqrnrm(r) while other stuff is being computed
+    // it is also self-initialising (alpha=0, beta=0 will work)
+    while( rho > delta*rhomax && j+iter <= max_iter ){
+      ++j;
+      assign_add_mul_r_32(x, p, alpha, N);
+      assign_add_mul_r_32(r, q, -alpha, N);
+      assign_mul_add_r_32(p, beta, r, N);
+      f32(q,p);
+
+      rho = square_norm_32(r,N,1);
+      pro = scalar_prod_r_32(p,q,N,1);
+      alpha = rho/pro;
+      if(pr==MCG_PR){
+        beta = alpha*(alpha*square_norm_32(q,N,1)-pro)/rho;
+      }else{
+        beta = rho/ *rho1;
+      }
+      *rho1=rho;
+
+      if(g_debug_level > 2 && g_proc_id == 0) {
+        printf("SP_inner CG: %u res^2 %g\t\n", j+iter, rho);
+      }
+      if( 1.3*rho < eps_sq ) break;
+      if( rho > rhomax ) rhomax = rho;
+    }
+  }
+
+  return j;
+}
+
+
+/* Mixed precision CG with reliable updates for f P = Q.
+ *
+ * P output = solution , Q input = source
+ *
+ * P              result; zeroed on entry, an initial guess is not used
+ * Q              source
+ * solver_params  only mcg_delta is read here; it is the residual reduction
+ *                at which an inner solve hands back to the outer loop
+ * max_iter       limit on the summed inner and outer iteration count
+ * eps_sq         target squared residual; multiplied by the squared norm of
+ *                Q if rel_prec == 1
+ * N              field length; N == VOLUME allocates VOLUMEPLUSRAND sized
+ *                work fields, any other value VOLUMEPLUSRAND/2
+ * f, f32         the operator in double and in single precision
+ *
+ * Single precision inner solves alternate with a double precision update of
+ * P and of the true residual Q - f(P).  Once iter_out reaches N_outer-2 the
+ * remaining inner solves are done in double precision.
+ * g_sloppy_precision_flag is set to 0 for the duration of the call and
+ * restored on both return paths.
+ *
+ * Returns the total iteration count (inner sp + inner dp + outer) if the
+ * true squared residual is <= the target while that count is still below
+ * max_iter; returns -1 if the count has reached max_iter or the outer loop
+ * runs out. */
+int rg_mixed_cg_her(spinor * const P, spinor * const Q, solver_params_t solver_params,
+                    const int max_iter, const double eps_sq, const int rel_prec,
+                    const int N, matrix_mult f, matrix_mult32 f32) {
+
+  int iter_in_sp = 0, iter_in_dp = 0, iter_out = 0;
+  float rho_sp, delta = solver_params.mcg_delta;
+  double beta_dp, rho_dp;
+  double sourcesquarenorm, target_eps_sq;
+
+  spinor *xhigh, *rhigh, *qhigh, *phigh;
+  spinor32 *x, *p, *q, *r;
+
+  spinor ** solver_field = NULL;
+  spinor32 ** solver_field32 = NULL;
+  const int nr_sf = 4;
+  const int nr_sf32 = 4;
+
+  /* 0: inner solves in single precision, 1: fall back to double precision */
+  int high_control = 0;
+
+  double atime, etime;
+
+  if(N == VOLUME) {
+    init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf);
+    init_solver_field_32(&solver_field32, VOLUMEPLUSRAND, nr_sf32);
+  }
+  else {
+    init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf);
+    init_solver_field_32(&solver_field32, VOLUMEPLUSRAND/2, nr_sf32);
+  }
+
+  atime = gettime();
+
+  // we could get away with using fewer fields, of course
+  phigh = solver_field[3];
+  xhigh = solver_field[2];
+  rhigh = solver_field[1];
+  qhigh = solver_field[0];
+
+  x = solver_field32[3];
+  r = solver_field32[2];
+  p = solver_field32[1];
+  q = solver_field32[0];
+
+  // we always want to apply the full precision operator in double
+  int save_sloppy = g_sloppy_precision_flag;
+  g_sloppy_precision_flag = 0;
+
+  sourcesquarenorm = square_norm(Q,N,1);
+  if( rel_prec == 1 ) {
+    target_eps_sq = eps_sq*sourcesquarenorm;
+    if(g_debug_level > 0 && g_proc_id==0)
+      printf("#RG_Mixed CG: Using relative precision! eps_sq: %.6g target_eps_sq: %.6g \n",eps_sq,target_eps_sq);
+  }else{
+    target_eps_sq = eps_sq;
+  }
+
+  // compute maximum expected number of outer iterations based on expected reduction
+  // of the residual at each run of the inner solver
+  // NOTE(review): if this evaluates to N_outer <= 1 the outer loop below is never
+  // entered, so the result of the first inner solve is never added to P and -1 is
+  // returned with P == 0; a zero source or zero target also makes the argument of
+  // log10 zero or infinite before the cast to int. Left unchanged here.
+  int N_outer = (int)ceil(log10( sourcesquarenorm*delta/target_eps_sq ));
+  if(g_debug_level > 0 && g_proc_id==0)
+    printf("#RG_Mixed CG: N_outer: %d \n", N_outer);
+
+  // should compute real residual here and solve subtracted problem with initial guess
+  // for now we always use a zero guess
+  zero_spinor_field_32(x,N);
+  zero_spinor_field(P,N);
+  assign(phigh,Q,N);
+  assign(rhigh,Q,N);
+
+  rho_dp = square_norm(rhigh,N,1);
+  assign_to_32(r,rhigh,N);
+  rho_sp = rho_dp;
+  assign_32(p,r,N);
+
+  iter_in_sp += inner_loop(x, p, q, r, &rho_sp, delta, f32, (float)target_eps_sq,
+                           N, iter_out+iter_in_sp+iter_in_dp, max_iter, 0.0, 0.0, MCG_NO_PIPELINED, MCG_NO_PR);
+
+  for(iter_out = 1; iter_out < N_outer; ++iter_out) {
+
+    // prepare for defect correction
+    // update high precision solution
+    if(high_control==0) {
+      // accumulate solution (sp -> dp)
+      addto_32(P,x,N);
+      // compute real residual
+      f(qhigh,P);
+      diff(rhigh,Q,qhigh,N);
+      // beta_dp ends up as new true residual / previous one (debug output only)
+      beta_dp = 1/rho_dp;
+      rho_dp = square_norm(rhigh,N,1);
+      beta_dp *= rho_dp;
+    }
+
+    // the iteration limit was reached in the previous iteration, let's try to save the day using double precision
+    if( high_control==1 ) {
+      assign(phigh,rhigh,N);
+      zero_spinor_field(xhigh,N);
+      beta_dp = 1/rho_dp;
+      iter_in_dp += inner_loop_high(xhigh, phigh, qhigh, rhigh, &rho_dp, delta, f,
+                                    target_eps_sq, N, iter_out+iter_in_sp+iter_in_dp, max_iter);
+      rho_sp = rho_dp;
+      // accumulate solution
+      add(P,P,xhigh,N);
+      // compute real residual
+      f(qhigh,P);
+      diff(rhigh,Q,qhigh,N);
+      rho_dp = square_norm(rhigh,N,1);
+      beta_dp *= rho_dp;
+    }
+
+    if(g_debug_level > 2 && g_proc_id == 0) {
+      printf("RG_mixed CG last inner residue: %17g\n", rho_sp);
+      printf("RG_mixed CG true residue: %6d %10g\n", iter_in_sp+iter_in_dp+iter_out, rho_dp);
+      printf("RG_mixed CG residue reduction factor: %6d %10g\n", iter_in_sp+iter_in_dp+iter_out, beta_dp); fflush(stdout);
+    }
+
+    if( rho_dp <= target_eps_sq || (iter_in_sp+iter_in_dp+iter_out) >= max_iter ) {
+      etime = gettime();
+      output_flops(etime-atime, N, iter_out, iter_in_sp, iter_in_dp, eps_sq);
+
+      g_sloppy_precision_flag = save_sloppy;
+      finalize_solver(solver_field, nr_sf);
+      finalize_solver_32(solver_field32, nr_sf32);
+      if( (iter_in_sp+iter_in_dp+iter_out) >= max_iter ){
+        return(-1);
+      } else {
+        return(iter_in_sp+iter_in_dp+iter_out);
+      }
+    }
+
+    // if it seems like we're stuck and reaching the iteration limit, we skip this correction and proceed in full precision above
+    if( iter_out >= (N_outer-2) ){
+      if(g_proc_id==0) printf("mixed CG: Reaching iteration limit, switching to DP!\n");
+      high_control = 1;
+      continue;
+    }else{
+      // correct defect
+      assign_to_32(r,rhigh,N);
+      rho_sp = rho_dp; // not sure if it's fine to truncate this or whether one should calculate it in SP directly, it seems to work fine though
+      assign_32(p,r,N);
+    }
+
+    zero_spinor_field_32(x,N);
+    iter_in_sp += inner_loop(x, p, q, r, &rho_sp, delta, f32, (float)target_eps_sq,
+                             N, iter_out+iter_in_sp+iter_in_dp, max_iter, 0.0, 0.0, MCG_NO_PIPELINED, MCG_NO_PR);
+  }
+
+  // convergence failure...
+  g_sloppy_precision_flag = save_sloppy;
+  finalize_solver(solver_field, nr_sf);
+  finalize_solver_32(solver_field32, nr_sf32);
+  return -1;
+}
+
+/* Prints iteration counts, the time to solution and a flop estimate on
+ * process 0 when g_debug_level > 0; does nothing otherwise.  The estimate
+ * is declared wrong by the output itself -- only the timing is meaningful. */
+void output_flops(const double seconds, const unsigned int N, const unsigned int iter_out, const unsigned int iter_in_sp, const unsigned int iter_in_dp, const double eps_sq){
+  double flops;
+  // TODO: compute real number of flops...
+  int total_it = iter_in_sp+iter_in_dp+iter_out;
+  if(g_debug_level > 0 && g_proc_id == 0) {
+    /* the three counters are unsigned */
+    printf("# RG_mixed CG: iter_out: %u iter_in_sp: %u iter_in_dp: %u\n",iter_out,iter_in_sp,iter_in_dp);
+    if(N != VOLUME){
+      /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */
+      /* 2*1608.0 because the linalg is over VOLUME/2 */
+      flops = (2*(2*1608.0+2*3*4) + 2*3*4 + total_it*(2.*(2*1608.0+2*3*4) + 10*3*4))*N/1.0e6f;
+    }
+    else{
+      /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */
+      flops = (2*(1608.0+2*3*4) + 2*3*4 + total_it*(2.*(1608.0+2*3*4) + 10*3*4))*N/1.0e6f;
+    }
+    printf("#RG_mixed CG: iter: %d eps_sq: %1.4e t/s: %1.4e\n", total_it, eps_sq, seconds);
+    printf("# FIXME: note the following flop counts are wrong! Consider only the time to solution!\n");
+    printf("#RG_mixed CG: flopcount (for e/o tmWilson only): t/s: %1.4e mflops_local: %.1f mflops: %.1f\n",
+           seconds, flops/(seconds), g_nproc*flops/(seconds));
+  }
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/rg_mixed_cg_her.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/rg_mixed_cg_her.h
new file mode 100644
index 0000000000000000000000000000000000000000..dade21af978402d84f6a842e077f64f7047a0916
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/rg_mixed_cg_her.h
@@ -0,0 +1,32 @@
+/***********************************************************************
+ * Copyright (C) 2015 Bartosz Kostrzewa
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _RG_MIXED_CG_HER_H +#define _RG_MIXED_CG_HER_H + +#include "operator/tm_operators_32.h" +#include "solver/rg_mixed_cg_typedef.h" +#include "solver/matrix_mult_typedef.h" +#include "solver/solver_params.h" +#include "su3.h" + +int rg_mixed_cg_her(spinor * const P, spinor * const Q, solver_params_t solver_params, + const int max_iter, const double eps_sq, const int rel_prec, + const int N, matrix_mult f, matrix_mult32 f32); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/rg_mixed_cg_her_nd.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/rg_mixed_cg_her_nd.c new file mode 100644 index 0000000000000000000000000000000000000000..de5643f746fb0f46fb4304f5eaf1d053176c13a0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/rg_mixed_cg_her_nd.c @@ -0,0 +1,363 @@ +/*********************************************************************** + * Copyright (C) 2016 Bartosz Kostrzewa, Florian Burger + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + ********************** + * rg_mixed_cg_her_nd * + ********************** + * + * Mixed precision solver which uses true reliable updates and has a double + * precision fail-safe mechanism. 
The Polak-Ribiere computation of beta is + * implemented but currently not used because the extra scalar product is + * more expensive than the gain from the self-stabilisation as far as has + * been tested. + * + * in: + * Q: source + * inout: + * P: result (initial guess currently not supported) + * + * POSSIBLE IMPROVEMENTS + * There are still quite a few things that can be tried to make it better, + * the most significant of which would be to guide the search direction + * using the previous one upon restart. However, it seems that for the number + * non-zero entries in the Dirac operator and usual lattice sizes, the + * requisite projection + * + * p' = r - / p + * + * cannot be computed with sufficient precision in 64 bit arithmetic. It should + * be noted that for L < 24 in general, this does work and produces + * a mixed solver which converges at the same rate as a double solver, but it's + * not generally useable... For point sources, it also works for larger lattice + * volumes. Might be introduced as an optional mode in the future with some + * fail-safe mechanism which detects if the search direction begins to diverge. 
+ *
+ **************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include<config.h>
+#endif
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include "global.h"
+#include "su3.h"
+#include "linalg_eo.h"
+#include "start.h"
+#include "operator/tm_operators_32.h"
+#include "operator/clovertm_operators_32.h"
+#include "solver/matrix_mult_typedef_nd.h"
+#include "solver/solver_params.h"
+#include "read_input.h"
+
+#include "solver_field.h"
+#include "solver/rg_mixed_cg_her.h"
+#include "gettime.h"
+
+static void output_flops(const double seconds, const unsigned int N, const unsigned int iter_out,
+                         const unsigned int iter_in_sp, const unsigned int iter_in_dp, const double eps_sq);
+
+/* Double precision CG inner loop for the two-component (up/dn) system.
+ *
+ * x_*   iterate, updated in place
+ * p_*   search direction, updated in place
+ * q_*   work fields, receive f applied to (p_up, p_dn)
+ * r_*   iterated residual, updated in place
+ * rho1  squared norm of the residual (up plus dn) on entry; holds the last
+ *       iterated value on exit
+ * iter  iterations already spent by the caller; the loop stops as soon as
+ *       j+iter exceeds max_iter
+ *
+ * All scalar products and norms are the sum of the up and the dn part.
+ * Returns the number of iterations performed in this call.
+ * The temporaries are all assigned before they are read, so they are plain
+ * automatic variables; nothing is carried over between calls. */
+static inline unsigned int inner_loop_high(spinor * const x_up, spinor * const x_dn,
+                                           spinor * const p_up, spinor * const p_dn,
+                                           spinor * const q_up, spinor * const q_dn,
+                                           spinor * const r_up, spinor * const r_dn,
+                                           double * const rho1, const double delta,
+                                           matrix_mult_nd f, const double eps_sq, const unsigned int N, const unsigned int iter, const unsigned int max_iter ){
+
+  double alpha, beta, rho, rhomax;
+  unsigned int j = 0;
+
+  rho = *rho1;
+  rhomax = *rho1;
+
+  /* break out of inner loop if iterated residual goes below some fraction of the maximum observed
+   * iterated residual since the last update or if the target precision has been reached
+   * enforce convergence more strictly by a factor of 1.3 to avoid unnecessary restarts
+   * if the real residual is still a bit too large */
+  while( rho > delta*rhomax && j+iter <= max_iter ){
+    ++j;
+    f(q_up,q_dn,p_up,p_dn);
+    alpha = rho/( scalar_prod_r(p_up,q_up,N,1) + scalar_prod_r(p_dn,q_dn,N,1) );
+    assign_add_mul_r(x_up, p_up, alpha, N); assign_add_mul_r(x_dn, p_dn, alpha, N);
+    assign_add_mul_r(r_up, q_up, -alpha, N); assign_add_mul_r(r_dn, q_dn, -alpha, N);
+    rho = ( square_norm(r_up,N,1) + square_norm(r_dn,N,1) );
+    beta = rho / *rho1;
+    *rho1 = rho;
+    assign_mul_add_r(p_up, beta, r_up, N); assign_mul_add_r(p_dn, beta, r_dn, N);
+
+    if( 1.3*rho < eps_sq ) break;
+    if( rho > rhomax ) rhomax = rho;
+
+    if(g_debug_level > 2 && g_proc_id == 0) {
+      /* j+iter is unsigned */
+      printf("DP_inner CG: %u res^2 %g\t\n", j+iter, rho);
+    }
+  }
+
+  return j;
+}
+
+/* Single precision CG inner loop for the two-component system; arguments
+ * and stopping rules as in inner_loop_high.
+ *
+ * pipelined  MCG_NO_PIPELINED selects the usual update order, any other
+ *            value the pipelined ordering
+ * pr         MCG_PR selects the Polak-Ribiere expression for beta
+ * alpha,beta start values; only the pipelined branch reads them before
+ *            assigning them
+ *
+ * Returns the number of iterations performed in this call. */
+static inline unsigned int inner_loop(spinor32 * const x_up, spinor32 * const x_dn,
+                                      spinor32 * const p_up, spinor32 * const p_dn,
+                                      spinor32 * const q_up, spinor32 * const q_dn,
+                                      spinor32 * const r_up, spinor32 * const r_dn,
+                                      float * const rho1, const float delta,
+                                      matrix_mult_nd32 f32, const float eps_sq, const unsigned int N, const unsigned int iter, const unsigned int max_iter,
+                                      float alpha, float beta, MCG_PIPELINED_TYPE pipelined, MCG_PR_TYPE pr ){
+
+  float rho, rhomax, pro;
+  unsigned int j = 0;
+
+  rho = *rho1;
+  rhomax = *rho1;
+
+  if(pipelined==MCG_NO_PIPELINED){
+    /* break out of inner loop if iterated residual goes below some fraction of the maximum observed
+     * iterated residual since the last update */
+    while( rho > delta*rhomax && j+iter <= max_iter ){
+      ++j;
+      f32(q_up,q_dn,p_up,p_dn);
+      pro = ( scalar_prod_r_32(p_up,q_up,N,1) + scalar_prod_r_32(p_dn,q_dn,N,1) );
+      alpha = rho/pro;
+      assign_add_mul_r_32(x_up, p_up, alpha, N); assign_add_mul_r_32(x_dn, p_dn, alpha, N);
+      assign_add_mul_r_32(r_up, q_up, -alpha, N); assign_add_mul_r_32(r_dn, q_dn, -alpha, N);
+      rho = ( square_norm_32(r_up,N,1) + square_norm_32(r_dn,N,1) );
+      // Polak-Ribiere computation of beta, claimed to be self-stabilising, positive effect so far not observed or required
+      if(pr==MCG_PR){
+        beta = alpha*(alpha*(square_norm_32(q_up,N,1)+square_norm_32(q_dn,N,1)) - pro) / *rho1;
+      }else{
+        beta = rho / *rho1;
+      }
+      *rho1 = rho;
+      assign_mul_add_r_32(p_up, beta, r_up, N); assign_mul_add_r_32(p_dn, beta, r_dn, N);
+      if(g_debug_level > 2 && g_proc_id == 0) {
+        printf("SP_inner CG_ND: %u res^2 %g\t\n", j+iter, rho);
+      }
+      /* enforce convergence more strictly by a factor of 1.3 to avoid unnecessary restarts
+       * if the real residual is still a bit too large */
+      if( 1.3*rho < eps_sq ) break;
+      if( rho > rhomax ) rhomax = rho;
+    }
+  }else{
+    // pipelined cg requires one more scalar product but may allow optimisations to be made
+    // e.g.: one could do the collective communication for sqrnrm(r) while other stuff is being computed
+    // it is also self-initialising (alpha=0, beta=0 will work)
+    while( rho > delta*rhomax && j+iter <= max_iter ){
+      ++j;
+      assign_add_mul_r_32(x_up, p_up, alpha, N); assign_add_mul_r_32(x_dn, p_dn, alpha, N);
+      assign_add_mul_r_32(r_up, q_up, -alpha, N); assign_add_mul_r_32(r_dn, q_dn, -alpha, N);
+      assign_mul_add_r_32(p_up, beta, r_up, N); assign_mul_add_r_32(p_dn, beta, r_dn, N);
+      f32(q_up,q_dn,p_up,p_dn);
+
+      rho = ( square_norm_32(r_up,N,1) + square_norm_32(r_dn,N,1) );
+      pro = ( scalar_prod_r_32(p_up,q_up,N,1) + scalar_prod_r_32(p_dn,q_dn,N,1) );
+      alpha = rho/pro;
+      if(pr==MCG_PR){
+        beta = alpha*(alpha*(square_norm_32(q_up,N,1)+square_norm_32(q_dn,N,1))-pro)/rho;
+      }else{
+        beta = rho/ *rho1;
+      }
+      *rho1=rho;
+
+      if(g_debug_level > 2 && g_proc_id == 0) {
+        printf("SP_inner CG_ND: %u res^2 %g\t\n", j+iter, rho);
+      }
+      if( 1.3*rho < eps_sq ) break;
+      if( rho > rhomax ) rhomax = rho;
+    }
+  }
+
+  return j;
+}
+
+
+/* P output = solution , Q input = source */
+int rg_mixed_cg_her_nd(spinor * const P_up, spinor * const P_dn, spinor * const Q_up, spinor * const Q_dn,
+                       solver_params_t solver_params, const int max_iter, const double eps_sq, const int rel_prec,
+                       const int N, matrix_mult_nd f, matrix_mult_nd32 f32) {
+
+  int iter_in_sp = 0, iter_in_dp = 0, iter_out = 0;
+  float rho_sp, delta = solver_params.mcg_delta;
+  double beta_dp, rho_dp;
+  double sourcesquarenorm, target_eps_sq;
+
+  spinor *xhigh_up, *xhigh_dn, *rhigh_up, *rhigh_dn, *qhigh_up, *qhigh_dn, *phigh_up, *phigh_dn;
+  spinor32 *x_up, *x_dn, *p_up, *p_dn, *q_up, *q_dn, *r_up, *r_dn;
+
+  spinor ** solver_field = NULL;
+  spinor32 ** solver_field32 = NULL;
+  const int nr_sf = 8;
+  const int nr_sf32 = 8;
+
+  int
high_control = 0; + + double atime, etime, flops; + + if(N == VOLUME) { + init_solver_field(&solver_field, VOLUMEPLUSRAND, nr_sf); + init_solver_field_32(&solver_field32, VOLUMEPLUSRAND, nr_sf32); + } + else { + init_solver_field(&solver_field, VOLUMEPLUSRAND/2, nr_sf); + init_solver_field_32(&solver_field32, VOLUMEPLUSRAND/2, nr_sf32); + } + + atime = gettime(); + + // we could get away with using fewer fields, of course + phigh_up = solver_field[7]; phigh_dn = solver_field[6]; + xhigh_up = solver_field[5]; xhigh_dn = solver_field[4]; + rhigh_up = solver_field[3]; rhigh_dn = solver_field[2]; + qhigh_up = solver_field[1]; qhigh_dn = solver_field[0]; + + x_up = solver_field32[7]; x_dn = solver_field32[6]; + r_up = solver_field32[5]; r_dn = solver_field32[4]; + p_up = solver_field32[3]; p_dn = solver_field32[2]; + q_up = solver_field32[1]; q_dn = solver_field32[0]; + + // we always want to apply the full precision operator in double + int save_sloppy = g_sloppy_precision_flag; + g_sloppy_precision_flag = 0; + + sourcesquarenorm = ( square_norm(Q_up,N,1) + square_norm(Q_dn,N,1) ); + if( rel_prec == 1 ) { + target_eps_sq = eps_sq*sourcesquarenorm; + if(g_debug_level > 0 && g_proc_id==0) + printf("#RG_Mixed CG_ND: Using relative precision! 
eps_sq: %.6g target_eps_sq: %.6g \n",eps_sq,target_eps_sq); + }else{ + target_eps_sq = eps_sq; + } + + // compute the maximum number of outer iterations based on the expected reduction + // of the residual at each run of the inner solver + int N_outer = (int)ceil(log10( sourcesquarenorm*delta/target_eps_sq )); + if(g_debug_level > 0 && g_proc_id==0) + printf("#RG_Mixed CG_ND: N_outer: %d \n", N_outer); + + // should compute real residual here, for now we always use a zero guess + zero_spinor_field_32(x_up,N); zero_spinor_field_32(x_dn,N); + zero_spinor_field(P_up,N); zero_spinor_field(P_dn,N); + assign(phigh_up,Q_up,N); assign(phigh_dn,Q_dn,N); + assign(rhigh_up,Q_up,N); assign(rhigh_dn,Q_dn,N); + + rho_dp = ( square_norm(rhigh_up,N,1) + square_norm(rhigh_dn,N,1) ); + assign_to_32(r_up,rhigh_up,N); assign_to_32(r_dn,rhigh_dn,N); + rho_sp = rho_dp; + assign_32(p_up,r_up,N); assign_32(p_dn,r_dn,N); + + iter_in_sp += inner_loop(x_up, x_dn, p_up, p_dn, q_up, q_dn, r_up, r_dn, &rho_sp, delta, + f32, (float)target_eps_sq, + N, iter_out+iter_in_sp+iter_in_dp, max_iter, 0.0, 0.0, MCG_NO_PIPELINED, MCG_NO_PR); + + for(iter_out = 1; iter_out < N_outer; ++iter_out ) { + + // prepare for defect correction + // update high precision solution + if(high_control==0) { + // accumulate solution (sp -> dp) + addto_32(P_up,x_up,N); addto_32(P_dn,x_dn,N); + // compute real residual + f(qhigh_up,qhigh_dn,P_up,P_dn); + diff(rhigh_up,Q_up,qhigh_up,N); diff(rhigh_dn,Q_dn,qhigh_dn,N); + beta_dp = 1/rho_dp; + rho_dp = ( square_norm(rhigh_up,N,1) + square_norm(rhigh_dn,N,1) ); + beta_dp *= rho_dp; + } + + // the iteration limit was reached in the previous iteration, let's try to save the day using double precision + if( high_control==1 ) { + assign(phigh_up,rhigh_up,N); assign(phigh_dn,rhigh_dn,N); + zero_spinor_field(xhigh_up,N); zero_spinor_field(xhigh_dn,N); + beta_dp = 1/rho_dp; + iter_in_dp += inner_loop_high(xhigh_up, xhigh_dn, phigh_up, phigh_dn, + qhigh_up, qhigh_dn, rhigh_up, 
rhigh_dn, &rho_dp, delta, f, + target_eps_sq, N, iter_out+iter_in_sp+iter_in_dp, max_iter); + rho_sp = rho_dp; + // accumulate solution + add(P_up,P_up,xhigh_up,N); add(P_dn, P_dn, xhigh_dn, N); + // compute real residual + f(qhigh_up, qhigh_dn, P_up, P_dn); + diff(rhigh_up,Q_up,qhigh_up,N); diff(rhigh_dn,Q_dn,qhigh_dn,N); + rho_dp = ( square_norm(rhigh_up,N,1) + square_norm(rhigh_dn,N,1) ); + beta_dp *= rho_dp; + } + + if(g_debug_level > 2 && g_proc_id == 0) { + printf("RG_mixed CG_ND last inner residue: %17g\n", rho_sp); + printf("RG_mixed CG_ND true residue: %6d %10g\n", iter_in_sp+iter_in_dp+iter_out, rho_dp); + printf("RG_mixed CG_ND residue reduction factor: %6d %10g\n", iter_in_sp+iter_in_dp+iter_out, beta_dp); fflush(stdout); + } + + if( rho_dp <= target_eps_sq || (iter_in_sp+iter_in_dp+iter_out) >= max_iter ) { + etime = gettime(); + output_flops(etime-atime, N, iter_out, iter_in_sp, iter_in_dp, eps_sq); + + g_sloppy_precision_flag = save_sloppy; + finalize_solver(solver_field, nr_sf); + finalize_solver_32(solver_field32, nr_sf32); + if( (iter_in_sp+iter_in_dp+iter_out) >= max_iter ){ + return(-1); + } else { + return(iter_in_sp+iter_in_dp+iter_out); + } + } + + // if it seems like we're stuck and reaching the iteration limit, we skip this correction and proceed in full precision above + if( iter_out >= (N_outer-2) ){ + if(g_proc_id==0) printf("RG_mixed CG_ND: Reaching iteration limit, switching to DP!\n"); + high_control = 1; + continue; + }else{ + // correct defect + assign_to_32(r_up,rhigh_up,N); assign_to_32(r_dn,rhigh_dn,N); + rho_sp = rho_dp; // not sure if it's fine to truncate this or whether one should calculate it in SP directly, it seems to work fine though + assign_32(p_up,r_up,N); assign_32(p_dn,r_dn,N); + } + + zero_spinor_field_32(x_up,N); zero_spinor_field_32(x_dn,N); + iter_in_sp += inner_loop(x_up, x_dn, p_up, p_dn, q_up, q_dn, r_up, r_dn, &rho_sp, delta, f32, (float)target_eps_sq, + N, iter_out+iter_in_sp+iter_in_dp, max_iter, 0.0, 0.0, 
MCG_NO_PIPELINED, MCG_NO_PR); + } + + // convergence failure... + g_sloppy_precision_flag = save_sloppy; + finalize_solver(solver_field, nr_sf); + finalize_solver_32(solver_field32, nr_sf32); + return -1; +} + +void output_flops(const double seconds, const unsigned int N, const unsigned int iter_out, const unsigned int iter_in_sp, const unsigned int iter_in_dp, const double eps_sq){ + double flops; + // TODO: compute real number of flops... + int total_it = iter_in_sp+iter_in_dp+iter_out; + if(g_debug_level > 0 && g_proc_id == 0) { + printf("# RG_mixed CG_ND: iter_out: %d iter_in_sp: %d iter_in_dp: %d\n",iter_out,iter_in_sp,iter_in_dp); + if(N != VOLUME){ + /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */ + /* 2*1608.0 because the linalg is over VOLUME/2 */ + flops = 2*(2*(2*1608.0+2*3*4) + 2*3*4 + total_it*(2.*(2*1608.0+2*3*4) + 10*3*4))*N/1.0e6f; + } + else{ + /* 2 A + 2 Nc Ns + N_Count ( 2 A + 10 Nc Ns ) */ + flops = 2*(2*(1608.0+2*3*4) + 2*3*4 + total_it*(2.*(1608.0+2*3*4) + 10*3*4))*N/1.0e6f; + } + printf("#RG_mixed CG_ND: iter: %d eps_sq: %1.4e t/s: %1.4e\n", total_it, eps_sq, seconds); + printf("# FIXME: note the following flop counts are wrong! Consider only the time to solution!\n"); + printf("#RG_mixed CG_ND: flopcount (for e/o tmWilson only): t/s: %1.4e mflops_local: %.1f mflops: %.1f\n", + seconds, flops/(seconds), g_nproc*flops/(seconds)); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/rg_mixed_cg_her_nd.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/rg_mixed_cg_her_nd.h new file mode 100644 index 0000000000000000000000000000000000000000..4a093091f7fbc4de69af917c40d48b17c597a50e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/rg_mixed_cg_her_nd.h @@ -0,0 +1,32 @@ +/*********************************************************************** + * Copyright (C) 2016 Bartosz Kostrzewa + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _RG_MIXED_CG_HER_ND_H +#define _RG_MIXED_CG_HER_ND_H + +#include "operator/tm_operators_32.h" +#include "solver/matrix_mult_typedef_nd.h" +#include "solver/solver_params.h" +#include "solver/rg_mixed_cg_typedef.h" +#include "su3.h" + +int rg_mixed_cg_her_nd(spinor * const Pup, spinor * const Pdn, spinor * const Qup, spinor * const Qdn, + solver_params_t solver_params, const int max_iter, const double eps_sq, const int rel_prec, + const int N, matrix_mult_nd f, matrix_mult_nd32 f32); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/rg_mixed_cg_typedef.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/rg_mixed_cg_typedef.h new file mode 100644 index 0000000000000000000000000000000000000000..61c367342f3228fd5d7bb521ef82f19632379650 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/rg_mixed_cg_typedef.h @@ -0,0 +1,21 @@ +#ifndef _RG_MIXED_CG_HER_TYPEDEF_H +#define _RG_MIXED_CG_HER_TYPEDEF_H + +typedef enum MCG_PR_TYPE { + MCG_NO_PR=0, + MCG_PR +} MCG_PR_TYPE; + +typedef enum MCG_PIPELINED_TYPE { + MCG_NO_PIPELINED=0, + MCG_PIPELINED +} MCG_PIPELINED_TYPE; + +// currently not used +typedef enum MCG_RESGUIDE_TYPE { + MCG_NO_RESGUIDE=0, + MCG_RESGUIDE +} MCG_RESGUIDE_TYPE; + + +#endif diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/solver.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/solver.h new file mode 100644 index 0000000000000000000000000000000000000000..36944add5aad86a52b1501c8c7de5a9516757566 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/solver.h @@ -0,0 +1,88 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifndef _SOLVER_H +#define _SOLVER_H + + +#include"solver/solver_types.h" + +#include"solver/matrix_mult_typedef.h" +#include "solver/matrix_mult_typedef_bi.h" +#include "solver/matrix_mult_typedef_nd.h" + +typedef struct { + // solver type + int type; + // maximal number of iterations + int max_iter; + // use relative precision + int rel_prec; + // number of shifts in multi shift solvers + int no_shifts; + // dimension of spinors + int sdim; + // squared desired residue + double squared_solver_prec; + // single flavour matrix to invert + matrix_mult M_psi; + // 32bit single flavour matrix to invert + matrix_mult32 M_psi32; + // flavour doublet matrix to invert + matrix_mult_nd M_ndpsi; + // 32bit flavour doublet matrix to invert + matrix_mult_nd32 M_ndpsi32; + // pointer to array of shifts + double * shifts; +} solver_pm_t; + +#include"solver/gmres.h" +#include"solver/gmres_dr.h" +#include"solver/fgmres.h" +#include"solver/bicgstab_complex.h" +#include"solver/cgs_real.h" +#include"solver/bicgstabell.h" +#include"solver/bicgstab2.h" +#include"solver/cg_her.h" +#include"solver/pcg_her.h" +#include"solver/mr.h" +#include"solver/gcr.h" +#include"solver/incr_eigcg.h" +#include"solver/eigenvalues.h" +#include"solver/cg_mms_tm.h" +#include"solver/mixed_cg_her.h" +#include "solver/rg_mixed_cg_her.h" + +#include"solver/sub_low_ev.h" +#include"solver/gmres_precon.h" +#include"solver/poly_precon.h" + +#include "solver/bicgstab_complex_bi.h" +#include "solver/cg_her_bi.h" + +#include "solver/cg_her_nd.h" +#include "solver/rg_mixed_cg_her_nd.h" +#include"solver/cg_mms_tm_nd.h" +#include"solver/mixed_cg_mms_tm_nd.h" + +#include "solver/generate_dfl_subspace.h" + +#include "solver/sumr.h" + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/solver_field.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/solver_field.c new file mode 100644 index 
0000000000000000000000000000000000000000..507d9055a1c71098560e377dca44e0d91109275e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/solver_field.c @@ -0,0 +1,154 @@ +/*********************************************************************** + * + * Copyright (C) 2009,2011 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include"global.h" +#include"su3.h" +#include"solver_field.h" + +int init_solver_field(spinor *** const solver_field, const int V, const int nr) { + int i=0; + + /* allocate nr+1 to save the linear field in solver_field[nr] */ + if((void*)((*solver_field) = (spinor**)malloc((nr+1)*sizeof(spinor*))) == NULL) { + printf ("malloc errno in init_solver_field: %d\n",errno); + errno = 0; + return(2); + } + + /* allocate the full chunk of memory to solver_field[nr] */ +#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR)) + if((void*)((*solver_field)[nr] = (spinor*)shmalloc((nr*V+1)*sizeof(spinor))) == NULL) { + fprintf (stderr, "malloc errno in init_solver_field: %d\n",errno); + errno = 0; + return(1); + } +#else + if((void*)((*solver_field)[nr] = (spinor*)calloc(nr*V+1, sizeof(spinor))) == NULL) { + printf ("malloc errno in init_solver_field: %d\n",errno); + errno = 0; + return(1); + 
} +#endif + + /* now cut in pieces and distribute to solver_field[0]-solver_field[nr-1] */ +#if ( defined SSE || defined SSE2 || defined SSE3) + (*solver_field)[0] = (spinor*)(((unsigned long int)((*solver_field)[nr])+ALIGN_BASE)&~ALIGN_BASE); +#else + (*solver_field)[0] = (*solver_field)[nr]; +#endif + for(i = 1; i < nr; i++){ + (*solver_field)[i] = (*solver_field)[i-1]+V; + } + return(0); +} + +void finalize_solver(spinor ** solver_field, const int nr){ + free(solver_field[nr]); + free(solver_field); + solver_field = NULL; +} + + + + + +int init_solver_field_32(spinor32 *** const solver_field, const int V, const int nr) { + int i=0; + + /* allocate nr+1 to save the linear field in solver_field[nr] */ + if((void*)((*solver_field) = (spinor32**)malloc((nr+1)*sizeof(spinor32*))) == NULL) { + printf ("malloc errno in init_solver_field: %d\n",errno); + errno = 0; + return(2); + } + + /* allocate the full chunk of memory to solver_field[nr] */ +#if (defined _USE_SHMEM && !(defined _USE_HALFSPINOR)) + if((void*)((*solver_field)[nr] = (spinor32*)shmalloc((nr*V+1)*sizeof(spinor32))) == NULL) { + fprintf (stderr, "malloc errno in init_solver_field: %d\n",errno); + errno = 0; + return(1); + } +#else + if((void*)((*solver_field)[nr] = (spinor32*)calloc(nr*V+1, sizeof(spinor32))) == NULL) { + printf ("malloc errno in init_solver_field: %d\n",errno); + errno = 0; + return(1); + } +#endif + + /* now cut in pieces and distribute to solver_field[0]-solver_field[nr-1] */ +#if ( defined SSE || defined SSE2 || defined SSE3) + (*solver_field)[0] = (spinor32*)(((unsigned long int)((*solver_field)[nr])+ALIGN_BASE32)&~ALIGN_BASE32); +#else + (*solver_field)[0] = (spinor32*)(((unsigned long int)((*solver_field)[nr])+ALIGN_BASE32)&~ALIGN_BASE32); +#endif + for(i = 1; i < nr; i++){ + (*solver_field)[i] = (*solver_field)[i-1]+V; + } + return(0); +} + +void finalize_solver_32(spinor32 ** solver_field, const int nr){ + free(solver_field[nr]); + free(solver_field); + solver_field = NULL; +} + 
+int init_bisolver_field(bispinor *** const solver_field, const int V, const int nr) { + int i=0; + + /* allocate nr+1 to save the linear field in solver_field[nr] */ + if((void*)((*solver_field) = (bispinor**)malloc((nr+1)*sizeof(bispinor*))) == NULL) { + printf ("malloc errno in init_solver_field: %d\n",errno); + errno = 0; + return(2); + } + + /* allocate the full chunk of memory to solver_field[nr] */ + if((void*)((*solver_field)[nr] = (bispinor*)calloc(nr*V+1, sizeof(bispinor))) == NULL) { + printf ("malloc errno in init_solver_field: %d\n",errno); + errno = 0; + return(1); + } + + /* now cut in pieces and distribute to solver_field[0]-solver_field[nr-1] */ +#if ( defined SSE || defined SSE2 || defined SSE3) + (*solver_field)[0] = (bispinor*)(((unsigned long int)((*solver_field)[nr])+ALIGN_BASE)&~ALIGN_BASE); +#else + (*solver_field)[0] = (*solver_field)[nr]; +#endif + for(i = 1; i < nr; i++){ + (*solver_field)[i] = (*solver_field)[i-1]+V; + } + return(0); +} + +void finalize_bisolver(bispinor ** solver_field, const int nr) { + free(solver_field[nr]); + free(solver_field); + solver_field = NULL; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/solver_field.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/solver_field.h new file mode 100644 index 0000000000000000000000000000000000000000..91c7cac3d3a23d62d0d82f34315fc4584c68cf36 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/solver_field.h @@ -0,0 +1,33 @@ +/*********************************************************************** + * + * Copyright (C) 2009 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + *******************************************************************************/ + +#ifndef _SOLVER_FIELD_H +#define _SOLVER_FIELD_H + +#include"su3.h" + +int init_solver_field(spinor *** const solver_field, const int V, const int nr); +void finalize_solver(spinor ** solver_field, const int nr); +int init_solver_field_32(spinor32 *** const solver_field, const int V, const int nr); +void finalize_solver_32(spinor32 ** solver_field, const int nr); +int init_bisolver_field(bispinor *** const solver_field, const int V, const int nr); +void finalize_bisolver(bispinor ** solver_field, const int nr); +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/solver_params.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/solver_params.h new file mode 100644 index 0000000000000000000000000000000000000000..485c16ccb3127713324c7bb968f72aa3d06233a9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/solver_params.h @@ -0,0 +1,62 @@ +/*************************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see .
+ ****************************************************************************/
+
+/*****************************************************************************
+ * Struct for passing the parameters for the solver. This should replace all
+ * solver related parameters and eliminate the need for global parameters as
+ * this struct will be a member of the operator struct.
+ *
+ * A. M. Abdel-Rehim (a.abdel-rehim@cyi.ac.cy)
+ * March, 17th, 2013
+ ****************************************************************************/
+
+
+#ifndef _SOLVER_PARAMS_H
+#define _SOLVER_PARAMS_H
+
+typedef struct {
+
+  /********************************
+   * Incremental EigCG parameters
+   ********************************/
+
+  int eigcg_nrhs; /*total number of right-hand sides to be solved*/
+  int eigcg_nrhs1; /*The number of right-hand sides where we solve to tolerance tolsq1
+                     remaining systems will be solved to tolsq*/
+  int eigcg_nev; /*number of eigenvalues computed from a single right-hand side */
+  int eigcg_vmax; /*size of the search subspace for eigcg*/
+  int eigcg_ldh; /*total number of eigenvectors that will be computed and used in deflation */
+  double eigcg_tolsq1; /*squared tolerance for the first n1 systems */
+  double eigcg_tolsq; /*squared tolerance for the rest of the linear systems*/
+  double eigcg_restolsq; /*tolerance squared for restarting eigcg after the eigenvectors have been computed
+                           Typically this is the square root of the tolerance squared requested for the linear system.
+                           Example, to solve the linear systems to squared residual 1e-16, one chooses eigcg_restolsq=1e-8 or smaller
+                           This will specify how many times deflated CG is restarted in the second phase (after the eigenvectors have been computed)*/
+  int eigcg_rand_guess_opt; /*set to 0 to use 0 initial guesses or non-zero values if you want to use random initial guess as a volume source */
+
+  /* factor below which the iterated residual has to drop to trigger a
+     reliable update in the mixed solver
+     if( <r,r> < delta * max( <r,r> ) )
+     where the maximum is over the iterated residuals since the last update */
+  float mcg_delta;
+
+} solver_params_t;
+
+#endif
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/solver_types.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/solver_types.h
new file mode 100644
index 0000000000000000000000000000000000000000..3999efe749a5b2f91941785c71ec73f6de0e38df
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/solver_types.h
@@ -0,0 +1,26 @@
+#ifndef _SOLVER_TYPES_H
+#define _SOLVER_TYPES_H
+
+typedef enum SOLVER_TYPE {
+  BICGSTAB = 0,
+  CG,
+  GMRES,
+  CGS,
+  MR,
+  BICGSTABELL,
+  FGMRES,
+  GCR,
+  GMRESDR,
+  PCG,
+  DFLGCR,
+  DFLFGMRES,
+  CGMMS,
+  MIXEDCG,
+  RGMIXEDCG,
+  CGMMSND,
+  INCREIGCG,
+  MIXEDCGMMSND,
+  SUMR
+} SOLVER_TYPE;
+
+#endif
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/spectral_proj.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/spectral_proj.c
new file mode 100644
index 0000000000000000000000000000000000000000..8ffbcea8ca1182fdc54e6ac2f4f4cc6a252c2e79
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/spectral_proj.c
@@ -0,0 +1,124 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2011 Elena Garcia-Ramos
+ *
+ * This file is part of tmLQCD.
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "start.h" +#include "su3.h" +#include "linalg_eo.h" +#include "chebyshev_polynomial_nd.h" +#include +#include "solver/solver.h" +#include "solver/jdher.h" +#include "solver/eigenvalues.h" +#include "X_psi.h" +#include "gamma.h" +#include "P_M_eta.h" +#include "spectral_proj.h" + +double mode_n; + +double mode_number(spinor * const S, double const mstarsq){ + + printf("Starting mode_number calculation...\n");fflush(stdout); + spinor **s,*s_; + + s_ = calloc(2*VOLUMEPLUSRAND+1, sizeof(spinor)); + s = calloc(2, sizeof(spinor*)); + + for(int i = 0; i < 2; i++) { +#if (defined SSE3 || defined SSE2 || defined SSE) + s[i] = (spinor*)(((unsigned long int)(s_)+ALIGN_BASE)&~ALIGN_BASE)+i*VOLUMEPLUSRAND; +#else + s[i] = s_+i*VOLUMEPLUSRAND; +#endif +} + + /* Computing P_M = h(X)^2 */ + h_X_sqr_eta(s[0],s[1],S,mstarsq); + + /* Computing the mode number nu = (|eta>,P_M|eta>)=||h(X)^2|eta>||^2 */ + /* being |eta> the stochastic source */ + + mode_n=square_norm(s[1], VOLUME, 1); + + if(g_proc_id == 0) { + printf("The Value of the Mode Number is %f \n", mode_n); + } + + free(s); + free(s_); + return(mode_n); +} + + +void top_sus(spinor * const S, double const mstarsq){ + + double mode_num, topo_sus = 0.0; 
+ double A = 0.0, B = 0.0, C = 0.0; + spinor **s, *s_; + + s_ = calloc(25*VOLUMEPLUSRAND+1, sizeof(spinor)); + s = calloc(25, sizeof(spinor*)); + + for(int i = 0; i < 25; i++) { +#if (defined SSE3 || defined SSE2 || defined SSE) + s[i] = (spinor*)(((unsigned long int)(s_)+ALIGN_BASE)&~ALIGN_BASE)+i*VOLUMEPLUSRAND; +#else + s[i] = s_+i*VOLUMEPLUSRAND; +#endif +} + + /* s[0]=h(X)|eta> s[2]=h(X)^2|eta>*/ + h_X_sqr_eta(s[0],s[2],S,mstarsq); + + /* s[2]=[gamma5 h(X)]|eta>*/ + gamma5(s[1],s[0], VOLUME); + + /* s[3]=[h(X) gamma5 h(X)}|eta> */ + h_X_eta(s[3], s[1], mstarsq); + + + /* A = (h(X)^2|eta>,h(X)^2|eta>) */ + A=scalar_prod_r(s[2],s[2], VOLUME, 1); + + /* B = ([h(X) gamma5 h(X)]|eta>,[h(X) gamma5 h(X)]|eta>)*/ + B=scalar_prod_r(s[3],s[3], VOLUME, 1); + + /* C = ([h(X)]|eta>,[gamma5 h(X)]|eta>) */ + C=scalar_prod_r(s[0],s[1], VOLUME, 1); + + + if(g_proc_id == 0) { + printf("A = %f \n", A); + printf("B = %f \n", B); + printf("C = %f \n", C); + printf("C^2 = %f \n", C*C); + } + + free(s); + free(s_); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/spectral_proj.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/spectral_proj.h new file mode 100644 index 0000000000000000000000000000000000000000..cb0c11c7d206495b4cd707cebf6c9a0a7838cb90 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/spectral_proj.h @@ -0,0 +1,32 @@ +/*********************************************************************** + * + * Copyright (C) 2011 Elena Garcia-Ramos + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _SPECTRAL_PROJECTOR_H +#define _SPECTRAL_PROJECTOR_H + +#include "su3.h" + +extern double mode_n; + +double mode_number(spinor * const S, double const mstarsq); + +void top_sus(spinor * const S, double const mstarsq); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/sub_low_ev.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/sub_low_ev.c new file mode 100644 index 0000000000000000000000000000000000000000..e29449707755bac8fb5a67a69d8c96aad0572eda --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/sub_low_ev.c @@ -0,0 +1,133 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +/******************************************************************** + * + * void sub_low_ev(spinor *S, spinor *P) + * makes + * |S> = |P> - Sum_{1}^{nev-1} *|eigen_i> + * + * where |eigen_i> is the i-th lowest eigenvectors of Q + * + * + * void addproj_q_invsqrt(spinor *Q, spinor *P) + * makes + * |Q'> = |Q> + Sum_{1}^{nev-1} sign(eigen_i)**|eigen_i> + * + * where |Q> = Q/Sqrt(Q^2)|S> and thus |Q'>= Q/Sqrt(Q^2)|P> + * + * + * Author: M.Papinutto + * Date: 11.03.2003 + * + * void sub_lowest_eigenvalues(spinor * const Q, spinor * const P, const int n) + * + * computes: Q=Q-sum_i lambda_i |eigen_i> + * where eigen_i is the i-th lowest eigenvector and + * lambda_i the i-th eigenvalue (of Q^2) + * Input: + * P + * n : number of eigenvectors to be subtracted + * Inout: + * Q + * + * void assign_add_invert_subtracted_part(spinor * const Q, spinor * const P, const int n) + * + * computes: Q = Q + sum_i 1/lambda_i |eigen_i> + * conventions as obove + * + * Input: + * P + * n : number of eigenvectors to be subtracted + * Inout: + * Q + * + * For the last two routines a previous call of + * eigenvalues or eigenvalues_for_cg must + * be done + * + * Autor: Carsten Urbach + * + ********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "linalg_eo.h" +#include "eigenvalues.h" +#include "sub_low_ev.h" + + +/* Q=Q-sum_i lambda_i |eigen_i> */ +void sub_lowest_eigenvalues(spinor * const Q, spinor * const P, const int n, const int N) { + int i; + _Complex double c; + + for(i = 0; i < n; i++){ + c = scalar_prod(&(eigenvectors[i*evlength]), P, N, 1); + c *= -eigenvls[i]; + assign_add_mul(Q, &eigenvectors[i*evlength], c, N); + } +} + +/* Q=P-sum_i |eigen_i> */ +void assign_sub_lowest_eigenvalues(spinor * const Q, spinor * const P, const int n, const int N) { + int i; + _Complex double c; + + 
assign(Q, P, N); + + for(i = 0; i < n; i++){ + c = scalar_prod(&(eigenvectors[i*evlength]), P, N, 1); + c = -c; + assign_add_mul(Q, &eigenvectors[i*evlength], c, N); + + } +} + +/* Q = Q + sum_i 1/lambda_i |eigen_i> */ +void assign_add_invert_subtracted_part(spinor * const Q, spinor * const P, const int n, const int N) { + int i=0; + _Complex double c; + double rev=0; + + for(i = 0; i < n; i++){ + c = scalar_prod(&eigenvectors[i*evlength], P, N, 1); + rev = 1./eigenvls[i]; + c *= rev; + assign_add_mul(Q, &eigenvectors[i*evlength], c, N); + } +} + +void invert_eigenvalue_part(spinor * const Q, spinor * const P, const int n, const int N) { + _Complex double c; + double rev=0; + + assign(Q, P, N); + for(int i = 0; i < n; ++i) + { + c = scalar_prod(&eigenvectors[i*evlength], P, N, 1); + c *= -inv_eigenvls[i]; + assign_add_mul(Q, &eigenvectors[i*evlength], c, N); + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/sub_low_ev.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/sub_low_ev.h new file mode 100644 index 0000000000000000000000000000000000000000..6c3ec6cbe9b55f5a3668b84cf4bcf59611c7442f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/sub_low_ev.h @@ -0,0 +1,32 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _SUB_LOW_EV_H +#define _SUB_LOW_EV_H + +#include "su3.h" + +void sub_lowest_eigenvalues(spinor * const , spinor * const, const int n, const int N); +void assign_sub_lowest_eigenvalues(spinor * const , spinor * const, const int n, const int N); +void assign_add_invert_subtracted_part(spinor * const Q, spinor * const P, const int n, const int N); +void invert_eigenvalue_part(spinor * const Q, spinor * const P, const int n, const int N); +#endif + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/sumr.c b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/sumr.c new file mode 100644 index 0000000000000000000000000000000000000000..e96ab4112315bab8fb8ea1b3d756a013cb114fd9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/sumr.c @@ -0,0 +1,285 @@ +/*********************************************************************** + * + * Copyright (C) 2005 Luigi Scorzato + * 2009 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + * File: sumr.c + * + * + * The externally accessible functions are + * + * + * int sumr(spinor * const P, spinor * const Q, int max_iter, double eps_sq) + * Inverter for shifted unitary matrices + * [C.F.Jagels L.Reichel, Num. Lin. Alg. with Appl. Vol1(6),555-570 (1994)] + * [first applied to the Overlap in hep-lat/0311025] + * + * input: + * Q: source + * inout: + * P: initial guess and result + * + * Author: Luigi.Scorzato@physik.hu-berlin.de + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "linalg_eo.h" +#include "start.h" +#include "solver/matrix_mult_typedef.h" +#include +#include "gamma.h" +#include "solver/eigenvalues.h" +#include "solver/sub_low_ev.h" +#include "operator/Dov_psi.h" +#include "solver_field.h" +#include "sumr.h" + +#define DEBUG_SUMR 0 + +/* to be fixed somewhere else */ + + +/* P output = solution , Q input = source */ +int sumr(spinor * const P, spinor * const Q, const int max_iter, + double eps_sq){ + double sigma, delta, s, rho, zeta, z_r, tmpr, normsp, err, tau_hat; + double ov_s, m_ov=0.; + _Complex double phi, phi_hat, tau, lambda, c, gamm, alpha, eta, kappa; + _Complex double r_hat, r_off, r_diag, r_diag_old, tmpc, tmpc1, tmpc2; + int iteration; + /* to be fixed somewhere else */ + const int N=VOLUME; + spinor *x, *r, *p, *v, *v_til, *w, *u, *b, *tmp, *tmp2; + spinor ** solver_field = NULL; + printf("Starting SUMR!\n"); + /* !!!! INITIALIZATION !!!! */ + init_solver_field(&solver_field, VOLUMEPLUSRAND, 10); + x = solver_field[0]; + r = solver_field[1]; + p = solver_field[2]; + v = solver_field[3]; + v_til = solver_field[4]; + w = solver_field[5]; + u = solver_field[6]; + b = solver_field[7]; + tmp = solver_field[8]; + tmp2 = solver_field[9]; + + assign(b, Q, N); + assign(x, P, N); + normsp = square_norm(P, N, 1); + + ov_s = 0.5 * (1. / g_kappa - 8.) 
- 1.; + rho = ov_s+1 - m_ov / 2.; + zeta = ov_s+1 + m_ov / 2.; + z_r = zeta / rho; + + if(normsp == 0) { + /* if a starting solution vector equal to zero is chosen */ + delta = sqrt(square_norm(b, N, 1)); + assign(r, b, N); + } + else { + /* if a starting solution vector different from zero is chosen */ + Dov_psi(tmp, x); + diff(r, b, tmp, N); + delta = sqrt(square_norm(r, N, 1)); + } + + phi_hat = 1 / delta; + tau_hat = delta / rho; + zero_spinor_field(p, N); + phi = 0.0; + s = 0.; + lambda = 0.0; + r_diag_old = 1.0; + gamm = 1.0; + sigma = 1.; + c = 1.0; + mul(v_til, phi_hat, r, N); + assign(v, v_til, N); + +#if DEBUG_SUMR ==1 + printf("delta=%g;\t phihat=%g;\t tauhat=%g;\t w=%g;\t p=%g;\t phi=%g;\t s=%g;\t lambda=%g;\t r_off=%g;\t r_off_old=%g;\t r_diag=%g;\t r_diag_old=%g;\t gamm=%g;\t sigma=%g;\t c=%g;\t v=%g;\t v_til=%g;\t ", + delta,cabs(phi_hat),tau_hat,square_norm(w),square_norm(p), + cabs(phi),s,cabs(lambda),cabs(r_off),cabs(r_off_old),cabs(r_diag),cabs(r_diag_old),cabs(gamm),sigma,cabs(c), + square_norm(v),square_norm(v_til)); +#endif + + if(ov_cheby_coef==NULL) calculateOverlapPolynomial(); + + /* main loop */ + for(iteration = 0; iteration < max_iter; iteration++) { + Q_over_sqrt_Q_sqr(tmp, ov_cheby_coef, ov_n_cheby, v, ev_qnorm, ev_minev); + gamma5(u, tmp, N); +#if DEBUG_SUMR ==1 + printf("u=%g;\t\n", square_norm(u)); +#endif + gamm = scalar_prod(v_til, u, N, 1); + gamm = -(gamm); +#if DEBUG_SUMR ==1 + printf("gamm=%g,%g;\t\n",creal(gamm),cimag(gamm)); +#endif + sigma= sqrt((1 - cabs(gamm))*(1 + cabs(gamm))); +#if DEBUG_SUMR ==1 + printf("sigma=%g;\t\n", sigma); +#endif + alpha = -gamm * delta; +#if DEBUG_SUMR ==1 + printf("alpha=%g,%g;\t\n",creal(alpha),cimag(alpha)); +#endif + r_off = s*z_r; + r_off += (alpha) * (phi); +#if DEBUG_SUMR ==1 + printf("r_off=%g,%g;\t\n",creal(r_off),cimag(r_off)); +#endif + tmpc = conj(c); + r_hat = (tmpc) * (z_r); + r_hat += (alpha) * (phi_hat); +#if DEBUG_SUMR ==1 + printf("r_hat=%g,%g;\t\n",creal(r_hat), 
cimag(r_hat)); +#endif + tmpr = 1/(sqrt(creal(r_hat * conj(r_hat)) + (sigma*sigma))); + tmpc = (r_hat) * (tmpr); + c = conj(tmpc); +#if DEBUG_SUMR ==1 + printf("c=%g,%g;\t\n",creal(c),cimag(c)); +#endif + s=-sigma * tmpr; +#if DEBUG_SUMR ==1 + printf("s=%g;\t\n", s); +#endif + r_diag = s*sigma; + r_diag -= c * r_hat; +#if DEBUG_SUMR ==1 + printf("r_diag=%g,%g;\t\n",creal(r_diag),cimag(r_diag)); +#endif + tau = -c * tau_hat; +#if DEBUG_SUMR ==1 + printf("tau=%g,%g;\t\n",creal(tau),cimag(tau)); +#endif + tau_hat *= s; +#if DEBUG_SUMR ==1 + printf("tau_hat=%g;\t\n", tau_hat); +#endif + eta = tau / r_diag; +#if DEBUG_SUMR ==1 + printf("eta=%g,%g;\t\n",creal(eta),cimag(eta)); +#endif + kappa = r_off / r_diag_old; +#if DEBUG_SUMR ==1 + printf("kappa=%g,%g;\t\n",creal(kappa),cimag(kappa)); +#endif + zero_spinor_field(w, N); + assign_add_mul_add_mul(w, p, tmp2, alpha, kappa, N); +#if DEBUG_SUMR ==1 + printf("w=%g;\t\n", square_norm(w)); +#endif + assign_add_mul(p, tmp2, lambda, N); +#if DEBUG_SUMR ==1 + printf("p=%g;\t\n", square_norm(p, N, 1)); +#endif + diff(tmp2, v, w, N); +#if DEBUG_SUMR ==1 + printf("w-v=%g;\t\n", square_norm(tmp2, N, 1)); +#endif + assign_add_mul(x, tmp2, eta, N); +#if DEBUG_SUMR ==1 + printf("x=%g;\t\n", square_norm(x, N, 1)); +#endif + + if(sigma==0) { + printf("Exit because Sigma = %g\n",sigma); + finalize_solver(solver_field, 10); + return(iteration); + } + /* Check whether the precision is reached ... 
*/ + err = tau_hat * tau_hat; + +#if DEBUG_SUMR ==1 + tmpr = square_norm(x, N, 1); + if(g_proc_id == g_stdio_proc) { + printf("it, tau,sigma, ||x||^2: %d\t%g\t%g\t%g\n",iteration,err,sigma,tmpr); + fflush( stdout); + } +#endif + if ((iteration%10) == 0 && g_proc_id == g_stdio_proc ) { + printf("SUMR iteration= %d\t|res|^2= %g\n",iteration,err); + /* fflush( stdout); */ + } + + if (err <= eps_sq) { + assign(P, x, N); + finalize_solver(solver_field, 10); + return(iteration); + } + + delta = delta * sigma; +#if DEBUG_SUMR ==1 + printf("delta=%g;\t\n", delta); +#endif + tmpc = conj(gamm); + tmpc1 = conj(c); + phi = (tmpc) * (s / delta); + phi -= (c) * (phi_hat); +#if DEBUG_SUMR ==1 + printf("phi=%g;\t\n", phi); +#endif + + lambda = (phi) / (r_diag); +#if DEBUG_SUMR ==1 + printf("lambda=%g;\t\n", lambda); +#endif + + tmpc1 = (tmpc1) / (delta); + tmpc2 = (tmpc1) * (tmpc); + phi_hat = (phi_hat) * (s); + phi_hat += tmpc2; +#if DEBUG_SUMR ==1 + printf("phi_hat=%g;\t\n", phi_hat); +#endif + + assign(tmp, u, N); + assign_add_mul(tmp, v_til, gamm, N); + mul_r(v, 1 / sigma,tmp, N); +#if DEBUG_SUMR ==1 + printf("v=%g;\t\n", square_norm(v, N, 1)); +#endif + + mul(tmp, tmpc, v, N); + assign_mul_add_r(v_til, sigma, tmp, N); +#if DEBUG_SUMR ==1 + printf("v_til=%g;\t\n", square_norm(v_til, N, 1)); + printf("############################\n"); +#endif + + r_diag_old = r_diag; + } + finalize_solver(solver_field, 10); + return(-1); +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/solver/sumr.h b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/sumr.h new file mode 100644 index 0000000000000000000000000000000000000000..681593ed2e45ffc89ca10eccf1acdd957ced3bab --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/solver/sumr.h @@ -0,0 +1,10 @@ +#ifndef _SUMR_H +#define _SUMR_H + +#include"solver/matrix_mult_typedef.h" +#include"su3.h" + +int sumr(spinor * const, spinor * const, const int max_iter, double eps_sq); +int sumr_mms(spinor **** const, spinor * const, const 
int max_iter, double eps_sq, int is, int ic); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/source_generation.c b/qcd/part_cpu/applications/QCD/src/kernel_D/source_generation.c new file mode 100644 index 0000000000000000000000000000000000000000..f5d7b6984d8d635c9314e9cbc9154a68a50c23fc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/source_generation.c @@ -0,0 +1,458 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include "global.h" +#include "start.h" +#include "ranlxd.h" +#include "su3spinor.h" +#include "source_generation.h" + +#ifndef M_PI +# define M_PI 3.14159265358979323846 +#endif + +/* Generates normal distributed random numbers */ +/* using the box-muller method */ +/* this is even standard normal distributed */ +/* so mean = 0, sd = 1 */ +void rnormal(double * r, const int n) +{ + double u[2], s, l; + int i; + + /* basic form, but trig. 
functions needed */ +/* for(i = 0; i < n; i+=2) { */ +/* ranlxd(u, 2); */ +/* l = sqrt(-2*log(u[0])); */ +/* r[i] = l*cos(2*M_PI*u[1]); */ +/* r[i+1] = l*sin(2*M_PI*u[1]); */ +/* printf("%f\n", r[i]); */ +/* printf("%f\n", r[i+1]); */ +/* } */ +/* return; */ + /* polar form, no trig. functions, but more random numbers */ + /* which one is faster? */ + for(i = 0; i < n; i += 2) { + ranlxd(u, 2); + u[0] = 2.*u[0] - 1.; + u[1] = 2.*u[1] - 1.; + s = u[0]*u[0]+u[1]*u[1]; + while(s == 0. || s > 1.) { + ranlxd(u, 2); + u[0] = 2.*u[0] - 1.; + u[1] = 2.*u[1] - 1.; + s = u[0]*u[0]+u[1]*u[1]; + } + l = sqrt(-2.*log(s)/s); + r[i] = u[0]*l; + r[i+1] = u[1]*l; + } + return; +} + +/* Generates a volume source with gaussian noise */ +/* in all real and imaginary elements */ +/* */ +/* i.e. xi*.xi = 2 */ +/* is the normalisation */ +/* this is corrected for in the contraction */ +/* codes */ +void gaussian_volume_source(spinor * const P, spinor * const Q, + const int sample, const int nstore, const int f) +{ + int x, y, z, t, i, reset = 0, seed; + int rlxd_state[105]; + spinor * p; + + /* save the ranlxd_state if neccessary */ + if(ranlxd_init == 1) { + rlxd_get(rlxd_state); + reset = 1; + } + + /* Compute the seed */ + seed =(int) abs(1 + sample + f*10*97 + nstore*100*53 + g_cart_id*13); + + rlxd_init(2, seed); + + for(t = 0; t < T; t++) { + for(x = 0; x < LX; x++) { + for(y =0; y < LY; y++) { + for(z = 0; z < LZ; z++) { + i = g_lexic2eosub[ g_ipt[t][x][y][z] ]; + if((t+x+y+z+g_proc_coords[3]*LZ+g_proc_coords[2]*LY + + g_proc_coords[0]*T+g_proc_coords[1]*LX)%2 == 0) { + p = P + i; + } + else { + p = Q + i; + } + rnormal((double*)p, 24); + } + } + } + } + + /* reset the ranlxd if neccessary */ + if(reset) { + rlxd_reset(rlxd_state); + } + return; +} + +void extended_pion_source(spinor * const P, spinor * const Q, + spinor * const R, spinor * const S, + const int t0, + const double px, const double py, const double pz) { + int lt, lx, ly, lz, i, x, y, z, id=0, t; + int coords[4]; + 
spinor * p, * q, r; + _Complex double efac; + + zero_spinor_field(P,VOLUME/2); + zero_spinor_field(Q,VOLUME/2); + + t=((g_nproc_t*T)/2+t0)%(g_nproc_t*T); + lt = t - g_proc_coords[0]*T; + coords[0] = t / T; + for(x = 0; x < LX*g_nproc_x; x++) { + lx = x - g_proc_coords[1]*LX; + coords[1] = x / LX; + for(y = 0; y < LY*g_nproc_y; y++) { + ly = y - g_proc_coords[2]*LY; + coords[2] = y / LY; + for(z = 0; z < LZ*g_nproc_z; z++) { + lz = z - g_proc_coords[3]*LZ; + coords[3] = z / LZ; +#ifdef MPI + MPI_Cart_rank(g_cart_grid, coords, &id); +#endif + if(g_cart_id == id) { + efac = cexp(-(px * x + py * y + pz * z) * I); + + i = g_lexic2eosub[ g_ipt[lt][lx][ly][lz] ]; + if((lt+lx+ly+lz+g_proc_coords[3]*LZ+g_proc_coords[2]*LY + + g_proc_coords[0]*T+g_proc_coords[1]*LX)%2 == 0) { + p = P + i; + q = R + i; + } + else { + p = Q + i; + q = S + i; + } + _gamma5(r, (*q)); + _spinor_mul_complex((*p),efac,r); + } + } + } + } + return; +} + +void source_generation_pion_only(spinor * const P, spinor * const Q, + const int t, + const int sample, const int nstore) { + + int reset = 0, i, x, y, z, is, ic, lt, lx, ly, lz, id=0; + int coords[4], seed, r; + double rnumber, si=0., co=0.; + int rlxd_state[105]; + const double sqr2 = 1./sqrt(2.); + _Complex double * p = NULL; + + zero_spinor_field(P,VOLUME/2); + zero_spinor_field(Q,VOLUME/2); + + /* save the ranlxd_state if neccessary */ + if(ranlxd_init == 1) { + rlxd_get(rlxd_state); + reset = 1; + } + + /* Compute the seed */ + seed =(int) abs(1 + sample + t*10*97 + nstore*100*53); + + rlxd_init(2, seed); + + lt = t - g_proc_coords[0]*T; + coords[0] = t / T; + for(x = 0; x < LX*g_nproc_x; x++) { + lx = x - g_proc_coords[1]*LX; + coords[1] = x / LX; + for(y = 0; y < LY*g_nproc_y; y++) { + ly = y - g_proc_coords[2]*LY; + coords[2] = y / LY; + for(z = 0; z < LZ*g_nproc_z; z++) { + lz = z - g_proc_coords[3]*LZ; + coords[3] = z / LZ; +#ifdef MPI + MPI_Cart_rank(g_cart_grid, coords, &id); +#endif + for(is = 0; is < 4; is++) { + for(ic = 0; ic < 3; 
ic++) { + ranlxd(&rnumber, 1); + if(g_cart_id == id) { + r = (int)floor(4.*rnumber); + if(r == 0) + { + si = sqr2; + co = sqr2; + } + else if(r == 1) { + si = -sqr2; + co = sqr2; + } + else if(r==2) { + si = sqr2; + co = -sqr2; + } + else { + si = -sqr2; + co = -sqr2; + } + + i = g_lexic2eosub[ g_ipt[lt][lx][ly][lz] ]; + if((lt+lx+ly+lz+g_proc_coords[3]*LZ+g_proc_coords[2]*LY + + g_proc_coords[0]*T+g_proc_coords[1]*LX)%2 == 0) { + p = (_Complex double*)(P + i); + } + else { + p = (_Complex double*)(Q + i); + } + + (*(p+3*is+ic)) = co + si * I; + } + } + } + } + } + } + + /* reset the ranlxd if neccessary */ + if(reset) { + rlxd_reset(rlxd_state); + } + return; +} + +/* Florian Burger 4.11.2009 */ +void source_generation_pion_zdir(spinor * const P, spinor * const Q, + const int z, + const int sample, const int nstore) { + + int reset = 0, i, x, y, t, is, ic, lt, lx, ly, lz, id=0; + int coords[4], seed, r; + double rnumber, si=0., co=0.; + int rlxd_state[105]; + const double sqr2 = 1./sqrt(2.); + _Complex double * p = NULL; + + zero_spinor_field(P,VOLUME/2); + zero_spinor_field(Q,VOLUME/2); + + /* save the ranlxd_state if neccessary */ + if(ranlxd_init == 1) { + rlxd_get(rlxd_state); + reset = 1; + } + + /* Compute the seed */ + seed =(int) abs(1 + sample + z*10*97 + nstore*100*53); + + rlxd_init(2, seed); + lz = z - g_proc_coords[3]*LZ; + coords[3] = z / LZ; + for(t = 0; t < T*g_nproc_t; t++) { + lt = t - g_proc_coords[0]*T; + coords[0] = t / T; + for(x = 0; x < LX*g_nproc_x; x++) { + lx = x - g_proc_coords[1]*LX; + coords[1] = x / LX; + for(y = 0; y < LY*g_nproc_y; y++) { + ly = y - g_proc_coords[2]*LY; + coords[2] = y / LY; + +#ifdef MPI + MPI_Cart_rank(g_cart_grid, coords, &id); +#endif + for(is = 0; is < 4; is++) { + for(ic = 0; ic < 3; ic++) { + ranlxd(&rnumber, 1); + if(g_cart_id == id) { + r = (int)floor(4.*rnumber); + if(r == 0) { + si = sqr2; + co = sqr2; + } + else if(r == 1) { + si = -sqr2; + co = sqr2; + } + else if(r==2) { + si = sqr2; + co = -sqr2; + } 
+ else { + si = -sqr2; + co = -sqr2; + } + + i = g_lexic2eosub[ g_ipt[lt][lx][ly][lz] ]; + if((lt+lx+ly+lz+g_proc_coords[3]*LZ+g_proc_coords[2]*LY + + g_proc_coords[0]*T+g_proc_coords[1]*LX)%2 == 0) { + p = (_Complex double*)(P + i); + } + else { + p = (_Complex double*)(Q + i); + } + + (*(p+3*is+ic)) = co + si * I; + } + } + } + } + } + } + + /* reset the ranlxd if neccessary */ + if(reset) { + rlxd_reset(rlxd_state); + } + return; +} + +/* end Florian Burger 4.11.2009 */ + + + + + +void source_generation_nucleon(spinor * const P, spinor * const Q, + const int is, const int ic, + const int t, const int nt, const int nx, + const int sample, const int nstore, + const int meson) { + + double rnumber, si=0., co=0., sqr2; + int rlxd_state[105]; + int reset = 0, seed, r, tt, lt, xx, lx, yy, ly, zz, lz; + int coords[4], id=0, i; + _Complex double * p = NULL; + const double s0=0.; + const double c0=1.; + const double s1=sin(2.*M_PI/3.); + const double c1=cos(2.*M_PI/3.); + const double s2=sin(4.*M_PI/3.); + const double c2=cos(4.*M_PI/3.); + + zero_spinor_field(P,VOLUME/2); + zero_spinor_field(Q,VOLUME/2); + + sqr2 = 1./sqrt(2.); + /* save the ranlxd_state if neccessary */ + if(ranlxd_init == 1) { + rlxd_get(rlxd_state); + reset = 1; + } + + /* Compute the seed */ + seed =(int) abs(1 + sample + t*10*97 + nstore*100*53); + + rlxd_init(2, seed); + + for(tt = t; tt < T*g_nproc_t; tt+=nt) { + lt = tt - g_proc_coords[0]*T; + coords[0] = tt / T; + for(xx = 0; xx < LX*g_nproc_x; xx+=nx) { + lx = xx - g_proc_coords[1]*LX; + coords[1] = xx / LX; + for(yy = 0; yy < LY*g_nproc_y; yy+=nx) { + ly = yy - g_proc_coords[2]*LY; + coords[2] = yy / LY; + for(zz = 0; zz < LZ*g_nproc_z; zz+=nx) { + lz = zz - g_proc_coords[3]*LZ; + coords[3] = zz / LZ; +#ifdef MPI + MPI_Cart_rank(g_cart_grid, coords, &id); +#endif + ranlxd(&rnumber, 1); + if(g_cart_id == id) { + if(meson) { + r = (int)floor(4.*rnumber); + if(r == 0) { + si = sqr2; + co = sqr2; + } + else if(r == 1) { + si = -sqr2; + co = sqr2; 
+ } + else if(r==2) { + si = sqr2; + co = -sqr2; + } + else { + si = -sqr2; + co = -sqr2; + } + } + else { + r = (int)floor(3.*rnumber); + if(r == 0) { + si = s0; + co = c0; + } + else if(r == 1) { + si = s1; + co = c1; + } + else { + si = s2; + co = c2; + } + } + + i = g_lexic2eosub[ g_ipt[lt][lx][ly][lz] ]; + if((lt+lx+ly+lz+g_proc_coords[3]*LZ+g_proc_coords[2]*LY + + g_proc_coords[0]*T+g_proc_coords[1]*LX)%2 == 0) { + p = (_Complex double*)(P + i); + } + else { + p = (_Complex double*)(Q + i); + } + + (*(p+3*is+ic)) = co + si * I; + } + } + } + } + } + + /* reset the ranlxd if neccessary */ + if(reset) { + rlxd_reset(rlxd_state); + } + return; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/source_generation.h b/qcd/part_cpu/applications/QCD/src/kernel_D/source_generation.h new file mode 100644 index 0000000000000000000000000000000000000000..f733ea0aa469089d3782b63bf01d59550f0c158b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/source_generation.h @@ -0,0 +1,44 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _SOURCE_GENERATION_H +#define _SOURCE_GENERATION_H + +void gaussian_volume_source(spinor * const P, spinor * const Q, + const int sample, const int nstore, const int f); + +void source_generation_pion_only(spinor * const P, spinor * const Q, + const int t, + const int sample, const int nstore); + +void source_generation_nucleon(spinor * const P, spinor * const Q, + const int is, const int ic, + const int t, const int nt, const int nx, + const int sample, const int nstore, + const int meson); + +void extended_pion_source(spinor * const P, spinor * const Q, + spinor * const R, spinor * const S, + const int t0, + const double px, const double py, const double pz); + +void source_generation_pion_zdir(spinor * const P, spinor * const Q, + const int z, + const int sample, const int nstore); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/spinor_fft.c b/qcd/part_cpu/applications/QCD/src/kernel_D/spinor_fft.c new file mode 100644 index 0000000000000000000000000000000000000000..6fe193a6870cf33a9fc139fd6cde187971b258d9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/spinor_fft.c @@ -0,0 +1,494 @@ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include +#include +#include +#include +#include "start.h" +#include "linalg_eo.h" + +#include "spinor_fft.h" +#include "mpi_init.h" +#include "init/init.h" + +#ifdef HAVE_FFTW + #include +#endif + +#include + +void spinor_fft_print_reduct_dims(int *remaining_dims,FILE *logFile); + +#ifdef MPI +void check_mpi_comm_membership(MPI_Comm commself,MPI_Comm commcheck,const char *name_a,const char *name_b,FILE *logFile); +#endif + +#ifdef HAVE_FFTW +fftw_plan spinor_fftw_plan2d(spinor *spinor_in,spinor *spinor_out,int dim0,int dim1,int howmany,unsigned int forward,int fftw_flags); +#endif + +void spinor_fft_transpose_xp_t(spinor 
*fieldout,spinor* fieldin,int dim0,int dim1,int forward,double mulp); + + + +/** + * accumulates pieces of the spinor field on nodes with index 0 in the dimensions given in which + * the collected data is returned + */ +void spinor_fft_reduce_2d(spinor *localSpinorField,int *collectionRank,spinor*** field_collection,spinor **membuff){ + /* this implementation is intended for four dimensional parallelisation */ +#if (defined PARALLELXYZT && defined MPI && defined HAVE_FFTW) + + int sendRecvCoord[4]; + int i; + int dims[]={g_nproc_t,g_nproc_x,g_nproc_y,g_nproc_z}; + + + /* logfile variables */ + char *logFilePrefix="Process"; + char logFileName[512]; + FILE *logFile; + const int MSG_LOCALDATA = 457; + MPI_Status ierr; + MPI_Datatype mpi_local_spinor; + const int which[]={0,1}; + + + (*field_collection)=NULL; + (*membuff)=NULL; + +/* int result; */ + sprintf(logFileName,"./%s_%02d.log",logFilePrefix,g_cart_id); + logFile=fopen(logFileName,"a"); + + + MPI_Type_contiguous(VOLUME, field_point, &mpi_local_spinor); + MPI_Type_commit(&mpi_local_spinor); + + + for(i=0;i<4;i++) + sendRecvCoord[i]=g_proc_coords[i]; + + if( g_proc_coords[which[0]] == 0 && g_proc_coords[which[1]] == 0 ){ + + /* i am one of the nodes where data is accumulated */ + spinor **accu_field; + spinor **fft_field; + spinor *memory_buffer_accu_field; + spinor *memory_buffer_fft_field; + int REDUCTIONVOLUME=1; + int recvRank; + MPI_Request *requests; + MPI_Status *status; + int request_count=0; + int num_requests; + fftw_plan local_2d_fft_forward; + + *collectionRank=TRUE; + + /* calculate the number of reduced 2d volume accumulated in this node */ + + /* number of spinor fields in local units */ + REDUCTIONVOLUME*=dims[which[0]]*dims[which[1]]; + + /* number of receive messages */ + num_requests=REDUCTIONVOLUME-1; + + /* reserve space for receive messages */ + requests=(MPI_Request*)malloc(sizeof(MPI_Request)*num_requests); + status=(MPI_Status*)malloc(sizeof(MPI_Status)*num_requests); + + 
fprintf(logFile,"reduction volume = %d\n",REDUCTIONVOLUME); + + /* allocate space for spinor field collection */ + allocate_spinor_field_array(&accu_field,&memory_buffer_accu_field,VOLUME,REDUCTIONVOLUME); + allocate_spinor_field_array(&fft_field,&memory_buffer_fft_field,VOLUME,REDUCTIONVOLUME); + + + /* receive from certain nodes pieces of the spinor field */ + for(sendRecvCoord[which[0]] = 0 ; sendRecvCoord[which[0]]< dims[which[0]] ; sendRecvCoord[which[0]]++){ + for(sendRecvCoord[which[1]] = 0 ; sendRecvCoord[which[1]]< dims[which[1]] ; sendRecvCoord[which[1]]++){ + if( sendRecvCoord[which[0]] != 0 || sendRecvCoord[which[1]] != 0){ + + MPI_Cart_rank(g_cart_grid,sendRecvCoord,&recvRank); + + MPI_Irecv(accu_field[sendRecvCoord[which[0]]*dims[which[1]]+sendRecvCoord[which[1]] ] /* buffer */, + 1, /* how may */ + mpi_local_spinor, /* mpi data type */ + recvRank, /* from whom i get it */ + MSG_LOCALDATA, /* msg id */ + g_cart_grid, /* communicator , status */ + requests+request_count); + ++request_count; + + } + } + } + + + /* wait until all request finished */ + MPI_Waitall(num_requests, requests, status); + + assign(accu_field[0],localSpinorField,VOLUME); + + /* transpose in xp-t space */ + spinor_fft_transpose_xp_t(fft_field[0],accu_field[0],dims[0],dims[1],TRUE,1.); + + /* create fftw plan */ + local_2d_fft_forward=spinor_fftw_plan2d(fft_field[0],accu_field[0],T*dims[0],LX*dims[1],LY*LZ,1,FFTW_ESTIMATE); + fftw_execute(local_2d_fft_forward); + fftw_destroy_plan(local_2d_fft_forward); + +/* assign(accu_field[0],fft_field[0],VOLUME*REDUCTIONVOLUME); */ + + + free_spinor_field_array(&memory_buffer_fft_field); memory_buffer_fft_field=NULL; + +/* free_spinor_field_array(&memory_buffer_accu_field); memory_buffer_accu_field=NULL; */ + (*field_collection)=accu_field; + (*membuff)=memory_buffer_accu_field; + free(requests); requests = NULL; + free(status); status=NULL; + + } else { + int sendRank; + MPI_Request request; + MPI_Status status; + + *collectionRank=FALSE; + + 
/* coordinates of the "root" */ + sendRecvCoord[which[0]]=0; + sendRecvCoord[which[1]]=0; + + MPI_Cart_rank(g_cart_grid,sendRecvCoord,&sendRank); + + MPI_Isend(localSpinorField,1,mpi_local_spinor,sendRank,MSG_LOCALDATA,g_cart_grid,&request); + + MPI_Wait(&request,&status); + + } + + + MPI_Type_free(&mpi_local_spinor); + + fclose(logFile); + +#else + if(g_proc_id==0) + fprintf(stderr,"Error: Please choose FOUR dimensional parallelization!!!\n"); + +#endif +} + + +/** + * accumulates pieces of the spinor field on nodes with index 0 in the dimensions given in which + * the collected data is returned + */ +void spinor_fft_redist_2d(spinor *localSpinorField,int collectionRank,spinor** field_collection,spinor *membuff){ + /* this implementation is intended for four dimensional parallelisation */ +#if ( defined PARALLELXYZT && defined MPI && defined HAVE_FFTW) + + int sendRecvCoord[4]; + int i; + int dims[]={g_nproc_t,g_nproc_x,g_nproc_y,g_nproc_z}; + + + /* logfile variables */ + char *logFilePrefix="Process"; + char logFileName[512]; + FILE *logFile; + const int MSG_LOCALDATA = 5687; + MPI_Status ierr; + MPI_Datatype mpi_local_spinor; + const int which[]={0,1}; + + + +/* int result; */ + sprintf(logFileName,"./%s_%02d.log",logFilePrefix,g_cart_id); + logFile=fopen(logFileName,"a"); + + + /* new mpi type */ + MPI_Type_contiguous(VOLUME, field_point, &mpi_local_spinor); + MPI_Type_commit(&mpi_local_spinor); + + + for(i=0;i<4;i++) + sendRecvCoord[i]=g_proc_coords[i]; + + if( collectionRank == TRUE ){ + + /* i am one of the nodes where data is accumulated */ + spinor **accu_field=field_collection; + spinor **fft_field; + spinor *memory_buffer_accu_field=membuff; + spinor *memory_buffer_fft_field; + int REDUCTIONVOLUME=1; + int sendRank; + MPI_Request *requests; + MPI_Status *status; + int request_count=0; + int num_requests; + fftw_plan local_2d_fft_backward; + + + /* calculate the number of reduced 2d volume accumulated in this node */ + + /* number of spinor fields in 
local units */ + REDUCTIONVOLUME*=dims[which[0]]*dims[which[1]]; + + /* number of receive messages */ + num_requests=REDUCTIONVOLUME-1; + + /* reserve space for receive messages */ + requests=(MPI_Request*)malloc(sizeof(MPI_Request)*num_requests); + status=(MPI_Status*)malloc(sizeof(MPI_Status)*num_requests); + + fprintf(logFile,"reduction volume = %d\n",REDUCTIONVOLUME); + + /* allocate space for spinor field collection */ + allocate_spinor_field_array(&fft_field,&memory_buffer_fft_field,VOLUME,REDUCTIONVOLUME); + + + + /* create fftw plan */ + local_2d_fft_backward=spinor_fftw_plan2d(accu_field[0],fft_field[0],T*dims[0],LX*dims[1],LY*LZ,0,FFTW_ESTIMATE); + fftw_execute(local_2d_fft_backward); + fftw_destroy_plan(local_2d_fft_backward); + + +/* assign(fft_field[0],accu_field[0],VOLUME*REDUCTIONVOLUME); */ + + /* transpose in xp-t space */ + spinor_fft_transpose_xp_t(accu_field[0],fft_field[0],dims[0],dims[1],FALSE,1./(double)(T*dims[0] * LX*dims[1])); + + + + /* receive from certain nodes pieces of the spinor field */ + for(sendRecvCoord[which[0]] = 0 ; sendRecvCoord[which[0]]< dims[which[0]] ; sendRecvCoord[which[0]]++){ + for(sendRecvCoord[which[1]] = 0 ; sendRecvCoord[which[1]]< dims[which[1]] ; sendRecvCoord[which[1]]++){ + if( sendRecvCoord[which[0]] != 0 || sendRecvCoord[which[1]] != 0){ + + MPI_Cart_rank(g_cart_grid,sendRecvCoord,&sendRank); + + MPI_Isend(accu_field[sendRecvCoord[which[0]]*dims[which[1]]+sendRecvCoord[which[1]] ] /* buffer */, + 1, /* how may */ + mpi_local_spinor, /* mpi data type */ + sendRank, /* from whom i get it */ + MSG_LOCALDATA, /* msg id */ + g_cart_grid, /* communicator , status */ + requests+request_count); + ++request_count; + + } + } + } + + assign(localSpinorField,accu_field[0],VOLUME); + + + + /* wait until all request finished */ + MPI_Waitall(num_requests, requests, status); + + + free_spinor_field_array(&memory_buffer_fft_field); memory_buffer_fft_field=NULL; fft_field=NULL; + 
free_spinor_field_array(&memory_buffer_accu_field); memory_buffer_accu_field=NULL; accu_field=NULL; + + free(requests); requests = NULL; + free(status); status=NULL; + + } else { + int recvRank; + MPI_Request request; + MPI_Status status; + + + /* coordinates of the "root" */ + sendRecvCoord[which[0]]=0; + sendRecvCoord[which[1]]=0; + + MPI_Cart_rank(g_cart_grid,sendRecvCoord,&recvRank); + + MPI_Irecv(localSpinorField,1,mpi_local_spinor,recvRank,MSG_LOCALDATA,g_cart_grid,&request); + + MPI_Wait(&request,&status); + + } + + MPI_Type_free(&mpi_local_spinor); + + fclose(logFile); + +#else + if(g_proc_id==0) + fprintf(stderr,"Error: Please choose FOUR dimensional parallelization!!!\n"); + +#endif +} + + +#ifdef HAVE_FFTW +fftw_plan spinor_fftw_plan2d(spinor *spinor_in,spinor *spinor_out,int dim0,int dim1,int howmany_wospin,unsigned int forward,int fftw_flags){ + +/* int index_s = gsi(get_index(it, ix, iy, iz, T, L)); */ +/* double *xi_ = xi + index_s; */ + + int Dim1[2]; +/* cerr << "Trying to create a plan for T=" << T << " L=" << L ; */ +/* cerr.flush(); */ + + int rank=2; + + int stride=12*howmany_wospin; + int dist=1; + int howmany=12*howmany_wospin; + fftw_plan plan; + + + Dim1[0]=dim0; + Dim1[1]=dim1; + + + if(fftw_flags==-1){fftw_flags=FFTW_ESTIMATE;} + if(forward){ + plan=fftw_plan_many_dft(rank, Dim1, howmany, (fftw_complex*)spinor_in, NULL, stride, dist, + (fftw_complex*)spinor_out,NULL,stride,dist, + FFTW_FORWARD,fftw_flags); + } else { + plan=fftw_plan_many_dft(rank, Dim1, howmany, (fftw_complex*)spinor_in, NULL, stride, dist, + (fftw_complex*)spinor_out,NULL,stride,dist, + FFTW_BACKWARD,fftw_flags); + } +/* if(plan!=NULL) cerr << " [OK]"<< endl; */ +/* else cerr << " [FAIL]"<< endl; */ +/* cerr.flush(); */ + + return plan; + +} +#endif + +void spinor_fft_transpose_xp_t(spinor *fieldout,spinor* fieldin,int dim0,int dim1,int forward,double mulp){ + int LXYZ=LX*LY*LZ; + int xyz,tp,xp,t; + spinor *spin1,*spin2; + if(forward == TRUE){ + 
for(tp=0;tps0,mulp,(spin2+xyz)->s0); + _vector_mul((spin2+xyz)->s1,mulp,(spin2+xyz)->s1); + _vector_mul((spin2+xyz)->s2,mulp,(spin2+xyz)->s2); + _vector_mul((spin2+xyz)->s3,mulp,(spin2+xyz)->s3); + } + /* optionally multiply with mulp */ + } + } + } + } else { + for(tp=0;tps0,mulp,(spin2+xyz)->s0); + _vector_mul((spin2+xyz)->s1,mulp,(spin2+xyz)->s1); + _vector_mul((spin2+xyz)->s2,mulp,(spin2+xyz)->s2); + _vector_mul((spin2+xyz)->s3,mulp,(spin2+xyz)->s3); + } + /* optionally multiply with mulp */ + } + } + } + } +} + + +#ifdef MPI +void check_mpi_comm_membership(MPI_Comm commself,MPI_Comm commcheck,const char *name_a,const char *name_b,FILE *logFile){ /* writes to logFile how commself (labelled name_a) relates to commcheck (labelled name_b), using the MPI_Comm_compare result */ + int result; + fprintf(logFile,"checking %s against %s : \n" , name_a,name_b); + MPI_Comm_compare(commself,commcheck,&result); /* compare the communicator passed in; MPI_COMM_SELF was hard-coded here and commself was never read */ + switch(result){ + case MPI_CONGRUENT: fprintf(logFile,"CONGRUENT\n"); break; + case MPI_IDENT: fprintf(logFile,"IDENTICAL\n"); break; + case MPI_SIMILAR: fprintf(logFile,"SIMILAR\n"); break; + case MPI_UNEQUAL: fprintf(logFile,"UNEQUAL\n"); break; + default : fprintf(logFile,"unknown relation ??\n");break; + } +} +#endif + +void spinor_fft_print_reduct_dims(int *remaining_dims,FILE *logFile){ + int i; + + fprintf(logFile,"Reducing spinor_field to dims : "); + + for(i=0;i<4;i++){ + if( remaining_dims[i]==TRUE){ + switch(i){ + case 0: fprintf(logFile," T"); break; + case 1: fprintf(logFile," X"); break; + case 2: fprintf(logFile," Y"); break; + case 3: fprintf(logFile," Z"); break; + default: fprintf(logFile," sorry we are in QCD, unknown dimension -> extra dimensions ??"); break; + } + } + } + + fprintf(logFile,"\n"); + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/spinor_fft.h b/qcd/part_cpu/applications/QCD/src/kernel_D/spinor_fft.h new file mode 100644 index 0000000000000000000000000000000000000000..635935a4aa95897c7d390f605069242f64cfef87 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/spinor_fft.h @@ -0,0 +1,36 @@ + 
+/*********************************************************************** + * + * Copyright (C) 2010 Andreas Nube + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ + +#ifndef _SPINOR_FFT_H +#define _SPINOR_FFT_H + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +void spinor_fft_reduce_2d(spinor *localSpinorField,int *collectionRank,spinor ***field_collection,spinor **membuff); +void spinor_fft_redist_2d(spinor *localSpinorField,int collectionRank,spinor **field_collection,spinor *membuff); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sse.h b/qcd/part_cpu/applications/QCD/src/kernel_D/sse.h new file mode 100644 index 0000000000000000000000000000000000000000..f80fd2028fc9db3590be5b2fbedfa51c851e62a1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sse.h @@ -0,0 +1,1529 @@ +/*********************************************************************** + * Copyright (C) 2001 Martin Luescher + * 2002 Martin Hasenbusch + * 2002, 2003, 2004 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ +#ifndef _SSE_H +#define _SSE_H + +#if (defined SSE || defined SSE2 || defined SSE3) + + +/******************************************************************************* +* +* File sse.h +* +* Macros for Dirac spinors, SU(3) vectors and SU(3) matrices using +* inline assembly SSE and SSE2 instructions +* +* Needs gcc version 2.95.2 or later, and binutils snapshot 010122 or later +* if the SSE2 instructions are used +* +* Version: 1.1 +* Author: Martin Luescher +* Date: 17.03.2001 +* +* a few extensions by M. Hasenbusch, all extensions are marked as such +* +* SSE3 Versions and Opteron versions added by C. 
Urbach +* +*******************************************************************************/ + +typedef struct +{ + int c0,c1,c2,c3; +} sse_int __attribute__ ((aligned (16))); + +typedef struct +{ + float c0,c1,c2,c3; +} sse_float __attribute__ ((aligned (16))); + +typedef struct +{ + double c0,c1; +} sse_double ALIGN; + + +/******************************************************************************* +* +* Cache manipulation macros +* +*******************************************************************************/ + +#if ((defined P4)) + +#define _prefetch_spinor(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1" \ + : \ + : \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x7f)))), \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x7f))+128))) + +#define _prefetch_nta_spinor(addr) \ +__asm__ __volatile__ ("prefetchnta %0 \n\t" \ + "prefetchnta %1" \ + : \ + : \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x7f)))), \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x7f))+128))) + +#define _prefetch_halfspinor(addr) \ +__asm__ __volatile__ ("prefetcht0 %0" \ + : \ + : \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x7f))) )) + +#define _prefetch_nta_halfspinor(addr) \ +__asm__ __volatile__ ("prefetchnta %0" \ + : \ + : \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x7f))) )) + +#define _prefetch_su3(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1" \ + : \ + : \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x7f)))), \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x7f))+128))) + +#define _prefetch_mom(addr) \ +__asm__ __volatile__ ("prefetchnta %0" \ + : \ + : \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x7f))))) + +#elif defined OPTERON + +#define _prefetch_spinor(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2" \ + : \ + : \ + "m" (*(((char*)(addr)))), \ + "m" (*(((char*)(addr))+64)), \ + "m" (*(((char*)(addr))+128))) + + +#define _prefetch_nta_spinor(addr) 
\ +__asm__ __volatile__ ("prefetchnta %0 \n\t" \ + "prefetchnta %1 \n\t" \ + "prefetchnta %2" \ + : \ + : \ + "m" (*(((char*)(addr)))), \ + "m" (*(((char*)(addr))+64)), \ + "m" (*(((char*)(addr))+128))) + +#define _prefetch_halfspinor(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1" \ + : \ + : \ + "m" (*(((char*)(addr)))), \ + "m" (*(((char*)(addr))+64))) + + +#define _prefetch_nta_halfspinor(addr) \ +__asm__ __volatile__ ("prefetchnta %0 \n\t" \ + "prefetchnta %1" \ + : \ + : \ + "m" (*(((char*)(addr)))), \ + "m" (*(((char*)(addr))+64))) + +#define _prefetch_su3(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2" \ + : \ + : \ + "m" (*(((char*)(addr)))), \ + "m" (*(((char*)(addr))+64)), \ + "m" (*(((char*)(addr))+128))) + +#define _prefetch_mom(addr) \ +__asm__ __volatile__ ("prefetcht0 %0" \ + : \ + : \ + "m" (*(((char*)((addr)))))) + + + +#else + +#define _prefetch_spinor(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2 \n\t" \ + "prefetcht0 %3 \n\t" \ + "prefetcht0 %4 \n\t" \ + "prefetcht0 %5" \ + : \ + : \ + "m" (*(((char*)(addr)))), \ + "m" (*(((char*)(addr))+32)), \ + "m" (*(((char*)(addr))+64)), \ + "m" (*(((char*)(addr))+96)), \ + "m" (*(((char*)(addr))+128)), \ + "m" (*(((char*)(addr))+160))) + +#define _prefetch_nta_spinor(addr) \ +__asm__ __volatile__ ("prefetchnta %0 \n\t" \ + "prefetchnta %1 \n\t" \ + "prefetchnta %2 \n\t" \ + "prefetchnta %3 \n\t" \ + "prefetchnta %4 \n\t" \ + "prefetchnta %5" \ + : \ + : \ + "m" (*(((char*)(addr)))), \ + "m" (*(((char*)(addr))+32)), \ + "m" (*(((char*)(addr))+64)), \ + "m" (*(((char*)(addr))+96)), \ + "m" (*(((char*)(addr))+128)), \ + "m" (*(((char*)(addr))+160))) + +#define _prefetch_halfspinor(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2" \ + : \ + : \ + "m" (*(((char*)(addr)))), \ + "m" (*(((char*)(addr))+32)), \ + "m" (*(((char*)(addr))+64))) + 
+#define _prefetch_nta_halfspinor(addr) \ +__asm__ __volatile__ ("prefetchnta %0 \n\t" \ + "prefetchnta %1 \n\t" \ + "prefetchnta %2" \ + : \ + : \ + "m" (*(((char*)(addr)))), \ + "m" (*(((char*)(addr))+32)), \ + "m" (*(((char*)(addr))+64))) + +#define _prefetch_su3(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2 \n\t" \ + "prefetcht0 %3 \n\t" \ + "prefetcht0 %4" \ + : \ + : \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x1f)))), \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x1f))+32)), \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x1f))+64)), \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x1f))+96)), \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x1f))+128))) + +#define _prefetch_mom(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1" \ + : \ + : \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x1f)))), \ + "m" (*(((char*)(((unsigned long int)(addr))&~0x1f))+32))) + +#endif + +#if ((defined SSE2)||(defined SSE3)) + +static sse_int _sse_sgn __attribute__ ((unused)) ={0x0,0x80000000,0x0,0x0}; +/* _sse_sgn2 by Martin Hasenbusch */ +static sse_int _sse_sgn2 __attribute__ ((unused)) ={0x0,0x0,0x0,0x80000000}; + + +/******************************************************************************* +* +* Macros for su3 vectors used in D_psi version 2.0 +* +* Most of these macros operate on su3 vectors that are stored +* in xmm0,xmm1,xmm2 or xmm3,xmm4,xmm5. 
For example, +* +* xmm0 -> s.c1.re,s.c1.im +* xmm1 -> s.c2.re,s.c2.im +* xmm2 -> s.c3.re,s.c3.im +* +* where s is of type su3_vector +* +*******************************************************************************/ + +/* +* Loads an su3 vector s to xmm0,xmm1,xmm2 +*/ + +#define _sse_load(s) \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((s).c0), \ + "m" ((s).c1), \ + "m" ((s).c2)) + +/* +* Loads an su3 vector s to xmm3,xmm4,xmm5 +*/ + +#define _sse_load_up(s) \ +__asm__ __volatile__ ("movapd %0, %%xmm3 \n\t" \ + "movapd %1, %%xmm4 \n\t" \ + "movapd %2, %%xmm5" \ + : \ + : \ + "m" ((s).c0), \ + "m" ((s).c1), \ + "m" ((s).c2)) + +/* +* Stores xmm0,xmm1,xmm2 to the components r.c1,r.c2,r.c3 of an su3 vector +*/ + +#define _sse_store(r) \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2" \ + : \ + "=m" ((r).c0), \ + "=m" ((r).c1), \ + "=m" ((r).c2)) + +/* +* Stores xmm0,xmm1,xmm2 to the components r.c1,r.c2,r.c3 of an su3 vector +*/ + +#define _sse_store_nt(r) \ +__asm__ __volatile__ ("movntpd %%xmm0, %0 \n\t" \ + "movntpd %%xmm1, %1 \n\t" \ + "movntpd %%xmm2, %2" \ + : \ + "=m" ((r).c0), \ + "=m" ((r).c1), \ + "=m" ((r).c2)) + + +/* +* Stores xmm3,xmm4,xmm5 to the components r.c1,r.c2,r.c3 of an su3 vector +*/ + +#define _sse_store_up(r) \ +__asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" \ + "movapd %%xmm4, %1 \n\t" \ + "movapd %%xmm5, %2" \ + : \ + "=m" ((r).c0), \ + "=m" ((r).c1), \ + "=m" ((r).c2)) + +/* + * Stores xmm3,xmm4,xmm5 to the components r.c1,r.c2,r.c3 of an su3 vector + * directly to memory + */ + +#define _sse_store_nt_up(r) \ +__asm__ __volatile__ ("movntpd %%xmm3, %0 \n\t" \ + "movntpd %%xmm4, %1 \n\t" \ + "movntpd %%xmm5, %2" \ + : \ + "=m" ((r).c0), \ + "=m" ((r).c1), \ + "=m" ((r).c2)) + +/* +* Multiplies xmm0,xmm1,xmm2 with a constant sse_double c +*/ + +#define _sse_vector_mul(c) \ +__asm__ __volatile__ ("mulpd %0, 
%%xmm0 \n\t" \ + "mulpd %0, %%xmm1 \n\t" \ + "mulpd %0, %%xmm2" \ + : \ + : \ + "m" (c)) + +/* +* Multiplies xmm3,xmm4,xmm5 with an imaginary number i*sse_double +*/ + +#define _sse_vector_imag_mul(c) \ +__asm__ __volatile__ ("shufpd $0x1, %%xmm3, %%xmm3 \n\t" \ + "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \ + "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \ + "xorpd %0, %%xmm3 \n\t" \ + "xorpd %0, %%xmm4 \n\t" \ + "xorpd %0, %%xmm5 \n\t" \ + "mulpd %1, %%xmm3 \n\t" \ + "mulpd %1, %%xmm4 \n\t" \ + "mulpd %1, %%xmm5" \ + : \ + : \ + "m" (_sse_sgn),\ + "m" (c)) + + +/* +* Adds xmm3,xmm4,xmm5 to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_add() \ +__asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" \ + "addpd %%xmm4, %%xmm1 \n\t" \ + "addpd %%xmm5, %%xmm2" \ + : \ + :) + + +/* +* Subtracts xmm3,xmm4,xmm5 from xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_sub() \ +__asm__ __volatile__ ("subpd %%xmm3, %%xmm0 \n\t" \ + "subpd %%xmm4, %%xmm1 \n\t" \ + "subpd %%xmm5, %%xmm2" \ + : \ + :) + +/* +* Subtracts xmm0,xmm1,xmm2 from xmm3,xmm4,xmm5 +*/ + +#define _sse_vector_sub_up() \ +__asm__ __volatile__ ("subpd %%xmm0, %%xmm3 \n\t" \ + "subpd %%xmm1, %%xmm4 \n\t" \ + "subpd %%xmm2, %%xmm5" \ + : \ + :) + +/* +* Multiplies xmm3,xmm4,xmm5 with i +*/ + +#define _sse_vector_i_mul() \ +__asm__ __volatile__ ("shufpd $0x1, %%xmm3, %%xmm3 \n\t" \ + "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \ + "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \ + "xorpd %0, %%xmm3 \n\t" \ + "xorpd %0, %%xmm4 \n\t" \ + "xorpd %0, %%xmm5" \ + : \ + : \ + "m" (_sse_sgn)) + +#ifndef SSE3 + +/* + * C.Urbach + * Multiplies xmm3,xmm4,xmm5 with the complex number stored + * in xmm6 and xmm7 + */ + +#define _sse_vector_cmplx_mul_two() \ +__asm__ __volatile__ ("movapd %%xmm3, %%xmm0 \n\t" \ + "movapd %%xmm4, %%xmm1 \n\t" \ + "movapd %%xmm5, %%xmm2 \n\t" \ + "mulpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm6, %%xmm4 \n\t" \ + "mulpd %%xmm6, %%xmm5 \n\t" \ + "mulpd %%xmm7, %%xmm0 \n\t" \ + "mulpd %%xmm7, %%xmm1 \n\t" \ + "mulpd %%xmm7, %%xmm2 \n\t" \ + "shufpd $0x1, 
%%xmm0, %%xmm0 \n\t" \ + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \ + "xorpd %0, %%xmm0 \n\t" \ + "xorpd %0, %%xmm1 \n\t" \ + "xorpd %0, %%xmm2 \n\t" \ + "addpd %%xmm0, %%xmm3 \n\t" \ + "addpd %%xmm1, %%xmm4 \n\t" \ + "addpd %%xmm2, %%xmm5" \ + : \ + : \ + "m" (_sse_sgn)) ; + + + +/* +* M. Hasenbusch, Fri Nov 9 13:33:22 MET 2001 +* Multiplies xmm3,xmm4,xmm5 with the complex number c +*/ +#define _sse_vector_cmplx_mul(c) \ +__asm__ __volatile__ ("movsd %0, %%xmm6 \n\t" \ + "movsd %1, %%xmm7 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "movapd %%xmm3, %%xmm0 \n\t" \ + "movapd %%xmm4, %%xmm1 \n\t" \ + "movapd %%xmm5, %%xmm2 \n\t" \ + "mulpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm6, %%xmm4 \n\t" \ + "mulpd %%xmm6, %%xmm5 \n\t" \ + "mulpd %%xmm7, %%xmm0 \n\t" \ + "mulpd %%xmm7, %%xmm1 \n\t" \ + "mulpd %%xmm7, %%xmm2 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \ + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \ + "xorpd %2, %%xmm0 \n\t" \ + "xorpd %2, %%xmm1 \n\t" \ + "xorpd %2, %%xmm2 \n\t" \ + "addpd %%xmm0, %%xmm3 \n\t" \ + "addpd %%xmm1, %%xmm4 \n\t" \ + "addpd %%xmm2, %%xmm5" \ + : \ + : \ + "m" (creal(c)), \ + "m" (cimag(c)), \ + "m" (_sse_sgn)) ; + + +/* +* M. 
Hasenbusch, Fri Nov 9 13:33:22 MET 2001 +* Multiplies xmm3,xmm4,xmm5 with the conjugate of the complex number c +*/ +#define _sse_vector_cmplxcg_mul(c) \ +__asm__ __volatile__ ("movsd %0, %%xmm6 \n\t" \ + "movsd %1, %%xmm7 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "movapd %%xmm3, %%xmm0 \n\t" \ + "movapd %%xmm4, %%xmm1 \n\t" \ + "movapd %%xmm5, %%xmm2 \n\t" \ + "mulpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm6, %%xmm4 \n\t" \ + "mulpd %%xmm6, %%xmm5 \n\t" \ + "mulpd %%xmm7, %%xmm0 \n\t" \ + "mulpd %%xmm7, %%xmm1 \n\t" \ + "mulpd %%xmm7, %%xmm2 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \ + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \ + "xorpd %2, %%xmm0 \n\t" \ + "xorpd %2, %%xmm1 \n\t" \ + "xorpd %2, %%xmm2 \n\t" \ + "subpd %%xmm0, %%xmm3 \n\t" \ + "subpd %%xmm1, %%xmm4 \n\t" \ + "subpd %%xmm2, %%xmm5" \ + : \ + : \ + "m" (creal(c)), \ + "m" (cimag(c)), \ + "m" (_sse_sgn)) ; + + + +/* +* Multiplies an su3 vector s with an su3 matrix u, assuming s is +* stored in xmm0,xmm1,xmm2 +* +* On output the result is in xmm3,xmm4,xmm5 and the registers +* xmm0,xmm1,xmm2 are changed +*/ + +#if defined OPTERON + +#define _sse_su3_multiply(u) \ +__asm__ __volatile__ ("movsd %0, %%xmm3 \n\t" \ + "movsd %1, %%xmm6 \n\t" \ + "movsd %2, %%xmm4 \n\t" \ + "movsd %3, %%xmm7 \n\t" \ + "movsd %4, %%xmm5 \n\t" \ + "unpcklpd %%xmm3, %%xmm3 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm4, %%xmm4 \n\t" \ + "mulpd %%xmm0, %%xmm3 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "unpcklpd %%xmm5, %%xmm5 \n\t" \ + "mulpd %%xmm0, %%xmm4 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "mulpd %%xmm0, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "movsd %5, %%xmm8 \n\t" \ + "movsd %6, %%xmm9 \n\t" \ + "movsd %7, %%xmm10 \n\t" \ + "movsd %8, %%xmm11 \n\t" \ + "unpcklpd %%xmm8, %%xmm8 \n\t" \ + "unpcklpd %%xmm9, %%xmm9 \n\t" \ + "unpcklpd %%xmm10, %%xmm10 \n\t" \ + 
"unpcklpd %%xmm11, %%xmm11 \n\t" \ + "mulpd %%xmm1, %%xmm8 \n\t" \ + "mulpd %%xmm2, %%xmm9 \n\t" \ + "mulpd %%xmm1, %%xmm10 \n\t" \ + "mulpd %%xmm2, %%xmm11 \n\t" \ + "addpd %%xmm8, %%xmm5 \n\t" \ + "addpd %%xmm9, %%xmm3 \n\t" \ + "addpd %%xmm10, %%xmm4 \n\t" \ + "addpd %%xmm11, %%xmm5" \ + : \ + : \ + "m" (creal((u).c00)), \ + "m" (creal((u).c01)), \ + "m" (creal((u).c10)), \ + "m" (creal((u).c12)), \ + "m" (creal((u).c20)), \ + "m" (creal((u).c21)), \ + "m" (creal((u).c02)), \ + "m" (creal((u).c11)), \ + "m" (creal((u).c22))); \ +__asm__ __volatile__ ("movsd %0, %%xmm6 \n\t" \ + "movsd %1, %%xmm7 \n\t" \ + "movsd %2, %%xmm8 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \ + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "unpcklpd %%xmm8, %%xmm8 \n\t" \ + "xorpd %9, %%xmm0 \n\t" \ + "xorpd %9, %%xmm1 \n\t" \ + "xorpd %9, %%xmm2 \n\t" \ + "mulpd %%xmm0, %%xmm6 \n\t" \ + "mulpd %%xmm1, %%xmm7 \n\t" \ + "mulpd %%xmm2, %%xmm8 \n\t" \ + "movsd %3, %%xmm9 \n\t" \ + "movsd %4, %%xmm10 \n\t" \ + "movsd %5, %%xmm11 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "addpd %%xmm8, %%xmm5 \n\t" \ + "unpcklpd %%xmm9, %%xmm9 \n\t" \ + "unpcklpd %%xmm10, %%xmm10 \n\t" \ + "unpcklpd %%xmm11, %%xmm11 \n\t" \ + "mulpd %%xmm0, %%xmm9 \n\t" \ + "mulpd %%xmm1, %%xmm10 \n\t" \ + "mulpd %%xmm0, %%xmm11 \n\t" \ + "movsd %6, %%xmm12 \n\t" \ + "movsd %7, %%xmm13 \n\t" \ + "movsd %8, %%xmm14 \n\t" \ + "unpcklpd %%xmm12, %%xmm12 \n\t" \ + "unpcklpd %%xmm13, %%xmm13 \n\t" \ + "unpcklpd %%xmm14, %%xmm14 \n\t" \ + "addpd %%xmm9, %%xmm4 \n\t" \ + "addpd %%xmm10, %%xmm3 \n\t" \ + "addpd %%xmm11, %%xmm5 \n\t" \ + "mulpd %%xmm2, %%xmm12 \n\t" \ + "mulpd %%xmm1, %%xmm13 \n\t" \ + "mulpd %%xmm2, %%xmm14 \n\t" \ + "addpd %%xmm12, %%xmm3 \n\t" \ + "addpd %%xmm13, %%xmm5 \n\t" \ + "addpd %%xmm14, %%xmm4" \ + : \ + : \ + "m" (cimag((u).c00)), \ + "m" (cimag((u).c11)), \ + "m" 
(cimag((u).c22)), \ + "m" (cimag((u).c10)), \ + "m" (cimag((u).c01)), \ + "m" (cimag((u).c20)), \ + "m" (cimag((u).c02)), \ + "m" (cimag((u).c21)), \ + "m" (cimag((u).c12)), \ + "m" (_sse_sgn)) + +// else for ifdef OPTERON +#else + +#define _sse_su3_multiply(u) \ +__asm__ __volatile__ ("movsd %0, %%xmm3 \n\t" \ + "movsd %1, %%xmm6 \n\t" \ + "movsd %2, %%xmm4 \n\t" \ + "movsd %3, %%xmm7 \n\t" \ + "movsd %4, %%xmm5 \n\t" \ + "unpcklpd %%xmm3, %%xmm3 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm4, %%xmm4 \n\t" \ + "mulpd %%xmm0, %%xmm3 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "unpcklpd %%xmm5, %%xmm5 \n\t" \ + "mulpd %%xmm0, %%xmm4 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "mulpd %%xmm0, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "movsd %5, %%xmm6 \n\t" \ + "movsd %6, %%xmm7 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm3 \n\t" \ + "movsd %7, %%xmm6 \n\t" \ + "movsd %8, %%xmm7 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm4 \n\t" \ + "addpd %%xmm7, %%xmm5" \ + : \ + : \ + "m" (creal((u).c00)), \ + "m" (creal((u).c01)), \ + "m" (creal((u).c10)), \ + "m" (creal((u).c12)), \ + "m" (creal((u).c20)), \ + "m" (creal((u).c21)), \ + "m" (creal((u).c02)), \ + "m" (creal((u).c11)), \ + "m" (creal((u).c22))); \ +__asm__ __volatile__ ("movsd %0, %%xmm6 \n\t" \ + "movsd %1, %%xmm7 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \ + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "xorpd %9, %%xmm0 \n\t" \ + "xorpd %9, %%xmm1 \n\t" \ + "xorpd %9, %%xmm2 \n\t" \ + "mulpd %%xmm0, %%xmm6 \n\t" \ + "mulpd %%xmm1, %%xmm7 \n\t" \ + "addpd 
%%xmm6, %%xmm3 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "movsd %2, %%xmm6 \n\t" \ + "movsd %3, %%xmm7 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "mulpd %%xmm2, %%xmm6 \n\t" \ + "mulpd %%xmm0, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "movsd %4, %%xmm6 \n\t" \ + "movsd %5, %%xmm7 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm0, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "addpd %%xmm7, %%xmm5 \n\t" \ + "movsd %6, %%xmm0 \n\t" \ + "movsd %7, %%xmm6 \n\t" \ + "movsd %8, %%xmm7 \n\t" \ + "unpcklpd %%xmm0, %%xmm0 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "mulpd %%xmm2, %%xmm0 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "addpd %%xmm0, %%xmm3 \n\t" \ + "addpd %%xmm6, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm4" \ + : \ + : \ + "m" (cimag((u).c00)), \ + "m" (cimag((u).c11)), \ + "m" (cimag((u).c22)), \ + "m" (cimag((u).c10)), \ + "m" (cimag((u).c01)), \ + "m" (cimag((u).c20)), \ + "m" (cimag((u).c02)), \ + "m" (cimag((u).c21)), \ + "m" (cimag((u).c12)), \ + "m" (_sse_sgn)) + +// endif for OPTERON +#endif + +/* + * Multiplies an su3 vector s with an su3 matrix u^dagger, assuming s is + * stored in xmm0,xmm1,xmm2 + * + * On output the result is in xmm3,xmm4,xmm5 and the registers + * xmm0,xmm1,xmm2 are changed + */ + +#if defined OPTERON + +#define _sse_su3_inverse_multiply(u) \ +__asm__ __volatile__ ("movsd %0, %%xmm3 \n\t" \ + "movsd %1, %%xmm6 \n\t" \ + "movsd %2, %%xmm4 \n\t" \ + "unpcklpd %%xmm3, %%xmm3 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm4, %%xmm4 \n\t" \ + "mulpd %%xmm0, %%xmm3 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm0, %%xmm4 \n\t" \ + "movsd %3, %%xmm7 \n\t" \ + "movsd %4, %%xmm5 \n\t" \ + "movsd %5, %%xmm8 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "unpcklpd %%xmm5, %%xmm5 \n\t" \ + "unpcklpd %%xmm8, %%xmm8 
\n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "mulpd %%xmm0, %%xmm5 \n\t" \ + "mulpd %%xmm1, %%xmm8 \n\t" \ + "movsd %6, %%xmm9 \n\t" \ + "movsd %7, %%xmm10 \n\t" \ + "movsd %8, %%xmm11 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "addpd %%xmm8, %%xmm5 \n\t" \ + "unpcklpd %%xmm9, %%xmm9 \n\t" \ + "unpcklpd %%xmm10, %%xmm10 \n\t" \ + "unpcklpd %%xmm11, %%xmm11 \n\t" \ + "mulpd %%xmm2, %%xmm9 \n\t" \ + "mulpd %%xmm1, %%xmm10 \n\t" \ + "mulpd %%xmm2, %%xmm11 \n\t" \ + "addpd %%xmm9, %%xmm3 \n\t" \ + "addpd %%xmm10, %%xmm4 \n\t" \ + "addpd %%xmm11, %%xmm5" \ + : \ + : \ + "m" (creal((u).c00)), \ + "m" (creal((u).c10)), \ + "m" (creal((u).c01)), \ + "m" (creal((u).c21)), \ + "m" (creal((u).c02)), \ + "m" (creal((u).c12)), \ + "m" (creal((u).c20)), \ + "m" (creal((u).c11)), \ + "m" (creal((u).c22))); \ +__asm__ __volatile__ ("movsd %0, %%xmm6 \n\t" \ + "movsd %1, %%xmm7 \n\t" \ + "movsd %2, %%xmm8 \n\t" \ + "xorpd %9, %%xmm0 \n\t" \ + "xorpd %9, %%xmm1 \n\t" \ + "xorpd %9, %%xmm2 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "unpcklpd %%xmm8, %%xmm8 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \ + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \ + "mulpd %%xmm0, %%xmm6 \n\t" \ + "mulpd %%xmm1, %%xmm7 \n\t" \ + "mulpd %%xmm2, %%xmm8 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "addpd %%xmm8, %%xmm5 \n\t" \ + "movsd %3, %%xmm9 \n\t" \ + "movsd %4, %%xmm10 \n\t" \ + "movsd %5, %%xmm11 \n\t" \ + "unpcklpd %%xmm9, %%xmm9 \n\t" \ + "unpcklpd %%xmm10, %%xmm10 \n\t" \ + "unpcklpd %%xmm11, %%xmm11 \n\t" \ + "mulpd %%xmm0, %%xmm9 \n\t" \ + "mulpd %%xmm1, %%xmm10 \n\t" \ + "mulpd %%xmm0, %%xmm11 \n\t" \ + "addpd %%xmm9, %%xmm4 \n\t" \ + "addpd %%xmm10, %%xmm3 \n\t" \ + "addpd %%xmm11, %%xmm5 \n\t" \ + "movsd %6, %%xmm12 \n\t" \ + "movsd %7, %%xmm13 \n\t" \ + "movsd %8, %%xmm14 \n\t" \ + "unpcklpd %%xmm12, %%xmm12 \n\t" \ + "unpcklpd %%xmm13, %%xmm13 \n\t" \ + "unpcklpd 
%%xmm14, %%xmm14 \n\t" \ + "mulpd %%xmm2, %%xmm12 \n\t" \ + "mulpd %%xmm1, %%xmm13 \n\t" \ + "mulpd %%xmm2, %%xmm14 \n\t" \ + "addpd %%xmm12, %%xmm3 \n\t" \ + "addpd %%xmm13, %%xmm5 \n\t" \ + "addpd %%xmm14, %%xmm4" \ + : \ + : \ + "m" (cimag((u).c00)), \ + "m" (cimag((u).c11)), \ + "m" (cimag((u).c22)), \ + "m" (cimag((u).c01)), \ + "m" (cimag((u).c10)), \ + "m" (cimag((u).c02)), \ + "m" (cimag((u).c20)), \ + "m" (cimag((u).c12)), \ + "m" (cimag((u).c21)), \ + "m" (_sse_sgn)); +// else for ifdef OPTERON +#else + +#define _sse_su3_inverse_multiply(u) \ +__asm__ __volatile__ ("movsd %0, %%xmm3 \n\t" \ + "movsd %1, %%xmm6 \n\t" \ + "movsd %2, %%xmm4 \n\t" \ + "movsd %3, %%xmm7 \n\t" \ + "movsd %4, %%xmm5 \n\t" \ + "unpcklpd %%xmm3, %%xmm3 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm4, %%xmm4 \n\t" \ + "mulpd %%xmm0, %%xmm3 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "unpcklpd %%xmm5, %%xmm5 \n\t" \ + "mulpd %%xmm0, %%xmm4 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "mulpd %%xmm0, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "movsd %5, %%xmm6 \n\t" \ + "movsd %6, %%xmm7 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm3 \n\t" \ + "movsd %7, %%xmm6 \n\t" \ + "movsd %8, %%xmm7 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm4 \n\t" \ + "addpd %%xmm7, %%xmm5" \ + : \ + : \ + "m" (creal((u).c00)), \ + "m" (creal((u).c10)), \ + "m" (creal((u).c01)), \ + "m" (creal((u).c21)), \ + "m" (creal((u).c02)), \ + "m" (creal((u).c12)), \ + "m" (creal((u).c20)), \ + "m" (creal((u).c11)), \ + "m" (creal((u).c22))); \ +__asm__ __volatile__ ("movsd %0, %%xmm6 \n\t" \ + "movsd %1, %%xmm7 \n\t" \ + "xorpd %9, %%xmm0 \n\t" \ + "xorpd %9, %%xmm1 \n\t" 
\ + "xorpd %9, %%xmm2 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \ + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \ + "mulpd %%xmm0, %%xmm6 \n\t" \ + "mulpd %%xmm1, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "movsd %2, %%xmm6 \n\t" \ + "movsd %3, %%xmm7 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "mulpd %%xmm2, %%xmm6 \n\t" \ + "mulpd %%xmm0, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "movsd %4, %%xmm6 \n\t" \ + "movsd %5, %%xmm7 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm0, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "addpd %%xmm7, %%xmm5 \n\t" \ + "movsd %6, %%xmm0 \n\t" \ + "movsd %7, %%xmm6 \n\t" \ + "movsd %8, %%xmm7 \n\t" \ + "unpcklpd %%xmm0, %%xmm0 \n\t" \ + "unpcklpd %%xmm6, %%xmm6 \n\t" \ + "unpcklpd %%xmm7, %%xmm7 \n\t" \ + "mulpd %%xmm2, %%xmm0 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "addpd %%xmm0, %%xmm3 \n\t" \ + "addpd %%xmm6, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm4" \ + : \ + : \ + "m" (cimag((u).c00)), \ + "m" (cimag((u).c11)), \ + "m" (cimag((u).c22)), \ + "m" (cimag((u).c01)), \ + "m" (cimag((u).c10)), \ + "m" (cimag((u).c02)), \ + "m" (cimag((u).c20)), \ + "m" (cimag((u).c12)), \ + "m" (cimag((u).c21)), \ + "m" (_sse_sgn)); +/* OPTERON */ +#endif + +// else for ifndef SSE3 +#else + +#include "sse3.h" + +#endif + +/* _sse_su3_times_su3 by martin hasenbusch */ +#define _sse_su3_times_su3(u3,u1,u2) \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c00), \ + "m" ((u2).c10), \ + "m" ((u2).c20)); \ +_sse_su3_multiply(u1); \ +__asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" \ + "movapd %%xmm4, %1 \n\t" \ + "movapd %%xmm5, %2" \ + : \ + "=m" 
((u3).c00), \ + "=m" ((u3).c10), \ + "=m" ((u3).c20)) ; \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c01), \ + "m" ((u2).c11), \ + "m" ((u2).c21)); \ +_sse_su3_multiply(u1); \ +__asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" \ + "movapd %%xmm4, %1 \n\t" \ + "movapd %%xmm5, %2" \ + : \ + "=m" ((u3).c01), \ + "=m" ((u3).c11), \ + "=m" ((u3).c21)) ; \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c02), \ + "m" ((u2).c12), \ + "m" ((u2).c22)); \ +_sse_su3_multiply(u1); \ +__asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" \ + "movapd %%xmm4, %1 \n\t" \ + "movapd %%xmm5, %2" \ + : \ + "=m" ((u3).c02), \ + "=m" ((u3).c12), \ + "=m" ((u3).c22)) ; + +/* _sse_su3_times_su3_acc by martin hasenbusch */ +#define _sse_su3_times_su3_acc(u3,u1,u2) \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c00), \ + "m" ((u2).c10), \ + "m" ((u2).c20)); \ +_sse_su3_multiply(u1); \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u3).c00), \ + "m" ((u3).c10), \ + "m" ((u3).c20)); \ +__asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" \ + "addpd %%xmm4, %%xmm1 \n\t" \ + "addpd %%xmm5, %%xmm2" \ + : \ + :) ; \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2" \ + : \ + "=m" ((u3).c00), \ + "=m" ((u3).c10), \ + "=m" ((u3).c20)) ; \ + \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c01), \ + "m" ((u2).c11), \ + "m" ((u2).c21)); \ +_sse_su3_multiply(u1); \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u3).c01), \ + "m" ((u3).c11), \ + "m" ((u3).c21)); \ +__asm__ __volatile__ ("addpd 
%%xmm3, %%xmm0 \n\t" \ + "addpd %%xmm4, %%xmm1 \n\t" \ + "addpd %%xmm5, %%xmm2" \ + : \ + :) ; \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2" \ + : \ + "=m" ((u3).c01), \ + "=m" ((u3).c11), \ + "=m" ((u3).c21)) ; \ + \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c02), \ + "m" ((u2).c12), \ + "m" ((u2).c22)); \ +_sse_su3_multiply(u1); \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u3).c02), \ + "m" ((u3).c12), \ + "m" ((u3).c22)); \ +__asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" \ + "addpd %%xmm4, %%xmm1 \n\t" \ + "addpd %%xmm5, %%xmm2" \ + : \ + :) ; \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2" \ + : \ + "=m" ((u3).c02), \ + "=m" ((u3).c12), \ + "=m" ((u3).c22)) ; + +/* _sse_su3_times_su3d_acc by Carsten Urbach */ +/* NOT TESTED YET */ +#define _sse_su3_times_su3d_acc(u3,u1,u2) \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c00), \ + "m" ((u2).c01), \ + "m" ((u2).c02)); \ +__asm__ __volatile__ ("xorpd %0, %%xmm0 \n\t" \ + "xorpd %0, %%xmm1 \n\t" \ + "xorpd %0, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn2)); \ +_sse_su3_multiply(u1); \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u3).c00), \ + "m" ((u3).c10), \ + "m" ((u3).c20)); \ +__asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" \ + "addpd %%xmm4, %%xmm1 \n\t" \ + "addpd %%xmm5, %%xmm2" \ + : \ + :) ; \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2" \ + : \ + "=m" ((u3).c00), \ + "=m" ((u3).c10), \ + "=m" ((u3).c20)) ; \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + 
"m" ((u2).c10), \ + "m" ((u2).c11), \ + "m" ((u2).c12)); \ +__asm__ __volatile__ ("xorpd %0, %%xmm0 \n\t" \ + "xorpd %0, %%xmm1 \n\t" \ + "xorpd %0, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn2)); \ +_sse_su3_multiply(u1); \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u3).c01), \ + "m" ((u3).c11), \ + "m" ((u3).c21)); \ +__asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" \ + "addpd %%xmm4, %%xmm1 \n\t" \ + "addpd %%xmm5, %%xmm2" \ + : \ + :) ; \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2" \ + : \ + "=m" ((u3).c01), \ + "=m" ((u3).c11), \ + "=m" ((u3).c21)) ; \ + \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c20), \ + "m" ((u2).c21), \ + "m" ((u2).c22)); \ +__asm__ __volatile__ ("xorpd %0, %%xmm0 \n\t" \ + "xorpd %0, %%xmm1 \n\t" \ + "xorpd %0, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn2)); \ +_sse_su3_multiply(u1); \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u3).c02), \ + "m" ((u3).c12), \ + "m" ((u3).c22)); \ +__asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" \ + "addpd %%xmm4, %%xmm1 \n\t" \ + "addpd %%xmm5, %%xmm2" \ + : \ + :) ; \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2" \ + : \ + "=m" ((u3).c02), \ + "=m" ((u3).c12), \ + "=m" ((u3).c22)) ; + + +/* _sse_su3d_times_su3 by martin hasenbusch */ +#define _sse_su3d_times_su3(u3,u1,u2) \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c00), \ + "m" ((u2).c10), \ + "m" ((u2).c20)); \ +_sse_su3_inverse_multiply(u1); \ +__asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" \ + "movapd %%xmm4, %1 \n\t" \ + "movapd %%xmm5, %2" \ + : \ + "=m" ((u3).c00), \ + "=m" ((u3).c10), \ + "=m" ((u3).c20)) ; \ +__asm__ 
__volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c01), \ + "m" ((u2).c11), \ + "m" ((u2).c21)); \ +_sse_su3_inverse_multiply(u1); \ +__asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" \ + "movapd %%xmm4, %1 \n\t" \ + "movapd %%xmm5, %2" \ + : \ + "=m" ((u3).c01), \ + "=m" ((u3).c11), \ + "=m" ((u3).c21)) ; \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c02), \ + "m" ((u2).c12), \ + "m" ((u2).c22)); \ +_sse_su3_inverse_multiply(u1); \ +__asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" \ + "movapd %%xmm4, %1 \n\t" \ + "movapd %%xmm5, %2" \ + : \ + "=m" ((u3).c02), \ + "=m" ((u3).c12), \ + "=m" ((u3).c22)) ; + +/* _sse_su3d_times_su3_acc by martin hasenbusch */ +#define _sse_su3d_times_su3_acc(u3,u1,u2) \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c00), \ + "m" ((u2).c10), \ + "m" ((u2).c20)); \ +_sse_su3_inverse_multiply(u1); \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u3).c00), \ + "m" ((u3).c10), \ + "m" ((u3).c20)); \ +__asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" \ + "addpd %%xmm4, %%xmm1 \n\t" \ + "addpd %%xmm5, %%xmm2" \ + : \ + :) ; \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2" \ + : \ + "=m" ((u3).c00), \ + "=m" ((u3).c10), \ + "=m" ((u3).c20)) ; \ + \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c01), \ + "m" ((u2).c11), \ + "m" ((u2).c21)); \ +_sse_su3_inverse_multiply(u1); \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u3).c01), \ + "m" ((u3).c11), \ + "m" ((u3).c21)); \ +__asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" \ + "addpd 
%%xmm4, %%xmm1 \n\t" \ + "addpd %%xmm5, %%xmm2" \ + : \ + :) ; \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2" \ + : \ + "=m" ((u3).c01), \ + "=m" ((u3).c11), \ + "=m" ((u3).c21)) ; \ + \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c02), \ + "m" ((u2).c12), \ + "m" ((u2).c22)); \ +_sse_su3_inverse_multiply(u1); \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u3).c02), \ + "m" ((u3).c12), \ + "m" ((u3).c22)); \ +__asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" \ + "addpd %%xmm4, %%xmm1 \n\t" \ + "addpd %%xmm5, %%xmm2" \ + : \ + :) ; \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2" \ + : \ + "=m" ((u3).c02), \ + "=m" ((u3).c12), \ + "=m" ((u3).c22)) ; + +/* _sse_su3_times_su3d by martin hasenbusch */ +#define _sse_su3_times_su3d(u3,u1,u2) \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c00), \ + "m" ((u2).c01), \ + "m" ((u2).c02)); \ +__asm__ __volatile__ ("xorpd %0, %%xmm0 \n\t" \ + "xorpd %0, %%xmm1 \n\t" \ + "xorpd %0, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn2)); \ +_sse_su3_multiply(u1); \ +__asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" \ + "movapd %%xmm4, %1 \n\t" \ + "movapd %%xmm5, %2" \ + : \ + "=m" ((u3).c00), \ + "=m" ((u3).c10), \ + "=m" ((u3).c20)) ; \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c10), \ + "m" ((u2).c11), \ + "m" ((u2).c12)); \ +__asm__ __volatile__ ("xorpd %0, %%xmm0 \n\t" \ + "xorpd %0, %%xmm1 \n\t" \ + "xorpd %0, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn2)); \ +_sse_su3_multiply(u1); \ +__asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" \ + "movapd %%xmm4, %1 \n\t" \ + "movapd %%xmm5, %2" \ + : \ + "=m" ((u3).c01), \ + 
"=m" ((u3).c11), \ + "=m" ((u3).c21)) ; \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u2).c20), \ + "m" ((u2).c21), \ + "m" ((u2).c22)); \ +__asm__ __volatile__ ("xorpd %0, %%xmm0 \n\t" \ + "xorpd %0, %%xmm1 \n\t" \ + "xorpd %0, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn2)); \ +_sse_su3_multiply(u1); \ +__asm__ __volatile__ ("movapd %%xmm3, %0 \n\t" \ + "movapd %%xmm4, %1 \n\t" \ + "movapd %%xmm5, %2" \ + : \ + "=m" ((u3).c02), \ + "=m" ((u3).c12), \ + "=m" ((u3).c22)) ; + +#define _sse_su3_acc(u1,u2) \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u1).c00), \ + "m" ((u1).c01), \ + "m" ((u1).c02)); \ +__asm__ __volatile__ ("movapd %0, %%xmm3 \n\t" \ + "movapd %1, %%xmm4 \n\t" \ + "movapd %2, %%xmm5" \ + : \ + : \ + "m" ((u2).c00), \ + "m" ((u2).c01), \ + "m" ((u2).c02)); \ +__asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" \ + "addpd %%xmm4, %%xmm1 \n\t" \ + "addpd %%xmm5, %%xmm2" \ + : \ + :) ; \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2" \ + : \ + "=m" ((u1).c00), \ + "=m" ((u1).c01), \ + "=m" ((u1).c02)) ; \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + : \ + "m" ((u1).c10), \ + "m" ((u1).c11), \ + "m" ((u1).c12)); \ +__asm__ __volatile__ ("movapd %0, %%xmm3 \n\t" \ + "movapd %1, %%xmm4 \n\t" \ + "movapd %2, %%xmm5" \ + : \ + : \ + "m" ((u2).c10), \ + "m" ((u2).c11), \ + "m" ((u2).c12)); \ +__asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" \ + "addpd %%xmm4, %%xmm1 \n\t" \ + "addpd %%xmm5, %%xmm2" \ + : \ + :) ; \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2" \ + : \ + "=m" ((u1).c10), \ + "=m" ((u1).c11), \ + "=m" ((u1).c12)) ; \ +__asm__ __volatile__ ("movapd %0, %%xmm0 \n\t" \ + "movapd %1, %%xmm1 \n\t" \ + "movapd %2, %%xmm2" \ + : \ + 
: \ + "m" ((u1).c20), \ + "m" ((u1).c21), \ + "m" ((u1).c22)); \ +__asm__ __volatile__ ("movapd %0, %%xmm3 \n\t" \ + "movapd %1, %%xmm4 \n\t" \ + "movapd %2, %%xmm5" \ + : \ + : \ + "m" ((u2).c20), \ + "m" ((u2).c21), \ + "m" ((u2).c22)); \ +__asm__ __volatile__ ("addpd %%xmm3, %%xmm0 \n\t" \ + "addpd %%xmm4, %%xmm1 \n\t" \ + "addpd %%xmm5, %%xmm2" \ + : \ + :) ; \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2" \ + : \ + "=m" ((u1).c20), \ + "=m" ((u1).c21), \ + "=m" ((u1).c22)) ; + +#endif + +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sse3.h b/qcd/part_cpu/applications/QCD/src/kernel_D/sse3.h new file mode 100644 index 0000000000000000000000000000000000000000..718832ee5e34276179e7842983c68771cf0fd827 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sse3.h @@ -0,0 +1,471 @@ +/*********************************************************************** + * + * Copyright (C) 2004 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see http://www.gnu.org/licenses/.
+ * + * SSE3 versions of macros used in the Dirac operator + * + ***********************************************************************/ + +#ifndef _SSE3_h +#define _SSE3_h + +/* + * C.Urbach + * Multiplies xmm3,xmm4,xmm5 with the complex number stored + * in xmm6 and xmm7 + */ + +#define _sse_vector_cmplx_mul_two() \ +__asm__ __volatile__ ("movapd %%xmm7, %%xmm0 \n\t" \ + "movapd %%xmm7, %%xmm1 \n\t" \ + "movapd %%xmm7, %%xmm2 \n\t" \ + "mulpd %%xmm3, %%xmm0 \n\t" \ + "mulpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm4, %%xmm1 \n\t" \ + "mulpd %%xmm6, %%xmm4 \n\t" \ + "mulpd %%xmm5, %%xmm2 \n\t" \ + "mulpd %%xmm6, %%xmm5 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \ + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \ + "addsubpd %%xmm0, %%xmm3 \n\t" \ + "addsubpd %%xmm1, %%xmm4 \n\t" \ + "addsubpd %%xmm2, %%xmm5 \n\t" \ + : \ + :); + + + +/* + * C. Urbach Thu Aug 19 15:07:01 CEST 2004 + * Multiplies xmm3,xmm4,xmm5 with the complex number c + * using SSE3 instructions + */ +#define _sse_vector_cmplx_mul(c) \ +__asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" \ + "movddup %1, %%xmm7 \n\t" \ + "movapd %%xmm7, %%xmm0 \n\t" \ + "movapd %%xmm7, %%xmm1 \n\t" \ + "movapd %%xmm7, %%xmm2 \n\t" \ + "mulpd %%xmm3, %%xmm0 \n\t" \ + "mulpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm4, %%xmm1 \n\t" \ + "mulpd %%xmm6, %%xmm4 \n\t" \ + "mulpd %%xmm5, %%xmm2 \n\t" \ + "mulpd %%xmm6, %%xmm5 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \ + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \ + "addsubpd %%xmm0, %%xmm3 \n\t" \ + "addsubpd %%xmm1, %%xmm4 \n\t" \ + "addsubpd %%xmm2, %%xmm5 \n\t" \ + : \ + : \ + "m" (creal(c)), \ + "m" (cimag(c))) ; + + +/* + * C. 
Urbach Thu Aug 19 15:07:01 CEST 2004 + * Multiplies xmm3,xmm4,xmm5 with the complex + * conjugate of the number c + * using SSE3 instructions + */ +#define _sse_vector_cmplxcg_mul(c) \ +__asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" \ + "movddup %1, %%xmm7 \n\t" \ + "movapd %%xmm7, %%xmm0 \n\t" \ + "movapd %%xmm7, %%xmm1 \n\t" \ + "movapd %%xmm7, %%xmm2 \n\t" \ + "mulpd %%xmm3, %%xmm0 \n\t" \ + "mulpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm4, %%xmm1 \n\t" \ + "mulpd %%xmm6, %%xmm4 \n\t" \ + "mulpd %%xmm5, %%xmm2 \n\t" \ + "mulpd %%xmm6, %%xmm5 \n\t" \ + "shufpd $0x1, %%xmm3, %%xmm3 \n\t" \ + "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \ + "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \ + "addsubpd %%xmm0, %%xmm3 \n\t" \ + "addsubpd %%xmm1, %%xmm4 \n\t" \ + "addsubpd %%xmm2, %%xmm5 \n\t" \ + "shufpd $0x1, %%xmm3, %%xmm3 \n\t" \ + "shufpd $0x1, %%xmm4, %%xmm4 \n\t" \ + "shufpd $0x1, %%xmm5, %%xmm5 \n\t" \ + : \ + : \ + "m" (creal(c)), \ + "m" (cimag(c))) ; + + +/* + * C. Urbach + * SSE3 implementation + * Multiplies an su3 vector s with an su3 matrix u, assuming s is + * stored in xmm0,xmm1,xmm2 + * + * On output the result is in xmm3,xmm4,xmm5 and the registers + * xmm0,xmm1,xmm2 are changed + */ +#if defined OPTERON +#define _sse_su3_multiply(u) \ +__asm__ __volatile__ ("movddup %0, %%xmm3 \n\t" \ + "movddup %1, %%xmm6 \n\t" \ + "movddup %2, %%xmm4 \n\t" \ + "mulpd %%xmm0, %%xmm3 \n\t" \ + "movddup %3, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "movddup %4, %%xmm5 \n\t" \ + "mulpd %%xmm0, %%xmm4 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "movddup %5, %%xmm8 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "mulpd %%xmm0, %%xmm5 \n\t" \ + "movddup %6, %%xmm9 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "mulpd %%xmm1, %%xmm8 \n\t" \ + "movddup %7, %%xmm10 \n\t" \ + "mulpd %%xmm2, %%xmm9 \n\t" \ + "movddup %8, %%xmm11 \n\t" \ + "mulpd %%xmm1, %%xmm10 \n\t" \ + "mulpd %%xmm2, %%xmm11 \n\t" \ + "addpd %%xmm8, %%xmm5 \n\t" \ + "addpd %%xmm9, %%xmm3 \n\t" \ + "addpd %%xmm10, %%xmm4 \n\t" \ + "addpd 
%%xmm11, %%xmm5" \ + : \ + : \ + "m" (creal((u).c00)), \ + "m" (creal((u).c01)), \ + "m" (creal((u).c10)), \ + "m" (creal((u).c12)), \ + "m" (creal((u).c20)), \ + "m" (creal((u).c21)), \ + "m" (creal((u).c02)), \ + "m" (creal((u).c11)), \ + "m" (creal((u).c22))); \ +__asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" \ + "movddup %1, %%xmm7 \n\t" \ + "movddup %2, %%xmm8 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \ + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \ + "mulpd %%xmm0, %%xmm6 \n\t" \ + "mulpd %%xmm1, %%xmm7 \n\t" \ + "mulpd %%xmm2, %%xmm8 \n\t" \ + "movddup %3, %%xmm9 \n\t" \ + "movddup %4, %%xmm10 \n\t" \ + "movddup %5, %%xmm11 \n\t" \ + "addsubpd %%xmm6, %%xmm3 \n\t" \ + "addsubpd %%xmm7, %%xmm4 \n\t" \ + "addsubpd %%xmm8, %%xmm5 \n\t" \ + "mulpd %%xmm0, %%xmm9 \n\t" \ + "mulpd %%xmm1, %%xmm10 \n\t" \ + "mulpd %%xmm0, %%xmm11 \n\t" \ + "movddup %6, %%xmm12 \n\t" \ + "movddup %7, %%xmm13 \n\t" \ + "movddup %8, %%xmm14 \n\t" \ + "addsubpd %%xmm9, %%xmm4 \n\t" \ + "addsubpd %%xmm10, %%xmm3 \n\t" \ + "addsubpd %%xmm11, %%xmm5 \n\t" \ + "mulpd %%xmm2, %%xmm12 \n\t" \ + "mulpd %%xmm1, %%xmm13 \n\t" \ + "mulpd %%xmm2, %%xmm14 \n\t" \ + "addsubpd %%xmm12, %%xmm3 \n\t" \ + "addsubpd %%xmm13, %%xmm5 \n\t" \ + "addsubpd %%xmm14, %%xmm4" \ + : \ + : \ + "m" (cimag((u).c00)), \ + "m" (cimag((u).c11)), \ + "m" (cimag((u).c22)), \ + "m" (cimag((u).c10)), \ + "m" (cimag((u).c01)), \ + "m" (cimag((u).c20)), \ + "m" (cimag((u).c02)), \ + "m" (cimag((u).c21)), \ + "m" (cimag((u).c12))) + + +#else + +#define _sse_su3_multiply(u) \ +__asm__ __volatile__ ("movddup %0, %%xmm3 \n\t" \ + "movddup %1, %%xmm6 \n\t" \ + "movddup %2, %%xmm4 \n\t" \ + "mulpd %%xmm0, %%xmm3 \n\t" \ + "movddup %3, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "movddup %4, %%xmm5 \n\t" \ + "mulpd %%xmm0, %%xmm4 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "mulpd %%xmm0, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "movddup %5, %%xmm6 \n\t" 
\ + "movddup %6, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm3 \n\t" \ + "movddup %7, %%xmm6 \n\t" \ + "movddup %8, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm4 \n\t" \ + "addpd %%xmm7, %%xmm5" \ + : \ + : \ + "m" (creal((u).c00)), \ + "m" (creal((u).c01)), \ + "m" (creal((u).c10)), \ + "m" (creal((u).c12)), \ + "m" (creal((u).c20)), \ + "m" (creal((u).c21)), \ + "m" (creal((u).c02)), \ + "m" (creal((u).c11)), \ + "m" (creal((u).c22))); \ +__asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" \ + "movddup %1, %%xmm7 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \ + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \ + "mulpd %%xmm0, %%xmm6 \n\t" \ + "mulpd %%xmm1, %%xmm7 \n\t" \ + "addsubpd %%xmm6, %%xmm3 \n\t" \ + "addsubpd %%xmm7, %%xmm4 \n\t" \ + "movddup %2, %%xmm6 \n\t" \ + "movddup %3, %%xmm7 \n\t" \ + "mulpd %%xmm2, %%xmm6 \n\t" \ + "mulpd %%xmm0, %%xmm7 \n\t" \ + "addsubpd %%xmm6, %%xmm5 \n\t" \ + "addsubpd %%xmm7, %%xmm4 \n\t" \ + "movddup %4, %%xmm6 \n\t" \ + "movddup %5, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm0, %%xmm7 \n\t" \ + "addsubpd %%xmm6, %%xmm3 \n\t" \ + "addsubpd %%xmm7, %%xmm5 \n\t" \ + "movddup %6, %%xmm0 \n\t" \ + "movddup %7, %%xmm6 \n\t" \ + "movddup %8, %%xmm7 \n\t" \ + "mulpd %%xmm2, %%xmm0 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "addsubpd %%xmm0, %%xmm3 \n\t" \ + "addsubpd %%xmm6, %%xmm5 \n\t" \ + "addsubpd %%xmm7, %%xmm4" \ + : \ + : \ + "m" (cimag((u).c00)), \ + "m" (cimag((u).c11)), \ + "m" (cimag((u).c22)), \ + "m" (cimag((u).c10)), \ + "m" (cimag((u).c01)), \ + "m" (cimag((u).c20)), \ + "m" (cimag((u).c02)), \ + "m" (cimag((u).c21)), \ + "m" (cimag((u).c12))) + + +#endif + +/* + * C. 
Urbach + * SSE3 Implementation of + * Multiplies an su3 vector s with an su3 matrix u^dagger, assuming s is + * stored in xmm0,xmm1,xmm2 + * + * On output the result is in xmm3,xmm4,xmm5 and the registers + * xmm0,xmm1,xmm2 are changed + */ + +#if defined OPTERON + +#define _sse_su3_inverse_multiply(u) \ +__asm__ __volatile__ ("movddup %0, %%xmm3 \n\t" \ + "movddup %1, %%xmm6 \n\t" \ + "movddup %2, %%xmm4 \n\t" \ + "mulpd %%xmm0, %%xmm3 \n\t" \ + "movddup %3, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "movddup %4, %%xmm5 \n\t" \ + "mulpd %%xmm0, %%xmm4 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "mulpd %%xmm0, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "movddup %5, %%xmm8 \n\t" \ + "movddup %6, %%xmm9 \n\t" \ + "mulpd %%xmm1, %%xmm8 \n\t" \ + "mulpd %%xmm2, %%xmm9 \n\t" \ + "movddup %7, %%xmm10 \n\t" \ + "movddup %8, %%xmm11 \n\t" \ + "mulpd %%xmm1, %%xmm10 \n\t" \ + "mulpd %%xmm2, %%xmm11 \n\t" \ + "addpd %%xmm8, %%xmm5 \n\t" \ + "addpd %%xmm9, %%xmm3 \n\t" \ + "addpd %%xmm10, %%xmm4 \n\t" \ + "addpd %%xmm11, %%xmm5" \ + : \ + : \ + "m" (creal((u).c00)), \ + "m" (creal((u).c10)), \ + "m" (creal((u).c01)), \ + "m" (creal((u).c21)), \ + "m" (creal((u).c02)), \ + "m" (creal((u).c12)), \ + "m" (creal((u).c20)), \ + "m" (creal((u).c11)), \ + "m" (creal((u).c22))); \ +__asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" \ + "movddup %1, %%xmm7 \n\t" \ + "movddup %2, %%xmm8 \n\t" \ + "xorpd %9, %%xmm0 \n\t" \ + "xorpd %9, %%xmm1 \n\t" \ + "xorpd %9, %%xmm2 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \ + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "shufpd $0x1, %%xmm2, %%xmm2 \n\t" \ + "mulpd %%xmm0, %%xmm6 \n\t" \ + "mulpd %%xmm1, %%xmm7 \n\t" \ + "mulpd %%xmm2, %%xmm8 \n\t" \ + "movddup %3, %%xmm9 \n\t" \ + "movddup %4, %%xmm10 \n\t" \ + "movddup %5, %%xmm11 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "addpd %%xmm8, %%xmm5 \n\t" \ + "mulpd %%xmm0, %%xmm9 \n\t" \ + "mulpd %%xmm1, %%xmm10 \n\t" \ + "mulpd 
%%xmm0, %%xmm11 \n\t" \ + "movddup %6, %%xmm12 \n\t" \ + "movddup %7, %%xmm13 \n\t" \ + "movddup %8, %%xmm14 \n\t" \ + "addpd %%xmm9, %%xmm4 \n\t" \ + "addpd %%xmm10, %%xmm3 \n\t" \ + "addpd %%xmm11, %%xmm5 \n\t" \ + "mulpd %%xmm2, %%xmm12 \n\t" \ + "mulpd %%xmm1, %%xmm13 \n\t" \ + "mulpd %%xmm2, %%xmm14 \n\t" \ + "addpd %%xmm12, %%xmm3 \n\t" \ + "addpd %%xmm13, %%xmm5 \n\t" \ + "addpd %%xmm14, %%xmm4" \ + : \ + : \ + "m" (cimag((u).c00)), \ + "m" (cimag((u).c11)), \ + "m" (cimag((u).c22)), \ + "m" (cimag((u).c01)), \ + "m" (cimag((u).c10)), \ + "m" (cimag((u).c02)), \ + "m" (cimag((u).c20)), \ + "m" (cimag((u).c12)), \ + "m" (cimag((u).c21)), \ + "m" (_sse_sgn)); + + +#else + +#define _sse_su3_inverse_multiply(u) \ +__asm__ __volatile__ ("movddup %0, %%xmm3 \n\t" \ + "movddup %1, %%xmm6 \n\t" \ + "movddup %2, %%xmm4 \n\t" \ + "mulpd %%xmm0, %%xmm3 \n\t" \ + "movddup %3, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "movddup %4, %%xmm5 \n\t" \ + "mulpd %%xmm0, %%xmm4 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "mulpd %%xmm0, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "movddup %5, %%xmm6 \n\t" \ + "movddup %6, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm3 \n\t" \ + "movddup %7, %%xmm6 \n\t" \ + "movddup %8, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm4 \n\t" \ + "addpd %%xmm7, %%xmm5" \ + : \ + : \ + "m" (creal((u).c00)), \ + "m" (creal((u).c10)), \ + "m" (creal((u).c01)), \ + "m" (creal((u).c21)), \ + "m" (creal((u).c02)), \ + "m" (creal((u).c12)), \ + "m" (creal((u).c20)), \ + "m" (creal((u).c11)), \ + "m" (creal((u).c22))); \ +__asm__ __volatile__ ("movddup %0, %%xmm6 \n\t" \ + "movddup %1, %%xmm7 \n\t" \ + "xorpd %9, %%xmm0 \n\t" \ + "xorpd %9, %%xmm1 \n\t" \ + "xorpd %9, %%xmm2 \n\t" \ + "shufpd $0x1, %%xmm0, %%xmm0 \n\t" \ + "shufpd $0x1, %%xmm1, %%xmm1 \n\t" \ + "shufpd $0x1, 
%%xmm2, %%xmm2 \n\t" \ + "mulpd %%xmm0, %%xmm6 \n\t" \ + "mulpd %%xmm1, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "movddup %2, %%xmm6 \n\t" \ + "movddup %3, %%xmm7 \n\t" \ + "mulpd %%xmm2, %%xmm6 \n\t" \ + "mulpd %%xmm0, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm4 \n\t" \ + "movddup %4, %%xmm6 \n\t" \ + "movddup %5, %%xmm7 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm0, %%xmm7 \n\t" \ + "addpd %%xmm6, %%xmm3 \n\t" \ + "addpd %%xmm7, %%xmm5 \n\t" \ + "movddup %6, %%xmm0 \n\t" \ + "movddup %7, %%xmm6 \n\t" \ + "movddup %8, %%xmm7 \n\t" \ + "mulpd %%xmm2, %%xmm0 \n\t" \ + "mulpd %%xmm1, %%xmm6 \n\t" \ + "mulpd %%xmm2, %%xmm7 \n\t" \ + "addpd %%xmm0, %%xmm3 \n\t" \ + "addpd %%xmm6, %%xmm5 \n\t" \ + "addpd %%xmm7, %%xmm4" \ + : \ + : \ + "m" (cimag((u).c00)), \ + "m" (cimag((u).c11)), \ + "m" (cimag((u).c22)), \ + "m" (cimag((u).c01)), \ + "m" (cimag((u).c10)), \ + "m" (cimag((u).c02)), \ + "m" (cimag((u).c20)), \ + "m" (cimag((u).c12)), \ + "m" (cimag((u).c21)), \ + "m" (_sse_sgn)); + +#endif + + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/sse_32.h b/qcd/part_cpu/applications/QCD/src/kernel_D/sse_32.h new file mode 100644 index 0000000000000000000000000000000000000000..267bbc95e90a440d3891c3e406ee63f1fe86cd93 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/sse_32.h @@ -0,0 +1,662 @@ + +/******************************************************************************* +* +* File sse.h +* +* Macros for Dirac spinors, SU(3) vectors and SU(3) matrices using +* inline assembly SSE and SSE2 instructions +* +* Needs gcc version 2.95.2 or later, and binutils snapshot 010122 or later +* if the SSE2 instructions are used +* +* Version: 2.1 +* Author: Martin Luescher +* Date: 15.03.2001 +* +*******************************************************************************/ + +typedef struct +{ + float c1,c2,c3,c4; +} sse_float __attribute__ ((aligned (16))); + +typedef 
struct +{ + sse_float c1,c2,c3; +} sse_vector __attribute__ ((aligned (16))); + +static sse_float _sse_sgn12 __attribute__ ((unused)) ={-1.0f,-1.0f,1.0f,1.0f}; +static sse_float _sse_sgn13 __attribute__ ((unused)) ={-1.0f,1.0f,-1.0f,1.0f}; +static sse_float _sse_sgn14 __attribute__ ((unused)) ={-1.0f,1.0f,1.0f,-1.0f}; +static sse_float _sse_sgn23 __attribute__ ((unused)) ={1.0f,-1.0f,-1.0f,1.0f}; +static sse_float _sse_sgn24 __attribute__ ((unused)) ={1.0f,-1.0f,1.0f,-1.0f}; +static sse_float _sse_sgn34 __attribute__ ((unused)) ={1.0f,1.0f,-1.0f,-1.0f}; + + +/******************************************************************************* +* +* Cache manipulation macros +* +*******************************************************************************/ + +#if defined P4 + +#define _prefetch_spinor(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1" \ + : \ + : \ + "m" (*(((char*)(((unsigned int)(addr))&~0x7f)))), \ + "m" (*(((char*)(((unsigned int)(addr))&~0x7f))+128))) + +#define _prefetch_su3(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1" \ + : \ + : \ + "m" (*(((char*)(((unsigned int)(addr))&~0x7f)))), \ + "m" (*(((char*)(((unsigned int)(addr))&~0x7f))+128))) + +#else + +#define _prefetch_spinor(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2" \ + : \ + : \ + "m" (*(((char*)(addr)))), \ + "m" (*(((char*)(addr))+32)), \ + "m" (*(((char*)(addr))+64))) + +#define _prefetch_su3(addr) \ +__asm__ __volatile__ ("prefetcht0 %0 \n\t" \ + "prefetcht0 %1 \n\t" \ + "prefetcht0 %2" \ + : \ + : \ + "m" (*(((char*)(((unsigned int)(addr))&~0x1f)))), \ + "m" (*(((char*)(((unsigned int)(addr))&~0x1f))+32)), \ + "m" (*(((char*)(((unsigned int)(addr))&~0x1f))+64))) + +#endif + + +/******************************************************************************* +* +* Macros for su3 vectors used in D_psi version 2.1 +* +* Most of these macros operate on pairs of su3 vectors that are stored +* 
in the low and high words of xmm0,xmm1,xmm2 or xmm3,xmm4,xmm5. For example, +* +* xmm0 -> sl.c1.re,sl.c1.im,sh.c1.re,sh.c1.im +* xmm1 -> sl.c2.re,sl.c2.im,sh.c2.re,sh.c2.im +* xmm2 -> sl.c3.re,sl.c3.im,sh.c3.re,sh.c3.im +* +* (where sl and sh are of type su3_vector). This can also be interpreted as +* an sse_vector s that is stored in these registers according to +* +* xmm0 -> s.c1.c1,s.c1.c2,s.c1.c3,s.c1.c4 +* xmm1 -> s.c2.c1,s.c2.c2,s.c2.c3,s.c2.c4 +* xmm2 -> s.c3.c1,s.c3.c2,s.c3.c3,s.c3.c4 +* +* The load and store macros can be used to move data in either format +* from and to the xmm registers +* +*******************************************************************************/ + +/* +* Loads two su3 vectors sl and sh to the low and high words of xmm0,xmm1,xmm2 +*/ + +#if defined SSE2 + +#define _sse_pair_load(sl,sh) \ +__asm__ __volatile__ ("movsd %0, %%xmm0 \n\t" \ + "movsd %1, %%xmm1 \n\t" \ + "movsd %2, %%xmm2 \n\t" \ + "movhps %3, %%xmm0 \n\t" \ + "movhps %4, %%xmm1 \n\t" \ + "movhps %5, %%xmm2" \ + : \ + : \ + "m" ((sl).c1), \ + "m" ((sl).c2), \ + "m" ((sl).c3), \ + "m" ((sh).c1), \ + "m" ((sh).c2), \ + "m" ((sh).c3)) + +#else + +#define _sse_pair_load(sl,sh) \ +__asm__ __volatile__ ("movlps %0, %%xmm0 \n\t" \ + "movlps %1, %%xmm1 \n\t" \ + "movlps %2, %%xmm2 \n\t" \ + "movhps %3, %%xmm0 \n\t" \ + "movhps %4, %%xmm1 \n\t" \ + "movhps %5, %%xmm2" \ + : \ + : \ + "m" ((sl).c1), \ + "m" ((sl).c2), \ + "m" ((sl).c3), \ + "m" ((sh).c1), \ + "m" ((sh).c2), \ + "m" ((sh).c3)) + +#endif + +/* +* Loads two su3 vectors sl and sh to the low and high words of xmm3,xmm4,xmm5 +*/ + +#if defined SSE2 + +#define _sse_pair_load_up(sl,sh) \ +__asm__ __volatile__ ("movsd %0, %%xmm3 \n\t" \ + "movsd %1, %%xmm4 \n\t" \ + "movsd %2, %%xmm5 \n\t" \ + "movhps %3, %%xmm3 \n\t" \ + "movhps %4, %%xmm4 \n\t" \ + "movhps %5, %%xmm5" \ + : \ + : \ + "m" ((sl).c1), \ + "m" ((sl).c2), \ + "m" ((sl).c3), \ + "m" ((sh).c1), \ + "m" ((sh).c2), \ + "m" ((sh).c3)) + +#else + +#define 
_sse_pair_load_up(sl,sh) \ +__asm__ __volatile__ ("movlps %0, %%xmm3 \n\t" \ + "movlps %1, %%xmm4 \n\t" \ + "movlps %2, %%xmm5 \n\t" \ + "movhps %3, %%xmm3 \n\t" \ + "movhps %4, %%xmm4 \n\t" \ + "movhps %5, %%xmm5" \ + : \ + : \ + "m" ((sl).c1), \ + "m" ((sl).c2), \ + "m" ((sl).c3), \ + "m" ((sh).c1), \ + "m" ((sh).c2), \ + "m" ((sh).c3)) + +#endif + +/* +* Stores the low and high words of xmm0,xmm1,xmm2 to the su3 vectors rl and rh +*/ + +#define _sse_pair_store(rl,rh) \ +__asm__ __volatile__ ("movlps %%xmm0, %0 \n\t" \ + "movlps %%xmm1, %1 \n\t" \ + "movlps %%xmm2, %2 \n\t" \ + "movhps %%xmm0, %3 \n\t" \ + "movhps %%xmm1, %4 \n\t" \ + "movhps %%xmm2, %5" \ + : \ + "=m" ((rl).c1), \ + "=m" ((rl).c2), \ + "=m" ((rl).c3), \ + "=m" ((rh).c1), \ + "=m" ((rh).c2), \ + "=m" ((rh).c3)) + +/* +* Stores the low and high words of xmm3,xmm4,xmm5 to the su3 vectors rl and rh +*/ + +#define _sse_pair_store_up(rl,rh) \ +__asm__ __volatile__ ("movlps %%xmm3, %0 \n\t" \ + "movlps %%xmm4, %1 \n\t" \ + "movlps %%xmm5, %2 \n\t" \ + "movhps %%xmm3, %3 \n\t" \ + "movhps %%xmm4, %4 \n\t" \ + "movhps %%xmm5, %5" \ + : \ + "=m" ((rl).c1), \ + "=m" ((rl).c2), \ + "=m" ((rl).c3), \ + "=m" ((rh).c1), \ + "=m" ((rh).c2), \ + "=m" ((rh).c3)) + +/* +* Loads the components s.c1,s.c2,s.c3 of an _sse_vector s to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_load(s) \ +__asm__ __volatile__ ("movaps %0, %%xmm0 \n\t" \ + "movaps %1, %%xmm1 \n\t" \ + "movaps %2, %%xmm2" \ + : \ + : \ + "m" ((s).c1), \ + "m" ((s).c2), \ + "m" ((s).c3)) + +/* +* Stores xmm0,xmm1,xmm2 to the components r.c1,r.c2,r.c3 of an _sse_vector r +*/ + +#define _sse_vector_store(r) \ +__asm__ __volatile__ ("movaps %%xmm0, %0 \n\t" \ + "movaps %%xmm1, %1 \n\t" \ + "movaps %%xmm2, %2" \ + : \ + "=m" ((r).c1), \ + "=m" ((r).c2), \ + "=m" ((r).c3)) + +/* +* Multiplies xmm0,xmm1,xmm2 with a constant sse_float c +*/ + +#define _sse_vector_mul(c) \ +__asm__ __volatile__ ("mulps %0, %%xmm0 \n\t" \ + "mulps %0, %%xmm1 \n\t" \ + "mulps %0,
%%xmm2" \ + : \ + : \ + "m" (c)) + +/* +* Adds xmm3,xmm4,xmm5 to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_add() \ +__asm__ __volatile__ ("addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + :) + + +/* +* Subtracts xmm3,xmm4,xmm5 from xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_sub() \ +__asm__ __volatile__ ("subps %%xmm3, %%xmm0 \n\t" \ + "subps %%xmm4, %%xmm1 \n\t" \ + "subps %%xmm5, %%xmm2" \ + : \ + :) + +/* +* Multiplies the high words xmm3,xmm4,xmm5 with -1 and adds these registers +* to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_addsub() \ +__asm__ __volatile__ ("mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn34)) + +/* +* Multiplies the low words xmm3,xmm4,xmm5 with -1 and adds these registers +* to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_subadd() \ +__asm__ __volatile__ ("mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn12)) + +/* +* Multiplies xmm3,xmm4,xmm5 with i and adds them to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_i_add() \ +__asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ + "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ + "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ + "mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn13)) + +/* +* Multiplies xmm3,xmm4,xmm5 with i and subtracts them from xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_i_sub() \ +__asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ + "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ + "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ + "mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ +
"addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn24)) + +/* +* Exchanges the high and low words of xmm3,xmm4,xmm5, multiplies them with i +* and adds the result to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_xch_i_add() \ +__asm__ __volatile__ ("shufps $0x1b, %%xmm3, %%xmm3 \n\t" \ + "shufps $0x1b, %%xmm4, %%xmm4 \n\t" \ + "shufps $0x1b, %%xmm5, %%xmm5 \n\t" \ + "mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn13)) + +/* +* Exchanges the high and low words of xmm3,xmm4,xmm5, multiplies them with i +* and subtracts the result from xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_xch_i_sub() \ +__asm__ __volatile__ ("shufps $0x1b, %%xmm3, %%xmm3 \n\t" \ + "shufps $0x1b, %%xmm4, %%xmm4 \n\t" \ + "shufps $0x1b, %%xmm5, %%xmm5 \n\t" \ + "mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn24)) + +/* +* Multiplies the low and high words of xmm3,xmm4,xmm5 with i and -i +* respectively and adds these registers to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_i_addsub() \ +__asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ + "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ + "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ + "mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn14)) + +/* +* Multiplies the low and high words of xmm3,xmm4,xmm5 with -i and i +* respectively and adds these registers to xmm0,xmm1,xmm2 +*/ + +#define _sse_vector_i_subadd() \ +__asm__ __volatile__ ("shufps $0xb1, %%xmm3, %%xmm3 \n\t" \ + "shufps $0xb1, %%xmm4, %%xmm4 \n\t" \ + "shufps $0xb1, %%xmm5, %%xmm5 \n\t" \ +
"mulps %0, %%xmm3 \n\t" \ + "mulps %0, %%xmm4 \n\t" \ + "mulps %0, %%xmm5 \n\t" \ + "addps %%xmm3, %%xmm0 \n\t" \ + "addps %%xmm4, %%xmm1 \n\t" \ + "addps %%xmm5, %%xmm2" \ + : \ + : \ + "m" (_sse_sgn23)) + +/* +* Exchanges the high and low words in xmm3,xmm4,xmm5 +*/ + +#define _sse_vector_xch() \ +__asm__ __volatile__ ("shufps $0x4e, %%xmm3, %%xmm3 \n\t" \ + "shufps $0x4e, %%xmm4, %%xmm4 \n\t" \ + "shufps $0x4e, %%xmm5, %%xmm5" \ + : \ + :) + +/* +* Multiplies a pair sl,sh of su3 vectors with an su3 matrix u, +* assuming sl and sh are in the low and high words of xmm0,xmm1,xmm2 +* +* On output the result is in xmm3,xmm4,xmm5 and the registers +* xmm0,xmm1,xmm2 are changed +*/ + +#define _sse_su3_multiply(u) \ +__asm__ __volatile__ ("movss %0, %%xmm3 \n\t" \ + "movss %1, %%xmm6 \n\t" \ + "movss %2, %%xmm4 \n\t" \ + "movss %3, %%xmm7 \n\t" \ + "movss %4, %%xmm5 \n\t" \ + "shufps $0x0, %%xmm3, %%xmm3 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm4, %%xmm4 \n\t" \ + "mulps %%xmm0, %%xmm3 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %%xmm1, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm5, %%xmm5 \n\t" \ + "mulps %%xmm0, %%xmm4 \n\t" \ + "addps %%xmm6, %%xmm3 \n\t" \ + "mulps %%xmm2, %%xmm7 \n\t" \ + "mulps %%xmm0, %%xmm5 \n\t" \ + "addps %%xmm7, %%xmm4 \n\t" \ + "movss %5, %%xmm6 \n\t" \ + "movss %6, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %%xmm1, %%xmm6 \n\t" \ + "mulps %%xmm2, %%xmm7 \n\t" \ + "addps %%xmm6, %%xmm5 \n\t" \ + "addps %%xmm7, %%xmm3 \n\t" \ + "movss %7, %%xmm6 \n\t" \ + "movss %8, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %%xmm1, %%xmm6 \n\t" \ + "mulps %%xmm2, %%xmm7 \n\t" \ + "addps %%xmm6, %%xmm4 \n\t" \ + "addps %%xmm7, %%xmm5" \ + : \ + : \ + "m" ((u).c11.re), \ + "m" ((u).c12.re), \ + "m" ((u).c21.re), \ + "m" ((u).c23.re), \ + "m" ((u).c31.re), \ + "m" ((u).c32.re), \ + "m" ((u).c13.re), \ + "m" 
((u).c22.re), \ + "m" ((u).c33.re)); \ +__asm__ __volatile__ ("movss %0, %%xmm6 \n\t" \ + "movss %1, %%xmm7 \n\t" \ + "shufps $0xb1, %%xmm0, %%xmm0 \n\t" \ + "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \ + "shufps $0xb1, %%xmm2, %%xmm2 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %9, %%xmm0 \n\t" \ + "mulps %9, %%xmm1 \n\t" \ + "mulps %9, %%xmm2 \n\t" \ + "mulps %%xmm0, %%xmm6 \n\t" \ + "mulps %%xmm1, %%xmm7 \n\t" \ + "addps %%xmm6, %%xmm3 \n\t" \ + "addps %%xmm7, %%xmm4 \n\t" \ + "movss %2, %%xmm6 \n\t" \ + "movss %3, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %%xmm2, %%xmm6 \n\t" \ + "mulps %%xmm0, %%xmm7 \n\t" \ + "addps %%xmm6, %%xmm5 \n\t" \ + "addps %%xmm7, %%xmm4 \n\t" \ + "movss %4, %%xmm6 \n\t" \ + "movss %5, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %%xmm1, %%xmm6 \n\t" \ + "mulps %%xmm0, %%xmm7 \n\t" \ + "addps %%xmm6, %%xmm3 \n\t" \ + "addps %%xmm7, %%xmm5 \n\t" \ + "movss %6, %%xmm0 \n\t" \ + "movss %7, %%xmm6 \n\t" \ + "movss %8, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm0, %%xmm0 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %%xmm2, %%xmm0 \n\t" \ + "mulps %%xmm1, %%xmm6 \n\t" \ + "mulps %%xmm2, %%xmm7 \n\t" \ + "addps %%xmm0, %%xmm3 \n\t" \ + "addps %%xmm6, %%xmm5 \n\t" \ + "addps %%xmm7, %%xmm4" \ + : \ + : \ + "m" ((u).c11.im), \ + "m" ((u).c22.im), \ + "m" ((u).c33.im), \ + "m" ((u).c21.im), \ + "m" ((u).c12.im), \ + "m" ((u).c31.im), \ + "m" ((u).c13.im), \ + "m" ((u).c32.im), \ + "m" ((u).c23.im), \ + "m" (_sse_sgn13)) + +/* +* Multiplies a pair sl,sh of su3 vectors with an su3 matrix u^dagger, +* assuming sl and sh are in the low and high words of xmm0,xmm1,xmm2 +* +* On output the result is in xmm3,xmm4,xmm5 and the registers +* xmm0,xmm1,xmm2 are changed +*/ + +#define _sse_su3_inverse_multiply(u) \ +__asm__ __volatile__ ("movss %0, %%xmm3 
\n\t" \ + "movss %1, %%xmm6 \n\t" \ + "movss %2, %%xmm4 \n\t" \ + "movss %3, %%xmm7 \n\t" \ + "movss %4, %%xmm5 \n\t" \ + "shufps $0x0, %%xmm3, %%xmm3 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm4, %%xmm4 \n\t" \ + "mulps %%xmm0, %%xmm3 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %%xmm1, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm5, %%xmm5 \n\t" \ + "mulps %%xmm0, %%xmm4 \n\t" \ + "addps %%xmm6, %%xmm3 \n\t" \ + "mulps %%xmm2, %%xmm7 \n\t" \ + "mulps %%xmm0, %%xmm5 \n\t" \ + "addps %%xmm7, %%xmm4 \n\t" \ + "movss %5, %%xmm6 \n\t" \ + "movss %6, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %%xmm1, %%xmm6 \n\t" \ + "mulps %%xmm2, %%xmm7 \n\t" \ + "addps %%xmm6, %%xmm5 \n\t" \ + "addps %%xmm7, %%xmm3 \n\t" \ + "movss %7, %%xmm6 \n\t" \ + "movss %8, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %%xmm1, %%xmm6 \n\t" \ + "mulps %%xmm2, %%xmm7 \n\t" \ + "addps %%xmm6, %%xmm4 \n\t" \ + "addps %%xmm7, %%xmm5" \ + : \ + : \ + "m" ((u).c11.re), \ + "m" ((u).c21.re), \ + "m" ((u).c12.re), \ + "m" ((u).c32.re), \ + "m" ((u).c13.re), \ + "m" ((u).c23.re), \ + "m" ((u).c31.re), \ + "m" ((u).c22.re), \ + "m" ((u).c33.re)); \ +__asm__ __volatile__ ("movss %0, %%xmm6 \n\t" \ + "movss %1, %%xmm7 \n\t" \ + "shufps $0xb1, %%xmm0, %%xmm0 \n\t" \ + "shufps $0xb1, %%xmm1, %%xmm1 \n\t" \ + "shufps $0xb1, %%xmm2, %%xmm2 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %9, %%xmm0 \n\t" \ + "mulps %9, %%xmm1 \n\t" \ + "mulps %9, %%xmm2 \n\t" \ + "mulps %%xmm0, %%xmm6 \n\t" \ + "mulps %%xmm1, %%xmm7 \n\t" \ + "addps %%xmm6, %%xmm3 \n\t" \ + "addps %%xmm7, %%xmm4 \n\t" \ + "movss %2, %%xmm6 \n\t" \ + "movss %3, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %%xmm2, %%xmm6 \n\t" \ + "mulps %%xmm0, %%xmm7 \n\t" \ + "addps %%xmm6, %%xmm5 \n\t" \ + "addps 
%%xmm7, %%xmm4 \n\t" \ + "movss %4, %%xmm6 \n\t" \ + "movss %5, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %%xmm1, %%xmm6 \n\t" \ + "mulps %%xmm0, %%xmm7 \n\t" \ + "addps %%xmm6, %%xmm3 \n\t" \ + "addps %%xmm7, %%xmm5 \n\t" \ + "movss %6, %%xmm0 \n\t" \ + "movss %7, %%xmm6 \n\t" \ + "movss %8, %%xmm7 \n\t" \ + "shufps $0x0, %%xmm0, %%xmm0 \n\t" \ + "shufps $0x0, %%xmm6, %%xmm6 \n\t" \ + "shufps $0x0, %%xmm7, %%xmm7 \n\t" \ + "mulps %%xmm2, %%xmm0 \n\t" \ + "mulps %%xmm1, %%xmm6 \n\t" \ + "mulps %%xmm2, %%xmm7 \n\t" \ + "addps %%xmm0, %%xmm3 \n\t" \ + "addps %%xmm6, %%xmm5 \n\t" \ + "addps %%xmm7, %%xmm4" \ + : \ + : \ + "m" ((u).c11.im), \ + "m" ((u).c22.im), \ + "m" ((u).c33.im), \ + "m" ((u).c12.im), \ + "m" ((u).c21.im), \ + "m" ((u).c13.im), \ + "m" ((u).c31.im), \ + "m" ((u).c23.im), \ + "m" ((u).c32.im), \ + "m" (_sse_sgn24)); + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/start.c b/qcd/part_cpu/applications/QCD/src/kernel_D/start.c new file mode 100644 index 0000000000000000000000000000000000000000..03bcde06ecc4751d6088558798e7feee4674f750 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/start.c @@ -0,0 +1,931 @@ +/*********************************************************************** + * + * Copyright (C) 2000 Martin Luescher + * 2002 Martin Hasenbusch, Ines Wetzorke + * 2003-2008 Carsten Urbach, Remi Baron + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * File start.c + * + * Collection of useful programs that mainly serve to initialize the fields + * + * The externally accessible functions are + * + * su3_vector random_su3_vector(void) + * Returns a uniformly distributed random SU(3) vector with norm 1 + * + * spinor random_spinor(void) + * Returns a random spinor with norm 1 + * + * M.Hasenbusch: + * void random_spinor_field(int k) + * Initializes the spinor field psi[k] to a Gaussian random field + * + * M.Hasenbusch: + * void zero_spinor_field(spinor * const k, const int V) + * Initializes the spinor field psi[k] to zero + * + * su3 random_su3(void) + * Returns a uniformly distributed random SU(3) matrix + * + * void unit_g_gauge_field(void) + * Sets the gauge field variables to unity + * + * void random_gauge_field(void) + * Initializes the gauge field to a random configuration + * + * Version: 1.0 + * Author: Martin Luescher + * Date: 24.10.2000 + * + * Added the function + * void source_spinor_field_point_from_file(spinor * const P, spinor * const Q, int is, int ic, int source_indx) + * which uses the new input parameter SourceLocation in the input parameter files + * to place the source at the desired point + * + * Author: Remi Baron April 2007 + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include "read_input.h" +#include "su3.h" +#include "su3adj.h" +#include "ranlxd.h" +#include "ranlxs.h" +#include "start.h" + +static void gauss_vector(double v[],int n) +{ + int k; + double r[2]; + double x1,x2,rho,y1,y2; + + + for (k=0;;k+=2) + { + ranlxd(r,2); + x1=r[0]; + x2=r[1]; + + rho = -log(1.0 - x1); + rho = sqrt(rho); + x2 *= 6.2831853071796; + y1 = rho * sin(x2); + y2 = rho * cos(x2); + + if (n > k) + v[k] = y1; + if (n > 
(k+1)) + v[k + 1] = y2; + if (n <= (k + 2)) + return; + } +} + +/* produce a double array of z2 noise of length N */ +static void z2_vector(double *v, const int N) { + ranlxd(v,N); + for (int i = 0; i < N; ++i) { + if(v[i] < 0.5) + v[i]=1/sqrt(2); + else + v[i]=-1/sqrt(2); + } + return; +} + +static su3 unit_su3(void) +{ + su3 u = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0}; + return u; +} + +su3_vector unit_su3_vector() +{ + su3_vector s = {1.0, 1.0, 1.0}; + return s; +} + +/* produce a su3 vector with components distributed according to rn_type and unit norm */ +static void random_su3_vector( su3_vector * const s, const enum RN_TYPE rn_type ) +{ + int i; + double v[6],norm,fact; + + void (*random_vector)(double*,int) = NULL; + + _rn_switch(rn_type,random_vector) + + while (1) + { + random_vector(v,6); + norm=0.0; + + for (i = 0; i < 6; ++i) + norm += v[i] * v[i]; + + norm = sqrt(norm); + + if (1.0 != (1.0 + norm)) + break; + } + + fact = 1.0 / norm; + s->c0 = fact * (v[0] + I * v[1]); + s->c1 = fact * (v[2] + I * v[3]); + s->c2 = fact * (v[4] + I * v[5]); + + return; +} + +static void random_spinor(spinor * const s, const enum RN_TYPE rn_type) { + random_su3_vector(&s->s0, rn_type); + random_su3_vector(&s->s1, rn_type); + random_su3_vector(&s->s2, rn_type); + random_su3_vector(&s->s3, rn_type); + + _vector_mul(s->s0, 0.5, s->s0); + _vector_mul(s->s1, 0.5, s->s1); + _vector_mul(s->s2, 0.5, s->s2); + _vector_mul(s->s3, 0.5, s->s3); + return; +} + +spinor unit_spinor() +{ + spinor s; + + s.s0 = unit_su3_vector(); + s.s1 = unit_su3_vector(); + s.s2 = unit_su3_vector(); + s.s3 = unit_su3_vector(); + + return(s); +} + +void unit_spinor_field(const int k) +{ + int i=0; + spinor *s; + + s = &g_spinor_field[k][0]; + for(i = 0; i < VOLUME/2; i++, s++) { + *s = unit_spinor(); + } +} + +/* Function provides a spinor field of length VOLUME with + distributions given by rn_type as defined in start.h */ +void random_spinor_field_lexic(spinor * const k, const int repro, const 
enum RN_TYPE rn_type) { + int x, y, z, t, X, Y, Z, tt, id=0; + + void (*random_vector)(double*,int) = NULL; + + _rn_switch(rn_type,random_vector) + +#ifdef MPI + int rlxd_state[105]; + int rlxd_state_backup[105]; +#endif + int coords[4]; + spinor *s; + double v[24]; + + if(repro) { +#ifdef MPI + if(g_proc_id != 0) { + rlxd_get(rlxd_state_backup); + } else if(g_proc_id == 0) { + rlxd_get(rlxd_state); + } + MPI_Bcast(rlxd_state, 105, MPI_INT, 0, MPI_COMM_WORLD); + if(g_proc_id != 0) { + rlxd_reset(rlxd_state); + } +#endif + for(t = 0; t < g_nproc_t*T; t++) { + tt = t - g_proc_coords[0]*T; + coords[0] = t / T; + for(x = 0; x < g_nproc_x*LX; x++) { + X = x - g_proc_coords[1]*LX; + coords[1] = x / LX; + for(y = 0; y < g_nproc_y*LY; y++) { + Y = y - g_proc_coords[2]*LY; + coords[2] = y / LY; + for(z = 0; z < g_nproc_z*LZ; z++) { + Z = z - g_proc_coords[3]*LZ; + coords[3] = z / LZ; +#ifdef MPI + MPI_Cart_rank(g_cart_grid, coords, &id); +#endif + if(g_cart_id == id) { + random_vector(v, 24); + s = k + g_ipt[tt][X][Y][Z]; + memcpy(s, v, 24*sizeof(double)); + } else { + ranlxd(v,24); + } + } + } + } + } +#ifdef MPI + if(g_proc_id != 0) { + rlxd_reset(rlxd_state_backup); + } +#endif + } + else { + for(x = 0; x < VOLUME; x++) { + random_vector(v, 24); + s = k + x; + memcpy(s, v, 24*sizeof(double)); + } + } + return; +} + +/* Function provides a spinor field of length VOLUME/2 for even odd preconditioning + with distributions given by rn_type as defined in start.h */ + +void random_spinor_field_eo(spinor * const k, const int repro, const enum RN_TYPE rn_type ) { + int x, X, y, Y, z, Z, t, t0, id = 0; + + void (*random_vector)(double*,int) = NULL; + + _rn_switch(rn_type,random_vector) + +#ifdef MPI + int rlxd_state[105]; + int rlxd_state_backup[105]; +#endif + int coords[4]; + spinor *s; + double v[24]; + + if(repro) { +#ifdef MPI + if(g_proc_id != 0) { + rlxd_get(rlxd_state_backup); + } else if(g_proc_id == 0) { + rlxd_get(rlxd_state); + } + MPI_Bcast(rlxd_state, 105, MPI_INT, 
0, MPI_COMM_WORLD); + if(g_proc_id != 0) { + rlxd_reset(rlxd_state); + } +#endif + for(t0 = 0; t0 < g_nproc_t*T; t0++) { + coords[0] = t0 / T; + t = t0 - T*g_proc_coords[0]; + for(x = 0; x < g_nproc_x*LX; x++) { + coords[1] = x / LX; + X = x - g_proc_coords[1]*LX; + for(y = 0; y < g_nproc_y*LY; y++) { + coords[2] = y / LY; + Y = y - g_proc_coords[2]*LY; + for(z = 0; z < g_nproc_z*LZ; z++) { + coords[3] = z / LZ; + Z = z - g_proc_coords[3]*LZ; +#ifdef MPI + MPI_Cart_rank(g_cart_grid, coords, &id); +#endif + if((t0+x+y+z)%2 == 0) { + random_vector(v, 24); + if(g_cart_id == id) { + s = k + g_lexic2eosub[ g_ipt[t][X][Y][Z] ]; + memcpy(s, v, 24*sizeof(double)); + } + } + } + } + } + } +#ifdef MPI + if(g_proc_id != 0) { + rlxd_reset(rlxd_state_backup); + } +#endif + } + else { + for (x = 0; x < VOLUME/2; x++) { + s = k + x; + random_vector(v, 24); + memcpy(s, v, 24*sizeof(double)); + } + } + return; +} + +/* Function provides a zero spinor field of length N */ +void zero_spinor_field(spinor * const k, const int N) +{ + memset(k, 0, sizeof(spinor) * N); +} + +/* Function provides a zero spinor field of length N */ +void zero_spinor_field_32(spinor32 * const k, const int N) +{ + memset(k, 0, sizeof(spinor32) * N); +} + + +/* Function provides a constant spinor field of length N */ +void constant_spinor_field(spinor * const k, const int p, const int N) +{ + int ix; + spinor *s; + double * tmp; + s = k; + for (ix = 0; ix < N; ix++) + { + memset(s, 0, sizeof(spinor)); + tmp = (double*) s; + tmp[2*p] = 1.; + s++; + } + return; +} + +/* a random su3 matrix. two unit norm su3 vectors with components drawn from a uniform ditribution + are used to construct an orthogonal third vector. 
The three vectors then make up the rows + of the matrix */ + +void random_su3_KD(su3 * const u) { + double norm,fact; + _Complex double z; + su3_vector z1,z2,z3; + + random_su3_vector(&z1,RN_UNIF); + for (;;) + { + random_su3_vector(&z2,RN_UNIF); + + z = conj(z1.c0) * z2.c0 + conj(z1.c1) * z2.c1 + conj(z1.c2) * z2.c2; + + _vector_project(z2,z,z1); + + norm=sqrt(_vector_norm_square(z2)); + + if (1.0 != (1.0 + norm)) + break; + } + + fact = 1.0 / norm; + _vector_mul(z2, fact, z2); + + z3.c0 = conj((z1.c1 * z2.c2) - (z1.c2 * z2.c1)); + z3.c1 = conj((z1.c2 * z2.c0) - (z1.c0 * z2.c2)); + z3.c2 = conj((z1.c0 * z2.c1) - (z1.c1 * z2.c0)); + + u->c00 = z1.c0; + u->c01 = z1.c1; + u->c02 = z1.c2; + + u->c10 = z2.c0; + u->c11 = z2.c1; + u->c12 = z2.c2; + + u->c20 = z3.c0; + u->c21 = z3.c1; + u->c22 = z3.c2; + return; +} + + +void unit_g_gauge_field(void) +{ + int ix,mu; + + for (ix=0;ixs0.c0 = c * (1 + I); + s->s0.c1 = c * (1 + I); + s->s0.c2 = c * (1 + I); + s->s1.c0 = c * (1 + I); + s->s1.c1 = c * (1 + I); + s->s1.c2 = c * (1 + I); + s->s2.c0 = c * (1 + I); + s->s2.c1 = c * (1 + I); + s->s2.c2 = c * (1 + I); + s->s3.c0 = c * (1 + I); + s->s3.c1 = c * (1 + I); + s->s3.c2 = c * (1 + I); +} + +void set_spinor_field(int k, const double c) +{ + int ix; + spinor *s; + for (ix=0;ixs0.c0 = c * (1 + I); + s->s0.c1 = c * (1 + I); + s->s0.c2 = c * (1 + I); + s->s1.c0 = c * (1 + I); + s->s1.c1 = c * (1 + I); + s->s1.c2 = c * (1 + I); + s->s2.c0 = c * (1 + I); + s->s2.c1 = c * (1 + I); + s->s2.c2 = c * (1 + I); + s->s3.c0 = c * (1 + I); + s->s3.c1 = c * (1 + I); + s->s3.c2 = c * (1 + I); + } + for (ix=VOLUME/2;ixs0.c0 = 0.; + s->s0.c1 = 0.; + s->s0.c2 = 0.; + s->s1.c0 = 0.; + s->s1.c1 = 0.; + s->s1.c2 = 0.; + s->s2.c0 = 0.; + s->s2.c1 = 0.; + s->s2.c2 = 0.; + s->s3.c0 = 0.; + s->s3.c1 = 0.; + s->s3.c2 = 0.; + } +} + +su3 set_su3(const double c) +{ + su3 u; + + u.c00 = c * (1 + I); + u.c01 = c * (1 + I); + u.c02 = c * (1 + I); + + u.c10 = c * (1 + I); + u.c11 = c * (1 + I); + u.c12 = c * 
(1 + I); + + u.c20 = c * (1 + I); + u.c21 = c * (1 + I); + u.c22 = c * (1 + I); + + return(u); +} + +void set_gauge_field(const double c) +{ + int ix,mu; + + for (ix=0;ixs0.c0 = 1.0; + else if (ic==1) s->s0.c1 = 1.0; + else if (ic==2) s->s0.c2 = 1.0; + } + else if (is==1){ + if (ic==0) s->s1.c0 = 1.0; + else if (ic==1) s->s1.c1 = 1.0; + else if (ic==2) s->s1.c2 = 1.0; + } + else if (is==2){ + if (ic==0) s->s2.c0 = 1.0; + else if (ic==1) s->s2.c1 = 1.0; + else if (ic==2) s->s2.c2 = 1.0; + } + else if (is==3){ + if (ic==0) s->s3.c0 = 1.0; + else if (ic==1) s->s3.c1 = 1.0; + else if (ic==2) s->s3.c2 = 1.0; + } + } +} + +void source_spinor_field_point_from_file(spinor * const P, spinor * const Q, int is, int ic, int source_indx) +{ + int tmp; + int source_coord[4],source_pe_coord[4],source_loc_coord[4]; + int source_pe_indx,source_loc_indx; + spinor * s; + + /* set fields to zero */ + zero_spinor_field(P,VOLUME/2); + zero_spinor_field(Q,VOLUME/2); + + /* Check if source_indx is valid */ + if((source_indx < 0) || (source_indx >= (g_nproc_t*g_nproc_x*g_nproc_y*g_nproc_z*T*LX*LY*LZ))) + { + printf("Error in the input parameter file, SourceLocation must be in [0,VOLUME-1]! Exiting...!\n"); + exit(1); + } + + /* translate it into global coordinate */ + /* For a T*L^3 lattice then L = g_nproc_z * LZ = g_nproc_y * LY = g_nproc_x * LX */ + source_coord[3]=source_indx % (g_nproc_z * LZ); + tmp = source_indx / (g_nproc_z * LZ); + source_coord[2]=tmp % (g_nproc_y * LY); + tmp = tmp / (g_nproc_y * LY); + source_coord[1]=tmp % (g_nproc_x * LX); + tmp = tmp / (g_nproc_x * LX); + source_coord[0]=tmp; + + if(3*is+ic == index_start && g_proc_id == g_stdio_proc) + printf("# The source site number is %i which corresponds to (t,x,y,z) = (%i,%i,%i,%i)\n",source_indx,source_coord[0],source_coord[1],source_coord[2],source_coord[3]); + + /* compute the coordinates and the index of the node*/ + /* be careful!!! 
nodes indices have different convention (see io.c)*/ + source_pe_coord[0] = source_coord[0]/T; + source_pe_coord[1] = source_coord[1]/LX; + source_pe_coord[2] = source_coord[2]/LY; + source_pe_coord[3] = source_coord[3]/LZ; + +#ifdef MPI + MPI_Cart_rank(g_cart_grid, source_pe_coord, &source_pe_indx); +#else + source_pe_indx=0; +#endif + + /* compute the local (inside the node) coordinates and index*/ + source_loc_coord[0] = source_coord[0] - source_pe_coord[0] * T; + source_loc_coord[1] = source_coord[1] - source_pe_coord[1] * LX; + source_loc_coord[2] = source_coord[2] - source_pe_coord[2] * LY; + source_loc_coord[3] = source_coord[3] - source_pe_coord[3] * LZ; + + source_loc_indx=g_ipt[source_loc_coord[0]][source_loc_coord[1]][source_loc_coord[2]][source_loc_coord[3]]; + + /* Essayer g_proc_id au lieu de g_cart_id */ + if(source_pe_indx == g_cart_id) + { + if(3*is + ic == index_start && g_debug_level > 1) + { + printf("g_cart_id =%i\n",g_cart_id); + printf("source_loc_coord[0] = %i\n",source_loc_coord[0]); + printf("source_loc_coord[1] = %i\n",source_loc_coord[1]); + printf("source_loc_coord[2] = %i\n",source_loc_coord[2]); + printf("source_loc_coord[3] = %i\n",source_loc_coord[3]); + printf("source_loc_indx = %i\n",source_loc_indx); + } + /* Check which spinor field (even or odd) needs to be initialized */ + if(g_lexic2eo[source_loc_indx] < VOLUME/2) + s = P + g_lexic2eo[source_loc_indx]; + else + s = Q + g_lexic2eosub[source_loc_indx]; + + /* put source to 1.0 */ + if (is==0){ + if (ic==0) s->s0.c0 = 1.0; + else if (ic==1) s->s0.c1 = 1.0; + else if (ic==2) s->s0.c2 = 1.0; + } + else if (is==1){ + if (ic==0) s->s1.c0 = 1.0; + else if (ic==1) s->s1.c1 = 1.0; + else if (ic==2) s->s1.c2 = 1.0; + } + else if (is==2){ + if (ic==0) s->s2.c0 = 1.0; + else if (ic==1) s->s2.c1 = 1.0; + else if (ic==2) s->s2.c2 = 1.0; + } + else if (is==3){ + if (ic==0) s->s3.c0 = 1.0; + else if (ic==1) s->s3.c1 = 1.0; + else if (ic==2) s->s3.c2 = 1.0; + } + } +} + +void 
start_ranlux_KD(int level, int seed) +{ + unsigned int max_seed,loc_seed; + unsigned int step = g_proc_coords[0]*g_nproc_x*g_nproc_y*g_nproc_z + + g_proc_coords[1]*g_nproc_y*g_nproc_z + + g_proc_coords[2]*g_nproc_z + g_proc_coords[3]; + + max_seed = 2147483647 / g_nproc; + loc_seed = (seed + step*max_seed) % 2147483647; + + if(loc_seed == 0) loc_seed++; + + #ifdef MPI + unsigned int * seeds = calloc(g_nproc,sizeof(unsigned int)); + if(seeds == NULL) fatal_error("Memory allocation for seeds buffer failed!","start_ranlux"); + MPI_Gather(&loc_seed,1,MPI_UNSIGNED,seeds,1,MPI_UNSIGNED,0,MPI_COMM_WORLD); + if(g_proc_id == 0) { + for(int i = 0; i < g_nproc; ++i) { + for(int j = i+1; j < g_nproc; ++j) { + if( seeds[i] == seeds[j] ) { + char error_message[100]; + snprintf(error_message,100,"Process %d and %d have the same seed. Aborting!",i,j); + fatal_error(error_message,"start_ranlux"); + } + } + } + } + free(seeds); + #endif + + if(g_debug_level > 3) { + printf("Local seed is %d proc_id = %d\n", loc_seed, g_proc_id); + } + + rlxs_init(level-1, loc_seed); + rlxd_init(level, loc_seed); +} + +void gen_test_spinor_field(spinor * const k, const int eoflag) { + + int ix,iy,effvol; + spinor *s; + double invind,invvol; + + if (eoflag==1) { + effvol=VOLUME/2; + }else{ + effvol=VOLUME; + } + + invvol=1/(VOLUME*100); + s = k; + + for(ix = 0; ix < effvol; ix++){ + if (eoflag==1) { + iy=g_eo2lexic[ix]; + }else{ + iy=ix; + } + + invind=(double)(((g_coord[iy][0]*g_nproc_x*LX + g_coord[iy][1])*g_nproc_y*LY + g_coord[iy][2])*g_nproc_z*LZ + g_coord[iy][3] + 1.0); + invind=1.0/invind; + s->s0.c0 = invind; + s->s0.c1 = invind+invvol; + s->s0.c2 = invind+invvol/2.0; + s->s1.c0 = invind+invvol/3.0; + s->s1.c1 = invind+invvol/4.0; + s->s1.c2 = invind+invvol/5.0; + s->s2.c0 = invind+invvol/6.0; + s->s2.c1 = invind+invvol/7.0; + s->s2.c2 = invind+invvol/8.0; + s->s3.c0 = invind+invvol/9.0; + s->s3.c1 = invind+invvol/10.0; + s->s3.c2 = invind+invvol/11.0; + s++; + } + +} + +void 
write_test_spinor_field(spinor * const k, const int eoflag, char * postfix) {
  /*
   * Writes one text line per site of the spinor field k to the file
   * "test_out.<g_proc_id printed with %.4d>.<postfix>".
   *
   * k:       field to dump; VOLUME/2 sites if eoflag==1, VOLUME otherwise
   * eoflag:  1 -> the site index for g_coord[] is taken from g_eo2lexic[];
   *          any other value -> the loop index is used directly
   * postfix: last part of the file name; NULL is treated as ""
   *
   * Each line holds g_coord[iy][0..3] and the real part of component s0.c0.
   * If the file name does not fit or the file cannot be opened, a message is
   * printed to stderr and nothing is written.
   */
  FILE * testout;
  char filenames[256];
  int ix, iy, effvol, len;

  /* Build the whole name in one bounded call; the former sprintf + strcat
     into a 50 byte buffer overflowed for long postfix strings. */
  len = snprintf(filenames, sizeof(filenames), "test_out.%.4d.%s",
                 g_proc_id, (postfix != NULL) ? postfix : "");
  if (len < 0 || (size_t)len >= sizeof(filenames)) {
    fprintf(stderr, "write_test_spinor_field: output file name too long, nothing written\n");
    return;
  }

  testout = fopen(filenames, "w");
  if (testout == NULL) {
    /* fopen failure was previously passed straight on to fprintf */
    fprintf(stderr, "write_test_spinor_field: cannot open %s for writing\n", filenames);
    return;
  }

  if (eoflag == 1) {
    effvol = VOLUME/2;
  }
  else {
    effvol = VOLUME;
  }

  for (ix = 0; ix < effvol; ix++) {
    if (eoflag == 1) {
      iy = g_eo2lexic[ix];
    }
    else {
      iy = ix;
    }
    fprintf(testout, "[%d,%d,%d,%d;0,0]:%e\n",
            g_coord[iy][0], g_coord[iy][1], g_coord[iy][2], g_coord[iy][3],
            creal((k[ix]).s0.c0));
  }

  /* buffered write errors only show up when the stream is closed */
  if (fclose(testout) != 0) {
    fprintf(stderr, "write_test_spinor_field: error while closing %s\n", filenames);
  }
}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/start.h b/qcd/part_cpu/applications/QCD/src/kernel_D/start.h new file mode 100644 index 0000000000000000000000000000000000000000..800a844fbae5ef0966523e29d80240caf44fbabc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/start.h @@ -0,0 +1,74 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _START_H +#define _START_H + + +/* functions requesting random numbers can request different distributions by calling _rn_switch + with the first argument set to a random number type as defined below and a function pointer + (see start.c for examples) + + The macro expands to a complete switch statement that assigns one of + z2_vector, ranlxd or gauss_vector to rn_fn_ptr; any value of type that is not + RN_Z2 or RN_UNIF selects gauss_vector. */ + +#define _rn_switch(type,rn_fn_ptr) \ + switch( type ) { \ + case RN_Z2: \ + rn_fn_ptr = z2_vector; \ + break; \ + case RN_UNIF: \ + rn_fn_ptr = ranlxd; \ + break; \ + case RN_GAUSS: \ + default: \ + rn_fn_ptr = gauss_vector; \ + break; \ + } \ + +/* RN_GAUSS: Gaussian distributed random numbers + RN_UNIF: random numbers drawn from a uniform distribution (this is a simple call to ranlxd!) + RN_Z2: z2 noise */ + +enum RN_TYPE { RN_GAUSS, RN_UNIF, RN_Z2 }; + +void unit_spinor_field(const int k); +void zero_spinor_field(spinor * const k, const int N); +void zero_spinor_field_32(spinor32 * const k, const int N); +void constant_spinor_field(spinor * const k, const int p, const int N); + +void random_spinor_field_lexic(spinor * const k, const int repro, const enum RN_TYPE rn_type); +void random_spinor_field_eo(spinor * const k, const int repro, const enum RN_TYPE rn_type); + +void unit_g_gauge_field(void); + +void random_gauge_field(const int repro, su3 ** const gf); + +double random_su3adj_field(const int repro, su3adj ** const momenta); + +void set_spinor_field(int k, const double c); +void set_gauge_field(const double c); +void set_spinor_point(spinor * s, const double c); +su3 set_su3(const double c); + +void source_spinor_field(spinor * const P, spinor * const Q, int is, int ic); +void source_spinor_field_point_from_file(spinor * const P, spinor * const Q, int is, int ic, int source_indx); + +void start_ranlux_KD(int level,int seed); + +void gen_test_spinor_field(spinor * const k , const int eoflag); +void write_test_spinor_field(spinor * const k , const int eoflag, char * postfix); +#endif +diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_D/su3.h b/qcd/part_cpu/applications/QCD/src/kernel_D/su3.h new file mode 100644 index 0000000000000000000000000000000000000000..4ab88d49903b0eb18ded1408f9775e81725ef448 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/su3.h @@ -0,0 +1,717 @@ +#ifndef _SU3_H +#define _SU3_H + +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * File su3.h + * + * Type definitions and macros for SU(3) matrices and spinors + * + * Version: 1.0 + * Author: Martin Luescher + * Date: 24.10.2000 + * + * Extended by Martin Hasenbusch 2001. 
+ * Rewritten for C99 complex by Albert Deuzeman 2012 + * + *******************************************************************************/ + +#include +#if (defined XLC && defined BGL) +# include "bgl.h" +#endif + +typedef struct +{ + _Complex double c00, c01, c02, c10, c11, c12, c20, c21, c22; +} su3; + +typedef struct +{ + _Complex float c00, c01, c02, c10, c11, c12, c20, c21, c22; +} su3_32; + +typedef struct +{ + _Complex double c0,c1,c2; +} su3_vector; + +typedef struct +{ + _Complex float c0,c1,c2; +} su3_vector32; + +typedef struct +{ + su3_vector s0,s1,s2,s3; +} spinor; + +typedef struct +{ + su3_vector32 s0,s1,s2,s3; +} spinor32; + +typedef struct +{ + su3_vector s0, s1; +} halfspinor; + +typedef struct +{ + su3_vector32 s0, s1; +} halfspinor32; + +typedef struct +{ + spinor sp_up,sp_dn; +} bispinor; + +typedef struct +{ + _Complex double s00,s01,s02,s03,s10,s11,s12,s13,s20,s21,s22,s23,s30,s31,s32,s33; +} spinor_matrix; + +typedef struct +{ + _Complex double sc0,sc1,sc2,sc3; +} complex_spinor; + + +/******************************************************************************* +* +* Macros for SU(3) vectors +* +* Arguments are variables of type su3_vector (or su3 in the case of the +* matrix times vector multiplication macros) +* +*******************************************************************************/ +/* M. Hasenbusch Mon Sep 24 +* r.c1=0 +* r.c2=0 +* r.c3=0 +*/ +#define _vector_null(r) \ + (r).c0 = 0.; \ + (r).c1 = 0.; \ + (r).c2 = 0.; + +/* M. Hasenbusch Mon Sep 24 +* r.c1=s.c1 +* r.c2=s.c2 +* r.c3=s.c3 +*/ + +/* The following should be taken care of by the compiler, actually... 
Just redefine as _vector_assign */ +#define _vector_assign32(r,s) _vector_assign(r,s) + +#define _vector_assign(r,s) \ + (r).c0 = (s).c0; \ + (r).c1 = (s).c1; \ + (r).c2 = (s).c2; + + +#define _spinor_assign(r,s) \ + _vector_assign((r).s0,(s).s0);\ + _vector_assign((r).s1,(s).s1);\ + _vector_assign((r).s2,(s).s2);\ + _vector_assign((r).s3,(s).s3); + +#define _vector_norm_square(r) \ + conj((r).c0) * (r).c0 + conj((r).c1) * (r).c1 + conj((r).c2) * (r).c2 + +#define _vector_minus_assign(r,s) \ + (r).c0 = -(s).c0; \ + (r).c1 = -(s).c1; \ + (r).c2 = -(s).c2; + +#define _vector_mul(r,c,s) \ + (r).c0 = (c) * (s).c0; \ + (r).c1 = (c) * (s).c1; \ + (r).c2 = (c) * (s).c2; + +#define _vector_add_mul(r,c,s) \ + (r).c0 += (c) * (s).c0; \ + (r).c1 += (c) * (s).c1; \ + (r).c2 += (c) * (s).c2; + + +/* r += I * c * s (c real) */ +#define _vector_add_i_mul(r,c,s) \ + (r).c0 += I*(c)*(s).c0; \ + (r).c1 += I*(c)*(s).c1; \ + (r).c2 += I*(c)*(s).c2; + +#if ((defined SSE2)||(defined SSE3)) + +#define _vector_add(r,s1,s2) \ +_sse_load(s1); \ +_sse_load_up(s2); \ +_sse_vector_add(); \ +_sse_store(r); + +#define _vector_sub(r,s1,s2) \ +_sse_load(s1); \ +_sse_load_up(s2); \ +_sse_vector_sub(); \ +_sse_store(r); + +#elif (defined XLC && defined BGLNOTCHECKED) + +#define _vector_add(r,s1,s2) \ + _bgl_load(s1); \ + _bgl_load_up(s2); \ + _bgl_vector_add(); \ + _bgl_store(r); + +#define _vector_sub(r,s1,s2) \ + _bgl_load(s1); \ + _bgl_load_up(s2); \ + _bgl_vector_sub(); \ + _bgl_store(r); + +#else + +#define _vector_add(r,s1,s2) \ + (r).c0 = (s1).c0 + (s2).c0; \ + (r).c1 = (s1).c1 + (s2).c1; \ + (r).c2 = (s1).c2 + (s2).c2; + +#define _vector_sub(r,s1,s2) \ + (r).c0 = (s1).c0 - (s2).c0; \ + (r).c1 = (s1).c1 - (s2).c1; \ + (r).c2 = (s1).c2 - (s2).c2; +#endif + + +#define _vector_i_add(r,s1,s2) \ + (r).c0 = (s1).c0 + I * (s2).c0; \ + (r).c1 = (s1).c1 + I * (s2).c1; \ + (r).c2 = (s1).c2 + I * (s2).c2; + +#define _vector_i_sub(r,s1,s2) \ + (r).c0 = (s1).c0 - I * (s2).c0; \ + (r).c1 = (s1).c1 - I * 
(s2).c1; \ + (r).c2 = (s1).c2 - I * (s2).c2; + +#define _vector_combined_add_i_add(r1, s1, r2, s2, s) \ + (r1).c0 = (s1).c0 + (s).c0; \ + (r2).c0 = (s2).c0 + I * (s).c0; \ + (r1).c1 = (s1).c1 + (s).c1; \ + (r2).c1 = (s2).c1 + I * (s).c1; \ + (r1).c2 = (s1).c2 + (s).c2; \ + (r2).c2 = (s2).c2 + I * (s).c2; \ + +#if ((defined SSE2) || (defined SSE3)) + +#define _vector_add_assign(r,s) \ +_sse_load(r); \ +_sse_load_up(s); \ +_sse_vector_add(); \ +_sse_store(r); + +#define _vector_sub_assign(r,s) \ +_sse_load(r); \ +_sse_load_up(s); \ +_sse_vector_sub(); \ +_sse_store(r); + +#else + +#define _vector_add_assign(r,s) \ + (r).c0 += (s).c0; \ + (r).c1 += (s).c1; \ + (r).c2 += (s).c2; + +#define _vector_sub_assign(r,s) \ + (r).c0 -= (s).c0; \ + (r).c1 -= (s).c1; \ + (r).c2 -= (s).c2; + +#endif + +#define _vector_i_add_assign(r,s) \ + (r).c0 += I * (s).c0; \ + (r).c1 += I * (s).c1; \ + (r).c2 += I * (s).c2; + + +#define _vector_i_sub_assign(r,s) \ + (r).c0 -= I * (s).c0; \ + (r).c1 -= I * (s).c1; \ + (r).c2 -= I * (s).c2; + +#define complex_times_vector(r,c,s) \ + (r).c0 = (c) * (s).c0; \ + (r).c1 = (c) * (s).c1; \ + (r).c2 = (c) * (s).c2; + +/* M.Hasenbusch */ +#define _complexcjg_times_vector(r,c,s) \ + (r).c0 = conj(c) * (s).c0; \ + (r).c1 = conj(c) * (s).c1; \ + (r).c2 = conj(c) * (s).c2; + +#define _vector_project(r,z,s) \ + (r).c0 -= z * (s).c0; \ + (r).c1 -= z * (s).c1; \ + (r).c2 -= z * (s).c2; + +#if ((defined SSE2) || (defined SSE3)) + +#define _su3_multiply(r,u,s) \ +_sse_load(s); \ +_sse_su3_multiply(u); \ +_sse_store_up(r); + +#define _su3_inverse_multiply(r,u,s) \ +_sse_load(s); \ +_sse_su3_inverse_multiply(u); \ +_sse_store_up(r); + +#elif (defined XLC && defined BGLNOTCHECKED) + +#define _su3_multiply(r,u,s) \ + _bgl_load(s); \ + _bgl_su3_multiply(u); \ + _bgl_store_up(r); + +#define _su3_inverse_multiply(r,u,s) \ + _bgl_load(s); \ + _bgl_su3_inverse_multiply(u); \ + _bgl_store_up(r); + +#else + +#define _su3_multiply(r,u,s) \ + (r).c0 = (u).c00 * (s).c0 + 
(u).c01 * (s).c1 + (u).c02 * (s).c2; \ + (r).c1 = (u).c10 * (s).c0 + (u).c11 * (s).c1 + (u).c12 * (s).c2; \ + (r).c2 = (u).c20 * (s).c0 + (u).c21 * (s).c1 + (u).c22 * (s).c2; +/* r = u^dagger * s: the entries of u are conjugated and read transposed. */ +#define _su3_inverse_multiply(r,u,s) \ +(r).c0 = conj((u).c00) * (s).c0 + conj((u).c10) * (s).c1 + conj((u).c20) * (s).c2; \ +(r).c1 = conj((u).c01) * (s).c0 + conj((u).c11) * (s).c1 + conj((u).c21) * (s).c2; \ +(r).c2 = conj((u).c02) * (s).c0 + conj((u).c12) * (s).c1 + conj((u).c22) * (s).c2; + +#endif + +/******************************************************************************* +* +* Macros for SU(3) matrices +* +* Arguments are variables of type su3 +* +*******************************************************************************/ +/* x = sum of u_ij * conj(u_ij) over all nine entries; rows are joined by an explicit '+'. */ +#define _su3_norm_sq(x,u) \ + x = (u).c00 * conj((u).c00) + (u).c01 * conj((u).c01) + (u).c02 * conj((u).c02) + \ + (u).c10 * conj((u).c10) + (u).c11 * conj((u).c11) + (u).c12 * conj((u).c12) + \ + (u).c20 * conj((u).c20) + (u).c21 * conj((u).c21) + (u).c22 * conj((u).c22); +/* u = identity matrix. */ +#define _su3_one(u) \ + (u).c00 = 1.; \ + (u).c01 = 0.; \ + (u).c02 = 0.; \ + (u).c10 = 0.; \ + (u).c11 = 1.; \ + (u).c12 = 0.; \ + (u).c20 = 0.; \ + (u).c21 = 0.; \ + (u).c22 = 1.; +/* u = zero matrix. */ +#define _su3_zero(u) \ + (u).c00 = 0.; \ + (u).c01 = 0.; \ + (u).c02 = 0.; \ + (u).c10 = 0.; \ + (u).c11 = 0.; \ + (u).c12 = 0.; \ + (u).c20 = 0.; \ + (u).c21 = 0.; \ + (u).c22 = 0.; +/* u = v, entry by entry. */ +#define _su3_assign(u,v) \ + (u).c00 = (v).c00; \ + (u).c01 = (v).c01; \ + (u).c02 = (v).c02; \ + (u).c10 = (v).c10; \ + (u).c11 = (v).c11; \ + (u).c12 = (v).c12; \ + (u).c20 = (v).c20; \ + (u).c21 = (v).c21; \ + (u).c22 = (v).c22; +/* u = -v, entry by entry. */ +#define _su3_minus_assign(u,v) \ + (u).c00 = -(v).c00; \ + (u).c01 = -(v).c01; \ + (u).c02 = -(v).c02; \ + (u).c10 = -(v).c10; \ + (u).c11 = -(v).c11; \ + (u).c12 = -(v).c12; \ + (u).c20 = -(v).c20; \ + (u).c21 = -(v).c21; \ + (u).c22 = -(v).c22; +/* u = conjugate transpose of v; u and v must be distinct objects. */ +#define _su3_dagger(u,v) \ + (u).c00 = conj((v).c00); \ + (u).c01 = conj((v).c10); \ + (u).c02 = conj((v).c20); \ + (u).c10 = conj((v).c01); 
\ + (u).c11 = conj((v).c11); \ + (u).c12 = conj((v).c21); \ + (u).c20 = conj((v).c02); \ + (u).c21 = conj((v).c12); \ + (u).c22 = conj((v).c22); + +#define _su3_transpose(u,v) \ + (u).c00 = ((v).c00); \ + (u).c01 = ((v).c10); \ + (u).c02 = ((v).c20); \ + (u).c10 = ((v).c01); \ + (u).c11 = ((v).c11); \ + (u).c12 = ((v).c21); \ + (u).c20 = ((v).c02); \ + (u).c21 = ((v).c12); \ + (u).c22 = ((v).c22); + +#define _itimes_su3(u,v) \ + (u).c00 = I * (v).c00; \ + (u).c01 = I * (v).c01; \ + (u).c02 = I * (v).c02; \ + (u).c10 = I * (v).c10; \ + (u).c11 = I * (v).c11; \ + (u).c12 = I * (v).c12; \ + (u).c20 = I * (v).c20; \ + (u).c21 = I * (v).c21; \ + (u).c22 = I * (v).c22; + +#define _real_times_su3(u,a,v) \ + (u).c00 = (a) * (v).c00; \ + (u).c01 = (a) * (v).c01; \ + (u).c02 = (a) * (v).c02; \ + (u).c10 = (a) * (v).c10; \ + (u).c11 = (a) * (v).c11; \ + (u).c12 = (a) * (v).c12; \ + (u).c20 = (a) * (v).c20; \ + (u).c21 = (a) * (v).c21; \ + (u).c22 = (a) * (v).c22; + +#define _real_times_su3_plus_real_times_su3(u, a, v, b, w) \ + (u).c00 = (a)*(v).c00 + (b)*(w).c00; \ + (u).c01 = (a)*(v).c01 + (b)*(w).c01; \ + (u).c02 = (a)*(v).c02 + (b)*(w).c02; \ + (u).c10 = (a)*(v).c10 + (b)*(w).c10; \ + (u).c11 = (a)*(v).c11 + (b)*(w).c11; \ + (u).c12 = (a)*(v).c12 + (b)*(w).c12; \ + (u).c20 = (a)*(v).c20 + (b)*(w).c20; \ + (u).c21 = (a)*(v).c21 + (b)*(w).c21; \ + (u).c22 = (a)*(v).c22 + (b)*(w).c22; + +#define _su3_minus_su3(u,v,w) \ + (u).c00 = (v).c00 - (w).c00; \ + (u).c01 = (v).c01 - (w).c01; \ + (u).c02 = (v).c02 - (w).c02; \ + (u).c10 = (v).c10 - (w).c10; \ + (u).c11 = (v).c11 - (w).c11; \ + (u).c12 = (v).c12 - (w).c12; \ + (u).c20 = (v).c20 - (w).c20; \ + (u).c21 = (v).c21 - (w).c21; \ + (u).c22 = (v).c22 - (w).c22; \ + +#define _itimes_su3_minus_su3(u,v,w) \ + (u).c00 = I * ((v).c00 - (w).c00); \ + (u).c01 = I * ((v).c01 - (w).c01); \ + (u).c02 = I * ((v).c02 - (w).c02); \ + (u).c10 = I * ((v).c10 - (w).c10); \ + (u).c11 = I * ((v).c11 - (w).c11); \ + (u).c12 = I * ((v).c12 - 
(w).c12); \ + (u).c20 = I * ((v).c20 - (w).c20); \ + (u).c21 = I * ((v).c21 - (w).c21); \ + (u).c22 = I * ((v).c22 - (w).c22); + +#define _su3_plus_su3(u,v,w) \ + (u).c00 = (v).c00 + (w).c00; \ + (u).c01 = (v).c01 + (w).c01; \ + (u).c02 = (v).c02 + (w).c02; \ + (u).c10 = (v).c10 + (w).c10; \ + (u).c11 = (v).c11 + (w).c11; \ + (u).c12 = (v).c12 + (w).c12; \ + (u).c20 = (v).c20 + (w).c20; \ + (u).c21 = (v).c21 + (w).c21; \ + (u).c22 = (v).c22 + (w).c22; + +#define _minus_su3_plus_su3(u,v,w) \ + (u).c00 = -((v).c00 + (w).c00); \ + (u).c01 = -((v).c01 + (w).c01); \ + (u).c02 = -((v).c02 + (w).c02); \ + (u).c10 = -((v).c10 + (w).c10); \ + (u).c11 = -((v).c11 + (w).c11); \ + (u).c12 = -((v).c12 + (w).c12); \ + (u).c20 = -((v).c20 + (w).c20); \ + (u).c21 = -((v).c21 + (w).c21); \ + (u).c22 = -((v).c22 + (w).c22); + +#define _itimes_su3_plus_su3(u,v,w) \ + (u).c00 = I * ((v).c00 + (w).c00); \ + (u).c01 = I * ((v).c01 + (w).c01); \ + (u).c02 = I * ((v).c02 + (w).c02); \ + (u).c10 = I * ((v).c10 + (w).c10); \ + (u).c11 = I * ((v).c11 + (w).c11); \ + (u).c12 = I * ((v).c12 + (w).c12); \ + (u).c20 = I * ((v).c20 + (w).c20); \ + (u).c21 = I * ((v).c21 + (w).c21); \ + (u).c22 = I * ((v).c22 + (w).c22); + +#define _minus_itimes_su3_plus_su3(u,v,w) \ + (u).c00 = -I * ((v).c00 + (w).c00); \ + (u).c01 = -I * ((v).c01 + (w).c01); \ + (u).c02 = -I * ((v).c02 + (w).c02); \ + (u).c10 = -I * ((v).c10 + (w).c10); \ + (u).c11 = -I * ((v).c11 + (w).c11); \ + (u).c12 = -I * ((v).c12 + (w).c12); \ + (u).c20 = -I * ((v).c20 + (w).c20); \ + (u).c21 = -I * ((v).c21 + (w).c21); \ + (u).c22 = -I * ((v).c22 + (w).c22); + +#define _complex_times_su3(r,c,s) \ + (r).c00 = (c) * (s).c00; \ + (r).c01 = (c) * (s).c01; \ + (r).c02 = (c) * (s).c02; \ + (r).c10 = (c) * (s).c10; \ + (r).c11 = (c) * (s).c11; \ + (r).c12 = (c) * (s).c12; \ + (r).c20 = (c) * (s).c20; \ + (r).c21 = (c) * (s).c21; \ + (r).c22 = (c) * (s).c22; + +#define _complexcjg_times_su3(r,c,s) \ + (r).c00 = conj(c) * (s).c00; \ + (r).c01 = 
conj(c) * (s).c01; \ + (r).c02 = conj(c) * (s).c02; \ + (r).c10 = conj(c) * (s).c10; \ + (r).c11 = conj(c) * (s).c11; \ + (r).c12 = conj(c) * (s).c12; \ + (r).c20 = conj(c) * (s).c20; \ + (r).c21 = conj(c) * (s).c21; \ + (r).c22 = conj(c) * (s).c22; + + +/* M. Hasenbusch +* su3_acc +*/ +#if ((defined SSE2) || (defined SSE3)) +#define _su3_acc(u,v) _sse_su3_acc(u,v) +#else +#define _su3_acc(u,v) \ + (u).c00 += (v).c00; \ + (u).c01 += (v).c01; \ + (u).c02 += (v).c02; \ + (u).c10 += (v).c10; \ + (u).c11 += (v).c11; \ + (u).c12 += (v).c12; \ + (u).c20 += (v).c20; \ + (u).c21 += (v).c21; \ + (u).c22 += (v).c22; +#endif + +/* +* su3_refac_acc +*/ +#define _su3_refac_acc(u,a,v) \ + (u).c00 += a * (v).c00; \ + (u).c01 += a * (v).c01; \ + (u).c02 += a * (v).c02; \ + (u).c10 += a * (v).c10; \ + (u).c11 += a * (v).c11; \ + (u).c12 += a * (v).c12; \ + (u).c20 += a * (v).c20; \ + (u).c21 += a * (v).c21; \ + (u).c22 += a * (v).c22; + +/* +* su3_imfac_acc +*/ +#define _su3_imfac_acc(u,a,v) \ + (u).c00 += I * a * (v).c00; \ + (u).c01 += I * a * (v).c01; \ + (u).c02 += I * a * (v).c02; \ + (u).c10 += I * a * (v).c10; \ + (u).c11 += I * a * (v).c11; \ + (u).c12 += I * a * (v).c12; \ + (u).c20 += I * a * (v).c20; \ + (u).c21 += I * a * (v).c21; \ + (u).c22 += I * a * (v).c22; + +#define _su3_square_norm(s, v) \ + s = conj(v.c00) * (v.c00) + conj(v.c01) * (v.c01) + conj(v.c02) * (v.c02) + \ + conj(v.c10) * (v.c10) + conj(v.c11) * (v.c11) + conj(v.c12) * (v.c12) + \ + conj(v.c20) * (v.c20) + conj(v.c21) * (v.c21) + conj(v.c22) * (v.c22); + +#if ((defined SSE2) || (defined SSE3)) + +#define _su3_times_su3(u,v,w) _sse_su3_times_su3(u,v,w) +#define _su3_times_su3_acc(u,v,w) _sse_su3_times_su3_acc(u,v,w) +#define _su3d_times_su3(u,v,w) _sse_su3d_times_su3(u,v,w) +#define _su3d_times_su3_acc(u,v,w) _sse_su3d_times_su3_acc(u,v,w) +#define _su3_times_su3d(u,v,w) _sse_su3_times_su3d(u,v,w) +#define _su3_times_su3d_acc(u,v,w) _sse_su3_times_su3d_acc(u,v,w) + +#else + +#define 
_su3_times_su3(u,v,w) \ + (u).c00 = (v).c00 * (w).c00 + (v).c01 * (w).c10 + (v).c02*(w).c20; \ + (u).c01 = (v).c00 * (w).c01 + (v).c01 * (w).c11 + (v).c02*(w).c21; \ + (u).c02 = (v).c00 * (w).c02 + (v).c01 * (w).c12 + (v).c02*(w).c22; \ + (u).c10 = (v).c10 * (w).c00 + (v).c11 * (w).c10 + (v).c12*(w).c20; \ + (u).c11 = (v).c10 * (w).c01 + (v).c11 * (w).c11 + (v).c12*(w).c21; \ + (u).c12 = (v).c10 * (w).c02 + (v).c11 * (w).c12 + (v).c12*(w).c22; \ + (u).c20 = (v).c20 * (w).c00 + (v).c21 * (w).c10 + (v).c22*(w).c20; \ + (u).c21 = (v).c20 * (w).c01 + (v).c21 * (w).c11 + (v).c22*(w).c21; \ + (u).c22 = (v).c20 * (w).c02 + (v).c21 * (w).c12 + (v).c22*(w).c22; \ + +#define _su3_times_su3_acc(u,v,w) \ + (u).c00 += (v).c00 * (w).c00 + (v).c01*(w).c10 + (v).c02*(w).c20; \ + (u).c01 += (v).c00 * (w).c01 + (v).c01*(w).c11 + (v).c02*(w).c21; \ + (u).c02 += (v).c00 * (w).c02 + (v).c01*(w).c12 + (v).c02*(w).c22; \ + (u).c10 += (v).c10 * (w).c00 + (v).c11*(w).c10 + (v).c12*(w).c20; \ + (u).c11 += (v).c10 * (w).c01 + (v).c11*(w).c11 + (v).c12*(w).c21; \ + (u).c12 += (v).c10 * (w).c02 + (v).c11*(w).c12 + (v).c12*(w).c22; \ + (u).c20 += (v).c20 * (w).c00 + (v).c21*(w).c10 + (v).c22*(w).c20; \ + (u).c21 += (v).c20 * (w).c01 + (v).c21*(w).c11 + (v).c22*(w).c21; \ + (u).c22 += (v).c20 * (w).c02 + (v).c21*(w).c12 + (v).c22*(w).c22; + +#define _su3_times_su3d(u,v,w) \ + (u).c00 = (v).c00 * conj((w).c00) + (v).c01 * conj((w).c01) + (v).c02 * conj((w).c02); \ + (u).c01 = (v).c00 * conj((w).c10) + (v).c01 * conj((w).c11) + (v).c02 * conj((w).c12); \ + (u).c02 = (v).c00 * conj((w).c20) + (v).c01 * conj((w).c21) + (v).c02 * conj((w).c22); \ + (u).c10 = (v).c10 * conj((w).c00) + (v).c11 * conj((w).c01) + (v).c12 * conj((w).c02); \ + (u).c11 = (v).c10 * conj((w).c10) + (v).c11 * conj((w).c11) + (v).c12 * conj((w).c12); \ + (u).c12 = (v).c10 * conj((w).c20) + (v).c11 * conj((w).c21) + (v).c12 * conj((w).c22); \ + (u).c20 = (v).c20 * conj((w).c00) + (v).c21 * conj((w).c01) + (v).c22 * 
conj((w).c02); \ + (u).c21 = (v).c20 * conj((w).c10) + (v).c21 * conj((w).c11) + (v).c22 * conj((w).c12); \ + (u).c22 = (v).c20 * conj((w).c20) + (v).c21 * conj((w).c21) + (v).c22 * conj((w).c22); + +#define _su3_times_su3d_acc(u,v,w) \ + (u).c00 += (v).c00 * conj((w).c00) + (v).c01 * conj((w).c01) + (v).c02 * conj((w).c02); \ + (u).c01 += (v).c00 * conj((w).c10) + (v).c01 * conj((w).c11) + (v).c02 * conj((w).c12); \ + (u).c02 += (v).c00 * conj((w).c20) + (v).c01 * conj((w).c21) + (v).c02 * conj((w).c22); \ + (u).c10 += (v).c10 * conj((w).c00) + (v).c11 * conj((w).c01) + (v).c12 * conj((w).c02); \ + (u).c11 += (v).c10 * conj((w).c10) + (v).c11 * conj((w).c11) + (v).c12 * conj((w).c12); \ + (u).c12 += (v).c10 * conj((w).c20) + (v).c11 * conj((w).c21) + (v).c12 * conj((w).c22); \ + (u).c20 += (v).c20 * conj((w).c00) + (v).c21 * conj((w).c01) + (v).c22 * conj((w).c02); \ + (u).c21 += (v).c20 * conj((w).c10) + (v).c21 * conj((w).c11) + (v).c22 * conj((w).c12); \ + (u).c22 += (v).c20 * conj((w).c20) + (v).c21 * conj((w).c21) + (v).c22 * conj((w).c22); + +#define _su3d_times_su3(u,v,w) \ + (u).c00 = conj((v).c00) * (w).c00 + conj((v).c10) * (w).c10 + conj((v).c20) * (w).c20; \ + (u).c01 = conj((v).c00) * (w).c01 + conj((v).c10) * (w).c11 + conj((v).c20) * (w).c21; \ + (u).c02 = conj((v).c00) * (w).c02 + conj((v).c10) * (w).c12 + conj((v).c20) * (w).c22; \ + (u).c10 = conj((v).c01) * (w).c00 + conj((v).c11) * (w).c10 + conj((v).c21) * (w).c20; \ + (u).c11 = conj((v).c01) * (w).c01 + conj((v).c11) * (w).c11 + conj((v).c21) * (w).c21; \ + (u).c12 = conj((v).c01) * (w).c02 + conj((v).c11) * (w).c12 + conj((v).c21) * (w).c22; \ + (u).c20 = conj((v).c02) * (w).c00 + conj((v).c12) * (w).c10 + conj((v).c22) * (w).c20; \ + (u).c21 = conj((v).c02) * (w).c01 + conj((v).c12) * (w).c11 + conj((v).c22) * (w).c21; \ + (u).c22 = conj((v).c02) * (w).c02 + conj((v).c12) * (w).c12 + conj((v).c22) * (w).c22; + +#define _su3d_times_su3_acc(u,v,w) \ + (u).c00 += conj((v).c00) * (w).c00 + 
conj((v).c10) * (w).c10 + conj((v).c20) * (w).c20; \ + (u).c01 += conj((v).c00) * (w).c01 + conj((v).c10) * (w).c11 + conj((v).c20) * (w).c21; \ + (u).c02 += conj((v).c00) * (w).c02 + conj((v).c10) * (w).c12 + conj((v).c20) * (w).c22; \ + (u).c10 += conj((v).c01) * (w).c00 + conj((v).c11) * (w).c10 + conj((v).c21) * (w).c20; \ + (u).c11 += conj((v).c01) * (w).c01 + conj((v).c11) * (w).c11 + conj((v).c21) * (w).c21; \ + (u).c12 += conj((v).c01) * (w).c02 + conj((v).c11) * (w).c12 + conj((v).c21) * (w).c22; \ + (u).c20 += conj((v).c02) * (w).c00 + conj((v).c12) * (w).c10 + conj((v).c22) * (w).c20; \ + (u).c21 += conj((v).c02) * (w).c01 + conj((v).c12) * (w).c11 + conj((v).c22) * (w).c21; \ + (u).c22 += conj((v).c02) * (w).c02 + conj((v).c12) * (w).c12 + conj((v).c22) * (w).c22; + +#endif + +#define _su3_minus_const_times_im_trace_su3(w,c,v) \ + (w).c00 -= I*c*(cimag((v).c00) + cimag((v).c11) + cimag((v).c22)); \ + (w).c11 -= I*c*(cimag((v).c00) + cimag((v).c11) + cimag((v).c22)); \ + (w).c22 -= I*c*(cimag((v).c00) + cimag((v).c11) + cimag((v).c22)); + +#define _trace_su3_times_su3d(x,v,w) \ + x = (v).c00 * conj((w).c00) \ + + (v).c01 * conj((w).c01) \ + + (v).c02 * conj((w).c02) \ + + (v).c10 * conj((w).c10) \ + + (v).c11 * conj((w).c11) \ + + (v).c12 * conj((w).c12) \ + + (v).c20 * conj((w).c20) \ + + (v).c21 * conj((w).c21) \ + + (v).c22 * conj((w).c22); + +#define _trace_su3_times_su3(x,v,w) \ + x = (v).c00 * (w).c00 \ + + (v).c01 * (w).c10 \ + + (v).c02 * (w).c20 \ + + (v).c10 * (w).c01 \ + + (v).c11 * (w).c11 \ + + (v).c12 * (w).c21 \ + + (v).c20 * (w).c02 \ + + (v).c21 * (w).c12 \ + + (v).c22 * (w).c22; + +#define _complex_times_vector(x, c, y) \ + x.c0 = (c) * (y).c0; \ + x.c1 = (c) * (y).c1; \ + x.c2 = (c) * (y).c2; + +#define _vector_tensor_vector(t,u,v) \ + (t).c00 = (u).c0 * conj((v).c0); \ + (t).c01 = (u).c0 * conj((v).c1); \ + (t).c02 = (u).c0 * conj((v).c2); \ + (t).c10 = (u).c1 * conj((v).c0); \ + (t).c11 = (u).c1 * conj((v).c1); \ + (t).c12 = (u).c1 * 
conj((v).c2); \ + (t).c20 = (u).c2 * conj((v).c0); \ + (t).c21 = (u).c2 * conj((v).c1); \ + (t).c22 = (u).c2 * conj((v).c2); + +#define _mvector_tensor_vector(t,u,v) \ + (t).c00 = -(u).c0 * conj((v).c0); \ + (t).c01 = -(u).c0 * conj((v).c1); \ + (t).c02 = -(u).c0 * conj((v).c2); \ + (t).c10 = -(u).c1 * conj((v).c0); \ + (t).c11 = -(u).c1 * conj((v).c1); \ + (t).c12 = -(u).c1 * conj((v).c2); \ + (t).c20 = -(u).c2 * conj((v).c0); \ + (t).c21 = -(u).c2 * conj((v).c1); \ + (t).c22 = -(u).c2 * conj((v).c2); + + +#define _vector_tensor_vector_add(t, u, v, w, z) \ + (t).c00 = (u).c0 * conj((v).c0) + (w).c0 * conj((z).c0) ; \ + (t).c01 = (u).c0 * conj((v).c1) + (w).c0 * conj((z).c1); \ + (t).c02 = (u).c0 * conj((v).c2) + (w).c0 * conj((z).c2); \ + (t).c10 = (u).c1 * conj((v).c0) + (w).c1 * conj((z).c0); \ + (t).c11 = (u).c1 * conj((v).c1) + (w).c1 * conj((z).c1); \ + (t).c12 = (u).c1 * conj((v).c2) + (w).c1 * conj((z).c2); \ + (t).c20 = (u).c2 * conj((v).c0) + (w).c2 * conj((z).c0); \ + (t).c21 = (u).c2 * conj((v).c1) + (w).c2 * conj((z).c1); \ + (t).c22 = (u).c2 * conj((v).c2) + (w).c2 * conj((z).c2); + +#define _su3_add_equals_complex_identity(u, c) \ + (u).c00 += (c); \ + (u).c11 += (c); \ + (u).c22 += (c); + +#endif + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/su3adj.h b/qcd/part_cpu/applications/QCD/src/kernel_D/su3adj.h new file mode 100644 index 0000000000000000000000000000000000000000..02880edc238a7182b159217d9c2c0da4c1d4f4d5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/su3adj.h @@ -0,0 +1,307 @@ +/*********************************************************************** + * + * Copyright (C) 2001 Martin Hasenbusch + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ +#ifndef _SU3ADJ_H +#define _SU3ADJ_H +/* Eight real coefficients d1..d8, one per Gell-Mann matrix. */ +typedef struct +{ + double d1, d2, d3, d4, d5, d6, d7, d8; +} su3adj; + +/******************************************************************************* +* +* Macros for su3adj +* +* Arguments are variables of type su3 +* +*******************************************************************************/ + +/* + * + * a = p_j * \lambda_j + * + * \lambda_j are the eight + * Gell-Mann matrices + * + */ + +#define _make_su3(v,p) \ +(v).c00 = 0.0 + (0.5773502691896258 * (p).d8 + (p).d3) * I; \ +(v).c01 = (p).d2 + (p).d1 * I; \ +(v).c02 = (p).d5 + (p).d4 * I; \ +(v).c10 = -(p).d2 + (p).d1 * I; \ +(v).c11 = 0.0 + (0.5773502691896258 * (p).d8 - (p).d3) * I; \ +(v).c12 = (p).d7 + (p).d6 * I; \ +(v).c20 = -(p).d5 + (p).d4 * I; \ +(v).c21 = -(p).d7 + (p).d6 * I; \ +(v).c22 = 0.0 - (1.154700538379252 * (p).d8) * I; + +/* + * + * r_j = Re(tr(a*\lambda_j)) + * + * 0.577350269189625 = 1/sqrt(3) + * + * \lambda_j are the eight + * Gell-Mann matrices + * + */ + +#define _trace_lambda(r,a) \ +(r).d1=-cimag((a).c10)-cimag((a).c01); \ +(r).d2=+creal((a).c10)-creal((a).c01); \ +(r).d3=-cimag((a).c00)+cimag((a).c11); \ +(r).d4=-cimag((a).c20)-cimag((a).c02); \ +(r).d5=+creal((a).c20)-creal((a).c02); \ +(r).d6=-cimag((a).c21)-cimag((a).c12); \ +(r).d7=+creal((a).c21)-creal((a).c12); \ +(r).d8=(-cimag((a).c00)-cimag((a).c11) + 2.0 * cimag((a).c22))*0.577350269189625; +/* r_j = c * (the component computed by _trace_lambda); overwrites r. c is parenthesised so an expression may be passed. */ +#define _trace_lambda_mul(r,c,a) \ +(r).d1=(c)*(-cimag((a).c10)-cimag((a).c01)); \ +(r).d2=(c)*(+creal((a).c10)-creal((a).c01)); \
+(r).d3=(c)*(-cimag((a).c00)+cimag((a).c11)); \ +(r).d4=(c)*(-cimag((a).c20)-cimag((a).c02)); \ +(r).d5=(c)*(+creal((a).c20)-creal((a).c02)); \ +(r).d6=(c)*(-cimag((a).c21)-cimag((a).c12)); \ +(r).d7=(c)*(+creal((a).c21)-creal((a).c12)); \ +(r).d8=(c)*((-cimag((a).c00)-cimag((a).c11) + 2.0 * cimag((a).c22))*0.577350269189625); +/* r_j += (the component computed by _trace_lambda). */ +#define _add_trace_lambda(r,a) \ +(r).d1+=-cimag((a).c10)-cimag((a).c01); \ +(r).d2+=+creal((a).c10)-creal((a).c01); \ +(r).d3+=-cimag((a).c00)+cimag((a).c11); \ +(r).d4+=-cimag((a).c20)-cimag((a).c02); \ +(r).d5+=+creal((a).c20)-creal((a).c02); \ +(r).d6+=-cimag((a).c21)-cimag((a).c12); \ +(r).d7+=+creal((a).c21)-creal((a).c12); \ +(r).d8+=(-cimag((a).c00)-cimag((a).c11) + 2.0*cimag((a).c22))*0.577350269189625; +/* r += a, component by component (both su3adj). */ +#define _add_su3adj(r,a) \ +(r).d1+=(a).d1; \ +(r).d2+=(a).d2; \ +(r).d3+=(a).d3; \ +(r).d4+=(a).d4; \ +(r).d5+=(a).d5; \ +(r).d6+=(a).d6; \ +(r).d7+=(a).d7; \ +(r).d8+=(a).d8; +/* r -= a, component by component (both su3adj). */ +#define _sub_su3adj(r,a) \ +(r).d1-=(a).d1; \ +(r).d2-=(a).d2; \ +(r).d3-=(a).d3; \ +(r).d4-=(a).d4; \ +(r).d5-=(a).d5; \ +(r).d6-=(a).d6; \ +(r).d7-=(a).d7; \ +(r).d8-=(a).d8; + + +#define _trace_lambda_add_assign(r,a) \ +(r).d1 += (-cimag((a).c10)-cimag((a).c01)); \ +(r).d2 += (+creal((a).c10)-creal((a).c01)); \ +(r).d3 += (-cimag((a).c00)+cimag((a).c11)); \ +(r).d4 += (-cimag((a).c20)-cimag((a).c02)); \ +(r).d5 += (+creal((a).c20)-creal((a).c02)); \ +(r).d6 += (-cimag((a).c21)-cimag((a).c12)); \ +(r).d7 += (+creal((a).c21)-creal((a).c12)); \ +(r).d8 += ((-cimag((a).c00)-cimag((a).c11) + 2.0 * cimag((a).c22))*0.577350269189625); + +#define _trace_lambda_sub_assign(r,a) \ +(r).d1 -= (-cimag((a).c10)-cimag((a).c01)); \ +(r).d2 -= (+creal((a).c10)-creal((a).c01)); \ +(r).d3 -= (-cimag((a).c00)+cimag((a).c11)); \ +(r).d4 -= (-cimag((a).c20)-cimag((a).c02)); \ +(r).d5 -= (+creal((a).c20)-creal((a).c02)); \ +(r).d6 -= (-cimag((a).c21)-cimag((a).c12)); \ +(r).d7 -= (+creal((a).c21)-creal((a).c12)); \ +(r).d8 -= ((-cimag((a).c00)-cimag((a).c11) + 2.0 * 
cimag(a.c22))*0.577350269189625); + +#if ( defined OMP ) + +#define _trace_lambda_mul_add_assign_nonlocal(r,c,a) \ +_Pragma("omp atomic") \ +(r).d1 += c*(-cimag((a).c10)-cimag((a).c01)); \ +_Pragma("omp atomic") \ +(r).d2 += c*(+creal((a).c10)-creal((a).c01)); \ +_Pragma("omp atomic") \ +(r).d3 += c*(-cimag((a).c00)+cimag((a).c11)); \ +_Pragma("omp atomic") \ +(r).d4 += c*(-cimag((a).c20)-cimag((a).c02)); \ +_Pragma("omp atomic") \ +(r).d5 += c*(+creal((a).c20)-creal((a).c02)); \ +_Pragma("omp atomic") \ +(r).d6 += c*(-cimag((a).c21)-cimag((a).c12)); \ +_Pragma("omp atomic") \ +(r).d7 += c*(+creal((a).c21)-creal((a).c12)); \ +_Pragma("omp atomic") \ +(r).d8 += c*((-cimag((a).c00)-cimag((a).c11) + 2.0 * cimag(a.c22))*0.577350269189625); + +#else + +#define _trace_lambda_mul_add_assign_nonlocal(r,c,a) _trace_lambda_mul_add_assign(r,c,a) + +#endif + +#define _trace_lambda_mul_add_assign(r,c,a) \ +(r).d1 += c*(-cimag((a).c10)-cimag((a).c01)); \ +(r).d2 += c*(+creal((a).c10)-creal((a).c01)); \ +(r).d3 += c*(-cimag((a).c00)+cimag((a).c11)); \ +(r).d4 += c*(-cimag((a).c20)-cimag((a).c02)); \ +(r).d5 += c*(+creal((a).c20)-creal((a).c02)); \ +(r).d6 += c*(-cimag((a).c21)-cimag((a).c12)); \ +(r).d7 += c*(+creal((a).c21)-creal((a).c12)); \ +(r).d8 += c*((-cimag((a).c00)-cimag((a).c11) + 2.0 * cimag(a.c22))*0.577350269189625); + + +/************************************************* + * + * Square norm of su3adj + * + * \|X\|^2 = -2tr{X^2} + * + *************************************************/ + +#define _su3adj_square_norm(r) \ +(r).d1*(r).d1 + \ +(r).d2*(r).d2 + \ +(r).d3*(r).d3 + \ +(r).d4*(r).d4 + \ +(r).d5*(r).d5 + \ +(r).d6*(r).d6 + \ +(r).d7*(r).d7 + \ +(r).d8*(r).d8; + +#define _zero_su3adj(r) \ +(r).d1=0.; \ +(r).d2=0.; \ +(r).d3=0.; \ +(r).d4=0.; \ +(r).d5=0.; \ +(r).d6=0.; \ +(r).d7=0.; \ +(r).d8=0.; + +#if defined SSE2 +#define _su3adj_assign_const_times_su3adj(res,c,in) \ +__asm__ __volatile__ ("movsd %0, %%xmm0 \n\t" \ + "unpcklpd %%xmm0, %%xmm0 \n\t" \ + 
"movapd %%xmm0, %%xmm1 \n\t" \ + "movapd %%xmm0, %%xmm2 \n\t" \ + "movapd %%xmm0, %%xmm3" \ + : \ + : \ + "m" (c)); \ +__asm__ __volatile__ ("movapd %0, %%xmm4 \n\t" \ + "movapd %1, %%xmm5 \n\t" \ + "movapd %2, %%xmm6 \n\t" \ + "movapd %3, %%xmm7 \n\t" \ + "mulpd %%xmm4, %%xmm0 \n\t" \ + "mulpd %%xmm5, %%xmm1 \n\t" \ + "mulpd %%xmm6, %%xmm2 \n\t" \ + "mulpd %%xmm7, %%xmm3" \ + : \ + : \ + "m" ((in).d1), \ + "m" ((in).d3), \ + "m" ((in).d5), \ + "m" ((in).d7)); \ +__asm__ __volatile__ ("movapd %%xmm0, %0 \n\t" \ + "movapd %%xmm1, %1 \n\t" \ + "movapd %%xmm2, %2 \n\t" \ + "movapd %%xmm3, %3" \ + : \ + "=m" ((res).d1), \ + "=m" ((res).d3), \ + "=m" ((res).d5), \ + "=m" ((res).d7)) +#else +#define _su3adj_assign_const_times_su3adj(res,c,in) \ +(res).d1=(c)*(in).d1; \ +(res).d2=(c)*(in).d2; \ +(res).d3=(c)*(in).d3; \ +(res).d4=(c)*(in).d4; \ +(res).d5=(c)*(in).d5; \ +(res).d6=(c)*(in).d6; \ +(res).d7=(c)*(in).d7; \ +(res).d8=(c)*(in).d8; +#endif + +#if defined SSE2 +#define _su3adj_minus_const_times_su3adj(res,c,in) \ +__asm__ __volatile__ ("movsd %0, %%xmm0 \n\t" \ + "unpcklpd %%xmm0, %%xmm0 \n\t" \ + "movapd %%xmm0, %%xmm1 \n\t" \ + "movapd %%xmm0, %%xmm2 \n\t" \ + "movapd %%xmm0, %%xmm3" \ + : \ + : \ + "m" (c)); \ +__asm__ __volatile__ ("movapd %0, %%xmm4 \n\t" \ + "movapd %1, %%xmm5 \n\t" \ + "movapd %2, %%xmm6 \n\t" \ + "movapd %3, %%xmm7 \n\t" \ + "mulpd %%xmm4, %%xmm0 \n\t" \ + "mulpd %%xmm5, %%xmm1 \n\t" \ + "mulpd %%xmm6, %%xmm2 \n\t" \ + "mulpd %%xmm7, %%xmm3" \ + : \ + : \ + "m" ((in).d1), \ + "m" ((in).d3), \ + "m" ((in).d5), \ + "m" ((in).d7)); \ +__asm__ __volatile__ ("movapd %0, %%xmm4 \n\t" \ + "movapd %1, %%xmm5 \n\t" \ + "movapd %2, %%xmm6 \n\t" \ + "movapd %3, %%xmm7 \n\t" \ + "subpd %%xmm0, %%xmm4 \n\t" \ + "subpd %%xmm1, %%xmm5 \n\t" \ + "subpd %%xmm2, %%xmm6 \n\t" \ + "subpd %%xmm3, %%xmm7" \ + : \ + : \ + "m" ((res).d1), \ + "m" ((res).d3), \ + "m" ((res).d5), \ + "m" ((res).d7)); \ +__asm__ __volatile__ ("movapd %%xmm4, %0 \n\t" \ + "movapd 
%%xmm5, %1 \n\t" \ + "movapd %%xmm6, %2 \n\t" \ + "movapd %%xmm7, %3" \ + : \ + "=m" ((res).d1), \ + "=m" ((res).d3), \ + "=m" ((res).d5), \ + "=m" ((res).d7)) +#else +#define _su3adj_minus_const_times_su3adj(res,c,in) \ +(res).d1-=(c)*(in).d1; \ +(res).d2-=(c)*(in).d2; \ +(res).d3-=(c)*(in).d3; \ +(res).d4-=(c)*(in).d4; \ +(res).d5-=(c)*(in).d5; \ +(res).d6-=(c)*(in).d6; \ +(res).d7-=(c)*(in).d7; \ +(res).d8-=(c)*(in).d8; +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/su3spinor.h b/qcd/part_cpu/applications/QCD/src/kernel_D/su3spinor.h new file mode 100644 index 0000000000000000000000000000000000000000..6fd5085f3ab27452133e17f987ec5e1121ddd93b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/su3spinor.h @@ -0,0 +1,469 @@ +/* ********************************************************************** + * + * Copyright (C) 2003 Ines Wetzorke + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+********************************************************************** */ +#ifndef _SU3SPINOR_H +#define _SU3SPINOR_H + +/* ****************************************************************************** + * + * Macros for SU(3) spinors + * + * Arguments are variables of type spinor, + * gamma matrices in the chiral representation + * + * _spinor_null(r) + * _spinor_prod_re(r,s) + * _spinor_mul_complex(r,c,s) + * _gamma0(r,s) + * _gamma1(r,s) + * _gamma2(r,s) + * _gamma3(r,s) + * _gamma5(r,s) + * _gamma50(r,s) + * _gamma51(r,s) + * _gamma52(r,s) + * _gamma53(r,s) + * + * +****************************************************************************** */ + +#include "su3.h" + +/* + * r.s0 = 0 + * r.s1 = 0 for each color index + * r.s2 = 0 + * r.s3 = 0 + */ + +#define _spinor_null(r) \ + (r).s0.c0 = 0.0; \ + (r).s0.c1 = 0.0; \ + (r).s0.c2 = 0.0; \ + (r).s1.c0 = 0.0; \ + (r).s1.c1 = 0.0; \ + (r).s1.c2 = 0.0; \ + (r).s2.c0 = 0.0; \ + (r).s2.c1 = 0.0; \ + (r).s2.c2 = 0.0; \ + (r).s3.c0 = 0.0; \ + (r).s3.c1 = 0.0; \ + (r).s3.c2 = 0.0; + + +/* + * Real part of the scalar product (r,s) + */ + +#define _spinor_prod_re(r,s) \ + creal((r).s0.c0) * creal((s).s0.c0) + cimag((r).s0.c0) * cimag((s).s0.c0) + \ + creal((r).s0.c1) * creal((s).s0.c1) + cimag((r).s0.c1) * cimag((s).s0.c1) + \ + creal((r).s0.c2) * creal((s).s0.c2) + cimag((r).s0.c2) * cimag((s).s0.c2) + \ + creal((r).s1.c0) * creal((s).s1.c0) + cimag((r).s1.c0) * cimag((s).s1.c0) + \ + creal((r).s1.c1) * creal((s).s1.c1) + cimag((r).s1.c1) * cimag((s).s1.c1) + \ + creal((r).s1.c2) * creal((s).s1.c2) + cimag((r).s1.c2) * cimag((s).s1.c2) + \ + creal((r).s2.c0) * creal((s).s2.c0) + cimag((r).s2.c0) * cimag((s).s2.c0) + \ + creal((r).s2.c1) * creal((s).s2.c1) + cimag((r).s2.c1) * cimag((s).s2.c1) + \ + creal((r).s2.c2) * creal((s).s2.c2) + cimag((r).s2.c2) * cimag((s).s2.c2) + \ + creal((r).s3.c0) * creal((s).s3.c0) + cimag((r).s3.c0) * cimag((s).s3.c0) + \ + creal((r).s3.c1) * creal((s).s3.c1) + cimag((r).s3.c1) * 
cimag((s).s3.c1) + \ + creal((r).s3.c2) * creal((s).s3.c2) + cimag((r).s3.c2) * cimag((s).s3.c2) + +/* + * Imaginary part of the scalar product (r,s) + */ + +#define _spinor_prod_im(r,s) \ + -creal((r).s0.c0) * cimag((s).s0.c0) + cimag((r).s0.c0) * creal((s).s0.c0) - \ + creal((r).s0.c1) * cimag((s).s0.c1) + cimag((r).s0.c1) * creal((s).s0.c1) - \ + creal((r).s0.c2) * cimag((s).s0.c2) + cimag((r).s0.c2) * creal((s).s0.c2) - \ + creal((r).s1.c0) * cimag((s).s1.c0) + cimag((r).s1.c0) * creal((s).s1.c0) - \ + creal((r).s1.c1) * cimag((s).s1.c1) + cimag((r).s1.c1) * creal((s).s1.c1) - \ + creal((r).s1.c2) * cimag((s).s1.c2) + cimag((r).s1.c2) * creal((s).s1.c2) - \ + creal((r).s2.c0) * cimag((s).s2.c0) + cimag((r).s2.c0) * creal((s).s2.c0) - \ + creal((r).s2.c1) * cimag((s).s2.c1) + cimag((r).s2.c1) * creal((s).s2.c1) - \ + creal((r).s2.c2) * cimag((s).s2.c2) + cimag((r).s2.c2) * creal((s).s2.c2) - \ + creal((r).s3.c0) * cimag((s).s3.c0) + cimag((r).s3.c0) * creal((s).s3.c0) - \ + creal((r).s3.c1) * cimag((s).s3.c1) + cimag((r).s3.c1) * creal((s).s3.c1) - \ + creal((r).s3.c2) * cimag((s).s3.c2) + cimag((r).s3.c2) * creal((s).s3.c2) + + +/* + * r is the product of s with the complex number c + * + * Stefano Capitani , June 2003 + */ + +#define _spinor_mul_complex(r,c,s) \ + (r).s0.c0 = c * (s).s0.c0; \ + (r).s0.c1 = c * (s).s0.c1; \ + (r).s0.c2 = c * (s).s0.c2; \ + (r).s1.c0 = c * (s).s1.c0; \ + (r).s1.c1 = c * (s).s1.c1; \ + (r).s1.c2 = c * (s).s1.c2; \ + (r).s2.c0 = c * (s).s2.c0; \ + (r).s2.c1 = c * (s).s2.c1; \ + (r).s2.c2 = c * (s).s2.c2; \ + (r).s3.c0 = c * (s).s3.c0; \ + (r).s3.c1 = c * (s).s3.c1; \ + (r).s3.c2 = c * (s).s3.c2; + +/* square norm of spinor s */ + +#define _spinor_norm_sq(d,s) \ + d = creal((s).s0.c0 * conj((s).s0.c0)) + creal((s).s0.c1 * conj((s).s0.c1)) + \ + creal((s).s0.c2 * conj((s).s0.c2)) + creal((s).s1.c0 * conj((s).s1.c0)) + \ + creal((s).s1.c1 * conj((s).s1.c1)) + creal((s).s1.c2 * conj((s).s1.c2)) + \ + creal((s).s2.c0 * conj((s).s2.c0)) 
+ creal((s).s2.c1 * conj((s).s2.c1)) + \ + creal((s).s2.c2 * conj((s).s2.c2)) + creal((s).s3.c0 * conj((s).s3.c0)) + \ + creal((s).s3.c1 * conj((s).s3.c1)) + creal((s).s3.c2 * conj((s).s3.c2)) + + +/* gamma 0 + * (r.s0) ( 0 0 + 1 0 ) (s.s0) + * (r.s1) = ( 0 0 0 + 1 ) * (s.s1) + * (r.s2) ( + 1 0 0 0 ) (s.s2) + * (r.s3) ( 0 + 1 0 0 ) (s.s3) + */ + +#define _gamma0(r,s) \ + (r).s0.c0 = (s).s2.c0; \ + (r).s0.c1 = (s).s2.c1; \ + (r).s0.c2 = (s).s2.c2; \ + (r).s1.c0 = (s).s3.c0; \ + (r).s1.c1 = (s).s3.c1; \ + (r).s1.c2 = (s).s3.c2; \ + (r).s2.c0 = (s).s0.c0; \ + (r).s2.c1 = (s).s0.c1; \ + (r).s2.c2 = (s).s0.c2; \ + (r).s3.c0 = (s).s1.c0; \ + (r).s3.c1 = (s).s1.c1; \ + (r).s3.c2 = (s).s1.c2; + +/* gamma 1 + * (r.s0) ( 0 0 0 + i ) (s.s0) + * (r.s1) = ( 0 0 + i 0 ) * (s.s1) + * (r.s2) ( 0 -i 0 0 ) (s.s2) + * (r.s3) ( -i 0 0 0 ) (s.s3) + */ + +#define _gamma1(r,s) \ + (r).s0.c0 = I * (s).s3.c0; \ + (r).s0.c1 = I * (s).s3.c1; \ + (r).s0.c2 = I * (s).s3.c2; \ + (r).s1.c0 = I * (s).s2.c0; \ + (r).s1.c1 = I * (s).s2.c1; \ + (r).s1.c2 = I * (s).s2.c2; \ + (r).s2.c0 = -I * (s).s1.c0; \ + (r).s2.c1 = -I * (s).s1.c1; \ + (r).s2.c2 = -I * (s).s1.c2; \ + (r).s3.c0 = -I * (s).s0.c0; \ + (r).s3.c1 = -I * (s).s0.c1; \ + (r).s3.c2 = -I * (s).s0.c2; + + +/* gamma 2 + * (r.s0) ( 0 0 0 + 1 ) (s.s0) + * (r.s1) = ( 0 0 -1 0 ) * (s.s1) + * (r.s2) ( 0 -1 0 0 ) (s.s2) + * (r.s3) ( + 1 0 0 0 ) (s.s3) + */ + +#define _gamma2(r,s) \ + (r).s0.c0 = (s).s3.c0; \ + (r).s0.c1 = (s).s3.c1; \ + (r).s0.c2 = (s).s3.c2; \ + (r).s1.c0 = -(s).s2.c0; \ + (r).s1.c1 = -(s).s2.c1; \ + (r).s1.c2 = -(s).s2.c2; \ + (r).s2.c0 = -(s).s1.c0; \ + (r).s2.c1 = -(s).s1.c1; \ + (r).s2.c2 = -(s).s1.c2; \ + (r).s3.c0 = (s).s0.c0; \ + (r).s3.c1 = (s).s0.c1; \ + (r).s3.c2 = (s).s0.c2; + + +/* gamma 3 + * (r.s0) ( 0 0 + i 0 ) (s.s0) + * (r.s1) = ( 0 0 0 -i ) * (s.s1) + * (r.s2) ( -i 0 0 0 ) (s.s2) + * (r.s3) ( 0 + i 0 0 ) (s.s3) + */ + +#define _gamma3(r,s) \ + (r).s0.c0 = I * (s).s2.c0; \ + (r).s0.c1 = I * (s).s2.c1; \ + 
(r).s0.c2 = I * (s).s2.c2; \ + (r).s1.c0 = -I * (s).s3.c0; \ + (r).s1.c1 = -I * (s).s3.c1; \ + (r).s1.c2 = -I * (s).s3.c2; \ + (r).s2.c0 = -I * (s).s0.c0; \ + (r).s2.c1 = -I * (s).s0.c1; \ + (r).s2.c2 = -I * (s).s0.c2; \ + (r).s3.c0 = I * (s).s1.c0; \ + (r).s3.c1 = I * (s).s1.c1; \ + (r).s3.c2 = I * (s).s1.c2; + + +/* gamma 5 + * (r.s0) ( + 1 0 0 0 ) (s.s0) + * (r.s1) = ( 0 + 1 0 0 ) * (s.s1) + * (r.s2) ( 0 0 -1 0 ) (s.s2) + * (r.s3) ( 0 0 0 -1 ) (s.s3) + */ + +#define _gamma5(r,s) \ + (r).s0.c0 = (s).s0.c0; \ + (r).s0.c1 = (s).s0.c1; \ + (r).s0.c2 = (s).s0.c2; \ + (r).s1.c0 = (s).s1.c0; \ + (r).s1.c1 = (s).s1.c1; \ + (r).s1.c2 = (s).s1.c2; \ + (r).s2.c0 = -(s).s2.c0; \ + (r).s2.c1 = -(s).s2.c1; \ + (r).s2.c2 = -(s).s2.c2; \ + (r).s3.c0 = -(s).s3.c0; \ + (r).s3.c1 = -(s).s3.c1; \ + (r).s3.c2 = -(s).s3.c2; + + +/* P_plus + * (r.s0) ( + 1 0 0 0 ) (s.s0) + * (r.s1) = ( 0 + 1 0 0 ) * (s.s1) + * (r.s2) ( 0 0 0 0 ) (s.s2) + * (r.s3) ( 0 0 0 0 ) (s.s3) + */ + +#define _P_plus(r,s) \ + (r).s0.c0 = (s).s0.c0; \ + (r).s0.c1 = (s).s0.c1; \ + (r).s0.c2 = (s).s0.c2; \ + (r).s1.c0 = (s).s1.c0; \ + (r).s1.c1 = (s).s1.c1; \ + (r).s1.c2 = (s).s1.c2; \ + (r).s2.c0 = 0.; \ + (r).s2.c1 = 0.; \ + (r).s2.c2 = 0.; \ + (r).s3.c0 = 0.; \ + (r).s3.c1 = 0.; \ + (r).s3.c2 = 0.; + + +/* gamma 5 + ID + * (r.s0) ( + 2 0 0 0 ) (s.s0) + * (r.s1) = ( 0 + 2 0 0 ) * (s.s1) + * (r.s2) ( 0 0 0 0 ) (s.s2) + * (r.s3) ( 0 0 0 0 ) (s.s3) + */ + +#define _gamma5_plus_id(r,s) \ + (r).s0.c0 = 2. * (s).s0.c0; \ + (r).s0.c1 = 2. * (s).s0.c1; \ + (r).s0.c2 = 2. * (s).s0.c2; \ + (r).s1.c0 = 2. * (s).s1.c0; \ + (r).s1.c1 = 2. * (s).s1.c1; \ + (r).s1.c2 = 2. 
* (s).s1.c2; \ + (r).s2.c0 = 0.; \ + (r).s2.c1 = 0.; \ + (r).s2.c2 = 0.; \ + (r).s3.c0 = 0.; \ + (r).s3.c1 = 0.; \ + (r).s3.c2 = 0.; + +/* P_minus + * (r.s0) ( 0 0 0 0 ) (s.s0) + * (r.s1) = ( 0 0 0 0 ) * (s.s1) + * (r.s2) ( 0 0 1 0 ) (s.s2) + * (r.s3) ( 0 0 0 1 ) (s.s3) + */ + +#define _P_minus(r,s) \ + (r).s0.c0 = 0.; \ + (r).s0.c1 = 0.; \ + (r).s0.c2 = 0.; \ + (r).s1.c0 = 0.; \ + (r).s1.c1 = 0.; \ + (r).s1.c2 = 0.; \ + (r).s2.c0 = (s).s2.c0; \ + (r).s2.c1 = (s).s2.c1; \ + (r).s2.c2 = (s).s2.c2; \ + (r).s3.c0 = (s).s3.c0; \ + (r).s3.c1 = (s).s3.c1; \ + (r).s3.c2 = (s).s3.c2; + + +/* gamma 5 - ID + * (r.s0) ( 0 0 0 0 ) (s.s0) + * (r.s1) = ( 0 0 0 0 ) * (s.s1) + * (r.s2) ( 0 0 -2 0 ) (s.s2) + * (r.s3) ( 0 0 0 -2 ) (s.s3) + */ + +#define _gamma5_minus_id(r,s) \ + (r).s0.c0 = 0.; \ + (r).s0.c1 = 0.; \ + (r).s0.c2 = 0.; \ + (r).s1.c0 = 0.; \ + (r).s1.c1 = 0.; \ + (r).s1.c2 = 0.; \ + (r).s2.c0 = -2. * (s).s2.c0; \ + (r).s2.c1 = -2. * (s).s2.c1; \ + (r).s2.c2 = -2. * (s).s2.c2; \ + (r).s3.c0 = -2. * (s).s3.c0; \ + (r).s3.c1 = -2. * (s).s3.c1; \ + (r).s3.c2 = -2. 
* (s).s3.c2; + + + +/* gamma 50 + * (r.s0) ( 0 0 -1 0 ) (s.s0) + * (r.s1) = ( 0 0 0 -1 ) * (s.s1) + * (r.s2) ( + 1 0 0 0 ) (s.s2) + * (r.s3) ( 0 + 1 0 0 ) (s.s3) + */ + +#define _gamma50(r,s) \ + (r).s0.c0 = -(s).s2.c0; \ + (r).s0.c1 = -(s).s2.c1; \ + (r).s0.c2 = -(s).s2.c2; \ + (r).s1.c0 = -(s).s3.c0; \ + (r).s1.c1 = -(s).s3.c1; \ + (r).s1.c2 = -(s).s3.c2; \ + (r).s2.c0 = (s).s0.c0; \ + (r).s2.c1 = (s).s0.c1; \ + (r).s2.c2 = (s).s0.c2; \ + (r).s3.c0 = (s).s1.c0; \ + (r).s3.c1 = (s).s1.c1; \ + (r).s3.c2 = (s).s1.c2; + + +/* gamma 51 + * (r.s0) ( 0 0 0 -i ) (s.s0) + * (r.s1) = ( 0 0 -i 0 ) * (s.s1) + * (r.s2) ( 0 -i 0 0 ) (s.s2) + * (r.s3) ( -i 0 0 0 ) (s.s3) + */ + +#define _gamma51(r,s) \ + (r).s0.c0 = -I * (s).s3.c0; \ + (r).s0.c1 = -I * (s).s3.c1; \ + (r).s0.c2 = -I * (s).s3.c2; \ + (r).s1.c0 = -I * (s).s2.c0; \ + (r).s1.c1 = -I * (s).s2.c1; \ + (r).s1.c2 = -I * (s).s2.c2; \ + (r).s2.c0 = -I * (s).s1.c0; \ + (r).s2.c1 = -I * (s).s1.c1; \ + (r).s2.c2 = -I * (s).s1.c2; \ + (r).s3.c0 = -I * (s).s0.c0; \ + (r).s3.c1 = -I * (s).s0.c1; \ + (r).s3.c2 = -I * (s).s0.c2 + +/* gamma 52 + * (r.s0) ( 0 0 0 -1 ) (s.s0) + * (r.s1) = ( 0 0 + 1 0 ) * (s.s1) + * (r.s2) ( 0 -1 0 0 ) (s.s2) + * (r.s3) ( + 1 0 0 0 ) (s.s3) + */ + +#define _gamma52(r,s) \ + (r).s0.c0 = -(s).s3.c0; \ + (r).s0.c1 = -(s).s3.c1; \ + (r).s0.c2 = -(s).s3.c2; \ + (r).s1.c0 = (s).s2.c0; \ + (r).s1.c1 = (s).s2.c1; \ + (r).s1.c2 = (s).s2.c2; \ + (r).s2.c0 = -(s).s1.c0; \ + (r).s2.c1 = -(s).s1.c1; \ + (r).s2.c2 = -(s).s1.c2; \ + (r).s3.c0 = (s).s0.c0; \ + (r).s3.c1 = (s).s0.c1; \ + (r).s3.c2 = (s).s0.c2 + +/* gamma 53 + * (r.s0) ( 0 0 -i 0 ) (s.s0) + * (r.s1) = ( 0 0 0 + i ) * (s.s1) + * (r.s2) ( -i 0 0 0 ) (s.s2) + * (r.s3) ( 0 + i 0 0 ) (s.s3) (r).s3.c1 = (s).s1.c1; \ + * (r).s3.c2 = (s).s1.c2; + * + * + / * *gamma 51 + * (r.c1) ( 0 0 0 -i ) (s.s0) + * (r.s1) = ( 0 0 -i 0 ) * (s.s1) + * (r.s2) ( 0 -i 0 0 ) (s.s2) + * (r.s3) ( -i 0 0 0 ) (s.s3) + */ + +#define _gamma51(r,s) \ +(r).s0.c0 = -I * (s).s3.c0; \ 
+(r).s0.c1 = -I * (s).s3.c1; \ +(r).s0.c2 = -I * (s).s3.c2; \ +(r).s1.c0 = -I * (s).s2.c0; \ +(r).s1.c1 = -I * (s).s2.c1; \ +(r).s1.c2 = -I * (s).s2.c2; \ +(r).s2.c0 = -I * (s).s1.c0; \ +(r).s2.c1 = -I * (s).s1.c1; \ +(r).s2.c2 = -I * (s).s1.c2; \ +(r).s3.c0 = -I * (s).s0.c0; \ +(r).s3.c1 = -I * (s).s0.c1; \ +(r).s3.c2 = -I * (s).s0.c2 + +/* gamma 52 + * (r.c1) ( 0 0 0 -1 ) (s.s0) + * (r.s1) = ( 0 0 + 1 0 ) * (s.s1) + * (r.s2) ( 0 -1 0 0 ) (s.s2) + * (r.s3) ( + 1 0 0 0 ) (s.s3) + */ + +#define _gamma52(r,s) \ +(r).s0.c0 = -(s).s3.c0; \ +(r).s0.c1 = -(s).s3.c1; \ +(r).s0.c2 = -(s).s3.c2; \ +(r).s1.c0 = (s).s2.c0; \ +(r).s1.c1 = (s).s2.c1; \ +(r).s1.c2 = (s).s2.c2; \ +(r).s2.c0 = -(s).s1.c0; \ +(r).s2.c1 = -(s).s1.c1; \ +(r).s2.c2 = -(s).s1.c2; \ +(r).s3.c0 = (s).s0.c0; \ +(r).s3.c1 = (s).s0.c1; \ +(r).s3.c2 = (s).s0.c2 + +/* gamma 53 + * (r.c1) ( 0 0 -i 0 ) (s.s0) + * (r.s1) = ( 0 0 0 + i ) * (s.s1) + * (r.s2) ( -i 0 0 0 ) (s.s2) + * (r.s3) ( 0 + i 0 0 ) (s.s3) + */ + +#define _gamma53(r,s) \ + (r).s0.c0 = -I * (s).s2.c0; \ + (r).s0.c1 = -I * (s).s2.c1; \ + (r).s0.c2 = -I * (s).s2.c2; \ + (r).s1.c0 = I * (s).s3.c0; \ + (r).s1.c1 = I * (s).s3.c1; \ + (r).s1.c2 = I * (s).s3.c2; \ + (r).s2.c0 = -I * (s).s0.c0; \ + (r).s2.c1 = -I * (s).s0.c1; \ + (r).s2.c2 = -I * (s).s0.c2; \ + (r).s3.c0 = I * (s).s1.c0; \ + (r).s3.c1 = I * (s).s1.c1; \ + (r).s3.c2 = I * (s).s1.c2 +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/temporalgauge.c b/qcd/part_cpu/applications/QCD/src/kernel_D/temporalgauge.c new file mode 100644 index 0000000000000000000000000000000000000000..98eee274a8a0138e76977cc12d81266f13c44bb4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/temporalgauge.c @@ -0,0 +1,1090 @@ +#ifdef HAVE_CONFIG_H +# include +#endif +#include "global.h" +#include "GPU/cudadefs.h" +#include "su3.h" +#include "geometry_eo.h" +#include "start.h" +#include "temporalgauge.h" +#include "measure_gauge_action.h" +#include "stdio.h" +#include "stdlib.h" +#include 
"linalg_eo.h" +#ifdef MPI + #include + #include "mpi_init.h" +#endif + + + +su3 * g_trafo; +su3 * tempgauge_field = NULL; + +su3 * left; +su3 * right; + + + +static su3 unit_su3 (void) +{ + su3 u; + _su3_one(u); + return(u); +} + + + +/*copy a complete gauge field*/ +/* THINK OF PARALLELIZATION (RAND!!!)*/ +void copy_gauge_field (su3 ** to, su3 ** from) +{ + for (int ix = 0; ix < VOLUME; ix++) + { + _su3_assign(to[ix][0], from[ix][0]); + _su3_assign(to[ix][1], from[ix][1]); + _su3_assign(to[ix][2], from[ix][2]); + _su3_assign(to[ix][3], from[ix][3]); + } +} + +/* + Set the trafo field for a temporal gauge + g(t=0) == ID + other g's are determined recursively from U (gfield) requiering that U^{'}_0 != ID + => only the U(t=T-1) are not ID!! +*/ +int init_temporalgauge_trafo (const int V, su3** gfield) { + +#ifndef MPI + + int it, iz, iy, ix; + + int pos; + + if ((void *)(g_trafo = (su3 *) calloc(V, sizeof(su3))) == NULL ) { + printf("malloc error in 'init_temporalgauge_trafo'\n"); + return(2); + } + + /* initialize first timeslice (t=0) with unit matrices*/ + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + g_trafo[g_ipt[0][ix][iy][iz]] = unit_su3(); + } + } + } + + /* U^{'}_0(x) g(x) U_0(x) g^{+}(x+0) != ID => g_(x+0) = g(x) U_0(x) */ + for (it = 1; it < T; it++) { + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + pos = g_ipt[it][ix][iy][iz]; + _su3_times_su3( g_trafo[ g_ipt[it ][ix][iy][iz] ] , + g_trafo[ g_ipt[it-1][ix][iy][iz] ] , + //gfield [ g_ipt[it-1][ix][iy][iz] ] [0] ); + gfield [ g_idn[pos][0] ] [0] ); + } + } + } + } + +#else // MPI + + int it, iz, iy, ix; + + int pos; + + MPI_Status status; + + + + if ((void *)(left = (su3 *) calloc(LX*LY*LZ, sizeof(su3))) == NULL ) { // allocates memory for a time-slice of su3-matrices + printf("malloc error in 'init_temporalgauge_trafo_mpi'\n"); + return(-1); + } + + if ((void *)(right = (su3 *) calloc(LX*LY*LZ, sizeof(su3))) 
== NULL ) { // allocates memory for a time-slice of su3-matrices + printf("malloc error in 'init_temporalgauge_trafo_mpi'\n"); + return(-1); + } + + + + + if ((void *)(g_trafo = (su3 *) calloc(V, sizeof(su3))) == NULL ) { // allocates memory for V su3-matrices + printf("malloc error in 'init_temporalgauge_trafo'\n"); + return(2); + } + + + + + ////////////////////////////////////////////// + // initializing the transformation matrices // + ////////////////////////////////////////////// + + + // first process in t-direction + + if (g_cart_id == 0) { + + /* initialize first timeslice (t=0) with unit matrices*/ + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + g_trafo[g_ipt[0][ix][iy][iz]] = unit_su3(); // g_trafo[0-th time slice] = ID + } + } + } + + /* U^{'}_0(x) = g(x) U_0(x) g^{+}(x+0) != ID => g_(x+0) = g(x) U_0(x) */ + for (it = 1; it < T; it++) { + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + _su3_times_su3( g_trafo[ g_ipt[it ][ix][iy][iz] ] , // g_trafo[next t-slice] = g_trafo[old t-slice] * gfield[old t-slice][t-dir.] 
+ g_trafo[ g_ipt[it-1][ix][iy][iz] ] , + gfield [ g_ipt[it-1][ix][iy][iz] ] [0] ); + + } + } + } + } + + + // sending + MPI_Send((void *)(g_trafo+(T-1)*LX*LY*LZ), LX*LY*LZ, mpi_su3, g_nb_t_up, 0, g_cart_grid); + //MPI_Send((void *)(g_trafo+(T-1)*LX*LY*LZ), LX*LY*LZ, mpi_su3, g_cart_id+1, 0, g_cart_grid); + + printf("g_cart_id = %i has send a message to %i\n", g_cart_id, g_nb_t_up); + + + } // first process + + + + + // following processes + + else { + + // receiving + MPI_Recv((void *)left, LX*LY*LZ, mpi_su3, g_nb_t_dn, 0, g_cart_grid, &status); + //MPI_Recv((void *)left, LX*LY*LZ, mpi_su3, g_cart_id-1, 0, g_cart_grid, &status); + + + printf("g_cart_id = %i has received a message from %i\n", g_cart_id, g_nb_t_dn); + + it = 0; + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + pos = g_ipt[it][ix][iy][iz]; + _su3_times_su3( g_trafo[ g_ipt[it ][ix][iy][iz] ] , // g_trafo[0-th time slice] = left[xchanged t-slice] * gfield[ + left [ g_ipt[it ][ix][iy][iz] ] , + gfield [ g_idn[pos ][0] ] [0] ); // notice: have to access the RAND region of the gauge field + } + } + } + + + for (it = 1; it < T; it++) { + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + _su3_times_su3( g_trafo[ g_ipt[it ][ix][iy][iz] ] , + g_trafo[ g_ipt[it-1][ix][iy][iz] ] , + gfield [ g_ipt[it-1][ix][iy][iz] ] [0] ); + + } + } + } + } + + + // sending + if (g_cart_id != g_nproc-1) { + MPI_Send((void *)(g_trafo+(T-1)*LX*LY*LZ), LX*LY*LZ, mpi_su3, g_nb_t_up, 0, g_cart_grid); + //MPI_Send((void *)(g_trafo+(T-1)*LX*LY*LZ), LX*LY*LZ, mpi_su3, g_cart_id+1, 0, g_cart_grid); + + printf("g_cart_id = %i has send a message to %i\n", g_cart_id, g_nb_t_up); + + } + + + } // following processes + + + + + //////////////////////////////////////////// + // exchanging the transformation matrices // + //////////////////////////////////////////// + + + MPI_Sendrecv((void *)(g_trafo), LX*LY*LZ, mpi_su3, g_nb_t_dn, 1, + (void *)(right 
), LX*LY*LZ, mpi_su3, g_nb_t_up, 1, + g_cart_grid, &status); + + printf("g_cart_id = %i has send to %i and received from %i\n", g_cart_id, g_nb_t_dn, g_nb_t_up); + + +#endif // MPI + + + /* + allocate and initialize g_tempgauge_field which holds a copy of the + global gauge field g_gauge_field which is copied back after the inversion + when the temporal gauge is undone again + */ + + int i = 0; + + if ((void *)(g_tempgauge_field = (su3 **) calloc(V, sizeof(su3*))) == NULL ) { + printf ("malloc error in 'init_temporalgauge_trafo'\n"); + return(1); + } + if ((void *)(tempgauge_field = (su3 *) calloc(4*V+1, sizeof(su3))) == NULL ) { + printf ("malloc error in 'init_temporalgauge_trafo'\n"); + return(2); + } + + #if (defined SSE || defined SSE2 || defined SSE3) + g_tempgauge_field[0] = (su3*)(((unsigned long int)(tempgauge_field)+ALIGN_BASE)&~ALIGN_BASE); + #else + g_tempgauge_field[0] = tempgauge_field; + #endif + + for(i = 1; i < V; i++){ + g_tempgauge_field[i] = g_tempgauge_field[i-1]+4; + } + + /* copy the original field */ + copy_gauge_field(g_tempgauge_field, g_gauge_field); + + return(0); + +} + + + + +/* + +// MPI implementation // was merged into init_temporalgauge_without_mpi() + +#ifdef MPI + +int init_temporalgauge_trafo_mpi (const int V, su3 ** gfield) { // will initialize g_trafo[] as the transformation matrices + // and g_tempgauge_field as a copy of g_gauge_field + int it, iz, iy, ix; + + int pos; + + MPI_Status status; + + + + if ((void *)(left = (su3 *) calloc(LX*LY*LZ, sizeof(su3))) == NULL ) { // allocates memory for a time-slice of su3-matrices + printf("malloc error in 'init_temporalgauge_trafo_mpi'\n"); + return(-1); + } + + if ((void *)(right = (su3 *) calloc(LX*LY*LZ, sizeof(su3))) == NULL ) { // allocates memory for a time-slice of su3-matrices + printf("malloc error in 'init_temporalgauge_trafo_mpi'\n"); + return(-1); + } + + + + + if ((void *)(g_trafo = (su3 *) calloc(V, sizeof(su3))) == NULL ) { // allocates memory for V su3-matrices + 
printf("malloc error in 'init_temporalgauge_trafo'\n"); + return(2); + } + + + + + ////////////////////////////////////////////// + // initializing the transformation matrices // + ////////////////////////////////////////////// + + + // first process in t-direction + + if (g_cart_id == 0) { + + // initialize first timeslice (t=0) with unit matrices + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + g_trafo[g_ipt[0][ix][iy][iz]] = unit_su3(); // g_trafo[0-th time slice] = ID + } + } + } + + // U^{'}_0(x) = g(x) U_0(x) g^{+}(x+0) != ID => g_(x+0) = g(x) U_0(x) + for (it = 1; it < T; it++) { + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + _su3_times_su3( g_trafo[ g_ipt[it ][ix][iy][iz] ] , // g_trafo[next t-slice] = g_trafo[old t-slice] * gfield[old t-slice][t-dir.] + g_trafo[ g_ipt[it-1][ix][iy][iz] ] , + gfield [ g_ipt[it-1][ix][iy][iz] ] [0] ); + + } + } + } + } + + + // sending + MPI_Send((void *)(g_trafo+(T-1)*LX*LY*LZ), LX*LY*LZ, mpi_su3, g_nb_t_up, 0, g_cart_grid); + //MPI_Send((void *)(g_trafo+(T-1)*LX*LY*LZ), LX*LY*LZ, mpi_su3, g_cart_id+1, 0, g_cart_grid); + + printf("g_cart_id = %i has send a message to %i\n", g_cart_id, g_nb_t_up); + + + } // first process + + + + + // following processes + + else { + + // receiving + MPI_Recv((void *)left, LX*LY*LZ, mpi_su3, g_nb_t_dn, 0, g_cart_grid, &status); + //MPI_Recv((void *)left, LX*LY*LZ, mpi_su3, g_cart_id-1, 0, g_cart_grid, &status); + + + printf("g_cart_id = %i has received a message from %i\n", g_cart_id, g_nb_t_dn); + + it = 0; + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + pos = g_ipt[it][ix][iy][iz]; + _su3_times_su3( g_trafo[ g_ipt[it ][ix][iy][iz] ] , // g_trafo[0-th time slice] = left[xchanged t-slice] * gfield[ + left [ g_ipt[it ][ix][iy][iz] ] , + gfield [ g_idn[pos ][0] ] [0] ); // notice: have to access the RAND region of the gauge field + } + } + } + + 
+ for (it = 1; it < T; it++) { + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + _su3_times_su3( g_trafo[ g_ipt[it ][ix][iy][iz] ] , + g_trafo[ g_ipt[it-1][ix][iy][iz] ] , + gfield [ g_ipt[it-1][ix][iy][iz] ] [0] ); + + } + } + } + } + + + // sending + if (g_cart_id != g_nproc-1) { + MPI_Send((void *)(g_trafo+(T-1)*LX*LY*LZ), LX*LY*LZ, mpi_su3, g_nb_t_up, 0, g_cart_grid); + //MPI_Send((void *)(g_trafo+(T-1)*LX*LY*LZ), LX*LY*LZ, mpi_su3, g_cart_id+1, 0, g_cart_grid); + + printf("g_cart_id = %i has send a message to %i\n", g_cart_id, g_nb_t_up); + + } + + + } // following processes + + + + + //////////////////////////////////////////// + // exchanging the transformation matrices // + //////////////////////////////////////////// + + + MPI_Sendrecv((void *)(g_trafo), LX*LY*LZ, mpi_su3, g_nb_t_dn, 1, + (void *)(right ), LX*LY*LZ, mpi_su3, g_nb_t_up, 1, + g_cart_grid, &status); + + printf("g_cart_id = %i has send to %i and received from %i\n", g_cart_id, g_nb_t_dn, g_nb_t_up); + + + + + + // all processes + + // copying the gaugefield (for later undoing the transformation) + + if ((void *)(g_tempgauge_field = (su3 **) calloc(V, sizeof(su3*))) == NULL ) { // allocates V su3 * + printf ("malloc error in 'init_temporalgauge_trafo'\n"); + return(1); + } + if ((void *)(tempgauge_field = (su3 *) calloc(4*V+1, sizeof(su3))) == NULL ) { // allocates 4*V+1 su3-matrices + printf ("malloc error in 'init_temporalgauge_trafo'\n"); + return(2); + } + + #if (defined SSE || defined SSE2 || defined SSE3) + g_tempgauge_field[0] = (su3*)(((unsigned long int)(tempgauge_field)+ALIGN_BASE)&~ALIGN_BASE); + #else + g_tempgauge_field[0] = tempgauge_field; + #endif + + int i = 0; + + for (i = 1; i < V; i++) { + g_tempgauge_field[i] = g_tempgauge_field[i-1]+4; + } + + // copy the original field + copy_gauge_field(g_tempgauge_field, g_gauge_field); + + + return(0); + + +}//init_temporalgauge_trafo_mpi() + +#endif //MPI + +*/ + + + + + +void 
finalize_temporalgauge() { + + free(g_trafo); + free(tempgauge_field); + free(g_tempgauge_field); + + #ifdef MPI + free(left); + free(right); + #endif + +} + + + + + +/* + +// apply gauge transform to gfield with the trafo stored in trafofield + +void apply_gtrafo2 (su3 ** gfield, su3 * trafofield) { + + int it, iz, iy, ix, xpos, mu; + + su3 temp1; + + if (g_proc_id == 0) { + printf("Applying gauge transformation..."); + } + + for (it = 0; it < T; it++) { + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + + xpos = g_ipt[it][ix][iy][iz]; + + for (mu = 0; mu < 4; mu++) { + // help = g(x) U_mu(x) + _su3_times_su3( temp1, trafofield[xpos], gfield[xpos][mu] ); + // U_mu(x) <- U_mu^{'}(x) = help g^{+}(x+mu) + _su3_times_su3d( gfield[xpos][mu],temp1, trafofield[ g_iup[xpos][mu] ]); + } + + } + } + } + } + + if (g_proc_id == 0) { + printf("done\n"); + } + + // update gauge copy fields in the next call to HoppingMatrix + g_update_gauge_copy = 1; +} + +*/ + + + + + +// apply gauge transform to gfield with the trafo stored in trafofield + +void apply_gtrafo (su3 ** gfield, su3 * trafofield) { + + int it, iz, iy, ix; + int pos; + int mu; + + su3 temp1; + + if (g_proc_id == 0) { + printf("Applying gauge transformation..."); + } + + for (it = 0; it < T; it++) { + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + + #ifdef MPI // this is the MPI implementation of the GLOBAL TEMPORALGAUGE + + pos = g_ipt[it][ix][iy][iz]; + + for (mu = 0; mu < 4; mu++) { + if ((it != T-1) || (mu != 0)) { + /* help = g(x) U_mu(x) */ + _su3_times_su3( temp1, trafofield[pos], gfield[pos][mu] ); // temp1 = trafofield[pos] * gfield[pos][mu] + /* U_mu(x) <- U_mu^{'}(x) = help g^{+}(x+mu)*/ + _su3_times_su3d( gfield[pos][mu],temp1, trafofield[ g_iup[pos][mu] ]); // gfield[pos][mu] = temp1 * trafofield[ g_iup[pos][mu] ] _ {dagger} + } // = trafofield[pos] * gfield[pos][mu] * trafofield[ g_iup[pos][mu] ]_{dagger} + 
else { + _su3_times_su3( temp1, trafofield[pos], gfield[pos][mu] ); + _su3_times_su3d( gfield[pos][mu],temp1, right[ g_ipt[0][ix][iy][iz] ]); // "rightest" transf. matrices are stored in right[] + } + } + + #else // in case of using this version with MPI this is + // a LOCAL version of TEMPORALGAUGE + pos = g_ipt[it][ix][iy][iz]; + + for (mu = 0; mu < 4; mu++) { + if ((it != T-1) || (mu != 0)) { + /* help = g(x) U_mu(x) */ + _su3_times_su3( temp1, trafofield[pos], gfield[pos][mu] ); + /* U_mu(x) <- U_mu^{'}(x) = help g^{+}(x+mu)*/ + _su3_times_su3d( gfield[pos][mu],temp1, trafofield[ g_iup[pos][mu] ]); + } + else { // (it = T-1) && (mu = 0) + _su3_times_su3( temp1, trafofield[pos], gfield[pos][mu] ); + _su3_times_su3d( gfield[pos][mu],temp1, trafofield[ g_ipt[0][ix][iy][iz] ]); // "rightest" transf. matrices are the first (periodic) and are initialized to ID + } + } + + #endif + + } + } + } + } + + if (g_proc_id == 0) { + printf("done\n"); + } + + /* update gauge copy fields in the next call to HoppingMatrix */ + g_update_gauge_copy = 1; + +}//apply_gtrafo() + + + + +/* + apply the inverse gauge transform to gfield with the trafo stored in trafofield +*/ + +// this is not really needed, instead we are copying the original gauge field + +void apply_inv_gtrafo (su3 ** gfield, su3 * trafofield) { + + int it, iz, iy, ix; + int xpos; + int mu; + + su3 temp1, temp2; + + if(g_proc_id == 0) { + printf("Applying INVERSE gauge transformation..."); + } + + for (it = 0; it < T; it++) { + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + + xpos = g_ipt[it][ix][iy][iz]; + + for (mu = 0; mu < 4; mu++) { + /* + _su3d_times_su3( temp1, trafofield[xpos], gfield[xpos][mu] ); + + _su3_times_su3( gfield[xpos][mu],temp1, trafofield[ g_iup[xpos][mu] ]); + */ + + /* help = U^{'}_mu(x) g(x+mu)*/ + _su3_times_su3( temp1, gfield[xpos][mu], trafofield[ g_iup[xpos][mu]] ); // temp1 = gfield[xpos][mu] * trafofield[ g_iup[xpos][mu] ] + + /* U_mu(x) 
<- g^{+}(x) help */ + _su3_dagger(temp2, trafofield[xpos] ) // temp2 = trafofield[xpos]_{dagger} + _su3_times_su3( gfield[xpos][mu], temp2, temp1); // gfield[xpos][mu] = temp2 * temp1 + // = trafofield[xpos]_{dagger} * gfield[xpos][mu] * trafofield[ g_iup[xpos][mu] ] + } + }}}} + + if(g_proc_id == 0) { + printf("done\n"); + } + + /* update gauge copy fields in the next call to HoppingMatrix */ + g_update_gauge_copy = 1; + +} + + + +/* + apply inverse gauge transform to spinor + U_0(x) = g^{+}(x) U^{'}_0(x) g(x+0) + => psi(x) = g^{+}(x) psi^{'}(x) + (the primed (^{'}) quantities are the gauge transformed fields) +*/ + +void apply_inv_gtrafo_spinor (spinor * spin, su3 * trafofield) { + + int it, iz, iy, ix; + int pos; + + spinor temp; + + if(g_proc_id == 0) { + printf("Applying INVERSE gauge transformation to spinor..."); + } + + for (it = 0; it < T; it++) { + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + + pos = g_ipt[it][ix][iy][iz]; + + _su3_inverse_multiply(temp.s0, trafofield[pos], spin[pos].s0); + _su3_inverse_multiply(temp.s1, trafofield[pos], spin[pos].s1); + _su3_inverse_multiply(temp.s2, trafofield[pos], spin[pos].s2); + _su3_inverse_multiply(temp.s3, trafofield[pos], spin[pos].s3); + + _vector_assign(spin[pos].s0,temp.s0); + _vector_assign(spin[pos].s1,temp.s1); + _vector_assign(spin[pos].s2,temp.s2); + _vector_assign(spin[pos].s3,temp.s3); + + } + } + } + } + + if (g_proc_id == 0) { + printf("done\n"); + } + +} + + + + +/* + apply gauge transform to spinor + U^{'}_0(x) = g(x) U_0(x) g^{+}(x+0) + => psi^{'}(x) = g(x) psi(x) + (the primed (^{'}) quantities are the gauge transformed fields) +*/ + +void apply_gtrafo_spinor (spinor * spin, su3 * trafofield) { + + int it, iz, iy, ix; + int pos; + spinor temp; + + if(g_proc_id == 0) { + printf("Applying gauge transformation to spinor..."); + } + + for (it = 0; it < T; it++) { + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; 
iz++) { + + pos = g_ipt[it][ix][iy][iz]; + + _su3_multiply(temp.s0, trafofield[pos], spin[pos].s0); + _su3_multiply(temp.s1, trafofield[pos], spin[pos].s1); + _su3_multiply(temp.s2, trafofield[pos], spin[pos].s2); + _su3_multiply(temp.s3, trafofield[pos], spin[pos].s3); + + _vector_assign(spin[pos].s0,temp.s0); + _vector_assign(spin[pos].s1,temp.s1); + _vector_assign(spin[pos].s2,temp.s2); + _vector_assign(spin[pos].s3,temp.s3); + } + } + } + } + + if(g_proc_id == 0) { + printf("done\n"); + } + +} + + + + +/* + apply gauge transform to ODD spinor + U^{'}_0(x) = g(x) U_0(x) g^{+}(x+0) + => psi^{'}(x) = g(x) psi(x) + (the primed (^{'}) quantities are the gauge transformed fields) +*/ + +void apply_gtrafo_spinor_odd (spinor * spin, su3 * trafofield) { + + int it, iz, iy, ix; + int pos; + int oddpos; + spinor temp; + + if (g_proc_id == 0) { + printf("Applying gauge transformation to odd spinor..."); + } + + for (it = 0; it < T; it++) { + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + + if ((it + ix + iy + iz) % 2 != 0) { + /* odd positions */ + pos = g_ipt[it][ix][iy][iz]; + oddpos = g_lexic2eosub[ pos ]; + + _su3_multiply(temp.s0, trafofield[pos], spin[oddpos].s0); + _su3_multiply(temp.s1, trafofield[pos], spin[oddpos].s1); + _su3_multiply(temp.s2, trafofield[pos], spin[oddpos].s2); + _su3_multiply(temp.s3, trafofield[pos], spin[oddpos].s3); + + _vector_assign(spin[oddpos].s0, temp.s0); + _vector_assign(spin[oddpos].s1, temp.s1); + _vector_assign(spin[oddpos].s2, temp.s2); + _vector_assign(spin[oddpos].s3, temp.s3); + + } + } + } + } + } + + if (g_proc_id == 0) { + printf("done\n"); + } + +} + + + + +/* + apply inverse gauge transform to ODD spinor + U_0(x) = g^{+}(x) U^{'}_0(x) g(x+0) + => psi(x) = g^{+}(x) psi^{'}(x) + (the primed (^{'}) quantities are the gauge ttemp.s0ransformed fields) +*/ + +void apply_inv_gtrafo_spinor_odd (spinor * spin, su3 * trafofield) { + + int it, iz, iy, ix; + int pos; + int oddpos; + + 
spinor temp; + + if (g_proc_id == 0) { + printf("Applying INVERSE gauge transformation to odd spinor..."); + } + for (it = 0; it < T; it++) { + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + + if ((it + ix + iy + iz) % 2 != 0) { + + /* odd positions */ + pos = g_ipt[it][ix][iy][iz]; + oddpos = g_lexic2eosub[ pos ]; + + _su3_inverse_multiply(temp.s0, trafofield[pos], spin[oddpos].s0); + _su3_inverse_multiply(temp.s1, trafofield[pos], spin[oddpos].s1); + _su3_inverse_multiply(temp.s2, trafofield[pos], spin[oddpos].s2); + _su3_inverse_multiply(temp.s3, trafofield[pos], spin[oddpos].s3); + + _vector_assign(spin[oddpos].s0, temp.s0); + _vector_assign(spin[oddpos].s1, temp.s1); + _vector_assign(spin[oddpos].s2, temp.s2); + _vector_assign(spin[oddpos].s3, temp.s3); + + } + + } + } + } + } + + if (g_proc_id == 0) { + printf("done\n"); + } + +} + + + + +/* + apply gauge transform to EVENspinor + U^{'}_0(x) = g(x) U_0(x) g^{+}(x+0) + => psi^{'}(x) = g(x) psi(x) + (the primed (^{'}) quantities are the gauge transformed fields) +*/ + +void apply_gtrafo_spinor_even (spinor * spin, su3 * trafofield) { + + int it, iz, iy, ix; + int pos; + int evenpos; + + spinor temp; + + if (g_proc_id == 0) { + printf("Applying gauge transformation to even spinor..."); + } + + for (it = 0; it < T; it++) { + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + + if ((it + ix + iy + iz) % 2 == 0) { + + /* even positions */ + pos = g_ipt[it][ix][iy][iz]; + evenpos = g_lexic2eosub[ pos ]; + + _su3_multiply(temp.s0, trafofield[pos], spin[evenpos].s0); + _su3_multiply(temp.s1, trafofield[pos], spin[evenpos].s1); + _su3_multiply(temp.s2, trafofield[pos], spin[evenpos].s2); + _su3_multiply(temp.s3, trafofield[pos], spin[evenpos].s3); + + _vector_assign(spin[evenpos].s0, temp.s0); + _vector_assign(spin[evenpos].s1, temp.s1); + _vector_assign(spin[evenpos].s2, temp.s2); + _vector_assign(spin[evenpos].s3, temp.s3); 
+ + } + } + } + } + } + + if (g_proc_id == 0) { + printf("done\n"); + } + +} + + + +/* + apply inverse gauge transform to EVEN spinor + U_0(x) = g^{+}(x) U^{'}_0(x) g(x+0) + => psi(x) = g^{+}(x) psi^{'}(x) + (the primed (^{'}) quantities are the gauge transformed fields) +*/ +void apply_inv_gtrafo_spinor_even (spinor * spin, su3 * trafofield) { + + int it, iz, iy, ix; + int xpos; + int evenpos; + + spinor temp; + + if (g_proc_id == 0) { + printf("Applying INVERSE gauge transformation to even spinor..."); + } + for (it = 0; it < T; it++) { + for (ix = 0; ix < LX; ix++) { + for (iy = 0; iy < LY; iy++) { + for (iz = 0; iz < LZ; iz++) { + + if ((it+ix+iy+iz)%2 == 0) { + /* even positions */ + xpos = g_ipt[it][ix][iy][iz]; + evenpos = g_lexic2eosub[ xpos ]; + + _su3_inverse_multiply(temp.s0, trafofield[xpos], spin[evenpos].s0); + _su3_inverse_multiply(temp.s1, trafofield[xpos], spin[evenpos].s1); + _su3_inverse_multiply(temp.s2, trafofield[xpos], spin[evenpos].s2); + _su3_inverse_multiply(temp.s3, trafofield[xpos], spin[evenpos].s3); + + _vector_assign(spin[evenpos].s0,temp.s0); + _vector_assign(spin[evenpos].s1,temp.s1); + _vector_assign(spin[evenpos].s2,temp.s2); + _vector_assign(spin[evenpos].s3,temp.s3); + + } + + } + } + } + } + + if (g_proc_id == 0) { + printf("done\n"); + } + +} + +void gtrafo_eo_nd(spinor * const Even_s, spinor * const Odd_s, spinor * const Even_c, spinor * const Odd_c, + spinor * const Even_new_s, spinor * const Odd_new_s, spinor * const Even_new_c, spinor * const Odd_new_c, + GTRAFO_TYPE type){ + + /* initialize temporal gauge here */ + int retval; + double dret1, dret2; + static double plaquette1 = 0.0; + static double plaquette2 = 0.0; + + if(type==GTRAFO_APPLY){ + /* need VOLUME here (not N=VOLUME/2)*/ + if ((retval = init_temporalgauge_trafo(VOLUME, g_gauge_field)) != 0 ) { // initializes the transformation matrices + if (g_proc_id == 0) printf("Error while gauge fixing to temporal gauge. 
Aborting...\n"); // g_tempgauge_field as a copy of g_gauge_field + exit(200); + } + + /* do trafo */ + plaquette1 = measure_plaquette(g_gauge_field); + apply_gtrafo(g_gauge_field, g_trafo); // transformation of the gauge field + plaquette2 = measure_plaquette(g_gauge_field); + if (g_proc_id == 0) printf("\tPlaquette before gauge fixing: %.16e\n", plaquette1/6./VOLUME); + if (g_proc_id == 0) printf("\tPlaquette after gauge fixing: %.16e\n", plaquette2/6./VOLUME); + + /* do trafo to odd_s part of source */ + dret1 = square_norm(Odd_s, VOLUME/2 , 1); + apply_gtrafo_spinor_odd(Odd_s, g_trafo); // odd spinor transformation, strange + dret2 = square_norm(Odd_s, VOLUME/2, 1); + if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); + if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); + + /* do trafo to odd_c part of source */ + dret1 = square_norm(Odd_c, VOLUME/2 , 1); + apply_gtrafo_spinor_odd(Odd_c, g_trafo); // odd spinor transformation, charm + dret2 = square_norm(Odd_c, VOLUME/2, 1); + if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); + if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); + + /* do trafo to even_s part of source */ + dret1 = square_norm(Even_s, VOLUME/2 , 1); + apply_gtrafo_spinor_even(Even_s, g_trafo); // even spinor transformation, strange + dret2 = square_norm(Even_s, VOLUME/2, 1); + if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); + if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); + + /* do trafo to even_c part of source */ + dret1 = square_norm(Even_c, VOLUME/2 , 1); + apply_gtrafo_spinor_even(Even_c, g_trafo); // even spinor transformation, charm + dret2 = square_norm(Even_c, VOLUME/2, 1); + if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); + if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); + } else { + /* undo 
trafo */ + /* apply_inv_gtrafo(g_gauge_field, g_trafo);*/ + /* copy back the saved original field located in g_tempgauge_field -> update necessary*/ + plaquette1 = measure_plaquette(g_gauge_field); + copy_gauge_field(g_gauge_field, g_tempgauge_field); + g_update_gauge_copy = 1; + plaquette2 = measure_plaquette(g_gauge_field); + if (g_proc_id == 0) printf("\tPlaquette before inverse gauge fixing: %.16e\n", plaquette1/6./VOLUME); + if (g_proc_id == 0) printf("\tPlaquette after inverse gauge fixing: %.16e\n", plaquette2/6./VOLUME); + + /* undo trafo to source Even_s */ + dret1 = square_norm(Even_s, VOLUME/2 , 1); + apply_inv_gtrafo_spinor_even(Even_s, g_trafo); + dret2 = square_norm(Even_s, VOLUME/2, 1); + if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); + if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); + + + /* undo trafo to source Even_c */ + dret1 = square_norm(Even_c, VOLUME/2 , 1); + apply_inv_gtrafo_spinor_even(Even_c, g_trafo); + dret2 = square_norm(Even_c, VOLUME/2, 1); + if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); + if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); + + /* undo trafo to source Odd_s */ + dret1 = square_norm(Odd_s, VOLUME/2 , 1); + apply_inv_gtrafo_spinor_odd(Odd_s, g_trafo); + dret2 = square_norm(Odd_s, VOLUME/2, 1); + if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); + if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); + + /* undo trafo to source Odd_c */ + dret1 = square_norm(Odd_c, VOLUME/2 , 1); + apply_inv_gtrafo_spinor_odd(Odd_c, g_trafo); + dret2 = square_norm(Odd_c, VOLUME/2, 1); + if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); + if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); + + + // Even_new_s + dret1 = square_norm(Even_new_s, VOLUME/2 , 1); + 
apply_inv_gtrafo_spinor_even(Even_new_s, g_trafo); + dret2 = square_norm(Even_new_s, VOLUME/2, 1); + if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); + if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); + + // Even_new_c + dret1 = square_norm(Even_new_c, VOLUME/2 , 1); + apply_inv_gtrafo_spinor_even(Even_new_c, g_trafo); + dret2 = square_norm(Even_new_c, VOLUME/2, 1); + if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); + if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); + + // Odd_new_s + dret1 = square_norm(Odd_new_s, VOLUME/2 , 1); + apply_inv_gtrafo_spinor_odd(Odd_new_s, g_trafo); + dret2 = square_norm(Odd_new_s, VOLUME/2, 1); + if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); + if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); + + // Odd_new_c + dret1 = square_norm(Odd_new_c, VOLUME/2 , 1); + apply_inv_gtrafo_spinor_odd(Odd_new_c, g_trafo); + dret2 = square_norm(Odd_new_c, VOLUME/2, 1); + if (g_proc_id == 0) printf("\tsquare norm before gauge fixing: %.16e\n", dret1); + if (g_proc_id == 0) printf("\tsquare norm after gauge fixing: %.16e\n", dret2); + + finalize_temporalgauge(); + } +# ifdef MPI + xchange_gauge(g_gauge_field); +# endif +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/temporalgauge.h b/qcd/part_cpu/applications/QCD/src/kernel_D/temporalgauge.h new file mode 100644 index 0000000000000000000000000000000000000000..3ebb5d68702981bd736a325910fd9850e5eea2c8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/temporalgauge.h @@ -0,0 +1,51 @@ +/*********************************************************************** + * Copyright (C) 2010 Florian Burger + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + ***********************************************************************/ +/* Declarations of the gauge transformation helpers for gauge and spinor fields. This header includes nothing itself: su3 and spinor must be declared by the including file first. */ +#ifndef _TEMPORALGAUGE_H +#define _TEMPORALGAUGE_H +/* Mode argument taken by gtrafo_eo() and gtrafo_eo_nd(). NOTE(review): presumably GTRAFO_APPLY transforms and GTRAFO_REVERT undoes it -- confirm in temporalgauge.c. */ +typedef enum GTRAFO_TYPE { + GTRAFO_APPLY = 0, + GTRAFO_REVERT } GTRAFO_TYPE; +/* apply_* perform the transformation with trafofield on a gauge field or spinor, apply_inv_* undo it. */ +int init_temporalgauge_trafo(const int V, su3** gfield); +void apply_gtrafo(su3 ** gfield, su3 * trafofield); +void apply_gtrafo_spinor(spinor * spin, su3 * trafofield); +void apply_inv_gtrafo(su3 ** gfield, su3 * trafofield); +void apply_inv_gtrafo_spinor(spinor * spin, su3 * trafofield); +void finalize_temporalgauge(void); /* (void) makes this a real prototype, so a call with arguments is rejected at compile time */ +/* Variants of the spinor transformations for odd and even spinor fields. */ +void apply_gtrafo_spinor_odd(spinor * spin, su3 * trafofield); +void apply_inv_gtrafo_spinor_odd(spinor * spin, su3 * trafofield); +void apply_gtrafo_spinor_even(spinor * spin, su3 * trafofield); +void apply_inv_gtrafo_spinor_even(spinor * spin, su3 * trafofield); +/* NOTE(review): presumably _s / _c are the strange / charm spinors and type selects apply or revert for all of them -- confirm in temporalgauge.c. */ +void gtrafo_eo_nd(spinor * const Even_s, spinor * const Odd_s, spinor * const Even_c, spinor * const Odd_c, + spinor * const Even_new_s, spinor * const Odd_new_s, spinor * const Even_new_c, spinor * const Odd_new_c, + GTRAFO_TYPE type); + +void gtrafo_eo(spinor * const Even, spinor * const Odd, GTRAFO_TYPE type); + +void copy_gauge_field(su3** to, su3** from); + +#endif + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_D/test/Makefile new file mode 100644 index 
0000000000000000000000000000000000000000..8efc8b56946d10339e4e53a16ae3a884162e70a1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/Makefile @@ -0,0 +1,88 @@ +TARGETS = scalar_prod_r_test + +USESF = yes + +OS = -os3 + +# gcc shouldn't see these options, that's why we don't use CFLAGS here +NLCCFLAGS = -D_STD_C99_COMPLEX_CHECKED -D_STD_C99_COMPLEX -Dapenext +INCLUDES = -I../ +# workaround to let nlcc not see the non-standard complex.h +NLCCINCLUDES = -I${NROOT}/include/nlibc/ ${INCLUDES} + +NLCCOPTS = -gp ${NLCCFLAGS} ${NLCCINCLUDES} +ifdef USESF + MPPOPTS = -sf -v + SHAKEROPTS = -n -z +else + MPPOPTS = -v + SHAKEROPTS = +a -z +endif +SOFANOPTS = --rr + +# needed due to a bug in nlcc +NLCCOS = -OS3 + +NLCC = nlcc-0.5.2 +MPP = mpp +SOFAN = sofan +SHAKER = shaker +M4 = m4 +CCDEP = gcc +DEPFLAGS = -MM -MQ $*.sasm ${CFLAGS} ${INCLUDES} + +DEPFILES = $(addsuffix .d, ${TARGETS}) +MEMFILES = $(addsuffix .mem, ${TARGETS}) $(addsuffix -sofan.mem, ${TARGETS}) \ + $(addsuffix .no, ${TARGETS}) $(addsuffix -sofan.no, ${TARGETS}) +ASMFILES = $(addsuffix .sasm, ${TARGETS}) $(addsuffix .masm, ${TARGETS}) $(addsuffix -sofan.masm, ${TARGETS}) +NCDFILES = $(addsuffix .ncd, ${TARGETS}) $(addsuffix -sofan.ncd, ${TARGETS}) +SFOUTFILES = $(addsuffix .svn-out, ${TARGETS}) $(addsuffix .svn-out%, ${TARGETS}) \ + $(addsuffix .sf_log, ${TARGETS}) $(addsuffix .sf_log%, ${TARGETS}) \ + $(addsuffix .sf_log0, ${TARGETS}) $(addsuffix .sf_log0%, ${TARGETS}) \ + $(addsuffix .err-sf, ${TARGETS}) $(addsuffix .svn-out, ${TARGETS}) \ + $(addsuffix .dmo, ${TARGETS}) \ + $(addsuffix -sofan.svn-out, ${TARGETS}) $(addsuffix -sofan.svn-out%, ${TARGETS}) \ + $(addsuffix -sofan.sf_log, ${TARGETS}) $(addsuffix -sofan.sf_log%, ${TARGETS}) \ + $(addsuffix -sofan.sf_log0, ${TARGETS}) $(addsuffix -sofan.sf_log0%, ${TARGETS}) \ + $(addsuffix -sofan.err-sf, ${TARGETS}) $(addsuffix -sofan.svn-out, ${TARGETS}) \ + $(addsuffix -sofan.dmo, ${TARGETS}) +GCCBINARIES = $(addsuffix .gccbin, ${TARGETS}) + 
+all: $(addsuffix -sofan.mem, ${TARGETS}) +allgcc: $(addsuffix .gccbin, ${TARGETS}) + +-include $(DEPFILES) +# Tool chain for "all": %.c -(nlcc -S)-> %.sasm -(mpp)-> %.masm -(sofan)-> %-sofan.masm -(shaker)-> %-sofan.mem +%.mem: %.masm + ${SHAKER} ${SHAKEROPTS} $< + +%.masm: %.sasm + ${MPP} ${OS} ${MPPOPTS} $< + +%-sofan.masm: %.masm + ${SOFAN} ${SOFANOPTS} $< $@ + +%.sasm: %.c Makefile + ${NLCC} ${NLCCOPTS} ${NLCCOS} -S $< + +%.ncd: %.mem + dispminit $< > $@ + +%-sofan.perf: %-sofan.ncd + nperf -asm=$*.sasm -c -l -a $< > $@ || (rm -f $@; exit 1) + +# beware, this is not very general +%.gccbin: %.c + gcc -I../ $< -o $@ + +$(DEPFILES): %.d: %.c Makefile + $(CCDEP) ${DEPFLAGS} $< > $@ + +clean: + rm -f ${ASMFILES} ${MEMFILES} ${NCDFILES} ${GCCBINARIES} + +distclean: clean + rm -f ${DEPFILES} ${SFOUTFILES} +.PHONY: all allgcc clean distclean +.SECONDARY: +.DELETE_ON_ERROR: diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_geometry.c b/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_geometry.c new file mode 100644 index 0000000000000000000000000000000000000000..eae06a51bdabcab4a75d178a3585faa0848d948d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_geometry.c @@ -0,0 +1,2747 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. 
+ ***********************************************************************/ +/******************************************************************************* + * + * File check_geometry.c + * + * Consistency of the index arrays ipt, iup and idn + * + * Author: Carsten Urbach + * using a file of Martin Luescher as template + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef MPI +#include +#endif +#include "global.h" +#include "geometry_eo.h" +#include "test/check_geometry.h" +#ifdef MPI +#include "mpi_init.h" +#endif + +#if defined _INDEX_INDEP_GEOM + +int check_geometry() +{ +#ifdef XLC +#pragma execution_frequency(very_low) +#endif + int ix, j; + int * stest; + int * itest; + int x0,x1,x2,x3; + int iy0,iy1,iy2,iy3; + int iz0,iz1,iz2,iz3; + int bndcnt = 0; + int ext_t=0, ext_x=0, ext_y=0, ext_z=0; + +#if ( defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + ext_t=2; +#endif +#if ( defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + ext_x=2; +#endif +#if ( defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + ext_y=2; +#endif +#if ( defined PARALLELXYZT || defined PARALLELXYZ ) + ext_z=2; +#endif + + + + + itest = calloc(VOLUMEPLUSRAND + g_dbw2rand, sizeof(int)); + stest = calloc((VOLUMEPLUSRAND)/2, sizeof(int)); + + for (ix=0;ix= VOLUME)) { + printf("The index ipt is out of range (%d, %d, %d, %d) ix = %d\n", x0, x1, x2, x3, ix); + printf("Program aborted\n"); + return(-1); + } + itest[ix]+=1; + } + } + } + } + + for (ix = 0; ix < VOLUME; ix++){ + if (itest[ix]!=1){ + printf("The index ipt is not one-to-one %d\n", itest[ix]); + printf("Program aborted\n"); + return(-1); + } + } + + for (x0 = 0; x0 < T; x0++){ + for (x1 = 0; x1 < LX; x1++){ + for (x2 = 0; x2 < LY; x2++){ + for (x3 = 0; x3 < LZ; 
x3++){ + ix=g_ipt[x0][x1][x2][x3]; + + iy0=g_iup[ix][0]; +#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + if(x0!=T-1) { + iz0=g_ipt[(x0+1)%T][x1][x2][x3]; + } + else { + iz0 = g_ipt[T][x1][x2][x3]; + itest[iy0]++; + if(iy0 < gI_L_0_0_0 || iy0 >= gI_L_0_0_0 + LX*LY*LZ) { + printf("Boundary for time direction up is wrong %d %d %d\n", + iy0 < gI_L_0_0_0, iy0 >= gI_L_0_0_0 + LX*LY*LZ, iy0); + return(-1); + } + } +#else + iz0=g_ipt[(x0+1)%T][x1][x2][x3]; +#endif + + iy1=g_iup[ix][1]; +#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ) + if(x1 !=LX-1) { + iz1=g_ipt[x0][(x1+1)%LX][x2][x3]; + } + else { + iz1=g_ipt[x0][LX][x2][x3]; + itest[iy1]++; + if(iy1 < gI_0_L_0_0 || iy1 >= gI_0_L_0_0 + T*LY*LZ) { + printf("Boundary for x direction up is wrong %d %d %d\n", + iy1 < gI_0_L_0_0, iy1 >= gI_0_L_0_0 + T*LY*LZ, iy1); + return(-1); + } + } +#else + iz1=g_ipt[x0][(x1+1)%LX][x2][x3]; +#endif + + iy2=g_iup[ix][2]; +#if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + if(x2 !=LY-1) { + iz2=g_ipt[x0][x1][(x2+1)%LY][x3]; + } + else { + iz2=g_ipt[x0][x1][LY][x3]; + itest[iy2]++; + if(iy2 < gI_0_0_L_0 || iy2 >= gI_0_0_L_0 + T*LX*LZ) { + printf("Boundary for y direction up is wrong %d %d %d\n", + iy2 < gI_0_0_L_0, iy2 > gI_0_0_L_0 + T*LX*LZ, iy2); + return(-1); + } + } +#else + iz2=g_ipt[x0][x1][(x2+1)%LY][x3]; +#endif + + iy3=g_iup[ix][3]; +#if ( defined PARALLELXYZT || defined PARALLELXYZ ) + if(x3 !=LZ-1) { + iz3=g_ipt[x0][x1][x2][(x3+1)%LZ]; + } + else { + iz3=g_ipt[x0][x1][x2][LZ]; + itest[iy3]++; + if(iy3 < gI_0_0_0_L || iy3 >= gI_0_0_0_L + T*LX*LY) { + printf("Boundary for z direction up is wrong %d %d %d\n", + iy3 < gI_0_0_0_L, iy3 > gI_0_0_0_L+ T*LX*LY, iy3); + return(-1); + } + } +#else + iz3=g_ipt[x0][x1][x2][(x3+1)%LZ]; +#endif + + if ((iy0!=iz0)||(iy1!=iz1)||(iy2!=iz2)||(iy3!=iz3)|| + 
(g_idn[iy0][0]!=ix)||(g_idn[iy1][1]!=ix)||(g_idn[iy2][2]!=ix)||(g_idn[iy3][3]!=ix)) { + printf("The index iup is incorrect\n"); + printf("%d %d %d %d\n", (iy0!=iz0), (iy1!=iz1), (iy2!=iz2), (iy3!=iz3)); + printf("%d %d %d %d %d %d\n", x0, x1, x2, x3, iy1, iz1); + return(-1); + } + + iy0=g_idn[ix][0]; +#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + if(x0 !=0) { + iz0=g_ipt[(x0+T-1)%T][x1][x2][x3]; + } + else { + iz0 = g_ipt[T+1][x1][x2][x3];; + itest[iy0]++; + if(iy0 < gI_m1_0_0_0 || iy0 >= gI_m1_0_0_0 + LX*LY*LZ) { + printf("Boundary for time direction is wrong %d %d %d\n", + iy0 < gI_m1_0_0_0, iy0 >= gI_m1_0_0_0 + LX*LY*LZ, iy0); + return(-1); + } + } +#else + iz0=g_ipt[(x0+T-1)%T][x1][x2][x3]; +#endif + + iy1=g_idn[ix][1]; +#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ) + if(x1 !=0) { + iz1=g_ipt[x0][(x1+LX-1)%LX][x2][x3]; + } + else { + iz1 = g_ipt[x0][LX+1][x2][x3]; + itest[iy1]++; + if(iy1 < gI_0_m1_0_0 || iy1 >= gI_0_m1_0_0 + T*LY*LZ) { + printf("Boundary for x direction is wrong %d %d %d\n", + iy1 < gI_0_m1_0_0, iy1 >= gI_0_m1_0_0 + T*LY*LZ, iy1); + return(-1); + } + } +#else + iz1=g_ipt[x0][(x1+LX-1)%LX][x2][x3]; +#endif + iy2=g_idn[ix][2]; +#if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + if(x2 !=0) { + iz2=g_ipt[x0][x1][(x2+LY-1)%LY][x3]; + } + else { + iz2 = g_ipt[x0][x1][LY+1][x3]; + itest[iy2]++; + if(iy2 < gI_0_0_m1_0 || iy2 >= gI_0_0_m1_0 + T*LX*LZ) { + printf("Boundary for y direction is wrong %d %d %d\n", + iy2 < gI_0_0_m1_0, iy2 >= gI_0_0_m1_0 + T*LX*LZ, iy2); + return(-1); + } + } +#else + iz2=g_ipt[x0][x1][(x2+LY-1)%LY][x3]; +#endif + + iy3=g_idn[ix][3]; +#if ( defined PARALLELXYZT || defined PARALLELXYZ ) + if(x3 !=0) { + iz3=g_ipt[x0][x1][x2][(x3+LZ-1)%LZ]; + } + else { + iz3 = g_ipt[x0][x1][x2][LZ+1]; + itest[iy3]++; + if(iy3 < gI_0_0_0_m1 || iy3 >= 
gI_0_0_0_m1 + T*LX*LY) { + printf("Boundary for z direction is wrong %d %d %d\n", + iy3 < gI_0_0_0_m1, iy3 >= gI_0_0_0_m1 + T*LX*LY, iy3); + printf("%d %d %d %d %d\n", x0, x1, x2, x3, ix); + return(-1); + } + } +#else + iz3=g_ipt[x0][x1][x2][(x3+LZ-1)%LZ]; +#endif + + if ((iy0!=iz0)||(iy1!=iz1)||(iy2!=iz2)||(iy3!=iz3)|| + (g_iup[iy0][0]!=ix)||(g_iup[iy1][1]!=ix)||(g_iup[iy2][2]!=ix)||(g_iup[iy3][3]!=ix)) { + printf("The index idn is incorrect\n"); + printf("%d %d %d %d\n", (iy0!=iz0), (iy1!=iz1), (iy2!=iz2), (iy3!=iz3)); + printf("%d %d %d %d\n", iy1, iz1, iy2, iz2); + printf("%d %d %d %d %d\n", x0, x1, x2, x3, ix); + return(-1); + } + + /* The edges */ + /* In case of PARALLELT or PARALLELX there is actually no edge to take care of */ +#if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(x0 == 0) { + iy0 = g_idn[ g_idn[ix][1] ][0]; + if(x1 != 0) { + iz0 = g_ipt[T+1][(x1+LX-1)%LX][x2][x3]; + } + else { + iz0 = g_ipt[T+1][LX+1][x2][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge -t -x has an error\n"); + return(-1); + } + + iy0 = g_idn[ g_iup[ix][1] ][0]; + if(x1 != LX-1) { + iz0 = g_ipt[T+1][(x1+1)%LX][x2][x3]; + } + else { + iz0 = g_ipt[T+1][LX][x2][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge -t +x has an error\n"); + return(-1); + } + } + + if(x0 == T-1) { + iy0 = g_iup[ g_idn[ix][1] ][0]; + if(x1 != 0) { + iz0 = g_ipt[T][(x1+LX-1)%LX][x2][x3]; + } + else { + iz0 = g_ipt[T][LX+1][x2][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge +t -x has an error\n"); + return(-1); + } + + iy0 = g_iup[ g_iup[ix][1] ][0]; + if(x1 != LX-1) { + iz0 = g_ipt[T][(x1+1)%LX][x2][x3]; + } + else { + iz0 = g_ipt[T][LX][x2][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge +t +x has an error\n"); + return(-1); + } + } + +#endif + +#if (defined PARALLELXYT || defined PARALLELXYZT) + if(x0 == 0) { + iy0 = g_idn[ g_idn[ix][2] ][0]; + if(x2 != 0) { + iz0 = g_ipt[T+1][x1][(x2+LY-1)%LY][x3]; + } + else { + iz0 = 
g_ipt[T+1][x1][LY+1][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge -t -y has an error\n"); + return(-1); + } + + iy0 = g_idn[ g_iup[ix][2] ][0]; + if(x2 != LY-1) { + iz0 = g_ipt[T+1][x1][(x2+1)%LY][x3]; + } + else { + iz0 = g_ipt[T+1][x1][LY][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge -t +y has an error\n"); + return(-1); + } + } + + if(x0 == T-1) { + iy0 = g_iup[ g_idn[ix][2] ][0]; + if(x2 != 0) { + iz0 = g_ipt[T][x1][(x2+LY-1)%LY][x3]; + } + else { + iz0 = g_ipt[T][x1][LY+1][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge +t -y has an error\n"); + return(-1); + } + + iy0 = g_iup[ g_iup[ix][2] ][0]; + if(x2 != LY-1) { + iz0 = g_ipt[T][x1][(x2+1)%LY][x3]; + } + else { + iz0 = g_ipt[T][x1][LY][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge +t +y has an error\n"); + return(-1); + } + } + +#endif +#if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ) + + if(x1 == 0) { + iy0 = g_idn[ g_idn[ix][2] ][1]; + if(x2 != 0) { + iz0 = g_ipt[x0][LX+1][(x2+LY-1)%LY][x3]; + } + else { + iz0 = g_ipt[x0][LX+1][LY+1][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge -x -y has an error\n"); + return(-1); + } + iy0 = g_idn[ g_iup[ix][2] ][1]; + if(x2 != LY-1) { + iz0 = g_ipt[x0][LX+1][(x2+1)%LY][x3]; + } + else { + iz0 = g_ipt[x0][LX+1][LY][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge -x +y has an error\n"); + return(-1); + } + } + if(x1 == LX-1) { + iy0 = g_iup[ g_idn[ix][2] ][1]; + if(x2 != 0) { + iz0 = g_ipt[x0][LX][(x2+LY-1)%LY][x3]; + } + else { + iz0 = g_ipt[x0][LX][LY+1][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge +x -y has an error\n"); + return(-1); + } + + iy0 = g_iup[ g_iup[ix][2] ][1]; + if(x2 != LY-1) { + iz0 = g_ipt[x0][LX][(x2+1)%LY][x3]; + } + else { + iz0 = g_ipt[x0][LX][LY][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge +x +y has an error\n"); + return(-1); + } + } +#endif +#if defined PARALLELXYZT + if(x0 == 0) { + iy0 = 
g_idn[ g_idn[ix][3] ][0]; + if(x3 != 0) { + iz0 = g_ipt[T+1][x1][x2][(x3+LZ-1)%LZ]; + } + else { + iz0 = g_ipt[T+1][x1][x2][LZ+1]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge -t -z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); + return(-1); + } + } + if(iz0 != iy0) { + printf("Edge -t -z has an error\n"); + printf("Program aborted\n"); +# ifdef MPI + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); +# endif + exit(0); + } + + iy0 = g_idn[ g_iup[ix][3] ][0]; + if(x3 != LZ-1) { + iz0 = g_ipt[T+1][x1][x2][(x3+1)%LZ]; + } + else { + iz0 = g_ipt[T+1][x1][x2][LZ]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge -t +z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); + printf("ix = %d, iz0 = %d %d\n", ix, iz0, g_iup[ix][3]); + return(-1); + } + } + if(iz0 != iy0) { + printf("Edge -t +z has an error\n"); + printf("Program aborted\n"); +# ifdef MPI + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); +# endif + exit(0); + } + } + + if(x0 == T-1) { + iy0 = g_iup[ g_idn[ix][3] ][0]; + if(x3 != 0) { + iz0 = g_ipt[T][x1][x2][(x3+LZ-1)%LZ]; + } + else { + iz0 = g_ipt[T][x1][x2][LZ+1]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge +t -z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); + return(-1); + } + } + if(iz0 != iy0) { + printf("Edge +t -z has an error\n"); + return(-1); + } + + iy0 = g_iup[ g_iup[ix][0] ][3]; + if(x3 != LZ-1) { + iz0 = g_ipt[T][x1][x2][(x3+1)%LZ]; + } + else { + iz0 = g_ipt[T][x1][x2][LZ]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge +t +z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); + return(-1); + } + } + if(iz0 != iy0) { + printf("Edge +t +z has an error\n"); + return(-1); + } + } + +#endif +#if ( defined PARALLELXYZT || defined PARALLELXYZ ) + + if(x1 == 0) { + iy0 = g_idn[ g_idn[ix][3] ][1]; + if(x3 != 0) { + iz0 = g_ipt[x0][LX+1][x2][(x3+LZ-1)%LZ]; + } + else { + iz0 = g_ipt[x0][LX+1][x2][LZ+1]; + 
itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge -x -z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); + return(-1); + } + } + if(iz0 != iy0) { + printf("Edge -x -z has an error\n"); + return(-1); + } + iy0 = g_idn[ g_iup[ix][3] ][1]; + if(x3 != LZ-1) { + iz0 = g_ipt[x0][LX+1][x2][(x3+1)%LZ]; + } + else { + iz0 = g_ipt[x0][LX+1][x2][LZ]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge -x +z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); + return(-1); + } + } + if(iz0 != iy0) { + printf("Edge -x +z has an error\n"); + return(-1); + } + } + if(x1 == LX-1) { + iy0 = g_iup[ g_idn[ix][3] ][1]; + if(x3 != 0) { + iz0 = g_ipt[x0][LX][x2][(x3+LZ-1)%LZ]; + } + else { + iz0 = g_ipt[x0][LX][x2][LZ+1]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge +x -z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); + return(-1); + } + } + if(iz0 != iy0) { + printf("Edge +x -z has an error\n"); + return(-1); + } + + iy0 = g_iup[ g_iup[ix][3] ][1]; + if(x3 != LZ-1) { + iz0 = g_ipt[x0][LX][x2][(x3+1)%LZ]; + } + else { + iz0 = g_ipt[x0][LX][x2][LZ]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge +x +z has itest = %d at %d, %d, %d, %d, iy0 = %d iz0 = %d giup[%d][3] = %d ix = %d\n", + itest[iy0], x0, x1, x2, x3, iy0, iz0, ix, g_iup[ix][3], ix); + return(-1); + } + } + if(iz0 != iy0) { + printf("Edge +x +z has an error\n"); + return(-1); + } + } + + if(x2 == 0) { + iy0 = g_idn[ g_idn[ix][3] ][2]; + if(x3 != 0) { + iz0 = g_ipt[x0][x1][LY+1][(x3+LZ-1)%LZ]; + } + else { + iz0 = g_ipt[x0][x1][LY+1][LZ+1]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge -y -z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); + return(-1); + } + } + if(iz0 != iy0) { + printf("Edge -y -z has an error\n"); + return(-1); + } + iy0 = g_idn[ g_iup[ix][3] ][2]; + if(x3 != LZ-1) { + iz0 = g_ipt[x0][x1][LY+1][(x3+1)%LZ]; + } + else { + iz0 = g_ipt[x0][x1][LY+1][LZ]; + itest[iy0]++; 
+ if(itest[iy0]>1) { + printf("Edge -y +z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); + return(-1); + } + } + if(iz0 != iy0) { + printf("Edge -y +z has an error\n"); + return(-1); + } + } + if(x2 == LY-1) { + iy0 = g_iup[ g_idn[ix][3] ][2]; + if(x3 != 0) { + iz0 = g_ipt[x0][x1][LY][(x3+LZ-1)%LZ]; + } + else { + iz0 = g_ipt[x0][x1][LY][LZ+1]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge +y -z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); + return(-1); + } + } + if(iz0 != iy0) { + printf("Edge +y -z has an error\n"); + return(-1); + } + + iy0 = g_iup[ g_iup[ix][3] ][2]; + if(x3 != LZ-1) { + iz0 = g_ipt[x0][x1][LY][(x3+1)%LZ]; + } + else { + iz0 = g_ipt[x0][x1][LY][LZ]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge +y +z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); + return(-1); + } + } + if(iz0 != iy0) { + printf("Edge +y +z has an error\n"); + return(-1); + } + } +#endif + } + } + } + } + + for (ix = VOLUME; ix < (VOLUME+RAND+EDGES); ix++){ + if (itest[ix]!=1) { + printf("The boundary is not correctly used itest = %d ix = %d %d %d %d\n", itest[ix], ix, VOLUME, RAND, EDGES); + return(-1); + } + } + + for (ix=0;ix 0) { + +#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + for (x1 = 0; x1 < LX; x1++) { + for (x2 = 0; x2 < LY; x2++) { + for (x3 = 0; x3 < LZ; x3++) { + x0 = T; + ix=g_ipt[x0][x1][x2][x3]; + + iy0=g_iup[ix][0]; + if(iy0 < gI_Lp1_0_0_0 || iy0 >= gI_Lp1_0_0_0 + LX*LY*LZ) { + printf("The DBW2 boundary is not correctly mapped in up t-direction %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy0); + return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", itest[iy0], x0, x1, x2, x3, iy0, ix); + return(-1); + } + + x0 = T+1; + ix=g_ipt[x0][x1][x2][x3]; + + iy0=g_idn[ix][0]; + if(iy0 < gI_m2_0_0_0 || iy0 >= 
gI_m2_0_0_0 + LX*LY*LZ) { + printf("The DBW2 boundary is not correctly mapped in down t-direction %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy0); + return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", itest[iy0], x0, x1, x2, x3, iy0, ix); + return(-1); + } + } + } + } +#endif + +#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + for (x0 = 0; x0 < T+ext_t; x0++) { + for (x2 = 0; x2 < LY; x2++) { + for (x3 = 0; x3 < LZ; x3++) { + x1 = LX; + ix = g_ipt[x0][x1][x2][x3]; + + iy1=g_iup[ix][1]; + if((iy1 < gI_0_Lp1_0_0 || iy1 >= gI_0_Lp1_0_0 + T*LY*LZ) && x0 < T) { + printf("The DBW2 boundary is not correctly mapped in up x-direction %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy1); + return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used up x itest = %d (%d %d %d %d) iy1 = %d ix = %d \n", itest[iy1], x0, x1, x2, x3, iy1, ix); + return(-1); + } + + if(x0 == T) { + iy0 = g_iup[ix][0]; + if(iy0 < gI_Lp1_L_0_0 || iy0 >= gI_Lp1_L_0_0 + LY*LZ) { + printf("The DBW2 boundary is not correctly mapped in up t-direction up x %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy0); + return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used up t up x itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", itest[iy0], x0, x1, x2, x3, iy0, ix); + return(-1); + } + } + if(x0 == T+1) { + iy0 = g_idn[ix][0]; + if(iy0 < gI_m2_L_0_0 || iy0 >= gI_m2_L_0_0 + LY*LZ) { + printf("The DBW2 boundary is not correctly mapped in down t-direction up x %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy0); + return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used down t up x itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", itest[iy0], x0, x1, x2, x3, iy0, ix); + return(-1); + } + } + + + x1 = LX+1; + ix = 
g_ipt[x0][x1][x2][x3]; + + iy1=g_idn[ix][1]; + if((iy1 < gI_0_m2_0_0 || iy1 >= gI_0_m2_0_0 + T*LY*LZ) && x0 < T) { + printf("The DBW2 boundary is not correctly mapped in down x-direction %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy1); + return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used down x itest = %d (%d %d %d %d) iy1 = %d ix = %d \n", itest[iy1], x0, x1, x2, x3, iy1, ix); + return(-1); + } + + if(x0 == T) { + iy0 = g_iup[ix][0]; + if(iy0 < gI_Lp1_m1_0_0 || iy0 >= gI_Lp1_m1_0_0 + LY*LZ) { + printf("The DBW2 boundary is not correctly mapped in up t-direction down x %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy0); + return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used up t down x itest = %d (%d %d %d %d) iy0 = %d ix = %d \n", itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); } + } + if(x0 == T+1) { + iy0 = g_idn[ix][0]; + if(iy0 < gI_m2_m1_0_0 || iy0 >= gI_m2_m1_0_0 + LY*LZ) { + printf("The DBW2 boundary is not correctly mapped in down t-direction down x %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used down t down xitest = %d (%d %d %d %d) iy0 = %d ix = %d \n", itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + } + } + } +#endif + +#if ( defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + + for (x0 = 0; x0 < T+ext_t; x0++) { + for (x1 = 0; x1 < LX+ext_x; x1++) { + for (x3 = 0; x3 < LZ; x3++) { + if(x0 < T || x1 < LX) { + x2 = LY; + ix = g_ipt[x0][x1][x2][x3]; + + iy2=g_iup[ix][2]; + if((iy2 < gI_0_0_Lp1_0 || iy2 >= gI_0_0_Lp1_0 + T*LX*LZ) + && x0 < T && x1 < LX) { + printf("The DBW2 boundary is not correctly mapped in up y-direction %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy2); +return(-1); + } + itest[iy2]++; + if (itest[iy2]>1) { + printf("The DBW2 boundary is not correctly used up y itest = %d (%d %d 
%d %d) iy2 = %d ix = %d \n", + itest[iy2], x0, x1, x2, x3, iy2, ix); +return(-1); + } + + if(x0 == T && x1 < LX) { + iy0 = g_iup[ix][0]; + if(iy0 < gI_Lp1_0_L_0 || iy0 >= gI_Lp1_0_L_0 + LX*LZ) { + printf("The DBW2 boundary is not correctly mapped in up t-direction up y %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used up t up y itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + if(x0 == T+1 && x1 < LX) { + iy0 = g_idn[ix][0]; + if(iy0 < gI_m2_0_L_0 || iy0 >= gI_m2_0_L_0 + LX*LZ) { + printf("The DBW2 boundary is not correctly mapped in down t-direction up y %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used down t up y itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + + if(x1 == LX && x0 < T) { + iy1 = g_iup[ix][1]; + if(iy1 < gI_0_Lp1_L_0 || iy1 >= gI_0_Lp1_L_0 + T*LZ) { + printf("The DBW2 boundary is not correctly mapped in up x-direction up y %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used x up y up itest = %d (%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + if(x1 == LX+1 && x0 < T) { + iy1 = g_idn[ix][1]; + if(iy1 < gI_0_m2_L_0 || iy1 >= gI_0_m2_L_0 + T*LZ) { + printf("The DBW2 boundary is not correctly mapped in down x-direction up y %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used x down y up itest = %d (%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + + x2 = LY+1; + ix = g_ipt[x0][x1][x2][x3]; + iy2=g_idn[ix][2]; + if((iy2 < 
gI_0_0_m2_0 || iy2 >= gI_0_0_m2_0 + T*LX*LZ ) + && x0 < T && x1 < LX) { + printf("The DBW2 boundary is not correctly mapped in down y-direction %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy2); +return(-1); + } + itest[iy2]++; + if (itest[iy2]>1) { + printf("The DBW2 boundary is not correctly used down y itest = %d (%d %d %d %d) iy2 = %d ix = %d \n", + itest[iy2], x0, x1, x2, x3, iy2, ix); +return(-1); + } + + if(x0 == T && x1 < LX) { + iy0 = g_iup[ix][0]; + if(iy0 < gI_Lp1_0_m1_0 || iy0 >= gI_Lp1_0_m1_0 + LX*LZ) { + printf("The DBW2 boundary is not correctly mapped in up t-direction down y %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used up t down y itest = %d (%d %d %d %d) iy0 = %d ix = %d \n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + if(x0 == T+1 && x1 < LX) { + iy0 = g_idn[ix][0]; + if(iy0 < gI_m2_0_m1_0 || iy0 >= gI_m2_0_m1_0 + LX*LZ ) { + printf("The DBW2 boundary is not correctly mapped in down t-direction down y %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used down t down y itest = %d (%d %d %d %d) iy0 = %d ix = %d \n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + if(x1 == LX && x0 < T) { + iy1 = g_iup[ix][1]; + if(iy1 < gI_0_Lp1_m1_0 || iy1 >= gI_0_Lp1_m1_0 + T*LZ) { + printf("The DBW2 boundary is not correctly mapped in up x-direction down y %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used up x down y itest = %d (%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + if(x1 == LX+1 && x0 < T) { + iy1 = g_idn[ix][1]; + if(iy1 < gI_0_m2_m1_0 || iy1 >= gI_0_m2_m1_0 + T*LZ) { + printf("The DBW2 boundary is not correctly mapped in down x-direction down y %d %d %d %d %d 
%d\n", + x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used down x down y itest = %d (%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + } + } + } + } +#endif +#if ( defined PARALLELXYZT || defined PARALLELXYZ ) + for (x0 = 0; x0 < T+ext_t; x0++) { + for (x1 = 0; x1 < LX+ext_x; x1++) { + for (x2 = 0; x2 < LY+ext_y; x2++) { + bndcnt = 0; + if(x0 >= T) bndcnt++; + if(x1 >= LX) bndcnt++; + if(x2 >= LY) bndcnt++; + if(bndcnt < 2) { + x3 = LZ; + ix = g_ipt[x0][x1][x2][x3]; + + iy3=g_iup[ix][3]; + if(((iy3 < gI_0_0_0_Lp1 || iy3 >= gI_0_0_0_Lp1 + T*LX*LY) && bndcnt == 0) || + (x0 == T && (iy3 < gI_L_0_0_Lp1 || iy3 >= gI_L_0_0_Lp1 + LX*LY )) + ){ + printf("The DBW2 boundary is not correctly mapped in up z-direction %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy3); +return(-1); + } + itest[iy3]++; + if (itest[iy3]>1) { + printf("The DBW2 boundary is not correctly used up z itest = %d (%d %d %d %d) iy3 = %d ix = %d \n", + itest[iy3], x0, x1, x2, x3, iy3, ix); +return(-1); + } + if(x0 == T) { + iy0 = g_iup[ix][0]; + if(iy0 < gI_Lp1_0_0_L || iy0 >= gI_Lp1_0_0_L + LX*LY) { + printf("The DBW2 boundary is not correctly mapped in up t-direction up z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used up t up z itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + if(x0 == T+1) { + iy0 = g_idn[ix][0]; + if(iy0 < gI_m2_0_0_L || iy0 >= gI_m2_0_0_L + LX*LY) { + printf("The DBW2 boundary is not correctly mapped in down t-direction up z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used down t up z itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + 
+ if(x1 == LX) { + iy1 = g_iup[ix][1]; + if(iy1 < gI_0_Lp1_0_L || iy1 >= gI_0_Lp1_0_L + T*LY) { + printf("The DBW2 boundary is not correctly mapped in up x-direction up z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used x up z up itest = %d (%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + if(x1 == LX+1) { + iy1 = g_idn[ix][1]; + if(iy1 < gI_0_m2_0_L || iy1 >= gI_0_m2_0_L + T*LY ) { + printf("The DBW2 boundary is not correctly mapped in down x-direction up z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used x down z up itest = %d (%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + + if(x2 == LY) { + iy2 = g_iup[ix][2]; + if(iy2 < gI_0_0_Lp1_L || iy2 >= gI_0_0_Lp1_L + T*LX ) { + printf("The DBW2 boundary is not correctly mapped in up y-direction up z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy2); +return(-1); + } + itest[iy2]++; + if (itest[iy2]>1) { + printf("The DBW2 boundary is not correctly used y up z up itest = %d (%d %d %d %d) iy2 = %d ix = %d\n", + itest[iy2], x0, x1, x2, x3, iy2, ix); +return(-1); + } + } + if(x2 == LY+1) { + iy2 = g_idn[ix][2]; + if(iy2 < gI_0_0_m2_L || iy2 >= gI_0_0_m2_L + T*LX ) { + printf("The DBW2 boundary is not correctly mapped in down y-direction up z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy2); +return(-1); + } + itest[iy2]++; + if (itest[iy2]>1) { + printf("The DBW2 boundary is not correctly used y down z up itest = %d (%d %d %d %d) iy2 = %d ix = %d\n", + itest[iy2], x0, x1, x2, x3, iy2, ix); +return(-1); + } + } + + + x3 = LZ+1; + ix = g_ipt[x0][x1][x2][x3]; + iy3=g_idn[ix][3]; + if(((iy3 < gI_0_0_0_m2 || iy3 >= gI_0_0_0_m2 + T*LX*LY) && bndcnt == 0) || + (x0 == T && (iy3 < gI_L_0_0_m2 || iy3 >= gI_L_0_0_m2 + LX*LY)) || + 
(x0 == T+1 && (iy3 < gI_m1_0_0_m2 || iy3 >= gI_m1_0_0_m2 + LX*LY)) + ) { + printf("The DBW2 boundary is not correctly mapped in down z-direction %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy3); +return(-1); + } + itest[iy3]++; + if (itest[iy3]>1) { + printf("The DBW2 boundary is not correctly used down z itest = %d (%d %d %d %d) iy3 = %d ix = %d \n", + itest[iy3], x0, x1, x2, x3, iy3, ix); +return(-1); + } + + if(x0 == T) { + iy0 = g_iup[ix][0]; + if(iy0 < gI_Lp1_0_0_m1 || iy0 >= gI_Lp1_0_0_m1 + LX*LY) { + printf("The DBW2 boundary is not correctly mapped in up t-direction down z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used up t down z itest = %d (%d %d %d %d) iy0 = %d ix = %d \n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + if(x0 == T+1) { + iy0 = g_idn[ix][0]; + if(iy0 < gI_m2_0_0_m1 || iy0 >= gI_m2_0_0_m1 + LX*LY) { + printf("The DBW2 boundary is not correctly mapped in down t-direction down z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used down t down z itest = %d (%d %d %d %d) iy0 = %d ix = %d \n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + if(x1 == LX) { + iy1 = g_iup[ix][1]; + if(iy1 < gI_0_Lp1_0_m1 || iy1 >= gI_0_Lp1_0_m1 + T*LY ) { + printf("The DBW2 boundary is not correctly mapped in up x-direction down z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used up x down z itest = %d (%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + if(x1 == LX+1) { + iy1 = g_idn[ix][1]; + if(iy1 < gI_0_m2_0_m1 || iy1 >= gI_0_m2_0_m1 + T*LY ) { + printf("The DBW2 boundary is not correctly mapped in down x-direction down z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy1); 
+return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used down x down z itest = %d (%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + + if(x2 == LY) { + iy2 = g_iup[ix][2]; + if(iy2 < gI_0_0_Lp1_m1 || iy2 >= gI_0_0_Lp1_m1 + T*LX ) { + printf("The DBW2 boundary is not correctly mapped in up y-direction down z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy2); +return(-1); + } + itest[iy2]++; + if (itest[iy2]>1) { + printf("The DBW2 boundary is not correctly used y up z down itest = %d (%d %d %d %d) iy2 = %d ix = %d\n", + itest[iy2], x0, x1, x2, x3, iy2, ix); +return(-1); + } + } + if(x2 == LY+1) { + iy2 = g_idn[ix][2]; + if(iy2 < gI_0_0_m2_m1 || iy2 >= gI_0_0_m2_m1 + T*LX ) { + printf("The DBW2 boundary is not correctly mapped in down y-direction down z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy2); +return(-1); + } + itest[iy2]++; + if (itest[iy2]>1) { + printf("The DBW2 boundary is not correctly used y down z down itest = %d (%d %d %d %d) iy2 = %d ix = %d\n", + itest[iy2], x0, x1, x2, x3, iy2, ix); +return(-1); + } + } + } + } + } + } +#endif + } /* end of if dbw2>0 */ + for (ix = VOLUMEPLUSRAND; ix < (VOLUMEPLUSRAND) + g_dbw2rand; ix++){ + if (itest[ix]!=1) { + printf("The DBW2 boundary is not correctly used itest = %d ix = %d \n", itest[ix], ix); +return(-1); + } + } +#endif + + /* check of EO geometry */ + + for (ix=0;ix VOLUMEPLUSRAND/2 || iz0 < 0) { + printf("There is a problem with EO geometry in direction 0-\n"); + printf("%d\n", iz0); +return(-1); + } + stest[iz0] += 1; + + iy0 = g_iup[ix][0]; + iz0 = g_lexic2eosub[iy0]; + if(iz0 > VOLUMEPLUSRAND/2 || iz0 < 0) { + printf("There is a problem with EO geometry in direction 0+\n"); +return(-1); + } + stest[iz0] += 1; + + iy1 = g_idn[ix][1]; + iz1 = g_lexic2eosub[iy1]; + if(iz1 > VOLUMEPLUSRAND/2 || iz1 < 0) { + printf("There is a problem with EO geometry in direction 1-\n"); +return(-1); + } + stest[iz1] += 1; + + 
iy1 = g_iup[ix][1]; + iz1 = g_lexic2eosub[iy1]; + if(iz1 >= VOLUMEPLUSRAND/2 || iz1 < 0) { + printf("There is a problem with EO geometry in direction 1+\n"); +return(-1); + } + stest[iz1] += 1; + + iy2 = g_idn[ix][2]; + iz2 = g_lexic2eosub[iy2]; + if(iz2 > VOLUMEPLUSRAND/2 || iz2 < 0) { + printf("There is a problem with EO geometry in direction 2-\n"); +return(-1); + } + stest[iz2] += 1; + + iy2 = g_iup[ix][2]; + iz2 = g_lexic2eosub[iy2]; + if(iz2 > VOLUMEPLUSRAND/2 || iz2 < 0) { + printf("There is a problem with EO geometry in direction 2+\n"); +return(-1); + } + stest[iz2] += 1; + + + iy3 = g_idn[ix][3]; + iz3 = g_lexic2eosub[iy3]; + if(iz3 > VOLUMEPLUSRAND/2 || iz3 < 0) { + printf("There is a problem with EO geometry in direction 3-\n"); +return(-1); + } + stest[iz3] += 1; + + iy3 = g_iup[ix][3]; + iz3 = g_lexic2eosub[iy3]; + if(iz3 > VOLUMEPLUSRAND/2 || iz3 < 0) { + printf("There is a problem with EO geometry in direction 3+\n"); +return(-1); + } + stest[iz3] += 1; + } + iz0 = 0; + for(j = 0; j < (VOLUME)/2; j++) { + iz0 += stest[j]; + } + if(iz0 != 8*(VOLUME)/2-RAND/2) { + printf("There is a problem in the first part of the even odd geometry\n"); + printf("%d is not equal to 8*(VOLUME)/2-RAND/2=%d\n", iz0, 8*(VOLUME)/2-RAND/2); +return(-1); + } + + for(j = VOLUME/2; j < (VOLUME+RAND)/2; j++) { + if(stest[j] != 1) { + printf("There is a problem in the first boundary of the even odd geometry\n"); +return(-1); + } + } + + + for (ix=0;ix VOLUMEPLUSRAND/2 || iz0 < 0) { + printf("There is a problem with EO geometry in direction 0-\n"); +return(-1); + } + stest[iz0] += 1; + + iy0 = g_iup[ix][0]; + iz0 = g_lexic2eosub[iy0]; + if(iz0 > VOLUMEPLUSRAND/2 || iz0 < 0) { + printf("There is a problem with EO geometry in direction 0+\n"); +return(-1); + } + stest[iz0] += 1; + + iy1 = g_idn[ix][1]; + iz1 = g_lexic2eosub[iy1]; + if(iz1 > VOLUMEPLUSRAND/2 || iz1 < 0) { + printf("There is a problem with EO geometry in direction 1-\n"); +return(-1); + } + stest[iz1] += 1; + + iy1 
= g_iup[ix][1]; + iz1 = g_lexic2eosub[iy1]; + if(iz1 > VOLUMEPLUSRAND/2 || iz1 < 0) { + printf("There is a problem with EO geometry in direction 1+\n"); +return(-1); + } + stest[iz1] += 1; + + iy2 = g_idn[ix][2]; + iz2 = g_lexic2eosub[iy2]; + if(iz2 > VOLUMEPLUSRAND/2 || iz2 < 0) { + printf("There is a problem with EO geometry in direction 2-\n"); +return(-1); + } + stest[iz2] += 1; + + iy2 = g_iup[ix][2]; + iz2 = g_lexic2eosub[iy2]; + if(iz2 > VOLUMEPLUSRAND/2 || iz2 < 0) { + printf("There is a problem with EO geometry in direction 2+\n"); +return(-1); + } + stest[iz2] += 1; + + + iy3 = g_idn[ix][3]; + iz3 = g_lexic2eosub[iy3]; + if(iz3 > VOLUMEPLUSRAND/2 || iz3 < 0) { + printf("There is a problem with EO geometry in direction 3-\n"); +return(-1); + } + stest[iz3] += 1; + + iy3 = g_iup[ix][3]; + iz3 = g_lexic2eosub[iy3]; + if(iz3 > VOLUMEPLUSRAND/2 || iz3 < 0) { + printf("There is a problem with EO geometry in direction 3+\n"); +return(-1); + } + stest[iz3] += 1; + } + iz0 = 0; + for(j = 0; j < (VOLUME)/2; j++) { + iz0 += stest[j]; + } + if(iz0 != 8*(VOLUME)/2-RAND/2) { + printf("There is a problem in the second part of the even odd geometry\n"); + printf("%d is not equal to 8*(VOLUME)/2-RAND/2=%d\n", iz0, 8*(VOLUME)/2-RAND/2); +return(-1); + } + + for(j = VOLUME/2; j < (VOLUME+RAND)/2; j++) { + if(stest[j] != 1) { + printf("There is a problem in the second boundary of the even odd geometry\n"); +return(-1); + } + } + + if(g_proc_id == 0 ) { + printf("# The lattice is correctly mapped by the index arrays\n\n"); + } + fflush(stdout); + + free(stest); + free(itest); + + return(0); +} + +#else /* _INDEX_INDEP_GEOM */ + +int check_geometry() +{ +#ifdef XLC +#pragma execution_frequency(very_low) +#endif + int ix, j; + int * stest; + int * itest; + int x0,x1,x2,x3; + int iy0,iy1,iy2,iy3; + int iz0,iz1,iz2,iz3; + int bndcnt = 0; + + itest = calloc(VOLUMEPLUSRAND + g_dbw2rand, sizeof(int)); + stest = calloc((VOLUMEPLUSRAND)/2, sizeof(int)); + + for (ix=0;ix= VOLUME)) { + 
printf("The index ipt is out of range (%d, %d, %d, %d) ix = %d\n", x0, x1, x2, x3, ix); +return(-1); + } + + itest[ix]+=1; + } + } + } + } + + for (ix = 0; ix < VOLUME; ix++){ + if (itest[ix]!=1){ + printf("The index ipt is not one-to-one %d\n", itest[ix]); +return(-1); + } + } + + for (x0 = 0; x0 < T; x0++){ + for (x1 = 0; x1 < LX; x1++){ + for (x2 = 0; x2 < LY; x2++){ + for (x3 = 0; x3 < LZ; x3++){ + ix=g_ipt[x0][x1][x2][x3]; + + iy0=g_iup[ix][0]; +#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + if(x0!=T-1) { + iz0=g_ipt[(x0+1)%T][x1][x2][x3]; + } + else { + iz0 = g_ipt[T][x1][x2][x3]; + itest[iy0]++; + if(iy0 < VOLUME || iy0 >= VOLUME + LX*LY*LZ) { + printf("Boundary for time direction up is wrong %d %d %d\n", + iy0 < VOLUME, iy0 >= VOLUME+LX*LY*LZ, iy0); +return(-1); + } + } +#else + iz0=g_ipt[(x0+1)%T][x1][x2][x3]; +#endif + + iy1=g_iup[ix][1]; +#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + if(x1 !=LX-1) { + iz1=g_ipt[x0][(x1+1)%LX][x2][x3]; + } + else { + iz1=g_ipt[x0][LX][x2][x3]; + itest[iy1]++; + if(iy1 < VOLUME + 2*LX*LY*LZ || iy1 >= VOLUME + 2*LX*LY*LZ + T*LY*LZ) { + printf("Boundary for x direction up is wrong %d %d %d\n", + iy1 < VOLUME + 2*LX*LY*LZ, iy1 >= VOLUME + 2*LX*LY*LZ + T*LY*LZ, iy1); +return(-1); + } + } +#else + iz1=g_ipt[x0][(x1+1)%LX][x2][x3]; +#endif + + iy2=g_iup[ix][2]; +#if (defined PARALLELXYT || defined PARALLELXYZT) + if(x2 !=LY-1) { + iz2=g_ipt[x0][x1][(x2+1)%LY][x3]; + } + else { + iz2=g_ipt[x0][x1][LY][x3]; + itest[iy2]++; + if(iy2 < VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ|| iy2 >= VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ) { + printf("Boundary for y direction up is wrong %d %d %d\n", + iy2 < VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ, iy2 > VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ, iy2); +return(-1); + } + } +#else + iz2=g_ipt[x0][x1][(x2+1)%LY][x3]; +#endif + + iy3=g_iup[ix][3]; +#if defined PARALLELXYZT + if(x3 !=LZ-1) { + iz3=g_ipt[x0][x1][x2][(x3+1)%LZ]; + } + else { + 
iz3=g_ipt[x0][x1][x2][LZ]; + itest[iy3]++; + if(iy3 < VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ || iy3 >= VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY) { + printf("Boundary for z direction up is wrong %d %d %d\n", + iy3 < VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ, iy3 > VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY, iy3); +return(-1); + } + } +#else + iz3=g_ipt[x0][x1][x2][(x3+1)%LZ]; +#endif + + if ((iy0!=iz0)||(iy1!=iz1)||(iy2!=iz2)||(iy3!=iz3)|| + (g_idn[iy0][0]!=ix)||(g_idn[iy1][1]!=ix)||(g_idn[iy2][2]!=ix)||(g_idn[iy3][3]!=ix)) { + printf("The index iup is incorrect\n"); + printf("%d %d %d %d\n", (iy0!=iz0), (iy1!=iz1), (iy2!=iz2), (iy3!=iz3)); + printf("%d %d %d %d %d %d\n", x0, x1, x2, x3, iy1, iz1); +return(-1); + } + + iy0=g_idn[ix][0]; +#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + if(x0 !=0) { + iz0=g_ipt[(x0+T-1)%T][x1][x2][x3]; + } + else { + iz0 = g_ipt[T+1][x1][x2][x3];; + itest[iy0]++; + if(iy0 < VOLUME + LX*LY*LZ || iy0 >= VOLUME + 2*LX*LY*LZ) { + printf("Boundary for time direction is wrong %d %d %d\n", + iy0 < VOLUME + LX*LY*LZ, iy0 >= VOLUME + 2*LX*LY*LZ, iy0); +return(-1); + } + } +#else + iz0=g_ipt[(x0+T-1)%T][x1][x2][x3]; +#endif + + iy1=g_idn[ix][1]; +#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + if(x1 !=0) { + iz1=g_ipt[x0][(x1+LX-1)%LX][x2][x3]; + } + else { + iz1 = g_ipt[x0][LX+1][x2][x3]; + itest[iy1]++; + if(iy1 < VOLUME + 2*LX*LY*LZ + T*LY*LZ || iy1 >= VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ) { + printf("Boundary for x direction is wrong %d %d %d\n", + iy1 < VOLUME + 2*LX*LY*LZ + T*LY*LZ, iy1 >= VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ, iy1); +return(-1); + } + } +#else + iz1=g_ipt[x0][(x1+LX-1)%LX][x2][x3]; +#endif + iy2=g_idn[ix][2]; +#if (defined PARALLELXYT || defined PARALLELXYZT) + if(x2 !=0) { + iz2=g_ipt[x0][x1][(x2+LY-1)%LY][x3]; + } + else { + iz2 = g_ipt[x0][x1][LY+1][x3]; + itest[iy2]++; + if(iy2 < VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ|| 
iy2 >= VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ) { + printf("Boundary for y direction is wrong %d %d %d\n", + iy2 < VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ, iy2 >= VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ, iy2); +return(-1); + } + } +#else + iz2=g_ipt[x0][x1][(x2+LY-1)%LY][x3]; +#endif + + iy3=g_idn[ix][3]; +#if defined PARALLELXYZT + if(x3 !=0) { + iz3=g_ipt[x0][x1][x2][(x3+LZ-1)%LZ]; + } + else { + iz3 = g_ipt[x0][x1][x2][LZ+1]; + itest[iy3]++; + if(iy3 < VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY|| iy3 >= VOLUME + RAND) { + printf("Boundary for z direction is wrong %d %d %d\n", + iy3 < VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY, iy3 >= VOLUME + RAND, iy3); + printf("%d %d %d %d %d\n", x0, x1, x2, x3, ix); +return(-1); + } + } +#else + iz3=g_ipt[x0][x1][x2][(x3+LZ-1)%LZ]; +#endif + + if ((iy0!=iz0)||(iy1!=iz1)||(iy2!=iz2)||(iy3!=iz3)|| + (g_iup[iy0][0]!=ix)||(g_iup[iy1][1]!=ix)||(g_iup[iy2][2]!=ix)||(g_iup[iy3][3]!=ix)) { + printf("The index idn is incorrect\n"); + printf("%d %d %d %d\n", (iy0!=iz0), (iy1!=iz1), (iy2!=iz2), (iy3!=iz3)); + printf("%d %d %d %d\n", iy1, iz1, iy2, iz2); + printf("%d %d %d %d %d\n", x0, x1, x2, x3, ix); +return(-1); + } + + /* The edges */ + /* In case of PARALLELT there is actually no edge to take care of */ +#if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + if(x0 == 0) { + iy0 = g_idn[ g_idn[ix][1] ][0]; + if(x1 != 0) { + iz0 = g_ipt[T+1][(x1+LX-1)%LX][x2][x3]; + } + else { + iz0 = g_ipt[T+1][LX+1][x2][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge -t -x has an error\n"); +return(-1); + } + + iy0 = g_idn[ g_iup[ix][1] ][0]; + if(x1 != LX-1) { + iz0 = g_ipt[T+1][(x1+1)%LX][x2][x3]; + } + else { + iz0 = g_ipt[T+1][LX][x2][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge -t +x has an error\n"); +return(-1); + } + } + + if(x0 == T-1) { + iy0 = g_iup[ g_idn[ix][1] ][0]; + if(x1 != 0) { + iz0 = g_ipt[T][(x1+LX-1)%LX][x2][x3]; + } + else { + iz0 = 
g_ipt[T][LX+1][x2][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge +t -x has an error\n"); +return(-1); + } + + iy0 = g_iup[ g_iup[ix][1] ][0]; + if(x1 != LX-1) { + iz0 = g_ipt[T][(x1+1)%LX][x2][x3]; + } + else { + iz0 = g_ipt[T][LX][x2][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge +t +x has an error\n"); +return(-1); + } + } + +#endif + +#if (defined PARALLELXYT || defined PARALLELXYZT) + if(x0 == 0) { + iy0 = g_idn[ g_idn[ix][2] ][0]; + if(x2 != 0) { + iz0 = g_ipt[T+1][x1][(x2+LY-1)%LY][x3]; + } + else { + iz0 = g_ipt[T+1][x1][LY+1][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge -t -y has an error\n"); +return(-1); + } + + iy0 = g_idn[ g_iup[ix][2] ][0]; + if(x2 != LY-1) { + iz0 = g_ipt[T+1][x1][(x2+1)%LY][x3]; + } + else { + iz0 = g_ipt[T+1][x1][LY][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge -t +y has an error\n"); +return(-1); + } + } + + if(x0 == T-1) { + iy0 = g_iup[ g_idn[ix][2] ][0]; + if(x2 != 0) { + iz0 = g_ipt[T][x1][(x2+LY-1)%LY][x3]; + } + else { + iz0 = g_ipt[T][x1][LY+1][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge +t -y has an error\n"); +return(-1); + } + + iy0 = g_iup[ g_iup[ix][2] ][0]; + if(x2 != LY-1) { + iz0 = g_ipt[T][x1][(x2+1)%LY][x3]; + } + else { + iz0 = g_ipt[T][x1][LY][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge +t +y has an error\n"); +return(-1); + } + } + + if(x1 == 0) { + iy0 = g_idn[ g_idn[ix][2] ][1]; + if(x2 != 0) { + iz0 = g_ipt[x0][LX+1][(x2+LY-1)%LY][x3]; + } + else { + iz0 = g_ipt[x0][LX+1][LY+1][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge -x -y has an error\n"); +return(-1); + } + iy0 = g_idn[ g_iup[ix][2] ][1]; + if(x2 != LY-1) { + iz0 = g_ipt[x0][LX+1][(x2+1)%LY][x3]; + } + else { + iz0 = g_ipt[x0][LX+1][LY][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge -x +y has an error\n"); +return(-1); + } + } + if(x1 == LX-1) { + iy0 = g_iup[ g_idn[ix][2] ][1]; + if(x2 != 0) { + iz0 = g_ipt[x0][LX][(x2+LY-1)%LY][x3]; + } + 
else { + iz0 = g_ipt[x0][LX][LY+1][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge +x -y has an error\n"); +return(-1); + } + + iy0 = g_iup[ g_iup[ix][2] ][1]; + if(x2 != LY-1) { + iz0 = g_ipt[x0][LX][(x2+1)%LY][x3]; + } + else { + iz0 = g_ipt[x0][LX][LY][x3]; + itest[iy0]++; + } + if(iz0 != iy0) { + printf("Edge +x +y has an error\n"); +return(-1); + } + } +#endif +#if defined PARALLELXYZT + if(x0 == 0) { + iy0 = g_idn[ g_idn[ix][3] ][0]; + if(x3 != 0) { + iz0 = g_ipt[T+1][x1][x2][(x3+LZ-1)%LZ]; + } + else { + iz0 = g_ipt[T+1][x1][x2][LZ+1]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge -t -z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); +return(-1); + } + } + if(iz0 != iy0) { + printf("Edge -t -z has an error\n"); +return(-1); + } + + iy0 = g_idn[ g_iup[ix][3] ][0]; + if(x3 != LZ-1) { + iz0 = g_ipt[T+1][x1][x2][(x3+1)%LZ]; + } + else { + iz0 = g_ipt[T+1][x1][x2][LZ]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge -t +z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); + printf("ix = %d, iz0 = %d %d\n", ix, iz0, g_iup[ix][3]); +return(-1); + } + } + if(iz0 != iy0) { + printf("Edge -t +z has an error\n"); +return(-1); + } + } + + if(x0 == T-1) { + iy0 = g_iup[ g_idn[ix][3] ][0]; + if(x3 != 0) { + iz0 = g_ipt[T][x1][x2][(x3+LZ-1)%LZ]; + } + else { + iz0 = g_ipt[T][x1][x2][LZ+1]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge +t -z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); +return(-1); + } + } + if(iz0 != iy0) { + printf("Edge +t -z has an error\n"); +return(-1); + } + + iy0 = g_iup[ g_iup[ix][0] ][3]; + if(x3 != LZ-1) { + iz0 = g_ipt[T][x1][x2][(x3+1)%LZ]; + } + else { + iz0 = g_ipt[T][x1][x2][LZ]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge +t +z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); +return(-1); + } + } + if(iz0 != iy0) { + printf("Edge +t +z has an error\n"); +return(-1); + } + } + + 
if(x1 == 0) { + iy0 = g_idn[ g_idn[ix][3] ][1]; + if(x3 != 0) { + iz0 = g_ipt[x0][LX+1][x2][(x3+LZ-1)%LZ]; + } + else { + iz0 = g_ipt[x0][LX+1][x2][LZ+1]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge -x -z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); +return(-1); + } + } + if(iz0 != iy0) { + printf("Edge -x -z has an error\n"); +return(-1); + } + iy0 = g_idn[ g_iup[ix][3] ][1]; + if(x3 != LZ-1) { + iz0 = g_ipt[x0][LX+1][x2][(x3+1)%LZ]; + } + else { + iz0 = g_ipt[x0][LX+1][x2][LZ]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge -x +z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); +return(-1); + } + } + if(iz0 != iy0) { + printf("Edge -x +z has an error\n"); +return(-1); + } + } + if(x1 == LX-1) { + iy0 = g_iup[ g_idn[ix][3] ][1]; + if(x3 != 0) { + iz0 = g_ipt[x0][LX][x2][(x3+LZ-1)%LZ]; + } + else { + iz0 = g_ipt[x0][LX][x2][LZ+1]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge +x -z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); +return(-1); + } + } + if(iz0 != iy0) { + printf("Edge +x -z has an error\n"); +return(-1); + } + + iy0 = g_iup[ g_iup[ix][3] ][1]; + if(x3 != LZ-1) { + iz0 = g_ipt[x0][LX][x2][(x3+1)%LZ]; + } + else { + iz0 = g_ipt[x0][LX][x2][LZ]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge +x +z has itest = %d at %d, %d, %d, %d, iy0 = %d iz0 = %d giup[%d][3] = %d ix = %d\n", + itest[iy0], x0, x1, x2, x3, iy0, iz0, ix, g_iup[ix][3], ix); +return(-1); + } + } + if(iz0 != iy0) { + printf("Edge +x +z has an error\n"); +return(-1); + } + } + + if(x2 == 0) { + iy0 = g_idn[ g_idn[ix][3] ][2]; + if(x3 != 0) { + iz0 = g_ipt[x0][x1][LY+1][(x3+LZ-1)%LZ]; + } + else { + iz0 = g_ipt[x0][x1][LY+1][LZ+1]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge -y -z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); +return(-1); + } + } + if(iz0 != iy0) { + printf("Edge -y -z has an error\n"); +return(-1); + } + iy0 = 
g_idn[ g_iup[ix][3] ][2]; + if(x3 != LZ-1) { + iz0 = g_ipt[x0][x1][LY+1][(x3+1)%LZ]; + } + else { + iz0 = g_ipt[x0][x1][LY+1][LZ]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge -y +z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); +return(-1); + } + } + if(iz0 != iy0) { + printf("Edge -y +z has an error\n"); +return(-1); + } + } + if(x2 == LY-1) { + iy0 = g_iup[ g_idn[ix][3] ][2]; + if(x3 != 0) { + iz0 = g_ipt[x0][x1][LY][(x3+LZ-1)%LZ]; + } + else { + iz0 = g_ipt[x0][x1][LY][LZ+1]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge +y -z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); +return(-1); + } + } + if(iz0 != iy0) { + printf("Edge +y -z has an error\n"); +return(-1); + } + + iy0 = g_iup[ g_iup[ix][3] ][2]; + if(x3 != LZ-1) { + iz0 = g_ipt[x0][x1][LY][(x3+1)%LZ]; + } + else { + iz0 = g_ipt[x0][x1][LY][LZ]; + itest[iy0]++; + if(itest[iy0]>1) { + printf("Edge +y +z has itest = %d at %d, %d, %d, %d, iy0 = %d\n", itest[iy0], x0, x1, x2, x3, iy0); +return(-1); + } + } + if(iz0 != iy0) { + printf("Edge +y +z has an error\n"); +return(-1); + } + } +#endif + } + } + } + } + + for (ix = VOLUME; ix < (VOLUME+RAND+EDGES); ix++){ + if (itest[ix]!=1) { + printf("The boundary is not correctly used itest = %d ix = %d %d %d %d\n", itest[ix], ix, VOLUME, RAND, EDGES); +return(-1); + } + } + + for (ix=0;ix 0) { + for (x1 = 0; x1 < LX; x1++) { + for (x2 = 0; x2 < LY; x2++) { + for (x3 = 0; x3 < LZ; x3++) { + x0 = T; + ix=g_ipt[x0][x1][x2][x3]; + + iy0=g_iup[ix][0]; + if(iy0 < VOLUMEPLUSRAND || iy0 >= VOLUMEPLUSRAND + LX*LY*LZ) { + printf("The DBW2 boundary is not correctly mapped in up t-direction %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + + x0 = T+1; + ix=g_ipt[x0][x1][x2][x3]; + + 
iy0=g_idn[ix][0]; + if(iy0 < VOLUMEPLUSRAND + LX*LZ*LY|| iy0 >= VOLUMEPLUSRAND + 2*LX*LY*LZ) { + printf("The DBW2 boundary is not correctly mapped in down t-direction %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + } + } + +#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + for (x0 = 0; x0 < T+2; x0++) { + for (x2 = 0; x2 < LY; x2++) { + for (x3 = 0; x3 < LZ; x3++) { + x1 = LX; + ix = g_ipt[x0][x1][x2][x3]; + + iy1=g_iup[ix][1]; + if((iy1 < VOLUMEPLUSRAND + 2*LX*LY*LZ || iy1 >= VOLUMEPLUSRAND + 2*LX*LY*LZ + T*LY*LZ) && x0 < T) { + printf("The DBW2 boundary is not correctly mapped in up x-direction %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used up x itest = %d (%d %d %d %d) iy1 = %d ix = %d \n", itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + + if(x0 == T) { + iy0 = g_iup[ix][0]; + if(iy0 < VOLUMEPLUSRAND + RAND || iy0 >= VOLUMEPLUSRAND + RAND + LY*LZ) { + printf("The DBW2 boundary is not correctly mapped in up t-direction up x %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used up t up x itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + if(x0 == T+1) { + iy0 = g_idn[ix][0]; + if(iy0 < VOLUMEPLUSRAND + RAND + 2*LY*LZ|| iy0 >= VOLUMEPLUSRAND + RAND + 3*LY*LZ) { + printf("The DBW2 boundary is not correctly mapped in down t-direction up x %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used down t up x itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", itest[iy0], x0, x1, x2, 
x3, iy0, ix); +return(-1); + } + } + + + x1 = LX+1; + ix = g_ipt[x0][x1][x2][x3]; + + iy1=g_idn[ix][1]; + if((iy1 < VOLUMEPLUSRAND + 2*LX*LY*LZ + T*LY*LZ|| iy1 >= VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ) && x0 < T) { + printf("The DBW2 boundary is not correctly mapped in down x-direction %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used down x itest = %d (%d %d %d %d) iy1 = %d ix = %d \n", itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + + if(x0 == T) { + iy0 = g_iup[ix][0]; + if(iy0 < VOLUMEPLUSRAND + RAND + LY*LZ || iy0 >= VOLUMEPLUSRAND + RAND + 2*LY*LZ) { + printf("The DBW2 boundary is not correctly mapped in up t-direction down x %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used up t down x itest = %d (%d %d %d %d) iy0 = %d ix = %d \n", itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + if(x0 == T+1) { + iy0 = g_idn[ix][0]; + if(iy0 < VOLUMEPLUSRAND + RAND + 3*LY*LZ || iy0 >= VOLUMEPLUSRAND + RAND + 4*LY*LZ) { + printf("The DBW2 boundary is not correctly mapped in down t-direction down x %d %d %d %d %d %d\n", x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used down t down xitest = %d (%d %d %d %d) iy0 = %d ix = %d \n", itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + } + } + } +#endif + +#if (defined PARALLELXYT || defined PARALLELXYZT) + + for (x0 = 0; x0 < T+2; x0++) { + for (x1 = 0; x1 < LX+2; x1++) { + for (x3 = 0; x3 < LZ; x3++) { + if(x0 < T || x1 < LX) { + x2 = LY; + ix = g_ipt[x0][x1][x2][x3]; + + iy2=g_iup[ix][2]; + if((iy2 < VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ || iy2 >= VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ) + && x0 < T && x1 < LX) { + printf("The DBW2 boundary is not correctly mapped in up y-direction %d %d %d %d %d %d\n", + 
x0, x1, x2, x3, ix, iy2); +return(-1); + } + itest[iy2]++; + if (itest[iy2]>1) { + printf("The DBW2 boundary is not correctly used up y itest = %d (%d %d %d %d) iy2 = %d ix = %d \n", + itest[iy2], x0, x1, x2, x3, iy2, ix); +return(-1); + } + + if(x0 == T && x1 < LX) { + iy0 = g_iup[ix][0]; + if(iy0 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ || iy0 >= VOLUMEPLUSRAND+g_dbw2rand) { + printf("The DBW2 boundary is not correctly mapped in up t-direction up y %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used up t up y itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + if(x0 == T+1 && x1 < LX) { + iy0 = g_idn[ix][0]; + if(iy0 < VOLUMEPLUSRAND || iy0 >= VOLUMEPLUSRAND+g_dbw2rand) { + printf("The DBW2 boundary is not correctly mapped in down t-direction up y %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used down t up y itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + + if(x1 == LX && x0 < T) { + iy1 = g_iup[ix][1]; + if(iy1 < VOLUMEPLUSRAND || iy1 >= VOLUMEPLUSRAND + g_dbw2rand) { + printf("The DBW2 boundary is not correctly mapped in up x-direction up y %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used x up y up itest = %d (%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + if(x1 == LX+1 && x0 < T) { + iy1 = g_idn[ix][1]; + if(iy1 < VOLUMEPLUSRAND || iy1 >= VOLUMEPLUSRAND+g_dbw2rand) { + printf("The DBW2 boundary is not correctly mapped in down x-direction up y %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not 
correctly used x down y up itest = %d (%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + + + x2 = LY+1; + ix = g_ipt[x0][x1][x2][x3]; + iy2=g_idn[ix][2]; + if((iy2 < VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ|| iy2 >= VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ ) + && x0 < T && x1 < LX) { + printf("The DBW2 boundary is not correctly mapped in down y-direction %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy2); +return(-1); + } + itest[iy2]++; + if (itest[iy2]>1) { + printf("The DBW2 boundary is not correctly used down y itest = %d (%d %d %d %d) iy2 = %d ix = %d \n", + itest[iy2], x0, x1, x2, x3, iy2, ix); +return(-1); + } + + if(x0 == T && x1 < LX) { + iy0 = g_iup[ix][0]; + if(iy0 < VOLUMEPLUSRAND || iy0 >= VOLUMEPLUSRAND+g_dbw2rand) { + printf("The DBW2 boundary is not correctly mapped in up t-direction down y %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used up t down y itest = %d (%d %d %d %d) iy0 = %d ix = %d \n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + if(x0 == T+1 && x1 < LX) { + iy0 = g_idn[ix][0]; + if(iy0 < VOLUMEPLUSRAND || iy0 >= VOLUMEPLUSRAND+g_dbw2rand) { + printf("The DBW2 boundary is not correctly mapped in down t-direction down y %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used down t down y itest = %d (%d %d %d %d) iy0 = %d ix = %d \n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + if(x1 == LX && x0 < T) { + iy1 = g_iup[ix][1]; + if(iy1 < VOLUMEPLUSRAND || iy1 >= VOLUMEPLUSRAND+g_dbw2rand) { + printf("The DBW2 boundary is not correctly mapped in up x-direction down y %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used up x down y itest = %d 
(%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + if(x1 == LX+1 && x0 < T) { + iy1 = g_idn[ix][1]; + if(iy1 < VOLUMEPLUSRAND || iy1 >= VOLUMEPLUSRAND+g_dbw2rand) { + printf("The DBW2 boundary is not correctly mapped in down x-direction down y %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used down x down y itest = %d (%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + } + } + } + } +#endif +#ifdef PARALLELXYZT + for (x0 = 0; x0 < T+2; x0++) { + for (x1 = 0; x1 < LX+2; x1++) { + for (x2 = 0; x2 < LY+2; x2++) { + bndcnt = 0; + if(x0 >= T) bndcnt++; + if(x1 >= LX) bndcnt++; + if(x2 >= LY) bndcnt++; + if(bndcnt < 2) { + x3 = LZ; + ix = g_ipt[x0][x1][x2][x3]; + + iy3=g_iup[ix][3]; + if(((iy3 < VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ || + iy3 >= VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY) && + bndcnt == 0) || + (x0 == T && (iy3 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 4*LX*LY || + iy3 >= VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 5*LX*LY )) +/* ||(x0 == T+1 && (iy3 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 5*LX*LY || */ +/* iy3 >= VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 6*LX*LY)) */ + ){ + printf("The DBW2 boundary is not correctly mapped in up z-direction %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy3); +return(-1); + } + itest[iy3]++; + if (itest[iy3]>1) { + printf("The DBW2 boundary is not correctly used up z itest = %d (%d %d %d %d) iy3 = %d ix = %d \n", + itest[iy3], x0, x1, x2, x3, iy3, ix); +return(-1); + } + + if(x0 == T) { + iy0 = g_iup[ix][0]; + if(iy0 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ || + iy0 >= VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + LX*LY) { + printf("The DBW2 boundary is not correctly mapped in up t-direction up z %d %d %d %d %d %d\n", + x0, x1, x2, 
x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used up t up z itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + if(x0 == T+1) { + iy0 = g_idn[ix][0]; + if(iy0 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + LX*LY || + iy0 >= VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 2*LX*LY) { + printf("The DBW2 boundary is not correctly mapped in down t-direction up z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used down t up z itest = %d (%d %d %d %d) iy0 = %d ix = %d\n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + + if(x1 == LX) { + iy1 = g_iup[ix][1]; + if(iy1 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 4*T*LY|| + iy1 >= VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 5*T*LY) { + printf("The DBW2 boundary is not correctly mapped in up x-direction up z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used x up z up itest = %d (%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + if(x1 == LX+1) { + iy1 = g_idn[ix][1]; + if(iy1 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 6*T*LY || + iy1 >= VOLUMEPLUSRAND + + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 7*T*LY ) { + printf("The DBW2 boundary is not correctly mapped in down x-direction up z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used x down z up itest = %d (%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + + if(x2 == LY) { + iy2 = g_iup[ix][2]; + if(iy2 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY 
+ 8*T*LY + 4*T*LX || + iy2 >= VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 5*T*LX ) { + printf("The DBW2 boundary is not correctly mapped in up y-direction up z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy2); +return(-1); + } + itest[iy2]++; + if (itest[iy2]>1) { + printf("The DBW2 boundary is not correctly used y up z up itest = %d (%d %d %d %d) iy2 = %d ix = %d\n", + itest[iy2], x0, x1, x2, x3, iy2, ix); +return(-1); + } + } + if(x2 == LY+1) { + iy2 = g_idn[ix][2]; + if(iy2 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 6*T*LX || + iy2 >= VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 7*T*LX ) { + printf("The DBW2 boundary is not correctly mapped in down y-direction up z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy2); +return(-1); + } + itest[iy2]++; + if (itest[iy2]>1) { + printf("The DBW2 boundary is not correctly used y down z up itest = %d (%d %d %d %d) iy2 = %d ix = %d\n", + itest[iy2], x0, x1, x2, x3, iy2, ix); +return(-1); + } + } + + + x3 = LZ+1; + ix = g_ipt[x0][x1][x2][x3]; + iy3=g_idn[ix][3]; + if(((iy3 < VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY|| + iy3 >= VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + 2*T*LX*LY) && + bndcnt == 0) || + (x0 == T && (iy3 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 6*LX*LY|| + iy3 >= VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 7*LX*LY)) || + (x0 == T+1 && (iy3 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 7*LX*LY|| + iy3 >= VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY)) + ) { + printf("The DBW2 boundary is not correctly mapped in down z-direction %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy3); +return(-1); + } + itest[iy3]++; + if (itest[iy3]>1) { + printf("The DBW2 boundary is not correctly used down z itest = %d (%d %d %d %d) iy3 = %d ix = %d \n", + itest[iy3], x0, x1, x2, x3, iy3, ix); +return(-1); + } + + if(x0 == T) { + iy0 = g_iup[ix][0]; + if(iy0 < 
VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 2*LX*LY || + iy0 >= VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 3*LX*LY) { + printf("The DBW2 boundary is not correctly mapped in up t-direction down z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used up t down z itest = %d (%d %d %d %d) iy0 = %d ix = %d \n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + if(x0 == T+1) { + iy0 = g_idn[ix][0]; + if(iy0 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 3*LX*LY || + iy0 >= VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 4*LX*LY) { + printf("The DBW2 boundary is not correctly mapped in down t-direction down z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy0); +return(-1); + } + itest[iy0]++; + if (itest[iy0]>1) { + printf("The DBW2 boundary is not correctly used down t down z itest = %d (%d %d %d %d) iy0 = %d ix = %d \n", + itest[iy0], x0, x1, x2, x3, iy0, ix); +return(-1); + } + } + if(x1 == LX) { + iy1 = g_iup[ix][1]; + if(iy1 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 5*T*LY || + iy1 >= VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 6*T*LY ) { + printf("The DBW2 boundary is not correctly mapped in up x-direction down z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The DBW2 boundary is not correctly used up x down z itest = %d (%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + if(x1 == LX+1) { + iy1 = g_idn[ix][1]; + if(iy1 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 7*T*LY || + iy1 >= VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY ) { + printf("The DBW2 boundary is not correctly mapped in down x-direction down z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy1); +return(-1); + } + itest[iy1]++; + if (itest[iy1]>1) { + printf("The 
DBW2 boundary is not correctly used down x down z itest = %d (%d %d %d %d) iy1 = %d ix = %d\n", + itest[iy1], x0, x1, x2, x3, iy1, ix); +return(-1); + } + } + + if(x2 == LY) { + iy2 = g_iup[ix][2]; + if(iy2 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 5*T*LX || + iy2 >= VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 6*T*LX ) { + printf("The DBW2 boundary is not correctly mapped in up y-direction down z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy2); +return(-1); + } + itest[iy2]++; + if (itest[iy2]>1) { + printf("The DBW2 boundary is not correctly used y up z down itest = %d (%d %d %d %d) iy2 = %d ix = %d\n", + itest[iy2], x0, x1, x2, x3, iy2, ix); +return(-1); + } + } + if(x2 == LY+1) { + iy2 = g_idn[ix][2]; + if(iy2 < VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 7*T*LX || + iy2 >= VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 8*T*LX ) { + printf("The DBW2 boundary is not correctly mapped in down y-direction down z %d %d %d %d %d %d\n", + x0, x1, x2, x3, ix, iy2); +return(-1); + } + itest[iy2]++; + if (itest[iy2]>1) { + printf("The DBW2 boundary is not correctly used y down z down itest = %d (%d %d %d %d) iy2 = %d ix = %d\n", + itest[iy2], x0, x1, x2, x3, iy2, ix); +return(-1); + } + } + } + } + } + } +#endif + } + for (ix = VOLUMEPLUSRAND; ix < (VOLUMEPLUSRAND) + g_dbw2rand; ix++){ + if (itest[ix]!=1) { + printf("The DBW2 boundary is not correctly used itest = %d ix = %d \n", itest[ix], ix); +return(-1); + } + } +#endif + + /* check of EO geometry */ + + for (ix=0;ix VOLUMEPLUSRAND/2 || iz0 < 0) { + printf("There is a problem with EO geometry in direction 0-\n"); + printf("%d\n", iz0); +return(-1); + } + stest[iz0] += 1; + + iy0 = g_iup[ix][0]; + iz0 = g_lexic2eosub[iy0]; + if(iz0 > VOLUMEPLUSRAND/2 || iz0 < 0) { + printf("There is a problem with EO geometry in direction 0+\n"); +return(-1); + } + stest[iz0] += 1; + + iy1 = g_idn[ix][1]; + iz1 = 
g_lexic2eosub[iy1]; + if(iz1 > VOLUMEPLUSRAND/2 || iz1 < 0) { + printf("There is a problem with EO geometry in direction 1-\n"); +return(-1); + } + stest[iz1] += 1; + + iy1 = g_iup[ix][1]; + iz1 = g_lexic2eosub[iy1]; + if(iz1 >= VOLUMEPLUSRAND/2 || iz1 < 0) { + printf("There is a problem with EO geometry in direction 1+\n"); +return(-1); + } + stest[iz1] += 1; + + iy2 = g_idn[ix][2]; + iz2 = g_lexic2eosub[iy2]; + if(iz2 > VOLUMEPLUSRAND/2 || iz2 < 0) { + printf("There is a problem with EO geometry in direction 2-\n"); +return(-1); + } + stest[iz2] += 1; + + iy2 = g_iup[ix][2]; + iz2 = g_lexic2eosub[iy2]; + if(iz2 > VOLUMEPLUSRAND/2 || iz2 < 0) { + printf("There is a problem with EO geometry in direction 2+\n"); +return(-1); + } + stest[iz2] += 1; + + + iy3 = g_idn[ix][3]; + iz3 = g_lexic2eosub[iy3]; + if(iz3 > VOLUMEPLUSRAND/2 || iz3 < 0) { + printf("There is a problem with EO geometry in direction 3-\n"); +return(-1); + } + stest[iz3] += 1; + + iy3 = g_iup[ix][3]; + iz3 = g_lexic2eosub[iy3]; + if(iz3 > VOLUMEPLUSRAND/2 || iz3 < 0) { + printf("There is a problem with EO geometry in direction 3+\n"); +return(-1); + } + stest[iz3] += 1; + } + iz0 = 0; + for(j = 0; j < (VOLUME)/2; j++) { + iz0 += stest[j]; + } + if(iz0 != 8*(VOLUME)/2-RAND/2) { + printf("There is a problem in the first part of the even odd geometry\n"); + printf("%d is not equal to 8*(VOLUME)/2-RAND/2=%d\n", iz0, 8*(VOLUME)/2-RAND/2); +return(-1); + } + + for(j = VOLUME/2; j < (VOLUME+RAND)/2; j++) { + if(stest[j] != 1) { + printf("There is a problem in the first boundary of the even odd geometry\n"); +return(-1); + } + } + + + for (ix=0;ix VOLUMEPLUSRAND/2 || iz0 < 0) { + printf("There is a problem with EO geometry in direction 0-\n"); +return(-1); + } + stest[iz0] += 1; + + iy0 = g_iup[ix][0]; + iz0 = g_lexic2eosub[iy0]; + if(iz0 > VOLUMEPLUSRAND/2 || iz0 < 0) { + printf("There is a problem with EO geometry in direction 0+\n"); +return(-1); + } + stest[iz0] += 1; + + iy1 = g_idn[ix][1]; + iz1 = 
g_lexic2eosub[iy1]; + if(iz1 > VOLUMEPLUSRAND/2 || iz1 < 0) { + printf("There is a problem with EO geometry in direction 1-\n"); +return(-1); + } + stest[iz1] += 1; + + iy1 = g_iup[ix][1]; + iz1 = g_lexic2eosub[iy1]; + if(iz1 > VOLUMEPLUSRAND/2 || iz1 < 0) { + printf("There is a problem with EO geometry in direction 1+\n"); +return(-1); + } + stest[iz1] += 1; + + iy2 = g_idn[ix][2]; + iz2 = g_lexic2eosub[iy2]; + if(iz2 > VOLUMEPLUSRAND/2 || iz2 < 0) { + printf("There is a problem with EO geometry in direction 2-\n"); +return(-1); + } + stest[iz2] += 1; + + iy2 = g_iup[ix][2]; + iz2 = g_lexic2eosub[iy2]; + if(iz2 > VOLUMEPLUSRAND/2 || iz2 < 0) { + printf("There is a problem with EO geometry in direction 2+\n"); +return(-1); + } + stest[iz2] += 1; + + + iy3 = g_idn[ix][3]; + iz3 = g_lexic2eosub[iy3]; + if(iz3 > VOLUMEPLUSRAND/2 || iz3 < 0) { + printf("There is a problem with EO geometry in direction 3-\n"); +return(-1); + } + stest[iz3] += 1; + + iy3 = g_iup[ix][3]; + iz3 = g_lexic2eosub[iy3]; + if(iz3 > VOLUMEPLUSRAND/2 || iz3 < 0) { + printf("There is a problem with EO geometry in direction 3+\n"); +return(-1); + } + stest[iz3] += 1; + } + iz0 = 0; + for(j = 0; j < (VOLUME)/2; j++) { + iz0 += stest[j]; + } + if(iz0 != 8*(VOLUME)/2-RAND/2) { + printf("There is a problem in the second part of the even odd geometry\n"); + printf("%d is not equal to 8*(VOLUME)/2-RAND/2=%d\n", iz0, 8*(VOLUME)/2-RAND/2); +return(-1); + } + + for(j = VOLUME/2; j < (VOLUME+RAND)/2; j++) { + if(stest[j] != 1) { + printf("There is a problem in the second boundary of the even odd geometry\n"); +return(-1); + } + } + + if(g_proc_id == 0 ) { + printf("# The lattice is correctly mapped by the index arrays\n\n"); + } + fflush(stdout); + + free(stest); + free(itest); + + return(0); +} + +#endif /* _INDEX_INDEP_GEOM */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_geometry.h b/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_geometry.h new file mode 100644 index 
0000000000000000000000000000000000000000..f29983bd093a9052cca569bcbb47a9d080e4e5da --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_geometry.h @@ -0,0 +1,24 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _CHECK_GEOMETRY_H +#define _CHECK_GEOMETRY_H + +int check_geometry(); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_nan.c b/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_nan.c new file mode 100644 index 0000000000000000000000000000000000000000..51e51516323a2332453144dfdaf11990ecc8c690 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_nan.c @@ -0,0 +1,92 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see .
+ ***********************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+#include
+#include
+#include
+#include "global.h"
+#include "su3adj.h"
+#include "check_nan.h"
+
+/*
+ * Nonzero when the real or the imaginary part of at least one of the
+ * nine entries c00..c22 of *u is NaN, zero otherwise.
+ */
+static int su3_has_nan(su3 * const u) {
+  if(__isnan(creal(u->c00)) || __isnan(cimag(u->c00))) return(1);
+  if(__isnan(creal(u->c01)) || __isnan(cimag(u->c01))) return(1);
+  if(__isnan(creal(u->c02)) || __isnan(cimag(u->c02))) return(1);
+  if(__isnan(creal(u->c10)) || __isnan(cimag(u->c10))) return(1);
+  if(__isnan(creal(u->c11)) || __isnan(cimag(u->c11))) return(1);
+  if(__isnan(creal(u->c12)) || __isnan(cimag(u->c12))) return(1);
+  if(__isnan(creal(u->c20)) || __isnan(cimag(u->c20))) return(1);
+  if(__isnan(creal(u->c21)) || __isnan(cimag(u->c21))) return(1);
+  if(__isnan(creal(u->c22)) || __isnan(cimag(u->c22))) return(1);
+  return(0);
+}
+
+/*
+ * Nonzero when the real or the imaginary part of at least one of the
+ * nine entries c00..c22 of *u is strictly larger than a.
+ */
+static int su3_exceeds(su3 * const u, const double a) {
+  if((creal(u->c00) > a) || (cimag(u->c00) > a)) return(1);
+  if((creal(u->c01) > a) || (cimag(u->c01) > a)) return(1);
+  if((creal(u->c02) > a) || (cimag(u->c02) > a)) return(1);
+  if((creal(u->c10) > a) || (cimag(u->c10) > a)) return(1);
+  if((creal(u->c11) > a) || (cimag(u->c11) > a)) return(1);
+  if((creal(u->c12) > a) || (cimag(u->c12) > a)) return(1);
+  if((creal(u->c20) > a) || (cimag(u->c20) > a)) return(1);
+  if((creal(u->c21) > a) || (cimag(u->c21) > a)) return(1);
+  if((creal(u->c22) > a) || (cimag(u->c22) > a)) return(1);
+  return(0);
+}
+
+/*
+ * Scan the 4*VOLUMEPLUSRAND su3 entries that follow &g_gauge_field[0][0]
+ * for NaNs.
+ *
+ * Returns the first index i (0 <= i < VOLUMEPLUSRAND) for which one of
+ * its four entries contains a NaN, or -1 when none does.
+ */
+int check_nan() {
+  su3 * link = &g_gauge_field[0][0];
+  int site, dir;
+
+  for(site = 0; site < VOLUMEPLUSRAND; site++) {
+    /* the entries are visited strictly in storage order */
+    for(dir = 0; dir < 4; dir++, link++) {
+      if(su3_has_nan(link)) {
+        return(site);
+      }
+    }
+  }
+  return(-1);
+}
+
+/*
+ * Same scan as check_nan(), but looking for a real or imaginary part
+ * strictly larger than a (signed comparison, no absolute value).
+ *
+ * Returns the first offending index i, or -1 when there is none.
+ */
+int check_greater(const double a) {
+  su3 * link = &g_gauge_field[0][0];
+  int site, dir;
+
+  for(site = 0; site < VOLUMEPLUSRAND; site++) {
+    for(dir = 0; dir < 4; dir++, link++) {
+      if(su3_exceeds(link, a)) {
+        return(site);
+      }
+    }
+  }
+  return(-1);
+}
+
+/*
+ * NaN test for the single entry g_gauge_field[i][mu].
+ *
+ * Returns i when that entry contains a NaN, -1 otherwise. The indices
+ * are not range checked.
+ */
+int check_nan_gauge(const int i, const int mu) {
+  if(su3_has_nan(&g_gauge_field[i][mu])) {
+    return(i);
+  }
+  return(-1);
+}
+
+/*
+ * Returns 1 when at least one of the components d1..d8 of *s is
+ * strictly larger than a, 0 otherwise.
+ */
+int check_su3adj(su3adj * s, const double a) {
+  const int first_half  = (s->d1 > a) || (s->d2 > a) || (s->d3 > a) || (s->d4 > a);
+  const int second_half = (s->d5 > a) || (s->d6 > a) || (s->d7 > a) || (s->d8 > a);
+
+  return((first_half || second_half) ? 1 : 0);
+}
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_nan.h b/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_nan.h
new file mode 100644
index 0000000000000000000000000000000000000000..e0d2ff90a86acd045359abb6e03a68e7d4e3c1ec
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_nan.h
@@ -0,0 +1,30 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see .
+ ***********************************************************************/ + +#ifndef _CHECK_NAN_H +#define _CHECK_NAN_H + +#include "su3adj.h" + +int check_nan(); +int check_nan_gauge(const int ix, const int mu); +int check_su3adj(su3adj * s, const double a); +int check_greater(const double a); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_overlap.c b/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_overlap.c new file mode 100644 index 0000000000000000000000000000000000000000..5626aaf86dd3ca649fbd09191a39a8890cb0c6c7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_overlap.c @@ -0,0 +1,415 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ *
+ * invert for twisted mass QCD
+ *
+ * Author: Carsten Urbach
+ * urbach@physik.fu-berlin.de
+ *
+ *******************************************************************************/
+
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+#ifdef BENCHMARK
+#include <../c-lime/include/lime.h>
+#else
+#include
+#endif
+#ifdef HAVE_CONFIG_H
+# include
+#endif
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef MPI
+#include
+#endif
+#include "global.h"
+#include "getopt.h"
+#include "linalg_eo.h"
+#include "geometry_eo.h"
+#include "start.h"
+/*#include "eigenvalues.h"*/
+#include "observables.h"
+#ifdef MPI
+#include "xchange.h"
+#endif
+#include "io.h"
+#include "io_utils.h"
+#include "propagator_io.h"
+#include "gauge_io.h"
+#include "read_input.h"
+#include "mpi_init.h"
+#include "sighandler.h"
+#include "boundary.h"
+#include "solver/solver.h"
+#include "init/init.h"
+#include "xchange_halffield.h"
+#include "stout_smear.h"
+#include "invert_eo.h"
+#include "monomial.h"
+#include "ranlxd.h"
+#include "phmc.h"
+#include "D_psi.h"
+#include "little_D.h"
+#include "reweighting_factor.h"
+#include "linalg/convert_eo_to_lexic.h"
+#include "block.h"
+#include "sighandler.h"
+#include "solver/dfl_projector.h"
+#include "solver/generate_dfl_subspace.h"
+#include "Dov_psi.h"
+
+#include
+#include
+
+#include "overlaptests.h"
+
+/* Print the command line synopsis on stdout and terminate with status 0. */
+void usage()
+{
+  fprintf(stdout, "Inversion for EO preconditioned Wilson twisted mass QCD\n");
+  fprintf(stdout, "Version %s \n\n", PACKAGE_VERSION);
+  fprintf(stdout, "Please send bug reports to %s\n", PACKAGE_BUGREPORT);
+  fprintf(stdout, "Usage: invert [options]\n");
+  fprintf(stdout, "Options: [-f input-filename]\n");
+  fprintf(stdout, " [-o output-filename]\n");
+  fprintf(stdout, " [-h|-? this help]\n");
+  exit(0);
+}
+
+extern int nstore;
+int check_geometry();
+double delta = 1.0e-12;
+
+/*
+ * Return a heap copy of arg that is exactly large enough to hold it.
+ * Terminates the program with a nonzero status when malloc fails.
+ */
+static char * dup_option_arg(const char * const arg)
+{
+  const size_t len = strlen(arg) + 1;
+  char * copy = malloc(len);
+
+  if (copy == NULL) {
+    fprintf(stderr, "Not enough memory for option argument! Aborting...\n");
+    exit(-1);
+  }
+  memcpy(copy, arg, len);
+  return copy;
+}
+
+/*
+ * Driver of the overlap locality check.
+ *
+ * Options: -f <input file> (default "hmc.input"), -o <output basename>
+ * (default "output"), -h/-? help. After reading the input file and
+ * allocating the fields, the loop below runs Nmeas times: it reads the
+ * gauge configuration "<gauge_input_filename>.<nstore>", prints the
+ * plaquette, calls eigenvalues() and then ov_check_locality().
+ *
+ * Returns 0 on normal completion; allocation failures and file names
+ * that do not fit their buffer terminate the program with a nonzero
+ * exit status.
+ */
+int main(int argc, char *argv[])
+{
+
+  FILE *parameterfile = NULL;
+  int c, j;
+  int len;
+  char * filename = NULL;
+  /* the unused datafilename buffer of the former version is gone */
+  char parameterfilename[256];
+  char conf_filename[256];
+  char * input_filename = NULL;
+  char * xlfmessage = NULL;
+  char * gaugelfn = NULL;
+  char * gaugecksum = NULL;
+  double plaquette_energy;
+
+#ifdef _KOJAK_INST
+#pragma pomp inst init
+#pragma pomp inst begin(main)
+#endif
+
+#ifdef HAVE_LIBLEMON
+  MPI_File fh;
+  LemonWriter *lemonWriter;
+  paramsXlfInfo *xlfInfo;
+  paramsPropagatorFormat *propagatorFormat;
+#endif
+
+  /* every macro needs its own `defined`; a bare SSE3 tested the macro's value */
+#if (defined SSE || defined SSE2 || defined SSE3)
+  signal(SIGILL, &catch_ill_inst);
+#endif
+
+  DUM_DERI = 6;
+  /* DUM_DERI + 2 is enough (not 7) */
+  DUM_SOLVER = DUM_DERI + 3;
+  DUM_MATRIX = DUM_SOLVER + 8;
+  /* DUM_MATRIX + 2 is enough (not 6) */
+  NO_OF_SPINORFIELDS = DUM_MATRIX + 2;
+
+  verbose = 0;
+  g_use_clover_flag = 0;
+
+#ifdef MPI
+  MPI_Init(&argc, &argv);
+#endif
+
+  while ((c = getopt(argc, argv, "h?f:o:")) != -1) {
+    switch (c) {
+      case 'f':
+        /* inside this loop the pointer is NULL or a heap copy, so free() is safe */
+        free(input_filename);
+        input_filename = dup_option_arg(optarg);
+        break;
+      case 'o':
+        free(filename);
+        filename = dup_option_arg(optarg);
+        break;
+      case 'h':
+      case '?':
+      default:
+        usage();
+        break;
+    }
+  }
+  if (input_filename == NULL) {
+    input_filename = "hmc.input";
+  }
+  if (filename == NULL) {
+    filename = "output";
+  }
+
+  /* Read the input file */
+  read_input(input_filename);
+  if (solver_flag == 12 && even_odd_flag == 1) {
+    even_odd_flag = 0;
+    if (g_proc_id == 0) {
+      fprintf(stderr, "CGMMS works only without even/odd! Forcing!\n");
+    }
+  }
+
+  /* this DBW2 stuff is not needed for the inversion ! */
+  if (g_dflgcr_flag == 1) {
+    even_odd_flag = 0;
+  }
+  g_rgi_C1 = 0;
+  if (Nsave == 0) {
+    Nsave = 1;
+  }
+
+  if(g_running_phmc) {
+    NO_OF_SPINORFIELDS = DUM_MATRIX + 8;
+  }
+
+  mpi_init(argc, argv);
+
+  g_dbw2rand = 0;
+
+  /* starts the single and double precision random number */
+  /* generator */
+  start_ranlux_KD(rlxd_level, random_seed);
+
+#ifndef MPI
+  g_dbw2rand = 0;
+#endif
+
+#ifdef _GAUGE_COPY
+  j = init_gauge_field(VOLUMEPLUSRAND, 1);
+#else
+  j = init_gauge_field(VOLUMEPLUSRAND, 0);
+#endif
+  if(j != 0) {
+    fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n");
+    exit(-1);
+  }
+  j = init_geometry_indices(VOLUMEPLUSRAND);
+  if(j != 0) {
+    fprintf(stderr, "Not enough memory for geometry indices! Aborting...\n");
+    exit(-1);
+  }
+  if(no_monomials > 0) {
+    if(even_odd_flag) {
+      j = init_monomials(VOLUMEPLUSRAND / 2, even_odd_flag);
+    }
+    else {
+      j = init_monomials(VOLUMEPLUSRAND, even_odd_flag);
+    }
+    if(j != 0) {
+      fprintf(stderr, "Not enough memory for monomial pseudo fermion fields! Aborting...\n");
+      /* was exit(0): an allocation failure must not report success */
+      exit(-1);
+    }
+  }
+  if(even_odd_flag) {
+    j = init_spinor_field(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS);
+  }
+  else {
+    j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS);
+  }
+  if(j != 0) {
+    fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n");
+    exit(-1);
+  }
+
+  if(g_running_phmc) {
+    j = init_chi_up_spinor_field(VOLUMEPLUSRAND / 2, 20);
+    if(j != 0) {
+      fprintf(stderr, "Not enough memory for PHMC Chi_up fields! Aborting...\n");
+      exit(-1);
+    }
+    j = init_chi_dn_spinor_field(VOLUMEPLUSRAND / 2, 20);
+    if(j != 0) {
+      fprintf(stderr, "Not enough memory for PHMC Chi_dn fields! Aborting...\n");
+      exit(-1);
+    }
+  }
+
+  g_mu = g_mu1;
+  if(g_proc_id == 0) {
+    /* construct the filename for the parameters; snprintf never writes */
+    /* past the buffer and its return value reveals a truncated name */
+    len = snprintf(parameterfilename, sizeof(parameterfilename), "%s.para", filename);
+    if (len < 0 || (size_t)len >= sizeof(parameterfilename)) {
+      fprintf(stderr, "Output filename %s is too long, no parameter file written!\n", filename);
+    }
+    else if ((parameterfile = fopen(parameterfilename, "w")) == NULL) {
+      /* formerly the NULL stream went straight into write_first_messages() */
+      fprintf(stderr, "Could not open file %s for writing!\n", parameterfilename);
+    }
+    else {
+      write_first_messages(parameterfile, 1);
+      fclose(parameterfile);
+    }
+  }
+
+  /* this is for the extra masses of the CGMMS */
+  if (solver_flag == 12 && g_no_extra_masses > 0) {
+    if ((parameterfile = fopen("extra_masses.input", "r")) != NULL) {
+      for (j = 0; j < g_no_extra_masses; j++) {
+        if (fscanf(parameterfile, "%lf", &g_extra_masses[j]) != 1) {
+          /* keep only the masses that were actually read */
+          fprintf(stderr, "Could not read extra mass %d from extra_masses.input!\n", j);
+          g_no_extra_masses = j;
+          break;
+        }
+        if (g_proc_id == 0 && g_debug_level > 0) {
+          printf("# g_extra_masses[%d] = %lf\n", j, g_extra_masses[j]);
+        }
+      }
+      fclose(parameterfile);
+    }
+    else {
+      fprintf(stderr, "Could not open file extra_masses.input!\n");
+      g_no_extra_masses = 0;
+    }
+  }
+
+  /* define the geometry */
+  geometry();
+
+  /* define the boundary conditions for the fermion fields */
+  boundary(g_kappa);
+
+  phmc_invmaxev = 1.;
+
+
+#ifdef _USE_HALFSPINOR
+  j = init_dirac_halfspinor();
+  if (j != 0) {
+    fprintf(stderr, "Not enough memory for halffield! Aborting...\n");
+    exit(-1);
+  }
+  if (g_sloppy_precision_flag == 1) {
+    j = init_dirac_halfspinor32();
+    if (j != 0) {
+      fprintf(stderr, "Not enough memory for 32-Bit halffield! Aborting...\n");
+      exit(-1);
+    }
+  }
+# if (defined _PERSISTENT)
+  if (even_odd_flag) {
+    init_xchange_halffield();
+  }
+# endif
+#endif
+
+  for (j = 0; j < Nmeas; j++) {
+    /* bounded formatting: a long gauge_input_filename can no longer overflow */
+    len = snprintf(conf_filename, sizeof(conf_filename), "%s.%.4d", gauge_input_filename, nstore);
+    if (len < 0 || (size_t)len >= sizeof(conf_filename)) {
+      fprintf(stderr, "Gauge configuration filename is too long! Aborting...\n");
+      exit(-1);
+    }
+    if (g_proc_id == 0) {
+      printf("Reading Gauge field from file %s\n", conf_filename);
+      fflush(stdout);
+    }
+#ifdef HAVE_LIBLEMON
+    read_lemon_gauge_field_parallel(conf_filename, &gaugecksum, &xlfmessage, &gaugelfn);
+#else /* HAVE_LIBLEMON */
+    /* drop the messages of the previous configuration before reading new ones */
+    if (xlfmessage != (char*)NULL)
+      free(xlfmessage);
+    if (gaugelfn != (char*)NULL)
+      free(gaugelfn);
+    if (gaugecksum != (char*)NULL)
+      free(gaugecksum);
+    read_lime_gauge_field(conf_filename);
+    xlfmessage = read_message(conf_filename, "xlf-info");
+    gaugelfn = read_message(conf_filename, "ildg-data-lfn");
+    gaugecksum = read_message(conf_filename, "scidac-checksum");
+    /* a NULL pointer must not be handed to %s */
+    if (gaugecksum != (char*)NULL) {
+      printf("%s \n", gaugecksum);
+    }
+#endif /* HAVE_LIBLEMON */
+    if (g_proc_id == 0) {
+      printf("done!\n");
+      fflush(stdout);
+    }
+    /* unit_g_gauge_field(); */
+#ifdef MPI
+    xchange_gauge(g_gauge_field);
+#endif
+
+    /*compute the energy of the gauge field*/
+    plaquette_energy = measure_gauge_action();
+
+    if (g_proc_id == 0) {
+      printf("The plaquette value is %e\n", plaquette_energy / (6.*VOLUME*g_nproc));
+      fflush(stdout);
+    }
+
+    if (use_stout_flag == 1) {
+      if (stout_smear_gauge_field(stout_rho , stout_no_iter) != 0) {
+        exit(1) ;
+      }
+      plaquette_energy = measure_gauge_action();
+
+      if (g_proc_id == 0) {
+        printf("The plaquette value after stouting is %e\n", plaquette_energy / (6.*VOLUME*g_nproc));
+        fflush(stdout);
+      }
+    }
+
+    /* Compute minimal eigenvalues, necessary for overlap! */
+    if (compute_evs != 0)
+      eigenvalues(&no_eigenvalues, max_solver_iterations, eigenvalue_precision, 0, compute_evs, nstore, even_odd_flag);
+    else {
+      /* force one eigenvalue computation, then restore both settings */
+      compute_evs = 1;
+      no_eigenvalues = 1;
+      eigenvalues(&no_eigenvalues, max_solver_iterations, eigenvalue_precision, 0, compute_evs, nstore, even_odd_flag);
+      no_eigenvalues = 0;
+      compute_evs = 0;
+    }
+
+    /* with phmc_compute_evs set the program stops after the first configuration */
+    if (phmc_compute_evs != 0) {
+#ifdef MPI
+      MPI_Finalize();
+#endif
+      return (0);
+    }
+
+    /* here we can do something */
+    ov_n_cheby = (-log(delta))/(2*sqrt(ev_minev));
+    printf("// Degree of cheby polynomial: %d\n", ov_n_cheby);
+//    g_mu = 0.;
+    ov_check_locality();
+//    ov_check_ginsparg_wilson_relation_strong();
+//    ov_compare_4x4("overlap.mat");
+//    ov_compare_12x12("overlap.mat");
+//    ov_save_12x12("overlap.mat");
+//    ov_check_operator(1,0,0,0);
+
+    nstore += Nsave;
+  }
+#ifdef MPI
+  MPI_Finalize();
+#endif
+
+  free_blocks();
+  free_dfl_subspace();
+  free_gauge_field();
+  free_geometry_indices();
+  free_spinor_field();
+  free_moment_field();
+  if (g_running_phmc) {
+    free_chi_up_spinor_field();
+    free_chi_dn_spinor_field();
+  }
+  return(0);
+#ifdef _KOJAK_INST
+#pragma pomp inst end(main)
+#endif
+}
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_xchange.c b/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_xchange.c
new file mode 100644
index 0000000000000000000000000000000000000000..4367ccda83d938d8c3a134385ac905ef6d06b989
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/check_xchange.c
@@ -0,0 +1,6407 @@
+/***********************************************************************
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * File check_xchange.c + * + * Check of the exchange routines + * + * Author: Carsten Urbach + * + *******************************************************************************/ + + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include "geometry_eo.h" +#include "start.h" +#include "xchange/xchange.h" + +void set_deri_point(); +int check_geometry(); + +#if (defined _INDEX_INDEP_GEOM) + +int check_xchange() +{ +#ifdef XLC +#pragma execution_frequency(very_low) +#endif + +#ifdef MPI + double * x; + int i,ix, mu, x0, x1, x2, x3, k; + int mp, pm, mm, pp, di[4]; + + int startvaluet=0,startvaluex=0,startvaluey=0,startvaluez=0; + int bndcntu,bndcntu2,bndcntd,bndcntd2; + +#if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + startvaluet = 2; +#endif +#if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + startvaluex = 2; +#endif +#if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + startvaluey = 2; +#endif +#if (defined PARALLELXYZT || defined PARALLELXYZ ) + startvaluez = 2; +#endif + +# ifdef _USE_TSPLITPAR +# ifdef PARALLELX +# define REQC 4 +# elif defined PARALLELXY +# define REQC 8 +# elif defined PARALLELXYZ +# define REQC 12 +# endif + MPI_Request requests[REQC]; + MPI_Status status[REQC]; +# endif + + /* Check the field exchange */ + /* Set the whole field to -1 */ + set_spinor_field(0, -1.); + + /* Set the internal boundary to g_cart_id */ + /* We need here 
g_lexic2eo, otherwise the test is useless... */ + +# if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[0][x1][x2][x3]] ], g_cart_id); + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[T-1][x1][x2][x3]] ], g_cart_id); + } + } + } +# endif + +# if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT) || (defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ)) + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[x0][0][x2][x3]] ], g_cart_id); + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[x0][LX-1][x2][x3]] ], g_cart_id); + } + } + } +# endif + +# if ((defined PARALLELXYT) || (defined PARALLELXYZT) || (defined PARALLELXY) || (defined PARALLELXYZ)) + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[x0][x1][0][x3]] ], g_cart_id); + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[x0][x1][LY-1][x3]] ], g_cart_id); + } + } + } +# endif + MPI_Barrier(MPI_COMM_WORLD); + +#ifdef _USE_TSPLITPAR + for(x0 = 0; x0 < T; x0++){ + xchange_field_open(g_spinor_field[0], 0, x0, requests, status); + xchange_field_close(requests, status, REQC); + } +#else + xchange_field(g_spinor_field[0], 0); +#endif + + MPI_Barrier(MPI_COMM_WORLD); + +# if ((defined PARALLELT) || (defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + x = (double*) &g_spinor_field[0][g_1st_t_ext_up]; + for(i = 0; i < LX*LY*LZ/2*24; i++, x++) { + if((int)(*x) != g_nb_t_up) { + printf("The exchange up of fields in time direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + 
MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_spinor_field[0][g_1st_t_ext_dn]; + for(i = 0; i < LX*LY*LZ/2*24; i++, x++) { + if((int)(*x) != g_nb_t_dn) { + printf("The exchange down of fields in time direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_dn); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT) || (defined PARALLELX) || (defined PARALLELXY) || (defined PARALLELXYZ)) + x = (double*) &g_spinor_field[0][g_1st_x_ext_up]; + for(i = 0; i < T*LY*LZ/2*24; i++, x++) { + if((int)(*x) != g_nb_x_up) { + printf("The exchange up of fields in x direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_up); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_spinor_field[0][g_1st_x_ext_dn]; + for(i = 0; i < T*LY*LZ/2*24; i++, x++) { + if((int)(*x) != g_nb_x_dn) { + printf("The exchange down of fields in x direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_dn); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if ((defined PARALLELXYT) || (defined PARALLELXYZT) || (defined PARALLELXY) || (defined PARALLELXYZ)) + x = (double*) &g_spinor_field[0][g_1st_y_ext_up]; + for(i = 0; i < T*LX*LZ/2*24; i++, x++) { + if((int)(*x) != g_nb_y_up) { + printf("The exchange up of fields in y direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_up); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_spinor_field[0][g_1st_y_ext_dn]; + for(i = 0; i < T*LX*LZ/2*24; i++, x++) { + if((int)(*x) 
!= g_nb_y_dn) { + printf("The exchange down of fields in y direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_dn); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if ((defined PARALLELXYZT) || (defined PARALLELXYZ)) + set_spinor_field(0, -1.); + + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + set_spinor_point(&g_spinor_field[0][ g_lexic2eosub[g_ipt[x0][x1][x2][0]] ], g_cart_id); /* only even */ + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[x0][x1][x2][LZ-1]] ], g_cart_id); + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); +#ifdef _USE_TSPLITPAR + for(x0 = 0; x0 < T; x0++){ + xchange_field_open(g_spinor_field[0], 1, x0, requests, status); + xchange_field_close(requests, status, REQC); + } +#else + xchange_field(g_spinor_field[0],1); /* only even */ +#endif + MPI_Barrier(MPI_COMM_WORLD); + + x = (double*) &g_spinor_field[0][g_1st_z_ext_up]; + for(i = 0; i < T*LX*LY/2*24; i++, x++) { + if((int)(*x) != g_nb_z_up) { + printf("The exchange up of fields in z (1) direction up\n"); + printf("between %d and %d is not correct at i=%d\n", g_cart_id, g_nb_z_up,i); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_spinor_field[0][g_1st_z_ext_dn]; + for(i = 0; i < T*LX*LY/2*24; i++, x++) { + if((int)(*x) != g_nb_z_dn) { + printf("The exchange down of fields in z (1) direction down\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_dn); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + set_spinor_field(0, -1.); + + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[x0][x1][x2][0]] ], g_cart_id); + 
set_spinor_point(&g_spinor_field[0][ g_lexic2eosub[g_ipt[x0][x1][x2][LZ-1]] ], g_cart_id); /* only even */ + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); +#ifdef _USE_TSPLITPAR + for(x0 = 0; x0 < T; x0++){ + xchange_field_open(g_spinor_field[0], 1, x0, requests, status); + xchange_field_close(requests, status, REQC); + } +#else + xchange_field(g_spinor_field[0],1); /* only even */ +#endif + MPI_Barrier(MPI_COMM_WORLD); + + x = (double*) &g_spinor_field[0][g_1st_z_ext_up]; + for(i = 0; i < T*LX*LY/2*24; i++, x++) { + if((int)(*x) != g_nb_z_up) { + printf("The exchange up of fields in z (0) direction up\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_up); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_spinor_field[0][g_1st_z_ext_dn]; + for(i = 0; i < T*LX*LY/2*24; i++, x++) { + if((int)(*x) != g_nb_z_dn) { + printf("The exchange down of fields in z (0) direction down\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_dn); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + + if(g_proc_id == 0) { + printf("# Exchange of spinor fields checked successfully!\n"); + } + fflush(stdout); + fflush(stderr); + + /* Check the gauge exchange */ + + set_gauge_field(-1.); +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* Set the time boundary */ + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[0][x1][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][x2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + } + +# endif +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* Set the x 
boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][0][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][x2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + } +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* Set the y boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][x1][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + } +# endif + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* Set the z boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][x1][x2][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + } + } + } + } +# endif + + MPI_Barrier(MPI_COMM_WORLD); + xchange_gauge(g_gauge_field); + MPI_Barrier(MPI_COMM_WORLD); + +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + x = (double*) &g_gauge_field[gI_L_0_0_0][0]; + for(i = 0; i < LX*LY*LZ*72; i++, x++) { + if((int)(*x) != g_nb_t_up) { + printf("The exchange up of gaugefields in time direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_gauge_field[gI_m1_0_0_0][0]; + for(i = 0; i < LX*LZ*LY*72; i++, x++) { + if((int)(*x) != g_nb_t_dn) { + printf("The exchange down of gaugefields in time direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_dn); + printf("Program aborted\n"); + 
fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + x = (double*) &g_gauge_field[gI_0_L_0_0][0]; + for(i = 0; i < T*LY*LZ*72; i++, x++) { + if((int)(*x) != g_nb_x_up) { + printf("The exchange up of gaugefields in x direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_up); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_gauge_field[gI_0_m1_0_0][0]; + for(i = 0; i < T*LY*LZ*72; i++, x++) { + if((int)(*x) != g_nb_x_dn) { + printf("The exchange down of gaugefields in x direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_dn); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + x = (double*) &g_gauge_field[gI_0_0_L_0][0]; + for(i = 0; i < T*LX*LZ*72; i++, x++) { + if((int)(*x) != g_nb_y_up) { + printf("The exchange up of gaugefields in y direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_up); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_gauge_field[gI_0_0_m1_0][0]; + for(i = 0; i < T*LX*LZ*72; i++, x++) { + if((int)(*x) != g_nb_y_dn) { + printf("The exchange down of gaugefields in y direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_dn); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + 
fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + x = (double*) g_gauge_field[gI_0_0_0_L]; + for(i = 0; i < T*LX*LY*72; i++, x++) { + if((int)(*x) != g_nb_z_up) { + printf("The exchange up of gaugefields in z direction up\n"); + printf("between %d and %d is not correct, down is %d\n", g_cart_id, g_nb_z_up, g_nb_z_dn); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_gauge_field[gI_0_0_0_m1][0]; + for(i = 0; i < T*LX*LY*72; i++, x++) { + if((int)(*x) != g_nb_z_dn) { + printf("The exchange down of gaugefields in z direction down\n"); + printf("between %d and %d is not correct, up is %d\n", g_cart_id, g_nb_z_dn, g_nb_z_up); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + + set_gauge_field(-1.); + + /* Set the tx boundary */ + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[0][0][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][0][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0][LX-1][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][LX-1][x2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + + /* Set the xy boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][0][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][0][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + + /* Set the 
ty boundary */ + for(x1 = 0; x1 < LX; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[0][x1][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0][x1][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + + /* Set the tz boundary */ + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[0][x1][x2][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][x2][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0][x1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + } + } + } + + /* Set the xz boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY; x2++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][0][x2][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][x2][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][0][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + } + } + } + + /* Set the yz boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][x1][0][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-1][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][0][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-1][LZ-1] ][mu] = set_su3((double)g_cart_id); + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); + xchange_gauge(g_gauge_field); + MPI_Barrier(MPI_COMM_WORLD); + + /* DEBUG */ + /* + for (x0 = -startvaluet; x0 < (T+startvaluet); x0++){ + for (x1 = -startvaluex; x1 < (LX+startvaluex); x1++){ + for (x2 = -startvaluey; x2 < 
(LY+startvaluey); x2++){ + for (x3 = -startvaluez; x3 < (LZ+startvaluez); x3++){ + bndcntu = 0; + bndcntd = 0; + bndcntu2 = 0; + bndcntd2 = 0; + if(x0 < 0 ) bndcntd++; + if(x1 < 0 ) bndcntd++; + if(x2 < 0 ) bndcntd++; + if(x3 < 0 ) bndcntd++; + if(x0 > T-1) bndcntu++; + if(x1 > LX-1) bndcntu++; + if(x2 > LY-1) bndcntu++; + if(x3 > LZ-1) bndcntu++; + if(x0 < -1 ) bndcntd2++; + if(x1 < -1 ) bndcntd2++; + if(x2 < -1 ) bndcntd2++; + if(x3 < -1 ) bndcntd2++; + if(x0 > T) bndcntu2++; + if(x1 > LX) bndcntu2++; + if(x2 > LY) bndcntu2++; + if(x3 > LZ) bndcntu2++; + if((bndcntu+bndcntd<=2) && (bndcntu2+bndcntd2<=1) && (bndcntu2*bndcntd==0) && (bndcntu*bndcntd2==0)){ + i=Index(x0,x1,x2,x3); + x = (double*) g_gauge_field[i]; + if(g_proc_id==0) fprintf(stdout,"debuG-%d: %g , %d,%d,%d,%d , %d\n",g_proc_id,*x,x0,x1,x2,x3,i); + fflush(stdout); + } else { + if(g_proc_id==0) fprintf(stdout,"outside-%d: nan, %d,%d,%d,%d\n",g_proc_id,x0,x1,x2,x3); + fflush(stdout); + } + } + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); + */ + + /* The edges */ +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + fprintf(stdout, "rank:%d; (c0,c1,c2,c3)=(%d,%d,%d,%d)\n",g_proc_id,g_proc_coords[0],g_proc_coords[1],g_proc_coords[2],g_proc_coords[3]); fflush(stdout); + + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = g_proc_coords[2]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + MPI_Cart_rank(g_cart_grid, di, &pp); + + + x = (double*) g_gauge_field[gI_L_L_0_0]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (xt) in 
direction +x+t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(T,-1,0,0)]; // gI_L_m1_0_0 + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (xt) in direction -x+t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_m1_L_0_0]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (xt) in direction +x-t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(-1,-1,0,0)]; // gI_m1_m1_0_0 + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (xt) in direction -x-t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[0] = g_proc_coords[0]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = (g_proc_coords[2] + 
1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[gI_0_L_L_0]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (yx) in direction +y+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(0,LX,-1,0)]; // gI_0_L_m1_0 + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (yx) in direction -y+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_m1_L_0]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (yx) in direction +y-x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(0,-1,-1,0)]; // gI_0_m1_m1_0 + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (yx) in direction -y-x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + 
fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT ) + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[1] = g_proc_coords[1]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[gI_L_0_L_0]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (ty) in direction +t+y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(-1,0,LY,0)]; // gI_m1_0_L_0 + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (ty) in direction -t+y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_L_0_m1_0]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (ty) in direction +t-y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + 
MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(-1,0,-1,0)]; // gI_m1_0_m1_0 + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (ty) in direction -t-y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + di[0] = g_proc_coords[0]; + di[2] = g_proc_coords[2]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pp); + /*xz-edge */ + x = (double*) g_gauge_field[gI_0_L_0_L]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (zx) in direction +z+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(0,LX,0,-1)]; // gI_0_L_0_m1 + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (zx) in direction -z+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + 
MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_m1_0_L]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (zx) in direction +z-x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(0,-1,0,-1)]; // gI_0_m1_0_m1 + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (zx) in direction -z-x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif +# if (defined PARALLELXYZT ) + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + di[1] = g_proc_coords[1]; + di[2] = g_proc_coords[2]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[gI_L_0_0_L]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (tz) in direction +t+z\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x 
= (double*) g_gauge_field[Index(-1,0,0,LZ)]; // gI_m1_0_0_L + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (tz) in direction -t+z\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_L_0_0_m1]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (tz) in direction +t-z\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(-1,0,0,-1)]; //gI_m1_0_0_m1 + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (tz) in direction -t-z\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + di[1] = g_proc_coords[1]; + di[0] = g_proc_coords[0]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) 
g_gauge_field[Index(0,0,LY,LZ)]; //gI_0_0_L_L + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (yz) in direction +y+z\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(0,0,-1,LZ)]; //gI_0_0_m1_L + for(i = 0; i < LX*T*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (yz) in direction -y+z\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(0,0,LY,-1)]; //gI_0_0_L_m1 + for(i = 0; i < LX*T*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (yz) in direction +y-z\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(0,0,-1,-1)]; //gI_0_0_m1_m1 + for(i = 0; i < LX*T*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (yz) in direction -y-z\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + + if(g_dbw2rand > 0) { + set_gauge_field(-1.); + + /* Set the t2 boundary */ + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 
4; mu++) { + g_gauge_field[ g_ipt[1][x1][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-2][x1][x2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + } + + /* Set the x2 boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][1][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-2][x2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + } + + /* Set the y2 boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][x1][1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + } + + /* Set the z2 boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][x1][x2][1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][x2][LZ-2] ][mu] = set_su3((double)g_cart_id); + } + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); + xchange_gauge(g_gauge_field); + MPI_Barrier(MPI_COMM_WORLD); + +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + x = (double*) &g_gauge_field[gI_Lp1_0_0_0][0]; + for(i = 0; i < LX*LY*LZ*72; i++, x++) { + if((int)(*x) != g_nb_t_up) { + printf("The exchange up of gaugefields in 2 time direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_gauge_field[gI_m2_0_0_0][0]; + for(i = 0; i < LX*LY*LZ*72; i++, x++) { + if((int)(*x) != g_nb_t_dn) { + printf("The exchange up of gaugefields in 2 time direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_dn); + printf("Program 
aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + x = (double*) &g_gauge_field[gI_0_Lp1_0_0][0]; + for(i = 0; i < T*LY*LZ*72; i++, x++) { + if((int)(*x) != g_nb_x_up) { + printf("The exchange up of gaugefields in 2 x direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_up); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_gauge_field[gI_0_m2_0_0][0]; + for(i = 0; i < T*LY*LZ*72; i++, x++) { + if((int)(*x) != g_nb_x_dn) { + printf("The exchange down of gaugefields in x direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_dn); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + x = (double*) &g_gauge_field[gI_0_0_Lp1_0][0]; + for(i = 0; i < T*LX*LZ*72; i++, x++) { + if((int)(*x) != g_nb_y_up) { + printf("The exchange up of gaugefields in 2 y direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_up); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_gauge_field[gI_0_0_m2_0][0]; + for(i = 0; i < T*LX*LZ*72; i++, x++) { + if((int)(*x) != g_nb_y_dn) { + printf("The exchange down of gaugefields in 2 y direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_dn); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program 
aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + x = (double*) &g_gauge_field[gI_0_0_0_Lp1][0]; + for(i = 0; i < T*LX*LY*72; i++, x++) { + if((int)(*x) != g_nb_z_up) { + printf("The exchange up of gaugefields in 2 z direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_up); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_gauge_field[gI_0_0_0_m2][0]; + for(i = 0; i < T*LX*LY*72; i++, x++) { + if((int)(*x) != g_nb_z_dn) { + printf("The exchange down of gaugefields in 2 z direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_dn); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + + set_gauge_field(-1.); + /* Set the edges */ +# if ( defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* Set the tx boundary */ + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[1][0][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0][1][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][1][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-2][0][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[1][LX-1][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0][LX-2][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][LX-2][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-2][LX-1][x2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } +# endif +# if ( defined PARALLELXYT || defined PARALLELXY || defined PARALLELXYZ 
|| defined PARALLELXYZT ) + /* Set the xy boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][1][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][0][1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-2][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][1][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][0][LY-2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-2][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][LY-2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } +# endif +# if ( defined PARALLELXYT || defined PARALLELXYZT ) + /* Set the ty boundary */ + for(x1 = 0; x1 < LX; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[1][x1][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0][x1][1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-2][x1][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[1][x1][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0][x1][LY-2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-2][x1][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][LY-2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } +# endif +# if defined PARALLELXYZT + /* Set the tz boundary */ + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[1 ][x1][x2][0 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0 ][x1][x2][1 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-2][x1][x2][0 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][x2][1 ] ][mu] = 
set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[1 ][x1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0 ][x1][x2][LZ-2] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-2][x1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][x2][LZ-2] ][mu] = set_su3((double)g_cart_id); + } + } + } +# endif +# if ( defined PARALLELXYZT || defined PARALLELXYZ ) + /* Set the yz boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][x1][1 ][0 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][0 ][1 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-2][0 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-1][1 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][1 ][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][0 ][LZ-2] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-1][LZ-2] ][mu] = set_su3((double)g_cart_id); + } + } + } + + /* Set the xz boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY; x2++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][1][x2][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][0][x2][1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-2][x2][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][x2][1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][0][x2][LZ-2] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-2][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][x2][LZ-2] ][mu] = set_su3((double)g_cart_id); + } + } + } +# endif + MPI_Barrier(MPI_COMM_WORLD); + xchange_gauge(g_gauge_field); + 
MPI_Barrier(MPI_COMM_WORLD); +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = g_proc_coords[2]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[gI_Lp1_L_0_0]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (xt) in direction +x+2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_Lp1_m1_0_0]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (xt) in direction -x+2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_m2_L_0_0]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (xt) in direction +x-2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) 
g_gauge_field[gI_m2_m1_0_0]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (xt) in direction -x-2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_L_Lp1_0_0]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (xt) in direction +2x+t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(T,-2,0,0)]; // gI_L_m2_0_0 + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (xt) in direction -2x+t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_m1_Lp1_0_0]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (xt) in direction +2x-t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(-1,-2,0,0)]; //gI_m1_m2_0_0 + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (xt) in direction -2x-t\n"); + printf("between %d and %d is not 
correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[0] = g_proc_coords[0]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[gI_0_Lp1_L_0]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (yx) in direction +y+2x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_Lp1_m1_0]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (yx) in direction -y+2x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_m2_L_0]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (yx) in direction +y-2x\n"); + printf("between %d and %d is not correct\n", 
g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_m2_m1_0]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (yx) in direction -y-2x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_L_Lp1_0]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (yx) in direction +2y+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(0,LX,-2,0)]; //gI_0_L_m2_0 + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (yx) in direction -2y+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_m1_Lp1_0]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (yx) in direction +2y-x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) 
g_gauge_field[Index(0,-1,-2,0)]; //gI_0_m1_m2_0 + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (yx) in direction -2y-x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + +# endif +# if (defined PARALLELXYT || defined PARALLELXYZT ) + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[1] = g_proc_coords[1]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[gI_Lp1_0_L_0]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (ty) in direction +2t+y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_m2_0_L_0]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (ty) in direction -2t+y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_Lp1_0_m1_0]; + for(i = 
0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (ty) in direction +2t-y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_m2_0_m1_0]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (ty) in direction -2t-y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_L_0_Lp1_0]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (ty) in direction +t+2y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(-1,0,LY+1,0)]; //gI_m1_0_Lp1_0 + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (ty) in direction -t+2y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_L_0_m2_0]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (ty) in direction +t-2y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", 
g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(-1,0,-2,0)]; //gI_m1_0_m2_0 + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (ty) in direction -t-2y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif +# if defined PARALLELXYZT + + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + di[1] = g_proc_coords[1]; + di[2] = g_proc_coords[2]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[gI_Lp1_0_0_L]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (tz) in direction +z+2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + x = (double*) g_gauge_field[gI_m2_0_0_L]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (tz) in direction +z-2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + 
fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_Lp1_0_0_m1]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (tz) in direction -z+2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_m2_0_0_m1]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (tz) in direction -z-2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_L_0_0_Lp1]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (zt) in direction +2z+t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(-1,0,0,LZ+1)]; //gI_m1_0_0_Lp1 + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (zt) in direction +2z-t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_L_0_0_m2]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != pm) { + printf("The 
exchange of gaugefields edges (zt) in direction -2z+t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_m1_0_0_m2]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (zt) in direction -2z-t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + +# endif +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* zx-edge */ + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + di[0] = g_proc_coords[0]; + di[2] = g_proc_coords[2]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[gI_0_L_0_Lp1]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (zx) in direction +2z+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(0,LX,0,-2)]; //gI_0_L_0_m2 + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields 
edges (zx) in direction -2z+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_m1_0_Lp1]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (zx) in direction +2z-x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(0,-1,0,-2)]; //gI_0_m1_0_m2 + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (zx) in direction -2z-x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_Lp1_0_L]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (xz) in direction +z+2x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_Lp1_0_m1]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (xz) in direction -z+2x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + 
MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_m2_0_L]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (xz) in direction +z-2x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_m2_0_m1]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (xz) in direction -z-2x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif +# if ( defined PARALLELXYZT || defined PARALLELXYZ ) + + /* zy-edge */ + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + di[0] = g_proc_coords[0]; + di[1] = g_proc_coords[1]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[gI_0_0_L_Lp1]; + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (zy) in direction +2z+y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); 
MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(0,0,LY,-2)]; //gI_0_0_L_m2 + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (zy) in direction -2z+y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_0_m1_Lp1]; + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (zy) in direction +2z-y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[Index(0,0,-1,-2)]; //gI_0_0_m1_m2 + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (zy) in direction -2z-y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_0_Lp1_L]; + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (yz) in direction +z+2y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_0_Lp1_m1]; + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (yz) in direction 
-z+2y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_0_m2_L]; + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (yz) in direction +z-2y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[gI_0_0_m2_m1]; + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (yz) in direction -z-2y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + +# endif + if(g_proc_id == 0) { + printf("# Exchange of rectangular gauge action boundaries checked successfully!\n"); + } + fflush(stdout); + fflush(stderr); + + } /* dbw2 */ + + if(g_proc_id == 0) { + printf("# Exchange of gauge fields checked successfully!\n"); + printf("# Starting check of deri...\n"); + } + fflush(stdout); + fflush(stderr); + + /* Check the deri exchange */ + + for(ix = 0; ix < VOLUME+RAND; ix++){ + for(mu=0; mu<4; mu++){ + ddummy[ix][mu].d1=0.; + ddummy[ix][mu].d2=0.; + ddummy[ix][mu].d3=0.; + ddummy[ix][mu].d4=0.; + ddummy[ix][mu].d5=0.; + ddummy[ix][mu].d6=0.; + ddummy[ix][mu].d7=0.; + ddummy[ix][mu].d8=0.; + df0[ix][mu].d1=0.; + df0[ix][mu].d2=0.; + df0[ix][mu].d3=0.; + df0[ix][mu].d4=0.; + df0[ix][mu].d5=0.; + df0[ix][mu].d6=0.; + df0[ix][mu].d7=0.; + df0[ix][mu].d8=0.; + } + } + +# if (defined PARALLELT || defined 
PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_idn[ g_ipt[0][x1][x2][x3] ][0]; + for(mu = 0; mu < 4; mu++) { + df0[ix][mu].d1=(double)g_cart_id; + df0[ix][mu].d2=(double)g_cart_id; + df0[ix][mu].d3=(double)g_cart_id; + df0[ix][mu].d4=(double)g_cart_id; + df0[ix][mu].d5=(double)g_cart_id; + df0[ix][mu].d6=(double)g_cart_id; + df0[ix][mu].d7=(double)g_cart_id; + df0[ix][mu].d8=(double)g_cart_id; + } + } + } + } +# endif +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_idn[ g_ipt[x0][0][x2][x3] ][1]; + for(mu = 0; mu < 4; mu++) { + df0[ix][mu].d1=(double)g_cart_id; + df0[ix][mu].d2=(double)g_cart_id; + df0[ix][mu].d3=(double)g_cart_id; + df0[ix][mu].d4=(double)g_cart_id; + df0[ix][mu].d5=(double)g_cart_id; + df0[ix][mu].d6=(double)g_cart_id; + df0[ix][mu].d7=(double)g_cart_id; + df0[ix][mu].d8=(double)g_cart_id; + } + } + } + } +# endif +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_idn[ g_ipt[x0][x1][0][x3] ][2]; + for(mu = 0; mu < 4; mu++) { + df0[ix][mu].d1=(double)g_cart_id; + df0[ix][mu].d2=(double)g_cart_id; + df0[ix][mu].d3=(double)g_cart_id; + df0[ix][mu].d4=(double)g_cart_id; + df0[ix][mu].d5=(double)g_cart_id; + df0[ix][mu].d6=(double)g_cart_id; + df0[ix][mu].d7=(double)g_cart_id; + df0[ix][mu].d8=(double)g_cart_id; + } + } + } + } +# endif +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + ix = g_idn[ g_ipt[x0][x1][x2][0] ][3]; + for(mu = 0; mu < 4; mu++) { + df0[ix][mu].d1=(double)g_cart_id; + 
df0[ix][mu].d2=(double)g_cart_id; + df0[ix][mu].d3=(double)g_cart_id; + df0[ix][mu].d4=(double)g_cart_id; + df0[ix][mu].d5=(double)g_cart_id; + df0[ix][mu].d6=(double)g_cart_id; + df0[ix][mu].d7=(double)g_cart_id; + df0[ix][mu].d8=(double)g_cart_id; + } + } + } + } +# endif + + MPI_Barrier(MPI_COMM_WORLD); + xchange_deri(df0); + MPI_Barrier(MPI_COMM_WORLD); + +# if defined PARALLELT + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][x1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)g_nb_t_up || + df0[ix][mu].d2 != (double)g_nb_t_up || + df0[ix][mu].d3 != (double)g_nb_t_up || + df0[ix][mu].d4 != (double)g_nb_t_up || + df0[ix][mu].d5 != (double)g_nb_t_up || + df0[ix][mu].d6 != (double)g_nb_t_up || + df0[ix][mu].d7 != (double)g_nb_t_up || + df0[ix][mu].d8 != (double)g_nb_t_up){ + printf("Exchange of derivatives is working not correctly (1)!\n"); + printf("%d %d %d %d %f %d %d\n", ix, x1, x2, x3, df0[ix][mu].d1, g_nb_t_up, mu, (T-1+x1+x2+x3)%2); + printf("Aborting program!"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } +# endif +# if defined PARALLELXT + for(x1 = 0; x1 < LX-1; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][x1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)g_nb_t_up || + df0[ix][mu].d2 != (double)g_nb_t_up || + df0[ix][mu].d3 != (double)g_nb_t_up || + df0[ix][mu].d4 != (double)g_nb_t_up || + df0[ix][mu].d5 != (double)g_nb_t_up || + df0[ix][mu].d6 != (double)g_nb_t_up || + df0[ix][mu].d7 != (double)g_nb_t_up || + df0[ix][mu].d8 != (double)g_nb_t_up){ + printf("Exchange of derivatives is working not correctly (2)!\n"); + printf("Aborting program!"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x0 = 0; x0 < T-1; x0++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < 
LZ; x3++) { + ix = g_ipt[x0][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)g_nb_x_up || + df0[ix][mu].d2 != (double)g_nb_x_up || + df0[ix][mu].d3 != (double)g_nb_x_up || + df0[ix][mu].d4 != (double)g_nb_x_up || + df0[ix][mu].d5 != (double)g_nb_x_up || + df0[ix][mu].d6 != (double)g_nb_x_up || + df0[ix][mu].d7 != (double)g_nb_x_up || + df0[ix][mu].d8 != (double)g_nb_x_up){ + printf("Exchange of derivatives is working not correctly (3)!\n"); + printf("Aborting program!"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d2 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d3 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d4 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d5 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d6 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d7 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d8 != (double)(g_nb_x_up + g_nb_t_up)){ + printf("Exchange of derivatives is working not correctly (4)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } +# endif +# if defined PARALLELXYT + for(x1 = 0; x1 < LX-1; x1++) { + for(x2 = 0; x2 < LY-1; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][x1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)g_nb_t_up || + df0[ix][mu].d2 != (double)g_nb_t_up || + df0[ix][mu].d3 != (double)g_nb_t_up || + df0[ix][mu].d4 != (double)g_nb_t_up || + df0[ix][mu].d5 != (double)g_nb_t_up || + df0[ix][mu].d6 != (double)g_nb_t_up || + df0[ix][mu].d7 != (double)g_nb_t_up || + df0[ix][mu].d8 != (double)g_nb_t_up){ + printf("Exchange of derivatives is working not correctly (5)!\n"); + printf("%d 
%d %d %d %d\n", x1, x2, x3, ix, g_proc_id); + printf("%f %d %d\n", df0[ix][mu].d8, g_nb_t_up, g_nb_t_dn); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x0 = 0; x0 < T-1; x0++) { + for(x2 = 0; x2 < LY-1; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[x0][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)g_nb_x_up || + df0[ix][mu].d2 != (double)g_nb_x_up || + df0[ix][mu].d3 != (double)g_nb_x_up || + df0[ix][mu].d4 != (double)g_nb_x_up || + df0[ix][mu].d5 != (double)g_nb_x_up || + df0[ix][mu].d6 != (double)g_nb_x_up || + df0[ix][mu].d7 != (double)g_nb_x_up || + df0[ix][mu].d8 != (double)g_nb_x_up){ + printf("Exchange of derivatives is working not correctly (6)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x0 = 0; x0 < T-1; x0++) { + for(x1 = 1; x1 < LX-1; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[x0][x1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)g_nb_y_up || + df0[ix][mu].d2 != (double)g_nb_y_up || + df0[ix][mu].d3 != (double)g_nb_y_up || + df0[ix][mu].d4 != (double)g_nb_y_up || + df0[ix][mu].d5 != (double)g_nb_y_up || + df0[ix][mu].d6 != (double)g_nb_y_up || + df0[ix][mu].d7 != (double)g_nb_y_up || + df0[ix][mu].d8 != (double)g_nb_y_up){ + printf("Exchange of derivatives is working not correctly (7)!\n"); + printf("%d %d %d %d %d\n", x0, x1, x3, ix, g_proc_id); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x2 = 0; x2 < LY-1; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d2 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d3 != (double)(g_nb_x_up + 
g_nb_t_up) || + df0[ix][mu].d4 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d5 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d6 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d7 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d8 != (double)(g_nb_x_up + g_nb_t_up)){ + printf("Exchange of derivatives is working not correctly (8)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x1 = 0; x1 < LX-1; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][x1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_t_up + g_nb_y_up) || + df0[ix][mu].d2 != (double)(g_nb_t_up + g_nb_y_up) || + df0[ix][mu].d3 != (double)(g_nb_t_up + g_nb_y_up) || + df0[ix][mu].d4 != (double)(g_nb_t_up + g_nb_y_up) || + df0[ix][mu].d5 != (double)(g_nb_t_up + g_nb_y_up) || + df0[ix][mu].d6 != (double)(g_nb_t_up + g_nb_y_up) || + df0[ix][mu].d7 != (double)(g_nb_t_up + g_nb_y_up) || + df0[ix][mu].d8 != (double)(g_nb_t_up + g_nb_y_up)){ + printf("Exchange of derivatives is working not correctly (9)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x0 = 0; x0 < T-1; x0++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[x0][LX-1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d2 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d3 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d4 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d5 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d6 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d7 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d8 != (double)(g_nb_x_up + g_nb_y_up)){ + printf("Exchange of derivatives is working not correctly (10)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + 
MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][LX-1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d2 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d3 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d4 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d5 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d6 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d7 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d8 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up)){ + printf("Exchange of derivatives is working not correctly (11)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + +# endif + +# if defined PARALLELXYZT + for(x1 = 0; x1 < LX-1; x1++) { + for(x2 = 0; x2 < LY-1; x2++) { + for(x3 = 0; x3 < LZ-1; x3++) { + ix = g_ipt[T-1][x1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)g_nb_t_up || + df0[ix][mu].d2 != (double)g_nb_t_up || + df0[ix][mu].d3 != (double)g_nb_t_up || + df0[ix][mu].d4 != (double)g_nb_t_up || + df0[ix][mu].d5 != (double)g_nb_t_up || + df0[ix][mu].d6 != (double)g_nb_t_up || + df0[ix][mu].d7 != (double)g_nb_t_up || + df0[ix][mu].d8 != (double)g_nb_t_up){ + printf("Exchange of derivatives is working not correctly (12)!\n"); + printf("%d %d %d %d %d\n", x1, x2, x3, ix, g_proc_id); + printf("%f %d %d\n", df0[ix][mu].d8, g_nb_t_up, g_nb_t_dn); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x0 = 0; x0 < T-1; x0++) { + for(x2 = 0; x2 < LY-1; x2++) { + for(x3 = 0; x3 < LZ-1; x3++) { + ix = g_ipt[x0][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)g_nb_x_up || + df0[ix][mu].d2 
!= (double)g_nb_x_up || + df0[ix][mu].d3 != (double)g_nb_x_up || + df0[ix][mu].d4 != (double)g_nb_x_up || + df0[ix][mu].d5 != (double)g_nb_x_up || + df0[ix][mu].d6 != (double)g_nb_x_up || + df0[ix][mu].d7 != (double)g_nb_x_up || + df0[ix][mu].d8 != (double)g_nb_x_up){ + printf("Exchange of derivatives is working not correctly (13)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x0 = 0; x0 < T-1; x0++) { + for(x1 = 1; x1 < LX-1; x1++) { + for(x3 = 0; x3 < LZ-1; x3++) { + ix = g_ipt[x0][x1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)g_nb_y_up || + df0[ix][mu].d2 != (double)g_nb_y_up || + df0[ix][mu].d3 != (double)g_nb_y_up || + df0[ix][mu].d4 != (double)g_nb_y_up || + df0[ix][mu].d5 != (double)g_nb_y_up || + df0[ix][mu].d6 != (double)g_nb_y_up || + df0[ix][mu].d7 != (double)g_nb_y_up || + df0[ix][mu].d8 != (double)g_nb_y_up){ + printf("Exchange of derivatives is working not correctly (14)!\n"); + printf("%d %d %d %d %d\n", x0, x1, x3, ix, g_proc_id); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x0 = 0; x0 < T-1; x0++) { + for(x1 = 1; x1 < LX-1; x1++) { + for(x2 = 0; x2 < LY-1; x2++) { + ix = g_ipt[x0][x1][x2][LZ-1]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)g_nb_z_up || + df0[ix][mu].d2 != (double)g_nb_z_up || + df0[ix][mu].d3 != (double)g_nb_z_up || + df0[ix][mu].d4 != (double)g_nb_z_up || + df0[ix][mu].d5 != (double)g_nb_z_up || + df0[ix][mu].d6 != (double)g_nb_z_up || + df0[ix][mu].d7 != (double)g_nb_z_up || + df0[ix][mu].d8 != (double)g_nb_z_up){ + printf("Exchange of derivatives is working not correctly (15)!\n"); + printf("%d %d %d %d %d\n", x0, x1, x3, ix, g_proc_id); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } 
+ } + } + for(x2 = 0; x2 < LY-1; x2++) { + for(x3 = 0; x3 < LZ-1; x3++) { + ix = g_ipt[T-1][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d2 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d3 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d4 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d5 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d6 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d7 != (double)(g_nb_x_up + g_nb_t_up) || + df0[ix][mu].d8 != (double)(g_nb_x_up + g_nb_t_up)){ + printf("Exchange of derivatives is working not correctly (16)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x1 = 0; x1 < LX-1; x1++) { + for(x3 = 0; x3 < LZ-1; x3++) { + ix = g_ipt[T-1][x1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_t_up + g_nb_y_up) || + df0[ix][mu].d2 != (double)(g_nb_t_up + g_nb_y_up) || + df0[ix][mu].d3 != (double)(g_nb_t_up + g_nb_y_up) || + df0[ix][mu].d4 != (double)(g_nb_t_up + g_nb_y_up) || + df0[ix][mu].d5 != (double)(g_nb_t_up + g_nb_y_up) || + df0[ix][mu].d6 != (double)(g_nb_t_up + g_nb_y_up) || + df0[ix][mu].d7 != (double)(g_nb_t_up + g_nb_y_up) || + df0[ix][mu].d8 != (double)(g_nb_t_up + g_nb_y_up)){ + printf("Exchange of derivatives is working not correctly (17)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x0 = 0; x0 < T-1; x0++) { + for(x3 = 0; x3 < LZ-1; x3++) { + ix = g_ipt[x0][LX-1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d2 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d3 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d4 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d5 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d6 != 
(double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d7 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d8 != (double)(g_nb_x_up + g_nb_y_up)){ + printf("Exchange of derivatives is working not correctly (18)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x0 = 0; x0 < T-1; x0++) { + for(x2 = 0; x2 < LY-1; x2++) { + ix = g_ipt[x0][LX-1][x2][LZ-1]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d2 != (double)(g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d3 != (double)(g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d4 != (double)(g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d5 != (double)(g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d6 != (double)(g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d7 != (double)(g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d8 != (double)(g_nb_x_up + g_nb_z_up)){ + printf("Exchange of derivatives is working not correctly (19)!\n"); + printf("%f %d %d %d\n", df0[ix][mu].d1, g_nb_x_up + g_nb_z_up, g_nb_x_up, g_nb_z_up); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x0 = 0; x0 < T-1; x0++) { + for(x1 = 0; x1 < LX-1; x1++) { + ix = g_ipt[x0][x1][LY-1][LZ-1]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_y_up + g_nb_z_up) || + df0[ix][mu].d2 != (double)(g_nb_y_up + g_nb_z_up) || + df0[ix][mu].d3 != (double)(g_nb_y_up + g_nb_z_up) || + df0[ix][mu].d4 != (double)(g_nb_y_up + g_nb_z_up) || + df0[ix][mu].d5 != (double)(g_nb_y_up + g_nb_z_up) || + df0[ix][mu].d6 != (double)(g_nb_y_up + g_nb_z_up) || + df0[ix][mu].d7 != (double)(g_nb_y_up + g_nb_z_up) || + df0[ix][mu].d8 != (double)(g_nb_y_up + g_nb_z_up)){ + printf("Exchange of derivatives is working not correctly (20)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } 
+ } + } + for(x1 = 0; x1 < LX-1; x1++) { + for(x2 = 0; x2 < LY-1; x2++) { + ix = g_ipt[T-1][x1][x2][LZ-1]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_t_up + g_nb_z_up) || + df0[ix][mu].d2 != (double)(g_nb_t_up + g_nb_z_up) || + df0[ix][mu].d3 != (double)(g_nb_t_up + g_nb_z_up) || + df0[ix][mu].d4 != (double)(g_nb_t_up + g_nb_z_up) || + df0[ix][mu].d5 != (double)(g_nb_t_up + g_nb_z_up) || + df0[ix][mu].d6 != (double)(g_nb_t_up + g_nb_z_up) || + df0[ix][mu].d7 != (double)(g_nb_t_up + g_nb_z_up) || + df0[ix][mu].d8 != (double)(g_nb_t_up + g_nb_z_up)){ + printf("Exchange of derivatives is working not correctly (21)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x3 = 0; x3 < LZ-1; x3++) { + ix = g_ipt[T-1][LX-1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d2 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d3 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d4 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d5 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d6 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d7 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d8 != (double)(g_nb_t_up + g_nb_x_up + g_nb_y_up)){ + printf("Exchange of derivatives is working not correctly (22)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + for(x2 = 0; x2 < LY-1; x2++) { + ix = g_ipt[T-1][LX-1][x2][LZ-1]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_t_up + g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d2 != (double)(g_nb_t_up + g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d3 != (double)(g_nb_t_up + g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d4 != (double)(g_nb_t_up + g_nb_x_up + g_nb_z_up) 
|| + df0[ix][mu].d5 != (double)(g_nb_t_up + g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d6 != (double)(g_nb_t_up + g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d7 != (double)(g_nb_t_up + g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d8 != (double)(g_nb_t_up + g_nb_x_up + g_nb_z_up)){ + printf("Exchange of derivatives is working not correctly (23)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + for(x1 = 0; x1 < LX-1; x1++) { + ix = g_ipt[T-1][x1][LY-1][LZ-1]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_t_up + g_nb_z_up + g_nb_y_up) || + df0[ix][mu].d2 != (double)(g_nb_t_up + g_nb_z_up + g_nb_y_up) || + df0[ix][mu].d3 != (double)(g_nb_t_up + g_nb_z_up + g_nb_y_up) || + df0[ix][mu].d4 != (double)(g_nb_t_up + g_nb_z_up + g_nb_y_up) || + df0[ix][mu].d5 != (double)(g_nb_t_up + g_nb_z_up + g_nb_y_up) || + df0[ix][mu].d6 != (double)(g_nb_t_up + g_nb_z_up + g_nb_y_up) || + df0[ix][mu].d7 != (double)(g_nb_t_up + g_nb_z_up + g_nb_y_up) || + df0[ix][mu].d8 != (double)(g_nb_t_up + g_nb_z_up + g_nb_y_up)){ + printf("Exchange of derivatives is working not correctly (24)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + for(x0 = 0; x0 < T-1; x0++) { + ix = g_ipt[x0][LX-1][LY-1][LZ-1]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d2 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d3 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d4 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d5 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d6 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d7 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d8 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up)){ + printf("Exchange of derivatives is working not correctly (25)!\n"); + 
printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[T-1][LX-1][LY-1][LZ-1]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up + g_nb_t_up) || + df0[ix][mu].d2 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up + g_nb_t_up) || + df0[ix][mu].d3 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up + g_nb_t_up) || + df0[ix][mu].d4 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up + g_nb_t_up) || + df0[ix][mu].d5 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up + g_nb_t_up) || + df0[ix][mu].d6 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up + g_nb_t_up) || + df0[ix][mu].d7 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up + g_nb_t_up) || + df0[ix][mu].d8 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up + g_nb_t_up)){ + printf("Exchange of derivatives is working not correctly (26)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + +# endif + +# if defined PARALLELX + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[x0][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)g_nb_x_up || + df0[ix][mu].d2 != (double)g_nb_x_up || + df0[ix][mu].d3 != (double)g_nb_x_up || + df0[ix][mu].d4 != (double)g_nb_x_up || + df0[ix][mu].d5 != (double)g_nb_x_up || + df0[ix][mu].d6 != (double)g_nb_x_up || + df0[ix][mu].d7 != (double)g_nb_x_up || + df0[ix][mu].d8 != (double)g_nb_x_up){ + printf("Exchange of derivatives is working not correctly (27)!\n"); + printf("Aborting program!"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } +# endif +# if defined PARALLELXY + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX-1; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[x0][x1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != 
(double)g_nb_y_up || + df0[ix][mu].d2 != (double)g_nb_y_up || + df0[ix][mu].d3 != (double)g_nb_y_up || + df0[ix][mu].d4 != (double)g_nb_y_up || + df0[ix][mu].d5 != (double)g_nb_y_up || + df0[ix][mu].d6 != (double)g_nb_y_up || + df0[ix][mu].d7 != (double)g_nb_y_up || + df0[ix][mu].d8 != (double)g_nb_y_up){ + printf("Exchange of derivatives is working not correctly (28)!\n"); + printf("Aborting program!"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY-1; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[x0][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)g_nb_x_up || + df0[ix][mu].d2 != (double)g_nb_x_up || + df0[ix][mu].d3 != (double)g_nb_x_up || + df0[ix][mu].d4 != (double)g_nb_x_up || + df0[ix][mu].d5 != (double)g_nb_x_up || + df0[ix][mu].d6 != (double)g_nb_x_up || + df0[ix][mu].d7 != (double)g_nb_x_up || + df0[ix][mu].d8 != (double)g_nb_x_up){ + printf("Exchange of derivatives is working not correctly (29)!\n"); + printf("Aborting program!"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x0 = 0; x0 < T; x0++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[x0][LX-1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d2 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d3 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d4 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d5 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d6 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d7 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d8 != (double)(g_nb_x_up + g_nb_y_up)){ + printf("Exchange of derivatives is working not correctly (30)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } +# 
endif + +# if defined PARALLELXYZ + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX-1; x1++) { + for(x2 = 0; x2 < LY-1; x2++) { + ix = g_ipt[x0][x1][x2][LZ-1]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)g_nb_z_up || + df0[ix][mu].d2 != (double)g_nb_z_up || + df0[ix][mu].d3 != (double)g_nb_z_up || + df0[ix][mu].d4 != (double)g_nb_z_up || + df0[ix][mu].d5 != (double)g_nb_z_up || + df0[ix][mu].d6 != (double)g_nb_z_up || + df0[ix][mu].d7 != (double)g_nb_z_up || + df0[ix][mu].d8 != (double)g_nb_z_up){ + printf("Exchange of derivatives is working not correctly (31)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY-1; x2++) { + for(x3 = 0; x3 < LZ-1; x3++) { + ix = g_ipt[x0][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)g_nb_x_up || + df0[ix][mu].d2 != (double)g_nb_x_up || + df0[ix][mu].d3 != (double)g_nb_x_up || + df0[ix][mu].d4 != (double)g_nb_x_up || + df0[ix][mu].d5 != (double)g_nb_x_up || + df0[ix][mu].d6 != (double)g_nb_x_up || + df0[ix][mu].d7 != (double)g_nb_x_up || + df0[ix][mu].d8 != (double)g_nb_x_up){ + printf("Exchange of derivatives is working not correctly (32)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x0 = 0; x0 < T; x0++) { + for(x1 = 1; x1 < LX-1; x1++) { + for(x3 = 0; x3 < LZ-1; x3++) { + ix = g_ipt[x0][x1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)g_nb_y_up || + df0[ix][mu].d2 != (double)g_nb_y_up || + df0[ix][mu].d3 != (double)g_nb_y_up || + df0[ix][mu].d4 != (double)g_nb_y_up || + df0[ix][mu].d5 != (double)g_nb_y_up || + df0[ix][mu].d6 != (double)g_nb_y_up || + df0[ix][mu].d7 != (double)g_nb_y_up || + df0[ix][mu].d8 != (double)g_nb_y_up){ + printf("Exchange of derivatives is working not correctly (33)!\n"); + 
printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY-1; x2++) { + ix = g_ipt[x0][LX-1][x2][LZ-1]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d2 != (double)(g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d3 != (double)(g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d4 != (double)(g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d5 != (double)(g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d6 != (double)(g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d7 != (double)(g_nb_x_up + g_nb_z_up) || + df0[ix][mu].d8 != (double)(g_nb_x_up + g_nb_z_up)){ + printf("Exchange of derivatives is working not correctly (34)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX-1; x1++) { + ix = g_ipt[x0][x1][LY-1][LZ-1]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_z_up + g_nb_y_up) || + df0[ix][mu].d2 != (double)(g_nb_z_up + g_nb_y_up) || + df0[ix][mu].d3 != (double)(g_nb_z_up + g_nb_y_up) || + df0[ix][mu].d4 != (double)(g_nb_z_up + g_nb_y_up) || + df0[ix][mu].d5 != (double)(g_nb_z_up + g_nb_y_up) || + df0[ix][mu].d6 != (double)(g_nb_z_up + g_nb_y_up) || + df0[ix][mu].d7 != (double)(g_nb_z_up + g_nb_y_up) || + df0[ix][mu].d8 != (double)(g_nb_z_up + g_nb_y_up)){ + printf("Exchange of derivatives is working not correctly (35)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x0 = 0; x0 < T; x0++) { + for(x3 = 0; x3 < LZ-1; x3++) { + ix = g_ipt[x0][LX-1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d2 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d3 != (double)(g_nb_x_up + g_nb_y_up) || + 
df0[ix][mu].d4 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d5 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d6 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d7 != (double)(g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d8 != (double)(g_nb_x_up + g_nb_y_up)){ + printf("Exchange of derivatives is working not correctly (36)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x0 = 0; x0 < T; x0++) { + ix = g_ipt[x0][LX-1][LY-1][LZ-1]; + for(mu = 0; mu < 4; mu++) { + if(df0[ix][mu].d1 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d2 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d3 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d4 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d5 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d6 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d7 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up) || + df0[ix][mu].d8 != (double)(g_nb_z_up + g_nb_x_up + g_nb_y_up)){ + printf("Exchange of derivatives is working not correctly (37)!\n"); + printf("Aborting program!\n"); + fflush(stdout);fflush(stderr); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + +# endif + + if(g_proc_id == 0) { + printf("# The exchange routines are working correctly.\n"); + } + fflush(stdout); + fflush(stderr); + +#endif /* MPI */ + return(0); +} + +#else /* _INDEX_INDEP_GEOM */ + +int check_xchange() +{ +#ifdef XLC +#pragma execution_frequency(very_low) +#endif + +#ifdef MPI + double * x; + int i,ix, mu, x0, x1, x2, x3 = 0, k; + int mp, pm, mm, pp, di[4]; + + + for(k = 0; k < 1; k++) { + + /* Check the field exchange */ + /* Set the whole field to -1 */ + set_spinor_field(0, -1.); + + /* Set the internal boundary to g_cart_id */ + /* We need here g_lexic2eo, otherwise the test is useless... 
*/ + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[0][x1][x2][x3]] ], g_cart_id); + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[T-1][x1][x2][x3]] ], g_cart_id); + } + } + } + +# if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[x0][0][x2][x3]] ], g_cart_id); + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[x0][LX-1][x2][x3]] ], g_cart_id); + } + } + } +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[x0][x1][0][x3]] ], g_cart_id); + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[x0][x1][LY-1][x3]] ], g_cart_id); + } + } + } +# endif + + MPI_Barrier(MPI_COMM_WORLD); + xchange_field(g_spinor_field[0], 0); + MPI_Barrier(MPI_COMM_WORLD); + + x = (double*) &g_spinor_field[0][VOLUME/2]; + for(i = 0; i < LX*LY*LZ/2*24; i++, x++) { + if((int)(*x) != g_nb_t_up) { + printf("The exchange up of fields in time direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_spinor_field[0][(VOLUME+LX*LY*LZ)/2]; + for(i = 0; i < LX*LY*LZ/2*24; i++, x++) { + if((int)(*x) != g_nb_t_dn) { + printf("The exchange down of fields in time direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_dn); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + +# if ((defined PARALLELXT) || (defined PARALLELXYT) || (defined PARALLELXYZT)) + x = (double*) &g_spinor_field[0][(VOLUME+2*LX*LY*LZ)/2]; + for(i = 0; i < T*LY*LZ/2*24; i++, x++) { 
+ if((int)(*x) != g_nb_x_up) { + printf("The exchange up of fields in x direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_up); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_spinor_field[0][(VOLUME+2*LX*LY*LZ)/2+T*LY*LZ/2]; + for(i = 0; i < T*LY*LZ/2*24; i++, x++) { + if((int)(*x) != g_nb_x_dn) { + printf("The exchange down of fields in x direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_dn); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + x = (double*) &g_spinor_field[0][(VOLUME+2*LX*LY*LZ)/2+2*T*LY*LZ/2]; + for(i = 0; i < T*LX*LZ/2*24; i++, x++) { + if((int)(*x) != g_nb_y_up) { + printf("The exchange up of fields in y direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_up); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_spinor_field[0][(VOLUME+2*LX*LY*LZ)/2+2*T*LY*LZ/2+T*LX*LZ/2]; + for(i = 0; i < T*LX*LZ/2*24; i++, x++) { + if((int)(*x) != g_nb_y_dn) { + printf("The exchange down of fields in y direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_dn); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if (defined PARALLELXYZT) + set_spinor_field(0, -1.); + + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + set_spinor_point(&g_spinor_field[0][ g_lexic2eosub[g_ipt[x0][x1][x2][0]] ], g_cart_id); + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[x0][x1][x2][LZ-1]] ], g_cart_id); + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); + xchange_field(g_spinor_field[0],1); + MPI_Barrier(MPI_COMM_WORLD); + + x = (double*) &g_spinor_field[0][VOLUME/2 + 2*LX*LY*LZ/2 + 2*T*LY*LZ/2 + 2*T*LX*LZ/2]; + for(i = 
0; i < T*LX*LY/2*24; i++, x++) { + if((int)(*x) != g_nb_z_up) { + printf("The exchange up of fields in z (1) direction up\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_up); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_spinor_field[0][VOLUME/2 + 2*LX*LY*LZ/2 + 2*T*LY*LZ/2 + 2*T*LX*LZ/2 + T*LX*LY/2]; + for(i = 0; i < T*LX*LY/2*24; i++, x++) { + if((int)(*x) != g_nb_z_dn) { + printf("The exchange down of fields in z (1) direction down\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_dn); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + set_spinor_field(0, -1.); + + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + set_spinor_point(&g_spinor_field[0][ g_lexic2eo[g_ipt[x0][x1][x2][0]] ], g_cart_id); + set_spinor_point(&g_spinor_field[0][ g_lexic2eosub[g_ipt[x0][x1][x2][LZ-1]] ], g_cart_id); + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); + xchange_field(g_spinor_field[0],1); + MPI_Barrier(MPI_COMM_WORLD); + + x = (double*) &g_spinor_field[0][VOLUME/2 + 2*LX*LY*LZ/2 + 2*T*LY*LZ/2 + 2*T*LX*LZ/2]; + for(i = 0; i < T*LX*LY/2*24; i++, x++) { + if((int)(*x) != g_nb_z_up) { + printf("The exchange up of fields in z (0) direction up\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_up); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_spinor_field[0][VOLUME/2 + 2*LX*LY*LZ/2 + 2*T*LY*LZ/2 + 2*T*LX*LZ/2 + T*LX*LY/2]; + for(i = 0; i < T*LX*LY/2*24; i++, x++) { + if((int)(*x) != g_nb_z_dn) { + printf("The exchange down of fields in z (0) direction down\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_dn); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + + + + if(g_proc_id == 0) { + printf("# Exchange of 
spinor fields checked successfully!\n"); + } + + /* Check the gauge exchange */ + + set_gauge_field(-1.); + + /* Set the time boundary */ + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[0][x1][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][x2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + } + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* Set the x boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][0][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][x2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + } +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* Set the y boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][x1][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + } +# endif + +# if (defined PARALLELXYZT) + /* Set the z boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][x1][x2][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + } + } + } + } +# endif + + MPI_Barrier(MPI_COMM_WORLD); + xchange_gauge(g_gauge_field); + MPI_Barrier(MPI_COMM_WORLD); + + x = (double*) &g_gauge_field[T*LX*LY*LZ][0]; + for(i = 0; i < LX*LY*LZ*72; i++, x++) { + if((int)(*x) != g_nb_t_up) { + printf("The exchange up of gaugefields in time direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + 
exit(0); + } + } + + x = (double*) &g_gauge_field[(T+1)*LX*LY*LZ][0]; + for(i = 0; i < LX*LZ*LY*72; i++, x++) { + if((int)(*x) != g_nb_t_dn) { + printf("The exchange down of gaugefields in time direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_dn); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + x = (double*) &g_gauge_field[(T+2)*LX*LY*LZ][0]; + for(i = 0; i < T*LY*LZ*72; i++, x++) { + if((int)(*x) != g_nb_x_up) { + printf("The exchange up of gaugefields in x direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_up); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_gauge_field[(T+2)*LX*LY*LZ+T*LY*LZ][0]; + for(i = 0; i < T*LY*LZ*72; i++, x++) { + if((int)(*x) != g_nb_x_dn) { + printf("The exchange down of gaugefields in x direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_dn); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + x = (double*) &g_gauge_field[(T+2)*LX*LY*LZ + 2*T*LZ*LY][0]; + for(i = 0; i < T*LX*LZ*72; i++, x++) { + if((int)(*x) != g_nb_y_up) { + printf("The exchange up of gaugefields in y direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_up); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_gauge_field[(T+2)*LX*LY*LZ+2*T*LY*LZ+T*LX*LZ][0]; + for(i = 0; i < T*LX*LZ*72; i++, x++) { + if((int)(*x) != g_nb_y_dn) { + printf("The exchange down of gaugefields in y direction\n"); + printf("between %d and %d is not correct\n", 
g_cart_id, g_nb_y_dn); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if (defined PARALLELXYZT) + x = (double*) g_gauge_field[VOLUME + 2*LX*LY*LZ + 2*T*LZ*LY + 2*T*LX*LZ]; + for(i = 0; i < T*LX*LY*72; i++, x++) { + if((int)(*x) != g_nb_z_up) { + printf("The exchange up of gaugefields in z direction up\n"); + printf("between %d and %d is not correct, down is %d\n", g_cart_id, g_nb_z_up, g_nb_z_dn); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_gauge_field[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY][0]; + for(i = 0; i < T*LX*LY*72; i++, x++) { + if((int)(*x) != g_nb_z_dn) { + printf("The exchange down of gaugefields in z direction down\n"); + printf("between %d and %d is not correct, up is %d\n", g_cart_id, g_nb_z_dn, g_nb_z_up); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + + set_gauge_field(-1.); + + /* Set the x boundary */ + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[0][0][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][0][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0][LX-1][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][LX-1][x2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + + /* Set the y boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][0][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][0][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][LY-1][x3] ][mu] = 
set_su3((double)g_cart_id); + } + } + } + + /* Set the t boundary */ + for(x1 = 0; x1 < LX; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[0][x1][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0][x1][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + + /* Set the z boundary */ + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[0][x1][x2][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][x2][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0][x1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + } + } + } + + /* Set the z boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY; x2++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][0][x2][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][x2][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][0][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + } + } + } + + /* Set the z boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][x1][0][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-1][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][0][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-1][LZ-1] ][mu] = set_su3((double)g_cart_id); + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); + xchange_gauge(g_gauge_field); + MPI_Barrier(MPI_COMM_WORLD); + + /* The edges */ +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + fprintf(stdout, "# 
Rank: %d, (c0, c1, c2, c3) = (%d, %d, %d, %d)\n",g_proc_id,g_proc_coords[0],g_proc_coords[1],g_proc_coords[2],g_proc_coords[3]); + fflush(stdout); + + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = g_proc_coords[2]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + MPI_Cart_rank(g_cart_grid, di, &pp); + + + x = (double*) g_gauge_field[VOLUME + RAND]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (xt) in direction +x+t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + LY*LZ]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (xt) in direction -x+t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 2*LY*LZ]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (xt) in direction +x-t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 
3*LY*LZ]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (xt) in direction -x-t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[0] = g_proc_coords[0]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (yx) in direction +y+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + T*LZ]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (yx) in direction -y+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 2*T*LZ]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields 
edges (yx) in direction +y-x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 3*T*LZ]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (yx) in direction -y-x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[1] = g_proc_coords[1]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (ty) in direction +t+y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + LX*LZ]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (ty) in direction -t+y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), 
mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 2*LX*LZ]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (ty) in direction +t-y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 3*LX*LZ]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (ty) in direction -t-y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif +# ifdef PARALLELXYZT + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + di[0] = g_proc_coords[0]; + di[2] = g_proc_coords[2]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pp); + /*xz-edge */ + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (zx) in direction +z+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + 
exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + T*LY]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (zx) in direction -z+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 2*T*LY]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (zx) in direction +z-x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 3*T*LY]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (zx) in direction -z-x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + di[1] = g_proc_coords[1]; + di[2] = g_proc_coords[2]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY]; + 
for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (tz) in direction +t+z\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + LX*LY]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (tz) in direction -t+z\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 2*LX*LY]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (tz) in direction +t-z\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 3*LX*LY]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (tz) in direction -t-z\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + di[1] = g_proc_coords[1]; + di[0] = g_proc_coords[0]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + 
MPI_Cart_rank(g_cart_grid, di, &mp); + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY]; + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (tz) in direction +y+z\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY + T*LX]; + for(i = 0; i < LX*T*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (tz) in direction +y+z\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY + 2*T*LX]; + for(i = 0; i < LX*T*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (tz) in direction -y-z\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY + 3*T*LX]; + for(i = 0; i < LX*T*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (tz) in direction -y-z\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", 
g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + + if(g_dbw2rand > 0) { + set_gauge_field(-1.); + + /* Set the t2 boundary */ + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[1][x1][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-2][x1][x2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + } + + /* Set the x2 boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][1][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-2][x2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + } + + /* Set the y2 boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][x1][1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + } + + /* Set the z2 boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][x1][x2][1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][x2][LZ-2] ][mu] = set_su3((double)g_cart_id); + } + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); + xchange_gauge(g_gauge_field); + MPI_Barrier(MPI_COMM_WORLD); + + x = (double*) &g_gauge_field[VOLUMEPLUSRAND][0]; + for(i = 0; i < LX*LY*LZ*72; i++, x++) { + if((int)(*x) != g_nb_t_up) { + printf("The exchange up of gaugefields in time direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_gauge_field[VOLUMEPLUSRAND+LX*LY*LZ][0]; + for(i = 0; i < 
LX*LY*LZ*72; i++, x++) { + if((int)(*x) != g_nb_t_dn) { + printf("The exchange up of gaugefields in time direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + x = (double*) &g_gauge_field[VOLUMEPLUSRAND + 2*LX*LY*LZ][0]; + for(i = 0; i < T*LY*LZ*72; i++, x++) { + if((int)(*x) != g_nb_x_up) { + printf("The exchange up of gaugefields in x direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_up); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_gauge_field[VOLUMEPLUSRAND + 2*LX*LY*LZ+T*LY*LZ][0]; + for(i = 0; i < T*LY*LZ*72; i++, x++) { + if((int)(*x) != g_nb_x_dn) { + printf("The exchange down of gaugefields in x direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_x_dn); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + x = (double*) &g_gauge_field[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LZ*LY][0]; + for(i = 0; i < T*LX*LZ*72; i++, x++) { + if((int)(*x) != g_nb_y_up) { + printf("The exchange up of gaugefields in y direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_up); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_gauge_field[VOLUMEPLUSRAND + 2*LX*LY*LZ+2*T*LY*LZ+T*LX*LZ][0]; + for(i = 0; i < T*LX*LZ*72; i++, x++) { + if((int)(*x) != g_nb_y_dn) { + printf("The exchange down of gaugefields in y direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_y_dn); + printf("%d %d 
%d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if (defined PARALLELXYZT) + x = (double*) &g_gauge_field[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LZ*LY + 2*T*LX*LZ][0]; + for(i = 0; i < T*LX*LY*72; i++, x++) { + if((int)(*x) != g_nb_z_up) { + printf("The exchange up of gaugefields in z direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_up); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) &g_gauge_field[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY][0]; + for(i = 0; i < T*LX*LY*72; i++, x++) { + if((int)(*x) != g_nb_z_dn) { + printf("The exchange down of gaugefields in y direction\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_z_dn); + printf("%d %d %d\n", g_cart_id, i, (int)(*x)); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + + + +# if defined PARALLELXYZT + + set_gauge_field(-1.); + + /* Set the tz boundary */ + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[1 ][x1][x2][0 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0 ][x1][x2][1 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-2][x1][x2][0 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][x2][1 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[1 ][x1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0 ][x1][x2][LZ-2] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-2][x1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][x2][LZ-2] ][mu] = set_su3((double)g_cart_id); + } + } + } + + MPI_Barrier(MPI_COMM_WORLD); + xchange_gauge(g_gauge_field); + MPI_Barrier(MPI_COMM_WORLD); + + /* Now there should be 
in the t and t2 Rand certain values set */ + + /* t-Rand (x1*LY + x2)*LZ + x3 */ + /* Hier sollte also x3=1 und x3=LZ-2 gesetzt sein */ + /* t2-Rand (x1*LY + x2)*LZ + x3 */ + /* Hier sollte also x3=0 und x3=LZ-1 gesetzt sein */ + for(x1 = 0; x1 < LX; x1 ++) { + for(x2 = 0; x2 < LY; x2 ++) { + x3 = 1; + x = (double*) g_gauge_field[VOLUME + x3 + (x1*LY+x2)*LZ]; + for(i = 0; i < 72; i++, x++) { + if((int)(*x) != g_nb_t_up) { + printf("The exchange of t1 Rand for gaugefields t-up z=1\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_up); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + x3 = LZ-2; + x = (double*) g_gauge_field[VOLUME + x3 + (x1*LY+x2)*LZ]; + for(i = 0; i < 72; i++, x++) { + if((int)(*x) != g_nb_t_up) { + printf("The exchange of t1 Rand for gaugefields t-up z=LZ-2\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_up); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + x3 = 1; + x = (double*) g_gauge_field[VOLUME + LX*LY*LZ + x3 + (x1*LY+x2)*LZ]; + for(i = 0; i < 72; i++, x++) { + if((int)(*x) != g_nb_t_dn) { + printf("The exchange of t1 Rand for gaugefields t-down z=1\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_dn); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_dn); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + x3 = LZ-2; + x = (double*) g_gauge_field[VOLUME + LX*LY*LZ + x3 + (x1*LY+x2)*LZ]; + for(i = 0; i < 72; i++, x++) { + if((int)(*x) != g_nb_t_dn) { + printf("The exchange of t1 Rand for gaugefields t-down z=LZ-2\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_dn); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_dn); + printf("Program aborted\n"); 
+ MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x3 = 0; + x = (double*) g_gauge_field[VOLUMEPLUSRAND + x3 + (x1*LY+x2)*LZ]; + for(i = 0; i < 72; i++, x++) { + if((int)(*x) != g_nb_t_up) { + printf("The exchange of t2 Rand for gaugefields t-up z=0\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_up); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + x3 = LZ-1; + x = (double*) g_gauge_field[VOLUMEPLUSRAND + x3 + (x1*LY+x2)*LZ]; + for(i = 0; i < 72; i++, x++) { + if((int)(*x) != g_nb_t_up) { + printf("The exchange of t2 Rand for gaugefields t-up z=LZ-1\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_up); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_up); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + x3 = 0; + x = (double*) g_gauge_field[VOLUMEPLUSRAND + LX*LY*LZ + x3 + (x1*LY+x2)*LZ]; + for(i = 0; i < 72; i++, x++) { + if((int)(*x) != g_nb_t_dn) { + printf("The exchange of t2 Rand for gaugefields t-down z=0\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_dn); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_dn); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + x3 = LZ-1; + x = (double*) g_gauge_field[VOLUMEPLUSRAND + LX*LY*LZ + x3 + (x1*LY+x2)*LZ]; + for(i = 0; i < 72; i++, x++) { + if((int)(*x) != g_nb_t_dn) { + printf("The exchange of t2 Rand for gaugefields t-down z=LZ-1\n"); + printf("between %d and %d is not correct\n", g_cart_id, g_nb_t_dn); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), g_nb_t_dn); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + +# endif + + set_gauge_field(-1.); + + /* Set the edges */ + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; 
x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[1][0][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0][1][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][1][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-2][0][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[1][LX-1][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0][LX-2][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][LX-2][x2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-2][LX-1][x2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + + /* Set the y boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][1][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][0][1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-2][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][1][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][0][LY-2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-2][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][LY-2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } + + /* Set the t boundary */ + for(x1 = 0; x1 < LX; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[1][x1][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0][x1][1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-2][x1][0][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[1][x1][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0][x1][LY-2][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ 
g_ipt[T-2][x1][LY-1][x3] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][LY-2][x3] ][mu] = set_su3((double)g_cart_id); + } + } + } +# if defined PARALLELXYZT + /* Set the tz boundary */ + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[1 ][x1][x2][0 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0 ][x1][x2][1 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-2][x1][x2][0 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][x2][1 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[1 ][x1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[0 ][x1][x2][LZ-2] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-2][x1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[T-1][x1][x2][LZ-2] ][mu] = set_su3((double)g_cart_id); + } + } + } + + /* Set the yz boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][x1][1 ][0 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][0 ][1 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-2][0 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-1][1 ] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][1 ][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][0 ][LZ-2] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][x1][LY-1][LZ-2] ][mu] = set_su3((double)g_cart_id); + } + } + } + + /* Set the xz boundary */ + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY; x2++) { + for (mu = 0; mu < 4; mu++) { + g_gauge_field[ g_ipt[x0][1][x2][0] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][0][x2][1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-2][x2][0] ][mu] = 
set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][x2][1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][1][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][0][x2][LZ-2] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-2][x2][LZ-1] ][mu] = set_su3((double)g_cart_id); + g_gauge_field[ g_ipt[x0][LX-1][x2][LZ-2] ][mu] = set_su3((double)g_cart_id); + } + } + } +# endif + MPI_Barrier(MPI_COMM_WORLD); + xchange_gauge(g_gauge_field); + MPI_Barrier(MPI_COMM_WORLD); + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = g_proc_coords[2]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (xt) in direction +x+2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + LY*LZ]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (xt) in direction -x+2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + 
} + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 2*LY*LZ]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (xt) in direction +x-2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 3*LY*LZ]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (xt) in direction -x-2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 4*LY*LZ]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (xt) in direction +2x+t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 5*LY*LZ]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (xt) in direction -2x+t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 6*LY*LZ]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (xt) in direction +2x-t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", 
g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 7*LY*LZ]; + for(i = 0; i < LY*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (xt) in direction -2x-t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[0] = g_proc_coords[0]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (yx) in direction +y+2x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + T*LZ]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (yx) in direction -y+2x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); 
MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 2*T*LZ]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (yx) in direction +y-2x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 3*T*LZ]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (yx) in direction -y-2x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 4*T*LZ]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (yx) in direction +2y+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 5*T*LZ]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (yx) in direction -2y+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 6*T*LZ]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (yx) in direction +2y-x\n"); + printf("between %d and 
%d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 7*T*LZ]; + for(i = 0; i < T*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (yx) in direction -2y-x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[1] = g_proc_coords[1]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (ty) in direction +2t+y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 1*LX*LZ]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (ty) in direction -2t+y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program 
aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 2*LX*LZ]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (ty) in direction +2t-y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 3*LX*LZ]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (ty) in direction -2t-y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 4*LX*LZ]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (ty) in direction +t+2y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 5*LX*LZ]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (ty) in direction -t+2y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 6*LX*LZ]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != pm) { 
+ printf("The exchange of gaugefields edges (ty) in direction +t-2y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 7*LX*LZ]; + for(i = 0; i < LX*LZ*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (ty) in direction -t-2y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } +# endif +# if defined PARALLELXYZT + + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + di[1] = g_proc_coords[1]; + di[2] = g_proc_coords[2]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (tz) in direction +z+2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + LX*LY]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (tz) in direction 
+z-2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 2*LX*LY]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (tz) in direction -z+2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 3*LX*LY]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (tz) in direction -z-2t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 4*LX*LY]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (zt) in direction +2z+t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 5*LX*LY]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (zt) in direction +2z-t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + 
MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 6*LX*LY]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (zt) in direction -2z+t\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 7*LX*LY]; + for(i = 0; i < LX*LY*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (zt) in direction -2z-t\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + /* zx-edge */ + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + di[0] = g_proc_coords[0]; + di[2] = g_proc_coords[2]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (zx) in direction +2z+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + 
} + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + T*LY]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (zx) in direction -2z+x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 2*T*LY]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (zx) in direction +2z-x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 3*T*LY]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (zx) in direction -2z-x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 4*T*LY]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (xz) in direction +z+2x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 5*T*LY]; + for(i = 0; i < T*LY*72; i++, x++) { + 
if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (xz) in direction -z+2x\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 6*T*LY]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (xz) in direction +z-2x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 7*T*LY]; + for(i = 0; i < T*LY*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (xz) in direction -z-2x\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + /* zy-edge */ + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + di[0] = g_proc_coords[0]; + di[1] = g_proc_coords[1]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[3] = (g_proc_coords[3] - 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[3] = (g_proc_coords[3] + 1)%g_nproc_z; + MPI_Cart_rank(g_cart_grid, di, &pp); + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY]; + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != pp) { + 
printf("The exchange of gaugefields edges (zy) in direction +2z+y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + T*LX]; + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (zy) in direction -2z+y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 2*T*LX]; + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (zy) in direction +2z-y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 3*T*LX]; + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (zy) in direction -2z-y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 4*T*LX]; + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != pp) { + printf("The exchange of gaugefields edges (yz) in direction +z+2y\n"); + printf("between %d and %d is not correct\n", 
g_cart_id, pp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 5*T*LX]; + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != pm) { + printf("The exchange of gaugefields edges (yz) in direction -z+2y\n"); + printf("between %d and %d is not correct\n", g_cart_id, pm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), pm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 6*T*LX]; + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != mp) { + printf("The exchange of gaugefields edges (yz) in direction +z-2y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mp); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mp); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + + x = (double*) g_gauge_field[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 7*T*LX]; + for(i = 0; i < T*LX*72; i++, x++) { + if((int)(*x) != mm) { + printf("The exchange of gaugefields edges (yz) in direction -z-2y\n"); + printf("between %d and %d is not correct\n", g_cart_id, mm); + printf("%d %d (%d != %d)\n", g_cart_id, i, (int)(*x), mm); + printf("Program aborted\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + +# endif + if(g_proc_id == 0) { + printf("# Exchange of rectangular gauge action boundaries checked successfully!\n"); + } + + } /* dbw2 */ + + if(g_proc_id == 0) { + printf("# Exchange of gauge fields checked successfully!\n"); + printf("# Starting check of deri...\n"); + } + + /* Check the deri exchange */ + + for(ix = 0; ix < VOLUMEPLUSRAND; ix++) { + for(mu=0; mu<4; mu++) { + x = (double*)&ddummy[ix][mu]; 
+ for(int j = 0; j < 8; j++) { + x[j] = 0.; + } + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = 0.; + } + } + } + + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_idn[ g_ipt[0][x1][x2][x3] ][0]; + for(mu = 0; mu < 4; mu++){ + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + ix = g_iup[ g_ipt[T-1][x1][x2][x3] ][0]; + for(mu = 0; mu < 4; mu++){ + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + } + } + } +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + for(x0 = 0; x0 < T; x0++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_idn[ g_ipt[x0][0][x2][x3] ][1]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + ix = g_iup[ g_ipt[x0][LX-1][x2][x3] ][1]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + } + } + } +# endif +# if (defined PARALLELXYT || defined PARALLELXYZT) + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_idn[ g_ipt[x0][x1][0][x3] ][2]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + ix = g_iup[ g_ipt[x0][x1][LY-1][x3] ][2]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + } + } + } +# endif +# if defined PARALLELXYZT + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + ix = g_idn[ g_ipt[x0][x1][x2][0] ][3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + ix = g_iup[ g_ipt[x0][x1][x2][LZ-1] ][3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int 
j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + } + } + } +# endif + + MPI_Barrier(MPI_COMM_WORLD); + xchange_deri(df0); + MPI_Barrier(MPI_COMM_WORLD); + +# if defined PARALLELT + for(x1 = 0; x1 < LX; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][x1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_t_up) { + printf("Exchange of derivatives is working not correctly (1u)!\n"); + printf("Aborting program!"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[0][x1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_t_dn) { + printf("Exchange of derivatives is working not correctly (1d)!\n"); + printf("Aborting program!"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + } +# endif +# if defined PARALLELXT + for(x1 = 1; x1 < LX-1; x1++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][x1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_t_up) { + printf("Exchange of derivatives is working not correctly (2u)!\n"); + printf("Aborting program!"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[0][x1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_t_dn) { + printf("Exchange of derivatives is working not correctly (2d)!\n"); + printf("Aborting program!"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + } + MPI_Barrier(MPI_COMM_WORLD); + for(x0 = 1; x0 < T-1; x0++) { + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[x0][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != 
g_nb_x_up) { + printf("Exchange of derivatives is working not correctly (3u)!\n"); + printf("Aborting program!"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[x0][0][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_dn) { + printf("Exchange of derivatives is working not correctly (3d)!\n"); + printf("Aborting program!"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + } + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_up + g_nb_t_up) { + printf("Exchange of derivatives is working not correctly (4uu)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[0][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_up + g_nb_t_dn) { + printf("Exchange of derivatives is working not correctly (4ud)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[T-1][0][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_dn + g_nb_t_up) { + printf("Exchange of derivatives is working not correctly (4du)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[0][0][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_dn + g_nb_t_dn) { + printf("Exchange of derivatives is working not correctly (4dd)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } +# endif +# if defined PARALLELXYT + for(x1 = 1; x1 < 
LX-1; x1++) { + for(x2 = 1; x2 < LY-1; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][x1][x2][x3]; + for(mu = 0; mu < 4; mu++){ + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_t_up) { + printf("Exchange of derivatives is working not correctly (5u)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[0][x1][x2][x3]; + for(mu = 0; mu < 4; mu++){ + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_t_dn) { + printf("Exchange of derivatives is working not correctly (5d)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + } + for(x0 = 1; x0 < T-1; x0++) { + for(x2 = 1; x2 < LY-1; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[x0][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++){ + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_up) { + printf("Exchange of derivatives is working not correctly (6u)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[x0][0][x2][x3]; + for(mu = 0; mu < 4; mu++){ + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_dn) { + printf("Exchange of derivatives is working not correctly (6d)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + } + for(x0 = 1; x0 < T-1; x0++) { + for(x1 = 1; x1 < LX-1; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[x0][x1][LY-1][x3]; + for(mu = 0; mu < 4; mu++){ + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_y_up) { + printf("Exchange of derivatives is working not correctly (7u)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[x0][x1][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = 
(double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_y_dn) { + printf("Exchange of derivatives is working not correctly (7d)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + } + for(x2 = 1; x2 < LY-1; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_up + g_nb_t_up) { + printf("Exchange of derivatives is working not correctly (8uu)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[0][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_up + g_nb_t_dn) { + printf("Exchange of derivatives is working not correctly (8ud)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[T-1][0][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_dn + g_nb_t_up) { + printf("Exchange of derivatives is working not correctly (8du)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[0][0][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_dn + g_nb_t_dn) { + printf("Exchange of derivatives is working not correctly (8dd)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x1 = 1; x1 < LX-1; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][x1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_t_up + g_nb_y_up) { + printf("Exchange of derivatives is working not correctly (9uu)!\n"); + 
printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[T-1][x1][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_t_up + g_nb_y_dn) { + printf("Exchange of derivatives is working not correctly (9ud)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[0][x1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_t_dn + g_nb_y_up) { + printf("Exchange of derivatives is working not correctly (9du)!\n"); + printf("%d %d %d %d %d %d %d\n", (int)x[j], g_nb_t_dn, g_nb_t_up, g_nb_y_dn, g_nb_y_up, x1, x3); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[0][x1][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_t_dn + g_nb_y_dn) { + printf("Exchange of derivatives is working not correctly (9dd)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x0 = 1; x0 < T-1; x0++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[x0][LX-1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_up + g_nb_y_up) { + printf("Exchange of derivatives is working not correctly (10uu)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[x0][LX-1][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_up + g_nb_y_dn) { + printf("Exchange of derivatives is working not correctly (10ud)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[x0][0][LY-1][x3]; 
+ for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_dn + g_nb_y_up) { + printf("Exchange of derivatives is working not correctly (10du)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[x0][0][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_dn + g_nb_y_dn) { + printf("Exchange of derivatives is working not correctly (10dd)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][LX-1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_up + g_nb_y_up + g_nb_t_up) { + printf("Exchange of derivatives is working not correctly (11uuu)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[T-1][0][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_dn + g_nb_y_up + g_nb_t_up) { + printf("Exchange of derivatives is working not correctly (11duu)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[0][0][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_dn + g_nb_y_up + g_nb_t_dn) { + printf("Exchange of derivatives is working not correctly (11dud)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[T-1][0][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_dn + g_nb_y_dn + g_nb_t_up) { + printf("Exchange of derivatives is working not correctly (11ddu)!\n"); + 
printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[0][LX-1][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_up + g_nb_y_dn + g_nb_t_dn) { + printf("Exchange of derivatives is working not correctly (11udd)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[0][LX-1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_up + g_nb_y_up + g_nb_t_dn) { + printf("Exchange of derivatives is working not correctly (11uud)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[T-1][LX-1][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_up + g_nb_y_dn + g_nb_t_up) { + printf("Exchange of derivatives is working not correctly (11udu)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[0][0][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_dn + g_nb_y_dn + g_nb_t_dn) { + printf("Exchange of derivatives is working not correctly (11ddd)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x0 = 1; x0 < T-1; x0++) { + for(x1 = 1; x1 < LX-1; x1++) { + for(x2 = 1; x2 < LY-1; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[x0][x1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != 0) { + printf("Exchange of derivatives is working not correctly (bulk XYT)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + } + } + + +# endif + 
+# if defined PARALLELXYZT + for(x1 = 1; x1 < LX-1; x1++) { + for(x2 = 1; x2 < LY-1; x2++) { + for(x3 = 1; x3 < LZ-1; x3++) { + ix = g_ipt[T-1][x1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_t_up) { + printf("Exchange of derivatives is working not correctly (12)!\n"); + printf("%d %d %d %d %d\n", x1, x2, x3, ix, g_proc_id); + printf("%f %d %d\n", df0[ix][mu].d8, g_nb_t_up, g_nb_t_dn); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + } + for(x0 = 1; x0 < T-1; x0++) { + for(x2 = 1; x2 < LY-1; x2++) { + for(x3 = 1; x3 < LZ-1; x3++) { + ix = g_ipt[x0][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_x_up) { + printf("Exchange of derivatives is working not correctly (13)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + } + for(x0 = 1; x0 < T-1; x0++) { + for(x1 = 1; x1 < LX-1; x1++) { + for(x3 = 1; x3 < LZ-1; x3++) { + ix = g_ipt[x0][x1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_y_up) { + printf("Exchange of derivatives is working not correctly (14)!\n"); + printf("%d %d %d %d %d\n", x0, x1, x3, ix, g_proc_id); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + } + for(x0 = 1; x0 < T-1; x0++) { + for(x1 = 1; x1 < LX-1; x1++) { + for(x2 = 1; x2 < LY-1; x2++) { + ix = g_ipt[x0][x1][x2][LZ-1]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != g_nb_z_up) { + printf("Exchange of derivatives is working not correctly (15)!\n"); + printf("%d %d %d %d %d\n", x0, x1, x3, ix, g_proc_id); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } 
+ } + } + for(x2 = 1; x2 < LY-1; x2++) { + for(x3 = 1; x3 < LZ-1; x3++) { + ix = g_ipt[T-1][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != (g_nb_x_up + g_nb_t_up)) { + printf("Exchange of derivatives is working not correctly (16)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x1 = 1; x1 < LX-1; x1++) { + for(x3 = 1; x3 < LZ-1; x3++) { + ix = g_ipt[T-1][x1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != (g_nb_y_up + g_nb_t_up)) { + printf("Exchange of derivatives is working not correctly (17)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x0 = 1; x0 < T-1; x0++) { + for(x3 = 1; x3 < LZ-1; x3++) { + ix = g_ipt[x0][LX-1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != (g_nb_y_up + g_nb_x_up)) { + printf("Exchange of derivatives is working not correctly (18)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x0 = 1; x0 < T-1; x0++) { + for(x2 = 1; x2 < LY-1; x2++) { + ix = g_ipt[x0][LX-1][x2][LZ-1]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != (g_nb_x_up + g_nb_z_up)) { + printf("Exchange of derivatives is working not correctly (19)!\n"); + printf("%f %d %d %d\n", df0[ix][mu].d1, g_nb_x_up + g_nb_z_up, g_nb_x_up, g_nb_z_up); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x0 = 1; x0 < T-1; x0++) { + for(x1 = 1; x1 < LX-1; x1++) { + ix = g_ipt[x0][x1][LY-1][LZ-1]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != (g_nb_y_up + g_nb_z_up)) { + 
printf("Exchange of derivatives is working not correctly (20)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x1 = 1; x1 < LX-1; x1++) { + for(x2 = 1; x2 < LY-1; x2++) { + ix = g_ipt[T-1][x1][x2][LZ-1]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != (g_nb_t_up + g_nb_z_up)) { + printf("Exchange of derivatives is working not correctly (21)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + for(x3 = 1; x3 < LZ-1; x3++) { + ix = g_ipt[T-1][LX-1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != (g_nb_t_up + g_nb_x_up + g_nb_y_up)) { + printf("Exchange of derivatives is working not correctly (22)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x2 = 1; x2 < LY-1; x2++) { + ix = g_ipt[T-1][LX-1][x2][LZ-1]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != (g_nb_t_up + g_nb_x_up + g_nb_z_up)) { + printf("Exchange of derivatives is working not correctly (23)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x1 = 1; x1 < LX-1; x1++) { + ix = g_ipt[T-1][x1][LY-1][LZ-1]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != (g_nb_t_up + g_nb_z_up + g_nb_y_up)) { + printf("Exchange of derivatives is working not correctly (24)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x0 = 1; x0 < T-1; x0++) { + ix = g_ipt[x0][LX-1][LY-1][LZ-1]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != (g_nb_z_up + g_nb_x_up + g_nb_y_up)) { + 
printf("Exchange of derivatives is working not correctly (25)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + ix = g_ipt[T-1][LX-1][LY-1][LZ-1]; + for(mu = 0; mu < 4; mu++){ + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != (g_nb_z_up + g_nb_x_up + g_nb_y_up + g_nb_t_up)) { + printf("Exchange of derivatives is working not correctly (26)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + +# endif + + // edges + if(g_proc_id == 0) { + printf("# Setting edges\n"); + } + + for(ix = 0; ix < VOLUMEPLUSRAND; ix++) { + for(mu=0; mu<4; mu++) { + x = (double*)&ddummy[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = 0.; + } + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = 0.; + } + } + } + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + + //xt edge + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_iup[g_iup[ g_ipt[T-1][LX-1][x2][x3] ][1] ][0]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + ix = g_iup[g_idn[ g_ipt[T-1][0][x2][x3] ][1] ][0]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + ix = g_idn[g_iup[ g_ipt[0][LX-1][x2][x3] ][1] ][0]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + ix = g_idn[g_idn[ g_ipt[0][0][x2][x3] ][1] ][0]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + } + } +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + + // ty edge + for(x1 = 0; x1 < LX; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_iup[g_iup[ g_ipt[T-1][x1][LY-1][x3] ][2] ][0]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + 
for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + ix = g_iup[g_idn[ g_ipt[T-1][x1][0][x3] ][2] ][0]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + ix = g_idn[g_iup[ g_ipt[0][x1][LY-1][x3] ][2] ][0]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + ix = g_idn[g_idn[ g_ipt[0][x1][0][x3] ][2] ][0]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + } + } + + // xy edge + for(x0 = 0; x0 < T; x0++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_iup[g_iup[ g_ipt[x0][LX-1][LY-1][x3] ][2] ][1]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + ix = g_iup[g_idn[ g_ipt[x0][LX-1][0][x3] ][2] ][1]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + ix = g_idn[g_iup[ g_ipt[x0][0][LY-1][x3] ][2] ][1]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + ix = g_idn[g_idn[ g_ipt[x0][0][0][x3] ][2] ][1]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + x[j] = (double)g_cart_id; + } + } + } + } + + +# endif + + MPI_Barrier(MPI_COMM_WORLD); + xchange_deri(df0); + MPI_Barrier(MPI_COMM_WORLD); + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = g_proc_coords[2]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + 
MPI_Cart_rank(g_cart_grid, di, &pm); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + MPI_Cart_rank(g_cart_grid, di, &pp); + +#ifdef PARALLELXT + for(x2 = 0; x2 < LY; x2++) { + for(x3 = 0; x3 < LZ; x3++) { +#else + for(x2 = 1; x2 < LY-1; x2++) { + for(x3 = 0; x3 < LZ; x3++) { +#endif + ix = g_ipt[0][0][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != mm) { + printf("Exchange of derivatives is working not correctly (e5mm)!\n"); + printf("%f %d %d %d %d\n", x[j], g_nb_t_up, g_nb_x_up, pp, mm); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[0][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != mp) { + printf("Exchange of derivatives is working not correctly (e5mp)!\n"); + printf("%f %d %d %d %d\n", x[j], g_nb_t_up, g_nb_x_up, pm, mp); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[T-1][0][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != pm) { + printf("Exchange of derivatives is working not correctly (e5pm)!\n"); + printf("%f %d %d %d %d\n", x[j], g_nb_t_up, g_nb_x_up, pm, mp); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[T-1][LX-1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != pp) { + printf("Exchange of derivatives is working not correctly (e5pp)!\n"); + printf("%f %d %d %d %d\n", x[j], g_nb_t_up, g_nb_x_up, pp, mm); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + + // xy-edge + di[1] = (g_proc_coords[1] - 
1)%g_nproc_x; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[0] = g_proc_coords[0]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pp); + + for(x0 = 1; x0 < T-1; x0++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[x0][0][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != mm) { + printf("Exchange of derivatives is working not correctly (e6mm)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[x0][LX-1][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != pm) { + printf("Exchange of derivatives is working not correctly (e6pm)!\n"); + printf("%f %d %d %d %d\n", x[j], g_nb_x_up, g_nb_y_up, pm, mp); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[x0][0][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != mp) { + printf("Exchange of derivatives is working not correctly (e6mp)!\n"); + printf("%f %d %d %d %d\n", x[j], g_nb_x_up, g_nb_y_up, pm, mp); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[x0][LX-1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != pp) { + printf("Exchange of derivatives is working not correctly (e6pp)!\n"); + printf("%f %d %d %d %d\n", x[j], g_nb_x_up, g_nb_y_up, pp, mm); + printf("Aborting 
program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[1] = g_proc_coords[1]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pm); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + MPI_Cart_rank(g_cart_grid, di, &pp); + + for(x1 = 1; x1 < LX-1; x1++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[0][x1][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != mm) { + printf("Exchange of derivatives is working not correctly (e7mm)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[T-1][x1][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != pm) { + printf("Exchange of derivatives is working not correctly (e7pm)!\n"); + printf("%f %d %d %d %d\n", x[j], g_nb_t_up, g_nb_y_up, pm, pm); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[0][x1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != mp) { + printf("Exchange of derivatives is working not correctly (e7mp)!\n"); + printf("%f %d %d %d %d\n", x[j], g_nb_t_up, g_nb_y_up, pm, mp); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + ix = g_ipt[T-1][x1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != pp) { + printf("Exchange of derivatives is 
working not correctly (e7pp)!\n"); + printf("%f %d %d %d %d\n", x[j], g_nb_t_up, g_nb_y_up, pp, mm); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[1] = g_proc_coords[1]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = g_proc_coords[0]; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = g_proc_coords[2]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &pm); + + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[0][0][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != mm + mp + pm) { + printf("Exchange of derivatives is working not correctly (e8mmm)!\n"); + printf("%d %d %d %d %d\n", (int)x[j], mm, mp, pm, pp); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[1] = g_proc_coords[1]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = g_proc_coords[0]; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = g_proc_coords[2]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &pm); + + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][0][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != mm + mp + pm) { + printf("Exchange of derivatives is working not correctly (e8pmm)!\n"); + 
printf("%d %d %d %d %d\n", (int)x[j], mm, mp, pm, pp); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[1] = g_proc_coords[1]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = g_proc_coords[0]; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = g_proc_coords[2]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &pm); + + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][0][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != mm + mp + pm) { + printf("Exchange of derivatives is working not correctly (e8pmp)!\n"); + printf("%d %d %d %d %d\n", (int)x[j], mm, mp, pm, pp); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[1] = g_proc_coords[1]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = g_proc_coords[0]; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = g_proc_coords[2]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &pm); + + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][LX-1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != mm + mp + pm) { + printf("Exchange of derivatives is working not correctly (e8ppp)!\n"); + printf("%d %d %d %d %d\n", (int)x[j], 
mm, mp, pm, pp); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[1] = g_proc_coords[1]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = g_proc_coords[0]; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = g_proc_coords[2]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &pm); + + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[0][LX-1][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != mm + mp + pm) { + printf("Exchange of derivatives is working not correctly (e8mpm)!\n"); + printf("%d %d %d %d %d\n", (int)x[j], mm, mp, pm, pp); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[1] = g_proc_coords[1]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = g_proc_coords[0]; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[1] = (g_proc_coords[1] - 1)%g_nproc_x; + di[2] = g_proc_coords[2]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &pm); + + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[0][0][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != mm + mp + pm) { + printf("Exchange of derivatives is working not correctly (e8mmp)!\n"); + printf("%d %d %d %d %d\n", (int)x[j], mm, mp, pm, pp); + printf("Aborting 
program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[1] = g_proc_coords[1]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = g_proc_coords[0]; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = (g_proc_coords[2] + 1)%g_nproc_y; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] - 1)%g_nproc_t; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = g_proc_coords[2]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &pm); + + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[0][LX-1][LY-1][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != mm + mp + pm) { + printf("Exchange of derivatives is working not correctly (e8mpp)!\n"); + printf("%d %d %d %d %d\n", (int)x[j], mm, mp, pm, pp); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[1] = g_proc_coords[1]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mm); + di[0] = g_proc_coords[0]; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = (g_proc_coords[2] - 1)%g_nproc_y; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &mp); + di[0] = (g_proc_coords[0] + 1)%g_nproc_t; + di[1] = (g_proc_coords[1] + 1)%g_nproc_x; + di[2] = g_proc_coords[2]; + di[3] = g_proc_coords[3]; + MPI_Cart_rank(g_cart_grid, di, &pm); + + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[T-1][LX-1][0][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != mm + mp + pm) { + printf("Exchange of derivatives is working not correctly (e8ppm)!\n"); + printf("%d %d %d %d %d\n", (int)x[j], mm, mp, pm, pp); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 
5); MPI_Finalize(); + exit(0); + } + } + } + } + for(x0 = 1; x0 < T-1; x0++) { + for(x1 = 1; x1 < LX-1; x1++) { + for(x2 = 1; x2 < LY-1; x2++) { + for(x3 = 0; x3 < LZ; x3++) { + ix = g_ipt[x0][x1][x2][x3]; + for(mu = 0; mu < 4; mu++) { + x = (double*)&df0[ix][mu]; + for(int j = 0; j < 8; j++) { + if((int)x[j] != 0) { + printf("Exchange of derivatives is working not correctly (ebulk XYT)!\n"); + printf("Aborting program!\n"); + MPI_Abort(MPI_COMM_WORLD, 5); MPI_Finalize(); + exit(0); + } + } + } + } + } + } + } + + +# endif + + + if(g_proc_id == 0) { + printf("# The exchange routines are working correctly.\n"); + } + } /* for k=0, k<1 */ +#endif /* MPI */ + return(0); +} + + +#endif /* _INDEX_INDEP_GEOM */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test.README b/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test.README new file mode 100644 index 0000000000000000000000000000000000000000..0bcaaddcaee11a150b9bec1d908256213859d585 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test.README @@ -0,0 +1,27 @@ +hopping_test allows the debug of new version of the hopping matrix +(parallel or not) against a previous and reliable one. + +1. compile hopping_test.c (main directory hmc) using the reliable +Hopping_Matrix for a single processor and call the executable, say, +hopping_test_ref + +2. compile hopping_test.c using the new parallel Hopping_Matrix and +call it, say hopping_test_new + +3. use the script hopping_test_generate_script to generate input +random gauge configs, input random spinor configs and output random +spinor configs that are saved in the new directory confs. This is +repeated for all the choices of L and T specified in the loop inside +the script. The script will need also the template input files +hopping_test.input.start See comments in hopping_test_generate_script +for more info. + +4. 
use the script hopping_test_qscript to check that the new +Hopping_Matrix produces the same spinor output, after reading the +gauge and spinor input saved before. The script hopping_test_qscript +needs the specification of the number of processes and tries all +allowed parallelizations for all the lattice sizes specified in the +loop. Results are written in files res_$suff/out_* and +res_$suff/diff_*. The script will also need the template input files +hopping_test.input.new hopping_test.input.compare. See comments in +hopping_test_qscript for more info. diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test.input.compare b/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test.input.compare new file mode 100644 index 0000000000000000000000000000000000000000..a2d1af83863f5690e5ccdac966971706ce07f5c0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test.input.compare @@ -0,0 +1,19 @@ +# This only compares SourceFilename with SourceFilename.2 (no MPI) +T=TT +L=LL +NrXProcs=NX +NrYProcs=NY +NrZProcs=NZ + +UseEvenOdd = yes + + +#StartCondition can be: hot (random gauge), cold (unit gauge), continue (read from GaugeConfigInputFile), +#restart (random gauge, written on GaugeConfigInputFile). +Startcondition = cold +GaugeConfigInputFile = gaugeconf +#ReadSource:yes(read from SourceInputFilename), no(random), save(random spinor, written on SourceInputFilename). 
+ReadSource = yes +SourceFilename = spincolorfield + +WriteCheckpoints = yes \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test.input.new b/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test.input.new new file mode 100644 index 0000000000000000000000000000000000000000..d24f2fa1d248504af59b5249c5bf0b6e5648f9d1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test.input.new @@ -0,0 +1,19 @@ +T=TT +L=LL + +NrXProcs=NX +NrYProcs=NY +NrZProcs=NZ + +UseEvenOdd = yes + + +#StartCondition can be: hot (random gauge), cold (unit gauge), continue (read from GaugeConfigInputFile), +#restart (random gauge, written on GaugeConfigInputFile). +Startcondition = continue +GaugeConfigInputFile = gaugeconf +#ReadSource:yes(read from SourceInputFilename), no(random), save(random spinor, written on SourceInputFilename). +ReadSource = yes +SourceFilename = spincolorfield + +WriteCheckpoints = no \ No newline at end of file diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test.input.start b/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test.input.start new file mode 100644 index 0000000000000000000000000000000000000000..2bb825e286e64e09322595db52047fbbb971c3eb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test.input.start @@ -0,0 +1,20 @@ +T=TT +L=LL + +UseEvenOdd=yes + +NrXProcs=NX +NrYProcs=NY +NrZProcs=NZ + + +#StartCondition can be: hot (random gauge), cold (unit gauge), continue (read from GaugeConfigInputFile), +#restart (random gauge, written on GaugeConfigInputFile). +Startcondition = restart +GaugeConfigInputFile = gaugeconf +#ReadSource:yes(read from SourceInputFilename), no(random), save(random spinor, written on SourceInputFilename). 
+ReadSource = nobutsave +SourceFilename = spincolorfield + +WriteCheckpoints = no + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test_generate_script b/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test_generate_script new file mode 100755 index 0000000000000000000000000000000000000000..a770b858463ff965d6084335147909fc2202f4d4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test_generate_script @@ -0,0 +1,30 @@ +#!/bin/bash + + +# This script generates random gauge configurations, random spinor configurations +# for all the lattices specified in the loop below. +# For each lattice it also applies the Hopping matrix once (for even and once for odd) +# and saves the resulting output spinors. +# The companion script hopping_test_qscript verifies that the parallel version of the program +# produces the same result for all allowed parallelizations (with the number of procs and +# lattices specified there) + +program=hopping_test_ref + +mkdir -p confs + +for ll in 4 6 8 10 12 14 16 ; do + for tt in 4 5 6 7 8 9 10 16 20 24 ; do + + echo "$ll - $tt" + + rm -f spincolorfield spincolorfield.out gaugeconf + + sed s/TT/${tt}/g hopping_test.input.start | sed s/LL/${ll}/g | sed s/NX/1/g | sed s/NY/1/g | sed s/NZ/1/g > hopping_test.input + ./$program > confs/out_L${ll}T${tt} + mv spincolorfield confs/spincolorfield.in.L${ll}T${tt} + mv spincolorfield.out confs/spincolorfield.out.L${ll}T${tt} + mv gaugeconf confs/gaugeconf.L${ll}T${tt} + + done +done diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test_qscript b/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test_qscript new file mode 100755 index 0000000000000000000000000000000000000000..b0c49b85481731b9afd233b34dd23432a56e4389 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/hopping_test_qscript @@ -0,0 +1,102 @@ +#!/bin/zsh +# +#$ -N test +#$ -l h_cpu=02:30:00 +#$ -l h_vmem=800M +###$ -m bea scorzato@ect.it +#$ -e 
errfile +#$ -o outfile +#$ -pe mpi 8 + +####### +# This script should be run after hopping_test_generate_script (see explanations there) +# see hopping_test.README for a general overview +# see below for the explanation of the parameters +######## parameters to set: +suff=XYZtsp # suffix for the program name and also suffix of the directory name + # res_$suff containing the results +nxmax=100 # 1 if x direction is not parallelized, 100 otherwise +nymax=100 # 1 if y direction is not parallelized, 100 otherwise +nzmax=100 # 1 if z direction is not parallelized, 100 otherwise +ntmax=1 # 1 if t direction is not parallelized, 100 otherwise +################### + +joke=0 +ini openmpi +program=hopping_test_${suff} +procs=$NSLOTS +cd /afs/ifh.de/group/nic/scratch/pool1/scorzato/hmc-test +mkdir -p res_${suff} + +echo "nprocs: $procs, running: $program" > list + +for ll in 4 6 8 10 12 14 16 ; do + for tt in 4 5 6 7 8 9 10 16 20 24 ; do + +#ll=4 +#tt=5 + + for nx in `seq $(( $ll - 1 ))` ; do + if [ $nx -le $nxmax ] ; then + lx=$(($ll / $nx)) + if [ $(($nx * $lx)) -eq $ll ] ; then + for ny in `seq $(( $ll - 1 ))` ; do + if [ $ny -le $nymax ] ; then + ly=$(($ll / $ny)) + if [ $(($ny * $ly)) -eq $ll ] ; then + for nz in `seq $(( $ll - 1 ))` ; do + if [ $nz -le $nzmax ] ; then + lz=$(($ll / $nz)) + if [ $(($nz * $lz)) -eq $ll ] ; then + for nt in `seq $(( $tt - 1 ))` ; do + if [ $nt -le $ntmax ] ; then + lt=$(($tt / $nt)) + if [ $(($nt * $lt)) -eq $tt ] ; then + + if [ $(( $nx * $ny * $nz * $nt )) -eq $procs ] ; then + if [ $((2 * $(($lz / 2)))) -eq $lz ] ; then + svol=$(( $lx * $ly * $lt )) + if [ $((2 * $(($svol / 2)))) -eq $svol ] ; then + + echo "$ll - $tt : $nx - $ny - $nz - $nt : $procs" >> list + + + if [ $joke -eq 0 ] ; then + rm -f spincolorfield spincolorfield.2 spincolorfield.out gaugeconf + + cp confs/spincolorfield.in.L${ll}T${tt} spincolorfield + cp confs/gaugeconf.L${ll}T${tt} gaugeconf + + sed s/TT/${tt}/g hopping_test.input.new | sed s/LL/${ll}/g | sed 
s/NX/${nx}/g | sed s/NY/${ny}/g | sed s/NZ/${nz}/g > hopping_test.input + if [ $procs -eq 1 ] ; then + ./$program &> res_${suff}/out_L${ll}T${tt}_nx${nx}ny${ny}nz${nz}nt${nt}_P${procs} + else + mpirun --mca btl "^udapl" -np $NSLOTS $program &> res_${suff}/out_L${ll}T${tt}_nx${nx}ny${ny}nz${nz}nt${nt}_P${procs} + fi + mv spincolorfield.out spincolorfield.2 + cp confs/spincolorfield.out.L${ll}T${tt} spincolorfield + + sed s/TT/${tt}/g hopping_test.input.compare | sed s/LL/${ll}/g | sed s/NX/${nx}/g | sed s/NY/${ny}/g | sed s/NZ/${nz}/g > hopping_test.input + + ./hopping_test_refv &> res_${suff}/diff_L${ll}T${tt}_nx${nx}ny${ny}nz${nz}nt${nt}_P${procs} + fi + fi + fi + + fi + + fi + fi + done + fi + fi + done + fi + fi + done + fi + fi + done + + done +done diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/measure_rectangles.debug.c b/qcd/part_cpu/applications/QCD/src/kernel_D/test/measure_rectangles.debug.c new file mode 100644 index 0000000000000000000000000000000000000000..8eda8f018467fa2f33851335d152f8e327aa17eb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/measure_rectangles.debug.c @@ -0,0 +1,140 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +/******************************************************************* + * + * Here the 1x2 rectangles are implemented + * for renormalization group improved gauge + * actions like the DBW2 or the Iwasaki + * gauge action. + * + * 1/3 \sum_{\mu\leq\nu;\mu,nu=1}^4 Tr U^{1x2} + * + * author: Carsten Urbach + * + * + *******************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef MPI +#include +#endif +#include "global.h" +#include "sse.h" +#include "su3.h" +#include "su3adj.h" +#include "geometry_eo.h" +#include "measure_rectangles.h" + + +double measure_rectangles() { + int i, j, k, mu, nu; + int x, y, z, t; + static su3 pr1, pr2, tmp; + su3 *v = NULL , *w = NULL; + static double ga, ac, gas; + static double ks, kc, tr, ts, tt; + kc=0.0; ks=0.0; + double d = 0.; + FILE * debugfile; + char filename[100]; + + sprintf(filename,"debug_mr.s"); +#ifdef PARALLELT + sprintf(filename,"debug_mr.pt.%d", g_proc_id); +#endif +#ifdef PARALLELXT + sprintf(filename,"debug_mr.pxt.%d", g_proc_id); +#endif + debugfile = fopen(filename,"w"); + + for(x = 0; x < LX; x++) { + for(y = 0; y < LY; y++) { + for(z = 0; z < LZ; z++) { + for(t = 0; t < T; t++) { + i = g_ipt[t][x][y][z]; + + for (mu = 0; mu < 4; mu++) { + d = 0.; + for (nu = 0; nu < 4; nu++) { + if(nu != mu) { + /* + ^ + | + ^ + | + -> + */ + j = g_iup[i][mu]; + k = g_iup[j][nu]; + v = &g_gauge_field[i][mu]; + w = &g_gauge_field[j][nu]; + _su3_times_su3(tmp, *v, *w); + v = &g_gauge_field[k][nu]; + _su3_times_su3(pr1, tmp, *v); + /* + -> + ^ + | + ^ + | + */ + j = g_iup[i][nu]; + k = g_iup[j][nu]; + v = &g_gauge_field[i][nu]; + w = &g_gauge_field[j][nu]; + _su3_times_su3(tmp, *v, *w); + v = &g_gauge_field[k][mu]; + _su3_times_su3(pr2, tmp, *v); + + /* Trace it */ + _trace_su3_times_su3d(ac,pr1,pr2); + d += ac; + /* printf("i mu nu: %d %d %d, ac = %e\n", i, mu, nu, ac); */ 
+ /* Kahan summation */ + tr=ac+kc; + ts=tr+ks; + tt=ts-ks; + ks=ts; + kc=tr-tt; + } + } + fprintf(debugfile,"%d %d %d %d %d %e\n", + g_proc_coords[0]*T+t, g_proc_coords[1]*LX+x, y, z, mu, d); + } + + } + } + } + } +/* fprintf(debugfile,"###\n"); */ + fclose(debugfile); + ga=(kc+ks)/3.0; +#ifdef MPI + MPI_Allreduce(&ga, &gas, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD); + return gas; +#else + return ga; +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/overlaptests.c b/qcd/part_cpu/applications/QCD/src/kernel_D/test/overlaptests.c new file mode 100644 index 0000000000000000000000000000000000000000..d2748ec31c73f34747d7460de862766c60f99935 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/overlaptests.c @@ -0,0 +1,766 @@ +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef BENCHMARK +#include <../c-lime/include/lime.h> +#else +#include +#endif +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include +#ifdef MPI +#include +#endif +#include "global.h" +#include "su3spinor.h" +#include "linalg_eo.h" +#include "start.h" +#ifdef MPI +# include "xchange/xchange.h" +#endif +#include "read_input.h" +#include "boundary.h" +#include "linalg/convert_eo_to_lexic.h" +#include "operator/Dov_psi.h" + +#include "overlaptests.h" +#include "gamma.h" + +void ov_check_alloc(void * pS) { + + if (pS==NULL) { + fprintf(stderr, "Error: could not allocate memory for spinor field"); + exit(EXIT_FAILURE); + } + +} + +spinor** ov_alloc_spinors(int n) { + + spinor *_s, **s; + int i; + + s = (spinor**)calloc(n, sizeof(spinor*)); + ov_check_alloc(s); +#if ( defined SSE || defined SSE2 || defined SSE3) + _s = malloc((n*VOLUMEPLUSRAND)*sizeof(spinor)+ALIGN_BASE+sizeof(spinor*)); + ov_check_alloc(_s); + s[0] = (spinor *)(((unsigned long int)(_s) + sizeof(spinor*) + ALIGN_BASE)&~ALIGN_BASE); + *(((spinor**)s[0])-1) = _s; +#else + s[0] = malloc(n*VOLUMEPLUSRAND*sizeof(spinor)); + ov_check_alloc(s[0]); +#endif + + for(i = 1; i 
< n; i++) + s[i] = s[0]+VOLUMEPLUSRAND; + + return s; +} + +void ov_free_spinors(spinor** s) { + +#if (defined SSE3 || defined SSE2 || defined SSE) + free(*(((spinor**)s[0])-1)); +#else + free(s[0]); +#endif + free(s); + +} + +spinor* ov_alloc_spinor(void) { + + spinor *s, *_s; + +#if ( defined SSE || defined SSE2 || defined SSE3) + _s = malloc(sizeof(spinor*)+VOLUMEPLUSRAND*sizeof(spinor)+ALIGN_BASE); + ov_check_alloc(_s); + s = (spinor *)(((unsigned long int)(_s) + sizeof(spinor*) + ALIGN_BASE)&~ALIGN_BASE); + *(((spinor**)s)-1) = _s; +#else + s = malloc(VOLUMEPLUSRAND*sizeof(spinor)); + ov_check_alloc(s); +#endif + + return s; +} + +void ov_free_spinor(spinor *s) { + +#if ( defined SSE || defined SSE2 || defined SSE3) + free(*(((spinor**)s)-1)); +#else + free(s); +#endif + +} + + +/* col sum norm of operator in colour and spinor space */ +double ov_operator_colsumnorm(spinor *s[4][3], int k) +{ + double norm = 0.0, nrm; + + for (int i=0; i<4; ++i) + for (int j=0; j<3; ++j) + { + _spinor_norm_l1(nrm, s[i][j][k]); + if (nrm > norm) + norm = nrm; + } + return norm; +} + +void ov_check_locality() { + + double norm, *maxnorm, *minnorm, *avgnorm; + int i, j, k, x, x_taxi, y, y_taxi, z, z_taxi, t, t_taxi, maxtaxi, *samples, taxi; + spinor *s[4][3]; + + /* evaluate Dov(psi) */ + for (i=0; i<4; i++) + for (j=0; j<3; j++) { + + /* get memory for the spinor */ + s[i][j] = ov_alloc_spinor(); + + /* create delta source at origin */ + source_spinor_field(g_spinor_field[1], g_spinor_field[0], i, j); + convert_eo_to_lexic(g_spinor_field[2], g_spinor_field[1], g_spinor_field[0]); + + /* apply Dov */ + Dov_psi(s[i][j], g_spinor_field[2]); + } + + /* init locality table */ + maxtaxi = (LX/2)+(LY/2)+(LZ/2)+T/2; + maxnorm = (double*)calloc(maxtaxi+1, sizeof(double)); + minnorm = (double*)calloc(maxtaxi+1, sizeof(double)); + avgnorm = (double*)calloc(maxtaxi+1, sizeof(double)); + samples = (int*)calloc(maxtaxi+1, sizeof(int)); + for(i = 0; i <= maxtaxi; i++) { + maxnorm[i] = 0.; + 
minnorm[i] = 1.0e100; + avgnorm[i] = 0.; + samples[i] = 0; + } + + /* fill locality table */ + printf("// beginning locality test\n"); + for(x=0; x LX/2) ? LX-x : x; + for(y = 0; y < LY; y++){ + y_taxi = (y > LY/2) ? LY-y : y; + for(z = 0; z < LZ; z++){ + z_taxi = (z > LZ/2) ? LZ-z : z; + for(t = 0; t < T; t++){ + t_taxi = (t > T/2) ? T - t : t; + taxi = x_taxi + y_taxi + z_taxi + t_taxi; + k = g_ipt[t][x][y][z]; + + norm = ov_operator_colsumnorm(s, k); + + // statistics + if (norm > maxnorm[taxi]) + maxnorm[taxi] = norm; + if (norm < minnorm[taxi]) + minnorm[taxi] = norm; + avgnorm[taxi] += norm; + samples[taxi]++; + } + } + } + } + + /* print locality table */ + printf("// locality check of overlap operator\n"); + printf("// taxi | max norm | avg norm | min norm | #samples\n"); + for(i = 0; i <= maxtaxi; i++) + printf("%7d %10.6le %10.6le %10.6le %8d\n", i, maxnorm[i], (double)(avgnorm[i]/samples[i]), minnorm[i], samples[i]); + printf("\n"); + + /* free memory */ + free(maxnorm); + free(minnorm); + free(avgnorm); + free(samples); + for (i=0; i<4; i++) + for (j=0; j<3; j++) + ov_free_spinor(s[i][j]); + +} + +void ov_matrix4x4_diff(matrix4x4 result, matrix4x4 left, matrix4x4 right) +{ + for (int i = 0; i < 4; ++i) + for (int j = 0; j < 4; ++j) + result[i][j] = left[i][j] - right[i][j]; +} + + +double ov_matrix4x4_rowsumnorm(matrix4x4 A) { + + double norm, nrm; + int i, j; + + norm = 0.0; + for (i=0; i<4; i++) { + + nrm = 0.0; + for (j=0; j<4; j++) + nrm += cabs(A[i][j]); + + if (nrm > norm) + norm = nrm; + } + + return norm; +} + +void ov_matrix12x12_diff(matrix12x12 result, matrix12x12 left, matrix12x12 right) +{ + for (int i = 0; i < 12; ++i) + for (int j = 0; j < 12; ++j) + result[i][j] = left[i][j] - right[i][j]; +} + +double ov_matrix12x12_rowsumnorm(matrix12x12 A) { + + double norm, nrm; + int i, j; + + norm = 0.0; + for (i=0; i<12; i++) { + + nrm = 0.0; + for (j=0; j<12; j++) + nrm += cabs(A[i][j]); + + if (nrm > norm) + norm = nrm; + } + + return norm; +} 
+/* compares the operator with the one given in pFileName */ +void ov_compare_4x4(const char * pFileName) { + + double norm, rel, *max_rel, *max_abs, Max_rel = 0.0, Max_abs = 0.0; + int i, j, k, x, x_taxi, y, y_taxi, z, z_taxi, t, t_taxi, maxtaxi, taxi; + spinor *s[4]; + matrix4x4 mat, mat2, diff; + FILE * pCompare; + + /* evaluate Dov(psi) */ + for (i=0; i<4; i++) { + + /* get memory for the spinor */ + s[i] = ov_alloc_spinor(); + + /* create delta source at origin */ + source_spinor_field(g_spinor_field[1], g_spinor_field[0], i, 0); + convert_eo_to_lexic(g_spinor_field[2], g_spinor_field[1], g_spinor_field[0]); + + /* apply Dov */ + Dov_psi(s[i], g_spinor_field[2]); + } + + /* init locality table */ + maxtaxi = (LX/2)+(LY/2)+(LZ/2)+T/2; + max_abs = (double*)calloc(maxtaxi+1, sizeof(double)); + max_rel = (double*)calloc(maxtaxi+1, sizeof(double)); + for(i = 0; i <= maxtaxi; i++) { + max_abs[i] = 0.0; + max_rel[i] = 0.0; + } + + /* open file containing operator for comparison */ + pCompare = fopen(pFileName, "r"); + if (pCompare == NULL) { + fprintf(stderr, "Error: could not open '%s' for comparison of operator\n", pFileName); + exit(1); + } + + /* fill locality table */ + if (g_debug_level > 0) { + printf("// beginning comparison\n"); + fflush(stdout); + } + for(t = 0; t < T; t++){ + t_taxi = (t > T/2) ? T - t : t; + for(x = 0; x < LX; x++){ + x_taxi = (x > LX/2) ? LX-x : x; + for(y = 0; y < LY; y++){ + y_taxi = (y > LY/2) ? LY-y : y; + for(z=0; z LZ/2) ? 
LZ-z : z; + taxi = x_taxi + y_taxi + z_taxi + t_taxi; + k = g_ipt[t][x][y][z]; + + for (i=0; i<4; i++) { + mat[0][i] = s[i][k].s0.c0; + mat[1][i] = s[i][k].s1.c0; + mat[2][i] = s[i][k].s2.c0; + mat[3][i] = s[i][k].s3.c0; + } + + for (i=0;i<4; i++) + for (j=0; j<4; j++) + fscanf(pCompare, "%le %le", (double*)&mat2[i][j], (double*)&mat2[i][j] + 1); + + ov_matrix4x4_diff(diff, mat, mat2); + + /* statistics */ + norm = ov_matrix4x4_rowsumnorm(diff); + if (norm > max_abs[taxi]) { + max_abs[taxi] = norm; + if (norm > Max_abs) + Max_abs = norm; + } + rel = (ov_matrix4x4_rowsumnorm(mat) + ov_matrix4x4_rowsumnorm(mat2))/2; + if (rel>0.0) { + rel = norm/rel; + if (rel > max_rel[taxi]) { + max_rel[taxi] = rel; + if (rel > Max_rel) + Max_rel = rel; + } + } + } + } + } + } + + /* print locality table */ + printf("// comparison of overlap operator to %s\n", pFileName); + printf(" - maximum absolute deviation: %.4le\n", Max_abs); + printf(" - maximum relative deviation: %.4le\n", Max_rel); + printf("// taxi | max abs | max rel\n"); + for(i = 0; i <= maxtaxi; i++) + printf("%7d %10.6le %10.6le\n", i, max_abs[i], max_rel[i]); + printf("\n"); + + /* close file */ + fclose(pCompare); + + /* free memory */ + free(max_abs); + free(max_rel); + for (i=0; i<4; i++) + ov_free_spinor(s[i]); + +} + +/* compares the operator with the one given in pFileName */ +void ov_compare_12x12(const char * pFileName) { + + double norm, rel, *max_rel, *max_abs, Max_rel = 0.0, Max_abs = 0.0; + int i, j, k, x, x_taxi, y, y_taxi, z, z_taxi, t, t_taxi, maxtaxi, taxi; + spinor *s[4][3]; + matrix12x12 mat, mat2, diff; + FILE * pCompare; + + /* evaluate Dov(psi) */ + for (i=0; i<4; i++) + for (j=0; j<3; j++) { + + /* get memory for the spinor */ + s[i][j] = ov_alloc_spinor(); + + /* create delta source at origin */ + source_spinor_field(g_spinor_field[1], g_spinor_field[0], i, j); + convert_eo_to_lexic(g_spinor_field[2], g_spinor_field[1], g_spinor_field[0]); + + /* apply Dov */ + Dov_psi(s[i][j], 
g_spinor_field[2]); + } + + /* init locality table */ + maxtaxi = (LX/2)+(LY/2)+(LZ/2)+T/2; + max_abs = (double*)calloc(maxtaxi+1, sizeof(double)); + max_rel = (double*)calloc(maxtaxi+1, sizeof(double)); + for(i = 0; i <= maxtaxi; i++) { + max_abs[i] = 0.0; + max_rel[i] = 0.0; + } + + /* open file containing operator for comparison */ + pCompare = fopen(pFileName, "r"); + if (pCompare == NULL) { + fprintf(stderr, "Error: could not open '%s' for comparison of operator\n", pFileName); + exit(1); + } + + /* fill locality table */ + if (g_debug_level > 0) { + printf("// beginning comparison\n"); + fflush(stdout); + } + for(t = 0; t < T; t++){ + t_taxi = (t > T/2) ? T - t : t; + for(x = 0; x < LX; x++){ + x_taxi = (x > LX/2) ? LX-x : x; + for(y = 0; y < LY; y++){ + y_taxi = (y > LY/2) ? LY-y : y; + for(z=0; z LZ/2) ? LZ-z : z; + taxi = x_taxi + y_taxi + z_taxi + t_taxi; + k = g_ipt[t][x][y][z]; + + for (j=0; j<3; j++) + for (i=0; i<4; i++) { + mat[0][i+4*j] = s[i][j][k].s0.c0; + mat[1][i+4*j] = s[i][j][k].s1.c0; + mat[2][i+4*j] = s[i][j][k].s2.c0; + mat[3][i+4*j] = s[i][j][k].s3.c0; + mat[4][i+4*j] = s[i][j][k].s0.c1; + mat[5][i+4*j] = s[i][j][k].s0.c1; + mat[6][i+4*j] = s[i][j][k].s1.c1; + mat[7][i+4*j] = s[i][j][k].s2.c1; + mat[8][i+4*j] = s[i][j][k].s3.c2; + mat[9][i+4*j] = s[i][j][k].s1.c2; + mat[10][i+4*j] = s[i][j][k].s2.c2; + mat[11][i+4*j] = s[i][j][k].s3.c2; + } + + for (i=0;i<12; i++) + for (j=0; j<12; j++) + fscanf(pCompare, "%le %le", (double*)&mat2[i][j], (double*)&mat2[i][j] + 1); + + ov_matrix12x12_diff(diff, mat, mat2); + + /* statistics */ + norm = ov_matrix12x12_rowsumnorm(diff); + if (norm > max_abs[taxi]) { + max_abs[taxi] = norm; + if (norm > Max_abs) + Max_abs = norm; + } + rel = (ov_matrix12x12_rowsumnorm(mat) + ov_matrix12x12_rowsumnorm(mat2))/2; + if (rel>0.0) { + rel = norm/rel; + if (rel > max_rel[taxi]) { + max_rel[taxi] = rel; + if (rel > Max_rel) + Max_rel = rel; + } + } + } + } + } + } + + /* print locality table */ + printf("// comparison 
of overlap operator to %s\n", pFileName); + printf(" - maximum absolute deviation: %.4le\n", Max_abs); + printf(" - maximum relative deviation: %.4le\n", Max_rel); + printf("// taxi | max abs | max rel\n"); + for(i = 0; i <= maxtaxi; i++) + printf("%7d %10.6le %10.6le\n", i, max_abs[i], max_rel[i]); + printf("\n"); + + /* close file */ + fclose(pCompare); + + /* free memory */ + free(max_abs); + free(max_rel); + for (i=0; i<4; i++) + for (j=0; j<3; j++) + ov_free_spinor(s[i][j]); + +} + +/* saves the operator to the given filename */ +void ov_save_12x12(const char * pFileName) { + + int i, j, k, x, y, z, t; + spinor *s[4][3]; + matrix12x12 mat; + FILE * pCompare; + + /* evaluate Dov(psi) */ + for (i=0; i<4; i++) + for (j=0; j<3; j++) { + + /* get memory for the spinor */ + s[i][j] = ov_alloc_spinor(); + + /* create delta source at origin */ + source_spinor_field(g_spinor_field[1], g_spinor_field[0], i, j); + convert_eo_to_lexic(g_spinor_field[2], g_spinor_field[1], g_spinor_field[0]); + + /* apply Dov */ + Dov_psi(s[i][j], g_spinor_field[2]); + } + + /* open file for storing the operator */ + pCompare = fopen(pFileName, "w"); + if (pCompare == NULL) { + fprintf(stderr, "Error: could not open '%s' for writing the operator\n", pFileName); + exit(1); + } + + for(t = 0; t < T; t++){ + for(x = 0; x < LX; x++){ + for(y = 0; y < LY; y++){ + for(z=0; zs0.c0), (double)cimag(pS->s0.c0), (double)creal(pS->s0.c1), (double)cimag(pS->s0.c1), (double)creal(pS->s0.c2), (double)cimag(pS->s0.c2)); + printf("%16.19le %+16.19le I | %16.9le %+16.9le I | %16.9le %+16.9le I\n", (double)creal(pS->s1.c0), (double)cimag(pS->s1.c0), (double)creal(pS->s1.c1), (double)cimag(pS->s1.c1), (double)creal(pS->s1.c2), (double)cimag(pS->s1.c2)); + printf("%16.19le %+16.19le I | %16.9le %+16.9le I | %16.9le %+16.9le I\n", (double)creal(pS->s2.c0), (double)cimag(pS->s2.c0), (double)creal(pS->s2.c1), (double)cimag(pS->s2.c1), (double)creal(pS->s2.c2), (double)cimag(pS->s2.c2)); + printf("%16.19le 
%+16.19le I | %16.9le %+16.9le I | %16.9le %+16.9le I\n", (double)creal(pS->s3.c0), (double)cimag(pS->s3.c0), (double)creal(pS->s3.c1), (double)cimag(pS->s3.c1), (double)creal(pS->s3.c2), (double)cimag(pS->s3.c2)); + +} + + +void ov_check_operator(int t, int x, int y, int z) { + + /* Create delta source at origin */ + source_spinor_field(g_spinor_field[1], g_spinor_field[0], 0, 0); + convert_eo_to_lexic(g_spinor_field[2], g_spinor_field[1], g_spinor_field[0]); + + /* Evaluate Dov(psi) */ + Dov_psi(g_spinor_field[3], g_spinor_field[2]); + ov_print_spinor(&g_spinor_field[3][g_ipt[t][x][y][z]]); + +} + +/* Check GW relation with operator norm over the full lattice */ +void ov_check_ginsparg_wilson_relation_strong(void) { + + double norm_diff, norm_left, norm_right, norm, max_rel = 0.0, min_left = 1.0e100, min_right = 1.0e100, max_diff = 0.0, min_norm = 1.0e100; + int x, y, z, t, i, j, k; + spinor *S_left[4][3], *S_right[4][3], *S_diff[4][3]; + + if (g_debug_level>0) { + printf("// creating spinor fields and calculating {gamma_5,D} psi and a D gamma_5 D psi\n"); + fflush(stdout); + } + for (i=0; i<4; i++) + for (j=0; j<3; j++) { + + if (g_debug_level>1) { + printf("// spinor field: delta_dirac at %d, delta_color at %d\n", i, j); + fflush(stdout); + } + + /* get memory for the spinor */ + S_left[i][j] = ov_alloc_spinor(); + S_right[i][j] = ov_alloc_spinor(); + S_diff[i][j] = ov_alloc_spinor(); + + /* Create delta source at origin */ + source_spinor_field(g_spinor_field[1], g_spinor_field[0], i, j); + convert_eo_to_lexic(g_spinor_field[2], g_spinor_field[1], g_spinor_field[0]); + + /* S_right = D gamma_5 D psi */ + Dov_psi(g_spinor_field[3], g_spinor_field[2]); + gamma5(S_left[i][j], g_spinor_field[3], VOLUME); + Dov_psi(S_right[i][j], S_left[i][j]); + + /* S_left = {gamma_5, D} psi */ + gamma5(g_spinor_field[3], g_spinor_field[2], VOLUME); + Dov_psi(g_spinor_field[4], g_spinor_field[3]); + add(S_left[i][j], S_left[i][j], g_spinor_field[4], VOLUME); + + /* S_diff = 
(S_left-S_right) psi, should be zero (GW relation) */ + diff(S_diff[i][j], S_left[i][j], S_right[i][j], VOLUME); + } + + /* scan the whole lattice and check GW relation */ + printf("// test of the Ginsparg-Wilson relation:\n"); + if (g_debug_level>0) + fflush(stdout); + for(x=0; x 0.0) { + norm = norm_diff/norm; + if (norm > max_rel) + max_rel = norm; + if ((norm > 1.8) && (g_debug_level)>=5) { + printf("(%d,%d,%d,%d): taxi = %d, rel = %.20le, lr = [%.4le, %.4le], diff = %.4le\n", t, x, y, z, ((x>LX/2) ? LX-x : x)+((y>LY/2) ? LY-y : y)+((z>LZ/2) ? LZ-z : z)+((t>T/2) ? T-t : t), norm, norm_left, norm_right, norm_diff); + printf("// left[0][0]:\n"); + ov_print_spinor(&S_left[0][0][k]); + printf("// right[0][0]:\n"); + ov_print_spinor(&S_right[0][0][k]); + printf("// diff[0][0]:\n"); + ov_print_spinor(&S_diff[0][0][k]); + } + } + if (norm_left < min_left) + min_left = norm_left; + if (norm_right < min_right) + min_right = norm_right; + if (norm_diff > max_diff) + max_diff = norm_diff; + } + + /* print results */ + printf(" - maximum absolute deviation: %.4le\n", max_diff); + printf(" - maximum relative deviation: %.4le\n", max_rel); + printf(" - minimum mean norm: %.4le\n", min_norm); + printf(" - minimum norm {gamma_5, D}: %.4le\n", min_left); + printf(" - minimum norm D gamma_5 D: %.4le\n", min_right); + + /* free memory */ + for (i=0; i<4; i++) + for (j=0; j<3; j++) { + ov_free_spinor(S_left[i][j]); + ov_free_spinor(S_right[i][j]); + ov_free_spinor(S_diff[i][j]); + } +} + +/* Checks GW relation only by applying Dov to delta(0,0) */ +void ov_check_ginsparg_wilson_relation(void) { + + double norm_diff, norm_left, norm_right, norm, max_rel = 0.0, min_left = 1.0e100, min_right = 1.0e100, max_diff = 0.0, min_norm = 1.0e100; + int x, y, z, t, i; + spinor *S_left, *S_right, *S_diff; + + /* get memory for the spinor fields */ + S_left = ov_alloc_spinor(); + S_right = ov_alloc_spinor(); + S_diff = ov_alloc_spinor(); + + /* Create delta source at origin */ + 
source_spinor_field(g_spinor_field[1], g_spinor_field[0], 0, 0); + convert_eo_to_lexic(g_spinor_field[2], g_spinor_field[1], g_spinor_field[0]); + + /* S_right = D gamma_5 D */ + Dov_psi(g_spinor_field[3], g_spinor_field[2]); + gamma5(S_left, g_spinor_field[3], VOLUME); + Dov_psi(S_right, S_left); + + /* S_left = {gamma_5, D} */ + gamma5(g_spinor_field[3], g_spinor_field[2], VOLUME); + Dov_psi(g_spinor_field[4], g_spinor_field[3]); + add(S_left, S_left, g_spinor_field[4], VOLUME); + + /* S_diff = S_left-S_right */ + diff(S_diff, S_left, S_right, VOLUME); + + /* scan the whole lattice */ + printf("// test of the Ginsparg-Wilson relation\n"); + for(x=0; x 0.0) { + norm = 2.*norm_diff/norm; + if (norm > max_rel) + max_rel = norm; + } + if (norm_left < min_left) + min_left = norm_left; + if (norm_right < min_right) + min_right = norm_right; + if (norm_diff > max_diff) + max_diff = norm_diff; + } + + /* print results; every value uses %.4le, the same format as ov_check_ginsparg_wilson_relation_strong (the former "%4.le" meant width 4, precision 0) */ + printf(" - maximum absolute deviation: %.4le\n", max_diff); + printf(" - maximum relative deviation: %.4le\n", max_rel); + printf(" - minimum mean norm: %.4le\n", min_norm); + printf(" - minimum norm {gamma_5, D}: %.4le\n", min_left); + printf(" - minimum norm D gamma_5 D: %.4le\n", min_right); + + /* free memory */ + ov_free_spinor(S_left); + ov_free_spinor(S_right); + ov_free_spinor(S_diff); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/overlaptests.h b/qcd/part_cpu/applications/QCD/src/kernel_D/test/overlaptests.h new file mode 100644 index 0000000000000000000000000000000000000000..0fc7478374620f1a73b1c589c3656e38f3a98b17 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/overlaptests.h @@ -0,0 +1,25 @@ +#ifndef OVERLAPTESTS_INCLUDE_GUARD +#define OVERLAPTESTS_INCLUDE_GUARD + +void ov_check_operator(int t, int x, int y, int z); +void ov_check_locality(); +void ov_check_ginsparg_wilson_relation(void); +void ov_check_ginsparg_wilson_relation_strong(void); +void ov_compare_4x4(const char * pFileName); +void 
ov_compare_12x12(const char * pFileName); +void ov_save_12x12(const char * pFileName); + +typedef _Complex double matrix4x4[4][4]; +typedef _Complex double matrix12x12[12][12]; + +#define _spinor_norm_l1(d,s)\ + /* d = sum of cabs() over the 12 components of spinor s; one plain assignment, so the macro stays a single statement under an unbraced if/else */ \ + (d) = cabs((s).s0.c0) + cabs((s).s0.c1) + \ + cabs((s).s0.c2) + cabs((s).s1.c0) + \ + cabs((s).s1.c1) + cabs((s).s1.c2) + \ + cabs((s).s2.c0) + cabs((s).s2.c1) + \ + cabs((s).s2.c2) + cabs((s).s3.c0) + \ + cabs((s).s3.c1) + cabs((s).s3.c2) + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/qdran64.h b/qcd/part_cpu/applications/QCD/src/kernel_D/test/qdran64.h new file mode 100644 index 0000000000000000000000000000000000000000..09ea4dfafb7bf0cd35feb40d5c788809e7c9e297 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/qdran64.h @@ -0,0 +1,91 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#include <stdint.h> + +uint64_t qdran64low, qdran64high; +/* Seed the two generator states used by qdran64(). */ +void qdran64_init(uint64_t seedlow, uint64_t seedhigh) +{ + qdran64low = seedlow; + qdran64high = seedhigh; +} + +/* Advance both states by x -> x*ia + ic (unsigned 64-bit arithmetic, so it wraps modulo 2^64) and hand back the new values. */ +void qdran64(uint64_t *ranlow, uint64_t *ranhigh) +{ + const uint64_t ia = 0x27bb2ee687b0b0fdULL; + const uint64_t ic = 0x00000000b504f32dULL; + + qdran64low *= ia; + qdran64low += ic; + qdran64high *= ia; + qdran64high += ic; + + (*ranlow) = qdran64low; + (*ranhigh) = qdran64high; +} + +/* Complex variant: real and imaginary part each keep the low 52 generator bits and get the top bits forced to 0x3ff (a value in [1,2), assuming IEEE-754 binary64 doubles). */ +#ifdef _STD_C99_COMPLEX +#include <complex.h> +void qdran64z(double complex *ranz) +{ + union { double x; uint64_t i; } re, im; + uint64_t * ranlow = &re.i; + uint64_t * ranhigh = &im.i; + + re.x = creal(*ranz); + im.x = cimag(*ranz); + + qdran64(ranlow, ranhigh); + + (*ranlow) &= 0x000fffffffffffffULL; + (*ranlow) |= 0x3ff0000000000000ULL; + (*ranhigh) &= 0x000fffffffffffffULL; + (*ranhigh) |= 0x3ff0000000000000ULL; + + *ranz = re.x + I * im.x; +} +#endif +/* Same bit construction as above, returning the two doubles through rand1 and rand2. */ +void qdran64_2d(double *rand1, double *rand2) +{ + union { double x; uint64_t i; } re, im; + uint64_t * ranlow = &re.i; + uint64_t * ranhigh = &im.i; + + re.x = (*rand1); + im.x = (*rand2); + + qdran64(ranlow, ranhigh); + + (*ranlow) &= 0x000fffffffffffffULL; + (*ranlow) |= 0x3ff0000000000000ULL; + (*ranhigh) &= 0x000fffffffffffffULL; + (*ranhigh) |= 0x3ff0000000000000ULL; + + *rand1 = re.x; + *rand2 = im.x; +} + +/* void qdran32d(double *rand) { */ +/* complex tmp; */ +/* qdran64z(&tmp); */ +/* *rand=tmp; */ +/* } */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test/scalar_prod_r_test.c b/qcd/part_cpu/applications/QCD/src/kernel_D/test/scalar_prod_r_test.c new file mode 100644 index 0000000000000000000000000000000000000000..fed7d0e39574f06a3b818a5a4ef42d1186dadca0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test/scalar_prod_r_test.c @@ -0,0 +1,90 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 
Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/* #ifndef apenext */ +/* #include */ +/* #endif */ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifndef _STD_C99_COMPLEX +#include "complex.h" +#endif + +#include "qdran64.h" + +#include "su3.h" +#include "linalg/scalar_prod_r.c" + + +#define N 16 + +int main(void) { + double s; + int i; + spinor a[N], b[N]; + + qdran64_init(42,13); + + for(i=0; i. 
+ * + * + * Main for testing the Eigenvalues computation using bispinors + * + * Author: Thomas Chiarappa + * Thomas.Chiarappa@mib.infn.it + * + *******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include +#include +#ifdef MPI +#include +#endif +#include "global.h" +#include "getopt.h" +#include "ranlxd.h" +#include "geometry_eo.h" +#include "start.h" +/* +#include "clover_eo.h" +*/ +#include "observables.h" +#include "measure_rectangles.h" +#ifdef MPI +#include "xchange.h" +#endif +#include "io.h" +#include "read_input.h" +#include "mpi_init.h" +#include "sighandler.h" +#include "monomial/moment_energy.h" +#include "update_tm.h" +#include "init/init.h" +#include "test/check_geometry.h" +#include "boundary.h" +#include "polyakov_loop.h" + +#include "solver/eigenvalues_bi.h" + +char * Version = "2.3.5"; + + +void usage(){ + fprintf(stderr, "hmc for Wilson twisted mass QCD\n\n"); + fprintf(stderr, "Usage: [-f input-filename]\n"); + fprintf(stderr, "Usage: [-o output-filename]\n"); + exit(1); +} + +extern int nstore; + +int main(int argc,char *argv[]) { + + FILE *parameterfile=NULL,*rlxdfile=NULL, *countfile=NULL; + char * filename = NULL; + char datafilename[50]; + char parameterfilename[50]; + char gauge_filename[50]; + char * nstore_filename = ".nstore_counter"; + char * input_filename = NULL; + int rlxd_state[105]; + int j,ix,mu; + int k; + struct timeval t1; + + int g_nev, max_iter_ev; + double stop_prec_ev; + + + /* Energy corresponding to the Gauge part */ + double eneg = 0., plaquette_energy = 0., rectangle_energy = 0.; + /* Acceptance rate */ + int Rate=0; + /* Do we want to perform reversibility checks */ + /* See also return_check_flag in read_input.h */ + int return_check = 0; + /* For getopt */ + int c; + + /* For the Polyakov loop: */ + int dir = 2; + _Complex double pl, pl4; + + verbose = 0; + g_use_clover_flag = 0; + g_nr_of_psf = 
1; + +#ifndef XLC + signal(SIGUSR1,&catch_del_sig); + signal(SIGUSR2,&catch_del_sig); + signal(SIGTERM,&catch_del_sig); + signal(SIGXCPU,&catch_del_sig); +#endif + + while ((c = getopt(argc, argv, "h?f:o:")) != -1) { + switch (c) { + case 'f': + input_filename = calloc(200, sizeof(char)); + strcpy(input_filename,optarg); + break; + case 'o': + filename = calloc(200, sizeof(char)); + strcpy(filename,optarg); + break; + case 'h': + case '?': + default: + usage(); + break; + } + } + if(input_filename == NULL){ + input_filename = "hmc.input"; + } + if(filename == NULL){ + filename = "output"; + } + + /* Read the input file */ + read_input(input_filename); + + mpi_init(argc, argv); + + if(Nsave == 0){ + Nsave = 1; + } + if(nstore == -1) { + countfile = fopen(nstore_filename, "r"); + if(countfile != NULL) { + fscanf(countfile, "%d\n", &nstore); + fclose(countfile); + } + else { + nstore = 0; + } + } + + if(g_rgi_C1 == 0.) { + g_dbw2rand = 0; + } +#ifndef MPI + g_dbw2rand = 0; +#endif + + /* Reorder the mu parameter and the number of iterations */ + if(g_mu3 > 0.) { + g_mu = g_mu1; + g_mu1 = g_mu3; + g_mu3 = g_mu; + + j = int_n[1]; + int_n[1] = int_n[3]; + int_n[3] = j; + + j = g_csg_N[0]; + g_csg_N[0] = g_csg_N[4]; + g_csg_N[4] = j; + g_csg_N[6] = j; + if(fabs(g_mu3) > 0) { + g_csg_N[6] = 0; + } + + g_nr_of_psf = 3; + } + else if(g_mu2 > 0.) 
{ + g_mu = g_mu1; + g_mu1 = g_mu2; + g_mu2 = g_mu; + + int_n[3] = int_n[1]; + int_n[1] = int_n[2]; + int_n[2] = int_n[3]; + + /* For chronological inverter */ + g_csg_N[4] = g_csg_N[0]; + g_csg_N[0] = g_csg_N[2]; + g_csg_N[2] = g_csg_N[4]; + if(fabs(g_mu2) > 0) { + g_csg_N[4] = 0; + } + g_csg_N[6] = 0; + + g_nr_of_psf = 2; + } + else { + g_csg_N[2] = g_csg_N[0]; + if(fabs(g_mu2) > 0) { + g_csg_N[2] = 0; + } + g_csg_N[4] = 0; + g_csg_N[6] = 0; + } + + for(j = 0; j < g_nr_of_psf+1; j++) { + if(int_n[j] == 0) int_n[j] = 1; + } + if(g_nr_of_psf == 3) { + g_eps_sq_force = g_eps_sq_force1; + g_eps_sq_force1 = g_eps_sq_force3; + g_eps_sq_force3 = g_eps_sq_force; + g_eps_sq_acc = g_eps_sq_acc1; + g_eps_sq_acc1 = g_eps_sq_acc3; + g_eps_sq_acc3 = g_eps_sq_acc; + } + if(g_nr_of_psf == 2) { + g_eps_sq_force = g_eps_sq_force1; + g_eps_sq_force1 = g_eps_sq_force2; + g_eps_sq_force2 = g_eps_sq_force; + g_eps_sq_acc = g_eps_sq_acc1; + g_eps_sq_acc1 = g_eps_sq_acc2; + g_eps_sq_acc2 = g_eps_sq_acc; + } + g_mu = g_mu1; + g_eps_sq_acc = g_eps_sq_acc1; + g_eps_sq_force = g_eps_sq_force1; + + +#ifdef _GAUGE_COPY + j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1); +#else + j = init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0); +#endif + if ( j!= 0) { + fprintf(stderr, "Not enough memory for gauge_fields! Aborting...\n"); + exit(0); + } + j = init_geometry_indices(VOLUMEPLUSRAND + g_dbw2rand); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for geometry_indices! Aborting...\n"); + exit(0); + } + j = init_spinor_field(VOLUMEPLUSRAND/2, NO_OF_SPINORFIELDS); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for spinor fields! Aborting...\n"); + exit(0); + } + + j = init_bispinor_field(VOLUME/2, NO_OF_SPINORFIELDS); + + + j = init_csg_field(VOLUMEPLUSRAND/2, g_csg_N); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for csg fields! 
Aborting...\n"); + exit(0); + } + j = init_moment_field(VOLUME, VOLUMEPLUSRAND); + if ( j!= 0) { + fprintf(stderr, "Not enough memory for moment fields! Aborting...\n"); + exit(0); + } + + zero_spinor_field(g_spinor_field[DUM_DERI+4],VOLUME/2); + zero_spinor_field(g_spinor_field[DUM_DERI+5],VOLUME/2); + zero_spinor_field(g_spinor_field[DUM_DERI+6],VOLUME/2); + + + if(g_proc_id == 0){ + +/* fscanf(fp6,"%s",filename); */ + /* construct the filenames for the observables and the parameters; bounded writes, because the -o argument may be longer than the 50-byte name buffers */ + snprintf(datafilename, sizeof(datafilename), "%s.data", filename); + snprintf(parameterfilename, sizeof(parameterfilename), "%s.para", filename); + + parameterfile=fopen(parameterfilename, "w"); + printf("# This is the hmc code for twisted Mass Wilson QCD\n\nVersion %s\n", Version); +#ifdef SSE + printf("# The code was compiled with SSE instructions\n"); +#endif +#ifdef SSE2 + printf("# The code was compiled with SSE2 instructions\n"); +#endif +#ifdef SSE3 + printf("# The code was compiled with SSE3 instructions\n"); +#endif +#ifdef P4 + printf("# The code was compiled for Pentium4\n"); +#endif +#ifdef OPTERON + printf("# The code was compiled for AMD Opteron\n"); +#endif +#ifdef _NEW_GEOMETRY + printf("# The code was compiled with -D_NEW_GEOMETRY\n"); +#endif +#ifdef _GAUGE_COPY + printf("# The code was compiled with -D_GAUGE_COPY\n"); +#endif + printf("# The lattice size is %d x %d x %d x %d\n", + (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY), (int)(LZ)); + printf("# The local lattice size is %d x %d x %d x %d\n", + (int)(T), (int)(LX), (int)(LY),(int) LZ); + printf("# beta = %f , kappa= %f\n", g_beta, g_kappa); + printf("# mus = %f, %f, %f\n", g_mu1, g_mu2, g_mu3); + printf("# int_n_gauge = %d, int_n_ferm1 = %d, int_n_ferm2 = %d, int_n_ferm3 = %d\n", + int_n[0], int_n[1], int_n[2], int_n[3]); + printf("# g_rgi_C0 = %f, g_rgi_C1 = %f\n", g_rgi_C0, g_rgi_C1); + printf("# Number of pseudo-fermion fields: %d\n", g_nr_of_psf); + printf("# g_eps_sq_force = %e, g_eps_sq_acc = %e\n", 
g_eps_sq_force, g_eps_sq_acc); + printf("# Integration scheme: "); + if(integtyp == 1) printf("leap-frog (single time scale)\n"); + if(integtyp == 2) printf("Sexton-Weingarten (single time scale)\n"); + if(integtyp == 3) printf("leap-frog (multiple time scales)\n"); + if(integtyp == 4) printf("Sexton-Weingarten (multiple time scales)\n"); + if(integtyp == 5) printf("higher order and leap-frog (multiple time scales)\n"); + printf("# Using %s precision for the inversions!\n", + g_relative_precision_flag ? "relative" : "absolute"); + printf("# Using in chronological inverter for spinor_field 1,2,3 a history of %d, %d, %d, respectively\n", + g_csg_N[0], g_csg_N[2], g_csg_N[4]); + + + fprintf(parameterfile, "The lattice size is %d x %d x %d x %d\n", (int)(g_nproc_t*T), (int)(g_nproc_x*LX), (int)(LY), (int)(LZ)); + fprintf(parameterfile, "The local lattice size is %d x %d x %d x %d\n", (int)(T), (int)(LX), (int)(LY), (int)(LZ)); + fprintf(parameterfile, "g_beta = %f , g_kappa= %f, g_kappa*csw/8= %f \n",g_beta,g_kappa,g_ka_csw_8); + fprintf(parameterfile, "boundary of fermion fields (t,x,y,z): %f %f %f %f \n",X0,X1,X2,X3); + fprintf(parameterfile, "EPS_SQ0=%e, EPS_SQ1=%e EPS_SQ2=%e, EPS_SQ3=%e \n" + ,EPS_SQ0,EPS_SQ1,EPS_SQ2,EPS_SQ3); + fprintf(parameterfile, "g_eps_sq_force = %e, g_eps_sq_acc = %e\n", g_eps_sq_force, g_eps_sq_acc); + fprintf(parameterfile, "dtau=%f, Nsteps=%d, Nmeas=%d, Nsave=%d, integtyp=%d, nsmall=%d \n", + dtau,Nsteps,Nmeas,Nsave,integtyp,nsmall); + fprintf(parameterfile, "mu = %f, mu2=%f, mu3=%f\n ", g_mu, g_mu2, g_mu3); + fprintf(parameterfile, "int_n_gauge = %d, int_n_ferm1 = %d, int_n_ferm2 = %d, int_n_ferm3 = %d\n ", + int_n[0], int_n[1], int_n[2], int_n[3]); + fprintf(parameterfile, "g_rgi_C0 = %f, g_rgi_C1 = %f\n", g_rgi_C0, g_rgi_C1); + fprintf(parameterfile, "# Number of pseudo-fermion fields: %d\n", g_nr_of_psf); + fprintf(parameterfile, "# Integration scheme: "); + if(integtyp == 1) fprintf(parameterfile, "leap-frog (single time scale)\n"); 
+ if(integtyp == 2) fprintf(parameterfile, "Sexton-Weingarten (single time scale)\n"); + if(integtyp == 3) fprintf(parameterfile, "leap-frog (multiple time scales)\n"); + if(integtyp == 4) fprintf(parameterfile, "Sexton-Weingarten (multiple time scales)\n"); + if(integtyp == 5) fprintf(parameterfile, "higher order and leap-frog (multiple time scales)\n"); + fprintf(parameterfile, "Using %s precision for the inversions!\n", + g_relative_precision_flag ? "relative" : "absolute"); + fprintf(parameterfile, "Using in chronological inverter for spinor_field 1,2,3 a history of %d, %d, %d, respectively\n", + g_csg_N[0], g_csg_N[2], g_csg_N[4]); + fflush(stdout); fflush(parameterfile); + } + + /* define the geometry */ + geometry(); + + /* define the boundary conditions for the fermion fields */ + boundary(); + + check_geometry(); + + if(g_proc_id == 0) { +#if defined GEOMETRIC + if(g_proc_id==0) fprintf(parameterfile,"The geometric series is used as solver \n\n"); +#else + if(g_proc_id==0) fprintf(parameterfile,"The BICG_stab is used as solver \n\n"); +#endif + fflush(parameterfile); + } + + /* Continue */ + if(startoption == 3){ + rlxdfile = fopen(rlxd_input_filename,"r"); + if(rlxdfile != NULL) { + if(g_proc_id == 0) { + fread(rlxd_state,sizeof(rlxd_state),1,rlxdfile); + } + } + else { + if(g_proc_id == 0) { + printf("%s does not exist, switching to restart...\n", rlxd_input_filename); + } + startoption = 2; + } + fclose(rlxdfile); + if(startoption != 2) { + if(g_proc_id == 0) { + rlxd_reset(rlxd_state); + printf("Reading Gauge field from file %s\n", gauge_input_filename); fflush(stdout); + } + + read_gauge_field_time_p(gauge_input_filename,g_gauge_field); + } + } + if(startoption != 3){ + /* Initialize random number generator */ + if(g_proc_id == 0) { + rlxd_init(1, random_seed); + /* hot */ + if(startoption == 1) { + random_gauge_field(); + } + rlxd_get(rlxd_state); +#ifdef MPI + MPI_Send(&rlxd_state[0], 105, MPI_INT, 1, 99, MPI_COMM_WORLD); + MPI_Recv(&rlxd_state[0], 
105, MPI_INT, g_nproc-1, 99, MPI_COMM_WORLD, &status); + rlxd_reset(rlxd_state); +#endif + } +#ifdef MPI + else { + MPI_Recv(&rlxd_state[0], 105, MPI_INT, g_proc_id-1, 99, MPI_COMM_WORLD, &status); + rlxd_reset(rlxd_state); + /* hot */ + if(startoption == 1) { + random_gauge_field(); + } + k=g_proc_id+1; + if(k==g_nproc){ + k=0; + } + rlxd_get(rlxd_state); + MPI_Send(&rlxd_state[0], 105, MPI_INT, k, 99, MPI_COMM_WORLD); + } +#endif + + /* Cold */ + if(startoption == 0) { + unit_g_gauge_field(); + } + /* Restart */ + else if(startoption == 2) { + if (g_proc_id == 0){ + printf("Reading Gauge field from file %s\n", gauge_input_filename); fflush(stdout); + } + read_gauge_field_time_p(gauge_input_filename,g_gauge_field); + } + + } + + /*For parallelization: exchange the gaugefield */ +#ifdef MPI + xchange_gauge(g_gauge_field); +#endif +#ifdef _GAUGE_COPY + update_backward_gauge(); +#endif + + /*compute the energy of the gauge field*/ + plaquette_energy=measure_gauge_action(); + if(g_rgi_C1 > 0. || g_rgi_C1 < 0.) 
{ + rectangle_energy = measure_rectangles(); + if(g_proc_id==0){ + fprintf(parameterfile,"#First rectangle value: %14.12f \n",rectangle_energy/(12.*VOLUME*g_nproc)); + } + } + eneg = g_rgi_C0 * plaquette_energy + g_rgi_C1 * rectangle_energy; + + /* Measure and print the Polyakov loop: */ + polyakov_loop(&pl, dir); + + if(g_proc_id==0){ + fprintf(parameterfile,"#First plaquette value: %14.12f \n", plaquette_energy/(6.*VOLUME*g_nproc)); + fprintf(parameterfile,"#First Polyakov loop value in %d-direction |L(%d)|= %14.12f \n", + dir, dir, cabs(pl)); + } + + dir=3; + polyakov_loop(&pl, dir); + if(g_proc_id==0){ + fprintf(parameterfile,"#First Polyakov loop value in %d-direction |L(%d)|= %14.12f \n", + dir, dir, cabs(pl)); + fclose(parameterfile); + } + + /* set ddummy to zero */ + for(ix = 0; ix < VOLUME+RAND; ix++){ + for(mu=0; mu<4; mu++){ + ddummy[ix][mu].d1=0.; + ddummy[ix][mu].d2=0.; + ddummy[ix][mu].d3=0.; + ddummy[ix][mu].d4=0.; + ddummy[ix][mu].d5=0.; + ddummy[ix][mu].d6=0.; + ddummy[ix][mu].d7=0.; + ddummy[ix][mu].d8=0.; + } + } + + if(g_proc_id == 0) { + gettimeofday(&t1,NULL); + countfile = fopen("history_hmc_tm", "a"); + fprintf(countfile, "!!! Timestamp %ld, Nsave = %d, g_mu = %e, g_mu1 = %e, g_mu_2 = %e, g_mu3 = %e, beta = %f, kappa = %f, C1 = %f, int0 = %d, int1 = %d, int2 = %d, int3 = %d, g_eps_sq_force = %e, g_eps_sq_acc = %e, ", + t1.tv_sec, Nsave, g_mu, g_mu1, g_mu2, g_mu3, g_beta, g_kappa, g_rgi_C1, + int_n[0], int_n[1], int_n[2], int_n[3], g_eps_sq_force, g_eps_sq_acc); + fprintf(countfile, "Nsteps = %d, dtau = %e, tau = %e, integtyp = %d, rel. prec. 
= %d\n", + Nsteps, dtau, tau, integtyp, g_relative_precision_flag); + fclose(countfile); + } + + + + /* HERE THE CALLS FOR SOME EIGENVALUES */ + + /* for lowest + g_nev = 10; + */ + + /* for largest + */ + g_nev = 10; + + max_iter_ev = 1000; + stop_prec_ev = 1.e-10; + + if(g_proc_id==0) { + + printf(" Values of mu = %e mubar = %e eps = %e precision = %e \n \n", g_mu, g_mubar, g_epsbar, stop_prec_ev); + + } + + eigenvalues(&g_nev, operator_flag, max_iter_ev, stop_prec_ev); + + g_nev = 4; + + max_iter_ev = 200; + stop_prec_ev = 1.e-03; + + max_eigenvalues(&g_nev, operator_flag, max_iter_ev, stop_prec_ev); + + if(g_proc_id==0) { + + printf(" Values of mu = %e mubar = %e eps = %e precision = %e \n \n", g_mu, g_mubar, g_epsbar, stop_prec_ev); + + /* + printf(" Values of mu = %e precision = %e \n \n", g_mu, stop_prec_ev); + */ + + } + + /* END OF EIGENVALUES CALLS */ + + + if(g_proc_id==0) { + rlxd_get(rlxd_state); + rlxdfile=fopen("last_state","w"); + fwrite(rlxd_state,sizeof(rlxd_state),1,rlxdfile); + fclose(rlxdfile); + + printf("Acceptance Rate was: %e Prozent\n", 100.*(double)Rate/(double)Nmeas); + fflush(stdout); + parameterfile = fopen(parameterfilename, "a"); + fprintf(parameterfile, "Acceptance Rate was: %e Prozent\n", 100.*(double)Rate/(double)Nmeas); + fclose(parameterfile); + } +#ifdef MPI + MPI_Finalize(); +#endif + free_gauge_tmp(); + free_gauge_field(); + free_geometry_indices(); + free_spinor_field(); + free_bispinor_field(); + free_moment_field(); + return(0); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/test_lemon.c b/qcd/part_cpu/applications/QCD/src/kernel_D/test_lemon.c new file mode 100644 index 0000000000000000000000000000000000000000..8f8e2296ede249f3c92e241e7d2670939c762d92 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/test_lemon.c @@ -0,0 +1,181 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part 
of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +/******************************************************************************* +* +* Benchmark program for the even-odd preconditioned Wilson-Dirac operator +* +* +*******************************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef BENCHMARK +#include <./c-lime/include/lime.h> +#else +#include +#endif +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#if (defined BGL && !defined BGP) +# include +#endif +#ifdef MPI +# include +#endif +#include "su3.h" +#include "su3adj.h" +#include +#include + +#include "ranlxd.h" +#include "geometry_eo.h" +#include "read_input.h" +#include "start.h" +#include "boundary.h" +#include "global.h" +#include "xchange/xchange.h" +#include "init/init.h" +#include "measure_gauge_action.h" +#include "mpi_init.h" + + +int main(int argc,char *argv[]) { + + double plaquette_energy; + paramsXlfInfo *xlfInfo; + + +#ifdef MPI + + MPI_Init(&argc, &argv); +#endif + g_rgi_C1 = 1.; + + /* Read the input file */ + read_input("benchmark.input"); + + tmlqcd_mpi_init(argc, argv); + + +#ifdef _GAUGE_COPY + init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 1); +#else + init_gauge_field(VOLUMEPLUSRAND + g_dbw2rand, 0); +#endif + init_geometry_indices(VOLUMEPLUSRAND + 
g_dbw2rand); + + if(g_proc_id == 0) { + fprintf(stdout,"The number of processes is %d \n",g_nproc); + printf("# The lattice size is %d x %d x %d x %d\n", + (int)(T*g_nproc_t), (int)(LX*g_nproc_x), (int)(LY*g_nproc_y), (int)(g_nproc_z*LZ)); + printf("# The local lattice size is %d x %d x %d x %d\n", + (int)(T), (int)(LX), (int)(LY),(int) LZ); + printf("# Testing IO routines for gauge-fields\n"); + fflush(stdout); + } + + /* define the geometry */ + geometry(); + /* define the boundary conditions for the fermion fields */ + boundary(g_kappa); + + /* generate a random gauge field */ + start_ranlux_KD(1, 123456); + random_gauge_field(reproduce_randomnumber_flag, g_gauge_field); + +#ifdef MPI + /*For parallelization: exchange the gaugefield */ + xchange_gauge(g_gauge_field); +#endif + + plaquette_energy = measure_plaquette(g_gauge_field) / (6.*VOLUME*g_nproc); + + if(g_proc_id == 0) { + printf("# the first plaquette value is %e\n", plaquette_energy); + printf("# writing with lime first to conf.lime\n"); + } + + /* write with lime first */ + xlfInfo = construct_paramsXlfInfo(plaquette_energy, 0); + write_lime_gauge_field( "conf.lime", 64, xlfInfo); + +#ifdef HAVE_LIBLEMON + if(g_proc_id == 0) { + printf("Now we do write with lemon to conf.lemon...\n"); + } + write_lemon_gauge_field_parallel( "conf.lemon", 64, xlfInfo); + + + if(g_proc_id == 0) { + printf("# now we read with lemon from conf.lime\n"); + } + read_lemon_gauge_field_parallel("conf.lime", NULL, NULL, NULL); + plaquette_energy = measure_plaquette(g_gauge_field) / (6.*VOLUME*g_nproc); + if(g_proc_id == 0) { + printf("# the plaquette value after lemon read of conf.lime is %e\n", plaquette_energy); + } + + if(g_proc_id == 0) { + printf("# now we read with lemon from conf.lemon\n"); + } + read_lemon_gauge_field_parallel("conf.lemon", NULL, NULL, NULL); + plaquette_energy = measure_plaquette(g_gauge_field) / (6.*VOLUME*g_nproc); + if(g_proc_id == 0) { + printf("# the plaquette value after lemon read of conf.lemon is 
%e\n", plaquette_energy); + } + + if(g_proc_id == 0) { + printf("# now we read with lime from conf.lemon\n"); + } + read_lime_gauge_field("conf.lemon", NULL, NULL, NULL); + plaquette_energy = measure_plaquette(g_gauge_field) / (6.*VOLUME*g_nproc); + if(g_proc_id == 0) { + printf("# the plaquette value after lime read of conf.lemon is %e\n", plaquette_energy); + } + + if(g_proc_id==0) { + printf("done ...\n"); + } +#endif + + if(g_proc_id == 0) { + printf("# now we read with lime from conf.lime\n"); + } + read_lime_gauge_field("conf.lime", NULL, NULL, NULL); + plaquette_energy = measure_plaquette(g_gauge_field) / (6.*VOLUME*g_nproc); + if(g_proc_id == 0) { + printf("# the plaquette value after lime read of conf.lime is %e\n", plaquette_energy); + } + + /* xlfInfo is constructed in every build, so release it outside the HAVE_LIBLEMON section */ + free(xlfInfo); +#ifdef MPI + MPI_Finalize(); +#endif + free_gauge_field(); + free_geometry_indices(); + return(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_buffers.c b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_buffers.c new file mode 100644 index 0000000000000000000000000000000000000000..98ac13c5280b427e7f7eaaa13c1a4fdd432bc2ec --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_buffers.c @@ -0,0 +1,34 @@ + +#include +#include + +#ifdef MPI +#include +#endif + +#include "test_buffers_gauge.h" + +TEST_SUITES { + TEST_SUITE_ADD(BUFFERS_GAUGE), + TEST_SUITES_CLOSURE +}; + +int main(int argc,char *argv[]){ +#ifdef MPI + MPI_Init(&argc, &argv); + MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id); +#else + g_proc_id = 0; +#endif + + CU_SET_OUT_PREFIX("regressions/"); + CU_RUN(argc,argv); + +#ifdef MPI + MPI_Finalize(); +#endif + + return 0; +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_buffers_gauge.c b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_buffers_gauge.c new file mode 100644 index 0000000000000000000000000000000000000000..393bf7c12214109915ed84d81b39f0cf0ae542a4 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_buffers_gauge.c @@ -0,0 +1,95 @@ +#include +#include + +#include + +#include + +#define EPS 5e-16 + +int VOLUMEPLUSRAND = 10000; + +/* g_gauge_buffers is defined in one of the includes! */ +extern gauge_buffers_t g_gauge_buffers; + +TEST(buffers_gauge_allocate_finalize) { + int test = 0; + const unsigned int max = 10; + + initialize_gauge_buffers(max); + + assertEqualsM(g_gauge_buffers.max,max,"Buffers were not initialized correctly! max != 10 \n"); + + finalize_gauge_buffers(); + assertFalseM(test,"TODO: No good test condition for a failed finalize.\n"); + + assertEqualsM(g_gauge_buffers.allocated,0,"Finalize error, allocated != 0 \n"); + assertEqualsM(g_gauge_buffers.free,0,"Finalize error, free != 0 \n"); +} + +/* TODO: add test for reaching max, but since this terminates the program currently, + * need to wait for a cleaner failure condition to be implemented */ + +TEST(buffers_gauge_get_return) { + const int max = 10; + + initialize_gauge_buffers(max); + assertEqualsM(g_gauge_buffers.max,max,"Buffers were not initialized correctly! 
max != 10 \n"); + + + gauge_field_t gauge_field[max]; + + gauge_field[0] = get_gauge_field(); + + assertEqualsM(g_gauge_buffers.allocated,1,"Get error, allocated != 1 \n"); + assertEqualsM(g_gauge_buffers.free,0,"Get error, free != 0 \n"); + + for(int i = 1; i < 6; ++i) { + gauge_field[i] = get_gauge_field(); + } + + assertEqualsM(g_gauge_buffers.allocated,6,"Get error, allocated != 6 \n"); + + return_gauge_field(&gauge_field[5]); + + assertEqualsM(gauge_field[5].field,NULL,"Return error, field pointer not NULL \n"); + assertEqualsM(g_gauge_buffers.allocated,6,"Return error, allocated != 6 \n"); + assertEqualsM(g_gauge_buffers.free,1,"Return error, free != 1 \n"); + + gauge_field[5] = get_gauge_field(); + + assertNotEqualsM(gauge_field[5].field,NULL,"Get error, field pointer still NULL \n"); + assertEqualsM(g_gauge_buffers.allocated,6,"Get error, allocated != 6 \n"); + assertEqualsM(g_gauge_buffers.free,0,"Get error, free != 0 \n"); + + allocate_gauge_buffers(2); + + assertEqualsM(g_gauge_buffers.allocated,8,"Allocate error, allocated != 8 \n"); + assertEqualsM(g_gauge_buffers.free,2,"Allocate error, free != 2 \n"); + + gauge_field[6] = get_gauge_field(); + + assertEqualsM(g_gauge_buffers.allocated,8,"Get error, allocated != 8 \n"); + assertEqualsM(g_gauge_buffers.free,1,"Get error, free != 1 \n"); + + for(int i = 4; i <= 6; ++i) { + return_gauge_field(&gauge_field[i]); + } + + assertEqualsM(g_gauge_buffers.free,4,"Return error, free != 4 \n"); + + free_unused_gauge_buffers(); + + assertEqualsM(g_gauge_buffers.allocated,4,"Free error, allocated != 4 \n"); + assertEqualsM(g_gauge_buffers.free,0,"Free error, free != 0 \n"); + + for(int i = 0; i < 4; ++i) { + return_gauge_field(&gauge_field[i]); + } + + finalize_gauge_buffers(); + + assertEqualsM(g_gauge_buffers.allocated,0,"Finalize error, allocated != 0 \n"); + assertEqualsM(g_gauge_buffers.free,0,"Finalize error, free != 0 \n"); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_buffers_gauge.h 
b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_buffers_gauge.h new file mode 100644 index 0000000000000000000000000000000000000000..71b5f92872d5e607517c908713b569e14b03b7ea --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_buffers_gauge.h @@ -0,0 +1,17 @@ +#ifndef _TEST_BUFFERS_GAUGE_H +#define _TEST_BUFFERS_GAUGE_H + +#include + +TEST(buffers_gauge_allocate_finalize); +TEST(buffers_gauge_get_return); + +TEST_SUITE(BUFFERS_GAUGE){ + TEST_ADD(buffers_gauge_allocate_finalize), + TEST_ADD(buffers_gauge_get_return), + TEST_SUITE_CLOSURE +}; + +#endif /* _TEST_BUFFERS_GAUGE_H */ + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_clover.c b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_clover.c new file mode 100644 index 0000000000000000000000000000000000000000..bb26a63e1821211e617988292ed63a983de2fad1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_clover.c @@ -0,0 +1,15 @@ + +#include "../global.h" +#include "test_clover_six_invert.h" + +TEST_SUITES { + TEST_SUITE_ADD(CLOVER), + TEST_SUITES_CLOSURE +}; + +int main(int argc,char *argv[]){ + CU_SET_OUT_PREFIX("regressions/"); + CU_RUN(argc,argv); + return 0; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_clover_six_invert.c b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_clover_six_invert.c new file mode 100644 index 0000000000000000000000000000000000000000..fa95c8ef6080496aa98ab88ff54da3275486f6fc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_clover_six_invert.c @@ -0,0 +1,91 @@ +#include +#include +#include +#include + +#define EPS 1e-7 + +int six_invert(_Complex double a[6][6]); +_Complex double six_det(_Complex double a[6][6]); + +TEST(clover_six_invert) { + _Complex double a[6][6], b[6][6]; + int test = 0; + + // random matrix a + a[0][0] = -0.0226172-1.0842742*I; a[0][1] = -0.4641519+0.7071808*I; a[0][2] = -0.0786318+1.4290063*I; + a[1][0] = 0.2165182+2.6528579*I; 
a[1][1] = 1.4397192-0.5239191*I; a[1][2] = -0.7269084+0.8157988*I; + a[2][0] = -0.0628841-0.3470563*I; a[2][1] = -1.0386082-0.2135166*I; a[2][2] = -1.3647777+0.7312646*I; + a[3][0] = -0.1675412-0.7309873*I; a[3][1] = 0.1120023-1.3983000*I; a[3][2] = -0.1266411+0.4298037*I; + a[4][0] = -0.2725515+0.1809753*I; a[4][1] = -0.1379395-0.7037811*I; a[4][2] = -0.6896344+0.1783902*I; + a[5][0] = -1.0980302+0.2763006*I; a[5][1] = -1.8903566-0.3511587*I; a[5][2] = 1.1886761-1.7150829*I; + + a[0][3] = 0.5028327+1.1093231*I; a[0][4] = 0.3878236-1.3375976*I; a[0][5] = 0.1203910+2.0495843*I; + a[1][3] = -0.5099459-0.0617545*I; a[1][4] = 1.6599072-0.1078419*I; a[1][5] = 0.5164999+1.0314383*I; + a[2][3] = -0.6036081+0.3900738*I; a[2][4] = -0.0447905+0.7071715*I; a[2][5] = 0.6763751+0.4613504*I; + a[3][3] = 1.0440726+1.4681992*I; a[3][4] = -1.3339747+0.0932149*I; a[3][5] = 0.3268227-0.4352195*I; + a[4][3] = -0.3226257-0.8897978*I; a[4][4] = -0.2680521+0.1304365*I; a[4][5] = -1.0114200-0.2461815*I; + a[5][3] = -0.1194779-0.4089390*I; a[5][4] = -0.1003558+1.6537274*I; a[5][5] = -0.6532741+0.5098912*I; + + // b = inverse of a + b[0][0] = -0.24037097+0.14414191*I; b[0][1] = -0.11380668-0.08118723*I; b[0][2] = -0.1589440+0.4350548*I; + b[1][0] = -0.10475996+0.12442873*I; b[1][1] = 0.10510192+0.23615703*I; b[1][2] = -0.0141379+0.2762152*I; + b[2][0] = -0.01620610+0.00456679*I; b[2][1] = 0.02483109-0.02776261*I; b[2][2] = -0.1478979-0.0784658*I; + b[3][0] = 0.09209149+0.00787285*I; b[3][1] = 0.01995269+0.00092068*I; b[3][2] = -0.2347910+0.1687461*I; + b[4][0] = 0.21497592+0.31304060*I; b[4][1] = 0.24420948-0.01908121*I; b[4][2] = 0.3385191-0.2141792*I; + b[5][0] = -0.01061067-0.16808488*I; b[5][1] = 0.09468236-0.08485920*I; b[5][2] = 0.4353193+0.0010994*I; + + b[0][3] = -0.0239881-0.4151801*I; b[0][4] = -0.6263347-0.5963434*I; b[0][5] = -0.45655201-0.02202738*I; + b[1][3] = -0.1350729-0.0418095*I; b[1][4] = -0.6033738+0.0647601*I; b[1][5] = -0.28037632+0.30025691*I; + b[2][3] = 
-0.1431319+0.0244497*I; b[2][4] = -0.2807683-0.0808173*I; b[2][5] = 0.12654249+0.21884983*I; + b[3][3] = 0.2140318-0.4344302*I; b[3][4] = -0.1638382+0.0162849*I; b[3][5] = -0.17682708-0.12990665*I; + b[4][3] = -0.4013470+0.0988086*I; b[4][4] = -0.3337646+0.9573819*I; b[4][5] = 0.28730090+0.30454484*I; + b[5][3] = -0.1739908+0.0800473*I; b[5][4] = -0.2584657+0.3703075*I; b[5][5] = 0.09579707+0.08151071*I; + + six_invert(a); + test = 0; + + for(int i = 0; i < 6; i++) { + for(int j = 0; j < 6; j++) { + if(cabs(a[i][j] - b[i][j]) > EPS) { /* magnitude of the complex difference: a signed component test (creal(x) > EPS) lets every negative deviation pass */ + printf("%d %d %e %e %e %e\n", i, j, creal(a[i][j]), cimag(a[i][j]), creal(b[i][j]), cimag(b[i][j])); + test = 1; + } + } + } + + assertFalseM(test,"The six_invert function does not work correctly!\n"); +} + +TEST(clover_six_det) { + _Complex double a[6][6]; + int test = 0; + _Complex double d = 0.; + + // random matrix a + a[0][0] = -0.0226172-1.0842742*I; a[0][1] = -0.4641519+0.7071808*I; a[0][2] = -0.0786318+1.4290063*I; + a[1][0] = 0.2165182+2.6528579*I; a[1][1] = 1.4397192-0.5239191*I; a[1][2] = -0.7269084+0.8157988*I; + a[2][0] = -0.0628841-0.3470563*I; a[2][1] = -1.0386082-0.2135166*I; a[2][2] = -1.3647777+0.7312646*I; + a[3][0] = -0.1675412-0.7309873*I; a[3][1] = 0.1120023-1.3983000*I; a[3][2] = -0.1266411+0.4298037*I; + a[4][0] = -0.2725515+0.1809753*I; a[4][1] = -0.1379395-0.7037811*I; a[4][2] = -0.6896344+0.1783902*I; + a[5][0] = -1.0980302+0.2763006*I; a[5][1] = -1.8903566-0.3511587*I; a[5][2] = 1.1886761-1.7150829*I; + + a[0][3] = 0.5028327+1.1093231*I; a[0][4] = 0.3878236-1.3375976*I; a[0][5] = 0.1203910+2.0495843*I; + a[1][3] = -0.5099459-0.0617545*I; a[1][4] = 1.6599072-0.1078419*I; a[1][5] = 0.5164999+1.0314383*I; + a[2][3] = -0.6036081+0.3900738*I; a[2][4] = -0.0447905+0.7071715*I; a[2][5] = 0.6763751+0.4613504*I; + a[3][3] = 1.0440726+1.4681992*I; a[3][4] = -1.3339747+0.0932149*I; a[3][5] = 0.3268227-0.4352195*I; + a[4][3] = -0.3226257-0.8897978*I; a[4][4] = 
-0.2680521+0.1304365*I; a[4][5] = -1.0114200-0.2461815*I; + a[5][3] = -0.1194779-0.4089390*I; a[5][4] = -0.1003558+1.6537274*I; a[5][5] = -0.6532741+0.5098912*I; + + d = six_det(a); + test = 0; + + if(cabs(d - (-44.9277673 + 84.4696631*I)) > EPS) { /* distance to the reference determinant; the signed per-component test accepted any result that was too small */ + printf("%.10e %.10e\n", creal(d), cimag(d)); + test = 1; + } + + assertFalseM(test,"The six_det function does not work correctly!\n"); +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_clover_six_invert.h b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_clover_six_invert.h new file mode 100644 index 0000000000000000000000000000000000000000..35c6ee23757c93b2a06a6f22d246f4d74cd155ff --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_clover_six_invert.h @@ -0,0 +1,15 @@ +#ifndef _TEST_CLOVER_SIX_INVERT_H +#define _TEST_CLOVER_SIX_INVERT_H + +#include + +TEST(clover_six_invert); +TEST(clover_six_det); + +TEST_SUITE(CLOVER){ + TEST_ADD(clover_six_invert), + TEST_ADD(clover_six_det), + TEST_SUITE_CLOSURE +}; + +#endif /* _TEST_CLOVER_SIX_INVERT_H */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_linalg.c b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_linalg.c new file mode 100644 index 0000000000000000000000000000000000000000..51ceb0cf199c30482869a916de6ae884e0a2c564 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_linalg.c @@ -0,0 +1,18 @@ + +#if HAVE_CONFIG_H +#include +#endif +#include "../global.h" +#include "test_linalg_spinor.h" + +TEST_SUITES { + TEST_SUITE_ADD(LINALG), + TEST_SUITES_CLOSURE +}; + +int main(int argc,char *argv[]){ + CU_SET_OUT_PREFIX("regressions/"); + CU_RUN(argc,argv); + return 0; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_linalg_spinor.c b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_linalg_spinor.c new file mode 100644 index 0000000000000000000000000000000000000000..fd8df823df84a7db0ae5f97a7f1ee61119ee0c84 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_linalg_spinor.c @@ -0,0 +1,437 @@ +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include +#include +#include "../su3.h" +#include "../linalg_eo.h" + +#define EPS 1e-15 + +TEST(scalar_prod_real) { + const int N = 1000; + int test = 0; + double snrm = 0., atime=0., etime=0.; + double *s, *r; + spinor R[N] ALIGN; + spinor S[N] ALIGN; + + R[0].s0.c0 =-1.1+0.7*I; + R[0].s0.c1 =-0.0-0.7*I; + R[0].s0.c2 =-1.3+1.9*I; + R[0].s1.c0 = 4.4-4.0*I; + R[0].s1.c1 =-1.5-3.1*I; + R[0].s1.c2 = 6.6+2.3*I; + R[0].s2.c0 =-5.4-0.7*I; + R[0].s2.c1 =-7.8-6.3*I; + R[0].s2.c2 = 1.3+3.7*I; + R[0].s3.c0 =-8.3-4.6*I; + R[0].s3.c1 =-1.3+4.5*I; + R[0].s3.c2 = 9.3-2.3*I; + + S[0].s0.c0 =+1.1-0.7*I; + S[0].s0.c1 =-0.0-1.7*I; + S[0].s0.c2 =-1.3+1.9*I; + S[0].s1.c0 =-4.4-4.0*I; + S[0].s1.c1 =-1.5-3.1*I; + S[0].s1.c2 = 3.4+0.3*I; + S[0].s2.c0 =-5.4-0.7*I; + S[0].s2.c1 =+7.4-6.3*I; + S[0].s2.c2 = 1.6+3.7*I; + S[0].s3.c0 =+8.3-4.6*I; + S[0].s3.c1 =-1.8-0.5*I; + S[0].s3.c2 =-9.0+0.3*I; + + R[1].s0.c0 =-1.1+0.7*I; + R[1].s0.c1 = 0.0-0.7*I; + R[1].s0.c2 =-1.3+1.9*I; + R[1].s1.c0 = 1.4-4.0*I; + R[1].s1.c1 =-1.0-3.1*I; + R[1].s1.c2 = 6.6+3.3*I; + R[1].s2.c0 =-5.4+0.7*I; + R[1].s2.c1 =+7.8-4.3*I; + R[1].s2.c2 =-1.3+3.7*I; + R[1].s3.c0 =-8.3-4.6*I; + R[1].s3.c1 =-1.3+4.5*I; + R[1].s3.c2 = 9.3-2.3*I; + + S[1].s0.c0 =+1.1-0.7*I; + S[1].s0.c1 =+0.0-1.7*I; + S[1].s0.c2 = 1.2+4.1*I; + S[1].s1.c0 =-4.4-4.7*I; + S[1].s1.c1 =-1.5-2.1*I; + S[1].s1.c2 = 3.4-0.3*I; + S[1].s2.c0 =-5.4-0.7*I; + S[1].s2.c1 = 7.4+6.3*I; + S[1].s2.c2 =-1.6+3.7*I; + S[1].s3.c0 =+1.3-4.6*I; + S[1].s3.c1 =-1.8-0.5*I; + S[1].s3.c2 =-2.0+0.3*I; + + snrm = scalar_prod_r(R, S, 2, 0); + if( (snrm - 4.584000e+01) > EPS) test = 1; + assertFalseM(test, "scalar_prod_r failed\n."); + for(int i = 0; i < N; i++) { + s = (double*)(S+i); + r = (double*)(R+i); + for(int j = 0; j < 24; j++) { + s[j] = (double)random()/(double)RAND_MAX; + r[j] = 
(double)random()/(double)RAND_MAX; + } + } + + atime = (double)clock()/(double)(CLOCKS_PER_SEC); + for(int i = 0; i < 10000; i++) { + snrm += scalar_prod_r(R, S, N, 0); + } + etime = (double)clock()/(double)(CLOCKS_PER_SEC); + printf("res %e\n\n", snrm); + printf("time = %e\n", etime-atime); +} + + +TEST(snorm) { + const int N = 1000; + int test = 0; + double snrm = 0., atime=0., etime=0.; + double *s, *r; + spinor R[N] ALIGN; + spinor S[N] ALIGN; + + R[0].s0.c0 =-1.1+0.7*I; + R[0].s0.c1 =-0.0-0.7*I; + R[0].s0.c2 =-1.3+1.9*I; + R[0].s1.c0 = 4.4-4.0*I; + R[0].s1.c1 =-1.5-3.1*I; + R[0].s1.c2 = 6.6+2.3*I; + R[0].s2.c0 =-5.4-0.7*I; + R[0].s2.c1 =-7.8-6.3*I; + R[0].s2.c2 = 1.3+3.7*I; + R[0].s3.c0 =-8.3-4.6*I; + R[0].s3.c1 =-1.3+4.5*I; + R[0].s3.c2 = 9.3-2.3*I; + + S[0].s0.c0 =+1.1-0.7*I; + S[0].s0.c1 =-0.0-1.7*I; + S[0].s0.c2 =-1.3+1.9*I; + S[0].s1.c0 =-4.4-4.0*I; + S[0].s1.c1 =-1.5-3.1*I; + S[0].s1.c2 = 3.4+0.3*I; + S[0].s2.c0 =-5.4-0.7*I; + S[0].s2.c1 =+7.4-6.3*I; + S[0].s2.c2 = 1.6+3.7*I; + S[0].s3.c0 =+8.3-4.6*I; + S[0].s3.c1 =-1.8-0.5*I; + S[0].s3.c2 =-9.0+0.3*I; + + R[1].s0.c0 =-1.1+0.7*I; + R[1].s0.c1 = 0.0-0.7*I; + R[1].s0.c2 =-1.3+1.9*I; + R[1].s1.c0 = 1.4-4.0*I; + R[1].s1.c1 =-1.0-3.1*I; + R[1].s1.c2 = 6.6+3.3*I; + R[1].s2.c0 =-5.4+0.7*I; + R[1].s2.c1 =+7.8-4.3*I; + R[1].s2.c2 =-1.3+3.7*I; + R[1].s3.c0 =-8.3-4.6*I; + R[1].s3.c1 =-1.3+4.5*I; + R[1].s3.c2 = 9.3-2.3*I; + + S[1].s0.c0 =+1.1-0.7*I; + S[1].s0.c1 =+0.0-1.7*I; + S[1].s0.c2 = 1.2+4.1*I; + S[1].s1.c0 =-4.4-4.7*I; + S[1].s1.c1 =-1.5-2.1*I; + S[1].s1.c2 = 3.4-0.3*I; + S[1].s2.c0 =-5.4-0.7*I; + S[1].s2.c1 = 7.4+6.3*I; + S[1].s2.c2 =-1.6+3.7*I; + S[1].s3.c0 =+1.3-4.6*I; + S[1].s3.c1 =-1.8-0.5*I; + S[1].s3.c2 =-2.0+0.3*I; + + snrm = square_norm(R, 2, 0); + printf("square norm = %.16e\n", snrm); + if( (snrm - 8.7152999999999997e+02 ) > EPS) test = 1; + assertFalseM(test, "square_norm failed\n."); + for(int i = 0; i < N; i++) { + s = (double*)(S+i); + r = (double*)(R+i); + for(int j = 0; j < 24; j++) { + s[j] 
= (double)random()/(double)RAND_MAX; + r[j] = (double)random()/(double)RAND_MAX; + } + } + + atime = (double)clock()/(double)(CLOCKS_PER_SEC); + for(int i = 0; i < 10000; i++) { + snrm += square_norm(R, N, 0); + } + etime = (double)clock()/(double)(CLOCKS_PER_SEC); + printf("res %e\n\n", snrm); + printf("time = %e\n", etime-atime); + +} + +TEST(sdiff) { + const int N = 1000; + int test = 0; + double snrm = 0., atime=0., etime=0.; + double *s, *r; + spinor R[N] ALIGN; + spinor S[N] ALIGN; + spinor Q[N] ALIGN; + + R[0].s0.c0 =-1.1+0.7*I; + R[0].s0.c1 =-0.0-0.7*I; + R[0].s0.c2 =-1.3+1.9*I; + R[0].s1.c0 = 4.4-4.0*I; + R[0].s1.c1 =-1.5-3.1*I; + R[0].s1.c2 = 6.6+2.3*I; + R[0].s2.c0 =-5.4-0.7*I; + R[0].s2.c1 =-7.8-6.3*I; + R[0].s2.c2 = 1.3+3.7*I; + R[0].s3.c0 =-8.3-4.6*I; + R[0].s3.c1 =-1.3+4.5*I; + R[0].s3.c2 = 9.3-2.3*I; + + S[0].s0.c0 =+1.1-0.7*I; + S[0].s0.c1 =-0.0-1.7*I; + S[0].s0.c2 =-1.3+1.9*I; + S[0].s1.c0 =-4.4-4.0*I; + S[0].s1.c1 =-1.5-3.1*I; + S[0].s1.c2 = 3.4+0.3*I; + S[0].s2.c0 =-5.4-0.7*I; + S[0].s2.c1 =+7.4-6.3*I; + S[0].s2.c2 = 1.6+3.7*I; + S[0].s3.c0 =+8.3-4.6*I; + S[0].s3.c1 =-1.8-0.5*I; + S[0].s3.c2 =-9.0+0.3*I; + + R[1].s0.c0 =-1.1+0.7*I; + R[1].s0.c1 = 0.0-0.7*I; + R[1].s0.c2 =-1.3+1.9*I; + R[1].s1.c0 = 1.4-4.0*I; + R[1].s1.c1 =-1.0-3.1*I; + R[1].s1.c2 = 6.6+3.3*I; + R[1].s2.c0 =-5.4+0.7*I; + R[1].s2.c1 =+7.8-4.3*I; + R[1].s2.c2 =-1.3+3.7*I; + R[1].s3.c0 =-8.3-4.6*I; + R[1].s3.c1 =-1.3+4.5*I; + R[1].s3.c2 = 9.3-2.3*I; + + S[1].s0.c0 =+1.1-0.7*I; + S[1].s0.c1 =+0.0-1.7*I; + S[1].s0.c2 = 1.2+4.1*I; + S[1].s1.c0 =-4.4-4.7*I; + S[1].s1.c1 =-1.5-2.1*I; + S[1].s1.c2 = 3.4-0.3*I; + S[1].s2.c0 =-5.4-0.7*I; + S[1].s2.c1 = 7.4+6.3*I; + S[1].s2.c2 =-1.6+3.7*I; + S[1].s3.c0 =+1.3-4.6*I; + S[1].s3.c1 =-1.8-0.5*I; + S[1].s3.c2 =-2.0+0.3*I; + + diff(Q, R, S, 2); + snrm = square_norm(Q, 2, 0); + printf("diff %.16e %.16e\n", creal(Q[0].s0.c0), cimag(Q[1].s2.c1)); + printf("square_norm Q=R-S = %.16e\n\n", snrm); + if( (snrm - 1.4169700000000000e+03 ) > EPS || + 
(creal(Q[0].s0.c0) +2.2 )>EPS || (cimag(Q[1].s2.c1) + 10.6) > EPS) test = 1; /* the +10.6 offset applies to the result of cimag(); adding a real constant inside cimag() leaves the imaginary part unchanged */ + assertFalseM(test, "diff failed\n."); + for(int i = 0; i < N; i++) { + s = (double*)(S+i); + r = (double*)(R+i); + for(int j = 0; j < 24; j++) { + s[j] = (double)random()/(double)RAND_MAX; + r[j] = (double)random()/(double)RAND_MAX; + } + } + + atime = (double)clock()/(double)(CLOCKS_PER_SEC); + for(int i = 0; i < 10000; i++) { + diff(Q, R, S, N); + diff(R, S, Q, N); + } + etime = (double)clock()/(double)(CLOCKS_PER_SEC); + printf("time = %e\n", etime-atime); + +} + +TEST(aaddm_r) { + const int N = 1000; + int test = 0; + double c = 0.756; + double snrm = 0., atime=0., etime=0.; + double *s, *r; + spinor R[N] ALIGN; + spinor S[N] ALIGN; + + R[0].s0.c0 =-1.1+0.7*I; + R[0].s0.c1 =-0.0-0.7*I; + R[0].s0.c2 =-1.3+1.9*I; + R[0].s1.c0 = 4.4-4.0*I; + R[0].s1.c1 =-1.5-3.1*I; + R[0].s1.c2 = 6.6+2.3*I; + R[0].s2.c0 =-5.4-0.7*I; + R[0].s2.c1 =-7.8-6.3*I; + R[0].s2.c2 = 1.3+3.7*I; + R[0].s3.c0 =-8.3-4.6*I; + R[0].s3.c1 =-1.3+4.5*I; + R[0].s3.c2 = 9.3-2.3*I; + + S[0].s0.c0 =+1.1-0.7*I; + S[0].s0.c1 =-0.0-1.7*I; + S[0].s0.c2 =-1.3+1.9*I; + S[0].s1.c0 =-4.4-4.0*I; + S[0].s1.c1 =-1.5-3.1*I; + S[0].s1.c2 = 3.4+0.3*I; + S[0].s2.c0 =-5.4-0.7*I; + S[0].s2.c1 =+7.4-6.3*I; + S[0].s2.c2 = 1.6+3.7*I; + S[0].s3.c0 =+8.3-4.6*I; + S[0].s3.c1 =-1.8-0.5*I; + S[0].s3.c2 =-9.0+0.3*I; + + R[1].s0.c0 =-1.1+0.7*I; + R[1].s0.c1 = 0.0-0.7*I; + R[1].s0.c2 =-1.3+1.9*I; + R[1].s1.c0 = 1.4-4.0*I; + R[1].s1.c1 =-1.0-3.1*I; + R[1].s1.c2 = 6.6+3.3*I; + R[1].s2.c0 =-5.4+0.7*I; + R[1].s2.c1 =+7.8-4.3*I; + R[1].s2.c2 =-1.3+3.7*I; + R[1].s3.c0 =-8.3-4.6*I; + R[1].s3.c1 =-1.3+4.5*I; + R[1].s3.c2 = 9.3-2.3*I; + + S[1].s0.c0 =+1.1-0.7*I; + S[1].s0.c1 =+0.0-1.7*I; + S[1].s0.c2 = 1.2+4.1*I; + S[1].s1.c0 =-4.4-4.7*I; + S[1].s1.c1 =-1.5-2.1*I; + S[1].s1.c2 = 3.4-0.3*I; + S[1].s2.c0 =-5.4-0.7*I; + S[1].s2.c1 = 7.4+6.3*I; + S[1].s2.c2 =-1.6+3.7*I; + S[1].s3.c0 =+1.3-4.6*I; + S[1].s3.c1 =-1.8-0.5*I; + S[1].s3.c2 =-2.0+0.3*I; + + 
assign_add_mul_r(R, S, c, 2); + snrm = square_norm(R, 2, 0); + printf("single parts %.16e %.16e\n", creal(R[0].s0.c0), cimag(R[1].s2.c1)); + printf("assign_add_mul_r = %.16e\n\n", snrm); + if( (snrm/1000. - 1.3049770963199999 ) > EPS || + (creal(R[0].s0.c0)*10 + 2.6840000000000003 ) > EPS || (cimag(R[1].s2.c1)*10 - 4.6280000000000010) > EPS) test = 1; + assertFalseM(test, "assign_add_mul_r failed\n."); + + for(int i = 0; i < N; i++) { + s = (double*)(S+i); + r = (double*)(R+i); + for(int j = 0; j < 24; j++) { + s[j] = (double)random()/(double)RAND_MAX; + r[j] = (double)random()/(double)RAND_MAX; + } + } + + atime = (double)clock()/(double)(CLOCKS_PER_SEC); + for(int i = 0; i < 10000; i++) { + assign_add_mul_r(R, S, c, N); + } + etime = (double)clock()/(double)(CLOCKS_PER_SEC); + printf("time assign_add_mul_r = %e\n", etime-atime); +} + +TEST(amadd_r) { + const int N = 1000; + int test = 0; + double c = 0.756; + double snrm = 0., atime=0., etime=0.; + double *s, *r; + spinor R[N] ALIGN; + spinor S[N] ALIGN; + + R[0].s0.c0 =-1.1+0.7*I; + R[0].s0.c1 =-0.0-0.7*I; + R[0].s0.c2 =-1.3+1.9*I; + R[0].s1.c0 = 4.4-4.0*I; + R[0].s1.c1 =-1.5-3.1*I; + R[0].s1.c2 = 6.6+2.3*I; + R[0].s2.c0 =-5.4-0.7*I; + R[0].s2.c1 =-7.8-6.3*I; + R[0].s2.c2 = 1.3+3.7*I; + R[0].s3.c0 =-8.3-4.6*I; + R[0].s3.c1 =-1.3+4.5*I; + R[0].s3.c2 = 9.3-2.3*I; + + S[0].s0.c0 =+1.1-0.7*I; + S[0].s0.c1 =-0.0-1.7*I; + S[0].s0.c2 =-1.3+1.9*I; + S[0].s1.c0 =-4.4-4.0*I; + S[0].s1.c1 =-1.5-3.1*I; + S[0].s1.c2 = 3.4+0.3*I; + S[0].s2.c0 =-5.4-0.7*I; + S[0].s2.c1 =+7.4-6.3*I; + S[0].s2.c2 = 1.6+3.7*I; + S[0].s3.c0 =+8.3-4.6*I; + S[0].s3.c1 =-1.8-0.5*I; + S[0].s3.c2 =-9.0+0.3*I; + + R[1].s0.c0 =-1.1+0.7*I; + R[1].s0.c1 = 0.0-0.7*I; + R[1].s0.c2 =-1.3+1.9*I; + R[1].s1.c0 = 1.4-4.0*I; + R[1].s1.c1 =-1.0-3.1*I; + R[1].s1.c2 = 6.6+3.3*I; + R[1].s2.c0 =-5.4+0.7*I; + R[1].s2.c1 =+7.8-4.3*I; + R[1].s2.c2 =-1.3+3.7*I; + R[1].s3.c0 =-8.3-4.6*I; + R[1].s3.c1 =-1.3+4.5*I; + R[1].s3.c2 = 9.3-2.3*I; + + S[1].s0.c0 =+1.1-0.7*I; + 
S[1].s0.c1 =+0.0-1.7*I; + S[1].s0.c2 = 1.2+4.1*I; + S[1].s1.c0 =-4.4-4.7*I; + S[1].s1.c1 =-1.5-2.1*I; + S[1].s1.c2 = 3.4-0.3*I; + S[1].s2.c0 =-5.4-0.7*I; + S[1].s2.c1 = 7.4+6.3*I; + S[1].s2.c2 =-1.6+3.7*I; + S[1].s3.c0 =+1.3-4.6*I; + S[1].s3.c1 =-1.8-0.5*I; + S[1].s3.c2 =-2.0+0.3*I; + + assign_mul_add_r(R, c, S, 2); + snrm = square_norm(R, 2, 0); + printf("single parts %.16e %.16e\n", creal(R[0].s0.c0), cimag(R[1].s2.c1)); + printf("assign_mul_add_r = %.16e\n\n", snrm); + if( (snrm/1000. - 1.2045408500799999 ) > EPS || + (creal(R[0].s0.c0)*10 - 2.6840000000000003 ) > EPS || (cimag(R[1].s2.c1) - 3.0491999999999999) > EPS) test = 1; + assertFalseM(test, "assign_mul_add_r failed\n."); + + for(int i = 0; i < N; i++) { + s = (double*)(S+i); + r = (double*)(R+i); + for(int j = 0; j < 24; j++) { + s[j] = (double)random()/(double)RAND_MAX; + r[j] = (double)random()/(double)RAND_MAX; + } + } + + atime = (double)clock()/(double)(CLOCKS_PER_SEC); + for(int i = 0; i < 10000; i++) { + assign_mul_add_r(R, c, S, N); + } + etime = (double)clock()/(double)(CLOCKS_PER_SEC); + printf("time assign_mul_add_r = %e\n", etime-atime); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_linalg_spinor.h b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_linalg_spinor.h new file mode 100644 index 0000000000000000000000000000000000000000..7981cf0a9ec9a152fd166675085064be2934cbcf --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_linalg_spinor.h @@ -0,0 +1,21 @@ +#ifndef _TEST_LINALG_SPINOR_H +#define _TEST_LINALG_SPINOR_H + +#include + +TEST(scalar_prod_real); +TEST(snorm); +TEST(sdiff); +TEST(aaddm_r); +TEST(amadd_r); + +TEST_SUITE(LINALG){ + TEST_ADD(scalar_prod_real), + TEST_ADD(snorm), + TEST_ADD(sdiff), + TEST_ADD(aaddm_r), + TEST_ADD(amadd_r), + TEST_SUITE_CLOSURE + }; + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_qpx.c b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_qpx.c new file mode 100644 index 
0000000000000000000000000000000000000000..5e27f51b55b037beb196246f7779c3c36c0da03a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_qpx.c @@ -0,0 +1,14 @@ + +#include "test_qpx_algebra.h" + +TEST_SUITES { + TEST_SUITE_ADD(QPX_ALGEBRA), + TEST_SUITES_CLOSURE +}; + +int main(int argc,char *argv[]){ + CU_SET_OUT_PREFIX("regressions/"); + CU_RUN(argc,argv); + return 0; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_qpx_algebra.c b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_qpx_algebra.c new file mode 100644 index 0000000000000000000000000000000000000000..7de1685affc0ded795e4897f994879bd349a49ad --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_qpx_algebra.c @@ -0,0 +1,557 @@ +#include +#include +#include +#include +#if (defined SSE || defined SSE2 || defined SSE3) +# include "sse.h" +#endif +#include "../su3.h" +#include "../su3adj.h" +#include "../expo.h" +#if (!defined BGQ && defined XLC) +# include "../bgq.h" +#endif + +#define EPS 5e-16 + +TEST(qpx_algebra) { + int test = 0; +#if (!defined BGQ && defined XLC) + complex double ca, cb, cc, cd, k ALIGN; + vector4double a, b, c, d; + su3 u ALIGN; + su3_vector phi0, phi1, phi2, phi3, phi4, phi5 ALIGN; + spinor s, temp, temp2 ALIGN; + spinor * sp; + vector4double rs[12]; + vector4double r[12]; + vector4double U[9]; + + k = 0.3-0.7*I; + phi0.c0 = 0.9+2*I; + phi0.c1 = 0.6+1.3*I; + phi0.c2 = 0.6-.2*I; + + phi1.c0 = -0.1+1*I; + phi1.c1 = 0.1+0.3*I; + phi1.c2 = -0.1-.12*I; + + u.c00 = +0.3391 -0.1635*I; + u.c01 = -0.2357 +0.5203*I; + u.c02 = +0.5609 +0.4663*I; + u.c10 = -0.0740 -0.4204*I; + u.c11 = -0.7706 -0.1863*I; + u.c12 = +0.1191 -0.4185*I; + u.c20 = +0.5351 -0.6243*I; + u.c21 = +0.1825 +0.1089*I; + u.c22 = -0.5279 -0.0022*I; + + _su3_multiply(phi2, u, phi0); + _su3_multiply(phi3, u, phi1); + + vec_load2(r, &phi0); + vec_load2(&r[3], &phi1); + vec_su3_multiply_double2(&u, U, r); + vec_store2(&phi4, &r[6]); + vec_store2(&phi5, &r[9]); + 
if( cabs(phi4.c0 - phi2.c0) > EPS || + cabs(phi4.c1 - phi2.c1) > EPS || + cabs(phi4.c2 - phi2.c2) > EPS || + cabs(phi5.c0 - phi3.c0) > EPS || + cabs(phi5.c1 - phi3.c1) > EPS || + cabs(phi5.c2 - phi3.c2) > EPS ) + test = 1; + + assertFalseM(test, "vec_su3_multiply_double2 failed\n"); + test = 0; + + _su3_inverse_multiply(phi2, u, phi0); + _su3_inverse_multiply(phi3, u, phi1); + + vec_load2(r, &phi0); + vec_load2(&r[3], &phi1); + vec_su3_inverse_multiply_double2(&u, U, r); + vec_store2(&phi4, &r[6]); + vec_store2(&phi5, &r[9]); + if( cabs(phi4.c0 - phi2.c0) > EPS || + cabs(phi4.c1 - phi2.c1) > EPS || + cabs(phi4.c2 - phi2.c2) > EPS || + cabs(phi5.c0 - phi3.c0) > EPS || + cabs(phi5.c1 - phi3.c1) > EPS || + cabs(phi5.c2 - phi3.c2) > EPS ) + test = 1; + + assertFalseM(test, "vec_su3_inverse_multiply_double2 failed\n"); + test = 0; + + vec_load2(r, &phi0); + vec_load2(r+3, &phi1); + vec_add2(r, r+3); + vec_store2(&phi4, r); + + if( cabs(phi4.c0 - phi0.c0 - phi1.c0) > EPS || + cabs(phi4.c1 - phi0.c1 - phi1.c1) > EPS || + cabs(phi4.c2 - phi0.c2 - phi1.c2) > EPS ) + test = 1; + + assertFalseM(test, "vec_add2 failed\n"); + test = 0; + + vec_load2(r, &phi0); + vec_load2(r+3, &phi1); + vec_sub2(r, r+3); + vec_store2(&phi4, r); + + if( cabs(phi4.c0 - phi0.c0 + phi1.c0) > EPS || + cabs(phi4.c1 - phi0.c1 + phi1.c1) > EPS || + cabs(phi4.c2 - phi0.c2 + phi1.c2) > EPS ) + test = 1; + + assertFalseM(test, "vec_sub2 failed\n"); + test = 0; + + vec_load2(r, &phi0); + vec_load2(r+3, &phi1); + vec_cmplx_mul_double2(&r[6], r, U, &k); + vec_store2(&phi4, r+6); + vec_store2(&phi5, r+9); + + if( cabs(phi4.c0 - k*phi0.c0) > EPS || + cabs(phi4.c1 - k*phi0.c1) > EPS || + cabs(phi4.c2 - k*phi0.c2) > EPS || + cabs(phi5.c0 - k*phi1.c0) > EPS || + cabs(phi5.c1 - k*phi1.c1) > EPS || + cabs(phi5.c2 - k*phi1.c2) > EPS ) + test = 1; + + assertFalseM(test, "vec_cmplx_mul_double2 failed\n"); + test = 0; + + vec_load2(r, &phi0); + vec_load2(r+3, &phi1); + vec_cmplxcg_mul_double2(r+6, r, U, &k); + 
vec_store2(&phi4, r+6); + vec_store2(&phi5, r+9); + + if( cabs(phi4.c0 - conj(k)*phi0.c0) > EPS || + cabs(phi4.c1 - conj(k)*phi0.c1) > EPS || + cabs(phi4.c2 - conj(k)*phi0.c2) > EPS || + cabs(phi5.c0 - conj(k)*phi1.c0) > EPS || + cabs(phi5.c1 - conj(k)*phi1.c1) > EPS || + cabs(phi5.c2 - conj(k)*phi1.c2) > EPS ) + test = 1; + + assertFalseM(test, "vec_cmplxcg_mul_double2 failed\n"); + test = 0; + + vec_load2(r, &phi0); + vec_load2(r+3, &phi1); + vec_i_mul_add2(r, r+3, U); + vec_store2(&phi4, r); + + if( cabs(phi4.c0 - phi0.c0 - I*phi1.c0) > EPS || + cabs(phi4.c1 - phi0.c1 - I*phi1.c1) > EPS || + cabs(phi4.c2 - phi0.c2 - I*phi1.c2) > EPS ) + test = 1; + + assertFalseM(test, "vec_i_mul_add2 failed\n"); + test = 0; + + vec_load2(r, &phi0); + vec_load2(r+3, &phi1); + vec_i_mul_sub2(r, r+3, U); + vec_store2(&phi4, r); + + if( cabs(phi4.c0 - phi0.c0 + I*phi1.c0) > EPS || + cabs(phi4.c1 - phi0.c1 + I*phi1.c1) > EPS || + cabs(phi4.c2 - phi0.c2 + I*phi1.c2) > EPS ) + test = 1; + + assertFalseM(test, "vec_i_mul_sub2 failed\n"); + test = 0; + + phi2.c0 = 0.9+2*I; + phi2.c1 = 0.6+1.3*I; + phi2.c2 = 0.6-.2*I; + + phi3.c0 = -0.4+0.3*I; + phi3.c1 = 0.5-1.3*I; + phi3.c2 = -0.1-3.12*I; + + s.s0 = phi0; + s.s1 = phi1; + s.s2 = phi2; + s.s3 = phi3; + sp = &s; + _vector_add(psi,sp->s0,sp->s2); + _su3_multiply(chi,u,psi); + _complex_times_vector(psi,k,chi); + _vector_assign(temp.s0,psi); + _vector_assign(temp.s2,psi); + _vector_add(psi, sp->s1, sp->s3); + _su3_multiply(chi,u,psi); + _complex_times_vector(psi,k,chi); + _vector_assign(temp.s1,psi); + _vector_assign(temp.s3,psi); + + vec_load2(r, &sp->s0); + vec_load2(r+3, &sp->s1); + vec_load2(r+6, &sp->s2); + vec_load2(r+9, &sp->s3); + // s0 + s2 and s1 + s3 + vec_add_double2(r, &r[6]); + // result is now in r[0-5] + vec_su3_multiply_double2(&u, U, r); + // result is now in r[6-11] + // mult with ka0 and store in rs + vec_cmplx_mul_double2(rs, &r[6], U, &k); + rs[6] = rs[0]; rs[7] = rs[1]; rs[8] = rs[2]; + rs[9] = rs[3]; rs[10]= rs[4]; 
rs[11]= rs[5]; + vec_store2(&temp2.s0, rs); + vec_store2(&temp2.s1, rs+3); + vec_store2(&temp2.s2, rs+6); + vec_store2(&temp2.s3, rs+9); + + if( cabs(temp.s0.c0 - temp2.s0.c0) > EPS || + cabs(temp.s0.c1 - temp2.s0.c1) > EPS || + cabs(temp.s0.c2 - temp2.s0.c2) > EPS || + cabs(temp.s1.c0 - temp2.s1.c0) > EPS || + cabs(temp.s1.c1 - temp2.s1.c1) > EPS || + cabs(temp.s1.c2 - temp2.s1.c2) > EPS || + cabs(temp.s2.c0 - temp2.s2.c0) > EPS || + cabs(temp.s2.c1 - temp2.s2.c1) > EPS || + cabs(temp.s2.c2 - temp2.s2.c2) > EPS || + cabs(temp.s3.c0 - temp2.s3.c0) > EPS || + cabs(temp.s3.c1 - temp2.s3.c1) > EPS || + cabs(temp.s3.c2 - temp2.s3.c2) > EPS ) + test = 1; + + assertFalseM(test, "D_t+ failed\n"); + test = 0; + + sm = sp; + _vector_sub(psi,sm->s0,sm->s2); + _su3_inverse_multiply(chi,u,psi); + _complexcjg_times_vector(psi,k,chi); + _vector_add_assign(temp.s0,psi); + _vector_sub_assign(temp.s2,psi); + _vector_sub(psi,sm->s1,sm->s3); + _su3_inverse_multiply(chi,u,psi); + _complexcjg_times_vector(psi,k,chi); + _vector_add_assign(temp.s1,psi); + _vector_sub_assign(temp.s3,psi); + + vec_load2(r, &sm->s0); + vec_load2(r+3, &sm->s1); + vec_load2(r+6, &sm->s2); + vec_load2(r+9, &sm->s3); + // s0 - s2 and s1 - s3 + vec_sub_double2(r, &r[6]); + // result is now in r[0-5] + vec_su3_inverse_multiply_double2(&u, U, r); + // result is now in r[6-11] + // mult with k0 + vec_cmplxcg_mul_double2(r, &r[6], U, &k); + // result in r[0-5] now + vec_add_double2(rs, r); + vec_sub_double2(&rs[6], r); + + vec_store2(&temp2.s0, rs); + vec_store2(&temp2.s1, rs+3); + vec_store2(&temp2.s2, rs+6); + vec_store2(&temp2.s3, rs+9); + + if( cabs(temp.s0.c0 - temp2.s0.c0) > EPS || + cabs(temp.s0.c1 - temp2.s0.c1) > EPS || + cabs(temp.s0.c2 - temp2.s0.c2) > EPS || + cabs(temp.s1.c0 - temp2.s1.c0) > EPS || + cabs(temp.s1.c1 - temp2.s1.c1) > EPS || + cabs(temp.s1.c2 - temp2.s1.c2) > EPS || + cabs(temp.s2.c0 - temp2.s2.c0) > EPS || + cabs(temp.s2.c1 - temp2.s2.c1) > EPS || + cabs(temp.s2.c2 - temp2.s2.c2) > EPS 
|| + cabs(temp.s3.c0 - temp2.s3.c0) > EPS || + cabs(temp.s3.c1 - temp2.s3.c1) > EPS || + cabs(temp.s3.c2 - temp2.s3.c2) > EPS ) + test = 1; + + assertFalseM(test, "D_t- failed\n"); + test = 0; + + _vector_i_add(psi,sp->s0,sp->s3); + _su3_multiply(chi,u,psi); + _complex_times_vector(psi,k,chi); + _vector_add_assign(temp.s0,psi); + _vector_i_sub_assign(temp.s3,psi); + _vector_i_add(psi,sp->s1,sp->s2); + _su3_multiply(chi,u,psi); + _complex_times_vector(psi,k,chi); + _vector_add_assign(temp.s1,psi); + _vector_i_sub_assign(temp.s2,psi); + + vec_load2(r, &sp->s0); + vec_load2(r+3, &sp->s1); + vec_load2(r+9, &sp->s2); + vec_load2(r+6, &sp->s3); + vec_i_mul_add_double2(r, &r[6], U); + vec_su3_multiply_double2(&u, U, r); + vec_cmplx_mul_double2(r, &r[6], U, &k); + vec_add_double2(rs, r); + vec_i_mul_sub2(&rs[6], &r[3], U); + vec_i_mul_sub2(&rs[9], &r[0], U); + + vec_store2(&temp2.s0, rs); + vec_store2(&temp2.s1, rs+3); + vec_store2(&temp2.s2, rs+6); + vec_store2(&temp2.s3, rs+9); + + if( cabs(temp.s0.c0 - temp2.s0.c0) > EPS || + cabs(temp.s0.c1 - temp2.s0.c1) > EPS || + cabs(temp.s0.c2 - temp2.s0.c2) > EPS || + cabs(temp.s1.c0 - temp2.s1.c0) > EPS || + cabs(temp.s1.c1 - temp2.s1.c1) > EPS || + cabs(temp.s1.c2 - temp2.s1.c2) > EPS || + cabs(temp.s2.c0 - temp2.s2.c0) > EPS || + cabs(temp.s2.c1 - temp2.s2.c1) > EPS || + cabs(temp.s2.c2 - temp2.s2.c2) > EPS || + cabs(temp.s3.c0 - temp2.s3.c0) > EPS || + cabs(temp.s3.c1 - temp2.s3.c1) > EPS || + cabs(temp.s3.c2 - temp2.s3.c2) > EPS ) + test = 1; + + assertFalseM(test, "D_x+ failed\n"); + test = 0; + + _vector_i_sub(psi,sm->s0,sm->s3); + _su3_inverse_multiply(chi,u,psi); + _complexcjg_times_vector(psi,k,chi); + _vector_add_assign(temp.s0,psi); + _vector_i_add_assign(temp.s3,psi); + _vector_i_sub(psi,sm->s1,sm->s2); + _su3_inverse_multiply(chi,u,psi); + _complexcjg_times_vector(psi,k,chi); + _vector_add_assign(temp.s1,psi); + _vector_i_add_assign(temp.s2,psi); + + vec_load2(r, &sm->s0); + vec_load2(r+3, &sm->s1); + vec_load2(r+9, 
&sm->s2); + vec_load2(r+6, &sm->s3); + vec_i_mul_sub_double2(r, &r[6], U); + vec_su3_inverse_multiply_double2(&u, U, r); + vec_cmplxcg_mul_double2(r, &r[6], U, &k); + vec_add_double2(rs, r); + vec_i_mul_add2(&rs[6], &r[3], U); + vec_i_mul_add2(&rs[9], &r[0], U); + + vec_store2(&temp2.s0, rs); + vec_store2(&temp2.s1, rs+3); + vec_store2(&temp2.s2, rs+6); + vec_store2(&temp2.s3, rs+9); + + if( cabs(temp.s0.c0 - temp2.s0.c0) > EPS || + cabs(temp.s0.c1 - temp2.s0.c1) > EPS || + cabs(temp.s0.c2 - temp2.s0.c2) > EPS || + cabs(temp.s1.c0 - temp2.s1.c0) > EPS || + cabs(temp.s1.c1 - temp2.s1.c1) > EPS || + cabs(temp.s1.c2 - temp2.s1.c2) > EPS || + cabs(temp.s2.c0 - temp2.s2.c0) > EPS || + cabs(temp.s2.c1 - temp2.s2.c1) > EPS || + cabs(temp.s2.c2 - temp2.s2.c2) > EPS || + cabs(temp.s3.c0 - temp2.s3.c0) > EPS || + cabs(temp.s3.c1 - temp2.s3.c1) > EPS || + cabs(temp.s3.c2 - temp2.s3.c2) > EPS ) + test = 1; + + assertFalseM(test, "D_x- failed\n"); + test = 0; + + _vector_add(psi,sp->s0,sp->s3); + _su3_multiply(chi,u,psi); + _complex_times_vector(psi,k,chi); + _vector_add_assign(temp.s0,psi); + _vector_add_assign(temp.s3,psi); + _vector_sub(psi,sp->s1,sp->s2); + _su3_multiply(chi,u,psi); + _complex_times_vector(psi,k,chi); + _vector_add_assign(temp.s1,psi); + _vector_sub_assign(temp.s2,psi); + + vec_load2(r, &sp->s0); + vec_load2(r+3, &sp->s1); + vec_load2(r+9, &sp->s2); + vec_load2(r+6, &sp->s3); + vec_add2(r, &r[6]); + vec_sub2(r+3, &r[9]); + vec_su3_multiply_double2(&u, U, r); + vec_cmplx_mul_double2(r, &r[6], U, &k); + vec_add_double2(rs, r); + vec_sub2(&rs[6], &r[3]); + vec_add2(&rs[9], r); + + vec_store2(&temp2.s0, rs); + vec_store2(&temp2.s1, rs+3); + vec_store2(&temp2.s2, rs+6); + vec_store2(&temp2.s3, rs+9); + + if( cabs(temp.s0.c0 - temp2.s0.c0) > EPS || + cabs(temp.s0.c1 - temp2.s0.c1) > EPS || + cabs(temp.s0.c2 - temp2.s0.c2) > EPS || + cabs(temp.s1.c0 - temp2.s1.c0) > EPS || + cabs(temp.s1.c1 - temp2.s1.c1) > EPS || + cabs(temp.s1.c2 - temp2.s1.c2) > EPS || + 
cabs(temp.s2.c0 - temp2.s2.c0) > EPS || + cabs(temp.s2.c1 - temp2.s2.c1) > EPS || + cabs(temp.s2.c2 - temp2.s2.c2) > EPS || + cabs(temp.s3.c0 - temp2.s3.c0) > EPS || + cabs(temp.s3.c1 - temp2.s3.c1) > EPS || + cabs(temp.s3.c2 - temp2.s3.c2) > EPS ) + test = 1; + + assertFalseM(test, "D_y+ failed\n"); + test = 0; + + _vector_sub(psi,sm->s0,sm->s3); + _su3_inverse_multiply(chi,u,psi); + _complexcjg_times_vector(psi,k,chi); + _vector_add_assign(temp.s0,psi); + _vector_sub_assign(temp.s3,psi); + _vector_add(psi,sm->s1,sm->s2); + _su3_inverse_multiply(chi,u,psi); + _complexcjg_times_vector(psi,k,chi); + _vector_add_assign(temp.s1,psi); + _vector_add_assign(temp.s2,psi); + + vec_load2(r, &sm->s0); + vec_load2(r+3, &sm->s1); + vec_load2(r+9, &sm->s2); + vec_load2(r+6, &sm->s3); + vec_sub2(r, r+6); + vec_add2(r+3, r+9); + vec_su3_inverse_multiply_double2(&u, U, r); + vec_cmplxcg_mul_double2(r, &r[6], U, &k); + vec_add_double2(rs, r); + vec_add2(rs+6, r+3); + vec_sub2(rs+9, r); + + vec_store2(&temp2.s0, rs); + vec_store2(&temp2.s1, rs+3); + vec_store2(&temp2.s2, rs+6); + vec_store2(&temp2.s3, rs+9); + + if( cabs(temp.s0.c0 - temp2.s0.c0) > EPS || + cabs(temp.s0.c1 - temp2.s0.c1) > EPS || + cabs(temp.s0.c2 - temp2.s0.c2) > EPS || + cabs(temp.s1.c0 - temp2.s1.c0) > EPS || + cabs(temp.s1.c1 - temp2.s1.c1) > EPS || + cabs(temp.s1.c2 - temp2.s1.c2) > EPS || + cabs(temp.s2.c0 - temp2.s2.c0) > EPS || + cabs(temp.s2.c1 - temp2.s2.c1) > EPS || + cabs(temp.s2.c2 - temp2.s2.c2) > EPS || + cabs(temp.s3.c0 - temp2.s3.c0) > EPS || + cabs(temp.s3.c1 - temp2.s3.c1) > EPS || + cabs(temp.s3.c2 - temp2.s3.c2) > EPS ) + test = 1; + + assertFalseM(test, "D_y- failed\n"); + test = 0; + + _vector_i_add(psi,sp->s0,sp->s2); + _su3_multiply(chi,u,psi); + _complex_times_vector(psi,k,chi); + _vector_add_assign(temp.s0,psi); + _vector_i_sub_assign(temp.s2,psi); + _vector_i_sub(psi,sp->s1,sp->s3); + _su3_multiply(chi,u,psi); + _complex_times_vector(psi,k,chi); + _vector_add_assign(temp.s1,psi); + 
_vector_i_add_assign(temp.s3,psi); + + vec_load2(r, &sp->s0); + vec_load2(r+3, &sp->s1); + vec_load2(r+6, &sp->s2); + vec_load2(r+9, &sp->s3); + vec_i_mul_add2(r, r+6, U); + vec_i_mul_sub2(r+3, r+9, U); + vec_su3_multiply_double2(&u, U, r); + vec_cmplx_mul_double2(r, &r[6], U, &k); + vec_add_double2(rs, r); + vec_i_mul_sub2(rs+6, r, U); + vec_i_mul_add2(rs+9, r+3, U); + + vec_store2(&temp2.s0, rs); + vec_store2(&temp2.s1, rs+3); + vec_store2(&temp2.s2, rs+6); + vec_store2(&temp2.s3, rs+9); + + if( cabs(temp.s0.c0 - temp2.s0.c0) > EPS || + cabs(temp.s0.c1 - temp2.s0.c1) > EPS || + cabs(temp.s0.c2 - temp2.s0.c2) > EPS || + cabs(temp.s1.c0 - temp2.s1.c0) > EPS || + cabs(temp.s1.c1 - temp2.s1.c1) > EPS || + cabs(temp.s1.c2 - temp2.s1.c2) > EPS || + cabs(temp.s2.c0 - temp2.s2.c0) > EPS || + cabs(temp.s2.c1 - temp2.s2.c1) > EPS || + cabs(temp.s2.c2 - temp2.s2.c2) > EPS || + cabs(temp.s3.c0 - temp2.s3.c0) > EPS || + cabs(temp.s3.c1 - temp2.s3.c1) > EPS || + cabs(temp.s3.c2 - temp2.s3.c2) > EPS ) + test = 1; + + assertFalseM(test, "D_z+ failed\n"); + test = 0; + + _vector_i_sub(psi,sm->s0,sm->s2); + _su3_inverse_multiply(chi,u,psi); + _complexcjg_times_vector(psi,k,chi); + _vector_add_assign(temp.s0, psi); + _vector_i_add_assign(temp.s2, psi); + _vector_i_add(psi,sm->s1,sm->s3); + _su3_inverse_multiply(chi,u,psi); + _complexcjg_times_vector(psi,k,chi); + _vector_add_assign(temp.s1, psi); + _vector_i_sub_assign(temp.s3, psi); + + vec_load2(r, &sm->s0); + vec_load2(r+3, &sm->s1); + vec_load2(r+6, &sm->s2); + vec_load2(r+9, &sm->s3); + vec_i_mul_sub2(r, r+6, U); + vec_i_mul_add2(r+3, r+9, U); + vec_su3_inverse_multiply_double2(&u, U, r); + vec_cmplxcg_mul_double2(r, &r[6], U, &k); + vec_add_double2(rs, r); + vec_store2(&temp2.s0, rs); + vec_store2(&temp2.s1, rs+3); + vec_i_mul_add2(rs+6, r, U); + vec_store2(&temp2.s2, rs+6); + vec_i_mul_sub2(rs+9, r+3, U); + vec_store2(&temp2.s3, rs+9); + + vec_store2(&temp2.s0, rs); + vec_store2(&temp2.s1, rs+3); + vec_store2(&temp2.s2, 
rs+6); + vec_store2(&temp2.s3, rs+9); + + if( cabs(temp.s0.c0 - temp2.s0.c0) > EPS || + cabs(temp.s0.c1 - temp2.s0.c1) > EPS || + cabs(temp.s0.c2 - temp2.s0.c2) > EPS || + cabs(temp.s1.c0 - temp2.s1.c0) > EPS || + cabs(temp.s1.c1 - temp2.s1.c1) > EPS || + cabs(temp.s1.c2 - temp2.s1.c2) > EPS || + cabs(temp.s2.c0 - temp2.s2.c0) > EPS || + cabs(temp.s2.c1 - temp2.s2.c1) > EPS || + cabs(temp.s2.c2 - temp2.s2.c2) > EPS || + cabs(temp.s3.c0 - temp2.s3.c0) > EPS || + cabs(temp.s3.c1 - temp2.s3.c1) > EPS || + cabs(temp.s3.c2 - temp2.s3.c2) > EPS ) + test = 1; + + assertFalseM(test, "D_z- failed\n"); + test = 0; + +#else + test = 1; + assertFalseM(test, "not on a BG/Q or not compiling with XLC\n"); +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_qpx_algebra.h b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_qpx_algebra.h new file mode 100644 index 0000000000000000000000000000000000000000..8a4ebb6f0686aace47ab277cf7bff2225b7a32e8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_qpx_algebra.h @@ -0,0 +1,13 @@ +#ifndef _TEST_QPX_ALGEBRA_H +#define _TEST_QPX_ALGEBRA_H + +#include + +TEST(qpx_algebra); + +TEST_SUITE(QPX_ALGEBRA){ + TEST_ADD(qpx_algebra), + TEST_SUITE_CLOSURE + }; + +#endif /* _TEST_QPX_ALGEBRA_H */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_rat.c b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_rat.c new file mode 100644 index 0000000000000000000000000000000000000000..754ef0a2d8a6b95aa4934399d83aac45bee9f5cd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_rat.c @@ -0,0 +1,20 @@ + +#if HAVE_CONFIG_H +#include +#endif +#include "../global.h" +#include "test_rat_init.h" + +TEST_SUITES { + TEST_SUITE_ADD(RAT), + TEST_SUITES_CLOSURE +}; + +int main(int argc,char *argv[]){ + CU_SET_OUT_PREFIX("regressions/"); + CU_RUN(argc,argv); + return 0; +} + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_rat_init.c 
b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_rat_init.c new file mode 100644 index 0000000000000000000000000000000000000000..fa6e8e0b15bb2fab08c29e7c76191b272446d6c7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_rat_init.c @@ -0,0 +1,80 @@ +#include +#include +#include +#include +#include +#include +#include +#include "../rational/rational.h" + +#define EPS 1e-8 + +// apply rational approximation as partial fraction +double apply_R(const int order, const double y, const double A, double * rs, double * as) { + double x = 1.; + + for(int i = 0; i < order; i++) { + x += rs[i]/(y+as[i]); + } + return(A*x); +} + +_Complex double apply_C(const int order, const double y, const double A, double * rs, double * as) { + _Complex double x = 1.; + for(int i = 0; i < order; i++) { + x += I*rs[i]/(sqrt(y)+I*as[i]); + } + return(x/sqrt(A)); +} + +_Complex double apply_Cdag(const int order, const double y, const double A, double * rs, double * as) { + _Complex double x = 1.; + for(int i = 0; i < order; i++) { + x -= I*rs[i]/(sqrt(y)-I*as[i]); + } + return(x/sqrt(A)); +} + +TEST(rat_init) { + int t = 0, ret=0; + int order = 10; + double eps = 1.e-4; + double ra = eps, rb = 1.; + rational_t rat; + double * ar = malloc(order*sizeof(double)); + rat.order = order; + rat.range[0] = ra; + rat.range[1] = rb; + rat.crange[0] = 0; + rat.crange[1] = order-1; + + ret = init_rational(&rat); + assertFalseM(ret, "rat_init failed\n"); + + for(int i = 0; i < order; i++) { + ar[i] = (rat.mu[i])*(rat.mu[i]); + } + + for(double y = eps; y < 1.; y += eps) { + double x = apply_R(rat.order, y, rat.A, rat.rmu, ar); + if(fabs(1 - x*sqrt(y)) > rat.delta + EPS) { + t++; + printf("%e %e %e %e %e\n", y, x, 1./sqrt(y), fabs(1 - x*sqrt(y)), rat.delta); + } + } + assertFalseM(t, "rational approximation not as accurate as expected\n."); + t = 0; + + for(double y = eps; y < 1.; y += eps) { + _Complex double c0 = apply_C(rat.order, y, rat.A, rat.rnu, rat.nu); + _Complex double 
c1 = apply_Cdag(rat.order, y, rat.A, rat.rnu, rat.nu); + double x = apply_R(rat.order, y, rat.A, rat.rmu, ar); + if((fabs(1-creal(c0*x*c1)) > rat.delta + EPS) || cimag(c0*c1) > EPS ) { + t++; + printf("res: %e %e %e %e (%e, %e) (%e, %e)\n", y, sqrt(y), x, creal(c0*c1), creal(c0), cimag(c0), creal(c1), cimag(c1)); + } + } + assertFalseM(t, "C^dagger C approximation not as accurate as expected\n."); + t = 0; + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_rat_init.h b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_rat_init.h new file mode 100644 index 0000000000000000000000000000000000000000..1fbd32c2c3d45170e146ee23d0c65a60f019e38b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_rat_init.h @@ -0,0 +1,13 @@ +#ifndef _TEST_RAT_INIT_H +#define _TEST_RAT_INIT_H + +#include + +TEST(rat_init); + +TEST_SUITE(RAT){ + TEST_ADD(rat_init), + TEST_SUITE_CLOSURE +}; + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_sample.c b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_sample.c new file mode 100644 index 0000000000000000000000000000000000000000..722506184e5461703181a7fb712c0233250b4d37 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_sample.c @@ -0,0 +1,18 @@ + +#include "test_sample_ts1.h" +#include "test_sample_ts2.h" + +TEST_SUITES { + TEST_SUITE_ADD(TS1), + TEST_SUITE_ADD(TS2), + TEST_SUITES_CLOSURE +}; + +int main(int argc, char *argv[]) { + CU_SET_OUT_PREFIX("regressions/"); + CU_RUN(argc,argv); + + return 0; +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_sample_ts1.c b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_sample_ts1.c new file mode 100644 index 0000000000000000000000000000000000000000..316423dbd24e54071a2914b8ffa7e1e0ec602bca --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_sample_ts1.c @@ -0,0 +1,14 @@ +#include + +TEST(test_true) { + assertTrue(1); +} + +TEST(test_false) { + 
assertFalse(0); +} + +TEST(test_fail) { + assertFalse(1); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_sample_ts1.h b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_sample_ts1.h new file mode 100644 index 0000000000000000000000000000000000000000..4603c0e18cd628208998e01f9456503049b09cf1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_sample_ts1.h @@ -0,0 +1,20 @@ +#ifndef _TEST_SAMPLE_TS1_H +#define _TEST_SAMPLE_TS1_H + +#include + +/* test declarations, definitions in %.c */ +TEST(test_true); +TEST(test_false); +TEST(test_fail); + +/* define test suite (enum type thing) */ +TEST_SUITE(TS1) { + TEST_ADD(test_true), + TEST_ADD(test_false), + TEST_ADD(test_fail), + TEST_SUITE_CLOSURE +}; + +#endif /* _TEST_SAMPLE_TS1_H */ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_sample_ts2.c b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_sample_ts2.c new file mode 100644 index 0000000000000000000000000000000000000000..37a3415ef0ebe5c76263bd82145794980335b8b0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_sample_ts2.c @@ -0,0 +1,6 @@ +#include + +TEST(test_true2) { + assertTrue(1); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_sample_ts2.h b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_sample_ts2.h new file mode 100644 index 0000000000000000000000000000000000000000..8c45be3e15bf99e18d3b0513eb74efddddcfa0c2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_sample_ts2.h @@ -0,0 +1,16 @@ +#ifndef _TEST_SAMPLE_TS2_H +#define _TEST_SAMPLE_TS2_H + +#include + +/* test declarations, definitions in %.c */ +TEST(test_true2); + +/* define test suite (enum type thing) */ +TEST_SUITE(TS2) { + TEST_ADD(test_true2), + TEST_SUITE_CLOSURE +}; + +#endif /* _TEST_SAMPLE_TS2_H */ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_su3.c b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_su3.c new file 
mode 100644 index 0000000000000000000000000000000000000000..60b3eca7f9b3d099e112d3401cdd5d5da6347844 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_su3.c @@ -0,0 +1,15 @@ + +#include "test_su3_algebra.h" + +TEST_SUITES { + TEST_SUITE_ADD(SU3_ALGEBRA), + TEST_SUITES_CLOSURE +}; + +int main(int argc,char *argv[]){ + CU_SET_OUT_PREFIX("regressions/"); + CU_RUN(argc,argv); + return 0; +} + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_su3_algebra.c b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_su3_algebra.c new file mode 100644 index 0000000000000000000000000000000000000000..25265bcb8f01b2d716f07c1e796f136986e31f8c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_su3_algebra.c @@ -0,0 +1,265 @@ +#include +#include +#include +#include +#if (defined SSE || defined SSE2 || defined SSE3) +# include "sse.h" +#endif +#include "../su3.h" +#include "../su3adj.h" +#include "../expo.h" + +#define EPS 5e-16 + +TEST(su3_multiply) { + su3 u; + su3_vector phi0, phi1, phi2, phi3; + int test = 0; + + phi0.c0 = 0.9+2*I; + phi0.c1 = 0.6+1.3*I; + phi0.c2 = 0.6-.2*I; + + phi1.c0 = -0.1+1*I; + phi1.c1 = 0.1+0.3*I; + phi1.c2 = -0.1-.12*I; + + u.c00 = +0.3391 -0.1635*I; + u.c01 = -0.2357 +0.5203*I; + u.c02 = +0.5609 +0.4663*I; + u.c10 = -0.0740 -0.4204*I; + u.c11 = -0.7706 -0.1863*I; + u.c12 = +0.1191 -0.4185*I; + u.c20 = +0.5351 -0.6243*I; + u.c21 = +0.1825 +0.1089*I; + u.c22 = -0.5279 -0.0022*I; + + _su3_multiply(phi2, u, phi0); + _su3_multiply(phi3, u, phi1); + + if( (creal(phi2.c0) - 2.441800e-01) > EPS || (cimag(phi2.c0) - 7.044200e-01) > EPS || + (creal(phi2.c1) - 5.417900e-01) > EPS || (cimag(phi2.c1) + 1.914840e+00) > EPS || + (creal(phi2.c2) - 1.380940e+00) > EPS || (cimag(phi2.c2) - 9.151800e-01) > EPS || + (creal(phi3.c0) + 5.020400e-02) > EPS || (cimag(phi3.c0) - 2.228320e-01) > EPS || + (creal(phi3.c1) - 3.445000e-01) > EPS || (cimag(phi3.c1) + 2.542120e-01) > EPS || + (creal(phi3.c2) - 
6.088960e-01) > EPS || (cimag(phi3.c2) - 7.267380e-01) > EPS) + test = 1; + assertFalseM(test, "_su3_multiply failed\n."); +} + +TEST(su3_inverse_multiply) { + su3 u; + su3_vector phi0, phi1, phi2, phi3; + int test = 0; + + phi0.c0 = 0.9+2*I; + phi0.c1 = 0.6+1.3*I; + phi0.c2 = 0.6-.2*I; + + phi1.c0 = -0.1+1*I; + phi1.c1 = 0.1+0.3*I; + phi1.c2 = -0.1-.12*I; + + u.c00 = +0.3391 -0.1635*I; + u.c01 = -0.2357 +0.5203*I; + u.c02 = +0.5609 +0.4663*I; + u.c10 = -0.0740 -0.4204*I; + u.c11 = -0.7706 -0.1863*I; + u.c12 = +0.1191 -0.4185*I; + u.c20 = +0.5351 -0.6243*I; + u.c21 = +0.1825 +0.1089*I; + u.c22 = -0.5279 -0.0022*I; + + _su3_inverse_multiply(phi2, u, phi0); + _su3_inverse_multiply(phi3, u, phi1); + + if( (creal(phi2.c0) + 1.668100e-01) > EPS || (cimag(phi2.c0) - 1.248950e+00) > EPS || + (creal(phi2.c1) - 2.116400e-01) > EPS || (cimag(phi2.c1) + 1.931510e+00) > EPS || + (creal(phi2.c2) - 6.485200e-01) > EPS || (cimag(phi2.c2) - 1.214960e+00) > EPS || + (creal(phi3.c0) + 3.095240e-01) > EPS || (cimag(phi3.c0) - 2.159480e-01) > EPS || + (creal(phi3.c1) - 3.796020e-01) > EPS || (cimag(phi3.c1) + 4.072300e-01) > EPS || + (creal(phi3.c2) - 3.496240e-01) > EPS || (cimag(phi3.c2) - 7.482380e-01) > EPS) + test = 1; + assertFalseM(test, "_su3_multiply failed\n."); +} + +TEST(vector_add) { + su3_vector phi0, phi1, phi2; + int test = 0; + + phi0.c0 = 0.9+2*I; + phi0.c1 = 0.6+1.3*I; + phi0.c2 = 0.6-.2*I; + + phi1.c0 = -0.1+1*I; + phi1.c1 = 0.1+0.3*I; + phi1.c2 = -0.1-.12*I; + + _vector_add(phi2, phi0, phi1); + + if( cabs(phi2.c0 - phi0.c0 - phi1.c0) > EPS || + cabs(phi2.c1 - phi0.c1 - phi1.c1) > EPS || + cabs(phi2.c2 - phi0.c2 - phi1.c2) > EPS ) + test = 1; + assertFalseM(test, "_vector_add failed\n."); +} + +TEST(vector_sub) { + su3_vector phi0, phi1, phi2; + int test = 0; + + phi0.c0 = 0.9+2*I; + phi0.c1 = 0.6+1.3*I; + phi0.c2 = 0.6-.2*I; + + phi1.c0 = -0.1+1*I; + phi1.c1 = 0.1+0.3*I; + phi1.c2 = -0.1-.12*I; + + _vector_sub(phi2, phi0, phi1); + + if( cabs(phi2.c0 - phi0.c0 + 
phi1.c0) > EPS || + cabs(phi2.c1 - phi0.c1 + phi1.c1) > EPS || + cabs(phi2.c2 - phi0.c2 + phi1.c2) > EPS ) + test = 1; + assertFalseM(test, "_vector_sub failed\n."); +} + +TEST(vector_i_add) { + su3_vector phi0, phi1, phi2; + int test = 0; + + phi0.c0 = 0.9+2*I; + phi0.c1 = 0.6+1.3*I; + phi0.c2 = 0.6-.2*I; + + phi1.c0 = -0.1+1*I; + phi1.c1 = 0.1+0.3*I; + phi1.c2 = -0.1-.12*I; + + _vector_i_add(phi2, phi0, phi1); + + if( cabs(phi2.c0 - phi0.c0 - I*phi1.c0) > EPS || + cabs(phi2.c1 - phi0.c1 - I*phi1.c1) > EPS || + cabs(phi2.c2 - phi0.c2 - I*phi1.c2) > EPS) + test = 1; + assertFalseM(test, "_vector_i_add failed\n."); +} + +TEST(vector_i_sub) { + su3_vector phi0, phi1, phi2; + int test = 0; + + phi0.c0 = 0.9+2*I; + phi0.c1 = 0.6+1.3*I; + phi0.c2 = 0.6-.2*I; + + phi1.c0 = -0.1+1*I; + phi1.c1 = 0.1+0.3*I; + phi1.c2 = -0.1-.12*I; + + _vector_i_sub(phi2, phi0, phi1); + + if( cabs(phi2.c0 - phi0.c0 + I*phi1.c0) > EPS || + cabs(phi2.c1 - phi0.c1 + I*phi1.c1) > EPS || + cabs(phi2.c2 - phi0.c2 + I*phi1.c2) > EPS) + test = 1; + assertFalseM(test, "_vector_i_sub failed\n."); +} + +TEST(cmplx_times_vector) { + su3_vector phi0, phi1; + complex double c = 3.7655 - 0.3*I; + int test = 0; + + phi0.c0 = 0.9+2*I; + phi0.c1 = 0.6+1.3*I; + phi0.c2 = 0.6-.2*I; + + _complex_times_vector(phi1, c, phi0); + + if( cabs(phi1.c0 - c*phi0.c0) > EPS || + cabs(phi1.c1 - c*phi0.c1) > EPS || + cabs(phi1.c2 - c*phi0.c2) > EPS) + test = 1; + assertFalseM(test, "_complex_times_vector failed\n."); +} + +TEST(cmplxcjg_times_vector) { + su3_vector phi0, phi1; + complex double c = 3.7655 - 0.3*I; + int test = 0; + + phi0.c0 = 0.9+2*I; + phi0.c1 = 0.6+1.3*I; + phi0.c2 = 0.6-.2*I; + + _complexcjg_times_vector(phi1, c, phi0); + + if( cabs(phi1.c0 - conj(c)*phi0.c0) > EPS || + cabs(phi1.c1 - conj(c)*phi0.c1) > EPS || + cabs(phi1.c2 - conj(c)*phi0.c2) > EPS) + test = 1; + assertFalseM(test, "_complexcjg_times_vector failed\n."); +} + + +TEST(su3_assign) { + su3 m1,m2; + + int test = 0; + m1.c00 = 1 + 1.*I; 
m1.c01 = 0.; m1.c02 = 0.; + m1.c10 = 0.; m1.c11 = 1 + 1.*I; m1.c12 = 0.; + m1.c20 = 0.; m1.c21 = 0.; m1.c22 = 1 + 1.*I; + + _su3_assign(m2,m1); + + if( creal(m2.c00) == 1 && cimag(m2.c00) == 1 && creal(m2.c01) == 0 && cimag(m2.c01) == 0 && creal(m2.c02) == 0 && cimag(m2.c02) == 0 && + creal(m2.c10) == 0 && cimag(m2.c10) == 0 && creal(m2.c11) == 1 && cimag(m2.c11) == 1 && creal(m2.c12) == 0 && cimag(m2.c12) == 0 && + creal(m2.c20) == 0 && cimag(m2.c20) == 0 && creal(m2.c21) == 0 && cimag(m2.c21) == 0 && creal(m2.c22) == 1 && cimag(m2.c22) == 1 ) + test = 1; + + assertTrueM(test,"The SU3 assignment operator does not work correctly!\n"); +} + +TEST(su3_expo_positivedet) { + su3 Q, U; + su3adj T; + + int test = 0; + + /* Positive determinant */ + Q.c00 = -0.2994; + Q.c01 = 0.5952 + 1.3123*I; + Q.c02 = -0.7943 + 0.0913*I; + Q.c11 = -1.1430; + Q.c12 = -2.0025 + 0.2978*I; + Q.c22 = +1.4424; + Q.c10 = conj(Q.c01); + Q.c20 = conj(Q.c02); + Q.c21 = conj(Q.c12); + + /* Matlab's solution for U = exp(i * Q) */ + U.c00 = +0.3391 -0.1635*I; + U.c01 = -0.2357 +0.5203*I; + U.c02 = +0.5609 +0.4663*I; + U.c10 = -0.0740 -0.4204*I; + U.c11 = -0.7706 -0.1863*I; + U.c12 = +0.1191 -0.4185*I; + U.c20 = +0.5351 -0.6243*I; + U.c21 = +0.1825 +0.1089*I; + U.c22 = -0.5279 -0.0022*I; + + _trace_lambda(T,Q); + exposu3(&Q,&T); + + if( creal(Q.c00 - U.c00) > EPS && creal(Q.c01 - U.c01) > EPS && creal(Q.c02 - U.c02) > EPS && + creal(Q.c10 - U.c10) > EPS && creal(Q.c11 - U.c11) > EPS && creal(Q.c12 - U.c12) > EPS && + creal(Q.c20 - U.c20) > EPS && creal(Q.c21 - U.c21) > EPS && creal(Q.c22 - U.c22) > EPS && + cimag(Q.c00 - U.c00) > EPS && cimag(Q.c01 - U.c01) > EPS && cimag(Q.c02 - U.c02) > EPS && + cimag(Q.c10 - U.c10) > EPS && cimag(Q.c11 - U.c11) > EPS && cimag(Q.c12 - U.c12) > EPS && + cimag(Q.c20 - U.c20) > EPS && cimag(Q.c21 - U.c21) > EPS && cimag(Q.c22 - U.c22) > EPS ) + test = 1; + + assertFalseM(test,"The exponentation of Q with a positive determinant failed.\n"); +} diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_su3_algebra.h b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_su3_algebra.h new file mode 100644 index 0000000000000000000000000000000000000000..5f291407c81b5ed3788a673fa52739de3ba67197 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tests/test_su3_algebra.h @@ -0,0 +1,33 @@ +#ifndef _TEST_SU3_ALGEBRA_H +#define _TEST_SU3_ALGEBRA_H + +#include + +TEST(su3_assign); +TEST(su3_expo_positivedet); +TEST(su3_multiply); +TEST(su3_inverse_multiply); +TEST(vector_add); +TEST(vector_sub); +TEST(vector_i_add); +TEST(vector_i_sub); +TEST(cmplx_times_vector); +TEST(cmplxcjg_times_vector); + +TEST_SUITE(SU3_ALGEBRA){ + TEST_ADD(su3_assign), + TEST_ADD(su3_expo_positivedet), + TEST_ADD(su3_multiply), + TEST_ADD(su3_inverse_multiply), + TEST_ADD(vector_add), + TEST_ADD(vector_sub), + TEST_ADD(vector_i_add), + TEST_ADD(vector_i_sub), + TEST_ADD(cmplx_times_vector), + TEST_ADD(cmplxcjg_times_vector), + TEST_SUITE_CLOSURE + }; + +#endif /* _TEST_SU3_ALGEBRA_H */ + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tmp/read_input.tmp.c b/qcd/part_cpu/applications/QCD/src/kernel_D/tmp/read_input.tmp.c new file mode 100644 index 0000000000000000000000000000000000000000..4a4b1a14c6cd8d411d41a02109534ecef573bd4b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tmp/read_input.tmp.c @@ -0,0 +1,7391 @@ + +#line 3 "" + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 5 +#define YY_FLEX_SUBMINOR_VERSION 33 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +/* First, we deal with platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include +#include +#include +#include + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have . Non-C99 systems may or may not. 
*/ + +#if __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; +#endif /* ! C99 */ + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! FLEXINT_H */ + +#ifdef __cplusplus + +/* The "const" storage-class-modifier is valid. */ +#define YY_USE_CONST + +#else /* ! __cplusplus */ + +#if __STDC__ + +#define YY_USE_CONST + +#endif /* __STDC__ */ +#endif /* ! __cplusplus */ + +#ifdef YY_USE_CONST +#define yyconst const +#else +#define yyconst +#endif + +/* Returned upon end-of-file. */ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an unsigned + * integer for use as an array index. If the signed char is negative, + * we want to instead treat it as an 8-bit unsigned char, hence the + * double cast. + */ +#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c) + +/* Enter a start condition. 
This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. + */ +#define BEGIN (yy_start) = 1 + 2 * + +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START (((yy_start) - 1) / 2) +#define YYSTATE YY_START + +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) + +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE yyrestart(yyin ) + +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#define YY_BUF_SIZE 16384 +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer. + */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +extern int yyleng; + +extern FILE *yyin, *yyout; + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + #define YY_LESS_LINENO(n) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = (yy_hold_char); \ + YY_RESTORE_YY_MORE_OFFSET \ + (yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up yytext again */ \ + } \ + while ( 0 ) + +#define unput(c) yyunput( c, (yytext_ptr) ) + +/* The following is because we cannot portably get our hands on size_t + * (without autoconf's help, which isn't available because we want + * flex-generated scanners to compile on their own). 
+ */ + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef unsigned int yy_size_t; +#endif + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + int yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. + * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via yyrestart()), so that the user can continue scanning by + * just pointing yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* Stack of input buffers. 
*/ +static size_t yy_buffer_stack_top = 0; /**< index of top of stack. */ +static size_t yy_buffer_stack_max = 0; /**< capacity of stack. */ +static YY_BUFFER_STATE * yy_buffer_stack = 0; /**< Stack as an array. */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( (yy_buffer_stack) \ + ? (yy_buffer_stack)[(yy_buffer_stack_top)] \ + : NULL) + +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. + */ +#define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)] + +/* yy_hold_char holds the character lost when yytext is formed. */ +static char yy_hold_char; +static int yy_n_chars; /* number of characters read into yy_ch_buf */ +int yyleng; + +/* Points to current character in buffer. */ +static char *yy_c_buf_p = (char *) 0; +static int yy_init = 0; /* whether we need to initialize */ +static int yy_start = 0; /* start state number */ + +/* Flag which is used to allow yywrap()'s to do buffer switches + * instead of setting up a fresh yyin. A bit of a hack ... 
+ */ +static int yy_did_buffer_switch_on_eof; + +void yyrestart (FILE *input_file ); +void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ); +YY_BUFFER_STATE yy_create_buffer (FILE *file,int size ); +void yy_delete_buffer (YY_BUFFER_STATE b ); +void yy_flush_buffer (YY_BUFFER_STATE b ); +void yypush_buffer_state (YY_BUFFER_STATE new_buffer ); +void yypop_buffer_state (void ); + +static void yyensure_buffer_stack (void ); +static void yy_load_buffer_state (void ); +static void yy_init_buffer (YY_BUFFER_STATE b,FILE *file ); + +#define YY_FLUSH_BUFFER yy_flush_buffer(YY_CURRENT_BUFFER ) + +YY_BUFFER_STATE yy_scan_buffer (char *base,yy_size_t size ); +YY_BUFFER_STATE yy_scan_string (yyconst char *yy_str ); +YY_BUFFER_STATE yy_scan_bytes (yyconst char *bytes,int len ); + +void *yyalloc (yy_size_t ); +void *yyrealloc (void *,yy_size_t ); +void yyfree (void * ); + +#define yy_new_buffer yy_create_buffer + +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer(yyin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } + +#define yy_set_bol(at_bol) \ + { \ + if ( ! 
YY_CURRENT_BUFFER ){\ + yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + yy_create_buffer(yyin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } + +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +typedef unsigned char YY_CHAR; + +FILE *yyin = (FILE *) 0, *yyout = (FILE *) 0; + +typedef int yy_state_type; + +extern int yylineno; + +int yylineno = 1; + +extern char *yytext; +#define yytext_ptr yytext + +static yy_state_type yy_get_previous_state (void ); +static yy_state_type yy_try_NUL_trans (yy_state_type current_state ); +static int yy_get_next_buffer (void ); +static void yy_fatal_error (yyconst char msg[] ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up yytext. + */ +#define YY_DO_BEFORE_ACTION \ + (yytext_ptr) = yy_bp; \ + yyleng = (size_t) (yy_cp - yy_bp); \ + (yy_hold_char) = *yy_cp; \ + *yy_cp = '\0'; \ + (yy_c_buf_p) = yy_cp; + +#define YY_NUM_RULES 371 +#define YY_END_OF_BUFFER 372 +/* This struct is not used in this scanner, + but its presence is necessary. 
*/ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static yyconst flex_int16_t yy_accept[2644] = + {} ; + +static yyconst flex_int32_t yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 4, 1, 5, 6, 1, 1, 1, 1, 1, + 1, 1, 7, 1, 8, 9, 5, 10, 11, 12, + 13, 14, 15, 16, 15, 15, 15, 1, 1, 1, + 17, 1, 1, 1, 20, 21, 22, 23, 24, 25, + 26, 27, 28, 5, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, + 18, 18, 18, 18, 19, 18, 20, 21, 22, 23, + + 24, 25, 26, 27, 28, 5, 29, 30, 31, 32, + 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + } ; + +static yyconst flex_int32_t yy_meta[45] = + { 0, + 1, 2, 3, 1, 4, 1, 5, 5, 6, 7, + 7, 7, 7, 7, 7, 7, 1, 4, 4, 4, + 4, 4, 4, 8, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4 + } ; + +static yyconst flex_int16_t yy_base[2999] = + {} ; + +static yyconst flex_int16_t yy_def[2999] = + { 0, + 2644, 2643, 2645, 2645, 2646, 2646, 2647, 2647, 2648, 2648, + 2649, 2649, 2650, 2650, 2651, 2651, 2652, 2652, 2653, 2653, + 2654, 2654, 2655, 2655, 2656, 2656, 2657, 2657, 2658, 2658, + 2659, 2659, 2659, 2659, 2659, 35, 35, 37, 2659, 39, + 39, 41, 2659, 2659, 39, 39, 2660, 2660, 2661, 2661, + 2662, 2662, 2663, 2663, 2664, 2664, 2665, 2665, 2666, 2666, + 2667, 2667, 2668, 2668, 2669, 2669, 2670, 2670, 2671, 2671, + 2672, 2672, 2673, 2673, 2674, 2674, 2675, 2675, 2676, 2676, + 2677, 2677, 2678, 2678, 2679, 2679, 2680, 2680, 2681, 2681, + 2682, 2682, 2683, 2683, 2684, 2684, 39, 39, 39, 39, + + 41, 101, 
2685, 2685, 2686, 2686, 2687, 2687, 2688, 2688, + 39, 111, 111, 111, 2689, 2689, 39, 39, 39, 39, + 39, 39, 39, 39, 39, 39, 39, 39, 2690, 2690, + 2690, 131, 131, 133, 133, 135, 135, 137, 2691, 2691, + 2692, 2692, 142, 143, 143, 145, 145, 147, 2693, 2693, + 150, 151, 151, 153, 153, 155, 111, 111, 39, 39, + 39, 39, 2694, 2694, 164, 164, 164, 164, 2695, 2695, + 170, 170, 111, 111, 2695, 2695, 2696, 2696, 2697, 2697, + 2698, 2698, 2699, 2699, 2690, 2690, 2700, 2700, 2701, 2701, + 2702, 2702, 192, 193, 182, 182, 2703, 2703, 2704, 2704, + + 2705, 2705, 2695, 2695, 170, 170, 170, 207, 2706, 2706, + 2707, 2707, 2708, 2708, 2709, 2709, 2690, 2690, 2710, 2710, + 2711, 2711, 2712, 2712, 2713, 2713, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2714, 2715, 2714, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2716, 2643, 2643, 2643, 2717, + 2643, 2643, 2643, 2718, 2719, 2718, 2643, 2643, 2643, 2720, + 2721, 2720, 2643, 2643, 2643, 2722, 2723, 2722, 2643, 2643, + 2643, 2724, 2643, 2643, 2643, 2725, 2726, 2725, 2643, 2643, + + 2643, 2727, 2728, 2727, 2643, 2643, 2643, 2729, 2730, 2729, + 2643, 2643, 2643, 2731, 2732, 2731, 2643, 2643, 2643, 2733, + 2734, 2733, 2643, 2643, 2643, 2735, 2643, 2643, 2643, 2643, + 2736, 2737, 2736, 2738, 2739, 2738, 2740, 2643, 2741, 2742, + 2741, 2643, 2643, 2643, 2643, 2643, 2643, 2743, 2643, 2643, + 2744, 2643, 2643, 2745, 2643, 2643, 2746, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2747, 2643, 2643, + 2643, 2748, 2643, 2643, 2643, 2643, 2749, 2643, 2643, 2643, + 2750, 2643, 2643, 2643, 2751, 2643, 2643, 2643, 2752, 2643, + 2643, 2643, 2753, 2643, 2643, 
2643, 2754, 2643, 2643, 2643, + 2755, 2643, 2643, 2643, 2756, 2643, 2643, 2643, 2757, 2643, + 2643, 2643, 2643, 2643, 2758, 2759, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2760, 2761, 2762, 2763, 2762, 2643, 2643, + 2764, 2643, 2643, 2643, 2765, 2766, 2765, 2643, 2643, 2643, + 2767, 2643, 2643, 2643, 2768, 2769, 2768, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2770, 2643, 2771, 2643, 2772, + + 2643, 2773, 2643, 2774, 2775, 2776, 2777, 2778, 2777, 2643, + 2779, 2780, 2779, 2781, 2782, 2781, 2783, 2784, 2783, 2785, + 2786, 2785, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2787, + 2788, 2787, 2643, 2789, 2790, 2789, 2791, 2792, 2791, 2793, + 2794, 2793, 2643, 2643, 2795, 2796, 2795, 2643, 2643, 2797, + 2798, 2797, 2643, 2799, 2800, 2799, 2801, 2802, 2801, 2643, + 2643, 2803, 2804, 2643, 2643, 2805, 2643, 2806, 2807, 2643, + 2643, 2808, 2643, 2809, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2810, 2811, 2810, + 2643, 2643, 2643, 2812, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2813, 2814, 2813, 2643, + 2815, 2816, 2815, 2643, 2817, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2818, 2643, + 2643, 2643, 2643, 2643, 2643, 2819, 2643, 2820, 2821, 2820, + 2643, 2643, 2822, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2823, 2643, 2824, + 2824, 2825, 2825, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2826, 2827, + 2826, 2828, 2643, 2643, 2643, 2829, 2830, 2831, 2832, 2831, + 2833, 2834, 2835, 2834, 2836, 2837, 
2838, 2837, 2839, 2840, + 2841, 2842, 2841, 2843, 2844, 2845, 2844, 2846, 2847, 2848, + 2847, 2849, 2850, 2851, 2850, 2852, 2853, 2854, 2853, 2855, + 2856, 2857, 2858, 2857, 2859, 2860, 2861, 2860, 2862, 2863, + 2864, 2865, 2864, 2866, 2643, 2643, 2643, 2643, 2643, 2867, + + 2868, 2869, 2870, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2871, 2872, 2643, + 2873, 2874, 2875, 2876, 2877, 2878, 2879, 2880, 2881, 2643, + 2643, 2882, 2883, 2643, 2643, 2643, 2884, 2885, 2886, 2887, + 2886, 2888, 2889, 2890, 2891, 2890, 2892, 2893, 2894, 2895, + 2894, 2896, 2643, 2643, 2643, 2643, 2643, 2897, 2898, 2899, + 2900, 2901, 2902, 2903, 2904, 2905, 2904, 2906, 2907, 2908, + 2907, 2909, 2910, 2911, 2910, 2912, 2913, 2914, 2913, 2915, + 2916, 2917, 2916, 2918, 2643, 2643, 2919, 2920, 2919, 2921, + + 2922, 2923, 2922, 2924, 2925, 2926, 2925, 2927, 2928, 2929, + 2928, 2930, 2931, 2932, 2931, 2933, 2934, 2935, 2934, 2936, + 2937, 2938, 2937, 2939, 2940, 2941, 2940, 2942, 2643, 2643, + 2943, 2944, 2945, 2946, 2947, 2948, 2949, 2643, 2643, 2643, + 2643, 2643, 2643, 2950, 2951, 2950, 2952, 2953, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2954, 2955, 2954, + 2956, 2957, 2958, 2957, 2959, 2960, 2643, 2643, 2643, 2643, + 2961, 2643, 2643, 2962, 2963, 2964, 2963, 2965, 2966, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2967, 2968, + 2969, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2970, 2970, + 2643, 2643, 2643, 2643, 2971, 2971, 2972, 
2972, 2973, 2973, + 2974, 2974, 2975, 2975, 2976, 2976, 2977, 2977, 2978, 2978, + 2979, 2979, 2980, 2980, 2981, 2981, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2982, + 2982, 2983, 2983, 2984, 2984, 2643, 2643, 2643, 2643, 2985, + 2985, 2986, 2986, 2987, 2987, 2988, 2988, 2989, 2989, 2643, + 2990, 2990, 2991, 2991, 2992, 2992, 2993, 2993, 2994, 2994, + 2995, 2995, 2996, 2996, 2997, 2997, 2643, 2643, 2643, 2643, + 2998, 2998, 2643, 2643, 2643, 2643, 2643, 2643, 2956, 1169, + 2959, 1171, 2643, 2643, 2643, 2965, 1176, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 
2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 
2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 
2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 
2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 0, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 
2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643 + + } ; + +static yyconst flex_int16_t 
yy_nxt[6138] = + { 0, + 228, 228, 229, 228, 228, 230, 228, 228, 228, 228, + 228, 231, 228, 228, 228, 228, 228, 228, 228, 232, + 233, 234, 235, 236, 237, 238, 228, 239, 240, 241, + 242, 243, 244, 245, 246, 247, 248, 249, 250, 228, + 251, 228, 228, 228, 229, 353, 229, 253, 254, 254, + 255, 256, 256, 256, 256, 256, 256, 256, 229, 721, + 722, 257, 254, 254, 255, 256, 256, 256, 256, 256, + 256, 256, 229, 229, 229, 259, 263, 329, 229, 229, + 229, 330, 329, 330, 229, 229, 229, 378, 359, 510, + 229, 260, 260, 362, 1264, 360, 261, 261, 361, 229, + + 360, 1265, 364, 361, 379, 262, 262, 229, 229, 365, + 265, 459, 366, 491, 266, 266, 266, 266, 266, 266, + 266, 229, 693, 229, 267, 229, 681, 460, 266, 266, + 266, 266, 266, 266, 266, 229, 694, 461, 269, 560, + 700, 462, 270, 270, 270, 270, 270, 270, 270, 229, + 561, 701, 271, 686, 612, 687, 270, 270, 270, 270, + 270, 270, 270, 229, 688, 613, 273, 274, 274, 275, + 276, 276, 276, 276, 276, 276, 276, 229, 1001, 1355, + 277, 274, 274, 275, 276, 276, 276, 276, 276, 276, + 276, 229, 1356, 1002, 279, 280, 280, 281, 282, 282, + + 282, 282, 282, 282, 282, 229, 1069, 1069, 283, 280, + 280, 281, 282, 282, 282, 282, 282, 282, 282, 229, + 1075, 1075, 285, 286, 286, 287, 288, 288, 288, 288, + 288, 288, 288, 229, 1077, 1077, 289, 286, 286, 287, + 288, 288, 288, 288, 288, 288, 288, 229, 353, 229, + 291, 716, 683, 717, 292, 292, 292, 292, 292, 292, + 292, 229, 698, 1059, 293, 1357, 1060, 699, 292, 292, + 292, 292, 292, 292, 292, 229, 1061, 1358, 295, 296, + 296, 297, 298, 298, 298, 298, 298, 298, 298, 229, + 1079, 1079, 299, 296, 296, 297, 298, 298, 298, 298, + + 298, 298, 298, 229, 1081, 1081, 301, 302, 302, 303, + 304, 304, 304, 304, 304, 304, 304, 229, 1083, 1083, + 305, 302, 302, 303, 304, 304, 304, 304, 304, 304, + 304, 229, 1085, 1085, 307, 308, 308, 309, 310, 310, + 310, 310, 310, 310, 310, 229, 1087, 1087, 311, 308, + 308, 309, 310, 310, 310, 310, 310, 310, 310, 229, + 1089, 1089, 313, 314, 314, 315, 316, 316, 316, 316, + 316, 316, 316, 
229, 1091, 1091, 317, 314, 314, 315, + 316, 316, 316, 316, 316, 316, 316, 229, 1093, 1093, + 319, 320, 320, 321, 322, 322, 322, 322, 322, 322, + + 322, 229, 1095, 1095, 323, 320, 320, 321, 322, 322, + 322, 322, 322, 322, 322, 229, 709, 1071, 325, 1072, + 710, 811, 326, 326, 326, 326, 326, 326, 326, 229, + 812, 1361, 327, 1120, 1120, 711, 326, 326, 326, 326, + 326, 326, 326, 229, 1122, 1122, 329, 331, 331, 332, + 333, 333, 333, 333, 333, 333, 333, 334, 334, 335, + 336, 336, 336, 336, 336, 336, 336, 229, 1023, 229, + 329, 1362, 382, 1024, 337, 337, 337, 337, 337, 337, + 337, 338, 339, 339, 340, 341, 341, 341, 341, 341, + 341, 341, 229, 229, 229, 329, 367, 380, 383, 229, + + 1363, 343, 385, 365, 229, 229, 366, 387, 390, 384, + 1124, 1124, 718, 344, 379, 719, 495, 345, 229, 346, + 229, 392, 229, 395, 720, 463, 347, 229, 383, 1364, + 330, 1130, 1130, 388, 388, 229, 343, 575, 397, 384, + 994, 460, 560, 1365, 389, 389, 995, 393, 344, 393, + 1056, 461, 345, 561, 346, 462, 1132, 1132, 394, 1057, + 394, 347, 338, 576, 398, 1366, 348, 348, 348, 348, + 348, 348, 348, 342, 577, 399, 1367, 348, 348, 348, + 348, 348, 348, 348, 229, 712, 1368, 350, 1134, 1134, + 578, 351, 351, 351, 351, 351, 351, 351, 229, 713, + + 1369, 352, 714, 715, 1370, 351, 351, 351, 351, 351, + 351, 351, 353, 353, 229, 353, 576, 355, 353, 353, + 1016, 1016, 1016, 229, 1136, 1136, 400, 577, 353, 353, + 353, 353, 229, 353, 815, 356, 353, 353, 1184, 816, + 1371, 229, 817, 1185, 402, 1372, 353, 353, 353, 353, + 229, 353, 398, 355, 353, 353, 229, 1138, 1138, 405, + 491, 1141, 1141, 399, 353, 353, 353, 353, 229, 353, + 403, 356, 353, 353, 495, 1373, 492, 1374, 493, 1375, + 494, 404, 353, 353, 229, 403, 328, 369, 229, 1340, + 492, 524, 493, 723, 494, 724, 404, 328, 1341, 725, + + 328, 229, 370, 371, 527, 229, 372, 373, 597, 1143, + 1143, 328, 374, 598, 599, 375, 229, 525, 1282, 376, + 1033, 1034, 1035, 229, 229, 1283, 571, 573, 526, 1376, + 525, 1145, 1145, 229, 370, 371, 667, 229, 372, 373, + 582, 526, 1377, 668, 
374, 1378, 669, 375, 229, 1147, + 1147, 407, 579, 579, 1379, 408, 408, 408, 408, 408, + 408, 408, 229, 580, 580, 409, 583, 1149, 1149, 408, + 408, 408, 408, 408, 408, 408, 229, 584, 1380, 411, + 1063, 1063, 1063, 412, 412, 412, 412, 412, 412, 412, + 1381, 229, 1012, 229, 600, 229, 670, 1013, 585, 598, + + 599, 229, 229, 668, 601, 607, 669, 1014, 229, 413, + 229, 610, 1382, 414, 1151, 1151, 1383, 412, 412, 412, + 412, 412, 412, 412, 583, 1384, 229, 1153, 1153, 661, + 602, 608, 1359, 229, 603, 584, 614, 608, 1155, 1155, + 1360, 604, 609, 413, 229, 662, 663, 416, 609, 664, + 1385, 417, 417, 417, 417, 417, 417, 417, 229, 1161, + 1161, 418, 612, 1169, 1169, 417, 417, 417, 417, 417, + 417, 417, 229, 613, 1386, 420, 704, 704, 704, 421, + 421, 421, 421, 421, 421, 421, 229, 1171, 1171, 422, + 1388, 705, 1389, 421, 421, 421, 421, 421, 421, 421, + + 229, 1176, 1176, 424, 1017, 1017, 1017, 425, 425, 425, + 425, 425, 425, 425, 229, 2643, 2643, 426, 1390, 1018, + 1391, 425, 425, 425, 425, 425, 425, 425, 229, 2643, + 2643, 428, 1019, 1019, 1019, 429, 429, 429, 429, 429, + 429, 429, 229, 2643, 2643, 430, 1392, 1020, 1393, 429, + 429, 429, 429, 429, 429, 429, 229, 1395, 1396, 432, + 1021, 1021, 1021, 433, 433, 433, 433, 433, 433, 433, + 229, 1397, 1398, 434, 1399, 1022, 1404, 433, 433, 433, + 433, 433, 433, 433, 229, 1405, 1406, 436, 1044, 1044, + 1044, 437, 437, 437, 437, 437, 437, 437, 229, 1400, + + 1409, 438, 1401, 1045, 1410, 437, 437, 437, 437, 437, + 437, 437, 229, 1411, 1414, 440, 1046, 1046, 1046, 441, + 441, 441, 441, 441, 441, 441, 229, 1415, 1416, 442, + 1417, 1047, 1420, 441, 441, 441, 441, 441, 441, 441, + 229, 1421, 1422, 444, 732, 732, 732, 445, 445, 445, + 445, 445, 445, 445, 229, 229, 1423, 446, 605, 733, + 1424, 445, 445, 445, 445, 445, 445, 445, 229, 1425, + 1426, 448, 1194, 1194, 1194, 449, 449, 449, 449, 449, + 449, 449, 1429, 229, 602, 450, 628, 1195, 603, 229, + 229, 1402, 631, 633, 1036, 604, 1037, 451, 229, 1403, + + 1430, 452, 1048, 1038, 1049, 449, 449, 
449, 449, 449, + 449, 449, 629, 1427, 1431, 450, 1050, 229, 629, 634, + 636, 1428, 1051, 630, 1016, 1016, 1016, 451, 229, 630, + 635, 454, 1432, 455, 1433, 456, 456, 456, 456, 456, + 456, 456, 229, 1434, 1435, 457, 634, 455, 1438, 456, + 456, 456, 456, 456, 456, 456, 338, 635, 1439, 1442, + 464, 464, 464, 464, 464, 464, 464, 342, 1209, 1209, + 1209, 464, 464, 464, 464, 464, 464, 464, 338, 1210, + 1210, 1210, 465, 465, 465, 465, 465, 465, 465, 342, + 1211, 1211, 1211, 465, 465, 465, 465, 465, 465, 465, + + 466, 466, 467, 468, 468, 468, 468, 468, 468, 468, + 229, 1443, 1444, 470, 1017, 1017, 1017, 471, 471, 471, + 471, 471, 471, 471, 229, 1445, 1446, 472, 1448, 1018, + 1449, 471, 471, 471, 471, 471, 471, 471, 229, 1450, + 1451, 474, 475, 475, 476, 477, 477, 477, 477, 477, + 477, 477, 229, 1452, 1455, 478, 475, 475, 476, 477, + 477, 477, 477, 477, 477, 477, 229, 1456, 1457, 480, + 1019, 1019, 1019, 481, 481, 481, 481, 481, 481, 481, + 229, 1458, 1453, 482, 1459, 1020, 1454, 481, 481, 481, + 481, 481, 481, 481, 229, 1460, 1461, 484, 485, 485, + + 486, 487, 487, 487, 487, 487, 487, 487, 229, 1464, + 1465, 488, 485, 485, 486, 487, 487, 487, 487, 487, + 487, 487, 338, 1219, 1219, 1219, 328, 328, 328, 328, + 328, 328, 328, 229, 1466, 229, 665, 695, 641, 689, + 690, 1467, 229, 696, 1468, 644, 691, 1469, 489, 1470, + 692, 697, 662, 663, 1471, 1472, 664, 1473, 1474, 490, + 328, 328, 229, 328, 642, 338, 328, 328, 1475, 229, + 229, 642, 656, 659, 1476, 643, 328, 328, 328, 328, + 229, 328, 643, 342, 328, 328, 1235, 1235, 1235, 1477, + 229, 1478, 1479, 672, 328, 328, 497, 1480, 657, 657, + + 498, 498, 498, 498, 498, 498, 498, 499, 1256, 658, + 658, 498, 498, 498, 498, 498, 498, 498, 338, 673, + 1257, 1258, 500, 500, 500, 500, 500, 500, 500, 342, + 674, 1481, 1482, 500, 500, 500, 500, 500, 500, 500, + 501, 1236, 1236, 1236, 502, 502, 502, 502, 502, 502, + 502, 503, 1063, 1063, 1063, 502, 502, 502, 502, 502, + 502, 502, 501, 1294, 1294, 1294, 504, 504, 504, 504, + 504, 504, 504, 
503, 1209, 1209, 1209, 504, 504, 504, + 504, 504, 504, 504, 501, 1210, 1210, 1210, 505, 505, + 505, 505, 505, 505, 505, 503, 1211, 1211, 1211, 505, + + 505, 505, 505, 505, 505, 505, 501, 1312, 1312, 1312, + 506, 506, 506, 506, 506, 506, 506, 503, 1313, 1313, + 1313, 506, 506, 506, 506, 506, 506, 506, 229, 1021, + 1021, 1021, 507, 507, 508, 509, 509, 509, 509, 509, + 509, 509, 229, 1483, 1022, 510, 507, 507, 508, 509, + 509, 509, 509, 509, 509, 509, 229, 1219, 1219, 1219, + 511, 511, 512, 513, 513, 513, 513, 513, 513, 513, + 514, 514, 515, 516, 516, 516, 516, 516, 516, 516, + 517, 517, 518, 519, 519, 519, 519, 519, 519, 519, + 520, 520, 521, 522, 522, 522, 522, 522, 522, 522, + + 229, 1484, 1485, 529, 530, 530, 531, 532, 532, 532, + 532, 532, 532, 532, 229, 1486, 1487, 533, 530, 530, + 531, 532, 532, 532, 532, 532, 532, 532, 529, 534, + 534, 535, 536, 536, 536, 536, 536, 536, 536, 537, + 537, 538, 539, 539, 539, 539, 539, 539, 539, 540, + 540, 541, 542, 542, 542, 542, 542, 542, 542, 229, + 1488, 1489, 544, 545, 545, 546, 547, 547, 547, 547, + 547, 547, 547, 229, 1490, 1491, 548, 545, 545, 546, + 547, 547, 547, 547, 547, 547, 547, 549, 550, 550, + 551, 552, 552, 552, 552, 552, 552, 552, 554, 554, + + 555, 556, 556, 556, 556, 556, 556, 556, 557, 557, + 558, 559, 559, 559, 559, 559, 559, 559, 491, 1315, + 1315, 1315, 562, 562, 562, 562, 562, 562, 562, 495, + 1235, 1235, 1235, 562, 562, 562, 562, 562, 562, 562, + 491, 1236, 1236, 1236, 563, 563, 563, 563, 563, 563, + 563, 495, 1026, 1026, 1026, 563, 563, 563, 563, 563, + 563, 563, 229, 1492, 1493, 565, 1462, 1029, 1494, 566, + 566, 566, 566, 566, 566, 566, 229, 1497, 1463, 567, + 1354, 1354, 1354, 566, 566, 566, 566, 566, 566, 566, + 565, 1387, 1387, 1387, 568, 568, 568, 568, 568, 568, + + 568, 568, 568, 568, 568, 568, 568, 568, 565, 1294, + 1294, 1294, 569, 569, 569, 569, 569, 569, 569, 569, + 569, 569, 569, 569, 569, 569, 229, 1498, 1499, 571, + 1215, 1215, 1215, 572, 572, 572, 572, 572, 572, 572, + 229, 1500, 1501, 
573, 1502, 1216, 1503, 572, 572, 572, + 572, 572, 572, 572, 571, 1394, 1394, 1394, 574, 574, + 574, 574, 574, 574, 574, 574, 574, 574, 574, 574, + 574, 574, 229, 1504, 1505, 587, 588, 588, 589, 590, + 590, 590, 590, 590, 590, 590, 229, 1506, 1508, 591, + 588, 588, 589, 590, 590, 590, 590, 590, 590, 590, + + 229, 1509, 1510, 593, 1217, 1217, 1217, 594, 594, 594, + 594, 594, 594, 594, 229, 1512, 1513, 595, 1514, 1218, + 1515, 594, 594, 594, 594, 594, 594, 594, 229, 1521, + 1522, 616, 617, 617, 618, 619, 619, 619, 619, 619, + 619, 619, 229, 1523, 1524, 620, 617, 617, 618, 619, + 619, 619, 619, 619, 619, 619, 616, 621, 621, 622, + 623, 623, 623, 623, 623, 623, 623, 624, 1312, 1312, + 1312, 625, 625, 625, 625, 625, 625, 625, 626, 1221, + 1221, 1221, 625, 625, 625, 625, 625, 625, 625, 229, + 229, 1525, 638, 675, 1222, 1526, 639, 639, 639, 639, + + 639, 639, 639, 229, 1527, 1528, 640, 1313, 1313, 1313, + 639, 639, 639, 639, 639, 639, 639, 645, 1529, 673, + 1530, 646, 646, 646, 646, 646, 646, 646, 647, 1531, + 674, 1532, 646, 646, 646, 646, 646, 646, 646, 645, + 648, 648, 649, 650, 650, 650, 650, 650, 650, 650, + 229, 1533, 1534, 652, 1044, 1044, 1044, 653, 653, 653, + 653, 653, 653, 653, 229, 1536, 1537, 654, 1539, 1045, + 1540, 653, 653, 653, 653, 653, 653, 653, 229, 1541, + 1542, 677, 1046, 1046, 1046, 678, 678, 678, 678, 678, + 678, 678, 229, 1543, 1550, 679, 1551, 1047, 1552, 678, + + 678, 678, 678, 678, 678, 678, 704, 704, 704, 726, + 1252, 1252, 1252, 727, 1287, 1287, 1287, 1194, 1194, 1194, + 1553, 705, 728, 729, 1554, 1253, 1555, 730, 731, 1288, + 1556, 1557, 1195, 1315, 1315, 1315, 1558, 1296, 1296, 1296, + 1215, 1215, 1215, 1217, 1217, 1217, 706, 707, 708, 732, + 732, 732, 1297, 1342, 1544, 1216, 1559, 1343, 1218, 1447, + 1447, 1447, 1560, 1545, 733, 1344, 1561, 734, 1221, 1221, + 1221, 1354, 1354, 1354, 735, 736, 1026, 1026, 1026, 1345, + 1345, 1345, 1562, 1222, 1563, 1564, 1027, 1028, 1252, 1252, + 1252, 1029, 1565, 1566, 1346, 1030, 1326, 1327, 1328, 1287, + 
+ 1287, 1287, 1567, 1253, 1568, 1329, 1330, 1569, 1331, 1570, + 1332, 1333, 1571, 1572, 1288, 1296, 1296, 1296, 1407, 1407, + 1407, 1412, 1412, 1412, 1418, 1418, 1418, 1436, 1436, 1436, + 1297, 1573, 1574, 1408, 1575, 1576, 1413, 1577, 1578, 1419, + 1579, 1580, 1437, 1440, 1440, 1440, 1345, 1345, 1345, 1387, + 1387, 1387, 1394, 1394, 1394, 1495, 1495, 1495, 1441, 1581, + 1582, 1346, 1407, 1407, 1407, 1507, 1507, 1507, 1583, 1584, + 1496, 1412, 1412, 1412, 1511, 1511, 1511, 1408, 1418, 1418, + 1418, 1516, 1516, 1516, 1585, 1586, 1413, 1517, 1517, 1517, + 1519, 1519, 1519, 1419, 1436, 1436, 1436, 1535, 1535, 1535, + + 1591, 1589, 1518, 1590, 1592, 1520, 1440, 1440, 1440, 1437, + 1538, 1538, 1538, 1447, 1447, 1447, 1587, 1546, 1594, 1595, + 1547, 1441, 1548, 1588, 1495, 1495, 1495, 1593, 1593, 1593, + 1549, 1596, 1597, 1598, 1599, 1600, 1601, 1601, 1601, 1496, + 1603, 1604, 1507, 1507, 1507, 1605, 1606, 1607, 1511, 1511, + 1511, 1602, 1608, 1609, 1610, 1611, 1516, 1516, 1516, 1517, + 1517, 1517, 1612, 1612, 1612, 1519, 1519, 1519, 1613, 1613, + 1613, 1614, 1615, 1616, 1518, 1617, 1618, 1619, 1620, 1621, + 1520, 1622, 1623, 1624, 1625, 1626, 1627, 1535, 1535, 1535, + 1628, 1629, 1538, 1538, 1538, 1630, 1631, 1632, 1633, 1634, + + 1635, 1636, 1637, 1638, 1639, 1640, 1641, 1642, 1643, 1644, + 1645, 1646, 1647, 1648, 1649, 1650, 1651, 1652, 1653, 1654, + 1655, 1656, 1657, 1658, 1659, 1660, 1661, 1662, 1663, 1664, + 1665, 1666, 1666, 1666, 1668, 1669, 1670, 1670, 1670, 1672, + 1673, 1674, 1675, 1676, 1677, 1678, 1667, 1679, 1680, 1681, + 1682, 1671, 1593, 1593, 1593, 1683, 1684, 1685, 1686, 1687, + 1688, 1601, 1601, 1601, 1689, 1689, 1689, 1690, 1691, 1693, + 1694, 1695, 1696, 1692, 1703, 1704, 1602, 1697, 1697, 1697, + 1699, 1699, 1699, 1701, 1701, 1701, 1612, 1612, 1612, 1613, + 1613, 1613, 1698, 1705, 1706, 1700, 1707, 1708, 1702, 1709, + + 1710, 1711, 1712, 1713, 1714, 1715, 1716, 1717, 1718, 1719, + 1720, 1721, 1722, 1723, 1724, 1725, 1726, 1727, 1728, 1729, + 1730, 
1732, 1733, 1731, 1734, 1735, 1735, 1735, 1737, 1738, + 1739, 1740, 1741, 1742, 1743, 1744, 1745, 1746, 1747, 1748, + 1736, 1749, 1750, 1751, 1752, 1753, 1666, 1666, 1666, 1756, + 1754, 1755, 1755, 1755, 1757, 1670, 1670, 1670, 1758, 1758, + 1758, 1667, 1759, 1760, 1761, 1761, 1761, 1763, 1764, 1765, + 1671, 1766, 1766, 1766, 1770, 1771, 1772, 1773, 1774, 1762, + 1775, 1767, 1768, 1776, 1777, 1778, 1769, 1779, 1689, 1689, + 1689, 1780, 1781, 1782, 1783, 1784, 1785, 1786, 1697, 1697, + + 1697, 1787, 1787, 1787, 1699, 1699, 1699, 1788, 1788, 1788, + 1701, 1701, 1701, 1698, 1789, 1789, 1789, 1790, 1791, 1700, + 1792, 1793, 1794, 1795, 1796, 1702, 1797, 1797, 1797, 1799, + 1800, 1801, 1802, 1803, 1804, 1805, 1806, 1807, 1808, 1809, + 1810, 1798, 1811, 1812, 1813, 1814, 1815, 1816, 1817, 1818, + 1819, 1820, 1821, 1822, 1735, 1735, 1735, 1823, 1823, 1823, + 1824, 1825, 1826, 1827, 1828, 1829, 1830, 1831, 1832, 1736, + 1833, 1834, 1835, 1836, 1837, 1838, 1839, 1840, 1841, 1755, + 1755, 1755, 1842, 1842, 1842, 1844, 1758, 1758, 1758, 1845, + 1846, 1761, 1761, 1761, 1847, 1847, 1847, 1843, 1848, 1849, + + 1849, 1849, 1851, 1766, 1766, 1766, 1762, 1852, 1852, 1852, + 1854, 1854, 1854, 1857, 1850, 1856, 1856, 1856, 1769, 1858, + 1858, 1858, 1853, 1860, 1861, 1855, 1862, 1863, 1864, 1865, + 1866, 1867, 1868, 1869, 1859, 1870, 1870, 1870, 1872, 1872, + 1872, 1874, 1875, 1876, 1787, 1787, 1787, 1788, 1788, 1788, + 1871, 1877, 1878, 1873, 1789, 1789, 1789, 1879, 1880, 1881, + 1882, 1882, 1882, 1884, 1797, 1797, 1797, 1885, 1885, 1885, + 1886, 1887, 1888, 1889, 1890, 1883, 1894, 1895, 1891, 1798, + 1892, 1892, 1892, 1896, 1897, 1898, 1899, 1900, 1901, 1901, + 1901, 1903, 1904, 1905, 1906, 1893, 1907, 1908, 1909, 1910, + + 1911, 1912, 1913, 1902, 1823, 1823, 1823, 1914, 1915, 1916, + 1916, 1916, 1918, 1919, 1920, 1921, 1922, 1923, 1924, 1925, + 1926, 1927, 1928, 1929, 1917, 1930, 1931, 1931, 1931, 1842, + 1842, 1842, 1936, 1936, 1936, 1937, 1932, 1933, 1934, 1938, + 1939, 1935, 
1940, 1942, 1843, 1847, 1847, 1847, 1849, 1849, + 1849, 1941, 1941, 1941, 1852, 1852, 1852, 1943, 1943, 1943, + 1854, 1854, 1854, 1850, 1944, 1944, 1944, 1945, 1947, 1853, + 1856, 1856, 1856, 1950, 1951, 1855, 1858, 1858, 1858, 1946, + 1946, 1946, 1948, 1955, 1952, 1958, 1959, 1949, 1956, 1956, + 1956, 1859, 1953, 1960, 1870, 1870, 1870, 1954, 1961, 1961, + + 1961, 1963, 1964, 1957, 1872, 1872, 1872, 1965, 1966, 1871, + 1962, 1962, 1962, 1967, 1968, 1969, 1970, 1972, 1973, 1873, + 1882, 1882, 1882, 1971, 1971, 1971, 1885, 1885, 1885, 1974, + 1975, 1976, 1978, 1979, 1981, 1883, 1892, 1892, 1892, 1977, + 1980, 1980, 1980, 1982, 1983, 1984, 1985, 1986, 1987, 1989, + 1990, 1893, 1901, 1901, 1901, 1988, 1988, 1988, 1991, 1992, + 1993, 1994, 1995, 1996, 1997, 1998, 1999, 1902, 2000, 2001, + 1916, 1916, 1916, 2002, 2002, 2002, 2003, 2004, 2005, 2006, + 2007, 2008, 2009, 2010, 2011, 1917, 2012, 1931, 1931, 1931, + 2013, 2013, 2013, 2015, 2015, 2015, 2017, 2017, 2017, 2019, + + 2019, 2019, 1935, 2020, 2021, 2014, 2024, 2025, 2016, 2030, + 2031, 2018, 1936, 1936, 1936, 2022, 2022, 2022, 1941, 1941, + 1941, 1943, 1943, 1943, 1944, 1944, 1944, 2026, 2026, 2026, + 2023, 1946, 1946, 1946, 2032, 2033, 2034, 2027, 2028, 2035, + 2036, 2037, 2029, 2038, 1956, 1956, 1956, 2039, 2039, 2039, + 2040, 2041, 2042, 1961, 1961, 1961, 1962, 1962, 1962, 1957, + 2043, 2044, 2044, 2044, 2046, 2047, 2048, 2048, 2048, 2050, + 2051, 2052, 1971, 1971, 1971, 2053, 2045, 2054, 2055, 2056, + 2057, 2049, 2058, 2059, 2060, 1980, 1980, 1980, 2061, 2062, + 2063, 2064, 2065, 2066, 2067, 1988, 1988, 1988, 2068, 2069, + + 2069, 2069, 2071, 2072, 2073, 2074, 2075, 2076, 2077, 2078, + 2079, 2080, 2081, 2082, 2070, 2002, 2002, 2002, 2083, 2084, + 2085, 2086, 2087, 2088, 2089, 2090, 2013, 2013, 2013, 2091, + 2091, 2091, 2015, 2015, 2015, 2092, 2092, 2092, 2017, 2017, + 2017, 2014, 2093, 2093, 2093, 2094, 2095, 2016, 2019, 2019, + 2019, 2097, 2098, 2018, 2022, 2022, 2022, 2096, 2096, 2096, + 2026, 2026, 2026, 
2099, 2099, 2099, 2101, 2101, 2101, 2023, + 2103, 2103, 2103, 2104, 2105, 2029, 2110, 2111, 2100, 2112, + 2113, 2102, 2106, 2107, 2108, 2109, 2114, 2115, 2039, 2039, + 2039, 2116, 2117, 2118, 2119, 2044, 2044, 2044, 2120, 2120, + + 2120, 2121, 2121, 2121, 2123, 2048, 2048, 2048, 2125, 2126, + 2045, 2124, 2124, 2124, 2129, 2130, 2122, 2127, 2127, 2127, + 2049, 2131, 2133, 2134, 2134, 2134, 2138, 2139, 2132, 2136, + 2136, 2136, 2128, 2140, 2141, 2142, 2142, 2142, 2135, 2144, + 2145, 2146, 2147, 2148, 2137, 2069, 2069, 2069, 2150, 2151, + 2143, 2149, 2149, 2149, 2152, 2153, 2154, 2155, 2156, 2157, + 2070, 2158, 2159, 2160, 2161, 2162, 2163, 2164, 2165, 2166, + 2167, 2168, 2169, 2169, 2169, 2091, 2091, 2091, 2092, 2092, + 2092, 2093, 2093, 2093, 2171, 2174, 2175, 2170, 2172, 2172, + 2172, 2096, 2096, 2096, 2099, 2099, 2099, 2176, 2176, 2176, + + 2101, 2101, 2101, 2173, 2177, 2177, 2177, 2178, 2179, 2100, + 2103, 2103, 2103, 2188, 2192, 2102, 2180, 2180, 2180, 2182, + 2182, 2182, 2184, 2184, 2184, 2186, 2186, 2186, 2189, 2189, + 2189, 2181, 2193, 2194, 2183, 2195, 2196, 2185, 2197, 2198, + 2187, 2199, 2201, 2190, 2120, 2120, 2120, 2121, 2121, 2121, + 2200, 2200, 2200, 2124, 2124, 2124, 2202, 2191, 2203, 2127, + 2127, 2127, 2122, 2204, 2204, 2204, 2205, 2206, 2206, 2206, + 2208, 2209, 2210, 2213, 2128, 2134, 2134, 2134, 2211, 2211, + 2211, 2216, 2207, 2136, 2136, 2136, 2212, 2212, 2212, 2217, + 2135, 2214, 2214, 2214, 2142, 2142, 2142, 2219, 2137, 2218, + + 2218, 2218, 2220, 2221, 2222, 2223, 2215, 2224, 2227, 2143, + 2149, 2149, 2149, 2225, 2225, 2225, 2228, 2229, 2229, 2229, + 2231, 2231, 2231, 2233, 2233, 2233, 2235, 2236, 2226, 2237, + 2238, 2239, 2230, 2240, 2241, 2232, 2242, 2243, 2234, 2244, + 2245, 2246, 2169, 2169, 2169, 2250, 2252, 2247, 2249, 2249, + 2249, 2253, 2248, 2172, 2172, 2172, 2254, 2170, 2251, 2251, + 2251, 2176, 2176, 2176, 2177, 2177, 2177, 2255, 2173, 2180, + 2180, 2180, 2256, 2256, 2256, 2182, 2182, 2182, 2257, 2257, + 2257, 2184, 2184, 2184, 
2181, 2258, 2258, 2258, 2263, 2264, + 2183, 2186, 2186, 2186, 2265, 2266, 2185, 2259, 2259, 2259, + + 2260, 2260, 2260, 2189, 2189, 2189, 2187, 2262, 2262, 2262, + 2267, 2268, 2268, 2268, 2270, 2261, 2271, 2272, 2190, 2200, + 2200, 2200, 2273, 2274, 2275, 2276, 2269, 2204, 2204, 2204, + 2206, 2206, 2206, 2277, 2277, 2277, 2278, 2279, 2280, 2211, + 2211, 2211, 2212, 2212, 2212, 2207, 2281, 2214, 2214, 2214, + 2282, 2282, 2282, 2283, 2284, 2218, 2218, 2218, 2285, 2285, + 2285, 2287, 2215, 2288, 2289, 2289, 2289, 2291, 2291, 2291, + 2293, 2297, 2301, 2286, 2225, 2225, 2225, 2304, 2305, 2290, + 2306, 2307, 2292, 2294, 2294, 2294, 2295, 2295, 2295, 2226, + 2229, 2229, 2229, 2298, 2298, 2298, 2231, 2231, 2231, 2308, + + 2309, 2296, 2299, 2299, 2299, 2230, 2233, 2233, 2233, 2310, + 2311, 2232, 2300, 2300, 2300, 2302, 2302, 2302, 2312, 2313, + 2314, 2234, 2315, 2249, 2249, 2249, 2316, 2251, 2251, 2251, + 2303, 2317, 2318, 2319, 2320, 2320, 2320, 2256, 2256, 2256, + 2257, 2257, 2257, 2258, 2258, 2258, 2259, 2259, 2259, 2321, + 2260, 2260, 2260, 2322, 2322, 2322, 2262, 2262, 2262, 2323, + 2323, 2323, 2327, 2328, 2329, 2261, 2330, 2330, 2330, 2324, + 2325, 2268, 2268, 2268, 2326, 2332, 2332, 2332, 2333, 2334, + 2336, 2331, 2337, 2338, 2335, 2339, 2269, 2340, 2277, 2277, + 2277, 2341, 2342, 2343, 2344, 2282, 2282, 2282, 2345, 2346, + + 2285, 2285, 2285, 2347, 2347, 2347, 2348, 2349, 2289, 2289, + 2289, 2350, 2350, 2350, 2352, 2286, 2291, 2291, 2291, 2351, + 2351, 2351, 2354, 2290, 2294, 2294, 2294, 2295, 2295, 2295, + 2355, 2292, 2353, 2353, 2353, 2298, 2298, 2298, 2299, 2299, + 2299, 2357, 2296, 2300, 2300, 2300, 2302, 2302, 2302, 2356, + 2356, 2356, 2358, 2359, 2360, 2361, 2361, 2361, 2363, 2363, + 2363, 2303, 2365, 2366, 2367, 2368, 2369, 2370, 2371, 2372, + 2362, 2373, 2380, 2364, 2320, 2320, 2320, 2374, 2374, 2374, + 2322, 2322, 2322, 2323, 2323, 2323, 2375, 2375, 2375, 2321, + 2377, 2377, 2377, 2379, 2379, 2379, 2381, 2382, 2326, 2384, + + 2385, 2376, 2330, 2330, 2330, 
2378, 2383, 2383, 2383, 2332, + 2332, 2332, 2386, 2387, 2388, 2389, 2390, 2331, 2391, 2392, + 2392, 2392, 2394, 2395, 2395, 2395, 2397, 2398, 2399, 2347, + 2347, 2347, 2400, 2401, 2393, 2350, 2350, 2350, 2396, 2351, + 2351, 2351, 2402, 2353, 2353, 2353, 2403, 2404, 2404, 2404, + 2356, 2356, 2356, 2406, 2407, 2408, 2409, 2361, 2361, 2361, + 2412, 2413, 2405, 2410, 2410, 2410, 2363, 2363, 2363, 2411, + 2411, 2411, 2362, 2414, 2415, 2416, 2417, 2417, 2417, 2419, + 2420, 2364, 2374, 2374, 2374, 2375, 2375, 2375, 2421, 2421, + 2421, 2418, 2377, 2377, 2377, 2422, 2422, 2422, 2423, 2424, + + 2376, 2379, 2379, 2379, 2425, 2426, 2427, 2378, 2383, 2383, + 2383, 2428, 2428, 2428, 2433, 2434, 2435, 2436, 2436, 2436, + 2438, 2429, 2430, 2431, 2440, 2442, 2432, 2392, 2392, 2392, + 2443, 2449, 2437, 2439, 2439, 2439, 2395, 2395, 2395, 2441, + 2441, 2441, 2393, 2444, 2444, 2444, 2447, 2447, 2447, 2450, + 2454, 2396, 2451, 2451, 2451, 2404, 2404, 2404, 2445, 2455, + 2456, 2448, 2453, 2453, 2453, 2459, 2460, 2452, 2461, 2446, + 2405, 2457, 2457, 2457, 2410, 2410, 2410, 2411, 2411, 2411, + 2462, 2463, 2463, 2463, 2467, 2470, 2458, 2417, 2417, 2417, + 2466, 2466, 2466, 2468, 2468, 2468, 2464, 2421, 2421, 2421, + + 2471, 2472, 2418, 2422, 2422, 2422, 2475, 2485, 2469, 2486, + 2465, 2473, 2473, 2473, 2428, 2428, 2428, 2476, 2476, 2476, + 2478, 2478, 2478, 2480, 2480, 2480, 2474, 2494, 2496, 2432, + 2498, 2502, 2477, 2503, 2504, 2479, 2506, 2507, 2481, 2482, + 2482, 2482, 2483, 2483, 2483, 2436, 2436, 2436, 2487, 2487, + 2487, 2488, 2488, 2488, 2439, 2439, 2439, 2484, 2508, 2510, + 2437, 2490, 2490, 2490, 2511, 2515, 2489, 2441, 2441, 2441, + 2492, 2492, 2492, 2444, 2444, 2444, 2491, 2495, 2495, 2495, + 2447, 2447, 2447, 2516, 2518, 2493, 2523, 2524, 2445, 2497, + 2497, 2497, 2499, 2499, 2499, 2448, 2451, 2451, 2451, 2501, + + 2501, 2501, 2453, 2453, 2453, 2528, 2529, 2500, 2457, 2457, + 2457, 2452, 2505, 2505, 2505, 2463, 2463, 2463, 2509, 2509, + 2509, 2530, 2534, 2458, 2466, 2466, 
2466, 2468, 2468, 2468, + 2464, 2512, 2512, 2512, 2513, 2513, 2513, 2473, 2473, 2473, + 2537, 2538, 2469, 2517, 2517, 2517, 2476, 2476, 2476, 2514, + 2539, 2547, 2474, 2519, 2519, 2519, 2478, 2478, 2478, 2548, + 2551, 2477, 2520, 2520, 2520, 2480, 2480, 2480, 2521, 2521, + 2521, 2479, 2482, 2482, 2482, 2483, 2483, 2483, 2552, 2553, + 2481, 2522, 2522, 2522, 2487, 2487, 2487, 2488, 2488, 2488, + 2484, 2525, 2525, 2525, 2490, 2490, 2490, 2526, 2526, 2526, + + 2554, 2555, 2489, 2492, 2492, 2492, 2527, 2527, 2527, 2491, + 2495, 2495, 2495, 2497, 2497, 2497, 2562, 2563, 2493, 2499, + 2499, 2499, 2531, 2531, 2531, 2501, 2501, 2501, 2532, 2532, + 2532, 2535, 2535, 2535, 2500, 2505, 2505, 2505, 2509, 2509, + 2509, 2570, 2571, 2533, 2575, 2578, 2536, 2540, 2540, 2540, + 2544, 2544, 2544, 2512, 2512, 2512, 2579, 2541, 2542, 2513, + 2513, 2513, 2543, 2582, 2583, 2545, 2546, 2546, 2546, 2517, + 2517, 2517, 2586, 2587, 2514, 2549, 2549, 2549, 2519, 2519, + 2519, 2520, 2520, 2520, 2521, 2521, 2521, 2522, 2522, 2522, + 2550, 2525, 2525, 2525, 2526, 2526, 2526, 2527, 2527, 2527, + + 2531, 2531, 2531, 2532, 2532, 2532, 2556, 2556, 2556, 2557, + 2557, 2557, 2535, 2535, 2535, 2559, 2559, 2559, 2533, 2560, + 2560, 2560, 2589, 2591, 2558, 2592, 2593, 2536, 2540, 2540, + 2540, 2564, 2564, 2564, 2561, 2566, 2566, 2566, 2568, 2568, + 2568, 2594, 2597, 2543, 2598, 2599, 2565, 2544, 2544, 2544, + 2567, 2569, 2569, 2569, 2546, 2546, 2546, 2549, 2549, 2549, + 2600, 2601, 2545, 2572, 2572, 2572, 2573, 2573, 2573, 2576, + 2576, 2576, 2550, 2556, 2556, 2556, 2557, 2557, 2557, 2602, + 2608, 2574, 2609, 2610, 2577, 2580, 2580, 2580, 2559, 2559, + 2559, 2558, 2560, 2560, 2560, 2581, 2581, 2581, 2564, 2564, + + 2564, 2584, 2584, 2584, 2566, 2566, 2566, 2561, 2585, 2585, + 2585, 2611, 2614, 2565, 2568, 2568, 2568, 2619, 2623, 2567, + 2569, 2569, 2569, 2572, 2572, 2572, 2573, 2573, 2573, 2588, + 2588, 2588, 2576, 2576, 2576, 2590, 2590, 2590, 2580, 2580, + 2580, 2574, 2581, 2581, 2581, 2624, 2625, 
2577, 2584, 2584, + 2584, 2585, 2585, 2585, 2595, 2595, 2595, 2588, 2588, 2588, + 2590, 2590, 2590, 2595, 2595, 2595, 2603, 2603, 2603, 2596, + 2604, 2604, 2604, 2606, 2606, 2606, 2627, 2628, 2596, 2603, + 2603, 2603, 2604, 2604, 2604, 2605, 2629, 2630, 2607, 2612, + 2612, 2612, 2606, 2606, 2606, 2617, 2631, 2605, 2613, 2613, + + 2613, 2615, 2615, 2615, 2612, 2612, 2612, 2607, 2618, 2613, + 2613, 2613, 2620, 2620, 2620, 2632, 2616, 2615, 2615, 2615, + 2622, 2622, 2622, 2620, 2620, 2620, 2633, 2621, 2626, 2626, + 2626, 2634, 2616, 2622, 2622, 2622, 2635, 2636, 2621, 2626, + 2626, 2626, 2637, 2638, 2639, 2640, 2641, 2642, 228, 228, + 228, 228, 228, 228, 228, 228, 252, 252, 252, 252, + 252, 252, 252, 252, 258, 258, 258, 258, 258, 258, + 258, 258, 264, 264, 264, 264, 264, 264, 264, 264, + 268, 268, 268, 268, 268, 268, 268, 268, 272, 272, + 272, 272, 272, 272, 272, 272, 278, 278, 278, 278, + + 278, 278, 278, 278, 284, 284, 284, 284, 284, 284, + 284, 284, 290, 290, 290, 290, 290, 290, 290, 290, + 294, 294, 294, 294, 294, 294, 294, 294, 300, 300, + 300, 300, 300, 300, 300, 300, 306, 306, 306, 306, + 306, 306, 306, 306, 312, 312, 312, 312, 312, 312, + 312, 312, 318, 318, 318, 318, 318, 318, 318, 318, + 324, 324, 324, 324, 324, 324, 324, 324, 328, 328, + 328, 328, 328, 328, 328, 328, 349, 349, 349, 349, + 349, 349, 349, 349, 354, 354, 354, 354, 354, 354, + 354, 354, 357, 357, 357, 357, 357, 357, 357, 357, + + 358, 358, 358, 358, 358, 358, 358, 358, 363, 363, + 363, 363, 363, 363, 363, 363, 368, 368, 368, 368, + 368, 368, 368, 368, 377, 377, 377, 377, 377, 377, + 377, 377, 381, 381, 381, 381, 381, 381, 381, 381, + 386, 386, 386, 386, 386, 386, 386, 386, 391, 391, + 391, 391, 391, 391, 391, 391, 396, 396, 396, 396, + 396, 396, 396, 396, 401, 401, 401, 401, 401, 401, + 401, 401, 406, 406, 406, 406, 406, 406, 406, 406, + 410, 410, 410, 410, 410, 410, 410, 410, 415, 415, + 415, 415, 415, 415, 415, 415, 419, 419, 419, 419, + + 419, 419, 419, 419, 423, 423, 423, 423, 423, 423, + 
423, 423, 427, 427, 427, 427, 427, 427, 427, 427, + 431, 431, 431, 431, 431, 431, 431, 431, 435, 435, + 435, 435, 435, 435, 435, 435, 439, 439, 439, 439, + 439, 439, 439, 439, 443, 443, 443, 443, 443, 443, + 443, 443, 447, 447, 447, 447, 447, 447, 447, 447, + 453, 453, 453, 453, 453, 453, 453, 453, 458, 458, + 458, 458, 458, 458, 458, 458, 469, 469, 469, 469, + 469, 469, 469, 469, 473, 473, 473, 473, 473, 473, + 473, 473, 479, 479, 479, 479, 479, 479, 479, 479, + + 483, 483, 483, 483, 483, 483, 483, 483, 496, 496, + 496, 496, 496, 496, 496, 496, 353, 353, 353, 353, + 353, 353, 353, 353, 523, 523, 523, 523, 523, 523, + 523, 523, 528, 528, 528, 528, 528, 528, 528, 528, + 543, 543, 543, 543, 543, 543, 543, 543, 564, 564, + 564, 564, 564, 564, 564, 564, 570, 570, 570, 570, + 570, 570, 570, 570, 581, 581, 581, 581, 581, 581, + 581, 581, 586, 586, 586, 586, 586, 586, 586, 586, + 592, 592, 592, 592, 592, 592, 592, 592, 596, 596, + 596, 596, 596, 596, 596, 596, 606, 606, 606, 606, + + 606, 606, 606, 606, 611, 611, 611, 611, 611, 611, + 611, 611, 615, 615, 615, 615, 615, 615, 615, 615, + 627, 627, 627, 627, 627, 627, 627, 627, 632, 632, + 632, 632, 632, 632, 632, 632, 637, 637, 637, 637, + 637, 637, 637, 637, 651, 651, 651, 651, 651, 651, + 651, 651, 655, 655, 655, 655, 655, 655, 655, 655, + 660, 660, 660, 660, 660, 660, 660, 660, 666, 666, + 666, 666, 666, 666, 666, 666, 671, 671, 671, 671, + 671, 671, 671, 671, 676, 676, 676, 676, 676, 676, + 676, 676, 680, 680, 680, 680, 680, 680, 680, 680, + + 682, 682, 682, 682, 682, 682, 682, 682, 740, 740, + 741, 741, 746, 747, 749, 749, 750, 750, 753, 753, + 754, 754, 757, 757, 758, 758, 760, 762, 762, 763, + 763, 766, 766, 767, 767, 770, 770, 771, 771, 774, + 774, 775, 775, 778, 778, 779, 779, 781, 783, 783, + 784, 784, 787, 787, 788, 788, 790, 792, 792, 793, + 793, 800, 801, 802, 828, 802, 802, 802, 803, 829, + 803, 803, 803, 831, 832, 833, 834, 835, 836, 837, + 838, 839, 842, 843, 847, 848, 850, 850, 851, 851, + 853, 855, 855, 
856, 856, 858, 860, 860, 861, 861, + + 868, 869, 868, 868, 868, 870, 871, 872, 873, 874, + 876, 876, 877, 877, 880, 880, 881, 881, 884, 884, + 885, 885, 888, 888, 889, 889, 892, 892, 893, 893, + 898, 898, 899, 899, 902, 902, 903, 903, 906, 906, + 907, 907, 910, 910, 911, 911, 914, 914, 915, 915, + 918, 918, 919, 919, 922, 922, 923, 923, 926, 926, + 927, 927, 931, 932, 933, 934, 935, 936, 937, 945, + 945, 946, 946, 948, 959, 959, 960, 960, 963, 963, + 964, 964, 966, 971, 974, 976, 976, 977, 977, 979, + 989, 990, 990, 746, 990, 990, 990, 990, 990, 991, + + 747, 760, 991, 991, 991, 991, 991, 741, 741, 740, + 740, 1070, 781, 1070, 750, 750, 749, 749, 1076, 790, + 1076, 754, 754, 753, 753, 1078, 800, 1078, 758, 758, + 757, 757, 1080, 801, 1080, 763, 763, 762, 762, 1082, + 828, 1082, 767, 767, 766, 766, 1084, 829, 1084, 771, + 771, 770, 770, 1086, 831, 1086, 775, 775, 774, 774, + 1088, 832, 1088, 779, 779, 778, 778, 1090, 833, 1090, + 784, 784, 783, 783, 1092, 834, 1092, 788, 788, 787, + 787, 1094, 835, 1094, 793, 793, 792, 792, 1096, 836, + 1096, 802, 837, 802, 802, 802, 803, 838, 803, 803, + + 803, 839, 842, 843, 847, 848, 851, 851, 850, 850, + 1121, 853, 1121, 856, 856, 855, 855, 1123, 858, 1123, + 861, 861, 860, 860, 1125, 869, 1125, 868, 870, 868, + 868, 868, 871, 872, 873, 874, 877, 877, 876, 876, + 1131, 931, 1131, 881, 881, 880, 880, 1133, 932, 1133, + 885, 885, 884, 884, 1135, 933, 1135, 889, 889, 888, + 888, 1137, 934, 1137, 893, 893, 892, 892, 1139, 935, + 1139, 899, 899, 898, 898, 1142, 936, 1142, 903, 903, + 902, 902, 1144, 937, 1144, 907, 907, 906, 906, 1146, + 948, 1146, 911, 911, 910, 910, 1148, 966, 1148, 915, + + 915, 914, 914, 1150, 971, 1150, 919, 919, 918, 918, + 1152, 974, 1152, 923, 923, 922, 922, 1154, 979, 1154, + 927, 927, 926, 926, 1156, 989, 1156, 946, 946, 945, + 945, 1162, 1070, 1162, 960, 960, 959, 959, 1170, 1076, + 1170, 964, 964, 963, 963, 1172, 1078, 1172, 977, 977, + 976, 976, 1177, 1080, 1177, 990, 990, 1082, 990, 990, + 990, 990, 
990, 991, 1084, 1086, 991, 991, 991, 991, + 991, 1088, 1090, 1092, 1094, 1096, 1121, 1123, 1125, 1131, + 1133, 1135, 1137, 1139, 1142, 1144, 1146, 1148, 1150, 1152, + 1154, 1156, 1162, 1353, 1352, 1351, 1350, 1349, 1348, 1347, + + 1339, 1338, 1337, 1336, 1335, 1334, 1325, 1324, 1323, 1322, + 1321, 1320, 1319, 1318, 1317, 1316, 1314, 1311, 1310, 1309, + 1308, 1307, 1306, 1305, 1304, 1303, 1302, 1301, 1300, 1299, + 1298, 1295, 1293, 1292, 1291, 1290, 1289, 1286, 1285, 1284, + 1281, 1280, 1279, 1278, 1277, 1276, 1275, 1274, 1273, 1272, + 1271, 1270, 1269, 1268, 1267, 1266, 1263, 1262, 1261, 1260, + 1259, 1255, 1254, 1251, 1250, 1249, 1248, 1247, 1246, 1245, + 1244, 1243, 1242, 1241, 1240, 1239, 1238, 1237, 1234, 1233, + 1232, 1231, 1230, 1229, 1228, 1227, 1226, 1225, 1224, 1223, + 1220, 1214, 1213, 1212, 1208, 1207, 1206, 1205, 1204, 1203, + + 1202, 1201, 1200, 1199, 1198, 1197, 1196, 1193, 1192, 1191, + 1190, 1189, 1188, 1187, 1186, 1183, 1182, 1181, 1180, 1179, + 1178, 978, 975, 978, 1175, 1174, 1173, 965, 962, 965, + 961, 958, 961, 1168, 1167, 1166, 1165, 1164, 1163, 947, + 944, 947, 1160, 1159, 1158, 1157, 928, 925, 928, 924, + 921, 924, 920, 917, 920, 916, 913, 916, 912, 909, + 912, 908, 905, 908, 904, 901, 904, 900, 897, 900, + 1140, 894, 891, 894, 890, 887, 890, 886, 883, 886, + 882, 879, 882, 878, 875, 878, 1129, 1128, 1127, 1126, + 862, 859, 862, 857, 854, 857, 852, 849, 852, 1119, + + 1118, 1117, 1116, 1115, 1114, 1113, 1112, 1111, 1110, 1109, + 1108, 1107, 1106, 1105, 1104, 1103, 1102, 1101, 1100, 1099, + 1098, 1097, 794, 791, 794, 789, 786, 789, 785, 782, + 785, 780, 777, 780, 776, 773, 776, 772, 769, 772, + 768, 765, 768, 764, 761, 764, 759, 756, 759, 755, + 752, 755, 751, 748, 751, 1074, 1073, 742, 739, 742, + 1068, 1067, 1066, 1065, 1064, 1062, 1058, 1055, 1054, 1053, + 1052, 1043, 1042, 1041, 1040, 1039, 1032, 1031, 1025, 1015, + 1011, 1010, 1009, 1008, 1007, 1006, 1005, 1004, 1003, 1000, + 999, 998, 997, 996, 993, 992, 988, 987, 986, 985, + + 984, 
983, 982, 981, 980, 975, 978, 975, 973, 972, + 970, 969, 968, 967, 962, 965, 962, 958, 961, 958, + 957, 956, 955, 954, 953, 952, 951, 950, 949, 944, + 947, 944, 943, 942, 941, 940, 939, 938, 930, 929, + 925, 928, 925, 921, 924, 921, 917, 920, 917, 913, + 916, 913, 909, 912, 909, 905, 908, 905, 901, 904, + 901, 897, 900, 897, 896, 895, 891, 894, 891, 887, + 890, 887, 883, 886, 883, 879, 882, 879, 875, 878, + 875, 867, 866, 865, 864, 863, 859, 862, 859, 854, + 857, 854, 849, 852, 849, 846, 845, 844, 841, 840, + + 830, 827, 826, 825, 824, 823, 822, 821, 820, 819, + 818, 814, 813, 810, 809, 808, 807, 806, 805, 804, + 799, 798, 797, 796, 795, 791, 794, 791, 786, 789, + 786, 782, 785, 782, 777, 780, 777, 773, 776, 773, + 769, 772, 769, 765, 768, 765, 761, 764, 761, 756, + 759, 756, 752, 755, 752, 748, 751, 748, 745, 744, + 743, 739, 742, 739, 738, 737, 703, 702, 685, 684, + 2643, 229, 229, 647, 620, 553, 553, 553, 533, 533, + 533, 510, 510, 510, 510, 342, 342, 342, 330, 330, + 330, 229, 227, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643 + } ; + +static yyconst flex_int16_t yy_chk[6138] = + { 0, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 3, 225, 225, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 246, + 246, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 5, 6, 31, 5, 6, 31, 32, 33, + 34, 32, 33, 34, 59, 53, 218, 59, 53, 218, + 54, 5, 6, 54, 1098, 53, 5, 6, 53, 55, + + 54, 1098, 55, 54, 59, 5, 6, 7, 95, 55, + 7, 95, 55, 157, 7, 7, 7, 7, 7, 7, + 7, 8, 235, 224, 8, 189, 224, 95, 8, 8, + 8, 8, 8, 8, 8, 9, 235, 95, 9, 157, + 238, 95, 9, 9, 9, 9, 9, 9, 9, 10, + 157, 238, 10, 233, 189, 233, 10, 10, 10, 10, + 10, 10, 10, 11, 233, 189, 11, 11, 11, 11, + 11, 11, 11, 11, 11, 11, 11, 
12, 692, 1254, + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, + 12, 13, 1255, 692, 13, 13, 13, 13, 13, 13, + + 13, 13, 13, 13, 13, 14, 742, 742, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, 14, 15, + 751, 751, 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 16, 755, 755, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 17, 226, 226, + 17, 244, 226, 244, 17, 17, 17, 17, 17, 17, + 17, 18, 237, 730, 18, 1256, 730, 237, 18, 18, + 18, 18, 18, 18, 18, 19, 730, 1257, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, 19, 20, + 759, 759, 20, 20, 20, 20, 20, 20, 20, 20, + + 20, 20, 20, 21, 764, 764, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 22, 768, 768, + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, + 22, 23, 772, 772, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 24, 776, 776, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 25, + 780, 780, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 26, 785, 785, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 27, 789, 789, + 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, + + 27, 28, 794, 794, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 29, 242, 743, 29, 743, + 242, 373, 29, 29, 29, 29, 29, 29, 29, 30, + 373, 1259, 30, 852, 852, 242, 30, 30, 30, 30, + 30, 30, 30, 35, 857, 857, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 37, 37, 37, + 37, 37, 37, 37, 37, 37, 37, 39, 709, 61, + 39, 1261, 61, 709, 39, 39, 39, 39, 39, 39, + 39, 41, 41, 41, 41, 41, 41, 41, 41, 41, + 41, 41, 43, 56, 60, 43, 56, 60, 61, 62, + + 1262, 43, 62, 56, 63, 64, 56, 63, 64, 61, + 862, 862, 245, 43, 60, 245, 158, 43, 65, 43, + 66, 65, 96, 66, 245, 96, 43, 44, 62, 1263, + 44, 878, 878, 63, 64, 67, 44, 173, 67, 62, + 686, 96, 158, 1264, 63, 64, 686, 65, 44, 66, + 728, 96, 44, 158, 44, 96, 882, 882, 65, 728, + 66, 44, 45, 173, 67, 1265, 45, 45, 45, 45, + 45, 45, 45, 46, 173, 67, 1266, 46, 46, 46, + 46, 46, 46, 46, 47, 243, 1267, 47, 886, 886, + 174, 47, 47, 47, 47, 47, 47, 47, 48, 243, + + 1268, 48, 243, 243, 1269, 48, 48, 48, 48, 48, + 48, 48, 49, 49, 49, 49, 174, 49, 49, 49, + 705, 705, 705, 68, 890, 
890, 68, 174, 49, 49, + 50, 50, 50, 50, 379, 50, 50, 50, 993, 379, + 1270, 69, 379, 993, 69, 1271, 50, 50, 51, 51, + 51, 51, 68, 51, 51, 51, 70, 894, 894, 70, + 113, 900, 900, 68, 51, 51, 52, 52, 52, 52, + 69, 52, 52, 52, 114, 1272, 113, 1273, 113, 1274, + 113, 69, 52, 52, 57, 70, 113, 57, 139, 1242, + 114, 139, 114, 247, 114, 247, 70, 113, 1242, 247, + + 114, 140, 57, 57, 140, 183, 57, 57, 183, 904, + 904, 114, 57, 183, 183, 57, 58, 139, 1184, 58, + 714, 714, 714, 175, 176, 1184, 175, 176, 139, 1275, + 140, 908, 908, 215, 58, 58, 215, 177, 58, 58, + 177, 140, 1276, 215, 58, 1278, 215, 58, 71, 912, + 912, 71, 175, 176, 1279, 71, 71, 71, 71, 71, + 71, 71, 72, 175, 176, 72, 177, 916, 916, 72, + 72, 72, 72, 72, 72, 72, 73, 177, 1280, 73, + 733, 733, 733, 73, 73, 73, 73, 73, 73, 73, + 1281, 184, 702, 216, 184, 178, 216, 702, 178, 184, + + 184, 185, 187, 216, 185, 187, 216, 702, 188, 73, + 74, 188, 1282, 74, 920, 920, 1283, 74, 74, 74, + 74, 74, 74, 74, 178, 1284, 213, 924, 924, 213, + 185, 187, 1258, 190, 185, 178, 190, 188, 928, 928, + 1258, 185, 187, 74, 75, 213, 213, 75, 188, 213, + 1285, 75, 75, 75, 75, 75, 75, 75, 76, 947, + 947, 76, 190, 961, 961, 76, 76, 76, 76, 76, + 76, 76, 77, 190, 1286, 77, 704, 704, 704, 77, + 77, 77, 77, 77, 77, 77, 78, 965, 965, 78, + 1289, 704, 1290, 78, 78, 78, 78, 78, 78, 78, + + 79, 978, 978, 79, 706, 706, 706, 79, 79, 79, + 79, 79, 79, 79, 80, 1169, 1169, 80, 1291, 706, + 1292, 80, 80, 80, 80, 80, 80, 80, 81, 1171, + 1171, 81, 707, 707, 707, 81, 81, 81, 81, 81, + 81, 81, 82, 1176, 1176, 82, 1293, 707, 1295, 82, + 82, 82, 82, 82, 82, 82, 83, 1298, 1299, 83, + 708, 708, 708, 83, 83, 83, 83, 83, 83, 83, + 84, 1300, 1301, 84, 1302, 708, 1305, 84, 84, 84, + 84, 84, 84, 84, 85, 1306, 1307, 85, 721, 721, + 721, 85, 85, 85, 85, 85, 85, 85, 86, 1303, + + 1309, 86, 1303, 721, 1310, 86, 86, 86, 86, 86, + 86, 86, 87, 1311, 1316, 87, 722, 722, 722, 87, + 87, 87, 87, 87, 87, 87, 88, 1317, 1318, 88, + 1319, 722, 1321, 88, 88, 88, 88, 88, 88, 88, + 
89, 1322, 1323, 89, 732, 732, 732, 89, 89, 89, + 89, 89, 89, 89, 90, 186, 1324, 90, 186, 732, + 1325, 90, 90, 90, 90, 90, 90, 90, 91, 1326, + 1327, 91, 1002, 1002, 1002, 91, 91, 91, 91, 91, + 91, 91, 1329, 197, 186, 91, 197, 1002, 186, 198, + 199, 1304, 198, 199, 715, 186, 715, 91, 92, 1304, + + 1330, 92, 723, 715, 723, 92, 92, 92, 92, 92, + 92, 92, 197, 1328, 1331, 92, 723, 200, 198, 199, + 200, 1328, 723, 197, 1016, 1016, 1016, 92, 93, 198, + 199, 93, 1332, 93, 1333, 93, 93, 93, 93, 93, + 93, 93, 94, 1334, 1335, 94, 200, 94, 1337, 94, + 94, 94, 94, 94, 94, 94, 97, 200, 1338, 1340, + 97, 97, 97, 97, 97, 97, 97, 98, 1018, 1018, + 1018, 98, 98, 98, 98, 98, 98, 98, 99, 1020, + 1020, 1020, 99, 99, 99, 99, 99, 99, 99, 100, + 1022, 1022, 1022, 100, 100, 100, 100, 100, 100, 100, + + 101, 101, 101, 101, 101, 101, 101, 101, 101, 101, + 103, 1341, 1342, 103, 1017, 1017, 1017, 103, 103, 103, + 103, 103, 103, 103, 104, 1343, 1344, 104, 1347, 1017, + 1348, 104, 104, 104, 104, 104, 104, 104, 105, 1349, + 1350, 105, 105, 105, 105, 105, 105, 105, 105, 105, + 105, 105, 106, 1351, 1353, 106, 106, 106, 106, 106, + 106, 106, 106, 106, 106, 106, 107, 1355, 1356, 107, + 1019, 1019, 1019, 107, 107, 107, 107, 107, 107, 107, + 108, 1357, 1352, 108, 1358, 1019, 1352, 108, 108, 108, + 108, 108, 108, 108, 109, 1359, 1360, 109, 109, 109, + + 109, 109, 109, 109, 109, 109, 109, 109, 110, 1362, + 1363, 110, 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 111, 1029, 1029, 1029, 111, 111, 111, 111, + 111, 111, 111, 214, 1364, 203, 214, 236, 203, 234, + 234, 1365, 204, 236, 1366, 204, 234, 1367, 111, 1368, + 234, 236, 214, 214, 1369, 1370, 214, 1371, 1372, 111, + 115, 115, 115, 115, 203, 115, 115, 115, 1373, 211, + 212, 204, 211, 212, 1374, 203, 115, 115, 116, 116, + 116, 116, 204, 116, 116, 116, 1045, 1045, 1045, 1375, + 219, 1376, 1377, 219, 116, 116, 117, 1379, 211, 212, + + 117, 117, 117, 117, 117, 117, 117, 118, 1067, 211, + 212, 118, 118, 118, 118, 118, 118, 118, 119, 219, + 1067, 1067, 
119, 119, 119, 119, 119, 119, 119, 120, + 219, 1380, 1381, 120, 120, 120, 120, 120, 120, 120, + 121, 1047, 1047, 1047, 121, 121, 121, 121, 121, 121, + 121, 122, 1063, 1063, 1063, 122, 122, 122, 122, 122, + 122, 122, 123, 1195, 1195, 1195, 123, 123, 123, 123, + 123, 123, 123, 124, 1209, 1209, 1209, 124, 124, 124, + 124, 124, 124, 124, 125, 1210, 1210, 1210, 125, 125, + 125, 125, 125, 125, 125, 126, 1211, 1211, 1211, 126, + + 126, 126, 126, 126, 126, 126, 127, 1216, 1216, 1216, + 127, 127, 127, 127, 127, 127, 127, 128, 1218, 1218, + 1218, 128, 128, 128, 128, 128, 128, 128, 129, 1021, + 1021, 1021, 129, 129, 129, 129, 129, 129, 129, 129, + 129, 129, 130, 1382, 1021, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 131, 1219, 1219, 1219, + 131, 131, 131, 131, 131, 131, 131, 131, 131, 131, + 133, 133, 133, 133, 133, 133, 133, 133, 133, 133, + 135, 135, 135, 135, 135, 135, 135, 135, 135, 135, + 137, 137, 137, 137, 137, 137, 137, 137, 137, 137, + + 141, 1383, 1384, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 142, 1385, 1386, 142, 142, 142, + 142, 142, 142, 142, 142, 142, 142, 142, 143, 143, + 143, 143, 143, 143, 143, 143, 143, 143, 143, 145, + 145, 145, 145, 145, 145, 145, 145, 145, 145, 147, + 147, 147, 147, 147, 147, 147, 147, 147, 147, 149, + 1388, 1389, 149, 149, 149, 149, 149, 149, 149, 149, + 149, 149, 149, 150, 1390, 1391, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, 153, 153, + + 153, 153, 153, 153, 153, 153, 153, 153, 155, 155, + 155, 155, 155, 155, 155, 155, 155, 155, 159, 1222, + 1222, 1222, 159, 159, 159, 159, 159, 159, 159, 160, + 1235, 1235, 1235, 160, 160, 160, 160, 160, 160, 160, + 161, 1236, 1236, 1236, 161, 161, 161, 161, 161, 161, + 161, 162, 1026, 1026, 1026, 162, 162, 162, 162, 162, + 162, 162, 163, 1392, 1393, 163, 1361, 1026, 1395, 163, + 163, 163, 163, 163, 163, 163, 164, 1397, 1361, 164, + 1253, 1253, 1253, 164, 164, 164, 164, 164, 164, 164, + 165, 1288, 
1288, 1288, 165, 165, 165, 165, 165, 165, + + 165, 166, 166, 166, 166, 166, 166, 166, 167, 1294, + 1294, 1294, 167, 167, 167, 167, 167, 167, 167, 168, + 168, 168, 168, 168, 168, 168, 169, 1398, 1399, 169, + 1027, 1027, 1027, 169, 169, 169, 169, 169, 169, 169, + 170, 1400, 1401, 170, 1402, 1027, 1403, 170, 170, 170, + 170, 170, 170, 170, 171, 1297, 1297, 1297, 171, 171, + 171, 171, 171, 171, 171, 172, 172, 172, 172, 172, + 172, 172, 179, 1404, 1405, 179, 179, 179, 179, 179, + 179, 179, 179, 179, 179, 179, 180, 1406, 1409, 180, + 180, 180, 180, 180, 180, 180, 180, 180, 180, 180, + + 181, 1410, 1411, 181, 1028, 1028, 1028, 181, 181, 181, + 181, 181, 181, 181, 182, 1414, 1415, 182, 1416, 1028, + 1417, 182, 182, 182, 182, 182, 182, 182, 191, 1422, + 1423, 191, 191, 191, 191, 191, 191, 191, 191, 191, + 191, 191, 192, 1424, 1425, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 193, 193, 193, 193, + 193, 193, 193, 193, 193, 193, 193, 195, 1312, 1312, + 1312, 195, 195, 195, 195, 195, 195, 195, 196, 1031, + 1031, 1031, 196, 196, 196, 196, 196, 196, 196, 201, + 220, 1426, 201, 220, 1031, 1427, 201, 201, 201, 201, + + 201, 201, 201, 202, 1428, 1429, 202, 1313, 1313, 1313, + 202, 202, 202, 202, 202, 202, 202, 205, 1430, 220, + 1431, 205, 205, 205, 205, 205, 205, 205, 206, 1432, + 220, 1433, 206, 206, 206, 206, 206, 206, 206, 207, + 207, 207, 207, 207, 207, 207, 207, 207, 207, 207, + 209, 1434, 1435, 209, 1044, 1044, 1044, 209, 209, 209, + 209, 209, 209, 209, 210, 1438, 1439, 210, 1442, 1044, + 1443, 210, 210, 210, 210, 210, 210, 210, 221, 1444, + 1445, 221, 1046, 1046, 1046, 221, 221, 221, 221, 221, + 221, 221, 222, 1446, 1450, 222, 1451, 1046, 1452, 222, + + 222, 222, 222, 222, 222, 222, 241, 241, 241, 248, + 1064, 1064, 1064, 248, 1188, 1188, 1188, 1194, 1194, 1194, + 1453, 241, 248, 248, 1454, 1064, 1455, 248, 248, 1188, + 1456, 1457, 1194, 1315, 1315, 1315, 1458, 1197, 1197, 1197, + 1215, 1215, 1215, 1217, 1217, 1217, 241, 241, 241, 249, + 249, 249, 1197, 1243, 
1448, 1215, 1459, 1243, 1217, 1346, + 1346, 1346, 1460, 1448, 249, 1243, 1461, 249, 1221, 1221, + 1221, 1354, 1354, 1354, 249, 249, 711, 711, 711, 1244, + 1244, 1244, 1462, 1221, 1463, 1464, 711, 711, 1252, 1252, + 1252, 711, 1465, 1466, 1244, 711, 1233, 1233, 1233, 1287, + + 1287, 1287, 1467, 1252, 1468, 1233, 1233, 1469, 1233, 1470, + 1233, 1233, 1471, 1472, 1287, 1296, 1296, 1296, 1308, 1308, + 1308, 1314, 1314, 1314, 1320, 1320, 1320, 1336, 1336, 1336, + 1296, 1474, 1475, 1308, 1476, 1478, 1314, 1481, 1482, 1320, + 1483, 1484, 1336, 1339, 1339, 1339, 1345, 1345, 1345, 1387, + 1387, 1387, 1394, 1394, 1394, 1396, 1396, 1396, 1339, 1485, + 1486, 1345, 1407, 1407, 1407, 1408, 1408, 1408, 1487, 1488, + 1396, 1412, 1412, 1412, 1413, 1413, 1413, 1407, 1418, 1418, + 1418, 1419, 1419, 1419, 1489, 1490, 1412, 1420, 1420, 1420, + 1421, 1421, 1421, 1418, 1436, 1436, 1436, 1437, 1437, 1437, + + 1493, 1492, 1420, 1492, 1494, 1421, 1440, 1440, 1440, 1436, + 1441, 1441, 1441, 1447, 1447, 1447, 1491, 1449, 1497, 1498, + 1449, 1440, 1449, 1491, 1495, 1495, 1495, 1496, 1496, 1496, + 1449, 1499, 1500, 1501, 1502, 1503, 1504, 1504, 1504, 1495, + 1505, 1506, 1507, 1507, 1507, 1508, 1509, 1510, 1511, 1511, + 1511, 1504, 1512, 1513, 1514, 1515, 1516, 1516, 1516, 1517, + 1517, 1517, 1518, 1518, 1518, 1519, 1519, 1519, 1520, 1520, + 1520, 1521, 1522, 1523, 1517, 1524, 1525, 1526, 1527, 1528, + 1519, 1529, 1530, 1531, 1532, 1533, 1534, 1535, 1535, 1535, + 1536, 1537, 1538, 1538, 1538, 1539, 1540, 1541, 1542, 1543, + + 1544, 1545, 1546, 1547, 1548, 1549, 1550, 1551, 1552, 1553, + 1554, 1555, 1556, 1557, 1558, 1559, 1560, 1561, 1562, 1563, + 1564, 1566, 1567, 1568, 1569, 1570, 1571, 1572, 1574, 1577, + 1578, 1579, 1579, 1579, 1580, 1581, 1582, 1582, 1582, 1583, + 1584, 1585, 1586, 1587, 1588, 1589, 1579, 1590, 1591, 1592, + 1594, 1582, 1593, 1593, 1593, 1595, 1596, 1597, 1598, 1599, + 1600, 1601, 1601, 1601, 1602, 1602, 1602, 1603, 1604, 1605, + 1606, 1607, 1608, 1604, 1614, 1615, 1601, 
1609, 1609, 1609, + 1610, 1610, 1610, 1611, 1611, 1611, 1612, 1612, 1612, 1613, + 1613, 1613, 1609, 1616, 1617, 1610, 1618, 1619, 1611, 1620, + + 1621, 1622, 1623, 1624, 1625, 1626, 1627, 1628, 1629, 1630, + 1631, 1632, 1633, 1634, 1635, 1636, 1637, 1638, 1639, 1640, + 1641, 1642, 1643, 1641, 1644, 1645, 1645, 1645, 1646, 1647, + 1648, 1649, 1650, 1651, 1652, 1653, 1654, 1656, 1657, 1658, + 1645, 1659, 1661, 1662, 1664, 1665, 1666, 1666, 1666, 1668, + 1665, 1667, 1667, 1667, 1669, 1670, 1670, 1670, 1671, 1671, + 1671, 1666, 1672, 1673, 1674, 1674, 1674, 1675, 1676, 1677, + 1670, 1678, 1678, 1678, 1679, 1680, 1681, 1682, 1683, 1674, + 1684, 1678, 1678, 1685, 1686, 1687, 1678, 1688, 1689, 1689, + 1689, 1690, 1691, 1692, 1693, 1694, 1695, 1696, 1697, 1697, + + 1697, 1698, 1698, 1698, 1699, 1699, 1699, 1700, 1700, 1700, + 1701, 1701, 1701, 1697, 1702, 1702, 1702, 1703, 1704, 1699, + 1705, 1706, 1707, 1708, 1709, 1701, 1710, 1710, 1710, 1711, + 1712, 1713, 1714, 1715, 1716, 1717, 1718, 1719, 1720, 1721, + 1722, 1710, 1723, 1724, 1725, 1726, 1727, 1728, 1729, 1730, + 1731, 1732, 1733, 1734, 1735, 1735, 1735, 1736, 1736, 1736, + 1737, 1738, 1739, 1740, 1741, 1742, 1743, 1744, 1745, 1735, + 1746, 1747, 1748, 1749, 1750, 1751, 1752, 1753, 1754, 1755, + 1755, 1755, 1756, 1756, 1756, 1757, 1758, 1758, 1758, 1759, + 1760, 1761, 1761, 1761, 1762, 1762, 1762, 1756, 1763, 1764, + + 1764, 1764, 1765, 1766, 1766, 1766, 1761, 1767, 1767, 1767, + 1768, 1768, 1768, 1770, 1764, 1769, 1769, 1769, 1766, 1771, + 1771, 1771, 1767, 1772, 1773, 1768, 1774, 1775, 1776, 1777, + 1778, 1779, 1780, 1781, 1771, 1782, 1782, 1782, 1783, 1783, + 1783, 1784, 1785, 1786, 1787, 1787, 1787, 1788, 1788, 1788, + 1782, 1790, 1791, 1783, 1789, 1789, 1789, 1792, 1793, 1794, + 1795, 1795, 1795, 1796, 1797, 1797, 1797, 1798, 1798, 1798, + 1799, 1800, 1801, 1802, 1803, 1795, 1805, 1806, 1803, 1797, + 1804, 1804, 1804, 1807, 1808, 1809, 1810, 1811, 1812, 1812, + 1812, 1813, 1814, 1815, 1816, 1804, 1817, 1818, 
1819, 1820, + + 1821, 1822, 1824, 1812, 1823, 1823, 1823, 1825, 1826, 1827, + 1827, 1827, 1828, 1829, 1830, 1831, 1832, 1833, 1834, 1835, + 1836, 1837, 1838, 1839, 1827, 1840, 1841, 1841, 1841, 1842, + 1842, 1842, 1843, 1843, 1843, 1844, 1841, 1841, 1841, 1845, + 1846, 1841, 1848, 1851, 1842, 1847, 1847, 1847, 1849, 1849, + 1849, 1850, 1850, 1850, 1852, 1852, 1852, 1853, 1853, 1853, + 1854, 1854, 1854, 1849, 1855, 1855, 1855, 1857, 1860, 1852, + 1856, 1856, 1856, 1862, 1863, 1854, 1858, 1858, 1858, 1859, + 1859, 1859, 1861, 1865, 1864, 1867, 1868, 1861, 1866, 1866, + 1866, 1858, 1864, 1869, 1870, 1870, 1870, 1864, 1871, 1871, + + 1871, 1874, 1875, 1866, 1872, 1872, 1872, 1876, 1877, 1870, + 1873, 1873, 1873, 1878, 1879, 1880, 1881, 1884, 1886, 1872, + 1882, 1882, 1882, 1883, 1883, 1883, 1885, 1885, 1885, 1887, + 1888, 1889, 1890, 1891, 1894, 1882, 1892, 1892, 1892, 1889, + 1893, 1893, 1893, 1895, 1896, 1897, 1898, 1899, 1900, 1903, + 1904, 1892, 1901, 1901, 1901, 1902, 1902, 1902, 1905, 1906, + 1907, 1908, 1909, 1910, 1911, 1912, 1913, 1901, 1914, 1915, + 1916, 1916, 1916, 1917, 1917, 1917, 1918, 1919, 1920, 1921, + 1922, 1925, 1926, 1927, 1929, 1916, 1930, 1931, 1931, 1931, + 1932, 1932, 1932, 1933, 1933, 1933, 1934, 1934, 1934, 1935, + + 1935, 1935, 1931, 1937, 1938, 1932, 1940, 1942, 1933, 1947, + 1948, 1934, 1936, 1936, 1936, 1939, 1939, 1939, 1941, 1941, + 1941, 1943, 1943, 1943, 1944, 1944, 1944, 1945, 1945, 1945, + 1939, 1946, 1946, 1946, 1949, 1950, 1951, 1945, 1945, 1952, + 1953, 1954, 1945, 1955, 1956, 1956, 1956, 1957, 1957, 1957, + 1958, 1959, 1960, 1961, 1961, 1961, 1962, 1962, 1962, 1956, + 1963, 1964, 1964, 1964, 1965, 1966, 1967, 1967, 1967, 1968, + 1969, 1970, 1971, 1971, 1971, 1972, 1964, 1973, 1974, 1975, + 1976, 1967, 1977, 1978, 1979, 1980, 1980, 1980, 1981, 1982, + 1983, 1984, 1985, 1986, 1987, 1988, 1988, 1988, 1989, 1990, + + 1990, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, + 1999, 2000, 2001, 2003, 1990, 2002, 2002, 2002, 2004, 
2005, + 2006, 2007, 2008, 2010, 2011, 2012, 2013, 2013, 2013, 2014, + 2014, 2014, 2015, 2015, 2015, 2016, 2016, 2016, 2017, 2017, + 2017, 2013, 2018, 2018, 2018, 2020, 2021, 2015, 2019, 2019, + 2019, 2024, 2025, 2017, 2022, 2022, 2022, 2023, 2023, 2023, + 2026, 2026, 2026, 2027, 2027, 2027, 2028, 2028, 2028, 2022, + 2029, 2029, 2029, 2030, 2031, 2026, 2033, 2034, 2027, 2035, + 2036, 2028, 2032, 2032, 2032, 2032, 2037, 2038, 2039, 2039, + 2039, 2040, 2041, 2042, 2043, 2044, 2044, 2044, 2045, 2045, + + 2045, 2046, 2046, 2046, 2047, 2048, 2048, 2048, 2050, 2051, + 2044, 2049, 2049, 2049, 2053, 2054, 2046, 2052, 2052, 2052, + 2048, 2055, 2056, 2057, 2057, 2057, 2059, 2060, 2055, 2058, + 2058, 2058, 2052, 2061, 2062, 2063, 2063, 2063, 2057, 2064, + 2065, 2066, 2067, 2068, 2058, 2069, 2069, 2069, 2071, 2072, + 2063, 2070, 2070, 2070, 2073, 2074, 2075, 2076, 2077, 2078, + 2069, 2079, 2080, 2081, 2082, 2083, 2084, 2085, 2086, 2087, + 2088, 2089, 2090, 2090, 2090, 2091, 2091, 2091, 2092, 2092, + 2092, 2093, 2093, 2093, 2094, 2097, 2098, 2090, 2095, 2095, + 2095, 2096, 2096, 2096, 2099, 2099, 2099, 2100, 2100, 2100, + + 2101, 2101, 2101, 2095, 2102, 2102, 2102, 2104, 2105, 2099, + 2103, 2103, 2103, 2110, 2112, 2101, 2106, 2106, 2106, 2107, + 2107, 2107, 2108, 2108, 2108, 2109, 2109, 2109, 2111, 2111, + 2111, 2106, 2113, 2114, 2107, 2115, 2116, 2108, 2117, 2118, + 2109, 2119, 2123, 2111, 2120, 2120, 2120, 2121, 2121, 2121, + 2122, 2122, 2122, 2124, 2124, 2124, 2125, 2111, 2126, 2127, + 2127, 2127, 2121, 2128, 2128, 2128, 2129, 2130, 2130, 2130, + 2131, 2132, 2133, 2138, 2127, 2134, 2134, 2134, 2135, 2135, + 2135, 2140, 2130, 2136, 2136, 2136, 2137, 2137, 2137, 2141, + 2134, 2139, 2139, 2139, 2142, 2142, 2142, 2144, 2136, 2143, + + 2143, 2143, 2145, 2146, 2147, 2148, 2139, 2150, 2152, 2142, + 2149, 2149, 2149, 2151, 2151, 2151, 2153, 2154, 2154, 2154, + 2155, 2155, 2155, 2156, 2156, 2156, 2157, 2158, 2151, 2159, + 2160, 2161, 2154, 2162, 2163, 2155, 2164, 2165, 2156, 2166, + 
2167, 2168, 2169, 2169, 2169, 2171, 2174, 2168, 2170, 2170, + 2170, 2175, 2168, 2172, 2172, 2172, 2178, 2169, 2173, 2173, + 2173, 2176, 2176, 2176, 2177, 2177, 2177, 2179, 2172, 2180, + 2180, 2180, 2181, 2181, 2181, 2182, 2182, 2182, 2183, 2183, + 2183, 2184, 2184, 2184, 2180, 2185, 2185, 2185, 2191, 2192, + 2182, 2186, 2186, 2186, 2193, 2194, 2184, 2187, 2187, 2187, + + 2188, 2188, 2188, 2189, 2189, 2189, 2186, 2190, 2190, 2190, + 2195, 2196, 2196, 2196, 2197, 2188, 2198, 2199, 2189, 2200, + 2200, 2200, 2201, 2202, 2203, 2205, 2196, 2204, 2204, 2204, + 2206, 2206, 2206, 2207, 2207, 2207, 2208, 2209, 2210, 2211, + 2211, 2211, 2212, 2212, 2212, 2206, 2213, 2214, 2214, 2214, + 2215, 2215, 2215, 2216, 2217, 2218, 2218, 2218, 2219, 2219, + 2219, 2220, 2214, 2221, 2222, 2222, 2222, 2223, 2223, 2223, + 2224, 2228, 2235, 2219, 2225, 2225, 2225, 2237, 2238, 2222, + 2239, 2240, 2223, 2226, 2226, 2226, 2227, 2227, 2227, 2225, + 2229, 2229, 2229, 2230, 2230, 2230, 2231, 2231, 2231, 2241, + + 2242, 2227, 2232, 2232, 2232, 2229, 2233, 2233, 2233, 2243, + 2244, 2231, 2234, 2234, 2234, 2236, 2236, 2236, 2245, 2246, + 2247, 2233, 2248, 2249, 2249, 2249, 2250, 2251, 2251, 2251, + 2236, 2252, 2253, 2254, 2255, 2255, 2255, 2256, 2256, 2256, + 2257, 2257, 2257, 2258, 2258, 2258, 2259, 2259, 2259, 2255, + 2260, 2260, 2260, 2261, 2261, 2261, 2262, 2262, 2262, 2263, + 2263, 2263, 2264, 2265, 2266, 2260, 2267, 2267, 2267, 2263, + 2263, 2268, 2268, 2268, 2263, 2269, 2269, 2269, 2270, 2271, + 2272, 2267, 2273, 2274, 2271, 2275, 2268, 2276, 2277, 2277, + 2277, 2278, 2279, 2280, 2281, 2282, 2282, 2282, 2283, 2284, + + 2285, 2285, 2285, 2286, 2286, 2286, 2287, 2288, 2289, 2289, + 2289, 2290, 2290, 2290, 2293, 2285, 2291, 2291, 2291, 2292, + 2292, 2292, 2297, 2289, 2294, 2294, 2294, 2295, 2295, 2295, + 2301, 2291, 2296, 2296, 2296, 2298, 2298, 2298, 2299, 2299, + 2299, 2304, 2295, 2300, 2300, 2300, 2302, 2302, 2302, 2303, + 2303, 2303, 2305, 2306, 2307, 2308, 2308, 2308, 2309, 2309, + 2309, 
2302, 2310, 2311, 2313, 2314, 2315, 2316, 2317, 2318, + 2308, 2319, 2327, 2309, 2320, 2320, 2320, 2321, 2321, 2321, + 2322, 2322, 2322, 2323, 2323, 2323, 2324, 2324, 2324, 2320, + 2325, 2325, 2325, 2326, 2326, 2326, 2328, 2329, 2323, 2333, + + 2334, 2324, 2330, 2330, 2330, 2325, 2331, 2331, 2331, 2332, + 2332, 2332, 2335, 2336, 2337, 2338, 2339, 2330, 2340, 2341, + 2341, 2341, 2342, 2343, 2343, 2343, 2344, 2345, 2346, 2347, + 2347, 2347, 2348, 2349, 2341, 2350, 2350, 2350, 2343, 2351, + 2351, 2351, 2352, 2353, 2353, 2353, 2354, 2355, 2355, 2355, + 2356, 2356, 2356, 2357, 2358, 2359, 2360, 2361, 2361, 2361, + 2365, 2366, 2355, 2362, 2362, 2362, 2363, 2363, 2363, 2364, + 2364, 2364, 2361, 2367, 2369, 2370, 2371, 2371, 2371, 2372, + 2373, 2363, 2374, 2374, 2374, 2375, 2375, 2375, 2376, 2376, + 2376, 2371, 2377, 2377, 2377, 2378, 2378, 2378, 2380, 2381, + + 2375, 2379, 2379, 2379, 2382, 2384, 2385, 2377, 2383, 2383, + 2383, 2386, 2386, 2386, 2387, 2388, 2389, 2390, 2390, 2390, + 2391, 2386, 2386, 2386, 2394, 2397, 2386, 2392, 2392, 2392, + 2398, 2401, 2390, 2393, 2393, 2393, 2395, 2395, 2395, 2396, + 2396, 2396, 2392, 2399, 2399, 2399, 2400, 2400, 2400, 2402, + 2406, 2395, 2403, 2403, 2403, 2404, 2404, 2404, 2399, 2407, + 2408, 2400, 2405, 2405, 2405, 2412, 2413, 2403, 2414, 2399, + 2404, 2409, 2409, 2409, 2410, 2410, 2410, 2411, 2411, 2411, + 2415, 2416, 2416, 2416, 2419, 2423, 2409, 2417, 2417, 2417, + 2418, 2418, 2418, 2420, 2420, 2420, 2416, 2421, 2421, 2421, + + 2424, 2425, 2417, 2422, 2422, 2422, 2427, 2434, 2420, 2435, + 2416, 2426, 2426, 2426, 2428, 2428, 2428, 2429, 2429, 2429, + 2430, 2430, 2430, 2431, 2431, 2431, 2426, 2443, 2446, 2428, + 2449, 2454, 2429, 2455, 2456, 2430, 2459, 2461, 2431, 2432, + 2432, 2432, 2433, 2433, 2433, 2436, 2436, 2436, 2437, 2437, + 2437, 2438, 2438, 2438, 2439, 2439, 2439, 2433, 2462, 2465, + 2436, 2440, 2440, 2440, 2467, 2471, 2438, 2441, 2441, 2441, + 2442, 2442, 2442, 2444, 2444, 2444, 2440, 2445, 2445, 2445, + 2447, 2447, 
2447, 2472, 2475, 2442, 2485, 2486, 2444, 2448, + 2448, 2448, 2450, 2450, 2450, 2447, 2451, 2451, 2451, 2452, + + 2452, 2452, 2453, 2453, 2453, 2494, 2496, 2450, 2457, 2457, + 2457, 2451, 2458, 2458, 2458, 2463, 2463, 2463, 2464, 2464, + 2464, 2498, 2503, 2457, 2466, 2466, 2466, 2468, 2468, 2468, + 2463, 2469, 2469, 2469, 2470, 2470, 2470, 2473, 2473, 2473, + 2506, 2507, 2468, 2474, 2474, 2474, 2476, 2476, 2476, 2470, + 2508, 2515, 2473, 2477, 2477, 2477, 2478, 2478, 2478, 2516, + 2523, 2476, 2479, 2479, 2479, 2480, 2480, 2480, 2481, 2481, + 2481, 2478, 2482, 2482, 2482, 2483, 2483, 2483, 2524, 2528, + 2480, 2484, 2484, 2484, 2487, 2487, 2487, 2488, 2488, 2488, + 2483, 2489, 2489, 2489, 2490, 2490, 2490, 2491, 2491, 2491, + + 2529, 2530, 2488, 2492, 2492, 2492, 2493, 2493, 2493, 2490, + 2495, 2495, 2495, 2497, 2497, 2497, 2538, 2539, 2492, 2499, + 2499, 2499, 2500, 2500, 2500, 2501, 2501, 2501, 2502, 2502, + 2502, 2504, 2504, 2504, 2499, 2505, 2505, 2505, 2509, 2509, + 2509, 2547, 2548, 2502, 2552, 2554, 2504, 2510, 2510, 2510, + 2511, 2511, 2511, 2512, 2512, 2512, 2555, 2510, 2510, 2513, + 2513, 2513, 2510, 2562, 2563, 2511, 2514, 2514, 2514, 2517, + 2517, 2517, 2570, 2571, 2513, 2518, 2518, 2518, 2519, 2519, + 2519, 2520, 2520, 2520, 2521, 2521, 2521, 2522, 2522, 2522, + 2518, 2525, 2525, 2525, 2526, 2526, 2526, 2527, 2527, 2527, + + 2531, 2531, 2531, 2532, 2532, 2532, 2533, 2533, 2533, 2534, + 2534, 2534, 2535, 2535, 2535, 2536, 2536, 2536, 2532, 2537, + 2537, 2537, 2575, 2578, 2534, 2579, 2582, 2535, 2540, 2540, + 2540, 2541, 2541, 2541, 2537, 2542, 2542, 2542, 2543, 2543, + 2543, 2583, 2587, 2540, 2589, 2591, 2541, 2544, 2544, 2544, + 2542, 2545, 2545, 2545, 2546, 2546, 2546, 2549, 2549, 2549, + 2592, 2593, 2544, 2550, 2550, 2550, 2551, 2551, 2551, 2553, + 2553, 2553, 2549, 2556, 2556, 2556, 2557, 2557, 2557, 2594, + 2599, 2551, 2600, 2601, 2553, 2558, 2558, 2558, 2559, 2559, + 2559, 2557, 2560, 2560, 2560, 2561, 2561, 2561, 2564, 2564, + + 2564, 2565, 2565, 
2565, 2566, 2566, 2566, 2560, 2567, 2567, + 2567, 2602, 2608, 2564, 2568, 2568, 2568, 2611, 2617, 2566, + 2569, 2569, 2569, 2572, 2572, 2572, 2573, 2573, 2573, 2574, + 2574, 2574, 2576, 2576, 2576, 2577, 2577, 2577, 2580, 2580, + 2580, 2573, 2581, 2581, 2581, 2618, 2619, 2576, 2584, 2584, + 2584, 2585, 2585, 2585, 2586, 2586, 2586, 2588, 2588, 2588, + 2590, 2590, 2590, 2595, 2595, 2595, 2596, 2596, 2596, 2586, + 2597, 2597, 2597, 2598, 2598, 2598, 2623, 2624, 2595, 2603, + 2603, 2603, 2604, 2604, 2604, 2597, 2625, 2627, 2598, 2605, + 2605, 2605, 2606, 2606, 2606, 2610, 2628, 2604, 2607, 2607, + + 2607, 2609, 2609, 2609, 2612, 2612, 2612, 2606, 2610, 2613, + 2613, 2613, 2614, 2614, 2614, 2629, 2609, 2615, 2615, 2615, + 2616, 2616, 2616, 2620, 2620, 2620, 2630, 2614, 2621, 2621, + 2621, 2631, 2615, 2622, 2622, 2622, 2633, 2634, 2620, 2626, + 2626, 2626, 2635, 2636, 2637, 2638, 2640, 2641, 2644, 2644, + 2644, 2644, 2644, 2644, 2644, 2644, 2645, 2645, 2645, 2645, + 2645, 2645, 2645, 2645, 2646, 2646, 2646, 2646, 2646, 2646, + 2646, 2646, 2647, 2647, 2647, 2647, 2647, 2647, 2647, 2647, + 2648, 2648, 2648, 2648, 2648, 2648, 2648, 2648, 2649, 2649, + 2649, 2649, 2649, 2649, 2649, 2649, 2650, 2650, 2650, 2650, + + 2650, 2650, 2650, 2650, 2651, 2651, 2651, 2651, 2651, 2651, + 2651, 2651, 2652, 2652, 2652, 2652, 2652, 2652, 2652, 2652, + 2653, 2653, 2653, 2653, 2653, 2653, 2653, 2653, 2654, 2654, + 2654, 2654, 2654, 2654, 2654, 2654, 2655, 2655, 2655, 2655, + 2655, 2655, 2655, 2655, 2656, 2656, 2656, 2656, 2656, 2656, + 2656, 2656, 2657, 2657, 2657, 2657, 2657, 2657, 2657, 2657, + 2658, 2658, 2658, 2658, 2658, 2658, 2658, 2658, 2659, 2659, + 2659, 2659, 2659, 2659, 2659, 2659, 2660, 2660, 2660, 2660, + 2660, 2660, 2660, 2660, 2661, 2661, 2661, 2661, 2661, 2661, + 2661, 2661, 2662, 2662, 2662, 2662, 2662, 2662, 2662, 2662, + + 2663, 2663, 2663, 2663, 2663, 2663, 2663, 2663, 2664, 2664, + 2664, 2664, 2664, 2664, 2664, 2664, 2665, 2665, 2665, 2665, + 2665, 2665, 2665, 2665, 
2666, 2666, 2666, 2666, 2666, 2666, + 2666, 2666, 2667, 2667, 2667, 2667, 2667, 2667, 2667, 2667, + 2668, 2668, 2668, 2668, 2668, 2668, 2668, 2668, 2669, 2669, + 2669, 2669, 2669, 2669, 2669, 2669, 2670, 2670, 2670, 2670, + 2670, 2670, 2670, 2670, 2671, 2671, 2671, 2671, 2671, 2671, + 2671, 2671, 2672, 2672, 2672, 2672, 2672, 2672, 2672, 2672, + 2673, 2673, 2673, 2673, 2673, 2673, 2673, 2673, 2674, 2674, + 2674, 2674, 2674, 2674, 2674, 2674, 2675, 2675, 2675, 2675, + + 2675, 2675, 2675, 2675, 2676, 2676, 2676, 2676, 2676, 2676, + 2676, 2676, 2677, 2677, 2677, 2677, 2677, 2677, 2677, 2677, + 2678, 2678, 2678, 2678, 2678, 2678, 2678, 2678, 2679, 2679, + 2679, 2679, 2679, 2679, 2679, 2679, 2680, 2680, 2680, 2680, + 2680, 2680, 2680, 2680, 2681, 2681, 2681, 2681, 2681, 2681, + 2681, 2681, 2682, 2682, 2682, 2682, 2682, 2682, 2682, 2682, + 2683, 2683, 2683, 2683, 2683, 2683, 2683, 2683, 2684, 2684, + 2684, 2684, 2684, 2684, 2684, 2684, 2685, 2685, 2685, 2685, + 2685, 2685, 2685, 2685, 2686, 2686, 2686, 2686, 2686, 2686, + 2686, 2686, 2687, 2687, 2687, 2687, 2687, 2687, 2687, 2687, + + 2688, 2688, 2688, 2688, 2688, 2688, 2688, 2688, 2689, 2689, + 2689, 2689, 2689, 2689, 2689, 2689, 2690, 2690, 2690, 2690, + 2690, 2690, 2690, 2690, 2691, 2691, 2691, 2691, 2691, 2691, + 2691, 2691, 2692, 2692, 2692, 2692, 2692, 2692, 2692, 2692, + 2693, 2693, 2693, 2693, 2693, 2693, 2693, 2693, 2694, 2694, + 2694, 2694, 2694, 2694, 2694, 2694, 2695, 2695, 2695, 2695, + 2695, 2695, 2695, 2695, 2696, 2696, 2696, 2696, 2696, 2696, + 2696, 2696, 2697, 2697, 2697, 2697, 2697, 2697, 2697, 2697, + 2698, 2698, 2698, 2698, 2698, 2698, 2698, 2698, 2699, 2699, + 2699, 2699, 2699, 2699, 2699, 2699, 2700, 2700, 2700, 2700, + + 2700, 2700, 2700, 2700, 2701, 2701, 2701, 2701, 2701, 2701, + 2701, 2701, 2702, 2702, 2702, 2702, 2702, 2702, 2702, 2702, + 2703, 2703, 2703, 2703, 2703, 2703, 2703, 2703, 2704, 2704, + 2704, 2704, 2704, 2704, 2704, 2704, 2705, 2705, 2705, 2705, + 2705, 2705, 2705, 2705, 2706, 
2706, 2706, 2706, 2706, 2706, + 2706, 2706, 2707, 2707, 2707, 2707, 2707, 2707, 2707, 2707, + 2708, 2708, 2708, 2708, 2708, 2708, 2708, 2708, 2709, 2709, + 2709, 2709, 2709, 2709, 2709, 2709, 2710, 2710, 2710, 2710, + 2710, 2710, 2710, 2710, 2711, 2711, 2711, 2711, 2711, 2711, + 2711, 2711, 2712, 2712, 2712, 2712, 2712, 2712, 2712, 2712, + + 2713, 2713, 2713, 2713, 2713, 2713, 2713, 2713, 2714, 2714, + 2715, 2715, 2716, 2717, 2718, 2718, 2719, 2719, 2720, 2720, + 2721, 2721, 2722, 2722, 2723, 2723, 2724, 2725, 2725, 2726, + 2726, 2727, 2727, 2728, 2728, 2729, 2729, 2730, 2730, 2731, + 2731, 2732, 2732, 2733, 2733, 2734, 2734, 2735, 2736, 2736, + 2737, 2737, 2738, 2738, 2739, 2739, 2740, 2741, 2741, 2742, + 2742, 2743, 2744, 2745, 2747, 2745, 2745, 2745, 2746, 2748, + 2746, 2746, 2746, 2749, 2750, 2751, 2752, 2753, 2754, 2755, + 2756, 2757, 2758, 2759, 2760, 2761, 2762, 2762, 2763, 2763, + 2764, 2765, 2765, 2766, 2766, 2767, 2768, 2768, 2769, 2769, + + 2770, 2771, 2770, 2770, 2770, 2772, 2773, 2774, 2775, 2776, + 2777, 2777, 2778, 2778, 2779, 2779, 2780, 2780, 2781, 2781, + 2782, 2782, 2783, 2783, 2784, 2784, 2785, 2785, 2786, 2786, + 2787, 2787, 2788, 2788, 2789, 2789, 2790, 2790, 2791, 2791, + 2792, 2792, 2793, 2793, 2794, 2794, 2795, 2795, 2796, 2796, + 2797, 2797, 2798, 2798, 2799, 2799, 2800, 2800, 2801, 2801, + 2802, 2802, 2803, 2804, 2805, 2806, 2807, 2808, 2809, 2810, + 2810, 2811, 2811, 2812, 2813, 2813, 2814, 2814, 2815, 2815, + 2816, 2816, 2817, 2818, 2819, 2820, 2820, 2821, 2821, 2822, + 2823, 2824, 2824, 2829, 2824, 2824, 2824, 2824, 2824, 2825, + + 2830, 2840, 2825, 2825, 2825, 2825, 2825, 2826, 2826, 2827, + 2827, 2828, 2856, 2828, 2831, 2831, 2832, 2832, 2833, 2863, + 2833, 2834, 2834, 2835, 2835, 2836, 2867, 2836, 2837, 2837, + 2838, 2838, 2839, 2868, 2839, 2841, 2841, 2842, 2842, 2843, + 2871, 2843, 2844, 2844, 2845, 2845, 2846, 2872, 2846, 2847, + 2847, 2848, 2848, 2849, 2873, 2849, 2850, 2850, 2851, 2851, + 2852, 2874, 2852, 2853, 2853, 2854, 
2854, 2855, 2875, 2855, + 2857, 2857, 2858, 2858, 2859, 2876, 2859, 2860, 2860, 2861, + 2861, 2862, 2877, 2862, 2864, 2864, 2865, 2865, 2866, 2878, + 2866, 2869, 2879, 2869, 2869, 2869, 2870, 2880, 2870, 2870, + + 2870, 2881, 2882, 2883, 2884, 2885, 2886, 2886, 2887, 2887, + 2888, 2889, 2888, 2890, 2890, 2891, 2891, 2892, 2893, 2892, + 2894, 2894, 2895, 2895, 2896, 2898, 2896, 2897, 2899, 2897, + 2897, 2897, 2900, 2901, 2902, 2903, 2904, 2904, 2905, 2905, + 2906, 2943, 2906, 2907, 2907, 2908, 2908, 2909, 2944, 2909, + 2910, 2910, 2911, 2911, 2912, 2945, 2912, 2913, 2913, 2914, + 2914, 2915, 2946, 2915, 2916, 2916, 2917, 2917, 2918, 2947, + 2918, 2919, 2919, 2920, 2920, 2921, 2948, 2921, 2922, 2922, + 2923, 2923, 2924, 2949, 2924, 2925, 2925, 2926, 2926, 2927, + 2953, 2927, 2928, 2928, 2929, 2929, 2930, 2960, 2930, 2931, + + 2931, 2932, 2932, 2933, 2961, 2933, 2934, 2934, 2935, 2935, + 2936, 2962, 2936, 2937, 2937, 2938, 2938, 2939, 2966, 2939, + 2940, 2940, 2941, 2941, 2942, 2967, 2942, 2950, 2950, 2951, + 2951, 2952, 2970, 2952, 2954, 2954, 2955, 2955, 2956, 2971, + 2956, 2957, 2957, 2958, 2958, 2959, 2972, 2959, 2963, 2963, + 2964, 2964, 2965, 2973, 2965, 2968, 2968, 2974, 2968, 2968, + 2968, 2968, 2968, 2969, 2975, 2976, 2969, 2969, 2969, 2969, + 2969, 2977, 2978, 2979, 2980, 2981, 2982, 2983, 2984, 2985, + 2986, 2987, 2988, 2989, 2990, 2991, 2992, 2993, 2994, 2995, + 2996, 2997, 2998, 1251, 1250, 1249, 1248, 1247, 1246, 1245, + + 1241, 1240, 1239, 1238, 1237, 1234, 1232, 1231, 1230, 1229, + 1228, 1227, 1226, 1225, 1224, 1223, 1220, 1214, 1213, 1212, + 1208, 1207, 1206, 1205, 1204, 1203, 1202, 1201, 1200, 1199, + 1198, 1196, 1193, 1192, 1191, 1190, 1189, 1187, 1186, 1185, + 1183, 1180, 1165, 1164, 1128, 1119, 1118, 1116, 1110, 1108, + 1106, 1104, 1102, 1101, 1100, 1099, 1097, 1074, 1072, 1071, + 1068, 1066, 1065, 1062, 1061, 1060, 1059, 1058, 1057, 1056, + 1055, 1054, 1053, 1052, 1051, 1050, 1049, 1048, 1043, 1042, + 1041, 1040, 1039, 1038, 1037, 1036, 1035, 
1034, 1033, 1032, + 1030, 1025, 1024, 1023, 1015, 1014, 1013, 1012, 1011, 1010, + + 1009, 1008, 1007, 1006, 1005, 1004, 1003, 1001, 1000, 999, + 998, 997, 996, 995, 994, 992, 988, 984, 983, 982, + 981, 977, 976, 975, 973, 970, 968, 964, 963, 962, + 960, 959, 958, 957, 955, 953, 952, 950, 949, 946, + 945, 944, 943, 941, 939, 930, 927, 926, 925, 923, + 922, 921, 919, 918, 917, 915, 914, 913, 911, 910, + 909, 907, 906, 905, 903, 902, 901, 899, 898, 897, + 896, 893, 892, 891, 889, 888, 887, 885, 884, 883, + 881, 880, 879, 877, 876, 875, 867, 866, 865, 864, + 861, 860, 859, 856, 855, 854, 851, 850, 849, 846, + + 844, 840, 830, 827, 825, 823, 821, 819, 817, 816, + 815, 814, 812, 811, 810, 809, 808, 799, 798, 797, + 796, 795, 793, 792, 791, 788, 787, 786, 784, 783, + 782, 779, 778, 777, 775, 774, 773, 771, 770, 769, + 767, 766, 765, 763, 762, 761, 758, 757, 756, 754, + 753, 752, 750, 749, 748, 745, 744, 741, 740, 739, + 738, 737, 736, 735, 734, 731, 729, 727, 726, 725, + 724, 720, 719, 718, 717, 716, 713, 712, 710, 703, + 701, 700, 699, 698, 697, 696, 695, 694, 693, 691, + 690, 689, 688, 687, 685, 684, 674, 673, 669, 668, + + 664, 663, 662, 658, 657, 650, 649, 648, 643, 642, + 635, 634, 630, 629, 623, 622, 621, 619, 618, 617, + 613, 612, 609, 608, 604, 603, 602, 599, 598, 590, + 589, 588, 584, 583, 580, 579, 577, 576, 561, 560, + 559, 558, 557, 556, 555, 554, 552, 551, 550, 547, + 546, 545, 542, 541, 540, 539, 538, 537, 536, 535, + 534, 532, 531, 530, 526, 525, 522, 521, 520, 519, + 518, 517, 516, 515, 514, 513, 512, 511, 509, 508, + 507, 494, 493, 492, 490, 489, 487, 486, 485, 477, + 476, 475, 468, 467, 466, 462, 461, 460, 451, 450, + + 413, 404, 403, 399, 398, 394, 393, 389, 388, 384, + 383, 375, 374, 372, 371, 370, 366, 365, 361, 360, + 347, 346, 345, 344, 343, 341, 340, 339, 336, 335, + 334, 333, 332, 331, 322, 321, 320, 316, 315, 314, + 310, 309, 308, 304, 303, 302, 298, 297, 296, 288, + 287, 286, 282, 281, 280, 276, 275, 274, 262, 261, + 260, 256, 255, 254, 251, 
250, 240, 239, 232, 231, + 227, 223, 217, 208, 194, 156, 154, 152, 148, 146, + 144, 138, 136, 134, 132, 112, 102, 42, 40, 38, + 36, 1, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, 2643, + 2643, 2643, 2643, 2643, 2643, 2643, 2643 + } ; + +static yy_state_type yy_last_accepting_state; +static char *yy_last_accepting_cpos; + +extern int yy_flex_debug; +int yy_flex_debug = 0; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. + */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +char *yytext; +#line 1 "read_input.l" +/* + * $Id: read_input.c,v 1.69 2008/08/01 14:04:29 urbach Exp $ + * + * This is the parser. (Dec 2002) + * The .c-file is generated from .l using flex. + * Please edit read_input.l instead of read_input.c! + * flex should be said to be case insensitive! 
+ *
+ * After modifying read_input.l please call once
+ *   make flex_read_input
+ * to update read_input.c
+ *
+ * Author: Carsten Urbach
+ *         urbach@physik.fu-berlin.de
+ */
+#line 28 "read_input.l"
+#ifdef HAVE_CONFIG_H
+# include <config.h>
+#endif
+#include <stdlib.h>  /* NOTE(review): the names of these three system headers were   */
+#include <stdio.h>   /* missing from this copy of the file and are reconstructed;    */
+#include <string.h>  /* confirm them against read_input.l before regenerating.      */
+#include"global.h"
+#include"read_input.h"
+#include"default_input_values.h"
+
+ /* Name of the parsing routine: YY_DECL replaces the default yylex() header with parse_config() */
+#define YY_DECL int parse_config()
+#define YY_NO_UNPUT
+
+ /* declaration of input parameters (file-scope globals; only line_of_file and verbose carry initialisers here) */
+ int line_of_file=1;
+ int verbose=0;
+ int startoption;
+ int Ntherm;
+ int Nmeas;
+ int Nskip;
+ int integtyp;
+ int int_n[4];
+ double lambda[4];
+ int nsmall;
+ int solver_flag;
+ int gmres_m_parameter, gmresdr_nr_ev;
+ int operator_flag;
+ int matrix_element_flag;
+ int save_config_flag;
+ int save_prop_flag;
+ int save_prop_g2_flag;
+ int write_cp_flag;
+ int cp_interval;
+ int nstore;
+ int index_start, index_end;
+ int random_seed;
+ double dtau, tau;
+ int Nsteps;
+ char rlxd_input_filename[100];
+ char gauge_input_filename[100];
+ int first_prop_flag;
+ int max_solver_iterations;
+ double solver_precision;
+ int mass_number;
+ int read_source_flag;
+ char source_input_filename[100];
+ int return_check_flag, return_check_interval;
+ int source_format_flag;
+ int source_time_slice;
+ int gauge_precision_read_flag;
+ int gauge_precision_write_flag;
+ int prop_precision_flag;
+ /* gmres_m_parameter and gmresdr_nr_ev are already declared above, next to solver_flag; mirror this in read_input.l */
+ int reproduce_randomnumber_flag;
+ double stout_rho;
+ int stout_no_iter;
+ int use_stout_flag;
+ int phmc_no_flavours;
+ int phmc_heavy_timescale;
+ int phmc_exact_poly;
+ int compute_evs;
+ int phmc_compute_evs;
+ double stilde_max;
+ double stilde_min;
+ int degree_of_p;
+ int propagator_splitted;
+ int source_splitted;
+ int source_location;
+ int no_eigenvalues;
+ double eigenvalue_precision;
+ int sub_evs_cg_flag;
+ int even_odd_flag;
+ int write_prop_format_flag;
+ int online_measurement_flag;
+ int online_measurement_freq;
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+#line 2957 ""
+/* Start-condition numbers: the rule actions of the scanner switch between them with BEGIN(NAME). */
+#define INITIAL 0
+#define BETA 1
+#define STARTCOND 2
+#define THERMSWEEPS 3
+#define NMEAS 4
+#define KAPPA 5
+#define ACCPTILDE 6
+#define ACCHFIN 7
+#define RECEV 8
+#define MUBAR 9
+#define EPSBAR 10
+#define MU 11
+#define MU2 12
+#define MU3 13
+#define SEED 14
+#define Q1 15
+#define Q2 16
+#define DTAU 17
+#define TAU 18
+#define NSTEPS 19
+#define CSW 20
+#define INTTYP 21
+#define NSMALL 22
+#define NSKIP 23
+#define RLXDINPUTFILE 24
+#define GAUGEINPUTFILE 25
+#define GAUGERPREC 26
+#define GAUGEWPREC 27
+#define SOLVFLAG 28
+#define OPFLAG 29
+#define MEFLAG 30
+#define SAVECONF 31
+#define SAVEPROP 32
+#define SAVEPRG2 33
+#define WRITECP 34
+#define CPINT 35
+#define NSTORE 36
+#define TT 37
+#define LL 38
+#define LLX 39
+#define LLY 40
+#define LLZ 41
+#define NPROCX 42
+#define NPROCY 43
+#define NPROCZ 44
+#define IOPROC 45
+#define IDX 46
+#define FPROP 47
+#define CGMAX 48
+#define BCGMAX 49
+#define BOUND 50
+#define SITER 51
+#define SPREC 52
+#define MNR 53
+#define RGIC 54
+#define READSOURCE 55
+#define SOURCEFORMAT 56
+#define SOURCEFILE 57
+#define SOURCETS 58
+#define INT0 59
+#define INT1 60
+#define INT2 61
+#define INT3 62
+#define INT4 63
+#define LAMBDA0 64
+#define LAMBDA1 65
+#define LAMBDA2 66
+#define LAMBDA3 67
+#define LAMBDA4 68
+#define RELPREC 69
+#define FORCEPREC 70
+#define FORCEPREC1 71
+#define FORCEPREC2 72
+#define FORCEPREC3 73
+#define ACCPREC 74
+#define ACCPREC1 75
+#define ACCPREC2 76
+#define ACCPREC3 77
+#define REVCHECK 78
+#define REVINT 79
+#define DEBUG 80
+#define CSGN1 81
+#define CSGN2 82
+#define CSGN3 83
+#define GMRESM 84
+#define GMRESDRNEV 85
+#define REPRORND 86
+#define SLOPPYPREC 87
+#define USESTOUT 88
+#define STOUTRHO 89
+#define STOUTITER 90
+#define PHMCFLAV 91
+#define COMPUTEEVS 92
+#define PCOMPUTEEVS 93
+#define PPP 94
+#define SMAX 95
+#define SMIN 96
+#define DEGP 97
+#define SPLITPROP 98
+#define SPLITSOURCE 99
+#define SRCLOC 100
+#define SUBEVCG 101
+#define NOEV 102
+#define PRECEV 103
+#define HEAVYTS 104
+#define EO 105
+#define WRPROPFLAG 106
+#define PROPPREC 107
+#define PROPTYPE 108
+#define ONMEAS 109
+#define ONFREQ 110
+#define COMMENT 111
+#define ERROR 112
+
+#ifndef YY_NO_UNISTD_H
+/* Special case for "unistd.h", since it is non-ANSI. We include it way
+ * down here because we want the user's section 1 to have been scanned first.
+ * The user has a chance to override it with an option.
+ */
+#include <unistd.h>
+#endif
+
+#ifndef YY_EXTRA_TYPE
+#define YY_EXTRA_TYPE void *
+#endif
+
+static int yy_init_globals (void );
+
+/* Macros after this point can all be overridden by user definitions in
+ * section 1.
+ */
+
+#ifndef YY_SKIP_YYWRAP
+#ifdef __cplusplus
+extern "C" int yywrap (void );
+#else
+extern int yywrap (void );
+#endif
+#endif
+
+    static void yyunput (int c,char *buf_ptr  );
+
+#ifndef yytext_ptr
+static void yy_flex_strncpy (char *,yyconst char *,int );
+#endif
+
+#ifdef YY_NEED_STRLEN
+static int yy_flex_strlen (yyconst char * );
+#endif
+
+#ifndef YY_NO_INPUT
+
+#ifdef __cplusplus
+static int yyinput (void );
+#else
+static int input (void );
+#endif
+
+#endif
+
+/* Amount of stuff to slurp up with each read. */
+#ifndef YY_READ_BUF_SIZE
+#define YY_READ_BUF_SIZE 8192
+#endif
+
+/* Copy whatever the last rule matched to the standard output. */
+#ifndef ECHO
+/* This used to be an fputs(), but since the string might contain NUL's,
+ * we now use fwrite().
+ */
+#define ECHO (void) fwrite( yytext, yyleng, 1, yyout )
+#endif
+
+/* Gets input and stuffs it into "buf". Number of characters read, or YY_NULL,
+ * is returned in "result". Interactive buffers are read with getc() up to and including a newline; all others use fread(), retried after EINTR.
+ */
+#ifndef YY_INPUT
+#define YY_INPUT(buf,result,max_size) \
+    if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \
+        { \
+        int c = '*'; \
+        size_t n; \
+        for ( n = 0; n < max_size && \
+                 (c = getc( yyin )) != EOF && c != '\n'; ++n ) \
+            buf[n] = (char) c; \
+        if ( c == '\n' ) \
+            buf[n++] = (char) c; \
+        if ( c == EOF && ferror( yyin ) ) \
+            YY_FATAL_ERROR( "input in flex scanner failed" ); \
+        result = n; \
+        } \
+    else \
+        { \
+        errno=0; \
+        while ( (result = fread(buf, 1, max_size, yyin))==0 && ferror(yyin)) \
+            { \
+            if( errno != EINTR) \
+                { \
+                YY_FATAL_ERROR( "input in flex scanner failed" ); \
+                break; \
+                } \
+            errno=0; \
+            clearerr(yyin); \
+            } \
+        }\
+\
+
+#endif
+
+/* No semi-colon after return; correct usage is to write "yyterminate();" -
+ * we don't want an extra ';' after the "return" because that will cause
+ * some compilers to complain about unreachable statements.
+ */
+#ifndef yyterminate
+#define yyterminate() return YY_NULL
+#endif
+
+/* Number of entries by which start-condition stack grows. */
+#ifndef YY_START_STACK_INCR
+#define YY_START_STACK_INCR 25
+#endif
+
+/* Report a fatal error. */
+#ifndef YY_FATAL_ERROR
+#define YY_FATAL_ERROR(msg) yy_fatal_error( msg )
+#endif
+
+/* end tables serialization structures and prototypes */
+
+/* Default declaration of generated scanner - a define so the user can
+ * easily add parameters. Used only when section 1 has not already defined YY_DECL.
+ */
+#ifndef YY_DECL
+#define YY_DECL_IS_OURS 1
+
+extern int yylex (void);
+
+#define YY_DECL int yylex (void)
+#endif /* !YY_DECL */
+
+/* Code executed at the beginning of each rule, after yytext and yyleng
+ * have been set up.
+ */
+#ifndef YY_USER_ACTION
+#define YY_USER_ACTION
+#endif
+
+/* Code executed at the end of each rule. */
+#ifndef YY_BREAK
+#define YY_BREAK break;
+#endif
+
+#define YY_RULE_SETUP \
+    if ( yyleng > 0 ) \
+        YY_CURRENT_BUFFER_LVALUE->yy_at_bol = \
+                (yytext[yyleng - 1] == '\n'); \
+    YY_USER_ACTION
+
+/** The main scanner function which does all the work.
+ */ +YY_DECL +{ + register yy_state_type yy_current_state; + register char *yy_cp, *yy_bp; + register int yy_act; + +#line 222 "read_input.l" + +#line 3227 "" + + if ( !(yy_init) ) + { + (yy_init) = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! (yy_start) ) + (yy_start) = 1; /* first start state */ + + if ( ! yyin ) + yyin = stdin; + + if ( ! yyout ) + yyout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer(yyin,YY_BUF_SIZE ); + } + + yy_load_buffer_state( ); + } + + while ( 1 ) /* loops until end-of-file is reached */ + { + yy_cp = (yy_c_buf_p); + + /* Support of yytext. */ + *yy_cp = (yy_hold_char); + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. + */ + yy_bp = yy_cp; + + yy_current_state = (yy_start); + yy_current_state += YY_AT_BOL(); +yy_match: + do + { + register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)]; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 2644 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + ++yy_cp; + } + while ( yy_current_state != 2643 ); + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. 
*/ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = (yy_hold_char); + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + goto yy_find_action; + +case 1: +/* rule 1 can match eol */ +YY_RULE_SETUP +#line 223 "read_input.l" +BEGIN(TT); + YY_BREAK +case 2: +/* rule 2 can match eol */ +YY_RULE_SETUP +#line 224 "read_input.l" +BEGIN(LL); + YY_BREAK +case 3: +/* rule 3 can match eol */ +YY_RULE_SETUP +#line 225 "read_input.l" +BEGIN(LLX); + YY_BREAK +case 4: +/* rule 4 can match eol */ +YY_RULE_SETUP +#line 226 "read_input.l" +BEGIN(LLY); + YY_BREAK +case 5: +/* rule 5 can match eol */ +YY_RULE_SETUP +#line 227 "read_input.l" +BEGIN(LLZ); + YY_BREAK +case 6: +/* rule 6 can match eol */ +YY_RULE_SETUP +#line 228 "read_input.l" +BEGIN(NPROCX); + YY_BREAK +case 7: +/* rule 7 can match eol */ +YY_RULE_SETUP +#line 229 "read_input.l" +BEGIN(NPROCY); + YY_BREAK +case 8: +/* rule 8 can match eol */ +YY_RULE_SETUP +#line 230 "read_input.l" +BEGIN(NPROCZ); + YY_BREAK +case 9: +/* rule 9 can match eol */ +YY_RULE_SETUP +#line 231 "read_input.l" +BEGIN(KAPPA); + YY_BREAK +case 10: +/* rule 10 can match eol */ +YY_RULE_SETUP +#line 232 "read_input.l" +BEGIN(MU); + YY_BREAK +case 11: +/* rule 11 can match eol */ +YY_RULE_SETUP +#line 233 "read_input.l" +BEGIN(MU2); + YY_BREAK +case 12: +/* rule 12 can match eol */ +YY_RULE_SETUP +#line 234 "read_input.l" +BEGIN(MU3); + YY_BREAK +case 13: +/* rule 13 can match eol */ +YY_RULE_SETUP +#line 235 "read_input.l" +BEGIN(MUBAR); + YY_BREAK +case 14: +/* rule 14 can match eol */ +YY_RULE_SETUP +#line 236 "read_input.l" +BEGIN(MUBAR); + YY_BREAK +case 15: +/* rule 15 can match eol */ +YY_RULE_SETUP +#line 237 "read_input.l" +BEGIN(PPP); + YY_BREAK +case 16: +/* rule 16 can match eol */ +YY_RULE_SETUP +#line 238 "read_input.l" +BEGIN(EPSBAR); + YY_BREAK +case 17: +/* rule 17 can match eol */ +YY_RULE_SETUP +#line 
239 "read_input.l" +BEGIN(EPSBAR); + YY_BREAK +case 18: +/* rule 18 can match eol */ +YY_RULE_SETUP +#line 240 "read_input.l" +BEGIN(BETA); + YY_BREAK +case 19: +/* rule 19 can match eol */ +YY_RULE_SETUP +#line 241 "read_input.l" +BEGIN(ACCPTILDE); + YY_BREAK +case 20: +/* rule 20 can match eol */ +YY_RULE_SETUP +#line 242 "read_input.l" +BEGIN(ACCPTILDE); + YY_BREAK +case 21: +/* rule 21 can match eol */ +YY_RULE_SETUP +#line 243 "read_input.l" +BEGIN(ACCHFIN); + YY_BREAK +case 22: +/* rule 22 can match eol */ +YY_RULE_SETUP +#line 244 "read_input.l" +BEGIN(ACCHFIN); + YY_BREAK +case 23: +/* rule 23 can match eol */ +YY_RULE_SETUP +#line 245 "read_input.l" +BEGIN(RECEV); + YY_BREAK +case 24: +/* rule 24 can match eol */ +YY_RULE_SETUP +#line 246 "read_input.l" +BEGIN(RECEV); + YY_BREAK +case 25: +/* rule 25 can match eol */ +YY_RULE_SETUP +#line 247 "read_input.l" +BEGIN(NOEV); + YY_BREAK +case 26: +/* rule 26 can match eol */ +YY_RULE_SETUP +#line 248 "read_input.l" +BEGIN(PRECEV); + YY_BREAK +case 27: +/* rule 27 can match eol */ +YY_RULE_SETUP +#line 249 "read_input.l" +BEGIN(SEED); + YY_BREAK +case 28: +/* rule 28 can match eol */ +YY_RULE_SETUP +#line 250 "read_input.l" +BEGIN(STARTCOND); + YY_BREAK +case 29: +/* rule 29 can match eol */ +YY_RULE_SETUP +#line 251 "read_input.l" +BEGIN(THERMSWEEPS); + YY_BREAK +case 30: +/* rule 30 can match eol */ +YY_RULE_SETUP +#line 252 "read_input.l" +BEGIN(NMEAS); + YY_BREAK +case 31: +/* rule 31 can match eol */ +YY_RULE_SETUP +#line 253 "read_input.l" +BEGIN(NSKIP); + YY_BREAK +case 32: +/* rule 32 can match eol */ +YY_RULE_SETUP +#line 254 "read_input.l" +BEGIN(GAUGEINPUTFILE); + YY_BREAK +case 33: +/* rule 33 can match eol */ +YY_RULE_SETUP +#line 255 "read_input.l" +BEGIN(RLXDINPUTFILE); + YY_BREAK +case 34: +/* rule 34 can match eol */ +YY_RULE_SETUP +#line 256 "read_input.l" +BEGIN(SOLVFLAG); + YY_BREAK +case 35: +/* rule 35 can match eol */ +YY_RULE_SETUP +#line 257 "read_input.l" +BEGIN(SUBEVCG); + YY_BREAK 
+case 36: +/* rule 36 can match eol */ +YY_RULE_SETUP +#line 258 "read_input.l" +BEGIN(OPFLAG); + YY_BREAK +case 37: +/* rule 37 can match eol */ +YY_RULE_SETUP +#line 259 "read_input.l" +BEGIN(MEFLAG); + YY_BREAK +case 38: +/* rule 38 can match eol */ +YY_RULE_SETUP +#line 260 "read_input.l" +BEGIN(SAVECONF); + YY_BREAK +case 39: +/* rule 39 can match eol */ +YY_RULE_SETUP +#line 261 "read_input.l" +BEGIN(SAVEPROP); + YY_BREAK +case 40: +/* rule 40 can match eol */ +YY_RULE_SETUP +#line 262 "read_input.l" +BEGIN(SAVEPRG2); + YY_BREAK +case 41: +/* rule 41 can match eol */ +YY_RULE_SETUP +#line 263 "read_input.l" +BEGIN(WRITECP); + YY_BREAK +case 42: +/* rule 42 can match eol */ +YY_RULE_SETUP +#line 264 "read_input.l" +BEGIN(CPINT); + YY_BREAK +case 43: +/* rule 43 can match eol */ +YY_RULE_SETUP +#line 265 "read_input.l" +BEGIN(GAUGEINPUTFILE); + YY_BREAK +case 44: +/* rule 44 can match eol */ +YY_RULE_SETUP +#line 266 "read_input.l" +BEGIN(RLXDINPUTFILE); + YY_BREAK +case 45: +/* rule 45 can match eol */ +YY_RULE_SETUP +#line 267 "read_input.l" +BEGIN(NSTORE); + YY_BREAK +case 46: +/* rule 46 can match eol */ +YY_RULE_SETUP +#line 268 "read_input.l" +BEGIN(IOPROC); + YY_BREAK +case 47: +/* rule 47 can match eol */ +YY_RULE_SETUP +#line 269 "read_input.l" +BEGIN(IDX); + YY_BREAK +case 48: +/* rule 48 can match eol */ +YY_RULE_SETUP +#line 270 "read_input.l" +BEGIN(FPROP); + YY_BREAK +case 49: +/* rule 49 can match eol */ +YY_RULE_SETUP +#line 271 "read_input.l" +BEGIN(CSW); + YY_BREAK +case 50: +/* rule 50 can match eol */ +YY_RULE_SETUP +#line 272 "read_input.l" +BEGIN(Q1); + YY_BREAK +case 51: +/* rule 51 can match eol */ +YY_RULE_SETUP +#line 273 "read_input.l" +BEGIN(Q2); + YY_BREAK +case 52: +/* rule 52 can match eol */ +YY_RULE_SETUP +#line 274 "read_input.l" +BEGIN(INTTYP); + YY_BREAK +case 53: +/* rule 53 can match eol */ +YY_RULE_SETUP +#line 275 "read_input.l" +BEGIN(NSMALL); + YY_BREAK +case 54: +/* rule 54 can match eol */ +YY_RULE_SETUP +#line 276 
"read_input.l" +BEGIN(DTAU); + YY_BREAK +case 55: +/* rule 55 can match eol */ +YY_RULE_SETUP +#line 277 "read_input.l" +BEGIN(TAU); + YY_BREAK +case 56: +/* rule 56 can match eol */ +YY_RULE_SETUP +#line 278 "read_input.l" +BEGIN(NSTEPS); + YY_BREAK +case 57: +/* rule 57 can match eol */ +YY_RULE_SETUP +#line 279 "read_input.l" +BEGIN(BCGMAX); + YY_BREAK +case 58: +/* rule 58 can match eol */ +YY_RULE_SETUP +#line 280 "read_input.l" +BEGIN(CGMAX); + YY_BREAK +case 59: +/* rule 59 can match eol */ +YY_RULE_SETUP +#line 281 "read_input.l" +BEGIN(BOUND); + YY_BREAK +case 60: +/* rule 60 can match eol */ +YY_RULE_SETUP +#line 282 "read_input.l" +BEGIN(BOUND); + YY_BREAK +case 61: +/* rule 61 can match eol */ +YY_RULE_SETUP +#line 283 "read_input.l" +BEGIN(SITER); + YY_BREAK +case 62: +/* rule 62 can match eol */ +YY_RULE_SETUP +#line 284 "read_input.l" +BEGIN(SPREC); + YY_BREAK +case 63: +/* rule 63 can match eol */ +YY_RULE_SETUP +#line 285 "read_input.l" +BEGIN(MNR); + YY_BREAK +case 64: +/* rule 64 can match eol */ +YY_RULE_SETUP +#line 286 "read_input.l" +BEGIN(RGIC); + YY_BREAK +case 65: +/* rule 65 can match eol */ +YY_RULE_SETUP +#line 287 "read_input.l" +BEGIN(READSOURCE); + YY_BREAK +case 66: +/* rule 66 can match eol */ +YY_RULE_SETUP +#line 288 "read_input.l" +BEGIN(SOURCEFILE); + YY_BREAK +case 67: +/* rule 67 can match eol */ +YY_RULE_SETUP +#line 289 "read_input.l" +BEGIN(SOURCEFORMAT); + YY_BREAK +case 68: +/* rule 68 can match eol */ +YY_RULE_SETUP +#line 290 "read_input.l" +BEGIN(SOURCETS); + YY_BREAK +case 69: +/* rule 69 can match eol */ +YY_RULE_SETUP +#line 291 "read_input.l" +BEGIN(INT0); + YY_BREAK +case 70: +/* rule 70 can match eol */ +YY_RULE_SETUP +#line 292 "read_input.l" +BEGIN(INT0); + YY_BREAK +case 71: +/* rule 71 can match eol */ +YY_RULE_SETUP +#line 293 "read_input.l" +BEGIN(INT0); + YY_BREAK +case 72: +/* rule 72 can match eol */ +YY_RULE_SETUP +#line 294 "read_input.l" +BEGIN(INT1); + YY_BREAK +case 73: +/* rule 73 can match eol */ 
+YY_RULE_SETUP +#line 295 "read_input.l" +BEGIN(INT1); + YY_BREAK +case 74: +/* rule 74 can match eol */ +YY_RULE_SETUP +#line 296 "read_input.l" +BEGIN(INT2); + YY_BREAK +case 75: +/* rule 75 can match eol */ +YY_RULE_SETUP +#line 297 "read_input.l" +BEGIN(INT2); + YY_BREAK +case 76: +/* rule 76 can match eol */ +YY_RULE_SETUP +#line 298 "read_input.l" +BEGIN(INT3); + YY_BREAK +case 77: +/* rule 77 can match eol */ +YY_RULE_SETUP +#line 299 "read_input.l" +BEGIN(INT3); + YY_BREAK +case 78: +/* rule 78 can match eol */ +YY_RULE_SETUP +#line 300 "read_input.l" +BEGIN(INT4); + YY_BREAK +case 79: +/* rule 79 can match eol */ +YY_RULE_SETUP +#line 301 "read_input.l" +BEGIN(INT4); + YY_BREAK +case 80: +/* rule 80 can match eol */ +YY_RULE_SETUP +#line 302 "read_input.l" +BEGIN(LAMBDA0); + YY_BREAK +case 81: +/* rule 81 can match eol */ +YY_RULE_SETUP +#line 303 "read_input.l" +BEGIN(LAMBDA1); + YY_BREAK +case 82: +/* rule 82 can match eol */ +YY_RULE_SETUP +#line 304 "read_input.l" +BEGIN(LAMBDA2); + YY_BREAK +case 83: +/* rule 83 can match eol */ +YY_RULE_SETUP +#line 305 "read_input.l" +BEGIN(LAMBDA3); + YY_BREAK +case 84: +/* rule 84 can match eol */ +YY_RULE_SETUP +#line 306 "read_input.l" +BEGIN(LAMBDA4); + YY_BREAK +case 85: +/* rule 85 can match eol */ +YY_RULE_SETUP +#line 307 "read_input.l" +BEGIN(RELPREC); + YY_BREAK +case 86: +/* rule 86 can match eol */ +YY_RULE_SETUP +#line 308 "read_input.l" +BEGIN(FORCEPREC); + YY_BREAK +case 87: +/* rule 87 can match eol */ +YY_RULE_SETUP +#line 309 "read_input.l" +BEGIN(FORCEPREC1); + YY_BREAK +case 88: +/* rule 88 can match eol */ +YY_RULE_SETUP +#line 310 "read_input.l" +BEGIN(FORCEPREC2); + YY_BREAK +case 89: +/* rule 89 can match eol */ +YY_RULE_SETUP +#line 311 "read_input.l" +BEGIN(FORCEPREC3); + YY_BREAK +case 90: +/* rule 90 can match eol */ +YY_RULE_SETUP +#line 312 "read_input.l" +BEGIN(ACCPREC); + YY_BREAK +case 91: +/* rule 91 can match eol */ +YY_RULE_SETUP +#line 313 "read_input.l" +BEGIN(ACCPREC1); + 
YY_BREAK +case 92: +/* rule 92 can match eol */ +YY_RULE_SETUP +#line 314 "read_input.l" +BEGIN(ACCPREC2); + YY_BREAK +case 93: +/* rule 93 can match eol */ +YY_RULE_SETUP +#line 315 "read_input.l" +BEGIN(ACCPREC3); + YY_BREAK +case 94: +/* rule 94 can match eol */ +YY_RULE_SETUP +#line 316 "read_input.l" +BEGIN(REVCHECK); + YY_BREAK +case 95: +/* rule 95 can match eol */ +YY_RULE_SETUP +#line 317 "read_input.l" +BEGIN(REVINT); + YY_BREAK +case 96: +/* rule 96 can match eol */ +YY_RULE_SETUP +#line 318 "read_input.l" +BEGIN(DEBUG); + YY_BREAK +case 97: +/* rule 97 can match eol */ +YY_RULE_SETUP +#line 319 "read_input.l" +BEGIN(CSGN1); + YY_BREAK +case 98: +/* rule 98 can match eol */ +YY_RULE_SETUP +#line 320 "read_input.l" +BEGIN(CSGN1); + YY_BREAK +case 99: +/* rule 99 can match eol */ +YY_RULE_SETUP +#line 321 "read_input.l" +BEGIN(CSGN2); + YY_BREAK +case 100: +/* rule 100 can match eol */ +YY_RULE_SETUP +#line 322 "read_input.l" +BEGIN(CSGN2); + YY_BREAK +case 101: +/* rule 101 can match eol */ +YY_RULE_SETUP +#line 323 "read_input.l" +BEGIN(CSGN3); + YY_BREAK +case 102: +/* rule 102 can match eol */ +YY_RULE_SETUP +#line 324 "read_input.l" +BEGIN(CSGN3); + YY_BREAK +case 103: +/* rule 103 can match eol */ +YY_RULE_SETUP +#line 325 "read_input.l" +BEGIN(GMRESM); + YY_BREAK +case 104: +/* rule 104 can match eol */ +YY_RULE_SETUP +#line 326 "read_input.l" +BEGIN(GMRESDRNEV); + YY_BREAK +case 105: +/* rule 105 can match eol */ +YY_RULE_SETUP +#line 327 "read_input.l" +BEGIN(GAUGERPREC); + YY_BREAK +case 106: +/* rule 106 can match eol */ +YY_RULE_SETUP +#line 328 "read_input.l" +BEGIN(GAUGEWPREC); + YY_BREAK +case 107: +/* rule 107 can match eol */ +YY_RULE_SETUP +#line 329 "read_input.l" +BEGIN(PROPPREC); + YY_BREAK +case 108: +/* rule 108 can match eol */ +YY_RULE_SETUP +#line 330 "read_input.l" +BEGIN(REPRORND); + YY_BREAK +case 109: +/* rule 109 can match eol */ +YY_RULE_SETUP +#line 331 "read_input.l" +BEGIN(SLOPPYPREC); + YY_BREAK +case 110: +/* rule 110 
can match eol */ +YY_RULE_SETUP +#line 332 "read_input.l" +BEGIN(USESTOUT); + YY_BREAK +case 111: +/* rule 111 can match eol */ +YY_RULE_SETUP +#line 333 "read_input.l" +BEGIN(STOUTRHO); + YY_BREAK +case 112: +/* rule 112 can match eol */ +YY_RULE_SETUP +#line 334 "read_input.l" +BEGIN(STOUTITER); + YY_BREAK +case 113: +/* rule 113 can match eol */ +YY_RULE_SETUP +#line 335 "read_input.l" +BEGIN(PHMCFLAV); + YY_BREAK +case 114: +/* rule 114 can match eol */ +YY_RULE_SETUP +#line 336 "read_input.l" +BEGIN(PCOMPUTEEVS); + YY_BREAK +case 115: +/* rule 115 can match eol */ +YY_RULE_SETUP +#line 337 "read_input.l" +BEGIN(COMPUTEEVS); + YY_BREAK +case 116: +/* rule 116 can match eol */ +YY_RULE_SETUP +#line 338 "read_input.l" +BEGIN(SMAX); + YY_BREAK +case 117: +/* rule 117 can match eol */ +YY_RULE_SETUP +#line 339 "read_input.l" +BEGIN(SMIN); + YY_BREAK +case 118: +/* rule 118 can match eol */ +YY_RULE_SETUP +#line 340 "read_input.l" +BEGIN(DEGP); + YY_BREAK +case 119: +/* rule 119 can match eol */ +YY_RULE_SETUP +#line 341 "read_input.l" +BEGIN(SPLITPROP); + YY_BREAK +case 120: +/* rule 120 can match eol */ +YY_RULE_SETUP +#line 342 "read_input.l" +BEGIN(SPLITSOURCE); + YY_BREAK +case 121: +/* rule 121 can match eol */ +YY_RULE_SETUP +#line 343 "read_input.l" +BEGIN(SRCLOC); + YY_BREAK +case 122: +/* rule 122 can match eol */ +YY_RULE_SETUP +#line 344 "read_input.l" +BEGIN(HEAVYTS); + YY_BREAK +case 123: +/* rule 123 can match eol */ +YY_RULE_SETUP +#line 345 "read_input.l" +BEGIN(EO); + YY_BREAK +case 124: +/* rule 124 can match eol */ +YY_RULE_SETUP +#line 346 "read_input.l" +BEGIN(WRPROPFLAG); + YY_BREAK +case 125: +/* rule 125 can match eol */ +YY_RULE_SETUP +#line 347 "read_input.l" +BEGIN(WRPROPFLAG); + YY_BREAK +case 126: +/* rule 126 can match eol */ +YY_RULE_SETUP +#line 348 "read_input.l" +BEGIN(ONMEAS); + YY_BREAK +case 127: +/* rule 127 can match eol */ +YY_RULE_SETUP +#line 349 "read_input.l" +BEGIN(ONFREQ); + YY_BREAK +case 128: +YY_RULE_SETUP +#line 351 
"read_input.l" +{ +#ifndef FIXEDVOLUME + T_global = atoi(yytext); +#endif + if(verbose!=0) printf("T =%s\n", yytext); +} + YY_BREAK +case 129: +YY_RULE_SETUP +#line 357 "read_input.l" +{ +#ifndef FIXEDVOLUME + L = atoi(yytext); +#endif + if(verbose!=0) printf("L =%s\n", yytext); +} + YY_BREAK +case 130: +YY_RULE_SETUP +#line 363 "read_input.l" +{ +#ifndef FIXEDVOLUME + LX = atoi(yytext); +#endif + if(verbose!=0) printf("LX =%s\n", yytext); +} + YY_BREAK +case 131: +YY_RULE_SETUP +#line 369 "read_input.l" +{ +#ifndef FIXEDVOLUME + LY = atoi(yytext); +#endif + if(verbose!=0) printf("LY =%s\n", yytext); +} + YY_BREAK +case 132: +YY_RULE_SETUP +#line 375 "read_input.l" +{ +#ifndef FIXEDVOLUME + LZ = atoi(yytext); +#endif + if(verbose!=0) printf("LZ =%s\n", yytext); +} + YY_BREAK +case 133: +YY_RULE_SETUP +#line 381 "read_input.l" +{ +#ifndef FIXEDVOLUME + N_PROC_X = atoi(yytext); +#endif + if(verbose!=0) printf("Nr of processors in x direction = %s\n", yytext); +} + YY_BREAK +case 134: +YY_RULE_SETUP +#line 387 "read_input.l" +{ +#ifndef FIXEDVOLUME + N_PROC_Y = atoi(yytext); +#endif + if(verbose!=0) printf("Nr of processors in y direction = %s\n", yytext); +} + YY_BREAK +case 135: +YY_RULE_SETUP +#line 393 "read_input.l" +{ +#ifndef FIXEDVOLUME + N_PROC_Z = atoi(yytext); +#endif + if(verbose!=0) printf("Nr of processors in z direction = %s\n", yytext); +} + YY_BREAK +case 136: +YY_RULE_SETUP +#line 399 "read_input.l" +{ + random_seed=atoi(yytext); + if(verbose!=0) printf("seed=%s \n", yytext); +} + YY_BREAK +case 137: +YY_RULE_SETUP +#line 403 "read_input.l" +{ + g_kappa=atof(yytext); + if(verbose!=0) printf("kappa=%s \n", yytext); +} + YY_BREAK +case 138: +YY_RULE_SETUP +#line 407 "read_input.l" +{ + g_acc_Ptilde=atof(yytext); + if(verbose!=0) printf("Acc_Ptilde=%s \n", yytext); +} + YY_BREAK +case 139: +YY_RULE_SETUP +#line 411 "read_input.l" +{ + g_acc_Hfin=atof(yytext); + if(verbose!=0) printf("Acc_Hfin=%s \n", yytext); +} + YY_BREAK +case 140: +YY_RULE_SETUP 
+#line 415 "read_input.l" +{ + g_rec_ev = atoi(yytext); + if(verbose!=0) printf("Rec_EV=%s \n", yytext); +} + YY_BREAK +case 141: +YY_RULE_SETUP +#line 419 "read_input.l" +{ + g_mubar=atof(yytext); + if(verbose!=0) printf("mubar=%s \n", yytext); +} + YY_BREAK +case 142: +YY_RULE_SETUP +#line 423 "read_input.l" +{ + g_epsbar=atof(yytext); + if(verbose!=0) printf("epsbar=%s \n", yytext); +} + YY_BREAK +case 143: +YY_RULE_SETUP +#line 427 "read_input.l" +{ + g_mu1=atof(yytext); + if(verbose!=0) printf("mu=%s \n", yytext); +} + YY_BREAK +case 144: +YY_RULE_SETUP +#line 431 "read_input.l" +{ + g_mu2=atof(yytext); + if(verbose!=0) printf("mu2=%s \n", yytext); +} + YY_BREAK +case 145: +YY_RULE_SETUP +#line 435 "read_input.l" +{ + g_mu3=atof(yytext); + if(verbose!=0) printf("mu3=%s \n", yytext); +} + YY_BREAK +case 146: +YY_RULE_SETUP +#line 439 "read_input.l" +{ + g_beta=atof(yytext); + if(verbose!=0) printf("beta=%s \n",yytext); +} + YY_BREAK +case 147: +YY_RULE_SETUP +#line 443 "read_input.l" +{ + startoption=0; + if(verbose!=0) printf("Start Condition is %s \n",yytext); +} + YY_BREAK +case 148: +YY_RULE_SETUP +#line 447 "read_input.l" +{ + startoption=1; + if(verbose!=0) printf("Start Condition is %s \n",yytext); +} + YY_BREAK +case 149: +YY_RULE_SETUP +#line 451 "read_input.l" +{ + startoption=2; + if(verbose!=0) printf("Start Condition is %s \n",yytext); +} + YY_BREAK +case 150: +YY_RULE_SETUP +#line 455 "read_input.l" +{ + startoption=3; + if(verbose!=0) printf("Start Condition is %s \n",yytext); +} + YY_BREAK +case 151: +YY_RULE_SETUP +#line 459 "read_input.l" +{ + Ntherm=atoi(yytext); + if(verbose!=0) printf("Nterm= %s \n",yytext); +} + YY_BREAK +case 152: +YY_RULE_SETUP +#line 463 "read_input.l" +{ + Nmeas=atoi(yytext); + if(verbose!=0) printf("Nmeas= %s \n",yytext); +} + YY_BREAK +case 153: +YY_RULE_SETUP +#line 467 "read_input.l" +{ + Nskip=atoi(yytext); + if(verbose!=0) printf("Nskip= %s \n",yytext); +} + YY_BREAK +case 154: +YY_RULE_SETUP +#line 471 
"read_input.l" +{ + solver_flag=0; + if(verbose!=0) printf("Use BiCGStab Solver"); +} + YY_BREAK +case 155: +YY_RULE_SETUP +#line 475 "read_input.l" +{ + solver_flag=1; + if(verbose!=0) printf("Use CG Solver\n"); +} + YY_BREAK +case 156: +YY_RULE_SETUP +#line 479 "read_input.l" +{ + solver_flag=9; + if(verbose!=0) printf("Use PCG Solver (eigenvectors needed) \n"); +} + YY_BREAK +case 157: +YY_RULE_SETUP +#line 483 "read_input.l" +{ + solver_flag=2; + if(verbose!=0) printf("Use GMRES Solver\n"); +} + YY_BREAK +case 158: +YY_RULE_SETUP +#line 487 "read_input.l" +{ + solver_flag=7; + if(verbose!=0) printf("Use GCR Solver\n"); +} + YY_BREAK +case 159: +YY_RULE_SETUP +#line 491 "read_input.l" +{ + solver_flag=8; + if(verbose!=0) printf("Use GMRES-DR Solver\n"); +} + YY_BREAK +case 160: +YY_RULE_SETUP +#line 495 "read_input.l" +{ + solver_flag=3; + if(verbose!=0) printf("Use CGS Solver\n"); +} + YY_BREAK +case 161: +YY_RULE_SETUP +#line 499 "read_input.l" +{ + solver_flag=4; + if(verbose!=0) printf("Use MR Solver \n"); +} + YY_BREAK +case 162: +YY_RULE_SETUP +#line 503 "read_input.l" +{ + solver_flag=5; + if(verbose!=0) printf("Use BiCGstab(2) Solver \n"); +} + YY_BREAK +case 163: +YY_RULE_SETUP +#line 507 "read_input.l" +{ + solver_flag=6; + if(verbose!=0) printf("Use FGMRES solver (eigenvectors needed) \n"); +} + YY_BREAK +case 164: +YY_RULE_SETUP +#line 511 "read_input.l" +{ + gmres_m_parameter = atoi(yytext); + if(verbose!=0) printf("Use Krylov Space of size %d in GMRES \n", gmres_m_parameter); +} + YY_BREAK +case 165: +YY_RULE_SETUP +#line 515 "read_input.l" +{ + gmresdr_nr_ev = atoi(yytext); + if(verbose!=0) printf("Deflate %d eigenvectors in GMRES-DR \n", gmresdr_nr_ev); +} + YY_BREAK +case 166: +YY_RULE_SETUP +#line 519 "read_input.l" +{ + max_solver_iterations = atoi(yytext); + if(verbose!=0) printf("Use %d iterations in the solvers!\n", max_solver_iterations); +} + YY_BREAK +case 167: +YY_RULE_SETUP +#line 523 "read_input.l" +{ + solver_precision = 
atof(yytext); + if(verbose!=0) printf("Use %e as convergence precision for the solvers!\n", solver_precision); +} + YY_BREAK +case 168: +YY_RULE_SETUP +#line 527 "read_input.l" +{ + operator_flag=2; + if(verbose!=0) printf("Operator Flag is set to %s\n",yytext); +} + YY_BREAK +case 169: +YY_RULE_SETUP +#line 531 "read_input.l" +{ + operator_flag=1; + if(verbose!=0) printf("Operator Flag is set to %s\n",yytext); +} + YY_BREAK +case 170: +YY_RULE_SETUP +#line 535 "read_input.l" +{ + operator_flag=0; + if(verbose!=0) printf("Operator Flag is set to %s\n",yytext); +} + YY_BREAK +case 171: +YY_RULE_SETUP +#line 539 "read_input.l" +{ + matrix_element_flag=1; + if(verbose!=0) printf("Compute Matrix Elements: %s\n", yytext); +} + YY_BREAK +case 172: +YY_RULE_SETUP +#line 543 "read_input.l" +{ + matrix_element_flag=0; + if(verbose!=0) printf("Compute Matrix Elements: %s\n", yytext); +} + YY_BREAK +case 173: +YY_RULE_SETUP +#line 547 "read_input.l" +{ + save_config_flag=1; + if(verbose!=0) printf("Save configurations\n"); +} + YY_BREAK +case 174: +YY_RULE_SETUP +#line 551 "read_input.l" +{ + save_config_flag=0; + if(verbose!=0) printf("Don't save configurations\n"); +} + YY_BREAK +case 175: +YY_RULE_SETUP +#line 555 "read_input.l" +{ + save_prop_flag=1; + if(verbose!=0) printf("Save propagators\n"); +} + YY_BREAK +case 176: +YY_RULE_SETUP +#line 559 "read_input.l" +{ + save_prop_flag=0; + if(verbose!=0) printf("Don't save propagators\n"); +} + YY_BREAK +case 177: +YY_RULE_SETUP +#line 563 "read_input.l" +{ + save_prop_g2_flag=1; + if(verbose!=0) printf("Save generalized propagators\n"); +} + YY_BREAK +case 178: +YY_RULE_SETUP +#line 567 "read_input.l" +{ + save_prop_g2_flag=0; + if(verbose!=0) printf("Don't save generalized propagators\n"); +} + YY_BREAK +case 179: +YY_RULE_SETUP +#line 571 "read_input.l" +{ + write_cp_flag=1; + if(verbose!=0) printf("Write Checkpoints\n"); +} + YY_BREAK +case 180: +YY_RULE_SETUP +#line 575 "read_input.l" +{ + write_cp_flag=0; + 
if(verbose!=0) printf("Don't write Checkpoints\n"); +} + YY_BREAK +case 181: +YY_RULE_SETUP +#line 579 "read_input.l" +{ + cp_interval=atoi(yytext); + if(verbose!=0) printf("Write Checkpoint all %s measurements\n",yytext); +} + YY_BREAK +case 182: +YY_RULE_SETUP +#line 583 "read_input.l" +{ + strcpy(rlxd_input_filename,yytext); + if(verbose!=0) printf("Ranluxd input filename set to %s\n",yytext); +} + YY_BREAK +case 183: +YY_RULE_SETUP +#line 587 "read_input.l" +{ + strcpy(gauge_input_filename,yytext); + if(verbose!=0) printf("Gauge Configuration input filename set to %s\n",yytext); +} + YY_BREAK +case 184: +YY_RULE_SETUP +#line 591 "read_input.l" +{ + nstore=atoi(yytext); + if(verbose!=0) printf("Initial store counter set to %s\n",yytext); +} + YY_BREAK +case 185: +YY_RULE_SETUP +#line 595 "read_input.l" +{ + nstore=-1; + if(verbose!=0) printf("Trying to read InitialStoreCounter from file .nstore_counter\n"); +} + YY_BREAK +case 186: +YY_RULE_SETUP +#line 599 "read_input.l" +{ + g_stdio_proc = -1; + if(verbose!=0) printf("All processors will give output to stdout\n"); +} + YY_BREAK +case 187: +YY_RULE_SETUP +#line 603 "read_input.l" +{ + g_stdio_proc = -2; + if(verbose!=0) printf("No processor will give output to stdout\n"); +} + YY_BREAK +case 188: +YY_RULE_SETUP +#line 607 "read_input.l" +{ + g_stdio_proc = atoi(yytext); + if(verbose!=0) printf("processor %s will give output to stdout\n", yytext); +} + YY_BREAK +case 189: +YY_RULE_SETUP +#line 611 "read_input.l" +{ + index_start = atoi(yytext); + index_end = index_start+1; + if((index_start < 0)||(index_start >11)){ + printf("Error in line %d! index_start must be in [0,11]! Exiting...!\n", line_of_file); + exit(1); + } + if(verbose!=0) printf("inverting for index %s\n", yytext); +} + YY_BREAK +case 190: +YY_RULE_SETUP +#line 620 "read_input.l" +{ + sscanf(yytext, "-%d", &index_end); + if((index_end < 0)||(index_end >11)){ + printf("Error in line %d! index_end must be in [0,11]! 
Exiting...!\n", line_of_file); + exit(1); + } + if((index_end < 0)||(index_end >11)){ + printf("Warnig! index_end bigger than index_start. Will compute no propagator!\n"); + } + if(verbose!=0) printf("inverting up to color index %d\n", index_end); + index_end+=1; +} + YY_BREAK +case 191: +YY_RULE_SETUP +#line 632 "read_input.l" +{ + first_prop_flag = -1; + if(verbose!=0) printf("Do not compute the first propagator (default)\n"); +} + YY_BREAK +case 192: +YY_RULE_SETUP +#line 636 "read_input.l" +{ + first_prop_flag = 0; + if(verbose!=0) printf("Computing the first propagator (default)\n"); +} + YY_BREAK +case 193: +YY_RULE_SETUP +#line 640 "read_input.l" +{ + first_prop_flag = 1; + if(verbose!=0) printf("Reading in the first propagator\n"); +} + YY_BREAK +case 194: +YY_RULE_SETUP +#line 644 "read_input.l" +{ + integtyp = 1; + if(verbose!=0) printf("Using Leap Frog integrator!\n"); +} + YY_BREAK +case 195: +YY_RULE_SETUP +#line 648 "read_input.l" +{ + integtyp = 2; + if(verbose!=0) printf("Using SW integrator!\n"); +} + YY_BREAK +case 196: +YY_RULE_SETUP +#line 652 "read_input.l" +{ + integtyp = 3; + if(verbose!=0) printf("Using multiple time scale Leapfrog integrator!\n"); +} + YY_BREAK +case 197: +YY_RULE_SETUP +#line 656 "read_input.l" +{ + integtyp = 4; + if(verbose!=0) printf("Using multiple time scale Sexton-Weingarten integrator!\n"); +} + YY_BREAK +case 198: +YY_RULE_SETUP +#line 660 "read_input.l" +{ + integtyp = 5; + if(verbose!=0) printf("Using higher order Leapfrog integrator!\n"); +} + YY_BREAK +case 199: +YY_RULE_SETUP +#line 664 "read_input.l" +{ + integtyp = 6; + if(verbose!=0) printf("Using Second order Minimal norm integrator!\n"); +} + YY_BREAK +case 200: +YY_RULE_SETUP +#line 668 "read_input.l" +{ + integtyp = 7; + if(verbose!=0) printf("Using Second order Minimal norm integrator (position version)!\n"); +} + YY_BREAK +case 201: +YY_RULE_SETUP +#line 672 "read_input.l" +{ + nsmall = atoi(yytext); + if(verbose!=0) printf("nsmall set to %d\n", 
nsmall); +} + YY_BREAK +case 202: +YY_RULE_SETUP +#line 676 "read_input.l" +{ + g_c_sw = atof(yytext); + if(verbose!=0) printf("c_sw set to %e\n", g_c_sw); +} + YY_BREAK +case 203: +YY_RULE_SETUP +#line 680 "read_input.l" +{ + dtau = atof(yytext); + if(verbose!=0) printf("dtau set to %e\n", dtau); +} + YY_BREAK +case 204: +YY_RULE_SETUP +#line 684 "read_input.l" +{ + tau = atof(yytext); + if(verbose!=0) printf("tau set to %e\n", tau); +} + YY_BREAK +case 205: +YY_RULE_SETUP +#line 688 "read_input.l" +{ + Nsteps = atoi(yytext); + if(verbose!=0) printf("NSteps set to %d\n", Nsteps); +} + YY_BREAK +case 206: +YY_RULE_SETUP +#line 692 "read_input.l" +{ + ITER_MAX_BCG = atoi(yytext); + if(verbose != 0) printf("Maximal number of iterations for BCGstab set ro %d\n", ITER_MAX_BCG); +} + YY_BREAK +case 207: +YY_RULE_SETUP +#line 696 "read_input.l" +{ + ITER_MAX_CG = atoi(yytext); + if(verbose != 0) printf("Maximal number of iterations for CG set ro %d\n", ITER_MAX_CG); +} + YY_BREAK +case 208: +YY_RULE_SETUP +#line 700 "read_input.l" +{ + X0 = atof(yytext); + if(verbose != 0) printf("X0 for boundary cond. 
in time set to %e\n", X0); +} + YY_BREAK +case 209: +YY_RULE_SETUP +#line 704 "read_input.l" +{ + mass_number = atoi(yytext); + if(verbose != 0) printf("Setting mass number to %s\n", yytext); +} + YY_BREAK +case 210: +YY_RULE_SETUP +#line 708 "read_input.l" +{ + g_rgi_C1=atof(yytext); + if(verbose!=0) printf("g_rgi_C1=%s \n", yytext); +} + YY_BREAK +case 211: +YY_RULE_SETUP +#line 712 "read_input.l" +{ + read_source_flag=1; + if(verbose!=0) printf("Read inversion source from file\n"); +} + YY_BREAK +case 212: +YY_RULE_SETUP +#line 716 "read_input.l" +{ + read_source_flag=0; + if(verbose!=0) printf("Don't read inversion source from file\n"); +} + YY_BREAK +case 213: +YY_RULE_SETUP +#line 720 "read_input.l" +{ + strcpy(source_input_filename,yytext); + if(verbose!=0) printf("source input filename set to %s\n",yytext); +} + YY_BREAK +case 214: +YY_RULE_SETUP +#line 724 "read_input.l" +{ + source_format_flag = 0; + if(verbose!=0) printf("Using standard ETMC binary format for source input file\n"); +} + YY_BREAK +case 215: +YY_RULE_SETUP +#line 728 "read_input.l" +{ + source_format_flag = 1; + if(verbose!=0) printf("Using CM format for source input file\n"); +} + YY_BREAK +case 216: +YY_RULE_SETUP +#line 732 "read_input.l" +{ + source_format_flag = 2; + if(verbose!=0) printf("Using GWC format for source input file\n"); +} + YY_BREAK +case 217: +YY_RULE_SETUP +#line 736 "read_input.l" +{ + source_time_slice = atoi(yytext); + if(verbose!=0) printf("Using only timeslice %s of the source, padding the rest with zeros\n", yytext); +} + YY_BREAK +case 218: +YY_RULE_SETUP +#line 740 "read_input.l" +{ + int_n[0] = atoi(yytext); + if(verbose!=0) printf("Number of steps in ExtLeapFrog integrator for gauge set to %d!\n", int_n[0]); +} + YY_BREAK +case 219: +YY_RULE_SETUP +#line 744 "read_input.l" +{ + int_n[1] = atoi(yytext); + if(verbose!=0) printf("Number of steps in ExtLeapFrog integrator for psf 1 (mu) set to %d!\n", int_n[1]); +} + YY_BREAK +case 220: +YY_RULE_SETUP +#line 748 
"read_input.l" +{ + int_n[2] = atoi(yytext); + if(verbose!=0) printf("Number of steps in ExtLeapFrog integrator for psf 2 (mu2) set to %d!\n", int_n[2]); +} + YY_BREAK +case 221: +YY_RULE_SETUP +#line 752 "read_input.l" +{ + int_n[3] = atoi(yytext); + if(verbose!=0) printf("Number of steps in ExtLeapFrog integrator for psf 3 (mu3) set to %d!\n", int_n[3]); +} + YY_BREAK +case 222: +YY_RULE_SETUP +#line 756 "read_input.l" +{ + if(verbose!=0) printf("Number of steps in ExtLeapFrog integrator for psf 4 (mu4) set to %d!\n", int_n[1]); +} + YY_BREAK +case 223: +YY_RULE_SETUP +#line 759 "read_input.l" +{ + lambda[0] = atof(yytext); + if(verbose!=0) printf("Set lambda parameter for gauge fields (in the 2MN integrator) to %f!\n", lambda[0]); +} + YY_BREAK +case 224: +YY_RULE_SETUP +#line 763 "read_input.l" +{ + lambda[1] = atof(yytext); + if(verbose!=0) printf("Set lambda parameter for psf 1 (in the 2MN integrator) to %f!\n", lambda[0]); +} + YY_BREAK +case 225: +YY_RULE_SETUP +#line 767 "read_input.l" +{ + lambda[2] = atof(yytext); + if(verbose!=0) printf("Set lambda parameter for psf 2 (in the 2MN integrator) to %f!\n", lambda[0]); +} + YY_BREAK +case 226: +YY_RULE_SETUP +#line 771 "read_input.l" +{ + lambda[3] = atof(yytext); + if(verbose!=0) printf("Set lambda parameter for psf 3 (in the 2MN integrator) to %f!\n", lambda[0]); +} + YY_BREAK +case 227: +YY_RULE_SETUP +#line 775 "read_input.l" +{ + if(verbose!=0) printf("Set lambda parameter for psf 4 (in the 2MN integrator) to %f! 
(not yet implemented)\n", lambda[0]); +} + YY_BREAK +case 228: +YY_RULE_SETUP +#line 778 "read_input.l" +{ + g_eps_sq_force=atof(yytext); + if(verbose!=0) printf("g_eps_sq_force=%s Residual for inversions in the force computation\n", yytext); +} + YY_BREAK +case 229: +YY_RULE_SETUP +#line 782 "read_input.l" +{ + g_eps_sq_force1=atof(yytext); + if(verbose!=0) printf("g_eps_sq_force(mu)=%s Residual for inversions in the force computation\n", yytext); +} + YY_BREAK +case 230: +YY_RULE_SETUP +#line 786 "read_input.l" +{ + g_eps_sq_force2=atof(yytext); + if(verbose!=0) printf("g_eps_sq_force(mu2)=%s Residual for inversions in the force computation\n", yytext); +} + YY_BREAK +case 231: +YY_RULE_SETUP +#line 790 "read_input.l" +{ + g_eps_sq_force3=atof(yytext); + if(verbose!=0) printf("g_eps_sq_force(mu3)=%s Residual for inversions in the force computation\n", yytext); +} + YY_BREAK +case 232: +YY_RULE_SETUP +#line 794 "read_input.l" +{ + g_eps_sq_acc=atof(yytext); + if(verbose!=0) printf("g_eps_sq_acc=%s Residual for inversions in the acceptance step\n", yytext); +} + YY_BREAK +case 233: +YY_RULE_SETUP +#line 798 "read_input.l" +{ + g_eps_sq_acc1=atof(yytext); + if(verbose!=0) printf("g_eps_sq_acc(mu)=%s Residual for inversions in the acceptance step\n", yytext); +} + YY_BREAK +case 234: +YY_RULE_SETUP +#line 802 "read_input.l" +{ + g_eps_sq_acc2=atof(yytext); + if(verbose!=0) printf("g_eps_sq_acc(mu2)=%s Residual for inversions in the acceptance step\n", yytext); +} + YY_BREAK +case 235: +YY_RULE_SETUP +#line 806 "read_input.l" +{ + g_eps_sq_acc3=atof(yytext); + if(verbose!=0) printf("g_eps_sq_acc(mu3)=%s Residual for inversions in the acceptance step\n", yytext); +} + YY_BREAK +case 236: +YY_RULE_SETUP +#line 810 "read_input.l" +{ + g_relative_precision_flag = 1; + if(verbose!=0) printf("Using relative precision\n"); +} + YY_BREAK +case 237: +YY_RULE_SETUP +#line 814 "read_input.l" +{ + g_relative_precision_flag = 0; + if(verbose!=0) printf("Using absolute 
precision\n"); +} + YY_BREAK +case 238: +YY_RULE_SETUP +#line 818 "read_input.l" +{ + return_check_flag = 1; + if(verbose!=0) printf("Perform checks of Reversibility\n"); +} + YY_BREAK +case 239: +YY_RULE_SETUP +#line 822 "read_input.l" +{ + return_check_flag = 0; + if(verbose!=0) printf("Don't perform checks of Reversibility\n"); +} + YY_BREAK +case 240: +YY_RULE_SETUP +#line 826 "read_input.l" +{ + return_check_interval = atoi(yytext); + if(verbose!=0) printf("Check reversibility all %d trajectories\n", return_check_interval); +} + YY_BREAK +case 241: +YY_RULE_SETUP +#line 830 "read_input.l" +{ + g_debug_level = atoi(yytext); + if(verbose!=0) printf("Debug level = %d\n", g_debug_level); +} + YY_BREAK +case 242: +YY_RULE_SETUP +#line 834 "read_input.l" +{ + g_csg_N[0] = atoi(yytext); + if(verbose!=0) printf("Chronological Invertier history length for mu set to %d\n", g_csg_N[0]); +} + YY_BREAK +case 243: +YY_RULE_SETUP +#line 838 "read_input.l" +{ + g_csg_N[2] = atoi(yytext); + if(verbose!=0) printf("Chronological Invertier history length for mu set to %d\n", g_csg_N[2]); +} + YY_BREAK +case 244: +YY_RULE_SETUP +#line 842 "read_input.l" +{ + g_csg_N[4] = atoi(yytext); + if(verbose!=0) printf("Chronological Invertier history length for mu set to %d\n", g_csg_N[4]); +} + YY_BREAK +case 245: +YY_RULE_SETUP +#line 846 "read_input.l" +{ + gauge_precision_read_flag = 32; + if(verbose!=0) printf("Read gauges in 32 Bit precision!\n"); +} + YY_BREAK +case 246: +YY_RULE_SETUP +#line 850 "read_input.l" +{ + gauge_precision_read_flag = 64; + if(verbose!=0) printf("Read gauges in 64 Bit precision!\n"); +} + YY_BREAK +case 247: +YY_RULE_SETUP +#line 854 "read_input.l" +{ + gauge_precision_write_flag = 32; + if(verbose!=0) printf("Save gauges in 32 Bit precision!\n"); +} + YY_BREAK +case 248: +YY_RULE_SETUP +#line 858 "read_input.l" +{ + gauge_precision_write_flag = 64; + if(verbose!=0) printf("Save gauges in 64 Bit precision!\n"); +} + YY_BREAK +case 249: +YY_RULE_SETUP +#line 
862 "read_input.l" +{ + prop_precision_flag = 32; + if(verbose!=0) printf("Save propagators in 32 Bit precision!\n"); +} + YY_BREAK +case 250: +YY_RULE_SETUP +#line 866 "read_input.l" +{ + prop_precision_flag = 64; + if(verbose!=0) printf("Save propagators in 64 Bit precision!\n"); +} + YY_BREAK +case 251: +YY_RULE_SETUP +#line 870 "read_input.l" +{ + reproduce_randomnumber_flag = 1; + if(verbose!=0) printf("Use reproducable randomnumbers!\n"); +} + YY_BREAK +case 252: +YY_RULE_SETUP +#line 874 "read_input.l" +{ + reproduce_randomnumber_flag = 0; + if(verbose!=0) printf("Use a different seed for each process in ranlxd!\n"); +} + YY_BREAK +case 253: +YY_RULE_SETUP +#line 878 "read_input.l" +{ + g_sloppy_precision_flag = 1; + if(verbose!=0) printf("Use sloppy precision if available!\n"); +} + YY_BREAK +case 254: +YY_RULE_SETUP +#line 882 "read_input.l" +{ + g_sloppy_precision_flag = 0; + if(verbose!=0) printf("Don't use sloppy precision!\n"); +} + YY_BREAK +case 255: +YY_RULE_SETUP +#line 886 "read_input.l" +{ + use_stout_flag = 1; + if(verbose!=0) printf("Use stout smearing for invert!\n"); +} + YY_BREAK +case 256: +YY_RULE_SETUP +#line 890 "read_input.l" +{ + use_stout_flag = 0; + if(verbose!=0) printf("Don't use stout smearing for invert!\n"); +} + YY_BREAK +case 257: +YY_RULE_SETUP +#line 894 "read_input.l" +{ + stout_rho=atof(yytext); + if(verbose!=0) printf("use stout rho=%e!\n", stout_rho); +} + YY_BREAK +case 258: +YY_RULE_SETUP +#line 898 "read_input.l" +{ + stout_no_iter=atoi(yytext); + if(verbose!=0) printf("make %d stout iterations!\n", stout_no_iter); +} + YY_BREAK +case 259: +YY_RULE_SETUP +#line 902 "read_input.l" +{ + phmc_no_flavours=4; + if(verbose!=0) printf("Simulate 2+1+1 flavours (1+1 PHMC).\n"); +} + YY_BREAK +case 260: +YY_RULE_SETUP +#line 906 "read_input.l" +{ + phmc_no_flavours=2; + if(verbose!=0) printf("Simulate 1+1 flavours only (1+1 PHMC).\n"); +} + YY_BREAK +case 261: +YY_RULE_SETUP +#line 910 "read_input.l" +{ + phmc_compute_evs=1; + 
if(verbose!=0) printf("Compute Eigenvalues and exit."); +} + YY_BREAK +case 262: +YY_RULE_SETUP +#line 914 "read_input.l" +{ + phmc_compute_evs=0; +} + YY_BREAK +case 263: +YY_RULE_SETUP +#line 917 "read_input.l" +{ + compute_evs=1; + if(verbose!=0) printf("Compute Eigenvalues in invert."); +} + YY_BREAK +case 264: +YY_RULE_SETUP +#line 921 "read_input.l" +{ + compute_evs=0; + if(verbose!=0) printf("Do not compute Eigenvalues in invert."); +} + YY_BREAK +case 265: +YY_RULE_SETUP +#line 925 "read_input.l" +{ + compute_evs=2; + if(verbose!=0) printf("Try to only read in eigenvalues and vectors in invert."); +} + YY_BREAK +case 266: +YY_RULE_SETUP +#line 929 "read_input.l" +{ + phmc_exact_poly = 0; + if(verbose!=0) printf("Run the PHMC as usual."); +} + YY_BREAK +case 267: +YY_RULE_SETUP +#line 933 "read_input.l" +{ + phmc_exact_poly = 1; + if(verbose!=0) printf("Run the PHMC only with usage of the less accurate polynomial."); +} + YY_BREAK +case 268: +YY_RULE_SETUP +#line 938 "read_input.l" +{ + stilde_max = atof(yytext); + if(verbose!=0) printf("Stilde max for PHMC set to %e.\n", stilde_max); +} + YY_BREAK +case 269: +YY_RULE_SETUP +#line 942 "read_input.l" +{ + stilde_min = atof(yytext); + if(verbose!=0) printf("Stilde min for PHMC set to %e.\n", stilde_min); +} + YY_BREAK +case 270: +YY_RULE_SETUP +#line 946 "read_input.l" +{ + degree_of_p = atoi(yytext); + if(verbose!=0) printf("Degree for less precise polynomial P set to %d \n", degree_of_p); +} + YY_BREAK +case 271: +YY_RULE_SETUP +#line 950 "read_input.l" +{ + propagator_splitted=1; + if(verbose!=0) printf("Split the propagator in several files! 
(invert)\n"); +} + YY_BREAK +case 272: +YY_RULE_SETUP +#line 954 "read_input.l" +{ + propagator_splitted=0; + if(verbose!=0) printf("Do not split the propagator in several files (default) (invert)!\n"); +} + YY_BREAK +case 273: +YY_RULE_SETUP +#line 958 "read_input.l" +{ + source_splitted=1; + if(verbose!=0) printf("Expect source to be split in several files (invert)!\n"); +} + YY_BREAK +case 274: +YY_RULE_SETUP +#line 962 "read_input.l" +{ + source_splitted=0; + if(verbose!=0) printf("Do not expect source to be split in several files (default) (invert)!\n"); +} + YY_BREAK +case 275: +YY_RULE_SETUP +#line 966 "read_input.l" +{ + source_location=atoi(yytext); + if(verbose!=0) printf("source_location = %s\n",yytext); +} + YY_BREAK +case 276: +YY_RULE_SETUP +#line 970 "read_input.l" +{ + eigenvalue_precision = atof(yytext); + if(verbose!=0) printf("precision for eigenvalues = %e\n", eigenvalue_precision); +} + YY_BREAK +case 277: +YY_RULE_SETUP +#line 974 "read_input.l" +{ + no_eigenvalues = atoi(yytext); + if(verbose!=0) printf("no of eigenvalues = %d\n", no_eigenvalues); +} + YY_BREAK +case 278: +YY_RULE_SETUP +#line 978 "read_input.l" +{ + sub_evs_cg_flag = 1; + if(verbose!=0) printf("project out eigenvector subspace\n"); +} + YY_BREAK +case 279: +YY_RULE_SETUP +#line 982 "read_input.l" +{ + sub_evs_cg_flag = 0; + if(verbose!=0) printf("Do no project out eigenvector subspace\n"); +} + YY_BREAK +case 280: +YY_RULE_SETUP +#line 986 "read_input.l" +{ + phmc_heavy_timescale = atoi(yytext); + if(verbose!=0) printf("Integrate heavy doublet on timescale %d\n", phmc_heavy_timescale); +} + YY_BREAK +case 281: +YY_RULE_SETUP +#line 990 "read_input.l" +{ + even_odd_flag = 1; + if(verbose) printf("Use even/odd preconditioning\n"); +} + YY_BREAK +case 282: +YY_RULE_SETUP +#line 994 "read_input.l" +{ + even_odd_flag = 0; + if(verbose) printf("Do not use even/odd preconditioning\n"); +} + YY_BREAK +case 283: +YY_RULE_SETUP +#line 998 "read_input.l" +{ + write_prop_format_flag = 
10; + if(verbose!=0) fprintf(stderr, "GWC format no longer supported for writing propagators\n"); +} + YY_BREAK +case 284: +YY_RULE_SETUP +#line 1002 "read_input.l" +{ + write_prop_format_flag = 11; + if(verbose!=0) fprintf(stderr, "CM format no longer supported for writing propagators\n"); +} + YY_BREAK +case 285: +YY_RULE_SETUP +#line 1006 "read_input.l" +{ + write_prop_format_flag = 0; + if(verbose!=0) printf("Propagator type: DiracFermion_Sinks\n"); +} + YY_BREAK +case 286: +YY_RULE_SETUP +#line 1010 "read_input.l" +{ + write_prop_format_flag = 1; + if(verbose!=0) printf("Propagator type: DiracFermion_Source_Sink_Pairs\n"); +} + YY_BREAK +case 287: +YY_RULE_SETUP +#line 1014 "read_input.l" +{ + write_prop_format_flag = 1; + fprintf(stderr, "Propagator type: DiracFermion_ScalarSource_TwelveSink, not yet supported\n"); +} + YY_BREAK +case 288: +YY_RULE_SETUP +#line 1018 "read_input.l" +{ + write_prop_format_flag = 1; + fprintf(stderr, "Propagator type: DiracFermion_ScalarSource_FourSink, not yet supported\n"); +} + YY_BREAK +case 289: +YY_RULE_SETUP +#line 1022 "read_input.l" +{ + online_measurement_flag = 1; + if(verbose!=0) fprintf(stderr, "Switched on online measurements\n"); +} + YY_BREAK +case 290: +YY_RULE_SETUP +#line 1026 "read_input.l" +{ + online_measurement_flag = 0; + if(verbose!=0) fprintf(stderr, "Online measurements not switched on\n"); +} + YY_BREAK +case 291: +YY_RULE_SETUP +#line 1030 "read_input.l" +{ + online_measurement_freq = atoi(yytext); + if(verbose!=0) fprintf(stderr, "Frequency for online measurements set to %s\n", yytext); +} + YY_BREAK +case 292: +YY_RULE_SETUP +#line 1035 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 293: +YY_RULE_SETUP +#line 1036 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 294: +YY_RULE_SETUP +#line 1037 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 295: +YY_RULE_SETUP +#line 1038 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 296: +YY_RULE_SETUP +#line 1039 "read_input.l" +BEGIN(COMMENT); + YY_BREAK 
+case 297: +YY_RULE_SETUP +#line 1040 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 298: +YY_RULE_SETUP +#line 1041 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 299: +YY_RULE_SETUP +#line 1042 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 300: +YY_RULE_SETUP +#line 1043 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 301: +YY_RULE_SETUP +#line 1044 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 302: +YY_RULE_SETUP +#line 1045 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 303: +YY_RULE_SETUP +#line 1046 "read_input.l" +BEGIN(COMMENT); + YY_BREAK +case 304: +YY_RULE_SETUP +#line 1047 "read_input.l" +{ + ; +} + YY_BREAK +case 305: +/* rule 305 can match eol */ +YY_RULE_SETUP +#line 1052 "read_input.l" +{ + line_of_file++; + BEGIN(0); +} + YY_BREAK +case 306: +YY_RULE_SETUP +#line 1057 "read_input.l" +{ + printf("Unknown seed in line %d.\n Must be an integer. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 307: +YY_RULE_SETUP +#line 1061 "read_input.l" +{ + printf("Unknown kappa in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 308: +YY_RULE_SETUP +#line 1065 "read_input.l" +{ + printf("Unknown PhmcPrecisionPtilde in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 309: +YY_RULE_SETUP +#line 1069 "read_input.l" +{ + printf("Unknown PhmcPrecisionHfin in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 310: +YY_RULE_SETUP +#line 1073 "read_input.l" +{ + printf("Unknown Rec_EV in line %d.\n Must be an integer number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 311: +YY_RULE_SETUP +#line 1077 "read_input.l" +{ + printf("Unknown PhmcMuBar in line %d.\n Must be a floating point number. 
Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 312: +YY_RULE_SETUP +#line 1081 "read_input.l" +{ + printf("Unknown PhmcEpsBar in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 313: +YY_RULE_SETUP +#line 1085 "read_input.l" +{ + printf("Unknown mu in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 314: +YY_RULE_SETUP +#line 1089 "read_input.l" +{ + printf("Unknown mu in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 315: +YY_RULE_SETUP +#line 1093 "read_input.l" +{ + printf("Unknown mu in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 316: +YY_RULE_SETUP +#line 1097 "read_input.l" +{ + printf("Unknown beta in line %d.\n Must be a floating point number. Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 317: +YY_RULE_SETUP +#line 1101 "read_input.l" +{ + printf("Unknown Startcondition in line %d! \n Must be hot, cold, continue or restart. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 318: +YY_RULE_SETUP +#line 1105 "read_input.l" +{ + printf("Unknown number of TermSteps in line %d! \n Must be an integer. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 319: +YY_RULE_SETUP +#line 1109 "read_input.l" +{ + printf("Unknown number of MeasSteps in line %d! \n Must be an integer. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 320: +YY_RULE_SETUP +#line 1113 "read_input.l" +{ + printf("Unknown number of Sweeps to skip in line %d! \n Must be an integer. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 321: +YY_RULE_SETUP +#line 1117 "read_input.l" +{ + printf("Unknown value for solver_flag in line %d! \n Must be bicgstab, cg, cgs, mr or gmres. 
Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 322: +YY_RULE_SETUP +#line 1121 "read_input.l" +{ + printf("Unknown value for operator_flag in line %d! \n Must be an integer. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 323: +YY_RULE_SETUP +#line 1125 "read_input.l" +{ + printf("Unknown value for matrix_element_flag in line %d! \n Must be yes or no. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 324: +YY_RULE_SETUP +#line 1129 "read_input.l" +{ + printf("Unknown value for save_config_flag in line %d! \n Must be yes or no! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 325: +YY_RULE_SETUP +#line 1133 "read_input.l" +{ + printf("Unknown value for save_prop_flag in line %d! \n Must be yes or no! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 326: +YY_RULE_SETUP +#line 1137 "read_input.l" +{ + printf("Unknown value for save_prop_g2_flag in line %d! \n Must be yes or no! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 327: +YY_RULE_SETUP +#line 1141 "read_input.l" +{ + printf("Unknown value for write_checkpoint_flag in line %d! \n Must be yes or no! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 328: +YY_RULE_SETUP +#line 1145 "read_input.l" +{ + printf("Unknown value for checkpoint interval in line %d! \n Must be an integer! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 329: +YY_RULE_SETUP +#line 1149 "read_input.l" +{ + printf("Unknown value for Initial store counter in line %d! \n Must be an integer! Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 330: +YY_RULE_SETUP +#line 1153 "read_input.l" +{ + printf("Unknown value for T in line %d!\n Must be an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 331: +YY_RULE_SETUP +#line 1157 "read_input.l" +{ + printf("Unknown value for L in line %d!\n Must be an integer value! 
Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 332: +YY_RULE_SETUP +#line 1161 "read_input.l" +{ + printf("Unknown value for LX in line %d!\n Must be an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 333: +YY_RULE_SETUP +#line 1165 "read_input.l" +{ + printf("Unknown value for LY in line %d!\n Must be an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 334: +YY_RULE_SETUP +#line 1169 "read_input.l" +{ + printf("Unknown value for LZ in line %d!\n Must be an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 335: +YY_RULE_SETUP +#line 1173 "read_input.l" +{ + printf("Unknown value for NRXProcs in line %d!\n Must be an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 336: +YY_RULE_SETUP +#line 1177 "read_input.l" +{ + printf("Unknown value for NRYProcs in line %d!\n Must be an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 337: +YY_RULE_SETUP +#line 1181 "read_input.l" +{ + printf("Unknown value for NRYProcs in line %d!\n Must be an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 338: +YY_RULE_SETUP +#line 1185 "read_input.l" +{ + printf("Unknown value for StdIOProcessor in line %d!\n Must be all, no or an integer value! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 339: +YY_RULE_SETUP +#line 1189 "read_input.l" +{ + printf("Error in line %d! Must be 32 or 64 Bit precision!\n", line_of_file); + exit(1); +} + YY_BREAK +case 340: +YY_RULE_SETUP +#line 1193 "read_input.l" +{ + printf("Error in line %d! Must be 32 or 64 Bit precision!\n", line_of_file); + exit(1); +} + YY_BREAK +case 341: +YY_RULE_SETUP +#line 1197 "read_input.l" +{ + printf("Error in line %d! Must be 32 or 64 Bit precision!\n", line_of_file); + exit(1); +} + YY_BREAK +case 342: +YY_RULE_SETUP +#line 1201 "read_input.l" +{ + printf("Error in line %d! 
Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 343: +YY_RULE_SETUP +#line 1205 "read_input.l" +{ + printf("Error in line %d! Must be compute or readin! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 344: +YY_RULE_SETUP +#line 1209 "read_input.l" +{ + printf("Error in line %d!\n", line_of_file); + exit(1); +} + YY_BREAK +case 345: +YY_RULE_SETUP +#line 1213 "read_input.l" +{ + printf("Error in line %d!\n", line_of_file); + exit(1); +} + YY_BREAK +case 346: +YY_RULE_SETUP +#line 1217 "read_input.l" +{ + printf("Unknown value for MaxSolverIterations in line %d! Must be an integer. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 347: +YY_RULE_SETUP +#line 1221 "read_input.l" +{ + printf("Unknown value for SolverPrecision in line %d! Must be a floating point number. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 348: +YY_RULE_SETUP +#line 1225 "read_input.l" +{ + printf("Unknown value for MassNumber in line %d! Must be an integer. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 349: +YY_RULE_SETUP +#line 1229 "read_input.l" +{ + printf("Unknown value for RGIC1 in line %d! Must be a floating point number. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 350: +YY_RULE_SETUP +#line 1233 "read_input.l" +{ + printf("Should be yes or no for relative precision in line %d! Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 351: +YY_RULE_SETUP +#line 1237 "read_input.l" +{ + printf("Unknown value for ForcePrecision in line %d! Must be a floating point number. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 352: +YY_RULE_SETUP +#line 1241 "read_input.l" +{ + printf("Unknown value for AcceptancePrecision in line %d! Must be a floating point number. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 353: +YY_RULE_SETUP +#line 1245 "read_input.l" +{ + printf("Unknown value for CSGHistMu in line %d! Must be an integer number. 
Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 354: +YY_RULE_SETUP +#line 1249 "read_input.l" +{ + printf("Unknown value in line %d! Must be yes or no. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 355: +YY_RULE_SETUP +#line 1253 "read_input.l" +{ + printf("Unknown value in line %d! Must be a floating point number. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 356: +YY_RULE_SETUP +#line 1257 "read_input.l" +{ + printf("Unknown value in line %d! Must be an integer. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 357: +YY_RULE_SETUP +#line 1261 "read_input.l" +{ + printf("Unknown value in line %d! Must be 2+1+1 or 1+1. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 358: +YY_RULE_SETUP +#line 1265 "read_input.l" +{ + printf("Unknown value in line %d! Must be an yes or no. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 359: +YY_RULE_SETUP +#line 1269 "read_input.l" +{ + printf("Unknown value in line %d! Must be an yes or no. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 360: +YY_RULE_SETUP +#line 1273 "read_input.l" +{ + printf("Unknown value in line %d! Must be a floating point number. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 361: +YY_RULE_SETUP +#line 1277 "read_input.l" +{ + printf("Unknown value for SplittedPropagator in line %d! Must be yes or no. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 362: +YY_RULE_SETUP +#line 1281 "read_input.l" +{ + printf("Unknown value for SplittedSource in line %d! Must be yes or no. Exiting...!\n", line_of_file); + exit(1); +} + YY_BREAK +case 363: +YY_RULE_SETUP +#line 1285 "read_input.l" +{ + printf("Unknown source_location in line %d.\n Must be an integer. 
Exiting...!\n",line_of_file); + exit(1); +} + YY_BREAK +case 364: +YY_RULE_SETUP +#line 1289 "read_input.l" +{ + printf("Unknown value for TimeScaleHeavyDoublet in line %d\n", line_of_file); + exit(1); +} + YY_BREAK +case 365: +YY_RULE_SETUP +#line 1293 "read_input.l" +{ + printf("Unknown value for UseEvenOdd in line %d\n Must be yes or no. Aborting!\n", line_of_file); +} + YY_BREAK +case 366: +YY_RULE_SETUP +#line 1296 "read_input.l" +{ + printf("Unknown value for WritePropagatorFormat in line %d\n Must be gwc or cmi. Aborting!\n", line_of_file); +} + YY_BREAK +case 367: +YY_RULE_SETUP +#line 1299 "read_input.l" +{ + printf("Unknown value for PerformOnlineMeasurements in line %d\n Must be yes or no. Aborting!\n", line_of_file); +} + YY_BREAK +case 368: +YY_RULE_SETUP +#line 1302 "read_input.l" +{ + printf("Unknown value for OnlineMeasurementsFreq in line %d\n Must be an integer. Aborting!\n", line_of_file); +} + YY_BREAK +case 369: +YY_RULE_SETUP +#line 1307 "read_input.l" +BEGIN(ERROR); + YY_BREAK +case 370: +YY_RULE_SETUP +#line 1308 "read_input.l" +{ + printf("Error in line %d: %s \n",line_of_file,yytext); + exit(1); +} + YY_BREAK +case 371: +YY_RULE_SETUP +#line 1314 "read_input.l" +ECHO; + YY_BREAK +#line 6001 "" +case YY_STATE_EOF(INITIAL): +case YY_STATE_EOF(BETA): +case YY_STATE_EOF(STARTCOND): +case YY_STATE_EOF(THERMSWEEPS): +case YY_STATE_EOF(NMEAS): +case YY_STATE_EOF(KAPPA): +case YY_STATE_EOF(ACCPTILDE): +case YY_STATE_EOF(ACCHFIN): +case YY_STATE_EOF(RECEV): +case YY_STATE_EOF(MUBAR): +case YY_STATE_EOF(EPSBAR): +case YY_STATE_EOF(MU): +case YY_STATE_EOF(MU2): +case YY_STATE_EOF(MU3): +case YY_STATE_EOF(SEED): +case YY_STATE_EOF(Q1): +case YY_STATE_EOF(Q2): +case YY_STATE_EOF(DTAU): +case YY_STATE_EOF(TAU): +case YY_STATE_EOF(NSTEPS): +case YY_STATE_EOF(CSW): +case YY_STATE_EOF(INTTYP): +case YY_STATE_EOF(NSMALL): +case YY_STATE_EOF(NSKIP): +case YY_STATE_EOF(RLXDINPUTFILE): +case YY_STATE_EOF(GAUGEINPUTFILE): +case YY_STATE_EOF(GAUGERPREC): +case 
YY_STATE_EOF(GAUGEWPREC): +case YY_STATE_EOF(SOLVFLAG): +case YY_STATE_EOF(OPFLAG): +case YY_STATE_EOF(MEFLAG): +case YY_STATE_EOF(SAVECONF): +case YY_STATE_EOF(SAVEPROP): +case YY_STATE_EOF(SAVEPRG2): +case YY_STATE_EOF(WRITECP): +case YY_STATE_EOF(CPINT): +case YY_STATE_EOF(NSTORE): +case YY_STATE_EOF(TT): +case YY_STATE_EOF(LL): +case YY_STATE_EOF(LLX): +case YY_STATE_EOF(LLY): +case YY_STATE_EOF(LLZ): +case YY_STATE_EOF(NPROCX): +case YY_STATE_EOF(NPROCY): +case YY_STATE_EOF(NPROCZ): +case YY_STATE_EOF(IOPROC): +case YY_STATE_EOF(IDX): +case YY_STATE_EOF(FPROP): +case YY_STATE_EOF(CGMAX): +case YY_STATE_EOF(BCGMAX): +case YY_STATE_EOF(BOUND): +case YY_STATE_EOF(SITER): +case YY_STATE_EOF(SPREC): +case YY_STATE_EOF(MNR): +case YY_STATE_EOF(RGIC): +case YY_STATE_EOF(READSOURCE): +case YY_STATE_EOF(SOURCEFORMAT): +case YY_STATE_EOF(SOURCEFILE): +case YY_STATE_EOF(SOURCETS): +case YY_STATE_EOF(INT0): +case YY_STATE_EOF(INT1): +case YY_STATE_EOF(INT2): +case YY_STATE_EOF(INT3): +case YY_STATE_EOF(INT4): +case YY_STATE_EOF(LAMBDA0): +case YY_STATE_EOF(LAMBDA1): +case YY_STATE_EOF(LAMBDA2): +case YY_STATE_EOF(LAMBDA3): +case YY_STATE_EOF(LAMBDA4): +case YY_STATE_EOF(RELPREC): +case YY_STATE_EOF(FORCEPREC): +case YY_STATE_EOF(FORCEPREC1): +case YY_STATE_EOF(FORCEPREC2): +case YY_STATE_EOF(FORCEPREC3): +case YY_STATE_EOF(ACCPREC): +case YY_STATE_EOF(ACCPREC1): +case YY_STATE_EOF(ACCPREC2): +case YY_STATE_EOF(ACCPREC3): +case YY_STATE_EOF(REVCHECK): +case YY_STATE_EOF(REVINT): +case YY_STATE_EOF(DEBUG): +case YY_STATE_EOF(CSGN1): +case YY_STATE_EOF(CSGN2): +case YY_STATE_EOF(CSGN3): +case YY_STATE_EOF(GMRESM): +case YY_STATE_EOF(GMRESDRNEV): +case YY_STATE_EOF(REPRORND): +case YY_STATE_EOF(SLOPPYPREC): +case YY_STATE_EOF(USESTOUT): +case YY_STATE_EOF(STOUTRHO): +case YY_STATE_EOF(STOUTITER): +case YY_STATE_EOF(PHMCFLAV): +case YY_STATE_EOF(COMPUTEEVS): +case YY_STATE_EOF(PCOMPUTEEVS): +case YY_STATE_EOF(PPP): +case YY_STATE_EOF(SMAX): +case YY_STATE_EOF(SMIN): +case 
YY_STATE_EOF(DEGP): +case YY_STATE_EOF(SPLITPROP): +case YY_STATE_EOF(SPLITSOURCE): +case YY_STATE_EOF(SRCLOC): +case YY_STATE_EOF(SUBEVCG): +case YY_STATE_EOF(NOEV): +case YY_STATE_EOF(PRECEV): +case YY_STATE_EOF(HEAVYTS): +case YY_STATE_EOF(EO): +case YY_STATE_EOF(WRPROPFLAG): +case YY_STATE_EOF(PROPPREC): +case YY_STATE_EOF(PROPTYPE): +case YY_STATE_EOF(ONMEAS): +case YY_STATE_EOF(ONFREQ): +case YY_STATE_EOF(COMMENT): +case YY_STATE_EOF(ERROR): + yyterminate(); + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = (yy_hold_char); + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed yyin at a new source and called + * yylex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. + */ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + { /* This was really a NUL. 
*/ + yy_state_type yy_next_state; + + (yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + /* Okay, we're now positioned to make the NUL + * transition. We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state ); + + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. */ + yy_cp = ++(yy_c_buf_p); + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_END_OF_FILE: + { + (yy_did_buffer_switch_on_eof) = 0; + + if ( yywrap( ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + (yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! 
(yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = + (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + (yy_c_buf_p) = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)]; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ +} /* end of yylex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - no more input, but text matched before the EOB must be processed first + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (void) +{ + register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + register char *source = (yytext_ptr); + register int number_to_move, i; /* number_to_move: bytes starting at yytext_ptr that are copied to the start of the buffer */ + int ret_val; + + if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer.
*/ + number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr)) - 1; + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0; + + else + { + int num_to_read = + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER; + + int yy_c_buf_p_offset = + (int) ((yy_c_buf_p) - b->yy_ch_buf); /* saved as an offset so the pointer can be rebuilt after the reallocation below */ + + if ( b->yy_is_our_buffer ) + { + int new_size = b->yy_buf_size * 2; + + if ( new_size <= 0 ) /* NOTE(review): presumably guards against the doubled size wrapping around; then grow by 1/8 only */ + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + yyrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = 0; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + (yy_c_buf_p) = &b->yy_ch_buf[yy_c_buf_p_offset]; /* re-derive the scan pointer from the saved offset */ + + num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - + number_to_move - 1; + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data.
*/ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + (yy_n_chars), (size_t) num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + if ( (yy_n_chars) == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) /* nothing was read and nothing beyond YY_MORE_ADJ was carried over */ + { + ret_val = EOB_ACT_END_OF_FILE; + yyrestart(yyin ); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + (yy_n_chars) += number_to_move; /* yy_n_chars now counts the carried-over bytes plus the bytes just read */ + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR; /* terminate the data with the two end-of-buffer characters */ + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR; + + (yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (void) +{ + register yy_state_type yy_current_state; + register char *yy_cp; + + yy_current_state = (yy_start); + yy_current_state += YY_AT_BOL(); /* re-run the state tables over the text from yytext_ptr up to yy_c_buf_p */ + + for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp ) + { + register YY_CHAR yy_c = (*yy_cp ?
yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) /* follow the yy_def chain until a state owning an entry for yy_c is found */ + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 2644 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + * + * Returns 0 when the transition ends in state 2643 (the jam case below), + * otherwise the state that was reached. + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state ) +{ + register int yy_is_jam; + register char *yy_cp = (yy_c_buf_p); + + register YY_CHAR yy_c = 1; /* same class value that yy_get_previous_state substitutes for a zero byte */ + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 2644 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + yy_is_jam = (yy_current_state == 2643); /* generated constant: landing in state 2643 counts as a jam */ + + return yy_is_jam ? 0 : yy_current_state; +} + /* yyunput - store character c immediately before the current scan position so it is read next; yy_bp becomes the new yytext_ptr */ + static void yyunput (int c, register char * yy_bp ) +{ + register char *yy_cp; + + yy_cp = (yy_c_buf_p); + + /* undo effects of setting up yytext */ + *yy_cp = (yy_hold_char); + + if ( yy_cp < YY_CURRENT_BUFFER_LVALUE->yy_ch_buf + 2 ) + { /* need to shift things up to make room */ + /* +2 for EOB chars.
*/ + register int number_to_move = (yy_n_chars) + 2; + register char *dest = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[ + YY_CURRENT_BUFFER_LVALUE->yy_buf_size + 2]; + register char *source = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]; + + while ( source > YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) /* copy backwards so the data ends at the very end of the allocation */ + *--dest = *--source; + + yy_cp += (int) (dest - source); /* both pointers are moved by the same distance the data was shifted */ + yy_bp += (int) (dest - source); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_buf_size; + + if ( yy_cp < YY_CURRENT_BUFFER_LVALUE->yy_ch_buf + 2 ) + YY_FATAL_ERROR( "flex scanner push-back overflow" ); + } + + *--yy_cp = (char) c; + + (yytext_ptr) = yy_bp; + (yy_hold_char) = *yy_cp; + (yy_c_buf_p) = yy_cp; +} + /* input (yyinput when compiled as C++) - return the next input character, refilling the buffer through yy_get_next_buffer() when the end-of-buffer character is hit; returns EOF once yywrap() is non-zero */ +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (void) +#else + static int input (void) +#endif + +{ + int c; + + *(yy_c_buf_p) = (yy_hold_char); + + if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. + * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + /* This was really a NUL. */ + *(yy_c_buf_p) = '\0'; + + else + { /* need more input */ + int offset = (yy_c_buf_p) - (yytext_ptr); /* position kept relative to yytext_ptr, which yy_get_next_buffer() resets to the buffer start */ + ++(yy_c_buf_p); + + switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + yyrestart(yyin ); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( yywrap( ) ) + return EOF; + + if ( !
(yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(); +#else + return input(); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = (yytext_ptr) + offset; + break; + } + } + } + + c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */ + *(yy_c_buf_p) = '\0'; /* preserve yytext */ + (yy_hold_char) = *++(yy_c_buf_p); + + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = (c == '\n'); /* remember whether the character just consumed was a newline */ + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * + * @note This function does not reset the start condition to @c INITIAL . + */ + void yyrestart (FILE * input_file ) +{ + + if ( ! YY_CURRENT_BUFFER ){ /* no current buffer yet: one is created on yyin; input_file is attached by yy_init_buffer() below */ + yyensure_buffer_stack (); + YY_CURRENT_BUFFER_LVALUE = + yy_create_buffer(yyin,YY_BUF_SIZE ); + } + + yy_init_buffer(YY_CURRENT_BUFFER,input_file ); + yy_load_buffer_state( ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * + */ + void yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ) +{ + + /* TODO. We should be able to replace this entire function body + * with + * yypop_buffer_state(); + * yypush_buffer_state(new_buffer); + */ + yyensure_buffer_stack (); + if ( YY_CURRENT_BUFFER == new_buffer ) /* already the current buffer: nothing to do */ + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + YY_CURRENT_BUFFER_LVALUE = new_buffer; + yy_load_buffer_state( ); + + /* We don't actually know whether we did this switch during + * EOF (yywrap()) processing, but the only time this flag + * is looked at is after yywrap() is called, so it's safe + * to go ahead and always set it.
+ */ + (yy_did_buffer_switch_on_eof) = 1; +} + +static void yy_load_buffer_state (void) +{ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + (yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + (yy_hold_char) = *(yy_c_buf_p); +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. + * + * @return the allocated buffer state. + */ + YY_BUFFER_STATE yy_create_buffer (FILE * file, int size ) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state ) ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) yyalloc(b->yy_buf_size + 2 ); + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + yy_init_buffer(b,file ); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with yy_create_buffer() + * + */ + void yy_delete_buffer (YY_BUFFER_STATE b ) +{ + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + yyfree((void *) b->yy_ch_buf ); + + yyfree((void *) b ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a yyrestart() or at EOF. + */ + static void yy_init_buffer (YY_BUFFER_STATE b, FILE * file ) + +{ + int oerrno = errno; + + yy_flush_buffer(b ); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then yy_init_buffer was _probably_ + * called from yyrestart() or through yy_get_next_buffer. 
+ * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * + */ + void yy_flush_buffer (YY_BUFFER_STATE b ) +{ + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. + */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + yy_load_buffer_state( ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * + */ +void yypush_buffer_state (YY_BUFFER_STATE new_buffer ) +{ + if (new_buffer == NULL) + return; + + yyensure_buffer_stack(); + + /* This block is copied from yy_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) + (yy_buffer_stack_top)++; + YY_CURRENT_BUFFER_LVALUE = new_buffer; + + /* copied from yy_switch_to_buffer. */ + yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. 
+ *
+ */
+void yypop_buffer_state (void)
+{
+    /* Nothing to pop when no buffer is current. */
+    if (!YY_CURRENT_BUFFER)
+        return;
+
+    yy_delete_buffer(YY_CURRENT_BUFFER );
+    YY_CURRENT_BUFFER_LVALUE = NULL;
+    if ((yy_buffer_stack_top) > 0)
+        --(yy_buffer_stack_top);
+
+    /* If a buffer remains below, make it the active one. */
+    if (YY_CURRENT_BUFFER) {
+        yy_load_buffer_state( );
+        (yy_did_buffer_switch_on_eof) = 1;
+    }
+}
+
+/* Allocates the stack if it does not exist.
+ * Guarantees space for at least one push.
+ *
+ * Calls YY_FATAL_ERROR() when the stack cannot be allocated or grown, so
+ * the memset() calls below are never handed a NULL pointer.
+ */
+static void yyensure_buffer_stack (void)
+{
+    int num_to_alloc;
+
+    if (!(yy_buffer_stack)) {
+
+        /* First allocation is for a single element, since we don't know if
+         * this scanner will even need a stack. The next call that finds the
+         * stack already allocated grows it by grow_size (see below).
+         */
+        num_to_alloc = 1;
+        (yy_buffer_stack) = (struct yy_buffer_state**)yyalloc
+                                (num_to_alloc * sizeof(struct yy_buffer_state*)
+                                );
+        if ( ! (yy_buffer_stack) )
+            YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
+
+        memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*));
+
+        (yy_buffer_stack_max) = num_to_alloc;
+        (yy_buffer_stack_top) = 0;
+        return;
+    }
+
+    if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){
+
+        /* Increase the buffer to prepare for a possible push. */
+        int grow_size = 8 /* arbitrary grow size */;
+
+        num_to_alloc = (yy_buffer_stack_max) + grow_size;
+        (yy_buffer_stack) = (struct yy_buffer_state**)yyrealloc
+                                ((yy_buffer_stack),
+                                num_to_alloc * sizeof(struct yy_buffer_state*)
+                                );
+        if ( ! (yy_buffer_stack) )
+            YY_FATAL_ERROR( "out of dynamic memory in yyensure_buffer_stack()" );
+
+        /* zero only the new slots.*/
+        memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*));
+        (yy_buffer_stack_max) = num_to_alloc;
+    }
+}
+
+/** Setup the input buffer state to scan directly from a user-specified character buffer.
+ * @param base the character buffer
+ * @param size the size in bytes of the character buffer
+ *
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE yy_scan_buffer  (char * base, yy_size_t  size )
+{
+    YY_BUFFER_STATE b;
+
+    /* The last two bytes of base must already be end-of-buffer markers. */
+    if ( size < 2 ||
+         base[size-2] != YY_END_OF_BUFFER_CHAR ||
+         base[size-1] != YY_END_OF_BUFFER_CHAR )
+        /* They forgot to leave room for the EOB's. */
+        return 0;
+
+    b = (YY_BUFFER_STATE) yyalloc(sizeof( struct yy_buffer_state )  );
+    if ( ! b )
+        YY_FATAL_ERROR( "out of dynamic memory in yy_scan_buffer()" );
+
+    b->yy_buf_size = size - 2;    /* "- 2" to take care of EOB's */
+    b->yy_buf_pos = b->yy_ch_buf = base;    /* scans base in place, no copy */
+    b->yy_is_our_buffer = 0;    /* caller keeps ownership of base */
+    b->yy_input_file = 0;
+    b->yy_n_chars = b->yy_buf_size;
+    b->yy_is_interactive = 0;
+    b->yy_at_bol = 1;
+    b->yy_fill_buffer = 0;
+    b->yy_buffer_status = YY_BUFFER_NEW;
+
+    /* The new buffer becomes the current one right away. */
+    yy_switch_to_buffer(b  );
+
+    return b;
+}
+
+/** Setup the input buffer state to scan a string. The next call to yylex() will
+ * scan from a @e copy of @a str.
+ * @param yystr a NUL-terminated string to scan
+ *
+ * @return the newly allocated buffer state object.
+ * @note If you want to scan bytes that may contain NUL values, then use
+ *       yy_scan_bytes() instead.
+ */
+YY_BUFFER_STATE yy_scan_string (yyconst char * yystr )
+{
+
+    return yy_scan_bytes(yystr,strlen(yystr) );
+}
+
+/** Setup the input buffer state to scan the given bytes. The next call to yylex() will
+ * scan from a @e copy of @a yybytes.
+ * @param yybytes the byte buffer to scan
+ * @param _yybytes_len the number of bytes in the buffer pointed to by @a yybytes.
+ *
+ * @return the newly allocated buffer state object.
+ */
+YY_BUFFER_STATE yy_scan_bytes  (yyconst char * yybytes, int  _yybytes_len )
+{
+    YY_BUFFER_STATE b;
+    char *buf;
+    yy_size_t n;
+    int i;
+
+    /* Get memory for full buffer, including space for trailing EOB's. */
+    n = _yybytes_len + 2;
+    buf = (char *) yyalloc(n  );
+    if ( ! buf )
+        YY_FATAL_ERROR( "out of dynamic memory in yy_scan_bytes()" );
+
+    /* Copy the payload, then append the two end-of-buffer markers. */
+    for ( i = 0; i < _yybytes_len; ++i )
+        buf[i] = yybytes[i];
+
+    buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR;
+
+    b = yy_scan_buffer(buf,n );
+    if ( ! b )
+        YY_FATAL_ERROR( "bad buffer in yy_scan_bytes()" );
+
+    /* It's okay to grow etc. this buffer, and we should throw it
+     * away when we're done.
+     */
+    b->yy_is_our_buffer = 1;
+
+    return b;
+}
+
+#ifndef YY_EXIT_FAILURE
+#define YY_EXIT_FAILURE 2
+#endif
+
+/* Print msg on stderr and terminate the process with YY_EXIT_FAILURE. */
+static void yy_fatal_error (yyconst char* msg )
+{
+    (void) fprintf( stderr, "%s\n", msg );
+    exit( YY_EXIT_FAILURE );
+}
+
+/* Redefine yyless() so it works in section 3 code. */
+
+#undef yyless
+#define yyless(n) \
+    do \
+        { \
+        /* Undo effects of setting up yytext. */ \
+        int yyless_macro_arg = (n); \
+        YY_LESS_LINENO(yyless_macro_arg);\
+        yytext[yyleng] = (yy_hold_char); \
+        (yy_c_buf_p) = yytext + yyless_macro_arg; \
+        (yy_hold_char) = *(yy_c_buf_p); \
+        *(yy_c_buf_p) = '\0'; \
+        yyleng = yyless_macro_arg; \
+        } \
+    while ( 0 )
+
+/* Accessor methods (get/set functions) to struct members. */
+
+/** Get the current line number.
+ *
+ */
+int yyget_lineno  (void)
+{
+
+    return yylineno;
+}
+
+/** Get the input stream.
+ *
+ */
+FILE *yyget_in  (void)
+{
+    return yyin;
+}
+
+/** Get the output stream.
+ *
+ */
+FILE *yyget_out  (void)
+{
+    return yyout;
+}
+
+/** Get the length of the current token.
+ *
+ */
+int yyget_leng  (void)
+{
+    return yyleng;
+}
+
+/** Get the current token.
+ *
+ */
+
+char *yyget_text  (void)
+{
+    return yytext;
+}
+
+/** Set the current line number.
+ * @param line_number the value stored in yylineno
+ *
+ */
+void yyset_lineno (int  line_number )
+{
+
+    yylineno = line_number;
+}
+
+/** Set the input stream. This does not discard the current
+ * input buffer.
+ * @param in_str A readable stream.
+ *
+ * @see yy_switch_to_buffer
+ */
+void yyset_in (FILE *  in_str )
+{
+    yyin = in_str ;
+}
+
+/** Set the output stream.
+ * @param out_str the stream stored in yyout
+ */
+void yyset_out (FILE *  out_str )
+{
+    yyout = out_str ;
+}
+
+/** Get the current value of yy_flex_debug. */
+int yyget_debug  (void)
+{
+    return yy_flex_debug;
+}
+
+/** Set yy_flex_debug.
+ * @param bdebug the new value
+ */
+void yyset_debug (int  bdebug )
+{
+    yy_flex_debug = bdebug ;
+}
+
+/* Reset the scanner's file-scope state to its pre-first-use values.
+ * Always returns 0.
+ */
+static int yy_init_globals (void)
+{
+    /* Initialization is the same as for the non-reentrant scanner.
+     * This function is called from yylex_destroy(), so don't allocate here.
+     */
+
+    (yy_buffer_stack) = 0;
+    (yy_buffer_stack_top) = 0;
+    (yy_buffer_stack_max) = 0;
+    (yy_c_buf_p) = (char *) 0;
+    (yy_init) = 0;
+    (yy_start) = 0;
+
+/* Defined in main.c */
+#ifdef YY_STDINIT
+    yyin = stdin;
+    yyout = stdout;
+#else
+    yyin = (FILE *) 0;
+    yyout = (FILE *) 0;
+#endif
+
+    /* For future reference: Set errno on error, since we are called by
+     * yylex_init()
+     */
+    return 0;
+}
+
+/* yylex_destroy is for both reentrant and non-reentrant scanners.
+ * Releases every buffer on the buffer stack, then the stack itself, and
+ * resets the globals. Always returns 0.
+ */
+int yylex_destroy  (void)
+{
+
+    /* Pop the buffer stack, destroying each element.
+     * yypop_buffer_state() deletes the current buffer, clears its slot and
+     * steps down to the next entry. Deleting the buffer and clearing the
+     * slot here first made that call return immediately (it bails out when
+     * no buffer is current), so only the top buffer was ever released.
+     */
+    while(YY_CURRENT_BUFFER)
+        yypop_buffer_state();
+
+    /* Destroy the stack itself. */
+    yyfree((yy_buffer_stack) );
+    (yy_buffer_stack) = NULL;
+
+    /* Reset the globals. This is important in a non-reentrant scanner so the next time
+     * yylex() is called, initialization will occur. */
+    yy_init_globals( );
+
+    return 0;
+}
+
+/*
+ * Internal utility routines.
+ */
+
+#ifndef yytext_ptr
+/* Copy exactly n bytes from s2 to s1 (no terminator is added). */
+static void yy_flex_strncpy (char* s1, yyconst char * s2, int n )
+{
+    register int i;
+    for ( i = 0; i < n; ++i )
+        s1[i] = s2[i];
+}
+#endif
+
+#ifdef YY_NEED_STRLEN
+/* Return the number of bytes before the first NUL in s. */
+static int yy_flex_strlen (yyconst char * s )
+{
+    register int n;
+    for ( n = 0; s[n]; ++n )
+        ;
+
+    return n;
+}
+#endif
+
+void *yyalloc (yy_size_t  size )
+{
+    return (void *) malloc( size );
+}
+
+void *yyrealloc  (void * ptr, yy_size_t  size )
+{
+    /* The cast to (char *) in the following accommodates both
+     * implementations that use char* generic pointers, and those
+     * that use void* generic pointers.  It works with the latter
+     * because both ANSI C and C++ allow castless assignment from
+     * any pointer type to void*, and deal with argument conversions
+     * as though doing an assignment.
+     */
+    return (void *) realloc( (char *) ptr, size );
+}
+
+void yyfree (void * ptr )
+{
+    free( (char *) ptr );    /* see yyrealloc() for (char *) cast */
+}
+
+#define YYTABLES_NAME "yytables"
+
+#line 1314 "read_input.l"
+
+
+
+/*
+ *  Dummy (but not dumb) routine - well, function
+ *  Always returns 1, i.e. scanning stops at the end of the input file.
+ */
+
+int yywrap()
+{
+  return(1);
+}
+
+/*
+ * This is the function to parse the input file.
+ * Default values for all parameters will be set
+ * corresponding to the settings in
+ * default_input_values.h
+ *
+ * read_input expects the filename of the input file
+ * as an input parameter.
+ *
+ * read_input returns 2 if the input file could not be opened
+ * and 0 otherwise.
+ *
+ * Text the scanner echoes is sent to /dev/null. If /dev/null cannot be
+ * opened the scanner writes to stdout instead; stdout is never closed.
+ */
+
+int read_input(char * conf_file){
+  FILE * devnull;
+
+  /********************************************
+   * Setting default values!
+   ********************************************/
+
+#ifndef FIXEDVOLUME
+  T_global = _default_T_global;
+  L = _default_L;
+  LX = _default_LX;
+  LY = _default_LY;
+  LZ = _default_LZ;
+  N_PROC_X = _default_N_PROC_X;
+  N_PROC_Y = _default_N_PROC_Y;
+  N_PROC_Z = _default_N_PROC_Z;
+#endif
+  g_kappa = _default_g_kappa;
+  g_acc_Ptilde = _default_g_acc_Ptilde;
+  g_acc_Hfin = _default_g_acc_Hfin;
+  g_rec_ev = _default_g_rec_ev;
+  g_mubar = _default_g_mubar;
+  g_epsbar = _default_g_epsbar;
+  g_mu = _default_g_mu;
+  g_mu1 = _default_g_mu1;
+  g_mu2 = _default_g_mu2;
+  g_mu3 = _default_g_mu3;
+  g_beta = _default_g_beta;
+  g_c_sw = _default_g_c_sw;
+  dtau = _default_dtau;
+  tau = _default_tau;
+  Nsteps = _default_Nsteps;
+  nsmall = _default_nsmall;
+  integtyp = _default_integtyp;
+  random_seed = _default_random_seed;
+  matrix_element_flag = _default_matrix_element_flag;
+  solver_flag = _default_solver_flag;
+  operator_flag = _default_operator_flag;
+  startoption = _default_startoption;
+  Ntherm = _default_Ntherm;
+  Nmeas = _default_Nmeas;
+  Nskip = _default_Nskip;
+  save_config_flag = _default_save_config_flag;
+  save_prop_flag = _default_save_prop_flag;
+  save_prop_g2_flag = _default_save_prop_g2_flag;
+  write_cp_flag = _default_write_cp_flag;
+  cp_interval = _default_cp_interval;
+  nstore = _default_nstore;
+  strcpy(rlxd_input_filename, _default_rlxd_input_filename);
+  strcpy(gauge_input_filename, _default_gauge_input_filename);
+  g_stdio_proc = _default_g_stdio_proc;
+  index_start = _default_index_start;
+  index_end = _default_index_end;
+  first_prop_flag = _default_first_prop_flag;
+  ITER_MAX_CG = _default_ITER_MAX_CG;
+  ITER_MAX_BCG = _default_ITER_MAX_BCG;
+  X0 = _default_X0;
+  max_solver_iterations = _default_max_solver_iterations;
+  solver_precision = _default_solver_precision;
+  mass_number = _default_mass_number;
+  g_rgi_C1 = _default_g_rgi_C1;
+  read_source_flag= _default_read_source_flag;
+  strcpy(source_input_filename, _default_source_filename);
+  g_eps_sq_force = _default_g_eps_sq_force;
+  g_eps_sq_acc = _default_g_eps_sq_acc;
+  g_eps_sq_force1 = _default_g_eps_sq_force1;
+  g_eps_sq_acc1 = _default_g_eps_sq_acc1;
+  g_eps_sq_force2 = _default_g_eps_sq_force2;
+  g_eps_sq_acc2 = _default_g_eps_sq_acc2;
+  g_eps_sq_force3 = _default_g_eps_sq_force3;
+  g_eps_sq_acc3 = _default_g_eps_sq_acc3;
+  g_relative_precision_flag = _default_g_relative_precision_flag;
+  return_check_flag = _default_return_check_flag;
+  return_check_interval = _default_return_check_interval;
+  g_debug_level = _default_g_debug_level;
+  g_csg_N[0] = _default_g_csg_N;
+  g_csg_N[2] = _default_g_csg_N;
+  g_csg_N[4] = _default_g_csg_N;
+  g_csg_N[6] = _default_g_csg_N;
+  lambda[0] = _default_2mn_lambda;
+  lambda[1] = _default_2mn_lambda;
+  lambda[2] = _default_2mn_lambda;
+  lambda[3] = _default_2mn_lambda;
+  source_format_flag = _default_source_format_flag;
+  source_time_slice = _default_source_time_slice;
+  gmres_m_parameter = _default_gmres_m_parameter;
+  gmresdr_nr_ev = _default_gmresdr_nr_ev;
+  gauge_precision_read_flag = _default_gauge_precision_read_flag;
+  gauge_precision_write_flag = _default_gauge_precision_write_flag;
+  prop_precision_flag = _default_prop_precision_flag;
+  reproduce_randomnumber_flag = _default_reproduce_randomnumber_flag;
+  g_sloppy_precision_flag = _default_g_sloppy_precision_flag;
+  use_stout_flag = _default_use_stout_flag;
+  stout_rho = _default_stout_rho;
+  stout_no_iter = _default_stout_no_iter;
+  /* check for reread ! */
+  phmc_no_flavours = _default_phmc_no_flavours;
+  phmc_compute_evs = _default_phmc_compute_evs;
+  compute_evs = _default_compute_evs;
+  stilde_min = _default_stilde_min;
+  stilde_max = _default_stilde_max;
+  degree_of_p = _default_degree_of_p;
+  propagator_splitted = _default_propagator_splitted;
+  source_splitted = _default_source_splitted;
+  source_location = _default_source_location;
+  eigenvalue_precision = _default_eigenvalue_precision;
+  no_eigenvalues = _default_no_eigenvalues;
+  sub_evs_cg_flag = _default_sub_evs_cg_flag;
+  phmc_heavy_timescale = _default_phmc_heavy_timescale;
+  phmc_exact_poly = _default_phmc_exact_poly;
+  even_odd_flag = _default_even_odd_flag;
+  online_measurement_flag = _default_online_measurement_flag;
+  online_measurement_freq = _default_online_measurement_freq;
+
+  /* Put -1 in write_prop_format_flag to see if parse_config() will
+     change the value. If not then set it to source_format_flag */
+  write_prop_format_flag = -1;
+  /********************************************/
+
+  if ((yyin = fopen(conf_file, "rt")) == NULL){
+    return(2);
+  }
+  /* Discard whatever the scanner echoes. Only a stream opened here is
+   * closed again below; with a failed fopen() the old code ended up
+   * calling fclose() on NULL or on stdout. */
+  devnull = fopen("/dev/null", "w");
+  yyout = (devnull != NULL) ? devnull : stdout;
+
+  parse_config();
+#ifndef FIXEDVOLUME
+  /* Extents that are still 0 after parsing inherit L. */
+  if(LX == 0) {
+    LX = L;
+  }
+  if(LY == 0) {
+    LY = L;
+  }
+  if(LZ == 0) {
+    LZ = L;
+  }
+#endif
+
+  /* Negative per-timescale precisions fall back to the global ones. */
+  if(g_eps_sq_force1 < 0) g_eps_sq_force1 = g_eps_sq_force;
+  if(g_eps_sq_force2 < 0) g_eps_sq_force2 = g_eps_sq_force;
+  if(g_eps_sq_force3 < 0) g_eps_sq_force3 = g_eps_sq_force;
+  if(g_eps_sq_acc1 < 0) g_eps_sq_acc1 = g_eps_sq_acc;
+  if(g_eps_sq_acc2 < 0) g_eps_sq_acc2 = g_eps_sq_acc;
+  if(g_eps_sq_acc3 < 0) g_eps_sq_acc3 = g_eps_sq_acc;
+
+  if(write_prop_format_flag == -1) write_prop_format_flag = source_format_flag;
+  g_rgi_C0 = 1. - 8.*g_rgi_C1;
+  g_ka_csw_8 = g_kappa*g_c_sw/8.;
+
+  if(devnull != NULL) {
+    fclose(devnull);
+  }
+  fclose(yyin);
+  return(0);
+}
+
+
+/*
+ * This is the function to parse the input file
+ * again. Only parameters that are specified in
+ * the input file are changed.
+ * Default values for parameters will not be set.
+ * The lattice extents, N_PROC_X, N_PROC_Y, nstore and the g_csg_N
+ * entries are restored to their values from before the re-parse.
+ *
+ * reread_input expects the filename of the input file
+ * as an input parameter.
+ *
+ * reread_input returns 2 if the input file could not be opened
+ * and 0 otherwise.
+ */
+
+int reread_input(char * conf_file){
+#ifndef FIXEDVOLUME
+  /* NOTE(review): N_PROC_Z is not saved/restored here although
+   * read_input() initialises it - confirm this is intended. */
+  int tt=T, ll=L, lx = LX, ly = LY, lz = LZ,
+    np=N_PROC_X, npy = N_PROC_Y;
+#endif
+  int nst=nstore, j=0;
+  double m2 = g_mu2, m3 = g_mu3;
+  int n1 = g_csg_N[0], n2 = g_csg_N[2], n3 = g_csg_N[4], n4 = g_csg_N[6];
+  double x;
+  FILE * devnull;
+
+  /********************************************
+   * No default values are set on a re-read.
+   ********************************************/
+
+  /********************************************/
+
+  if ((yyin = fopen(conf_file, "rt")) == NULL){
+    return(2);
+  }
+  /* Same output handling as in read_input(): never fclose() a stream
+   * that was not opened here. */
+  devnull = fopen("/dev/null", "w");
+  yyout = (devnull != NULL) ? devnull : stdout;
+
+  parse_config();
+
+#ifndef FIXEDVOLUME
+  T = tt;
+  L = ll;
+  LX = lx;
+  LY = ly;
+  LZ = lz;
+  N_PROC_X = np;
+  N_PROC_Y = npy;
+#endif
+  g_csg_N[0] = n1;
+  g_csg_N[2] = n2;
+  g_csg_N[4] = n3;
+  g_csg_N[6] = n4;
+
+
+  if(g_dbw2rand == 0) {
+    g_rgi_C1 = 0.;
+  }
+  nstore = nst;
+
+  g_rgi_C0 = 1. - 8.*g_rgi_C1;
+  g_ka_csw_8 = g_kappa*g_c_sw/8.;
+
+  /* g_mu3 was changed by the re-read: exchange slots 1 and 3. */
+  if(g_mu3 > 0. && g_mu3 != m3) {
+    g_mu = g_mu1;
+    g_mu1 = g_mu3;
+    g_mu3 = g_mu;
+
+    j = int_n[1];
+    int_n[1] = int_n[3];
+    int_n[3] = j;
+
+    x = lambda[1];
+    lambda[1] = lambda[3];
+    lambda[3] = x;
+
+    g_nr_of_psf = 3;
+  }
+  /* g_mu2 was changed: exchange slots 1 and 2. int_n[3] and lambda[3]
+   * serve as scratch space and are overwritten with the old [1] values. */
+  else if(g_mu2 > 0. && g_mu2 != m2) {
+    g_mu = g_mu1;
+    g_mu1 = g_mu2;
+    g_mu2 = g_mu;
+
+    int_n[3] = int_n[1];
+    int_n[1] = int_n[2];
+    int_n[2] = int_n[3];
+
+    lambda[3] = lambda[1];
+    lambda[1] = lambda[2];
+    lambda[2] = lambda[3];
+
+    g_nr_of_psf = 2;
+  }
+  for(j = 0; j < g_nr_of_psf+1; j++) {
+    if(int_n[j] == 0) int_n[j] = 1;
+  }
+  if(g_nr_of_psf == 3) {
+    g_eps_sq_force = g_eps_sq_force1;
+    g_eps_sq_force1 = g_eps_sq_force3;
+    g_eps_sq_force3 = g_eps_sq_force;
+    g_eps_sq_acc = g_eps_sq_acc1;
+    g_eps_sq_acc1 = g_eps_sq_acc3;
+    g_eps_sq_acc3 = g_eps_sq_acc;
+  }
+  if(g_nr_of_psf == 2) {
+    g_eps_sq_force = g_eps_sq_force1;
+    g_eps_sq_force1 = g_eps_sq_force2;
+    g_eps_sq_force2 = g_eps_sq_force;
+    g_eps_sq_acc = g_eps_sq_acc1;
+    g_eps_sq_acc1 = g_eps_sq_acc2;
+    g_eps_sq_acc2 = g_eps_sq_acc;
+  }
+  g_mu = g_mu1;
+  g_eps_sq_acc = g_eps_sq_acc1;
+  g_eps_sq_force = g_eps_sq_force1;
+
+  if(devnull != NULL) {
+    fclose(devnull);
+  }
+  fclose(yyin);
+  return(0);
+}
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tmp/read_input.tmp.h b/qcd/part_cpu/applications/QCD/src/kernel_D/tmp/read_input.tmp.h
new file mode 100644
index 0000000000000000000000000000000000000000..f74b2519fe935ac3889af5da2de16fa99b3e630a
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tmp/read_input.tmp.h
@@ -0,0 +1,104 @@
+/* $Id: read_input.h,v 1.31 2008/07/31 22:07:49 urbach Exp $ */
+
+/*
+ * This is the function to parse the input file.
+ * No default values for any parameter will be set
+ *
+ * read_input expects the filename of the input file
+ * as an input parameter.
+ *
+ * read_input returns 2 if the input file did not exist
+ */
+
+#ifndef _PARSER_H
+#define _PARSER_H
+
+#define COLD 0
+#define HOT 1
+#define RESTART 2
+#define CONTINUE 3
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif /* __cplusplus */
+
+  /* input parameters, declared here only */
+  /* NOTE(review): presumably defined in read_input.l, not read_input.h */
+  extern int verbose;
+  extern int startoption;
+  extern int Ntherm;
+  extern int Nmeas;
+  extern int Nskip;
+  extern int solver_flag;
+  extern int gmres_m_parameter, gmresdr_nr_ev;
+  extern int operator_flag;
+  extern int matrix_element_flag;
+  extern int save_config_flag;
+  extern int save_prop_flag;
+  extern int save_prop_g2_flag;
+  extern int write_cp_flag;
+  extern int cp_interval;
+  extern int nstore;
+  extern int int_n[4];
+  extern double lambda[4];
+  extern int crylov_space_dim;
+  extern char rlxd_input_filename[100]; /* NOTE(review): read_input.tmp.l defines this with [500] - confirm size */
+  extern char gauge_input_filename[100]; /* NOTE(review): read_input.tmp.l defines this with [500] - confirm size */
+  extern int subforwilson_flag;
+  extern int eigenvalue_method_flag;
+  extern int eigenvalue_max_iterations;
+  extern double eigenvalue_precision;
+  extern int index_start;
+  extern int index_end;
+  extern int first_prop_flag;
+  extern double dtau, tau;
+  extern int Nsteps;
+  extern int random_seed;
+  extern int integtyp,nsmall;
+  extern int ITER_MAX_BCG;
+  extern int ITER_MAX_CG;
+  extern double X0;
+  extern int max_solver_iterations;
+  extern double solver_precision;
+  extern int mass_number;
+  extern int read_source_flag;
+  extern char source_input_filename[100];
+  extern int return_check_flag;
+  extern int return_check_interval;
+  extern int source_format_flag;
+  extern int source_time_slice;
+  extern int gauge_precision_read_flag;
+  extern int gauge_precision_write_flag;
+  extern int prop_precision_flag;
+  extern int reproduce_randomnumber_flag;
+  extern double stout_rho;
+  extern int stout_no_iter;
+  extern int use_stout_flag;
+  extern int phmc_no_flavours;
+  extern int phmc_heavy_timescale;
+  extern int phmc_compute_evs;
+  extern int phmc_exact_poly;
+  extern int compute_evs;
+  extern int no_eigenvalues;
+  extern double eigenvalue_precision; /* NOTE(review): duplicate of the declaration further up */
+  extern double stilde_max;
+  extern double stilde_min;
+  extern int degree_of_p;
+  extern int propagator_splitted;
+  extern int source_splitted;
+  extern int source_location;
+  extern int sub_evs_cg_flag;
+  extern int even_odd_flag;
+  extern int write_prop_format_flag;
+  extern int online_measurement_flag;
+  extern int online_measurement_freq;
+
+  /* Both return 0 on success and 2 if the input file cannot be opened. */
+  int read_input(char *);
+  int reread_input(char *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/tmp/read_input.tmp.l b/qcd/part_cpu/applications/QCD/src/kernel_D/tmp/read_input.tmp.l
new file mode 100644
index 0000000000000000000000000000000000000000..4c1ad73bba9522b83586ff9e50dd7ebec30385ce
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/tmp/read_input.tmp.l
@@ -0,0 +1,2431 @@
+/***********************************************************************
+ *
+ * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach
+ *
+ * Modified by Jenifer Gonzalez Lopez 2009/03/27
+ *
+ * This file is part of tmLQCD.
+ *
+ * tmLQCD is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * tmLQCD is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>.
+ * This is the parser. (Dec 2002)
+ * The .c-file is generated from .l using flex.
+ * Please edit read_input.l instead of read_input.c!
+ * flex must be told to be case insensitive (option -i)!
+ * + * After modifiing read_input.l please update read_input.c with + * flex -Ptmlqcd -i -t read_input.l > read_input.c + * + * Autor: Carsten Urbach + * urbach@physik.fu-berlin.de + ***********************************************************************/ + +SPC [[:blank:]]+ +CMD [:][[:space:]]+ +RLN [1-9(10)(11)(12)(13)(14)(15)(16)][:] +DIGIT [[:digit:]] +ZT [0-9(10)(11)] +IDXEX ("-"{DIGIT}+) +SIGN ("+"|"-") +FLT {SIGN}?{DIGIT}+(".")?{DIGIT}*(e("-"|"+")?{DIGIT}+)? +FILENAME [a-zA-Z0-9_".""-""/"][a-zA-z0-9"."_"-""/"]+ +NAME [a-zA-Z0-9_]+ +CSTR \"[a-zA-Z0-9\-\._]+\" +TYPE [0-9A-Z]+ +FLTLIST [0-9"."","" "]* +EQL {SPC}*={SPC}* + +%{ +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#define INIT_GLOBALS +#include"global.h" +#include"read_input.h" +#include"default_input_values.h" +#include"monomial/monomial.h" +#include"solver/solver_types.h" +#include"meas/measurements.h" +#include"integrator.h" +#include"operator.h" +#include"phmc.h" +#include + +static inline void rmQuotes(char *str){ + char* strsave=str; + + while(*str== ' ' || *str == '\t') str++; + + if(*str=='\"') *(strsave++)=*( ++str ); + else fprintf(stderr,"Error in removing quotes from string:\n %s \n" , str); + ++str; + + while( !( *str=='\0' || *str == '\"' || *str == '\n') ) *(strsave++)=*(str++); + *strsave='\0'; +} + + /* Name of the parsing routine */ +#define YY_DECL int parse_config() +#define YY_NO_UNPUT + /* helper vars */ + char *cstring_to_parse=NULL; + int cstring_caller; + int solver_caller; + + /* declaration of input parameters */ + int i=0; + int line_of_file = 1; + int current_monomial = -1; + int current_measurement=-1; + int current_operator = -1; + extern int no_monomials; + extern int no_measurements; + monomial * mnl = NULL; + measurement * meas = NULL; + operator * optr = NULL; + int comment_caller; + int name_caller; + int a,b; + double c; + float cs; + int reread = 0; + char name[100]; + char * type; + + int verbose = 0; + int myverbose = 0; + int 
startoption; + int Ntherm; + int Nmeas; + int Nsave; + int gmres_m_parameter, gmresdr_nr_ev; + int write_cp_flag; + int cp_interval; + int nstore; + int index_start, index_end; + int random_seed; + int rlxd_level; + char rlxd_input_filename[500]; + char gauge_input_filename[500]; + int read_source_flag; + int return_check_flag, return_check_interval; + int gauge_precision_read_flag; + int gauge_precision_write_flag; + int g_disable_IO_checks; + int gmres_m_parameter, gmresdr_nr_ev; + int reproduce_randomnumber_flag; + double stout_rho; + int stout_no_iter; + int use_stout_flag; + int phmc_no_flavours; + int phmc_heavy_timescale; + int phmc_exact_poly; + int compute_evs; + int phmc_compute_evs; + double stilde_max; + double stilde_min; + int degree_of_p; + int source_location; + int no_eigenvalues; + double eigenvalue_precision; + int sub_evs_cg_flag; + int even_odd_flag; + int bc_flag; + int online_measurement_flag; + int online_measurement_freq; + int reweighting_flag; + int reweighting_samples; + int no_samples; + int compute_modenumber; + int compute_topsus; + double mstarsq; + int no_sources_z2; + int usegpu_flag; + int max_innersolver_it; + int device_num; + double innersolver_precision; + + double innersolver_precision_abs; + double innersolver_precision_rel; + int innersolver_precision_check_abs; + int innersolver_precision_check_rel; + + int max_mms_shifts; + int use_mixed_mms; + int min_innersolver_it; + + double mixcg_innereps; + int mixcg_maxinnersolverit; + + int propagator_comparison; + int nb_cores; + + int omp_num_threads; + + int nblocks_t; + int nblocks_x; + int nblocks_y; + int nblocks_z; + + int dfl_field_iter; + int dfl_poly_iter; + + int use_preconditioning; + + int use_qudainverter; + + +%} + +%option never-interactive + +%x STARTCOND +%x THERMSWEEPS +%x NMEAS +%x KAPPA +%x MUBAR +%x EPSBAR +%x MU +%x CSW +%x SEED +%x RLXDLEVEL +%x NSAVE +%x RLXDINPUTFILE +%x GAUGEINPUTFILE +%x GAUGERPREC +%x GAUGEWPREC +%x DSBLIOCHECK +%x DFLSP +%x PRECON +%x 
WRITECP +%x CPINT +%x NSTORE +%x TT +%x LL +%x LLX +%x LLY +%x LLZ +%x NPROCX +%x NPROCY +%x NPROCZ +%x IOPROC +%x IDX +%x BOUNDT +%x BOUNDX +%x BOUNDY +%x BOUNDZ +%X READSOURCE +%x SOURCEFORMAT +%x SOURCEFILE +%x SOURCETS +%x SOURCETYPE +%x PROPSPLIT +%x NOSAMPLES +%x RELPREC +%x REVCHECK +%x REVINT +%x DEBUG +%x GMRESM +%x GMRESDRNEV +%x REPRORND +%x SLOPPYPREC +%x USESTOUT +%x STOUTRHO +%x STOUTITER +%x COMPUTEEVS +%x SRCLOC +%x SUBEVCG +%x NOEV +%x PRECEV +%x EO +%x BC +%x WRPROPFLAG +%x PROPTYPE +%x COMPUTEMN +%x COMPUTETS +%x MSTARSQ +%x NOSOURCESZ2 +%x INITMEASUREMENT +%x ONLINEMEAS +%x PIONNORMMEAS +%x PLOOP +%x ORIENTEDPLAQUETTESMEAS +%x GRADIENTFLOWMEAS + +%x REWEIGH +%x REWSAMPLES + +%x INITINTEGRATOR +%x INTEGRATOR + +%x GPU +%x INITGPU + +%x INITOPERATOR +%x TMOP +%x DBTMOP +%x WILSONOP +%x OVERLAPOP +%x CLOVEROP +%x DBCLOVEROP +%x TMSOLVER +%x DBTMSOLVER +%x CSWSOLVER +%x OVSOLVER + +%x INITMONOMIAL +%x DETMONOMIAL +%x CLDETMONOMIAL +%x CLDETRATMONOMIAL +%x GAUGEMONOMIAL +%x NDPOLYMONOMIAL +%x NDRATMONOMIAL +%x RATMONOMIAL +%x CLRATMONOMIAL +%x RATCORMONOMIAL +%x CLRATCORMONOMIAL +%x NDCLRATMONOMIAL +%x NDRATCORMONOMIAL +%x NDCLRATCORMONOMIAL +%x POLYMONOMIAL +%x CLPOLYMONOMIAL +%x MNAME +%x MCSTR +%x MSOLVER +%x NDMSOLVER +%x GTYPE + +%x COMMENT +%x ERROR + +%x PCOMP +%x NBCORES + +%x OMPNUMTHREADS + +%x DFLNBLOCKT +%x DFLNBLOCKX +%x DFLNBLOCKY +%x DFLNBLOCKZ + +%x DFLFIELDITER +%x DFLPOLYITER + +%x PRECONDITIONING + +%x QUDAINVERTER +%x COMPRESSION + +%x MIXCGEPS +%x MIXCGIT + + +%% +^SourceFilename{EQL} BEGIN(SOURCEFILE); +^T{EQL} BEGIN(TT); +^L{EQL} BEGIN(LL); +^LX{EQL} BEGIN(LLX); +^LY{EQL} BEGIN(LLY); +^LZ{EQL} BEGIN(LLZ); +^NRXProcs{EQL} BEGIN(NPROCX); +^NRYProcs{EQL} BEGIN(NPROCY); +^NRZProcs{EQL} BEGIN(NPROCZ); +^kappa{EQL} BEGIN(KAPPA); +^csw{EQL} BEGIN(CSW); +^2KappaMu{EQL} BEGIN(MU); +^2KappaMubar{EQL} BEGIN(MUBAR); +^2KappaEpsBar{EQL} BEGIN(EPSBAR); +^NoEigenvalues{EQL} BEGIN(NOEV); +^EigenvaluePrecision{EQL} BEGIN(PRECEV); +^seed{EQL} 
BEGIN(SEED); +^StartCondition{EQL} BEGIN(STARTCOND); +^ThermalisationSweeps{EQL} BEGIN(THERMSWEEPS); +^Measurements{EQL} BEGIN(NMEAS); +^NSave{EQL} BEGIN(NSAVE); +^GaugeFieldInFile{EQL} BEGIN(GAUGEINPUTFILE); +^RlxdStateInFile{EQL} BEGIN(RLXDINPUTFILE); +^SubtractEVForCG{EQL} BEGIN(SUBEVCG); +^WriteCheckpoints{EQL} BEGIN(WRITECP); +^CheckpointInterval{EQL} BEGIN(CPINT); +^GaugeConfigInputFile{EQL} BEGIN(GAUGEINPUTFILE); +^RlxdInputFile{EQL} BEGIN(RLXDINPUTFILE); +^InitialStoreCounter{EQL} BEGIN(NSTORE); +^StdIOProcessor{EQL} BEGIN(IOPROC); +^Indices{EQL} BEGIN(IDX); +^BCAngleT{EQL} BEGIN(BOUNDT); +^ThetaT{EQL} BEGIN(BOUNDT); +^ThetaX{EQL} BEGIN(BOUNDX); +^ThetaY{EQL} BEGIN(BOUNDY); +^ThetaZ{EQL} BEGIN(BOUNDZ); +^ReadSource{EQL} BEGIN(READSOURCE); +^UseRelativePrecision{EQL} BEGIN(RELPREC); +^ReversibilityCheck{EQL} BEGIN(REVCHECK); +^ReversibilityCheckIntervall{EQL} BEGIN(REVINT); +^DebugLevel{EQL} BEGIN(DEBUG); +^GMRESMParameter{EQL} BEGIN(GMRESM); +^GMRESDRNrEv{EQL} BEGIN(GMRESDRNEV); +^GaugeConfigReadPrecision{EQL} BEGIN(GAUGERPREC); +^GaugeConfigWritePrecision{EQL} BEGIN(GAUGEWPREC); +^DisableIOChecks{EQL} BEGIN(DSBLIOCHECK); +^ReproduceRandomNumbers{EQL} BEGIN(REPRORND); +^UseSloppyPrecision{EQL} BEGIN(SLOPPYPREC); +^UseStoutSmearing{EQL} BEGIN(USESTOUT); +^StoutRho{EQL} BEGIN(STOUTRHO); +^StoutNoIterations{EQL} BEGIN(STOUTITER); +^ComputeEVs{EQL} BEGIN(COMPUTEEVS); +^SourceLocation{EQL} BEGIN(SRCLOC); +^UseEvenOdd{EQL} BEGIN(EO); +^Bc{EQL} BEGIN(BC); +^WritePropagatorFormat{EQL} BEGIN(WRPROPFLAG); +^PropagatorType{EQL} BEGIN(WRPROPFLAG); +^RanluxdLevel{EQL} BEGIN(RLXDLEVEL); +^DeflationSubspaceDimension{EQL} BEGIN(DFLSP); +^GCRPreconditioner{EQL} BEGIN(PRECON); +^ComputeReweightingFactor{EQL} BEGIN(REWEIGH); +^NoReweightingSamples{EQL} BEGIN(REWSAMPLES); +^SourceTimeSlice{EQL} BEGIN(SOURCETS); +^SourceType{EQL} BEGIN(SOURCETYPE); +^NoSamples{EQL} BEGIN(NOSAMPLES); +^SplittedPropagator{EQL} BEGIN(PROPSPLIT); +^UsePreconditioning{EQL} BEGIN(PRECONDITIONING); 
+^UseQudaInverter{EQL} BEGIN(QUDAINVERTER); +^UseCompression{EQL} BEGIN(COMPRESSION); + +^BeginMeasurement{SPC}+ BEGIN(INITMEASUREMENT); +^ComputeModeNumber{EQL} BEGIN(COMPUTEMN); +^ComputeTopSus{EQL} BEGIN(COMPUTETS); +^MStarSq{EQL} BEGIN(MSTARSQ); +^NoSourcesZ2{EQL} BEGIN(NOSOURCESZ2); +^BeginMonomial{SPC}+ BEGIN(INITMONOMIAL); +^BeginInt BEGIN(INITINTEGRATOR); +^BeginOperator{SPC}+ BEGIN(INITOPERATOR); + +^PropagatorComparison{EQL} BEGIN(PCOMP); +^NbCoresPerNode{EQL} BEGIN(NBCORES); + +^OMPNumThreads{EQL} BEGIN(OMPNUMTHREADS); + +^NoBlocksT{EQL} BEGIN(DFLNBLOCKT); +^NoBlocksX{EQL} BEGIN(DFLNBLOCKX); +^NoBlocksY{EQL} BEGIN(DFLNBLOCKY); +^NoBlocksZ{EQL} BEGIN(DFLNBLOCKZ); + +^DflFieldIter{EQL} BEGIN(DFLFIELDITER); +^DflPolyIter{EQL} BEGIN(DFLPOLYITER); + +^BeginGPU BEGIN(INITGPU); + +^MixCGInnerEps{EQL} BEGIN(MIXCGEPS); +^MixCGMaxIter{EQL} BEGIN(MIXCGIT); + + + + + +Init{SPC}* { + if(myverbose) printf("Initialising GPU line %d\n", line_of_file); + usegpu_flag = 1; + if(myverbose!=0) printf(" Using help of GPU for inversions\n"); + BEGIN(GPU); + } +{ + {SPC}*InnersolverPrecision{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + innersolver_precision = c; + if(myverbose) printf(" Inner solver precision set to %lf line %d\n", c, line_of_file); + } + {SPC}*MaxInnersolverIteration{EQL}{DIGIT}+ { + sscanf(yytext, " %[2a-zA-Z] = %d", name, &a); + max_innersolver_it = a; + if(myverbose) printf(" Inner solver iterations set to %d line %d\n", a, line_of_file); + } + {SPC}*DeviceNum{EQL}{DIGIT}+ { + sscanf(yytext, " %[2a-zA-Z] = %d", name, &a); + device_num = a; + if(myverbose) printf(" Device Number set to %d line %d\n", a, line_of_file); + } + {SPC}*InnersolverPrecisionAbs{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + innersolver_precision_abs = c; + if(myverbose) printf(" innersolver_precision_abs set to %lf line %d\n", c, line_of_file); + } + {SPC}*InnersolverPrecisionRel{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + 
innersolver_precision_rel = c; + if(myverbose) printf(" innersolver_precision_rel set to %lf line %d\n", c, line_of_file); + } + {SPC}*InnersolverPrecisionCheckAbs{EQL}{DIGIT}+ { + sscanf(yytext, " %[2a-zA-Z] = %d", name, &a); + innersolver_precision_check_abs = a; + if(myverbose) printf(" innersolver_precision_check_abs set to %d line %d\n", a, line_of_file); + } + {SPC}*InnersolverPrecisionCheckRel{EQL}{DIGIT}+ { + sscanf(yytext, " %[2a-zA-Z] = %d", name, &a); + innersolver_precision_check_rel = a; + if(myverbose) printf(" innersolver_precision_check_rel set to %d line %d\n", a, line_of_file); + } + {SPC}*MinInnerSolverIterations{EQL}{DIGIT}+ { + sscanf(yytext, " %[2a-zA-Z] = %d", name, &a); + min_innersolver_it = a; + if(myverbose) printf(" min_innersolver_it set to %d line %d\n", a, line_of_file); + } + {SPC}*MaxMmsShifts{EQL}{DIGIT}+ { + sscanf(yytext, " %[2a-zA-Z] = %d", name, &a); + max_mms_shifts = a; + if(myverbose) printf(" max_mms_shifts set to %d line %d\n", a, line_of_file); + } + {SPC}*UseMixedMms{EQL}yes { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + use_mixed_mms = 1; + if(myverbose) printf(" Using mixed solver for smallest shifts\n"); } + {SPC}*UseMixedMms{EQL}no { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + use_mixed_mms = 0; + if(myverbose) printf(" Not using mixed solver for smallest shifts\n"); + } + EndGPUInit{SPC}* { + if(myverbose) printf("GPU parsed in line %d\n\n", line_of_file); + BEGIN(0); + } +} + + +{TYPE} { + current_operator++; + optr = &operator_list[current_operator]; + optr->id = current_operator; + optr->initialised = 0; + if(strcmp(yytext, "WILSON")==0) { + optr->type = WILSON; + } + else if(strcmp(yytext, "TMWILSON")==0) { + optr->type = TMWILSON; + } + else if(strcmp(yytext, "CLOVER")==0) { + optr->type = CLOVER; + } + else if(strcmp(yytext, "DBCLOVER")==0) { + optr->type = DBCLOVER; + } + else if(strcmp(yytext, "DBTMWILSON")==0) { + optr->type = DBTMWILSON; + } + else if(strcmp(yytext, "OVERLAP")==0) { + 
optr->type = OVERLAP; + } + else { + fprintf(stderr, "Unknown operator type %s in line %d\n", yytext, line_of_file); + exit(1); + } + if(!reread) { + if(add_operator(optr->type) < 0) { + fprintf(stderr, "Something went wrong in adding operators\nAborting...!\n"); + exit(1); + } + } + if(myverbose) printf("initialising operator with type %s (%d) line %d\n", yytext, optr->type, line_of_file); + if(myverbose) printf("operator has id %d\n", current_operator); + + if(optr->type == WILSON) BEGIN(WILSONOP); + else if(optr->type == CLOVER) BEGIN(CLOVEROP); + else if(optr->type == TMWILSON) BEGIN(TMOP); + else if(optr->type == DBTMWILSON) BEGIN(DBTMOP); + else if(optr->type == DBCLOVER) BEGIN(DBCLOVEROP); + else BEGIN(OVERLAPOP); +} + +{ + {SPC}*kappa{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + optr->kappa = c; + if(myverbose) printf(" kappa set to %f line %d operator %d\n", c, line_of_file, current_operator); + } + {SPC}*MaxSolverIterations{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + optr->maxiter = a; + if(myverbose) printf(" MaxSolverIterations set to %d line %d operator %d\n", a, line_of_file, current_operator); + } + {SPC}*PropagatorPrecision{EQL}32 { + optr->prop_precision = 32; + PropInfo.precision = 32; + if(myverbose) printf(" PropagatorPrecision set to 32 line %d operator %d\n", line_of_file, current_operator); + } + {SPC}*PropagatorPrecision{EQL}64 { + optr->prop_precision = 64; + PropInfo.precision = 64; + if(myverbose) printf(" PropagatorPrecision set to 64 line %d operator %d\n", line_of_file, current_operator); + } + {SPC}*SolverPrecision{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + optr->eps_sq = c; + (optr->solver_params).eigcg_tolsq = c; + if(myverbose) printf(" SolverPrecision set to %lf line %d operator %d\n", c, line_of_file, current_operator); + if(myverbose) printf(" EigCGtolsq set to %lf line %d operator %d\n", c, line_of_file, current_operator); + } + {SPC}*SolverRelativePrecision{EQL}yes { + 
sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + optr->rel_prec = 1; + if(myverbose) printf(" SolverRelativePrecision set to YES line %d operator %d\n", line_of_file, current_operator); + } + {SPC}*SolverRelativePrecision{EQL}no { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + optr->rel_prec = 0; + if(myverbose) printf(" SolverRelativePrecision set to NO line %d operator %d\n", line_of_file, current_operator); + } + {SPC}*mcgdelta{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z1] = %lf", name, &c); + (optr->solver_params).mcg_delta = c; + if(myverbose) printf(" mcg_delta set to %lf line %d operator %d\n", c, line_of_file, current_operator); + } + {SPC}*EigCGnrhs{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + (optr->solver_params).eigcg_nrhs = a; + if(myverbose) printf(" EigCGnrhs set to %d line %d operator %d\n", a, line_of_file, current_operator); + } + {SPC}*EigCGnrhs1{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z1] = %d", name, &a); + (optr->solver_params).eigcg_nrhs1 = a; + if(myverbose) printf(" EigCGnrhs1 set to %d line %d operator %d\n", a, line_of_file, current_operator); + } + {SPC}*EigCGnev{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + (optr->solver_params).eigcg_nev = a; + if(myverbose) printf(" EigCGnev set to %d line %d operator %d\n", a, line_of_file, current_operator); + } + {SPC}*EigCGvmax{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + (optr->solver_params).eigcg_vmax = a; + if(myverbose) printf(" EigCGvmax set to %d line %d operator %d\n", a, line_of_file, current_operator); + } + {SPC}*EigCGldh{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + (optr->solver_params).eigcg_ldh = a; + if(myverbose) printf(" EigCGldh set to %d line %d operator %d\n", a, line_of_file, current_operator); + } + {SPC}*EigCGtolsq1{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z1] = %lf", name, &c); + (optr->solver_params).eigcg_tolsq1 = c; + if(myverbose) printf(" EigCGtolsq1 set to %lf line %d operator %d\n", c, 
line_of_file, current_operator); + } + {SPC}*EigCGrestolsq{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf", name, &c); + (optr->solver_params).eigcg_restolsq = c; + if(myverbose) printf(" EigCGrestolsq set to %lf line %d operator %d\n", c, line_of_file, current_operator); + } + {SPC}*EigCGRandGuessOpt{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + (optr->solver_params).eigcg_rand_guess_opt = a; + if(myverbose) printf(" EigCGrand_guess_opt set to %d line %d operator %d\n", a, line_of_file, current_operator); + } + + {SPC}*ExtraMasses{EQL}{FLTLIST} { + char * token = NULL; + optr->no_extra_masses = 0; + double mass; + if( strtok(yytext,"\n\t =,\\") != NULL ) { + /* Implicitly drop the first token, it is "ExtraMasses" */ + token = strtok(NULL," =,\t"); + + while( token != NULL ) { + if( optr->no_extra_masses >= MAX_EXTRA_MASSES ) { + yy_fatal_error(" CGMMS maximum number of extra masses reached. Increase MAX_EXTRA_MASSES in global.h!"); + } + sscanf(token,"%lf",&mass); + optr->extra_masses[optr->no_extra_masses] = mass; + if(myverbose) printf(" CGMMS extra mass %d = %lf (2*kappa*mu) added line %d operator %d\n",optr->no_extra_masses,mass,line_of_file,current_operator); + ++(optr->no_extra_masses); + token = strtok(NULL," =,\t"); + } + } else { + yy_fatal_error(" CGMMS Failed to read ExtraMasses input line in configuration file!"); + } + + if( optr->no_extra_masses == 0 ) { + yy_fatal_error(" CGMMS Failed to add any extra masses. 
Input line must be malformed."); + } else { + if(myverbose) printf(" CGMMS %d extra masses added line %d operator %d\n",optr->no_extra_masses,line_of_file,current_operator); + } + } + {SPC}*ExtraMasses{EQL}{FILENAME} { + /* The right-hand side is a file name: every value read from that file is appended to optr->extra_masses and counted in optr->no_extra_masses. */ + FILE * ifs; + double tempmass = 0.0; + optr->no_extra_masses = 0; + char * token = NULL; + if( strtok(yytext,"\n\t =,\\") != NULL ) { + /* drop the first token, it is ExtraMasses */ + token = strtok(NULL," =\t"); + if( token != NULL ) { + printf(" CGMMS Reading extra masses input file %s\n",token); + if ((ifs = fopen(token, "r")) != NULL) { + /* fscanf returns 0 (not EOF) for a token it cannot convert and leaves it unread; accept a value only when exactly one conversion succeeded, so a malformed file cannot keep re-adding the previous mass until the MAX_EXTRA_MASSES check fires. */ + while ( fscanf( ifs, "%lf", &tempmass ) == 1 ) { + if( optr->no_extra_masses >= MAX_EXTRA_MASSES ) { + yy_fatal_error(" CGMMS maximum number of extra masses reached. Increase MAX_EXTRA_MASSES in global.h!"); + } + optr->extra_masses[optr->no_extra_masses] = tempmass; + if (myverbose) { + printf(" CGMMS Extra mass %d = %lf (2*kappa*mu) added line %d operator %d\n", + optr->no_extra_masses, optr->extra_masses[optr->no_extra_masses],line_of_file,current_operator); + } + ++(optr->no_extra_masses); + } + if( optr->no_extra_masses == 0 ) { + yy_fatal_error(" CGMMS Failed to add any extra masses. 
Extra masses input file must be malformed!"); + } + else { + if(myverbose) + printf(" CGMMS %d extra masses added line %d operator %d\n",optr->no_extra_masses,line_of_file,current_operator); + } + fclose(ifs); + } + else { + fprintf(stderr, "Could not open extra masses input file %s\n",token); + optr->no_extra_masses=0; + } + } + } + } + ^EndOperator{SPC}* { + if(myverbose) printf("operator %d parsed line %d\n\n", current_operator, line_of_file); + BEGIN(0); + } +} + +{ + {SPC}*Solver{EQL} { + name_caller = YY_START; + BEGIN(TMSOLVER); + } +} + +{ + {SPC}*Solver{EQL} { + name_caller = YY_START; + BEGIN(CSWSOLVER); + } +} + + +{ + {SPC}*UseEvenOdd{EQL}yes { + if(myverbose) printf(" Use even/odd preconditioning line %d operator %d\n", line_of_file, current_operator); + optr->even_odd_flag = 1; + } + {SPC}*UseEvenOdd{EQL}no { + if(myverbose) printf(" Do not use even/odd preconditioning line %d operator %d\n", line_of_file, current_operator); + optr->even_odd_flag = 0; + } +} + +{ + {SPC}*2KappaMubar{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + optr->mubar = c; + if(myverbose) printf(" 2KappaMubar set to %f line %d operator %d\n", c, line_of_file, current_operator); + } + {SPC}*2KappaEpsbar{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + optr->epsbar = c; + if(myverbose) printf(" 2KappaEpsbar set to %f line %d operator %d\n", c, line_of_file, current_operator); + } + {SPC}*Solver{EQL} { + name_caller = YY_START; + BEGIN(DBTMSOLVER); + } +} + +{ + {SPC}*2KappaMu{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + optr->mu = c; + if(myverbose) printf(" 2KappaMu set to %f line %d operator %d\n", c, line_of_file, current_operator); + } + {SPC}*AddDownPropagator{EQL}yes { + optr->DownProp = 1; + if(myverbose) printf(" Invert for + and - mu set in line %d operator %d\n", line_of_file, current_operator); + } + {SPC}*AddDownPropagator{EQL}no { + optr->DownProp = 0; + if(myverbose) printf(" Don't invert for + and - mu set in line %d 
operator %d\n", line_of_file, current_operator); + } +} + +{ + {SPC}*csw{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + optr->c_sw = c; + if(myverbose) printf(" Set c_sw set to %lf in line %d for operator %d\n", optr->c_sw, line_of_file, current_operator); + } +} + +{ + {SPC}*UseQudaInverter{EQL}yes { + if(myverbose) printf(" Use Quda inverter line %d operator %d\n", line_of_file, current_operator); + optr->external_inverter = QUDA_INVERTER; + } + {SPC}*UseQudaInverter{EQL}no { + if(myverbose) printf(" Do not use Quda inverter line %d operator %d\n", line_of_file, current_operator); + optr->external_inverter = NO_EXT_INV; + } + {SPC}*UseSloppyPrecision{EQL}yes { + /* yes, float and single all select SLOPPY_SINGLE */ + if(myverbose) printf(" Use use sloppy precision (single) in the inverter (if supported by the inverter) line %d operator %d\n", line_of_file, current_operator); + optr->sloppy_precision = SLOPPY_SINGLE; + } + {SPC}*UseSloppyPrecision{EQL}float { + if(myverbose) printf(" Use use sloppy precision (single) in the inverter (if supported by the inverter) line %d operator %d\n", line_of_file, current_operator); + optr->sloppy_precision = SLOPPY_SINGLE; + } + {SPC}*UseSloppyPrecision{EQL}single { + if(myverbose) printf(" Use use sloppy precision (single) in the inverter (if supported by the inverter) line %d operator %d\n", line_of_file, current_operator); + optr->sloppy_precision = SLOPPY_SINGLE; + } + {SPC}*UseSloppyPrecision{EQL}no { + /* no and double both select SLOPPY_DOUBLE; the log line must not claim single precision */ + if(myverbose) printf(" Do not use sloppy precision (double) in the inverter line %d operator %d\n", line_of_file, current_operator); + optr->sloppy_precision = SLOPPY_DOUBLE; + } + {SPC}*UseSloppyPrecision{EQL}double { + if(myverbose) printf(" Do not use sloppy precision (double) in the inverter line %d operator %d\n", line_of_file, current_operator); + optr->sloppy_precision = SLOPPY_DOUBLE; + } + {SPC}*UseSloppyPrecision{EQL}half { + if(myverbose) printf(" Use use sloppy precision (half) in the inverter (if supported by the inverter) line %d operator %d\n", 
line_of_file, current_operator); + optr->sloppy_precision = SLOPPY_HALF; + } + {SPC}*UseCompression{EQL}12 { + if(myverbose) printf(" Use 12 compression in the inverter (if supported) line %d operator %d\n", line_of_file, current_operator); + optr->compression_type = COMPRESSION_12; + } + {SPC}*UseCompression{EQL}8 { + if(myverbose) printf(" Use 8 compression in the inverter (if supported) line %d operator %d\n", line_of_file, current_operator); + optr->compression_type = COMPRESSION_8; + } + {SPC}*UseCompression{EQL}18 { + if(myverbose) printf(" Not using compression in the inverter line %d operator %d\n", line_of_file, current_operator); + optr->compression_type = NO_COMPRESSION; + } +} + +{ + {SPC}*Solver{EQL} { + BEGIN(OVSOLVER); + } + {SPC}*m{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + optr->m = c; + if(myverbose) printf(" m set to %f line %d operator %d\n", c, line_of_file, current_operator); + } + {SPC}*s{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + optr->s = c; + if(myverbose) printf(" s set to %f line %d operator %d\n", c, line_of_file, current_operator); + } + {SPC}*DegreeOfPolynomial{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + optr->deg_poly = a; + if(myverbose) printf(" DegreeOfPolynomial set to %d line %d operator %d\n", a, line_of_file, current_operator); + } + {SPC}*NoKernelEigenvalues{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + optr->no_ev = a; + if(myverbose) printf(" NoKernelEigenvalues set to %d line %d operator %d\n", a, line_of_file, current_operator); + } + {SPC}*KernelEigenvaluePrecision{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + optr->ev_prec = c; + if(myverbose) printf(" KernelEigenvaluePrecision set to %f line %d operator %d\n", c, line_of_file, current_operator); + } + {SPC}*KernelEigenvectorsReadWrite{EQL}yes { + optr->ev_readwrite = 1; + if(myverbose) printf(" KernelEigenvectorsReadWrite set to 1 line %d operator %d\n", line_of_file, 
current_operator); + } + {SPC}*KernelEigenvectorsReadWrite{EQL}no { + optr->ev_readwrite = 0; + if(myverbose) printf(" KernelEigenvectorsReadWrite set to 0 line %d operator %d\n", line_of_file, current_operator); + } +} + +{ + cg { + /* use the named constant like every other solver rule instead of a bare 1 (NOTE(review): assumes CG is the value the literal stood for - confirm against the solver type definitions) */ + optr->solver=CG; + if(myverbose) printf(" Solver set to CG line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } + rgmixedcg { + optr->solver=RGMIXEDCG; + if(myverbose) printf(" Solver set to RGMixedCG line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } +} + +{ + mixedcg { + optr->solver=MIXEDCG; + if(myverbose) printf(" Solver set to MixedCG line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } + bicgstab { + optr->solver=BICGSTAB; + if(myverbose) printf(" Solver set to BiCGstab line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } + pcg { + optr->solver=PCG; + if(myverbose) printf(" Solver set to PCG line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } + gmres { + optr->solver=GMRES; + if(myverbose) printf(" Solver set to GMRES line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } + gcr { + optr->solver=GCR; + if(myverbose) printf(" Solver set to GCR line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } + gmresdr { + optr->solver=GMRESDR; + if(myverbose) printf(" Solver set to GMRES-DR line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } + cgs { + optr->solver=CGS; + if(myverbose) printf(" Solver set to CGS line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } + mr { + optr->solver=MR; + if(myverbose) printf(" Solver set to MR line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } + fgmres { + optr->solver=FGMRES; + if(myverbose) printf(" Solver set to FGMRES line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } + 
dflgcr { + optr->solver=DFLGCR; + g_dflgcr_flag = 1; + if(myverbose) printf(" Solver set to DFL-GCR line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } + dflfgmres { + optr->solver=DFLFGMRES; + g_dflgcr_flag = 1; + if(myverbose) printf(" Solver set to DFL-FGMRES line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } + cgmms { + optr->solver = CGMMS; + if(myverbose) printf(" Solver set to CGMMS line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } + increigcg { + optr->solver = INCREIGCG; + if(myverbose) printf(" Solver set to INCR-EIG-CG line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } + +} + + +{ + increigcg { + optr->solver = INCREIGCG; + if(myverbose) printf(" Solver set to INCR-EIG-CG line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } + mixedcg { + optr->solver=MIXEDCG; + if(myverbose) printf(" Solver set to MixedCG line %d operator %d\n", line_of_file, current_operator); + BEGIN(name_caller); + } +} + + + +{ + sumr { + optr->solver = SUMR; + if(myverbose) printf(" Solver set to SUMR in line %d operator %d\n", line_of_file, current_operator); + BEGIN(OVERLAPOP); + } + cg { + optr->solver = CG; + if(myverbose) printf(" Solver set to CG in line %d operator %d\n", line_of_file, current_operator); + BEGIN(OVERLAPOP); + } +} + +{TYPE} { + current_monomial++; + mnl = &monomial_list[current_monomial]; + mnl->id = current_monomial; + if(strcmp(yytext, "DET")==0) { + mnl->type = DET; + strcpy((*mnl).name, "DET"); + } + else if(strcmp(yytext, "CLOVERDET")==0) { + mnl->type = CLOVERDET; + strcpy((*mnl).name, "CLOVERDET"); + } + else if(strcmp(yytext, "CLOVERDETRATIO")==0) { + mnl->type = CLOVERDETRATIO; + strcpy((*mnl).name, "CLOVERDETRATIO"); + } + else if(strcmp(yytext, "DETRATIO")==0) { + mnl->type = DETRATIO; + strcpy((*mnl).name, "DETRATIO"); + } + else if(strcmp(yytext, "NDDETRATIO")==0) { + mnl->type = NDDETRATIO; 
+ strcpy((*mnl).name, "NDDETRATIO"); + g_running_phmc = 1; + } + else if(strcmp(yytext, "NDPOLY")==0) { + mnl->type = NDPOLY; + strcpy((*mnl).name, "NDPOLY"); + g_running_phmc = 1; + } + else if(strcmp(yytext, "NDRAT")==0) { + mnl->type = NDRAT; + strcpy((*mnl).name, "NDRAT"); + g_running_phmc = 1; + } + else if(strcmp(yytext, "RAT")==0) { + mnl->type = RAT; + strcpy((*mnl).name, "RAT"); + g_running_phmc = 1; + } + else if(strcmp(yytext, "CLOVERRAT")==0) { + mnl->type = CLOVERRAT; + strcpy((*mnl).name, "CLOVERRAT"); + g_running_phmc = 1; + } + else if(strcmp(yytext, "RATCOR")==0) { + mnl->type = RATCOR; + strcpy((*mnl).name, "RATCOR"); + g_running_phmc = 1; + } + else if(strcmp(yytext, "CLOVERRATCOR")==0) { + mnl->type = CLOVERRATCOR; + strcpy((*mnl).name, "CLOVERRATCOR"); + g_running_phmc = 1; + } + else if(strcmp(yytext, "NDCLOVERRAT")==0) { + mnl->type = NDCLOVERRAT; + strcpy((*mnl).name, "NDCLOVERRAT"); + g_running_phmc = 1; + } + else if(strcmp(yytext, "NDRATCOR")==0) { + mnl->type = NDRATCOR; + strcpy((*mnl).name, "NDRATCOR"); + g_running_phmc = 1; + } + else if(strcmp(yytext, "NDCLOVERRATCOR")==0) { + mnl->type = NDCLOVERRATCOR; + strcpy((*mnl).name, "NDCLOVERRATCOR"); + g_running_phmc = 1; + } + else if(strcmp(yytext, "NDCLOVER")==0) { + mnl->type = NDCLOVER; + strcpy((*mnl).name, "NDCLOVER"); + g_running_phmc = 1; + } + else if(strcmp(yytext, "POLY")==0) { + mnl->type = POLY; + strcpy((*mnl).name, "POLY"); + } + else if(strcmp(yytext, "POLYDETRATIO")==0) { + mnl->type = POLYDETRATIO; + strcpy((*mnl).name, "POLYDETRATIO"); + } + else if(strcmp(yytext, "GAUGE")==0) { + mnl->type = GAUGE; + mnl->gtype = 3; + strcpy((*mnl).name, "GAUGE"); + } + else { + fprintf(stderr, "Unknown monomial type %s in line %d\n", yytext, line_of_file); + exit(1); + } + if(!reread) { + if(add_monomial(mnl->type) < 0) { + fprintf(stderr, "Something went wrong in adding monomials\nAborting...!\n"); + exit(1); + } + } + if(myverbose) printf("initialising monomial with type %s %d line 
%d\n", yytext, mnl->type, line_of_file); + if(myverbose) printf("monomial has id %d\n", current_monomial); + + if(mnl->type == GAUGE) BEGIN(GAUGEMONOMIAL); + else if(mnl->type == NDPOLY) BEGIN(NDPOLYMONOMIAL); + else if(mnl->type == NDCLOVER) BEGIN(CLPOLYMONOMIAL); + else if(mnl->type == NDRAT) BEGIN(NDRATMONOMIAL); + else if(mnl->type == RAT) BEGIN(RATMONOMIAL); + else if(mnl->type == NDCLOVERRAT) BEGIN(NDCLRATMONOMIAL); + else if(mnl->type == CLOVERRAT) BEGIN(CLRATMONOMIAL); + else if(mnl->type == NDRATCOR) BEGIN(NDRATCORMONOMIAL); + else if(mnl->type == RATCOR) BEGIN(RATCORMONOMIAL); + else if(mnl->type == NDCLOVERRATCOR) BEGIN(NDCLRATCORMONOMIAL); + else if(mnl->type == CLOVERRATCOR) BEGIN(CLRATCORMONOMIAL); + else if(mnl->type == POLY || mnl->type == POLYDETRATIO) { + fprintf(stderr,"starting to parse poly(detratio) monomial\n"); + BEGIN(POLYMONOMIAL); + } + else if(mnl->type == CLOVERDET) BEGIN(CLDETMONOMIAL); + else if(mnl->type == CLOVERDETRATIO) BEGIN(CLDETRATMONOMIAL); + else BEGIN(DETMONOMIAL); +} + + + +{ + {SPC}*Timescale{EQL}{DIGIT}+ { + /* NDDETRATIO ignores the value in the input file and is pinned to timescale -5; every other monomial takes the parsed integer. */ + if(mnl->type == NDDETRATIO) { + mnl->timescale = -5; + /* report the value actually stored, not the scratch variable a, which still holds an earlier parse result here */ + if(myverbose) printf(" timescales set to %d line %d monomial %d since NDDETRATIO is not for MD evolution\n", mnl->timescale, line_of_file, current_monomial); + } + else { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + mnl->timescale = a; + if(myverbose) printf(" timescales set to %d line %d monomial %d\n", a, line_of_file, current_monomial); + } + } + {SPC}*Name{EQL} { + name_caller = YY_START; + BEGIN(MNAME); + } + ^EndMonomial{SPC}* { + if(myverbose) printf("monomial %d parsed line %d\n\n", current_monomial, line_of_file); + BEGIN(0); + } +} + +{ + {SPC}*CSW{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf", name, &c); + mnl->c_sw = c; + if(myverbose) printf(" CSW set to %f line %d monomial %d\n", c, line_of_file, current_monomial); + } +} + +{ + {SPC}*rho{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf", name, &c); + mnl->rho = c; + if(myverbose) printf(" mass shift rho set 
to %f line %d monomial %d\n", c, line_of_file, current_monomial); + } +} + +{ + {SPC}*rho2{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z]2 = %lf", name, &c); + mnl->rho2 = c; + if(myverbose) printf(" mass shift rho2 set to %f line %d monomial %d\n", c, line_of_file, current_monomial); + } +} + +{ + {SPC}*Kappa{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf", name, &c); + mnl->kappa = c; + if(myverbose) printf(" Kappa set to %f line %d monomial %d\n", c, line_of_file, current_monomial); + } +} + +{ + {SPC}*2KappaMu2{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + mnl->mu2 = c; + if(myverbose) printf(" 2KappaMu2 set to %f line %d monomial %d\n", c, line_of_file, current_monomial); + } + {SPC}*Kappa2{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + mnl->kappa2 = c; + if(myverbose) printf(" kappa2 set to %f line %d monomial %d\n", c, line_of_file, current_monomial); + } +} + +{ + {SPC}*AddTrLog{EQL}yes { + mnl->trlog = 1; + if(myverbose) printf(" Added trlog monomial in line %d to monomial %d\n", line_of_file, current_monomial); + } + {SPC}*AddTrLog{EQL}no { + mnl->trlog = 0; + if(myverbose) printf(" No trlog monomial (default) in line %d for monomial %d\n", line_of_file, current_monomial); + } +} + +{ + {SPC}*2KappaMubar{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + mnl->mubar = c; + if(myverbose) printf(" 2KappaMubar set to %f line %d monomial %d\n", c, line_of_file, current_monomial); + } + {SPC}*2KappaEpsbar{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + mnl->epsbar = c; + if(myverbose) printf(" 2KappaEpsbar set to %f line %d monomial %d\n", c, line_of_file, current_monomial); + } +} + +{ + {SPC}*ForcePrecision{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf",name , &c); + mnl->forceprec = c; + if(myverbose) printf(" ForcePrecision set to %e line %d monomial %d\n", c, line_of_file, current_monomial); + } + {SPC}*AcceptancePrecision{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf",name , &c); + mnl->accprec = c; 
+ if(myverbose) printf(" AcceptancePrecision set to %e line %d monomial %d\n", c, line_of_file, current_monomial); + } + {SPC}*MaxSolverIterations{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + mnl->maxiter = a; + if(myverbose) printf(" MaxSolverIterations set to %d line %d monomial %d\n", a, line_of_file, current_monomial); + } + {SPC}*mcgdelta{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z1] = %lf", name, &c); + (mnl->solver_params).mcg_delta = c; + if(myverbose) printf(" mcg_delta set to %lf line %d monomial %d\n", c, line_of_file, current_monomial); + } +} + +{ + {SPC}*Solver{EQL} { + solver_caller=YY_START; + BEGIN(NDMSOLVER); + } +} + +{ + {SPC}*2KappaMu{EQL}{FLT} { + sscanf(yytext, " %[2a-zA-Z] = %lf", name, &c); + mnl->mu = c; + if(myverbose) printf(" 2KappaMu set to %f line %d monomial %d\n", c, line_of_file, current_monomial); + } + {SPC}*CSGHistory{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + mnl->csg_N = a; + if(myverbose) printf(" csg history length set to %d line %d monomial %d\n", a, line_of_file, current_monomial); + } + {SPC}*CSGHistory2{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z2] = %d", name , &a); + mnl->csg_N2 = a; + if(myverbose) printf(" csg history2 length (for bicgstab) set to %d line %d monomial %d\n", + a, line_of_file, current_monomial); + } + {SPC}*Solver{EQL} { + solver_caller=YY_START; + BEGIN(MSOLVER); + } +} + +{ + {SPC}*Type{EQL} BEGIN(GTYPE); + {SPC}*UseRectangleStaples{EQL}yes { + mnl->use_rectangles = 1; + g_dbw2rand = 1; + if(myverbose) printf(" UseRectangleStaples set to true line %d monomial %d\n", line_of_file, current_monomial); + } + {SPC}*UseRectangleStaples{EQL}no { + mnl->use_rectangles = 0; + /* g_dbw2rand = 0; */ + mnl->c1 = 0.; + g_rgi_C1 = 0.; + if(myverbose) printf(" UseRectangleStaples set to false line %d monomial %d\n", line_of_file, current_monomial); + } + {SPC}*Beta{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf",name , &c); + mnl->beta = c; + g_beta = c; + if(myverbose) printf(" 
beta set to %e line %d monomial %d\n", c, line_of_file, current_monomial); + } + {SPC}*RectangleCoefficient{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf",name , &c); + mnl->c1 = c; + g_rgi_C1 = c; + if(myverbose) printf(" RectangleCoefficient c1 set to %e line %d monomial %d\n", c, line_of_file, current_monomial); + } + {SPC}*Lambda{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf",name , &c); + mnl->glambda = c; + if(myverbose) printf(" Gauge lambda parameter (Wilson plaquette only) set to %e line %d monomial %d\n", c, line_of_file, current_monomial); + } +} + +{ + {SPC}*ExactPolynomial{EQL}yes { + phmc_exact_poly = 1; + if(myverbose!=0) printf(" phmc_exact_poly set to true line %d monomial %d\n", line_of_file, current_monomial); + } + {SPC}*ExactPolynomial{EQL}no { + phmc_exact_poly = 0; + if(myverbose!=0) printf(" phmc_exact_poly set to false line %d monomial %d\n", line_of_file, current_monomial); + } +} +{ + {SPC}*Cmin{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + mnl->rat.crange[0] = a; + if(myverbose!=0) printf(" Coefficient range of rational starts at coefficient %d line %d monomial %d\n", a, line_of_file, current_monomial); + } + {SPC}*Cmax{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + mnl->rat.crange[1] = a; + if(myverbose!=0) printf(" Coefficient range of rational ends at coefficient %d line %d monomial %d\n", a, line_of_file, current_monomial); + } +} + +{ + {SPC}*DegreeOfRational{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + mnl->rat.order = a; + if(myverbose!=0) printf(" Degree of rational approximation set to %d line %d monomial %d\n", a, line_of_file, current_monomial); + } +} +{ + {SPC}*StildeMax{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf",name , &c); + stilde_max = c; + mnl->StildeMax = c; + mnl->rat.range[1] = c; + if(myverbose!=0) printf(" Stilde max set to %e line %d monomial %d\n", c, line_of_file, current_monomial); + } + {SPC}*StildeMin{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = 
%lf",name , &c); + mnl->StildeMin = c; + stilde_min = c; + mnl->rat.range[0] = c; + if(myverbose!=0) printf(" Stilde min set to %e line %d monomial %d\n", c, line_of_file, current_monomial); + } + {SPC}*ComputeEVFreq{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + mnl->rec_ev = a; + if(myverbose!=0) printf(" Frequency for computing EV's set to %d in line %d monomial %d\n", mnl->rec_ev, line_of_file, current_monomial); + } +} +{ + {SPC}*MaxPtildeDegree{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + mnl->MaxPtildeDegree = a; + phmc_max_ptilde_degree = a; + if(myverbose!=0) printf(" Maximal Degree of Ptilde set to %d line %d monomial %d\n", mnl->MaxPtildeDegree, line_of_file, current_monomial); + } + {SPC}*DegreeOfMDPolynomial{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + mnl->MDPolyDegree = a; + degree_of_p = a; + if(myverbose!=0) printf(" Degree of MD polynomial set to %d line %d monomial %d\n", a, line_of_file, current_monomial); + } + {SPC}*PrecisionPtilde{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf",name , &c); + mnl->PrecisionPtilde = c; + if(myverbose!=0) printf(" Precision for Ptilde set to %e line %d monomial %d\n", c, line_of_file, current_monomial); + } + {SPC}*PrecisionHfinal{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf",name , &c); + mnl->PrecisionHfinal = c; + if(myverbose!=0) printf(" Precision for final H set to %e line %d monomial %d\n", c, line_of_file, current_monomial); + } + {SPC}*ComputeOnlyEVs{EQL}yes { + phmc_compute_evs=1; + if(myverbose!=0) printf(" Compute only heavy EVs set to true line %d monomial %d\n", line_of_file, current_monomial); + } + {SPC}*ComputeOnlyEVs{EQL}no { + phmc_compute_evs=0; + if(myverbose!=0) printf(" Compute only heavy EVs set to false line %d monomial %d\n", line_of_file, current_monomial); + } +} + +{ + {SPC}*Degree{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + mnl->MDPolyDegree = a; + if(myverbose!=0) printf(" Degree of degenerate MD 
polynomial set to %d line %d monomial %d\n", mnl->MDPolyDegree, line_of_file, current_monomial); + } + {SPC}*Lmin{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf", name, &c); + mnl->MDPolyLmin = c; + if(myverbose!=0) + printf(" lower bound of degenerate MD polynomial set to %f line %d monomial %d\n", + mnl->MDPolyLmin, line_of_file, current_monomial); + } + {SPC}*Lmax{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf", name, &c); + mnl->MDPolyLmax = c; + if(myverbose!=0) + printf(" upper bound of degenerate MD polynomial set to %f line %d monomial %d\n", + mnl->MDPolyLmax, line_of_file, current_monomial); + } +} + + +{ + {SPC}*LocNormConst{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf", name, &c); + mnl->MDPolyLocNormConst = c; + if(myverbose!=0) + printf(" local normalisation constant MD polynomial set to %f line %d monomial %d\n", + mnl->MDPolyLocNormConst, line_of_file, current_monomial); + } + {SPC}*RootsFile{EQL} { + cstring_to_parse=mnl->MDPolyRootsFile; + cstring_caller = YY_START; + BEGIN(MCSTR); + } +} + + +{NAME} { + if(myverbose) printf(" monomial named \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial); + strcpy((*mnl).name, yytext); + BEGIN(name_caller); +} + +{CSTR} { + if(myverbose) printf(" monomial named \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial); + strcpy(cstring_to_parse, yytext); + rmQuotes(cstring_to_parse); + /* reset variable */ + cstring_to_parse=NULL; + BEGIN(cstring_caller); +} + + +{ + CG { + if(myverbose) printf(" Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial); + mnl->solver = CG; + BEGIN(solver_caller); + } + mixedCG { + if(myverbose) printf(" Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial); + mnl->solver = MIXEDCG; + BEGIN(solver_caller); + } + rgmixedCG { + if(myverbose) printf(" Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial); + mnl->solver = RGMIXEDCG; + BEGIN(solver_caller); + 
} + bicgstab { + if(myverbose) printf(" Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial); + mnl->solver = BICGSTAB; + BEGIN(solver_caller); + } +} + +{ + cgmmsnd { + if(myverbose) printf(" Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial); + mnl->solver = CGMMSND; + BEGIN(solver_caller); + } + mixedCGmmsnd { + if(myverbose) printf(" Solver set to \"%s\" line %d monomial %d\n", yytext, line_of_file, current_monomial); + mnl->solver = MIXEDCGMMSND; + BEGIN(solver_caller); + } +} + +{ + Wilson { + mnl->gtype = 0; + mnl->c1 = 0.; + mnl->use_rectangles = 0; + g_rgi_C1 = 0.; + /* g_dbw2rand = 0; */ + BEGIN(GAUGEMONOMIAL); + } + tlsym { + mnl->gtype = 1; + mnl->c1 = -0.083333333; + g_rgi_C1 = -0.083333333; + mnl->use_rectangles = 1; + g_dbw2rand = 1; + BEGIN(GAUGEMONOMIAL); + } + Iwasaki { + mnl->gtype = 2; + mnl->c1 = -0.331; + g_rgi_C1 = -0.331; + mnl->use_rectangles = 1; + g_dbw2rand = 1; + BEGIN(GAUGEMONOMIAL); + } + user { + mnl->gtype = 3; + BEGIN(GAUGEMONOMIAL); + } + DBW2 { + mnl->gtype = 4; + mnl->c1 = -1.4088; + g_rgi_C1 = -1.4088; + g_dbw2rand = 1; + mnl->use_rectangles = 1; + BEGIN(GAUGEMONOMIAL); + } +} + +egrator{SPC}* { + Integrator.no_timescales = -1; + Integrator.tau = 1.; + Integrator.monitor_forces = 0; + for(i = 0; i < 10; i++) { + Integrator.lambda[i] = _default_2mn_lambda; + Integrator.type[i] = MN2; + } + if(myverbose) printf("initialising integrator line %d\n", line_of_file); + BEGIN(INTEGRATOR); +} +{ + {SPC}*Type{DIGIT}{EQL}{TYPE} { + type = (char*)malloc(100*sizeof(char)); + sscanf(yytext, " %[a-zA-Z]%d = %s", name, &a, type); + if(strcmp(type, "LEAPFROG")==0) { + Integrator.type[a] = LEAPFROG; + } + else if(strcmp(type, "2MN")==0) { + Integrator.type[a] = MN2; + } + else if(strcmp(type, "2MNPOSITION")==0) { + Integrator.type[a] = MN2p; + } + else if(strcmp(type, "OMF4")==0) { + Integrator.type[a] = OMF4; + } + else { + fprintf(stderr, "Unknown integrator type %s in line 
%d\n", yytext, line_of_file); + exit(1); + } + + if(myverbose) printf(" timescale %d type = %s line %d\n", a, type, line_of_file); + free(type); + } + {SPC}*MonitorForces{EQL}yes { + Integrator.monitor_forces = 1; + if(myverbose) printf(" Force monitoring switched on in line %d\n", line_of_file); + BEGIN(INTEGRATOR); + } + {SPC}*MonitorForces{EQL}no { + Integrator.monitor_forces = 0; + if(myverbose) printf(" Force monitoring switched off in line %d\n", line_of_file); + BEGIN(INTEGRATOR); + } + {SPC}*IntegrationSteps{DIGIT}{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z]%d = %d", name, &a, &b); + if(myverbose) printf(" timescale %d steps=%d line %d\n", a, b, line_of_file); + Integrator.n_int[a] = b; + BEGIN(INTEGRATOR); + } + {SPC}*Lambda{DIGIT}{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z]%d = %lf", name, &a, &c); + Integrator.lambda[a] = c; + if(myverbose) printf(" timescale %d Lambda=%f line %d\n", a, c, line_of_file); + BEGIN(INTEGRATOR); + } + {SPC}*NumberOfTimescales{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + if(myverbose) printf(" Number of timescales set to %d line %d\n", a, line_of_file); + if(a > 10) { + if(g_proc_id == 0) fprintf(stderr, "maximal number of timescales is 10! 
Aborting...!\n"); + exit(-1); + } + Integrator.no_timescales = a; + BEGIN(INTEGRATOR); + } + {SPC}*Tau{EQL}{FLT} { + sscanf(yytext, " %[a-zA-Z] = %lf", name, &c); + if(myverbose) printf(" tau set to %e line %d\n", c, line_of_file); + Integrator.tau = c; + BEGIN(INTEGRATOR); + } + EndIntegrator{SPC}* { + if(Integrator.no_timescales == -1) { + fprintf(stderr, "NumberOfTimescales must be specified!\n"); + exit(1); + } + if(myverbose) printf("Integrators parsed line %d\n\n", line_of_file); + BEGIN(0); + } +} + +{ + Point { + SourceInfo.type = 0; + if(myverbose) printf("Using Point Sources\n"); + } + Volume { + SourceInfo.type = 1; + if(myverbose) printf("Using Volume Sources\n"); + } + TimeSlice { + SourceInfo.type = 2; + if(myverbose) printf("Using TimeSlice Sources\n"); + } +} +{ + yes { + PropInfo.splitted = 1; + if(myverbose) printf("Writing Propagators in seperate files\n"); + } + no { + PropInfo.splitted = 0; + if(myverbose) printf("Writing all Propagators per gauge into one single files\n"); + } +} + +{DIGIT}+ { + no_samples = atoi(yytext); + if(myverbose) printf("Inverting for %d samples\n", no_samples); +} + +{TYPE} { + if(myverbose) printf("initialising measurements line %d\n", line_of_file); + current_measurement++; + meas = &measurement_list[current_measurement]; + meas->id = current_measurement; + meas->direction = 0; + meas->max_iter = 15000; + if(strcmp(yytext, "CORRELATORS")==0) { + meas->type = ONLINE; + strcpy((*meas).name, "CORRELATORS"); + } + else if(strcmp(yytext, "PIONNORM")==0) { + meas->type = PIONNORM; + strcpy((*meas).name, "PIONNORM"); + } + else if(strcmp(yytext, "POLYAKOVLOOP")==0) { + meas->type = POLYAKOV; + strcpy((*meas).name, "POLYAKOV"); + } + else if(strcmp(yytext, "ORIENTEDPLAQUETTES")==0) { + meas->type = ORIENTED_PLAQUETTES; + strcpy(meas->name, "ORIENTEDPLAQUETTES"); + } + else if(strcmp(yytext, "GRADIENTFLOW")==0) { + meas->type = GRADIENT_FLOW; + strcpy(meas->name, "GRADIENTFLOW"); + } + else { + fprintf(stderr, "Unknown 
measurement type %s in line %d\n", yytext, line_of_file); + exit(1); + } + /*set default frequency here, in case it is not specified + in the input file */ + meas->freq = _default_measurement_freq; + if(!reread) { + if(add_measurement(meas->type) < 0) { + fprintf(stderr, "Something went wrong in adding measurements\nAborting...!\n"); + exit(1); + } + } + if(myverbose) printf("initialising measurement with type %s %d line %d\n", yytext, meas->type, line_of_file); + if(myverbose) printf("measurement has id %d\n", current_measurement); + + if(meas->type == ONLINE) BEGIN(ONLINEMEAS); + else if(meas->type == PIONNORM) BEGIN(PIONNORMMEAS); + else if(meas->type == POLYAKOV) BEGIN(PLOOP); + else if(meas->type == ORIENTED_PLAQUETTES) BEGIN(ORIENTEDPLAQUETTESMEAS); + else if(meas->type == GRADIENT_FLOW) BEGIN(GRADIENTFLOWMEAS); +} + +{ + ^EndMeasurement{SPC}* { + if(myverbose) printf("Measurement with id %d parsed in line %d\n\n", meas->id, line_of_file); + BEGIN(0); + } + {SPC}*Frequency{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + meas->freq = a; + if(myverbose!=0) printf(" Frequency for measurement with id %d set to %d\n", meas->id, meas->freq); + } +} + +{ + {SPC}*MaxSolverIterations{EQL}{DIGIT}+ { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + meas->max_iter = a; + if(myverbose) printf(" MaxSolverIterations set to %d line %d measurement id=%d\n", a, line_of_file, meas->id); + } +} + +{ + {SPC}*Direction{EQL}[03] { + sscanf(yytext, " %[a-zA-Z] = %d", name, &a); + meas->direction = a; + if(myverbose!=0) fprintf(stderr, " Direction for polyakov loop set to %d\n", meas->direction); + } +} + +{DIGIT}+ { +#ifndef FIXEDVOLUME + T_global = atoi(yytext); + if(myverbose!=0) printf("T =%s\n", yytext); +#endif +} +{DIGIT}+ { +#ifndef FIXEDVOLUME + L = atoi(yytext); + if(myverbose!=0) printf("L =%s\n", yytext); +#endif +} +{DIGIT}+ { +#ifndef FIXEDVOLUME + LX = atoi(yytext); + if(myverbose!=0) printf("LX =%s\n", yytext); +#endif +} +{DIGIT}+ { +#ifndef 
FIXEDVOLUME + LY = atoi(yytext); + if(myverbose!=0) printf("LY =%s\n", yytext); +#endif +} +{DIGIT}+ { +#ifndef FIXEDVOLUME + LZ = atoi(yytext); + if(myverbose!=0) printf("LZ =%s\n", yytext); +#endif +} +{DIGIT}+ { +#ifndef FIXEDVOLUME + N_PROC_X = atoi(yytext); + if(myverbose!=0) printf("Nr of processors in x direction = %s\n", yytext); +#endif +} +{DIGIT}+ { +#ifndef FIXEDVOLUME + N_PROC_Y = atoi(yytext); + if(myverbose!=0) printf("Nr of processors in y direction = %s\n", yytext); +#endif +} +{DIGIT}+ { +#ifndef FIXEDVOLUME + N_PROC_Z = atoi(yytext); + if(myverbose!=0) printf("Nr of processors in z direction = %s\n", yytext); +#endif +} +{DIGIT}+ { + propagator_comparison=atoi(yytext); + if(myverbose!=0) printf("propagator_comparison = %s \n", yytext); +} +{DIGIT}+ { + nb_cores=atoi(yytext); + if(myverbose!=0) printf("nb_cores = %s \n", yytext); +} +{DIGIT}+ { + omp_num_threads=atoi(yytext); + if(myverbose!=0) printf("omp_num_threads = %s \n", yytext); +} +{DIGIT}+ { + nblocks_t=atoi(yytext); + if(myverbose!=0) printf("nblocks_t = %s \n", yytext); +} +{DIGIT}+ { + nblocks_x=atoi(yytext); + if(myverbose!=0) printf("nblocks_x = %s \n", yytext); +} +{DIGIT}+ { + nblocks_y=atoi(yytext); + if(myverbose!=0) printf("nblocks_y = %s \n", yytext); +} +{DIGIT}+ { + nblocks_z=atoi(yytext); + if(myverbose!=0) printf("nblocks_z = %s \n", yytext); +} +{DIGIT}+ { + dfl_field_iter=atoi(yytext); + if(myverbose!=0) printf("dfl_fields_iter = %s \n", yytext); +} +{DIGIT}+ { + dfl_poly_iter=atoi(yytext); + if(myverbose!=0) printf("dfl_poly_iter = %s \n", yytext); +} +{DIGIT}+ { + random_seed=atoi(yytext); + if(myverbose!=0) printf("seed=%s \n", yytext); +} +[12] { + rlxd_level = atoi(yytext); + if(myverbose!=0) printf("RanluxdLevel set to %d \n", rlxd_level); +} +{FLT} { + g_kappa=atof(yytext); + if(myverbose!=0) printf("kappa=%s \n", yytext); +} +{FLT} { + g_mubar=atof(yytext); + if(myverbose!=0) printf("2 kappa mubar=%s \n", yytext); +} +{FLT} { + g_epsbar=atof(yytext); + 
if(myverbose!=0) printf("2 kappa epsbar=%s \n", yytext); +} +{FLT} { + g_mu1=atof(yytext); + if(myverbose!=0) printf("2 kappa mu=%s \n", yytext); +} +{FLT} { + g_c_sw=atof(yytext); + if(myverbose!=0) printf("c_sw=%lf \n", g_c_sw); +} +{ + cold { + startoption=0; + if(myverbose!=0) printf("Start Condition is %s \n",yytext); + } + hot { + startoption=1; + if(myverbose!=0) printf("Start Condition is %s \n",yytext); + } + restart { + startoption=2; + if(myverbose!=0) printf("Start Condition is %s \n",yytext); + } + continue { + startoption=3; + if(myverbose!=0) printf("Start Condition is %s \n",yytext); + } +} +{DIGIT}+ { + Ntherm=atoi(yytext); + if(myverbose!=0) printf("Nterm= %s \n",yytext); +} +{DIGIT}+ { + Nmeas=atoi(yytext); + if(myverbose!=0) printf("Nmeas= %s \n",yytext); +} +{DIGIT}+ { + Nsave=atoi(yytext); + if(myverbose!=0) printf("Nsave= %s \n",yytext); +} +{DIGIT}+ { + gmres_m_parameter = atoi(yytext); + if(myverbose!=0) printf("Use Krylov Space of size %d in GMRES \n", gmres_m_parameter); +} +{DIGIT}+ { + gmresdr_nr_ev = atoi(yytext); + if(myverbose!=0) printf("Deflate %d eigenvectors in GMRES-DR \n", gmresdr_nr_ev); +} +{DIGIT}+ { + g_N_s = atoi(yytext); + if(myverbose!=0) printf("Deflation subspace dimension set to %d \n", g_N_s); +} +{ + none { + if(myverbose!=0) printf("Using no right preconditioner \n"); + } + polynomial { + if(myverbose!=0) printf("Using polynomial as right preconditioner \n"); + } + cg { + if(myverbose!=0) printf("Using cg as right preconditioner \n"); + } +} +yes { + write_cp_flag=1; + if(myverbose!=0) printf("Write Checkpoints\n"); +} +no { + write_cp_flag=0; + if(myverbose!=0) printf("Don't write Checkpoints\n"); +} +yes { + g_disable_IO_checks = 1; + if(myverbose!=0) printf("Disable IO checks (and readback in case of Lemon IO)\n"); +} +no { + g_disable_IO_checks = 0; + if(myverbose!=0) printf("Enable IO checks (and readback in case of Lemon IO)\n"); +} +{DIGIT}+ { + cp_interval=atoi(yytext); + if(myverbose!=0) printf("Write 
Checkpoint all %s measurements\n",yytext); +} +{FILENAME} { + int length = strlen(yytext)+1; + if(length >= sizeof(gauge_input_filename)/sizeof(char) ) + yy_fatal_error("Filename GaugeConfigInputFile too long! (see read_input.l)\n"); + + strcpy(gauge_input_filename,yytext); + if(myverbose!=0) printf("Gauge Configuration input filename set to %s\n",yytext); +} +{DIGIT}+ { + nstore=atoi(yytext); + if(myverbose!=0) printf("Initial store counter set to %s\n",yytext); +} +readin { + nstore=-1; + if(myverbose!=0) printf("Trying to read InitialStoreCounter from file .nstore_counter\n"); +} +all { + g_stdio_proc = -1; + if(myverbose!=0) printf("All processors will give output to stdout\n"); +} +no { + g_stdio_proc = -2; + if(myverbose!=0) printf("No processor will give output to stdout\n"); +} +{DIGIT}+ { + g_stdio_proc = atoi(yytext); + if(myverbose!=0) printf("processor %s will give output to stdout\n", yytext); +} +{DIGIT}+ { + index_start = atoi(yytext); + index_end = index_start+1; + if((index_start < 0)||(index_start > 11)){ + printf("Error in line %d! index_start must be in [0,11]! Exiting...!\n", line_of_file); + exit(1); + } + if(myverbose!=0) printf("inverting for index %s\n", yytext); +} +{IDXEX} { + sscanf(yytext, "-%d", &index_end); + if((index_end < 0)||(index_end > 11)){ + printf("Error in line %d! index_end must be in [0,11]! Exiting...!\n", line_of_file); + exit(1); + } + if(myverbose!=0) printf("inverting up to spin-color index %d\n", index_end); + index_end+=1; +} +{FLT} { + X0 = atof(yytext); + if(myverbose != 0) printf("X0 for boundary cond. in t-direction set to %e times pi\n", X0); +} +{FLT} { + X1 = atof(yytext); + if(myverbose != 0) printf("X1 for boundary cond. in x-direction set to %e times pi\n", X1); +} +{FLT} { + X2 = atof(yytext); + if(myverbose != 0) printf("X2 for boundary cond. in y-direction set to %e times pi\n", X2); +} +{FLT} { + X3 = atof(yytext); + if(myverbose != 0) printf("X3 for boundary cond. 
in z-direction set to %e times pi\n", X3); +} +yes { + read_source_flag=1; + if(myverbose!=0) printf("Read inversion source from file\n"); +} +no { + read_source_flag=0; + if(myverbose!=0) printf("Don't read inversion source from file\n"); +} +nobutsave { + read_source_flag=2; + if(myverbose!=0) printf("Don't read inversion source from file, but save the one generated\n"); +} +{FILENAME} { + /* Store a private copy of the matched file name as base name for both source and propagator; a buffer that is already set is released first so that repeated parses do not leak it. */ + if(SourceInfo.basename != NULL) free(SourceInfo.basename); + SourceInfo.basename = (char*)malloc((strlen(yytext)+1)*sizeof(char)); + strcpy(SourceInfo.basename, yytext); + if(PropInfo.basename != NULL) free(PropInfo.basename); + PropInfo.basename = (char*)malloc((strlen(yytext)+1)*sizeof(char)); + strcpy(PropInfo.basename, yytext); + if(myverbose!=0) printf("source input filename set to %s\n",yytext); +} +etmc { + SourceInfo.format = 0; + if(myverbose!=0) printf("Using standard ETMC binary format for source input file\n"); +} +cmi { + SourceInfo.format = 11; + if(myverbose!=0) printf("Using CM format for source input file\n"); +} +gwc { + SourceInfo.format = 10; + if(myverbose!=0) printf("Using GWC format for source input file\n"); +} +{DIGIT}+ { + SourceInfo.t = atoi(yytext); + SourceInfo.automaticTS = 0; + if(myverbose!=0) printf("Using only timeslice %s of the source, padding the rest with zeros\n", yytext); +} +detect { + SourceInfo.automaticTS = 1; + if(myverbose!=0) printf("Try to detect source timeslice automatically\n"); +} +yes { + g_relative_precision_flag = 1; + if(myverbose!=0) printf("Using relative precision\n"); +} +no { + g_relative_precision_flag = 0; + if(myverbose!=0) printf("Using absolute precision\n"); +} +yes { + return_check_flag = 1; + if(myverbose!=0) printf("Perform checks of Reversibility\n"); +} +no { + return_check_flag = 0; + if(myverbose!=0) printf("Don't perform checks of Reversibility\n"); +} +{DIGIT}+ { + return_check_interval = atoi(yytext); + if(myverbose!=0) printf("Check reversibility all %d trajectories\n", return_check_interval); +} 
+{DIGIT}+ { + g_debug_level = atoi(yytext); + if(myverbose!=0) printf("Debug level = %d\n", g_debug_level); +} +32 { + gauge_precision_read_flag = 32; + if(myverbose!=0) printf("Read gauges in 32 Bit precision!\n"); +} +64 { + gauge_precision_read_flag = 64; + if(myverbose!=0) printf("Read gauges in 64 Bit precision!\n"); +} +32 { + gauge_precision_write_flag = 32; + if(myverbose!=0) printf("Save gauges in 32 Bit precision!\n"); +} +64 { + gauge_precision_write_flag = 64; + if(myverbose!=0) printf("Save gauges in 64 Bit precision!\n"); +} +yes { + reproduce_randomnumber_flag = 1; + if(myverbose!=0) printf("Use reproducable randomnumbers!\n"); +} +no { + reproduce_randomnumber_flag = 0; + if(myverbose!=0) printf("Use a different seed for each process in ranlxd!\n"); +} +yes { + g_sloppy_precision_flag = 1; + if(myverbose!=0) printf("Use sloppy precision if available!\n"); +} +no { + g_sloppy_precision_flag = 0; + if(myverbose!=0) printf("Don't use sloppy precision!\n"); +} +yes { + use_stout_flag = 1; + if(myverbose!=0) printf("Use stout smearing for invert!\n"); +} +no { + use_stout_flag = 0; + if(myverbose!=0) printf("Don't use stout smearing for invert!\n"); +} +yes { + use_preconditioning = 1; + if(myverbose) printf("Using project \"QCD-Preconditioning\"\n"); +} +no { + use_preconditioning = 0; + if(myverbose) printf("not using project \"QCD-Preconditioning\"\n"); +} +{FLT} { + stout_rho=atof(yytext); + if(myverbose!=0) printf("use stout rho=%e!\n", stout_rho); +} +{DIGIT}+ { + stout_no_iter=atoi(yytext); + if(myverbose!=0) printf("make %d stout iterations!\n", stout_no_iter); +} +yes { + compute_evs=1; /* 1: compute; the message is newline-terminated like all other option messages */ + if(myverbose!=0) printf("Compute Eigenvalues in invert.\n"); +} +no { + compute_evs=0; /* 0: do not compute */ + if(myverbose!=0) printf("Do not compute Eigenvalues in invert.\n"); +} +readin { + compute_evs=2; /* 2: only read in existing eigenvalues and vectors */ + if(myverbose!=0) printf("Try to only read in eigenvalues and vectors in invert.\n"); +} +yes { + compute_modenumber=1; + if(myverbose!=0) printf("Compute Mode Number using Spectral 
Projectors in invert.\n"); +} +no { + compute_modenumber=0; + if(myverbose!=0) printf("Do not compute Mode Number using Spectral Projectors in invert.\n"); +} +yes { + compute_topsus=1; + if(myverbose!=0) printf("Compute Topological Susceptibility using Spectral Projectors in invert.\n"); +} +no { + compute_topsus=0; + if(myverbose!=0) printf("Do not compute Topological Susceptibility using Spectral Projectors in invert.\n"); +} +{FLT} { + mstarsq = atof(yytext); + if(myverbose!=0) printf("Mstar^2 = %f \n", mstarsq); +} +{DIGIT}+ { + no_sources_z2 = atoi(yytext); + if(myverbose!=0) printf("no of Z2 random sources used for the spectral projectors method = %d\n", no_sources_z2); +} +{DIGIT}+ { + source_location=atoi(yytext); + if(myverbose!=0) printf("source_location = %s\n",yytext); +} +{FLT} { + eigenvalue_precision = atof(yytext); + if(myverbose!=0) printf("precision for eigenvalues = %e\n", eigenvalue_precision); +} +{DIGIT}+ { + no_eigenvalues = atoi(yytext); + if(myverbose!=0) printf("no of eigenvalues = %d\n", no_eigenvalues); +} +yes { + sub_evs_cg_flag = 1; + if(myverbose!=0) printf("project out eigenvector subspace\n"); +} +no { + sub_evs_cg_flag = 0; + if(myverbose!=0) printf("Do no project out eigenvector subspace\n"); +} +yes { + even_odd_flag = 1; + if(myverbose) printf("Use even/odd preconditioning\n"); +} +no { + even_odd_flag = 0; + if(myverbose) printf("Do not use even/odd preconditioning\n"); +} +yes { + bc_flag = 1; + if(myverbose) printf("Schroedinger Functional bc\n"); /* myverbose, not verbose: report on the logging process only, like every other rule */ +} +no { + bc_flag = 0; + if(myverbose) printf("Periodic bc\n"); /* myverbose, not verbose: report on the logging process only */ +} +gwc { + PropInfo.format = 10; + if(myverbose!=0) fprintf(stderr, "GWC format no longer supported for writing propagators\n"); +} +cmi { + PropInfo.format = 11; + if(myverbose!=0) fprintf(stderr, "CM format no longer supported for writing propagators\n"); +} +DiracFermion_Sink { + PropInfo.format = 0; + if(myverbose!=0) printf("Propagator type: DiracFermion_Sinks\n"); +} +DiracFermion_Source_Sink_Pairs { + 
PropInfo.format = 1; + if(myverbose!=0) printf("Propagator type: DiracFermion_Source_Sink_Pairs\n"); +} +DiracFermion_ScalarSource_TwelveSink { + PropInfo.format = 1; + fprintf(stderr, "Propagator type: DiracFermion_ScalarSource_TwelveSink, not yet supported\n"); +} +DiracFermion_ScalarSource_FourSink { + PropInfo.format = 1; + fprintf(stderr, "Propagator type: DiracFermion_ScalarSource_FourSink, not yet supported\n"); +} +yes { + reweighting_flag = 1; + if(myverbose!=0) fprintf(stderr, "Compute reweighting factor\n"); +} +no { + reweighting_flag = 0; + if(myverbose!=0) fprintf(stderr, "Do not compute reweighting factor\n"); +} +{DIGIT}+ { + reweighting_samples = atoi(yytext); + if(myverbose!=0) fprintf(stderr, "Number of reweighting samples set to %d\n", reweighting_samples); +} + +{DIGIT}+ { + mixcg_maxinnersolverit = atoi(yytext); + if(myverbose) printf("MixedCG: setting maximal inner solver iterations to %d\n", mixcg_maxinnersolverit); +} + +{FLT} { + mixcg_innereps=atof(yytext); + if(myverbose!=0) printf("MixedCG: setting inner solver eps to %s \n", yytext); +} + +<*>^# { + comment_caller = YY_START; + BEGIN(COMMENT); +} +<*>{SPC}*# { + comment_caller = YY_START; + BEGIN(COMMENT); +} +[^\n]* { + BEGIN(comment_caller); +} + + +{SPC}*\n { + line_of_file++; +} +<*>{SPC}*\n { + line_of_file++; + BEGIN(0); +} + +<*>. { + BEGIN(ERROR); +} +[^\t\n]* { + fprintf(stderr, "Parsing error in line %d\nAborting...!\n", line_of_file); + fprintf(stderr, "Could not make sense out off: %s\n", yytext); + exit(1); +} + + +%% + +/* + * Dummy (but not dumb) routine - well, function + */ + +int yywrap() +{ + return(1); +} + +/* + * This is the function to parse the input file. + * default values for all paramters will be set + * correspondig to settings in + * default_input_values.h + * + * read_input expects the filename of the input file + * as an input parameter. 
+ * + * read_input returns 2 if the input file did not exist + */ + +int read_input(char * conf_file){ + + /******************************************** + * Setting default values! + ********************************************/ + reread = 0; +#ifndef FIXEDVOLUME + T_global = _default_T_global; + L = _default_L; + LX = _default_LX; + LY = _default_LY; + LZ = _default_LZ; + N_PROC_X = _default_N_PROC_X; + N_PROC_Y = _default_N_PROC_Y; + N_PROC_Z = _default_N_PROC_Z; +#endif + propagator_comparison = 0; + nb_cores = 1; + + omp_num_threads=_default_omp_num_threads; + + nblocks_t = 1; + nblocks_x = 1; + nblocks_y = 1; + nblocks_z = 1; + + dfl_field_iter = 80; + dfl_poly_iter = 20; + + g_kappa = _default_g_kappa; + g_mubar = _default_g_mubar; + g_epsbar = _default_g_epsbar; + g_mu = _default_g_mu; + g_c_sw = _default_c_sw; + g_mu1 = _default_g_mu1; + g_mu2 = _default_g_mu2; + g_mu3 = _default_g_mu3; + g_dbw2rand = 0; + g_running_phmc = 0; + g_beta = _default_g_beta; + g_N_s = _default_g_N_s; + g_dflgcr_flag = _default_g_dflgcr_flag; + random_seed = _default_random_seed; + rlxd_level = _default_rlxd_level; + startoption = _default_startoption; + Ntherm = _default_Ntherm; + Nmeas = _default_Nmeas; + Nsave = _default_Nsave; + write_cp_flag = _default_write_cp_flag; + cp_interval = _default_cp_interval; + nstore = _default_nstore; + strcpy(rlxd_input_filename, _default_rlxd_input_filename); + strcpy(gauge_input_filename, _default_gauge_input_filename); + g_stdio_proc = _default_g_stdio_proc; + index_start = _default_index_start; + index_end = _default_index_end; + X0 = _default_X0; + X1 = _default_X1; + X2 = _default_X2; + X3 = _default_X3; + g_rgi_C1 = _default_g_rgi_C1; + read_source_flag= _default_read_source_flag; + if(SourceInfo.basename == NULL) SourceInfo.basename = (char*)malloc(100*sizeof(char)); + strcpy(SourceInfo.basename, _default_source_filename); + if(PropInfo.basename == NULL) PropInfo.basename = (char*)malloc(100*sizeof(char)); + strcpy(PropInfo.basename, 
_default_source_filename); + PropInfo.splitted = _default_propagator_splitted; + SourceInfo.splitted = _default_source_splitted; + g_relative_precision_flag = _default_g_relative_precision_flag; + return_check_flag = _default_return_check_flag; + return_check_interval = _default_return_check_interval; + g_debug_level = _default_g_debug_level; + SourceInfo.t = _default_source_time_slice; + SourceInfo.automaticTS = _default_automaticTS; + gmres_m_parameter = _default_gmres_m_parameter; + gmresdr_nr_ev = _default_gmresdr_nr_ev; + gauge_precision_read_flag = _default_gauge_precision_read_flag; + gauge_precision_write_flag = _default_gauge_precision_write_flag; + g_disable_IO_checks = _default_g_disable_IO_checks; + reproduce_randomnumber_flag = _default_reproduce_randomnumber_flag; + g_sloppy_precision_flag = _default_g_sloppy_precision_flag; + use_stout_flag = _default_use_stout_flag; + use_preconditioning = _default_use_preconditioning; + use_qudainverter = _default_use_qudainverter; + stout_rho = _default_stout_rho; + stout_no_iter = _default_stout_no_iter; + + /* check for reread ! 
*/ + phmc_compute_evs = _default_phmc_compute_evs; + compute_evs = _default_compute_evs; + stilde_min = _default_stilde_min; + stilde_max = _default_stilde_max; + degree_of_p = _default_degree_of_p; + source_location = _default_source_location; + eigenvalue_precision = _default_eigenvalue_precision; + no_eigenvalues = _default_no_eigenvalues; + sub_evs_cg_flag = _default_sub_evs_cg_flag; + phmc_exact_poly = _default_phmc_exact_poly; + even_odd_flag = _default_even_odd_flag; + bc_flag = _default_bc_flag; + SourceInfo.type = _default_source_type_flag; + no_samples = _default_no_samples; + compute_modenumber = _default_compute_modenumber; + compute_topsus = _default_compute_topsus; + mstarsq = _default_mstarsq; + no_sources_z2 = _default_no_sources_z2; + device_num = _default_device_num; + min_innersolver_it = _default_min_innersolver_it; + max_mms_shifts = _default_max_mms_shifts; + use_mixed_mms = 0; + innersolver_precision_rel = 1.e-4; + innersolver_precision_abs = 1.e-4; + + mixcg_innereps = _default_mixcg_innereps; + mixcg_maxinnersolverit = _default_mixcg_maxinnersolverit; + + /* Put -1 in PropInfo.format to see if parse_config() will + change the value. If not then set it to source_format_flag */ + PropInfo.format = -1; + /********************************************/ + + if(verbose && g_proc_id == 0) { + myverbose = 1; + } + if ((yyin = fopen(conf_file, "rt")) == NULL){ + return(2); + } + yyout = fopen("/dev/null", "w"); + + parse_config(); +#ifndef FIXEDVOLUME + if(LX == 0) { + LX = L; + } + if(LY == 0) { + LY = L; + } + if(LZ == 0) { + LZ = L; + } +#endif + + if(PropInfo.format == -1) PropInfo.format = SourceInfo.format; + g_rgi_C0 = 1. - 8.*g_rgi_C1; + + + fclose(yyout); + fclose(yyin); + return(0); +} + + +/* + * This is the function to parse the input file + * again. Only parameters are changed, that + * are specified in the input file. + * default values for paramters will not be set. 
+ * + * reread_input expects the filename of the input file + * as an input parameter. + * + * The lattice extents (T, T_global, L, LX, LY, LZ), the processor grid + * (N_PROC_X, N_PROC_Y, N_PROC_Z) and nstore keep the values they had + * before the call, whatever the input file says. + * + * reread_input returns 2 if the input file did not exist + */ + +int reread_input(char * conf_file){ +#ifndef FIXEDVOLUME + /* snapshot every geometry value the scanner rules may overwrite */ + int tt=T, ttg=T_global, ll=L, lx = LX, ly = LY, lz = LZ, + np=N_PROC_X, npy = N_PROC_Y, npz = N_PROC_Z; +#endif + int nst=nstore; + + if(verbose && g_proc_id == 0) { + myverbose = 1; + } + current_monomial = -1; + reread = 1; + + /******************************************** + * Setting default values! + ********************************************/ + + /********************************************/ + + if ((yyin = fopen(conf_file, "rt")) == NULL){ + return(2); + } + yyout = fopen("/dev/null", "w"); + + parse_config(); + +#ifndef FIXEDVOLUME + /* undo any geometry change made by the parse (T_global and N_PROC_Z included) */ + T = tt; + T_global = ttg; + L = ll; + LX = lx; + LY = ly; + LZ = lz; + N_PROC_X = np; + N_PROC_Y = npy; + N_PROC_Z = npz; +#endif + + + /* without a rectangle gauge action selected in this parse, force the rectangle coefficient to zero */ + if(g_dbw2rand == 0) { + g_rgi_C1 = 0.; + } + nstore = nst; + + g_rgi_C0 = 1. - 8.*g_rgi_C1; + g_mu = g_mu1; + + fclose(yyout); + fclose(yyin); + return(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/update_backward_gauge.c b/qcd/part_cpu/applications/QCD/src/kernel_D/update_backward_gauge.c new file mode 100644 index 0000000000000000000000000000000000000000..ef6f2ac3d55d8afe497530d4d21685523b178bec --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/update_backward_gauge.c @@ -0,0 +1,312 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include "global.h" +#include "su3.h" +#include "update_backward_gauge.h" + + +#if defined _USE_HALFSPINOR +void update_backward_gauge(su3 ** const gf) { +#ifdef OMP +#pragma omp parallel + { +#endif + + int ix=0, kb=0, iy=0; + +#ifdef OMP +#pragma omp for +#endif + for(ix = 0; ix < VOLUME/2; ix++) { + iy = (VOLUME+RAND)/2+ix; + kb = g_idn[ g_eo2lexic[iy] ][0]; + _su3_assign(g_gauge_field_copy[0][ix][0], gf[kb][0]); + kb = g_idn[ g_eo2lexic[iy] ][1]; + _su3_assign(g_gauge_field_copy[0][ix][1], gf[kb][1]); + kb = g_idn[ g_eo2lexic[iy] ][2]; + _su3_assign(g_gauge_field_copy[0][ix][2], gf[kb][2]); + kb = g_idn[ g_eo2lexic[iy] ][3]; + _su3_assign(g_gauge_field_copy[0][ix][3], gf[kb][3]); + + kb = g_idn[ g_eo2lexic[ix] ][0]; + _su3_assign(g_gauge_field_copy[1][ix][0], gf[kb][0]); + kb = g_idn[ g_eo2lexic[ix] ][1]; + _su3_assign(g_gauge_field_copy[1][ix][1], gf[kb][1]); + kb = g_idn[ g_eo2lexic[ix] ][2]; + _su3_assign(g_gauge_field_copy[1][ix][2], gf[kb][2]); + kb = g_idn[ g_eo2lexic[ix] ][3]; + _su3_assign(g_gauge_field_copy[1][ix][3], gf[kb][3]); + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + + g_update_gauge_copy = 0; + return; +} + +void update_backward_gauge_32_orphaned(su3_32 ** const gf) { + + int ix=0, kb=0, iy=0; + +#ifdef OMP +#pragma omp for +#endif + for(ix = 0; ix < VOLUME/2; ix++) { + iy = (VOLUME+RAND)/2+ix; + kb = g_idn[ g_eo2lexic[iy] ][0]; + _su3_assign(g_gauge_field_copy_32[0][ix][0], gf[kb][0]); + kb = g_idn[ g_eo2lexic[iy] ][1]; + _su3_assign(g_gauge_field_copy_32[0][ix][1], gf[kb][1]); + kb = g_idn[ g_eo2lexic[iy] ][2]; + _su3_assign(g_gauge_field_copy_32[0][ix][2], gf[kb][2]); + kb = g_idn[ g_eo2lexic[iy] ][3]; + 
_su3_assign(g_gauge_field_copy_32[0][ix][3], gf[kb][3]); + + kb = g_idn[ g_eo2lexic[ix] ][0]; + _su3_assign(g_gauge_field_copy_32[1][ix][0], gf[kb][0]); + kb = g_idn[ g_eo2lexic[ix] ][1]; + _su3_assign(g_gauge_field_copy_32[1][ix][1], gf[kb][1]); + kb = g_idn[ g_eo2lexic[ix] ][2]; + _su3_assign(g_gauge_field_copy_32[1][ix][2], gf[kb][2]); + kb = g_idn[ g_eo2lexic[ix] ][3]; + _su3_assign(g_gauge_field_copy_32[1][ix][3], gf[kb][3]); + } + +// we use the implicit barrier at the end of the single section to catch all +// threads, in the meantime, one of them modifies the global flag +#ifdef OMP +#pragma omp single + { +#endif + g_update_gauge_copy_32 = 0; +#ifdef OMP + } +#endif +} + +void update_backward_gauge_32(su3_32 ** const gf) { +#ifdef OMP +#pragma omp parallel + { +#endif + update_backward_gauge_32_orphaned(gf); +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} + +#elif _USE_TSPLITPAR + +void update_backward_gauge(su3 ** const gf) { +#ifdef OMP +#pragma omp parallel + { +#endif + + int ix=0, kb=0, kb2=0; + +#ifdef OMP +#pragma omp for +#endif + for(ix = 0; ix < VOLUME/2;ix++) { + kb2=g_eo2lexic[ix]; + _su3_assign(g_gauge_field_copyt[ix][0],gf[kb2][0]); + kb=g_idn[g_eo2lexic[ix]][0]; + _su3_assign(g_gauge_field_copyt[ix][1],gf[kb][0]); + + _su3_assign(g_gauge_field_copys[ix][0],gf[kb2][1]); + kb=g_idn[g_eo2lexic[ix]][1]; + _su3_assign(g_gauge_field_copys[ix][1],gf[kb][1]); + + _su3_assign(g_gauge_field_copys[ix][2],gf[kb2][2]); + kb=g_idn[g_eo2lexic[ix]][2]; + _su3_assign(g_gauge_field_copys[ix][3],gf[kb][2]); + + _su3_assign(g_gauge_field_copys[ix][4],gf[kb2][3]); + kb=g_idn[g_eo2lexic[ix]][3]; + _su3_assign(g_gauge_field_copys[ix][5],gf[kb][3]); + } +#ifdef OMP +#pragma omp for +#endif + for(ix = (VOLUME+RAND)/2; ix < (VOLUME+RAND)/2+VOLUME/2;ix++) { + kb2=g_eo2lexic[ix]; + _su3_assign(g_gauge_field_copyt[ix][0],gf[kb2][0]); + kb=g_idn[g_eo2lexic[ix]][0]; + _su3_assign(g_gauge_field_copyt[ix][1],gf[kb][0]); + + 
_su3_assign(g_gauge_field_copys[ix][0],gf[kb2][1]); + kb=g_idn[g_eo2lexic[ix]][1]; + _su3_assign(g_gauge_field_copys[ix][1],gf[kb][1]); + + _su3_assign(g_gauge_field_copys[ix][2],gf[kb2][2]); + kb=g_idn[g_eo2lexic[ix]][2]; + _su3_assign(g_gauge_field_copys[ix][3],gf[kb][2]); + + _su3_assign(g_gauge_field_copys[ix][4],gf[kb2][3]); + kb=g_idn[g_eo2lexic[ix]][3]; + _su3_assign(g_gauge_field_copys[ix][5],gf[kb][3]); + } + +#ifdef OMP + } /* OpenMP closing brace */ +#endif + + g_update_gauge_copy = 0; + return; +} + +#else + +void update_backward_gauge(su3 ** const gf) { +#ifdef OMP +#pragma omp parallel + { +#endif + + int ix=0, kb=0, kb2=0; + +#ifdef OMP +#pragma omp for +#endif + for(ix = 0; ix < VOLUME/2; ix++) { + kb2=g_eo2lexic[ix]; + _su3_assign(g_gauge_field_copy[ix][0],gf[kb2][0]); + kb=g_idn[g_eo2lexic[ix]][0]; + _su3_assign(g_gauge_field_copy[ix][1],gf[kb][0]); + + _su3_assign(g_gauge_field_copy[ix][2],gf[kb2][1]); + kb=g_idn[g_eo2lexic[ix]][1]; + _su3_assign(g_gauge_field_copy[ix][3],gf[kb][1]); + + _su3_assign(g_gauge_field_copy[ix][4],gf[kb2][2]); + kb=g_idn[g_eo2lexic[ix]][2]; + _su3_assign(g_gauge_field_copy[ix][5],gf[kb][2]); + + _su3_assign(g_gauge_field_copy[ix][6],gf[kb2][3]); + kb=g_idn[g_eo2lexic[ix]][3]; + _su3_assign(g_gauge_field_copy[ix][7],gf[kb][3]); + } +#ifdef OMP +#pragma omp for +#endif + for(ix = (VOLUME+RAND)/2; ix < (VOLUME+RAND)/2+VOLUME/2; ix++) { + kb2=g_eo2lexic[ix]; + _su3_assign(g_gauge_field_copy[ix][0],gf[kb2][0]); + kb=g_idn[g_eo2lexic[ix]][0]; + _su3_assign(g_gauge_field_copy[ix][1],gf[kb][0]); + + _su3_assign(g_gauge_field_copy[ix][2],gf[kb2][1]); + kb=g_idn[g_eo2lexic[ix]][1]; + _su3_assign(g_gauge_field_copy[ix][3],gf[kb][1]); + + _su3_assign(g_gauge_field_copy[ix][4],gf[kb2][2]); + kb=g_idn[g_eo2lexic[ix]][2]; + _su3_assign(g_gauge_field_copy[ix][5],gf[kb][2]); + + _su3_assign(g_gauge_field_copy[ix][6],gf[kb2][3]); + kb=g_idn[g_eo2lexic[ix]][3]; + _su3_assign(g_gauge_field_copy[ix][7],gf[kb][3]); + } + +#ifdef OMP + } /* 
OpenMP closing brace */ +#endif + + g_update_gauge_copy = 0; + return; +} + +void update_backward_gauge_32_orphaned(su3_32 ** const gf) { + int ix=0, kb=0, kb2=0; + +#ifdef OMP +#pragma omp for nowait +#endif + for(ix = 0; ix < VOLUME/2; ix++) { + kb2=g_eo2lexic[ix]; + _su3_assign(g_gauge_field_copy_32[ix][0],gf[kb2][0]); + kb=g_idn[g_eo2lexic[ix]][0]; + _su3_assign(g_gauge_field_copy_32[ix][1],gf[kb][0]); + + _su3_assign(g_gauge_field_copy_32[ix][2],gf[kb2][1]); + kb=g_idn[g_eo2lexic[ix]][1]; + _su3_assign(g_gauge_field_copy_32[ix][3],gf[kb][1]); + + _su3_assign(g_gauge_field_copy_32[ix][4],gf[kb2][2]); + kb=g_idn[g_eo2lexic[ix]][2]; + _su3_assign(g_gauge_field_copy_32[ix][5],gf[kb][2]); + + _su3_assign(g_gauge_field_copy_32[ix][6],gf[kb2][3]); + kb=g_idn[g_eo2lexic[ix]][3]; + _su3_assign(g_gauge_field_copy_32[ix][7],gf[kb][3]); + } +#ifdef OMP +#pragma omp for nowait +#endif + for(ix = (VOLUME+RAND)/2; ix < (VOLUME+RAND)/2+VOLUME/2; ix++) { + kb2=g_eo2lexic[ix]; + _su3_assign(g_gauge_field_copy_32[ix][0],gf[kb2][0]); + kb=g_idn[g_eo2lexic[ix]][0]; + _su3_assign(g_gauge_field_copy_32[ix][1],gf[kb][0]); + + _su3_assign(g_gauge_field_copy_32[ix][2],gf[kb2][1]); + kb=g_idn[g_eo2lexic[ix]][1]; + _su3_assign(g_gauge_field_copy_32[ix][3],gf[kb][1]); + + _su3_assign(g_gauge_field_copy_32[ix][4],gf[kb2][2]); + kb=g_idn[g_eo2lexic[ix]][2]; + _su3_assign(g_gauge_field_copy_32[ix][5],gf[kb][2]); + + _su3_assign(g_gauge_field_copy_32[ix][6],gf[kb2][3]); + kb=g_idn[g_eo2lexic[ix]][3]; + _su3_assign(g_gauge_field_copy_32[ix][7],gf[kb][3]); + } +// the threads are caught by the implicit barrier here +#ifdef OMP +#pragma omp single + { +#endif + g_update_gauge_copy_32 = 0; +#ifdef OMP + } +#endif +} + +void update_backward_gauge_32(su3_32 ** const gf) { +#ifdef OMP +#pragma omp parallel + { +#endif + update_backward_gauge_32_orphaned(gf); +#ifdef OMP + } /* OpenMP closing brace */ +#endif + return; +} + +#endif diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_D/update_backward_gauge.h b/qcd/part_cpu/applications/QCD/src/kernel_D/update_backward_gauge.h new file mode 100644 index 0000000000000000000000000000000000000000..e06d5655e6eb6b72f901102e30ccd3f105a73831 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/update_backward_gauge.h @@ -0,0 +1,29 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _UPDATE_BACKWARD_GAUGE_H +#define _UPDATE_BACKWARD_GAUGE_H + +#include "su3.h" + +void update_backward_gauge(su3 ** const gf); +void update_backward_gauge_32_orphaned(su3_32 ** const gf); +void update_backward_gauge_32(su3_32 ** const gf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/update_gauge.c b/qcd/part_cpu/applications/QCD/src/kernel_D/update_gauge.c new file mode 100644 index 0000000000000000000000000000000000000000..e10c31d4efdad8b10fdc48255f407405d6e032c5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/update_gauge.c @@ -0,0 +1,116 @@ +/*********************************************************************** + * + * Copyright (C) 2001 Martin Hasebusch + * + * some changes by C. 
Urbach 2002-2008,2012 + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "gettime.h" +#include "su3.h" +#include "su3adj.h" +#include "su3spinor.h" +#include "expo.h" +#include "sse.h" +#include "xchange/xchange.h" +#include "hamiltonian_field.h" +#include "update_gauge.h" +#include "init/init_gauge_field.h" + + +/******************************************************* + * + * Updates the gauge field corresponding to the momenta + * + *******************************************************/ + + +void update_gauge(const double step, hamiltonian_field_t * const hf) { + double atime, etime; + atime = gettime(); +#ifdef OMP +#define static +#pragma omp parallel + { +#endif + int i,mu; + static su3 v,w; + su3 *z; + static su3adj deriv; + su3adj *xm; +#ifdef _KOJAK_INST +#pragma pomp inst begin(updategauge) +#endif + +#ifdef OMP +#undef static +#endif + +#ifdef OMP +#pragma omp for +#endif + for(i = 0; i < VOLUME; i++) { + for(mu = 0; mu < 4; mu++){ + /* moment[i][mu] = h_{i,mu}^{alpha} */ + xm = &hf->momenta[i][mu]; + z = &hf->gaugefield[i][mu]; + _su3adj_assign_const_times_su3adj(deriv, step, *xm); + exposu3(&w,&deriv); + restoresu3(&v,&w); + _su3_times_su3(w, v, *z); + _su3_assign(*z, w); + } 
+ } + +#ifdef OMP + } /* OpenMP parallel closing brace */ +#endif + +#ifdef MPI + /* for parallelization */ + xchange_gauge(hf->gaugefield); +#endif + + /*Convert to a 32 bit gauge field, after xchange*/ + convert_32_gauge_field(g_gauge_field_32, hf->gaugefield, VOLUMEPLUSRAND + g_dbw2rand); + + /* + * The backward copy of the gauge field + * is not updated here! + */ + hf->update_gauge_copy = 1; + g_update_gauge_copy = 1; + g_update_gauge_copy_32 = 1; + + etime = gettime(); + if(g_debug_level > 1 && g_proc_id == 0) { + printf("# Time gauge update: %e s\n", etime-atime); + } + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(updategauge) +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/update_gauge.h b/qcd/part_cpu/applications/QCD/src/kernel_D/update_gauge.h new file mode 100644 index 0000000000000000000000000000000000000000..bc90799284037ca26fbb45b8608d5b46c5091410 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/update_gauge.h @@ -0,0 +1,27 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _UPDATE_GAUGE_H +#define _UPDATE_GAUGE_H + +#include "hamiltonian_field.h" + +void update_gauge(const double step, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/update_momenta.c b/qcd/part_cpu/applications/QCD/src/kernel_D/update_momenta.c new file mode 100644 index 0000000000000000000000000000000000000000..e9f8bcf9e541f5d9b4b8568b84919dcd7ae3a342 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/update_momenta.c @@ -0,0 +1,76 @@ +/*********************************************************************** + * + * Copyright (C) 2001 Martin Hasebusch + * 2002,2003,2004,2005,2006,2007,2008,2012 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#include "global.h" +#include "su3.h" +#include "su3adj.h" +#include "su3spinor.h" +#include "monomial/monomial.h" +#include "xchange/xchange.h" +#include "operator/clover_leaf.h" +#include "read_input.h" +#include "hamiltonian_field.h" +#include "update_momenta.h" +#include "gettime.h" + +/* Updates the momenta: equation 16 of Gottlieb */ +void update_momenta(int * mnllist, double step, const int no, + hamiltonian_field_t * const hf) { + +#ifdef OMP +#pragma omp parallel for +#endif + for(int i = 0; i < (VOLUMEPLUSRAND + g_dbw2rand);i++) { + for(int mu=0;mu<4;mu++) { + _zero_su3adj(hf->derivative[i][mu]); + } + } + + for(int k = 0; k < no; k++) { + if(monomial_list[ mnllist[k] ].derivativefunction != NULL) { + monomial_list[ mnllist[k] ].derivativefunction(mnllist[k], hf); + } + } + +#ifdef MPI + xchange_deri(hf->derivative); +#endif + +#ifdef OMP +#pragma omp parallel for +#endif + for(int i = 0; i < VOLUME; i++) { + for(int mu = 0; mu < 4; mu++) { + /* the minus comes from an extra minus in trace_lambda */ + _su3adj_minus_const_times_su3adj(hf->momenta[i][mu], step, hf->derivative[i][mu]); + } + } + + return; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/update_momenta.h b/qcd/part_cpu/applications/QCD/src/kernel_D/update_momenta.h new file mode 100644 index 0000000000000000000000000000000000000000..847d524f18d705da86dd23df20468f326453a2f1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/update_momenta.h @@ -0,0 +1,26 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _UPDATE_MOMENTA_H +#define _UPDATE_MOMENTA_H + +#include "hamiltonian_field.h" + +void update_momenta(int * mnllist, double step, const int no, hamiltonian_field_t * const hf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/update_tm.c b/qcd/part_cpu/applications/QCD/src/kernel_D/update_tm.c new file mode 100644 index 0000000000000000000000000000000000000000..2b1e4c66c0ab49cc8f5992176849211e2ba4a985 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/update_tm.c @@ -0,0 +1,379 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. 
If not, see . + * + * This routine contains the update part for + * the HMC with up to three pseudo fermion fields + * for twisted mass QCD + * + * Author: Carsten Urbach + * + * Modified by Jenifer Gonzalez Lopez for the Schroedinger Functional + * + ***********************************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef BENCHMARK +#include <./c-lime/include/lime.h> +#else +#include +#endif +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "start.h" +#include "sighandler.h" +#include "operator/tm_operators.h" +#include "linalg_eo.h" +#include "io/gauge.h" +#include "io/params.h" +#include "measure_gauge_action.h" +#include "ranlxd.h" +#include "read_input.h" +#include "expo.h" +#include "xchange/xchange.h" +#include "measure_rectangles.h" +#include "init/init_gauge_tmp.h" +#include "monomial/monomial.h" +#include "integrator.h" +#include "hamiltonian_field.h" +#include "update_tm.h" +#include "gettime.h" + +extern su3 ** g_gauge_field_saved; + +int update_tm(double *plaquette_energy, double *rectangle_energy, + char * filename, const int return_check, const int acctest, + const int traj_counter) { + + su3 *v, *w; + int accept, i=0, j=0, iostatus=0; + + double yy[1]; + double dh, expmdh, ret_dh=0., ret_gauge_diff=0., tmp; + double atime=0., etime=0.; + double ks = 0., kc = 0., ds, tr, ts, tt; + + char tmp_filename[50]; + + /* Energy corresponding to the Gauge part */ + double new_plaquette_energy=0., new_rectangle_energy = 0.; + + /* Energy corresponding to the Momenta part */ + double enep=0., enepx=0., ret_enep = 0.; + + /* Energy corresponding to the pseudo fermion part(s) */ + FILE * datafile=NULL, * ret_check_file=NULL; + hamiltonian_field_t hf; + paramsXlfInfo *xlfInfo; + + hf.gaugefield = g_gauge_field; + hf.momenta = moment; + hf.derivative = df0; + hf.update_gauge_copy = 
g_update_gauge_copy; + hf.traj_counter = traj_counter; + integrator_set_fields(&hf); + + sprintf(tmp_filename, ".conf.t%05d.tmp",traj_counter); + atime = gettime(); + + /* + * here the momentum and spinor fields are initialized + * and their respective actions are calculated + */ + + /* + * copy the gauge field to gauge_tmp + */ +#ifdef OMP +#pragma omp parallel for private(w,v) +#endif + for(int ix=0;ix 0) { + Integrator.integrate[Integrator.no_timescales-1](Integrator.tau, + Integrator.no_timescales-1, 1); + } + + g_sloppy_precision = 0; + + /* compute the final energy contributions for all monomials */ + dh = 0.; + for(i = 0; i < Integrator.no_timescales; i++) { + for(j = 0; j < Integrator.no_mnls_per_ts[i]; j++) { + dh += monomial_list[ Integrator.mnls_per_ts[i][j] ].accfunction(Integrator.mnls_per_ts[i][j], &hf); + } + } + + enepx = moment_energy(hf.momenta); + + if (!bc_flag) { /* if PBC */ + new_plaquette_energy = measure_plaquette( (const su3**) hf.gaugefield); + if(g_rgi_C1 > 0. || g_rgi_C1 < 0.) { + new_rectangle_energy = measure_rectangles( (const su3**) hf.gaugefield); + } + } + if(g_proc_id == 0 && g_debug_level > 3) printf("called moment_energy: dh = %1.10e\n", (enepx - enep)); + /* Compute the energy difference */ + dh = dh + (enepx - enep); + if(g_proc_id == 0 && g_debug_level > 3) { + printf("called momenta_acc dH = %e\n", (enepx - enep)); + } + expmdh = exp(-dh); + /* the random number is only taken at node zero and then distributed to + the other sites */ + ranlxd(yy,1); +#ifdef MPI + MPI_Bcast(&yy[0], 1, MPI_DOUBLE, 0, MPI_COMM_WORLD); +#endif + + /* when acctest is 0 (i.e. do not perform acceptance test), the trajectory is accepted whatever the energy difference */ + accept = (!acctest | (expmdh > yy[0])); + if(g_proc_id == 0) { + fprintf(stdout, "# Trajectory is %saccepted.\n", (accept ? 
"" : "not ")); + } + /* Here a reversibility test is performed */ + /* The trajectory is integrated back */ + if(return_check) { + if(g_proc_id == 0) { + fprintf(stdout, "# Performing reversibility check.\n"); + } + if(accept) { + /* save gauge file to disk before performing reversibility check */ + xlfInfo = construct_paramsXlfInfo((*plaquette_energy)/(6.*VOLUME*g_nproc), -1); + // Should write this to temporary file first, and then check + if(g_proc_id == 0 && g_debug_level > 0) { + fprintf(stdout, "# Writing gauge field to file %s.\n", tmp_filename); + } + if((iostatus = write_gauge_field( tmp_filename, 64, xlfInfo) != 0 )) { + /* Writing failed directly */ + fprintf(stderr, "Error %d while writing gauge field to %s\nAborting...\n", iostatus, tmp_filename); + exit(-2); + } + /* There is double writing of the gauge field, also in hmc_tm.c in this case */ + /* No reading back check needed here, as reading back is done further down */ + if(g_proc_id == 0 && g_debug_level > 0) { + fprintf(stdout, "# Writing done.\n"); + } + free(xlfInfo); + } + + g_sloppy_precision = 1; + /* run the trajectory back */ + Integrator.integrate[Integrator.no_timescales-1](-Integrator.tau, + Integrator.no_timescales-1, 1); + g_sloppy_precision = 0; + + /* compute the energy contributions from the pseudo-fermions */ + ret_dh = 0.; + for(i = 0; i < Integrator.no_timescales; i++) { + for(j = 0; j < Integrator.no_mnls_per_ts[i]; j++) { + ret_dh += monomial_list[ Integrator.mnls_per_ts[i][j] ].accfunction(Integrator.mnls_per_ts[i][j], &hf); + } + } + + ret_enep = moment_energy(hf.momenta); + + /* Compute the energy difference */ + ret_dh += ret_enep - enep ; + + /* Compute Differences in the fields */ + ks = 0.; + kc = 0.; + +#ifdef OMP +#pragma omp parallel private(w,v,tt,tr,ts,ds,ks,kc) + { + int thread_num = omp_get_thread_num(); +#endif + su3 ALIGN v0; +#ifdef OMP +#pragma omp for +#endif + for(int ix = 0; ix < VOLUME; ++ix) + { + for(int mu = 0; mu < 4; ++mu) + { + 
v=&hf.gaugefield[ix][mu]; + w=&gauge_tmp[ix][mu]; + _su3_minus_su3(v0, *v, *w); + _su3_square_norm(ds, v0); + + tr = sqrt(ds) + kc; + ts = tr + ks; + tt = ts-ks; + ks = ts; + kc = tr-tt; + } + } + kc=ks+kc; +#ifdef OMP + g_omp_acc_re[thread_num] = kc; + + } /* OpenMP parallel section closing brace */ + + /* sum up contributions from thread-local kahan summations */ + for(int k = 0; k < omp_num_threads; ++k) + ret_gauge_diff += g_omp_acc_re[k]; +#else + ret_gauge_diff = kc; +#endif + +#ifdef MPI + tmp = ret_gauge_diff; + MPI_Reduce(&tmp, &ret_gauge_diff, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); +#endif + /* compute the total H */ + tmp = enep; + for(i = 0; i < Integrator.no_timescales; i++) { + for(j = 0; j < Integrator.no_mnls_per_ts[i]; j++) { + tmp += monomial_list[ Integrator.mnls_per_ts[i][j] ].energy0; + } + } + /* Output */ + if(g_proc_id == 0) { + ret_check_file = fopen("return_check.data","a"); + fprintf(ret_check_file,"%08d ddh = %1.4e ddh/dh = %1.4e ddh/H = %1.4e ddU= %1.4e\n", traj_counter, + ret_dh, ret_dh/dh, ret_dh/tmp, ret_gauge_diff/4./((double)(VOLUME*g_nproc))/3.); + fclose(ret_check_file); + } + + if(accept) { + /* Read back gauge field + FIXME unlike in hmc_tm we abort immediately if there is a failure */ + if(g_proc_id == 0 && g_debug_level > 0) { + fprintf(stdout, "# Trying to read gauge field from file %s.\n", tmp_filename); + } + + if((iostatus = read_gauge_field(tmp_filename,g_gauge_field) != 0)) { + fprintf(stderr, "Error %d while reading gauge field from %s\nAborting...\n", iostatus, tmp_filename); + exit(-2); + } + if(g_proc_id == 0 && g_debug_level > 0) { + fprintf(stdout, "# Reading done.\n"); + } + } + if(g_proc_id == 0) { + fprintf(stdout, "# Reversibility check done.\n"); + } + } /* end of reversibility check */ + + if(accept) { + *plaquette_energy = new_plaquette_energy; + *rectangle_energy = new_rectangle_energy; + /* put the links back to SU(3) group */ + if (!bc_flag) { /* periodic boundary conditions */ +#ifdef OMP +#pragma 
omp parallel for private(v) +#endif + for(int ix=0;ix 0. || g_rgi_C1 < 0) { + fprintf(datafile, " %e", (*rectangle_energy)/(12*VOLUME*g_nproc)); + } + fprintf(datafile, "\n"); + fflush(datafile); + fclose(datafile); + } + return(accept); +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/update_tm.h b/qcd/part_cpu/applications/QCD/src/kernel_D/update_tm.h new file mode 100644 index 0000000000000000000000000000000000000000..e26456c5d085144037f4572a3836e3a604f6398d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/update_tm.h @@ -0,0 +1,26 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _UPDATE_TM_H +#define _UPDATE_TM_H + +int update_tm(double *plaquette_energy, double *rectangle_energy, + char * filename, const int return_check, const int acctest, + const int traj_counter); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/util/blue.pl b/qcd/part_cpu/applications/QCD/src/kernel_D/util/blue.pl new file mode 100755 index 0000000000000000000000000000000000000000..846995b5c58dc069e6dd050d142e6aa7f5f706f2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/util/blue.pl @@ -0,0 +1,237 @@ +#!/usr/bin/perl -w +# +# TODO: +# perlscript should redirect it's output, especially if running from at +# (maybe sth like 'tee'?) + +use strict; +use Getopt::Long; + +use constant VERSION => '$Revision 0.0$'; + +my $debug=0; +my $mpirun; +my $llstat; +my $schedbgl_cmd; +my $runpath; +if ($debug) { + $mpirun = "/bin/echo "; + $schedbgl_cmd="/bin/echo 'Q 1131 $ENV{USER} 2006.04.06_18:00 2006.04.06_19:00 R11'"; + $llstat="echo 'R11 $ENV{USER} init'"; + $runpath = $ENV{HOME}; +} else { + $mpirun = "/usr/bin/mpirun"; + $schedbgl_cmd="sched_bgl -l"; + $llstat="llstat"; + $runpath = "/home5/hch02/hch026/bglhome/b4.05_L32T64_k0.157025_mu0.003/"; +} +my $atShellScript="atScript.tmp.sh"; +# FIXME this should first canonicalize the path... +my $atPerlScript=$0; +my $partition = 'R11 '; +my $mode = 'VN'; +my $executable = "/home5/hch02/hch026/bglhome/bin/hmc_tm_xyzt"; +my $logfile = "runlog"; +my $args = ""; +# +#my @startingtimes = (0, 6, 12); +#my @startingdays = (29, 29, 30); + + + +my ($verbose,$dryrun,$at,$nodelete)=(0,0,0,0); +my $getoptRet=GetOptions ( 'help|?' => \&usage, + 'verbose|v+' => \$verbose, + 'quiet' => sub { $verbose = 0 }, + 'dryrun|n!' 
=> \$dryrun, + 'at' => \$at, + 'executable=s' => \$executable, + 'runpath=s' => \$runpath, + 'logfile=s' => \$logfile, + 'mode=s' => \$mode, + 'nodelete' => \$nodelete, + 'args=s' => \$args, + ); +exit -1 unless ($getoptRet); + +my ($resid)=@ARGV; +usage() unless (defined($resid)); +if ($verbose > 0) { + printf("resid = %s exe=%s dry=%s path=%s logfile=%s mode=%s verbose=%s\n", $resid, $executable, $dryrun, $runpath, $logfile, $mode, $verbose); +} + +if ($at) { + submitAtJob($resid); +} else { + run($resid) +} + + +#################### submit at script ########################################## +sub submitAtJob { + my ($resid)=@_; + + my %reservationParameters=bglJobParameters($resid); + unless (defined($reservationParameters{resid})) { + printf(STDERR "Job with resID %s does not seem to exist (according to sched_bgl -l output)\n",$resid); + exit -1; + } + if (!($ENV{USER} eq $reservationParameters{user})) { + printf(STDERR "Wrong username (running under %s, submitting for %s)\n",$ENV{USER},$reservationParameters{user}); + exit -1; + } + my $nodel = " "; + if($nodelete) { + $nodel = "--nodelete"; + } + + open(ATSCRIPT, "> $atShellScript"); + print ATSCRIPT < ". 
+ "$runpath/$logfile.$reservationParameters{resid}"; + +# waitForReservationTime(%reservationParameters); + waitForBglInitialize(%reservationParameters); + + print "Running: $command\n"; + my $errorstatus = system "$command" unless($dryrun); + +# We may want to delete the reservation if the job finished +# such that we save allocation time + system "sched_bgl -d $resid" unless($dryrun || $nodelete); + + exit 0; + ############## we might want to extend here one day ########################### + + sleep 60; + my $status = `llstat | grep $partition`; + if( bglAvailable($ENV{USER},$resid) && waitForBglInitialize(%reservationParameters) && bglAvailable($ENV{USER},$resid) ) { + for(my $j = 0; $j < 2; $j++) { + if ($errorstatus != 0) { + print "Job finished with error, try again!\n"; + $errorstatus = system "$command"; + } + } + if ($errorstatus != 0) { + print "Job failed to restart twice, Aborting\n"; + } + } +} + +################################################################################### + +sub bglTime2atTime { + my ($bglTime)=@_; + # format of bgl time is + # YYYY.MM.DD_HH:MM + # format of at time must be + # HH:MM MMDDYY + # no, I think it must be + # HH:MM DDMMYY (Carsten) + my ($date,$time)=split '_', $bglTime; + my $year=substr $date, 2,2; + my $month=substr $date, 5,2; + my $day=substr $date, 8,2; + return (sprintf("%s %s.%s.%s",$time,$day,$month,$year)); +} + +sub bglJobParameters { + my ($ResID)=@_; + # output of sched_bgl -l is + # Q ResID User Start End Resource Prio + open(SCHEDBGL, "$schedbgl_cmd|"); + my %reservationParameters; + while () { + next unless (/.*\s+$ResID/); + my (@line)=split; + $reservationParameters{"resid"}=$line[1]; + $reservationParameters{"user"}=$line[2]; + $reservationParameters{"start"}=$line[3]; + $reservationParameters{"end"}=$line[4]; + $reservationParameters{"partition"}=$line[5] + } + close SCHEDBGL; + return %reservationParameters; +} + +sub bglAvailable { + my ($user,$resid,$exit)=@_; + my $sched_bgl=`$schedbgl_cmd`; 
+ if ($sched_bgl =~ /$resid\s+$user/) { + return(1); + } + if (defined($exit) && ($exit != 0)) { + printf STDERR "vaffanculo: Somebody stole our slot for %s, exiting.\n",$resid; + exit $exit; + } + return; +} + +sub waitForReservationTime { + my ($starttime,$startday)=@_; + my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); + while( ($hour < $starttime) || ($mday != $startday)) { + print "sleeping ...\n"; + sleep 60; + ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time); + } +} + +sub waitForBglInitialize { + my (%jobPars)=@_; + my $status = `$llstat | grep $jobPars{partition}`; + while (($status !~ /$jobPars{user}/) || ($status !~ /init/) ) { + printf "%s: Waiting for partition to be initialised...\n",$jobPars{resid}; + sleep 60; + bglAvailable($jobPars{user},$jobPars{resid},-1); + $status = `$llstat | grep $jobPars{partition}`; + } + return 1; +} + +sub usage { + use File::Basename; + my $basename=basename $0; + printf <. + ***********************************************************************/ + +/**************************************************** + * IO routines: + * + * read_lime_gauge_field_doubleprec + * + * read_lime_gauge_field_singleprec + * + * Autor: + * Carsten Urbach + * + ****************************************************/ + +/* + * Note: + * Required version of lime: >= 1.2.3 + * n_uint64_t is a lime defined type!! 
+ * + */ + +#define _FILE_OFFSET_BITS 64 + +#ifdef HAVE_CONFIG_H +# include +#endif +#ifdef BENCHMARK +#include <../c-lime/include/lime.h> +#else +#include +#endif +#include +#include +#include +#include +#include +#include +#include "io.h" + +#define MAXBUF 1048576 + +void byte_swap(void *ptr, int nmemb); +void byte_swap_assign(void * out_ptr, void * in_ptr, int nmemb); +void byte_swap_assign_singleprec(void * out_ptr, void * in_ptr, int nmemb); +void byte_swap_assign_single2double(void * out_ptr, void * in_ptr, int nmemb); +void single2double(void * out_ptr, void * in_ptr, int nmemb); +void byte_swap_assign_double2single(void * out_ptr, void * in_ptr, int nmemb); +void double2single(void * out_ptr, void * in_ptr, int nmemb); +int big_endian(); + +int read_lime_gauge_field_doubleprec(double * config, char * filename, + const int T, const int LX, const int LY, const int LZ) { + FILE * ifs; + int t, x, y, z, status, p=0; + n_uint64_t bytes; + char * header_type; + LimeReader * limereader; + double tmp[72]; + int words_bigendian; + + words_bigendian = big_endian(); + ifs = fopen(filename, "r"); + if(ifs == (FILE *)NULL) { + fprintf(stderr, "Could not open file %s\n Aborting...\n", filename); + exit(500); + } + limereader = limeCreateReader( ifs ); + if( limereader == (LimeReader *)NULL ) { + fprintf(stderr, "Unable to open LimeReader\n"); + exit(500); + } + while( (status = limeReaderNextRecord(limereader)) != LIME_EOF ) { + if(status != LIME_SUCCESS ) { + fprintf(stderr, "limeReaderNextRecord returned error with status = %d!\n", status); + status = LIME_EOF; + break; + } + header_type = limeReaderType(limereader); + if(!strcmp("ildg-binary-data",header_type)) break; + } + if(status == LIME_EOF) { + fprintf(stderr, "no ildg-binary-data record found in file %s\n",filename); + limeDestroyReader(limereader); + fclose(ifs); + exit(-2); + } + bytes = limeReaderBytes(limereader); + if((int)bytes != LX*LY*LZ*T*72*sizeof(double)) { + fprintf(stderr, "Probably wrong lattice 
size or precision (bytes=%d) in file %s\n", (int)bytes, filename); + fprintf(stderr, "Aborting...!\n"); + fflush( stdout ); + exit(501); + } + + bytes = (n_uint64_t)72*sizeof(double); + + for(t = 0; t < T; t++) { + for(z = 0; z < LZ; z++) { + for(y = 0; y < LY; y++) { + for(x = 0; x < LX; x++) { + p = (((t*LZ+z)*LY+y)*LX+x)*72; + if(!words_bigendian) { + status = limeReaderReadData(tmp, &bytes, limereader); + byte_swap_assign(&config[p], tmp, 72); + } + else { + status = limeReaderReadData(&config[p], &bytes, limereader); + } + if(status < 0 && status != LIME_EOR) { + fprintf(stderr, "LIME read error occured with status = %d while reading file %s!\n Aborting...\n", + status, filename); + exit(500); + } + } + } + } + } + limeDestroyReader(limereader); + fclose(ifs); + return(0); +} + + +int read_lime_gauge_field_singleprec(float * config, char * filename, + const int T, const int LX, const int LY, const int LZ){ + FILE * ifs; + int t, x, y, z, status, p=0; + n_uint64_t bytes; + char * header_type; + LimeReader * limereader; + float tmp[72]; + int words_bigendian; + + words_bigendian = big_endian(); + ifs = fopen(filename, "r"); + if(ifs == (FILE *)NULL) { + fprintf(stderr, "Could not open file %s\n Aborting...\n", filename); + exit(500); + } + limereader = limeCreateReader( ifs ); + if( limereader == (LimeReader *)NULL ) { + fprintf(stderr, "Unable to open LimeReader\n"); + exit(500); + } + while( (status = limeReaderNextRecord(limereader)) != LIME_EOF ) { + if(status != LIME_SUCCESS ) { + fprintf(stderr, "limeReaderNextRecord returned error with status = %d!\n", status); + status = LIME_EOF; + break; + } + header_type = limeReaderType(limereader); + if(!strcmp("ildg-binary-data",header_type)) break; + } + if(status == LIME_EOF) { + fprintf(stderr, "no ildg-binary-data record found in file %s\n",filename); + limeDestroyReader(limereader); + fclose(ifs); + exit(-2); + } + bytes = limeReaderBytes(limereader); + if((int)bytes != LX*LY*LZ*T*72*sizeof(float)) { + 
fprintf(stderr, "Probably wrong lattice size or precision (bytes=%d) in file %s\n", (int)bytes, filename); + fprintf(stderr, "Aborting...!\n"); + fflush( stdout ); + exit(501); + } + + bytes = (n_uint64_t)72*sizeof(float); + for(t = 0; t < T; t++){ + for(z = 0; z < LZ; z++){ + for(y = 0; y < LY; y++){ + for(x = 0; x < LX; x++) { + p = (((t*LZ+z)*LY+y)*LX+x)*72; + if(!words_bigendian) { + status = limeReaderReadData(tmp, &bytes, limereader); + byte_swap_assign_singleprec(&config[p], tmp, 72); + } + else { + status = limeReaderReadData(&config[p], &bytes, limereader); + } + if(status < 0 && status != LIME_EOR) { + fprintf(stderr, "LIME read error occured with status = %d while reading file %s!\n Aborting...\n", + status, filename); + exit(500); + } + } + } + } + } + limeDestroyReader(limereader); + fclose(ifs); + return(0); +} + + +int big_endian(){ + union{ + int l; + char c[sizeof(int)]; + } u; + + u.l=1; + return(u.c[sizeof(int) - 1] == 1); +} + +void byte_swap(void * ptr, int nmemb){ + int j; + char char_in[4]; + char * in_ptr; + int * int_ptr; + + for(j = 0, int_ptr = (int *) ptr; j < nmemb; j++, int_ptr++) { + in_ptr = (char *) int_ptr; + + char_in[0] = in_ptr[0]; + char_in[1] = in_ptr[1]; + char_in[2] = in_ptr[2]; + char_in[3] = in_ptr[3]; + + in_ptr[0] = char_in[3]; + in_ptr[1] = char_in[2]; + in_ptr[2] = char_in[1]; + in_ptr[3] = char_in[0]; + } +} + +void byte_swap_assign(void * out_ptr, void * in_ptr, int nmemb){ + int j; + char * char_in_ptr, * char_out_ptr; + double * double_in_ptr, * double_out_ptr; + + double_in_ptr = (double *) in_ptr; + double_out_ptr = (double *) out_ptr; + for(j = 0; j < nmemb; j++){ + char_in_ptr = (char *) double_in_ptr; + char_out_ptr = (char *) double_out_ptr; + + char_out_ptr[7] = char_in_ptr[0]; + char_out_ptr[6] = char_in_ptr[1]; + char_out_ptr[5] = char_in_ptr[2]; + char_out_ptr[4] = char_in_ptr[3]; + char_out_ptr[3] = char_in_ptr[4]; + char_out_ptr[2] = char_in_ptr[5]; + char_out_ptr[1] = char_in_ptr[6]; + char_out_ptr[0] 
= char_in_ptr[7]; + double_in_ptr++; + double_out_ptr++; + } +} + +void byte_swap_assign_singleprec(void * out_ptr, void * in_ptr, int nmemb){ + int j; + char * char_in_ptr, * char_out_ptr; + float * float_in_ptr, * float_out_ptr; + + float_in_ptr = (float *) in_ptr; + float_out_ptr = (float *) out_ptr; + for(j = 0; j < nmemb; j++){ + char_in_ptr = (char *) float_in_ptr; + char_out_ptr = (char *) float_out_ptr; + + char_out_ptr[3] = char_in_ptr[0]; + char_out_ptr[2] = char_in_ptr[1]; + char_out_ptr[1] = char_in_ptr[2]; + char_out_ptr[0] = char_in_ptr[3]; + float_in_ptr++; + float_out_ptr++; + } +} + +void single2double(void * out_ptr, void * in_ptr, int nmemb) { + int i; + float * float_ptr = (float*) in_ptr; + double * double_ptr = (double*) out_ptr; + + for(i = 0; i < nmemb; i++) { + (*double_ptr) = (double) (*float_ptr); + + float_ptr++; + double_ptr++; + } + +} + +void double2single(void * out_ptr, void * in_ptr, int nmemb) { + int i; + float * float_ptr = (float*) out_ptr; + double * double_ptr = (double*) in_ptr; + + for(i = 0; i < nmemb; i++) { + (*float_ptr) = (float) (*double_ptr); + + float_ptr++; + double_ptr++; + } + +} + +void byte_swap_assign_single2double(void * out_ptr, void * in_ptr, int nmemb){ + int j; + char * char_in_ptr, * char_out_ptr; + double * double_out_ptr; + float * float_in_ptr; + float tmp; + + float_in_ptr = (float *) in_ptr; + double_out_ptr = (double *) out_ptr; + char_out_ptr = (char *) &tmp; + for(j = 0; j < nmemb; j++){ + char_in_ptr = (char *) float_in_ptr; + + char_out_ptr[3] = char_in_ptr[0]; + char_out_ptr[2] = char_in_ptr[1]; + char_out_ptr[1] = char_in_ptr[2]; + char_out_ptr[0] = char_in_ptr[3]; + (*double_out_ptr) = (double) tmp; + float_in_ptr++; + double_out_ptr++; + } +} + +void byte_swap_assign_double2single(void * out_ptr, void * in_ptr, int nmemb){ + int j; + char * char_in_ptr, * char_out_ptr; + double * double_in_ptr; + float * float_out_ptr; + float tmp; + + float_out_ptr = (float *) out_ptr; + double_in_ptr = 
(double *) in_ptr; + char_in_ptr = (char *) &tmp; + for(j = 0; j < nmemb; j++){ + tmp = (float) (*double_in_ptr); + char_out_ptr = (char*) float_out_ptr; + + char_out_ptr[3] = char_in_ptr[0]; + char_out_ptr[2] = char_in_ptr[1]; + char_out_ptr[1] = char_in_ptr[2]; + char_out_ptr[0] = char_in_ptr[3]; + + float_out_ptr++; + double_in_ptr++; + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/util/io.h b/qcd/part_cpu/applications/QCD/src/kernel_D/util/io.h new file mode 100644 index 0000000000000000000000000000000000000000..cb9716cbcef194e7e274920c4114dee8b668028e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/util/io.h @@ -0,0 +1,28 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _IO_H +#define _IO_H + +int read_lime_gauge_field_doubleprec(double * config, char * filename, + const int T, const int LX, const int LY, const int LZ); + +int read_lime_gauge_field_singleprec(float * config, char * filename, + const int T, const int LX, const int LY, const int LZ); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/util/laguer/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_D/util/laguer/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..f9bce70e3846aa805e8875fcf8bbadd09ecfa173 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/util/laguer/Makefile @@ -0,0 +1,9 @@ +CXX=g++ +CXXFLAGS=-g -O2 +CLNDIR=${HOME}/daten/workdir/cln/ + +chebyRoot: chebyRoot.C Makefile chebyRoot.H + ${CXX} $< -g -o $@ -I${CLNDIR}/include/ -L${CLNDIR}/lib -lcln -lm + +clean: + rm -f *.o chebyRoot *.dat *.log *~ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/util/laguer/chebyRoot.C b/qcd/part_cpu/applications/QCD/src/kernel_D/util/laguer/chebyRoot.C new file mode 100644 index 0000000000000000000000000000000000000000..b8dad5f2d8655795dc30ac85997e7a45bcbca713 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/util/laguer/chebyRoot.C @@ -0,0 +1,1491 @@ +/******************************************************************************/ +// +// Copyright (C) 2007 Istvan Montvay +// Copyright (C) 2007 Carsten Urbach +// +// This file is part of tmLQCD. +// +// tmLQCD is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. +// +// tmLQCD is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with tmLQCD. If not, see . +// +// +// Computes all roots of a Chebycheff Polynomial of a given +// order. +// +// The function to be approximated is: 1/x^(1/2) +// +// Set order, approximation inverval and the precision +// in chebyRoot.H +// +// For other powers than -1/2 change the function "func" below +// appropiately +// +// Root pairs and square-root of roots are bit reversed ordered. +// +// The roots are written to the file roots.dat +// The monomials are written to Square_root_BR_roots.dat +// The coefficients are written to coefs.dat +// +// The normalisation factor is written to normierungLocal.dat +// it corresponds to the n-th root of C, where n is the order +// of the polynomial. Note that for the monomial representation +// you need to take the square-root of that value! +// +// This runs with CLN, which is available from +// http://www.ginac.de/CLN/ +// +// Last changed: Feb. 15, 2007 C. Urbach +// +// This is based on a code provided by Istvan Montvay for +// least squared optimised polynomials, see +// quadroptRoot.C +// +/******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +using namespace cln; +using namespace std; + +// Defining input parameters in a header file + +#include "chebyRoot.H" + + +// Define global variables + +cl_F Recb[1+MAXORD-1], Recg[1+MAXORD-2], Orth[1], Coed[1+MAXORD]; +//cl_F Coef[1+MAXORD]; +cl_F Coef[1+MAXORD], c[1+MAXORD]; + +/******************************************************************************/ +// +// Calculate the basic integral s_nu for least-square optimization. 
+// +// Input parameters: +// order of the polynomial: Maxpow +// +// the power to be approximated: Alpha +// lower bound of interval of approximation: Epsilon +// upper bound of interval of approximation: Lambda +// +// The result is put in Sint +// The precision: Digit + +void BaseIntS(int Maxpow, cl_F Alpha, cl_F Epsilon, cl_F Lambda, cl_F* Sint, + float_format_t Digit) + { + int ord; + cl_F power, small = As(cl_F)(expt(cl_float(0.1,Digit),DIGIT)); + +// Loop over powers + + for(ord = 0; ord < 2*Maxpow+1; ord++) + { power = As(cl_F)(2*Alpha+(ord+1)); + + if(abs(power) < small) + Sint[ord] = As(cl_F)(log(Lambda/Epsilon)); + + else + Sint[ord] = As(cl_F)(expt(Lambda,power)-expt(Epsilon,power))/power; } + + } +/******************************************************************************/ +// +// Calculate the basic integral t_nu for least-square optimization. +// +// Input parameters: +// order of the polynomial: Maxpow +// +// the power to be approximated: Alpha +// lower bound of interval of approximation: Epsilon +// upper bound of interval of approximation: Lambda +// +// The result is put in Tint +// The precision: Digit + +void BaseIntT(int Maxpow, cl_F Alpha, cl_F Epsilon, cl_F Lambda, cl_F* Tint, + float_format_t Digit) + { + int ord; + cl_F power, small = As(cl_F)(expt(cl_float(0.1,Digit),DIGIT));; + +// Loop over powers + + for(ord = 0; ord < Maxpow+1; ord++) + { power = As(cl_F)(Alpha+(ord+1)); + + if(abs(power) < small) + Tint[ord] = As(cl_F)(log(Lambda/Epsilon)); + + else + Tint[ord] = As(cl_F)(expt(Lambda,power)-expt(Epsilon,power))/power; } + + } +/******************************************************************************/ +// +// Evaluate the approximate polynomial up to the order Maxpow +// at the variable value Xval +// +// The recurrence coefficients are in Recb +// and in Recg +// Constant term of the first orthogonal polynomial: Orth +// Expansion coefficients in orthogonal polynomials: Coed + +cl_F Recurev(int Maxpow, cl_F Xval, + cl_F* 
Recb,cl_F* Recg,cl_F* Orth,cl_F* Coed) + { + int ord; + cl_F res, orth, orthb, orthg; + + +// Check input + + if(Maxpow < 0 || Maxpow > MAXORD) + { cout < 0) + { orthb = orth; + orth = Xval+Orth[0]; + res = res+Coed[1]*orth; } + +// Iteration for recurrence + + for(ord = 2; ord < Maxpow+1; ord++) + { orthg = orthb; + orthb = orth; + orth = (Xval+Recb[ord-1])*orthb+Recg[ord-2]*orthg; + res = res+Coed[ord]*orth; } + + return res; + } +/******************************************************************************/ +// +// Write elements of a list of numbers in the array Wlist +// of length Leng +// as assignment statements +// to the elements of an array Arrayname +// +// The ofstream of the output file is Ostream +// The format of assignements is C, F(ortran) or T(ao): Format +// +// At the beginning write the string Text + +void WriteAssign(ostream& Ostream, char* Format, char* Text, + cl_F* Wlist, int Leng, char* Arrayname) + { + int ord; + char c[] = "C", fortran[] = "Fortran", tao[] = "Tao"; + + Ostream < 0) + { Recb[0] = -sint[1]/sint[0]; + Orth[0] = -sint[1]/sint[0]; + norq[1] = sint[2]-expt(sint[1],2)/sint[0]; } + + if(Maxpow > 1) + { for(ord = 0; ord < 2*Maxpow-1; ord++) + rmum[ord] = sint[ord]; + + for(ord = 0; ord < 2*Maxpow; ord++) + rmuv[ord] = sint[ord+1]+Orth[0]*sint[ord]; + + fmu = Orth[0]; + Recb[1] = -fmu-rmuv[2]/rmuv[1]; + Recg[0] = -rmuv[1]/rmum[0]; } + +// Iteration for orthogonal polynomials + + for(or1 = 1; or1 < Maxpow; or1++) + { Recb[or1] = -fmu-rmuv[or1+1]/rmuv[or1]; + Recg[or1-1] = -rmuv[or1]/rmum[or1-1]; + fmu = fmu+Recb[or1]; + + for(or2 = 0; or2 < 2*Maxpow-or1; or2++) + rmup[or2] = rmuv[or2+1]+Recb[or1]*rmuv[or2]+Recg[or1-1]*rmum[or2]; + + norq[or1+1] = rmup[or1+1]; + + for(or2 = 0; or2 < 2*Maxpow-or1-1; or2++) + rmum[or2] = rmuv[or2]; + + for(or2 = 0; or2 < 2*Maxpow-or1; or2++) + rmuv[or2] = rmup[or2]; + + if(Printing == TRUE) + cout < 0) + { bint[1] = tint[1]+Orth[0]*tint[0]; + Coed[1] = bint[1]/norq[1]; } + + if(Maxpow > 1) + { 
for(ord = 0; ord < Maxpow-1; ord++) + bmum[ord] = tint[ord]; + + for(ord = 0; ord < Maxpow; ord++) + bmuv[ord] = tint[ord+1]+Orth[0]*tint[ord]; } + +// Perform iteration + + for(or1 = 1; or1 < Maxpow; or1++) + { + for(or2 = 0; or2 < Maxpow-or1; or2++) + bmup[or2] = bmuv[or2+1]+Recb[or1]*bmuv[or2]+Recg[or1-1]*bmum[or2]; + + bint[or1+1] = bmup[0]; + Coed[or1+1] = bint[or1+1]/norq[or1+1]; + + for(or2 = 0; or2 < Maxpow-or1-1; or2++) + bmum[or2] = bmuv[or2]; + + for(or2 = 0; or2 < Maxpow-or1; or2++) + bmuv[or2] = bmup[or2]; + + if(Printing == TRUE) + cout < MAXORD) + { cout < 0) oplp[orv] = oplp[orv]+opl[orv-1]; } + + for(orv = 0; orv < Maxpow+1; orv++) + poly[orv] = poly[orv]+Coed[ord+1]*oplp[orv]; + + for(orv = 0; orv < Maxpow+1; orv++) + oplm[orv] = opl[orv]; + + for(orv = 0; orv < Maxpow+1; orv++) + opl[orv] = oplp[orv]; } + +// Extract coefficients + + for(ord = 0; ord < Maxpow+1; ord++) + Coef[ord] = As(cl_F)(poly[Maxpow-ord]); + + } +/******************************************************************************/ +// +// Evaluate a complex polynomial +// +// The polynomial is Poly +// The order is Maxpow +// +// The value of the variable: Valu + +cl_N EvalPoly(cl_N* Poly, int Maxpow, cl_N Valu) + { + int pow; + cl_N xpow, sum; + + + sum = As(cl_N)(complex(ZERO,ZERO)); + xpow = As(cl_N)(complex(ONE,ZERO)); + + for(pow = 0; pow < Maxpow+1; pow++) + { sum = sum+xpow*Poly[pow]; + xpow = xpow*Valu; } + + return sum; + + } + +// this routine evaluated the product representation of a +// polynomial with roots roots, order Maxpow at value x and +// with (overall) normalisation factor norma + +cl_N EvalPolyProd(cl_N* roots, const int Maxpow, + const cl_N x, const cl_N norma = complex(ONE, ZERO)) { + + cl_N prod = As(cl_N)(norma); + for(int i = 0; i < Maxpow; i++) { + prod = prod * (x-roots[i]); + } + return(prod); +} + +/******************************************************************************/ +// +// Find a root of a complex polynomial by Laguerre iteration. 
+// +// The polynomial is Poly +// The order is Maxpow +// +// The precision: Digit +// +// Print intermediate results for Printing=TRUE + +cl_N Lasolv(cl_N* Poly, int Maxpow, float_format_t Digit, int Printing, + cl_N root = complex(ZERO,ZERO), const int itemax=100) + { + int pow, ite; + + cl_F angl, small = As(cl_F)(expt(cl_float(0.1,Digit),DIGIT/2)); + + cl_N dif1[Maxpow], dif2[Maxpow-1]; + cl_N val0, val, val1, val2, denp, denm, las1, las2, sqrv; + // cl_N root; + for(pow = 0; pow < Maxpow; pow++) + dif1[pow] = (pow+1)*Poly[pow+1]; + + for(pow = 0; pow < Maxpow-1; pow++) + dif2[pow] = (pow+1)*dif1[pow+1]; + +// The maximal allowed number of iterations is set here; +// this can be chosen larger, but 100 usually suffices + +// root = As(cl_N)(complex(ZERO,ZERO)); + val0 = EvalPoly(Poly,Maxpow,root); + +// Iteration + + for(ite = 0; ite < itemax; ite++) + { + val = val0; + val1 = EvalPoly(dif1,Maxpow-1,root); + val2 = EvalPoly(dif2,Maxpow-2,root); + + sqrv = (Maxpow-1)*((Maxpow-1)*val1*val1-Maxpow*val0*val2); + angl = HALF*cl_float(phase(sqrv),Digit); + sqrv = sqrt(abs(sqrv))*complex(cos(angl),sin(angl)); + denp = val1+sqrv; + denm = val1-sqrv; + + if(denp == complex(ZERO,ZERO)) + root = root-Maxpow*val0/denm; + + else + { if(denm == complex(ZERO,ZERO)) + root = root-Maxpow*val0/denp; + + else + { las1 = -Maxpow*val0/denp; + las2 = -Maxpow*val0/denm; + + if(realpart(las1*conjugate(las1)) < + realpart(las2*conjugate(las2))) + root = root+las1; + + else + root = root+las2; } } + +// Look whether the root is good enough + + val0 = EvalPoly(Poly,Maxpow,root); + + if(abs(val0) == ZERO || + (abs(val0) < small) && abs(val0/val) > 0.7) + { + if(Printing == TRUE) + { cout << endl << "Laguerre iterations: " << ite << endl; + cout << endl << "root = " << root << endl; + cout << endl << "value at root: " << val0 << endl; } + + break; } } + + if(ite >= itemax) { + cout < 0; pow--) { + polc[pow] = polc[pow-1]-Root[fnd]*polc[pow]; + } + + polc[0] = -Root[fnd]*polc[pow]; + + // 
Divide the polynomial by the root + + maxp = Maxpow-fnd-1; + coen[maxp] = coef[maxp+1]; + + for(pow = maxp-1; pow > -1; pow--) { + coen[pow] = coef[pow+1]+Root[fnd]*coen[pow+1]; + } + + for(pow = 0; pow < maxp+1; pow++) { + coef[pow] = coen[pow]; + poly[pow] = coef[pow]; + } + } + + else { + break; + } + } + +// Compare input with product of root factors + + if(Printing == TRUE) { + for(pow = 0; pow < Maxpow+1; pow++) { + polc[pow] = Poly[pow]-poly[0]*polc[pow]; + } + + cout < 47) + { cout < values[rr][nl]) min = values[rr][nl]; } + + mx[rr] = max-min; } + + mn = large; + rv = -1; + + for(rr = 0; rr < Maxpow; rr++) + if(facc[rr] == 1) + if(mn > mx[rr]) + { mn = mx[rr]; + rv = rr; } + + if(Printing == TRUE) + cout < 137) + { cout < 0) + { facp[rn] = r1; + facr[r1] = 0; + rn++; + facp[rn] = r2; + facr[r2] = 0; + rn++; } + else + { facp[rn] = r2; + facr[r2] = 0; + rn++; + facp[rn] = r1; + facr[r1] = 0; + rn++; } + + break; } } } + + // Check whether the complex roots are in complex conjugate pairs + + if((Maxpow-rn) != 0) + { cout < 0); + rc++; } } + + // Calculate root factors + + for(rr = 0; rr < Maxpow; rr++) + for(nl = 0; nl < nmax; nl++) + ff[rr][nl] = xx[nl]-root[rr]; + + for(nl = 0; nl < nmax; nl++) + pp[nl] = complex(cl_float(1,digit),cl_float(0,digit)); + + if(Printing == TRUE) + { cout < values[rr][nl]) min = values[rr][nl]; } + + mx[rr] = max-min; } + + mn = large; + rv = -1; + + for(rr = 0; rr < Maxpow; rr++) + if(facl[rr] != 0 && facr[rr] == 1) + if(mn > mx[rr]) + { mn = mx[rr]; + rv = rr; } + + if(Printing == TRUE) + cout < realpart(y)) { + return(true); + } + } + return(false); +} + +// This routine returns for idx in [0,2^digits-1] the corresponding +// bit reversal index + +int bitReversalRepresentation(const int idx, const int digits) { + int res[digits]; + int num = idx; + for(int i = 0; i < digits; i++) { + res[i] = 0; + } + for(int i = 0; i < digits; i++) { + int k = digits - i; + int p = int(pow(2., (digits-(i+1))) ); + res[k] = num/p; + num = 
num - res[k]*p; + } + num = 0; + for(int i = 0; i < digits; i++) { + num = num + res[i]*int(pow(2.,digits-(i+1))); + } + return(num); +} + +// quicksort template for comparison function (*comp)(const T&, const T&) + +template void quicksort(const int n, T arr[], int idx[], bool (*comp)(const T&, const T&)){ + T v, td; + int i, j, l, r, ti, tos, stack[32]; + + l = 0; r = n-1; tos = -1; + for (;;) { + while (r > l) { + v = arr[r]; i = l; j = r-1; + for (;;){ + while (comp(arr[i], v)) i ++; + /* j > l prevents underflow */ + while (!comp(arr[j], v) && j > l) j --; + if (i >= j) break; + td = arr[i]; arr[i] = arr[j]; arr[j] = td; + ti = idx[i]; idx[i] = idx[j]; idx[j] = ti; + } + td = arr[i]; arr[i] = arr[r]; arr[r] = td; + ti = idx[i]; idx[i] = idx[r]; idx[r] = ti; + if (i-l > r-i){ + stack[++tos] = l; stack[++tos] = i-1; l = i+1; + } + else{ + stack[++tos] = i+1; stack[++tos] = r; r = i-1; + } + if(tos > 31) { + cerr << "Error in quicksort! Aborting...!" << endl; + exit(31); + } + } + if (tos == -1) break; + r = stack[tos--]; l = stack[tos--]; + } +} + +// This bring the complex list Roots of length Maxpow to +// naive order. +// See hep-lat/9805026 + +void NaiveOrder(cl_N * Roots, const int Maxpow) { + int idx[Maxpow]; + + for(int i =0; i < Maxpow; i++) { + idx[i] = i; + } + quicksort(Maxpow, Roots, idx, &CompSortNaiv); + cout << "Naivly ordered roots" << endl; + for(int i = 0; i < Maxpow; i++) { + cout << i << " " << double_approx(realpart(Roots[i])) << " " << + double_approx(imagpart(Roots[i])) << endl; + } + cout << endl; +} + +// This bring the complex list Roots of length Maxpow to +// bit reversal order. 
+// See hep-lat/9805026 + +void BitReversalOrder(cl_N *Roots, const int Maxpow, bool Printing=false) { + + int digits = 2; + int power = 2; + while(power < Maxpow) { + power=power*2; + digits++; + } + + // cout << "digits = " << digits << " power " << power << " " << pow(2.,digits-1) << endl; + + cl_N paddedRoots[power]; + cl_N reversedRoots[power]; + + for(int i = 0; i < Maxpow; i++) { + paddedRoots[i] = Roots[i]; + } + for(int i = Maxpow; i < power; i++) { + paddedRoots[i] = complex(-HUND, ZERO); + } + NaiveOrder(paddedRoots, power); + for(int i = 0; i < power; i++) { + reversedRoots[i] = paddedRoots[bitReversalRepresentation(i, digits)]; + // cout << i << " " << bitReversalRepresentation(i, digits) << endl; + } + + if(Printing) { + cout << "Bit reversed ordered roots" << endl; + for(int i = 0, j=0; i < power; i++) { + if((realpart(reversedRoots[i]) != -HUND)) { + Roots[j] = reversedRoots[i]; + cout << j << " " << double_approx(realpart(Roots[j])) << " " << + double_approx(imagpart(Roots[j])) << endl; + j++; + } + } + cout << endl; + } + return; +} + + +/******************************************************************************/ +// +// Calculating the roots of a polynomial approximation minimizing +// the integral of relative quadratic deviation from x^(-Alpha) +// in an interval. +// +// The coefficients of the polynomials are assumed to be known. 
+ +// Input parameters: +// order of the polynomial: Maxpow +// the (negative) power to be approximated: Alpha +// lower bound of interval of approximation: Epsilon +// upper bound of interval of approximation: Lambda +// +// The name of a array containing the coefficients: Coef +// +// The precision: Digit +// +// name of the file for writing results: Filename +// start file for Start=yes, otherwise append Start +// format for assignments, fortran or tao: Format +// print intermediate results for Printing=yes Printing + + +void ApproxiRootr(int Maxpow, cl_F Epsilon, cl_F Lambda, cl_F* Coef, + float_format_t Digit, + char* Filename, char* Format, int Printing) +{ + int ord, leng; + + cl_N Poly[1+Maxpow], Root[Maxpow], Rho[Maxpow]; + cl_F wlst[1+Maxpow]; + cl_N roots[2*Maxpow]; + cl_F pi2 = cl_float(realpart(acos(ZERO)))/HALF/HALF; + cl_F rr, ang, coef; + + + // Check input + + if(Maxpow < 0 || Maxpow > MAXORD) { + cout < cl_F func(T &x) { + return(ONE/sqrt(x)); +} + +// This routine produces coefficients for Chebycheff pol. 
+// of a given order and interval [epsilon, lambda] + +void ChebyCoeff(const int order, + const cl_F &epsilon, const cl_F &lambda, + cl_F * Coeff, cl_F * c) { + + cl_F bma = HALF*(lambda-epsilon); + cl_F bpa = HALF*(lambda+epsilon); + cl_F y; + cl_F ftable[5000]; + + for(int i = 0; i < order+1; i++) { + y = cos(pi(bma)*As(cl_F)((cl_R(i)+HALF)/cl_R(order+1))); + ftable[i] = func(y*bma+bpa); + } + + cl_F fac = As(cl_F)(TWO/cl_R(order+1)); + for(int i = 0; i < order+1; i++) { + cl_F sumit = ZERO; + for(int j = 0; j < order+1; j++) { + sumit = sumit + ftable[j]*cos(pi(bma)*(cl_R(i)*(cl_R(j)+HALF)/cl_R(order+1))); + } + Coeff[i] = fac*sumit; + } + + cl_univpoly_real_ring PR = find_univpoly_ring(cl_R_ring); + cl_UP_R b = PR->create(order); + for(int i = 0; i < order+1; i++) { + b.set_coeff(i, ZERO); + } + b.set_coeff(0, -HALF*Coeff[0]); + b.finalize(); + + for(int i=0; i < order+1; i++) { + cl_UP_I C = tschebychev(i); + for(int j=0; j < i+1; j++) { + cl_R c = coeff(b, j); + b.set_coeff(j, c + Coeff[i]*cl_R(coeff(C, j))); + } + b.finalize(); + } + for(int i = 0; i < order+1; i++) { + c[i] = Coeff[i]; + Coeff[i] = As(cl_F)(coeff(b, i)); + cout << i << " : " << c[i] << endl; + } +} + +// here we use clenshaw to evaluate the polynomial + +cl_N EvalCheby(const int order, cl_N * Coeff, cl_N &x, + const cl_N & epsilon, const cl_N lambda) { + cl_N d=complex(ZERO,ZERO), dd=complex(ZERO,ZERO), sv, z, res; + int j; + + z = (TWO*x - epsilon - lambda)/(lambda-epsilon); + + for(j=order; j>=1; j--) { + sv = d; + d = TWO*z*d - dd + Coeff[j]; + dd = sv; + } + + res = z*d - dd + HALF*Coeff[0]; + + return(res); +} + +int main(int argc, char *argv[]) + { + int Maxpow = MAXPOW; + int Printing = TRUE; + + float_format_t Digit = float_format(DIGIT); + + cl_F Epsilon = EPSILON, Lambda = LAMBDA; + + double sec(-(double(clock()))/double(CLOCKS_PER_SEC)); + + +// Check order + + if(Maxpow > MAXORD) + { cout <<"Polynomial order is too large: " + < " <. 
+ +// Defining input parameters for quadroptRoot.C + +// Defining macros + +#define TRUE 1 +#define FALSE 0 + +#define MAXORD 200 // Maximal possible value of polynomial order ( > 1) + +// Precision of cl_F corresponding to MAXORD. +// +// A good guess is: DIGIT = 70+2.8*MAXORD +// +// but one has to check this by two runs with increasing precision. + +// 700 +#define DIGIT 700 // Precision of cl_F + + +// Define constants to the desired precision + +cl_F ONE = "1.0e+0_700"; // Precise 1 +cl_F TWO = "2.0e+0_700"; // Precise 2 +cl_F ZERO = "0.0e+0_700"; // Precise 0 +cl_F HALF = "0.5e+0_700"; // Precise 0.5 +cl_F HUND = "100.e+0_700"; + +// Define basic parameters to the desired precision + +int MAXPOW = 48; + +/* cl_F ALPHA = "-0.500e+0_700", */ +/* EPSILON = "0.1e+0_700", */ +/* LAMBDA = "1.00e+0_700"; */ + +cl_F ALPHA = "0.500e+0_700", + EPSILON = "0.0043e+0_700", + LAMBDA = "1.e+0_700"; + +// Define output format and files + +char Format[] = "C"; +char Filename[] = "recur_A25_8_002.cff"; +char Filenamr[] = "roots_A25_8_002.cff"; + +/******************************************************************************/ + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/util/laguer/laguer.c b/qcd/part_cpu/applications/QCD/src/kernel_D/util/laguer/laguer.c new file mode 100644 index 0000000000000000000000000000000000000000..bfe78188f0e0fa37e64c6b82375cf2541f179b75 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/util/laguer/laguer.c @@ -0,0 +1,144 @@ +/*********************************************************************** + * + * Copyright (C) 2007 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * + * Hybrid-Monte-Carlo for twisted mass QCD + * + * Author: Carsten Urbach + * urbach@physik.fu-berlin.de + * + *******************************************************************************/ +#include +#include +#include +#include + +const double epss=1.e-7; +const int MT = 10; +#define MR 8 + +inline dmax(double x, double y) { + if(x > y) return(x); + return(y); +} + +int laguer(double complex a[], const int m, double complex *x, int *its, const int maxit) { + int iter, i, j; + double abx, abp, abm, err; + double complex dx,x1,b,d,f,g,h,sq,gp,gm,g2; + static double frac[MR+1] = {0.0,0.5,0.25,0.75,0.13,0.38,0.62,0.88,1.0}; + for (iter = 1; iter <= maxit; iter++) { + *its = iter; + b = a[m]; + err = cabs(b); + d = 0.; + f = 0.; + abx = cabs(*x); + for (j = m-1; j >= 0; j--) { + f = (*x) * f + d; + d = (*x) * d + b; + b = (*x) * b + a[j]; + err = cabs(b) + abx * err; + } + err *= epss; + if (cabs(b) <= err) return(0); + g = d / b; + g2 = g * g; + h = g2 - 2. * f / b; + sq = csqrt((double)(m-1) * ((double)(m)*h - g2)); + gp = g + sq; + gm = g - sq; + abp = cabs(gp); + abm = cabs(gm); + if (abp < abm) gp = gm; + dx=((dmax(abp,abm) > 0. ? + ((double complex)(m))/gp : + (1. 
+ abx)*(cos((double)iter) + _Complex_I*sin((double)iter)))); + x1 = (*x) - dx; + + if (creal(*x) == creal(x1) && cimag(*x) == cimag(x1)) { + return(0); + } + if (iter % MT) { + *x=x1; + } + else { + *x = (*x) - frac[iter/MT]*dx; + } + } + fprintf(stderr, "Too many iterations in laguer\n"); + return(-1); +} + +int zroots(double complex a[], const int m, double complex roots[], const int polish) { + int i, j, jj, its, k; + double complex x, b, c, ad[1000]; + for(j = 0; j < m+1; j++) { + ad[j] = a[j]; + } + for(j = m; j > 0; j--) { + x = 0.; + if((k = laguer(ad, j, &x, &its, 800)) != 0) { + fprintf(stderr, "something wront!\n"); + } + if(abs(cimag(x)) <= 2.*epss*abs(creal(x))) x = creal(x); + roots[j-1] = x; + b = ad[j]; + for(jj = j-1; jj > -1; jj--) { + c = ad[jj]; + ad[jj] = b; + c = x*b + c; + } + } + if(polish) { + for(j = 1; j < m+1; j++) { + if((k = laguer(a, m, &roots[j-1], &its, 800)) != 0) { + fprintf(stderr, "something wront!\n"); + } + } + } + for(j = 2; j < m+1; j++) { + x = roots[j-1]; + for(i = j-1; i > 0; i--) { + if(creal(roots[i-1]) <= creal(x)) break; + roots[i] = roots[i-1]; + } + } + return(0); +} + +int main() { + int i; + double complex a[5]; + double complex roots[5]; + + a[0] = 1.; + a[1] = 1.; + a[2] = 1.; + a[3] = 1.; + a[4] = 1.; + zroots(a, 2, roots, 1); + for(i = 0; i < 2; i++) { + printf("%f %f %f %f\n", creal(roots[i]), cimag(roots[i]), + creal(a[0]+a[1]*roots[i]+a[2]*roots[i]*roots[i]), + cimag(a[0]+a[1]*roots[i]+a[2]*roots[i]*roots[i])); + } + + return(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/util/laguer/quadroptRoot.C b/qcd/part_cpu/applications/QCD/src/kernel_D/util/laguer/quadroptRoot.C new file mode 100644 index 0000000000000000000000000000000000000000..3bc58a7831091aca73accd69aba75c3c57472f75 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/util/laguer/quadroptRoot.C @@ -0,0 +1,1189 @@ +/******************************************************************************/ +// +// Copyright (C) 2005 
Istvan Montvay +// +// Procedures for determining the recurrence coefficients and +// roots of a least-square optimized polynomial. +// +// The function to be approximated is: 1/x^alpha +// +// Root pairs and square-root of roots are optimally ordered. +// +// This runs with CLN. +// +// Last changed: April 18, 2005 Istvan Montvay +// +/******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace cln; +using namespace std; + +// Defining input parameters in a header file + +#include "inputParam.h" + + +// Define global variables + +cl_F Recb[1+MAXORD-1], Recg[1+MAXORD-2], Orth[1], Coed[1+MAXORD]; +cl_F Coef[1+MAXORD]; + + +/******************************************************************************/ +// +// Calculate the basic integral s_nu for least-square optimization. +// +// Input parameters: +// order of the polynomial: Maxpow +// +// the power to be approximated: Alpha +// lower bound of interval of approximation: Epsilon +// upper bound of interval of approximation: Lambda +// +// The result is put in Sint +// The precision: Digit + +void BaseIntS(int Maxpow, cl_F Alpha, cl_F Epsilon, cl_F Lambda, cl_F* Sint, + float_format_t Digit) + { + int ord; + cl_F power, small = As(cl_F)(expt(cl_float(0.1,Digit),DIGIT)); + +// Loop over powers + + for(ord = 0; ord < 2*Maxpow+1; ord++) + { power = As(cl_F)(2*Alpha+(ord+1)); + + if(abs(power) < small) + Sint[ord] = As(cl_F)(log(Lambda/Epsilon)); + + else + Sint[ord] = As(cl_F)(expt(Lambda,power)-expt(Epsilon,power))/power; } + + } +/******************************************************************************/ +// +// Calculate the basic integral t_nu for least-square optimization. 
+// +// Input parameters: +// order of the polynomial: Maxpow +// +// the power to be approximated: Alpha +// lower bound of interval of approximation: Epsilon +// upper bound of interval of approximation: Lambda +// +// The result is put in Tint +// The precision: Digit + +void BaseIntT(int Maxpow, cl_F Alpha, cl_F Epsilon, cl_F Lambda, cl_F* Tint, + float_format_t Digit) + { + int ord; + cl_F power, small = As(cl_F)(expt(cl_float(0.1,Digit),DIGIT));; + +// Loop over powers + + for(ord = 0; ord < Maxpow+1; ord++) + { power = As(cl_F)(Alpha+(ord+1)); + + if(abs(power) < small) + Tint[ord] = As(cl_F)(log(Lambda/Epsilon)); + + else + Tint[ord] = As(cl_F)(expt(Lambda,power)-expt(Epsilon,power))/power; } + + } +/******************************************************************************/ +// +// Evaluate the approximate polynomial up to the order Maxpow +// at the variable value Xval +// +// The recurrence coefficients are in Recb +// and in Recg +// Constant term of the first orthogonal polynomial: Orth +// Expansion coefficients in orthogonal polynomials: Coed + +cl_F Recurev(int Maxpow, cl_F Xval, + cl_F* Recb,cl_F* Recg,cl_F* Orth,cl_F* Coed) + { + int ord; + cl_F res, orth, orthb, orthg; + + +// Check input + + if(Maxpow < 0 || Maxpow > MAXORD) + { cout < 0) + { orthb = orth; + orth = Xval+Orth[0]; + res = res+Coed[1]*orth; } + +// Iteration for recurrence + + for(ord = 2; ord < Maxpow+1; ord++) + { orthg = orthb; + orthb = orth; + orth = (Xval+Recb[ord-1])*orthb+Recg[ord-2]*orthg; + res = res+Coed[ord]*orth; } + + return res; + } +/******************************************************************************/ +// +// Write elements of a list of numbers in the array Wlist +// of length Leng +// as assignment statements +// to the elements of an array Arrayname +// +// The ofstream of the output file is Ostream +// The format of assignements is C, F(ortran) or T(ao): Format +// +// At the beginning write the string Text + +void WriteAssign(ostream& Ostream, 
char* Format, char* Text, + cl_F* Wlist, int Leng, char* Arrayname) + { + int ord; + char c[] = "C", fortran[] = "Fortran", tao[] = "Tao"; + + Ostream < 0) + { Recb[0] = -sint[1]/sint[0]; + Orth[0] = -sint[1]/sint[0]; + norq[1] = sint[2]-expt(sint[1],2)/sint[0]; } + + if(Maxpow > 1) + { for(ord = 0; ord < 2*Maxpow-1; ord++) + rmum[ord] = sint[ord]; + + for(ord = 0; ord < 2*Maxpow; ord++) + rmuv[ord] = sint[ord+1]+Orth[0]*sint[ord]; + + fmu = Orth[0]; + Recb[1] = -fmu-rmuv[2]/rmuv[1]; + Recg[0] = -rmuv[1]/rmum[0]; } + +// Iteration for orthogonal polynomials + + for(or1 = 1; or1 < Maxpow; or1++) + { Recb[or1] = -fmu-rmuv[or1+1]/rmuv[or1]; + Recg[or1-1] = -rmuv[or1]/rmum[or1-1]; + fmu = fmu+Recb[or1]; + + for(or2 = 0; or2 < 2*Maxpow-or1; or2++) + rmup[or2] = rmuv[or2+1]+Recb[or1]*rmuv[or2]+Recg[or1-1]*rmum[or2]; + + norq[or1+1] = rmup[or1+1]; + + for(or2 = 0; or2 < 2*Maxpow-or1-1; or2++) + rmum[or2] = rmuv[or2]; + + for(or2 = 0; or2 < 2*Maxpow-or1; or2++) + rmuv[or2] = rmup[or2]; + + if(Printing == TRUE) + cout < 0) + { bint[1] = tint[1]+Orth[0]*tint[0]; + Coed[1] = bint[1]/norq[1]; } + + if(Maxpow > 1) + { for(ord = 0; ord < Maxpow-1; ord++) + bmum[ord] = tint[ord]; + + for(ord = 0; ord < Maxpow; ord++) + bmuv[ord] = tint[ord+1]+Orth[0]*tint[ord]; } + +// Perform iteration + + for(or1 = 1; or1 < Maxpow; or1++) + { + for(or2 = 0; or2 < Maxpow-or1; or2++) + bmup[or2] = bmuv[or2+1]+Recb[or1]*bmuv[or2]+Recg[or1-1]*bmum[or2]; + + bint[or1+1] = bmup[0]; + Coed[or1+1] = bint[or1+1]/norq[or1+1]; + + for(or2 = 0; or2 < Maxpow-or1-1; or2++) + bmum[or2] = bmuv[or2]; + + for(or2 = 0; or2 < Maxpow-or1; or2++) + bmuv[or2] = bmup[or2]; + + if(Printing == TRUE) + cout < MAXORD) + { cout < 0) oplp[orv] = oplp[orv]+opl[orv-1]; } + + for(orv = 0; orv < Maxpow+1; orv++) + poly[orv] = poly[orv]+Coed[ord+1]*oplp[orv]; + + for(orv = 0; orv < Maxpow+1; orv++) + oplm[orv] = opl[orv]; + + for(orv = 0; orv < Maxpow+1; orv++) + opl[orv] = oplp[orv]; } + +// Extract coefficients + + for(ord = 
0; ord < Maxpow+1; ord++) + Coef[ord] = As(cl_F)(poly[Maxpow-ord]); + + } +/******************************************************************************/ +// +// Evaluate a complex polynomial +// +// The polynomial is Poly +// The order is Maxpow +// +// The value of the variable: Valu + +cl_N EvalPoly(cl_N* Poly, int Maxpow, cl_N Valu) + { + int pow; + cl_N xpow, sum; + + + sum = As(cl_N)(complex(ZERO,ZERO)); + xpow = As(cl_N)(complex(ONE,ZERO)); + + for(pow = 0; pow < Maxpow+1; pow++) + { sum = sum+xpow*Poly[pow]; + xpow = xpow*Valu; } + + return sum; + + } +/******************************************************************************/ +// +// Find a root of a complex polynomial by Laguerre iteration. +// +// The polynomial is Poly +// The order is Maxpow +// +// The precision: Digit +// +// Print intermediate results for Printing=TRUE + +cl_N Lasolv(cl_N* Poly, int Maxpow, float_format_t Digit, int Printing, const int itemax=200) + { + int pow, ite; + + cl_F angl, small = As(cl_F)(expt(cl_float(0.1,Digit),DIGIT/2)); + + cl_N dif1[Maxpow], dif2[Maxpow-1]; + cl_N root, val0, val, val1, val2, denp, denm, las1, las2, sqrv; + + for(pow = 0; pow < Maxpow; pow++) + dif1[pow] = (pow+1)*Poly[pow+1]; + + for(pow = 0; pow < Maxpow-1; pow++) + dif2[pow] = (pow+1)*dif1[pow+1]; + +// The maximal allowed number of iterations is set here; +// this can be chosen larger, but 100 usually suffices + + root = As(cl_N)(complex(ZERO,ZERO)); + val0 = EvalPoly(Poly,Maxpow,root); + +// Iteration + + for(ite = 0; ite < itemax; ite++) + { + val = val0; + val1 = EvalPoly(dif1,Maxpow-1,root); + val2 = EvalPoly(dif2,Maxpow-2,root); + + sqrv = (Maxpow-1)*((Maxpow-1)*val1*val1-Maxpow*val0*val2); + angl = HALF*cl_float(phase(sqrv),Digit); + sqrv = sqrt(abs(sqrv))*complex(cos(angl),sin(angl)); + denp = val1+sqrv; + denm = val1-sqrv; + + if(denp == complex(ZERO,ZERO)) + root = root-Maxpow*val0/denm; + + else + { if(denm == complex(ZERO,ZERO)) + root = root-Maxpow*val0/denp; + + else + { las1 = 
-Maxpow*val0/denp; + las2 = -Maxpow*val0/denm; + + if(realpart(las1*conjugate(las1)) < + realpart(las2*conjugate(las2))) + root = root+las1; + + else + root = root+las2; } } + +// Look whether the root is good enough + + val0 = EvalPoly(Poly,Maxpow,root); + + if(abs(val0) == ZERO || + (abs(val0) < small) && abs(val0/val) > 0.7) + { + if(Printing == TRUE) + { cout << endl << "Laguerre iterations: " << ite << endl; + cout << endl << "root = " << root << endl; + cout << endl << "value at root: " << val0 << endl; } + + break; } } + + if(ite >= itemax) + cout < 0; pow--) { + polc[pow] = polc[pow-1]-Root[fnd]*polc[pow]; + } + + polc[0] = -Root[fnd]*polc[pow]; + + // Divide the polynomial by the root + + maxp = Maxpow-fnd-1; + coen[maxp] = coef[maxp+1]; + + for(pow = maxp-1; pow > -1; pow--) { + coen[pow] = coef[pow+1]+Root[fnd]*coen[pow+1]; + } + + for(pow = 0; pow < maxp+1; pow++) { + coef[pow] = coen[pow]; + poly[pow] = coef[pow]; + } + } + + else { + break; + } + } + +// Compare input with product of root factors + + if(Printing == TRUE) { + for(pow = 0; pow < Maxpow+1; pow++) { + polc[pow] = Poly[pow]-poly[0]*polc[pow]; + } + + cout < 47) + { cout < values[rr][nl]) min = values[rr][nl]; } + + mx[rr] = max-min; } + + mn = large; + rv = -1; + + for(rr = 0; rr < Maxpow; rr++) + if(facc[rr] == 1) + if(mn > mx[rr]) + { mn = mx[rr]; + rv = rr; } + + if(Printing == TRUE) + cout < 137) + { cout < 0) + { facp[rn] = r1; + facr[r1] = 0; + rn++; + facp[rn] = r2; + facr[r2] = 0; + rn++; } + else + { facp[rn] = r2; + facr[r2] = 0; + rn++; + facp[rn] = r1; + facr[r1] = 0; + rn++; } + + break; } } } + +// Check whether the complex roots are in complex conjugate pairs + + if((Maxpow-rn) != 0) + { cout < 0); + rc++; } } + +// Calculate root factors + + for(rr = 0; rr < Maxpow; rr++) + for(nl = 0; nl < nmax; nl++) + ff[rr][nl] = xx[nl]-root[rr]; + + for(nl = 0; nl < nmax; nl++) + pp[nl] = complex(cl_float(1,digit),cl_float(0,digit)); + + if(Printing == TRUE) + { cout < values[rr][nl]) 
min = values[rr][nl]; } + + mx[rr] = max-min; } + + mn = large; + rv = -1; + + for(rr = 0; rr < Maxpow; rr++) + if(facl[rr] != 0 && facr[rr] == 1) + if(mn > mx[rr]) + { mn = mx[rr]; + rv = rr; } + + if(Printing == TRUE) + cout < MAXORD) + { cout < cl_F func(T &x) { + return(ONE/sqrt(x)); +} + +void ChebyCoeff(const int order, + const cl_F &epsilon, const cl_F &lambda, + cl_F * Coef) { + + cl_F bma = HALF*(lambda-epsilon); + cl_F bpa = HALF*(lambda+epsilon); + cl_F y; + cl_F ftable[500]; + + for(int i = 0; i < order+1; i++) { + y = cos(pi(bma)*(cl_R(i)+HALF)/cl_R(order+1)); + ftable[i] = func(y*bma+bpa); + } + + cl_F fac = cl_F(TWO/cl_R(order+1)); + for(int i = 0; i < order+1; i++) { + cl_F sumit = ZERO; + for(int j = 0; j < order+1; j++) { + sumit = sumit + ftable[j]*cos(pi(bma)*cl_R(i)*(cl_R(j)+HALF)/cl_R(order+1)); + } + Coef[i] = fac*sumit; + } +} + +int main(int argc, char *argv[]) + { + int Maxpow = MAXPOW; + int Printing = TRUE; + + float_format_t Digit = float_format(DIGIT); + + cl_F Alpha = ALPHA, Epsilon = EPSILON, Lambda = LAMBDA; + + double sec(-(double(clock()))/double(CLOCKS_PER_SEC)); + + +// Check order + + if(Maxpow > MAXORD) + { cout <<"Polynomial order is too large: " + < " <. + ***********************************************************************/ +/* + Program to read a ILDG + configuration into memory. 
- based on ILDG from Carsten + + Writes float to D_ukqcd + +*/ + +#include +#include +#include +#include"io.h" + +int main() +{ + char filename[150] ; + /* ILDG double U[NT][NZ][NY][NX][4][3][3][2]; mu: XYZT*/ + const int NX = 24 ; + const int NY = 24 ; + const int NZ = 24 ; + const int NT = 48 ; + + int dim = NX*NY*NZ*NT*2*9*4 ; + int dimukqcd = NX*NY*NZ*2*6*4 ; /* dimension of write 2 cols case */ + int nxyz = NX*NY*NZ ; + int iri,irv,idim,icol,irow, ixyz,ix,iy,iz,it; + int iuk, icp, iout; + double xnr11,xnr12,xnr22,xtemp1,xtemp2; + double *config=NULL ; + + FILE *fp; + /* UKQCD float U[NT][NZ][NY][NX][4][3][2][2] and separate t-slices */ + float *ukqcd ; + char fileout[] = "D_ukqcd"; + char fname_t[100]; + +/* read in file name from stdin - to be checked in ILDF_read with header */ + scanf("%s",filename); + + + printf("Read in a tmqcd configuration\n") ; + printf("filename = %s\n",filename); + fflush(stdout); + + config = (double *) malloc((size_t) dim * sizeof(double) ) ; + if(errno == ENOMEM) { + fprintf(stderr, "Error reserving space for config\n"); + return(-1); + } + + read_lime_gauge_field_doubleprec(config, filename, NT, NX, NY, NZ); + + for ( iri=0; iri<6; iri++) { + irv=iri*2; + printf("iri=%d value %lf",iri,*(config+irv)); + printf(" value %lf\n",*(config+irv+1)); + } + +/* test SU(3) 12 34 45 + 67 89 1011 */ + + xnr11=0; + xnr12=0; + xnr22=0; + + for ( iri=0; iri<6; iri++) { + xtemp1=*(config+iri); + xtemp2=*(config+iri+6); + xnr11+=xtemp1*xtemp1; + xnr12+=xtemp1*xtemp2; + xnr22+=xtemp2*xtemp2; + } + printf(" xnr11, xnr12, xnr22 =%lf,%lf,%lf\n",xnr11,xnr12,xnr22); + + + ukqcd = (float *) malloc((size_t) dimukqcd * sizeof(float) ) ; + if(errno == ENOMEM) { + fprintf(stderr, "Error reserving space for ukqcd\n"); + return(-1); + } + + for ( it=0; it < NT; it++) { + + sprintf(fname_t,"%s_T%02d",fileout,it); + + printf("fname_t is %s\n",fname_t); + fp= fopen(fname_t, "wb"); + if( fp == NULL ) { + fprintf(stderr, "Error opening binary file to write\n"); + 
return(-1) ; + } + + /* ukqcd order here mu XYZT */ + + iout=0; + + for (iz = 0; iz < NZ; iz++) { + for (iy = 0; iy < NY; iy++) { + for (ix = 0; ix < NX; ix++) { + /* ILDG has order TZYX as UKQCD */ + ixyz = (iz*NY+iy)*NX+ix; + + for (idim = 0; idim < 4; idim++) { + /* ILDG has mu XYZT */ + + for (icol = 0; icol < 3; icol++) { + for (irow = 0; irow < 2; irow++) { + for (iri = 0; iri < 2; iri++) { + icp = iri+icol*2+irow*6+idim*18+(ixyz+it*nxyz)*72 ; + *(ukqcd+iout) =*(config+icp); + iout++; + } + } + } + } + } + } + } + + if( fwrite(ukqcd,sizeof(float),dimukqcd,fp) != dimukqcd ) { + fprintf(stderr, "Error writing binary file\n"); + return(-1) ; + } + fclose(fp); + } + return(0) ; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/util/oox/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_D/util/oox/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..88de5bdd5f3d23582fb60ce8af72709a1c68394a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/util/oox/Makefile @@ -0,0 +1,46 @@ +CC=gcc +CXX=g++ +CFLAGS=-O2 -fexpensive-optimizations -fomit-frame-pointer # -mfpmath=sse -msse2 +LIBS=-lm +OBJECTS_OOX=oox.o +INCLUDE=-I./ + + +# variables for oox_ga executable +# if you want to compile with ga lib support +# please adjust the GALIBPATH variable +# to the toplevel dir of galib +# it is assumed that you compiled the library +# such that a libga.a file is present in the +# ./ga subdir of galib +GALIBPATH=/usr1/scratch/annube/galib247 +LIBS_GA=${LIBS} -L${GALIBPATH}/ga -lga +CFLAGS_GA=${CFLAGS} -DWITHGALIB +INCLUDE_GA=${INCLUDE} -I${GALIBPATH} +OBJECTS_OOX_GA=oox_ga.o oox_gawrapper.o + + +all: oox oox_ga + +oox: ${OBJECTS_OOX} Makefile + ${CXX} ${OBJECTS_OOX} -o $@ ${CFLAGS} ${LIBS} + +oox_ga: ${OBJECTS_OOX_GA} Makefile + ${CXX} ${OBJECTS_OOX_GA} -o $@ ${CFLAGS_GA} ${LIBS_GA} + +oox_gawrapper.o: oox_gawrapper.cxx + ${CXX} ${CFLAGS_GA} -o $@ -c $< ${INCLUDE_GA} + +oox_ga.o: oox.c + ${CC} ${CFLAGS_GA} -o $@ -c $< ${INCLUDE_GA} + 
+clean: + rm oox oox_ga *.o + +.SUFFIXES: + +%.o: %.c + ${CC} ${CFLAGS} -o $@ -c $< ${INCLUDE} + +%.o: %.cxx + ${CXX} ${CFLAGS} -o $@ -c $< ${INCLUDE} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/util/oox/oox.c b/qcd/part_cpu/applications/QCD/src/kernel_D/util/oox/oox.c new file mode 100644 index 0000000000000000000000000000000000000000..64ca6c5582dc023df45507af1d39fa7a7154531d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/util/oox/oox.c @@ -0,0 +1,746 @@ +/************************************************************* + *Written by Andreas Nube (C) 2007 + * + *This program computes and outputs the roots of a chebycheff + * + *polynomial approximating 1/x One Over X -> oox + *************************************************************/ + + + + +#include +#include + +#include +#include + +#include + + +#include +#include + +#include + +#ifdef WITHGALIB +#include "oox_gawrapper.h" +#endif + + +#ifndef NULL +#define NULL 0 +#endif + +#define COMPLEX double complex + +#ifndef M_PI +#define M_PI 3.1415926535897931 +#endif + +int wait=20; + + +/* will be explained later */ +double norm_estimate=3.55; + +/* enum for kind of opitmization criterion */ + +enum optikind {OPTINO,OPTIDEVIATION,OPTIMAX,OPTIRELERROR,OPTIGA}; + +inline double flat(){ return (double)rand()/(double)RAND_MAX;} + +/** + * this computes the routs of the chebycheff polynomial approximating 1/x + */ + + +static void RootsOfOneOverX(COMPLEX* roots,const int degree, + double epsilon){ + + int i; + double arg; + + for(i=1;i<=degree;i++){ + arg=2*M_PI*(double)i/((double)degree+1.0) ; + roots[i-1]=0.5*(1.0+epsilon)*(1.0-cos(arg)) + -I*sqrt(epsilon)*sin(arg); + } + + +} + +/** + * this is MY bitreversal routine im proud of + */ + +void ReverseBits(unsigned int *bits,int numbits){ + if(sizeof(int)==4){ + *bits= (((*bits) & 0x0000ffff) <<16) + (((*bits) & 0xffff0000)>>16); + *bits= (((*bits) & 0x00ff00ff) <<8) + (((*bits) & 0xff00ff00)>>8); + *bits= (((*bits) & 0x0f0f0f0f) <<4) + 
(((*bits) & 0xf0f0f0f0)>>4); + *bits= (((*bits) & 0x33333333) <<2) + (((*bits) & 0xCCCCCCCC)>>2); + *bits= (((*bits) & 0x55555555) <<1) + (((*bits) & 0xAAAAAAAA)>>1); + *bits >>=(32-numbits); + } else {fprintf(stderr,"error: int has not 32 bits\n");} +} + +/**** + * the most efficient way (from my point of view) to compute integer powers of 2 two + */ +int pow2(int n){ + return 1<max) max=logProd; + else if(logProd globMax) globMax=max-min; */ + globMax+=max-min; + } + + return globMax/(double)degree; +} + + +/* used for optimizing the ordering of the polynomial roots */ +double optiMaxVal(const complex *roots,int degree,double norm,complex* testPoints,int n_points,complex* prod){ + int i,j; + double pointsMax=0; + double globMax=0; + double cabsprod; + for(i=0;ipointsMax) pointsMax=cabsprod; + } + if(j==0) globMax=pointsMax; + else if(pointsMax>globMax) globMax=pointsMax; + } + + return globMax; +} + + +/*********************************************************** + * a small brute-force stupid genetic algorithm to optimize + * root ordering + ***********************************************************/ + +void OptimizeOrderMC(COMPLEX *roots,int degree,double norm,double epsilon,int n_points,double (*fn)(const complex*,int,double,complex*,int,complex*) ){ + + + complex *testPoints=malloc(sizeof(complex)*n_points); + complex *error=malloc(sizeof(complex)*n_points); + double overallerror,newoverallerror,initialoverallerror; + complex *newRoots=malloc(sizeof(complex)*degree); + complex* prod=(complex*)malloc(sizeof(complex)*n_points); + + int *perm=malloc(sizeof(int)*degree); + int *permsave=malloc(sizeof(int)*degree); + int *permtmp=malloc(sizeof(int)*degree); + double dx=(1.-epsilon)/((double)n_points-1.0); + int i; + double min=1.0; + + int index1=1; + int index2=2; + time_t lastMinTime,actualTime; + + FILE *fileIndex; + + + /* create test points */ + for(i=0;i break */ + if(actualTime-lastMinTime> wait){ + printf("We stop here since we found no new minimum within 
the last %d seconds\n",wait); + break; + + } + + + /* save state before shuffeling */ + for(i=0;i Doing NO optimization!\n"); +#endif + } else { + fprintf(stderr," Argument -o requires an argument: max | relerror | deviation \n"); + abort(); + } + break; + case 'd': + degree=atol(optarg); + break; + case 'e': + epsilon=atof(optarg); + break; + case 'n': + num_points_for_opti=atoi(optarg); + break; + case 'w': + wait=atoi(optarg); + break; + case 'p': + output_prod_hist = 1 ; + break; + case '?': + if(optopt == 'd' || optopt == 'e' ) { + fprintf(stderr," Argument %c requires an argument \n",optopt); + abort(); + } else if(optopt == 'o' ){ + fprintf(stderr," Argument %c requires an argument: max | relerror | deviation \n",optopt); + abort(); + } else if (isprint (optopt)) + fprintf (stderr, "Unknown option `-%c'.\n", optopt); + else + fprintf (stderr, + "Unknown option character `\\x%x'.\n", + optopt); + default: + abort(); + } + } + + invdegreepo=1.0/((double)degree); + + if(num_points_for_opti==-1) + num_points_for_opti=(int) degree; + + + + dx=(1.0-epsilon)/divisions;x=epsilon; + + /*allocate memory for the roots*/ + roots=(COMPLEX*)malloc(sizeof(COMPLEX)*degree); + if(roots==NULL) + {fprintf(stderr," error allocating memory\n"); + exit(-1); + } + + /*calculate the roots of the chebycheff polynomial approximating 1/x*/ + RootsOfOneOverX(roots,degree,epsilon); + + fprintf(stderr,"here come the roots\n"); + + /* reorder roots in "my manner" of bitreversal order */ + MyBitReversalOrder(roots,degree); + + + /*calculate the normierung */ + fprintf(stderr,"norm_estimate local %lf\n",norm_estimate); + + normierunglocal=norm_estimate; + norm=Normierung(roots,degree,epsilon); + norm_estimate=pow(norm,invdegreepo)*norm_estimate; + + normierunglocal_olddelta=norm_estimate-normierunglocal; + + fprintf(stderr,"First normierung local %lf (delta) -> %e\n",normierunglocal,normierunglocal_olddelta); + + normierunglocal=norm_estimate; + + j=0; + do { + 
norm=Normierung(roots,degree,epsilon); + norm_estimate=pow(norm,invdegreepo)*norm_estimate; + + normierunglocal_delta=norm_estimate-normierunglocal; + if(fabs(normierunglocal_delta) +#include + +#include +//#include "MyRootOrderGenome.h" + + + #include "oox_gawrapper.h" + +using namespace std; + + + +struct poly_params { + int degree; + std::complex *roots; + double norm; + double epsilon; + int n_points; + double *points; + std::complex *prod; + bool *indexMap1; + bool *indexMap2; +}; + + +float objectiveFn(GAGenome &g); +void IdInitializer(GAGenome &g); +void RandomInitializer(GAGenome &g); + +int SwapMutator(GAGenome &g,float pmut); +int SimpleSwapMutator(GAGenome &g,float pmut); +int MyUniformCrossover(const GAGenome& p1, const GAGenome& p2, GAGenome* c1, GAGenome* c2); +bool check(GA1DArrayGenome &g,const char *c); + + +double objectiveDeviation(poly_params* pp,GA1DArrayGenome &g); + + +complex testPP(poly_params* pp,complex value=complex(0.5,0.0)){ + complex prod(1.0,0.); + int i; + for(i=0;idegree;i++){ + prod*=value - pp->roots[i]; + prod*=pp->norm; + } + return prod; +} + + +void initGAObject(int degree,double *roots,double norm,double epsilon,int n_points){ + + complex *wr_roots=new complex[degree]; + unsigned int seed=0; + + GARandomSeed(seed); + + for(int i = 0;i(roots[2*i],roots[2*i+1]); + } + + + poly_params *pp=new poly_params; + + + pp->degree=degree; + pp->roots=wr_roots; + pp->norm=norm; + pp->epsilon=epsilon; + pp->n_points=n_points; + + + pp->points=new double[pp->n_points]; + + pp->prod=new complex[pp->n_points]; + pp->indexMap1=new bool[pp->degree]; + pp->indexMap2=new bool[pp->degree]; + + + double dx=(1.-epsilon)/((double)pp->n_points-1.0); + + /* create test points */ + for(int i=0;in_points;i++){ + pp->points[i]=epsilon+dx*(double)(i); +// pp->points[i]=epsilon+(1.0-epsilon)*GARandomDouble(); + } + + + cout << "result of testPP = " << testPP(pp) << "\n"; + + + GA1DArrayGenome newobj(degree,objectiveFn,(void*)pp); + + if(pp->degree < 200 ){ 
+ newobj.initializer(RandomInitializer); + } else { + newobj.initializer(IdInitializer); + } + + + newobj.mutator(SwapMutator); + newobj.crossover(MyUniformCrossover); + + + GASimpleGA ga(newobj); + + + ga.parameters("settings.txt"); + + ga.pMutation(ga.pMutation()*10.0/(double)pp->degree); + + ga.evolve(seed); + + // ga.evolve(); + + cout << ga.statistics() << "\n"; + + const GA1DArrayGenome &bestgenome=DYN_CAST(const GA1DArrayGenome &, + ga.statistics().bestIndividual()); + + cout << " here comes the best individual \n"; + cout << bestgenome << "\n"; + + for(int i = 0;iroots[bestgenome.gene(i)]); + roots[2*i+1]=imag(pp->roots[bestgenome.gene(i)]); + } + + delete [] pp->roots; + delete [] pp->points; + delete [] pp->prod; + delete [] pp->indexMap1; + delete [] pp->indexMap2; + delete pp; + +} + +float objectiveFn(GAGenome &g){ + GA1DArrayGenome &child=DYN_CAST(GA1DArrayGenome&,g); + double objVal=(double)objectiveDeviation((poly_params*)child.userData(),child); + cout << "Objval = " << objVal; + return 1.0/ objVal ; +} + + +void IdInitializer(GAGenome &g){ + + GA1DArrayGenome &child=DYN_CAST(GA1DArrayGenome&,g); + + int n=child.length(); + for( int i=0;i &child=DYN_CAST(GA1DArrayGenome&,g); + + bool *indexMap=((poly_params*)child.userData())->indexMap1; + + int n=child.length(); + int rnd; + int j,c; + + for(int i=0;i=rnd) break; + } + ++j; + } + + child.gene(i,j); + child.gene(n-1-i,n-1-j); + + indexMap[j]=false; + indexMap[n-1-j]=false; + + } + check(child,"RandomInitializer"); +} + + + + +/* used for optimizing the ordering of the polynomial roots */ +double objectiveDeviation(poly_params* pp,GA1DArrayGenome &g){ + int i,j; + double globMax=0; + double max=0,min=0; + double logProd=0,absprod; + const double eps=1.e-10; + const double large=1.e16; + bool first; + + for(i=0;in_points;i++){ + pp->prod[i]=complex(pp->points[i],0.0); + } + + for(j=0;jdegree;j++){ + + first=true; + for(i=0;in_points;i++){ + pp->prod[i]*=pp->norm*(pp->points[i]-pp->roots[g.gene(j)]); 
+ absprod=abs(pp->prod[i]); + +// if(absprod>large){ absprod=large;} +// if(absprodmax) max=logProd; + else if(logProd globMax) globMax=max-min; */ + globMax+=max-min; + } + + return globMax/pp->degree; +} + +bool check(GA1DArrayGenome &g,const char *msg=" "){ + return true; + bool *indexMap=((poly_params*)g.userData())->indexMap1; + int n=g.length(); + + for(int i=0;i &child=DYN_CAST(GA1DArrayGenome&,g); + + register int n, i; + int index1, index2; + if(pmut <= 0.0) return(0); + + float nMut = pmut * STA_CAST(float,child.length()); + int length = child.length(); + if(nMut < 1.0){ // we have to do a flip test on each bit + nMut = 0; + for(i=0; i &child=DYN_CAST(GA1DArrayGenome&,g); + + int index1,index2; + + int length = child.length(); + index1=GARandomInt(0, length/2-1); + index2=GARandomInt(0, length/2-1); + + if(index1!=index2){ + child.swap(index1, index2); + child.swap(length-1-index1, length-1-index2); + + check(child,"SimpleSwapMutator"); + return 2; + } else { + child.swap(index1, length-1-index1); + check(child,"SimpleSwapMutator"); + return 1; + } + +} + + +// #define RECURSIVE + + int MyUniformCrossover(const GAGenome& p1, const GAGenome& p2, + GAGenome* c1, GAGenome* c2){ + const GA1DArrayGenome &mom=DYN_CAST(const GA1DArrayGenome &, p1); + const GA1DArrayGenome &dad=DYN_CAST(const GA1DArrayGenome &, p2); + + int n=0; + int i; + int mgi,dgi; + + if(c1 && c2){ + GA1DArrayGenome &sis=DYN_CAST(GA1DArrayGenome &, *c1); + GA1DArrayGenome &bro=DYN_CAST(GA1DArrayGenome &, *c2); + + bool *indexMapSis=((poly_params*)sis.userData())->indexMap1; + bool *indexMapBro=((poly_params*)sis.userData())->indexMap2; + + if(sis.length() == bro.length() && + mom.length() == dad.length() && + sis.length() == mom.length()){ + int length=sis.length(); + + /* initialize sisters and brothers indexmap */ + for(i=0;i &sis = (c1 ? 
+ DYN_CAST(GA1DArrayGenome &, *c1) : + DYN_CAST(GA1DArrayGenome &, *c2)); + + if(mom.length() == dad.length() && sis.length() == mom.length()){ + + int length=sis.length(); + bool *indexMapSis=((poly_params*)sis.userData())->indexMap1; + + /* initialize sisters and brothers indexmap */ + for(i=0;i. + ***********************************************************************/ +#include +#include +#include +#include + + +void usage(){ + fprintf(stdout, "Usage: swapendian [options]\n"); + fprintf(stdout, "Options: [-i input-filename]\n"); + fprintf(stdout, " [-o output-filename]\n"); + fprintf(stdout, " [-e single precision]\n"); + fprintf(stdout, " [-h|-? this help]\n"); + exit(0); +} +void byte_swap_assign_singleprec(void * out_ptr, void * in_ptr, int nmemb); +void byte_swap_assign(void * out_ptr, void * in_ptr, int nmemb); + +/* + * swapendian: copies the input file to the output file, reversing the + * byte order of every word (double by default, float with -e). + * Options are parsed with getopt: -i input, -o output, -e, -h/-?. + */ +int main(int argc,char *argv[]) { + + int c; + FILE *ifs, *ofs; + char * ifilename = NULL; + char * ofilename = NULL; + int single = 0; + double tmpd, swapd; + float tmps, swaps; + int cnt = 0; + + while ((c = getopt(argc, argv, "h?i:o:e")) != -1) { + switch (c) { + case 'i': + /* optarg points into argv and stays valid; no fixed-size copy that a long path could overflow */ + ifilename = optarg; + break; + case 'o': + ofilename = optarg; + break; + case 'e': + single = 1; + break; + case 'h': + case '?': + default: + usage(); + break; + } + } + if(ifilename == NULL){ + fprintf(stderr, "input filename missing! Aborting...\n"); + exit(-1); + } + ifs = fopen(ifilename, "r"); + if(ifs == (FILE *)NULL) { + fprintf(stderr, "Could not open file %s\n Aborting...\n", ifilename); + exit(500); + } + + if(ofilename == NULL){ + fprintf(stderr, "output filename missing! 
Aborting...\n"); + exit(-2); + } + ofs = fopen(ofilename, "w"); + if(ofs == (FILE *)NULL) { + fprintf(stderr, "Could not open file %s\n Aborting...\n", ofilename); + exit(500); + } + + while(!feof(ifs)) { + if(!single) { + fread(&tmpd, sizeof(double), 1, ifs); + if(!feof(ifs)) { + cnt++; + byte_swap_assign(&swapd, &tmpd, 1); + fwrite(&swapd, sizeof(double), 1, ofs); + } + } + else { + fread(&tmps, sizeof(float), 1, ifs); + if(!feof(ifs)) { + cnt++; + byte_swap_assign_singleprec(&swaps, &tmps, 1); + fwrite(&swaps, sizeof(float), 1, ofs); + } + } + } + + printf("Swapped endian for %d words\n", cnt); + if(single) { + printf("in- and output file in single precision\n"); + } + else { + printf("in- and output file in double precision\n"); + } + + fclose(ofs); + fclose(ifs); + + return(0); +} + +void byte_swap_assign(void * out_ptr, void * in_ptr, int nmemb){ + int j; + char * char_in_ptr, * char_out_ptr; + double * double_in_ptr, * double_out_ptr; + + double_in_ptr = (double *) in_ptr; + double_out_ptr = (double *) out_ptr; + for(j = 0; j < nmemb; j++){ + char_in_ptr = (char *) double_in_ptr; + char_out_ptr = (char *) double_out_ptr; + + char_out_ptr[7] = char_in_ptr[0]; + char_out_ptr[6] = char_in_ptr[1]; + char_out_ptr[5] = char_in_ptr[2]; + char_out_ptr[4] = char_in_ptr[3]; + char_out_ptr[3] = char_in_ptr[4]; + char_out_ptr[2] = char_in_ptr[5]; + char_out_ptr[1] = char_in_ptr[6]; + char_out_ptr[0] = char_in_ptr[7]; + double_in_ptr++; + double_out_ptr++; + } +} + +void byte_swap_assign_singleprec(void * out_ptr, void * in_ptr, int nmemb){ + int j; + char * char_in_ptr, * char_out_ptr; + float * float_in_ptr, * float_out_ptr; + + float_in_ptr = (float *) in_ptr; + float_out_ptr = (float *) out_ptr; + for(j = 0; j < nmemb; j++){ + char_in_ptr = (char *) float_in_ptr; + char_out_ptr = (char *) float_out_ptr; + + char_out_ptr[3] = char_in_ptr[0]; + char_out_ptr[2] = char_in_ptr[1]; + char_out_ptr[1] = char_in_ptr[2]; + char_out_ptr[0] = char_in_ptr[3]; + float_in_ptr++; + 
float_out_ptr++; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/wrapper/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_D/wrapper/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..049ed7e8c8640b7b9d6f18d34029121b40f687de --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/wrapper/Makefile @@ -0,0 +1,96 @@ + +srcdir = . +top_builddir = .. +abs_top_builddir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +top_srcdir = .. +abs_top_srcdir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +subdir = wrapper +builddir = . + +CFLAGS = -std=c99 -fopenmp -pedantic -Wall +DEPFLAGS = -MM +LDFLAGS = -L${HOME}/lib -L${top_builddir}/lib +DEFS = -DHAVE_CONFIG_H +OPTARGS = -O +SOPTARGS = -O + +AR = ar +RANLIB = ranlib +CC = mpicc +CCDEP = gcc +CCLD = ${CC} +LINK = ${CCLD} ${CFLAGS} ${LDFLAGS} ${OPTARGS} -o $@ +LEX = flex +AUTOCONF = autoconf +DEFS = -DHAVE_CONFIG_H + +INCLUDES = -I$(HOME)/include/ -I. -I${abs_top_builddir}/ -I${abs_top_srcdir}/ -I/include/ -I/include/ +LDADD = +#COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} +COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS} + +LIBRARIES = libwrapper +libwrapper_TARGETS = lib_wrapper + +libwrapper_STARGETS = + +libwrapper_OBJECTS = $(addsuffix .o, ${libwrapper_TARGETS}) +libwrapper_SOBJECTS = $(addsuffix .o, ${libwrapper_STARGETS}) + +# default rule + +all: Makefile dep libwrapper.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) -g +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) +profile all-profile: all + + +#include dep rules + +-include $(addsuffix .d,${libwrapper_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +${libwrapper_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${OPTARGS} -c $< + +${libwrapper_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) 
${SOPTARGS} -c $< + +# rule to make libwrapper + +libwrapper.a: ${libwrapper_OBJECTS} ${libwrapper_SOBJECTS} Makefile + @rm -f libwrapper.a + @${AR} cru libwrapper.a ${libwrapper_OBJECTS} ${libwrapper_SOBJECTS} + @$(RANLIB) libwrapper.a + @cp libwrapper.a ../lib/libwrapper.a + +# rule to generate .d files + +$(addsuffix .d, $(libwrapper_TARGETS) ${libwrapper_STARGETS}): %.d: ${srcdir}/%.c Makefile + @${CCDEP} ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libwrapper_TARGETS} ${libwrapper_STARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} ${$(addsuffix _SOBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libwrapper.a + +distclean: clean + rm -f Makefile + +.PHONY: all dep clean compile-clean distclean profile all-profile debug all-debug diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/wrapper/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_D/wrapper/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..bbff117e5cb301625bb79bd3bf1485af2268542d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/wrapper/Makefile.in @@ -0,0 +1,96 @@ + +srcdir = @srcdir@ +top_builddir = @top_builddir@ +abs_top_builddir = @abs_top_builddir@ +top_srcdir = @top_srcdir@ +abs_top_srcdir = @abs_top_srcdir@ +subdir = wrapper +builddir = @builddir@ + +CFLAGS = @CFLAGS@ +DEPFLAGS = @DEPFLAGS@ +LDFLAGS = @LDFLAGS@ +DEFS = @DEFS@ +OPTARGS = @OPTARGS@ +SOPTARGS = @SOPTARGS@ + +AR = @AR@ +RANLIB = @RANLIB@ +CC = @CC@ +CCDEP = @CCDEP@ +CCLD = ${CC} +LINK = ${CCLD} ${CFLAGS} ${LDFLAGS} ${OPTARGS} -o $@ +LEX = @LEX@ +AUTOCONF = @AUTOCONF@ +DEFS = @DEFS@ + +INCLUDES = @INCLUDES@ +LDADD = +#COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} +COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS} + +LIBRARIES = libwrapper +libwrapper_TARGETS = lib_wrapper + +libwrapper_STARGETS = + +libwrapper_OBJECTS = 
$(addsuffix .o, ${libwrapper_TARGETS}) +libwrapper_SOBJECTS = $(addsuffix .o, ${libwrapper_STARGETS}) + +# default rule + +all: Makefile dep libwrapper.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@ +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@ +profile all-profile: all + + +#include dep rules + +-include $(addsuffix .d,${libwrapper_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +${libwrapper_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${OPTARGS} -c $< + +${libwrapper_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${SOPTARGS} -c $< + +# rule to make libwrapper + +libwrapper.a: ${libwrapper_OBJECTS} ${libwrapper_SOBJECTS} Makefile + @rm -f libwrapper.a + @${AR} cru libwrapper.a ${libwrapper_OBJECTS} ${libwrapper_SOBJECTS} + @$(RANLIB) libwrapper.a + @cp libwrapper.a ../lib/libwrapper.a + +# rule to generate .d files + +$(addsuffix .d, $(libwrapper_TARGETS) ${libwrapper_STARGETS}): %.d: ${srcdir}/%.c Makefile + @${CCDEP} ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libwrapper_TARGETS} ${libwrapper_STARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} ${$(addsuffix _SOBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libwrapper.a + +distclean: clean + rm -f Makefile + +.PHONY: all dep clean compile-clean distclean profile all-profile debug all-debug diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/wrapper/lib_wrapper.c b/qcd/part_cpu/applications/QCD/src/kernel_D/wrapper/lib_wrapper.c new file mode 100755 index 0000000000000000000000000000000000000000..f8b736234d08ddeec0984fb43c2cdd492a0b5cc1 --- /dev/null +++ 
b/qcd/part_cpu/applications/QCD/src/kernel_D/wrapper/lib_wrapper.c @@ -0,0 +1,333 @@ +/*********************************************************************** + * + * Copyright (C) 2014 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * invert wrapper for using tmLQCD as a library + * + * Author: Carsten Urbach + * curbach@gmx.de + * + *******************************************************************************/ + +#include +#include +#include +#include +#include +#include +#ifdef MPI +#include +#endif +#ifdef OMP +# include +#endif +#include "global.h" +#include "git_hash.h" +#include "getopt.h" +#include "linalg_eo.h" +#include "geometry_eo.h" +#ifdef MPI +#include "xchange/xchange.h" +#endif +#include +#include +#include "read_input.h" +#include "mpi_init.h" +#include "init/init.h" +#include "sighandler.h" +#include "boundary.h" +#include "invert_eo.h" +#include "start.h" +#include "operator.h" +#include "linalg/convert_eo_to_lexic.h" +#include "include/tmLQCD.h" + +#ifdef HAVE_GPU +extern void init_mixedsolve_eo(su3** gf); +extern void init_mixedsolve(su3** gf); +extern void finalize_mixedsolve(); +extern void init_gpu_fields(int need_momenta); +extern void finalize_gpu_fields(); +#include "GPU/cudadefs.h" +# ifdef TEMPORALGAUGE +# include "temporalgauge.h" +# endif +#endif + + +static int tmLQCD_invert_initialised = 0; + +int tmLQCD_invert_init(int 
argc, char *argv[], const int _verbose) { + + DUM_DERI = 8; + DUM_MATRIX = DUM_DERI + 5; + NO_OF_SPINORFIELDS = DUM_MATRIX + 3; + //4 extra fields (corresponding to DUM_MATRIX+0..5) for deg. and ND matrix mult. + NO_OF_SPINORFIELDS_32 = 6; + + // in read_input.h + verbose = _verbose; + g_use_clover_flag = 0; + +#ifdef MPI + MPI_Comm_rank(MPI_COMM_WORLD, &g_proc_id); +#else + g_proc_id = 0; +#endif + + /* Read the input file */ + if( (read_input("invert.input")) != 0) { + fprintf(stderr, "tmLQCD_init_invert: Could not find input file: invert.input\nAborting..."); + /* actually abort: the message announces it and every other failure path of this function returns -1; without this the setup below would run without a successfully read input file */ + return(-1); + } + +#ifdef OMP + init_openmp(); +#endif + + tmlqcd_mpi_init(argc, argv); + g_dbw2rand = 0; + /* even/odd ordering is switched off as soon as one operator does not request it */ + for(int j = 0; j < no_operators; j++) if(!operator_list[j].even_odd_flag) even_odd_flag = 0; + + /* j accumulates the return codes of the double and 32-bit gauge field initialisation; non-zero is treated as failure */ +#ifdef _GAUGE_COPY + int j = init_gauge_field(VOLUMEPLUSRAND, 1); + j += init_gauge_field_32(VOLUMEPLUSRAND, 1); +#else + int j = init_gauge_field(VOLUMEPLUSRAND, 0); + j += init_gauge_field_32(VOLUMEPLUSRAND, 0); +#endif + if (j != 0) { + fprintf(stderr, "tmLQCD_init_invert: Not enough memory for gauge_fields! Aborting...\n"); + return(-1); + } + j = init_geometry_indices(VOLUMEPLUSRAND); + if (j != 0) { + fprintf(stderr, "tmLQCD_init_invert: Not enough memory for geometry indices! Aborting...\n"); + return(-1); + } + /* with even/odd ordering each spinor field is initialised with half the size */ + if (even_odd_flag) { + j = init_spinor_field(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS); + j += init_spinor_field_32(VOLUMEPLUSRAND / 2, NO_OF_SPINORFIELDS_32); + } + else { + j = init_spinor_field(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS); + j += init_spinor_field_32(VOLUMEPLUSRAND, NO_OF_SPINORFIELDS_32); + } + if (j != 0) { + fprintf(stderr, "tmLQCD_init_invert: Not enough memory for spinor fields! 
Aborting...\n"); + return(-1); + } + // define the geometry + geometry(); + + // initialise the operators + init_operators(); + +#ifdef HAVE_GPU + if(usegpu_flag){ + if(even_odd_flag){ + init_mixedsolve_eo(g_gauge_field); + } + else{ + init_mixedsolve(g_gauge_field); + } +# ifdef GPU_DOUBLE + /*init double fields w/o momenta*/ + init_gpu_fields(0); +# endif +# ifdef TEMPORALGAUGE + int retval; + if((retval=init_temporalgauge(VOLUME, g_gauge_field)) !=0){ + if(g_proc_id == 0) printf("tmLQCD_init_invert: Error while initializing temporal gauge. Aborting...\n"); + exit(200); + } +# endif + }//usegpu_flag +#endif + + +#ifdef _USE_HALFSPINOR + j = init_dirac_halfspinor(); + if (j != 0) { + fprintf(stderr, "tmLQCD_init_invert: Not enough memory for halffield! Aborting...\n"); + return(-1); + } + /* for mixed precision solvers, single precisio halfspinor field must always be there! */ + j = init_dirac_halfspinor32(); + if (j != 0) { + fprintf(stderr, "tmLQCD_init_invert: Not enough memory for 32-bit halffield! Aborting...\n"); + return(-1); + } +# if (defined _PERSISTENT) + if (even_odd_flag) + init_xchange_halffield(); +# endif +#endif + tmLQCD_invert_initialised = 1; + return(0); +} + +int tmLQCD_read_gauge(const int nconfig) { + char conf_filename[500]; + if(!tmLQCD_invert_initialised) { + fprintf(stderr, "tmLQCD_read_gauge: tmLQCD_inver_init must be called first. 
Aborting...\n"); + return(-1); + } + + sprintf(conf_filename, "%s.%.4d", gauge_input_filename, nconfig); + int j=0; + if (g_cart_id == 0) { + printf("#\n# Trying to read gauge field from file %s.\n", + conf_filename); + fflush(stdout); + } + if( (j = read_gauge_field(conf_filename,g_gauge_field)) !=0) { + fprintf(stderr, "tmLQCD_read_gauge: Error %d while reading gauge field from %s\n ...\n", j, conf_filename); + return(-1); + } + if (g_cart_id == 0) { + printf("# Finished reading gauge field.\n"); + fflush(stdout); + } +#ifdef MPI + xchange_gauge(g_gauge_field); +#endif + convert_32_gauge_field(g_gauge_field_32, g_gauge_field, VOLUMEPLUSRAND); + return(0); +} + + +int tmLQCD_invert(double * const propagator, double * const source, + const int op_id, const int write_prop) { + unsigned int index_start = 0; + g_mu = 0.; + + if(!tmLQCD_invert_initialised) { + fprintf(stderr, "tmLQCD_invert: tmLQCD_inver_init must be called first. Aborting...\n"); + return(-1); + } + + if(op_id < 0 || op_id >= no_operators) { + fprintf(stderr, "tmLQCD_invert: op_id=%d not in valid range. 
Aborting...\n", op_id); + return(-1); + } + + operator_list[op_id].sr0 = g_spinor_field[0]; + operator_list[op_id].sr1 = g_spinor_field[1]; + operator_list[op_id].prop0 = g_spinor_field[2]; + operator_list[op_id].prop1 = g_spinor_field[3]; + + zero_spinor_field(operator_list[op_id].prop0, VOLUME / 2); + zero_spinor_field(operator_list[op_id].prop1, VOLUME / 2); + + // convert to even/odd order + convert_lexic_to_eo(operator_list[op_id].sr0, operator_list[op_id].sr1, (spinor*) source); + + // invert + operator_list[op_id].inverter(op_id, index_start, write_prop); + + // convert back to lexicographic order + convert_eo_to_lexic((spinor*) propagator, operator_list[op_id].prop0, operator_list[op_id].prop1); + + return(0); +} + + +/* Releases the fields allocated for the library and marks it as uninitialised; always returns 0. */ +int tmLQCD_finalise() { + +#ifdef OMP + free_omp_accumulators(); +#endif + +#ifdef HAVE_GPU + if(usegpu_flag){ + finalize_mixedsolve(); +# ifdef GPU_DOUBLE + finalize_gpu_fields(); +# endif +# ifdef TEMPORALGAUGE + finalize_temporalgauge(); +# endif + } +#endif + + free_gauge_field(); + free_gauge_field_32(); + free_geometry_indices(); + free_spinor_field(); + free_spinor_field_32(); + free_moment_field(); + free_chi_spinor_field(); + /* the fields are gone: clear the flag so that later API calls fail their initialisation check instead of using freed storage */ + tmLQCD_invert_initialised = 0; +#ifdef MPI + MPI_Barrier(MPI_COMM_WORLD); +#endif + return(0); +} + + +/* Copies the lattice extents, nstore, Nsave and the number of operators into *params; returns -1 if the library is not initialised. */ +int tmLQCD_get_lat_params(tmLQCD_lat_params * params) { + if(!tmLQCD_invert_initialised) { + fprintf(stderr, "tmLQCD_get_lat_params: tmLQCD_invert_init must be called first. Aborting...\n"); + return(-1); + } + + params->LX = LX; + params->LY = LY; + params->LZ = LZ; + params->T = T; + params->nstore = nstore; + params->nsave = Nsave; + params->no_operators = no_operators; + return(0); +} + +int tmLQCD_get_mpi_params(tmLQCD_mpi_params * params) { + if(!tmLQCD_invert_initialised) { + fprintf(stderr, "tmLQCD_get_mpi_params: tmLQCD_invert_init must be called first. 
Aborting...\n"); + return(-1); + } + + params->nproc = g_nproc; + params->nproc_t = g_nproc_t; + params->nproc_x = g_nproc_x; + params->nproc_y = g_nproc_y; + params->nproc_z = g_nproc_z; + params->cart_id = g_cart_id; + params->proc_id = g_proc_id; + params->time_rank = g_mpi_time_rank; + params->omp_num_threads = omp_num_threads; + params->proc_coords[0] = g_proc_coords[0]; + params->proc_coords[1] = g_proc_coords[1]; + params->proc_coords[2] = g_proc_coords[2]; + params->proc_coords[3] = g_proc_coords[3]; + + return(0); +} + +int tmLQCD_get_gauge_field_pointer(double ** gf) { + if(!tmLQCD_invert_initialised) { + fprintf(stderr, "tmLQCD_get_gauge_field_pointer: tmLQCD_invert_init must be called first. Aborting...\n"); + return(-1); + } +#ifdef MPI + xchange_gauge(g_gauge_field); +#endif + + *gf = (double*) g_gauge_field[0]; + + return(0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..6e9e44161ff3fab04554d532b4de026b09957395 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/Makefile @@ -0,0 +1,98 @@ + +srcdir = . +top_builddir = .. +abs_top_builddir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +top_srcdir = .. +abs_top_srcdir = /home/jacob/Job/Prace/kernel_D_soft/kernel_D_update2 +subdir = xchange +builddir = . + +CFLAGS = -std=c99 -fopenmp -pedantic -Wall +DEPFLAGS = -MM +LDFLAGS = -L${HOME}/lib -L${top_builddir}/lib +DEFS = -DHAVE_CONFIG_H +OPTARGS = -O +SOPTARGS = -O + +AR = ar +RANLIB = ranlib +CC = mpicc +CCDEP = gcc +CCLD = ${CC} +LINK = ${CCLD} ${CFLAGS} ${LDFLAGS} ${OPTARGS} -o $@ +LEX = flex +AUTOCONF = autoconf +DEFS = -DHAVE_CONFIG_H + +INCLUDES = -I$(HOME)/include/ -I. 
-I${abs_top_builddir}/ -I${abs_top_srcdir}/ -I/include/ -I/include/ +LDADD = +#COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} +COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS} + +LIBRARIES = libxchange +libxchange_TARGETS = xchange_deri xchange_field xchange_gauge xchange_halffield \ + xchange_lexicfield xchange_2fields xchange_field_tslice \ + xchange_jacobi + +libxchange_STARGETS = + +libxchange_OBJECTS = $(addsuffix .o, ${libxchange_TARGETS}) +libxchange_SOBJECTS = $(addsuffix .o, ${libxchange_STARGETS}) + +# default rule + +all: Makefile dep libxchange.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) -g +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) +profile all-profile: all + + +#include dep rules + +-include $(addsuffix .d,${libxchange_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +${libxchange_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${OPTARGS} -c $< + +${libxchange_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${SOPTARGS} -c $< + +# rule to make libxchange + +libxchange.a: ${libxchange_OBJECTS} ${libxchange_SOBJECTS} Makefile + @rm -f libxchange.a + @${AR} cru libxchange.a ${libxchange_OBJECTS} ${libxchange_SOBJECTS} + @$(RANLIB) libxchange.a + @cp libxchange.a ../lib/libxchange.a + +# rule to generate .d files + +$(addsuffix .d, $(libxchange_TARGETS) ${libxchange_STARGETS}): %.d: ${srcdir}/%.c Makefile + @${CCDEP} ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libxchange_TARGETS} ${libxchange_STARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} ${$(addsuffix _SOBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libxchange.a + +distclean: clean + rm -f Makefile + +.PHONY: all dep clean 
compile-clean distclean profile all-profile debug all-debug diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/Makefile.in b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/Makefile.in new file mode 100644 index 0000000000000000000000000000000000000000..bfea6a61b04b0e69ea790dcb69363c4f3686d33b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/Makefile.in @@ -0,0 +1,98 @@ + +srcdir = @srcdir@ +top_builddir = @top_builddir@ +abs_top_builddir = @abs_top_builddir@ +top_srcdir = @top_srcdir@ +abs_top_srcdir = @abs_top_srcdir@ +subdir = xchange +builddir = @builddir@ + +CFLAGS = @CFLAGS@ +DEPFLAGS = @DEPFLAGS@ +LDFLAGS = @LDFLAGS@ +DEFS = @DEFS@ +OPTARGS = @OPTARGS@ +SOPTARGS = @SOPTARGS@ + +AR = @AR@ +RANLIB = @RANLIB@ +CC = @CC@ +CCDEP = @CCDEP@ +CCLD = ${CC} +LINK = ${CCLD} ${CFLAGS} ${LDFLAGS} ${OPTARGS} -o $@ +LEX = @LEX@ +AUTOCONF = @AUTOCONF@ +DEFS = @DEFS@ + +INCLUDES = @INCLUDES@ +LDADD = +#COMPILE = ${CC} ${DEFS} ${INCLUDES} ${CFLAGS} +COMPILE = ${CC} $(DEFS) ${INCLUDES} ${CFLAGS} + +LIBRARIES = libxchange +libxchange_TARGETS = xchange_deri xchange_field xchange_gauge xchange_halffield \ + xchange_lexicfield xchange_2fields xchange_field_tslice \ + xchange_jacobi + +libxchange_STARGETS = + +libxchange_OBJECTS = $(addsuffix .o, ${libxchange_TARGETS}) +libxchange_SOBJECTS = $(addsuffix .o, ${libxchange_STARGETS}) + +# default rule + +all: Makefile dep libxchange.a + +# rules for debugging +debug all-debug: CFLAGS := $(CFLAGS) @DEBUG_FLAG@ +debug all-debug: all + +# rules for profiling information +profile all-profile: CFLAGS := $(filter-out -fomit-frame-pointer,${CFLAGS}) @PROFILE_FLAG@ +profile all-profile: all + + +#include dep rules + +-include $(addsuffix .d,${libxchange_TARGETS}) + +include ${top_srcdir}/Makefile.global + +# rule to compile objects + +${libxchange_OBJECTS}: %.o: ${srcdir}/%.c %.d Makefile ${abs_top_builddir}/config.h + $(COMPILE) ${OPTARGS} -c $< + +${libxchange_SOBJECTS}: %.o: ${srcdir}/%.c %.d Makefile 
${abs_top_builddir}/config.h + $(COMPILE) ${SOPTARGS} -c $< + +# rule to make libxchange + +libxchange.a: ${libxchange_OBJECTS} ${libxchange_SOBJECTS} Makefile + @rm -f libxchange.a + @${AR} cru libxchange.a ${libxchange_OBJECTS} ${libxchange_SOBJECTS} + @$(RANLIB) libxchange.a + @cp libxchange.a ../lib/libxchange.a + +# rule to generate .d files + +$(addsuffix .d, $(libxchange_TARGETS) ${libxchange_STARGETS}): %.d: ${srcdir}/%.c Makefile + @${CCDEP} ${DEFS} ${DEPFLAGS} ${INCLUDES} $< > $@ + +# rule to make dependencies + +dep: ${addsuffix .d, ${libxchange_TARGETS} ${libxchange_STARGETS}} + +# rules to clean + +compile-clean: Makefile + rm -f ${$(addsuffix _OBJECTS, ${LIBRARIES})} ${$(addsuffix _SOBJECTS, ${LIBRARIES})} *.d + +clean: compile-clean + rm -f $(addsuffix .a, ${LIBRARIES}) + rm -f ../lib/libxchange.a + +distclean: clean + rm -f Makefile + +.PHONY: all dep clean compile-clean distclean profile all-profile debug all-debug diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange.h b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange.h new file mode 100644 index 0000000000000000000000000000000000000000..66f68024ab93ed97e6844d6ea86b71ef986fc584 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange.h @@ -0,0 +1,34 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ +#ifndef _XCHANGE_H +#define _XCHANGE_H + +#include "xchange/xchange_field.h" +#include "xchange/xchange_gauge.h" +#include "xchange/xchange_deri.h" +#include "xchange/xchange_halffield.h" +#include "xchange/xchange_jacobi.h" +#include "xchange/xchange_2fields.h" +#include "xchange/xchange_lexicfield.h" + +# ifdef _USE_TSPLITPAR +# include "xchange/xchange_field_tslice.h" +# endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_2fields.c b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_2fields.c new file mode 100644 index 0000000000000000000000000000000000000000..9e083cd6e0923ed668979ca709ee405353928a6e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_2fields.c @@ -0,0 +1,405 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +/********************************************************** + * + * exchange routines for 2 spinor fields at once + * + * Author: Carsten Urbach + * + **********************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#ifdef MPI +# include +#endif + +#include "global.h" +#if (defined XLC && defined BGL) +# include "bgl.h" +#endif +#include "mpi_init.h" +#include "su3.h" +#include "xchange_2fields.h" + +#if (defined _NON_BLOCKING) + +#if ((defined XLC) && (defined PARALLELXYZT)) +#pragma disjoint(*field_buffer_z2, *field_buffer_z, *field_buffer_z3, *field_buffer_z4) +#endif + +/* this version uses non-blocking MPI calls */ + +# ifdef _INDEX_INDEP_GEOM + +/* this is the version independent of the content of the function Index */ +/* this if statement will be removed in future and _INDEX_INDEP_GEOM will be the default */ + +void xchange_2fields(spinor * const l, spinor * const k, const int ieo) { + +#ifdef MPI + MPI_Request requests[32]; + MPI_Status status[32]; +#endif + int reqcount = 0; +#if defined PARALLELXYZT + int ix=0; +#endif + +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchange2fields) +#endif + +# ifdef MPI + +# if (defined BGL && defined XLC) + __alignx(16, l); +# endif + +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + MPI_Isend((void*)(l+g_1st_t_int_dn), 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(l+g_1st_t_ext_up), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Isend((void*)(l+g_1st_t_int_up), 1, field_time_slice_cont, 
g_nb_t_up, 82, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(l+g_1st_t_ext_dn), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + MPI_Isend((void*)(k+g_1st_t_int_dn), 1, field_time_slice_cont, g_nb_t_dn, 83, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(k+g_1st_t_ext_up), 1, field_time_slice_cont, g_nb_t_up, 83, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + /* the receive must target the lower external slice (g_1st_t_ext_dn), exactly as for field l above; receiving into g_1st_t_int_dn would overwrite the interior slice that the tag 83 send is still reading */ + MPI_Isend((void*)(k+g_1st_t_int_up), 1, field_time_slice_cont, g_nb_t_up, 84, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(k+g_1st_t_ext_dn), 1, field_time_slice_cont, g_nb_t_dn, 84, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; +# endif + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Isend((void*)(l+g_1st_x_int_dn), 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(l+g_1st_x_ext_up), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Isend((void*)(l+g_1st_x_int_up), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(l+g_1st_x_ext_dn), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + 
MPI_Isend((void*)(k+g_1st_x_int_dn), 1, field_x_slice_gath, g_nb_x_dn, 93, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(k+g_1st_x_ext_up), 1, field_x_slice_cont, g_nb_x_up, 93, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Isend((void*)(k+g_1st_x_int_up), 1, field_x_slice_gath, g_nb_x_up, 94, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(k+g_1st_x_ext_dn), 1, field_x_slice_cont, g_nb_x_dn, 94, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Isend((void*)(l+g_1st_y_int_dn), 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(l+g_1st_y_ext_up), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(l+g_1st_y_int_up), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(l+g_1st_y_ext_dn), 1, field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Isend((void*)(k+g_1st_y_int_dn), 1, field_y_slice_gath, g_nb_y_dn, 103, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(k+g_1st_y_ext_up), 1, field_y_slice_cont, g_nb_y_up, 103, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve 
the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(k+g_1st_y_int_up), 1, field_y_slice_gath, g_nb_y_up, 104, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(k+g_1st_y_ext_dn), 1, field_y_slice_cont, g_nb_y_dn, 104, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + +# endif + +# if (defined PARALLELXYZ || defined PARALLELXYZT) + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + if(ieo == 1) { + MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_even_dn,g_nb_z_dn,503,g_cart_grid,&requests[reqcount]); + MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[reqcount+1]); + reqcount=reqcount+2; + } else { + MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_odd_dn,g_nb_z_dn,503,g_cart_grid,&requests[reqcount]); + MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[reqcount+1]); + reqcount=reqcount+2; + } + if(ieo == 1) { + MPI_Isend((void*)(k+g_1st_z_int_dn),1,field_z_slice_even_dn,g_nb_z_dn,505,g_cart_grid,&requests[reqcount]); + MPI_Irecv((void*)(k+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,505,g_cart_grid,&requests[reqcount+1]); + reqcount=reqcount+2; + } else { + MPI_Isend((void*)(k+g_1st_z_int_dn),1,field_z_slice_odd_dn,g_nb_z_dn,505,g_cart_grid,&requests[reqcount]); + MPI_Irecv((void*)(k+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,505,g_cart_grid,&requests[reqcount+1]); + reqcount=reqcount+2; + } + + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + if(ieo == 1) { + MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_even_up,g_nb_z_up,504,g_cart_grid,&requests[reqcount]); + MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[reqcount+1]); + reqcount=reqcount+2; + } else { + 
MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_odd_up,g_nb_z_up,504,g_cart_grid,&requests[reqcount]); + MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[reqcount+1]); + reqcount=reqcount+2; + } + if(ieo == 1) { + MPI_Isend((void*)(k+g_1st_z_int_up),1,field_z_slice_even_up,g_nb_z_up,506,g_cart_grid,&requests[reqcount]); + MPI_Irecv((void*)(k+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,506,g_cart_grid,&requests[reqcount+1]); + reqcount=reqcount+2; + } else { + MPI_Isend((void*)(k+g_1st_z_int_up),1,field_z_slice_odd_up,g_nb_z_up,506,g_cart_grid,&requests[reqcount]); + MPI_Irecv((void*)(k+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,506,g_cart_grid,&requests[reqcount+1]); + reqcount=reqcount+2; + } + +# endif + + + MPI_Waitall(reqcount, requests, status); +# endif + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchange2fields) +#endif +} + +# else /* _INDEX_INDEP_GEOM */ +/* Exchange the boundary slices of the two spinor fields l and k with the neighbouring processes: one non-blocking send/receive pair per field and direction is started for every parallelised direction, then MPI_Waitall waits for all of them. In z direction the boundary is first gathered into field_buffer_z..z4; l uses g_field_z_ipt_even when ieo == 1 (odd table otherwise), k uses g_field_z_ipt_even when ieo == 0 (odd table otherwise). Nothing is communicated when MPI is not defined. */ +void xchange_2fields(spinor * const l, spinor * const k, const int ieo) { + + MPI_Request requests[32]; + MPI_Status status[32]; + int reqcount = 0; +#if defined PARALLELXYZT + int ix=0; +#endif + +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchange2fields) +#endif + +# ifdef MPI + +# if (defined BGL && defined XLC) +# ifdef PARALLELXYZT + __alignx(16, field_buffer_z); + __alignx(16, field_buffer_z2); + __alignx(16, field_buffer_z3); + __alignx(16, field_buffer_z4); +# endif + __alignx(16, l); +# endif + + /* send the data to the neighbour on the left */ + /* receive the data from the neighbour on the right */ + MPI_Isend((void*)l, 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(l+T*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the right */ + /* receive the data from the neighbour on the left */ + MPI_Isend((void*)(l+(T-1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 82, 
g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(l+(T+1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the left */ + /* receive the data from the neighbour on the right */ + MPI_Isend((void*)k, 1, field_time_slice_cont, g_nb_t_dn, 83, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(k+T*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 83, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the right */ + /* receive the data from the neighbour on the left */ + MPI_Isend((void*)(k+(T-1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 84, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(k+(T+1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_dn, 84, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + MPI_Isend((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(l+(T+2)*LX*LY*LZ/2), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + MPI_Isend((void*)(l+(LX-1)*LY*LZ/2), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)/2), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + MPI_Isend((void*)k, 1, field_x_slice_gath, g_nb_x_dn, 93, g_cart_grid, &requests[reqcount]); + 
MPI_Irecv((void*)(k+(T+2)*LX*LY*LZ/2), 1, field_x_slice_cont, g_nb_x_up, 93, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + MPI_Isend((void*)(k+(LX-1)*LY*LZ/2), 1, field_x_slice_gath, g_nb_x_up, 94, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(k+((T+2)*LX*LY*LZ + T*LY*LZ)/2), 1, field_x_slice_cont, g_nb_x_dn, 94, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + MPI_Isend((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(l+(LY-1)*LZ/2), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2), 1, field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + MPI_Isend((void*)k, 1, field_y_slice_gath, g_nb_y_dn, 103, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(k+((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2), 1, field_y_slice_cont, g_nb_y_up, 103, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(k+(LY-1)*LZ/2), 1, field_y_slice_gath, 
g_nb_y_up, 104, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(k+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2), 1, field_y_slice_cont, g_nb_y_dn, 104, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + +# endif + +# if (defined PARALLELXYZT) + /* fill buffer ! */ + /* This is now depending on whether the field is */ + /* even or odd */ + if(ieo == 1) { + for(ix = 0; ix < T*LX*LY/2; ix++) { + field_buffer_z[ix] = l[ g_field_z_ipt_even[ix] ]; + } + } + else { + for(ix = 0; ix < T*LX*LY/2; ix++) { + field_buffer_z[ix] = l[ g_field_z_ipt_odd[ix] ]; + } + } + if(ieo == 1) { + for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) { + field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_even[ix] ]; + } + } + else { + for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) { + field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_odd[ix] ]; + } + } + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + MPI_Isend((void*)field_buffer_z, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 503, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(l+(VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ)), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 503, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + MPI_Isend((void*)field_buffer_z2, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 504, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(l+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)/2), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 504, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* fill buffer ! 
*/ + /* This is now depending on whether the field is */ + /* even or odd */ + if(ieo == 0) { + for(ix = 0; ix < T*LX*LY/2; ix++) { + field_buffer_z3[ix] = k[ g_field_z_ipt_even[ix] ]; + } + } + else { + for(ix = 0; ix < T*LX*LY/2; ix++) { + field_buffer_z3[ix] = k[ g_field_z_ipt_odd[ix] ]; + } + } + if(ieo == 0) { + for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) { + field_buffer_z4[ix-T*LX*LY/2] = k[ g_field_z_ipt_even[ix] ]; + } + } + else { + for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) { + field_buffer_z4[ix-T*LX*LY/2] = k[ g_field_z_ipt_odd[ix] ]; + } + } + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + MPI_Isend((void*)field_buffer_z3, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 505, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(k+(VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ)), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 505, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + MPI_Isend((void*)field_buffer_z4, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 506, g_cart_grid, &requests[reqcount]); + MPI_Irecv((void*)(k+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)/2), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 506, g_cart_grid, &requests[reqcount+1]); + reqcount=reqcount+2; + + +# endif + + + MPI_Waitall(reqcount, requests, status); +# endif + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchange2fields) +#endif +} + +# endif /* _INDEX_INDEP_GEOM */ +#endif /* _NON_BLOCKING */ + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_2fields.h b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_2fields.h new file mode 100644 index 0000000000000000000000000000000000000000..2ca4a60f14fd30663fc5ef04f55d9e102cf3e2e6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_2fields.h @@ -0,0 +1,43 @@ 
+/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +/********************************************************** + * + * exchange routines for spinor fields + * + * Author: Carsten Urbach + * + **********************************************************/ + +#ifndef _XCHANGE_2FIELDs_H +#define _XCHANGE_2FIELDs_H + +#define EVEN 1 +#define ODD 0 +/* xchange_2fields(k, l, ieo) exchanges the boundaries of two spinor fields. With _NON_BLOCKING it is a real function; otherwise it is a macro that calls xchange_field on the first field with parity ieo and on the second field with the opposite parity. The macro is wrapped in do { } while(0) so that it expands to exactly one statement and stays correct inside an unbraced if/else or loop; its arguments are parenthesised so that expressions can be passed safely. */ +#ifdef _NON_BLOCKING +void xchange_2fields(spinor * const k, spinor * const l, const int ieo); +#else +# define xchange_2fields(k, l, ieo) \ + do { xchange_field((k), (ieo)); \ + xchange_field((l), ((ieo)+1)%2); } while(0) + +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_deri.c b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_deri.c new file mode 100644 index 0000000000000000000000000000000000000000..57f51f055372772fda4c218799b17c22ce77339e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_deri.c @@ -0,0 +1,619 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * exchange routines for derivative (su3adj) fields + * + * Author: Carsten Urbach + * + **********************************************************/ + + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include "mpi_init.h" +#include "su3.h" +#include "su3adj.h" +#include "xchange_deri.h" +/* add the eight components d1..d8 of ddummy[iy][mu] onto df[ix][mu] for mu = 0..3 */ +static void addup_ddummy(su3adj** const df, const int ix, const int iy) { + for(int mu = 0; mu < 4; mu++) { + df[ix][mu].d1 += ddummy[iy][mu].d1; + df[ix][mu].d2 += ddummy[iy][mu].d2; + df[ix][mu].d3 += ddummy[iy][mu].d3; + df[ix][mu].d4 += ddummy[iy][mu].d4; + df[ix][mu].d5 += ddummy[iy][mu].d5; + df[ix][mu].d6 += ddummy[iy][mu].d6; + df[ix][mu].d7 += ddummy[iy][mu].d7; + df[ix][mu].d8 += ddummy[iy][mu].d8; + } + return; +} + +/* this if statement will be removed in future and _INDEX_INDEP_GEOM will be the default */ +#ifdef _INDEX_INDEP_GEOM +/* Index-independent version: for every parallelised direction the boundary part of df is sent to the lower neighbour, the data from the upper neighbour is received into ddummy and then added onto the last local slice (T-1, LX-1, LY-1 or LZ-1) of df. Does nothing when MPI is not defined. */ +void xchange_deri(su3adj ** const df) +{ +# ifdef MPI + int ix,mu, t, y, z, x; + MPI_Status status; + +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the left in time direction */ + /* receive the data from the neighbour on the right in time direction */ + MPI_Sendrecv(&df[gI_m1_0_0_0][0].d1, 1, deri_time_slice_cont, g_nb_t_dn, 43, + &ddummy[gI_Lm1_0_0_0][0].d1, 1, 
deri_time_slice_cont, g_nb_t_up, 43, + g_cart_grid, &status); + + /* add ddummy to df */ + for(x = 0; x < LX; x++) { + for(y = 0; y < LY; y++) { + for(z = 0; z < LZ; z++) { + ix = g_ipt[T-1][x][y][z]; + for(mu=0;mu<4;mu++){ + df[ix][mu].d1 += ddummy[ix][mu].d1; + df[ix][mu].d2 += ddummy[ix][mu].d2; + df[ix][mu].d3 += ddummy[ix][mu].d3; + df[ix][mu].d4 += ddummy[ix][mu].d4; + df[ix][mu].d5 += ddummy[ix][mu].d5; + df[ix][mu].d6 += ddummy[ix][mu].d6; + df[ix][mu].d7 += ddummy[ix][mu].d7; + df[ix][mu].d8 += ddummy[ix][mu].d8; + } + } + } + } + + /* send the data to the neighbour on the right is needed for the */ + /* clover case, so this needs fixing here! */ +# endif /* (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) */ +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + MPI_Sendrecv(&df[gI_0_m1_0_0][0], 1, deri_x_slice_cont, g_nb_x_dn, 44, + &ddummy[gI_0_Lm1_0_0][0], 1, deri_x_slice_gath, g_nb_x_up, 44, + g_cart_grid, &status); + /* add ddummy to df */ + for(t = 0; t < T; t++) { + for(y = 0; y < LY; y++) { + for(z = 0; z < LZ; z++) { + ix = g_ipt[t][LX-1][y][z]; + for(mu=0;mu<4;mu++){ + df[ix][mu].d1 += ddummy[ix][mu].d1; + df[ix][mu].d2 += ddummy[ix][mu].d2; + df[ix][mu].d3 += ddummy[ix][mu].d3; + df[ix][mu].d4 += ddummy[ix][mu].d4; + df[ix][mu].d5 += ddummy[ix][mu].d5; + df[ix][mu].d6 += ddummy[ix][mu].d6; + df[ix][mu].d7 += ddummy[ix][mu].d7; + df[ix][mu].d8 += ddummy[ix][mu].d8; + } + } + } + } + /* send the data to the neighbour on the right is needed for the */ + /* clover case, so this needs fixing here! 
*/ +# endif /* (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) */ + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + MPI_Sendrecv((void*)df[gI_0_0_m1_0], + 1, deri_y_slice_cont, g_nb_y_dn, 45, + (void*)ddummy[gI_0_0_Lm1_0], + 1, deri_y_slice_gath, g_nb_y_up, 45, + g_cart_grid, &status); + /* add ddummy to df */ + for(t = 0; t < T; t++) { + for(x = 0; x < LX; x++) { + for(z = 0; z < LZ; z++) { + ix = g_ipt[t][x][LY-1][z]; + for(mu=0;mu<4;mu++){ + df[ix][mu].d1 += ddummy[ix][mu].d1; + df[ix][mu].d2 += ddummy[ix][mu].d2; + df[ix][mu].d3 += ddummy[ix][mu].d3; + df[ix][mu].d4 += ddummy[ix][mu].d4; + df[ix][mu].d5 += ddummy[ix][mu].d5; + df[ix][mu].d6 += ddummy[ix][mu].d6; + df[ix][mu].d7 += ddummy[ix][mu].d7; + df[ix][mu].d8 += ddummy[ix][mu].d8; + } + } + } + } + /* send the data to the neighbour on the right is needed for the */ + /* clover case, so this needs fixing here! 
*/ +# endif /* (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) */ + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + MPI_Sendrecv((void*)df[gI_0_0_0_m1], + 1, deri_z_slice_cont, g_nb_z_dn, 46, + (void*)ddummy[gI_0_0_0_Lm1], + 1, deri_z_slice_gath, g_nb_z_up, 46, + g_cart_grid, &status); + /* add ddummy to df */ + for(t = 0; t < T; t++) { + for(x = 0; x < LX; x++) { + for(y = 0; y < LY; y++) { + ix = g_ipt[t][x][y][LZ-1]; + for(mu=0;mu<4;mu++){ + df[ix][mu].d1 += ddummy[ix][mu].d1; + df[ix][mu].d2 += ddummy[ix][mu].d2; + df[ix][mu].d3 += ddummy[ix][mu].d3; + df[ix][mu].d4 += ddummy[ix][mu].d4; + df[ix][mu].d5 += ddummy[ix][mu].d5; + df[ix][mu].d6 += ddummy[ix][mu].d6; + df[ix][mu].d7 += ddummy[ix][mu].d7; + df[ix][mu].d8 += ddummy[ix][mu].d8; + } + } + } + } + /* send the data to the neighbour on the right is needed for the */ + /* clover case, so this needs fixing here! */ +# endif /* (defined PARALLELXYZT || defined PARALLELXYZ ) */ +# endif /* MPI */ + return; +} + +#else /* _INDEX_INDEP_GEOM */ +/* Fixed-index-layout version: first the edge contributions (xt, yx, ty, zx, tz, zy) and then the boundary slices are exchanged in both directions with blocking MPI_Sendrecv calls; after each call the data received into ddummy[0..] is added onto the matching sites of df with addup_ddummy. The receive buffer ddummy is reused by every step, so the order of the calls matters. Does nothing when MPI is not defined. 
*/ +void xchange_deri(su3adj ** const df) +{ +# ifdef MPI + int ix,iy, t, y, z, x; + MPI_Status status; +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* The edges need to come first */ + + /* send the data to the neighbour on the left in t direction */ + /* receive the data from the neighbour on the right in t direction */ + /* is on the x-boundary: xt-edge */ + MPI_Sendrecv((void*)df[VOLUME + RAND + 2*LY*LZ], 1, deri_xt_edge_cont, g_nb_t_dn, 492, + (void*)ddummy[0], 1, deri_xt_edge_cont, g_nb_t_up, 492, + g_cart_grid, &status); + + /* add ddummy to df */ + for(y = 0; y < LY; y++) { + for(z = 0; z < LZ; z++) { + ix = g_iup[ g_ipt[T-1][LX-1][y][z] ][1]; + iy = y*LZ + z; + addup_ddummy(df, ix, iy); + + ix = g_idn[ g_ipt[T-1][0][y][z] ][1]; + iy = LY*LZ + y*LZ + z; + addup_ddummy(df, ix, iy); + } + } + + /* send the data to the neighbour on the right in t direction */ + /* receive the data from the neighbour on the left in t direction */ + /* xt-edge */ + MPI_Sendrecv((void*)df[VOLUME + RAND], 1, deri_xt_edge_cont, g_nb_t_up, 493, + (void*)ddummy[0], 1, deri_xt_edge_cont, g_nb_t_dn, 493, + g_cart_grid, &status); + + /* add ddummy to df */ + for(y = 0; y < LY; y++) { + for(z = 0; z < LZ; z++) { + ix = g_iup[ g_ipt[0][LX-1][y][z] ][1]; + iy = y*LZ + z; + addup_ddummy(df, ix, iy); + + ix = g_idn[ g_ipt[0][0][y][z] ][1]; + iy = LY*LZ + y*LZ + z; + addup_ddummy(df, ix, iy); + } + } + +# endif /* (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) */ + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* edges */ + + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* is on the y-boundary -> yx-edge*/ + MPI_Sendrecv((void*)df[VOLUME + RAND + 4*LZ*LY + 2*T*LZ], 1, deri_yx_edge_cont, g_nb_x_dn, 494, + 
(void*)ddummy[0], 1, deri_yx_edge_cont, g_nb_x_up, 494, + g_cart_grid, &status); + + /* add ddummy to df */ + for(t = 0; t < T; t++) { + for(z = 0; z < LZ; z++) { + ix = g_iup[ g_ipt[t][LX-1][LY-1][z] ][2]; + iy = t*LZ + z; + addup_ddummy(df, ix, iy); + + ix = g_idn[ g_ipt[t][LX-1][0][z] ][2]; + iy = T*LZ + t*LZ + z; + addup_ddummy(df, ix, iy); + } + } + + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* yx-edge */ + MPI_Sendrecv((void*)df[VOLUME + RAND + 4*LZ*LY], 1, deri_yx_edge_cont, g_nb_x_up, 495, + (void*)ddummy[0], 1, deri_yx_edge_cont, g_nb_x_dn, 495, + g_cart_grid, &status); + + /* add ddummy to df */ + for(t = 0; t < T; t++) { + for(z = 0; z < LZ; z++) { + ix = g_iup[ g_ipt[t][0][LY-1][z] ][2]; + iy = t*LZ + z; + addup_ddummy(df, ix, iy); + + ix = g_idn[ g_ipt[t][0][0][z] ][2]; + iy = T*LZ + t*LZ + z; + addup_ddummy(df, ix, iy); + } + } + + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* is on the t-boundary -> ty-edge*/ + MPI_Sendrecv((void*)df[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 2*LX*LZ], 1, deri_ty_edge_cont, g_nb_y_dn, 496, + (void*)ddummy[0], 1, deri_ty_edge_cont, g_nb_y_up, 496, + g_cart_grid, &status); + + /* add ddummy to df */ + for(x = 0; x < LX; x++) { + for(z = 0; z < LZ; z++) { + ix = g_iup[ g_ipt[T-1][x][LY-1][z] ][0]; + iy = x*LZ + z; + addup_ddummy(df, ix, iy); + + ix = g_idn[ g_ipt[0][x][LY-1][z] ][0]; + iy = LX*LZ + x*LZ + z; + addup_ddummy(df, ix, iy); + } + } + + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + /* ty-edge */ + MPI_Sendrecv((void*)df[VOLUME + RAND + 4*LY*LZ + 4*T*LZ], 1, deri_ty_edge_cont, g_nb_y_up, 497, + (void*)ddummy[0], 1, deri_ty_edge_cont, g_nb_y_dn, 497, + g_cart_grid, &status); + + /* add ddummy to df */ + for(x = 0; x < LX; x++) { + for(z = 
0; z < LZ; z++) { + ix = g_iup[ g_ipt[T-1][x][0][z] ][0]; + iy = x*LZ + z; + addup_ddummy(df, ix, iy); + + ix = g_idn[ g_ipt[0][x][0][z] ][0]; + iy = LX*LZ + x*LZ + z; + addup_ddummy(df, ix, iy); + } + } + +# endif /* (defined PARALLELXYT || defined PARALLELXYZT) */ + +# ifdef PARALLELXYZT + + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + /* xz-edge */ + MPI_Sendrecv((void*)df[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 2*T*LY], + 1, deri_zx_edge_cont, g_nb_x_dn, 498, + (void*)ddummy[0], + 1, deri_zx_edge_cont, g_nb_x_up, 498, + g_cart_grid, &status); + + /* add ddummy to df */ + for(t = 0; t < T; t++) { + for(y = 0; y < LY; y++) { + ix = g_iup[ g_ipt[t][LX-1][y][LZ-1] ][3]; + iy = t*LY + y; + addup_ddummy(df, ix, iy); + + ix = g_idn[ g_ipt[t][LX-1][y][0] ][3]; + iy = T*LY + t*LY + y; + addup_ddummy(df, ix, iy); + } + } + + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + /* xz-edge */ + MPI_Sendrecv((void*)df[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ], + 1, deri_zx_edge_cont, g_nb_x_up, 499, + (void*)ddummy[0], + 1, deri_zx_edge_cont, g_nb_x_dn, 499, + g_cart_grid, &status); + + /* add ddummy to df */ + for(t = 0; t < T; t++) { + for(y = 0; y < LY; y++) { + ix = g_iup[ g_ipt[t][0][y][LZ-1] ][3]; + iy = t*LY + y; + addup_ddummy(df, ix, iy); + + ix = g_idn[ g_ipt[t][0][y][0] ][3]; + iy = T*LY + t*LY + y; + addup_ddummy(df, ix, iy); + } + } + + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + /* tz-edge */ + MPI_Sendrecv((void*)df[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 2*LX*LY], + 1, deri_tz_edge_cont, g_nb_z_dn, 500, + (void*)ddummy[0], + 1, deri_tz_edge_cont, g_nb_z_up, 500, + g_cart_grid, &status); + + /* add ddummy to df */ + for(x = 0; x < LX; x++) { + for(y = 0; y < LY; y++) { 
+ ix = g_iup[ g_ipt[T-1][x][y][LZ-1] ][0]; + iy = x*LY + y; + addup_ddummy(df, ix, iy); + + ix = g_idn[ g_ipt[0][x][y][LZ-1] ][0]; + iy = LX*LY + x*LY + y; + addup_ddummy(df, ix, iy); + } + } + + + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + /* tz-edge */ + MPI_Sendrecv((void*)df[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY], + 1, deri_tz_edge_cont, g_nb_z_up, 501, + (void*)ddummy[0], + 1, deri_tz_edge_cont, g_nb_z_dn, 501, + g_cart_grid, &status); + + /* add ddummy to df */ + for(x = 0; x < LX; x++) { + for(y = 0; y < LY; y++) { + ix = g_iup[ g_ipt[T-1][x][y][0] ][0]; + iy = x*LY + y; + addup_ddummy(df, ix, iy); + + ix = g_idn[ g_ipt[0][x][y][0] ][0]; + iy = LX*LY + x*LY + y; + addup_ddummy(df, ix, iy); + } + } + + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + /* zy-edge */ + MPI_Sendrecv((void*)df[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY + 2*T*LX], + 1, deri_zy_edge_cont, g_nb_y_dn, 502, + (void*)ddummy[0], + 1, deri_zy_edge_cont, g_nb_y_up, 502, + g_cart_grid, &status); + + /* add ddummy to df */ + for(t = 0; t < T; t++) { + for(x = 0; x < LX; x++) { + ix = g_iup[ g_ipt[t][x][LY-1][LZ-1] ][3]; + iy = t*LX + x; + addup_ddummy(df, ix, iy); + + ix = g_idn[ g_ipt[t][x][LY-1][0] ][3]; + iy = T*LX + t*LX + x; + addup_ddummy(df, ix, iy); + } + } + + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + /* zy-edge */ + MPI_Sendrecv((void*)df[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY], + 1, deri_zy_edge_cont, g_nb_y_up, 503, + (void*)ddummy[0], + 1, deri_zy_edge_cont, g_nb_y_dn, 503, + g_cart_grid, &status); + + /* add ddummy to df */ + for(t = 0; t < T; t++) { + for(x = 0; x < LX; x++) { + ix = g_iup[ g_ipt[t][x][0][LZ-1] ][3]; + iy = t*LX + x; + 
addup_ddummy(df, ix, iy); + + ix = g_idn[ g_ipt[t][x][0][0] ][3]; + iy = T*LX + t*LX + x; + addup_ddummy(df, ix, iy); + } + } + +# endif /* PARALLELXYZT */ + + // now the normal boundaries + + /* send the data to the neighbour on the left in time direction */ + /* receive the data from the neighbour on the right in time direction */ + MPI_Sendrecv((void*)df[(T+1)*LX*LY*LZ], 1, deri_time_slice_cont, g_nb_t_dn, 40, + (void*)ddummy[0], 1, deri_time_slice_cont, g_nb_t_up, 40, + g_cart_grid, &status); + + /* add ddummy to df */ + for(x = 0; x < LX; x++) { + for(y = 0; y < LY; y++) { + for(z = 0; z < LZ; z++) { + ix = g_ipt[T-1][x][y][z]; + iy = x*LY*LZ + y*LZ + z; + addup_ddummy(df, ix, iy); + } + } + } + + /* send the data to the neighbour on the right in time direction needed for clover */ + + MPI_Sendrecv((void*)df[T*LX*LY*LZ], 1, deri_time_slice_cont, g_nb_t_up, 41, + (void*)ddummy[0], 1, deri_time_slice_cont, g_nb_t_dn, 41, + g_cart_grid, &status); + + /* add ddummy to df */ + for(x = 0; x < LX; x++) { + for(y = 0; y < LY; y++) { + for(z = 0; z < LZ; z++) { + ix = g_ipt[0][x][y][z]; + iy = x*LY*LZ + y*LZ + z; + addup_ddummy(df, ix, iy); + } + } + } + + + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + MPI_Sendrecv((void*)df[(T+2)*LX*LY*LZ + T*LY*LZ], 1, deri_x_slice_cont, g_nb_x_dn, 42, + (void*)ddummy[0], 1, deri_x_slice_cont, g_nb_x_up, 42, + g_cart_grid, &status); + /* add ddummy to df */ + for(t = 0; t < T; t++) { + for(y = 0; y < LY; y++) { + for(z = 0; z < LZ; z++) { + ix = g_ipt[t][LX-1][y][z]; + iy = t*LY*LZ + y*LZ + z; + addup_ddummy(df, ix, iy); + } + } + } + + /* send the data to the neighbour on the right needed for clover */ + /* and receive from the one on the left */ + MPI_Sendrecv((void*)df[(T+2)*LX*LY*LZ], 1, deri_x_slice_cont, g_nb_x_up, 43, + (void*)ddummy[0], 1, deri_x_slice_cont, 
g_nb_x_dn, 43, + g_cart_grid, &status); + /* add ddummy to df */ + for(t = 0; t < T; t++) { + for(y = 0; y < LY; y++) { + for(z = 0; z < LZ; z++) { + ix = g_ipt[t][0][y][z]; + iy = t*LY*LZ + y*LZ + z; + addup_ddummy(df, ix, iy); + } + } + } + +# endif /* (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) */ + + +# if (defined PARALLELXYT || defined PARALLELXYZT) + + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + MPI_Sendrecv((void*)df[VOLUME + 2*LZ*(LX*LY + T*LY) + T*LX*LZ], + 1, deri_y_slice_cont, g_nb_y_dn, 44, + (void*)ddummy[0], + 1, deri_y_slice_cont, g_nb_y_up, 44, + g_cart_grid, &status); + /* add ddummy to df */ + for(t = 0; t < T; t++) { + for(x = 0; x < LX; x++) { + for(z = 0; z < LZ; z++) { + ix = g_ipt[t][x][LY-1][z]; + iy = t*LX*LZ + x*LZ + z; + addup_ddummy(df, ix, iy); + } + } + } + /* send the data to the neighbour on the right needed for clover*/ + + MPI_Sendrecv((void*)df[VOLUME + 2*LZ*(LX*LY + T*LY)], + 1, deri_y_slice_cont, g_nb_y_up, 45, + (void*)ddummy[0], + 1, deri_y_slice_cont, g_nb_y_dn, 45, + g_cart_grid, &status); + /* add ddummy to df */ + for(t = 0; t < T; t++) { + for(x = 0; x < LX; x++) { + for(z = 0; z < LZ; z++) { + ix = g_ipt[t][x][0][z]; + iy = t*LX*LZ + x*LZ + z; + addup_ddummy(df, ix, iy); + } + } + } + + +# endif /* (defined PARALLELXYT || defined PARALLELXYZT) */ + +# ifdef PARALLELXYZT + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + MPI_Sendrecv((void*)df[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY], + 1, deri_z_slice_cont, g_nb_z_dn, 46, + (void*)ddummy[0], + 1, deri_z_slice_cont, g_nb_z_up, 46, + g_cart_grid, &status); + /* add ddummy to df */ + for(t = 0; t < T; t++) { + for(x = 0; x < LX; x++) { + for(y = 0; y < LY; y++) { + ix = g_ipt[t][x][y][LZ-1]; + iy = t*LX*LY + x*LY + y; + addup_ddummy(df, ix, iy); + } + } + 
} + /* send the data to the neighbour on the right needed for clover */ + + MPI_Sendrecv((void*)df[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ], + 1, deri_z_slice_cont, g_nb_z_up, 47, + (void*)ddummy[0], + 1, deri_z_slice_cont, g_nb_z_dn, 47, + g_cart_grid, &status); + /* add ddummy to df */ + for(t = 0; t < T; t++) { + for(x = 0; x < LX; x++) { + for(y = 0; y < LY; y++) { + ix = g_ipt[t][x][y][0]; + iy = t*LX*LY + x*LY + y; + addup_ddummy(df, ix, iy); + } + } + } + +# endif /* PARALLELXYZT */ +# endif /* MPI */ + return; +} + +#endif /* _INDEX_INDEP_GEOM */ + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_deri.h b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_deri.h new file mode 100644 index 0000000000000000000000000000000000000000..61c392e74dccc3bf34531c625a9be2fd6e33adde --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_deri.h @@ -0,0 +1,35 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +/********************************************************** + * + * exchange routines for derivative fields + * + * Author: Carsten Urbach + * + **********************************************************/ + + +#ifndef _XCHANGE_DERI_H +#define _XCHANGE_DERI_H + +#include "su3adj.h" +void xchange_deri(su3adj ** const df); /* exchanges the boundary contributions of the derivative field df with the neighbouring processes and accumulates the received data onto df; a no-op when MPI is not defined */ + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_field.c b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_field.c new file mode 100644 index 0000000000000000000000000000000000000000..7138d30e7270470d995c45f9b430fd49d0aef2a8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_field.c @@ -0,0 +1,795 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +/********************************************************** + * + * exchange routines for spinor fields + * + * Author: Carsten Urbach + * + **********************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef _USE_SHMEM +# include +#endif + +#include "global.h" +#if (defined XLC && defined BGL) +# include "bgl.h" +#endif +#include "mpi_init.h" +#include "su3.h" +#include "xchange_field.h" + +#if (defined XLC && defined PARALLELXYZT) +#pragma disjoint(*field_buffer_z2, *field_buffer_z) +#endif + +/* this version uses non-blocking MPI calls */ +#if (defined _NON_BLOCKING) + +/* this is the version independent of the content of the function Index */ +/* this if statement will be removed in future and _INDEX_INDEP_GEOM will be the default */ +# ifdef _INDEX_INDEP_GEOM + +void xchange_field(spinor * const l, const int ieo) { + +#ifdef MPI + MPI_Request requests[16]; + MPI_Status status[16]; +#endif + int ireq; +# if ( defined PARALLELT || defined PARALLELX ) + int reqcount = 4; +# elif ( defined PARALLELXT || defined PARALLELXY ) + int reqcount = 8; +# elif ( defined PARALLELXYT || defined PARALLELXYZ ) + int reqcount = 12; +# elif defined PARALLELXYZT + int ix=0; + int reqcount = 16; +# endif + +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchangefield) +#endif +# if (defined BGL && defined XLC) + __alignx(16, l); +# endif + +# ifdef MPI + + + /* In 4 dimensions there are two processors sharing the */ + /* communication bandwidth. 
So the first should start */ + /* in forward direction, the second in backward direction */ + /* This might only work if the third direction is */ + /* parallelised only on the node */ + if(g_proc_coords[3]%2 == 0) { + + ireq=0; + +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + MPI_Isend((void*)(l+g_1st_t_int_dn), 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[ireq]); + MPI_Irecv( (void*)(l+g_1st_t_ext_up), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Isend((void*)(l+g_1st_x_int_dn), 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+g_1st_x_ext_up), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Isend((void*)(l+g_1st_y_int_dn), 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+g_1st_y_ext_up), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* This is now depending on whether the field is even or odd */ + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + + + if(ieo == 1) { + 
MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_even_dn,g_nb_z_dn,503,g_cart_grid,&requests[ireq]); + MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[ireq+1]); + } else { + MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_odd_dn,g_nb_z_dn,503,g_cart_grid,&requests[ireq]); + MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[ireq+1]); + } + +# endif + + + ireq=2; + +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Isend((void*)(l+g_1st_t_int_up), 1, field_time_slice_cont, g_nb_t_up, 82, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+g_1st_t_ext_dn), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Isend((void*)(l+g_1st_x_int_up), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+g_1st_x_ext_dn), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(l+g_1st_y_int_up), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+g_1st_y_ext_dn), 1, field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if ( defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the 
neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + if(ieo == 1) { + MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_even_up,g_nb_z_up,504,g_cart_grid,&requests[ireq]); + MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[ireq+1]); + } else { + MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_odd_up,g_nb_z_up,504,g_cart_grid,&requests[ireq]); + MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[ireq+1]); + } +# endif + + } else { + ireq=0; + +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Isend((void*)(l+g_1st_t_int_up), 1, field_time_slice_cont, g_nb_t_up, 82, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+g_1st_t_ext_dn), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Isend((void*)(l+g_1st_x_int_up), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+g_1st_x_ext_dn), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(l+g_1st_y_int_up), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+g_1st_y_ext_dn), 1, field_y_slice_cont, g_nb_y_dn, 102, 
g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* This is now depending on whether the field is even or odd */ + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + if(ieo == 1) { + MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_even_dn,g_nb_z_dn,503,g_cart_grid,&requests[ireq]); + MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[ireq+1]); + } else { + MPI_Isend((void*)(l+g_1st_z_int_dn),1,field_z_slice_odd_dn,g_nb_z_dn,503,g_cart_grid,&requests[ireq]); + MPI_Irecv((void*)(l+g_1st_z_ext_up),1,field_z_slice_cont,g_nb_z_up,503,g_cart_grid,&requests[ireq+1]); + } +# endif + + ireq=2; + +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + MPI_Isend((void*)(l+g_1st_t_int_dn), 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+g_1st_t_ext_up), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Isend((void*)(l+g_1st_x_int_dn), 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+g_1st_x_ext_up), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y 
direction */ + MPI_Isend((void*)(l+g_1st_y_int_dn), 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+g_1st_y_ext_up), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if ( defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + if(ieo == 1) { + MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_even_up,g_nb_z_up,504,g_cart_grid,&requests[ireq]); + MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[ireq+1]); + } else { + MPI_Isend((void*)(l+g_1st_z_int_up),1,field_z_slice_odd_up,g_nb_z_up,504,g_cart_grid,&requests[ireq]); + MPI_Irecv((void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont,g_nb_z_dn,504,g_cart_grid,&requests[ireq+1]); + } +# endif + + } + MPI_Waitall(reqcount, requests, status); + + +# endif /* MPI */ + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchangefield) +#endif +} + + +# else /* _INDEX_INDEP_GEOM */ + +void xchange_field(spinor * const l, const int ieo) { + +#ifdef MPI + MPI_Request requests[16]; + MPI_Status status[16]; +#endif +# ifdef PARALLELT + int reqcount = 4; +# elif defined PARALLELXT + int reqcount = 8; +# elif defined PARALLELXYT + int reqcount = 12; +# elif defined PARALLELXYZT + int ix=0; + int reqcount = 16; +# endif + +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchangefield) +#endif +# if (defined BGL && defined XLC) +# ifdef PARALLELXYZT + __alignx(16, field_buffer_z); + __alignx(16, field_buffer_z2); +# endif + __alignx(16, l); +# endif + +# ifdef MPI + + + /* In 4 dimensions there are two processors sharing the */ + /* communication bandwidth. 
So the first should start */ + /* in forward direction, the second in backward direction */ + /* This might only work if the third direction is */ + /* parallelised only on the node */ + if(g_proc_coords[3]%2 == 0) { + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + MPI_Isend((void*)l, 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[0]); + MPI_Irecv((void*)(l+T*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[1]); +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Isend((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[4]); + MPI_Irecv((void*)(l+(T+2)*LX*LY*LZ/2), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[5]); +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Isend((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[8]); + MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[9]); +# endif + +# if (defined PARALLELXYZT) + /* fill buffer ! 
*/ + /* This is now depending on whether the field is */ + /* even or odd */ + if(ieo == 1) { + for(ix = 0; ix < T*LX*LY/2; ix++) { + field_buffer_z[ix] = l[ g_field_z_ipt_even[ix] ]; + } + } + else { + for(ix = 0; ix < T*LX*LY/2; ix++) { + field_buffer_z[ix] = l[ g_field_z_ipt_odd[ix] ]; + } + } + + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Isend((void*)field_buffer_z, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 503, g_cart_grid, &requests[12]); + MPI_Irecv((void*)(l+(VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ)), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 503, g_cart_grid, &requests[13]); + +# endif + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Isend((void*)(l+(T-1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 82, g_cart_grid, &requests[2]); + MPI_Irecv((void*)(l+(T+1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[3]); + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Isend((void*)(l+(LX-1)*LY*LZ/2), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[6]); + MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)/2), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[7]); +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(l+(LY-1)*LZ/2), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[10]); + MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2), 1, field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[11]); +# endif + +# if defined PARALLELXYZT + if(ieo == 1) { + for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) { + 
field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_even[ix] ]; + } + } + else { + for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) { + field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_odd[ix] ]; + } + } + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + MPI_Isend((void*)field_buffer_z2, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 504, g_cart_grid, &requests[14]); + MPI_Irecv((void*)(l+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)/2), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 504, g_cart_grid, &requests[15]); +# endif + + } + else { + /* send the data to the neighbour on the right */ + /* receive the data from the neighbour on the left */ + MPI_Isend((void*)(l+(T-1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 82, g_cart_grid, &requests[0]); + MPI_Irecv((void*)(l+(T+1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_dn, 82, g_cart_grid, &requests[1]); +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + MPI_Isend((void*)(l+(LX-1)*LY*LZ/2), 1, field_x_slice_gath, g_nb_x_up, 92, g_cart_grid, &requests[4]); + MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)/2), 1, field_x_slice_cont, g_nb_x_dn, 92, g_cart_grid, &requests[5]); +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(l+(LY-1)*LZ/2), 1, field_y_slice_gath, g_nb_y_up, 102, g_cart_grid, &requests[8]); + MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2), 1, field_y_slice_cont, g_nb_y_dn, 102, g_cart_grid, &requests[9]); +# endif + +# if (defined PARALLELXYZT) + /* fill buffer ! 
*/ + /* This is now depending on whether the field is */ + /* even or odd */ + if(ieo == 1) { + for(ix = 0; ix < T*LX*LY/2; ix++) { + field_buffer_z[ix] = l[ g_field_z_ipt_even[ix] ]; + } + } + else { + for(ix = 0; ix < T*LX*LY/2; ix++) { + field_buffer_z[ix] = l[ g_field_z_ipt_odd[ix] ]; + } + } + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Isend((void*)field_buffer_z, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 503, g_cart_grid, &requests[12]); + MPI_Irecv((void*)(l+(VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ)), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 503, g_cart_grid, &requests[13]); +# endif + + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + MPI_Isend((void*)l, 1, field_time_slice_cont, g_nb_t_dn, 81, g_cart_grid, &requests[2]); + MPI_Irecv((void*)(l+T*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 81, g_cart_grid, &requests[3]); +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Isend((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91, g_cart_grid, &requests[6]); + MPI_Irecv((void*)(l+(T+2)*LX*LY*LZ/2), 1, field_x_slice_cont, g_nb_x_up, 91, g_cart_grid, &requests[7]); +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Isend((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101, g_cart_grid, &requests[10]); + MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2), 1, field_y_slice_cont, g_nb_y_up, 101, g_cart_grid, &requests[11]); +# endif + +# if defined PARALLELXYZT + if(ieo == 1) { + for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) { + field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_even[ix] ]; + } + } + else { + for(ix 
= T*LX*LY/2; ix < T*LX*LY; ix++) { + field_buffer_z2[ix-T*LX*LY/2] = l[ g_field_z_ipt_odd[ix] ]; + } + } + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + MPI_Isend((void*)field_buffer_z2, 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 504, g_cart_grid, &requests[14]); + MPI_Irecv((void*)(l+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)/2), 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 504, g_cart_grid, &requests[15]); +# endif + + } + MPI_Waitall(reqcount, requests, status); + +# endif + + + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchangefield) +#endif +} + +# endif /* _INDEX_INDEP_GEOM */ + +#elif (defined _USE_SHMEM) /* _NON_BLOCKING */ + +/* Here comes the version with shared memory */ +/* exchanges the border slices of the field l with the neighbouring PEs using one-sided shmem_double_put calls issued between two shmem_barrier_all() calls; ieo == 1 selects the even z index table (g_field_z_ipt_even), any other value the odd one (g_field_z_ipt_odd) */ +void xchange_field(spinor * const l, const int ieo) { + +# ifdef MPI + int i,ix, mu, x0, x1, x2, x3, k; + +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchangefield) +#endif + + shmem_barrier_all(); + + shmem_double_put((double*)(l+T*LX*LY*LZ/2), (double*)l, + (LX*LY*LZ*12), g_nb_t_dn); + shmem_double_put((double*)(l+(T+1)*LX*LY*LZ/2), (double*)(l+(T-1)*LX*LY*LZ/2), + (LX*LY*LZ*12), g_nb_t_up); + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + k = (T+2)*LX*LY*LZ/2; + for(x0 = 0; x0 < T; x0++) { + shmem_double_put((double*)(l + k), + (double*)(l + g_lexic2eo[g_ipt[x0][0][0][0]]), + 12*LZ*LY, g_nb_x_dn); + k+=LZ*LY; /* NOTE(review): each put above moves 12*LZ*LY doubles, yet k (an offset in spinors) advances by LZ*LY, while the x-up region below starts only T*LY*LZ/2 spinors after this one -- confirm the stride */ + } + k = ((T+2)*LX*LY*LZ + T*LY*LZ)/2; + for(x0 = 0; x0 < T; x0++) { + shmem_double_put((double*)(l + k), + (double*)(l + g_lexic2eo[g_ipt[x0][LX-1][0][0]]), + 12*LZ*LY, g_nb_x_up); + k+=LZ*LY; + } +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + k = ((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2; + for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + shmem_double_put((double*)(l + k), + (double*)(l + g_lexic2eo[g_ipt[x0][x1][0][0]]), + 12*LZ, g_nb_y_dn); + k+=LZ; + } + } + k = ((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2; 
+ for(x0 = 0; x0 < T; x0++) { + for(x1 = 0; x1 < LX; x1++) { + shmem_double_put((double*)(l + k), + (double*)(l + g_lexic2eo[g_ipt[x0][x1][LY-1][0]]), + 12*LZ, g_nb_y_up); + k+=LZ; + } + } +# endif + +# if (defined PARALLELXYZT) + x0 = (VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ); + if(ieo == 1) { + for(k = 0; k < T*LX*LY/2; k++) { + shmem_double_put((double*)(l + x0), + (double*)(l + g_field_z_ipt_even[k]), + 24, g_nb_z_dn); + x0++; + } + } + else { + for(k = 0; k < T*LX*LY/2; k++) { + shmem_double_put((double*)(l + x0), + (double*)(l + g_field_z_ipt_odd[k]), + 24, g_nb_z_dn); + x0++; + } + } + x0 = (VOLUME/2 + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2); + if(ieo == 1) { + for(k = T*LX*LY/2; k < T*LX*LY; k++) { + shmem_double_put((double*)(l + x0), + (double*)(l + g_field_z_ipt_even[k]), + 24, g_nb_z_up); + x0++; + } + } + else { + for(k = T*LX*LY/2; k < T*LX*LY; k++) { + shmem_double_put((double*)(l + x0), + (double*)(l + g_field_z_ipt_odd[k]), /* odd field: use the odd index table, as the z-down branch above does */ + 24, g_nb_z_up); + x0++; + } + } +# endif + + shmem_barrier_all(); +# endif // MPI + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchangefield) +#endif +} + + +/* Here comes the naive version */ +/* Using MPI_Sendrecv */ +#else /* _NON_BLOCKING _USE_SHMEM */ + + +/* this is the version independent of the content of the function Index */ +# ifdef _INDEX_INDEP_GEOM + +/* exchanges the field l */ +void xchange_field(spinor * const l, const int ieo) { + +# ifdef PARALLELXYZT + int x0=0, x1=0, x2=0, ix=0; +# endif +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchangefield) +#endif + +# ifdef MPI + + MPI_Status status; + +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the left */ + /* receive the data from the neighbour on the right */ + MPI_Sendrecv((void*)(l+g_1st_t_int_dn), 1, field_time_slice_cont, g_nb_t_dn, 81, + (void*)(l+g_1st_t_ext_up), 1, field_time_slice_cont, g_nb_t_up, 81, + g_cart_grid, &status); + + /* send the data to the 
neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Sendrecv((void*)(l+g_1st_t_int_up), 1, field_time_slice_cont, g_nb_t_up, 82, + (void*)(l+g_1st_t_ext_dn), 1, field_time_slice_cont, g_nb_t_dn, 82, + g_cart_grid, &status); +# endif + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Sendrecv((void*)(l+g_1st_x_int_dn), 1, field_x_slice_gath, g_nb_x_dn, 91, + (void*)(l+g_1st_x_ext_up), 1, field_x_slice_cont, g_nb_x_up, 91, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Sendrecv((void*)(l+g_1st_x_int_up), 1, field_x_slice_gath, g_nb_x_up, 92, + (void*)(l+g_1st_x_ext_dn), 1, field_x_slice_cont, g_nb_x_dn, 92, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Sendrecv((void*)(l+g_1st_y_int_dn), 1, field_y_slice_gath, g_nb_y_dn, 101, + (void*)(l+g_1st_y_ext_up), 1, field_y_slice_cont, g_nb_y_up, 101, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Sendrecv((void*)(l+g_1st_y_int_up), 1, field_y_slice_gath, g_nb_y_up, 102, + (void*)(l+g_1st_y_ext_dn), 1, field_y_slice_cont, g_nb_y_dn, 102, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + if(ieo == 1){ + 
MPI_Sendrecv((void*)(l+g_1st_z_int_dn),1,field_z_slice_even_dn, g_nb_z_dn, 503, + (void*)(l+g_1st_z_ext_up),1,field_z_slice_cont, g_nb_z_up, 503, + g_cart_grid, &status); + } else { + MPI_Sendrecv((void*)(l+g_1st_z_int_dn),1,field_z_slice_odd_dn, g_nb_z_dn, 503, + (void*)(l+g_1st_z_ext_up),1,field_z_slice_cont, g_nb_z_up, 503, + g_cart_grid, &status); + } + + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + if(ieo == 1){ + MPI_Sendrecv((void*)(l+g_1st_z_int_up),1,field_z_slice_even_up, g_nb_z_up, 504, + (void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont, g_nb_z_dn, 504, + g_cart_grid, &status); + } else { + MPI_Sendrecv((void*)(l+g_1st_z_int_up),1,field_z_slice_odd_up, g_nb_z_up, 504, + (void*)(l+g_1st_z_ext_dn),1,field_z_slice_cont, g_nb_z_dn, 504, + g_cart_grid, &status); + } + +# endif +# endif // MPI + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchangefield) +#endif +} + + +# else /* _INDEX_INDEP_GEOM */ + +/* exchanges the field l */ +void xchange_field(spinor * const l, const int ieo) { + +# ifdef PARALLELXYZT + int x0=0, x1=0, x2=0, ix=0; +# endif +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchangefield) +#endif + +# ifdef MPI + + MPI_Status status; + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + MPI_Sendrecv((void*)l, 1, field_time_slice_cont, g_nb_t_dn, 81, + (void*)(l+T*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 81, + g_cart_grid, &status); + + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Sendrecv((void*)(l+(T-1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_up, 82, + (void*)(l+(T+1)*LX*LY*LZ/2), 1, field_time_slice_cont, g_nb_t_dn, 82, + g_cart_grid, &status); + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from 
the neighbour on the right in x direction */ + MPI_Sendrecv((void*)l, 1, field_x_slice_gath, g_nb_x_dn, 91, + (void*)(l+(T+2)*LX*LY*LZ/2), 1, field_x_slice_cont, g_nb_x_up, 91, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Sendrecv((void*)(l+(LX-1)*LY*LZ/2), 1, field_x_slice_gath, g_nb_x_up, 92, + (void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)/2), 1, field_x_slice_cont, g_nb_x_dn, 92, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Sendrecv((void*)l, 1, field_y_slice_gath, g_nb_y_dn, 101, + (void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ)/2), 1, field_y_slice_cont, g_nb_y_up, 101, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Sendrecv((void*)(l+(LY-1)*LZ/2), 1, field_y_slice_gath, g_nb_y_up, 102, + (void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)/2), 1, field_y_slice_cont, g_nb_y_dn, 102, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYZT) + /* fill buffer ! 
*/ + /* This is now depending on whether the field is */ + /* even or odd */ + if(ieo == 1) { + for(ix = 0; ix < T*LX*LY/2; ix++) { + field_buffer_z[ix] = l[ g_field_z_ipt_even[ix] ]; + } + } + else { + for(ix = 0; ix < T*LX*LY/2; ix++) { + field_buffer_z[ix] = l[ g_field_z_ipt_odd[ix] ]; + } + } + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + MPI_Sendrecv((void*)field_buffer_z, + 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 503, + (void*)(l+(VOLUME/2 + LX*LY*LZ + T*LY*LZ +T*LX*LZ)), + 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 503, + g_cart_grid, &status); + + if(ieo == 1) { + for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) { + field_buffer_z[ix-T*LX*LY/2] = l[ g_field_z_ipt_even[ix] ]; + } + } + else { + for(ix = T*LX*LY/2; ix < T*LX*LY; ix++) { + field_buffer_z[ix-T*LX*LY/2] = l[ g_field_z_ipt_odd[ix] ]; + } + } + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + MPI_Sendrecv((void*)field_buffer_z, + 12*T*LX*LY, MPI_DOUBLE, g_nb_z_up, 504, + (void*)(l+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)/2), + 12*T*LX*LY, MPI_DOUBLE, g_nb_z_dn, 504, + g_cart_grid, &status); + +# endif +# endif // MPI + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchangefield) +#endif +} + + +# endif /* _INDEX_INDEP_GEOM */ + +#endif /* _NON_BLOCKING */ + + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_field.h b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_field.h new file mode 100644 index 0000000000000000000000000000000000000000..d9306bd7d279e42e7fe1454dc0f4e21a5accd9bd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_field.h @@ -0,0 +1,36 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. 
+ * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +/********************************************************** + * + * exchange routines for spinor fields + * + * Author: Carsten Urbach + * + **********************************************************/ + +#ifndef _XCHANGE_FIELD_H +#define _XCHANGE_FIELD_H + +#define EVEN 1 +#define ODD 0 + +void xchange_field(spinor * const l, const int ieo); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_field_tslice.c b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_field_tslice.c new file mode 100644 index 0000000000000000000000000000000000000000..83f7d4f92723aa302c145f070817a5bcfc211952 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_field_tslice.c @@ -0,0 +1,240 @@ +/********************************************************** + * + * exchange routines for the borders of a timeslice of spinor fields + * + * Author: Luigi Scorzato + * + **********************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#ifdef _USE_SHMEM +# include +#endif + +#include "global.h" +#if (defined XLC && defined BGL) +# include "bgl.h" +#endif +#include "mpi_init.h" +#include "su3.h" +#include "xchange_field_tslice.h" + +#ifdef MPI +# ifdef 
_USE_TSPLITPAR +void xchange_field_open(spinor * const l, const int ieo, const int x0, MPI_Request * requests, + MPI_Status * status) { + +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchangetslicefield) +#endif +# if (defined BGL && defined XLC) + __alignx(16, l); /* ?!? */ +# endif + +# ifdef MPI + +# if (defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Isend((void*)(l+g_1st_xt_int_dn[x0]), 1, field_xt_slice_int, g_nb_x_dn, 91, g_cart_grid, &requests[0]); + MPI_Irecv((void*)(l+g_1st_xt_ext_up[x0]), 1, field_xt_slice_ext, g_nb_x_up, 91, g_cart_grid, &requests[1]); +# endif + +# if (defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Isend((void*)(l+g_1st_yt_int_dn[x0]), 1, field_yt_slice_int, g_nb_y_dn, 101, g_cart_grid, &requests[4]); + MPI_Irecv((void*)(l+g_1st_yt_ext_up[x0]), 1, field_yt_slice_ext, g_nb_y_up, 101, g_cart_grid, &requests[5]); +# endif + +# if (defined PARALLELXYZ) + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + if(ieo == 1){ + if(x0 % 2 == 0) { + MPI_Isend((void*)(l+g_1st_zt_int_dn[x0]),1,field_zt_slice_even_dn_et,g_nb_z_dn,111,g_cart_grid,&requests[8]); + MPI_Irecv((void*)(l+g_1st_zt_ext_up[x0]),1 , field_zt_slice_ext_L, g_nb_z_up, 111, g_cart_grid, &requests[9]); + } else { + MPI_Isend((void*)(l+g_1st_zt_int_dn[x0]),1,field_zt_slice_even_dn_ot,g_nb_z_dn,111,g_cart_grid,&requests[8]); + MPI_Irecv((void*)(l+g_1st_zt_ext_up[x0]),1 , field_zt_slice_ext_S, g_nb_z_up, 111, g_cart_grid, &requests[9]); + } + } else { + if(x0 % 2 == 0) { + MPI_Isend((void*)(l+g_1st_zt_int_dn[x0]),1,field_zt_slice_odd_dn_et,g_nb_z_dn,111,g_cart_grid,&requests[8]); + 
MPI_Irecv((void*)(l+g_1st_zt_ext_up[x0]),1 , field_zt_slice_ext_S, g_nb_z_up, 111, g_cart_grid, &requests[9]); + } else { + MPI_Isend((void*)(l+g_1st_zt_int_dn[x0]),1,field_zt_slice_odd_dn_ot,g_nb_z_dn,111,g_cart_grid,&requests[8]); + MPI_Irecv((void*)(l+g_1st_zt_ext_up[x0]),1 , field_zt_slice_ext_L, g_nb_z_up, 111, g_cart_grid, &requests[9]); + } + } +# endif + +# if (defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Isend((void*)(l+g_1st_xt_int_up[x0]), 1, field_xt_slice_int, g_nb_x_up, 92, g_cart_grid, &requests[2]); + MPI_Irecv((void*)(l+g_1st_xt_ext_dn[x0]), 1, field_xt_slice_ext, g_nb_x_dn, 92, g_cart_grid, &requests[3]); +# endif + +# if (defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(l+g_1st_yt_int_up[x0]), 1, field_yt_slice_int, g_nb_y_up, 102, g_cart_grid, &requests[6]); + MPI_Irecv((void*)(l+g_1st_yt_ext_dn[x0]), 1, field_yt_slice_ext, g_nb_y_dn, 102, g_cart_grid, &requests[7]); +# endif + +# if (defined PARALLELXYZ) + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + if(ieo == 1){ + if(x0 % 2 == 0) { + MPI_Isend((void*)(l+g_1st_zt_int_up[x0]),1,field_zt_slice_even_up_et,g_nb_z_up,112,g_cart_grid,&requests[10]); + MPI_Irecv((void*)(l+g_1st_zt_ext_dn[x0]), 1, field_zt_slice_ext_S, g_nb_z_dn, 112, g_cart_grid, &requests[11]); + } else { + MPI_Isend((void*)(l+g_1st_zt_int_up[x0]),1,field_zt_slice_even_up_ot,g_nb_z_up,112,g_cart_grid,&requests[10]); + MPI_Irecv((void*)(l+g_1st_zt_ext_dn[x0]), 1, field_zt_slice_ext_L, g_nb_z_dn, 112, g_cart_grid, &requests[11]); + } + } else { + if(x0 % 2 == 0) { + 
MPI_Isend((void*)(l+g_1st_zt_int_up[x0]),1,field_zt_slice_odd_up_et,g_nb_z_up,112,g_cart_grid,&requests[10]); + MPI_Irecv((void*)(l+g_1st_zt_ext_dn[x0]), 1, field_zt_slice_ext_L, g_nb_z_dn, 112, g_cart_grid, &requests[11]); + } else { + MPI_Isend((void*)(l+g_1st_zt_int_up[x0]),1,field_zt_slice_odd_up_ot,g_nb_z_up,112,g_cart_grid,&requests[10]); + MPI_Irecv((void*)(l+g_1st_zt_ext_dn[x0]), 1, field_zt_slice_ext_S, g_nb_z_dn, 112, g_cart_grid, &requests[11]); + } + } +# endif + +# endif /* MPI */ + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchangetslicefield) +#endif +} + + +void xchange_field_close(MPI_Request * requests, MPI_Status * status, int reqcount) { + +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchangetslicefieldclose) +#endif + + MPI_Waitall(reqcount, requests, status); + +#ifdef _KOJAK_INST +#pragma pomp inst end(xchangetslicefieldclose) +#endif + +} + +void xchange_field_slice(spinor * const l, const int ieo, const int x0) { + +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchangetslicefield) +#endif +# if (defined BGL && defined XLC) + __alignx(16, l); /* ?!? 
*/ +# endif + +# ifdef MPI + + MPI_Status status; + +# if (defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Sendrecv((void*)(l+g_1st_xt_int_dn[x0]), 1, field_xt_slice_int, g_nb_x_dn, 91, + (void*)(l+g_1st_xt_ext_up[x0]), 1, field_xt_slice_ext, g_nb_x_up, 91, g_cart_grid, &status); +# endif + +# if (defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Sendrecv((void*)(l+g_1st_yt_int_dn[x0]), 1, field_yt_slice_int, g_nb_y_dn, 101, + (void*)(l+g_1st_yt_ext_up[x0]), 1, field_yt_slice_ext, g_nb_y_up, 101, g_cart_grid, &status); +# endif + +# if (defined PARALLELXYZ) + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + if(ieo == 1){ + if(x0 % 2 == 0) { + MPI_Sendrecv((void*)(l+g_1st_zt_int_dn[x0]),1,field_zt_slice_even_dn_et,g_nb_z_dn,111, + (void*)(l+g_1st_zt_ext_up[x0]),1 , field_zt_slice_ext_L, g_nb_z_up, 111, g_cart_grid, &status); + } else { + MPI_Sendrecv((void*)(l+g_1st_zt_int_dn[x0]),1,field_zt_slice_even_dn_ot,g_nb_z_dn,111, + (void*)(l+g_1st_zt_ext_up[x0]),1 , field_zt_slice_ext_S, g_nb_z_up, 111, g_cart_grid, &status); + } + } else { + if(x0 % 2 == 0) { + MPI_Sendrecv((void*)(l+g_1st_zt_int_dn[x0]),1,field_zt_slice_odd_dn_et,g_nb_z_dn,111, + (void*)(l+g_1st_zt_ext_up[x0]),1 , field_zt_slice_ext_S, g_nb_z_up, 111, g_cart_grid, &status); + } else { + MPI_Sendrecv((void*)(l+g_1st_zt_int_dn[x0]),1,field_zt_slice_odd_dn_ot,g_nb_z_dn,111, + (void*)(l+g_1st_zt_ext_up[x0]),1 , field_zt_slice_ext_L, g_nb_z_up, 111, g_cart_grid, &status); + } + } +# endif + +# if (defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in x direction */ + 
/* recieve the data from the neighbour on the left in x direction */ + MPI_Sendrecv((void*)(l+g_1st_xt_int_up[x0]), 1, field_xt_slice_int, g_nb_x_up, 92, + (void*)(l+g_1st_xt_ext_dn[x0]), 1, field_xt_slice_ext, g_nb_x_dn, 92, g_cart_grid, &status); +# endif + +# if (defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Sendrecv((void*)(l+g_1st_yt_int_up[x0]), 1, field_yt_slice_int, g_nb_y_up, 102, + (void*)(l+g_1st_yt_ext_dn[x0]), 1, field_yt_slice_ext, g_nb_y_dn, 102, g_cart_grid, &status); +# endif + +# if (defined PARALLELXYZ) + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + if(ieo == 1){ + if(x0 % 2 == 0) { + MPI_Sendrecv((void*)(l+g_1st_zt_int_up[x0]),1,field_zt_slice_even_up_et,g_nb_z_up,112, + (void*)(l+g_1st_zt_ext_dn[x0]), 1, field_zt_slice_ext_S, g_nb_z_dn, 112, g_cart_grid, &status); + } else { + MPI_Sendrecv((void*)(l+g_1st_zt_int_up[x0]),1,field_zt_slice_even_up_ot,g_nb_z_up,112, + (void*)(l+g_1st_zt_ext_dn[x0]), 1, field_zt_slice_ext_L, g_nb_z_dn, 112, g_cart_grid, &status); + } + } else { + if(x0 % 2 == 0) { + MPI_Sendrecv((void*)(l+g_1st_zt_int_up[x0]),1,field_zt_slice_odd_up_et,g_nb_z_up,112, + (void*)(l+g_1st_zt_ext_dn[x0]), 1, field_zt_slice_ext_L, g_nb_z_dn, 112, g_cart_grid, &status); + } else { + MPI_Sendrecv((void*)(l+g_1st_zt_int_up[x0]),1,field_zt_slice_odd_up_ot,g_nb_z_up,112, + (void*)(l+g_1st_zt_ext_dn[x0]), 1, field_zt_slice_ext_S, g_nb_z_dn, 112, g_cart_grid, &status); + } + } +# endif + +# endif /* MPI */ + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchangetslicefield) +#endif +} + +# endif // _USE_TSPLITPAR +#endif // MPI + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_field_tslice.h b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_field_tslice.h new 
file mode 100644 index 0000000000000000000000000000000000000000..724a45c9f7bbd3cb009356cf1ae5d301df6365f0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_field_tslice.h @@ -0,0 +1,21 @@ +/********************************************************** + * + * exchange routines for the borders of a timeslice of spinor fields + * + * Author: Luigi Scorzato + * + **********************************************************/ + +#ifndef _XCHANGE_FIELDTS_H +#define _XCHANGE_FIELDTS_H + +#define EVEN 1 +#define ODD 0 + +#ifdef MPI +void xchange_field_open(spinor * const , const int , const int , MPI_Request * , MPI_Status *); +void xchange_field_close(MPI_Request * , MPI_Status * , int ); +void xchange_field_slice(spinor * const , const int , const int ); +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_gauge.c b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_gauge.c new file mode 100644 index 0000000000000000000000000000000000000000..f80a0c9b592c81aea71e08cce114af4e77dd7fd4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_gauge.c @@ -0,0 +1,2093 @@ +/*********************************************************************** + * + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ * + * exchange routines for gauge fields + * + * Author: Carsten Urbach + * + **********************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#include "mpi_init.h" +#include "su3.h" +#include "su3adj.h" +#include "xchange_gauge.h" + +#if defined _NON_BLOCKING + +/* this if statement will be removed in future and _INDEX_INDEP_GEOM will be the default */ +# if defined _INDEX_INDEP_GEOM + +
/*
 * xchange_gauge  (non-blocking, index-independent geometry)
 *
 * Exchanges the boundary slices and edges of the gauge field gf with
 * the neighbouring processes.  The work is organised in four stages,
 * each consisting of a batch of MPI_Isend/MPI_Irecv pairs followed by
 * an MPI_Waitall; later stages forward data that arrived in earlier
 * ones, so the stage order must not be changed.
 *
 *   stage 1 : t and x boundary slices
 *   stage 2 : xt edges, y boundary slices
 *   stage 3 : yx and ty edges, z boundary slices
 *   stage 4 : zx, tz and zy edges
 *
 * Whenever g_dbw2rand > 0 the second boundary layer ("2-Rand") and the
 * corresponding edges are exchanged as well.  Which directions take
 * part is selected by the PARALLEL* macros.  Without MPI the routine
 * does nothing.
 */
void xchange_gauge(su3 ** const gf) {
  int cntr=0;
#  ifdef MPI
  MPI_Request request[105];
  MPI_Status status[105];

  /* Post one send/receive pair on consecutive request slots.
     (s_idx, s_type, s_nb) describe the send, (r_idx, r_type, r_nb) the
     receive; both use the same tag. */
#    define XCHANGE_GAUGE_PAIR(s_idx, s_type, s_nb, r_idx, r_type, r_nb, tag) \
  do { \
    MPI_Isend(gf[s_idx], 1, s_type, s_nb, tag, g_cart_grid, &request[cntr]); \
    MPI_Irecv(gf[r_idx], 1, r_type, r_nb, tag, g_cart_grid, &request[cntr+1]); \
    cntr=cntr+2; \
  } while(0)

  /* ---------------- stage 1: t and x boundaries ---------------- */

#    if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
  /* t-Rand: first slice goes down, last slice goes up */
  XCHANGE_GAUGE_PAIR(gI_0_0_0_0,   gauge_time_slice_cont, g_nb_t_dn, gI_L_0_0_0,   gauge_time_slice_cont, g_nb_t_up, 83);
  XCHANGE_GAUGE_PAIR(gI_Lm1_0_0_0, gauge_time_slice_cont, g_nb_t_up, gI_m1_0_0_0,  gauge_time_slice_cont, g_nb_t_dn, 84);

  if(g_dbw2rand > 0) {
    /* t2-Rand */
    XCHANGE_GAUGE_PAIR(gI_p1_0_0_0,  gauge_time_slice_cont, g_nb_t_dn, gI_Lp1_0_0_0, gauge_time_slice_cont, g_nb_t_up, 85);
    XCHANGE_GAUGE_PAIR(gI_Lm2_0_0_0, gauge_time_slice_cont, g_nb_t_up, gI_m2_0_0_0,  gauge_time_slice_cont, g_nb_t_dn, 86);
  }
#    endif
#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ )
  /* x-Rand */
  XCHANGE_GAUGE_PAIR(gI_0_0_0_0,   gauge_x_slice_gath, g_nb_x_dn, gI_0_L_0_0,   gauge_x_slice_cont, g_nb_x_up, 87);
  XCHANGE_GAUGE_PAIR(gI_0_Lm1_0_0, gauge_x_slice_gath, g_nb_x_up, gI_0_m1_0_0,  gauge_x_slice_cont, g_nb_x_dn, 88);

  if(g_dbw2rand > 0) {
    /* x2-Rand */
    XCHANGE_GAUGE_PAIR(gI_0_p1_0_0,  gauge_x_slice_gath, g_nb_x_dn, gI_0_Lp1_0_0, gauge_x_slice_cont, g_nb_x_up, 89);
    XCHANGE_GAUGE_PAIR(gI_0_Lm2_0_0, gauge_x_slice_gath, g_nb_x_up, gI_0_m2_0_0,  gauge_x_slice_cont, g_nb_x_dn, 90);
  }
#    endif

  MPI_Waitall(cntr, request, status);
  cntr=0;

  /* ---------------- stage 2: xt edges and y boundaries ---------------- */

  /* [ORD!] The xt (x2t and t2x) edges are obtained by sending, in t
     direction, the t-borders of the x-borders received in stage 1. */
  /* [DEP!] This part is not fully independent of the definition of
     Index(): gauge_xt_edge_gath joins the x=L and the x=-1 parts, so
     x=L has to come before x=-1 there, and the starting point is
     gI_0_L_0_0. */

#    if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT )
  /* xt-edge (data sits on the x-Rand), exchanged in t direction */
  XCHANGE_GAUGE_PAIR(gI_0_L_0_0,   gauge_xt_edge_gath, g_nb_t_dn, gI_L_L_0_0,   gauge_xt_edge_cont, g_nb_t_up, 100);
  XCHANGE_GAUGE_PAIR(gI_Lm1_L_0_0, gauge_xt_edge_gath, g_nb_t_up, gI_m1_L_0_0,  gauge_xt_edge_cont, g_nb_t_dn, 101);

  if(g_dbw2rand > 0) {
    /* t2x-edge */
    XCHANGE_GAUGE_PAIR(gI_p1_L_0_0,  gauge_xt_edge_gath, g_nb_t_dn, gI_Lp1_L_0_0, gauge_xt_edge_cont, g_nb_t_up, 102);
    XCHANGE_GAUGE_PAIR(gI_Lm2_L_0_0, gauge_xt_edge_gath, g_nb_t_up, gI_m2_L_0_0,  gauge_xt_edge_cont, g_nb_t_dn, 103);

    /* x2t-edge; x=L+1 comes before x=-2, see [DEP!] */
    XCHANGE_GAUGE_PAIR(gI_0_Lp1_0_0,   gauge_xt_edge_gath, g_nb_t_dn, gI_L_Lp1_0_0,  gauge_xt_edge_cont, g_nb_t_up, 104);
    XCHANGE_GAUGE_PAIR(gI_Lm1_Lp1_0_0, gauge_xt_edge_gath, g_nb_t_up, gI_m1_Lp1_0_0, gauge_xt_edge_cont, g_nb_t_dn, 105);
  }
#    endif

#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
  /* y-Rand */
  XCHANGE_GAUGE_PAIR(gI_0_0_0_0,   gauge_y_slice_gath, g_nb_y_dn, gI_0_0_L_0,   gauge_y_slice_cont, g_nb_y_up, 106);
  XCHANGE_GAUGE_PAIR(gI_0_0_Lm1_0, gauge_y_slice_gath, g_nb_y_up, gI_0_0_m1_0,  gauge_y_slice_cont, g_nb_y_dn, 107);

  if(g_dbw2rand > 0) {
    /* y2-Rand */
    XCHANGE_GAUGE_PAIR(gI_0_0_p1_0,  gauge_y_slice_gath, g_nb_y_dn, gI_0_0_Lp1_0, gauge_y_slice_cont, g_nb_y_up, 108);
    XCHANGE_GAUGE_PAIR(gI_0_0_Lm2_0, gauge_y_slice_gath, g_nb_y_up, gI_0_0_m2_0,  gauge_y_slice_cont, g_nb_y_dn, 109);
  }
#    endif

  MPI_Waitall(cntr, request, status);
  cntr=0;

  /* ---------------- stage 3: yx/ty edges and z boundaries ---------------- */

  /* see [ORD!] above, with x in the role of t and y in the role of x */
  /* see [DEP!] above, here y=L comes before y=-1 */

#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
  /* yx-edge (data sits on the y-Rand), exchanged in x direction */
  XCHANGE_GAUGE_PAIR(gI_0_0_L_0,   gauge_yx_edge_gath, g_nb_x_dn, gI_0_L_L_0,   gauge_yx_edge_cont, g_nb_x_up, 110);
  XCHANGE_GAUGE_PAIR(gI_0_Lm1_L_0, gauge_yx_edge_gath, g_nb_x_up, gI_0_m1_L_0,  gauge_yx_edge_cont, g_nb_x_dn, 111);
#    endif

  /* see [ORD!] above, with y in the role of t and t in the role of x */
  /* see [DEP!] above, here t=L comes before t=-1 */

#    if (defined PARALLELXYT || defined PARALLELXYZT )
  /* ty-edge (data sits on the t-Rand), exchanged in y direction */
  XCHANGE_GAUGE_PAIR(gI_L_0_0_0,   gauge_ty_edge_gath, g_nb_y_dn, gI_L_0_L_0,   gauge_ty_edge_cont, g_nb_y_up, 112);
  XCHANGE_GAUGE_PAIR(gI_L_0_Lm1_0, gauge_ty_edge_gath, g_nb_y_up, gI_L_0_m1_0,  gauge_ty_edge_cont, g_nb_y_dn, 113);
#    endif

  if(g_dbw2rand > 0) {

#    if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ )
    /* x2y-edge, exchanged in x direction; y=L comes before y=-1 */
    XCHANGE_GAUGE_PAIR(gI_0_p1_L_0,  gauge_yx_edge_gath, g_nb_x_dn, gI_0_Lp1_L_0, gauge_yx_edge_cont, g_nb_x_up, 114);
    XCHANGE_GAUGE_PAIR(gI_0_Lm2_L_0, gauge_yx_edge_gath, g_nb_x_up, gI_0_m2_L_0,  gauge_yx_edge_cont, g_nb_x_dn, 115);

    /* y2x-edge, exchanged in x direction */
    XCHANGE_GAUGE_PAIR(gI_0_0_Lp1_0,   gauge_yx_edge_gath, g_nb_x_dn, gI_0_L_Lp1_0,  gauge_yx_edge_cont, g_nb_x_up, 116);
    XCHANGE_GAUGE_PAIR(gI_0_Lm1_Lp1_0, gauge_yx_edge_gath, g_nb_x_up, gI_0_m1_Lp1_0, gauge_yx_edge_cont, g_nb_x_dn, 117);
#    endif
#    if (defined PARALLELXYT || defined PARALLELXYZT )
    /* t2y-edge, exchanged in y direction */
    XCHANGE_GAUGE_PAIR(gI_Lp1_0_0_0,   gauge_ty_edge_gath, g_nb_y_dn, gI_Lp1_0_L_0,  gauge_ty_edge_cont, g_nb_y_up, 118);
    XCHANGE_GAUGE_PAIR(gI_Lp1_0_Lm1_0, gauge_ty_edge_gath, g_nb_y_up, gI_Lp1_0_m1_0, gauge_ty_edge_cont, g_nb_y_dn, 119);

    /* y2t-edge, exchanged in y direction */
    XCHANGE_GAUGE_PAIR(gI_L_0_p1_0,  gauge_ty_edge_gath, g_nb_y_dn, gI_L_0_Lp1_0, gauge_ty_edge_cont, g_nb_y_up, 120);
    XCHANGE_GAUGE_PAIR(gI_L_0_Lm2_0, gauge_ty_edge_gath, g_nb_y_up, gI_L_0_m2_0,  gauge_ty_edge_cont, g_nb_y_dn, 121);
#    endif
  }
#    if (defined PARALLELXYZT || defined PARALLELXYZ )
  /* z-Rand */
  XCHANGE_GAUGE_PAIR(gI_0_0_0_0,   gauge_z_slice_gath, g_nb_z_dn, gI_0_0_0_L,   gauge_z_slice_cont, g_nb_z_up, 122);
  XCHANGE_GAUGE_PAIR(gI_0_0_0_Lm1, gauge_z_slice_gath, g_nb_z_up, gI_0_0_0_m1,  gauge_z_slice_cont, g_nb_z_dn, 123);

  if(g_dbw2rand > 0) {
    /* z2-Rand */
    XCHANGE_GAUGE_PAIR(gI_0_0_0_p1,  gauge_z_slice_gath, g_nb_z_dn, gI_0_0_0_Lp1, gauge_z_slice_cont, g_nb_z_up, 124);
    XCHANGE_GAUGE_PAIR(gI_0_0_0_Lm2, gauge_z_slice_gath, g_nb_z_up, gI_0_0_0_m2,  gauge_z_slice_cont, g_nb_z_dn, 125);
  }
#    endif
  MPI_Waitall(cntr, request, status);
  cntr=0;

  /* ---------------- stage 4: zx, tz and zy edges ---------------- */

  /* see [ORD!] above, with x in the role of t and z in the role of x */
  /* see [DEP!] above, here z=L comes before z=-1 */

#    if (defined PARALLELXYZT || defined PARALLELXYZ )
  /* zx-edge (data sits on the z-Rand), exchanged in x direction */
  XCHANGE_GAUGE_PAIR(gI_0_0_0_L,   gauge_zx_edge_gath, g_nb_x_dn, gI_0_L_0_L,   gauge_zx_edge_cont, g_nb_x_up, 126);
  XCHANGE_GAUGE_PAIR(gI_0_Lm1_0_L, gauge_zx_edge_gath, g_nb_x_up, gI_0_m1_0_L,  gauge_zx_edge_cont, g_nb_x_dn, 127);
#    endif

  /* see [ORD!] above, with z in the role of t and t in the role of x */
  /* see [DEP!] above, here t=L comes before t=-1 */

#    if (defined PARALLELXYZT)
  /* tz-edge (data sits on the t-Rand), exchanged in z direction */
  XCHANGE_GAUGE_PAIR(gI_L_0_0_0,   gauge_tz_edge_gath, g_nb_z_dn, gI_L_0_0_L,   gauge_tz_edge_cont, g_nb_z_up, 128);
  XCHANGE_GAUGE_PAIR(gI_L_0_0_Lm1, gauge_tz_edge_gath, g_nb_z_up, gI_L_0_0_m1,  gauge_tz_edge_cont, g_nb_z_dn, 129);
#    endif

  /* see [ORD!] above, with y in the role of t and z in the role of x */
  /* see [DEP!] above, here z=L comes before z=-1 */

#    if (defined PARALLELXYZT || defined PARALLELXYZ )
  /* zy-edge (data sits on the z-Rand), exchanged in y direction */
  XCHANGE_GAUGE_PAIR(gI_0_0_0_L,   gauge_zy_edge_gath, g_nb_y_dn, gI_0_0_L_L,   gauge_zy_edge_cont, g_nb_y_up, 130);
  XCHANGE_GAUGE_PAIR(gI_0_0_Lm1_L, gauge_zy_edge_gath, g_nb_y_up, gI_0_0_m1_L,  gauge_zy_edge_cont, g_nb_y_dn, 131);
#    endif

  if(g_dbw2rand > 0) {

#    if (defined PARALLELXYZT)
    /* t2z-edge, exchanged in z direction; t=L+1 comes before t=-2 */
    XCHANGE_GAUGE_PAIR(gI_Lp1_0_0_0,   gauge_tz_edge_gath, g_nb_z_dn, gI_Lp1_0_0_L,  gauge_tz_edge_cont, g_nb_z_up, 132);
    XCHANGE_GAUGE_PAIR(gI_Lp1_0_0_Lm1, gauge_tz_edge_gath, g_nb_z_up, gI_Lp1_0_0_m1, gauge_tz_edge_cont, g_nb_z_dn, 133);

    /* z2t-edge, exchanged in z direction */
    XCHANGE_GAUGE_PAIR(gI_L_0_0_p1,  gauge_tz_edge_gath, g_nb_z_dn, gI_L_0_0_Lp1, gauge_tz_edge_cont, g_nb_z_up, 134);
    XCHANGE_GAUGE_PAIR(gI_L_0_0_Lm2, gauge_tz_edge_gath, g_nb_z_up, gI_L_0_0_m2,  gauge_tz_edge_cont, g_nb_z_dn, 135);
#    endif
#    if (defined PARALLELXYZT || defined PARALLELXYZ )
    /* z2x-edge, exchanged in x direction */
    XCHANGE_GAUGE_PAIR(gI_0_0_0_Lp1,   gauge_zx_edge_gath, g_nb_x_dn, gI_0_L_0_Lp1,  gauge_zx_edge_cont, g_nb_x_up, 136);
    XCHANGE_GAUGE_PAIR(gI_0_Lm1_0_Lp1, gauge_zx_edge_gath, g_nb_x_up, gI_0_m1_0_Lp1, gauge_zx_edge_cont, g_nb_x_dn, 137);

    /* x2z-edge, exchanged in x direction */
    XCHANGE_GAUGE_PAIR(gI_0_p1_0_L,  gauge_zx_edge_gath, g_nb_x_dn, gI_0_Lp1_0_L, gauge_zx_edge_cont, g_nb_x_up, 138);
    XCHANGE_GAUGE_PAIR(gI_0_Lm2_0_L, gauge_zx_edge_gath, g_nb_x_up, gI_0_m2_0_L,  gauge_zx_edge_cont, g_nb_x_dn, 139);
#    endif
#    if (defined PARALLELXYZT || defined PARALLELXYZ )
    /* z2y-edge, exchanged in y direction; z=L+1 comes before z=-2 */
    XCHANGE_GAUGE_PAIR(gI_0_0_0_Lp1,   gauge_zy_edge_gath, g_nb_y_dn, gI_0_0_L_Lp1,  gauge_zy_edge_cont, g_nb_y_up, 140);
    XCHANGE_GAUGE_PAIR(gI_0_0_Lm1_Lp1, gauge_zy_edge_gath, g_nb_y_up, gI_0_0_m1_Lp1, gauge_zy_edge_cont, g_nb_y_dn, 141);

    /* y2z-edge, exchanged in y direction; z=L comes before z=-1 */
    XCHANGE_GAUGE_PAIR(gI_0_0_p1_L,  gauge_zy_edge_gath, g_nb_y_dn, gI_0_0_Lp1_L, gauge_zy_edge_cont, g_nb_y_up, 142);
    XCHANGE_GAUGE_PAIR(gI_0_0_Lm2_L, gauge_zy_edge_gath, g_nb_y_up, gI_0_0_m2_L,  gauge_zy_edge_cont, g_nb_y_dn, 143);
#    endif
  }

#    if (defined PARALLELXYZT || defined PARALLELXYZ )
  MPI_Waitall(cntr, request, status);
#    endif

#    undef XCHANGE_GAUGE_PAIR
#  endif /* MPI */
  return;
}

+# else /* _INDEX_INDEP_GEOM */ + +void xchange_gauge(su3 ** const gf) { + int cntr=0; +# ifdef MPI + MPI_Request request[105]; + MPI_Status status[105]; + + /* send the data to the
neighbour on the left */ + /* recieve the data from the neighbour on the right */ + + MPI_Isend(gf[0], 1, gauge_time_slice_cont, g_nb_t_dn, 83, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME], 1, gauge_time_slice_cont, g_nb_t_up, 83, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Isend(gf[(T-1)*LX*LY*LZ], 1, gauge_time_slice_cont, g_nb_t_up, 84, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[(T+1)*LX*LY*LZ], 1, gauge_time_slice_cont, g_nb_t_dn, 84, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + /* t2-Rand */ + MPI_Isend(gf[1*LX*LY*LZ], 1, gauge_time_slice_cont, g_nb_t_dn, 85, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND], 1, gauge_time_slice_cont, g_nb_t_up, 85, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + /* t2-Rand */ + MPI_Isend(gf[(T-2)*LX*LY*LZ], 1, gauge_time_slice_cont, g_nb_t_up, 86, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND+LX*LY*LZ], 1, gauge_time_slice_cont, g_nb_t_dn, 86, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Isend(gf[0], 1, gauge_x_slice_gath, g_nb_x_dn, 87, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[(T+2)*LX*LY*LZ], 1, gauge_x_slice_cont, g_nb_x_up, 87, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* x-Rand */ + MPI_Isend(gf[(LX-1)*LY*LZ], 1, gauge_x_slice_gath, 
g_nb_x_up, 88, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[(T+2)*LX*LY*LZ + T*LY*LZ], 1, gauge_x_slice_cont, g_nb_x_dn, 88, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* x2-Rand */ + MPI_Isend(gf[LY*LZ], 1, gauge_x_slice_gath, g_nb_x_dn, 89, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND+2*LX*LY*LZ], 1, gauge_x_slice_cont, g_nb_x_up, 89, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* x2-Rand */ + MPI_Isend(gf[(LX-2)*LY*LZ], 1, gauge_x_slice_gath, g_nb_x_up, 90, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND+2*LX*LY*LZ + T*LY*LZ], 1, gauge_x_slice_cont, g_nb_x_dn, 90, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } +# endif + MPI_Waitall(cntr, request, status); + cntr=0; +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* The edges */ + + /* send the data to the neighbour on the left in t direction */ + /* recieve the data from the neighbour on the right in t direction */ + /* is on the x-Rand: xt-edge */ + MPI_Isend(gf[(T+2)*LX*LY*LZ], 1, gauge_xt_edge_gath, g_nb_t_dn, 100, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + RAND], 1, gauge_xt_edge_cont, g_nb_t_up, 100, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + /* xt-edge */ + MPI_Isend(gf[(T+2)*LX*LY*LZ + (T-1)*LY*LZ], 1, gauge_xt_edge_gath, g_nb_t_up, 101, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + RAND + 2*LY*LZ], 1, gauge_xt_edge_cont, g_nb_t_dn, 101, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in 
t direction */ + /* recieve the data from the neighbour on the right in t direction */ + /* t2x-edge */ + MPI_Isend(gf[(T+2)*LX*LY*LZ + LY*LZ], + 1, gauge_xt_edge_gath, g_nb_t_dn, 102, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND], + 1, gauge_xt_edge_cont, g_nb_t_up, 102, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + /* t2x-edge */ + MPI_Isend(gf[(T+2)*LX*LY*LZ + (T-2)*LY*LZ], + 1, gauge_xt_edge_gath, g_nb_t_up, 103, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 2*LY*LZ], + 1, gauge_xt_edge_cont, g_nb_t_dn, 103, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in t direction */ + /* recieve the data from the neighbour on the right in t direction */ + /* x2t-edge */ + MPI_Isend(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ], + 1, gauge_xt_edge_gath, g_nb_t_dn, 104, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 4*LY*LZ], + 1, gauge_xt_edge_cont, g_nb_t_up, 104, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + /* x2t-edge */ + MPI_Isend(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ + (T-1)*LY*LZ], + 1, gauge_xt_edge_gath, g_nb_t_up, 105, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 6*LY*LZ], + 1, gauge_xt_edge_cont, g_nb_t_dn, 105, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } + /* end of if defined PARALLELXT || PARALLELXYT || PARALLELXYZT*/ +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Isend(gf[0], 1, gauge_y_slice_gath, g_nb_y_dn, 106, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + 
2*LZ*(LX*LY + T*LY)], 1, gauge_y_slice_cont, g_nb_y_up, 106, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Isend(gf[(LY-1)*LZ], 1, gauge_y_slice_gath, g_nb_y_up, 107, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + 2*LZ*(LX*LY + T*LY) + T*LX*LZ], 1, gauge_y_slice_cont, g_nb_y_dn, 107, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* y2-Rand */ + MPI_Isend(gf[LZ], 1, gauge_y_slice_gath, g_nb_y_dn, 108, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ], 1, gauge_y_slice_cont, g_nb_y_up, 108, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* y2-Rand */ + MPI_Isend(gf[(LY-2)*LZ], 1, gauge_y_slice_gath, g_nb_y_up, 109, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ + T*LX*LZ], 1, gauge_y_slice_cont, g_nb_y_dn, 109, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } +# endif + MPI_Waitall(cntr, request, status); + cntr=0; +# if (defined PARALLELXYT || defined PARALLELXYZT) + + /* jetzt wirds richtig eklig ... 
*/ + + /* edges */ + + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* is on the y-Rand -> yx-edge*/ + MPI_Isend(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ], 1, gauge_yx_edge_gath, g_nb_x_dn, 110, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + RAND + 4*LY*LZ], 1, gauge_yx_edge_cont, g_nb_x_up, 110, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* yx-edge */ + MPI_Isend(gf[VOLUME + 2*LZ*(LX*LY + T*LY) + (LX-1)*LZ], 1, gauge_yx_edge_gath, g_nb_x_up, 111, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + RAND + 4*LY*LZ + 2*T*LZ], 1, gauge_yx_edge_cont, g_nb_x_dn, 111, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* is on the t-Rand -> ty-edge*/ + MPI_Isend(gf[VOLUME], 1, gauge_ty_edge_gath, g_nb_y_dn, 112, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ], 1, gauge_ty_edge_cont, g_nb_y_up, 112, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* ty-edge */ + MPI_Isend(gf[VOLUME + (LY-1)*LZ], 1, gauge_ty_edge_gath, g_nb_y_up, 113, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 2*LX*LZ], 1, gauge_ty_edge_cont, g_nb_y_dn, 113, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* x2y edge */ + MPI_Isend(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + LZ], + 1, gauge_yx_edge_gath, g_nb_x_dn, 114, + g_cart_grid, &request[cntr]); + 
MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ], + 1, gauge_yx_edge_cont, g_nb_x_up, 114, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* x2y-edge */ + MPI_Isend(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + (LX-2)*LZ], + 1, gauge_yx_edge_gath, g_nb_x_up, 115, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 2*T*LZ], + 1, gauge_yx_edge_cont, g_nb_x_dn, 115, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* y2x -edge */ + MPI_Isend(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ], + 1, gauge_yx_edge_gath, g_nb_x_dn, 116, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 4*T*LZ], + 1, gauge_yx_edge_cont, g_nb_x_up, 116, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* y2x edge */ + MPI_Isend(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + (LX-1)*LZ], + 1, gauge_yx_edge_gath, g_nb_x_up, 117, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 6*T*LZ], + 1, gauge_yx_edge_cont, g_nb_x_dn, 117, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* t2y-edge */ + MPI_Isend(gf[VOLUMEPLUSRAND], + 1, gauge_ty_edge_gath, g_nb_y_dn, 118, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ], + 1, gauge_ty_edge_cont, g_nb_y_up, 118, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y 
direction */ + /* t2y edge */ + MPI_Isend(gf[VOLUMEPLUSRAND + (LY-1)*LZ], + 1, gauge_ty_edge_gath, g_nb_y_up, 119, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 2*LX*LZ], + 1, gauge_ty_edge_cont, g_nb_y_dn, 119, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* y2t edge */ + MPI_Isend(gf[VOLUME + LZ], + 1, gauge_ty_edge_gath, g_nb_y_dn, 120, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 4*LX*LZ], + 1, gauge_ty_edge_cont, g_nb_y_up, 120, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* y2t-edge */ + MPI_Isend(gf[VOLUME + (LY-2)*LZ], + 1, gauge_ty_edge_gath, g_nb_y_up, 121, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 6*LX*LZ], + 1, gauge_ty_edge_cont, g_nb_y_dn, 121, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } + + /* end of if defined PARALLELXYT || PARALLELXYZT */ +# endif +# if defined PARALLELXYZT + /* z-Rand */ + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Isend(gf[0], + 1, gauge_z_slice_gath, g_nb_z_dn, 122, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*LZ*T*LX], + 1, gauge_z_slice_cont, g_nb_z_up, 122, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + MPI_Isend(gf[LZ-1], + 1, gauge_z_slice_gath, g_nb_z_up, 123, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ + T*LX*LY], + 1, gauge_z_slice_cont, g_nb_z_dn, 123, + g_cart_grid, 
&request[cntr+1]); + cntr=cntr+2; + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + /* z2-Rand */ + MPI_Isend(gf[1], + 1, gauge_z_slice_gath, g_nb_z_dn, 124, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ + 2*T*LX*LZ], + 1, gauge_z_slice_cont, g_nb_z_up, 124, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* z2-Rand */ + MPI_Isend(gf[LZ-2], + 1, gauge_z_slice_gath, g_nb_z_up, 125, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ + 2*T*LX*LZ + T*LX*LY], + 1, gauge_z_slice_cont, g_nb_z_dn, 125, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } +# endif + MPI_Waitall(cntr, request, status); +# if defined PARALLELXYZT + cntr=0; + /* edges */ + + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* is on the z-Rand -> zx-edge*/ + MPI_Isend(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ], + 1, gauge_zx_edge_gath, g_nb_x_dn, 126, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ], + 1, gauge_zx_edge_cont, g_nb_x_up, 126, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* zx-edge */ + MPI_Isend(gf[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ + (LX-1)*LY], + 1, gauge_zx_edge_gath, g_nb_x_up, 127, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 2*T*LY], + 1, gauge_zx_edge_cont, g_nb_x_dn, 127, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on 
the right in z direction */ + /* is on the t-Rand -> tz-edge*/ + MPI_Isend(gf[VOLUME], + 1, gauge_tz_edge_gath, g_nb_z_dn, 128, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY], + 1, gauge_tz_edge_cont, g_nb_z_up, 128, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* tz-edge */ + MPI_Isend(gf[VOLUME + (LZ-1)], + 1, gauge_tz_edge_gath, g_nb_z_up, 129, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 2*LX*LY], + 1, gauge_tz_edge_cont, g_nb_z_dn, 129, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* is on the z-Rand -> zy-edge*/ + MPI_Isend(gf[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ], + 1, gauge_zy_edge_gath, g_nb_y_dn, 130, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY], + 1, gauge_zy_edge_cont, g_nb_y_up, 130, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* zy-edge */ + MPI_Isend(gf[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ + (LY-1)], + 1, gauge_zy_edge_gath, g_nb_y_up, 131, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY + 2*T*LX], + 1, gauge_zy_edge_cont, g_nb_y_dn, 131, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* rectangular gauge action Stuff! 
*/ + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + /* t2z edge */ + MPI_Isend(gf[VOLUMEPLUSRAND], + 1, gauge_tz_edge_gath, g_nb_z_dn, 132, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ], + 1, gauge_tz_edge_cont, g_nb_z_up, 132, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* t2z-edge */ + MPI_Isend(gf[VOLUMEPLUSRAND + (LZ-1)], + 1, gauge_tz_edge_gath, g_nb_z_up, 133, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 2*LX*LY], + 1, gauge_tz_edge_cont, g_nb_z_dn, 133, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + /* z2t -edge */ + MPI_Isend(gf[VOLUME + 1], + 1, gauge_tz_edge_gath, g_nb_z_dn, 134, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 4*LX*LY], + 1, gauge_tz_edge_cont, g_nb_z_up, 134, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* z2t edge */ + MPI_Isend(gf[VOLUME + (LZ-2)], + 1, gauge_tz_edge_gath, g_nb_z_up, 135, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 6*LX*LY], + 1, gauge_tz_edge_cont, g_nb_z_dn, 135, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* z2x-edge */ + MPI_Isend(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ], + 1, gauge_zx_edge_gath, 
g_nb_x_dn, 136, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY], + 1, gauge_zx_edge_cont, g_nb_x_up, 136, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* z2x edge */ + MPI_Isend(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LX-1)*LY], + 1, gauge_zx_edge_gath, g_nb_x_up, 137, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 2*T*LY], + 1, gauge_zx_edge_cont, g_nb_x_dn, 137, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* x2z edge */ + MPI_Isend(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + LY], + 1, gauge_zx_edge_gath, g_nb_x_dn, 138, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 4*T*LY], + 1, gauge_zx_edge_cont, g_nb_x_up, 138, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* x2z-edge */ + MPI_Isend(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LX-2)*LY], + 1, gauge_zx_edge_gath, g_nb_x_up, 139, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 6*T*LY], + 1, gauge_zx_edge_cont, g_nb_x_dn, 139, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* z2y-edge */ + MPI_Isend(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ], + 1, gauge_zy_edge_gath, g_nb_y_dn, 140, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 
8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY], + 1, gauge_zy_edge_cont, g_nb_y_up, 140, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* z2y edge */ + MPI_Isend(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LY-1)], + 1, gauge_zy_edge_gath, g_nb_y_up, 141, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 2*T*LX], + 1, gauge_zy_edge_cont, g_nb_y_dn, 141, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* y2z edge */ + MPI_Isend(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + 1], + 1, gauge_zy_edge_gath, g_nb_y_dn, 142, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 4*T*LX], + 1, gauge_zy_edge_cont, g_nb_y_up, 142, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* y2z-edge */ + MPI_Isend(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LY-2)], + 1, gauge_zy_edge_gath, g_nb_y_up, 143, + g_cart_grid, &request[cntr]); + MPI_Irecv(gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 6*T*LX], + 1, gauge_zy_edge_cont, g_nb_y_dn, 143, + g_cart_grid, &request[cntr+1]); + cntr=cntr+2; + } + MPI_Waitall(cntr, request, status); + + /* end of if defined PARALLELXYZT */ +# endif +# endif + return; +} + +# endif /* _INDEX_INDEP_GEOM */ + +#else /* _NON_BLOCKING */ + +# if defined _INDEX_INDEP_GEOM + +void xchange_gauge(su3 ** const gf) { + +#ifdef MPI + + MPI_Status status; +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the 
data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + MPI_Sendrecv(gf[gI_0_0_0_0], 1, gauge_time_slice_cont, g_nb_t_dn, 83, + gf[gI_L_0_0_0], 1, gauge_time_slice_cont, g_nb_t_up, 83, + g_cart_grid, &status); + + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Sendrecv(gf[gI_Lm1_0_0_0], 1, gauge_time_slice_cont, g_nb_t_up, 84, + gf[gI_m1_0_0_0], 1, gauge_time_slice_cont, g_nb_t_dn, 84, + g_cart_grid, &status); + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + /* t2-Rand */ + MPI_Sendrecv(gf[gI_p1_0_0_0], 1, gauge_time_slice_cont, g_nb_t_dn, 85, + gf[gI_Lp1_0_0_0], 1, gauge_time_slice_cont, g_nb_t_up, 85, + g_cart_grid, &status); + + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + /* t2-Rand */ + MPI_Sendrecv(gf[gI_Lm2_0_0_0], 1, gauge_time_slice_cont, g_nb_t_up, 86, + gf[gI_m2_0_0_0], 1, gauge_time_slice_cont, g_nb_t_dn, 86, + g_cart_grid, &status); + } + +# endif +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Sendrecv(gf[gI_0_0_0_0], 1, gauge_x_slice_gath, g_nb_x_dn, 87, + gf[gI_0_L_0_0], 1, gauge_x_slice_cont, g_nb_x_up, 87, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* x2-Rand */ + MPI_Sendrecv(gf[gI_0_Lm1_0_0], 1, gauge_x_slice_gath, g_nb_x_up, 88, + gf[gI_0_m1_0_0], 1, gauge_x_slice_cont, g_nb_x_dn, 88, + g_cart_grid, &status); + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on 
the right in x direction */ + /* x2-Rand */ + MPI_Sendrecv(gf[gI_0_p1_0_0], 1, gauge_x_slice_gath, g_nb_x_dn, 89, + gf[gI_0_Lp1_0_0], 1, gauge_x_slice_cont, g_nb_x_up, 89, + g_cart_grid, &status); + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* x2-Rand */ + MPI_Sendrecv(gf[gI_0_Lm2_0_0], 1, gauge_x_slice_gath, g_nb_x_up, 90, + gf[gI_0_m2_0_0], 1, gauge_x_slice_cont, g_nb_x_dn, 90, + g_cart_grid, &status); + } +# endif + /* The edges */ +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the left in t direction */ + /* recieve the data from the neighbour on the right in t direction */ + /* is on the x-Rand: xt-edge */ + MPI_Sendrecv(gf[gI_0_L_0_0], 1, gauge_xt_edge_gath, g_nb_t_dn, 100, + gf[gI_L_L_0_0], 1, gauge_xt_edge_cont, g_nb_t_up, 100, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + /* xt-edge */ + MPI_Sendrecv(gf[gI_Lm1_L_0_0], 1, gauge_xt_edge_gath, g_nb_t_up, 101, + gf[gI_m1_L_0_0], 1, gauge_xt_edge_cont, g_nb_t_dn, 101, + g_cart_grid, &status); + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in t direction */ + /* recieve the data from the neighbour on the right in t direction */ + /* t2x-edge */ + MPI_Sendrecv(gf[gI_p1_L_0_0], 1, gauge_xt_edge_gath, g_nb_t_dn, 102, + gf[gI_Lp1_L_0_0], 1, gauge_xt_edge_cont, g_nb_t_up, 102, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + /* t2x-edge */ + MPI_Sendrecv(gf[gI_Lm2_L_0_0], 1, gauge_xt_edge_gath, g_nb_t_up, 103, + gf[gI_m2_L_0_0], 1, gauge_xt_edge_cont, g_nb_t_dn, 103, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in t direction */ + /* recieve the data from the 
neighbour on the right in t direction */ + /* x2t-edge */ + MPI_Sendrecv(gf[gI_0_Lp1_0_0], 1, gauge_xt_edge_gath, g_nb_t_dn, 104, + gf[gI_L_Lp1_0_0], 1, gauge_xt_edge_cont, g_nb_t_up, 104, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + /* x2t-edge */ + MPI_Sendrecv(gf[gI_Lm1_Lp1_0_0], 1, gauge_xt_edge_gath, g_nb_t_up, 105, + gf[gI_m1_Lp1_0_0], 1, gauge_xt_edge_cont, g_nb_t_dn, 105, + g_cart_grid, &status); + } + /* end of if defined PARALLELXT || PARALLELXYT || PARALLELXYZT*/ +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Sendrecv(gf[gI_0_0_0_0], 1, gauge_y_slice_gath, g_nb_y_dn, 106, + gf[gI_0_0_L_0], 1, gauge_y_slice_cont, g_nb_y_up, 106, + g_cart_grid, &status); + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Sendrecv(gf[gI_0_0_Lm1_0], 1, gauge_y_slice_gath, g_nb_y_up, 107, + gf[gI_0_0_m1_0], 1, gauge_y_slice_cont, g_nb_y_dn, 107, + g_cart_grid, &status); + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* y2-Rand */ + MPI_Sendrecv(gf[gI_0_0_p1_0], 1, gauge_y_slice_gath, g_nb_y_dn, 108, + gf[gI_0_0_Lp1_0], 1, gauge_y_slice_cont, g_nb_y_up, 108, + g_cart_grid, &status); + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* y2-Rand */ + MPI_Sendrecv(gf[gI_0_0_Lm2_0], 1, gauge_y_slice_gath, g_nb_y_up, 109, + gf[gI_0_0_m2_0], 1, gauge_y_slice_cont, g_nb_y_dn, 109, + g_cart_grid, &status); + } +# endif + /* jetzt wirds richtig eklig ... 
*/ + + /* edges */ +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* is on the y-Rand -> yx-edge*/ + MPI_Sendrecv(gf[gI_0_0_L_0], 1, gauge_yx_edge_gath, g_nb_x_dn, 110, + gf[gI_0_L_L_0], 1, gauge_yx_edge_cont, g_nb_x_up, 110, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* yx-edge */ + MPI_Sendrecv(gf[gI_0_Lm1_L_0], 1, gauge_yx_edge_gath, g_nb_x_up, 111, + gf[gI_0_m1_L_0], 1, gauge_yx_edge_cont, g_nb_x_dn, 111, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* is on the t-Rand -> ty-edge*/ + MPI_Sendrecv(gf[gI_L_0_0_0], 1, gauge_ty_edge_gath, g_nb_y_dn, 112, + gf[gI_L_0_L_0], 1, gauge_ty_edge_cont, g_nb_y_up, 112, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* ty-edge */ + MPI_Sendrecv(gf[gI_L_0_Lm1_0], 1, gauge_ty_edge_gath, g_nb_y_up, 113, + gf[gI_L_0_m1_0], 1, gauge_ty_edge_cont, g_nb_y_dn, 113, + g_cart_grid, &status); +# endif + + + if(g_dbw2rand > 0) { + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* x2y edge */ + MPI_Sendrecv(gf[gI_0_p1_L_0], 1, gauge_yx_edge_gath, g_nb_x_dn, 114, + gf[gI_0_Lp1_L_0], 1, gauge_yx_edge_cont, g_nb_x_up, 114, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from 
the neighbour on the left in y direction */ + /* x2y-edge */ + MPI_Sendrecv(gf[gI_0_Lm2_L_0], 1, gauge_yx_edge_gath, g_nb_x_up, 115, + gf[gI_0_m2_L_0], 1, gauge_yx_edge_cont, g_nb_x_dn, 115, + g_cart_grid, &status); + + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* y2x -edge */ + MPI_Sendrecv(gf[gI_0_0_Lp1_0], 1, gauge_yx_edge_gath, g_nb_x_dn, 116, + gf[gI_0_L_Lp1_0], 1, gauge_yx_edge_cont, g_nb_x_up, 116, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* y2x edge */ + MPI_Sendrecv(gf[gI_0_Lm1_Lp1_0], 1, gauge_yx_edge_gath, g_nb_x_up, 117, + gf[gI_0_m1_Lp1_0], 1, gauge_yx_edge_cont, g_nb_x_dn, 117, + g_cart_grid, &status); + +# endif +# if (defined PARALLELXYT || defined PARALLELXYZT ) + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* t2y-edge */ + MPI_Sendrecv(gf[gI_Lp1_0_0_0], 1, gauge_ty_edge_gath, g_nb_y_dn, 118, + gf[gI_Lp1_0_L_0], 1, gauge_ty_edge_cont, g_nb_y_up, 118, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* t2y edge */ + MPI_Sendrecv(gf[gI_Lp1_0_Lm1_0], 1, gauge_ty_edge_gath, g_nb_y_up, 119, + gf[gI_Lp1_0_m1_0], 1, gauge_ty_edge_cont, g_nb_y_dn, 119, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* y2t edge */ + MPI_Sendrecv(gf[gI_L_0_p1_0], 1, gauge_ty_edge_gath, g_nb_y_dn, 120, + gf[gI_L_0_Lp1_0], 1, gauge_ty_edge_cont, g_nb_y_up, 120, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* 
y2t-edge */ + MPI_Sendrecv(gf[gI_L_0_Lm2_0], 1, gauge_ty_edge_gath, g_nb_y_up, 121, + gf[gI_L_0_m2_0], 1, gauge_ty_edge_cont, g_nb_y_dn, 121, + g_cart_grid, &status); +# endif /* end of if defined PARALLELXYT || PARALLELXYZT */ + } + + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* z-Rand */ + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Sendrecv(gf[gI_0_0_0_0], 1, gauge_z_slice_gath, g_nb_z_dn, 122, + gf[gI_0_0_0_L], 1, gauge_z_slice_cont, g_nb_z_up, 122, + g_cart_grid, &status); + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + MPI_Sendrecv(gf[gI_0_0_0_Lm1], 1, gauge_z_slice_gath, g_nb_z_up, 123, + gf[gI_0_0_0_m1], 1, gauge_z_slice_cont, g_nb_z_dn, 123, + g_cart_grid, &status); + + if(g_dbw2rand > 0) { + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + /* z2-Rand */ + MPI_Sendrecv(gf[gI_0_0_0_p1], 1, gauge_z_slice_gath, g_nb_z_dn, 124, + gf[gI_0_0_0_Lp1], 1, gauge_z_slice_cont, g_nb_z_up, 124, + g_cart_grid, &status); + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* z2-Rand */ + MPI_Sendrecv(gf[gI_0_0_0_Lm2], 1, gauge_z_slice_gath, g_nb_z_up, 125, + gf[gI_0_0_0_m2], 1, gauge_z_slice_cont, g_nb_z_dn, 125, + g_cart_grid, &status); + } + +# endif + /* edges */ + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* is on the z-Rand -> zx-edge*/ + MPI_Sendrecv(gf[gI_0_0_0_L], 1, gauge_zx_edge_gath, g_nb_x_dn, 126, + gf[gI_0_L_0_L], 1, gauge_zx_edge_cont, g_nb_x_up, 126, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x 
direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* zx-edge */ + MPI_Sendrecv(gf[gI_0_Lm1_0_L], 1, gauge_zx_edge_gath, g_nb_x_up, 127, + gf[gI_0_m1_0_L], 1, gauge_zx_edge_cont, g_nb_x_dn, 127, + g_cart_grid, &status); +# endif + +# if (defined PARALLELXYZT) + + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + /* is on the t-Rand -> tz-edge*/ + MPI_Sendrecv(gf[gI_L_0_0_0], 1, gauge_tz_edge_gath, g_nb_z_dn, 128, + gf[gI_L_0_0_L], 1, gauge_tz_edge_cont, g_nb_z_up, 128, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* tz-edge */ + MPI_Sendrecv(gf[gI_L_0_0_Lm1], 1, gauge_tz_edge_gath, g_nb_z_up, 129, + gf[gI_L_0_0_m1], 1, gauge_tz_edge_cont, g_nb_z_dn, 129, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* is on the z-Rand -> zy-edge*/ + MPI_Sendrecv(gf[gI_0_0_0_L], 1, gauge_zy_edge_gath, g_nb_y_dn, 130, + gf[gI_0_0_L_L], 1, gauge_zy_edge_cont, g_nb_y_up, 130, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* zy-edge */ + MPI_Sendrecv(gf[gI_0_0_Lm1_L], 1, gauge_zy_edge_gath, g_nb_y_up, 131, + gf[gI_0_0_m1_L], 1, gauge_zy_edge_cont, g_nb_y_dn, 131, + g_cart_grid, &status); + +# endif + + /* rectangular gauge action Stuff! 
*/ + if(g_dbw2rand > 0) { + +# if (defined PARALLELXYZT) + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + /* t2z edge */ + MPI_Sendrecv(gf[gI_Lp1_0_0_0], 1, gauge_tz_edge_gath, g_nb_z_dn, 132, + gf[gI_Lp1_0_0_L], 1, gauge_tz_edge_cont, g_nb_z_up, 132, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* t2z-edge */ + MPI_Sendrecv(gf[gI_Lp1_0_0_Lm1], 1, gauge_tz_edge_gath, g_nb_z_up, 133, + gf[gI_Lp1_0_0_m1], 1, gauge_tz_edge_cont, g_nb_z_dn, 133, + g_cart_grid, &status); + + + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + /* z2t -edge */ + MPI_Sendrecv(gf[gI_L_0_0_p1], 1, gauge_tz_edge_gath, g_nb_z_dn, 134, + gf[gI_L_0_0_Lp1], 1, gauge_tz_edge_cont, g_nb_z_up, 134, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + /* z2t edge */ + MPI_Sendrecv(gf[gI_L_0_0_Lm2], 1, gauge_tz_edge_gath, g_nb_z_up, 135, + gf[gI_L_0_0_m2], 1, gauge_tz_edge_cont, g_nb_z_dn, 135, + g_cart_grid, &status); + +# endif +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* z2x-edge */ + MPI_Sendrecv(gf[gI_0_0_0_Lp1], 1, gauge_zx_edge_gath, g_nb_x_dn, 136, + gf[gI_0_L_0_Lp1], 1, gauge_zx_edge_cont, g_nb_x_up, 136, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* z2x edge */ + MPI_Sendrecv(gf[gI_0_Lm1_0_Lp1], 1, gauge_zx_edge_gath, g_nb_x_up, 137, + gf[gI_0_m1_0_Lp1], 1, gauge_zx_edge_cont, g_nb_x_dn, 137, + g_cart_grid, 
&status); + + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + /* x2z edge */ + MPI_Sendrecv(gf[gI_0_p1_0_L], 1, gauge_zx_edge_gath, g_nb_x_dn, 138, + gf[gI_0_Lp1_0_L], 1, gauge_zx_edge_cont, g_nb_x_up, 138, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + /* x2z-edge */ + MPI_Sendrecv(gf[gI_0_Lm2_0_L], 1, gauge_zx_edge_gath, g_nb_x_up, 139, + gf[gI_0_m2_0_L], 1, gauge_zx_edge_cont, g_nb_x_dn, 139, + g_cart_grid, &status); + +# endif +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* z2y-edge */ + MPI_Sendrecv(gf[gI_0_0_0_Lp1], 1, gauge_zy_edge_gath, g_nb_y_dn, 140, + gf[gI_0_0_L_Lp1], 1, gauge_zy_edge_cont, g_nb_y_up, 140, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* z2y edge */ + MPI_Sendrecv(gf[gI_0_0_Lm1_Lp1], 1, gauge_zy_edge_gath, g_nb_y_up, 141, + gf[gI_0_0_m1_Lp1], 1, gauge_zy_edge_cont, g_nb_y_dn, 141, + g_cart_grid, &status); + + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + /* y2z edge */ + MPI_Sendrecv(gf[gI_0_0_p1_L], 1, gauge_zy_edge_gath, g_nb_y_dn, 142, + gf[gI_0_0_Lp1_L], 1, gauge_zy_edge_cont, g_nb_y_up, 142, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + /* y2z-edge */ + MPI_Sendrecv(gf[gI_0_0_Lm2_L], 1, gauge_zy_edge_gath, g_nb_y_up, 143, + gf[gI_0_0_m2_L], 1, gauge_zy_edge_cont, g_nb_y_dn, 143, + g_cart_grid, &status); + +# endif /* end of if defined PARALLELXYZT or 
PARALLELXYZ */
+  }
+
+#endif /* MPI */
+  return;
+}
+
+# else /* _INDEX_INDEP_GEOM */
+/*
+ * Exchange the boundary of the gauge field gf with the neighbouring
+ * MPI processes.  This is the variant compiled when _INDEX_INDEP_GEOM
+ * is not defined.
+ *
+ * gf: array of su3 pointers.  Every transfer below is one blocking
+ *     MPI_Sendrecv on g_cart_grid that sends one element of a gauge_*
+ *     MPI datatype starting at one gf[] entry and receives one element
+ *     starting at another gf[] entry.
+ *
+ * The t direction is always exchanged; the x, y and z parts are only
+ * compiled in for the PARALLEL* macros tested below.  The blocks guarded
+ * by g_dbw2rand > 0 exchange additional slices and edges whose receive
+ * targets start at VOLUMEPLUSRAND.
+ *
+ * The order of the calls matters: the edge exchanges send from gf[]
+ * positions (e.g. gf[VOLUME], gf[(T+2)*LX*LY*LZ]) that an earlier slice
+ * exchange in this function has received into.
+ *
+ * Without MPI the function does nothing.
+ *
+ * "Rand" in the identifiers (g_dbw2rand, RAND, VOLUMEPLUSRAND) is German
+ * for "boundary".
+ *
+ * NOTE(review): several message tags are reused (95/96 for the x2
+ * boundary and the xt-edge, 97/98 for the t2x, x2t, x2y and y2x edges,
+ * 310 for the tz-edge and both zy-edges, 504 for both z2x edges).
+ * Presumably harmless because every call is a blocking MPI_Sendrecv
+ * that completes before the next one starts -- confirm.
+ */
+void xchange_gauge(su3 ** const gf) {
+
+#ifdef MPI
+
+  MPI_Status status;
+  /* send the data to the neighbour on the left */
+  /* receive the data from the neighbour on the right */
+  MPI_Sendrecv(gf[0], 1, gauge_time_slice_cont, g_nb_t_dn, 83,
+               gf[VOLUME], 1, gauge_time_slice_cont, g_nb_t_up, 83,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the right */
+  /* receive the data from the neighbour on the left */
+  MPI_Sendrecv(gf[(T-1)*LX*LY*LZ], 1, gauge_time_slice_cont, g_nb_t_up, 84,
+               gf[(T+1)*LX*LY*LZ], 1, gauge_time_slice_cont, g_nb_t_dn, 84,
+               g_cart_grid, &status);
+
+  if(g_dbw2rand > 0) {
+    /* send the data to the neighbour on the left */
+    /* receive the data from the neighbour on the right */
+    /* t2 boundary */
+    MPI_Sendrecv(gf[1*LX*LY*LZ], 1, gauge_time_slice_cont, g_nb_t_dn, 85,
+                 gf[VOLUMEPLUSRAND], 1, gauge_time_slice_cont, g_nb_t_up, 85,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the right */
+    /* receive the data from the neighbour on the left */
+    /* t2 boundary */
+    MPI_Sendrecv(gf[(T-2)*LX*LY*LZ], 1, gauge_time_slice_cont, g_nb_t_up, 86,
+                 gf[VOLUMEPLUSRAND+LX*LY*LZ], 1, gauge_time_slice_cont, g_nb_t_dn, 86,
+                 g_cart_grid, &status);
+  }
+
+# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT)
+  /* send the data to the neighbour on the left in x direction */
+  /* receive the data from the neighbour on the right in x direction */
+  MPI_Sendrecv(gf[0], 1, gauge_x_slice_gath, g_nb_x_dn, 93,
+               gf[(T+2)*LX*LY*LZ], 1, gauge_x_slice_cont, g_nb_x_up, 93,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the right in x direction */
+  /* receive the data from the neighbour on the left in x direction */
+  /* x boundary (this call is outside the g_dbw2rand block) */
+  MPI_Sendrecv(gf[(LX-1)*LY*LZ], 1, gauge_x_slice_gath, g_nb_x_up, 94,
+               gf[(T+2)*LX*LY*LZ + T*LY*LZ], 1, gauge_x_slice_cont, g_nb_x_dn, 94,
+               g_cart_grid, &status);
+
+  if(g_dbw2rand > 0) {
+    /* send the data to the neighbour on the left in x direction */
+    /* receive the data from the neighbour on the right in x direction */
+    /* x2 boundary */
+    MPI_Sendrecv(gf[LY*LZ], 1, gauge_x_slice_gath, g_nb_x_dn, 95,
+                 gf[VOLUMEPLUSRAND+2*LX*LY*LZ], 1, gauge_x_slice_cont, g_nb_x_up, 95,
+                 g_cart_grid, &status);
+    /* send the data to the neighbour on the right in x direction */
+    /* receive the data from the neighbour on the left in x direction */
+    /* x2 boundary */
+    MPI_Sendrecv(gf[(LX-2)*LY*LZ], 1, gauge_x_slice_gath, g_nb_x_up, 96,
+                 gf[VOLUMEPLUSRAND+2*LX*LY*LZ + T*LY*LZ], 1, gauge_x_slice_cont, g_nb_x_dn, 96,
+                 g_cart_grid, &status);
+  }
+
+  /* The edges */
+
+  /* send the data to the neighbour on the left in t direction */
+  /* receive the data from the neighbour on the right in t direction */
+  /* is on the x boundary: xt-edge */
+  MPI_Sendrecv(gf[(T+2)*LX*LY*LZ], 1, gauge_xt_edge_gath, g_nb_t_dn, 95,
+               gf[VOLUME + RAND], 1, gauge_xt_edge_cont, g_nb_t_up, 95,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the right in t direction */
+  /* receive the data from the neighbour on the left in t direction */
+  /* xt-edge */
+  MPI_Sendrecv(gf[(T+2)*LX*LY*LZ + (T-1)*LY*LZ], 1, gauge_xt_edge_gath, g_nb_t_up, 96,
+               gf[VOLUME + RAND + 2*LY*LZ], 1, gauge_xt_edge_cont, g_nb_t_dn, 96,
+               g_cart_grid, &status);
+
+  if(g_dbw2rand > 0) {
+    /* send the data to the neighbour on the left in t direction */
+    /* receive the data from the neighbour on the right in t direction */
+    /* t2x-edge */
+    MPI_Sendrecv(gf[(T+2)*LX*LY*LZ + LY*LZ],
+                 1, gauge_xt_edge_gath, g_nb_t_dn, 97,
+                 gf[VOLUMEPLUSRAND + RAND],
+                 1, gauge_xt_edge_cont, g_nb_t_up, 97,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the right in t direction */
+    /* receive the data from the neighbour on the left in t direction */
+    /* t2x-edge */
+    MPI_Sendrecv(gf[(T+2)*LX*LY*LZ + (T-2)*LY*LZ],
+                 1, gauge_xt_edge_gath, g_nb_t_up, 98,
+                 gf[VOLUMEPLUSRAND + RAND + 2*LY*LZ],
+                 1, gauge_xt_edge_cont, g_nb_t_dn, 98,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the left in t direction */
+    /* receive the data from the neighbour on the right in t direction */
+    /* x2t-edge */
+    MPI_Sendrecv(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ],
+                 1, gauge_xt_edge_gath, g_nb_t_dn, 97,
+                 gf[VOLUMEPLUSRAND + RAND + 4*LY*LZ],
+                 1, gauge_xt_edge_cont, g_nb_t_up, 97,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the right in t direction */
+    /* receive the data from the neighbour on the left in t direction */
+    /* x2t-edge */
+    MPI_Sendrecv(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ + (T-1)*LY*LZ],
+                 1, gauge_xt_edge_gath, g_nb_t_up, 98,
+                 gf[VOLUMEPLUSRAND + RAND + 6*LY*LZ],
+                 1, gauge_xt_edge_cont, g_nb_t_dn, 98,
+                 g_cart_grid, &status);
+  }
+  /* end of if defined PARALLELXT || PARALLELXYT || PARALLELXYZT*/
+# endif
+
+# if (defined PARALLELXYT || defined PARALLELXYZT)
+  /* send the data to the neighbour on the left in y direction */
+  /* receive the data from the neighbour on the right in y direction */
+  MPI_Sendrecv(gf[0], 1, gauge_y_slice_gath, g_nb_y_dn, 103,
+               gf[VOLUME + 2*LZ*(LX*LY + T*LY)], 1, gauge_y_slice_cont, g_nb_y_up, 103,
+               g_cart_grid, &status);
+  /* send the data to the neighbour on the right in y direction */
+  /* receive the data from the neighbour on the left in y direction */
+  MPI_Sendrecv(gf[(LY-1)*LZ], 1, gauge_y_slice_gath, g_nb_y_up, 104,
+               gf[VOLUME + 2*LZ*(LX*LY + T*LY) + T*LX*LZ], 1, gauge_y_slice_cont, g_nb_y_dn, 104,
+               g_cart_grid, &status);
+
+  if(g_dbw2rand > 0) {
+    /* send the data to the neighbour on the left in y direction */
+    /* receive the data from the neighbour on the right in y direction */
+    /* y2 boundary */
+    MPI_Sendrecv(gf[LZ], 1, gauge_y_slice_gath, g_nb_y_dn, 105,
+                 gf[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ], 1, gauge_y_slice_cont, g_nb_y_up, 105,
+                 g_cart_grid, &status);
+    /* send the data to the neighbour on the right in y direction */
+    /* receive the data from the neighbour on the left in y direction */
+    /* y2 boundary */
+    MPI_Sendrecv(gf[(LY-2)*LZ], 1, gauge_y_slice_gath, g_nb_y_up, 106,
+                 gf[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ + T*LX*LZ], 1, gauge_y_slice_cont, g_nb_y_dn, 106,
+                 g_cart_grid, &status);
+  }
+
+  /* now it gets really nasty ... */
+
+  /* edges */
+
+  /* send the data to the neighbour on the left in x direction */
+  /* receive the data from the neighbour on the right in x direction */
+  /* is on the y boundary -> yx-edge */
+  MPI_Sendrecv(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ], 1, gauge_yx_edge_gath, g_nb_x_dn, 107,
+               gf[VOLUME + RAND + 4*LY*LZ], 1, gauge_yx_edge_cont, g_nb_x_up, 107,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the right in x direction */
+  /* receive the data from the neighbour on the left in x direction */
+  /* yx-edge */
+  MPI_Sendrecv(gf[VOLUME + 2*LZ*(LX*LY + T*LY) + (LX-1)*LZ], 1, gauge_yx_edge_gath, g_nb_x_up, 108,
+               gf[VOLUME + RAND + 4*LY*LZ + 2*T*LZ], 1, gauge_yx_edge_cont, g_nb_x_dn, 108,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the left in y direction */
+  /* receive the data from the neighbour on the right in y direction */
+  /* is on the t boundary -> ty-edge */
+  MPI_Sendrecv(gf[VOLUME], 1, gauge_ty_edge_gath, g_nb_y_dn, 109,
+               gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ], 1, gauge_ty_edge_cont, g_nb_y_up, 109,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the right in y direction */
+  /* receive the data from the neighbour on the left in y direction */
+  /* ty-edge */
+  MPI_Sendrecv(gf[VOLUME + (LY-1)*LZ], 1, gauge_ty_edge_gath, g_nb_y_up, 110,
+               gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 2*LX*LZ], 1, gauge_ty_edge_cont, g_nb_y_dn, 110,
+               g_cart_grid, &status);
+
+
+
+  if(g_dbw2rand > 0) {
+    /* send the data to the neighbour on the left in x direction */
+    /* receive the data from the neighbour on the right in x direction */
+    /* x2y edge */
+    MPI_Sendrecv(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + LZ],
+                 1, gauge_yx_edge_gath, g_nb_x_dn, 97,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ],
+                 1, gauge_yx_edge_cont, g_nb_x_up, 97,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the right in x direction */
+    /* receive the data from the neighbour on the left in x direction */
+    /* x2y-edge */
+    MPI_Sendrecv(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + (LX-2)*LZ],
+                 1, gauge_yx_edge_gath, g_nb_x_up, 98,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 2*T*LZ],
+                 1, gauge_yx_edge_cont, g_nb_x_dn, 98,
+                 g_cart_grid, &status);
+
+
+    /* send the data to the neighbour on the left in x direction */
+    /* receive the data from the neighbour on the right in x direction */
+    /* y2x -edge */
+    MPI_Sendrecv(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ],
+                 1, gauge_yx_edge_gath, g_nb_x_dn, 97,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 4*T*LZ],
+                 1, gauge_yx_edge_cont, g_nb_x_up, 97,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the right in x direction */
+    /* receive the data from the neighbour on the left in x direction */
+    /* y2x edge */
+    MPI_Sendrecv(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + (LX-1)*LZ],
+                 1, gauge_yx_edge_gath, g_nb_x_up, 98,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 6*T*LZ],
+                 1, gauge_yx_edge_cont, g_nb_x_dn, 98,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the left in y direction */
+    /* receive the data from the neighbour on the right in y direction */
+    /* t2y-edge */
+    MPI_Sendrecv(gf[VOLUMEPLUSRAND],
+                 1, gauge_ty_edge_gath, g_nb_y_dn, 197,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ],
+                 1, gauge_ty_edge_cont, g_nb_y_up, 197,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the right in y direction */
+    /* receive the data from the neighbour on the left in y direction */
+    /* t2y edge */
+    MPI_Sendrecv(gf[VOLUMEPLUSRAND + (LY-1)*LZ],
+                 1, gauge_ty_edge_gath, g_nb_y_up, 198,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 2*LX*LZ],
+                 1, gauge_ty_edge_cont, g_nb_y_dn, 198,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the left in y direction */
+    /* receive the data from the neighbour on the right in y direction */
+    /* y2t edge */
+    MPI_Sendrecv(gf[VOLUME + LZ],
+                 1, gauge_ty_edge_gath, g_nb_y_dn, 297,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 4*LX*LZ],
+                 1, gauge_ty_edge_cont, g_nb_y_up, 297,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the right in y direction */
+    /* receive the data from the neighbour on the left in y direction */
+    /* y2t-edge */
+    MPI_Sendrecv(gf[VOLUME + (LY-2)*LZ],
+                 1, gauge_ty_edge_gath, g_nb_y_up, 298,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 6*LX*LZ],
+                 1, gauge_ty_edge_cont, g_nb_y_dn, 298,
+                 g_cart_grid, &status);
+  }
+
+  /* end of if defined PARALLELXYT || PARALLELXYZT */
+# endif
+# if defined PARALLELXYZT
+  /* z boundary */
+  /* send the data to the neighbour on the left in z direction */
+  /* receive the data from the neighbour on the right in z direction */
+  MPI_Sendrecv(gf[0],
+               1, gauge_z_slice_gath, g_nb_z_dn, 303,
+               gf[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*LZ*T*LX],
+               1, gauge_z_slice_cont, g_nb_z_up, 303,
+               g_cart_grid, &status);
+  /* send the data to the neighbour on the right in z direction */
+  /* receive the data from the neighbour on the left in z direction */
+  MPI_Sendrecv(gf[LZ-1],
+               1, gauge_z_slice_gath, g_nb_z_up, 304,
+               gf[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ + T*LX*LY],
+               1, gauge_z_slice_cont, g_nb_z_dn, 304,
+               g_cart_grid, &status);
+
+  if(g_dbw2rand > 0) {
+    /* send the data to the neighbour on the left in z direction */
+    /* receive the data from the neighbour on the right in z direction */
+    /* z2 boundary */
+    MPI_Sendrecv(gf[1],
+                 1, gauge_z_slice_gath, g_nb_z_dn, 305,
+                 gf[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ + 2*T*LX*LZ],
+                 1, gauge_z_slice_cont, g_nb_z_up, 305,
+                 g_cart_grid, &status);
+    /* send the data to the neighbour on the right in z direction */
+    /* receive the data from the neighbour on the left in z direction */
+    /* z2 boundary */
+    MPI_Sendrecv(gf[LZ-2],
+                 1, gauge_z_slice_gath, g_nb_z_up, 306,
+                 gf[VOLUMEPLUSRAND+(2*LX+2*T)*LY*LZ + 2*T*LX*LZ + T*LX*LY],
+                 1, gauge_z_slice_cont, g_nb_z_dn, 306,
+                 g_cart_grid, &status);
+  }
+
+  /* edges */
+
+  /* send the data to the neighbour on the left in x direction */
+  /* receive the data from the neighbour on the right in x direction */
+  /* is on the z boundary -> zx-edge */
+  MPI_Sendrecv(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ],
+               1, gauge_zx_edge_gath, g_nb_x_dn, 307,
+               gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ],
+               1, gauge_zx_edge_cont, g_nb_x_up, 307,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the right in x direction */
+  /* receive the data from the neighbour on the left in x direction */
+  /* zx-edge */
+  MPI_Sendrecv(gf[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ + (LX-1)*LY],
+               1, gauge_zx_edge_gath, g_nb_x_up, 308,
+               gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 2*T*LY],
+               1, gauge_zx_edge_cont, g_nb_x_dn, 308,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the left in z direction */
+  /* receive the data from the neighbour on the right in z direction */
+  /* is on the t boundary -> tz-edge */
+  MPI_Sendrecv(gf[VOLUME],
+               1, gauge_tz_edge_gath, g_nb_z_dn, 309,
+               gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY],
+               1, gauge_tz_edge_cont, g_nb_z_up, 309,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the right in z direction */
+  /* receive the data from the neighbour on the left in z direction */
+  /* tz-edge */
+  MPI_Sendrecv(gf[VOLUME + (LZ-1)],
+               1, gauge_tz_edge_gath, g_nb_z_up, 310,
+               gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 2*LX*LY],
+               1, gauge_tz_edge_cont, g_nb_z_dn, 310,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the left in y direction */
+  /* receive the data from the neighbour on the right in y direction */
+  /* is on the z boundary -> zy-edge */
+  MPI_Sendrecv(gf[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ],
+               1, gauge_zy_edge_gath, g_nb_y_dn, 310,
+               gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY],
+               1, gauge_zy_edge_cont, g_nb_y_up, 310,
+               g_cart_grid, &status);
+
+  /* send the data to the neighbour on the right in y direction */
+  /* receive the data from the neighbour on the left in y direction */
+  /* zy-edge */
+  MPI_Sendrecv(gf[VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ + (LY-1)],
+               1, gauge_zy_edge_gath, g_nb_y_up, 310,
+               gf[VOLUME + RAND + 4*LY*LZ + 4*T*LZ + 4*LX*LZ + 4*T*LY + 4*LX*LY + 2*T*LX],
+               1, gauge_zy_edge_cont, g_nb_y_dn, 310,
+               g_cart_grid, &status);
+
+  /* rectangular gauge action Stuff! */
+  if(g_dbw2rand > 0) {
+    /* send the data to the neighbour on the left in z direction */
+    /* receive the data from the neighbour on the right in z direction */
+    /* t2z edge */
+    MPI_Sendrecv(gf[VOLUMEPLUSRAND],
+                 1, gauge_tz_edge_gath, g_nb_z_dn, 500,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ],
+                 1, gauge_tz_edge_cont, g_nb_z_up, 500,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the right in z direction */
+    /* receive the data from the neighbour on the left in z direction */
+    /* t2z-edge */
+    MPI_Sendrecv(gf[VOLUMEPLUSRAND + (LZ-1)],
+                 1, gauge_tz_edge_gath, g_nb_z_up, 501,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 2*LX*LY],
+                 1, gauge_tz_edge_cont, g_nb_z_dn, 501,
+                 g_cart_grid, &status);
+
+
+    /* send the data to the neighbour on the left in z direction */
+    /* receive the data from the neighbour on the right in z direction */
+    /* z2t -edge */
+    MPI_Sendrecv(gf[VOLUME + 1],
+                 1, gauge_tz_edge_gath, g_nb_z_dn, 502,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 4*LX*LY],
+                 1, gauge_tz_edge_cont, g_nb_z_up, 502,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the right in z direction */
+    /* receive the data from the neighbour on the left in z direction */
+    /* z2t edge */
+    MPI_Sendrecv(gf[VOLUME + (LZ-2)],
+                 1, gauge_tz_edge_gath, g_nb_z_up, 503,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 6*LX*LY],
+                 1, gauge_tz_edge_cont, g_nb_z_dn, 503,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the left in x direction */
+    /* receive the data from the neighbour on the right in x direction */
+    /* z2x-edge */
+    MPI_Sendrecv(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ],
+                 1, gauge_zx_edge_gath, g_nb_x_dn, 504,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY],
+                 1, gauge_zx_edge_cont, g_nb_x_up, 504,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the right in x direction */
+    /* receive the data from the neighbour on the left in x direction */
+    /* z2x edge */
+    MPI_Sendrecv(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LX-1)*LY],
+                 1, gauge_zx_edge_gath, g_nb_x_up, 504,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 2*T*LY],
+                 1, gauge_zx_edge_cont, g_nb_x_dn, 504,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the left in x direction */
+    /* receive the data from the neighbour on the right in x direction */
+    /* x2z edge */
+    MPI_Sendrecv(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + LY],
+                 1, gauge_zx_edge_gath, g_nb_x_dn, 505,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 4*T*LY],
+                 1, gauge_zx_edge_cont, g_nb_x_up, 505,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the right in x direction */
+    /* receive the data from the neighbour on the left in x direction */
+    /* x2z-edge */
+    MPI_Sendrecv(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LX-2)*LY],
+                 1, gauge_zx_edge_gath, g_nb_x_up, 506,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 6*T*LY],
+                 1, gauge_zx_edge_cont, g_nb_x_dn, 506,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the left in y direction */
+    /* receive the data from the neighbour on the right in y direction */
+    /* z2y-edge */
+    MPI_Sendrecv(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ],
+                 1, gauge_zy_edge_gath, g_nb_y_dn, 507,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY],
+                 1, gauge_zy_edge_cont, g_nb_y_up, 507,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the right in y direction */
+    /* receive the data from the neighbour on the left in y direction */
+    /* z2y edge */
+    MPI_Sendrecv(gf[VOLUMEPLUSRAND + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LY-1)],
+                 1, gauge_zy_edge_gath, g_nb_y_up, 508,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 2*T*LX],
+                 1, gauge_zy_edge_cont, g_nb_y_dn, 508,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the left in y direction */
+    /* receive the data from the neighbour on the right in y direction */
+    /* y2z edge */
+    MPI_Sendrecv(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + 1],
+                 1, gauge_zy_edge_gath, g_nb_y_dn, 509,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 4*T*LX],
+                 1, gauge_zy_edge_cont, g_nb_y_up, 509,
+                 g_cart_grid, &status);
+
+    /* send the data to the neighbour on the right in y direction */
+    /* receive the data from the neighbour on the left in y direction */
+    /* y2z-edge */
+    MPI_Sendrecv(gf[VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + (LY-2)],
+                 1, gauge_zy_edge_gath, g_nb_y_up, 510,
+                 gf[VOLUMEPLUSRAND + RAND + 8*LY*LZ + 8*T*LZ + 8*LX*LZ + 8*LX*LY + 8*T*LY + 6*T*LX],
+                 1, gauge_zy_edge_cont, g_nb_y_dn, 510,
+                 g_cart_grid, &status);
+
+  }
+
+
+  /* end of if defined PARALLELXYZT */
+# endif
+#endif
+  return;
+}
+
+# endif /* _INDEX_INDEP_GEOM */
+
+#endif /* _NON_BLOCKING */
+
diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_gauge.h b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_gauge.h
new file mode 100644
index 0000000000000000000000000000000000000000..172bcbfa270a6df2c46a5dac20a71d4cb3313c36
--- /dev/null
+++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_gauge.h
@@ -0,0 +1,33 @@
+/***********************************************************************
+ * Copyright (C) 
2002,2003,2004,2005,2006,2007,2008,2012 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see <http://www.gnu.org/licenses/>. + * + * exchange routines for gauge fields + * + * Author: Carsten Urbach + * + **********************************************************/ + + +#ifndef _XCHANGE_GAUGE_H +#define _XCHANGE_GAUGE_H + +#include "su3.h" + +void xchange_gauge(su3 ** const gf); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_halffield.c b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_halffield.c new file mode 100644 index 0000000000000000000000000000000000000000..73106a26ec59d12ab6f2df8704b9dcf2266fa9b1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_halffield.c @@ -0,0 +1,476 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +/********************************************************** + * + * exchange routines for half spinor fields + * + * Author: Carsten Urbach + * + **********************************************************/ + +#ifdef HAVE_CONFIG_H +# include +#endif +#include +#include +#include +#include +#ifdef MPI +# include +#endif +#include "global.h" +#if (defined XLC && defined BGL) +# include "bgl.h" +#endif +#include "mpi_init.h" +#include "su3.h" +#include "init/init_dirac_halfspinor.h" +#include "xchange_halffield.h" + +#if (defined _USE_HALFSPINOR) + +#if (defined _PERSISTENT) + +MPI_Request prequests[16]; + +/* 2. */ +void init_xchange_halffield() { + +# ifdef MPI + +# ifdef PARALLELT + int reqcount = 4; +# elif defined PARALLELXT + int reqcount = 8; +# elif defined PARALLELXYT + int reqcount = 12; +# elif defined PARALLELXYZT + int x0=0, x1=0, x2=0, ix=0; + int reqcount = 16; +# endif +# if (defined XLC && defined BGL) + __alignx(16, HalfSpinor); +# endif + + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + MPI_Send_init((void*)(sendBuffer), LX*LY*LZ*12/2, MPI_DOUBLE, + g_nb_t_up, 81, g_cart_grid, &prequests[0]); + + MPI_Recv_init((void*)(recvBuffer + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_DOUBLE, + g_nb_t_dn, 81, g_cart_grid, &prequests[1]); + + /* send the data to the neighbour on the left in t direction */ + /* recieve the data from the neighbour on the right in t direction */ + MPI_Send_init((void*)(sendBuffer + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_DOUBLE, + g_nb_t_dn, 82, g_cart_grid, &prequests[2]); + + MPI_Recv_init((void*)(recvBuffer), LX*LY*LZ*12/2, MPI_DOUBLE, + g_nb_t_up, 82, g_cart_grid, &prequests[3]); + +# if (defined PARALLELXT || 
defined PARALLELXYT || defined PARALLELXYZT) + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Send_init((void*)(sendBuffer + LX*LY*LZ), T*LY*LZ*12/2, MPI_DOUBLE, + g_nb_x_up, 91, g_cart_grid, &prequests[4]); + + MPI_Recv_init((void*)(recvBuffer + LX*LY*LZ + T*LY*LZ/2), T*LY*LZ*12/2, MPI_DOUBLE, + g_nb_x_dn, 91, g_cart_grid, &prequests[5]); + + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Send_init((void*)(sendBuffer + LX*LY*LZ + T*LY*LZ/2), T*LY*LZ*12/2, MPI_DOUBLE, + g_nb_x_dn, 92, g_cart_grid, &prequests[6]); + + MPI_Recv_init((void*)(recvBuffer + LX*LY*LZ), T*LY*LZ*12/2, MPI_DOUBLE, + g_nb_x_up, 92, g_cart_grid, &prequests[7]); +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Send_init((void*)(sendBuffer + LX*LY*LZ + T*LY*LZ), T*LX*LZ*12/2, MPI_DOUBLE, + g_nb_y_up, 101, g_cart_grid, &prequests[8]); + + MPI_Recv_init((void*)(recvBuffer + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2), T*LX*LZ*12/2, MPI_DOUBLE, + g_nb_y_dn, 101, g_cart_grid, &prequests[9]); + + /* send the data to the neighbour on the leftt in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Send_init((void*)(sendBuffer + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2), T*LX*LZ*12/2, MPI_DOUBLE, + g_nb_y_dn, 102, g_cart_grid, &prequests[10]); + + MPI_Recv_init((void*)(recvBuffer + LX*LY*LZ + T*LY*LZ), T*LX*LZ*12/2, MPI_DOUBLE, + g_nb_y_up, 102, g_cart_grid, &prequests[11]); +# endif + +# if (defined PARALLELXYZT) + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + MPI_Send_init((void*)(sendBuffer + LX*LY*LZ + T*LY*LZ + T*LX*LZ), + T*LX*LY*12/2, 
MPI_DOUBLE, g_nb_z_up, 503, g_cart_grid, &prequests[12]); + + MPI_Recv_init((void*)(recvBuffer + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2), + T*LX*LY*12/2, MPI_DOUBLE, g_nb_z_dn, 503, g_cart_grid, &prequests[13]); + + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Send_init((void*)(sendBuffer + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2), + 12*T*LX*LY/2, MPI_DOUBLE, g_nb_z_dn, 504, g_cart_grid, &prequests[14]); + + MPI_Recv_init((void*)(recvBuffer + LX*LY*LZ + T*LY*LZ + T*LX*LZ), + T*LX*LY*12/2, MPI_DOUBLE, g_nb_z_up, 504, g_cart_grid, &prequests[15]); +# endif +# endif /* MPI */ + return; +} + +/* 3. */ +void xchange_halffield() { +# ifdef MPI + + MPI_Status status[16]; +# ifdef PARALLELT + int reqcount = 4; +# elif defined PARALLELXT + int reqcount = 8; +# elif defined PARALLELXYT + int reqcount = 12; +# elif defined PARALLELXYZT + int x0=0, x1=0, x2=0, ix=0; + int reqcount = 16; +# endif +# if (defined XLC && defined BGL) + __alignx(16, HalfSpinor); +# endif + MPI_Startall(reqcount, prequests); + + MPI_Waitall(reqcount, prequests, status); +# endif /* MPI */ + return; +} + +#else /* def (_USE_SHMEM || _PERSISTENT) */ + +# if defined _INDEX_INDEP_GEOM + +/* 4. 
-IIG */ +void xchange_halffield() { + +# ifdef MPI + + MPI_Request requests[16]; + MPI_Status status[16]; +# if ((defined PARALLELT) || (defined PARALLELX)) + int reqcount = 4; +# elif ((defined PARALLELXT) || (defined PARALLELXY)) + int reqcount = 8; +# elif ((defined PARALLELXYT) || (defined PARALLELXYZ)) + int reqcount = 12; +# elif defined PARALLELXYZT + int reqcount = 16; +# endif +# if (defined XLC && defined BGL) + __alignx(16, HalfSpinor); +# endif + +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchangehalf) +#endif + +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + MPI_Isend((void*)(sendBuffer + g_HS_shift_t), LX*LY*LZ*12/2, MPI_DOUBLE, + g_nb_t_up, 81, g_cart_grid, &requests[0]); + MPI_Irecv((void*)(recvBuffer + g_HS_shift_t + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_DOUBLE, + g_nb_t_dn, 81, g_cart_grid, &requests[1]); + /* send the data to the neighbour on the left in t direction */ + /* recieve the data from the neighbour on the right in t direction */ + MPI_Isend((void*)(sendBuffer + g_HS_shift_t + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_DOUBLE, + g_nb_t_dn, 82, g_cart_grid, &requests[2]); + MPI_Irecv((void*)(recvBuffer + g_HS_shift_t), LX*LY*LZ*12/2, MPI_DOUBLE, + g_nb_t_up, 82, g_cart_grid, &requests[3]); +# endif +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Isend((void*)(sendBuffer + g_HS_shift_x), T*LY*LZ*12/2, MPI_DOUBLE, + g_nb_x_up, 91, g_cart_grid, &requests[4]); + MPI_Irecv((void*)(recvBuffer + g_HS_shift_x + T*LY*LZ/2), T*LY*LZ*12/2, MPI_DOUBLE, + g_nb_x_dn, 91, g_cart_grid, &requests[5]); + /* send the data to the neighbour on the left in x 
direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Isend((void*)(sendBuffer + g_HS_shift_x + T*LY*LZ/2), T*LY*LZ*12/2, MPI_DOUBLE, + g_nb_x_dn, 92, g_cart_grid, &requests[6]); + MPI_Irecv((void*)(recvBuffer + g_HS_shift_x), T*LY*LZ*12/2, MPI_DOUBLE, + g_nb_x_up, 92, g_cart_grid, &requests[7]); +# endif +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(sendBuffer + g_HS_shift_y), T*LX*LZ*12/2, MPI_DOUBLE, + g_nb_y_up, 101, g_cart_grid, &requests[8]); + MPI_Irecv((void*)(recvBuffer + g_HS_shift_y + T*LX*LZ/2), T*LX*LZ*12/2, MPI_DOUBLE, + g_nb_y_dn, 101, g_cart_grid, &requests[9]); + /* send the data to the neighbour on the leftt in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Isend((void*)(sendBuffer + g_HS_shift_y + T*LX*LZ/2), T*LX*LZ*12/2, MPI_DOUBLE, + g_nb_y_dn, 102, g_cart_grid, &requests[10]); + MPI_Irecv((void*)(recvBuffer + g_HS_shift_y), T*LX*LZ*12/2, MPI_DOUBLE, + g_nb_y_up, 102, g_cart_grid, &requests[11]); +# endif +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + MPI_Isend((void*)(sendBuffer + g_HS_shift_z), T*LX*LY*12/2, MPI_DOUBLE, + g_nb_z_up, 503, g_cart_grid, &requests[12]); + MPI_Irecv((void*)(recvBuffer + g_HS_shift_z + T*LX*LY/2), T*LX*LY*12/2, MPI_DOUBLE, + g_nb_z_dn, 503, g_cart_grid, &requests[13]); + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Isend((void*)(sendBuffer + g_HS_shift_z + T*LX*LY/2), 12*T*LX*LY/2, MPI_DOUBLE, + g_nb_z_dn, 504, g_cart_grid, &requests[14]); + MPI_Irecv((void*)(recvBuffer + g_HS_shift_z), 
T*LX*LY*12/2, MPI_DOUBLE, + g_nb_z_up, 504, g_cart_grid, &requests[15]); +# endif + + MPI_Waitall(reqcount, requests, status); +# endif /* MPI */ + return; + +#ifdef _KOJAK_INST +#pragma pomp inst end(xchangehalf) +#endif +} + +# else /* _INDEX_INDEP_GEOM */ + +/* 4. */ +void xchange_halffield() { + +# ifdef MPI + + MPI_Request requests[16]; + MPI_Status status[16]; +# ifdef PARALLELT + int reqcount = 4; +# elif defined PARALLELXT + int reqcount = 8; +# elif defined PARALLELXYT + int reqcount = 12; +# elif defined PARALLELXYZT + int reqcount = 16; +# endif +# if (defined XLC && defined BGL) + __alignx(16, HalfSpinor); +# endif + +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchangehalf) +#endif + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + MPI_Isend((void*)(sendBuffer), LX*LY*LZ*12/2, MPI_DOUBLE, + g_nb_t_up, 81, g_cart_grid, &requests[0]); + MPI_Irecv((void*)(recvBuffer + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_DOUBLE, + g_nb_t_dn, 81, g_cart_grid, &requests[1]); + + /* send the data to the neighbour on the left in t direction */ + /* recieve the data from the neighbour on the right in t direction */ + MPI_Isend((void*)(sendBuffer+ LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_DOUBLE, + g_nb_t_dn, 82, g_cart_grid, &requests[2]); + MPI_Irecv((void*)(recvBuffer), LX*LY*LZ*12/2, MPI_DOUBLE, + g_nb_t_up, 82, g_cart_grid, &requests[3]); + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Isend((void*)(sendBuffer + LX*LY*LZ), T*LY*LZ*12/2, MPI_DOUBLE, + g_nb_x_up, 91, g_cart_grid, &requests[4]); + MPI_Irecv((void*)(recvBuffer+ LX*LY*LZ + T*LY*LZ/2), T*LY*LZ*12/2, MPI_DOUBLE, + g_nb_x_dn, 91, g_cart_grid, &requests[5]); + + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the 
right in x direction */ + MPI_Isend((void*)(sendBuffer + LX*LY*LZ + T*LY*LZ/2), T*LY*LZ*12/2, MPI_DOUBLE, + g_nb_x_dn, 92, g_cart_grid, &requests[6]); + MPI_Irecv((void*)(recvBuffer + LX*LY*LZ), T*LY*LZ*12/2, MPI_DOUBLE, + g_nb_x_up, 92, g_cart_grid, &requests[7]); +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(sendBuffer + LX*LY*LZ + T*LY*LZ), T*LX*LZ*12/2, MPI_DOUBLE, + g_nb_y_up, 101, g_cart_grid, &requests[8]); + MPI_Irecv((void*)(recvBuffer + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2), T*LX*LZ*12/2, MPI_DOUBLE, + g_nb_y_dn, 101, g_cart_grid, &requests[9]); + + /* send the data to the neighbour on the leftt in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Isend((void*)(sendBuffer + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2), T*LX*LZ*12/2, MPI_DOUBLE, + g_nb_y_dn, 102, g_cart_grid, &requests[10]); + MPI_Irecv((void*)(recvBuffer + LX*LY*LZ + T*LY*LZ), T*LX*LZ*12/2, MPI_DOUBLE, + g_nb_y_up, 102, g_cart_grid, &requests[11]); +# endif + +# if (defined PARALLELXYZT) + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + MPI_Isend((void*)(sendBuffer + LX*LY*LZ + T*LY*LZ + T*LX*LZ), + T*LX*LY*12/2, MPI_DOUBLE, g_nb_z_up, 503, g_cart_grid, &requests[12]); + MPI_Irecv((void*)(recvBuffer + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2), + T*LX*LY*12/2, MPI_DOUBLE, g_nb_z_dn, 503, g_cart_grid, &requests[13]); + + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Isend((void*)(sendBuffer + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2), + 12*T*LX*LY/2, MPI_DOUBLE, g_nb_z_dn, 504, g_cart_grid, &requests[14]); + MPI_Irecv((void*)(recvBuffer + LX*LY*LZ + T*LY*LZ + T*LX*LZ), + T*LX*LY*12/2, MPI_DOUBLE, g_nb_z_up, 504, 
g_cart_grid, &requests[15]); +# endif + + MPI_Waitall(reqcount, requests, status); +# endif /* MPI */ + return; + +#ifdef _KOJAK_INST +#pragma pomp inst end(xchangehalf) +#endif +} + +# endif /* _INDEX_INDEP_GEOM */ + +#endif /* def (_USE_SHMEM || _PERSISTENT) */ + + +# if defined _INDEX_INDEP_GEOM +// IIG xchange_halffield32 still Missing +# else // defined _INDEX_INDEP_GEOM +/* 32-2. */ +void xchange_halffield32() { + +# ifdef MPI + + MPI_Request requests[16]; + MPI_Status status[16]; +# ifdef PARALLELT + int reqcount = 4; +# elif defined PARALLELXT + int reqcount = 8; +# elif defined PARALLELXYT + int reqcount = 12; +# elif defined PARALLELXYZT + int reqcount = 16; +# endif +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchangehalf32) +#endif +# if (defined XLC && defined BGL) + __alignx(16, HalfSpinor32); +# endif + + /* send the data to the neighbour on the right in t direction */ + /* recieve the data from the neighbour on the left in t direction */ + MPI_Isend((void*)(sendBuffer32), LX*LY*LZ*12/2, MPI_FLOAT, + g_nb_t_up, 81, g_cart_grid, &requests[0]); + MPI_Irecv((void*)(recvBuffer32 + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_FLOAT, + g_nb_t_dn, 81, g_cart_grid, &requests[1]); + + /* send the data to the neighbour on the left in t direction */ + /* recieve the data from the neighbour on the right in t direction */ + MPI_Isend((void*)(sendBuffer32 + LX*LY*LZ/2), LX*LY*LZ*12/2, MPI_FLOAT, + g_nb_t_dn, 82, g_cart_grid, &requests[2]); + MPI_Irecv((void*)(recvBuffer32), LX*LY*LZ*12/2, MPI_FLOAT, + g_nb_t_up, 82, g_cart_grid, &requests[3]); + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Isend((void*)(sendBuffer32 + LX*LY*LZ), T*LY*LZ*12/2, MPI_FLOAT, + g_nb_x_up, 91, g_cart_grid, &requests[4]); + MPI_Irecv((void*)(recvBuffer32 + LX*LY*LZ + T*LY*LZ/2), T*LY*LZ*12/2, MPI_FLOAT, + g_nb_x_dn, 91, 
g_cart_grid, &requests[5]); + + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Isend((void*)(sendBuffer32 + LX*LY*LZ + T*LY*LZ/2), T*LY*LZ*12/2, MPI_FLOAT, + g_nb_x_dn, 92, g_cart_grid, &requests[6]); + MPI_Irecv((void*)(recvBuffer32 + LX*LY*LZ), T*LY*LZ*12/2, MPI_FLOAT, + g_nb_x_up, 92, g_cart_grid, &requests[7]); +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(sendBuffer32 + LX*LY*LZ + T*LY*LZ), T*LX*LZ*12/2, MPI_FLOAT, + g_nb_y_up, 101, g_cart_grid, &requests[8]); + MPI_Irecv((void*)(recvBuffer32 + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2), T*LX*LZ*12/2, MPI_FLOAT, + g_nb_y_dn, 101, g_cart_grid, &requests[9]); + + /* send the data to the neighbour on the leftt in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Isend((void*)(sendBuffer32 + LX*LY*LZ + T*LY*LZ + T*LX*LZ/2), T*LX*LZ*12/2, MPI_FLOAT, + g_nb_y_dn, 102, g_cart_grid, &requests[10]); + MPI_Irecv((void*)(recvBuffer32 + LX*LY*LZ + T*LY*LZ), T*LX*LZ*12/2, MPI_FLOAT, + g_nb_y_up, 102, g_cart_grid, &requests[11]); +# endif + +# if (defined PARALLELXYZT) + /* send the data to the neighbour on the right in z direction */ + /* recieve the data from the neighbour on the left in z direction */ + MPI_Isend((void*)(sendBuffer32 + LX*LY*LZ + T*LY*LZ + T*LX*LZ), + T*LX*LY*12/2, MPI_FLOAT, g_nb_z_up, 503, g_cart_grid, &requests[12]); + MPI_Irecv((void*)(recvBuffer32 + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2), + T*LX*LY*12/2, MPI_FLOAT, g_nb_z_dn, 503, g_cart_grid, &requests[13]); + + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Isend((void*)(sendBuffer32 + LX*LY*LZ + T*LY*LZ + T*LX*LZ + T*LX*LY/2), + 12*T*LX*LY/2, MPI_FLOAT, 
g_nb_z_dn, 504, g_cart_grid, &requests[14]); + MPI_Irecv((void*)(recvBuffer32 + LX*LY*LZ + T*LY*LZ + T*LX*LZ), + T*LX*LY*12/2, MPI_FLOAT, g_nb_z_up, 504, g_cart_grid, &requests[15]); +# endif + + MPI_Waitall(reqcount, requests, status); +# endif /* MPI */ + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchangehalf32) +#endif +} +# endif /* defined _INDEX_INDEP_GEOM */ +#endif /* defined _USE_HALFSPINOR */ + + + + + + + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_halffield.h b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_halffield.h new file mode 100644 index 0000000000000000000000000000000000000000..688a8024f1c4dfc4df13a1cc992d704f2c032f7d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_halffield.h @@ -0,0 +1,25 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +#ifndef _XCHANGE_HALFFIELD_H +#define _XCHANGE_HALFFIELD_H + +void init_xchange_halffield(); +void xchange_halffield(); +void xchange_halffield32(); +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_jacobi.c b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_jacobi.c new file mode 100644 index 0000000000000000000000000000000000000000..de1985af25fc279073cc3df70e47e3f0b3334964 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_jacobi.c @@ -0,0 +1,110 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ +/********************************************************** + * + * exchange routines for su3_vector fields + * + * Author: Luigi Scorzato + * + **********************************************************/ + +#ifdef HAVE_CONFIG_H +# include<config.h> +#endif +#include <stdlib.h> +#include <stdio.h> +#include <math.h> +#include <errno.h> +#ifdef MPI +# include <mpi.h> +#endif +/* NOTE(review): the header names above were missing from the #include lines and are restored from the customary tmLQCD preamble -- confirm against upstream */ + +#include "global.h" +#if (defined XLC && defined BGL) +# include "bgl.h" +#endif +#include "mpi_init.h" +#include "su3.h" +#include "xchange_jacobi.h" + +#ifdef WITHLAPH +/* Note that LAPH also implies _INDEX_INDEP_GEOM, NO PARALLELT* */ + +/* exchanges the boundary slices of the su3_vector field l with the x, y and z neighbours enabled by the PARALLELX* macros, using one blocking MPI_Sendrecv per direction and orientation; does nothing when MPI is not defined */ +void xchange_jacobi(su3_vector * const l) { + +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchange_jacobi) +#endif + +# ifdef MPI + + MPI_Status status; +# if (defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* receive the data from the neighbour on the right in x direction */ + MPI_Sendrecv((void*)(l+gI_0_0_0), 1, jfield_x_slice_gath, g_nb_x_dn, 5091, + (void*)(l+gI_L_0_0), 1, jfield_x_slice_cont, g_nb_x_up, 5091, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + MPI_Sendrecv((void*)(l+gI_Lm1_0_0), 1, jfield_x_slice_gath, g_nb_x_up, 5092, + (void*)(l+gI_m1_0_0), 1, jfield_x_slice_cont, g_nb_x_dn, 5092, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* receive the data from the neighbour on the right in y direction */ + MPI_Sendrecv((void*)(l+gI_0_0_0), 1, jfield_y_slice_gath, g_nb_y_dn, 5101, + (void*)(l+gI_0_L_0), 1, jfield_y_slice_cont, g_nb_y_up, 5101, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in 
y direction */ + MPI_Sendrecv((void*)(l+gI_0_Lm1_0), 1, jfield_y_slice_gath, g_nb_y_up, 5102, + (void*)(l+gI_0_m1_0), 1, jfield_y_slice_cont, g_nb_y_dn, 5102, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYZ ) + /* send the data to the neighbour on the left in z direction */ + /* receive the data from the neighbour on the right in z direction */ + MPI_Sendrecv((void*)(l+gI_0_0_0), 1, jfield_z_slice_gath, g_nb_z_dn, 5503, + (void*)(l+gI_0_0_L), 1, jfield_z_slice_cont, g_nb_z_up, 5503, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + MPI_Sendrecv((void*)(l+gI_0_0_Lm1), 1, jfield_z_slice_gath, g_nb_z_up, 5504, + (void*)(l+gI_0_0_m1), 1, jfield_z_slice_cont, g_nb_z_dn, 5504, + g_cart_grid, &status); + +# endif +# endif // MPI + /* close the instrumented region before returning so the end marker is reachable */ +#ifdef _KOJAK_INST +#pragma pomp inst end(xchange_jacobi) +#endif + return; +} + +#endif // WITHLAPH diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_jacobi.h b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_jacobi.h new file mode 100644 index 0000000000000000000000000000000000000000..3c8916a8fcc7eebf2e8be7a8b2b47a1947738ee3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_jacobi.h @@ -0,0 +1,25 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _XCHANGE_JACOBI_H +#define _XCHANGE_JACOBI_H + +void xchange_jacobi(su3_vector * const s); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_lexicfield.c b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_lexicfield.c new file mode 100644 index 0000000000000000000000000000000000000000..9499f055bdbc8843d4bb70779f44f8bb355a6dbe --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_lexicfield.c @@ -0,0 +1,806 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . 
+ ***********************************************************************/ + +/********************************************************** + * + * exchange routines for lexicographic spinor fields + * (not even/odd) + * + * Author: Carsten Urbach + * + **********************************************************/ + +#ifdef HAVE_CONFIG_H +# include<config.h> +#endif +#include <stdlib.h> +#include <stdio.h> +#include <math.h> +#include <errno.h> +#ifdef MPI +# include <mpi.h> +#endif +/* NOTE(review): the header names above were missing from the #include lines and are restored from the customary tmLQCD preamble -- confirm against upstream */ + +#include "global.h" +#if (defined XLC && defined BGL) +# include "bgl.h" +#endif +#include "mpi_init.h" +#include "su3.h" +#include "xchange_lexicfield.h" + +/* this version uses non-blocking MPI calls */ +#if (defined _NON_BLOCKING) + +/* this is the version independent of the content of the function Index (only available with non-blocking)) */ +/* this if statement will be removed in future and _INDEX_INDEP_GEOM will be the default */ +# if defined _INDEX_INDEP_GEOM + +/* exchanges the boundary slices of the lexicographic spinor field l: posts one MPI_Isend/MPI_Irecv pair per enabled direction towards the lower neighbour (request slots ireq, ireq+1 starting at 0), then one pair per direction towards the upper neighbour (starting at 2), and waits for all reqcount requests */ +void xchange_lexicfield(spinor * const l) { + +#ifdef MPI + MPI_Request requests[16]; + MPI_Status status[16]; +#endif + int ireq; +# if ( defined PARALLELT || defined PARALLELX ) + int reqcount = 4; +# elif ( defined PARALLELXT || defined PARALLELXY ) + int reqcount = 8; +# elif ( defined PARALLELXYT || defined PARALLELXYZ ) + int reqcount = 12; +# elif defined PARALLELXYZT + int reqcount = 16; +# endif + +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchange_lexicfield) +#endif +# if (defined BGL && defined XLC) + __alignx(16, l); +# endif + +# ifdef MPI + + + ireq=0; + +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the left */ + /* receive the data from the neighbour on the right */ + MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_time_slice_cont, g_nb_t_dn, 5081, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+gI_L_0_0_0), 1, lfield_time_slice_cont, g_nb_t_up, 5081, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + + +# if (defined PARALLELXT || defined PARALLELXYT || 
defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_x_slice_gath, g_nb_x_dn, 5091, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+gI_0_L_0_0), 1, lfield_x_slice_cont, g_nb_x_up, 5091, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_y_slice_gath, g_nb_y_dn, 5101, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+gI_0_0_L_0), 1, lfield_y_slice_cont, g_nb_y_up, 5101, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_z_slice_gath, g_nb_z_dn, 5503, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+gI_0_0_0_L), 1, lfield_z_slice_cont, g_nb_z_up, 5503, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + + ireq=2; + +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Isend((void*)(l+gI_Lm1_0_0_0), 1, lfield_time_slice_cont, g_nb_t_up, 5082, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+gI_m1_0_0_0), 1, lfield_time_slice_cont, g_nb_t_dn, 5082, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +#endif + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send 
the data to the neighbour on the right in x direction */ + /* receive the data from the neighbour on the left in x direction */ + MPI_Isend((void*)(l+gI_0_Lm1_0_0), 1, lfield_x_slice_gath, g_nb_x_up, 5092, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+gI_0_m1_0_0), 1, lfield_x_slice_cont, g_nb_x_dn, 5092, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(l+gI_0_0_Lm1_0), 1, lfield_y_slice_gath, g_nb_y_up, 5102, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+gI_0_0_m1_0), 1, lfield_y_slice_cont, g_nb_y_dn, 5102, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if ( defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + MPI_Isend((void*)(l+gI_0_0_0_Lm1), 1, lfield_z_slice_gath, g_nb_z_up, 5504, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+gI_0_0_0_m1), 1, lfield_z_slice_cont, g_nb_z_dn, 5504, g_cart_grid, &requests[ireq+1]); +# endif + + MPI_Waitall(reqcount, requests, status); + +# endif + /* close the instrumented region before returning so the end marker is reachable */ +#ifdef _KOJAK_INST +#pragma pomp inst end(xchange_lexicfield) +#endif + return; +} +# else /* _INDEX_INDEP_GEOM */ + +/* non-blocking exchange of the lexicographic spinor field l using fixed request slots and explicit buffer offsets; the MPI types are only declared when MPI is defined so that a build without MPI still compiles */ +void xchange_lexicfield(spinor * const l) { + +#ifdef MPI + MPI_Request requests[16]; + MPI_Status status[16]; +#endif +# ifdef PARALLELT + int reqcount = 4; +# elif defined PARALLELXT + int reqcount = 8; +# elif defined PARALLELXYT + int reqcount = 12; +# elif defined PARALLELXYZT + int reqcount = 16; +# endif +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchange_lexicfield) +#endif +# if (defined BGL && defined XLC) + __alignx(16, l); +# endif + +# ifdef MPI + + + /* send the data to the neighbour on the left */ + /* receive the data from the neighbour on the 
right */ + MPI_Isend((void*)l, 1, lfield_time_slice_cont, g_nb_t_dn, 5081, g_cart_grid, &requests[0]); + MPI_Irecv((void*)(l+VOLUME), 1, lfield_time_slice_cont, g_nb_t_up, 5081, g_cart_grid, &requests[1]); +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Isend((void*)l, 1, lfield_x_slice_gath, g_nb_x_dn, 5091, g_cart_grid, &requests[4]); + MPI_Irecv((void*)(l+(T+2)*LX*LY*LZ), 1, lfield_x_slice_cont, g_nb_x_up, 5091, g_cart_grid, &requests[5]); + +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Isend((void*)l, 1, lfield_y_slice_gath, g_nb_y_dn, 5101, g_cart_grid, &requests[8]); + MPI_Irecv((void*)(l + VOLUME + 2*LZ*(LX*LY + T*LY)), 1, lfield_y_slice_cont, g_nb_y_up, 5101, g_cart_grid, &requests[9]); +# endif + +# if (defined PARALLELXYZT) + + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Isend((void*)l, 1, lfield_z_slice_gath, g_nb_z_dn, 5503, g_cart_grid, &requests[12]); + MPI_Irecv((void*)(l+VOLUME + 2*LZ*(LX*LY + T*LY) + 2*LZ*T*LX), 1, lfield_z_slice_cont, g_nb_z_up, 5503, g_cart_grid, &requests[13]); +# endif + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Isend((void*)(l+(T-1)*LX*LY*LZ), 1, lfield_time_slice_cont, g_nb_t_up, 5082, g_cart_grid, &requests[2]); + MPI_Irecv((void*)(l+(T+1)*LX*LY*LZ), 1, lfield_time_slice_cont, g_nb_t_dn, 5082, g_cart_grid, &requests[3]); + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + 
MPI_Isend((void*)(l+(LX-1)*LY*LZ), 1, lfield_x_slice_gath, g_nb_x_up, 5092, g_cart_grid, &requests[6]); + MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)), 1, lfield_x_slice_cont, g_nb_x_dn, 5092, g_cart_grid, &requests[7]); +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the right in y direction */ + /* receive the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(l+(LY-1)*LZ), 1, lfield_y_slice_gath, g_nb_y_up, 5102, g_cart_grid, &requests[10]); + MPI_Irecv((void*)(l+VOLUME + 2*LZ*(LX*LY + T*LY) + T*LX*LZ), 1, lfield_y_slice_cont, g_nb_y_dn, 5102, g_cart_grid, &requests[11]); +# endif + +# if defined PARALLELXYZT + + /* send the data to the neighbour on the right in z direction */ + /* receive the data from the neighbour on the left in z direction */ + MPI_Isend((void*)(l+LZ-1), 1, lfield_z_slice_gath, g_nb_z_up, 5504, g_cart_grid, &requests[14]); + MPI_Irecv((void*)(l+VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ + T*LX*LY), 1, lfield_z_slice_cont, g_nb_z_dn, 5504, g_cart_grid, &requests[15]); +# endif + + MPI_Waitall(reqcount, requests, status); + +# endif + /* close the instrumented region before returning so the end marker is reachable */ +#ifdef _KOJAK_INST +#pragma pomp inst end(xchange_lexicfield) +#endif + return; +} + +# endif /* _INDEX_INDEP_GEOM */ + +/* Here comes the naive version */ +/* Using MPI_Sendrecv */ +#else /* _NON_BLOCKING */ + +/* this is the version independent of the content of the function Index (only available with non-blocking)) */ +/* this if statement will be removed in future and _INDEX_INDEP_GEOM will be the default */ +# if defined _INDEX_INDEP_GEOM + +/* exchanges the field l with one blocking MPI_Sendrecv per enabled direction and orientation */ +void xchange_lexicfield(spinor * const l) { + +# ifdef PARALLELXYZT + int x0=0, x1=0, x2=0, ix=0; +# endif +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchange_lexicfield) +#endif + +# ifdef MPI + + /* declared for every parallelisation: the x, y and z exchanges below use it even when the t direction is not parallelised */ + MPI_Status status; +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left */ + /* 
recieve the data from the neighbour on the right */ + MPI_Sendrecv((void*)(l+gI_0_0_0_0), 1, lfield_time_slice_cont, g_nb_t_dn, 5081, + (void*)(l+gI_L_0_0_0), 1, lfield_time_slice_cont, g_nb_t_up, 5081, + g_cart_grid, &status); + + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Sendrecv((void*)(l+gI_Lm1_0_0_0), 1, lfield_time_slice_cont, g_nb_t_up, 5082, + (void*)(l+gI_m1_0_0_0), 1, lfield_time_slice_cont, g_nb_t_dn, 5082, + g_cart_grid, &status); +# endif +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Sendrecv((void*)(l+gI_0_0_0_0), 1, lfield_x_slice_gath, g_nb_x_dn, 5091, + (void*)(l+gI_0_L_0_0), 1, lfield_x_slice_cont, g_nb_x_up, 5091, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Sendrecv((void*)(l+gI_0_Lm1_0_0), 1, lfield_x_slice_gath, g_nb_x_up, 5092, + (void*)(l+gI_0_m1_0_0), 1, lfield_x_slice_cont, g_nb_x_dn, 5092, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Sendrecv((void*)(l+gI_0_0_0_0), 1, lfield_y_slice_gath, g_nb_y_dn, 5101, + (void*)(l+gI_0_0_L_0), 1, lfield_y_slice_cont, g_nb_y_up, 5101, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Sendrecv((void*)(l+gI_0_0_Lm1_0), 1, lfield_y_slice_gath, g_nb_y_up, 5102, + (void*)(l+gI_0_0_m1_0), 1, lfield_y_slice_cont, g_nb_y_dn, 5102, + 
g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Sendrecv((void*)(l+gI_0_0_0_0), 1, lfield_z_slice_gath, g_nb_z_dn, 5503, + (void*)(l+gI_0_0_0_L), 1, lfield_z_slice_cont, g_nb_z_up, 5503, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Sendrecv((void*)(l+gI_0_0_0_Lm1), 1, lfield_z_slice_gath, g_nb_z_up, 5504, + (void*)(l+gI_0_0_0_m1), 1, lfield_z_slice_cont, g_nb_z_dn, 5504, + g_cart_grid, &status); + +# endif +# endif + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchange_lexicfield) +#endif +} + +# else // _INDEX_INDEP_GEOM + +/* exchanges the field l */ +void xchange_lexicfield(spinor * const l) { + +# ifdef PARALLELXYZT + int x0=0, x1=0, x2=0, ix=0; +# endif +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchange_lexicfield) +#endif + +# ifdef MPI + + MPI_Status status; + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + MPI_Sendrecv((void*)l, 1, lfield_time_slice_cont, g_nb_t_dn, 5081, + (void*)(l+T*LX*LY*LZ), 1, lfield_time_slice_cont, g_nb_t_up, 5081, + g_cart_grid, &status); + + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Sendrecv((void*)(l+(T-1)*LX*LY*LZ), 1, lfield_time_slice_cont, g_nb_t_up, 5082, + (void*)(l+(T+1)*LX*LY*LZ), 1, lfield_time_slice_cont, g_nb_t_dn, 5082, + g_cart_grid, &status); + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Sendrecv((void*)l, 1, lfield_x_slice_gath, g_nb_x_dn, 5091, + (void*)(l+(T+2)*LX*LY*LZ), 1, lfield_x_slice_cont, 
g_nb_x_up, 5091, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Sendrecv((void*)(l+(LX-1)*LY*LZ), 1, lfield_x_slice_gath, g_nb_x_up, 5092, + (void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)), 1, lfield_x_slice_cont, g_nb_x_dn, 5092, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Sendrecv((void*)l, 1, lfield_y_slice_gath, g_nb_y_dn, 5101, + (void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ)), 1, lfield_y_slice_cont, g_nb_y_up, 5101, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Sendrecv((void*)(l+(LY-1)*LZ/2), 1, lfield_y_slice_gath, g_nb_y_up, 5102, + (void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)), 1, lfield_y_slice_cont, g_nb_y_dn, 5102, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYZT) + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Sendrecv((void*)l, + 1, lfield_z_slice_gath, g_nb_z_dn, 5503, + (void*)(l + VOLUME + 2*LZ*(LX*LY + T*LY) + 2*LZ*T*LX), + 1, lfield_z_slice_cont, g_nb_z_up, 5503, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Sendrecv((void*)(l+LZ-1), + 1, lfield_z_slice_gath, g_nb_z_up, 5504, + (void*)(l+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)), + 1, lfield_z_slice_cont, g_nb_z_dn, 5504, + g_cart_grid, &status); + +# endif +# endif + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchange_lexicfield) +#endif +} + +# endif // _INDEX_INDEP_GEOM + +#endif + + + + + 
+/*********************************************************************** + **************** 32 bit versions ******************** + ***********************************************************************/ + + + + + +/* this version uses non-blocking MPI calls */ +#if (defined _NON_BLOCKING) + +/* this is the version independent of the content of the function Index (only available with non-blocking)) */ +/* this if statement will be removed in future and _INDEX_INDEP_GEOM will be the default */ +# if defined _INDEX_INDEP_GEOM + +void xchange_lexicfield32(spinor32 * const l) { + +#ifdef MPI + MPI_Request requests[16]; + MPI_Status status[16]; +#endif + int ireq; +# if ( defined PARALLELT || defined PARALLELX ) + int reqcount = 4; +# elif ( defined PARALLELXT || defined PARALLELXY ) + int reqcount = 8; +# elif ( defined PARALLELXYT || defined PARALLELXYZ ) + int reqcount = 12; +# elif defined PARALLELXYZT + int ix=0; + int reqcount = 16; +# endif + +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchange_lexicfield32) +#endif +# if (defined BGL && defined XLC) + __alignx(16, l); +# endif + +# ifdef MPI + + + ireq=0; + +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_dn, 5081, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+gI_L_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_up, 5081, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_x_slice_gath32, g_nb_x_dn, 5091, g_cart_grid, &requests[ireq]); + 
MPI_Irecv((void*)(l+gI_0_L_0_0), 1, lfield_x_slice_cont32, g_nb_x_up, 5091, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_y_slice_gath32, g_nb_y_dn, 5101, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+gI_0_0_L_0), 1, lfield_y_slice_cont32, g_nb_y_up, 5101, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Isend((void*)(l+gI_0_0_0_0), 1, lfield_z_slice_gath32, g_nb_z_dn, 5503, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+gI_0_0_0_L), 1, lfield_z_slice_cont32, g_nb_z_up, 5503, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + + ireq=2; + +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT ) + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Isend((void*)(l+gI_Lm1_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_up, 5082, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+gI_m1_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_dn, 5082, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +#endif + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Isend((void*)(l+gI_0_Lm1_0_0), 1, lfield_x_slice_gath32, g_nb_x_up, 5092, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+gI_0_m1_0_0), 1, lfield_x_slice_cont32, g_nb_x_dn, 5092, 
g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(l+gI_0_0_Lm1_0), 1, lfield_y_slice_gath32, g_nb_y_up, 5102, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+gI_0_0_m1_0), 1, lfield_y_slice_cont32, g_nb_y_dn, 5102, g_cart_grid, &requests[ireq+1]); + ireq=ireq+4; +# endif + +# if ( defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(l+gI_0_0_0_Lm1), 1, lfield_z_slice_gath32, g_nb_z_up, 5504, g_cart_grid, &requests[ireq]); + MPI_Irecv((void*)(l+gI_0_0_0_m1), 1, lfield_z_slice_cont32, g_nb_z_dn, 5504, g_cart_grid, &requests[ireq+1]); +# endif + + MPI_Waitall(reqcount, requests, status); + +# endif + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchange_lexicfield32) +#endif +} +# else /* _INDEX_INDEP_GEOM */ + +void xchange_lexicfield32(spinor32 * const l) { + + MPI_Request requests[16]; + MPI_Status status[16]; +# ifdef PARALLELT + int reqcount = 4; +# elif defined PARALLELXT + int reqcount = 8; +# elif defined PARALLELXYT + int reqcount = 12; +# elif defined PARALLELXYZT + int reqcount = 16; +# endif +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchange_lexicfield32) +#endif +# if (defined BGL && defined XLC) + __alignx(16, l); +# endif + +# ifdef MPI + + + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + MPI_Isend((void*)l, 1, lfield_time_slice_cont32, g_nb_t_dn, 5081, g_cart_grid, &requests[0]); + MPI_Irecv((void*)(l+VOLUME), 1, lfield_time_slice_cont32, g_nb_t_up, 5081, g_cart_grid, &requests[1]); +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the 
data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Isend((void*)l, 1, lfield_x_slice_gath32, g_nb_x_dn, 5091, g_cart_grid, &requests[4]); + MPI_Irecv((void*)(l+(T+2)*LX*LY*LZ), 1, lfield_x_slice_cont32, g_nb_x_up, 5091, g_cart_grid, &requests[5]); + +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Isend((void*)l, 1, lfield_y_slice_gath32, g_nb_y_dn, 5101, g_cart_grid, &requests[8]); + MPI_Irecv((void*)(l + VOLUME + 2*LZ*(LX*LY + T*LY)), 1, lfield_y_slice_cont32, g_nb_y_up, 5101, g_cart_grid, &requests[9]); +# endif + +# if (defined PARALLELXYZT) + + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Isend((void*)l, 1, lfield_z_slice_gath32, g_nb_z_dn, 5503, g_cart_grid, &requests[12]); + MPI_Irecv((void*)(l+VOLUME + 2*LZ*(LX*LY + T*LY) + 2*LZ*T*LX), 1, lfield_z_slice_cont32, g_nb_z_up, 5503, g_cart_grid, &requests[13]); +# endif + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Isend((void*)(l+(T-1)*LX*LY*LZ), 1, lfield_time_slice_cont32, g_nb_t_up, 5082, g_cart_grid, &requests[2]); + MPI_Irecv((void*)(l+(T+1)*LX*LY*LZ), 1, lfield_time_slice_cont32, g_nb_t_dn, 5082, g_cart_grid, &requests[3]); + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Isend((void*)(l+(LX-1)*LY*LZ), 1, lfield_x_slice_gath32, g_nb_x_up, 5092, g_cart_grid, &requests[6]); + MPI_Irecv((void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)), 1, lfield_x_slice_cont32, g_nb_x_dn, 5092, g_cart_grid, &requests[7]); +# endif + +# if (defined PARALLELXYT || defined 
PARALLELXYZT) + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(l+(LY-1)*LZ), 1, lfield_y_slice_gath32, g_nb_y_up, 5102, g_cart_grid, &requests[10]); + MPI_Irecv((void*)(l+VOLUME + 2*LZ*(LX*LY + T*LY) + T*LX*LZ), 1, lfield_y_slice_cont32, g_nb_y_dn, 5102, g_cart_grid, &requests[11]); +# endif + +# if defined PARALLELXYZT + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Isend((void*)(l+LZ-1), 1, lfield_z_slice_gath32, g_nb_z_up, 5504, g_cart_grid, &requests[14]); + MPI_Irecv((void*)(l+VOLUME + 2*LZ*(LX*LY + T*LY) + 2*T*LX*LZ + T*LX*LY), 1, lfield_z_slice_cont32, g_nb_z_dn, 5504, g_cart_grid, &requests[15]); +# endif + + MPI_Waitall(reqcount, requests, status); + +# endif + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchange_lexicfield32) +#endif +} + +# endif /* _INDEX_INDEP_GEOM */ + +/* Here comes the naive version */ +/* Using MPI_Sendrecv */ +#else /* _NON_BLOCKING */ + +/* this is the version independent of the content of the function Index (only available with non-blocking)) */ +/* this if statement will be removed in future and _INDEX_INDEP_GEOM will be the default */ +# if defined _INDEX_INDEP_GEOM + +/* exchanges the field l */ +void xchange_lexicfield32(spinor32 * const l) { + +# ifdef PARALLELXYZT + int x0=0, x1=0, x2=0, ix=0; +# endif +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchange_lexicfield32) +#endif + +# ifdef MPI + +# if (defined PARALLELT || defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + MPI_Status status; + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + MPI_Sendrecv((void*)(l+gI_0_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_dn, 5081, + (void*)(l+gI_L_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_up, 5081, + g_cart_grid, &status); + + /* send the data to 
the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Sendrecv((void*)(l+gI_Lm1_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_up, 5082, + (void*)(l+gI_m1_0_0_0), 1, lfield_time_slice_cont32, g_nb_t_dn, 5082, + g_cart_grid, &status); +# endif +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELX || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Sendrecv((void*)(l+gI_0_0_0_0), 1, lfield_x_slice_gath32, g_nb_x_dn, 5091, + (void*)(l+gI_0_L_0_0), 1, lfield_x_slice_cont32, g_nb_x_up, 5091, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + MPI_Sendrecv((void*)(l+gI_0_Lm1_0_0), 1, lfield_x_slice_gath32, g_nb_x_up, 5092, + (void*)(l+gI_0_m1_0_0), 1, lfield_x_slice_cont32, g_nb_x_dn, 5092, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT || defined PARALLELXY || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Sendrecv((void*)(l+gI_0_0_0_0), 1, lfield_y_slice_gath32, g_nb_y_dn, 5101, + (void*)(l+gI_0_0_L_0), 1, lfield_y_slice_cont32, g_nb_y_up, 5101, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Sendrecv((void*)(l+gI_0_0_Lm1_0), 1, lfield_y_slice_gath32, g_nb_y_up, 5102, + (void*)(l+gI_0_0_m1_0), 1, lfield_y_slice_cont32, g_nb_y_dn, 5102, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYZT || defined PARALLELXYZ ) + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + 
MPI_Sendrecv((void*)(l+gI_0_0_0_0), 1, lfield_z_slice_gath32, g_nb_z_dn, 5503, + (void*)(l+gI_0_0_0_L), 1, lfield_z_slice_cont32, g_nb_z_up, 5503, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Sendrecv((void*)(l+gI_0_0_0_Lm1), 1, lfield_z_slice_gath32, g_nb_z_up, 5504, + (void*)(l+gI_0_0_0_m1), 1, lfield_z_slice_cont32, g_nb_z_dn, 5504, + g_cart_grid, &status); + +# endif +# endif + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchange_lexicfield32) +#endif +} + +# else // _INDEX_INDEP_GEOM + +/* exchanges the field l */ +void xchange_lexicfield32(spinor32 * const l) { + +# ifdef PARALLELXYZT + int x0=0, x1=0, x2=0, ix=0; +# endif +#ifdef _KOJAK_INST +#pragma pomp inst begin(xchange_lexicfield32) +#endif + +# ifdef MPI + + MPI_Status status; + /* send the data to the neighbour on the left */ + /* recieve the data from the neighbour on the right */ + MPI_Sendrecv((void*)l, 1, lfield_time_slice_cont32, g_nb_t_dn, 5081, + (void*)(l+T*LX*LY*LZ), 1, lfield_time_slice_cont32, g_nb_t_up, 5081, + g_cart_grid, &status); + + /* send the data to the neighbour on the right */ + /* recieve the data from the neighbour on the left */ + MPI_Sendrecv((void*)(l+(T-1)*LX*LY*LZ), 1, lfield_time_slice_cont32, g_nb_t_up, 5082, + (void*)(l+(T+1)*LX*LY*LZ), 1, lfield_time_slice_cont32, g_nb_t_dn, 5082, + g_cart_grid, &status); + +# if (defined PARALLELXT || defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in x direction */ + /* recieve the data from the neighbour on the right in x direction */ + MPI_Sendrecv((void*)l, 1, lfield_x_slice_gath32, g_nb_x_dn, 5091, + (void*)(l+(T+2)*LX*LY*LZ), 1, lfield_x_slice_cont32, g_nb_x_up, 5091, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in x direction */ + /* recieve the data from the neighbour on the left in x direction */ + 
MPI_Sendrecv((void*)(l+(LX-1)*LY*LZ), 1, lfield_x_slice_gath32, g_nb_x_up, 5092, + (void*)(l+((T+2)*LX*LY*LZ + T*LY*LZ)), 1, lfield_x_slice_cont32, g_nb_x_dn, 5092, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYT || defined PARALLELXYZT) + /* send the data to the neighbour on the left in y direction */ + /* recieve the data from the neighbour on the right in y direction */ + MPI_Sendrecv((void*)l, 1, lfield_y_slice_gath32, g_nb_y_dn, 5101, + (void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ)), 1, lfield_y_slice_cont32, g_nb_y_up, 5101, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Sendrecv((void*)(l+(LY-1)*LZ/2), 1, lfield_y_slice_gath32, g_nb_y_up, 5102, + (void*)(l+((T+2)*LX*LY*LZ + 2*T*LY*LZ + T*LX*LZ)), 1, lfield_y_slice_cont32, g_nb_y_dn, 5102, + g_cart_grid, &status); + +# endif + +# if (defined PARALLELXYZT) + /* send the data to the neighbour on the left in z direction */ + /* recieve the data from the neighbour on the right in z direction */ + MPI_Sendrecv((void*)l, + 1, lfield_z_slice_gath32, g_nb_z_dn, 5503, + (void*)(l + VOLUME + 2*LZ*(LX*LY + T*LY) + 2*LZ*T*LX), + 1, lfield_z_slice_cont32, g_nb_z_up, 5503, + g_cart_grid, &status); + + /* send the data to the neighbour on the right in y direction */ + /* recieve the data from the neighbour on the left in y direction */ + MPI_Sendrecv((void*)(l+LZ-1), + 1, lfield_z_slice_gath32, g_nb_z_up, 5504, + (void*)(l+(VOLUME + 2*LX*LY*LZ + 2*T*LY*LZ + 2*T*LX*LZ + T*LX*LY)), + 1, lfield_z_slice_cont32, g_nb_z_dn, 5504, + g_cart_grid, &status); + +# endif +# endif + return; +#ifdef _KOJAK_INST +#pragma pomp inst end(xchange_lexicfield32) +#endif +} + +# endif // _INDEX_INDEP_GEOM + +#endif + + + + + + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_lexicfield.h b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_lexicfield.h new file mode 100644 index 
0000000000000000000000000000000000000000..2da8f804ada722b337fc45b146a454c17b9476e7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xchange/xchange_lexicfield.h @@ -0,0 +1,26 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + ***********************************************************************/ + +#ifndef _XCHANGE_LEXICFIELD_H +#define _XCHANGE_LEXICFIELD_H + +void xchange_lexicfield(spinor * const s); +void xchange_lexicfield32(spinor32 * const s); + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_D/xlc_prefetch.h b/qcd/part_cpu/applications/QCD/src/kernel_D/xlc_prefetch.h new file mode 100644 index 0000000000000000000000000000000000000000..1beb78fbfc6bcbe9d0a80bf5cf229c2c145d6f04 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_D/xlc_prefetch.h @@ -0,0 +1,81 @@ +/*********************************************************************** + * Copyright (C) 2002,2003,2004,2005,2006,2007,2008 Carsten Urbach + * + * This file is part of tmLQCD. + * + * tmLQCD is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. 
+ * + * tmLQCD is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with tmLQCD. If not, see . + * + * Prefetch macros for the xlc compiler + * on the ibm for power4 processors + * + * Author: Carsten Urbach + * urbach@physik.fu-berlin.de + * + ***********************************************/ +#ifndef _XLC_PREFETCH_H +#define _XLC_PREFETCH_H + +#ifdef XLC + +#define _prefetch_halfspinor(addr) \ + __dcbt(((char*)((unsigned long int)(addr)))); + +#define _prefetch_spinor(addr) \ + __dcbt(((char*)((unsigned long int)(addr)))); \ + __dcbt(((char*)((unsigned long int)(addr)))+128); + +#define _prefetch_spinor_32(addr) \ + __dcbt(((char*)((unsigned long int)(addr)))); +//#define _prefetch_spinor_32(addr) + + +#define _prefetch_su3_32(addr) \ + __dcbt(((char*)((unsigned long int)(addr)))); +//#define _prefetch_su3_32(addr) + +#define _prefetch_su3(addr) \ + __dcbt(((char*)((unsigned long int)(addr)))); \ + __dcbt(((char*)((unsigned long int)(addr)))+128); + +#define _prefetch_spinor_dcbt(addr1, addr2) \ +__dcbt((void*)(addr1)); \ +__dcbt((void*)(addr2)); + +#define _prefetch_spinor_by_load(addr1, addr2) \ +__prefetch_by_load((void*)(addr1)); \ +__prefetch_by_load((void*)(addr2)); + +#define _prefetch_su3_dcbt(addr1, addr2) \ +__dcbt((void*)(addr1)); \ +__dcbt((void*)(addr2)); + +#define _prefetch_su3_by_load(addr1, addr2) \ +__prefetch_by_load((void*)(addr1)); \ +__prefetch_by_load((void*)(addr2)); + +#else + +#define _prefetch_halfspinor(addr) + +#define _prefetch_spinor(addr) + +#define _prefetch_su3(addr) + +#define _prefetch_spinor_32(addr) + +#define _prefetch_su3_32(addr) + +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/Makefile b/qcd/part_cpu/applications/QCD/src/kernel_E/Makefile 
new file mode 100644 index 0000000000000000000000000000000000000000..ba1c643249f8ab5a2e94f7c6c646099e9295a2e2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/Makefile @@ -0,0 +1,76 @@ +include Makefile.defs + +#defines +TARGETDIR=. +DEBUG=0 + +ifeq ($(DEBUG),1) +DEFINES+= -DDEBUG -g +endif + +default: bench + +#GENERIC +VPATH= libraries/ arch_mpi/ include/ +ARCHDIR=arch_mpi/ +#CC = mpicc +#LD = mpicc +#CFLAGS = $(DEFINES) -O2 -DARCH=0 -I./include +#LDFLAGS = -lm +.c.o: + $(CC) $(CFLAGS) $< -c -o $(patsubst %.c,%.o,$<) + + + +#top level headers +HEADER = \ + config.h \ + complex.h \ + su3.h \ + macros.h \ + machine.h \ + comdefs.h \ + generic.h \ + generic_wilson.h \ + io_lat.h \ + includes.h \ + lattice.h + + + + +#top level sources +SOURCE:= \ + control.c \ + setup.c \ + io_helpers.c \ + check_unitarity.c \ + d_plaq4.c \ + make_lattice.c \ + fermion_stuff.c \ + mt19937-64.c \ + random.c \ + mult_fmat.c \ + congrad.c + +#subdir sources and headers +include libraries/module.mk +include $(ARCHDIR)/module.mk + + +OBJECT:= \ + $(patsubst %.c,%.o,$(filter %.c,$(SOURCE))) \ + $(patsubst %.asm,%.o,$(filter %.asm,$(SOURCE))) \ + $(patsubst %.f,%.o,$(filter %.f,$(SOURCE))) + +$(OBJECT): $(HEADER) Makefile + +#targets +bench: $(OBJECT) + $(LD) -o $(TARGETDIR)/$@ $(OBJECT) $(ARPACK) $(LDFLAGS) + +kernel: $(OBJECT) + $(AR) $(ARFLAGS) ../kernel_E.a $(OBJECT) + +clean: + rm -f ./*.o ./*.a ./*/*.o ./*~ ./*/*~ bench ../kernel_E.a diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/Makefile.defs.in b/qcd/part_cpu/applications/QCD/src/kernel_E/Makefile.defs.in new file mode 100644 index 0000000000000000000000000000000000000000..c5eeaa53e9baffccfd4366793b659034295cfd3e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/Makefile.defs.in @@ -0,0 +1,11 @@ +SHELL = #SHELL# + +CC = #MPI_CC# +CFLAGS = #CFLAGS# + +AR = #AR# +ARFLAGS = #ARFLAGS# + +LDFLAGS = #LDFLAGS# + +RM = #RM# diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/arch_mpi/com_mpi.c 
b/qcd/part_cpu/applications/QCD/src/kernel_E/arch_mpi/com_mpi.c new file mode 100644 index 0000000000000000000000000000000000000000..ec850ddd189065f60a9068c6751f925ebf78b74e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/arch_mpi/com_mpi.c @@ -0,0 +1,605 @@ +#include "includes.h" +#include +#define COM_BIT 0x40000000 + +/* Global variables */ +int totnodes[4]; /* number of nodes in machine directions */ +int Mynode[4], node_parity; +int **neighbor; +int offnode_even[8]; /* # of even sites that have off-node neighbors in a dir */ +int offnode_odd[8]; /* # of odd sites that have off-node neighbors in a dir */ +MPI_Comm comm_grid, comm_subgrid[4]; /* grid communicators */ + +/* print on 0 node only */ +void node0_printf( const char *fmt, ... ) +{ + va_list argp; + if( this_node == 0 ) + { + va_start( argp, fmt ); + vprintf( fmt, argp ); + va_end( argp ); + } + + fflush( 0 ); + + MPI_Barrier( MPI_COMM_WORLD ); +} + +void node0_fprintf( FILE * file, const char *fmt, ... ) +{ + va_list argp; + if( this_node == 0 ) + { + va_start( argp, fmt ); + vfprintf( file, fmt, argp ); + va_end( argp ); + } + + fflush( 0 ); + + MPI_Barrier( MPI_COMM_WORLD ); +} + +void verbose_fprintf( FILE * file, const char *fmt, ... 
) +{ + va_list argp; + if( verbose && this_node == 0 ) + { + va_start( argp, fmt ); + vfprintf( file, fmt, argp ); + va_end( argp ); + } + + fflush( 0 ); + + MPI_Barrier( MPI_COMM_WORLD ); +} + +/* JuBE no args needed */ +void initialize_machine_KE() +{ + int free_coords[4], wrap_around[4]; + + MPI_Comm_size(MPI_COMM_WORLD, &number_of_nodes); + + MPI_Comm_rank(MPI_COMM_WORLD, &this_node); + + /* get the totnodes[dir] from parameters file */ + FILE *fpar; +/* JuBE set para file to kernel_E.input*/ + if( ( fpar = fopen( "kernel_E.input", "r" ) ) == 0 && this_node == 0 ) + { + printf( "ERROR initialize_machine: missing parameter file\n" ); + fflush(0); + exit( 1 ); + } + if (get_totnodes( fpar, "totnodes" ) && this_node==0) + { + printf( "ERROR initialize_machine: missing totnodes\n" ); + fflush(0); + exit(1); + } + + if (totnodes[XUP]*totnodes[YUP]*totnodes[ZUP]*totnodes[TUP]!=number_of_nodes && + this_node==0) + { + printf( "ERROR initialize_machine: bad total number of nodes\n" ); + fflush(0); + exit(1); + } + fclose(fpar); + + /* Cartesian grid */ + wrap_around[0] = 1; + wrap_around[1] = 1; + wrap_around[2] = 1; + wrap_around[3] = 1; + MPI_Cart_create(MPI_COMM_WORLD, 4, totnodes, wrap_around, 1, &comm_grid); + + /* new coordinates */ + MPI_Comm_rank(comm_grid, &this_node); + MPI_Cart_coords(comm_grid, this_node, 4,Mynode); + + node_parity = ( Mynode[XUP] + Mynode[YUP] + Mynode[ZUP] + Mynode[TUP] ) % 2; + node0_printf( "initialize_machine: topology: %dx%dx%dx%d, mynode: %d,%d,%d,%d, numnodes: %d\n", + totnodes[XUP], totnodes[YUP], totnodes[ZUP], totnodes[TUP], + Mynode[XUP], Mynode[YUP], Mynode[ZUP], Mynode[TUP], numnodes_KE( ) ); + + /* set up communicators */ + free_coords[XUP] = 1; + free_coords[YUP] = 0; + free_coords[ZUP] = 0; + free_coords[TUP] = 0; + MPI_Cart_sub(comm_grid, free_coords,&comm_subgrid[XUP]); + free_coords[XUP] = 0; + free_coords[YUP] = 1; + free_coords[ZUP] = 0; + free_coords[TUP] = 0; + MPI_Cart_sub(comm_grid, free_coords,&comm_subgrid[YUP]); 
+ free_coords[XUP] = 0; + free_coords[YUP] = 0; + free_coords[ZUP] = 1; + free_coords[TUP] = 0; + MPI_Cart_sub(comm_grid, free_coords,&comm_subgrid[ZUP]); + free_coords[XUP] = 0; + free_coords[YUP] = 0; + free_coords[ZUP] = 0; + free_coords[TUP] = 1; + MPI_Cart_sub(comm_grid, free_coords,&comm_subgrid[TUP]); + +} + +static char name[] = "Generic communication"; +char *machine_type_KE( ) +{ + return ( name ); +} + +int mynode_KE( ) +{ + return Mynode[TUP]+totnodes[TUP]*Mynode[ZUP]+ + totnodes[TUP]*totnodes[ZUP]*Mynode[YUP]+ + totnodes[TUP]*totnodes[ZUP]*totnodes[YUP]*Mynode[XUP]; +} + +void mynode4( int *n_x, int *n_y, int *n_z, int *n_t ) +{ + *n_x = Mynode[XUP]; + *n_y = Mynode[YUP]; + *n_z = Mynode[ZUP]; + *n_t = Mynode[TUP]; +} + +int numnodes_KE( ) +{ + return ( totnodes[XUP] * totnodes[YUP] * totnodes[ZUP] * totnodes[TUP] ); +} + +void numnodes4( int *n_x, int *n_y, int *n_z, int *n_t ) +{ + *n_x = totnodes[XUP]; + *n_y = totnodes[YUP]; + *n_z = totnodes[ZUP]; + *n_t = totnodes[TUP]; +} + +void gen_send_recv( int dir, char *sbuf, char *rbuf, int size ) +{ + MPI_Status status; + int source; + int dest; + + if (dir<0 || dir>7) + { + node0_fprintf(file_o1, "ERROR gen_send_recv: Bad direction %d\n",dir); + exit(1); + } + + if (dir<4) + { + /* positive direction */ + source=(Mynode[dir]+1)%totnodes[dir]; + dest=(Mynode[dir]-1+totnodes[dir])%totnodes[dir]; + } + else + { + /* negative direction */ + dir=OPP_DIR(dir); + dest=(Mynode[dir]+1)%totnodes[dir]; + source=(Mynode[dir]-1+totnodes[dir])%totnodes[dir]; + } + + MPI_Sendrecv(sbuf,size,MPI_BYTE,dest,0, + rbuf,size,MPI_BYTE,source,0, + comm_subgrid[dir],&status); +} + +void make_nn_gathers( ) +{ + int x, y, z, t, xp, yp, zp, tp, xm, ym, zm, tm; + int i, ixp, ixm, iyp, iym, izp, izm, itp, itm, p; + MEMALIGN(neighbor, int *, 8 ); + for ( i = 0; i < 8; i++ ) + MEMALIGN(neighbor[i],int, sites_on_node); + /* neighbor = malloc( 8 * sizeof( int * ) ); */ + /* for ( i = 0; i < 8; i++ ) */ + /* { */ + /* neighbor[i] = 
malloc( sites_on_node * sizeof( int ) ); */ + /* } */ + for ( i = 0; i < 8; i++ ) + { + offnode_even[i] = offnode_odd[i] = 0; + } + for ( x = 0; x < nx; x++ ) + for ( y = 0; y < ny; y++ ) + for ( z = 0; z < nz; z++ ) + for ( t = 0; t < nt; t++ ) + if( node_number_KE( x, y, z, t ) == mynode_KE( ) ) + { + i = node_index_KE( x, y, z, t ); + p = lattice[i].parity; + xp = ( x + 1 ) % nx; + xm = ( x - 1 + nx ) % nx; + yp = ( y + 1 ) % ny; + ym = ( y - 1 + ny ) % ny; + zp = ( z + 1 ) % nz; + zm = ( z - 1 + nz ) % nz; + tp = ( t + 1 ) % nt; + tm = ( t - 1 + nt ) % nt; + ixp = node_index_KE( xp, y, z, t ); + ixm = node_index_KE( xm, y, z, t ); + iyp = node_index_KE( x, yp, z, t ); + iym = node_index_KE( x, ym, z, t ); + izp = node_index_KE( x, y, zp, t ); + izm = node_index_KE( x, y, zm, t ); + itp = node_index_KE( x, y, z, tp ); + itm = node_index_KE( x, y, z, tm ); + + if( node_number_KE( xp, y, z, t ) == mynode_KE( ) ) + { + neighbor[XUP][i] = ixp; + } + else + { + neighbor[XUP][i] = ixp + COM_BIT; + if( p == EVEN ) + { + offnode_even[XUP]++; + } + else + offnode_odd[XUP]++; + } + + if( node_number_KE( xm, y, z, t ) == mynode_KE( ) ) + { + neighbor[XDOWN][i] = ixm; + } + else + { + neighbor[XDOWN][i] = ixm + COM_BIT; + if( p == EVEN ) + { + offnode_even[XDOWN]++; + } + else + offnode_odd[XDOWN]++; + } + if( node_number_KE( x, yp, z, t ) == mynode_KE( ) ) + { + neighbor[YUP][i] = iyp; + } + else + { + neighbor[YUP][i] = iyp + COM_BIT; + if( p == EVEN ) + { + offnode_even[YUP]++; + } + else + offnode_odd[YUP]++; + } + if( node_number_KE( x, ym, z, t ) == mynode_KE( ) ) + { + neighbor[YDOWN][i] = iym; + } + else + { + neighbor[YDOWN][i] = iym + COM_BIT; + if( p == EVEN ) + { + offnode_even[YDOWN]++; + } + else + offnode_odd[YDOWN]++; + } + if( node_number_KE( x, y, zp, t ) == mynode_KE( ) ) + { + neighbor[ZUP][i] = izp; + } + else + { + neighbor[ZUP][i] = izp + COM_BIT; + if( p == EVEN ) + { + offnode_even[ZUP]++; + } + else + offnode_odd[ZUP]++; + } + if( node_number_KE( 
x, y, zm, t ) == mynode_KE( ) ) + { + neighbor[ZDOWN][i] = izm; + } + else + { + neighbor[ZDOWN][i] = izm + COM_BIT; + if( p == EVEN ) + { + offnode_even[ZDOWN]++; + } + else + offnode_odd[ZDOWN]++; + } + if( node_number_KE( x, y, z, tp ) == mynode_KE( ) ) + { + neighbor[TUP][i] = itp; + } + else + { + neighbor[TUP][i] = itp + COM_BIT; + if( p == EVEN ) + { + offnode_even[TUP]++; + } + else + offnode_odd[TUP]++; + } + if( node_number_KE( x, y, z, tm ) == mynode_KE( ) ) + { + neighbor[TDOWN][i] = itm; + } + else + { + neighbor[TDOWN][i] = itm + COM_BIT; + if( p == EVEN ) + { + offnode_even[TDOWN]++; + } + else + offnode_odd[TDOWN]++; + } + } +} /*}}} */ + +msg_tag *start_gather_from_temp( + /* arguments */ + void *field, /* which field? Pointer to the field array */ + int size, /* size in bytes of the field */ + int dist, /* distance of elements in the array */ + int dir, /* direction to gather from. eg XUP - index into + neighbor tables */ + int parity, /* parity of sites whose neighbors we gather. + one of EVEN, ODD or EVENANDODD. 
*/ + char **dest ) /* one of the vectors of pointers */ +{ + int i, n, in, n_off; + site *st; + msg_tag *mbuf; + + mbuf = 0; + + switch ( parity ) + { + case EVEN: + n_off = offnode_even[dir]; + break; + case ODD: + n_off = offnode_odd[dir]; + break; + case EVENANDODD: + n_off = offnode_even[dir] + offnode_odd[dir]; + break; + default: + printf( "ERROR start_gather_from_temp: Wrong parity\n" ); + return ( 0 ); + } + if( n_off > 0 ) + { // Communication is needed + MEMALIGN(mbuf, msg_tag, 1 ); + /* mbuf = ( msg_tag * ) malloc( sizeof( msg_tag ) ); */ + mbuf->size = size; + MEMALIGN(mbuf->rbuf, char, size*n_off); + MEMALIGN(mbuf->sbuf, char, size*n_off); + MEMALIGN(mbuf->ptr, char *, n_off ); + /* mbuf->rbuf = memalign( 16, size * n_off ); */ + /* mbuf->sbuf = memalign( 16, size * n_off ); */ + /* mbuf->ptr = malloc( sizeof( char * ) * n_off ); */ + mbuf->done = 0; + mbuf->n = n_off; + } + n = 0; + FORSOMEPARITY( i, st, parity ) + { + if( ( in = neighbor[dir][i] ) < COM_BIT ) + { + dest[i] = ( char * ) field + dist * in; + } + else + { + dest[i] = mbuf->rbuf + n * size; + mbuf->ptr[n] = ( char * ) field + dist * ( in - COM_BIT ); + memcpy( mbuf->sbuf + n * size, mbuf->ptr[n], size ); + n++; + } + } + + /* node0_fprintf(stderr,"No of off points %d\n",n_off); */ + + if( n_off > 0 ) + { + if( dir > 7 ) + dir -= 8; + gen_send_recv( dir, mbuf->sbuf, mbuf->rbuf , ( mbuf->size ) * ( mbuf->n ) ); + } + return ( mbuf ); +} + +void restart_gather_from_temp( + /* arguments */ + void *field, /* which field? Some member of structure "site" */ + int size, /* size in bytes of the field */ + int dist, /* distance of elements in the array */ + int dir, /* direction to gather from. eg XUP - index into + neighbor tables */ + int parity, /* parity of sites whose neighbors we gather. + one of EVEN, ODD or EVENANDODD. 
*/ + char **dest, /* one of the vectors of pointers */ + msg_tag * mbuf ) /* previously returned by start_gather */ +{ + int i; + if( mbuf == 0 ) + return; + if( mbuf->done != 1 ) + { + node0_printf( "ERROR restart_gather:previos gather was not waited for\n" ); + return; + } + mbuf->done = 0; + for ( i = 0; i < mbuf->n; i++ ) + { + memcpy( mbuf->sbuf + i * ( mbuf->size ), mbuf->ptr[i], mbuf->size ); + } + if( dir > 7 ) + dir -= 8; + gen_send_recv( dir, mbuf->sbuf , mbuf->rbuf , ( mbuf->size ) * ( mbuf->n ) ); +} + +msg_tag *start_general_gather_from_temp( + /* arguments */ + void *field, /* which field? Pointer to the field array */ + int size, /* size in bytes of the field (eg sizeof(su3_vector)) */ + int dist, /* distance of elements in the array */ + int *dst, /* direction to gather from. eg XUP - index into + neighbor tables */ + int parity, /* parity of sites whose neighbors we gather. + one of EVEN, ODD or EVENANDODD. */ + char **dest ) /* one of the vectors of pointers */ +{ + char *buf2; + char **ptr; + int i, d, dir, nd, n; + site *st; + msg_tag *tg; + msg_tag *mbuf; + + MEMALIGN(mbuf, msg_tag, 1) ; + mbuf->size=size; + mbuf->n=sites_on_node; + MEMALIGN(mbuf->rbuf, char, size*sites_on_node); + /* mbuf = ( msg_tag * ) malloc( sizeof( msg_tag ) ); */ + /* mbuf->rbuf = memalign( 16, size * sites_on_node ); */ + + MEMALIGN(ptr, char *, sites_on_node); + MEMALIGN(buf2, char, size*sites_on_node); + /* ptr = malloc( sites_on_node * sizeof( char * ) ); */ + /* buf2 = memalign( 16, size * sites_on_node ); */ + + FORALLSITES( i, st ) + { + memcpy( mbuf->rbuf + i * size, ( char * ) field + i * dist, size ); + } + for ( d = 0; d < 4; d++ ) + { + dir = d; + nd = dst[d]; + + if( nd < 0 ) + { + dir = 7 - d; + nd = -nd; + } + for ( n = 0; n < nd; n++ ) + { + tg = start_gather_from_temp( mbuf->rbuf, size, size, dir, EVENANDODD, ptr ); + wait_gather_KE( tg ); + // copy to buf2 + FORALLSITES( i, st ) + { + memcpy( buf2 + i * size, ptr[i], size ); + } + // copy back to buf1 + 
memcpy( mbuf->rbuf, buf2, size * sites_on_node ); + cleanup_gather( tg ); + } + } + FORSOMEPARITY( i, st, parity ) + { + dest[i] = mbuf->rbuf + i * size; + } + + FREE(ptr, char *, sites_on_node); + FREE(buf2, char, size*sites_on_node); + + return mbuf; +} + +void wait_gather_KE( msg_tag * mbuf ) +{ + if( mbuf == 0 ) + return; + mbuf->done = 1; + return; +} + +void wait_general_gather( msg_tag * mbuf ) +{ +} + +void cleanup_gather( msg_tag * mbuf ) +{ + if( mbuf != 0 ) + { + FREE(mbuf->rbuf, char, (mbuf->size)*(mbuf->n)); + FREE(mbuf->sbuf, char, (mbuf->size)*(mbuf->n)); + FREE(mbuf->ptr, char *, mbuf->n ); + FREE(mbuf, msg_tag, 1 ); + } +} + +void cleanup_general_gather( msg_tag * mbuf ) +{ + if( mbuf != 0 ) + { + FREE(mbuf->rbuf, char, (mbuf->size)*(mbuf->n)); + FREE(mbuf, msg_tag, 1 ); + } +} + +/* Synchronize all nodes by sending one double to each directions */ +void g_sync_KE( ) +{ + MPI_Barrier( MPI_COMM_WORLD ); +} + +void g_doublesum_KE( double *dpt ) +{ + double work; + MPI_Allreduce( dpt, &work, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD ); + *dpt = work; +} + +void g_doublemax_KE( double *dpt ) +{ + double work; + MPI_Allreduce( dpt, &work, 1, MPI_DOUBLE, MPI_MAX, MPI_COMM_WORLD ); + *dpt = work; +} + +void broadcast_double_KE( double *dpt ) +{ + MPI_Bcast( dpt, 1, MPI_DOUBLE, 0, MPI_COMM_WORLD ); +} + +void broadcast_char( char *buf, int size ) +{ + MPI_Bcast( buf, size, MPI_BYTE, 0, MPI_COMM_WORLD ); +} + +void broadcast_int_KE( int *buf ) +{ + MPI_Bcast( buf, 1, MPI_INTEGER, 0, MPI_COMM_WORLD ); +} + +/* Double precision time */ +double dclock( ) +{ + return MPI_Wtime(); +} + +/* version of exit for multinode processes -- kill all nodes */ +void terminate_KE( int status ) +{ + printf( "terminate: node %d, status = %d\n", this_node, status ); + exit( status ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/arch_mpi/layout_mpi.c b/qcd/part_cpu/applications/QCD/src/kernel_E/arch_mpi/layout_mpi.c new file mode 100644 index 
0000000000000000000000000000000000000000..0dd2aba96023a84072cc69591ec28a5926734c23 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/arch_mpi/layout_mpi.c @@ -0,0 +1,72 @@ +/******** layout_bgl.c *********/ +#include "includes.h" + +int loc_size[4]; /* local dimensions in machine directions */ +extern int totnodes[4]; /* number of nodes in machine directions */ + +void setup_layout_KE( ) +{ + node0_fprintf( file_o1, "setup_layout: 4d evenfirst\n" ); + + if( ( nx % totnodes[XUP] ) || ( ny % totnodes[YUP] ) + || ( nz % totnodes[ZUP] ) || ( nt % totnodes[TUP] ) ) + { + node0_printf( "ERROR setup_layout: Can't lay out this lattice.\n" ); + exit( 1 ); + } + + + loc_size[XUP] = nx / totnodes[XUP]; + loc_size[YUP] = ny / totnodes[YUP]; + loc_size[ZUP] = nz / totnodes[ZUP]; + loc_size[TUP] = nt / totnodes[TUP]; + sites_on_node = loc_size[XUP] * loc_size[YUP] * loc_size[ZUP] * loc_size[TUP]; + even_sites_on_node = odd_sites_on_node = sites_on_node / 2; + node0_fprintf( file_o1, "setup_layout: local lattice size: %d x %d x %d x %d\n", + loc_size[XUP], loc_size[YUP], loc_size[ZUP], loc_size[TUP] ); + + if( sites_on_node % 2 != 0 ) + { + node0_printf( "ERROR steup_layout: we need EVEN sites on node\n" ); + exit( 1 ); + } + +} + +int node_number_KE( int x, int y, int z, int t ) +{ + int coord[4]; + coord[XUP] = x / loc_size[XUP]; + coord[YUP] = y / loc_size[YUP]; + coord[ZUP] = z / loc_size[ZUP]; + coord[TUP] = t / loc_size[TUP]; + return coord[TUP]+totnodes[TUP]*coord[ZUP]+ + totnodes[TUP]*totnodes[ZUP]*coord[YUP]+ + totnodes[TUP]*totnodes[ZUP]*totnodes[YUP]*coord[XUP]; +} +/* TODO: largest loc_size[dir] should be the fastest */ +int node_index_KE( int x, int y, int z, int t ) +{ + int coord[4], i; + coord[XUP] = x % loc_size[XUP]; + coord[YUP] = y % loc_size[YUP]; + coord[ZUP] = z % loc_size[ZUP]; + coord[TUP] = t % loc_size[TUP]; + i = coord[TUP] + loc_size[TUP]*coord[ZUP]+ + loc_size[TUP]*loc_size[ZUP]*coord[YUP]+ + 
loc_size[TUP]*loc_size[ZUP]*loc_size[YUP]*coord[XUP]; + + if( ( x + y + z + t ) % 2 == 0 ) + { /* even site */ + return ( i / 2 ); + } + else + { + return ( ( i + sites_on_node ) / 2 ); + } +} + +int num_sites( int node ) +{ + return ( sites_on_node ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/arch_mpi/module.mk b/qcd/part_cpu/applications/QCD/src/kernel_E/arch_mpi/module.mk new file mode 100644 index 0000000000000000000000000000000000000000..ae70fdb355fb704cbdd9a5a976bc5425d0216c16 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/arch_mpi/module.mk @@ -0,0 +1,6 @@ +MYFILES_ARCH:= \ + mpi_utils.c \ + com_mpi.c \ + layout_mpi.c + +SOURCE+= $(patsubst %,arch_mpi/%,$(MYFILES_ARCH)) diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/arch_mpi/mpi_utils.c b/qcd/part_cpu/applications/QCD/src/kernel_E/arch_mpi/mpi_utils.c new file mode 100644 index 0000000000000000000000000000000000000000..1d823b8ee9f2d374d760f2f4c8cb4e8e7934c7fc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/arch_mpi/mpi_utils.c @@ -0,0 +1,683 @@ +#include "includes.h" + +/* matrix x matrix */ +void mult_su3_na_KE( su3_matrix *a, su3_matrix *b, su3_matrix *c ) +{ + register int i,j,k; + register complex x,y; + for(i=0;i<3;i++)for(j=0;j<3;j++) + { + x.real=x.imag=0.0; + for(k=0;k<3;k++){ + CMUL_J( a->ROWCOL(i,k) , b->ROWCOL(j,k) , y ); + CSUM( x , y ); + } + c->ROWCOL(i,j) = x; + } +} +void mult_su3_nn_KE( su3_matrix *a, su3_matrix *b, su3_matrix *c ) +{ + register int i,j,k; + register complex x,y; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + x.real=x.imag=0.0; + for(k=0;k<3;k++){ + CMUL( a->ROWCOL(i,k) , b->ROWCOL(k,j) , y ); + CSUM( x , y ); + } + c->ROWCOL(i,j).real = x.real; + c->ROWCOL(i,j).imag = x.imag; + } +} +void mult_su3_an_KE( su3_matrix *a, su3_matrix *b, su3_matrix *c ) +{ + register int i,j,k; + register complex x,y; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + x.real=x.imag=0.0; + for(k=0;k<3;k++){ + CMULJ_( a->ROWCOL(k,i) , b->ROWCOL(k,j), y ); + 
CSUM( x , y ); + } + c->ROWCOL(i,j) = x; + } +} +void mult_su3_aa_KE( su3_matrix *a, su3_matrix *b, su3_matrix *c ) +{ + register int i,j,k; + register complex x,y; + for(i=0;i<3;i++)for(j=0;j<3;j++){ + x.real=x.imag=0.0; + for(k=0;k<3;k++){ + CMULJJ( a->ROWCOL(k,i) , b->ROWCOL(j,k), y ); + CSUM( x , y ); + } + c->ROWCOL(i,j) = x; + } +} + + +inline void mult_su3_32(float *A, float *x, float *y) +{ + register int c, d; + register float re, im; + for (c=0;c<3;c++) + { + re=im=0; + for (d=0;d<3;d++) + { + re+=A[2*(3*c+d)+0]*x[2*d+0]-A[2*(3*c+d)+1]*x[2*d+1]; + im+=A[2*(3*c+d)+1]*x[2*d+0]+A[2*(3*c+d)+0]*x[2*d+1]; + } + y[2*c+0]=re; + y[2*c+1]=im; + } +} + +inline void mult_adj_su3_32(float *A, float *x, float *y) +{ + register int c, d; + register float re, im; + for (c=0;c<3;c++) + { + re=im=0; + for (d=0;d<3;d++) + { + re+=A[2*(3*d+c)+0]*x[2*d+0]+A[2*(3*d+c)+1]*x[2*d+1]; + im+=-A[2*(3*d+c)+1]*x[2*d+0]+A[2*(3*d+c)+0]*x[2*d+1]; + } + y[2*c+0]=re; + y[2*c+1]=im; + } +} + +/* dslash */ +void latutil_dslash0_32(float *src, + float *b1, float *b2, float *b3, float *b4, + int isign, int parity) +{ + int i, c; + int lat_begin, lat_end; + + lat_begin = 0; + if( parity == ODD ) + lat_begin = even_sites_on_node; + lat_end = sites_on_node; + if( parity == EVEN ) + lat_end = even_sites_on_node; + + if (isign==PLUS) + { + for (i=lat_begin;it > 0 ) + { +#else + for ( dir = XUP; dir <= TUP; dir++ ) + { +#endif + mat = ( su3_matrix * ) & ( gauge[4 * i + dir] ); + deviation = check_su3( mat ); + if( deviation > TOLERANCE ) + { + printf( "Unitarity problem on node %d, site %d, dir %d, deviation=%f\n", mynode_KE( ), i, dir, deviation ); + printf( "SU3 matrix:\n" ); + for ( ii = 0; ii <= 2; ii++ ) + { + for ( jj = 0; jj <= 2; jj++ ) + { + printf( "%f ", ( *mat ).e[ii][jj].real ); + printf( "%f ", ( *mat ).e[ii][jj].imag ); + } + printf( "\n" ); + } + printf( "repeat in hex:\n" ); + for ( ii = 0; ii <= 2; ii++ ) + { + for ( jj = 0; jj <= 2; jj++ ) + { + ifval.fval = ( *mat 
).e[ii][jj].real; + printf( "%08x ", ifval.ival ); + ifval.fval = ( *mat ).e[ii][jj].imag; + printf( "%08x ", ifval.ival ); + } + printf( "\n" ); + } + printf( " \n \n" ); + fflush( stdout ); + terminate_KE( 1 ); + } + if( max_deviation < deviation ) + max_deviation = deviation; + av_deviation += deviation * deviation; + } + } + av_deviation = sqrt( av_deviation / ( 4 * i ) ); +#ifdef UNIDEBUG + printf( "Deviation from unitarity on node %d: max %g, avrg %g\n", mynode_KE( ), max_deviation, av_deviation ); +#endif + if( max_deviation > TOLERANCE ) + printf( "Unitarity problem on node %d, maximum deviation=%f\n", mynode_KE( ), max_deviation ); + return max_deviation; +} /*check_unitarity() */ + +double check_su3( su3_matrix * c ) +{ + register double ar, ai, ari, max; + register int i; + + /* first normalize row */ + for ( i = 0, max = 0.; i < 3; ++i ) + { + ar = ( *c ).ROWCOL( i, 0 ).real * ( *c ).ROWCOL( i, 0 ).real + /* sum of squares of row */ + ( *c ).ROWCOL( i, 0 ).imag * ( *c ).ROWCOL( i, 0 ).imag + + ( *c ).ROWCOL( i, 1 ).real * ( *c ).ROWCOL( i, 1 ).real + + ( *c ).ROWCOL( i, 1 ).imag * ( *c ).ROWCOL( i, 1 ).imag + + ( *c ).ROWCOL( i, 2 ).real * ( *c ).ROWCOL( i, 2 ).real + ( *c ).ROWCOL( i, 2 ).imag * ( *c ).ROWCOL( i, 2 ).imag; + ar = fabs( sqrt( ( double ) ar ) - 1. 
); + if( max < ar ) + max = ar; + } + +#ifdef STRONG + + /* Test orthogonality of row 0 and row 1 */ + ar = ( *c ).ROWCOL( 0, 0 ).real * ( *c ).ROWCOL( 1, 0 ).real + /* real part of 0 dot 1 */ + ( *c ).ROWCOL( 0, 0 ).imag * ( *c ).ROWCOL( 1, 0 ).imag + + ( *c ).ROWCOL( 0, 1 ).real * ( *c ).ROWCOL( 1, 1 ).real + + ( *c ).ROWCOL( 0, 1 ).imag * ( *c ).ROWCOL( 1, 1 ).imag + + ( *c ).ROWCOL( 0, 2 ).real * ( *c ).ROWCOL( 1, 2 ).real + ( *c ).ROWCOL( 0, 2 ).imag * ( *c ).ROWCOL( 1, 2 ).imag; + ai = ( *c ).ROWCOL( 0, 0 ).real * ( *c ).ROWCOL( 1, 0 ).imag - /* imag part of 0 dot 1 */ + ( *c ).ROWCOL( 0, 0 ).imag * ( *c ).ROWCOL( 1, 0 ).real + + ( *c ).ROWCOL( 0, 1 ).real * ( *c ).ROWCOL( 1, 1 ).imag - + ( *c ).ROWCOL( 0, 1 ).imag * ( *c ).ROWCOL( 1, 1 ).real + + ( *c ).ROWCOL( 0, 2 ).real * ( *c ).ROWCOL( 1, 2 ).imag - ( *c ).ROWCOL( 0, 2 ).imag * ( *c ).ROWCOL( 1, 2 ).real; + + ari = sqrt( ( double ) ( ar * ar + ai * ai ) ); + if( max < ari ) + max = ari; + + /* Test orthogonality of row 0 and row 2 */ + ar = ( *c ).ROWCOL( 0, 0 ).real * ( *c ).ROWCOL( 2, 0 ).real + /* real part of 0 dot 1 */ + ( *c ).ROWCOL( 0, 0 ).imag * ( *c ).ROWCOL( 2, 0 ).imag + + ( *c ).ROWCOL( 0, 1 ).real * ( *c ).ROWCOL( 2, 1 ).real + + ( *c ).ROWCOL( 0, 1 ).imag * ( *c ).ROWCOL( 2, 1 ).imag + + ( *c ).ROWCOL( 0, 2 ).real * ( *c ).ROWCOL( 2, 2 ).real + ( *c ).ROWCOL( 0, 2 ).imag * ( *c ).ROWCOL( 2, 2 ).imag; + ai = ( *c ).ROWCOL( 0, 0 ).real * ( *c ).ROWCOL( 2, 0 ).imag - /* imag part of 0 dot 1 */ + ( *c ).ROWCOL( 0, 0 ).imag * ( *c ).ROWCOL( 2, 0 ).real + + ( *c ).ROWCOL( 0, 1 ).real * ( *c ).ROWCOL( 2, 1 ).imag - + ( *c ).ROWCOL( 0, 1 ).imag * ( *c ).ROWCOL( 2, 1 ).real + + ( *c ).ROWCOL( 0, 2 ).real * ( *c ).ROWCOL( 2, 2 ).imag - ( *c ).ROWCOL( 0, 2 ).imag * ( *c ).ROWCOL( 2, 2 ).real; + + ari = sqrt( ( double ) ( ar * ar + ai * ai ) ); + if( max < ari ) + max = ari; + + /* Test orthogonality of row 1 and row 2 */ + ar = ( *c ).ROWCOL( 1, 0 ).real * ( *c ).ROWCOL( 2, 0 ).real + /* real part of 
0 dot 1 */ + ( *c ).ROWCOL( 1, 0 ).imag * ( *c ).ROWCOL( 2, 0 ).imag + + ( *c ).ROWCOL( 1, 1 ).real * ( *c ).ROWCOL( 2, 1 ).real + + ( *c ).ROWCOL( 1, 1 ).imag * ( *c ).ROWCOL( 2, 1 ).imag + + ( *c ).ROWCOL( 1, 2 ).real * ( *c ).ROWCOL( 2, 2 ).real + ( *c ).ROWCOL( 1, 2 ).imag * ( *c ).ROWCOL( 2, 2 ).imag; + ai = ( *c ).ROWCOL( 1, 0 ).real * ( *c ).ROWCOL( 2, 0 ).imag - /* imag part of 0 dot 1 */ + ( *c ).ROWCOL( 1, 0 ).imag * ( *c ).ROWCOL( 2, 0 ).real + + ( *c ).ROWCOL( 1, 1 ).real * ( *c ).ROWCOL( 2, 1 ).imag - + ( *c ).ROWCOL( 1, 1 ).imag * ( *c ).ROWCOL( 2, 1 ).real + + ( *c ).ROWCOL( 1, 2 ).real * ( *c ).ROWCOL( 2, 2 ).imag - ( *c ).ROWCOL( 1, 2 ).imag * ( *c ).ROWCOL( 2, 2 ).real; + + ari = sqrt( ( double ) ( ar * ar + ai * ai ) ); + if( max < ari ) + max = ari; + +#endif /*STRONG*/ + return ( max ); + +} /* check_su3 */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/congrad.c b/qcd/part_cpu/applications/QCD/src/kernel_E/congrad.c new file mode 100644 index 0000000000000000000000000000000000000000..ce93d8320e60b401341498e15e306d49ab28244a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/congrad.c @@ -0,0 +1,294 @@ +#include "./include/includes.h" +#include + +/* memory */ +wilson_vector *cg_p, *cg_mp, *cg_res, *cg_temp; +float *cg_p_32, *cg_mp_32, *cg_res_32, *cg_temp_32; +int cg_ishift; + +void malloc_cg64() +{ + MEMALIGN(cg_p,wilson_vector,sites_on_node); + MEMALIGN(cg_mp,wilson_vector,sites_on_node); + MEMALIGN(cg_res,wilson_vector,sites_on_node); + MEMALIGN(cg_temp,wilson_vector,sites_on_node); +} +void free_cg64() +{ + FREE(cg_p,wilson_vector,sites_on_node); + FREE(cg_mp,wilson_vector,sites_on_node); + FREE(cg_res,wilson_vector,sites_on_node); + FREE(cg_temp,wilson_vector,sites_on_node); +} +void malloc_cg32() +{ + MEMALIGN(cg_p_32,float,24*sites_on_node); + MEMALIGN(cg_mp_32,float,24*sites_on_node); + MEMALIGN(cg_res_32,float,24*sites_on_node); + MEMALIGN(cg_temp_32,float,24*sites_on_node); +} +void free_cg32() +{ + 
FREE(cg_p_32,float,24*sites_on_node); + FREE(cg_mp_32,float,24*sites_on_node); + FREE(cg_res_32,float,24*sites_on_node); + FREE(cg_temp_32,float,24*sites_on_node); +} + +/* 32 bit CG. starts from 0 */ +/* wvec: start contains src, end contains dest */ +/* cg_p_32: search direction has to be set! */ +double global_res; +int congrad_32(float *wvec, double myrsqmin, int maxniter, double shift) +{ + int it = 0, i, j; /*counter for iterations */ + site *s; + double flp=0; + double alpha, mybeta; + + double source_norm; + double rsq, oldrsq, pkp; + + double mytime; + mytime = -dclock( ); + + malloc_cg32(); + + /* r=res_32, rsq=|r|^2, source_norm=rsq */ + rsq = 0; + FORALLSITES( i, s ) + { + for (j=0;j<24;j++) + { + cg_p_32[24*i+j]=cg_res_32[24*i+j]=wvec[24*i+j]; + rsq+=(double)(cg_res_32[24*i+j]*cg_res_32[24*i+j]); + wvec[24*i+j]=0.0; + } + } + g_doublesum_KE( &rsq ); + source_norm=rsq; + if( rsq <= myrsqmin ) + { + goto end; + } + for ( it = 0; it < maxniter; it++ ) + { + /* rsq -> oldrsq */ + oldrsq = rsq; + + /* mp = M+M*p, pkp = p*mp */ + multiply_fmat_32( cg_p_32, cg_temp_32, 1 ); + multiply_fmat_32( cg_temp_32, cg_mp_32, -1 ); + + if (cg_ishift) + { + latutil_axpy_32((float)shift,cg_p_32,cg_mp_32,EVENANDODD); + } + pkp=latutil_rdot_32(cg_p_32,cg_mp_32,EVENANDODD); + g_doublesum_KE( &pkp ); + + /* mybeta = rsq/pkp */ + mybeta = rsq / pkp; + + /* dest += mybeta*p */ + latutil_axpy_32((float)mybeta,cg_p_32,wvec,EVENANDODD); + + /* r -= mybeta*mp */ + rsq=latutil_axpy_nrm2_32(-(float)mybeta,cg_mp_32,cg_res_32,EVENANDODD); + g_doublesum_KE( &rsq ); + + /* alpha = rsq/oldrsq */ + alpha = rsq / oldrsq; + + /* p = r+alpha*p */ + latutil_xpay_32(cg_res_32,(float)alpha,cg_p_32,EVENANDODD); + +/* do not end the iteration */ +/* if( rsq <= source_norm*myrsqmin ) */ +/* goto end; */ + + if (it%200==0) + node0_fprintf(file_o1,"congrad_32: %d prec= %e\n", + it,sqrt(rsq/source_norm)); + + } + verbose_fprintf( file_o1, "congrad_32: maxiter= %d reached prec= %e\n", + 
maxniter,sqrt(rsq/source_norm)); + +end: + mytime += dclock( ); + /* flops in the infinit vol limit */ + flp = 2.0*1320.0+ + 48.0+ + 24.0+ + 48.0+ + 72.0+ + 48.0; + if (cg_ishift) + flp += 48.0; + flp *= 1.0*it; + flp += 24.0; + flp *= 1.0*sites_on_node; + verbose_fprintf( file_o1, "congrad_32: it= %d\t%.3g sec\t%.3g GFlop/s/thread (%e Flop)\n", + it, mytime, flp / mytime / 1.e9, flp ); + + free_cg32(); + return it; +} + + +/* original cg */ +int congrad_64( wilson_vector *src, wilson_vector *dest, int maxniter, double myrsqmin, double shift) +{ + int it = 0, i; /*counter for iterations */ + site *s; + double alpha, mybeta; + + double source_norm; + double rsqstop; + double rsq, oldrsq, pkp; + double flp; + + double mytime; + mytime = -dclock( ); + int nmatmul=0; + + malloc_cg64(); + + + /* shifted version */ + double absshift; + cg_ishift=0; + absshift=(shift>=0) ? (shift) : (-shift); + if (absshift>DBL_EPSILON) + { + cg_ishift=1; + } + + multiply_fmat( dest, cg_temp, 1 ); + multiply_fmat( cg_temp, cg_mp, -1 ); + + /*r=p=src-(M+M)*dest, rsq=|r|^2, source_norm=|src|^2 */ + source_norm = rsq = 0; + FORALLSITES( i, s ) + { + if (cg_ishift) + scalar_mult_add_wvec( &( cg_mp[i] ), &( dest[i] ), shift, &( cg_mp[i] ) ); + + sub_wilson_vector( &( src[i] ), &( cg_mp[i] ), &( cg_res[i] ) ); + copy_wvec( &( cg_res[i] ), &( cg_p[i] ) ); + + rsq += ( double ) magsq_wvec( &( cg_res[i] ) ); + source_norm += ( double ) magsq_wvec( &( src[i] ) ); + } + g_doublesum_KE( &rsq ); + g_doublesum_KE( &source_norm ); + +/* do not end the iteration */ + rsqstop = myrsqmin * source_norm; +/* if( rsq <= rsqstop || source_norm <= myrsqmin ) */ +/* { */ +/* goto end; */ +/* } */ + + it=0; + while (/*rsq>rsqstop &&*/ (nmatmul)<2*maxniter) + { + /* // rsq -> oldrsq */ + oldrsq = rsq; + + /* // mp = M+M*p, pkp = p*mp */ + multiply_fmat( cg_p, cg_temp, 1 ); + multiply_fmat( cg_temp, cg_mp, -1 ); + nmatmul+=2; + + pkp = 0.0; + if (cg_ishift) + { + FORALLSITES( i, s ) + { + scalar_mult_add_wvec( &( 
cg_mp[i] ), &( cg_p[i] ), shift, &( cg_mp[i] ) ); + pkp += ( double ) wvec_rdot( &( cg_p[i] ), &( cg_mp[i] ) ); + } + } + else + { + FORALLSITES( i, s ) + { + pkp += ( double ) wvec_rdot( &( cg_p[i] ), &( cg_mp[i] ) ); + } + } + g_doublesum_KE( &pkp ); + mybeta = rsq / pkp; + + /* // dest += mybeta*p */ + FORALLSITES( i, s ) + { + scalar_mult_add_wvec( &( dest[i] ), &( cg_p[i] ), mybeta, &( dest[i] ) ); + } + + /* // r -= mybeta*mp */ + rsq = 0.; + FORALLSITES( i, s ) + { + scalar_mult_add_wvec( &( cg_res[i] ), &( cg_mp[i] ), -mybeta, &( cg_res[i] ) ); + rsq += ( double ) magsq_wvec( &( cg_res[i] ) ); + } + g_doublesum_KE( &rsq ); + + alpha = rsq / oldrsq; + + /* // p = r+alpha*p */ + FORALLSITES( i, s ) + { + scalar_mult_add_wvec( &( cg_res[i] ), &( cg_p[i] ), alpha, &( cg_p[i] ) ); + } + if (it%200==0) + node0_fprintf(file_o1,"congrad_orig: %d prec= %e\n", + it,sqrt(rsq/source_norm)); + + + it++; + } + + if ((nmatmul)>=2*maxniter) + node0_fprintf( file_o1, "WARNING congrad_orig: not converged after it=%i: mvm= %d %e > %e\n", + it, nmatmul, sqrt( rsq / source_norm ), sqrt( myrsqmin ) ); + +end: + if( source_norm <= myrsqmin ) + { + FORALLSITES( i, s ) clear_wvec( &dest[i] ); + } + + /* timing */ + mytime += dclock( ); + + if (!cg_ishift) + node0_fprintf(file_o1,"congrad_orig: end %d prec= %e mvm= %d time= %.3g\n", + it,sqrt(rsq/source_norm),nmatmul,mytime); + + /* // flops in the infinit vol limit */ + flp = 2.0*1320.0 + +24.0 + +48.0 + +48.0 + +36.0 + +48.0; + if (cg_ishift) + flp += 48.0; + flp *= 1.0*it; + flp += 2.0*1320.0 + + 48.0 + + 24.0 + + 36.0 + + 36.0; + flp *= 1.0*sites_on_node; + verbose_fprintf( file_o1, "congrad_orig: it= %d\t%.3g sec\t%.3g GFlop/s/thread (%e Flop)\n", + it, mytime, flp / mytime / 1.e9, flp ); + + free_cg64(); + + return nmatmul; +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/control.c b/qcd/part_cpu/applications/QCD/src/kernel_E/control.c new file mode 100644 index 
0000000000000000000000000000000000000000..23b76a5a7e9b6d72c890183fd401f59012ce0943 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/control.c @@ -0,0 +1,79 @@ +/************************ control.c ******************************/ +/* MIMD version 6 */ +/* Main procedure for SU3 with dynamical Wilson fermions */ + +#define CONTROL + +#include "./include/includes.h" + +/* JuBE */ +void kernel_e( ) +{ + +/* JuBE */ + int jube_kernel_number = 4; + jube_kernel_init(&jube_kernel_number); + + debug(); +/* JuBE */ +/* MPI_Init(&argc, &argv); */ + debug(); +/* JuBE: no args needed */ + initialize_machine_KE(); + debug(); + g_sync_KE( ); + +/* JuBE: std para file kernel_E.input */ + setup_KE( ); + + /* Measure performance */ + { + wilson_vector *latwvec_a, *latwvec_b; + int i,j; site *s; + float *vec_a; + int iters; + + MEMALIGN(latwvec_a,wilson_vector, sites_on_node); + MEMALIGN(latwvec_b,wilson_vector, sites_on_node); + MEMALIGN(vec_a,float,24*sites_on_node); + + /* 64 bit vectors*/ + unit_wvec(latwvec_a,EVENANDODD); + clear_latwvec(latwvec_b,EVENANDODD); + + /* 32 bit vectors*/ + FORALLSITES( i, s) + { + for (j=0;j<24;j++) + { + if (j%2){ + vec_a[24*i+j]=0.0; + }else{ + vec_a[24*i+j]=1.0; + } + } + } + + int congrad_32(float *wvec, double myrsqmin, int maxniter, double shift); + int congrad_64( wilson_vector *src, wilson_vector *dest, int maxniter, double myrsqmin, double shift); + +/* JuBE: */ + jube_kernel_run(); + + iters = congrad_32(vec_a, 1e-8, max_cg_iters, 0.0); + iters = congrad_64(latwvec_a, latwvec_b, max_cg_iters, 1e-16, 0.0); + +/* JuBE: */ + jube_kernel_finalize(); + + FREE(latwvec_a,wilson_vector, sites_on_node); + FREE(latwvec_b,wilson_vector, sites_on_node); + FREE(vec_a,float,24*even_sites_on_node); + } + +/* JuBE: */ + jube_kernel_end(); + + +/* return 0; */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/d_plaq4.c b/qcd/part_cpu/applications/QCD/src/kernel_E/d_plaq4.c new file mode 100644 index 
0000000000000000000000000000000000000000..a3f595fe80ee342a21a92e1c14d379feef95e2c0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/d_plaq4.c @@ -0,0 +1,74 @@ +/************************** d_plaq4.c *******************************/ +/* MIMD version 6 */ +/* This version mallocs the temporary su3_matrix */ + +/* Double precision version of "plaquette4.c" including optional + Schroedinger functional - UMH - 1/27/00 */ + +/* Measure the average plaquette of the space-space and + space-time plaquettes */ + +#include "./include/includes.h" + +void d_plaquette( double *ss_plaq, double *st_plaq ) +{ + /* su3mat is scratch space of size su3_matrix */ + register int i, dir1, dir2; + register site *s; + register su3_matrix *m1, *m4; + static su3_matrix mtmp __attribute__ ( ( aligned( 64 ) ) ); + su3_matrix *su3_gath; + double ss_sum, st_sum; + msg_tag *mtag0, *mtag1; + ss_sum = st_sum = 0.0; + + MEMALIGN(su3_gath, su3_matrix, sites_on_node); + + for ( dir1 = YUP; dir1 <= TUP; dir1++ ) + { + for ( dir2 = XUP; dir2 < dir1; dir2++ ) + { + + mtag0 = start_gather_from_temp( &( gauge[dir2] ), sizeof( su3_matrix ), + 4 * sizeof( su3_matrix ), dir1, EVENANDODD, gen_pt[0] ); + mtag1 = start_gather_from_temp( &( gauge[dir1] ), sizeof( su3_matrix ), + 4 * sizeof( su3_matrix ), dir2, EVENANDODD, gen_pt[1] ); + + FORALLSITES( i, s ) + { + m1 = &( gauge[4 * i + dir1] ); + m4 = &( gauge[4 * i + dir2] ); + mult_su3_an_KE( m4, m1, &su3_gath[i] ); + } + wait_gather_KE( mtag0 ); + wait_gather_KE( mtag1 ); + + + FORALLSITES( i, s ) + { + mult_su3_nn_KE( &su3_gath[i], ( su3_matrix * ) ( gen_pt[0][i] ), &mtmp ); + if( dir1 == TUP ) + st_sum += ( double ) realtrace_su3_KE( ( su3_matrix * ) ( gen_pt[1][i] ), &mtmp ); + + else + ss_sum += ( double ) realtrace_su3_KE( ( su3_matrix * ) ( gen_pt[1][i] ), &mtmp ); + /* plaquette in usual definition */ + /* if(dir1==TUP )st_sum += 1.0-1./3.*(double) + realtrace_su3_KE((su3_matrix *)(gen_pt[1][i]),&mtmp); + else ss_sum += 1.0-1./3.*(double) + 
realtrace_su3_KE((su3_matrix *)(gen_pt[1][i]),&mtmp); + */ + } + + cleanup_gather( mtag0 ); + cleanup_gather( mtag1 ); + } + } + g_doublesum_KE( &ss_sum ); + g_doublesum_KE( &st_sum ); + *ss_plaq = ss_sum / ( ( double ) ( 3 * nx * ny * nz * nt ) ); + *st_plaq = st_sum / ( ( double ) ( 3 * nx * ny * nz * nt ) ); + node0_fprintf( file_o1, "d_plaquette: %.15e\t%.15e\n", *ss_plaq, *st_plaq ); + + FREE(su3_gath, su3_matrix, sites_on_node); +} /* d_plaquette4 */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/fermion_stuff.c b/qcd/part_cpu/applications/QCD/src/kernel_E/fermion_stuff.c new file mode 100644 index 0000000000000000000000000000000000000000..35494a52193677ec40bd8021f189ea4a28c756c0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/fermion_stuff.c @@ -0,0 +1,334 @@ +#include "./include/includes.h" +/* create gaussian random wilson vector */ +void grand_wvec( wilson_vector * chi, int parity ) +{ + register int i, j, k; + register site *s; + FORSOMEPARITY( i, s, parity ) + { + for ( k = 0; k < 4; k++ ) + for ( j = 0; j < 3; j++ ) + { +#ifdef FUNNY + chi[i].COLORSPINOR( j, k ).real = ( double ) ( ( s->x ) * ( s->t ) ) / 137.0 - k * 0.1 + 0.2; + chi[i].COLORSPINOR( j, k ).imag = -( double ) ( ( s->y ) + ( s->z ) ) / 42.0 + j * 0.2 - 0.1; +#else + chi[i].COLORSPINOR( j, k ).real = gaussian_rand_no_KE( ); + chi[i].COLORSPINOR( j, k ).imag = gaussian_rand_no_KE( ); +#endif + } + } +} + +/* all real element is 1 */ +void unit_wvec(wilson_vector *wvec, int parity) +{ + int i; + site *s; + FORSOMEPARITY(i,s,parity) + { + clear_wvec(wvec+i); + (wvec+i)->COLORSPINOR(0,0).real=1.0; + (wvec+i)->COLORSPINOR(1,0).real=1.0; + (wvec+i)->COLORSPINOR(2,0).real=1.0; + (wvec+i)->COLORSPINOR(0,1).real=1.0; + (wvec+i)->COLORSPINOR(1,1).real=1.0; + (wvec+i)->COLORSPINOR(2,1).real=1.0; + (wvec+i)->COLORSPINOR(0,2).real=1.0; + (wvec+i)->COLORSPINOR(1,2).real=1.0; + (wvec+i)->COLORSPINOR(2,2).real=1.0; + (wvec+i)->COLORSPINOR(0,3).real=1.0; + 
(wvec+i)->COLORSPINOR(1,3).real=1.0; + (wvec+i)->COLORSPINOR(2,3).real=1.0; + } +} + +/* create gaussian random wilson vector */ +void clear_latwvec( wilson_vector * chi, int parity ) +{ + register int i, j, k; + register site *s; + FORSOMEPARITY( i, s, parity ) + { + for ( k = 0; k < 4; k++ ) + for ( j = 0; j < 3; j++ ) + { + chi[i].COLORSPINOR( j, k ).real = 0.0; + chi[i].COLORSPINOR( j, k ).imag = 0.0; + } + } +} + +/* funny vector */ +void funny_wvec( wilson_vector * chi ) +{ + register int i, j, k; + register site *s; + FORALLSITES( i, s ) + { + for ( k = 0; k < 4; k++ ) + for ( j = 0; j < 3; j++ ) + { + chi[i].COLORSPINOR( j, k ).real = ( double ) ( ( s->x ) * ( s->t ) ) / 137.0 - k * 0.1 + 0.2; + chi[i].COLORSPINOR( j, k ).imag = -( double ) ( ( s->y ) + ( s->z ) ) / 42.0 + j * 0.2 - 0.1; + } + + } +} + + +/* some linear algebra */ +void latutil_xpay_32(float * x, float a, float * y, int parity) +{ + int i, lat_begin, lat_end; + + lat_begin = 0; + if( parity == ODD ) + lat_begin = even_sites_on_node*24; + lat_end = sites_on_node*24; + if( parity == EVEN ) + lat_end = even_sites_on_node*24; + for (i=lat_begin;i +#endif + +#define MAX_GATHERS 32 /* Maximum number of gather tables */ + +/* arguments to the make_gather() routine */ +#define FORWARDS 1 +#define BACKWARDS (-1) /* BACKWARDS = -FORWARDS */ +#define OWN_INVERSE 0 +#define WANT_INVERSE 1 +#define NO_INVERSE 2 +#define ALLOW_EVEN_ODD 0 +#define NO_EVEN_ODD 1 +#define SAME_PARITY 0 +#define SWITCH_PARITY 1 +#define SCRAMBLE_PARITY 2 + +#ifdef COMM_GB +/* communication types */ +#define COMM_CPU1 0 +#define COMM_CPU2 1 +#define COMM_ROW 2 +#define COMM_COL 3 +#define COMM_ROWCOL 4 +#endif + +/* Structure to keep track of outstanding sends and receives */ +typedef struct +{ + int size; /* size of each element of the field */ + int n; /* number of sites to be sent/received */ + char *sbuf; /* send buffer */ + char *rbuf; /* receive buffer */ + char *rbuf_temp; /* temporary receive buffer */ + char **ptr; 
/* pointers to fields to speed up restart_gather */ +#ifdef COMM_MPI + MPI_Request srequest, rrequest; +#endif + int done; +} msg_tag; + + + + +/**********************************************************************/ +/* Declarations for all routines called in the com_*.c files */ + +void start_handlers( ); + +/* JuBE: no args needed */ +void initialize_machine_KE(); + +void make_nn_gathers( ); +void sort_eight_special( void **pt ); + +void neighbor_coords_special( int x, int y, int z, int t, /* coordinates of site */ + int *dirpt, /* direction (eg XUP) */ + int fb, /* "forwards/backwards" */ + int *x2p, int *y2p, int *z2p, int *t2p ); + /* pointers to coordinates of neighbor */ +int make_gather( void ( *func ) ( int, int, int, int, int *, int, int *, int *, int *, int * ), + /* function which defines sites to gather from */ + int *args, /* list of arguments, to be passed to function */ + int inverse, /* OWN_INVERSE, WANT_INVERSE, or NO_INVERSE */ + int want_even_odd, /* ALLOW_EVEN_ODD or NO_EVEN_ODD */ + int parity_conserve ); /* {SAME,SWITCH,SCRAMBLE}_PARITY */ + +void neighbor_coords( int x, int y, int z, int t, /* coordinates of site */ + int dir, /* direction (eg XUP) */ + int *x2p, int *y2p, int *z2p, int *t2p ); + /* pointers to coordinates of neighbor */ +msg_tag *start_gather_from_temp( + /* arguments */ + void *field, /* which field? pointer returned by malloc() */ + int size, /* size in bytes of the field (eg sizeof(su3_vector)) */ + int dist, int index, /* direction to gather from. eg XUP - index into + neighbor tables */ + int parity, /* parity of sites whose neighbors we gather. + one of EVEN, ODD or EVENANDODD. */ + char **dest ); /* one of the vectors of pointers */ + +void restart_gather_from_temp( + /* arguments */ + void *field, /* which field? pointer returned by malloc() */ + int size, /* size in bytes of the field (eg sizeof(su3_vector)) */ + int dist, int index, /* direction to gather from. 
eg XUP - index into + neighbor tables */ + int parity, /* parity of sites whose neighbors we gather. + one of EVEN, ODD or EVENANDODD. */ + char **dest, /* one of the vectors of pointers */ + msg_tag * mbuf ); /* previously returned by start_gather */ + +void wait_gather_KE( msg_tag * mbuf ); +void cleanup_gather( msg_tag * mbuf ); + +msg_tag *start_general_gather_from_temp( + /* arguments */ + void *field, /* which field? Some member of structure "site" */ + int size, /* size in bytes of the field (eg sizeof(su3_vector)) */ + int dist, /* separation */ + int *displacement, /* displacement to gather from. four components */ + int parity, /* parity of sites to which we gather. + one of EVEN, ODD or EVENANDODD. */ + char **dest ); /* one of the vectors of pointers */ + +void wait_general_gather( msg_tag * mbuf ); +void cleanup_general_gather( msg_tag * mbuf ); + +void node0_printf( const char *fmt, ... ); +void verbose_fprintf( FILE * file, const char *fmt, ... ); +void node0_fprintf( FILE * file, const char *fmt, ... 
); + +char *machine_type_KE( ); +int mynode_KE( ); +int numnodes_KE( ); +void numnodes2( int *x, int *y ); +void numnodes3( int *x, int *y, int *z ); +#ifdef COMM_GB +void mynode3( int *n_x, int *n_y, int *n_z ); +#endif + +void g_sync_KE( ); +void g_doublesum_KE( double *dpt ); +void g_vecdoublesum( double *dpt, int ndoubles ); +void g_complexsum( complex * cpt ); +void g_veccomplexsum( complex * cpt, int ncomplex ); +void g_wvectorsum( wilson_vector * wvpt ); +void g_doublemax_KE( double *dpt ); +void broadcast_double_KE( double *dpt ); +void broadcast_char( char *buf, int size ); +void broadcast_bytes( char *buf, int size ); +void broadcast_int_KE( int *buf ); +void collect_bytes( char *buf, char *res, int size ); +void gen_broadcast_bytes( char *buf, int size, int node ); +void send_integer( int tonode, int *address ); +void receive_integer( int *address ); + +/* On the Paragon dclock is a library routine with the + same functionality as ours */ +/* Either way, it needs to be declared double */ +double dclock( ); +void time_stamp( char *msg ); + +void terminate_KE( int status ); +long long get_totcomm( ); + +#endif /* _COMDEFS_H */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/include/complex.h b/qcd/part_cpu/applications/QCD/src/kernel_E/include/complex.h new file mode 100644 index 0000000000000000000000000000000000000000..73881d8167698493e788d76481bbb5dd1c1f712f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/include/complex.h @@ -0,0 +1,88 @@ +#ifndef _COMPLEX_H +#define _COMPLEX_H + +#include "../include/machine.h" + +typedef struct +{ + double real; + double imag; +} complex; + + +/* Function Prototypes for Complex Numbers */ +complex cmplx_KE( double x, double y ); +complex cadd_KE( complex * a, complex * b ); +complex cmul_KE( complex * a, complex * b ); +complex csub( complex * a, complex * b ); +complex cdiv_KE( complex * a, complex * b ); +complex conjg_KE( complex * a ); +complex cexp_milc( complex * a ); +complex clog_milc( 
complex * a ); +complex csqrt_milc( complex * z ); +complex ce_itheta_KE( double theta ); + +/* Macros for Complex Numbers */ +#define set_complex_equal(a,b) { (*b).real=(*a).real; (*b).imag=(*a).imag; } +/* |*a| */ +#define cabs(a) (sqrt( (*a).real*(*a).real + (*a).imag*(*a).imag ) ) +/* *a * *a* */ +#define dcabs cabs +#define cabs_sq(a) ( (*a).real*(*a).real + (*a).imag*(*a).imag ) +/* phase(*a) */ +#define carg(a) (atan2((double)(*a).imag, (double)(*a).real ) ) +/* b = a* */ +#define dcarg carg +#define CONJG(a,b) { (b).real = (a).real; (b).imag = -(a).imag; } +/* c = a + b */ +#define CADD(a,b,c) { (c).real = (a).real + (b).real; \ + (c).imag = (a).imag + (b).imag; } + /* a += b */ +#define CSUM(a,b) { (a).real += (b).real; (a).imag += (b).imag; } + /* c = a - b */ +#define CSUB(a,b,c) { (c).real = (a).real - (b).real; \ + (c).imag = (a).imag - (b).imag; } + /* c = a * b */ +#define CMUL(a,b,c) { (c).real = (a).real*(b).real - (a).imag*(b).imag; \ + (c).imag = (a).real*(b).imag + (a).imag*(b).real; } + /* c = a / b */ +#define CDIV(a,b,c) { double t = (b).real*(b).real + (b).imag*(b).imag; \ + (c).real = ((a).real*(b).real + (a).imag*(b).imag)/t; \ + (c).imag = ((a).imag*(b).real - (a).real*(b).imag)/t; } + /* c = a * b* */ +#define CMUL_J(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ + (c).imag = (a).imag*(b).real - (a).real*(b).imag; } + /* c = a* * b */ +#define CMULJ_(a,b,c) { (c).real = (a).real*(b).real + (a).imag*(b).imag; \ + (c).imag = (a).real*(b).imag - (a).imag*(b).real; } + /* c = (a*b)* */ +#define CMULJJ(a,b,c) { (c).real = (a).real*(b).real - (a).imag*(b).imag; \ + (c).imag = -(a).real*(b).imag - (a).imag*(b).real; } + /* b = - a */ +#define CNEGATE(a,b) { (b).real = -(a).real; (b).imag = -(a).imag; } + /* b = ia */ +#define CMUL_I(a,b) { (b).real = -(a).imag; (b).imag = (a).real; } + /* b = -ia */ +#define CMUL_MINUS_I(a,b) { (b).real = (a).imag; (b).imag = -(a).real; } + /* c = ba */ +#define CMULREAL(a,b,c) { (c).real = (b) * 
(a).real; (c).imag = (b)*(a).imag; } + /* c = a/b */ +#define CDIVREAL(a,b,c) { (c).real = (a).real/(b); (c).imag = (a).imag/(b); } + /* c = a + b, with b real (imaginary part of a is copied unchanged) */ +#define CADDREAL(a,b,c) { (c).real = (a).real+(b); (c).imag = (a).imag; } + + /* a += i*b */ +#define CSUM_TPI(a,b) { (a).real -= (b).imag; (a).imag += (b).real; } + + /* a += -i*b */ +#define CSUM_TMI(a,b) { (a).real += (b).imag; (a).imag -= (b).real; } + +#define CABSSQ(c) (((c).real*(c).real+(c).imag*(c).imag)) +#define CABS(c) (sqrt((c).real*(c).real+(c).imag*(c).imag)) +/* c += a*b */ +#define CMULSUM(a,b,c) { (c).real += (a).real*(b).real - (a).imag*(b).imag; \ + (c).imag += (a).real*(b).imag + (a).imag*(b).real; } +/* c += conj(a)*b */ +#define CMULJ_SUM(a,b,c) { (c).real += (a).real*(b).real + (a).imag*(b).imag; \ + (c).imag += (a).real*(b).imag - (a).imag*(b).real; } +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/include/config.h b/qcd/part_cpu/applications/QCD/src/kernel_E/include/config.h new file mode 100644 index 0000000000000000000000000000000000000000..b6c5ec2725281cf8e3ade163eeda5a4be7378025 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/include/config.h @@ -0,0 +1,49 @@ +#ifndef _CONFIG_H +#define _CONFIG_H +/* config.h. For now, NOT generated automatically by configure. 
*/ + +/* Collects macros for preprocessor tweaks that accommodate + differences in compilers, architecture and OS */ + +/********************************************************************/ +/* Compiler/Processor-dependent macros */ +/********************************************************************/ + +/* Specify the unsigned 32 bit integer base type for this compiler */ +/* Run the script "getint.sh" to find out what to use */ +/* One and only one of these should be defined */ +#define INT_IS_32BIT 1 /* Most present systems */ +#undef SHORT_IS_32BIT /* Needed on T3E UNICOS, for example */ + +/* Define if the target processor has native double precision */ +/* (For some library routines, gives slightly better performance) */ +/* Systems that do: IBM SP. Most do not. */ +#undef NATIVEDOUBLE + +/* Define if the cache line is 64 bytes (if not, we assume 32 bytes). */ +/* Processors that do: P4 (actually fetches 128), EV67, EV68 */ +/* Used only for prefetching, so it only affects performance */ +#undef HAVE_64_BYTE_CACHELINE + +/********************************************************************/ +/* Compiler/OS-dependent macros */ +/********************************************************************/ + +/* Define if you have the <ieeefp.h> header file. */ +/* Systems that don't: T3E UNICOS, Exemplar, Linux gcc, SP AIX, HP/Compaq True64 */ +/* #define HAVE_IEEEFP_H 1 */ + +/* Define if you have the <unistd.h> header file. */ +/* Systems that don't: NT */ +#define HAVE_UNISTD_H 1 + +/* Define if you have the <sys/time.h> header file. 
*/ +/* Most systems do */ +#define HAVE_SYS_TIME_H 1 + +/* Define if you have ANSI "fseeko" */ +/* #undef HAVE_FSEEKO */ +/* Systems that don't: T3E UNICOS */ +#define HAVE_FSEEKO 1 + +#endif /* _CONFIG_H */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/include/dirs.h b/qcd/part_cpu/applications/QCD/src/kernel_E/include/dirs.h new file mode 100644 index 0000000000000000000000000000000000000000..d619f935c1b5d927c9ec4af1d2141c2cdc1aa53b --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/include/dirs.h @@ -0,0 +1,22 @@ +#ifndef _DIRS_H +#define _DIRS_H + +/* Directions, and a macro to give the opposite direction */ +/* These must go from 0 to 7 because they will be used to index an + array. */ +/* Also define NDIRS = number of directions */ +#define XUP 0 +#define YUP 1 +#define ZUP 2 +#define TUP 3 +#define TDOWN 4 +#define ZDOWN 5 +#define YDOWN 6 +#define XDOWN 7 + +#define NODIR -1 /* not a direction */ + +#define OPP_DIR(dir) (7-(dir)) /* Opposite direction */ +#define NDIRS 8 /* number of directions */ + +#endif /* _DIRS_H */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/include/generic.h b/qcd/part_cpu/applications/QCD/src/kernel_E/include/generic.h new file mode 100644 index 0000000000000000000000000000000000000000..de5342ca40c605e9f948ac265d38ee61ebe1871a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/include/generic.h @@ -0,0 +1,102 @@ +#ifndef _GENERIC_H +#define _GENERIC_H +/************************ generic.h ************************************* +* * +* Macros and declarations for miscellaneous generic routines * +* This header is for codes that call generic routines * +* MIMD version 6 * +* * +*/ + +/* Other generic directory declarations are elsewhere: + + For com_*.c, see comdefs.h + For io_lat4.c io_ansi.c, io_nonansi.c, io_piofs.c, io_romio.c see io_lat.h + For io_wb3.c, see io_wb.h + */ + +#include "../include/int32type.h" +#include "../include/complex.h" +#include "../include/macros.h" +#include 
"../include/random.h" + + +/* ax_gauge.c */ +void ax_gauge( ); +void w_loop1( int nsmear ); +void w_loop2( int nsmear ); + +/* check_unitarity.c */ +double check_unitarity( void ); + +/* d_plaq?.c */ +void d_plaquette( double *ss_plaq, double *st_plaq ); + +/* gaugefix.c and gaugefix2.c */ +void gaugefix( int gauge_dir, double relax_boost, int max_gauge_iter, double gauge_fix_tol ); + +/* gauge_stuff.c */ +void dsdu_qhb_subl( int dir, int subl ); + +/* layout_*.c */ +void setup_layout_KE( void ); +int node_number_KE( int x, int y, int z, int t ); +int node_index_KE( int x, int y, int z, int t ); +int num_sites( int node ); + +/* make_lattice.c */ +void make_lattice( ); +int site_mu( int i, int mu ); +int taxi_dist( int j ); + +/* make_global_fields.c */ +void make_global_fields( ); + +/* plaquette4.c */ +void plaquette( double *ss_plaq, double *st_plaq ); + +/* ploop?.c */ +complex ploop( void ); +complex ploop_dir( int dir ); + +/* ploop_staple.c */ +complex ploop_staple( double alpha_fuzz ); + +/* project_su3_hit.c */ +void project_su3( su3_matrix * w, /* input initial guess. output resulting + SU(3) matrix */ + su3_matrix * q, /* starting 3 x 3 complex matrix */ + int Nhit, /* number of SU(2) hits. 0 for no projection */ + double tol /* tolerance for SU(3) projection. + If nonzero, treat Nhit as a maximum + number of hits. If zero, treat Nhit + as a prescribed number of hits. 
*/ + ); + +/* rand_gauge.c */ +void rand_gauge( field_offset G ); + +/* ranmom.c */ +void ranmom( void ); + +/* ranstuff.c */ +double myrand( ); + +/* reunitarize2.c */ +void reunitarize( void ); +int reunit_su3( su3_matrix * c ); + +/* smearing.c */ +void ape_smearing( double smear_fac ); + +/* hyp_smearing.c */ +void malloc_hyp( ); +void free_hyp( ); +void ape_block_det( int NumStp ); + +/* exp_smearing.c */ +void malloc_stout(); +void stout_smear_main(); +void free_stout(); + +#endif /* _GENERIC_H */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/include/generic_wilson.h b/qcd/part_cpu/applications/QCD/src/kernel_E/include/generic_wilson.h new file mode 100644 index 0000000000000000000000000000000000000000..10259519e23e6a7543bd97298c2dbb645f492fe2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/include/generic_wilson.h @@ -0,0 +1,48 @@ +#ifndef _GENERIC_WILSON_H +#define _GENERIC_WILSON_H +/************************ generic_wilson.h ****************************** + * * + * Macros and declarations for generic_wilson routines * + * This header is for codes that call generic_wilson routines * + * MIMD version 6 * + * * + */ + +#include "../include/su3.h" +#include "../include/macros.h" + +/* matrix multiplications */ +void multiply_fmat( wilson_vector * src, wilson_vector * dest, int isign ); +void dslash( wilson_vector * src, wilson_vector * dest, int isign, int parity ); + +void latutil_cheb(double *tmp1, double *d, double *dd, double *qsrc, + double f1, double f2, double fch); + + +/* other */ +void unit_wvec(wilson_vector *wvec, int parity); +void grand_wvec( wilson_vector * chi, int parity ); +void clear_latwvec( wilson_vector * chi, int parity ); +void funny_wvec( wilson_vector * chi ); +int setup_KE( ); +int readin( int prompt ); +void meas_perf(); + +/* single precision stuff */ +void malloc_32bit(); +void free_32bit(); +void multiply_fmat_32( float * src, float * dest, int isign ); +void dslash_32( float *src, float *dest, int isign, 
int parity ); +void convert_gauge(); +void convert_wvec(wilson_vector *src, float *dest_32); +void latutil_cheb_32(float *tmp1, float *d, float *dd, float *qsrc, + float f1, float f2, float fch); +void latutil_xpay_32(float * x, float a, float * y, int parity); +void latutil_axpy_32(float a, float * x, float * y, int parity); +void latutil_5xpay_32(float * x, float a, float * y, int parity); +double latutil_rdot_32(float * x, float * y, int parity); +complex latutil_dot_32(float * x, float * y, int parity); +double latutil_axpy_nrm2_32(float a, float * x, float * y, int parity); +double latutil_nrm2_32(float * x, int parity); + +#endif /* _GENERIC_WILSON_H */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/include/includes.h b/qcd/part_cpu/applications/QCD/src/kernel_E/include/includes.h new file mode 100644 index 0000000000000000000000000000000000000000..957b985f1e88df0c815af309ba03eec150ac6eff --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/include/includes.h @@ -0,0 +1,33 @@ +/************************ generic_wilson_includes.h *********************/ +/****************** wi_dyn_includes.h ***********************************/ +/************************ generic_includes.h ****************************/ +#include "config.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "complex.h" +#include "su3.h" +#include "dirs.h" +#include "macros.h" +#include "generic.h" +#include "generic_wilson.h" +#include "comdefs.h" + +#ifdef COMM_MPI +#include +#endif + +#include "int32type.h" +#include +#include +#include "lattice.h" +#include "io_lat.h" +#include +#include diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/include/int32type.h b/qcd/part_cpu/applications/QCD/src/kernel_E/include/int32type.h new file mode 100644 index 0000000000000000000000000000000000000000..99eab9147f18aa3a6338bd9b88e6d3fc66a960f4 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/include/int32type.h 
@@ -0,0 +1,33 @@ +/* Some architectures have 64-bit ints. Our binary file formats use + 32 bit integers. So we define a type se we can get 32-bit integers + on all platforms. For 64-bit architectures, the code must be + compiled with -DSHORT32. */ +/* + 7/26/01 Changed name of macro to make it more obvious CD + 4/17/98 Added an unsigned u_int32type for checksums CD + 2/26/98 Changed int32type to signed CD + */ + +#ifndef _TYPE32_H +#define _TYPE32_H + +#include "../include/config.h" + +/* One and only one should be defined */ +#if defined(SHORT_IS_32BIT) && defined(INT_IS_32BIT) +MAKE UP YOUR MIND ! !SEE config.h +#endif +#if !defined(SHORT_IS_32BIT) && !defined(INT_IS_32BIT) + MAKE UP YOUR MIND ! !SEE config.h +#endif +#ifdef SHORT_IS_32BIT +typedef short int32type; +typedef unsigned short u_int32type; + +#else + typedef int int32type; +typedef unsigned int u_int32type; + +#endif + +#endif /* _TYPE32_H */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/include/io_lat.h b/qcd/part_cpu/applications/QCD/src/kernel_E/include/io_lat.h new file mode 100644 index 0000000000000000000000000000000000000000..e273d78379093dfd9b0201facd342abd0d819563 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/include/io_lat.h @@ -0,0 +1,42 @@ +#define CONTINUE 10 +#define FRESH 11 +#define RELOAD_ASCII 12 +#define RELOAD_SERIAL 13 +#define RELOAD_SERIAL_12 17 +#define RELOAD_SERIAL_COMP 16 +#define RELOAD_PARALLEL 14 +#define LOAD_RANDOM 15 +#define RELOAD_MULTIDUMP 18 +#define FORGET 20 +#define SAVE_ASCII 21 +#define SAVE_SERIAL 22 +#define SAVE_SERIAL_COMP 25 +#define SAVE_PARALLEL 24 +#define SAVE_CHECKPOINT 23 +#define SAVE_MULTIDUMP 27 +#define SAVE_SERIAL_ARCHIVE 30 + +/* Helps in defining globals */ +#ifdef CONTROL +#define EXTERN +#else +#define EXTERN extern +#endif + +#ifdef HAVE_UNISTD_H +#include /* For "write" and "close" "off_t" */ +#endif +#include /* For "off_t" */ +#include +#include "../include/int32type.h" +#include "../include/macros.h" /* For 
MAXFILENAME */ + + +void reload_lattice( int flag, char *filename ); +int ask_starting_lattice( char *latstart, int *flag, char *filename ); +int get_f( FILE * f, char *variable_name_string, double * value ); +int get_i_KE( FILE * f, char *variable_name_string, int *value ); +int get_s_KE( FILE * f, char *variable_name_string, char *value ); +int get_hbrho( FILE * f, char *variable_name_string ); +int get_sw( FILE * f, char *variable_name_string ); +int get_totnodes( FILE * f, char *variable_name_string ); diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/include/lattice.h b/qcd/part_cpu/applications/QCD/src/kernel_E/include/lattice.h new file mode 100644 index 0000000000000000000000000000000000000000..d6fe21105d0f12c3a0cd78c5cdceca22fbdba8a2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/include/lattice.h @@ -0,0 +1,86 @@ +#ifndef _LATTICE_H +#define _LATTICE_H +/****************************** lattice.h ********************************/ + +/* include file for MIMD version 6 + This file defines global scalars and the fields in the lattice. */ + +#include "macros.h" /* For MAXFILENAME */ +#include "random.h" /* For double_prn */ +#include "io_lat.h" /* For gauge_file */ + +#include "su3.h" +#include "random.h" /* For double_prn */ + +/* The lattice is an array of sites. 
*/ +typedef struct +{ + short x, y, z, t; + char parity; + int index; +} site; + + +/* End definition of site structure */ + +/* Definition of globals */ + +#ifdef CONTROL +#define EXTERN +#else +#define EXTERN extern +#endif + +/* fields */ +EXTERN half_wilson_vector *htmp[8]; /* temporary arrays for dslash */ +EXTERN float *htmp_32[8]; +EXTERN su3_matrix *gauge; +EXTERN float *gauge_32; + +/* global scalars */ +EXTERN int startflag; /* beginning lattice: CONTINUE, RELOAD, FRESH */ +EXTERN char conf_id[256]; +EXTERN int nx, ny, nz, nt; /* lattice dimensions */ +EXTERN int volume; /* volume of lattice = nx*ny*nz*nt */ +EXTERN int verbose; /* for verbose printf */ +EXTERN int niter; +EXTERN double rsqmin; + +/* timing, memory size */ +EXTERN double time_comm; +EXTERN double memsize; +/* on fly results to output1 */ +EXTERN FILE *file_o1; + +/* analyze */ +EXTERN double mass_wilson, kappa; +EXTERN double ov_prec; +EXTERN int n_hyp_smear; +EXTERN int st_step; +EXTERN double st_rho; +EXTERN int spect_nmass; +EXTERN double *spect_mass; +EXTERN int spect_ntslice, *spect_tslice; +EXTERN int spect_allspinor, spect_spinor; + +/* Some of these global variables are node dependent */ +/* They are set in "make_lattice()" */ +EXTERN int sites_on_node; /* number of sites on this node */ +EXTERN int even_sites_on_node; /* number of even sites on this node */ +EXTERN int odd_sites_on_node; /* number of odd sites on this node */ +EXTERN int number_of_nodes; /* number of nodes in use */ +EXTERN int this_node; /* node number of this node */ + +/* The lattice is a single global variable - (actually this is the + part of the lattice on this node) */ +EXTERN site *lattice; + +/* Vectors for addressing */ +/* Generic pointers, for gather routines */ +#define N_POINTERS 8 +EXTERN char **gen_pt[N_POINTERS]; + +EXTERN int debugflag; + +EXTERN int max_cg_iters; +#endif /* _LATTICE_H */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/include/machine.h 
b/qcd/part_cpu/applications/QCD/src/kernel_E/include/machine.h new file mode 100644 index 0000000000000000000000000000000000000000..a98775f1e91ea4870c0210836e76f63dab1bbd33 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/include/machine.h @@ -0,0 +1,15 @@ +/* machine specific options */ +#ifndef _MACHINE_H +#define _MACHINE_H + +#define GENERIC 0 + +/* GENERIC */ +#if ARCH == GENERIC +#define COMM_MPI +#define COLORSPINOR(i,j) c[i].d[j] +#define ROWCOL(i,j) e[i][j] +#define LINKDIST_32 4 +#endif + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/include/macros.h b/qcd/part_cpu/applications/QCD/src/kernel_E/include/macros.h new file mode 100644 index 0000000000000000000000000000000000000000..4da81f43a8eb9f7662a43bfb1b76c666578697cc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/include/macros.h @@ -0,0 +1,162 @@ +#ifndef _MACROS_H +#define _MACROS_H + +/* Macros common to all applications */ + +/* ---------------------------------------------------------- */ +/* Constants and small arithmetic helpers. Arguments are parenthesized so expression arguments expand correctly; they may be evaluated more than once, so avoid side effects. */ + +#define SGN(a) (((a)>=0) ? (1.0) : (-1.0)) +#define PI 3.14159265358979323846 +#define MIN(a,b) (((a)>=(b)) ? (b) : (a)) + +#define MAX(x,y) ((x)>(y)? (x) :(y)) +/* ---------------------------------------------------------- */ +/* Conventions for defining checkerboard parity of inversions */ + +#define EVEN 0x02 +#define ODD 0x01 +#define EVENANDODD 0x03 + +/* ---------------------------------------------------------- */ +/* Storage constants */ + +#define MAXFILENAME 256 /* ASCII string length for all file names */ + +/* ---------------------------------------------------------- */ +/* Names of gauge fixing options */ + +#define NO_GAUGE_FIX 30 +#define COULOMB_GAUGE_FIX 31 +#define LANDAU_GAUGE_FIX 32 + +/* ---------------------------------------------------------- */ +/* "field offset" and "field pointer" */ + +/* used when fields are arguments to subroutines */ +/* Usage: fo = F_OFFSET( field ), where "field" is the name of a field + in lattice. 
+ address = F_PT( &site , fo ), where &site is the address of the + site and fo is a field_offset. Usually, the result will have to be + cast to a pointer to the appropriate type. (It is naturally a char *). + */ +typedef int field_offset; +#define F_OFFSET(a) \ + ((field_offset)(((char *)&(lattice[0]. a ))-((char *)&(lattice[0])) )) +#define F_PT( site , fo ) ((char *)( site ) + (fo)) + + /* ---------------------------------------------------------- */ + /* Macros for looping over directions */ + +#define FORALLUPDIR(dir) for(dir=XUP; dir<=TUP; dir++) + +#define FORALLUPDIRBUT(direction,dir) \ + for(dir=XUP; dir<= TUP; dir++)if(dir != direction) + +#define OPP_PAR(parity) (0x03 ^ (parity)) /* Switches EVEN and ODD. Nulls EVENANDODD. Argument is parenthesized so expressions expand correctly */ + + /* ---------------------------------------------------------- */ + /* printf on node zero only */ + /* #define node0_printf if(this_node==0)printf */ + /* #define node0_fprintf if(this_node==0)fprintf */ + + + /* ---------------------------------------------------------- */ +#define ON 1 +#define OFF 0 + + /* ---------------------------------------------------------- */ + /* Macros for looping over sites on a node */ + + /**********************************************************************/ + /* WARNING: FORSOMEPARITY and FORSOMEPARITYDOMAIN are redefined in some + routines if LOOPEND is specified. See loopend.h */ + /**********************************************************************/ + +#ifndef N_SUBL32 + /*--------------*/ + + /* Standard red-black checkerboard */ + + /* macros to loop over sites of a given parity. 
+ Usage: + int i; + site *s; + FOREVENSITES(i,s){ + commands, where s is a pointer to the current site and i is + the index of the site on the node + } + */ + +#define FOREVENSITES(i,s) \ + for(i=0,s=lattice;it > 0) +#define FORODDSITESDOMAIN(i,s) \ +FORODDSITES(i,s) if(s->t > 0) +#define FORALLSITESDOMAIN(i,s) \ +FORALLSITES(i,s) if(s->t > 0) +#define FORSOMEPARITYDOMAIN(i,s,parity) \ +FORSOMEPARITY(i,s,parity) if(s->t > 0) +#else +#define FOREVENSITESDOMAIN FOREVENSITES +#define FORODDSITESDOMAIN FORODDSITES +#define FORALLSITESDOMAIN FORALLSITES +#define FORSOMEPARITYDOMAIN FORSOMEPARITY +#endif + +#ifdef DEBUG +#define node0_debug() do {node0_printf("DEBUG: %s:%d in %s()\n",__FILE__,__LINE__,__FUNCTION__);}while(0) +#define debug() do {printf("DEBUG: node= %d %s:%d in %s()\n",mynode_KE(),__FILE__,__LINE__,__FUNCTION__); fflush(0);} while(0) +#else +#define node0_debug() do{ }while(0) +#define debug() do{ }while(0) +#endif + +#ifdef POSIX_MEMALIGN +#define MEMALIGN( variable, type, size ) do{ posix_memalign( (void**) &variable, 16, (size) * sizeof( type ) ); if( (variable) == NULL ) { printf( "ERROR node= %d memory allocation failed\n",this_node ); exit(1);}; memsize+=(size)*sizeof(type); }while(0) +#else +#define MEMALIGN( variable, type, size ) do{ (variable) = ( type * ) memalign( 16, (size) * sizeof( type ) ); if( (variable) == NULL ) { printf( "ERROR node= %d memory allocation failed\n",this_node ); exit(1);}; memsize+=(size)*sizeof(type); }while(0) +#endif + +#define FREE( variable, type, size ) do{ free(variable); variable=NULL; memsize-=(size)*sizeof(type); }while(0) + + +#endif /* _MACROS_H */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/include/random.h b/qcd/part_cpu/applications/QCD/src/kernel_E/include/random.h new file mode 100644 index 0000000000000000000000000000000000000000..413dbdadcc9470b09e85b99866c41bf2f69c94a1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/include/random.h @@ -0,0 +1,10 @@ +#ifndef _RANDOM_H 
+#define _RANDOM_H + +/* Generic random number generator returning a uniformly distributed + random value on [0,1] */ +double myrand( ); +void ranstart( ); +void ranend( ); + +#endif /* _RANDOM_H */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/include/su3.h b/qcd/part_cpu/applications/QCD/src/kernel_E/include/su3.h new file mode 100644 index 0000000000000000000000000000000000000000..209eb4389090018dac1ec7cde15556323409a5c0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/include/su3.h @@ -0,0 +1,243 @@ +#ifndef _SU3_H +#define _SU3_H +#include "../include/machine.h" +#include "../include/complex.h" +#include "../include/random.h" + +/* SU(3) */ +typedef struct +{ + complex e[3][3]; +} su3_matrix; +typedef struct +{ + float a[8]; +} su3_matrix_comp; +typedef struct +{ + complex e[3][3][3][3]; +} su3_hypermatrix; +typedef struct +{ + complex c[3]; +} su3_vector; +typedef struct +{ + complex m01, m02, m12; + double m00im, m11im, m22im; + double space; +} anti_hermitmat; + +#ifdef SPINORFIRST +typedef struct +{ + su3_vector d[4]; +} wilson_vector; +#else +typedef struct +{ + complex d[4]; +} spinor_vector; +typedef struct +{ + spinor_vector c[3]; +} wilson_vector; +#endif + +typedef struct +{ + su3_vector h[2]; +} half_wilson_vector; +typedef struct +{ + wilson_vector d[4]; +} spin_wilson_vector; +typedef struct +{ + spin_wilson_vector c[3]; +} wilson_propagator; +typedef struct +{ + wilson_vector c[3]; +} color_wilson_vector; +typedef struct +{ + color_wilson_vector d[4]; +} wilson_matrix; + +#ifdef CLOVER +/* Clover vectors */ +typedef struct +{ + complex tr[2][15]; +} triangular; +typedef struct +{ + double di[2][6]; +} diagonal; +#endif +/* SU2 for gauge fixing */ +typedef struct +{ + complex esu2[2][2]; +} su2_matrix; +typedef struct +{ + double a[4]; +} su2_matr_comp; + + +#define GAMMAFIVE -1 /* some integer which is not a direction */ +#define PLUS 1 /* flags for selecting M or M_adjoint */ +#define MINUS -1 +/* Macros to multiply 
complex numbers by +-1 and +-i */ +#define TIMESPLUSONE(a,b) { (b).real = (a).real; (b).imag = (a).imag; } +#define TIMESMINUSONE(a,b) { (b).real = -(a).real; (b).imag = -(a).imag; } +#define TIMESPLUSI(a,b) { (b).real = -(a).imag; (b).imag = (a).real; } +#define TIMESMINUSI(a,b) { (b).real = (a).imag; (b).imag = -(a).real; } + +#define FORMAT(a,b) for(a=0;a<3;a++) for (b=0;b<3;b++) + +/* for lattice i/o */ +void su3_to_comp( su3_matrix * U, su3_matrix_comp * alpha ); +void comp_to_su3( su3_matrix_comp * alpha, su3_matrix * result ); + +double magsq_hwvec( half_wilson_vector * vec ); +complex hwvec_dot( half_wilson_vector * a, half_wilson_vector * b ); + + +double realtrace_su3_KE( su3_matrix * a, su3_matrix * b ); +complex trace_su3_KE( su3_matrix * a ); +complex complextrace_su3_KE( su3_matrix * a, su3_matrix * b ); +complex det_su3_KE( su3_matrix * a ); +void add_su3_matrix_KE( su3_matrix * a, su3_matrix * b, su3_matrix * c ); +void sub_su3_matrix_KE( su3_matrix * a, su3_matrix * b, su3_matrix * c ); +void scalar_mult_su3_matrix_KE( su3_matrix * src, double scalar, su3_matrix * dest ); +void scalar_mult_sub_su3_matrix_KE( su3_matrix * src1, su3_matrix * src2, double scalar, su3_matrix * dest ); +void c_scalar_mult_su3mat_KE( su3_matrix * src, complex * scalar, su3_matrix * dest ); +void c_scalar_mult_add_su3mat_KE( su3_matrix * src1, su3_matrix * src2, complex * scalar, su3_matrix * dest ); +void c_scalar_mult_sub_su3mat_KE( su3_matrix * src1, su3_matrix * src2, complex * scalar, su3_matrix * dest ); +void su3_adjoint_KE( su3_matrix * a, su3_matrix * b ); +void su3_transpose( su3_matrix * a, su3_matrix * b ); +void make_anti_hermitian_KE( su3_matrix * m3, anti_hermitmat * ah3 ); +void make_traceless( su3_matrix * m3, su3_matrix * m4 ); + +void funny_anti_hermitian( int ix, int iy, int iz, int it, int idir, anti_hermitmat * mat_antihermit ); +void random_anti_hermitian_KE( anti_hermitmat * mat_antihermit ); +void uncompress_anti_hermitian_KE( anti_hermitmat * 
mat_anti, su3_matrix * mat ); +void compress_anti_hermitian_KE( su3_matrix * mat, anti_hermitmat * mat_anti ); +void clear_su3mat_KE( su3_matrix * dest ); +void unit_su3mat( su3_matrix *dest ); +void su3mat_copy_KE( su3_matrix * a, su3_matrix * b ); +void dump_mat( su3_matrix * m ); +void scalar_mult_ahm( anti_hermitmat * a, double s, anti_hermitmat * b ); +void clear_ahm( anti_hermitmat * a ); + +complex su3_dot_KE( su3_vector * a, su3_vector * b ); +double su3_rdot_KE( su3_vector * a, su3_vector * b ); +double magsq_su3vec_KE( su3_vector * a ); +void su3vec_copy_KE( su3_vector * a, su3_vector * b ); +void dumpvec_KE( su3_vector * v ); +void clearvec_KE( su3_vector * v ); + +void mult_su3_mat_vec_sum_KE( su3_matrix * a, su3_vector * b, su3_vector * c ); +void mult_su3_mat_vec_nsum_KE( su3_matrix * a, su3_vector * b, su3_vector * c ); +void mult_adj_su3_mat_vec_sum_KE( su3_matrix * a, su3_vector * b, su3_vector * c ); +void mult_adj_su3_mat_vec_nsum_KE( su3_matrix * a, su3_vector * b, su3_vector * c ); + +void sub_su3_vector_KE( su3_vector * a, su3_vector * b, su3_vector * c ); + +void scalar_mult_su3_vector_KE( su3_vector * src, double scalar, su3_vector * dest ); +void scalar_mult_sum_su3_vector_KE( su3_vector * src1, su3_vector * src2, double scalar ); +void scalar_mult_sub_su3_vector_KE( su3_vector * src1, su3_vector * src2, double scalar, su3_vector * dest ); +void scalar_mult_wvec( wilson_vector * src, double s, wilson_vector * dest ); +void scalar_mult_hwvec( half_wilson_vector * src, double s, half_wilson_vector * dest ); +void scalar_mult_add_wvec( wilson_vector * src1, wilson_vector * src2, double scalar, wilson_vector * dest ); +void scalar2_mult_add_wvec( wilson_vector * src1, double t, wilson_vector * src2, double s, wilson_vector * dest ); +void scalar3_mult_add_wvec( wilson_vector * src1, double t, + wilson_vector * src2, double s, + wilson_vector * src3, double u, + wilson_vector * dest ); + +void scalar_mult_add_g5_wvec( wilson_vector * src1, 
wilson_vector * src2, double scalar, wilson_vector * dest ); +void scalar_g5_mult_add_wvec( wilson_vector * src1, wilson_vector * src2, double scalar, wilson_vector * dest ); +void g5_mult_wvec( wilson_vector * src, wilson_vector * dest ); + +void scalar_mult_addtm_wvec( wilson_vector * src1, wilson_vector * src2, double scalar, wilson_vector * dest ); +void c_scalar_mult_wvec( wilson_vector * src1, complex * phase, wilson_vector * dest ); +void c_scalar_mult_add_wvec( wilson_vector * src1, wilson_vector * src2, complex * phase, wilson_vector * dest ); +void c_scalar_mult_add_wvec2( wilson_vector * src1, wilson_vector * src2, complex s, wilson_vector * dest ); +void c_scalar_mult_su3vec_KE( su3_vector * src, complex * phase, su3_vector * dest ); +void c_scalar_mult_add_su3vec_KE( su3_vector * v1, complex * phase, su3_vector * v2 ); +void c_scalar_mult_sub_su3vec_KE( su3_vector * v1, complex * phase, su3_vector * v2 ); + +void mult_by_gamma_left( wilson_matrix * src, wilson_matrix * dest, int dir ); +void mult_by_gamma_right( wilson_matrix * src, wilson_matrix * dest, int dir ); +void mult_by_gamma_l( spin_wilson_vector * src, spin_wilson_vector * dest, int dir ); +void mult_by_gamma_r( spin_wilson_vector * src, spin_wilson_vector * dest, int dir ); + +void mult_mat_wilson_vec( su3_matrix * mat, wilson_vector * src, wilson_vector * dest ); +void mult_adj_mat_wilson_vec( su3_matrix * mat, wilson_vector * src, wilson_vector * dest ); + +void add_wilson_vector( wilson_vector * src1, wilson_vector * src2, wilson_vector * dest ); +void sub_wilson_vector( wilson_vector * src1, wilson_vector * src2, wilson_vector * dest ); +double magsq_wvec( wilson_vector * src ); +complex wvec_dot( wilson_vector * src1, wilson_vector * src2 ); +complex wvec2_dot( wilson_vector * src1, wilson_vector * src2 ); +double wvec_rdot( wilson_vector * a, wilson_vector * b ); + +void wp_shrink( wilson_vector * src, half_wilson_vector * dest, int dir, int sign ); +void wp_shrink_4dir( wilson_vector 
* a, half_wilson_vector * b1, + half_wilson_vector * b2, half_wilson_vector * b3, half_wilson_vector * b4, int sign ); +void wp_grow_hch( half_wilson_vector * src, wilson_vector * dest, int dir, int sign ); +void wp_shrink_4dir_hch( wilson_vector * a, half_wilson_vector * b1, + half_wilson_vector * b2, half_wilson_vector * b3, half_wilson_vector * b4, int sign ); +void wp_shrink_hch( wilson_vector * src, half_wilson_vector * dest, int dir, int sign ); +void wp_grow_add_hch( half_wilson_vector * src, wilson_vector * dest, int dir, int sign ); +void grow_add_four_wvecs_hch( wilson_vector * a, half_wilson_vector * b1, + half_wilson_vector * b2, half_wilson_vector * b3, half_wilson_vector * b4, int sign, int sum ); +void wp_grow( half_wilson_vector * src, wilson_vector * dest, int dir, int sign ); +void wp_grow_add( half_wilson_vector * src, wilson_vector * dest, int dir, int sign ); +void grow_add_four_wvecs( wilson_vector * a, half_wilson_vector * b1, + half_wilson_vector * b2, half_wilson_vector * b3, half_wilson_vector * b4, int sign, int sum ); +void mult_by_gamma( wilson_vector * src, wilson_vector * dest, int dir ); +void su3_projector_w( wilson_vector * a, wilson_vector * b, su3_matrix * c ); +void clear_wvec( wilson_vector * dest ); +void clear_half_wvec( half_wilson_vector * dest ); +void copy_wvec( wilson_vector * src, wilson_vector * dest ); +void copy_half_wvec( half_wilson_vector * src, half_wilson_vector * dest ); +void dump_wvec( wilson_vector * src ); +void dump_wvec_32( float * v ); +void dump_half_wvec( half_wilson_vector * src ); +void dump_half_wvec_32( float * v ); + +double gaussian_rand_no_KE( ); +#include "../include/int32type.h" +void byterevn( int32type w[], int n ); + +void mult_su3_nn_KE( su3_matrix * a, su3_matrix * b, su3_matrix * c ); +void mult_su3_na_KE( su3_matrix * a, su3_matrix * b, su3_matrix * c ); +void mult_su3_an_KE( su3_matrix * a, su3_matrix * b, su3_matrix * c ); +void mult_su3_aa_KE( su3_matrix * a, su3_matrix * b, 
su3_matrix * c ); +void mult_su3_mat_vec_KE( su3_matrix * a, su3_vector * b, su3_vector * c ); +void mult_adj_su3_mat_vec_KE( su3_matrix * a, su3_vector * b, su3_vector * c ); +void mult_su3_mat_vec_sum_4dir_KE( su3_matrix * a, su3_vector * b0, su3_vector * b1, su3_vector * b2, su3_vector * b3, su3_vector * c ); +void mult_adj_su3_mat_vec_4dir_KE( su3_matrix * a, su3_vector * b, su3_vector * c ); +void mult_adj_su3_mat_4vec( su3_matrix * mat, su3_vector * src, + su3_vector * dest0, su3_vector * dest1, su3_vector * dest2, su3_vector * dest3 ); +void su3_projector_KE( su3_vector * a, su3_vector * b, su3_matrix * c ); +void mult_su3_mat_hwvec( su3_matrix * mat, half_wilson_vector * src, half_wilson_vector * dest ); +void mult_adj_su3_mat_hwvec( su3_matrix * mat, half_wilson_vector * src, half_wilson_vector * dest ); +void sub_four_su3_vecs_KE( su3_vector * a, su3_vector * b1, su3_vector * b2, su3_vector * b3, su3_vector * b4 ); +void add_su3_vector_KE( su3_vector * a, su3_vector * b, su3_vector * c ); +void scalar_mult_add_su3_vector_KE( su3_vector * src1, su3_vector * src2, double scalar, su3_vector * dest ); +void scalar_mult_add_su3_matrix_KE( su3_matrix * src1, su3_matrix * src2, double scalar, su3_matrix * dest ); + +/* su2 */ +void left_su2_hit_n_KE( su2_matrix * u, int p, int q, su3_matrix * link ); +void right_su2_hit_a( su2_matrix * u, int p, int q, su3_matrix * link ); +void mult_su2_mat_vec_elem_n_KE( su2_matrix * u, complex * x0, complex * x1 ); +void mult_su2_mat_vec_elem_a( su2_matrix * u, complex * x0, complex * x1 ); + + +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/io_helpers.c b/qcd/part_cpu/applications/QCD/src/kernel_E/io_helpers.c new file mode 100644 index 0000000000000000000000000000000000000000..43afd0ba6239af5c2607571e1b6c6a305b9280a9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/io_helpers.c @@ -0,0 +1,226 @@ +/********************** io_helpers.c **********************************/ +/* MIMD version 6 */ +/* 
DT 8/97 + General purpose high level routines, to be used by any application + that wants them. + */ + +#include "./include/includes.h" + +extern int totnodes[4]; + +/* reads lattice from the same lattice seen by any nodes */ +void reload_lattice( int flag, char *filename ) +{ + FILE *f; + int x, y, z, t, ind, dir; + double ssplaq, stplaq; + double max_deviation; + int a; + su3_matrix_comp mat_comp; + void coldlat( ); + void ranlat( ); + + double mytime; + mytime = -dclock( ); + + /* always use HOT lattice*/ + ranlat( ); + + mytime += dclock( ); + g_sync_KE( ); + + d_plaquette( &ssplaq, &stplaq ); + node0_fprintf( file_o1, "reload_lattice: time= %.3g\t checkplaq: %e %e\n", mytime, ssplaq, stplaq ); + + max_deviation = check_unitarity( ); + g_doublemax_KE( &max_deviation ); + node0_fprintf( file_o1, "reload_lattice: Unitarity checked. Max deviation %.2e\n", max_deviation ); + +} + + + +void ranlat( ) +{ + /* sets link matrices to random SU(3) matrices */ + register int i, dir; + register site *sit; + void random_su3_KE( su3_matrix * r_su3, double eps ); + + FORALLSITES( i, sit ) + { + for ( dir = XUP; dir <= TUP; dir++ ) + { + random_su3_KE( &( gauge[4 * i + dir] ), 1.0 ); + } + } + node0_fprintf( file_o1, "ranlat: Random gauge configuration loaded\n" ); +} + + +int get_f( FILE * f, char *variable_name_string, double * value ) +{ + char readname[80]; + double read_d; + char ch; + int status; + + status = 0; + rewind( f ); + do + { + fscanf( f, "%s", readname ); + if( strcmp( readname, variable_name_string ) == 0 ) + { + if( fscanf( f, "%lg", &read_d ) == 1 ) + { + *value = ( double ) read_d; + if( this_node == 0 ) + fprintf( file_o1, "get_f: %s\t %g\n", variable_name_string, *value ); + } + else + { + if( this_node == 0 ) + printf( "ERROR get_f: Error reading %s\n", variable_name_string ); + status = 1; + } + goto endread; + } + else + { + while( ( ch = getc( f ) ) != '\n' && ( !feof( f ) ) ); + } + } + while( !feof( f ) ); + if( this_node == 0 ) + printf( "ERROR 
get_f: Error reading %s\n", variable_name_string ); + status = 1; + endread: + + return status; +} + + +int get_i_KE( FILE * f, char *variable_name_string, int *value ) +{ + char readname[80]; + char ch; + int status; + + status = 0; + rewind( f ); + do + { + fscanf( f, "%s", readname ); + if( strcmp( readname, variable_name_string ) == 0 ) + { + if( fscanf( f, "%d", value ) == 1 ) + { + if( this_node == 0 ) + fprintf( file_o1, "get_i: %s\t %d\n", variable_name_string, *value ); + } + else + { + if( this_node == 0 ) + printf( "ERROR get_i: Error reading %s\n", variable_name_string ); + status = 1; + } + goto endread; + } + else + { + while( ( ch = getc( f ) ) != '\n' && ( !feof( f ) ) ); + } + } + while( !feof( f ) ); + if( this_node == 0 ) + printf( "ERROR get_i: Error reading %s\n", variable_name_string ); + status = 1; + endread: + + return status; +} + +/* get the string after variable until the end of line */ +int get_s_KE( FILE * f, char *variable_name_string, char *value ) +{ + char readname[256]; + char ch; + int i, status; + + status = 0; + rewind( f ); + do + { + fscanf( f, "%s", readname ); + if( strcmp( readname, variable_name_string ) == 0 ) + { + i = 0; + while( ( ch = getc( f ) ) < 33 && ( !feof( f ) ) ); + do + { + value[i] = ch; + i++; + ch = getc( f ); + } + while( ch != '\n' && ( !feof( f ) ) ); + value[i] = 0; + goto endread; + } + else + { + while( ( ch = getc( f ) ) != '\n' && ( !feof( f ) ) ); + } + } + while( !feof( f ) ); + if( this_node == 0 ) + printf( "ERROR get_s: Error reading %s\n", variable_name_string ); + status = 1; + endread: + return status; +} + +/* read the total number of nodes in each (physical) direction */ +int get_totnodes( FILE * f, char *variable_name_string ) +{ + char readname[80]; + char ch; + int i, status; + + status = 0; + rewind( f ); + do + { + fscanf( f, "%s", readname ); + if( strcmp( readname, variable_name_string ) == 0 ) + { + for ( i = 0; i < 4; i++ ) + { + if( fscanf( f, "%d ", &totnodes[i] ) != 1 ) + { + 
if( this_node == 0 ) + printf( "ERROR get_totnodes: Not enough directions\n" ); + status = 1; + } + else + { + if( this_node == 0 ) + printf( "get_totnodes: totnodes[%i]\t %i\n", i, totnodes[i] ); + } + } + goto endread; + } + else + { + while( ( ch = getc( f ) ) != '\n' && ( !feof( f ) ) ); + } + } + while( !feof( f ) ); + if( this_node == 0 ) + printf( "ERROR get_totnodes: Error reading %s\n", variable_name_string ); + status = 1; + endread: + + return status; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/add_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/add_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..0a73b6e89bb005b72b4e2ef9f07027d873d22a3a --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/add_wvec.c @@ -0,0 +1,26 @@ +/******************** add_wvec.c (in su3.a) ******************** +* +*void add_wilson_vector(wilson_vector *src1,*src2,*dest) +* add two Wilson vectors +* dest <- src1 + src2 +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void add_wilson_vector( wilson_vector * src1, wilson_vector * src2, wilson_vector * dest ) +{ + register int i; + for ( i = 0; i < 3; i++ ) + { + dest->COLORSPINOR( i, 0 ).real = src1->COLORSPINOR( i, 0 ).real + ( src2->COLORSPINOR( i, 0 ).real ); + dest->COLORSPINOR( i, 1 ).real = src1->COLORSPINOR( i, 1 ).real + ( src2->COLORSPINOR( i, 1 ).real ); + dest->COLORSPINOR( i, 2 ).real = src1->COLORSPINOR( i, 2 ).real + ( src2->COLORSPINOR( i, 2 ).real ); + dest->COLORSPINOR( i, 3 ).real = src1->COLORSPINOR( i, 3 ).real + ( src2->COLORSPINOR( i, 3 ).real ); + + dest->COLORSPINOR( i, 0 ).imag = src1->COLORSPINOR( i, 0 ).imag + ( src2->COLORSPINOR( i, 0 ).imag ); + dest->COLORSPINOR( i, 1 ).imag = src1->COLORSPINOR( i, 1 ).imag + ( src2->COLORSPINOR( i, 1 ).imag ); + dest->COLORSPINOR( i, 2 ).imag = src1->COLORSPINOR( i, 2 ).imag + ( src2->COLORSPINOR( i, 2 ).imag ); + dest->COLORSPINOR( i, 
3 ).imag = src1->COLORSPINOR( i, 3 ).imag + ( src2->COLORSPINOR( i, 3 ).imag ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/addmat.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/addmat.c new file mode 100644 index 0000000000000000000000000000000000000000..18574590c5fbf0ecda295ead630a196cbe65f4a0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/addmat.c @@ -0,0 +1,17 @@ +/******************** addmat.c (in su3.a) ***************************** +* * +* Add two SU3 matrices * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void add_su3_matrix_KE( su3_matrix * a, su3_matrix * b, su3_matrix * c ) +{ + register int i, j; + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + CADD( a->ROWCOL( i, j ), b->ROWCOL( i, j ), c->ROWCOL( i, j ) ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/addvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/addvec.c new file mode 100644 index 0000000000000000000000000000000000000000..d205266fb546df46b3b413ba6cd2721052f9c227 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/addvec.c @@ -0,0 +1,16 @@ +/******************** addvec.c (in su3.a) ***************************** +* * +* Add two SU3 vectors * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void add_su3_vector_KE( su3_vector * a, su3_vector * b, su3_vector * c ) +{ + register int i; + for ( i = 0; i < 3; i++ ) + { + CADD( a->c[i], b->c[i], c->c[i] ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/byterevn.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/byterevn.c new file mode 100644 index 0000000000000000000000000000000000000000..b8465a8ca86f3c73d08f078790beae6583638510 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/byterevn.c @@ -0,0 +1,27 @@ +/******************************** byterevn.c 
***************************/ +/* MIMD version 6 */ + +/* WARNING - MUST BE COMPILED WITH APPROPRIATE SHORT32 FLAG! */ +#include "../include/config.h" +#include "../include/int32type.h" +#include + +/* For doing byte reversal on 32-bit words */ + +void byterevn( int32type w[], int n ) +{ + register int32type old, newv; + int j; + + assert( sizeof( int32type ) == 4 ); + + for ( j = 0; j < n; j++ ) + { + old = w[j]; + newv = old >> 24 & 0x000000ff; + newv |= old >> 8 & 0x0000ff00; + newv |= old << 8 & 0x00ff0000; + newv |= old << 24 & 0xff000000; + w[j] = newv; + } +} /* byterevn */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cadd.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cadd.c new file mode 100644 index 0000000000000000000000000000000000000000..f2f92882cbc6e3ae7a2450a5302d150e7ae67ce9 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cadd.c @@ -0,0 +1,14 @@ +/********************** cadd.c (in complex.a) **********************/ +/* MIMD version 6 */ +/* Subroutines for operations on complex numbers */ +/* add two complex numbers */ +#include "../include/config.h" +#include "../include/complex.h" + +complex cadd_KE( complex * a, complex * b ) +{ + complex c; + c.real = ( *a ).real + ( *b ).real; + c.imag = ( *a ).imag + ( *b ).imag; + return ( c ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cdiv.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cdiv.c new file mode 100644 index 0000000000000000000000000000000000000000..37db76727c14393a6fe0dc32496d22239e4211a5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cdiv.c @@ -0,0 +1,16 @@ +/********************** cdiv.c (in complex.a) **********************/ +/* MIMD version 6 */ +/* Subroutines for operations on complex numbers */ +/* Divide two complex numbers */ +#include "../include/config.h" +#include "../include/complex.h" + +complex cdiv_KE( complex * a, complex * b ) +{ + complex c; + double scale; + 
scale = 1.0 / ( ( *b ).real * ( *b ).real + ( *b ).imag * ( *b ).imag ); + c.real = scale * ( ( *a ).real * ( *b ).real + ( *a ).imag * ( *b ).imag ); + c.imag = scale * ( ( *a ).imag * ( *b ).real - ( *a ).real * ( *b ).imag ); + return ( c ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/ce_itheta.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/ce_itheta.c new file mode 100644 index 0000000000000000000000000000000000000000..623d7a3de210eb4b0fb48ccd48b22f56ae893d87 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/ce_itheta.c @@ -0,0 +1,16 @@ +/********************** ce_itheta.c (in complex.a) **********************/ +/* MIMD version 6 */ +/* Subroutines for operations on complex numbers */ +/* exp( i*theta ) */ +#include "../include/config.h" +#include +#include "../include/complex.h" + +complex ce_itheta_KE( double theta ) +{ + complex c; + c.real = ( double ) cos( ( double ) theta ); + c.imag = ( double ) sin( ( double ) theta ); + /* there must be a more efficient way */ + return ( c ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cexp.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cexp.c new file mode 100644 index 0000000000000000000000000000000000000000..7e63a2b9f2e06c66361891c8ed293a8ae6c4a314 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cexp.c @@ -0,0 +1,17 @@ +/********************** cexp.c (in complex.a) **********************/ +/* MIMD version 6 */ +/* Subroutines for operations on complex numbers */ +/* complex exponential */ +#include "../include/config.h" +#include +#include "../include/complex.h" + +complex cexp_milc( complex * a ) +{ + complex c; + double mag; + mag = ( double ) exp( ( double ) ( *a ).real ); + c.real = mag * ( double ) cos( ( double ) ( *a ).imag ); + c.imag = mag * ( double ) sin( ( double ) ( *a ).imag ); + return ( c ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/clear_mat.c 
b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/clear_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..c8ca5660e0cecc00f7cef1065b0894e26ba7fea8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/clear_mat.c @@ -0,0 +1,30 @@ +/******************** clear_mat.c (in su3.a) ******************** +* +*void clear_su3mat_KE( su3_matrix *dest ) +* clear an SU3 matrix +* dest <- zero_matrix +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void clear_su3mat_KE( su3_matrix * dest ) +{ + register int i, j; + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + dest->ROWCOL( i, j ).real = dest->ROWCOL( i, j ).imag = 0.0; + } +} +void unit_su3mat( su3_matrix *dest ) +{ + register int i,j; + for(i=0;i<3;i++) + for(j=0;j<3;j++) + { + dest->ROWCOL( i, j ).real = dest->ROWCOL( i, j ).imag = 0.0; + if (i==j) dest->ROWCOL(i,j).real=1.0; + } +} + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/clear_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/clear_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..36e0f40ebe96d57a305d7e0ff5ecbccbef0931cb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/clear_wvec.c @@ -0,0 +1,28 @@ +/******************** clear_wvec.c (in su3.a) ******************** +* +*void clear_wilson_vector( wilson_vector *dest ) +* clear a Wilson vector +* dest <- zero_vector +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void clear_wvec( wilson_vector * dest ) +{ + register int i, j; + for ( i = 0; i < 4; i++ ) + for ( j = 0; j < 3; j++ ) + { + dest->COLORSPINOR( j, i ).real = dest->COLORSPINOR( j, i ).imag = 0.0; + } +} +void clear_half_wvec( half_wilson_vector * dest ) +{ + register int i, j; + for ( i = 0; i < 2; i++ ) + for ( j = 0; j < 3; j++ ) + { + dest->h[i].c[j].real = dest->h[i].c[j].imag = 0.0; + } +} diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/clearvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/clearvec.c new file mode 100644 index 0000000000000000000000000000000000000000..2f35ce8cca6463f4e3b1213abca27c785dc35876 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/clearvec.c @@ -0,0 +1,15 @@ +/******************* clearvec.c (in su3.a) ***************************** +* * +* void clearvec_KE( su3_vector *vec ) * +* clear a 3 element complex vector * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void clearvec_KE( su3_vector * v ) +{ + v->c[0].real = v->c[0].imag = 0.0; + v->c[1].real = v->c[1].imag = 0.0; + v->c[2].real = v->c[2].imag = 0.0; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/clog.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/clog.c new file mode 100644 index 0000000000000000000000000000000000000000..88f21ef9e81649b994086888fd64ff16d6810049 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/clog.c @@ -0,0 +1,15 @@ +/********************** clog.c (in complex.a) **********************/ +/* MIMD version 6 */ +/* Subroutines for operations on complex numbers */ +/* complex logarithm */ +#include "../include/config.h" +#include +#include "../include/complex.h" + +complex clog_milc( complex * a ) +{ + complex c; + c.real = 0.5 * ( double ) log( ( double ) ( ( *a ).real * ( *a ).real + ( *a ).imag * ( *a ).imag ) ); + c.imag = ( double ) atan2( ( double ) ( *a ).imag, ( double ) ( *a ).real ); + return ( c ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cmp_ahmat.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cmp_ahmat.c new file mode 100644 index 0000000000000000000000000000000000000000..9870be51c907ac1d0edc549451bf3a10ac248bac --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cmp_ahmat.c @@ -0,0 +1,21 @@ +/***************** cmp_ahmat.c (in su3.a) 
***************************** +* * +* Make an anti_hermitmat (anti Hermitian matrix in compressed form) * +* from an SU3 matrix (3x3 complex matrix). * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void compress_anti_hermitian_KE( su3_matrix * mat_su3, anti_hermitmat * mat_antihermit ) +{ + mat_antihermit->m00im = mat_su3->ROWCOL( 0, 0 ).imag; + mat_antihermit->m11im = mat_su3->ROWCOL( 1, 1 ).imag; + mat_antihermit->m22im = mat_su3->ROWCOL( 2, 2 ).imag; + mat_antihermit->m01.real = mat_su3->ROWCOL( 0, 1 ).real; + mat_antihermit->m02.real = mat_su3->ROWCOL( 0, 2 ).real; + mat_antihermit->m12.real = mat_su3->ROWCOL( 1, 2 ).real; + mat_antihermit->m01.imag = mat_su3->ROWCOL( 0, 1 ).imag; + mat_antihermit->m02.imag = mat_su3->ROWCOL( 0, 2 ).imag; + mat_antihermit->m12.imag = mat_su3->ROWCOL( 1, 2 ).imag; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cmplx.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cmplx.c new file mode 100644 index 0000000000000000000000000000000000000000..8ed7703a7b15601c477511d117cc7ca173dac070 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cmplx.c @@ -0,0 +1,14 @@ +/********************** cmplx.c (in complex.a) **********************/ +/* MIMD version 6 */ +/* Subroutines for operations on complex numbers */ +/* make a complex number from two real numbers */ +#include "../include/config.h" +#include "../include/complex.h" + +complex cmplx_KE( double x, double y ) +{ + complex c; + c.real = x; + c.imag = y; + return ( c ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cmul.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cmul.c new file mode 100644 index 0000000000000000000000000000000000000000..fb501562711113d2a598f287e9fdbb05eaba3f67 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cmul.c @@ -0,0 +1,14 @@ +/********************** cmul.c (in complex.a) **********************/ +/* 
MIMD version 6 */ +/* Subroutines for operations on complex numbers */ +/* multiply two complex numbers */ +#include "../include/config.h" +#include "../include/complex.h" + +complex cmul_KE( complex * a, complex * b ) +{ + complex c; + c.real = ( *a ).real * ( *b ).real - ( *a ).imag * ( *b ).imag; + c.imag = ( *a ).imag * ( *b ).real + ( *a ).real * ( *b ).imag; + return ( c ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/complextr.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/complextr.c new file mode 100644 index 0000000000000000000000000000000000000000..fb90daa62e4065575eb0688b9ac8cee410e17042 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/complextr.c @@ -0,0 +1,24 @@ +/****************** complextr.c (in su3.a) **************************** +* * +* complex complextrace_su3_KE( su3_matrix *a,*b) * +* return Tr( A_adjoint*B ) * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +complex complextrace_su3_KE( su3_matrix * a, su3_matrix * b ) +{ + register int i, j; + register double sumr, sumi; + complex sum; + for ( sumr = 0.0, sumi = 0.0, i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + sumr += a->ROWCOL( i, j ).real * b->ROWCOL( i, j ).real + a->ROWCOL( i, j ).imag * b->ROWCOL( i, j ).imag; + sumi += a->ROWCOL( i, j ).real * b->ROWCOL( i, j ).imag - a->ROWCOL( i, j ).imag * b->ROWCOL( i, j ).real; + } + sum.real = sumr; + sum.imag = sumi; + return ( sum ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/conjg.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/conjg.c new file mode 100644 index 0000000000000000000000000000000000000000..3f1cadfaa9bd8d9adb71dba1e444fd2ec5c3d36c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/conjg.c @@ -0,0 +1,14 @@ +/********************** conjg.c (in complex.a) **********************/ +/* MIMD version 6 */ +/* Subroutines for operations on complex numbers */ +/* 
complex conjugate */ +#include "../include/config.h" +#include "../include/complex.h" + +complex conjg_KE( complex * a ) +{ + complex c; + c.real = ( *a ).real; + c.imag = -( *a ).imag; + return ( c ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/copy_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/copy_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..16d02fa4cb4c47eb858e2efdb78f59cd6c2f1ff6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/copy_wvec.c @@ -0,0 +1,19 @@ +/******************** copy_wvec.c (in su3.a) ******************** +* +*void copy_wvec( wilson_vector *src,*dest ) +* copy a Wilson vector +* dest <- src +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void copy_wvec( wilson_vector * src, wilson_vector * dest ) +{ + *dest = *src; /* hardly worth a function */ +} + +void copy_half_wvec( half_wilson_vector * src, half_wilson_vector * dest ) +{ + *dest = *src; /* hardly worth a function */ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_a_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_a_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..ede944f8f8cee6d80f6e2490757bb799d1094b20 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_a_mat.c @@ -0,0 +1,45 @@ +/****************** cs_m_a_mat.c (in su3.a) *************************** +* * +* c_scalar_mult_add_su3mat_KE( su3_matrix *ma, su3_matrix *m2, * +* complex *phase, su3_matrix *m3) * +* multiply an su3 matrix by a complex scalar and add it to another * +* matrix: m3 <- m1 + number*m2 * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void c_scalar_mult_add_su3mat_KE( su3_matrix * m1, su3_matrix * m2, complex * phase, su3_matrix * m3 ) +{ + +#ifndef NATIVEDOUBLE + register int i, j; + complex t; + for ( i = 0; i < 3; i++ ) + 
for ( j = 0; j < 3; j++ ) + { + t = cmul_KE( &m2->ROWCOL( i, j ), phase ); + m3->ROWCOL( i, j ) = cadd_KE( &m1->ROWCOL( i, j ), &t ); + } + +#else + register int i, j; + register double sr, si, br, bi, cr, ci; + + sr = ( *phase ).real; + si = ( *phase ).imag; + + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + br = m2->ROWCOL( i, j ).real; + bi = m2->ROWCOL( i, j ).imag; + + cr = sr * br - si * bi; + ci = sr * bi + si * br; + + m3->ROWCOL( i, j ).real = m1->ROWCOL( i, j ).real + cr; + m3->ROWCOL( i, j ).imag = m1->ROWCOL( i, j ).imag + ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_a_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_a_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..943d8004818c173ebe9d3304e68091b80046ef76 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_a_vec.c @@ -0,0 +1,41 @@ +/****************** cs_m_a_vec.c (in su3.a) *************************** +* * +* c_scalar_mult_add_su3vec_KE(): * +* multiply an su3 vector by a complex scalar and add it to another * +* vector: v1 <- v1 + number*v2 * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void c_scalar_mult_add_su3vec_KE( su3_vector * v1, complex * phase, su3_vector * v2 ) +{ + +#ifndef NATIVEDOUBLE + register int i; + complex t; + for ( i = 0; i < 3; i++ ) + { + t = cmul_KE( &v2->c[i], phase ); + v1->c[i] = cadd_KE( &v1->c[i], &t ); + } +#else + register int i; + register double sr, si, br, bi, cr, ci; + + sr = ( *phase ).real; + si = ( *phase ).imag; + + for ( i = 0; i < 3; i++ ) + { + br = v2->c[i].real; + bi = v2->c[i].imag; + + cr = sr * br - si * bi; + ci = sr * bi + si * br; + + v1->c[i].real += cr; + v1->c[i].imag += ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_a_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_a_wvec.c new file mode 100644 index 
0000000000000000000000000000000000000000..0f54170b21e26e8337fbf98b19c1e0c7ed60d8bf --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_a_wvec.c @@ -0,0 +1,39 @@ +/******************** cs_m_a_wvec.c (in su3.a) ******************** +* +*void c_scalar_mult_add_wvec(wilson_vector *src1, wilson_vector *src2, + complex *s, wilson_vector *dest) +* Multiply a Wilson vector by a complex scalar and add to another vector +* dest <- src1 + s*src2 +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void c_scalar_mult_add_wvec( wilson_vector * src1, wilson_vector * src2, complex * phase, wilson_vector * dest ) +{ + + register int i, j; +#ifdef NATIVEDOUBLE + register double sr, si, br, bi, cr, ci; +#else + register double sr, si, br, bi, cr, ci; +#endif + + sr = ( *phase ).real; + si = ( *phase ).imag; + + for ( i = 0; i < 4; i++ ) + { + for ( j = 0; j < 3; j++ ) + { + br = src2->COLORSPINOR( j, i ).real; + bi = src2->COLORSPINOR( j, i ).imag; + + cr = sr * br - si * bi; + ci = sr * bi + si * br; + + dest->COLORSPINOR( j, i ).real = src1->COLORSPINOR( j, i ).real + cr; + dest->COLORSPINOR( j, i ).imag = src1->COLORSPINOR( j, i ).imag + ci; + } + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..dc33a178b6c374958e761dcfe0db6b0eb7732c5d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_mat.c @@ -0,0 +1,44 @@ +/**************** cs_m_mat.c (in su3.a) ******************************* +* * +* void c_scalar_mult_su3mat_KE( su3_matrix *b, complex *s, su3_matrix *c) * +* C <- s*B, B and C matrices * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* c <- s*b, matrices */ +void c_scalar_mult_su3mat_KE( su3_matrix * b, complex * s, su3_matrix * c ) +{ + +#ifndef 
NATIVEDOUBLE + register int i, j; + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + c->ROWCOL( i, j ) = cmul_KE( &b->ROWCOL( i, j ), s ); + /* old: c->ROWCOL(i,j).real = s.real*b->ROWCOL(i,j).real-s.imag*b->ROWCOL(i,j).imag; + c->ROWCOL(i,j).imag = s.real*b->ROWCOL(i,j).imag + s.imag*b->ROWCOL(i,j).real; */ + } + +#else + register int i, j; + register double sr, si, br, bi, cr, ci; + + sr = ( *s ).real; + si = ( *s ).imag; + + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + br = b->ROWCOL( i, j ).real; + bi = b->ROWCOL( i, j ).imag; + + cr = sr * br - si * bi; + ci = sr * bi + si * br; + + c->ROWCOL( i, j ).real = cr; + c->ROWCOL( i, j ).imag = ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_s_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_s_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..ea774c065553018f2e15014f5c497014363cca55 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_s_mat.c @@ -0,0 +1,45 @@ +/**************** cs_m_s_mat.c (in su3.a) ***************************** +* * +* void c_scalar_mult_sub_su3mat_KE( su3_matrix *a, su3_matrix *b, * +* complex *s, su3_matrix *c) * +* C <- A - s*B, A,B and C matrices * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* c <- a - s*b, matrices */ +void c_scalar_mult_sub_su3mat_KE( su3_matrix * a, su3_matrix * b, complex * s, su3_matrix * c ) +{ + +#ifndef NATIVEDOUBLE + register int i, j; + complex t; + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + t = cmul_KE( &b->ROWCOL( i, j ), s ); + c->ROWCOL( i, j ) = csub( &a->ROWCOL( i, j ), &t ); + } + +#else + register int i, j; + register double sr, si, br, bi, cr, ci; + + sr = ( *s ).real; + si = ( *s ).imag; + + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + br = b->ROWCOL( i, j ).real; + bi = b->ROWCOL( i, j ).imag; + + cr = sr * br - si * bi; + ci = sr * bi 
+ si * br; + + c->ROWCOL( i, j ).real = a->ROWCOL( i, j ).real - cr; + c->ROWCOL( i, j ).imag = a->ROWCOL( i, j ).imag - ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_s_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_s_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..6bc7f86b313c6179ca1e35e846c431ac2693538c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_s_vec.c @@ -0,0 +1,41 @@ +/******************* cs_m_s_vec.c (in su3.a) ************************** +* * +* c_scalar_mult_sub_su3vec_KE() * +* multiply an su3 vector by a complex scalar and subtract it from * +* another vector: v1 <- v1 - number*v2 * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void c_scalar_mult_sub_su3vec_KE( su3_vector * v1, complex * phase, su3_vector * v2 ) +{ + +#ifndef NATIVEDOUBLE + register int i; + complex t; + for ( i = 0; i < 3; i++ ) + { + t = cmul_KE( &v2->c[i], phase ); + v1->c[i] = csub( &v1->c[i], &t ); + } +#else + register int i; + register double sr, si, br, bi, cr, ci; + + sr = ( *phase ).real; + si = ( *phase ).imag; + + for ( i = 0; i < 3; i++ ) + { + br = v2->c[i].real; + bi = v2->c[i].imag; + + cr = sr * br - si * bi; + ci = sr * bi + si * br; + + v1->c[i].real -= cr; + v1->c[i].imag -= ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..dae1db85ea066b01ab4f0c35d95749e16f998195 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_vec.c @@ -0,0 +1,40 @@ +/******************* cs_m_vec.c (in su3.a) **************************** +* * +* c_scalar_mult_su3vec_KE(): * +* multiply an su3 vector by a complex scalar * +* dest <- number*src * +*/ +#include "../include/config.h" +#include "../include/complex.h" 
+#include "../include/su3.h" + +void c_scalar_mult_su3vec_KE( su3_vector * src, complex * phase, su3_vector * dest ) +{ + +#ifndef NATIVEDOUBLE + register int i; + for ( i = 0; i < 3; i++ ) + { + dest->c[i] = cmul_KE( &src->c[i], phase ); + } + +#else + register int i; + register double sr, si, br, bi, cr, ci; + + sr = ( *phase ).real; + si = ( *phase ).imag; + + for ( i = 0; i < 3; i++ ) + { + br = src->c[i].real; + bi = src->c[i].imag; + + cr = sr * br - si * bi; + ci = sr * bi + si * br; + + dest->c[i].real = cr; + dest->c[i].imag = ci; + } +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..1e75fe54c326cb448d81fa0de29b3495edca8bba --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/cs_m_wvec.c @@ -0,0 +1,23 @@ +/******************** cs_m_wvec.c (in su3.a) ******************** +* +*void c_scalar_mult_wvec(wilson_vector *src, complex *phase, wilson_vector *dest) +* Multiply a Wilson vector by a complex scalar (dest is overwritten, nothing is accumulated) +* dest <- phase * src, applied to each of the 4 x 3 COLORSPINOR components +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void c_scalar_mult_wvec( wilson_vector * src, complex * phase, wilson_vector * dest ) +{ + + register int i, j; + for ( i = 0; i < 4; i++ ) + { + for ( j = 0; j < 3; j++ ) + { + CMUL( src->COLORSPINOR( j, i ), *phase, dest->COLORSPINOR( j, i ) ); + } + } + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/csqrt.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/csqrt.c new file mode 100644 index 0000000000000000000000000000000000000000..2beb7091e094d0afaf158d295461f895a7158a36 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/csqrt.c @@ -0,0 +1,19 @@ +/********************** csqrt.c (in complex.a) **********************/ +/* MIMD version 6 */ +/* Subroutines for operations on 
complex numbers */ +/* complex square root in polar form: modulus sqrt(|z|), phase atan2(imag,real)/2; ce_itheta_KE(theta) is presumably exp(i*theta) -- confirm in complex.h */ +#include "../include/config.h" +#include <math.h> +#include "../include/complex.h" + +complex csqrt_milc( complex * z ) +{ + complex c; + double theta, r; + r = sqrt( hypot( z->real, z->imag ) ); + theta = 0.5 * atan2( z->imag, z->real ); + c = ce_itheta_KE( theta ); + c.real *= r; + c.imag *= r; + return ( c ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/csub.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/csub.c new file mode 100644 index 0000000000000000000000000000000000000000..d7dff113a7a7b6e5bea56369f8686add1b158b52 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/csub.c @@ -0,0 +1,14 @@ +/********************** csub.c (in complex.a) **********************/ +/* MIMD version 6 */ +/* Subroutines for operations on complex numbers */ +/* complex subtract: returns (*a) - (*b) by value, inputs are not modified */ +#include "../include/config.h" +#include "../include/complex.h" + +complex csub( complex * a, complex * b ) +{ + complex c; + c.real = ( *a ).real - ( *b ).real; + c.imag = ( *a ).imag - ( *b ).imag; + return ( c ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/det_su3.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/det_su3.c new file mode 100644 index 0000000000000000000000000000000000000000..851ea4d7cfbfc86c9f6cb9350934ba47cd89724c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/det_su3.c @@ -0,0 +1,33 @@ +/****************** det_su3.c (in su3.a) ****************************** +* * +* complex det_su3_KE( su3_matrix *a ) * +* Complex determinant of an SU3 matrix * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* FIX THIS - more efficient to take cross product of first two + rows, dot with third. 
*/ +complex det_su3_KE( su3_matrix * a ) +{ + register complex cc, dd, sum; + CMUL( a->ROWCOL( 0, 0 ), a->ROWCOL( 1, 1 ), cc ); + CMUL( cc, a->ROWCOL( 2, 2 ), sum ); + CMUL( a->ROWCOL( 0, 0 ), a->ROWCOL( 1, 2 ), cc ); + CMUL( cc, a->ROWCOL( 2, 1 ), dd ); + CSUB( sum, dd, sum ); + CMUL( a->ROWCOL( 0, 1 ), a->ROWCOL( 1, 2 ), cc ); + CMUL( cc, a->ROWCOL( 2, 0 ), dd ); + CADD( sum, dd, sum ); + CMUL( a->ROWCOL( 0, 1 ), a->ROWCOL( 1, 0 ), cc ); + CMUL( cc, a->ROWCOL( 2, 2 ), dd ); + CSUB( sum, dd, sum ); + CMUL( a->ROWCOL( 0, 2 ), a->ROWCOL( 1, 0 ), cc ); + CMUL( cc, a->ROWCOL( 2, 1 ), dd ); + CADD( sum, dd, sum ); + CMUL( a->ROWCOL( 0, 2 ), a->ROWCOL( 1, 1 ), cc ); + CMUL( cc, a->ROWCOL( 2, 0 ), dd ); + CSUB( sum, dd, sum ); + return ( sum ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/dump_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/dump_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..9a3896adc8a8059d46f17138de9c88014f19ffcb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/dump_wvec.c @@ -0,0 +1,60 @@ +/**************** dump_wvec.c (in su3.a) *********************** +* * +* void dump_wvec( wilson_vector *v ) * +* Print out a Wilson vector * +*/ +#include "../include/config.h" +#include <stdio.h> +#include "../include/complex.h" +#include "../include/su3.h" + + +void dump_wvec( wilson_vector * v ) +{ + register int i, j; + for ( i = 0; i < 4; i++ ) + { + for ( j = 0; j < 3; j++ ) + printf( "(%.8e,%.2e)\t", v->COLORSPINOR( j, i ).real, v->COLORSPINOR( j, i ).imag ); + printf( "\n" ); + } + printf( "\n" ); +} + +void dump_half_wvec( half_wilson_vector * v ) +{ + register int i, j; + for ( i = 0; i < 2; i++ ) + { + for ( j = 0; j < 3; j++ ) + printf( "(%.8e,%.2e)\t", v->h[i].c[j].real, v->h[i].c[j].imag ); + printf( "\n" ); + } + printf( "\n" ); +} + +void dump_wvec_32( float * v ) +{ + register int i, j; + for ( i = 0; i < 4; i++ ) + { +#ifndef SPINORFIRST /* only this layout (re,im pair at offset 2*(4*j+i)) is implemented; the guard wraps the whole loop so the row newline below can never become the loop body */ + for ( j = 0; j < 3; j++ ) + 
printf( "(%.8e,%.2e)\t", *(v+2*(4*j+i)), *(v+2*(4*j+i)+1) ); +#endif + printf( "\n" ); + } + printf( "\n" ); +} + +void dump_half_wvec_32( float * v ) +{ + register int i, j; + for ( i = 0; i < 2; i++ ) + { + for ( j = 0; j < 3; j++ ) + printf( "(%.8e,%.2e)\t", *(v+2*(3*i+j)), *(v+2*(3*i+j)+1) ); + printf( "\n" ); + } + printf( "\n" ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/dumpmat.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/dumpmat.c new file mode 100644 index 0000000000000000000000000000000000000000..d4f7c37ef24cca488911b73c76257a3abcda6e61 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/dumpmat.c @@ -0,0 +1,21 @@ +/****************** dumpmat.c (in su3.a) ****************************** +* * +* void dump_mat( su3_matrix *m ) * +* print out a 3x3 complex matrix * +*/ +#include "../include/config.h" +#include <stdio.h> +#include "../include/complex.h" +#include "../include/su3.h" + +void dump_mat( su3_matrix * m ) +{ + int i, j; + for ( i = 0; i < 3; i++ ) + { + for ( j = 0; j < 3; j++ ) + printf( "(%.5e,%.2e)\t", m->ROWCOL( i, j ).real, m->ROWCOL( i, j ).imag ); + printf( "\n" ); + } + printf( "\n" ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/dumpvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/dumpvec.c new file mode 100644 index 0000000000000000000000000000000000000000..384a37b87ce5583022a4341d346e24cd3b6a8fc5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/dumpvec.c @@ -0,0 +1,17 @@ +/******************* dumpvec.c (in su3.a) ***************************** +* * +* void dumpvec_KE( su3_vector *vec ) * +* print out a 3 element complex vector * +*/ +#include "../include/config.h" +#include <stdio.h> +#include "../include/complex.h" +#include "../include/su3.h" + +void dumpvec_KE( su3_vector * v ) +{ + int j; + for ( j = 0; j < 3; j++ ) + printf( "(%.2e,%.2e)\t", v->c[j].real, v->c[j].imag ); + printf( "\n" ); +} diff --git 
a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/flush_to_zero.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/flush_to_zero.c new file mode 100644 index 0000000000000000000000000000000000000000..13b2b04e4ff9d601a8e5983bd3ea36c2ed27b3f1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/flush_to_zero.c @@ -0,0 +1,6 @@ +/** flush_to_zero.c ***/ + +/* DUMMY ROUTINE - nothing to do unless on Intel machine; the body is intentionally empty so callers can invoke it unconditionally */ +void flush_to_zero_KE( ) +{ +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/g5_m_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/g5_m_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..58a308eb7c96651d9d2dc8ad9aa33210c8a7e25f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/g5_m_wvec.c @@ -0,0 +1,27 @@ +/******************** g5_m_wvec.c (in su3.a) ******************** +* +*void g5_mult_wvec(wilson_vector *src, wilson_vector *dest) +* Multiply a Wilson vector by gamma5 +* dest <- gamma5*src: for each first COLORSPINOR index 0..2, components with second index 0 and 1 are copied and those with second index 2 and 3 are negated (real and imaginary parts alike) +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void g5_mult_wvec( wilson_vector * src, wilson_vector * dest ) +{ + register int i; + /* every dest element is written from the src element at the same position, so no temporary is used */ + for ( i = 0; i < 3; i++ ) + { + dest->COLORSPINOR( i, 0 ).real = src->COLORSPINOR( i, 0 ).real; + dest->COLORSPINOR( i, 1 ).real = src->COLORSPINOR( i, 1 ).real; + dest->COLORSPINOR( i, 2 ).real = -( src->COLORSPINOR( i, 2 ).real ); + dest->COLORSPINOR( i, 3 ).real = -( src->COLORSPINOR( i, 3 ).real ); + + dest->COLORSPINOR( i, 0 ).imag = src->COLORSPINOR( i, 0 ).imag; + dest->COLORSPINOR( i, 1 ).imag = src->COLORSPINOR( i, 1 ).imag; + dest->COLORSPINOR( i, 2 ).imag = -( src->COLORSPINOR( i, 2 ).imag ); + dest->COLORSPINOR( i, 3 ).imag = -( src->COLORSPINOR( i, 3 ).imag ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/gaussrand.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/gaussrand.c new file mode 100644 index 
0000000000000000000000000000000000000000..147aea3ee8fb1e775ce4a3c11b5afdc78738e3b5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/gaussrand.c @@ -0,0 +1,41 @@ +/***************** gaussrand.c (in su3.a) ***************************** +* * +* double gaussian_ran_no( double_prn *prn_pt ) * +* Gaussian distributed random number * +* Probability distribution exp( -x*x ), so < x^2 > = 1/2 * +* This requires a random number generator named "myrand()", returning * +* a double uniformly distributed between zero and one. The argument of * +* this routine is a pointer to be passed to myrand(). * +*/ + +#include "../include/config.h" +#include +#include "../include/su3.h" +#include "../include/random.h" + +double gaussian_rand_no_KE( ) +{ + static int iset = 0; + static double gset; + double fac, r, v1, v2; + + if( iset == 0 ) + { + do + { + v1 = 2.0 * myrand( ) - 1.0; + v2 = 2.0 * myrand( ) - 1.0; + r = v1 * v1 + v2 * v2; + } + while( r >= 1.0 ); + fac = sqrt( -log( r ) / r ); + gset = v1 * fac; + iset = 1; + return v2 * fac; + } + else + { + iset = 0; + return gset; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/grow4wvecs.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/grow4wvecs.c new file mode 100644 index 0000000000000000000000000000000000000000..2d32f518516457b101064b32bfd0a7dade2919d0 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/grow4wvecs.c @@ -0,0 +1,177 @@ +/***************** grow4wvecs.c (in su3.a) **************************** +* * +* If sum=0, * +* Grow and add four wilson_vectors * +* If sum=1, * +* Grow and sum four wilson_vectors to another wilson_vector * +* void grow_four_wvecs(a,b1,b2,b3,b4,sign,sum) * +* wilson_vector *a; half_wilson_vector *b1,*b2,*b3,*b4; * +* int sign,sum; * +* A <- B1 + B2 + B3 + B4 or * +* A <- A + B1 + B2 + B3 + B4 * +* B1 is expanded using gamma_x, B2 using gamma_y, etc. 
* +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" +#include "../include/dirs.h" +void grow_add_four_wvecs( wilson_vector * a, half_wilson_vector * b1, + half_wilson_vector * b2, half_wilson_vector * b3, half_wilson_vector * b4, int sign, int sum ) +{ + int i; + if( sum == 0 ) + { + /* wp_grow( b1,a,XUP,sign); */ + + /* case XUP: */ + if( sign == PLUS ) + { + for ( i = 0; i < 3; i++ ) + { + a->COLORSPINOR( i, 0 ) = b1->h[0].c[i]; + a->COLORSPINOR( i, 1 ) = b1->h[1].c[i]; + TIMESMINUSI( b1->h[0].c[i], a->COLORSPINOR( i, 3 ) ); + TIMESMINUSI( b1->h[1].c[i], a->COLORSPINOR( i, 2 ) ); + } + } + else + { + /* case XDOWN: */ + for ( i = 0; i < 3; i++ ) + { + a->COLORSPINOR( i, 0 ) = b1->h[0].c[i]; + a->COLORSPINOR( i, 1 ) = b1->h[1].c[i]; + TIMESPLUSI( b1->h[0].c[i], a->COLORSPINOR( i, 3 ) ); + TIMESPLUSI( b1->h[1].c[i], a->COLORSPINOR( i, 2 ) ); + } + } + } + else + { + /* wp_grow_add( b1,a,XUP,sign); */ + + /* case XUP: */ + if( sign == PLUS ) + { + for ( i = 0; i < 3; i++ ) + { + CSUM( a->COLORSPINOR( i, 0 ), b1->h[0].c[i] ); + CSUM( a->COLORSPINOR( i, 1 ), b1->h[1].c[i] ); + CSUM_TMI( a->COLORSPINOR( i, 2 ), b1->h[1].c[i] ); + CSUM_TMI( a->COLORSPINOR( i, 3 ), b1->h[0].c[i] ); + } + } + else + { + /* case XDOWN: */ + for ( i = 0; i < 3; i++ ) + { + CSUM( a->COLORSPINOR( i, 0 ), b1->h[0].c[i] ); + CSUM( a->COLORSPINOR( i, 1 ), b1->h[1].c[i] ); + CSUM_TPI( a->COLORSPINOR( i, 2 ), b1->h[1].c[i] ); + CSUM_TPI( a->COLORSPINOR( i, 3 ), b1->h[0].c[i] ); + } + } + } + + /* wp_grow_add( b2,a,YUP,sign); */ + + if( sign == PLUS ) + { + /* case YUP: */ + for ( i = 0; i < 3; i++ ) + { + CSUM( a->COLORSPINOR( i, 0 ), b2->h[0].c[i] ); + CSUM( a->COLORSPINOR( i, 1 ), b2->h[1].c[i] ); + CSUM( a->COLORSPINOR( i, 2 ), b2->h[1].c[i] ); + CSUB( a->COLORSPINOR( i, 3 ), b2->h[0].c[i], a->COLORSPINOR( i, 3 ) ); + } + } + else + { + /* case YDOWN: */ + for ( i = 0; i < 3; i++ ) + { + CSUM( a->COLORSPINOR( i, 0 ), b2->h[0].c[i] ); + CSUM( 
a->COLORSPINOR( i, 1 ), b2->h[1].c[i] ); + CSUB( a->COLORSPINOR( i, 2 ), b2->h[1].c[i], a->COLORSPINOR( i, 2 ) ); + CSUM( a->COLORSPINOR( i, 3 ), b2->h[0].c[i] ); + } + } + + /* wp_grow_add( b3,a,ZUP,sign); */ + + if( sign == PLUS ) + { + /* case ZUP: */ + for ( i = 0; i < 3; i++ ) + { + CSUM( a->COLORSPINOR( i, 0 ), b3->h[0].c[i] ); + CSUM( a->COLORSPINOR( i, 1 ), b3->h[1].c[i] ); + CSUM_TMI( a->COLORSPINOR( i, 2 ), b3->h[0].c[i] ); + CSUM_TPI( a->COLORSPINOR( i, 3 ), b3->h[1].c[i] ); + } + } + else + { + /* case ZDOWN: */ + for ( i = 0; i < 3; i++ ) + { + CSUM( a->COLORSPINOR( i, 0 ), b3->h[0].c[i] ); + CSUM( a->COLORSPINOR( i, 1 ), b3->h[1].c[i] ); + CSUM_TPI( a->COLORSPINOR( i, 2 ), b3->h[0].c[i] ); + CSUM_TMI( a->COLORSPINOR( i, 3 ), b3->h[1].c[i] ); + } + } + + /* wp_grow_add( b4,a,TUP,sign); */ + + if( sign == PLUS ) + { + /* case TUP: */ + for ( i = 0; i < 3; i++ ) + { + CSUM( a->COLORSPINOR( i, 0 ), b4->h[0].c[i] ); + CSUM( a->COLORSPINOR( i, 1 ), b4->h[1].c[i] ); + CSUM( a->COLORSPINOR( i, 2 ), b4->h[0].c[i] ); + CSUM( a->COLORSPINOR( i, 3 ), b4->h[1].c[i] ); + } + } + else + { + /* case TDOWN: */ + for ( i = 0; i < 3; i++ ) + { + CSUM( a->COLORSPINOR( i, 0 ), b4->h[0].c[i] ); + CSUM( a->COLORSPINOR( i, 1 ), b4->h[1].c[i] ); + CSUB( a->COLORSPINOR( i, 2 ), b4->h[0].c[i], a->COLORSPINOR( i, 2 ) ); + CSUB( a->COLORSPINOR( i, 3 ), b4->h[1].c[i], a->COLORSPINOR( i, 3 ) ); + } + } + + +} + +void grow_add_four_wvecs_hch( wilson_vector * a, half_wilson_vector * b1, + half_wilson_vector * b2, half_wilson_vector * b3, half_wilson_vector * b4, int sign, int sum ) +{ + + if( sum == 0 ) + { + wp_grow_hch( b1, a, XUP, sign ); + + } + else + { + wp_grow_add_hch( b1, a, XUP, sign ); + + } + + wp_grow_add_hch( b2, a, YUP, sign ); + + wp_grow_add_hch( b3, a, ZUP, sign ); + + wp_grow_add_hch( b4, a, TUP, sign ); + + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/l_su2_hit_n.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/l_su2_hit_n.c new 
file mode 100644 index 0000000000000000000000000000000000000000..44fb70ac725baddffbf424b62c372d283d099ac7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/l_su2_hit_n.c @@ -0,0 +1,22 @@ +/************** l_su2_hit_n.c (in su3.a) ********************** +* * +* left multiply an su3_matrix by an su2 matrix * +*/ + +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void left_su2_hit_n_KE( su2_matrix * u, int p, int q, su3_matrix * link ) +{ + /* link <- u * link */ + /* The 0 row of the SU(2) matrix u matches row p of the SU(3) matrix */ + /* The 1 row of the SU(2) matrix u matches row q of the SU(3) matrix */ + /* C. DeTar 18 Oct 1990 */ + + register int m; + + for ( m = 0; m < 3; m++ ) + mult_su2_mat_vec_elem_n_KE( u, &( link->ROWCOL( p, m ) ), &( link->ROWCOL( q, m ) ) ); + +} /* l_su2_hit_n.c */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amat_hwvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amat_hwvec.c new file mode 100644 index 0000000000000000000000000000000000000000..f0e2aeabae85a2fe115116c1c657e8eed184b1fc --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amat_hwvec.c @@ -0,0 +1,123 @@ +/************** m_amat_hwvec.c (in su3.a) ********************** +* * +* void mult_adj_su3_mat_hwvec( su3_matrix *mat, * +* half_wilson_vector *src,*dest ) * +* multiply a Wilson half-vector by the adjoint of a matrix * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void mult_adj_su3_mat_hwvec( su3_matrix * mat, half_wilson_vector * src, half_wilson_vector * dest ) +{ + +#ifdef NATIVEDOUBLE + register double a0r, a0i, a1r, a1i, a2r, a2i; + register double b0r, b0i, b1r, b1i, b2r, b2i; +#else + register double a0r, a0i, a1r, a1i, a2r, a2i; + register double b0r, b0i, b1r, b1i, b2r, b2i; +#endif + + /* mult_adj_su3_mat_vec_KE(mat, &(src->h[0]), &(dest->h[0]) ); */ + + a0r = mat->ROWCOL( 0, 0 
).real; + a0i = mat->ROWCOL( 0, 0 ).imag; + b0r = src->h[0].c[0].real; + b0i = src->h[0].c[0].imag; + a1r = mat->ROWCOL( 1, 0 ).real; + a1i = mat->ROWCOL( 1, 0 ).imag; + b1r = src->h[0].c[1].real; + b1i = src->h[0].c[1].imag; + a2r = mat->ROWCOL( 2, 0 ).real; + a2i = mat->ROWCOL( 2, 0 ).imag; + b2r = src->h[0].c[2].real; + b2i = src->h[0].c[2].imag; + + dest->h[0].c[0].real = a0r * b0r + a0i * b0i + a1r * b1r + a1i * b1i + a2r * b2r + a2i * b2i; + dest->h[0].c[0].imag = a0r * b0i - a0i * b0r + a1r * b1i - a1i * b1r + a2r * b2i - a2i * b2r; + + a0r = mat->ROWCOL( 0, 1 ).real; + a0i = mat->ROWCOL( 0, 1 ).imag; + b0r = src->h[0].c[0].real; + b0i = src->h[0].c[0].imag; + a1r = mat->ROWCOL( 1, 1 ).real; + a1i = mat->ROWCOL( 1, 1 ).imag; + b1r = src->h[0].c[1].real; + b1i = src->h[0].c[1].imag; + a2r = mat->ROWCOL( 2, 1 ).real; + a2i = mat->ROWCOL( 2, 1 ).imag; + b2r = src->h[0].c[2].real; + b2i = src->h[0].c[2].imag; + + dest->h[0].c[1].real = a0r * b0r + a0i * b0i + a1r * b1r + a1i * b1i + a2r * b2r + a2i * b2i; + dest->h[0].c[1].imag = a0r * b0i - a0i * b0r + a1r * b1i - a1i * b1r + a2r * b2i - a2i * b2r; + + a0r = mat->ROWCOL( 0, 2 ).real; + a0i = mat->ROWCOL( 0, 2 ).imag; + b0r = src->h[0].c[0].real; + b0i = src->h[0].c[0].imag; + a1r = mat->ROWCOL( 1, 2 ).real; + a1i = mat->ROWCOL( 1, 2 ).imag; + b1r = src->h[0].c[1].real; + b1i = src->h[0].c[1].imag; + a2r = mat->ROWCOL( 2, 2 ).real; + a2i = mat->ROWCOL( 2, 2 ).imag; + b2r = src->h[0].c[2].real; + b2i = src->h[0].c[2].imag; + + dest->h[0].c[2].real = a0r * b0r + a0i * b0i + a1r * b1r + a1i * b1i + a2r * b2r + a2i * b2i; + dest->h[0].c[2].imag = a0r * b0i - a0i * b0r + a1r * b1i - a1i * b1r + a2r * b2i - a2i * b2r; + + + /* mult_adj_su3_mat_vec_KE(mat, &(src->h[1]), &(dest->h[1]) ); */ + + a0r = mat->ROWCOL( 0, 0 ).real; + a0i = mat->ROWCOL( 0, 0 ).imag; + b0r = src->h[1].c[0].real; + b0i = src->h[1].c[0].imag; + a1r = mat->ROWCOL( 1, 0 ).real; + a1i = mat->ROWCOL( 1, 0 ).imag; + b1r = src->h[1].c[1].real; + b1i = 
src->h[1].c[1].imag; + a2r = mat->ROWCOL( 2, 0 ).real; + a2i = mat->ROWCOL( 2, 0 ).imag; + b2r = src->h[1].c[2].real; + b2i = src->h[1].c[2].imag; + + dest->h[1].c[0].real = a0r * b0r + a0i * b0i + a1r * b1r + a1i * b1i + a2r * b2r + a2i * b2i; + dest->h[1].c[0].imag = a0r * b0i - a0i * b0r + a1r * b1i - a1i * b1r + a2r * b2i - a2i * b2r; + + a0r = mat->ROWCOL( 0, 1 ).real; + a0i = mat->ROWCOL( 0, 1 ).imag; + b0r = src->h[1].c[0].real; + b0i = src->h[1].c[0].imag; + a1r = mat->ROWCOL( 1, 1 ).real; + a1i = mat->ROWCOL( 1, 1 ).imag; + b1r = src->h[1].c[1].real; + b1i = src->h[1].c[1].imag; + a2r = mat->ROWCOL( 2, 1 ).real; + a2i = mat->ROWCOL( 2, 1 ).imag; + b2r = src->h[1].c[2].real; + b2i = src->h[1].c[2].imag; + + dest->h[1].c[1].real = a0r * b0r + a0i * b0i + a1r * b1r + a1i * b1i + a2r * b2r + a2i * b2i; + dest->h[1].c[1].imag = a0r * b0i - a0i * b0r + a1r * b1i - a1i * b1r + a2r * b2i - a2i * b2r; + + a0r = mat->ROWCOL( 0, 2 ).real; + a0i = mat->ROWCOL( 0, 2 ).imag; + b0r = src->h[1].c[0].real; + b0i = src->h[1].c[0].imag; + a1r = mat->ROWCOL( 1, 2 ).real; + a1i = mat->ROWCOL( 1, 2 ).imag; + b1r = src->h[1].c[1].real; + b1i = src->h[1].c[1].imag; + a2r = mat->ROWCOL( 2, 2 ).real; + a2i = mat->ROWCOL( 2, 2 ).imag; + b2r = src->h[1].c[2].real; + b2i = src->h[1].c[2].imag; + + dest->h[1].c[2].real = a0r * b0r + a0i * b0i + a1r * b1r + a1i * b1i + a2r * b2r + a2i * b2i; + dest->h[1].c[2].imag = a0r * b0i - a0i * b0r + a1r * b1i - a1i * b1r + a2r * b2i - a2i * b2r; + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amat_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amat_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..c2c9f06a1f354edfe66ab65441137260ba8fd517 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amat_wvec.c @@ -0,0 +1,61 @@ +/*************** m_amat_wvec.c (in su3.a) ********************** + * * + * void mult_adj_mat_wilson_vec( su3_matrix *mat, * + * wilson_vector 
*src,*dest) * + * multiply a Wilson vector by the adjoint of a matrix * + */ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void mult_adj_mat_wilson_vec( su3_matrix * mat, wilson_vector * src, wilson_vector * dest ) +{ + register int i, k; + register double t, ar, ai, br, bi, cr, ci; + for ( k = 0; k < 4; k++ ) + { + for ( i = 0; i < 3; i++ ) + { + + ar = mat->ROWCOL( 0, i ).real; + ai = mat->ROWCOL( 0, i ).imag; + br = src->COLORSPINOR( 0, k ).real; + bi = src->COLORSPINOR( 0, k ).imag; + cr = ar * br; + t = ai * bi; + cr += t; + ci = ar * bi; + t = ai * br; + ci -= t; + + ar = mat->ROWCOL( 1, i ).real; + ai = mat->ROWCOL( 1, i ).imag; + br = src->COLORSPINOR( 1, k ).real; + bi = src->COLORSPINOR( 1, k ).imag; + t = ar * br; + cr += t; + t = ai * bi; + cr += t; + t = ar * bi; + ci += t; + t = ai * br; + ci -= t; + + ar = mat->ROWCOL( 2, i ).real; + ai = mat->ROWCOL( 2, i ).imag; + br = src->COLORSPINOR( 2, k ).real; + bi = src->COLORSPINOR( 2, k ).imag; + t = ar * br; + cr += t; + t = ai * bi; + cr += t; + t = ar * bi; + ci += t; + t = ai * br; + ci -= t; + + dest->COLORSPINOR( i, k ).real = cr; + dest->COLORSPINOR( i, k ).imag = ci; + } + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amatvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amatvec.c new file mode 100644 index 0000000000000000000000000000000000000000..5ce7299ad38f0bfb2551686643425157463e78ba --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amatvec.c @@ -0,0 +1,166 @@ +/***************** m_amatvec.c (in su3.a) ***************************** +* * +* void mult_adj_su3_mat_vec_KE( su3_matrix *a, su3_vector *b,*c ) * +* C <- A_adjoint * B * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +#ifndef FAST +/* adjoint matrix times vector multiply */ +void mult_adj_su3_mat_vec_KE( su3_matrix * a, su3_vector * b, su3_vector * c ) +{ + register int 
i, j; + register complex x, y, z; + for ( i = 0; i < 3; i++ ) + { + x.real = x.imag = 0.0; + for ( j = 0; j < 3; j++ ) + { + CONJG( a->ROWCOL( j, i ), z ); + CMUL( z, b->c[j], y ) CSUM( x, y ); + } + c->c[i] = x; + } +} + +#else +#ifdef NATIVEDOUBLE /* IBM RS6000 version */ +void mult_adj_su3_mat_vec_KE( su3_matrix * a, su3_vector * b, su3_vector * c ) +{ + + register double c0r, c0i, c1r, c1i, c2r, c2i; + register double br, bi, a0, a1, a2; + + br = b->c[0].real; + bi = b->c[0].imag; + a0 = a->ROWCOL( 0, 0 ).real; + a1 = a->ROWCOL( 0, 1 ).real; + a2 = a->ROWCOL( 0, 2 ).real; + + c0r = a0 * br; + c1r = a1 * br; + c2r = a2 * br; + c0i = a0 * bi; + c1i = a1 * bi; + c2i = a2 * bi; + + a0 = a->ROWCOL( 0, 0 ).imag; + a1 = a->ROWCOL( 0, 1 ).imag; + a2 = a->ROWCOL( 0, 2 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + br = b->c[1].real; + bi = b->c[1].imag; + a0 = a->ROWCOL( 1, 0 ).real; + a1 = a->ROWCOL( 1, 1 ).real; + a2 = a->ROWCOL( 1, 2 ).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = a->ROWCOL( 1, 0 ).imag; + a1 = a->ROWCOL( 1, 1 ).imag; + a2 = a->ROWCOL( 1, 2 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + br = b->c[2].real; + bi = b->c[2].imag; + a0 = a->ROWCOL( 2, 0 ).real; + a1 = a->ROWCOL( 2, 1 ).real; + a2 = a->ROWCOL( 2, 2 ).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = a->ROWCOL( 2, 0 ).imag; + a1 = a->ROWCOL( 2, 1 ).imag; + a2 = a->ROWCOL( 2, 2 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + +} +#else +void mult_adj_su3_mat_vec_KE( su3_matrix * a, su3_vector 
* b, su3_vector * c ) +{ + int i; + register double t, ar, ai, br, bi, cr, ci; + for ( i = 0; i < 3; i++ ) + { + + ar = a->ROWCOL( 0, i ).real; + ai = a->ROWCOL( 0, i ).imag; + br = b->c[0].real; + bi = b->c[0].imag; + cr = ar * br; + t = ai * bi; + cr += t; + ci = ar * bi; + t = ai * br; + ci -= t; + + ar = a->ROWCOL( 1, i ).real; + ai = a->ROWCOL( 1, i ).imag; + br = b->c[1].real; + bi = b->c[1].imag; + t = ar * br; + cr += t; + t = ai * bi; + cr += t; + t = ar * bi; + ci += t; + t = ai * br; + ci -= t; + + ar = a->ROWCOL( 2, i ).real; + ai = a->ROWCOL( 2, i ).imag; + br = b->c[2].real; + bi = b->c[2].imag; + t = ar * br; + cr += t; + t = ai * bi; + cr += t; + t = ar * bi; + ci += t; + t = ai * br; + ci -= t; + + c->c[i].real = cr; + c->c[i].imag = ci; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#ifndef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amatvec_ns.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amatvec_ns.c new file mode 100644 index 0000000000000000000000000000000000000000..7ed1d1f4f50e7a8f9b3d36f03acab9717296d89c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amatvec_ns.c @@ -0,0 +1,121 @@ +/****************** m_amatvec_ns.c (in su3.a) ************************* +* * +* void mult_adj_su3_mat_vec_nsum_KE( su3_matrix *a, su3_vector *b,*c ) * +* adjoint matrix times vector multiply and subtract from another vector * +* C <- C - A_adjoint*B * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +#ifndef FAST +void mult_adj_su3_mat_vec_nsum_KE( su3_matrix * a, su3_vector * b, su3_vector * c ) +{ + register int i, j; + register complex x, y, z; + for ( i = 0; i < 3; i++ ) + { + x.real = x.imag = 0.0; + for ( j = 0; j < 3; j++ ) + { + CONJG( a->ROWCOL( j, i ), z ); + CMUL( z, b->c[j], y ) CSUM( x, y ); + } + c->c[i].real -= x.real; + c->c[i].imag -= x.imag; + } +} + +#else +void mult_adj_su3_mat_vec_nsum_KE( 
su3_matrix * a, su3_vector * b, su3_vector * c ) +{ + +#ifdef NATIVEDOUBLE + register double c0r, c0i, c1r, c1i, c2r, c2i; + register double br, bi, a0, a1, a2; +#else + register double c0r, c0i, c1r, c1i, c2r, c2i; + register double br, bi, a0, a1, a2; +#endif + + br = b->c[0].real; + bi = b->c[0].imag; + a0 = a->ROWCOL( 0, 0 ).real; + a1 = a->ROWCOL( 0, 1 ).real; + a2 = a->ROWCOL( 0, 2 ).real; + + c0r = a0 * br; + c1r = a1 * br; + c2r = a2 * br; + c0i = a0 * bi; + c1i = a1 * bi; + c2i = a2 * bi; + + a0 = a->ROWCOL( 0, 0 ).imag; + a1 = a->ROWCOL( 0, 1 ).imag; + a2 = a->ROWCOL( 0, 2 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + br = b->c[1].real; + bi = b->c[1].imag; + a0 = a->ROWCOL( 1, 0 ).real; + a1 = a->ROWCOL( 1, 1 ).real; + a2 = a->ROWCOL( 1, 2 ).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = a->ROWCOL( 1, 0 ).imag; + a1 = a->ROWCOL( 1, 1 ).imag; + a2 = a->ROWCOL( 1, 2 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + br = b->c[2].real; + bi = b->c[2].imag; + a0 = a->ROWCOL( 2, 0 ).real; + a1 = a->ROWCOL( 2, 1 ).real; + a2 = a->ROWCOL( 2, 2 ).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = a->ROWCOL( 2, 0 ).imag; + a1 = a->ROWCOL( 2, 1 ).imag; + a2 = a->ROWCOL( 2, 2 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + c->c[0].real -= c0r; + c->c[0].imag -= c0i; + c->c[1].real -= c1r; + c->c[1].imag -= c1i; + c->c[2].real -= c2r; + c->c[2].imag -= c2i; + +} +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amatvec_s.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amatvec_s.c new file mode 100644 index 
0000000000000000000000000000000000000000..75d3bb76bb006a0999eb0be6848cd420ea5bb7da --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amatvec_s.c @@ -0,0 +1,121 @@ +/******************* m_amatvec_s.c (in su3.a) ************************* +* * +* void mult_adj_su3_mat_vec_sum_KE( su3_matrix *a, su3_vector *b,*c ) * +* adjoint matrix times vector multiply and add to another vector * +* C <- C + A_adjoint*B * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +#ifndef FAST +void mult_adj_su3_mat_vec_sum_KE( su3_matrix * a, su3_vector * b, su3_vector * c ) +{ + register int i, j; + register complex x, y, z; + for ( i = 0; i < 3; i++ ) + { + x.real = x.imag = 0.0; + for ( j = 0; j < 3; j++ ) + { + CONJG( a->ROWCOL( j, i ), z ); + CMUL( z, b->c[j], y ) CSUM( x, y ); + } + c->c[i].real += x.real; + c->c[i].imag += x.imag; + } +} + +#else +void mult_adj_su3_mat_vec_sum_KE( su3_matrix * a, su3_vector * b, su3_vector * c ) +{ + +#ifdef NATIVEDOUBLE + register double c0r, c0i, c1r, c1i, c2r, c2i; + register double br, bi, a0, a1, a2; +#else + register double c0r, c0i, c1r, c1i, c2r, c2i; + register double br, bi, a0, a1, a2; +#endif + + br = b->c[0].real; + bi = b->c[0].imag; + a0 = a->ROWCOL( 0, 0 ).real; + a1 = a->ROWCOL( 0, 1 ).real; + a2 = a->ROWCOL( 0, 2 ).real; + + c0r = a0 * br; + c1r = a1 * br; + c2r = a2 * br; + c0i = a0 * bi; + c1i = a1 * bi; + c2i = a2 * bi; + + a0 = a->ROWCOL( 0, 0 ).imag; + a1 = a->ROWCOL( 0, 1 ).imag; + a2 = a->ROWCOL( 0, 2 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + br = b->c[1].real; + bi = b->c[1].imag; + a0 = a->ROWCOL( 1, 0 ).real; + a1 = a->ROWCOL( 1, 1 ).real; + a2 = a->ROWCOL( 1, 2 ).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = a->ROWCOL( 1, 0 ).imag; + a1 = a->ROWCOL( 1, 1 ).imag; + a2 = a->ROWCOL( 1, 2 ).imag; 
+ + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + br = b->c[2].real; + bi = b->c[2].imag; + a0 = a->ROWCOL( 2, 0 ).real; + a1 = a->ROWCOL( 2, 1 ).real; + a2 = a->ROWCOL( 2, 2 ).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = a->ROWCOL( 2, 0 ).imag; + a1 = a->ROWCOL( 2, 1 ).imag; + a2 = a->ROWCOL( 2, 2 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + c->c[0].real += c0r; + c->c[0].imag += c0i; + c->c[1].real += c1r; + c->c[1].imag += c1i; + c->c[2].real += c2r; + c->c[2].imag += c2i; +} + +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amv_4dir.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amv_4dir.c new file mode 100644 index 0000000000000000000000000000000000000000..5b1778b357ef4c84c41daad7391f5a4e5f82b6b6 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amv_4dir.c @@ -0,0 +1,124 @@ +/***************** m_amv_4dir.c (in su3.a) ***************************** +* * +* void mult_adj_su3_mat_vec_4dir_KE( su3_matrix *mat, * +* su3_vector *src, su3_vector *dest ) * +* Multiply an su3_vector by an array of four adjoint su3_matrices, * +* result in an array of four su3_vectors. 
* +* dest[i] <- A_adjoint[i] * src * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +#ifndef FAST +void mult_adj_su3_mat_vec_4dir_KE( su3_matrix * mat, su3_vector * src, su3_vector * dest ) +{ + mult_adj_su3_mat_vec_KE( mat + 0, src, dest + 0 ); + mult_adj_su3_mat_vec_KE( mat + 1, src, dest + 1 ); + mult_adj_su3_mat_vec_KE( mat + 2, src, dest + 2 ); + mult_adj_su3_mat_vec_KE( mat + 3, src, dest + 3 ); +} + +#else +/* Fast code, with subroutines inlined */ + +void mult_adj_su3_mat_vec_4dir_KE( su3_matrix * mat, su3_vector * src, su3_vector * dest ) +{ + register int n; +#ifdef NATIVEDOUBLE + register double c0r, c0i, c1r, c1i, c2r, c2i; + register double br, bi, a0, a1, a2; +#else + register double c0r, c0i, c1r, c1i, c2r, c2i; + register double br, bi, a0, a1, a2; +#endif + register su3_matrix *a; + register su3_vector *b, *c; + + a = mat; + c = dest; + b = src; + for ( n = 0; n < 4; n++, a++, c++ ) + { + + br = b->c[0].real; + bi = b->c[0].imag; + a0 = a->ROWCOL( 0, 0 ).real; + a1 = a->ROWCOL( 0, 1 ).real; + a2 = a->ROWCOL( 0, 2 ).real; + + c0r = a0 * br; + c1r = a1 * br; + c2r = a2 * br; + c0i = a0 * bi; + c1i = a1 * bi; + c2i = a2 * bi; + + a0 = a->ROWCOL( 0, 0 ).imag; + a1 = a->ROWCOL( 0, 1 ).imag; + a2 = a->ROWCOL( 0, 2 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + br = b->c[1].real; + bi = b->c[1].imag; + a0 = a->ROWCOL( 1, 0 ).real; + a1 = a->ROWCOL( 1, 1 ).real; + a2 = a->ROWCOL( 1, 2 ).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = a->ROWCOL( 1, 0 ).imag; + a1 = a->ROWCOL( 1, 1 ).imag; + a2 = a->ROWCOL( 1, 2 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + br = b->c[2].real; + bi = b->c[2].imag; + a0 = a->ROWCOL( 2, 0 ).real; + a1 = a->ROWCOL( 2, 1 ).real; + a2 = a->ROWCOL( 2, 2 
).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = a->ROWCOL( 2, 0 ).imag; + a1 = a->ROWCOL( 2, 1 ).imag; + a2 = a->ROWCOL( 2, 2 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + } +} +#endif /* End of "#ifndef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amv_4vec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amv_4vec.c new file mode 100644 index 0000000000000000000000000000000000000000..e34c9154d10f8ebea1ba930f51a491ccd760523c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_amv_4vec.c @@ -0,0 +1,132 @@ +/***************** m_amv_4vec.c in su3.a ***************************** +* * +* void mult_adj_su3_mat_4vec( su3_matrix *mat, * +* su3_vector *src, su3_vector *dest0, *dest1, *dest2, *dest3 ) * +* Multiply an su3_vector by an array of four adjoint su3_matrices, * +* result in four SEPARATE su3_vectors. 
* +* desti <- A_adjoint[i] * src * +* See also m_amv_4dir.c for the case desti = dest[i] * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" +#ifndef FAST +void mult_adj_su3_mat_4vec( su3_matrix * mat, su3_vector * src, + su3_vector * dest0, su3_vector * dest1, su3_vector * dest2, su3_vector * dest3 ) +{ + mult_adj_su3_mat_vec_KE( mat + 0, src, dest0 ); + mult_adj_su3_mat_vec_KE( mat + 1, src, dest1 ); + mult_adj_su3_mat_vec_KE( mat + 2, src, dest2 ); + mult_adj_su3_mat_vec_KE( mat + 3, src, dest3 ); +} + +#else +/* Fast code, with subroutines inlined: each pass computes one desti = adjoint(mat[i]) * src */ + +void mult_adj_su3_mat_4vec( su3_matrix * mat, su3_vector * src, + su3_vector * dest0, su3_vector * dest1, su3_vector * dest2, su3_vector * dest3 ) +{ + register int n; +#ifdef NATIVEDOUBLE + register double c0r, c0i, c1r, c1i, c2r, c2i; + register double br, bi, a0, a1, a2; +#else + register double c0r, c0i, c1r, c1i, c2r, c2i; + register double br, bi, a0, a1, a2; +#endif + register su3_matrix *a; + register su3_vector *b, *c; + su3_vector *cc[4]; + + cc[0] = dest0; + cc[1] = dest1; + cc[2] = dest2; + cc[3] = dest3; + + a = mat; + c = dest0; + b = src; + for ( n = 0; n < 4; n++, a++ ) + { + c = cc[n]; /* select the destination here: doing it in the for-increment read cc[4] after the last pass */ + br = b->c[0].real; + bi = b->c[0].imag; + a0 = a->ROWCOL( 0, 0 ).real; + a1 = a->ROWCOL( 0, 1 ).real; + a2 = a->ROWCOL( 0, 2 ).real; + + c0r = a0 * br; + c1r = a1 * br; + c2r = a2 * br; + c0i = a0 * bi; + c1i = a1 * bi; + c2i = a2 * bi; + + a0 = a->ROWCOL( 0, 0 ).imag; + a1 = a->ROWCOL( 0, 1 ).imag; + a2 = a->ROWCOL( 0, 2 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + br = b->c[1].real; + bi = b->c[1].imag; + a0 = a->ROWCOL( 1, 0 ).real; + a1 = a->ROWCOL( 1, 1 ).real; + a2 = a->ROWCOL( 1, 2 ).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = a->ROWCOL( 1, 0 ).imag; + a1 = a->ROWCOL( 1, 1 ).imag; + a2 = a->ROWCOL( 1, 2 
).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + br = b->c[2].real; + bi = b->c[2].imag; + a0 = a->ROWCOL( 2, 0 ).real; + a1 = a->ROWCOL( 2, 1 ).real; + a2 = a->ROWCOL( 2, 2 ).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = a->ROWCOL( 2, 0 ).imag; + a1 = a->ROWCOL( 2, 1 ).imag; + a2 = a->ROWCOL( 2, 2 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + } +} +#endif /* End of "#ifndef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_mat_hwvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_mat_hwvec.c new file mode 100644 index 0000000000000000000000000000000000000000..391b0c131fd12ccc42bc0a366c551cb72e29c29f --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_mat_hwvec.c @@ -0,0 +1,123 @@ +/************** m_mat_hwvec.c (in su3.a) *********************** +* * +* void mult_su3_mat_hwvec(su3_matrix *mat, * +* half_wilson_vector *src,*dest) * +* multiply a Wilson half-vector by a matrix * +* dest <- mat*src * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void mult_su3_mat_hwvec( su3_matrix * mat, half_wilson_vector * src, half_wilson_vector * dest ) +{ + +#ifdef NATIVEDOUBLE + register double a0r, a0i, a1r, a1i, a2r, a2i; + register double b0r, b0i, b1r, b1i, b2r, b2i; +#else + register double a0r, a0i, a1r, a1i, a2r, a2i; + register double b0r, b0i, b1r, b1i, b2r, b2i; +#endif + + /* mult_su3_mat_vec_KE(mat, &(src->h[0]), &(dest->h[0]) ); */ + + a0r = mat->ROWCOL( 0, 0 ).real; + a0i = mat->ROWCOL( 0, 0 ).imag; + b0r = src->h[0].c[0].real; + b0i = src->h[0].c[0].imag; + a1r = mat->ROWCOL( 0, 1 ).real; + a1i = 
mat->ROWCOL( 0, 1 ).imag; + b1r = src->h[0].c[1].real; + b1i = src->h[0].c[1].imag; + a2r = mat->ROWCOL( 0, 2 ).real; + a2i = mat->ROWCOL( 0, 2 ).imag; + b2r = src->h[0].c[2].real; + b2i = src->h[0].c[2].imag; + + dest->h[0].c[0].real = a0r * b0r - a0i * b0i + a1r * b1r - a1i * b1i + a2r * b2r - a2i * b2i; + dest->h[0].c[0].imag = a0r * b0i + a0i * b0r + a1r * b1i + a1i * b1r + a2r * b2i + a2i * b2r; + + a0r = mat->ROWCOL( 1, 0 ).real; + a0i = mat->ROWCOL( 1, 0 ).imag; + b0r = src->h[0].c[0].real; + b0i = src->h[0].c[0].imag; + a1r = mat->ROWCOL( 1, 1 ).real; + a1i = mat->ROWCOL( 1, 1 ).imag; + b1r = src->h[0].c[1].real; + b1i = src->h[0].c[1].imag; + a2r = mat->ROWCOL( 1, 2 ).real; + a2i = mat->ROWCOL( 1, 2 ).imag; + b2r = src->h[0].c[2].real; + b2i = src->h[0].c[2].imag; + + dest->h[0].c[1].real = a0r * b0r - a0i * b0i + a1r * b1r - a1i * b1i + a2r * b2r - a2i * b2i; + dest->h[0].c[1].imag = a0r * b0i + a0i * b0r + a1r * b1i + a1i * b1r + a2r * b2i + a2i * b2r; + + a0r = mat->ROWCOL( 2, 0 ).real; + a0i = mat->ROWCOL( 2, 0 ).imag; + b0r = src->h[0].c[0].real; + b0i = src->h[0].c[0].imag; + a1r = mat->ROWCOL( 2, 1 ).real; + a1i = mat->ROWCOL( 2, 1 ).imag; + b1r = src->h[0].c[1].real; + b1i = src->h[0].c[1].imag; + a2r = mat->ROWCOL( 2, 2 ).real; + a2i = mat->ROWCOL( 2, 2 ).imag; + b2r = src->h[0].c[2].real; + b2i = src->h[0].c[2].imag; + + dest->h[0].c[2].real = a0r * b0r - a0i * b0i + a1r * b1r - a1i * b1i + a2r * b2r - a2i * b2i; + dest->h[0].c[2].imag = a0r * b0i + a0i * b0r + a1r * b1i + a1i * b1r + a2r * b2i + a2i * b2r; + + /* mult_su3_mat_vec_KE(mat, &(src->h[1]), &(dest->h[1]) ); */ + + a0r = mat->ROWCOL( 0, 0 ).real; + a0i = mat->ROWCOL( 0, 0 ).imag; + b0r = src->h[1].c[0].real; + b0i = src->h[1].c[0].imag; + a1r = mat->ROWCOL( 0, 1 ).real; + a1i = mat->ROWCOL( 0, 1 ).imag; + b1r = src->h[1].c[1].real; + b1i = src->h[1].c[1].imag; + a2r = mat->ROWCOL( 0, 2 ).real; + a2i = mat->ROWCOL( 0, 2 ).imag; + b2r = src->h[1].c[2].real; + b2i = src->h[1].c[2].imag; + 
+ dest->h[1].c[0].real = a0r * b0r - a0i * b0i + a1r * b1r - a1i * b1i + a2r * b2r - a2i * b2i; + dest->h[1].c[0].imag = a0r * b0i + a0i * b0r + a1r * b1i + a1i * b1r + a2r * b2i + a2i * b2r; + + a0r = mat->ROWCOL( 1, 0 ).real; + a0i = mat->ROWCOL( 1, 0 ).imag; + b0r = src->h[1].c[0].real; + b0i = src->h[1].c[0].imag; + a1r = mat->ROWCOL( 1, 1 ).real; + a1i = mat->ROWCOL( 1, 1 ).imag; + b1r = src->h[1].c[1].real; + b1i = src->h[1].c[1].imag; + a2r = mat->ROWCOL( 1, 2 ).real; + a2i = mat->ROWCOL( 1, 2 ).imag; + b2r = src->h[1].c[2].real; + b2i = src->h[1].c[2].imag; + + dest->h[1].c[1].real = a0r * b0r - a0i * b0i + a1r * b1r - a1i * b1i + a2r * b2r - a2i * b2i; + dest->h[1].c[1].imag = a0r * b0i + a0i * b0r + a1r * b1i + a1i * b1r + a2r * b2i + a2i * b2r; + + a0r = mat->ROWCOL( 2, 0 ).real; + a0i = mat->ROWCOL( 2, 0 ).imag; + b0r = src->h[1].c[0].real; + b0i = src->h[1].c[0].imag; + a1r = mat->ROWCOL( 2, 1 ).real; + a1i = mat->ROWCOL( 2, 1 ).imag; + b1r = src->h[1].c[1].real; + b1i = src->h[1].c[1].imag; + a2r = mat->ROWCOL( 2, 2 ).real; + a2i = mat->ROWCOL( 2, 2 ).imag; + b2r = src->h[1].c[2].real; + b2i = src->h[1].c[2].imag; + + dest->h[1].c[2].real = a0r * b0r - a0i * b0i + a1r * b1r - a1i * b1i + a2r * b2r - a2i * b2i; + dest->h[1].c[2].imag = a0r * b0i + a0i * b0r + a1r * b1i + a1i * b1r + a2r * b2i + a2i * b2r; + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_mat_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_mat_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..5cb35a22fe357d8b1bd86daf794f97247cd39115 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_mat_wvec.c @@ -0,0 +1,61 @@ +/****************** m_mat_wvec.c (in su3.a) ******************** + * * + *void mult_mat_wilson_vec(su3_matrix *mat, wilson_vector *src,*dest) * + * multiply a Wilson vector by a matrix * + * dest <- mat*src * + */ +#include "../include/config.h" +#include "../include/complex.h" +#include 
"../include/su3.h" + +void mult_mat_wilson_vec( su3_matrix * mat, wilson_vector * src, wilson_vector * dest ) +{ + register int i, k; + register double t, ar, ai, br, bi, cr, ci; + for ( k = 0; k < 4; k++ ) + { + for ( i = 0; i < 3; i++ ) + { + + ar = mat->ROWCOL( i, 0 ).real; + ai = mat->ROWCOL( i, 0 ).imag; + br = src->COLORSPINOR( 0, k ).real; + bi = src->COLORSPINOR( 0, k ).imag; + cr = ar * br; + t = ai * bi; + cr -= t; + ci = ar * bi; + t = ai * br; + ci += t; + + ar = mat->ROWCOL( i, 1 ).real; + ai = mat->ROWCOL( i, 1 ).imag; + br = src->COLORSPINOR( 1, k ).real; + bi = src->COLORSPINOR( 1, k ).imag; + t = ar * br; + cr += t; + t = ai * bi; + cr -= t; + t = ar * bi; + ci += t; + t = ai * br; + ci += t; + + ar = mat->ROWCOL( i, 2 ).real; + ai = mat->ROWCOL( i, 2 ).imag; + br = src->COLORSPINOR( 2, k ).real; + bi = src->COLORSPINOR( 2, k ).imag; + t = ar * br; + cr += t; + t = ai * bi; + cr -= t; + t = ar * bi; + ci += t; + t = ai * br; + ci += t; + + dest->COLORSPINOR( i, k ).real = cr; + dest->COLORSPINOR( i, k ).imag = ci; + } + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_matvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_matvec.c new file mode 100644 index 0000000000000000000000000000000000000000..d7e55ac930201500777ef5797fa07a310a2728b7 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_matvec.c @@ -0,0 +1,134 @@ +/**************** m_matvec.c (in su3.a) ******************************* +* * +* void mult_su3_mat_vec_KE( su3_matrix *a, su3_vector *b,*c ) * +* matrix times vector multiply, no adjoints * +* C <- A*B * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +#ifndef FAST +void mult_su3_mat_vec_KE( su3_matrix * a, su3_vector * b, su3_vector * c ) +{ + register int i, j; + register complex x, y; + for ( i = 0; i < 3; i++ ) + { + x.real = x.imag = 0.0; + for ( j = 0; j < 3; j++ ) + { + CMUL( a->ROWCOL( i, j ), b->c[j], y ) CSUM( x, y ); 
+ } + c->c[i] = x; + } +} +#else +#ifdef NATIVEDOUBLE /* RS6000 version */ +void mult_su3_mat_vec_KE( su3_matrix * a, su3_vector * b, su3_vector * c ) +{ + + register double a0r, a0i, a1r, a1i, a2r, a2i; + register double b0r, b0i, b1r, b1i, b2r, b2i; + + a0r = a->ROWCOL( 0, 0 ).real; + a0i = a->ROWCOL( 0, 0 ).imag; + b0r = b->c[0].real; + b0i = b->c[0].imag; + a1r = a->ROWCOL( 0, 1 ).real; + a1i = a->ROWCOL( 0, 1 ).imag; + b1r = b->c[1].real; + b1i = b->c[1].imag; + a2r = a->ROWCOL( 0, 2 ).real; + a2i = a->ROWCOL( 0, 2 ).imag; + b2r = b->c[2].real; + b2i = b->c[2].imag; + + c->c[0].real = a0r * b0r - a0i * b0i + a1r * b1r - a1i * b1i + a2r * b2r - a2i * b2i; + c->c[0].imag = a0r * b0i + a0i * b0r + a1r * b1i + a1i * b1r + a2r * b2i + a2i * b2r; + + a0r = a->ROWCOL( 1, 0 ).real; + a0i = a->ROWCOL( 1, 0 ).imag; + b0r = b->c[0].real; + b0i = b->c[0].imag; + a1r = a->ROWCOL( 1, 1 ).real; + a1i = a->ROWCOL( 1, 1 ).imag; + b1r = b->c[1].real; + b1i = b->c[1].imag; + a2r = a->ROWCOL( 1, 2 ).real; + a2i = a->ROWCOL( 1, 2 ).imag; + b2r = b->c[2].real; + b2i = b->c[2].imag; + + c->c[1].real = a0r * b0r - a0i * b0i + a1r * b1r - a1i * b1i + a2r * b2r - a2i * b2i; + c->c[1].imag = a0r * b0i + a0i * b0r + a1r * b1i + a1i * b1r + a2r * b2i + a2i * b2r; + + a0r = a->ROWCOL( 2, 0 ).real; + a0i = a->ROWCOL( 2, 0 ).imag; + b0r = b->c[0].real; + b0i = b->c[0].imag; + a1r = a->ROWCOL( 2, 1 ).real; + a1i = a->ROWCOL( 2, 1 ).imag; + b1r = b->c[1].real; + b1i = b->c[1].imag; + a2r = a->ROWCOL( 2, 2 ).real; + a2i = a->ROWCOL( 2, 2 ).imag; + b2r = b->c[2].real; + b2i = b->c[2].imag; + + c->c[2].real = a0r * b0r - a0i * b0i + a1r * b1r - a1i * b1i + a2r * b2r - a2i * b2i; + c->c[2].imag = a0r * b0i + a0i * b0r + a1r * b1i + a1i * b1r + a2r * b2i + a2i * b2r; + +} + +#else +void mult_su3_mat_vec_KE( su3_matrix * a, su3_vector * b, su3_vector * c ) +{ + int i; + register double t, ar, ai, br, bi, cr, ci; + for ( i = 0; i < 3; i++ ) + { + + ar = a->ROWCOL( i, 0 ).real; + ai = a->ROWCOL( i, 0 
).imag; + br = b->c[0].real; + bi = b->c[0].imag; + cr = ar * br; + t = ai * bi; + cr -= t; + ci = ar * bi; + t = ai * br; + ci += t; + + ar = a->ROWCOL( i, 1 ).real; + ai = a->ROWCOL( i, 1 ).imag; + br = b->c[1].real; + bi = b->c[1].imag; + t = ar * br; + cr += t; + t = ai * bi; + cr -= t; + t = ar * bi; + ci += t; + t = ai * br; + ci += t; + + ar = a->ROWCOL( i, 2 ).real; + ai = a->ROWCOL( i, 2 ).imag; + br = b->c[2].real; + bi = b->c[2].imag; + t = ar * br; + cr += t; + t = ai * bi; + cr -= t; + t = ar * bi; + ci += t; + t = ai * br; + ci += t; + + c->c[i].real = cr; + c->c[i].imag = ci; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#ifndef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_matvec_ns.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_matvec_ns.c new file mode 100644 index 0000000000000000000000000000000000000000..3fa606ebe587ab51fd59aea34b422c170ff255b8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_matvec_ns.c @@ -0,0 +1,177 @@ +/***************** m_matvec_ns.c (in su3.a) *************************** +* * +* void mult_su3_mat_vec_nsum_KE( su3_matrix *a, su3_vector *b,*c ) * +* su3_matrix times su3_vector multiply and subtract from another * +* su3_vector * +* C <- C - A*B * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +#ifndef FAST +/* su3_matrix times su3_vector multiply and subtract from another su3_vector */ +/* c <- c - A*b (the product is accumulated in x, then subtracted from c) */ +void mult_su3_mat_vec_nsum_KE( su3_matrix * a, su3_vector * b, su3_vector * c ) +{ + register int i, j; + register complex x, y; + for ( i = 0; i < 3; i++ ) + { + x.real = x.imag = 0.0; + for ( j = 0; j < 3; j++ ) + { + CMUL( a->ROWCOL( i, j ), b->c[j], y ) CSUM( x, y ); + } + c->c[i].real -= x.real; + c->c[i].imag -= x.imag; + } +} + +#else +#ifdef NATIVEDOUBLE +void mult_su3_mat_vec_nsum_KE( su3_matrix * a, su3_vector * b, su3_vector * c ) +{ + + register double c0r, c0i, c1r, 
c1i, c2r, c2i; + register double br, bi, a0, a1, a2; + + c0r = c->c[0].real; + c0i = c->c[0].imag; + c1r = c->c[1].real; + c1i = c->c[1].imag; + c2r = c->c[2].real; + c2i = c->c[2].imag; + + br = b->c[0].real; + bi = b->c[0].imag; + a0 = a->ROWCOL( 0, 0 ).real; + a1 = a->ROWCOL( 1, 0 ).real; + a2 = a->ROWCOL( 2, 0 ).real; + + c0r -= a0 * br; + c1r -= a1 * br; + c2r -= a2 * br; + c0i -= a0 * bi; + c1i -= a1 * bi; + c2i -= a2 * bi; + + a0 = a->ROWCOL( 0, 0 ).imag; + a1 = a->ROWCOL( 1, 0 ).imag; + a2 = a->ROWCOL( 2, 0 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + br = b->c[1].real; + bi = b->c[1].imag; + a0 = a->ROWCOL( 0, 1 ).real; + a1 = a->ROWCOL( 1, 1 ).real; + a2 = a->ROWCOL( 2, 1 ).real; + + c0r -= a0 * br; + c1r -= a1 * br; + c2r -= a2 * br; + c0i -= a0 * bi; + c1i -= a1 * bi; + c2i -= a2 * bi; + + a0 = a->ROWCOL( 0, 1 ).imag; + a1 = a->ROWCOL( 1, 1 ).imag; + a2 = a->ROWCOL( 2, 1 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + br = b->c[2].real; + bi = b->c[2].imag; + a0 = a->ROWCOL( 0, 2 ).real; + a1 = a->ROWCOL( 1, 2 ).real; + a2 = a->ROWCOL( 2, 2 ).real; + + c0r -= a0 * br; + c1r -= a1 * br; + c2r -= a2 * br; + c0i -= a0 * bi; + c1i -= a1 * bi; + c2i -= a2 * bi; + + a0 = a->ROWCOL( 0, 2 ).imag; + a1 = a->ROWCOL( 1, 2 ).imag; + a2 = a->ROWCOL( 2, 2 ).imag; + + c0r += a0 * bi; + c1r += a1 * bi; + c2r += a2 * bi; + c0i -= a0 * br; + c1i -= a1 * br; + c2i -= a2 * br; + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + +} + +#else +void mult_su3_mat_vec_nsum_KE( su3_matrix * a, su3_vector * b, su3_vector * c ) +{ + int i; + register double t, ar, ai, br, bi, cr, ci; + for ( i = 0; i < 3; i++ ) + { + + ar = a->ROWCOL( i, 0 ).real; + ai = a->ROWCOL( i, 0 ).imag; + br = b->c[0].real; + bi = b->c[0].imag; + cr = ar * br; + t = ai * bi; + 
cr -= t; + ci = ar * bi; + t = ai * br; + ci += t; + + ar = a->ROWCOL( i, 1 ).real; + ai = a->ROWCOL( i, 1 ).imag; + br = b->c[1].real; + bi = b->c[1].imag; + t = ar * br; + cr += t; + t = ai * bi; + cr -= t; + t = ar * bi; + ci += t; + t = ai * br; + ci += t; + + ar = a->ROWCOL( i, 2 ).real; + ai = a->ROWCOL( i, 2 ).imag; + br = b->c[2].real; + bi = b->c[2].imag; + t = ar * br; + cr += t; + t = ai * bi; + cr -= t; + t = ar * bi; + ci += t; + t = ai * br; + ci += t; + + c->c[i].real -= cr; + c->c[i].imag -= ci; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#ifndef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_matvec_s.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_matvec_s.c new file mode 100644 index 0000000000000000000000000000000000000000..99c33d660b084eebe4a45db468db0ed61a7e7f99 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_matvec_s.c @@ -0,0 +1,177 @@ +/**************** m_matvec_s.c (in su3.a) ***************************** +* * +* void mult_su3_mat_vec_sum_KE( su3_matrix *a, su3_vector *b,*c ) * +* su3_matrix times su3_vector multiply and add to another su3_vector * +* C <- C + A*B * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +#ifndef FAST +/* su3_matrix times su3_vector multiply and add to another su3_vector */ +/* c <- A*b+c */ +void mult_su3_mat_vec_sum_KE( su3_matrix * a, su3_vector * b, su3_vector * c ) +{ + register int i, j; + register complex x, y; + for ( i = 0; i < 3; i++ ) + { + x.real = x.imag = 0.0; + for ( j = 0; j < 3; j++ ) + { + CMUL( a->ROWCOL( i, j ), b->c[j], y ) CSUM( x, y ); + } + c->c[i].real += x.real; + c->c[i].imag += x.imag; + } +} + +#else +#ifdef NATIVEDOUBLE /* RS6000 version: c <- c + A*b, prototype-style definition like the sibling variants */ +void mult_su3_mat_vec_sum_KE( su3_matrix * a, + su3_vector * b, + su3_vector * c ) +{ + + register double c0r, c0i, c1r, c1i, c2r, c2i; + register double br, bi, a0, a1, a2; + + c0r = c->c[0].real; + c0i = 
c->c[0].imag; + c1r = c->c[1].real; + c1i = c->c[1].imag; + c2r = c->c[2].real; + c2i = c->c[2].imag; + + br = b->c[0].real; + bi = b->c[0].imag; + a0 = a->ROWCOL( 0, 0 ).real; + a1 = a->ROWCOL( 1, 0 ).real; + a2 = a->ROWCOL( 2, 0 ).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = a->ROWCOL( 0, 0 ).imag; + a1 = a->ROWCOL( 1, 0 ).imag; + a2 = a->ROWCOL( 2, 0 ).imag; + + c0r -= a0 * bi; + c1r -= a1 * bi; + c2r -= a2 * bi; + c0i += a0 * br; + c1i += a1 * br; + c2i += a2 * br; + + br = b->c[1].real; + bi = b->c[1].imag; + a0 = a->ROWCOL( 0, 1 ).real; + a1 = a->ROWCOL( 1, 1 ).real; + a2 = a->ROWCOL( 2, 1 ).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = a->ROWCOL( 0, 1 ).imag; + a1 = a->ROWCOL( 1, 1 ).imag; + a2 = a->ROWCOL( 2, 1 ).imag; + + c0r -= a0 * bi; + c1r -= a1 * bi; + c2r -= a2 * bi; + c0i += a0 * br; + c1i += a1 * br; + c2i += a2 * br; + + br = b->c[2].real; + bi = b->c[2].imag; + a0 = a->ROWCOL( 0, 2 ).real; + a1 = a->ROWCOL( 1, 2 ).real; + a2 = a->ROWCOL( 2, 2 ).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = a->ROWCOL( 0, 2 ).imag; + a1 = a->ROWCOL( 1, 2 ).imag; + a2 = a->ROWCOL( 2, 2 ).imag; + + c0r -= a0 * bi; + c1r -= a1 * bi; + c2r -= a2 * bi; + c0i += a0 * br; + c1i += a1 * br; + c2i += a2 * br; + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + +} +#else +void mult_su3_mat_vec_sum_KE( su3_matrix * a, su3_vector * b, su3_vector * c ) +{ + int i; + register double t, ar, ai, br, bi, cr, ci; + for ( i = 0; i < 3; i++ ) + { + + ar = a->ROWCOL( i, 0 ).real; + ai = a->ROWCOL( i, 0 ).imag; + br = b->c[0].real; + bi = b->c[0].imag; + cr = ar * br; + t = ai * bi; + cr -= t; + ci = ar * bi; + t = ai * br; + ci += t; + + ar = a->ROWCOL( i, 1 ).real; + 
ai = a->ROWCOL( i, 1 ).imag; + br = b->c[1].real; + bi = b->c[1].imag; + t = ar * br; + cr += t; + t = ai * bi; + cr -= t; + t = ar * bi; + ci += t; + t = ai * br; + ci += t; + + ar = a->ROWCOL( i, 2 ).real; + ai = a->ROWCOL( i, 2 ).imag; + br = b->c[2].real; + bi = b->c[2].imag; + t = ar * br; + cr += t; + t = ai * bi; + cr -= t; + t = ar * bi; + ci += t; + t = ai * br; + ci += t; + + c->c[i].real += cr; + c->c[i].imag += ci; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#ifdef FAST" */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_mv_s_4dir.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_mv_s_4dir.c new file mode 100644 index 0000000000000000000000000000000000000000..742a585a2c85656c12fe8edf1a4544e7f2232609 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_mv_s_4dir.c @@ -0,0 +1,216 @@ +/**************** m_mv_s_4dir.c (in su3.a) ***************************** +* * +* void mult_su3_mat_vec_sum_4dir_KE( su3_matrix *a, su3_vector *b[0123],*c )* +* Multiply the elements of an array of four su3_matrices by the * +* four su3_vectors, and add the results to * +* produce a single su3_vector. 
* +* C <- A[0]*B[0]+A[1]*B[1]+A[2]*B[2]+A[3]*B[3] * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +#ifndef FAST +void mult_su3_mat_vec_sum_4dir_KE( su3_matrix * a, su3_vector * b0, su3_vector * b1, su3_vector * b2, su3_vector * b3, su3_vector * c ) +{ + mult_su3_mat_vec_KE( a + 0, b0, c ); + mult_su3_mat_vec_sum_KE( a + 1, b1, c ); + mult_su3_mat_vec_sum_KE( a + 2, b2, c ); + mult_su3_mat_vec_sum_KE( a + 3, b3, c ); +} + +#else +/* Fast code, with subroutines inlined */ +#ifdef NATIVEDOUBLE /* IBM RS6000 version */ +void mult_su3_mat_vec_sum_4dir_KE( su3_matrix * a, su3_vector * b0, su3_vector * b1, su3_vector * b2, su3_vector * b3, su3_vector * c ) +{ + + register int n; + register double c0r, c0i, c1r, c1i, c2r, c2i; + register double br, bi, a0, a1, a2; + register su3_matrix *mat; + register su3_vector *b; + + c0r = c0i = c1r = c1i = c2r = c2i = 0.0; + mat = a; + + for ( n = 0; n < 4; n++, mat++ ) + { + + switch ( n ) + { + case ( 0 ): + b = b0; + break; + case ( 1 ): + b = b1; + break; + case ( 2 ): + b = b2; + break; + case ( 3 ): + b = b3; + break; + } + + br = b->c[0].real; + bi = b->c[0].imag; + a0 = mat->ROWCOL( 0, 0 ).real; + a1 = mat->ROWCOL( 1, 0 ).real; + a2 = mat->ROWCOL( 2, 0 ).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = mat->ROWCOL( 0, 0 ).imag; + a1 = mat->ROWCOL( 1, 0 ).imag; + a2 = mat->ROWCOL( 2, 0 ).imag; + + c0r -= a0 * bi; + c1r -= a1 * bi; + c2r -= a2 * bi; + c0i += a0 * br; + c1i += a1 * br; + c2i += a2 * br; + + br = b->c[1].real; + bi = b->c[1].imag; + a0 = mat->ROWCOL( 0, 1 ).real; + a1 = mat->ROWCOL( 1, 1 ).real; + a2 = mat->ROWCOL( 2, 1 ).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = mat->ROWCOL( 0, 1 ).imag; + a1 = mat->ROWCOL( 1, 1 ).imag; + a2 = mat->ROWCOL( 2, 1 ).imag; + + c0r -= a0 * bi; + c1r -= a1 * bi; + c2r -= a2 * 
bi; + c0i += a0 * br; + c1i += a1 * br; + c2i += a2 * br; + + br = b->c[2].real; + bi = b->c[2].imag; + a0 = mat->ROWCOL( 0, 2 ).real; + a1 = mat->ROWCOL( 1, 2 ).real; + a2 = mat->ROWCOL( 2, 2 ).real; + + c0r += a0 * br; + c1r += a1 * br; + c2r += a2 * br; + c0i += a0 * bi; + c1i += a1 * bi; + c2i += a2 * bi; + + a0 = mat->ROWCOL( 0, 2 ).imag; + a1 = mat->ROWCOL( 1, 2 ).imag; + a2 = mat->ROWCOL( 2, 2 ).imag; + + c0r -= a0 * bi; + c1r -= a1 * bi; + c2r -= a2 * bi; + c0i += a0 * br; + c1i += a1 * br; + c2i += a2 * br; + + } + + c->c[0].real = c0r; + c->c[0].imag = c0i; + c->c[1].real = c1r; + c->c[1].imag = c1i; + c->c[2].real = c2r; + c->c[2].imag = c2i; + +} + +#else +void mult_su3_mat_vec_sum_4dir_KE( su3_matrix * a, su3_vector * b0, su3_vector * b1, su3_vector * b2, su3_vector * b3, su3_vector * c ) +{ + int i, n; + register su3_matrix *at; + register su3_vector *b; + register double t, ar, ai, br, bi, cr, ci; + + for ( i = 0; i < 3; i++ ) + { + c->c[i].real = 0.0; + c->c[i].imag = 0.0; + } + for ( n = 0; n < 4; n++ ) + { + at = a + n; + switch ( n ) + { + case ( 0 ): + b = b0; + break; + case ( 1 ): + b = b1; + break; + case ( 2 ): + b = b2; + break; + case ( 3 ): + b = b3; + break; + } + for ( i = 0; i < 3; i++ ) + { + + ar = at->ROWCOL( i, 0 ).real; + ai = at->ROWCOL( i, 0 ).imag; + br = b->c[0].real; + bi = b->c[0].imag; + cr = ar * br; + t = ai * bi; + cr -= t; + ci = ar * bi; + t = ai * br; + ci += t; + + ar = at->ROWCOL( i, 1 ).real; + ai = at->ROWCOL( i, 1 ).imag; + br = b->c[1].real; + bi = b->c[1].imag; + t = ar * br; + cr += t; + t = ai * bi; + cr -= t; + t = ar * bi; + ci += t; + t = ai * br; + ci += t; + + ar = at->ROWCOL( i, 2 ).real; + ai = at->ROWCOL( i, 2 ).imag; + br = b->c[2].real; + bi = b->c[2].imag; + t = ar * br; + cr += t; + t = ai * bi; + cr -= t; + t = ar * bi; + ci += t; + t = ai * br; + ci += t; + + c->c[i].real += cr; + c->c[i].imag += ci; + } + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* End of "#ifdef FAST" */ diff 
--git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_su2_mat_vec_a.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_su2_mat_vec_a.c new file mode 100644 index 0000000000000000000000000000000000000000..89a1fa35c18bf79b043014a0eafd6a00e947269e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_su2_mat_vec_a.c @@ -0,0 +1,29 @@ +/************** m_su2_mat_vec_a.c (in su3.a) ********************** + * * + * adjoint su2 matrix times vector * + */ + +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void mult_su2_mat_vec_elem_a( su2_matrix * u, complex * x0, complex * x1 ) +{ + /* Multiplies the complex row spinor (x0, x1) by the adjoint of the */ + /* SU(2) matrix u and puts the result in (x0,x1). */ + /* Thus x <- x * u-adj */ + /* C. DeTar 3 Oct 1990 */ + + complex z0, z1, t0, t1; + + t0 = *x0; + t1 = *x1; + + CMUL_J( t0, u->esu2[0][0], z0 ); + CMUL_J( t1, u->esu2[0][1], z1 ); + CADD( z0, z1, *x0 ); + CMUL_J( t0, u->esu2[1][0], z0 ); + CMUL_J( t1, u->esu2[1][1], z1 ); + CADD( z0, z1, *x1 ); + +} /* m_su2_mat_vec_a.c */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_su2_mat_vec_n.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_su2_mat_vec_n.c new file mode 100644 index 0000000000000000000000000000000000000000..d238f0b872d8316acb9ab9faba33d3c6b54d6231 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/m_su2_mat_vec_n.c @@ -0,0 +1,29 @@ +/************** m_su2_mat_vec_n.c (in su3.a) ********************** + * * + * su2 matrix times vector * + */ + +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void mult_su2_mat_vec_elem_n_KE( su2_matrix * u, complex * x0, complex * x1 ) +{ + /* Multiplies the complex column spinor (x0, x1) by the SU(2) matrix u */ + /* and puts the result in (x0,x1). */ + /* Thus x <- u * x */ + /* C. 
DeTar 3 Oct 1990 */ + + complex z0, z1, t0, t1; + + t0 = *x0; + t1 = *x1; + + CMUL( u->esu2[0][0], t0, z0 ); + CMUL( u->esu2[0][1], t1, z1 ); + CADD( z0, z1, *x0 ); + CMUL( u->esu2[1][0], t0, z0 ); + CMUL( u->esu2[1][1], t1, z1 ); + CADD( z0, z1, *x1 ); + +} /* m_su2_mat_vec_elem_n.c */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/make_ahmat.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/make_ahmat.c new file mode 100644 index 0000000000000000000000000000000000000000..991a30eb5d9dd46d627f0eeea47c76aec4b29b22 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/make_ahmat.c @@ -0,0 +1,102 @@ +/***************** make_ahmat.c (in su3.a) **************************** +* * +* void make_anti_hermitian_KE( su3_matrix *m3, anti_hermitmat *ah3) * +* take the traceless and anti_hermitian part of an su3 matrix * +* and compress it * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +#ifndef FAST +void make_anti_hermitian_KE( su3_matrix * m3, anti_hermitmat * ah3 ) +{ + double temp; + + temp = ( m3->ROWCOL( 0, 0 ).imag + m3->ROWCOL( 1, 1 ).imag + m3->ROWCOL( 2, 2 ).imag ) / 3.; + ah3->m00im = m3->ROWCOL( 0, 0 ).imag - temp; + ah3->m11im = m3->ROWCOL( 1, 1 ).imag - temp; + ah3->m22im = m3->ROWCOL( 2, 2 ).imag - temp; + ah3->m01.real = ( m3->ROWCOL( 0, 1 ).real - m3->ROWCOL( 1, 0 ).real ) * 0.5; + ah3->m02.real = ( m3->ROWCOL( 0, 2 ).real - m3->ROWCOL( 2, 0 ).real ) * 0.5; + ah3->m12.real = ( m3->ROWCOL( 1, 2 ).real - m3->ROWCOL( 2, 1 ).real ) * 0.5; + ah3->m01.imag = ( m3->ROWCOL( 0, 1 ).imag + m3->ROWCOL( 1, 0 ).imag ) * 0.5; + ah3->m02.imag = ( m3->ROWCOL( 0, 2 ).imag + m3->ROWCOL( 2, 0 ).imag ) * 0.5; + ah3->m12.imag = ( m3->ROWCOL( 1, 2 ).imag + m3->ROWCOL( 2, 1 ).imag ) * 0.5; + +} /* make_anti_hermitian */ + +#else +void make_anti_hermitian_KE( su3_matrix * m3, anti_hermitmat * ah3 ) +{ + double temp, temp2; + + temp = ( m3->ROWCOL( 0, 0 ).imag + m3->ROWCOL( 1, 1 ).imag ); 
+ temp2 = temp + m3->ROWCOL( 2, 2 ).imag; + temp = temp2 / 3.; + ah3->m00im = m3->ROWCOL( 0, 0 ).imag - temp; + ah3->m11im = m3->ROWCOL( 1, 1 ).imag - temp; + ah3->m22im = m3->ROWCOL( 2, 2 ).imag - temp; + temp = m3->ROWCOL( 0, 1 ).real - m3->ROWCOL( 1, 0 ).real; + ah3->m01.real = temp * 0.5; + temp = m3->ROWCOL( 0, 2 ).real - m3->ROWCOL( 2, 0 ).real; + ah3->m02.real = temp * 0.5; + temp = m3->ROWCOL( 1, 2 ).real - m3->ROWCOL( 2, 1 ).real; + ah3->m12.real = temp * 0.5; + temp = m3->ROWCOL( 0, 1 ).imag + m3->ROWCOL( 1, 0 ).imag; + ah3->m01.imag = temp * 0.5; + temp = m3->ROWCOL( 0, 2 ).imag + m3->ROWCOL( 2, 0 ).imag; + ah3->m02.imag = temp * 0.5; + temp = m3->ROWCOL( 1, 2 ).imag + m3->ROWCOL( 2, 1 ).imag; + ah3->m12.imag = temp * 0.5; + +} /* make_anti_hermitian */ +#endif /*end ifdef FAST */ + +void make_traceless( su3_matrix * m3, su3_matrix * m4 ) +{ + double retr, imtr; + + retr = m3->ROWCOL( 0, 0 ).real + m3->ROWCOL( 1, 1 ).real + m3->ROWCOL( 2, 2 ).real; + imtr = m3->ROWCOL( 0, 0 ).imag + m3->ROWCOL( 1, 1 ).imag + m3->ROWCOL( 2, 2 ).imag; + + m4->ROWCOL( 0, 1 ) = m3->ROWCOL( 0, 1 ); + m4->ROWCOL( 0, 2 ) = m3->ROWCOL( 0, 2 ); + m4->ROWCOL( 1, 0 ) = m3->ROWCOL( 1, 0 ); + m4->ROWCOL( 1, 2 ) = m3->ROWCOL( 1, 2 ); + m4->ROWCOL( 2, 0 ) = m3->ROWCOL( 2, 0 ); + m4->ROWCOL( 2, 1 ) = m3->ROWCOL( 2, 1 ); + + m4->ROWCOL( 0, 0 ).real = m3->ROWCOL( 0, 0 ).real - retr / 3.; + m4->ROWCOL( 0, 0 ).imag = m3->ROWCOL( 0, 0 ).imag - imtr / 3.; + m4->ROWCOL( 1, 1 ).real = m3->ROWCOL( 1, 1 ).real - retr / 3.; + m4->ROWCOL( 1, 1 ).imag = m3->ROWCOL( 1, 1 ).imag - imtr / 3.; + m4->ROWCOL( 2, 2 ).real = m3->ROWCOL( 2, 2 ).real - retr / 3.; + m4->ROWCOL( 2, 2 ).imag = m3->ROWCOL( 2, 2 ).imag - imtr / 3.; +} + +void scalar_mult_ahm( anti_hermitmat * a, double s, anti_hermitmat * b ) +{ + b->m00im = s * ( a->m00im ); + b->m11im = s * ( a->m11im ); + b->m22im = s * ( a->m22im ); + b->m01.real = s * ( a->m01.real ); + b->m02.real = s * ( a->m02.real ); + b->m12.real = s * ( a->m12.real ); + 
b->m01.imag = s * ( a->m01.imag ); + b->m02.imag = s * ( a->m02.imag ); + b->m12.imag = s * ( a->m12.imag ); +} + +void clear_ahm( anti_hermitmat * b ) +{ + b->m00im = 0.0; + b->m11im = 0.0; + b->m22im = 0.0; + b->m01.real = 0.0; + b->m02.real = 0.0; + b->m12.real = 0.0; + b->m01.imag = 0.0; + b->m02.imag = 0.0; + b->m12.imag = 0.0; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/mb_gamma_l.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/mb_gamma_l.c new file mode 100644 index 0000000000000000000000000000000000000000..d3f59180e829992fdf5611e30ad6df101e89f9b1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/mb_gamma_l.c @@ -0,0 +1,180 @@ +/************* mb_gamma_l.c (in su3.a) **************************/ +/* + Multiply a Wilson matrix by a gamma matrix acting on the row index + (This is the first index, or equivalently, multiplication on the left) + usage: mult_by_gamma_left( wilson_matrix *src, wilson_matrix *dest, int dir ) + dir = XUP, YUP, ZUP, TUP or GAMMAFIVE + + gamma(XUP) + 0 0 0 i + 0 0 i 0 + 0 -i 0 0 + -i 0 0 0 + + gamma(YUP) + 0 0 0 -1 + 0 0 1 0 + 0 1 0 0 + -1 0 0 0 + + gamma(ZUP) + 0 0 i 0 + 0 0 0 -i + -i 0 0 0 + 0 i 0 0 + + gamma(TUP) + 0 0 1 0 + 0 0 0 1 + 1 0 0 0 + 0 1 0 0 + + gamma(FIVE) + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 + */ +#include +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" +#include "../include/dirs.h" + +void mult_by_gamma_left( wilson_matrix * src, wilson_matrix * dest, int dir ) +{ + register int i; /*color */ + register int c2, s2; /* column indices, color and spin */ + + switch ( dir ) + { + case XUP: + for ( i = 0; i < 3; i++ ) + for ( s2 = 0; s2 < 4; s2++ ) + for ( c2 = 0; c2 < 3; c2++ ) + { + TIMESPLUSI( src->d[3].c[i].COLORSPINOR( c2, s2 ), dest->d[0].c[i].COLORSPINOR( c2, s2 ) ); + TIMESPLUSI( src->d[2].c[i].COLORSPINOR( c2, s2 ), dest->d[1].c[i].COLORSPINOR( c2, s2 ) ); + TIMESMINUSI( src->d[1].c[i].COLORSPINOR( c2, s2 ), 
dest->d[2].c[i].COLORSPINOR( c2, s2 ) ); + TIMESMINUSI( src->d[0].c[i].COLORSPINOR( c2, s2 ), dest->d[3].c[i].COLORSPINOR( c2, s2 ) ); + } + break; + case YUP: + for ( i = 0; i < 3; i++ ) + for ( s2 = 0; s2 < 4; s2++ ) + for ( c2 = 0; c2 < 3; c2++ ) + { + TIMESMINUSONE( src->d[3].c[i].COLORSPINOR( c2, s2 ), dest->d[0].c[i].COLORSPINOR( c2, s2 ) ); + TIMESPLUSONE( src->d[2].c[i].COLORSPINOR( c2, s2 ), dest->d[1].c[i].COLORSPINOR( c2, s2 ) ); + TIMESPLUSONE( src->d[1].c[i].COLORSPINOR( c2, s2 ), dest->d[2].c[i].COLORSPINOR( c2, s2 ) ); + TIMESMINUSONE( src->d[0].c[i].COLORSPINOR( c2, s2 ), dest->d[3].c[i].COLORSPINOR( c2, s2 ) ); + } + break; + case ZUP: + for ( i = 0; i < 3; i++ ) + for ( s2 = 0; s2 < 4; s2++ ) + for ( c2 = 0; c2 < 3; c2++ ) + { + TIMESPLUSI( src->d[2].c[i].COLORSPINOR( c2, s2 ), dest->d[0].c[i].COLORSPINOR( c2, s2 ) ); + TIMESMINUSI( src->d[3].c[i].COLORSPINOR( c2, s2 ), dest->d[1].c[i].COLORSPINOR( c2, s2 ) ); + TIMESMINUSI( src->d[0].c[i].COLORSPINOR( c2, s2 ), dest->d[2].c[i].COLORSPINOR( c2, s2 ) ); + TIMESPLUSI( src->d[1].c[i].COLORSPINOR( c2, s2 ), dest->d[3].c[i].COLORSPINOR( c2, s2 ) ); + } + break; + case TUP: + for ( i = 0; i < 3; i++ ) + for ( s2 = 0; s2 < 4; s2++ ) + for ( c2 = 0; c2 < 3; c2++ ) + { + TIMESPLUSONE( src->d[2].c[i].COLORSPINOR( c2, s2 ), dest->d[0].c[i].COLORSPINOR( c2, s2 ) ); + TIMESPLUSONE( src->d[3].c[i].COLORSPINOR( c2, s2 ), dest->d[1].c[i].COLORSPINOR( c2, s2 ) ); + TIMESPLUSONE( src->d[0].c[i].COLORSPINOR( c2, s2 ), dest->d[2].c[i].COLORSPINOR( c2, s2 ) ); + TIMESPLUSONE( src->d[1].c[i].COLORSPINOR( c2, s2 ), dest->d[3].c[i].COLORSPINOR( c2, s2 ) ); + } + break; + case GAMMAFIVE: + for ( i = 0; i < 3; i++ ) + for ( s2 = 0; s2 < 4; s2++ ) + for ( c2 = 0; c2 < 3; c2++ ) + { + TIMESPLUSONE( src->d[0].c[i].COLORSPINOR( c2, s2 ), dest->d[0].c[i].COLORSPINOR( c2, s2 ) ); + TIMESPLUSONE( src->d[1].c[i].COLORSPINOR( c2, s2 ), dest->d[1].c[i].COLORSPINOR( c2, s2 ) ); + TIMESMINUSONE( src->d[2].c[i].COLORSPINOR( c2, s2 ), 
dest->d[2].c[i].COLORSPINOR( c2, s2 ) ); + TIMESMINUSONE( src->d[3].c[i].COLORSPINOR( c2, s2 ), dest->d[3].c[i].COLORSPINOR( c2, s2 ) ); + } + break; + default: + printf( "BAD CALL TO MULT_BY_GAMMA_LEFT()\n" ); + } +} + +void mult_by_gamma_l( spin_wilson_vector *src, spin_wilson_vector *dest, + int dir) +{ +register int c2,s2; /* column indices, color and spin */ + + switch(dir){ + case XUP: + for(s2=0;s2<4;s2++)for(c2=0;c2<3;c2++){ + TIMESPLUSI( src->d[3].COLORSPINOR(c2,s2), + dest->d[0].COLORSPINOR(c2,s2) ); + TIMESPLUSI( src->d[2].COLORSPINOR(c2,s2), + dest->d[1].COLORSPINOR(c2,s2) ); + TIMESMINUSI( src->d[1].COLORSPINOR(c2,s2), + dest->d[2].COLORSPINOR(c2,s2) ); + TIMESMINUSI( src->d[0].COLORSPINOR(c2,s2), + dest->d[3].COLORSPINOR(c2,s2) ); + } + break; + case YUP: + for(s2=0;s2<4;s2++)for(c2=0;c2<3;c2++){ + TIMESMINUSONE( src->d[3].COLORSPINOR(c2,s2), + dest->d[0].COLORSPINOR(c2,s2) ); + TIMESPLUSONE( src->d[2].COLORSPINOR(c2,s2), + dest->d[1].COLORSPINOR(c2,s2) ); + TIMESPLUSONE( src->d[1].COLORSPINOR(c2,s2), + dest->d[2].COLORSPINOR(c2,s2) ); + TIMESMINUSONE( src->d[0].COLORSPINOR(c2,s2), + dest->d[3].COLORSPINOR(c2,s2) ); + } + break; + case ZUP: + for(s2=0;s2<4;s2++)for(c2=0;c2<3;c2++){ + TIMESPLUSI( src->d[2].COLORSPINOR(c2,s2), + dest->d[0].COLORSPINOR(c2,s2) ); + TIMESMINUSI( src->d[3].COLORSPINOR(c2,s2), + dest->d[1].COLORSPINOR(c2,s2) ); + TIMESMINUSI( src->d[0].COLORSPINOR(c2,s2), + dest->d[2].COLORSPINOR(c2,s2) ); + TIMESPLUSI( src->d[1].COLORSPINOR(c2,s2), + dest->d[3].COLORSPINOR(c2,s2) ); + } + break; + case TUP: + for(s2=0;s2<4;s2++)for(c2=0;c2<3;c2++){ + TIMESPLUSONE( src->d[2].COLORSPINOR(c2,s2), + dest->d[0].COLORSPINOR(c2,s2) ); + TIMESPLUSONE( src->d[3].COLORSPINOR(c2,s2), + dest->d[1].COLORSPINOR(c2,s2) ); + TIMESPLUSONE( src->d[0].COLORSPINOR(c2,s2), + dest->d[2].COLORSPINOR(c2,s2) ); + TIMESPLUSONE( src->d[1].COLORSPINOR(c2,s2), + dest->d[3].COLORSPINOR(c2,s2) ); + } + break; + case GAMMAFIVE: + for(s2=0;s2<4;s2++)for(c2=0;c2<3;c2++){ + 
TIMESPLUSONE( src->d[0].COLORSPINOR(c2,s2), + dest->d[0].COLORSPINOR(c2,s2) ); + TIMESPLUSONE( src->d[1].COLORSPINOR(c2,s2), + dest->d[1].COLORSPINOR(c2,s2) ); + TIMESMINUSONE( src->d[2].COLORSPINOR(c2,s2), + dest->d[2].COLORSPINOR(c2,s2) ); + TIMESMINUSONE( src->d[3].COLORSPINOR(c2,s2), + dest->d[3].COLORSPINOR(c2,s2) ); + } + break; + default: + printf("BAD CALL TO MULT_BY_GAMMA_LEFT()\n"); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/mb_gamma_r.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/mb_gamma_r.c new file mode 100644 index 0000000000000000000000000000000000000000..b5d158988d8542078d9c14a07184608e896c5e13 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/mb_gamma_r.c @@ -0,0 +1,182 @@ +/************* mb_gamma_r.c (in su3.a) **************************/ +/* + Multiply a Wilson matrix by a gamma matrix acting on the column index + (This is the second index, or equivalently, multiplication on the right) + usage: mult_by_gamma_right wilson_matrix *src, wilson_matrix *dest, + int dir ) + dir = XUP, YUP, ZUP, TUP or GAMMAFIVE + + gamma(XUP) + 0 0 0 i + 0 0 i 0 + 0 -i 0 0 + -i 0 0 0 + + gamma(YUP) + 0 0 0 -1 + 0 0 1 0 + 0 1 0 0 + -1 0 0 0 + + gamma(ZUP) + 0 0 i 0 + 0 0 0 -i + -i 0 0 0 + 0 i 0 0 + + gamma(TUP) + 0 0 1 0 + 0 0 0 1 + 1 0 0 0 + 0 1 0 0 + + gamma(FIVE) + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 + */ +#include +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" +#include "../include/dirs.h" + +void mult_by_gamma_right( wilson_matrix * src, wilson_matrix * dest, int dir ) +{ + register int i; /*color */ + register int c1, s1; /* row indices, color and spin */ + + switch ( dir ) + { + case XUP: + for ( i = 0; i < 3; i++ ) + for ( s1 = 0; s1 < 4; s1++ ) + for ( c1 = 0; c1 < 3; c1++ ) + { + TIMESMINUSI( src->d[s1].c[c1].COLORSPINOR( i, 3 ), dest->d[s1].c[c1].COLORSPINOR( i, 0 ) ); + TIMESMINUSI( src->d[s1].c[c1].COLORSPINOR( i, 2 ), 
dest->d[s1].c[c1].COLORSPINOR( i, 1 ) ); + TIMESPLUSI( src->d[s1].c[c1].COLORSPINOR( i, 1 ), dest->d[s1].c[c1].COLORSPINOR( i, 2 ) ); + TIMESPLUSI( src->d[s1].c[c1].COLORSPINOR( i, 0 ), dest->d[s1].c[c1].COLORSPINOR( i, 3 ) ); + } + break; + case YUP: + for ( i = 0; i < 3; i++ ) + for ( s1 = 0; s1 < 4; s1++ ) + for ( c1 = 0; c1 < 3; c1++ ) + { + TIMESMINUSONE( src->d[s1].c[c1].COLORSPINOR( i, 3 ), dest->d[s1].c[c1].COLORSPINOR( i, 0 ) ); + TIMESPLUSONE( src->d[s1].c[c1].COLORSPINOR( i, 2 ), dest->d[s1].c[c1].COLORSPINOR( i, 1 ) ); + TIMESPLUSONE( src->d[s1].c[c1].COLORSPINOR( i, 1 ), dest->d[s1].c[c1].COLORSPINOR( i, 2 ) ); + TIMESMINUSONE( src->d[s1].c[c1].COLORSPINOR( i, 0 ), dest->d[s1].c[c1].COLORSPINOR( i, 3 ) ); + } + break; + case ZUP: + for ( i = 0; i < 3; i++ ) + for ( s1 = 0; s1 < 4; s1++ ) + for ( c1 = 0; c1 < 3; c1++ ) + { + TIMESMINUSI( src->d[s1].c[c1].COLORSPINOR( i, 2 ), dest->d[s1].c[c1].COLORSPINOR( i, 0 ) ); + TIMESPLUSI( src->d[s1].c[c1].COLORSPINOR( i, 3 ), dest->d[s1].c[c1].COLORSPINOR( i, 1 ) ); + TIMESPLUSI( src->d[s1].c[c1].COLORSPINOR( i, 0 ), dest->d[s1].c[c1].COLORSPINOR( i, 2 ) ); + TIMESMINUSI( src->d[s1].c[c1].COLORSPINOR( i, 1 ), dest->d[s1].c[c1].COLORSPINOR( i, 3 ) ); + } + break; + case TUP: + for ( i = 0; i < 3; i++ ) + for ( s1 = 0; s1 < 4; s1++ ) + for ( c1 = 0; c1 < 3; c1++ ) + { + TIMESPLUSONE( src->d[s1].c[c1].COLORSPINOR( i, 2 ), dest->d[s1].c[c1].COLORSPINOR( i, 0 ) ); + TIMESPLUSONE( src->d[s1].c[c1].COLORSPINOR( i, 3 ), dest->d[s1].c[c1].COLORSPINOR( i, 1 ) ); + TIMESPLUSONE( src->d[s1].c[c1].COLORSPINOR( i, 0 ), dest->d[s1].c[c1].COLORSPINOR( i, 2 ) ); + TIMESPLUSONE( src->d[s1].c[c1].COLORSPINOR( i, 1 ), dest->d[s1].c[c1].COLORSPINOR( i, 3 ) ); + } + break; + case GAMMAFIVE: + for ( i = 0; i < 3; i++ ) + for ( s1 = 0; s1 < 4; s1++ ) + for ( c1 = 0; c1 < 3; c1++ ) + { + TIMESPLUSONE( src->d[s1].c[c1].COLORSPINOR( i, 0 ), dest->d[s1].c[c1].COLORSPINOR( i, 0 ) ); + TIMESPLUSONE( src->d[s1].c[c1].COLORSPINOR( i, 1 ), 
dest->d[s1].c[c1].COLORSPINOR( i, 1 ) ); + TIMESMINUSONE( src->d[s1].c[c1].COLORSPINOR( i, 2 ), dest->d[s1].c[c1].COLORSPINOR( i, 2 ) ); + TIMESMINUSONE( src->d[s1].c[c1].COLORSPINOR( i, 3 ), dest->d[s1].c[c1].COLORSPINOR( i, 3 ) ); + } + break; + default: + printf( "BAD CALL TO MULT_BY_GAMMA_RIGHT()\n" ); + } +} + +void mult_by_gamma_r( spin_wilson_vector *src, spin_wilson_vector *dest, + int dir) +{ +register int i; /*color*/ +register int s1; /* row spin indices*/ + + switch(dir){ + case XUP: + for(i=0;i<3;i++)for(s1=0;s1<4;s1++){ + TIMESMINUSI( src->d[s1].COLORSPINOR(i,3), + dest->d[s1].COLORSPINOR(i,0) ); + TIMESMINUSI( src->d[s1].COLORSPINOR(i,2), + dest->d[s1].COLORSPINOR(i,1) ); + TIMESPLUSI( src->d[s1].COLORSPINOR(i,1), + dest->d[s1].COLORSPINOR(i,2) ); + TIMESPLUSI( src->d[s1].COLORSPINOR(i,0), + dest->d[s1].COLORSPINOR(i,3) ); + } + break; + case YUP: + for(i=0;i<3;i++)for(s1=0;s1<4;s1++){ + TIMESMINUSONE( src->d[s1].COLORSPINOR(i,3), + dest->d[s1].COLORSPINOR(i,0) ); + TIMESPLUSONE( src->d[s1].COLORSPINOR(i,2), + dest->d[s1].COLORSPINOR(i,1) ); + TIMESPLUSONE( src->d[s1].COLORSPINOR(i,1), + dest->d[s1].COLORSPINOR(i,2) ); + TIMESMINUSONE( src->d[s1].COLORSPINOR(i,0), + dest->d[s1].COLORSPINOR(i,3) ); + } + break; + case ZUP: + for(i=0;i<3;i++)for(s1=0;s1<4;s1++){ + TIMESMINUSI( src->d[s1].COLORSPINOR(i,2), + dest->d[s1].COLORSPINOR(i,0) ); + TIMESPLUSI( src->d[s1].COLORSPINOR(i,3), + dest->d[s1].COLORSPINOR(i,1) ); + TIMESPLUSI( src->d[s1].COLORSPINOR(i,0), + dest->d[s1].COLORSPINOR(i,2) ); + TIMESMINUSI( src->d[s1].COLORSPINOR(i,1), + dest->d[s1].COLORSPINOR(i,3) ); + } + break; + case TUP: + for(i=0;i<3;i++)for(s1=0;s1<4;s1++){ + TIMESPLUSONE( src->d[s1].COLORSPINOR(i,2), + dest->d[s1].COLORSPINOR(i,0) ); + TIMESPLUSONE( src->d[s1].COLORSPINOR(i,3), + dest->d[s1].COLORSPINOR(i,1) ); + TIMESPLUSONE( src->d[s1].COLORSPINOR(i,0), + dest->d[s1].COLORSPINOR(i,2) ); + TIMESPLUSONE( src->d[s1].COLORSPINOR(i,1), + dest->d[s1].COLORSPINOR(i,3) ); + } + break; 
+ case GAMMAFIVE: + for(i=0;i<3;i++)for(s1=0;s1<4;s1++){ + TIMESPLUSONE( src->d[s1].COLORSPINOR(i,0), + dest->d[s1].COLORSPINOR(i,0) ); + TIMESPLUSONE( src->d[s1].COLORSPINOR(i,1), + dest->d[s1].COLORSPINOR(i,1) ); + TIMESMINUSONE( src->d[s1].COLORSPINOR(i,2), + dest->d[s1].COLORSPINOR(i,2) ); + TIMESMINUSONE( src->d[s1].COLORSPINOR(i,3), + dest->d[s1].COLORSPINOR(i,3) ); + } + break; + default: + printf("BAD CALL TO MULT_BY_GAMMA_RIGHT()\n"); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/module.mk b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/module.mk new file mode 100644 index 0000000000000000000000000000000000000000..d20aca86e473fda0c513399eca7e1a2de1f9e265 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/module.mk @@ -0,0 +1,29 @@ +MYFILES:= \ + cadd.c cdiv.c ce_itheta.c cexp.c clog.c cmplx.c cmul.c \ + conjg.c csqrt.c csub.c \ + addmat.c addvec.c cmp_ahmat.c cs_m_a_vec.c cs_m_a_mat.c cs_m_s_vec.c \ + cs_m_vec.c det_su3.c clear_mat.c dumpmat.c dumpvec.c clearvec.c \ + m_amatvec_s.c m_amatvec.c m_amatvec_ns.c \ + m_matvec.c m_matvec_ns.c m_matvec_s.c \ + make_ahmat.c rand_ahmat.c realtr.c complextr.c \ + s_m_a_mat.c s_m_a_vec.c s_m_s_mat.c s_m_s_vec.c s_m_sum_vec.c \ + s_m_vec.c s_m_mat.c cs_m_mat.c cs_m_s_mat.c \ + su3_adjoint.c su3_dot.c su3_rdot.c su3_proj.c su3mat_copy.c \ + su3vec_copy.c \ + submat.c subvec.c trace_su3.c uncmp_ahmat.c \ + msq_su3vec.c sub4vecs.c m_amv_4dir.c m_amv_4vec.c \ + m_mv_s_4dir.c flush_to_zero.c \ + mb_gamma_l.c mb_gamma_r.c \ + gaussrand.c byterevn.c \ + m_su2_mat_vec_a.c m_su2_mat_vec_n.c r_su2_hit_a.c l_su2_hit_n.c \ + wp_shrink.c wp_grow.c wp_grow_a.c dump_wvec.c clear_wvec.c \ + su3_proj_w.c copy_wvec.c add_wvec.c sub_wvec.c s_m_wvec.c \ + s_m_hwvec.c msq_wvec.c wvec_dot.c wvec2_dot.c wvec_rdot.c \ + s_m_a_wvec.c s_m_a_g5_wvec.c \ + cs_m_wvec.c cs_m_a_wvec.c \ + m_mat_wvec.c m_amat_wvec.c \ + m_mat_hwvec.c m_amat_hwvec.c \ + grow4wvecs.c wp_shrink4.c s_g5_m_a_wvec.c 
g5_m_wvec.c + +SOURCE+= $(patsubst %,libraries/%,$(MYFILES)) + diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/msq_su3vec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/msq_su3vec.c new file mode 100644 index 0000000000000000000000000000000000000000..44eb27e542f7e2043a8e9c105dafdb5e14e27674 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/msq_su3vec.c @@ -0,0 +1,62 @@ +/****************** msq_su3vec.c (in su3.a) ****************************/ +/* MIMD version 6 */ +/* * + * double magsq_su3vec_KE( su3_vector *a ) * + * return squared magnitude of an SU3 vector + */ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +#ifndef FAST +double magsq_su3vec_KE( su3_vector * a ) +{ + register double sum; + register int i; + for ( i = 0, sum = 0.0; i < 3; i++ ) + sum += a->c[i].real * a->c[i].real + a->c[i].imag * a->c[i].imag; + return ( sum ); +} + +#else +#ifdef NATIVEDOUBLE /* IBM RS6000 version */ +double magsq_su3vec_KE( su3_vector * a ) +{ + + register double ar, ai, sum; + + ar = a->c[0].real; + ai = a->c[0].imag; + sum = ar * ar + ai * ai; + + ar = a->c[1].real; + ai = a->c[1].imag; + sum += ar * ar + ai * ai; + + ar = a->c[2].real; + ai = a->c[2].imag; + sum += ar * ar + ai * ai; + + return ( ( double ) sum ); +} +#else +double magsq_su3vec_KE( su3_vector * a ) +{ + register double temp, sum; + sum = 0.0; + temp = a->c[0].real * a->c[0].real; + sum += temp; + temp = a->c[0].imag * a->c[0].imag; + sum += temp; + temp = a->c[1].real * a->c[1].real; + sum += temp; + temp = a->c[1].imag * a->c[1].imag; + sum += temp; + temp = a->c[2].real * a->c[2].real; + sum += temp; + temp = a->c[2].imag * a->c[2].imag; + sum += temp; + return ( sum ); +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* end ifdef FAST */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/msq_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/msq_wvec.c new file mode 100644 
index 0000000000000000000000000000000000000000..2217cc77e8a8440c78a79a28bc1fea2f1c5b63cb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/msq_wvec.c @@ -0,0 +1,88 @@ + /******************** msq_wvec.c (in su3.a) ******************** +* +*double msq_wvec(wilson_vector *vec) +* squared magnitude of a Wilson vector +* +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +double magsq_wvec( wilson_vector * vec ) +{ + register double ar, ai, sum; + + ar = vec->COLORSPINOR( 0, 0 ).real; + ai = vec->COLORSPINOR( 0, 0 ).imag; + sum = ar * ar + ai * ai; + ar = vec->COLORSPINOR( 1, 0 ).real; + ai = vec->COLORSPINOR( 1, 0 ).imag; + sum += ar * ar + ai * ai; + ar = vec->COLORSPINOR( 2, 0 ).real; + ai = vec->COLORSPINOR( 2, 0 ).imag; + sum += ar * ar + ai * ai; + + ar = vec->COLORSPINOR( 0, 1 ).real; + ai = vec->COLORSPINOR( 0, 1 ).imag; + sum += ar * ar + ai * ai; + ar = vec->COLORSPINOR( 1, 1 ).real; + ai = vec->COLORSPINOR( 1, 1 ).imag; + sum += ar * ar + ai * ai; + ar = vec->COLORSPINOR( 2, 1 ).real; + ai = vec->COLORSPINOR( 2, 1 ).imag; + sum += ar * ar + ai * ai; + + ar = vec->COLORSPINOR( 0, 2 ).real; + ai = vec->COLORSPINOR( 0, 2 ).imag; + sum += ar * ar + ai * ai; + ar = vec->COLORSPINOR( 1, 2 ).real; + ai = vec->COLORSPINOR( 1, 2 ).imag; + sum += ar * ar + ai * ai; + ar = vec->COLORSPINOR( 2, 2 ).real; + ai = vec->COLORSPINOR( 2, 2 ).imag; + sum += ar * ar + ai * ai; + + ar = vec->COLORSPINOR( 0, 3 ).real; + ai = vec->COLORSPINOR( 0, 3 ).imag; + sum += ar * ar + ai * ai; + ar = vec->COLORSPINOR( 1, 3 ).real; + ai = vec->COLORSPINOR( 1, 3 ).imag; + sum += ar * ar + ai * ai; + ar = vec->COLORSPINOR( 2, 3 ).real; + ai = vec->COLORSPINOR( 2, 3 ).imag; + sum += ar * ar + ai * ai; + + return ( ( double ) sum ); +} + +double magsq_hwvec( half_wilson_vector * vec ) +{ + +#ifdef NATIVEDOUBLE + register double ar, ai, sum; +#else + register double ar, ai, sum; +#endif + + ar = vec->h[0].c[0].real; + ai = 
vec->h[0].c[0].imag; + sum = ar * ar + ai * ai; + ar = vec->h[0].c[1].real; + ai = vec->h[0].c[1].imag; + sum += ar * ar + ai * ai; + ar = vec->h[0].c[2].real; + ai = vec->h[0].c[2].imag; + sum += ar * ar + ai * ai; + + ar = vec->h[1].c[0].real; + ai = vec->h[1].c[0].imag; + sum += ar * ar + ai * ai; + ar = vec->h[1].c[1].real; + ai = vec->h[1].c[1].imag; + sum += ar * ar + ai * ai; + ar = vec->h[1].c[2].real; + ai = vec->h[1].c[2].imag; + sum += ar * ar + ai * ai; + + return ( ( double ) sum ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/r_su2_hit_a.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/r_su2_hit_a.c new file mode 100644 index 0000000000000000000000000000000000000000..56d754e38e904991e7b135bab774255983004902 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/r_su2_hit_a.c @@ -0,0 +1,22 @@ +/************** r_su2_hit_a.c (in su3.a) ********************** + * * + * right multiply an su3_matrix by the adjoint of an su2 matrix * + */ + +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void right_su2_hit_a( su2_matrix * u, int p, int q, su3_matrix * link ) +{ + /* link <- link * u adj */ + /* The 0 column of u-adjoint matches column p of the SU(3) matrix */ + /* The 1 column of u-adjoint matches column q of the SU(3) matrix */ + /* C. 
DeTar 18 Oct 1990 */ + + register int m; + + for ( m = 0; m < 3; m++ ) + mult_su2_mat_vec_elem_a( u, &( link->ROWCOL( m, p ) ), &( link->ROWCOL( m, q ) ) ); + +} /* r_su2_hit_a.c */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/rand_ahmat.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/rand_ahmat.c new file mode 100644 index 0000000000000000000000000000000000000000..87c996666031e617118de84c6ae94aa09c328e45 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/rand_ahmat.c @@ -0,0 +1,51 @@ +/****************** rand_ahmat.c (in su3.a) *************************** +* * +* void random_anti_hermitian_KE( anti_hermitmat *mat_antihermit, passthru *prn_pt)* +* Creates gaussian random anti-hermitian matrices * +* Normalization is < |m01|^2 > = 1, or < m01.real*m01.real > = 1/2 * +* The argument "prn_pt" is a pointer to be passed to gaussian_rand_no_KE() * +* RS6000 may choke on void * * +*/ +#include "../include/config.h" +#include +#include "../include/complex.h" +#include "../include/su3.h" + +void random_anti_hermitian_KE( anti_hermitmat * mat_antihermit ) +{ + double r3, r8; + double sqrt_third; + + sqrt_third = sqrt( 1.0 / 3.0 ); + r3 = gaussian_rand_no_KE( ); + r8 = gaussian_rand_no_KE( ); + mat_antihermit->m00im = r3 + sqrt_third * r8; + mat_antihermit->m11im = -r3 + sqrt_third * r8; + mat_antihermit->m22im = -2.0 * sqrt_third * r8; + mat_antihermit->m01.real = gaussian_rand_no_KE( ); + mat_antihermit->m02.real = gaussian_rand_no_KE( ); + mat_antihermit->m12.real = gaussian_rand_no_KE( ); + mat_antihermit->m01.imag = gaussian_rand_no_KE( ); + mat_antihermit->m02.imag = gaussian_rand_no_KE( ); + mat_antihermit->m12.imag = gaussian_rand_no_KE( ); + +} /*random_anti_hermitian_ */ + +void funny_anti_hermitian( int ix, int iy, int iz, int it, int idir, anti_hermitmat * mat_antihermit ) +{ + double r3, r8; + double sqrt_third; + + sqrt_third = sqrt( ( double ) ( 1.0 / 3.0 ) ); + r3 = ( double ) ix / 137.0 * ( double ) idir; + 
r8 = ( double ) ( iy + iz ) / 42.0; + mat_antihermit->m00im = r3 + sqrt_third * r8; + mat_antihermit->m11im = -r3 + sqrt_third * r8; + mat_antihermit->m22im = -2.0 * sqrt_third * r8; + mat_antihermit->m01.real = -( double ) ix / 25. * 10; + mat_antihermit->m02.real = -( double ) iy / 20. * 10; + mat_antihermit->m12.real = ( double ) iz / 19. * ( double ) idir *10; + mat_antihermit->m01.imag = -( double ) it / 80.; + mat_antihermit->m02.imag = ( double ) it / 5.; + mat_antihermit->m12.imag = ( double ) ix / 19.; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/realtr.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/realtr.c new file mode 100644 index 0000000000000000000000000000000000000000..d1f8ba8575f1618e90c4e5a68a236ef025f08e51 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/realtr.c @@ -0,0 +1,18 @@ +/****************** realtr.c (in su3.a) ******************************* +* * +* double realtrace_su3_KE( su3_matrix *a,*b) * +* return Re( Tr( A_adjoint*B ) * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +double realtrace_su3_KE( su3_matrix * a, su3_matrix * b ) +{ + register int i, j; + register double sum; + for ( sum = 0.0, i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + sum += a->ROWCOL( i, j ).real * b->ROWCOL( i, j ).real + a->ROWCOL( i, j ).imag * b->ROWCOL( i, j ).imag; + return ( sum ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_g5_m_a_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_g5_m_a_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..f1c75cbd8d26ae0822eae56e83fcb4791970fca3 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_g5_m_a_wvec.c @@ -0,0 +1,28 @@ +/******************** s_g5_m_a_wvec.c (in su3.a) ******************** +* +*void scalar_g5_mult_add_wvec(wilson_vector *src1, wilson_vector *src2, + double s, wilson_vector *dest) +* Multiply a Wilson 
vector by a scalar and gamma5 and add to another vector +* dest <- src1 + gamma5*s*src2 +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void scalar_g5_mult_add_wvec( wilson_vector * src1, wilson_vector * src2, double s, wilson_vector * dest ) +{ + register int i; + for ( i = 0; i < 3; i++ ) + { + + dest->COLORSPINOR( i, 0 ).real = src1->COLORSPINOR( i, 0 ).real + s * ( src2->COLORSPINOR( i, 0 ).real ); + dest->COLORSPINOR( i, 1 ).real = src1->COLORSPINOR( i, 1 ).real + s * ( src2->COLORSPINOR( i, 1 ).real ); + dest->COLORSPINOR( i, 2 ).real = src1->COLORSPINOR( i, 2 ).real - s * ( src2->COLORSPINOR( i, 2 ).real ); + dest->COLORSPINOR( i, 3 ).real = src1->COLORSPINOR( i, 3 ).real - s * ( src2->COLORSPINOR( i, 3 ).real ); + + dest->COLORSPINOR( i, 0 ).imag = src1->COLORSPINOR( i, 0 ).imag + s * ( src2->COLORSPINOR( i, 0 ).imag ); + dest->COLORSPINOR( i, 1 ).imag = src1->COLORSPINOR( i, 1 ).imag + s * ( src2->COLORSPINOR( i, 1 ).imag ); + dest->COLORSPINOR( i, 2 ).imag = src1->COLORSPINOR( i, 2 ).imag - s * ( src2->COLORSPINOR( i, 2 ).imag ); + dest->COLORSPINOR( i, 3 ).imag = src1->COLORSPINOR( i, 3 ).imag - s * ( src2->COLORSPINOR( i, 3 ).imag ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_a_g5_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_a_g5_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..0ec42d7d8013aea201b9692649c7e13a1a568c85 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_a_g5_wvec.c @@ -0,0 +1,29 @@ +/******************** s_m_a_g5_wvec.c (in su3.a) ******************** +* +*void scalar_mult_add_g5_wvec(wilson_vector *src1, wilson_vector *src2, + double s, wilson_vector *dest) +* Multiply a Wilson vector by a scalar and add to another vector then multiply by gamma5 +* dest <- gamma5*(src1 + s*src2) +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + 
+void scalar_mult_add_g5_wvec( wilson_vector * src1, wilson_vector * src2, double s, wilson_vector * dest ) +{ + register int i; + for ( i = 0; i < 3; i++ ) + { + + dest->COLORSPINOR( i, 0 ).real = src1->COLORSPINOR( i, 0 ).real + s * ( src2->COLORSPINOR( i, 0 ).real ); + dest->COLORSPINOR( i, 1 ).real = src1->COLORSPINOR( i, 1 ).real + s * ( src2->COLORSPINOR( i, 1 ).real ); + dest->COLORSPINOR( i, 2 ).real = -( src1->COLORSPINOR( i, 2 ).real ) - s * ( src2->COLORSPINOR( i, 2 ).real ); + dest->COLORSPINOR( i, 3 ).real = -( src1->COLORSPINOR( i, 3 ).real ) - s * ( src2->COLORSPINOR( i, 3 ).real ); + + dest->COLORSPINOR( i, 0 ).imag = src1->COLORSPINOR( i, 0 ).imag + s * ( src2->COLORSPINOR( i, 0 ).imag ); + dest->COLORSPINOR( i, 1 ).imag = src1->COLORSPINOR( i, 1 ).imag + s * ( src2->COLORSPINOR( i, 1 ).imag ); + dest->COLORSPINOR( i, 2 ).imag = -( src1->COLORSPINOR( i, 2 ).imag ) - s * ( src2->COLORSPINOR( i, 2 ).imag ); + dest->COLORSPINOR( i, 3 ).imag = -( src1->COLORSPINOR( i, 3 ).imag ) - s * ( src2->COLORSPINOR( i, 3 ).imag ); + } + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_a_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_a_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..8918230439e6cc191d861269f79f77e954e995ef --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_a_mat.c @@ -0,0 +1,21 @@ +/**************** s_m_a_mat.c (in su3.a) ****************************** +* * +* void scalar_mult_add_su3_matrix_KE( su3_matrix *a, su3_matrix *b, * +* double s, su3_matrix *c) * +* C <- A + s*B * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* c <- a + s*b, matrices */ +void scalar_mult_add_su3_matrix_KE( su3_matrix * a, su3_matrix * b, double s, su3_matrix * c ) +{ + register int i, j; + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + c->ROWCOL( i, j ).real = a->ROWCOL( i, j ).real + s * b->ROWCOL( i, j 
).real; + c->ROWCOL( i, j ).imag = a->ROWCOL( i, j ).imag + s * b->ROWCOL( i, j ).imag; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_a_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_a_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..1b1d2a6fb044c247e1ff30b85624bfa9331c5b89 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_a_vec.c @@ -0,0 +1,38 @@ +/**************** s_m_a_vec.c (in su3.a) ****************************** +* * +* void scalar_mult_add_su3_vector_KE( su3_vector *a, su3_vector *b, * +* double s, su3_vector *c) * +* C <- A + s*B, A,B and C vectors * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* c <- a + s*b, vectors */ + +void scalar_mult_add_su3_vector_KE( su3_vector * a, su3_vector * b, double s, su3_vector * c ) +{ + +#ifndef NATIVEDOUBLE + register int i; + for ( i = 0; i < 3; i++ ) + { + c->c[i].real = a->c[i].real + s * b->c[i].real; + c->c[i].imag = a->c[i].imag + s * b->c[i].imag; + } + +#else /* RS6000 version */ + + register double ss; + + ss = s; + + c->c[0].real = a->c[0].real + ss * b->c[0].real; + c->c[0].imag = a->c[0].imag + ss * b->c[0].imag; + c->c[1].real = a->c[1].real + ss * b->c[1].real; + c->c[1].imag = a->c[1].imag + ss * b->c[1].imag; + c->c[2].real = a->c[2].real + ss * b->c[2].real; + c->c[2].imag = a->c[2].imag + ss * b->c[2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_a_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_a_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..56edcbb994e676183cd89ba23cce0fe14087a353 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_a_wvec.c @@ -0,0 +1,128 @@ +/******************** s_m_a_wvec.c (in su3.a) ******************** +* +*void scalar_mult_add_wvec(wilson_vector *src1, wilson_vector *src2, + double s, wilson_vector *dest) 
+* Multiply a Wilson vector by a scalar and add to another vector +* dest <- src1 + s*src2 +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void scalar_mult_add_wvec( wilson_vector * src1, wilson_vector * src2, double s, wilson_vector * dest ) +{ + + register double ss; + ss = s; + + dest->COLORSPINOR( 0, 0 ).real = src1->COLORSPINOR( 0, 0 ).real + ss * src2->COLORSPINOR( 0, 0 ).real; + dest->COLORSPINOR( 0, 0 ).imag = src1->COLORSPINOR( 0, 0 ).imag + ss * src2->COLORSPINOR( 0, 0 ).imag; + dest->COLORSPINOR( 1, 0 ).real = src1->COLORSPINOR( 1, 0 ).real + ss * src2->COLORSPINOR( 1, 0 ).real; + dest->COLORSPINOR( 1, 0 ).imag = src1->COLORSPINOR( 1, 0 ).imag + ss * src2->COLORSPINOR( 1, 0 ).imag; + dest->COLORSPINOR( 2, 0 ).real = src1->COLORSPINOR( 2, 0 ).real + ss * src2->COLORSPINOR( 2, 0 ).real; + dest->COLORSPINOR( 2, 0 ).imag = src1->COLORSPINOR( 2, 0 ).imag + ss * src2->COLORSPINOR( 2, 0 ).imag; + + dest->COLORSPINOR( 0, 1 ).real = src1->COLORSPINOR( 0, 1 ).real + ss * src2->COLORSPINOR( 0, 1 ).real; + dest->COLORSPINOR( 0, 1 ).imag = src1->COLORSPINOR( 0, 1 ).imag + ss * src2->COLORSPINOR( 0, 1 ).imag; + dest->COLORSPINOR( 1, 1 ).real = src1->COLORSPINOR( 1, 1 ).real + ss * src2->COLORSPINOR( 1, 1 ).real; + dest->COLORSPINOR( 1, 1 ).imag = src1->COLORSPINOR( 1, 1 ).imag + ss * src2->COLORSPINOR( 1, 1 ).imag; + dest->COLORSPINOR( 2, 1 ).real = src1->COLORSPINOR( 2, 1 ).real + ss * src2->COLORSPINOR( 2, 1 ).real; + dest->COLORSPINOR( 2, 1 ).imag = src1->COLORSPINOR( 2, 1 ).imag + ss * src2->COLORSPINOR( 2, 1 ).imag; + + dest->COLORSPINOR( 0, 2 ).real = src1->COLORSPINOR( 0, 2 ).real + ss * src2->COLORSPINOR( 0, 2 ).real; + dest->COLORSPINOR( 0, 2 ).imag = src1->COLORSPINOR( 0, 2 ).imag + ss * src2->COLORSPINOR( 0, 2 ).imag; + dest->COLORSPINOR( 1, 2 ).real = src1->COLORSPINOR( 1, 2 ).real + ss * src2->COLORSPINOR( 1, 2 ).real; + dest->COLORSPINOR( 1, 2 ).imag = src1->COLORSPINOR( 1, 2 ).imag + ss * 
src2->COLORSPINOR( 1, 2 ).imag; + dest->COLORSPINOR( 2, 2 ).real = src1->COLORSPINOR( 2, 2 ).real + ss * src2->COLORSPINOR( 2, 2 ).real; + dest->COLORSPINOR( 2, 2 ).imag = src1->COLORSPINOR( 2, 2 ).imag + ss * src2->COLORSPINOR( 2, 2 ).imag; + + dest->COLORSPINOR( 0, 3 ).real = src1->COLORSPINOR( 0, 3 ).real + ss * src2->COLORSPINOR( 0, 3 ).real; + dest->COLORSPINOR( 0, 3 ).imag = src1->COLORSPINOR( 0, 3 ).imag + ss * src2->COLORSPINOR( 0, 3 ).imag; + dest->COLORSPINOR( 1, 3 ).real = src1->COLORSPINOR( 1, 3 ).real + ss * src2->COLORSPINOR( 1, 3 ).real; + dest->COLORSPINOR( 1, 3 ).imag = src1->COLORSPINOR( 1, 3 ).imag + ss * src2->COLORSPINOR( 1, 3 ).imag; + dest->COLORSPINOR( 2, 3 ).real = src1->COLORSPINOR( 2, 3 ).real + ss * src2->COLORSPINOR( 2, 3 ).real; + dest->COLORSPINOR( 2, 3 ).imag = src1->COLORSPINOR( 2, 3 ).imag + ss * src2->COLORSPINOR( 2, 3 ).imag; + + +} + +void scalar2_mult_add_wvec( wilson_vector * src1, double t, wilson_vector * src2, double s, wilson_vector * dest ) +{ + + register double ss; + ss = s; + register double tt; + tt = t; + + dest->COLORSPINOR( 0, 0 ).real = tt * src1->COLORSPINOR( 0, 0 ).real + ss * src2->COLORSPINOR( 0, 0 ).real; + dest->COLORSPINOR( 0, 0 ).imag = tt * src1->COLORSPINOR( 0, 0 ).imag + ss * src2->COLORSPINOR( 0, 0 ).imag; + dest->COLORSPINOR( 1, 0 ).real = tt * src1->COLORSPINOR( 1, 0 ).real + ss * src2->COLORSPINOR( 1, 0 ).real; + dest->COLORSPINOR( 1, 0 ).imag = tt * src1->COLORSPINOR( 1, 0 ).imag + ss * src2->COLORSPINOR( 1, 0 ).imag; + dest->COLORSPINOR( 2, 0 ).real = tt * src1->COLORSPINOR( 2, 0 ).real + ss * src2->COLORSPINOR( 2, 0 ).real; + dest->COLORSPINOR( 2, 0 ).imag = tt * src1->COLORSPINOR( 2, 0 ).imag + ss * src2->COLORSPINOR( 2, 0 ).imag; + + dest->COLORSPINOR( 0, 1 ).real = tt * src1->COLORSPINOR( 0, 1 ).real + ss * src2->COLORSPINOR( 0, 1 ).real; + dest->COLORSPINOR( 0, 1 ).imag = tt * src1->COLORSPINOR( 0, 1 ).imag + ss * src2->COLORSPINOR( 0, 1 ).imag; + dest->COLORSPINOR( 1, 1 ).real = tt * 
src1->COLORSPINOR( 1, 1 ).real + ss * src2->COLORSPINOR( 1, 1 ).real; + dest->COLORSPINOR( 1, 1 ).imag = tt * src1->COLORSPINOR( 1, 1 ).imag + ss * src2->COLORSPINOR( 1, 1 ).imag; + dest->COLORSPINOR( 2, 1 ).real = tt * src1->COLORSPINOR( 2, 1 ).real + ss * src2->COLORSPINOR( 2, 1 ).real; + dest->COLORSPINOR( 2, 1 ).imag = tt * src1->COLORSPINOR( 2, 1 ).imag + ss * src2->COLORSPINOR( 2, 1 ).imag; + + dest->COLORSPINOR( 0, 2 ).real = tt * src1->COLORSPINOR( 0, 2 ).real + ss * src2->COLORSPINOR( 0, 2 ).real; + dest->COLORSPINOR( 0, 2 ).imag = tt * src1->COLORSPINOR( 0, 2 ).imag + ss * src2->COLORSPINOR( 0, 2 ).imag; + dest->COLORSPINOR( 1, 2 ).real = tt * src1->COLORSPINOR( 1, 2 ).real + ss * src2->COLORSPINOR( 1, 2 ).real; + dest->COLORSPINOR( 1, 2 ).imag = tt * src1->COLORSPINOR( 1, 2 ).imag + ss * src2->COLORSPINOR( 1, 2 ).imag; + dest->COLORSPINOR( 2, 2 ).real = tt * src1->COLORSPINOR( 2, 2 ).real + ss * src2->COLORSPINOR( 2, 2 ).real; + dest->COLORSPINOR( 2, 2 ).imag = tt * src1->COLORSPINOR( 2, 2 ).imag + ss * src2->COLORSPINOR( 2, 2 ).imag; + + dest->COLORSPINOR( 0, 3 ).real = tt * src1->COLORSPINOR( 0, 3 ).real + ss * src2->COLORSPINOR( 0, 3 ).real; + dest->COLORSPINOR( 0, 3 ).imag = tt * src1->COLORSPINOR( 0, 3 ).imag + ss * src2->COLORSPINOR( 0, 3 ).imag; + dest->COLORSPINOR( 1, 3 ).real = tt * src1->COLORSPINOR( 1, 3 ).real + ss * src2->COLORSPINOR( 1, 3 ).real; + dest->COLORSPINOR( 1, 3 ).imag = tt * src1->COLORSPINOR( 1, 3 ).imag + ss * src2->COLORSPINOR( 1, 3 ).imag; + dest->COLORSPINOR( 2, 3 ).real = tt * src1->COLORSPINOR( 2, 3 ).real + ss * src2->COLORSPINOR( 2, 3 ).real; + dest->COLORSPINOR( 2, 3 ).imag = tt * src1->COLORSPINOR( 2, 3 ).imag + ss * src2->COLORSPINOR( 2, 3 ).imag; + +} + +void scalar3_mult_add_wvec( wilson_vector * src1, double t, + wilson_vector * src2, double s, + wilson_vector * src3, double u, + wilson_vector * dest ) +{ + + register double ss; + ss = s; + register double tt; + tt = t; + register double uu; + uu = u; + + 
dest->COLORSPINOR( 0, 0 ).real = tt * src1->COLORSPINOR( 0, 0 ).real + ss * src2->COLORSPINOR( 0, 0 ).real+ uu * src3->COLORSPINOR( 0, 0 ).real; + dest->COLORSPINOR( 0, 0 ).imag = tt * src1->COLORSPINOR( 0, 0 ).imag + ss * src2->COLORSPINOR( 0, 0 ).imag+ uu * src3->COLORSPINOR( 0, 0 ).imag; + dest->COLORSPINOR( 1, 0 ).real = tt * src1->COLORSPINOR( 1, 0 ).real + ss * src2->COLORSPINOR( 1, 0 ).real+ uu * src3->COLORSPINOR( 1, 0 ).real; + dest->COLORSPINOR( 1, 0 ).imag = tt * src1->COLORSPINOR( 1, 0 ).imag + ss * src2->COLORSPINOR( 1, 0 ).imag+ uu * src3->COLORSPINOR( 1, 0 ).imag; + dest->COLORSPINOR( 2, 0 ).real = tt * src1->COLORSPINOR( 2, 0 ).real + ss * src2->COLORSPINOR( 2, 0 ).real+ uu * src3->COLORSPINOR( 2, 0 ).real; + dest->COLORSPINOR( 2, 0 ).imag = tt * src1->COLORSPINOR( 2, 0 ).imag + ss * src2->COLORSPINOR( 2, 0 ).imag+ uu * src3->COLORSPINOR( 2, 0 ).imag; + + dest->COLORSPINOR( 0, 1 ).real = tt * src1->COLORSPINOR( 0, 1 ).real + ss * src2->COLORSPINOR( 0, 1 ).real+ uu * src3->COLORSPINOR( 0, 1 ).real; + dest->COLORSPINOR( 0, 1 ).imag = tt * src1->COLORSPINOR( 0, 1 ).imag + ss * src2->COLORSPINOR( 0, 1 ).imag+ uu * src3->COLORSPINOR( 0, 1 ).imag; + dest->COLORSPINOR( 1, 1 ).real = tt * src1->COLORSPINOR( 1, 1 ).real + ss * src2->COLORSPINOR( 1, 1 ).real+ uu * src3->COLORSPINOR( 1, 1 ).real; + dest->COLORSPINOR( 1, 1 ).imag = tt * src1->COLORSPINOR( 1, 1 ).imag + ss * src2->COLORSPINOR( 1, 1 ).imag+ uu * src3->COLORSPINOR( 1, 1 ).imag; + dest->COLORSPINOR( 2, 1 ).real = tt * src1->COLORSPINOR( 2, 1 ).real + ss * src2->COLORSPINOR( 2, 1 ).real+ uu * src3->COLORSPINOR( 2, 1 ).real; + dest->COLORSPINOR( 2, 1 ).imag = tt * src1->COLORSPINOR( 2, 1 ).imag + ss * src2->COLORSPINOR( 2, 1 ).imag+ uu * src3->COLORSPINOR( 2, 1 ).imag; + + dest->COLORSPINOR( 0, 2 ).real = tt * src1->COLORSPINOR( 0, 2 ).real + ss * src2->COLORSPINOR( 0, 2 ).real+ uu * src3->COLORSPINOR( 0, 2 ).real; + dest->COLORSPINOR( 0, 2 ).imag = tt * src1->COLORSPINOR( 0, 2 ).imag + ss * 
src2->COLORSPINOR( 0, 2 ).imag+ uu * src3->COLORSPINOR( 0, 2 ).imag; + dest->COLORSPINOR( 1, 2 ).real = tt * src1->COLORSPINOR( 1, 2 ).real + ss * src2->COLORSPINOR( 1, 2 ).real+ uu * src3->COLORSPINOR( 1, 2 ).real; + dest->COLORSPINOR( 1, 2 ).imag = tt * src1->COLORSPINOR( 1, 2 ).imag + ss * src2->COLORSPINOR( 1, 2 ).imag+ uu * src3->COLORSPINOR( 1, 2 ).imag; + dest->COLORSPINOR( 2, 2 ).real = tt * src1->COLORSPINOR( 2, 2 ).real + ss * src2->COLORSPINOR( 2, 2 ).real+ uu * src3->COLORSPINOR( 2, 2 ).real; + dest->COLORSPINOR( 2, 2 ).imag = tt * src1->COLORSPINOR( 2, 2 ).imag + ss * src2->COLORSPINOR( 2, 2 ).imag+ uu * src3->COLORSPINOR( 2, 2 ).imag; + /* spin 3: src3 is weighted by uu, exactly as for spins 0-2 (dest = tt*src1 + ss*src2 + uu*src3) */ + dest->COLORSPINOR( 0, 3 ).real = tt * src1->COLORSPINOR( 0, 3 ).real + ss * src2->COLORSPINOR( 0, 3 ).real+ uu * src3->COLORSPINOR( 0, 3 ).real; + dest->COLORSPINOR( 0, 3 ).imag = tt * src1->COLORSPINOR( 0, 3 ).imag + ss * src2->COLORSPINOR( 0, 3 ).imag+ uu * src3->COLORSPINOR( 0, 3 ).imag; + dest->COLORSPINOR( 1, 3 ).real = tt * src1->COLORSPINOR( 1, 3 ).real + ss * src2->COLORSPINOR( 1, 3 ).real+ uu * src3->COLORSPINOR( 1, 3 ).real; + dest->COLORSPINOR( 1, 3 ).imag = tt * src1->COLORSPINOR( 1, 3 ).imag + ss * src2->COLORSPINOR( 1, 3 ).imag+ uu * src3->COLORSPINOR( 1, 3 ).imag; + dest->COLORSPINOR( 2, 3 ).real = tt * src1->COLORSPINOR( 2, 3 ).real + ss * src2->COLORSPINOR( 2, 3 ).real+ uu * src3->COLORSPINOR( 2, 3 ).real; + dest->COLORSPINOR( 2, 3 ).imag = tt * src1->COLORSPINOR( 2, 3 ).imag + ss * src2->COLORSPINOR( 2, 3 ).imag+ uu * src3->COLORSPINOR( 2, 3 ).imag; + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_hwvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_hwvec.c new file mode 100644 index 0000000000000000000000000000000000000000..9b054f4cf3972a5464cdfe833004320d0758ba0d --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_hwvec.c @@ -0,0 +1,40 @@ +/******************** s_m_hwvec.c (in su3.a) ******************** +* +*void 
scalar_mult_hwvec(half_wilson_vector *src, double s, + half_wilson_vector *dest) +* Multiply a half Wilson vector by a scalar +* dest <- s*src +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void scalar_mult_hwvec( half_wilson_vector * src, double s, half_wilson_vector * dest ) +{ + +#ifndef NATIVEDOUBLE + register int i; + for ( i = 0; i < 2; i++ ) + scalar_mult_su3_vector_KE( &( src->h[i] ), s, &( dest->h[i] ) ); + +#else /* RS6000 version */ + + register double ss; + ss = s; + + dest->h[0].c[0].real = ss * src->h[0].c[0].real; + dest->h[0].c[0].imag = ss * src->h[0].c[0].imag; + dest->h[0].c[1].real = ss * src->h[0].c[1].real; + dest->h[0].c[1].imag = ss * src->h[0].c[1].imag; + dest->h[0].c[2].real = ss * src->h[0].c[2].real; + dest->h[0].c[2].imag = ss * src->h[0].c[2].imag; + + dest->h[1].c[0].real = ss * src->h[1].c[0].real; + dest->h[1].c[0].imag = ss * src->h[1].c[0].imag; + dest->h[1].c[1].real = ss * src->h[1].c[1].real; + dest->h[1].c[1].imag = ss * src->h[1].c[1].imag; + dest->h[1].c[2].real = ss * src->h[1].c[2].real; + dest->h[1].c[2].imag = ss * src->h[1].c[2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..22113daa8b14259d10539e94fb2debff2dfc8c99 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_mat.c @@ -0,0 +1,20 @@ +/****************** s_m_mat.c (in su3.a) ****************************** +* * +* void scalar_mult_su3_matrix_KE( su3_matrix *a, double s, su3_matrix *b) * +* B <- s*A * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* b <- s*a, matrices */ +void scalar_mult_su3_matrix_KE( su3_matrix * a, double s, su3_matrix * b ) +{ + register int i, j; + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + b->ROWCOL( i, j ).real = s 
* a->ROWCOL( i, j ).real; + b->ROWCOL( i, j ).imag = s * a->ROWCOL( i, j ).imag; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_s_mat.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_s_mat.c new file mode 100644 index 0000000000000000000000000000000000000000..d58ce2b28a7f2510ff80d01521354f417d0637cd --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_s_mat.c @@ -0,0 +1,52 @@ +/**************** s_m_s_mat.c (in su3.a) ****************************** +* * +* void scalar_mult_sub_su3_matrix_KE( su3_matrix *a, su3_matrix *b, * +* double s, su3_matrix *c) * +* C <- A - s*B, A,B and C matrices * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* c <- a - s*b, matrices */ +void scalar_mult_sub_su3_matrix_KE( su3_matrix * a, su3_matrix * b, double s, su3_matrix * c ) +{ + +#ifndef NATIVEDOUBLE + register int i, j; + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + c->ROWCOL( i, j ).real = a->ROWCOL( i, j ).real - s * b->ROWCOL( i, j ).real; + c->ROWCOL( i, j ).imag = a->ROWCOL( i, j ).imag - s * b->ROWCOL( i, j ).imag; + } + +#else /* RS6000 version */ + + register double ss; + + ss = s; + + c->ROWCOL( 0, 0 ).real = a->ROWCOL( 0, 0 ).real - ss * b->ROWCOL( 0, 0 ).real; + c->ROWCOL( 0, 0 ).imag = a->ROWCOL( 0, 0 ).imag - ss * b->ROWCOL( 0, 0 ).imag; + c->ROWCOL( 0, 1 ).real = a->ROWCOL( 0, 1 ).real - ss * b->ROWCOL( 0, 1 ).real; + c->ROWCOL( 0, 1 ).imag = a->ROWCOL( 0, 1 ).imag - ss * b->ROWCOL( 0, 1 ).imag; + c->ROWCOL( 0, 2 ).real = a->ROWCOL( 0, 2 ).real - ss * b->ROWCOL( 0, 2 ).real; + c->ROWCOL( 0, 2 ).imag = a->ROWCOL( 0, 2 ).imag - ss * b->ROWCOL( 0, 2 ).imag; + + c->ROWCOL( 1, 0 ).real = a->ROWCOL( 1, 0 ).real - ss * b->ROWCOL( 1, 0 ).real; + c->ROWCOL( 1, 0 ).imag = a->ROWCOL( 1, 0 ).imag - ss * b->ROWCOL( 1, 0 ).imag; + c->ROWCOL( 1, 1 ).real = a->ROWCOL( 1, 1 ).real - ss * b->ROWCOL( 1, 1 ).real; + c->ROWCOL( 1, 1 ).imag = a->ROWCOL( 1, 
1 ).imag - ss * b->ROWCOL( 1, 1 ).imag; + c->ROWCOL( 1, 2 ).real = a->ROWCOL( 1, 2 ).real - ss * b->ROWCOL( 1, 2 ).real; + c->ROWCOL( 1, 2 ).imag = a->ROWCOL( 1, 2 ).imag - ss * b->ROWCOL( 1, 2 ).imag; + + c->ROWCOL( 2, 0 ).real = a->ROWCOL( 2, 0 ).real - ss * b->ROWCOL( 2, 0 ).real; + c->ROWCOL( 2, 0 ).imag = a->ROWCOL( 2, 0 ).imag - ss * b->ROWCOL( 2, 0 ).imag; + c->ROWCOL( 2, 1 ).real = a->ROWCOL( 2, 1 ).real - ss * b->ROWCOL( 2, 1 ).real; + c->ROWCOL( 2, 1 ).imag = a->ROWCOL( 2, 1 ).imag - ss * b->ROWCOL( 2, 1 ).imag; + c->ROWCOL( 2, 2 ).real = a->ROWCOL( 2, 2 ).real - ss * b->ROWCOL( 2, 2 ).real; + c->ROWCOL( 2, 2 ).imag = a->ROWCOL( 2, 2 ).imag - ss * b->ROWCOL( 2, 2 ).imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_s_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_s_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..603f46d6cb9e0e966dbda0041dd1944eea0f3f31 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_s_vec.c @@ -0,0 +1,37 @@ +/***************** s_m_s_vec.c (in su3.a) ***************************** +* * +* void scalar_mult_sub_su3_vector_KE( su3_vector *a, su3_vector *b, * +* double s, su3_vector *c) * +* C <- A - s*B, A,B and C vectors * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* c <- a - s*b, vectors */ +void scalar_mult_sub_su3_vector_KE( su3_vector * a, su3_vector * b, double s, su3_vector * c ) +{ + +#ifndef NATIVEDOUBLE + register int i; + for ( i = 0; i < 3; i++ ) + { + c->c[i].real = a->c[i].real - s * b->c[i].real; + c->c[i].imag = a->c[i].imag - s * b->c[i].imag; + } + +#else /* RS6000 version */ + + register double ss; + + ss = s; + + c->c[0].real = a->c[0].real - ss * b->c[0].real; + c->c[0].imag = a->c[0].imag - ss * b->c[0].imag; + c->c[1].real = a->c[1].real - ss * b->c[1].real; + c->c[1].imag = a->c[1].imag - ss * b->c[1].imag; + c->c[2].real = a->c[2].real - ss * 
b->c[2].real; + c->c[2].imag = a->c[2].imag - ss * b->c[2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_sum_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_sum_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..b9a262582f3730e6266225b95eb4c15391fd7ab2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_sum_vec.c @@ -0,0 +1,36 @@ +/**************** s_m_sum_vec.c (in su3.a) **************************** +* * +* void scalar_mult_sum_su3_vector_KE( su3_vector *a, su3_vector *b, double s )* +* A <- A + s*B, A and B vectors * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* a <- a + s*b, vectors */ +void scalar_mult_sum_su3_vector_KE( su3_vector * a, su3_vector * b, double s ) +{ + +#ifndef NATIVEDOUBLE + register int i; + for ( i = 0; i < 3; i++ ) + { + a->c[i].real += s * b->c[i].real; + a->c[i].imag += s * b->c[i].imag; + } + +#else /* RS6000 version */ + + register double ss; + + ss = s; + + a->c[0].real += ss * b->c[0].real; + a->c[0].imag += ss * b->c[0].imag; + a->c[1].real += ss * b->c[1].real; + a->c[1].imag += ss * b->c[1].imag; + a->c[2].real += ss * b->c[2].real; + a->c[2].imag += ss * b->c[2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_vec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_vec.c new file mode 100644 index 0000000000000000000000000000000000000000..b11a1a263ea88ebe6a9aa644ce3fd6ce5ab71579 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_vec.c @@ -0,0 +1,36 @@ +/****************** s_m_vec.c (in su3.a) ****************************** +* * +* void scalar_mult_su3_vector_KE( su3_vector *a, double s, su3_vector *c) * +* C <- s*A, A and C vectors * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* c <- s*a, vectors */ +void scalar_mult_su3_vector_KE( 
su3_vector * a, double s, su3_vector * c ) +{ + +#ifndef NATIVEDOUBLE + register int i; + for ( i = 0; i < 3; i++ ) + { + c->c[i].real = s * a->c[i].real; + c->c[i].imag = s * a->c[i].imag; + } + +#else /* RS6000 version */ + + register double ss; + + ss = s; + + c->c[0].real = ss * a->c[0].real; + c->c[0].imag = ss * a->c[0].imag; + c->c[1].real = ss * a->c[1].real; + c->c[1].imag = ss * a->c[1].imag; + c->c[2].real = ss * a->c[2].real; + c->c[2].imag = ss * a->c[2].imag; + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..3b18ce922d92dbc47d4b2b3c1dfd995f8b191233 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/s_m_wvec.c @@ -0,0 +1,44 @@ +/******************** s_m_wvec.c (in su3.a) ******************** +* +*void scalar_mult_wvec(wilson_vector *src, double s, wilson_vector *dest) +* Multiply a Wilson vector by a scalar +* dest <- s*src +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void scalar_mult_wvec( wilson_vector * src, double s, wilson_vector * dest ) +{ + register double ss; + ss = s; + + dest->COLORSPINOR( 0, 0 ).real = ss * src->COLORSPINOR( 0, 0 ).real; + dest->COLORSPINOR( 0, 0 ).imag = ss * src->COLORSPINOR( 0, 0 ).imag; + dest->COLORSPINOR( 1, 0 ).real = ss * src->COLORSPINOR( 1, 0 ).real; + dest->COLORSPINOR( 1, 0 ).imag = ss * src->COLORSPINOR( 1, 0 ).imag; + dest->COLORSPINOR( 2, 0 ).real = ss * src->COLORSPINOR( 2, 0 ).real; + dest->COLORSPINOR( 2, 0 ).imag = ss * src->COLORSPINOR( 2, 0 ).imag; + + dest->COLORSPINOR( 0, 1 ).real = ss * src->COLORSPINOR( 0, 1 ).real; + dest->COLORSPINOR( 0, 1 ).imag = ss * src->COLORSPINOR( 0, 1 ).imag; + dest->COLORSPINOR( 1, 1 ).real = ss * src->COLORSPINOR( 1, 1 ).real; + dest->COLORSPINOR( 1, 1 ).imag = ss * src->COLORSPINOR( 1, 1 ).imag; + dest->COLORSPINOR( 2, 1 
).real = ss * src->COLORSPINOR( 2, 1 ).real; + dest->COLORSPINOR( 2, 1 ).imag = ss * src->COLORSPINOR( 2, 1 ).imag; + + dest->COLORSPINOR( 0, 2 ).real = ss * src->COLORSPINOR( 0, 2 ).real; + dest->COLORSPINOR( 0, 2 ).imag = ss * src->COLORSPINOR( 0, 2 ).imag; + dest->COLORSPINOR( 1, 2 ).real = ss * src->COLORSPINOR( 1, 2 ).real; + dest->COLORSPINOR( 1, 2 ).imag = ss * src->COLORSPINOR( 1, 2 ).imag; + dest->COLORSPINOR( 2, 2 ).real = ss * src->COLORSPINOR( 2, 2 ).real; + dest->COLORSPINOR( 2, 2 ).imag = ss * src->COLORSPINOR( 2, 2 ).imag; + + dest->COLORSPINOR( 0, 3 ).real = ss * src->COLORSPINOR( 0, 3 ).real; + dest->COLORSPINOR( 0, 3 ).imag = ss * src->COLORSPINOR( 0, 3 ).imag; + dest->COLORSPINOR( 1, 3 ).real = ss * src->COLORSPINOR( 1, 3 ).real; + dest->COLORSPINOR( 1, 3 ).imag = ss * src->COLORSPINOR( 1, 3 ).imag; + dest->COLORSPINOR( 2, 3 ).real = ss * src->COLORSPINOR( 2, 3 ).real; + dest->COLORSPINOR( 2, 3 ).imag = ss * src->COLORSPINOR( 2, 3 ).imag; + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3_adjoint.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3_adjoint.c new file mode 100644 index 0000000000000000000000000000000000000000..0b8547d275c7891062535706ea6f0e7e549c0cde --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3_adjoint.c @@ -0,0 +1,30 @@ +/****************** su3_adjoint.c (in su3.a) ************************** +* * +* void su3_adjoint_KE( su3_matrix *a, su3_matrix *b ) * +* B <- A_adjoint, adjoint of an SU3 matrix * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* adjoint of an SU3 matrix */ +void su3_adjoint_KE( su3_matrix * a, su3_matrix * b ) +{ + register int i, j; + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + CONJG( a->ROWCOL( j, i ), b->ROWCOL( i, j ) ); + } +} + +/* transpose of an SU3 matrix */ +void su3_transpose( su3_matrix * a, su3_matrix * b ) +{ + register int i, j; + for ( i = 0; i < 3; i++ ) + for ( j 
= 0; j < 3; j++ ) + { + b->ROWCOL( j, i ) = a->ROWCOL( i, j ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3_dot.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3_dot.c new file mode 100644 index 0000000000000000000000000000000000000000..35bc1baa652328afe6e7c176169602010f4c21de --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3_dot.c @@ -0,0 +1,54 @@ +/****************** su3_dot.c (in su3.a) ****************************** +* * +* complex su3_dot_KE( su3_vector *a, su3_vector *b ) * +* return dot product of two su3_vectors: a^dagger b * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +complex su3_dot_KE( su3_vector * a, su3_vector * b ) +{ + +#ifndef FAST + complex temp1, temp2; + CMULJ_( a->c[0], b->c[0], temp1 ) CMULJ_( a->c[1], b->c[1], temp2 ) CSUM( temp1, temp2 ); + CMULJ_( a->c[2], b->c[2], temp2 ) CSUM( temp1, temp2 ); + return ( temp1 ); + +#else /* RS6000 version */ + +#ifdef NATIVEDOUBLE + register double ar, ai, br, bi, cr, ci; +#else + register double ar, ai, br, bi, cr, ci; +#endif + register complex cc; + + ar = a->c[0].real; + ai = a->c[0].imag; + br = b->c[0].real; + bi = b->c[0].imag; + cr = ar * br + ai * bi; + ci = ar * bi - ai * br; + + ar = a->c[1].real; + ai = a->c[1].imag; + br = b->c[1].real; + bi = b->c[1].imag; + cr += ar * br + ai * bi; + ci += ar * bi - ai * br; + + ar = a->c[2].real; + ai = a->c[2].imag; + br = b->c[2].real; + bi = b->c[2].imag; + cr += ar * br + ai * bi; + ci += ar * bi - ai * br; + + cc.real = cr; + cc.imag = ci; + return ( cc ); + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3_proj.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3_proj.c new file mode 100644 index 0000000000000000000000000000000000000000..59bd63fab44640da6d9eac3f6e8297cfc9e85a80 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3_proj.c @@ -0,0 +1,62 @@ 
+/***************** su3_proj.c (in su3.a) ****************************** +* * +* void su3_projector_KE( su3_vector *a, su3_vector *b, su3_matrix *c ) * +* C <- outer product of A and B * +* C_ij = A_i * B_adjoint_j * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +#ifndef FAST +void su3_projector_KE( su3_vector * a, su3_vector * b, su3_matrix * c ) +{ + register int i, j; + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + CMUL_J( a->c[i], b->c[j], c->ROWCOL( i, j ) ); + } +} + +#else +#ifdef NATIVEDOUBLE /* RS6000 version */ + +void su3_projector_KE( su3_vector * a, su3_vector * b, su3_matrix * c ) +{ + + register int i, j; + register double ar, ai, br, bi; + + for ( i = 0; i < 3; i++ ) + { + ar = a->c[i].real; + ai = a->c[i].imag; + for ( j = 0; j < 3; j++ ) + { + br = b->c[j].real; + bi = b->c[j].imag; + c->ROWCOL( i, j ).real = ar * br + ai * bi; + c->ROWCOL( i, j ).imag = ai * br - ar * bi; + } + } +} +#else + +void su3_projector_KE( su3_vector * a, su3_vector * b, su3_matrix * c ) +{ + register int i, j; + register double tmp, tmp2; + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + tmp2 = a->c[i].real * b->c[j].real; + tmp = a->c[i].imag * b->c[j].imag; + c->ROWCOL( i, j ).real = tmp + tmp2; + tmp2 = a->c[i].real * b->c[j].imag; + tmp = a->c[i].imag * b->c[j].real; + c->ROWCOL( i, j ).imag = tmp - tmp2; + } +} +#endif /* End of "#ifdef NATIVEDOUBLE" */ +#endif /* end ifdef FAST */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3_proj_w.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3_proj_w.c new file mode 100644 index 0000000000000000000000000000000000000000..14a27b92a2d7f9a08d6902f7f02f05a39ecd114e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3_proj_w.c @@ -0,0 +1,26 @@ +/***************** su3_proj_w.c (in su3.a) ****************************/ +/* MIMD version 6 */ +/* * + * void su3_projector_w( wilson_vector *a, 
wilson_vector *b, su3_matrix *c ) + * C <- sum over spins of outer product of A.d[i] and B.d[i] * + * C_ij = sum( A_i * B_adjoint_j ) * + */ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void su3_projector_w( wilson_vector * a, wilson_vector * b, su3_matrix * c ) +{ + register int i, j, k; + register complex cc; + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + c->ROWCOL( i, j ) = cmplx_KE( 0.0, 0.0 ); + for ( k = 0; k < 4; k++ ) + { + CMUL_J( a->COLORSPINOR( i, k ), b->COLORSPINOR( j, k ), cc ); + CSUM( c->ROWCOL( i, j ), cc ); + } + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3_rdot.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3_rdot.c new file mode 100644 index 0000000000000000000000000000000000000000..8a5f2b03b9f99d72fcb46b7814f46c4fb8b08579 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3_rdot.c @@ -0,0 +1,53 @@ +/***************** su3_rdot.c (in su3.a) ****************************** +* * +* double su3_rdot_KE( su3_vector *a, su3_vector *b ) * +* return real part of dot product of two su3_vectors * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +double su3_rdot_KE( su3_vector * a, su3_vector * b ) +{ + +#ifndef NATIVEDOUBLE + register double temp1, temp2; + temp2 = a->c[0].real * b->c[0].real; + temp1 = a->c[0].imag * b->c[0].imag; + temp2 += temp1; + temp1 = a->c[1].real * b->c[1].real; + temp2 += temp1; + temp1 = a->c[1].imag * b->c[1].imag; + temp2 += temp1; + temp1 = a->c[2].real * b->c[2].real; + temp2 += temp1; + temp1 = a->c[2].imag * b->c[2].imag; + temp2 += temp1; + return ( temp2 ); + +#else /* RS6000 version */ + + register double ar, ai, br, bi, ss; + + ar = a->c[0].real; + ai = a->c[0].imag; + br = b->c[0].real; + bi = b->c[0].imag; + ss = ar * br + ai * bi; + + ar = a->c[1].real; + ai = a->c[1].imag; + br = b->c[1].real; + bi = b->c[1].imag; + ss += ar * br + ai 
* bi; + + ar = a->c[2].real; + ai = a->c[2].imag; + br = b->c[2].real; + bi = b->c[2].imag; + ss += ar * br + ai * bi; + + return ( ss ); + +#endif +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3mat_copy.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3mat_copy.c new file mode 100644 index 0000000000000000000000000000000000000000..185eb8fba73f416579f613ddcf849ed052c4bc2e --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3mat_copy.c @@ -0,0 +1,20 @@ +/***************** su3mat_copy.c (in su3.a) *************************** +* * +* void su3mat_copy_KE( su3_matrix *a, su3_matrix *b ) * +* Copy an su3 matrix: B <- A * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* Copy a su3 matrix: b <- a */ +void su3mat_copy_KE( su3_matrix * a, su3_matrix * b ) +{ + register int i, j; + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + b->ROWCOL( i, j ).real = a->ROWCOL( i, j ).real; + b->ROWCOL( i, j ).imag = a->ROWCOL( i, j ).imag; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3vec_copy.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3vec_copy.c new file mode 100644 index 0000000000000000000000000000000000000000..b861e1bd387c51ec3d7c0aa3d07bea4c2a337fc1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/su3vec_copy.c @@ -0,0 +1,19 @@ +/***************** su3vec_copy.c (in su3.a) *************************** +* * +* void su3vec_copy_KE( su3_vector *a, su3_vector *b ) * +* Copy an su3 vector: B <- A * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* Copy a su3 vector: b <- a */ +void su3vec_copy_KE( su3_vector * a, su3_vector * b ) +{ + register int i; + for ( i = 0; i < 3; i++ ) + { + b->c[i].real = a->c[i].real; + b->c[i].imag = a->c[i].imag; + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/sub4vecs.c 
b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/sub4vecs.c new file mode 100644 index 0000000000000000000000000000000000000000..dbad652723cf3a99cd42efe1d9109bc999b6897c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/sub4vecs.c @@ -0,0 +1,40 @@ +/***************** sub4vecs.c (in su3.a) ****************************** +* * +* Subtract four su3_vectors from an su3_vector * +* void sub_four_su3_vecs_KE( su3_vector *a,*b1,*b2,*b3,*b4) * +* A <- A - B1 - B2 - B3 - B4 * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* subtract four su3 vectors */ +#ifndef FAST +void sub_four_su3_vecs_KE( su3_vector * a, su3_vector * b1, su3_vector * b2, su3_vector * b3, su3_vector * b4 ) +{ + register int i; + for ( i = 0; i < 3; i++ ) + { + CSUB( a->c[i], b1->c[i], a->c[i] ); + CSUB( a->c[i], b2->c[i], a->c[i] ); + CSUB( a->c[i], b3->c[i], a->c[i] ); + CSUB( a->c[i], b4->c[i], a->c[i] ); + } +} +#else +void sub_four_su3_vecs_KE( su3_vector * a, su3_vector * b1, su3_vector * b2, su3_vector * b3, su3_vector * b4 ) +{ + CSUB( a->c[0], b1->c[0], a->c[0] ); + CSUB( a->c[1], b1->c[1], a->c[1] ); + CSUB( a->c[2], b1->c[2], a->c[2] ); + CSUB( a->c[0], b2->c[0], a->c[0] ); + CSUB( a->c[1], b2->c[1], a->c[1] ); + CSUB( a->c[2], b2->c[2], a->c[2] ); + CSUB( a->c[0], b3->c[0], a->c[0] ); + CSUB( a->c[1], b3->c[1], a->c[1] ); + CSUB( a->c[2], b3->c[2], a->c[2] ); + CSUB( a->c[0], b4->c[0], a->c[0] ); + CSUB( a->c[1], b4->c[1], a->c[1] ); + CSUB( a->c[2], b4->c[2], a->c[2] ); +} +#endif diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/sub_wvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/sub_wvec.c new file mode 100644 index 0000000000000000000000000000000000000000..7df0ad391727a732d5f2227e28179f4dcbc9eae1 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/sub_wvec.c @@ -0,0 +1,41 @@ +/******************** sub_wvec.c (in su3.a) ******************** +* +*void 
sub_wilson_vector(wilson_vector *src1,*src2,*dest) +* subtract two Wilson vectors +* dest <- src1 - src2 +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + + +void sub_wilson_vector( wilson_vector * src1, wilson_vector * src2, wilson_vector * dest ) +{ + dest->COLORSPINOR( 0, 0 ).real = src1->COLORSPINOR( 0, 0 ).real - src2->COLORSPINOR( 0, 0 ).real; + dest->COLORSPINOR( 0, 0 ).imag = src1->COLORSPINOR( 0, 0 ).imag - src2->COLORSPINOR( 0, 0 ).imag; + dest->COLORSPINOR( 1, 0 ).real = src1->COLORSPINOR( 1, 0 ).real - src2->COLORSPINOR( 1, 0 ).real; + dest->COLORSPINOR( 1, 0 ).imag = src1->COLORSPINOR( 1, 0 ).imag - src2->COLORSPINOR( 1, 0 ).imag; + dest->COLORSPINOR( 2, 0 ).real = src1->COLORSPINOR( 2, 0 ).real - src2->COLORSPINOR( 2, 0 ).real; + dest->COLORSPINOR( 2, 0 ).imag = src1->COLORSPINOR( 2, 0 ).imag - src2->COLORSPINOR( 2, 0 ).imag; + + dest->COLORSPINOR( 0, 1 ).real = src1->COLORSPINOR( 0, 1 ).real - src2->COLORSPINOR( 0, 1 ).real; + dest->COLORSPINOR( 0, 1 ).imag = src1->COLORSPINOR( 0, 1 ).imag - src2->COLORSPINOR( 0, 1 ).imag; + dest->COLORSPINOR( 1, 1 ).real = src1->COLORSPINOR( 1, 1 ).real - src2->COLORSPINOR( 1, 1 ).real; + dest->COLORSPINOR( 1, 1 ).imag = src1->COLORSPINOR( 1, 1 ).imag - src2->COLORSPINOR( 1, 1 ).imag; + dest->COLORSPINOR( 2, 1 ).real = src1->COLORSPINOR( 2, 1 ).real - src2->COLORSPINOR( 2, 1 ).real; + dest->COLORSPINOR( 2, 1 ).imag = src1->COLORSPINOR( 2, 1 ).imag - src2->COLORSPINOR( 2, 1 ).imag; + + dest->COLORSPINOR( 0, 2 ).real = src1->COLORSPINOR( 0, 2 ).real - src2->COLORSPINOR( 0, 2 ).real; + dest->COLORSPINOR( 0, 2 ).imag = src1->COLORSPINOR( 0, 2 ).imag - src2->COLORSPINOR( 0, 2 ).imag; + dest->COLORSPINOR( 1, 2 ).real = src1->COLORSPINOR( 1, 2 ).real - src2->COLORSPINOR( 1, 2 ).real; + dest->COLORSPINOR( 1, 2 ).imag = src1->COLORSPINOR( 1, 2 ).imag - src2->COLORSPINOR( 1, 2 ).imag; + dest->COLORSPINOR( 2, 2 ).real = src1->COLORSPINOR( 2, 2 ).real - src2->COLORSPINOR( 2, 2 
).real; + dest->COLORSPINOR( 2, 2 ).imag = src1->COLORSPINOR( 2, 2 ).imag - src2->COLORSPINOR( 2, 2 ).imag; + + dest->COLORSPINOR( 0, 3 ).real = src1->COLORSPINOR( 0, 3 ).real - src2->COLORSPINOR( 0, 3 ).real; + dest->COLORSPINOR( 0, 3 ).imag = src1->COLORSPINOR( 0, 3 ).imag - src2->COLORSPINOR( 0, 3 ).imag; + dest->COLORSPINOR( 1, 3 ).real = src1->COLORSPINOR( 1, 3 ).real - src2->COLORSPINOR( 1, 3 ).real; + dest->COLORSPINOR( 1, 3 ).imag = src1->COLORSPINOR( 1, 3 ).imag - src2->COLORSPINOR( 1, 3 ).imag; + dest->COLORSPINOR( 2, 3 ).real = src1->COLORSPINOR( 2, 3 ).real - src2->COLORSPINOR( 2, 3 ).real; + dest->COLORSPINOR( 2, 3 ).imag = src1->COLORSPINOR( 2, 3 ).imag - src2->COLORSPINOR( 2, 3 ).imag; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/submat.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/submat.c new file mode 100644 index 0000000000000000000000000000000000000000..c0af4024e47777e53f96f7f741a0376dfe37e5ae --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/submat.c @@ -0,0 +1,19 @@ +/******************* submat.c (in su3.a) ****************************** +* * +* void sub_su3_matrix_KE(a,b,c) su3_matrix *a,*b,*c; * +* subtract su3 matrices: C <- A - B * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* subtract su3 matrices */ +void sub_su3_matrix_KE( su3_matrix * a, su3_matrix * b, su3_matrix * c ) +{ + register int i, j; + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + CSUB( a->ROWCOL( i, j ), b->ROWCOL( i, j ), c->ROWCOL( i, j ) ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/subvec.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/subvec.c new file mode 100644 index 0000000000000000000000000000000000000000..f5e6ba59b414f6423b94b4b8ee3a0a22445c1ed5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/subvec.c @@ -0,0 +1,18 @@ +/********************* subvec.c (in su3.a) 
**************************** +* * +* void sub_su3_vector_KE(a,b,c) su3_vector *a,*b,*c; * +* subtract su3 vectors: C <- A - B * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* subtract su3 vectors */ +void sub_su3_vector_KE( su3_vector * a, su3_vector * b, su3_vector * c ) +{ + register int i; + for ( i = 0; i < 3; i++ ) + { + CSUB( a->c[i], b->c[i], c->c[i] ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/trace_su3.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/trace_su3.c new file mode 100644 index 0000000000000000000000000000000000000000..e4a1fed91c4d0dc0db02bfd444f16d479c82daec --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/trace_su3.c @@ -0,0 +1,17 @@ +/******************* trace_su3.c (in su3.a) *************************** +* * +* complex trace_su3_KE(a) su3_matrix *a; * +* return complex trace of an SU3 matrix * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +/* Complex trace of an SU3 matrix */ +complex trace_su3_KE( su3_matrix * a ) +{ + register complex t1, t2; + CADD( a->ROWCOL( 0, 0 ), a->ROWCOL( 1, 1 ), t1 ); + CADD( t1, a->ROWCOL( 2, 2 ), t2 ); + return ( t2 ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/uncmp_ahmat.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/uncmp_ahmat.c new file mode 100644 index 0000000000000000000000000000000000000000..09cb83c4b51041af4955d99ad7a33b3a79800b17 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/uncmp_ahmat.c @@ -0,0 +1,36 @@ +/************ uncmp_ahmat.c (in su3.a) ******************************** +* * +* void uncompress_anti_hermitian_KE( anti_hermitmat *mat_antihermit, * +* su3_matrix *mat_su3 ) * +* uncompresses an anti_hermitian matrix to make a 3x3 complex matrix * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +void 
uncompress_anti_hermitian_KE( anti_hermitmat * mat_antihermit, su3_matrix * mat_su3 ) +{ + /* uncompresses an anti_hermitian su3 matrix */ + double temp1; + mat_su3->ROWCOL( 0, 0 ).imag = mat_antihermit->m00im; + mat_su3->ROWCOL( 0, 0 ).real = 0.; + mat_su3->ROWCOL( 1, 1 ).imag = mat_antihermit->m11im; + mat_su3->ROWCOL( 1, 1 ).real = 0.; + mat_su3->ROWCOL( 2, 2 ).imag = mat_antihermit->m22im; + mat_su3->ROWCOL( 2, 2 ).real = 0.; + mat_su3->ROWCOL( 0, 1 ).imag = mat_antihermit->m01.imag; + temp1 = mat_antihermit->m01.real; + mat_su3->ROWCOL( 0, 1 ).real = temp1; + mat_su3->ROWCOL( 1, 0 ).real = -temp1; + mat_su3->ROWCOL( 1, 0 ).imag = mat_antihermit->m01.imag; + mat_su3->ROWCOL( 0, 2 ).imag = mat_antihermit->m02.imag; + temp1 = mat_antihermit->m02.real; + mat_su3->ROWCOL( 0, 2 ).real = temp1; + mat_su3->ROWCOL( 2, 0 ).real = -temp1; + mat_su3->ROWCOL( 2, 0 ).imag = mat_antihermit->m02.imag; + mat_su3->ROWCOL( 1, 2 ).imag = mat_antihermit->m12.imag; + temp1 = mat_antihermit->m12.real; + mat_su3->ROWCOL( 1, 2 ).real = temp1; + mat_su3->ROWCOL( 2, 1 ).real = -temp1; + mat_su3->ROWCOL( 2, 1 ).imag = mat_antihermit->m12.imag; +} /*uncompress_anti_hermitian */ diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wp_grow.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wp_grow.c new file mode 100644 index 0000000000000000000000000000000000000000..a3c57ebe6ef6c09a737baaf3d4c9ad21457cd2eb --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wp_grow.c @@ -0,0 +1,240 @@ +/***************** wp_grow.c (in su3.a) **************************/ +/* + Expand the "Wilson projection" of a Wilson fermion vector. + (1 +- gamma_j) is a projection operator, and we are given a + half_wilson_vector which contains the two components of a Wilson + vector projected out. This routine reexpands it to a four component + object. 
+ + usage: wp_grow( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign ); + + If dir is one of XUP,YUP,ZUP or TUP, the projection is + along the eigenvectors with eigenvalue +1, which survive + multiplication by (1+gamma[dir]). + If dir is one of XDOWN,YDOWN,ZDOWN or TDOWN, the projection is + along the eigenvectors with eigenvalue -1, which survive + multiplication by (1-gamma[OPP_DIR(dir)]). + If sign=MINUS reverse the roles of +1 and -1 - in other words + use -gamma_dir instead of gamma_dir + + Here my eigenvectors are normalized to 2, so for XYZT directions + I won't explicitly multiply by 2. In other words, the matrix of + eigenvectors is sqrt(2) times a unitary matrix, and in reexpanding + the vector I will multiply by the adjoint of this matrix. + + For UP directions, hvec.h[0] and hvec.h[1] contain the projections + along the first and second eigenvectors respectively. + For DOWN directions, hvec.h[0] and hvec.h[1] contain the projections + along the third and fourth eigenvectors respectively. This results + in down directions differing from up directions only in the sign of + the addition. 
+ + Note: wp_shrink( +-dir) followed by wp_grow( +-dir) amounts to multiplication + by 1+-gamma_dir + + gamma(XUP) eigenvectors eigenvalue + 0 0 0 i ( 1, 0, 0,-i) +1 + 0 0 i 0 ( 0, 1,-i, 0) +1 + 0 -i 0 0 ( 1, 0, 0,+i) -1 + -i 0 0 0 ( 0, 1,+i, 0) -1 + + gamma(YUP) eigenvectors eigenvalue + 0 0 0 -1 ( 1, 0, 0,-1) +1 + 0 0 1 0 ( 0, 1, 1, 0) +1 + 0 1 0 0 ( 1, 0, 0, 1) -1 + -1 0 0 0 ( 0, 1,-1, 0) -1 + + gamma(ZUP) eigenvectors eigenvalue + 0 0 i 0 ( 1, 0,-i, 0) +1 + 0 0 0 -i ( 0, 1, 0,+i) +1 + -i 0 0 0 ( 1, 0,+i, 0) -1 + 0 i 0 0 ( 0, 1, 0,-i) -1 + + gamma(TUP) eigenvectors eigenvalue + 0 0 1 0 ( 1, 0, 1, 0) +1 + 0 0 0 1 ( 0, 1, 0, 1) +1 + 1 0 0 0 ( 1, 0,-1, 0) -1 + 0 1 0 0 ( 0, 1, 0,-1) -1 + + gamma(FIVE) eigenvectors eigenvalue + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 + */ +#include "../include/config.h" +#include <stdio.h> +#include "../include/complex.h" +#include "../include/su3.h" +#include "../include/dirs.h" + + +void wp_grow( half_wilson_vector * src, wilson_vector * dest, int dir, int sign ) +{ + register int i; /*color */ + + if( sign == MINUS ) + dir = OPP_DIR( dir ); /* two ways to get -gamma_dir ! 
*/ + switch ( dir ) + { + case XUP: + for ( i = 0; i < 3; i++ ) + { + dest->COLORSPINOR( i, 0 ) = src->h[0].c[i]; + dest->COLORSPINOR( i, 1 ) = src->h[1].c[i]; + TIMESMINUSI( src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + TIMESMINUSI( src->h[1].c[i], dest->COLORSPINOR( i, 2 ) ); + } + break; + case XDOWN: + for ( i = 0; i < 3; i++ ) + { + dest->COLORSPINOR( i, 0 ) = src->h[0].c[i]; + dest->COLORSPINOR( i, 1 ) = src->h[1].c[i]; + TIMESPLUSI( src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + TIMESPLUSI( src->h[1].c[i], dest->COLORSPINOR( i, 2 ) ); + } + break; + case YUP: + for ( i = 0; i < 3; i++ ) + { + dest->COLORSPINOR( i, 0 ) = src->h[0].c[i]; + dest->COLORSPINOR( i, 1 ) = src->h[1].c[i]; + TIMESMINUSONE( src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + TIMESPLUSONE( src->h[1].c[i], dest->COLORSPINOR( i, 2 ) ); + } + break; + case YDOWN: + for ( i = 0; i < 3; i++ ) + { + dest->COLORSPINOR( i, 0 ) = src->h[0].c[i]; + dest->COLORSPINOR( i, 1 ) = src->h[1].c[i]; + TIMESPLUSONE( src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + TIMESMINUSONE( src->h[1].c[i], dest->COLORSPINOR( i, 2 ) ); + } + break; + case ZUP: + for ( i = 0; i < 3; i++ ) + { + dest->COLORSPINOR( i, 0 ) = src->h[0].c[i]; + dest->COLORSPINOR( i, 1 ) = src->h[1].c[i]; + TIMESMINUSI( src->h[0].c[i], dest->COLORSPINOR( i, 2 ) ); + TIMESPLUSI( src->h[1].c[i], dest->COLORSPINOR( i, 3 ) ); + } + break; + case ZDOWN: + for ( i = 0; i < 3; i++ ) + { + dest->COLORSPINOR( i, 0 ) = src->h[0].c[i]; + dest->COLORSPINOR( i, 1 ) = src->h[1].c[i]; + TIMESPLUSI( src->h[0].c[i], dest->COLORSPINOR( i, 2 ) ); + TIMESMINUSI( src->h[1].c[i], dest->COLORSPINOR( i, 3 ) ); + } + break; + case TUP: + for ( i = 0; i < 3; i++ ) + { + dest->COLORSPINOR( i, 0 ) = src->h[0].c[i]; + dest->COLORSPINOR( i, 1 ) = src->h[1].c[i]; + dest->COLORSPINOR( i, 2 ) = src->h[0].c[i]; + dest->COLORSPINOR( i, 3 ) = src->h[1].c[i]; + } + break; + case TDOWN: + for ( i = 0; i < 3; i++ ) + { + dest->COLORSPINOR( i, 0 ) = src->h[0].c[i]; + dest->COLORSPINOR( 
i, 1 ) = src->h[1].c[i]; + TIMESMINUSONE( src->h[0].c[i], dest->COLORSPINOR( i, 2 ) ); + TIMESMINUSONE( src->h[1].c[i], dest->COLORSPINOR( i, 3 ) ); + } + break; + default: + printf( "BAD CALL TO WP_GROW()\n" ); + } +} +void wp_grow_hch( half_wilson_vector * src, wilson_vector * dest, int dir, int sign ) +{ + register int i; /*color */ + + if( sign == MINUS ) + dir = OPP_DIR( dir ); /* two ways to get -gamma_dir ! */ + switch ( dir ) + { + case XUP: + for ( i = 0; i < 3; i++ ) + { + dest->COLORSPINOR( i, 0 ) = src->h[0].c[i]; + dest->COLORSPINOR( i, 1 ) = src->h[1].c[i]; + TIMESMINUSI( src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + TIMESMINUSI( src->h[1].c[i], dest->COLORSPINOR( i, 2 ) ); + } + break; + case XDOWN: + for ( i = 0; i < 3; i++ ) + { + TIMESPLUSI( src->h[0].c[i], dest->COLORSPINOR( i, 0 ) ); + TIMESPLUSI( src->h[1].c[i], dest->COLORSPINOR( i, 1 ) ); + TIMESMINUSONE( src->h[1].c[i], dest->COLORSPINOR( i, 2 ) ); + TIMESMINUSONE( src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + } + break; + case YUP: + for ( i = 0; i < 3; i++ ) + { + dest->COLORSPINOR( i, 0 ) = src->h[0].c[i]; + dest->COLORSPINOR( i, 1 ) = src->h[1].c[i]; + TIMESMINUSONE( src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + TIMESPLUSONE( src->h[1].c[i], dest->COLORSPINOR( i, 2 ) ); + } + break; + case YDOWN: + for ( i = 0; i < 3; i++ ) + { + TIMESMINUSONE( src->h[0].c[i], dest->COLORSPINOR( i, 0 ) ); + TIMESPLUSONE( src->h[1].c[i], dest->COLORSPINOR( i, 1 ) ); + TIMESMINUSONE( src->h[1].c[i], dest->COLORSPINOR( i, 2 ) ); + TIMESMINUSONE( src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + } + break; + case ZUP: + for ( i = 0; i < 3; i++ ) + { + dest->COLORSPINOR( i, 0 ) = src->h[0].c[i]; + dest->COLORSPINOR( i, 1 ) = src->h[1].c[i]; + TIMESMINUSI( src->h[0].c[i], dest->COLORSPINOR( i, 2 ) ); + TIMESPLUSI( src->h[1].c[i], dest->COLORSPINOR( i, 3 ) ); + } + break; + case ZDOWN: + for ( i = 0; i < 3; i++ ) + { + TIMESPLUSI( src->h[1].c[i], dest->COLORSPINOR( i, 0 ) ); + TIMESMINUSI( src->h[0].c[i], 
dest->COLORSPINOR( i, 1 ) ); + TIMESMINUSONE( src->h[1].c[i], dest->COLORSPINOR( i, 2 ) ); + TIMESMINUSONE( src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + } + break; + case TUP: + for ( i = 0; i < 3; i++ ) + { + dest->COLORSPINOR( i, 0 ) = src->h[0].c[i]; + dest->COLORSPINOR( i, 1 ) = src->h[1].c[i]; + dest->COLORSPINOR( i, 2 ) = src->h[0].c[i]; + dest->COLORSPINOR( i, 3 ) = src->h[1].c[i]; + } + break; + case TDOWN: + for ( i = 0; i < 3; i++ ) + { + TIMESPLUSONE( src->h[1].c[i], dest->COLORSPINOR( i, 0 ) ); + TIMESPLUSONE( src->h[0].c[i], dest->COLORSPINOR( i, 1 ) ); + TIMESMINUSONE( src->h[1].c[i], dest->COLORSPINOR( i, 2 ) ); + TIMESMINUSONE( src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + } + break; + default: + printf( "BAD CALL TO WP_GROW()\n" ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wp_grow_a.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wp_grow_a.c new file mode 100644 index 0000000000000000000000000000000000000000..914e421c4a08b56f988fd299b6bd1accd717f1ff --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wp_grow_a.c @@ -0,0 +1,240 @@ +/***************** wp_grow_a.c (in su3.a) **************************/ +/* + Expand the "Wilson projection" of a Wilson fermion vector. + (1 +- gamma_j) is a projection operator, and we are given a + half_wilson_vector which contains the two components of a Wilson + vector projected out. This routine reexpands it to a four component + object and adds it to another Wilson vector. + + usage: wp_grow_add( half_wilson_vector *src, wilson_vector *dest, + int dir, int sign ); + + If dir is one of XUP,YUP,ZUP or TUP, the projection is + along the eigenvectors with eigenvalue +1, which survive + multiplcation by (1+gamma[dir]). + If dir is one of XDOWN,YDOWN,ZDOWN or TDOWN, the projection is + along the eigenvectors with eigenvalue -1, which survive + multiplication by (1-gamma[OPP_DIR(dir)]). 
+ If sign=MINUS reverse the roles of +1 and -1 - in other words + use -gamma_dir instead of gamma_dir + + Here my eigenvectors are normalized to 2, so for XYZT directions + I won't explicitly multiply by 2. In other words, the matrix of + eigenvectors is sqrt(2) times a unitary matrix, and in reexpanding + the vector I will multiply by the adjoint of this matrix. + + For UP directions, hvec.h[0] and hvec.h[1] contain the projections + along the first and second eigenvectors respectively. + For DOWN directions, hvec.h[0] and hvec.h[1] contain the projections + along the third and fourth eigenvectors respectively. This results + in down directions differing from up directions only in the sign of + the addition. + + Note: wp_shrink( +-dir) followed by wp_grow( +-dir) amounts to multiplication + by 1+-gamma_dir + + gamma(XUP) eigenvectors eigenvalue + 0 0 0 i ( 1, 0, 0,-i) +1 + 0 0 i 0 ( 0, 1,-i, 0) +1 + 0 -i 0 0 ( 1, 0, 0,+i) -1 + -i 0 0 0 ( 0, 1,+i, 0) -1 + + gamma(YUP) eigenvectors eigenvalue + 0 0 0 -1 ( 1, 0, 0,-1) +1 + 0 0 1 0 ( 0, 1, 1, 0) +1 + 0 1 0 0 ( 1, 0, 0, 1) -1 + -1 0 0 0 ( 0, 1,-1, 0) -1 + + gamma(ZUP) eigenvectors eigenvalue + 0 0 i 0 ( 1, 0,-i, 0) +1 + 0 0 0 -i ( 0, 1, 0,+i) +1 + -i 0 0 0 ( 1, 0,+i, 0) -1 + 0 i 0 0 ( 0, 1, 0,-i) -1 + + gamma(TUP) eigenvectors eigenvalue + 0 0 1 0 ( 1, 0, 1, 0) +1 + 0 0 0 1 ( 0, 1, 0, 1) +1 + 1 0 0 0 ( 1, 0,-1, 0) -1 + 0 1 0 0 ( 0, 1, 0,-1) -1 + + gamma(FIVE) eigenvectors eigenvalue + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 + */ +#include <stdio.h> +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" +#include "../include/dirs.h" + +void wp_grow_add( half_wilson_vector * src, wilson_vector * dest, int dir, int sign ) +{ + register int i; /*color */ + + if( sign == MINUS ) + dir = OPP_DIR( dir ); /* two ways to get -gamma_dir ! 
*/ + switch ( dir ) + { + case XUP: + for ( i = 0; i < 3; i++ ) + { + CSUM( dest->COLORSPINOR( i, 0 ), src->h[0].c[i] ); + CSUM( dest->COLORSPINOR( i, 1 ), src->h[1].c[i] ); + CSUM_TMI( dest->COLORSPINOR( i, 2 ), src->h[1].c[i] ); + CSUM_TMI( dest->COLORSPINOR( i, 3 ), src->h[0].c[i] ); + } + break; + case XDOWN: + for ( i = 0; i < 3; i++ ) + { + CSUM( dest->COLORSPINOR( i, 0 ), src->h[0].c[i] ); + CSUM( dest->COLORSPINOR( i, 1 ), src->h[1].c[i] ); + CSUM_TPI( dest->COLORSPINOR( i, 2 ), src->h[1].c[i] ); + CSUM_TPI( dest->COLORSPINOR( i, 3 ), src->h[0].c[i] ); + } + break; + case YUP: + for ( i = 0; i < 3; i++ ) + { + CSUM( dest->COLORSPINOR( i, 0 ), src->h[0].c[i] ); + CSUM( dest->COLORSPINOR( i, 1 ), src->h[1].c[i] ); + CSUM( dest->COLORSPINOR( i, 2 ), src->h[1].c[i] ); + CSUB( dest->COLORSPINOR( i, 3 ), src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + } + break; + case YDOWN: + for ( i = 0; i < 3; i++ ) + { + CSUM( dest->COLORSPINOR( i, 0 ), src->h[0].c[i] ); + CSUM( dest->COLORSPINOR( i, 1 ), src->h[1].c[i] ); + CSUB( dest->COLORSPINOR( i, 2 ), src->h[1].c[i], dest->COLORSPINOR( i, 2 ) ); + CSUM( dest->COLORSPINOR( i, 3 ), src->h[0].c[i] ); + } + break; + case ZUP: + for ( i = 0; i < 3; i++ ) + { + CSUM( dest->COLORSPINOR( i, 0 ), src->h[0].c[i] ); + CSUM( dest->COLORSPINOR( i, 1 ), src->h[1].c[i] ); + CSUM_TMI( dest->COLORSPINOR( i, 2 ), src->h[0].c[i] ); + CSUM_TPI( dest->COLORSPINOR( i, 3 ), src->h[1].c[i] ); + } + break; + case ZDOWN: + for ( i = 0; i < 3; i++ ) + { + CSUM( dest->COLORSPINOR( i, 0 ), src->h[0].c[i] ); + CSUM( dest->COLORSPINOR( i, 1 ), src->h[1].c[i] ); + CSUM_TPI( dest->COLORSPINOR( i, 2 ), src->h[0].c[i] ); + CSUM_TMI( dest->COLORSPINOR( i, 3 ), src->h[1].c[i] ); + } + break; + case TUP: + for ( i = 0; i < 3; i++ ) + { + CSUM( dest->COLORSPINOR( i, 0 ), src->h[0].c[i] ); + CSUM( dest->COLORSPINOR( i, 1 ), src->h[1].c[i] ); + CSUM( dest->COLORSPINOR( i, 2 ), src->h[0].c[i] ); + CSUM( dest->COLORSPINOR( i, 3 ), src->h[1].c[i] ); + } + break; 
+ case TDOWN: + for ( i = 0; i < 3; i++ ) + { + CSUM( dest->COLORSPINOR( i, 0 ), src->h[0].c[i] ); + CSUM( dest->COLORSPINOR( i, 1 ), src->h[1].c[i] ); + CSUB( dest->COLORSPINOR( i, 2 ), src->h[0].c[i], dest->COLORSPINOR( i, 2 ) ); + CSUB( dest->COLORSPINOR( i, 3 ), src->h[1].c[i], dest->COLORSPINOR( i, 3 ) ); + } + break; + default: + printf( "BAD CALL TO WP_GROW()\n" ); + } +} + +void wp_grow_add_hch( half_wilson_vector * src, wilson_vector * dest, int dir, int sign ) +{ + register int i; /*color */ + + if( sign == MINUS ) + dir = OPP_DIR( dir ); /* two ways to get -gamma_dir ! */ + switch ( dir ) + { + case XUP: + for ( i = 0; i < 3; i++ ) + { + CSUM( dest->COLORSPINOR( i, 0 ), src->h[0].c[i] ); + CSUM( dest->COLORSPINOR( i, 1 ), src->h[1].c[i] ); + CSUM_TMI( dest->COLORSPINOR( i, 2 ), src->h[1].c[i] ); + CSUM_TMI( dest->COLORSPINOR( i, 3 ), src->h[0].c[i] ); + } + break; + case XDOWN: + for ( i = 0; i < 3; i++ ) + { + CSUM_TPI( dest->COLORSPINOR( i, 0 ), src->h[0].c[i] ); + CSUM_TPI( dest->COLORSPINOR( i, 1 ), src->h[1].c[i] ); + CSUB( dest->COLORSPINOR( i, 2 ), src->h[1].c[i], dest->COLORSPINOR( i, 2 ) ); + CSUB( dest->COLORSPINOR( i, 3 ), src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + } + break; + case YUP: + for ( i = 0; i < 3; i++ ) + { + CSUM( dest->COLORSPINOR( i, 0 ), src->h[0].c[i] ); + CSUM( dest->COLORSPINOR( i, 1 ), src->h[1].c[i] ); + CSUM( dest->COLORSPINOR( i, 2 ), src->h[1].c[i] ); + CSUB( dest->COLORSPINOR( i, 3 ), src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + } + break; + case YDOWN: + for ( i = 0; i < 3; i++ ) + { + CSUB( dest->COLORSPINOR( i, 0 ), src->h[0].c[i], dest->COLORSPINOR( i, 0 ) ); + CSUM( dest->COLORSPINOR( i, 1 ), src->h[1].c[i] ); + CSUB( dest->COLORSPINOR( i, 2 ), src->h[1].c[i], dest->COLORSPINOR( i, 2 ) ); + CSUB( dest->COLORSPINOR( i, 3 ), src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + } + break; + case ZUP: + for ( i = 0; i < 3; i++ ) + { + CSUM( dest->COLORSPINOR( i, 0 ), src->h[0].c[i] ); + CSUM( dest->COLORSPINOR( i, 1 
), src->h[1].c[i] ); + CSUM_TMI( dest->COLORSPINOR( i, 2 ), src->h[0].c[i] ); + CSUM_TPI( dest->COLORSPINOR( i, 3 ), src->h[1].c[i] ); + } + break; + case ZDOWN: + for ( i = 0; i < 3; i++ ) + { + CSUM_TPI( dest->COLORSPINOR( i, 0 ), src->h[1].c[i] ); + CSUM_TMI( dest->COLORSPINOR( i, 1 ), src->h[0].c[i] ); + CSUB( dest->COLORSPINOR( i, 2 ), src->h[1].c[i], dest->COLORSPINOR( i, 2 ) ); + CSUB( dest->COLORSPINOR( i, 3 ), src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + } + break; + case TUP: + for ( i = 0; i < 3; i++ ) + { + CSUM( dest->COLORSPINOR( i, 0 ), src->h[0].c[i] ); + CSUM( dest->COLORSPINOR( i, 1 ), src->h[1].c[i] ); + CSUM( dest->COLORSPINOR( i, 2 ), src->h[0].c[i] ); + CSUM( dest->COLORSPINOR( i, 3 ), src->h[1].c[i] ); + } + break; + case TDOWN: + for ( i = 0; i < 3; i++ ) + { + CSUM( dest->COLORSPINOR( i, 0 ), src->h[1].c[i] ); + CSUM( dest->COLORSPINOR( i, 1 ), src->h[0].c[i] ); + CSUB( dest->COLORSPINOR( i, 2 ), src->h[1].c[i], dest->COLORSPINOR( i, 2 ) ); + CSUB( dest->COLORSPINOR( i, 3 ), src->h[0].c[i], dest->COLORSPINOR( i, 3 ) ); + } + break; + default: + printf( "BAD CALL TO WP_GROW()\n" ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wp_shrink.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wp_shrink.c new file mode 100644 index 0000000000000000000000000000000000000000..0833cec1f191690fc8428435f91dc4531813b801 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wp_shrink.c @@ -0,0 +1,241 @@ +/************* wp_shrink.c (in su3.a) **************************/ +/* + Compute the "Wilson projection" of a Wilson fermion vector. + (1 +- gamma_j) is a projection operator, and we want to isolate + the components of the vector that it keeps. In other words, keep + the components of the vector along the eigenvectors of 1+-gamma_j + with eigenvalue 2, and throw away those with eigenvalue 0. 
+ + usage: wp_shrink( wilson_vector *src, half_wilson_vector *dest, + int dir, int sign ) + + If dir is one of XUP,YUP,ZUP or TUP, take the projections + along the eigenvectors with eigenvalue +1, which survive + multiplication by (1+gamma[dir]). + If dir is one of XDOWN,YDOWN,ZDOWN or TDOWN, take the projections + along the eigenvectors with eigenvalue -1, which survive + multiplication by (1-gamma[OPP_DIR(dir)]). + If sign=MINUS, switch the roles of +1 and -1 (i.e. use -gamma_dir + instead of gamma_dir ) + + Here my eigenvectors are normalized to 2, so for XYZT directions + I won't explicitly multiply by 2. In other words, the matrix of + eigenvectors is sqrt(2) times a unitary matrix, and in reexpanding + the vector I will multiply by the adjoint of this matrix. + + For UP directions, hvec.h[0] and hvec.h[1] contain the projections + along the first and second eigenvectors respectively. + For DOWN directions, hvec.h[0] and hvec.h[1] contain the projections + along the third and fourth eigenvectors respectively. This results + in down directions differing from up directions only in the sign of + the addition. 
+ + Note: wp_shrink( +-dir) followed by wp_grow( +-dir) amounts to multiplication + by 1+-gamma_dir + + gamma(XUP) eigenvectors eigenvalue + 0 0 0 i ( 1, 0, 0,-i) +1 + 0 0 i 0 ( 0, 1,-i, 0) +1 + 0 -i 0 0 ( 1, 0, 0,+i) -1 + -i 0 0 0 ( 0, 1,+i, 0) -1 + + gamma(YUP) eigenvectors eigenvalue + 0 0 0 -1 ( 1, 0, 0,-1) +1 + 0 0 1 0 ( 0, 1, 1, 0) +1 + 0 1 0 0 ( 1, 0, 0, 1) -1 + -1 0 0 0 ( 0, 1,-1, 0) -1 + + gamma(ZUP) eigenvectors eigenvalue + 0 0 i 0 ( 1, 0,-i, 0) +1 + 0 0 0 -i ( 0, 1, 0,+i) +1 + -i 0 0 0 ( 1, 0,+i, 0) -1 + 0 i 0 0 ( 0, 1, 0,-i) -1 + + gamma(TUP) eigenvectors eigenvalue + 0 0 1 0 ( 1, 0, 1, 0) +1 + 0 0 0 1 ( 0, 1, 0, 1) +1 + 1 0 0 0 ( 1, 0,-1, 0) -1 + 0 1 0 0 ( 0, 1, 0,-1) -1 + + gamma(FIVE) eigenvectors eigenvalue + 1 0 0 0 + 0 1 0 0 + 0 0 -1 0 + 0 0 0 -1 + */ +#include <stdio.h> +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" +#include "../include/dirs.h" + + +void wp_shrink( wilson_vector * src, half_wilson_vector * dest, int dir, int sign ) +{ + register int i; /*color */ + + if( sign == MINUS ) + dir = OPP_DIR( dir ); /* two ways to get -gamma_dir ! 
*/ + switch ( dir ) + { + case XUP: + for ( i = 0; i < 3; i++ ) + { + dest->h[0].c[i].real = src->COLORSPINOR( i, 0 ).real - src->COLORSPINOR( i, 3 ).imag; + dest->h[0].c[i].imag = src->COLORSPINOR( i, 0 ).imag + src->COLORSPINOR( i, 3 ).real; + dest->h[1].c[i].real = src->COLORSPINOR( i, 1 ).real - src->COLORSPINOR( i, 2 ).imag; + dest->h[1].c[i].imag = src->COLORSPINOR( i, 1 ).imag + src->COLORSPINOR( i, 2 ).real; + } + break; + case XDOWN: + for ( i = 0; i < 3; i++ ) + { + dest->h[0].c[i].real = src->COLORSPINOR( i, 0 ).real + src->COLORSPINOR( i, 3 ).imag; + dest->h[0].c[i].imag = src->COLORSPINOR( i, 0 ).imag - src->COLORSPINOR( i, 3 ).real; + dest->h[1].c[i].real = src->COLORSPINOR( i, 1 ).real + src->COLORSPINOR( i, 2 ).imag; + dest->h[1].c[i].imag = src->COLORSPINOR( i, 1 ).imag - src->COLORSPINOR( i, 2 ).real; + } + break; + case YUP: + for ( i = 0; i < 3; i++ ) + { + dest->h[0].c[i].real = src->COLORSPINOR( i, 0 ).real - src->COLORSPINOR( i, 3 ).real; + dest->h[0].c[i].imag = src->COLORSPINOR( i, 0 ).imag - src->COLORSPINOR( i, 3 ).imag; + dest->h[1].c[i].real = src->COLORSPINOR( i, 1 ).real + src->COLORSPINOR( i, 2 ).real; + dest->h[1].c[i].imag = src->COLORSPINOR( i, 1 ).imag + src->COLORSPINOR( i, 2 ).imag; + } + break; + case YDOWN: + for ( i = 0; i < 3; i++ ) + { + dest->h[0].c[i].real = src->COLORSPINOR( i, 0 ).real + src->COLORSPINOR( i, 3 ).real; + dest->h[0].c[i].imag = src->COLORSPINOR( i, 0 ).imag + src->COLORSPINOR( i, 3 ).imag; + dest->h[1].c[i].real = src->COLORSPINOR( i, 1 ).real - src->COLORSPINOR( i, 2 ).real; + dest->h[1].c[i].imag = src->COLORSPINOR( i, 1 ).imag - src->COLORSPINOR( i, 2 ).imag; + } + break; + case ZUP: + for ( i = 0; i < 3; i++ ) + { + dest->h[0].c[i].real = src->COLORSPINOR( i, 0 ).real - src->COLORSPINOR( i, 2 ).imag; + dest->h[0].c[i].imag = src->COLORSPINOR( i, 0 ).imag + src->COLORSPINOR( i, 2 ).real; + dest->h[1].c[i].real = src->COLORSPINOR( i, 1 ).real + src->COLORSPINOR( i, 3 ).imag; + dest->h[1].c[i].imag = 
src->COLORSPINOR( i, 1 ).imag - src->COLORSPINOR( i, 3 ).real; + } + break; + case ZDOWN: + for ( i = 0; i < 3; i++ ) + { + dest->h[0].c[i].real = src->COLORSPINOR( i, 0 ).real + src->COLORSPINOR( i, 2 ).imag; + dest->h[0].c[i].imag = src->COLORSPINOR( i, 0 ).imag - src->COLORSPINOR( i, 2 ).real; + dest->h[1].c[i].real = src->COLORSPINOR( i, 1 ).real - src->COLORSPINOR( i, 3 ).imag; + dest->h[1].c[i].imag = src->COLORSPINOR( i, 1 ).imag + src->COLORSPINOR( i, 3 ).real; + } + break; + case TUP: + for ( i = 0; i < 3; i++ ) + { + dest->h[0].c[i].real = src->COLORSPINOR( i, 0 ).real + src->COLORSPINOR( i, 2 ).real; + dest->h[0].c[i].imag = src->COLORSPINOR( i, 0 ).imag + src->COLORSPINOR( i, 2 ).imag; + dest->h[1].c[i].real = src->COLORSPINOR( i, 1 ).real + src->COLORSPINOR( i, 3 ).real; + dest->h[1].c[i].imag = src->COLORSPINOR( i, 1 ).imag + src->COLORSPINOR( i, 3 ).imag; + } + break; + case TDOWN: + for ( i = 0; i < 3; i++ ) + { + dest->h[0].c[i].real = src->COLORSPINOR( i, 0 ).real - src->COLORSPINOR( i, 2 ).real; + dest->h[0].c[i].imag = src->COLORSPINOR( i, 0 ).imag - src->COLORSPINOR( i, 2 ).imag; + dest->h[1].c[i].real = src->COLORSPINOR( i, 1 ).real - src->COLORSPINOR( i, 3 ).real; + dest->h[1].c[i].imag = src->COLORSPINOR( i, 1 ).imag - src->COLORSPINOR( i, 3 ).imag; + } + break; + default: + printf( "BAD CALL TO WP_SHRINK()\n" ); + } +} + +void wp_shrink_hch( wilson_vector * src, half_wilson_vector * dest, int dir, int sign ) +{ + register int i; /*color */ + + if( sign == MINUS ) + dir = OPP_DIR( dir ); /* two ways to get -gamma_dir ! 
*/ + switch ( dir ) + { + case XUP: + for ( i = 0; i < 3; i++ ) + { + dest->h[0].c[i].real = src->COLORSPINOR( i, 0 ).real - src->COLORSPINOR( i, 3 ).imag; + dest->h[0].c[i].imag = src->COLORSPINOR( i, 0 ).imag + src->COLORSPINOR( i, 3 ).real; + dest->h[1].c[i].real = src->COLORSPINOR( i, 1 ).real - src->COLORSPINOR( i, 2 ).imag; + dest->h[1].c[i].imag = src->COLORSPINOR( i, 1 ).imag + src->COLORSPINOR( i, 2 ).real; + } + break; + case XDOWN: + for ( i = 0; i < 3; i++ ) + { + dest->h[1].c[i].real = src->COLORSPINOR( i, 1 ).imag - src->COLORSPINOR( i, 2 ).real; + dest->h[1].c[i].imag = -src->COLORSPINOR( i, 1 ).real - src->COLORSPINOR( i, 2 ).imag; + dest->h[0].c[i].real = src->COLORSPINOR( i, 0 ).imag - src->COLORSPINOR( i, 3 ).real; + dest->h[0].c[i].imag = -src->COLORSPINOR( i, 0 ).real - src->COLORSPINOR( i, 3 ).imag; + } + break; + case YUP: + for ( i = 0; i < 3; i++ ) + { + dest->h[0].c[i].real = src->COLORSPINOR( i, 0 ).real - src->COLORSPINOR( i, 3 ).real; + dest->h[0].c[i].imag = src->COLORSPINOR( i, 0 ).imag - src->COLORSPINOR( i, 3 ).imag; + dest->h[1].c[i].real = src->COLORSPINOR( i, 1 ).real + src->COLORSPINOR( i, 2 ).real; + dest->h[1].c[i].imag = src->COLORSPINOR( i, 1 ).imag + src->COLORSPINOR( i, 2 ).imag; + } + break; + case YDOWN: + for ( i = 0; i < 3; i++ ) + { + dest->h[1].c[i].real = src->COLORSPINOR( i, 1 ).real - src->COLORSPINOR( i, 2 ).real; + dest->h[1].c[i].imag = src->COLORSPINOR( i, 1 ).imag - src->COLORSPINOR( i, 2 ).imag; + dest->h[0].c[i].real = -src->COLORSPINOR( i, 0 ).real - src->COLORSPINOR( i, 3 ).real; + dest->h[0].c[i].imag = -src->COLORSPINOR( i, 0 ).imag - src->COLORSPINOR( i, 3 ).imag; + } + break; + case ZUP: + for ( i = 0; i < 3; i++ ) + { + dest->h[0].c[i].real = src->COLORSPINOR( i, 0 ).real - src->COLORSPINOR( i, 2 ).imag; + dest->h[0].c[i].imag = src->COLORSPINOR( i, 0 ).imag + src->COLORSPINOR( i, 2 ).real; + dest->h[1].c[i].real = src->COLORSPINOR( i, 1 ).real + src->COLORSPINOR( i, 3 ).imag; + dest->h[1].c[i].imag 
= src->COLORSPINOR( i, 1 ).imag - src->COLORSPINOR( i, 3 ).real; + } + break; + case ZDOWN: + for ( i = 0; i < 3; i++ ) + { + dest->h[1].c[i].real = src->COLORSPINOR( i, 0 ).imag - src->COLORSPINOR( i, 2 ).real; + dest->h[1].c[i].imag = -src->COLORSPINOR( i, 0 ).real - src->COLORSPINOR( i, 2 ).imag; + dest->h[0].c[i].real = -src->COLORSPINOR( i, 1 ).imag - src->COLORSPINOR( i, 3 ).real; + dest->h[0].c[i].imag = src->COLORSPINOR( i, 1 ).real - src->COLORSPINOR( i, 3 ).imag; + } + break; + case TUP: + for ( i = 0; i < 3; i++ ) + { + dest->h[0].c[i].real = src->COLORSPINOR( i, 0 ).real + src->COLORSPINOR( i, 2 ).real; + dest->h[0].c[i].imag = src->COLORSPINOR( i, 0 ).imag + src->COLORSPINOR( i, 2 ).imag; + dest->h[1].c[i].real = src->COLORSPINOR( i, 1 ).real + src->COLORSPINOR( i, 3 ).real; + dest->h[1].c[i].imag = src->COLORSPINOR( i, 1 ).imag + src->COLORSPINOR( i, 3 ).imag; + } + break; + case TDOWN: + for ( i = 0; i < 3; i++ ) + { + dest->h[1].c[i].real = src->COLORSPINOR( i, 0 ).real - src->COLORSPINOR( i, 2 ).real; + dest->h[1].c[i].imag = src->COLORSPINOR( i, 0 ).imag - src->COLORSPINOR( i, 2 ).imag; + dest->h[0].c[i].real = src->COLORSPINOR( i, 1 ).real - src->COLORSPINOR( i, 3 ).real; + dest->h[0].c[i].imag = src->COLORSPINOR( i, 1 ).imag - src->COLORSPINOR( i, 3 ).imag; + } + break; + default: + printf( "BAD CALL TO WP_SHRINK()\n" ); + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wp_shrink4.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wp_shrink4.c new file mode 100644 index 0000000000000000000000000000000000000000..5434636157647db3d0cc9ed1ec7971023cf11237 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wp_shrink4.c @@ -0,0 +1,233 @@ +/***************** wp_shrink4.c (in su3.a) **************************** +* * +* Shrink a wilson vector in four directions, producing four * +* half_wilson_vectors. 
* +* void wp_shrink_4dir( wilson_vector *a, half_wilson_vector *b1, * +* half_wilson_vector *b2, half_wilson_vector *b3, * +* half_wilson_vector *b4, int sign ); * +* B1 <- (1 +- gamma_x)A,, projection * +* argument "sign" is sign of gamma matrix. * +* See wp_shrink.c for definitions of gamma matrices and eigenvectors. * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" +#include "../include/dirs.h" + +void wp_shrink_4dir( wilson_vector * a, half_wilson_vector * b1, + half_wilson_vector * b2, half_wilson_vector * b3, half_wilson_vector * b4, int sign ) +{ + register int i; /*color */ + + /* wp_shrink( a,b1,XUP,sign); */ + + if( sign == PLUS ) + { + /* case XUP: */ + for ( i = 0; i < 3; i++ ) + { + b1->h[0].c[i].real = a->COLORSPINOR( i, 0 ).real - a->COLORSPINOR( i, 3 ).imag; + b1->h[0].c[i].imag = a->COLORSPINOR( i, 0 ).imag + a->COLORSPINOR( i, 3 ).real; + b1->h[1].c[i].real = a->COLORSPINOR( i, 1 ).real - a->COLORSPINOR( i, 2 ).imag; + b1->h[1].c[i].imag = a->COLORSPINOR( i, 1 ).imag + a->COLORSPINOR( i, 2 ).real; + } + } + else + { + /* case XDOWN: */ + for ( i = 0; i < 3; i++ ) + { + b1->h[0].c[i].real = a->COLORSPINOR( i, 0 ).real + a->COLORSPINOR( i, 3 ).imag; + b1->h[0].c[i].imag = a->COLORSPINOR( i, 0 ).imag - a->COLORSPINOR( i, 3 ).real; + b1->h[1].c[i].real = a->COLORSPINOR( i, 1 ).real + a->COLORSPINOR( i, 2 ).imag; + b1->h[1].c[i].imag = a->COLORSPINOR( i, 1 ).imag - a->COLORSPINOR( i, 2 ).real; + } + } + + + /*wp_shrink( a,b2,YUP,sign); */ + + if( sign == PLUS ) + { + /* case YUP: */ + for ( i = 0; i < 3; i++ ) + { + b2->h[0].c[i].real = a->COLORSPINOR( i, 0 ).real - a->COLORSPINOR( i, 3 ).real; + b2->h[0].c[i].imag = a->COLORSPINOR( i, 0 ).imag - a->COLORSPINOR( i, 3 ).imag; + b2->h[1].c[i].real = a->COLORSPINOR( i, 1 ).real + a->COLORSPINOR( i, 2 ).real; + b2->h[1].c[i].imag = a->COLORSPINOR( i, 1 ).imag + a->COLORSPINOR( i, 2 ).imag; + } + + } + else + { + /* case YDOWN: */ + for ( i = 0; i < 3; i++ ) 
+ { + b2->h[0].c[i].real = a->COLORSPINOR( i, 0 ).real + a->COLORSPINOR( i, 3 ).real; + b2->h[0].c[i].imag = a->COLORSPINOR( i, 0 ).imag + a->COLORSPINOR( i, 3 ).imag; + b2->h[1].c[i].real = a->COLORSPINOR( i, 1 ).real - a->COLORSPINOR( i, 2 ).real; + b2->h[1].c[i].imag = a->COLORSPINOR( i, 1 ).imag - a->COLORSPINOR( i, 2 ).imag; + } + } + + /*wp_shrink( a,b3,ZUP,sign); */ + + if( sign == PLUS ) + { + /* case ZUP: */ + for ( i = 0; i < 3; i++ ) + { + b3->h[0].c[i].real = a->COLORSPINOR( i, 0 ).real - a->COLORSPINOR( i, 2 ).imag; + b3->h[0].c[i].imag = a->COLORSPINOR( i, 0 ).imag + a->COLORSPINOR( i, 2 ).real; + b3->h[1].c[i].real = a->COLORSPINOR( i, 1 ).real + a->COLORSPINOR( i, 3 ).imag; + b3->h[1].c[i].imag = a->COLORSPINOR( i, 1 ).imag - a->COLORSPINOR( i, 3 ).real; + } + } + else + { + /* case ZDOWN: */ + for ( i = 0; i < 3; i++ ) + { + b3->h[0].c[i].real = a->COLORSPINOR( i, 0 ).real + a->COLORSPINOR( i, 2 ).imag; + b3->h[0].c[i].imag = a->COLORSPINOR( i, 0 ).imag - a->COLORSPINOR( i, 2 ).real; + b3->h[1].c[i].real = a->COLORSPINOR( i, 1 ).real - a->COLORSPINOR( i, 3 ).imag; + b3->h[1].c[i].imag = a->COLORSPINOR( i, 1 ).imag + a->COLORSPINOR( i, 3 ).real; + } + + } + + /*wp_shrink( a,b4,TUP,sign); */ + + if( sign == PLUS ) + { + /* case TUP: */ + for ( i = 0; i < 3; i++ ) + { + b4->h[0].c[i].real = a->COLORSPINOR( i, 0 ).real + a->COLORSPINOR( i, 2 ).real; + b4->h[0].c[i].imag = a->COLORSPINOR( i, 0 ).imag + a->COLORSPINOR( i, 2 ).imag; + b4->h[1].c[i].real = a->COLORSPINOR( i, 1 ).real + a->COLORSPINOR( i, 3 ).real; + b4->h[1].c[i].imag = a->COLORSPINOR( i, 1 ).imag + a->COLORSPINOR( i, 3 ).imag; + } + } + else + { + /* case TDOWN: */ + for ( i = 0; i < 3; i++ ) + { + b4->h[0].c[i].real = a->COLORSPINOR( i, 0 ).real - a->COLORSPINOR( i, 2 ).real; + b4->h[0].c[i].imag = a->COLORSPINOR( i, 0 ).imag - a->COLORSPINOR( i, 2 ).imag; + b4->h[1].c[i].real = a->COLORSPINOR( i, 1 ).real - a->COLORSPINOR( i, 3 ).real; + b4->h[1].c[i].imag = a->COLORSPINOR( i, 1 ).imag 
- a->COLORSPINOR( i, 3 ).imag; + } + } +} + +void wp_shrink_4dir_hch( wilson_vector * a, half_wilson_vector * b1, + half_wilson_vector * b2, half_wilson_vector * b3, half_wilson_vector * b4, int sign ) +{ + register int i; /*color */ + + /* wp_shrink( a,b1,XUP,sign); */ + + if( sign == PLUS ) + { + /* case XUP: */ + for ( i = 0; i < 3; i++ ) + { + b1->h[0].c[i].real = a->COLORSPINOR( i, 0 ).real - a->COLORSPINOR( i, 3 ).imag; + b1->h[0].c[i].imag = a->COLORSPINOR( i, 0 ).imag + a->COLORSPINOR( i, 3 ).real; + b1->h[1].c[i].real = a->COLORSPINOR( i, 1 ).real - a->COLORSPINOR( i, 2 ).imag; + b1->h[1].c[i].imag = a->COLORSPINOR( i, 1 ).imag + a->COLORSPINOR( i, 2 ).real; + } + } + else + { + /* case XDOWN: */ + for ( i = 0; i < 3; i++ ) + { + b1->h[1].c[i].real = a->COLORSPINOR( i, 1 ).imag - a->COLORSPINOR( i, 2 ).real; + b1->h[1].c[i].imag = -a->COLORSPINOR( i, 1 ).real - a->COLORSPINOR( i, 2 ).imag; + b1->h[0].c[i].real = a->COLORSPINOR( i, 0 ).imag - a->COLORSPINOR( i, 3 ).real; + b1->h[0].c[i].imag = -a->COLORSPINOR( i, 0 ).real - a->COLORSPINOR( i, 3 ).imag; + } + } + + + /*wp_shrink( a,b2,YUP,sign); */ + + if( sign == PLUS ) + { + /* case YUP: */ + for ( i = 0; i < 3; i++ ) + { + b2->h[0].c[i].real = a->COLORSPINOR( i, 0 ).real - a->COLORSPINOR( i, 3 ).real; + b2->h[0].c[i].imag = a->COLORSPINOR( i, 0 ).imag - a->COLORSPINOR( i, 3 ).imag; + b2->h[1].c[i].real = a->COLORSPINOR( i, 1 ).real + a->COLORSPINOR( i, 2 ).real; + b2->h[1].c[i].imag = a->COLORSPINOR( i, 1 ).imag + a->COLORSPINOR( i, 2 ).imag; + } + + } + else + { + /* case YDOWN: */ + for ( i = 0; i < 3; i++ ) + { + b2->h[1].c[i].real = a->COLORSPINOR( i, 1 ).real - a->COLORSPINOR( i, 2 ).real; + b2->h[1].c[i].imag = a->COLORSPINOR( i, 1 ).imag - a->COLORSPINOR( i, 2 ).imag; + b2->h[0].c[i].real = -a->COLORSPINOR( i, 0 ).real - a->COLORSPINOR( i, 3 ).real; + b2->h[0].c[i].imag = -a->COLORSPINOR( i, 0 ).imag - a->COLORSPINOR( i, 3 ).imag; + } + } + + /*wp_shrink( a,b3,ZUP,sign); */ + + if( sign == PLUS ) + 
{ + /* case ZUP: */ + for ( i = 0; i < 3; i++ ) + { + b3->h[0].c[i].real = a->COLORSPINOR( i, 0 ).real - a->COLORSPINOR( i, 2 ).imag; + b3->h[0].c[i].imag = a->COLORSPINOR( i, 0 ).imag + a->COLORSPINOR( i, 2 ).real; + b3->h[1].c[i].real = a->COLORSPINOR( i, 1 ).real + a->COLORSPINOR( i, 3 ).imag; + b3->h[1].c[i].imag = a->COLORSPINOR( i, 1 ).imag - a->COLORSPINOR( i, 3 ).real; + } + } + else + { + /* case ZDOWN: */ + for ( i = 0; i < 3; i++ ) + { + b3->h[1].c[i].real = a->COLORSPINOR( i, 0 ).imag - a->COLORSPINOR( i, 2 ).real; + b3->h[1].c[i].imag = -a->COLORSPINOR( i, 0 ).real - a->COLORSPINOR( i, 2 ).imag; + b3->h[0].c[i].real = -a->COLORSPINOR( i, 1 ).imag - a->COLORSPINOR( i, 3 ).real; + b3->h[0].c[i].imag = a->COLORSPINOR( i, 1 ).real - a->COLORSPINOR( i, 3 ).imag; + } + + } + + /*wp_shrink( a,b4,TUP,sign); */ + + if( sign == PLUS ) + { + /* case TUP: */ + for ( i = 0; i < 3; i++ ) + { + b4->h[0].c[i].real = a->COLORSPINOR( i, 0 ).real + a->COLORSPINOR( i, 2 ).real; + b4->h[0].c[i].imag = a->COLORSPINOR( i, 0 ).imag + a->COLORSPINOR( i, 2 ).imag; + b4->h[1].c[i].real = a->COLORSPINOR( i, 1 ).real + a->COLORSPINOR( i, 3 ).real; + b4->h[1].c[i].imag = a->COLORSPINOR( i, 1 ).imag + a->COLORSPINOR( i, 3 ).imag; + } + } + else + { + /* case TDOWN: */ + for ( i = 0; i < 3; i++ ) + { + b4->h[1].c[i].real = a->COLORSPINOR( i, 0 ).real - a->COLORSPINOR( i, 2 ).real; + b4->h[1].c[i].imag = a->COLORSPINOR( i, 0 ).imag - a->COLORSPINOR( i, 2 ).imag; + b4->h[0].c[i].real = a->COLORSPINOR( i, 1 ).real - a->COLORSPINOR( i, 3 ).real; + b4->h[0].c[i].imag = a->COLORSPINOR( i, 1 ).imag - a->COLORSPINOR( i, 3 ).imag; + } + } +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wp_shrink8.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wp_shrink8.c new file mode 100644 index 0000000000000000000000000000000000000000..1249e2c1c57f7a86b55ffa2abbe662ee0e86ede2 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wp_shrink8.c @@ -0,0 +1,27 
@@ +/***************** wp_shrink8.c (in su3.a) **************************** +* * +* Shrink a wilson vector in eight directions, producing eight * +* half_wilson_vectors. * +* void wp_shrink_8dir(a,b,sign) * +* wilson_vector *a; half_wilson_vector *b; * +* int sign; * +* B1 <- (1 +- gamma_x)A,, projection * +* argument "sign" is sign of gamma matrix. * +* See wp_shrink.c for definitions of gamma matrices and eigenvectors. * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" +#include "../include/dirs.h" + +void wp_shrink_8dir( wilson_vector * a, half_wilson_vector * b, int sign ) +{ + wp_shrink( a, &( b[XUP] ), XUP, sign ); + wp_shrink( a, &( b[YUP] ), YUP, sign ); + wp_shrink( a, &( b[ZUP] ), ZUP, sign ); + wp_shrink( a, &( b[TUP] ), TUP, sign ); + wp_shrink( a, &( b[XDOWN] ), XDOWN, sign ); + wp_shrink( a, &( b[YDOWN] ), YDOWN, sign ); + wp_shrink( a, &( b[ZDOWN] ), ZDOWN, sign ); + wp_shrink( a, &( b[TDOWN] ), TDOWN, sign ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wvec2_dot.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wvec2_dot.c new file mode 100644 index 0000000000000000000000000000000000000000..06f256c396a3cad7cedd1cc80d6bd391df7097c8 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wvec2_dot.c @@ -0,0 +1,31 @@ +/****************** wvec2_dot.c (in su3.a) ***************************/ +/* MIMD version 6 */ +/* * + * complex wvec2_dot( wilson_vector *a, wilson_vector *b ) * + * return dot product of two wilson_vectors = a-dagger times b * + */ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +complex wvec2_dot( wilson_vector * a, wilson_vector * b ) +{ + complex temp; + wilson_vector c; + register int i, j; + + temp.real = wvec_rdot( a, b ); + + for ( i = 0; i < 4; i++ ) + { + for ( j = 0; j < 3; j++ ) + { + c.COLORSPINOR( j, i ).real = -( a->COLORSPINOR( j, i ).imag ); + c.COLORSPINOR( j, i ).imag = 
a->COLORSPINOR( j, i ).real; + } + } + + temp.imag = wvec_rdot( &c, b ); + + return ( temp ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wvec_dot.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wvec_dot.c new file mode 100644 index 0000000000000000000000000000000000000000..7524126bb39a8d36c44f5a3603df1217c32d0185 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wvec_dot.c @@ -0,0 +1,45 @@ +/****************** wvec_dot.c (in su3.a) ****************************/ +/* MIMD version 6 */ +/* * + * complex wvec_dot(a,b) wilson_vector *a,*b; * + * return dot product of two wilson_vectors * + */ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +complex wvec_dot( wilson_vector * a, wilson_vector * b ) +{ + complex temp1, temp2; + register int i; + temp1.real = temp1.imag = 0.0; + for ( i = 0; i < 4; i++ ) + { + CMULJ_( a->COLORSPINOR( 0, i ), b->COLORSPINOR( 0, i ), temp2 ); + CSUM( temp1, temp2 ); + CMULJ_( a->COLORSPINOR( 1, i ), b->COLORSPINOR( 1, i ), temp2 ); + CSUM( temp1, temp2 ); + CMULJ_( a->COLORSPINOR( 2, i ), b->COLORSPINOR( 2, i ), temp2 ); + CSUM( temp1, temp2 ); + } + return ( temp1 ); + +} + +complex hwvec_dot( half_wilson_vector * a, half_wilson_vector * b ) +{ + complex temp1, temp2; + register int i; + temp1.real = temp1.imag = 0.0; + for ( i = 0; i < 2; i++ ) + { + CMULJ_( a->h[i].c[0], b->h[i].c[0], temp2 ); + CSUM( temp1, temp2 ); + CMULJ_( a->h[i].c[1], b->h[i].c[1], temp2 ); + CSUM( temp1, temp2 ); + CMULJ_( a->h[i].c[2], b->h[i].c[2], temp2 ); + CSUM( temp1, temp2 ); + } + return ( temp1 ); + +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wvec_rdot.c b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wvec_rdot.c new file mode 100644 index 0000000000000000000000000000000000000000..6475bf6f44f164e6606571d85bb9ce11d9839933 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/libraries/wvec_rdot.c @@ -0,0 +1,79 @@ 
+/***************** wvec_rdot.c (in su3.a) ****************************** +* * +* double wvec_rdot( wilson_vector *a, wilson_vector *b ) * +* return real part of dot product of two wilson_vectors * +*/ +#include "../include/config.h" +#include "../include/complex.h" +#include "../include/su3.h" + +double wvec_rdot( wilson_vector * a, wilson_vector * b ) +{ + register double ar, ai, br, bi, ss; + + ar = a->COLORSPINOR( 0, 0 ).real; + ai = a->COLORSPINOR( 0, 0 ).imag; + br = b->COLORSPINOR( 0, 0 ).real; + bi = b->COLORSPINOR( 0, 0 ).imag; + ss = ar * br + ai * bi; + ar = a->COLORSPINOR( 1, 0 ).real; + ai = a->COLORSPINOR( 1, 0 ).imag; + br = b->COLORSPINOR( 1, 0 ).real; + bi = b->COLORSPINOR( 1, 0 ).imag; + ss += ar * br + ai * bi; + ar = a->COLORSPINOR( 2, 0 ).real; + ai = a->COLORSPINOR( 2, 0 ).imag; + br = b->COLORSPINOR( 2, 0 ).real; + bi = b->COLORSPINOR( 2, 0 ).imag; + ss += ar * br + ai * bi; + + ar = a->COLORSPINOR( 0, 1 ).real; + ai = a->COLORSPINOR( 0, 1 ).imag; + br = b->COLORSPINOR( 0, 1 ).real; + bi = b->COLORSPINOR( 0, 1 ).imag; + ss += ar * br + ai * bi; + ar = a->COLORSPINOR( 1, 1 ).real; + ai = a->COLORSPINOR( 1, 1 ).imag; + br = b->COLORSPINOR( 1, 1 ).real; + bi = b->COLORSPINOR( 1, 1 ).imag; + ss += ar * br + ai * bi; + ar = a->COLORSPINOR( 2, 1 ).real; + ai = a->COLORSPINOR( 2, 1 ).imag; + br = b->COLORSPINOR( 2, 1 ).real; + bi = b->COLORSPINOR( 2, 1 ).imag; + ss += ar * br + ai * bi; + + ar = a->COLORSPINOR( 0, 2 ).real; + ai = a->COLORSPINOR( 0, 2 ).imag; + br = b->COLORSPINOR( 0, 2 ).real; + bi = b->COLORSPINOR( 0, 2 ).imag; + ss += ar * br + ai * bi; + ar = a->COLORSPINOR( 1, 2 ).real; + ai = a->COLORSPINOR( 1, 2 ).imag; + br = b->COLORSPINOR( 1, 2 ).real; + bi = b->COLORSPINOR( 1, 2 ).imag; + ss += ar * br + ai * bi; + ar = a->COLORSPINOR( 2, 2 ).real; + ai = a->COLORSPINOR( 2, 2 ).imag; + br = b->COLORSPINOR( 2, 2 ).real; + bi = b->COLORSPINOR( 2, 2 ).imag; + ss += ar * br + ai * bi; + + ar = a->COLORSPINOR( 0, 3 ).real; + ai = 
a->COLORSPINOR( 0, 3 ).imag; + br = b->COLORSPINOR( 0, 3 ).real; + bi = b->COLORSPINOR( 0, 3 ).imag; + ss += ar * br + ai * bi; + ar = a->COLORSPINOR( 1, 3 ).real; + ai = a->COLORSPINOR( 1, 3 ).imag; + br = b->COLORSPINOR( 1, 3 ).real; + bi = b->COLORSPINOR( 1, 3 ).imag; + ss += ar * br + ai * bi; + ar = a->COLORSPINOR( 2, 3 ).real; + ai = a->COLORSPINOR( 2, 3 ).imag; + br = b->COLORSPINOR( 2, 3 ).real; + bi = b->COLORSPINOR( 2, 3 ).imag; + ss += ar * br + ai * bi; + + return ( ss ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/make_lattice.c b/qcd/part_cpu/applications/QCD/src/kernel_E/make_lattice.c new file mode 100644 index 0000000000000000000000000000000000000000..7c541f3c8be8a1312bac2f2949f41201107f1d7c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/make_lattice.c @@ -0,0 +1,77 @@ +/******** make_lattice.c *********/ + +/* 1. Allocates space for the lattice fields, as specified by the + application site structure. Fills in coordinates, parity, index. + 2. Allocates gen_pt pointers for gather results + 3. 
Initializes site-based random number generator, if specified + by macro SITERAND */ + +#include "./include/includes.h" +void make_lattice( ) +{ + register int i, n; + int x, y, z, t; + + /* allocate space for lattice, fill in parity, coordinates and index */ + MEMALIGN(lattice, site, sites_on_node); + + /* Allocate address vectors */ + for ( i = 0; i < N_POINTERS; i++ ) + MEMALIGN(gen_pt[i], char *, sites_on_node); + + for ( t = 0; t < nt; t++ ) + for ( z = 0; z < nz; z++ ) + for ( y = 0; y < ny; y++ ) + for ( x = 0; x < nx; x++ ) + { + if( node_number_KE( x, y, z, t ) == mynode_KE( ) ) + { + i = node_index_KE( x, y, z, t ); + lattice[i].x = x; + lattice[i].y = y; + lattice[i].z = z; + lattice[i].t = t; + lattice[i].index = x + nx * ( y + ny * ( z + nz * t ) ); + if( ( x + y + z + t ) % 2 == 0 ) + lattice[i].parity = EVEN; + else + lattice[i].parity = ODD; + } + } + + /* matrices */ + MEMALIGN(gauge, su3_matrix, 4*sites_on_node); + MEMALIGN(gauge_32, float, 4*18*sites_on_node); + for (i=0; i<8; i++){ + MEMALIGN(htmp[i],half_wilson_vector, sites_on_node); + MEMALIGN(htmp_32[i],float, 12*sites_on_node); + } + node0_fprintf( file_o1, "make_lattice: Mallocing %.1f Mbytes per node\n", ( double ) memsize / 1e6 ); +} + +/* returns i_mu */ +int site_mu( int i, int mu ) +{ + if( mu == 0 ) + return lattice[i].x; + if( mu == 1 ) + return lattice[i].y; + if( mu == 2 ) + return lattice[i].z; + if( mu == 3 ) + return lattice[i].t; + node0_printf( "ERROR site_mu: Out of direction range!\n" ); + terminate_KE( 0 ); + return -1; +} + +/* taxi driver distance from the origin */ +int taxi_dist( int j ) +{ + int dist; + dist = ( lattice[j].x <= nx / 2 ) ? ( lattice[j].x ) : ( nx - lattice[j].x ); + dist += ( lattice[j].y <= ny / 2 ) ? ( lattice[j].y ) : ( ny - lattice[j].y ); + dist += ( lattice[j].z <= nz / 2 ) ? ( lattice[j].z ) : ( nz - lattice[j].z ); + dist += ( lattice[j].t <= nt / 2 ) ? 
( lattice[j].t ) : ( nt - lattice[j].t ); + return dist; +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/mt19937-64.c b/qcd/part_cpu/applications/QCD/src/kernel_E/mt19937-64.c new file mode 100644 index 0000000000000000000000000000000000000000..2deb32902838c268f5cc7984119e617b78e06a38 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/mt19937-64.c @@ -0,0 +1,191 @@ +/* + A C-program for MT19937-64 (2004/9/29 version). + Coded by Takuji Nishimura and Makoto Matsumoto. + + This is a 64-bit version of Mersenne Twister pseudorandom number + generator. + + Before using, initialize the state by using init_genrand64(seed) + or init_by_array64(init_key, key_length). + + Copyright (C) 2004, Makoto Matsumoto and Takuji Nishimura, + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + 3. The names of its contributors may not be used to endorse or promote + products derived from this software without specific prior written + permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + References: + T. Nishimura, ``Tables of 64-bit Mersenne Twisters'' + ACM Transactions on Modeling and + Computer Simulation 10. (2000) 348--357. + M. Matsumoto and T. Nishimura, + ``Mersenne Twister: a 623-dimensionally equidistributed + uniform pseudorandom number generator'' + ACM Transactions on Modeling and + Computer Simulation 8. (Jan. 1998) 3--30. + + Any feedback is very welcome. + http://www.math.hiroshima-u.ac.jp/~m-mat/MT/emt.html + email: m-mat @ math.sci.hiroshima-u.ac.jp (remove spaces) +*/ + + +#include <stdio.h> + +/* initializes mt[NN] with a seed */ +void init_genrand64(unsigned long long seed); + +/* initialize by an array with array-length */ +/* init_key is the array for initializing keys */ +/* key_length is its length */ +void init_by_array64(unsigned long long init_key[], + unsigned long long key_length); + +/* generates a random number on [0, 2^64-1]-interval */ +unsigned long long genrand64_int64(void); + + +/* generates a random number on [0, 2^63-1]-interval */ +long long genrand64_int63(void); + +/* generates a random number on [0,1]-real-interval */ +double genrand64_real1(void); + +/* generates a random number on [0,1)-real-interval */ +double genrand64_real2(void); + +/* generates a random number on (0,1)-real-interval */ +double genrand64_real3(void); +#define NN 312 +#define MM 156 +#define MATRIX_A 0xB5026F5AA96619E9ULL +#define UM 0xFFFFFFFF80000000ULL /* Most significant 33 bits */ +#define LM 0x7FFFFFFFULL /* Least 
significant 31 bits */ + + +/* The array for the state vector */ +unsigned long long mt[NN]; +/* mti==NN+1 means mt[NN] is not initialized */ +/* static int mti=NN+1; */ +/* Not static because random.c declares it extern and resets it; it still starts at the NN+1 sentinel so genrand64_int64() seeds itself when no init call was made. */ +int mti=NN+1; + +/* initializes mt[NN] with a seed */ +/* Fills all NN state words from the seed and leaves mti == NN, so the next draw regenerates the state. */ +void init_genrand64(unsigned long long seed) +{ + mt[0] = seed; + for (mti=1; mti<NN; mti++) + mt[mti] = (6364136223846793005ULL * (mt[mti-1] ^ (mt[mti-1] >> 62)) + mti); +} + +/* initialize by an array with array-length */ +/* init_key is the array for initializing keys */ +/* key_length is its length */ +/* init_key[0] is read unconditionally, so key_length must be at least 1. */ +void init_by_array64(unsigned long long init_key[], + unsigned long long key_length) +{ + unsigned long long i, j, k; + init_genrand64(19650218ULL); + i=1; j=0; + k = (NN>key_length ? NN : key_length); + for (; k; k--) { + mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 62)) * 3935559000370003845ULL)) + + init_key[j] + j; /* non linear */ + i++; j++; + if (i>=NN) { mt[0] = mt[NN-1]; i=1; } + if (j>=key_length) j=0; + } + for (k=NN-1; k; k--) { + mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 62)) * 2862933555777941757ULL)) + - i; /* non linear */ + i++; + if (i>=NN) { mt[0] = mt[NN-1]; i=1; } + } + + mt[0] = 1ULL << 63; /* MSB is 1; assuring non-zero initial array */ +} + +/* generates a random number on [0, 2^64-1]-interval */ +unsigned long long genrand64_int64(void) +{ + int i; + unsigned long long x; + static unsigned long long mag01[2]={0ULL, MATRIX_A}; + + if (mti >= NN) { /* generate NN words at one time */ + + /* if init_genrand64() has not been called, */ + /* a default initial seed is used */ + if (mti == NN+1) + init_genrand64(5489ULL); + + for (i=0;i<NN-MM;i++) { + x = (mt[i]&UM)|(mt[i+1]&LM); + mt[i] = mt[i+MM] ^ (x>>1) ^ mag01[(int)(x&1ULL)]; + } + for (;i<NN-1;i++) { + x = (mt[i]&UM)|(mt[i+1]&LM); + mt[i] = mt[i+(MM-NN)] ^ (x>>1) ^ mag01[(int)(x&1ULL)]; + } + x = (mt[NN-1]&UM)|(mt[0]&LM); + mt[NN-1] = mt[MM-1] ^ (x>>1) ^ mag01[(int)(x&1ULL)]; + + mti = 0; + } + + x = mt[mti++]; + + x ^= (x >> 29) & 0x5555555555555555ULL; + x ^= (x << 17) & 0x71D67FFFEDA60000ULL; + x ^= (x << 37) & 0xFFF7EEE000000000ULL; + x ^= (x >> 43); + + return x; +} + +/* generates a random number on [0, 2^63-1]-interval */ +long long genrand64_int63(void) +{ + return (long 
long)(genrand64_int64() >> 1); +} + +/* generates a random number on [0,1]-real-interval */ +double genrand64_real1(void) +{ + return (genrand64_int64() >> 11) * (1.0/9007199254740991.0); +} + +/* generates a random number on [0,1)-real-interval */ +double genrand64_real2(void) +{ + return (genrand64_int64() >> 11) * (1.0/9007199254740992.0); +} + +/* generates a random number on (0,1)-real-interval */ +double genrand64_real3(void) +{ + return ((genrand64_int64() >> 12) + 0.5) * (1.0/4503599627370496.0); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/mult_fmat.c b/qcd/part_cpu/applications/QCD/src/kernel_E/mult_fmat.c new file mode 100644 index 0000000000000000000000000000000000000000..8d3f707f4024c8e862bc445c79dc1fffbfc197b5 --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/mult_fmat.c @@ -0,0 +1,50 @@ +/************* mult_Hw.c *******************************/ +/* + mult_Hw(wilson_vector *src, wilson_vector *dest); + dest(x)=g5*{src(x)-kappa*SUM_dirs( (1+g[dir])*U(x,dir)*src(x+dir) + +(1-g[dir])*U+(x-dir,dir)*src(x-dir))} + */ + +#include "./include/includes.h" + +void dslash( wilson_vector * src, wilson_vector * dest, int isign, int parity ); +void dslash_32( float * src, float * dest, int isign, int parity ); + +/* WILSON */ + +/* single precision matrix multiplication */ +void multiply_fmat_32( float * src, float * dest, int isign ) +{ + dslash_32( src, dest, isign, EVEN ); + dslash_32( src, dest, isign, ODD ); + latutil_xpay_32(src,-(float)kappa,dest,EVENANDODD); + +} +void multiply_hfmat_32( float * src, float * dest ) +{ + + dslash_32( src, dest, 1, EVENANDODD ); + latutil_5xpay_32(src,-(float)kappa,dest,EVENANDODD); + +} +void multiply_hfmat( wilson_vector * src, wilson_vector * dest ) +{ + int i; + site *s; + dslash( src, dest, 1, EVENANDODD ); + FORALLSITES(i,s) + { + scalar_mult_add_wvec(&(src[i]),&(dest[i]),-kappa,&(dest[i])); + g5_mult_wvec( &( dest[i] ), &( dest[i] ) ); + } +} +void multiply_fmat( wilson_vector * src, 
wilson_vector * dest, int isign) +{ + int i; + site *s; + + dslash( src, dest, isign, EVENANDODD ); + + FORALLSITES(i,s) + scalar_mult_add_wvec(&(src[i]),&(dest[i]),-kappa,&(dest[i])); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/output b/qcd/part_cpu/applications/QCD/src/kernel_E/output new file mode 100644 index 0000000000000000000000000000000000000000..b53abbd87a9151be67e642290f25f91261aa61ab --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/output @@ -0,0 +1,88 @@ +initial_set: A simple QCD benchmark -- 081110_073708 -- +initial_set: Based on MILC v6 +initial_set: Machine = Generic communication, with 128 nodes +get_i: nx 32 +get_i: ny 32 +get_i: nz 32 +get_i: nt 64 +get_f: mass_wilson -0.01 +get_i: verbose 1 +initial_set: Done +setup_layout: 4d evenfirst +setup_layout: local lattice size: 16 x 8 x 8 x 16 +make_lattice: Mallocing 33.8 Mbytes per node +ranlat: Random gauge configuration loaded +d_plaquette: 4.367005313551050e-02 4.055214210611848e-02 +reload_lattice: time= 0.11 checkplaq: 4.367005e-02 4.055214e-02 +reload_lattice: Unitarity checked. Max deviation 1.11e-16 +congrad_32: 0 prec= 7.286045e-01 +congrad_32: it= 20 8.42 sec 0.112 GFlop/s/thread (9.441116e+08 Flop) +congrad_orig: 0 prec= 9.337358e-01 +congrad_orig: end 42 prec= 6.503263e-09 mvm= 84 time= 12.5 +congrad_orig: it= 42 12.5 sec 0.16 GFlop/s/thread (2.002649e+09 Flop) +initial_set: A simple QCD benchmark -- 081110_074200 -- +initial_set: Based on MILC v6 +initial_set: Machine = Generic communication, with 128 nodes +get_i: nx 32 +get_i: ny 32 +get_i: nz 32 +get_i: nt 64 +get_f: mass_wilson -0.01 +get_i: verbose 1 +initial_set: Done +setup_layout: 4d evenfirst +setup_layout: local lattice size: 16 x 8 x 8 x 16 +make_lattice: Mallocing 33.8 Mbytes per node +ranlat: Random gauge configuration loaded +d_plaquette: 4.367005313551050e-02 4.055214210611848e-02 +reload_lattice: time= 0.11 checkplaq: 4.367005e-02 4.055214e-02 +reload_lattice: Unitarity checked. 
Max deviation 1.11e-16 +congrad_32: 0 prec= 7.286045e-01 +congrad_32: it= 20 8.41 sec 0.112 GFlop/s/thread (9.441116e+08 Flop) +congrad_orig: 0 prec= 9.337358e-01 +congrad_orig: end 42 prec= 6.503263e-09 mvm= 84 time= 12.5 +congrad_orig: it= 42 12.5 sec 0.16 GFlop/s/thread (2.002649e+09 Flop) +initial_set: A simple QCD benchmark -- 081110_074352 -- +initial_set: Based on MILC v6 +initial_set: Machine = Generic communication, with 256 nodes +get_i: nx 32 +get_i: ny 32 +get_i: nz 32 +get_i: nt 64 +get_f: mass_wilson -0.01 +get_i: verbose 1 +initial_set: Done +setup_layout: 4d evenfirst +setup_layout: local lattice size: 8 x 8 x 8 x 16 +make_lattice: Mallocing 16.9 Mbytes per node +ranlat: Random gauge configuration loaded +d_plaquette: 3.515844744463865e-02 3.545999095838800e-02 +reload_lattice: time= 0.0551 checkplaq: 3.515845e-02 3.545999e-02 +reload_lattice: Unitarity checked. Max deviation 1.11e-16 +congrad_32: 0 prec= 7.302510e-01 +congrad_32: it= 20 4.23 sec 0.112 GFlop/s/thread (4.720558e+08 Flop) +congrad_orig: 0 prec= 9.329330e-01 +congrad_orig: end 41 prec= 9.714454e-09 mvm= 82 time= 6.24 +congrad_orig: it= 41 6.24 sec 0.157 GFlop/s/thread (9.780265e+08 Flop) +initial_set: A simple QCD benchmark -- 081110_074555 -- +initial_set: Based on MILC v6 +initial_set: Machine = Generic communication, with 512 nodes +get_i: nx 32 +get_i: ny 32 +get_i: nz 32 +get_i: nt 64 +get_f: mass_wilson -0.01 +get_i: verbose 1 +initial_set: Done +setup_layout: 4d evenfirst +setup_layout: local lattice size: 8 x 8 x 8 x 8 +make_lattice: Mallocing 8.5 Mbytes per node +ranlat: Random gauge configuration loaded +d_plaquette: 2.904621520885099e-02 1.859144319355961e-02 +reload_lattice: time= 0.0276 checkplaq: 2.904622e-02 1.859144e-02 +reload_lattice: Unitarity checked. 
Max deviation 1.11e-16 +congrad_32: 0 prec= 7.220887e-01 +congrad_32: it= 20 2.16 sec 0.109 GFlop/s/thread (2.360279e+08 Flop) +congrad_orig: 0 prec= 9.189107e-01 +congrad_orig: end 41 prec= 8.836237e-09 mvm= 82 time= 3.2 +congrad_orig: it= 41 3.2 sec 0.153 GFlop/s/thread (4.890132e+08 Flop) diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/parameters b/qcd/part_cpu/applications/QCD/src/kernel_E/parameters new file mode 100644 index 0000000000000000000000000000000000000000..4bb102c4e8b5084b8e7d3633b128cf5ea4e2d0fa --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/parameters @@ -0,0 +1,12 @@ +#lattice +nx 32 +ny 32 +nz 32 +nt 64 +totnodes 4 4 8 8 + +#wilson +mass_wilson -0.01 + +#etc +verbose 1 diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/random.c b/qcd/part_cpu/applications/QCD/src/kernel_E/random.c new file mode 100644 index 0000000000000000000000000000000000000000..734119e15c321e57c6dc9d29c892460aec7f143c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/random.c @@ -0,0 +1,103 @@ +#include "./include/includes.h" +#include "./include/random.h" + + +/* 64 bit MT PRNG from mt19937-64.c */ +#define NN 312 +extern int mti; +extern unsigned long long mt[NN]; +void init_genrand64(unsigned long long seed); +double genrand64_real1(void); + +/* thin wrapper: one draw from genrand64_real1(); takes no arguments */ +double myrand() +{ + return genrand64_real1(); +} + +/* generate new rng.dat/ from rng.dat.init */ +void make_rngdat() +{ + int idum=4711; + unsigned long long seed; + + /* start MT */ + mti=NN+1; + if (idum<0) idum=-idum; + seed=(unsigned long long)idum; + init_genrand64( seed ); + + g_sync_KE(); + ranend(); +} + +void ranstart() +{ + make_rngdat(); +} + +void ranend() +{ +} + +/* Fills r_su2->a[0] with cos(arg) and exactly one of a[1..3] with sin(arg) (the other two are zeroed), where arg = 2*pi*eps*(u - 0.5) and u is one myrand() draw; a second draw picks which of the three components gets the sine. */ +void random_su2( su2_matr_comp * r_su2, double eps ) +{ + double arg_cs, cs, sn, su2_select; + + /* myrand() is defined without parameters, so it is called without the former dummy argument */ + arg_cs = 2 * M_PI * eps * ( myrand( ) - 0.5 ); + cs = cos( arg_cs ); + sn = sin( arg_cs ); + r_su2->a[0] = cs; + su2_select = myrand( ); + if( su2_select < 0.3333333 ) + { + r_su2->a[1] = sn; + r_su2->a[2] = 0; + 
r_su2->a[3] = 0; + } + else if( su2_select < 0.6666667 ) + { + r_su2->a[1] = 0; + r_su2->a[2] = sn; + r_su2->a[3] = 0; + } + else + { + r_su2->a[1] = 0; + r_su2->a[2] = 0; + r_su2->a[3] = sn; + } +} + +/* Sets *r_su3 to the 3x3 identity and then overwrites the 2x2 sub-block on rows/columns (a,b), a < b, with the matrix built from one random_su2() draw. */ +void random_su3_KE( su3_matrix * r_su3, double eps ) +{ + su2_matr_comp t; + int a, b, index, i, j; + + /* pick out an SU(2) subgroup */ + /* a draw of exactly 1.0 gives index 3, which the % 3 below maps to the same pair as index 0 */ + index = 3.0 * myrand( ); + a = ( index + 1 ) % 3; + b = ( index + 2 ) % 3; + if( a > b ) + { + i = a; + a = b; + b = i; + } + for ( i = 0; i < 3; i++ ) + for ( j = 0; j < 3; j++ ) + { + if( i == j ) + { + r_su3->ROWCOL( i, j ) = cmplx_KE( 1, 0 ); + } + else + { + r_su3->ROWCOL( i, j ) = cmplx_KE( 0, 0 ); + } + } + random_su2( &t, eps ); + r_su3->ROWCOL( a, a ) = cmplx_KE( t.a[0], t.a[3] ); + r_su3->ROWCOL( a, b ) = cmplx_KE( t.a[2], t.a[1] ); + r_su3->ROWCOL( b, a ) = cmplx_KE( -t.a[2], t.a[1] ); + r_su3->ROWCOL( b, b ) = cmplx_KE( t.a[0], -t.a[3] ); +} diff --git a/qcd/part_cpu/applications/QCD/src/kernel_E/setup.c b/qcd/part_cpu/applications/QCD/src/kernel_E/setup.c new file mode 100644 index 0000000000000000000000000000000000000000..f0051fc0276535902ea7839d11cba61b65b64e4c --- /dev/null +++ b/qcd/part_cpu/applications/QCD/src/kernel_E/setup.c @@ -0,0 +1,151 @@ +/******** setup.c *********/ +/* MIMD version 6 */ +#define IF_OK if(status==0) + +/* Modifications ... + 9/03/96 Added reload_parallel for gauge fields C.D. + 9/03/96 Added unitarity checking C.D. 
*/ + +#include "./include/includes.h" + +char *mytime( const struct tm *timeptr ) +{ + static char result[26]; + + sprintf( result, "%02d%02d%02d_%02d%02d%02d", + timeptr->tm_year - 100, timeptr->tm_mon + 1, timeptr->tm_mday, timeptr->tm_hour, timeptr->tm_min, timeptr->tm_sec ); + return result; +} + +int setup_KE( char *par_file ) +{ + void initial_set( FILE *f ); + char filename[MAXFILENAME]; + FILE *f; + + debug(); + +/* JuBE: kernel_E.input para file */ + if( ( f = fopen( "kernel_E.input", "r" ) ) == 0 && mynode_KE( ) == 0 ) + { + printf( "ERROR setup: missing parameter file\n" ); + terminate_KE( 0 ); + } + + /* initial output1 */ + if( this_node == 0 ) + { +/* JuBE: output file changed */ + if( ( file_o1 = fopen( "kernel_E.output", "a" ) ) == 0 ) + { + printf( "ERROR setup: cannot open output1 file\n" ); + exit( 0 ); + } + fflush( 0 ); + } + + memsize=0; /* bit optimistic */ + debugflag=0; + + /* print banner, get volume, seed */ + initial_set( f ); + /* Initialize the layout functions, which decide where sites live */ + setup_layout_KE( ); + /* initialize the node random number generator */ + ranstart( ); + + /* allocate space for lattice, set up coordinate fields */ + make_lattice( ); + /* set up neighbor pointers and comlink structures */ + make_nn_gathers( ); + + /* load lattice */ + reload_lattice( startflag, filename ); + convert_gauge(); + + debug(); + + return 0; +} + +/* read parameters */ +void initial_set( FILE * f ) +{ + int status; + char latstart[MAXFILENAME], propend[MAXFILENAME], eigstart[MAXFILENAME]; + time_t now; + char *mytime( const struct tm *timeptr ); + char parid[256]; + int m; + + status = 0; + /* print banner */ + if( this_node == 0 ) + { + now = time( NULL ); + fprintf( file_o1, "initial_set: A simple QCD benchmark -- %s --\n", mytime( localtime( &now ) ) ); + } + node0_fprintf( file_o1, "initial_set: Based on MILC v6\n" ); + node0_fprintf( file_o1, "initial_set: Machine = %s, with %d nodes\n", machine_type_KE( ), numnodes_KE( ) ); 
+ + /* read in parameters - general */ + IF_OK status += get_i_KE( f, "nx", &nx ); + IF_OK status += get_i_KE( f, "ny", &ny ); + IF_OK status += get_i_KE( f, "nz", &nz ); + IF_OK status += get_i_KE( f, "nt", &nt ); + + /* parameters */ + IF_OK status += get_f( f, "mass_wilson", &mass_wilson ); + kappa=1.0/(2*mass_wilson+8.0); + + /* some general things */ + IF_OK status += get_i_KE( f, "max_cg_iters", &max_cg_iters ); + IF_OK status += get_i_KE( f, "verbose", &verbose ); + + g_sync_KE( ); + + if( status ) + terminate_KE( 1 ); + + node0_fprintf( file_o1, "initial_set: Done\n" ); + + volume = nx * ny * nz * nt; +} + +void convert_gauge() +{ +#if LINKDIST_32 == 4 + int i; + for (i=0;i +#include +#include +#include "mpi.h" + +#ifdef UNDERSCORE_CALLS + +#define kernel_a kernel_a_ +#define jube_kernel_init_f jube_kernel_init_f_ +#define jube_kernel_run_f jube_kernel_run_f_ +#define jube_kernel_finalize_f jube_kernel_finalize_f_ +#define jube_kernel_end_f jube_kernel_end_f_ +#endif + + +#ifdef IHPCT_HWC +#include "libhpm.h" +#endif + +#ifdef PAPI +#include + +#define JUBE_PAPI_NMB_EVENTS 2 +char jube_papi_str[PAPI_MAX_STR_LEN]; +int jube_papi_events[JUBE_PAPI_NMB_EVENTS] = {PAPI_TOT_CYC, PAPI_FP_OPS}; +long_long jube_papi_values[JUBE_PAPI_NMB_EVENTS]; +#endif + + +#define NMB_KERNEL 5 + +#define HWPC_ALL 1 +#define HWPC_INIT 2 +#define HWPC_KERNEL 3 +#define HWPC_FINALIZE 4 + +int jube_mympirank; + +int jube_current_kernel; + +int jube_kernel_active[NMB_KERNEL]; + +double jube_time_kernel_init[NMB_KERNEL], + jube_time_kernel_run[NMB_KERNEL], + jube_time_kernel_finalize[NMB_KERNEL], + jube_time_kernel_end[NMB_KERNEL]; + +char *jube_kernel_names[] = {"kernel_A", "kernel_B", "kernel_C", "kernel_D", "kernel_E"}; + +unsigned jube_mem_usage(); + +void jube_init() +{ + + jube_current_kernel = -1; + + printf("JuBE: init QCD benchmark\n"); + + MPI_Comm_rank(MPI_COMM_WORLD, &jube_mympirank); + +#ifdef IHPCT_HWC + printf("JuBE: starting HPM\n"); + hpmInit(0, "QCD"); +#ifdef 
IHPCT_ALL + hpmStart(1, "QCD"); +#endif +#endif + +#ifdef PAPI + assert( PAPI_start_counters(jube_papi_events, JUBE_PAPI_NMB_EVENTS) == PAPI_OK ); +#endif + +} + +void jube_end() +{ + int cnt, num_proc; + double time_total, time_kernel; + + double timings[3*NMB_KERNEL], timings_sq[NMB_KERNEL]; + double buf_timings[3*NMB_KERNEL], buf_timings_sq[NMB_KERNEL]; + +#ifdef IHPCT_HWC +#ifdef IHPCT_ALL + hpmStop(1); +#endif + hpmTerminate(0); + printf("JuBE: terminated HPM\n"); +#endif + +#ifdef PAPI + assert( PAPI_stop_counters(jube_papi_values, JUBE_PAPI_NMB_EVENTS) == PAPI_OK ); +#endif + + + printf("JuBE: end QCD benchmark\n"); + + MPI_Barrier(MPI_COMM_WORLD); + + printf("JuBE: \tkernelname \tinit \t\trun \t\tfinalize\n"); + + for(cnt=0; cnt < NMB_KERNEL; cnt++) + { + timings[cnt + 0] = jube_kernel_active[cnt] ? jube_time_kernel_run[cnt] - jube_time_kernel_init[cnt] : 0; + timings[cnt + NMB_KERNEL] = jube_kernel_active[cnt] ? jube_time_kernel_finalize[cnt] - jube_time_kernel_run[cnt] : 0; + timings[cnt + 2*NMB_KERNEL] = jube_kernel_active[cnt] ? 
jube_time_kernel_end[cnt] - jube_time_kernel_finalize[cnt] : 0; + + timings_sq[cnt] = timings[cnt +0] + timings[cnt + NMB_KERNEL] + timings[cnt + 2*NMB_KERNEL]; + timings_sq[cnt] = timings_sq[cnt] * timings_sq[cnt]; + } + + for(cnt=0; cnt < NMB_KERNEL; cnt++) + { + if(jube_kernel_active[cnt]) + printf("JuBE: \t%s \t%.3e \t%.3e \t%.3e\n", jube_kernel_names[cnt], + timings[cnt], timings[cnt + NMB_KERNEL], timings[cnt + 2*NMB_KERNEL]); + } + + time_total = 0; + for(cnt=0; cnt < NMB_KERNEL; cnt++) + { + if(jube_kernel_active[cnt]) + time_total += timings[cnt] + timings[cnt + NMB_KERNEL] + timings[cnt + 2*NMB_KERNEL]; + } + + printf("JuBE: total time on process %d: %.3e\n", jube_mympirank, time_total); + + MPI_Reduce(timings, buf_timings, 3*NMB_KERNEL, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + MPI_Reduce(timings_sq, buf_timings_sq, NMB_KERNEL, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD); + + if(jube_mympirank == 0) + { + MPI_Comm_size(MPI_COMM_WORLD, &num_proc); + + printf("JuBE global mean timing statistics: \tkernelname \tinit \t\trun \t\tfinalize \ttotal \t\ttotal(stddev)\n"); + + time_total = 0; + for(cnt=0; cnt < NMB_KERNEL; cnt++) + { + time_kernel = buf_timings[cnt] + buf_timings[cnt + NMB_KERNEL] + buf_timings[cnt + 2*NMB_KERNEL]; + printf("JuBE global mean timing statistics: \t%s \t%.3e \t%.3e \t%.3e \t%.3e \t%.3e\n", jube_kernel_names[cnt], + buf_timings[cnt]/num_proc, buf_timings[cnt + NMB_KERNEL]/num_proc, buf_timings[cnt + 2*NMB_KERNEL]/num_proc, + (time_kernel)/num_proc, + sqrt((num_proc*buf_timings_sq[cnt] - time_kernel*time_kernel)/num_proc/(num_proc-1))); + time_total += time_kernel/num_proc; + } + printf("JuBE: total mean run time: %e\n", time_total); + + } + + printf("JuBE: total max mem: %d\n", jube_mem_usage()); +} + +double jube_time_get() +{ + return MPI_Wtime(); +} + +unsigned jube_mem_avail() +{ + return 0; +} + +unsigned jube_mem_usage() +{ +#ifdef HUYGENS + char buf[30]; + unsigned size; // total program size + unsigned resident;// resident set 
size + unsigned share;// shared pages + unsigned text;// text (code) + unsigned lib;// library + unsigned data;// data/stack + unsigned dt;// dirty pages (unused in Linux 2.6) + + snprintf(buf, 30, "/proc/%u/statm", (unsigned)getpid()); + FILE* pf = fopen(buf, "r"); + if (pf) { + fscanf(pf, "%u" /* %u %u %u %u %u"*/, &size/*, &resident, &share, &text, &lib, &data*/); + } + else + { + size = 0; + } + fclose(pf); + + return size; +#else + struct rusage rus; + int ret_rus = getrusage(RUSAGE_SELF, &rus); + assert(0 == ret_rus); + + return rus.ru_maxrss; +#endif + +} + +void jube_kernel_init(int* kernelnumber); +void jube_kernel_init_(int* kernelnumber) +{ + jube_kernel_init(kernelnumber); +} +void jube_kernel_init(int* kernelnumber) +{ + jube_current_kernel = *kernelnumber; + + printf("JuBE: init kernel %s\n", jube_kernel_names[jube_current_kernel]); + + jube_time_kernel_init[jube_current_kernel] = jube_time_get(); + +#ifdef IHPCT_HWC +#ifndef IHPCT_ALL + hpmStart(jube_current_kernel, jube_kernel_names[jube_current_kernel]); +#endif +#endif + +#ifdef PAPI + assert( PAPI_read_counters(jube_papi_values, JUBE_PAPI_NMB_EVENTS) == PAPI_OK ); +#endif + +} + +void jube_kernel_run(); +void jube_kernel_run_() +{ + jube_kernel_run(); +} +void jube_kernel_run() +{ + + printf("JuBE: run kernel %s\n", jube_kernel_names[jube_current_kernel]); + + jube_time_kernel_run[jube_current_kernel] = jube_time_get(); +} + + +void jube_kernel_finalize(); +void jube_kernel_finalize_() +{ + jube_kernel_finalize(); +} +void jube_kernel_finalize() +{ + + printf("JuBE: finalize kernel %s\n", jube_kernel_names[jube_current_kernel]); + + jube_time_kernel_finalize[jube_current_kernel] = jube_time_get(); +} + +void jube_kernel_end(); +void jube_kernel_end_() +{ + jube_kernel_end(); +} +void jube_kernel_end() +{ + int cnt; + +#ifdef IHPCT_HWC +#ifndef IHPCT_ALL + hpmStop(jube_current_kernel); +#endif +#endif + +#ifdef PAPI + assert( PAPI_read_counters(jube_papi_values, JUBE_PAPI_NMB_EVENTS) == PAPI_OK ); 
+ for(cnt=0; cnt + + + + + + + perl ./run/verify_qcd.pl $subdir/verify.xml + + + + + + + diff --git a/qcd/part_cpu/bench/ChangeLog.txt b/qcd/part_cpu/bench/ChangeLog.txt new file mode 100644 index 0000000000000000000000000000000000000000..f772027b0e0265edade2af1a6455f328aa7c0e8e --- /dev/null +++ b/qcd/part_cpu/bench/ChangeLog.txt @@ -0,0 +1,188 @@ +Change log for JuBE: +-------------------- +23.05.2013: FJ +patch level 20 +- New option 'always new' for keyword 'version' in compile step added +- When using this option, the compile step is executed always, i.e. + executables in /run are ignored as well as executables generated + in a previous step. This means, if a parameter space is spanned, + the compile step will be executed fully for each parameter. + +16.04.2012: WF +patch level 19 +- added specifier 'last' or 'last-\d' to update of result spec option + +03.01.2012: SJ +patch level 18 +- Bug fix: min and max values in statistics should not be divided by +the avg value + +29.06.2009: SM +patch level 17 +- Bug fix: empty variables in substitute + +19.05.2009: LA +patch level 16 +- new feature: (implemented by Sebastian von Alfthan) + the result element may now contain a title, which will be + printed above the according result table, and a transpose + section, which transposes the table. 
+ + Example: + + + Domain_dec,Send_X_PME,Comm_coord,Neigh_srch,Force,Wait_Cmm_F, + PME_mesh,Wait_Cm_XF,Wait_Rx_PF,Write_traj,Update,Constrnts,Comm_energ,Rest + + + + ncpus + + + + + Will result in the following table: + + Real Cycle and Time Accounting (%) + Subid: | n44p8t1_t001_i01 n88p8t1_t001_i01 + -----------+------------------------------------ + Domain_dec | 1.00 0.80 + Send_X_PME | 0.10 0.00 + Comm_coord | 2.10 1.10 + Neigh_srch | 2.90 1.80 + Force | 28.00 17.10 + Wait_Cmm_F | 4.00 3.90 + PME_mesh | 22.80 23.80 + Wait_Cm_XF | 2.20 1.20 + Wait_Rx_PF | 32.60 47.60 + Write_traj | 0.00 0.00 + Update | 0.90 0.60 + Constrnts | 0.70 0.50 + Comm_energ | 2.20 1.30 + Rest | 0.50 0.30 + + + +20.03.2009: FJ +patch level 15 +- A default column width of colw="10" is used if the attribute is not + specified in result.xml + +05.03.2009: AS +patch level: 14 +- new feature: It's now possible to define the column width for the + result tables. As of now a new attribute called colw has + to be defined in result.xml. + Example: ... + If you do not set the column width jube will terminate with + an error message. + Thanks go to Jon Hill for this adaption + + +14.01.2009: AS +patch level: 13 + - jube adds a header to xml-longlog-files. Combined with jube_report.xsl and style.css + which are copied to xmllogs/ it is now possible to view the longlog files in your + browser provided with a formatted style. Please choose Firefox or Windows IE because + parsing xsl-files is not supported by all browsers yet. Please make sure that your + browser recognizes that your longlog file is an xml file.
+ usage example: firefox + +26.11.2008: LA +patch level: 12 + - add new function: predefparams; it allows to read in the top level xml-file + parameter out of a chosen section + +04.11.2008: WF +patch level: 11 + - add precommand in analyse tag: command will be executed before + verify and pattern analysis + +02.11.2008: FJ +patch level: 10 + - jube adds the content of platform.xml to the xmllog for + the chosen platform. + +30.10.2008: AS +patch level: 9 + - jube prints out the xml-file names when processing them. + If there is a mistake in these files the user now knows + where to look for the problem. + +27.10.2008: AS +patch level: 8 + - 'index' has been modified. It is used in analyse.xml via the attribute + 'mode' in order to extract data out of a table. 'index' takes a table + from input and writes out columns chosen by the user through indices + and puts them into the xmllog file. + + +23.10.2008: WF +patch level: 7 + - new option '-showall' shows all benchmark runs in result table, + shows also queued and failed runs + - new feature: multiple output tables in result: + ... can be repeated in result.xml + ... can also be repeated, if not, sort order will + be the same for all tables + - result.xml: active option to show- and sort- tag implemented + e.g.: ... + ... + +17.10.2008: AS +patch level: 6 + - modification of the -cmpdir option. This option allows + for setting a compile directory other than the home/ + directory. This option works with a single compilation + and with a bunch of compilations as well. Each + compilation takes place in a unique subdirectory. The + option -cmpdir can also be used together with the option + -tmpdir. JuBE creates a tar ball of the source files and + transfers it to the corresponding subdirectory in the + directory chosen by -tmpdir.
+ +14.10.2008: AS +patch level: 5 + - if the start_info.xml file and end_info.xml file + respectively don't exist at the end of a calculation + a hint will be added to the output while updating the + results via the option -update. + +01.10.2008: AS +patch level: 4 + - new command line option -cmpdir added: + can be used in the compile step to choose another + directory for compilation than the home/ directory. + In the end the new executable will be copied to the + home/ directory as well + +16.09.2008: SM +patch level: 3 + - nbench has been changed to JuBE + +31.03.2008: WF +patch level: 2 +- new tag in analyse.xml: + + + + + All files specified in the addfile attribute of this tag will be + appended to the analyse data (stderr and stdout). The contents of the + addfile attribute can be a comma or blank delimited list of files. + If the filename is not starting with a / the execution directory of + benchmark run will be automatically prepended to the filename. + +23.01.2008: WF +patch level: 1 + - new command line option -tmpdir added: + for specifying a directory containing all files + needed during runtime of a benchmark + can be used, if benchmark directory resides on a filesystem + which is not mounted on the compute nodes + stdout/err will be temporarily stored in $TMPdir/logs + and moved later by nbench (-update option) to the logs + directory in the benchmark suite + - first enhancements for logging compile parameters when using + reuse option + diff --git a/qcd/part_cpu/bench/jube b/qcd/part_cpu/bench/jube new file mode 100755 index 0000000000000000000000000000000000000000..c77eab8201f5d7c48dd4167c5d4bcde7ef395020 --- /dev/null +++ b/qcd/part_cpu/bench/jube @@ -0,0 +1,2665 @@ +#!/usr/bin/perl -w +# +##################################################################################### +# # +# JuBE: Juelich Benchmarking Environment # +# # +##################################################################################### + + +# Copyright (C) 2008,
Forschungszentrum Juelich GmbH, Federal Republic of +# Germany. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# - Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# +# - Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# - Any publications that result from the use of this software shall +# reasonably refer to the Research Centre's development. +# +# - All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# +# This product includes software developed by Forschungszentrum +# Juelich GmbH, Federal Republic of Germany. +# +# - Forschungszentrum Juelich GmbH is not obligated to provide the user with +# any support, consulting, training or assistance of any kind with regard +# to the use, operation and performance of this software or to provide +# the user with any updates, revisions or new versions. +# +# +# THIS SOFTWARE IS PROVIDED BY FORSCHUNGSZENTRUM JUELICH GMBH "AS IS" AND ANY +# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL FORSCHUNGSZENTRUM JUELICH GMBH BE LIABLE FOR +# ANY SPECIAL, DIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER +# RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF +# CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN +# CONNECTION WITH THE ACCESS, USE OR PERFORMANCE OF THIS SOFTWARE.


use strict;
use Carp;

# get installation path of jube perl script
use FindBin;
my $instpath="$FindBin::RealBin";

# Regular-expression building blocks (kept as strings so that they can be
# interpolated into larger patterns).
my $patint="([\\+\\-\\d]+)";     # Pattern for Integer number
my $patfp ="([\\+\\-\\d.Ee]+)";  # Pattern for Floating Point number
my $patwrd="([\^\\s]+)";         # Pattern for Word (all non-blank characters)
my $patnint="[\\+\\-\\d]+";      # Pattern for Integer number, no ()
my $patnfp ="[\\+\\-\\d.Ee]+";   # Pattern for Floating Point number, no ()
my $patnwrd="[\^\\s]+";          # Pattern for Word (all non-blank characters), no ()
my $patbl ="\\s+";               # Pattern for blank space (variable length)

my @columns;

use Getopt::Long qw(:config no_ignore_case);

my $pwd=`pwd`;
chomp($pwd);

# Command line options and their defaults
my $opt_verbose=0;
my $opt_dump=0;
my $opt_debug=undef;
my $opt_start=undef;
my $opt_update=undef;
my $opt_result=undef;
my $opt_force=undef;
my $opt_rmtmp=undef;
my $opt_tmpdir=undef;
my $opt_configdir="$pwd";
my $opt_platformsdir="$instpath/../platform";
my $opt_result_showall=0;
my $opt_cmpdir=undef;
my $opt_version=undef;
my $SUBSTITUTE_NOTFOUND="";
my $colw_default=10;

# if set, indicates that job stdout/err should be kept in
# tmpdir (will be copied to logs dir when calling jube with -update)
my $defer_stdout_to_tmpdir=0;

usage($0) if( ! GetOptions(
                  'verbose=i'    => \$opt_verbose,
                  'dump'         => \$opt_dump,
                  'debug'        => \$opt_debug,
                  'update'       => \$opt_update,
                  'result'       => \$opt_result,
                  'start|submit' => \$opt_start,
                  'force'        => \$opt_force,
                  'cdir=s'       => \$opt_configdir,
                  'pdir=s'       => \$opt_platformsdir,
                  'tmpdir=s'     => \$opt_tmpdir,
                  'showall'      => \$opt_result_showall,
                  'rmtmp'        => \$opt_rmtmp,
                  'cmpdir=s'     => \$opt_cmpdir,
                  'Version|V'    => \$opt_version
              ) );


&version() if ($opt_version);

use FindBin;
use lib "$FindBin::RealBin/lib";

use XML::Simple;
use Data::Dumper;
use File::Listing;
use Time::HiRes qw ( time );

my $idfile="./.bench_current_id.dat";


my $startdate=localtime(time());


# Working directories below the current directory; each one is created on
# first use.
my $logdir="$pwd/xmllogs";
if(! -d $logdir) {
    mkdir $logdir;
}

my $stdlogdir="$pwd/logs";
if(! -d $stdlogdir) {
    mkdir $stdlogdir;
}
my $benchlogdir="$pwd/benchlog";
if(! -d $benchlogdir) {
    mkdir $benchlogdir;
}
my $tmpdir="$pwd/tmp";
if($opt_tmpdir) {
    # a user supplied tmpdir also switches on deferred stdout/stderr handling
    $tmpdir=$opt_tmpdir;
    $defer_stdout_to_tmpdir=1;
}
if(! -d $tmpdir) {
    mkdir $tmpdir;
}
my $tmplogdir="$tmpdir/logs";
if($defer_stdout_to_tmpdir) {
    if(! -d $tmplogdir) {
        mkdir $tmplogdir;
    }
}

my $rundir="$pwd/run";
if(! -d $rundir) {
    mkdir $rundir;
}
my $resdir="$pwd/results";
if(! -d $resdir) {
    mkdir $resdir;
}


my $logfilelevel=0;
my $loglinelength=140;
my $compilexmlfile="$opt_configdir/compile.xml";
my $preparexmlfile="$opt_configdir/prepare.xml";
my $executexmlfile="$opt_configdir/execute.xml";
my $verifyxmlfile="$opt_configdir/verify.xml";
my $analysexmlfile="$opt_configdir/analyse.xml";
my $resultxmlfile="$opt_configdir/result.xml";
my $platformxmlfile="$opt_platformsdir/platform.xml";

my ($benchxmlfile,$benchlogfile,$startspec,$enddate);
my(@attrsortlist); # sort order for result lists

my(%generatedexecutables);

my $tstart=time;
my $xs=XML::Simple->new();
my $tdiff=time-$tstart;

# submitting is the default action if neither -update nor -result is given
if((!$opt_update) && (!$opt_result)) {
    $opt_start=1;
}

if($opt_start) {
    # submit new benchmarks

    if(!$ARGV[0]) {
        if(-f "$opt_configdir/bench.xml") {
            $benchxmlfile="$opt_configdir/bench.xml";
        } elsif(-f "$opt_configdir/bench1.xml") {
            $benchxmlfile="$opt_configdir/bench1.xml";
        } else {
            usage($0);
        }
    } else {
        $benchxmlfile=$ARGV[0];
    }
    $benchlogfile=sprintf("%s/benchlog_%06d.log",$benchlogdir,&get_identifier_ro());
    open(BENCHLOG, '>', $benchlogfile)
        or die "jube: cannot open log file $benchlogfile for writing: $!\n";
}
if($opt_update) {
    # update logfiles (verifying and analyse)
    $benchlogfile=sprintf("%s/benchupdatelog_%06d.log",$benchlogdir,&get_identifier_ro());
    open(BENCHLOG, '>', $benchlogfile)
        or die "jube: cannot open log file $benchlogfile for writing: $!\n";
    $startspec=_resolve_startspec($ARGV[0]);

    printlog(0," Looking for new log files with spec \#%s\n",$startspec);
}

if($opt_result) {
    # show results from logfiles
    $benchlogfile=sprintf("%s/benchresultlog_%06d.log",$benchlogdir,&get_identifier_ro());
    open(BENCHLOG, '>', $benchlogfile)
        or die "jube: cannot open log file $benchlogfile for writing: $!\n";
    $startspec=_resolve_startspec($ARGV[0]);
    printlog(0," Looking for log files with spec \#%s\n",$startspec);
}


printlog(0,"%s\n","-"x80);
printlog(0," Benchmark-Suite: starting at %s\n",$startdate);
printlog(0,"%s\n","-"x80);
printlog(0," %s\n", &getversion());
printlog(0,"%s\n","-"x80);
printlog(0," OPTIONS: %-25s = %d \n","start", $opt_start) if ($opt_start);
printlog(0," OPTIONS: %-25s = %d \n","update", $opt_update) if ($opt_update);
printlog(0," OPTIONS: %-25s = %d \n","result", $opt_result) if ($opt_result);
printlog(0,"%s\n","-"x80);
printlog(0," OPTIONS: %-25s = %d \n","force update", $opt_force) if ($opt_force);
printlog(0," OPTIONS: %-25s = %d \n","verbose",$opt_verbose) if (defined($opt_verbose));
printlog(0," OPTIONS: %-25s = %d \n","dump", $opt_dump) if ($opt_dump);
printlog(0," OPTIONS: %-25s = %s \n","Benchmark XML-file",$benchxmlfile) if ($benchxmlfile);
printlog(0," OPTIONS: %-25s = %s \n","Compile XML-file",$compilexmlfile);
printlog(0," OPTIONS: %-25s = %-25s\n","benchlogfile",$benchlogfile);
printlog(0," OPTIONS: %-25s = %-25s\n","configdir",$opt_configdir);
printlog(0," OPTIONS: %-25s = %-25s\n","platformdir",$opt_platformsdir);
printlog(0," OPTIONS: %-25s = %-25s\n","tmpdir",$opt_tmpdir) if ($opt_tmpdir);
printlog(0," OPTIONS: %-25s = %-25s\n","cmpdir",$opt_cmpdir) if ($opt_cmpdir);
printlog(0,"%s\n","-"x80);


# Run the requested actions
&benchmark($benchxmlfile) if ($opt_start);
&check_defered_logdir() if(($defer_stdout_to_tmpdir) && ($opt_update));
&update() if ($opt_update);
&result() if ($opt_result);

$enddate=localtime(time());
printlog(0,"%s\n","-"x80);
printlog(0," Benchmark-Suite: ending at %s\n",$enddate);
printlog(0,"%s\n","-"x80);

close(BENCHLOG);

# Turn the run specification given on the command line into the spec used for
# selecting log files.  Without an argument the spec is "0+".  A spec
# containing "last" or "last-N" is replaced by a number derived from the
# current identifier: id-1 for "last", id-1-N for "last-N"; if the current
# identifier is not positive the spec falls back to "0+".  Any other argument
# is returned unchanged.
sub _resolve_startspec {
    my($spec)=@_;

    $spec="0+" if(!$spec);
    if($spec=~/last(\-\d+)?/) {
        my $offset=0;
        $offset=$1 if($1);
        my $id=&get_identifier_ro();
        if($id>0) { $spec=sprintf("%d",$id-1+$offset); }
        else      { $spec="0+"; }
    }
    return $spec;
}

sub benchmark {
    my($benchxmlfile)=@_;
    my($tstart,$tdiff,$benchref,$compileref,$prepareref,$executeref,$platformtopref,$platformref);
    my($first_id,$last_id,$benchname,$platform,$i,$bench,$dir,$executable,$cproto,$bc,$taskref);
my($nodes,$taskspernode,$threadspertask,$param,$c,$t,$p,@param_ptr,$iter,$subid,$subdir); + my($protofile,$rc,$done,$cmd,$key,$platformparamsref,$val); + + + printlog(0, "--->\tprocessing %s ...\n", $benchxmlfile); + $tstart=time; + $benchref=$xs->XMLin($benchxmlfile, KeyAttr => { 'map' => "n", + 'benchmark' => "+name" + }, + ForceArray => 1); + $tdiff=time-$tstart; + printlog(3,"parsing $benchxmlfile in %6.4f sec\n",$tdiff); + + + + printlog(0, "--->\tprocessing %s ...\n", $compilexmlfile); + $tstart=time; + $compileref=$xs->XMLin($compilexmlfile, KeyAttr => { compile => "+cname", + substitute => "+infile", + 'sub' => "+from" + }, + ForceArray => 1); + $tdiff=time-$tstart; + printlog(3,"parsing $compilexmlfile in %6.4f sec\n",$tdiff); + + printlog(0, "--->\tprocessing %s ...\n", $preparexmlfile); + $tstart=time; + $prepareref=$xs->XMLin($preparexmlfile, KeyAttr => { prepare => "+cname", + substitute => "+infile", + 'sub' => "+from", + 'mkdir' => "+directory", + }, + ForceArray => 1); + $tdiff=time-$tstart; + printlog(3,"parsing $preparexmlfile in %6.4f sec\n",$tdiff); + + printlog(0, "--->\tprocessing %s ...\n", $executexmlfile); + $tstart=time; + $executeref=$xs->XMLin($executexmlfile, KeyAttr => { execute => "+cname", + substitute => "+infile", + 'sub' => "+from", + 'env' => '+var', + 'mkdir' => "+directory", + }, + ForceArray => 1); + $tdiff=time-$tstart; + printlog(3,"parsing $executexmlfile in %6.4f sec\n",$tdiff); + + $first_id=-1; + $last_id=-1; + + if($benchref) { + + if($opt_dump) { + printlog(-1,"%s",Dumper($benchref)); + exit(1); + } + + $benchname= $benchref->{'name'}; + $platform = $benchref->{'platform'}; + + + # scan platform.xml for coressponding entry + printlog(0, "--->\tprocessing %s ...\n", $platformxmlfile); + $tstart=time; + $platformtopref=$xs->XMLin($platformxmlfile, KeyAttr => { platform => "+name" + }, + ForceArray => 1); + $tdiff=time-$tstart; + printlog(3,"parsing $platformxmlfile in %6.4f sec\n",$tdiff); + + + 
printlog(4,"platform=>%s<\n",$platform,); + if($platformtopref->{'platform'}->{$platform}) { + $platformref=$platformtopref->{'platform'}->{$platform}; +# printlog(-1,"%s",Dumper($platformref)); + } else { + printlog(0,"No platform description found for %s in %s\n",$platform,$platformxmlfile); + $platformref=undef; + } + + # Evaluate expressions in platform.xml + + $platformparamsref=$platformref->{'params'}->[0]; + foreach $key (keys(%{$platformparamsref})) { + $val=$platformparamsref->{$key}; + $rc=&substitute(\$val,$platformparamsref); + $platformparamsref->{$key}=$val; + } + + printlog(0,"scanning benchmarks for $benchname on $platform: \n",""); + $i=0; + foreach $bench (keys(%{$benchref->{'benchmark'}})) { + my $active=$benchref->{'benchmark'}->{$bench}->{'active'}; + next if(!$active); + my $cname = $benchref->{'benchmark'}->{$bench}->{'compile'}->[0]->{'cname'}; + my $cname_expand=$cname; + my $cversion = $benchref->{'benchmark'}->{$bench}->{'compile'}->[0]->{'version'}; + my $params= $benchref->{'benchmark'}->{$bench}->{'params'}->[0]; + my $prep= $benchref->{'benchmark'}->{$bench}->{'prepare'}->[0]; + my $execu= $benchref->{'benchmark'}->{$bench}->{'execution'}->[0]; + my $analyse= $benchref->{'benchmark'}->{$bench}->{'analyse'}->[0]; + my $verify= $benchref->{'benchmark'}->{$bench}->{'verify'}->[0]; + my $iteration= $benchref->{'benchmark'}->{$bench}->{'execution'}->[0]->{'iteration'}; + my $addopt; + my $lastcommand=""; # will be executed after last benchmark, e.g. 
for job chains + + if(exists($benchref->{'benchmark'}->{$bench}->{'execution'}->[0]->{'addopt'})) { + $addopt = $benchref->{'benchmark'}->{$bench}->{'execution'}->[0]->{'addopt'}; + } else { + $addopt = ""; + } + + my $id=&get_identifier(); $last_id=$id; $first_id=$id if($first_id==-1); + $cversion="new" if(!$cversion); + + my $identifier=sprintf("%s_%s_%s_i%06d",$benchname,$platform,$bench,$id); + printlog(0,"\t %02d: %-20s cname=%-10s (%s) -> Identifier=%s\n",++$i,$bench,$cname,$platform,$identifier); + + $dir="$tmpdir/$identifier"; + printlog(0,"\t\t\t\t -> generating temporary directory %s\n",$dir); + mkdir $dir; + if(! -d $dir) { + printlog(-1,"... failed to create directory $dir\n"); + exit(-1); + } + +# if -cmpdir is set the compile step has to be performed in the applied compile directory +# this causes a redefinition of $dir + + my $cmpdir; + if($opt_cmpdir){ + $cmpdir="$opt_cmpdir/$identifier"; + printlog(0,"\t\t\t\t -> generating temporary directory %s\n",$cmpdir); + system "mkdir -p $cmpdir"; + if(! -d $cmpdir) { + printlog(-1,"... 
failed to create directory $cmpdir\n"); + exit(-1); + } + } else { + $cmpdir=$dir; + } + + + + + printlog(0,"\t\t\t\t -> generating run step %s\n",$cname); + $bc=0; + + my $aref; + if(ref($benchref->{'benchmark'}->{$bench}->{'tasks'}) eq "ARRAY") { + $aref=$benchref->{'benchmark'}->{$bench}->{'tasks'}; + } else { + $aref=[$benchref->{'benchmark'}->{$bench}->{'tasks'}]; + } + foreach $taskref (@{$aref}) { + my($threadspertaskspec,$taskspernodespec,$nodesspec)=(1,1,1); + $threadspertaskspec=$taskref->{threadspertask} if ($taskref->{threadspertask}); + $taskspernodespec= $taskref->{taskspernode} if ($taskref->{taskspernode}); + $nodesspec= $taskref->{nodes} if ($taskref->{nodes}); + + my $bproto=" {'cname'}."\"\n"; + $bproto.=" execute_cname=\"".$execu->{'cname'}."\"\n"; + $bproto.=" verify_cname=\"".$verify->{'cname'}."\"\n"; + $bproto.=" analyse_cname=\"".$analyse->{'cname'}."\"\n"; + foreach $nodes (&getsequence($nodesspec)) { + foreach $taskspernode (&getsequence($taskspernodespec)) { + foreach $threadspertask (&getsequence($threadspertaskspec)) { + my (%paramlist,@param_name,@param_cnt,@param_pt,$numparam,$numtests,%phash); + printlog(0,"\t\t\t\t\t%-2d: %d nodes %d tasks %d threads\n",++$bc, + $nodes,$taskspernode,$threadspertask); + + $phash{nodes}=$nodes; $phash{taskspernode}=$taskspernode; + $phash{threadspertask}=$threadspertask; $phash{tasks}=$taskspernode*$nodes; + $phash{ncpus}=$threadspertask*$taskspernode*$nodes; + + # run twice over parms for substituting scalar parms + foreach my $step (1,2) { + foreach $param (keys(%$params)) { + my$val=$params->{$param}; + &substitute(\$val,\%phash); + $paramlist{$param}=[&getsequence($val,$benchref->{'benchmark'}->{$bench})]; + if(scalar @{$paramlist{$param}} == 1) { + $phash{$param}=$paramlist{$param}->[0]; + } + } + } + + $c=0; + $numtests=1; + foreach $param ( sort { $#{$paramlist{$a}} <=> $#{$paramlist{$b}} } (keys(%paramlist))) { + $param_name[$c]=$param; + $param_cnt[$c]=$#{$paramlist{$param}}+1; + 
$numtests*=$param_cnt[$c]; + $param_ptr[$c]=0; +# print "debug: $c-> $param_name[$c] $param_cnt[$c]\n"; + $c++; + } + $numparam=$c; + + for($t=1;$t<=$numtests;$t++) { + my $str=""; + my %parmhash; + my $tproto=$bproto; + + for($p=0;$p<$numparam;$p++) { + $str.=sprintf("[%s->%s]",$param_name[$p],$paramlist{$param_name[$p]}->[$param_ptr[$p]]); + if($param_name[$p]=~/\-/) { + #field of parms + my @plist=split(/\-/,$param_name[$p]); + my @vlist=split(/\:/,$paramlist{$param_name[$p]}->[$param_ptr[$p]]); + for(my $j=0;$j<=$#plist;$j++) { + $parmhash{$plist[$j]}=$vlist[$j]; + } + } else { + # scalar + $parmhash{$param_name[$p]}=$paramlist{$param_name[$p]}->[$param_ptr[$p]]; + } + $tproto.=" ".$param_name[$p]."=\"".$paramlist{$param_name[$p]}->[$param_ptr[$p]]."\"\n"; + } + printlog(0,"\t\t\t\t\t %2d: %s\n",$t,$str); + $parmhash{platform}=$platform; + $parmhash{pdir}=$opt_platformsdir; + $parmhash{benchname}=$benchname; + $parmhash{benchhome}=$pwd; + $parmhash{name}=$bench; + $parmhash{nodes}=$nodes; + $parmhash{taskspernode}=$taskspernode; + $parmhash{threadspertask}=$threadspertask; + $parmhash{addopt}=$addopt; + foreach $param (keys(%phash)) { + $parmhash{$param}=$phash{$param}; + } + $tproto.=" nodes=\"".$nodes."\"\n"; + $tproto.=" taskspernode=\"".$taskspernode."\"\n"; + $tproto.=" threadspertask=\"".$threadspertask."\"\n"; + + substitute(\$cname_expand,\%parmhash); + + $iteration=1 if(!$iteration); + for($iter=1;$iter<=$iteration;$iter++) { + my $proto=$tproto; + + $subid=sprintf("n%dp%dt%d_t%03d_i%02d",$nodes,$taskspernode,$threadspertask,$t,$iter); + + + # $cmpsubdir holds the direction for the compile step if -cmpdir is set + my $cmpsubdir=$cmpdir."/".$subid; + + $subdir=$dir."/".$subid; + $parmhash{id}=$identifier; + $parmhash{subid}=$subid; + $parmhash{rundir}=$subdir; + $parmhash{subdir}=$subdir; + printlog(1,"\t\t\t\t\t\t -> generating temporary directory %s\n",$subdir); + mkdir $subdir; + if(! -d $subdir) { + printlog(-1,"... 
failed to create directory $subdir\n"); + exit(-1); + } + if($subdir ne $cmpsubdir) { + mkdir $cmpsubdir; + if(! -d $cmpsubdir) { + printlog(-1,"... failed to create directory $cmpsubdir\n"); + exit(-1); + } + } + $proto.=" iteration=\"".$iter."\"\n"; + $proto.=" subdir=\"".$subdir."\"\n"; + $proto.=" identifier=\"".$identifier."\"\n"; + $proto.=" subid=\"".$subid."\"\n"; + + $protofile=sprintf("%s/benchlog_%s_%s.log",$logdir,$identifier,$subid); + + printlog(0,"\t\t\t\t\t\t -> compile step %s (%s)\n",$cname,$cname_expand); + + $executable=&compile($compileref,$subdir,$cmpsubdir,$identifier,$cname,$cversion,\$cproto, + $benchref->{'benchmark'}->{$bench}->{'compile'}->[0], + $platformref,\%parmhash); +#FJ + if(! $executable) { + printlog(-1,"... failed to create executable %s\n",$cname); + exit(-1); + } + + $rc=&pproto_open($protofile,$benchref,$bench,$startdate); + + # Write platform information to xmllog + my $platform_proto=" \n"; + $platform_proto.=" {$platform_key}\"\n"; + } + $platform_proto.=" />\n"; + $platform_proto.=" \n"; + $rc=&pproto($protofile,$platform_proto); + + # Write compile information to xmllog + $rc=&pproto($protofile,$cproto); + + # Write prepare information to xmllog + $proto.=" />\n"; + $rc=&pproto($protofile,$proto); + + printlog(0,"\t\t\t\t\t\t -> prepare step %s (%s)\n",$prep->{'cname'},$platform); + $rc=&prepare($prepareref,$subdir,$identifier,$prep->{'cname'},\%parmhash); + printlog(0,"\t\t\t\t\t\t -> execute step %s (%s)\n",$execu->{'cname'},$platform); + + $rc=&execute($executeref,$subdir,$identifier,$subid,$executable,$execu,\%parmhash,\$lastcommand); + + $rc=&pproto_close($protofile,$benchref->{'benchmark'}->{$bench}); + + + } + + # search next parameter set + if($numtests>1) { + $done=0; + $p=$numparam-1; + while(!$done) { + $param_ptr[$p]++; + if($param_ptr[$p]<$param_cnt[$p]) { + $done=1; + } else { + $param_ptr[$p]=0; + $p--; + $done=1 if($p<0); # only for fun + } + } + } + } + + } + } + } + printlog(0,"\t\t\t\t\t\n",""); + } + 
+ # execute lastcommand + if($lastcommand ne "") { + printlog(0,"\t\t\t\t\t-> last command: %s %s\n",($opt_debug)?"[debug]":"",$lastcommand); + system($lastcommand) if (!$opt_debug); + if($?) { printlog(-1,"... failed to execute %s\n",$lastcommand); return (-1);} + } + + if($opt_rmtmp) { + $cmd="rm -r $dir"; + printlog(0,"\t\t\t\t\texecuting: %s\n",$cmd); + system($cmd); + if($?) { printlog(-1,"... failed to execute \n",$?); exit(-1);} + } + + } + } else { + printlog(-1,"Error while processing XML file, exiting ...\n",""); + } + + printlog(0,"%s\n","-"x80); + printlog(0," JUBE: used id: %s\n",$last_id) if($first_id == $last_id); + printlog(0," JUBE: used id range: from %s_id to %s\n",$first_id,$last_id) if($first_id != $last_id); + +} + + +sub check_defered_logdir { + my($rc,$fn,$id,$name,$destname,$cmd); + $rc=opendir(DIR,$tmplogdir); + while($fn=readdir(DIR)) { + next if($fn!~/\.log$/); + if($fn=~/.*_i$patint.*_i$patint\_/) { + $name=$tmplogdir."/".$fn; + $destname=$stdlogdir."/".$fn; + my($id)=($1); + if(&testspec($id,$startspec)) { + printlog(0,"%s\n","-"x80); + printlog(0," JUBE: check log file in tmpdir: %s<=>%s \n",$name,$destname); + if(! -f $destname) { + printlog(0," JUBE: new file: cp -p %s %s \n",$name,$destname); + $cmd="cp -p ${name} ${destname}"; + printlog(3,"\texecuting: %s\n",$cmd); + system($cmd); + if($?) { printlog(-1,"... 
failed to execute %s\n",$?); exit(-1);} + + } else { + my($dev1,$ino1,$mode1,$nlink1,$uid1,$gid1,$rdev1,$size1, + $atime1,$mtime1,$ctime1,$blksize1,$blocks1) = stat($name); + my($dev2,$ino2,$mode2,$nlink2,$uid2,$gid2,$rdev2,$size2, + $atime2,$mtime2,$ctime2,$blksize2,$blocks2) = stat($destname); + + if( ($size1 != $size2) || ($mtime1 != $mtime2)) { + printlog(0," JUBE: newer file: cp -p %s %s \n",$name,$destname); + $cmd="cp -p ${name} ${destname}"; + printlog(3,"\texecuting: %s\n",$cmd); + system($cmd); + } + } + } + } + } +} + +sub update { + my($entry,$logref); + my($analyseref,$bench,$nextstep,$jobstartfile,$bstdout,$bstderr,$inpsep,$stdoutdata,$stderrdata); + my($jobendfile,$verifyref,$verifyfile,$analysefile,$cname,$cmd,$rc,$fn,$name,$endmarkfound,$line); + my($includeref,$includepattern,$file,$parm); + my($precommand); + + printlog(0, "--->\tprocessing %s ...\n", $analysexmlfile); + $tstart=time; + $analyseref=$xs->XMLin($analysexmlfile, KeyAttr => { analyse => "+cname", + parm => "+name", + includepattern => "+file" + }, + ForceArray => 1); + $tdiff=time-$tstart; + printlog(3,"parsing $analysexmlfile in %6.4f sec\n",$tdiff); + + printlog(0, "--->\tprocessing %s ...\n", $verifyxmlfile); + $tstart=time; + $verifyref=$xs->XMLin($verifyxmlfile, KeyAttr => { verify => "+cname"}, + ForceArray => 1); + $tdiff=time-$tstart; + printlog(3,"parsing $verifyxmlfile in %6.4f sec\n",$tdiff); + + foreach $cname (sort(keys(%{$analyseref->{analyse}}))) { + foreach $includepattern (sort(keys(%{$analyseref->{analyse}->{$cname}->{includepattern}}))) { + $file=$includepattern; + printlog(3,"analysis: include patterns from %s\n",$file); + $tstart=time; + $includeref=$xs->XMLin($file, KeyAttr => { parm => "+name"}, + ForceArray => 1); + $tdiff=time-$tstart; + printlog(3,"parsing includefile $file in %6.4f sec\n",$tdiff); +# printlog(-1,"%s",Dumper($includeref)); + foreach $parm (keys(%{$includeref->{parm}})) { + 
$analyseref->{analyse}->{$cname}->{parm}->{$parm}=$includeref->{parm}->{$parm}; + } + } + } + +# test if jube_report.xsl and style.css are put in xmllogs/ +{ + my $xsl_file = "jube_report.xsl"; + my $css = "style.css"; + my $cmd1 = "cp $instpath/$xsl_file $logdir"; + my $cmd2 = "cp $instpath/$css $logdir"; + + system($cmd1) unless -e "$logdir/$xsl_file"; + system($cmd2) unless -e "$logdir/$css"; +} + + +# foreach $entry (File::Listing::parse_dir(`ls -lrt $logdir/*.log`)) { +# my($name,$type,$size,$mtime,$mode)=@$entry; + + $rc=opendir(DIR,$logdir); + while($fn=readdir(DIR)) { + next if($fn!~/\.log$/); + if($fn=~/benchlog.*i$patint\_/) { + $name=$logdir."/".$fn; + my($id)=($1); + if(&testspec($id,$startspec)) { + printlog(0,"%s\n","-"x80); + printlog(0," JUBE: update, parsing: %s \n",$name); + + # check if xml log is complete + $endmarkfound=0; + open(IN,$name); + while($line=) { + $endmarkfound=1 if($line=~/<\/benchrun>/); + } + close(IN); + next if(!$endmarkfound); + + # read xml log file + $tstart=time; + $logref=$xs->XMLin($name, KeyAttr => { benchmark => "+name" }, + ForceArray => 1); +# printlog(-1,"%s",Dumper($logref)); + # bug fix + if(exists($logref->{benchmark}->{""})) { + delete($logref->{benchmark}->{""}); + } + $bench=(keys(%{$logref->{benchmark}}))[0]; + $tdiff=time-$tstart; + printlog(2,"\t\tparsing $name in %6.4f sec bench=%s\n",$tdiff,$bench); +# printlog(-1,"%s",Dumper($logref)); + + # some fixes + # ncpus is not stored in result XML file + $logref->{benchmark}->{$bench}->{ncpus}= + $logref->{benchmark}->{$bench}->{threadspertask} + * $logref->{benchmark}->{$bench}->{taskspernode} + * $logref->{benchmark}->{$bench}->{nodes}; +# printlog(-1,"%s",Dumper($logref)); + if(!exists($logref->{benchmark}->{$bench}->{platform})) { + $logref->{benchmark}->{$bench}->{platform}=$logref->{platform}->[0]; + } + + # testing job start + $nextstep=1; + $jobstartfile=$logref->{benchmark}->{$bench}->{subdir}."/start_info.xml"; +# print "debug: $bench $jobstartfile\n"; 
+ if ((-f $jobstartfile) && ($nextstep)) { + my $startref=$xs->XMLin($jobstartfile, ForceArray => 1); + my $starttime=$startref->{at}; + printlog(1,"\t\t job: started at %s\n",$starttime); + $logref->{jobstartdate}=[$starttime]; + } else {$nextstep=0;} + + # test if start_info.xml exist + if(!-f $jobstartfile) { + printlog(0,"%s doesn't exist\n",$jobstartfile); + } + + # testing job end + $jobendfile=$logref->{benchmark}->{$bench}->{subdir}."/end_info.xml"; + if ((-f $jobendfile) && ($nextstep)) { + my $endref=$xs->XMLin($jobendfile, ForceArray => 1); + my $endtime=$endref->{at}; + printlog(1,"\t\t job: ended at %s\n",$endtime); + $logref->{jobenddate}=[$endtime]; + } else {$nextstep=0;} + + # test if start_info.xml exist + if(!-f $jobendfile) { + printlog(0,"%s doesn't exist\n",$jobendfile); + } + + # read stdout and stderr + if($nextstep) { + $bstdout=sprintf("%s/%s.%s_stdout.log",$stdlogdir, + $logref->{benchmark}->{$bench}->{identifier}, + $logref->{benchmark}->{$bench}->{subid}); + $bstderr=sprintf("%s/%s.%s_stderr.log",$stdlogdir, + $logref->{benchmark}->{$bench}->{identifier}, + $logref->{benchmark}->{$bench}->{subid}); + + $inpsep=$/;$/=undef; + if(! open(IN,"$bstdout") ) { + $stdoutdata=""; + printlog(-1,"... failed to open stdout file %s\n",$bstdout); + } else { + $stdoutdata=; + close(IN); + } + if(! open(IN,"$bstderr") ) { + $stderrdata=""; + printlog(-1," JUBE: no stderr file found or not possible to open. \n",$bstderr); + } else { + $stderrdata=; + close(IN); + } + $/=$inpsep; + + $logref->{stdoutfile}->[0]->{name}=$bstdout; + $logref->{stderrfile}->[0]->{name}=$bstderr; + printlog(1,"\t\t job: stdout %d bytes\n",length($stdoutdata)); + printlog(1,"\t\t job: stderr %d bytes\n",length($stderrdata)); + + } + + # call verify step, generates also verify.xml in subdir + if($nextstep) { + $verifyfile=$logref->{benchmark}->{$bench}->{subdir}."/verify.xml"; + if ( (! 
-f $verifyfile) || ($opt_force) ) { + # call verify + $cname=$logref->{benchmark}->{$bench}->{verify_cname}; + $cname=$logref->{benchmark}->{$bench}->{postp_cname} if(!$cname); + &verify($logref->{benchmark}->{$bench}->{subdir}, + $bstdout,$bstderr,$cname, + $verifyfile,$verifyref,$logref->{benchmark}->{$bench}); + printlog(1,"\t\t job: verify done %s\n",$cname); + } + + # read verify results + if (-f $verifyfile) { + # slurp verify data in + my $vref=$xs->XMLin($verifyfile, KeyAttr => {parm => "+name"}, ForceArray => 1); +# printlog(-1,"%s",Dumper($vref)); + + $logref->{verify}=$vref; + if($vref->{parm}->{vcheck}->{'value'}) { + printlog(1,"\t\t job: verify, bench check=%s\n",$vref->{parm}->{vcheck}->{'value'}); + } else { + printlog(1,"\t\t job: verify, bench no results\n",""); + } + } else {$nextstep=0;} + } + + + # execute precommand +# printlog(-1,"$cname %s",Dumper($analyseref)); + if(exists($analyseref->{analyse})) { + $cname=$logref->{benchmark}->{$bench}->{analyse_cname}; + &substitute(\$cname,$logref->{benchmark}->{$bench}); + $precommand = $analyseref->{analyse}->{$cname}->{precommand}->[0]; + } + if($precommand) { + my $command=$precommand; + my $subdir=$logref->{benchmark}->{$bench}->{subdir}; + $rc=&substitute(\$command,$logref->{benchmark}->{$bench}); + $rc=&substitute(\$command,$logref->{compile}->[0]->{params}->[0]); + printlog(3,"\t\t precommand substitute param %s not found for cmd=>%s<\n",$SUBSTITUTE_NOTFOUND,$command,$rc) if($rc==-1); + printlog(3,"\t\t precommand %s cmd=>%s< rc=%d\n",$cname,$command,$rc); + $cmd="(cd $subdir; $command 1>$subdir/precommand_out.log 2>$subdir/precommand_err.log)"; + printlog(0,"\t\t\t\t\texecuting: %s\n",$cmd); + system($cmd); +# if($?) { printlog(-1,"... failed to execute precommand $?\n",""); return (undef);} + } + + # analyse + if($nextstep) { + $analysefile=$logref->{benchmark}->{$bench}->{subdir}."/analyse.xml"; + if ( (! 
-f $analysefile) || ($opt_force) ) { + # call analyse + $cname=$logref->{benchmark}->{$bench}->{analyse_cname}; + printlog(3,"\t\t analyse cname=>%s<\n",$cname); + &analyse(\$stdoutdata,\$stderrdata,$cname,$analysefile,$analyseref,$logref->{benchmark}->{$bench}); + printlog(1,"\t\t job: analyse done\n",""); + } + if (-f $analysefile) { + # slurp analyse data in + my $aref=$xs->XMLin($analysefile, ForceArray => 1); + $logref->{analyse}=$aref; + if($aref->{parm}->{walltime}->{'value'}) { + printlog(1,"\t\t job: analyse, bench runtime=%10.4f s\n",$aref->{parm}->{walltime}->{'value'}); + } else { + printlog(1,"\t\t job: analyse, bench no results\n",""); + } + } else {$nextstep=0;} + } + + +# return(); + + if(! open(OUT,"> ${name}.new") ) { + printlog(-1,"... failed to open log file ${name}.new\n"); return (-1); + } + print OUT $xs->XMLout($logref, AttrIndent => 1, RootName => "benchrun" ); + close(OUT); + if(0) { + $cmd="mv ${name} ${name}.old"; + printlog(3,"\texecuting: %s\n",$cmd); + system($cmd); + if($?) { printlog(-1,"... failed to execute %s\n",$?); exit(-1);} + } + $cmd="mv ${name}.new ${name}"; + printlog(3,"--->\tprocessing %s\n",$resultxmlfile); + printlog(3,"\texecuting: %s\n",$cmd); + system($cmd); + if($?) { printlog(-1,"... failed to execute %s\n",$?); exit(-1);} + + + $logref->{stdoutfile}->[0]->{content}=$stdoutdata if($stdoutdata); + $logref->{stderrfile}->[0]->{content}=$stderrdata if($stderrdata); + + my $longname=$name; + if($longname=~s/\.log$/\.longlog/s) { + if(! open(OUT,"> ${longname}") ) { + printlog(-1,"... 
failed to open log file %s\n",$longname); return (-1); + } + + my $header = ''; + print OUT $xs->XMLout($logref, AttrIndent => 1, RootName => "benchrun" , xmldecl => "$header", noescape => 1); + close(OUT); + } + + +# printlog(-1,"%s",Dumper($logref)); + + } + + + } + } +} + + +sub result { + my ($entry,$logref,$benchmref,$key,$bench); + my($identifier,$subid,$result,%keylist,%keytype,$aref); + my($resultref,$attrlist,@allattrsort,@allattr,@attr,$sortlist,$rc,$fn,$name,$endmarkfound,$line); + my($compileparmref,$href,$resfile,$vref,$lcnt,$tabcnt,$tabcntsort,$tab,$showref,$sortref); + + my @colw; + my $subw=18; + my $digits=2; + my @tabtitle; + my @is_transposed; + + $tstart=time; + $resultref=$xs->XMLin($resultxmlfile, KeyAttr => { }, + ForceArray => 1, + ForceContent => 1); + $tdiff=time-$tstart; +# printlog(-1,"%s",Dumper($resultref)); + printlog(3,"parsing $resultxmlfile in %6.4f sec\n",$tdiff); + $tabcnt=0; + foreach $showref (@{$resultref->{'show'}}) { + $attrlist=$showref->{'content'}; + $attrlist=~s/\s*//gs; + if(exists($showref->{'active'})) { + next if($showref->{'active'} ne "1"); + } + + if(exists($showref->{'colw'})) { + $colw[$tabcnt] = $showref->{'colw'}; + $subw = $colw[$tabcnt]+8; + } + + if(exists($showref->{'digits'})) { + $digits = $showref->{'digits'}; + $subw = $colw[$tabcnt]+8; + } + + if(!exists($showref->{'colw'})) { + printlog(-1,"\nWARNING in result.xml: column width has to be defined ... !\n\n",""); + printlog(-1, "Example: ... 
\n\n",""); + printlog(-1, "Using colw=\"$colw_default\" as default\n\n",""); + $colw[$tabcnt] = $colw_default; + $subw = $colw[$tabcnt]+8; + } + + if (exists $showref->{'title'}) { + $tabtitle[$tabcnt] = $showref->{'title'}; + } + + if (exists $showref->{'transpose'} && $showref->{'transpose'} eq "yes") { + $is_transposed[$tabcnt] = 1; + } + + @{$attr[$tabcnt]}=(split(/,/,$attrlist)); + push(@allattr,split(/,/,$attrlist)); + $tabcnt++; + } + + $tabcntsort=0; + foreach $sortref (@{$resultref->{'sort'}}) { + $sortlist=$sortref->{'content'}; + if(exists($sortref->{'active'})) { + next if($sortref->{'active'} ne "1"); + } + $sortlist=~s/\s*//gs; + @{$attrsortlist[$tabcntsort]}=(split(/,/,$sortlist)); + push(@allattrsort,split(/,/,$sortlist)); + $tabcntsort++; + } + + if($tabcnt==0) { + printlog(-1,"\nERROR in result: no show tab found, exiting ... !\n\n",""); return (-1); + } + + if($tabcntsort==0) { + printlog(-1,"\nERROR in result: no sort tab found, exiting ... !\n\n",""); return (-1); + } + + if($tabcntsort < $tabcnt) { + for($tab=$tabcntsort;$tab<$tabcnt;$tab++) { + $attrsortlist[$tab]=$attrsortlist[$tabcntsort-1]; + } + $tabcntsort=$tabcnt; + } + + if($tabcntsort != $tabcnt) { + printlog(-1,"\nERROR in result: number of sort tabs and show tabs are different (%d != %d), exiting ... 
!\n\n", + $tabcnt,$tabcntsort); return (-1); + } + + $rc=opendir(DIR,$logdir); + while($fn=readdir(DIR)) { + next if($fn!~/\.log$/); + $name=$logdir."/".$fn; +# foreach $entry (File::Listing::parse_dir(`ls -l $logdir/*.log`)) { +# my($name,$type,$size,$mtime,$mode)=@$entry; + if($name=~/benchlog.*i$patint\_/) { + my($id)=($1); + if(&testspec($id,$startspec)) { + printlog(0," JUBE: result, parsing %s \n",$name); + open(IN,$name); + $endmarkfound=0; + while($line=) { + $endmarkfound=1 if($line=~/<\/benchrun>/); + } + close(IN); + next if(!$endmarkfound); + $tstart=time; + $logref=$xs->XMLin($name, KeyAttr => { benchmark => "+name", values => "name" }, + ForceArray => 1); + $tdiff=time-$tstart; + printlog(2,"\t\t parsing $name in %6.4f sec\n",$tdiff); +# printlog(-1,"%s",Dumper($logref)); + # bug fix + if(exists($logref->{benchmark}->{""})) { + delete($logref->{benchmark}->{""}); + } + $bench=(keys(%{$logref->{benchmark}}))[0]; + $benchmref=$logref->{benchmark}->{$bench}; + $compileparmref=$logref->{compile}->[0]->{params}->[0]; + $identifier=$benchmref->{identifier}; + $subid=$benchmref->{subid}; + foreach $href ($benchmref,$compileparmref) { + foreach $key (keys(%$href)) { + $result->{$identifier}->{$subid}->{$key}=$href->{$key}; + $keylist{$key}++; + if(!exists($keytype{$key})) { + $keytype{$key}="string"; + $keytype{$key}="float" if($result->{$identifier}->{$subid}->{$key}=~/^$patnfp$/); + $keytype{$key}="int" if($result->{$identifier}->{$subid}->{$key}=~/^$patnint$/); + } + } + } + + foreach $key ("jobenddate") { + $result->{$identifier}->{$subid}->{$key}=$logref->{$key}->[0]; + $keylist{$key}++;$keytype{$key}="string"; + } + + + $aref=$logref->{analyse}->[0]; + foreach $key (keys(%$aref)) { + if(ref($aref->{$key})) { +# printlog(-1,"%s",Dumper($aref->{$key})); + if(exists($aref->{$key}->[0]->{value})) { + $result->{$identifier}->{$subid}->{$key}=$aref->{$key}->[0]->{value}; +# print "debug: $key: $result->{$identifier}->{$subid}->{$key}\n"; + $keylist{$key}++; + 
$keytype{$key}=$aref->{$key}->[0]->{type}; + } elsif(exists($aref->{$key}->[0]->{values})) { + $result->{$identifier}->{$subid}->{$key}=$aref->{$key}->[0]->{values}; + $keylist{$key}++; + $keytype{$key}="index"; + } + } + } + + $vref=$logref->{verify}->[0]; + foreach $key (keys(%$vref)) { + if(ref($vref->{$key})) { +# printlog(-1,"%s",Dumper($aref->{$key})); + if(exists($vref->{$key}->[0]->{value})) { + $result->{$identifier}->{$subid}->{$key}=$vref->{$key}->[0]->{value}; +# print "debug: $key: $result->{$identifier}->{$subid}->{$key}\n"; + $keylist{$key}++; + $keytype{$key}=$vref->{$key}->[0]->{type}; + } elsif(exists($vref->{$key}->[0]->{values})) { + $result->{$identifier}->{$subid}->{$key}=$vref->{$key}->[0]->{values}; + $keylist{$key}++; + $keytype{$key}="index"; + } + } + } + + } + } + } + + foreach $key (@allattr,@allattrsort) { + if(!defined($keytype{$key})) { +# print "debug: not found $key -> $result->{$identifier}->{$subid}->{$key}\n"; + $keytype{$key}="int"; + } + } + + + foreach $identifier (sort {$a cmp $b} keys(%$result)) { + $lcnt=0; + printf("\n%-40s\n",$identifier); + printf("%-40s\n","="x40); + $resfile="$resdir"."/".$identifier.".dat"; + if(! open(DAT,"> $resfile") ) { + printlog(-1,"... failed to open log file $resfile\n"); return (-1); + } + + + for($tab=0;$tab<$tabcnt;$tab++) { + + # header + + + # Print title if it is given. 
+ if ($tabtitle[$tab]) { + printf(" %s\n", $tabtitle[$tab]); + printf(DAT " %s\n", $tabtitle[$tab]); + } + + if (not $is_transposed[$tab]) { + # + # Non-transposed output + # + + printf(" %-${subw}s", "Subid"); + printf(DAT "# %-${subw}s", "Subid"); + foreach $key (@{$attr[$tab]}) { + my $wkey=$key; + $wkey=~s/000000\b/M/s; + $wkey=~s/000\b/K/s; + + if(length($wkey)>$colw[$tab]) { + printf(" %s%s",substr($wkey,0,$colw[$tab]-3),substr($wkey,-3)); + printf(DAT " %s%s",substr($wkey,0,$colw[$tab]-3),substr($wkey,-3)); + } else { + printf(" %$colw[$tab]s",$wkey); + printf(DAT " %$colw[$tab]s",$wkey); + } + } + print "\n"; + print DAT "\n"; + printf(" %${subw}s", "-"x${subw}); + printf(DAT "# %${subw}s", "-"x${subw}); + foreach $key (@{$attr[$tab]}) { + printf(" %$colw[$tab]s","-"x$colw[$tab]); + printf(DAT " %$colw[$tab]s","-"x$colw[$tab]); + } + print "\n"; + print DAT "\n"; + + foreach $subid (sort {&attrsort($result->{$identifier},\%keytype,$attrsortlist[$tab],$a,$b) } + keys(%{$result->{$identifier}})) { + next if(!($result->{$identifier})); + next if(!exists($result->{$identifier}->{$subid})); + next if((!exists($result->{$identifier}->{$subid}->{'walltime'})) && (!$opt_result_showall)); + if(($opt_result_showall) || ($result->{$identifier}->{$subid}->{'walltime'}>=0)) { + $lcnt++; + printf(" %${subw}s",$subid); + printf(DAT " %${subw}s",$subid); + foreach $key (@{$attr[$tab]}) { + if($key=~/$patwrd\($patwrd\)/) { + # index + my($kkey,$kind)=($1,$2); +# print "debug: $key: ($kkey,$kind)\n"; + printf(" %$colw[$tab].${digits}f",$result->{$identifier}->{$subid}->{$kkey}->{$kind}->{"value"}); + printf(DAT " %$colw[$tab].${digits}f",$result->{$identifier}->{$subid}->{$kkey}->{$kind}->{"value"}); + } else { + if(exists($result->{$identifier}->{$subid}->{$key})) { + printf(" %$colw[$tab]d",$result->{$identifier}->{$subid}->{$key}) if($keytype{$key} eq "bool"); + printf(" %$colw[$tab]s",$result->{$identifier}->{$subid}->{$key}) if($keytype{$key} eq "string"); + printf(" 
%$colw[$tab].${digits}f",$result->{$identifier}->{$subid}->{$key}) if($keytype{$key} eq "float"); + printf(" %$colw[$tab]d",$result->{$identifier}->{$subid}->{$key}) if($keytype{$key} eq "int"); + printf(DAT " %$colw[$tab]d",$result->{$identifier}->{$subid}->{$key}) if($keytype{$key} eq "bool"); + printf(DAT " %$colw[$tab]s",$result->{$identifier}->{$subid}->{$key}) if($keytype{$key} eq "string"); + printf(DAT " %$colw[$tab].${digits}f",$result->{$identifier}->{$subid}->{$key}) if($keytype{$key} eq "float"); + printf(DAT " %$colw[$tab]d",$result->{$identifier}->{$subid}->{$key}) if($keytype{$key} eq "int"); + } else { + printf(" %${colw[$tab]}s"," ---"); + printf(DAT " %${colw[$tab]}s"," ---"); + } + } + } + print "\n"; + print DAT "\n"; + } + } + print "\n"; + print DAT "\n"; + + } else { + # + # Transposed output + # + + my $keyw; # key width + my $tcolw; # data column width + my @sorted_subids = sort {&attrsort($result->{$identifier}, \%keytype, $attrsortlist[$tab], $a, $b)} + keys (%{$result->{$identifier}}); + + # Get maximum key width. + $keyw = 10; # Has to fit "Subid:". + foreach $key (@{$attr[$tab]}) { + $keyw = max(length($key) + 1, $keyw); + } + + # Get maximum data column width. Take attribute 'colw' as a hint. + $tcolw = $colw[$tab]; + for (@sorted_subids) { + $tcolw = max(length($_) + 1, $tcolw); + } + + # Print the first row. + printf(" %-${keyw}s|", "Subid:"); + printf(DAT "# %-${keyw}s|", "Subid:"); + for (@sorted_subids) { + printf(" %${tcolw}s",$_); + printf(DAT " %${tcolw}s",$_); + } + # Print the second row (separator line). + printf("\n %s+", "-"x$keyw); + printf(DAT "\n %s+", "-"x$keyw); + for (@sorted_subids) { + printf("-%s", "-"x$tcolw); + printf(DAT "-%s", "-"x$tcolw); + } + print "\n"; + print DAT "\n"; + # For each key print the key and its associated values. 
+ foreach $key (@{$attr[$tab]}) { + my $wkey = $key; + $wkey =~ s/000000\b/M/s; + $wkey =~ s/000\b/K/s; + printf(" %-${keyw}s|", $wkey); + printf(DAT " %-${keyw}s|", $wkey); + undef $wkey; + + foreach $subid (@sorted_subids) { + next if (!($result->{$identifier})); + next if (!exists($result->{$identifier}->{$subid})); + next if ((!exists($result->{$identifier}->{$subid}->{'walltime'})) && (!$opt_result_showall)); + if (($opt_result_showall) || ($result->{$identifier}->{$subid}->{'walltime'} >= 0)) { + $lcnt++; + if ($key =~ /$patwrd\($patwrd\)/) { + my ($kkey,$kind) = ($1,$2); + printf(" %${tcolw}.2f",$result->{$identifier}->{$subid}->{$kkey}->{$kind}->{"value"}); + printf(DAT " %${tcolw}.2f",$result->{$identifier}->{$subid}->{$kkey}->{$kind}->{"value"}); + } else { + if (exists($result->{$identifier}->{$subid}->{$key})) { + printf(" %${tcolw}d",$result->{$identifier}->{$subid}->{$key}) if ($keytype{$key} eq "bool"); + printf(" %${tcolw}s",$result->{$identifier}->{$subid}->{$key}) if ($keytype{$key} eq "string"); + printf(" %${tcolw}.2f",$result->{$identifier}->{$subid}->{$key}) if ($keytype{$key} eq "float"); + printf(" %${tcolw}d",$result->{$identifier}->{$subid}->{$key}) if ($keytype{$key} eq "int"); + printf(DAT " %${tcolw}d",$result->{$identifier}->{$subid}->{$key}) if ($keytype{$key} eq "bool"); + printf(DAT " %${tcolw}s",$result->{$identifier}->{$subid}->{$key}) if ($keytype{$key} eq "string"); + printf(DAT " %${tcolw}.2f",$result->{$identifier}->{$subid}->{$key}) if ($keytype{$key} eq "float"); + printf(DAT " %${tcolw}d",$result->{$identifier}->{$subid}->{$key}) if ($keytype{$key} eq "int"); + } else { + printf(" %${tcolw}s"," ---"); + printf(DAT " %${tcolw}s"," ---"); + } + } + } + } + print "\n"; + print DAT "\n"; + } + print "\n"; + print DAT "\n"; + } + } + + if($opt_verbose>3) { + print "\nKeylist: "; + foreach $key (sort {$a cmp $b} keys(%keylist)) { + print "$key,"; + } + print "\n"; + } + close(DAT); + if ($lcnt==0) { + # remove empty file + 
unlink($resfile); + } + } +} + + +
# Sort comparator for benchmark sub-ids; result() calls it from inside
# sort { ... } blocks.
#
# Arguments:
#   $hashref         - { subid => { attribute => value, ... }, ... }
#   $keytyperef      - { attribute => type }; "string" compares with cmp,
#                      "float" and "int" with <=>, any other type does not
#                      contribute to the ordering
#   $attrsortlistref - ordered list of sort keys, each an attribute name
#                      optionally followed by "+" (ascending, the default)
#                      or "-" (descending)
#   $aa, $bb         - the two sub-ids to compare
#
# Returns a negative, zero or positive number like cmp.  A sub-id lacking a
# sort attribute sorts before one that has it (after it for a "-" key).  If
# all sort keys tie, the sub-ids themselves are compared as strings.
sub attrsort {
    my ($hashref, $keytyperef, $attrsortlistref, $aa, $bb) = @_;
    my $val = 0;

    foreach my $key (@{$attrsortlistref}) {
        # Fall back to "whole key, ascending" if the pattern does not
        # match, instead of reusing stale $1/$2 from an earlier match.
        my ($kkey, $dir) = ($key, "+");
        if ($key =~ /^([^\+\-]*)([\+\-])?$/) {
            ($kkey, $dir) = ($1, $2);
        }

        # A descending key is handled by swapping the operands.
        my ($first, $second) = ((!$dir) || ($dir eq "+")) ? ($aa, $bb) : ($bb, $aa);

        my $has_first  = exists($hashref->{$first}->{$kkey});
        my $has_second = exists($hashref->{$second}->{$kkey});

        # Neither side has the attribute: this key cannot decide.  The old
        # code returned -1 for both argument orders here, which gives sort
        # an inconsistent comparator.
        next if (!$has_first && !$has_second);
        return(-1) if (!$has_first);
        return(1)  if (!$has_second);

        my $type = $keytyperef->{$kkey};
        $type = "" if (!defined($type));

        # Reset per key so an unhandled type cannot leave a stale or
        # undefined result behind.
        $val = 0;
        if ($type eq "string") {
            $val = $hashref->{$first}->{$kkey} cmp $hashref->{$second}->{$kkey};
        }
        elsif (($type eq "float") || ($type eq "int")) {
            $val = $hashref->{$first}->{$kkey} <=> $hashref->{$second}->{$kkey};
        }
        last if ($val != 0);
    }

    # Deterministic tie break on the sub-id itself.
    $val = ($aa cmp $bb) if ($val == 0);
    return($val);
}


# Run the verify command of one benchmark run.
#
# Arguments:
#   $subdir      - run directory; the command's stdout/stderr are redirected
#                  to verify_out.log / verify_err.log inside it
#   $stdoutfile, $stderrfile
#                - job output files; stored in $parmhash under the keys
#                  "stdoutfile" and "stderrfile" before substitution
#   $cname       - name of the verify step; substituted against $parmhash
#                  and then looked up in $verifyref->{verify}
#   $verifyfile  - not used by this sub itself
#   $verifyref   - parsed verify XML
#   $parmhash    - parameter hash of the benchmark (modified, see above)
#
# Returns -2 if no verify step named $cname exists, undef if the command
# exits with a non-zero status, 0 otherwise.
sub verify {
    my ($subdir, $stdoutfile, $stderrfile, $cname, $verifyfile, $verifyref, $parmhash) = @_;
    my ($rc, $vref, $command, $cmd);

    $parmhash->{stdoutfile} = $stdoutfile;
    $parmhash->{stderrfile} = $stderrfile;

    &substitute(\$cname, $parmhash);
    $vref = $verifyref->{verify}->{$cname};
    if (!$vref) {
        printlog(-1,"... no verify step '%s' found in verify.xml\n",$cname);
        return (-2);
    }
    $command = $vref->{command}->[0];

    $rc = &substitute(\$command, $parmhash);
    printlog(3,"\t\t verify substitute param %s not found for cmd=>%s<\n",$SUBSTITUTE_NOTFOUND,$command,$rc) if ($rc == -1);

    if ($command) {
        printlog(3,"\t\t verify %s cmd=>%s< rc=%d\n",$cname,$command,$rc);
        # The command comes from the verify XML and is meant to be run by
        # the shell (redirections), so the string form of system() stays.
        $cmd = "($command 1>$subdir/verify_out.log 2>$subdir/verify_err.log)";
        printlog(0,"\t\t\t\t\texecuting: %s\n",$cmd);
        system($cmd);
        if ($?) { printlog(-1,"... failed to verify $?\n",""); return (undef);}
    }

    return(0);
}

sub analyse { + my($stdoutdataref,$stderrdataref,$cname,$analysefile,$analyseref,$gparmhashref)=@_; + my($mode,$dtype,$val,$evalstr,$rc); +# printlog(-1,"%s",Dumper($analyseref)); + + &substitute(\$cname,$gparmhashref); + printlog(3,"\t\t analyse2 cname=>%s<\n",$cname); + my $aref=$analyseref->{analyse}->{$cname}; + if (!$aref) { printlog(-1,"... no analyse step '%s' found in analyse.xml\n",$cname); return (-2);} + + my ($data,$inpsep,$parm,$regexp,$unit,$naref,$parmhashref); + my $reghash= { 'patfp' => $patfp, 'patint' => $patint, 'patwrd' => $patwrd, 'patbl' => $patbl, + 'patnfp' => $patnfp, 'patnint' => $patnint, 'patnwrd' => $patnwrd}; + + $data=$$stdoutdataref.$$stderrdataref; + + return(-1) if(!$data); + +# printlog(-1,"%s",Dumper($aref)); + # test parameters + + if(exists($aref->{'input'})) { +# printlog(-1,"%s",Dumper($gparmhashref)); + my $addfiles=$aref->{'input'}[0]->{'addfiles'}; + my($inputfile,$line,$lnr); + foreach $inputfile (split(/\s,?\s*/,$addfiles)) { + &substitute(\$inputfile,$gparmhashref); + $inputfile=$gparmhashref->{'subdir'}."/".$inputfile if($inputfile!~/^\//); + if(-f $inputfile) { + $lnr=0; + open(IN,"$inputfile"); + while($line=) { + $lnr++; + $data.=$line; + } + close(IN); + printlog(2,"\t\t\t include additional input file: %-16s (%d lines)\n",$inputfile,$lnr); + } else { + printlog(2,"\t\t\t include additional input file: %-16s NOT FOUND\n",$inputfile); + } + } + } + + foreach $parm (sort(keys(%{$aref->{parm}}))) { + $regexp=$aref->{parm}->{$parm}->{content}; + $regexp=~s/^\s+//gs; + $regexp=~s/\s+$//gs; + $unit=$aref->{parm}->{$parm}->{unit}; + $mode=$aref->{parm}->{$parm}->{mode}; + $dtype=$aref->{parm}->{$parm}->{type}; + printlog(3,"\t\t search for %-16s --> %-8s (%s)\n",$parm,$unit,$regexp); + &substitute(\$regexp,$reghash); + if ($mode eq "line") { + if($data=~/$regexp/m) { + $val=$1; + printlog(3,"\t\t found %-16s --> %15s %-8s (%s)\n",$parm,$val,$unit,$regexp); + } else 
{ + $val="-1"; + printlog(3,"\t\t not found %-16s --> %15s %-8s (%s)\n",$parm,"?",$unit,$regexp); + } + $naref->{parm}->{$parm}->{'unit'}=$unit; + $naref->{parm}->{$parm}->{'value'}=$val; + $naref->{parm}->{$parm}->{'type'}=$dtype; + $parmhashref->{$parm}=$val; + } elsif($mode eq "line,last") { + my $found=0; + while($data=~/$regexp/mg) { + $val=$1; + $found=1; + } + if($found) { + printlog(3,"\t\t found %-16s --> %15s %-8s (%s)\n",$parm,$val,$unit,$regexp); + } else { + $val="-1"; + printlog(3,"\t\t not found %-16s --> %15s %-8s (%s)\n",$parm,"?",$unit,$regexp); + } + $parmhashref->{$parm}=$val; + $naref->{parm}->{$parm}->{'value'}=$val; + $naref->{parm}->{$parm}->{'unit'}=$unit; + $naref->{parm}->{$parm}->{'type'}=$dtype; + } elsif($mode eq "line,add") { + $naref->{parm}->{$parm}->{'count'}=0; + $naref->{parm}->{$parm}->{'value'}=0; + $naref->{parm}->{$parm}->{'unit'}=$unit; + $naref->{parm}->{$parm}->{'type'}=$dtype; + $naref->{parm}->{$parm."_cnt"}->{'value'}=0; + $naref->{parm}->{$parm."_cnt"}->{'unit'}="#"; + $naref->{parm}->{$parm."_cnt"}->{'type'}="int"; + + while($data=~/$regexp/mg) { + $val=$1; + $naref->{parm}->{$parm}->{'value'}+=$val; + $naref->{parm}->{$parm}->{'count'}++; + } + if($naref->{parm}->{$parm}->{'count'}>0) { + printlog(3,"\t\t found %-16s --> %15s %-8s (%s) sum of %d matches (%s)\n", + $parm,$naref->{parm}->{$parm}->{'value'},$unit,$regexp,$naref->{parm}->{$parm}->{'count'}, + $parm."_cnt"); + $naref->{parm}->{$parm."_cnt"}->{'value'}=$naref->{parm}->{$parm}->{'count'}; + } else { + $val="?"; + printlog(3,"\t\t not found %-16s --> %15s %-8s (%s)\n",$parm,"?",$unit,$regexp); + } +# printlog(3,"\t\t set %s to '%s'\n",$parm,$naref->{parm}->{$parm}->{'value'}); + + $parmhashref->{$parm}=$naref->{parm}->{$parm}->{'value'}; + $parmhashref->{$parm."_cnt"}=$naref->{parm}->{$parm."_cnt"}->{'value'}; + } elsif($mode eq "line,min") { + $naref->{parm}->{$parm}->{'count'}=0; + $naref->{parm}->{$parm}->{'value'}=0; + 
$naref->{parm}->{$parm}->{'unit'}=$unit; + $naref->{parm}->{$parm}->{'type'}=$dtype; + $naref->{parm}->{$parm."_cnt"}->{'value'}=0; + $naref->{parm}->{$parm."_cnt"}->{'unit'}="#"; + $naref->{parm}->{$parm."_cnt"}->{'type'}="int"; + + while($data=~/$regexp/mg) { + $val=$1; + if($naref->{parm}->{$parm}->{'count'} == 0) { + $naref->{parm}->{$parm}->{'value'} = $val; + } + $naref->{parm}->{$parm}->{'value'}=min($val,$naref->{parm}->{$parm}->{'value'}); + $naref->{parm}->{$parm}->{'count'}++; + } + if($naref->{parm}->{$parm}->{'count'}>0) { + printlog(3,"\t\t found %-16s --> %15s %-8s (%s) min of %d matches (%s)\n", + $parm,$naref->{parm}->{$parm}->{'value'},$unit,$regexp,$naref->{parm}->{$parm}->{'count'}, + $parm."_cnt"); + $naref->{parm}->{$parm."_cnt"}->{'value'}=$naref->{parm}->{$parm}->{'count'}; + } else { + $val="?"; + printlog(3,"\t\t not found %-16s --> %15s %-8s (%s)\n",$parm,"?",$unit,$regexp); + } +# printlog(3,"\t\t set %s to '%s'\n",$parm,$naref->{parm}->{$parm}->{'value'}); + + $parmhashref->{$parm}=$naref->{parm}->{$parm}->{'value'}; + $parmhashref->{$parm."_cnt"}=$naref->{parm}->{$parm."_cnt"}->{'value'}; + } elsif($mode eq "line,max") { + $naref->{parm}->{$parm}->{'count'}=0; + $naref->{parm}->{$parm}->{'value'}=0; + $naref->{parm}->{$parm}->{'unit'}=$unit; + $naref->{parm}->{$parm}->{'type'}=$dtype; + $naref->{parm}->{$parm."_cnt"}->{'value'}=0; + $naref->{parm}->{$parm."_cnt"}->{'unit'}="#"; + $naref->{parm}->{$parm."_cnt"}->{'type'}="int"; + + while($data=~/$regexp/mg) { + $val=$1; + if($naref->{parm}->{$parm}->{'count'} == 0) { + $naref->{parm}->{$parm}->{'value'} = $val; + } + $naref->{parm}->{$parm}->{'value'}=max($val,$naref->{parm}->{$parm}->{'value'}); + $naref->{parm}->{$parm}->{'count'}++; + } + if($naref->{parm}->{$parm}->{'count'}>0) { + printlog(3,"\t\t found %-16s --> %15s %-8s (%s) max of %d matches (%s)\n", + $parm,$naref->{parm}->{$parm}->{'value'},$unit,$regexp,$naref->{parm}->{$parm}->{'count'}, + $parm."_cnt"); + 
$naref->{parm}->{$parm."_cnt"}->{'value'}=$naref->{parm}->{$parm}->{'count'}; + } else { + $val="?"; + printlog(3,"\t\t not found %-16s --> %15s %-8s (%s)\n",$parm,"?",$unit,$regexp); + } +# printlog(3,"\t\t set %s to '%s'\n",$parm,$naref->{parm}->{$parm}->{'value'}); + + $parmhashref->{$parm}=$naref->{parm}->{$parm}->{'value'}; + $parmhashref->{$parm."_cnt"}=$naref->{parm}->{$parm."_cnt"}->{'value'}; + } elsif($mode eq "line,statistics") { + $naref->{parm}->{$parm}->{'count'}=0; + $naref->{parm}->{$parm}->{'value'}=0; + $naref->{parm}->{$parm}->{'unit'}=$unit; + $naref->{parm}->{$parm}->{'type'}=$dtype; + + $naref->{parm}->{$parm."_cnt"}->{'value'}=0; + $naref->{parm}->{$parm."_cnt"}->{'unit'}="#"; + $naref->{parm}->{$parm."_cnt"}->{'type'}="int"; + + $naref->{parm}->{$parm."_min"}->{'value'}=0; + $naref->{parm}->{$parm."_min"}->{'unit'}=$unit; + $naref->{parm}->{$parm."_min"}->{'type'}="float"; + + $naref->{parm}->{$parm."_max"}->{'value'}=0; + $naref->{parm}->{$parm."_max"}->{'unit'}=$unit; + $naref->{parm}->{$parm."_max"}->{'type'}="float"; + + $naref->{parm}->{$parm."_avg"}->{'value'}=0; + $naref->{parm}->{$parm."_avg"}->{'unit'}=$unit; + $naref->{parm}->{$parm."_avg"}->{'type'}="float"; + + $naref->{parm}->{$parm."_std"}->{'value'}=0; + $naref->{parm}->{$parm."_std"}->{'unit'}=$unit; + $naref->{parm}->{$parm."_std"}->{'type'}="float"; + + + while($data=~/$regexp/mg) { + $val=$1; + if($naref->{parm}->{$parm}->{'count'} == 0) { + $naref->{parm}->{$parm."_max"}->{'value'} = $val; + $naref->{parm}->{$parm."_min"}->{'value'} = $val; + $naref->{parm}->{$parm."_std"}->{'value'} = 0; + $naref->{parm}->{$parm."_avg"}->{'value'} = 0; + $naref->{parm}->{$parm."_sum"}->{'value'} = 0; + } + $naref->{parm}->{$parm."_max"}->{'value'}=max($val,$naref->{parm}->{$parm."_max"}->{'value'}); + $naref->{parm}->{$parm."_min"}->{'value'}=min($val,$naref->{parm}->{$parm."_min"}->{'value'}); + $naref->{parm}->{$parm."_avg"}->{'value'}+=$val; + 
$naref->{parm}->{$parm."_sum"}->{'value'}+=$val; + $naref->{parm}->{$parm."_std"}->{'value'}+=$val*$val; + $naref->{parm}->{$parm}->{'count'}++; + } + + if($naref->{parm}->{$parm}->{'count'}>1 && $naref->{parm}->{$parm."_avg"}->{'value'}>0) { + my $help=($naref->{parm}->{$parm}->{'count'}*$naref->{parm}->{$parm."_std"}->{'value'} - + $naref->{parm}->{$parm."_avg"}->{'value'} * $naref->{parm}->{$parm."_avg"}->{'value'}) / + ($naref->{parm}->{$parm}->{'count'} * ($naref->{parm}->{$parm}->{'count'}-1)); + if($help<0) { + printlog(-1,"\nWARNING value $help <0 $parm ... ! setting to 0\n",""); + $help=0; + } + $naref->{parm}->{$parm."_std"}->{'value'} = sqrt($help); + $naref->{parm}->{$parm."_avg"}->{'value'}/=$naref->{parm}->{$parm}->{'count'}; +#Bug in statistics min/max should not be divided trough avg +# $naref->{parm}->{$parm."_min"}->{'value'}/=$naref->{parm}->{$parm."_avg"}->{'value'}; +# $naref->{parm}->{$parm."_max"}->{'value'}/=$naref->{parm}->{$parm."_avg"}->{'value'}; + $naref->{parm}->{$parm."_std"}->{'value'}/=$naref->{parm}->{$parm."_avg"}->{'value'}; + } + + + if($naref->{parm}->{$parm}->{'count'}>0) { + printlog(3,"\t\t found %-16s --> %15s %-8s (%s) statistics of %d matches (%s)\n", + $parm,$naref->{parm}->{$parm."_avg"}->{'value'},$unit,$regexp,$naref->{parm}->{$parm}->{'count'}, + $parm."_cnt"); + $naref->{parm}->{$parm."_cnt"}->{'value'}=$naref->{parm}->{$parm}->{'count'}; + } else { + $val="?"; + printlog(3,"\t\t not found %-16s --> %15s %-8s (%s)\n",$parm,"?",$unit,$regexp); + } +# printlog(3,"\t\t set %s to '%s'\n",$parm,$naref->{parm}->{$parm}->{'value'}); + + $parmhashref->{$parm}=$naref->{parm}->{$parm}->{'value'}; + $parmhashref->{$parm."_cnt"}=$naref->{parm}->{$parm."_cnt"}->{'value'}; + $parmhashref->{$parm."_min"}=$naref->{parm}->{$parm."_min"}->{'value'}; + $parmhashref->{$parm."_max"}=$naref->{parm}->{$parm."_max"}->{'value'}; + $parmhashref->{$parm."_avg"}=$naref->{parm}->{$parm."_avg"}->{'value'}; + 
$parmhashref->{$parm."_std"}=$naref->{parm}->{$parm."_std"}->{'value'}; + } elsif($mode eq "span") { + # mode: span line +# index takes a table form input and writes out columns via indeces choosen by the user + } elsif ($mode=~/line\,index\(([\d,]+){1,}\)/){ +# the indeces are stored in @avalues + my @avalues = split /,/,$1; +# first index (key-value) + my $indexvalue = shift @avalues; + + $naref->{parm}->{$parm}->{'count'}=0; + $naref->{parm}->{$parm}->{'unit'}=$unit; + $naref->{parm}->{$parm}->{'type'}=$dtype; + + while($data=~/$regexp/mg) { + my $correspondingValues = ""; + my @val=($1,$2,$3,$4,$5,$6,$7,$8,$9); + foreach (@avalues) { + $correspondingValues = $correspondingValues . " " . $val[$_-1]; + } +# print "\$corrrespondingValues: $correspondingValues\n"; + + push(@{$naref->{parm}->{$parm}->{'values'}},{"key" => $val[$indexvalue-1], + "value" => $correspondingValues}); + $naref->{parm}->{$parm}->{'count'}++; + } + if($naref->{parm}->{$parm}->{'count'}>0) { + printlog(3,"\t\t found %-16s --> %15d idx ent. (%s) \n", + $parm,$naref->{parm}->{$parm}->{'count'},$regexp); + $naref->{parm}->{$parm."_cnt"}->{'value'}=$naref->{parm}->{$parm}->{'count'}; + $naref->{parm}->{$parm."_cnt"}->{'unit'}="#"; + $naref->{parm}->{$parm."_cnt"}->{'type'}="int"; + } else { + $val="?"; + printlog(3,"\t\t not found %-16s --> %15s %-8s (%s)\n",$parm,"?",$unit,$regexp); + } + $parmhashref->{$parm}=$naref->{parm}->{$parm}->{'value'}; + $parmhashref->{$parm."_cnt"}=$naref->{parm}->{$parm."_cnt"}->{'value'}; + } else { + # e.g. 
derived + } + + } + + foreach $parm (keys(%{$gparmhashref})) { + $parmhashref->{$parm}=$gparmhashref->{$parm}; + } + + #derived parms + my %derivedparms=(); + foreach $parm (keys(%{$aref->{parm}})) { + $mode=$aref->{parm}->{$parm}->{mode}; + next if($mode ne "derived"); + $derivedparms{$parm}=1; + } + + while((scalar keys(%derivedparms))>0) { + foreach $parm (keys(%derivedparms)) { + $evalstr=$aref->{parm}->{$parm}->{content}; + $evalstr=~s/^\s*//gs; + $evalstr=~s/\s*$//gs; + $unit=$aref->{parm}->{$parm}->{unit}; + $dtype=$aref->{parm}->{$parm}->{type}; + $val="\`".$evalstr."\`"; + $rc=&substitute(\$val,$parmhashref); + if($rc>=0) { + printlog(3,"\t\t derived[%1d] %-16s --> %15s %-12s (%s)\n",$derivedparms{$parm},$parm,$val,$unit,$evalstr); + $naref->{parm}->{$parm}->{'unit'}=$unit; + $naref->{parm}->{$parm}->{'value'}=$val; + $naref->{parm}->{$parm}->{'type'}=$dtype; + $parmhashref->{$parm}=$naref->{parm}->{$parm}->{'value'}; + delete($derivedparms{$parm}); + } else { + $derivedparms{$parm}++; + if($derivedparms{$parm}>4) { + printlog(3,"\t\t derived[%1d] %-16s --> %15s %-8s could not be resolved\n",$derivedparms{$parm},$parm,$val,$unit); + delete($derivedparms{$parm}); + } + } + } + } + + if(! open(OUT,"> $analysefile") ) { + printlog(-1,"... failed to open log file $analysefile\n"); return (-1); + } +# printlog(-1,"%s",Dumper($naref)); + print OUT $xs->XMLout($naref, AttrIndent => 0, RootName => "analyse" ); + close(OUT); + + +} + +sub compile { + my($compileref,$dir,$cmpdir,$identifier,$cname,$cversion,$cproto,$cstepref,$platformref,$parmhash)=@_; + my($cmd,$from,$to,$lparam,$execname,$execnamepath); + my($file,$rc,$key,$spec,$var); + + + my $tmpdir_for_copy = $dir; + if ($opt_cmpdir) + { + $dir = $cmpdir; + } + + + + &substitute(\$cname,$parmhash); + my $cref=$compileref->{compile}->{$cname}; + if (!$cref) { printlog(-1,"... 
no compile step '%s' found in compile.xml\n",$cname); return (-2);} + + my $execfound; + return(undef) if (!$cstepref); + +# printlog(-1,"%s",Dumper($cstepref)); + + my $srcdir = $cref->{src}->[0]->{directory}; + my $srcfiles = $cref->{src}->[0]->{files}; + my $param; + if (defined ($cref->{param})) { + $param = $cref->{param}->[0]; + } else { + $param = $cref->{params}->[0]; + } + my $command = $cref->{command}->[0]; + my $executable = $cref->{executable}->[0]; + my $platformparam = $platformref->{params}->[0]; + my $platform = $parmhash->{platform}; + + $$cproto=" \n"; + + # scan params and build executable name + $execname=$parmhash->{benchname}; + $execname.="_".$parmhash->{platform}; + + + # parameters of cstep in top level xml file + foreach $key (sort(keys(%$cstepref))) { + next if($key eq "version"); + $var=$cstepref->{$key}; + $rc=&substitute(\$var,$parmhash); + $execname.="_".$key."_".$var; + $param->{$key}=$var; + } + $execname.=".exe"; + $execnamepath="$dir/$execname"; + + # HK modification, 8.5.08 + foreach (keys(%$parmhash)){ + if($parmhash->{$_} ne "") { + $param->{$_} = $parmhash->{$_}; + } + } + + if($command) { + + + $param->{outdir}=$dir; + $param->{rundir}="$pwd/run"; + $param->{id}=$identifier; + $param->{execname}=$execnamepath; + + $execfound=0; + + # reuse old version +# FJ: Introduction of option "always new" + if( ($cversion ne "always new") && (($cversion eq "reuse") + || $generatedexecutables{$execname} )) { + if(-f "$pwd/run/$execname") { + printlog(1,"\t\t\t\t\t\t\treusing executable: %s\n",$execname); + $cmd="cp -p $pwd/run/$execname $execnamepath"; + printlog(1,"\t\t\t\t\t\t\t executing: %s\n",$cmd); + system($cmd); + $execfound=1; + } + } + + # include platform specific parameter in param + foreach $key (keys(%{$param})) { + my $var=$param->{$key}; + $rc=&substitute(\$var,$platformparam); + $param->{$key}=$var; + printlog(4,"\t\t\t\t\t key= >%s< >%s< rc=%d\n",$key,$var,$rc); + if($rc==-1) { + printlog(-1,"... 
parameter(s) not found in platform.xml for key %s (%s)\n",$key,$var); + exit(-1); + } + } + # merge platform parameter in bench hash + foreach $key (keys(%{$platformparam})) { + $param->{$key}=$platformparam->{$key} if(!exists($param->{$key})); + } + # adjust command + &substitute(\$command,$param); + + # copy src and execute cmd (e.g. make, configure) + if(!$execfound) { + return(undef) if(! ((-d $srcdir) && ($srcfiles) )); + + # copy files + printlog(1,"\t\t\t\t\t\t\tcopy files/dirs: %s\n",$srcfiles); + system "mkdir -p $dir/src"; + if(! -d "$dir/src") { printlog(-1,"... failed to create directory %s\n",$dir); return (undef);} + foreach $file (split('[, ]',$srcfiles)) { + if ($file =~ m/.*\.(tar|tar\.gz|tgz)/) { + $cmd="(cd $dir/src/; gunzip -c $opt_configdir/$srcdir/$file | tar xf -)"; + printlog(2,"\t\t\t\t\texecuting: %s\n",$cmd); + system($cmd); + if($?) { printlog(-1,"... failed to extract $file to %s\n","$dir/src"); return (undef);} + } + else + { + $cmd="cp -rp $srcdir/$file $dir/src/"; + printlog(2,"\t\t\t\t\t\t\texecuting: %s\n",$cmd); + system($cmd); + if($?) { printlog(-1,"... failed to copy file $file to %s\n","$dir/src"); return (undef);} + } + } + + # substitute parameters + $rc=&substitute_files($cref->{substitute},$param,"$dir/src"); + + # execute compile command + &substitute(\$command,$param); + $cmd="(cd $dir/src; $command 1>$dir/compile_out.log 2>$dir/compile_err.log)"; + printlog(0,"\t\t\t\t\t\t\texecuting compile command: %s\n",$cmd); + system($cmd); + if($?) { printlog(-1,"... failed to compile\n",""); return (undef);} + + # save version of executable + $cmd="cp -p $execnamepath $pwd/run/$execname"; + printlog(1,"\t\t\t\t\t\t\texecuting: %s\n",$cmd); + system($cmd); + if($?) { printlog(-1,"... failed to copy file %s to %s\n","$execnamepath","$pwd/run/$execname"); return (undef);} + $generatedexecutables{$execname}=1; + + # generate XML compile description + # if(! open(CPROTO,"> $pwd/run/${execname}.xml") ) { + # printlog(-1,"... 
failed to open protocol file %s\n","$pwd/run/${execname}.xml"); return (-1); + # } + + + } + + # generate log file information + $$cproto.=" $command\n"; + $$cproto.=" {$key}."\"\n"; + } + $$cproto.=" />\n"; + + + } elsif ($executable) { + # precompiled + my $ex= $executable->{name}; + my $dest=$executable->{destname}; + my $desc=$cref->{description}->[0]; + &substitute(\$dest,{"id" => "$identifier"}); + $cmd="cp -p $ex $dir/$dest"; + printlog(0,"\t\t\t\t\texecuting: %s\n",$cmd); + system($cmd); + if($?) { printlog(-1,"... failed to copy file $ex to $dir/$dest\n"); return (undef);} + $execnamepath="$dir/$dest"; + $$cproto.=" \n"; + if($desc) { + $$cproto.=" $desc\n"; + } + } else { + printlog(-1," problems to compile executable\n",""); return (undef); + $execnamepath=undef; + } + $$cproto.=" \n"; + +# copy executable to tmpdir if tmpdir is specified in the option section + +if($tmpdir_for_copy ne $cmpdir) +{ + my $archive = $identifier . ".tar"; + my $archive_zip = $archive . ".gz"; + + printlog(0," JUBE: tar source code: tar cf $archive %s \n",$archive, "./src"); + my $cmd = "(cd $dir; tar cf $archive ./src)"; + printlog(3,"\texecuting: %s\n",$cmd); + system($cmd); + if($?) { printlog(-1,"... failed to execute %s\n",$?); return (undef);} + + printlog(0," JUBE: zip tar file: gzip %s \n", $archive); + $cmd = "(cd $dir; gzip $archive)"; + printlog(3,"\executing: %s\n", $cmd); + system($cmd); + if($?) { printlog(-1,"... failed to execute %s\n", $cmd); return (undef);} + + printlog(0," JUBE: new directory: mkdir -p %s \n", "$tmpdir/run"); + $cmd = "(mkdir -p $tmpdir/run)"; + printlog(3,"\executing: %s\n", $cmd); + system($cmd); + if($?) { printlog(-1,"... failed to execute %s\n", $cmd); return (undef);} + + printlog(0," JUBE: new File: cp -p %s %s\n", $execnamepath, "$tmpdir/run"); + $cmd = "(cp -p $execnamepath $tmpdir/run)"; + printlog(3,"\executing: %s\n", $cmd); + system($cmd); + if($?) { printlog(-1,"... 
failed to execute %s\n", $cmd); return (undef);}

    # last step of the tmpdir handling: copy the gzipped source archive
    # to $tmpdir_for_copy
    printlog(0," JUBE: new File: cp -p %s %s\n", $archive_zip, "$tmpdir_for_copy");
    $cmd = "(cd $dir ; cp -p $archive_zip $tmpdir_for_copy)";
    # "\t" instead of "\e": "\e" is the ESC character and swallowed the
    # leading "e" of "executing"
    printlog(3,"\texecuting: %s\n", $cmd);
    system($cmd);
    if($?) { printlog(-1,"... failed to execute %s\n", $cmd); return (undef);}
}

return($execnamepath);
}

# prepare($prepareref,$dir,$identifier,$cname,$parmhash)
#
# Runs the prepare step $cname (parameters substituted first) found in
# $prepareref->{prepare}:
#   1. creates the sub directories listed under "mkdir" below $dir
#   2. runs the optional "precommand"
#   3. copies the input files to $dir
#   4. applies the file substitutions (substitute_files)
#   5. runs the optional "command"
# stdout/stderr of the shell commands go to $dir/prepare_*_out.log and
# $dir/prepare_*_err.log.
# Side effect: $parmhash->{outdir} and $parmhash->{identifier} are set
# whenever a mkdir, precommand or command is processed.
# Returns -2 if the step is unknown, -1 if an input file could not be
# copied and undef if a shell command failed.
sub prepare {
    my($prepareref,$dir,$identifier,$cname,$parmhash)=@_;
    my($cmd,$file,$rc);

    &substitute(\$cname,$parmhash);
    my $pref=$prepareref->{prepare}->{$cname};
    if (!$pref) { printlog(-1,"... no prepare step '%s' found in prepare.xml\n",$cname); return (-2);}

    my $inpfiles=$pref->{input}->[0]->{files};

#    printlog(-1,"%s",Dumper($pref));


    if($pref->{mkdir}) {
        # create sub directories
        my $subdir;
        printlog(1,"\t\t\t\t\t\t\tmkdirs: %s\n",join(" ",keys(%{$pref->{mkdir}})));
        foreach $subdir (keys(%{$pref->{mkdir}})) {
            my $cmd="(mkdir $dir/$subdir 1>$dir/prepare_mkdir_out.log 2>$dir/prepare_mkdir_err.log)";
            $parmhash->{outdir}=$dir;
            $parmhash->{identifier}=$identifier;
            &substitute(\$cmd,$parmhash);
            printlog(2,"\t\t\t\t\t\t\t\t executing: %s\n",$cmd);
            system($cmd);
            if($?) { printlog(-1,"... failed to execute %s rc=%s\n",$cmd,$?); return (undef);}
        }
    }

    # The blank test has to look at the command text (element 0); applied to
    # the array reference itself it was always true.
    if(($pref->{precommand}) && ($pref->{precommand}->[0]) && !ref($pref->{precommand}->[0]) && (($pref->{precommand}->[0])!~/^\s*$/)) {
        # execute precommand
        my $command=$pref->{precommand}->[0];
        $command=~s/\n/ /gs;
        my $cmd="($command 1>$dir/prepare_precmd_out.log 2>$dir/prepare_precmd_err.log)";
        $parmhash->{outdir}=$dir;
        $parmhash->{identifier}=$identifier;
#        printlog(-1,"%s",Dumper($parmhash));
        $rc=&substitute(\$cmd,$parmhash);
        printlog(1,"\t\t\t\t\t\t\texec. prep precommand: %s\n",$command);
        printlog(3,"\t\t prepare substitute param %s not found for cmd=>%s<\n",$SUBSTITUTE_NOTFOUND,$cmd,$rc) if($rc==-1);
        printlog(2,"\t\t\t\t\t\t executing: %s\n",$cmd);
        system($cmd);
        if($?) { printlog(-1,"... failed to execute %s rc=%s\n",$cmd,$?); return (undef);}
    }


    if($inpfiles) {
        # copy input files (list separated by comma or blank)
        printlog(1,"\t\t\t\t\t\t\tprep input files: %s\n",$inpfiles);
        foreach $file (split('[, ]',$inpfiles)) {
            &substitute(\$file,$parmhash);
            $cmd="cp -rp $file $dir/";
            printlog(2,"\t\t\t\t\t\t\t executing: %s\n",$cmd);
            system($cmd);
            if($?) { printlog(-1,"... failed to copy file %s to %s\n",$file,$dir); return (-1);}
        }
    }
    if($pref->{substitute}) {
        # substitute parameters
        $rc=&substitute_files($pref->{substitute},$parmhash,$dir);
    }

    # same blank test fix as for the precommand
    if(($pref->{command}) && ($pref->{command}->[0]) && !ref($pref->{command}->[0]) && (($pref->{command}->[0])!~/^\s*$/)) {
        # execute command
        my $command=$pref->{command}->[0];
        $command=~s/\n/ /gs;
        my $cmd="($command 1>$dir/prepare_cmd_out.log 2>$dir/prepare_cmd_err.log)";
        $parmhash->{outdir}=$dir;
        $parmhash->{identifier}=$identifier;
#        printlog(-1,"%s",Dumper($parmhash));
        $rc=&substitute(\$cmd,$parmhash);
        printlog(1,"\t\t\t\t\t\t\texec. prep command: %s\n",$command);
        printlog(3,"\t\t prepare substitute param %s not found for cmd=>%s<\n",$SUBSTITUTE_NOTFOUND,$cmd,$rc) if($rc==-1);
        printlog(2,"\t\t\t\t\t\t executing: %s\n",$cmd);
        system($cmd);
        if($?) { printlog(-1,"... failed to execute %s rc=%s\n",$cmd,$?); return (undef);}
    }
}


sub execute {
    my($executeref,$dir,$identifier,$subid,$executable,$estepref,$parmhash,$lastcommandref)=@_;
    my($cmd,$from,$to,$lparam,$envstr,$envvar,$file,$rc,$var,$cname,$key);

    $cname=$estepref->{'cname'};

    # parameters of cstep in top level xml file
    foreach $key (sort(keys(%$estepref))) {
        next if($key eq "cname");
        $var=$estepref->{$key};
        $parmhash->{$key}=$var;
    }

#    printlog(-1,"in execute: %s",Dumper($parmhash));


    &substitute(\$cname,$parmhash);
    my $eref=$executeref->{execute}->{$cname};
    if (!$eref) { printlog(-1,"... 
no execution step '%s' found in execute.xml\n",$cname); return (-2);}

    my $inpfiles=$eref->{input}->[0]->{files};
    my $envref=$eref->{environment}->[0]->{env};
    my $command     =$eref->{command}->[0];
    my $lastcommand =$eref->{lastcommand}->[0];

    # location of the job's stdout/stderr log files
    if($defer_stdout_to_tmpdir) {
        $parmhash->{logdir}=$tmplogdir;
        $parmhash->{stdoutlogfile}=sprintf("%s/%s.%s_stdout.log",$tmplogdir,$identifier,$subid);
        $parmhash->{stderrlogfile}=sprintf("%s/%s.%s_stderr.log",$tmplogdir,$identifier,$subid);
    } else {
        $parmhash->{logdir}=$stdlogdir;
        $parmhash->{stdoutlogfile}=sprintf("%s/%s.%s_stdout.log",$stdlogdir,$identifier,$subid);
        $parmhash->{stderrlogfile}=sprintf("%s/%s.%s_stderr.log",$stdlogdir,$identifier,$subid);
    }

    $parmhash->{outdir}=$dir;
    $parmhash->{id}=$identifier;
    $parmhash->{subid}=$subid;
    $parmhash->{executable}=$executable;

#    printlog(-1,Dumper($eref));

    # copy input files (list separated by comma or blank) to $dir
    printlog(1,"\t\t\t\t\t\t\tcopy files: %s\n",$inpfiles);
    foreach $file (split('[, ]',$inpfiles)) {
        &substitute(\$file,$parmhash);
        $cmd="cp -rp $file $dir/";
        printlog(2,"\t\t\t\t\t\t\t executing: %s\n",$cmd);
        system($cmd);
        # printlog croaks on error messages without parameters, so the names
        # are passed as parameters; otherwise the return below is never reached
        if($?) { printlog(-1,"... failed to copy file %s to %s\n",$file,$dir); return (-1);}
    }

    # environment settings are offered to the file substitution as $env
    # (one "export NAME="value"" line per variable) and cleared afterwards
    $envstr="";
    foreach $envvar (keys(%{$envref})) {
        my $envval=$envref->{$envvar}->{'value'};
        &substitute(\$envval,$parmhash);
        $envstr.="export $envvar=\"$envval\"\n";
    }
    $parmhash->{env}=$envstr;
    $rc=&substitute_files($eref->{substitute},$parmhash,$dir);
    $parmhash->{env}="";

    # execute command
    if($command) {
        &substitute(\$command,$parmhash);
        $cmd="(cd $dir; $command 1>$dir/execute_out.log 2>$dir/execute_err.log)";
        printlog(0,"\t\t\t\t\t\t\t-> submit job command: %s %s\n",($opt_debug)?"[debug]":"",$command);
        system($cmd) if (!$opt_debug);
        # in debug mode nothing was run, so $? would belong to an earlier command
        if(!$opt_debug && $?) { printlog(-1,"... 
failed to execute %s\n",$command); return (-1);}
    }

    # The optional "lastcommand" is only assembled here (same cd/redirect
    # wrapper as the submit command) and handed back to the caller through
    # $lastcommandref; it is not run in this sub.
    if($lastcommand) {
        &substitute(\$lastcommand,$parmhash);
        $$lastcommandref="(cd $dir; $lastcommand 1>$dir/execute_out.log 2>$dir/execute_err.log)";
    }

}

# pproto_open($protofile,$benchref)
#
# Creates (or truncates) the protocol file $protofile and writes the opening
# lines including the start time.  The handle PROTO is not closed here.
# Returns -1 if the file cannot be opened.
# NOTE(review): $help1/$help2 (benchmark name and platform) are fetched but
# not used by the print statements as they appear here; the string literals
# look as if their markup got lost in this copy -- confirm against the
# upstream source before relying on the output format.
sub pproto_open {
    my($protofile,$benchref)=@_;
    my($startdate,$help1,$help2);
    if(! open(PROTO,"> $protofile") ) {
        printlog(-1,"... failed to open protocol file %s\n",$protofile); return (-1);
    }

    $help1= $benchref->{'name'};
    $help2 = $benchref->{'platform'};

    print PROTO "\n";
    $startdate=localtime(time());
    print PROTO " $startdate\n";
}

# pproto($protofile,$str)
#
# Appends $str to the protocol file; PROTO is re-opened in append mode on
# every call.  Returns -1 if the file cannot be opened.
sub pproto {
    my($protofile,$str)=@_;
    if(! open(PROTO,">> $protofile") ) {
        printlog(-1,"... failed to open protocol file %s\n",$protofile); return (-1);
    }

    print PROTO $str;
}


# pproto_close($protofile,$benchref)
#
# Appends the end time and the closing line to the protocol file.
# $benchref is not used; $enddate is not declared with my in this sub.
# Returns -1 if the file cannot be opened.
sub pproto_close {
    my($protofile,$benchref)=@_;

    if(! open(PROTO,">> $protofile") ) {
        printlog(-1,"... failed to open protocol file %s\n",$protofile); return (-1);
    }
    $enddate=localtime(time());
    print PROTO " $enddate\n";
    print PROTO "\n";
}

# getsequence($str,$benchref)
#
# Expands a parameter specification into the list of its values:
#   - "func(args)" with func being readperm, factorperm, factorpermbound,
#     iterator or predefinedparams: the list returned by that sub, called
#     with the comma separated args followed by $benchref
#   - otherwise: comma separated list; items containing "a..b" are expanded
#     to the integers a..b, all other items are taken as they are
# Returns nothing as long as $str still contains a "$".
sub getsequence {
    my($str,$benchref)=@_;
    my($spec,$i);
    my (@sequence,@result);
    # only if evaluated
    return if ($str=~/\$/);
    if($str=~/^$patwrd\($patwrd\)$/) {
        my ($func,$parms)=($1,$2);
#        print "getsequence: $func $parms\n";
#        printlog(-1,"%s",Dumper($benchref));
        @result=&readperm(split(/\s*,\s*/,$parms),$benchref) if ($func eq "readperm");
        @result=&factorperm(split(/\s*,\s*/,$parms),$benchref) if ($func eq "factorperm");
        @result=&factorpermbound(split(/\s*,\s*/,$parms),$benchref) if ($func eq "factorpermbound");
        @result=&iterator(split(/\s*,\s*/,$parms),$benchref) if ($func eq "iterator");
        @result=&predefinedparams(split(/\s*,\s*/,$parms),$benchref) if ($func eq "predefinedparams");
    } else {
        if($str!~/\,/) {
            push(@sequence,$str);
        } else {
            @sequence=split(/,/,$str);
        }
        foreach $spec (@sequence) {
            if($spec=~/(\d+)\.\.(\d+)/) {
                my($a,$e)=($1,$2);
                for($i=$a;$i<=$e;$i++) {
push(@result,$i);
                }
            } else {
                push(@result,$spec);
            }
        }
    }
#    print "debug: getsequence >$str< -> $result[0]:$result[1]:$result[2]\n";
    return(@result);
}

# readperm($maptagname,$x,$y,$z,$t,$n,$benchref)
#
# Looks up the entry for $n tasks in the "map" table of
# $benchref->{$maptagname}->[0] and returns those of its blank separated
# "px:py:pz:pt" specs whose product is $n and whose components divide
# $x, $y, $z and $t respectively.  Missing table or entry: error message and
# empty list.
sub readperm {
    my($maptagname,$x,$y,$z,$t,$n,$benchref)=@_;
    my(@result);
    if($benchref->{$maptagname} && $benchref->{$maptagname}->[0]->{"map"}) {
        my $ref=$benchref->{$maptagname}->[0]->{"map"};
        if($ref->{$n}) {
            my $spec;
            foreach $spec (split(/\s+/,$ref->{$n}->{"spec"})) {
                my ($px,$py,$pz,$pt)=split(/:/,$spec);
                # the product test comes first, so a zero or missing
                # component never reaches the modulo operations
                if( ($px*$py*$pz*$pt == $n)
                    && ($x % $px == 0)
                    && ($y % $py == 0)
                    && ($z % $pz == 0)
                    && ($t % $pt == 0)
                    )
                {
                    push(@result,$spec);
                }
            }
        } else {
            printlog(-1,"... no mapping entry found for $maptagname and tasknumber=%s\n",$n);
        }
    } else {
        printlog(-1,"... no mapping entry found for %s\n",$maptagname);
    }
    return(@result);
}

# iterator($start,$end,$op,$benchref)
#
# Returns the values start, start op x, (start op x) op x, ... where $op is
# "+x", "*x", "-x" or "/x":
#   "+x" : while value <= $end
#   "*x" : while value <  $end
#   "-x" : while value >= $end
#   "/x" : while value >= $end
# Any other $op gives an empty list; $benchref is not used.
# The sequence also stops as soon as the operation no longer moves the value
# towards $end (step 0, factor 1, wrong sign, non-numeric operand, division
# by 0); such input used to loop forever or to die.
# NOTE(review): "*x" excludes $end while the other operations include it --
# kept as found, confirm that this is intended.
sub iterator {
    my($start,$end,$op,$benchref)=@_;
    my(@result,$val,$opr,$next);
    if($op=~/[+]$patfp/) {
        $opr=$1;
        $val=$start;
        while($val<=$end) {
            push(@result,$val);
            $next=$val+$opr;
            last if($next<=$val);
            $val=$next;
        }
    } elsif($op=~/[*]$patfp/) {
        $opr=$1;
        $val=$start;
        while($val<$end) {
            push(@result,$val);
            $next=$val*$opr;
            last if($next<=$val);
            $val=$next;
        }
    } elsif($op=~/[-]$patfp/) {
        $opr=$1;
        $val=$start;
        while($val>=$end) {
            push(@result,$val);
            $next=$val-$opr;
            last if($next>=$val);
            $val=$next;
        }
    } elsif($op=~/[\/]$patfp/) {
        $opr=$1;
        $val=$start;
        while($val>=$end) {
            push(@result,$val);
            last if($opr==0);
            $next=$val/$opr;
            last if($next>=$val);
            $val=$next;
        }
    } else {
        @result=();
    }
    return(@result);
}

# subroutine to read in predefined parameter sets, as a function of tasks and tagname
# args: - xml target tag name
#       - number of tasks
#       - request, i.e.
#         which parameter shall be read in
#       - reference to bench
# Returns the value of $request in the "predefparam" entry of
# $benchref->{$paramtagname}->[0] whose "tasks" equals $tasks (the last one
# if several match), undef if there is none.
sub predefinedparams {
    my($paramtagname, $tasks, $request, $benchref)=@_;
    my $result;

    printlog(5,"predefined parameter call: $paramtagname, $tasks, $request\n", 0);

    if($benchref->{$paramtagname} && $benchref->{$paramtagname}->[0]->{"predefparam"}) {
#        print Dumper($benchref->{$paramtagname}->[0]->{"predefparam"});
        my $parafound = 0;
        foreach my $ref (@{$benchref->{$paramtagname}->[0]->{"predefparam"}}) {
            # defined() instead of a truth test: a predefined value of 0 is
            # a valid value and must not be reported as missing
            if( defined($ref->{$request}) && $ref->{"tasks"}==$tasks)
            {
                $result=$ref->{$request};
                $parafound = 1;
            }
        }
        if($parafound == 0)
        {
            printlog(-1, "... could not find request $request in predefined params ($paramtagname) for $tasks tasks\n",0);
        }
        else
        {
            printlog(5, "request: $request; value: $result\n", 0);
        }

    }
    else {
        printlog(-1,"... no mapping entry found for %s\n",$paramtagname);
    }
    return $result;
}


# factorperm($x,$y,$z,$t,$n,$benchref)
#
# Returns all "b1:b2:b3:b4" with b1*b2*b3*b4 == $n where every b is a
# divisor of $n between 2 and $n/2 and b1, b2, b3, b4 divide $x, $y, $z, $t
# respectively.  $benchref is not used.
# Result order is the order of the nested loops (b1 outermost), factors
# ascending.
sub factorperm {
    my($x,$y,$z,$t,$n,$benchref)=@_;
    my(@result,@factors);

    # factors of $n
    for (my $i = 2; $i <= $n / 2; $i++) {
        push(@factors,$i) if ($n % $i == 0);
    }

    # matches; a base that does not divide its extent is dropped before the
    # inner loops are entered
    for my $base1 (@factors) {
        next if (($x % $base1) != 0);
        for my $base2 (@factors) {
            next if (($y % $base2) != 0);
            for my $base3 (@factors) {
                next if (($z % $base3) != 0);
                for my $base4 (@factors) {
                    next if (($t % $base4) != 0);
                    push(@result, "$base1:$base2:$base3:$base4")
                        if ($base1 * $base2 * $base3 * $base4 == $n);
                }
            }
        }
    }
    return(@result);
}

sub factorpermbound {
    my($x,$y,$z,$t,$n,$benchref)=@_;
    my(@result);
    my (@factors, $i, $limit);
    # factors of $n
for (my $i = 2; $i <= $n; $i++) {
        last if $i > ($n / 2);
        if ($n % $i == 0)  {
            push @factors, $i;
        }
    }
    # in contrast to factorperm the bounds 1 and $n are allowed as well
    push(@factors,1);
    # for $n == 1 the bound $n is the 1 just added; adding it twice reported
    # "1:1:1:1" sixteen times
    push(@factors,$n) if ($n != 1);

    # matches; a base that does not divide its extent is dropped before the
    # inner loops are entered
    for my $base1 (@factors) {
        next if (($x % $base1) != 0);
        for my $base2 (@factors) {
            next if (($y % $base2) != 0);
            for my $base3 (@factors) {
                next if (($z % $base3) != 0);
                for my $base4 (@factors) {
                    next if (($t % $base4) != 0);
                    push(@result, "$base1:$base2:$base3:$base4")
                        if ($base1 * $base2 * $base3 * $base4 == $n);
                }
            }
        }
    }
    return(@result);
}

# substitute_files($substhashref,$parmhash,$dir)
#
# For every entry of $substhashref (key: input file name, value: hash with
# "outfile" and the replacement table "sub"):
#   - substitute parameters in both file names
#   - read $dir/infile completely
#   - replace every pattern of the table by its "to" text (parameters
#     substituted); the table is applied twice so that replacement texts may
#     contain patterns themselves
#   - write the result to $dir/outfile
# The table keys are used as regular expressions.
# Returns -1 if a file cannot be opened.
sub substitute_files {
    my($substhashref,$parmhash,$dir)=@_;
    my($subst,$data,$from,$to,$nc);

    foreach $subst (keys(%{$substhashref})) {
        my $infile=$subst;
        my $outfile=$substhashref->{$subst}->{outfile};
        &substitute(\$infile,$parmhash);
        &substitute(\$outfile,$parmhash);

        printlog(2,"\t\t\t\t\t\t\t sub: %s -> %s\n",$infile,$outfile);

        # read infile
        $data="";
        if(! open(IN,"<","$dir/$infile") ) {
            printlog(-1,"... failed to open input file to %s/%s\n",$dir,$infile); return (-1);
        }
        # the readline on IN was missing here, which made this an endless loop
        while (<IN>) {
            $data.=$_;
        }
        close(IN);

        # substitute params
        my $subhash=$substhashref->{$subst}->{'sub'};
        foreach $from (keys(%$subhash)) {
            $to=$subhash->{$from}->{to};
            &substitute(\$to,$parmhash);
            $nc= $data=~s/$from/$to/gs;
            # log at most 80 characters of the replacement text
            my $tto=substr($to,0,80);
            $tto.="..."
if(length($to)>80);
            $tto=~s/\n/ /gs;
            printlog(2,"\t\t\t\t\t\t(1) #%02d %-20s -> %s\n",$nc,$from,$tto);
        }
        # second substitute step for recursive definitions
        foreach $from (keys(%$subhash)) {
            $to=$subhash->{$from}->{to};
            &substitute(\$to,$parmhash);
            $nc= $data=~s/$from/$to/gs;
            # log at most 80 characters of the replacement text, on one line
            my $tto=substr($to,0,80);
            $tto.="..." if(length($to)>80);
            $tto=~s/\n/ /gs;
            printlog(2,"\t\t\t\t\t\t(2) #%02d %-20s -> %s\n",$nc,$from,$tto);
        }

        #write outfile
        if(! open(OUT,"> $dir/$outfile") ) {
            printlog(-1,"... failed to open output file to %s/%s\n",$dir,$outfile); return (-1);
        }
        print OUT $data;
        close(OUT);
    }
}

# substitute($strref,$hashref)
#
# Replaces, in the string referenced by $strref:
#   1. $name       by $hashref->{name}, longest names first so that a name
#                  which is the prefix of another one does not match early
#   2. ${name}     the same way
#   3. `expr`      by the result of evaluating expr as Perl code;
#                  `getstdout(cmd)` runs cmd through the shell instead and
#                  takes its output with newlines turned into blanks.
#                  This step stops at the first expression with an empty
#                  result.
# An empty string is turned into a single blank first.
# Returns 1 if at least one variable was replaced, 0 if none was, and -1 if
# variables are left over at the end; their names are then stored in
# $SUBSTITUTE_NOTFOUND.
# NOTE(review): step 3 uses string eval and backticks on the string content,
# so the strings passed in must come from a trusted source.
sub substitute {
    my($strref,$hashref)=@_;
    my($found,$c,@varlist1,@varlist2,$var);
    $c=0;
    $found=0;

#    return(0) if($$strref eq "");
    $$strref=" " if($$strref eq "");

    # search normal variables
    @varlist1=($$strref=~/\$([^\{\[\$\\\s\.\,\*\/\+\-\\\`\(\)\'\?\:\;\}]+)/g);
    foreach $var (sort {length($b) <=> length($a)} (@varlist1)) {
        if(exists($hashref->{$var})) {
            my $val=$hashref->{$var};
            $$strref=~s/\$$var/$val/egs;
            printlog(5," substitute var1: %s = %s\n",$var,$val);
            $found=1;
        }
    }

    # search variables in following form: ${name}
    @varlist2=($$strref=~/\$\{([^\{\[\$\\\s\.\,\*\/\+\-\\\`\(\)\'\?\:\;\}]+)\}/g);
    foreach $var (sort {length($b) <=> length($a)} (@varlist2)) {
        if(exists($hashref->{$var})) {
            my $val=$hashref->{$var};
            $$strref=~s/\$\{$var\}/$val/egs;
            printlog(5," substitute var2: %s = %s\n",$var,$val);
            $found=1;
        }
    }

    # search eval strings (`...`)
    # (.*) is greedy, so the last `...` pair of the string is handled first
    while($$strref=~/^(.*)(\`(.*?)\`)(.*)$/) {
        my ($before,$evalall,$evalstr,$after)=($1,$2,$3,$4);
        my($val,$executeval);
        $val=undef;

        # getstdout(cmd): output of the shell command
        if($evalstr=~/^\s*getstdout\((.*)\)\s*$/) {
            $executeval=$1;
            eval("{\$val=`$executeval`}");
            $val=~s/\n/ /gs;
        }
        # otherwise: value of the expression, first as right hand side of an
        # assignment, then as result of the evaluated block
        if(!defined($val)) {
            eval("{\$val=$evalstr;}");
        }
        if(!defined($val)) {
            $val=eval("{$evalstr;}");
        }
        $val="" if(!defined($val));
        if($val ne "") {
            $$strref=$before.$val.$after;
        } else {
            # nothing to insert: leave the expression in place and stop
            last;
        }
        printlog(5," eval %s -> %s >%s<\n",$val,$$strref,$evalall);
    }

    # search for variables which could not be substitute
    @varlist1=($$strref=~/\$([^\{\[\$\\\s\.\,\*\/\+\-\\\`\(\)\'\?\:\;\}]+)/g);
    @varlist2=($$strref=~/\$\{([^\{\[\$\\\s\.\,\*\/\+\-\\\`\(\)\'\?\:\;\}]+)\}/g);
    if ( (@varlist1) || 
(@varlist2) ) {
        # remember the names of the unresolved variables for the caller
        $SUBSTITUTE_NOTFOUND=join(',',@varlist1,@varlist2);
        $found=-1;
        printlog(5," unknown vars in %s: %s\n",$$strref,$SUBSTITUTE_NOTFOUND);
    }
    return($found);
}

# substitute_ori($strref,$hashref)
#
# Replaces one variable per pass ($name first, ${name} when no $name is
# left) by its value from $hashref; at most 10 passes.  When no variable is
# left, one `expr` in the string is replaced by the result of evaluating
# expr as Perl code.
# Returns 0 when no variable was left, -1 when a variable is not in $hashref
# (its name is stored in $SUBSTITUTE_NOTFOUND) and 1 when the pass limit
# ended the loop.
# NOTE(review): no call of this sub is visible in this part of the file;
# judging by the name it is the predecessor of substitute -- confirm before
# removing.
sub substitute_ori {
    my($strref,$hashref)=@_;
    my($found,$c);
    $found=1;
    $c=0;
#    print "debug: $val -> '$$strref', >$lparam<\n";
    while($found>0) {
        $c++; last if($c>10);
        # variable replacement
        # (the leading .* is greedy: the last $name of the string is taken)
        if($$strref=~/^.*\$([^\{\[\$\\\s\.\,\*\/\+\-\/\\\`\(\)\'\?\:\;]+).*$/) {
            my $lparam=$1;
#            print "debug: -> '$$strref', >$lparam<\n";
            if(exists($hashref->{$lparam})) {
                my $val=$hashref->{$lparam};
                $$strref=~s/\$$lparam/$val/es;
#                print "debug: val = >$val<\n";
                $found=1;
            } else {
                # eval string could also handle variables
                if(($$strref!~/^(.*)(\`(.*)\`)(.*)$/)) {
                    $SUBSTITUTE_NOTFOUND=$lparam;$found=-1;
                }
            }
            # ${..}
        } elsif ($$strref=~/^.*\$\{([^\}]+)\}.*$/) {
            my $lparam=$1;
#            print "debug: {}-> '$$strref', >$lparam<\n";
            if(exists($hashref->{$lparam})) {
                my $val=$hashref->{$lparam};
                $$strref=~s/\$\{$lparam\}/$val/es;
#                print "debug: {} val = >$val<\n";
                $found=1;
            } else {$SUBSTITUTE_NOTFOUND=$lparam;$found=-1;}

        } else {$found=0;}
    }
    if($found==0) {
        # all variables resolved: evaluate the `...` expression
        if($$strref=~/^(.*)(\`(.*)\`)(.*)$/) {
            my ($before,$evalall,$evalstr,$after)=($1,$2,$3,$4);
            my($val);
            eval("{\$val=$evalstr;}");
            $val="" if(!defined($val));
            $$strref=$before.$val.$after;
#            print "debug: $val -> '$$strref', >$evalall<\n";
        }
    }
    return($found);
}

# printlog($level,$format,@parms)
#
# printf-like logging.  The first occurrence of the path $pwd in the format
# and in every non-empty parameter is replaced by the text "$PWD".
#   level -1 : error message, always written to STDERR and, with the prefix
#              "ERROR: ", to BENCHLOG
#   other    : only when $opt_verbose is defined; written to STDOUT if
#              $opt_verbose >= $level
# Croaks with "error in printlog" when the format or the parameter list is
# empty (for levels other than -1 only if $opt_verbose is defined), so every
# call needs at least one parameter.
sub printlog {
    my($level,$format,@parms)=@_;
    my($i);
    my $PWD='$PWD';
    $format=~s/$pwd/$PWD/es;

#    print "printlog: :$level:$format ",caller(),"\n";
    for($i=0;$i<=$#parms;$i++) {
        next if($parms[$i] eq "");
        $parms[$i]=~s/$pwd/$PWD/es;
    }

    if($level==-1) {
        # error message
        croak "error in printlog" if(!$format);
        croak "error in printlog" if(!@parms);
        printf (STDERR $format,@parms);
        printf(BENCHLOG "ERROR: $format",@parms);
    } else {
        if(defined($opt_verbose)) {
            if($opt_verbose>=$level) {
my $str=sprintf($format,@parms);

                # lines on the screen are cut at $loglinelength characters
                if(length($str)>=$loglinelength) {
                    print STDOUT substr($str,0,$loglinelength)," ...\n";
                } else {
                    print STDOUT $str;
                }
            }
            croak "error in printlog" if(!@parms);
            croak "error in printlog" if(!$format);
#            print "debug: >$format<>@parms<>$opt_verbose>=$logfilelevel<\n";
            printf(BENCHLOG $format,@parms) if($opt_verbose>=$logfilelevel);
        }
    }
}

# get_identifier()
#
# Reads the last used identifier from the first line of $idfile (0 if the
# file does not exist, cannot be read or is empty), increments it, writes
# the new value back to $idfile and returns it.
# A file that cannot be opened is reported with printlog; the new identifier
# is returned nevertheless.
sub get_identifier {
    my($id);
    $id=0;
    if(-f $idfile) {
        if(open(IN,"<",$idfile)) {
            $id=<IN>;
            close(IN);
            if(defined($id)) {
                chomp($id);
            } else {
                $id=0;
            }
        } else {
            printlog(-1,"... failed to open id file %s: %s\n",$idfile,$!);
        }
    }
    $id++;
    if(open(OUT,">",$idfile)) {
        print OUT $id,"\n";
        close(OUT);
    } else {
        printlog(-1,"... failed to write id file %s: %s\n",$idfile,$!);
    }

    return($id);
}

# get_identifier_ro()
#
# Like get_identifier, but read-only: returns the identifier the next call
# of get_identifier would hand out and leaves $idfile untouched.
sub get_identifier_ro {
    my($id);
    $id=0;
    if(-f $idfile) {
        if(open(IN,"<",$idfile)) {
            $id=<IN>;
            close(IN);
            if(defined($id)) {
                chomp($id);
            } else {
                $id=0;
            }
        } else {
            printlog(-1,"... failed to open id file %s: %s\n",$idfile,$!);
        }
    }
    $id++;

    return($id);
}

# testspec($id,$spec)
#
# Tests the number $id against the specification $spec:
#   "n"    : $id == n
#   "n+"   : $id >= n
#   "n-"   : $id <= n
#   "a..b" : a <= $id <= b
# Returns a true value if $id matches, a false value otherwise (also for any
# other form of $spec).
sub testspec {
    my($id,$spec)=@_;
    my $rc=0;

    if($spec=~/^(\d+)$/) {
        $rc=($id==$1);
    }
    if($spec=~/^(\d+)\+$/) {
        $rc=($id>=$1);
    }
    if($spec=~/^(\d+)\-$/) {
        $rc=($id<=$1);
    }
    if($spec=~/^(\d+)\.\.(\d+)$/) {
        $rc=(($id>=$1) && ($id<=$2));
    }

    return($rc);
}

# min($a,$b): numerically smaller one of the two values ($b if equal)
sub min {
    my($a,$b)=@_;
    if ($a<$b) {
        return($a);
    } else {
        return($b);
    }
}

# max($a,$b): numerically larger one of the two values ($b if equal)
sub max {
    my($a,$b)=@_;
    if ($a>$b) {
        return($a);
    } else {
        return($b);
    }
}


# version(): prints the version string and ends the program with status 0
sub version {
    print &getversion(); print "\n";
    exit(0);
}


# getversion(): returns the version string
sub getversion {
    my $text = "jube version 1.1p20";
    return $text;
}

# usage($name): dies with the usage message, $name being the program name
# NOTE(review): the argument placeholders behind several option names seem
# to be missing in this copy of the text -- compare with the upstream source.
sub usage {
    die "Usage: $_[0]

 -start, -submit * : submit new set of benchmark runs (defined in xml-file)
 -update + : scans for results of finished jobs
 -result + : shows results of benchmark runs (tables)
 -force + : force a rescan of benchmark output files for new results
 -cdir : directory containing the xml files
 (default: ./)
 -pdir : directory containing platforms definition XML files
 (default: ../platforms)
 -tmpdir : directory which is used for running the job in,
 please use only an absolute path
 (default: tmp in benchmark directory)
 -verbose level : verbose
 -dump : dump XML-file 
structure + -showall : shows all results, incl. failed and queued runs + -debug : don't submit jobs + -rmtmp : remove temp directory directly + -cmpdir : directory which is used for running the compile step in, + please use only an absolute path + -Version : prints out the current version + * : needs XML top level file + + : a range of benchmark run ids can be specified + +"; +} + diff --git a/qcd/part_cpu/bench/jube_data_dbhash.pm b/qcd/part_cpu/bench/jube_data_dbhash.pm new file mode 100644 index 0000000000000000000000000000000000000000..31c3fa1c783bb637295512da2beaf9892f3e0001 --- /dev/null +++ b/qcd/part_cpu/bench/jube_data_dbhash.pm @@ -0,0 +1,152 @@ +# $Source: /cvsroot/esa4-t3/DEISA_BENCH/bench/bench_data_dbhash.pm,v $ +# $Author: mahermanns $ +# $Revision: 1.1.1.1 $ +# $Date: 2007/08/07 06:49:12 $ +# +# +package jube_data_dbhash; +use strict; +use File::stat qw(:FIELDS); +use XML::Simple; +use Slurp; # Slurpt ganze Dateien in Variablen +use Data::Dumper; +use File::Listing; +use Time::HiRes qw ( time ); + +my $patint="([\\+\\-\\d]+)"; # Pattern for Integer number +my $patfp ="([\\+\\-\\d.E]+)"; # Pattern for Floating Point number +my $patwrd="([\^\\s]+)"; # Pattern for Work (all noblank characters) +my $patnint="[\\+\\-\\d]+"; # Pattern for Integer number, no () +my $patnfp ="[\\+\\-\\d.E]+"; # Pattern for Floating Point number, no () +my $patnwrd="[\^\\s]+"; # Pattern for Work (all noblank characters), no () +my $patbl ="\\s+"; # Pattern for blank space (variable length) + +$Data::Dumper::Indent=0; + +my($debug)=2; + +sub new { + my $self = {}; + my $proto = shift; + my $class = ref($proto) || $proto; + my($tstart,$tdiff); + printf("\t\tBench_Data_Dbhash: new %s\n",ref($proto)) if($debug>=3); + $self->{VERBOSE}=0; + $self->{BENCHDBREF}=0; + + $tstart=time; + $self->{XS}=XML::Simple->new(); + $tdiff=time-$tstart; + printf("WF: init XML-Parser in %6.4f sec\n",$tdiff); + + bless $self, $class; + return $self; +} + +sub load_db { + my($self) = shift; + 
my($tstart,$tdiff);
    my ($VAR1);
    if(-f "benchdb.dump" ) {
        $tstart=time;
        my $dump = slurp( "benchdb.dump" ) # read the contents of the Dumper file
            or die $!;
        # NOTE(review): the file contents are executed as Perl code; only
        # load dump files written by save_db.
        eval( $dump ); # turns it into a data structure again ($VAR1)
        $self->{BENCHDBREF} = $VAR1;
        $tdiff=time-$tstart;
        printf("WF: reading database in %6.4f sec\n",$tdiff);
    }
    return(1);
}

# save_db(): writes the database as Data::Dumper text to benchdb.dump in the
# current directory.  Returns 0 (after a warning) if the file cannot be
# opened.
sub save_db {
    my($self) = shift;
    my($tstart,$tdiff);
    $tstart=time;
    if(!open(DB,">","benchdb.dump")) {
        warn "WF: cannot write benchdb.dump: $!\n";
        return(0);
    }
    print DB Dumper($self->{BENCHDBREF});
    $tdiff=time-$tstart;
    printf("WF: dumping database in %6.4f sec\n",$tdiff);
    close(DB);
}

# load_db_from_xml(): reads the database from the XML file benchdb.dat in
# the current directory, if that file exists.
sub load_db_from_xml {
    my($self) = shift;
    my($tstart,$tdiff);
    if(-f "benchdb.dat" ) {
        $tstart=time;
        $self->{BENCHDBREF} = XMLin("benchdb.dat");
        $tdiff=time-$tstart;
        printf("WF: parsing benchdb.dat in %6.4f sec\n",$tdiff);
    }
}

# save_db_to_xml(): writes the database as XML to benchdb.dat in the current
# directory.  Returns 0 (after a warning) if the file cannot be opened.
sub save_db_to_xml {
    my($self) = shift;
    my $xml = XMLout($self->{BENCHDBREF});
    if(!open(DB,">","benchdb.dat")) {
        warn "WF: cannot write benchdb.dat: $!\n";
        return(0);
    }
    print DB $xml;
    close(DB);
}

# scan_for_new_benchmarks($dir)
#
# Parses every file of $dir whose name ends in ".log" and matches
# benchlog.*i<number>_ into the database (key: file name), unless the file
# is already stored with the same size and modification time.
# Dies if a file cannot be stat'ed.
sub scan_for_new_benchmarks {
    my($self) = shift;
    my($dir)=@_;
    my($tstart,$tdiff);
    my($rc,$fn,$afn,$n,$scanfile,$st);
    $n=0;
    $rc=opendir(DIR,$dir);
    if(!$rc) {
        warn "WF: cannot read directory $dir: $!\n";
        return;
    }
    while($fn=readdir(DIR)) {
        $afn="$dir/$fn";
        next if($fn!~/\.log$/);
        next if($fn!~/benchlog.*i$patint\_/);
        # File::stat with :FIELDS sets $st_size, $st_mtime, ...
        stat($afn) or die "No $afn: $!";
        $scanfile=1;
        if(exists($self->{BENCHDBREF}->{$fn})) {
            if(exists($self->{BENCHDBREF}->{$fn}{SIZE}) && exists($self->{BENCHDBREF}->{$fn}{MTIME})) {
                $scanfile=0;
                $scanfile=1 if($st_size != $self->{BENCHDBREF}->{$fn}{SIZE});
                $scanfile=1 if($st_mtime != $self->{BENCHDBREF}->{$fn}{MTIME});
            }
        }
        if($scanfile) {
            $n++;
            printf("WF[%4d]: %s/%s\n",$n,$dir,$fn);
            $tstart=time;
            $self->{BENCHDBREF}->{$fn}=$self->{XS}->XMLin("$dir/$fn", KeyAttr => { 'map' => "n",
                                                                                  'benchmark' => "+name"
                                                                                },
                                                           ForceArray => 1);
            # size and time stamp have to be stored after the entry has been
            # replaced by the parsed tree; stored before, they were
            # overwritten and every file was parsed again on each scan
            $self->{BENCHDBREF}->{$fn}{SIZE}=$st_size;
            $self->{BENCHDBREF}->{$fn}{MTIME}=$st_mtime;
            $tdiff=time-$tstart;
#            printf("WF: parsing $dir/$fn in %6.4f 
sec\n",$tdiff); + } + } +} + + +sub clean_statdata { + my($self) = shift; + my($tstart,$tdiff,$id); + foreach $id (keys(%{$self->{B_NAME}})) { delete($self->{B_NAME}->{$id}); } + foreach $id (keys(%{$self->{B_PLATFORM}})) { delete($self->{B_PLATFORM}->{$id}); } +} + +sub analyse { + my($self) = shift; + my($tstart,$tdiff); + my($key,$val); + while (($key,$val)=each %{$self->{BENCHDBREF}}) { + my $name=$val->{name}; + my $platform=$val->{platform}; + $self->{B_NAME}->{$name}++; + $self->{B_PLATFORM}->{$platform}++; + } +} + + +1; + + diff --git a/qcd/part_cpu/bench/jube_data_mysql.pm b/qcd/part_cpu/bench/jube_data_mysql.pm new file mode 100644 index 0000000000000000000000000000000000000000..c9f4f252677273f7a71abd6ea3a119d30669ed3c --- /dev/null +++ b/qcd/part_cpu/bench/jube_data_mysql.pm @@ -0,0 +1,329 @@ +# $Source: /cvsroot/esa4-t3/DEISA_BENCH/bench/bench_data_mysql.pm,v $ +# $Author: mahermanns $ +# $Revision: 1.1.1.1 $ +# $Date: 2007/08/07 06:49:12 $ +# +# +package jube_data_mysql; +use strict; +use Data::Dumper; +use File::stat qw(:FIELDS); +use XML::Simple; +use File::Listing; +use Time::HiRes qw ( time ); +use DBI; +$Data::Dumper::Indent = 1; + +my $patint="([\\+\\-\\d]+)"; # Pattern for Integer number +my $patfp ="([\\+\\-\\d.E]+)"; # Pattern for Floating Point number +my $patwrd="([\^\\s]+)"; # Pattern for Work (all noblank characters) +my $patnint="[\\+\\-\\d]+"; # Pattern for Integer number, no () +my $patnfp ="[\\+\\-\\d.E]+"; # Pattern for Floating Point number, no () +my $patnwrd="[\^\\s]+"; # Pattern for Work (all noblank characters), no () +my $patbl ="\\s+"; # Pattern for blank space (variable length) + + +my($debug)=2; + +sub new { + my $self = {}; + my $proto = shift; + my $class = ref($proto) || $proto; + my($tstart,$tdiff); + + $self->{HOST} = shift; + $self->{DATABASE} = shift; + $self->{USER} = shift; + $self->{PASSWD} = shift; + $self->{CONNECT} = "DBI:mysql:hostname=$self->{HOST}:database=$self->{DATABASE}"; + printf("\t\tBench_Data_mysql: 
new %s\n",ref($proto)) if($debug>=3); + $self->{VERBOSE}=0; + $tstart=time; + print "WF: >$self->{CONNECT}<>$self->{USER}<>$self->{PASSWD}<\n"; + $self->{DBH} = DBI->connect($self->{CONNECT},$self->{USER},$self->{PASSWD}) || die "Cannot connect to DB"; + $self->{XS}=XML::Simple->new(); + $tdiff=time-$tstart; + printf("WF: connect to database in %6.4f sec\n",$tdiff); + + bless $self, $class; + return $self; +} + +sub DESTROY { + my($self) = shift; + my($tstart,$tdiff); + $tstart=time; + $self->{DBH}->disconnect() || die "Cannot disconnect to DB"; + $tdiff=time-$tstart; + printf("WF: disconnect to database in %6.4f sec\n",$tdiff); +} + +sub load_db { + my($self) = shift; + + if(!$self->table_exists("STATUS")) { + $self->create_table("STATUS"); + $self->init_table("STATUS"); + + } + if(!$self->table_exists("BENCHRUNS")) { + $self->create_table("BENCHRUNS"); + } + if(!$self->table_exists("BENCHMARKS")) { + $self->create_table("BENCHMARKS"); + } + + return(1); +} + +sub save_db { + my($self) = shift; +} + + +sub scan_for_new_benchmarks { + my($self) = shift; + my($dir)=@_; + my($tstart,$tdiff); + my($rc,$fn,$afn,$n,$scanfile,$st,$xmltree,$id,$subid,$nr,$snr,$benchname); + my($NR,$ID,$FN,$FNSIZE,$MTIME); + $n=0; + + + my $sth = $self->{DBH}->prepare( q{ + SELECT NR,SUBID,FN,FNSIZE,MTIME FROM BENCHMARKS WHERE FN=?; + }) or die "Can't prepare statement: $DBI::errstr"; + + + $rc=opendir(DIR,$dir); + while($fn=readdir(DIR)) { + $afn="$dir/$fn"; + next if($fn!~/\.log$/); + next if($fn!~/benchlog.*i$patint\_/); + stat($afn) or die "No $afn: $!"; + + $scanfile=1; + # check if file is already in DB + my $rc = $sth->execute($fn) or die "Can't execute statement: $DBI::errstr"; + if(($NR,$ID,$FN,$FNSIZE,$MTIME) = $sth->fetchrow()) { + $scanfile=0 if (($FNSIZE==$st_size) && ($MTIME==$st_mtime)); + } + $sth->finish(); + + if($scanfile) { + $n++; +# printf("WF[%4d]: %s/%s\n",$n,$dir,$fn); + $tstart=time; + $xmltree=$self->{XS}->XMLin("$dir/$fn", KeyAttr => { 'map' => "n", + 
'benchmark' => "+name" + }, + ForceArray => 1); + $tdiff=time-$tstart; +# print Dumper($xmltree); + ($benchname)=(keys(%{$xmltree->{'benchmark'}})); + $id=$xmltree->{'benchmark'}->{$benchname}->{'identifier'}; + $subid=$xmltree->{'benchmark'}->{$benchname}->{'subid'}; + $nr=$self->add_to_benchruns($id,$fn,$st_size,$st_mtime); +# $snr=$self->add_to_benchmarks($subid,$fn,$st_size,$st_mtime); + printf("WF[%4d]: parsing $dir/$fn in %6.4f sec\n",$n,$tdiff); + } + } +} + + +sub clean_statdata { + my($self) = shift; + my($tstart,$tdiff,$id); + foreach $id (keys(%{$self->{B_NAME}})) { delete($self->{B_NAME}->{$id}); } + foreach $id (keys(%{$self->{B_PLATFORM}})) { delete($self->{B_PLATFORM}->{$id}); } +} + +sub analyse { + my($self) = shift; + my($tstart,$tdiff); + my($key,$val); + while (($key,$val)=each %{$self->{BENCHDBREF}}) { + my $name=$val->{name}; + my $platform=$val->{platform}; + $self->{B_NAME}->{$name}++; + $self->{B_PLATFORM}->{$platform}++; + } +} + + +##################################################################################################### +# Utility functions +##################################################################################################### + +sub table_exists { + my($self) = shift; + my($tablename)=@_; + my($table,$tablespace,$found); + my $sth = $self->{DBH}->prepare( q{ + show tables; + }) or die "Can't prepare statement: $DBI::errstr"; + + my $rc = $sth->execute() or die "Can't execute statement: $DBI::errstr"; + + $found=0; + while(($table) = $sth->fetchrow()) { + print "WF: table_exists: >$table<\n"; + $found=1 if($table eq $tablename); + } + $sth->finish(); + return($found); +} + +sub create_table { + my($self) = shift; + my($tablename)=@_; + my($createstmt); + my($tstart,$tdiff); + + $createstmt=qq{ + CREATE table STATUS + ( + NAME VARCHAR(20) NOT NULL, + VAL INTEGER(12) NOT NULL + ) + } if ($tablename eq "STATUS"); + $createstmt=qq{ + CREATE table BENCHRUNS + ( + NR INTEGER(12) NOT NULL, + ID VARCHAR(255) NOT NULL, + 
COUNTBM INTEGER(12) NOT NULL + ) + } if ($tablename eq "BENCHRUNS"); + $createstmt=qq{ + CREATE table BENCHMARKS + ( + NR INTEGER(12) NOT NULL, + NR_IN_BENCHMARKS INTEGER(12) NOT NULL, + SUBID VARCHAR(255) NOT NULL, + FN VARCHAR(255) NOT NULL, + FNSIZE INTEGER(12) NOT NULL, + MTIME INTEGER(12) NOT NULL + ) + } if ($tablename eq "BENCHMARKS"); + + $tstart=time; + my $sth = $self->{DBH}->prepare( $createstmt ) or die "Can't prepare statement: $DBI::errstr"; + my $rc = $sth->execute() or die "Can't execute statement: $DBI::errstr"; + $sth->finish(); + $tdiff=time-$tstart; + printf("WF: table $tablename created ($rc) in %6.4f sec\n",$tdiff); + + return($rc); +} + +sub init_table { + my($self) = shift; + my($tablename)=@_; + my $createstmt=0; + my($tstart,$tdiff,$rc); + + $createstmt=qq{ + INSERT INTO STATUS (NAME,VAL) VALUES('BENCHRUNS_NEXTID',0); + INSERT INTO STATUS (NAME,VAL) VALUES('BENCHMARKS_NEXTID',0); + } if ($tablename eq "STATUS"); + + if($createstmt) { + $tstart=time; + my $sth = $self->{DBH}->prepare( $createstmt ) or die "Can't prepare statement: $DBI::errstr"; + $rc = $sth->execute() or die "Can't execute statement: $DBI::errstr"; + $sth->finish(); + $tdiff=time-$tstart; + printf("WF: table $tablename created ($rc) in %6.4f sec\n",$tdiff); + } + return($rc); +} + +sub set_next_id { + my($self) = shift; + my($table,$nextid)=@_; + my($sth); + my $entry=$table."_NEXTID"; + $sth = $self->{DBH}->prepare( q{ + UPDATE STATUS SET VAL=? 
WHERE NAME =?; + }) or die "Can't prepare statement: $DBI::errstr"; + my $rc = $sth->execute($nextid,$entry) or die "Can't execute statement: $DBI::errstr"; + $sth->finish(); + return($rc); +} + +sub get_next_id { + my($self) = shift; + my($table) = shift; + my $nextid=-1; + my $rnextid; + my $entry=$table."_NEXTID"; + my $sth = $self->{DBH}->prepare( q{ + SELECT VAL FROM STATUS WHERE NAME =?; + }) or die "Can't prepare statement: $DBI::errstr"; + my $rc = $sth->execute($entry) or die "Can't execute statement: $DBI::errstr"; + if(($rnextid) = $sth->fetchrow()) { + $nextid=$rnextid; + } + $sth->finish(); + +# printf("WF: get_next_id: %d\n",$nextid); + + return($nextid); +} + +sub add_to_benchruns { + my($self) = shift; + my($id)=@_; + my($nr,$rid,$countbm,$rc,$inserted,$rc,$sth,$nextid); + + # check if already inserted + $sth = $self->{DBH}->prepare( q{ + SELECT NR,ID,COUNTBM FROM BENCHRUNS WHERE ID=?; + }) or die "Can't prepare statement: $DBI::errstr"; + $rc = $sth->execute($id) or die "Can't execute statement: $DBI::errstr"; + if(($nr,$rid,$countbm) = $sth->fetchrow()) { + $inserted=1; + } else { + $inserted=0; + $countbm=0; + } + $sth->finish(); + #update or insert new entry + $countbm++; + if($inserted) { + $sth = $self->{DBH}->prepare( q{ + UPDATE BENCHRUNS SET COUNTBM=? 
WHERE NR=?; + }) or die "Can't prepare statement: $DBI::errstr"; + $rc = $sth->execute($countbm,$nr) or die "Can't execute statement: $DBI::errstr"; + } else { + $nr=$nextid=$self->get_next_id("BENCHRUNS"); + $sth = $self->{DBH}->prepare( q{ + INSERT INTO BENCHRUNS (NR,ID,COUNTBM) VALUES(?,?,?); + }) or die "Can't prepare statement: $DBI::errstr"; + $rc = $sth->execute($nextid,$id,$countbm) or die "Can't execute statement: $DBI::errstr"; + $nextid++; + $self->set_next_id("BENCHRUNS",$nextid); + } + return($nr); +} + +sub insert_in_benchmarks { + my($self) = shift; + my($id,$fn,$st_size,$st_mtime)=@_; + my($nr,$nextid,$sth); + $nr=$nextid=$self->get_next_id(); + $sth = $self->{DBH}->prepare( q{ + INSERT INTO BENCHMARKS (NR,ID,FN,FNSIZE,MTIME) VALUES(?,?,?,?,?); + }) or die "Can't prepare statement: $DBI::errstr"; + my $rc = $sth->execute($nr,$id,$fn,$st_size,$st_mtime) or die "Can't execute statement: $DBI::errstr"; + $sth->finish(); + $nextid++; + $self->set_next_id($nextid); + return($nr); +} + + +1; + + diff --git a/qcd/part_cpu/bench/jube_report.xsl b/qcd/part_cpu/bench/jube_report.xsl new file mode 100644 index 0000000000000000000000000000000000000000..008627501f210da8a993a088c88f2a5ce57c4687 --- /dev/null +++ b/qcd/part_cpu/bench/jube_report.xsl @@ -0,0 +1,254 @@ + + + + + + Report + +

JuBE Report

+ + + + +

+

General Information

+ + + + + + + + + + + + + + + + + + + + + + + + + + +
Identifier
Number Of CPUs
Number Of Tasks
Tasks Per Node
Threads Per Task
Wall Clock Time sec.
+ + + + + + + + + + +
+ + +

Compile Section

+

Top Of Page

+ Compile command: + +

+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
ParameterValue
+
+

+
+ + +

XXX started:

+ +

+
+ + +

XXX ended:

+ +

+
+ + +

+

Analyse Section

+

Top Of Page

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameValueUnit
+ + + + + + + + + + + +
MethodValue
+
+

+ + + + +

+
+ + +

+

Benchrun

+

Top Of Page

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameValue
+

+
+ + +

Standard Output File:

+

Top Of Page

+ +
+ + +

Standard Error File

+

Top Of Page

+ +
+ + +

Calculation started:

+
+ + +

Calculation ended:

+
+
diff --git a/qcd/part_cpu/bench/jubedb.pl b/qcd/part_cpu/bench/jubedb.pl new file mode 100755 index 0000000000000000000000000000000000000000..b6b17b162fad70291667e5388b74f4314f4e1742 --- /dev/null +++ b/qcd/part_cpu/bench/jubedb.pl @@ -0,0 +1,129 @@ +#!/usr/bin/perl -w + +use strict; + +my $patint="([\\+\\-\\d]+)"; # Pattern for Integer number +my $patfp ="([\\+\\-\\d.E]+)"; # Pattern for Floating Point number +my $patwrd="([\^\\s]+)"; # Pattern for Work (all noblank characters) +my $patnint="[\\+\\-\\d]+"; # Pattern for Integer number, no () +my $patnfp ="[\\+\\-\\d.E]+"; # Pattern for Floating Point number, no () +my $patnwrd="[\^\\s]+"; # Pattern for Work (all noblank characters), no () +my $patbl ="\\s+"; # Pattern for blank space (variable length) + +use Getopt::Long; +use File::stat qw(:FIELDS); +use FindBin; +use lib "$FindBin::RealBin"; +use lib "$FindBin::RealBin/lib"; +use Time::HiRes qw ( time ); +use Tk; +use Tk::Tree; + +use jube_data_dbhash; +use jube_data_mysql; + +my $pwd=`pwd`; +chomp($pwd); + +my $opt_verbose=0; +my $opt_dump=0; +my $opt_gui=0; +my $opt_debug=undef; +my $opt_scan=undef; +my $opt_dbtype="mysql"; + +usage($0) if( ! 
GetOptions( + 'verbose:i' => \$opt_verbose, + 'dump' => \$opt_dump, + 'gui' => \$opt_gui, + 'scan=s' => \$opt_scan, + 'dbtype=s' => \$opt_dbtype + ) ); + + + + +my $startdate=localtime(time()); + +my ($benchxmlfile,$benchlogfile,$startspec,$enddate); + +my $dbobj; + + +$dbobj=bench_data_dbhash->new() if($opt_dbtype eq "dbhash"); +$dbobj=bench_data_mysql->new("localhost","nbench","nbench","") if($opt_dbtype eq "mysql"); + +$dbobj->load_db(); + +if($opt_scan) { + $dbobj->scan_for_new_benchmarks($opt_scan); +} + +exit; + + +$dbobj->analyse(); + +my ($name); +foreach $name (keys %{$dbobj->{B_NAME}}) { + print "WF Benchmark: $name\n"; +} + +my ($platform); +foreach $platform (keys %{$dbobj->{B_PLATFORM}}) { + print "WF Benchmark: $platform\n"; +} + +if($opt_gui) { + &gui(); +} + +$dbobj->save_db(); + +sub usage { + die "Usage: $_[0] + -verbose : verbose + -dump : dump XML-file structure + -debug : don't submit jobs + -rmtmp : remove temp directory directly + +"; +} + +sub gui { + my $top = new MainWindow( -title => "DirTree" ); + + my $current_bench; + my $tree = $top->Scrolled( qw/Tree + -width 35 -height 30 + -selectmode browse -exportselection 1 + -scrollbars osoe/ ); + + my $lab = $top->Label( -text => "Benchmark:" ); + my $ent = $top->Entry( -textvariable => \$current_bench ); + + my $ok = $top->Button( qw/-text Ok -underline 0 -width 6/ ); + my $cancel = $top->Button( qw/-text Cancel -underline 0 -width 6/ ); + + $tree->configure( -browsecmd => sub { $current_bench = shift } ); + $tree->configure( -command => sub { do_it( $current_bench ) } ); + $ok->configure( -command => sub { do_it( $current_bench ) } ); + $cancel->configure( -command => sub { exit } ); + + $tree->pack( qw/-expand yes -fill both -padx 10 -pady 10 -side top/ ); + $lab->pack( qw/-anchor w/ ); + $ent->pack( qw/-fill x/ ); + $ok->pack( qw/-side left -padx 10 -pady 10/ ); + $cancel->pack( qw/-side right -padx 10 -pady 10/ ); + + MainLoop(); +} + + + + + + + + + diff --git 
a/qcd/part_cpu/bench/lib/File/Listing.pm b/qcd/part_cpu/bench/lib/File/Listing.pm new file mode 100644 index 0000000000000000000000000000000000000000..5e02afa7aa8b59363fdd9458de9262f397c5cf3d --- /dev/null +++ b/qcd/part_cpu/bench/lib/File/Listing.pm @@ -0,0 +1,411 @@ +package File::Listing; + +# $Id: Listing.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +sub Version { $VERSION; } +$VERSION = sprintf("%d.%02d", q$Revision: 1.1.1.1 $ =~ /(\d+)\.(\d+)/); + +require Exporter; +@ISA = qw(Exporter); +@EXPORT = qw(parse_dir); + +use strict; + +use Carp (); +use HTTP::Date qw(str2time); + + + +sub parse_dir ($;$$$) +{ + my($dir, $tz, $fstype, $error) = @_; + + $fstype ||= 'unix'; + $fstype = "File::Listing::" . lc $fstype; + + my @args = $_[0]; + push(@args, $tz) if(@_ >= 2); + push(@args, $error) if(@_ >= 4); + + $fstype->parse(@args); +} + + +sub line { Carp::croak("Not implemented yet"); } +sub init { } # Dummy sub + + +sub file_mode ($) +{ + # This routine was originally borrowed from Graham Barr's + # Net::FTP package. + + local $_ = shift; + my $mode = 0; + my($type,$ch); + + s/^(.)// and $type = $1; + + while (/(.)/g) { + $mode <<= 1; + $mode |= 1 if $1 ne "-" && + $1 ne 'S' && + $1 ne 't' && + $1 ne 'T'; + } + + $type eq "d" and $mode |= 0040000 or # Directory + $type eq "l" and $mode |= 0120000 or # Symbolic Link + $mode |= 0100000; # Regular File + + $mode |= 0004000 if /^...s....../i; + $mode |= 0002000 if /^......s.../i; + $mode |= 0001000 if /^.........t/i; + + $mode; +} + + +sub parse +{ + my($pkg, $dir, $tz, $error) = @_; + + # First let's try to determine what kind of dir parameter we have + # received. We allow both listings, reference to arrays and + # file handles to read from. 
+ + if (ref($dir) eq 'ARRAY') { + # Already splitted up + } + elsif (ref($dir) eq 'GLOB') { + # A file handle + } + elsif (ref($dir)) { + Carp::croak("Illegal argument to parse_dir()"); + } + elsif ($dir =~ /^\*\w+(::\w+)+$/) { + # This scalar looks like a file handle, so we assume it is + } + else { + # A normal scalar listing + $dir = [ split(/\n/, $dir) ]; + } + + $pkg->init(); + + my @files = (); + if (ref($dir) eq 'ARRAY') { + for (@$dir) { + push(@files, $pkg->line($_, $tz, $error)); + } + } + else { + local($_); + while (<$dir>) { + chomp; + push(@files, $pkg->line($_, $tz, $error)); + } + } + wantarray ? @files : \@files; +} + + + +package File::Listing::unix; + +use HTTP::Date qw(str2time); + +# A place to remember current directory from last line parsed. +use vars qw($curdir); +no strict qw(vars); + +@ISA = qw(File::Listing); + + + +sub init +{ + $curdir = ''; +} + + +sub line +{ + shift; # package name + local($_) = shift; + my($tz, $error) = @_; + + s/\015//g; + #study; + + my ($kind, $size, $date, $name); + if (($kind, $size, $date, $name) = + /^([\-FlrwxsStTdD]{10}) # Type and permission bits + .* # Graps + \D(\d+) # File size + \s+ # Some space + (\w{3}\s+\d+\s+(?:\d{1,2}:\d{2}|\d{4})) # Date + \s+ # Some more space + (.*)$ # File name + /x ) + + { + return if $name eq '.' 
|| $name eq '..'; + $name = "$curdir/$name" if length $curdir; + my $type = '?'; + if ($kind =~ /^l/ && $name =~ /(.*) -> (.*)/ ) { + $name = $1; + $type = "l $2"; + } + elsif ($kind =~ /^[\-F]/) { # (hopefully) a regular file + $type = 'f'; + } + elsif ($kind =~ /^[dD]/) { + $type = 'd'; + $size = undef; # Don't believe the reported size + } + return [$name, $type, $size, str2time($date, $tz), + File::Listing::file_mode($kind)]; + + } + elsif (/^(.+):$/ && !/^[dcbsp].*\s.*\s.*:$/ ) { + my $dir = $1; + return () if $dir eq '.'; + $curdir = $dir; + return (); + } + elsif (/^[Tt]otal\s+(\d+)$/ || /^\s*$/) { + return (); + } + elsif (/not found/ || # OSF1, HPUX, and SunOS return + # "$file not found" + /No such file/ || # IRIX returns + # "UX:ls: ERROR: Cannot access $file: No such file or directory" + # Solaris returns + # "$file: No such file or directory" + /cannot find/ # Windows NT returns + # "The system cannot find the path specified." + ) { + return () unless defined $error; + &$error($_) if ref($error) eq 'CODE'; + warn "Error: $_\n" if $error eq 'warn'; + return (); + } + elsif ($_ eq '') { # AIX, and Linux return nothing + return () unless defined $error; + &$error("No such file or directory") if ref($error) eq 'CODE'; + warn "Warning: No such file or directory\n" if $error eq 'warn'; + return (); + } + else { + # parse failed, check if the dosftp parse understands it + return(File::Listing::dosftp->line($_,$tz,$error)); + } + +} + + + +package File::Listing::dosftp; + +use HTTP::Date qw(str2time); + +# A place to remember current directory from last line parsed. 
+use vars qw($curdir); +no strict qw(vars); + +@ISA = qw(File::Listing); + + + +sub init +{ + $curdir = ''; +} + + +sub line +{ + shift; # package name + local($_) = shift; + my($tz, $error) = @_; + + s/\015//g; + + my ($kind, $size, $date, $name); + + # 02-05-96 10:48AM 1415 src.slf + # 09-10-96 09:18AM sl_util + if (($date,$size_or_dir,$name) = + /^(\d\d-\d\d-\d\d\s+\d\d:\d\d\wM) # Date and time info + \s+ # Some space + (<\w{3}>|\d+) # Dir or Size + \s+ # Some more space + (.+)$ # File name + /x ) + { + return if $name eq '.' || $name eq '..'; + $name = "$curdir/$name" if length $curdir; + my $type = '?'; + if ($size_or_dir eq '') { + $type = "d"; + $size = ""; # directories have no size in the pc listing + } + else { + $type = 'f'; + $size = $size_or_dir; + } + return [$name, $type, $size, str2time($date, $tz), + File::Listing::file_mode($kind)]; + + } + else { + return () unless defined $error; + &$error($_) if ref($error) eq 'CODE'; + warn "Can't parse: $_\n" if $error eq 'warn'; + return (); + } + +} + + + +package File::Listing::vms; +@File::Listing::vms::ISA = qw(File::Listing); + +package File::Listing::netware; +@File::Listing::netware::ISA = qw(File::Listing); + + + +package File::Listing::apache; + +@ISA = qw(File::Listing); + + +sub init { } + + +sub line { + shift; # package name + local($_) = shift; + my($tz, $error) = @_; # ignored for now... + + if (m!.*.*?(\d+)-([a-zA-Z]+)-(\d+)\s+(\d+):(\d+)\s+(?:([\d\.]+[kM]?|-))!i) { + my($filename, $filesize) = ($1, $7); + my($d,$m,$y, $H,$M) = ($2,$3,$4,$5,$6); + + $filesize = 0 if $filesize eq '-'; + if ($filesize =~ s/k$//i) { + $filesize *= 1024; + } + elsif ($filesize =~ s/M$//) { + $filesize *= 1024*1024; + } + elsif ($filesize =~ s/G$//) { + $filesize *= 1024*1024*1024; + } + $filesize = int $filesize; + + require Time::Local; + my $filetime = Time::Local::timelocal(0,$M,$H,$d,_monthabbrev_number($m)-1,_guess_year($y)-1900); + my $filetype = ($filename =~ s|/$|| ? 
"d" : "f"); + return [$filename, $filetype, $filesize, $filetime, undef]; + } + + return (); +} + + +sub _guess_year { + my $y = shift; + if ($y >= 90) { + $y = 1900+$y; + } + elsif ($y < 100) { + $y = 2000+$y; + } + $y; +} + + +sub _monthabbrev_number { + my $mon = shift; + +{'Jan' => 1, + 'Feb' => 2, + 'Mar' => 3, + 'Apr' => 4, + 'May' => 5, + 'Jun' => 6, + 'Jul' => 7, + 'Aug' => 8, + 'Sep' => 9, + 'Oct' => 10, + 'Nov' => 11, + 'Dec' => 12, + }->{$mon}; +} + + +1; + +__END__ + +=head1 NAME + +File::Listing - parse directory listing + +=head1 SYNOPSIS + + use File::Listing qw(parse_dir); + for (parse_dir(`ls -l`)) { + ($name, $type, $size, $mtime, $mode) = @$_; + next if $type ne 'f'; # plain file + #... + } + + # directory listing can also be read from a file + open(LISTING, "zcat ls-lR.gz|"); + $dir = parse_dir(\*LISTING, '+0000'); + +=head1 DESCRIPTION + +This module exports a single function called parse_dir(), which can be +used to parse directory listings. Currently it only understand Unix +C<'ls -l'> and C<'ls -lR'> format. It should eventually be able to +most things you might get back from a ftp server file listing (LIST +command), i.e. VMS listings, NT listings, DOS listings,... + +The first parameter to parse_dir() is the directory listing to parse. +It can be a scalar, a reference to an array of directory lines or a +glob representing a filehandle to read the directory listing from. + +The second parameter is the time zone to use when parsing time stamps +in the listing. If this value is undefined, then the local time zone is +assumed. + +The third parameter is the type of listing to assume. The values will +be strings like 'unix', 'vms', 'dos'. Currently only 'unix' is +implemented and this is also the default value. Ideally, the listing +type should be determined automatically. + +The fourth parameter specifies how unparseable lines should be treated. +Values can be 'ignore', 'warn' or a code reference. 
Warn means that +the perl warn() function will be called. If a code reference is +passed, then this routine will be called and the return value from it +will be incorporated in the listing. The default is 'ignore'. + +Only the first parameter is mandatory. + +The return value from parse_dir() is a list of directory entries. In +a scalar context the return value is a reference to the list. The +directory entries are represented by an array consisting of [ +$filename, $filetype, $filesize, $filetime, $filemode ]. The +$filetype value is one of the letters 'f', 'd', 'l' or '?'. The +$filetime value is the seconds since Jan 1, 1970. The +$filemode is a bitmask like the mode returned by stat(). + +=head1 CREDITS + +Based on lsparse.pl (from Lee McLoughlin's ftp mirror package) and +Net::FTP's parse_dir (Graham Barr). diff --git a/qcd/part_cpu/bench/lib/HTTP/Date.pm b/qcd/part_cpu/bench/lib/HTTP/Date.pm new file mode 100644 index 0000000000000000000000000000000000000000..b62aad7a60ae29844196fe81ddd4be916aabc759 --- /dev/null +++ b/qcd/part_cpu/bench/lib/HTTP/Date.pm @@ -0,0 +1,389 @@ +package HTTP::Date; # $Date: 2008/05/15 07:50:53 $ + +$VERSION = sprintf("%d.%02d", q$Revision: 1.1 $ =~ /(\d+)\.(\d+)/); + +require 5.004; +require Exporter; +@ISA = qw(Exporter); +@EXPORT = qw(time2str str2time); +@EXPORT_OK = qw(parse_date time2iso time2isoz); + +use strict; +require Time::Local; + +use vars qw(@DoW @MoY %MoY); +@DoW = qw(Sun Mon Tue Wed Thu Fri Sat); +@MoY = qw(Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec); +@MoY{@MoY} = (1..12); + +my %GMT_ZONE = (GMT => 1, UTC => 1, UT => 1, Z => 1); + + +sub time2str (;$) +{ + my $time = shift; + $time = time unless defined $time; + my ($sec, $min, $hour, $mday, $mon, $year, $wday) = gmtime($time); + sprintf("%s, %02d %s %04d %02d:%02d:%02d GMT", + $DoW[$wday], + $mday, $MoY[$mon], $year+1900, + $hour, $min, $sec); +} + + +sub str2time ($;$) +{ + my $str = shift; + return undef unless defined $str; + + # fast exit for strictly 
conforming string + if ($str =~ /^[SMTWF][a-z][a-z], (\d\d) ([JFMAJSOND][a-z][a-z]) (\d\d\d\d) (\d\d):(\d\d):(\d\d) GMT$/) { + return eval { + my $t = Time::Local::timegm($6, $5, $4, $1, $MoY{$2}-1, $3-1900); + $t < 0 ? undef : $t; + }; + } + + my @d = parse_date($str); + return undef unless @d; + $d[0] -= 1900; # year + $d[1]--; # month + + my $tz = pop(@d); + unless (defined $tz) { + unless (defined($tz = shift)) { + return eval { my $frac = $d[-1]; $frac -= ($d[-1] = int($frac)); + my $t = Time::Local::timelocal(reverse @d) + $frac; + $t < 0 ? undef : $t; + }; + } + } + + my $offset = 0; + if ($GMT_ZONE{uc $tz}) { + # offset already zero + } + elsif ($tz =~ /^([-+])?(\d\d?):?(\d\d)?$/) { + $offset = 3600 * $2; + $offset += 60 * $3 if $3; + $offset *= -1 if $1 && $1 eq '-'; + } + else { + eval { require Time::Zone } || return undef; + $offset = Time::Zone::tz_offset($tz); + return undef unless defined $offset; + } + + return eval { my $frac = $d[-1]; $frac -= ($d[-1] = int($frac)); + my $t = Time::Local::timegm(reverse @d) + $frac; + $t < 0 ? undef : $t - $offset; + }; +} + + +sub parse_date ($) +{ + local($_) = shift; + return unless defined; + + # More lax parsing below + s/^\s+//; # kill leading space + s/^(?:Sun|Mon|Tue|Wed|Thu|Fri|Sat)[a-z]*,?\s*//i; # Useless weekday + + my($day, $mon, $yr, $hr, $min, $sec, $tz, $ampm); + + # Then we are able to check for most of the formats with this regexp + (($day,$mon,$yr,$hr,$min,$sec,$tz) = + /^ + (\d\d?) # day + (?:\s+|[-\/]) + (\w+) # month + (?:\s+|[-\/]) + (\d+) # year + (?: + (?:\s+|:) # separator before clock + (\d\d?):(\d\d) # hour:min + (?::(\d\d))? # optional seconds + )? # optional clock + \s* + ([-+]?\d{2,4}|(?![APap][Mm]\b)[A-Za-z]+)? # timezone + \s* + (?:\(\w+\))? # ASCII representation of timezone in parens. + \s*$ + /x) + + || + + # Try the ctime and asctime format + (($mon, $day, $hr, $min, $sec, $tz, $yr) = + /^ + (\w{1,3}) # month + \s+ + (\d\d?) 
# day + \s+ + (\d\d?):(\d\d) # hour:min + (?::(\d\d))? # optional seconds + \s+ + (?:([A-Za-z]+)\s+)? # optional timezone + (\d+) # year + \s*$ # allow trailing whitespace + /x) + + || + + # Then the Unix 'ls -l' date format + (($mon, $day, $yr, $hr, $min, $sec) = + /^ + (\w{3}) # month + \s+ + (\d\d?) # day + \s+ + (?: + (\d\d\d\d) | # year + (\d{1,2}):(\d{2}) # hour:min + (?::(\d\d))? # optional seconds + ) + \s*$ + /x) + + || + + # ISO 8601 format '1996-02-29 12:00:00 -0100' and variants + (($yr, $mon, $day, $hr, $min, $sec, $tz) = + /^ + (\d{4}) # year + [-\/]? + (\d\d?) # numerical month + [-\/]? + (\d\d?) # day + (?: + (?:\s+|[-:Tt]) # separator before clock + (\d\d?):?(\d\d) # hour:min + (?::?(\d\d(?:\.\d*)?))? # optional seconds (and fractional) + )? # optional clock + \s* + ([-+]?\d\d?:?(:?\d\d)? + |Z|z)? # timezone (Z is "zero meridian", i.e. GMT) + \s*$ + /x) + + || + + # Windows 'dir' 11-12-96 03:52PM + (($mon, $day, $yr, $hr, $min, $ampm) = + /^ + (\d{2}) # numerical month + - + (\d{2}) # day + - + (\d{2}) # year + \s+ + (\d\d?):(\d\d)([APap][Mm]) # hour:min AM or PM + \s*$ + /x) + + || + return; # unrecognized format + + # Translate month name to number + $mon = $MoY{$mon} || + $MoY{"\u\L$mon"} || + ($mon =~ /^\d\d?$/ && $mon >= 1 && $mon <= 12 && int($mon)) || + return; + + # If the year is missing, we assume first date before the current, + # because of the formats we support such dates are mostly present + # on "ls -l" listings. + unless (defined $yr) { + my $cur_mon; + ($cur_mon, $yr) = (localtime)[4, 5]; + $yr += 1900; + $cur_mon++; + $yr-- if $mon > $cur_mon; + } + elsif (length($yr) < 3) { + # Find "obvious" year + my $cur_yr = (localtime)[5] + 1900; + my $m = $cur_yr % 100; + my $tmp = $yr; + $yr += $cur_yr - $m; + $m -= $tmp; + $yr += ($m > 0) ? 
100 : -100 + if abs($m) > 50; + } + + # Make sure clock elements are defined + $hr = 0 unless defined($hr); + $min = 0 unless defined($min); + $sec = 0 unless defined($sec); + + # Compensate for AM/PM + if ($ampm) { + $ampm = uc $ampm; + $hr = 0 if $hr == 12 && $ampm eq 'AM'; + $hr += 12 if $ampm eq 'PM' && $hr != 12; + } + + return($yr, $mon, $day, $hr, $min, $sec, $tz) + if wantarray; + + if (defined $tz) { + $tz = "Z" if $tz =~ /^(GMT|UTC?|[-+]?0+)$/; + } + else { + $tz = ""; + } + return sprintf("%04d-%02d-%02d %02d:%02d:%02d%s", + $yr, $mon, $day, $hr, $min, $sec, $tz); +} + + +sub time2iso (;$) +{ + my $time = shift; + $time = time unless defined $time; + my($sec,$min,$hour,$mday,$mon,$year) = localtime($time); + sprintf("%04d-%02d-%02d %02d:%02d:%02d", + $year+1900, $mon+1, $mday, $hour, $min, $sec); +} + + +sub time2isoz (;$) +{ + my $time = shift; + $time = time unless defined $time; + my($sec,$min,$hour,$mday,$mon,$year) = gmtime($time); + sprintf("%04d-%02d-%02d %02d:%02d:%02dZ", + $year+1900, $mon+1, $mday, $hour, $min, $sec); +} + +1; + + +__END__ + +=head1 NAME + +HTTP::Date - date conversion routines + +=head1 SYNOPSIS + + use HTTP::Date; + + $string = time2str($time); # Format as GMT ASCII time + $time = str2time($string); # convert ASCII date to machine time + +=head1 DESCRIPTION + +This module provides functions that deal the date formats used by the +HTTP protocol (and then some more). Only the first two functions, +time2str() and str2time(), are exported by default. + +=over 4 + +=item time2str( [$time] ) + +The time2str() function converts a machine time (seconds since epoch) +to a string. If the function is called without an argument, it will +use the current time. + +The string returned is in the format preferred for the HTTP protocol. +This is a fixed length subset of the format defined by RFC 1123, +represented in Universal Time (GMT). 
An example of a time stamp +in this format is: + + Sun, 06 Nov 1994 08:49:37 GMT + +=item str2time( $str [, $zone] ) + +The str2time() function converts a string to machine time. It returns +C<undef> if the format of $str is unrecognized, or the time is outside +the representable range. The time formats recognized are the same as +for parse_date(). + +The function also takes an optional second argument that specifies the +default time zone to use when converting the date. This parameter is +ignored if the zone is found in the date string itself. If this +parameter is missing, and the date string format does not contain any +zone specification, then the local time zone is assumed. + +If the zone is not "C<GMT>" or numerical (like "C<-0800>" or +"C<+0100>"), then the C<Time::Zone> module must be installed in order +to get the date recognized. + +=item parse_date( $str ) + +This function will try to parse a date string, and then return it as a +list of numerical values followed by a (possibly undefined) time zone +specifier; ($year, $month, $day, $hour, $min, $sec, $tz). The $year +returned will B<not> have the number 1900 subtracted from it and the +$month numbers start with 1. + +In scalar context the numbers are interpolated in a string of the +"YYYY-MM-DD hh:mm:ss TZ"-format and returned. + +If the date is unrecognized, then the empty list is returned. 
+ +The function is able to parse the following formats: + + "Wed, 09 Feb 1994 22:23:32 GMT" -- HTTP format + "Thu Feb 3 17:03:55 GMT 1994" -- ctime(3) format + "Thu Feb 3 00:00:00 1994", -- ANSI C asctime() format + "Tuesday, 08-Feb-94 14:15:29 GMT" -- old rfc850 HTTP format + "Tuesday, 08-Feb-1994 14:15:29 GMT" -- broken rfc850 HTTP format + + "03/Feb/1994:17:03:55 -0700" -- common logfile format + "09 Feb 1994 22:23:32 GMT" -- HTTP format (no weekday) + "08-Feb-94 14:15:29 GMT" -- rfc850 format (no weekday) + "08-Feb-1994 14:15:29 GMT" -- broken rfc850 format (no weekday) + + "1994-02-03 14:15:29 -0100" -- ISO 8601 format + "1994-02-03 14:15:29" -- zone is optional + "1994-02-03" -- only date + "1994-02-03T14:15:29" -- Use T as separator + "19940203T141529Z" -- ISO 8601 compact format + "19940203" -- only date + + "08-Feb-94" -- old rfc850 HTTP format (no weekday, no time) + "08-Feb-1994" -- broken rfc850 HTTP format (no weekday, no time) + "09 Feb 1994" -- proposed new HTTP format (no weekday, no time) + "03/Feb/1994" -- common logfile format (no time, no offset) + + "Feb 3 1994" -- Unix 'ls -l' format + "Feb 3 17:03" -- Unix 'ls -l' format + + "11-15-96 03:52PM" -- Windows 'dir' format + +The parser ignores leading and trailing whitespace. It also allows the +seconds to be missing and the month to be numerical in most formats. + +If the year is missing, then we assume that the date is the first +matching date I<before> the current month. If the year is given with only +2 digits, then parse_date() will select the century that makes the +year closest to the current date. + +=item time2iso( [$time] ) + +Same as time2str(), but returns a "YYYY-MM-DD hh:mm:ss"-formatted +string representing time in the local time zone. + +=item time2isoz( [$time] ) + +Same as time2str(), but returns a "YYYY-MM-DD hh:mm:ssZ"-formatted +string representing Universal Time. 
+ + +=back +=head1 SEE ALSO +L, L +=head1 COPYRIGHT +Copyright 1995-1999, Gisle Aas +This library is free software; you can redistribute it and/or +modify it under the same terms as Perl itself. +=cut diff --git a/qcd/part_cpu/bench/lib/Slurp.pm b/qcd/part_cpu/bench/lib/Slurp.pm new file mode 100644 index 0000000000000000000000000000000000000000..d834658c0dcbc550fdddbfa31b959e48b3be2a97 --- /dev/null +++ b/qcd/part_cpu/bench/lib/Slurp.pm @@ -0,0 +1,115 @@ +package Slurp; + +use Exporter; +use vars qw/ @EXPORT @EXPORT_OK @ISA $VERSION /; + +@ISA = qw/ Exporter /; +@EXPORT = qw/ slurp /; +@EXPORT_OK = qw/ slurp to_array to_scalar /; + + +$VERSION = '0.4'; + + +sub slurp { + local( $/, @ARGV ) = ( wantarray ? $/ : undef, @_ ); + return <ARGV>; +} + +sub to_array { + my @array = slurp( @_ ); + return wantarray ? @array : \@array; +} + +sub to_scalar { + my $scalar = slurp( @_ ); + return $scalar; +} + + +1; + + +__END__ + +=pod + +=head1 NAME + +Slurp - Slurp entire files into variables + +=head1 SYNOPSIS + + use Slurp; + + my $file = slurp($file1, $file2, ...); + + my @array = Slurp::to_array($filename); + my $scalar = Slurp::to_scalar($filename); + +=head1 DESCRIPTION + +This simple module serves one purpose - to provide a simple means to +read (or slurp) an entire file into memory for processing. This +module allows the replacement of the prototypical foreach- or while- +loops used for opening and reading of files with single-line +constructs. + +Of note with this module is that the magic of the C<@ARGV> variable +and the input record separator, C<$/>, are used to facilitate the +reading of entire files into either an array or scalar using minimal +code. + +=head1 METHODS + +The following methods are available through this module for use in +other applications. By default, the C<slurp> method is exported +into the calling namespace - The other methods of this module, +C<to_array> and C<to_scalar>, may also be exported into the calling +namespace. 
+ +=over 4 + +=item B<slurp> + + my @array = slurp($filename, ...); + my $scalar = slurp($filename, ...); + +This method slurps one or more files, specified by filenames passed +to this method as arguments, into memory. The assignment can be +made to either an array or a scalar depending upon the context in +which this method was called. + +=item B<to_array> + + my @array = Slurp::to_array($filename, ...); + my $array_ref = Slurp::to_array($filename, ...); + +This method slurps one or more files, specified by filenames passed +to this method as arguments, into memory. If called in a scalar +context, this method returns an array reference - This is +particularly useful if dealing with large files. + +=item B<to_scalar> + + my $scalar = Slurp::to_scalar($filename, ...); + +This method slurps one or more files, specified by filenames passed +to this method as arguments, into a scalar variable. + +=back + +=head1 SEE ALSO + +L<perlvar> + +=head1 VERSION + +0.4 + +=head1 AUTHOR + +Rob Casey + +=cut + diff --git a/qcd/part_cpu/bench/lib/XML/NamespaceSupport.pm b/qcd/part_cpu/bench/lib/XML/NamespaceSupport.pm new file mode 100644 index 0000000000000000000000000000000000000000..0e7fc89bb2d17189b77819b9c4f0f5c8f1f4a3b5 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/NamespaceSupport.pm @@ -0,0 +1,583 @@ + +### +# XML::NamespaceSupport - a simple generic namespace processor +# Robin Berjon +### + +package XML::NamespaceSupport; +use strict; +use constant FATALS => 0; # root object +use constant NSMAP => 1; +use constant UNKNOWN_PREF => 2; +use constant AUTO_PREFIX => 3; +use constant XMLNS_11 => 4; +use constant DEFAULT => 0; # maps +use constant PREFIX_MAP => 1; +use constant DECLARATIONS => 2; + +use vars qw($VERSION $NS_XMLNS $NS_XML); +$VERSION = '1.09'; +$NS_XMLNS = 'http://www.w3.org/2000/xmlns/'; +$NS_XML = 'http://www.w3.org/XML/1998/namespace'; + + +# add the ns stuff that baud wants based on Java's xml-writer + + +#-------------------------------------------------------------------# +# constructor
#-------------------------------------------------------------------#
sub new {
    my $invocant = shift;
    my $class    = ref($invocant) ? ref($invocant) : $invocant;
    my $options  = shift;

    # The root context: no default namespace, only the xml prefix bound,
    # and no declarations recorded yet.
    my @root_context;
    $root_context[DEFAULT]      = undef;
    $root_context[PREFIX_MAP]   = { xml => $NS_XML };
    $root_context[DECLARATIONS] = undef;

    # The object is an array indexed by the root-object constants.
    my $self = [];
    $self->[FATALS]       = 1;
    $self->[NSMAP]        = [ \@root_context ];
    $self->[UNKNOWN_PREF] = 'aaa';
    $self->[AUTO_PREFIX]  = 0;
    $self->[XMLNS_11]     = 1;

    # Optionally predeclare the xmlns prefix in the root context.
    $root_context[PREFIX_MAP]->{xmlns} = $NS_XMLNS if $options->{xmlns};

    # Remaining options override a slot only when explicitly defined.
    for my $override (
        [ FATALS,      'fatal_errors' ],
        [ AUTO_PREFIX, 'auto_prefix'  ],
        [ XMLNS_11,    'xmlns_11'     ],
    ) {
        my ($slot, $key) = @$override;
        $self->[$slot] = $options->{$key} if defined $options->{$key};
    }

    return bless $self, $class;
}
#-------------------------------------------------------------------#

#-------------------------------------------------------------------#
# reset() - return to the original state (for reuse)
#-------------------------------------------------------------------#
sub reset {
    my ($self) = @_;
    my $stack = $self->[NSMAP];
    # Truncate the context stack so only its first entry remains.
    $#{$stack} = 0;
}
#-------------------------------------------------------------------#

#-------------------------------------------------------------------#
# push_context() - add a new empty context to the stack
#-------------------------------------------------------------------#
sub push_context {
    my ($self) = @_;
    my $stack   = $self->[NSMAP];
    my $current = $stack->[-1];

    # The new context inherits the default namespace and a shallow copy of
    # the prefix map, and starts with an empty declaration list.
    my @fresh = (
        $current->[DEFAULT],
        { %{ $current->[PREFIX_MAP] } },
        [],
    );
    push @$stack, \@fresh;
}
#-------------------------------------------------------------------#

#-------------------------------------------------------------------#
# pop_context() - remove the topmost context from the stack
#-------------------------------------------------------------------#
sub pop_context {
    my ($self) = @_;
    my $stack = $self->[NSMAP];
    if (@$stack <= 1) {
        die 'Trying to pop context without push context';
    }
    pop @$stack;
}
#-------------------------------------------------------------------#

#-------------------------------------------------------------------#
# declare_prefix() - declare a prefix in the current scope
#-------------------------------------------------------------------#
sub declare_prefix {
    my ($self, $prefix, $value) = @_;

    warn <<'    EOWARN' unless defined $prefix or $self->[AUTO_PREFIX];
    Prefix was undefined.
    If you wish to set the default namespace, use the empty string ''.
    If you wish to autogenerate prefixes, set the auto_prefix option
    to a true value.
    EOWARN

    {
        # $prefix and $value may legitimately be undef in these checks.
        no warnings 'uninitialized';
        my $is_xml_prefix = $prefix eq 'xml';
        my $is_xml_ns     = $value eq $NS_XML;

        die "The xml prefix can only be bound to the $NS_XML namespace."
            if $is_xml_prefix and not $is_xml_ns;
        die "the $NS_XML namespace can only be bound to the xml prefix."
            if $is_xml_ns and not $is_xml_prefix;
        return 1 if $is_xml_ns and $is_xml_prefix;

        # Any other prefix beginning with "xml" (in any case) is refused.
        return 0 if index(lc($prefix), 'xml') == 0;
    }
    # As in the original, uninitialized warnings are on for the remainder.
    use warnings 'uninitialized';

    my $context = $self->[NSMAP]->[-1];
    if (defined $prefix and $prefix eq '') {
        # The empty prefix sets the default namespace.
        $context->[DEFAULT] = $value;
    }
    else {
        die "Cannot undeclare prefix $prefix" if $value eq '' and not $self->[XMLNS_11];
        unless (defined $prefix) {
            return 0 unless $self->[AUTO_PREFIX];
            # Draw generated prefixes until one is unused in this context.
            do {
                $prefix = $self->[UNKNOWN_PREF]++;
            } while exists $context->[PREFIX_MAP]->{$prefix};
        }
        $context->[PREFIX_MAP]->{$prefix} = $value;
    }

    push @{ $context->[DECLARATIONS] }, $prefix;
    return 1;
}
#-------------------------------------------------------------------#

#-------------------------------------------------------------------#
# declare_prefixes() - declare several prefixes in the current scope
#-------------------------------------------------------------------#
sub declare_prefixes {
    my ($self, %uri_for) = @_;
    while (my ($pfx, $uri) = each %uri_for) {
        $self->declare_prefix($pfx, $uri);
    }
}
#-------------------------------------------------------------------#

#-------------------------------------------------------------------#
# undeclare_prefix() - remove a prefix declared in the current scope
#-------------------------------------------------------------------#
sub undeclare_prefix {
    my $self   = shift;
    my $prefix = shift;

    # Nothing to do for an undefined or empty prefix.  The original guard
    # was inverted ("return unless ..."), so the sub returned early for
    # every real prefix and could never undeclare anything.
    return if not defined $prefix or $prefix eq '';
    return unless exists $self->[NSMAP]->[-1]->[PREFIX_MAP]->{$prefix};

    # A prefix that is merely in scope (inherited) cannot be undeclared
    # here; it must be in this context's own declaration list.
    my ( $tfix ) = grep { $_ eq $prefix } @{$self->[NSMAP]->[-1]->[DECLARATIONS]};
    if ( not defined $tfix ) {
        die "prefix $prefix not declared in this context\n";
    }

    @{$self->[NSMAP]->[-1]->[DECLARATIONS]} = grep { $_ ne $prefix } @{$self->[NSMAP]->[-1]->[DECLARATIONS]};
    delete $self->[NSMAP]->[-1]->[PREFIX_MAP]->{$prefix};
}
#-------------------------------------------------------------------#

#-------------------------------------------------------------------#
# get_prefix() - get a (random) prefix for a given URI
#-------------------------------------------------------------------#
sub get_prefix {
    my $self    = shift;
    my $uri     = shift;

    # we have to iterate over the whole hash here because if we don't
    # the iterator isn't reset and the next pass will fail
    my $pref;
    while (my ($k, $v) = each %{$self->[NSMAP]->[-1]->[PREFIX_MAP]}) {
        $pref = $k if $v eq $uri;
    }
    return $pref;
}
#-------------------------------------------------------------------#

#-------------------------------------------------------------------#
# get_prefixes() - get all the prefixes for a given URI
#-------------------------------------------------------------------#
sub get_prefixes {
    my $self    = shift;
    my $uri     = shift;

    # Without a URI, every prefix in scope is returned.
    return keys %{$self->[NSMAP]->[-1]->[PREFIX_MAP]} unless defined $uri;
    return grep { $self->[NSMAP]->[-1]->[PREFIX_MAP]->{$_} eq $uri } keys %{$self->[NSMAP]->[-1]->[PREFIX_MAP]};
}
#-------------------------------------------------------------------#

#-------------------------------------------------------------------#
# get_declared_prefixes() - get all prefixes declared in the last context
#-------------------------------------------------------------------#
sub get_declared_prefixes {
    my ($self) = @_;
    return @{ $self->[NSMAP]->[-1]->[DECLARATIONS] };
}
#-------------------------------------------------------------------#

#-------------------------------------------------------------------#
# get_uri() - get an URI given a prefix
#-------------------------------------------------------------------#
sub get_uri {
    my ($self, $prefix) = @_;

    warn "Prefix must not be undef in get_uri(). The emtpy prefix must be ''" unless defined $prefix;

    my $context = $self->[NSMAP]->[-1];

    # The empty prefix designates the default namespace.
    return $context->[DEFAULT] if $prefix eq '';

    my $uri_for = $context->[PREFIX_MAP];
    return exists $uri_for->{$prefix} ? $uri_for->{$prefix} : undef;
}
#-------------------------------------------------------------------#

#-------------------------------------------------------------------#
# process_name() - provide details on a name
#-------------------------------------------------------------------#
sub process_name {
    my ($self, $qname, $aflag) = @_;

    # Java-style result: namespace URI, local name, then the qname itself.
    my $lookup = sub {
        return( ($self->_get_ns_details($qname, $aflag))[0,2], $qname );
    };

    # With fatal errors on, exceptions propagate; otherwise they are
    # trapped and the call yields nothing.
    return $lookup->() if $self->[FATALS];
    return eval { $lookup->() };
}
#-------------------------------------------------------------------#

#-------------------------------------------------------------------#
# process_element_name() - provide details on a element's name
#-------------------------------------------------------------------#
sub process_element_name {
    my ($self, $qname) = @_;

    return $self->_get_ns_details($qname, 0) if $self->[FATALS];
    return eval { $self->_get_ns_details($qname, 0) };
}
#-------------------------------------------------------------------#


#-------------------------------------------------------------------#
# process_attribute_name() - provide details on a attribute's name
#-------------------------------------------------------------------#
sub process_attribute_name {
    my $self    = shift;
    my $qname   = shift;

    if ($self->[FATALS]) {
        return $self->_get_ns_details($qname, 1);
    }
    else {
        # Non-fatal mode: trap the exception and yield nothing.
        eval { return $self->_get_ns_details($qname, 1); }
    }
}
#-------------------------------------------------------------------#


#-------------------------------------------------------------------#
# ($ns, $prefix, $lname) = $self->_get_ns_details($qname, $f_attr)
# returns ns, prefix, and lname for a given attribute name
# >> the $f_attr flag, if set to one, will work for an attribute
# dies on a QName with more than one colon or an undeclared prefix
#-------------------------------------------------------------------#
sub _get_ns_details {
    my $self    = shift;
    my $qname   = shift;
    my $aflag   = shift;

    my ($ns, $prefix, $lname);
    # The list assignment in numeric context counts the fields split
    # produced; three fields means the name had more than one colon.
    (my ($tmp_prefix, $tmp_lname) = split /:/, $qname, 3)
                                    < 3 or die "Invalid QName: $qname";

    # no prefix
    my $cur_map = $self->[NSMAP]->[-1];
    if (not defined($tmp_lname)) {
        $prefix = undef;
        $lname = $qname;
        # attr don't have a default namespace
        $ns = ($aflag) ? undef : $cur_map->[DEFAULT];
    }

    # prefix
    else {
        if (exists $cur_map->[PREFIX_MAP]->{$tmp_prefix}) {
            $prefix = $tmp_prefix;
            $lname  = $tmp_lname;
            $ns     = $cur_map->[PREFIX_MAP]->{$prefix}
        }
        else { # no ns -> lname == name, all rest undef
            die "Undeclared prefix: $tmp_prefix";
        }
    }

    return ($ns, $prefix, $lname);
}
#-------------------------------------------------------------------#

#-------------------------------------------------------------------#
# parse_jclark_notation() - parse the Clarkian notation
# returns (namespace, local name); both are undef when the argument is
# not of the form {namespace}localname
#-------------------------------------------------------------------#
sub parse_jclark_notation {
    shift;
    my $jc = shift;
    # Take the captures from the match itself.  Reading $1 and $2 after an
    # unchecked match returned leftovers from an earlier successful match
    # whenever this one failed.
    my ($ns, $lname) = $jc =~ m/^\{(.*)\}([^}]+)$/;
    return $ns, $lname;
}
#-------------------------------------------------------------------#


#-------------------------------------------------------------------#
# Java names mapping
#-------------------------------------------------------------------#
*XML::NamespaceSupport::pushContext          = \&push_context;
*XML::NamespaceSupport::popContext           = \&pop_context;
*XML::NamespaceSupport::declarePrefix        = \&declare_prefix;
*XML::NamespaceSupport::declarePrefixes      = \&declare_prefixes;
*XML::NamespaceSupport::getPrefix            = \&get_prefix;
*XML::NamespaceSupport::getPrefixes          = \&get_prefixes;
*XML::NamespaceSupport::getDeclaredPrefixes  = \&get_declared_prefixes;
*XML::NamespaceSupport::getURI               = \&get_uri;
*XML::NamespaceSupport::processName          = \&process_name;
*XML::NamespaceSupport::processElementName   = \&process_element_name;
*XML::NamespaceSupport::processAttributeName = \&process_attribute_name;
*XML::NamespaceSupport::parseJClarkNotation  = \&parse_jclark_notation;
*XML::NamespaceSupport::undeclarePrefix      = \&undeclare_prefix;
#-------------------------------------------------------------------#


1;
#,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,#
#`,`, Documentation `,`,`,`,`,`,`,`,`,`,`,`,`,`,`,`,`,`,`,`,`,`,`,`,#
+#```````````````````````````````````````````````````````````````````# + +=pod + +=head1 NAME + +XML::NamespaceSupport - a simple generic namespace support class + +=head1 SYNOPSIS + + use XML::NamespaceSupport; + my $nsup = XML::NamespaceSupport->new; + + # add a new empty context + $nsup->push_context; + # declare a few prefixes + $nsup->declare_prefix($prefix1, $uri1); + $nsup->declare_prefix($prefix2, $uri2); + # the same shorter + $nsup->declare_prefixes($prefix1 => $uri1, $prefix2 => $uri2); + + # get a single prefix for a URI (randomly) + $prefix = $nsup->get_prefix($uri); + # get all prefixes for a URI (probably better) + @prefixes = $nsup->get_prefixes($uri); + # get all prefixes in scope + @prefixes = $nsup->get_prefixes(); + # get all prefixes that were declared for the current scope + @prefixes = $nsup->get_declared_prefixes; + # get a URI for a given prefix + $uri = $nsup->get_uri($prefix); + + # get info on a qname (java-ish way, it's a bit weird) + ($ns_uri, $local_name, $qname) = $nsup->process_name($qname, $is_attr); + # the same, more perlish + ($ns_uri, $prefix, $local_name) = $nsup->process_element_name($qname); + ($ns_uri, $prefix, $local_name) = $nsup->process_attribute_name($qname); + + # remove the current context + $nsup->pop_context; + + # reset the object for reuse in another document + $nsup->reset; + + # a simple helper to process Clarkian Notation + my ($ns, $lname) = $nsup->parse_jclark_notation('{http://foo}bar'); + # or (given that it doesn't care about the object + my ($ns, $lname) = XML::NamespaceSupport->parse_jclark_notation('{http://foo}bar'); + + +=head1 DESCRIPTION + +This module offers a simple to process namespaced XML names (unames) +from within any application that may need them. It also helps maintain +a prefix to namespace URI map, and provides a number of basic checks. 
+ +The model for this module is SAX2's NamespaceSupport class, readable at +http://www.megginson.com/SAX/Java/javadoc/org/xml/sax/helpers/NamespaceSupport.html. +It adds a few perlisations where we thought it appropriate. + +=head1 METHODS + +=over 4 + +=item * XML::NamespaceSupport->new(\%options) + +A simple constructor. + +The options are C<xmlns>, C<fatal_errors>, C<auto_prefix>, and C<xmlns_11>. + +If C<xmlns> is turned on (it is off by default) the mapping from the +xmlns prefix to the URI defined for it in DOM level 2 is added to the +list of predefined mappings (which normally only contains the xml +prefix mapping). + +If C<fatal_errors> is turned off (it is on by default) a number of +validity errors will simply be flagged as failures, instead of +die()ing. + +If C<auto_prefix> is turned on (it is off by default) when one +provides a prefix of C<undef> to C<declare_prefix> it will generate a +random prefix mapped to that namespace. Otherwise an undef prefix will +trigger a warning (you should probably know what you're doing if you +turn this option on). + +If C<xmlns_11> is turned off, it becomes illegal to undeclare namespace +prefixes. It is on by default. This behaviour is compliant with Namespaces +in XML 1.1, turning it off reverts you to version 1.0. + +=item * $nsup->push_context + +Adds a new empty context to the stack. You can then populate it with +new prefixes defined at this level. + +=item * $nsup->pop_context + +Removes the topmost context in the stack and reverts to the previous +one. It will die() if you try to pop more than you have pushed. + +=item * $nsup->declare_prefix($prefix, $uri) + +Declares a mapping of $prefix to $uri, at the current level. + +Note that with C<auto_prefix> turned on, if you declare a prefix +mapping in which $prefix is undef(), you will get an automatic prefix +selected for you. If it is off you will get a warning. + +This is useful when you deal with code that hasn't kept prefixes around +and need to reserialize the nodes.
It also means that if you want to +set the default namespace (ie with an empty prefix) you must use the +empty string instead of undef. This behaviour is consistent with the +SAX 2.0 specification. + +=item * $nsup->declare_prefixes(%prefixes2uris) + +Declares a mapping of several prefixes to URIs, at the current level. + +=item * $nsup->get_prefix($uri) + +Returns a prefix given an URI. Note that as several prefixes may be +mapped to the same URI, it returns an arbitrary one. It'll return +undef on failure. + +=item * $nsup->get_prefixes($uri) + +Returns an array of prefixes given an URI. It'll return all the +prefixes if the uri is undef. + +=item * $nsup->get_declared_prefixes + +Returns an array of all the prefixes that have been declared within +this context, ie those that were declared on the last element, not +those that were declared above and are simply in scope. + +=item * $nsup->get_uri($prefix) + +Returns a URI for a given prefix. Returns undef on failure. + +=item * $nsup->process_name($qname, $is_attr) + +Given a qualified name and a boolean indicating whether this is an +attribute or another type of name (those are differently affected by +default namespaces), it returns a namespace URI, local name, qualified +name tuple. I know that that is a rather abnormal list to return, but +it is so for compatibility with the Java spec. See below for more +Perlish alternatives. + +If the prefix is not declared, or if the name is not valid, it'll +either die or return undef depending on the current setting of +C<fatal_errors>. + +=item * $nsup->undeclare_prefix($prefix); + +Removes a namespace prefix from the current context. This function may +be used in SAX's end_prefix_mapping when there is fear that a namespace +declaration might be available outside their scope (which shouldn't +normally happen, but you never know ;). This may be needed in order to +properly support Namespace 1.1.
+ +=item * $nsup->process_element_name($qname) + +Given a qualified name, it returns a namespace URI, prefix, and local +name tuple. This method applies to element names. + +If the prefix is not declared, or if the name is not valid, it'll +either die or return undef depending on the current setting of +C<fatal_errors>. + +=item * $nsup->process_attribute_name($qname) + +Given a qualified name, it returns a namespace URI, prefix, and local +name tuple. This method applies to attribute names. + +If the prefix is not declared, or if the name is not valid, it'll +either die or return undef depending on the current setting of +C<fatal_errors>. + +=item * $nsup->reset + +Resets the object so that it can be reused on another document. + +=back + +All methods of the interface have an alias that is the name used in +the original Java specification. You can use either name +interchangeably. Here is the mapping: + + Java name Perl name + --------------------------------------------------- + pushContext push_context + popContext pop_context + declarePrefix declare_prefix + declarePrefixes declare_prefixes + getPrefix get_prefix + getPrefixes get_prefixes + getDeclaredPrefixes get_declared_prefixes + getURI get_uri + processName process_name + processElementName process_element_name + processAttributeName process_attribute_name + parseJClarkNotation parse_jclark_notation + undeclarePrefix undeclare_prefix + +=head1 VARIABLES + +Two global variables are made available to you. They used to be constants but +simple scalars are easier to use in a number of contexts. They are not +exported but can easily be accessed from any package, or copied into it. + +=over 4 + +=item * C<$NS_XMLNS> + +The namespace for xmlns prefixes, http://www.w3.org/2000/xmlns/. + +=item * C<$NS_XML> + +The namespace for xml prefixes, http://www.w3.org/XML/1998/namespace.
=back

=head1 TODO

 - add more tests
 - optimise here and there

=head1 AUTHOR

Robin Berjon, robin@knowscape.com, with lots of it having been done
by Duncan Cameron, and a number of suggestions from the perl-xml
list.

=head1 COPYRIGHT

Copyright (c) 2001-2005 Robin Berjon. All rights reserved. This program is
free software; you can redistribute it and/or modify it under the same terms
as Perl itself.

=head1 SEE ALSO

XML::Parser::PerlSAX

=cut

diff --git a/qcd/part_cpu/bench/lib/XML/Parser/Lite.pm b/qcd/part_cpu/bench/lib/XML/Parser/Lite.pm
new file mode 100644
index 0000000000000000000000000000000000000000..be212a621f870a32ed24de09efceb0eace4a6199
--- /dev/null
+++ b/qcd/part_cpu/bench/lib/XML/Parser/Lite.pm
@@ -0,0 +1,202 @@
# ======================================================================
#
# Copyright (C) 2000-2001 Paul Kulchenko (paulclinger@yahoo.com)
# SOAP::Lite is free software; you can redistribute it
# and/or modify it under the same terms as Perl itself.
#
# $Id: Lite.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $
#
# ======================================================================

package XML::Parser::Lite;

use strict;
use vars qw($VERSION);
$VERSION = sprintf("%d.%s", map {s/_//g; $_} q$Name: HEAD $ =~ /-(\d+)_([\d_]+)/);

# new(%parameters)
#
# Constructor.  Called on an existing object it returns that object
# unchanged.  Otherwise it clears every handler and then installs those
# given under the "Handlers" key.
sub new {
    my $self = shift;
    my $class = ref($self) || $self;
    return $self if ref $self;

    $self = bless {} => $class;
    my %parameters = @_;
    $self->setHandlers(); # clear first
    $self->setHandlers(%{$parameters{Handlers} || {}});
    return $self;
}

# setHandlers(Name => \&code, ...)
#
# Installs handlers by assigning them to globs of the same name (Start,
# End, Char, Final, Init) in this package, so they are shared by every
# parser object.  An undef handler is replaced by an empty sub.
sub setHandlers {
    my $self = shift;
    no strict 'refs'; local $^W;
    # clear all handlers if called without parameters
    unless (@_) { foreach (qw(Start End Char Final Init)) { *$_ = sub {} } }
    while (@_) { my($name => $func) = splice(@_, 0, 2); *$name = defined $func ? $func : sub {} }
    return $self;
}

# regexp($patch)
#
# Builds and returns the source text of the parsing regular expression.
# $patch is appended to the text-matching alternative; compile() passes
# '??' as a workaround.  Embedded (?{...}) blocks call char(), start()
# and end() in this package as the pattern matches.
sub regexp {
    my $patch = shift || '';
    my $package = __PACKAGE__;

    # This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html

    # Robert D. Cameron "REX: XML Shallow Parsing with Regular Expressions",
    # Technical Report TR 1998-17, School of Computing Science, Simon Fraser University, November, 1998.
    # Copyright (c) 1998, Robert D. Cameron.
    # The following code may be freely used and distributed provided that
    # this copyright and citation notice remains intact and that modifications
    # or additions are clearly identified.

    my $TextSE = "[^<]+";
    my $UntilHyphen = "[^-]*-";
    my $Until2Hyphens = "$UntilHyphen(?:[^-]$UntilHyphen)*-";
    my $CommentCE = "$Until2Hyphens>?";
    my $UntilRSBs = "[^\\]]*](?:[^\\]]+])*]+";
    my $CDATA_CE = "$UntilRSBs(?:[^\\]>]$UntilRSBs)*>";
    my $S = "[ \\n\\t\\r]+";
    my $NameStrt = "[A-Za-z_:]|[^\\x00-\\x7F]";
    my $NameChar = "[A-Za-z0-9_:.-]|[^\\x00-\\x7F]";
    my $Name = "(?:$NameStrt)(?:$NameChar)*";
    my $QuoteSE = "\"[^\"]*\"|'[^']*'";
    my $DT_IdentSE = "$S$Name(?:$S(?:$Name|$QuoteSE))*";
    my $MarkupDeclCE = "(?:[^\\]\"'><]+|$QuoteSE)*>";
    my $S1 = "[\\n\\r\\t ]";
    my $UntilQMs = "[^?]*\\?+";
    my $PI_Tail = "\\?>|$S1$UntilQMs(?:[^>?]$UntilQMs)*>";
    my $DT_ItemSE = "<(?:!(?:--$Until2Hyphens>|[^-]$MarkupDeclCE)|\\?$Name(?:$PI_Tail))|%$Name;|$S";
    my $DocTypeCE = "$DT_IdentSE(?:$S)?(?:\\[(?:$DT_ItemSE)*](?:$S)?)?>?";
    my $DeclCE = "--(?:$CommentCE)?|\\[CDATA\\[(?:$CDATA_CE)?|DOCTYPE(?:$DocTypeCE)?";
    my $PI_CE = "$Name(?:$PI_Tail)?";

    # these expressions were modified for backtracking and events
    my $EndTagCE = "($Name)(?{${package}::end(\$2)})(?:$S)?>";
    my $AttValSE = "\"([^<\"]*)\"|'([^<']*)'";
    my $ElemTagCE = "($Name)(?:$S($Name)(?:$S)?=(?:$S)?(?:$AttValSE)(?{[\@{\$^R||[]},\$4=>defined\$5?\$5:\$6]}))*(?:$S)?(/)?>(?{${package}::start(\$3,\@{\$^R||[]})})(?{\${7} and ${package}::end(\$3)})";
    my $MarkupSPE = "<(?:!(?:$DeclCE)?|\\?(?:$PI_CE)?|/(?:$EndTagCE)?|(?:$ElemTagCE)?)";

    # Next expression is under "black magic".
    # Ideally it should be '($TextSE)(?{${package}::char(\$1)})|$MarkupSPE',
    # but it doesn't work under Perl 5.005 and only magic with
    # (?:....)?? solved the problem.
    # I would appreciate if someone let me know what is the right thing to do
    # and what's the reason for all this magic.
    # Seems like a problem related to (?:....)? rather than to ?{} feature.
    # Tests are in t/31-xmlparserlite.t if you decide to play with it.
    "(?:($TextSE)(?{${package}::char(\$1)}))$patch|$MarkupSPE";
}

# compile()
#
# Defines parse_re() from the generated regular expression.  The plain
# expression is tried first and kept if it can parse a small probe
# document; otherwise the '??'-patched variant is compiled.  Afterwards
# compile() replaces itself with a no-op.
sub compile { local $^W;
    # try regexp as it should be, apply patch if doesn't work
    foreach (regexp(), regexp('??')) {
        eval qq{sub parse_re { use re "eval"; 1 while \$_[0] =~ m{$_}go }; 1} or die;
        # The probe must be a well-formed element.  Bare text outside an
        # element makes char() die, so the probe failed for both variants
        # and the patched regexp was always the one left installed.
        last if eval { parse_re('<foo>bar</foo>'); 1 }
    };

    *compile = sub {};
}

setHandlers();
compile();

# parse($xml)
#
# Parses the string passed as first argument after the invocant, firing
# the installed handlers.  Dies on unclosed or mismatched tags, multiple
# roots, junk outside the root element, or a missing root element.
sub parse {
    init();
    parse_re($_[1]);
    final();
}

# Open-element stack and count of elements started, shared by all parses.
my(@stack, $level);

# Resets the parse state and calls the Init handler.
sub init {
    @stack = (); $level = 0;
    Init(__PACKAGE__, @_);
}

# Checks that the document was complete and calls the Final handler.
sub final {
    die "not properly closed tag '$stack[-1]'\n" if @stack;
    die "no element found\n" unless $level;
    Final(__PACKAGE__, @_)
}

# Called for each start tag with the element name and attribute pairs.
sub start {
    die "multiple roots, wrong element '$_[0]'\n" if $level++ && !@stack;
    push(@stack, $_[0]);
    Start(__PACKAGE__, @_);
}

# Called for each run of text.  Inside an element it goes to the Char
# handler; outside one, only whitespace is tolerated.
sub char {
    Char(__PACKAGE__, $_[0]), return if @stack;

    # check for junk before or after element
    # can't use split or regexp due to limitations in ?{} implementation,
    # will iterate with loop, but we'll do it no more than two times, so
    # it shouldn't affect performance
    for (my $i=0; $i < length $_[0]; $i++) {
        die "junk '$_[0]' @{[$level ? 'after' : 'before']} XML element\n"
            if index("\n\r\t ", substr($_[0],$i,1)) < 0; # or should '< $[' be there
    }
}

# Called for each end tag; it must match the innermost open element.
sub end {
    pop(@stack) eq $_[0] or die "mismatched tag '$_[0]'\n";
    End(__PACKAGE__, $_[0]);
}

# ======================================================================

1;

__END__

=head1 NAME

XML::Parser::Lite - Lightweight regexp-based XML parser

=head1 SYNOPSIS

  use XML::Parser::Lite;

  $p1 = new XML::Parser::Lite;
  $p1->setHandlers(
    Start => sub { shift; print "start: @_\n" },
    Char => sub { shift; print "char: @_\n" },
    End => sub { shift; print "end: @_\n" },
  );
  $p1->parse('<foo id="me">Hello World!</foo>');

  $p2 = new XML::Parser::Lite
    Handlers => {
      Start => sub { shift; print "start: @_\n" },
      Char => sub { shift; print "char: @_\n" },
      End => sub { shift; print "end: @_\n" },
    }
  ;
  $p2->parse('<foo id="me">Hello <bar>cruel</bar> World!</foo>');

=head1 DESCRIPTION

This Perl module gives you access to XML parser with interface similar to
XML::Parser interface. Though only basic calls are supported (init, final,
start, char, and end) you should be able to use it in the same way you use
XML::Parser. Due to using experimental regexp features it'll work only on
Perl 5.6 and may behave differently on different platforms.

=head1 SEE ALSO

 XML::Parser

=head1 COPYRIGHT

Copyright (C) 2000-2001 Paul Kulchenko. All rights reserved.

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

This parser is based on "shallow parser" http://www.cs.sfu.ca/~cameron/REX.html
Copyright (c) 1998, Robert D. Cameron.
=head1 AUTHOR

Paul Kulchenko (paulclinger@yahoo.com)

=cut
diff --git a/qcd/part_cpu/bench/lib/XML/SAX.pm b/qcd/part_cpu/bench/lib/XML/SAX.pm
new file mode 100644
index 0000000000000000000000000000000000000000..730233d7d775ca22d46b4287f5d6a646b4e79cae
--- /dev/null
+++ b/qcd/part_cpu/bench/lib/XML/SAX.pm
@@ -0,0 +1,375 @@
# $Id: SAX.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $

package XML::SAX;

use strict;
use vars qw($VERSION @ISA @EXPORT_OK);

$VERSION = '0.12';

use Exporter ();
@ISA = ('Exporter');

@EXPORT_OK = qw(Namespaces Validation);

use File::Basename qw(dirname);
use File::Spec ();
use Symbol qw(gensym);
use XML::SAX::ParserFactory (); # loaded for simplicity

use constant PARSER_DETAILS => "ParserDetails.ini";

use constant Namespaces => "http://xml.org/sax/features/namespaces";
use constant Validation => "http://xml.org/sax/features/validation";

my $known_parsers = undef;

# load_parsers takes the ParserDetails.ini file out of the same directory
# that XML::SAX is in, and looks at it. Format in POD below

=begin EXAMPLE

[XML::SAX::PurePerl]
http://xml.org/sax/features/namespaces = 1
http://xml.org/sax/features/validation = 0
# a comment

# blank lines ignored

[XML::SAX::AnotherParser]
http://xml.org/sax/features/namespaces = 0
http://xml.org/sax/features/validation = 1

=end EXAMPLE

=cut

# load_parsers([$dir])
#
# (Re)loads the list of known parsers from "$dir/SAX/ParserDetails.ini".
# $dir defaults to the directory XML/SAX.pm was loaded from.  If the file
# cannot be opened a warning is issued through do_warn() and the list is
# left empty.  Returns the class, so calls can be chained.
sub load_parsers {
    my $class = shift;
    my $dir = shift;

    # reset parsers
    $known_parsers = [];

    # get directory from wherever XML::SAX is installed
    if (!$dir) {
        $dir = $INC{'XML/SAX.pm'};
        $dir = dirname($dir);
    }

    my $fh = gensym();
    # Three-argument open: the path is never interpreted as a mode.
    if (!open($fh, '<', File::Spec->catfile($dir, "SAX", PARSER_DETAILS))) {
        XML::SAX->do_warn("could not find " . PARSER_DETAILS . " in $dir/SAX\n");
        return $class;
    }

    $known_parsers = $class->_parse_ini_file($fh);
    close $fh;

    return $class;
}

# _parse_ini_file($fh)
#
# Reads an INI-style parser description from the filehandle and returns a
# reference to an array of { Name => ..., Features => { ... } } hashes in
# file order.  Comments start at "#" or ";".  Dies on a line that is
# neither blank, a comment, a [heading] nor a "key = value" pair.
sub _parse_ini_file {
    my $class = shift;
    my ($fh) = @_;

    my @config;

    my $lineno = 0;
    while (defined(my $line = <$fh>)) {
        $lineno++;
        my $original = $line;
        # Strip comments BEFORE trimming.  The other order left the
        # whitespace preceding a trailing comment in place, which padded
        # values and made "[Heading] # comment" die as an invalid line.
        $line =~ s/[#;].*$//m;
        # strip whitespace
        $line =~ s/\s+\z//;
        $line =~ s/\A\s+//;
        # ignore blanks
        next if $line eq '';

        # heading (non-greedy so padding inside the brackets is dropped)
        if ($line =~ /^\[\s*(.*?)\s*\]$/) {
            push @config, { Name => $1 };
            next;
        }

        # instruction
        elsif ($line =~ /^(.*?)\s*?=\s*(.*)$/) {
            # A feature before any heading goes to an unnamed section.
            unless(@config) {
                push @config, { Name => '' };
            }
            $config[-1]{Features}{$1} = $2;
        }

        # not whitespace, comment, or instruction
        else {
            die "Invalid line in ini: $lineno\n>>> $original\n";
        }
    }

    return \@config;
}

# parsers()
#
# Returns the array reference of known parsers, loading it on first use.
sub parsers {
    my $class = shift;
    if (!$known_parsers) {
        $class->load_parsers();
    }
    return $known_parsers;
}

# remove_parser($parser_module)
#
# Drops every entry named $parser_module from the in-memory list.
# Returns the class.
sub remove_parser {
    my $class = shift;
    my ($parser_module) = @_;

    if (!$known_parsers) {
        $class->load_parsers();
    }

    @$known_parsers = grep { $_->{Name} ne $parser_module } @$known_parsers;

    return $class;
}

# add_parser($parser_module)
#
# Loads $parser_module, records the features it reports through
# supported_features(), and places it at the end of the in-memory list
# (moving it there if already present).  Returns the class.
sub add_parser {
    my $class = shift;
    my ($parser_module) = @_;

    if (!$known_parsers) {
        $class->load_parsers();
    }

    # first load module, then query features, then push onto known_parsers,

    my $parser_file = $parser_module;
    $parser_file =~ s/::/\//g;
    $parser_file .= ".pm";

    require $parser_file;

    my @features = $parser_module->supported_features();

    my $new = { Name => $parser_module };
    foreach my $feature (@features) {
        $new->{Features}{$feature} = 1;
    }

    # If exists in list already, move to end.
    my $done = 0;
    my $pos = undef;
    for (my $i = 0; $i < @$known_parsers; $i++) {
        my $p = $known_parsers->[$i];
        if ($p->{Name} eq $parser_module) {
            $pos = $i;
        }
    }
    if (defined $pos) {
        splice(@$known_parsers, $pos, 1);
        push @$known_parsers, $new;
        $done++;
    }

    # Otherwise (not in list), add at end of list.
    if (!$done) {
        push @$known_parsers, $new;
    }

    return $class;
}

# save_parsers()
#
# Rewrites SAX/ParserDetails.ini next to the installed XML/SAX.pm from
# the in-memory list.  Dies if the file cannot be opened or flushed.
# Returns the class.
sub save_parsers {
    my $class = shift;

    # get directory from wherever XML::SAX is installed
    my $dir = $INC{'XML/SAX.pm'};
    $dir = dirname($dir);

    my $file = File::Spec->catfile($dir, "SAX", PARSER_DETAILS);
    chmod 0644, $file;
    unlink($file);

    my $fh = gensym();
    # Three-argument open keeps odd characters in the path out of the mode.
    open($fh, '>', $file) ||
        die "Cannot write to $file: $!";

    foreach my $p (@$known_parsers) {
        print $fh "[$p->{Name}]\n";
        foreach my $key (keys %{$p->{Features}}) {
            print $fh "$key = $p->{Features}{$key}\n";
        }
        print $fh "\n";
    }

    print $fh "\n";

    # Buffered write errors only surface at close time.
    close $fh or die "Cannot write to $file: $!";

    return $class;
}

# do_warn(@message)
#
# Emits a warning, except when running under Test::Harness.
sub do_warn {
    my $class = shift;
    # Don't output warnings if running under Test::Harness
    warn(@_) unless $ENV{HARNESS_ACTIVE};
}

1;
__END__

=head1 NAME

XML::SAX - Simple API for XML

=head1 SYNOPSIS

  use XML::SAX;

  # get a list of known parsers
  my $parsers = XML::SAX->parsers();

  # add/update a parser
  XML::SAX->add_parser(q(XML::SAX::PurePerl));

  # remove parser
  XML::SAX->remove_parser(q(XML::SAX::Foodelberry));

  # save parsers
  XML::SAX->save_parsers();

=head1 DESCRIPTION

XML::SAX is a SAX parser access API for Perl. It includes classes
and APIs required for implementing SAX drivers, along with a factory
class for returning any SAX parser installed on the user's system.

=head1 USING A SAX2 PARSER

The factory class is XML::SAX::ParserFactory. Please see the
documentation of that module for how to instantiate a SAX parser:
L<XML::SAX::ParserFactory>.
However if you don't want to load up +another manual page, here's a short synopsis: + + use XML::SAX::ParserFactory; + use XML::SAX::XYZHandler; + my $handler = XML::SAX::XYZHandler->new(); + my $p = XML::SAX::ParserFactory->parser(Handler => $handler); + $p->parse_uri("foo.xml"); + # or $p->parse_string("<foo/>") or $p->parse_file($fh); + +This will automatically load a SAX2 parser (defaulting to +XML::SAX::PurePerl if no others are found) and return it to you. + +In order to learn how to use SAX to parse XML, you will need to read +L<XML::SAX::Intro> and for reference, L<XML::SAX::Specification>. + +=head1 WRITING A SAX2 PARSER + +The first thing to remember in writing a SAX2 parser is to subclass +XML::SAX::Base. This will make your life infinitely easier, by providing +a number of methods automagically for you. See L<XML::SAX::Base> for more +details. + +When writing a SAX2 parser that is compatible with XML::SAX, you need +to inform XML::SAX of the presence of that driver when you install it. +In order to do that, XML::SAX contains methods for saving the fact that +the parser exists on your system to an "INI" file, which is then loaded +to determine which parsers are installed. + +The best way to do this is to follow these rules: + +=over 4 + +=item * Add XML::SAX as a prerequisite in Makefile.PL: + + WriteMakefile( + ... + PREREQ_PM => { 'XML::SAX' => 0 }, + ... + ); + +Alternatively you may wish to check for it in other ways that will +cause more than just a warning. 
+ +=item * Add the following code snippet to your Makefile.PL: + + sub MY::install { + package MY; + my $script = shift->SUPER::install(@_); + if (ExtUtils::MakeMaker::prompt( + "Do you want to modify ParserDetails.ini?", 'Y') + =~ /^y/i) { + $script =~ s/install :: (.*)$/install :: $1 install_sax_driver/m; + $script .= <<"INSTALL"; + + install_sax_driver : + \t\@\$(PERL) -MXML::SAX -e "XML::SAX->add_parser(q(\$(NAME)))->save_parsers()" + + INSTALL + } + return $script; + } + +Note that you should check the output of this - \$(NAME) will use the name of +your distribution, which may not be exactly what you want. For example XML::LibXML +has a driver called XML::LibXML::SAX::Generator, which is used in place of +\$(NAME) in the above. + +=item * Add an XML::SAX test: + +A test file should be added to your t/ directory containing something like the +following: + + use Test; + BEGIN { plan tests => 4 } + use XML::SAX; + use XML::SAX::PurePerl::DebugHandler; + XML::SAX->add_parser(q(XML::SAX::MyDriver)); + local $XML::SAX::ParserPackage = 'XML::SAX::MyDriver'; + eval { + my $handler = XML::SAX::PurePerl::DebugHandler->new(); + ok($handler); + my $parser = XML::SAX::ParserFactory->parser(Handler => $handler); + ok($parser); + ok($parser->isa('XML::SAX::MyDriver')); + $parser->parse_string("<tag/>"); + ok($handler->{seen}{start_element}); + }; + +=back + +=head1 EXPORTS + +By default, XML::SAX exports nothing into the caller's namespace. 
However you +can request the symbols C<Namespaces> and C<Validation> which are the +URIs for those features, allowing an easier way to request those features +via ParserFactory: + + use XML::SAX qw(Namespaces Validation); + my $factory = XML::SAX::ParserFactory->new(); + $factory->require_feature(Namespaces); + $factory->require_feature(Validation); + my $parser = $factory->parser(); + +=head1 AUTHOR + +Matt Sergeant, matt@sergeant.org + +Kip Hampton, khampton@totalcinema.com + +Robin Berjon, robin@knowscape.com + +=head1 LICENSE + +This is free software, you may use it and distribute it under +the same terms as Perl itself. + +=head1 SEE ALSO + +L<XML::SAX::Base> for writing SAX Filters and Parsers + +L<XML::SAX::PurePerl> for an XML parser written in 100% +pure perl. + +L<XML::SAX::Exception> for details on exception handling + +=cut + diff --git a/qcd/part_cpu/bench/lib/XML/SAX/Base.pm b/qcd/part_cpu/bench/lib/XML/SAX/Base.pm new file mode 100644 index 0000000000000000000000000000000000000000..f0336db4b81ebbf652236caff61915a73ed6c304 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/Base.pm @@ -0,0 +1,2854 @@ +package XML::SAX::Base; + +# version 0.10 - Kip Hampton +# version 0.13 - Robin Berjon +# version 0.15 - Kip Hampton +# version 0.17 - Kip Hampton +# version 0.19 - Kip Hampton +# version 0.21 - Kip Hampton +# version 0.22 - Robin Berjon +# version 0.23 - Matt Sergeant +# version 0.24 - Robin Berjon +# version 0.25 - Kip Hampton +# version 1.00 - Kip Hampton +# version 1.01 - Kip Hampton +# version 1.02 - Robin Berjon +# version 1.03 - Matt Sergeant +# version 1.04 - Kip Hampton + +#-----------------------------------------------------# +# STOP!!!!! +# +# This file is generated by the 'Makefile.PL' file +# that ships with the XML::SAX distribution. +# If you need to make changes, patch that file NOT +# this one. 
+#-----------------------------------------------------# + +use strict; +use vars qw($VERSION); +use XML::SAX::Exception qw(); + +$VERSION = '1.04'; + +sub end_prefix_mapping { + my $self = shift; + if (defined $self->{Methods}->{'end_prefix_mapping'}) { + $self->{Methods}->{'end_prefix_mapping'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'ContentHandler'} and $method = $callbacks->{'ContentHandler'}->can('end_prefix_mapping') ) { + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'end_prefix_mapping'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('end_prefix_mapping') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'end_prefix_mapping'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'ContentHandler'} and $callbacks->{'ContentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'ContentHandler'}->end_prefix_mapping(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'end_prefix_mapping'} = sub { $handler->end_prefix_mapping(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->end_prefix_mapping(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'end_prefix_mapping'} = sub { $handler->end_prefix_mapping(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'end_prefix_mapping'} = sub { }; + } + } + +} + +sub internal_entity_decl { + my $self = shift; + if (defined $self->{Methods}->{'internal_entity_decl'}) { + $self->{Methods}->{'internal_entity_decl'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'DeclHandler'} and $method = $callbacks->{'DeclHandler'}->can('internal_entity_decl') ) { + my $handler = $callbacks->{'DeclHandler'}; + $self->{Methods}->{'internal_entity_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('internal_entity_decl') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'internal_entity_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DeclHandler'} and $callbacks->{'DeclHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DeclHandler'}->internal_entity_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DeclHandler'}; + $self->{Methods}->{'internal_entity_decl'} = sub { $handler->internal_entity_decl(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->internal_entity_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. 
+ # Not fatal, but we might want to address it. + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'internal_entity_decl'} = sub { $handler->internal_entity_decl(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'internal_entity_decl'} = sub { }; + } + } + +} + +sub characters { + my $self = shift; + if (defined $self->{Methods}->{'characters'}) { + $self->{Methods}->{'characters'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'ContentHandler'} and $method = $callbacks->{'ContentHandler'}->can('characters') ) { + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'characters'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DocumentHandler'} and $method = $callbacks->{'DocumentHandler'}->can('characters') ) { + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'characters'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('characters') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'characters'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'ContentHandler'} and $callbacks->{'ContentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'ContentHandler'}->characters(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'characters'} = sub { $handler->characters(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'DocumentHandler'} and $callbacks->{'DocumentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DocumentHandler'}->characters(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'characters'} = sub { $handler->characters(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->characters(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'characters'} = sub { $handler->characters(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'characters'} = sub { }; + } + } + +} + +sub start_element { + my $self = shift; + if (defined $self->{Methods}->{'start_element'}) { + $self->{Methods}->{'start_element'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'ContentHandler'} and $method = $callbacks->{'ContentHandler'}->can('start_element') ) { + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'start_element'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DocumentHandler'} and $method = $callbacks->{'DocumentHandler'}->can('start_element') ) { + my $handler = $callbacks->{'DocumentHandler'}; 
+ $self->{Methods}->{'start_element'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('start_element') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'start_element'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'ContentHandler'} and $callbacks->{'ContentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'ContentHandler'}->start_element(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'start_element'} = sub { $handler->start_element(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'DocumentHandler'} and $callbacks->{'DocumentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DocumentHandler'}->start_element(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'start_element'} = sub { $handler->start_element(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->start_element(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'start_element'} = sub { $handler->start_element(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'start_element'} = sub { }; + } + } + +} + +sub external_entity_decl { + my $self = shift; + if (defined $self->{Methods}->{'external_entity_decl'}) { + $self->{Methods}->{'external_entity_decl'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'DeclHandler'} and $method = $callbacks->{'DeclHandler'}->can('external_entity_decl') ) { + my $handler = $callbacks->{'DeclHandler'}; + $self->{Methods}->{'external_entity_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('external_entity_decl') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'external_entity_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DeclHandler'} and $callbacks->{'DeclHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DeclHandler'}->external_entity_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DeclHandler'}; + $self->{Methods}->{'external_entity_decl'} = sub { $handler->external_entity_decl(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->external_entity_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. 
+ # Not fatal, but we might want to address it. + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'external_entity_decl'} = sub { $handler->external_entity_decl(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'external_entity_decl'} = sub { }; + } + } + +} + +sub xml_decl { + my $self = shift; + if (defined $self->{Methods}->{'xml_decl'}) { + $self->{Methods}->{'xml_decl'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'DTDHandler'} and $method = $callbacks->{'DTDHandler'}->can('xml_decl') ) { + my $handler = $callbacks->{'DTDHandler'}; + $self->{Methods}->{'xml_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('xml_decl') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'xml_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DTDHandler'} and $callbacks->{'DTDHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DTDHandler'}->xml_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DTDHandler'}; + $self->{Methods}->{'xml_decl'} = sub { $handler->xml_decl(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->xml_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'xml_decl'} = sub { $handler->xml_decl(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'xml_decl'} = sub { }; + } + } + +} + +sub entity_decl { + my $self = shift; + if (defined $self->{Methods}->{'entity_decl'}) { + $self->{Methods}->{'entity_decl'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'DTDHandler'} and $method = $callbacks->{'DTDHandler'}->can('entity_decl') ) { + my $handler = $callbacks->{'DTDHandler'}; + $self->{Methods}->{'entity_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('entity_decl') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'entity_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DTDHandler'} and $callbacks->{'DTDHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DTDHandler'}->entity_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DTDHandler'}; + $self->{Methods}->{'entity_decl'} = sub { $handler->entity_decl(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->entity_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'entity_decl'} = sub { $handler->entity_decl(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'entity_decl'} = sub { }; + } + } + +} + +sub end_dtd { + my $self = shift; + if (defined $self->{Methods}->{'end_dtd'}) { + $self->{Methods}->{'end_dtd'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'LexicalHandler'} and $method = $callbacks->{'LexicalHandler'}->can('end_dtd') ) { + my $handler = $callbacks->{'LexicalHandler'}; + $self->{Methods}->{'end_dtd'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('end_dtd') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'end_dtd'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'LexicalHandler'} and $callbacks->{'LexicalHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'LexicalHandler'}->end_dtd(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'LexicalHandler'}; + $self->{Methods}->{'end_dtd'} = sub { $handler->end_dtd(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->end_dtd(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'end_dtd'} = sub { $handler->end_dtd(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'end_dtd'} = sub { }; + } + } + +} + +sub unparsed_entity_decl { + my $self = shift; + if (defined $self->{Methods}->{'unparsed_entity_decl'}) { + $self->{Methods}->{'unparsed_entity_decl'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'DTDHandler'} and $method = $callbacks->{'DTDHandler'}->can('unparsed_entity_decl') ) { + my $handler = $callbacks->{'DTDHandler'}; + $self->{Methods}->{'unparsed_entity_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('unparsed_entity_decl') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'unparsed_entity_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DTDHandler'} and $callbacks->{'DTDHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DTDHandler'}->unparsed_entity_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DTDHandler'}; + $self->{Methods}->{'unparsed_entity_decl'} = sub { $handler->unparsed_entity_decl(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->unparsed_entity_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'unparsed_entity_decl'} = sub { $handler->unparsed_entity_decl(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'unparsed_entity_decl'} = sub { }; + } + } + +} + +sub processing_instruction { + my $self = shift; + if (defined $self->{Methods}->{'processing_instruction'}) { + $self->{Methods}->{'processing_instruction'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'ContentHandler'} and $method = $callbacks->{'ContentHandler'}->can('processing_instruction') ) { + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'processing_instruction'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DocumentHandler'} and $method = $callbacks->{'DocumentHandler'}->can('processing_instruction') ) { + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'processing_instruction'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('processing_instruction') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'processing_instruction'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'ContentHandler'} and $callbacks->{'ContentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'ContentHandler'}->processing_instruction(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'processing_instruction'} = sub { $handler->processing_instruction(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'DocumentHandler'} and $callbacks->{'DocumentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DocumentHandler'}->processing_instruction(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'processing_instruction'} = sub { $handler->processing_instruction(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->processing_instruction(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'processing_instruction'} = sub { $handler->processing_instruction(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'processing_instruction'} = sub { }; + } + } + +} + +sub attribute_decl { + my $self = shift; + if (defined $self->{Methods}->{'attribute_decl'}) { + $self->{Methods}->{'attribute_decl'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'DeclHandler'} and $method = $callbacks->{'DeclHandler'}->can('attribute_decl') ) { + my $handler = $callbacks->{'DeclHandler'}; + $self->{Methods}->{'attribute_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('attribute_decl') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'attribute_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DeclHandler'} and $callbacks->{'DeclHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DeclHandler'}->attribute_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DeclHandler'}; + $self->{Methods}->{'attribute_decl'} = sub { $handler->attribute_decl(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->attribute_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'attribute_decl'} = sub { $handler->attribute_decl(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'attribute_decl'} = sub { }; + } + } + +} + +sub fatal_error { + my $self = shift; + if (defined $self->{Methods}->{'fatal_error'}) { + $self->{Methods}->{'fatal_error'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'ErrorHandler'} and $method = $callbacks->{'ErrorHandler'}->can('fatal_error') ) { + my $handler = $callbacks->{'ErrorHandler'}; + $self->{Methods}->{'fatal_error'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('fatal_error') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'fatal_error'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'ErrorHandler'} and $callbacks->{'ErrorHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'ErrorHandler'}->fatal_error(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'ErrorHandler'}; + $self->{Methods}->{'fatal_error'} = sub { $handler->fatal_error(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->fatal_error(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'fatal_error'} = sub { $handler->fatal_error(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'fatal_error'} = sub { }; + } + } + +} + +sub end_cdata { + my $self = shift; + if (defined $self->{Methods}->{'end_cdata'}) { + $self->{Methods}->{'end_cdata'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'DocumentHandler'} and $method = $callbacks->{'DocumentHandler'}->can('end_cdata') ) { + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'end_cdata'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'LexicalHandler'} and $method = $callbacks->{'LexicalHandler'}->can('end_cdata') ) { + my $handler = $callbacks->{'LexicalHandler'}; + $self->{Methods}->{'end_cdata'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('end_cdata') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'end_cdata'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DocumentHandler'} and $callbacks->{'DocumentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DocumentHandler'}->end_cdata(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'end_cdata'} = sub { $handler->end_cdata(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'LexicalHandler'} and $callbacks->{'LexicalHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'LexicalHandler'}->end_cdata(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'LexicalHandler'}; + $self->{Methods}->{'end_cdata'} = sub { $handler->end_cdata(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->end_cdata(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'end_cdata'} = sub { $handler->end_cdata(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'end_cdata'} = sub { }; + } + } + +} + +sub start_entity { + my $self = shift; + if (defined $self->{Methods}->{'start_entity'}) { + $self->{Methods}->{'start_entity'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'LexicalHandler'} and $method = $callbacks->{'LexicalHandler'}->can('start_entity') ) { + my $handler = $callbacks->{'LexicalHandler'}; + $self->{Methods}->{'start_entity'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('start_entity') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'start_entity'} = sub 
{ $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'LexicalHandler'} and $callbacks->{'LexicalHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'LexicalHandler'}->start_entity(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'LexicalHandler'}; + $self->{Methods}->{'start_entity'} = sub { $handler->start_entity(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->start_entity(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'start_entity'} = sub { $handler->start_entity(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'start_entity'} = sub { }; + } + } + +} + +sub start_prefix_mapping { + my $self = shift; + if (defined $self->{Methods}->{'start_prefix_mapping'}) { + $self->{Methods}->{'start_prefix_mapping'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'ContentHandler'} and $method = $callbacks->{'ContentHandler'}->can('start_prefix_mapping') ) { + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'start_prefix_mapping'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('start_prefix_mapping') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'start_prefix_mapping'} = 
sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'ContentHandler'} and $callbacks->{'ContentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'ContentHandler'}->start_prefix_mapping(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'start_prefix_mapping'} = sub { $handler->start_prefix_mapping(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->start_prefix_mapping(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'start_prefix_mapping'} = sub { $handler->start_prefix_mapping(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'start_prefix_mapping'} = sub { }; + } + } + +} + +sub error { + my $self = shift; + if (defined $self->{Methods}->{'error'}) { + $self->{Methods}->{'error'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'ErrorHandler'} and $method = $callbacks->{'ErrorHandler'}->can('error') ) { + my $handler = $callbacks->{'ErrorHandler'}; + $self->{Methods}->{'error'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('error') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'error'} = sub { $method->($handler, @_) }; + return 
$method->($handler, @_); + } + elsif (defined $callbacks->{'ErrorHandler'} and $callbacks->{'ErrorHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'ErrorHandler'}->error(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'ErrorHandler'}; + $self->{Methods}->{'error'} = sub { $handler->error(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->error(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'error'} = sub { $handler->error(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'error'} = sub { }; + } + } + +} + +sub start_document { + my $self = shift; + if (defined $self->{Methods}->{'start_document'}) { + $self->{Methods}->{'start_document'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'ContentHandler'} and $method = $callbacks->{'ContentHandler'}->can('start_document') ) { + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'start_document'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DocumentHandler'} and $method = $callbacks->{'DocumentHandler'}->can('start_document') ) { + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'start_document'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} 
and $method = $callbacks->{'Handler'}->can('start_document') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'start_document'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'ContentHandler'} and $callbacks->{'ContentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'ContentHandler'}->start_document(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'start_document'} = sub { $handler->start_document(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'DocumentHandler'} and $callbacks->{'DocumentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DocumentHandler'}->start_document(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'start_document'} = sub { $handler->start_document(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->start_document(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'start_document'} = sub { $handler->start_document(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'start_document'} = sub { }; + } + } + +} + +sub ignorable_whitespace { + my $self = shift; + if (defined $self->{Methods}->{'ignorable_whitespace'}) { + $self->{Methods}->{'ignorable_whitespace'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'ContentHandler'} and $method = $callbacks->{'ContentHandler'}->can('ignorable_whitespace') ) { + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'ignorable_whitespace'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DocumentHandler'} and $method = $callbacks->{'DocumentHandler'}->can('ignorable_whitespace') ) { + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'ignorable_whitespace'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('ignorable_whitespace') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'ignorable_whitespace'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'ContentHandler'} and $callbacks->{'ContentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'ContentHandler'}->ignorable_whitespace(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'ignorable_whitespace'} = sub { $handler->ignorable_whitespace(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'DocumentHandler'} and $callbacks->{'DocumentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DocumentHandler'}->ignorable_whitespace(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'ignorable_whitespace'} = sub { $handler->ignorable_whitespace(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->ignorable_whitespace(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'ignorable_whitespace'} = sub { $handler->ignorable_whitespace(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'ignorable_whitespace'} = sub { }; + } + } + +} + +sub end_document { + my $self = shift; + if (defined $self->{Methods}->{'end_document'}) { + $self->{Methods}->{'end_document'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'ContentHandler'} and $method = $callbacks->{'ContentHandler'}->can('end_document') ) { + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'end_document'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DocumentHandler'} and $method = $callbacks->{'DocumentHandler'}->can('end_document') ) { + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'end_document'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('end_document') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'end_document'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'ContentHandler'} and $callbacks->{'ContentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'ContentHandler'}->end_document(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'end_document'} = sub { $handler->end_document(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'DocumentHandler'} and $callbacks->{'DocumentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DocumentHandler'}->end_document(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'end_document'} = sub { $handler->end_document(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->end_document(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'end_document'} = sub { $handler->end_document(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'end_document'} = sub { }; + } + } + +} + +sub start_cdata { + my $self = shift; + if (defined $self->{Methods}->{'start_cdata'}) { + $self->{Methods}->{'start_cdata'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'DocumentHandler'} and $method = $callbacks->{'DocumentHandler'}->can('start_cdata') ) { + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'start_cdata'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'LexicalHandler'} and $method = $callbacks->{'LexicalHandler'}->can('start_cdata') ) { + my $handler = $callbacks->{'LexicalHandler'}; + $self->{Methods}->{'start_cdata'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('start_cdata') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'start_cdata'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DocumentHandler'} and $callbacks->{'DocumentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DocumentHandler'}->start_cdata(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'start_cdata'} = sub { $handler->start_cdata(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'LexicalHandler'} and $callbacks->{'LexicalHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'LexicalHandler'}->start_cdata(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'LexicalHandler'}; + $self->{Methods}->{'start_cdata'} = sub { $handler->start_cdata(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->start_cdata(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'start_cdata'} = sub { $handler->start_cdata(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'start_cdata'} = sub { }; + } + } + +} + +sub set_document_locator { + my $self = shift; + if (defined $self->{Methods}->{'set_document_locator'}) { + $self->{Methods}->{'set_document_locator'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'ContentHandler'} and $method = $callbacks->{'ContentHandler'}->can('set_document_locator') ) { + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'set_document_locator'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DocumentHandler'} and $method = $callbacks->{'DocumentHandler'}->can('set_document_locator') ) { + 
my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'set_document_locator'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('set_document_locator') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'set_document_locator'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'ContentHandler'} and $callbacks->{'ContentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'ContentHandler'}->set_document_locator(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'set_document_locator'} = sub { $handler->set_document_locator(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'DocumentHandler'} and $callbacks->{'DocumentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DocumentHandler'}->set_document_locator(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'set_document_locator'} = sub { $handler->set_document_locator(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->set_document_locator(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'set_document_locator'} = sub { $handler->set_document_locator(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'set_document_locator'} = sub { }; + } + } + +} + +sub attlist_decl { + my $self = shift; + if (defined $self->{Methods}->{'attlist_decl'}) { + $self->{Methods}->{'attlist_decl'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'DTDHandler'} and $method = $callbacks->{'DTDHandler'}->can('attlist_decl') ) { + my $handler = $callbacks->{'DTDHandler'}; + $self->{Methods}->{'attlist_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('attlist_decl') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'attlist_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DTDHandler'} and $callbacks->{'DTDHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DTDHandler'}->attlist_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DTDHandler'}; + $self->{Methods}->{'attlist_decl'} = sub { $handler->attlist_decl(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->attlist_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'attlist_decl'} = sub { $handler->attlist_decl(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'attlist_decl'} = sub { }; + } + } + +} + +sub start_dtd { + my $self = shift; + if (defined $self->{Methods}->{'start_dtd'}) { + $self->{Methods}->{'start_dtd'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'LexicalHandler'} and $method = $callbacks->{'LexicalHandler'}->can('start_dtd') ) { + my $handler = $callbacks->{'LexicalHandler'}; + $self->{Methods}->{'start_dtd'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('start_dtd') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'start_dtd'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'LexicalHandler'} and $callbacks->{'LexicalHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'LexicalHandler'}->start_dtd(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'LexicalHandler'}; + $self->{Methods}->{'start_dtd'} = sub { $handler->start_dtd(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->start_dtd(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'start_dtd'} = sub { $handler->start_dtd(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'start_dtd'} = sub { }; + } + } + +} + +sub resolve_entity { + my $self = shift; + if (defined $self->{Methods}->{'resolve_entity'}) { + $self->{Methods}->{'resolve_entity'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'EntityResolver'} and $method = $callbacks->{'EntityResolver'}->can('resolve_entity') ) { + my $handler = $callbacks->{'EntityResolver'}; + $self->{Methods}->{'resolve_entity'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('resolve_entity') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'resolve_entity'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'EntityResolver'} and $callbacks->{'EntityResolver'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'EntityResolver'}->resolve_entity(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'EntityResolver'}; + $self->{Methods}->{'resolve_entity'} = sub { $handler->resolve_entity(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->resolve_entity(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'resolve_entity'} = sub { $handler->resolve_entity(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'resolve_entity'} = sub { }; + } + } + +} + +sub entity_reference { + my $self = shift; + if (defined $self->{Methods}->{'entity_reference'}) { + $self->{Methods}->{'entity_reference'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'DocumentHandler'} and $method = $callbacks->{'DocumentHandler'}->can('entity_reference') ) { + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'entity_reference'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('entity_reference') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'entity_reference'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DocumentHandler'} and $callbacks->{'DocumentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DocumentHandler'}->entity_reference(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'entity_reference'} = sub { $handler->entity_reference(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->entity_reference(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. 
+ # Not fatal, but we might want to address it. + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'entity_reference'} = sub { $handler->entity_reference(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'entity_reference'} = sub { }; + } + } + +} + +sub element_decl { + my $self = shift; + if (defined $self->{Methods}->{'element_decl'}) { + $self->{Methods}->{'element_decl'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'DeclHandler'} and $method = $callbacks->{'DeclHandler'}->can('element_decl') ) { + my $handler = $callbacks->{'DeclHandler'}; + $self->{Methods}->{'element_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('element_decl') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'element_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DeclHandler'} and $callbacks->{'DeclHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DeclHandler'}->element_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DeclHandler'}; + $self->{Methods}->{'element_decl'} = sub { $handler->element_decl(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->element_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'element_decl'} = sub { $handler->element_decl(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'element_decl'} = sub { }; + } + } + +} + +sub notation_decl { + my $self = shift; + if (defined $self->{Methods}->{'notation_decl'}) { + $self->{Methods}->{'notation_decl'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'DTDHandler'} and $method = $callbacks->{'DTDHandler'}->can('notation_decl') ) { + my $handler = $callbacks->{'DTDHandler'}; + $self->{Methods}->{'notation_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('notation_decl') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'notation_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DTDHandler'} and $callbacks->{'DTDHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DTDHandler'}->notation_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DTDHandler'}; + $self->{Methods}->{'notation_decl'} = sub { $handler->notation_decl(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->notation_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'notation_decl'} = sub { $handler->notation_decl(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'notation_decl'} = sub { }; + } + } + +} + +sub skipped_entity { + my $self = shift; + if (defined $self->{Methods}->{'skipped_entity'}) { + $self->{Methods}->{'skipped_entity'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'ContentHandler'} and $method = $callbacks->{'ContentHandler'}->can('skipped_entity') ) { + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'skipped_entity'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('skipped_entity') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'skipped_entity'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'ContentHandler'} and $callbacks->{'ContentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'ContentHandler'}->skipped_entity(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'skipped_entity'} = sub { $handler->skipped_entity(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->skipped_entity(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'skipped_entity'} = sub { $handler->skipped_entity(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'skipped_entity'} = sub { }; + } + } + +} + +sub end_element { + my $self = shift; + if (defined $self->{Methods}->{'end_element'}) { + $self->{Methods}->{'end_element'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'ContentHandler'} and $method = $callbacks->{'ContentHandler'}->can('end_element') ) { + my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'end_element'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DocumentHandler'} and $method = $callbacks->{'DocumentHandler'}->can('end_element') ) { + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'end_element'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('end_element') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'end_element'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'ContentHandler'} and $callbacks->{'ContentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'ContentHandler'}->end_element(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'ContentHandler'}; + $self->{Methods}->{'end_element'} = sub { $handler->end_element(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'DocumentHandler'} and $callbacks->{'DocumentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DocumentHandler'}->end_element(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'end_element'} = sub { $handler->end_element(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->end_element(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'end_element'} = sub { $handler->end_element(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'end_element'} = sub { }; + } + } + +} + +sub doctype_decl { + my $self = shift; + if (defined $self->{Methods}->{'doctype_decl'}) { + $self->{Methods}->{'doctype_decl'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'DTDHandler'} and $method = $callbacks->{'DTDHandler'}->can('doctype_decl') ) { + my $handler = $callbacks->{'DTDHandler'}; + $self->{Methods}->{'doctype_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('doctype_decl') ) { + my $handler = $callbacks->{'Handler'}; + 
$self->{Methods}->{'doctype_decl'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DTDHandler'} and $callbacks->{'DTDHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DTDHandler'}->doctype_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DTDHandler'}; + $self->{Methods}->{'doctype_decl'} = sub { $handler->doctype_decl(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->doctype_decl(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'doctype_decl'} = sub { $handler->doctype_decl(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'doctype_decl'} = sub { }; + } + } + +} + +sub comment { + my $self = shift; + if (defined $self->{Methods}->{'comment'}) { + $self->{Methods}->{'comment'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'DocumentHandler'} and $method = $callbacks->{'DocumentHandler'}->can('comment') ) { + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'comment'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'LexicalHandler'} and $method = $callbacks->{'LexicalHandler'}->can('comment') ) { + my $handler = $callbacks->{'LexicalHandler'}; + $self->{Methods}->{'comment'} = sub { $method->($handler, @_) }; + return 
$method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('comment') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'comment'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'DocumentHandler'} and $callbacks->{'DocumentHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'DocumentHandler'}->comment(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'DocumentHandler'}; + $self->{Methods}->{'comment'} = sub { $handler->comment(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'LexicalHandler'} and $callbacks->{'LexicalHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'LexicalHandler'}->comment(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'LexicalHandler'}; + $self->{Methods}->{'comment'} = sub { $handler->comment(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->comment(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'comment'} = sub { $handler->comment(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'comment'} = sub { }; + } + } + +} + +sub end_entity { + my $self = shift; + if (defined $self->{Methods}->{'end_entity'}) { + $self->{Methods}->{'end_entity'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'LexicalHandler'} and $method = $callbacks->{'LexicalHandler'}->can('end_entity') ) { + my $handler = $callbacks->{'LexicalHandler'}; + $self->{Methods}->{'end_entity'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('end_entity') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'end_entity'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'LexicalHandler'} and $callbacks->{'LexicalHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'LexicalHandler'}->end_entity(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'LexicalHandler'}; + $self->{Methods}->{'end_entity'} = sub { $handler->end_entity(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->end_entity(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'end_entity'} = sub { $handler->end_entity(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'end_entity'} = sub { }; + } + } + +} + +sub warning { + my $self = shift; + if (defined $self->{Methods}->{'warning'}) { + $self->{Methods}->{'warning'}->(@_); + } + else { + my $method; + my $callbacks; + if (exists $self->{ParseOptions}) { + $callbacks = $self->{ParseOptions}; + } + else { + $callbacks = $self; + } + if (0) { # dummy to make elsif's below compile + } + elsif (defined $callbacks->{'ErrorHandler'} and $method = $callbacks->{'ErrorHandler'}->can('warning') ) { + my $handler = $callbacks->{'ErrorHandler'}; + $self->{Methods}->{'warning'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'Handler'} and $method = $callbacks->{'Handler'}->can('warning') ) { + my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'warning'} = sub { $method->($handler, @_) }; + return $method->($handler, @_); + } + elsif (defined $callbacks->{'ErrorHandler'} and $callbacks->{'ErrorHandler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'ErrorHandler'}->warning(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. + my $handler = $callbacks->{'ErrorHandler'}; + $self->{Methods}->{'warning'} = sub { $handler->warning(@_) }; + } + return $res; + } + elsif (defined $callbacks->{'Handler'} and $callbacks->{'Handler'}->can('AUTOLOAD') ) { + my $res = eval { $callbacks->{'Handler'}->warning(@_) }; + if ($@) { + die $@; + } + else { + # I think there's a buggette here... + # if the first call throws an exception, we don't set it up right. + # Not fatal, but we might want to address it. 
+ my $handler = $callbacks->{'Handler'}; + $self->{Methods}->{'warning'} = sub { $handler->warning(@_) }; + } + return $res; + } + else { + $self->{Methods}->{'warning'} = sub { }; + } + } + +} + +#-------------------------------------------------------------------# +# Class->new(%options) +#-------------------------------------------------------------------# +sub new { + my $proto = shift; + my $class = ref($proto) || $proto; + my $options = ($#_ == 0) ? shift : { @_ }; + + unless ( defined( $options->{Handler} ) or + defined( $options->{ContentHandler} ) or + defined( $options->{DTDHandler} ) or + defined( $options->{DocumentHandler} ) or + defined( $options->{LexicalHandler} ) or + defined( $options->{ErrorHandler} ) or + defined( $options->{DeclHandler} ) ) { + + $options->{Handler} = XML::SAX::Base::NoHandler->new; + } + + my $self = bless $options, $class; + # turn NS processing on by default + $self->set_feature('http://xml.org/sax/features/namespaces', 1); + return $self; +} +#-------------------------------------------------------------------# + +#-------------------------------------------------------------------# +# $p->parse(%options) +#-------------------------------------------------------------------# +sub parse { + my $self = shift; + my $parse_options = $self->get_options(@_); + local $self->{ParseOptions} = $parse_options; + if ($self->{Parent}) { # calling parse on a filter for some reason + return $self->{Parent}->parse($parse_options); + } + else { + my $method; + if (defined $parse_options->{Source}{CharacterStream} and $method = $self->can('_parse_characterstream')) { + warn("parse charstream???\n"); + return $method->($self, $parse_options->{Source}{CharacterStream}); + } + elsif (defined $parse_options->{Source}{ByteStream} and $method = $self->can('_parse_bytestream')) { + return $method->($self, $parse_options->{Source}{ByteStream}); + } + elsif (defined $parse_options->{Source}{String} and $method = $self->can('_parse_string')) { + 
return $method->($self, $parse_options->{Source}{String}); + } + elsif (defined $parse_options->{Source}{SystemId} and $method = $self->can('_parse_systemid')) { + return $method->($self, $parse_options->{Source}{SystemId}); + } + else { + die "No _parse_* routine defined on this driver (if it a filter, remember to set the Parent property) [$self]"; + } + } +} +#-------------------------------------------------------------------# + +#-------------------------------------------------------------------# +# $p->parse_file(%options) +#-------------------------------------------------------------------# +sub parse_file { + my $self = shift; + my $file = shift; + my $parse_options = $self->get_options(@_); + $parse_options->{Source}{ByteStream} = $file; + return $self->parse($parse_options); +} +#-------------------------------------------------------------------# + +#-------------------------------------------------------------------# +# $p->parse_uri(%options) +#-------------------------------------------------------------------# +sub parse_uri { + my $self = shift; + my $file = shift; + my $parse_options = $self->get_options(@_); + $parse_options->{Source}{SystemId} = $file; + return $self->parse($parse_options); +} +#-------------------------------------------------------------------# + +#-------------------------------------------------------------------# +# $p->parse_string(%options) +#-------------------------------------------------------------------# +sub parse_string { + my $self = shift; + my $string = shift; + my $parse_options = $self->get_options(@_); + $parse_options->{Source}{String} = $string; + return $self->parse($parse_options); +} +#-------------------------------------------------------------------# + +#-------------------------------------------------------------------# +# get_options +#-------------------------------------------------------------------# +sub get_options { + my $self = shift; + + if (@_ == 1) { + return { %$self, %{$_[0]} }; + } 
else { + return { %$self, @_ }; + } +} +#-------------------------------------------------------------------# + +#-------------------------------------------------------------------# +# get_features +#-------------------------------------------------------------------# +sub get_features { + return ( + 'http://xml.org/sax/features/external-general-entities' => undef, + 'http://xml.org/sax/features/external-parameter-entities' => undef, + 'http://xml.org/sax/features/is-standalone' => undef, + 'http://xml.org/sax/features/lexical-handler' => undef, + 'http://xml.org/sax/features/parameter-entities' => undef, + 'http://xml.org/sax/features/namespaces' => 1, + 'http://xml.org/sax/features/namespace-prefixes' => 0, + 'http://xml.org/sax/features/string-interning' => undef, + 'http://xml.org/sax/features/use-attributes2' => undef, + 'http://xml.org/sax/features/use-locator2' => undef, + 'http://xml.org/sax/features/validation' => undef, + + 'http://xml.org/sax/properties/dom-node' => undef, + 'http://xml.org/sax/properties/xml-string' => undef, + ); +} +#-------------------------------------------------------------------# + +#-------------------------------------------------------------------# +# get_feature +#-------------------------------------------------------------------# +sub get_feature { + my $self = shift; + my $feat = shift; + + # check %FEATURES to see if it's there, and return it if so + # throw XML::SAX::Exception::NotRecognized if it's not there + # throw XML::SAX::Exception::NotSupported if it's there but we + # don't support it + + my %features = $self->get_features(); + if (exists $features{$feat}) { + my %supported = map { $_ => 1 } $self->supported_features(); + if ($supported{$feat}) { + return $self->{__PACKAGE__ . "::Features"}{$feat}; + } + throw XML::SAX::Exception::NotSupported( + Message => "The feature '$feat' is not supported by " . 
ref($self), + Exception => undef, + ); + } + throw XML::SAX::Exception::NotRecognized( + Message => "The feature '$feat' is not recognized by " . ref($self), + Exception => undef, + ); +} +#-------------------------------------------------------------------# + +#-------------------------------------------------------------------# +# set_feature +#-------------------------------------------------------------------# +sub set_feature { + my $self = shift; + my $feat = shift; + my $value = shift; + # check %FEATURES to see if it's there, and set it if so + # throw XML::SAX::Exception::NotRecognized if it's not there + # throw XML::SAX::Exception::NotSupported if it's there but we + # don't support it + + my %features = $self->get_features(); + if (exists $features{$feat}) { + my %supported = map { $_ => 1 } $self->supported_features(); + if ($supported{$feat}) { + return $self->{__PACKAGE__ . "::Features"}{$feat} = $value; + } + throw XML::SAX::Exception::NotSupported( + Message => "The feature '$feat' is not supported by " . ref($self), + Exception => undef, + ); + } + throw XML::SAX::Exception::NotRecognized( + Message => "The feature '$feat' is not recognized by " . ref($self), + Exception => undef, + ); +} +#-------------------------------------------------------------------# + +#-------------------------------------------------------------------# +# get_handler and friends +#-------------------------------------------------------------------# +sub get_handler { + my $self = shift; + my $handler_type = shift; + $handler_type ||= 'Handler'; + return defined( $self->{$handler_type} ) ? 
$self->{$handler_type} : undef; +} + +sub get_document_handler { + my $self = shift; + return $self->get_handler('DocumentHandler', @_); +} + +sub get_content_handler { + my $self = shift; + return $self->get_handler('ContentHandler', @_); +} + +sub get_dtd_handler { + my $self = shift; + return $self->get_handler('DTDHandler', @_); +} + +sub get_lexical_handler { + my $self = shift; + return $self->get_handler('LexicalHandler', @_); +} + +sub get_decl_handler { + my $self = shift; + return $self->get_handler('DeclHandler', @_); +} + +sub get_error_handler { + my $self = shift; + return $self->get_handler('ErrorHandler', @_); +} + +sub get_entity_resolver { + my $self = shift; + return $self->get_handler('EntityResolver', @_); +} +#-------------------------------------------------------------------# + +#-------------------------------------------------------------------# +# set_handler and friends +#-------------------------------------------------------------------# +sub set_handler { + my $self = shift; + my ($new_handler, $handler_type) = reverse @_; + $handler_type ||= 'Handler'; + $self->{Methods} = {} if $self->{Methods}; + $self->{$handler_type} = $new_handler; + return 1; +} + +sub set_document_handler { + my $self = shift; + return $self->set_handler('DocumentHandler', @_); +} + +sub set_content_handler { + my $self = shift; + return $self->set_handler('ContentHandler', @_); +} +sub set_dtd_handler { + my $self = shift; + return $self->set_handler('DTDHandler', @_); +} +sub set_lexical_handler { + my $self = shift; + return $self->set_handler('LexicalHandler', @_); +} +sub set_decl_handler { + my $self = shift; + return $self->set_handler('DeclHandler', @_); +} +sub set_error_handler { + my $self = shift; + return $self->set_handler('ErrorHandler', @_); +} +sub set_entity_resolver { + my $self = shift; + return $self->set_handler('EntityResolver', @_); +} + +#-------------------------------------------------------------------# + 
+#-------------------------------------------------------------------# +# supported_features +#-------------------------------------------------------------------# +sub supported_features { + my $self = shift; + # Only namespaces are required by all parsers + return ( + 'http://xml.org/sax/features/namespaces', + ); +} +#-------------------------------------------------------------------# + +sub no_op { + # this space intentionally blank +} + + +package XML::SAX::Base::NoHandler; + +# we need a fake handler that doesn't implement anything, this +# simplifies the code a lot (though given the recent changes, +# it may be better to do without) +sub new { + #warn "no handler called\n"; + return bless {}; +} + +1; + +__END__ + +=head1 NAME + +XML::SAX::Base - Base class for SAX Drivers and Filters + +=head1 SYNOPSIS + + package MyFilter; + use XML::SAX::Base; + @ISA = ('XML::SAX::Base'); + +=head1 DESCRIPTION + +This module has a very simple task - to be a base class for PerlSAX +drivers and filters. Its default behaviour is to pass the input directly +to the output unchanged. It can be useful to use this module as a base class +so you don't have to, for example, implement the characters() callback. + +The main advantages that it provides are easy dispatching of events the right +way (ie it takes care for you of checking that the handler has implemented +that method, or has defined an AUTOLOAD), and the guarantee that filters +will pass along events that they aren't implementing to handlers downstream +that might nevertheless be interested in them. + +=head1 WRITING SAX DRIVERS AND FILTERS + +Writing SAX Filters is tremendously easy: all you need to do is +inherit from this module, and define the events you want to handle. A +more detailed explanation can be found at +http://www.xml.com/pub/a/2001/10/10/sax-filters.html. + +Writing Drivers is equally simple. The one thing you need to pay +attention to is B<not> to call events yourself (this applies to Filters +as well).
For instance: + + package MyFilter; + use base qw(XML::SAX::Base); + + sub start_element { + my $self = shift; + my $data = shift; + # do something + $self->{Handler}->start_element($data); # BAD + } + +The above example works well as precisely that: an example. But it has +several faults: 1) it doesn't test to see whether the handler defines +start_element. Perhaps it doesn't want to see that event, in which +case you shouldn't throw it (otherwise it'll die). 2) it doesn't check +ContentHandler and then Handler (ie it doesn't look to see that the +user hasn't requested events on a specific handler, and if not on the +default one), 3) if it did check all that, not only would the code be +cumbersome (see this module's source to get an idea) but it would also +probably have to check for a DocumentHandler (in case this were SAX1) +and for AUTOLOADs potentially defined in all these packages. As you can +tell, that would be fairly painful. Instead of going through that, +simply remember to use code similar to the following instead: + + package MyFilter; + use base qw(XML::SAX::Base); + + sub start_element { + my $self = shift; + my $data = shift; + # do something to filter + $self->SUPER::start_element($data); # GOOD (and easy) ! + } + +This way, once you've done your job you hand the ball back to +XML::SAX::Base and it takes care of all those problems for you! + +Note that the above example doesn't apply to filters only, drivers +will benefit from the exact same feature. + +=head1 METHODS + +A number of methods are defined within this class for the purpose of +inheritance. Some probably don't need to be overridden (eg parse_file) +but some clearly should be (eg parse). Options for these methods are +described in the PerlSAX2 specification available from +http://cvs.sourceforge.net/cgi-bin/viewcvs.cgi/~checkout~/perl-xml/libxml-perl/doc/sax-2.0.html?rev=HEAD&content-type=text/html. 
+ +=over 4 + +=item * parse + +The parse method is the main entry point to parsing documents. Internally +the parse method will detect what type of "thing" you are parsing, and +call the appropriate method in your implementation class. Here is the +mapping table of what is in the Source options (see the Perl SAX 2.0 +specification for the meaning of these values): + + Source Contains parse() calls + =============== ============= + CharacterStream (*) _parse_characterstream($stream, $options) + ByteStream _parse_bytestream($stream, $options) + String _parse_string($string, $options) + SystemId _parse_systemid($string, $options) + +However note that these methods may not be sensible if your driver class +is not for parsing XML. An example might be a DBI driver that generates +XML/SAX from a database table. If that is the case, you likely want to +write your own parse() method. + +Also note that the Source may contain both a PublicId entry, and an +Encoding entry. To get at these, examine $options->{Source} as passed +to your method. + +(*) A CharacterStream is a filehandle that does not need any encoding +translation done on it. This is implemented as a regular filehandle +and only works under Perl 5.7.2 or higher using PerlIO. To get a single +character, or number of characters from it, use the perl core read() +function. To get a single byte from it (or number of bytes), you can +use sysread(). The encoding of the stream should be in the Encoding +entry for the Source. + +=item * parse_file, parse_uri, parse_string + +These are all convenience variations on parse(), and in fact simply +set up the options before calling it. You probably don't need to +override these. + +=item * get_options + +This is a convenience method to get options in SAX2 style, or more +generically either as hashes or as hashrefs (it returns a hashref). +You will probably want to use this method in your own implementations +of parse() and of new(). 
+ +=item * get_feature, set_feature + +These simply get and set features, and throw the +appropriate exceptions defined in the specification if need be. + +If your subclass defines features not defined in this one, +then you should override these methods in such a way that they check for +your features first, and then call the base class's methods +for features not defined by your class. An example would be: + + sub get_feature { + my $self = shift; + my $feat = shift; + if (exists $MY_FEATURES{$feat}) { + # handle the feature in various ways + } + else { + return $self->SUPER::get_feature($feat); + } + } + +Currently this part is unimplemented. + + +=item * set_handler + +This method takes a handler type (Handler, ContentHandler, etc.) and a +handler object as arguments, and changes the current handler for that +handler type, while taking care of resetting the internal state that +needs to be reset. This allows one to change a handler during parse +without running into problems (changing it on the parser object +directly will most likely cause trouble). + +=item * set_document_handler, set_content_handler, set_dtd_handler, set_lexical_handler, set_decl_handler, set_error_handler, set_entity_resolver + +These are just simple wrappers around the former method, and take a +handler object as their argument. Internally they simply call +set_handler with the correct arguments. + +=item * get_handler + +The inverse of set_handler, this method takes an optional string containing a handler type (DTDHandler, +ContentHandler, etc. 'Handler' is used if no type is passed). It returns a reference to the object that implements +that class, or undef if that handler type is not set for the current driver/filter. + +=item * get_document_handler, get_content_handler, get_dtd_handler, get_lexical_handler, get_decl_handler, +get_error_handler, get_entity_resolver + +These are just simple wrappers around the get_handler() method, and take no arguments.
Internally +they simply call get_handler with the correct handler type name. + +=back + +It would be rather useless to describe all the methods that this +module implements here. They are all the methods supported in SAX1 and +SAX2. In case your memory is a little short, here is a list. The +apparent duplicates are there so that both versions of SAX can be +supported. + +=over 4 + +=item * start_document + +=item * end_document + +=item * start_element + +=item * start_document + +=item * end_document + +=item * start_element + +=item * end_element + +=item * characters + +=item * processing_instruction + +=item * ignorable_whitespace + +=item * set_document_locator + +=item * start_prefix_mapping + +=item * end_prefix_mapping + +=item * skipped_entity + +=item * start_cdata + +=item * end_cdata + +=item * comment + +=item * entity_reference + +=item * notation_decl + +=item * unparsed_entity_decl + +=item * element_decl + +=item * attlist_decl + +=item * doctype_decl + +=item * xml_decl + +=item * entity_decl + +=item * attribute_decl + +=item * internal_entity_decl + +=item * external_entity_decl + +=item * resolve_entity + +=item * start_dtd + +=item * end_dtd + +=item * start_entity + +=item * end_entity + +=item * warning + +=item * error + +=item * fatal_error + +=back + +=head1 TODO + + - more tests + - conform to the "SAX Filters" and "Java and DOM compatibility" + sections of the SAX2 document. + +=head1 AUTHOR + +Kip Hampton (khampton@totalcinema.com) did most of the work, after porting +it from XML::Filter::Base. + +Robin Berjon (robin@knowscape.com) pitched in with patches to make it +usable as a base for drivers as well as filters, along with other patches. + +Matt Sergeant (matt@sergeant.org) wrote the original XML::Filter::Base, +and patched a few things here and there, and imported it into +the XML::SAX distribution. 
+ +=head1 SEE ALSO + +L<XML::SAX> + +=cut + diff --git a/qcd/part_cpu/bench/lib/XML/SAX/DocumentLocator.pm b/qcd/part_cpu/bench/lib/XML/SAX/DocumentLocator.pm new file mode 100644 index 0000000000000000000000000000000000000000..f364c7fb0e70ae14773fadcd973bdb9c2bdfc53f --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/DocumentLocator.pm @@ -0,0 +1,121 @@ +# $Id: DocumentLocator.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::DocumentLocator; +use strict; +# Constructor: ties a fresh hash to this class (the arguments are handed to TIEHASH) and returns a blessed reference to it. +sub new { + my $class = shift; + my %object; + tie %object, $class, @_; + + return bless \%object, $class; +} +# Tie constructor: stores the four closures, in order PublicId, SystemId, LineNumber, ColumnNumber. +sub TIEHASH { + my $class = shift; + my ($pubmeth, $sysmeth, $linemeth, $colmeth) = @_; + return bless { + pubmeth => $pubmeth, + sysmeth => $sysmeth, + linemeth => $linemeth, + colmeth => $colmeth, + }, $class; +} +# Tied read: calls the closure stored for $key (passing it the key) and returns its result; unknown keys or unset closures give undef. +sub FETCH { + my ($self, $key) = @_; + my $method; + if ($key eq 'PublicId') { + $method = $self->{pubmeth}; + } + elsif ($key eq 'SystemId') { + $method = $self->{sysmeth}; + } + elsif ($key eq 'LineNumber') { + $method = $self->{linemeth}; + } + elsif ($key eq 'ColumnNumber') { + $method = $self->{colmeth}; + } + if ($method) { + my $value = $method->($key); + return $value; + } + return undef; +} +# Returns 1 only for the four locator keys, 0 for anything else. +sub EXISTS { + my ($self, $key) = @_; + if ($key =~ /^(PublicId|SystemId|LineNumber|ColumnNumber)$/) { + return 1; + } + return 0; +} +# No-op: assignments through the tied hash are silently ignored. +sub STORE { + my ($self, $key, $value) = @_; +} +# No-op: keys cannot be deleted. +sub DELETE { + my ($self, $key) = @_; +} +# No-op: the hash cannot be cleared. +sub CLEAR { + my ($self) = @_; +} +# Starts key iteration: rebuilds the fixed key set and returns the first each() result. +sub FIRSTKEY { + my ($self) = @_; + # assignment resets.
+ $self->{keys} = { + PublicId => 1, + SystemId => 1, + LineNumber => 1, + ColumnNumber => 1, + }; + return each %{$self->{keys}}; +} +# Continues the iteration begun by FIRSTKEY; $lastkey is not used. +sub NEXTKEY { + my ($self, $lastkey) = @_; + return each %{$self->{keys}}; +} + +1; +__END__ + +=head1 NAME + +XML::SAX::DocumentLocator - Helper class for document locators + +=head1 SYNOPSIS + + my $locator = XML::SAX::DocumentLocator->new( + sub { $object->get_public_id }, + sub { $object->get_system_id }, + sub { $reader->current_line }, + sub { $reader->current_column }, + ); + +=head1 DESCRIPTION + +This module gives you a tied hash reference that calls the +specified closures when asked for PublicId, SystemId, +LineNumber and ColumnNumber. + +It is useful for writing SAX Parsers so that you don't have +to constantly update the line numbers in a hash reference on +the object you pass to set_document_locator(). See the source +code for XML::SAX::PurePerl for a usage example. + +=head1 API + +There is only 1 method: C<new>. Simply pass it a list of +closures that when called will return the PublicId, the +SystemId, the LineNumber and the ColumnNumber, respectively. + +The closures are passed a single parameter, the key being +requested. But you're free to ignore that.
+ +=cut + diff --git a/qcd/part_cpu/bench/lib/XML/SAX/Exception.pm b/qcd/part_cpu/bench/lib/XML/SAX/Exception.pm new file mode 100644 index 0000000000000000000000000000000000000000..381f3a1db06722f45ac251b2a4ed9b3f6959412a --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/Exception.pm @@ -0,0 +1,126 @@ +package XML::SAX::Exception; + +use strict; + +use overload '""' => "stringify", + 'fallback' => 1; + +use vars qw/$StackTrace $VERSION/; +$VERSION = '1.01'; +use Carp; + +$StackTrace = $ENV{XML_DEBUG} || 0; + +# Other exception classes: + +@XML::SAX::Exception::NotRecognized::ISA = ('XML::SAX::Exception'); +@XML::SAX::Exception::NotSupported::ISA = ('XML::SAX::Exception'); +@XML::SAX::Exception::Parse::ISA = ('XML::SAX::Exception'); + +# Dies with an exception: rethrows the invocant when called on an object, otherwise builds a new one from the arguments. +sub throw { + my $class = shift; + if (ref($class)) { + die $class; + } + die $class->new(@_); +} +# Builds the exception from key/value options; confesses unless Message is present, and records a stack trace when $StackTrace is true. +sub new { + my $class = shift; + my %opts = @_; + confess "Invalid options: " . join(', ', keys %opts) unless exists $opts{Message}; + + bless { ($StackTrace ? (StackTrace => stacktrace()) : ()), %opts }, + $class; +} +# String overload: Message, plus "[Ln: .., Col: ..]" when LineNumber exists and the stack listing when $StackTrace is set; always newline-terminated. +sub stringify { + my $self = shift; + local $^W; + my $error; + if (exists $self->{LineNumber}) { + $error = $self->{Message} . " [Ln: " . $self->{LineNumber} . + ", Col: " . $self->{ColumnNumber} . "]"; + } + else { + $error = $self->{Message}; + } + if ($StackTrace) { + $error .= stackstring($self->{StackTrace}); + } + $error .= "\n"; + return $error; +} +# Walks caller() from depth 2 upward and returns an array ref of { Package, Filename, Line } hashes. +sub stacktrace { + my $i = 2; + my @fulltrace; + while (my @trace = caller($i++)) { + my %hash; + @hash{qw(Package Filename Line)} = @trace[0..2]; + push @fulltrace, \%hash; + } + return \@fulltrace; +} +# Formats a stacktrace() array ref as one "Filename Line: N" row per frame under a "From:" heading. +sub stackstring { + my $stacktrace = shift; + my $string = "\nFrom:\n"; + foreach my $current (@$stacktrace) { + $string .= $current->{Filename} . " Line: " . $current->{Line} .
"\n"; + } + return $string; +} + +1; + +__END__ + +=head1 NAME + +XML::SAX::Exception - Exception classes for XML::SAX + +=head1 SYNOPSIS + + throw XML::SAX::Exception::NotSupported( + Message => "The foo feature is not supported", + ); + +=head1 DESCRIPTION + +This module is the base class for all SAX Exceptions, those defined in +the spec as well as those that one may create for one's own SAX errors. + +There are three subclasses included, corresponding to those of the SAX +spec: + + XML::SAX::Exception::NotSupported + XML::SAX::Exception::NotRecognized + XML::SAX::Exception::Parse + +Use them wherever you want, and as much as possible when you encounter +such errors. SAX is meant to use exceptions as much as possible to +flag problems. + +=head1 CREATING NEW EXCEPTION CLASSES + +All you need to do to create a new exception class is: + + @XML::SAX::Exception::MyException::ISA = ('XML::SAX::Exception') + +The given package doesn't need to exist, it'll behave correctly this +way. If your exception refines an existing exception class, then you +may also inherit from that instead of from the base class. + +=head1 THROWING EXCEPTIONS + +This is as simple as exemplified in the SYNOPSIS. In fact, there's +nothing more to know. All you have to do is: + + throw XML::SAX::Exception::MyException( Message => 'Something went wrong' ); + +and voila, you've thrown an exception which can be caught in an eval block. + +=cut + diff --git a/qcd/part_cpu/bench/lib/XML/SAX/Intro.pod b/qcd/part_cpu/bench/lib/XML/SAX/Intro.pod new file mode 100644 index 0000000000000000000000000000000000000000..04ef81432af8bca97a6ae144797b646be1c16bc2 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/Intro.pod @@ -0,0 +1,407 @@ +=head1 NAME + +XML::SAX::Intro - An Introduction to SAX Parsing with Perl + +=head1 Introduction + +XML::SAX is a new way to work with XML Parsers in Perl.
In this article +we'll discuss why you should be using SAX, why you should be using +XML::SAX, and we'll see some of the finer implementation details. The +text below assumes some familiarity with callback, or push based +parsing, but if you are unfamiliar with these techniques then a good +place to start is Kip Hampton's excellent series of articles on XML.com. + +=head1 Replacing XML::Parser + +The de-facto way of parsing XML under perl is to use Larry Wall and +Clark Cooper's XML::Parser. This module is a Perl and XS wrapper around +the expat XML parser library by James Clark. It has been a hugely +successful project, but suffers from a couple of rather major flaws. +Firstly it is a proprietary API, designed before the SAX API was +conceived, which means that it is not easily replaceable by other +streaming parsers. Secondly its callbacks are subrefs. This doesn't +sound like much of an issue, but unfortunately leads to code like: + + sub handle_start { + my ($e, $el, %attrs) = @_; + if ($el eq 'foo') { + $e->{inside_foo}++; # BAD! $e is an XML::Parser::Expat object. + } + } + +As you can see, we're using the $e object to hold our state +information, which is a bad idea because we don't own that object - we +didn't create it. It's an internal object of XML::Parser, that happens +to be a hashref. We could all too easily overwrite XML::Parser internal +state variables by using this, or Clark could change it to an array ref +(not that he would, because it would break so much code, but he could). + +The only way currently with XML::Parser to safely maintain state is to +use a closure: + + my $state = MyState->new(); + $parser->setHandlers(Start => sub { handle_start($state, @_) }); + +This closure traps the $state variable, which now gets passed as the +first parameter to your callback. Unfortunately very few people use +this technique, as it is not documented in the XML::Parser POD files.
+ +Another reason you might not want to use XML::Parser is because you +need some feature that it doesn't provide (such as validation), or you +might need to use a library that doesn't use expat, due to it not being +installed on your system, or due to having a restrictive ISP. Using SAX +allows you to work around these restrictions. + +=head1 Introducing SAX + +SAX stands for the Simple API for XML. And simple it really is. +Constructing a SAX parser and passing events to handlers is done as +simply as: + + use XML::SAX; + use MySAXHandler; + + my $parser = XML::SAX::ParserFactory->parser( + Handler => MySAXHandler->new + ); + + $parser->parse_uri("foo.xml"); + +The important concept to grasp here is that SAX uses a factory class +called XML::SAX::ParserFactory to create a new parser instance. The +reason for this is so that you can support other underlying +parser implementations for different feature sets. This is one thing +that XML::Parser has always sorely lacked. + +In the code above we see the parse_uri method used, but we could +have equally well +called parse_file, parse_string, or parse(). Please see XML::SAX::Base +for what these methods take as parameters, but don't be fooled into +believing parse_file takes a filename. No, it takes a file handle, a +glob, or a subclass of IO::Handle. Beware. + +SAX works very similarly to XML::Parser's default callback method, +except it has one major difference: rather than setting individual +callbacks, you create a new class in which to receive the callbacks. +Each callback is called as a method call on an instance of that handler +class.
An example will best demonstrate this: + + package MySAXHandler; + use base qw(XML::SAX::Base); + + sub start_document { + my ($self, $doc) = @_; + # process document start event + } + + sub start_element { + my ($self, $el) = @_; + # process element start event + } + +Now, when we instantiate this as above, and parse some XML with this as +the handler, the methods start_document and start_element will be +called as method calls, so this would be the equivalent of directly +calling: + + $object->start_element($el); + +Notice how this is different to XML::Parser's calling style, which +calls: + + start_element($e, $name, %attribs); + +It's the difference between function calling and method calling which +allows you to subclass SAX handlers which contributes to SAX being a +powerful solution. + +As you can see, unlike XML::Parser, we have to define a new package in +which to do our processing (there are hacks you can do to make this +unnecessary, but I'll leave figuring those out to the experts). The +biggest benefit of this is that you maintain your own state variable +($self in the above example) thus freeing you of the concerns listed +above. It is also an improvement in maintainability - you can place the +code in a separate file if you wish to, and your callback methods are +always called the same thing, rather than having to choose a suitable +name for them as you had to with XML::Parser. This is an obvious win. + +SAX parsers are also very flexible in how you pass a handler to them. +You can use a constructor parameter as we saw above, or we can pass the +handler directly in the call to one of the parse methods: + + $parser->parse(Handler => $handler, + Source => { SystemId => "foo.xml" }); + # or...
+ $parser->parse_file($fh, Handler => $handler); + +This flexibility allows for one parser to be used in many different +scenarios throughout your script (though one shouldn't feel pressure to +use this method, as parser construction is generally not a time +consuming process). + +=head1 Callback Parameters + +The only other thing you need to know to understand basic SAX is the +structure of the parameters passed to each of the callbacks. In +XML::Parser, all parameters are passed as multiple options to the +callbacks, so for example the Start callback would be called as +my_start($e, $name, %attributes), and the PI callback would be called +as my_processing_instruction($e, $target, $data). In SAX, every +callback is passed a hash reference, containing entries that define our +"node". The key callbacks and the structures they receive are: + +=head2 start_element + +The start_element handler is called whenever a parser sees an opening +tag. It is passed an element structure consisting of: + +=over 4 + +=item LocalName + +The name of the element minus any namespace prefix it may +have come with in the document. + +=item NamespaceURI + +The URI of the namespace associated with this element, +or the empty string for none. + +=item Attributes + +A set of attributes as described below. + +=item Name + +The name of the element as it was seen in the document (i.e. +including any prefix associated with it) + +=item Prefix + +The prefix used to qualify this element's namespace, or the +empty string if none. + +=back + +The B<Attributes> are a hash reference, keyed by what we have called +"James Clark" notation. This means that the attribute name has been +expanded to include any associated namespace URI, and put together as +{ns}name, where "ns" is the expanded namespace URI of the attribute if +and only if the attribute had a prefix, and "name" is the LocalName of +the attribute.
+ +The value of each entry in the attributes hash is another hash +structure consisting of: + +=over 4 + +=item LocalName + +The name of the attribute minus any namespace prefix it may have +come with in the document. + +=item NamespaceURI + +The URI of the namespace associated with this attribute. If the +attribute had no prefix, then this consists of just the empty string. + +=item Name + +The attribute's name as it appeared in the document, including any +namespace prefix. + +=item Prefix + +The prefix used to qualify this attribute's namespace, or the +empty string if none. + +=item Value + +The value of the attribute. + +=back + +So a full example, as output by Data::Dumper might be: + + .... + +=head2 end_element + +The end_element handler is called either when a parser sees a closing +tag, or after start_element has been called for an empty element (do +note however that a parser may if it is so inclined call characters +with an empty string when it sees an empty element. There is no simple +way in SAX to determine if the parser in fact saw an empty element or a +start and end element with no content). + +The end_element handler receives exactly the same structure as +start_element, minus the Attributes entry. One must note though that it +should not be a reference to the same data as start_element receives, +so you may change the values in start_element but this will not affect +the values later seen by end_element. + +=head2 characters + +The characters callback may be called in several circumstances. The +most obvious one is when seeing ordinary character data in the markup. +But it is also called for text in a CDATA section, and is also called +in other situations. A SAX parser has to make no guarantees whatsoever +about how many times it may call characters for a stretch of text in an +XML document - it may call once, or it may call once for every +character in the text.
In order to work around this it is often +important for the SAX developer to use a bundling technique, where text +is gathered up and processed in one of the other callbacks. This is not +always necessary, but it is a worthwhile technique to learn, which we +will cover in XML::SAX::Advanced (when I get around to writing it). + +The characters handler is called with a very simple structure - a hash +reference consisting of just one entry: + +=over 4 + +=item Data + +The text data that was received. + +=back + +=head2 comment + +The comment callback is called for comment text. Unlike with +C<characters>, the comment callback *must* be invoked just once for an +entire comment string. It receives a single simple structure - a hash +reference containing just one entry: + +=over 4 + +=item Data + +The text of the comment. + +=back + +=head2 processing_instruction + +The processing instruction handler is called for all processing +instructions in the document. Note that these processing instructions +may appear before the document root element, or after it, or anywhere +where text and elements would normally appear within the document, +according to the XML specification. + +The handler is passed a structure containing just two entries: + +=over 4 + +=item Target + +The target of the processing instruction + +=item Data + +The text data in the processing instruction. Can be an empty +string for a processing instruction that has no data element. +For example E<lt>?wiggle?E<gt> is a perfectly valid processing instruction. + +=back + +=head1 Tip of the iceberg + +What we have discussed above is really the tip of the SAX iceberg. And +so far it looks like there's not much of interest to SAX beyond what we +have seen with XML::Parser. But it does go much further than that, I +promise.
+ +People who hate Object Oriented code for the sake of it may be thinking +here that creating a new package just to parse something is a waste +when they've been parsing things just fine up to now using procedural +code. But there's reason to all this madness. And that reason is SAX +Filters. + +As you saw right at the very start, to let the parser know about our +class, we pass it an instance of our class as the Handler to the +parser. But now imagine what would happen if our class could also take +a Handler option, and simply do some processing and pass on our data +further down the line? That in a nutshell is how SAX filters work. It's +Unix pipes for the 21st century! + +There are two downsides to this. Number 1 - writing SAX filters can be +tricky. If you look into the future and read the advanced tutorial I'm +writing, you'll see that Handler can come in several shapes and sizes. +So making sure your filter does the right thing can be tricky. +Secondly, constructing complex filter chains can be difficult, and +simple thinking tells us that we only get one pass at our document, +when often we'll need more than that. + +Luckily though, those downsides have been fixed by the release of two +very cool modules. What's even better is that I didn't write either of +them! + +The first module is XML::SAX::Base. This is a VITAL SAX module that +acts as a base class for all SAX parsers and filters. It provides an +abstraction away from calling the handler methods, that makes sure your +filter or parser does the right thing, and it does it FAST. So, if you +ever need to write a SAX filter, which if you're processing XML -> XML, +or XML -> HTML, then you probably do, then you need to be writing it as +a subclass of XML::SAX::Base. Really - this is advice not to ignore +lightly. I will not go into the details of writing a SAX filter here. +Kip Hampton, the author of XML::SAX::Base has covered this nicely in +his article on XML.com.
+ +To construct SAX pipelines, Barrie Slaymaker, a long time Perl hacker +whose modules you will probably have heard of or used, wrote a very +clever module called XML::SAX::Machines. This combines some really +clever SAX filter-type modules, with a construction toolkit for filters +that makes building pipelines easy. But before we see how it makes +things easy, first let's see how tricky it looks to build complex SAX +filter pipelines. + + use XML::SAX::ParserFactory; + use XML::SAX::Filter1; + use XML::SAX::Filter2; + use XML::SAX::Writer; + + my $output_string; + my $writer = XML::SAX::Writer->new(Output => \$output_string); + my $filter2 = XML::SAX::Filter2->new(Handler => $writer); + my $filter1 = XML::SAX::Filter1->new(Handler => $filter2); + my $parser = XML::SAX::ParserFactory->parser(Handler => $filter1); + + $parser->parse_uri("foo.xml"); + +This is a lot easier with XML::SAX::Machines: + + use XML::SAX::Machines qw(Pipeline); + + my $output_string; + my $parser = Pipeline( + XML::SAX::Filter1 => XML::SAX::Filter2 => \$output_string + ); + + $parser->parse_uri("foo.xml"); + +One of the main benefits of XML::SAX::Machines is that the pipelines +are constructed in natural order, rather than the reverse order we saw +with manual pipeline construction. XML::SAX::Machines takes care of all +the internals of pipe construction, providing you at the end with just +a parser you can use (and you can re-use the same parser as many times +as you need to). + +Just a final tip. If you ever get stuck and are confused about what is +being passed from one SAX filter or parser to the next, then +Devel::TraceSAX will come to your rescue. This perl debugger plugin +will allow you to dump the SAX stream of events as it goes by. Usage is +really very simple: just call your perl script that uses SAX as follows: + + $ perl -d:TraceSAX <scriptname> + +And preferably pipe the output to a pager of some sort, such as more or +less.
The output is extremely verbose, but should help clear some +issues up. + +=head1 AUTHOR + +Matt Sergeant, matt@sergeant.org + +$Id: Intro.pod,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +=cut diff --git a/qcd/part_cpu/bench/lib/XML/SAX/ParserDetails.ini b/qcd/part_cpu/bench/lib/XML/SAX/ParserDetails.ini new file mode 100644 index 0000000000000000000000000000000000000000..c0c954ef88a8fd2d59cbe0d024debc4a92136350 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/ParserDetails.ini @@ -0,0 +1,4 @@ +[XML::SAX::PurePerl] +http://xml.org/sax/features/namespaces = 1 + + diff --git a/qcd/part_cpu/bench/lib/XML/SAX/ParserFactory.pm b/qcd/part_cpu/bench/lib/XML/SAX/ParserFactory.pm new file mode 100644 index 0000000000000000000000000000000000000000..ff83c924d33903ee12f0123a1f3ebcbc808dc1c3 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/ParserFactory.pm @@ -0,0 +1,232 @@ +# $Id: ParserFactory.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::ParserFactory; + +use strict; +use vars qw($VERSION); + +$VERSION = '1.01'; + +use Symbol qw(gensym); +use XML::SAX; +use XML::SAX::Exception; +# Constructor: blesses the key/value parameters and stores the list from XML::SAX->parsers() under KnownParsers. +sub new { + my $class = shift; + my %params = @_; # TODO : Fix this in spec.
+ my $self = bless \%params, $class; + $self->{KnownParsers} = XML::SAX->parsers(); + return $self; +} +# Class or instance method: selects a parser class, loads it if its package is still empty (dying if that fails) and returns $parser_class->new(@_). +sub parser { + my $self = shift; + my @parser_params = @_; + if (!ref($self)) { + $self = $self->new(); + } + + my $parser_class = $self->_parser_class(); + + my $version = ''; + if ($parser_class =~ s/\s*\(([\d\.]+)\)\s*$//) { + $version = " $1"; + } + + { + no strict 'refs'; + if (!keys %{"${parser_class}::"}) { + eval "use $parser_class $version; 1;" or die $@; # rethrow load/version failures instead of discarding them + } + } + + return $parser_class->new(@parser_params); +} +# Records $feature as required for parser selection; returns $self so calls can be chained. +sub require_feature { + my $self = shift; + my ($feature) = @_; + $self->{RequiredFeatures}{$feature}++; + return $self; +} +# Returns the parser class name; tried in order: $XML::SAX::ParserPackage, RequiredFeatures, the first SAX.ini in @INC, the last known parser, XML::SAX::PurePerl. +sub _parser_class { + my $self = shift; + + # First try ParserPackage + if ($XML::SAX::ParserPackage) { + return $XML::SAX::ParserPackage; + } + + # Now check if required/preferred is there + if ($self->{RequiredFeatures}) { + my %required = %{$self->{RequiredFeatures}}; + # note - we never go onto the next try (ParserDetails.ini), + # because if we can't provide the requested feature + # we need to throw an exception. + PARSER: + foreach my $parser (reverse @{$self->{KnownParsers}}) { + foreach my $feature (keys %required) { + if (!exists $parser->{Features}{$feature}) { + next PARSER; + } + } + # got here - all features must exist! + return $parser->{Name}; + } + # TODO : should this be NotSupported() ? + XML::SAX::Exception->throw( + Message => "Unable to provide required features", + ); + } + + # Next try SAX.ini + for my $dir (@INC) { + my $fh = gensym(); + if (open($fh, '<', "$dir/SAX.ini")) { + my $param_list = XML::SAX->_parse_ini_file($fh); + my $params = $param_list->[0]->{Features}; + if ($params->{ParserPackage}) { + return $params->{ParserPackage}; + } + else { + # we have required features (or nothing?)
+ PARSER: + foreach my $parser (reverse @{$self->{KnownParsers}}) { + foreach my $feature (keys %$params) { + if (!exists $parser->{Features}{$feature}) { + next PARSER; + } + } + return $parser->{Name}; + } + XML::SAX->do_warn("Unable to provide SAX.ini required features. Using fallback\n"); + } + last; # stop after first INI found + } + } + + if (@{$self->{KnownParsers}}) { + return $self->{KnownParsers}[-1]{Name}; + } + else { + return "XML::SAX::PurePerl"; # backup plan! + } +} + +1; +__END__ + +=head1 NAME + +XML::SAX::ParserFactory - Obtain a SAX parser + +=head1 SYNOPSIS + + use XML::SAX::ParserFactory; + use XML::SAX::XYZHandler; + my $handler = XML::SAX::XYZHandler->new(); + my $p = XML::SAX::ParserFactory->parser(Handler => $handler); + $p->parse_uri("foo.xml"); + # or $p->parse_string("<foo/>") or $p->parse_file($fh); + +=head1 DESCRIPTION + +XML::SAX::ParserFactory is a factory class for providing an application +with a Perl SAX2 XML parser. It is akin to DBI - a front end for other +parser classes. Each new SAX2 parser installed will register itself +with XML::SAX, and then it will become available to all applications +that use XML::SAX::ParserFactory to obtain a SAX parser. + +Unlike DBI however, XML/SAX parsers almost all work alike (especially +if they subclass XML::SAX::Base, as they should), so rather than +specifying the parser you want in the call to C<parser()>, XML::SAX +has several ways to automatically choose which parser to use: + +=over 4 + +=item * $XML::SAX::ParserPackage + +If this package variable is set, then this package is C<require()>d +and an instance of this package is returned by calling the C<new()> +class method in that package. If it cannot be loaded or there is +an error, an exception will be thrown. The variable can also contain +a version number: + + $XML::SAX::ParserPackage = "XML::SAX::Expat (0.72)"; + +And the number will be treated as a minimum version number. + +=item * Required features + +It is possible to require features from the parsers.
For example, you +may wish for a parser that supports validation via a DTD. To do that, +use the following code: + + use XML::SAX::ParserFactory; + my $factory = XML::SAX::ParserFactory->new(); + $factory->require_feature('http://xml.org/sax/features/validation'); + my $parser = $factory->parser(...); + +Alternatively, specify the required features in the call to the +ParserFactory constructor: + + my $factory = XML::SAX::ParserFactory->new( + RequiredFeatures => { + 'http://xml.org/sax/features/validation' => 1, + } + ); + +If the features you have asked for are unavailable (for example the +user might not have a validating parser installed), then an +exception will be thrown. + +The list of known parsers is searched in reverse order, so it will +always return the last installed parser that supports all of your +requested features (Note: this is subject to change if someone +comes up with a better way of making this work). + +=item * SAX.ini + +ParserFactory will search @INC for a file called SAX.ini, which +is in a simple format: + + # a comment looks like this, + ; or like this, and are stripped anywhere in the file + key = value # SAX.ini contains key/value pairs. + +All whitespace is non-significant. + +This file can contain either a line: + + ParserPackage = MyParserModule (1.02) + +Where MyParserModule is the module to load and use for the parser, +and the number in brackets is a minimum version to load. + +Or you can list required features: + + http://xml.org/sax/features/validation = 1 + +And each feature with a true value will be required. + +=item * Fallback + +If none of the above works, the last parser installed on the user's +system will be used. The XML::SAX package ships with a pure perl +XML parser, XML::SAX::PurePerl, so that there will always be a +fallback parser. + +=back + +=head1 AUTHOR + +Matt Sergeant, matt@sergeant.org + +=head1 LICENSE + +This is free software, you may use it and distribute it under the same +terms as Perl itself.
+ +=cut + diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl.pm new file mode 100644 index 0000000000000000000000000000000000000000..6502fd8cacd5dd6531d5389b1831b55f6515dd6b --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl.pm @@ -0,0 +1,748 @@ +# $Id: PurePerl.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl; + +use strict; +use vars qw/$VERSION/; + +$VERSION = '0.90'; + +use XML::SAX::PurePerl::Productions qw($Any $CharMinusDash $SingleChar); +use XML::SAX::PurePerl::Reader; +use XML::SAX::PurePerl::EncodingDetect (); +use XML::SAX::Exception; +use XML::SAX::PurePerl::DocType (); +use XML::SAX::PurePerl::DTDDecls (); +use XML::SAX::PurePerl::XMLDecl (); +use XML::SAX::DocumentLocator (); +use XML::SAX::Base (); +use XML::SAX qw(Namespaces); +use XML::NamespaceSupport (); +use IO::File; + +if ($] < 5.006) { + require XML::SAX::PurePerl::NoUnicodeExt; +} +else { + require XML::SAX::PurePerl::UnicodeExt; +} + +use vars qw(@ISA); +@ISA = ('XML::SAX::Base'); + +my %int_ents = ( + amp => '&', + lt => '<', + gt => '>', + quot => '"', + apos => "'", + ); + +my $xmlns_ns = "http://www.w3.org/2000/xmlns/"; +my $xml_ns = "http://www.w3.org/XML/1998/namespace"; + +use Carp; +sub _parse_characterstream { + my $self = shift; + my ($fh) = @_; + confess("CharacterStream is not yet correctly implemented"); # always dies; the two statements below are never reached + my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh); + return $self->_parse($reader); +} +# Parses from a byte-stream file handle through Reader::Stream. +sub _parse_bytestream { + my $self = shift; + my ($fh) = @_; + my $reader = XML::SAX::PurePerl::Reader::Stream->new($fh); + return $self->_parse($reader); +} +# Parses an in-memory string through Reader::String. +sub _parse_string { + my $self = shift; + my ($str) = @_; + my $reader = XML::SAX::PurePerl::Reader::String->new($str); + return $self->_parse($reader); +} +# Parses the resource named by a system identifier (URI) through Reader::URI. +sub _parse_systemid { + my $self = shift; + my ($uri) = @_; + my $reader = XML::SAX::PurePerl::Reader::URI->new($uri); + return $self->_parse($reader); +} +# Shared driver: sets up reader ids, namespace helper and document locator, fires start_document, settles the encoding, parses, and returns end_document's result. +sub _parse { + my
($self, $reader) = @_; + + $reader->public_id($self->{ParseOptions}{Source}{PublicId}); + $reader->system_id($self->{ParseOptions}{Source}{SystemId}); + $reader->next; + + $self->{NSHelper} = XML::NamespaceSupport->new({xmlns => 1}); + + $self->set_document_locator( + XML::SAX::DocumentLocator->new( + sub { $reader->public_id }, + sub { $reader->system_id }, + sub { $reader->line }, + sub { $reader->column }, + ), + ); + + $self->start_document({}); + + if (defined $self->{ParseOptions}{Source}{Encoding}) { + $reader->set_encoding($self->{ParseOptions}{Source}{Encoding}); + } + else { + $self->encoding_detect($reader); + } + + # parse a document + $self->document($reader); + + return $self->end_document({}); +} +# Builds an XML::SAX::Exception::Parse carrying the reader's position and ids, passes it to fatal_error, then throws it. +sub parser_error { + my $self = shift; + my ($error, $reader) = @_; + +# warn("parser error: $error from ", $reader->line, " : ", $reader->column, "\n"); + my $exception = XML::SAX::Exception::Parse->new( + Message => $error, + ColumnNumber => $reader->column, + LineNumber => $reader->line, + PublicId => $reader->public_id, + SystemId => $reader->system_id, + ); + + $self->fatal_error($exception); + $exception->throw; +} +# Parses a whole document; a missing root element or trailing non-Misc input is a parser error. +sub document { + my ($self, $reader) = @_; + + # document ::= prolog element Misc* + + $self->prolog($reader); + $self->element($reader) || + $self->parser_error("Document requires an element", $reader); + + while(!$reader->eof) { + $self->Misc($reader) || + $self->parser_error("Only Comments, PIs and whitespace allowed at end of document", $reader); + } +} +# Parses the prolog: XML declaration, Misc items, then an optional doctype declaration followed by more Misc items. +sub prolog { + my ($self, $reader) = @_; + + $self->XMLDecl($reader); + + # consume all misc bits + 1 while($self->Misc($reader)); + + if ($self->doctypedecl($reader)) { + while (!$reader->eof) { + $self->Misc($reader) || last; + } + } +} +# Parses one element (tag, attributes, content, end tag) and fires the prefix-mapping and element events; returns 1 if an element was read, 0 if the input does not start with '<'. +sub element { + my ($self, $reader) = @_; + + if ($reader->match_char('<')) { + my $name = $self->Name($reader) || + $self->parser_error("Invalid element name", $reader); + + my %attribs; + + while( my ($k, $v) = $self->Attribute($reader) ) {
+ $attribs{$k} = $v; + } + + $self->skip_whitespace($reader); + + my $content; + unless ($reader->match_sequence('/', '>')) { + $reader->match_char('>') || + $self->parser_error("No close element tag", $reader); + + # only push onto _el_stack if not an empty element + push @{$self->{_el_stack}}, $name; + $content++; + } + + # Namespace processing + $self->{NSHelper}->push_context; + my @new_ns; +# my %attrs = @attribs; +# while (my ($k,$v) = each %attrs) { + if ($self->get_feature(Namespaces)) { + while ( my ($k, $v) = each %attribs ) { + if ($k =~ m/^xmlns(:(.*))?$/) { + my $prefix = $2 || ''; + $self->{NSHelper}->declare_prefix($prefix, $v); + my $ns = + { + Prefix => $prefix, + NamespaceURI => $v, + }; + push @new_ns, $ns; + $self->SUPER::start_prefix_mapping($ns); + } + } + } + + # Create element object and fire event + my %attrib_hash; + while (my ($name, $value) = each %attribs ) { + # TODO normalise value here + my ($ns, $prefix, $lname); + if ($self->get_feature(Namespaces)) { + ($ns, $prefix, $lname) = $self->{NSHelper}->process_attribute_name($name); + } + $ns ||= ''; $prefix ||= ''; $lname ||= ''; + $attrib_hash{"{$ns}$lname"} = { + Name => $name, + LocalName => $lname, + Prefix => $prefix, + NamespaceURI => $ns, + Value => $value, + }; + } + + %attribs = (); # lose the memory since we recurse deep + + my ($ns, $prefix, $lname); + if ($self->get_feature(Namespaces)) { + ($ns, $prefix, $lname) = $self->{NSHelper}->process_element_name($name); + } + $ns ||= ''; $prefix ||= ''; $lname ||= ''; + + my $el = + { + Name => $name, + LocalName => $lname, + Prefix => $prefix, + NamespaceURI => $ns, + Attributes => \%attrib_hash, + }; + $self->start_element($el); + + # warn("($name\n"); + + if ($content) { + $self->content($reader); + + $reader->match_sequence('<', '/') || $self->parser_error("No close tag marker", $reader); + my $end_name = $self->Name($reader); + $end_name eq $name || $self->parser_error("End tag mismatch ($end_name != $name)", $reader); +
$self->skip_whitespace($reader); + $reader->match_char('>') || $self->parser_error("No close '>' on end tag", $reader); + } + + my %end_el = %$el; + delete $end_el{Attributes}; + $self->end_element(\%end_el); + + for my $ns (@new_ns) { + $self->end_prefix_mapping($ns); + } + $self->{NSHelper}->pop_context; + + return 1; + } + + return 0; +} +# Parses element content (character data, references, CDATA sections, PIs, comments, child elements) up to the enclosing end tag; always returns 1. +sub content { + my ($self, $reader) = @_; + + $self->CharData($reader); + + while (1) { + if ($reader->match_sequence('<', '/')) { + $reader->buffer('</'); # presumably hands the '</' back to the reader, since element() matches it again after content() returns + last; + } + elsif ( $self->Reference($reader) || + $self->CDSect($reader) || + $self->PI($reader) || + $self->Comment($reader) || + $self->element($reader) + ) + { + $self->CharData($reader); + next; + } + else { + last; + } + } + + return 1; +} +# Parses a CDATA section, firing start_cdata, one characters event and end_cdata; returns 1 if a section was present, else 0. +sub CDSect { + my ($self, $reader) = @_; + + if ($reader->match_sequence('<', '!', '[', 'C', 'D', 'A', 'T', 'A', '[')) { + $self->start_cdata({}); + my $chars = ''; + while (1) { + if ($reader->eof) { + $self->parser_error("EOF looking for CDATA section end", $reader); + } + $reader->consume_not(']'); + $chars .= $reader->consumed; + if ($reader->match_char(']')) { + if ($reader->match_sequence(']', '>')) { + # end of CDATA section + + $self->characters({Data => $chars}); + last; + } + $chars .= ']'; + } + } + $self->end_cdata({}); + return 1; + } + + return 0; +} +# Gathers character data up to the next '<' or '&' and fires characters() if any was found; a literal ']]>' is a parser error. +sub CharData { + my ($self, $reader) = @_; + + my $chars = ''; + while (1) { + $reader->consume_not('<', '&', ']'); + $chars .= $reader->consumed; + if ($reader->match_char(']')) { + if ($reader->match_sequence(']', '>')) { + $self->parser_error("String ']]>' not allowed in character data", $reader); + } + else { + $chars .= ']'; + } + next; + } + last; + } + + $self->characters({ Data => $chars }) if length($chars); +} +# Matches one Comment, PI or whitespace run; returns 1 on a match, 0 otherwise. +sub Misc { + my ($self, $reader) = @_; + if ($self->Comment($reader)) { + return 1; + } + elsif ($self->PI($reader)) { + return 1; + } + elsif ($self->skip_whitespace($reader)) { + return 1; + } + + return 0; +} + +sub Reference { + my ($self, $reader) = @_; + + if
(!$reader->match_char('&')) { + return 0; + } + + if ($reader->match_char('#')) { + # CharRef + my $char; + my $ref; + if ($reader->match_char('x')) { + $reader->consume(qr/[0-9a-fA-F]/) || + $self->parser_error("Hex character reference contains illegal characters", $reader); + $ref = $reader->consumed; + $char = chr_ref(hex($ref)); + $ref = "x$ref"; + } + else { + $reader->consume(qr/[0-9]/) || + $self->parser_error("Decimal character reference contains illegal characters", $reader); + $ref = $reader->consumed; + $char = chr_ref($ref); + } + $reader->match_char(';') || + $self->parser_error("No semi-colon found after character reference", $reader); + if ($char !~ $SingleChar) { # match a single character + $self->parser_error("Character reference &#$ref; refers to an illegal XML character ($char)", $reader); + } + $self->characters({ Data => $char }); + return 1; + } + else { + # EntityRef + my $name = $self->Name($reader); + $reader->match_char(';') || + $self->parser_error("No semi-colon found after entity name", $reader); + + # expand it + if ($self->_is_entity($name)) { + + if ($self->_is_external($name)) { + my $value = $self->_get_entity($name); + my $ent_reader = XML::SAX::PurePerl::Reader::URI->new($value); + $self->encoding_detect($ent_reader); + $self->extParsedEnt($ent_reader); + } + else { + my $value = $self->_stringify_entity($name); + my $ent_reader = XML::SAX::PurePerl::Reader::String->new($value); + $self->content($ent_reader); + } + return 1; + } + elsif (_is_internal($name)) { + $self->characters({ Data => $int_ents{$name} }); + return 1; + } + else { + $self->parser_error("Undeclared entity", $reader); + } + } +} + +sub AttReference { + # a reference in an attribute value. 
+ my ($self, $reader) = @_; + + if ($reader->match_char('#')) { + # CharRef + my $char; + my $ref; + if ($reader->match_char('x')) { + $reader->consume(qr/[0-9a-fA-F]/) || + $self->parser_error("Hex character reference contains illegal characters", $reader); + $ref = $reader->consumed; + $char = chr_ref(hex($ref)); + $ref = "x$ref"; + } + else { + $reader->consume(qr/[0-9]/) || + $self->parser_error("Decimal character reference contains illegal characters", $reader); + $ref = $reader->consumed; + $char = chr_ref($ref); + } + $reader->match_char(';') || + $self->parser_error("No semi-colon found after character reference", $reader); + if ($char !~ $SingleChar) { # match a single character + $self->parser_error("Character reference '&#$ref;' refers to an illegal XML character ($char)", $reader); + } + return $char; + } + else { + # EntityRef + my $name = $self->Name($reader); + $reader->match_char(';') || + $self->parser_error("No semi-colon found after entity name", $reader); + + # expand it + if ($self->_is_entity($name)) { + if ($self->_is_external($name)) { + $self->parser_error("No external entity references allowed in attribute values", $reader); + } + else { + my $value = $self->_stringify_entity($name); + return $value; + } + } + elsif (_is_internal($name)) { + return $int_ents{$name}; + } + else { + $self->parser_error("Undeclared entity '$name'", $reader); + } + } + +} + +sub extParsedEnt { + my ($self, $reader) = @_; + + $self->TextDecl($reader); + $self->content($reader); +} + +sub _is_internal { + my $e = shift; + return 1 if $e eq 'amp' || $e eq 'lt' || $e eq 'gt' || $e eq 'quot' || $e eq 'apos'; + return 0; +} + +sub _is_external { + my ($self, $name) = @_; +# TODO: Fix this to use $reader to store the entities perhaps. 
+ if ($self->{ParseOptions}{external_entities}{$name}) { + return 1; + } + return ; +} + +sub _is_entity { + my ($self, $name) = @_; +# TODO: ditto above + if (exists $self->{ParseOptions}{entities}{$name}) { + return 1; + } + return 0; +} + +sub _stringify_entity { + my ($self, $name) = @_; +# TODO: ditto above + if (exists $self->{ParseOptions}{expanded_entity}{$name}) { + return $self->{ParseOptions}{expanded_entity}{$name}; + } + # expand + my $reader = XML::SAX::PurePerl::Reader::URI->new($self->{ParseOptions}{entities}{$name}); + $reader->consume(qr/./); + return $self->{ParseOptions}{expanded_entity}{$name} = $reader->consumed; +} + +sub _get_entity { + my ($self, $name) = @_; +# TODO: ditto above + return $self->{ParseOptions}{entities}{$name}; +} + +sub skip_whitespace { + my ($self, $reader) = @_; + + my $found = 0; + while (1) { + if ($reader->match_char("\x20") || + $reader->match_char("\x0A") || + $reader->match_char("\x0D") || + $reader->match_char("\x09")) + { + $found++; + } + else { + last; + } + } + return $found; +} + +sub Attribute { + my ($self, $reader) = @_; + + $self->skip_whitespace($reader) || return; + if ($reader->match_sequence('/', '>')) { + $reader->buffer("/>"); + return; + } + if ($reader->match_char(">")) { + $reader->buffer(">"); + return; + } + if (my $name = $self->Name($reader)) { + $self->skip_whitespace($reader); + $reader->match_char('=') || + $self->parser_error("No '=' in Attribute", $reader); + $self->skip_whitespace($reader); + my $value = $self->AttValue($reader); + + if (!$self->cdata_attrib($name)) { + $value =~ s/^\x20*//; # discard leading spaces + $value =~ s/\x20*$//; # discard trailing spaces + $value =~ s/ {1,}/ /g; # all >1 space to single space + } + + return $name, $value; + } + + return; +} + +sub cdata_attrib { + # TODO implement this! 
+ return 0; +} + +sub AttValue { + my ($self, $reader) = @_; + + my $quote = '"'; + if (!$reader->match_char($quote)) { + $quote = "'"; + $reader->match_char($quote) || + $self->parser_error("Not a quote character", $reader); + } + + my $value = ''; + + while (1) { + if ($reader->consume_not('<', '&', $quote)) { + my $to_append = $reader->consumed; + $to_append =~ s/[\x09\x0A\x0D]/\x20/g; # Attrib value normalize + $value .= $to_append; + } + elsif ($reader->match_char('&')) { + $value .= $self->AttReference($reader); + } + elsif ($reader->match_char($quote)) { + # end of attrib + last; + } + else { + $self->parser_error("Invalid character in attribute value", $reader); + } + } + + return $value; +} + +sub Comment { + my ($self, $reader) = @_; + + if ($reader->match_sequence('<', '!', '-', '-')) { + my $comment_str = ''; + while (1) { + if ($reader->match_char('-')) { + if ($reader->match_char('-')) { + $reader->match_char('>') || + $self->parser_error("Invalid string in comment field", $reader); + last; + } + $comment_str .= '-'; + $reader->consume($CharMinusDash) || + $self->parser_error("Invalid string in comment field", $reader); + $comment_str .= $reader->consumed; + } + elsif ($reader->consume($CharMinusDash)) { + $comment_str .= $reader->consumed; + } + else { + $self->parser_error("Invalid string in comment field", $reader); + } + } + + $self->comment({ Data => $comment_str }); + + return 1; + } + return 0; +} + +sub PI { + my ($self, $reader) = @_; + if ($reader->match_sequence('<', '?')) { + my ($target, $data); + $target = $self->Name($reader) || + $self->parser_error("PI has no target", $reader); + if ($self->skip_whitespace($reader)) { + while (1) { + if ($reader->match_sequence('?', '>')) { + last; + } + elsif ($reader->match_re($Any)) { + $data .= $reader->matched; + } + else { + last; + } + } + } + else { + $reader->match_sequence('?', '>') || + $self->parser_error("PI closing sequence not found", $reader); + } + $self->processing_instruction({ 
Target => $target, Data => $data }); + + return 1; + } + return 0; +} + +sub Name { + my ($self, $reader) = @_; + + return $reader->consume_name(); +} + +sub quote { + my ($self, $reader) = @_; + my $quote = '"'; + + if (!$reader->match_char($quote)) { + $quote = "'"; + $reader->match_char($quote) || + $self->parser_error("Invalid quote token", $reader); + } + return $quote; +} + +1; +__END__ + +=head1 NAME + +XML::SAX::PurePerl - Pure Perl XML Parser with SAX2 interface + +=head1 SYNOPSIS + + use XML::Handler::Foo; + use XML::SAX::PurePerl; + my $handler = XML::Handler::Foo->new(); + my $parser = XML::SAX::PurePerl->new(Handler => $handler); + $parser->parse_uri("myfile.xml"); + +=head1 DESCRIPTION + +This module implements an XML parser in pure perl. It is written around the +upcoming perl 5.8's unicode support and support for multiple document +encodings (using the PerlIO layer), however it has been ported to work with +ASCII/UTF8 documents under lower perl versions. + +The SAX2 API is described in detail at http://sourceforge.net/projects/perl-xml/, in +the CVS archive, under libxml-perl/docs. Hopefully those documents will be in a +better location soon. + +Please refer to the SAX2 documentation for how to use this module - it is merely a +front end to SAX2, and implements nothing that is not in that spec (or at least tries +not to - please email me if you find errors in this implementation). + +=head1 BUGS + +XML::SAX::PurePerl is B. Very slow. I suggest you use something else +in fact. However it is great as a fallback parser for XML::SAX, where the +user might not be able to install an XS based parser or C library. + +Currently lots, probably. At the moment the weakest area is parsing DOCTYPE declarations, +though the code is in place to start doing this. Also parsing parameter entity +references is causing me much confusion, since it's not exactly what I would call +trivial, or well documented in the XML grammar. 
XML documents with internal subsets +are likely to fail. + +I am however trying to work towards full conformance using the Oasis test suite. + +=head1 AUTHOR + +Matt Sergeant, matt@sergeant.org. Copyright 2001. + +Please report all bugs to the Perl-XML mailing list at perl-xml@listserv.activestate.com. + +=head1 LICENSE + +This is free software. You may use it or redistribute it under the same terms as +Perl 5.7.2 itself. + +=cut + diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/DTDDecls.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/DTDDecls.pm new file mode 100644 index 0000000000000000000000000000000000000000..32977a5f0b9153644f673e709389e0aec58b3fc1 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/DTDDecls.pm @@ -0,0 +1,543 @@ +# $Id: DTDDecls.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl; + +use strict; +use XML::SAX::PurePerl::Productions qw($NameChar $SingleChar); + +sub elementdecl { + my ($self, $reader) = @_; + + if ($reader->match_string('skip_whitespace($reader) || + $self->parser_error("No whitespace after ELEMENT declaration", $reader); + + my $name = $self->Name($reader); + + $self->skip_whitespace($reader) || + $self->parser_error("No whitespace after ELEMENT's name", $reader); + + $self->contentspec($reader, $name); + + $self->skip_whitespace($reader); + + $reader->match('>') || + $self->parser_error("Closing angle bracket not found on ELEMENT declaration", $reader); + + return 1; + } + + return 0; +} + +sub contentspec { + my ($self, $reader, $name) = @_; + + my $model; + if ($reader->match_string('EMPTY')) { + $model = 'EMPTY'; + } + elsif ($reader->match_string('ANY')) { + $model = 'ANY'; + } + else { + $model = $self->Mixed_or_children($reader); + } + + if ($model) { + # call SAX callback now. 
+ $self->element_decl({Name => $name, Model => $model}); + return 1; + } + + $self->parser_error("contentspec not found in ELEMENT declaration", $reader); +} + +sub Mixed_or_children { + my ($self, $reader) = @_; + + my $model; + if ($reader->match('(')) { + $model = '('; + + $self->skip_whitespace($reader); + + if ($reader->match_string('#PCDATA')) { + return $self->Mixed($reader); + } + + # not matched - must be Children + $reader->buffer('('); + return $self->children($reader); + } + + return; +} + +# Mixed ::= ( '(' S* PCDATA ( S* '|' S* QName )* S* ')' '*' ) +# | ( '(' S* PCDATA S* ')' ) +sub Mixed { + my ($self, $reader) = @_; + + # Mixed_or_children already matched '(' S* '#PCDATA' + + my $model = '(#PCDATA'; + + $self->skip_whitespace($reader); + + my %seen; + + while ($reader->match('|')) { + $self->skip_whitespace($reader); + + my $name = $self->Name($reader) || + $self->parser_error("No 'Name' after Mixed content '|'", $reader); + + if ($seen{$name}) { + $self->parser_error("Element '$name' has already appeared in this group", $reader); + } + $seen{$name}++; + + $model .= "|$name"; + + $self->skip_whitespace($reader); + } + + $reader->match(')') || $self->parser_error("no closing bracket on mixed content", $reader); + + $model .= ")"; + + if ($reader->match('*')) { + $model .= "*"; + } + + return $model; +} + +# [[47]] Children ::= ChoiceOrSeq Cardinality? +# [[48]] Cp ::= ( QName | ChoiceOrSeq ) Cardinality? +# ChoiceOrSeq ::= '(' S* Cp ( Choice | Seq )? S* ')' +# [[49]] Choice ::= ( S* '|' S* Cp )+ +# [[50]] Seq ::= ( S* ',' S* Cp )+ +# // Children ::= (Choice | Seq) Cardinality? +# // Cp ::= ( QName | Choice | Seq) Cardinality? +# // Choice ::= '(' S* Cp ( S* '|' S* Cp )+ S* ')' +# // Seq ::= '(' S* Cp ( S* ',' S* Cp )* S* ')' +# [[51]] Mixed ::= ( '(' S* PCDATA ( S* '|' S* QName )* S* ')' MixedCardinality ) +# | ( '(' S* PCDATA S* ')' ) +# Cardinality ::= '?' 
| '+' | '*' +# MixedCardinality ::= '*' +sub children { + my ($self, $reader) = @_; + + return $self->ChoiceOrSeq($reader) . $self->Cardinality($reader); +} + +sub ChoiceOrSeq { + my ($self, $reader) = @_; + + $reader->match('(') || $self->parser_error("choice/seq contains no opening bracket", $reader); + + my $model = '('; + + $self->skip_whitespace($reader); + + $model .= $self->Cp($reader); + + if (my $choice = $self->Choice($reader)) { + $model .= $choice; + } + else { + $model .= $self->Seq($reader); + } + + $self->skip_whitespace($reader); + + $reader->match(')') || $self->parser_error("choice/seq contains no closing bracket", $reader); + + $model .= ')'; + + return $model; +} + +sub Cardinality { + my ($self, $reader) = @_; + # cardinality is always optional + if ($reader->match('?')) { + return '?'; + } + if ($reader->match('+')) { + return '+'; + } + if ($reader->match('*')) { + return '*'; + } + return ''; +} + +sub Cp { + my ($self, $reader) = @_; + + my $model; + if (my $name = $self->Name($reader)) { + return $name . $self->Cardinality($reader); + } + return $self->ChoiceOrSeq($reader) . 
$self->Cardinality($reader); +} + +sub Choice { + my ($self, $reader) = @_; + + my $model = ''; + $self->skip_whitespace($reader); + while ($reader->match('|')) { + $self->skip_whitespace($reader); + $model .= '|'; + $model .= $self->Cp($reader); + $self->skip_whitespace($reader); + } + + return $model; +} + +sub Seq { + my ($self, $reader) = @_; + + my $model = ''; + $self->skip_whitespace($reader); + while ($reader->match(',')) { + $self->skip_whitespace($reader); + $model .= ','; + $model .= $self->Cp($reader); + $self->skip_whitespace($reader); + } + + return $model; +} + +sub AttlistDecl { + my ($self, $reader) = @_; + + if ($reader->match_string('skip_whitespace($reader) || + $self->parser_error("No whitespace after ATTLIST declaration", $reader); + my $name = $self->Name($reader); + + $self->AttDefList($reader, $name); + + $self->skip_whitespace($reader); + $reader->match('>') || + $self->parser_error("Closing angle bracket not found on ATTLIST declaration", $reader); + return 1; + } + + return 0; +} + +sub AttDefList { + my ($self, $reader, $name) = @_; + + 1 while $self->AttDef($reader, $name); +} + +sub AttDef { + my ($self, $reader, $el_name) = @_; + + $self->skip_whitespace($reader) || return 0; + my $att_name = $self->Name($reader) || return 0; + $self->skip_whitespace($reader) || + $self->parser_error("No whitespace after Name in attribute definition", $reader); + my $att_type = $self->AttType($reader); + + $self->skip_whitespace($reader) || + $self->parser_error("No whitespace after AttType in attribute definition", $reader); + my ($default, $value) = $self->DefaultDecl($reader); + + # fire SAX event here! 
+ $self->attribute_decl({ + eName => $el_name, + aName => $att_name, + Type => $att_type, + ValueDefault => $default, + Value => $value, + }); + return 1; +} + +sub AttType { + my ($self, $reader) = @_; + + return $self->StringType($reader) || + $self->TokenizedType($reader) || + $self->EnumeratedType($reader) || + $self->parser_error("Can't match AttType", $reader); +} + +sub StringType { + my ($self, $reader) = @_; + if ($reader->match_string('CDATA')) { + return 'CDATA'; + } + return; +} + +sub TokenizedType { + my ($self, $reader) = @_; + if ($reader->match_string('IDREFS')) { + return 'IDREFS'; + } + if ($reader->match_string('IDREF')) { + return 'IDREF'; + } + if ($reader->match_string('ID')) { + return 'ID'; + } + if ($reader->match_string('ENTITIES')) { + return 'ENTITIES'; + } + if ($reader->match_string('ENTITY')) { + return 'ENTITY'; + } + if ($reader->match_string('NMTOKENS')) { + return 'NMTOKENS'; + } + if ($reader->match_string('NMTOKEN')) { + return 'NMTOKEN'; + } + return; +} + +sub EnumeratedType { + my ($self, $reader) = @_; + return $self->NotationType($reader) || $self->Enumeration($reader); +} + +sub NotationType { + my ($self, $reader) = @_; + if ($reader->match_string('NOTATION')) { + $self->skip_whitespace($reader) || + $self->parser_error("No whitespace after NOTATION", $reader); + $reader->match('(') || + $self->parser_error("No opening bracket in notation section", $reader); + $self->skip_whitespace($reader); + my $model = 'NOTATION ('; + my $name = $self->Name($reader) || + $self->parser_error("No name in notation section", $reader); + $model .= $name; + $self->skip_whitespace($reader); + while ($reader->match('|')) { + $model .= '|'; + $self->skip_whitespace($reader); + my $name = $self->Name($reader) || + $self->parser_error("No name in notation section", $reader); + $model .= $name; + $self->skip_whitespace($reader); + } + $reader->match(')') || + $self->parser_error("No closing bracket in notation section", $reader); + $model .= 
')'; + + return $model; + } + return; +} + +sub Enumeration { + my ($self, $reader) = @_; + if ($reader->match('(')) { + $self->skip_whitespace($reader); + my $model = '('; + my $nmtoken = $self->Nmtoken($reader) || + $self->parser_error("No Nmtoken in enumerated declaration", $reader); + $model .= $nmtoken; + $self->skip_whitespace($reader); + while ($reader->match('|')) { + $model .= '|'; + $self->skip_whitespace($reader); + my $nmtoken = $self->Nmtoken($reader) || + $self->parser_error("No Nmtoken in enumerated declaration", $reader); + $model .= $nmtoken; + $self->skip_whitespace($reader); + } + $reader->match(')') || + $self->parser_error("No closing bracket in enumerated declaration", $reader); + $model .= ')'; + + return $model; + } + return; +} + +sub Nmtoken { + my ($self, $reader) = @_; + $reader->consume($NameChar); + return $reader->consumed; +} + +sub DefaultDecl { + my ($self, $reader) = @_; + if ($reader->match_string('#REQUIRED')) { + return '#REQUIRED'; + } + if ($reader->match_string('#IMPLIED')) { + return '#IMPLIED'; + } + my $model = ''; + if ($reader->match_string('#FIXED')) { + $self->skip_whitespace($reader) || $self->parser_error( + "no whitespace after FIXED specifier", $reader); + my $value = $self->AttValue($reader); + return "#FIXED", $value; + } + my $value = $self->AttValue($reader); + return undef, $value; +} + +sub EntityDecl { + my ($self, $reader) = @_; + + if ($reader->match_string('skip_whitespace($reader) || $self->parser_error( + "No whitespace after ENTITY declaration", $reader); + + $self->PEDecl($reader) || $self->GEDecl($reader); + + $self->skip_whitespace($reader); + $reader->match('>') || $self->parser_error("No closing '>' in entity definition", $reader); + + return 1; + } + return 0; +} + +sub GEDecl { + my ($self, $reader) = @_; + + my $name = $self->Name($reader) || $self->parser_error("No entity name given", $reader); + $self->skip_whitespace($reader) || $self->parser_error("No whitespace after entity name", 
$reader); + + # TODO: ExternalID calls lexhandler method. Wrong place for it. + my $value; + if ($value = $self->ExternalID($reader)) { + $value .= $self->NDataDecl($reader); + } + else { + $value = $self->EntityValue($reader); + } + + if ($self->{ParseOptions}{entities}{$name}) { + warn("entity $name already exists\n"); + } else { + $self->{ParseOptions}{entities}{$name} = 1; + $self->{ParseOptions}{expanded_entity}{$name} = $value; # ??? + } + # do callback? + return 1; +} + +sub PEDecl { + my ($self, $reader) = @_; + + $reader->match('%') || return 0; + $self->skip_whitespace($reader) || $self->parser_error("No whitespace after parameter entity marker", $reader); + my $name = $self->Name($reader) || $self->parser_error("No parameter entity name given", $reader); + $self->skip_whitespace($reader) || $self->parser_error("No whitespace after parameter entity name", $reader); + my $value = $self->ExternalID($reader) || + $self->EntityValue($reader) || + $self->parser_error("PE is not a value or an external resource", $reader); + # do callback? 
+ return 1; +} + +my $quotre = qr/[^%&\"]/; +my $aposre = qr/[^%&\']/; + +sub EntityValue { + my ($self, $reader) = @_; + + my $quote = '"'; + my $re = $quotre; + if (!$reader->match($quote)) { + $quote = "'"; + $re = $aposre; + $reader->match($quote) || + $self->parser_error("Not a quote character", $reader); + } + + my $value = ''; + + while (1) { + if ($reader->consume($re)) { + $value .= $reader->consumed; + } + elsif ($reader->match('&')) { + # if it's a char ref, expand now: + if ($reader->match('#')) { + my $char; + my $ref; + if ($reader->match('x')) { + $reader->consume(qr/[0-9a-fA-F]/) || + $self->parser_error("Hex character reference contains illegal characters", $reader); + $ref = $reader->consumed; + $char = chr_ref(hex($ref)); + $ref = "x$ref"; + } + else { + $reader->consume(qr/[0-9]/) || + $self->parser_error("Decimal character reference contains illegal characters", $reader); + $ref = $reader->consumed; + $char = chr($ref); + } + $reader->match(';') || + $self->parser_error("No semi-colon found after character reference", $reader); + if ($char !~ $SingleChar) { # match a single character + $self->parser_error("Character reference '&#$ref;' refers to an illegal XML character ($char)", $reader); + } + $value .= $char; + } + else { + # entity refs in entities get expanded later, so don't parse now. 
+ $value .= '&'; + } + } + elsif ($reader->match('%')) { + $value .= $self->PEReference($reader); + } + elsif ($reader->match($quote)) { + # end of attrib + last; + } + else { + $self->parser_error("Invalid character in attribute value", $reader); + } + } + + return $value; +} + +sub NDataDecl { + my ($self, $reader) = @_; + $self->skip_whitespace($reader) || return ''; + $reader->match_string("NDATA") || return ''; + $self->skip_whitespace($reader) || $self->parser_error("No whitespace after NDATA declaration", $reader); + my $name = $self->Name($reader) || $self->parser_error("NDATA declaration lacks a proper Name", $reader); + return " NDATA $name"; +} + +sub NotationDecl { + my ($self, $reader) = @_; + + if ($reader->match_string('skip_whitespace($reader) || + $self->parser_error("No whitespace after NOTATION declaration", $reader); + $reader->consume(qr/[^>]/); # FIXME + $reader->match('>'); # FIXME + $self->notation_decl({Name => "FIXME", SystemId => "FIXME", PublicId => "FIXME" }); + return 1; + } + return 0; +} + +1; diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/DebugHandler.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/DebugHandler.pm new file mode 100644 index 0000000000000000000000000000000000000000..0d72ea95b81e49def0dce6047abcdda403153ca8 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/DebugHandler.pm @@ -0,0 +1,95 @@ +# $Id: DebugHandler.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl::DebugHandler; + +use strict; + +sub new { + my $class = shift; + my %opts = @_; + return bless \%opts, $class; +} + +# DocumentHandler + +sub set_document_locator { + my $self = shift; + print "set_document_locator\n" if $ENV{DEBUG_XML}; + $self->{seen}{set_document_locator}++; +} + +sub start_document { + my $self = shift; + print "start_document\n" if $ENV{DEBUG_XML}; + $self->{seen}{start_document}++; +} + +sub end_document { + my $self = shift; + print "end_document\n" if $ENV{DEBUG_XML}; + 
$self->{seen}{end_document}++; +} + +sub start_element { + my $self = shift; + print "start_element\n" if $ENV{DEBUG_XML}; + $self->{seen}{start_element}++; +} + +sub end_element { + my $self = shift; + print "end_element\n" if $ENV{DEBUG_XML}; + $self->{seen}{end_element}++; +} + +sub characters { + my $self = shift; + print "characters\n" if $ENV{DEBUG_XML}; +# warn "Char: ", $_[0]->{Data}, "\n"; + $self->{seen}{characters}++; +} + +sub processing_instruction { + my $self = shift; + print "processing_instruction\n" if $ENV{DEBUG_XML}; + $self->{seen}{processing_instruction}++; +} + +sub ignorable_whitespace { + my $self = shift; + print "ignorable_whitespace\n" if $ENV{DEBUG_XML}; + $self->{seen}{ignorable_whitespace}++; +} + +# LexHandler + +sub comment { + my $self = shift; + print "comment\n" if $ENV{DEBUG_XML}; + $self->{seen}{comment}++; +} + +# DTDHandler + +sub notation_decl { + my $self = shift; + print "notation_decl\n" if $ENV{DEBUG_XML}; + $self->{seen}{notation_decl}++; +} + +sub unparsed_entity_decl { + my $self = shift; + print "unparsed_entity_decl\n" if $ENV{DEBUG_XML}; + $self->{seen}{entity_decl}++; +} + +# EntityResolver + +sub resolve_entity { + my $self = shift; + print "resolve_entity\n" if $ENV{DEBUG_XML}; + $self->{seen}{resolve_entity}++; + return ''; +} + +1; diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/DocType.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/DocType.pm new file mode 100644 index 0000000000000000000000000000000000000000..6c92140c1cfd0f3e12c2ff99b629b182fcd581cb --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/DocType.pm @@ -0,0 +1,158 @@ +# $Id: DocType.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl; + +use strict; +use XML::SAX::PurePerl::Productions qw($PubidChar); + +sub doctypedecl { + my ($self, $reader) = @_; + + if ($reader->match_string('skip_whitespace($reader) || + $self->parser_error("No whitespace after doctype declaration", $reader); + + my $root_name = 
$self->Name($reader) || + $self->parser_error("Doctype declaration has no root element name", $reader); + + if ($self->skip_whitespace($reader)) { + # might be externalid... + my %dtd = $self->ExternalID($reader); + # TODO: Call SAX event + } + + $self->skip_whitespace($reader); + + $self->InternalSubset($reader); + + $reader->match('>') || + $self->parser_error("Doctype not closed", $reader); + + return 1; + } + + return 0; +} + +sub ExternalID { + my ($self, $reader) = @_; + + if ($reader->match_string('SYSTEM')) { + $self->skip_whitespace($reader) || + $self->parser_error("No whitespace after SYSTEM identifier", $reader); + return (SYSTEM => $self->SystemLiteral($reader)); + } + elsif ($reader->match_string('PUBLIC')) { + $self->skip_whitespace($reader) || + $self->parser_error("No whitespace after PUBLIC identifier", $reader); + + my $quote = $self->quote($reader) || + $self->parser_error("Not a quote character in PUBLIC identifier", $reader); + + $reader->consume(qr/[^$quote]/); + my $pubid = $reader->consumed; + if ($pubid !~ /^($PubidChar)+$/) { + $self->parser_error("Invalid characters in PUBLIC identifier", $reader); + } + + $reader->match($quote) || + $self->parser_error("Invalid quote character ending PUBLIC identifier", $reader); + $self->skip_whitespace($reader) || + $self->parser_error("Not whitespace after PUBLIC ID in DOCTYPE", $reader); + + return (PUBLIC => $pubid, + SYSTEM => $self->SystemLiteral($reader)); + } + else { + return; + } + + return 1; +} + +sub SystemLiteral { + my ($self, $reader) = @_; + + my $quote = $self->quote($reader); + + $reader->consume(qr/[^$quote]/); + my $systemid = $reader->consumed; + + $reader->match($quote) || + $self->parser_error("Invalid token in System Literal", $reader); + return $systemid; +} + +sub InternalSubset { + my ($self, $reader) = @_; + + if ($reader->match('[')) { + + 1 while $self->IntSubsetDecl($reader); + + $reader->match(']') || + $self->parser_error("No close bracket on internal subset", 
$reader); + $self->skip_whitespace($reader); + return 1; + } + + return 0; +} + +sub IntSubsetDecl { + my ($self, $reader) = @_; + + return $self->DeclSep($reader) || $self->markupdecl($reader); +} + +sub DeclSep { + my ($self, $reader) = @_; + + if ($self->skip_whitespace($reader)) { + return 1; + } + + if ($self->PEReference($reader)) { + return 1; + } + +# if ($self->ParsedExtSubset($reader)) { +# return 1; +# } + + return 0; +} + +sub PEReference { + my ($self, $reader) = @_; + + if ($reader->match('%')) { + my $peref = $self->Name($reader) || + $self->parser_error("PEReference did not find a Name", $reader); + # TODO - load/parse the peref + + $reader->match(';') || + $self->parser_error("Invalid token in PEReference", $reader); + return 1; + } + + return 0; +} + +sub markupdecl { + my ($self, $reader) = @_; + + if ($self->elementdecl($reader) || + $self->AttlistDecl($reader) || + $self->EntityDecl($reader) || + $self->NotationDecl($reader) || + $self->PI($reader) || + $self->Comment($reader)) + { + return 1; + } + + return 0; +} + +1; diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/EncodingDetect.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/EncodingDetect.pm new file mode 100644 index 0000000000000000000000000000000000000000..ad3e5a8536b67aeb222ef425976631627d3610a5 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/EncodingDetect.pm @@ -0,0 +1,187 @@ +# $Id: EncodingDetect.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl; # NB, not ::EncodingDetect! 
+ +use strict; + +sub encoding_detect { + my ($parser, $reader) = @_; + + my $error = "Invalid byte sequence at start of file"; + + # BO == Byte Order mark + if ($reader->match_nocheck("\x00")) { + # maybe BO-UCS4-be, BO-UCS4-3412, UCS4-be, UCS4-2143, UCS4-3412, UTF-16BE + if ($reader->match_nocheck("\x00")) { + # maybe BO-UCS4-be, BO-UCS4-2143, UCS4-be, UCS4-2143 + if ($reader->match_nocheck("\xFE")) { + if ($reader->match_nonext("\xFF")) { + # BO-UCS4-be + $reader->set_encoding("UCS-4BE"); + $reader->next; + return; + } + } + elsif ($reader->match_nocheck("\xFF")) { + if ($reader->match_nonext("\xFE")) { + # BO-UCS-4-2143 + $reader->set_encoding("UCS-4-2143"); + $reader->next; + return; + } + } + elsif ($reader->match_nocheck("\x00")) { + if ($reader->match_nonext("\x3C")) { + # UCS4-be + $reader->set_encoding("UCS-4BE"); + $reader->next; + $reader->buffer('<'); + return; + } + } + elsif ($reader->match_nocheck("\x3C")) { + if ($reader->match_nonext("\x00")) { + # UCS-4-2143 + $reader->set_encoding("UCS-4-2143"); + $reader->next; + $reader->buffer('<'); + return; + } + } + } + elsif ($reader->match_nocheck("\x3C")) { + # maybe UCS4-3412, UTF-16BE + if ($reader->match_nocheck("\x00")) { + if ($reader->match_nonext("\x00")) { + # UCS4-3412 + $reader->set_encoding("UCS-4-3412"); + $reader->next; + # these are parsable chars + $reader->buffer("<"); + return; + } + elsif ($reader->match_nonext("\x3F")) { + # UTF-16BE + $reader->set_encoding("UTF-16BE"); + # these are parsable chars + $reader->buffer("parser_error($error, $reader); + } + elsif ($reader->match_nocheck("\xFF")) { + # maybe BO-UCS-4LE, UTF-16LE + if ($reader->match_nocheck("\xFE")) { + if ($reader->match_nocheck("\x00")) { + if ($reader->match_nonext("\x00")) { + $reader->set_encoding("UCS-4LE"); + $reader->next; + return; + } + } + else { + my $byte1 = $reader->current; + $reader->next; + my $char = chr unpack("v", $byte1 . 
$reader->current); + $reader->set_encoding("UTF-16LE"); + $reader->next; + $reader->buffer($char); + return; + } + } + + $parser->parser_error($error, $reader); + } + elsif ($reader->match_nocheck("\xFE")) { + # maybe BO-UCS-4-3412, UTF-16BE + if ($reader->match_nocheck("\xFF")) { + if ($reader->match_nocheck("\x00")) { + if ($reader->match_nonext("\x00")) { + $reader->set_encoding("UCS-4-3412"); + $reader->next; + return; + } + elsif ($reader->match_nonext("\x3C")) { + $reader->set_encoding("UTF-16BE"); + $reader->next; + $reader->buffer("<"); + return; + } + } + } + $parser->parser_error($error, $reader); + } + elsif ($reader->match_nocheck("\xEF")) { + if ($reader->match_nocheck("\xBB")) { + if ($reader->match_nonext("\xBF")) { + # OK, UTF-8 + $reader->set_encoding("UTF-8"); + $reader->next; + return; + } + } + $parser->parser_error($error, $reader); + } + elsif ($reader->match_nocheck("\x3C")) { + if ($reader->match_nocheck("\x00")) { + if ($reader->match_nocheck("\x00")) { + if ($reader->match_nonext("\x00")) { + $reader->set_encoding("UCS-4LE"); + $reader->next; + $reader->buffer("<"); + return; + } + } + elsif ($reader->match_nocheck("\x3F")) { + if ($reader->match_nonext("\x00")) { + $reader->set_encoding("UTF-16LE"); + $reader->next; + $reader->buffer("match_nocheck("\x3F")) { + if ($reader->match_nocheck("\x78")) { + if ($reader->match_nocheck("\x6D")) { + # some 7 or 8 bit charset with ASCII chars in right place + $reader->buffer("buffer('buffer('buffer("<"); + return; + } + } + elsif ($reader->match_nocheck("\x4C") && + $reader->match_nocheck("\x6F") && + $reader->match_nocheck("\xA7") && + $reader->match_nonext("\x94")) + { + $reader->set_encoding("EBCDIC"); + $reader->next; + return; + } + + # lets just try parsing it... 
+ return; + + # $parser->parser_error($error, $reader); +} + +1; + diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Exception.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Exception.pm new file mode 100644 index 0000000000000000000000000000000000000000..f9ab02068c0be1c554c8db004d2609747ed64afe --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Exception.pm @@ -0,0 +1,67 @@ +# $Id: Exception.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl::Exception; + +use strict; + +use overload '""' => "stringify"; + +use vars qw/$StackTrace/; + +$StackTrace = $ENV{XML_DEBUG} || 0; + +sub throw { + my $class = shift; + die $class->new(@_); +} + +sub new { + my $class = shift; + my %opts = @_; + die "Invalid options" unless exists $opts{Message}; + + if ($opts{reader}) { + return bless { Message => $opts{Message}, + Exception => undef, # not sure what this is for!!! + ColumnNumber => $opts{reader}->column, + LineNumber => $opts{reader}->line, + PublicId => $opts{reader}->public_id, + SystemId => $opts{reader}->system_id, + $StackTrace ? (StackTrace => stacktrace()) : (), + }, $class; + } + return bless { Message => $opts{Message}, + Exception => undef, # not sure what this is for!!! + }, $class; +} + +sub stringify { + my $self = shift; + local $^W; + return $self->{Message} . " [Ln: " . $self->{LineNumber} . + ", Col: " . $self->{ColumnNumber} . "]" . + ($StackTrace ? stackstring($self->{StackTrace}) : "") . "\n"; +} + +sub stacktrace { + my $i = 2; + my @fulltrace; + while (my @trace = caller($i++)) { + my %hash; + @hash{qw(Package Filename Line)} = @trace[0..2]; + push @fulltrace, \%hash; + } + return \@fulltrace; +} + +sub stackstring { + my $stacktrace = shift; + my $string = "\nFrom:\n"; + foreach my $current (@$stacktrace) { + $string .= $current->{Filename} . " Line: " . $current->{Line} . 
"\n"; + } + return $string; +} + +1; + diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/NoUnicodeExt.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/NoUnicodeExt.pm new file mode 100644 index 0000000000000000000000000000000000000000..39eafd4bb56f295a2ca6ffa7c7d74ddf2f2695a7 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/NoUnicodeExt.pm @@ -0,0 +1,28 @@ +# $Id: NoUnicodeExt.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl; +use strict; + +sub chr_ref { + my $n = shift; + if ($n < 0x80) { + return chr ($n); + } + elsif ($n < 0x800) { + return pack ("CC", (($n >> 6) | 0xc0), (($n & 0x3f) | 0x80)); + } + elsif ($n < 0x10000) { + return pack ("CCC", (($n >> 12) | 0xe0), ((($n >> 6) & 0x3f) | 0x80), + (($n & 0x3f) | 0x80)); + } + elsif ($n < 0x110000) + { + return pack ("CCCC", (($n >> 18) | 0xf0), ((($n >> 12) & 0x3f) | 0x80), + ((($n >> 6) & 0x3f) | 0x80), (($n & 0x3f) | 0x80)); + } + else { + return undef; + } +} + +1; diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Productions.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Productions.pm new file mode 100644 index 0000000000000000000000000000000000000000..21335ab685ddb34281455f9cfe1365ca7f416cfc --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Productions.pm @@ -0,0 +1,151 @@ +# $Id: Productions.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl::Productions; + +use Exporter; +@ISA = ('Exporter'); +@EXPORT_OK = qw($S $Char $VersionNum $BaseChar $Letter $Ideographic + $Extender $Digit $CombiningChar $EncNameStart $EncNameEnd $NameChar $CharMinusDash + $PubidChar $Any $SingleChar); + +### WARNING!!! All productions here must *only* match a *single* character!!! ### + +BEGIN { +$S = qr/^[\x20\x09\x0D\x0A]$/; + +$CharMinusDash = qr/[^-]/x; + +$Any = qr/ . 
/xms; + +$VersionNum = qr/ [a-zA-Z0-9_.:-]+ /x; + +$EncNameStart = qr/ [A-Za-z] /x; +$EncNameEnd = qr/ [A-Za-z0-9\._-] /x; + +$PubidChar = qr/ [\x20\x0D\x0Aa-zA-Z0-9'()\+,.\/:=\?;!*\#@\$_\%-] /x; + +if ($] < 5.006) { + eval <<' PERL'; + $Char = qr/^ [\x09\x0A\x0D\x20-\x7F]|([\xC0-\xFD][\x80-\xBF]+) $/x; + + $SingleChar = qr/^$Char$/; + + $BaseChar = qr/ [\x41-\x5A\x61-\x7A]|([\xC0-\xFD][\x80-\xBF]+) /x; + + $Extender = qr/ \xB7 /x; + + $Digit = qr/ [\x30-\x39] /x; + + $Letter = qr/^ $BaseChar $/x; + + # can't do this one without unicode + # $CombiningChar = qr/^$/msx; + + $NameChar = qr/^ $BaseChar | $Digit | [._:-] | $Extender $/x; + PERL + die $@ if $@; +} +else { + eval <<' PERL'; + + use utf8; # for 5.6 + + $Char = qr/^ [\x09\x0A\x0D\x{0020}-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}] $/x; + + $SingleChar = qr/^$Char$/; + + $BaseChar = qr/ +[\x{0041}-\x{005A}\x{0061}-\x{007A}\x{00C0}-\x{00D6}\x{00D8}-\x{00F6}] | +[\x{00F8}-\x{00FF}\x{0100}-\x{0131}\x{0134}-\x{013E}\x{0141}-\x{0148}] | +[\x{014A}-\x{017E}\x{0180}-\x{01C3}\x{01CD}-\x{01F0}\x{01F4}-\x{01F5}] | +[\x{01FA}-\x{0217}\x{0250}-\x{02A8}\x{02BB}-\x{02C1}\x{0386}\x{0388}-\x{038A}] | +[\x{038C}\x{038E}-\x{03A1}\x{03A3}-\x{03CE}\x{03D0}-\x{03D6}\x{03DA}] | +[\x{03DC}\x{03DE}\x{03E0}\x{03E2}-\x{03F3}\x{0401}-\x{040C}\x{040E}-\x{044F}] | +[\x{0451}-\x{045C}\x{045E}-\x{0481}\x{0490}-\x{04C4}\x{04C7}-\x{04C8}] | +[\x{04CB}-\x{04CC}\x{04D0}-\x{04EB}\x{04EE}-\x{04F5}\x{04F8}-\x{04F9}] | +[\x{0531}-\x{0556}\x{0559}\x{0561}-\x{0586}\x{05D0}-\x{05EA}\x{05F0}-\x{05F2}] | +[\x{0621}-\x{063A}\x{0641}-\x{064A}\x{0671}-\x{06B7}\x{06BA}-\x{06BE}] | +[\x{06C0}-\x{06CE}\x{06D0}-\x{06D3}\x{06D5}\x{06E5}-\x{06E6}\x{0905}-\x{0939}] | +[\x{093D}\x{0958}-\x{0961}\x{0985}-\x{098C}\x{098F}-\x{0990}] | +[\x{0993}-\x{09A8}\x{09AA}-\x{09B0}\x{09B2}\x{09B6}-\x{09B9}\x{09DC}-\x{09DD}] | +[\x{09DF}-\x{09E1}\x{09F0}-\x{09F1}\x{0A05}-\x{0A0A}\x{0A0F}-\x{0A10}] | +[\x{0A13}-\x{0A28}\x{0A2A}-\x{0A30}\x{0A32}-\x{0A33}\x{0A35}-\x{0A36}] | 
+[\x{0A38}-\x{0A39}\x{0A59}-\x{0A5C}\x{0A5E}\x{0A72}-\x{0A74}\x{0A85}-\x{0A8B}] | +[\x{0A8D}\x{0A8F}-\x{0A91}\x{0A93}-\x{0AA8}\x{0AAA}-\x{0AB0}] | +[\x{0AB2}-\x{0AB3}\x{0AB5}-\x{0AB9}\x{0ABD}\x{0AE0}\x{0B05}-\x{0B0C}] | +[\x{0B0F}-\x{0B10}\x{0B13}-\x{0B28}\x{0B2A}-\x{0B30}\x{0B32}-\x{0B33}] | +[\x{0B36}-\x{0B39}\x{0B3D}\x{0B5C}-\x{0B5D}\x{0B5F}-\x{0B61}\x{0B85}-\x{0B8A}] | +[\x{0B8E}-\x{0B90}\x{0B92}-\x{0B95}\x{0B99}-\x{0B9A}\x{0B9C}] | +[\x{0B9E}-\x{0B9F}\x{0BA3}-\x{0BA4}\x{0BA8}-\x{0BAA}\x{0BAE}-\x{0BB5}] | +[\x{0BB7}-\x{0BB9}\x{0C05}-\x{0C0C}\x{0C0E}-\x{0C10}\x{0C12}-\x{0C28}] | +[\x{0C2A}-\x{0C33}\x{0C35}-\x{0C39}\x{0C60}-\x{0C61}\x{0C85}-\x{0C8C}] | +[\x{0C8E}-\x{0C90}\x{0C92}-\x{0CA8}\x{0CAA}-\x{0CB3}\x{0CB5}-\x{0CB9}\x{0CDE}] | +[\x{0CE0}-\x{0CE1}\x{0D05}-\x{0D0C}\x{0D0E}-\x{0D10}\x{0D12}-\x{0D28}] | +[\x{0D2A}-\x{0D39}\x{0D60}-\x{0D61}\x{0E01}-\x{0E2E}\x{0E30}\x{0E32}-\x{0E33}] | +[\x{0E40}-\x{0E45}\x{0E81}-\x{0E82}\x{0E84}\x{0E87}-\x{0E88}\x{0E8A}] | +[\x{0E8D}\x{0E94}-\x{0E97}\x{0E99}-\x{0E9F}\x{0EA1}-\x{0EA3}\x{0EA5}\x{0EA7}] | +[\x{0EAA}-\x{0EAB}\x{0EAD}-\x{0EAE}\x{0EB0}\x{0EB2}-\x{0EB3}\x{0EBD}] | +[\x{0EC0}-\x{0EC4}\x{0F40}-\x{0F47}\x{0F49}-\x{0F69}\x{10A0}-\x{10C5}] | +[\x{10D0}-\x{10F6}\x{1100}\x{1102}-\x{1103}\x{1105}-\x{1107}\x{1109}] | +[\x{110B}-\x{110C}\x{110E}-\x{1112}\x{113C}\x{113E}\x{1140}\x{114C}\x{114E}] | +[\x{1150}\x{1154}-\x{1155}\x{1159}\x{115F}-\x{1161}\x{1163}\x{1165}] | +[\x{1167}\x{1169}\x{116D}-\x{116E}\x{1172}-\x{1173}\x{1175}\x{119E}\x{11A8}] | +[\x{11AB}\x{11AE}-\x{11AF}\x{11B7}-\x{11B8}\x{11BA}\x{11BC}-\x{11C2}] | +[\x{11EB}\x{11F0}\x{11F9}\x{1E00}-\x{1E9B}\x{1EA0}-\x{1EF9}\x{1F00}-\x{1F15}] | +[\x{1F18}-\x{1F1D}\x{1F20}-\x{1F45}\x{1F48}-\x{1F4D}\x{1F50}-\x{1F57}] | +[\x{1F59}\x{1F5B}\x{1F5D}\x{1F5F}-\x{1F7D}\x{1F80}-\x{1FB4}\x{1FB6}-\x{1FBC}] | +[\x{1FBE}\x{1FC2}-\x{1FC4}\x{1FC6}-\x{1FCC}\x{1FD0}-\x{1FD3}] | +[\x{1FD6}-\x{1FDB}\x{1FE0}-\x{1FEC}\x{1FF2}-\x{1FF4}\x{1FF6}-\x{1FFC}] | 
+[\x{2126}\x{212A}-\x{212B}\x{212E}\x{2180}-\x{2182}\x{3041}-\x{3094}] | +[\x{30A1}-\x{30FA}\x{3105}-\x{312C}\x{AC00}-\x{D7A3}] + /x; + + $Extender = qr/ +[\x{00B7}\x{02D0}\x{02D1}\x{0387}\x{0640}\x{0E46}\x{0EC6}\x{3005}\x{3031}-\x{3035}\x{309D}-\x{309E}\x{30FC}-\x{30FE}] +/x; + + $Digit = qr/ +[\x{0030}-\x{0039}\x{0660}-\x{0669}\x{06F0}-\x{06F9}\x{0966}-\x{096F}] | +[\x{09E6}-\x{09EF}\x{0A66}-\x{0A6F}\x{0AE6}-\x{0AEF}\x{0B66}-\x{0B6F}] | +[\x{0BE7}-\x{0BEF}\x{0C66}-\x{0C6F}\x{0CE6}-\x{0CEF}\x{0D66}-\x{0D6F}] | +[\x{0E50}-\x{0E59}\x{0ED0}-\x{0ED9}\x{0F20}-\x{0F29}] +/x; + + $CombiningChar = qr/ +[\x{0300}-\x{0345}\x{0360}-\x{0361}\x{0483}-\x{0486}\x{0591}-\x{05A1}] | +[\x{05A3}-\x{05B9}\x{05BB}-\x{05BD}\x{05BF}\x{05C1}-\x{05C2}\x{05C4}] | +[\x{064B}-\x{0652}\x{0670}\x{06D6}-\x{06DC}\x{06DD}-\x{06DF}\x{06E0}-\x{06E4}] | +[\x{06E7}-\x{06E8}\x{06EA}-\x{06ED}\x{0901}-\x{0903}\x{093C}] | +[\x{093E}-\x{094C}\x{094D}\x{0951}-\x{0954}\x{0962}-\x{0963}\x{0981}-\x{0983}] | +[\x{09BC}\x{09BE}\x{09BF}\x{09C0}-\x{09C4}\x{09C7}-\x{09C8}] | +[\x{09CB}-\x{09CD}\x{09D7}\x{09E2}-\x{09E3}\x{0A02}\x{0A3C}\x{0A3E}\x{0A3F}] | +[\x{0A40}-\x{0A42}\x{0A47}-\x{0A48}\x{0A4B}-\x{0A4D}\x{0A70}-\x{0A71}] | +[\x{0A81}-\x{0A83}\x{0ABC}\x{0ABE}-\x{0AC5}\x{0AC7}-\x{0AC9}\x{0ACB}-\x{0ACD}] | +[\x{0B01}-\x{0B03}\x{0B3C}\x{0B3E}-\x{0B43}\x{0B47}-\x{0B48}] | +[\x{0B4B}-\x{0B4D}\x{0B56}-\x{0B57}\x{0B82}-\x{0B83}\x{0BBE}-\x{0BC2}] | +[\x{0BC6}-\x{0BC8}\x{0BCA}-\x{0BCD}\x{0BD7}\x{0C01}-\x{0C03}\x{0C3E}-\x{0C44}] | +[\x{0C46}-\x{0C48}\x{0C4A}-\x{0C4D}\x{0C55}-\x{0C56}\x{0C82}-\x{0C83}] | +[\x{0CBE}-\x{0CC4}\x{0CC6}-\x{0CC8}\x{0CCA}-\x{0CCD}\x{0CD5}-\x{0CD6}] | +[\x{0D02}-\x{0D03}\x{0D3E}-\x{0D43}\x{0D46}-\x{0D48}\x{0D4A}-\x{0D4D}\x{0D57}] | +[\x{0E31}\x{0E34}-\x{0E3A}\x{0E47}-\x{0E4E}\x{0EB1}\x{0EB4}-\x{0EB9}] | +[\x{0EBB}-\x{0EBC}\x{0EC8}-\x{0ECD}\x{0F18}-\x{0F19}\x{0F35}\x{0F37}\x{0F39}] | +[\x{0F3E}\x{0F3F}\x{0F71}-\x{0F84}\x{0F86}-\x{0F8B}\x{0F90}-\x{0F95}] | 
+[\x{0F97}\x{0F99}-\x{0FAD}\x{0FB1}-\x{0FB7}\x{0FB9}\x{20D0}-\x{20DC}\x{20E1}] | +[\x{302A}-\x{302F}\x{3099}\x{309A}] +/x; + + $Ideographic = qr/ +[\x{4E00}-\x{9FA5}\x{3007}\x{3021}-\x{3029}] +/x; + + $Letter = qr/^ $BaseChar | $Ideographic $/x; + + $NameChar = qr/^ $Letter | $Digit | [._:-] | $CombiningChar | $Extender $/x; + PERL + + die $@ if $@; +} + +} + +1; diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader.pm new file mode 100644 index 0000000000000000000000000000000000000000..dc4d274f69a4af50e5d78844407879b52fc62065 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader.pm @@ -0,0 +1,339 @@ +# $Id: Reader.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl::Reader; + +use strict; +use XML::SAX::PurePerl::Reader::URI; +use XML::SAX::PurePerl::Productions qw( $SingleChar $Letter $NameChar ); +use Exporter (); + +use vars qw(@ISA @EXPORT_OK); +@ISA = qw(Exporter); +@EXPORT_OK = qw( + EOF + BUFFER + INTERNAL_BUFFER + LINE + COLUMN + CURRENT + ENCODING +); + +use constant EOF => 0; +use constant BUFFER => 1; +use constant INTERNAL_BUFFER => 2; +use constant LINE => 3; +use constant COLUMN => 4; +use constant MATCHED => 5; +use constant CURRENT => 6; +use constant CONSUMED => 7; +use constant ENCODING => 8; +use constant SYSTEM_ID => 9; +use constant PUBLIC_ID => 10; + +require XML::SAX::PurePerl::Reader::Stream; +require XML::SAX::PurePerl::Reader::String; + +if ($] >= 5.007002) { + require XML::SAX::PurePerl::Reader::UnicodeExt; +} +else { + require XML::SAX::PurePerl::Reader::NoUnicodeExt; +} + +sub new { + my $class = shift; + my $thing = shift; + + # try to figure if this $thing is a handle of some sort + if (ref($thing) && UNIVERSAL::isa($thing, 'IO::Handle')) { + return XML::SAX::PurePerl::Reader::Stream->new($thing)->init; + } + my $ioref; + if (tied($thing)) { + my $class = ref($thing); + no strict 'refs'; + $ioref = $thing if defined 
&{"${class}::TIEHANDLE"}; + } + else { + eval { + $ioref = *{$thing}{IO}; + }; + undef $@; + } + if ($ioref) { + return XML::SAX::PurePerl::Reader::Stream->new($thing)->init; + } + + if ($thing =~ /new($thing)->init; + } + + # assume it is a uri + return XML::SAX::PurePerl::Reader::URI->new($thing)->init; +} + +sub init { + my $self = shift; + $self->[LINE] = 1; + $self->[COLUMN] = 1; + $self->nextchar; + return $self; +} + +sub match { + my $self = shift; + if ($self->match_nocheck(@_)) { + if ($self->[MATCHED] =~ $SingleChar) { + return 1; + } + throw XML::SAX::Exception::Parse ( + Message => "Not a valid XML character: '&#x". + sprintf("%X", ord($self->[MATCHED])). + ";'" + ); + } + return 0; +} + +sub match_char { + my $self = shift; + + if (defined($self->[CURRENT]) && $self->[CURRENT] eq $_[0]) { + $self->[MATCHED] = $_[0]; + $self->nextchar; + return 1; + } + $self->[MATCHED] = ''; + return 0; +} + +sub match_re { + my $self = shift; + + if ($self->[CURRENT] =~ $_[0]) { + $self->[MATCHED] = $self->[CURRENT]; + $self->nextchar; + return 1; + } + $self->[MATCHED] = ''; + return 0; +} + +sub match_not { + my $self = shift; + + my $current = $self->[CURRENT]; + return 0 unless defined $current; + + for my $m (@_) { + if ($current eq $m) { + $self->[MATCHED] = ''; + return 0; + } + } + $self->[MATCHED] = $current; + $self->nextchar; + return 1; +} + +my %hist; +END { + foreach my $k (sort { $hist{$a} <=> $hist{$b} } keys %hist ) { + my $x = $k; + $k =~ s/^(.{80})(.{3}).*/$1\.\.\./s; + # warn("$k called $hist{$x} times\n"); + } +} + +sub match_nonext { + my $self = shift; + + my $current = $self->[CURRENT]; + return 0 unless defined $current; + + foreach my $m (@_) { + # $hist{$m}++; + if (my $ref = ref($m)) { + if ($ref eq 'Regexp' && $current =~ $m) { + $self->[MATCHED] = $current; + return 1; + } + } + elsif ($current eq $m) { + $self->[MATCHED] = $current; + return 1; + } + } + $self->[MATCHED] = ''; + return 0; +} + +sub match_nocheck { + my $self = shift; + 
+ if ($self->match_nonext(@_)) { + $self->nextchar; + + return 1; + } + return 0; +} + +sub matched { + my $self = shift; + return $self->[MATCHED]; +} + +my $unpack_type = ($] >= 5.007002) ? 'U*' : 'C*'; + +sub match_string { + my $self = shift; + my ($str) = @_; + my $matched = ''; +# for my $char (map { chr } unpack($unpack_type, $str)) { + for my $char (split //, $str) { + if ($self->match_char($char)) { + $matched .= $self->[MATCHED]; + } + else { + $self->buffer($matched); + return 0; + } + } + return 1; +} + +# avoids split +sub match_sequence { + my $self = shift; + my $matched = ''; + for my $char (@_) { + if ($self->match_char($char)) { + $matched .= $self->[MATCHED]; + } + else { + $self->buffer($matched); + return 0; + } + } + return 1; +} + +sub consume_name { + my $self = shift; + + my $current = $self->[CURRENT]; + return unless defined $current; # perhaps die here instead? + + my $name; + if ($current eq '_') { + $name = '_'; + } + elsif ($current eq ':') { + $name = ':'; + } + else { + $self->consume($Letter) || + throw XML::SAX::Exception::Parse ( + Message => "Name contains invalid start character: '&#x". + sprintf("%X", ord($self->[CURRENT])). 
+ ";'", reader => $self ); + $name = $self->[CONSUMED]; + } + + $self->consume($NameChar); + $name .= $self->[CONSUMED]; + return $name; +} + +sub consume { + my $self = shift; + + my $consumed = ''; + + while(!$self->eof && $self->match_re(@_)) { + $consumed .= $self->[MATCHED]; + } + return length($self->[CONSUMED] = $consumed); +} + + + +sub consume_not { + my $self = shift; + + my $consumed = ''; + + while(!$self->[EOF] && $self->match_not(@_)) { + $consumed .= $self->[MATCHED]; + } + return length($self->[CONSUMED] = $consumed); +} + +sub consumed { + my $self = shift; + return $self->[CONSUMED]; +} + +sub current { + my $self = shift; + return $self->[CURRENT]; +} + +sub buffer { + my $self = shift; + # warn("buffering: '$_[0]' + '$self->[CURRENT]' + '$self->[BUFFER]'\n"); + local $^W; + my $current = $self->[CURRENT]; + if ($] >= 5.006 && $] < 5.007) { + $current = pack("C0A*", $current); + } + $self->[BUFFER] = $_[0] . $current . $self->[BUFFER]; + $self->[COLUMN] -= length($_[0]); + $self->nextchar; +} + +sub public_id { + my ($self, $value) = @_; + if (defined $value) { + return $self->[PUBLIC_ID] = $value; + } + return $self->[PUBLIC_ID]; +} + +sub system_id { + my ($self, $value) = @_; + if (defined $value) { + return $self->[SYSTEM_ID] = $value; + } + return $self->[SYSTEM_ID]; +} + +sub line { + shift->[LINE]; +} + +sub column { + shift->[COLUMN]; +} + +sub get_encoding { + my $self = shift; + return $self->[ENCODING]; +} + +sub eof { + return shift->[EOF]; +} + +1; + +__END__ + +=head1 NAME + +XML::Parser::PurePerl::Reader - Abstract Reader factory class + +=cut diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader/NoUnicodeExt.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader/NoUnicodeExt.pm new file mode 100644 index 0000000000000000000000000000000000000000..c22af0b807de93ace604c9431af61a6a5c66baca --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader/NoUnicodeExt.pm @@ -0,0 +1,99 @@ +# $Id: NoUnicodeExt.pm,v 1.1.1.1 2007/08/07 
06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl::Reader; +use strict; + +use XML::SAX::PurePerl::Reader qw( + CURRENT + ENCODING +); + +sub set_raw_stream { + # no-op +} + +sub switch_encoding_stream { + my ($fh, $encoding) = @_; + throw XML::SAX::Exception::Parse ( + Message => "Only ASCII encoding allowed without perl 5.7.2 or higher. You tried: $encoding", + ) if $encoding !~ /(ASCII|UTF\-?8)/i; +} + +sub switch_encoding_string { + my (undef, $encoding) = @_; + throw XML::SAX::Exception::Parse ( + Message => "Only ASCII encoding allowed without perl 5.7.2 or higher. You tried: $encoding", + ) if $encoding !~ /(ASCII|UTF\-?8)/i; +} + +sub nextchar { + my $self = shift; + $self->next; + + return unless defined $self->[CURRENT]; + + if ($self->[CURRENT] eq "\x0D") { + $self->next; + return unless defined($self->[CURRENT]); + if ($self->[CURRENT] ne "\x0A") { + $self->buffer("\x0A"); + } + } + + return unless $self->[ENCODING]; + my $n = ord($self->[CURRENT]); + # warn(sprintf("ch: 0x%x ($self->[CURRENT])\n", $n)); + if (($] < 5.007002) && ($n > 0x7F)) { + # utf8 surrogate + my $current = $self->[CURRENT]; + if ($n >= 0xFC) { + # read 5 chars + $self->next; $current .= $self->[CURRENT]; + $self->next; $current .= $self->[CURRENT]; + $self->next; $current .= $self->[CURRENT]; + $self->next; $current .= $self->[CURRENT]; + $self->next; $current .= $self->[CURRENT]; + } + elsif ($n >= 0xF8) { + # read 4 chars + $self->next; $current .= $self->[CURRENT]; + $self->next; $current .= $self->[CURRENT]; + $self->next; $current .= $self->[CURRENT]; + $self->next; $current .= $self->[CURRENT]; + } + elsif ($n >= 0xF0) { + # read 3 chars + $self->next; $current .= $self->[CURRENT]; + $self->next; $current .= $self->[CURRENT]; + $self->next; $current .= $self->[CURRENT]; + } + elsif ($n >= 0xE0) { + # read 2 chars + $self->next; $current .= $self->[CURRENT]; + $self->next; $current .= $self->[CURRENT]; + } + elsif ($n >= 0xC0) { + # read 1 char + $self->next; $current .= 
$self->[CURRENT]; + } + else { + throw XML::SAX::Exception::Parse( + Message => sprintf("Invalid character 0x%x", $n), + ColumnNumber => $self->column, + LineNumber => $self->line, + PublicId => $self->public_id, + SystemId => $self->system_id, + ); + } + if ($] >= 5.006001) { + $self->[CURRENT] = pack("U0A*", $current); + } + else { + $self->[CURRENT] = $current; + } + # warn("read extra. current now: $current\n"); + } +} + +1; + diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader/Stream.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader/Stream.pm new file mode 100644 index 0000000000000000000000000000000000000000..2cdd8c472a092a8c6f3a678334fa627bcf60c570 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader/Stream.pm @@ -0,0 +1,82 @@ +# $Id: Stream.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl::Reader::Stream; + +use strict; +use vars qw(@ISA); + +use XML::SAX::PurePerl::Reader qw( + EOF + BUFFER + INTERNAL_BUFFER + LINE + COLUMN + CURRENT + ENCODING +); +use XML::SAX::Exception; + +@ISA = ('XML::SAX::PurePerl::Reader'); + +use constant FH => 11; +use constant BUFFER_SIZE => 12; + +sub new { + my $class = shift; + my $ioref = shift; + XML::SAX::PurePerl::Reader::set_raw_stream($ioref); + my @parts; + @parts[FH, LINE, COLUMN, BUFFER, EOF, INTERNAL_BUFFER, BUFFER_SIZE] = + ($ioref, 1, 0, '', 0, '', 1); + return bless \@parts, $class; +} + +sub next { + my $self = shift; + + # check for chars in buffer first. 
+ if (length($self->[BUFFER])) { + return $self->[CURRENT] = substr($self->[BUFFER], 0, 1, ''); # last param truncates buffer + } + + + if (length($self->[INTERNAL_BUFFER])) { +BUFFERED_READ: + $self->[CURRENT] = substr($self->[INTERNAL_BUFFER], 0, 1, ''); + if ($self->[CURRENT] eq "\x0A") { + $self->[LINE]++; + $self->[COLUMN] = 1; + } + else { $self->[COLUMN]++ } + return; + } + + my $bytesread = read($self->[FH], $self->[INTERNAL_BUFFER], $self->[BUFFER_SIZE]); + if ($bytesread) { + goto BUFFERED_READ; + } + elsif (defined($bytesread)) { + $self->[EOF]++; + return $self->[CURRENT] = undef; + } + throw XML::SAX::Exception::Parse( + Message => "Error reading from filehandle: $!", + ); +} + +sub set_encoding { + my $self = shift; + my ($encoding) = @_; + # warn("set encoding to: $encoding\n"); + XML::SAX::PurePerl::Reader::switch_encoding_stream($self->[FH], $encoding); + $self->[BUFFER_SIZE] = 1024; + $self->[ENCODING] = $encoding; +} + +sub bytepos { + my $self = shift; + tell($self->[FH]); +} + +1; + diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader/String.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader/String.pm new file mode 100644 index 0000000000000000000000000000000000000000..cf0b721609f9276f8824264a901c70903dff9ed0 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader/String.pm @@ -0,0 +1,65 @@ +# $Id: String.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl::Reader::String; + +use strict; +use vars qw(@ISA); + +use XML::SAX::PurePerl::Reader qw( + CURRENT + LINE + COLUMN + INTERNAL_BUFFER + BUFFER + ENCODING + EOF +); + +@ISA = ('XML::SAX::PurePerl::Reader'); + +use constant DISCARDED => 11; + +sub new { + my $class = shift; + my $string = shift; + my @parts; + @parts[BUFFER, EOF, LINE, COLUMN, INTERNAL_BUFFER, DISCARDED] = + ('', 0, 1, 0, $string, ''); + return bless \@parts, $class; +} + +sub next { + my $self = shift; + + $self->[DISCARDED] .= $self->[CURRENT] if defined $self->[CURRENT]; + + # 
check for chars in buffer first. + if (length($self->[BUFFER])) { + return $self->[CURRENT] = substr($self->[BUFFER], 0, 1, ''); # last param truncates buffer + } + + $self->[CURRENT] = substr($self->[INTERNAL_BUFFER], 0, 1, ''); + + if ($self->[CURRENT] eq "\x0A") { + $self->[LINE]++; + $self->[COLUMN] = 1; + } else { $self->[COLUMN]++ } + + $self->[EOF]++ unless length($self->[INTERNAL_BUFFER]); + return; +} + +sub set_encoding { + my $self = shift; + my ($encoding) = @_; + + XML::SAX::PurePerl::Reader::switch_encoding_string($self->[INTERNAL_BUFFER], $encoding, "utf-8"); + $self->[ENCODING] = $encoding; +} + +sub bytepos { + my $self = shift; + length($self->[DISCARDED]); +} + +1; diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader/URI.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader/URI.pm new file mode 100644 index 0000000000000000000000000000000000000000..d192ad5521e5dcbe5ba7b96feb2627135fbff400 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader/URI.pm @@ -0,0 +1,57 @@ +# $Id: URI.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl::Reader::URI; + +use strict; + +use XML::SAX::PurePerl::Reader; +use File::Temp qw(tempfile); +use Symbol; + +## NOTE: This is *not* a subclass of Reader. It just returns Stream or String +## Reader objects depending on what it's capabilities are. + +sub new { + my $class = shift; + my $uri = shift; + # request the URI + if (-e $uri && -f _) { + my $fh = gensym; + open($fh, $uri) || die "Cannot open file $uri : $!"; + return XML::SAX::PurePerl::Reader::Stream->new($fh); + } + elsif ($uri =~ /^file:(.*)$/ && -e $1 && -f _) { + my $file = $1; + my $fh = gensym; + open($fh, $file) || die "Cannot open file $file : $!"; + return XML::SAX::PurePerl::Reader::Stream->new($fh); + } + else { + # request URI, return String reader + require LWP::UserAgent; + my $ua = LWP::UserAgent->new; + $ua->agent("Perl/XML/SAX/PurePerl/1.0 " . 
$ua->agent); + + my $req = HTTP::Request->new(GET => $uri); + + my $fh = tempfile(); + + my $callback = sub { + my ($data, $response, $protocol) = @_; + print $fh $data; + }; + + my $res = $ua->request($req, $callback, 4096); + + if ($res->is_success) { + seek($fh, 0, 0); + return XML::SAX::PurePerl::Reader::Stream->new($fh); + } + else { + die "LWP Request Failed"; + } + } +} + + +1; diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader/UnicodeExt.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader/UnicodeExt.pm new file mode 100644 index 0000000000000000000000000000000000000000..d87758a7f18e93fe27a72a1d9ed6db43fb36b671 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/Reader/UnicodeExt.pm @@ -0,0 +1,40 @@ +# $Id: UnicodeExt.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl::Reader; +use strict; + +use XML::SAX::PurePerl::Reader qw(CURRENT); +use Encode; + +sub set_raw_stream { + my ($fh) = @_; + binmode($fh, ":bytes"); +} + +sub switch_encoding_stream { + my ($fh, $encoding) = @_; + binmode($fh, ":encoding($encoding)"); +} + +sub switch_encoding_string { + Encode::from_to($_[0], $_[1], "utf-8"); +} + +sub nextchar { + my $self = shift; + $self->next; + + return unless defined($self->[CURRENT]); + + if ($self->[CURRENT] eq "\x0D") { + $self->next; + return unless defined($self->[CURRENT]); + if ($self->[CURRENT] ne "\x0A") { + $self->buffer("\x0A"); + } + } +} + + +1; + diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/UnicodeExt.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/UnicodeExt.pm new file mode 100644 index 0000000000000000000000000000000000000000..8a33cc832b24f88dec587bf77491d0aa52d06f1c --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/UnicodeExt.pm @@ -0,0 +1,22 @@ +# $Id: UnicodeExt.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl; +use strict; + +no warnings 'utf8'; + +sub chr_ref { + return chr(shift); +} + +if ($] >= 5.007002) { + require Encode; + + 
Encode::define_alias( "UTF-16" => "UCS-2" ); + Encode::define_alias( "UTF-16BE" => "UCS-2" ); + Encode::define_alias( "UTF-16LE" => "ucs-2le" ); + Encode::define_alias( "UTF16LE" => "ucs-2le" ); +} + +1; + diff --git a/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/XMLDecl.pm b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/XMLDecl.pm new file mode 100644 index 0000000000000000000000000000000000000000..a01b116b93a608d344294f8dc6f0c0197a4b9de0 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/PurePerl/XMLDecl.pm @@ -0,0 +1,150 @@ +# $Id: XMLDecl.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::SAX::PurePerl; + +use strict; +use XML::SAX::PurePerl::Productions qw($S $VersionNum $EncNameStart $EncNameEnd); + +sub XMLDecl { + my ($self, $reader) = @_; + + if ($reader->match_string("match($S)) { + $self->skip_whitespace($reader); + + # get version attribute + $self->VersionInfo($reader) || + $self->parser_error("XML Declaration lacks required version attribute", $reader); + + if (!$self->skip_whitespace($reader)) { + $reader->match_string('?>') || $self->parser_error("Syntax error", $reader); + return; + } + + if ($self->EncodingDecl($reader)) { + if (!$self->skip_whitespace($reader)) { + $reader->match_string('?>') || $self->parser_error("Syntax error", $reader); + return; + } + } + + $self->SDDecl($reader); + + $self->skip_whitespace($reader); + + $reader->match_string('?>') || $self->parser_error("Syntax error in XML declaration", $reader); + # TODO: Call SAX event (xml_decl?) + # actually, sax has no xml_decl event. 
+ } + else { + # no xml decl + if (!$reader->get_encoding) { + $reader->set_encoding("UTF-8"); + } + } +} + +sub VersionInfo { + my ($self, $reader) = @_; + + $reader->match_string('version') + || return 0; + $self->skip_whitespace($reader); + $reader->match('=') || + $self->parser_error("Invalid token", $reader); + $self->skip_whitespace($reader); + + # set right quote char + my $quote = $self->quote($reader); + + # get version value + $reader->consume($VersionNum) || + $self->parser_error("Version number contains invalid characters", $reader); + + my $vernum = $reader->consumed; + if ($vernum ne "1.0") { + $self->parser_error("Only XML version 1.0 supported. Saw: '$vernum'", $reader); + } + + $reader->match($quote) || + $self->parser_error("Invalid token while looking for quote character", $reader); + + return 1; +} + +sub SDDecl { + my ($self, $reader) = @_; + + $reader->match_string("standalone") || return 0; + + $self->skip_whitespace($reader); + $reader->match('=') || $self->parser_error( + "No '=' by standalone declaration", $reader); + $self->skip_whitespace($reader); + + my $quote = $self->quote($reader); + + if ($reader->match_string('yes')) { + $self->{standalone} = 1; + } + elsif ($reader->match_string('no')) { + $self->{standalone} = 0; + } + else { + $self->parser_error("standalone declaration must be 'yes' or 'no'", $reader); + } + + $reader->match($quote) || + $self->parser_error("Invalid token in XML declaration", $reader); + + return 1; +} + +sub EncodingDecl { + my ($self, $reader) = @_; + + $reader->match_string('encoding') || return 0; + + $self->skip_whitespace($reader); + $reader->match('=') || $self->parser_error( + "No '=' by encoding declaration", $reader); + $self->skip_whitespace($reader); + + my $quote = $self->quote($reader); + + my $encoding = ''; + $reader->match($EncNameStart) || + $self->parser_error("Invalid encoding name", $reader); + $encoding .= $reader->matched; + $reader->consume($EncNameEnd); + $encoding .= 
$reader->consumed; + $reader->set_encoding($encoding); + + $reader->match($quote) || + $self->parser_error("Invalid token in XML declaration", $reader); + + return 1; +} + +sub TextDecl { + my ($self, $reader) = @_; + + $reader->match_string('skip_whitespace($reader) || $self->parser_error("No whitespace after text declaration", $reader); + + if ($self->VersionInfo($reader)) { + $self->skip_whitespace($reader) || + $self->parser_error("Lack of whitespace after version attribute in text declaration", $reader); + } + + $self->EncodingDecl($reader) || + $self->parser_error("Encoding declaration missing from external entity text declaration", $reader); + + $self->skip_whitespace($reader); + + $reader->match_string('?>') || $self->parser_error("Syntax error", $reader); + + return 1; +} + +1; diff --git a/qcd/part_cpu/bench/lib/XML/SAX/placeholder.pl b/qcd/part_cpu/bench/lib/XML/SAX/placeholder.pl new file mode 100644 index 0000000000000000000000000000000000000000..f5996ee63ab5c69ae608cc615c5a911db26fc925 --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/SAX/placeholder.pl @@ -0,0 +1 @@ +# ignore me diff --git a/qcd/part_cpu/bench/lib/XML/Simple.pm b/qcd/part_cpu/bench/lib/XML/Simple.pm new file mode 100644 index 0000000000000000000000000000000000000000..3dd7a37925ab8e41b8efe69107a7c48a060cbdad --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/Simple.pm @@ -0,0 +1,3041 @@ +# $Id: Simple.pm,v 1.1.1.1 2007/08/07 06:49:12 mahermanns Exp $ + +package XML::Simple; + +=head1 NAME + +XML::Simple - Easy API to maintain XML (esp config files) + +=head1 SYNOPSIS + + use XML::Simple; + + my $ref = XMLin([] [, ]); + + my $xml = XMLout($hashref [, ]); + +Or the object oriented way: + + require XML::Simple; + + my $xs = new XML::Simple(options); + + my $ref = $xs->XMLin([] [, ]); + + my $xml = $xs->XMLout($hashref [, ]); + +(or see L<"SAX SUPPORT"> for 'the SAX way'). + +To catch common errors: + + use XML::Simple qw(:strict); + +(see L<"STRICT MODE"> for more details). 
=pod

=cut

# NOTE: the '=cut' above terminates the SYNOPSIS POD that precedes this
# section.  The '=pod' in front of it is a no-op inside that POD block; it
# only ensures that this section still parses if read on its own.

# See after __END__ for more POD documentation


# Load essentials here, other modules loaded on demand later

use strict;
use Carp;
require Exporter;


##############################################################################
# Define some constants
#

use vars qw($VERSION @ISA @EXPORT @EXPORT_OK $PREFERRED_PARSER);

@ISA               = qw(Exporter);
@EXPORT            = qw(XMLin XMLout);
@EXPORT_OK         = qw(xml_in xml_out);
$VERSION           = '2.14';
$PREFERRED_PARSER  = undef;

# Set to 1 by import() when the caller says: use XML::Simple qw(:strict);
my $StrictMode     = 0;

# Caching scheme name => [ save routine, restore routine ]
my %CacheScheme    = (
                       storable => [ \&StorableSave,  \&StorableRestore ],
                       memshare => [ \&MemShareSave,  \&MemShareRestore ],
                       memcopy  => [ \&MemCopySave,   \&MemCopyRestore  ]
                     );

# Option names (lower case, underscores removed) accepted by XMLin()
my @KnownOptIn     = qw(keyattr keeproot forcecontent contentkey noattr
                        searchpath forcearray cache suppressempty parseropts
                        grouptags nsexpand datahandler varattr variables
                        normalisespace normalizespace valueattr);

# Option names (lower case, underscores removed) accepted by XMLout()
my @KnownOptOut    = qw(keyattr keeproot contentkey noattr
                        rootname xmldecl outputfile noescape suppressempty
                        grouptags nsexpand handler noindent attrindent nosort
                        valueattr numericescape);

my @DefKeyAttr     = qw(name key id);
my $DefRootName    = qq(opt);
my $DefContentKey  = qq(content);

# Declaration substituted by handle_options() when the caller passes
# xmldecl => 1.  It must be a non-empty string: XMLout() only prepends the
# declaration when the option value is true.
my $DefXmlDecl     = qq(<?xml version='1.0' standalone='yes'?>);

my $xmlns_ns       = 'http://www.w3.org/2000/xmlns/';
my $bad_def_ns_jcn = '{' . $xmlns_ns . '}';     # LibXML::SAX workaround


##############################################################################
# Globals for use by caching routines
#

my %MemShareCache  = ();
my %MemCopyCache   = ();


##############################################################################
# Wrapper for Exporter - handles ':strict'
#
# Arguments are the class name followed by the caller's import list.  The
# ':strict' tag switches on strict mode for the whole module; every other
# item is handed to Exporter so it is exported into the calling package.
#

sub import {

  # Handle the :strict tag

  $StrictMode = 1 if grep(/^:strict$/, @_);

  # Pass everything else to Exporter.pm

  __PACKAGE__->export_to_level(1, grep(!/^:strict$/, @_));
}


##############################################################################
# Constructor for optional object interface.
#

sub new {
  my $class = shift;

  # Arguments: a flat list of name => value default options.
  # Returns a blessed hashref whose {def_opt} entry holds the defaults keyed
  # by normalised option name (lower case, underscores removed).
  # Croaks on an odd-length list or on an option name that is valid for
  # neither XMLin() nor XMLout().

  if(@_ % 2) {
    croak "Default options must be name=>value pairs (odd number supplied)";
  }

  # Only the keys of this lookup matter, so assign an empty list to the slice

  my %known_opt;
  @known_opt{@KnownOptIn, @KnownOptOut} = ();

  my %raw_opt = @_;
  my %def_opt;
  while(my($key, $val) = each %raw_opt) {
    my $lkey = lc($key);
    $lkey =~ s/_//g;
    croak "Unrecognised option: $key" unless(exists($known_opt{$lkey}));
    $def_opt{$lkey} = $val;
  }
  my $self = { def_opt => \%def_opt };

  return(bless($self, $class));
}


##############################################################################
# Sub/Method: XMLin()
#
# Exported routine for slurping XML into a hashref - see pod for info.
#
# May be called as object method or as a plain function.
#
# Expects one arg for the source XML, optionally followed by a number of
# name => value option pairs.
#
# The source may be a string of XML, a filename, a reference (passed through
# to the parser as a filehandle) or '-' for standard input.  If it is
# undefined, <scriptname>.xml is looked for in the script's directory.
#
# Returns the collapsed data structure, or a cached copy of it if one of the
# requested caching schemes can supply one.
#

sub XMLin {

  # If this is not a method call, create an object

  my $self;
  if($_[0]  and  UNIVERSAL::isa($_[0], 'XML::Simple')) {
    $self = shift;
  }
  else {
    $self = XML::Simple->new();
  }


  my $string = shift;

  $self->handle_options('in', @_);


  # If no XML or filename supplied, look for scriptname.xml in script directory

  unless(defined($string))  {

    # Translate scriptname[.suffix] to scriptname.xml

    require File::Basename;

    my($ScriptName, $ScriptDir, $Extension) =
      File::Basename::fileparse($0, '\.[^\.]+');

    $string = $ScriptName . '.xml';


    # Add script directory to searchpath

    if($ScriptDir) {
      unshift(@{$self->{opt}->{searchpath}}, $ScriptDir);
    }
  }


  # Are we parsing from a file?  If so, is there a valid cache available?

  my($filename, $scheme);
  unless($string =~ m{<.*?>}s  or  ref($string)  or  $string eq '-') {

    require File::Basename;
    require File::Spec;

    $filename = $self->find_xml_file($string, @{$self->{opt}->{searchpath}});

    if($self->{opt}->{cache}) {
      foreach $scheme (@{$self->{opt}->{cache}}) {
        croak "Unsupported caching scheme: $scheme"
          unless($CacheScheme{$scheme});

        my $opt = $CacheScheme{$scheme}->[1]->($filename);
        return($opt) if($opt);
      }
    }
  }
  else {

    # Caching is keyed on a filename, so it cannot apply to these sources

    delete($self->{opt}->{cache});
    if($string eq '-') {
      # Read from standard input

      local($/) = undef;        # slurp the whole stream in one read
      $string = <STDIN>;
    }
  }


  # Parsing is required, so let's get on with it

  my $tree =  $self->build_tree($filename, $string);


  # Now work some magic on the resulting parse tree

  my($ref);
  if($self->{opt}->{keeproot}) {
    $ref = $self->collapse({}, @$tree);
  }
  else {
    $ref = $self->collapse(@{$tree->[1]});
  }

  # Only the first scheme listed is used to save the freshly parsed result

  if($self->{opt}->{cache}) {
    $CacheScheme{$self->{opt}->{cache}->[0]}->[0]->($ref, $filename);
  }

  return($ref);
}


##############################################################################
# Method: build_tree()
#
# This routine will be called if there is no suitable pre-parsed tree in a
# cache.  It parses the XML and returns an XML::Parser 'Tree' style data
# structure (summarised in the comments for the collapse() routine below).
#
# XML::Simple requires the services of another module that knows how to
# parse XML.  If XML::SAX is installed, the default SAX parser will be used,
# otherwise XML::Parser will be used.
#
# This routine expects to be passed a 'string' as argument 1 or a filename as
# argument 2.  The 'string' might be a string of XML or it might be a
# reference to an IO::Handle.  (This non-intuitive mess results in part from
# the way XML::Parser works but that's really no excuse).
#

sub build_tree {
  my($self, $filename, $string) = @_;

  # Work out which parser was asked for: the package variable takes
  # precedence over the environment; an empty string means 'no preference'

  my $wanted = defined($PREFERRED_PARSER)
             ? $PREFERRED_PARSER
             : ($ENV{XML_SIMPLE_PREFERRED_PARSER} || '');

  return($self->build_tree_xml_parser($filename, $string))
    if($wanted eq 'XML::Parser');


  # Load XML::SAX on demand.  Without it we can only fall back to
  # XML::Parser, which is an error if a SAX parser was named explicitly.

  eval { require XML::SAX; };
  if($@) {
    croak "XMLin() could not load XML::SAX" if($wanted);
    return($self->build_tree_xml_parser($filename, $string));
  }

  if($wanted) {
    $XML::SAX::ParserPackage = $wanted;
  }

  my $sax_parser = XML::SAX::ParserFactory->parser(Handler => $self);

  $self->{nocollapse} = 1;


  # A filename wins over the 'string'; a reference is parsed as a filehandle

  my $tree = $filename    ? $sax_parser->parse_uri($filename)
           : ref($string) ? $sax_parser->parse_file($string)
           :                $sax_parser->parse_string($string);

  return($tree);
}


##############################################################################
# Method: build_tree_xml_parser()
#
# This routine will be called if XML::SAX is not installed, or if XML::Parser
# was specifically requested.  It takes the same arguments as build_tree() and
# returns the same data structure (XML::Parser 'Tree' style).
#
# Arguments: the object, a filename (or undef) and the 'string' (XML text or
# a filehandle reference) that is parsed when no filename is given.
# Croaks if XML::Parser cannot be loaded or the file cannot be opened.
#

sub build_tree_xml_parser {
  my $self     = shift;
  my $filename = shift;
  my $string   = shift;


  eval {
    local($^W) = 0;      # Suppress warning from Expat.pm re File::Spec::load()
    require XML::Parser; # We didn't need it until now
  };
  if($@) {
    croak "XMLin() requires either XML::SAX or XML::Parser";
  }

  if($self->{opt}->{nsexpand}) {
    carp "'nsexpand' option requires XML::SAX";
  }

  # Use an explicit class method call.  This package defines its own new(),
  # and XML::Parser is only loaded at run time, so the indirect object form
  # 'new XML::Parser(...)' is at the mercy of the parser's guesswork.

  my $xp = XML::Parser->new(Style => 'Tree', @{$self->{opt}->{parseropts}});
  my($tree);
  if($filename) {
    # $tree = $xp->parsefile($filename);  # Changed due to prob w/mod_perl
    open(my $xml_fh, '<', $filename) || croak qq($filename - $!);
    $tree = $xp->parse($xml_fh);
    close($xml_fh);
  }
  else {
    $tree = $xp->parse($string);
  }

  return($tree);
}


##############################################################################
# Sub: StorableSave()
#
# Wrapper routine for invoking Storable::nstore() to cache a parsed data
# structure.
#
# Arguments: the data structure and the name of the XML file it came from.
# The cache file name is the XML file name with any trailing '.xml' replaced
# by '.stor'.
#

sub StorableSave {
  my($data, $filename) = @_;

  my $cachefile = $filename;
  $cachefile =~ s{(\.xml)?$}{.stor};

  require Storable;           # We didn't need it until now

  if ('VMS' eq $^O) {
    Storable::nstore($data, $cachefile);
  }
  else {
    # If the following line fails for you, your Storable.pm is old - upgrade
    Storable::lock_nstore($data, $cachefile);
  }

}


##############################################################################
# Sub: StorableRestore()
#
# Wrapper routine for invoking Storable::retrieve() to read a cached parsed
# data structure.  Only returns cached data if the cache file exists and is
# newer than the source XML file.
#

sub StorableRestore {
  my $xml_file = shift;

  (my $stor_file = $xml_file) =~ s{(\.xml)?$}{.stor};

  # The cache is usable only if it is readable and newer than the source

  return unless(-r $stor_file);

  my $cache_mtime  = (stat($stor_file))[9];
  my $source_mtime = (stat($xml_file))[9];
  return unless($cache_mtime > $source_mtime);

  require Storable;           # Loaded on demand

  # Plain retrieve() on VMS; the locking variant is used everywhere else

  my $reader = ('VMS' eq $^O) ? \&Storable::retrieve
                              : \&Storable::lock_retrieve;

  return($reader->($stor_file));
}


##############################################################################
# Sub: MemShareSave()
#
# Remembers the supplied data structure reference (not a copy) in a global
# hash, keyed by filename and stamped with the current time.
#

sub MemShareSave {
  my $data = shift;
  my $file = shift;

  return($MemShareCache{$file} = [ time(), $data ]);
}


##############################################################################
# Sub: MemShareRestore()
#
# Looks up a filename in the global hash and returns the cached structure if
# it was stored after the file was last modified.
#

sub MemShareRestore {
  my $file = shift;

  my $entry = $MemShareCache{$file};
  return unless($entry);

  my $stored_at = $entry->[0];
  return unless($stored_at > (stat($file))[9]);

  return($entry->[1]);
}


##############################################################################
# Sub: MemCopySave()
#
# Stores a deep copy of the supplied data structure in a global hash, keyed
# by filename and stamped with the current time.
#

sub MemCopySave {
  my $data = shift;
  my $file = shift;

  require Storable;           # Loaded on demand

  return($MemCopyCache{$file} = [ time(), Storable::dclone($data) ]);
}


##############################################################################
# Sub: MemCopyRestore()
#
# Takes a filename and looks in a global hash for a cached parsed version.
# Returns a reference to a copy of that data structure.
#
# Returns nothing if there is no entry for the file or if the entry was not
# stored after the file was last modified.
#

sub MemCopyRestore {
  my($filename) = @_;

  return unless($MemCopyCache{$filename});
  return unless($MemCopyCache{$filename}->[0] > (stat($filename))[9]);

  require Storable;           # Do not rely on MemCopySave() having loaded it

  return(Storable::dclone($MemCopyCache{$filename}->[1]));

}


##############################################################################
# Sub/Method: XMLout()
#
# Exported routine for 'unslurping' a data structure out to XML.
#
# Expects a reference to a data structure and an optional list of option
# name => value pairs.
#
# Returns the XML as a string, unless the 'outputfile' or 'handler' option is
# set: for an output filehandle/object the result of its print() method, for
# an output filename a true value once the file has been written and closed,
# and for a SAX handler the result of parsing the generated XML into it.
# Croaks if called without a data argument or if the output file cannot be
# opened, written or closed.
#

sub XMLout {

  # If this is not a method call, create an object

  my $self;
  if($_[0]  and  UNIVERSAL::isa($_[0], 'XML::Simple')) {
    $self = shift;
  }
  else {
    $self = XML::Simple->new();
  }

  croak "XMLout() requires at least one argument" unless(@_);
  my $ref = shift;

  $self->handle_options('out', @_);


  # If namespace expansion is set, XML::NamespaceSupport is required

  if($self->{opt}->{nsexpand}) {
    require XML::NamespaceSupport;
    $self->{nsup} = XML::NamespaceSupport->new();
    $self->{ns_prefix} = 'aaa';
  }


  # Wrap top level arrayref in a hash

  if(UNIVERSAL::isa($ref, 'ARRAY')) {
    $ref = { anon => $ref };
  }


  # Extract rootname from top level hash if keeproot enabled

  if($self->{opt}->{keeproot}) {
    my(@keys) = keys(%$ref);
    if(@keys == 1) {
      $ref = $ref->{$keys[0]};
      $self->{opt}->{rootname} = $keys[0];
    }
  }

  # Ensure there are no top level attributes if we're not adding root elements

  elsif($self->{opt}->{rootname} eq '') {
    if(UNIVERSAL::isa($ref, 'HASH')) {
      my $refsave = $ref;
      $ref = {};
      foreach my $name (keys(%$refsave)) {
        if(ref($refsave->{$name})) {
          $ref->{$name} = $refsave->{$name};
        }
        else {
          # Wrapping a scalar in an array makes it a nested element
          $ref->{$name} = [ $refsave->{$name} ];
        }
      }
    }
  }


  # Encode the hashref and write to file if necessary

  $self->{_ancestors} = [];     # used by value_to_xml() to detect cycles
  my $xml = $self->value_to_xml($ref, $self->{opt}->{rootname}, '');
  delete $self->{_ancestors};

  if($self->{opt}->{xmldecl}) {
    $xml = $self->{opt}->{xmldecl} . "\n" . $xml;
  }

  if($self->{opt}->{outputfile}) {
    if(ref($self->{opt}->{outputfile})) {
      return($self->{opt}->{outputfile}->print($xml));
    }
    else {
      my $outfile = "$self->{opt}->{outputfile}";
      open(my $out_fh, '>', $outfile) ||
        croak "open($outfile): $!";
      binmode($out_fh, ':utf8') if($] >= 5.008);

      # 'or' (not '||') so the check applies to print() itself rather than
      # to the truth of $xml

      print $out_fh $xml  or  croak "print: $!";

      # Buffered write errors only surface when the handle is closed

      close($out_fh)  or  croak "close($outfile): $!";
      return(1);
    }
  }
  elsif($self->{opt}->{handler}) {
    require XML::SAX;
    my $sp = XML::SAX::ParserFactory->parser(
               Handler => $self->{opt}->{handler}
             );
    return($sp->parse_string($xml));
  }
  else {
    return($xml);
  }
}


##############################################################################
# Method: handle_options()
#
# Helper routine for both XMLin() and XMLout().  Both routines handle their
# first argument and assume all other args are options handled by this routine.
# Saves a hash of options in $self->{opt}.
#
# If default options were passed to the constructor, they will be retrieved
# here and merged with options supplied to the method call.
#
# First argument should be the string 'in' or the string 'out'.
#
# Remaining arguments should be name=>value pairs.  Sets up default values
# for options not supplied.  Unrecognised options are a fatal error.
#

sub handle_options  {
  my $self = shift;
  my $dirn = shift;       # 'in' for XMLin(), anything else is treated as 'out'


  # Determine valid options based on context
  # (the hash values are the option names themselves, so they are all true)

  my %known_opt;
  if($dirn eq 'in') {
    @known_opt{@KnownOptIn} = @KnownOptIn;
  }
  else {
    @known_opt{@KnownOptOut} = @KnownOptOut;
  }


  # Store supplied options in hashref and weed out invalid ones

  if(@_ % 2) {
    croak "Options must be name=>value pairs (odd number supplied)";
  }
  my %raw_opt  = @_;
  my $opt      = {};
  $self->{opt} = $opt;

  # Option names are matched case-insensitively and with underscores removed

  while(my($key, $val) = each %raw_opt) {
    my $lkey = lc($key);
    $lkey =~ s/_//g;
    croak "Unrecognised option: $key" unless($known_opt{$lkey});
    $opt->{$lkey} = $val;
  }


  # Merge in options passed to constructor
  # (only those valid for this direction and not overridden in this call)

  foreach (keys(%known_opt)) {
    unless(exists($opt->{$_})) {
      if(exists($self->{def_opt}->{$_})) {
        $opt->{$_} = $self->{def_opt}->{$_};
      }
    }
  }


  # Set sensible defaults if not supplied

  # An explicit undef rootname becomes the empty string

  if(exists($opt->{rootname})) {
    unless(defined($opt->{rootname})) {
      $opt->{rootname} = '';
    }
  }
  else {
    $opt->{rootname} = $DefRootName;
  }

  # xmldecl => 1 selects the default declaration; other true values are
  # left as supplied

  if($opt->{xmldecl}  and  $opt->{xmldecl} eq '1') {
    $opt->{xmldecl} = $DefXmlDecl;
  }

  # A leading '-' on the content key is stripped and recorded as the
  # 'collapseagain' flag, which array_to_hash() checks later

  if(exists($opt->{contentkey})) {
    if($opt->{contentkey} =~ m{^-(.*)$}) {
      $opt->{contentkey} = $1;
      $opt->{collapseagain} = 1;
    }
  }
  else {
    $opt->{contentkey} = $DefContentKey;
  }

  # Accept the American spelling as an alias; the option defaults to 0

  unless(exists($opt->{normalisespace})) {
    $opt->{normalisespace} = $opt->{normalizespace};
  }
  $opt->{normalisespace} = 0 unless(defined($opt->{normalisespace}));

  # Cleanups for values assumed to be arrays later

  if($opt->{searchpath}) {
    unless(ref($opt->{searchpath})) {
      $opt->{searchpath} = [ $opt->{searchpath} ];
    }
  }
  else  {
    $opt->{searchpath} = [ ];
  }

  if($opt->{cache}  and !ref($opt->{cache})) {
    $opt->{cache} = [ $opt->{cache} ];
  }
  if($opt->{cache}) {
    # Scheme names are lower-cased in place (this modifies the elements of
    # the array the option refers to)
    $_ = lc($_) foreach (@{$opt->{cache}});
  }

  if(exists($opt->{parseropts})) {
    if($^W) {
      carp "Warning: " .
           "'ParserOpts' is deprecated, contact the author if you need it";
    }
  }
  else {
    $opt->{parseropts} = [ ];
  }


  # Special cleanup for {forcearray} which could be regex, arrayref or boolean
  # or left to default to 0
  #
  # After this section {forcearray} is one of: 0, 1, or a hashref mapping
  # element names to 1 with any regexes collected under the '_regex' key

  if(exists($opt->{forcearray})) {
    if(ref($opt->{forcearray}) eq 'Regexp') {
      $opt->{forcearray} = [ $opt->{forcearray} ];
    }

    if(ref($opt->{forcearray}) eq 'ARRAY') {
      my @force_list = @{$opt->{forcearray}};
      if(@force_list) {
        $opt->{forcearray} = {};
        foreach my $tag (@force_list) {
          if(ref($tag) eq 'Regexp') {
            push @{$opt->{forcearray}->{_regex}}, $tag;
          }
          else {
            $opt->{forcearray}->{$tag} = 1;
          }
        }
      }
      else {
        $opt->{forcearray} = 0;
      }
    }
    else {
      $opt->{forcearray} = ( $opt->{forcearray} ? 1 : 0 );
    }
  }
  else {
    if($StrictMode  and  $dirn eq 'in') {
      croak "No value specified for 'ForceArray' option in call to XML$dirn()";
    }
    $opt->{forcearray} = 0;
  }


  # Special cleanup for {keyattr} which could be arrayref or hashref or left
  # to default to arrayref

  if(exists($opt->{keyattr}))  {
    if(ref($opt->{keyattr})) {
      if(ref($opt->{keyattr}) eq 'HASH') {

        # Make a copy so we can mess with it

        $opt->{keyattr} = { %{$opt->{keyattr}} };


        # Convert keyattr => { elem => '+attr' }
        # to keyattr => { elem => [ 'attr', '+' ] }

        foreach my $el (keys(%{$opt->{keyattr}})) {
          if($opt->{keyattr}->{$el} =~ /^(\+|-)?(.*)$/) {
            $opt->{keyattr}->{$el} = [ $2, ($1 ? $1 : '') ];

            # In strict mode every element named in KeyAttr must also be
            # covered by ForceArray (either globally or by name)

            if($StrictMode  and  $dirn eq 'in') {
              next if($opt->{forcearray} == 1);
              next if(ref($opt->{forcearray}) eq 'HASH'
                      and $opt->{forcearray}->{$el});
              croak "<$el> set in KeyAttr but not in ForceArray";
            }
          }
          else {
            delete($opt->{keyattr}->{$el}); # Never reached (famous last words?)
          }
        }
      }
      else {
        # Any other reference is dereferenced as an array; an empty list
        # removes the option altogether
        if(@{$opt->{keyattr}} == 0) {
          delete($opt->{keyattr});
        }
      }
    }
    else {
      $opt->{keyattr} = [ $opt->{keyattr} ];
    }
  }
  else  {
    if($StrictMode) {
      croak "No value specified for 'KeyAttr' option in call to XML$dirn()";
    }
    $opt->{keyattr} = [ @DefKeyAttr ];
  }


  # Special cleanup for {valueattr} which could be arrayref or hashref
  #
  # The arrayref form is moved to {valueattrlist} as a lookup hash and
  # {valueattr} itself is removed; any other form is left under {valueattr}

  if(exists($opt->{valueattr})) {
    if(ref($opt->{valueattr}) eq 'ARRAY') {
      $opt->{valueattrlist} = {};
      $opt->{valueattrlist}->{$_} = 1 foreach(@{ delete $opt->{valueattr} });
    }
  }

  # make sure there's nothing weird in {grouptags}

  if($opt->{grouptags} and !UNIVERSAL::isa($opt->{grouptags}, 'HASH')) {
    croak "Illegal value for 'GroupTags' option - expected a hashref";
  }


  # Check the {variables} option is valid and initialise variables hash

  if($opt->{variables} and !UNIVERSAL::isa($opt->{variables}, 'HASH')) {
    croak "Illegal value for 'Variables' option - expected a hashref";
  }

  # The variable table is seeded from a copy of {variables}, or starts empty
  # when only {varattr} is set; if neither is set it is not touched here

  if($opt->{variables}) {
    $self->{_var_values} = { %{$opt->{variables}} };
  }
  elsif($opt->{varattr}) {
    $self->{_var_values} = {};
  }

}


##############################################################################
# Method: find_xml_file()
#
# Helper routine for XMLin().
# Takes a filename, and a list of directories, attempts to locate the file in
# the directories listed.
# Returns a full pathname on success; croaks on failure.
#

sub find_xml_file  {
  my($self, $file, @search_path) = @_;

  # A name with a directory component is only ever tried as given; a bare
  # name is tried in each search directory in turn

  my($basename) = File::Basename::fileparse($file);

  my @candidates = ($basename ne $file)
                 ? ($file)
                 : map { File::Spec->catfile($_, $file) } @search_path;

  # With no search path, a bare name is looked for relative to the current
  # directory

  push(@candidates, $file) unless(@search_path);

  foreach my $candidate (@candidates) {
    return($candidate) if(-e $candidate);
  }

  croak "File does not exist: $file" unless(@search_path);

  croak "Could not find $file in ", join(':', @search_path);
}


##############################################################################
# Method: collapse()
#
# Helper routine for XMLin().  This routine really comprises the 'smarts' (or
# value add) of this module.
#
# Takes the parse tree that XML::Parser produced from the supplied XML and
# recurses through it 'collapsing' unnecessary levels of indirection (nested
# arrays etc) to produce a data structure that is easier to work with.
#
# Elements in the original parser tree are represented as an element name
# followed by an arrayref.  The first element of the array is a hashref
# containing the attributes.  The rest of the array contains a list of any
# nested elements as name+arrayref pairs:
#
#  <element name>, [ { <attributes> }, <nested element name>, [ ... ], ... ]
#
# The special element name '0' (zero) flags text content.
#
# This routine cuts down the noise by discarding any text content consisting of
# only whitespace and then moves the nested elements into the attribute hash
# using the name of the nested element as the hash key and the collapsed
# version of the nested element as the value.  Multiple nested elements with
# the same name will initially be represented as an arrayref, but this may be
# 'folded' into a hashref depending on the value of the keyattr option.
#
# Arguments: the object, the attribute hashref for the current element, then
# the element's children as alternating name, content pairs (content is an
# arrayref for a nested element or a string for text).
#
# Returns the collapsed value for the element: usually the attribute hashref
# with the children folded in, but possibly a plain string, an arrayref, ''
# or undef depending on the content and the options in force.
#

sub collapse {
  my $self = shift;


  # Start with the hash of attributes

  my $attr  = shift;
  if($self->{opt}->{noattr}) {                    # Discard if 'noattr' set
    $attr = {};
  }
  elsif($self->{opt}->{normalisespace} == 2) {
    while(my($key, $value) = each %$attr) {
      $attr->{$key} = $self->normalise_space($value)
    }
  }


  # Do variable substitutions
  # (every ${name} in an attribute value is replaced via get_var())

  if(my $var = $self->{_var_values}) {
    while(my($key, $val) = each(%$attr)) {
      $val =~ s{\$\{(\w+)\}}{ $self->get_var($1) }ge;
      $attr->{$key} = $val;
    }
  }


  # Roll up 'value' attributes (but only if no nested elements)

  if(!@_  and  keys %$attr == 1) {
    my($k) = keys %$attr;
    if($self->{opt}->{valueattrlist}  and $self->{opt}->{valueattrlist}->{$k}) {
      return $attr->{$k};
    }
  }


  # Add any nested elements

  my($key, $val);
  while(@_) {
    $key = shift;
    $val = shift;

    if(ref($val)) {
      # Nested element: collapse it recursively
      $val = $self->collapse(@$val);
      next if(!defined($val)  and  $self->{opt}->{suppressempty});
    }
    elsif($key eq '0') {
      next if($val =~ m{^\s*$}s);  # Skip all whitespace content

      $val = $self->normalise_space($val)
        if($self->{opt}->{normalisespace} == 2);

      # do variable substitutions

      if(my $var = $self->{_var_values}) {
        $val =~ s{\$\{(\w+)\}}{ $self->get_var($1) }ge;
      }


      # look for variable definitions

      if(my $var = $self->{opt}->{varattr}) {
        if(exists $attr->{$var}) {
          $self->set_var($attr->{$var}, $val);
        }
      }


      # Collapse text content in element with no attributes to a string

      if(!%$attr  and  !@_) {
        return($self->{opt}->{forcecontent} ?
          { $self->{opt}->{contentkey} => $val } : $val
        );
      }
      $key = $self->{opt}->{contentkey};
    }


    # Combine duplicate attributes into arrayref if required

    if(exists($attr->{$key})) {
      if(UNIVERSAL::isa($attr->{$key}, 'ARRAY')) {
        push(@{$attr->{$key}}, $val);
      }
      else {
        $attr->{$key} = [ $attr->{$key}, $val ];
      }
    }
    elsif(defined($val)  and  UNIVERSAL::isa($val, 'ARRAY')) {
      # Wrap an array value so it is not mistaken for a list of repeats
      $attr->{$key} = [ $val ];
    }
    else {
      # First occurrence: store as a one-element array if forcearray applies
      # to this name (globally, by name, or by regex), otherwise as a scalar
      if( $key ne $self->{opt}->{contentkey}
          and (
            ($self->{opt}->{forcearray} == 1)
            or (
              (ref($self->{opt}->{forcearray}) eq 'HASH')
              and (
                $self->{opt}->{forcearray}->{$key}
                or (grep $key =~ $_, @{$self->{opt}->{forcearray}->{_regex}})
              )
            )
          )
        ) {
        $attr->{$key} = [ $val ];
      }
      else {
        $attr->{$key} = $val;
      }
    }

  }


  # Turn arrayrefs into hashrefs if key fields present

  if($self->{opt}->{keyattr}) {
    while(($key,$val) = each %$attr) {
      if(defined($val)  and  UNIVERSAL::isa($val, 'ARRAY')) {
        $attr->{$key} = $self->array_to_hash($key, $val);
      }
    }
  }


  # disintermediate grouped tags
  # (a hash whose only key is the configured child name for this group is
  # replaced by that child's value)

  if($self->{opt}->{grouptags}) {
    while(my($key, $val) = each(%$attr)) {
      next unless(UNIVERSAL::isa($val, 'HASH') and (keys %$val == 1));
      next unless(exists($self->{opt}->{grouptags}->{$key}));

      my($child_key, $child_val) =  %$val;

      if($self->{opt}->{grouptags}->{$key} eq $child_key) {
        $attr->{$key}= $child_val;
      }
    }
  }


  # Fold hashes containing a single anonymous array up into just the array

  my $count = scalar keys %$attr;
  if($count == 1
     and  exists $attr->{anon}
     and  UNIVERSAL::isa($attr->{anon}, 'ARRAY')
  ) {
    return($attr->{anon});
  }


  # Do the right thing if hash is empty, otherwise just return it
  # (suppressempty => '' yields an empty string; any other setting of the
  # option, including undef, yields undef)

  if(!%$attr  and  exists($self->{opt}->{suppressempty})) {
    if(defined($self->{opt}->{suppressempty})  and
       $self->{opt}->{suppressempty} eq '') {
      return('');
    }
    return(undef);
  }


  # Roll up named elements with named nested 'value' attributes

  if($self->{opt}->{valueattr}) {
    while(my($key, $val) = each(%$attr)) {
      next unless($self->{opt}->{valueattr}->{$key});
      next unless(UNIVERSAL::isa($val, 'HASH') and (keys %$val == 1));
      my($k) = keys %$val;
      next unless($k eq $self->{opt}->{valueattr}->{$key});
      $attr->{$key} = $val->{$k};
    }
  }

  return($attr)

}


##############################################################################
# Method: set_var()
#
# Called when a variable definition is encountered in the XML.  (A variable
# definition looks like <element attrname="name">value</element> where
# attrname matches the varattr setting).
#
# Arguments: the object, the variable name and its value.
#

sub set_var {
  my($self, $name, $value) = @_;

  $self->{_var_values}->{$name} = $value;
}


##############################################################################
# Method: get_var()
#
# Called during variable substitution to get the value for the named variable.
#
# Returns the stored value if it is defined; otherwise returns the original
# '${name}' text so that unknown variables are left untouched.
#

sub get_var {
  my($self, $name) = @_;

  my $value = $self->{_var_values}->{$name};
  return $value if(defined($value));

  return '${' . $name . '}';
}


##############################################################################
# Method: normalise_space()
#
# Strips leading and trailing whitespace and collapses sequences of whitespace
# characters to a single space.
#
# Note: only runs of two or more whitespace characters are replaced; a lone
# whitespace character between words is kept as it is.
#

sub normalise_space {
  my($self, $text) = @_;

  $text =~ s/^\s+//s;
  $text =~ s/\s+$//s;
  $text =~ s/\s\s+/ /sg;

  return $text;
}


##############################################################################
# Method: array_to_hash()
#
# Helper routine for collapse().
# Attempts to 'fold' an array of hashes into an hash of hashes.  Returns a
# reference to the hash on success or the original array if folding is
# not possible.  Behaviour is controlled by 'keyattr' option.
#
# Arguments: the object, the element name and the arrayref to fold.
#

sub array_to_hash {
  my $self     = shift;
  my $name     = shift;
  my $arrayref = shift;

  my $hashref  = {};

  my($i, $key, $val, $flag);


  # Handle keyattr => { .... }
  # (each entry was normalised by handle_options() to [ attrname, flag ]
  # where flag is '+', '-' or '')

  if(ref($self->{opt}->{keyattr}) eq 'HASH') {
    return($arrayref) unless(exists($self->{opt}->{keyattr}->{$name}));
    ($key, $flag) = @{$self->{opt}->{keyattr}->{$name}};
    for($i = 0; $i < @$arrayref; $i++)  {
      if(UNIVERSAL::isa($arrayref->[$i], 'HASH') and
         exists($arrayref->[$i]->{$key})
      ) {
        $val = $arrayref->[$i]->{$key};
        if(ref($val)) {
          if($StrictMode) {
            croak "<$name> element has non-scalar '$key' key attribute";
          }
          if($^W) {
            carp "Warning: <$name> element has non-scalar '$key' key attribute";
          }
          return($arrayref);
        }
        $val = $self->normalise_space($val)
          if($self->{opt}->{normalisespace} == 1);

        # Store a shallow copy under the key value.  With the '-' flag the
        # key is also kept under "-attrname"; unless the flag is '+' the
        # original key attribute is removed from the copy.

        $hashref->{$val} = { %{$arrayref->[$i]} };
        $hashref->{$val}->{"-$key"} = $hashref->{$val}->{$key} if($flag eq '-');
        delete $hashref->{$val}->{$key} unless($flag eq '+');
      }
      else {
        croak "<$name> element has no '$key' key attribute" if($StrictMode);
        carp "Warning: <$name> element has no '$key' key attribute" if($^W);
        return($arrayref);
      }
    }
  }


  # Or assume keyattr => [ .... ]
  # (for each element the first listed key with a defined value is used)

  else {
    ELEMENT: for($i = 0; $i < @$arrayref; $i++)  {
      return($arrayref) unless(UNIVERSAL::isa($arrayref->[$i], 'HASH'));

      foreach $key (@{$self->{opt}->{keyattr}}) {
        if(defined($arrayref->[$i]->{$key}))  {
          $val = $arrayref->[$i]->{$key};
          return($arrayref) if(ref($val));
          $val = $self->normalise_space($val)
            if($self->{opt}->{normalisespace} == 1);
          $hashref->{$val} = { %{$arrayref->[$i]} };
          delete $hashref->{$val}->{$key};
          next ELEMENT;
        }
      }

      return($arrayref);    # No keyfield matched
    }
  }

  # collapse any hashes which now only have a 'content' key

  if($self->{opt}->{collapseagain}) {
    $hashref = $self->collapse_content($hashref);
  }

  return($hashref);
}


##############################################################################
# Method: collapse_content()
#
# Helper routine for array_to_hash
#
# Arguments expected are:
# - an XML::Simple object
# - a hashref
# the hashref is a former array, turned into a hash by array_to_hash because
# of the presence of key attributes
# at this point collapse_content avoids over-complicated structures like
# dir => { libexecdir    => { content => '$exec_prefix/libexec' },
#          localstatedir => { content => '$prefix' },
#        }
# into
# dir => { libexecdir    => '$exec_prefix/libexec',
#          localstatedir => '$prefix',
#        }
#
# The hashref is modified in place and returned.  It is returned unchanged
# unless every value is a hash containing nothing but the content key.

sub collapse_content {
  my $self       = shift;
  my $hashref    = shift;

  my $contentkey = $self->{opt}->{contentkey};

  # first go through the values,checking that they are fit to collapse
  foreach my $val (values %$hashref) {
    return $hashref unless (     (ref($val) eq 'HASH')
                             and (keys %$val == 1)
                             and (exists $val->{$contentkey})
                           );
  }

  # now collapse them
  foreach my $key (keys %$hashref) {
    $hashref->{$key}=  $hashref->{$key}->{$contentkey};
  }

  return $hashref;
}


##############################################################################
# Method: value_to_xml()
#
# Helper routine for XMLout() - recurses through a data structure
building up
# and returning an XML representation of that structure as a string.
#
# Arguments expected are:
# - the data structure to be encoded (usually a reference)
# - the XML tag name to use for this item
# - a string of spaces for use as the current indent level
#
# Behaviour visible in the code below:
# - a plain scalar becomes <name>escaped text</name> (or just the text
#   followed by a newline when no name is given)
# - a hashref becomes one element; scalar values become attributes (or the
#   text content, for the content key) and reference values become nested
#   elements; keys with a leading '-' are skipped
# - an arrayref becomes one <name> element per item
# - croaks on circular structures and on references that are neither HASH
#   nor ARRAY based
#
# NOTE(review): this file reached us with whitespace collapsed, so the exact
# number of spaces inside the indent string literals ("$indent ") may not
# match upstream - confirm before relying on the pretty-printed layout.
#

sub value_to_xml {
  my $self = shift;


  # Grab the other arguments

  my($ref, $name, $indent) = @_;

  my $named = (defined($name) and $name ne '' ? 1 : 0);

  my $nl = "\n";

  my $is_root = $indent eq '' ? 1 : 0;   # Warning, dirty hack!
  if($self->{opt}->{noindent}) {
    $indent = '';
    $nl     = '';
  }


  # Convert to XML

  if(ref($ref)) {
    # Track the chain of references currently being encoded so that a
    # structure which contains itself is reported instead of recursing forever
    croak "circular data structures not supported"
      if(grep($_ == $ref, @{$self->{_ancestors}}));
    push @{$self->{_ancestors}}, $ref;
  }
  else {
    if($named) {
      return(join('',
              $indent, '<', $name, '>',
              ($self->{opt}->{noescape} ? $ref : $self->escape_value($ref)),
              '</', $name, ">", $nl
            ));
    }
    else {
      return("$ref$nl");
    }
  }


  # Unfold hash to array if possible

  if(UNIVERSAL::isa($ref, 'HASH')      # It is a hash
     and keys %$ref                    # and it's not empty
     and $self->{opt}->{keyattr}       # and folding is enabled
     and !$is_root                     # and its not the root element
  ) {
    $ref = $self->hash_to_array($name, $ref);
  }


  my @result = ();
  my($key, $value);


  # Handle hashrefs

  if(UNIVERSAL::isa($ref, 'HASH')) {

    # Reintermediate grouped values if applicable

    if($self->{opt}->{grouptags}) {
      $ref = $self->copy_hash($ref);   # work on a copy, not the caller's data
      while(my($key, $val) = each %$ref) {
        if($self->{opt}->{grouptags}->{$key}) {
          $ref->{$key} = { $self->{opt}->{grouptags}->{$key} => $val };
        }
      }
    }


    # Scan for namespace declaration attributes

    my $nsdecls = '';
    my $default_ns_uri;
    if($self->{nsup}) {
      $ref = $self->copy_hash($ref);
      $self->{nsup}->push_context();

      # Look for default namespace declaration first

      if(exists($ref->{xmlns})) {
        $self->{nsup}->declare_prefix('', $ref->{xmlns});
        $nsdecls .= qq( xmlns="$ref->{xmlns}");
        delete($ref->{xmlns});
      }
      $default_ns_uri = $self->{nsup}->get_uri('');


      # Then check all the other keys

      foreach my $qname (keys(%$ref)) {
        my($uri, $lname) = $self->{nsup}->parse_jclark_notation($qname);
        if($uri) {
          if($uri eq $xmlns_ns) {
            $self->{nsup}->declare_prefix($lname, $ref->{$qname});
            $nsdecls .= qq( xmlns:$lname="$ref->{$qname}");
            delete($ref->{$qname});
          }
        }
      }

      # Translate any remaining Clarkian names

      foreach my $qname (keys(%$ref)) {
        my($uri, $lname) = $self->{nsup}->parse_jclark_notation($qname);
        if($uri) {
          if($default_ns_uri  and  $uri eq $default_ns_uri) {
            $ref->{$lname} = $ref->{$qname};
            delete($ref->{$qname});
          }
          else {
            my $prefix = $self->{nsup}->get_prefix($uri);
            unless($prefix) {
              # $self->{nsup}->declare_prefix(undef, $uri);
              # $prefix = $self->{nsup}->get_prefix($uri);
              $prefix = $self->{ns_prefix}++;
              $self->{nsup}->declare_prefix($prefix, $uri);
              $nsdecls .= qq( xmlns:$prefix="$uri");
            }
            $ref->{"$prefix:$lname"} = $ref->{$qname};
            delete($ref->{$qname});
          }
        }
      }
    }


    my @nested = ();
    my $text_content = undef;
    if($named) {
      push @result, $indent, '<', $name, $nsdecls;
    }

    if(keys %$ref) {
      my $first_arg = 1;
      foreach my $key ($self->sorted_keys($name, $ref)) {
        my $value = $ref->{$key};
        next if(substr($key, 0, 1) eq '-');   # '-' keys are never emitted
        if(!defined($value)) {
          next if $self->{opt}->{suppressempty};
          # Warn unless suppressempty was explicitly supplied as undef
          unless(exists($self->{opt}->{suppressempty})
             and !defined($self->{opt}->{suppressempty})
          ) {
            carp 'Use of uninitialized value' if($^W);
          }
          if($key eq $self->{opt}->{contentkey}) {
            $text_content = '';
          }
          else {
            $value = exists($self->{opt}->{suppressempty}) ? {} : '';
          }
        }

        if(!ref($value)
           and $self->{opt}->{valueattr}
           and $self->{opt}->{valueattr}->{$key}
        ) {
          $value = { $self->{opt}->{valueattr}->{$key} => $value };
        }

        if(ref($value)  or  $self->{opt}->{noattr}) {
          push @nested,
            $self->value_to_xml($value, $key, "$indent ");
        }
        else {
          $value = $self->escape_value($value) unless($self->{opt}->{noescape});
          if($key eq $self->{opt}->{contentkey}) {
            $text_content = $value;
          }
          else {
            push @result, "\n$indent " . ' ' x length($name)
              if($self->{opt}->{attrindent}  and  !$first_arg);
            push @result, ' ', $key, '="', $value , '"';
            $first_arg = 0;
          }
        }
      }
    }
    else {
      $text_content = '';
    }

    if(@nested  or  defined($text_content)) {
      if($named) {
        push @result, ">";
        if(defined($text_content)) {
          push @result, $text_content;
          $nested[0] =~ s/^\s+// if(@nested);
        }
        else {
          push @result, $nl;
        }
        if(@nested) {
          push @result, @nested, $indent;
        }
        push @result, '</', $name, ">", $nl;
      }
      else {
        push @result, @nested;             # Special case if no root elements
      }
    }
    else {
      push @result, " />", $nl;
    }
    $self->{nsup}->pop_context() if($self->{nsup});
  }


  # Handle arrayrefs

  elsif(UNIVERSAL::isa($ref, 'ARRAY')) {
    foreach $value (@$ref) {
      if(!ref($value)) {
        push @result,
             $indent, '<', $name, '>',
             ($self->{opt}->{noescape} ? $value : $self->escape_value($value)),
             '</', $name, ">$nl";
      }
      elsif(UNIVERSAL::isa($value, 'HASH')) {
        push @result, $self->value_to_xml($value, $name, $indent);
      }
      else {
        # Anything else (eg: a nested array) is wrapped in <name> and its
        # items are emitted as <anon> elements
        push @result,
               $indent, '<', $name, ">$nl",
               $self->value_to_xml($value, 'anon', "$indent "),
               $indent, '</', $name, ">$nl";
      }
    }
  }

  else {
    croak "Can't encode a value of type: " . ref($ref);
  }


  pop @{$self->{_ancestors}} if(ref($ref));

  return(join('', @result));
}


##############################################################################
# Method: sorted_keys()
#
# Returns the keys of the referenced hash sorted into alphabetical order, but
# with the 'key' key (as in KeyAttr) first, if there is one.
#
# Arguments expected are:
# - the name of the element the hash belongs to (used to look up a
#   per-element key attribute when KeyAttr is a hashref)
# - the hashref whose keys are wanted
#
# When the NoSort option is set the keys are returned unsorted, in whatever
# order keys() yields them.
#

sub sorted_keys {
  my($self, $name, $ref) = @_;

  return keys %$ref if $self->{opt}->{nosort};

  my %hash = %$ref;                    # scratch copy - entries get deleted
  my $keyattr = $self->{opt}->{keyattr};

  my @key;

  if(ref $keyattr eq 'HASH') {
    if(exists $keyattr->{$name} and exists $hash{$keyattr->{$name}->[0]}) {
      push @key, $keyattr->{$name}->[0];
      delete $hash{$keyattr->{$name}->[0]};
    }
  }
  elsif(ref $keyattr eq 'ARRAY') {
    # Only the first listed key attribute that is present gets promoted
    foreach (@{$keyattr}) {
      if(exists $hash{$_}) {
        push @key, $_;
        delete $hash{$_};
        last;
      }
    }
  }

  return(@key, sort keys %hash);
}

##############################################################################
# Method: escape_value()
#
# Helper routine for automatically escaping values for XMLout().
# Expects a scalar data value.  Returns escaped version.
#
# The characters '&', '<', '>' and '"' are replaced by the entities
# '&amp;', '&lt;', '&gt;' and '&quot;'.  An undefined value yields the empty
# string.  If the NumericEscape option is set to a true value the result is
# additionally passed through numeric_escape().
#

sub escape_value {
  my($self, $data) = @_;

  return '' unless(defined($data));

  # '&' must be handled first, otherwise the ampersands introduced by the
  # replacements below would themselves be escaped a second time
  $data =~ s/&/&amp;/sg;
  $data =~ s/</&lt;/sg;
  $data =~ s/>/&gt;/sg;
  $data =~ s/"/&quot;/sg;

  my $level = $self->{opt}->{numericescape} or return $data;

  return $self->numeric_escape($data, $level);
}

##############################################################################
# Method: numeric_escape()
#
# Replaces 'high' characters in the supplied string with numeric character
# references of the form '&#NNN;' (NNN being the decimal code point).
#
# With the NumericEscape option equal to '2' every character above 0x7F is
# replaced; with any other value only characters above 0xFF are replaced.
#
# Note: a level is accepted as the third argument, but the decision is made
# from the NumericEscape option itself, not from that argument.
#

sub numeric_escape {
  my($self, $data, $level) = @_;

  use utf8; # required for 5.6

  if($self->{opt}->{numericescape} eq '2') {
    $data =~ s/([^\x00-\x7F])/'&#' . ord($1) . ';'/gse;
  }
  else {
    $data =~ s/([^\x00-\xFF])/'&#' . ord($1) . ';'/gse;
  }

  return $data;
}


##############################################################################
# Method: hash_to_array()
#
# Helper routine for value_to_xml().
# Attempts to 'unfold' a hash of hashes into an array of hashes.
Returns a
# reference to the array on success or the original hash if unfolding is
# not possible.
#
# Arguments expected are:
# - the name of the parent element (used to look up the key attribute when
#   KeyAttr is a hashref)
# - the hashref to unfold
#
# Each inner hash is copied and the outer hash key is stored in the copy under
# the key attribute name.  Unless NoSort is set, items appear in sorted order
# of the outer keys.
#

sub hash_to_array {
  my($self, $parent, $hashref) = @_;

  my @outer_keys = keys %$hashref;
  @outer_keys    = sort @outer_keys unless($self->{opt}->{nosort});

  my @unfolded;

  foreach my $outer (@outer_keys) {
    my $inner = $hashref->{$outer};

    # One non-hash value is enough to make the whole hash unsuitable
    return($hashref) unless(UNIVERSAL::isa($inner, 'HASH'));

    if(ref($self->{opt}->{keyattr}) eq 'HASH') {
      # Per-element form: give up if no key attribute is defined for $parent
      return($hashref) unless(defined($self->{opt}->{keyattr}->{$parent}));
      push @unfolded, $self->copy_hash(
        $inner, $self->{opt}->{keyattr}->{$parent}->[0] => $outer
      );
    }
    else {
      # List form: the first listed attribute name receives the outer key
      push @unfolded, { $self->{opt}->{keyattr}->[0] => $outer, %$inner };
    }
  }

  return(\@unfolded);
}


##############################################################################
# Method: copy_hash()
#
# Helper routine for hash_to_array().  When unfolding a hash of hashes into
# an array of hashes, we need to copy the key from the outer hash into the
# inner hash.  This routine makes a copy of the original hash so we don't
# destroy the original data structure.  You might wish to override this
# method if you're using tied hashes and don't want them to get untied.
#
# Any extra key => value pairs supplied after the hashref are added to the
# copy; where a key occurs in both, the value from the original hash wins.
#

sub copy_hash {
  my $self   = shift;
  my $source = shift;

  my %copy = (@_, %$source);

  return \%copy;
}

##############################################################################
# Methods required for building trees from SAX events
##############################################################################

# Called at the start of a parse: makes sure options have been processed and
# sets up an empty tree, with the 'current list' pointing at its root.

sub start_document {
  my($self) = @_;

  if(!$self->{opt}) {
    $self->handle_options('in');
  }

  $self->{lists} = [];

  my $root = [];
  $self->{tree}    = $root;
  $self->{curlist} = $root;
}


# Called for each opening tag: appends 'name => [ { attributes } ]' to the
# current list and makes that new list current.  With the NSExpand option,
# names are written as '{uri}localname'.

sub start_element {
  my($self, $element) = @_;

  my $tag = $element->{Name};
  if($self->{opt}->{nsexpand}) {
    $tag = $element->{LocalName} || '';
    $tag = '{' . $element->{NamespaceURI} . '}' . $tag
      if($element->{NamespaceURI});
  }

  my %attrs;
  if($element->{Attributes}) {  # Might be undef
    foreach my $attr (values %{$element->{Attributes}}) {
      my $attr_name;
      if($self->{opt}->{nsexpand}) {
        $attr_name = $attr->{LocalName} || '';
        $attr_name = '{' . $attr->{NamespaceURI} . '}' . $attr_name
          if($attr->{NamespaceURI});
        $attr_name = 'xmlns' if($attr_name eq $bad_def_ns_jcn);
      }
      else {
        $attr_name = $attr->{Name};
      }
      $attrs{$attr_name} = $attr->{Value};
    }
  }

  my $children = [ \%attrs ];
  push @{ $self->{lists} }, $self->{curlist};   # remember where to return to
  push @{ $self->{curlist} }, $tag, $children;
  $self->{curlist} = $children;
}


# Called for each chunk of text: text is stored in the current list as a
# '0 => text' pair; consecutive chunks are merged into a single pair.

sub characters {
  my($self, $chars) = @_;

  my $siblings = $self->{curlist};
  my $last     = $#$siblings;

  if($last > 0 and $siblings->[$last - 1] eq '0') {
    $siblings->[$last] .= $chars->{Data};
  }
  else {
    push @$siblings, 0, $chars->{Data};
  }
}


# Called for each closing tag: the enclosing element's list becomes current.

sub end_element {
  my($self) = @_;

  my $enclosing = pop @{ $self->{lists} };
  $self->{curlist} = $enclosing;
}


# Called at the end of a parse: discards the working state and returns the
# tree - raw if 'nocollapse' is set, otherwise collapsed, and passed through
# the DataHandler routine if one was supplied.

sub end_document {
  my($self) = @_;

  delete @{$self}{qw(curlist lists)};
  my $tree = delete $self->{tree};


  # Return tree as-is to XMLin()

  return($tree) if($self->{nocollapse});


  # Or collapse it before returning it to SAX parser class

  if($self->{opt}->{keeproot}) {
    $tree = $self->collapse({}, @$tree);
  }
  else {
    $tree = $self->collapse(@{$tree->[1]});
  }

  my $handler = $self->{opt}->{datahandler};
  return($handler->($self, $tree)) if($handler);

  return($tree);
}

*xml_in  = \&XMLin;
*xml_out = \&XMLout;

1;

__END__

=head1 QUICK START

Say you have a script called B and a file of configuration options
called B containing this:

  <config logdir="/var/log/foo/" debugfile="/tmp/foo.debug">
    <server name="sahara" osname="solaris" osversion="2.6">
      <address>10.0.0.101</address>
      <address>10.0.1.101</address>
    </server>
    <server name="gobi" osname="irix" osversion="6.5">
      <address>10.0.0.102</address>
    </server>
    <server name="kalahari" osname="linux" osversion="2.0.34">
      <address>10.0.0.103</address>
      <address>10.0.1.103</address>
    </server>
  </config>
+ +The following lines of code in B: + + use XML::Simple; + + my $config = XMLin(); + +will 'slurp' the configuration options into the hashref $config (because no +arguments are passed to C the name and location of the XML file will +be inferred from name and location of the script). You can dump out the +contents of the hashref using Data::Dumper: + + use Data::Dumper; + + print Dumper($config); + +which will produce something like this (formatting has been adjusted for +brevity): + + { + 'logdir' => '/var/log/foo/', + 'debugfile' => '/tmp/foo.debug', + 'server' => { + 'sahara' => { + 'osversion' => '2.6', + 'osname' => 'solaris', + 'address' => [ '10.0.0.101', '10.0.1.101' ] + }, + 'gobi' => { + 'osversion' => '6.5', + 'osname' => 'irix', + 'address' => '10.0.0.102' + }, + 'kalahari' => { + 'osversion' => '2.0.34', + 'osname' => 'linux', + 'address' => [ '10.0.0.103', '10.0.1.103' ] + } + } + } + +Your script could then access the name of the log directory like this: + + print $config->{logdir}; + +similarly, the second address on the server 'kalahari' could be referenced as: + + print $config->{server}->{kalahari}->{address}->[1]; + +What could be simpler? (Rhetorical). + +For simple requirements, that's really all there is to it. If you want to +store your XML in a different directory or file, or pass it in as a string or +even pass it in via some derivative of an IO::Handle, you'll need to check out +L<"OPTIONS">. If you want to turn off or tweak the array folding feature (that +neat little transformation that produced $config->{server}) you'll find options +for that as well. + +If you want to generate XML (for example to write a modified version of +$config back out as XML), check out C. + +If your needs are not so simple, this may not be the module for you. In that +case, you might want to read L<"WHERE TO FROM HERE?">. 
+ +=head1 DESCRIPTION + +The XML::Simple module provides a simple API layer on top of an underlying XML +parsing module (either XML::Parser or one of the SAX2 parser modules). Two +functions are exported: C and C. Note: you can explicity +request the lower case versions of the function names: C and +C. + +The simplest approach is to call these two functions directly, but an +optional object oriented interface (see L<"OPTIONAL OO INTERFACE"> below) +allows them to be called as methods of an B object. The object +interface can also be used at either end of a SAX pipeline. + +=head2 XMLin() + +Parses XML formatted data and returns a reference to a data structure which +contains the same information in a more readily accessible form. (Skip +down to L<"EXAMPLES"> below, for more sample code). + +C accepts an optional XML specifier followed by zero or more 'name => +value' option pairs. The XML specifier can be one of the following: + +=over 4 + +=item A filename + +If the filename contains no directory components C will look for the +file in each directory in the SearchPath (see L<"OPTIONS"> below) or in the +current directory if the SearchPath option is not defined. eg: + + $ref = XMLin('/etc/params.xml'); + +Note, the filename '-' can be used to parse from STDIN. + +=item undef + +If there is no XML specifier, C will check the script directory and +each of the SearchPath directories for a file with the same name as the script +but with the extension '.xml'. Note: if you wish to specify options, you +must specify the value 'undef'. eg: + + $ref = XMLin(undef, ForceArray => 1); + +=item A string of XML + +A string containing XML (recognised by the presence of '<' and '>' characters) +will be parsed directly. eg: + + $ref = XMLin(''); + +=item An IO::Handle object + +An IO::Handle object will be read to EOF and its contents parsed. 
eg: + + $fh = new IO::File('/etc/params.xml'); + $ref = XMLin($fh); + +=back + +=head2 XMLout() + +Takes a data structure (generally a hashref) and returns an XML encoding of +that structure. If the resulting XML is parsed using C, it should +return a data structure equivalent to the original (see caveats below). + +The C function can also be used to output the XML as SAX events +see the C option and L<"SAX SUPPORT"> for more details). + +When translating hashes to XML, hash keys which have a leading '-' will be +silently skipped. This is the approved method for marking elements of a +data structure which should be ignored by C. (Note: If these items +were not skipped the key names would be emitted as element or attribute names +with a leading '-' which would not be valid XML). + +=head2 Caveats + +Some care is required in creating data structures which will be passed to +C. Hash keys from the data structure will be encoded as either XML +element names or attribute names. Therefore, you should use hash key names +which conform to the relatively strict XML naming rules: + +Names in XML must begin with a letter. The remaining characters may be +letters, digits, hyphens (-), underscores (_) or full stops (.). It is also +allowable to include one colon (:) in an element name but this should only be +used when working with namespaces (B can only usefully work with +namespaces when teamed with a SAX Parser). + +You can use other punctuation characters in hash values (just not in hash +keys) however B does not support dumping binary data. + +If you break these rules, the current implementation of C will +simply emit non-compliant XML which will be rejected if you try to read it +back in. (A later version of B might take a more proactive +approach). + +Note also that although you can nest hashes and arrays to arbitrary levels, +circular data structures are not supported and will cause C to die. 
+ +If you wish to 'round-trip' arbitrary data structures from Perl to XML and back +to Perl, then you should probably disable array folding (using the KeyAttr +option) both with C and with C. If you still don't get the +expected results, you may prefer to use L which is designed for +exactly that purpose. + +Refer to L<"WHERE TO FROM HERE?"> if C is too simple for your needs. + + +=head1 OPTIONS + +B supports a number of options (in fact as each release of +B adds more options, the module's claim to the name 'Simple' +becomes increasingly tenuous). If you find yourself repeatedly having to +specify the same options, you might like to investigate L<"OPTIONAL OO +INTERFACE"> below. + +If you can't be bothered reading the documentation, refer to +L<"STRICT MODE"> to automatically catch common mistakes. + +Because there are so many options, it's hard for new users to know which ones +are important, so here are the two you really need to know about: + +=over 4 + +=item * + +check out C because you'll almost certainly want to turn it on + +=item * + +make sure you know what the C option does and what its default value is +because it may surprise you otherwise (note in particular that 'KeyAttr' +affects both C and C) + +=back + +The option name headings below have a trailing 'comment' - a hash followed by +two pieces of metadata: + +=over 4 + +=item * + +Options are marked with 'I' if they are recognised by C and +'I' if they are recognised by C. 
+ +=item * + +Each option is also flagged to indicate whether it is: + + 'important' - don't use the module until you understand this one + 'handy' - you can skip this on the first time through + 'advanced' - you can skip this on the second time through + 'SAX only' - don't worry about this unless you're using SAX (or + alternatively if you need this, you also need SAX) + 'seldom used' - you'll probably never use this unless you were the + person that requested the feature + +=back + +The options are listed alphabetically: + +Note: option names are no longer case sensitive so you can use the mixed case +versions shown here; all lower case as required by versions 2.03 and earlier; +or you can add underscores between the words (eg: key_attr). + + +=head2 AttrIndent => 1 I<# out - handy> + +When you are using C, enable this option to have attributes printed +one-per-line with sensible indentation rather than all on one line. + +=head2 Cache => [ cache schemes ] I<# in - advanced> + +Because loading the B module and parsing an XML file can consume a +significant number of CPU cycles, it is often desirable to cache the output of +C for later reuse. + +When parsing from a named file, B supports a number of caching +schemes. The 'Cache' option may be used to specify one or more schemes (using +an anonymous array). Each scheme will be tried in turn in the hope of finding +a cached pre-parsed representation of the XML file. If no cached copy is +found, the file will be parsed and the first cache scheme in the list will be +used to save a copy of the results. The following cache schemes have been +implemented: + +=over 4 + +=item storable + +Utilises B to read/write a cache file with the same name as the +XML file but with the extension .stor + +=item memshare + +When a file is first parsed, a copy of the resulting data structure is retained +in memory in the B module's namespace. Subsequent calls to parse +the same file will return a reference to this structure. 
This cached version +will persist only for the life of the Perl interpreter (which in the case of +mod_perl for example, may be some significant time). + +Because each caller receives a reference to the same data structure, a change +made by one caller will be visible to all. For this reason, the reference +returned should be treated as read-only. + +=item memcopy + +This scheme works identically to 'memshare' (above) except that each caller +receives a reference to a new data structure which is a copy of the cached +version. Copying the data structure will add a little processing overhead, +therefore this scheme should only be used where the caller intends to modify +the data structure (or wishes to protect itself from others who might). This +scheme uses B to perform the copy. + +=back + +Warning! The memory-based caching schemes compare the timestamp on the file to +the time when it was last parsed. If the file is stored on an NFS filesystem +(or other network share) and the clock on the file server is not exactly +synchronised with the clock where your script is run, updates to the source XML +file may appear to be ignored. + +=head2 ContentKey => 'keyname' I<# in+out - seldom used> + +When text content is parsed to a hash value, this option let's you specify a +name for the hash key to override the default 'content'. So for example: + + XMLin('Text', ContentKey => 'text') + +will parse to: + + { 'one' => 1, 'text' => 'Text' } + +instead of: + + { 'one' => 1, 'content' => 'Text' } + +C will also honour the value of this option when converting a hashref +to XML. + +You can also prefix your selected key name with a '-' character to have +C try a little harder to eliminate unnecessary 'content' keys after +array folding. 
For example: + + XMLin( + 'FirstSecond', + KeyAttr => {item => 'name'}, + ForceArray => [ 'item' ], + ContentKey => '-content' + ) + +will parse to: + + { + 'item' => { + 'one' => 'First' + 'two' => 'Second' + } + } + +rather than this (without the '-'): + + { + 'item' => { + 'one' => { 'content' => 'First' } + 'two' => { 'content' => 'Second' } + } + } + +=head2 DataHandler => code_ref I<# in - SAX only> + +When you use an B object as a SAX handler, it will return a +'simple tree' data structure in the same format as C would return. If +this option is set (to a subroutine reference), then when the tree is built the +subroutine will be called and passed two arguments: a reference to the +B object and a reference to the data tree. The return value from +the subroutine will be returned to the SAX driver. (See L<"SAX SUPPORT"> for +more details). + +=head2 ForceArray => 1 I<# in - important> + +This option should be set to '1' to force nested elements to be represented +as arrays even when there is only one. Eg, with ForceArray enabled, this +XML: + + + value + + +would parse to this: + + { + 'name' => [ + 'value' + ] + } + +instead of this (the default): + + { + 'name' => 'value' + } + +This option is especially useful if the data structure is likely to be written +back out as XML and the default behaviour of rolling single nested elements up +into attributes is not desirable. + +If you are using the array folding feature, you should almost certainly enable +this option. If you do not, single nested elements will not be parsed to +arrays and therefore will not be candidates for folding to a hash. (Given that +the default value of 'KeyAttr' enables array folding, the default value of this +option should probably also have been enabled too - sorry). 
+ +=head2 ForceArray => [ names ] I<# in - important> + +This alternative (and preferred) form of the 'ForceArray' option allows you to +specify a list of element names which should always be forced into an array +representation, rather than the 'all or nothing' approach above. + +It is also possible (since version 2.05) to include compiled regular +expressions in the list - any element names which match the pattern will be +forced to arrays. If the list contains only a single regex, then it is not +necessary to enclose it in an arrayref. Eg: + + ForceArray => qr/_list$/ + +=head2 ForceContent => 1 I<# in - seldom used> + +When C parses elements which have text content as well as attributes, +the text content must be represented as a hash value rather than a simple +scalar. This option allows you to force text content to always parse to +a hash value even when there are no attributes. So for example: + + XMLin('text1text2', ForceContent => 1) + +will parse to: + + { + 'x' => { 'content' => 'text1' }, + 'y' => { 'a' => 2, 'content' => 'text2' } + } + +instead of: + + { + 'x' => 'text1', + 'y' => { 'a' => 2, 'content' => 'text2' } + } + +=head2 GroupTags => { grouping tag => grouped tag } I<# in+out - handy> + +You can use this option to eliminate extra levels of indirection in your Perl +data structure. For example this XML: + + + + /usr/bin + /usr/local/bin + /usr/X11/bin + + + +Would normally be read into a structure like this: + + { + searchpath => { + dir => [ '/usr/bin', '/usr/local/bin', '/usr/X11/bin' ] + } + } + +But when read in with the appropriate value for 'GroupTags': + + my $opt = XMLin($xml, GroupTags => { searchpath => 'dir' }); + +It will return this simpler structure: + + { + searchpath => [ '/usr/bin', '/usr/local/bin', '/usr/X11/bin' ] + } + +The grouping element (C<< >> in the example) must not contain any +attributes or elements other than the grouped element. 
+ +You can specify multiple 'grouping element' to 'grouped element' mappings in +the same hashref. If this option is combined with C, the array +folding will occur first and then the grouped element names will be eliminated. + +C will also use the grouptag mappings to re-introduce the tags around +the grouped elements. Beware though that this will occur in all places that +the 'grouping tag' name occurs - you probably don't want to use the same name +for elements as well as attributes. + +=head2 Handler => object_ref I<# out - SAX only> + +Use the 'Handler' option to have C generate SAX events rather than +returning a string of XML. For more details see L<"SAX SUPPORT"> below. + +Note: the current implementation of this option generates a string of XML +and uses a SAX parser to translate it into SAX events. The normal encoding +rules apply here - your data must be UTF8 encoded unless you specify an +alternative encoding via the 'XMLDecl' option; and by the time the data reaches +the handler object, it will be in UTF8 form regardless of the encoding you +supply. A future implementation of this option may generate the events +directly. + +=head2 KeepRoot => 1 I<# in+out - handy> + +In its attempt to return a data structure free of superfluous detail and +unnecessary levels of indirection, C normally discards the root +element name. Setting the 'KeepRoot' option to '1' will cause the root element +name to be retained. So after executing this code: + + $config = XMLin('', KeepRoot => 1) + +You'll be able to reference the tempdir as +C<$config-E{config}-E{tempdir}> instead of the default +C<$config-E{tempdir}>. + +Similarly, setting the 'KeepRoot' option to '1' will tell C that the +data structure already contains a root element name and it is not necessary to +add another. + +=head2 KeyAttr => [ list ] I<# in+out - important> + +This option controls the 'array folding' feature which translates nested +elements from an array to a hash. 
It also controls the 'unfolding' of hashes +to arrays. + +For example, this XML: + + <opt> + <user login="grep" fullname="Gary R Epstein" /> + <user login="stty" fullname="Simon T Tyson" /> + </opt> + +would, by default, parse to this: + + { + 'user' => [ + { + 'login' => 'grep', + 'fullname' => 'Gary R Epstein' + }, + { + 'login' => 'stty', + 'fullname' => 'Simon T Tyson' + } + ] + } + +If the option 'KeyAttr => "login"' were used to specify that the 'login' +attribute is a key, the same XML would parse to: + + { + 'user' => { + 'stty' => { + 'fullname' => 'Simon T Tyson' + }, + 'grep' => { + 'fullname' => 'Gary R Epstein' + } + } + } + +The key attribute names should be supplied in an arrayref if there is more +than one. C<XMLin()> will attempt to match attribute names in the order +supplied. C<XMLout()> will use the first attribute name supplied when +'unfolding' a hash into an array. + +Note 1: The default value for 'KeyAttr' is ['name', 'key', 'id']. If you do +not want folding on input or unfolding on output you must set this option +to an empty list to disable the feature. + +Note 2: If you wish to use this option, you should also enable the +C<ForceArray> option. Without 'ForceArray', a single nested element will be +rolled up into a scalar rather than an array and therefore will not be folded +(since only arrays get folded). + +=head2 KeyAttr => { list } I<# in+out - important> + +This alternative (and preferred) method of specifying the key attributes +allows more fine grained control over which elements are folded and on which +attributes. For example the option 'KeyAttr => { package => 'id' }' will cause +any package elements to be folded on the 'id' attribute. No other elements +which have an 'id' attribute will be folded at all. + +Note: C<XMLin()> will generate a warning (or a fatal error in L<"STRICT MODE">) +if this syntax is used and an element which does not have the specified key +attribute is encountered (eg: a 'package' element without an 'id' attribute, to +use the example above). Warnings will only be generated if B<-w> is in force. 
+ +Two further variations are made possible by prefixing a '+' or a '-' character +to the attribute name: + +The option 'KeyAttr => { user => "+login" }' will cause this XML: + + <opt> + <user login="grep" fullname="Gary R Epstein" /> + <user login="stty" fullname="Simon T Tyson" /> + </opt> + +to parse to this data structure: + + { + 'user' => { + 'stty' => { + 'fullname' => 'Simon T Tyson', + 'login' => 'stty' + }, + 'grep' => { + 'fullname' => 'Gary R Epstein', + 'login' => 'grep' + } + } + } + +The '+' indicates that the value of the key attribute should be copied rather +than moved to the folded hash key. + +A '-' prefix would produce this result: + + { + 'user' => { + 'stty' => { + 'fullname' => 'Simon T Tyson', + '-login' => 'stty' + }, + 'grep' => { + 'fullname' => 'Gary R Epstein', + '-login' => 'grep' + } + } + } + +As described earlier, C<XMLout()> will ignore hash keys starting with a '-'. + +=head2 NoAttr => 1 I<# in+out - handy> + +When used with C<XMLout()>, the generated XML will contain no attributes. +All hash key/values will be represented as nested elements instead. + +When used with C<XMLin()>, any attributes in the XML will be ignored. + +=head2 NoEscape => 1 I<# out - seldom used> + +By default, C<XMLout()> will translate the characters 'E<lt>', 'E<gt>', '&' and +'"' to '&lt;', '&gt;', '&amp;' and '&quot;' respectively. Use this option to +suppress escaping (presumably because you've already escaped the data in some +more sophisticated manner). + +=head2 NoIndent => 1 I<# out - seldom used> + +Set this option to 1 to disable C<XMLout()>'s default 'pretty printing' mode. +With this option enabled, the XML output will all be on one line (unless there +are newlines in the data) - this may be easier for downstream processing. + +=head2 NoSort => 1 I<# out - seldom used> + +Newer versions of XML::Simple sort elements and attributes alphabetically (*), +by default. Enable this option to suppress the sorting - possibly for +backwards compatibility. + +* Actually, sorting is alphabetical but 'key' attribute or element names (as in +'KeyAttr') sort first. 
Also, when a hash of hashes is 'unfolded', the elements +are sorted alphabetically by the value of the key field. + +=head2 NormaliseSpace => 0 | 1 | 2 I<# in - handy> + +This option controls how whitespace in text content is handled. Recognised +values for the option are: + +=over 4 + +=item * + +0 = (default) whitespace is passed through unaltered (except of course for the +normalisation of whitespace in attribute values which is mandated by the XML +recommendation) + +=item * + +1 = whitespace is normalised in any value used as a hash key (normalising means +removing leading and trailing whitespace and collapsing sequences of whitespace +characters to a single space) + +=item * + +2 = whitespace is normalised in all text content + +=back + +Note: you can spell this option with a 'z' if that is more natural for you. + +=head2 NSExpand => 1 I<# in+out handy - SAX only> + +This option controls namespace expansion - the translation of element and +attribute names of the form 'prefix:name' to '{uri}name'. For example the +element name 'xsl:template' might be expanded to: +'{http://www.w3.org/1999/XSL/Transform}template'. + +By default, C will return element names and attribute names exactly as +they appear in the XML. Setting this option to 1 will cause all element and +attribute names to be expanded to include their namespace prefix. + +I. + +This option also controls whether C performs the reverse translation +from '{uri}name' back to 'prefix:name'. The default is no translation. If +your data contains expanded names, you should set this option to 1 otherwise +C will emit XML which is not well formed. + +I to translate URIs back to prefixes>. + +=head2 NumericEscape => 0 | 1 | 2 I<# out - handy> + +Use this option to have 'high' (non-ASCII) characters in your Perl data +structure converted to numeric entities (eg: €) in the XML output. 
Three +levels are possible: + +0 - default: no numeric escaping (OK if you're writing out UTF8) + +1 - only characters above 0xFF are escaped (ie: characters in the 0x80-FF range are not escaped), possibly useful with ISO8859-1 output + +2 - all characters above 0x7F are escaped (good for plain ASCII output) + +=head2 OutputFile => I<# out - handy> + +The default behaviour of C is to return the XML as a string. If you +wish to write the XML to a file, simply supply the filename using the +'OutputFile' option. + +This option also accepts an IO handle object - especially useful in Perl 5.8.0 +and later for output using an encoding other than UTF-8, eg: + + open my $fh, '>:encoding(iso-8859-1)', $path or die "open($path): $!"; + XMLout($ref, OutputFile => $fh); + +=head2 ParserOpts => [ XML::Parser Options ] I<# in - don't use this> + +I. + +This option allows you to pass parameters to the constructor of the underlying +XML::Parser object (which of course assumes you're not using SAX). + +=head2 RootName => 'string' I<# out - handy> + +By default, when C generates XML, the root element will be named +'opt'. This option allows you to specify an alternative name. + +Specifying either undef or the empty string for the RootName option will +produce XML with no root elements. In most cases the resulting XML fragment +will not be 'well formed' and therefore could not be read back in by C. +Nevertheless, the option has been found to be useful in certain circumstances. + +=head2 SearchPath => [ list ] I<# in - handy> + +If you pass C a filename, but the filename include no directory +component, you can use this option to specify which directories should be +searched to locate the file. You might use this option to search first in the +user's home directory, then in a global directory such as /etc. + +If a filename is provided to C but SearchPath is not defined, the +file is assumed to be in the current directory. 
+ +If the first parameter to C<XMLin()> is undefined, the default SearchPath +will contain only the directory in which the script itself is located. +Otherwise the default SearchPath will be empty. + +=head2 SuppressEmpty => 1 | '' | undef I<# in+out - handy> + +This option controls what C<XMLin()> should do with empty elements (no +attributes and no content). The default behaviour is to represent them as +empty hashes. Setting this option to a true value (eg: 1) will cause empty +elements to be skipped altogether. Setting the option to 'undef' or the empty +string will cause empty elements to be represented as the undefined value or +the empty string respectively. The latter two alternatives are a little +easier to test for in your code than a hash with no keys. + +The option also controls what C<XMLout()> does with undefined values. Setting +the option to undef causes undefined values to be output as empty elements +(rather than empty attributes); it also suppresses the generation of warnings +about undefined values. Setting the option to a true value (eg: 1) causes +undefined values to be skipped altogether on output. + +=head2 ValueAttr => [ names ] I<# in - handy> + +Use this option to deal with elements which always have a single attribute and no +content. Eg: + + <opt> + <colour value="red" /> + <size value="XXL" /> + </opt> + +Setting C<< ValueAttr => [ 'value' ] >> will cause the above XML to parse to: + + { + colour => 'red', + size => 'XXL' + } + +instead of this (the default): + + { + colour => { value => 'red' }, + size => { value => 'XXL' } + } + +Note: This form of the ValueAttr option is not compatible with C<XMLout()> - +since the attribute name is discarded at parse time, the original XML cannot be +reconstructed. + +=head2 ValueAttr => { element => attribute, ... } I<# in+out - handy> + +This (preferred) form of the ValueAttr option requires you to specify both +the element and the attribute names. This is not only safer, it also allows +the original XML to be reconstructed by C<XMLout()>. 
+ +Note: You probably don't want to use this option and the NoAttr option at the +same time. + +=head2 Variables => { name => value } I<# in - handy> + +This option allows variables in the XML to be expanded when the file is read. +(there is no facility for putting the variable names back if you regenerate +XML using C). + +A 'variable' is any text of the form C<${name}> which occurs in an attribute +value or in the text content of an element. If 'name' matches a key in the +supplied hashref, C<${name}> will be replaced with the corresponding value from +the hashref. If no matching key is found, the variable will not be replaced. + +=head2 VarAttr => 'attr_name' I<# in - handy> + +In addition to the variables defined using C, this option allows +variables to be defined in the XML. A variable definition consists of an +element with an attribute called 'attr_name' (the value of the C +option). The value of the attribute will be used as the variable name and the +text content of the element will be used as the value. A variable defined in +this way will override a variable defined using the C option. For +example: + + XMLin( ' + /usr/local/apache + ${prefix} + ${exec_prefix}/bin + ', + VarAttr => 'name', ContentKey => '-content' + ); + +produces the following data structure: + + { + dir => { + prefix => '/usr/local/apache', + exec_prefix => '/usr/local/apache', + bindir => '/usr/local/apache/bin', + } + } + +=head2 XMLDecl => 1 or XMLDecl => 'string' I<# out - handy> + +If you want the output from C to start with the optional XML +declaration, simply set the option to '1'. The default XML declaration is: + + + +If you want some other string (for example to declare an encoding value), set +the value of this option to the complete string you require. 
+ + +=head1 OPTIONAL OO INTERFACE + +The procedural interface is both simple and convenient however there are a +couple of reasons why you might prefer to use the object oriented (OO) +interface: + +=over 4 + +=item * + +to define a set of default values which should be used on all subsequent calls +to C or C + +=item * + +to override methods in B to provide customised behaviour + +=back + +The default values for the options described above are unlikely to suit +everyone. The OO interface allows you to effectively override B's +defaults with your preferred values. It works like this: + +First create an XML::Simple parser object with your preferred defaults: + + my $xs = new XML::Simple(ForceArray => 1, KeepRoot => 1); + +then call C or C as a method of that object: + + my $ref = $xs->XMLin($xml); + my $xml = $xs->XMLout($ref); + +You can also specify options when you make the method calls and these values +will be merged with the values specified when the object was created. Values +specified in a method call take precedence. + +Overriding methods is a more advanced topic but might be useful if for example +you wished to provide an alternative routine for escaping character data (the +escape_value method) or for building the initial parse tree (the build_tree +method). + +Note: when called as methods, the C and C routines may be +called as C or C. The method names are aliased so the +only difference is the aesthetics. 
+ +=head1 STRICT MODE + +If you import the B routines like this: + + use XML::Simple qw(:strict); + +the following common mistakes will be detected and treated as fatal errors + +=over 4 + +=item * + +Failing to explicitly set the C option - if you can't be bothered +reading about this option, turn it off with: KeyAttr => [ ] + +=item * + +Failing to explicitly set the C option - if you can't be bothered +reading about this option, set it to the safest mode with: ForceArray => 1 + +=item * + +Setting ForceArray to an array, but failing to list all the elements from the +KeyAttr hash. + +=item * + +Data error - KeyAttr is set to say { part => 'partnum' } but the XML contains +one or more EpartE elements without a 'partnum' attribute (or nested +element). Note: if strict mode is not set but -w is, this condition triggers a +warning. + +=item * + +Data error - as above, but value of key attribute (eg: partnum) is not a +scalar string (due to nested elements etc). This will also trigger a warning +if strict mode is not enabled. + +=back + +=head1 SAX SUPPORT + +From version 1.08_01, B includes support for SAX (the Simple API +for XML) - specifically SAX2. + +In a typical SAX application, an XML parser (or SAX 'driver') module generates +SAX events (start of element, character data, end of element, etc) as it parses +an XML document and a 'handler' module processes the events to extract the +required data. This simple model allows for some interesting and powerful +possibilities: + +=over 4 + +=item * + +Applications written to the SAX API can extract data from huge XML documents +without the memory overheads of a DOM or tree API. + +=item * + +The SAX API allows for plug and play interchange of parser modules without +having to change your code to fit a new module's API. A number of SAX parsers +are available with capabilities ranging from extreme portability to blazing +performance. 
+ +=item * + +A SAX 'filter' module can implement both a handler interface for receiving +data and a generator interface for passing modified data on to a downstream +handler. Filters can be chained together in 'pipelines'. + +=item * + +One filter module might split a data stream to direct data to two or more +downstream handlers. + +=item * + +Generating SAX events is not the exclusive preserve of XML parsing modules. +For example, a module might extract data from a relational database using DBI +and pass it on to a SAX pipeline for filtering and formatting. + +=back + +B can operate at either end of a SAX pipeline. For example, +you can take a data structure in the form of a hashref and pass it into a +SAX pipeline using the 'Handler' option on C: + + use XML::Simple; + use Some::SAX::Filter; + use XML::SAX::Writer; + + my $ref = { + .... # your data here + }; + + my $writer = XML::SAX::Writer->new(); + my $filter = Some::SAX::Filter->new(Handler => $writer); + my $simple = XML::Simple->new(Handler => $filter); + $simple->XMLout($ref); + +You can also put B at the opposite end of the pipeline to take +advantage of the simple 'tree' data structure once the relevant data has been +isolated through filtering: + + use XML::SAX; + use Some::SAX::Filter; + use XML::Simple; + + my $simple = XML::Simple->new(ForceArray => 1, KeyAttr => ['partnum']); + my $filter = Some::SAX::Filter->new(Handler => $simple); + my $parser = XML::SAX::ParserFactory->parser(Handler => $filter); + + my $ref = $parser->parse_uri('some_huge_file.xml'); + + print $ref->{part}->{'555-1234'}; + +You can build a filter by using an XML::Simple object as a handler and setting +its DataHandler option to point to a routine which takes the resulting tree, +modifies it and sends it off as SAX events to a downstream handler: + + my $writer = XML::SAX::Writer->new(); + my $filter = XML::Simple->new( + DataHandler => sub { + my $simple = shift; + my $data = shift; + + # Modify $data here + + 
$simple->XMLout($data, Handler => $writer); + } + ); + my $parser = XML::SAX::ParserFactory->parser(Handler => $filter); + + $parser->parse_uri($filename); + +I but it could also have been specified in the constructor>. + +=head1 ENVIRONMENT + +If you don't care which parser module B uses then skip this +section entirely (it looks more complicated than it really is). + +B will default to using a B parser if one is available or +B if SAX is not available. + +You can dictate which parser module is used by setting either the environment +variable 'XML_SIMPLE_PREFERRED_PARSER' or the package variable +$XML::Simple::PREFERRED_PARSER to contain the module name. The following rules +are used: + +=over 4 + +=item * + +The package variable takes precedence over the environment variable if both are defined. To force B to ignore the environment settings and use +its default rules, you can set the package variable to an empty string. + +=item * + +If the 'preferred parser' is set to the string 'XML::Parser', then +L will be used (or C will die if L is not +installed). + +=item * + +If the 'preferred parser' is set to some other value, then it is assumed to be +the name of a SAX parser module and is passed to L +If L is not installed, or the requested parser module is not +installed, then C will die. + +=item * + +If the 'preferred parser' is not defined at all (the normal default +state), an attempt will be made to load L. If L is +installed, then a parser module will be selected according to +L's normal rules (which typically means the last SAX +parser installed). + +=item * + +if the 'preferred parser' is not defined and B is not +installed, then B will be used. C will die if +L is not installed. + +=back + +Note: The B distribution includes an XML parser written entirely in +Perl. It is very portable but it is not very fast. You should consider +installing L or L if they are available for your +platform. 
+ +=head1 ERROR HANDLING + +The XML standard is very clear on the issue of non-compliant documents. An +error in parsing any single element (for example a missing end tag) must cause +the whole document to be rejected. B will die with an appropriate +message if it encounters a parsing error. + +If dying is not appropriate for your application, you should arrange to call +C in an eval block and look for errors in $@. eg: + + my $config = eval { XMLin() }; + PopUpMessage($@) if($@); + +Note, there is a common misconception that use of B will significantly +slow down a script. While that may be true when the code being eval'd is in a +string, it is not true of code like the sample above. + +=head1 EXAMPLES + +When C reads the following very simple piece of XML: + + + +it returns the following data structure: + + { + 'username' => 'testuser', + 'password' => 'frodo' + } + +The identical result could have been produced with this alternative XML: + + + +Or this (although see 'ForceArray' option for variations): + + + testuser + frodo + + +Repeated nested elements are represented as anonymous arrays: + + + + joe@smith.com + jsmith@yahoo.com + + + bob@smith.com + + + + { + 'person' => [ + { + 'email' => [ + 'joe@smith.com', + 'jsmith@yahoo.com' + ], + 'firstname' => 'Joe', + 'lastname' => 'Smith' + }, + { + 'email' => 'bob@smith.com', + 'firstname' => 'Bob', + 'lastname' => 'Smith' + } + ] + } + +Nested elements with a recognised key attribute are transformed (folded) from +an array into a hash keyed on the value of that attribute (see the C +option): + + + + + + + + { + 'person' => { + 'jbloggs' => { + 'firstname' => 'Joe', + 'lastname' => 'Bloggs' + }, + 'tsmith' => { + 'firstname' => 'Tom', + 'lastname' => 'Smith' + }, + 'jsmith' => { + 'firstname' => 'Joe', + 'lastname' => 'Smith' + } + } + } + + +The tag can be used to form anonymous arrays: + + + Col 1Col 2Col 3 + R1C1R1C2R1C3 + R2C1R2C2R2C3 + R3C1R3C2R3C3 + + + { + 'head' => [ + [ 'Col 1', 'Col 2', 'Col 3' ] + ], + 
'data' => [ + [ 'R1C1', 'R1C2', 'R1C3' ], + [ 'R2C1', 'R2C2', 'R2C3' ], + [ 'R3C1', 'R3C2', 'R3C3' ] + ] + } + +Anonymous arrays can be nested to arbitrary levels and as a special case, if +the surrounding tags for an XML document contain only an anonymous array the +arrayref will be returned directly rather than the usual hashref: + + <opt> + <anon><anon>Col 1</anon><anon>Col 2</anon></anon> + <anon><anon>R1C1</anon><anon>R1C2</anon></anon> + <anon><anon>R2C1</anon><anon>R2C2</anon></anon> + </opt> + + [ + [ 'Col 1', 'Col 2' ], + [ 'R1C1', 'R1C2' ], + [ 'R2C1', 'R2C2' ] + ] + +Elements which only contain text content will simply be represented as a +scalar. Where an element has both attributes and text content, the element +will be represented as a hashref with the text content in the 'content' key +(see the C<ContentKey> option): + + <opt> + <one>first</one> + <two attr="value">second</two> + </opt> + + { + 'one' => 'first', + 'two' => { 'attr' => 'value', 'content' => 'second' } + } + +Mixed content (elements which contain both text content and nested elements) +will not be represented in a useful way - element order and significant +whitespace will be lost. If you need to work with mixed content, then +XML::Simple is not the right tool for your job - check out the next section. + +=head1 WHERE TO FROM HERE? + +B<XML::Simple> is able to present a simple API because it makes some +assumptions on your behalf. These include: + +=over 4 + +=item * + +You're not interested in text content consisting only of whitespace + +=item * + +You don't mind that when things get slurped into a hash the order is lost + +=item * + +You don't want fine-grained control of the formatting of generated XML + +=item * + +You would never use a hash key that was not a legal XML element name + +=item * + +You don't need help converting between different encodings + +=back + +In a serious XML project, you'll probably outgrow these assumptions fairly +quickly. This section of the document used to offer some advice on choosing a +more powerful option. 
That advice has now grown into the 'Perl-XML FAQ' +document which you can find at: L + +The advice in the FAQ boils down to a quick explanation of tree versus +event based parsers and then recommends: + +For event based parsing, use SAX (do not set out to write any new code for +XML::Parser's handler API - it is obselete). + +For tree-based parsing, you could choose between the 'Perlish' approach of +L and more standards based DOM implementations - preferably one with +XPath support. + + +=head1 SEE ALSO + +B requires either L or L. + +To generate documents with namespaces, L is required. + +The optional caching functions require L. + +Answers to Frequently Asked Questions about XML::Simple are bundled with this +distribution as: L + +=head1 COPYRIGHT + +Copyright 1999-2004 Grant McLean Egrantm@cpan.orgE + +This library is free software; you can redistribute it and/or modify it +under the same terms as Perl itself. + +=cut + + diff --git a/qcd/part_cpu/bench/lib/XML/Writer.pm b/qcd/part_cpu/bench/lib/XML/Writer.pm new file mode 100644 index 0000000000000000000000000000000000000000..892c40f57064db6b3a316190d9b65c76f584290a --- /dev/null +++ b/qcd/part_cpu/bench/lib/XML/Writer.pm @@ -0,0 +1,1628 @@ +######################################################################## +# Writer.pm - write an XML document. +# Copyright (c) 1999 by Megginson Technologies. +# Copyright (c) 2004, 2005 by Joseph Walton . +# No warranty. Commercial and non-commercial use freely permitted. +# +# $Id: Writer.pm 185 2008-02-21 00:51:34Z josephw $ +######################################################################## + +package XML::Writer; + +require 5.004; + +use strict; +use vars qw($VERSION); +use Carp; +use IO::Handle; +$VERSION = "0.604"; + + + +######################################################################## +# Constructor. +######################################################################## + +# +# Public constructor. 
#
# This actually does most of the work of the module: it defines closures
# for all of the real processing, and selects the appropriate closures
# to use based on the value of the UNSAFE parameter.  The actual methods
# are just stubs.
#
# Parameters (all optional, passed as a flat key/value list):
#   NAMESPACES  - if true, construction is delegated to
#                 XML::Writer::Namespaces (which calls back into here)
#   UNSAFE      - if true, install the unchecked closures; otherwise the
#                 SAFE_ variants validate state and then call them
#   NEWLINES    - if true, a newline is written before the closing '>' of
#                 every tag
#   DATA_MODE   - if true, elements are placed on their own lines
#   DATA_INDENT - indent step (in spaces) used in data mode; default 0
#   ENCODING    - '', 'utf-8' or 'us-ascii' (case-insensitive)
#   OUTPUT      - scalar ref, or any object/handle with a print() method;
#                 defaults to STDOUT
#
# Returns the blessed hash of closures.  Dies if ENCODING is set to
# anything other than utf-8 or us-ascii while writing to a handle.
#
# Note on call style: closures are chained with '&{$code};' (no parens) on
# purpose - that form hands the *current* @_ straight to the callee, so the
# SAFE_ wrappers forward their arguments without repeating them.
#
sub new {
  my ($class, %params) = (@_);

  # If the user wants namespaces, intercept the request here; it will
  # come back to this constructor from within
  # XML::Writer::Namespaces::new()
  if ($params{NAMESPACES}) {
    delete $params{NAMESPACES};
    return XML::Writer::Namespaces->new(%params);
  }

  # Set up $self and basic parameters
  my $self;
  my $output;
  my $unsafe = $params{UNSAFE};
  my $newlines = $params{NEWLINES};
  my $dataMode = $params{DATA_MODE};
  my $dataIndent = $params{DATA_INDENT} || 0;

  # If the NEWLINES parameter is specified,
  # set the $nl variable appropriately
  my $nl = '';
  if ($newlines) {
    $nl = "\n";
  }

  my $outputEncoding = $params{ENCODING} || "";
  my ($checkUnencodedRepertoire, $escapeEncoding);
  if (lc($outputEncoding) eq 'us-ascii') {
    $checkUnencodedRepertoire = \&_croakUnlessASCII;
    $escapeEncoding = \&_escapeASCII;
  } else {
    my $doNothing = sub {};
    $checkUnencodedRepertoire = $doNothing;
    $escapeEncoding = $doNothing;
  }

  # Parse variables
  my @elementStack = ();
  my $elementLevel = 0;
  my %seen = ();

  my $hasData = 0;
  my @hasDataStack = ();
  my $hasElement = 0;
  my @hasElementStack = ();
  my $hasHeading = 0; # Does this document have anything before the first element?

  #
  # Private method to show attributes.
  # $_[0] is an array ref whose element 0 is the element name and whose
  # remaining elements are name/value pairs.
  #
  my $showAttributes = sub {
    my $atts = $_[0];
    my $i = 1;
    while ($atts->[$i]) {
      my $aname = $atts->[$i++];
      my $value = _escapeLiteral($atts->[$i++]);
      # Literal line breaks and tabs would be normalised away by a parser,
      # so write them as numeric character references.
      $value =~ s/\x0a/&#10;/g;
      $value =~ s/\x0d/&#13;/g;
      $value =~ s/\x09/&#9;/g;
      &{$escapeEncoding}($value);
      $output->print(" $aname=\"$value\"");
    }
  };

  # Method implementations: the SAFE_ versions perform error checking
  # and then call the regular ones.
  my $end = sub {
    $output->print("\n");
  };

  my $SAFE_end = sub {
    if (!$seen{ELEMENT}) {
      croak("Document cannot end without a document element");
    } elsif ($elementLevel > 0) {
      croak("Document ended with unmatched start tag(s): @elementStack");
    } else {
      @elementStack = ();
      $elementLevel = 0;
      %seen = ();
      &{$end};
    }
  };

  my $xmlDecl = sub {
    my ($encoding, $standalone) = (@_);
    if ($standalone && $standalone ne 'no') {
      $standalone = 'yes';
    }

    # Only include an encoding if one has been explicitly supplied,
    #  either here or on construction. Allow the empty string
    #  to suppress it.
    if (!defined($encoding)) {
      $encoding = $outputEncoding;
    }
    $output->print("<?xml version=\"1.0\"");
    if ($encoding) {
      $output->print(" encoding=\"$encoding\"");
    }
    if ($standalone) {
      $output->print(" standalone=\"$standalone\"");
    }
    $output->print("?>\n");
    $hasHeading = 1;
  };

  my $SAFE_xmlDecl = sub {
    if ($seen{ANYTHING}) {
      croak("The XML declaration is not the first thing in the document");
    } else {
      $seen{ANYTHING} = 1;
      $seen{XMLDECL} = 1;
      &{$xmlDecl};
    }
  };

  my $pi = sub {
    my ($target, $data) = (@_);
    if ($data) {
      $output->print("<?$target $data?>");
    } else {
      $output->print("<?$target?>");
    }
    if ($elementLevel == 0) {
      $output->print("\n");
      $hasHeading = 1;
    }
  };

  my $SAFE_pi = sub {
    my ($name, $data) = (@_);
    $seen{ANYTHING} = 1;
    if (($name =~ /^xml/i) && ($name !~ /^xml-stylesheet$/i)) {
      carp("Processing instruction target begins with 'xml'");
    }

    if ($name =~ /\?\>/ || (defined($data) && $data =~ /\?\>/)) {
      croak("Processing instruction may not contain '?>'");
    } elsif ($name =~ /\s/) {
      croak("Processing instruction name may not contain whitespace");
    } else {
      &{$pi};
    }
  };

  my $comment = sub {
    my $data = $_[0];
    if ($dataMode && $elementLevel) {
      $output->print("\n");
      $output->print(" " x ($elementLevel * $dataIndent));
    }
    $output->print("<!-- $data -->");
    if ($dataMode && $elementLevel) {
      $hasElement = 1;
    } elsif ($elementLevel == 0) {
      $output->print("\n");
      $hasHeading = 1;
    }
  };

  my $SAFE_comment = sub {
    my $data = $_[0];
    if ($data =~ /--/) {
      carp("Interoperability problem: \"--\" in comment text");
    }

    if ($data =~ /-->/) {
      croak("Comment may not contain '-->'");
    } else {
      &{$checkUnencodedRepertoire}($data);
      $seen{ANYTHING} = 1;
      &{$comment};
    }
  };

  my $doctype = sub {
    my ($name, $publicId, $systemId) = (@_);
    $output->print("<!DOCTYPE $name");
    if ($publicId) {
      # XML requires a system literal after a public identifier.
      unless ($systemId) {
        croak("A DOCTYPE declaration with a public ID must also have a system ID");
      }
      $output->print(" PUBLIC \"$publicId\" \"$systemId\"");
    } elsif ($systemId) {
      $output->print(" SYSTEM \"$systemId\"");
    }
    $output->print(">\n");
    $hasHeading = 1;
  };

  my $SAFE_doctype = sub {
    my $name = $_[0];
    if ($seen{DOCTYPE}) {
      croak("Attempt to insert second DOCTYPE declaration");
    } elsif ($seen{ELEMENT}) {
      croak("The DOCTYPE declaration must come before the first start tag");
    } else {
      $seen{ANYTHING} = 1;
      $seen{DOCTYPE} = $name;
      &{$doctype};
    }
  };

  my $startTag = sub {
    my $name = $_[0];
    if ($dataMode && ($hasHeading || $elementLevel)) {
      $output->print("\n");
      $output->print(" " x ($elementLevel * $dataIndent));
    }
    $elementLevel++;
    push @elementStack, $name;
    $output->print("<$name");
    &{$showAttributes}(\@_);
    $output->print("$nl>");
    if ($dataMode) {
      $hasElement = 1;
      push @hasDataStack, $hasData;
      $hasData = 0;
      push @hasElementStack, $hasElement;
      $hasElement = 0;
    }
  };

  my $SAFE_startTag = sub {
    my $name = $_[0];

    &{$checkUnencodedRepertoire}($name);
    _checkAttributes(\@_);

    if ($seen{ELEMENT} && $elementLevel == 0) {
      croak("Attempt to insert start tag after close of document element");
    } elsif ($elementLevel == 0 && $seen{DOCTYPE} && $name ne $seen{DOCTYPE}) {
      croak("Document element is \"$name\", but DOCTYPE is \""
            . $seen{DOCTYPE}
            . "\"");
    } elsif ($dataMode && $hasData) {
      croak("Mixed content not allowed in data mode: element $name");
    } else {
      $seen{ANYTHING} = 1;
      $seen{ELEMENT} = 1;
      &{$startTag};
    }
  };

  my $emptyTag = sub {
    my $name = $_[0];
    if ($dataMode && ($hasHeading || $elementLevel)) {
      $output->print("\n");
      $output->print(" " x ($elementLevel * $dataIndent));
    }
    $output->print("<$name");
    &{$showAttributes}(\@_);
    $output->print("$nl />");
    if ($dataMode) {
      $hasElement = 1;
    }
  };

  my $SAFE_emptyTag = sub {
    my $name = $_[0];

    &{$checkUnencodedRepertoire}($name);
    _checkAttributes(\@_);

    if ($seen{ELEMENT} && $elementLevel == 0) {
      croak("Attempt to insert empty tag after close of document element");
    } elsif ($elementLevel == 0 && $seen{DOCTYPE} && $name ne $seen{DOCTYPE}) {
      croak("Document element is \"$name\", but DOCTYPE is \""
            . $seen{DOCTYPE}
            . "\"");
    } elsif ($dataMode && $hasData) {
      croak("Mixed content not allowed in data mode: element $name");
    } else {
      $seen{ANYTHING} = 1;
      $seen{ELEMENT} = 1;
      &{$emptyTag};
    }
  };

  my $endTag = sub {
    my $name = $_[0];
    my $currentName = pop @elementStack;
    # With no explicit name, close whatever element is innermost.
    $name = $currentName unless $name;
    $elementLevel--;
    if ($dataMode && $hasElement) {
      $output->print("\n");
      $output->print(" " x ($elementLevel * $dataIndent));
    }
    $output->print("</$name$nl>");
    if ($dataMode) {
      $hasData = pop @hasDataStack;
      $hasElement = pop @hasElementStack;
    }
  };

  my $SAFE_endTag = sub {
    my $name = $_[0];
    my $oldName = $elementStack[$#elementStack];
    if ($elementLevel <= 0) {
      croak("End tag \"$name\" does not close any open element");
    } elsif ($name && ($name ne $oldName)) {
      croak("Attempt to end element \"$oldName\" with \"$name\" tag");
    } else {
      &{$endTag};
    }
  };

  my $characters = sub {
    my $data = $_[0];
    if ($data =~ /[\&\<\>]/) {
      # '&' must go first so the entities added below are not re-escaped.
      $data =~ s/\&/&amp;/g;
      $data =~ s/\</&lt;/g;
      $data =~ s/\>/&gt;/g;
    }
    &{$escapeEncoding}($data);
    $output->print($data);
    $hasData = 1;
  };

  my $SAFE_characters = sub {
    if ($elementLevel < 1) {
      croak("Attempt to insert characters outside of document element");
    } elsif ($dataMode && $hasElement) {
      croak("Mixed content not allowed in data mode: characters");
    } else {
      _croakUnlessDefinedCharacters($_[0]);
      &{$characters};
    }
  };

  my $raw = sub {
    $output->print($_[0]);
    # Don't set $hasData or any other information: we know nothing
    # about what was just written.
    #
  };

  my $SAFE_raw = sub {
    croak('raw() is only available when UNSAFE is set');
  };

  my $cdata = sub {
    my $data = $_[0];
    # A literal ']]>' would end the section early: close the section after
    # ']]' and reopen a new one for the '>'.
    $data    =~ s/\]\]>/\]\]\]\]><!\[CDATA\[>/g;
    $output->print("<![CDATA[$data]]>");
    $hasData = 1;
  };

  my $SAFE_cdata = sub {
    if ($elementLevel < 1) {
      croak("Attempt to insert characters outside of document element");
    } elsif ($dataMode && $hasElement) {
      croak("Mixed content not allowed in data mode: characters");
    } else {
      _croakUnlessDefinedCharacters($_[0]);
      &{$checkUnencodedRepertoire}($_[0]);
      &{$cdata};
    }
  };

  # Assign the correct closures based on
  # the UNSAFE parameter
  if ($unsafe) {
    $self = {'END' => $end,
             'XMLDECL' => $xmlDecl,
             'PI' => $pi,
             'COMMENT' => $comment,
             'DOCTYPE' => $doctype,
             'STARTTAG' => $startTag,
             'EMPTYTAG' => $emptyTag,
             'ENDTAG' => $endTag,
             'CHARACTERS' => $characters,
             'RAW' => $raw,
             'CDATA' => $cdata
            };
  } else {
    $self = {'END' => $SAFE_end,
             'XMLDECL' => $SAFE_xmlDecl,
             'PI' => $SAFE_pi,
             'COMMENT' => $SAFE_comment,
             'DOCTYPE' => $SAFE_doctype,
             'STARTTAG' => $SAFE_startTag,
             'EMPTYTAG' => $SAFE_emptyTag,
             'ENDTAG' => $SAFE_endTag,
             'CHARACTERS' => $SAFE_characters,
             'RAW' => $SAFE_raw,               # This will intentionally fail
             'CDATA' => $SAFE_cdata
            };
  }

  # Query methods
  $self->{'IN_ELEMENT'} = sub {
    my ($ancestor) = (@_);
    return $elementStack[$#elementStack] eq $ancestor;
  };

  $self->{'WITHIN_ELEMENT'} = sub {
    my ($ancestor) = (@_);
    my $el;
    foreach $el (@elementStack) {
      return 1 if $el eq $ancestor;
    }
    return 0;
  };

  $self->{'CURRENT_ELEMENT'} = sub {
    return $elementStack[$#elementStack];
  };

  $self->{'ANCESTOR'} = sub {
    my ($n) = (@_);
    if ($n < scalar(@elementStack)) {
      return $elementStack[$#elementStack-$n];
    } else {
      return undef;
    }
  };

  # Set and get the output destination.
  $self->{'GETOUTPUT'} = sub {
    return $output;
  };

  $self->{'SETOUTPUT'} = sub {
    my $newOutput = $_[0];

    if (ref($newOutput) eq 'SCALAR') {
      $output = XML::Writer::_String->new($newOutput);
    } else {
      # If there is no OUTPUT parameter,
      # use standard output
      $output = $newOutput || \*STDOUT;
      if ($outputEncoding) {
        if (lc($outputEncoding) eq 'utf-8') {
          binmode($output, ':encoding(utf-8)');
        } elsif (lc($outputEncoding) eq 'us-ascii') {
          binmode($output, ':encoding(us-ascii)');
        } else {
          die 'The only supported encodings are utf-8 and us-ascii';
        }
      }
    }
  };

  $self->{'SETDATAMODE'} = sub {
    $dataMode = $_[0];
  };

  $self->{'GETDATAMODE'} = sub {
    return $dataMode;
  };

  $self->{'SETDATAINDENT'} = sub {
    $dataIndent = $_[0];
  };

  $self->{'GETDATAINDENT'} = sub {
    return $dataIndent;
  };

  # Set the output.
  &{$self->{'SETOUTPUT'}}($params{'OUTPUT'});

  # Return the blessed object.
  return bless $self, $class;
}



########################################################################
# Public methods
########################################################################

# Each public method below shifts the object off @_ and then invokes the
# matching closure with '&{...};', which passes the remaining @_ through.

#
# Finish writing the document.
#
sub end {
  my $self = shift;
  &{$self->{END}};
}

#
# Write an XML declaration.
#
sub xmlDecl {
  my $self = shift;
  &{$self->{XMLDECL}};
}

#
# Write a processing instruction.
#
sub pi {
  my $self = shift;
  &{$self->{PI}};
}

#
# Write a comment.
#
sub comment {
  my $self = shift;
  &{$self->{COMMENT}};
}

#
# Write a DOCTYPE declaration.
#
sub doctype {
  my $self = shift;
  &{$self->{DOCTYPE}};
}

#
# Write a start tag.
#
sub startTag {
  my $self = shift;
  return $self->{STARTTAG}->(@_);
}

#
# Write an empty tag.
#
sub emptyTag {
  my $self = shift;
  return $self->{EMPTYTAG}->(@_);
}

#
# Write an end tag.
#
sub endTag {
  my $self = shift;
  return $self->{ENDTAG}->(@_);
}

#
# Write a simple data element: start tag, escaped text, end tag.
# Attributes pass through a hash, so duplicate names collapse.
#
sub dataElement {
  my $self = shift;
  my ($name, $data, %atts) = @_;
  $self->startTag($name, %atts);
  $self->characters($data);
  return $self->endTag($name);
}

#
# Write a simple CDATA element: start tag, CDATA section, end tag.
#
sub cdataElement {
  my $self = shift;
  my ($name, $data, %atts) = @_;
  $self->startTag($name, %atts);
  $self->cdata($data);
  return $self->endTag($name);
}

#
# Write character data.
#
sub characters {
  my $self = shift;
  return $self->{CHARACTERS}->(@_);
}

#
# Write raw, unquoted, completely unchecked character data.
#
sub raw {
  my $self = shift;
  return $self->{RAW}->(@_);
}

#
# Write CDATA.
#
sub cdata {
  my $self = shift;
  return $self->{CDATA}->(@_);
}

#
# Query the current element.
#
sub in_element {
  my $self = shift;
  return $self->{IN_ELEMENT}->(@_);
}

#
# Query the ancestors.
#
sub within_element {
  my $self = shift;
  return $self->{WITHIN_ELEMENT}->(@_);
}

#
# Get the name of the current element.
#
sub current_element {
  my $self = shift;
  return $self->{CURRENT_ELEMENT}->(@_);
}

#
# Get the name of the numbered ancestor (zero-based).
#
sub ancestor {
  my $self = shift;
  return $self->{ANCESTOR}->(@_);
}

#
# Get the current output destination.
#
sub getOutput {
  my $self = shift;
  return $self->{GETOUTPUT}->(@_);
}


#
# Set the current output destination.
#
sub setOutput {
  my $self = shift;
  return $self->{SETOUTPUT}->(@_);
}

#
# Set the current data mode (true or false).
#
sub setDataMode {
  my $self = shift;
  return $self->{SETDATAMODE}->(@_);
}


#
# Get the current data mode (true or false).
#
sub getDataMode {
  my $self = shift;
  return $self->{GETDATAMODE}->(@_);
}


#
# Set the current data indent step.
+# +sub setDataIndent { + my $self = shift; + return &{$self->{SETDATAINDENT}}; +} + + +# +# Get the current data indent step. +# +sub getDataIndent { + my $self = shift; + return &{$self->{GETDATAINDENT}}; +} + + +# +# Empty stub; XML::Writer::Namespaces provides the real implementation. +# +sub addPrefix { +} + + +# +# Empty stub; XML::Writer::Namespaces provides the real implementation. +# +sub removePrefix { +} + + + +######################################################################## +# Private functions. +######################################################################## + +# +# Private: check for duplicate attributes and bad characters. +# $_[0] is a reference to the argument list. Scanning starts at +# index 1, because index 0 is assumed to be an element name; names sit +# at odd indexes and their values at the following even index. +# Croaks on a repeated attribute name or on a value containing a +# character rejected by _croakUnlessDefinedCharacters. +# +sub _checkAttributes { + my %anames; + my $i = 1; + while ($_[0]->[$i]) { + my $name = $_[0]->[$i]; + $i += 1; + if ($anames{$name}) { + croak("Two attributes named \"$name\""); + } else { + $anames{$name} = 1; + } + _croakUnlessDefinedCharacters($_[0]->[$i]); + $i += 1; + } +} + +# +# Private: escape an attribute value literal. +# Returns a copy of $_[0] in which &, <, > and " are replaced by the +# predefined XML entity references. The ampersand is handled first so +# that the ampersands introduced by the other substitutions are not +# escaped a second time. +# +sub _escapeLiteral { + my $data = $_[0]; + if ($data =~ /[\&\<\>\"]/) { + $data =~ s/\&/\&amp\;/g; + $data =~ s/\</\&lt\;/g; + $data =~ s/\>/\&gt\;/g; + $data =~ s/\"/\&quot\;/g; + } + return $data; +} + +# Private: replace every non-ASCII character of $_[0] with a hexadecimal +# character reference. Works in place through the $_[0] alias. +sub _escapeASCII($) { + $_[0] =~ s/([^\x00-\x7F])/sprintf('&#x%X;', ord($1))/ge; +} + +# Private: croak if $_[0] contains any character outside 0x00-0x7F. +sub _croakUnlessASCII($) { + if ($_[0] =~ /[^\x00-\x7F]/) { + croak('Non-ASCII characters are not permitted in this part of a US-ASCII document'); + } +} + +# Enforce XML 1.0, section 2.2's definition of "Char" (only reject low ASCII, +# so as not to require Unicode support from perl) +sub _croakUnlessDefinedCharacters($) { + if ($_[0] =~ /([\x00-\x08\x0B-\x0C\x0E-\x1F])/) { + croak(sprintf('Code point \u%04X is not a valid character in XML', ord($1))); + } +} + + +######################################################################## +# XML::Writer::Namespaces - subclass for Namespace processing.
+######################################################################## + +package XML::Writer::Namespaces; +use strict; +use vars qw(@ISA); +use Carp; + +@ISA = qw(XML::Writer); + +# +# Constructor +# Builds a plain XML::Writer, then replaces its STARTTAG, EMPTYTAG, +# ENDTAG and (unless UNSAFE) PI closures with namespace-aware wrappers. +# +sub new { + my ($class, %params) = (@_); + + my $unsafe = $params{UNSAFE}; + + # Snarf the prefix map, if any, and + # note the default prefix. + my %prefixMap = (); + if ($params{PREFIX_MAP}) { + %prefixMap = (%{$params{PREFIX_MAP}}); + delete $params{PREFIX_MAP}; + } + $prefixMap{'http://www.w3.org/XML/1998/namespace'} = 'xml'; + + # Generate the reverse map for URIs + my $uriMap = {}; + my $key; + foreach $key (keys(%prefixMap)) { + $uriMap->{$prefixMap{$key}} = $key; + } + + # The URI mapped to the empty prefix is the default namespace; it is + # kept in $defaultPrefix and removed from the prefix map. + my $defaultPrefix = $uriMap->{''}; + delete $prefixMap{$defaultPrefix} if ($defaultPrefix); + + # Create an instance of the parent. + my $self = new XML::Writer(%params); + + # Snarf the parent's methods that we're + # going to override. + my $OLD_startTag = $self->{STARTTAG}; + my $OLD_emptyTag = $self->{EMPTYTAG}; + my $OLD_endTag = $self->{ENDTAG}; + + # State variables + my @stack; + my $prefixCounter = 1; + my $nsDecls = {'http://www.w3.org/XML/1998/namespace' => 'xml'}; + my $nsDefaultDecl = undef; + my $nsCopyFlag = 0; + my @forcedNSDecls = (); + + if ($params{FORCED_NS_DECLS}) { + @forcedNSDecls = @{$params{FORCED_NS_DECLS}}; + delete $params{FORCED_NS_DECLS}; + } + + # + # Push the current declaration state. + # The maps are saved by reference, not copied; clearing $nsCopyFlag + # records that they have not been cloned for the new level yet. + # + my $pushState = sub { + push @stack, [$nsDecls, $nsDefaultDecl, $nsCopyFlag, $uriMap]; + $nsCopyFlag = 0; + }; + + + # + # Pop the current declaration state. + # + my $popState = sub { + ($nsDecls, $nsDefaultDecl, $nsCopyFlag, $uriMap) = @{pop @stack}; + }; + + # + # Generate a new prefix.
+ # Returns the preferred prefix for the URI when it does not clash, + # otherwise the first free "__NS<n>" name. + # + my $genPrefix = sub { + my $uri = $_[0]; + # This lexical shadows the $prefixCounter declared in new(), so the + # numbering restarts at 1 on every call; the loop below skips any + # prefix already bound to a different URI. + my $prefixCounter = 1; + my $prefix = $prefixMap{$uri}; + my %clashMap = %{$uriMap}; + while( my ($u, $p) = each(%prefixMap)) { + $clashMap{$p} = $u; + } + + while (!defined($prefix) || ($clashMap{$prefix} && $clashMap{$prefix} ne $uri)) { + $prefix = "__NS$prefixCounter"; + $prefixCounter++; + } + + return $prefix; + }; + + # + # Perform namespace processing on a single name. + # $nameref refers to a [URI, local name] array reference and is + # overwritten with the resulting string name; any declaration that is + # needed is appended to the attribute list $atts. + # + my $processName = sub { + my ($nameref, $atts, $attFlag) = (@_); + my ($uri, $local) = @{$$nameref}; + my $prefix = $nsDecls->{$uri}; + + # Is this an element name that matches + # the default NS? + if (!$attFlag && $defaultPrefix && ($uri eq $defaultPrefix)) { + unless ($nsDefaultDecl && ($nsDefaultDecl eq $uri)) { + push @{$atts}, 'xmlns'; + push @{$atts}, $uri; + $nsDefaultDecl = $uri; + } + $$nameref = $local; + + # NOTE(review): $nsDecls is modified here before the copy below, so + # the hash saved by pushState may be altered as well - confirm that + # this is intended. + if (defined($uriMap->{''})) { + delete ($nsDecls->{$uriMap->{''}}); + } + + $nsDecls->{$uri} = ''; + unless ($nsCopyFlag) { + $uriMap = {%{$uriMap}}; + $nsDecls = {%{$nsDecls}}; + $nsCopyFlag = 1; + } + $uriMap->{''} = $uri; + + # Is there a straight-forward prefix? + } elsif ($prefix) { + $$nameref = "$prefix:$local"; + } else { + $prefix = &{$genPrefix}($uri); + # Clone the maps once per level so the saved state stays untouched. + unless ($nsCopyFlag) { + $uriMap = {%{$uriMap}}; + $nsDecls = {%{$nsDecls}}; + $nsCopyFlag = 1; + } + $uriMap->{$prefix} = $uri; + $nsDecls->{$uri} = $prefix; + push @{$atts}, "xmlns:$prefix"; + push @{$atts}, $uri; + $$nameref = "$prefix:$local"; + } + }; + + + # + # Perform namespace processing on element and attribute names.
+ # $_[0] is a reference to the argument list: element name at index 0, + # attribute names at the odd indexes. Array-reference names are + # replaced in place by their processed string form. + # + my $nsProcess = sub { + if (ref($_[0]->[0]) eq 'ARRAY') { + my $x = \@{$_[0]->[0]}; + &{$processName}(\$x, $_[0], 0); + splice(@{$_[0]}, 0, 1, $x); + } + my $i = 1; + while ($_[0]->[$i]) { + if (ref($_[0]->[$i]) eq 'ARRAY') { + my $x = \@{$_[0]->[$i]}; + &{$processName}(\$x, $_[0], 1); + splice(@{$_[0]}, $i, 1, $x); + } + $i += 2; + } + + # We do this if any declarations are forced, due either to + # constructor arguments or to a call during processing. + if (@forcedNSDecls) { + foreach (@forcedNSDecls) { + # A throw-away name is processed only for the declaration it + # appends to the attribute list; the resulting name is discarded. + my @dummy = ($_, 'dummy'); + my $d2 = \@dummy; + if ($defaultPrefix && ($_ eq $defaultPrefix)) { + &{$processName}(\$d2, $_[0], 0); + } else { + &{$processName}(\$d2, $_[0], 1); + } + } + @forcedNSDecls = (); + } + }; + + + # Indicate that a namespace should be declared by the next open element + $self->{FORCENSDECL} = sub { + push @forcedNSDecls, $_[0]; + }; + + + # + # Start tag, with NS processing + # The state pushed here is popped by the matching end tag. + # + $self->{STARTTAG} = sub { + # $name is not used below. + my $name = $_[0]; + unless ($unsafe) { + _checkNSNames(\@_); + } + &{$pushState}(); + &{$nsProcess}(\@_); + &{$OLD_startTag}; + }; + + + # + # Empty tag, with NS processing + # An empty tag opens and closes at once, so the state is popped again + # right after the parent closure has run. + # + $self->{EMPTYTAG} = sub { + unless ($unsafe) { + _checkNSNames(\@_); + } + &{$pushState}(); + &{$nsProcess}(\@_); + &{$OLD_emptyTag}; + &{$popState}(); + }; + + + # + # End tag, with NS processing + # An array-reference name is replaced by "prefix:local", or by the + # bare local name when no non-empty prefix is recorded for its URI. + # + $self->{ENDTAG} = sub { + # $name is not used below. + my $name = $_[0]; + if (ref($_[0]) eq 'ARRAY') { + my $pfx = $nsDecls->{$_[0]->[0]}; + if ($pfx) { + $_[0] = $pfx . ':' . $_[0]->[1]; + } else { + $_[0] = $_[0]->[1]; + } + } else { + # Plain string name: the self-assignment leaves it unchanged. + $_[0] = $_[0]; + } +# &{$nsProcess}(\@_); + &{$OLD_endTag}; + &{$popState}(); + }; + + + # + # Processing instruction, but only if not UNSAFE. + # + unless ($unsafe) { + my $OLD_pi = $self->{PI}; + $self->{PI} = sub { + my $target = $_[0]; + if (index($target, ':') >= 0) { + croak "PI target '$target' contains a colon."; + } + &{$OLD_pi}; + } + }; + + + # + # Add a prefix to the prefix map.
+ # A false $prefix (undef or '') makes $uri the default namespace. + # + $self->{ADDPREFIX} = sub { + my ($uri, $prefix) = (@_); + if ($prefix) { + $prefixMap{$uri} = $prefix; + } else { + if (defined($defaultPrefix)) { + delete($prefixMap{$defaultPrefix}); + } + $defaultPrefix = $uri; + } + }; + + + # + # Remove a prefix from the prefix map. + # Also clears the default namespace when $uri is the current default. + # + $self->{REMOVEPREFIX} = sub { + my ($uri) = (@_); + if ($defaultPrefix && ($defaultPrefix eq $uri)) { + $defaultPrefix = undef; + } + delete $prefixMap{$uri}; + }; + + + # + # Bless and return the object. + # + return bless $self, $class; +} + + +# +# Add a preferred prefix for a namespace URI. +# +sub addPrefix { + my $self = shift; + return &{$self->{ADDPREFIX}}; +} + + +# +# Remove a preferred prefix for a namespace URI. +# +sub removePrefix { + my $self = shift; + return &{$self->{REMOVEPREFIX}}; +} + + +# +# Check names. +# $_[0] is a reference to the argument list: the element name first, +# then attribute name/value pairs. Croaks when a name (or the local +# part of an array-reference name) contains a colon, or when a plain +# attribute name begins with "xmlns". +# +sub _checkNSNames { + my $names = $_[0]; + my $i = 1; + my $name = $names->[0]; + + # Check the element name. + if (ref($name) eq 'ARRAY') { + if (index($name->[1], ':') >= 0) { + croak("Local part of element name '" . + $name->[1] . + "' contains a colon."); + } + } elsif (index($name, ':') >= 0) { + croak("Element name '$name' contains a colon."); + } + + # Check the attribute names.
+ # Names sit at the odd indexes, so step over each value. + while ($names->[$i]) { + my $name = $names->[$i]; + if (ref($name) eq 'ARRAY') { + my $local = $name->[1]; + if (index($local, ':') >= 0) { + croak "Local part of attribute name '$local' contains a colon."; + } + } else { + if ($name =~ /^xmlns/) { + croak "Attribute name '$name' begins with 'xmlns'"; + } elsif (index($name, ':') >= 0) { + croak "Attribute name '$name' contains ':'"; + } + } + $i += 2; + } +} + +# +# Queue a namespace URI whose declaration is added when the next start +# or empty tag is processed. +# +sub forceNSDecl +{ + my $self = shift; + return &{$self->{FORCENSDECL}}; +} + + +package XML::Writer::_String; + +# Internal class, behaving sufficiently like an IO::Handle, +# that stores written output in a string +# +# Heavily inspired by Simon Oliver's XML::Writer::String + +# Blesses the caller's scalar reference itself, so print() appends +# directly to the caller's string. +sub new +{ + my $class = shift; + my $scalar_ref = shift; + return bless($scalar_ref, $class); +} + +# Appends all arguments, joined without a separator, and returns 1. +sub print +{ + ${(shift)} .= join('', @_); + return 1; +} + +1; +__END__ + +######################################################################## +# POD Documentation +######################################################################## + +=head1 NAME + +XML::Writer - Perl extension for writing XML documents. + +=head1 SYNOPSIS + + use XML::Writer; + use IO::File; + + my $output = new IO::File(">output.xml"); + + my $writer = new XML::Writer(OUTPUT => $output); + $writer->startTag("greeting", + "class" => "simple"); + $writer->characters("Hello, world!"); + $writer->endTag("greeting"); + $writer->end(); + $output->close(); + + +=head1 DESCRIPTION + +XML::Writer is a helper module for Perl programs that write an XML +document. The module handles all escaping for attribute values and +character data and constructs different types of markup, such as tags, +comments, and processing instructions. + +By default, the module performs several well-formedness checks to +catch errors during output. This behaviour can be extremely useful +during development and debugging, but it can be turned off for +production-grade code.
+ +The module can operate either in regular mode or in Namespace +processing mode. In Namespace mode, the module will generate +Namespace Declarations itself, and will perform additional checks on +the output. + +Additional support is available for a simplified data mode with no +mixed content: newlines are automatically inserted around elements and +elements can optionally be indented based on their nesting level. + + +=head1 METHODS + +=head2 Writing XML + +=over 4 + +=item new([$params]) + +Create a new XML::Writer object: + + my $writer = new XML::Writer(OUTPUT => $output, NEWLINES => 1); + +Arguments are an anonymous hash array of parameters: + +=over 4 + +=item OUTPUT + +An object blessed into IO::Handle or one of its subclasses (such as +IO::File), or a reference to a string; if this parameter is not present, +the module will write to standard output. If a string reference is passed, +it will capture the generated XML (as a string; to get bytes use the +C<Encode> module). + +=item NAMESPACES + +A true (1) or false (0, undef) value; if this parameter is present and +its value is true, then the module will accept two-member array +references in the place of element and attribute names, as in the +following example: + + my $rdfns = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + my $writer = new XML::Writer(NAMESPACES => 1); + $writer->startTag([$rdfns, "Description"]); + +The first member of the array is a namespace URI, and the second part +is the local part of a qualified name. The module will automatically +generate appropriate namespace declarations and will replace the URI +part with a prefix.
+ +=item PREFIX_MAP + +A hash reference; if this parameter is present and the module is +performing namespace processing (see the NAMESPACES parameter), then +the module will use this hash to look up preferred prefixes for +namespace URIs: + + + my $rdfns = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; + my $writer = new XML::Writer(NAMESPACES => 1, + PREFIX_MAP => {$rdfns => 'rdf'}); + +The keys in the hash table are namespace URIs, and the values are the +associated prefixes. If there is not a preferred prefix for the +namespace URI in this hash, then the module will automatically +generate prefixes of the form "__NS1", "__NS2", etc. + +To set the default namespace, use '' for the prefix. + +=item FORCED_NS_DECLS + +An array reference; if this parameter is present, the document element +will contain declarations for all the given namespace URIs. +Declaring namespaces in advance is particularly useful when a large +number of elements from a namespace are siblings, but don't share a direct +ancestor from the same namespace. + +=item NEWLINES + +A true or false value; if this parameter is present and its value is +true, then the module will insert an extra newline before the closing +delimiter of start, end, and empty tags to guarantee that the document +does not end up as a single, long line. If the parameter is not +present, the module will not insert the newlines. + +=item UNSAFE + +A true or false value; if this parameter is present and its value is +true, then the module will skip most well-formedness error checking. +If the parameter is not present, the module will perform the +well-formedness error checking by default. Turn off error checking at +your own risk! 
+ +=item DATA_MODE + +A true or false value; if this parameter is present and its value is +true, then the module will enter a special data mode, inserting +newlines automatically around elements and (unless UNSAFE is also +specified) reporting an error if any element has both characters and +elements as content. + +=item DATA_INDENT + +A numeric value; if this parameter is present, it represents the +indent step for elements in data mode (it will be ignored when not in +data mode). + +=item ENCODING + +A character encoding; currently this must be one of 'utf-8' or 'us-ascii'. +If present, it will be used for the underlying character encoding and as the +default in the XML declaration. + +=back + +=item end() + +Finish creating an XML document. This method will check that the +document has exactly one document element, and that all start tags are +closed: + + $writer->end(); + +=item xmlDecl([$encoding, $standalone]) + +Add an XML declaration to the beginning of an XML document. The +version will always be "1.0". If you provide a non-null encoding or +standalone argument, its value will appear in the declaration (any +non-null value for standalone except 'no' will automatically be +converted to 'yes'). If not given here, the encoding will be taken from the +ENCODING argument. Pass the empty string to suppress this behaviour. + + $writer->xmlDecl("UTF-8"); + +=item doctype($name, [$publicId, $systemId]) + +Add a DOCTYPE declaration to an XML document. The declaration must +appear before the beginning of the root element. If you provide a +publicId, you must provide a systemId as well, but you may provide +just a system ID by passing 'undef' for the publicId. + + $writer->doctype("html"); + +=item comment($text) + +Add a comment to an XML document. If the comment appears outside the +document element (either before the first start tag or after the last +end tag), the module will add a carriage return after it to improve +readability. 
In data mode, comments will be treated as empty tags: + + $writer->comment("This is a comment"); + +=item pi($target [, $data]) + +Add a processing instruction to an XML document: + + $writer->pi('xml-stylesheet', 'href="style.css" type="text/css"'); + +If the processing instruction appears outside the document element +(either before the first start tag or after the last end tag), the +module will add a carriage return after it to improve readability. + +The $target argument must be a single XML name. If you provide the +$data argument, the module will insert its contents following the +$target argument, separated by a single space. + +=item startTag($name [, $aname1 => $value1, ...]) + +Add a start tag to an XML document. Any arguments after the element +name are assumed to be name/value pairs for attributes: the module +will escape all '&', '<', '>', and '"' characters in the attribute +values using the predefined XML entities: + + $writer->startTag('doc', 'version' => '1.0', + 'status' => 'draft', + 'topic' => 'AT&T'); + +All start tags must eventually have matching end tags. + +=item emptyTag($name [, $aname1 => $value1, ...]) + +Add an empty tag to an XML document. Any arguments after the element +name are assumed to be name/value pairs for attributes (see startTag() +for details): + + $writer->emptyTag('img', 'src' => 'portrait.jpg', + 'alt' => 'Portrait of Emma.'); + +=item endTag([$name]) + +Add an end tag to an XML document. The end tag must match the closest +open start tag, and there must be a matching and properly-nested end +tag for every start tag: + + $writer->endTag('doc'); + +If the $name argument is omitted, then the module will automatically +supply the name of the currently open element: + + $writer->startTag('p'); + $writer->endTag(); + +=item dataElement($name, $data [, $aname1 => $value1, ...]) + +Print an entire element containing only character data. 
This is +equivalent to + + $writer->startTag($name [, $aname1 => $value1, ...]); + $writer->characters($data); + $writer->endTag($name); + +=item characters($data) + +Add character data to an XML document. All '<', '>', and '&' +characters in the $data argument will automatically be escaped using +the predefined XML entities: + + $writer->characters("Here is the formula: "); + $writer->characters("a < 100 && a > 5"); + +You may invoke this method only within the document element +(i.e. after the first start tag and before the last end tag). + +In data mode, you must not use this method to add whitespace between +elements. + +=item raw($data) + +Print data completely unquoted and unchecked to the XML document. For +example C<< raw('<') >> will print a literal < character. This +necessarily bypasses all well-formedness checking, and is therefore +only available in unsafe mode. + +This can sometimes be useful for printing entities which are defined +for your XML format but the module doesn't know about, for example +&nbsp; for XHTML. + +=item cdata($data) + +As C<characters()> but writes the data quoted in a CDATA section, that +is, between C<< <![CDATA[ >> and C<< ]]> >>. If the data to be written itself +contains ]]>, it will be written as several consecutive CDATA +sections. + +=item cdataElement($name, $data [, $aname1 => $value1, ...]) + +As C<dataElement()> but the element content is written as one or more +CDATA sections (see C<cdata()>). + +=item setOutput($output) + +Set the current output destination, as in the OUTPUT parameter for the +constructor. + +=item getOutput() + +Return the current output destination, as in the OUTPUT parameter for +the constructor. + +=item setDataMode($mode) + +Enable or disable data mode, as in the DATA_MODE parameter for the +constructor. + +=item getDataMode() + +Return the current data mode, as in the DATA_MODE parameter for the +constructor. + +=item setDataIndent($step) + +Set the indent step for data mode, as in the DATA_INDENT parameter for +the constructor.
+ +=item getDataIndent() + +Return the indent step for data mode, as in the DATA_INDENT parameter +for the constructor. + + +=back + +=head2 Querying XML + +=over 4 + +=item in_element($name) + +Return a true value if the most recent open element matches $name: + + if ($writer->in_element('dl')) { + $writer->startTag('dt'); + } else { + $writer->startTag('li'); + } + +=item within_element($name) + +Return a true value if any open element matches $name: + + if ($writer->within_element('body')) { + $writer->startTag('h1'); + } else { + $writer->startTag('title'); + } + +=item current_element() + +Return the name of the currently open element: + + my $name = $writer->current_element(); + +This is the equivalent of + + my $name = $writer->ancestor(0); + +=item ancestor($n) + +Return the name of the nth ancestor, where $n=0 for the current open +element. + +=back + + +=head2 Additional Namespace Support + +As of 0.510, these methods may be used while writing a document. + +=over 4 + +=item addPrefix($uri, $prefix) + +Add a preferred mapping between a Namespace URI and a prefix. See +also the PREFIX_MAP constructor parameter. + +To set the default namespace, omit the $prefix parameter or set it to +''. + +=item removePrefix($uri) + +Remove a preferred mapping between a Namespace URI and a prefix. + +=item forceNSDecl($uri) + +Indicate that a namespace declaration for this URI should be included +with the next element to be started. + +=back + + +=head1 ERROR REPORTING + +With the default settings, the XML::Writer module can detect several +basic XML well-formedness errors: + +=over 4 + +=item * + +Lack of a (top-level) document element, or multiple document elements. + +=item * + +Unclosed start tags. + +=item * + +Misplaced delimiters in the contents of processing instructions or +comments. + +=item * + +Misplaced or duplicate XML declaration(s). + +=item * + +Misplaced or duplicate DOCTYPE declaration(s). 
+ +=item * + +Mismatch between the document type name in the DOCTYPE declaration and +the name of the document element. + +=item * + +Mismatched start and end tags. + +=item * + +Attempts to insert character data outside the document element. + +=item * + +Duplicate attributes with the same name. + +=back + +During Namespace processing, the module can detect the following +additional errors: + +=over 4 + +=item * + +Attempts to use PI targets or element or attribute names containing a +colon. + +=item * + +Attempts to use attributes with names beginning "xmlns". + +=back + +To ensure full error detection, a program must also invoke the end +method when it has finished writing a document: + + $writer->startTag('greeting'); + $writer->characters("Hello, world!"); + $writer->endTag('greeting'); + $writer->end(); + +This error reporting can catch many hidden bugs in Perl programs that +create XML documents; however, if necessary, it can be turned off by +providing an UNSAFE parameter: + + my $writer = new XML::Writer(OUTPUT => $output, UNSAFE => 1); + + +=head1 AUTHOR + +David Megginson E<lt>david@megginson.comE<gt> + + +=head1 COPYRIGHT + +Copyright 1999, 2000 David Megginson E<lt>david@megginson.comE<gt> + +Copyright 2004, 2005 Joseph Walton E<lt>joe@kafsemo.orgE<gt> + + +=head1 SEE ALSO + +XML::Parser + +=cut diff --git a/qcd/part_cpu/bench/make2defs b/qcd/part_cpu/bench/make2defs new file mode 100755 index 0000000000000000000000000000000000000000..28525c907c4fdcbea73ac7eaf5b23c04a19fdee6 --- /dev/null +++ b/qcd/part_cpu/bench/make2defs @@ -0,0 +1,29 @@ +#!/usr/local/bin/perl -w +# +# Create Makefile.defs from a given Makefile +# +# Usage: make2defs <Makefile> +# Each line holding an assignment "NAME = value" is printed with the +# value replaced by the placeholder #NAME#; other lines are skipped. +# +# contact: m.a.hermanns@fz-juelich.de +# +#################################################################### + +use strict; + +my ($line,$n); + +# Fail loudly on a missing argument or an unreadable file instead of +# silently producing no output. +die "Usage: make2defs <Makefile>\n" unless defined $ARGV[0]; +open(IN, '<', $ARGV[0]) or die "make2defs: cannot open '$ARGV[0]': $!\n"; + +while(<IN>) +{ + $line = $_; + $n = $line =~ s/^([A-Z0-9a-z_]+)\s*=\s*([a-zA-Z0-9_\-].*[a-zA-Z0-9_\-])[\ ]*/$1 = \#$1\#/m; + if ($n) { print $line; } +} + +close(IN); diff --git
a/qcd/part_cpu/bench/make2xml b/qcd/part_cpu/bench/make2xml new file mode 100755 index 0000000000000000000000000000000000000000..56a4635bfa71ac3209eea27e7e869385d44261fa --- /dev/null +++ b/qcd/part_cpu/bench/make2xml @@ -0,0 +1,30 @@ +#!/usr/local/bin/perl -w +# +# Create compile values from a given Makefile +# +# Usage: make2xml <Makefile> +# +# contact: m.a.hermanns@fz-juelich.de +# +#################################################################### + +use strict; + +my ($line,$n); + +# Fail loudly on a missing argument or an unreadable file instead of +# silently producing no output. +die "Usage: make2xml <Makefile>\n" unless defined $ARGV[0]; +open(IN, '<', $ARGV[0]) or die "make2xml: cannot open '$ARGV[0]': $!\n"; + +while(<IN>) +{ + $line = $_; + # NOTE(review): the replacement part of this substitution is empty, so a + # matched assignment is simply removed; the markup it was presumably + # meant to emit seems to be missing from this copy - restore it. + $n = $line =~ s/^([A-Z0-9a-z_]+)\s*=\s*([a-zA-Z0-9_\-].*[a-zA-Z0-9_\-])[\ ]*//m; + if ($n) { print $line; } +} + +close(IN); diff --git a/qcd/part_cpu/bench/style.css b/qcd/part_cpu/bench/style.css new file mode 100644 index 0000000000000000000000000000000000000000..2f0658da6b89fe86c249ce5ddbd4379a4912abdb --- /dev/null +++ b/qcd/part_cpu/bench/style.css @@ -0,0 +1,15 @@ +body { + background-color: #EEEEEE; +} + +th { + background-color: #CCCCCC; +} + +td { + background-color: #CCFFFF; +} + +.multiValueElement { + background-color: #CCFFAA; +} \ No newline at end of file diff --git a/qcd/part_cpu/doc/JuBETutorial.pdf b/qcd/part_cpu/doc/JuBETutorial.pdf new file mode 100644 index 0000000000000000000000000000000000000000..273b3ac0a9dbdb21d03b2186fc5e3404ff6e877f Binary files /dev/null and b/qcd/part_cpu/doc/JuBETutorial.pdf differ diff --git a/qcd/part_cpu/doc/QUICK_GUIDE_UEABS_QCD_BENCHMARKSUITE b/qcd/part_cpu/doc/QUICK_GUIDE_UEABS_QCD_BENCHMARKSUITE new file mode 100644 index 0000000000000000000000000000000000000000..138b332d8f10d1b0b554247e1f155edc79a8f609 --- /dev/null +++ b/qcd/part_cpu/doc/QUICK_GUIDE_UEABS_QCD_BENCHMARKSUITE @@ -0,0 +1,108 @@ +################# +################# UEABS - QCD - BENCHMARKSUITE -- QUICK-USERGUIDE +################# + +This is a very short summary of the general step, which has +to be performed, to run the UEABS QCD Benchmarksuite on a new +machine.
More information can be found in the documentation of +the UEABS-QCD BENCHMARKSUITE which is located in the folder +./PABS/doc/* +or under the web-link + +http://www.prace-ri.eu/UEABS/QCD/QCD_Build_README.txt +http://www.prace-ri.eu/UEABS/QCD/QCD_Run_README.txt + +The suite works with Jube, which will handle the compilation, +the submission and the analysis of the Benchmarksuite. On a new +machine several xml-files have to be added or created. +This guide will give a short and very quick overview of +the different steps. + +The FIRST STEP on a new machine is to add information about the +system to the platform-folder located in: +./PABS/platform +Here, the new platform has to be added to the xml-file "platform.xml" +similar to the already existing xml-templates: + +.. + + + + +The SECOND STEP is to provide a dummy-submit script which has to +be added to a new subdirectory given by: + +./PABS/platform/"NEW-PLATFORM" + +In the THIRD STEP: Go to the home-directory of the UEABS-QCD-Benchmarksuite +located in: +./PABS/applications/QCD/ +Note that the source-files of the kernels are located in "./PABS/applications/QCD/src". +Here, similar to STEP ONE the xml-files: + +compile.xml, execute.xml and analyse.xml + +have to be edited, i.e. new xml-templates with the new platform-information +have to be added. + +In the FOURTH STEP the runs will be set up by creating run-scripts similar to +"prace-functional-NEW-PLATFORM.xml" for a functional test +and +"prace-scaling-NEW-PLATFORM.xml" for a scaling run. +Here, several limits of the different codes have to be taken into account; see +the section "Limitation" at the end of this quick-userguide. + +In the FIFTH STEP the benchmark can be compiled and run by using the command: + +perl ../../bench/jube prace-functional-"NEW-PLATFORM".xml + +in the directory: +"./PABS/applications/QCD/". +This will generate a folder "tmp" with subfolders in "./PABS/applications/QCD/" +where the source-files will be compiled and executed.
If the compilation or the submission +fails, more information can be found in the subdirectories of "tmp". In any case, +after the generation of the folder "tmp", compilation and submission can be done, +in principle, without Jube. + +In the LAST STEP, the scaling results can be analyzed by using +perl ../../bench/jube analyse.xml + +LIMITATION: + +The different kernels consist of lattice QCD production codes and have several limitations +in parallelization and lattice volume. Kernels A, C, D and E use a four-dimensional +lattice while in case of kernel B a three-dimensional lattice is used. All kernels +can be parallelized in all directions. The different lattice sizes and parallelizations +have to be declared in the scripts: 'prace-functional-"NEW-PLATFORM".xml' or +'prace-scaling-NEW-PLATFORM.xml'. The limitations for the different kernels are given by: + +"pt * px * py * pz = task" + +and additionally for the kernels A, D and E + +" nt / pt modulo 2 = 0 " and " nt >= 4 " + +and the same conditions for the other pairs +"{nx,px}, {ny,py}, {nz,pz}". Moreover +the lattice extents nt, nx, ny and nz have to be even and larger +than 4.
+ +####### +####### Please see for further information the Readme-files +####### which are provided under +####### +####### http://www.prace-ri.eu/UEABS/QCD/QCD_Build_README.txt +####### http://www.prace-ri.eu/UEABS/QCD/QCD_Run_README.txt +####### or in +####### ./PABS/doc/* +####### +####### Jacob Finkenrath, 2017 +####### \ No newline at end of file diff --git a/qcd/part_cpu/doc/jube-deisa.pdf b/qcd/part_cpu/doc/jube-deisa.pdf new file mode 100644 index 0000000000000000000000000000000000000000..6f1dc3401bf1fe6907cc201fc8e7a61ed907828f Binary files /dev/null and b/qcd/part_cpu/doc/jube-deisa.pdf differ diff --git a/qcd/part_cpu/platform/BULL-RS422-teraplus/teraplus_ccc_msub.job.in b/qcd/part_cpu/platform/BULL-RS422-teraplus/teraplus_ccc_msub.job.in new file mode 100644 index 0000000000000000000000000000000000000000..d50e54d1d8a5249f42de722625fe0a97bcc6741a --- /dev/null +++ b/qcd/part_cpu/platform/BULL-RS422-teraplus/teraplus_ccc_msub.job.in @@ -0,0 +1,13 @@ +#!/bin/bash +#MSUB -r #RUNNAME# +#MSUB -o #STDOUTLOGFILE# +#MSUB -e #STDERRLOGFILE# +#MSUB -n #TASKS# +#MSUB -N #NODES# +#MSUB -c #THREADSPERTASK# +#MSUB -T #TIMELIMIT# +#MSUB -M #MEMLIMIT# + +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# diff --git a/qcd/part_cpu/platform/BULL-RS422-teraplus/teraplus_cea_msub.job.in b/qcd/part_cpu/platform/BULL-RS422-teraplus/teraplus_cea_msub.job.in new file mode 100644 index 0000000000000000000000000000000000000000..d50e54d1d8a5249f42de722625fe0a97bcc6741a --- /dev/null +++ b/qcd/part_cpu/platform/BULL-RS422-teraplus/teraplus_cea_msub.job.in @@ -0,0 +1,13 @@ +#!/bin/bash +#MSUB -r #RUNNAME# +#MSUB -o #STDOUTLOGFILE# +#MSUB -e #STDERRLOGFILE# +#MSUB -n #TASKS# +#MSUB -N #NODES# +#MSUB -c #THREADSPERTASK# +#MSUB -T #TIMELIMIT# +#MSUB -M #MEMLIMIT# + +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# diff --git a/qcd/part_cpu/platform/Cray-XE6-HECToR/cray_PBSsubmit.job.in 
b/qcd/part_cpu/platform/Cray-XE6-HECToR/cray_PBSsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..d435882df60a8f020482ff1ce78794e212a7207c --- /dev/null +++ b/qcd/part_cpu/platform/Cray-XE6-HECToR/cray_PBSsubmit.job.in @@ -0,0 +1,27 @@ +#!/bin/bash -x + +#PBS -N #BENCHNAME# +#PBS -l mppwidth=#TASKS# +#PBS -l mppnppn=#TASKSPERNODE# +#PBS -l walltime=#TIME_LIMIT# +#PBS -o #STDOUTLOGFILE# +#PBS -e #STDERRLOGFILE# +#PBS -A #ACCOUNTING# + +cd ${PBS_O_WORKDIR} +#enable modules within the batch system +. /opt/modules/default/etc/modules.sh + +echo "" >> #OUTDIR#/start_info.xml + +#ENV# +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# + + +cd ${PBS_O_WORKDIR} +#POSTPROCESS# + + + +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/Cray-XE6-HERMIT/cray_PBSsubmit.job.in b/qcd/part_cpu/platform/Cray-XE6-HERMIT/cray_PBSsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..73f6b1691f00613246c5335f87fc6b4730ee6523 --- /dev/null +++ b/qcd/part_cpu/platform/Cray-XE6-HERMIT/cray_PBSsubmit.job.in @@ -0,0 +1,24 @@ +#!/bin/bash -x + +#PBS -N #BENCHNAME# +#PBS -l mppwidth=#TASKS# +#PBS -l mppnppn=#TASKSPERNODE# +#PBS -l walltime=#TIME_LIMIT# +#PBS -o #STDOUTLOGFILE# +#PBS -e #STDERRLOGFILE# + +cd ${PBS_O_WORKDIR} + +echo "" >> #OUTDIR#/start_info.xml + +#ENV# +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# + + +cd ${PBS_O_WORKDIR} +#POSTPROCESS# + + + +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/Cray-XT4-HECToR/cray_qsub.job.in b/qcd/part_cpu/platform/Cray-XT4-HECToR/cray_qsub.job.in new file mode 100644 index 0000000000000000000000000000000000000000..8137cffdd20c8df65f50e882aa285ee37d3489a8 --- /dev/null +++ b/qcd/part_cpu/platform/Cray-XT4-HECToR/cray_qsub.job.in @@ -0,0 +1,19 @@ +#!/bin/bash --login +#PBS -N prace_bm +#PBS -o #STDOUTLOGFILE# +#PBS -e #STDERRLOGFILE# +#PBS -l mppwidth=#TASKS# +#PBS -l 
mppnppn=#TASKSPERNODE# +#PBS -l walltime=#TIME_LIMIT# +#PBS -A z01-prace + +cd $PBS_O_WORKDIR/ +export NPROC=`qstat -f $PBS_JOBID | awk '/mppwidth/ {print $3}'` +export NTASK=`qstat -f $PBS_JOBID | awk '/mppnppn/ {print $3}'` + + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/Cray-XT4-Louhi/cray_qsub.excl.job.in b/qcd/part_cpu/platform/Cray-XT4-Louhi/cray_qsub.excl.job.in new file mode 100644 index 0000000000000000000000000000000000000000..931fed79925bb3adb4c7ac52e77d072ec57684e0 --- /dev/null +++ b/qcd/part_cpu/platform/Cray-XT4-Louhi/cray_qsub.excl.job.in @@ -0,0 +1,20 @@ +#!/bin/bash -l +#PBS -N #BENCHNAME# +#PBS -o #STDOUTLOGFILE# +#PBS -e #STDERRLOGFILE# +#PBS -m #NOTIFICATION# +#PBS -l mppwidth=1440 +#PBS -l mppnppn=#TASKSPERNODE# +#PBS -l mppdepth=#THREADSPERTASK# +#PBS -l mppmem=#MEMORYPERTASK# +#PBS -l walltime=#TIME_LIMIT# +#PBS -q prace +#ENV# + +cd $PBS_O_WORKDIR/ + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/Cray-XT4-Louhi/cray_qsub.job.in b/qcd/part_cpu/platform/Cray-XT4-Louhi/cray_qsub.job.in new file mode 100644 index 0000000000000000000000000000000000000000..5633733bb6c454ad72a8105e7885952d5a70dccf --- /dev/null +++ b/qcd/part_cpu/platform/Cray-XT4-Louhi/cray_qsub.job.in @@ -0,0 +1,19 @@ +#!/bin/bash -l +#PBS -N #BENCHNAME# +#PBS -o #STDOUTLOGFILE# +#PBS -e #STDERRLOGFILE# +#PBS -m #NOTIFICATION# +#PBS -l mppwidth=#TASKS# +#PBS -l mppnppn=#TASKSPERNODE# +#PBS -l mppdepth=#THREADSPERTASK# +#PBS -l mppmem=#MEMORYPERTASK# +#PBS -l walltime=#TIME_LIMIT# +#ENV# + +cd $PBS_O_WORKDIR/ + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> 
#OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/Cray-XT4-Louhi/cray_qsub.shanghai.job.in b/qcd/part_cpu/platform/Cray-XT4-Louhi/cray_qsub.shanghai.job.in new file mode 100644 index 0000000000000000000000000000000000000000..0109e8a79313960ce8410375b841827b03bac5f3 --- /dev/null +++ b/qcd/part_cpu/platform/Cray-XT4-Louhi/cray_qsub.shanghai.job.in @@ -0,0 +1,20 @@ +#!/bin/bash -l +#PBS -N #BENCHNAME# +#PBS -o #STDOUTLOGFILE# +#PBS -e #STDERRLOGFILE# +#PBS -m #NOTIFICATION# +#PBS -q prace-shanghai +#PBS -l mppwidth=#TASKS# +#PBS -l mppnppn=#TASKSPERNODE# +#PBS -l mppdepth=#THREADSPERTASK# +#PBS -l mppmem=#MEMORYPERTASK# +#PBS -l walltime=#TIME_LIMIT# +#ENV# + +cd $PBS_O_WORKDIR/ + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/Cray-XT5-Louhi/cray_qsub.job.in b/qcd/part_cpu/platform/Cray-XT5-Louhi/cray_qsub.job.in new file mode 100644 index 0000000000000000000000000000000000000000..5633733bb6c454ad72a8105e7885952d5a70dccf --- /dev/null +++ b/qcd/part_cpu/platform/Cray-XT5-Louhi/cray_qsub.job.in @@ -0,0 +1,19 @@ +#!/bin/bash -l +#PBS -N #BENCHNAME# +#PBS -o #STDOUTLOGFILE# +#PBS -e #STDERRLOGFILE# +#PBS -m #NOTIFICATION# +#PBS -l mppwidth=#TASKS# +#PBS -l mppnppn=#TASKSPERNODE# +#PBS -l mppdepth=#THREADSPERTASK# +#PBS -l mppmem=#MEMORYPERTASK# +#PBS -l walltime=#TIME_LIMIT# +#ENV# + +cd $PBS_O_WORKDIR/ + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/IBM-BGP-Jugene/ibm_llsubmit.job.in b/qcd/part_cpu/platform/IBM-BGP-Jugene/ibm_llsubmit.job.in new file mode 100755 index 0000000000000000000000000000000000000000..9a48e54006dfce0bb0bfef2e9640f5bdfca30cdd --- /dev/null +++ b/qcd/part_cpu/platform/IBM-BGP-Jugene/ibm_llsubmit.job.in @@ -0,0 +1,19 @@ +# @ 
shell = /bin/bash +# @ job_name = #BENCHNAME# +# @ output = #STDOUTLOGFILE# +# @ error = #STDERRLOGFILE# +# @ notification = #NOTIFICATION# +# @ notify_user = #NOTIFY_EMAIL# +# @ wall_clock_limit = #TIME_LIMIT# +# @ job_type = bluegene +# @ bg_size = #BGSIZE# +# @ bg_connection = #BGCONNECTION# +# @ queue + +#ENV# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# #ARGS_STARTER# -exe #EXECUTABLE# -args "#ARGS_EXECUTABLE#" +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/IBM-BGP-Jugene/ibm_llsubmit.job.in.2 b/qcd/part_cpu/platform/IBM-BGP-Jugene/ibm_llsubmit.job.in.2 new file mode 100755 index 0000000000000000000000000000000000000000..3b241a816961cbc4252fd544ade50f4175ff9418 --- /dev/null +++ b/qcd/part_cpu/platform/IBM-BGP-Jugene/ibm_llsubmit.job.in.2 @@ -0,0 +1,19 @@ +# @ shell = /bin/bash +# @ job_name = #BENCHNAME# +# @ output = #STDOUTLOGFILE# +# @ error = #STDERRLOGFILE# +# @ notification = #NOTIFICATION# +# @ notify_user = #NOTIFY_EMAIL# +# @ wall_clock_limit = #TIME_LIMIT# +# @ job_type = bluegene +# @ bg_size = #BGSIZE# +# @ bg_connection = #BGCONNECTION# +# @ queue + +#ENV# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/IBM-BGQ-Juqueen/ibm_llsubmit.job.in b/qcd/part_cpu/platform/IBM-BGQ-Juqueen/ibm_llsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..fd62d531b22d47262ff6eb5009be1fab52c53b18 --- /dev/null +++ b/qcd/part_cpu/platform/IBM-BGQ-Juqueen/ibm_llsubmit.job.in @@ -0,0 +1,18 @@ +# @ job_name = #BENCHNAME# +# @ output = #STDOUTLOGFILE# +# @ error = #STDERRLOGFILE# +# @environment = COPY_ALL +# @ notification = #NOTIFICATION# +# @ wall_clock_limit = #TIME_LIMIT# +# @ job_type = bluegene +# @ bg_size = #BGSIZE# +# @ bg_connectivity = #BGCONNECTIVITY# +# @ queue + +#ENV# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# 
#ARGS_STARTER# : #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/IBM-BladeCenterLS21-BCX/Bccls21_lsfsubmit.job.in b/qcd/part_cpu/platform/IBM-BladeCenterLS21-BCX/Bccls21_lsfsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..23d4eed093a9f35ab1c5727855a1ee4128b494d4 --- /dev/null +++ b/qcd/part_cpu/platform/IBM-BladeCenterLS21-BCX/Bccls21_lsfsubmit.job.in @@ -0,0 +1,27 @@ +#!/bin/bash +#BSUB -J #BENCHNAME# +#BSUB -W #TIME_LIMIT# +#BSUB -B +#BSUB -N +#BSUB -u #NOTIFY_EMAIL# +#BSUB -n #NCPUS# +#BSUB -a #CLASS# +#BSUB -oo #STDOUTLOGFILE# +#BSUB -eo #STDERRLOGFILE# +#BSUB -R "order[ch] span[ptile=4]" + +#MODULE_INIT# +module purge +module load profile/advanced +#MODULE_CMD# #MODULE_FILES# + +export LSF_PAM_HOSTLIST_USE=#NODERESERVATION# +export OMP_NUM_THREADS=#THREADSPERTASK# + + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# #ARGS_STARTER# #MEASUREMENT# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml + diff --git a/qcd/part_cpu/platform/IBM-Cell-MariCel/mc_submit.job.in b/qcd/part_cpu/platform/IBM-Cell-MariCel/mc_submit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..ae12186b9a3c570b8796be05caeb7729549c6d0f --- /dev/null +++ b/qcd/part_cpu/platform/IBM-Cell-MariCel/mc_submit.job.in @@ -0,0 +1,21 @@ +#!/bin/bash +#@ initialdir = . 
+#@ job_name = #BENCHNAME# +#@ class = #CLASS# +#@ output = #STDOUTLOGFILE# +#@ error = #STDERRLOGFILE# +#@ wall_clock_limit = #TIME_LIMIT# +#@ total_tasks = #TASKS# + +export OMP_NUM_THREADS=#THREADSPERTASK# +export PATH=$PATH:/opt/openmpi/ppc64/bin/:/opt/perf/bin/ +export LD_LIBRARY_PATH=/opt/openmpi/ppc64/lib/ + +#ENV# +#HOSTLIST# + +echo "" >> #OUTDIR#/start_info.xml +#STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# +echo "" >> #OUTDIR#/end_info.xml + +#POST_CMD# diff --git a/qcd/part_cpu/platform/IBM-PowerPC-MareNostrum/ppc_mnsubmit.job.in b/qcd/part_cpu/platform/IBM-PowerPC-MareNostrum/ppc_mnsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..78d95c7e5737bc23c64fa37862e33f0d9e1605a0 --- /dev/null +++ b/qcd/part_cpu/platform/IBM-PowerPC-MareNostrum/ppc_mnsubmit.job.in @@ -0,0 +1,17 @@ +#!/bin/bash +# @ job_name = #BENCHNAME# +# @ total_tasks = #NCPUS# +# @ tasks_per_node = #TASKSPERNODE# +# @ cpus_per_task = #THREADSPERTASK# +# @ output = #STDOUTLOGFILE# +# @ error = #STDERRLOGFILE# +# @ wall_clock_limit = #TIME_LIMIT# + +#ENV# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# #ARGS_STARTER# #MEASUREMENT# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml + diff --git a/qcd/part_cpu/platform/IBM-SP4-Jump/ibm_llsubmit.job.in b/qcd/part_cpu/platform/IBM-SP4-Jump/ibm_llsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..53ef017b22c59598efa568398c5a7ad82bb06352 --- /dev/null +++ b/qcd/part_cpu/platform/IBM-SP4-Jump/ibm_llsubmit.job.in @@ -0,0 +1,27 @@ +# @ shell = /bin/ksh +# @ tasks_per_node = #TASKSPERNODE# +# @ node_usage = #NODEUSAGE# +# @ node = #NODES# +##@ class = #CLASS# +# @ job_name = #BENCHNAME# +# @ output = #STDOUTLOGFILE# +# @ error = #STDERRLOGFILE# +# @ notify_user = #NOTIFY_EMAIL# +# @ wall_clock_limit = #TIME_LIMIT# +# @ data_limit = #DATA_LIMIT# +# @ stack_limit = #STACK_LIMIT# +# @ job_type = parallel +# @ resources = ConsumableCpus(#THREADSPERTASK#) 
ConsumableMemory(#MEMORYPERTASK#) +# @ requirements = ( (Machine != "j39") && (Machine != "j40") ) +# @ queue + +#ENV# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# #ARGS_STARTER# #MEASUREMENT# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml + + + diff --git a/qcd/part_cpu/platform/IBM-SP4-Zahir/ibm_llsubmit.job.in b/qcd/part_cpu/platform/IBM-SP4-Zahir/ibm_llsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..a9a5f4ee2327a8f85eb3f581eb1e99bcc89b65ad --- /dev/null +++ b/qcd/part_cpu/platform/IBM-SP4-Zahir/ibm_llsubmit.job.in @@ -0,0 +1,24 @@ +# @ shell = /bin/ksh +# @ total_tasks = #TOTALTASKS# +# @ job_name = #BENCHNAME# +# @ output = #STDOUTLOGFILE# +# @ error = #STDERRLOGFILE# +# @ notification = #NOTIFICATION# +# @ notify_user = #NOTIFY_EMAIL# +# @ cpu_limit = #TIME_LIMIT# +# @ data_limit = #DATA_LIMIT# +# @ stack_limit = #STACK_LIMIT# +# @ job_type = parallel +# @ resources = ConsumableCpus(#THREADSPERTASK#) +# @ queue + +#ENV# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# #ARGS_STARTER# #MEASUREMENT# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml + + + diff --git a/qcd/part_cpu/platform/IBM-SP4-psi/host.list b/qcd/part_cpu/platform/IBM-SP4-psi/host.list new file mode 100644 index 0000000000000000000000000000000000000000..6295d17342155cf674c8a701049f24c48f7a9e39 --- /dev/null +++ b/qcd/part_cpu/platform/IBM-SP4-psi/host.list @@ -0,0 +1,32 @@ +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost +localhost diff --git a/qcd/part_cpu/platform/IBM-SP4-psi/ibm_llsubmit.job.in b/qcd/part_cpu/platform/IBM-SP4-psi/ibm_llsubmit.job.in new file mode 100644 index 
0000000000000000000000000000000000000000..0cdb4a3ee8a94764b92e9e9b9ff3bcc05ab9da41 --- /dev/null +++ b/qcd/part_cpu/platform/IBM-SP4-psi/ibm_llsubmit.job.in @@ -0,0 +1,23 @@ +# @ shell = /bin/ksh +# @ tasks_per_node = #TASKSPERNODE# +# @ node_usage = #NODEUSAGE# +# @ node = #NODES# +# @ class = #CLASS# +# @ job_name = #BENCHNAME# +# @ output = #STDOUTLOGFILE# +# @ error = #STDERRLOGFILE# +# @ notify_user = #NOTIFY_EMAIL# +# @ wall_clock_limit = #TIME_LIMIT# +# @ data_limit = #DATA_LIMIT# +# @ stack_limit = #STACK_LIMIT# +# @ job_type = parallel +# @ resources = ConsumableCpus(#THREADSPERTASK#) ConsumableMemory(#MEMORYPERTASK#) +# @ queue + +#ENV# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# #ARGS_STARTER# #MEASUREMENT# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/IBM-SP4-psi/ibm_start.sh.in b/qcd/part_cpu/platform/IBM-SP4-psi/ibm_start.sh.in new file mode 100644 index 0000000000000000000000000000000000000000..f7cec53c48798bdb56f6b1b92a681c58931115da --- /dev/null +++ b/qcd/part_cpu/platform/IBM-SP4-psi/ibm_start.sh.in @@ -0,0 +1,3 @@ +echo "" >>start_info.xml +#EXECUTABLE# -procs #NCPUS# >#STDOUTLOGFILE# 2>#STDERRLOGFILE# +echo "" >>end_info.xml diff --git a/qcd/part_cpu/platform/IBM-SP5-CINECA/ibm_llsubmit.job.in b/qcd/part_cpu/platform/IBM-SP5-CINECA/ibm_llsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..22c69ebef03f07aa271ffa790db933dd48d6f63e --- /dev/null +++ b/qcd/part_cpu/platform/IBM-SP5-CINECA/ibm_llsubmit.job.in @@ -0,0 +1,22 @@ +# @ shell = /bin/ksh +# @ blocking = #TASKSPERNODE# +# @ total_tasks = #TOTALTASKS# +# @ class = #CLASS# +# @ job_name = #BENCHNAME# +# @ output = #STDOUTLOGFILE# +# @ error = #STDERRLOGFILE# +# @ wall_clock_limit = #TIME_LIMIT# +# @ job_type = parallel +# @ resources = ConsumableCpus(#THREADSPERTASK#) ConsumableMemory(#MEMORYPERTASK#) +# @ queue + +#ENV# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# 
+#STARTER# #ARGS_STARTER# #MEASUREMENT# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml + + + diff --git a/qcd/part_cpu/platform/IBM-SP5-HPCx/ibm_llsubmit.job.in b/qcd/part_cpu/platform/IBM-SP5-HPCx/ibm_llsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..ea3cea6efea3e7fba81ff27a756b259762324d7f --- /dev/null +++ b/qcd/part_cpu/platform/IBM-SP5-HPCx/ibm_llsubmit.job.in @@ -0,0 +1,24 @@ +# @ shell = /bin/ksh +# @ cpus = #TOTALTASKS# +# @ node_usage = not_shared +# @ network.MPI = csss,shared,US +# @ class = #CLASS# +# @ job_name = #BENCHNAME# +# @ output = #STDOUTLOGFILE# +# @ error = #STDERRLOGFILE# +# @ wall_clock_limit = #TIME_LIMIT# +# @ job_type = parallel +# @ account_no = z001-pra +# @ stack_limit = #STACK# +# @ queue + +#ENV# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# #ARGS_STARTER# #MEASUREMENT# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml + + + diff --git a/qcd/part_cpu/platform/IBM-SP6-Huygens/ibm_llsubmit.job.in b/qcd/part_cpu/platform/IBM-SP6-Huygens/ibm_llsubmit.job.in new file mode 100755 index 0000000000000000000000000000000000000000..3ba16aa7255cb28be2a271cdfeefa4026e08fb6f --- /dev/null +++ b/qcd/part_cpu/platform/IBM-SP6-Huygens/ibm_llsubmit.job.in @@ -0,0 +1,30 @@ +# @ shell = /bin/bash +##@ task_affinity = #TAFFINITY# +# @ node_usage = #NODEUSAGE# +# @ node = #NODES# +# @ total_tasks = #TASKS# +##@ class = #CLASS# +# @ job_name = #BENCHNAME# +# @ output = #STDOUTLOGFILE# +# @ error = #STDERRLOGFILE# +# @ notification = #NOTIFICATION# +# @ notify_user = #NOTIFY_EMAIL# +# @ wall_clock_limit = #TIME_LIMIT# +# @ data_limit = #DATA_LIMIT# +# @ stack_limit = #STACK_LIMIT# +# @ job_type = parallel +# @ network.MPI = sn_all,not_shared,US +##@ resources = ConsumableCpus(#THREADSPERTASK#) ConsumableMemory(#MEMORYPERTASK#) +##@ requirements = (Machine != "jump01m") +# @ queue + +#ENV# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# 
+#STARTER# #ARGS_STARTER# #MEASUREMENT# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml + + + diff --git a/qcd/part_cpu/platform/IBM-SP6-Huygens/ibm_llsubmit.large.job.in b/qcd/part_cpu/platform/IBM-SP6-Huygens/ibm_llsubmit.large.job.in new file mode 100755 index 0000000000000000000000000000000000000000..cde1221f25cf232aee1f070f773cc9a0d574f0d5 --- /dev/null +++ b/qcd/part_cpu/platform/IBM-SP6-Huygens/ibm_llsubmit.large.job.in @@ -0,0 +1,30 @@ +# @ shell = /bin/bash +##@ task_affinity = #TAFFINITY# +# @ node_usage = #NODEUSAGE# +# @ node = #NODES# +# @ total_tasks = #TASKS# +##@ class = #CLASS# +# @ job_name = #BENCHNAME# +# @ output = #STDOUTLOGFILE# +# @ error = #STDERRLOGFILE# +# @ notification = #NOTIFICATION# +# @ notify_user = #NOTIFY_EMAIL# +# @ wall_clock_limit = #TIME_LIMIT# +# @ data_limit = #DATA_LIMIT# +# @ stack_limit = #STACK_LIMIT# +# @ job_type = parallel +# @ network.MPI = sn_all,not_shared,US +##@ resources = ConsumableCpus(#THREADSPERTASK#) ConsumableMemory(#MEMORYPERTASK#) +# @ requirements = (Memory > 131072) +# @ queue + +#ENV# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# #ARGS_STARTER# #MEASUREMENT# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml + + + diff --git a/qcd/part_cpu/platform/IBM-SP6-Huygens/ibm_llsubmit_tpn.job.in b/qcd/part_cpu/platform/IBM-SP6-Huygens/ibm_llsubmit_tpn.job.in new file mode 100755 index 0000000000000000000000000000000000000000..1747a9a15f4e8d2950a002ba83f247bcdee91541 --- /dev/null +++ b/qcd/part_cpu/platform/IBM-SP6-Huygens/ibm_llsubmit_tpn.job.in @@ -0,0 +1,30 @@ +# @ shell = /bin/bash +##@ task_affinity = #TAFFINITY# +# @ node_usage = #NODEUSAGE# +# @ node = #NODES# +# @ tasks_per_node = #TASKSPERNODE# +##@ class = #CLASS# +# @ job_name = #BENCHNAME# +# @ output = #STDOUTLOGFILE# +# @ error = #STDERRLOGFILE# +# @ notification = #NOTIFICATION# +# @ notify_user = #NOTIFY_EMAIL# +# @ wall_clock_limit = #TIME_LIMIT# +# @ data_limit = 
#DATA_LIMIT# +# @ stack_limit = #STACK_LIMIT# +# @ job_type = parallel +# @ network.MPI = sn_all,not_shared,US +##@ resources = ConsumableCpus(#THREADSPERTASK#) ConsumableMemory(#MEMORYPERTASK#) +##@ requirements = (Machine != "jump01m") +# @ queue + +#ENV# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# #ARGS_STARTER# #MEASUREMENT# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml + + + diff --git a/qcd/part_cpu/platform/IBM-SP6-Huygens/ibm_llsubmit_tpn.large.job.in b/qcd/part_cpu/platform/IBM-SP6-Huygens/ibm_llsubmit_tpn.large.job.in new file mode 100755 index 0000000000000000000000000000000000000000..b69ae97b5280cb2e07091aa2c850aef2ae20d946 --- /dev/null +++ b/qcd/part_cpu/platform/IBM-SP6-Huygens/ibm_llsubmit_tpn.large.job.in @@ -0,0 +1,30 @@ +# @ shell = /bin/bash +##@ task_affinity = #TAFFINITY# +# @ node_usage = #NODEUSAGE# +# @ node = #NODES# +# @ tasks_per_node = #TASKSPERNODE# +##@ class = #CLASS# +# @ job_name = #BENCHNAME# +# @ output = #STDOUTLOGFILE# +# @ error = #STDERRLOGFILE# +# @ notification = #NOTIFICATION# +# @ notify_user = #NOTIFY_EMAIL# +# @ wall_clock_limit = #TIME_LIMIT# +# @ data_limit = #DATA_LIMIT# +# @ stack_limit = #STACK_LIMIT# +# @ job_type = parallel +# @ network.MPI = sn_all,not_shared,US +##@ resources = ConsumableCpus(#THREADSPERTASK#) ConsumableMemory(#MEMORYPERTASK#) +# @ requirements = (Memory > 131072) +# @ queue + +#ENV# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# #ARGS_STARTER# #MEASUREMENT# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml + + + diff --git a/qcd/part_cpu/platform/IBM-SP6-Jump/ibm_llsubmit.job.in b/qcd/part_cpu/platform/IBM-SP6-Jump/ibm_llsubmit.job.in new file mode 100755 index 0000000000000000000000000000000000000000..310f4024fb000e7371e0a4b8ec2333980a49a1e0 --- /dev/null +++ b/qcd/part_cpu/platform/IBM-SP6-Jump/ibm_llsubmit.job.in @@ -0,0 +1,23 @@ +# @ shell = /bin/ksh +# @ tasks_per_node = #TASKSPERNODE# +# @ 
node_usage = #NODEUSAGE# +# @ node = #NODES# +# @ job_name = #BENCHNAME# +# @ output = #STDOUTLOGFILE# +# @ error = #STDERRLOGFILE# +# @ notification = #NOTIFICATION# +# @ notify_user = #NOTIFY_EMAIL# +# @ wall_clock_limit = #TIME_LIMIT# +# @ job_type = parallel +# @ queue + +#ENV# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# #ARGS_STARTER# #MEASUREMENT# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml + + + diff --git a/qcd/part_cpu/platform/Intel-Broadwell-Marconi/intel_PBSsubmit.job.in b/qcd/part_cpu/platform/Intel-Broadwell-Marconi/intel_PBSsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..fcf0ba0b1b87e24192b8652ead56ba4f0ba233a7 --- /dev/null +++ b/qcd/part_cpu/platform/Intel-Broadwell-Marconi/intel_PBSsubmit.job.in @@ -0,0 +1,22 @@ +#!/bin/bash +#PBS -l walltime=#TIME_LIMIT# +#PBS -l select=#NODES#:ncpus=#TASKSPERNODE#:mpiprocs=#TASKSPERNODE# +#PBS -o #STDOUTLOGFILE# +#PBS -e #STDERRLOGFILE# +#PBS -A Ppp25_3408 + +cd $PBS_O_WORKDIR # this is the dir where the job was submitted from + +echo "" >> #OUTDIR#/start_info.xml + +module load intel intelmpi +###mpirun ./myprogram < myinput > myoutput + + +#ENV# +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# + +#POSTPROCESS# + +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/Intel-Haswell-Cartesius/intel_SLURMsubmit.job.in b/qcd/part_cpu/platform/Intel-Haswell-Cartesius/intel_SLURMsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..3967c2759d09171563fc6811293f717762c5e606 --- /dev/null +++ b/qcd/part_cpu/platform/Intel-Haswell-Cartesius/intel_SLURMsubmit.job.in @@ -0,0 +1,28 @@ +#!/bin/bash +#SBATCH -t #TIME_LIMIT# +#SBATCH -n #TASKS# +#SBATCH -N #NODES# --ntasks-per-node=#TASKSPERNODE# +#SBATCH -p #PARTITION# +#SBATCH --constraint=#CONSTRAINT# +#SBATCH -J #BENCHNAME# + +#ENV# +module load compilerwrappers + +# Use the intel compilers +module unload 
fortran/intel +module unload c/intel +module load fortran/intel/15.0.3 +module load c/intel/15.0.3 +. /opt/intel/composer_xe_2015.3.187/mkl/bin/mklvars.sh intel64 # sourced so the MKL environment persists in the job shell +module load mkl +module load mpi/impi +module load perl + +cd #OUTDIR# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/Intel-Nehalem-CURIE/intel_PBSsubmit.job.in b/qcd/part_cpu/platform/Intel-Nehalem-CURIE/intel_PBSsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..8674b5e5e0496008a0510e625504392132eb09bc --- /dev/null +++ b/qcd/part_cpu/platform/Intel-Nehalem-CURIE/intel_PBSsubmit.job.in @@ -0,0 +1,20 @@ +#!/bin/bash -x + +#MSUB -r #BENCHNAME# +#MSUB -n #TASKS# +#MSUB -c #THREADSPERTASK# +#MSUB -T #TIME_LIMIT# +#MSUB -o #STDOUTLOGFILE# +#MSUB -e #STDERRLOGFILE# +set -x +cd ${BRIDGE_MSUB_PWD} + +echo "" >> #OUTDIR#/start_info.xml + +#ENV# +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# + +#POSTPROCESS# + +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/Intel-Nehalem-Inti/intel_cccsubmit.job.in b/qcd/part_cpu/platform/Intel-Nehalem-Inti/intel_cccsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..683cd1a02a6f880e4ac5be6c7931fff0e332d45c --- /dev/null +++ b/qcd/part_cpu/platform/Intel-Nehalem-Inti/intel_cccsubmit.job.in @@ -0,0 +1,27 @@ +#!/bin/bash -x + +#### CPU time limit +#MSUB -T #TIME_LIMIT# + +## No. of nodes to use +#MSUB -N #NODES# +## Total no. of tasks +#MSUB -n #TASKS# +## No.
of cores per task +#MSUB -c #THREADSPERTASK# +#MSUB -@ #NOTIFY_EMAIL#:begin,end +#MSUB -r #BENCHNAME# +#MSUB -o #STDOUTLOGFILE# +#MSUB -e #STDERRLOGFILE# + +echo "" >> #OUTDIR#/start_info.xml + +#ENV# +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# + +#POSTPROCESS# + + + +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/Intel-Nehalem-JUROPA/intel_PBSsubmit.job.in b/qcd/part_cpu/platform/Intel-Nehalem-JUROPA/intel_PBSsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..77f4408a38d6ddf398fe76f9455080c383d18e13 --- /dev/null +++ b/qcd/part_cpu/platform/Intel-Nehalem-JUROPA/intel_PBSsubmit.job.in @@ -0,0 +1,32 @@ +#!/bin/bash -x + +#### CPU time limit +#MSUB -l walltime=#TIME_LIMIT# + +#MSUB -S /bin/bash + +#MSUB -l nodes=#NODES#:ppn=#PPN# + +#MSUB -M #NOTIFY_EMAIL# +#MSUB -m #NOTIFICATION# +#MSUB -N #BENCHNAME# +#MSUB -o #STDOUTLOGFILE# +#MSUB -e #STDERRLOGFILE# +#MSUB -v tpt=#THREADSPERTASK# + + +cd ${PBS_O_WORKDIR} + +echo "" >> #OUTDIR#/start_info.xml + +#ENV# +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# + + +cd ${PBS_O_WORKDIR} +#POSTPROCESS# + + + +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/Intel-Nehalem-Laki/intel_PBSreserved.job.in b/qcd/part_cpu/platform/Intel-Nehalem-Laki/intel_PBSreserved.job.in new file mode 100644 index 0000000000000000000000000000000000000000..97ee415c417ae520de72ad94ddccaa0d15995ef4 --- /dev/null +++ b/qcd/part_cpu/platform/Intel-Nehalem-Laki/intel_PBSreserved.job.in @@ -0,0 +1,21 @@ +#!/bin/bash +#PBS -l nodes=#NODES#:ppn=#PPN#:#NODETYPE# +#PBS -l walltime=#TIME_LIMIT# +#PBS -M #NOTIFY_EMAIL# +#PBS -N #BENCHNAME# +#PBS -W x=FLAGS:ADVRES:#RESERVATION# +#PBS -A #ACCT# +#PBS -o #STDOUTLOGFILE# +#PBS -e #STDERRLOGFILE# + +#ENV# +module load compiler/intel/11.0 +module load mpi/openmpi/1.3-static-intel-11.0 + +cd ${PBS_O_WORKDIR} + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# 
+#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/Intel-Nehalem-Laki/intel_PBSsubmit.job.in b/qcd/part_cpu/platform/Intel-Nehalem-Laki/intel_PBSsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..ccae8210659c91d24d6a1488d632fca5d5e0324f --- /dev/null +++ b/qcd/part_cpu/platform/Intel-Nehalem-Laki/intel_PBSsubmit.job.in @@ -0,0 +1,20 @@ +#!/bin/bash +#PBS -l nodes=#NODES#:ppn=#PPN#:#NODETYPE# +#PBS -l walltime=#TIME_LIMIT# +#PBS -M #NOTIFY_EMAIL# +#PBS -N #BENCHNAME# +#PBS -A #ACCT# +#PBS -o #STDOUTLOGFILE# +#PBS -e #STDERRLOGFILE# + +#ENV# +module load compiler/intel/11.0 +module load mpi/openmpi/1.3-static-intel-11.0 + +cd ${PBS_O_WORKDIR} + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/Intel-Nehalem-Laki/nehalem.job.in b/qcd/part_cpu/platform/Intel-Nehalem-Laki/nehalem.job.in new file mode 100644 index 0000000000000000000000000000000000000000..e5b8b87dc3f825fe758518131cebbc88451d10e1 --- /dev/null +++ b/qcd/part_cpu/platform/Intel-Nehalem-Laki/nehalem.job.in @@ -0,0 +1,22 @@ +#!/bin/bash +#PBS -l nodes=#NODES#:nehalem:ppn=8 +#PBS -l walltime=#TIME_LIMIT# +#PBS -M #NOTIFY_EMAIL# +#PBS -N #BENCHNAME# +#PBS -A #ACCT# +##PBS -q #CLASS# +#PBS -o #STDOUTLOGFILE# +#PBS -e #STDERRLOGFILE# + +module load compiler/intel/11.0 +module load mpi/openmpi/1.3-static-intel-11.0 +#module load mpi/impi/intel-11.0.074-impi-3.2.0.011 + +cd #OUTDIR# +#ENV# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/Intel-SNB-supermuc/ibm_llsubmit.job.in b/qcd/part_cpu/platform/Intel-SNB-supermuc/ibm_llsubmit.job.in new file mode 100755 index 
0000000000000000000000000000000000000000..873100b0721faef69252906ea522972722c75421 --- /dev/null +++ b/qcd/part_cpu/platform/Intel-SNB-supermuc/ibm_llsubmit.job.in @@ -0,0 +1,34 @@ +#!/bin/bash +#@ job_type = parallel +# @ class = #JOB_CLASS# +# @ tasks_per_node = #TASKSPERNODE# +# @ node_usage = #NODEUSAGE# +# @ node = #NODES# +# @ job_name = #BENCHNAME# +#@ network.MPI = sn_all,not_shared,us +# @ output = #STDOUTLOGFILE# +# @ error = #STDERRLOGFILE# +#@ notification=never +# @ notify_user = #NOTIFY_EMAIL# +# @ wall_clock_limit = #TIME_LIMIT# +# @ data_limit = #DATA_LIMIT# +# @ stack_limit = #STACK_LIMIT# +# @ energy_policy_tag = #MY_ENERGYTAG# +# @ minimize_time_to_solution = yes +#### @ resources = ConsumableCpus(#THREADSPERTASK#) ConsumableMemory(#MEMORYPERTASK#) +# @ queue +. /etc/profile +. /etc/profile.d/modules.sh +#module load prace + +#ENV# +#MODULE_CMD# #MODULE_EXEC_FILES# + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# #ARGS_STARTER# #MEASUREMENT# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml + + + diff --git a/qcd/part_cpu/platform/NEC-SX8-HLRS/nec_qsub.job.in b/qcd/part_cpu/platform/NEC-SX8-HLRS/nec_qsub.job.in new file mode 100644 index 0000000000000000000000000000000000000000..8cc9700005912ac2b4fbadfa9970debf1dc78f43 --- /dev/null +++ b/qcd/part_cpu/platform/NEC-SX8-HLRS/nec_qsub.job.in @@ -0,0 +1,26 @@ +#!/usr/bin/ksh +#PBS -q multi +#PBS -T mpisx +#PBS -l cpunum_job=8 +#PBS -b #NODES# +#PBS -l elapstim_req=#TIME_LIMIT# +#PBS -l cputim_job=#CPU_TIME# +#PBS -l cputim_prc=#CPU_TIME# +#PBS -l memsz_job=#MEMORYPERNODE# +#PBS -M #NOTIFY_EMAIL# +#PBS -N #BENCHNAME# +#PBS -o #STDOUTLOGFILE# +#PBS -e #STDERRLOGFILE# +#PBS -m #NOTIFICATION# + +cd #OUTDIR# +#ENV# +export MPIPROGINF="DETAIL" +MPIEXPORT="$MPIEXPORT MPIPROGINF" +export MPIEXPORT + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> 
#OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/NEC-SX9-HLRS/nec_qsub.job.in b/qcd/part_cpu/platform/NEC-SX9-HLRS/nec_qsub.job.in new file mode 100644 index 0000000000000000000000000000000000000000..a9ea9a470ce93e92b1a38f41be2d5411f6a33248 --- /dev/null +++ b/qcd/part_cpu/platform/NEC-SX9-HLRS/nec_qsub.job.in @@ -0,0 +1,27 @@ +#!/usr/bin/ksh +#PBS -q multi +#PBS -T mpisx +#PBS -l cpunum_job=16 +#PBS -b #NODES# +#PBS -l elapstim_req=#TIME_LIMIT# +#PBS -l cputim_job=#CPU_TIME# +#PBS -l cputim_prc=#CPU_TIME# +#PBS -l memsz_job=#MEMORYPERNODE# +#PBS -A #ACCT# +#PBS -M #NOTIFY_EMAIL# +#PBS -N #BENCHNAME# +#PBS -o #STDOUTLOGFILE# +#PBS -e #STDERRLOGFILE# +#PBS -m #NOTIFICATION# + +cd #OUTDIR# +#ENV# +export MPIPROGINF="DETAIL" +MPIEXPORT="$MPIEXPORT MPIPROGINF" +export MPIEXPORT + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#MEASUREMENT# #STARTER# #ARGS_STARTER# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/SGI-ALTIX/Altix_llsubmit.job.in b/qcd/part_cpu/platform/SGI-ALTIX/Altix_llsubmit.job.in new file mode 100644 index 0000000000000000000000000000000000000000..a162fe2ecd0e620af0069e2285ad5607c583cdc8 --- /dev/null +++ b/qcd/part_cpu/platform/SGI-ALTIX/Altix_llsubmit.job.in @@ -0,0 +1,29 @@ +#!/bin/bash +#PBS -S /bin/bash +#PBS -N #BENCHNAME# +#PBS -o #STDOUTLOGFILE# +#PBS -e #STDERRLOGFILE# +#PBS -m #NOTIFICATION# +#PBS -M #NOTIFY_EMAIL# +#PBS -l #PARTITIONS# +#PBS -l select=#NODES#:ncpus=#CPUSPERNODE#:mpiprocs=#TASKSPERNODE#:ompthreads=#THREADSPERTASK# +#PBS -l walltime=#TIME_LIMIT# + +. /etc/profile +. 
/etc/profile.d/modules.sh + +set -x + +export OMP_NUM_THREADS=#THREADSPERTASK# + +module load deisa +module load mkl +module list + +cd $PBS_O_WORKDIR + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# #ARGS_STARTER# #MEASUREMENT# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/SGI-Altix-ICE-8200-EX-jade/SGI-Altix-ICE_qsub.job.in b/qcd/part_cpu/platform/SGI-Altix-ICE-8200-EX-jade/SGI-Altix-ICE_qsub.job.in new file mode 100644 index 0000000000000000000000000000000000000000..c6c3d2b7e7997755fa990f261c9d76a39554c95b --- /dev/null +++ b/qcd/part_cpu/platform/SGI-Altix-ICE-8200-EX-jade/SGI-Altix-ICE_qsub.job.in @@ -0,0 +1,26 @@ +#!/bin/bash +#PBS -S /bin/bash +#PBS -N #BENCHNAME# +#PBS -o #STDOUTLOGFILE# +#PBS -e #STDERRLOGFILE# +#PBS -m #NOTIFICATION# +#PBS -M #NOTIFY_EMAIL# +#PBS -l #PARTITIONS# +#PBS -l select=#NODES#:ncpus=#CPUSPERNODE#:mpiprocs=#TASKSPERNODE#:ompthreads=#THREADSPERTASK# +#PBS -l walltime=#TIME_LIMIT# + + +set -x + +export OMP_NUM_THREADS=#THREADSPERTASK# + +module load cce-10.1.017 fce-10.1.017 mkl-10.0.3.020 +module list + +cd $PBS_O_WORKDIR + +echo "" >> #OUTDIR#/start_info.xml +#PREPROCESS# +#STARTER# #ARGS_STARTER# #MEASUREMENT# #EXECUTABLE# #ARGS_EXECUTABLE# +#POSTPROCESS# +echo "" >> #OUTDIR#/end_info.xml diff --git a/qcd/part_cpu/platform/platform.xml b/qcd/part_cpu/platform/platform.xml new file mode 100644 index 0000000000000000000000000000000000000000..a375e89c4a6f23a774df9caa2c281fbba15ddf3c --- /dev/null +++ b/qcd/part_cpu/platform/platform.xml @@ -0,0 +1,1736 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/qcd/part_cpu/skel/Makefile.defs.in b/qcd/part_cpu/skel/Makefile.defs.in new file mode 100644 index 
0000000000000000000000000000000000000000..7f7e57f0c4bf8cb1915fa3180f124d307fc4df62 --- /dev/null +++ b/qcd/part_cpu/skel/Makefile.defs.in @@ -0,0 +1,58 @@ +#--------------------------------------------------------------------- +# Make.inc file for IBM-SP4 architecture +# +#--------------------------------------------------------------------- +#- General variables +#--------------------------------------------------------------------- +MAKE = #MAKE# +RM = #RM# +AR = #AR# +ARFLAGS = #ARFLAGS# +RANLIB = #RANLIB# + +CPP = #CPP# +CPPFLAGS = #CPPFLAGS# + +F77 = #F77# +FFLAGS = #FFLAGS# + +F90 = #F90# +F90FLAGS = #F90FLAGS# + +CC = #CC# +CXX = #CXX# +CFLAGS = #CFLAGS# + +MPI_F90 = #MPI_F90# +MPI_F77 = #MPI_F77# +MPI_CC = #MPI_CC# +MPI_CXX = #MPI_CXX# + +LD = #LD# +LDFLAGS = #LDFLAGS# + +#- MPI library +MPI_DIR = #MPI_DIR# +MPI_LIB = #MPI_LIB# +MPI_INC = #MPI_INC# +MPI_BIN = #MPI_BIN# + +#--------------------------------------------------------------------- +#- BLAS library +BLAS_DIR = #BLAS_DIR# +BLAS_LIB = #BLAS_LIB# + +#- LAPACK library +LAPACK_DIR = #LAPACK_DIR# +LAPACK_LIB = #LAPACK_LIB# + +#- FFTW librairies +FFTW_DIR = #FFTW3_DIR# +FFTW_LIB = #FFTW3_LIB# +FFTW_INC = #FFTW3_INC# + +#- NetCDF Library +NETCDF3_DIR = #NETCDF3_DIR# +NETCDF3_LIB = #NETCDF3_LIB# +NETCDF3_INC = #NETCDF3_INC# + diff --git a/qcd/part_cpu/skel/analyse-pattern-posixtime.xml b/qcd/part_cpu/skel/analyse-pattern-posixtime.xml new file mode 100644 index 0000000000000000000000000000000000000000..65fc44d4cf0256fabd4056e144f22a7c5997d9f1 --- /dev/null +++ b/qcd/part_cpu/skel/analyse-pattern-posixtime.xml @@ -0,0 +1,15 @@ + + + +real\s*$patfp$ + + + +user\s*$patfp$ + + + +sys\s*$patfp$ + + + diff --git a/qcd/part_cpu/skel/analyse-pattern-time.xml b/qcd/part_cpu/skel/analyse-pattern-time.xml new file mode 100644 index 0000000000000000000000000000000000000000..b5881ced6159c6173962628942ccbec8eb39f090 --- /dev/null +++ b/qcd/part_cpu/skel/analyse-pattern-time.xml @@ -0,0 +1,15 @@ + + + 
+real\s*$patnfp\s*m$patfp\s*s\s*$ + + + +user\s*$patfp\s*m$patfp\s*s\s*$ + + + +system\s*$patnfp\s*m$patnfp\s*s\s*$ + + + diff --git a/qcd/part_cpu/skel/hpm3patterns.xml b/qcd/part_cpu/skel/hpm3patterns.xml new file mode 100644 index 0000000000000000000000000000000000000000..726e538e5170221e843db663fd112cb27e68ced2 --- /dev/null +++ b/qcd/part_cpu/skel/hpm3patterns.xml @@ -0,0 +1,31 @@ + + + Execution time \(wall clock time\)\s*:\s*$patfp + (($HPMwtimeSum_cnt>0)?($HPMwtimeSum/$HPMwtimeSum_cnt):-1) + + Total amount of time in user mode\s*:\s*$patfp\s+seconds + (($HPMutimeSum_cnt>0)?($HPMutimeSum/$HPMutimeSum_cnt):-1) + + Total amount of time in system mode\s*:\s*$patfp + (($HPMstimeSum_cnt>0)?($HPMstimeSum/$HPMstimeSum_cnt):-1) + + Maximum resident set size\s*:\s*$patint Kbytes + (($HPMressizeSum_cnt>0)?($HPMressizeSum/$HPMressizeSum_cnt):-1) + + Total floating point operations\s*:\s*$patfp\sM + (($HPMtflopsSum_cnt>0)?($HPMtflopsSum/$HPMtflopsSum_cnt):-1) + + Flop rate \(flops \/ WCT\)\s*:\s*$patfp\sMflop\/s + Flops / user time\s*:\s*$patfp\sMflop\/s + + FMA percentage\s*:\s*$patfp\s*\% + (($HPMFMPperc_cnt>0)?($HPMFMPperc/$HPMFMPperc_cnt):-1) + + \% of peak performance\s*:\s*$patfp\s*\% + (($HPMpercPeakSum_cnt>0)?($HPMpercPeakSum/$HPMpercPeakSum_cnt):-1) + + $HPMflopsWCT/$threadspertask + $HPMflopsUser/$threadspertask + + $HPMtflopsSum/$ncpus/$HPMwtimeAvg + diff --git a/qcd/part_cpu/skel/nec-mpiproginf-pattern.xml b/qcd/part_cpu/skel/nec-mpiproginf-pattern.xml new file mode 100644 index 0000000000000000000000000000000000000000..1383bc72b42685347a84e16f836f145c239bf286 --- /dev/null +++ b/qcd/part_cpu/skel/nec-mpiproginf-pattern.xml @@ -0,0 +1,14 @@ + + + Real\s*Time\s*\(sec\)\s*:\s*$patfp + User\s*Time\s*\(sec\)\s*:\s*$patfp + System\s*Time\s*\(sec\)\s*:\s*$patfp + Vector\s*Time\s*\(sec\)\s*:\s*$patfp + Memory size used \(GB\)\s*:\s*$patfp + GFLOPS \(rel\. 
to User Time\)\s*:\s*$patfp + Instruction\s*Cache miss\s*\(sec\)\s*:\s*[^\s]* \[[^,]*,[^\]]*\]\s*$patfp \[ + Operand\s*Cache miss\s*\(sec\)\s*:\s*[^\s]* \[[^,]*,[^\]]*\]\s*$patfp \[ + Bank\s*Conflict Time\s*\(sec\)\s*:\s*[^\s]* \[[^,]*,[^\]]*\]\s*$patfp \[ + Average Vector Length\s*:\s*[^\s]* \[[^,]*,[^\]]*\]\s*[^\s]* \[[^,]*,[^\]]*\]\s*$patfp + Vector Operation Ratio \(%\)\s*:\s*[^\s]* \[[^,]*,[^\]]*\]\s*[^\s]* \[[^,]*,[^\]]*\]\s*$patfp + diff --git a/qcd/part_cpu/utils/craypat/analyse-pattern-craypat.xml b/qcd/part_cpu/utils/craypat/analyse-pattern-craypat.xml new file mode 100644 index 0000000000000000000000000000000000000000..0138c532c3076922a7408a86b864fd0d721b2ad5 --- /dev/null +++ b/qcd/part_cpu/utils/craypat/analyse-pattern-craypat.xml @@ -0,0 +1,21 @@ + + + JuBE: CRAYPAT: HWC:\s+PAPI_L1_DCM\s+$patnwrd\s+$patint\s+misses + JuBE: CRAYPAT: HWC:\s+PAPI_TLB_DM\s+$patnwrd\s+$patint\s+misses + JuBE: CRAYPAT: HWC:\s+PAPI_L1_DCA\s+$patnwrd\s+$patint\s+refs + JuBE: CRAYPAT: HWC:\s+PAPI_FP_OPS\s+$patnwrd\s+$patint\s+ops + + JuBE: CRAYPAT: TIME: proc 1: \|\|\s+$patnfp\%\s+\|$patwrd + JuBE: CRAYPAT: TIME: proc 1: \|\|\s+$patfp\%\s+\|$patnwrd + + JuBE: CRAYPAT: TIME: proc 2: \|\|\s+$patnfp\%\s+\|$patwrd + JuBE: CRAYPAT: TIME: proc 2: \|\|\s+$patfp\%\s+\|$patnwrd + + JuBE: CRAYPAT: TIME: proc 3: \|\|\s+$patnfp\%\s+\|$patwrd + JuBE: CRAYPAT: TIME: proc 3: \|\|\s+$patfp\%\s+\|$patnwrd + + JuBE: CRAYPAT: TIME: proc $patint: \|\|\s+$patfp\%\s+\|$patwrd + + JuBE: CRAYPAT: HEAP:\s+$patfp\s+\|Total + + \ No newline at end of file diff --git a/qcd/part_cpu/utils/craypat/parseCRAYPAT.pl b/qcd/part_cpu/utils/craypat/parseCRAYPAT.pl new file mode 100644 index 0000000000000000000000000000000000000000..e9f4116a344213d7ed8a182581aaa0a67d02916a --- /dev/null +++ b/qcd/part_cpu/utils/craypat/parseCRAYPAT.pl @@ -0,0 +1,57 @@
+#!/usr/bin/perl
+#
+# parseCRAYPAT.pl - copy one section of CrayPat report files to STDOUT,
+# prefixing every copied line with "JuBE: CRAYPAT: <mode>:".
+#
+# Usage: parseCRAYPAT.pl <mode> <report-file>...
+#
+#   HWC, HEAP:   copy the lines after the first line containing "Table 1"
+#                up to the next line containing "Additional details".
+#   TIME, FLOPS: output starts after the first "|USER" or "||---" line and
+#                ends at the first "||===" line; "||---" lines inside the
+#                section are skipped and each copied line gets a running
+#                "proc N" number that is not reset between files.
+#   Any other mode copies nothing.
+
+use strict;
+use warnings;
+
+my ($mode, @reportFiles) = @ARGV;
+
+my $proccnt=0;
+
+foreach my $reportFile (@reportFiles)
+{
+    # Three-argument open with a lexical handle: the file name can not be
+    # taken for an open mode or a pipe command.
+    open(my $report, '<', $reportFile)
+        or die "can not open file $reportFile: $!";
+
+    # 0: start; 1: change into output; 2: output; 3: change out; 4: end
+    my $outstage = 0;
+    while( defined (my $line = readline($report)) )
+    {
+        if( ($mode eq "HWC") || ($mode eq "HEAP") )
+        {
+            if($outstage == 1) {$outstage = 2;}
+            if($outstage == 3) {$outstage = 4;}
+            if($line =~ /Table 1/ && $outstage == 0) {$outstage = 1;}
+            if($line =~ /Additional details/ && $outstage == 2) {$outstage = 3;}
+
+            if($outstage == 2) {print "JuBE: CRAYPAT: $mode: $line";}
+        }
+        elsif( ($mode eq "TIME") || ($mode eq "FLOPS") )
+        {
+            if($outstage == 1) {$outstage = 2;}
+            if($outstage == 3) {$outstage = 4;}
+            if($line =~ /\|USER/ && $outstage == 0) {$outstage = 1;}
+            if($line =~ /\|\|---/ && ($outstage == 0 || $outstage == 2) ) {$outstage = 1;}
+            if($line =~ /\|\|===/ && $outstage == 2) {$outstage = 3;}
+
+            if($outstage == 2) {$proccnt++; print "JuBE: CRAYPAT: $mode: proc $proccnt: $line";}
+        }
+    }
+
+    close($report);
+}
+
diff --git a/qcd/part_cpu/utils/gprof/analyse-patterns-gprof.xml b/qcd/part_cpu/utils/gprof/analyse-patterns-gprof.xml new file mode 100644 index 0000000000000000000000000000000000000000..e4f83d1f3925fb72b10ac27384b05832ca771b8e --- /dev/null +++ b/qcd/part_cpu/utils/gprof/analyse-patterns-gprof.xml @@ -0,0 +1,20 @@ + + + JuBE: gprof: proc 1:\s*$patwrd\s*$patnfp + JuBE: gprof: proc 1:\s*$patnwrd\s*$patfp + + JuBE: gprof: proc 2:\s*$patwrd\s*$patnfp + JuBE: gprof: proc 2:\s*$patnwrd\s*$patfp + + JuBE: gprof: proc 3:\s*$patwrd\s*$patnfp + JuBE: gprof: proc 3:\s*$patnwrd\s*$patfp + + JuBE: gprof: proc 4:\s*$patwrd\s*$patnfp + JuBE: gprof: proc 4:\s*$patnwrd\s*$patfp + + JuBE: gprof: proc 5:\s*$patwrd\s*$patnfp + JuBE: gprof: proc 5:\s*$patnwrd\s*$patfp + + JuBE: gprof: proc $patint:\s+$patwrd\s+$patfp + + \ No newline at end of file diff --git a/qcd/part_cpu/utils/gprof/parseGPROF.pl b/qcd/part_cpu/utils/gprof/parseGPROF.pl new file mode
100644 index 0000000000000000000000000000000000000000..dfbed03736dbc40c69af64b8705ce8ce4775fa17 --- /dev/null +++ b/qcd/part_cpu/utils/gprof/parseGPROF.pl @@ -0,0 +1,43 @@
+# parseGPROF.pl - print two columns of the rows of a gprof flat profile.
+#
+# Usage: perl parseGPROF.pl <colA> <colB> <gprof-output-file>
+#
+# <colA> and <colB> index the whitespace-split fields of a row. Field 1 is
+# used as the row's time value (NOTE(review): presumably "% time", with
+# field 0 empty because gprof rows start with blanks - confirm).
+# After every header line the following line is discarded and the rows are
+# printed as "JuBE: gprof: proc <n>: <colA field> <colB field>", up to and
+# including the first row whose time value is not above 0.1.
+
+use strict;
+use warnings;
+use Scalar::Util qw(looks_like_number);
+
+die "usage: $0 <colA> <colB> <gprof-output-file>\n" if @ARGV < 3;
+my ($firstColumn, $secondColumn, $profileFile) = @ARGV;
+
+open(my $profile, '<', $profileFile)
+    or die "can not open file $profileFile: $!";
+
+while (defined (my $line = readline($profile))) {
+    # Accept the header whatever the spacing between its column titles is.
+    next unless $line =~ /%\s+cumulative\s+self\s+self\s+total/;
+
+    # The line following the header is not a data row.
+    last unless defined readline($profile);
+
+    my $proccnt = 0;
+    while (defined (my $row = readline($profile))) {
+        $proccnt++;
+        my @splitted = split(/\s+/, $row);
+        my ($first, $second) = map { defined $_ ? $_ : '' }
+                               @splitted[$firstColumn, $secondColumn];
+        print("JuBE: gprof: proc $proccnt: $first $second\n");
+
+        # A missing or non-numeric time value ends the table like a small one.
+        my $time = $splitted[1];
+        last unless looks_like_number($time) && $time > 0.1;
+    }
+}
+
+close($profile);
diff --git a/qcd/part_cpu/utils/ihpct/analyse-pattern-ihpct-hwc.xml b/qcd/part_cpu/utils/ihpct/analyse-pattern-ihpct-hwc.xml new file mode 100644 index 0000000000000000000000000000000000000000..7d04a90acf8e37376eca080da014d19565e7fb7f --- /dev/null +++ b/qcd/part_cpu/utils/ihpct/analyse-pattern-ihpct-hwc.xml @@ -0,0 +1,23 @@ + + + IHPCT: libHPM: in section $patwrd: $patwrd:\s+$patint + + IHPCT: libHPM: in section all:\s+BGP_PU0_FPU_ADD_SUB_1:\s+$patint + IHPCT: libHPM: in section all:\s+BGP_PU0_FPU_ADD_SUB_2:\s+$patint + + IHPCT: libHPM: in section all:\s+BGP_PU0_FPU_MULT_1:\s+$patint + IHPCT: libHPM: in section all:\s+BGP_PU0_FPU_MULT_2:\s+$patint + + IHPCT: libHPM: in section all:\s+BGP_PU0_FPU_DIV_1:\s+$patint + IHPCT: libHPM: in section all:\s+BGP_PU0_FPU_DIV_2:\s+$patint + + IHPCT: libHPM: in section all:\s+BGP_PU0_FPU_FMA_2:\s+$patint + IHPCT: libHPM: in section all:\s+BGP_PU0_FPU_FMA_4:\s+$patint + + IHPCT: libHPM: in section all: PM_FPU_FLOP:\s+$patint + IHPCT: libHPM: in section all: PM_RUN_INST_CMPL:\s+$patint + IHPCT: libHPM: in section all: PM_RUN_CYC:\s+$patint + IHPCT: libHPM: in section all: % of peak performance:\s+$patfp +
+ \ No newline at end of file diff --git a/qcd/part_cpu/utils/ihpct/analyse-pattern-ihpct-mpi.xml b/qcd/part_cpu/utils/ihpct/analyse-pattern-ihpct-mpi.xml new file mode 100644 index 0000000000000000000000000000000000000000..22d123c44c1b7bb84e1c901512285313f9f7048f --- /dev/null +++ b/qcd/part_cpu/utils/ihpct/analyse-pattern-ihpct-mpi.xml @@ -0,0 +1,7 @@ + + + + IHPCT: MPITracer: median communication time =\s*$patfp\s*sec for task $patnint + + + \ No newline at end of file diff --git a/qcd/part_cpu/utils/ihpct/parseHWC.pl b/qcd/part_cpu/utils/ihpct/parseHWC.pl new file mode 100644 index 0000000000000000000000000000000000000000..5f17190ab5369fc23c6b619d8f68cd804a140d4d --- /dev/null +++ b/qcd/part_cpu/utils/ihpct/parseHWC.pl @@ -0,0 +1,71 @@
+#!/usr/bin/perl
+#
+# parseHWC.pl - print the counter values found in the given XML files.
+#
+# Usage: parseHWC.pl <xml-file>...
+#
+# Every "datadef" element inside an "ipdef" child of the root maps an id to
+# a label. Every "d" element inside an "ip" child of the root is printed as
+#   IHPCT: libHPM: in section <label of the ip>: <label for the d's id>: <v>
+
+use XML::Parser;
+use Data::Dumper;
+
+use strict;
+use warnings;
+
+# id => label of all datadef elements seen so far (kept across files)
+my %DataNames;
+
+our $currentlabel = "";
+
+my $p = XML::Parser->new(Style => 'Tree');
+
+foreach my $xmlFile (@ARGV)
+{
+    # Collected per file, so that the sections of one file are not printed
+    # again when the next file is processed.
+    my @IPDEFList;
+    my @IPList;
+
+    # Tree style: [root tag, [{attributes}, tag, content, tag, content, ...]];
+    # child tags sit at the odd indices of a content list.
+    my $tree = $p->parsefile($xmlFile);
+    $tree = ${$tree}[1];
+
+    for(my $cnt=1; $cnt < $#{$tree}; $cnt+=2)
+    {
+        if(${$tree}[$cnt] eq "ipdef")
+        {
+            push(@IPDEFList, ${$tree}[$cnt+1]);
+        }
+        if(${$tree}[$cnt] eq "ip")
+        {
+            push(@IPList, ${$tree}[$cnt+1]);
+        }
+    }
+
+    foreach my $ipdef (@IPDEFList)
+    {
+        for(my $elCnt=1; $elCnt < $#{$ipdef}; $elCnt+=2)
+        {
+            next unless ${$ipdef}[$elCnt] eq "datadef";
+            my $attributes = ${$ipdef}[$elCnt+1][0];
+            $DataNames{$attributes->{'id'}} = $attributes->{'label'};
+        }
+    }
+
+    foreach my $ip (@IPList)
+    {
+        my $IPLabel = ${$ip}[0]{'label'};
+
+        for(my $elCnt=1; $elCnt < $#{$ip}; $elCnt+=2)
+        {
+            next unless ${$ip}[$elCnt] eq "d";
+            my $id = ${$ip}[$elCnt+1][0]{'id'};
+            my $v = ${$ip}[$elCnt+1][0]{'v'};
+            print "IHPCT: libHPM: in section $IPLabel: $DataNames{$id}: $v\n";
+        }
+    }
+}
+
diff --git a/qcd/part_cpu/utils/ihpct/parseMPITR.pl b/qcd/part_cpu/utils/ihpct/parseMPITR.pl new file mode 100644 index 0000000000000000000000000000000000000000..c948eb1f15c69df16c288c9110611495e4f16650 --- /dev/null +++ b/qcd/part_cpu/utils/ihpct/parseMPITR.pl @@ -0,0 +1,23 @@
+# parseMPITR.pl - copy the given files to STDOUT, prefixing every line with
+# "IHPCT: MPITracer: ".
+#
+# Usage: perl parseMPITR.pl <file>...
+
+use strict;
+use warnings;
+
+foreach my $traceFile (@ARGV)
+{
+    # Three-argument open with a lexical handle: the file name can not be
+    # taken for an open mode or a pipe command.
+    open(my $trace, '<', $traceFile)
+        or die "can not open file $traceFile: $!";
+
+    while( defined (my $line = readline($trace)) )
+    {
+        print "IHPCT: MPITracer: $line";
+    }
+
+    close($trace);
+}
+